In [31]:
import numpy as np
import pandas as pd
import hazm as hz
from hazm import Normalizer, word_tokenize,stopwords_list,Stemmer
import re
from keras.preprocessing import sequence
import tensorflow as tf
from keras.models import Sequential
from tensorflow import keras
from tensorflow.keras import layers
import itertools    
from keras.layers import Embedding, LSTM, Dense, Dropout,Masking
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from keras.layers import Dense, Activation, Embedding, Flatten, GlobalMaxPooling1D

In [40]:
df=pd.read_csv('snappfood.csv', on_bad_lines='skip' , delimiter='\t')
df.head()

Unnamed: 0.1,Unnamed: 0,comment,label,label_id
0,,واقعا حیف وقت که بنویسم سرویس دهیتون شده افتضاح,SAD,1.0
1,,قرار بود ۱ ساعته برسه ولی نیم ساعت زودتر از مو...,HAPPY,0.0
2,,قیمت این مدل اصلا با کیفیتش سازگاری نداره، فقط...,SAD,1.0
3,,عالللی بود همه چه درست و به اندازه و کیفیت خوب...,HAPPY,0.0
4,,شیرینی وانیلی فقط یک مدل بود.,HAPPY,0.0


In [41]:
#remove nan values
df=df[['comment','label_id']]
df=df.dropna()
normalizer = hz.Normalizer()
def preprocess(text):
    text = re.sub(r"[\{\}\؛\*\=\-\_\+\/\n]"," ",str(text))
    text = re.sub("[ ]+"," ",text)
    text = re.sub("\!+","!",text)
    text = re.sub("[؟]+","؟",text)
    text = re.sub("[.]+","",text)
    text = re.sub("[،]+","",text)
    # replace Finglish words with an empty string
    finglish_pattern = r"[a-zA-Z]+"
    if finglish_pattern in text:   
        text = re.sub(finglish_pattern, "", text)
    for c in "..آابپتثجچحخدذرزژسشصضطظعغفقکگلمنوهیئ":
        text = re.sub(f"[{c}]+", c, text)
    # \u200c:separate two characters that should not be connected,\r\n:remove line break
    text=text.replace('\u200c', '').replace('\r\n',' ').replace('|',' ')
    #normalize the text
    text = normalizer.normalize(text)
    words = []
    words.append(hz.word_tokenize(text))
    return words

train_data = df['comment'].apply(preprocess)
df['comment'] = list(itertools.chain(*train_data))

# Remove stopwords
stopwords=stopwords_list()
df['comment'] = df['comment'].apply(lambda x: ' '.join([word for word in x if word not in stopwords]))

#remove english comments
english_text=df[df.comment.str.contains(r'[a-zA-Z]+')]
idx=english_text.index
df=df.drop(idx).reset_index()

# find the stemm of words
stemmer = hz.Stemmer()
def stem_comment(comment):
    return ' '.join([stemmer.stem(word) for word in comment.split()])

# Apply stemming to 'comment' column
df['comment'] = df['comment'].apply(stem_comment)
df=df[['comment','label_id']]

In [42]:
# Preprocessing
tokenizer = Tokenizer(num_words=1000)
tokenizer.fit_on_texts(df['comment'])
print(dict(list(tokenizer.word_index.items())[0:3]))
#transforms each text in texts to a sequence of integers
X = tokenizer.texts_to_sequences(df['comment'])
#adding padding to comments
X = pad_sequences(X, maxlen=100)
# Splitting data into training and testing set
y = df['label_id']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train

{'غذا': 1, 'کیف': 2, 'سفار': 3}


array([[  0,   0,   0, ..., 212,  88,  63],
       [  0,   0,   0, ...,  41, 202,  42],
       [  0,   0,   0, ..., 100,  10,   8],
       ...,
       [  0,   0,   0, ..., 837,   4, 246],
       [  0,   0,   0, ..., 100,   6,  28],
       [  0,   0,   0, ...,  54, 334,   4]], dtype=int32)

In [43]:

class TransformerEncoder(layers.Layer):
  
    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim)
        self.dense_proj = keras.Sequential(
            [layers.Dense(dense_dim, activation="relu"),
             layers.Dense(embed_dim),]
        )
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()

    def call(self, inputs, mask=None):
        count = 0
        if mask is not None:
            mask = mask[:, tf.newaxis, :]
            if not count:
                print("mask: ", mask)
                count += 1
        # print("input shape: ", inputs.shape)
        attention_output = self.attention(
            inputs, inputs, attention_mask=mask)
        proj_input = self.layernorm_1(inputs + attention_output)
        proj_output = self.dense_proj(proj_input)
        return self.layernorm_2(proj_input + proj_output)
  
    def get_config(self):
        config = super().get_config()
        config.update({
          "embed_dim": self.embed_dim,
          "num_heads": self.num_heads,  
          "dense_dim": self.dense_dim,
        })
        return config

In [46]:
vocab_size = 20000
embed_dim = 256
num_heads = 2
dense_dim = 32
def get_model():
    model = Sequential()
    model.add(Masking(mask_value=0))
    model.add(Embedding(vocab_size, embed_dim))
    # Use TransformerEncoder
    model.add(TransformerEncoder(embed_dim, dense_dim, num_heads))
    model.add(GlobalMaxPooling1D())
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))

    # Compile model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model
model=get_model()


callbacks = [
        keras.callbacks.ModelCheckpoint("transformer_encoder.keras",
                                        save_best_only=True)
]
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=1,
          callbacks=callbacks)
model = keras.models.load_model(
          "transformer_encoder.keras",
          custom_objects={"TransformerEncoder": TransformerEncoder})
print(model.summary())
print(f"Test acc: {model.evaluate(X_test, y_test)[1]:.3f}")


Model: "sequential_36"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 masking_17 (Masking)        (None, 100)               0         
                                                                 
 embedding_17 (Embedding)    (None, 100, 256)          5120000   
                                                                 
 transformer_encoder_17 (Tra  (None, 100, 256)         543776    
 nsformerEncoder)                                                
                                                                 
 global_max_pooling1d_4 (Glo  (None, 256)              0         
 balMaxPooling1D)                                                
                                                                 
 dropout_3 (Dropout)         (None, 256)               0         
                                                                 
 dense_55 (Dense)            (None, 1)               