In [None]:
!pip install fasttext hazm

Collecting fasttext
  Downloading fasttext-0.9.2.tar.gz (68 kB)
[K     |████████████████████████████████| 68 kB 2.9 MB/s 
[?25hCollecting hazm
  Downloading hazm-0.7.0-py3-none-any.whl (316 kB)
[K     |████████████████████████████████| 316 kB 13.3 MB/s 
[?25hCollecting pybind11>=2.2
  Using cached pybind11-2.9.1-py2.py3-none-any.whl (211 kB)
Collecting libwapiti>=0.2.1
  Downloading libwapiti-0.2.1.tar.gz (233 kB)
[K     |████████████████████████████████| 233 kB 15.2 MB/s 
[?25hCollecting nltk==3.3
  Downloading nltk-3.3.0.zip (1.4 MB)
[K     |████████████████████████████████| 1.4 MB 16.0 MB/s 
Building wheels for collected packages: fasttext, nltk, libwapiti
  Building wheel for fasttext (setup.py) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.2-cp37-cp37m-linux_x86_64.whl size=3131502 sha256=b3c5a64562a8cafcf8bc07d50bfc2a700334a9672f4c456631d2219d8bbfbc2c
  Stored in directory: /root/.cache/pip/wheels/4e/ca/bf/b020d2be95f7641801a6597a29c8f4f19e38f9c0

# **import section**

In [None]:
import pandas as pd
import numpy as np
import codecs
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, balanced_accuracy_score, accuracy_score
from sklearn.preprocessing import LabelBinarizer
from sklearn.feature_extraction.text import CountVectorizer
import fasttext
from hazm import Normalizer, Lemmatizer, WordTokenizer, stopwords_list
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import EarlyStopping
from keras.models import *
from keras.layers import *
from keras.optimizers import *

# **metrics function**

In [None]:
def get_metrics(y_true, y_pred):
  """Score predictions with plain and class-balanced accuracy.

  Args:
    y_true: ground-truth labels.
    y_pred: predicted labels, aligned with ``y_true``.

  Returns:
    dict with the keys ``'accuracy'`` and ``'balance_accuracy'``.
  """
  plain = accuracy_score(y_true, y_pred)
  balanced = balanced_accuracy_score(y_true, y_pred)
  return {'accuracy': plain, 'balance_accuracy': balanced}

# **data preparation**

### read the data and split it into train and test sets

In [None]:
# Load the SentiPers CSV directly from GitHub, then hold out a test split.
# No test_size is passed, so scikit-learn's default ratio applies; the fixed
# random_state keeps the split repeatable across runs.
df = pd.read_csv(
    filepath_or_buffer='https://github.com/SinRas/Quera1401NLP/blob/main/sentipers.csv?raw=true',
)
df_train, df_test = train_test_split(df, random_state=13)

### preprocess the texts and prepare the labels

In [None]:
# Shared hazm helpers, built once and reused by every preprocess() call.
normalizer = Normalizer()
hazm_tokenizer = WordTokenizer()
lemmatizer = Lemmatizer()
# Kept as a set because preprocess() tests membership once per token.
stopwords = set(stopwords_list())
def preprocess(text, remove_stopwords=False, lemmatize=False):
    """Normalize and tokenize ``text`` with hazm, returning space-joined tokens.

    Args:
        text: raw input string.
        remove_stopwords: when true, drop tokens found in the module-level
            ``stopwords`` set.
        lemmatize: when true, replace each token with the lemmatizer's output,
            keeping only the part before the first ``#``.

    Returns:
        The processed tokens joined by single spaces.
    """
    tokens = hazm_tokenizer.tokenize(normalizer.normalize(text))
    if lemmatize:
        # Lemmatization runs first, so the stop-word filter below sees lemmas.
        tokens = [lemmatizer.lemmatize(tok).partition('#')[0] for tok in tokens]
    if remove_stopwords:
        tokens = [tok for tok in tokens if tok not in stopwords]
    return ' '.join(tokens)

# Clean both splits the same way: stop words removed and tokens lemmatized.
train_texts = [
    preprocess(raw, remove_stopwords=True, lemmatize=True)
    for raw in df_train['text']
]
test_texts = [
    preprocess(raw, remove_stopwords=True, lemmatize=True)
    for raw in df_test['text']
]
# Labels are cast to int and materialized as plain lists.
train_labels = list(df_train['label'].astype(int))
test_labels = list(df_test['label'].astype(int))

# **bag of words with classic ml**


[count vectorizer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html)

In [None]:
# Bag-of-words counts over unigrams and bigrams, limited to 10000 features.
# The vocabulary is learned from the training texts only, then applied to both splits.
vectorizer = CountVectorizer(ngram_range=(1, 2), max_features=10000)
vectorizer.fit(train_texts)
x_train, x_test = (
    vectorizer.transform(texts) for texts in (train_texts, test_texts)
)

### fit model

[logistic regression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html)

[decision tree](https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html)

[multi layer perceptron](https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html)

In [None]:
# Logistic regression on the bag-of-words counts, with a fixed seed and
# max_iter set to 500.
clf = LogisticRegression(max_iter=500, random_state=0)
clf = clf.fit(x_train, train_labels)

### test model

In [None]:
# Predict on the held-out bag-of-words matrix and report both accuracies.
lr_preds = clf.predict(x_test)
get_metrics(y_true=test_labels, y_pred=lr_preds)

{'accuracy': 0.6046926804386636, 'balance_accuracy': 0.46627937732783575}

# **fasttext**

## **unsupervised**

### prepare fasttext data

In [None]:
# Dump the preprocessed training texts, one document per line, to the
# plain-text corpus file that the unsupervised fastText training cell reads.
with open('fasttext_unsupervised', 'w', encoding='utf-8') as out_file:
  for text in train_texts:
    out_file.write(text + "\n")

### train model

[fasttext](https://fasttext.cc/docs/en/python-module.html)

In [None]:
# Learn word vectors from the 'fasttext_unsupervised' corpus file; no
# hyper-parameters are passed, so fastText's defaults are used.
model = fasttext.train_unsupervised(input='fasttext_unsupervised')


### get word vector

In [None]:
# Look up the learned embedding of a single word and display it.
v = model.get_word_vector('کیفیت')
v

array([-0.08174179,  0.02283131, -0.20606835, -0.15568598, -0.01498603,
       -0.21466225,  0.06228063, -0.02303107,  0.02668864,  0.4945953 ,
       -0.28417784,  0.04637766,  0.05042268, -0.0167454 ,  0.02650889,
        0.05333986,  0.02427731,  0.07475328,  0.10023021,  0.00977277,
        0.2543495 ,  0.0171654 , -0.15237947, -0.0315598 ,  0.07570711,
       -0.28095865,  0.2917196 ,  0.08144619, -0.07472356,  0.25846672,
        0.05157208, -0.04778065, -0.04101653,  0.07141962, -0.16854843,
       -0.23237541,  0.08648346, -0.15194973, -0.08310773,  0.1507775 ,
        0.07506619,  0.3127003 ,  0.0302394 ,  0.05907486,  0.3800783 ,
       -0.05382648, -0.24196891, -0.06017956, -0.06612045,  0.05207014,
       -0.18682751, -0.0673539 , -0.1981201 ,  0.16740301, -0.02330521,
       -0.00714078, -0.13274136,  0.0534226 , -0.0905301 ,  0.18917784,
        0.01110097,  0.29115242,  0.33193773,  0.05842251,  0.09977193,
       -0.24751475,  0.15286   , -0.0882332 ,  0.00382062, -0.20

### get similar words

In [None]:
# The 10 nearest vocabulary entries to the query word, shown as
# (similarity, word) pairs.
model.get_nearest_neighbors(word='کیفیت', k=10)

[(0.9502526521682739, 'با\u200cکیفیت'),
 (0.9434821009635925, 'کیفیته'),
 (0.9333087801933289, 'باکیفیت'),
 (0.9270477294921875, 'فیلمبرداریش'),
 (0.9258806705474854, 'شفافیت'),
 (0.9210011959075928, 'عکسبرداری'),
 (0.9183793663978577, 'واضح'),
 (0.9171732664108276, 'روشن'),
 (0.9139121174812317, 'full'),
 (0.9123570919036865, 'عکس\u200cبرداری')]

## **supervised**

In [None]:
def _write_fasttext_file(path, texts, labels):
  """Write ``texts`` and ``labels`` to ``path`` in fastText's supervised format.

  Each output line has the form ``__label__<label> <text>``.

  Args:
    path: destination file path; an existing file is overwritten.
    texts: iterable of already-preprocessed strings.
    labels: iterable of labels aligned with ``texts``; each is passed through ``str``.
  """
  with open(path, 'w', encoding='utf-8') as out_file:
    for text, label in zip(texts, labels):
      out_file.write('__label__' + str(label) + ' ' + text + '\n')


_write_fasttext_file('fasttext_train', train_texts, train_labels)
_write_fasttext_file('fasttext_test', test_texts, test_labels)

# NOTE: this rebinds `model`, replacing the unsupervised fastText model.
model = fasttext.train_supervised(input="fasttext_train")
# fastText's built-in evaluation can be run with: model.test('fasttext_test')

In [None]:
# The first element of predict()'s result holds, per input text, the predicted
# label strings. The top one looks like '__label__<n>', so the text after the
# last underscore is parsed back into an int.
ft_preds = [
    int(top_labels[0].rsplit('_', 1)[-1])
    for top_labels in model.predict(test_texts)[0]
]
get_metrics(test_labels, ft_preds)

{'accuracy': 0.5608263198163733, 'balance_accuracy': 0.3845901284561627}

# **deep lstm**

### prepare data

In [None]:
max_len = 200      # every sequence is padded/truncated to this many tokens
max_words = 10000  # vocabulary size cap for the Keras tokenizer

# The word index is learned from the training texts only.
tokenizer = Tokenizer(split=' ', num_words=max_words)
tokenizer.fit_on_texts(train_texts)
x_train = pad_sequences(tokenizer.texts_to_sequences(train_texts), maxlen=max_len)
x_test = pad_sequences(tokenizer.texts_to_sequences(test_texts), maxlen=max_len)

# Fit ONE binarizer on the training labels and reuse it for the test labels.
# Fitting a second binarizer on the test labels could yield a different column
# order or width whenever a class is missing from one of the splits.
label_binarizer = LabelBinarizer().fit(train_labels)
y_train = label_binarizer.transform(train_labels)
y_test = label_binarizer.transform(test_labels)

# Carve a validation set out of the training data; seeded like the train/test
# split so the notebook is reproducible on re-run.
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, random_state=13)

### create model

In [None]:
embed_dim = 100   # size of each learned word embedding
lstm_out = 100    # number of LSTM units
num_classes = 5   # softmax width; must match the number of one-hot label columns

# Embedding -> dropout -> single LSTM -> small dense head -> softmax.
model = Sequential()
model.add(Embedding(max_words, embed_dim, input_length=max_len))
model.add(SpatialDropout1D(0.3))
model.add(LSTM(lstm_out, dropout=0.3, recurrent_dropout=0.3, return_sequences=False))
model.add(Dense(50, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so it is not wrapped in
# print() (which would append a stray "None" to the output).
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 200, 100)          1000000   
                                                                 
 spatial_dropout1d_3 (Spatia  (None, 200, 100)         0         
 lDropout1D)                                                     
                                                                 
 lstm_3 (LSTM)               (None, 100)               80400     
                                                                 
 dense_6 (Dense)             (None, 50)                5050      
                                                                 
 dense_7 (Dense)             (None, 5)                 255       
                                                                 
Total params: 1,085,705
Trainable params: 1,085,705
Non-trainable params: 0
____________________________________________

### train model

In [None]:
# Train with early stopping on the validation set. restore_best_weights=True
# rolls the model back to its best monitored epoch; without it the model keeps
# the weights from `patience` epochs past that point.
# The History object is kept in `history` instead of being echoed as cell output.
history = model.fit(
    x_train, y_train,
    epochs=50, batch_size=264, verbose=2,
    validation_data=(x_val, y_val),
    callbacks=[EarlyStopping(patience=5, restore_best_weights=True)],
)

Epoch 1/50
34/34 - 61s - loss: 1.4130 - accuracy: 0.3628 - val_loss: 1.3157 - val_accuracy: 0.3890 - 61s/epoch - 2s/step
Epoch 2/50
34/34 - 56s - loss: 1.3296 - accuracy: 0.3859 - val_loss: 1.3010 - val_accuracy: 0.4162 - 56s/epoch - 2s/step
Epoch 3/50
34/34 - 56s - loss: 1.2540 - accuracy: 0.4552 - val_loss: 1.1893 - val_accuracy: 0.4995 - 56s/epoch - 2s/step
Epoch 4/50
34/34 - 55s - loss: 1.0317 - accuracy: 0.5841 - val_loss: 1.1225 - val_accuracy: 0.5335 - 55s/epoch - 2s/step
Epoch 5/50
34/34 - 60s - loss: 0.8196 - accuracy: 0.6857 - val_loss: 1.1263 - val_accuracy: 0.5315 - 60s/epoch - 2s/step
Epoch 6/50
34/34 - 55s - loss: 0.6693 - accuracy: 0.7499 - val_loss: 1.1867 - val_accuracy: 0.5519 - 55s/epoch - 2s/step
Epoch 7/50
34/34 - 55s - loss: 0.5864 - accuracy: 0.7856 - val_loss: 1.2611 - val_accuracy: 0.5661 - 55s/epoch - 2s/step
Epoch 8/50
34/34 - 55s - loss: 0.5094 - accuracy: 0.8195 - val_loss: 1.3711 - val_accuracy: 0.5553 - 55s/epoch - 2s/step
Epoch 9/50
34/34 - 55s - loss: 0

<keras.callbacks.History at 0x7f0eedf3ef90>

### test model

In [None]:
# Loss and accuracy on the held-out test set, evaluated in batches of 64.
score, acc = model.evaluate(x_test, y_test, batch_size=64, verbose=2)

62/62 - 5s - loss: 1.3508 - accuracy: 0.5545 - 5s/epoch - 85ms/step


In [None]:
def pred_text(text):
    """Predict the class index of a single raw text with the LSTM model.

    The text is preprocessed with the same options used to build the training
    texts (stop words removed, tokens lemmatized), so the model sees input in
    the form it was trained on.

    Args:
        text: raw input string.

    Returns:
        The argmax index over the model's output row (a column position in the
        one-hot label encoding, not the original label value).
    """
    cleaned = preprocess(text, remove_stopwords=True, lemmatize=True)
    seq = tokenizer.texts_to_sequences([cleaned])
    seq = pad_sequences(seq, maxlen=max_len)
    probs = model.predict(seq)
    return np.argmax(probs, axis=1)[0]


pred_text('همه چی عالی بود')

4

In [None]:
# Collapse the one-hot targets and the model's output rows to class indices,
# then score them with the shared metrics helper.
y_test_num = y_test.argmax(axis=1)
lstm_preds = model.predict(x_test).argmax(axis=1)
get_metrics(y_test_num, lstm_preds)

{'accuracy': 0.5544503953073195, 'balance_accuracy': 0.42878316690278295}

# **using transformers and BERT**

In [None]:
import torch
from transformers import AutoConfig, AutoTokenizer, AutoModel

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# NOTE: `tokenizer` and `model` are rebound here, replacing the Keras tokenizer
# and LSTM model of the same names; pred_text() reads those globals, so it must
# not be called after this cell has run.
config = AutoConfig.from_pretrained("HooshvareLab/bert-base-parsbert-uncased")
tokenizer = AutoTokenizer.from_pretrained("HooshvareLab/bert-base-parsbert-uncased")
model = AutoModel.from_pretrained("HooshvareLab/bert-base-parsbert-uncased", output_hidden_states=True)
model = model.to(device)

def bert_emb(text):
    """Return the ParsBERT pooled-output embedding of ``text`` as a NumPy array.

    Args:
        text: a single input string.

    Returns:
        The first row of the model's ``pooler_output``, moved to the CPU and
        converted to NumPy.
    """
    # Truncate to 512 tokens (BERT's position limit) so long inputs cannot
    # overflow the model's position embeddings.
    encoded_input = tokenizer(
        text, return_tensors='pt', truncation=True, max_length=512
    ).to(device)
    # Inference only: skip gradient tracking.
    with torch.no_grad():
        output = model(**encoded_input)
    return output['pooler_output'].detach().cpu().numpy()[0]

bert_emb('رییس جمهور ایران به قطر رفت')