In [1]:
from __future__ import unicode_literals
from hazm import *
import tensorflow as tf
from keras.models import Sequential
import pandas as pd
from keras.layers import Dense
import numpy as np
import re
from urlextract import URLExtract
import emojis
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from keras.layers.core import Dense, Dropout, Activation
from keras.optimizers import Adadelta,Adam,RMSprop
from keras.utils import np_utils
from tqdm import tqdm

In [2]:
corpus = pd.read_csv('Snappfood - Sentiment Analysis.csv', on_bad_lines='skip', delimiter='\t')

In [96]:
corpus.head()

Unnamed: 0,comment,label,label_id,Cleaned
0,واقعا حیف وقت که بنویسم سرویس دهیتون شده افتضاح,SAD,1.0,واقعا حیف وقت که بنویسم سرویس دهیتون شده افتضاح
1,قرار بود ۱ ساعته برسه ولی نیم ساعت زودتر از مو...,HAPPY,0.0,قرار بود 1 ساعته برسه ولی نیم ساعت زودتر از مو...
2,قیمت این مدل اصلا با کیفیتش سازگاری نداره، فقط...,SAD,1.0,قیمت این مدل اصلا با کیفیتش سازگاری نداره، فقط...
3,عالللی بود همه چه درست و به اندازه و کیفیت خوب...,HAPPY,0.0,عالی بود همه چه درست و به اندازه و کیفیت خوب، ...
4,شیرینی وانیلی فقط یک مدل بود.,HAPPY,0.0,شیرینی وانیلی فقط یک مدل بود


In [3]:
def _multiple_replace(mapping, text):
    pattern = "|".join(map(re.escape, mapping.keys()))
    return re.sub(pattern, lambda m: mapping[m.group()], str(text))

def convert_fa_numbers(input_str):
    mapping = {
        '۰': '0',
        '۱': '1',
        '۲': '2',
        '۳': '3',
        '۴': '4',
        '۵': '5',
        '۶': '6',
        '۷': '7',
        '۸': '8',
        '۹': '9',
        '.': '.',
    }
    return _multiple_replace(mapping, input_str)


def convert_ar_characters(input_str):
    """
    Converts Arabic chars to related Persian unicode char
    :param input_str: String contains Arabic chars
    :return: New str with converted arabic chars
    """
    mapping = {
        'ك': 'ک',
        'ى': 'ی',
        'ي': 'ی',
        'ئ':'ی',
        'إ':'ا',
        'أ':'ا',
        'ة':'ه',
        'ؤ':'و'
    }
    return _multiple_replace(mapping, input_str)


def preprocess(text):
    extractor = URLExtract()
    for url in extractor.gen_urls(text):
        text = text.replace(url,'<URL>')
    emj = emojis.get(text)
    for i in emj:
        if i in text:
            text = text.replace(i,'<emoji>')
    text = convert_fa_numbers(text)
    text = convert_ar_characters(text)
    # regex to detect and replace all smilies in the text with <smiley>
    text = re.sub(r"(:\s?\)|:-\)|\(\s?:|\(-:|:\'\)|:\s?D|8-\)|:\s?\||;\s?\)|:-\*|:-\||:-\(|:\s?P|:-P|:-p|:-b|:-O|:-o|:-0|:-\@|:\$|:-\^|:-&|:-\*|:-\+|:-\~|:-\`|:-\>|:-\<|:-\}|:-\{|\[:\s?\]|\[:\s?\]|:\s?\]|:\s?\[|:\s?\}|:\s?\{)",'<smiley>',text)
    text = text.lower() # we lowercase here to prevent changes in the URLs and smilies
    text = text.strip()
    text = re.sub(r'[<>#.:()"\'!?؟،,@$%^&*_+\[\]/]', ' ', text)
    text = re.sub(r'[\s]{2,}', ' ', text)
    text = re.sub(r'(\w)\1{2,}', r'\1',text)
    if re.search(r'[\u0600-\u06FF]', text):
        return(text)
    else:
        return 'None'

In [4]:
tqdm.pandas()

In [5]:
corpus['Cleaned'] = corpus['comment'].progress_apply(preprocess)

100%|████████████████████████████████████████████████████████████████████████████| 70000/70000 [37:40<00:00, 30.96it/s]


In [6]:
 corpus.head()

Unnamed: 0.1,Unnamed: 0,comment,label,label_id,Cleaned
0,,واقعا حیف وقت که بنویسم سرویس دهیتون شده افتضاح,SAD,1.0,واقعا حیف وقت که بنویسم سرویس دهیتون شده افتضاح
1,,قرار بود ۱ ساعته برسه ولی نیم ساعت زودتر از مو...,HAPPY,0.0,قرار بود 1 ساعته برسه ولی نیم ساعت زودتر از مو...
2,,قیمت این مدل اصلا با کیفیتش سازگاری نداره، فقط...,SAD,1.0,قیمت این مدل اصلا با کیفیتش سازگاری نداره، فقط...
3,,عالللی بود همه چه درست و به اندازه و کیفیت خوب...,HAPPY,0.0,عالی بود همه چه درست و به اندازه و کیفیت خوب، ...
4,,شیرینی وانیلی فقط یک مدل بود.,HAPPY,0.0,شیرینی وانیلی فقط یک مدل بود


In [7]:
corpus = corpus.drop('Unnamed: 0', axis=1)

In [8]:
corpus = corpus.dropna()

In [9]:
corpus.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 69480 entries, 0 to 69999
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   comment   69480 non-null  object 
 1   label     69480 non-null  object 
 2   label_id  69480 non-null  float64
 3   Cleaned   69480 non-null  object 
dtypes: float64(1), object(3)
memory usage: 2.7+ MB


In [10]:
corpus.to_csv('snappfood_comments_preprocessed.csv')

In [11]:
# with open('stop words - Farsi.txt', 'r', encoding='utf-8') as f:
#     stopwords = f.read().splitlines()

# corpus['Cleaned_sw_rmvd'] = corpus['Cleaned'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stopwords)]))

In [12]:
# def text_normalizer(text):
#   stemmer = Stemmer()
#   text = stemmer.stem(text)

#   normalizer = Normalizer()
#   text = normalizer.normalize(text)

#   return text

In [13]:
# corpus['Cleaned_normalized'] = corpus['Cleaned'].progress_apply(lambda x: text_normalizer(x))

In [14]:
count_vectorizer = CountVectorizer()
X_count_vectorized = count_vectorizer.fit_transform(corpus.Cleaned).todense()

In [15]:
vectorizer = TfidfVectorizer(min_df=2, max_features= 10000)
X_tfidf_vectorized = vectorizer.fit_transform(corpus.Cleaned).todense()

In [17]:
labels = corpus['label_id'].values

In [18]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_count_vectorized, labels, test_size=0.2, random_state=42) 

In [19]:
X_tfidf_train, X_tfidf_test, y_tfidf_train, y_tfidf_test = train_test_split(X_tfidf_vectorized, labels, test_size=0.2, random_state=42)

In [23]:
input_dim = X_tfidf_train.shape

In [101]:
print(input_dim)

(55584, 10000)


In [24]:
classifier = LogisticRegression()
classifier.fit(X_train, y_train)
score = classifier.score(X_test, y_test)
print("Accuracy:", score)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy: 0.8488054116292458


In [27]:
clf = LogisticRegression()
clf.fit(X_tfidf_train, y_tfidf_train)
tfidf_score = clf.score(X_tfidf_test, y_tfidf_test)
print("Accuracy:", tfidf_score)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy: 0.8645653425446171


In [28]:
nb_classes = 2
batch_size = 32
nb_epochs = 10

In [29]:
y_tfidf_train_cat = np_utils.to_categorical(y_tfidf_train,)

In [30]:
model = Sequential()

model.add(Dense(1000,input_shape= (input_dim[1],)))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(500))

model.add(Activation('relu'))

model.add(Dropout(0.5))

model.add(Dense(50))

model.add(Activation('relu'))

model.add(Dropout(0.5))

model.add(Dense(nb_classes))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam')

In [31]:
tf.config.run_functions_eagerly(True)

In [32]:
model.fit(X_tfidf_train, y_tfidf_train_cat, batch_size=batch_size, epochs=nb_epochs,verbose=2)



Epoch 1/10
1737/1737 - 131s - loss: 0.3614 - 131s/epoch - 75ms/step
Epoch 2/10
1737/1737 - 133s - loss: 0.2851 - 133s/epoch - 77ms/step
Epoch 3/10
1737/1737 - 127s - loss: 0.2203 - 127s/epoch - 73ms/step
Epoch 4/10
1737/1737 - 129s - loss: 0.1307 - 129s/epoch - 74ms/step
Epoch 5/10
1737/1737 - 131s - loss: 0.0721 - 131s/epoch - 75ms/step
Epoch 6/10
1737/1737 - 129s - loss: 0.0448 - 129s/epoch - 74ms/step
Epoch 7/10
1737/1737 - 131s - loss: 0.0325 - 131s/epoch - 75ms/step
Epoch 8/10
1737/1737 - 129s - loss: 0.0265 - 129s/epoch - 74ms/step
Epoch 9/10
1737/1737 - 130s - loss: 0.0208 - 130s/epoch - 75ms/step
Epoch 10/10
1737/1737 - 129s - loss: 0.0178 - 129s/epoch - 74ms/step


<keras.callbacks.History at 0x266919a39d0>

In [33]:
y_test_pred = model.predict(X_tfidf_test)
y_test_predclass = np.argmax(y_test_pred, axis=1)
y_trian_pred = model.predict(X_tfidf_train)
y_train_predclass = np.argmax(y_trian_pred, axis=1)



In [34]:
from sklearn.metrics import accuracy_score,classification_report
print ("nDeep Neural Network - Test accuracy:",(round(accuracy_score(y_tfidf_test, y_test_predclass),4)*100))
print ("nDeep Neural Network - Train accuracy:",(round(accuracy_score(y_tfidf_train, y_train_predclass),4)*100))

nDeep Neural Network - Test accuracy: 83.97
nDeep Neural Network - Train accuracy: 99.71


In [130]:
X_pred = vectorizer.transform([preprocess('دیگه از این رستوران سفارش نمیدم')]).todense()

In [131]:
model.predict(X_pred)



array([[0.00547247, 0.9945275 ]], dtype=float32)