## STEP 1: Import the required Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import string
import re
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix,accuracy_score, classification_report

## STEP 2: Import the arabic Dataset

In [12]:
data = pd.read_excel(r"data.xlsx")
#print(data.head(5))

## STEP 3: Data Preprocessing

In [13]:
#def lower_case (text):
#    text = text.lower()
#    return text
data['Comment'] = data['Comment'].str.lower()
#print(data.head(10))

In [14]:
data['Comment'][0]

'بربي مقداد قداش من مرة تجيبوه افهمونا راهو ماسط و الشعب ما يحبوش'

In [15]:
#def remove_links (text):
 #   text = re.sub(r'http\S+', '', text)
  #  return text
data['Comment'] = data['Comment'].str.replace(r'http\S+', '')
#print(data.head(10))

In [16]:
df_list = data["Comment"].values.tolist()
print(df_list[0])

بربي مقداد قداش من مرة تجيبوه افهمونا راهو ماسط و الشعب ما يحبوش


In [17]:
from collections import Counter
 
def remove_duplicates(input):
 
    # split input string separated by space
    input = input.split(" ")
 
    # joins two adjacent elements in iterable way
    for i in range(0, len(input)):
        input[i] = "".join(input[i])
 
    # now create dictionary using counter method
    # which will have strings as key and their
    # frequencies as value
    UniqW = Counter(input)
 
    # joins two adjacent elements in iterable way
    s = " ".join(UniqW.keys())
    return (s)

In [18]:
print(remove_duplicates(df_list[0]))

بربي مقداد قداش من مرة تجيبوه افهمونا راهو ماسط و الشعب ما يحبوش


In [19]:
lis=[]
for i in range(len(df_list)):
    #print(i)
    sen=remove_duplicates(df_list[i])
    #print(sen)
    lis.append(sen)
#print(lis)

In [20]:
from pandas import DataFrame
data["Comment"] = DataFrame (lis,columns=['Comment'])

In [21]:
data

Unnamed: 0,Comment,Sentiment
0,بربي مقداد قداش من مرة تجيبوه افهمونا راهو ماس...,sad
1,karim il gharbi masit i sitcom ma3jibnich,sad
2,mabldou,sad
3,الهالكا مختصة في قناة الزيتونة فقط وغير محايدة...,sad
4,إتحاد الخراب والدمار ،بعدما خرب بلادنا يريدون ...,sad
5,سلب لفلوسهم خاطر التوانسة فاقو بسرقتهم,sad
6,و المؤسسات الخاصة المصانع لا يشملهم الاضراب,sad
7,اش مدخل المقروض في الاضراب يامعلم,sad
8,والخاصة طز فيهم اخاطر يخلصو في زوز فرنك علي ال...,angry
9,اتحاد الخراب لن ينجح,angry


In [22]:
print(data.head(10))

                                             Comment Sentiment
0  بربي مقداد قداش من مرة تجيبوه افهمونا راهو ماس...       sad
1          karim il gharbi masit i sitcom ma3jibnich       sad
2                                            mabldou       sad
3  الهالكا مختصة في قناة الزيتونة فقط وغير محايدة...       sad
4  إتحاد الخراب والدمار ،بعدما خرب بلادنا يريدون ...       sad
5             سلب لفلوسهم خاطر التوانسة فاقو بسرقتهم       sad
6        و المؤسسات الخاصة المصانع لا يشملهم الاضراب       sad
7                  اش مدخل المقروض في الاضراب يامعلم       sad
8  والخاصة طز فيهم اخاطر يخلصو في زوز فرنك علي ال...     angry
9                               اتحاد الخراب لن ينجح     angry


In [23]:
def clean_diacritics (text):
    arabic_diacritics = re.compile("""
                             ّ    | # Shadda
                             َ    | # Fatha
                             ً    | # Tanwin Fath
                             ُ    | # Damma
                             ٌ    | # Tanwin Damm
                             ِ    | # Kasra
                             ٍ    | # Tanwin Kasr
                             ْ    | # Sukun
                             ـ     # Tatwil/Kashida
                         """, re.VERBOSE)
    text = re.sub(arabic_diacritics, '', text)
    return text
data['Comment'] = data['Comment'].apply(clean_diacritics)
#print(data.head(10))

In [24]:
def clean_text(text):
    text = re.sub('[^\w]+|_', ' ', text, flags=re.U)
    text = re.sub('x005F', '', text)
    text = re.sub('x005D', '', text)
    text = re.sub('x000D', '', text)
    #" ".join([word for word in text if word not in string.punctuation])
    #text = remove_emoji(text)
    #text = remove_diacritics(text)
    #tokens = word_tokenize(text)
    #text = ' '.join([word for word in tokens if word not in stop_words])
    return text
data['Comment'] = data['Comment'].apply(clean_text)
print(data.head(10))

                                             Comment Sentiment
0  بربي مقداد قداش من مرة تجيبوه افهمونا راهو ماس...       sad
1          karim il gharbi masit i sitcom ma3jibnich       sad
2                                            mabldou       sad
3  الهالكا مختصة في قناة الزيتونة فقط وغير محايدة...       sad
4  إتحاد الخراب والدمار بعدما خرب بلادنا يريدون إ...       sad
5             سلب لفلوسهم خاطر التوانسة فاقو بسرقتهم       sad
6        و المؤسسات الخاصة المصانع لا يشملهم الاضراب       sad
7                  اش مدخل المقروض في الاضراب يامعلم       sad
8  والخاصة طز فيهم اخاطر يخلصو في زوز فرنك علي ال...     angry
9                               اتحاد الخراب لن ينجح     angry


In [25]:
def remove_emoji(text):
    regrex_pattern = re.compile(pattern = "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags = re.UNICODE)
    return regrex_pattern.sub(r' ',text)

def clean_text(text):
    text = re.sub('[^\w]+|_', ' ', text, flags=re.U)
    text = remove_emoji(text)
    return text
def preprocess(text):
    text = clean_text(text)
    text=remove_emoji(text)
    text = re.sub("[إأآا]", "ا", text)
    text = re.sub("ى", "ي", text)
    text = re.sub("ؤ", "ء", text)
    text = re.sub("ئ", "ء", text)
    text = re.sub("ة", "ه", text)
    text = re.sub("گ", "ك", text)

    return text
  
data['Comment'] = data['Comment'].apply(preprocess)
print(data['Comment'][0])

بربي مقداد قداش من مره تجيبوه افهمونا راهو ماسط و الشعب ما يحبوش


In [26]:
#Eliminer les stops words selon la liste disponible
with open("StopWords.txt", "r", encoding="utf8") as f:
    mylist = f.read().splitlines() 
f.close()
data['Comment'] = data['Comment'].str.split(' ').apply(lambda x: ' '.join(k for k in x if k not in mylist))
#print(data)

In [27]:
uniqueValues = data.Sentiment.unique()
uniqueValues

array(['sad', 'angry', 'happy', 'neutre'], dtype=object)

In [28]:
uniqueValues1 = data['Sentiment'].value_counts()
uniqueValues1

sad       2628
angry     2598
neutre    2598
happy     2570
Name: Sentiment, dtype: int64

In [29]:
data['Sentiment'] = data['Sentiment'].factorize()[0]
data.sample(100)

Unnamed: 0,Comment,Sentiment
1423,خير شاء فرحت اسغيراتك,2
2911,جينرال وحشه,0
84,افففف جبالي,0
9951,lzem y7ki biha yt3lmha,3
107,ماسط لاسط,0
5369,انسان رخيص,1
4138,وخياني الكلب الحيوان الديوث نشدوا,1
9032,ههههه ولدي يكسرها لحظه اختصاصو,3
8684,liya 9edech 3adit m3akom commande ltw wsolnii,3
3140,عملتولنا العار شوهتم طبايع التونسي الاصيل تفوه...,1


In [30]:
above_35 = data[data["Sentiment"] == 1]
above_35

Unnamed: 0,Comment,Sentiment
8,والخاصه طز اخاطر يخلصو زوز فرنك السميك,1
9,اتحاد الخراب ينجح,1
10,اطمع يهز للهاويه بلاد شافه الافلاس والاتحاد هو...,1
12,تشخر زادت بوف يهلك التحاد الخراب,1
21,akhmaj etayaran tunisair nul,1
22,marra tisma33 7kaya jdida bled 7arfaa ken lmsa...,1
23,يهلكوو ماصيرو يتشد ماعندو هارب,1
26,walah te7chmou,1
31,الاتحاد غربان وجراد شءم,1
34,اتحاد الخراب والفساد تربحهم,1


In [31]:
data.sample(5)

Unnamed: 0,Comment,Sentiment
8424,bravooooo twensa baldia chnwa ynajmo ya3mlo,2
6642,جيناهاش تقوللهم التونسيه تهز الطحين لترويكا يق...,1
10263,ken ma3andkomch techri chaweya kenoun f7am 9ar...,3
4691,tbarakkah 3la southa,2
4565,tbarkallah wini lamis,2


In [32]:
print("Number of null elements in train set: \n{}".format(data["Comment"].isna().sum()))

Number of null elements in train set: 
0


In [33]:
dd=data["Comment"].tolist()
#dd

In [34]:
len(dd)

10394

In [35]:
for el in dd:
    if el=='':
        dd.remove(el)
len(dd)

10386

In [36]:
data1 = data["Comment"] != " "
dfNew = data[data1]

In [37]:
print(dfNew[dfNew['Comment'] == ' '].index)

Int64Index([], dtype='int64')


In [38]:
dfNew.loc[284]

Comment      ماسططططططططط
Sentiment               0
Name: 284, dtype: object

In [39]:
print("Number of null elements in train set: \n{}".format(dfNew.isna().sum()))

Number of null elements in train set: 
Comment      0
Sentiment    0
dtype: int64


In [40]:
# create excel writer object
writer = pd.ExcelWriter('output.xlsx')
# write dataframe to excel
dfNew.to_excel(writer)
# save the excel
writer.save()
print('DataFrame is written successfully to Excel File.')
data=dfNew

DataFrame is written successfully to Excel File.


In [41]:
data1 = pd.read_excel(r"output.xlsx")
print(data1.head(5))

   Unnamed: 0                                            Comment  Sentiment
0           0     مقداد قداش مره تجيبوه افهمونا ماسط الشعب يحبوش          0
1           1             karim gharbi masit i sitcom ma3jibnich          0
2           2                                            mabldou          0
3           3  الهالكا مختصه قناه الزيتونه محايده وانتهت مهمت...          0
4           4  اتحاد الخراب والدمار خرب بلادنا يريدون افريقيا...          0


In [42]:
#data1.drop('Unnamed: 0', axis='columns', inplace=True)
data1

Unnamed: 0.1,Unnamed: 0,Comment,Sentiment
0,0,مقداد قداش مره تجيبوه افهمونا ماسط الشعب يحبوش,0
1,1,karim gharbi masit i sitcom ma3jibnich,0
2,2,mabldou,0
3,3,الهالكا مختصه قناه الزيتونه محايده وانتهت مهمت...,0
4,4,اتحاد الخراب والدمار خرب بلادنا يريدون افريقيا...,0
5,5,سلب لفلوسهم خاطر التوانسه فاقو بسرقتهم,0
6,6,المءسسات الخاصه المصانع يشملهم الاضراب,0
7,7,مدخل المقروض الاضراب يامعلم,0
8,8,والخاصه طز اخاطر يخلصو زوز فرنك السميك,1
9,9,اتحاد الخراب ينجح,1


## STEP 4: Création du modèle de Machine Learning

In [43]:
# splitting the data into target and feature
feature = data1.Comment.fillna(' ')
target = data1.Sentiment
# splitting into train and tests
X_train, X_test, Y_train, Y_test = train_test_split(feature, target, test_size =.2, random_state=100)

In [None]:
Y_train

In [None]:
X_train

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# create the transform
vectorizer = TfidfVectorizer()
# tokenize and build vocab
vectorizer.fit(X_train)
# summarize
print(vectorizer.vocabulary_)
print(vectorizer.idf_)
# encode document
vector = vectorizer.transform([X_train[0]])
# summarize encoded vector
print(vector.shape)
print(vector.toarray())

In [None]:
vector1 = vectorizer.transform(X_train)
vector2 = vectorizer.transform(X_test)


In [None]:
vector1

In [None]:
X_train_modif=vector1.toarray()
X_test_modif=vector2.toarray()

In [None]:
X_train_modif

In [None]:
# make pipeline
from sklearn.metrics import precision_score, recall_score, accuracy_score
pipe = make_pipeline(LogisticRegression(solver='lbfgs',max_iter=10000))

In [None]:
# make param grid
param_grid = {'logisticregression__C': [0.01, 0.1, 0.25, 0.75, 0.5, 1, 6,7,8,9,10,11,12,15,20, 100]}

# create and fit the model
model = GridSearchCV(pipe, param_grid, cv=5)
model.fit(X_train_modif,Y_train)

In [None]:
predictions = model.predict(X_test_modif)

In [None]:
# find accuracy, precision, recall:
from sklearn.metrics import confusion_matrix,classification_report
new = np.asarray(Y_test)
confusion_matrix(predictions,Y_test)

In [None]:
print(classification_report(predictions,Y_test))

In [None]:



# make prediction and print accuracy
prediction = model.predict(X_test_modif)
print(f"Accuracy score is {accuracy_score(Y_test, prediction):.2f}")
print(classification_report(Y_test, prediction))

precision = precision_score(Y_test, prediction, average='micro')
recall = recall_score(Y_test, prediction, average='micro')
accuracy = accuracy_score(Y_test, prediction)
print('Precision: {} / Recall: {} / Accuracy: {}'.format(round(precision, 3), round(recall, 3), round(accuracy, 3)))

In [None]:
# make pipeline
from sklearn.metrics import precision_score, recall_score, accuracy_score
from numpy import mean
from numpy import std
from sklearn.model_selection import cross_val_score

param_C= [0.01, 0.1, 0.25, 0.75, 0.5, 1, 6,7,8,9,10,11,12,15,20, 100]
algo=['newton-cg', 'liblinear','sag','saga','lbfgs']
# create and fit the model

for c in param_C:
    for al in algo:
        pipe = make_pipeline(TfidfVectorizer(),
                    LogisticRegression(multi_class='multinomial', solver=al,C=c, max_iter=10000))
    #model = GridSearchCV(pipe,  cv=5)
    #model.fit(X_train,Y_train)
        model.fit(X_train_modif,Y_train)
        scores = cross_val_score(pipe, X_train_modif, Y_train, scoring='accuracy', cv=3, n_jobs=-1)
# report performance
        print('Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))
        #pipe.fit(X_train, Y_train)
        #print('Accuracy for C=%s: %s'
        # % (c, accuracy_score(Y_test, pipe.predict(X_test))))





# make prediction and print accuracy
#prediction = model.predict(X_test)
#print(f"Accuracy score is {accuracy_score(Y_test, prediction):.2f}")
#print(classification_report(Y_test, prediction))

#precision = precision_score(Y_test, prediction, average='micro')
#recall = recall_score(Y_test, prediction, average='micro')
#accuracy = accuracy_score(Y_test, prediction)
#print('Precision: {} / Recall: {} / Accuracy: {}'.format(round(precision, 3), round(recall, 3), round(accuracy, 3)))

### Random Forest Classifier

In [None]:
#pipe = make_pipeline(TfidfVectorizer(),
#                    RandomForestClassifier())

#param_grid = {'randomforestclassifier__n_estimators':[10, 100, 1000],
  #           'randomforestclassifier__max_features':['sqrt', 'log2']}

#rf_model = GridSearchCV(pipe, param_grid, cv=5)
#rf_model.fit(X_train,Y_train)

#prediction = rf_model.predict(X_test)
#print(f"Accuracy score is {accuracy_score(Y_test, prediction):.2f}")

#precision = precision_score(Y_test, predictionة average='micro')
#recall = recall_score(Y_test, predictionة average='micro')
#accuracy = accuracy_score(Y_test, prediction)
#print('Precision: {} / Recall: {} / Accuracy: {}'.format(round(precision, 3), round(recall, 3), round(accuracy, 3)))

### Naive Bayes Classifier (Multinomial)

In [None]:
#pipe = make_pipeline(TfidfVectorizer(),
            #        MultinomialNB())
#pipe.fit(X_train,Y_train)
#prediction = pipe.predict(X_test)
#print(f"Accuracy score is {accuracy_score(Y_test, prediction):.2f}")
#print(classification_report(Y_test, prediction))

### Support Vector Machine

In [None]:
#pipe = make_pipeline(TfidfVectorizer(),
                   #  SVC())
#param_grid = {'svc__kernel': ['rbf', 'linear', 'poly'],
          ##   'svc__gamma': [0.1, 1, 10, 100],
          #   'svc__C': [0.1, 1, 10, 100]}

#svc_model = GridSearchCV(pipe, param_grid, cv=3)
#svc_model.fit(X_train, Y_train)

#prediction = svc_model.predict(X_test)
#print(f"Accuracy score is {accuracy_score(Y_test, prediction):.2f}")
#print(classification_report(Y_test, prediction))

### Tous les autres modèles

In [44]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
classifiers = [LinearSVC(), 
               SVC(gamma='auto'), 
               MultinomialNB(),
               LogisticRegression(solver='liblinear',max_iter=10000),
               BernoulliNB(), 
               SGDClassifier(), 
               DecisionTreeClassifier(max_depth=5),
               RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
               KNeighborsClassifier(3)
               ]

In [45]:
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix,classification_report
ngrams = (1, 2, 3)

results = []

for g in ngrams:
    #svc_model = GridSearchCV(pipe, param_grid, cv=3)
    #svc_model.fit(X_train, Y_train)

    #prediction = svc_model.predict(X_test)
    

    
    for alg in classifiers:
        pipeline = Pipeline([
        ('vect', TfidfVectorizer(min_df=0.0001, max_df=0.95,
                                 analyzer='word', lowercase=False,
                                 ngram_range=(1, g))),
        ('clf', alg), ])
        pipeline.fit(X_train, Y_train)
        feature_names = pipeline.named_steps['vect'].get_feature_names()
        prediction = pipeline.predict(X_test)
        print(f"**************************************")
        print(f"Classifier= "+str(alg)+" and the ngrams is= "+str(g))
        print(f"Accuracy score is {accuracy_score(Y_test, prediction):.2f}")
        print(classification_report(Y_test, prediction))
        # find accuracy, precision, recall:
        new = np.asarray(Y_test)
        print(confusion_matrix(prediction,Y_test))

**************************************
Classifier= LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0) and the ngrams is= 1
Accuracy score is 0.73
              precision    recall  f1-score   support

           0       0.68      0.58      0.63       516
           1       0.77      0.67      0.72       506
           2       0.81      0.80      0.81       517
           3       0.67      0.84      0.74       540

    accuracy                           0.73      2079
   macro avg       0.73      0.72      0.72      2079
weighted avg       0.73      0.73      0.72      2079

[[300  79  30  34]
 [ 74 340  10  19]
 [ 54   9 416  33]
 [ 88  78  61 454]]
**************************************
Classifier= SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='aut

  'precision', 'predicted', average, warn_for)


**************************************
Classifier= MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True) and the ngrams is= 1
Accuracy score is 0.73
              precision    recall  f1-score   support

           0       0.58      0.65      0.62       516
           1       0.75      0.68      0.71       506
           2       0.79      0.83      0.81       517
           3       0.84      0.78      0.81       540

    accuracy                           0.73      2079
   macro avg       0.74      0.73      0.74      2079
weighted avg       0.74      0.73      0.74      2079

[[336 122  56  62]
 [ 82 343  16  18]
 [ 62  14 427  38]
 [ 36  27  18 422]]




**************************************
Classifier= LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=10000,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False) and the ngrams is= 1
Accuracy score is 0.71
              precision    recall  f1-score   support

           0       0.66      0.55      0.60       516
           1       0.76      0.65      0.70       506
           2       0.81      0.77      0.79       517
           3       0.63      0.85      0.73       540

    accuracy                           0.71      2079
   macro avg       0.72      0.70      0.70      2079
weighted avg       0.71      0.71      0.70      2079

[[285  78  34  33]
 [ 70 328  16  18]
 [ 52  12 396  29]
 [109  88  71 460]]
**************************************
Classifier= BernoulliN

  'precision', 'predicted', average, warn_for)


**************************************
Classifier= MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True) and the ngrams is= 2
Accuracy score is 0.74
              precision    recall  f1-score   support

           0       0.59      0.66      0.62       516
           1       0.76      0.68      0.72       506
           2       0.79      0.83      0.81       517
           3       0.84      0.78      0.81       540

    accuracy                           0.74      2079
   macro avg       0.74      0.74      0.74      2079
weighted avg       0.74      0.74      0.74      2079

[[340 122  54  61]
 [ 78 344  15  18]
 [ 62  14 430  40]
 [ 36  26  18 421]]




**************************************
Classifier= LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=10000,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False) and the ngrams is= 2
Accuracy score is 0.71
              precision    recall  f1-score   support

           0       0.67      0.54      0.60       516
           1       0.76      0.66      0.71       506
           2       0.80      0.76      0.78       517
           3       0.62      0.85      0.72       540

    accuracy                           0.71      2079
   macro avg       0.72      0.70      0.70      2079
weighted avg       0.72      0.71      0.70      2079

[[280  72  30  33]
 [ 69 332  16  18]
 [ 54  13 395  29]
 [113  89  76 460]]
**************************************
Classifier= BernoulliN

  'precision', 'predicted', average, warn_for)


**************************************
Classifier= KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform') and the ngrams is= 2
Accuracy score is 0.36
              precision    recall  f1-score   support

           0       0.74      0.13      0.22       516
           1       0.88      0.11      0.20       506
           2       0.90      0.20      0.32       517
           3       0.29      0.99      0.45       540

    accuracy                           0.36      2079
   macro avg       0.70      0.36      0.30      2079
weighted avg       0.70      0.36      0.30      2079

[[ 65  15   7   1]
 [  4  57   3   1]
 [  7   0 101   4]
 [440 434 406 534]]
**************************************
Classifier= LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_clas

  'precision', 'predicted', average, warn_for)


**************************************
Classifier= MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True) and the ngrams is= 3
Accuracy score is 0.74
              precision    recall  f1-score   support

           0       0.59      0.66      0.62       516
           1       0.76      0.67      0.71       506
           2       0.78      0.83      0.81       517
           3       0.84      0.77      0.80       540

    accuracy                           0.74      2079
   macro avg       0.74      0.74      0.74      2079
weighted avg       0.74      0.74      0.74      2079

[[343 124  55  62]
 [ 73 341  15  19]
 [ 62  15 429  42]
 [ 38  26  18 417]]




**************************************
Classifier= LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=10000,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False) and the ngrams is= 3
Accuracy score is 0.71
              precision    recall  f1-score   support

           0       0.68      0.54      0.60       516
           1       0.77      0.65      0.71       506
           2       0.80      0.76      0.78       517
           3       0.62      0.86      0.72       540

    accuracy                           0.71      2079
   macro avg       0.72      0.70      0.70      2079
weighted avg       0.72      0.71      0.70      2079

[[279  71  31  31]
 [ 67 331  17  16]
 [ 56  14 395  31]
 [114  90  74 462]]
**************************************
Classifier= BernoulliN

  'precision', 'predicted', average, warn_for)


**************************************
Classifier= KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform') and the ngrams is= 3
Accuracy score is 0.35
              precision    recall  f1-score   support

           0       0.75      0.12      0.20       516
           1       0.89      0.11      0.20       506
           2       0.90      0.16      0.28       517
           3       0.29      0.99      0.45       540

    accuracy                           0.35      2079
   macro avg       0.71      0.35      0.28      2079
weighted avg       0.70      0.35      0.28      2079

[[ 61  13   6   1]
 [  3  56   3   1]
 [  7   0  85   2]
 [445 437 423 536]]


### Deep Learning model with CNN

In [46]:
import pandas as pd
import numpy as np
import re
import nltk
import tensorflow
from nltk.corpus import stopwords
from numpy import array
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
#from keras.models import Sequential
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers.core import Activation, Dropout, Dense
from tensorflow.python.keras.layers import Flatten
from tensorflow.python.keras.layers import GlobalMaxPooling1D
from tensorflow.python.keras.layers.embeddings import Embedding
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer

Using TensorFlow backend.


In [47]:
feature = data1.Comment.fillna(' ')
target = data1.Sentiment
X_train, X_test, Y_train, Y_test = train_test_split(feature, target, test_size=0.2, random_state=42, shuffle=True)
X_train

10001     يفرغها الفرحه يسافر يمشي للماناج يعمل كمبينغ ...
2732       اعوذ سماهم وجوههم منظرك يخلي يايس الدنيا الاخره
9882     موتيفا بالصغار تحب تعطي رساله نبيله لربي سبحان...
1527                                           ماشاء وبارك
9541                 قدم فكره procédure recrutement متاحم 
3905     وشكون حاسبها اصلوا مسخه بايعه شرفها خاطر الفلو...
58                                                   khayb
7735                                  نحبووو خاطرو نيه قلب
9987                                   خليك متعبتك الحكايه
10059             مده كبيره نجمت تفهموا بالباهي تجيني عروض
6562     masset lasset aenten mel kalasset wooooooooooo...
8347                                ريت تعاودو عرضو النهار
10287    شوف دكتور عبد النور نومه مدنين جنراليست فالح ا...
6515     tfouh ye mou3a9 yeli meta7yech nsa tesma3 ema ...
9944     ظل السلبيات دايره تنساش صغيرك ولدك يستحق لهوه ...
6972               محلاك بنك امير عسل الغنايه غنيتها حلوه 
414                                            ensan taf

In [48]:
from keras.layers import  Dropout, Dense
from keras.models import Sequential
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from keras import layers
from sklearn import metrics


def TFIDF(X_train, X_test,MAX_NB_WORDS=75000):
    vectorizer_x = TfidfVectorizer(max_features=MAX_NB_WORDS)
    X_train = vectorizer_x.fit_transform(X_train).toarray()
    X_test = vectorizer_x.transform(X_test).toarray()
    print("tf-idf with",str(np.array(X_train).shape[1]),"features")
    return (X_train,X_test)


def Build_Model_DNN_Text(shape, dropout=0.5):
    """
    buildModel_DNN_Tex(shape, nClasses,dropout)
    Build Deep neural networks Model for text classification
    Shape is input feature space
    nClasses is number of classes
    """
    model = Sequential()
    node = 30 # number of nodes
    nLayers = 2 # number of  hidden layer

    model.add(Dense(node,input_dim=shape,activation='relu'))
    model.add(Dropout(dropout))
    for i in range(0,nLayers):
        model.add(Dense(node,input_dim=node,activation='relu'))
        model.add(Dropout(dropout))
    model.add(Dense(4, activation='softmax'))

    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    return model

X_train_tfidf,X_test_tfidf = TFIDF(X_train,X_test)


model_DNN = Build_Model_DNN_Text(X_train_tfidf.shape[1])
model_DNN.summary()
exit(1)
model_DNN.fit(X_train_tfidf, Y_train,
                              validation_data=(X_test_tfidf, Y_test),
                              validation_split=0.4,
                              epochs=100,
                              batch_size=128,
                              verbose=2,shuffle=True)

predicted = model_DNN.predict_classes(X_test_tfidf)

print(metrics.classification_report(Y_test, predicted))

tf-idf with 26392 features
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 30)                791790    
_________________________________________________________________
dropout_1 (Dropout)          (None, 30)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 30)                930       
_________________________________________________________________
dropout_2 (Dropout)          (None, 30)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 30)                930       
_________________________________________________________________
dropout_3 (Dropout)          (None, 30)                0         
_________________________________________________________________
dense_4 (Dense)            

 - 3s - loss: 0.0434 - accuracy: 0.9864 - val_loss: 2.1492 - val_accuracy: 0.7225
Epoch 72/100
 - 3s - loss: 0.0464 - accuracy: 0.9862 - val_loss: 2.1167 - val_accuracy: 0.7210
Epoch 73/100
 - 3s - loss: 0.0416 - accuracy: 0.9858 - val_loss: 2.2635 - val_accuracy: 0.7104
Epoch 74/100
 - 3s - loss: 0.0442 - accuracy: 0.9856 - val_loss: 2.2432 - val_accuracy: 0.7128
Epoch 75/100
 - 3s - loss: 0.0371 - accuracy: 0.9877 - val_loss: 2.2940 - val_accuracy: 0.7143
Epoch 76/100
 - 3s - loss: 0.0381 - accuracy: 0.9865 - val_loss: 2.3497 - val_accuracy: 0.7124
Epoch 77/100
 - 3s - loss: 0.0454 - accuracy: 0.9867 - val_loss: 2.2387 - val_accuracy: 0.7205
Epoch 78/100
 - 3s - loss: 0.0383 - accuracy: 0.9874 - val_loss: 2.3446 - val_accuracy: 0.7133
Epoch 79/100
 - 3s - loss: 0.0421 - accuracy: 0.9865 - val_loss: 2.3179 - val_accuracy: 0.7167
Epoch 80/100
 - 3s - loss: 0.0423 - accuracy: 0.9869 - val_loss: 2.3267 - val_accuracy: 0.7114
Epoch 81/100
 - 3s - loss: 0.0388 - accuracy: 0.9875 - val_loss

In [None]:
X_train_tfidf.shape

In [None]:
Y_train.shape

In [None]:
X_test_tfidf

In [None]:
Y_test

In [49]:
seed = 7
np.random.seed(seed)
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import Flatten
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence


def TFIDF(X_train, X_test,MAX_NB_WORDS=75000):
    vectorizer_x = TfidfVectorizer(max_features=MAX_NB_WORDS)
    X_train = vectorizer_x.fit_transform(X_train).toarray()
    X_test = vectorizer_x.transform(X_test).toarray()
    print("tf-idf with",str(np.array(X_train).shape[1]),"features")
    return (X_train,X_test)

X_train_tfidf,X_test_tfidf = TFIDF(X_train,X_test)

model1 = Sequential()
model1.add(Dense(64, activation='relu', input_dim=X_train_tfidf.shape[1]))
model1.add(Dropout(0.5))
model1.add(Dense(4, activation='softmax'))
model1.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])


model1.fit(X_train_tfidf, Y_train,
           epochs=15, 
           validation_data=(X_test_tfidf, Y_test),
           batch_size=128,
           verbose=2, shuffle=True)






tf-idf with 26392 features


ValueError: Error when checking target: expected dense_6 to have shape (4,) but got array with shape (1,)

### Using a CNN model

In [50]:
# Some handy libraries to build a baseline CNN model
from keras.models import Sequential
from keras.layers import Conv1D, MaxPool1D, Dense, Dropout, Flatten, MaxPooling1D
from keras.layers import Activation, GlobalMaxPooling1D
from keras import optimizers
from keras.layers.embeddings import Embedding

In [51]:
from numpy import newaxis

def baseline_cnn_model(fea_matrix, n_class, mode, compiler):
  # create model
    model = Sequential()
    model.add(Conv1D(64,3, activation = 'relu'))
    model.add(MaxPool1D(pool_size = 2))
    model.add(Conv1D(filters=128, kernel_size = (3), activation = 'relu'))
    model.add(MaxPool1D(pool_size = 2))
    model.add(Flatten())
    model.add(Activation('relu'))
    model.add(Dense(n_class, activation = 'softmax'))
    #model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    model.compile(loss='sparse_categorical_crossentropy',optimizer='rmsprop',  metrics=['acc'])
    #model_ptw2v.add(Dense(1, activation='sigmoid'))

    return model



In [52]:
# Tuning hyper-parameters
import math
print('shape=',X_train_tfidf.shape[1])
lr = math.exp(-3)
batch_size=128
num_epochs=100
decay= math.exp(-4)
mode="reg"
n_class=4 #5
adm=optimizers.Adam(lr=lr,decay=decay)
sgd=optimizers.SGD(lr=lr, nesterov=True,momentum=0.7,decay=decay)
Nadam= optimizers.Nadam(lr=lr, beta_1=0.9,beta_2=0.999,epsilon=math.exp(-8))

model=baseline_cnn_model(X_train_tfidf,n_class,mode,Nadam)
history = model.fit(X_train_tfidf, Y_train,epochs=10, 
                    validation_data=(X_test_tfidf, Y_test))


shape= 26392


ValueError: Input 0 is incompatible with layer conv1d_1: expected ndim=3, found ndim=2

In [None]:

# Model Training
history=model.fit(X_train_tfidf,Y_train,batch_size=batch_size,epochs=num_epochs,verbose=1,validation_split=0.2)

In [None]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
input_shape = X_train_tfidf.shape  
model.build(input_shape) 

print(model.summary())

In [None]:
history = model.fit(X_train_tfidf, Y_train, batch_size=128, epochs=30, verbose=1, validation_split=0.2)

In [None]:
score = model.evaluate(X_test, Y_test, verbose=1)

In [None]:
print("Test Score:", score[0])
print("Test Accuracy:", score[1])

In [None]:
%matplotlib inline
from matplotlib import pyplot as plt
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train','test'], loc='upper left')
plt.show()

plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])

plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train','test'], loc='upper left')
plt.show()

### Deep Learning models using an LSTM

In [None]:
# Import the tools needed from keras
import keras.backend as K
#from keras.layers import Dense, Embedding, LSTM
from tensorflow.keras.layers import Dense, Embedding, LSTM
#from keras.models import Sequential
from tensorflow.keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [None]:
# Initialize and fit the tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

In [None]:
# Use that tokenizer to transform the text messages in the training and test sets
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

In [None]:
# Pad the sequences so each sequence is the same length
X_train_seq_padded = pad_sequences(X_train_seq, 50)
X_test_seq_padded = pad_sequences(X_test_seq, 50)

In [None]:
model = Sequential()

In [None]:
model.add(Embedding(len(tokenizer.word_index)+1, 32))
model.add(LSTM(32, dropout=0, recurrent_dropout=0))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.summary()


In [None]:
# Compile the model
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [None]:
Y_test

In [None]:
# Fit the RNN model
history = model.fit(X_train_seq_padded, Y_train, 
                    batch_size=32, epochs=20,
                    validation_data=(X_test_seq_padded, Y_test.values))

# Deep Learning Model with CNN

In [53]:
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical


X = data1.Comment.fillna(' ')
Y = data1.Sentiment

label_encoder = LabelEncoder()

Y = label_encoder.fit_transform(Y)

Y = to_categorical(Y)

# Y = Y.reshape(-1, 1)
Y

array([[1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       ...,
       [0., 0., 0., 1.],
       [0., 0., 0., 1.],
       [0., 0., 0., 1.]], dtype=float32)

In [54]:
# Program to find most frequent 
# element in a list
  
from collections import Counter
  
def most_frequent(List):
    occurence_count = Counter(List)
    #return occurence_count.most_common(1)[0][0]
    return occurence_count.most_common(1)
  
vocab= set(" ".join(X).split())
print(most_frequent(vocab))

[('يجيو', 1)]


In [55]:
def word_count(voc):
    counts = dict()
    

    for word in voc:
        if word in counts:
            counts[word] += 1
        else:
            counts[word] = 1

    return counts

vocab= set(" ".join(X).split())
print( word_count(vocab))

{'يجيو': 1, 'chenya': 1, 'انيه': 1, 'مثلاته': 1, 'السفاهه': 1, 'tab3od': 1, 'ﺍﻟﺨﻤﺞ': 1, 'كهنه': 1, 'حرفاء': 1, 'ta9raa': 1, 'الانحطاط': 1, 'بمنطق': 1, 'tanja777': 1, 'تهريج': 1, 'mosta9blou': 1, 'تترحم': 1, 'جاوبني': 1, 'وحاولي': 1, 'يقيدوله': 1, 'المولدي': 1, 'ومحلي': 1, 'الفلاحيه': 1, '3aychin': 1, 't5awelik': 1, 'للقحاب': 1, 'تاتنقد': 1, 'ادغفونا': 1, 'بروده': 1, 'الرجل': 1, 'dropbox': 1, 'يحمد': 1, 'klay': 1, 'latofti': 1, 'lagetat': 1, 'ya3tek': 1, 'premiers': 1, 'le9loub': 1, 'الاساسيه': 1, 'lbagra': 1, 'قولهولو': 1, 'annahdha': 1, 'ايييه': 1, 'ekher': 1, 'الجنون': 1, 'المتبادل': 1, 'طلعت': 1, 'دخلو': 1, 'عمرهاbonne': 1, 'freelance': 1, 'المليون': 1, 'parle': 1, 'fek': 1, 'امريكا': 1, 'sghar': 1, 'يحاجج': 1, 'توبيخ': 1, 'شعواء': 1, 'ترحمنا': 1, 'رهوط': 1, 'ورخيص': 1, 'الغمه': 1, 'وهف': 1, 'mokh': 1, 'نسا': 1, 'للمعاقين': 1, 'الاحترام': 1, 'والفناء': 1, 'محرم': 1, 'قرءوها': 1, 'رجاءا': 1, 'nikelha': 1, 'يااااااااا': 1, 'نقولك': 1, 'مجاني': 1, 'فسرلو': 1, 'وعافيه': 1, 'يجرم': 1, 'ا

In [56]:
X

0           مقداد قداش مره تجيبوه افهمونا ماسط الشعب يحبوش
1                   karim gharbi masit i sitcom ma3jibnich
2                                                  mabldou
3        الهالكا مختصه قناه الزيتونه محايده وانتهت مهمت...
4        اتحاد الخراب والدمار خرب بلادنا يريدون افريقيا...
5                   سلب لفلوسهم خاطر التوانسه فاقو بسرقتهم
6                   المءسسات الخاصه المصانع يشملهم الاضراب
7                              مدخل المقروض الاضراب يامعلم
8                   والخاصه طز اخاطر يخلصو زوز فرنك السميك
9                                        اتحاد الخراب ينجح
10       اطمع يهز للهاويه بلاد شافه الافلاس والاتحاد هو...
11       الاتحاد يتفاوظ الحكومه مداخيله الخاصه حل مشاكل...
12                        تشخر زادت بوف يهلك التحاد الخراب
13                مره ازيدنا كجيو القحوا اقلك نفاذ المخزون
14       الكميه ناقصه مشطلع سومها الحمد لله مرضنا وبرين...
15       تفدلك 6 ملاين تتحصل عامين اطيشها الزبله الشعب ...
16                                              لقاح مشك

In [57]:


X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.15)

vocab= set(" ".join(X_train).split())
max_words = len(set(" ".join(X_train).split()))
max_len = X_train.apply(lambda x: len(x)).max()

print(max_words)
print(max_len)
print(len(vocab))
print(vocab)



27827
4430
27827
{'يجيو', 'chenya', 'انيه', 'مثلاته', 'tab3od', 'ﺍﻟﺨﻤﺞ', 'كهنه', 'حرفاء', 'الانحطاط', 'tanja777', 'تهريج', 'جاوبني', 'وحاولي', 'يقيدوله', 'المولدي', 'ومحلي', 'الفلاحيه', '3aychin', 't5awelik', 'للقحاب', 'تاتنقد', 'ادغفونا', 'بروده', 'الرجل', 'dropbox', 'يحمد', 'klay', 'latofti', 'lagetat', 'ya3tek', 'premiers', 'le9loub', 'الاساسيه', 'قولهولو', 'annahdha', 'ايييه', 'ekher', 'الجنون', 'المتبادل', 'طلعت', 'دخلو', 'parle', 'fek', 'امريكا', 'sghar', 'توبيخ', 'شعواء', 'ترحمنا', 'رهوط', 'ورخيص', 'الغمه', 'وهف', 'mokh', 'نسا', 'الاحترام', 'محرم', 'قرءوها', 'رجاءا', 'nikelha', 'يااااااااا', 'نقولك', 'مجاني', 'فسرلو', 'يجرم', 'وعافيه', 'البركه', 'kifah', 'rjél', 'الماسوني', 'jendouba', 'الحاج', 'تتعلم', 'toghletch', 'matnajmch', 'he', 'نحيوه', 'متباعدين', 'السموم', 'صدقوني', '5rajml', '0', 'واحفض', 'يمنحك', 'تعطيها', 'tet3ada', 'بسيييف', 'نعالجو', 'توكلت', 'يفيدني', 'اطحنو', 'وامهاتكم', 'maadnous', 'المناطق', 'لزبيانات', 'المنسي', 'سكورو', 'esmik', 'ثراه', 'laysa', 'وشلاكه', 'sa

In [58]:
vocab2=[]
for word in vocab:
    if len(word)>=3: #or word not in ['1','2','3','4','5','6','7','8','9','0']:
        vocab2.append(word)
print(len(vocab2))
    

27499


In [59]:
import re
vocab3=[]
for w in vocab2:
    vocab3.append(re.sub(r'(.)\1+', r'\1\1', w))
print(len(vocab3))

27499


In [61]:
from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(num_words=max_words)

In [62]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.15)
X_train_seq = tokenizer.texts_to_matrix(X_train, mode='freq')
print(X_train_seq.shape)
max_words=len(vocab3)
print(max_words)
print(max_len)
print(type(X_train))

(8834, 27827)
27499
4430
<class 'pandas.core.series.Series'>


In [63]:
print(X_train_seq)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [64]:
train=X_train.values.tolist()
s=0
for i in range(len(train)):
    #print(train[i])
    s=s+len(train[i])
moy=s/len(train)
print(moy)

40.96264432872991


In [65]:
max_len=40

In [66]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence

tokenizer = Tokenizer(num_words=max_words)

tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)


In [67]:
X_train_seq[1]

[7780, 2733]

In [68]:
X_train[1]

'karim gharbi masit i sitcom ma3jibnich'

In [69]:
X_train_seq = sequence.pad_sequences(X_train_seq, maxlen=max_len)

In [70]:
X_train_seq[2]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0, 7781, 1226, 7782,
       7783, 1542, 7784, 2000, 2734, 2735, 4163])

In [71]:
X_train[7]

' مدخل المقروض الاضراب يامعلم'

In [72]:
print(X_train_seq[7])
print(X_train[7])

[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
 569 736  17 433]
 مدخل المقروض الاضراب يامعلم


In [73]:
from sklearn.utils import class_weight as cw
# Calculate Class Weights
def get_weight(y):
    class_weight_current =  cw.compute_class_weight('balanced', np.unique(y), y)
    return class_weight_current

In [83]:
class_weight = get_weight(Y_train.flatten())

In [75]:
def get_rnn_model(num_class=2):
    model = Sequential()
    
    model.add(Embedding(max_words, 100, input_length=max_len))
    model.add(LSTM(256))
    
    model.add(Dropout(0.5))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))
    
    model.add(Dense(512, activation='relu'))
    
    model.add(Dropout(0.5))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))
    
    if num_class>2:
        model.add(Dense(num_class, activation='softmax'))
    else:
        model.add(Dense(2, activation='sigmoid'))
    
    model.summary()
    
    return model

In [76]:
def get_cnn_model(num_class=4):   
    model = Sequential()
    
    model.add(Embedding(max_words, 100, input_length=max_len))
    
    model.add(Conv1D(1024, 3, padding='valid', activation='relu', strides=1))
    model.add(GlobalMaxPooling1D())
    
    
    model.add(Dropout(0.5))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))
    
    model.add(Dense(2048, activation='relu'))
    
    model.add(Dropout(0.5))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))
    
    if num_class>2:
        model.add(Dense(num_class, activation='softmax'))
    else:
        model.add(Dense(2, activation='sigmoid'))
    
    model.summary()
    return model

In [77]:
def plot_performance(history=None, figure_directory=None, ylim_pad=[0, 0]):
    xlabel = 'Epoch'
    legends = ['Training', 'Validation']

    plt.figure(figsize=(20, 5))

    y1 = history.history['acc']
    y2 = history.history['val_acc']

    min_y = min(min(y1), min(y2))-ylim_pad[0]
    max_y = max(max(y1), max(y2))+ylim_pad[0]


    plt.subplot(121)

    plt.plot(y1)
    plt.plot(y2)

    plt.title('Model Accuracy\n'+date_time(1), fontsize=17)
    plt.xlabel(xlabel, fontsize=15)
    plt.ylabel('Accuracy', fontsize=15)
    plt.ylim(min_y, max_y)
    plt.legend(legends, loc='upper left')
    plt.grid()

    y1 = history.history['loss']
    y2 = history.history['val_loss']

    min_y = min(min(y1), min(y2))-ylim_pad[1]
    max_y = max(max(y1), max(y2))+ylim_pad[1]


    plt.subplot(122)

    plt.plot(y1)
    plt.plot(y2)

    plt.title('Model Loss\n'+date_time(1), fontsize=17)
    plt.xlabel(xlabel, fontsize=15)
    plt.ylabel('Loss', fontsize=15)
    plt.ylim(min_y, max_y)
    plt.legend(legends, loc='upper left')
    plt.grid()
    if figure_directory:
        plt.savefig(figure_directory+"/history")

    plt.show()

In [84]:
from keras.layers import Input, Add, concatenate, Dense, Activation, BatchNormalization, Dropout, Flatten
from keras.layers import LeakyReLU, PReLU, Lambda, Multiply
from keras.layers import Embedding, LSTM, Bidirectional
from keras.models import Sequential
num_class = 4
model1 = get_rnn_model(num_class=num_class)

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 40, 100)           2749900   
_________________________________________________________________
lstm_2 (LSTM)                (None, 256)               365568    
_________________________________________________________________
dropout_9 (Dropout)          (None, 256)               0         
_________________________________________________________________
batch_normalization_3 (Batch (None, 256)               1024      
_________________________________________________________________
dropout_10 (Dropout)         (None, 256)               0         
_________________________________________________________________
dense_10 (Dense)             (None, 512)               131584    
_________________________________________________________________
dropout_11 (Dropout)         (None, 512)              

In [85]:
loss = 'categorical_crossentropy'
# loss = 'binary_crossentropy' only for binary classification
metrics = ['accuracy']

In [86]:
def date_time(x):
    if x==1:
        return 'Timestamp: {:%Y-%m-%d %H:%M:%S}'.format(datetime.datetime.now())
    if x==2:    
        return 'Timestamp: {:%Y-%b-%d %H:%M:%S}'.format(datetime.datetime.now())
    if x==3:  
        return 'Date now: %s' % datetime.datetime.now()
    if x==4:  
        return 'Date today: %s' % datetime.date.today() 

In [87]:
import time
import datetime
import keras.callbacks
from keras.callbacks import ModelCheckpoint, EarlyStopping, TensorBoard, ReduceLROnPlateau
from keras.optimizers import RMSprop, Adam
import keras

from keras.callbacks import TensorBoard

print("Starting...\n")

start_time = time.time()
print(date_time(1))

print("\n\nCompliling Model ...\n")
learning_rate = 0.001
optimizer = Adam(learning_rate)
# optimizer = Adam()

model1.compile(optimizer=optimizer, loss=loss, metrics=metrics)

verbose = 1
epochs = 100
batch_size = 128
validation_split = 0.2

print("Trainning Model ...\n")

callbacks = [
    keras.callbacks.EarlyStopping(
        # Stop training when `val_loss` is no longer improving
        monitor="val_loss",
        # "no longer improving" being defined as "no better than 1e-2 less"
        #min_delta=1e-2,
        min_delta=1,
        # "no longer improving" being further defined as "for at least 2 epochs"
        patience=90,
        verbose=1,
    )
]

history1 = model1.fit(
    X_train_seq,
    Y_train,
    batch_size=batch_size,
    epochs=epochs,
    verbose=verbose,
    callbacks=callbacks,
    validation_split=validation_split,
    class_weight =class_weight
    )

elapsed_time = time.time() - start_time
elapsed_time = time.strftime("%H:%M:%S", time.gmtime(elapsed_time))

print("\nElapsed Time: " + elapsed_time)
print("Completed Model Trainning", date_time(1))

Starting...

Timestamp: 2021-07-18 22:13:34


Compliling Model ...

Trainning Model ...

Train on 7067 samples, validate on 1767 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100


Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 00091: early stopping

Elapsed Time: 00:21:04
Completed Model Trainning Timestamp: 2021-07-18 22:34:38


In [88]:
plot_performance(history=history1)

KeyError: 'acc'

<Figure size 1440x360 with 0 Axes>

In [89]:
num_class = 4
model2 = get_cnn_model(num_class=num_class)

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 40, 100)           2749900   
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 38, 1024)          308224    
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 1024)              0         
_________________________________________________________________
dropout_13 (Dropout)         (None, 1024)              0         
_________________________________________________________________
batch_normalization_5 (Batch (None, 1024)              4096      
_________________________________________________________________
dropout_14 (Dropout)         (None, 1024)              0         
_________________________________________________________________
dense_12 (Dense)             (None, 2048)             

In [90]:
print("Starting...\n")

start_time = time.time()
print(date_time(1))

print("\n\nCompliling Model ...\n")
learning_rate = 0.001
optimizer = Adam(learning_rate)
# optimizer = Adam()

model2.compile(optimizer=optimizer, loss=loss, metrics=metrics)

verbose = 1
epochs = 100
batch_size = 128
validation_split = 0.2

print("Trainning Model ...\n")

history2 = model2.fit(
    X_train_seq,
    Y_train,
    batch_size=batch_size,
    epochs=epochs,
    verbose=verbose,
    callbacks=callbacks,
    validation_split=validation_split,
    class_weight =class_weight
    )

elapsed_time = time.time() - start_time
elapsed_time = time.strftime("%H:%M:%S", time.gmtime(elapsed_time))

print("\nElapsed Time: " + elapsed_time)
print("Completed Model Trainning", date_time(1))

Starting...

Timestamp: 2021-07-18 23:39:12


Compliling Model ...

Trainning Model ...

Train on 7067 samples, validate on 1767 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100


Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 00091: early stopping

Elapsed Time: 00:31:47
Completed Model Trainning Timestamp: 2021-07-19 00:10:59


In [91]:
plot_performance(history=history2)

KeyError: 'acc'

<Figure size 1440x360 with 0 Axes>

In [92]:
test_X_seq = tokenizer.texts_to_sequences(X_test)
test_X_seq = sequence.pad_sequences(test_X_seq, maxlen=max_len)
accuracy1 = model1.evaluate(test_X_seq, Y_test)
accuracy2 = model2.evaluate(test_X_seq, Y_test)



In [93]:
print("Model Performance of RNN (Test Accuracy):")
print('Accuracy: {:0.2f}%\nLoss: {:0.3f}\n'.format(accuracy1[1]*100, accuracy1[0]))

print("\nModel Performance of RNN (Test Accuracy):")
print('v: {:0.2f}%\nLoss: {:0.3f}\n'.format(accuracy2[1]*100, accuracy2[0]))

Model Performance of RNN (Test Accuracy):
Accuracy: 67.44%
Loss: 1.733


Model Performance of RNN (Test Accuracy):
v: 67.63%
Loss: 2.141



In [94]:
ypreds1 = model1.predict_classes(test_X_seq, verbose=1)
ypreds2 = model2.predict_classes(test_X_seq, verbose=1)



In [95]:
def plot_model_performace(result):
    sns.set_style("ticks")
    figsize=(22, 6)

    ticksize = 12
    titlesize = ticksize + 8
    labelsize = ticksize + 5

    xlabel = "Model"
    ylabel = "Score"

    title = "Model Performance"

    params = {'figure.figsize' : figsize,
              'axes.labelsize' : labelsize,
              'axes.titlesize' : titlesize,
              'xtick.labelsize': ticksize,
              'ytick.labelsize': ticksize}

    plt.rcParams.update(params)

    col1 = "model"
    col2 = "score"
    sns.barplot(x=col1, y=col2, data=result)
    plt.title(title.title())
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.xticks(rotation=90)
    plt.grid()
    plt.plot()
    plt.show()
    print(result)

In [96]:
plot_confusion_matrix(Y_test, ypreds1, title="RNN")

NameError: name 'plot_confusion_matrix' is not defined

In [97]:
result = pd.DataFrame({'model': 'RNN', 'score': accuracy1[1]*100}, index=[-1])
row2 = pd.DataFrame({'model': 'CNN', 'score': accuracy2[1]*100}, index=[-1])
result = pd.concat([row2, result.ix[:]]).reset_index(drop=True)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  This is separate from the ipykernel package so we can avoid doing imports until


In [98]:
plot_model_performace(result)

NameError: name 'sns' is not defined