In [18]:
import pandas as pd
import numpy as np
import re
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from keras.models import load_model
import urllib.request
import zipfile
import os
from keras.models import Sequential
from keras.layers import Embedding,Bidirectional,LSTM,GRU,Dense
import nltk
from nltk.tokenize import word_tokenize
import warnings
import tensorflow as tf
import matplotlib
import matplotlib.pyplot as plt
# Fetch the NLTK 'punkt' data package that word_tokenize relies on
# (the call only reports "already up-to-date" when it is installed).
nltk.download('punkt')
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by pandas/Keras;
# consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')
# Author's rationale (presumably for choosing the neural model built below — TODO confirm):
# higher training accuracy compared to other learning models like SVM, Random Forest, Naive Bayes, et cetera.

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ousse\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Extract the sentences and their respective emotion from the "text;emotion"
# files and place them into the training and testing data-frames.
def _read_split(path):
    """Parse one dataset file into parallel lists of sentences and emotion labels.

    Each non-blank line is expected to look like ``<sentence>;<emotion>``.

    Parameters
    ----------
    path : str
        Path of the text file to read.

    Returns
    -------
    (list[str], list[str])
        The sentences and their labels, in file order.

    Raises
    ------
    ValueError
        If a non-blank line contains no ';' separator.
    """
    texts, emotions = [], []
    # `with` guarantees the handle is closed; utf8 is the encoding the
    # embedding file is read with, so results do not depend on the OS locale.
    with open(path, 'r', encoding='utf8') as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines such as a trailing newline
            # Split on the LAST ';' so a ';' inside the sentence is preserved
            # and the label is always the final field.
            text, emotion = line.rsplit(';', 1)
            texts.append(text)
            emotions.append(emotion.strip())
    return texts, emotions

x_train, y_train = _read_split('train.txt')
# The test set is the concatenation of test.txt and val.txt (in that order).
x_test, y_test = [], []
for held_out_path in ('test.txt', 'val.txt'):
    held_out_texts, held_out_labels = _read_split(held_out_path)
    x_test.extend(held_out_texts)
    y_test.extend(held_out_labels)
data_train=pd.DataFrame({'Text':x_train,'Emotion':y_train})
data_test=pd.DataFrame({'Text':x_test,'Emotion':y_test})
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
data=pd.concat([data_train,data_test],ignore_index=True)

In [3]:
# Light text clean-up applied before vectorisation, intended to help accuracy.
# Only hashtags and @-mentions are removed here; stop words, articles and
# punctuation are NOT filtered by this function.
def clean_text(data):
    """Remove hashtags and @-mentions from a sentence, then tokenize it.

    Parameters
    ----------
    data : str
        The raw sentence.

    Returns
    -------
    list
        The tokens produced by ``word_tokenize`` on the stripped sentence.
    """
    stripped = data
    # First drop '#tag' runs, then '@user' runs (word characters, digits, dots).
    for pattern in (r"(#[\d\w\.]+)", r"(@[\d\w\.]+)"):
        stripped = re.sub(pattern, '', stripped)
    return word_tokenize(stripped)
# Clean each split exactly once. The rows of `data` are the training rows
# followed by the test rows, so the combined corpus is the concatenation of the
# two cleaned splits, with no need to tokenise every sentence a second time.
texts_train=[' '.join(clean_text(text)) for text in x_train]
texts_test=[' '.join(clean_text(text)) for text in x_test]
texts=texts_train+texts_test

In [4]:
# Tokenization step: the Keras Tokenizer scans every cleaned sentence, collects
# the unique words and assigns each one an integer index (kept in word_index).
tokenizer=Tokenizer()
tokenizer.fit_on_texts(texts)
index_of_words=tokenizer.word_index
# One slot more than the number of known words, so that index 0 is also valid.
vocab_size=1+len(index_of_words)
# Turn both splits into lists of word indices using the fitted vocabulary.
sequence_train,sequence_test=(
    tokenizer.texts_to_sequences(split_texts)
    for split_texts in (texts_train,texts_test)
)

In [5]:
# Each emotion is assigned an integer id (0-5) through the `encoding`
# dictionary and is then one-hot encoded with `to_categorical`.
# Every sequence is padded/truncated to max_seq_len; the Embedding layer is
# built with the same length, so the two must stay in agreement.
embed_num_dims=300
max_seq_len=500
class_names=['anger','sadness','fear','joy','surprise','love']
# The number of classes is the number of emotions (6).
num_classes=len(class_names)
X_train_pad=pad_sequences(sequence_train,maxlen=max_seq_len)
X_test_pad=pad_sequences(sequence_test,maxlen=max_seq_len)
# Derive label -> id from class_names so the two can never fall out of order
# (class_names[id] is used later to decode predictions).
encoding={name:idx for idx,name in enumerate(class_names)}
y_train=[encoding[x] for x in data_train.Emotion]
y_test=[encoding[x] for x in data_test.Emotion]
# Fix the one-hot width explicitly: without num_classes the width is inferred
# from the largest label present, which breaks if a split lacks the last class.
y_train=to_categorical(y_train,num_classes=num_classes)
y_test=to_categorical(y_test,num_classes=num_classes)

In [6]:
# The model uses pre-trained word vectors (a file of about 1 million words
# built from Wikipedia/news text). Starting from trained word vectors lets the
# model train much better and improves accuracy.
def create_embedding_matrix(filepath,word_index,embedding_dim):
    """Build an embedding matrix for `word_index` from a text vector file.

    Each data line of the file is ``<word> <v1> <v2> ...`` separated by
    whitespace. Row ``word_index[word]`` of the result receives the first
    `embedding_dim` values of that word's vector; words that are absent from
    the file keep an all-zero row, as does row 0.

    Parameters
    ----------
    filepath : str
        Path to the UTF-8 encoded vector file.
    word_index : dict
        Mapping from word to its (positive) integer row index.
    embedding_dim : int
        Number of vector components to keep per word.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(word_index) + 1, embedding_dim)``.

    Raises
    ------
    ValueError
        If a vector component cannot be parsed as a float, or a stored vector
        has an incompatible length.
    """
    vocab_size=len(word_index)+1
    embedding_matrix=np.zeros((vocab_size,embedding_dim))
    with open(filepath, encoding="utf8") as f:
        for line_no,line in enumerate(f):
            fields=line.split()
            if not fields:
                continue  # blank line: nothing to unpack
            # .vec files start with a "<word_count> <dimension>" header. Parsing
            # it as a vector would broadcast its single value over a whole row
            # whenever that numeric token happens to be in word_index.
            if line_no==0 and len(fields)==2 and all(tok.isdigit() for tok in fields):
                continue
            word,*vector=fields
            if word in word_index:
                idx=word_index[word]
                embedding_matrix[idx] = np.array(vector,dtype=np.float32)[:embedding_dim]
    return embedding_matrix
# Vector file to load, and the resulting matrix with one row per vocabulary index.
fname='wiki-news-300d-1M.vec'
embedd_matrix=create_embedding_matrix(
    filepath=fname,
    word_index=index_of_words,
    embedding_dim=embed_num_dims,
)

In [7]:
# Architecture used to train the model:
#  - Embedding layer: turns positive integer word indices into dense vectors.
#    It is initialised from the pre-trained matrix and frozen (trainable=False).
#  - Bidirectional GRU: runs the recurrent layer in both directions, so the
#    output can use information from past (backward) and future (forward)
#    states simultaneously (bidirectional recurrent neural network, BRNN).
#  - Dense layer: fully connected, it receives input from every unit of the
#    previous layer and emits one softmax probability per emotion class.
gru_output_size=128
# NOTE(review): this flag is assigned but not read anywhere in this cell.
bidirectional=True
embedd_layer=Embedding(
    vocab_size,
    embed_num_dims,
    input_length=max_seq_len,
    weights=[embedd_matrix],
    trainable=False,
)
model=Sequential([
    embedd_layer,
    Bidirectional(GRU(units=gru_output_size,dropout=0.2,recurrent_dropout=0.2)),
    Dense(num_classes, activation='softmax'),
])
# Adam is used as the optimizer and the loss is categorical cross-entropy.
model.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 500, 300)          5128500   
                                                                 
 bidirectional (Bidirectiona  (None, 256)              330240    
 l)                                                              
                                                                 
 dense (Dense)               (None, 6)                 1542      
                                                                 
Total params: 5,460,282
Trainable params: 331,782
Non-trainable params: 5,128,500
_________________________________________________________________


In [8]:
# The batch size is the number of samples processed before each model update.
# The epoch count is kept low (at most 8) so that the model does not over-fit.
# Training is executed here rather than left commented out: the prediction
# cells further down call model.predict, and no other cell fits or loads
# weights, so a fresh "Restart & Run All" would otherwise predict with an
# unfitted model.
# NOTE(review): validation_data is the test split, so the validation metrics
# are not an independent estimate of test performance.
batch_size=128
epochs=5
hist=model.fit(X_train_pad,y_train,batch_size=batch_size,epochs=epochs,validation_data=(X_test_pad,y_test))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [22]:
# To report the actual emotion, the predicted class index has to be mapped back
# to its label, because the labels were converted to integers earlier on.
message=['I am happy.']
# Apply the same clean_text + join step the training sentences went through, so
# the tokenizer sees text in the form it was fitted on.
seq=tokenizer.texts_to_sequences([' '.join(clean_text(text)) for text in message])
padded=pad_sequences(seq,maxlen=max_seq_len)
pred=model.predict(padded)
print('Message:'+str(message))
# argmax along the class axis yields one label per message; a bare np.argmax
# flattens the (n_messages, n_classes) array and is only right for one message.
print('Emotion:',*[class_names[idx] for idx in np.argmax(pred,axis=1)])

Message:['I am happy.']
Emotion: joy


In [23]:
message=['I am so emotional.']
# Apply the same clean_text + join step the training sentences went through, so
# the tokenizer sees text in the form it was fitted on.
seq=tokenizer.texts_to_sequences([' '.join(clean_text(text)) for text in message])
padded=pad_sequences(seq,maxlen=max_seq_len)
pred=model.predict(padded)
print('Message:'+str(message))
# argmax along the class axis yields one label per message; a bare np.argmax
# flattens the (n_messages, n_classes) array and is only right for one message.
print('Emotion:',*[class_names[idx] for idx in np.argmax(pred,axis=1)])

Message:['I am so emotional.']
Emotion: sadness


In [11]:
message=['I not rich but i live like a millionnaire.']
# Apply the same clean_text + join step the training sentences went through, so
# the tokenizer sees text in the form it was fitted on.
seq=tokenizer.texts_to_sequences([' '.join(clean_text(text)) for text in message])
padded=pad_sequences(seq,maxlen=max_seq_len)
pred=model.predict(padded)
print('Message:'+str(message))
# argmax along the class axis yields one label per message; a bare np.argmax
# flattens the (n_messages, n_classes) array and is only right for one message.
print('Emotion:',*[class_names[idx] for idx in np.argmax(pred,axis=1)])

Message:['I not rich but i live like a millionnaire.']
Emotion: joy


In [12]:
message=['really ! he did that ?.']
# Apply the same clean_text + join step the training sentences went through, so
# the tokenizer sees text in the form it was fitted on.
seq=tokenizer.texts_to_sequences([' '.join(clean_text(text)) for text in message])
padded=pad_sequences(seq,maxlen=max_seq_len)
pred=model.predict(padded)
print('Message:'+str(message))
# argmax along the class axis yields one label per message; a bare np.argmax
# flattens the (n_messages, n_classes) array and is only right for one message.
print('Emotion:',*[class_names[idx] for idx in np.argmax(pred,axis=1)])

Message:['really ! he did that ?.']
Emotion: anger


In [13]:
message=['im shocked.']
# Apply the same clean_text + join step the training sentences went through, so
# the tokenizer sees text in the form it was fitted on.
seq=tokenizer.texts_to_sequences([' '.join(clean_text(text)) for text in message])
padded=pad_sequences(seq,maxlen=max_seq_len)
pred=model.predict(padded)
print('Message:'+str(message))
# argmax along the class axis yields one label per message; a bare np.argmax
# flattens the (n_messages, n_classes) array and is only right for one message.
print('Emotion:',*[class_names[idx] for idx in np.argmax(pred,axis=1)])

Message:['im shocked.']
Emotion: surprise


In [14]:
message=['i cant beleave it im so glad']
# Apply the same clean_text + join step the training sentences went through, so
# the tokenizer sees text in the form it was fitted on.
seq=tokenizer.texts_to_sequences([' '.join(clean_text(text)) for text in message])
padded=pad_sequences(seq,maxlen=max_seq_len)
pred=model.predict(padded)
print('Message:'+str(message))
# argmax along the class axis yields one label per message; a bare np.argmax
# flattens the (n_messages, n_classes) array and is only right for one message.
print('Emotion:',*[class_names[idx] for idx in np.argmax(pred,axis=1)])

Message:['i cant beleave it im so glad']
Emotion: joy
