In [101]:
import re 
import nltk 
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer ## stemming 
from nltk.stem import WordNetLemmatizer  ## lemmatization
import joblib,os 
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences 
from sklearn.preprocessing import LabelEncoder ,OneHotEncoder
from sklearn.model_selection import train_test_split



In [102]:
## Reading data from the text files 

train_data = open("./Data/train.txt").readlines()
val_data = open("./Data/val.txt").readlines()
test_data = open("./Data/test.txt").readlines()


In [103]:
print(len(train_data))
print(len(val_data))
print(len(test_data))


16000
2000
2000


In [104]:
combined_data = train_data + test_data + val_data 
print(len(combined_data))

20000


In [105]:
x=[]
y=[]
for temp in combined_data:
    complete = temp.split(';')
    if len(complete)==2:
        x.append(complete[0])
        y.append(complete[1].strip())

In [106]:
labels=[]
for item in y:
        if item not in labels:
            labels.append(item)
labels 

['sadness', 'anger', 'love', 'surprise', 'fear', 'joy']

text cleaning function applying on each and every raw message 

In [107]:
def text_cleaning(sentences,stemming):
    cleaned_data = []
    for sentence in sentences: 
        message = sentence.lower()
        message = re.sub('[^a-z0-9 ]',"",message)
        ls_of_words = nltk.word_tokenize(message)  #words is splited in sentences
        ls_of_word_without_stopwords = [word for word in ls_of_words if word not in stopwords.words('english')]  #stopwords are i ,am
        stemmed_words = [stemming.stem(word) for word in ls_of_word_without_stopwords ] 
        message = " ".join(stemmed_words)
        cleaned_data.append(message)
    return cleaned_data

stem = PorterStemmer()
cleaned_data = text_cleaning(x,stemming=stem)


In [149]:
stopwords.words('english')


['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [150]:
cleaned_data

['didnt feel humili',
 'go feel hopeless damn hope around someon care awak',
 'im grab minut post feel greedi wrong',
 'ever feel nostalg fireplac know still properti',
 'feel grouchi',
 'ive feel littl burden late wasnt sure',
 'ive take milligram time recommend amount ive fallen asleep lot faster also feel like funni',
 'feel confus life teenag jade year old man',
 'petrona year feel petrona perform well made huge profit',
 'feel romant',
 'feel like make suffer see mean someth',
 'feel run divin experi expect type spiritu encount',
 'think easiest time year feel dissatisfi',
 'feel low energi thirsti',
 'immens sympathi gener point possibl proto writer tri find time write corner life sign agent let alon publish contract feel littl preciou',
 'feel reassur anxieti side',
 'didnt realli feel embarrass',
 'feel pretti pathet time',
 'start feel sentiment doll child began collect vintag barbi doll sixti',
 'feel compromis skeptic valu everi unit work put',
 'feel irrit reject without an

In [108]:
os.makedirs('models',exist_ok=True)
joblib.dump(cleaned_data,"./models/cleaned_data.lb")
print("successfully saved your cleaned data!")

successfully saved your cleaned data!


In [109]:
## trying  on a sinlgle message 
ls_of_stopwords = stopwords.words('english')
print(x[1])
single_message = x[1].lower()
single_message = re.sub('[^a-z0-9 ]',"",single_message)
ls_of_words = nltk.word_tokenize(single_message)  # single_message.split()
ls_of_word_without_stopwords = []
for word in ls_of_words: 
    if word not in ls_of_stopwords:
        ls_of_word_without_stopwords.append(word)
print(ls_of_word_without_stopwords)
stem = PorterStemmer()
stemmed_words = []
for word in ls_of_word_without_stopwords: 
    stemmed_words.append(stem.stem(word))
print(stemmed_words)
" ".join(stemmed_words)

i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake
['go', 'feeling', 'hopeless', 'damned', 'hopeful', 'around', 'someone', 'cares', 'awake']
['go', 'feel', 'hopeless', 'damn', 'hope', 'around', 'someon', 'care', 'awak']


'go feel hopeless damn hope around someon care awak'

In [110]:
## Tokenaization 
tokenizer = Tokenizer(oov_token="<nothing>")  #out of vocalabaury
tokenizer.fit_on_texts(cleaned_data)
joblib.dump(tokenizer,'./models/tokenizer.lb')
print("successfully saved your tokenizer at this location : './models/tokenizer.lb'")

successfully saved your tokenizer at this location : './models/tokenizer.lb'


In [111]:
tokenizer.word_index

{'<nothing>': 1,
 'feel': 2,
 'like': 3,
 'im': 4,
 'get': 5,
 'time': 6,
 'know': 7,
 'realli': 8,
 'make': 9,
 'go': 10,
 'want': 11,
 'love': 12,
 'littl': 13,
 'think': 14,
 'peopl': 15,
 'day': 16,
 'thing': 17,
 'one': 18,
 'would': 19,
 'even': 20,
 'still': 21,
 'ive': 22,
 'life': 23,
 'bit': 24,
 'way': 25,
 'need': 26,
 'someth': 27,
 'much': 28,
 'dont': 29,
 'work': 30,
 'start': 31,
 'could': 32,
 'say': 33,
 'look': 34,
 'see': 35,
 'tri': 36,
 'back': 37,
 'good': 38,
 'pretti': 39,
 'come': 40,
 'right': 41,
 'alway': 42,
 'help': 43,
 'also': 44,
 'today': 45,
 'year': 46,
 'take': 47,
 'friend': 48,
 'use': 49,
 'around': 50,
 'cant': 51,
 'person': 52,
 'made': 53,
 'though': 54,
 'hate': 55,
 'well': 56,
 'got': 57,
 'happi': 58,
 'thought': 59,
 'someon': 60,
 'didnt': 61,
 'never': 62,
 'felt': 63,
 'find': 64,
 'write': 65,
 'lot': 66,
 'hope': 67,
 'quit': 68,
 'live': 69,
 'week': 70,
 'everi': 71,
 'sure': 72,
 'less': 73,
 'read': 74,
 'enough': 75,
 'give':

In [112]:
tokenizer.word_counts  

OrderedDict([('didnt', 334),
             ('feel', 21204),
             ('humili', 69),
             ('go', 1101),
             ('hopeless', 81),
             ('damn', 54),
             ('hope', 320),
             ('around', 382),
             ('someon', 335),
             ('care', 254),
             ('awak', 22),
             ('im', 3055),
             ('grab', 22),
             ('minut', 79),
             ('post', 234),
             ('greedi', 79),
             ('wrong', 152),
             ('ever', 260),
             ('nostalg', 63),
             ('fireplac', 3),
             ('know', 1192),
             ('still', 743),
             ('properti', 5),
             ('grouchi', 35),
             ('ive', 723),
             ('littl', 932),
             ('burden', 95),
             ('late', 167),
             ('wasnt', 119),
             ('sure', 310),
             ('take', 403),
             ('milligram', 1),
             ('time', 1215),
             ('recommend', 19),
             ('amoun

In [113]:
tokenized_data = tokenizer.texts_to_sequences(cleaned_data)

In [151]:
tokenized_data

[[61, 2, 522],
 [10, 2, 419, 682, 67, 50, 60, 96, 1229],
 [4, 1230, 431, 107, 2, 432, 192],
 [92, 2, 592, 3696, 7, 21, 2844],
 [2, 918],
 [22, 2, 13, 343, 170, 260, 72],
 [22, 47, 6002, 6, 1347, 712, 22, 2375, 1348, 66, 1610, 44, 2, 3, 355],
 [2, 339, 23, 1174, 713, 46, 196, 298],
 [4452, 46, 2, 4452, 897, 56, 53, 887, 2845],
 [2, 550],
 [2, 3, 9, 373, 35, 139, 27],
 [2, 134, 508, 240, 337, 449, 1199, 1685],
 [14, 4453, 6, 46, 2, 523],
 [2, 299, 371, 3697],
 [1762,
  1499,
  106,
  154,
  308,
  6003,
  958,
  36,
  64,
  6,
  65,
  1349,
  23,
  1142,
  2846,
  93,
  137,
  1350,
  1500,
  2,
  13,
  472],
 [2, 638, 919, 377],
 [61, 8, 2, 325],
 [2, 39, 496, 6],
 [31, 2, 786, 2082, 311, 514, 1269, 2083, 3698, 2082, 4454],
 [2, 2376, 759, 320, 71, 2377, 30, 111],
 [2, 160, 380, 90, 182, 80, 33, 80],
 [2,
  130,
  147,
  148,
  2217,
  43,
  2,
  787,
  1868,
  123,
  1351,
  1118,
  1017,
  143,
  103,
  524,
  959,
  17,
  639],
 [2, 760, 714],
 [113, 43, 4455, 4456, 89, 1143, 214, 2,

In [114]:
len(tokenized_data)   #same as x data

20000

In [115]:
cleaned_data[0]

'didnt feel humili'

In [116]:
tokenized_data[0]

[61, 2, 522]

In [117]:
# tokenizer.word_index

In [118]:
len_of_message = []
for i in range(len(tokenized_data)):
    len_of_message.append(len(tokenized_data[i]))
print("Your maximum length is : ",max(len_of_message))
# maximum length  

Your maximum length is :  35


In [119]:

print("Your maximum length is : ",max(list(map(len,tokenized_data))))

Your maximum length is :  35


In [120]:
sequences = pad_sequences(tokenized_data,maxlen=35,padding='post')   #for making same length of all the sentence

In [121]:
sequences

array([[  61,    2,  522, ...,    0,    0,    0],
       [  10,    2,  419, ...,    0,    0,    0],
       [   4, 1230,  431, ...,    0,    0,    0],
       ...,
       [   2,  194,  157, ...,    0,    0,    0],
       [ 328,    2,  175, ...,    0,    0,    0],
       [   2,    3,  916, ...,    0,    0,    0]])

In [122]:
labels=[]
for item in y:
        if item not in labels:
            labels.append(item)
label_dict = {label:i for i , label in enumerate(labels)}
label_dict


{'sadness': 0, 'anger': 1, 'love': 2, 'surprise': 3, 'fear': 4, 'joy': 5}

In [123]:
from sklearn.preprocessing import LabelEncoder ,OneHotEncoder
# automation 

In [124]:
label_encoder = LabelEncoder()
Y = label_encoder.fit_transform(y)

In [125]:
Y

array([4, 4, 0, ..., 2, 2, 2], dtype=int64)

In [126]:
label_encoder.classes_

array(['anger', 'fear', 'joy', 'love', 'sadness', 'surprise'], dtype='<U8')

In [127]:
joblib.dump(label_encoder,'./models/label_encoder.lb')

['./models/label_encoder.lb']

In [128]:
sequences    # <====  X DATA  cleaned data 
Y            # <====  Y DATA label 


array([4, 4, 0, ..., 2, 2, 2], dtype=int64)

In [129]:
## train test splitting 
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(sequences,Y,test_size=0.15)

In [130]:
## Model training
## Model define  
# ANN , CNN , RNN  
### logistic regression ,
### DTC , RDC , 
## machine learning algorithm   | ANN   | RNN 



In [131]:
# Define the model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM ,Dense 
# dense is hidden layer,lstm (long short term memory)
model = Sequential([
    LSTM(units=64, input_shape=(35, 1), return_sequences=True),
    LSTM(units=64),
    Dense(units=6, activation='softmax')
])
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

  super().__init__(**kwargs)


In [132]:
history=model.fit(x_train,y_train,epochs=5,validation_data=(x_test,y_test))

Epoch 1/5
[1m532/532[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 59ms/step - accuracy: 0.3349 - loss: 1.5933 - val_accuracy: 0.3373 - val_loss: 1.5943
Epoch 2/5
[1m532/532[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 57ms/step - accuracy: 0.3338 - loss: 1.5750 - val_accuracy: 0.3283 - val_loss: 1.5889
Epoch 3/5
[1m532/532[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 56ms/step - accuracy: 0.3379 - loss: 1.5693 - val_accuracy: 0.3440 - val_loss: 1.5877
Epoch 4/5
[1m532/532[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 52ms/step - accuracy: 0.3436 - loss: 1.5723 - val_accuracy: 0.3373 - val_loss: 1.5872
Epoch 5/5
[1m532/532[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 50ms/step - accuracy: 0.3408 - loss: 1.5696 - val_accuracy: 0.3457 - val_loss: 1.5856


In [133]:
# from sklearn.ensemble import RandomForestClassifier
# obj=RandomForestClassifier()
# obj.fit(x_train,y_train)

In [134]:
prediction=model.predict(x_test)

[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 35ms/step


In [135]:
prediction

array([[0.13823165, 0.12843555, 0.32051268, 0.07772487, 0.30117735,
        0.0339179 ],
       [0.1358217 , 0.12897684, 0.3314129 , 0.08516959, 0.28355682,
        0.03506207],
       [0.1347226 , 0.12905218, 0.3382727 , 0.09036013, 0.27153406,
        0.03605823],
       ...,
       [0.13833886, 0.12842363, 0.32009682, 0.07748155, 0.3017668 ,
        0.03389242],
       [0.13911197, 0.12818512, 0.31711254, 0.07554185, 0.3064348 ,
        0.03361369],
       [0.14050849, 0.1277648 , 0.31223553, 0.07257504, 0.31368172,
        0.0332345 ]], dtype=float32)

In [136]:
y_test[0]

4

In [137]:
np.argmax(prediction,axis=0)

array([1589, 1742, 1669, 1088, 1589, 2220], dtype=int64)

In [138]:
prediction.argmax()

10016

In [139]:
prediction=np.argmax(prediction,axis=1)

In [140]:
prediction

array([2, 2, 2, ..., 2, 2, 4], dtype=int64)

In [141]:
y_test

array([4, 1, 4, ..., 4, 4, 2], dtype=int64)

In [142]:
df=pd.DataFrame({"actual":y_test,"prediction":prediction})

In [143]:
label_dictionary={value:key for key,value in label_dict.items()}
label_dictionary

{0: 'sadness', 1: 'anger', 2: 'love', 3: 'surprise', 4: 'fear', 5: 'joy'}

In [144]:
df["actual"]=df["actual"].map(label_dictionary)
df["prediction"]=df["prediction"].map(label_dictionary)

In [145]:
df.head(40)

Unnamed: 0,actual,prediction
0,fear,love
1,anger,love
2,fear,love
3,anger,love
4,love,love
5,love,love
6,fear,love
7,love,love
8,fear,love
9,love,love


In [146]:
### using RNN 
# sequential SimpleRNN model defining
from tensorflow.keras.layers import SimpleRNN
model = Sequential()
model.add(SimpleRNN(32,input_shape=(35,1),return_sequences=False))
model.add(Dense(6,activation='softmax'))



## compiling the model
model.compile(optimizer='adam',loss='sparse_categorical_crossentropy',metrics=['accuracy'])
model.summary()
# return_sequence = False  --> means if you don,t want to stack again this model then specify False

  super().__init__(**kwargs)


In [148]:
import os
os.makedirs("models",exist_ok=True)
model.save("/models/lstm_model.h5")



In [None]:
# to load the tensorflow model
from tensorflow.keras.models import load_model
loaded_model=load_model("/models/lstm_model.h5")

In [None]:
loaded_model