In [56]:
# load libs
import pandas as pd
import numpy as np
import keras
import tensorflow
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense
from tensorflow.keras.callbacks import EarlyStopping
import os
import seaborn as sns
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [2]:
datafold = r"D:\data\text\emotions"
trainfile = "train.txt"
traindatapath = os.path.join(datafold,trainfile)

df = pd.read_csv(traindatapath, sep=';')
df.columns = ["text", "emotions"]
df.head()       

Unnamed: 0,text,emotions
0,i can go from feeling so hopeless to so damned...,sadness
1,im grabbing a minute to post i feel greedy wrong,anger
2,i am ever feeling nostalgic about the fireplac...,love
3,i am feeling grouchy,anger
4,ive been feeling a little burdened lately wasn...,sadness


In [3]:
texts = df["text"].tolist()
labels = df["emotions"].tolist()
texts[:5],labels[:5]

(['i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake',
  'im grabbing a minute to post i feel greedy wrong',
  'i am ever feeling nostalgic about the fireplace i will know that it is still on the property',
  'i am feeling grouchy',
  'ive been feeling a little burdened lately wasnt sure why that was'],
 ['sadness', 'anger', 'love', 'anger', 'sadness'])

In [4]:
# Tokenize the text data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
sequences[:1]

[[1,
  39,
  100,
  59,
  7,
  14,
  493,
  4,
  14,
  3495,
  552,
  31,
  59,
  60,
  127,
  147,
  75,
  1479,
  3,
  21,
  1254]]

In [5]:
max_length = max([len(seq) for seq in sequences])
padded_sequences = pad_sequences(sequences, maxlen=max_length)
padded_sequences

array([[   0,    0,    0, ...,    3,   21, 1254],
       [   0,    0,    0, ...,    2,  494,  437],
       [   0,    0,    0, ...,   29,    5, 3496],
       ...,
       [   0,    0,    0, ...,    3,  101, 1331],
       [   0,    0,    0, ...,  339,    8,   42],
       [   0,    0,    0, ...,   25, 3585,   12]])

In [6]:
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(labels)
labels

array([4, 0, 3, ..., 2, 0, 4], dtype=int64)

In [7]:
onehotlabels = keras.utils.to_categorical(labels)
onehotlabels

array([[0., 0., 0., 0., 1., 0.],
       [1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       ...,
       [0., 0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0.]], dtype=float32)

In [8]:
xtrain, xtest, ytrain, ytest = train_test_split(padded_sequences, onehotlabels, test_size=0.2)

In [9]:
tokenizer.word_index

{'i': 1,
 'feel': 2,
 'and': 3,
 'to': 4,
 'the': 5,
 'a': 6,
 'feeling': 7,
 'that': 8,
 'of': 9,
 'my': 10,
 'in': 11,
 'it': 12,
 'like': 13,
 'so': 14,
 'for': 15,
 'im': 16,
 'me': 17,
 'but': 18,
 'was': 19,
 'have': 20,
 'is': 21,
 'this': 22,
 'am': 23,
 'with': 24,
 'not': 25,
 'about': 26,
 'be': 27,
 'as': 28,
 'on': 29,
 'you': 30,
 'just': 31,
 'at': 32,
 'when': 33,
 'or': 34,
 'all': 35,
 'because': 36,
 'more': 37,
 'do': 38,
 'can': 39,
 'really': 40,
 'up': 41,
 't': 42,
 'are': 43,
 'by': 44,
 'very': 45,
 'know': 46,
 'been': 47,
 'if': 48,
 'out': 49,
 'myself': 50,
 'time': 51,
 'how': 52,
 'what': 53,
 'get': 54,
 'little': 55,
 'had': 56,
 'now': 57,
 'will': 58,
 'from': 59,
 'being': 60,
 'they': 61,
 'people': 62,
 'them': 63,
 'would': 64,
 'he': 65,
 'want': 66,
 'her': 67,
 'some': 68,
 'think': 69,
 'one': 70,
 'still': 71,
 'ive': 72,
 'him': 73,
 'even': 74,
 'who': 75,
 'an': 76,
 'life': 77,
 'its': 78,
 'make': 79,
 'there': 80,
 'we': 81,
 'bit': 82

In [10]:
numpossiblewords = len(tokenizer.word_index)
numpossiblewords

15212

In [11]:
numemotions = len(np.unique(labels))
numemotions

6

In [12]:
model = Sequential()
model.add(Embedding(input_dim=numpossiblewords+1,output_dim=128,input_length=max_length))
model.add(Flatten())
model.add(Dense(units=128,activation="relu"))
model.add(Dense(units=numemotions,activation="softmax"))
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 66, 128)           1947264   
                                                                 
 flatten (Flatten)           (None, 8448)              0         
                                                                 
 dense (Dense)               (None, 128)               1081472   
                                                                 
 dense_1 (Dense)             (None, 6)                 774       
                                                                 
Total params: 3,029,510
Trainable params: 3,029,510
Non-trainable params: 0
_________________________________________________________________


In [13]:
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

In [14]:
model.fit(xtrain, ytrain, epochs=20, batch_size=32, validation_data=(xtest, ytest),callbacks=[early_stopping])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20


<keras.callbacks.History at 0x24a044a48d0>

In [29]:
def getsentimentpred(txt):
    txt = "i am awesome"
    inpseq = tokenizer.texts_to_sequences([txt])
    padinpseq = pad_sequences(inpseq, maxlen=max_length)
    pred = model.predict(padinpseq)
    predemo = label_encoder.inverse_transform([(np.argmax(pred.flatten()))])
    return predemo

In [30]:
getsentimentpred("It brings a lot of negative emotions and this is quite normal.")



array(['anger'], dtype='<U8')

In [33]:
ypredtest = model.predict(xtest)



In [45]:
predemo = label_encoder.inverse_transform(np.argmax(ypredtest,axis=1))

In [44]:
actemo = label_encoder.inverse_transform(np.argmax(ytest,axis=1))

In [63]:
def getperfmetrics(y_actual,y_pred):
    
    accuracy = accuracy_score(y_actual, y_pred)
    precision = precision_score(y_actual, y_pred, average='macro')
    recall = recall_score(y_actual, y_pred, average='macro')  
    f1 = f1_score(y_actual, y_pred, average='macro')
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1-score:", f1)
    return pd.crosstab(actemo,predemo).rename_axis("pred", axis="columns").rename_axis("act", axis="index")

In [64]:
getperfmetrics(actemo,predemo)

Accuracy: 0.828125
Precision: 0.8007668790243962
Recall: 0.7496319923990943
F1-score: 0.7709775207845649


pred,anger,fear,joy,love,sadness,surprise
act,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
anger,318,11,31,9,64,2
fear,20,298,32,5,33,10
joy,10,11,940,44,39,8
love,1,3,69,193,6,3
sadness,21,22,38,3,843,0
surprise,0,27,18,6,4,58
