# <center> Emotion Recognition using BiLSTM </center>
    

This notebook aims to classify text into six emotions (sadness, anger, love, surprise, fear, joy) using a bidirectional LSTM (BiLSTM).

# Importing the Libraries

In [1]:
!pip install nlp
!pip install datasets

import tensorflow as tf
import numpy as np
import pandas as pd
from wordcloud import WordCloud
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nlp
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Dropout
from keras.layers import LSTM
from keras.models import Sequential
from keras.layers import Embedding
from keras.layers import Flatten
from keras.layers import Bidirectional
from keras.callbacks import EarlyStopping
from keras.layers import GlobalAvgPool1D
import random

import os
# Walk the Kaggle input directory and print the full path of every file found
for root, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root, file_name))

# Importing the Dataset

**Detailed dataset info: https://huggingface.co/datasets/emotion**

In [2]:
# Fetch the "emotion" dataset through the `nlp` package
import os
data = nlp.load_dataset('emotion')

# Turn each split (train, validation, test) into its own pandas DataFrame
train, validation, test = (
    pd.DataFrame(data[split_name]) for split_name in ('train', 'validation', 'test')
)

# Data Cleaning

In [3]:
# Count the missing values in each column of the training split
train.isna().sum()

# Data Analysis

In [4]:
# Preview the first 10 rows of the training split
train.head(10)

In [5]:
# List the distinct values that occur in the training split's `label` column
train['label'].unique()

In [42]:
# Preview the first 10 rows of the test split
test.head(10)

# Distribution of the Length of the Texts

In [6]:
# Number of pieces each text yields when split on single spaces
train['length_of_text'] = train['text'].apply(lambda sentence: len(sentence.split(' ')))

# Histogram of the text lengths with a box plot drawn in the margin
fig = px.histogram(
    train['length_of_text'],
    marginal='box',
    labels={"value": "Length of the Text"},
)

# Black outlines around the bars, then a centered title
fig.update_traces(marker={'line': {'color': '#000000', 'width': 2}})
fig.update_layout(
    title_text='Distribution of the Length of the Texts',
    title_x=0.5,
    title_font={'size': 22},
)
fig.show()

**The text lengths are distributed between 4 and 46 words. The outliers start from 48 words.**

# Distribution of the Length of the Texts by Emotions


In [7]:
# Same length histogram as before, now colored by the emotion label of each text
fig = px.histogram(
    train['length_of_text'],
    color=train['label'],
    marginal='box',
    labels={"value": "Length of the Text"},
)

# Black outlines around the bars, then a centered title
fig.update_traces(marker={'line': {'color': '#000000', 'width': 2}})
fig.update_layout(
    title_text='Distribution of the Length of the Texts by Emotions',
    title_x=0.5,
    title_font={'size': 22},
)
fig.show()

# Distribution of the Labels


In [8]:
# One bar per label value, each label drawn in its own color
fig = px.histogram(train, x='label', color='label')

# Black outlines around the bars, then a centered title
fig.update_traces(marker={'line': {'color': '#000000', 'width': 2}})
fig.update_layout(
    title_text='Distribution of the Labels',
    title_x=0.5,
    title_font={'size': 22},
)
fig.show()

# Frequency of the Words in the Train Dataset


In [9]:
# Count how often each whitespace-separated token occurs across all training texts
FreqOfWords = train['text'].str.split(expand=True).stack().value_counts()
FreqOfWords_top200 = FreqOfWords[:200]

# Hand plotly a frame with explicitly named columns. Addressing the counts as
# column `0` only works while the Series is unnamed; pandas >= 2.0 names the
# result of value_counts() "count", so the positional column name is unreliable.
top_words = FreqOfWords_top200.rename_axis('word').reset_index(name='count')

fig = px.treemap(top_words, path=['word'], values='count')
fig.update_layout(title_text='Frequency of the Words in the Train Dataset',
                  title_x=0.5, title_font=dict(size=22)
                  )
# Show both the word and its count inside each tile
fig.update_traces(textinfo="label+value")
fig.show()

**According to the graph above, the most frequent words include stopwords such as "i", "and", "to", etc. I will remove them in the following steps.**

# Tokenizing with NLTK


In [10]:
def tokenization(inputs):
    """Split a text into a list of tokens using NLTK's `word_tokenize`."""
    tokens = word_tokenize(inputs)  # REFERENCE[1]
    return tokens


# Tokenize the raw text of the training and validation splits
for frame in (train, validation):
    frame['text_tokenized'] = frame['text'].apply(tokenization)

In [11]:
# Show the first rows of the training split, now including `text_tokenized`
train.head()

**By using tokenization, I split each data point into words. Tokenization is one of the key steps for NLP applications.**

# Stopwords Removal


**As we have seen from the Frequency of the Words in the Train Dataset visualization, the most frequent words were the English stopwords such as "i", "you", "their", "to", etc. In this step, we will remove these words from the entire dataset by using the NLTK library.**

In [12]:
# English stopword list from NLTK, stored as a set for membership checks
stop_words = set(stopwords.words('english'))


def stopwords_remove(inputs):
    """Return the tokens of `inputs` that are not in `stop_words`, in their original order."""
    return list(filter(lambda token: token not in stop_words, inputs))


# Drop stopwords from the tokenized training and validation texts
for frame in (train, validation):
    frame['text_stop'] = frame['text_tokenized'].apply(stopwords_remove)

train.head()

**According to the first 5 rows of the train dataset, it is obvious that we achieved our goal.**

# Lemmatization

**Lemmatization is the process of grouping together the different inflected forms of a word so they can be analysed as a single item. Lemmatization is similar to stemming but it brings context to the words. So it links words with similar meaning to one word [2].** 

In [13]:
lemmatizer = WordNetLemmatizer()


def lemmatization(inputs):
    """Lemmatize every token in `inputs`, treating each one as a verb (pos='v')."""
    lemmas = []
    for token in inputs:
        lemmas.append(lemmatizer.lemmatize(word=token, pos='v'))
    return lemmas


# Lemmatize the stopword-free tokens of the training and validation splits
for frame in (train, validation):
    frame['text_lemmatized'] = frame['text_stop'].apply(lemmatization)

train.head()

# Joining Tokens into Sentences


In [14]:
# Glue the lemmatized tokens of each row back into one space-separated string
for frame in (train, validation):
    frame['text_cleaned'] = frame['text_lemmatized'].str.join(' ')

# Final form of the dataset
train.head()

# WordCloud of the Cleaned Dataset


In [15]:
# Build the word cloud from the complete cleaned corpus.
# - The result is bound to `word_cloud` rather than `WordCloud`, so the imported
#   class is not overwritten and the cell can be re-run.
# - The texts are joined into one string; `str(series)` would only yield the
#   truncated Series repr (index numbers, "...", Name/dtype footer), not the data.
word_cloud = WordCloud(max_words=100,
                       random_state=30,
                       collocations=True).generate(' '.join(train['text_cleaned']))

plt.figure(figsize=(15, 8))
plt.imshow(word_cloud, interpolation='bilinear')
plt.axis("off")
plt.show()

# Tokenizing with Tensorflow


In [16]:
# Vocabulary size passed to the tokenizer (and reused by the embedding layer)
num_words = 15000

# Tokenizer configured with '<OOV>' as its out-of-vocabulary token,
# fitted on the cleaned training texts only
tokenizer = Tokenizer(oov_token='<OOV>', num_words=num_words)
tokenizer.fit_on_texts(train['text_cleaned'])

# Word -> integer index mapping learned from the training texts
word_index = tokenizer.word_index

In [17]:
# Convert the cleaned training and validation texts into lists of integer ids
Tokenized_train, Tokenized_val = (
    tokenizer.texts_to_sequences(frame['text_cleaned']) for frame in (train, validation)
)

**Some Tokenization Examples from the Dataset**

In [18]:
# Print a few cleaned texts next to their integer-sequence encoding.
# Every example shows the same `text_cleaned` string that is fed to the tokenizer,
# so the two printed lines always correspond to each other.
example_rows = (0, 10, 100)
for position, row in enumerate(example_rows):
    if position > 0:
        print('--'*50)  # separator between consecutive examples
    cleaned_text = train['text_cleaned'][row]
    print('Non-tokenized Version: ', cleaned_text)
    print('Tokenized Version: ', tokenizer.texts_to_sequences([cleaned_text]))

# Padding

In [19]:
# Length that every sequence is padded (at the front) or cut to
maxlen = 50
Padded_train, Padded_val = (
    pad_sequences(sequences, maxlen=maxlen, padding='pre')
    for sequences in (Tokenized_train, Tokenized_val)
)

# Show two training examples before and after padding
for position, row in enumerate((0, 10)):
    if position > 0:
        print('--'*50)
    print('Non-padded Version: ', tokenizer.texts_to_sequences([train['text_cleaned'][row]]))
    print('Padded Version: ', Padded_train[row])


# Creating the Model


In [20]:
# Stacked bidirectional-LSTM classifier over the padded token-id sequences.
model = Sequential()

# Token ids -> 300-dimensional embeddings, one vector per position.
# No pooling is applied here: the recurrent layers below need the full sequence.
model.add(Embedding(num_words, 300, input_length=maxlen))

# Each BiLSTM layer is added to the model explicitly; constructing the layer
# without `model.add` would leave it out of the network entirely.
# The first two return the whole sequence so the next LSTM can consume it.
model.add(Bidirectional(LSTM(128, return_sequences=True, activation='tanh')))
model.add(Dropout(0.2))

model.add(Bidirectional(LSTM(256, activation='tanh', return_sequences=True)))
model.add(Dropout(0.2))

# The last recurrent layer returns only its final output, one vector per text
model.add(Bidirectional(LSTM(128, activation='tanh', return_sequences=False)))
model.add(Dropout(0.2))

# One softmax unit per emotion class
model.add(Dense(6, activation='softmax'))

# Integer class labels -> sparse categorical cross-entropy
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()

# Training the Model


In [21]:
# Emotion name -> integer class id used as the training target
label_ = dict(sadness=0, joy=1, love=2, anger=3, fear=4, surprise=5)

# Swap the string labels of the training and validation splits for their ids
for frame in (train, validation):
    frame['label'] = frame['label'].replace(label_)

train.head()

In [22]:
# Early stopping on validation accuracy with a patience of 5 epochs;
# restore_best_weights=True is set so the best weights are kept
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_accuracy',
    mode='auto',
    patience=5,
    restore_best_weights=True,
)

# Upper bound on the number of epochs passed to fit()
epochs = 100
hist = model.fit(
    Padded_train,
    train['label'],
    validation_data=(Padded_val, validation['label']),
    epochs=epochs,
    callbacks=[early_stopping],
)

# Train and Validation Loss Graphs


In [23]:
# Training and validation loss per epoch, taken from the fit history
plt.figure(figsize=(15, 8))
for history_key, legend_text in (('loss', 'Train Loss'), ('val_loss', 'Validation Loss')):
    plt.plot(hist.history[history_key], label=legend_text)
plt.title('Train and Validation Loss Graphs')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

# Preparing the Test Data


In [24]:
# Run the test split through the same cleaning steps as the training data:
# tokenize -> remove stopwords -> lemmatize, each step reading the previous column
for source_col, target_col, step in (
    ('text', 'text_tokenized', tokenization),
    ('text_tokenized', 'text_stop', stopwords_remove),
    ('text_stop', 'text_lemmatized', lemmatization),
):
    test[target_col] = test[source_col].apply(step)
test['text_cleaned'] = test['text_lemmatized'].str.join(' ')

# Encode with the tokenizer fitted on the training texts, then pad to `maxlen`
Tokenized_test = tokenizer.texts_to_sequences(test['text_cleaned'])
Padded_test = pad_sequences(Tokenized_test, padding='pre', maxlen=maxlen)

# Apply the current `label_` mapping to the test labels
test['label'] = test['label'].replace(label_)

# Evaluate the model on the padded test sequences and keep the returned metrics
test_evaluate = model.evaluate(Padded_test, test['label'])

# Making Predictions in the Test Data


In [25]:
# Show the first rows of the test split after preprocessing
test.head()

In [26]:
def make_predictions(text_input):
    """Predict the emotion of a single text and print it.

    The input is converted to `str`, then passed through the notebook's
    preprocessing chain (tokenization, stopword removal, lemmatization,
    re-joining), encoded with `tokenizer`, pre-padded to `maxlen` and fed to
    `model`.

    Parameters
    ----------
    text_input : object
        The text to classify; it is converted with `str()` first.

    Returns
    -------
    The index of the highest model output, as returned by `np.argmax`.
    A line of the form 'Predicted Emotion: <name>' is printed as a side effect.
    """
    tokens = lemmatization(stopwords_remove(tokenization(str(text_input))))
    sequences = tokenizer.texts_to_sequences([' '.join(tokens)])
    padded = pad_sequences(sequences, maxlen=maxlen, padding='pre')
    class_index = np.argmax(model.predict(padded))

    # Indices 0-4 have explicit names; anything else is reported as Surprise
    emotion_by_index = {0: 'Sadness', 1: 'Joy', 2: 'Love', 3: 'Anger', 4: 'Fear'}
    print('Predicted Emotion: ' + emotion_by_index.get(class_index, 'Surprise'))
    return class_index

# Map the integer class ids back to display names so the actual emotion prints as text
label_ = {0: "Sadness", 1: "Joy", 2: "Love", 3: "Anger", 4: "Fear", 5: "Surprise"}
test['label'] = test['label'].replace(label_)

# Pick a random test row together with the row right after it.
# The upper bound is len(test) - 2 so that i + 1 never runs past the last row
# (with len(test) - 1 the lookup of row i + 1 could raise a KeyError).
i = random.randint(0, len(test) - 2)

print('Test Text:', test['text'][i])
print(' ')
print('Actual Emotion:', test['label'][i])
make_predictions(test['text'][i])
print('-'*50)
print('Test Text:', test['text'][i+1])
print(' ')
print('Actual Emotion:', test['label'][i+1])
make_predictions(test['text'][i+1])

# Confusion Matrix of the Test Data

In [27]:
from sklearn.metrics import confusion_matrix

# Convert the display names back to integer class ids (no-op for values that
# are already integers)
label_ = {"Sadness": 0, "Joy": 1, "Love": 2, "Anger": 3, "Fear": 4, "Surprise": 5}
test['label'] = test['label'].replace(label_)

# Fixed, ordered list of class ids. Passing it as `labels` keeps the matrix 6x6
# even when some class is absent from the labels or from the predictions.
class_ids = sorted(label_.values())

# `Sequential.predict_classes` is no longer available in current TensorFlow;
# take the argmax over the class axis of the predicted probabilities instead.
pred = np.argmax(model.predict(Padded_test), axis=1)
y_true = test['label'].astype(int).to_numpy()

plt.figure(figsize=(15, 8))
# Rows are the actual classes, columns the predicted classes
conf_mat = confusion_matrix(y_true, pred, labels=class_ids)
conf_mat = pd.DataFrame(conf_mat, index=class_ids, columns=class_ids)
conf_mat.index.name = 'Actual'
conf_mat.columns.name = 'Predicted'
sns.heatmap(conf_mat, annot=True, fmt='g')
plt.title('Confusion Matrix of the Test Data', fontsize=14)
plt.show()

# Having Fun with the Model

In [28]:
make_predictions('i feeling very bad for neural network')

In [29]:
make_predictions('I Learned bad Something new today')


In [30]:
make_predictions('i have no feeling ')

In [31]:
make_predictions('No one told you when to run, you missed the starting gun')

In [32]:
make_predictions("I just asked one question to confirm his request, and my boss bit my head off.")

In [33]:
make_predictions("She’s flying high after the successful product launch.")

In [34]:
make_predictions("I’m going to have the first meeting with a big client tomorrow, and I’m feeling butterflies in my stomach")

In [35]:
make_predictions("Sometimes the people who appear to be the most confident are actually afraid of their own shadows.")

In [36]:
make_predictions('how was your day ?')

In [37]:
make_predictions("I'm really impressed that Ashley can speak 7 languages, whereas I only speak one!")

In [38]:
make_predictions("Grandpa was very proud of me when I got a promotion at work. He took me out to dinner to celebrate.")

In [39]:
make_predictions("We are delighted that you will be coming to visit us. It will be so nice to have you here.")

In [40]:
make_predictions("I am anxious to hear back about the job interview I had on Friday. I hope I get the job!")

In [41]:
make_predictions("Blown away by the spectacular visuals! Just wow.. stunning!! Can't wait to watch the film")

# References

[1] https://www.coursera.org/learn/tweet-emotion-tensorflow

[2] https://www.geeksforgeeks.org/python-lemmatization-with-nltk/