# Predict Emotion

The main objective of this notebook is to predict emotions from tweets

In [1]:
# Add project path to the PYTHONPATH

import os
import sys
from pathlib import Path

sys.path.append(Path(os.path.join(os.path.abspath(''), '../')).resolve().as_posix())

In [2]:
import pickle

## Load Tokenizer

Load `.pickle` file with the tokenizer

In [3]:
tokenizer_path = Path('../datasets/sentiment_analysis/tokenizer.pickle').resolve()
with tokenizer_path.open('rb') as file:
    tokenizer = pickle.load(file)

## Load Model

Load the trained emotion recognition model

In [4]:
from tensorflow.keras.layers import Input, Embedding, SpatialDropout1D, LSTM
from tensorflow.keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D
from tensorflow.keras.layers import Bidirectional, Conv1D, Dense, concatenate
from tensorflow.keras.models import Model

In [5]:
input_dim = min(tokenizer.num_words, len(tokenizer.word_index) + 1)
num_classes = 4
embedding_dim = 500
input_length = 100
lstm_units = 128
lstm_dropout = 0.1
recurrent_dropout = 0.1
spatial_dropout=0.2
filters=64
kernel_size=3

In [6]:
input_layer = Input(shape=(input_length,))
output_layer = Embedding(
  input_dim=input_dim,
  output_dim=embedding_dim,
  input_shape=(input_length,)
)(input_layer)

output_layer = SpatialDropout1D(spatial_dropout)(output_layer)

output_layer = Bidirectional(
LSTM(lstm_units, return_sequences=True,
     dropout=lstm_dropout, recurrent_dropout=recurrent_dropout)
)(output_layer)
output_layer = Conv1D(filters, kernel_size=kernel_size, padding='valid',
                    kernel_initializer='glorot_uniform')(output_layer)

avg_pool = GlobalAveragePooling1D()(output_layer)
max_pool = GlobalMaxPooling1D()(output_layer)
output_layer = concatenate([avg_pool, max_pool])

output_layer = Dense(num_classes, activation='softmax')(output_layer)

model = Model(input_layer, output_layer)

In [7]:
model_weights_path = Path('../models/emotion_recognition/model_weights.h5').resolve()
model.load_weights(model_weights_path.as_posix())

## Load data

Load the data that will have the labels predicted by the model

**data_path**: Path to the `.csv` file that will be used

In [8]:
import pandas as pd

In [9]:
data_path = Path('../datasets/prediction/#RIPLewis.csv').resolve()
data = pd.read_csv(data_path, encoding = "ISO-8859-1")
data.head()

Unnamed: 0,id,date,user,text
0,1199615410137681920,2019-11-27 09:06:16,KnockaFN,"#RIPLewis, the little koala passed away that i..."
1,1199614317135638529,2019-11-27 09:01:55,7_revealed,Iâm glad Lewis the Koala was rescued. His b...
2,1199613738925592577,2019-11-27 08:59:37,viki__xx,heartbreaking news! ð #RIPLewis. https://t....
3,1199611053056937985,2019-11-27 08:48:57,orivios,I just realized #RIPLewis was about a Koala th...
4,1199610786601152512,2019-11-27 08:47:53,AnayiaMelanin,We got kicked out the science center btw but i...


## Load Encoder

Load `.pickle` file with the encoder

In [10]:
encoder_path = Path('../models/emotion_recognition/encoder.pickle').resolve()
with encoder_path.open('rb') as file:
    encoder = pickle.load(file)

## Preprocess data

Preprocess the data that will be used

In [11]:
from nlp import preprocess
from tensorflow.keras.preprocessing.sequence import pad_sequences

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/vladislavklyuev/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
cleaned_data = preprocess(data.text)
sequences = [text.split() for text in cleaned_data]
list_tokenized = tokenizer.texts_to_sequences(sequences)
x_data = pad_sequences(list_tokenized, maxlen=100)

Time to clean up: 1.10 sec


## Results

Predict the labels and generate a confusion matrix

In [13]:
import numpy as np

In [14]:
y_pred = model.predict(x_data)

In [15]:
for index, value in enumerate(np.sum(y_pred, axis=0) / len(y_pred)):
    print(encoder.classes_[index] + ": " + str(value))

anger: 0.20797765
fear: 0.3501146
joy: 0.13310508
sadness: 0.3088022


In [16]:
y_pred_argmax = y_pred.argmax(axis=1)
data_len = len(y_pred_argmax)
for index, value in enumerate(np.unique(y_pred_argmax)):
    print(encoder.classes_[index] + ": " + str(len(y_pred_argmax[y_pred_argmax == value]) / data_len))

anger: 0.1427506522549385
fear: 0.4058889303019009
joy: 0.13827804696235557
sadness: 0.31308237048080506


In [17]:
y_pred[5:10].argmax(axis=1)

array([2, 1, 2, 1, 0])

In [18]:
data.text.iloc[9]

'Well this is a shit start to the day #RIPLewis - Iâ\x80\x99m just going to find a corner to sob uncontrollably in. Iâ\x80\x99ll be back later https://t.co/9hvi8FGtBI'