In [23]:
import pandas as pd
import numpy as np

In [24]:
# Load the ISEAR CSV (latin-1 encoded), discard rows whose Text is the
# "[ No response.]" placeholder, then drop every row that still has a
# missing value in any column.
dataset_dir = "dataset/ISEAR.csv"
df = (
    pd.read_csv(dataset_dir, encoding='latin-1')
    .loc[lambda frame: frame["Text"].ne("[ No response.]")]
    .dropna()
)

In [25]:
# Shared configuration constants.

# Width of one word vector: used as the column count of the embedding
# matrix and as the Embedding layer's output_dim.
max_embedding_length = 300

# Length every token-id sequence is padded/truncated to: used as
# pad_sequences' maxlen and the Embedding layer's input_length.
max_sequence_length = 400

In [26]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize.treebank import TreebankWordDetokenizer

In [27]:
# Text normalisation. Every step overwrites df['Text'], so the order of the
# statements below is significant.

# Remove digits.
df['Text'] = df['Text'].str.replace(r'[0-9]+', '', regex=True)

# Remove punctuation (anything that is neither a word character nor
# whitespace). The pattern is a raw string: in a plain literal '\w' and '\s'
# are invalid escape sequences, which newer Python versions warn about.
df['Text'] = df['Text'].str.replace(r'[^\w\s]+', '', regex=True)

# Convert to lowercase.
df['Text'] = df['Text'].str.lower()

# Remove English stop words, matching on whitespace-separated tokens.
# NOTE(review): punctuation has already been stripped at this point, so
# stop-list entries that contain an apostrophe can no longer match their
# stripped form in the text -- confirm whether that is intended.
stop_words = set(stopwords.words('english'))
df['Text'] = df['Text'].apply(
    lambda x: ' '.join(word for word in x.split() if word not in stop_words))

# Tokenize the cleaned text into a list of tokens per row.
df['Tokenized'] = df['Text'].apply(nltk.word_tokenize)

def lemmatize(text):
    """Lemmatize every token of ``text`` and return the results as a list.

    ``text`` is an iterable of token strings. A fresh ``WordNetLemmatizer``
    is created for each call and applied to the tokens in order with its
    default arguments; the input itself is not modified.
    """
    to_lemma = WordNetLemmatizer().lemmatize
    return list(map(to_lemma, text))

# Lemmatize each row's token list.
df['Lemmatized'] = df['Tokenized'].map(lemmatize)

# Join the lemmatized tokens back into one string per row.
detokenizer = TreebankWordDetokenizer()
df['Detokenized'] = df['Lemmatized'].map(detokenizer.detokenize)

In [28]:
# Display the frame to inspect the cleaned Text column next to the derived
# Tokenized, Lemmatized and Detokenized columns.
df 

Unnamed: 0,Emotion,Text,Tokenized,Lemmatized,Detokenized
0,joy,period falling love time met especially met lo...,"[period, falling, love, time, met, especially,...","[period, falling, love, time, met, especially,...",period falling love time met especially met lo...
1,fear,involved traffic accident,"[involved, traffic, accident]","[involved, traffic, accident]",involved traffic accident
2,anger,driving home several days hard work motorist a...,"[driving, home, several, days, hard, work, mot...","[driving, home, several, day, hard, work, moto...",driving home several day hard work motorist ah...
3,sadness,lost person meant,"[lost, person, meant]","[lost, person, meant]",lost person meant
4,disgust,time knocked deer sight animals injuries helpl...,"[time, knocked, deer, sight, animals, injuries...","[time, knocked, deer, sight, animal, injury, h...",time knocked deer sight animal injury helpless...
...,...,...,...,...,...
7661,anger,two years back someone invited tutor granddaug...,"[two, years, back, someone, invited, tutor, gr...","[two, year, back, someone, invited, tutor, gra...",two year back someone invited tutor granddaugh...
7662,sadness,taken responsibility something prepared howeve...,"[taken, responsibility, something, prepared, h...","[taken, responsibility, something, prepared, h...",taken responsibility something prepared howeve...
7663,disgust,home heard loud sound spitting outside door th...,"[home, heard, loud, sound, spitting, outside, ...","[home, heard, loud, sound, spitting, outside, ...",home heard loud sound spitting outside door th...
7664,shame,homework teacher asked us scolded immediately,"[homework, teacher, asked, us, scolded, immedi...","[homework, teacher, asked, u, scolded, immedia...",homework teacher asked u scolded immediately


In [29]:
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical

In [30]:
# Map each emotion label to an integer id, then expand the ids into one-hot
# rows. The fitted encoder stays available as `le`.
le = LabelEncoder()
y = to_categorical(le.fit_transform(df['Emotion']))

In [31]:
from sklearn.model_selection import train_test_split

In [32]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split the same from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    df['Detokenized'],
    y,
    test_size=0.2,
    random_state=42,
)

In [33]:
# Plain Python lists of the detokenized strings for the Keras tokenizer.
# Each element is already a single string, so it is copied as-is; joining
# its characters with '' would only rebuild the same string.
text_arr = df['Detokenized'].tolist()

text_train_arr = X_train.tolist()
text_test_arr = X_test.tolist()

In [34]:
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [35]:
# Build the word -> integer-id vocabulary from text_arr (every row of
# df['Detokenized'], i.e. train and test rows together).
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_arr)

# Convert the train and test texts into lists of integer ids.
sequences_train, sequences_test = map(
    tokenizer.texts_to_sequences,
    (text_train_arr, text_test_arr),
)

In [36]:
# Bring every id sequence to exactly max_sequence_length entries so the
# batches are rectangular; pad_sequences is otherwise left at its defaults.
X_train_padded = pad_sequences(
    sequences_train,
    maxlen=max_sequence_length,
)
X_test_padded = pad_sequences(
    sequences_test,
    maxlen=max_sequence_length,
)

In [37]:
from gensim.models import KeyedVectors

In [38]:
# Pretrained word vectors for the embedding matrix -- enable exactly ONE option.

# Option 1 (disabled): GloVe or fastText text files. Only the file path
# changes, e.g. wiki-news-300d-1M.vec or glove.6B.300d.txt.
# embeddings = {}
# with open('embedding/wiki-news-300d-1M.vec', encoding='utf-8') as f:
#     for line in f:
#         token = line.split()[0]
#         embeddings[token] = np.array(line.split()[1:], dtype='float32')

# Option 2 (active): word2vec vectors stored in binary format.
embeddings = KeyedVectors.load_word2vec_format(
    'embedding/GoogleNews-vectors-negative300.bin',
    binary=True,
)

In [39]:
# Weight matrix for the Embedding layer: row i holds the pretrained vector of
# the word whose tokenizer id is i. Row 0 and the rows of words missing from
# `embeddings` stay all-zero.
vocab_size = len(tokenizer.word_index) + 1  # +1 so the largest id is a valid row
embedding_matrix = np.zeros((vocab_size, max_embedding_length))
for word, i in tokenizer.word_index.items():
    # Every id in word_index is already below vocab_size, so only the
    # membership test is needed.
    if word in embeddings:
        embedding_matrix[i] = embeddings[word]

In [40]:
# Display the matrix as a sanity check: all-zero rows are ids for which no
# pretrained vector was copied in.
embedding_matrix

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.07080078, -0.21386719,  0.15332031, ..., -0.21679688,
        -0.01977539,  0.10644531],
       [-0.06689453,  0.07958984, -0.08398438, ...,  0.02575684,
         0.31640625, -0.16796875],
       ...,
       [ 0.19042969,  0.30859375, -0.08496094, ...,  0.17578125,
        -0.09912109,  0.31054688],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.0546875 ,  0.13769531,  0.29296875, ...,  0.16699219,
         0.10693359,  0.1953125 ]])

In [41]:
from keras.models import Sequential
from keras.layers import LSTM, Dense, Embedding, Dropout, Bidirectional, GRU
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

In [42]:
def test_model(type="LSTM", epochs=10, batch_size=128, units=128):
    """Train a recurrent classifier on the padded sequences and print its scores.

    The network is: a frozen Embedding layer initialised from
    ``embedding_matrix``, one recurrent layer selected by ``type``, and a
    softmax output layer. It is fitted on ``X_train_padded`` / ``y_train``
    and scored on ``X_test_padded`` / ``y_test``. The same test split is also
    passed to ``fit`` as ``validation_data``, so the per-epoch validation
    metrics are computed on the data used for the final scores.

    Parameters
    ----------
    type : str
        "LSTM", "BiLSTM" or "GRU". Any other value falls back to a GRU (as
        before) but now emits a warning. The name shadows the builtin
        ``type``; it is kept so existing calls continue to work.
    epochs : int
        Number of training epochs.
    batch_size : int
        Mini-batch size used during training.
    units : int
        Number of units in the recurrent layer.

    Returns
    -------
    None
        Accuracy and weighted recall, precision and F1 are printed.
    """
    import warnings  # local import: only used for the fallback notice

    # Factories for the supported recurrent layers, keyed by `type`.
    recurrent_layers = {
        "LSTM": lambda: LSTM(units),
        "BiLSTM": lambda: Bidirectional(LSTM(units)),
        "GRU": lambda: GRU(units),
    }
    if type not in recurrent_layers:
        # Keep the historical fallback, but make a mistyped name visible
        # instead of silently training a different architecture.
        warnings.warn(
            f"Unknown model type {type!r}; falling back to 'GRU'.",
            stacklevel=2,
        )
        type = "GRU"

    model = Sequential()

    # Pretrained vectors are loaded as fixed weights (trainable=False).
    model.add(Embedding(input_dim=len(tokenizer.word_index) + 1,
                        output_dim=max_embedding_length,
                        input_length=max_sequence_length,
                        weights=[embedding_matrix],
                        trainable=False))

    model.add(recurrent_layers[type]())

    # One output unit per column of the one-hot labels rather than a
    # hard-coded class count.
    num_classes = y_train.shape[1]
    model.add(Dense(num_classes, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()

    model.fit(X_train_padded, y_train,
              validation_data=(X_test_padded, y_test),
              epochs=epochs, batch_size=batch_size)

    # Turn predicted probabilities and one-hot targets into class ids.
    prediction_results = np.argmax(model.predict(X_test_padded), axis=1)
    y_test_result = np.argmax(y_test, axis=1)

    print(f"Accuracy: {accuracy_score(y_test_result, prediction_results) * 100:.2f}%")
    print(f"Recall Score: {recall_score(y_test_result, prediction_results, average='weighted') * 100:.2f}")
    print(f"Precision: {precision_score(y_test_result, prediction_results, average='weighted') * 100:.2f}")
    print(f"F1 Score: {f1_score(y_test_result, prediction_results, average='weighted') * 100:.2f}")

In [43]:
test_model("LSTM")

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 400, 300)          2481900   
                                                                 
 lstm_2 (LSTM)               (None, 128)               219648    
                                                                 
 dense_2 (Dense)             (None, 7)                 903       
                                                                 
Total params: 2,702,451
Trainable params: 220,551
Non-trainable params: 2,481,900
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: 56.85%
Recall Score: 56.85
Precision: 57.85
F1 Score: 57.23


In [44]:
test_model("BiLSTM")

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 400, 300)          2481900   
                                                                 
 bidirectional_1 (Bidirectio  (None, 256)              439296    
 nal)                                                            
                                                                 
 dense_3 (Dense)             (None, 7)                 1799      
                                                                 
Total params: 2,922,995
Trainable params: 441,095
Non-trainable params: 2,481,900
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: 57.58%
Recall Score: 57.58
Precision: 58.22
F1 Score: 57.76


In [45]:
test_model("GRU")

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 400, 300)          2481900   
                                                                 
 gru (GRU)                   (None, 128)               165120    
                                                                 
 dense_4 (Dense)             (None, 7)                 903       
                                                                 
Total params: 2,647,923
Trainable params: 166,023
Non-trainable params: 2,481,900
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: 58.30%
Recall Score: 58.30
Precision: 58.59
F1 Score: 58.19
