## **Load the dataset**

In [1]:
import pandas as pd
df = pd.read_csv(r"train_sent_emo.csv")
df

Unnamed: 0,Sr No.,Utterance,Speaker,Emotion,Sentiment,Dialogue_ID,Utterance_ID,Season,Episode,StartTime,EndTime
0,1,also I was the point person on my company’s tr...,Chandler,neutral,neutral,0,0,8,21,"00:16:16,059","00:16:21,731"
1,2,You must’ve had your hands full.,The Interviewer,neutral,neutral,0,1,8,21,"00:16:21,940","00:16:23,442"
2,3,That I did. That I did.,Chandler,neutral,neutral,0,2,8,21,"00:16:23,442","00:16:26,389"
3,4,So let’s talk a little bit about your duties.,The Interviewer,neutral,neutral,0,3,8,21,"00:16:26,820","00:16:29,572"
4,5,My duties? All right.,Chandler,surprise,positive,0,4,8,21,"00:16:34,452","00:16:40,917"
...,...,...,...,...,...,...,...,...,...,...,...
9984,10474,You or me?,Chandler,neutral,neutral,1038,13,2,3,"00:00:48,173","00:00:50,799"
9985,10475,"I got it. Uh, Joey, women don't have Adam's ap...",Ross,neutral,neutral,1038,14,2,3,"00:00:51,009","00:00:53,594"
9986,10476,"You guys are messing with me, right?",Joey,surprise,positive,1038,15,2,3,"00:01:00,518","00:01:03,520"
9987,10477,Yeah.,All,neutral,neutral,1038,16,2,3,"00:01:05,398","00:01:07,274"


## **Considering required features**

In [2]:
df['Emotion'].value_counts()

Emotion
neutral     4710
joy         1743
surprise    1205
anger       1109
sadness      683
disgust      271
fear         268
Name: count, dtype: int64

In [3]:
df.isna().sum()

Sr No.          0
Utterance       0
Speaker         0
Emotion         0
Sentiment       0
Dialogue_ID     0
Utterance_ID    0
Season          0
Episode         0
StartTime       0
EndTime         0
dtype: int64

In [4]:
train_df = df[['Utterance','Emotion']]
train_df

Unnamed: 0,Utterance,Emotion
0,also I was the point person on my company’s tr...,neutral
1,You must’ve had your hands full.,neutral
2,That I did. That I did.,neutral
3,So let’s talk a little bit about your duties.,neutral
4,My duties? All right.,surprise
...,...,...
9984,You or me?,neutral
9985,"I got it. Uh, Joey, women don't have Adam's ap...",neutral
9986,"You guys are messing with me, right?",surprise
9987,Yeah.,neutral


## **Solving the imbalance class problem**

In [5]:
train_df['Emotion'].value_counts()

Emotion
neutral     4710
joy         1743
surprise    1205
anger       1109
sadness      683
disgust      271
fear         268
Name: count, dtype: int64

In [6]:
class_count_1, class_count_2, class_count_3, class_count_4, class_count_5, class_count_6,class_count_7  = train_df['Emotion'].value_counts()
print(class_count_1, class_count_2, class_count_3, class_count_4, class_count_6,class_count_7)

4710 1743 1205 1109 271 268


### **Upsampling**

In [7]:
import imblearn
from sklearn.model_selection import train_test_split
import numpy as np

X=train_df.iloc[:,:-1]
y=train_df.iloc[:,-1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

from imblearn.over_sampling import RandomOverSampler
# Apply oversampling to the training data
oversampler = RandomOverSampler(sampling_strategy='auto', random_state=42)
X_oversampled, y_oversampled = oversampler.fit_resample(X_train, y_train)

In [8]:
X_oversampled, y_oversampled

(                                               Utterance
 0                            Well yeah, sure, what’s up?
 1      Y'know, I-I don't even feel like I know you an...
 2                                                 Do it!
 3                                               Come on!
 4                                                Action!
 ...                                                  ...
 26070                                              What?
 26071                              It’s happened to you?
 26072                               You couldn’t do it?!
 26073            Dear God! This parachute is a knapsack!
 26074                                              Whoa!
 
 [26075 rows x 1 columns],
 0         neutral
 1           anger
 2             joy
 3           anger
 4         neutral
            ...   
 26070    surprise
 26071    surprise
 26072    surprise
 26073    surprise
 26074    surprise
 Name: Emotion, Length: 26075, dtype: object)

The sample size has increased from 9989 to 26075

### **Undersampling**

In [9]:
from imblearn.under_sampling import RandomUnderSampler

undersampler = RandomUnderSampler(sampling_strategy='auto', random_state=42)
X_undersampled, y_undersampled = undersampler.fit_resample(X, y)

In [10]:
X_undersampled, y_undersampled

(                                              Utterance
 0                               What?!  What the hell?!
 1     Because I’m not going to spend one more day wi...
 2     Come on apartment! Come on apartment!  Oh! I k...
 3     And-and that’s only ‘cause it’s right there wh...
 4     No, no, no, don't! I've been waitin' for like,...
 ...                                                 ...
 1871                                         Sure! Why?
 1872  Are you serious?! Chandler, we ate an entire c...
 1873                                              What?
 1874                                We didn’t have sex.
 1875  Whoa—hey—wh-wh-what do you got there? What is ...
 
 [1876 rows x 1 columns],
 0          anger
 1          anger
 2          anger
 3          anger
 4          anger
           ...   
 1871    surprise
 1872    surprise
 1873    surprise
 1874    surprise
 1875    surprise
 Name: Emotion, Length: 1876, dtype: object)

The sample size has increased from 9989 to 1876

### **Using the Upsampled Dataset**

In [11]:
train_df=pd.DataFrame(X_oversampled, columns=X_train.columns)
train_df['Emotion']=y_oversampled

## **TEXT PREPROCESSING**

In [12]:
train_df

Unnamed: 0,Utterance,Emotion
0,"Well yeah, sure, what’s up?",neutral
1,"Y'know, I-I don't even feel like I know you an...",anger
2,Do it!,joy
3,Come on!,anger
4,Action!,neutral
...,...,...
26070,What?,surprise
26071,It’s happened to you?,surprise
26072,You couldn’t do it?!,surprise
26073,Dear God! This parachute is a knapsack!,surprise


### **Convert text to lower case**

In [13]:
train_df['clean_text'] = train_df['Utterance'].str.lower()
train_df.head()

Unnamed: 0,Utterance,Emotion,clean_text
0,"Well yeah, sure, what’s up?",neutral,"well yeah, sure, what’s up?"
1,"Y'know, I-I don't even feel like I know you an...",anger,"y'know, i-i don't even feel like i know you an..."
2,Do it!,joy,do it!
3,Come on!,anger,come on!
4,Action!,neutral,action!


### **Tokenization**

In [14]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [15]:
from nltk.tokenize import word_tokenize
train_df['text_without_stopwords'] = train_df['clean_text'].apply(word_tokenize)

### **Removing Stop Words**

In [16]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [17]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
train_df['text_without_stopwords'] = train_df['text_without_stopwords'].apply(lambda x: [word for word in x if word not in stop_words])
train_df.head()

Unnamed: 0,Utterance,Emotion,clean_text,text_without_stopwords
0,"Well yeah, sure, what’s up?",neutral,"well yeah, sure, what’s up?","[well, yeah, ,, sure, ,, ’, ?]"
1,"Y'know, I-I don't even feel like I know you an...",anger,"y'know, i-i don't even feel like i know you an...","[y'know, ,, i-i, n't, even, feel, like, know, ..."
2,Do it!,joy,do it!,[!]
3,Come on!,anger,come on!,"[come, !]"
4,Action!,neutral,action!,"[action, !]"


### **Word Normalization using Lemmatization**

In [18]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [19]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
train_df['normalized_text'] = train_df['text_without_stopwords'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])
train_df.head()

Unnamed: 0,Utterance,Emotion,clean_text,text_without_stopwords,normalized_text
0,"Well yeah, sure, what’s up?",neutral,"well yeah, sure, what’s up?","[well, yeah, ,, sure, ,, ’, ?]","[well, yeah, ,, sure, ,, ’, ?]"
1,"Y'know, I-I don't even feel like I know you an...",anger,"y'know, i-i don't even feel like i know you an...","[y'know, ,, i-i, n't, even, feel, like, know, ...","[y'know, ,, i-i, n't, even, feel, like, know, ..."
2,Do it!,joy,do it!,[!],[!]
3,Come on!,anger,come on!,"[come, !]","[come, !]"
4,Action!,neutral,action!,"[action, !]","[action, !]"


In [20]:
def return_sequence(tokens):
  return " ".join([token for token in tokens])

train_df['pre_processed_text'] = train_df['normalized_text'].apply(return_sequence)

## **Feature Representation using different embedding techniques**

### **Bag of Words**

In [21]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
count_matrix = cv.fit_transform(train_df['pre_processed_text'].values.tolist())

In [22]:
count_matrix.toarray().shape

(26075, 4330)

#### **Visualizing the BOW representation**

In [23]:
bow_matrix = count_matrix.toarray()

# Get the feature names
feature_names = cv.get_feature_names_out()

# Create a DataFrame for visualization
import pandas as pd
df_bow = pd.DataFrame(bow_matrix, columns=feature_names)
print(df_bow)

       00  000  10  100  11  110  112  12  1200  13  ...  yuh  yum  zana  \
0       0    0   0    0   0    0    0   0     0   0  ...    0    0     0   
1       0    0   0    0   0    0    0   0     0   0  ...    0    0     0   
2       0    0   0    0   0    0    0   0     0   0  ...    0    0     0   
3       0    0   0    0   0    0    0   0     0   0  ...    0    0     0   
4       0    0   0    0   0    0    0   0     0   0  ...    0    0     0   
...    ..  ...  ..  ...  ..  ...  ...  ..   ...  ..  ...  ...  ...   ...   
26070   0    0   0    0   0    0    0   0     0   0  ...    0    0     0   
26071   0    0   0    0   0    0    0   0     0   0  ...    0    0     0   
26072   0    0   0    0   0    0    0   0     0   0  ...    0    0     0   
26073   0    0   0    0   0    0    0   0     0   0  ...    0    0     0   
26074   0    0   0    0   0    0    0   0     0   0  ...    0    0     0   

       zelner  zero  zillionaire  zine  zip  zoo  zygomatic  
0           0     0      

### **TF-IDF**

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(train_df['pre_processed_text'].values.tolist())

In [25]:
tfidf_array = tfidf_matrix.toarray()

In [26]:
# Get the feature names
feature_names = tfidf.get_feature_names_out()

# Create a DataFrame for visualization
import pandas as pd
df_tfidf = pd.DataFrame(tfidf_array, columns=feature_names)
print(df_tfidf)

        00  000   10  100   11  110  112   12  1200   13  ...  yuh  yum  zana  \
0      0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0   0.0  0.0  ...  0.0  0.0   0.0   
1      0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0   0.0  0.0  ...  0.0  0.0   0.0   
2      0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0   0.0  0.0  ...  0.0  0.0   0.0   
3      0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0   0.0  0.0  ...  0.0  0.0   0.0   
4      0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0   0.0  0.0  ...  0.0  0.0   0.0   
...    ...  ...  ...  ...  ...  ...  ...  ...   ...  ...  ...  ...  ...   ...   
26070  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0   0.0  0.0  ...  0.0  0.0   0.0   
26071  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0   0.0  0.0  ...  0.0  0.0   0.0   
26072  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0   0.0  0.0  ...  0.0  0.0   0.0   
26073  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0   0.0  0.0  ...  0.0  0.0   0.0   
26074  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0   0.0  0.0  ...  0.0  0.0   0.0   

       zelner  zero  zillio

### **Continuous Bag of Words (CBOW)**

In [27]:
import numpy as np
from gensim.models.word2vec import Word2Vec
cbow = Word2Vec(train_df['pre_processed_text'].values.tolist(), vector_size=100, window=5, min_count=2, sg=0)
vocab = cbow.wv.index_to_key

def get_mean_vector(model, sentence):
    words = [word for word in sentence if word in vocab]
    if len(words) >= 1:
        return np.mean(model.wv[words], axis=0)
    return np.zeros((100,))

cbow_array = []
for sentence in train_df['pre_processed_text'].values.tolist():
    cbow_array.append(get_mean_vector(cbow, sentence))



In [28]:
cbow_array = np.array(cbow_array)
cbow_array.shape

(26075, 100)

In [29]:
for row in cbow_array:
    print(row)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  0.24242929  0.15845203  0.1025025   0.08312935 -0.13784213 -0.22150771
  0.34795123 -0.08579762 -0.4531056  -0.05540887  0.21406402  0.10111605
  0.22130553  0.09267872 -0.17327543  0.05238144 -0.07920843  0.11684602
 -0.05480652 -0.02795494  0.04579433 -0.01618441]
[-0.00610139 -0.48124495 -0.00536295 -0.52309924  0.16155481  0.13491295
 -0.12775733 -0.49118355  0.12446758  0.09252494 -0.28949115 -0.27408302
 -0.20637949  0.08312675 -0.03123325 -0.02171675 -0.17676479  0.05342051
  0.13933961  0.43280169 -0.30814201  0.03086886 -0.20667824 -0.32835317
  0.1236312  -0.095113    0.08575583 -0.27912781  0.02369314  0.03934857
 -0.00862092  0.17389527 -0.17810999  0.17404298 -0.26886493 -0.01939634
 -0.11135966  0.32287607  0.37329376 -0.04378391  0.18520296  0.02469978
  0.09316237 -0.08070456 -0.21365078  0.18751977 -0.05271953  0.01523965
  0.04629016  0.0828885   0.0727882  -0.27992332  0.44263718  0.06626207
 -0.04942

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  0.07056594  0.27114019  0.21239883  0.16399671  0.44278207 -0.02848724
  0.14888278 -0.19071378 -0.04524992  0.21657422 -0.14458823 -0.06901862
  0.05001803  0.0509157   0.12869878 -0.1821752   0.37251315  0.12026247
 -0.07665032  0.33725801  0.22280407  0.32486227  0.19299005  0.1423381
 -0.14734213 -0.1319616  -0.18006933  0.09755978  0.12498185  0.1907382
 -0.15541025 -0.45998117  0.25448176  0.14953645 -0.07555233 -0.14837514
  0.09283809  0.01936869  0.01059179 -0.18245201  0.14922099 -0.20926173
  0.13078015  0.07129866 -0.08921895  0.06791811 -0.11341779 -0.14214814
  0.29235235 -0.0103196  -0.2822718  -0.04471868  0.16612519  0.07561905
  0.00152179 -0.05248033 -0.1750889  -0.132039   -0.21279112  0.26362744
  0.20377062 -0.0269931   0.00676252  0.01055782]
[-0.04465551 -0.53286332 -0.10299206 -0.40953597  0.10068997  0.12317918
 -0.07336513 -0.47323498 -0.065186    0.14268044 -0.23433156 -0.05896314
 -0.1022496

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  0.1708491  -0.01252161  0.0373529  -0.11277337 -0.16975398 -0.24521182
  0.31845608 -0.12299626 -0.31775117 -0.05305259  0.04750222  0.00081682
  0.11312157  0.02697116 -0.1233752   0.04795345 -0.19720891  0.1774105
  0.13858755  0.009773   -0.00438683 -0.05551109]
[ 0.0110282  -0.16980124 -0.00991679 -0.22190388  0.3186239   0.01453629
 -0.12171551 -0.28109822 -0.18826778  0.09936325 -0.19951636 -0.18555938
  0.02994744  0.09064972  0.09359857  0.06102921 -0.0233911   0.15055066
  0.04737496  0.11979715 -0.16233888 -0.03465456 -0.26566353 -0.35606837
  0.16950023 -0.00378418  0.00724624 -0.13967012  0.0975886  -0.05106707
  0.00111917  0.23056079 -0.31554219  0.18484293 -0.1178931   0.04579782
 -0.09798936  0.14381206  0.20116651  0.01873378  0.26600626 -0.19849078
  0.14208074  0.03192366 -0.07357448  0.31972018 -0.04725872 -0.05425439
 -0.02867497 -0.11890962  0.25425044 -0.12162258  0.14129974  0.12791483
 -0.168649

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  0.12628944 -0.03816188  0.02327482 -0.07480754]
[ 0.03035275  0.08718443  0.02659719 -0.15467457  0.32779601 -0.17621152
  0.02358292 -0.31784141 -0.35894683  0.07462101 -0.30182496 -0.20489021
  0.30472687 -0.02310197  0.32916716  0.03757912 -0.01926349  0.20721364
  0.1279185  -0.07644244  0.04400908 -0.05237944 -0.62436384 -0.32570106
  0.13294119 -0.07694831 -0.0070776  -0.30838731  0.05982071 -0.31012356
  0.06911185  0.46038708 -0.55814767  0.13682666  0.05344643 -0.07147156
 -0.12716116  0.29121566  0.19720912  0.12780547  0.30037138 -0.43056729
  0.33271456  0.04928682  0.01577894  0.1712687  -0.01364696 -0.00110136
  0.0711539  -0.19628702  0.49238029  0.26237318  0.22119997  0.16320899
 -0.10690025  0.13561608  0.1928165   0.01028934 -0.15156251 -0.0805781
  0.01906324  0.00820206 -0.2595745   0.20780627  0.26691076  0.22611886
 -0.09369516 -0.32732633  0.32924858 -0.00933386 -0.23286133 -0.16324532
 -0.101386

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  7.26640582e-01 -7.91016370e-02 -6.89194381e-01 -2.55201727e-01
 -3.15777838e-01  1.03186831e-01  3.09504755e-02  4.06540543e-01
  7.41596222e-01  4.32144761e-01  9.50846434e-01  1.08067417e+00
  2.49738187e-01  3.90617549e-01 -3.94127034e-02 -6.52991652e-01
  4.44221824e-01  5.21525323e-01 -3.37259322e-01 -2.57847887e-02
  1.04234409e+00  4.14261362e-04  6.03841603e-01  3.72432321e-01
  4.90054488e-01  2.00524792e-01  1.83132812e-01  8.28562956e-03
  1.02380421e-02  9.80760604e-02 -4.35028762e-01 -2.33781710e-01]
[-0.08626744 -0.23792394  0.01986901 -0.15445164  0.36467591  0.002975
 -0.04694187 -0.36879575 -0.20359971  0.13993943 -0.27727783 -0.23718099
  0.08439287  0.1186583   0.17152961  0.10953952 -0.06522109  0.11773186
  0.0887273   0.12996514 -0.08246723 -0.01749746 -0.23485024 -0.50201541
  0.20501955 -0.02821004 -0.02693322 -0.17562275  0.03178854 -0.02750948
  0.06193351  0.29064962 -0.29566252  0.24955036 -0

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  3.79886366e-02 -6.59703882e-03 -1.12004131e-01 -2.94470996e-01
  3.50833595e-01 -5.25171421e-02 -2.91426271e-01 -5.27778789e-02
  2.35738650e-01 -2.25042179e-02  1.31854460e-01  6.72105886e-03
 -6.60751462e-02  2.86035705e-03 -2.03031659e-01  1.13501914e-01
  1.36017457e-01  3.84453125e-02 -6.00051992e-02 -8.12935606e-02]
[-0.06282789 -0.37178475 -0.02142316 -0.35080788  0.27293923  0.0989029
 -0.04729375 -0.43077177 -0.06345288  0.14626861 -0.26003379 -0.27158204
 -0.10011502  0.12512693 -0.04578475 -0.02863881 -0.13499074  0.03627775
  0.07336822  0.22965209 -0.24405378  0.04995682 -0.30180791 -0.34609371
  0.08560596 -0.09835286  0.07287349 -0.2367375   0.00569333 -0.09154552
  0.01807401  0.18963234 -0.27449164  0.23951092 -0.16866359 -0.04267557
 -0.1214169   0.29419175  0.4172864   0.03974032  0.2411584  -0.15382302
  0.10231482 -0.00145498 -0.0898716   0.22987705 -0.07805184  0.04082527
  0.093995   -0.04948433  

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
 -0.0532279  -0.04519578  0.09448606 -0.05665115 -0.01578277  0.15583859
  0.14541009  0.1638217  -0.2698535  -0.02939768 -0.33983558 -0.15353255
  0.15997432  0.06905811 -0.12718992 -0.25061527  0.17276318 -0.02698869
 -0.01288625  0.31280655 -0.27926263  0.15676597 -0.07272234 -0.20098938
 -0.29047307  0.23168719  0.33257979 -0.10657249  0.15806441 -0.12518026
  0.26093498  0.06568462 -0.2034208   0.11102472 -0.02475649  0.04778603
 -0.05992067 -0.18989396  0.29641345  0.02033189  0.24256079  0.04140467
 -0.14342107 -0.01320544  0.01961357  0.03251591  0.1148466  -0.1130311
  0.01451905 -0.02167049 -0.07513715  0.24256526  0.1817936   0.14200552
  0.0266717  -0.31048068  0.02061869  0.04339032 -0.11738648  0.05159103
 -0.0357971   0.02754214 -0.04400152  0.15630002  0.12723373 -0.00547905
  0.28553978  0.18905151  0.0770606  -0.04122137  0.02409938 -0.32514843
  0.37522069  0.02621274 -0.12487704  0.12773593  0.15081306

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  0.03047025  0.18329643 -0.16265571  0.24090566 -0.25241956 -0.00940072
 -0.23577203  0.29492041  0.44278666 -0.07186002  0.24767663 -0.03815228
  0.1117506  -0.0129455  -0.15858901  0.21672477 -0.04707453  0.0123037
  0.08115267 -0.04844955  0.09204751 -0.11779293  0.39096612  0.06265421
 -0.04165767  0.16267578  0.12664804  0.12490483  0.17228097 -0.01288367
 -0.01729063 -0.15497369 -0.13720059  0.01843484  0.04808252  0.10502288
 -0.06254587 -0.38252905  0.03919864  0.134101   -0.07137515 -0.07050409
  0.1159135   0.01756629 -0.07153723 -0.07835899  0.1083425  -0.1186593
  0.16855673 -0.0252224   0.02446886 -0.04288926 -0.08927259 -0.21984547
  0.39713266 -0.093174   -0.27151981 -0.01554778  0.1039215  -0.00662176
  0.12640278 -0.05432169 -0.17327604  0.02232685 -0.26564866  0.09530616
  0.13704816  0.04672061  0.06035528 -0.05160657]
[-4.77327928e-02 -4.09526736e-01 -8.93164128e-02 -4.36128139e-01
  1.89110905e-01  1

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
 -0.08170598 -0.08983976 -0.18174724  0.07353295  0.10507332  0.14464281
 -0.12840858 -0.39709014  0.25023609  0.12257056 -0.1477346  -0.10761143
  0.0336295   0.00423045 -0.00750064 -0.03108041  0.15103209 -0.09381091
  0.23553641  0.18783697  0.03811627  0.05903501 -0.11349388 -0.22501414
  0.31605217  0.02296824 -0.33668873 -0.03267474  0.29079315  0.07915886
  0.08584961  0.02029196 -0.09891875 -0.02327004 -0.14273082  0.1501818
  0.0998757   0.0206268  -0.00813971 -0.08673549]
[-0.05476892 -0.3655456  -0.05506032 -0.36178136  0.20254765  0.08276273
 -0.03764592 -0.39811859 -0.12422124  0.09678555 -0.279504   -0.20724852
 -0.07517169  0.05189515  0.00262204  0.03379827 -0.10410766  0.0666589
  0.12662372  0.2244854  -0.20172399 -0.00121947 -0.24018358 -0.3877745
  0.12182414 -0.09580965  0.07322829 -0.22582172  0.03658788 -0.01273067
  0.0325424   0.2271167  -0.24076897  0.19382195 -0.20250686  0.02038057
 -0.11248973

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
 -0.08263464 -0.45505479  0.0180844   0.09880453 -0.26910254 -0.21269725
 -0.16201331  0.06676393 -0.05166953  0.02560689 -0.07536104  0.07652386
  0.10129825  0.3721807  -0.34099874 -0.0049017  -0.21508314 -0.39508525
  0.08425985 -0.0774238   0.04232008 -0.22525661  0.02802059  0.09821735
  0.04492167  0.13689686 -0.11558191  0.23192464 -0.3022179   0.00404967
 -0.15781724  0.32062161  0.40272206 -0.02422761  0.24583063  0.03664102
  0.06655578  0.01568238 -0.19022977  0.20012407 -0.00575705 -0.01086474
  0.0782506   0.00632396  0.03238233 -0.20600966  0.38528228  0.02905803
 -0.06659486  0.20817418  0.09982184  0.18325534  0.26852986  0.04755839
 -0.02385737 -0.14106949 -0.06618839 -0.00320588  0.00404665  0.08919448
 -0.08185067 -0.41805801  0.08135985  0.12265668 -0.02471601 -0.05450125
  0.14290252  0.05329675 -0.05699801 -0.07855631  0.07522532 -0.19723384
  0.12265372 -0.09716903  0.02740505 -0.03290329 -0.1146027

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



### **Skipgram**

In [30]:
sg = Word2Vec(train_df['pre_processed_text'].values.tolist(), vector_size=100, window=5, min_count=2, sg=1)
vocab = sg.wv.index_to_key

def get_mean_vector(model, sentence):
    words = [word for word in sentence if word in vocab]
    if len(words) >= 1:
        return np.mean(model.wv[words], axis=0)
    return np.zeros((100,))

sg_array = []
for sentence in train_df['pre_processed_text'].values.tolist():
    sg_array.append(get_mean_vector(sg, sentence))



In [31]:
sg_array = np.array(sg_array)
sg_array.shape

(26075, 100)

### **Word2Vec**

In [32]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk

#Using tokenized words
model = Word2Vec(sentences=train_df['text_without_stopwords'], vector_size=100, window=5, min_count=1, workers=4)

# Save the model
model.save("word2vec.model")

# Load the model
model = Word2Vec.load("word2vec.model")

# Get the vector representation of a word
vector = model.wv['hello']
print(f"Vector for 'hello': {vector}")

# Find similar words
similar_words = model.wv.most_similar('hello', topn=3)
print(f"Words similar to 'hello': {similar_words}")

Vector for 'hello': [-0.26250184  0.26271376  0.1679754   0.24583542  0.23452869 -0.3942596
  0.13973363  0.50309557 -0.335287   -0.22910655  0.00959099 -0.42672524
 -0.01201102  0.19897588  0.17479381 -0.22384726  0.3107742  -0.18957329
 -0.12933588 -0.57885206  0.21385866  0.05858045  0.37831095 -0.33381212
 -0.02423082 -0.02616998 -0.20343105  0.04650186 -0.38750175  0.17996909
  0.35323563 -0.02685308  0.08843762 -0.4006594  -0.16349745  0.32758218
  0.09918679 -0.0660677  -0.13963078 -0.21070363  0.06269559 -0.1830118
 -0.20361367  0.08948987  0.25273904  0.13343129 -0.19365574 -0.1539592
  0.01695514  0.23625635  0.1267373  -0.26510426 -0.16640683 -0.08546929
 -0.15184614  0.04773194  0.02596737 -0.08952665 -0.42738584  0.15960355
 -0.0015365   0.02589847  0.12757409  0.04036891 -0.27261603  0.32645258
  0.17301264  0.46778008 -0.3380949   0.33300915 -0.08405463  0.20797415
  0.29015753  0.03577985  0.30692032  0.00875841 -0.02976033  0.07758744
 -0.07462913 -0.07678604 -0.375575

## Building the Static Model: Logistic Regression

In [33]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

def static_model(X,y):
    # Splitting the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    logistic_reg = LogisticRegression(max_iter=1000)
    logistic_reg.fit(X_train, y_train)
    y_pred = logistic_reg.predict(X_test)

    # Evaluating the model
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy)

    # Classification report
    print(classification_report(y_test, y_pred))

### **Giving the bow representation as input to the static model**

In [34]:
y=train_df['Emotion']
static_model(bow_matrix,y)

Accuracy: 0.7202301054650048
              precision    recall  f1-score   support

       anger       0.75      0.64      0.69       779
     disgust       0.89      0.95      0.92       709
        fear       0.79      0.87      0.83       750
         joy       0.71      0.66      0.68       763
     neutral       0.51      0.39      0.44       744
     sadness       0.81      0.76      0.78       738
    surprise       0.59      0.78      0.67       732

    accuracy                           0.72      5215
   macro avg       0.72      0.72      0.72      5215
weighted avg       0.72      0.72      0.71      5215



### **Giving the tf-idf representation as input to the static model**

In [35]:
static_model(tfidf_array,y)

Accuracy: 0.6690316395014382
              precision    recall  f1-score   support

       anger       0.65      0.58      0.61       779
     disgust       0.86      0.95      0.90       709
        fear       0.75      0.86      0.80       750
         joy       0.66      0.61      0.64       763
     neutral       0.40      0.42      0.41       744
     sadness       0.72      0.70      0.71       738
    surprise       0.64      0.58      0.61       732

    accuracy                           0.67      5215
   macro avg       0.67      0.67      0.67      5215
weighted avg       0.67      0.67      0.67      5215



### **Giving the CBOW representation as input to the static model**

In [36]:
static_model(cbow_array,y)

Accuracy: 0.31140939597315437
              precision    recall  f1-score   support

       anger       0.28      0.30      0.29       779
     disgust       0.27      0.28      0.27       709
        fear       0.33      0.29      0.31       750
         joy       0.31      0.18      0.23       763
     neutral       0.28      0.41      0.33       744
     sadness       0.34      0.32      0.33       738
    surprise       0.39      0.40      0.40       732

    accuracy                           0.31      5215
   macro avg       0.31      0.31      0.31      5215
weighted avg       0.31      0.31      0.31      5215



### **Giving the Skipgram representation as input to the static model**

In [37]:
static_model(sg_array,y)

Accuracy: 0.312751677852349
              precision    recall  f1-score   support

       anger       0.30      0.31      0.30       779
     disgust       0.24      0.26      0.25       709
        fear       0.34      0.33      0.33       750
         joy       0.34      0.19      0.25       763
     neutral       0.28      0.41      0.33       744
     sadness       0.33      0.27      0.29       738
    surprise       0.42      0.42      0.42       732

    accuracy                           0.31      5215
   macro avg       0.32      0.31      0.31      5215
weighted avg       0.32      0.31      0.31      5215



## **Neural network**

In [38]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.model_selection import train_test_split

### **NN input given as bow matrix**

In [39]:
y = pd.get_dummies(train_df['Emotion']).values  # Convert labels to one-hot encoding

In [40]:
# Split the data
X_train_bow, X_test_bow, y_train_bow, y_test_bow = train_test_split(bow_matrix, y, test_size=0.2, random_state=42)

In [41]:
# Build the model
model = Sequential([
    Dense(512, activation='relu', input_shape=(X_train_bow.shape[1],)),
    Dropout(0.5),
    Dense(256, activation='relu'),
    Dropout(0.5),
    Dense(y_train_bow.shape[1], activation='softmax')
])

In [42]:
# Compile the model
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [43]:
# Train the model
model.fit(X_train_bow, y_train_bow, epochs=10, batch_size=128, validation_data=(X_test_bow, y_test_bow))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7d009f436f20>

In [49]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Conv1D, GlobalMaxPooling1D, InputLayer, Reshape

In [50]:
input_shape = X_train_bow.shape[1]  # Number of features

In [None]:
# Evaluate the model
loss, accuracy = model.evaluate(X_test_bow, y_test_bow)
print(f'Loss: {loss}, Accuracy: {accuracy}')

Loss: 0.4647997319698334, Accuracy: 0.846596360206604


### **NN input given as tf-idf array**

In [53]:
# Split the data
X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(tfidf_array, y, test_size=0.2, random_state=42)

In [54]:
# Build the model
model_tfidf = Sequential([
    Dense(512, activation='relu', input_shape=(X_train_tfidf.shape[1],)),
    Dropout(0.5),
    Dense(256, activation='relu'),
    Dropout(0.5),
    Dense(y_train_tfidf.shape[1], activation='softmax')
])

In [55]:
# Compile the model
model_tfidf.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [56]:
# Train the model
model_tfidf.fit(X_train_tfidf, y_train_tfidf, epochs=10, batch_size=128, validation_data=(X_test_tfidf, y_test_tfidf))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7cff803a7280>

In [57]:
# Evaluate the model
loss, accuracy = model_tfidf.evaluate(X_test_tfidf, y_test_tfidf)
print(f'Loss: {loss}, Accuracy: {accuracy}')

Loss: 0.5034346580505371, Accuracy: 0.8312559723854065


### **NN input given as cbow matrix**

In [58]:
# Split the data
X_train_cbow, X_test_cbow, y_train_cbow, y_test_cbow = train_test_split(cbow_array, y, test_size=0.2, random_state=42)

In [59]:
# Build the model
model_cbow = Sequential([
    Dense(512, activation='relu', input_shape=(X_train_cbow.shape[1],)),
    Dropout(0.5),
    Dense(256, activation='relu'),
    Dropout(0.5),
    Dense(y_train_cbow.shape[1], activation='softmax')
])

In [60]:
# Compile the model
model_cbow.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [61]:
# Train the model
model_cbow.fit(X_train_cbow, y_train_cbow, epochs=10, batch_size=128, validation_data=(X_test_cbow, y_test_cbow))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7cff800cb970>

In [62]:
# Evaluate the model
loss, accuracy = model_cbow.evaluate(X_test_cbow, y_test_cbow)
print(f'Loss: {loss}, Accuracy: {accuracy}')

Loss: 1.6212859153747559, Accuracy: 0.3616490960121155


### **NN input given as skipgram array**

In [63]:
# Split the data
X_train_sg, X_test_sg, y_train_sg, y_test_sg = train_test_split(sg_array, y, test_size=0.2, random_state=42)

In [64]:
# Build the model
model_sg = Sequential([
    Dense(512, activation='relu', input_shape=(X_train_sg.shape[1],)),
    Dropout(0.5),
    Dense(256, activation='relu'),
    Dropout(0.5),
    Dense(y_train_sg.shape[1], activation='softmax')
])

In [65]:
# Compile the model
model_sg.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [66]:
model_sg.fit(X_train_sg, y_train_sg, epochs=10, batch_size=128, validation_data=(X_test_sg, y_test_sg))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7cff20719c90>

In [67]:
# Evaluate the model
loss, accuracy = model_sg.evaluate(X_test_sg, y_test_sg)
print(f'Loss: {loss}, Accuracy: {accuracy}')

Loss: 1.6924107074737549, Accuracy: 0.34228187799453735


## **LSTM**

In [68]:
from tensorflow.keras.layers import LSTM, Dense, Dropout, Embedding

In [74]:
# Example to reshape a 2D array to 3D
X_train_reshaped = X_train_bow.reshape(X_train_bow.shape[0], X_train_bow.shape[1], 1)  # Adding an extra dimension

# Adjust the model input shape
model = Sequential()
model.add(LSTM(128, return_sequences=True, input_shape=(X_train_reshaped.shape[1], X_train_reshaped.shape[2])))
model.add(Dropout(0.2))
model.add(LSTM(64, return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(y_train_bow.shape[1], activation='softmax'))


In [75]:
# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [76]:
# Train the model
model.fit(X_train_reshaped, y_train_bow, epochs=10, batch_size=64, validation_data=(X_test_bow, y_test_bow))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7cff1be47ac0>

## **Bidirectional LSTM**

In [None]:
print(y_train.shape)

(7991,)


In [None]:
from tensorflow.keras.utils import to_categorical

y_train_encoded = to_categorical(y_train)
y_test_encoded = to_categorical(y_test)

In [None]:
from tensorflow.keras.layers import Bidirectional, Embedding

X_train_reshaped = X_train_bow.reshape(X_train_bow.shape[0], X_train_bow.shape[1], 1)  # Adding an extra dimension


# Assuming `max_features` is the size of your vocabulary and `embedding_dim` is the dimension of the embeddings
max_features = 10000  # Example value
embedding_dim = 100  # Example value

model = Sequential()
model.add(Embedding(input_dim=max_features, output_dim=embedding_dim, input_length=X_train_reshaped.shape[1]))
model.add(Bidirectional(LSTM(128, return_sequences=True)))
model.add(Dropout(0.2))
model.add(Bidirectional(LSTM(64)))
model.add(Dropout(0.2))
#model.add(Dense(y_train.shape[0], activation='softmax'))
model.add(Dense(7, activation='softmax'))  # Assuming 7 classes

In [None]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Train the model
model.fit(X_train_reshaped, y_train_bow, epochs=10, batch_size=64, validation_data=(X_test_bow, y_test_bow))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7e7bd83355a0>