In [83]:
import pandas as pd
import numpy as np
import tensorflow as tf
import random

In [84]:
SEED=42
tf.random.set_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

In [85]:
df=pd.read_csv('tweet_emotions.csv')

In [86]:
df

Unnamed: 0,tweet_id,sentiment,content
0,1956967341,empty,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,wants to hang out with friends SOON!
4,1956968416,neutral,@dannycastillo We want to trade with someone w...
...,...,...,...
39995,1753918954,neutral,@JohnLloydTaylor
39996,1753919001,love,Happy Mothers Day All my love
39997,1753919005,love,Happy Mother's Day to all the mommies out ther...
39998,1753919043,happiness,@niariley WASSUP BEAUTIFUL!!! FOLLOW ME!! PEE...


In [87]:
print((df['sentiment']=='empty').sum())


827


In [88]:
df=df[~df['sentiment'].isin(['empty'])]

In [89]:
df

Unnamed: 0,tweet_id,sentiment,content
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,wants to hang out with friends SOON!
4,1956968416,neutral,@dannycastillo We want to trade with someone w...
5,1956968477,worry,Re-pinging @ghostridah14: why didn't you go to...
...,...,...,...
39995,1753918954,neutral,@JohnLloydTaylor
39996,1753919001,love,Happy Mothers Day All my love
39997,1753919005,love,Happy Mother's Day to all the mommies out ther...
39998,1753919043,happiness,@niariley WASSUP BEAUTIFUL!!! FOLLOW ME!! PEE...


In [90]:
df=df.reset_index(drop=True)

In [91]:
df

Unnamed: 0,tweet_id,sentiment,content
0,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...
1,1956967696,sadness,Funeral ceremony...gloomy friday...
2,1956967789,enthusiasm,wants to hang out with friends SOON!
3,1956968416,neutral,@dannycastillo We want to trade with someone w...
4,1956968477,worry,Re-pinging @ghostridah14: why didn't you go to...
...,...,...,...
39168,1753918954,neutral,@JohnLloydTaylor
39169,1753919001,love,Happy Mothers Day All my love
39170,1753919005,love,Happy Mother's Day to all the mommies out ther...
39171,1753919043,happiness,@niariley WASSUP BEAUTIFUL!!! FOLLOW ME!! PEE...


In [135]:
df['sentiment'].unique()

array(['sadness', 'enthusiasm', 'neutral', 'worry', 'surprise', 'love',
       'fun', 'hate', 'happiness', 'boredom', 'relief', 'anger'],
      dtype=object)

In [137]:
df['sentiment'].value_counts()

sentiment
neutral       8638
worry         8459
happiness     5209
sadness       5165
love          3842
surprise      2187
fun           1776
relief        1526
hate          1323
enthusiasm     759
boredom        179
anger          110
Name: count, dtype: int64

In [93]:
y=df['sentiment']

In [94]:
from sklearn.preprocessing import LabelEncoder

In [95]:
label_encoder=LabelEncoder()

In [96]:
y=label_encoder.fit_transform(y)

In [97]:
y

array([9, 9, 2, ..., 6, 4, 6], shape=(39173,))

In [98]:
print(np.unique(y))

[ 0  1  2  3  4  5  6  7  8  9 10 11]


In [99]:
y=np.array(y)

In [100]:
from sklearn.model_selection import train_test_split

In [101]:
x_train_text,x_test_text,y_train,y_test=train_test_split(df['content'].values,y,test_size=0.2,stratify=y,random_state=42)

In [102]:
train_length=pd.Series(x_train_text).str.split().str.len()
sentence_length=int(np.percentile(train_length,98))
sentence_length

27

In [103]:
from sklearn.utils.class_weight import compute_class_weight

In [104]:
classes=np.unique(y_train)

In [105]:
weights=compute_class_weight(
    class_weight='balanced', #this means automatically compute inverse frequency weights
    classes=classes, #this is ofc self explanatory
    y=y_train #this also self explanatory as we dont wanna give test data to it
)
class_weights=dict(zip(classes,weights))  #this pairs the class label and its respective weight (dictionary)
#weights for one label is  like- total weights/(no of classes * no of occurances of that label)

In [106]:
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,log_loss,roc_auc_score

In [107]:
def printallmetrics(name,y_test,y_pred,y_pred_prob=None):
    print(f"{name}\naccuracy: {accuracy_score(y_test,y_pred)}\nconfustion matrix: \n{confusion_matrix(y_test,y_pred)}\nclassification report:{classification_report(y_test,y_pred)}") 
    if y_pred_prob is not None:
        print(f"\nlog loss:{log_loss(y_test,y_pred_prob)}\nroc auc score:{roc_auc_score(y_test,y_pred_prob,multi_class='ovr')}")  

In [108]:
from tensorflow.keras.layers import TextVectorization

In [109]:
tv=TextVectorization(output_mode='int',output_sequence_length=sentence_length,max_tokens=10000)

In [110]:
tv.adapt(x_train_text)

In [111]:
vocabulary_size=len(tv.get_vocabulary())
vocabulary_size

10000

In [112]:
train_ds=tf.data.Dataset.from_tensor_slices((x_train_text,y_train))
test_ds=tf.data.Dataset.from_tensor_slices((x_test_text,y_test))

In [113]:
train_ds=train_ds.batch(128).prefetch(tf.data.AUTOTUNE)
test_ds=test_ds.batch(128).prefetch(tf.data.AUTOTUNE)

In [114]:
from tensorflow.keras.layers import Input,Dropout,Dense,Bidirectional,LSTM,Embedding
from tensorflow.keras.models import Sequential

In [115]:
model_bilstm=Sequential([
    Input(shape=(),dtype=tf.string),
    tv,
    Embedding(input_dim=vocabulary_size,output_dim=50,mask_zero=True),
    Dropout(0.5),
    Bidirectional(LSTM(16,dropout=0.3,recurrent_dropout=0.3)),
    Dropout(0.5),
    Dense(len(np.unique(y)),activation='softmax')
])

In [116]:
from tensorflow.keras.optimizers import AdamW

In [117]:
model_bilstm.compile(loss='sparse_categorical_crossentropy',optimizer=AdamW(learning_rate=1e-3),metrics=['accuracy'])

In [118]:
early_stopping=tf.keras.callbacks.EarlyStopping(
    patience=7,
    min_delta=0.001,
    verbose=1,
    monitor='val_loss',
    restore_best_weights=True
)

In [119]:
model_bilstm.fit(train_ds,validation_data=(test_ds),epochs=30,callbacks=[early_stopping],class_weight=class_weights) #no need to mention batch_size because we prefetched batch_size before

Epoch 1/30


[1m245/245[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 29ms/step - accuracy: 0.1135 - loss: 2.4797 - val_accuracy: 0.1191 - val_loss: 2.4692
Epoch 2/30
[1m245/245[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 26ms/step - accuracy: 0.1641 - loss: 2.3987 - val_accuracy: 0.1683 - val_loss: 2.3431
Epoch 3/30
[1m245/245[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 25ms/step - accuracy: 0.1840 - loss: 2.2772 - val_accuracy: 0.1815 - val_loss: 2.2910
Epoch 4/30
[1m245/245[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 27ms/step - accuracy: 0.2021 - loss: 2.1691 - val_accuracy: 0.2066 - val_loss: 2.2346
Epoch 5/30
[1m245/245[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 31ms/step - accuracy: 0.2119 - loss: 2.0650 - val_accuracy: 0.1971 - val_loss: 2.2202
Epoch 6/30
[1m245/245[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 30ms/step - accuracy: 0.2232 - loss: 1.9859 - val_accuracy: 0.1967 - val_loss: 2.2054
Epoch 7/30
[1m245/245[0m [32m

<keras.src.callbacks.history.History at 0x241d1aab250>

In [120]:
y_pred_bilstm=model_bilstm.predict(test_ds)

[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step


In [121]:
y_pred_bilstm

array([[0.02423854, 0.00902167, 0.13260196, ..., 0.11199786, 0.16970703,
        0.15730478],
       [0.00674286, 0.12475148, 0.01836986, ..., 0.28386885, 0.04053751,
        0.23944575],
       [0.00356071, 0.02642628, 0.03422906, ..., 0.25501326, 0.0964348 ,
        0.23793113],
       ...,
       [0.00977738, 0.00372954, 0.2013542 , ..., 0.06910355, 0.10988007,
        0.08392026],
       [0.01008316, 0.04841737, 0.03636617, ..., 0.3164455 , 0.07562312,
        0.23212196],
       [0.03664453, 0.00367584, 0.08776226, ..., 0.04353422, 0.25680795,
        0.06106127]], shape=(7835, 12), dtype=float32)

In [122]:
y_pred_bilstm_labels=np.argmax(y_pred_bilstm,axis=1)

In [123]:
y_pred_bilstm

array([[0.02423854, 0.00902167, 0.13260196, ..., 0.11199786, 0.16970703,
        0.15730478],
       [0.00674286, 0.12475148, 0.01836986, ..., 0.28386885, 0.04053751,
        0.23944575],
       [0.00356071, 0.02642628, 0.03422906, ..., 0.25501326, 0.0964348 ,
        0.23793113],
       ...,
       [0.00977738, 0.00372954, 0.2013542 , ..., 0.06910355, 0.10988007,
        0.08392026],
       [0.01008316, 0.04841737, 0.03636617, ..., 0.3164455 , 0.07562312,
        0.23212196],
       [0.03664453, 0.00367584, 0.08776226, ..., 0.04353422, 0.25680795,
        0.06106127]], shape=(7835, 12), dtype=float32)

In [124]:
printallmetrics("Bilstm",y_test,y_pred_bilstm_labels,y_pred_bilstm)

Bilstm
accuracy: 0.20727504786215697
confustion matrix: 
[[  0   0   3   1   0   3   2   0   3   3   3   4]
 [  1   8   1   5   0   3   2   0   3   6   4   3]
 [  6   3  21  25  17   2  13  10  15  23  14   3]
 [ 12   7  25 106  49   6  30   4  47  15  34  20]
 [ 27   5  71 189 208  11 200  28 142  37  87  37]
 [ 10  15  11  13   7  87   9   1  13  53  16  30]
 [ 10   9  20  71  77  13 375  15  75  39  40  24]
 [119  59 163 162 126  73 125 124 268 160 194 155]
 [ 10   6  25  35  32   9  32  14  80  19  23  20]
 [ 49  53  43  62  30 106  41  32  96 321  71 129]
 [ 31   9  37  54  31  25  48  16  33  52  57  44]
 [ 73  54 127 116  66 121  65  50 193 431 159 237]]
classification report:              precision    recall  f1-score   support

           0       0.00      0.00      0.00        22
           1       0.04      0.22      0.06        36
           2       0.04      0.14      0.06       152
           3       0.13      0.30      0.18       355
           4       0.32      0.20    

the one above doesn't perform well, trying tfidf + svm now

In [125]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [126]:
tfidf=TfidfVectorizer(ngram_range=(1,3),max_features=20000)

In [127]:
x_train_tfidf=tfidf.fit_transform(x_train_text)
x_test_tfidf=tfidf.transform(x_test_text)

In [128]:
from sklearn.svm import LinearSVC

In [129]:
model_svm=LinearSVC(multi_class='ovr')

In [130]:
model_svm.fit(x_train_tfidf,y_train)

0,1,2
,penalty,'l2'
,loss,'squared_hinge'
,dual,'auto'
,tol,0.0001
,C,1.0
,multi_class,'ovr'
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,verbose,0


In [131]:
y_pred_svm=model_svm.predict(x_test_tfidf)

In [132]:
y_pred_svm

array([ 9, 11, 10, ...,  9,  9, 11], shape=(7835,))

In [133]:
printallmetrics("svm",y_test,y_pred_svm)

svm
accuracy: 0.3179323548181238
confustion matrix: 
[[  0   0   0   1   2   0   0   6   1   4   0   8]
 [  0   0   0   0   3   0   0  10   0   8   3  12]
 [  0   0   2   6  28   1   7  55   2  12   5  34]
 [  0   1   2  26  79   2  44 109   6  24   9  53]
 [  0   0   7  48 357   7 126 239  19  50  37 152]
 [  0   0   0   3  22  34   7  46   2  51   6  94]
 [  0   0   1  16 164   6 277 138  10  37  21  98]
 [  0   2   6  33 208  22  97 748  41 156  48 367]
 [  0   0   3   8  54   3  29 103  17  19   5  64]
 [  0   1   0  10  63  30  43 200  11 308  21 346]
 [  0   0   2  10  66   4  32 118   5  48  40 112]
 [  0   0   8  28 107  17  56 417  21 302  54 682]]
classification report:              precision    recall  f1-score   support

           0       0.00      0.00      0.00        22
           1       0.00      0.00      0.00        36
           2       0.06      0.01      0.02       152
           3       0.14      0.07      0.10       355
           4       0.31      0.34      0.

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
