In [23]:
import os
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from gensim.models import Word2Vec

In [24]:
df_train = pd.read_csv("F:\\Text audio dataset\\cleaned_data.csv", names=['Text','Emotion'],skiprows=1)
df_test = pd.read_csv("F:\\Text audio dataset\\cleaned_data_test.csv", names=['Text','Emotion'], skiprows=1)
df_val = pd.read_csv("F:\\Text audio dataset\\cleaned_data_val.csv", names=['Text','Emotion'], skiprows=1)

In [25]:
print(df_train.head())
print(df_test.head())
print(df_val.head())

                                                Text  Emotion
0                              didnt feel humiliated  sadness
1  go feeling hopeless damned hopeful around some...  sadness
2          im grabbing minute post feel greedy wrong    anger
3  ever feeling nostalgic fireplace know still pr...     love
4                                    feeling grouchy    anger
                                                Text  Emotion
0        im feeling rather rotten im ambitious right  sadness
1                       im updating blog feel shitty  sadness
2    never make separate ever want feel like ashamed  sadness
3  left bouquet red yellow tulip arm feeling slig...      joy
4                            feeling little vain one  sadness
                                                Text  Emotion
0           im feeling quite sad sorry ill snap soon  sadness
1  feel like still looking blank canvas blank pie...  sadness
2                         feel like faithful servant     love
3       

In [26]:
print(df_train.shape)
print(df_test.shape)
print(df_val.shape)

(15969, 2)
(2000, 2)
(1998, 2)


In [27]:
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(df_train['Emotion'])
y_test = label_encoder.transform(df_test['Emotion'])
y_val = label_encoder.transform(df_val['Emotion'])

In [28]:
tfidf = TfidfVectorizer()
tfidf_train = tfidf.fit_transform(df_train['Text'])
tfidf_test = tfidf.transform(df_test['Text'])
tfidf_val = tfidf.transform (df_val['Text'])

In [29]:
from sklearn.linear_model import LogisticRegression

model_LR = LogisticRegression()
model_LR.fit(tfidf_train,y_train)

y_pred = model_LR.predict(tfidf_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

f1 = f1_score(y_test,y_pred,average=None)
pd.DataFrame(f1,index=df_train.Emotion.unique(),columns=['F1 score'])

Accuracy: 0.871


Unnamed: 0,F1 score
sadness,0.857143
anger,0.836449
love,0.898236
surprise,0.707143
fear,0.912933
joy,0.660194


In [30]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.89      0.83      0.86       275
           1       0.88      0.80      0.84       224
           2       0.85      0.95      0.90       695
           3       0.82      0.62      0.71       159
           4       0.90      0.93      0.91       581
           5       0.92      0.52      0.66        66

    accuracy                           0.87      2000
   macro avg       0.87      0.77      0.81      2000
weighted avg       0.87      0.87      0.87      2000



In [31]:
from sklearn.tree import DecisionTreeClassifier

model_DT = DecisionTreeClassifier()
model_DT.fit(tfidf_train,y_train)

y_pred = model_DT.predict(tfidf_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

f1 = f1_score(y_test,y_pred,average=None)
pd.DataFrame(f1,index=df_train.Emotion.unique(),columns=['F1 score'])

Accuracy: 0.865


Unnamed: 0,F1 score
sadness,0.880143
anger,0.814159
love,0.896
surprise,0.768293
fear,0.898526
joy,0.601504


In [32]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.87      0.89      0.88       275
           1       0.81      0.82      0.81       224
           2       0.91      0.89      0.90       695
           3       0.75      0.79      0.77       159
           4       0.91      0.89      0.90       581
           5       0.60      0.61      0.60        66

    accuracy                           0.86      2000
   macro avg       0.80      0.82      0.81      2000
weighted avg       0.87      0.86      0.87      2000



In [33]:
from sklearn.svm import SVC

model_SVC = SVC()
model_SVC.fit(tfidf_train,y_train)

y_pred = model_SVC.predict(tfidf_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

f1 = f1_score(y_test,y_pred,average=None)
pd.DataFrame(f1,index=df_train.Emotion.unique(),columns=['F1 score'])

Accuracy: 0.8675


Unnamed: 0,F1 score
sadness,0.857685
anger,0.827907
love,0.893645
surprise,0.681648
fear,0.914821
joy,0.654206


In [34]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.90      0.82      0.86       275
           1       0.86      0.79      0.83       224
           2       0.83      0.96      0.89       695
           3       0.84      0.57      0.68       159
           4       0.91      0.92      0.91       581
           5       0.85      0.53      0.65        66

    accuracy                           0.87      2000
   macro avg       0.87      0.77      0.80      2000
weighted avg       0.87      0.87      0.86      2000



In [35]:
from sklearn.ensemble import RandomForestClassifier

model_RFC = RandomForestClassifier()
model_RFC.fit(tfidf_train,y_train)

y_pred = model_RFC.predict(tfidf_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

f1 = f1_score(y_test,y_pred,average=None)
pd.DataFrame(f1,index=df_train.Emotion.unique(),columns=['F1 score'])

Accuracy: 0.8865


Unnamed: 0,F1 score
sadness,0.891697
anger,0.867841
love,0.906294
surprise,0.745763
fear,0.931364
joy,0.603448


In [36]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.89      0.90      0.89       275
           1       0.86      0.88      0.87       224
           2       0.88      0.93      0.91       695
           3       0.81      0.69      0.75       159
           4       0.94      0.92      0.93       581
           5       0.70      0.53      0.60        66

    accuracy                           0.89      2000
   macro avg       0.85      0.81      0.82      2000
weighted avg       0.88      0.89      0.88      2000



In [37]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import torch.optim as optim

In [38]:
tfidf_train_tensor = torch.tensor(tfidf_train.toarray(), dtype=torch.float32)
tfidf_test_tensor = torch.tensor(tfidf_test.toarray(), dtype=torch.float32)

y_train_tensor = torch.tensor(y_train, dtype=torch.long)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

In [39]:
class LSTMClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(LSTMClassifier, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        # LSTM expects input of shape (batch_size, seq_length, input_size)
        x = x.unsqueeze(1)  # Add a sequence length dimension
        _, (hn, _) = self.lstm(x)  # hn is the hidden state
        out = self.fc(hn[-1])  # Fully connected layer takes the last hidden state
        return out

In [40]:
train_dataset = TensorDataset(tfidf_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)

input_size = tfidf_train_tensor.shape[1]  
hidden_size = 512
num_classes = len(label_encoder.classes_)
num_epochs = 10
learning_rate = 0.001

In [42]:
model = LSTMClassifier(input_size, hidden_size, num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(num_epochs):
    model.train()
    for X_batch, y_batch in train_loader:
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)

        # Backpropagation and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

# Evaluation
model.eval()
with torch.no_grad():
    y_pred = model(tfidf_test_tensor)
    y_pred_classes = torch.argmax(y_pred, dim=1).numpy()
    y_true = y_test_tensor.numpy()
    print(classification_report(y_true, y_pred_classes, target_names=label_encoder.classes_))

Epoch [1/10], Loss: 1.2218
Epoch [2/10], Loss: 0.3941
Epoch [3/10], Loss: 0.2378
Epoch [4/10], Loss: 0.0931
Epoch [5/10], Loss: 0.0550
Epoch [6/10], Loss: 0.0293
Epoch [7/10], Loss: 0.0450
Epoch [8/10], Loss: 0.0223
Epoch [9/10], Loss: 0.0054
Epoch [10/10], Loss: 0.0103
              precision    recall  f1-score   support

       anger       0.86      0.81      0.83       275
        fear       0.83      0.80      0.82       224
         joy       0.86      0.92      0.89       695
        love       0.74      0.69      0.71       159
     sadness       0.90      0.91      0.90       581
    surprise       0.81      0.64      0.71        66

    accuracy                           0.86      2000
   macro avg       0.83      0.79      0.81      2000
weighted avg       0.86      0.86      0.86      2000

