In [62]:
import os
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.stem import SnowballStemmer, WordNetLemmatizer

In [63]:
df_train = pd.read_csv("F:\\Text audio dataset\\cleaned_data.csv", names=['Text','Emotion'],skiprows=1)
df_test = pd.read_csv("F:\\Text audio dataset\\cleaned_data_test.csv", names=['Text','Emotion'], skiprows=1)
df_val = pd.read_csv("F:\\Text audio dataset\\cleaned_data_val.csv", names=['Text','Emotion'], skiprows=1)

In [64]:
print(df_train.head())
print(df_test.head())
print(df_val.head())

                                                Text  Emotion
0                              didnt feel humiliated  sadness
1  go feeling hopeless damned hopeful around some...  sadness
2          im grabbing minute post feel greedy wrong    anger
3  ever feeling nostalgic fireplace know still pr...     love
4                                    feeling grouchy    anger
                                                Text  Emotion
0        im feeling rather rotten im ambitious right  sadness
1                       im updating blog feel shitty  sadness
2    never make separate ever want feel like ashamed  sadness
3  left bouquet red yellow tulip arm feeling slig...      joy
4                            feeling little vain one  sadness
                                                Text  Emotion
0           im feeling quite sad sorry ill snap soon  sadness
1  feel like still looking blank canvas blank pie...  sadness
2                         feel like faithful servant     love
3       

In [65]:
print(df_train.shape)
print(df_test.shape)
print(df_val.shape)

(15969, 2)
(2000, 2)
(1998, 2)


In [66]:
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(df_train['Emotion'])
y_test = label_encoder.transform(df_test['Emotion'])
y_val = label_encoder.transform(df_val['Emotion'])

In [87]:
from gensim.models import Word2Vec

sentences_list = df_train['Text'].tolist()
word2vec_model = Word2Vec(sentences_list,vector_size=100, window=5, min_count=1, workers=4)

In [88]:
def get_sentence_vector(words):
    vectors = [word2vec_model.wv[word] for word in words if word in word2vec_model.wv]
    if len(vectors) == 0:
        return np.zeros(100)
    return np.mean(vectors, axis=0)

df_train['sentence_vector'] = df_train['Text'].apply(get_sentence_vector)

In [89]:
df_train

Unnamed: 0,Text,Emotion,sentence_vector
0,didnt feel humiliated,sadness,"[-0.030749341, -0.07660068, -0.035946943, -0.1..."
1,go feeling hopeless damned hopeful around some...,sadness,"[0.061690614, -0.098852776, -0.026147062, -0.1..."
2,im grabbing minute post feel greedy wrong,anger,"[0.020267883, -0.02348079, -0.02079027, -0.164..."
3,ever feeling nostalgic fireplace know still pr...,love,"[0.040944714, -0.0627573, -0.0019784956, -0.14..."
4,feeling grouchy,anger,"[0.027666656, -0.068388306, -0.020949673, -0.1..."
...,...,...,...
15995,brief time beanbag said anna feel like beaten,sadness,"[0.02400795, -0.06073008, -0.0203814, -0.16797..."
15996,turning feel pathetic still waiting table subb...,sadness,"[-0.0023249232, -0.01736735, -0.028346237, -0...."
15997,feel strong good overall,joy,"[0.09887757, -0.09199757, 0.015945943, -0.1047..."
15998,feel like rude comment im glad,anger,"[0.014871376, -0.013296366, -0.024289943, -0.1..."


In [90]:
df_train['sentence_vector'][2],len(df_train['sentence_vector'][1])

(array([ 0.02026788, -0.02348079, -0.02079027, -0.16465977, -0.05461169,
         0.07319041, -0.09943788,  0.00741329, -0.07366572,  0.03487959,
        -0.15088406,  0.00300505,  0.03064199, -0.12002999, -0.01006528,
        -0.09578458, -0.18828069, -0.05713226,  0.06147506, -0.08930459,
        -0.16537702,  0.03539183, -0.1705182 ,  0.10579743, -0.20061593,
         0.00041037,  0.06382192, -0.05542972, -0.01031882,  0.03865582,
        -0.10915138, -0.00879679, -0.08587605, -0.00664711,  0.02229692,
         0.07838175, -0.03001799, -0.02499956, -0.17860948, -0.0152457 ,
         0.00097373, -0.03667154, -0.02235921,  0.1106775 ,  0.04620753,
        -0.22575113,  0.04205475, -0.05612154, -0.04742569, -0.09819559,
        -0.0014096 ,  0.08694533,  0.17120388,  0.0309212 , -0.04003087,
        -0.02142649, -0.1458838 , -0.0918768 ,  0.00920051, -0.10044868,
         0.11588292, -0.13533351,  0.07435954,  0.11380649, -0.03107921,
         0.06678881,  0.02395021,  0.1365176 , -0.1

In [91]:
data = pd.DataFrame(df_train['sentence_vector'].tolist(),columns=[f"column {i+1}" for i in range(100)])
data['Emotion'] = df_train['Emotion'].values
data

Unnamed: 0,column 1,column 2,column 3,column 4,column 5,column 6,column 7,column 8,column 9,column 10,...,column 92,column 93,column 94,column 95,column 96,column 97,column 98,column 99,column 100,Emotion
0,-0.030749,-0.076601,-0.035947,-0.184100,-0.023655,0.125620,-0.089059,-0.050042,-0.083613,0.080097,...,0.044132,-0.006866,-0.125381,0.082594,0.014156,0.191702,0.004496,0.188375,-0.023166,sadness
1,0.061691,-0.098853,-0.026147,-0.166410,-0.071026,0.119074,-0.102980,-0.066305,-0.021033,0.102118,...,0.061218,-0.017334,-0.083883,0.186593,0.009334,0.188301,-0.005093,0.169606,0.042517,sadness
2,0.020268,-0.023481,-0.020790,-0.164660,-0.054612,0.073190,-0.099438,0.007413,-0.073666,0.034880,...,0.039402,-0.035551,-0.056545,0.118788,-0.034679,0.143485,0.016581,0.160757,-0.028947,anger
3,0.040945,-0.062757,-0.001978,-0.145221,-0.044619,0.085670,-0.084848,-0.040827,-0.045717,0.047729,...,0.024890,-0.026125,-0.061855,0.140821,-0.003352,0.176887,-0.001226,0.187470,-0.006896,love
4,0.027667,-0.068388,-0.020950,-0.167342,-0.051749,0.107654,-0.074549,-0.051992,-0.085842,0.047894,...,0.025910,-0.068886,-0.040761,0.061464,-0.031593,0.149806,0.016473,0.217624,-0.028060,anger
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15964,0.024008,-0.060730,-0.020381,-0.167978,-0.069138,0.118260,-0.076437,0.029902,-0.043210,0.044588,...,0.024269,0.001155,-0.113529,0.210936,-0.005457,0.179723,0.020440,0.166159,0.066233,sadness
15965,-0.002325,-0.017367,-0.028346,-0.177121,-0.041732,0.077047,-0.088043,0.015267,-0.096055,0.018570,...,0.028975,-0.029830,-0.100964,0.145333,-0.021165,0.170600,-0.013369,0.150233,-0.018255,sadness
15966,0.098878,-0.091998,0.015946,-0.104725,-0.091748,0.125566,-0.067995,-0.118043,0.000433,0.090823,...,-0.004177,-0.094924,-0.049542,0.153991,-0.036679,0.150837,-0.027715,0.171590,-0.001272,joy
15967,0.014871,-0.013296,-0.024290,-0.138008,-0.051715,0.068412,-0.097422,-0.004276,-0.064609,0.037537,...,0.062313,-0.027817,-0.071288,0.117009,-0.016226,0.145194,0.027745,0.139892,0.006722,anger


In [92]:
y_data = label_encoder.transform(data['Emotion'])
data

Unnamed: 0,column 1,column 2,column 3,column 4,column 5,column 6,column 7,column 8,column 9,column 10,...,column 92,column 93,column 94,column 95,column 96,column 97,column 98,column 99,column 100,Emotion
0,-0.030749,-0.076601,-0.035947,-0.184100,-0.023655,0.125620,-0.089059,-0.050042,-0.083613,0.080097,...,0.044132,-0.006866,-0.125381,0.082594,0.014156,0.191702,0.004496,0.188375,-0.023166,sadness
1,0.061691,-0.098853,-0.026147,-0.166410,-0.071026,0.119074,-0.102980,-0.066305,-0.021033,0.102118,...,0.061218,-0.017334,-0.083883,0.186593,0.009334,0.188301,-0.005093,0.169606,0.042517,sadness
2,0.020268,-0.023481,-0.020790,-0.164660,-0.054612,0.073190,-0.099438,0.007413,-0.073666,0.034880,...,0.039402,-0.035551,-0.056545,0.118788,-0.034679,0.143485,0.016581,0.160757,-0.028947,anger
3,0.040945,-0.062757,-0.001978,-0.145221,-0.044619,0.085670,-0.084848,-0.040827,-0.045717,0.047729,...,0.024890,-0.026125,-0.061855,0.140821,-0.003352,0.176887,-0.001226,0.187470,-0.006896,love
4,0.027667,-0.068388,-0.020950,-0.167342,-0.051749,0.107654,-0.074549,-0.051992,-0.085842,0.047894,...,0.025910,-0.068886,-0.040761,0.061464,-0.031593,0.149806,0.016473,0.217624,-0.028060,anger
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15964,0.024008,-0.060730,-0.020381,-0.167978,-0.069138,0.118260,-0.076437,0.029902,-0.043210,0.044588,...,0.024269,0.001155,-0.113529,0.210936,-0.005457,0.179723,0.020440,0.166159,0.066233,sadness
15965,-0.002325,-0.017367,-0.028346,-0.177121,-0.041732,0.077047,-0.088043,0.015267,-0.096055,0.018570,...,0.028975,-0.029830,-0.100964,0.145333,-0.021165,0.170600,-0.013369,0.150233,-0.018255,sadness
15966,0.098878,-0.091998,0.015946,-0.104725,-0.091748,0.125566,-0.067995,-0.118043,0.000433,0.090823,...,-0.004177,-0.094924,-0.049542,0.153991,-0.036679,0.150837,-0.027715,0.171590,-0.001272,joy
15967,0.014871,-0.013296,-0.024290,-0.138008,-0.051715,0.068412,-0.097422,-0.004276,-0.064609,0.037537,...,0.062313,-0.027817,-0.071288,0.117009,-0.016226,0.145194,0.027745,0.139892,0.006722,anger


In [94]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

X = data[[f'column {i+1}'for i in range(100)]]
y = y_data

X_train, X_test, y_train2, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

classifier = RandomForestClassifier()
classifier.fit(X_train, y_train2)

# Evaluate the model
y_pred = classifier.predict(X_test)
print('Accuracy:', accuracy_score(y_test, y_pred))

Accuracy: 0.3688165309956168
