In [12]:
import spacy

import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the spaCy pipeline 'en_core_web_sm' once at import time; preprocess()
# calls it on every text to get tokens with their stop-word/punctuation flags
# and lemmas.
nlp = spacy.load('en_core_web_sm')

In [13]:
# Load the training split: ';'-separated with no header row, so pandas labels
# the columns 0 (the text) and 1 (the emotion label).
df = pd.read_csv('../txt/train.txt', sep=';', header=None)

# Restrict the task to three emotions by discarding the other labels in a
# single, non-mutating filter (row index labels are preserved).
df = df[~df[1].isin(['surprise', 'love', 'fear'])]

# Balance the classes: 'anger' is kept whole and the other two classes are
# down-sampled to its size with a fixed seed.
# NOTE(review): this replaces a hard-coded 2159, presumably the number of
# 'anger' rows in train.txt -- confirm the two agree.
df_anger = df[df[1] == 'anger']
n_train_per_class = len(df_anger)
df_joy = df[df[1] == 'joy'].sample(n_train_per_class, random_state=2023)
df_sadness = df[df[1] == 'sadness'].sample(n_train_per_class, random_state=2023)

df2 = pd.concat([df_anger, df_joy, df_sadness])

# Encode the string labels as integers. LabelEncoder numbers the classes in
# sorted order, so anger -> 0, joy -> 1, sadness -> 2.
le = LabelEncoder()
df2[1] = le.fit_transform(df2[1])

df2.sample(10)

Unnamed: 0,0,1
9114,i read after watching the film argued that it ...,0
13205,id love to see this campaign go viral to help ...,1
15737,i still can t get over the fact that i feel ab...,1
3268,i didnt use to feel embarrassed walking by peo...,2
14153,i feel i want to be carefree but all that is l...,1
159,i say whatever comes in my mind tell you direc...,0
9443,i apologize to anyone who may feel i have been...,2
11193,i am tired of feeling unloved undesired unappr...,2
8661,im already feeling less agitated,0
12131,i feel so unimportant to you now its not even ...,2


In [14]:
def preprocess(text):
    """Return `text` reduced to the space-joined lemmas of its kept tokens.

    The text is run through the module-level spaCy pipeline `nlp`. Tokens
    flagged as stop words or punctuation are skipped; the lemma of every
    remaining token is kept in its original order.
    """
    lemmas = [
        tok.lemma_
        for tok in nlp(text)
        if not (tok.is_stop or tok.is_punct)
    ]
    return " ".join(lemmas)

In [15]:
# Add the cleaned text (stop words and punctuation removed, tokens lemmatised)
# next to the raw training text in column 0.
df2['preprocess_text'] = df2[0].map(preprocess)

In [20]:
# Load the held-out split; same layout as the training file (column 0 is the
# text, column 1 the emotion label).
df_test = pd.read_csv('../txt/test.txt', header=None, sep=';')

# Keep only the three emotions the model is trained on, in one non-mutating
# filter (row index labels are preserved).
df_test = df_test[~df_test[1].isin(['surprise', 'love', 'fear'])]

# Balance the classes by down-sampling 'joy' and 'sadness' to the size of the
# 'anger' class with a fixed seed.
# NOTE: these three names rebind the training subsets created in the
# training-data cell.
df_anger = df_test[df_test[1] == 'anger']
n_test_per_class = len(df_anger)
df_joy = df_test[df_test[1] == 'joy'].sample(n_test_per_class, random_state=2023)
df_sadness = df_test[df_test[1] == 'sadness'].sample(n_test_per_class, random_state=2023)

df3 = pd.concat([df_anger, df_joy, df_sadness])

# Reuse the encoder fitted on the training labels instead of fitting a new one
# on the test labels: this guarantees the integer codes mean the same class in
# both frames, and transform() raises ValueError if the test set contains a
# label that was never seen in training.
df3[1] = le.transform(df3[1])

df3['preprocess_text'] = df3[0].apply(preprocess)

df3.sample(5)

Unnamed: 0,0,1,preprocess_text
551,i feel kind of awkward about doing this here goes,2,feel kind awkward go
619,i definitely feel there s some useful informat...,1,definitely feel s useful information face simi...
708,i feel lucky that theyve chosen to share their...,1,feel lucky ve choose share life
227,i feel like i cant be respected if i have self...,1,feel like not respect self respect regular hat...
1724,i declined to purchase any this time i enjoyed...,1,decline purchase time enjoy feel squish projec...


In [21]:
# Model 1: unigram-to-trigram counts fed into a 50-tree random forest.
clf = Pipeline([
    ('count_vectoriser', CountVectorizer(ngram_range=(1, 3))),
    # Seed the forest (same seed as the class-balancing samples) so that the
    # report below is reproducible from one run to the next.
    ('random_forest', RandomForestClassifier(n_estimators=50, random_state=2023)),
])

# Train on the preprocessed training text and its encoded labels.
clf.fit(df2['preprocess_text'], df2[1])

# Score on the preprocessed held-out text.
y_pred = clf.predict(df3['preprocess_text'])

print(classification_report(df3[1], y_pred))

              precision    recall  f1-score   support

           0       0.93      0.92      0.93       275
           1       0.92      0.90      0.91       275
           2       0.87      0.90      0.88       275

    accuracy                           0.91       825
   macro avg       0.91      0.91      0.91       825
weighted avg       0.91      0.91      0.91       825



In [22]:
# Model 2: TF-IDF features (vectoriser defaults) fed into a 50-tree random
# forest.
clf = Pipeline([
    # NOTE(review): this step is a TfidfVectorizer but still carries the label
    # 'count_vectoriser'; the label is left unchanged in case anything
    # addresses the step by name -- consider renaming it.
    ('count_vectoriser', TfidfVectorizer()),
    # Seed the forest (same seed as the class-balancing samples) so that the
    # report below is reproducible from one run to the next.
    ('random_forest', RandomForestClassifier(n_estimators=50, random_state=2023)),
])

# Train on the preprocessed training text and its encoded labels.
clf.fit(df2['preprocess_text'], df2[1])

# Score on the preprocessed held-out text.
y_pred = clf.predict(df3['preprocess_text'])

print(classification_report(df3[1], y_pred))

              precision    recall  f1-score   support

           0       0.92      0.95      0.93       275
           1       0.89      0.96      0.92       275
           2       0.94      0.85      0.89       275

    accuracy                           0.92       825
   macro avg       0.92      0.92      0.92       825
weighted avg       0.92      0.92      0.92       825

