In [85]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

## Loading the Dataset

In [69]:
from pathlib import Path
import zipfile


zip_path = Path("/content/emotions.zip")
dest_dir = Path("/content")

if not dest_dir.is_file():
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        print(f"[INFO] Unzipping dataset `{zip_path}` to `{dest_dir}`...")
        zip_ref.extractall(dest_dir)

print(f"[INFO] Dataset succesfully downloaded to `{dest_dir}`..")

[INFO] Unzipping dataset `/content/emotions.zip` to `/content`...
[INFO] Dataset succesfully downloaded to `/content`..


## Peprocessing the Dataset

In [70]:
df_train = pd.read_csv(dest_dir / "train.txt", sep=";", names=["text", "emotion"])
df_test = pd.read_csv(dest_dir / "test.txt", sep=";", names=["text", "emotion"])
df_val = pd.read_csv(dest_dir / "val.txt", sep=";", names=["text", "emotion"])

df = pd.concat([df_train, df_test, df_val], ignore_index=True)

print(df.shape)
df.head(3)

(20000, 2)


Unnamed: 0,text,emotion
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger


In [71]:
# Printing the Distribution of `emotions`
df.value_counts("emotion")

emotion
joy         6761
sadness     5797
anger       2709
fear        2373
love        1641
surprise     719
dtype: int64

In [72]:
emotions_d = {
    "joy": 0,
    "sadness": 1,
    "anger": 2,
    "fear": 3,
    "love": 4,
    "surprise": 5
}

df["Emotion"] = df["emotion"].map(emotions_d)
df.rename(columns={"text": "Text"}, inplace=True)
df = df[["Text", "Emotion"]]

df.head(3)

Unnamed: 0,Text,Emotion
0,i didnt feel humiliated,1
1,i can go from feeling so hopeless to so damned...,1
2,im grabbing a minute to post i feel greedy wrong,2


In [73]:
df.value_counts("Emotion")

Emotion
0    6761
1    5797
2    2709
3    2373
4    1641
5     719
dtype: int64

## Splitting the Dataset into Training and Testing Sets

In [74]:
test_prop = 0.2

x_train, x_test, y_train, y_test = train_test_split(
    df["Text"].values,
    df["Emotion"],
    test_size = test_prop
)

print(len(x_train), len(y_train), len(x_test), len(y_test))

16000 16000 4000 4000


In [82]:
# Printing the Distribution of training and testing emations
print(f"Training Emotions: \n{y_train.value_counts()}\n")
print(f"Testing Emotions: \n{y_test.value_counts()}")

Training Emotions: 
0    5413
1    4639
2    2150
3    1905
4    1312
5     581
Name: Emotion, dtype: int64

Testing Emotions: 
0    1348
1    1158
2     559
3     468
4     329
5     138
Name: Emotion, dtype: int64


## Naive Bayes

In [88]:
clf_nb = Pipeline([
    ("tf_idf", TfidfVectorizer()),
    ("Multi NB", MultinomialNB())
])

clf_nb.fit(x_train, y_train.values)

print(classification_report(y_test.values, clf_nb.predict(x_test), zero_division=0))

              precision    recall  f1-score   support

           0       0.58      0.99      0.73      1348
           1       0.69      0.91      0.78      1158
           2       0.96      0.17      0.29       559
           3       0.98      0.12      0.21       468
           4       1.00      0.02      0.05       329
           5       0.00      0.00      0.00       138

    accuracy                           0.64      4000
   macro avg       0.70      0.37      0.34      4000
weighted avg       0.72      0.64      0.54      4000



## KNN

In [89]:
clf_knn = Pipeline([
    ("tf_idf", TfidfVectorizer()),
    ("Knn", KNeighborsClassifier())
])

clf_knn.fit(x_train, y_train.values)

print(classification_report(y_test.values, clf_knn.predict(x_test), zero_division=0))

              precision    recall  f1-score   support

           0       0.67      0.87      0.76      1348
           1       0.70      0.78      0.74      1158
           2       0.78      0.57      0.65       559
           3       0.73      0.54      0.62       468
           4       0.72      0.34      0.46       329
           5       0.68      0.32      0.43       138

    accuracy                           0.70      4000
   macro avg       0.71      0.57      0.61      4000
weighted avg       0.71      0.70      0.69      4000



## Random Forest

In [90]:
clf_rf = Pipeline([
    ("tf_idf", TfidfVectorizer()),
    ("RFC", RandomForestClassifier())
])

clf_rf.fit(x_train, y_train.values)

print(classification_report(y_test.values, clf_rf.predict(x_test), zero_division=0))

              precision    recall  f1-score   support

           0       0.83      0.94      0.88      1348
           1       0.91      0.90      0.90      1158
           2       0.90      0.79      0.84       559
           3       0.83      0.82      0.83       468
           4       0.84      0.67      0.75       329
           5       0.86      0.65      0.74       138

    accuracy                           0.86      4000
   macro avg       0.86      0.80      0.82      4000
weighted avg       0.86      0.86      0.86      4000

