In [2]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [3]:
df = pd.read_csv("../data/processed/dataset.csv")

df.head()

Unnamed: 0,text,label
0,That game hurt.,sadness
1,>sexuality shouldn’t be a grouping category I...,admiration
2,"You do right, if you don't care then fuck 'em!",neutral
3,Man I love reddit.,love
4,"[NAME] was nowhere near them, he was by the Fa...",neutral


In [4]:
print(df.shape)
df["label"].value_counts()

(211225, 2)


label
neutral           55298
admiration        20542
approval          15530
annoyance         11929
disapproval        8917
amusement          8862
gratitude          8437
anger              7956
curiosity          7707
disappointment     6769
confusion          6600
love               5310
caring             5147
realization        5125
joy                5120
optimism           4994
excitement         4375
sadness            3863
surprise           3472
disgust            3420
desire             3002
fear               2514
embarrassment      1720
remorse            1648
nervousness         946
relief              814
pride               714
grief               494
Name: count, dtype: int64

In [5]:
X_train, X_test, y_train, y_test = train_test_split(
    df["text"], df["label"], test_size=0.2, random_state=42
)

vectorizer = TfidfVectorizer(max_features=20000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

model = LogisticRegression(max_iter=1000)
model.fit(X_train_vec, y_train)

preds = model.predict(X_test_vec)

print(classification_report(y_test, preds))

                precision    recall  f1-score   support

    admiration       0.50      0.47      0.49      4146
     amusement       0.54      0.56      0.55      1824
         anger       0.40      0.25      0.30      1596
     annoyance       0.24      0.10      0.14      2406
      approval       0.29      0.12      0.17      3005
        caring       0.29      0.14      0.19       980
     confusion       0.31      0.10      0.15      1318
     curiosity       0.34      0.12      0.18      1541
        desire       0.39      0.17      0.24       606
disappointment       0.26      0.07      0.11      1327
   disapproval       0.25      0.08      0.13      1797
       disgust       0.39      0.15      0.21       690
 embarrassment       0.42      0.08      0.14       345
    excitement       0.36      0.12      0.18       866
          fear       0.47      0.29      0.36       483
     gratitude       0.67      0.70      0.68      1675
         grief       0.29      0.02      0.04  

In [6]:
texts = [
    "I feel lonely today",
    "I am excited about my future",
    "Everything stresses me out",
    "I am angry at myself"
]

vec = vectorizer.transform(texts)
model.predict(vec)

array(['sadness', 'excitement', 'neutral', 'anger'], dtype=object)