In [1]:
# 1. IMPORT LIBRARIES
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report

In [2]:
# 2. LOAD DATA (GoEmotions training split)
url = 'https://storage.googleapis.com/gresearch/goemotions/data/full_dataset/goemotions_1.csv'
df = pd.read_csv(url)

In [3]:
df.head(5)

Unnamed: 0,text,id,author,subreddit,link_id,parent_id,created_utc,rater_id,example_very_unclear,admiration,...,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
0,That game hurt.,eew5j0j,Brdd9,nrl,t3_ajis4z,t1_eew18eq,1548381000.0,1,False,0,...,0,0,0,0,0,0,0,1,0,0
1,>sexuality shouldn’t be a grouping category I...,eemcysk,TheGreen888,unpopularopinion,t3_ai4q37,t3_ai4q37,1548084000.0,37,True,0,...,0,0,0,0,0,0,0,0,0,0
2,"You do right, if you don't care then fuck 'em!",ed2mah1,Labalool,confessions,t3_abru74,t1_ed2m7g7,1546428000.0,37,False,0,...,0,0,0,0,0,0,0,0,0,1
3,Man I love reddit.,eeibobj,MrsRobertshaw,facepalm,t3_ahulml,t3_ahulml,1547965000.0,18,False,0,...,1,0,0,0,0,0,0,0,0,0
4,"[NAME] was nowhere near them, he was by the Fa...",eda6yn6,American_Fascist713,starwarsspeculation,t3_ackt2f,t1_eda65q2,1546669000.0,2,False,0,...,0,0,0,0,0,0,0,0,0,1


In [4]:
# 3. EXPLORE DATA
print(df.columns)

Index(['text', 'id', 'author', 'subreddit', 'link_id', 'parent_id',
       'created_utc', 'rater_id', 'example_very_unclear', 'admiration',
       'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion',
       'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust',
       'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy',
       'love', 'nervousness', 'optimism', 'pride', 'realization', 'relief',
       'remorse', 'sadness', 'surprise', 'neutral'],
      dtype='object')


In [5]:
# 3. DEFINE TEXT AND EMOTION COLUMNS
text_col = 'text'
emotion_cols = df.columns[9:]  # All emotion columns start from index 9 onward

In [6]:
# 4. DROP MISSING
df.dropna(subset=[text_col], inplace=True)

In [7]:
# 5. INPUTS & TARGETS
X = df[text_col]
y = df[emotion_cols]

In [20]:
X

0                                          That game hurt.
1         >sexuality shouldn’t be a grouping category I...
2           You do right, if you don't care then fuck 'em!
3                                       Man I love reddit.
4        [NAME] was nowhere near them, he was by the Fa...
                               ...                        
69995          It's about fucking time, hope this is real.
69996    This is great! Can anyone make a request with ...
69997    I’m sorry. Can you please explain what are the...
69998                                  No but it should be
69999    This is so cruel. I literally feel physically ...
Name: text, Length: 70000, dtype: object

In [22]:
y.columns #28

Index(['admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring',
       'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval',
       'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief',
       'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization',
       'relief', 'remorse', 'sadness', 'surprise', 'neutral'],
      dtype='object')

In [8]:
X.shape

(70000,)

In [9]:
y.shape

(70000, 28)

In [10]:
print(y)

       admiration  amusement  anger  annoyance  approval  caring  confusion  \
0               0          0      0          0         0       0          0   
1               0          0      0          0         0       0          0   
2               0          0      0          0         0       0          0   
3               0          0      0          0         0       0          0   
4               0          0      0          0         0       0          0   
...           ...        ...    ...        ...       ...     ...        ...   
69995           0          0      0          1         0       0          0   
69996           1          0      0          0         0       0          0   
69997           0          0      0          0         0       0          0   
69998           0          0      0          0         0       0          0   
69999           0          0      0          0         0       0          0   

       curiosity  desire  disappointment  ...  love

In [11]:
# 7. CREATE PIPELINE
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=10000)),
    ('clf', LogisticRegression(max_iter=1000))
])

In [12]:
# 6. SPLIT DATA
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
model = OneVsRestClassifier(pipeline)
model.fit(X_train, y_train)

In [14]:
# 8. EVALUATION
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred, target_names=emotion_cols))

                precision    recall  f1-score   support

    admiration       0.69      0.22      0.33      1132
     amusement       0.59      0.27      0.37       621
         anger       0.52      0.06      0.10       519
     annoyance       0.38      0.01      0.02       946
      approval       0.64      0.02      0.05      1132
        caring       0.45      0.02      0.04       397
     confusion       0.45      0.02      0.04       500
     curiosity       0.52      0.04      0.08       680
        desire       0.50      0.05      0.09       242
disappointment       0.50      0.00      0.00       574
   disapproval       0.39      0.02      0.03       754
       disgust       0.38      0.03      0.06       328
 embarrassment       0.33      0.01      0.02       158
    excitement       0.58      0.04      0.07       378
          fear       0.62      0.08      0.14       210
     gratitude       0.90      0.68      0.78       816
         grief       0.00      0.00      0.00  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [15]:
# 9. SAVE MODEL AND LABELS
with open("emotion_model.pkl", "wb") as f:
    pickle.dump(model, f)

with open("emotion_labels.pkl", "wb") as f:
    pickle.dump(list(emotion_cols), f)