In [4]:
import os
import json
import numpy as np
from datasets import load_dataset

# Trained artifacts go into a "models" folder under the current working directory
working_dir = os.getcwd()
model_dir = os.path.join(working_dir, "models")
# exist_ok=True keeps this cell safe to re-run once the folder already exists
os.makedirs(model_dir, exist_ok=True)
print("✅ Model directory ready:", model_dir)


✅ Model directory ready: /Users/ravitejakondamuri/Downloads/Feel2Stream/feel2stream/ml/text_emotion/models


In [5]:
# Fetch the GoEmotions dataset; the train/validation/test splits are read from it below
dataset = load_dataset("go_emotions")

# Label names are read from the feature schema of the train split's `labels` column
label_names = dataset["train"].features["labels"].feature.names
print("Number of labels:", len(label_names))
print(label_names)

# Turn each split into a pandas DataFrame for the processing steps that follow
df_train, df_val, df_test = (
    dataset[split].to_pandas() for split in ("train", "validation", "test")
)

for caption, frame in (
    ("Train samples:", df_train),
    ("Val samples:", df_val),
    ("Test samples:", df_test),
):
    print(caption, len(frame))
df_train.head()


Generating train split: 100%|██████████| 43410/43410 [00:00<00:00, 1216556.66 examples/s]
Generating validation split: 100%|██████████| 5426/5426 [00:00<00:00, 1328253.39 examples/s]
Generating test split: 100%|██████████| 5427/5427 [00:00<00:00, 2055674.87 examples/s]

Number of labels: 28
['admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise', 'neutral']
Train samples: 43410
Val samples: 5426
Test samples: 5427





Unnamed: 0,text,labels,id
0,My favourite food is anything I didn't have to...,[27],eebbqej
1,"Now if he does off himself, everyone will thin...",[27],ed00q6i
2,WHY THE FUCK IS BAYLESS ISOING,[2],eezlygj
3,To make her feel threatened,[14],ed7ypvh
4,Dirty Southern Wankers,[3],ed0bdzj


In [6]:
from sklearn.preprocessing import MultiLabelBinarizer

# Pin the binarizer's classes to the ids 0..len(label_names)-1 so every split
# is encoded with the same, fixed set of indicator columns.
n_labels = len(label_names)
mlb = MultiLabelBinarizer(classes=list(range(n_labels)))

# Fit on the training labels, then reuse the fitted binarizer for val/test
Y_train = mlb.fit_transform(df_train["labels"])
Y_val, Y_test = (mlb.transform(frame["labels"]) for frame in (df_val, df_test))

print("Y_train shape:", Y_train.shape)


Y_train shape: (43410, 28)


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features over unigrams and bigrams, vocabulary capped at 20,000 terms
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=20000)

# The vocabulary is fitted on the training text only; val/test are transformed with it
X_train = vectorizer.fit_transform(df_train["text"].astype(str))
X_val, X_test = (
    vectorizer.transform(frame["text"].astype(str)) for frame in (df_val, df_test)
)

print("X_train shape:", X_train.shape)


X_train shape: (43410, 20000)


In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
import time

# One-vs-rest wrapper around logistic regression for the multi-label targets.
# The SAGA solver shuffles the data while optimising, so random_state is pinned
# to make the fitted weights (and every metric derived from them) repeatable
# across Restart & Run All.
clf = OneVsRestClassifier(
    LogisticRegression(max_iter=1000, solver='saga', random_state=42)
)

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() can jump if the system clock is adjusted mid-run.
t0 = time.perf_counter()
clf.fit(X_train, Y_train)
print("Training time (s):", round(time.perf_counter() - t0, 1))


Training time (s): 6.1


In [9]:
from sklearn.metrics import f1_score, classification_report

# Predicted label matrix for the test split
Y_pred = clf.predict(X_test)

# zero_division=0 mirrors the setting used for the per-class report in this
# cell: labels with an undefined score contribute 0 (the same value the default
# produces) without emitting UndefinedMetricWarning.
print("Micro F1:", f1_score(Y_test, Y_pred, average='micro', zero_division=0))
print("Macro F1:", f1_score(Y_test, Y_pred, average='macro', zero_division=0))

# Per-class precision / recall / F1 / support, one row per emotion label
report = classification_report(Y_test, Y_pred, target_names=label_names, zero_division=0)
print(report)


Micro F1: 0.3992194674012856
Macro F1: 0.20972689273416487
                precision    recall  f1-score   support

    admiration       0.75      0.30      0.43       504
     amusement       0.82      0.40      0.54       264
         anger       0.67      0.09      0.16       198
     annoyance       0.75      0.01      0.02       320
      approval       0.70      0.05      0.10       351
        caring       0.83      0.07      0.14       135
     confusion       0.36      0.03      0.05       153
     curiosity       0.55      0.08      0.14       284
        desire       0.90      0.11      0.19        83
disappointment       1.00      0.01      0.01       151
   disapproval       0.53      0.04      0.07       267
       disgust       0.81      0.14      0.24       123
 embarrassment       0.00      0.00      0.00        37
    excitement       0.69      0.11      0.18       103
          fear       0.86      0.08      0.14        78
     gratitude       0.97      0.80      0.8

In [10]:
import joblib

# Persist the fitted vectorizer and classifier with joblib ...
for artifact, filename in (
    (vectorizer, "text_vectorizer.joblib"),
    (clf, "text_clf.joblib"),
):
    joblib.dump(artifact, os.path.join(model_dir, filename))

# ... and the label names as a JSON list alongside them
labels_path = os.path.join(model_dir, "text_labels.json")
with open(labels_path, "w") as labels_file:
    json.dump(label_names, labels_file)

print("✅ Saved files:", os.listdir(model_dir))


✅ Saved files: ['text_labels.json', 'text_vectorizer.joblib', 'text_clf.joblib']


In [11]:
def predict_single(text):
    """Score one text and report its highest-probability emotion label.

    Relies on the notebook-level ``vectorizer``, ``clf`` and ``label_names``.

    Args:
        text: The text to classify. It is wrapped in a one-element list
            before being handed to ``vectorizer.transform``.

    Returns:
        dict with three keys:
            "label": the entry of ``label_names`` at the argmax position.
            "confidence": the largest probability, as a Python float.
            "probs_per_label": the output of ``clf.predict_proba`` flattened
                to 1-D. The sample outputs in this notebook show these
                values are per-label scores that need not sum to 1.
    """
    features = vectorizer.transform([text])
    scores = clf.predict_proba(features)  # shape (1, n_labels)
    best = scores.argmax(axis=1)[0]
    return {
        "label": label_names[best],
        "confidence": float(scores.max()),
        "probs_per_label": scores.reshape(-1),
    }

# Smoke-test the helper on two hand-written sentences
for sample_text in (
    "I am so happy and excited about this!",
    "I'm feeling nervous and scared.",
):
    print(predict_single(sample_text))


{'label': 'joy', 'confidence': 0.7769911128516354, 'probs_per_label': array([0.04651455, 0.01063979, 0.01475206, 0.01685379, 0.03594727,
       0.01555538, 0.02102665, 0.02922412, 0.00756872, 0.02575382,
       0.01002369, 0.00996529, 0.00646541, 0.60998882, 0.01338255,
       0.03608113, 0.00237585, 0.77699111, 0.02515943, 0.00755628,
       0.01126097, 0.0041273 , 0.01528611, 0.00574413, 0.00864157,
       0.03136969, 0.01318175, 0.02835595])}
{'label': 'fear', 'confidence': 0.7713924866210388, 'probs_per_label': array([0.02632959, 0.01063469, 0.01942455, 0.03065062, 0.02886474,
       0.03842683, 0.01339707, 0.00739136, 0.01248241, 0.01807423,
       0.01978399, 0.01952902, 0.01005187, 0.01920673, 0.77139249,
       0.00891942, 0.00180421, 0.03140519, 0.01076271, 0.13701752,
       0.01261661, 0.00227246, 0.0227321 , 0.00370255, 0.00661075,
       0.02469692, 0.01008147, 0.07945032])}
