In [4]:
import pandas as pd

# Read the GoEmotions CSV into a DataFrame and preview its first rows
DATA_PATH = "go_emotions_dataset.csv"
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,id,text,example_very_unclear,admiration,amusement,anger,annoyance,approval,caring,confusion,...,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
0,eew5j0j,That game hurt.,False,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,eemcysk,>sexuality shouldn’t be a grouping category I...,True,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ed2mah1,"You do right, if you don't care then fuck 'em!",False,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,eeibobj,Man I love reddit.,False,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,eda6yn6,"[NAME] was nowhere near them, he was by the Fa...",False,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [2]:
# Display the DataFrame's column index
df.columns

Index(['id', 'text', 'example_very_unclear', 'admiration', 'amusement',
       'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity',
       'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment',
       'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love',
       'nervousness', 'optimism', 'pride', 'realization', 'relief', 'remorse',
       'sadness', 'surprise', 'neutral'],
      dtype='object')

In [3]:
# 1. Define the emotion label columns.
# Labels are selected by excluding the known non-label columns rather than by
# position: this cell appends an "emotions" column below, so a positional
# slice (df.columns[3:]) would treat that column as a label when re-run.
non_label_columns = ["id", "text", "example_very_unclear", "emotions"]
emotion_labels = df.columns.drop(non_label_columns, errors="ignore")

# 2. Create a new column with a list of emotions for each row
def get_emotions(row):
    """Return the names of the emotion labels that are set to 1 in ``row``.

    Args:
        row: One row of ``df`` (as passed by ``DataFrame.apply(axis=1)``),
            indexable by every name in ``emotion_labels``.

    Returns:
        list: Label names, in ``emotion_labels`` order, whose value equals 1.
        Empty when no label is set for the row.
    """
    return [emotion for emotion in emotion_labels if row[emotion] == 1]

df["emotions"] = df.apply(get_emotions, axis=1)

# 3. Show the new structure
df[['text', 'emotions']].head()

Unnamed: 0,text,emotions
0,That game hurt.,[sadness]
1,>sexuality shouldn’t be a grouping category I...,[]
2,"You do right, if you don't care then fuck 'em!",[neutral]
3,Man I love reddit.,[love]
4,"[NAME] was nowhere near them, he was by the Fa...",[neutral]


In [4]:
# Sample up to 1000 rows from the original df for quick training.
# Capping the request at len(df) keeps DataFrame.sample from raising when the
# dataset holds fewer rows than asked for.
n_samples = min(1000, len(df))
sampled_df = df.sample(n=n_samples, random_state=42).reset_index(drop=True)

In [6]:
from transformers import DistilBertTokenizer
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from transformers import DistilBertForSequenceClassification

# Tokenizer for the same checkpoint name the model is loaded from further down
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Tokenize every sampled text: truncate at 128 tokens, pad, return PyTorch tensors
sample_texts = sampled_df['text'].tolist()
encodings = tokenizer(
    sample_texts,
    truncation=True,
    padding=True,
    max_length=128,
    return_tensors='pt',
)

# Label matrix (one column per emotion label) converted to a float tensor
label_matrix = sampled_df[emotion_labels].to_numpy()
labels = torch.tensor(label_matrix).float()

# 80/20 split over row positions with a fixed random_state
row_positions = range(len(sampled_df))
train_idx, val_idx = train_test_split(row_positions, test_size=0.2, random_state=42)

# Apply the same position split to every encoding tensor...
train_encodings = {name: tensor[train_idx] for name, tensor in encodings.items()}
val_encodings = {name: tensor[val_idx] for name, tensor in encodings.items()}

# ...and to the labels
train_labels, val_labels = labels[train_idx], labels[val_idx]

# Dataset wrapper pairing tokenizer encodings with their label rows
class GoEmotionsDataset(Dataset):
    """Map-style dataset over pre-computed encodings and labels.

    Each item is a dict holding, for every key in ``encodings``, the entry at
    the requested index, plus a ``'labels'`` entry taken from ``labels``.
    """

    def __init__(self, encodings, labels):
        """Store the encodings mapping and the labels container as given."""
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Number of examples, taken from the length of ``labels``."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the example at ``idx`` as a dict of fields plus ``'labels'``."""
        sample = {}
        for name, values in self.encodings.items():
            sample[name] = values[idx]
        sample['labels'] = self.labels[idx]
        return sample

# Create datasets
train_dataset = GoEmotionsDataset(train_encodings, train_labels)
val_dataset = GoEmotionsDataset(val_encodings, val_labels)

# Load model for multi-label classification.
# The label names are stored in the model config so that a saved checkpoint
# carries the emotion names instead of generic LABEL_<i> placeholders.
num_labels = len(emotion_labels)
id2label = {i: str(name) for i, name in enumerate(emotion_labels)}
label2id = {name: i for i, name in id2label.items()}
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
    problem_type="multi_label_classification"
)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
import numpy as np
from sklearn.metrics import f1_score, accuracy_score

def compute_metrics(pred):
    """Compute micro-averaged F1 and accuracy from logits and labels.

    NOTE: a later cell defines ``compute_metrics`` again; that definition
    shadows this one for every cell that runs after it.

    Args:
        pred: Pair ``(logits, labels)``. ``logits`` is passed to
            ``torch.tensor``; ``labels`` must support ``.astype``.

    Returns:
        dict: ``{'f1': ..., 'accuracy': ...}``.
    """
    raw_scores, targets = pred
    probabilities = torch.sigmoid(torch.tensor(raw_scores))
    # A label counts as predicted when its sigmoid probability exceeds 0.5
    binary_preds = (probabilities > 0.5).int().numpy()
    targets = targets.astype(int)

    return {
        'f1': f1_score(targets, binary_preds, average='micro'),
        'accuracy': accuracy_score(targets, binary_preds),
    }


In [8]:
from transformers import TrainingArguments

# Training configuration: evaluate and save a checkpoint once per epoch, and
# reload the best checkpoint when training ends.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    eval_strategy='epoch',  # current name of the deprecated `evaluation_strategy` argument
    save_strategy='epoch',  # must match the evaluation schedule for load_best_model_at_end
    logging_dir='./logs',
    logging_steps=50,
    load_best_model_at_end=True,
)




In [9]:
from sklearn.metrics import f1_score, accuracy_score
import numpy as np

def compute_metrics(pred, threshold=0.5):
    """Compute accuracy and micro-averaged F1 for multi-label predictions.

    Args:
        pred: Pair ``(logits, labels)`` as supplied by the ``Trainer`` during
            evaluation. ``logits`` is passed to ``torch.tensor``.
        threshold: Sigmoid probability above which a label is counted as
            predicted. Defaults to 0.5.

    Returns:
        dict: ``{'accuracy': ..., 'f1': ...}``.
    """
    logits, labels = pred
    probs = torch.sigmoid(torch.tensor(logits)).numpy()
    preds = (probs > threshold).astype(int)
    # Compare integer labels with integer predictions, as the earlier
    # definition of this function does.
    labels = np.asarray(labels).astype(int)

    # When no label is predicted or present, F1 is ill-defined; report 0
    # explicitly instead of relying on the library's warning default.
    f1 = f1_score(labels, preds, average="micro", zero_division=0)
    acc = accuracy_score(labels, preds)

    return {
        'accuracy': acc,
        'f1': f1,
    }


In [10]:
from transformers import Trainer, TrainingArguments
# Wire the model, training configuration, datasets and metric function together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,   # tokenized training split
    eval_dataset=val_dataset,      # tokenized validation split
    # `processing_class` replaces the deprecated `tokenizer` argument of Trainer
    processing_class=tokenizer,
    compute_metrics=compute_metrics  # metrics reported at each evaluation
)

# Run training; the returned TrainOutput is displayed as the cell result
trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.172,0.160854,0.035,0.0
2,0.1611,0.158611,0.035,0.0
3,0.158,0.158418,0.035,0.0


TrainOutput(global_step=300, training_loss=0.19705010414123536, metrics={'train_runtime': 230.5602, 'train_samples_per_second': 10.409, 'train_steps_per_second': 1.301, 'total_flos': 26091611481600.0, 'train_loss': 0.19705010414123536, 'epoch': 3.0})

In [11]:
# Save the trainer's model to ./emotion-aware-distilbert (the same call is repeated in a later cell)
trainer.save_model("./emotion-aware-distilbert")

In [12]:
# Evaluate with the trainer and print the returned metrics under a heading
eval_results = trainer.evaluate()
print("Evaluation Results:", eval_results, sep="\n")

Evaluation Results:
{'eval_loss': 0.15841835737228394, 'eval_accuracy': 0.035, 'eval_f1': 0.0, 'eval_runtime': 2.6222, 'eval_samples_per_second': 76.271, 'eval_steps_per_second': 9.534, 'epoch': 3.0}


In [13]:
# Write the model and the tokenizer into the same directory
output_dir = "./emotion-aware-distilbert"
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)


('./emotion-aware-distilbert\\tokenizer_config.json',
 './emotion-aware-distilbert\\special_tokens_map.json',
 './emotion-aware-distilbert\\vocab.txt',
 './emotion-aware-distilbert\\added_tokens.json')

In [14]:
from transformers import pipeline

# Build a text-classification pipeline from the saved model directory
model_dir = "./emotion-aware-distilbert"
emotion_pipeline = pipeline(
    "text-classification",
    model=model_dir,
    tokenizer=model_dir,
    return_all_scores=True,
)

# Score a new sentence
text = "I am feeling really happy and excited today!"
predictions = emotion_pipeline(text)

# Print one "<emotion>: <score>" line per label; names and scores are paired by position
scores_for_text = predictions[0]
for emotion, score in zip(emotion_labels, scores_for_text):
    print(f"{emotion}: {score['score']:.4f}")


Device set to use cpu


admiration: 0.0944
amusement: 0.0401
anger: 0.0388
annoyance: 0.0623
approval: 0.0889
caring: 0.0294
confusion: 0.0246
curiosity: 0.0450
desire: 0.0237
disappointment: 0.0347
disapproval: 0.0480
disgust: 0.0169
embarrassment: 0.0130
excitement: 0.0294
fear: 0.0126
gratitude: 0.0448
grief: 0.0118
joy: 0.0407
love: 0.0505
nervousness: 0.0152
optimism: 0.0374
pride: 0.0078
realization: 0.0388
relief: 0.0092
remorse: 0.0148
sadness: 0.0282
surprise: 0.0254
neutral: 0.2491




In [5]:
# Select the label columns by excluding the known non-label columns. A
# positional slice (df.columns[3:]) would also return the derived "emotions"
# column whenever df already carries it.
emotion_labels = df.columns.drop(
    ["id", "text", "example_very_unclear", "emotions"], errors="ignore"
)

In [6]:
from transformers import pipeline

# Build a text-classification pipeline from the saved model directory
model_dir = "./emotion-aware-distilbert"
emotion_pipeline = pipeline(
    "text-classification",
    model=model_dir,
    tokenizer=model_dir,
    return_all_scores=True,
)

# Score a longer passage
text = "It’s been such a rollercoaster week. On Monday, I was full of energy, thrilled to start my new project at work — it felt like everything was finally falling into place. But by midweek, I started doubting myself. The pressure, the deadlines, the constant comparison to others... it all got overwhelming. I even cried a bit on Thursday night because I felt like I wasn’t good enough. Still, today, something shifted. A close friend reminded me how far I’ve come, and their words sparked something inside me — a sense of hope, love, and determination. I’m not giving up. I’m learning, growing, and I believe that something wonderful is coming."
predictions = emotion_pipeline(text)

# Print one "<emotion>: <score>" line per label; names and scores are paired by position
scores_for_text = predictions[0]
for emotion, score in zip(emotion_labels, scores_for_text):
    print(f"{emotion}: {score['score']:.4f}")


Device set to use cpu


admiration: 0.0922
amusement: 0.0406
anger: 0.0384
annoyance: 0.0643
approval: 0.0907
caring: 0.0300
confusion: 0.0254
curiosity: 0.0447
desire: 0.0240
disappointment: 0.0347
disapproval: 0.0503
disgust: 0.0174
embarrassment: 0.0132
excitement: 0.0300
fear: 0.0126
gratitude: 0.0463
grief: 0.0122
joy: 0.0416
love: 0.0488
nervousness: 0.0160
optimism: 0.0387
pride: 0.0080
realization: 0.0411
relief: 0.0093
remorse: 0.0155
sadness: 0.0283
surprise: 0.0260
neutral: 0.2521


In [7]:
from transformers import pipeline

# Build a text-classification pipeline from the saved model directory
model_dir = "./emotion-aware-distilbert"
emotion_pipeline = pipeline(
    "text-classification",
    model=model_dir,
    tokenizer=model_dir,
    return_all_scores=True,
)

# Score a short sentence
text = "i was so sad"
predictions = emotion_pipeline(text)

# Print one "<emotion>: <score>" line per label; names and scores are paired by position
scores_for_text = predictions[0]
for emotion, score in zip(emotion_labels, scores_for_text):
    print(f"{emotion}: {score['score']:.4f}")


Device set to use cpu


admiration: 0.0948
amusement: 0.0396
anger: 0.0392
annoyance: 0.0609
approval: 0.0880
caring: 0.0289
confusion: 0.0247
curiosity: 0.0449
desire: 0.0236
disappointment: 0.0348
disapproval: 0.0476
disgust: 0.0168
embarrassment: 0.0127
excitement: 0.0295
fear: 0.0128
gratitude: 0.0448
grief: 0.0118
joy: 0.0402
love: 0.0508
nervousness: 0.0153
optimism: 0.0378
pride: 0.0078
realization: 0.0383
relief: 0.0092
remorse: 0.0149
sadness: 0.0285
surprise: 0.0251
neutral: 0.2525
