In [1]:
import pandas as pd

# Load the dataset
# The CSV is read from the current working directory with pandas' default parsing options.
df = pd.read_csv("go_emotions_dataset.csv")
# Preview the first rows; as the cell's last expression this is rendered as a table.
df.head()

Unnamed: 0,id,text,example_very_unclear,admiration,amusement,anger,annoyance,approval,caring,confusion,...,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
0,eew5j0j,That game hurt.,False,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,eemcysk,>sexuality shouldnâ€™t be a grouping category I...,True,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ed2mah1,"You do right, if you don't care then fuck 'em!",False,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,eeibobj,Man I love reddit.,False,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,eda6yn6,"[NAME] was nowhere near them, he was by the Fa...",False,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [5]:
# Emotion columns (one 0/1 column each in `df`) that the balanced subset is built from.
selected_emotions = ['joy', 'sadness', 'anger', 'fear', 'surprise', 'disgust']

samples_per_emotion = 1000  # 1000 x 6 = 6000 rows before de-duplication

# Gather one frame per emotion and concatenate once at the end.
# Growing a DataFrame with pd.concat inside the loop re-copies it on every pass,
# and starting from an empty pd.DataFrame() relies on pandas' deprecated handling
# of empty entries in concat.
emotion_samples = []
for emotion in selected_emotions:
    subset = df[df[emotion] == 1]
    if len(subset) >= samples_per_emotion:
        # Fixed seed keeps the draw reproducible across runs
        subset = subset.sample(n=samples_per_emotion, random_state=42)
    # Otherwise there are fewer rows than requested: keep every available row as-is
    emotion_samples.append(subset)

# A row tagged with more than one selected emotion can be picked several times,
# so drop exact repeats and renumber the index from 0.
balanced_df = pd.concat(emotion_samples).drop_duplicates().reset_index(drop=True)


In [7]:
# (rows, columns) of the balanced frame. Fewer than 6000 rows means duplicates were
# dropped and/or at least one emotion had under 1000 matching rows.
balanced_df.shape

(5784, 31)

In [8]:
import torch
from transformers import DistilBertTokenizer
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from transformers import DistilBertForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import f1_score, accuracy_score
import numpy as np

# 1. Label columns used as training targets (the six core emotions)
emotion_labels = ['joy', 'sadness', 'anger', 'fear', 'surprise', 'disgust']

# 2. Tokenise every text in one call; sequences are cut at 128 tokens and padded
#    so the result comes back as PyTorch tensors
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

encodings = tokenizer(
    balanced_df['text'].tolist(),
    max_length=128,
    truncation=True,
    padding=True,
    return_tensors='pt',
)

# 3. Multi-hot target matrix, one column per entry of emotion_labels, cast to float32
labels = torch.tensor(balanced_df[emotion_labels].values).to(torch.float32)

# 4. 80/20 split over row positions (seeded), then slice every tensor with the same indices
train_idx, val_idx = train_test_split(
    range(len(balanced_df)),
    test_size=0.2,
    random_state=42,
)

train_encodings = {name: tensor[train_idx] for name, tensor in encodings.items()}
val_encodings = {name: tensor[val_idx] for name, tensor in encodings.items()}

train_labels, val_labels = labels[train_idx], labels[val_idx]

# 5. Custom Dataset
class GoEmotionsDataset(Dataset):
    """Map-style dataset pairing tokenizer outputs with their label rows.

    Args:
        encodings: mapping of field name -> indexable container; each value is
            indexed with the sample position when an item is requested.
        labels: indexable container of targets; its length defines the dataset size.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of samples, taken from the label container."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return one sample as a dict: every encoding field at `idx` plus 'labels'."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        sample['labels'] = self.labels[idx]
        return sample

# Wrap each split's tensors in the dataset class defined above
train_dataset = GoEmotionsDataset(encodings=train_encodings, labels=train_labels)
val_dataset = GoEmotionsDataset(encodings=val_encodings, labels=val_labels)

# 6. Load Model
# Pretrained DistilBERT with a classification head sized to one output per emotion,
# configured for the multi-label problem type.
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    problem_type="multi_label_classification",
    num_labels=len(emotion_labels),
)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:

# 7. Training Arguments
# Evaluate and checkpoint once per epoch so the best epoch can be restored afterwards.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` - confirm against the installed version before upgrading.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    logging_dir='./logs',
    logging_steps=50,
    load_best_model_at_end=True,
    # Without an explicit metric the "best" checkpoint is chosen by validation loss,
    # which can favour an early epoch even while F1 keeps improving. Select on the
    # 'f1' value returned by compute_metrics instead (higher is better).
    metric_for_best_model='f1',
    greater_is_better=True,
)

# 8. Metrics
def compute_metrics(pred):
    """Score one evaluation pass for the multi-label classifier.

    Args:
        pred: a pair that unpacks into (logits, labels).

    Returns:
        dict with 'accuracy' and micro-averaged 'f1', both computed on the
        predictions obtained by thresholding sigmoid(logits) at 0.5.
    """
    raw_scores, gold = pred
    # Independent per-label probabilities; a label counts as predicted above 0.5
    probabilities = torch.sigmoid(torch.tensor(raw_scores))
    predicted = probabilities > 0.5
    return {
        # For multi-label indicator inputs scikit-learn's accuracy_score is
        # exact-match accuracy: every label of a sample must be right.
        'accuracy': accuracy_score(gold, predicted),
        'f1': f1_score(gold, predicted, average='micro'),
    }




In [10]:

# 9. Trainer: ties together the model, its training arguments, both splits
#    and the metric callback used at each evaluation
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
)

# 10. Fine-tune; as the cell's last expression the returned training summary is displayed
trainer.train()


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.3051,0.302584,0.539326,0.627057
2,0.2434,0.303911,0.560069,0.632501
3,0.1598,0.327328,0.573034,0.641156


TrainOutput(global_step=1737, training_loss=0.25806822691693965, metrics={'train_runtime': 1713.8884, 'train_samples_per_second': 8.099, 'train_steps_per_second': 1.013, 'total_flos': 179581164928200.0, 'train_loss': 0.25806822691693965, 'epoch': 3.0})