In [1]:
import os
import json
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizerFast, DistilBertModel
from sklearn.preprocessing import MultiLabelBinarizer
from torch.cuda.amp import GradScaler, autocast
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cuda


In [3]:
# Load the training split of the GoEmotions dataset.
# The file is tab-separated and has no header row, so the column names are supplied here.
data = pd.read_csv(
    "train.tsv",
    sep="\t",
    header=None,
    names=["text", "labels", "id"],
)

# Each row stores its emotion ids as one comma-separated string (a text can carry
# several emotions); parse that string into a list of ints.
data["labels"] = data["labels"].apply(
    lambda raw_ids: [int(label_id) for label_id in raw_ids.split(",")]
)

In [4]:
# Read the emotion names, one per line, with surrounding whitespace removed.
# A name's position in the list is what later pairs it with a class column
# (LABELS is passed as target_names to the classification report).
with open("emotions.txt", "r") as label_file:
    LABELS = [line.strip() for line in label_file]
NUM_CLASSES = len(LABELS)
print(LABELS)
print(NUM_CLASSES)

['admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise', 'neutral']
28


In [5]:
# Binarize the labels: one column per emotion id, 1 if the emotion is present and 0 if not.

mlb = MultiLabelBinarizer(classes = range(NUM_CLASSES))
y = mlb.fit_transform(data["labels"])

In [6]:
# Tokenization: load the pretrained DistilBERT tokenizer that encodes text for the model.

tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")


In [7]:
#create a custom dataset
class GoEmotionDataset(Dataset):
    """Dataset that tokenizes one text per item and pairs it with its label vector.

    Args:
        texts: Sequence of raw strings, indexed by integer position.
        labels: Sequence of per-text label vectors, indexed by the same positions.
        tokenizer: Callable used to encode a single text. It is invoked with the
            ``truncation``, ``padding``, ``max_length`` and ``return_tensors``
            keyword arguments and must return a mapping of tensors that carry a
            leading batch dimension of size 1.
        max_len: Length every encoding is padded or truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_len = 64):
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        # Store the tokenizer that was passed in; the previous code misspelled the
        # parameter and silently fell back to a module-level `tokenizer` global.
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the encoded text at ``idx`` plus its labels as a float32 tensor."""
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt"
        )
        # return_tensors="pt" yields a batch of one; drop that leading dimension so
        # the DataLoader can stack items into a batch itself.
        item = {key: val.squeeze(0) for key, val in encoding.items()}
        # float32 targets are what BCEWithLogitsLoss expects.
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.float32)
        return item
    
    

In [8]:
# Prepare the data: hold out 10% of the rows for validation.
# random_state is fixed so the same split is produced on every run.

train_texts, val_texts, train_labels, val_labels = train_test_split(
    data["text"], y, test_size=0.1, random_state=42
)

In [9]:
# The text splits come from data["text"]; .tolist() turns them into plain lists so the
# dataset's integer lookups (self.texts[idx]) are positional.
train_dataset = GoEmotionDataset(train_texts.tolist(), train_labels, tokenizer)
val_dataset = GoEmotionDataset(val_texts.tolist(), val_labels, tokenizer)

In [10]:
# Build the data loaders.
# Training batches are shuffled and produced by 4 worker processes that stay alive
# between epochs; validation uses larger batches in the main process.
train_loader = DataLoader(
    train_dataset,
    batch_size=32,
    shuffle=True,
    num_workers=4,
    persistent_workers=True,
    pin_memory=True,
)
val_loader = DataLoader(
    val_dataset,
    batch_size=64,
    pin_memory=True,
)

In [11]:
#define the model

class EmotionClassifier(nn.Module):
    """DistilBERT encoder followed by dropout and a linear classification head.

    The head produces ``NUM_CLASSES`` raw logits per input; no sigmoid is applied
    inside the model.
    """

    def __init__(self, dropout=0.3):
        super().__init__()
        # Attribute names double as state_dict key prefixes, so they stay as they were.
        self.bert = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.dropout = nn.Dropout(dropout)
        hidden_dim = self.bert.config.hidden_size
        self.fc = nn.Linear(hidden_dim, NUM_CLASSES)

    def forward(self, input_ids, attention_mask):
        """Return logits computed from the hidden state at sequence position 0."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Position 0 of the sequence axis is the [CLS] token.
        first_token = encoded.last_hidden_state[:, 0]
        return self.fc(self.dropout(first_token))

In [12]:
# Training setup

model = EmotionClassifier().to(device)

# AdamW applies weight decay, which acts as regularisation while fine-tuning.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=5e-5,
    weight_decay=0.01,
)

# Binary cross-entropy on raw logits: each emotion is scored as its own yes/no
# target, which fits the multi-hot label vectors.
criterion = nn.BCEWithLogitsLoss()

# Gradient scaler for mixed-precision training (train_model also builds its own).
scaler = GradScaler()

  scaler = GradScaler()


In [13]:
def train_model(model, dataloaders, criterion, optimizer, device, num_epochs=3, threshold=0.5):
    """Fine-tune ``model`` and report loss and accuracy for every epoch.

    Each epoch runs a 'train' phase (gradients and optimizer steps) followed by a
    'val' phase (no gradients). Accuracy is element-wise: the fraction of
    individual label entries whose thresholded sigmoid prediction equals the target.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``; returns logits.
        dataloaders: Mapping with 'train' and 'val' keys. Each value yields batches
            that are dicts holding "input_ids", "attention_mask" and "labels" tensors.
        criterion: Loss called as ``criterion(logits, labels)``.
        optimizer: Optimizer stepping the model's parameters.
        device: Device (``torch.device`` or string) the batches are moved to.
            Mixed precision is used only when this is a CUDA device.
        num_epochs: Number of train+val passes.
        threshold: Probability above which a label counts as predicted.

    Returns:
        dict: Keys "train_loss", "train_acc", "val_loss" and "val_acc", each a list
        with one float per epoch (NaN when the phase's loader yielded no batches).
    """
    # torch.cuda.amp only applies to CUDA; enabling it for any other device produces
    # warnings at best, so tie AMP to the device the caller actually asked for.
    use_amp = torch.device(device).type == "cuda"
    scaler = GradScaler(enabled=use_amp)
    history = {"train_loss": [], "train_acc": [], "val_loss": [], "val_acc": []}

    for epoch in range(num_epochs):
        print(f"\nEpoch {epoch+1}/{num_epochs}")

        for phase in ['train', 'val']:
            is_train = phase == 'train'
            model.train(is_train)  # train(False) is the same as eval()

            running_loss = 0.0
            running_corrects = 0
            total_samples = 0
            total_elements = 0

            for batch in dataloaders[phase]:
                input_ids = batch["input_ids"].to(device)
                attention_mask = batch["attention_mask"].to(device)
                labels = batch["labels"].to(device)

                if is_train:
                    # Gradients are only produced in the train phase.
                    optimizer.zero_grad()

                with torch.set_grad_enabled(is_train):
                    with autocast(enabled=use_amp):
                        outputs = model(input_ids, attention_mask)
                        loss = criterion(outputs, labels)

                    if is_train:
                        scaler.scale(loss).backward()
                        scaler.step(optimizer)
                        scaler.update()

                # Weight the batch-mean loss by batch size so a smaller final batch
                # does not skew the epoch average.
                batch_size = input_ids.size(0)
                running_loss += loss.item() * batch_size
                total_samples += batch_size

                # Multi-label accuracy: threshold the sigmoid outputs and count
                # matching entries as a Python number rather than a device tensor.
                preds = (torch.sigmoid(outputs.detach()) > threshold).float()
                running_corrects += (preds == labels).sum().item()
                total_elements += labels.numel()

            if total_samples == 0:
                # Empty loader: nothing to average, but keep the history aligned.
                epoch_loss = float("nan")
                epoch_acc = float("nan")
            else:
                epoch_loss = running_loss / total_samples
                epoch_acc = running_corrects / total_elements

            history[f"{phase}_loss"].append(epoch_loss)
            history[f"{phase}_acc"].append(epoch_acc)

            print(f"{phase.capitalize()} Loss: {epoch_loss:.4f} Acc: {epoch_acc:.4f}")

        if use_amp:
            # Release cached GPU memory between epochs.
            torch.cuda.empty_cache()

    return history
    

In [14]:
def eval_model(model, val_loader, device, threshold = 0.5):
    """Print a per-emotion classification report for ``model`` on ``val_loader``.

    Logits are passed through a sigmoid and a label counts as predicted when its
    probability exceeds ``threshold``. Rows are labelled with the names in LABELS.
    """
    model.eval()
    pred_rows = []
    label_rows = []

    with torch.no_grad():
        for batch in val_loader:
            logits = model(
                batch["input_ids"].to(device),
                batch["attention_mask"].to(device),
            )
            probs = torch.sigmoid(logits).cpu().numpy()

            # Collect one row per sample so the report sees the whole validation set.
            pred_rows.extend((probs > threshold).astype(int))
            label_rows.extend(batch["labels"].cpu().numpy())

    print(
        classification_report(label_rows, pred_rows, target_names=LABELS, zero_division=0)
    )

In [None]:
# Train with the default settings, then print the per-label validation report.
dataloaders = dict(train=train_loader, val=val_loader)
train_model(model, dataloaders, criterion, optimizer, device)
eval_model(model, val_loader, device)

  scaler = GradScaler()



Epoch 1/3


In [None]:
# Save what is needed to reuse the classifier later:
# the model weights, the tokenizer files and the ordered list of label names.

os.makedirs("saved_model", exist_ok=True)
torch.save(model.state_dict(), "saved_model/emotion.pt")
tokenizer.save_pretrained("saved_model")
with open("saved_model/label_list.json", "w") as label_file:
    label_file.write(json.dumps(LABELS))

print("✅ Model and files saved to 'saved_model/'")