In [1]:
from datasets import load_dataset
import pandas as pd

# Load GoEmotions dataset
dataset = load_dataset("go_emotions")

# Check structure
print(dataset["train"][0])


{'text': "My favourite food is anything I didn't have to cook myself.", 'labels': [27], 'id': 'eebbqej'}


In [2]:
# Get all emotion labels
emotion_labels = dataset['train'].features['labels'].feature.names

# Function to create binary multi-label vector
def one_hot_encode(labels, num_classes=28):
    vec = [0] * num_classes
    for l in labels:
        vec[l] = 1
    return vec

# Apply to datasets
def preprocess(example):
    example["multi_labels"] = one_hot_encode(example["labels"])
    return example

dataset = dataset.map(preprocess)


In [3]:
from transformers import AutoTokenizer

# Load BERT tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Tokenization function
def tokenize(example):
    return tokenizer(example["text"], truncation=True, padding="max_length", max_length=128)

# Apply tokenizer
tokenized_dataset = dataset.map(tokenize, batched=True)
tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'multi_labels'])


In [4]:
from torch.utils.data import DataLoader

train_loader = DataLoader(tokenized_dataset["train"], batch_size=16, shuffle=True)
valid_loader = DataLoader(tokenized_dataset["validation"], batch_size=16)


In [5]:
import torch
import torch.nn as nn
from transformers import AutoModel

class BertMultiLabelClassifier(nn.Module):
    def __init__(self, num_labels):
        super().__init__()
        self.bert = AutoModel.from_pretrained("bert-base-uncased")
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        logits = self.classifier(outputs.pooler_output)
        return self.sigmoid(logits)


In [None]:
# Required imports
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import AutoModel

# Define DistilBERT-based multi-label classifier
class BertMultiLabelClassifier(nn.Module):
    def __init__(self, num_labels):
        super(BertMultiLabelClassifier, self).__init__()
        self.bert = AutoModel.from_pretrained("distilbert-base-uncased")
        self.dropout = nn.Dropout(0.3)
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Use [CLS] token embedding as pooled output (first token)
        cls_output = outputs.last_hidden_state[:, 0, :]  # shape: (batch_size, hidden_size)
        x = self.dropout(cls_output)
        x = self.classifier(x)
        return torch.sigmoid(x)

# Setup device, model, loss function, optimizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BertMultiLabelClassifier(num_labels=28).to(device)  # Update if your label count differs
criterion = nn.BCELoss()
optimizer = optim.AdamW(model.parameters(), lr=2e-5)

# Training loop
# Modify inside train() to only process first N batches (e.g. 100)
def train(model, loader):
    model.train()
    total_loss = 0
    max_batches = 100  # just for quick testing on CPU

    for step, batch in enumerate(loader):
        if step >= max_batches:
            break

        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["multi_labels"].float().to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

        if step % 5 == 0:
            print(f"Step {step+1}/{max_batches} - Batch Loss: {loss.item():.4f}")

    return total_loss / max_batches


# Train the model for 1 epoch
for epoch in range(1):  # You can increase to 3 after testing
    loss = train(model, train_loader)
    print(f"Epoch {epoch+1}, Loss: {loss:.4f}")


Step 1/100 - Batch Loss: 0.6927
Step 6/100 - Batch Loss: 0.5979
Step 11/100 - Batch Loss: 0.4685
Step 16/100 - Batch Loss: 0.3854
Step 21/100 - Batch Loss: 0.3191
Step 26/100 - Batch Loss: 0.2784
Step 31/100 - Batch Loss: 0.2550
Step 36/100 - Batch Loss: 0.2435
Step 41/100 - Batch Loss: 0.2226
Step 46/100 - Batch Loss: 0.2155
Step 51/100 - Batch Loss: 0.2373
Step 56/100 - Batch Loss: 0.2243
Step 61/100 - Batch Loss: 0.1999
Step 66/100 - Batch Loss: 0.2043
Step 71/100 - Batch Loss: 0.2025
Step 76/100 - Batch Loss: 0.2013
Step 81/100 - Batch Loss: 0.2012
Step 86/100 - Batch Loss: 0.2022
Step 91/100 - Batch Loss: 0.1898
Step 96/100 - Batch Loss: 0.1978
Epoch 1, Loss: 0.2755


In [9]:
from sklearn.metrics import f1_score, hamming_loss
import torch

def evaluate(model, loader):
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["multi_labels"].to(device)

            outputs = model(input_ids, attention_mask)

            # Apply threshold to get binary predictions
            preds = (outputs > 0.5).int()

            all_preds.append(preds.cpu())
            all_labels.append(labels.cpu())

    # Concatenate all batches
    all_preds = torch.cat(all_preds, dim=0).numpy()
    all_labels = torch.cat(all_labels, dim=0).numpy()

    # Compute metrics
    f1 = f1_score(all_labels, all_preds, average='micro')
    hamming = hamming_loss(all_labels, all_preds)

    print(f"F1 Score: {f1:.4f}")
    print(f"Hamming Loss: {hamming:.4f}")



In [None]:
import torch
from transformers import DistilBertTokenizer, DistilBertModel
import torch.nn as nn

# Emotion labels from GoEmotions
emotion_labels = [
    "admiration", "amusement", "anger", "annoyance", "approval", "caring",
    "confusion", "curiosity", "desire", "disappointment", "disapproval",
    "disgust", "embarrassment", "excitement", "fear", "gratitude", "grief",
    "joy", "love", "nervousness", "optimism", "pride", "realization",
    "relief", "remorse", "sadness", "surprise", "neutral"
]

# Device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load tokenizer
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# Define model class
class BertMultiLabelClassifier(nn.Module):
    def __init__(self, num_labels):
        super(BertMultiLabelClassifier, self).__init__()
        self.bert = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.dropout = nn.Dropout(0.3)
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)
    
    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state[:, 0]
        pooled_output = self.dropout(pooled_output)
        return torch.sigmoid(self.classifier(pooled_output))

# Initialize model
model = BertMultiLabelClassifier(num_labels=len(emotion_labels))
model.to(device)

# Load your trained weights here if available
# model.load_state_dict(torch.load("your_model_path.pt"))

# Define prediction function
def predict(text):
    model.eval()
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)
    filtered_inputs = {
        "input_ids": inputs["input_ids"].to(device),
        "attention_mask": inputs["attention_mask"].to(device)
    }

    with torch.no_grad():
        output = model(**filtered_inputs)
        output = output.cpu().numpy()[0]

    # Top 3 highest emotion scores
    top_indices = output.argsort()[-3:][::-1]
    predicted_emotions = [emotion_labels[i] for i in top_indices]

    return predicted_emotions



# Run prediction automatically
sample_text = "I'm really happy but also kind of anxious about tomorrow."
result = predict(sample_text)
print("Predicted emotions:", result)


Predicted emotions: ['admiration', 'neutral', 'sadness']
