In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from skmultilearn.model_selection import iterative_train_test_split
from transformers import DistilBertTokenizer
import torch as nn
from torch.utils.data import Dataset, DataLoader

In [2]:
# Custom Dataset class for BERT
class EmotionDataset(nn.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = nn.tensor(labels, dtype=nn.float32)  # Multi-label = float32

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item["labels"] = self.labels[idx]
        return item

In [3]:
# Loading the dataset
df = pd.read_csv('../dataset/preprocessed_go_emotions.csv')
emotion_columns = [
    'admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring',
    'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval',
    'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief',
    'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization',
    'relief', 'remorse', 'sadness', 'surprise', 'neutral'
]

texts = df['cleaned_text'].tolist()
labels = df[emotion_columns].values.astype('float32')

In [4]:
print("Number of duplicate rows:", df.duplicated().sum())
print("df shape:", df.shape)
print("Number of unique rows:", df.drop_duplicates().shape[0])

Number of duplicate rows: 56410
df shape: (207814, 31)
Number of unique rows: 151404


In [5]:
df_unique = df.drop_duplicates()
print("df_unique shape:", df_unique.shape)

df_unique shape: (151404, 31)


In [6]:
X = df['cleaned_text'].values.reshape(-1, 1)

In [7]:
# Perform multi-label stratified split
X_train, y_train, X_test, y_test = iterative_train_test_split(X, labels, test_size=0.2)
# Further split training data for validation
val_size = 0.2  # 20% of the training set
X_train_new, y_train_new, X_val, y_val = iterative_train_test_split(X_train, y_train, test_size=val_size / (1 - 0.2))

In [8]:
# Converting everything to string
X_train = [str(x) for x in X_train]
X_val = [str(x) for x in X_val]
X_test = [str(x) for x in X_test]

In [9]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

def tokenize_texts(texts):
    return tokenizer(
        texts,
        truncation=True,
        padding='max_length',
        max_length=128,
        return_tensors='pt' # PyTorch tensors instead of TensorFlow
    )

train_encodings = tokenize_texts(X_train)
val_encodings = tokenize_texts(X_val)
test_encodings = tokenize_texts(X_test)

### Creating PyTorch Data Loader

#### PyTorch provides two data primitives:
* Dataset stores the samples and their corresponding labels,
* DataLoader wraps an iterable around the Dataset to enable easy access to the samples.

In [10]:
train_dataset = EmotionDataset(train_encodings, y_train)
val_dataset   = EmotionDataset(val_encodings, y_val)
test_dataset  = EmotionDataset(test_encodings, y_test)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader   = DataLoader(val_dataset, batch_size=16)
test_loader  = DataLoader(test_dataset, batch_size=16)


### Now finally using DistilBERT

In [11]:
from transformers import DistilBertForSequenceClassification

model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=28,  # Change this to match your number of emotions
    problem_type="multi_label_classification"
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Setting up Optimizer and Loss function

In [None]:
from torch.optim import AdamW
import torch

optimizer = AdamW(model.parameters(), lr=5e-5)

# for some reason torch.nn works and nn.BCEWithLogitsLoss() does not    
loss_fn = torch.nn.BCEWithLogitsLoss()# Binary Cross Entropy for multi-label

In [18]:
import transformers
print(nn.__version__)

2.7.1+cpu


In [21]:
from tqdm import tqdm
def train_epoch(model, loader):
    model.train()
    total_loss = 0
    for batch in tqdm(loader):
        # Move batch to device
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        # Compute loss
        loss = loss_fn(logits, labels)
        total_loss += loss.item()

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    return total_loss / len(loader)

In [22]:
def evaluate(model, loader):
    model.eval()
    total_loss = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in loader:
            input_ids = batch["input_ids"]
            attention_mask = batch["attention_mask"]
            labels = batch["labels"]

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits

            loss = loss_fn(logits, labels)
            total_loss += loss.item()

            preds = torch.sigmoid(logits).cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(labels.cpu().numpy())

    return total_loss / len(loader), all_preds, all_labels


In [None]:
for epoch in range(3):  # Adjust based on validation performance
    train_loss = train_epoch(model, train_loader)
    val_loss, val_preds, val_labels = evaluate(model, val_loader)
    print(f"Epoch {epoch+1} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")


  1%|          | 125/10387 [05:12<7:26:49,  2.61s/it]