# Step 1: Setup Colab Environment

In [2]:
# Install required libraries
!pip install transformers datasets scikit-learn torch --quiet


# Step 2: Upload GoEmotions Dataset (from your PC)

In [3]:
# Upload the dataset CSV from the local machine into the Colab session
# (the cell output lists each file that was saved).
# NOTE(review): `uploaded` holds whatever `files.upload()` returns — presumably a
# mapping of file name to file contents; confirm against the Colab documentation.
from google.colab import files
uploaded = files.upload()


Saving go_emotions_dataset.csv to go_emotions_dataset.csv


# Step 3: Load and Preprocess Dataset

In [4]:
import pandas as pd

# Load the uploaded GoEmotions CSV into a DataFrame.
df = pd.read_csv("go_emotions_dataset.csv")  # replace with your uploaded file name
# Preview the first rows: an id, the comment text, a flag column and one 0/1 column per emotion.
df.head()


Unnamed: 0,id,text,example_very_unclear,admiration,amusement,anger,annoyance,approval,caring,confusion,...,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
0,eew5j0j,That game hurt.,False,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,eemcysk,>sexuality shouldn’t be a grouping category I...,True,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ed2mah1,"You do right, if you don't care then fuck 'em!",False,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,eeibobj,Man I love reddit.,False,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,eda6yn6,"[NAME] was nowhere near them, he was by the Fa...",False,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


## Preprocessing: Binarize labels & handle imbalance

In [6]:
import ast

import pandas as pd

# Load the uploaded CSV file
df = pd.read_csv("go_emotions_dataset.csv")  # replace with your actual file name
print("Columns:", df.columns)

# Inspect first few rows
print(df.head())


def _parse_label_list(cell):
    """Convert a stringified list such as "['joy', 'anger']" into a Python list.

    `ast.literal_eval` only accepts Python literals, so (unlike `eval`) a crafted
    cell in the CSV cannot execute arbitrary code. Values that are already
    list-like are returned as a plain list.
    """
    return ast.literal_eval(cell) if isinstance(cell, str) else list(cell)


# Auto-detect the label layout.
# 'labels' takes precedence over 'emotions' when both columns are present.
list_label_column = next((name for name in ('labels', 'emotions') if name in df.columns), None)

if list_label_column is not None:
    # Case 1/2: one column holding a list of emotion names per row
    from sklearn.preprocessing import MultiLabelBinarizer

    df[list_label_column] = df[list_label_column].apply(_parse_label_list)
    mlb = MultiLabelBinarizer()
    label_matrix = mlb.fit_transform(df[list_label_column])
    text_data = df['text']
else:
    # Case 3: one-hot encoded labels — one 0/1 column per emotion
    text_column = 'text' if 'text' in df.columns else df.columns[0]  # assume first is text if not named
    text_data = df[text_column]

    # Keep only numeric, non-boolean columns whose values are all 0 or 1.
    # Metadata columns such as the string `id` or the boolean `example_very_unclear`
    # must not end up in the label matrix: they would inflate the number of labels
    # and make the matrix non-numeric.
    emotion_columns = [
        col for col in df.columns
        if col != text_column
        and pd.api.types.is_numeric_dtype(df[col])
        and not pd.api.types.is_bool_dtype(df[col])
        and df[col].isin([0, 1]).all()
    ]
    if not emotion_columns:
        raise ValueError("No one-hot (0/1) label columns found in the dataset")

    # float32 so each row can be turned directly into a float target tensor
    label_matrix = df[emotion_columns].to_numpy(dtype='float32')
    mlb = None  # we won't inverse transform if labels are already one-hot

# Now continue with tokenization
# Load the tokenizer of the 'bert-base-uncased' checkpoint. On first use its files
# (vocab.txt, tokenizer.json, ...) are downloaded from the Hugging Face Hub, as the
# progress bars in the cell output show.
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Define Dataset
from torch.utils.data import Dataset
import torch

class GoEmotionsDataset(Dataset):
    """Torch dataset pairing raw texts with multi-hot emotion label vectors.

    Args:
        texts: Sequence of strings (list, numpy array or pandas Series).
        labels: Per-sample label vectors, aligned positionally with `texts`
            (2-D numpy array, list of lists, or pandas DataFrame).
        tokenizer: Object exposing `encode_plus(text, max_length=..., truncation=...,
            padding=..., return_tensors=...)`.
        max_len: Length every sample is padded / truncated to.

    Raises:
        ValueError: If `texts` and `labels` contain a different number of samples.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        # Store plain positional containers. A pandas object coming out of a
        # shuffled split keeps its original index, so `series[idx]` would do a
        # label lookup and raise KeyError (or silently return the wrong row).
        self.texts = texts.tolist() if hasattr(texts, 'tolist') else list(texts)
        self.labels = labels.to_numpy() if hasattr(labels, 'to_numpy') else labels
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels differ in length: {len(self.texts)} != {len(self.labels)}"
            )
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize sample `idx` and return its tensors.

        Returns:
            dict with 1-D `input_ids` and `attention_mask` tensors (the batch
            dimension added by `return_tensors='pt'` is flattened away) and a
            float32 `labels` tensor.
        """
        text = self.texts[idx]
        label = self.labels[idx]
        # Go through a plain Python list so that integer, boolean and float label
        # rows are all accepted and always converted to float32.
        if hasattr(label, 'tolist'):
            label = label.tolist()

        encoding = self.tokenizer.encode_plus(
            text,
            max_length=self.max_len,
            truncation=True,
            padding='max_length',
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.float32)
        }

# Train-test split
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    text_data,
    label_matrix,
    test_size=0.2,
    random_state=42,
)

# Wrap each split in a dataset. Texts are handed over as plain lists so that the
# dataset indexes them by position.
train_dataset = GoEmotionsDataset(texts=X_train.tolist(), labels=y_train, tokenizer=tokenizer)
test_dataset = GoEmotionsDataset(texts=X_test.tolist(), labels=y_test, tokenizer=tokenizer)


Columns: Index(['id', 'text', 'example_very_unclear', 'admiration', 'amusement',
       'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity',
       'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment',
       'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love',
       'nervousness', 'optimism', 'pride', 'realization', 'relief', 'remorse',
       'sadness', 'surprise', 'neutral'],
      dtype='object')
        id                                               text  \
0  eew5j0j                                    That game hurt.   
1  eemcysk   >sexuality shouldn’t be a grouping category I...   
2  ed2mah1     You do right, if you don't care then fuck 'em!   
3  eeibobj                                 Man I love reddit.   
4  eda6yn6  [NAME] was nowhere near them, he was by the Fa...   

   example_very_unclear  admiration  amusement  anger  annoyance  approval  \
0                 False           0          0      0          0         0   
1       

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

# Step 4: Fine-tune BERT for Multi-Label Classification

In [7]:
from transformers import BertModel, BertTokenizer, BertConfig
from transformers import Trainer, TrainingArguments, BertForSequenceClassification
import torch.nn as nn

# Custom BERT model for multi-label classification
from transformers import BertPreTrainedModel

class BertForMultiLabel(BertPreTrainedModel):
    """BERT encoder with a dropout + linear head for multi-label classification.

    Args:
        config: BERT configuration.
        num_labels: Number of independent labels. When omitted, `config.num_labels`
            is used, which lets the class be built through
            `from_pretrained(..., num_labels=N)` as well.

    Note:
        For backward compatibility the `'logits'` entry returned by `forward`
        holds sigmoid probabilities, not raw scores — do not apply another
        sigmoid to it.
    """

    def __init__(self, config, num_labels=None):
        super().__init__(config)
        if num_labels is None:
            num_labels = config.num_labels
        else:
            # Keep the config in sync so a saved checkpoint reloads with the same head size.
            config.num_labels = num_labels
        self.num_labels = num_labels

        self.bert = BertModel(config)
        self.dropout = nn.Dropout(0.3)
        self.classifier = nn.Linear(config.hidden_size, num_labels)
        self.sigmoid = nn.Sigmoid()

        # Standard Hugging Face hook: applies the model's weight initialisation
        # and final setup to the modules created above.
        self.post_init()

    def forward(self, input_ids, attention_mask=None, labels=None):
        """Run the encoder and the classification head.

        Args:
            input_ids: Token id tensor passed to the encoder.
            attention_mask: Optional padding mask passed to the encoder.
            labels: Optional multi-hot targets; when given, the loss is computed.

        Returns:
            dict with `'loss'` (None when `labels` is None) and `'logits'`
            (per-label sigmoid probabilities).
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = self.dropout(outputs.pooler_output)
        raw_logits = self.classifier(pooled_output)
        probs = self.sigmoid(raw_logits)

        loss = None
        if labels is not None:
            # Same binary cross-entropy as BCELoss on `probs`, but computed from the
            # raw scores, which is numerically stable for saturated outputs.
            loss_fn = nn.BCEWithLogitsLoss()
            loss = loss_fn(raw_logits, labels.to(raw_logits.dtype))

        return {'loss': loss, 'logits': probs}

# Model init
from transformers import BertConfig, BertModel

config = BertConfig.from_pretrained("bert-base-uncased")
model = BertForMultiLabel(config=config, num_labels=label_matrix.shape[1])

# Constructing the model from a config alone leaves every weight randomly
# initialised. Swap in the pretrained encoder so that training actually
# fine-tunes BERT; only the classification head starts from scratch.
model.bert = BertModel.from_pretrained("bert-base-uncased", config=config)


# Step 5: Train the Model

In [16]:
import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer
import numpy as np
from sklearn.metrics import f1_score, hamming_loss
from transformers import Trainer, TrainingArguments

# The 28 one-hot emotion columns expected in `df`, in the order used for the label vector.
emotion_columns = [
    'admiration', 'amusement', 'anger', 'annoyance',
    'approval', 'caring', 'confusion', 'curiosity',
    'desire', 'disappointment', 'disapproval', 'disgust',
    'embarrassment', 'excitement', 'fear', 'gratitude',
    'grief', 'joy', 'love', 'nervousness',
    'optimism', 'pride', 'realization', 'relief',
    'remorse', 'sadness', 'surprise', 'neutral',
]

# Independent copy of just the label columns, so later changes never touch `df`.
df_labels = df.loc[:, emotion_columns].copy()

# Dataset Class
class GoEmotionsDataset(Dataset):
    """Dataset of texts with their emotion labels stored as DataFrame rows.

    `texts` is indexed by position and `labels` is a DataFrame of binary emotion
    columns whose rows are read positionally with `.iloc`.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.texts = texts
        self.labels = labels  # DataFrame with one binary column per emotion

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized text and the float32 label vector of sample `idx`."""
        sample_text = self.texts[idx]
        # Row `idx` of the label frame as a float32 vector
        label_vector = np.asarray(self.labels.iloc[idx].values, dtype=np.float32)

        encoded = self.tokenizer.encode_plus(
            sample_text,
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_tensors='pt',
        )

        # Drop the batch dimension added by return_tensors='pt'
        item = {key: encoded[key].flatten() for key in ('input_ids', 'attention_mask')}
        item['labels'] = torch.tensor(label_vector, dtype=torch.float32)
        return item

# Tokenizer matching the BERT checkpoint that is fine-tuned below
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Training subset: the first 1000 rows of the frame (kept small as an example)
train_labels = df_labels.iloc[:1000]
train_texts = df['text'].iloc[:1000].tolist()
train_dataset = GoEmotionsDataset(texts=train_texts, labels=train_labels, tokenizer=tokenizer)

# Evaluation subset: the next 200 rows (adjust the ranges to your data)
test_labels = df_labels.iloc[1000:1200]
test_texts = df['text'].iloc[1000:1200].tolist()
test_dataset = GoEmotionsDataset(texts=test_texts, labels=test_labels, tokenizer=tokenizer)

# Training configuration: 3 epochs, evaluation after every epoch, loss logged
# every 10 steps, no external experiment tracker.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    eval_strategy="epoch",
    logging_steps=10,
    logging_dir='./logs',
    report_to="none",
)

# Compute metrics
def compute_metrics(pred, threshold=0.5):
    """Compute micro-F1 and Hamming loss for multi-label predictions.

    Args:
        pred: `(logits, labels)` pair as passed by `Trainer`. `logits` are the raw
            (pre-sigmoid) scores produced by `BertForSequenceClassification`;
            `labels` is the multi-hot target matrix.
        threshold: Probability above which a label counts as predicted.
            Must lie strictly between 0 and 1.

    Returns:
        dict with keys `"f1"` (micro-averaged) and `"hamming_loss"`.

    Raises:
        ValueError: If `threshold` is not strictly between 0 and 1.
    """
    if not 0.0 < threshold < 1.0:
        raise ValueError("threshold must be strictly between 0 and 1")

    logits, labels = pred
    # The model emits raw scores, not probabilities. sigmoid(x) > t is equivalent
    # to x > log(t / (1 - t)), so compare in logit space: no overflow-prone exp()
    # and, for t = 0.5, simply x > 0. Comparing raw scores against 0.5 directly
    # would correspond to a much stricter ~0.62 probability cut-off.
    logit_cutoff = np.log(threshold) - np.log1p(-threshold)
    preds = (np.asarray(logits) > logit_cutoff).astype(int)
    f1 = f1_score(labels, preds, average='micro')
    hamming = hamming_loss(labels, preds)
    return {"f1": f1, "hamming_loss": hamming}

# Initialize the model (e.g., BERT)
from transformers import BertForSequenceClassification

# State the multi-label objective explicitly instead of relying on it being
# inferred from the dtype of the labels: each emotion is an independent yes/no
# decision, so the head must be trained with a per-label binary cross-entropy.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(emotion_columns),
    problem_type="multi_label_classification",
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Hamming Loss,Runtime,Samples Per Second,Steps Per Second
1,0.3121,0.280962,0.0,0.041429,77.6033,2.577,0.09
2,0.2268,0.21528,0.0,0.041429,76.4269,2.617,0.092
3,0.2046,0.202517,0.0,0.041429,81.0067,2.469,0.086


TrainOutput(global_step=189, training_loss=0.299350863411313, metrics={'train_runtime': 4397.5568, 'train_samples_per_second': 0.682, 'train_steps_per_second': 0.043, 'total_flos': 197379357696000.0, 'train_loss': 0.299350863411313, 'epoch': 3.0})

# Step 6: Evaluate and Test on Real-world Data

In [28]:
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
import torch

# Assuming the 'emotion_columns' list is the set of emotions you're using for training
emotion_columns = [
    'admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion',
    'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment',
    'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism',
    'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise', 'neutral'
]

# The binarizer is only used to map multi-hot vectors back to emotion names.
# Its class order is fixed through `classes`, so it is fitted on the class list
# itself rather than on the one-hot label DataFrame.
mlb = MultiLabelBinarizer(classes=emotion_columns)  # Specify classes explicitly
mlb.fit([emotion_columns])

# Example real-world text
test_texts = [
    "The meeting went as expected, with no surprises"
]

# Inference mode: disable dropout so that predictions are deterministic
model.eval()

# Prepare inputs on the same device as the model (CPU or GPU)
device = next(model.parameters()).device
inputs = tokenizer(test_texts, padding=True, truncation=True, return_tensors="pt", max_length=128)
inputs = inputs.to(device)

# No gradients are needed for prediction
with torch.no_grad():
    outputs = model(**inputs)

# Convert logits to probabilities using sigmoid function
# (moved back to the CPU before converting to numpy)
probs = torch.sigmoid(outputs['logits']).cpu().numpy()

# Debugging: Print the actual probabilities for each text
for i, prob in enumerate(probs):
    print(f"Text: {test_texts[i]}")
    print(f"Probabilities: {prob}")  # Print all probabilities for this text
    print(f"Max Probability: {np.max(prob)}")  # Check the max probability to ensure any exceed the threshold

# Apply threshold and decode labels.
# 0.2 is deliberately lower than 0.5: as the printed probabilities show, this
# briefly trained model rarely produces confident scores.
predictions = (probs > 0.2).astype(int)

# Decode labels
for i, pred in enumerate(predictions):
    print(f"Text: {test_texts[i]}")
    print(f"Predicted Emotions: {mlb.inverse_transform(np.array([pred]))[0]}")


Text: The meeting went as expected, with no surprises
Probabilities: [0.12476579 0.13609469 0.13355736 0.14082526 0.16710907 0.10413802
 0.09399792 0.11858135 0.08988848 0.12177805 0.14194374 0.1075998
 0.10216357 0.12560995 0.10836727 0.14987296 0.11805334 0.09748437
 0.09677923 0.07754579 0.11783593 0.07965167 0.12601487 0.09486361
 0.09834033 0.13876821 0.10561875 0.26486456]
Max Probability: 0.26486456394195557
Text: The meeting went as expected, with no surprises
Predicted Emotions: ('neutral',)
