In [None]:
# Install dependencies (Run this only once in Colab)
!pip install transformers scikit-learn -q

import pandas as pd
import torch
import time
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from torch.optim import AdamW
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score, classification_report, accuracy_score
import numpy as np
from tqdm import tqdm
import os

# Load the train and validation splits (tab-separated files on Google Drive).
import ast  # local import: only needed to parse the stringified label lists

train_df = pd.read_csv("/content/drive/MyDrive/emotion_train.tsv", sep='\t')
val_df = pd.read_csv("/content/drive/MyDrive/emotion_val.tsv", sep='\t')

# Convert label strings to Python lists. ast.literal_eval only accepts Python
# literals, so a malformed cell raises an error instead of executing arbitrary
# code the way the built-in eval() would.
train_df['labels'] = train_df['labels'].apply(ast.literal_eval)
val_df['labels'] = val_df['labels'].apply(ast.literal_eval)

# Binarize labels: the class vocabulary (and therefore the column order of the
# multi-hot matrices) is learned from the training split only, then reused
# unchanged for the validation split.
mlb = MultiLabelBinarizer()
mlb.fit(train_df['labels'])
y_train = mlb.transform(train_df['labels'])
y_val = mlb.transform(val_df['labels'])

# Initialize tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Dataset class
class EmotionDataset(Dataset):
    """Map-style dataset pairing raw sentences with their multi-hot label rows.

    Sentences are tokenized lazily, one per ``__getitem__`` call, with the
    module-level ``tokenizer``.
    """

    def __init__(self, texts, labels):
        self.texts, self.labels = texts, labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        sentence = self.texts[idx]
        target = torch.tensor(self.labels[idx], dtype=torch.float)
        # Every sentence is padded/truncated to a fixed length of 64 tokens.
        encoded = tokenizer(
            sentence,
            padding='max_length',
            truncation=True,
            max_length=64,
            return_tensors='pt',
        )
        # The tokenizer is asked for 'pt' tensors; squeeze() drops the
        # size-1 dimensions so each item is a flat token vector.
        item = {
            key: encoded[key].squeeze()
            for key in ('input_ids', 'attention_mask')
        }
        item['labels'] = target
        return item

# Create datasets and loaders
train_dataset = EmotionDataset(train_df['sentence'].tolist(), y_train)
val_dataset = EmotionDataset(val_df['sentence'].tolist(), y_val)

# Only the training data is shuffled; validation order stays fixed.
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)

# Resume from checkpoint if available
checkpoint_dir = "emotion_model_epoch3"  # change based on what you want to resume
if os.path.exists(checkpoint_dir):
    print(f"Resuming from checkpoint: {checkpoint_dir}")
    model = DistilBertForSequenceClassification.from_pretrained(checkpoint_dir)
    tokenizer = DistilBertTokenizer.from_pretrained(checkpoint_dir)
else:
    # Fresh start from the base model. The label names and the multi-label
    # problem type are written into the model config so that every saved
    # checkpoint records which output index belongs to which emotion, instead
    # of relying on the label order being reconstructed from a data file later.
    class_names = [str(name) for name in mlb.classes_]
    model = DistilBertForSequenceClassification.from_pretrained(
        'distilbert-base-uncased',
        num_labels=len(class_names),
        problem_type="multi_label_classification",
        id2label=dict(enumerate(class_names)),
        label2id={name: index for index, name in enumerate(class_names)},
    )

# Optimizer
# NOTE: only model/tokenizer files are saved per epoch below, so a resumed run
# starts with a freshly initialised AdamW state.
optimizer = AdamW(model.parameters(), lr=2e-5)

# Training loop: three epochs of fine-tuning, each followed by a full
# validation pass and a checkpoint save.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

for epoch in range(3):
    print(f"\n--- Epoch {epoch+1} ---")
    start_time = time.time()
    total_loss = 0
    model.train()
    for batch in tqdm(train_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Passing labels makes the model return the loss alongside the logits.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    # Mean of the per-batch losses.
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1} - Training loss: {avg_loss:.4f}")

    # Validation
    model.eval()
    preds, truths = [], []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].cpu().numpy()

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits.cpu().numpy()
            # logit > 0 is the same decision as sigmoid(logit) > 0.5.
            preds.extend((logits > 0).astype(int))
            truths.extend(labels)

    # Build the indicator matrices once and reuse them for every metric.
    y_true = np.array(truths)
    y_pred = np.array(preds)

    # zero_division=0 keeps the scores identical to the default behaviour
    # (classes with no predicted samples score 0) but without emitting an
    # UndefinedMetricWarning for each of them.
    f1 = f1_score(y_true, y_pred, average='macro', zero_division=0)
    # For multi-label indicator input this is subset accuracy: a sample counts
    # as correct only if every label matches.
    acc = accuracy_score(y_true, y_pred)
    print(f"Epoch {epoch+1} - Validation Macro F1: {f1:.4f}")
    print(f"Epoch {epoch+1} - Validation Accuracy: {acc:.4f}")
    print("Classification Report:")
    print(classification_report(y_true, y_pred, target_names=mlb.classes_, zero_division=0))

    # Save model after each epoch
    save_path = f"emotion_model_epoch{epoch+1}"
    model.save_pretrained(save_path)
    tokenizer.save_pretrained(save_path)

    # Timing info (covers training, validation and saving for this epoch)
    elapsed = time.time() - start_time
    print(f"Epoch {epoch+1} completed in {elapsed // 60:.0f} min {elapsed % 60:.0f} sec")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



--- Epoch 1 ---


100%|██████████| 4363/4363 [04:12<00:00, 17.30it/s]


Epoch 1 - Training loss: 0.0961
Epoch 1 - Validation Macro F1: 0.3559
Epoch 1 - Validation Accuracy: 0.4178
Classification Report:
                precision    recall  f1-score   support

    admiration       0.59      0.69      0.63       274
     amusement       0.82      0.80      0.81       256
         anger       0.59      0.45      0.51       258
     annoyance       0.52      0.05      0.09       211
      approval       0.68      0.20      0.31       254
        caring       1.00      0.02      0.04        96
     confusion       0.73      0.08      0.14       101
     curiosity       0.65      0.65      0.65       244
        desire       0.73      0.41      0.52        54
disappointment       0.00      0.00      0.00        81
   disapproval       0.51      0.34      0.41       116
       disgust       0.64      0.29      0.40       129
 embarrassment       0.00      0.00      0.00        26
    excitement       0.58      0.13      0.21        55
          fear       0.81   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 1 completed in 4 min 26 sec

--- Epoch 2 ---


100%|██████████| 4363/4363 [04:14<00:00, 17.17it/s]


Epoch 2 - Training loss: 0.0631
Epoch 2 - Validation Macro F1: 0.4411
Epoch 2 - Validation Accuracy: 0.4747
Classification Report:
                precision    recall  f1-score   support

    admiration       0.70      0.47      0.56       274
     amusement       0.79      0.87      0.83       256
         anger       0.55      0.42      0.48       258
     annoyance       0.59      0.09      0.16       211
      approval       0.62      0.22      0.33       254
        caring       0.56      0.36      0.44        96
     confusion       0.61      0.31      0.41       101
     curiosity       0.67      0.65      0.66       244
        desire       0.56      0.44      0.49        54
disappointment       0.70      0.09      0.15        81
   disapproval       0.51      0.42      0.46       116
       disgust       0.62      0.44      0.52       129
 embarrassment       0.69      0.35      0.46        26
    excitement       0.48      0.29      0.36        55
          fear       0.74   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 2 completed in 4 min 25 sec

--- Epoch 3 ---


100%|██████████| 4363/4363 [04:14<00:00, 17.14it/s]


Epoch 3 - Training loss: 0.0511
Epoch 3 - Validation Macro F1: 0.4484
Epoch 3 - Validation Accuracy: 0.4948
Classification Report:
                precision    recall  f1-score   support

    admiration       0.64      0.62      0.63       274
     amusement       0.83      0.79      0.81       256
         anger       0.57      0.51      0.54       258
     annoyance       0.52      0.06      0.10       211
      approval       0.50      0.30      0.38       254
        caring       0.43      0.42      0.43        96
     confusion       0.62      0.28      0.38       101
     curiosity       0.63      0.71      0.67       244
        desire       0.63      0.44      0.52        54
disappointment       0.38      0.17      0.24        81
   disapproval       0.53      0.29      0.38       116
       disgust       0.73      0.42      0.53       129
 embarrassment       0.67      0.38      0.49        26
    excitement       0.55      0.20      0.29        55
          fear       0.73   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 3 completed in 4 min 25 sec


In [None]:
# Install dependencies (Run this only once in Colab)
!pip install transformers scikit-learn -q

import pandas as pd
import torch
import time
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from torch.optim import AdamW
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score, classification_report, accuracy_score
import numpy as np
from tqdm import tqdm
import os

# Load the train and validation splits (tab-separated files on Google Drive).
import ast  # local import: only needed to parse the stringified label lists

train_df = pd.read_csv("/content/drive/MyDrive/emotion_train.tsv", sep='\t')
val_df = pd.read_csv("/content/drive/MyDrive/emotion_val.tsv", sep='\t')

# Convert label strings to Python lists. ast.literal_eval only accepts Python
# literals, so a malformed cell raises an error instead of executing arbitrary
# code the way the built-in eval() would.
train_df['labels'] = train_df['labels'].apply(ast.literal_eval)
val_df['labels'] = val_df['labels'].apply(ast.literal_eval)

# Binarize labels: the class vocabulary (and therefore the column order of the
# multi-hot matrices) is learned from the training split only, then reused
# unchanged for the validation split.
mlb = MultiLabelBinarizer()
mlb.fit(train_df['labels'])
y_train = mlb.transform(train_df['labels'])
y_val = mlb.transform(val_df['labels'])

# Initialize tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Dataset class
class EmotionDataset(Dataset):
    """Sentences plus multi-hot label rows, tokenized on access.

    Each ``__getitem__`` call runs the module-level ``tokenizer`` on a single
    sentence.
    """

    def __init__(self, texts, labels):
        self.texts, self.labels = texts, labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        sentence = self.texts[idx]
        target = torch.tensor(self.labels[idx], dtype=torch.float)
        # Fixed-length encoding: pad or truncate to 64 tokens.
        tokenizer_kwargs = {
            'padding': 'max_length',
            'truncation': True,
            'max_length': 64,
            'return_tensors': 'pt',
        }
        encoded = tokenizer(sentence, **tokenizer_kwargs)
        # squeeze() drops the size-1 dimensions of the returned tensors.
        ids = encoded['input_ids'].squeeze()
        mask = encoded['attention_mask'].squeeze()
        return {'input_ids': ids, 'attention_mask': mask, 'labels': target}

# Create datasets and loaders
train_dataset = EmotionDataset(train_df['sentence'].tolist(), y_train)
val_dataset = EmotionDataset(val_df['sentence'].tolist(), y_val)

# Only the training data is shuffled; validation order stays fixed.
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)

# Resume from checkpoint if available
checkpoint_dir = "emotion_model_epoch3"  # change based on what you want to resume
if os.path.exists(checkpoint_dir):
    print(f"Resuming from checkpoint: {checkpoint_dir}")
    model = DistilBertForSequenceClassification.from_pretrained(checkpoint_dir)
    tokenizer = DistilBertTokenizer.from_pretrained(checkpoint_dir)
else:
    # No checkpoint found: fall back to the base model. The label names and
    # the multi-label problem type are written into the model config so that
    # every saved checkpoint records which output index belongs to which
    # emotion.
    class_names = [str(name) for name in mlb.classes_]
    model = DistilBertForSequenceClassification.from_pretrained(
        'distilbert-base-uncased',
        num_labels=len(class_names),
        problem_type="multi_label_classification",
        id2label=dict(enumerate(class_names)),
        label2id={name: index for index, name in enumerate(class_names)},
    )

# Optimizer
# NOTE: only model/tokenizer files are restored from the checkpoint, so the
# AdamW state always starts fresh here, even when resuming.
optimizer = AdamW(model.parameters(), lr=2e-5)

# Training loop: continues with epochs 4 and 5 (range(3, 5) is zero-based),
# each followed by a full validation pass and a checkpoint save.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

for epoch in range(3, 5):
    print(f"\n--- Epoch {epoch+1} ---")
    start_time = time.time()
    total_loss = 0
    model.train()
    for batch in tqdm(train_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Passing labels makes the model return the loss alongside the logits.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    # Mean of the per-batch losses.
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1} - Training loss: {avg_loss:.4f}")

    # Validation
    model.eval()
    # logits_list keeps the raw validation logits of the latest epoch; it is
    # not read inside this loop and is only left behind as a notebook global.
    preds, truths, logits_list = [], [], []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].cpu().numpy()

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits.cpu().numpy()
            # logit > 0 is the same decision as sigmoid(logit) > 0.5.
            preds.extend((logits > 0).astype(int))
            truths.extend(labels)
            logits_list.extend(logits)

    # Build the indicator matrices once and reuse them for every metric.
    y_true = np.array(truths)
    y_pred = np.array(preds)

    # zero_division=0 keeps the scores identical to the default behaviour
    # (classes with no predicted samples score 0) but without emitting an
    # UndefinedMetricWarning for each of them.
    f1 = f1_score(y_true, y_pred, average='macro', zero_division=0)
    # For multi-label indicator input this is subset accuracy: a sample counts
    # as correct only if every label matches.
    acc = accuracy_score(y_true, y_pred)
    print(f"Epoch {epoch+1} - Validation Macro F1: {f1:.4f}")
    print(f"Epoch {epoch+1} - Validation Accuracy: {acc:.4f}")
    print("Classification Report:")
    print(classification_report(y_true, y_pred, target_names=mlb.classes_, zero_division=0))

    # Save model after each epoch
    save_path = f"emotion_model_epoch{epoch+1}"
    model.save_pretrained(save_path)
    tokenizer.save_pretrained(save_path)

    # Timing info (covers training, validation and saving for this epoch)
    elapsed = time.time() - start_time
    print(f"Epoch {epoch+1} completed in {elapsed // 60:.0f} min {elapsed % 60:.0f} sec")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Resuming from checkpoint: emotion_model_epoch3

--- Epoch 4 ---


100%|██████████| 4363/4363 [04:12<00:00, 17.26it/s]


Epoch 4 - Training loss: 0.0390
Epoch 4 - Validation Macro F1: 0.4604
Epoch 4 - Validation Accuracy: 0.5132
Classification Report:
                precision    recall  f1-score   support

    admiration       0.65      0.59      0.62       274
     amusement       0.82      0.83      0.82       256
         anger       0.58      0.48      0.52       258
     annoyance       0.45      0.26      0.33       211
      approval       0.51      0.27      0.35       254
        caring       0.55      0.36      0.44        96
     confusion       0.50      0.31      0.38       101
     curiosity       0.60      0.70      0.65       244
        desire       0.65      0.44      0.53        54
disappointment       0.45      0.16      0.24        81
   disapproval       0.44      0.44      0.44       116
       disgust       0.71      0.42      0.53       129
 embarrassment       0.64      0.27      0.38        26
    excitement       0.29      0.38      0.33        55
          fear       0.56   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 4 completed in 4 min 26 sec

--- Epoch 5 ---


100%|██████████| 4363/4363 [04:14<00:00, 17.14it/s]


Epoch 5 - Training loss: 0.0279
Epoch 5 - Validation Macro F1: 0.4669
Epoch 5 - Validation Accuracy: 0.5100
Classification Report:
                precision    recall  f1-score   support

    admiration       0.63      0.58      0.60       274
     amusement       0.77      0.86      0.82       256
         anger       0.59      0.42      0.49       258
     annoyance       0.35      0.32      0.33       211
      approval       0.42      0.35      0.38       254
        caring       0.48      0.44      0.46        96
     confusion       0.50      0.28      0.36       101
     curiosity       0.61      0.66      0.63       244
        desire       0.49      0.48      0.49        54
disappointment       0.25      0.23      0.24        81
   disapproval       0.40      0.47      0.43       116
       disgust       0.49      0.47      0.48       129
 embarrassment       0.67      0.54      0.60        26
    excitement       0.48      0.25      0.33        55
          fear       0.68   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 5 completed in 4 min 25 sec


In [None]:
# Write the in-memory model and tokenizer to one fixed output directory.
final_dir = "emotion_model_final"
for artifact in (model, tokenizer):
    # Both objects expose save_pretrained(); the model is written first.
    artifact.save_pretrained(final_dir)
print("✅ Final emotion model saved to 'emotion_model_final/'")


✅ Final emotion model saved to 'emotion_model_final/'


In [None]:
import shutil
from google.colab import files

# Checkpoint directory to export; change it to the folder you want
# (e.g., emotion_model_epoch4 or 5).
model_folder = "emotion_model_epoch5"

# Pack the directory contents into "<model_folder>.zip".
shutil.make_archive(base_name=model_folder, format='zip', root_dir=model_folder)

# Hand the archive to the browser for download.
zip_name = model_folder + ".zip"
files.download(zip_name)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Model Testing

In [None]:
# Install dependencies (if not already done)
!pip install transformers scikit-learn -q

import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score, accuracy_score, classification_report
import numpy as np
from tqdm import tqdm

import ast  # local import: only needed to parse the stringified label lists

# Load test set
test_df = pd.read_csv("/content/drive/MyDrive/emotion_test.tsv", sep='\t')
# ast.literal_eval only accepts Python literals, unlike eval().
test_df['labels'] = test_df['labels'].apply(ast.literal_eval)

# Binarize labels.
# The column order must be the one the model was trained with, so the
# binarizer is fitted on the *training* labels (as the training cell does).
# Fitting on the test labels would silently shift or drop columns whenever the
# test split does not contain exactly the same set of emotions.
train_labels = pd.read_csv(
    "/content/drive/MyDrive/emotion_train.tsv", sep='\t', usecols=['labels']
)['labels'].apply(ast.literal_eval)
mlb = MultiLabelBinarizer()
mlb.fit(train_labels)
y_test = mlb.transform(test_df['labels'])

# Load tokenizer and model
model_path = "/content/emotion_model_epoch5"
tokenizer = DistilBertTokenizer.from_pretrained(model_path)
model = DistilBertForSequenceClassification.from_pretrained(model_path)
model.eval()

# Fail early if the label matrix and the model head disagree in width;
# otherwise every metric below would compare unrelated columns.
if model.config.num_labels != len(mlb.classes_):
    raise ValueError(
        f"Model has {model.config.num_labels} outputs but the binarizer "
        f"produced {len(mlb.classes_)} classes"
    )

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Dataset class
class EmotionDataset(Dataset):
    """Test-time dataset: one sentence and one multi-hot label row per index.

    Tokenization happens inside ``__getitem__`` via the module-level
    ``tokenizer``.
    """

    def __init__(self, texts, labels):
        self.texts, self.labels = texts, labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        sentence = self.texts[idx]
        target = torch.tensor(self.labels[idx], dtype=torch.float)
        # Pad or truncate to a fixed length of 64 tokens.
        encoded = tokenizer(
            sentence,
            padding='max_length',
            truncation=True,
            max_length=64,
            return_tensors='pt',
        )
        # squeeze() drops the size-1 dimensions of the returned tensors.
        item = {
            key: encoded[key].squeeze()
            for key in ('input_ids', 'attention_mask')
        }
        item['labels'] = target
        return item

# Prepare DataLoader (no shuffling, so predictions stay in file order)
test_dataset = EmotionDataset(test_df['sentence'].tolist(), y_test)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

# Evaluate
preds, truths = [], []
with torch.no_grad():
    for batch in tqdm(test_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].cpu().numpy()

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits.cpu().numpy()

        # logit > 0 is the same decision as sigmoid(logit) > 0.5.
        preds.extend((logits > 0).astype(int))
        truths.extend(labels)

# Results
# Build the indicator matrices once and reuse them for every metric.
y_true = np.array(truths)
y_pred = np.array(preds)

# zero_division=0 gives the same scores as the default (undefined metrics
# count as 0) without the UndefinedMetricWarning noise.
macro_f1 = f1_score(y_true, y_pred, average='macro', zero_division=0)
# Subset accuracy: a sample is correct only if all of its labels match.
accuracy = accuracy_score(y_true, y_pred)
print(f"Test Macro F1: {macro_f1:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")
print("Classification Report:")
print(classification_report(y_true, y_pred, target_names=mlb.classes_, zero_division=0))


100%|██████████| 546/546 [00:11<00:00, 46.87it/s]

Test Macro F1: 0.4762
Test Accuracy: 0.5160
Classification Report:
                precision    recall  f1-score   support

    admiration       0.62      0.52      0.56       270
     amusement       0.78      0.83      0.80       238
         anger       0.61      0.47      0.53       260
     annoyance       0.32      0.31      0.31       221
      approval       0.44      0.42      0.43       231
        caring       0.37      0.37      0.37        67
     confusion       0.52      0.28      0.36       100
     curiosity       0.61      0.74      0.67       202
        desire       0.49      0.47      0.48        49
disappointment       0.28      0.15      0.20        99
   disapproval       0.38      0.32      0.35       153
       disgust       0.47      0.49      0.48       131
 embarrassment       0.62      0.52      0.57        29
    excitement       0.47      0.26      0.34        65
          fear       0.70      0.67      0.68       138
     gratitude       0.83      0.81 


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Setting per-class thresholds

In [None]:
# Install dependencies
!pip install transformers scikit-learn -q

import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score
import numpy as np
from tqdm import tqdm
import json

import ast  # local import: only needed to parse the stringified label lists

# Load validation data
val_df = pd.read_csv("/content/drive/MyDrive/emotion_val.tsv", sep='\t')
# ast.literal_eval only accepts Python literals, unlike eval().
val_df['labels'] = val_df['labels'].apply(ast.literal_eval)

# Binarize labels.
# The column order must be the one the model was trained with, so the
# binarizer is fitted on the *training* labels (as the training cell does).
# Fitting on the validation labels would silently shift or drop columns
# whenever an emotion is absent from the validation split.
train_labels = pd.read_csv(
    "/content/drive/MyDrive/emotion_train.tsv", sep='\t', usecols=['labels']
)['labels'].apply(ast.literal_eval)
mlb = MultiLabelBinarizer()
mlb.fit(train_labels)
y_val = mlb.transform(val_df['labels'])

# Load tokenizer and model
model_path = "/content/emotion_model_epoch5"
tokenizer = DistilBertTokenizer.from_pretrained(model_path)
model = DistilBertForSequenceClassification.from_pretrained(model_path)
model.eval()

# Fail early if the label matrix and the model head disagree in width;
# otherwise the per-class thresholds would be tuned on unrelated columns.
if model.config.num_labels != len(mlb.classes_):
    raise ValueError(
        f"Model has {model.config.num_labels} outputs but the binarizer "
        f"produced {len(mlb.classes_)} classes"
    )

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Dataset class
class EmotionDataset(Dataset):
    """Validation sentences with their multi-hot label rows.

    The module-level ``tokenizer`` is applied to one sentence per
    ``__getitem__`` call.
    """

    def __init__(self, texts, labels):
        self.texts, self.labels = texts, labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        sentence = self.texts[idx]
        target = torch.tensor(self.labels[idx], dtype=torch.float)
        # Fixed-length encoding: pad or truncate to 64 tokens.
        tokenizer_kwargs = {
            'padding': 'max_length',
            'truncation': True,
            'max_length': 64,
            'return_tensors': 'pt',
        }
        encoded = tokenizer(sentence, **tokenizer_kwargs)
        # squeeze() drops the size-1 dimensions of the returned tensors.
        ids = encoded['input_ids'].squeeze()
        mask = encoded['attention_mask'].squeeze()
        return {'input_ids': ids, 'attention_mask': mask, 'labels': target}

# DataLoader
val_dataset = EmotionDataset(val_df['sentence'].tolist(), y_val)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)

# Get raw logits from validation set
logits_list = []
true_labels = []
with torch.no_grad():
    for batch in tqdm(val_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits.cpu().numpy()
        logits_list.extend(logits)
        true_labels.extend(batch['labels'].cpu().numpy())

logits_array = np.array(logits_list)
true_labels = np.array(true_labels)

# Threshold tuning per class.
#
# The sweep runs over *probability* cut-offs 0.10 .. 0.89, but the model
# outputs are raw logits. Each probability p is therefore converted to the
# equivalent logit cut-off log(p / (1 - p)) before comparing. Comparing logits
# directly against 0.1 .. 0.9 would only ever test cut-offs stricter than the
# default decision (logit 0 == probability 0.5) and could never raise recall
# for under-predicted classes.
#
# The stored thresholds stay in logit space, i.e. they are meant to be used as
# `logit > threshold` on the raw model outputs.
prob_grid = np.arange(0.1, 0.9, 0.01)
logit_grid = np.log(prob_grid / (1.0 - prob_grid))

best_thresholds = {}

for i, emotion in enumerate(mlb.classes_):
    best_f1 = 0
    # Fallback when no cut-off yields a positive F1 (e.g. a class with no
    # validation support): the neutral decision boundary, logit 0 / p = 0.5.
    best_thresh = 0.0
    best_prob = 0.5
    for prob, thresh in zip(prob_grid, logit_grid):
        preds = (logits_array[:, i] > thresh).astype(int)
        f1 = f1_score(true_labels[:, i], preds, zero_division=0)
        if f1 > best_f1:
            best_f1 = f1
            best_thresh = thresh
            best_prob = prob
    # Four decimals keep the logit cut-off close to the tuned probability.
    best_thresholds[emotion] = float(round(best_thresh, 4))
    print(
        f"{emotion:<15} | Best Threshold: {best_thresh:.2f} (logit), "
        f"p={best_prob:.2f} | F1 Score: {best_f1:.4f}"
    )

# Save to JSON (key order follows mlb.classes_, i.e. the model's output order)
with open("emotion_thresholds.json", "w") as f:
    json.dump(best_thresholds, f)

print("\nSaved best thresholds to emotion_thresholds.json")


100%|██████████| 546/546 [00:10<00:00, 54.35it/s]


admiration      | Best Threshold: 0.25 | F1 Score: 0.6067
amusement       | Best Threshold: 0.32 | F1 Score: 0.8180
anger           | Best Threshold: 0.15 | F1 Score: 0.4954
annoyance       | Best Threshold: 0.14 | F1 Score: 0.3325
approval        | Best Threshold: 0.16 | F1 Score: 0.3878
caring          | Best Threshold: 0.68 | F1 Score: 0.4671
confusion       | Best Threshold: 0.11 | F1 Score: 0.3709
curiosity       | Best Threshold: 0.22 | F1 Score: 0.6358
desire          | Best Threshold: 0.70 | F1 Score: 0.5217
disappointment  | Best Threshold: 0.11 | F1 Score: 0.2207
disapproval     | Best Threshold: 0.31 | F1 Score: 0.4454
disgust         | Best Threshold: 0.15 | F1 Score: 0.4938
embarrassment   | Best Threshold: 0.42 | F1 Score: 0.6364
excitement      | Best Threshold: 0.29 | F1 Score: 0.3457
fear            | Best Threshold: 0.73 | F1 Score: 0.6569
gratitude       | Best Threshold: 0.10 | F1 Score: 0.8436
grief           | Best Threshold: 0.50 | F1 Score: 0.0000
guilt         

## Applying the merged trained model to the SemEval sentences file for tagging

In [None]:
# Install required packages
!pip install transformers scikit-learn -q

import pandas as pd
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from torch.utils.data import Dataset, DataLoader
import numpy as np
import json
from tqdm import tqdm

# Load SemEval balanced sentences
semeval_df = pd.read_csv("/content/drive/MyDrive/semeval_balanced_sentences.tsv", sep='\t')

# Load tokenizer and model
model_path = "/content/emotion_model_epoch5"
tokenizer = DistilBertTokenizer.from_pretrained(model_path)
model = DistilBertForSequenceClassification.from_pretrained(model_path)
model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Load best per-class thresholds
with open("emotion_thresholds.json", "r") as f:
    threshold_dict = json.load(f)

# Emotion names are taken from the threshold file (not from the model config).
# json.load preserves key order, and the prediction loop treats the i-th key
# as the name of the model's i-th output.
emotion_labels = list(threshold_dict.keys())

# Fail early if the threshold file does not describe every model output;
# otherwise predictions would be attributed to the wrong emotion names.
if len(emotion_labels) != model.config.num_labels:
    raise ValueError(
        f"emotion_thresholds.json lists {len(emotion_labels)} emotions but "
        f"the model has {model.config.num_labels} outputs"
    )

# Dataset class
class SemEvalDataset(Dataset):
    """Label-free dataset of sentences, tokenized on access.

    Each item holds only ``input_ids`` and ``attention_mask``; the
    module-level ``tokenizer`` produces them.
    """

    def __init__(self, texts):
        self.texts = texts

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # Pad or truncate to a fixed length of 64 tokens.
        tokenizer_kwargs = {
            'padding': 'max_length',
            'truncation': True,
            'max_length': 64,
            'return_tensors': 'pt',
        }
        encoded = tokenizer(self.texts[idx], **tokenizer_kwargs)
        # squeeze() drops the size-1 dimensions of the returned tensors.
        return {
            key: encoded[key].squeeze()
            for key in ('input_ids', 'attention_mask')
        }

# Prepare DataLoader (file order is preserved so predictions align with rows)
texts = semeval_df['sentence'].tolist()
dataset = SemEvalDataset(texts)
dataloader = DataLoader(dataset, batch_size=8, shuffle=False)

# Predict emotions: one list of emotion names per input sentence
predicted_emotions = []
with torch.no_grad():
    for batch in tqdm(dataloader):
        outputs = model(
            input_ids=batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
        )

        for logit in outputs.logits.cpu().numpy():
            # Keep every emotion whose raw logit exceeds its own threshold.
            emotions = [
                label
                for i, label in enumerate(emotion_labels)
                if logit[i] > threshold_dict[label]
            ]

            # Fallback to top-1 if no emotions above threshold
            if not emotions:
                emotions = [emotion_labels[np.argmax(logit)]]

            predicted_emotions.append(emotions)

# Attach predictions to DataFrame
semeval_df['predicted_emotions'] = predicted_emotions

# Save results (the list column is written as its string representation)
semeval_df.to_csv("semeval_emotion_predictions.tsv", sep='\t', index=False)
print("\n✅ Saved SemEval emotion predictions with top-1 fallback to 'semeval_emotion_predictions.tsv'")


100%|██████████| 13485/13485 [04:08<00:00, 54.33it/s]



✅ Saved SemEval emotion predictions with top-1 fallback to 'semeval_emotion_predictions.tsv'


In [None]:
import pandas as pd

# Read the tagged SemEval sentences back from disk for a quick sanity check.
pred_df = pd.read_csv("semeval_emotion_predictions.tsv", sep="\t")

# First ten rows, rendered through the notebook's display helper
print("🔍 Preview of SemEval Emotion Predictions:")
preview = pred_df.head(10)
display(preview)

# Row count of the whole file
row_count = len(pred_df)
print(f"\n📦 Total rows: {row_count}")


🔍 Preview of SemEval Emotion Predictions:


Unnamed: 0,Text-ID,Sentence-ID,sentence,predicted_emotions
0,BG_002,1,THE STATE AGAINST LOZAN PANOV FOR SUSPICIONS T...,['neutral']
1,BG_002,2,KB.,['neutral']
2,BG_002,3,MORE BUNGALOWS AT LOWER PRICES The avalanche o...,['annoyance']
3,BG_002,4,For months now he has been the subject of an i...,['neutral']
4,BG_002,5,For years the prosecutor's office has been a p...,['neutral']
5,BG_002,6,Many members of the judiciary bought property ...,['anger']
6,BG_002,7,"Just for Lozan Panov, however, the National Re...",['anger']
7,BG_002,8,"Because of this case, as well as because of th...",['realization']
8,BG_002,9,It is for 2012 and was originally under the ge...,['approval']
9,BG_002,10,This means that the tax authorities presumptiv...,['neutral']



📦 Total rows: 107875


## Merging the emotion-prediction file with the SemEval labels file

In [None]:
import pandas as pd

# Load both datasets
emotion_df = pd.read_csv("semeval_emotion_predictions.tsv", sep="\t")
value_df = pd.read_csv("/content/drive/MyDrive/semeval_balanced_labels.tsv", sep="\t")

key_cols = ["Text-ID", "Sentence-ID"]

# An inner merge pairs every sentence row with *every* label row sharing its
# key. If the labels file repeats a row, each repetition multiplies the
# sentence rows and the merged file fills up with identical copies. Exact
# duplicate label rows carry no extra information, so they are removed first.
value_unique = value_df.drop_duplicates()
removed = len(value_df) - len(value_unique)
if removed:
    print(f"ℹ️ Dropped {removed} exact duplicate label rows before merging")

# Label rows that share a key but differ in content are kept as they are;
# report them so the resulting row multiplication is not silent.
repeated_keys = int(value_unique.duplicated(subset=key_cols).sum())
if repeated_keys:
    print(f"⚠️ {repeated_keys} label rows still share a key with another row; "
          "matching sentences will appear more than once")

# Merge on correct columns
merged_df = pd.merge(emotion_df, value_unique, on=key_cols, how="inner")
print(f"Rows: sentences={len(emotion_df)} | labels={len(value_unique)} | merged={len(merged_df)}")

# Show a few rows
print("✅ Merged Dataset Preview:")
display(merged_df.head())

# Save final merged file
merged_df.to_csv("semeval_merged_emotion_values.tsv", sep="\t", index=False)
print("\n💾 Saved final merged file as 'semeval_merged_emotion_values.tsv'")


✅ Merged Dataset Preview:


Unnamed: 0,Text-ID,Sentence-ID,sentence,predicted_emotions,Self-direction: thought attained,Self-direction: thought constrained,Self-direction: action attained,Self-direction: action constrained,Stimulation attained,Stimulation constrained,...,Benevolence: caring attained,Benevolence: caring constrained,Benevolence: dependability attained,Benevolence: dependability constrained,Universalism: concern attained,Universalism: concern constrained,Universalism: nature attained,Universalism: nature constrained,Universalism: tolerance attained,Universalism: tolerance constrained
0,BG_002,1,THE STATE AGAINST LOZAN PANOV FOR SUSPICIONS T...,['neutral'],0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,BG_002,1,THE STATE AGAINST LOZAN PANOV FOR SUSPICIONS T...,['neutral'],0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,BG_002,1,THE STATE AGAINST LOZAN PANOV FOR SUSPICIONS T...,['neutral'],0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,BG_002,2,KB.,['neutral'],0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,BG_002,2,KB.,['neutral'],0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0



💾 Saved final merged file as 'semeval_merged_emotion_values.tsv'


## Pre-processing data and splitting for training 2 models (baseline and advanced)

In [None]:
# Install dependencies (if needed)
!pip install transformers scikit-learn -q

import pandas as pd
import json
from sklearn.model_selection import train_test_split

# Load merged SemEval dataset
merged_df = pd.read_csv("/content/drive/MyDrive/semeval_merged_emotion_values.tsv", sep="\t")

# Parse stringified list
merged_df['predicted_emotions'] = merged_df['predicted_emotions'].apply(eval)

# Create input columns
merged_df['emotion_text'] = merged_df['predicted_emotions'].apply(lambda x: ' '.join(x))
merged_df['input_sentence_only'] = merged_df['sentence']
merged_df['input_with_emotions'] = merged_df['sentence'] + ' [EMO] ' + merged_df['emotion_text']

# Identify label columns (all after 'sentence' and 'predicted_emotions')
label_columns = merged_df.columns.difference(['Text-ID', 'Sentence-ID', 'sentence', 'predicted_emotions', 'emotion_text', 'input_sentence_only', 'input_with_emotions']).tolist()

# Save label columns for future reference
with open("value_label_columns.json", "w") as f:
    json.dump(label_columns, f)

# Train/val/test split
X_temp, X_test = train_test_split(merged_df, test_size=0.10, random_state=42)
X_train, X_val = train_test_split(X_temp, test_size=0.1111, random_state=42)  # 0.1111 * 0.9 = 0.10

# Save to disk
X_train.to_csv("value_train.tsv", sep="\t", index=False)
X_val.to_csv("value_val.tsv", sep="\t", index=False)
X_test.to_csv("value_test.tsv", sep="\t", index=False)

print("✅ Preprocessing complete.")
print(f"Train: {len(X_train)} | Val: {len(X_val)} | Test: {len(X_test)}")
print("\n💾 Saved: value_train.tsv, value_val.tsv, value_test.tsv, value_label_columns.json")


✅ Preprocessing complete.
Train: 771213 | Val: 96391 | Test: 96401

💾 Saved: value_train.tsv, value_val.tsv, value_test.tsv, value_label_columns.json


In [None]:
import pandas as pd

# Reload the merged TSV from Drive and list its column names as a sanity check
df = pd.read_csv(
    "/content/drive/MyDrive/semeval_merged_emotion_values.tsv",
    sep="\t",
)
print(list(df.columns))


['Text-ID', 'Sentence-ID', 'sentence', 'predicted_emotions', 'Self-direction: thought attained', 'Self-direction: thought constrained', 'Self-direction: action attained', 'Self-direction: action constrained', 'Stimulation attained', 'Stimulation constrained', 'Hedonism attained', 'Hedonism constrained', 'Achievement attained', 'Achievement constrained', 'Power: dominance attained', 'Power: dominance constrained', 'Power: resources attained', 'Power: resources constrained', 'Face attained', 'Face constrained', 'Security: personal attained', 'Security: personal constrained', 'Security: societal attained', 'Security: societal constrained', 'Tradition attained', 'Tradition constrained', 'Conformity: rules attained', 'Conformity: rules constrained', 'Conformity: interpersonal attained', 'Conformity: interpersonal constrained', 'Humility attained', 'Humility constrained', 'Benevolence: caring attained', 'Benevolence: caring constrained', 'Benevolence: dependability attained', 'Benevolence: d

## Model training for the baseline: sentences only, no emotions

In [None]:
# Install necessary packages
!pip install transformers scikit-learn -q

import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW
from sklearn.metrics import f1_score, classification_report
import json
from tqdm import tqdm
import time

# Load data
train_df = pd.read_csv("/content/drive/MyDrive/value_train.tsv", sep="\t")
val_df = pd.read_csv("/content/drive/MyDrive/value_val.tsv", sep="\t")

# Reduce training set size for faster runs
train_df = train_df.sample(n=20000, random_state=42).reset_index(drop=True)

# Load label columns
with open("/content/drive/MyDrive/value_label_columns.json") as f:
    label_columns = json.load(f)

# Select input column (sentence only)
train_texts = train_df["input_sentence_only"].tolist()
val_texts = val_df["input_sentence_only"].tolist()

train_labels = train_df[label_columns].values
val_labels = val_df[label_columns].values

# Tokenizer and model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(label_columns),
    problem_type="multi_label_classification"
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Dataset class
class ValueDataset(Dataset):
    """Pairs each text with its label vector and tokenizes on access.

    Items are tokenized lazily in ``__getitem__`` using the module-level
    ``tokenizer``, padded/truncated to 128 tokens.
    """

    def __init__(self, texts, labels):
        # texts and labels are indexed with the same position in __getitem__
        self.texts = texts
        self.labels = labels

    def __len__(self):
        # Dataset size is driven by the number of texts
        return len(self.texts)

    def __getitem__(self, idx):
        """Return a dict with 'input_ids', 'attention_mask' and 'labels'."""
        # Label vector as a float tensor
        target = torch.tensor(self.labels[idx], dtype=torch.float)

        encoded = tokenizer(
            self.texts[idx],
            padding="max_length",
            truncation=True,
            max_length=128,
            return_tensors="pt",
        )

        # squeeze() drops size-1 dimensions - presumably the leading batch
        # dimension produced by return_tensors="pt" for a single text.
        item = {
            field: encoded[field].squeeze()
            for field in ('input_ids', 'attention_mask')
        }
        item['labels'] = target
        return item

# Wrap texts/labels in datasets and batch them; only the training loader
# shuffles.
train_ds = ValueDataset(train_texts, train_labels)
val_ds = ValueDataset(val_texts, val_labels)
train_dl = DataLoader(train_ds, batch_size=8, shuffle=True)
val_dl = DataLoader(val_ds, batch_size=8)

# AdamW over all model parameters
optimizer = AdamW(model.parameters(), lr=2e-5)

# Fine-tune for a fixed number of epochs
epochs = 3
model.train()
for epoch in range(epochs):
    start = time.time()
    total_loss = 0  # sum of per-batch losses for this epoch (not averaged)

    progress = tqdm(train_dl, desc=f"Epoch {epoch+1}")
    for batch in progress:
        optimizer.zero_grad()

        # Passing labels makes the model compute and return the loss itself
        outputs = model(
            input_ids=batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
            labels=batch['labels'].to(device),
        )
        outputs.loss.backward()
        optimizer.step()

        total_loss += outputs.loss.item()

    elapsed_min = round((time.time() - start) / 60, 2)
    print(f"\nEpoch {epoch+1} loss: {total_loss:.4f} | Time: {elapsed_min} min")

# Evaluation: run the model over the validation loader without gradients and
# collect the true and predicted multi-hot label vectors.
model.eval()
y_true, y_pred = [], []

with torch.no_grad():
    for batch in val_dl:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].cpu().numpy()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        # The model returns raw logits, not probabilities. Map them through a
        # sigmoid before applying the 0.5 decision threshold; thresholding the
        # logits themselves at 0.5 would correspond to a probability cut-off
        # of about 0.62 and under-predict positive labels.
        probs = torch.sigmoid(outputs.logits).cpu().numpy()
        preds = (probs > 0.5).astype(int)
        y_true.extend(labels)
        y_pred.extend(preds)

# Stack the per-sample vectors into 2-D integer arrays for sklearn
y_true = np.array(y_true).astype(int)
y_pred = np.array(y_pred).astype(int)

# Per-label precision/recall/F1; zero_division=0 reports 0 (instead of
# warning) for labels that have no predicted or true samples.
print("\nClassification Report:")
print(classification_report(y_true, y_pred, target_names=label_columns, zero_division=0))



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1:   1%|          | 31/2500 [00:06<08:07,  5.06it/s]


KeyboardInterrupt: 