<a href="https://colab.research.google.com/github/OGharsh/emotion-detection/blob/main/emotion_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
# ============================================
# GoEmotions (Full 28-label) — DeBERTa-v3-base (Multi-label)

%pip -q uninstall -y numba llvmlite jax jaxlib opencv-python opencv-python-headless cudf-cu12 pylibcudf-cu12 || true

%pip -q install -U \
  "numpy==2.0.2" \
  "scipy>=1.14.0" \
  "scikit-learn>=1.5.2" \
  "pyarrow>=14,<20" \
  "transformers>=4.44.0" \
  "datasets>=3.0.0" \
  "evaluate" \
  "matplotlib" \
  "sentencepiece" \
  "accelerate>=0.34.2"

# --- Imports
import os, json, math, random, csv
from datetime import datetime
import numpy as np
import torch
import matplotlib.pyplot as plt
import pandas as pd

from datasets import load_dataset, DatasetDict
from transformers import (AutoTokenizer, AutoConfig, AutoModelForSequenceClassification,
                          DataCollatorWithPadding, TrainingArguments, Trainer)
from sklearn.metrics import (precision_recall_fscore_support, classification_report,
                             average_precision_score, roc_auc_score, multilabel_confusion_matrix)

[0m

In [15]:
# ---------------------------
# Config & Reproducibility
# ---------------------------
SEED = 42
random.seed(SEED); np.random.seed(SEED)
torch.manual_seed(SEED); torch.cuda.manual_seed_all(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
try:
    torch.set_float32_matmul_precision("high")
except Exception:
    pass

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", DEVICE)

MODEL_NAME = "microsoft/deberta-v3-base"
OUTPUT_DIR = "outputs_goemotions"
FIG_DIR = os.path.join(OUTPUT_DIR, "figures")
ARTIFACTS_DIR = os.path.join(OUTPUT_DIR, "artifacts")
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(FIG_DIR, exist_ok=True)
os.makedirs(ARTIFACTS_DIR, exist_ok=True)


Device: cuda


In [16]:
# ---------------------------
# Load GoEmotions
# ---------------------------
def load_goemotions_full():
    for cfg in ["original", "raw", None]:
        try:
            ds = load_dataset("go_emotions", cfg) if cfg else load_dataset("go_emotions")
            names = ds["train"].features["labels"].feature.names
            if len(names) == 28:
                print(f"Loaded go_emotions config: {cfg or 'default'} (28 labels)")
                return ds, names
        except Exception as e:
            print(f"Config {cfg} failed: {e}")
    raise ValueError("Could not load the full 28-label GoEmotions config. Try config='original'.")

ds_raw, label_names = load_goemotions_full()
num_labels = len(label_names)
assert num_labels == 28, f"Expected 28 labels; got {num_labels}."

print("Detected labels (28):")
print(label_names)

assert isinstance(ds_raw, DatasetDict), "Dataset must be a DatasetDict."


Config original failed: BuilderConfig 'original' not found. Available: ['raw', 'simplified']
Config raw failed: 'labels'
Loaded go_emotions config: default (28 labels)
Detected labels (28):
['admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise', 'neutral']


In [17]:
# ---------------------------
# Preprocessing (tokenization + multi-hot labels)
# ---------------------------
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
tokenizer.padding_side = "right"

def encode_batch(batch):
    tokenized = tokenizer(batch["text"], truncation=True, max_length=256)
    bin_labels = []
    for labs in batch["labels"]:
        vec = np.zeros(num_labels, dtype="float32")
        for lid in labs:
            if 0 <= lid < num_labels:
                vec[lid] = 1.0
        bin_labels.append(vec.tolist())
    tokenized["labels"] = bin_labels
    return tokenized

cols_to_remove = [c for c in ds_raw["train"].column_names if c not in ("text","labels")]
ds = ds_raw.map(encode_batch, batched=True, remove_columns=cols_to_remove)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)



Map:   0%|          | 0/5426 [00:00<?, ? examples/s]

In [18]:
# ---------------------------
# Model (multi-label classification via BCEWithLogits)
# ---------------------------
config = AutoConfig.from_pretrained(
    MODEL_NAME,
    num_labels=num_labels,
    problem_type="multi_label_classification",
    id2label={i: n for i, n in enumerate(label_names)},
    label2id={n: i for i, n in enumerate(label_names)},
)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, config=config).to(DEVICE)

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
# ---------------------------
# Metrics helpers
# ---------------------------
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def metrics_from_probs(y_true, y_prob, threshold=0.5):
    y_pred = (y_prob >= threshold).astype(int)
    micro = precision_recall_fscore_support(y_true, y_pred, average="micro", zero_division=0)
    macro = precision_recall_fscore_support(y_true, y_pred, average="macro", zero_division=0)
    out = {
        "micro_precision": micro[0], "micro_recall": micro[1], "micro_f1": micro[2],
        "macro_precision": macro[0], "macro_recall": macro[1], "macro_f1": macro[2],
        "threshold": threshold
    }
    # AUPRC / AUROC (macro over labels)
    try:
        pr_aucs, roc_aucs = [], []
        for j in range(y_true.shape[1]):
            if y_true[:, j].sum() > 0:
                pr_aucs.append(average_precision_score(y_true[:, j], y_prob[:, j]))
                try:
                    roc_aucs.append(roc_auc_score(y_true[:, j], y_prob[:, j]))
                except Exception:
                    pass
        if pr_aucs:
            out["macro_auprc"] = float(np.mean(pr_aucs))
        if roc_aucs:
            out["macro_auroc"] = float(np.mean(roc_aucs))
    except Exception as e:
        print("AUC computation skipped:", e)
    return out, y_pred

best_threshold = 0.5  # tuned after training

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    probs = sigmoid(logits)
    metrics, _ = metrics_from_probs(labels, probs, threshold=best_threshold)
    return metrics


In [9]:
# ---------------------------
# Ensure labels are float32 (required for BCEWithLogitsLoss)
# ---------------------------
from datasets import Sequence, Value
ds = ds.cast_column("labels", Sequence(Value("float32")))

# ---------------------------
# Data collator that guarantees float labels
# ---------------------------
from transformers import DataCollatorWithPadding

class DataCollatorFloatLabels(DataCollatorWithPadding):
    def __call__(self, features):
        labels = [torch.tensor(f["labels"], dtype=torch.float32) for f in features]
        batch = super().__call__([{k:v for k,v in f.items() if k!="labels"} for f in features])
        batch["labels"] = torch.stack(labels)
        return batch

data_collator = DataCollatorFloatLabels(tokenizer=tokenizer)

# ---------------------------
# Training Arguments
# ---------------------------
BATCH_SIZE = 16
EPOCHS = 3
LR = 2e-5
bf16_ok = torch.cuda.is_available() and torch.cuda.get_device_capability(0)[0] >= 8
RUN_NAME = f"debertav3_base_goemotions_full_seed{SEED}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"

args = TrainingArguments(
    output_dir=os.path.join(OUTPUT_DIR, "hf_runs"),
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    learning_rate=LR,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    weight_decay=0.01,
    logging_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="macro_f1",
    greater_is_better=True,
    fp16=(torch.cuda.is_available() and not bf16_ok),
    bf16=bf16_ok,
    report_to="none",
    seed=SEED
)

# ---------------------------
# Trainer (use processing_class instead of tokenizer)
# ---------------------------
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=ds["train"],
    eval_dataset=ds["validation"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

train_out = trainer.train()
print("\nTraining complete.")


Casting the dataset:   0%|          | 0/43410 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/5426 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/5427 [00:00<?, ? examples/s]

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 2, 'bos_token_id': 1}.


Epoch,Training Loss,Validation Loss,Micro Precision,Micro Recall,Micro F1,Macro Precision,Macro Recall,Macro F1,Threshold,Macro Auprc,Macro Auroc
1,0.0919,0.091457,0.692214,0.445925,0.542421,0.473997,0.309845,0.337479,0.5,0.432958,0.890762


Epoch,Training Loss,Validation Loss,Micro Precision,Micro Recall,Micro F1,Macro Precision,Macro Recall,Macro F1,Threshold,Macro Auprc,Macro Auroc
1,0.0919,0.091457,0.692214,0.445925,0.542421,0.473997,0.309845,0.337479,0.5,0.432958,0.890762
2,0.0836,0.084618,0.70927,0.474922,0.568907,0.511193,0.360769,0.400748,0.5,0.48694,0.922016
3,0.0773,0.083604,0.686341,0.504859,0.581775,0.565775,0.386508,0.422843,0.5,0.507675,0.925815



Training complete.


In [10]:
# ---------------------------
# Threshold tuning on validation for best macro-F1
# ---------------------------
val_logits = trainer.predict(ds["validation"]).predictions
val_labels = np.vstack(ds["validation"]["labels"]).astype(int)
val_probs = sigmoid(val_logits)

candidate_thresholds = np.linspace(0.05, 0.95, 19)
scores = []
for th in candidate_thresholds:
    m, _ = metrics_from_probs(val_labels, val_probs, threshold=th)
    scores.append((th, m["macro_f1"]))

best_threshold, best_val_macro_f1 = max(scores, key=lambda x: x[1])
print(f"\nBest threshold on validation: {best_threshold:.2f} (macro-F1={best_val_macro_f1:.4f})")

# Optional: log a final HF-style eval with the tuned threshold
final_val = trainer.evaluate()
print("Final validation metrics (with tuned threshold):", final_val)


Best threshold on validation: 0.25 (macro-F1=0.4767)


Final validation metrics (with tuned threshold): {'eval_loss': 0.0836038589477539, 'eval_micro_precision': 0.5544593015708166, 'eval_micro_recall': 0.669435736677116, 'eval_micro_f1': 0.6065469005183555, 'eval_macro_precision': 0.45045118137874984, 'eval_macro_recall': 0.5242587193764686, 'eval_macro_f1': 0.4767443681480735, 'eval_threshold': 0.25, 'eval_macro_auprc': 0.5076746653575488, 'eval_macro_auroc': 0.9258154131406816, 'eval_runtime': 27.1857, 'eval_samples_per_second': 199.59, 'eval_steps_per_second': 12.507, 'epoch': 3.0}


In [11]:
# ---------------------------
# Final Evaluation on TEST with tuned threshold
# ---------------------------
test_pred = trainer.predict(ds["test"])
test_logits = test_pred.predictions
test_labels = np.vstack(ds["test"]["labels"]).astype(int)
test_probs = sigmoid(test_logits)

test_metrics, test_pred_bin = metrics_from_probs(test_labels, test_probs, threshold=best_threshold)
print("\n==== TEST METRICS (tuned threshold) ====")
for k, v in test_metrics.items():
    print(f"{k}: {v:.4f}" if isinstance(v, float) else f"{k}: {v}")


==== TEST METRICS (tuned threshold) ====
micro_precision: 0.5507
micro_recall: 0.6767
micro_f1: 0.6072
macro_precision: 0.4447
macro_recall: 0.5271
macro_f1: 0.4704
threshold: 0.2500
macro_auprc: 0.4834
macro_auroc: 0.9239


In [12]:
# ---------------------------
# Per-label metrics & reports
# ---------------------------
per_label = []
for j, name in enumerate(label_names):
    y_true_j = test_labels[:, j]
    y_pred_j = test_pred_bin[:, j]
    p, r, f1, _ = precision_recall_fscore_support(y_true_j, y_pred_j, average="binary", zero_division=0)
    support = int(y_true_j.sum())
    per_label.append({"label": name, "precision": p, "recall": r, "f1": f1, "support": support})
per_label_sorted = sorted(per_label, key=lambda d: d["f1"], reverse=True)

pl_path = os.path.join(OUTPUT_DIR, "per_label_metrics.csv")
with open(pl_path, "w", newline="") as f:
    writer = csv.DictWriter(f, fieldnames=["label","precision","recall","f1","support"])
    writer.writeheader()
    writer.writerows(per_label_sorted)

clf_report = classification_report(test_labels, test_pred_bin, target_names=label_names, zero_division=0)
with open(os.path.join(OUTPUT_DIR, "classification_report.txt"), "w") as f:
    f.write(clf_report)
print("\nClassification report saved.")



Classification report saved.


In [13]:
# ---------------------------
# Confusion Matrices (per-label)
# ---------------------------
ml_cms = multilabel_confusion_matrix(test_labels, test_pred_bin)
cols = 4
rows = math.ceil(num_labels / cols)
fig = plt.figure(figsize=(cols*3.2, rows*3.0))
for i in range(num_labels):
    ax = fig.add_subplot(rows, cols, i+1)
    cm = ml_cms[i]
    ax.imshow(cm, interpolation='nearest')
    ax.set_title(label_names[i], fontsize=9)
    ax.set_xticks([0,1]); ax.set_xticklabels(['Pred 0','Pred 1'], fontsize=7, rotation=45)
    ax.set_yticks([0,1]); ax.set_yticklabels(['True 0','True 1'], fontsize=7)
    for r in range(2):
        for c in range(2):
            ax.text(c, r, str(cm[r, c]), va='center', ha='center', fontsize=8)
    ax.tick_params(axis='both', which='both', length=0)
plt.tight_layout()
cm_grid_path = os.path.join(FIG_DIR, "multilabel_confusion_matrices.png")
plt.savefig(cm_grid_path, dpi=200, bbox_inches="tight")
plt.close()
print(f"Saved confusion matrices grid to {cm_grid_path}")


Saved confusion matrices grid to outputs_goemotions/figures/multilabel_confusion_matrices.png


In [20]:
# ---------------------------
# Misclassified Samples (top-5 probs)
# ---------------------------
def idx_to_labels(idx_list):
    return [label_names[i] for i in idx_list]

texts = ds_raw["test"]["text"]
mis_rows = []
for i in range(len(texts)):
    true_idx = np.where(test_labels[i]==1)[0].tolist()
    pred_idx = np.where(test_pred_bin[i]==1)[0].tolist()
    if set(true_idx) != set(pred_idx):
        topk = np.argsort(-test_probs[i])[:5].tolist()
        mis_rows.append({
            "idx": i,
            "text": texts[i],
            "true_labels": ", ".join(idx_to_labels(true_idx)),
            "pred_labels": ", ".join(idx_to_labels(pred_idx)),
            "top5_labels": ", ".join(idx_to_labels(topk)),
            "top5_probs": ", ".join([f"{test_probs[i][k]:.3f}" for k in topk])
        })
mis_rows = mis_rows[:1000]
mis_df = pd.DataFrame(mis_rows)
mis_path = os.path.join(OUTPUT_DIR, "misclassified_samples.csv")
mis_df.to_csv(mis_path, index=False)
print(f"Saved misclassified samples to {mis_path} (rows={len(mis_df)})")


Saved misclassified samples to outputs_goemotions/misclassified_samples.csv (rows=1000)


In [21]:
# ---------------------------
# Save artifacts & run metadata
# ---------------------------
trainer.save_model(os.path.join(ARTIFACTS_DIR, "model"))
tokenizer.save_pretrained(os.path.join(ARTIFACTS_DIR, "model"))

metrics_payload = {
    "seed": SEED,
    "model_name": MODEL_NAME,
    "epochs": 3,
    "batch_size": 16,
    "learning_rate": 2e-5,
    "device": DEVICE,
    "best_validation_threshold": round(float(best_threshold), 3),
    "validation_best_macro_f1": float(best_val_macro_f1),
    "test_metrics": {k: (float(v) if isinstance(v, (np.floating, float)) else v) for k, v in test_metrics.items()},
    "timestamp": datetime.now().isoformat()
}
with open(os.path.join(OUTPUT_DIR, "metrics_test.json"), "w") as f:
    json.dump(metrics_payload, f, indent=2)

with open(os.path.join(OUTPUT_DIR, "label_map.json"), "w") as f:
    json.dump({i: name for i, name in enumerate(label_names)}, f, indent=2)

print("\nArtifacts saved in:")
print(" -", OUTPUT_DIR)
print(" -", ARTIFACTS_DIR)
print("\nNEXT STEPS:")
print("1) Review metrics_test.json and per_label_metrics.csv")
print("2) Inspect misclassified_samples.csv for qualitative analysis")
print("3) (Optional) Adjust LR/EPOCHS or try DeBERTa-v3-large for an ablation table")


Artifacts saved in:
 - outputs_goemotions
 - outputs_goemotions/artifacts

NEXT STEPS:
1) Review metrics_test.json and per_label_metrics.csv
2) Inspect misclassified_samples.csv for qualitative analysis
3) (Optional) Adjust LR/EPOCHS or try DeBERTa-v3-large for an ablation table


In [39]:
!zip -r outputs_goemotions.zip outputs_goemotions


updating: outputs_goemotions/ (stored 0%)
updating: outputs_goemotions/misclassified_samples.csv (deflated 69%)
updating: outputs_goemotions/per_label_metrics.csv (deflated 50%)
updating: outputs_goemotions/metrics_test.json (deflated 47%)
updating: outputs_goemotions/figures/ (stored 0%)
updating: outputs_goemotions/figures/threshold_tuning_curve.png (deflated 11%)
updating: outputs_goemotions/figures/multilabel_confusion_matrices.png (deflated 24%)
updating: outputs_goemotions/figures/per_label_f1_heatmap.png (deflated 24%)
updating: outputs_goemotions/figures/class_distribution.png (deflated 20%)
updating: outputs_goemotions/figures/val_f1_curve.png (deflated 11%)
updating: outputs_goemotions/figures/train_loss_curve.png (deflated 11%)
updating: outputs_goemotions/artifacts/ (stored 0%)
updating: outputs_goemotions/artifacts/model/ (stored 0%)
updating: outputs_goemotions/artifacts/model/training_args.bin (deflated 53%)
updating: outputs_goemotions/artifacts/model/spm.model (deflate

In [40]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [41]:
!cp outputs_goemotions.zip /content/drive/MyDrive/


In [34]:
import json
import pandas as pd
from matplotlib import pyplot as plt
import os

state_path = "/content/outputs_goemotions/hf_runs/checkpoint-8142/trainer_state.json"

with open(state_path, "r") as f:
    state = json.load(f)

# 'log_history' is a list of dicts (events across training/eval)
hist = pd.DataFrame(state.get("log_history", []))

# Optional: sort by step if present
if "step" in hist.columns:
    hist = hist.sort_values("step").reset_index(drop=True)

# Extract series safely
train_loss = hist.loc[hist["loss"].notna(), "loss"].tolist() if "loss" in hist else []
eval_f1 = hist.loc[hist["eval_macro_f1"].notna(), "eval_macro_f1"].tolist() if "eval_macro_f1" in hist else []

# Plot + save
fig_dir = "/content/outputs_goemotions/figures"
os.makedirs(fig_dir, exist_ok=True)

plt.figure()
plt.plot(train_loss)
plt.title("Training Loss")
plt.xlabel("Logged steps (index)")
plt.ylabel("Loss")
plt.savefig(os.path.join(fig_dir, "train_loss_curve.png"))
plt.close()

plt.figure()
plt.plot(eval_f1)
plt.title("Validation Macro F1")
plt.xlabel("Eval events (index)")
plt.ylabel("Macro F1")
plt.savefig(os.path.join(fig_dir, "val_f1_curve.png"))
plt.close()

print("Saved plots to:", fig_dir)


Saved plots to: /content/outputs_goemotions/figures


In [42]:
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_csv("/content/outputs_goemotions/per_label_metrics.csv")
plt.figure(figsize=(8,10))
sns.heatmap(df[["f1"]], annot=False, cmap="viridis", yticklabels=df["label"])
plt.title("Per-label F1 Heatmap")
plt.savefig("/content/outputs_goemotions/figures/per_label_f1_heatmap.png", dpi=200, bbox_inches='tight')
plt.close()


In [32]:
import numpy as np
import matplotlib.pyplot as plt

thresholds = np.linspace(0.05, 0.95, 19)
f1_scores = []

from sklearn.metrics import precision_recall_fscore_support

for th in thresholds:
    preds = (val_probs >= th).astype(int)
    _, _, f1, _ = precision_recall_fscore_support(val_labels, preds, average='macro', zero_division=0)
    f1_scores.append(f1)

plt.figure(figsize=(6,4))
plt.plot(thresholds, f1_scores, marker='o')
plt.title("Threshold Tuning Curve (Validation Macro-F1)")
plt.xlabel("Threshold")
plt.ylabel("Macro-F1 Score")
plt.grid()
plt.savefig("/content/outputs_goemotions/figures/threshold_tuning_curve.png", dpi=200, bbox_inches="tight")
plt.close()


In [31]:
import pandas as pd
import matplotlib.pyplot as plt

train_df = pd.DataFrame(ds_raw["train"])
label_counts = {}

for labels in train_df["labels"]:
    for l in labels:
        label_counts[l] = label_counts.get(l, 0) + 1

label_counts_sorted = dict(sorted(label_counts.items(), key=lambda x: x[1], reverse=True))
plt.figure(figsize=(10,6))
plt.bar(range(len(label_counts_sorted)), list(label_counts_sorted.values()))
plt.xticks(range(len(label_counts_sorted)), [label_names[i] for i in label_counts_sorted.keys()], rotation=90)
plt.title("Training Set Label Distribution (Imbalance Visualization)")
plt.tight_layout()
plt.savefig("/content/outputs_goemotions/figures/class_distribution.png", dpi=200)
plt.close()


In [38]:
import numpy as np
from collections import Counter

# true/pred as binary arrays (you already have `test_labels`, `test_pred_bin`)
pair_counts = Counter()

for t, p in zip(test_labels, test_pred_bin):
    true_idx = np.where(t == 1)[0].tolist()
    pred_idx = np.where(p == 1)[0].tolist()
    # for each true label A, count any extra predicted B that's not in true
    extras = set(pred_idx) - set(true_idx)
    for a in true_idx:
        for b in extras:
            pair_counts[(label_names[a], label_names[b])] += 1

# Top 10 (A -> B means A present, B wrongly predicted alongside/ instead)
top_pairs = pair_counts.most_common(10)
top_pairs


[(('neutral', 'curiosity'), 143),
 (('approval', 'neutral'), 130),
 (('neutral', 'disapproval'), 121),
 (('curiosity', 'neutral'), 119),
 (('neutral', 'approval'), 116),
 (('disapproval', 'neutral'), 115),
 (('neutral', 'annoyance'), 110),
 (('anger', 'annoyance'), 107),
 (('annoyance', 'neutral'), 96),
 (('neutral', 'admiration'), 85)]