In [26]:
from datasets import load_dataset
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    DataCollatorWithPadding, Trainer, TrainingArguments
)
from peft import LoraConfig, PromptTuningConfig, get_peft_model
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import numpy as np, torch, random
import pandas as pd

In [17]:
# Base model and tokenizer
base_model = "distilbert-base-uncased"
tokenizer  = AutoTokenizer.from_pretrained(base_model)

# Load dataset
raw_dataset = load_dataset("go_emotions")
class_names = raw_dataset["train"].features["labels"].feature.names
num_labels  = len(class_names)
neutral_id  = class_names.index("neutral")

# Use 15 000 train / 3 000 val / 3 000 test samples
train_raw = raw_dataset["train"].shuffle(seed=42).select(range(15000))
val_raw   = raw_dataset["validation"].shuffle(seed=42).select(range(3000))
test_raw  = raw_dataset["test"].shuffle(seed=42).select(range(3000))

# Keep original multi-labels for any-of evaluation
def keep_multilabel(batch):
    batch["labels_multi"] = batch["labels"]
    return batch

train_raw = train_raw.map(keep_multilabel, batched=True)
val_raw   = val_raw.map(keep_multilabel, batched=True)
test_raw  = test_raw.map(keep_multilabel, batched=True)

# Convert multi-label → single label (pick first label if exists, else neutral)
def to_single_label(batch):
    chosen = []
    for lbls in batch["labels"]:
        if lbls:
            chosen.append(lbls[0])
        else:
            chosen.append(neutral_id)
    batch["labels"] = chosen
    return batch

train_ds = train_raw.map(to_single_label, batched=True)
val_ds   = val_raw.map(to_single_label, batched=True)
test_ds  = test_raw.map(to_single_label, batched=True)

# Tokenize
def tokenize(batch):
    return tokenizer(batch["text"], truncation=True, max_length=128)

train_ds = train_ds.map(tokenize, batched=True)
val_ds   = val_ds.map(tokenize, batched=True)
test_ds  = test_ds.map(tokenize, batched=True)

# Dynamic padding & formatting
data_collator = DataCollatorWithPadding(tokenizer)
cols = ["input_ids","attention_mask","labels"]
train_ds.set_format("torch", columns=cols)
val_ds.set_format("torch", columns=cols)
test_ds.set_format("torch", columns=cols)


Map:   0%|          | 0/15000 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Map:   0%|          | 0/15000 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Map:   0%|          | 0/15000 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [18]:
_tmp = AutoModelForSequenceClassification.from_pretrained(base_model, num_labels=num_labels)
num_layers = _tmp.config.num_hidden_layers
token_dim  = _tmp.config.hidden_size
num_heads  = _tmp.config.num_attention_heads
del _tmp

# LoRA
lora_cfg = LoraConfig(
    r=4, lora_alpha=8, lora_dropout=0.05,
    bias="none", task_type="SEQ_CLS",
    target_modules=["q_lin","v_lin"]
)
model_lora = AutoModelForSequenceClassification.from_pretrained(base_model, num_labels=num_labels)
model_lora = get_peft_model(model_lora, lora_cfg)

# Prompt-Tuning
prompt_cfg = PromptTuningConfig(
    task_type="SEQ_CLS",
    num_virtual_tokens=8,
    prompt_tuning_init="RANDOM",
    num_layers=num_layers,
    token_dim=token_dim,
    num_attention_heads=num_heads
)
model_prompt = AutoModelForSequenceClassification.from_pretrained(base_model, num_labels=num_labels)
model_prompt = get_peft_model(model_prompt, prompt_cfg)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-strea

In [20]:
args_common = dict(
    eval_strategy="no",
    save_strategy="no",
    logging_strategy="no",
    num_train_epochs=2,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    fp16=True,
    report_to="none"
)

args_lora   = TrainingArguments(output_dir="./results_lora",   learning_rate=1e-4, **args_common)
args_prompt = TrainingArguments(output_dir="./results_prompt", learning_rate=1e-4, **args_common)

In [21]:
def compute_metrics(eval_pred):
    logits, y_true = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {
        "accuracy":  accuracy_score(y_true, preds),
        "precision": precision_score(y_true, preds, average="macro", zero_division=0),
        "recall":    recall_score(y_true, preds, average="macro", zero_division=0),
        "f1":        f1_score(y_true, preds, average="macro", zero_division=0),
    }


In [22]:
trainer_lora = Trainer(
    model=model_lora,
    args=args_lora,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer_prompt = Trainer(
    model=model_prompt,
    args=args_prompt,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)


  trainer_lora = Trainer(
  trainer_prompt = Trainer(


In [23]:
print("=== LoRA Training ===")
trainer_lora.train()
print("LoRA Test:", trainer_lora.evaluate(test_ds))

print("\n=== Prompt-Tuning Training ===")
trainer_prompt.train()
print("Prompt Test:", trainer_prompt.evaluate(test_ds))


=== LoRA Training ===




Step,Training Loss


LoRA Test: {'eval_loss': 1.7515729665756226, 'eval_accuracy': 0.4756666666666667, 'eval_precision': 0.3702622846220332, 'eval_recall': 0.23745084936251362, 'eval_f1': 0.23656524674774546, 'eval_runtime': 43.6423, 'eval_samples_per_second': 68.741, 'eval_steps_per_second': 1.077, 'epoch': 2.0}

=== Prompt-Tuning Training ===




Step,Training Loss


Prompt Test: {'eval_loss': 2.5352187156677246, 'eval_accuracy': 0.308, 'eval_precision': 0.05318608519202302, 'eval_recall': 0.043482993631751676, 'eval_f1': 0.02871693866627035, 'eval_runtime': 51.6111, 'eval_samples_per_second': 58.127, 'eval_steps_per_second': 0.911, 'epoch': 2.0}


In [28]:
# =======================
# 8b. Any-of Precision / Recall / F1 (multilabel via multi-hot)
# =======================
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score

def compute_anyof_prf(trainer, tokenized_ds, original_ds, num_labels):
    """
    Builds multilabel ground-truth (multi-hot) vs single-label predictions (one-hot),
    then computes multilabel precision/recall/F1.
    """
    # 1) Predictions: top-1 per sample
    out   = trainer.predict(tokenized_ds)
    preds = np.argmax(out.predictions, axis=-1)  # shape (N,)

    # 2) Ground truth: multi-hot matrix (N, C)
    gold_lists = original_ds["labels_multi"]     # list of lists (N,)
    N = len(gold_lists)
    y_true = np.zeros((N, num_labels), dtype=int)
    for i, labs in enumerate(gold_lists):
        if labs:
            y_true[i, labs] = 1

    # 3) Predictions: one-hot matrix (N, C)
    y_pred = np.zeros_like(y_true)
    y_pred[np.arange(N), preds] = 1

    # 4) Any-of accuracy (top-1 in gold) – same definition as before
    anyof_acc = (y_true[np.arange(N), preds] == 1).mean()

    # 5) Multilabel PR/F1
    prf = {
        "anyof_accuracy": anyof_acc,
        # micro: aggregates TP/FP/FN globally (good overall signal)
        "anyof_precision_micro": precision_score(y_true, y_pred, average="micro", zero_division=0),
        "anyof_recall_micro":    recall_score( y_true, y_pred, average="micro", zero_division=0),
        "anyof_f1_micro":        f1_score(     y_true, y_pred, average="micro", zero_division=0),

        # macro: average of per-class scores (treats each emotion equally)
        "anyof_precision_macro": precision_score(y_true, y_pred, average="macro", zero_division=0),
        "anyof_recall_macro":    recall_score( y_true, y_pred, average="macro", zero_division=0),
        "anyof_f1_macro":        f1_score(     y_true, y_pred, average="macro", zero_division=0),

        # samples: F1 computed per sample then averaged (useful when each sample has multiple labels)
        "anyof_f1_samples":      f1_score(     y_true, y_pred, average="samples", zero_division=0),
    }
    return prf

# ---- compute for both models
anyof_prf_lora   = compute_anyof_prf(trainer_lora,   test_ds, test_raw, num_labels=len(class_names))
anyof_prf_prompt = compute_anyof_prf(trainer_prompt, test_ds, test_raw, num_labels=len(class_names))

print("\n=== Any-of PRF (LoRA) ===")
for k, v in anyof_prf_lora.items():
    print(f"{k}: {v:.4f}")

print("\n=== Any-of PRF (Prompt-Tuning) ===")
for k, v in anyof_prf_prompt.items():
    print(f"{k}: {v:.4f}")








=== Any-of PRF (LoRA) ===
anyof_accuracy: 0.5250
anyof_precision_micro: 0.5250
anyof_recall_micro: 0.4504
anyof_f1_micro: 0.4848
anyof_precision_macro: 0.4190
anyof_recall_macro: 0.2259
anyof_f1_macro: 0.2420
anyof_f1_samples: 0.4935

=== Any-of PRF (Prompt-Tuning) ===
anyof_accuracy: 0.3420
anyof_precision_micro: 0.3420
anyof_recall_micro: 0.2934
anyof_f1_micro: 0.3158
anyof_precision_macro: 0.0574
anyof_recall_macro: 0.0433
anyof_f1_macro: 0.0298
anyof_f1_samples: 0.3267


In [25]:
def analyze_text(text, model_choice="lora"):
    if isinstance(text, str):
        text = [text]
    model = model_lora if model_choice.lower() == "lora" else model_prompt
    model.eval()
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128).to(model.device)
    with torch.no_grad():
        logits = model(**inputs).logits
        preds = torch.argmax(logits, dim=-1).cpu().numpy()
    return [{"text": t, "predicted_label": class_names[int(p)]} for t, p in zip(text, preds)]

examples = [
    "I finally got the job offer I’ve been waiting for!",
    "I’m so thankful that everything turned out okay in the end.",
    "I studied so hard, but I still didn’t pass the exam.",
    "I can’t believe the delivery was delayed again for no reason!",
    "Wow, I didn’t expect to see you here after all these years!",
    "I’m really nervous about tomorrow’s presentation; my hands are shaking.",
    "That meme literally made me laugh out loud!",
    "The food tasted awful and the smell made me feel sick.",
    "I might just stay home today and watch some shows.",
    "Her dedication and hard work truly inspire everyone around her."
]

print("\nLoRA predictions:")
for r in analyze_text(examples, "lora"):
    print(f"• {r['text']} → {r['predicted_label']}")

print("\nPrompt-tuned predictions:")
for r in analyze_text(examples, "prompt"):
    print(f"• {r['text']} → {r['predicted_label']}")



LoRA predictions:
• I finally got the job offer I’ve been waiting for! → excitement
• I’m so thankful that everything turned out okay in the end. → gratitude
• I studied so hard, but I still didn’t pass the exam. → disappointment
• I can’t believe the delivery was delayed again for no reason! → surprise
• Wow, I didn’t expect to see you here after all these years! → surprise
• I’m really nervous about tomorrow’s presentation; my hands are shaking. → disappointment
• That meme literally made me laugh out loud! → neutral
• The food tasted awful and the smell made me feel sick. → sadness
• I might just stay home today and watch some shows. → neutral
• Her dedication and hard work truly inspire everyone around her. → admiration

Prompt-tuned predictions:
• I finally got the job offer I’ve been waiting for! → neutral
• I’m so thankful that everything turned out okay in the end. → neutral
• I studied so hard, but I still didn’t pass the exam. → neutral
• I can’t believe the delivery was del