<a href="https://colab.research.google.com/github/Sergio-ddf/emit-llm/blob/main/emit_finetuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# EMit Emotion Detection Task

# 1. Setup e import


In [None]:
# Weights & Biases authentication.
# On Colab the API key is read from the notebook secret named 'WANDB_KEY' and
# exported as WANDB_API_KEY. Outside Colab the google.colab module is missing,
# so the export is skipped and wandb.login() is left to find a key on its own
# (presumably an existing WANDB_API_KEY variable or its prompt).
import os, wandb

try:
    from google.colab import userdata
except ImportError:  # not running on Colab
    userdata = None

if userdata is not None:
    os.environ["WANDB_API_KEY"] = userdata.get('WANDB_KEY')
wandb.login()

In [None]:
# installazione pacchetti e configurazione ambiente
!pip install -q -U "transformers>=4.39.0" datasets scikit-multilearn emoji wandb
!pip install -q iterative-stratification

In [None]:
# Cell 2 : import e seme di riproducibilità
import os, re, random, numpy as np, pandas as pd, emoji, torch

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, classification_report
from sklearn.multiclass import OneVsRestClassifier

from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit

from datasets import Dataset, DatasetDict, Sequence, Value
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                          TrainingArguments, Trainer)

# Single seed shared by Python's `random`, NumPy and PyTorch.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Ask cuDNN for deterministic kernels and switch off its auto-tuner.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False


print(f"PyTorch    : {torch.__version__} (CUDA disponibile: {torch.cuda.is_available()})")

# 2. Configurazione e percorsi


In [None]:
# Folder that contains the EMit CSV files (empty string = current directory).
DATA_DIR = ""
# Hugging Face checkpoint of the pretrained encoder.
MODEL_NAME = "Musixmatch/umberto-commoncrawl-cased-v1"
# Prefer the GPU when one is visible to PyTorch.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")
# Label columns, in the order used for every label matrix in the notebook.
LABELS = [
    'Anger',
    'Anticipation',
    'Disgust',
    'Fear',
    'Joy',
    'Love',
    'Neutral',
    'Sadness',
    'Surprise',
    'Trust',
]
NUM_LABELS = len(LABELS)


# 3. Caricamento dati e statistiche


In [None]:
# Read the training file and the in-domain test file from DATA_DIR.
train_df, test_df = (
    pd.read_csv(os.path.join(DATA_DIR, file_name))
    for file_name in ("emit_train_A.csv", "emit_test.csv")
)

print("Train A:", train_df.shape, "Test in-domain:", test_df.shape)
display(train_df.head())

# Label distribution: column sums over the label columns, largest first.
label_totals = train_df[LABELS].sum()
counts = label_totals.sort_values(ascending=False)
display(counts.to_frame("etichette"))


# 4. Pulizia testo


In [None]:
# Cell 5: placeholder tokens and the text normaliser
URL, USER, TAG = "<URL>", "<USER>", "<HASHTAG>"

def clean(text: str) -> str:
    """Normalise one social-media post.

    Applied in order: http(s) links become ``<URL>``, ``@name`` mentions become
    ``<USER>``, ``#word`` becomes ``<HASHTAG> word``, and the result is passed
    through ``emoji.demojize(..., language='it')`` so emoji turn into text
    aliases. Leading and trailing whitespace is stripped from the result.
    """
    # (pattern, replacement) pairs; order matters because each step sees the
    # output of the previous one.
    substitutions = (
        (r'https?://\S+', URL),
        (r'@\w+', USER),
        (r'#(\w+)', TAG + r' \1'),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return emoji.demojize(text, language='it').strip()

# Add a normalised copy of the raw text to both frames; values are cast to
# str first so that `clean` always receives a string.
for frame in (train_df, test_df):
    frame["text_clean"] = frame["text"].astype(str).map(clean)


# 5. Split stratificato 90/10


In [None]:
# Cell 6: multi-label stratified 90/10 split of the training frame
X = train_df["text_clean"].to_numpy()
Y = train_df[LABELS].to_numpy()

# Only the first split produced by the splitter is used.
msss = MultilabelStratifiedShuffleSplit(test_size=0.1, random_state=SEED)
split_iterator = msss.split(X, Y)
train_idx, val_idx = next(split_iterator)

# Materialise both partitions with a fresh 0..n-1 index.
train_split, val_split = (
    train_df.iloc[row_ids].reset_index(drop=True)
    for row_ids in (train_idx, val_idx)
)

print(f"→ Train {train_split.shape}  Valid {val_split.shape}")


# 6. Baseline TF-IDF + Logistic Regression


In [None]:
# Baseline: uni+bigram TF-IDF features (vocabulary fitted on the training
# split only) with one logistic regression per label.
vec = TfidfVectorizer(ngram_range=(1,2), max_features=5000)
X_tr = vec.fit_transform(train_split['text_clean'])
X_va = vec.transform(val_split['text_clean'])

y_tr = train_split[LABELS].values
y_va = val_split[LABELS].values

# Use the notebook-wide SEED rather than a second hard-coded literal.
clf = OneVsRestClassifier(LogisticRegression(max_iter=1000, random_state=SEED))
clf.fit(X_tr, y_tr)
y_pr = clf.predict(X_va)

# zero_division=0 on both calls: labels the baseline never predicts score 0
# without raising an undefined-metric warning.
print("TF-IDF+LR macro-F1:", f1_score(y_va, y_pr, average='macro', zero_division=0))
print(classification_report(y_va, y_pr, target_names=LABELS, zero_division=0))


# 7. Preparazione HF Dataset


In [None]:
# Cell 7 (updated) — tokenizer and float label lists
# Take the checkpoint from the configuration cell so that editing MODEL_NAME
# there changes both the tokenizer loaded here and the model loaded later
# from `model_ckpt`.
model_ckpt = MODEL_NAME
tokenizer  = AutoTokenizer.from_pretrained(model_ckpt, use_fast=True)

# Add a "labels" column holding, for each row, the label indicators as a list
# of Python floats in LABELS order.
for df in (train_split, val_split, test_df):
    df["labels"] = df[LABELS].apply(lambda row: [float(x) for x in row], axis=1)

def tokenize(batch, max_length: int = 128):
    """Tokenize a batch of cleaned texts and carry the labels through.

    Args:
        batch: mapping with a "text_clean" list of strings and a "labels"
            list (already lists of floats).
        max_length: sequence length every example is padded or truncated to.
            Defaults to 128, the value previously hard-coded here.

    Returns:
        The tokenizer output with the batch's "labels" attached.
    """
    tokens = tokenizer(
        batch["text_clean"],
        padding="max_length",
        truncation=True,
        max_length=max_length
    )
    tokens["labels"] = batch["labels"]
    return tokens

def to_ds(df):
    """Turn a frame with "text_clean" and "labels" columns into a tokenized Dataset.

    The "labels" column is cast to a sequence of float32 before `tokenize` is
    mapped over the dataset in batches.
    """
    float_labels = Sequence(feature=Value("float32"))
    dataset = Dataset.from_pandas(df[["text_clean", "labels"]])
    dataset = dataset.cast_column("labels", float_labels)
    return dataset.map(tokenize, batched=True)

# Tokenize the three partitions in the same order as before.
hf_train, hf_val, hf_test = (
    to_ds(frame) for frame in (train_split, val_split, test_df)
)

# Bundle them and drop the raw text column, which is not needed after tokenization.
data = DatasetDict(
    {"train": hf_train, "validation": hf_val, "test": hf_test}
).remove_columns(["text_clean"])


# 8. Fine-tuning UmBERTo con BCE pesata


In [None]:
# Cell 8 – model, prior-based bias initialisation and pos_weight
# (the head is addressed as model.classifier.out_proj, as in the original cell).
# torch and AutoModelForSequenceClassification come from the import cell at the top.

# 1) Load the checkpoint with a multi-label classification head.
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt,
    num_labels=NUM_LABELS,
    problem_type="multi_label_classification"
)

# 2) Initial bias = log(pi / (1 - pi)), where pi is each label's positive rate
#    in the training split. pi is clamped away from 0 and 1 so a label with no
#    (or only) positives yields a large finite bias instead of -inf/+inf.
label_sums = train_split[LABELS].sum()
pi = label_sums / len(train_split)
pi_tensor = torch.tensor(pi.values, dtype=torch.float32)
PRIOR_EPS = 1e-6
pi_clamped = pi_tensor.clamp(min=PRIOR_EPS, max=1.0 - PRIOR_EPS)
bias_init = torch.log(pi_clamped / (1.0 - pi_clamped))

# 3) Copy the values into the existing bias parameter (keeps its device and
#    dtype) rather than replacing the parameter's .data tensor.
with torch.no_grad():
    model.classifier.out_proj.bias.copy_(bias_init)

# 4) pos_weight = negatives / positives per label. The positive count is
#    floored at 1 so a label without positives cannot produce an infinite weight.
neg_counts = len(train_split) - label_sums
pos_weight_np = (neg_counts / label_sums.clip(lower=1)).values
pos_weight_tensor = torch.tensor(pos_weight_np, dtype=torch.float32, device=model.device)


In [None]:
# Cell 9 – hyperparameters, warmup and learning-rate schedule
# TrainingArguments is already imported in the import cell at the top.
args = TrainingArguments(
    # where checkpoints go and where metrics are reported
    output_dir                  = "./ubert-emotions",
    report_to                   = ["wandb"],
    seed                        = SEED,
    # optimisation
    num_train_epochs            = 5,
    per_device_train_batch_size = 16,
    per_device_eval_batch_size  = 32,
    learning_rate               = 2e-5,
    weight_decay                = 0.01,
    warmup_ratio                = 0.1,        # warm up over 10% of the steps
    lr_scheduler_type           = "cosine",   # cosine decay
    # logging every 100 steps; evaluate and checkpoint once per epoch
    logging_strategy            = "steps",
    logging_steps               = 100,
    eval_strategy               = "epoch",
    save_strategy               = "epoch",
    # reload the checkpoint with the best "eval_f1" when training ends
    metric_for_best_model       = "eval_f1",
    load_best_model_at_end      = True,
)


In [None]:
# Cell 10 – metrica + CustomTrainer con pos_weight (fix device)

import torch.nn as nn
from transformers import Trainer
from sklearn.metrics import f1_score

# 1) Evaluation metric: macro-F1 at a fixed 0.5 probability cut-off
def compute_metrics(pred):
    """Compute macro-F1 from an evaluation prediction.

    Args:
        pred: a pair ``(logits, labels)``; logits are passed through a sigmoid
            and a label is predicted when its probability exceeds 0.5.

    Returns:
        ``{"f1": macro_f1}``. Labels with no predicted and no true positives
        count as 0 (``zero_division=0``) without emitting a warning.
    """
    logits, labels = pred
    probs  = torch.sigmoid(torch.tensor(logits))
    y_pred = (probs > 0.5).int().cpu().numpy()
    y_true = labels
    return {"f1": f1_score(y_true, y_pred, average="macro", zero_division=0)}

# 2) CustomTrainer – BCE-with-logits loss weighted per label by pos_weight_tensor
class CustomTrainer(Trainer):
    """Trainer that replaces the default loss with a pos_weight-ed BCEWithLogitsLoss."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Run the forward pass and return the weighted multi-label loss.

        Args:
            model: the model (possibly wrapped) to call on the inputs.
            inputs: batch dict; its "labels" entry is popped before the forward
                pass, so the model itself does not compute a loss.
            return_outputs: when true, return ``(loss, outputs)``.
            **kwargs: extra arguments passed by the training loop; unused here.

        Returns:
            The scalar loss, or ``(loss, outputs)`` if ``return_outputs`` is set.
        """
        labels = inputs.pop("labels").float()
        # Forward pass without labels
        outputs = model(**inputs)
        logits = outputs.logits
        # Align labels and pos_weight with the logits' device. This avoids
        # reading `model.device`, which a wrapper such as nn.DataParallel
        # does not expose.
        labels = labels.to(logits.device)
        pw = pos_weight_tensor.to(logits.device)
        loss_fct = nn.BCEWithLogitsLoss(pos_weight=pw)
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss

# 3) Build the trainer and run fine-tuning.
# Newer transformers releases take the tokenizer through `processing_class`
# and deprecate the older `tokenizer` keyword; pass whichever one the
# installed Trainer accepts so the cell works on both.
import inspect

_trainer_params = inspect.signature(Trainer.__init__).parameters
_tokenizer_kwarg = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

trainer = CustomTrainer(
    model           = model,
    args            = args,
    train_dataset   = data["train"],
    eval_dataset    = data["validation"],
    compute_metrics = compute_metrics,
    **{_tokenizer_kwarg: tokenizer},
)

trainer.train()


In [None]:
# Cell 11 – per-label threshold calibration on the validation split
# 1) Predicted probabilities on validation
val_logits = trainer.predict(data["validation"]).predictions
val_probs  = torch.sigmoid(torch.tensor(val_logits)).cpu().numpy()

# 2) Ground-truth labels taken from the val_split frame
y_val_true = val_split[LABELS].values

# 3) Grid search of the F1-maximising threshold for each label.
#    linspace gives exactly 80 candidates (0.10, 0.11, ..., 0.89); arange with
#    a float step has a length that depends on floating-point rounding.
threshold_grid = np.linspace(0.10, 0.89, 80)

best_thresholds = {}
for i, label in enumerate(LABELS):
    # 0.5 is kept when no candidate achieves an F1 above zero.
    best_f1, best_thr = 0.0, 0.5
    for thr in threshold_grid:
        y_pred_i = (val_probs[:, i] > thr).astype(int)
        # zero_division=0: a threshold that predicts no positives scores 0
        # without raising a warning on every grid point.
        f1 = f1_score(y_val_true[:, i], y_pred_i, zero_division=0)
        if f1 > best_f1:
            best_f1, best_thr = f1, float(thr)
    best_thresholds[label] = best_thr

print("Soglie ottimali per classe:")
for lbl, thr in best_thresholds.items():
    print(f"  {lbl:12s}: {thr:.2f}")


In [None]:
# Cell 12 – inference on the test set with the calibrated thresholds
def _row_to_label_string(row):
    """Join the names of the active labels; fall back to "Neutral" when none is active."""
    active = [lbl for lbl, flag in zip(LABELS, row) if flag]
    return " ".join(active) if active else "Neutral"

# 1) Probabilities on the test set
test_logits = trainer.predict(data["test"]).predictions
test_probs  = torch.sigmoid(torch.tensor(test_logits)).cpu().numpy()

# 2) One threshold per label, in LABELS order, broadcast across the rows
threshold_list = np.array([best_thresholds[l] for l in LABELS])
binary_preds   = (test_probs > threshold_list).astype(int)

# 3) Submission frame: the test ids plus a space-separated label string per row
label_strings = [_row_to_label_string(row) for row in binary_preds]
submission = pd.DataFrame({"id": test_df["id"], "labels": label_strings})

submission.to_csv("emit_submission_calibrated.csv", index=False)
print("CSV salvato in emit_submission_calibrated.csv")


In [None]:
# Cell 13 – evaluate the model from the submission CSV

# Paths. The labelled in-domain test CSV lives in DATA_DIR, the same place it
# was loaded from at the start of the notebook; the submission file was
# written to the working directory.
SUBM_PATH = "emit_submission_calibrated.csv"
TRUE_TEST = os.path.join(DATA_DIR, "emit_test.csv")

# 1) Load the submission and the ground truth
subm_df = pd.read_csv(SUBM_PATH)
true_df = pd.read_csv(TRUE_TEST)
assert len(subm_df) == len(true_df), "submission and ground truth have different row counts"

# 2) y_true as a binary matrix [n_examples × len(LABELS)], read directly from
#    the LABELS columns of true_df
y_true = true_df[LABELS].values.astype(int)

#    y_pred: built from subm_df["labels"] (strings such as "Joy Love" or "Neutral")
def parse_labels(label_str, labels=None):
    """Convert a space-separated label string into a 0/1 indicator vector.

    Args:
        label_str: e.g. "Joy Love" or "Neutral". Anything that is not a string
            (such as the NaN pandas produces for an empty CSV cell) is treated
            as "no labels".
        labels: ordered label names defining the vector positions; defaults to
            the notebook-level LABELS.

    Returns:
        Integer NumPy array with one entry per label, 1 where the label name
        appears as a whitespace-separated token in ``label_str``. Tokens that
        are not known labels are ignored.
    """
    names = LABELS if labels is None else labels
    tokens = set(label_str.split()) if isinstance(label_str, str) else set()
    return np.array([1 if name in tokens else 0 for name in names], dtype=int)

# Stack the per-row indicator vectors into a matrix [n_examples × len(LABELS)].
y_pred = np.vstack(subm_df["labels"].map(parse_labels).tolist())

# 3) Metrics. zero_division=0 matches the baseline report earlier in the
#    notebook: labels that are never predicted score 0 without a warning.
print("Macro-F1 generale:", f1_score(y_true, y_pred, average="macro", zero_division=0))
print("\nClassification Report (per classe):\n")
print(classification_report(y_true, y_pred, target_names=LABELS, digits=4, zero_division=0))
