## Package installation, imports, and W&B setup

In [None]:
!pip -q install -U "transformers>=4.40.0" datasets wordcloud

import os
import random
import re
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import matplotlib.pyplot as plt

from datasets import Dataset, DatasetDict
from torch.utils.data import DataLoader, WeightedRandomSampler
from sklearn.metrics import (
    precision_recall_fscore_support,
    accuracy_score,
    classification_report,
    confusion_matrix,
)
from wordcloud import WordCloud, STOPWORDS
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
    set_seed,
    TrainerCallback,
)

# Global seed: seed every RNG source touched by this notebook
# (Python `random`, NumPy, PyTorch CPU/CUDA, and the transformers helper).
SEED = 42
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up; setting it
# here presumably only affects subprocesses spawned later — confirm.
os.environ["PYTHONHASHSEED"] = str(SEED)
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
set_seed(SEED)

# Weights & Biases toggle; read by the TrainingArguments cells below
# (`report_to` / `run_name`).
WANDB_ON = False
print(f"W&B active: {WANDB_ON}")

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.2/42.2 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.6/11.6 MB[0m [31m44.3 MB/s[0m eta [36m0:00:00[0m
[?25hW&B active: False


# Training on general data

## Paths, hyperparams and special tokens

In [None]:
# Root folder that holds the CSV splits.
DATA_DIR = "/content"

# General-domain emotion dataset: train / validation / test files.
GEN_TRAIN = os.path.join(DATA_DIR, "train-emotion-gen.csv")
GEN_VALID = os.path.join(DATA_DIR, "valid-emotion-gen.csv")
GEN_TEST  = os.path.join(DATA_DIR, "test-emotion-gen.csv")

# Output directory for checkpoints and the final general-domain model.
GEN_MODEL_DIR = "/content/models/alberto_finetuned"
os.makedirs(GEN_MODEL_DIR, exist_ok=True)

# Pretrained checkpoint used as the starting point for fine-tuning.
BASE_MODEL_ID = "m-polignano-uniba/bert_uncased_L-12_H-768_A-12_italian_alb3rt0"
# Placeholder tokens produced by `normalize_social`; they are added to the
# tokenizer vocabulary in the "Tokenizer and model" cell.
SPECIAL_TOKENS = ["<url>", "<user>", "<hashtag>", "<exclamation>", "<question>", "<interrobang>"]


## CSV reading and optional social normalization

In [None]:
def read_csv_any(path: str) -> pd.DataFrame:
    """Read a CSV/TSV file and return a frame with exactly ``text`` and ``label``.

    The separator is auto-detected (``sep=None`` with the Python engine). Column
    lookup is case-insensitive and the label column may be called ``label`` or
    ``labels``. Rows whose text or label is missing are dropped, so that casting
    to ``str`` cannot create literal ``"nan"`` samples or a spurious ``"nan"`` class.

    Args:
        path: Location of the delimited text file.

    Returns:
        DataFrame with string columns ``text`` and ``label``.

    Raises:
        ValueError: If the file has no text column or no label column.
    """
    df = pd.read_csv(path, sep=None, engine="python")
    cols = {str(c).lower(): c for c in df.columns}
    text_col = cols.get("text")
    label_col = cols.get("label") or cols.get("labels")
    # Explicit raise instead of `assert`: asserts are stripped under `python -O`.
    if not (text_col and label_col):
        raise ValueError(
            f"Columns 'text' and 'label' required in {path}. Found: {df.columns.tolist()}"
        )
    df = df[[text_col, label_col]].rename(columns={text_col: "text", label_col: "label"})
    # Drop missing cells BEFORE the string cast; afterwards NaN would be the text "nan".
    df = df.dropna(subset=["text", "label"])
    return df.assign(text=df["text"].astype(str), label=df["label"].astype(str))

# Compiled patterns for the lightweight social-media normalization below.
_URL_RE    = re.compile(r"https?://\S+|www\.\S+", flags=re.IGNORECASE)
_USER_RE   = re.compile(r"@(\w+)")
_HASH_RE   = re.compile(r"#(\w+)")
_INTERO_RE = re.compile(r"(\?\!+|\!\?+)")
_EXCL_RE   = re.compile(r"!{2,}")
_QUEST_RE  = re.compile(r"\?{2,}")
_WS_RE     = re.compile(r"\s+")

def normalize_social(text: str) -> str:
    """Replace social-media artefacts in ``text`` with placeholder tokens.

    URLs become ``<url>``; ``@name`` / ``#tag`` become ``<user> name`` /
    ``<hashtag> tag``; ``?!``-style runs become ``<interrobang>``; runs of two or
    more ``!`` or ``?`` become ``<exclamation>`` / ``<question>``. Whitespace is
    then collapsed and trimmed. Non-string input yields an empty string.
    """
    if not isinstance(text, str):
        return ""

    # Order matters: interrobangs must be consumed before the plain "!!"/"??" runs.
    substitutions = (
        (_URL_RE, " <url> "),
        (_USER_RE, lambda m: f" <user> {m.group(1)}"),
        (_HASH_RE, lambda m: f" <hashtag> {m.group(1)}"),
        (_INTERO_RE, " <interrobang> "),
        (_EXCL_RE, " <exclamation> "),
        (_QUEST_RE, " <question> "),
    )
    normalized = text
    for pattern, replacement in substitutions:
        normalized = pattern.sub(replacement, normalized)
    return _WS_RE.sub(" ", normalized).strip()

## Data loading and label mapping

In [None]:
# Load the three general-domain splits as text/label frames.
df_train = read_csv_any(GEN_TRAIN)
df_valid = read_csv_any(GEN_VALID)
df_test  = read_csv_any(GEN_TEST)

# Print size and per-class distribution (count + percentage) of each split.
for name, df in [("TRAIN", df_train), ("VALID", df_valid), ("TEST", df_test)]:
    print(f"{name:<5} → {len(df):,} samples, classes: {df['label'].nunique()}")
    dist = df["label"].value_counts().rename("count").to_frame()
    dist["percent"] = (dist["count"] / len(df) * 100).round(2)
    print(dist.sort_index(), "\n")

# Label <-> id maps are built from the TRAIN labels only, in alphabetical order,
# so ids are stable across runs.
labels_sorted = sorted(df_train["label"].unique().tolist())
label2id = {l: i for i, l in enumerate(labels_sorted)}
id2label = {i: l for l, i in label2id.items()}
num_labels = len(label2id)

# Valid/Test must not introduce unseen classes
missing_valid = set(df_valid["label"].unique()) - set(label2id.keys())
missing_test  = set(df_test["label"].unique())  - set(label2id.keys())
assert not missing_valid and not missing_test, f"Unknown classes: valid={missing_valid}, test={missing_test}"
print("Label2id:", label2id)


TRAIN → 5,461 samples, classes: 9
              count  percent
label                       
Anger           833    15.25
Anticipation    287     5.26
Fear            324     5.93
Joy             494     9.05
Love            173     3.17
Neutral        1525    27.93
Sadness         537     9.83
Surprise        440     8.06
Trust           848    15.53 

VALID → 1,347 samples, classes: 9
              count  percent
label                       
Anger           214    15.89
Anticipation     75     5.57
Fear             65     4.83
Joy             120     8.91
Love             52     3.86
Neutral         371    27.54
Sadness         128     9.50
Surprise        106     7.87
Trust           216    16.04 

TEST  → 781 samples, classes: 9
              count  percent
label                       
Anger           142    18.18
Anticipation     10     1.28
Fear             92    11.78
Joy              95    12.16
Love             29     3.71
Neutral         214    27.40
Sadness         113    14.

## Tokenizer and model

In [None]:
# Load the fast tokenizer and register the normalization placeholders as special
# tokens so they are not split into sub-words.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID, use_fast=True)
tokenizer.add_special_tokens({"additional_special_tokens": SPECIAL_TOKENS})

# Sequence-classification model with a fresh head sized to the training labels.
model = AutoModelForSequenceClassification.from_pretrained(
    BASE_MODEL_ID,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)
# The vocabulary grew by the special tokens above, so the embedding matrix must grow too.
model.resize_token_embeddings(len(tokenizer))

# Freeze embeddings + first encoder layer (warm start)
# NOTE(review): the embedding rows of the newly added special tokens are frozen
# too; confirm that is intended.
# NOTE(review): parameters frozen before the Trainer builds its optimizer may be
# left out of it; confirm that later unfreezing actually trains them.
for p in model.parameters():
    p.requires_grad = True
for p in model.bert.embeddings.parameters():
    p.requires_grad = False
for p in model.bert.encoder.layer[0].parameters():
    p.requires_grad = False

class UnfreezeAfter2Epochs(TrainerCallback):
    """Keep embeddings and encoder layer 0 frozen for two epochs, then train them.

    Flipping ``requires_grad`` back to ``True`` mid-training is not enough on its
    own: ``Trainer.create_optimizer`` only registers parameters that require
    gradients at the moment the optimizer is built, so parameters frozen before
    ``trainer.train()`` would never receive updates. To make the unfreeze
    effective this callback:

    * ``on_init_end``   – re-enables gradients so the optimizer (created later,
      inside ``train()``) includes these parameters;
    * ``on_train_begin`` – freezes them again (this event fires after the
      optimizer has been created), so they get no gradient and are skipped by
      the optimizer during the warm-up epochs;
    * ``on_epoch_end``  – unfreezes them once ``state.epoch >= 2``.

    The callback must be passed through ``Trainer(callbacks=[...])`` so that it
    receives ``on_init_end``. The model is expected to expose ``.bert``.
    """

    def __init__(self):
        # Explicit flag instead of probing with hasattr().
        self._done = False

    @staticmethod
    def _set_trainable(model, trainable: bool) -> None:
        """Set ``requires_grad`` on the embeddings and the first encoder layer."""
        for p in model.bert.embeddings.parameters():
            p.requires_grad = trainable
        for p in model.bert.encoder.layer[0].parameters():
            p.requires_grad = trainable

    def on_init_end(self, args, state, control, **kwargs):
        # Make the warm-start parameters visible to the optimizer that
        # Trainer will build at the start of train().
        self._set_trainable(kwargs["model"], True)

    def on_train_begin(self, args, state, control, **kwargs):
        # Optimizer already exists here; freezing now only suppresses gradients.
        self._done = False
        self._set_trainable(kwargs["model"], False)

    def on_epoch_end(self, args, state, control, **kwargs):
        if self._done or not state.epoch or state.epoch < 2:
            return
        self._set_trainable(kwargs["model"], True)
        self._done = True
        print(">> Unfreeze: embeddings + encoder layer 0")

# When True, texts are passed through `normalize_social` before tokenization.
SOCIAL_NORMALIZATION = True


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/740M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at m-polignano-uniba/bert_uncased_L-12_H-768_A-12_italian_alb3rt0 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


## HF datasets, tokenization and collator

In [None]:
def preprocess(examples):
    """Tokenize a batch of examples, normalizing social artefacts first if enabled.

    Sequences are truncated to at most 256 tokens (or the tokenizer limit if
    smaller); padding is left to the data collator.
    """
    batch_texts = examples["text"]
    if SOCIAL_NORMALIZATION:
        batch_texts = list(map(normalize_social, batch_texts))
    max_len = min(256, tokenizer.model_max_length)
    return tokenizer(batch_texts, truncation=True, max_length=max_len)

def df_to_ds(df: pd.DataFrame) -> Dataset:
    """Convert a text/label frame into a HF ``Dataset`` with integer ``labels``.

    Label names are mapped through the module-level ``label2id``; the input
    frame is left untouched.
    """
    encoded = pd.DataFrame(
        {
            "text": df["text"],
            "labels": df["label"].map(label2id).astype(int),
        }
    )
    return Dataset.from_pandas(encoded, preserve_index=False)

# Wrap the three splits into a single DatasetDict.
raw_ds = DatasetDict({
    "train": df_to_ds(df_train),
    "validation": df_to_ds(df_valid),
    "test": df_to_ds(df_test),
})
# Batched tokenization; the original "text" column is kept alongside the
# tokenizer outputs because no `remove_columns` is given.
tokenized_ds = raw_ds.map(preprocess, batched=True, desc="Tokenizing")
# Collator built from the tokenizer; handles per-batch padding.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Tokenizing:   0%|          | 0/5461 [00:00<?, ? examples/s]

Tokenizing:   0%|          | 0/1347 [00:00<?, ? examples/s]

Tokenizing:   0%|          | 0/781 [00:00<?, ? examples/s]

## Metrics, dynamic oversampling and Trainer

In [None]:
def compute_metrics(eval_pred):
    """Return macro-F1 and accuracy for a ``(logits, labels)`` evaluation pair."""
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=-1)
    # Classes never predicted contribute F1 = 0 instead of raising a warning.
    _, _, macro_f1, _ = precision_recall_fscore_support(
        labels, predicted, average="macro", zero_division=0
    )
    return {"macro_f1": macro_f1, "accuracy": accuracy_score(labels, predicted)}

def make_sample_weights(ds: Dataset, n_classes: int, mult: float = 1.0) -> np.ndarray:
    """Compute one sampling weight per example from inverse class frequency.

    Class weights are ``1 / count`` (empty classes counted as 1), rescaled so
    they sum to ``n_classes``; each example receives its class weight raised to
    ``mult``.
    """
    labels = np.array(ds["labels"])
    freq = np.bincount(labels, minlength=n_classes).astype(float)
    # Guard against division by zero for classes absent from the dataset.
    freq = np.where(freq == 0.0, 1.0, freq)
    inverse = 1.0 / freq
    per_class = inverse / inverse.sum() * n_classes
    return per_class[labels] ** mult

class DynamicResampleCallback(TrainerCallback):
    """Update WeightedRandomSampler each epoch to mitigate imbalance.

    ``schedule`` is a list of ``(start_epoch, mult)`` pairs. At the start of
    every epoch the ``mult`` of the last pair whose ``start_epoch`` is <= the
    current epoch is used as the exponent in ``make_sample_weights``.

    Requires ``model.trainer`` to be set (see ``attach_trainer_ref``) and reads
    the module-level ``num_labels``.

    NOTE(review): the new loader is stored on the private attribute
    ``trainer._train_dataloader``. Confirm that the installed ``Trainer``
    actually reads this attribute during training; if it does not, the
    resampling has no effect.
    NOTE(review): the loader is built from ``trainer.train_dataset`` as-is, which
    in this notebook still carries the raw ``text`` column — confirm the
    collator can handle that if the loader is ever iterated.
    """
    def __init__(self, schedule):
        # Sort by start epoch so `_mult_for_epoch` can scan in order.
        self.schedule = sorted(schedule, key=lambda x: x[0])

    def _mult_for_epoch(self, epoch: int) -> float:
        """Return the multiplier of the latest schedule entry that has started."""
        m = self.schedule[0][1]
        for ep, mult in self.schedule:
            if epoch >= ep:
                m = mult
        return m

    def on_epoch_begin(self, args, state, control, **kwargs):
        # The trainer is reached through the back-reference stored on the model.
        trainer: Trainer = kwargs["model"].trainer
        epoch = int(state.epoch or 0)
        mult = self._mult_for_epoch(epoch)
        ds = trainer.train_dataset
        weights = make_sample_weights(ds, n_classes=num_labels, mult=mult)
        # Sample with replacement, one draw per training example.
        sampler = WeightedRandomSampler(
            weights=torch.tensor(weights, dtype=torch.double),
            num_samples=len(weights),
            replacement=True
        )
        trainer._train_dataloader = DataLoader(
            ds,
            batch_size=args.train_batch_size,
            sampler=sampler,
            collate_fn=trainer.data_collator
        )

def attach_trainer_ref(trainer: Trainer):
    """Store a back-reference to ``trainer`` on its own model and return the trainer.

    Callbacks only receive the model, so this lets them reach the trainer via
    ``kwargs["model"].trainer``.
    """
    owned_model = trainer.model
    setattr(owned_model, "trainer", trainer)
    return trainer

# (start_epoch, exponent) pairs consumed by DynamicResampleCallback.
resample_schedule = [(0, 1.1), (2, 1.0)]
# Mixed precision: use bf16 when the GPU supports it, otherwise fp16 on any CUDA
# device; both are False on CPU.
bf16_ok = torch.cuda.is_available() and torch.cuda.is_bf16_supported()
fp16_ok = torch.cuda.is_available() and not bf16_ok

In [None]:
# Training configuration for the general-domain fine-tuning run.
# Evaluation and checkpointing happen once per epoch; the checkpoint with the
# highest validation macro-F1 is restored at the end.
args = TrainingArguments(
    output_dir=GEN_MODEL_DIR,
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    learning_rate=2.5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,  # effective batch of 16 per device
    num_train_epochs=8,
    warmup_ratio=0.06,
    lr_scheduler_type="cosine",
    weight_decay=0.01,
    label_smoothing_factor=0.03,
    max_grad_norm=1.0,
    load_best_model_at_end=True,
    metric_for_best_model="macro_f1",
    greater_is_better=True,
    bf16=bf16_ok,
    fp16=fp16_ok,
    dataloader_num_workers=2,
    dataloader_pin_memory=True,
    logging_steps=25,
    logging_first_step=True,
    seed=SEED,
    optim="adamw_torch",
    report_to=["wandb"] if WANDB_ON else "none",
    run_name="gen_finetune_clean" if WANDB_ON else None,
    save_safetensors=True,
)

# NOTE(review): the captured output shows a warning pointing at this Trainer(...)
# call — possibly a deprecation of the `tokenizer=` argument; confirm against the
# installed transformers version.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[
        # Stop when macro-F1 has not improved for 2 consecutive evaluations.
        EarlyStoppingCallback(early_stopping_patience=2),
        DynamicResampleCallback(resample_schedule),
        UnfreezeAfter2Epochs()
    ],
)
# DynamicResampleCallback looks the trainer up via `model.trainer`.
trainer = attach_trainer_ref(trainer)

train_out = trainer.train()
print(train_out)

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Macro F1,Accuracy
1,1.2574,1.191427,0.615948,0.639941
2,1.0334,1.122201,0.623765,0.653304
3,0.6605,1.144693,0.647791,0.665924
4,0.5121,1.268403,0.638744,0.659985
5,0.3709,1.396544,0.63186,0.654788


>> Unfreeze: embeddings + encoder layer 0
TrainOutput(global_step=1710, training_loss=0.8130557647225453, metrics={'train_runtime': 537.5511, 'train_samples_per_second': 81.272, 'train_steps_per_second': 5.09, 'total_flos': 576786987787428.0, 'train_loss': 0.8130557647225453, 'epoch': 5.0})


In [None]:
# Global evaluation (loss, macro-F1, accuracy) on validation and test splits.
val_metrics = trainer.evaluate(eval_dataset=tokenized_ds["validation"])
test_metrics = trainer.evaluate(eval_dataset=tokenized_ds["test"])
print("VALID:", val_metrics)
print("TEST :", test_metrics)

# Class names ordered by label id, as expected by classification_report.
target_names = [id2label[i] for i in range(len(id2label))]

# Predictions (test)
test_preds = trainer.predict(tokenized_ds["test"])
test_y_true = test_preds.label_ids
test_y_pred = test_preds.predictions.argmax(-1)

print("\nClassification report TEST:")
print(classification_report(test_y_true, test_y_pred, digits=4, target_names=target_names))

VALID: {'eval_loss': 1.1446927785873413, 'eval_macro_f1': 0.6477905231571567, 'eval_accuracy': 0.6659242761692651, 'eval_runtime': 5.7187, 'eval_samples_per_second': 235.541, 'eval_steps_per_second': 29.552, 'epoch': 5.0}
TEST : {'eval_loss': 0.876916229724884, 'eval_macro_f1': 0.6835285545887262, 'eval_accuracy': 0.7797695262483995, 'eval_runtime': 3.528, 'eval_samples_per_second': 221.37, 'eval_steps_per_second': 27.778, 'epoch': 5.0}

Classification report TEST:
              precision    recall  f1-score   support

       Anger     0.7547    0.8451    0.7973       142
Anticipation     0.5714    0.4000    0.4706        10
        Fear     0.9634    0.8587    0.9080        92
         Joy     0.8312    0.6737    0.7442        95
        Love     0.5294    0.3103    0.3913        29
     Neutral     0.8426    0.8505    0.8465       214
     Sadness     0.7280    0.8053    0.7647       113
    Surprise     0.8302    0.7213    0.7719        61
       Trust     0.3556    0.6400    0.4571

In [None]:
# Persist model and tokenizer side by side so the GBV stage can reload both
# from GEN_MODEL_DIR.
trainer.save_model(GEN_MODEL_DIR)
tokenizer.save_pretrained(GEN_MODEL_DIR)
print(f" GEN model saved to {GEN_MODEL_DIR}")

✅ GEN model saved to /content/models/alberto_finetuned


# Training on GBV dataset

## Data loading and consistent label mapping

In [None]:
# GBV-domain emotion dataset: train / validation / test files.
GBV_TRAIN = "/content/train-emotion-gbv.csv"
GBV_VALID = "/content/valid-emotion-gbv.csv"
GBV_TEST  = "/content/test-emotion-gbv.csv"

gbv_df_train = read_csv_any(GBV_TRAIN)
gbv_df_valid = read_csv_any(GBV_VALID)
gbv_df_test  = read_csv_any(GBV_TEST)

# Print size and per-class distribution (count + percentage) of each split.
print(f"GBV sizes → train:{len(gbv_df_train)}, valid:{len(gbv_df_valid)}, test:{len(gbv_df_test)}")
for name, df in [("TRAIN", gbv_df_train), ("VALID", gbv_df_valid), ("TEST", gbv_df_test)]:
    print(f"{name:<5} → {len(df):,} samples, classes: {df['label'].nunique()}")
    dist = df["label"].value_counts().rename("count").to_frame()
    dist["percent"] = (dist["count"] / len(df) * 100).round(2)
    print(dist.sort_index(), "\n")

# NOTE(review): this overwrites the module-level `label2id` / `id2label` /
# `num_labels` created for the general dataset. `df_to_ds` and
# `DynamicResampleCallback` read those globals, so anything re-run after this
# cell sees the GBV mapping — confirm that is intended.
labels_sorted = sorted(gbv_df_train["label"].unique().tolist())
label2id = {l: i for i, l in enumerate(labels_sorted)}
id2label = {i: l for l, i in label2id.items()}
num_labels = len(label2id)

GBV sizes → train:363, valid:63, test:108
TRAIN → 363 samples, classes: 7
          count  percent
label                   
Anger        93    25.62
Fear          8     2.20
Joy          70    19.28
Love         88    24.24
Neutral      33     9.09
Sadness      55    15.15
Surprise     16     4.41 

VALID → 63 samples, classes: 7
          count  percent
label                   
Anger        16    25.40
Fear          1     1.59
Joy          11    17.46
Love         16    25.40
Neutral       6     9.52
Sadness      10    15.87
Surprise      3     4.76 

TEST  → 108 samples, classes: 7
          count  percent
label                   
Anger        28    25.93
Fear          2     1.85
Joy          20    18.52
Love         27    25.00
Neutral      10     9.26
Sadness      16    14.81
Surprise      5     4.63 



## Load GEN checkpoint, build HF dataset and tokenizer for GBV

In [None]:
# Output directory for the GBV fine-tuned model.
GBV_MODEL_DIR = "/content/models/alberto_finetuned_gbv"
os.makedirs(GBV_MODEL_DIR, exist_ok=True)

# Start from the general-domain model and tokenizer saved earlier.
gbv_model = AutoModelForSequenceClassification.from_pretrained(GEN_MODEL_DIR)
gbv_tokenizer = AutoTokenizer.from_pretrained(GEN_MODEL_DIR, use_fast=True)

# Label maps from GEN checkpoint
gbv_label2id = {k: int(v) for k, v in gbv_model.config.label2id.items()}
gbv_id2label = {int(k): v for k, v in gbv_model.config.id2label.items()}

# All GBV labels must exist in GEN mapping
# (only the train split is checked here).
missing = set(gbv_df_train["label"].unique()) - set(gbv_label2id.keys())
assert not missing, f"Unknown GBV labels in GEN model: {missing}"

# When True, GBV texts go through `normalize_social` before tokenization.
GBV_SOCIAL_NORMALIZATION = True

def gbv_preprocess_texts(texts):
    """Return the texts normalized with `normalize_social`, or unchanged if disabled."""
    return [normalize_social(t) for t in texts] if GBV_SOCIAL_NORMALIZATION else texts

def gbv_df_to_ds(df: pd.DataFrame) -> Dataset:
    """Build a HF Dataset with `text` and integer `labels` using the GEN label ids."""
    df = df.copy()
    df["labels"] = df["label"].map(gbv_label2id).astype(int)
    return Dataset.from_pandas(df[["text", "labels"]], preserve_index=False)

gbv_raw = DatasetDict({
    "train": gbv_df_to_ds(gbv_df_train),
    "validation": gbv_df_to_ds(gbv_df_valid),
    "test": gbv_df_to_ds(gbv_df_test),
})

def gbv_tokenize(batch):
    """Normalize (optionally) and tokenize a batch, truncating to at most 256 tokens."""
    texts = gbv_preprocess_texts(batch["text"])
    return gbv_tokenizer(texts, truncation=True, max_length=min(256, gbv_tokenizer.model_max_length))

gbv_tokenized = gbv_raw.map(gbv_tokenize, batched=True, desc="Tokenizing GBV")
gbv_data_collator = DataCollatorWithPadding(tokenizer=gbv_tokenizer)

# --- Shrink head from 9 to 7 classes (drop 'Anticipation','Trust') ---
def get_linear_and_setter(model_):
    """Locate the output ``nn.Linear`` of ``model_.classifier`` and a way to swap it.

    Supports a bare ``nn.Linear`` head and an ``nn.Sequential`` head (the last
    ``nn.Linear`` inside it is used).

    Returns:
        ``(linear, setter)`` where ``setter(new_linear)`` installs the replacement.

    Raises:
        TypeError: If the head is of another type, or a Sequential without any
            ``nn.Linear``.
    """
    head = model_.classifier

    if isinstance(head, nn.Sequential):
        # Scan from the end so the final projection layer is picked.
        last_linear_pos = next(
            (pos for pos in range(len(head) - 1, -1, -1) if isinstance(head[pos], nn.Linear)),
            None,
        )
        if last_linear_pos is None:
            raise TypeError("No nn.Linear found in model.classifier (Sequential).")

        def replace_in_sequential(new_linear):
            head[last_linear_pos] = new_linear
            model_.classifier = head

        return head[last_linear_pos], replace_in_sequential

    if isinstance(head, nn.Linear):
        def replace_head(new_linear):
            model_.classifier = new_linear

        return head, replace_head

    raise TypeError(f"Unsupported classifier type: {type(head)}")

# Class names of the GEN checkpoint, ordered by id, and the subset kept for GBV.
all_labels_gen = [gbv_model.config.id2label[i] for i in range(gbv_model.config.num_labels)]
# The dropped classes are hard-coded rather than derived from the GBV data.
to_drop = {"Anticipation", "Trust"}
gbv_labels = [lbl for lbl in all_labels_gen if lbl not in to_drop]
keep_idx = [all_labels_gen.index(lbl) for lbl in gbv_labels]

old_linear, set_linear = get_linear_and_setter(gbv_model)
in_features = old_linear.in_features
out_features_new = len(keep_idx)

# New head: copy the rows of the old head that belong to the kept classes.
new_linear = nn.Linear(in_features, out_features_new)
with torch.no_grad():
    new_linear.weight.copy_(old_linear.weight[keep_idx, :])
    # This bias copy is overwritten by the prior-based re-initialization below.
    new_linear.bias.copy_(old_linear.bias[keep_idx])
set_linear(new_linear)

# Update config and label maps
gbv_label2id = {lbl: i for i, lbl in enumerate(gbv_labels)}
gbv_id2label = {i: lbl for lbl, i in gbv_label2id.items()}
gbv_model.config.num_labels = out_features_new
gbv_model.config.label2id = gbv_label2id
gbv_model.config.id2label = gbv_id2label

# Re-init bias using GBV priors
# (bias_i = log of the class frequency in the GBV training split).
# Assumes every kept class occurs at least once in gbv_df_train; otherwise the
# prior vector would be shorter than the bias and copy_ would fail.
gbv_freq = (gbv_df_train["label"].map(gbv_label2id).value_counts().sort_index())
gbv_prior = gbv_freq / gbv_freq.sum()
with torch.no_grad():
    new_linear.bias.copy_(torch.log(torch.tensor(gbv_prior.values, dtype=torch.float32)))

# Remap labels in tokenized datasets (old 0..8 -> new 0..6)
old_label2id = {lbl: i for i, lbl in enumerate(all_labels_gen)}
oldid_to_newid = {old_label2id[lbl]: gbv_label2id[lbl] for lbl in gbv_labels}

def remap_labels(batch):
    """Translate one example's GEN label id to its id in the shrunk head.

    Applied with a non-batched ``map``, so ``batch`` is a single example.
    Raises ValueError for an id that belongs to a dropped class.
    """
    old = int(batch["labels"])
    if old not in oldid_to_newid:
        raise ValueError(f"Found label id {old} not present in GBV classes. Check your GBV split.")
    batch["labels"] = int(oldid_to_newid[old])
    return batch

gbv_tokenized["train"]      = gbv_tokenized["train"].map(remap_labels)
gbv_tokenized["validation"] = gbv_tokenized["validation"].map(remap_labels)
gbv_tokenized["test"]       = gbv_tokenized["test"].map(remap_labels)

# Sanity output: class count, names, head shape and the label ids present in train.
print(" gbv_model head shrunk to:", gbv_model.config.num_labels, "classes")
print("   Classes:", [gbv_model.config.id2label[i] for i in range(gbv_model.config.num_labels)])
print("   New head shape:", gbv_model.classifier.weight.shape if isinstance(gbv_model.classifier, nn.Linear)
      else gbv_model.classifier[-1].weight.shape)
print("   Example labels (train):", np.unique(np.array(gbv_tokenized['train']['labels'])))

Tokenizing GBV:   0%|          | 0/363 [00:00<?, ? examples/s]

Tokenizing GBV:   0%|          | 0/63 [00:00<?, ? examples/s]

Tokenizing GBV:   0%|          | 0/108 [00:00<?, ? examples/s]

Map:   0%|          | 0/363 [00:00<?, ? examples/s]

Map:   0%|          | 0/63 [00:00<?, ? examples/s]

Map:   0%|          | 0/108 [00:00<?, ? examples/s]

 gbv_model head shrunk to: 7 classes
   Classes: ['Anger', 'Fear', 'Joy', 'Love', 'Neutral', 'Sadness', 'Surprise']
   New head shape: torch.Size([7, 768])
   Example labels (train): [0 1 2 3 4 5 6]


## Initial freeze, dynamic oversampling and TrainingArguments (GBV)

In [None]:
# Selective freeze for BERT-like models: embeddings and the first encoder layer
# start frozen (UnfreezeAfter2Epochs re-enables them later).
if hasattr(gbv_model, "bert"):
    for p in gbv_model.bert.embeddings.parameters():
        p.requires_grad = False
    for p in gbv_model.bert.encoder.layer[0].parameters():
        p.requires_grad = False

# (start_epoch, exponent) pairs consumed by DynamicResampleCallback.
gbv_resample_schedule = [(0, 1.2), (2, 1.05), (4, 1.0)]

# Training configuration for the GBV fine-tuning round.
gbv_args = TrainingArguments(
    output_dir=GBV_MODEL_DIR,
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    learning_rate=2.0e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,  # effective batch of 16 per device
    num_train_epochs=10,
    warmup_ratio=0.06,
    lr_scheduler_type="cosine",
    weight_decay=0.01,
    label_smoothing_factor=0.02,
    max_grad_norm=1.0,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    # eval_loss must be MINIMIZED. With greater_is_better=True the checkpoint with
    # the highest validation loss was kept as "best" and early stopping triggered
    # while the loss was still improving.
    greater_is_better=False,
    bf16=bf16_ok,
    fp16=fp16_ok,
    dataloader_num_workers=2,
    dataloader_pin_memory=True,
    logging_steps=25,
    logging_first_step=True,
    seed=SEED,
    optim="adamw_torch",
    report_to=["wandb"] if WANDB_ON else "none",
    run_name="gbv_finetune_round2" if WANDB_ON else None,
    save_safetensors=True,
)

gbv_trainer = Trainer(
    model=gbv_model,
    args=gbv_args,
    train_dataset=gbv_tokenized["train"],
    eval_dataset=gbv_tokenized["validation"],
    tokenizer=gbv_tokenizer,
    data_collator=gbv_data_collator,
    compute_metrics=compute_metrics,
    callbacks=[
        # Stop after 2 consecutive evaluations without a lower eval_loss.
        EarlyStoppingCallback(early_stopping_patience=2),
        DynamicResampleCallback(gbv_resample_schedule),
        UnfreezeAfter2Epochs(),
    ],
)
# DynamicResampleCallback looks the trainer up via `model.trainer`.
gbv_trainer = attach_trainer_ref(gbv_trainer)

gbv_train_out = gbv_trainer.train()
print(gbv_train_out)

  gbv_trainer = Trainer(


Epoch,Training Loss,Validation Loss,Macro F1,Accuracy
1,2.2238,0.565507,0.889928,0.888889
2,1.4117,0.55912,0.686752,0.809524
3,0.8686,0.483729,0.750055,0.84127


>> Unfreeze: embeddings + encoder layer 0
TrainOutput(global_step=69, training_loss=0.9699971503105717, metrics={'train_runtime': 64.5403, 'train_samples_per_second': 56.244, 'train_steps_per_second': 3.564, 'total_flos': 33550735127250.0, 'train_loss': 0.9699971503105717, 'epoch': 3.0})


In [None]:
# Aggregate metrics (loss, macro-F1, accuracy) on the GBV validation and test splits.
gbv_val_metrics = gbv_trainer.evaluate(eval_dataset=gbv_tokenized["validation"])
gbv_test_metrics = gbv_trainer.evaluate(eval_dataset=gbv_tokenized["test"])
print("GBV VALID:", gbv_val_metrics)
print("GBV TEST :", gbv_test_metrics)

# Raw predictions; only the test predictions are used in this cell.
gbv_val = gbv_trainer.predict(gbv_tokenized["validation"])
gbv_test = gbv_trainer.predict(gbv_tokenized["test"])
y_true_test = gbv_test.label_ids
y_pred_test = gbv_test.predictions.argmax(-1)

# Class names ordered by the shrunk head's label ids.
gbv_target_names = [gbv_id2label[i] for i in range(len(gbv_id2label))]
print("\nGBV TEST:")
print(classification_report(y_true_test, y_pred_test, target_names=gbv_target_names, digits=4))

GBV VALID: {'eval_loss': 0.5655074715614319, 'eval_macro_f1': 0.8899278775055172, 'eval_accuracy': 0.8888888888888888, 'eval_runtime': 0.5566, 'eval_samples_per_second': 113.18, 'eval_steps_per_second': 14.372, 'epoch': 3.0}
GBV TEST : {'eval_loss': 0.7040181159973145, 'eval_macro_f1': 0.7346146218184962, 'eval_accuracy': 0.7777777777777778, 'eval_runtime': 0.7755, 'eval_samples_per_second': 139.273, 'eval_steps_per_second': 18.054, 'epoch': 3.0}



GBV TEST:
              precision    recall  f1-score   support

       Anger     0.9200    0.8214    0.8679        28
        Fear     1.0000    0.5000    0.6667         2
         Joy     0.6818    0.7500    0.7143        20
        Love     0.7097    0.8148    0.7586        27
     Neutral     0.6667    0.6000    0.6316        10
     Sadness     0.9333    0.8750    0.9032        16
    Surprise     0.6000    0.6000    0.6000         5

    accuracy                         0.7778       108
   macro avg     0.7874    0.7087    0.7346       108
weighted avg     0.7885    0.7778    0.7794       108



In [None]:
# Persist the GBV model and its tokenizer together in GBV_MODEL_DIR.
gbv_trainer.save_model(GBV_MODEL_DIR)
gbv_tokenizer.save_pretrained(GBV_MODEL_DIR)
print(f" GBV model saved to {GBV_MODEL_DIR}")

 GBV model saved to /content/models/alberto_finetuned_gbv


# Confusion Matrix and WordCloud

## Confusion Matrix

In [None]:
# Fail fast if the external evaluation CSV has not been uploaded.
assert os.path.isfile("/content/dataset_gbv_clean3.csv"), f"CSV not found: {'/content/dataset_gbv_clean3.csv'}"

# ---- Reload CSV and harmonize labels ----
def read_csv_gbv_clean(path: str) -> pd.DataFrame:
    """Load the cleaned GBV CSV and return its usable rows.

    The separator is auto-detected and column lookup is case-insensitive. The
    text and emotion columns are renamed to ``TEXT`` / ``EMOTION``; text is
    stripped, emotion is stripped and upper-cased. Rows with empty or missing
    text, or with the ``-`` placeholder in either column (a missing emotion is
    treated as ``-``), are removed. Other columns are kept.

    Args:
        path: Location of the delimited text file.

    Returns:
        The filtered DataFrame.

    Raises:
        ValueError: If the file has no text column or no emotion column.
    """
    df = pd.read_csv(path, sep=None, engine="python")
    cols = {str(c).lower(): c for c in df.columns}
    text_col = cols.get("text")
    emo_col = cols.get("emotion")
    # Explicit raise instead of `assert`: asserts are stripped under `python -O`.
    if not (text_col and emo_col):
        raise ValueError(f"Need TEXT/EMOTION; got: {df.columns.tolist()}")
    df = df.rename(columns={text_col: "TEXT", emo_col: "EMOTION"})
    # fillna must run BEFORE astype(str): once cast, a missing cell is the literal
    # string "nan" and would slip through the empty-text filter below.
    df["TEXT"] = df["TEXT"].fillna("").astype(str).str.strip()
    df["EMOTION"] = df["EMOTION"].fillna("-").astype(str).str.strip().str.upper()
    df = df[(df["TEXT"] != "") & (df["TEXT"] != "-") & (df["EMOTION"] != "-")]
    return df

df = read_csv_gbv_clean("/content/dataset_gbv_clean3.csv")

# Sync num_labels
# Only applies when the classifier head exposes `out_features` (e.g. a bare Linear).
if hasattr(gbv_model, "classifier") and hasattr(gbv_model.classifier, "out_features"):
    n_head = int(gbv_model.classifier.out_features)
    gbv_model.num_labels = n_head
    gbv_model.config.num_labels = n_head
    gbv_model.config.problem_type = "single_label_classification"
    print(f"Synchronized num_labels: {n_head}")

# IT → EN mapping aligned with model labels
# (six emotions; rows with any other EMOTION value are filtered out below).
ital2eng = {
    "GIOIA":"Joy","TRISTEZZA":"Sadness","RABBIA":"Anger",
    "PAURA":"Fear","SORPRESA":"Surprise","NEUTRA":"Neutral"
}

# Rebinds the module-level label maps to the GBV model's configuration.
label2id = {k: int(v) for k, v in gbv_model.config.label2id.items()}
id2label = {int(k): v for k, v in gbv_model.config.id2label.items()}
print(" Model label set:", id2label)

df = df[df["EMOTION"].isin(ital2eng.keys())].copy()
texts = df["TEXT"].tolist()
# NOTE(review): this reads SOCIAL_NORMALIZATION, while the GBV model was trained
# under GBV_SOCIAL_NORMALIZATION; both are True here — confirm which flag should
# govern this evaluation.
if SOCIAL_NORMALIZATION:
    texts = [normalize_social(t) for t in texts]
df["TEXT_NORM"] = texts
df["EMO_EN"] = df["EMOTION"].map(ital2eng)

# Every mapped emotion must be a class known to the model.
missing = set(df["EMO_EN"].unique()) - set(label2id.keys())
if missing:
    raise ValueError(f"Labels not present in model: {missing}")

df["label_id"] = df["EMO_EN"].map(label2id).astype(int)

print("\n Distribution (ITA):")
print(df["EMOTION"].value_counts())

# ---- Dataset + tokenization ----
eval_ds = Dataset.from_pandas(
    df[["TEXT_NORM", "label_id"]].rename(columns={"TEXT_NORM": "text", "label_id": "labels"}),
    preserve_index=False
)
def _tok(batch):
    """Tokenize the (already normalized) texts, truncating to at most 256 tokens."""
    return gbv_tokenizer(batch["text"], truncation=True, max_length=min(256, gbv_tokenizer.model_max_length))
tok_eval = eval_ds.map(_tok, batched=True, desc="Tokenizing new data")

# ---- Metrics: macro-F1 over 6 emotions ----
# Classes the external dataset can contain; predictions are restricted to them.
ALLOWED_LABELS = ["Anger", "Fear", "Joy", "Neutral", "Sadness", "Surprise"]
# Ids of the allowed labels; names missing from the model's mapping are skipped.
ALLOWED_IDS = np.array([label2id[n] for n in ALLOWED_LABELS if n in label2id])

def argmax_restricted(logits: np.ndarray, allowed: np.ndarray) -> np.ndarray:
    """Return, per row, the index of the highest logit among the ``allowed`` columns.

    Columns outside ``allowed`` receive a -1e9 penalty before the argmax, so the
    returned indices still refer to the full class space.
    """
    n_classes = logits.shape[1]
    blocked = np.ones(n_classes, dtype=bool)
    blocked[allowed] = False
    penalty = np.zeros(n_classes, dtype=logits.dtype)
    penalty[blocked] = -1e9
    return (logits + penalty).argmax(axis=-1)

# Same six labels/ids as ALLOWED_LABELS / ALLOWED_IDS, kept as a plain list for sklearn.
eval_label_names = ALLOWED_LABELS
eval_label_ids   = [label2id[n] for n in eval_label_names if n in label2id]

def compute_metrics_subset(pred):
    """Macro-F1 (over the six evaluated classes) and accuracy.

    Predictions are first restricted to ALLOWED_IDS via `argmax_restricted`.
    """
    logits, labels = pred
    preds = argmax_restricted(logits, ALLOWED_IDS)
    f1 = precision_recall_fscore_support(labels, preds, average="macro", labels=eval_label_ids, zero_division=0)[2]
    acc = accuracy_score(labels, preds)
    return {"macro_f1": f1, "accuracy": acc}

# Recomputed here (same expressions as in the training section) so this cell can
# run on its own.
bf16_ok = torch.cuda.is_available() and torch.cuda.is_bf16_supported()
fp16_ok = torch.cuda.is_available() and not bf16_ok

# Evaluation-only arguments; no training hyper-parameters are set.
eval_args = TrainingArguments(
    output_dir="/content/tmp_eval_gbv_clean",
    per_device_eval_batch_size=32,
    dataloader_num_workers=2,
    bf16=bf16_ok,
    fp16=fp16_ok,
    report_to="none",
)

# Trainer used purely as an inference/metrics harness around gbv_model.
eval_trainer = Trainer(
    model=gbv_model,
    args=eval_args,
    eval_dataset=tok_eval,
    tokenizer=gbv_tokenizer,
    compute_metrics=compute_metrics_subset
)

metrics = eval_trainer.evaluate()
# NOTE(review): the message names dataset_gbv_clean.csv while the file loaded
# above is dataset_gbv_clean3.csv — confirm which name is intended.
print("\n METRICS on dataset_gbv_clean.csv (6 classes):")
print(metrics)

# Raw logits for per-class analysis; predictions restricted to the six allowed classes.
pred = eval_trainer.predict(tok_eval)
y_true = pred.label_ids
y_pred = argmax_restricted(pred.predictions, ALLOWED_IDS)

# Per-class precision / recall / F1 / support, in eval_label_ids order.
per_class = precision_recall_fscore_support(y_true, y_pred, labels=eval_label_ids, zero_division=0)
per_df = pd.DataFrame({
    "label_id": eval_label_ids,
    "label_name": [id2label[i] for i in eval_label_ids],
    "precision": per_class[0], "recall": per_class[1], "f1": per_class[2], "support": per_class[3]
}).sort_values("label_name")
print("\nPer-class detail (6 emotions):")
print(per_df.to_string(index=False))

In [None]:
# ---- Confusion matrix ----
# Relies on `pred`, `label2id`, `id2label` and ALLOWED_LABELS from the previous cell.
ALLOWED_IDS = np.array([label2id[n] for n in ALLOWED_LABELS if n in label2id])
y_true = pred.label_ids
logits  = pred.predictions
# Unrestricted argmax; not used further in this cell.
y_pred_raw = np.argmax(logits, axis=-1)
y_pred_6   = argmax_restricted(logits, ALLOWED_IDS)

# 6×6 CM
# Rows are true classes, columns predicted classes, both in ALLOWED_IDS order.
cm6 = confusion_matrix(y_true, y_pred_6, labels=ALLOWED_IDS)
cm6_df = pd.DataFrame(
    cm6,
    index=[f"T:{id2label[i]}" for i in ALLOWED_IDS],
    columns=[f"P:{id2label[i]}" for i in ALLOWED_IDS]
)
cm6_csv = "/content/confusion_matrix_6x6.csv"
cm6_df.to_csv(cm6_csv, index=True)

# Row-normalized version (each row divided by its total); the clip avoids a
# division by zero for classes with no true samples.
cm6_row_norm = (cm6 / cm6.sum(axis=1, keepdims=True).clip(min=1))
cm6_row_df = pd.DataFrame(
    np.round(cm6_row_norm, 4),
    index=[f"T:{id2label[i]}" for i in ALLOWED_IDS],
    columns=[f"P:{id2label[i]}" for i in ALLOWED_IDS]
)
cm6_row_csv = "/content/confusion_matrix_6x6_rownorm.csv"
cm6_row_df.to_csv(cm6_row_csv, index=True)

print(" 6×6 CMs saved:")
print(" -", cm6_csv)
print(" -", cm6_row_csv)

# Heat-map of the raw counts with the count written in every cell.
plt.figure(figsize=(6,5))
plt.imshow(cm6, interpolation='nearest')
plt.title("Confusion Matrix")
plt.xticks(ticks=range(len(ALLOWED_IDS)), labels=[id2label[i] for i in ALLOWED_IDS], rotation=45)
plt.yticks(ticks=range(len(ALLOWED_IDS)), labels=[id2label[i] for i in ALLOWED_IDS])
plt.xlabel("Predicted"); plt.ylabel("True")
# NOTE(review): the annotation colour is fixed to white; check readability on the
# brightest cells of the colormap in use.
for i in range(cm6.shape[0]):
    for j in range(cm6.shape[1]):
        plt.text(j, i, cm6[i, j], ha="center", va="center", color="white")
plt.tight_layout()
plt.show()