In [None]:
# notebooks/train_sentiment_colab.ipynb

# Colab notebook for fine-tuning a pretrained RoBERTa model on sentiment classification

# 1. Install the required libraries
!pip install -q transformers==4.54.1 datasets==4.0.0 evaluate==0.4.0 torch==2.7.0

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np
import evaluate

# Checkpoint identifier passed to `from_pretrained` for both the tokenizer
# and the sequence-classification model in `main`.
MODEL_NAME = "cardiffnlp/twitter-roberta-base-sentiment-latest"

def preprocess(examples, tokenizer):
    """Tokenize the ``"text"`` field of a batch of examples.

    Args:
        examples: Mapping with a ``"text"`` entry holding the raw inputs.
        tokenizer: Callable applied to the texts; it receives the
            truncation/padding options as keyword arguments.

    Returns:
        Whatever ``tokenizer`` returns for the texts, truncated and padded
        to a fixed length of 128.
    """
    texts = examples["text"]
    encoding_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 128,
    }
    return tokenizer(texts, **encoding_options)

def compute_metrics(eval_pred):
    """Compute classification accuracy for one evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair. The logits are reduced to
            predicted class ids with an argmax over the last axis.

    Returns:
        A dict with a single ``"accuracy"`` key: the fraction of predictions
        equal to their reference label, as a Python float.
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    # Accuracy is computed directly with NumPy rather than calling
    # evaluate.load("accuracy") on every evaluation pass, which reloaded the
    # metric each time this function ran.
    accuracy = float(np.mean(preds == np.asarray(labels)))
    return {"accuracy": accuracy}

def main(sample=False, epochs=1):
    """Fine-tune ``MODEL_NAME`` on the tweet_eval sentiment data and save it.

    Args:
        sample: When true, the dataset's "train" split is re-split with
            ``test_size=0.95``; the 5% part is used for training and the 95%
            part for evaluation. Otherwise the dataset's own "train" and
            "validation" splits are used.
        epochs: Passed to ``TrainingArguments`` as ``num_train_epochs``.

    Side effects:
        Uses ``./runs`` as the Trainer output directory and writes the final
        model together with the tokenizer files to ``./model_finetuned``.
    """
    ds = load_dataset("cardiffnlp/tweet_eval", "sentiment")
    if sample:
        # NOTE(review): the evaluation set here is 95% of the training data
        # and the split is unseeded — confirm this is what a "sample" run
        # should evaluate on.
        ds = ds["train"].train_test_split(test_size=0.95)
        train_ds = ds["train"]
        eval_ds = ds["test"]
    else:
        train_ds = ds["train"]
        eval_ds = ds["validation"]

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    def tokenize(batch):
        # Single tokenisation routine shared by both splits.
        return preprocess(batch, tokenizer)

    tensor_columns = ["input_ids", "attention_mask", "label"]
    train_ds = train_ds.map(tokenize, batched=True)
    eval_ds = eval_ds.map(tokenize, batched=True)
    train_ds.set_format(type="torch", columns=tensor_columns)
    eval_ds.set_format(type="torch", columns=tensor_columns)

    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=3)

    args = TrainingArguments(
        output_dir="./runs",
        eval_strategy="epoch",
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=epochs,
        save_strategy="epoch",
        logging_steps=10,
        load_best_model_at_end=True,
        report_to=[],
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_ds,
        eval_dataset=eval_ds,
        compute_metrics=compute_metrics,
    )

    trainer.train()
    trainer.save_model("./model_finetuned")
    # The Trainer is not given the tokenizer, so save it explicitly next to
    # the model; otherwise the output directory cannot be loaded on its own.
    tokenizer.save_pretrained("./model_finetuned")
    print("Training completo. Modello salvato in ./model_finetuned")

# Run training on the dataset's own train/validation splits for 5 epochs.
main(sample=False, epochs=5)
