# Sentiment Analysis MLOps - Demo Completa

Questo notebook dimostra l'intero sistema di sentiment analysis con confronto Transformer vs FastText.

**Repository GitHub**: https://github.com/yourusername/sentiment-analysis-mlops

**Hugging Face Space**: https://huggingface.co/spaces/yourusername/sentiment-analysis


## 1. Setup Ambiente


In [None]:
# Installa dipendenze
%pip install -q transformers torch fasttext pandas scikit-learn mlflow evidently datasets


In [None]:
import sys
import os
# Put the parent directory first on the module search path so that the
# `src.*` imports in this cell can be resolved.
# NOTE(review): '..' is relative to the kernel's working directory — this
# assumes the notebook runs from a folder one level below the project root.
sys.path.insert(0, '..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from src.data.download_dataset import download_dataset
from src.data.preprocessing import preprocess_dataframe
from src.data.validation import validate_dataset_quality
from src.data.split import stratified_split
from src.models.transformer_model import TransformerSentimentModel
from src.models.fasttext_model import FastTextSentimentModel
from src.evaluation.metrics import calculate_metrics, compare_models_metrics


## 2. Download e Preparazione Dataset


In [None]:
# Load the dataset through the project's download helper.
# NOTE(review): language="it" presumably selects the Italian subset and
# cache_dir the local cache location — confirm in download_dataset.
df = download_dataset(
    cache_dir="../data/raw",
    language="it",
    dataset_name="cardiffnlp/tweet_sentiment_multilingual",
)

# Quick sanity report: sample count, column names and per-class counts.
print(f"Dataset scaricato: {len(df)} campioni")
print(f"Colonne: {df.columns.tolist()}")
print("\nDistribuzione classi:")
print(df["label"].value_counts())


In [None]:
# Run the project's preprocessing over the "text" column of the raw frame.
# NOTE(review): min_length / max_length presumably bound the accepted text
# length — confirm the unit (characters vs tokens) in preprocess_dataframe.
df_processed = preprocess_dataframe(df, text_column="text", min_length=3, max_length=512)

print(f"Dopo preprocessing: {len(df_processed)} campioni")


In [None]:
# Split the preprocessed data into train / validation / test (70% / 15% / 15%)
# with a fixed seed so the partition is repeatable.
train_df, val_df, test_df, split_indices = stratified_split(
    df_processed,
    random_seed=42,
    train_size=0.70,
    val_size=0.15,
    test_size=0.15,
)

# Report the size of each partition.
for split_name, split_df in (("Train", train_df), ("Val", val_df), ("Test", test_df)):
    print(f"{split_name}: {len(split_df)} campioni")


## 3. Caricamento e Training Modelli

### 3.1 Transformer (Pre-addestrato)


In [None]:
# Load the pre-trained Transformer sentiment model through the project wrapper.
# NOTE(review): the dataset is requested with language="it" — confirm on the
# model card that this checkpoint is suitable for Italian text.
transformer = TransformerSentimentModel(model_name="cardiffnlp/twitter-roberta-base-sentiment-latest")

print("✅ Transformer caricato")


### 3.2 FastText


In [None]:
# Export the training split (texts and labels) to a file via the project's
# FastText formatting helper.
from src.data.preprocessing import prepare_fasttext_format

train_file = "../data/processed/fasttext_train.txt"
os.makedirs(os.path.dirname(train_file), exist_ok=True)  # ensure the target folder exists
prepare_fasttext_format(train_df["text"].tolist(), train_df["label"].tolist(), train_file)

# Train the FastText model on that file; output_path is the location handed to
# the trainer for the resulting model.
model_path = "../models/fasttext/fasttext_model.bin"
os.makedirs(os.path.dirname(model_path), exist_ok=True)
fasttext_model = FastTextSentimentModel.train(
    train_file=train_file,
    output_path=model_path,
    lr=0.1,
    epoch=25,
)

print("✅ FastText addestrato")


## 4. Valutazione e Confronto


In [None]:
# Evaluate both models on the test split.
test_texts = test_df["text"].tolist()
test_labels = test_df["label"].tolist()

# Map every distinct test label to its position in sorted order.
unique_labels = sorted(test_df["label"].unique())
label_to_num = {label: i for i, label in enumerate(unique_labels)}

# Encode the ground truth once and reuse it for both models instead of
# rebuilding the same array for every metrics call.
# NOTE(review): y_true holds indices into unique_labels; this assumes that
# predict_labels returns the same index encoding and that calculate_metrics
# expects the original label values in `labels=` — confirm against src.
y_true = np.array([label_to_num[label] for label in test_labels])

# Transformer
transformer_preds = transformer.predict_labels(test_texts)
transformer_metrics = calculate_metrics(
    y_true,
    transformer_preds,
    labels=unique_labels
)

print("Transformer Metrics:")
print(f"  Macro-F1: {transformer_metrics['macro_f1']:.4f}")
print(f"  Accuracy: {transformer_metrics['accuracy']:.4f}")

# FastText
fasttext_preds = fasttext_model.predict_labels(test_texts)
fasttext_metrics = calculate_metrics(
    y_true,
    fasttext_preds,
    labels=unique_labels
)

print("\nFastText Metrics:")
print(f"  Macro-F1: {fasttext_metrics['macro_f1']:.4f}")
print(f"  Accuracy: {fasttext_metrics['accuracy']:.4f}")


In [None]:
# Combine the two metric results into one comparison object, labelled with
# the model names, and print it without its index.
comparison = compare_models_metrics(transformer_metrics, fasttext_metrics, "Transformer", "FastText")

print("\nConfronto Modelli:")
print(comparison.to_string(index=False))


## 5. Esempi Inferenza


In [None]:
# Hand-written sample sentences to run through both models.
test_examples = [
    "Questo prodotto è fantastico! Lo consiglio a tutti.",
    "Il servizio è stato ok, niente di speciale.",
    "Terribile esperienza, non lo consiglio affatto.",
]

print("Esempi Inferenza:\n")
for sentence in test_examples:
    print(f"Testo: {sentence}")

    # Query the models in a fixed order (Transformer first, then FastText) and
    # print the 'label' and 'score' fields of each prediction.
    for model_name, model in (("Transformer", transformer), ("FastText", fasttext_model)):
        prediction = model.predict(sentence)
        print(f"  {model_name}: {prediction['label']} (confidence: {prediction['score']:.2f})")
    print()


## 6. Visualizzazioni


In [None]:
# Confusion matrices for both models, drawn side by side.
from sklearn.metrics import confusion_matrix

# Encode the ground truth once; both matrices share it.
y_true_idx = [label_to_num[label] for label in test_labels]

# Pin the class order explicitly so each matrix always has exactly one
# row/column per entry of unique_labels and stays aligned with the tick
# labels below. Without `labels=`, the matrix is indexed by whatever values
# appear in the inputs, so an unexpected prediction value would change its
# shape; with it, such predictions are left out of the matrix instead.
class_ids = list(range(len(unique_labels)))

cm_transformer = confusion_matrix(y_true_idx, transformer_preds, labels=class_ids)
cm_fasttext = confusion_matrix(y_true_idx, fasttext_preds, labels=class_ids)

fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# One heatmap per model, with identical styling and axis labelling.
for ax, title, matrix in zip(axes, ("Transformer", "FastText"), (cm_transformer, cm_fasttext)):
    sns.heatmap(
        matrix,
        annot=True,
        fmt="d",
        cmap="Blues",
        ax=ax,
        xticklabels=unique_labels,
        yticklabels=unique_labels,
    )
    ax.set_title(title)
    ax.set_ylabel("True Label")
    ax.set_xlabel("Predicted Label")

plt.tight_layout()
plt.show()


## Conclusioni

Il sistema dimostra:
- ✅ Pipeline dati completa e riproducibile
- ✅ Confronto sullo stesso test set tra Transformer pre-addestrato (senza fine-tuning) e FastText addestrato sul training set
- ✅ Metriche standardizzate e tracciabili
- ✅ Inferenza funzionante per entrambi i modelli

**Prossimi passi**:
- Deploy su Hugging Face Spaces
- Setup monitoring con Evidently AI
- Implementazione retraining automatico
