# 12 Transformers BERT

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string

# NLP librerie
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Scikit-learn
from sklearn.feature_extraction.text import (
    CountVectorizer, TfidfVectorizer
)
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    accuracy_score,
)

# PyTorch
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, Dataset
from torch.nn.utils.rnn import pad_sequence
from collections import Counter

# Fetch the NLTK resources this notebook relies on (silently, one by one)
for resource in (
    'punkt',
    'stopwords',
    'wordnet',
    'punkt_tab',
    'omw-1.4',
):
    nltk.download(resource, quiet=True)

# Fix the NumPy and PyTorch random seeds
np.random.seed(42)
torch.manual_seed(42)

# Run on the GPU when CUDA is available, otherwise on the CPU
device = (
    torch.device('cuda')
    if torch.cuda.is_available()
    else torch.device('cpu')
)
print(f"Using device: {device}")
print("Setup completato")

import os, urllib.request, joblib

# Base URL of the release that hosts the pretrained weights.
# The WEIGHTS_URL environment variable overrides the default below.
WEIGHTS_BASE_URL = os.environ.get(
    'WEIGHTS_URL',
    'https://github.com/SamueleBolotta/CEAR/releases/download/v1.0/',
)

# Local cache directory for weight files; created on first run.
WEIGHTS_DIR = '../pretrained_weights'
os.makedirs(WEIGHTS_DIR, exist_ok=True)

def load_or_train(model, train_fn, weights_filename, device='cpu', tokenizer=None):
    """Load cached weights into ``model``, download them, or train as a last resort.

    Resolution order:
      1. ``WEIGHTS_DIR/weights_filename`` exists: load it.
      2. ``WEIGHTS_BASE_URL`` is non-empty: download the checkpoint (plus the
         optional vocabulary / history side-car files) and load it.
      3. Otherwise, or if step 2 raises: call ``train_fn()`` and save the
         resulting weights, history and vocabulary.

    Args:
        model: Object exposing ``load_state_dict`` and ``state_dict``.
        train_fn: Zero-argument callable that trains ``model`` and returns a
            JSON-serialisable history object, or ``None``.
        weights_filename: Name of the checkpoint file inside ``WEIGHTS_DIR``.
        device: Passed to ``torch.load`` as ``map_location``.
        tokenizer: Optional object with a ``word_index`` attribute. When given,
            ``word_index`` is restored from / saved to ``<stem>_vocab.json`` so
            the vocabulary stays consistent with the weights.

    Returns:
        When weights were loaded: the content of ``<stem>_history.json``, or
        ``None`` if that file does not exist. When training ran: whatever
        ``train_fn`` returned.
    """
    import json

    weights_path = os.path.join(WEIGHTS_DIR, weights_filename)
    # Derive side-car names from the stem. The previous str.replace('.pt', ...)
    # rewrote every '.pt' occurrence and, for a filename without '.pt', returned
    # the weights path itself, so the JSON files would overwrite the checkpoint.
    stem = os.path.splitext(weights_filename)[0]
    history_filename = stem + '_history.json'
    vocab_filename = stem + '_vocab.json'
    history_path = os.path.join(WEIGHTS_DIR, history_filename)
    vocab_path = os.path.join(WEIGHTS_DIR, vocab_filename)

    def _fetch(url, destination):
        """Download ``url`` to ``destination`` via a temporary '.part' file.

        The final path only appears once the transfer has finished, so an
        interrupted download cannot leave a truncated checkpoint that a later
        run would mistake for a valid cache entry.
        """
        partial = destination + '.part'
        try:
            urllib.request.urlretrieve(url, partial)
            os.replace(partial, destination)
        finally:
            if os.path.exists(partial):
                os.remove(partial)

    def _fetch_optional(url, destination):
        """Best-effort download of a side-car file that may not be published."""
        try:
            _fetch(url, destination)
        except Exception:
            # Deliberately ignored: loading proceeds with the weights alone.
            pass

    def _load_weights(path):
        model.load_state_dict(torch.load(path, map_location=device, weights_only=True))
        # Restore the tokenizer vocabulary if it was saved alongside the weights
        if tokenizer is not None and os.path.exists(vocab_path):
            with open(vocab_path, 'r') as f:
                tokenizer.word_index = json.load(f)
            print(f"  Restored tokenizer vocabulary ({len(tokenizer.word_index)} words)")

    def _load_history():
        if not os.path.exists(history_path):
            return None
        with open(history_path, 'r') as f:
            return json.load(f)

    if os.path.exists(weights_path):
        _load_weights(weights_path)
        print(f"Loaded pretrained weights from {weights_path}")
        return _load_history()
    elif WEIGHTS_BASE_URL:
        try:
            url = WEIGHTS_BASE_URL + weights_filename
            _fetch(url, weights_path)
            if tokenizer is not None:
                _fetch_optional(WEIGHTS_BASE_URL + vocab_filename, vocab_path)
            _fetch_optional(WEIGHTS_BASE_URL + history_filename, history_path)
            _load_weights(weights_path)
            print(f"Downloaded and loaded weights from {url}")
            return _load_history()
        except Exception as e:
            # Any download or load failure falls through to training below.
            print(f"Could not download weights: {e}. Training from scratch...")

    history = train_fn()
    torch.save(model.state_dict(), weights_path)
    print(f"Saved weights to {weights_path}")
    # Persist the training history when train_fn produced one
    if history is not None:
        with open(history_path, 'w') as f:
            json.dump(history, f)
        print(f"Saved training history to {history_path}")
    # Persist the tokenizer vocabulary alongside the weights
    if tokenizer is not None:
        with open(vocab_path, 'w') as f:
            json.dump(tokenizer.word_index, f)
        print(f"Saved tokenizer vocabulary to {vocab_path}")
    return history

Using device: cuda
Setup completato


In [3]:
# Load the IMDB dataset
from datasets import load_dataset

# Fetch IMDB through Hugging Face datasets
imdb_dataset = load_dataset('imdb')

X_train_text = np.array(imdb_dataset['train']['text'])
y_train_imdb = np.array(imdb_dataset['train']['label'])
X_test_text = np.array(imdb_dataset['test']['text'])
y_test_imdb = np.array(imdb_dataset['test']['label'])

# Shuffle the training and test sets (the Hugging Face dataset is ordered
# by label, whereas the Keras version was shuffled)
rng = np.random.RandomState(42)

train_shuffle = rng.permutation(len(X_train_text))
X_train_text, y_train_imdb = (
    X_train_text[train_shuffle],
    y_train_imdb[train_shuffle],
)

test_shuffle = rng.permutation(len(X_test_text))
X_test_text, y_test_imdb = (
    X_test_text[test_shuffle],
    y_test_imdb[test_shuffle],
)

print(f"Training samples: {len(X_train_text)}")
print(f"Test samples: {len(X_test_text)}")

# Show the first positive review
print("\nReview 1 (Positiva):")
pos_idx = np.where(y_train_imdb == 1)[0][0]
print(X_train_text[pos_idx][:500])
print(f"\nSentiment: {'Positivo' if y_train_imdb[pos_idx] == 1 else 'Negativo'}")
print(f"\n{'=' * 60}\n")

# Show the first negative review
neg_idx = np.where(y_train_imdb == 0)[0][0]
print("Review 2 (Negativa):")
print(X_train_text[neg_idx][:500])
print(f"\nSentiment: {'Positivo' if y_train_imdb[neg_idx] == 1 else 'Negativo'}")

print(f"\nDataset: {len(X_train_text)} train, {len(X_test_text)} test")
print(f"Esempio: {X_train_text[0][:200]}...")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]



plain_text/train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

plain_text/test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

plain_text/unsupervised-00000-of-00001.p(…):   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Training samples: 25000
Test samples: 25000

Review 1 (Positiva):
I dug out from my garage some old musicals and this is another one of my favorites. It was written by Jay Alan Lerner and directed by Vincent Minelli. It won two Academy Awards for Best Picture of 1951 and Best Screenplay. The story of an American painter in Paris who tries to make it big. Nina Foch is a sophisticated lady of means and is very interested in helping him, but soon finds she loves the guy. Meanwhile Gene Kelly falls for lovely damsel, Leslie Caron. His main dancing partner, and I m

Sentiment: Positivo


Review 2 (Negativa):
Dumb is as dumb does, in this thoroughly uninteresting, supposed black comedy. Essentially what starts out as Chris Klein trying to maintain a low profile, eventually morphs into an uninspired version of "The Three Amigos", only without any laughs. In order for black comedy to work, it must be outrageous, which "Play Dead" is not. In order for black comedy to work, it cannot be mean spi

---

## 6. Foundation Models per NLP

I **foundation models** sono modelli pre-trained su enormi
corpus di testo.

### Transformers e Attention

Il meccanismo di **Attention** permette al modello di
"focalizzarsi" su parti rilevanti dell'input.

### Modelli principali:

- **BERT** (Google, 2018): Bidirectional Encoder
- **GPT** (OpenAI): Generative Pre-trained Transformer
- **T5** (Google): Text-to-Text Transfer Transformer
- **RoBERTa**: BERT ottimizzato
- **DistilBERT**: versione distillata di BERT, più piccola e veloce

### 6.1 Uso di modelli pre-trained con Hugging Face

**Transformers** di Hugging Face è la libreria standard per i foundation models.

In [19]:
# Import the Hugging Face classes, installing `transformers` on first use
try:
    from transformers import (
        pipeline,
        AutoTokenizer,
        AutoModelForSequenceClassification,
    )
except ImportError:
    import importlib
    import subprocess
    import sys

    # Run pip through the interpreter that hosts this kernel: a bare `pip`
    # found on PATH may belong to a different environment, in which case the
    # retry import below would still fail.
    subprocess.check_call(
        [sys.executable, '-m', 'pip', 'install', 'transformers']
    )
    # Make the import system notice the freshly installed package.
    importlib.invalidate_caches()
    from transformers import (
        pipeline,
        AutoTokenizer,
        AutoModelForSequenceClassification,
    )

print("Transformers library pronta")

Transformers library pronta


### 6.2 Sentiment Analysis con modello pre-trained

In [20]:
# Sentiment-analysis pipeline backed by a ready-made DistilBERT checkpoint
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
    framework="pt",
)

# Sample sentences to classify
test_texts = [
    "This movie is absolutely amazing!",
    "Worst film I've ever seen. Terrible.",
    "It was okay, nothing special.",
]

print("Predizioni con DistilBERT pre-trained:\n")
for text in test_texts:
    # The pipeline returns a list; keep its first entry
    result = sentiment_pipeline(text)[0]
    print(
        f"Text: {text}\n"
        f"Label: {result['label']}, Score: {result['score']:.4f}\n"
    )

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/104 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Predizioni con DistilBERT pre-trained:

Text: This movie is absolutely amazing!
Label: POSITIVE, Score: 0.9999

Text: Worst film I've ever seen. Terrible.
Label: NEGATIVE, Score: 0.9998

Text: It was okay, nothing special.
Label: NEGATIVE, Score: 0.9821



### 6.3 Fine-tuning di BERT

Adattiamo un modello pre-trained (qui DistilBERT, una variante compatta di BERT) al nostro task specifico: la sentiment analysis su IMDB.

> **Nota**: Il fine-tuning su CPU richiede tempo significativo
> (~25 min per 3 epoch con 1000 campioni).
> Su Google Colab, attivare il runtime GPU:
> Runtime -> Cambia tipo di runtime -> GPU

In [21]:
import torch
from torch.utils.data import TensorDataset, DataLoader
from tqdm import tqdm

# Select the compute device: CUDA when available, CPU otherwise
device = (
    torch.device('cuda')
    if torch.cuda.is_available()
    else torch.device('cpu')
)
print(f"Using device: {device}")

# Load the tokenizer and a two-label classification model, then move the
# model to the selected device
bert_model_name = "distilbert-base-uncased"
bert_tokenizer = AutoTokenizer.from_pretrained(bert_model_name)
bert_model = AutoModelForSequenceClassification.from_pretrained(
    bert_model_name,
    num_labels=2,
)
bert_model.to(device)

print(f"Modello: {bert_model_name}")
print(f"Parametri: {bert_model.num_parameters():,}")


# Tokenization helper
def tokenize_for_bert(texts, tokenizer, max_length=128):
    """Tokenize one or more texts with the given tokenizer.

    Args:
        texts: A single string, or a collection of strings. Objects exposing
            ``tolist()`` are converted through it; any other iterable is
            materialised with ``list()``.
        tokenizer: Callable that accepts the list of texts plus the
            ``padding``, ``truncation``, ``max_length`` and ``return_tensors``
            keyword arguments.
        max_length: Forwarded to the tokenizer as ``max_length``.

    Returns:
        Whatever ``tokenizer`` returns for the resulting list of texts.
    """
    if not isinstance(texts, str):
        texts = texts.tolist() if hasattr(texts, 'tolist') else list(texts)
    if isinstance(texts, str):
        # A lone string (passed directly, or produced by tolist() on a 0-d
        # array) must become a one-item batch: list("abc") would split it into
        # characters and tokenize each character as a separate text.
        texts = [texts]
    return tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors='pt',
    )


# IMPORTANT: texts and labels are both sliced from the IMDB arrays, so each
# review stays paired with its own label
bert_n_train = 1000
bert_n_test = 200

bert_train_texts, bert_train_labels = (
    X_train_text[:bert_n_train],
    y_train_imdb[:bert_n_train],
)
bert_test_texts, bert_test_labels = (
    X_test_text[:bert_n_test],
    y_test_imdb[:bert_n_test],
)

print(f"Fine-tuning su {bert_n_train} campioni, test su {bert_n_test}")

# Encode both subsets with the tokenizer
bert_train_enc = tokenize_for_bert(bert_train_texts, bert_tokenizer)
bert_test_enc = tokenize_for_bert(bert_test_texts, bert_tokenizer)

# Convert the labels to long tensors
if hasattr(bert_train_labels, 'tolist'):
    bert_y_train = torch.tensor(bert_train_labels.tolist()).long()
else:
    bert_y_train = torch.tensor(list(bert_train_labels)).long()

if hasattr(bert_test_labels, 'tolist'):
    bert_y_test = torch.tensor(bert_test_labels.tolist()).long()
else:
    bert_y_test = torch.tensor(list(bert_test_labels)).long()

print(f"Input shape: {bert_train_enc['input_ids'].shape}")

# Wrap (input_ids, attention_mask, label) triples into datasets
bert_train_dataset = TensorDataset(
    bert_train_enc['input_ids'], bert_train_enc['attention_mask'], bert_y_train,
)
bert_test_dataset = TensorDataset(
    bert_test_enc['input_ids'], bert_test_enc['attention_mask'], bert_y_test,
)

# Only the training loader shuffles its batches
bert_train_loader = DataLoader(bert_train_dataset, batch_size=16, shuffle=True)
bert_test_loader = DataLoader(bert_test_dataset, batch_size=16)

# Reuse already fine-tuned weights when they can be found
bert_weights_path = os.path.join(WEIGHTS_DIR, 'nb07_distilbert.pt')
_bert_loaded = False

if os.path.exists(bert_weights_path):
    # A local copy exists: load it directly
    bert_model.load_state_dict(
        torch.load(bert_weights_path, map_location=device, weights_only=True)
    )
    print(f"Loaded pretrained DistilBERT weights from {bert_weights_path}")
    _bert_loaded = True
elif WEIGHTS_BASE_URL:
    # No local copy: try to download it from the configured base URL
    try:
        url = WEIGHTS_BASE_URL + 'nb07_distilbert.pt'
        urllib.request.urlretrieve(url, bert_weights_path)
        bert_model.load_state_dict(
            torch.load(bert_weights_path, map_location=device, weights_only=True)
        )
        print(f"Downloaded and loaded DistilBERT weights from {url}")
        _bert_loaded = True
    except Exception as e:
        print(f"Could not download weights: {e}. Fine-tuning from scratch...")

if not _bert_loaded:
    # Nothing could be loaded: fine-tune here and save the result
    bert_optimizer = torch.optim.Adam(bert_model.parameters(), lr=2e-5)
    bert_epochs = 3
    bert_model.train()

    for epoch in range(bert_epochs):
        # Per-epoch running statistics
        total_loss = 0
        correct = 0
        total = 0

        progress = tqdm(bert_train_loader, desc=f'Epoch {epoch + 1}/{bert_epochs}')
        for batch in progress:
            input_ids, attention_mask, labels = (t.to(device) for t in batch)

            bert_optimizer.zero_grad()

            # Passing labels makes the model return the loss with the logits
            outputs = bert_model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels,
            )

            loss = outputs.loss
            loss.backward()
            bert_optimizer.step()

            total_loss += loss.item()
            preds = outputs.logits.argmax(dim=1)
            correct += int((preds == labels).sum())
            total += len(labels)

            # Dict form keeps the 'loss' then 'acc' display order
            progress.set_postfix({
                'loss': f'{loss.item():.3f}',
                'acc': f'{correct/total:.3f}',
            })

        print(
            f"Epoch {epoch + 1} - "
            f"Loss: {total_loss / len(bert_train_loader):.4f}, "
            f"Accuracy: {correct / total:.4f}"
        )

    print("\nFine-tuning completato")

    # Cache the fine-tuned weights for later runs
    torch.save(bert_model.state_dict(), bert_weights_path)
    print(f"Saved DistilBERT weights to {bert_weights_path}")

# Evaluation on the test subset
bert_model.eval()
bert_correct = 0
bert_total = 0

# No gradients are needed while scoring
with torch.no_grad():
    for batch in bert_test_loader:
        input_ids, attention_mask, labels = (t.to(device) for t in batch)
        outputs = bert_model(input_ids=input_ids, attention_mask=attention_mask)
        preds = outputs.logits.argmax(dim=1)
        bert_correct += int((preds == labels).sum())
        bert_total += len(labels)

bert_acc = bert_correct / bert_total
print(f"BERT Fine-tuned Accuracy: {bert_acc:.4f}")

# Final comparison
# NOTE: the models use different training/test set sizes:
# - Logistic Regression: 5000 train / 1000 test (TF-IDF)
# - LSTM/BiLSTM: 15000 train / 2000 test
# - BERT: 1000 train / 200 test
# This makes the comparison indicative but not perfectly fair.
def format_result_row(label, value):
    """Return one comparison row: label padded to 30 columns, then the score.

    ``value`` is rendered with four decimals, or as "N/A" when it is ``None``
    (i.e. the corresponding model was not run in this session).
    """
    shown = "N/A" if value is None else f"{value:.4f}"
    return f"{label:<30}{shown}"


print("\n" + "=" * 60)
print("CONFRONTO FINALE (IMDB - NB: dimensioni training diverse):")
print("=" * 60)
# The baseline accuracies are not computed in this notebook, so every metric
# is looked up by name and reported as N/A when missing instead of raising
# NameError (previously only the BiLSTM row was guarded).
for _label, _metric_name in (
    ("Logistic Regression (TF-IDF):", 'acc_lr'),
    ("LSTM:", 'lstm_acc'),
    ("BiLSTM:", 'bilstm_acc'),
    ("BERT fine-tuned:", 'bert_acc'),
):
    print(format_result_row(_label, globals().get(_metric_name)))

Using device: cuda


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/100 [00:00<?, ?it/s]

DistilBertForSequenceClassification LOAD REPORT from: distilbert-base-uncased
Key                     | Status     | 
------------------------+------------+-
vocab_projector.bias    | UNEXPECTED | 
vocab_layer_norm.weight | UNEXPECTED | 
vocab_transform.weight  | UNEXPECTED | 
vocab_transform.bias    | UNEXPECTED | 
vocab_layer_norm.bias   | UNEXPECTED | 
classifier.bias         | MISSING    | 
classifier.weight       | MISSING    | 
pre_classifier.weight   | MISSING    | 
pre_classifier.bias     | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


Modello: distilbert-base-uncased
Parametri: 66,955,010
Fine-tuning su 1000 campioni, test su 200
Input shape: torch.Size([1000, 128])
Downloaded and loaded DistilBERT weights from https://github.com/SamueleBolotta/CEAR/releases/download/v1.0/nb07_distilbert.pt
BERT Fine-tuned Accuracy: 0.8150

CONFRONTO FINALE (IMDB - NB: dimensioni training diverse):
Logistic Regression (TF-IDF): 0.8610
LSTM:                         0.7310
BiLSTM:                       0.7825
BERT fine-tuned:              0.8150
