# 03b – DistilBERT Fine-Tuning (BBC News) – Split 80/10/10

Ziel: Fine-Tuning von DistilBERT für die Textklassifikation und Vergleich mit der Baseline und mit BERT.

**Split-Strategie:** 80% Train / 10% Validation / 10% Test

In [None]:
import time
import random
import numpy as np
import pandas as pd
from pathlib import Path

import torch

from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score, precision_score, recall_score

## Seeds & Config

In [None]:
# Run-wide constants: one seed for every RNG and a tag naming the split variant.
SEED = 42
SPLIT_NAME = "80-10-10"

# Seed Python's, NumPy's and PyTorch's generators with the same value.
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(SEED)

# Query CUDA once and reuse the answer for seeding and for the status output.
_cuda_available = torch.cuda.is_available()
if _cuda_available:
    torch.cuda.manual_seed_all(SEED)

print("CUDA available:", _cuda_available)
print("Device:", torch.device("cuda" if _cuda_available else "cpu"))
print(f"Split: {SPLIT_NAME}")

## 1) CSV laden & Labels encoden

In [None]:
# The project root is taken to be the parent of the current working directory.
PROJECT_ROOT = Path.cwd().parent
CSV_PATH = PROJECT_ROOT.joinpath("data", "processed", "bbc_news.csv")

# Load the dataset and show its shape together with the number of distinct labels.
df = pd.read_csv(CSV_PATH, encoding="utf-8")
(df.shape, df["label"].nunique())

In [None]:
# Class ids follow the sorted order of the distinct label values, so the
# mapping is the same on every run over the same data.
labels_sorted = sorted(df["label"].unique())
id2label = dict(enumerate(labels_sorted))
label2id = {name: idx for idx, name in id2label.items()}

# Attach the integer class id as an extra column.
df["label_id"] = df["label"].map(label2id)

(labels_sorted, label2id)

## 2) Train/Validation/Test Split (80/10/10)

**Identisch zum BERT-Notebook für fairen Vergleich!**

In [None]:
X = df["text"].tolist()
y = df["label_id"].tolist()

# Stage 1: hold out 10% of all samples as the test set (stratified by class).
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.1, stratify=y, random_state=SEED
)

# Stage 2: carve the validation set out of the remaining 90%.
# ~11.1% of 90% corresponds to 10% of the full dataset.
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.111111, stratify=y_temp, random_state=SEED
)

# Report absolute sizes and the share of the full dataset each part received.
print(f"Split: {SPLIT_NAME}")
print("Train:", len(X_train), "Val:", len(X_val), "Test:", len(X_test))
print(
    "Actual Split: "
    + " / ".join(
        f"{len(part) / len(X) * 100:.1f}%" for part in (X_train, X_val, X_test)
    )
)

## 3) Tokenization + Dataset

In [None]:
# Checkpoint identifier; the same name is reused further down when the
# sequence-classification model is loaded.
MODEL_NAME = "distilbert-base-uncased"

# Load the tokenizer that belongs to MODEL_NAME.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [None]:
MAX_LEN = 256  # 256 is often a good trade-off for news texts


def tokenize_texts(texts):
    """Run the module-level ``tokenizer`` over ``texts``.

    Truncation and padding are both enabled and the maximum length is
    ``MAX_LEN``.

    Args:
        texts: The raw texts to encode (a list of strings in this notebook).

    Returns:
        Whatever the tokenizer call returns for the given texts.
    """
    encode_options = {
        "truncation": True,
        "padding": True,
        "max_length": MAX_LEN,
    }
    return tokenizer(texts, **encode_options)

In [None]:
class TextDataset(torch.utils.data.Dataset):
    """Dataset pairing tokenized texts with their labels.

    All texts are tokenized once at construction time via ``tokenize_texts``;
    the conversion to tensors happens per example in ``__getitem__``.

    Args:
        texts: The raw texts (a list of strings in this notebook).
        labels: One label per text (integer class ids in this notebook).

    Raises:
        ValueError: If ``texts`` and ``labels`` do not have the same length.
    """

    def __init__(self, texts, labels):
        # Fail fast: __len__ follows the labels while the encodings follow the
        # texts, so a mismatch would otherwise only surface later as an
        # IndexError or as silently ignored examples.
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(texts)} texts and {len(labels)} labels)"
            )
        self.encodings = tokenize_texts(texts)
        self.labels = labels

    def __len__(self):
        """Return the number of examples (one per label)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors.

        The dict holds one tensor per encoding field plus the label under the
        key ``"labels"``.
        """
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

# Build one dataset per split, in the order train -> validation -> test.
train_ds, val_ds, test_ds = (
    TextDataset(texts, labels)
    for texts, labels in ((X_train, y_train), (X_val, y_val), (X_test, y_test))
)

# Show the number of examples in each dataset.
tuple(len(split_ds) for split_ds in (train_ds, val_ds, test_ds))

## 4) Model + Training

In [None]:
# Load the MODEL_NAME checkpoint with a sequence-classification head.
# The head gets one output per distinct label, and both label mappings are
# handed to the model so class ids can be translated to label names.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(labels_sorted),
    id2label=id2label,
    label2id=label2id
)

In [None]:
def compute_metrics(eval_pred):
    """Compute the accuracy for one evaluation pass.

    Args:
        eval_pred: A pair that unpacks into ``(logits, labels)``.

    Returns:
        A dict with the single key ``"accuracy"``.
    """
    scores, references = eval_pred
    # The predicted class is the index of the largest score on the last axis.
    predicted_ids = np.argmax(scores, axis=-1)
    accuracy = accuracy_score(references, predicted_ids)
    return {"accuracy": accuracy}

In [None]:
import inspect

# All artefacts of this run (checkpoints, figures, saved model) go here.
OUT_DIR = PROJECT_ROOT / "results" / f"distilbert_{SPLIT_NAME}"
OUT_DIR.mkdir(parents=True, exist_ok=True)

# transformers renamed `evaluation_strategy` to `eval_strategy`; recent
# releases reject the old name while older ones only know the old name.
# Pick whichever keyword the installed version accepts.
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir=str(OUT_DIR),
    # Evaluate and checkpoint once per epoch; both strategies are set to the
    # same value because `load_best_model_at_end` is enabled below.
    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=50,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    # After training, restore the checkpoint with the best "accuracy"
    # (the key returned by compute_metrics).
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    report_to="none",
    seed=SEED,
    **{_eval_strategy_key: "epoch"},
)

In [None]:
import inspect

# `Trainer` replaced its `tokenizer` argument with `processing_class`; newer
# releases no longer accept `tokenizer`, older ones do not know
# `processing_class`. Pick whichever keyword the installed version accepts.
_tokenizer_key = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    compute_metrics=compute_metrics,
    **{_tokenizer_key: tokenizer},
)

# Wall-clock time of the training call; `elapsed` (in seconds) is reused
# when the results are exported.
start = time.time()
train_output = trainer.train()
elapsed = time.time() - start

print(f"Training time: {elapsed/60:.1f} minutes")
train_output

## 5) Evaluation auf Validation Set

In [None]:
# Evaluate on the validation split and pull the accuracy out of the metrics dict.
val_result = trainer.evaluate(eval_dataset=val_ds)
acc_val = val_result["eval_accuracy"]
print("Validation Accuracy: {:.4f}".format(acc_val))

## 6) Finale Evaluation auf Test-Set

In [None]:
# Run the trainer's model over the held-out test split.
pred = trainer.predict(test_ds)
test_logits, test_labels = pred.predictions, pred.label_ids

# Predicted class = index of the largest logit on the last axis.
test_preds = np.argmax(test_logits, axis=-1)
acc_test = accuracy_score(test_labels, test_preds)

# Banner, headline accuracy and the per-class report.
print(
    "=" * 60,
    f"FINALE TEST-SET EVALUATION (Split: {SPLIT_NAME})",
    "=" * 60,
    sep="\n",
)
print(f"Test Accuracy: {acc_test:.4f}")
print("\nClassification Report:\n")
print(
    classification_report(
        test_labels,
        test_preds,
        target_names=[id2label[class_id] for class_id in range(len(labels_sorted))],
    )
)

## 7) Confusion Matrix

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Class ids in ascending order and the matching label names for the axis ticks.
class_ids = list(range(len(labels_sorted)))
class_names = [id2label[class_id] for class_id in class_ids]

# Raw counts, then each row divided by its total so every row sums to 1.
cm = confusion_matrix(test_labels, test_preds, labels=class_ids)
row_totals = cm.sum(axis=1, keepdims=True)
cm_norm = cm.astype(float) / row_totals

plt.figure(figsize=(8, 6))
ax = sns.heatmap(
    cm_norm,
    annot=True,
    fmt=".2f",
    cmap="Blues",
    xticklabels=class_names,
    yticklabels=class_names,
)
ax.set_xlabel("Predicted label")
ax.set_ylabel("True label")
ax.set_title(f"Normalized Confusion Matrix – DistilBERT (TEST, {SPLIT_NAME})")
plt.tight_layout()

# Save the figure before showing it; slashes in the model name are replaced
# so the name can be used inside a file name.
safe_model_name = MODEL_NAME.replace("/", "_")
fig_path = OUT_DIR / f"confusion_matrix_{safe_model_name}_{SPLIT_NAME}.png"
plt.savefig(fig_path, dpi=300)
print("Saved:", fig_path)
plt.show()

## 8) Model speichern

In [None]:
# Store the model and the tokenizer side by side in one directory.
model_dir = OUT_DIR.joinpath("best_model")
trainer.save_model(str(model_dir))
tokenizer.save_pretrained(str(model_dir))

print("Saved model to:", model_dir)

## 9) Ergebnisse speichern (für Vergleich)

In [None]:
# One-row summary of this run, written to CSV for the cross-model comparison.
results_dict = {
    "model": f"DistilBERT ({MODEL_NAME})",
    "split": SPLIT_NAME,
    "train_size": len(X_train),
    "val_size": len(X_val),
    "test_size": len(X_test),
    "val_accuracy": acc_val,
    "test_accuracy": acc_test,
    "test_macro_f1": f1_score(test_labels, test_preds, average="macro"),
    "test_macro_precision": precision_score(test_labels, test_preds, average="macro"),
    "test_macro_recall": recall_score(test_labels, test_preds, average="macro"),
    "training_time_min": round(elapsed / 60, 2),
    "max_len": MAX_LEN,
    # Read the hyperparameters from the arguments actually used for training
    # instead of repeating the literals, so the export cannot drift from the
    # configuration.
    "epochs": training_args.num_train_epochs,
    "learning_rate": training_args.learning_rate,
}

results_df = pd.DataFrame([results_dict])
results_path = PROJECT_ROOT / "results" / f"distilbert_results_{SPLIT_NAME}.csv"
results_df.to_csv(results_path, index=False)
print(f"✅ Ergebnisse gespeichert: {results_path}")
results_df