# 07 – Cross-Validation (Baseline Model)

Ziel: Robuste Evaluation der klassischen Baseline mittels k-fold Cross-Validation.
Die Ergebnisse werden als Mittelwert und Standardabweichung berichtet.


In [1]:
import pandas as pd
import numpy as np

from pathlib import Path
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score

# Locate the dataset relative to the parent of the current working directory.
# NOTE(review): assumes the notebook is started from a folder directly below
# the project root — confirm when running from elsewhere.
PROJECT_ROOT = Path.cwd().parent
DATA_CSV = PROJECT_ROOT.joinpath("data", "processed", "bbc_news.csv")

# Read the CSV and pull out the two columns used for modelling:
# "text" is coerced to str and becomes the feature array, "label" the target.
df = pd.read_csv(DATA_CSV)
X = df["text"].astype(str).values
y = df["label"].values


In [2]:
def make_baseline_model(max_features=50000, ngram_range=(1, 2), max_iter=2000):
    """Build a fresh, unfitted TF-IDF + logistic-regression pipeline.

    A new pipeline object is created on every call, so each caller gets
    its own vectorizer and classifier with no state shared between fits.

    Parameters
    ----------
    max_features : int or None, default=50000
        Upper bound on the TF-IDF vocabulary size.
    ngram_range : tuple of (int, int), default=(1, 2)
        Minimum and maximum n-gram length extracted by the vectorizer.
    max_iter : int, default=2000
        Iteration limit passed to the logistic-regression solver.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Pipeline with the steps ``"tfidf"`` and ``"clf"``.
    """
    vectorizer = TfidfVectorizer(
        max_features=max_features,
        ngram_range=ngram_range,
        stop_words="english",
    )
    # n_jobs is intentionally not passed: per the scikit-learn documentation it
    # only ever applied to one-vs-rest fitting and is deprecated in recent
    # releases, so omitting it does not change the fitted model.
    classifier = LogisticRegression(max_iter=max_iter)
    return Pipeline([("tfidf", vectorizer), ("clf", classifier)])


In [3]:
# Stratified 5-fold split with shuffling; the fixed random_state keeps the
# fold assignment identical across runs.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Per-fold metric values, collected for later aggregation.
acc_scores, f1_scores = [], []

for fold, (train_idx, test_idx) in enumerate(skf.split(X, y), start=1):
    # A fresh pipeline is fitted on the training indices of this fold only.
    model = make_baseline_model().fit(X[train_idx], y[train_idx])

    # Score the held-out part of the fold.
    X_test, y_test = X[test_idx], y[test_idx]
    preds = model.predict(X_test)

    acc = accuracy_score(y_test, preds)
    f1 = f1_score(y_test, preds, average="macro")
    acc_scores.append(acc)
    f1_scores.append(f1)

    print(f"Fold {fold}: accuracy={acc:.4f}, macro-F1={f1:.4f}")


Fold 1: accuracy=0.9708, macro-F1=0.9699
Fold 2: accuracy=0.9888, macro-F1=0.9889
Fold 3: accuracy=0.9730, macro-F1=0.9731
Fold 4: accuracy=0.9865, macro-F1=0.9859
Fold 5: accuracy=0.9775, macro-F1=0.9773


In [4]:
# Print mean and standard deviation over the folds for each metric.
# np.std is called with its default ddof, i.e. the population standard deviation.
for header, scores in (("Accuracy:", acc_scores), ("\nMacro-F1:", f1_scores)):
    print(header)
    print(f"Mean = {np.mean(scores):.4f}")
    print(f"Std  = {np.std(scores):.4f}")


Accuracy:
Mean = 0.9793
Std  = 0.0072

Macro-F1:
Mean = 0.9790
Std  = 0.0073
