# Notebook 2 - Symbolische KI und ML Metriken


## 1) Lernziele
- Du laedst robust einen Klassifikationsdatensatz (Titanic bevorzugt, sonst Breast Cancer).
- Du verstehst den Unterschied zwischen symbolischer Baseline und datengetriebenen ML-Modellen.
- Du interpretierst Confusion Matrix, Accuracy, Precision, Recall, F1 und ROC AUC korrekt.
- Du beobachtest systematisch den Effekt von Threshold, Klassenungleichgewicht und Zufalls-Seed.


## 2) Datensatz Einblick (Titanic bevorzugt, sonst Breast Cancer Fallback)
- Kaggle: `/kaggle/input/titanic/train.csv`, falls vorhanden.
- Sonst: `sklearn.datasets.load_breast_cancer`.
- Fokus auf wenige, robuste Features und transparente Vorbereitung.


In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import ipywidgets as widgets
from IPython.display import display, Markdown

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    confusion_matrix,
)

# Module-level placeholders for the shared dataset objects. They are all
# overwritten by the result of load_and_prepare_data() once it has been called.
DATA_INFO = {}
RAW_DF = None
INSIGHT_DF = None
X_ALL = None
y_ALL = None


def _prepare_titanic(titanic_path):
    """Read the Titanic CSV at *titanic_path* and build model/insight frames.

    Returns the same 5-tuple as ``load_and_prepare_data``.
    """
    df = pd.read_csv(titanic_path)

    # Deliberately small feature set to keep the example easy to follow.
    feature_cols = ['Pclass', 'Sex', 'Age', 'Fare']
    target_col = 'Survived'

    # SibSp/Parch are not model features; they are only needed for the
    # FamilySize helper column of the insight frame.
    use_df = df[feature_cols + [target_col, 'SibSp', 'Parch']].copy()
    for col in ('Age', 'Fare'):
        use_df[col] = use_df[col].fillna(use_df[col].median())

    X = pd.get_dummies(use_df[feature_cols], columns=['Sex', 'Pclass'], drop_first=False)
    y = use_df[target_col].astype(int)

    # Extra numeric helper columns used only for the correlation/insight plots.
    insight_df = use_df.copy()
    insight_df['FamilySize'] = insight_df['SibSp'] + insight_df['Parch'] + 1
    insight_df = pd.concat([insight_df, pd.get_dummies(insight_df['Pclass'], prefix='Pclass')], axis=1)
    insight_df['Sex_female'] = (insight_df['Sex'] == 'female').astype(int)

    info = {
        'dataset': 'titanic',
        'target_col': target_col,
        'feature_cols_model': list(X.columns),
        'insight_feature': 'Fare',
        'scaling_used': False,
    }
    return df, insight_df, X, y, info


def _prepare_breast_cancer():
    """Build the fallback frames from scikit-learn's breast cancer dataset.

    Returns the same 5-tuple as ``load_and_prepare_data``.
    """
    bc = load_breast_cancer(as_frame=True)
    df = bc.frame.copy()
    target_col = 'target'

    # Compact feature set.
    feature_cols = [
        'mean radius',
        'mean texture',
        'mean perimeter',
        'mean area',
        'mean smoothness',
        'mean compactness',
        'mean concavity',
        'mean concave points',
    ]

    X_raw = df[feature_cols].copy()
    y = df[target_col].astype(int)

    # Standardize the features for the linear model.
    # NOTE(review): the scaler is fitted on ALL rows here, i.e. before any
    # train/test split performed by callers, so test-set statistics influence
    # the scaling. Acceptable for a demo; fit on the training part only if
    # this matters.
    scaler = StandardScaler()
    # Keep the original row index so X and y stay aligned by label.
    X_scaled = pd.DataFrame(
        scaler.fit_transform(X_raw),
        columns=feature_cols,
        index=X_raw.index,
    )

    info = {
        'dataset': 'breast_cancer_fallback',
        'target_col': target_col,
        'feature_cols_model': feature_cols,
        'insight_feature': 'mean radius',
        'scaling_used': True,
    }
    return df, df.copy(), X_scaled, y, info


def load_and_prepare_data():
    """Load Titanic from the Kaggle input path, or fall back to Breast Cancer.

    The Titanic CSV is used when '/kaggle/input/titanic/train.csv' exists;
    otherwise ``sklearn.datasets.load_breast_cancer`` is loaded.

    Returns:
        A tuple ``(raw_df, insight_df, X, y, info)``:
        raw_df is the unmodified source frame, insight_df a frame for
        exploratory plots, X the model feature frame, y the integer target,
        and info a dict with the keys 'dataset', 'target_col',
        'feature_cols_model', 'insight_feature' and 'scaling_used'.
    """
    titanic_path = '/kaggle/input/titanic/train.csv'
    if os.path.exists(titanic_path):
        return _prepare_titanic(titanic_path)
    return _prepare_breast_cancer()


# Load the data once; the resulting globals are shared by the cells below.
RAW_DF, INSIGHT_DF, X_ALL, y_ALL, DATA_INFO = load_and_prepare_data()

# Short summary of what was loaded.
print('Dataset:', DATA_INFO['dataset'])
print(
    'Samples:', len(RAW_DF),
    '| Features fuer Modell:', len(DATA_INFO['feature_cols_model']),
)
print('Scaling genutzt:', DATA_INFO['scaling_used'])


In [None]:
# Quick look at the raw dataset: shape, column names, missing values.
print('Shape:', RAW_DF.shape)
print('\nSpalten (erste 15):')
print(list(RAW_DF.columns[:15]))

# Ten columns with the most missing entries, as a two-column table.
missing = RAW_DF.isna().sum().sort_values(ascending=False).head(10)
missing_df = pd.DataFrame(
    {
        'column': missing.index,
        'missing_count': missing.values,
    }
)
print('\nMissing Values (Top 10):')
display(missing_df)


In [None]:
# Plot 1: distribution of the target classes as a bar chart.
vc = y_ALL.value_counts().sort_index()

plt.figure(figsize=(5.2, 3.6))
plt.bar(
    list(map(str, vc.index)),
    vc.values,
    color=['tab:blue', 'tab:orange'],
)
plt.ylabel('Anzahl')
plt.xlabel('Klasse')
plt.title('Zielverteilung')
plt.tight_layout()
plt.show()


In [None]:
# Plot 2: one feature against the target (Titanic: Fare, Cancer: mean radius).
feat = DATA_INFO['insight_feature']
target_col = DATA_INFO['target_col']

# Titanic reads from the insight frame (Fare is imputed there); the fallback
# dataset reads from the raw frame. The grouping itself is identical.
source_df = INSIGHT_DF if DATA_INFO['dataset'] == 'titanic' else RAW_DF
vis_df = source_df[[feat, target_col]].copy()
grp0 = vis_df.loc[vis_df[target_col] == 0, feat]
grp1 = vis_df.loc[vis_df[target_col] == 1, feat]

plt.figure(figsize=(5.6, 3.8))
plt.boxplot([grp0.values, grp1.values])
# Label the boxes via xticks instead of boxplot(labels=...): that keyword is
# deprecated in newer Matplotlib releases, while xticks works everywhere.
# Boxes are drawn at positions 1..N by default.
plt.xticks([1, 2], ['target=0', 'target=1'])
plt.title(f'{feat} nach Zielklasse')
plt.ylabel(feat)
plt.tight_layout()
plt.show()


In [None]:
# Plot 3: correlation heatmap over a handful of numeric columns plus the target.
if DATA_INFO['dataset'] != 'titanic':
    corr_cols = DATA_INFO['feature_cols_model'][:9] + [DATA_INFO['target_col']]
    corr_df = RAW_DF[corr_cols].copy()
else:
    corr_cols = [
        'Age', 'Fare', 'SibSp', 'Parch', 'FamilySize',
        'Sex_female', 'Pclass_1', 'Pclass_2', 'Pclass_3', 'Survived',
    ]
    corr_df = INSIGHT_DF[corr_cols].copy()

corr = corr_df.corr(numeric_only=True)

plt.figure(figsize=(7.2, 5.2))
# Fix the colour scale to [-1, 1] so colours are comparable across datasets.
im = plt.imshow(corr.values, cmap='coolwarm', vmin=-1, vmax=1)
plt.colorbar(im, fraction=0.046, pad=0.04)
plt.yticks(range(len(corr.columns)), corr.columns)
plt.xticks(range(len(corr.columns)), corr.columns, rotation=45, ha='right')
plt.title('Korrelation Heatmap')
plt.tight_layout()
plt.show()


## 3) Warm-up Spielzelle (ipywidgets): Modellwahl, Split, Threshold
- Modelle: `rule_baseline`, `logistic_regression`, `decision_tree`.
- Regler: `test_size`, `random_state`, `threshold`.
- Ausgabe: Metrik-Tabelle, Confusion Matrix Heatmap, kurze Interpretation.


In [None]:
def compute_metrics(y_true, y_pred, y_prob=None):
    """Compute the standard binary classification metrics.

    Args:
        y_true: Ground-truth labels.
        y_pred: Hard (already thresholded) predictions.
        y_prob: Optional scores/probabilities for the positive class; needed
            for ROC AUC only.

    Returns:
        A dict with the keys 'accuracy', 'precision', 'recall', 'f1' and
        'roc_auc'. 'roc_auc' is NaN when *y_prob* is None or when the score
        cannot be computed for the given inputs.
    """
    metrics = {
        'accuracy': accuracy_score(y_true, y_pred),
        # zero_division=0: report 0 when the denominator of a metric is 0.
        'precision': precision_score(y_true, y_pred, zero_division=0),
        'recall': recall_score(y_true, y_pred, zero_division=0),
        'f1': f1_score(y_true, y_pred, zero_division=0),
        'roc_auc': np.nan,
    }
    if y_prob is not None:
        try:
            metrics['roc_auc'] = roc_auc_score(y_true, y_prob)
        except ValueError:
            # Undefined ROC AUC (e.g. only one class present in y_true):
            # keep the NaN default. Other exception types indicate real bugs
            # and are deliberately not swallowed.
            pass
    return metrics


def symbolic_rule_predict(X_train, X_test, info):
    """Predict with a hand-written rule instead of a trained model.

    Args:
        X_train: Training features; only read in the non-Titanic branch, where
            the rule thresholds are taken from its 0.55 quantiles.
        X_test: Features to predict on.
        info: Dataset info dict; its 'dataset' entry selects the rule.

    Returns:
        A tuple ``(predictions, None, explanation)``: an int array of 0/1
        predictions, ``None`` in place of probabilities, and a text that
        describes the applied rule.
    """
    if info['dataset'] != 'titanic':
        # Fallback rule for Breast Cancer with thresholds derived from the training data.
        radius_thr = X_train['mean radius'].quantile(0.55)
        conc_thr = X_train['mean concavity'].quantile(0.55)
        small_radius = X_test['mean radius'] < radius_thr
        low_concavity = X_test['mean concavity'] < conc_thr
        rule_hits = small_radius & low_concavity
        explanation = f'Regel: mean radius<{radius_thr:.3f} und mean concavity<{conc_thr:.3f} => 1'
    else:
        # Titanic rule: female passengers outside third class are predicted as class 1.
        is_female = X_test['Sex_female'] == 1
        not_third_class = X_test['Pclass_3'] == 0
        rule_hits = is_female & not_third_class
        explanation = 'Regel: female und nicht Pclass_3 => Klasse 1, sonst 0'

    return rule_hits.astype(int).values, None, explanation


def train_predict_ml(model_name, X_train, X_test, y_train, threshold):
    """Fit the requested model and predict on *X_test*.

    Args:
        model_name: 'logistic_regression' or 'decision_tree'.
        X_train, y_train: Training data.
        X_test: Features to predict on.
        threshold: Cut-off applied to the positive-class probability.

    Returns:
        A tuple ``(y_pred, y_prob)``. *y_prob* is the positive-class
        probability, or ``None`` if the model has no ``predict_proba``.

    Raises:
        ValueError: If *model_name* is not one of the supported names.
    """
    # Lazy factories: only the selected model is instantiated.
    factories = {
        'logistic_regression': lambda: LogisticRegression(max_iter=1000),
        'decision_tree': lambda: DecisionTreeClassifier(max_depth=4, random_state=0),
    }
    make_model = factories.get(model_name)
    if make_model is None:
        raise ValueError('Unbekanntes Modell')

    model = make_model()
    model.fit(X_train, y_train)

    if not hasattr(model, 'predict_proba'):
        return model.predict(X_test), None

    y_prob = model.predict_proba(X_test)[:, 1]
    y_pred = (y_prob >= threshold).astype(int)
    return y_pred, y_prob


def plot_confusion_matrix(cm, title='Confusion Matrix'):
    """Draw a confusion matrix as an annotated heatmap.

    Args:
        cm: 2-D count matrix (rows are labelled 'true_i', columns 'pred_j').
            Anything ``np.asarray`` accepts; the size is taken from the
            matrix, so it is not limited to 2x2.
        title: Figure title.
    """
    cm = np.asarray(cm)
    n_rows, n_cols = cm.shape

    plt.figure(figsize=(4.4, 3.8))
    im = plt.imshow(cm, cmap='Blues')
    plt.colorbar(im, fraction=0.046, pad=0.04)
    plt.xticks(range(n_cols), [f'pred_{j}' for j in range(n_cols)])
    plt.yticks(range(n_rows), [f'true_{i}' for i in range(n_rows)])
    plt.title(title)

    # Cells above half of the maximum are rendered dark by the 'Blues'
    # colormap; switch the annotation to white there to keep it readable.
    dark_cutoff = cm.max() / 2 if cm.size else 0
    for i in range(n_rows):
        for j in range(n_cols):
            text_color = 'white' if cm[i, j] > dark_cutoff else 'black'
            plt.text(j, i, str(cm[i, j]), ha='center', va='center', color=text_color)
    plt.tight_layout()
    plt.show()


# Interactive controls for the warm-up cell: model choice, split size,
# split seed and decision threshold, plus a run button and an output area.
model_dd = widgets.Dropdown(
    options=['rule_baseline', 'logistic_regression', 'decision_tree'],
    value='rule_baseline',
    description='model',
)

# continuous_update=False on the sliders: the value is committed on release
# instead of on every drag step.
test_size_sl = widgets.FloatSlider(
    value=0.2,
    min=0.1,
    max=0.4,
    step=0.05,
    description='test_size',
    readout_format='.2f',
    continuous_update=False,
)
random_state_sl = widgets.IntSlider(
    value=42,
    min=0,
    max=99,
    step=1,
    description='random_state',
    continuous_update=False,
)
threshold_sl = widgets.FloatSlider(
    value=0.5,
    min=0.1,
    max=0.9,
    step=0.05,
    description='threshold',
    readout_format='.2f',
    continuous_update=False,
)
run_btn = widgets.Button(description='Train and evaluate', button_style='info')
out = widgets.Output()


def run_eval(_):
    """Button callback: split the data, run the selected model, show results.

    Reads the current widget values (model, test size, random state,
    threshold), performs a stratified train/test split of the module-level
    ``X_ALL``/``y_ALL``, and renders a metric table, a confusion-matrix
    heatmap and a short interpretation into the ``out`` widget.

    Args:
        _: The clicked button (unused).
    """
    with out:
        out.clear_output()

        X_train, X_test, y_train, y_test = train_test_split(
            X_ALL, y_ALL,
            test_size=test_size_sl.value,
            random_state=random_state_sl.value,
            stratify=y_ALL,
        )

        if model_dd.value == 'rule_baseline':
            # The threshold slider has no effect here: the rule yields hard labels.
            y_pred, y_prob, explanation = symbolic_rule_predict(X_train, X_test, DATA_INFO)
            display(Markdown(f'**Symbolische Baseline:** {explanation}'))
        else:
            y_pred, y_prob = train_predict_ml(model_dd.value, X_train, X_test, y_train, threshold_sl.value)
            display(Markdown(f'**ML Modell:** `{model_dd.value}`, threshold={threshold_sl.value:.2f}'))

        m = compute_metrics(y_test, y_pred, y_prob)
        # Pin the label order so the matrix is always 2x2 with rows/columns
        # ordered as class 0, class 1 - even if one class never occurs.
        cm = confusion_matrix(y_test, y_pred, labels=[0, 1])

        metric_names = ('accuracy', 'precision', 'recall', 'f1', 'roc_auc')
        metric_df = pd.DataFrame(
            [{'metric': name, 'value': m[name]} for name in metric_names]
        )
        display(metric_df)

        plot_confusion_matrix(cm, title='Confusion Matrix')

        # Rows are true classes, columns are predicted classes.
        fp = cm[0, 1]
        fn = cm[1, 0]
        interpretation = [
            '- Hoher TP und TN sind gut fuer Gesamtleistung.',
            f'- FP={fp}: falsche Alarme (Precision sinkt bei vielen FP).',
            f'- FN={fn}: verpasste Positive (Recall sinkt bei vielen FN).',
        ]
        display(Markdown('**Kurzinterpretation:**\n' + '\n'.join(interpretation)))


# Hook the evaluation callback to the button, then show all controls stacked
# vertically with the output area at the bottom.
run_btn.on_click(run_eval)

display(
    widgets.VBox(
        [model_dd, test_size_sl, random_state_sl, threshold_sl, run_btn, out]
    )
)


## 4) Datensplitting erklaert (Train Dev Test, optional CV)
- Train: auf diesen Daten lernt das Modell seine Parameter.
- Dev/Validation: hier werden Modellwahl und Hyperparameter abgestimmt.
- Test: nur fuer die finale, unverzerrte Leistungsmessung.
- Ohne saubere Trennung drohen ein Datenleck (Data Leakage) und eine ueberoptimistische Bewertung.
- Cross-Validation ersetzt optional den festen Dev-Split durch mehrere Folds.


## 5) Confusion Matrix und Metriken erklaert
### Was misst welche Metrik
- Accuracy: Anteil aller korrekten Vorhersagen.
- Precision: Anteil korrekter Positivvorhersagen unter allen Positivvorhersagen.
- Recall: Anteil gefundener Positiver unter allen tatsaechlich Positiven.
- F1: harmonisches Mittel aus Precision und Recall.
- ROC AUC: Trennschaerfe ueber alle moeglichen Thresholds.


In [None]:
# Mini-Rechnung aus TP, FP, TN, FN mit tabellarischer Ausgabe.

def metrics_from_counts(tp, fp, tn, fn):
    """Compute accuracy, precision, recall and F1 from confusion-matrix counts.

    Args:
        tp: Number of true positives.
        fp: Number of false positives.
        tn: Number of true negatives.
        fn: Number of false negatives.

    Returns:
        A tuple ``(accuracy, precision, recall, f1)``. Any metric whose
        denominator is zero is reported as 0.0 instead of raising.
    """
    total = tp + fp + tn + fn
    # Guard the empty case like the other metrics do, so all-zero counts
    # yield 0.0 rather than a ZeroDivisionError.
    acc = (tp + tn) / total if total else 0.0
    prec = tp / (tp + fp) if (tp + fp) else 0.0
    rec = tp / (tp + fn) if (tp + fn) else 0.0
    f1 = (2 * prec * rec / (prec + rec)) if (prec + rec) else 0.0
    return acc, prec, rec, f1

# Example counts that together form a 2x2 confusion matrix.
TP, FP, TN, FN = 42, 8, 35, 15
acc, prec, rec, f1 = metrics_from_counts(TP, FP, TN, FN)

# One table: the four raw counts first, then the derived metrics rounded to
# four decimals.
mini_df = pd.DataFrame(
    [
        {'symbol': symbol, 'value': value}
        for symbol, value in (
            ('TP', TP),
            ('FP', FP),
            ('TN', TN),
            ('FN', FN),
            ('accuracy', round(acc, 4)),
            ('precision', round(prec, 4)),
            ('recall', round(rec, 4)),
            ('f1', round(f1, 4)),
        )
    ]
)
display(mini_df)


## 6) Aufsteigende Erweiterungen
- a) Klassenungleichgewicht: Zielverteilung zeigt, warum Accuracy taeuschen kann.
- b) Threshold-Tradeoff: Precision/Recall verschieben sich mit dem Grenzwert.
- c) Multi-Seed: Stabilitaet ueber mehrere Zufallssplits statt Einzelwert.


In [None]:
# 6a) Make the class imbalance visible.
class_counts = y_ALL.value_counts().sort_index()
# Size of the smallest class relative to the largest one (1.0 = balanced).
ratio = class_counts.min() / class_counts.max()

plt.figure(figsize=(5.2, 3.6))
plt.bar(
    list(map(str, class_counts.index)),
    class_counts.values,
    color=['tab:blue', 'tab:orange'],
)
plt.ylabel('Anzahl')
plt.xlabel('Klasse')
plt.title('Klassenungleichgewicht (Zielverteilung)')
plt.tight_layout()
plt.show()

print(f'Min/Max Verhaeltnis: {ratio:.3f}')
print('Hinweis: Bei starkem Ungleichgewicht kann hohe Accuracy trotz schlechter Recall entstehen.')


In [None]:
# 6b) Threshold trade-off as a table, based on a logistic regression model.
X_train, X_test, y_train, y_test = train_test_split(
    X_ALL,
    y_ALL,
    test_size=0.2,
    random_state=42,
    stratify=y_ALL,
)
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train, y_train)
# Probability of the positive class for each test sample.
proba = logreg.predict_proba(X_test)[:, 1]

rows = []
for thr in [0.2, 0.35, 0.5, 0.65, 0.8]:
    pred = (proba >= thr).astype(int)
    rows.append(
        {
            'threshold': thr,
            **{
                label: scorer(y_test, pred, zero_division=0)
                for label, scorer in (
                    ('precision', precision_score),
                    ('recall', recall_score),
                    ('f1', f1_score),
                )
            },
        }
    )

tradeoff_df = pd.DataFrame(rows)
display(tradeoff_df)


In [None]:
# 6c) Multi-Seed-Evaluation ueber 5 Seeds mit Mittelwerten.

def evaluate_one_seed(seed):
    """Train and score a logistic regression on one stratified 80/20 split.

    Args:
        seed: ``random_state`` used for the train/test split.

    Returns:
        A dict with the key 'seed' plus 'accuracy', 'precision', 'recall',
        'f1' and 'roc_auc' as computed on the test part at threshold 0.5.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X_ALL,
        y_ALL,
        test_size=0.2,
        random_state=seed,
        stratify=y_ALL,
    )
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)

    proba = model.predict_proba(X_test)[:, 1]
    pred = (proba >= 0.5).astype(int)
    m = compute_metrics(y_test, pred, proba)

    row = {'seed': seed}
    for key in ('accuracy', 'precision', 'recall', 'f1', 'roc_auc'):
        row[key] = m[key]
    return row

# Evaluate seeds 0..4 and append one extra row holding the column means.
seed_rows = list(map(evaluate_one_seed, range(5)))
seed_df = pd.DataFrame(seed_rows)

mean_row = {
    'seed': 'mean',
    **{c: seed_df[c].mean() for c in ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']},
}

result_df = pd.concat([seed_df, pd.DataFrame([mean_row])], ignore_index=True)
display(result_df)


## 7) Mini Leitfaden (7 bis 10 Minuten)
- Minute 0-1: Datensatzquelle pruefen (Titanic oder Fallback) und Zielverteilung ansehen.
- Minute 1-3: Warm-up mit `rule_baseline` laufen lassen, Metriken lesen.
- Minute 3-5: auf `logistic_regression` wechseln und Confusion Matrix vergleichen.
- Minute 5-7: threshold variieren und Precision/Recall-Tradeoff beobachten.
- Minute 7-9: Erweiterung Multi-Seed lesen und Stabilitaet statt Einzelwert einordnen.
- Minute 9-10: festhalten, wann symbolische Baseline reicht und wann ML noetig ist.


## Mini Uebungen
1. Passe die symbolische Titanic-Regel leicht an und pruefe den Effekt auf Precision/Recall.
2. Suche einen Threshold mit Recall > 0.90 und notiere den Precision-Verlust.
3. Vergleiche `decision_tree` und `logistic_regression` bei identischem Split.
4. Erhoehe test_size auf 0.4 und bewerte die Stabilitaet der Metriken.
5. Erweitere die Multi-Seed-Tabelle um Standardabweichungen.
