In [7]:
from tensorflow.keras.models import load_model

# Path to the saved Keras model file.
# NOTE(review): absolute, machine-specific path — the notebook will not run
# unchanged on another machine; consider a configurable base directory.
model_path = 'C:/Users/JonasNiehus/Documents/Masterarbeit/Evaluation/german_credit_model.keras'

# Load the model from disk.
loaded_model = load_model(model_path)

print("Modell wurde erfolgreich geladen.")


Modell wurde erfolgreich geladen.


In [8]:
import numpy as np
import pandas as pd


def _predict_label_and_proba(model, preprocessor, row_df, threshold=0.5):
    """
    Nimmt eine einzelne Zeile (DataFrame mit 1 Row), wendet den Preprocessor an,
    gibt (label:int, proba:float) zurück.
    """
    Xp = preprocessor.transform(row_df)
    proba = float(model.predict(Xp, verbose=0)[0, 0])
    label = int(proba >= threshold)
    return label, proba

def _numeric_grid_around(value, lo, hi, n_steps=9, strategy="quantiles"):
    """
    Erzeugt Testwerte (ohne Originalwert) in einem sinnvollen Bereich.
    - strategy="quantiles": nutzt [lo, hi] als Quantilsgrenzen (z.B. 5%–95%)
    - strategy="relative":  nutzt multiplikative Deltas um den Wert herum
    """
    if strategy == "quantiles":
        grid = np.linspace(lo, hi, n_steps)
        # Originalwert rausfiltern, kleine numerische Toleranz
        grid = [v for v in grid if abs(v - value) > 1e-12]
        return grid

    elif strategy == "relative":
        deltas = np.array([-0.3, -0.2, -0.1, 0.1, 0.2, 0.3])  # anpassbar
        grid = [(1 + d) * value for d in deltas]
        return grid

    else:
        raise ValueError(f"Unknown strategy {strategy}")

def alpha_test_numeric(model, preprocessor, df_row, feature, bounds, threshold=0.5,
                       n_steps=9, strategy="quantiles", clip_min=None):
    """
    Test whether a NUMERIC feature is necessary for ONE profile.

    The feature is replaced, one candidate at a time, by the values produced by
    ``_numeric_grid_around``. The feature counts as necessary if at least one
    candidate changes the predicted label; among all label-changing candidates
    the one closest to the original value is reported as the counterfactual.

    Args:
        model, preprocessor: passed through to ``_predict_label_and_proba``.
        df_row: 1-row DataFrame with the original columns.
        feature: name of the (numeric) column to vary.
        bounds: ``(lo, hi)``, e.g. training quantiles (5%, 95%).
        threshold: classification threshold.
        n_steps: grid size for ``strategy="quantiles"``.
        strategy: ``"quantiles"`` or ``"relative"``.
        clip_min: optional lower bound; candidates below it are raised to it.

    Returns:
        dict(is_necessary: bool, original_label: int, original_proba: float,
             cf_value: float | None, cf_label: int | None, cf_proba: float | None)
    """
    # Prediction for the unmodified profile
    orig_label, orig_proba = _predict_label_and_proba(model, preprocessor, df_row, threshold)

    lo, hi = bounds
    value = float(df_row[feature].iloc[0])
    test_values = _numeric_grid_around(value, lo, hi, n_steps=n_steps, strategy=strategy)

    best = None  # (abs(change), new_value, new_label, new_proba)
    already_tried = set()
    for v in test_values:
        v_clip = float(max(v, clip_min) if clip_min is not None else v)

        # Clipping can map a candidate onto the original value or onto an
        # earlier candidate; neither can produce new information, so skip the
        # redundant model call.
        if v_clip == value or v_clip in already_tried:
            continue
        already_tried.add(v_clip)

        mod_row = df_row.copy()
        # Replace the whole (single-row) column instead of writing one cell:
        # a cell-wise write of a float into an integer column is a deprecated
        # dtype-incompatible setitem in pandas.
        mod_row[feature] = v_clip

        new_label, new_proba = _predict_label_and_proba(model, preprocessor, mod_row, threshold)

        if new_label != orig_label:
            change = abs(v_clip - value)
            if (best is None) or (change < best[0]):
                best = (change, v_clip, new_label, new_proba)

    if best is None:
        cf_value = cf_label = cf_proba = None
    else:
        _, cf_value, cf_label, cf_proba = best

    return {
        "is_necessary": best is not None,
        "original_label": orig_label,
        "original_proba": orig_proba,
        "cf_value": cf_value,
        "cf_label": cf_label,
        "cf_proba": cf_proba
    }

def alpha_test_categorical(model, preprocessor, df_row, feature, all_categories, threshold=0.5):
    """
    Test whether a CATEGORICAL feature is necessary for ONE profile.

    Every category other than the current one is substituted in turn; the
    search stops at the first substitution that changes the predicted label.

    Args:
        model, preprocessor: passed through to ``_predict_label_and_proba``.
        df_row: 1-row DataFrame.
        feature: name of the (categorical) column to vary.
        all_categories: categories to try (e.g. all seen during training).
        threshold: classification threshold.

    Returns:
        Same dict layout as ``alpha_test_numeric``; ``cf_value`` holds the new
        category when a label change was found, otherwise ``None``.
    """
    base_label, base_proba = _predict_label_and_proba(model, preprocessor, df_row, threshold)
    present_category = df_row[feature].iloc[0]
    row_key = df_row.index[0]

    # Start from the "not necessary" outcome and overwrite it on the first flip.
    outcome = {
        "is_necessary": False,
        "original_label": base_label,
        "original_proba": base_proba,
        "cf_value": None,
        "cf_label": None,
        "cf_proba": None
    }

    alternatives = (c for c in all_categories if not (c == present_category))
    for candidate in alternatives:
        variant = df_row.copy()
        variant.at[row_key, feature] = candidate

        flipped_label, flipped_proba = _predict_label_and_proba(
            model, preprocessor, variant, threshold
        )
        if flipped_label == base_label:
            continue

        # The first label change is enough to call the feature necessary.
        outcome["is_necessary"] = True
        outcome["cf_value"] = candidate   # the new category
        outcome["cf_label"] = flipped_label
        outcome["cf_proba"] = flipped_proba
        break

    return outcome

# ==================================================
#   Run ALPHA over the 10×6 grid (profiles × features)
# ==================================================

def run_alpha_for_profiles(
    model, preprocessor, df_all, profiles_df,  # df_all = full training DF (for quantiles & categories)
    features_numeric, features_categorical,
    threshold=0.5, q_lo=0.05, q_hi=0.95,
    n_steps=9, num_strategy="quantiles"
):
    """
    Run the alpha test (necessity) over several profiles × features.

    Args:
        model, preprocessor: model and preprocessor handed to the per-feature tests.
        df_all: full training DataFrame (original encoding); used to derive the
            quantile bounds and the category lists.
        profiles_df: DataFrame with the profiles to test (original encoding).
        features_numeric, features_categorical: lists of feature names.
        threshold: classification threshold.
        q_lo, q_hi: quantiles bounding the numeric test range (e.g. 5%–95%).
        n_steps, num_strategy: grid parameters for the numeric variation.

    Returns:
        results_table: DataFrame (profiles × features) with 0/1
            (not necessary / necessary), indexed by ``profile_index``.
        details: dict keyed by ``(profile_index, feature)`` with the full
            result dict of the respective test (counterfactual, probabilities).

    Side effects:
        Prints the share of necessary cases per feature.
    """

    # --- 1) Preparation: quantile ranges & category lists from training data ---
    num_bounds = {
        feat: (float(df_all[feat].quantile(q_lo)), float(df_all[feat].quantile(q_hi)))
        for feat in features_numeric
    }
    # Only test categories that were observed in the training data.
    cat_values = {
        feat: sorted(df_all[feat].dropna().unique().tolist())
        for feat in features_categorical
    }

    # --- 2) Result containers ---
    rows = []
    details = {}  # details[(profile_idx, feature)] = dict(...)

    # --- 3) Loop: profiles × features ---
    # Rows are picked by POSITION: label-based `.loc[[i]]` returns several rows
    # when index labels repeat, and only the first of them would be evaluated.
    for pos, i in enumerate(profiles_df.index):
        row = profiles_df.iloc[[pos]]  # 1-row DataFrame
        row_result = {}

        # Numeric features
        for feat in features_numeric:
            res = alpha_test_numeric(
                model, preprocessor, row, feature=feat, bounds=num_bounds[feat],
                threshold=threshold, n_steps=n_steps, strategy=num_strategy, clip_min=0.0
            )
            row_result[feat] = int(res["is_necessary"])
            details[(i, feat)] = res

        # Categorical features
        for feat in features_categorical:
            res = alpha_test_categorical(
                model, preprocessor, row, feature=feat,
                all_categories=cat_values[feat], threshold=threshold
            )
            row_result[feat] = int(res["is_necessary"])
            details[(i, feat)] = res

        # Collect the row together with its profile id
        row_result["profile_index"] = i
        rows.append(row_result)

    # --- 4) Build the table (profiles × features) ---
    # Explicit columns keep `set_index` working when there are no profiles at
    # all (an empty `rows` list would otherwise yield a frame without the
    # "profile_index" column and raise KeyError).
    columns = list(dict.fromkeys([*features_numeric, *features_categorical, "profile_index"]))
    results_table = (
        pd.DataFrame(rows, columns=columns)
        .set_index("profile_index")
        .sort_index()
    )

    # --- 5) Alpha per feature (share of necessary cases) ---
    # Cast to float so the mean is also well-defined for an empty table.
    alpha_per_feature = results_table.astype(float).mean(axis=0).rename("alpha_rate")
    print("\nα (Notwendigkeit) – Anteil notwendiger Fälle pro Feature:")
    print(alpha_per_feature.sort_values(ascending=False).to_string())

    return results_table, details


In [12]:
import os
import numpy as np
import pandas as pd

# Economic features (12) — the list order is the row order used in the report tables.
econ_features_all = [
    "Kreditbetrag",
    "Kreditgeschichte",
    "Kreditverwendungszweck",
    "Dauer_in_Monaten",
    "Ratenhöhe",
    "Sparkonto_Wertpapiere",
    "Vermögen",
    "Status_des_Girokontos",
    "Beschäftigt_seit",
    "Andere_Ratenverpflichtungen",
    "Weitere_Bürgen_Schuldner",
    "Anzahl_bestehender_Kredite",
]

# Sociodemographic / other features (8) — same ordering convention as above.
socio_features_all = [
    "Alter",
    "Familienstand_Geschlecht",
    "Wohnsitzdauer",
    "Wohnsituation",
    "Unterhaltspflichtige_Personen",
    "Telefon",
    "Ausländischer_Arbeiter",
    "Beruf",
]


def _feature_detail_table(alpha, beta, pid, feats, suff_threshold, round_beta):
    """Build one profile's detail table: feature, alpha flag, rounded beta, sufficiency flag."""
    beta_raw = beta.loc[pid, feats].astype(float).values
    table = pd.DataFrame({
        "Feature": feats,
        "alpha_notwendig": alpha.loc[pid, feats].astype(int).values,
        "beta": np.round(beta_raw, round_beta),
    })
    # The flag is derived from the UNROUNDED beta so it always agrees with the
    # overview counts; rounding first could lift e.g. 0.69996 over a 0.7 threshold.
    table["hinreichend (β≥{:.2f})".format(suff_threshold)] = (beta_raw >= suff_threshold).astype(int)
    return table


def build_two_stage_profile_report_md(
    alpha_econ: pd.DataFrame,
    beta_econ: pd.DataFrame,
    econ_features_all: list,              # ordering of the economic features
    alpha_socio: pd.DataFrame,
    beta_socio: pd.DataFrame,
    socio_features_all: list,             # ordering of the sociodemographic features
    suff_threshold: float = 0.7,
    round_beta: int = 4,
    out_path: str = 'C:/Users/JonasNiehus/Documents/Masterarbeit/Evaluation/Ergebnisse/two_stage_profile_report.md'
):
    """
    Write a two-stage (economic vs. sociodemographic) alpha/beta profile report as Markdown.

    The report contains a header, an overview table with per-profile counts of
    necessary (alpha) and sufficient (beta >= ``suff_threshold``) features for
    each stage, and per-profile detail tables for both stages.

    Args:
        alpha_econ, beta_econ: profiles × features matrices for the economic stage.
        econ_features_all: economic feature names in report order; only those
            present in both economic matrices are used.
        alpha_socio, beta_socio: profiles × features matrices for the
            sociodemographic stage.
        socio_features_all: sociodemographic feature names in report order;
            only those present in both sociodemographic matrices are used.
        suff_threshold: a feature counts as sufficient when beta >= this value.
        round_beta: number of decimals shown for beta in the detail tables.
        out_path: target Markdown file; its parent directory is created if needed.

    Raises:
        TypeError: if any of the four matrices is not a DataFrame.
        ValueError: if the four indexes are not identical, or if none of the
            listed features is found in the matrices.

    Returns:
        None. The report is written to ``out_path`` and a confirmation is printed.
    """
    # --- Sanity checks ---
    for name, obj in [
        ("alpha_econ", alpha_econ), ("beta_econ", beta_econ),
        ("alpha_socio", alpha_socio), ("beta_socio", beta_socio)
    ]:
        if not isinstance(obj, pd.DataFrame):
            raise TypeError(f"{name} ist kein DataFrame, sondern {type(obj)}")

    # All four matrices must describe the same profiles (same index, same order)
    idx_ref = alpha_econ.index
    if not (idx_ref.equals(beta_econ.index) and idx_ref.equals(alpha_socio.index) and idx_ref.equals(beta_socio.index)):
        raise ValueError("Die Indexmengen (Profile) von alpha_econ/beta_econ/alpha_socio/beta_socio sind nicht identisch.")

    # Features that are actually available (intersection with the matrices)
    econ_feats = [f for f in econ_features_all if f in alpha_econ.columns and f in beta_econ.columns]
    socio_feats = [f for f in socio_features_all if f in alpha_socio.columns and f in beta_socio.columns]

    if not econ_feats and not socio_feats:
        raise ValueError("Weder ökonomische noch soziodemographische Features wurden in den Matrizen gefunden.")

    md = []
    md.append("# Profilbericht (zweistufig): α/β je Profil – Ökonomisch vs. Soziodemographisch\n")
    md.append(f"- **Anzahl Profile:** {len(idx_ref)}")
    md.append(f"- **Hinreichend-Schwelle:** β ≥ {suff_threshold}")
    md.append(f"- **Ökonomische Features:** {', '.join(econ_feats) if econ_feats else '(keine)'}")
    md.append(f"- **Soziodemographische Features:** {', '.join(socio_feats) if socio_feats else '(keine)'}\n")

    # --- Overview per profile (aggregate counts) ---
    overview_rows = []
    for pid in idx_ref:
        row = {"profile_index": pid}
        if econ_feats:
            row["econ_alpha_cnt"] = int(alpha_econ.loc[pid, econ_feats].sum())
            row["econ_hinr_cnt"]  = int((beta_econ.loc[pid, econ_feats] >= suff_threshold).sum())
        else:
            row["econ_alpha_cnt"] = 0
            row["econ_hinr_cnt"]  = 0

        if socio_feats:
            row["socio_alpha_cnt"] = int(alpha_socio.loc[pid, socio_feats].sum())
            row["socio_hinr_cnt"]  = int((beta_socio.loc[pid, socio_feats] >= suff_threshold).sum())
        else:
            row["socio_alpha_cnt"] = 0
            row["socio_hinr_cnt"]  = 0

        overview_rows.append(row)

    overview_df = pd.DataFrame(overview_rows).set_index("profile_index")
    md.append("## Überblick (Anzahl notwendiger / hinreichender Features je Stufe)\n")
    md.append(overview_df.to_markdown())
    md.append("\n---\n")

    # --- Details per profile: two tables (economic / sociodemographic) ---
    md.append("## Details je Profil\n")
    for pid in idx_ref:
        md.append(f"\n### Profil {pid}\n")

        if econ_feats:
            df_e = _feature_detail_table(alpha_econ, beta_econ, pid, econ_feats, suff_threshold, round_beta)
            md.append("\n**Ökonomische Features**\n")
            md.append(df_e.to_markdown(index=False))

        if socio_feats:
            df_s = _feature_detail_table(alpha_socio, beta_socio, pid, socio_feats, suff_threshold, round_beta)
            md.append("\n**Soziodemographische/sonstige Features**\n")
            md.append(df_s.to_markdown(index=False))

        md.append("\n---")

    # --- Write the file ---
    # A bare filename has an empty dirname, and os.makedirs("") raises; only
    # create the directory when there is one.
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(out_path, "w", encoding="utf-8") as f:
        f.write("\n".join(md))
    print(f"💾 Zweistufiger Profilbericht gespeichert unter: {out_path}")


# ===== Call with the objects already present in the kernel =====
# Assumes alpha_econ, beta_econ, alpha_socio, beta_socio, econ_features_all and socio_features_all exist.
# NOTE(review): alpha_econ / beta_econ / alpha_socio / beta_socio are not defined in the
# cells visible here — presumably they come from earlier cells; confirm the notebook
# survives "Restart Kernel -> Run All".
build_two_stage_profile_report_md(
    alpha_econ=alpha_econ,
    beta_econ=beta_econ,
    econ_features_all=econ_features_all,      # <- corrected here (name is econ_features_all)
    alpha_socio=alpha_socio,
    beta_socio=beta_socio,
    socio_features_all=socio_features_all,
    suff_threshold=0.7,
    round_beta=4,
    out_path='C:/Users/JonasNiehus/Documents/Masterarbeit/Evaluation/Ergebnisse/profilbericht_alpha_beta_twostage.md'
)


💾 Zweistufiger Profilbericht gespeichert unter: C:/Users/JonasNiehus/Documents/Masterarbeit/Evaluation/Ergebnisse/profilbericht_alpha_beta_twostage.md
