In [37]:
# Corrected pipeline for German dataset experiments (no leakage, train-only resampling)
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix, roc_auc_score
import shap
import joblib
import warnings
warnings.filterwarnings("ignore")

In [39]:
RND = 42

In [41]:
# ---------- 1. Load and basic preprocessing ----------
data = pd.read_csv("german_credit_data (1).csv")

In [43]:
# Quick sanity check
print("Raw shape:", data.shape)
print(data.head(3))

Raw shape: (1000, 11)
   Unnamed: 0  Age     Sex  Job Housing Saving accounts Checking account  \
0           0   67    male    2     own             NaN           little   
1           1   22  female    2     own          little         moderate   
2           2   49    male    1     own          little              NaN   

   Credit amount  Duration    Purpose  Risk  
0           1169         6   radio/TV  good  
1           5951        48   radio/TV   bad  
2           2096        12  education  good  


In [45]:
# Encode categorical features (label encoding for simplicity; one-hot optionally)
label_encoders = {}
for col in data.select_dtypes(include="object").columns:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col].astype(str))
    label_encoders[col] = le

In [47]:
# Separate features and target - ensure the target column name matches your file
TARGET_COL = "Risk"   # update if different
X_df = data.drop(columns=[TARGET_COL])
y = data[TARGET_COL].values

# ---------- 2. Single stratified split (fixed for fair comparisons) ----------
X_train_df, X_test_df, y_train, y_test = train_test_split(
    X_df, y, test_size=0.2, stratify=y, random_state=RND
)
print("\nOriginal class counts (full dataset):")
print(pd.Series(y).value_counts())
print("\nTrain counts:")
print(pd.Series(y_train).value_counts())
print("\nTest counts:")
print(pd.Series(y_test).value_counts())


Original class counts (full dataset):
1    700
0    300
Name: count, dtype: int64

Train counts:
1    560
0    240
Name: count, dtype: int64

Test counts:
1    140
0     60
Name: count, dtype: int64


In [49]:
# ---------- 3. Scale: fit scaler on train only ----------
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_df)
X_test = scaler.transform(X_test_df)

In [51]:
# Save scaler for reproducibility
joblib.dump(scaler, "scaler_german.joblib")

['scaler_german.joblib']

In [53]:
# Utility: compute specificity
def specificity_score(y_true, y_pred):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    return tn / (tn + fp) if (tn + fp) > 0 else 0.0

In [55]:
# ---------- 4. Define resampling strategies ----------
def resample_data(X_train, y_train, strategy):
    """
    strategy: one of
      'no_resample', 'undersample_1to1', 'undersample_30pct_majority',
      'undersample_30pct_both', 'smote', 'smoteenn'
    Returns X_train_res, y_train_res, and a string description.
    """
    if strategy == "no_resample":
        return X_train, y_train, "No resampling"
    elif strategy == "undersample_1to1":
        # downsample majority to minority count
        rus = RandomUnderSampler(sampling_strategy="auto", random_state=RND)
        Xr, yr = rus.fit_resample(X_train, y_train)
        return Xr, yr, "Random undersample to 1:1"
    elif strategy == "undersample_30pct_majority":
        # keep all minority, sample 30% of majority
        counts = pd.Series(y_train).value_counts()
        maj_label = counts.idxmax()
        min_label = counts.idxmin()
        n_maj = counts[maj_label]
        n_min = counts[min_label]
        target_maj = int(0.30 * n_maj)
        sampling = {maj_label: target_maj, min_label: n_min}
        rus = RandomUnderSampler(sampling_strategy=sampling, random_state=RND)
        Xr, yr = rus.fit_resample(X_train, y_train)
        return Xr, yr, "Keep 30% of majority; keep all minority"
    elif strategy == "undersample_30pct_both":
        # sample 30% of each class from training set (drastic downsample)
        frac = 0.3
        df = pd.DataFrame(X_train)
        df['y'] = y_train
        df_down = df.groupby('y', group_keys=False).apply(lambda x: x.sample(frac=frac, random_state=RND))
        yr = df_down['y'].values
        Xr = df_down.drop(columns=['y']).values
        return Xr, yr, "Sample 30% of both classes"
    elif strategy == "smote":
        sm = SMOTE(random_state=RND)
        Xr, yr = sm.fit_resample(X_train, y_train)
        return Xr, yr, "SMOTE oversampling"
    elif strategy == "smoteenn":
        sm_enn = SMOTEENN(random_state=RND)
        Xr, yr = sm_enn.fit_resample(X_train, y_train)
        return Xr, yr, "SMOTE-ENN hybrid"
    else:
        raise ValueError("Unknown strategy: " + str(strategy))

In [57]:
# ---------- 5. Models to evaluate ----------
models = {
    "Random Forest": RandomForestClassifier(random_state=RND, n_estimators=200),
    "AdaBoost": AdaBoostClassifier(random_state=RND, n_estimators=100),
    "XGBoost": XGBClassifier(random_state=RND, use_label_encoder=False, eval_metric="logloss", n_estimators=200),
    "LightGBM": LGBMClassifier(random_state=RND, n_estimators=200),
    "CART": DecisionTreeClassifier(random_state=RND),
    "Logistic Regression": LogisticRegression(max_iter=500, random_state=RND),
    "MLP": MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, random_state=RND),
}

strategies = [
    "no_resample",
    "undersample_1to1",
    "undersample_30pct_majority",
    "undersample_30pct_both",
    "smote",
    "smoteenn",
]

results = [] 

In [59]:
# ---------- 6. Run experiments ----------
for strat in strategies:
    Xr, yr, desc = resample_data(X_train, y_train, strat)
    print("\n=== Strategy:", strat, "|", desc)
    print("Resampled train shape:", Xr.shape)
    print("Resampled class counts:\n", pd.Series(yr).value_counts())

    for name, model in models.items():
        # fit model on the resampled training set
        model.fit(Xr, yr)
        # evaluate on untouched X_test
        y_pred = model.predict(X_test)
        if hasattr(model, "predict_proba"):
            y_proba = model.predict_proba(X_test)[:, 1]
            auc = roc_auc_score(y_test, y_proba)
        else:
            auc = np.nan
        acc = accuracy_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        spec = specificity_score(y_test, y_pred)
        cm = confusion_matrix(y_test, y_pred)
        # print per-model summary (concise)
        print(f"{name}: Acc={acc:.3f}, Recall={recall:.3f}, Spec={spec:.3f}, F1={f1:.3f}, AUC={auc:.3f}")
        results.append({
            "strategy": strat,
            "strategy_desc": desc,
            "model": name,
            "acc": acc,
            "recall": recall,
            "specificity": spec,
            "precision": precision,
            "f1": f1,
            "auc": auc,
            "confusion_matrix": cm,
            "resampled_train_n": Xr.shape[0],
            "resampled_train_counts": dict(pd.Series(yr).value_counts())
        })


=== Strategy: no_resample | No resampling
Resampled train shape: (800, 10)
Resampled class counts:
 1    560
0    240
Name: count, dtype: int64
Random Forest: Acc=0.770, Recall=0.921, Spec=0.417, F1=0.849, AUC=0.771
AdaBoost: Acc=0.770, Recall=0.886, Spec=0.500, F1=0.844, AUC=0.741
XGBoost: Acc=0.780, Recall=0.914, Spec=0.467, F1=0.853, AUC=0.764
[LightGBM] [Info] Number of positive: 560, number of negative: 240
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000990 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 617
[LightGBM] [Info] Number of data points in the train set: 800, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.700000 -> initscore=0.847298
[LightGBM] [Info] Start training from score 0.847298
LightGBM: Acc=0.755, Recall=0.900, Spec=0.417, F1=0.837, AUC=0.774
CART: Acc=0.605, Recall=0.736, Spec=0.300, F1=0.723, AUC=0.518
Logistic Regression: Acc=0.740, Rec

In [61]:
# ---------- 7. Save summary to CSV for reporting ----------
summary_df = pd.DataFrame([{
    "strategy": r["strategy"],
    "model": r["model"],
    "acc": r["acc"],
    "recall": r["recall"],
    "specificity": r["specificity"],
    "precision": r["precision"],
    "f1": r["f1"],
    "auc": r["auc"],
    "resampled_train_n": r["resampled_train_n"]
} for r in results])
summary_df.to_csv("german_experiment_summary.csv", index=False)
print("\nSaved summary to german_experiment_summary.csv")


Saved summary to german_experiment_summary.csv


In [63]:
# ---------- 8. Identify best model for SMOTE-ENN (by recall or f1) and run SHAP ----------
best_rows = [r for r in results if r["strategy"] == "smoteenn"]
# pick model with highest recall
best_by_recall = sorted(best_rows, key=lambda x: x["recall"], reverse=True)[0]
print("\nBest SMOTE-ENN model (by recall):", best_by_recall["model"], best_by_recall["recall"])


Best SMOTE-ENN model (by recall): AdaBoost 0.6857142857142857


In [65]:
# Refit that model on entire resampled training set used earlier and explain with SHAP
best_model_name = best_by_recall["model"]
best_model = models[best_model_name]
# get the resampled X_train, y_train used for smoteenn
Xr_smoteenn, yr_smoteenn, _ = resample_data(X_train, y_train, "smoteenn")
best_model.fit(Xr_smoteenn, yr_smoteenn)

In [67]:
# SHAP explanation on test set (faster & avoids leakage)
explainer = shap.TreeExplainer(best_model)
# use raw numpy arrays for SHAP
X_test_array = X_test  # already scaled numpy
shap_values = explainer.shap_values(X_test_array)
feature_names = X_df.columns.tolist()
# summary plot (this will open a matplotlib window / inline figure)
shap.summary_plot(shap_values, X_test_array, feature_names=feature_names, show=True)
# Optionally save shap plot as png using matplotlib API

InvalidModelError: Model type not yet supported by TreeExplainer: <class 'sklearn.ensemble._weight_boosting.AdaBoostClassifier'>