<a href="https://colab.research.google.com/github/Siddheshdumre/Capstone-P59/blob/main/train_german.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [26]:
!pip install -q joblib scikit-learn pandas numpy tqdm
import os
import numpy as np
import pandas as pd
import joblib
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score


In [27]:
DATA_PATH = "/content/drive/MyDrive/datasets/german_credit_risk.csv"  # Path to your uploaded dataset
TARGET = "Risk"                                                      # Target column name
POS_LABEL = "good"                                                   # Positive class label (check in CSV)
SENSITIVE = "Sex"                                                    # Protected attribute
PRIVILEGED = "male"                                                  # Privileged group
OUTPUT_DIR = "/content/drive/MyDrive/model_outputs/german_credit_risk"           # Output directory
os.makedirs(OUTPUT_DIR, exist_ok=True)


In [28]:
df = pd.read_csv(DATA_PATH)


In [29]:
df.head()

Unnamed: 0.1,Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,0,67,male,2,own,,little,1169,6,radio/TV,good
1,1,22,female,2,own,little,moderate,5951,48,radio/TV,bad
2,2,49,male,1,own,little,,2096,12,education,good
3,3,45,male,2,free,little,little,7882,42,furniture/equipment,good
4,4,53,male,2,free,little,little,4870,24,car,bad


In [30]:
print("Dataset shape:", df.shape)


Dataset shape: (1000, 11)


In [31]:
y = (df[TARGET] == POS_LABEL).astype(int)
X = df.drop(columns=[TARGET])


In [32]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [33]:
num_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
cat_cols = [c for c in X.columns if c not in num_cols]

# Handle sklearn version differences
if sklearn.__version__ >= "1.4":
    cat_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
else:
    cat_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

num_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])
cat_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', cat_encoder)
])

preproc = ColumnTransformer([
    ('num', num_pipe, num_cols),
    ('cat', cat_pipe, cat_cols)
], remainder='drop')


In [34]:
models = {
    "lr": LogisticRegression(max_iter=500, solver='liblinear'),
    "rf": RandomForestClassifier(n_estimators=100, random_state=42),
    "mlp": MLPClassifier(hidden_layer_sizes=(64, 32), max_iter=300, random_state=42)
}


In [35]:
def logits_from_proba(p):
    p = np.clip(p, 1e-9, 1 - 1e-9)
    return np.log(p / (1 - p))



In [36]:
metrics = []

for name, model in models.items():
    print(f"\n🔹 Training model: {name.upper()}")

    # Build pipeline
    pipe = Pipeline([('preproc', preproc), ('clf', model)])
    pipe.fit(X_train, y_train)

    # Predictions
    y_pred = pipe.predict(X_test)
    proba = pipe.predict_proba(X_test)[:, 1]
    logits = logits_from_proba(proba)

    # Save model
    joblib.dump(pipe, f"{OUTPUT_DIR}/{name}.joblib")

    # Save predictions
    results_df = pd.DataFrame({
        "y_true": y_test,
        "y_pred": y_pred,
        "proba": proba,
        "logit": logits,
        SENSITIVE: X_test[SENSITIVE].values
    })
    results_df.to_csv(f"{OUTPUT_DIR}/{name}_preds.csv", index=False)

    # Compute metrics
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc = roc_auc_score(y_test, proba)

    metrics.append([name, acc, f1, auc])
    print(f"✅ {name.upper()} done → Accuracy: {acc:.3f}, F1: {f1:.3f}, AUC: {auc:.3f}")



🔹 Training model: LR
✅ LR done → Accuracy: 0.705, F1: 0.810, AUC: 0.663

🔹 Training model: RF
✅ RF done → Accuracy: 0.705, F1: 0.805, AUC: 0.684

🔹 Training model: MLP
✅ MLP done → Accuracy: 0.695, F1: 0.789, AUC: 0.627




In [37]:
metrics_df = pd.DataFrame(metrics, columns=["model", "accuracy", "f1", "auc"])
metrics_df.to_csv(f"{OUTPUT_DIR}/metrics_summary.csv", index=False)

print(f"\n🎯 Training complete for German Credit dataset!")
print(f"📁 Results saved in: {OUTPUT_DIR}")



🎯 Training complete for German Credit dataset!
📁 Results saved in: /content/drive/MyDrive/model_outputs/german_credit_risk
