In [32]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score, make_scorer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import xgboost as xgb

In [34]:
df = pd.read_csv("Task 3 and 4_Loan_Data.csv")
assert set(['customer_id','credit_lines_outstanding','loan_amt_outstanding',
            'total_debt_outstanding','income','years_employed','fico_score','default']).issubset(df.columns)

X = df.drop(columns=["customer_id", "default"])
y = df["default"].astype(int)

In [36]:
X = X.fillna(X.median(numeric_only=True))

In [38]:
pos_weight = (len(y) - y.sum()) / y.sum() if y.sum() > 0 else 1.0

In [40]:
X_tr, X_te, y_tr, y_te = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [42]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
auc = make_scorer(roc_auc_score, needs_proba=True)



In [44]:
pipelines = {
    "Logistic Regression": Pipeline([
        ("scaler", StandardScaler()),
        ("clf", LogisticRegression(max_iter=500, class_weight="balanced", solver="lbfgs"))
    ]),
    "Decision Tree": Pipeline([
        ("clf", DecisionTreeClassifier(random_state=42, class_weight="balanced"))
    ]),
    "Random Forest": Pipeline([
        ("clf", RandomForestClassifier(random_state=42, n_jobs=-1, class_weight="balanced"))
    ]),
    "XGBoost": Pipeline([
        ("clf", xgb.XGBClassifier(
            objective="binary:logistic",
            n_estimators=300,
            eval_metric="logloss",
            tree_method="hist",
            n_jobs=-1,
            scale_pos_weight=float(pos_weight)
        ))
    ]),
    "SVM (RBF)": Pipeline([
        ("scaler", StandardScaler()),
        ("clf", SVC(kernel="rbf", probability=True, class_weight="balanced", random_state=42))
    ]),
}

In [46]:
param_grids = {
    "Logistic Regression": {
        "clf__C": [0.1, 1.0, 3.0, 10.0],
        "clf__penalty": ["l2"],
    },
    "Decision Tree": {
        "clf__max_depth": [3, 5, 7, None],
        "clf__min_samples_split": [2, 10, 50],
        "clf__min_samples_leaf": [1, 5, 20],
    },
    "Random Forest": {
        "clf__n_estimators": [200, 400],
        "clf__max_depth": [None, 6, 10],
        "clf__min_samples_split": [2, 20],
        "clf__min_samples_leaf": [1, 5],
        "clf__max_features": ["sqrt", "log2", 0.5],
    },
    "XGBoost": {
        "clf__max_depth": [3, 4, 6],
        "clf__learning_rate": [0.05, 0.1, 0.2],
        "clf__subsample": [0.7, 0.9, 1.0],
        "clf__colsample_bytree": [0.7, 0.9, 1.0],
        "clf__min_child_weight": [1, 5, 10],
    },
    "SVM (RBF)": {
        "clf__C": [0.5, 1.0, 3.0, 10.0],
        "clf__gamma": ["scale", 0.1, 0.01],
    },
}

In [48]:
cv_results = {}
best_models = {}

In [50]:
for name, pipe in pipelines.items():
    grid = param_grids[name]
    gs = GridSearchCV(
        estimator=pipe,
        param_grid=grid,
        scoring=auc,
        cv=cv,
        n_jobs=-1,
        verbose=0,
        refit=True
    )
    gs.fit(X_tr, y_tr)
    best_models[name] = gs.best_estimator_
    cv_results[name] = {
        "best_params": gs.best_params_,
        "cv_auc": gs.best_score_,
    }

In [51]:
holdout_auc = {}
for name, model in best_models.items():
    proba = model.predict_proba(X_te)[:, 1]
    holdout_auc[name] = roc_auc_score(y_te, proba)

In [52]:
def pick_best(cv_results, holdout_auc):
    sorted_models = sorted(cv_results.items(), key=lambda kv: kv[1]["cv_auc"], reverse=True)
    top_name, top_info = sorted_models[0]
    return top_name

In [53]:
winner_name = pick_best(cv_results, holdout_auc)
winner_model = best_models[winner_name]

In [54]:
print("=== Cross-validated AUC (5-fold) ===")
for k, v in cv_results.items():
    print(f"{k:18s}  CV AUC: {v['cv_auc']:.4f}   Best params: {v['best_params']}")

=== Cross-validated AUC (5-fold) ===
Logistic Regression  CV AUC: 1.0000   Best params: {'clf__C': 10.0, 'clf__penalty': 'l2'}
Decision Tree       CV AUC: 0.9988   Best params: {'clf__max_depth': 5, 'clf__min_samples_leaf': 5, 'clf__min_samples_split': 50}
Random Forest       CV AUC: 0.9998   Best params: {'clf__max_depth': 10, 'clf__max_features': 0.5, 'clf__min_samples_leaf': 1, 'clf__min_samples_split': 2, 'clf__n_estimators': 200}
XGBoost             CV AUC: 0.9999   Best params: {'clf__colsample_bytree': 1.0, 'clf__learning_rate': 0.2, 'clf__max_depth': 3, 'clf__min_child_weight': 1, 'clf__subsample': 0.9}
SVM (RBF)           CV AUC: 0.9999   Best params: {'clf__C': 10.0, 'clf__gamma': 0.01}


In [55]:
print("\n=== Holdout (20%) AUC ===")
for k, v in holdout_auc.items():
    print(f"{k:18s}  Holdout AUC: {v:.4f}")


=== Holdout (20%) AUC ===
Logistic Regression  Holdout AUC: 1.0000
Decision Tree       Holdout AUC: 0.9998
Random Forest       Holdout AUC: 0.9999
XGBoost             Holdout AUC: 0.9999
SVM (RBF)           Holdout AUC: 1.0000


In [56]:
print(f"\n>>> Selected best model (by CV AUC): {winner_name}")


>>> Selected best model (by CV AUC): Logistic Regression


In [57]:
winner_model.fit(X, y)

In [58]:
def expected_loss_from_pipeline(pipeline, loan_row: dict, recovery_rate: float = 0.10):
    row_df = pd.DataFrame([loan_row], columns=X.columns)
    pd_val = float(pipeline.predict_proba(row_df)[:, 1][0])
    lgd = 1.0 - recovery_rate
    ead = float(row_df["loan_amt_outstanding"].iloc[0])
    return {"PD": pd_val, "Expected Loss": pd_val * lgd * ead}

In [59]:
sample_loan = X.iloc[0].to_dict()
print("\n=== Expected Loss for sample loan (each tuned model) ===")
for name, model in best_models.items():
    print(f"{name:18s}: {expected_loss_from_pipeline(model, sample_loan)}")

print(f"\n=== Expected Loss using SELECTED best model: {winner_name} ===")
print(expected_loss_from_pipeline(winner_model, sample_loan))


=== Expected Loss for sample loan (each tuned model) ===
Logistic Regression: {'PD': 1.2862616965540298e-26, 'Expected Loss': 6.044646220723547e-23}
Decision Tree     : {'PD': 0.0, 'Expected Loss': 0.0}
Random Forest     : {'PD': 0.0, 'Expected Loss': 0.0}
XGBoost           : {'PD': 8.80005235348591e-10, 'Expected Loss': 4.135488395804342e-06}
SVM (RBF)         : {'PD': 1.0000000994736041e-07, 'Expected Loss': 0.0004699391141165328}

=== Expected Loss using SELECTED best model: Logistic Regression ===
{'PD': 1.2862616965540298e-26, 'Expected Loss': 6.044646220723547e-23}


In [71]:
df.groupby("default").mean()

Unnamed: 0_level_0,customer_id,credit_lines_outstanding,loan_amt_outstanding,total_debt_outstanding,income,years_employed,fico_score
default,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,4967005.0,0.74414,4092.629025,6322.164549,69883.889804,4.765247,646.938765
1,5007914.0,4.618044,4454.854897,19270.582961,70726.74014,3.617504,596.257699


In [73]:
borrower_good = X[y==0].iloc[0].to_dict()
borrower_bad  = X[y==1].iloc[0].to_dict()
print(expected_loss_from_pipeline(winner_model, borrower_good))
print(expected_loss_from_pipeline(winner_model, borrower_bad))

{'PD': 1.2862616965540298e-26, 'Expected Loss': 6.044646220723547e-23}
{'PD': 1.0, 'Expected Loss': 1763.0358534000002}
