In [20]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, recall_score

## Model Development

- Train on LR, DT and XGB
    - Evaluate on AUC, F1, Recall@ top 5% risk
    - Hyperparameter Tuning
- produce explainability
    - SHAP for LightGBM
    - Gain importance
    - Permutation importance

- select best model

### Load Data

In [3]:
train_df = pd.read_csv("../data/processed/train.csv")
val_df   = pd.read_csv("../data/processed/val.csv")
test_df  = pd.read_csv("../data/processed/test.csv")

print(train_df.shape, val_df.shape, test_df.shape)

(85299, 44) (18278, 44) (18279, 44)


In [4]:
target = "Default"

X_train = train_df.drop(columns=[target])
y_train = train_df[target]

X_val = val_df.drop(columns=[target])
y_val = val_df[target]

### Logistic Regression

In [14]:
logreg = LogisticRegression(max_iter=500, class_weight="balanced")

param_grid = {
    "max_iter": [500, 1000],
    "C": [0.01, 0.1, 1, 10],
    "penalty": ["l2"],
    "solver": ["lbfgs"]
}

grid = GridSearchCV(
    estimator=logreg,
    param_grid=param_grid,
    scoring="roc_auc",
    cv=3,
    n_jobs=-1,
    verbose=1
)

In [15]:
grid.fit(X_train, y_train)

print("Best Params:", grid.best_params_)
best_model = grid.best_estimator_

Fitting 3 folds for each of 8 candidates, totalling 24 fits
Best Params: {'C': 10, 'max_iter': 1000, 'penalty': 'l2', 'solver': 'lbfgs'}


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [16]:
val_probs = best_model.predict_proba(X_val)[:, 1]
val_preds = best_model.predict(X_val)

In [17]:
acc = accuracy_score(y_val, val_preds)
auc = roc_auc_score(y_val, val_probs)
f1 = f1_score(y_val, val_preds)
recall = recall_score(y_val, val_preds)

In [18]:
# Recall @ Top 5%
threshold_5 = np.percentile(val_probs, 95)   # Top 5% highest risk
top5_preds = (val_probs >= threshold_5).astype(int)
recall_top5 = recall_score(y_val, top5_preds)

In [19]:
print("\n================ Logistic Regression Eval ================")
print(f"Accuracy     : {acc:.4f}")
print(f"AUC-ROC      : {auc:.4f}")
print(f"F1 Score     : {f1:.4f}")
print(f"Recall       : {recall:.4f}")
print(f"Recall @Top5%: {recall_top5:.4f}")
print("===========================================================\n")


Accuracy     : 0.5891
AUC-ROC      : 0.6286
F1 Score     : 0.1911
Recall       : 0.6005
Recall @Top5%: 0.1070



Accuracy is useless for imbalanced data. AUC is weak but acceptable for linear baseline. Low F1 score because of class-imbalance.

Recall at 60% is not bad. Recall @top5% is very low

The model failed to converge. But, this is normal because:
- features are not scaled
- there may be multicollinearity
- LR often struggles on credit data

### Decision Tree

In [32]:
dt = DecisionTreeClassifier(
    max_depth=6,
    min_samples_split=200,
    min_samples_leaf=100,
    class_weight="balanced",
    random_state=42
)

dt.fit(X_train, y_train)

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,6
,min_samples_split,200
,min_samples_leaf,100
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [33]:
val_probs = dt.predict_proba(X_val)[:, 1]
val_preds = dt.predict(X_val)

In [34]:
acc = accuracy_score(y_val, val_preds)
auc = roc_auc_score(y_val, val_probs)
f1 = f1_score(y_val, val_preds)
recall = recall_score(y_val, val_preds)

threshold_5 = np.percentile(val_probs, 95)
top5_preds = (val_probs >= threshold_5).astype(int)
recall_top5 = recall_score(y_val, top5_preds)

print("\n============== DECISION TREE EVALUATION ==============")
print(f"Accuracy     : {acc:.4f}")
print(f"AUC-ROC      : {auc:.4f}")
print(f"F1 Score     : {f1:.4f}")
print(f"Recall       : {recall:.4f}")
print(f"Recall@Top5% : {recall_top5:.4f}")
print("=======================================================\n")


Accuracy     : 0.7018
AUC-ROC      : 0.7054
F1 Score     : 0.2428
Recall       : 0.5917
Recall@Top5% : 0.1991



### DT Optimization

In [29]:
param_grid = {
    'max_depth': [3, 4, 5, 6, 7, 8, 10],
    'min_samples_split': [50, 100, 200, 400],
    'min_samples_leaf': [20, 50, 100, 200],
    'criterion': ['gini', 'entropy']
}

dt = DecisionTreeClassifier(class_weight="balanced", random_state=42)

grid_dt = GridSearchCV(
    dt,
    param_grid,
    scoring='roc_auc',
    cv=3,
    n_jobs=-1,
    verbose=1
)

grid_dt.fit(X_train, y_train)
best_dt = grid_dt.best_estimator_

print("Best DT Params:", grid_dt.best_params_)

val_probs = best_dt.predict_proba(X_val)[:, 1]
val_preds = best_dt.predict(X_val)

Fitting 3 folds for each of 224 candidates, totalling 672 fits
Best DT Params: {'criterion': 'gini', 'max_depth': 7, 'min_samples_leaf': 100, 'min_samples_split': 400}


In [30]:
acc = accuracy_score(y_val, val_preds)
auc = roc_auc_score(y_val, val_probs)
f1 = f1_score(y_val, val_preds)
recall = recall_score(y_val, val_preds)

threshold_5 = np.percentile(val_probs, 95)
recall_top5 = recall_score(y_val, (val_probs >= threshold_5).astype(int))

print("\n===== Optimized Decision Tree =====")
print("Accuracy:", acc)
print("AUC:", auc)
print("F1:", f1)
print("Recall:", recall)
print("Recall@Top5%:", recall_top5)


===== Optimized Decision Tree =====
Accuracy: 0.6276397855345224
AUC: 0.7034530459043105
F1: 0.22641509433962265
Recall: 0.6743398781313473
Recall@Top5%: 0.16858496953283683


### XGBoost

In [25]:
# Estimate scale_pos_weight = negative/positive
neg = (y_train == 0).sum()
pos = (y_train == 1).sum()
scale_pos_weight = neg / pos


xgb = XGBClassifier(
    n_estimators=400,
    learning_rate=0.05,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric="logloss",
    scale_pos_weight=scale_pos_weight,
    random_state=42,
    tree_method="hist"
)

xgb.fit(X_train, y_train)

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False


In [26]:
val_probs = xgb.predict_proba(X_val)[:, 1]
val_preds = xgb.predict(X_val)

In [27]:
acc = accuracy_score(y_val, val_preds)
auc = roc_auc_score(y_val, val_probs)
f1 = f1_score(y_val, val_preds)
recall = recall_score(y_val, val_preds)

# Recall @ Top 5%
threshold_5 = np.percentile(val_probs, 95)
top5_preds = (val_probs >= threshold_5).astype(int)
recall_top5 = recall_score(y_val, top5_preds)

print("\n============== XGBOOST EVALUATION ==============")
print(f"Accuracy     : {acc:.4f}")
print(f"AUC-ROC      : {auc:.4f}")
print(f"F1 Score     : {f1:.4f}")
print(f"Recall       : {recall:.4f}")
print(f"Recall@Top5% : {recall_top5:.4f}")
print("=================================================\n")


Accuracy     : 0.7493
AUC-ROC      : 0.7441
F1 Score     : 0.2789
Recall       : 0.5999
Recall@Top5% : 0.1821



### XGB Optimization via Optuna

In [None]:
import optuna

def objective(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 200, 800),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "gamma": trial.suggest_float("gamma", 0, 5),
        "scale_pos_weight": scale_pos_weight,
        "eval_metric": "logloss",
        "tree_method": "hist",
        "random_state": 42
    }

    model = XGBClassifier(**params)
    model.fit(X_train, y_train)
    preds = model.predict_proba(X_val)[:, 1]
    auc = roc_auc_score(y_val, preds)
    return auc

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=30)

print("Best Params:", study.best_params)
best_params = study.best_params

[I 2025-11-25 19:23:37,980] A new study created in memory with name: no-name-2917466f-c9f8-4e9b-a295-23c7c9fd4378
[I 2025-11-25 19:23:39,426] Trial 0 finished with value: 0.733829699581428 and parameters: {'n_estimators': 308, 'max_depth': 9, 'learning_rate': 0.2113497148570021, 'subsample': 0.9128717042771526, 'colsample_bytree': 0.7485776457495235, 'gamma': 3.107808027509078}. Best is trial 0 with value: 0.733829699581428.
[I 2025-11-25 19:23:41,609] Trial 1 finished with value: 0.7256808632912967 and parameters: {'n_estimators': 640, 'max_depth': 5, 'learning_rate': 0.1747871676266444, 'subsample': 0.6966903392307031, 'colsample_bytree': 0.9311353010987025, 'gamma': 2.193475251585985}. Best is trial 0 with value: 0.733829699581428.
[I 2025-11-25 19:23:44,005] Trial 2 finished with value: 0.7300478052113237 and parameters: {'n_estimators': 397, 'max_depth': 8, 'learning_rate': 0.16391696331458738, 'subsample': 0.8111798180596025, 'colsample_bytree': 0.9011372250846699, 'gamma': 0.998

Best Params: {'n_estimators': 708, 'max_depth': 8, 'learning_rate': 0.020942867352019036, 'subsample': 0.870326167195358, 'colsample_bytree': 0.6394223729541142, 'gamma': 4.916690960019693}


In [36]:
# Train final model
best_xgb = XGBClassifier(**best_params)
best_xgb.fit(X_train, y_train)

val_probs = best_xgb.predict_proba(X_val)[:, 1]
val_preds = best_xgb.predict(X_val)

In [37]:
acc = accuracy_score(y_val, val_preds)
auc = roc_auc_score(y_val, val_probs)
f1 = f1_score(y_val, val_preds)
recall = recall_score(y_val, val_preds)

threshold_5 = np.percentile(val_probs, 95)
recall_top5 = recall_score(y_val, (val_probs >= threshold_5).astype(int))

print("\n===== Optimized XGBoost =====")
print("Accuracy:", acc)
print("AUC:", auc)
print("F1:", f1)
print("Recall:", recall)
print("Recall@Top5%:", recall_top5)



===== Optimized XGBoost =====
Accuracy: 0.9191377612430244
AUC: 0.7488413999279552
F1: 0.01859229747675963
Recall: 0.009478672985781991
Recall@Top5%: 0.1949898442789438


### XGB v3

In [39]:
scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum()
scale_pos_weight = round(scale_pos_weight, 2)
scale_pos_weight = min(scale_pos_weight, 10) # prevent explosion

xgb = XGBClassifier(
    eval_metric="logloss",
    scale_pos_weight=scale_pos_weight,
    tree_method="hist",
    random_state=42
)

param_grid = {
    "n_estimators": [200, 400, 600],
    "max_depth": [4, 5, 6],
    "learning_rate": [0.02, 0.05, 0.1],
    "subsample": [0.7, 0.9],
    "colsample_bytree": [0.6, 0.8],
}

grid = GridSearchCV(
    xgb, param_grid,
    scoring="recall",   # <-- FIXED
    cv=3,
    n_jobs=-1,
    verbose=1
)

In [40]:
grid.fit(X_train, y_train)
best = grid.best_estimator_

probs = best.predict_proba(X_val)[:,1]
preds = (probs >= 0.2).astype(int)

Fitting 3 folds for each of 108 candidates, totalling 324 fits


In [42]:
auc = roc_auc_score(y_val, probs)
recall = recall_score(y_val, preds)
f1 = f1_score(y_val, preds)

threshold_5 = np.percentile(probs, 95)
recall_top5 = recall_score(y_val, (probs >= threshold_5).astype(int))

print("===== UPDATED XGBOOST RESULTS =====")
print("AUC:", auc)
print("Recall:", recall)
print("F1:", f1)
print("Recall@Top5%:", recall_top5)
print("==============================")

===== UPDATED XGBOOST RESULTS =====
AUC: 0.7392827352500256
Recall: 0.9580230196343941
F1: 0.16871348515559795
Recall@Top5%: 0.17874069058903183
