In [1]:
#import packages

import pandas as pd
import numpy as np

from utils import ks_scorer, max_ks

from sklearn.model_selection import cross_validate, GridSearchCV, cross_val_score, StratifiedKFold

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# filter warning messages
import warnings
warnings.filterwarnings('ignore')

# from IPython.core.interactiveshell import InteractiveShell
# InteractiveShell.ast_node_interactivity = 'all'

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

In [2]:
# import data set and create a data frame
df_dataset_encoded = pd.read_csv("./model_dataset_with_sampling.csv")

X, y = df_dataset_encoded.drop('target_default', axis=1), df_dataset_encoded['target_default']

dev_dataset = df_dataset_encoded[df_dataset_encoded['Sampling'] == 'DEV']
X_dev = dev_dataset.drop(['target_default', 'Sampling'], axis=1)
y_dev = dev_dataset['target_default']

val_dataset = df_dataset_encoded[df_dataset_encoded['Sampling'] == 'VAL']
X_val = val_dataset.drop(['target_default', 'Sampling'], axis=1)
y_val = val_dataset['target_default']

X_dev.shape, X_val.shape

((23374, 143), (10018, 143))

In [3]:
# Define models

models = {
    'XGBoost': XGBClassifier(random_state=25), 
    "LightGBM": LGBMClassifier(random_state=25),
    "CatBoost": CatBoostClassifier(random_state=25)
}

results = []

for name, model in models.items():
    model.fit(X_dev, y_dev)

    y_dev_prob = model.predict_proba(X_dev)[:, 1]
    y_val_prob = model.predict_proba(X_val)[:, 1]
    
    dev_ks = max_ks(y_dev, y_dev_prob)
    val_ks = max_ks(y_val, y_val_prob)
    
    results.append({
        "Model": name,
        "DEV_KS": dev_ks,
        'VAL_KS': val_ks
    })

results_df = pd.DataFrame(results).sort_values("VAL_KS", ascending=False)
results_df


[LightGBM] [Info] Number of positive: 3730, number of negative: 19644
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002905 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2915
[LightGBM] [Info] Number of data points in the train set: 23374, number of used features: 135
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.159579 -> initscore=-1.661364
[LightGBM] [Info] Start training from score -1.661364
Learning rate set to 0.039572
0:	learn: 0.6676476	total: 153ms	remaining: 2m 33s
1:	learn: 0.6454663	total: 168ms	remaining: 1m 23s
2:	learn: 0.6246702	total: 183ms	remaining: 1m
3:	learn: 0.6064267	total: 266ms	remaining: 1m 6s
4:	learn: 0.5887615	total: 279ms	remaining: 55.6s
5:	learn: 0.5738889	total: 290ms	remaining: 48.1s
6:	learn: 0.5602174	total: 305ms	remaining: 43.3s
7:	learn: 0.5483857	total: 319ms	remaining: 39.6s
8:	learn: 0.5

Unnamed: 0,Model,DEV_KS,VAL_KS
2,CatBoost,62.416796,31.669529
1,LightGBM,58.59341,31.056098
0,XGBoost,81.274354,26.992788


In [4]:
pos_weight = (y_dev == 0).sum() / (y_dev == 1).sum()

xgb_unweighted = XGBClassifier(random_state=25)
xgb_weighted = XGBClassifier(
    scale_pos_weight=pos_weight,
    random_state=25
)

lgb_unweighted = LGBMClassifier(random_state=25)
lgb_weighted = LGBMClassifier(
    class_weight='balanced',
    random_state=25
)


cb_unweighted = CatBoostClassifier(
    verbose=0,
    random_state=25
)
cb_weighted = CatBoostClassifier(
    auto_class_weights='Balanced',
    verbose=0,
    random_state=25
)


In [5]:
models = {
    "XGB_Unweighted": xgb_unweighted,
    "XGB_Weighted": xgb_weighted,
    "LGB_Unweighted": lgb_unweighted,
    "LGB_Weighted": lgb_weighted,
    "CB_Unweighted": cb_unweighted,
    "CB_Weighted": cb_weighted
}

results = []

for name, model in models.items():
    model.fit(X_dev, y_dev)

    y_dev_prob = model.predict_proba(X_dev)[:, 1]
    y_val_prob = model.predict_proba(X_val)[:, 1]

    results.append({
        "Model": name,
        "DEV_KS": max_ks(y_dev, y_dev_prob),
        "VAL_KS": max_ks(y_val, y_val_prob),
        "KS_GAP": max_ks(y_dev, y_dev_prob) - max_ks(y_val, y_val_prob)
    })

results_df = pd.DataFrame(results).sort_values("VAL_KS", ascending=False)
results_df


[LightGBM] [Info] Number of positive: 3730, number of negative: 19644
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001905 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2915
[LightGBM] [Info] Number of data points in the train set: 23374, number of used features: 135
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.159579 -> initscore=-1.661364
[LightGBM] [Info] Start training from score -1.661364
[LightGBM] [Info] Number of positive: 3730, number of negative: 19644
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003514 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2915
[LightGBM] [Info] Number of data points in the train set: 23374, number of used features: 135
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[L

Unnamed: 0,Model,DEV_KS,VAL_KS,KS_GAP
4,CB_Unweighted,62.416796,31.669529,30.747267
3,LGB_Weighted,59.683967,31.308334,28.375633
2,LGB_Unweighted,58.59341,31.056098,27.537313
5,CB_Weighted,72.508037,29.996558,42.511479
0,XGB_Unweighted,81.274354,26.992788,54.281566
1,XGB_Weighted,83.989234,24.767217,59.222017


In [6]:
from itertools import product
import pandas as pd

def dev_val_grid_search(model_class, param_grid,X_dev, y_dev, X_val, y_val, fixed_params=None):
    
    """
    Manual grid search:
    - Train on DEV
    - Evaluate on DEV & VAL
    - Return DataFrame with KS metrics
    """

    if fixed_params is None:
        fixed_params = {}

    results = []

    for values in product(*param_grid.values()):
        params = dict(zip(param_grid.keys(), values))

        model = model_class(
            **fixed_params,
            **params
        )

        # Train ONLY on DEV
        model.fit(X_dev, y_dev)

        # Predict probabilities
        dev_pred = model.predict_proba(X_dev)[:, 1]
        val_pred = model.predict_proba(X_val)[:, 1]

        dev_ks = max_ks(y_dev, dev_pred)
        val_ks = max_ks(y_val, val_pred)

        results.append({
            **params,
            "DEV_KS": dev_ks,
            "VAL_KS": val_ks,
            "KS_GAP": dev_ks - val_ks
        })

    return pd.DataFrame(results)


### Tuning the CatBoost

In [7]:
param_grid_cb = {
    "iterations": [300, 500],
    "depth": [4, 5, 6],
    "learning_rate": [0.03, 0.05],
    "l2_leaf_reg": [5, 10, 20],
    "subsample": [0.7, 0.9]
}

results_cb_df = dev_val_grid_search(
    model_class=CatBoostClassifier,
    param_grid=param_grid_cb,
    X_dev=X_dev,
    y_dev=y_dev,
    X_val=X_val,
    y_val=y_val,
    fixed_params={
        "loss_function": "Logloss",
        "verbose": 0,
        "random_state": 25
    }
)

results_cb_df = results_cb_df.sort_values(by=['VAL_KS'], ascending=False)
results_cb_df.head(5)

Unnamed: 0,iterations,depth,learning_rate,l2_leaf_reg,subsample,DEV_KS,VAL_KS,KS_GAP
49,500,5,0.03,5,0.9,39.625978,33.522934,6.103044
51,500,5,0.03,10,0.9,38.945288,33.383281,5.562006
37,500,4,0.03,5,0.9,36.991917,33.357899,3.634018
36,500,4,0.03,5,0.7,36.861748,33.289967,3.571781
53,500,5,0.03,20,0.9,38.680549,33.198093,5.482455


### Tuning the LightGBoost

In [None]:
param_grid_lgb = {
    "n_estimators": [300, 500],
    "num_leaves": [15, 31],
    "max_depth": [-1, 5, 7],
    "min_data_in_leaf": [50, 100],
    "feature_fraction": [0.7, 0.9],
    "learning_rate": [0.03, 0.05]
}

results_lgb_df = dev_val_grid_search(
    model_class=LGBMClassifier,
    param_grid=param_grid_lgb,
    X_dev=X_dev,
    y_dev=y_dev,
    X_val=X_val,
    y_val=y_val,
    fixed_params={
        "objective": "binary",
        "random_state": 25
    }
)

[LightGBM] [Info] Number of positive: 3730, number of negative: 19644
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003435 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2907
[LightGBM] [Info] Number of data points in the train set: 23374, number of used features: 131
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.159579 -> initscore=-1.661364
[LightGBM] [Info] Start training from score -1.661364
[LightGBM] [Info] Number of positive: 3730, number of negative: 19644
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003353 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2907
[LightGBM] [Info] Number of data points in the train set: 23374, number of used features: 131
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.159579 -> initscore=-1.661364
[LightGBM] [Info] Start training from score -1.661364
[LightGBM] [

Unnamed: 0,iterations,depth,learning_rate,l2_leaf_reg,subsample,DEV_KS,VAL_KS,KS_GAP
49,500,5,0.03,5,0.9,39.625978,33.522934,6.103044
51,500,5,0.03,10,0.9,38.945288,33.383281,5.562006
37,500,4,0.03,5,0.9,36.991917,33.357899,3.634018
36,500,4,0.03,5,0.7,36.861748,33.289967,3.571781
53,500,5,0.03,20,0.9,38.680549,33.198093,5.482455


In [13]:
results_lgb_df = results_lgb_df.sort_values(by=['VAL_KS'], ascending=False)
results_lgb_df.head(5)

Unnamed: 0,n_estimators,num_leaves,max_depth,min_data_in_leaf,feature_fraction,learning_rate,DEV_KS,VAL_KS,KS_GAP
20,300,15,7,100,0.7,0.03,42.030802,33.055105,8.975697
34,300,31,5,50,0.9,0.03,44.78329,33.039996,11.743294
4,300,15,-1,100,0.7,0.03,42.541529,33.025422,9.516107
16,300,15,7,50,0.7,0.03,42.568434,33.010476,9.557958
12,300,15,5,100,0.7,0.03,40.846341,32.99761,7.848731


### Tuning the Xgbost

In [9]:
param_grid_xgb = {
    "n_estimators": [200, 300, 400],
    "max_depth": [3, 4, 5],
    "min_child_weight": [5, 10, 20],
    "subsample": [0.6, 0.8],
    "colsample_bytree": [0.6, 0.8],
    "learning_rate": [0.03, 0.05]
}

results_xgb_df = dev_val_grid_search(
    model_class=XGBClassifier,
    param_grid=param_grid_xgb,
    X_dev=X_dev,
    y_dev=y_dev,
    X_val=X_val,
    y_val=y_val,
    fixed_params={
        "objective": "binary:logistic",
        "eval_metric": "logloss",
        "random_state": 25
    }
)


results_xgb_df = results_xgb_df.sort_values(by=['VAL_KS'], ascending=False)
results_xgb_df.head(5)


Unnamed: 0,n_estimators,max_depth,min_child_weight,subsample,colsample_bytree,learning_rate,DEV_KS,VAL_KS,KS_GAP
212,400,5,20,0.8,0.6,0.03,44.147299,33.432502,10.714798
37,200,4,10,0.8,0.6,0.05,39.954962,33.382293,6.572669
108,300,4,10,0.8,0.6,0.03,39.238111,33.285562,5.95255
116,300,4,20,0.8,0.6,0.03,38.454184,33.214027,5.240157
118,300,4,20,0.8,0.8,0.03,38.753387,33.18811,5.565277


In [15]:
selected_param_cb = {
    "iterations": 500,
    "depth": 4,
    "learning_rate": 0.03,
    "l2_leaf_reg": 5,
    "subsample":  0.9
}

selected_param_lgb = {
    "n_estimators": 300,
    "num_leaves": 15,
    "max_depth": 5,
    "min_data_in_leaf": 100,
    "feature_fraction": 0.7,
    "learning_rate": 0.03
}

selected_param_xgb = {
    "n_estimators":300,
    "max_depth": 4,
    "min_child_weight": 20,
    "subsample": 0.8,
    "colsample_bytree": 0.6,
    "learning_rate": 0.03
}

In [16]:
final_xgb = XGBClassifier(
    objective="binary:logistic",
    eval_metric="logloss",
    random_state=25,
    **selected_param_xgb
)

final_xgb.fit(X_dev, y_dev)

final_lgb = LGBMClassifier(
    objective="binary",
    random_state=25,
    **selected_param_lgb
)

final_lgb.fit(X_dev, y_dev)

final_cb = CatBoostClassifier(
    loss_function="Logloss",
    random_state=25,
    verbose=0,
    **selected_param_cb
)

final_cb.fit(X_dev, y_dev)


[LightGBM] [Info] Number of positive: 3730, number of negative: 19644
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001795 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2883
[LightGBM] [Info] Number of data points in the train set: 23374, number of used features: 120
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.159579 -> initscore=-1.661364
[LightGBM] [Info] Start training from score -1.661364


<catboost.core.CatBoostClassifier at 0x220ad71e8d0>

In [17]:
test_dataset = df_dataset_encoded[df_dataset_encoded['Sampling'] == 'TEST']
X_test = test_dataset.drop(['target_default', 'Sampling'], axis=1)
y_test = test_dataset['target_default']

In [18]:
def evaluate_ks(model, X, y):
    preds = model.predict_proba(X)[:, 1]
    return max_ks(y, preds)

results = []

models = {
    "XGBoost": final_xgb,
    "LightGBM": final_lgb,
    "CatBoost": final_cb
}

for name, model in models.items():
    results.append({
        "Model Type": name,
        "DEV_KS": evaluate_ks(model, X_dev, y_dev),
        "VAL_KS": evaluate_ks(model, X_val, y_val),
        "TEST_KS": evaluate_ks(model, X_test, y_test)
    })

final_scores_df = pd.DataFrame(results)
final_scores_df




Unnamed: 0,Model Type,DEV_KS,VAL_KS,TEST_KS
0,XGBoost,38.454184,33.214027,31.305472
1,LightGBM,40.846341,32.99761,31.347198
2,CatBoost,36.991917,33.357899,31.80012


In [19]:
df_scored = df_dataset_encoded.copy()

feature_cols = [col for col in df_scored.columns if col not in ["target_default", "Sampling"]]

df_scored["pd_score"] = final_cb.predict_proba(df_scored[feature_cols])[:, 1]

df_scored.groupby("Sampling")["pd_score"].describe()


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Sampling,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
DEV,23374.0,0.159265,0.106686,0.019706,0.082087,0.130989,0.206472,0.833428
TEST,8349.0,0.157137,0.103176,0.023713,0.081689,0.131562,0.203716,0.755828
VAL,10018.0,0.158622,0.105744,0.023183,0.081698,0.130613,0.205162,0.831766


In [20]:
df_scored.to_csv("credit_risk_scored_dataset.csv",index=False)


In [21]:
final_cb.save_model("final_catboost_model.cbm")


In [22]:
import joblib

joblib.dump(final_lgb, "final_lightgbm_model.pkl")
joblib.dump(final_xgb, "final_xgboost_model.pkl")



['final_xgboost_model.pkl']

In [1]:
feature_importance_cb = pd.DataFrame({
    "feature": final_cb.feature_names_,
    "importance": final_cb.get_feature_importance()
}).sort_values("importance", ascending=False)

feature_importance_cb.head(15)

NameError: name 'pd' is not defined