In [1]:
import pandas as pd
from utils import max_ks

In [None]:
model_dataset = pd.read_csv("./credit_risk_scored_dataset.csv")
model_dataset.shape
model_dataset.head(2)

Unnamed: 0,score_1_4DLlLW62jReXaqbPaHp1vQ==,facebook_profile,external_data_provider_email_seen_before,score_1_DGCQep2AE5QRkNCshIAlFQ==,income,last_borrowed_in_months,application_time_in_funnel,score_1_smzX0nxh5QlePvtVf6EAeg==,score_1_e4NYDor1NOw6XKGE60AWFw==,risk_rate,...,state_sjJbkqJS7cXalHLBFA+EOQ==,state_1DpYl6dtzY0qE33poow3iw==,state_/L8vvVesB5WyAv190Hw/rQ==,state_BB/zpwTH+8GCIVHlhzOU1Q==,real_state_nSpvDsIsslUaX6GE6m6eQA==,score_2_OlDYtdljgSSYM/M1L2CRaQ==,shipping_state_BR-PA,Sampling,pd_score,target_default
0,False,0,22.0,False,36030.62,0.0,311,False,False,0.4,...,False,False,False,True,False,False,False,DEV,0.156296,0
1,True,0,11.0,False,70289.24,0.0,241,False,False,0.14,...,False,False,False,False,False,False,False,DEV,0.053441,0


In [3]:
def pull_sample(model_data, sampling):
    sample_data = model_data[model_data['Sampling'] == sampling][['target_default', 'pd_score']]
    return sample_data

In [4]:
dev_data = pull_sample(model_dataset, 'DEV')
val_data = pull_sample(model_dataset, 'VAL')
test_data = pull_sample(model_dataset, 'TEST')

dev_data.shape, val_data.shape, test_data.shape

((23374, 2), (10018, 2), (8349, 2))

In [6]:
dev_KS = max_ks(dev_data['target_default'], dev_data['pd_score'])
val_KS = max_ks(val_data['target_default'], val_data['pd_score'])
test_KS = max_ks(test_data['target_default'], test_data['pd_score'])

dev_KS, val_KS, test_KS

(37.329117814524814, 33.581521174335336, 31.784103470721686)

In [None]:
dev_KS = max_ks(dev_data['target_default'], dev_data['pd_score'])
val_KS = max_ks(val_data['target_default'], val_data['pd_score'])
test_KS = max_ks(test_data['target_default'], test_data['pd_score'])

dev_KS, val_KS, test_KS

(36.99191725311074, 33.357898811475074, 31.800119914698797)

In [7]:
val_data.loc[:, 'Tier'], breakpoints = pd.qcut(
    val_data['pd_score'], 
    q=10, 
    retbins=True,
    duplicates='drop'
)

In [None]:
def gains_table(data_df):
    data_df['Total'] = 1
    tier_df = data_df[['target_default','Total', 'Tier']].groupby(['Tier'], observed=True).sum()
    tier_df["%_obs"] = ((
            tier_df["Total"] / tier_df["Total"].sum()
        ) * 100).round(2)
    tier_df["dv_rate"] = ((
            tier_df['target_default'] / tier_df["Total"]
        ) * 100).round(2)
    return tier_df

In [None]:

eps = 1e-8
min_val = min(val_data['pd_score'].min(), val_data['pd_score'].min())
max_val = max(val_data['pd_score'].max(), val_data['pd_score'].max())

# Adjust edges
breakpoints[0] = min_val - eps
breakpoints[-1] = max_val + eps

In [None]:

dev_data.loc[:, 'Tier'] = pd.cut(
    dev_data['pd_score'], 
    bins=breakpoints, 
    labels=range(1, len(breakpoints)),
    include_lowest=True
)

test_data.loc[:, 'Tier'] = pd.cut(
    test_data['pd_score'], 
    bins=breakpoints, 
    labels=range(1, len(breakpoints)),
    include_lowest=True
)

In [None]:
val_data_tierwise = gains_table(val_data)
val_data_tierwise

Unnamed: 0_level_0,target_default,Total,%_obs,dv_rate
Tier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"(0.0224, 0.0536]",42,1002,10.0,4.19
"(0.0536, 0.0729]",65,1002,10.0,6.49
"(0.0729, 0.0896]",77,1002,10.0,7.68
"(0.0896, 0.109]",87,1001,9.99,8.69
"(0.109, 0.13]",109,1002,10.0,10.88
"(0.13, 0.156]",136,1002,10.0,13.57
"(0.156, 0.187]",194,1001,9.99,19.38
"(0.187, 0.231]",207,1002,10.0,20.66
"(0.231, 0.301]",275,1002,10.0,27.45
"(0.301, 0.814]",407,1002,10.0,40.62


In [None]:
dev_data_tierwise = gains_table(dev_data)
dev_data_tierwise

Unnamed: 0_level_0,target_default,Total,%_obs,dv_rate
Tier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,56,2198,9.41,2.55
2,103,2422,10.37,4.25
3,140,2287,9.79,6.12
4,193,2422,10.37,7.97
5,260,2338,10.01,11.12
6,333,2404,10.29,13.85
7,396,2299,9.84,17.22
8,464,2302,9.85,20.16
9,626,2319,9.93,26.99
10,1156,2371,10.15,48.76


In [13]:
test_data_tierwise = gains_table(test_data)
test_data_tierwise

Unnamed: 0_level_0,target_default,Total,%_obs,dv_rate
Tier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,36,848,10.16,4.25
2,64,826,9.9,7.75
3,47,767,9.19,6.13
4,100,916,10.97,10.92
5,98,774,9.27,12.66
6,118,904,10.83,13.05
7,125,839,10.05,14.9
8,177,847,10.15,20.9
9,226,812,9.73,27.83
10,341,814,9.75,41.89


In [14]:
from catboost import CatBoostClassifier

loaded_cb = CatBoostClassifier()
loaded_cb.load_model("final_catboost_model.cbm")


<catboost.core.CatBoostClassifier at 0x1cbacb82720>

In [15]:
import pandas as pd

feature_importance_cb = pd.DataFrame({
    "feature": loaded_cb.feature_names_,
    "importance": loaded_cb.get_feature_importance()
}).sort_values("importance", ascending=False)

feature_importance_cb["importance_pct"] = (
    feature_importance_cb["importance"] /
    feature_importance_cb["importance"].sum()
)

feature_importance_cb.head(15)


Unnamed: 0,feature,importance,importance_pct
22,score_1_4DLlLW62jReXaqbPaHp1vQ==,18.280101,0.182801
9,facebook_profile,15.397976,0.15398
18,external_data_provider_email_seen_before,8.999012,0.08999
24,score_1_DGCQep2AE5QRkNCshIAlFQ==,7.596577,0.075966
8,income,6.567194,0.065672
6,last_borrowed_in_months,3.720484,0.037205
15,application_time_in_funnel,3.043648,0.030436
27,score_1_smzX0nxh5QlePvtVf6EAeg==,2.900051,0.029001
25,score_1_e4NYDor1NOw6XKGE60AWFw==,2.600785,0.026008
4,risk_rate,2.379914,0.023799


In [None]:
import numpy as np

def calculate_iv(df, feature, target, bins=10):
    df = df[[feature, target]].copy()
    
    # Bin feature
    df["bin"] = pd.qcut(df[feature], q=bins, duplicates="drop")
    
    grouped = df.groupby("bin")[target].agg(["count", "sum"])
    grouped.columns = ["total", "bads"]
    
    grouped["goods"] = grouped["total"] - grouped["bads"]
    
    # Distribution
    grouped["dist_bad"] = grouped["bads"] / grouped["bads"].sum()
    grouped["dist_good"] = grouped["goods"] / grouped["goods"].sum()
    
    # Avoid division by zero
    grouped["woe"] = np.log(
        (grouped["dist_good"] + 1e-6) /
        (grouped["dist_bad"] + 1e-6)
    )
    
    grouped["iv"] = (grouped["dist_good"] - grouped["dist_bad"]) * grouped["woe"]
    
    return grouped["iv"].sum()


In [17]:

iv_results = []
dev_dataset = model_dataset[model_dataset['Sampling'] == 'DEV']
X_dev = dev_dataset.drop(['target_default', 'Sampling', 'pd_score'], axis=1)
y_dev = dev_dataset['target_default']
for col in X_dev.columns:
    iv = calculate_iv(dev_dataset, col, "target_default")
    iv_results.append({
        "feature": col,
        "IV": iv
    })

iv_df = pd.DataFrame(iv_results).sort_values("IV", ascending=False)
iv_df.head(15)


  grouped = df.groupby("bin")[target].agg(["count", "sum"])
  grouped = df.groupby("bin")[target].agg(["count", "sum"])
  grouped = df.groupby("bin")[target].agg(["count", "sum"])
  grouped = df.groupby("bin")[target].agg(["count", "sum"])
  grouped = df.groupby("bin")[target].agg(["count", "sum"])
  grouped = df.groupby("bin")[target].agg(["count", "sum"])
  grouped = df.groupby("bin")[target].agg(["count", "sum"])
  grouped = df.groupby("bin")[target].agg(["count", "sum"])
  grouped = df.groupby("bin")[target].agg(["count", "sum"])
  grouped = df.groupby("bin")[target].agg(["count", "sum"])
  grouped = df.groupby("bin")[target].agg(["count", "sum"])
  grouped = df.groupby("bin")[target].agg(["count", "sum"])
  grouped = df.groupby("bin")[target].agg(["count", "sum"])
  grouped = df.groupby("bin")[target].agg(["count", "sum"])
  grouped = df.groupby("bin")[target].agg(["count", "sum"])
  grouped = df.groupby("bin")[target].agg(["count", "sum"])
  grouped = df.groupby("bin")[target].ag

Unnamed: 0,feature,IV
9,risk_rate,0.071018
2,external_data_provider_email_seen_before,0.060775
5,last_borrowed_in_months,0.05699
4,income,0.032263
10,score_3,0.019806
17,last_amount_borrowed,0.011538
6,application_time_in_funnel,0.010076
16,n_issues,0.006095
11,score_5,0.005059
12,n_accounts,0.004869


In [18]:
feature_analysis = (
    feature_importance_cb
    .merge(iv_df, on="feature", how="left")
    .sort_values("importance", ascending=False)
)

feature_analysis.to_csv("final_features.csv", index=False)


In [19]:
selected_features = feature_analysis[
    (feature_analysis["importance"] > 0.1) |
    (feature_analysis["IV"] > 0)
]["feature"].tolist()

len(selected_features)


49

In [20]:
selected_param_cb = {
    "iterations": 500,
    "depth": 4,
    "learning_rate": 0.03,
    "l2_leaf_reg": 5,
    "subsample":  0.9
}
final_cb_retrained = CatBoostClassifier(
    loss_function="Logloss",
    random_state=25,
    verbose=0,
    **selected_param_cb
)

final_cb_retrained.fit(
    X_dev[selected_features],
    y_dev
)


<catboost.core.CatBoostClassifier at 0x1cffdcdc800>

In [21]:
from utils import max_ks

df_dataset_encoded = pd.read_csv("./model_dataset_with_sampling.csv")

X, y = df_dataset_encoded.drop('target_default', axis=1), df_dataset_encoded['target_default']

dev_dataset = df_dataset_encoded[df_dataset_encoded['Sampling'] == 'DEV']
X_dev = dev_dataset.drop(['target_default', 'Sampling'], axis=1)
y_dev = dev_dataset['target_default']

val_dataset = df_dataset_encoded[df_dataset_encoded['Sampling'] == 'VAL']
X_val = val_dataset.drop(['target_default', 'Sampling'], axis=1)
y_val = val_dataset['target_default']

test_dataset = df_dataset_encoded[df_dataset_encoded['Sampling'] == 'TEST']
X_test = test_dataset.drop(['target_default', 'Sampling'], axis=1)
y_test = test_dataset['target_default']

def eval_model(model, X, y):
    return max_ks(y, model.predict_proba(X)[:, 1])

results = {
    "DEV_KS": eval_model(final_cb_retrained, X_dev[selected_features], y_dev),
    "VAL_KS": eval_model(final_cb_retrained, X_val[selected_features], y_val),
    "TEST_KS": eval_model(final_cb_retrained, X_test[selected_features], y_test),
}
results


{'DEV_KS': 37.329117814524814,
 'VAL_KS': 33.581521174335336,
 'TEST_KS': 31.784103470721686}

In [25]:
df_scored = df_dataset_encoded.copy()

feature_cols = [col for col in df_scored.columns if col not in ["target_default", "Sampling"] and col in selected_features]

df_scored["pd_score"] = final_cb_retrained.predict_proba(df_scored[feature_cols])[:, 1]

df_scored = df_scored[selected_features + ["Sampling", "pd_score", "target_default"]]
df_scored.groupby("Sampling")["pd_score"].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Sampling,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
DEV,23374.0,0.15929,0.10753,0.019683,0.082022,0.130011,0.20736,0.878052
TEST,8349.0,0.1572,0.10368,0.021706,0.082251,0.131052,0.205333,0.750903
VAL,10018.0,0.158697,0.106498,0.023448,0.081668,0.129966,0.206639,0.814198


In [26]:
df_scored.to_csv("credit_risk_scored_dataset_retrained.csv",index=False)