In [None]:
import pandas as pd
from utils import max_ks

In [3]:
model_dataset = pd.read_csv("../Data/credit_risk_scored_dataset.csv")
model_dataset.shape
model_dataset.head(2)

Unnamed: 0,score_1_4DLlLW62jReXaqbPaHp1vQ==,facebook_profile,external_data_provider_email_seen_before,score_1_DGCQep2AE5QRkNCshIAlFQ==,income,last_borrowed_in_months,application_time_in_funnel,score_1_smzX0nxh5QlePvtVf6EAeg==,score_1_e4NYDor1NOw6XKGE60AWFw==,risk_rate,...,state_sjJbkqJS7cXalHLBFA+EOQ==,state_1DpYl6dtzY0qE33poow3iw==,state_/L8vvVesB5WyAv190Hw/rQ==,state_BB/zpwTH+8GCIVHlhzOU1Q==,real_state_nSpvDsIsslUaX6GE6m6eQA==,score_2_OlDYtdljgSSYM/M1L2CRaQ==,shipping_state_BR-PA,Sampling,pd_score,target_default
0,False,0,22.0,False,36030.62,0.0,311,False,False,0.4,...,False,False,False,True,False,False,False,DEV,0.156296,0
1,True,0,11.0,False,70289.24,0.0,241,False,False,0.14,...,False,False,False,False,False,False,False,DEV,0.053441,0


In [4]:
def pull_sample(model_data, sampling):
    sample_data = model_data[model_data['Sampling'] == sampling][['target_default', 'pd_score']]
    return sample_data

In [5]:
dev_data = pull_sample(model_dataset, 'DEV')
val_data = pull_sample(model_dataset, 'VAL')
test_data = pull_sample(model_dataset, 'TEST')

dev_data.shape, val_data.shape, test_data.shape

((23374, 2), (10018, 2), (8349, 2))

In [6]:
dev_KS = max_ks(dev_data['target_default'], dev_data['pd_score'])
val_KS = max_ks(val_data['target_default'], val_data['pd_score'])
test_KS = max_ks(test_data['target_default'], test_data['pd_score'])

dev_KS, val_KS, test_KS

(37.329117814524814, 33.581521174335336, 31.784103470721686)

In [None]:
val_data.loc[:, 'Tier'], breakpoints = pd.qcut(
    val_data['pd_score'], 
    q=10, 
    retbins=True,
    duplicates='drop'
)

In [8]:
def gains_table(data_df):
    data_df['Total'] = 1
    tier_df = data_df[['target_default','Total', 'Tier']].groupby(['Tier'], observed=True).sum()
    tier_df["%_obs"] = ((
            tier_df["Total"] / tier_df["Total"].sum()
        ) * 100).round(2)
    tier_df["dv_rate"] = ((
            tier_df['target_default'] / tier_df["Total"]
        ) * 100).round(2)
    return tier_df

In [9]:

eps = 1e-8
min_val = min(val_data['pd_score'].min(), val_data['pd_score'].min())
max_val = max(val_data['pd_score'].max(), val_data['pd_score'].max())

# Adjust edges
breakpoints[0] = min_val - eps
breakpoints[-1] = max_val + eps

In [10]:

dev_data.loc[:, 'Tier'] = pd.cut(
    dev_data['pd_score'], 
    bins=breakpoints, 
    labels=range(1, len(breakpoints)),
    include_lowest=True
)

test_data.loc[:, 'Tier'] = pd.cut(
    test_data['pd_score'], 
    bins=breakpoints, 
    labels=range(1, len(breakpoints)),
    include_lowest=True
)

In [11]:
val_data_tierwise = gains_table(val_data)
val_data_tierwise

Unnamed: 0_level_0,target_default,Total,%_obs,dv_rate
Tier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"(0.0224, 0.0536]",42,1002,10.0,4.19
"(0.0536, 0.0729]",65,1002,10.0,6.49
"(0.0729, 0.0896]",77,1002,10.0,7.68
"(0.0896, 0.109]",87,1001,9.99,8.69
"(0.109, 0.13]",109,1002,10.0,10.88
"(0.13, 0.156]",136,1002,10.0,13.57
"(0.156, 0.187]",194,1001,9.99,19.38
"(0.187, 0.231]",207,1002,10.0,20.66
"(0.231, 0.301]",275,1002,10.0,27.45
"(0.301, 0.814]",407,1002,10.0,40.62


In [12]:
dev_data_tierwise = gains_table(dev_data)
dev_data_tierwise

Unnamed: 0_level_0,target_default,Total,%_obs,dv_rate
Tier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,56,2198,9.41,2.55
2,103,2422,10.37,4.25
3,140,2287,9.79,6.12
4,193,2422,10.37,7.97
5,260,2338,10.01,11.12
6,333,2404,10.29,13.85
7,396,2299,9.84,17.22
8,464,2302,9.85,20.16
9,626,2319,9.93,26.99
10,1156,2371,10.15,48.76


In [None]:
test_data_tierwise = gains_table(test_data)
test_data_tierwise

Unnamed: 0_level_0,target_default,Total,%_obs,dv_rate
Tier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,36,848,10.16,4.25
2,64,826,9.9,7.75
3,47,767,9.19,6.13
4,100,916,10.97,10.92
5,98,774,9.27,12.66
6,118,904,10.83,13.05
7,125,839,10.05,14.9
8,177,847,10.15,20.9
9,226,812,9.73,27.83
10,341,814,9.75,41.89
