In [4]:
import pandas as pd
import numpy as np
from pathlib import Path

In [9]:
# Location of the test dataset CSV used by the rest of the notebook.
fp_data = Path('/data/credit_model') / 'lec6' / 'test_dataset.csv'

In [10]:
# Read the CSV at fp_data into a DataFrame with pandas' default parsing options.
pd_data = pd.read_csv(fp_data)

In [11]:
# Show the loaded frame as the cell output (pandas elides the middle rows of long frames).
pd_data

Unnamed: 0.1,Unnamed: 0,userid,dpd,term,score
0,851918,39668738,-6,202004,74.779712
1,922894,59561190,12,202004,70.099734
2,255080,160277597,-6,202004,50.432712
3,172859,7123385,-2,202004,77.274795
4,15052,95589182,3,202004,74.609002
...,...,...,...,...,...
29995,457180,56894279,-15,202002,66.514859
29996,34431,172568266,12,202002,22.917540
29997,514438,9087607,-9,202002,76.696414
29998,837015,148158746,12,202002,59.577740


In [13]:
# Keep only the rows whose `term` equals 202004.
pd_04 = pd_data.loc[pd_data['term'].eq(202004)]

In [14]:
# Show the term-filtered frame as the cell output.
pd_04

Unnamed: 0.1,Unnamed: 0,userid,dpd,term,score
0,851918,39668738,-6,202004,74.779712
1,922894,59561190,12,202004,70.099734
2,255080,160277597,-6,202004,50.432712
3,172859,7123385,-2,202004,77.274795
4,15052,95589182,3,202004,74.609002
...,...,...,...,...,...
9995,779114,10077794,3,202004,65.901837
9996,435330,68082873,-7,202004,61.325703
9997,526877,21120622,-10,202004,78.743729
9998,253459,7354430,4,202004,76.420273


In [15]:
def get_binomial(num, total, **kwargs):
    """Return the centre and scale of the Wilson score interval for a proportion.

    Args:
        num: Number of "successes" (scalar or array-like / pandas Series).
        total: Number of trials, same shape as ``num``.
        **kwargs: Only ``z`` is read (default 1.96), the normal quantile
            used by the Wilson adjustment.

    Returns:
        tuple: ``(mu, sigma)`` where ``mu`` is the adjusted centre and
        ``sigma`` is the adjusted standard error, so that the interval is
        ``mu +/- z * sigma``.
    """
    z_score = kwargs.get('z', 1.96)
    z_sq = z_score * z_score
    p_hat = num / total

    # Both the centre and the scale are divided by the same correction term.
    shrink = 1 + z_sq / total

    centre = (p_hat + z_sq / (2 * total)) / shrink
    spread = np.sqrt(p_hat * (1 - p_hat) / total + z_sq / (4 * total * total)) / shrink

    return centre, spread

In [16]:
def get_ci_binomial(num, total, **kwargs):
    """Return the lower and upper bounds of the interval built by ``get_binomial``.

    Args:
        num: Number of "successes" (scalar or array-like / pandas Series).
        total: Number of trials, same shape as ``num``.
        **kwargs: Forwarded to ``get_binomial``; ``z`` (default 1.96) is also
            used here as the multiplier of the scale.

    Returns:
        tuple: ``(lower, upper)`` computed as ``mu - z * sigma`` and
        ``mu + z * sigma``.
    """
    z_score = kwargs.get('z', 1.96)
    centre, spread = get_binomial(num, total, **kwargs)

    half_width = z_score * spread
    return centre - half_width, centre + half_width

In [17]:
def get_bin_bad_rate(df_tmp, dpd_threshold, lst_bin, tag_name):
    """Aggregate user and bad-user counts per score bin, with a padded interval.

    Args:
        df_tmp: DataFrame with at least a numeric ``score`` column and a
            ``dpd`` column. It is not modified.
        dpd_threshold: Rows with ``dpd`` strictly greater than this value are
            counted as bad.
        lst_bin: Bin edges passed to ``pd.cut`` for the ``score`` column.
        tag_name: Suffix used in the output column names.

    Returns:
        DataFrame indexed by score bin with columns:
        ``users_{tag_name}`` (count of non-null ``dpd``),
        ``dpd{dpd_threshold}_{tag_name}`` (number of bad rows, as float),
        ``bad_rate`` (bad / users, NaN for empty bins) and
        ``bad_rate_range`` (``[lower, upper]`` as percent strings).
    """
    flag_col = f'dpd{dpd_threshold}'
    users_col = f'users_{tag_name}'
    bad_col = f'dpd{dpd_threshold}_{tag_name}'

    # Work on a copy so the caller's frame does not silently gain helper columns.
    work = df_tmp.copy()
    work['bin'] = pd.cut(work['score'], lst_bin)
    # Explicit 0/1 indicator for every row; kept as float so the summed column
    # has the same dtype as before (e.g. 40.0).
    work[flag_col] = (work['dpd'] > dpd_threshold).astype(float)

    # observed=False keeps every bin from lst_bin in the result, including
    # empty ones, regardless of the pandas version's default.
    grouped = work.groupby('bin', observed=False).agg({'dpd': 'count', flag_col: 'sum'})
    grouped.columns = [users_col, bad_col]
    grouped['bad_rate'] = grouped[bad_col] / grouped[users_col]

    lower, upper = get_ci_binomial(grouped[bad_col], grouped[users_col], z=2.06)

    # The interval is padded further: the lower bound is divided by 1.2 and the
    # upper bound multiplied by 1.2. Empty bins (NaN bounds) are shown as 0.00%.
    lower_txt = (lower / 1.2).fillna(0.0).apply(lambda x: format(x, '.2%'))
    upper_txt = (upper * 1.2).fillna(0.0).apply(lambda x: format(x, '.2%'))

    # Both series share the grouped index, so pairing them positionally is safe.
    grouped['bad_rate_range'] = [[lo, hi] for lo, hi in zip(lower_txt, upper_txt)]

    return grouped

In [18]:
def get_cumulative_bad_rate(df, dpd_threshold, tag_name):
    """Compute the cumulative bad rate and user share from the top index down.

    Rows are accumulated in descending index order, and the results are then
    flipped back so they are listed in ascending index order.

    Args:
        df: DataFrame containing the columns ``dpd{dpd_threshold}_{tag_name}``
            (bad counts) and ``users_{tag_name}`` (user counts).
        dpd_threshold: Threshold used in the bad-count column name.
        tag_name: Suffix used in both column names.

    Returns:
        tuple: ``(cumulative_bad_rate, cumulative_user_density)``.
        The first is a list of floats (cumulative bad / cumulative users,
        0.0 while no users have been accumulated yet). The second is a NumPy
        array of cumulative users divided by the total number of users
        (all zeros when the total is 0).

    Raises:
        KeyError: If either expected column is missing from ``df``.

    Note:
        The results are in ascending index order; assigning them positionally
        back onto ``df`` presumably assumes ``df`` is already sorted that way.
    """
    bad_col = f'dpd{dpd_threshold}_{tag_name}'
    users_col = f'users_{tag_name}'

    df_desc = df.sort_index(ascending=False)
    cum_bad = np.cumsum(df_desc[bad_col].to_numpy(dtype=float))
    cum_users = np.cumsum(df_desc[users_col].to_numpy(dtype=float))

    # Divide only where users have been accumulated; other slots stay at 0.0.
    # This replaces the previous bare `except`, which also hid missing columns.
    cum_rate = np.divide(cum_bad, cum_users, out=np.zeros_like(cum_bad), where=cum_users != 0)

    total_users = df[users_col].sum()
    if total_users == 0:
        cum_density = np.zeros(len(cum_users), dtype=float)
    else:
        cum_density = cum_users[::-1] / total_users

    return cum_rate[::-1].tolist(), cum_density

In [19]:
def generate_performance_report(df, lst_bin, lst_term, dpd_threshold, tag_name):
    """Build a per-score-bin performance table for the selected terms.

    Args:
        df: DataFrame with ``term``, ``score`` and ``dpd`` columns.
        lst_bin: Score bin edges, forwarded to ``get_bin_bad_rate``.
        lst_term: Terms to keep (rows whose ``term`` is in this list).
        dpd_threshold: ``dpd`` values strictly above this count as bad.
        tag_name: Suffix used in the count column names.

    Returns:
        DataFrame indexed by score bin with the count columns and
        ``bad_rate_range`` produced by ``get_bin_bad_rate``, plus ``bad_rate``,
        ``user_density``, ``cum_user_density`` and ``cum_bad_rate`` formatted
        as percent strings with two decimals.
    """
    df_tmp = df.loc[df['term'].isin(lst_term), ['score', 'dpd']].copy()
    # Scores of exactly 0.0 are nudged to 0.1, presumably so they are not lost
    # when the first bin edge is 0 and bins exclude their left edge.
    df_tmp.loc[df_tmp['score'] == 0.0, 'score'] = 0.1

    df_tmp = get_bin_bad_rate(df_tmp, dpd_threshold, lst_bin, tag_name)

    users_col = f'users_{tag_name}'
    df_tmp['user_density'] = df_tmp[users_col] / df_tmp[users_col].sum()

    lst_cumul_bad_rate, lst_cumul_user_density = get_cumulative_bad_rate(df_tmp, dpd_threshold, tag_name)
    df_tmp['cum_user_density'] = lst_cumul_user_density
    df_tmp['cum_bad_rate'] = lst_cumul_bad_rate

    # Render every ratio column as a percent string; missing values become 0.00%.
    for col in ('bad_rate', 'user_density', 'cum_user_density', 'cum_bad_rate'):
        df_tmp[col] = df_tmp[col].fillna(0.0).apply(lambda x: format(x, '.2%'))

    return df_tmp

In [20]:
# Report parameters for term 202004.
dpd_threshold = 7
lst_term = [202004]
tag_name = '202004'
# Score bin edges: 0, then every 5 points from 30 through 80, then 100.
lst_bin = [0] + list(range(30, 81, 5)) + [100]

In [21]:
# Build and display the per-bin report for the term-filtered frame.
generate_performance_report(
    df=pd_04,
    lst_bin=lst_bin,
    lst_term=lst_term,
    dpd_threshold=dpd_threshold,
    tag_name=tag_name,
)

Unnamed: 0_level_0,users_202004,dpd7_202004,bad_rate,bad_rate_range,user_density,cum_user_density,cum_bad_rate
bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
"(0, 30]",56,40.0,71.43%,"[48.20%, 98.40%]",0.56%,100.00%,11.03%
"(30, 35]",68,48.0,70.59%,"[48.56%, 96.59%]",0.68%,99.44%,10.69%
"(35, 40]",107,52.0,48.60%,"[32.41%, 70.10%]",1.07%,98.76%,10.28%
"(40, 45]",99,39.0,39.39%,"[24.93%, 59.69%]",0.99%,97.69%,9.86%
"(45, 50]",182,73.0,40.11%,"[27.44%, 57.28%]",1.82%,96.70%,9.56%
"(50, 55]",319,95.0,29.78%,"[20.67%, 42.35%]",3.19%,94.88%,8.97%
"(55, 60]",534,116.0,21.72%,"[15.23%, 30.74%]",5.34%,91.69%,8.25%
"(60, 65]",942,165.0,17.52%,"[12.59%, 24.25%]",9.42%,86.35%,7.41%
"(65, 70]",1535,189.0,12.31%,"[8.91%, 16.97%]",15.35%,76.93%,6.17%
"(70, 75]",2642,205.0,7.76%,"[5.63%, 10.68%]",26.42%,61.58%,4.64%
