In [16]:
import os, pandas as pd, numpy as np, matplotlib.pyplot as plt, seaborn as sns, joblib
from IPython.display import display

In [17]:
model = joblib.load('../artifacts/bestModel.pkl')
df = joblib.load('../artifacts/data.pkl')
coef = joblib.load('../artifacts/coef.pkl')
intercept = joblib.load('../artifacts/intercept.pkl')

In [8]:
f = pd.read_csv("../../data/modelTable.csv")
name = f.columns.tolist()
name

['SK_ID_CURR',
 'TARGET',
 'NAME_CONTRACT_TYPE',
 'CODE_GENDER',
 'FLAG_OWN_CAR',
 'FLAG_OWN_REALTY',
 'CNT_CHILDREN',
 'AMT_INCOME_TOTAL',
 'AMT_CREDIT',
 'AMT_ANNUITY',
 'AMT_GOODS_PRICE',
 'NAME_TYPE_SUITE',
 'NAME_INCOME_TYPE',
 'NAME_EDUCATION_TYPE',
 'NAME_FAMILY_STATUS',
 'NAME_HOUSING_TYPE',
 'REGION_POPULATION_RELATIVE',
 'DAYS_BIRTH',
 'DAYS_EMPLOYED',
 'DAYS_REGISTRATION',
 'DAYS_ID_PUBLISH',
 'FLAG_EMP_PHONE',
 'FLAG_WORK_PHONE',
 'FLAG_PHONE',
 'FLAG_EMAIL',
 'OCCUPATION_TYPE',
 'CNT_FAM_MEMBERS',
 'REGION_RATING_CLIENT',
 'WEEKDAY_APPR_PROCESS_START',
 'HOUR_APPR_PROCESS_START',
 'REG_REGION_NOT_LIVE_REGION',
 'REG_REGION_NOT_WORK_REGION',
 'LIVE_REGION_NOT_WORK_REGION',
 'REG_CITY_NOT_LIVE_CITY',
 'REG_CITY_NOT_WORK_CITY',
 'LIVE_CITY_NOT_WORK_CITY',
 'ORGANIZATION_TYPE',
 'EXT_SOURCE_2',
 'EXT_SOURCE_3',
 'OBS_30_CNT_SOCIAL_CIRCLE',
 'DEF_30_CNT_SOCIAL_CIRCLE',
 'OBS_60_CNT_SOCIAL_CIRCLE',
 'DEF_60_CNT_SOCIAL_CIRCLE',
 'DAYS_LAST_PHONE_CHANGE',
 'FLAG_DOCUMENT_3',
 'FLA

In [14]:
df.columns.tolist()

['NAME_CONTRACT_TYPE',
 'CODE_GENDER',
 'NAME_TYPE_SUITE',
 'NAME_INCOME_TYPE',
 'NAME_EDUCATION_TYPE',
 'NAME_FAMILY_STATUS',
 'NAME_HOUSING_TYPE',
 'OCCUPATION_TYPE',
 'WEEKDAY_APPR_PROCESS_START',
 'ORGANIZATION_TYPE',
 'EXT_SOURCE_2_woe',
 'EXT_SOURCE_3_woe',
 'DAYS_BIRTH_woe',
 'DAYS_EMPLOYED_woe',
 'AMT_CREDIT_woe',
 'AMT_INCOME_TOTAL_woe',
 'BUREAU_DAYS_CREDIT_MIN_woe',
 'BUREAU_DAYS_CREDIT_MAX_woe',
 'BUREAU_DAYS_CREDIT_MEAN_woe',
 'BUREAU_DAYS_CREDIT_ENDDATE_MEAN_woe',
 'BUREAU_AMT_CREDIT_SUM_SUM_woe',
 'BUREAU_AMT_CREDIT_SUM_MEAN_woe',
 'BUREAU_AMT_CREDIT_SUM_DEBT_SUM_woe',
 'BUREAU_AMT_CREDIT_SUM_DEBT_MEAN_woe',
 'BUREAU_AMT_CREDIT_SUM_OVERDUE_SUM_woe',
 'BUREAU_AMT_CREDIT_SUM_OVERDUE_MEAN_woe',
 'BUREAU_DEBT_CREDIT_RATIO_MEAN_woe',
 'BUREAU_IS_ACTIVE_SUM_woe',
 'BUREAU_IS_CLOSED_SUM_woe',
 'BUREAU_IS_SOLD_SUM_woe',
 'BUREAU_IS_BAD_DEBT_SUM_woe',
 'BUREAU_IS_REVOLVING_SUM_woe',
 'BUREAU_IS_CONSUMER_SUM_woe',
 'BUREAU_IS_MORTGAGE_SUM_woe',
 'BUREAU_BB_MONTHS_BALANCE_COUNT_SUM

In [19]:
info = f[['SK_ID_CURR', 
'NAME_CONTRACT_TYPE', 
'NAME_INCOME_TYPE', 
'FLAG_OWN_REALTY', # secure | unsecure
'NAME_EDUCATION_TYPE',
'NAME_FAMILY_STATUS']]

In [20]:
info.shape

(307511, 6)

In [21]:
base = df[["SK_ID_CURR", "TARGET", "PD"]]
base.shape

(307511, 3)

<span style="font-size:30px; font-family:'Times new roman'; font-weight:bold">
Segmentation 🪵
</span>

<span style="font-size:15px; font-family:'Times new roman'">
Group to rating grades so i can estimate stable default rates at the grade level --> not individual<br>
therefore, first problem is --> how to defind the cut-off each level!!
<span>

<span style="font-size:15px; font-family:'Times new roman'">
IFRS9 Two-layers segmantation approch<br>
- Portfolio level --> features that share similar credit risk characteristics<br>
- Risk grade level --> group by level of calculated PD<br>
<br>
# of buckets
5 might groups balance granularity against having enough defaults in each bucket to observe actual<br>default rates over calibration window --> not too small and too large
<span>

<span style="font-size:15px; font-family:'Times new roman'">
Stage 1: PD bucket’s 12-month PD ≤ X % and no other SICR triggers<br>
Stage 2: PD bucket’s 12-month PD > X % (e.g. >1 %) or other SICR indicators<br>
Stage 3: default status<br>
<span>

In [23]:
seg = pd.merge(
    base,
    info,
    on="SK_ID_CURR",
    how="left"
)
seg.shape
seg.head()

Unnamed: 0,SK_ID_CURR,TARGET,PD,NAME_CONTRACT_TYPE,NAME_INCOME_TYPE,FLAG_OWN_REALTY,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS
0,100002,1,0.851064,Cash loans,Working,1,Secondary / secondary special,Single / not married
1,100003,0,0.157916,Cash loans,State servant,0,Higher education,Married
2,100004,0,0.223813,Revolving loans,Working,1,Secondary / secondary special,Single / not married
3,100006,0,0.418342,Cash loans,Working,1,Secondary / secondary special,Civil marriage
4,100007,0,0.540706,Cash loans,Working,1,Secondary / secondary special,Single / not married


In [24]:
seg["portfolio"] = (
      seg["NAME_CONTRACT_TYPE"].astype(str) + "_"
    + seg["NAME_INCOME_TYPE"].astype(str) + "_"
    + seg["FLAG_OWN_REALTY"].map({1:"Secured",0:"Unsecured"}) + "_"
    + seg["NAME_EDUCATION_TYPE"].astype(str) + "_"
    + seg["NAME_FAMILY_STATUS"].astype(str)
)

In [25]:
seg.head()

Unnamed: 0,SK_ID_CURR,TARGET,PD,NAME_CONTRACT_TYPE,NAME_INCOME_TYPE,FLAG_OWN_REALTY,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,portfolio
0,100002,1,0.851064,Cash loans,Working,1,Secondary / secondary special,Single / not married,Cash loans_Working_Secured_Secondary / seconda...
1,100003,0,0.157916,Cash loans,State servant,0,Higher education,Married,Cash loans_State servant_Unsecured_Higher educ...
2,100004,0,0.223813,Revolving loans,Working,1,Secondary / secondary special,Single / not married,Revolving loans_Working_Secured_Secondary / se...
3,100006,0,0.418342,Cash loans,Working,1,Secondary / secondary special,Civil marriage,Cash loans_Working_Secured_Secondary / seconda...
4,100007,0,0.540706,Cash loans,Working,1,Secondary / secondary special,Single / not married,Cash loans_Working_Secured_Secondary / seconda...


In [26]:
n = 5 # # of buckets
seg["PDByBucket"] = seg.groupby('portfolio')['PD'].transform(
    # pd.qcut splits each portfolio’s PD distribution into equal-sized group
    lambda x: pd.qcut(x, n, labels=[f'B{i+1}' for i in range(n)]) 
)

ValueError: Bin edges must be unique: Index([0.06432572130765499, 0.06432572130765499, 0.06432572130765499,
       0.06432572130765499, 0.06432572130765499, 0.06432572130765499],
      dtype='float64', name='Cash loans_Commercial associate_Secured_Academic degree_Civil marriage').
You can drop duplicate edges by setting the 'duplicates' kwarg

<span style="font-size:30px; font-family:'Times new roman'; font-weight:bold">
Calibration🪵
</span>

<span style="font-size:15px; font-family:'Times new roman'">
Why calibration❓<br>
    - PD accuracy --> meaningful of calculated PDs<br>
    - Regulatory and Risk management --> IFRS9 or Basel need well-calibrated PDs to compute ECL<br>
    - Score Interpretability
<span>