## Import Libraries

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from typing import Callable, List

In [2]:
# Remove pandas' display truncation so frames render in full:
# no row limit, no column limit, and no clipping of long cell contents.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

In [3]:
# Shell escape: print the kernel's current working directory.
!pwd

/home/xavi/Documents/CreditRisk/notebooks


In [4]:
# Location of the curated dataset, expressed relative to the notebook's
# working directory (the `notebooks/` folder) instead of a user-specific
# absolute path, so the notebook runs from any checkout of the project.
file: Path = Path('..') / 'data' / 'curated' / 'dataset.parquet'

In [5]:
# Read the parquet file referenced by `file` into a DataFrame using the pyarrow engine.
df: pd.DataFrame = pd.read_parquet(path=file, engine='pyarrow')

### Dependent variable:

In [6]:
# Relative frequency of every loan status; missing values are counted as their own bucket.
df['loan_status'].value_counts(normalize=True, dropna=False)

loan_status
Current                                                0.480878
Fully Paid                                             0.396193
Charged Off                                            0.091092
Late (31-120 days)                                     0.014798
In Grace Period                                        0.006747
Does not meet the credit policy. Status:Fully Paid     0.004263
Late (16-30 days)                                      0.002612
Default                                                0.001784
Does not meet the credit policy. Status:Charged Off    0.001632
Name: proportion, dtype: float64

In [7]:
# Loan statuses that are mapped to 0 in the `non_default` target; every
# other status is mapped to 1.
# NOTE(review): 'Late (16-30 days)' and 'In Grace Period' are not listed, so
# they are treated as non-default — confirm this is intended.
defaults: List[str] = [
    'Charged Off',
    'Late (31-120 days)',
    'Default',
    'Does not meet the credit policy. Status:Charged Off'
]

In [8]:
# Membership predicate: the bound `__contains__` of the `defaults` list,
# i.e. is_in_defaults(v) is equivalent to `v in defaults`.
is_in_defaults: Callable[[object], bool]  = defaults.__contains__

In [9]:
# Binary target: 0 when the loan status is one of `defaults`, 1 otherwise.
# `isin` does the membership test in one vectorised pass instead of calling
# a Python lambda once per value.
df['non_default'] = (~df['loan_status'].isin(defaults)).astype(int)

In [10]:
# Inspect the storage dtype of the `zip_code` column.
# NOTE(review): if this column is an identifier rather than a quantity, an
# integer dtype may not be the right modelling type — confirm.
df['zip_code'].dtype

dtype('int16')

### Independent variables

In [11]:
#TODO: calculate Weight of evidence and Information value

In [12]:
# Boolean mask over the columns of `df`: True where the column dtype is categorical.
category_mask: pd.Series = df.dtypes.eq('category')

In [13]:
# List only the columns flagged by `category_mask`, together with their dtype.
df.dtypes.loc[category_mask]

term                   category
grade                  category
sub_grade              category
emp_length             category
home_ownership         category
verification_status    category
loan_status            category
pymnt_plan             category
purpose                category
addr_state             category
initial_list_status    category
policy_code            category
application_type       category
issue_d                category
dtype: object

In [45]:
# Column-normalised contingency table of grade vs. target: each column sums
# to 1, so a cell is the share of that target class falling in the grade.
# The column axis is labelled `non_default` because the 0/1 values come from
# df['non_default'] (1 = non-default); labelling it `default` would invert
# how the table reads.
marginal_prob: pd.DataFrame = pd.crosstab(
    index=df['grade'],
    columns=df['non_default'],
    colnames=['non_default'],
    normalize='columns',
    dropna=False,
)

In [46]:
# Weight of evidence per grade: ln(share in class 1) - ln(share in class 0).
# Computed on whole columns instead of a Python-level apply over each row.
# A grade with a zero share in either class yields +/-inf here.
marginal_prob['weight_of_evidence'] = np.log(marginal_prob[1]) - np.log(marginal_prob[0])

In [58]:
# Absolute change in weight of evidence between consecutive grades, walking
# the index in descending order; the first grade in that order has no
# predecessor and therefore gets NaN.
marginal_prob['woe_diff'] = (
    marginal_prob['weight_of_evidence']
    .sort_index(ascending=False)
    .diff()
    .abs()
)

In [59]:
# Display the table with the index in descending order (same order used for `woe_diff`).
marginal_prob.sort_index(ascending=False)

default,0,1,weight_of_evidence,woe_diff
grade,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
G,0.018129,0.005774,-1.144166,
F,0.062628,0.024167,-0.952214,0.191952
E,0.136635,0.069328,-0.678466,0.273748
D,0.23246,0.156603,-0.395001,0.283465
C,0.282216,0.267047,-0.055251,0.33975
B,0.211093,0.303792,0.364043,0.419294
A,0.05684,0.173289,1.11473,0.750686


In [48]:
# Raw counts of loans per grade and target class.
# Column key fixed: 'grad' does not exist and raises KeyError; the column is 'grade'.
pd.crosstab(df['grade'], df['non_default'], margins=False, dropna=False)

non_default,0,1
grade,Unnamed: 1_level_1,Unnamed: 2_level_1
A,2897,71970
B,10759,126170
C,14384,110909
D,11848,65040
E,6964,28793
F,3192,10037
G,924,2398


In [49]:
def f(s):
    """Return the number of elements in ``s`` plus 0.5.

    Used as an aggregation function so that every group count is shifted
    by half an observation.
    """
    return len(s) + 0.5

In [50]:
# Smoothed, column-normalised grade-by-target table.
# `f` turns each group into (count + 0.5) and empty cells are filled with 0.5,
# so no cell is zero before logs are taken later on. Each column is then
# divided by its own total; the total is computed once per column rather
# than once per element.
modified_marginal_prob = (
    pd.pivot_table(data=df, index=df['grade'], columns=df['non_default'], aggfunc=f, fill_value=0.5, values=[])
    .apply(lambda s: s / s.sum())
)

In [51]:
# Weight of evidence per grade on the smoothed table:
# ln(share in class 1) - ln(share in class 0), computed column-wise
# rather than with a Python-level apply over each row.
modified_marginal_prob['weight_of_evidence'] = (
    np.log(modified_marginal_prob[1]) - np.log(modified_marginal_prob[0])
)

In [52]:
# Display the smoothed shares together with the weight of evidence per grade.
modified_marginal_prob

non_default,0,1,weight_of_evidence
grade,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,0.056845,0.173289,1.114624
B,0.211089,0.303791,0.364061
C,0.282207,0.267046,-0.055221
D,0.232453,0.156603,-0.394975
E,0.136635,0.069328,-0.67846
F,0.062633,0.024168,-0.95226
G,0.018138,0.005775,-1.144438


In [62]:
# Per-grade information-value term: WoE * (share in class 1 - share in class 0).
# Written as a column expression instead of a row-wise apply.
modified_marginal_prob['information_value'] = (
    modified_marginal_prob['weight_of_evidence']
    * (modified_marginal_prob[1] - modified_marginal_prob[0])
)

In [63]:
# Total information value of `grade`: the sum over grades of
# WoE * (share in class 1 - share in class 0), broadcast to every row.
# It is derived from the source columns rather than from the
# `information_value` column itself, so re-running this cell does not
# multiply the result by the number of rows.
modified_marginal_prob['information_value'] = (
    modified_marginal_prob['weight_of_evidence']
    * (modified_marginal_prob[1] - modified_marginal_prob[0])
).sum()

In [64]:
# Display the final table; `information_value` holds the same total on every row.
modified_marginal_prob

non_default,0,1,weight_of_evidence,information_value
grade,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A,0.056845,0.173289,1.114624,0.290778
B,0.211089,0.303791,0.364061,0.290778
C,0.282207,0.267046,-0.055221,0.290778
D,0.232453,0.156603,-0.394975,0.290778
E,0.136635,0.069328,-0.67846,0.290778
F,0.062633,0.024168,-0.95226,0.290778
G,0.018138,0.005775,-1.144438,0.290778
