In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb

# Turn off pandas' chained-assignment check (SettingWithCopyWarning) for the
# whole session; None disables both the warning and the error mode.
pd.set_option('mode.chained_assignment', None)


def num_event(event, dataframe, field, val):
    """Count the rows where ``field`` equals ``val`` and the outcome matches ``event``.

    Parameters
    ----------
    event : str
        ``'good'`` selects rows with ``Loan_Status == 1``; any other value
        selects rows with ``Loan_Status == 0``.
    dataframe : pandas.DataFrame
        Frame holding ``field`` and a ``'Loan_Status'`` column.
    field : str
        Name of the column to match against ``val``.
    val : object
        Value of ``field`` to count.

    Returns
    -------
    int
        Number of rows satisfying both conditions.
    """
    status_code = 1 if event == 'good' else 0
    has_value = dataframe[field] == val
    has_status = dataframe['Loan_Status'] == status_code
    # Each True in the combined mask is one matching row.
    return int((has_value & has_status).sum())


def dist_event(event, dataframe, field, val):
    """Return the share of ``event`` rows of ``field`` that carry the value ``val``.

    The numerator is ``num_event`` for ``val``; the denominator is the sum of
    ``num_event`` over every distinct value found in ``dataframe[field]``.

    Parameters
    ----------
    event : str
        Outcome label forwarded to ``num_event`` (``'good'`` or anything else).
    dataframe : pandas.DataFrame
        Frame holding ``field`` and ``'Loan_Status'``.
    field : str
        Column whose distribution is measured.
    val : object
        Value of ``field`` whose share is returned.

    Returns
    -------
    float
        ``count(val) / count(all values)`` for the requested outcome.

    Raises
    ------
    ZeroDivisionError
        If no row of any value has the requested outcome (denominator is 0).
    """
    count_for_val = num_event(event, dataframe, field, val)
    distinct_values = set(dataframe[field].to_numpy())
    count_for_all = sum(
        num_event(event, dataframe, field, other) for other in distinct_values
    )
    return count_for_val / count_for_all

def WOE(dataframe, field, val):
    """Weight of evidence of ``val`` within ``field``.

    Computed as the natural log of the ratio between the 'good' share and the
    'bad' share of ``val``, both obtained from ``dist_event``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding ``field`` and ``'Loan_Status'``.
    field : str
        Column the value belongs to.
    val : object
        Value of ``field`` to score.

    Returns
    -------
    numpy.float64
        ``log(good_share / bad_share)``.

    Notes
    -----
    A value with no 'bad' rows makes the ratio divide by zero; a value with no
    'good' rows makes the logarithm evaluate ``log(0)``. Neither case is
    smoothed here.
    """
    good_share = dist_event('good', dataframe, field, val)
    bad_share = dist_event('bad', dataframe, field, val)
    return np.log(good_share / bad_share)


def IV(dataframe, field):
    """Information value of ``field``.

    Sums ``(good_share - bad_share) * WOE`` over every distinct value found in
    ``dataframe[field]``, where the shares come from ``dist_event`` and the
    weight of evidence from ``WOE``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding ``field`` and ``'Loan_Status'``.
    field : str
        Column to evaluate.

    Returns
    -------
    float
        The accumulated information value (``0`` when the column has no values).
    """
    total = 0
    for value in set(dataframe[field].to_numpy()):
        good_share = dist_event('good', dataframe, field, value)
        bad_share = dist_event('bad', dataframe, field, value)
        total += (good_share - bad_share) * WOE(dataframe, field, value)
    return total


# Load the training data and discard the 'Unnamed: 0' column
# (presumably a row index written out by an earlier export -- confirm).
train = pd.read_csv('train.csv').drop(columns='Unnamed: 0')

# Binning: cut each continuous field into up to 10 quantile bins and store the
# result next to the source column as <FIELD>_BIN. duplicates='drop' collapses
# bins whose edges coincide, so a field may end up with fewer than 10 bins.
bin_fields = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term']
for f in bin_fields:
    train[f'{f.upper()}_BIN'] = pd.qcut(train[f], 10, duplicates='drop')

    
#Check Attr
# Build one summary row per (attribute, distinct value): good/bad counts,
# good/bad shares and the weight of evidence.
fields = [
    'Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
    'Credit_History', 'Property_Area',
    'APPLICANTINCOME_BIN', 'COAPPLICANTINCOME_BIN', 'LOANAMOUNT_BIN',
    'LOAN_AMOUNT_TERM_BIN',
]
xlabel = ['ATTR', 'VAL', 'N_GOOD', 'N_BAD', 'DIST_GOOD', 'DIST_BAD', 'WOE']

# x collects the rows as tuples laid out in the same order as xlabel.
x = []
for f in fields:
    for s in set(train[f].to_numpy()):
        x.append((
            f,
            s,
            num_event('good', train, f, s),
            num_event('bad', train, f, s),
            dist_event('good', train, f, s),
            dist_event('bad', train, f, s),
            WOE(train, f, s),
        ))

# Per-column views of the same rows, one list per entry of xlabel.
(list_attr, list_val, list_ngood, list_nbad,
 list_dgood, list_dbad, list_woe) = (
    [row[k] for row in x] for k in range(len(xlabel))
)

attr = pd.DataFrame(x, columns=xlabel).reset_index(drop=True)

# Transform train -> train_woe
# train_woe keeps the target column and replaces every attribute by the weight
# of evidence of the value each row carries.
train_woe = pd.DataFrame()
train_woe['Loan_Status'] = train['Loan_Status']

# Attributes to encode; each one yields a '<attribute>_WOE' column, in this order.
ATTRS = [
    'Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
    'Credit_History', 'Property_Area',
    'APPLICANTINCOME_BIN', 'COAPPLICANTINCOME_BIN', 'LOANAMOUNT_BIN',
    'LOAN_AMOUNT_TERM_BIN',
]

for f in ATTRS:
    # Compute the WOE once per distinct value, then look it up row by row.
    f_woe = {value: WOE(train, f, value) for value in set(train[f].to_numpy())}
    train_woe[f + '_WOE'] = [f_woe[value] for value in train[f]]

# Regression Model
# Ordinary least squares of Loan_Status on the WOE-encoded attributes.
Y = train_woe['Loan_Status'].to_numpy()

# Every *_WOE column in the order it was added to train_woe; x_i in the
# printed equation is the i-th entry of this list (1-based).
woe_columns = [c for c in train_woe.columns if c.endswith('_WOE')]

# Design matrix: an intercept column of ones followed by the WOE features.
X = np.column_stack(
    [np.ones(len(train_woe))] + [train_woe[c].to_numpy() for c in woe_columns]
)

# Solve min ||X B - Y||^2 with a least-squares solver rather than forming and
# inverting X^T X: explicit inversion squares the condition number and raises
# on a singular matrix, whereas lstsq still returns the minimum-norm solution.
B = np.linalg.lstsq(X, Y, rcond=None)[0]

# Print model
eq = f'y = {B[0]:.3f} '
for i, coef in enumerate(B[1:], start=1):
    eq = eq + f'+ ({coef:.5f})x_{i}'
print(eq)

y = 0.697 + (0.21109)x_1+ (0.11063)x_2+ (0.59759)x_3+ (0.16011)x_4+ (0.00595)x_5+ (0.15925)x_6+ (0.12986)x_7+ (0.14358)x_8+ (0.03998)x_9+ (0.15228)x_10+ (0.14257)x_11


In [2]:
# Show the training frame, including the four *_BIN columns created by the binning step.
train

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,APPLICANTINCOME_BIN,COAPPLICANTINCOME_BIN,LOANAMOUNT_BIN,LOAN_AMOUNT_TERM_BIN
0,1,2,1,1,1,4583,1508.0,128.0,360.0,1.0,2,0,"(3981.5, 4587.6]","(1211.5, 1673.4]","(120.0, 128.0]","(300.0, 360.0]"
1,1,2,0,1,0,3000,0.0,66.0,360.0,1.0,1,1,"(2663.0, 3083.3]","(-0.001, 1211.5]","(16.999, 74.7]","(300.0, 360.0]"
2,1,2,0,0,1,2583,2358.0,120.0,360.0,1.0,1,1,"(2272.6, 2663.0]","(2113.5, 2520.2]","(111.0, 120.0]","(300.0, 360.0]"
3,1,1,0,1,1,6000,0.0,141.0,360.0,1.0,1,1,"(5657.1, 6663.2]","(-0.001, 1211.5]","(138.0, 158.0]","(300.0, 360.0]"
4,1,2,1,1,0,5417,4196.0,267.0,360.0,1.0,1,1,"(4587.6, 5657.1]","(3750.0, 11300.0]","(237.2, 700.0]","(300.0, 360.0]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
333,1,1,0,0,1,3691,0.0,110.0,360.0,1.0,2,1,"(3500.0, 3981.5]","(-0.001, 1211.5]","(96.4, 111.0]","(300.0, 360.0]"
334,0,1,0,0,0,17263,0.0,225.0,360.0,1.0,3,1,"(10000.0, 81000.0]","(-0.001, 1211.5]","(180.0, 237.2]","(300.0, 360.0]"
335,1,2,0,1,1,4625,2857.0,111.0,12.0,1.0,1,1,"(4587.6, 5657.1]","(2520.2, 3750.0]","(96.4, 111.0]","(11.999, 300.0]"
336,1,2,1,1,0,2895,0.0,95.0,360.0,1.0,3,1,"(2663.0, 3083.3]","(-0.001, 1211.5]","(74.7, 96.4]","(300.0, 360.0]"


In [3]:
# Persist the per-value summary table to 'Loan_attr.pickle' in the working directory.
attr.to_pickle('Loan_attr.pickle')

In [4]:
# Information value of every attribute, one row per attribute.
# The frame is built in a single constructor call: DataFrame.append was removed
# in pandas 2.0, and growing a frame row by row copies it on every iteration.
df_iv = pd.DataFrame({
    'ATTR': list(fields),
    'IV': [IV(train, f) for f in fields],
})
df_iv

Unnamed: 0,ATTR,IV
0,Gender,0.012454
1,Married,0.051459
2,Dependents,0.000573
3,Education,0.0525
4,Self_Employed,0.008323
5,Credit_History,1.987847
6,Property_Area,0.116885
7,APPLICANTINCOME_BIN,0.074304
8,COAPPLICANTINCOME_BIN,0.052852
9,LOANAMOUNT_BIN,0.072998


In [5]:
# Show the WOE-encoded frame: Loan_Status plus one *_WOE column per attribute.
train_woe

Unnamed: 0,Loan_Status,Gender_WOE,Married_WOE,Dependents_WOE,Education_WOE,Self_Employed_WOE,Credit_History_WOE,Property_Area_WOE,APPLICANTINCOME_BIN_WOE,COAPPLICANTINCOME_BIN_WOE,LOANAMOUNT_BIN_WOE,LOAN_AMOUNT_TERM_BIN_WOE
0,0,0.053301,0.174214,0.028608,0.117839,0.037128,0.581294,-0.405627,0.168712,0.022529,0.063351,0.075327
1,1,0.053301,0.174214,-0.020031,0.117839,-0.224331,0.581294,-0.051579,0.325715,-0.124074,-0.373367,0.075327
2,1,0.053301,0.174214,-0.020031,-0.447475,0.037128,0.581294,-0.051579,0.022529,0.687505,0.317132,0.075327
3,1,0.053301,-0.296652,-0.020031,0.117839,0.037128,0.581294,-0.051579,-0.115341,-0.124074,-0.020031,0.075327
4,1,0.053301,0.174214,0.028608,0.117839,-0.224331,0.581294,-0.051579,-0.547558,-0.159792,-0.496265,0.075327
...,...,...,...,...,...,...,...,...,...,...,...,...
333,1,0.053301,-0.296652,-0.020031,-0.447475,0.037128,0.581294,-0.405627,0.127890,-0.124074,0.245673,0.075327
334,1,-0.233900,-0.296652,-0.020031,-0.447475,-0.224331,0.581294,0.436191,-0.206312,-0.124074,-0.020031,0.075327
335,1,0.053301,0.174214,-0.020031,0.117839,0.037128,0.581294,-0.051579,-0.547558,0.063351,0.245673,-0.159792
336,1,0.053301,0.174214,0.028608,0.117839,-0.224331,0.581294,0.436191,0.325715,-0.124074,0.325715,0.075327


In [6]:
# Show the summary table: counts, shares and WOE for each (attribute, value) pair.
attr

Unnamed: 0,ATTR,VAL,N_GOOD,N_BAD,DIST_GOOD,DIST_BAD,WOE
0,Gender,0,39,21,0.164557,0.207921,-0.2339
1,Gender,1,198,80,0.835443,0.792079,0.053301
2,Married,1,75,43,0.316456,0.425743,-0.296652
3,Married,2,162,58,0.683544,0.574257,0.174214
4,Dependents,0,138,60,0.582278,0.594059,-0.020031
5,Dependents,1,99,41,0.417722,0.405941,0.028608
6,Education,0,39,26,0.164557,0.257426,-0.447475
7,Education,1,198,75,0.835443,0.742574,0.117839
8,Self_Employed,0,30,16,0.126582,0.158416,-0.224331
9,Self_Employed,1,207,85,0.873418,0.841584,0.037128
