In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score,roc_curve,roc_auc_score
from lightgbm import LGBMClassifier,plot_importance,plot_metric
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

# Defining Functions

In [2]:
def fillrate_1(df):
    data=pd.DataFrame(columns=['variable','unique_values',
                               'fill rate'])
    for var in df.columns:
        if ((df[var].dtype!='object') and (df[var].dtype!='datetime64[ns]')):
            fill_rate=np.round((1-(df[var].isnull().sum()/len(df[var])))*100,3)
            data=data.append({'variable':var,'unique_values':len(df[var].value_counts()),'fill rate':fill_rate},ignore_index=True)
        else:
            fill_rate=np.round((1-(df[var].isnull().sum()/len(df[var])))*100,3)
            data=data.append({'variable':var,'unique_values':len(df[var].value_counts()),'fill rate':fill_rate},ignore_index=True)
            
    return data
def fillrate(df):
    data=pd.DataFrame(columns=['variable','unique_values','count','mean','median','mode','max','min',
                               'fill rate'])
    for var in df.columns:
        if ((df[var].dtype!='object') and (df[var].dtype!='datetime64[ns]')):
            fill_rate=np.round((1-(df[var].isnull().sum()/len(df[var])))*100,3)
            data=data.append({'variable':var,'unique_values':len(df[var].value_counts()),'count':len(df[var]),
                         'mean':np.round(df[var].mean(),3),'median':np.round(df[var].median(),3),'mode':np.NAN,'max':df[var].max(),
                          'min':df[var].min(),'fill rate':fill_rate},ignore_index=True)
        else:
            fill_rate=np.round((1-(df[var].isnull().sum()/len(df[var])))*100,3)
            data=data.append({'variable':var,'unique_values':len(df[var].value_counts()),'count':len(df[var]),
                         'mean':np.NAN,'median':np.NAN,'mode':df[var].mode()[0],'max':np.NAN,
                          'min':np.NAN,'fill rate':fill_rate},ignore_index=True)
                        

    return data


def iv_woe(data, target, bins=10):
    
    newDF = pd.DataFrame()
    
    #Extract Column Names
    cols = data.columns
    
    #Run WOE and IV on all the independent variables
    for ivars in cols[~cols.isin([target])]:
        if (data[ivars].dtype.kind in 'bifc') and (len(np.unique(data[ivars]))>10):
            binned_x = pd.qcut(data[ivars], bins,  duplicates='drop')
            d0 = pd.DataFrame({'x': binned_x, 'y': data[target]})
        else:
            d0 = pd.DataFrame({'x': data[ivars], 'y': data[target]})
        d = d0.groupby("x", as_index=False).agg({"y": ["count", "sum"]})
        d.columns = ['Cutoff', 'N', 'Events']
        d['% of Events'] =np.maximum(d['Events'],1) / d['Events'].sum()
        d['Non-Events'] = d['N'] - d['Events']
        d['% of Non-Events'] = np.maximum(d['Non-Events'],1) / d['Non-Events'].sum()
        d['WoE'] = np.log(d['% of Events']/d['% of Non-Events'])
        d['IV'] = d['WoE'] * (d['% of Events'] - d['% of Non-Events'])
        temp =pd.DataFrame({"Variable" : [ivars], "IV" : [d['IV'].sum()]}, columns = ["Variable", "IV"])
        newDF=pd.concat([newDF,temp], axis=0)
        
    return newDF
def ks(data=None,target=None, prob=None):
    data['target0'] = 1 - data[target]
    data['bucket'] = pd.qcut(data[prob], 10,duplicates='drop')
    grouped = data.groupby('bucket', as_index = False)
    kstable = pd.DataFrame()
    kstable['min_prob'] = grouped.min()[prob]
    kstable['max_prob'] = grouped.max()[prob]
    kstable['events']   = grouped.sum()[target]
    kstable['nonevents'] = grouped.sum()['target0']
    kstable = kstable.sort_values(by="min_prob", ascending=False).reset_index(drop = True)
    kstable['event_rate'] = (kstable.events / data[target].sum()).apply('{0:.2%}'.format)
    kstable['nonevent_rate'] = (kstable.nonevents / data['target0'].sum()).apply('{0:.2%}'.format)
    kstable['cum_eventrate']=(kstable.events / data[target].sum()).cumsum()
    kstable['cum_noneventrate']=(kstable.nonevents / data['target0'].sum()).cumsum()
    kstable['KS'] = np.round(kstable['cum_eventrate']-kstable['cum_noneventrate'], 3) * 100

    #Formating
    kstable['cum_eventrate']= kstable['cum_eventrate'].apply('{0:.2%}'.format)
    kstable['cum_noneventrate']= kstable['cum_noneventrate'].apply('{0:.2%}'.format)
    kstable.index = range(1,11)
    kstable.index.rename('Decile', inplace=True)
    pd.set_option('display.max_columns', 9)
    ks=kstable.KS.max()
    return kstable,ks


def scorecard(model,X_train,X_test):
    y_pred=model.predict(X_test)
    y_prob=model.predict_proba(X_test)[:,1]
    cc=confusion_matrix(y_test,y_pred)
    TN=cc[0,0]
    TP=cc[1,1]
    FP=cc[0,1]
    FN=cc[1,0]
    accuracy=(TP+TN)/(TP+TN+FP+FN)
    precision=TP/(TP+FP)
    recall=TP/(TP+FN)
    Tpr=TP/(TP+FN)
    Fpr=FP/(FP+TN)
    f1_ratio=2*((recall*precision)/(precision+recall))
    auc_test=roc_auc_score(y_test,y_pred)
    data=pd.DataFrame({'y':y_test,'p':y_prob})
    a,ks_test=ks(data=data,target='y',prob='p')
    y_pred_tr=model.predict(X_train)
    y_prob_tr=model.predict_proba(X_train)[:,1]
    cctrain=confusion_matrix(y_train,y_pred_tr)
    TN=cctrain[0,0]
    TP=cctrain[1,1]
    FP=cctrain[0,1]
    FN=cctrain[1,0]
    accuracy_tr=(TP+TN)/(TP+TN+FP+FN)
    precision_tr=TP/(TP+FP)
    recall_tr=TP/(TP+FN)
    Tpr_tr=TP/(TP+FN)
    Fpr_tr=FP/(FP+TN)
    f1_ratio_tr=2*((recall_tr*precision_tr)/(precision_tr+recall_tr))
    auc_tr=roc_auc_score(y_train,y_pred_tr)
    data_r=pd.DataFrame({'y':y_train,'p':y_prob_tr})
    a,ks_train=ks(data=data_r,target='y',prob='p')
    df=pd.DataFrame({'metrics':['accuracy','precision','recall','TPR','FPR','f1_ratio','auc_score','ks statistics'],
                     'train':[accuracy_tr,precision_tr,recall_tr,Tpr_tr,Fpr_tr,f1_ratio_tr,auc_tr,ks_train],
                      'test':[accuracy,precision,recall,Tpr,Fpr,f1_ratio,auc_test,ks_test]})
    df=df.set_index('metrics')
    return df
    
def raw_to_woe_values(data, target, bins=10):
    
    df = data.copy()
    
    #Extract Column Names
    cols = data.columns
    
    #Run WOE and IV on all the independent variables
    for ivars in cols:
        if (data[ivars].dtype.kind in 'bifc') and (len(np.unique(data[ivars]))>10):
            binned_x = pd.qcut(data[ivars], bins,  duplicates='drop')
            d0 = pd.DataFrame({'x': binned_x, 'y': target})
        else:
            d0 = pd.DataFrame({'x': data[ivars], 'y': target})
        d = d0.groupby("x", as_index=False).agg({"y": ["count", "sum"]})
        d.columns = ['Cutoff', 'N', 'Events']
        d['% of Events'] =np.maximum(d['Events'],1) / d['Events'].sum()
        d['Non-Events'] = d['N'] - d['Events']
        d['% of Non-Events'] = np.maximum(d['Non-Events'],1) / d['Non-Events'].sum()
        d['WoE'] = np.log(d['% of Events']/d['% of Non-Events'])
        z=list(d['Cutoff'])
        j=list(d['WoE'])
        dic={z[i]:j[i] for i in range(len(d))}
        df[ivars]=d0['x'].map(dic)
        
        
    return df
def correlation(d,poscutoff,negcutoff):
    v1=[]
    v2=[]
    corr=[]
    for a in d.columns:
        for b in d.drop(labels=a,axis=1).columns:
            if d[a].corr(d[b])>poscutoff:
                v1.append(a)
                v2.append(b)
                corr.append(d[a].corr(d[b]))
            elif d[a].corr(d[b])<negcutoff:
                v1.append(a)
                v2.append(b)
                corr.append(d[a].corr(d[b]))
    df=pd.DataFrame({'v1':v1,'v2':v2,'corr':corr})
    
    return df



# Loading data

In [3]:
data=pd.read_parquet('Project_data.parquet',engine='fastparquet')
data.head()

Unnamed: 0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9,var_10,...,var_3014,var_3015,var_3016,var_3017,var_3018,var_3019,var_3020,var_3021,var_3022,Delay
0,-2.0,0.0,-2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,29.0,10.0,8.0,6641.58,8948.17,10106.0,10075.2,13919.2,14014.5,-24
1,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,10647.0,10647.0,10647.0,31019.0,31019.0,31019.0,8
2,-1.0,0.0,1.0,0.880797,0.0,0.0,0.0,0.0,0.0,0.0,...,49.0,49.0,28.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,1.0,0.982014,-2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.0,1.0,0.0,0.083333,0.0,0.0,155.287,144.188,240.313,0
4,-2.0,0.0,-2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,-5


In [4]:
print(' there are',data.shape[0],' rows and ',data.shape[1],' variables in the dataset')
print('data types in data is')
data.dtypes.value_counts()

 there are 11563  rows and  3023  variables in the dataset
data types in data is


float64    3004
int64        16
object        3
dtype: int64

# Deciding target variable based on Delay 

target variable

Max_delay_all<=2 is non_event (good customer)

max_dela_all>=15 is event (bad customer)

rest are grey

In [5]:
target=[]
for i in range(len(data)):
    if data['Delay'].iloc[i]<=2:
        target.append('non event')
    elif data['Delay'].iloc[i]>=15:
        target.append('event')
    else:
        target.append('grey')
        
data['Event']=target

In [6]:
data.Event.value_counts(normalize=True)*100

non event    77.609617
event        16.189570
grey          6.200813
Name: Event, dtype: float64

The above Event rate shows that there are only 6.2% of grey customers in the data so it does not help in classification of customers so dropping all grey customers

In [7]:
index_to_drop=data[data.Event=='grey'].index
data1=data.drop(index_to_drop)
print('before droping grey customer',data.shape)
print('after droping grey customer',data1.shape)

before droping grey customer (11563, 3024)
after droping grey customer (10846, 3024)


In [8]:
data1=data1.drop(labels='Delay',axis=1)

# FillRate

Fill rate is ratio of number of non null data points to the total number of data points.

Fill rate is useful for the Feature selection, because fill rate tells how much percentage of data a particular variable has.

using fillrate cutoff as 60%, assuming the variables with fillrate less than 60%, it does not give information


In [9]:
Fr_variables=fillrate(data1)
Fr_variables

Unnamed: 0,variable,unique_values,count,mean,median,mode,max,min,fill rate
0,var_1,4,10846,-0.379,0.00,,1.0,-2.0,92.956
1,var_2,8,10846,0.417,0.50,,1.0,0.0,92.956
2,var_3,4,10846,-0.774,0.00,,1.0,-2.0,92.956
3,var_4,8,10846,0.132,0.00,,1.0,0.0,92.956
4,var_5,1,10846,0.000,0.00,,0.0,0.0,92.956
...,...,...,...,...,...,...,...,...,...
3018,var_3019,1434,10846,3246.293,0.00,,2248740.0,0.0,76.572
3019,var_3020,5194,10846,4313.357,91.00,,1731560.0,0.0,87.304
3020,var_3021,4618,10846,4941.504,84.75,,1731560.0,0.0,84.925
3021,var_3022,3783,10846,5739.521,64.50,,1905920.0,0.0,76.572


Dropping variables less than or equal to fillrate cutoff (i,e 60%)

In [10]:
columns=list(Fr_variables[Fr_variables['fill rate']<=60].variable)
data_60_fillrate=data1.drop(labels=columns,axis=1)
print('before drop',data1.shape)
print('after drop', data_60_fillrate.shape)

before drop (10846, 3023)
after drop (10846, 3009)


# Encoding Event variable to from string to number

In [11]:
dic={'non event':0,'event':1}
data_60_fillrate['Event']=data_60_fillrate['Event'].map(dic)

# IV values

IV (Information value) : Information value is the single value representing the entire feature’s predictive power. This will be useful during the feature selection.   

                                 IV = ∑ (% of non-events - % of events) * WOE
                                 
based on IV values we can select Variables with higher information

In [12]:
iv=iv_woe(data=data_60_fillrate,target='Event',bins=10)
iv=iv.set_index('Variable')
iv.head(5)

Unnamed: 0_level_0,IV
Variable,Unnamed: 1_level_1
var_1,0.009017
var_2,0.007014
var_3,0.004729
var_4,0.004729
var_5,0.0


Creating buckets to select IV cutoff

In [13]:
buckets=[]
for i in range(len(iv)):
    if iv.IV.iloc[i]<=0.001:
        buckets.append('<=0.001')
    elif iv.IV.iloc[i]<=0.005:
        buckets.append('.001 to 0.005')
    elif iv.IV.iloc[i]<=0.02:
        buckets.append('0.005 to 0.02')
    elif iv.IV.iloc[i]<=0.1:
        buckets.append('0.02 to 0.1')
    else:
        buckets.append('>0.1')
        
        
iv['buckets']=buckets
iv.groupby('buckets').agg('count')

Unnamed: 0_level_0,IV
buckets,Unnamed: 1_level_1
.001 to 0.005,295
0.005 to 0.02,455
0.02 to 0.1,525
<=0.001,1601
>0.1,132


Based on the distribution of variables in each IV buckets selecting cutoff as 0.01

In [14]:
iv.head()

Unnamed: 0_level_0,IV,buckets
Variable,Unnamed: 1_level_1,Unnamed: 2_level_1
var_1,0.009017,0.005 to 0.02
var_2,0.007014,0.005 to 0.02
var_3,0.004729,.001 to 0.005
var_4,0.004729,.001 to 0.005
var_5,0.0,<=0.001


In [38]:
cols=iv[iv.IV<0.01].index
data2=data_60_fillrate.drop(labels=cols,axis=1)
print('before drop',data_60_fillrate.shape)
print('after drop ',data2.shape)

before drop (10846, 3009)
after drop  (10846, 899)


In [16]:
data2.columns

Index(['var_63', 'var_64', 'var_67', 'var_68', 'var_79', 'var_80', 'var_127',
       'var_128', 'var_129', 'var_130',
       ...
       'var_2976', 'var_2981', 'var_2982', 'var_2986', 'var_2987', 'var_3003',
       'var_3017', 'var_3018', 'var_3019', 'Event'],
      dtype='object', length=899)

# Splitting data into train(80%) and test(20%)

In [17]:
X=data2.drop(labels=['Event'],axis=1)
y=data2['Event']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42,stratify=y)

Checking for distribution of event rate i train and test

In [18]:
print('Y_train','\n',y_train.value_counts(normalize=True)*100)
print('Y_test','\n', y_test.value_counts(normalize=True)*100)

Y_train 
 0    82.745505
1    17.254495
Name: Event, dtype: float64
Y_test 
 0    82.718894
1    17.281106
Name: Event, dtype: float64


# Base Model LightGBM

In [19]:
lgbm1=LGBMClassifier(min_child_samples=500)
lgbm1.fit(X_train,y_train)
score=scorecard(lgbm1,X_train,X_test)
score

Unnamed: 0_level_0,train,test
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1
accuracy,0.904564,0.852535
precision,0.903498,0.682119
recall,0.500334,0.274667
TPR,0.500334,0.274667
FPR,0.011144,0.026741
f1_ratio,0.644024,0.391635
auc_score,0.744595,0.623963
ks statistics,75.7,49.0


# droping variables Which has zero feature importance 

In [20]:
imp=pd.DataFrame({'variables':X_train.columns,'importance':lgbm1.feature_importances_})
columns=list(imp[imp.importance>0]['variables'])
print(len(imp)-len(columns), 'variables has 0 feature importance so droping them')
X_train_n=X_train[columns]
X_test_n=X_test[columns]

471 variables has 0 feature importance so droping them


By removing variables with zero feature importance the complexity of model is reduced with same scores as previous

In [21]:
lgbm2=LGBMClassifier(min_child_samples=500)
lgbm2.fit(X_train_n,y_train)
score=scorecard(lgbm2,X_train_n,X_test_n)
score

Unnamed: 0_level_0,train,test
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1
accuracy,0.904564,0.852535
precision,0.903498,0.682119
recall,0.500334,0.274667
TPR,0.500334,0.274667
FPR,0.011144,0.026741
f1_ratio,0.644024,0.391635
auc_score,0.744595,0.623963
ks statistics,75.7,49.0


By observing above Train and Test scores there is significant difference hence the model is overfitting. so checking for multicolinearity

# Checking for correlation between variables

if the correlation is high between two variables then both the variables will give same meaning or influence on target variable. so we can drop one of them this will reduce complexity of model as well as chance of overfitting. 

In [22]:
corr=correlation(X_train_n,0.65,-0.65)
corr

Unnamed: 0,v1,v2,corr
0,var_131,var_132,0.816501
1,var_132,var_131,0.816501
2,var_192,var_937,0.655538
3,var_194,var_456,0.817583
4,var_197,var_198,0.671617
...,...,...,...
1777,var_2914,var_2912,0.954130
1778,var_2971,var_2753,0.766033
1779,var_2981,var_2753,0.710287
1780,var_3017,var_3019,0.957248


In [23]:
cols=set(corr['v1'])
len(cols)

351

# Lgbm model by removing high correlated variables

In [24]:
X_test_corr=X_test_n.drop(labels=cols,axis=1)
X_train_corr=X_train_n.drop(labels=cols,axis=1)
lgbm3=LGBMClassifier(min_child_samples=500)
lgbm3.fit(X_train_corr,y_train)
score3=scorecard(lgbm3,X_train_corr,X_test_corr)
score3

Unnamed: 0_level_0,train,test
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1
accuracy,0.86745,0.841475
precision,0.877996,0.686747
recall,0.269205,0.152
TPR,0.269205,0.152
FPR,0.007801,0.014485
f1_ratio,0.412065,0.248908
auc_score,0.630702,0.568758
ks statistics,64.6,39.5


The above scores shows the improvement i,e overfitting reduced as the difference between train and test is reduced

# Checking for vif values of variables

if vif is high then variables are highly correlated with each other this leads to the overfitting of model. to avoid overfitting we remove variables having vif values.

Considering variables with vif less than or equal to 10


In [25]:
z=X_test_corr.fillna(0)
vif_data = pd.DataFrame()
vif_data["feature"] = z.columns
vif_data["VIF"] = [variance_inflation_factor(z.values, i)
                          for i in range(len(z.columns))]

# model after droping high vif variables

In [26]:
vif_data=vif_data.set_index('feature')
vif_col=vif_data[vif_data.VIF<=10].index
len(vif_col)

70

In [27]:
X_test_vif=X_test_corr[vif_col]
X_train_vif=X_train_corr[vif_col]
lgbm4=LGBMClassifier(min_child_samples=500)
lgbm4.fit(X_train_vif,y_train)
score4=scorecard(lgbm4,X_train_vif,X_test_vif)
score4

Unnamed: 0_level_0,train,test
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1
accuracy,0.86745,0.835945
precision,0.866808,0.614458
recall,0.273881,0.136
TPR,0.273881,0.136
FPR,0.008776,0.017827
f1_ratio,0.416244,0.222707
auc_score,0.632553,0.559086
ks statistics,65.2,41.7


There is improvement from previous scores

# Hyper parameter

Checking for the parameter which gives best result

In [28]:
param={'boosting_type':['dart','gbdt',],
       'max_depth':[4,8,12],'n_estimators':[75,100,150,200],
       'min_child_samples':[300,400,450,500,550]
        }
grid=GridSearchCV(estimator=LGBMClassifier(),param_grid=param,cv=2,scoring='accuracy')
grid.fit(X_train_vif,y_train)

GridSearchCV(cv=2, estimator=LGBMClassifier(),
             param_grid={'boosting_type': ['dart', 'gbdt'],
                         'max_depth': [4, 8, 12],
                         'min_child_samples': [300, 400, 450, 500, 550],
                         'n_estimators': [75, 100, 150, 200]},
             scoring='accuracy')

In [29]:
print('Best params','\n',grid.best_params_)

Best params 
 {'boosting_type': 'gbdt', 'max_depth': 8, 'min_child_samples': 300, 'n_estimators': 150}


# Model by best parameter

In [30]:
lgbm4=LGBMClassifier(boosting_type='gbdt',max_depth=8,min_child_samples=500,n_estimators=150,)
lgbm4.fit(X_train_vif,y_train)
score4=scorecard(lgbm4,X_train_vif,X_test_vif)
score4

Unnamed: 0_level_0,train,test
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1
accuracy,0.880244,0.840553
precision,0.886824,0.635514
recall,0.350701,0.181333
TPR,0.350701,0.181333
FPR,0.009333,0.021727
f1_ratio,0.502633,0.282158
auc_score,0.670684,0.579803
ks statistics,69.6,40.1


The above scores shows the improvent in metrics compared to previous scores

# Ks tables for test and train

Checking event rate and non event rate in each bands

the band with least event rate is good because less bad customers

In [31]:
y_prob=lgbm4.predict_proba(X_test_vif)[:,1]
d=pd.DataFrame({'y':y_test,'p':y_prob})
kstable,ks_stat=ks(data=d,target='y',prob='p')
kstable

Unnamed: 0_level_0,min_prob,max_prob,events,nonevents,event_rate,nonevent_rate,cum_eventrate,cum_noneventrate,KS
Decile,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,0.391968,0.836326,119,98,31.73%,5.46%,31.73%,5.46%,26.3
2,0.269319,0.389771,63,154,16.80%,8.58%,48.53%,14.04%,34.5
3,0.200047,0.268605,55,162,14.67%,9.03%,63.20%,23.06%,40.1
4,0.148799,0.19984,36,181,9.60%,10.08%,72.80%,33.15%,39.7
5,0.112722,0.148232,26,191,6.93%,10.64%,79.73%,43.79%,35.9
6,0.0825,0.11249,30,187,8.00%,10.42%,87.73%,54.21%,33.5
7,0.060164,0.082444,22,195,5.87%,10.86%,93.60%,65.07%,28.5
8,0.040753,0.060116,13,204,3.47%,11.36%,97.07%,76.43%,20.6
9,0.023638,0.040736,6,211,1.60%,11.75%,98.67%,88.19%,10.5
10,0.003263,0.023569,5,212,1.33%,11.81%,100.00%,100.00%,0.0


In [32]:
y_prob=lgbm4.predict_proba(X_train_vif)[:,1]
f=pd.DataFrame({'y':y_train,'p':y_prob})
kstable,ks_stat=ks(data=f,target='y',prob='p')
kstable

Unnamed: 0_level_0,min_prob,max_prob,events,nonevents,event_rate,nonevent_rate,cum_eventrate,cum_noneventrate,KS
Decile,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,0.427536,0.903745,726,142,48.50%,1.98%,48.50%,1.98%,46.5
2,0.283511,0.427206,393,474,26.25%,6.60%,74.75%,8.58%,66.2
3,0.203312,0.283252,192,676,12.83%,9.42%,87.58%,18.00%,69.6
4,0.148538,0.203275,89,778,5.95%,10.84%,93.52%,28.83%,64.7
5,0.111409,0.148537,40,828,2.67%,11.53%,96.19%,40.37%,55.8
6,0.082286,0.111251,25,842,1.67%,11.73%,97.86%,52.10%,45.8
7,0.060058,0.082265,18,850,1.20%,11.84%,99.06%,63.94%,35.1
8,0.040463,0.060057,10,857,0.67%,11.94%,99.73%,75.87%,23.9
9,0.024638,0.040423,3,865,0.20%,12.05%,99.93%,87.92%,12.0
10,0.003053,0.02463,1,867,0.07%,12.08%,100.00%,100.00%,0.0


The above Model has a good scores but still shows overfitting. so trying traditional approach i,e Logistic Regression in next step

# Logistic Regression

In [75]:
data_lr=data2
data_lr.head()

Unnamed: 0,var_63,var_64,var_67,var_68,...,var_3017,var_3018,var_3019,Event
0,0.0,0.0,0.0,0.5,...,6641.58,8948.17,10106.0,0
2,-2.0,0.0,-1.0,0.0,...,0.0,0.0,0.0,0
3,0.0,0.0,1.0,0.880797,...,0.083333,0.0,0.0,0
4,-2.0,0.0,0.0,0.5,...,0.0,0.0,0.0,0
5,-2.0,0.0,0.0,0.5,...,0.0,0.0,0.0,0


# Handling nan values

Logistic Regression accepts only numerical data and does not accept nan values so replacing nan values by mean

In [76]:
for col in data_lr.columns:
    data_lr[col]=data_lr[col].fillna(data_lr[col].median())
data_lr.head()

Unnamed: 0,var_63,var_64,var_67,var_68,...,var_3017,var_3018,var_3019,Event
0,0.0,0.0,0.0,0.5,...,6641.58,8948.17,10106.0,0
2,-2.0,0.0,-1.0,0.0,...,0.0,0.0,0.0,0
3,0.0,0.0,1.0,0.880797,...,0.083333,0.0,0.0,0
4,-2.0,0.0,0.0,0.5,...,0.0,0.0,0.0,0
5,-2.0,0.0,0.0,0.5,...,0.0,0.0,0.0,0


# splitting data

In [77]:
x=data_lr.drop(labels='Event',axis=1)
y=data_lr['Event']
X_train_lr, X_test_lr, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42,stratify=y)

Distribution of event and non event in test and train data

In [78]:
print('train','\n',y_train.value_counts(normalize=True)*100)
print('test','\n',y_test.value_counts(normalize=True)*100)

train 
 0    82.745505
1    17.254495
Name: Event, dtype: float64
test 
 0    82.718894
1    17.281106
Name: Event, dtype: float64


# Logistic Regression base model

In [79]:
lr_base=LogisticRegression()
lr_base.fit(X_train_lr,y_train)
score_base=scorecard(lr_base,X_train_lr,X_test_lr)
score_base

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Unnamed: 0_level_0,train,test
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1
accuracy,0.825611,0.824885
precision,0.277778,0.272727
recall,0.00668,0.008
TPR,0.00668,0.008
FPR,0.003622,0.004457
f1_ratio,0.013046,0.015544
auc_score,0.501529,0.501772
ks statistics,22.8,21.4


the model metrics looks good but ks statistics is less than 40 so check for overfit is needed

# scaling data 

since the values in each columns have huge scales so scaling them by using a new approach i,e WOE values

WOE  (Weight of Evaluation) : the WOE tells the predictive power of an independent variable in relation to dependent variable. or it measure of seperation of good and bad customers.

                                 WOE = In(% of non-events ➗ % of events)



In [83]:
data_woe=raw_to_woe_values(data2,target=data2['Event'],bins=10)
data_woe.head()

Unnamed: 0,var_63,var_64,var_67,var_68,...,var_3017,var_3018,var_3019,Event
0,-0.119512,-0.088831,0.042751,0.027786,...,-0.216929,-0.254872,-0.158269,-7.534763
2,-0.002361,-0.088831,-0.392427,0.027786,...,0.018741,0.026833,0.016603,-7.534763
3,-0.119512,-0.088831,-0.293908,-0.293908,...,0.018741,0.026833,0.016603,-7.534763
4,-0.002361,-0.088831,0.042751,0.027786,...,0.018741,0.026833,0.016603,-7.534763
5,-0.002361,-0.088831,0.042751,0.027786,...,0.018741,0.026833,0.016603,-7.534763


splitting scaled data

In [84]:
x=data_woe.drop(labels='Event',axis=1)
y=data2['Event']
X_train_woe, X_test_woe, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42,stratify=y)

# LR model using scaled data (woe values)

In [85]:
lr_woe=LogisticRegression()
lr_woe.fit(X_train_woe,y_train)
score_woe=scorecard(lr_woe,X_train_woe,X_test_woe)
score_woe

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Unnamed: 0_level_0,train,test
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1
accuracy,0.880014,0.842396
precision,0.738494,0.566265
recall,0.47161,0.376
TPR,0.47161,0.376
FPR,0.034824,0.060167
f1_ratio,0.575622,0.451923
auc_score,0.718393,0.657916
ks statistics,62.6,43.8


The model metrics shows a huge improvement from base model after replacing raw data with WOE values

# Checking for variables with correlation >0.65

After converting raw data to woe values it may developed a new trend in data this leads to correlation between variables.

so checking for correlation between variables

In [86]:
corr=correlation(X_test_woe,0.65,-0.65)
corr

Unnamed: 0,v1,v2,corr
0,var_63,var_64,0.652173
1,var_63,var_127,0.756339
2,var_64,var_63,0.652173
3,var_67,var_68,0.791306
4,var_68,var_67,0.791306
...,...,...,...
10763,var_3018,var_2976,0.675077
10764,var_3018,var_3017,0.853288
10765,var_3018,var_3019,0.852366
10766,var_3019,var_3017,0.707019


In [87]:
cols=set(corr['v1'])
print('removing',len(cols), 'variables since they have high correlation with other variables')

removing 746 variables since they have high correlation with other variables


# building model by removing variables with high correlation(0.65)

In [88]:
X_test_corr=X_test_woe.drop(labels=cols,axis=1)
X_train_corr=X_train_woe.drop(labels=cols,axis=1)
lr3=LogisticRegression()
lr3.fit(X_train_corr,y_train)
score_corr=scorecard(lr3,X_train_corr,X_test_corr)
score_corr

Unnamed: 0_level_0,train,test
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1
accuracy,0.844629,0.840553
precision,0.633274,0.604317
recall,0.236473,0.224
TPR,0.236473,0.224
FPR,0.028556,0.030641
f1_ratio,0.344358,0.326848
auc_score,0.603959,0.59668
ks statistics,47.2,41.6


# Checking for high vif valued variables

if vif is high then variables are highly correlated with each other this leads to the overfitting of model. to avoid overfitting we remove variables having vif values.

Considering variables with vif less than or equal to 10

In [89]:
z=X_test_corr
vif_data = pd.DataFrame()
vif_data["feature"] = z.columns
vif_data["VIF"] = [variance_inflation_factor(z.values, i)
                          for i in range(len(z.columns))]
vif_data=vif_data.set_index('feature')
vif_col=vif_data[vif_data.VIF<10].index
len(vif_col)

  return 1 - self.ssr/self.uncentered_tss


151

# building model with variables having vif <10

In [90]:
X_test_vif=X_test_corr[vif_col]
X_train_vif=X_train_corr[vif_col]
final_lr=LogisticRegression()
final_lr.fit(X_train_vif,y_train)
score_vif=scorecard(final_lr,X_train_vif,X_test_vif)
score_vif

Unnamed: 0_level_0,train,test
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1
accuracy,0.844629,0.840553
precision,0.633274,0.604317
recall,0.236473,0.224
TPR,0.236473,0.224
FPR,0.028556,0.030641
f1_ratio,0.344358,0.326848
auc_score,0.603959,0.59668
ks statistics,47.2,41.6


# ks table for final model

# train data

In [92]:
y_prob=final_lr.predict_proba(X_train_vif)[:,1]
f=pd.DataFrame({'y':y_train,'p':y_prob})
kstable,ks_stat=ks(data=f,target='y',prob='p')
kstable

Unnamed: 0_level_0,min_prob,max_prob,events,nonevents,event_rate,nonevent_rate,cum_eventrate,cum_noneventrate,KS
Decile,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,0.420943,0.954881,484,384,32.33%,5.35%,32.33%,5.35%,27.0
2,0.288687,0.420914,313,554,20.91%,7.72%,53.24%,13.07%,40.2
3,0.20616,0.288573,227,641,15.16%,8.93%,68.40%,21.99%,46.4
4,0.153426,0.206103,159,708,10.62%,9.86%,79.02%,31.86%,47.2
5,0.114067,0.153319,111,757,7.41%,10.54%,86.44%,42.40%,44.0
6,0.083137,0.114054,76,791,5.08%,11.02%,91.52%,53.42%,38.1
7,0.057652,0.083137,43,825,2.87%,11.49%,94.39%,64.91%,29.5
8,0.037356,0.057607,46,821,3.07%,11.44%,97.46%,76.35%,21.1
9,0.021242,0.037354,28,840,1.87%,11.70%,99.33%,88.05%,11.3
10,0.000458,0.021214,10,858,0.67%,11.95%,100.00%,100.00%,0.0


# test data

In [94]:
y_prob=final_lr.predict_proba(X_test_vif)[:,1]
f=pd.DataFrame({'y':y_test,'p':y_prob})
kstable,ks_stat=ks(data=f,target='y',prob='p')
kstable

Unnamed: 0_level_0,min_prob,max_prob,events,nonevents,event_rate,nonevent_rate,cum_eventrate,cum_noneventrate,KS
Decile,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,0.4232,0.941795,111,106,29.60%,5.91%,29.60%,5.91%,23.7
2,0.290049,0.423018,76,141,20.27%,7.86%,49.87%,13.76%,36.1
3,0.206681,0.290037,52,165,13.87%,9.19%,63.73%,22.95%,40.8
4,0.153236,0.206606,40,177,10.67%,9.86%,74.40%,32.81%,41.6
5,0.110677,0.152891,25,192,6.67%,10.70%,81.07%,43.51%,37.6
6,0.080059,0.110188,23,194,6.13%,10.81%,87.20%,54.32%,32.9
7,0.056264,0.079997,21,196,5.60%,10.92%,92.80%,65.24%,27.6
8,0.035026,0.056127,12,205,3.20%,11.42%,96.00%,76.66%,19.3
9,0.01981,0.035024,12,205,3.20%,11.42%,99.20%,88.08%,11.1
10,0.00121,0.01979,3,214,0.80%,11.92%,100.00%,100.00%,-0.0


The model metrics shows good values