In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score,roc_curve,roc_auc_score
from lightgbm import LGBMClassifier,plot_importance,plot_metric
from statsmodels.stats.outliers_influence import variance_inflation_factor

# defining function

In [2]:
pip install lightgbm

Collecting lightgbm
  Downloading lightgbm-3.3.2-py3-none-manylinux1_x86_64.whl (2.0 MB)
[K     |████████████████████████████████| 2.0 MB 2.9 MB/s eta 0:00:01
Installing collected packages: lightgbm
Successfully installed lightgbm-3.3.2
Note: you may need to restart the kernel to use updated packages.


In [2]:
def fillrate_1(df):
    data=pd.DataFrame(columns=['variable','unique_values',
                               'fill rate'])
    for var in df.columns:
        if ((df[var].dtype!='object') and (df[var].dtype!='datetime64[ns]')):
            fill_rate=np.round((1-(df[var].isnull().sum()/len(df[var])))*100,3)
            data=data.append({'variable':var,'unique_values':len(df[var].value_counts()),'fill rate':fill_rate},ignore_index=True)
        else:
            fill_rate=np.round((1-(df[var].isnull().sum()/len(df[var])))*100,3)
            data=data.append({'variable':var,'unique_values':len(df[var].value_counts()),'fill rate':fill_rate},ignore_index=True)
            
    return data
def fillrate(df):
    data=pd.DataFrame(columns=['variable','unique_values','count','mean','median','mode','max','min',
                               'fill rate'])
    for var in df.columns:
        if ((df[var].dtype!='object') and (df[var].dtype!='datetime64[ns]')):
            fill_rate=np.round((1-(df[var].isnull().sum()/len(df[var])))*100,3)
            data=data.append({'variable':var,'unique_values':len(df[var].value_counts()),'count':len(df[var]),
                         'mean':np.round(df[var].mean(),3),'median':np.round(df[var].median(),3),'mode':np.NAN,'max':df[var].max(),
                          'min':df[var].min(),'fill rate':fill_rate},ignore_index=True)
        else:
            fill_rate=np.round((1-(df[var].isnull().sum()/len(df[var])))*100,3)
            data=data.append({'variable':var,'unique_values':len(df[var].value_counts()),'count':len(df[var]),
                         'mean':np.NAN,'median':np.NAN,'mode':df[var].mode()[0],'max':np.NAN,
                          'min':np.NAN,'fill rate':fill_rate},ignore_index=True)
                        

    return data


def iv_woe(data, target, bins=10):
    
    newDF = pd.DataFrame()
    
    #Extract Column Names
    cols = data.columns
    
    #Run WOE and IV on all the independent variables
    for ivars in cols[~cols.isin([target])]:
        if (data[ivars].dtype.kind in 'bifc') and (len(np.unique(data[ivars]))>10):
            binned_x = pd.qcut(data[ivars], bins,  duplicates='drop')
            d0 = pd.DataFrame({'x': binned_x, 'y': data[target]})
        else:
            d0 = pd.DataFrame({'x': data[ivars], 'y': data[target]})
        d = d0.groupby("x", as_index=False).agg({"y": ["count", "sum"]})
        d.columns = ['Cutoff', 'N', 'Events']
        d['% of Events'] =np.maximum(d['Events'],1) / d['Events'].sum()
        d['Non-Events'] = d['N'] - d['Events']
        d['% of Non-Events'] = np.maximum(d['Non-Events'],1) / d['Non-Events'].sum()
        d['WoE'] = np.log(d['% of Events']/d['% of Non-Events'])
        d['IV'] = d['WoE'] * (d['% of Events'] - d['% of Non-Events'])
        temp =pd.DataFrame({"Variable" : [ivars], "IV" : [d['IV'].sum()]}, columns = ["Variable", "IV"])
        newDF=pd.concat([newDF,temp], axis=0)
        
    return newDF
def ks(data=None,target=None, prob=None):
    data['target0'] = 1 - data[target]
    data['bucket'] = pd.qcut(data[prob], 10)
    grouped = data.groupby('bucket', as_index = False)
    kstable = pd.DataFrame()
    kstable['min_prob'] = grouped.min()[prob]
    kstable['max_prob'] = grouped.max()[prob]
    kstable['events']   = grouped.sum()[target]
    kstable['nonevents'] = grouped.sum()['target0']
    kstable = kstable.sort_values(by="min_prob", ascending=False).reset_index(drop = True)
    kstable['event_rate'] = (kstable.events / data[target].sum()).apply('{0:.2%}'.format)
    kstable['nonevent_rate'] = (kstable.nonevents / data['target0'].sum()).apply('{0:.2%}'.format)
    kstable['cum_eventrate']=(kstable.events / data[target].sum()).cumsum()
    kstable['cum_noneventrate']=(kstable.nonevents / data['target0'].sum()).cumsum()
    kstable['KS'] = np.round(kstable['cum_eventrate']-kstable['cum_noneventrate'], 3) * 100

    #Formating
    kstable['cum_eventrate']= kstable['cum_eventrate'].apply('{0:.2%}'.format)
    kstable['cum_noneventrate']= kstable['cum_noneventrate'].apply('{0:.2%}'.format)
    kstable.index = range(1,11)
    kstable.index.rename('Decile', inplace=True)
    pd.set_option('display.max_columns', 9)
    ks=kstable.KS.max()
    return kstable,ks


def scorecard(model,X_train,X_test):
    y_pred=model.predict(X_test)
    y_prob=model.predict_proba(X_test)[:,1]
    cc=confusion_matrix(y_test,y_pred)
    TN=cc[0,0]
    TP=cc[1,1]
    FP=cc[0,1]
    FN=cc[1,0]
    accuracy=(TP+TN)/(TP+TN+FP+FN)
    precision=TP/(TP+FP)
    recall=TP/(TP+FN)
    Tpr=TP/(TP+FN)
    Fpr=FP/(FP+TN)
    f1_ratio=2*((recall*precision)/(precision+recall))
    auc_test=roc_auc_score(y_test,y_pred)
    data=pd.DataFrame({'y':y_test,'p':y_prob})
    a,ks_test=ks(data=data,target='y',prob='p')
    y_pred_tr=model.predict(X_train)
    y_prob_tr=model.predict_proba(X_train)[:,1]
    cctrain=confusion_matrix(y_train,y_pred_tr)
    TN=cctrain[0,0]
    TP=cctrain[1,1]
    FP=cctrain[0,1]
    FN=cctrain[1,0]
    accuracy_tr=(TP+TN)/(TP+TN+FP+FN)
    precision_tr=TP/(TP+FP)
    recall_tr=TP/(TP+FN)
    Tpr_tr=TP/(TP+FN)
    Fpr_tr=FP/(FP+TN)
    f1_ratio_tr=2*((recall_tr*precision_tr)/(precision_tr+recall_tr))
    auc_tr=roc_auc_score(y_train,y_pred_tr)
    data_r=pd.DataFrame({'y':y_train,'p':y_prob_tr})
    a,ks_train=ks(data=data_r,target='y',prob='p')
    df=pd.DataFrame({'metrics':['accuracy','precision','recall','TPR','FPR','f1_ratio','auc_score','ks statistics'],
                     'train':[accuracy_tr,precision_tr,recall_tr,Tpr_tr,Fpr_tr,f1_ratio_tr,auc_tr,ks_train],
                      'test':[accuracy,precision,recall,Tpr,Fpr,f1_ratio,auc_test,ks_test]})
    df=df.set_index('metrics')
    return df
    
def raw_to_woe_values(data, target, bins=10):
    
    df = data.copy()
    
    #Extract Column Names
    cols = data.columns
    
    #Run WOE and IV on all the independent variables
    for ivars in cols:
        if (data[ivars].dtype.kind in 'bifc') and (len(np.unique(data[ivars]))>10):
            binned_x = pd.qcut(data[ivars], bins,  duplicates='drop')
            d0 = pd.DataFrame({'x': binned_x, 'y': target})
        else:
            d0 = pd.DataFrame({'x': data[ivars], 'y': target})
        d = d0.groupby("x", as_index=False).agg({"y": ["count", "sum"]})
        d.columns = ['Cutoff', 'N', 'Events']
        d['% of Events'] =np.maximum(d['Events'],1) / d['Events'].sum()
        d['Non-Events'] = d['N'] - d['Events']
        d['% of Non-Events'] = np.maximum(d['Non-Events'],1) / d['Non-Events'].sum()
        d['WoE'] = np.log(d['% of Events']/d['% of Non-Events'])
        z=list(d['Cutoff'])
        j=list(d['WoE'])
        dic={z[i]:j[i] for i in range(len(d))}
        df[ivars]=d0['x'].map(dic)
        
        
    return df
def correlation(d,poscutoff,negcutoff):
    v1=[]
    v2=[]
    corr=[]
    for a in d.columns:
        for b in d.drop(labels=a,axis=1).columns:
            if d[a].corr(d[b])>poscutoff:
                v1.append(a)
                v2.append(b)
                corr.append(d[a].corr(d[b]))
            elif d[a].corr(d[b])<negcutoff:
                v1.append(a)
                v2.append(b)
                corr.append(d[a].corr(d[b]))
    df=pd.DataFrame({'v1':v1,'v2':v2,'corr':corr})
    
    return df



In [6]:
pip install fastparquet

Collecting fastparquet
  Downloading fastparquet-0.8.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.5 MB)
[K     |████████████████████████████████| 1.5 MB 1.9 MB/s eta 0:00:01
[?25hCollecting pandas>=1.1.0
  Downloading pandas-1.3.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.3 MB)
[K     |████████████████████████████████| 11.3 MB 15.6 MB/s eta 0:00:01
Collecting cramjam>=2.3.0
  Downloading cramjam-2.5.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (1.6 MB)
[K     |████████████████████████████████| 1.6 MB 9.5 MB/s eta 0:00:01
Installing collected packages: pandas, cramjam, fastparquet
  Attempting uninstall: pandas
    Found existing installation: pandas 1.0.1
    Uninstalling pandas-1.0.1:
      Successfully uninstalled pandas-1.0.1
Successfully installed cramjam-2.5.0 fastparquet-0.8.1 pandas-1.3.5
Note: you may need to restart the kernel to use updated packages.


# Loading data

In [3]:
df=pd.read_parquet('MASTER_DF_short_term_more_than3_with_delay.parquet',engine='fastparquet')
df.head()

Unnamed: 0,cid,app_lst3_sync_trend_bnk,app_lst3_sync_slope_bnk,app_lst3_sync_trend_tvl,app_lst3_sync_slope_tvl,app_lst3_sync_trend_auto,app_lst3_sync_slope_auto,app_lst3_sync_trend_but,app_lst3_sync_slope_but,app_lst3_sync_trend_bk,...,wallet_overall_cnt_bills_due_90_days,wallet_overall_cnt_bills_due_30_days,wallet_overall_amt_avg_total_credit_transactions_12_months,wallet_overall_amt_avg_total_credit_transactions_6_months,wallet_overall_amt_avg_total_credit_transactions_3_months,wallet_overall_amt_avg_total_debit_transactions_12_months,wallet_overall_amt_avg_total_debit_transactions_6_months,wallet_overall_amt_avg_total_debit_transactions_3_months,Max_delay_all,Max_delay_last2
0,65782,-2.0,0.0,-2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,10.0,8.0,6641.58,8948.17,10106.0,10075.2,13919.2,14014.5,-24,-37
1,66488,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,10647.0,10647.0,10647.0,31019.0,31019.0,31019.0,8,6
2,171270,-1.0,0.0,1.0,0.880797,0.0,0.0,0.0,0.0,0.0,...,49.0,28.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
3,204497,1.0,0.982014,-2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.083333,0.0,0.0,155.287,144.188,240.313,0,-30
4,331733,-2.0,0.0,-2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,-5,-5


In [12]:
a=[i for i in df.columns if(any([x in i for x in ['120']]))]
a

['ratio_cnt_avg_daily_debit_trans_30_days_to_60_120_days',
 'ratio_amt_avg_daily_debit_trans_30_days_to_60_120_days',
 'ratio_amt_avg_dbt_per_tran_30_to_60_120_days',
 'ratio_amt_avg_monthly_debit_trans_30_days_to_60_120_days',
 'ratio_cnt_avg_daily_credit_trans_30_days_to_60_120_days',
 'ratio_amt_avg_daily_credit_trans_30_days_to_60_120_days',
 'ratio_amt_avg_credit_per_tran_30_to_60_120_days',
 'ratio_amt_avg_monthly_credit_trans_30_days_to_60_120_days',
 'ratio_of_ratio_avg_mon_dbt_to_crdt_amt_30d_to_60d_120d',
 'ratio_cnt_avg_daily_debit_card_tran_30_days_to_60_120_days',
 'ratio_amt_avg_daily_debit_card_tran_30_days_to_60_120_days',
 'ratio_amt_avg_debit_card_trans_per_trans_30_days_to_60_120_days',
 'ratio_amt_avg_monthly_debit_card_tran_30_days_to_60_120_days',
 'ratio_amt_avg_withdrawal_per_tran_30_days_60_120_days',
 'ratio_amt_avg_monthly_atm_withdrawal_30_days_to_60_120_days',
 'ratio_cnt_cheques_returned_30_to_60_120_days',
 'casa_overall_regex_ratio_cnt_avg_daily_debit_tr

In [4]:
print(' there are',df.shape[0],' rows and ',df.shape[1],' variables in the dataset')
print('data types in data is')
df.dtypes.value_counts()


 there are 11563  rows and  3033  variables in the dataset
data types in data is


float64           3011
int64               18
object               3
datetime64[ns]       1
dtype: int64

replacing -9999 by np.NAN

In [5]:
df.replace(to_replace=-9999,value=np.NAN,inplace=True)

# Deciding target variable based on Delay

target variable

Max_delay_all<=2 is non_event ,

max_dela_all>=15 event,

rest are grey

In [6]:
target=[]
for i in range(len(df)):
    if df['Max_delay_all'].iloc[i]<=2:
        target.append('non event')
    elif df['Max_delay_all'].iloc[i]>=15:
        target.append('event')
    else:
        target.append('grey')
        
df['event']=target

In [7]:
df.event.value_counts(normalize=True)*100

non event    77.609617
event        16.189570
grey          6.200813
Name: event, dtype: float64

The above Event rate shows that there are only 6.2% of grey customers in the data so it does not help in classification of customers so dropping all grey customers

In [8]:
index_to_drop=df[df.event=='grey'].index
df1=df.drop(index_to_drop)
print('before drop grey customer',df.shape)
print('after droping grey customer',df1.shape)

before drop grey customer (11563, 3034)
after droping grey customer (10846, 3034)


droping all unique variables like cid ,loanid, datetime variable and max delay

In [9]:
b=[col for col in df.columns if col.endswith('contact_record')]
a=[col for col in df.columns if col.endswith('id')]
b.extend(a)
df2=df1.drop(labels=b,axis=1)
df3=df2.drop(labels=['Max_delay_all','Max_delay_last2','casa_overall_regex_variable_created_on'],axis=1)
print('before droping variables',df1.shape)
print('after droping variables',df3.shape)

before droping variables (10846, 3034)
after droping variables (10846, 3023)


# Fillrate

Fill rate is ratio of number of non null data points to the total number of data points.

Fill rate is useful for the Feature selection, because fill rate tells how much percentage of data a particular variable has.

using fillrate cutoff as 60%, assuming the variables with fillrate less than 60%, it does not give information


In [10]:
Fr_variables=fillrate(df3)
Fr_variables.head(5)

Unnamed: 0,variable,unique_values,count,mean,median,mode,max,min,fill rate
0,app_lst3_sync_trend_bnk,4,10846,-0.379,0.0,,1.0,-2.0,92.956
1,app_lst3_sync_slope_bnk,8,10846,0.417,0.5,,1.0,0.0,92.956
2,app_lst3_sync_trend_tvl,4,10846,-0.774,0.0,,1.0,-2.0,92.956
3,app_lst3_sync_slope_tvl,8,10846,0.132,0.0,,1.0,0.0,92.956
4,app_lst3_sync_trend_auto,1,10846,0.0,0.0,,0.0,0.0,92.956


dropping variables with fillrate < or =60%

In [11]:
columns=list(Fr_variables[Fr_variables['fill rate']<=60].variable)
df_60_fillrate=df3.drop(labels=columns,axis=1)
print('before drop',df3.shape)
print('after drop', df_60_fillrate.shape)
#encoding target variable
dic={'non event':0,'event':1}
df_60_fillrate['event']=df_60_fillrate['event'].map(dic)

before drop (10846, 3023)
after drop (10846, 3009)


# IV values

IV (Information value) : Information value is the single value representing the entire feature’s predictive power. This will be useful during the feature selection.  

                                 IV = ∑ (% of non-events - % of events) * WOE
                                 
based on IV values we can select Variables with higher information


In [12]:
iv=iv_woe(data=df_60_fillrate,target='event',bins=10)
iv=iv.set_index('Variable')

creating buckest for IV value

In [13]:
#creating buckest for IV value
buckets=[]
for i in range(len(iv)):
    if iv.IV.iloc[i]<=0.001:
        buckets.append('<=0.001')
    elif iv.IV.iloc[i]<=0.005:
        buckets.append('.001 to 0.005')
    elif iv.IV.iloc[i]<=0.02:
        buckets.append('0.005 to 0.02')
    elif iv.IV.iloc[i]<=0.1:
        buckets.append('0.02 to 0.1')
    else:
        buckets.append('>0.1')
        
        
iv['buckets']=buckets
iv.groupby('buckets').agg('count')

Unnamed: 0_level_0,IV
buckets,Unnamed: 1_level_1
.001 to 0.005,295
0.005 to 0.02,455
0.02 to 0.1,525
<=0.001,1601
>0.1,132


cutoff is 0.01 droping variables with IV value <0.01

In [14]:
#cutoff is 0.01 droping variables with IV value <0.01
cols=iv[iv.IV<0.01].index
data=df_60_fillrate.drop(labels=cols,axis=1)
print('before drop',df_60_fillrate.shape)
print('after drop ',data.shape)

before drop (10846, 3009)
after drop  (10846, 899)


# Splitting data into train(80%) and test(20%)

In [15]:
X=data.drop(labels=['event'],axis=1)
y=data['event']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42,stratify=y)

Checking for distribution of event rate i train and test

In [16]:
print('Y_train',y_train.value_counts(normalize=True))
print('Y_test',y_test.value_counts(normalize=True))

Y_train 0    0.827455
1    0.172545
Name: event, dtype: float64
Y_test 0    0.827189
1    0.172811
Name: event, dtype: float64


# building base model LGBM

In [17]:
lgbm1=LGBMClassifier(min_child_samples=500)
lgbm1.fit(X_train,y_train)
score=scorecard(lgbm1,X_train,X_test)
score

Unnamed: 0_level_0,train,test
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1
accuracy,0.904564,0.852535
precision,0.903498,0.682119
recall,0.500334,0.274667
TPR,0.500334,0.274667
FPR,0.011144,0.026741
f1_ratio,0.644024,0.391635
auc_score,0.744595,0.623963
ks statistics,75.7,49.0


# droping variables Which has zero feature importance 

In [18]:
imp=pd.DataFrame({'variables':X_train.columns,'importance':lgbm1.feature_importances_})
columns=list(imp[imp.importance>0]['variables'])
print(len(imp)-len(columns), 'variables has 0 feature importance so droping them')
X_train_n=X_train[columns]
X_test_n=X_test[columns]

471 variables has 0 feature importance so droping them


By removing variables with zero feature importance the complexity of model is reduced with same scores as previous

In [19]:
lgbm2=LGBMClassifier(min_child_samples=500)
lgbm2.fit(X_train_n,y_train)
score=scorecard(lgbm2,X_train_n,X_test_n)
score

Unnamed: 0_level_0,train,test
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1
accuracy,0.904564,0.852535
precision,0.903498,0.682119
recall,0.500334,0.274667
TPR,0.500334,0.274667
FPR,0.011144,0.026741
f1_ratio,0.644024,0.391635
auc_score,0.744595,0.623963
ks statistics,75.7,49.0


By observing above Train and Test scores there is significant difference hence the model is overfitting. so checking for multicolinearity

# checking for correlation of variables

if the correlation is high between two variables then both the variables will give same meaning or influence on target variable. so we can drop one of them this will reduce complexity of model as well as chance of overfitting.

In [20]:
corr=correlation(X_train_n,0.65,-0.65)
corr

Unnamed: 0,v1,v2,corr
0,app_all_sync_trend_tvl,app_all_sync_slope_tvl,0.816501
1,app_all_sync_slope_tvl,app_all_sync_trend_tvl,0.816501
2,app_all_sync_slope_unlst,app_ui_unlst_slope_all_sync,0.655538
3,unlst_app_cnt_tot,avg_unlst_hist_did_cnt,0.817583
4,tvl_app_cnt_tot,tvl_app_perc_tot,0.671617
...,...,...,...
1777,utility_overall_amt_avg_monthly_util_bills_30_...,utility_overall_amt_avg_monthly_util_bills_180...,0.954130
1778,wallet_overall_cnt_total_credit_transactions,no_wallet_sms,0.766033
1779,wallet_overall_cnt_total_debit_transactions,no_wallet_sms,0.710287
1780,wallet_overall_amt_avg_total_credit_transactio...,wallet_overall_amt_avg_total_credit_transactio...,0.957248


In [21]:
cols=set(corr['v1'])
len(cols)

351

# lgbm model by removing high correlated variables

In [22]:
X_test_corr=X_test_n.drop(labels=cols,axis=1)
X_train_corr=X_train_n.drop(labels=cols,axis=1)
lgbm3=LGBMClassifier(min_child_samples=500)
lgbm3.fit(X_train_corr,y_train)
score3=scorecard(lgbm3,X_train_corr,X_test_corr)
score3


Unnamed: 0_level_0,train,test
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1
accuracy,0.86745,0.841475
precision,0.877996,0.686747
recall,0.269205,0.152
TPR,0.269205,0.152
FPR,0.007801,0.014485
f1_ratio,0.412065,0.248908
auc_score,0.630702,0.568758
ks statistics,64.6,39.5


The above scores shows the improvement i,e overfitting reduced as the difference between train and test is reduced

# Checking for vif of variables

if vif is high then variables are highly correlated with each other this leads to the overfitting of model. to avoid overfitting we remove variables having vif values.

Considering variables with vif less than or equal to 10

In [23]:
z=X_test_corr.fillna(0)
vif_data = pd.DataFrame()
vif_data["feature"] = z.columns
vif_data["VIF"] = [variance_inflation_factor(z.values, i)
                          for i in range(len(z.columns))]

In [24]:
vif_data=vif_data.set_index('feature')
vif_col=vif_data[vif_data.VIF<=10].index
len(vif_col)

70

# model after droping high vif variables

In [25]:
X_test_vif=X_test_corr[vif_col]
X_train_vif=X_train_corr[vif_col]
lgbm4=LGBMClassifier(min_child_samples=500)
lgbm4.fit(X_train_vif,y_train)
score4=scorecard(lgbm4,X_train_vif,X_test_vif)
score4


Unnamed: 0_level_0,train,test
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1
accuracy,0.86745,0.835945
precision,0.866808,0.614458
recall,0.273881,0.136
TPR,0.273881,0.136
FPR,0.008776,0.017827
f1_ratio,0.416244,0.222707
auc_score,0.632553,0.559086
ks statistics,65.2,41.7


There is improvement from previous scores

# hyper parameter

In [26]:
from sklearn.model_selection import GridSearchCV

In [27]:
param={'boosting_type':['dart','gbdt',],
       'max_depth':[4,8,12],'n_estimators':[75,100,150,200],
       'min_child_samples':[300,400,450,500,550]
        }
grid=GridSearchCV(estimator=LGBMClassifier(),param_grid=param,cv=2,scoring='accuracy')
grid.fit(X_train_vif,y_train)

GridSearchCV(cv=2, error_score=nan,
             estimator=LGBMClassifier(boosting_type='gbdt', class_weight=None,
                                      colsample_bytree=1.0,
                                      importance_type='split',
                                      learning_rate=0.1, max_depth=-1,
                                      min_child_samples=20,
                                      min_child_weight=0.001,
                                      min_split_gain=0.0, n_estimators=100,
                                      n_jobs=-1, num_leaves=31, objective=None,
                                      random_state=None, reg_alpha=0.0,
                                      reg_lambda=0.0, silent='warn',
                                      subsample=1.0, subsample_for_bin=200000,
                                      subsample_freq=0),
             iid='deprecated', n_jobs=None,
             param_grid={'boosting_type': ['dart', 'gbdt'],
                         'max_de

In [28]:
grid.best_params_

{'boosting_type': 'gbdt',
 'max_depth': 8,
 'min_child_samples': 300,
 'n_estimators': 150}

In [29]:
lgbm4=LGBMClassifier(boosting_type='gbdt',max_depth=4,min_child_samples=500,n_estimators=150,)
lgbm4.fit(X_train_vif,y_train)
score4=scorecard(lgbm4,X_train_vif,X_test_vif)
score4

Unnamed: 0_level_0,train,test
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1
accuracy,0.863416,0.843318
precision,0.840611,0.692308
recall,0.257181,0.168
TPR,0.257181,0.168
FPR,0.010169,0.015599
f1_ratio,0.393862,0.270386
auc_score,0.623506,0.576201
ks statistics,60.0,41.6


The above scores shows the improvent in metrics compared to previous scores

# Ks tables for train and test

Checking event rate and non event rate in each bands

the band with least event rate is good because less bad customers

In [30]:
y_prob=lgbm4.predict_proba(X_test_vif)[:,1]
d=pd.DataFrame({'y':y_test,'p':y_prob})
kstable,ks_stat=ks(data=d,target='y',prob='p')
kstable

Unnamed: 0_level_0,min_prob,max_prob,events,nonevents,event_rate,nonevent_rate,cum_eventrate,cum_noneventrate,KS
Decile,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,0.381388,0.89091,120,97,32.00%,5.40%,32.00%,5.40%,26.6
2,0.265435,0.381001,67,150,17.87%,8.36%,49.87%,13.76%,36.1
3,0.20131,0.265096,51,166,13.60%,9.25%,63.47%,23.01%,40.5
4,0.156618,0.201264,41,176,10.93%,9.81%,74.40%,32.81%,41.6
5,0.12364,0.156553,30,187,8.00%,10.42%,82.40%,43.23%,39.2
6,0.092686,0.12315,24,193,6.40%,10.75%,88.80%,53.98%,34.8
7,0.072649,0.092421,21,196,5.60%,10.92%,94.40%,64.90%,29.5
8,0.053194,0.072626,9,208,2.40%,11.59%,96.80%,76.49%,20.3
9,0.034729,0.052955,8,209,2.13%,11.64%,98.93%,88.13%,10.8
10,0.005802,0.034693,4,213,1.07%,11.87%,100.00%,100.00%,0.0


In [31]:
y_prob=lgbm4.predict_proba(X_train_vif)[:,1]
f=pd.DataFrame({'y':y_train,'p':y_prob})
kstable,ks_stat=ks(data=f,target='y',prob='p')
kstable

Unnamed: 0_level_0,min_prob,max_prob,events,nonevents,event_rate,nonevent_rate,cum_eventrate,cum_noneventrate,KS
Decile,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,0.388237,0.899801,640,228,42.75%,3.18%,42.75%,3.18%,39.6
2,0.270999,0.38814,332,535,22.18%,7.45%,64.93%,10.63%,54.3
3,0.204422,0.270835,220,648,14.70%,9.03%,79.63%,19.65%,60.0
4,0.158761,0.204291,123,744,8.22%,10.36%,87.84%,30.02%,57.8
5,0.1225,0.158591,73,795,4.88%,11.07%,92.72%,41.09%,51.6
6,0.09575,0.122427,53,814,3.54%,11.34%,96.26%,52.43%,43.8
7,0.072619,0.095722,24,844,1.60%,11.76%,97.86%,64.19%,33.7
8,0.052167,0.072606,16,851,1.07%,11.85%,98.93%,76.04%,22.9
9,0.033831,0.052157,14,854,0.94%,11.90%,99.87%,87.94%,11.9
10,0.003957,0.0338,2,866,0.13%,12.06%,100.00%,100.00%,-0.0


The above Model has a good scores but still shows overfitting. so trying traditional approach i,e Logistic Regression in next step