In [176]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt

from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, accuracy_score, roc_curve, roc_auc_score, auc, precision_score, recall_score, f1_score, confusion_matrix,matthews_corrcoef
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler, RobustScaler

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

import seaborn as sns

%matplotlib inline

The following function takes 2 dataframes: dataframe: refers to the whole data and dataframe2: refers to the dataframe with only numeric columns where we want the outliers to be detected and removed. 
The boxplots of the data show that there is no outlier on the left side of the data and hence only right whisker values are checked with.
dataframe stores the columns after removing the values with outliers.

In [177]:
def outlier_removal(dataframe, dataframe2, limit = 1.5):
        
    for col in dataframe2:           # This is for the variables to be treated from upper right whisker
        rw = limit*(dataframe[col].quantile(0.75) - dataframe[col].quantile(0.25)) + dataframe[col].quantile(0.75)
        dataframe = dataframe[~(dataframe[col]>rw)]
        dataframe = dataframe   
    
    return dataframe



In [178]:
df=pd.read_csv("application_data.csv")

print(df.shape)

print(df.columns)

df.head()

(307511, 30)
Index(['SK_ID_CURR', 'TARGET', 'CODE_GENDER', 'FLAG_OWN_CAR',
       'FLAG_OWN_REALTY', 'CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'AMT_CREDIT',
       'AMT_ANNUITY', 'AMT_GOODS_PRICE', 'NAME_INCOME_TYPE',
       'NAME_FAMILY_STATUS', 'DAYS_BIRTH', 'DAYS_EMPLOYED', 'OCCUPATION_TYPE',
       'CNT_FAM_MEMBERS', 'WEEKDAY_APPR_PROCESS_START', 'ORGANIZATION_TYPE',
       'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3',
       'OBS_30_CNT_SOCIAL_CIRCLE', 'DEF_30_CNT_SOCIAL_CIRCLE',
       'OBS_60_CNT_SOCIAL_CIRCLE', 'DEF_60_CNT_SOCIAL_CIRCLE',
       'AMT_REQ_CREDIT_BUREAU_DAY', 'AMT_REQ_CREDIT_BUREAU_WEEK',
       'AMT_REQ_CREDIT_BUREAU_MON', 'AMT_REQ_CREDIT_BUREAU_QRT',
       'AMT_REQ_CREDIT_BUREAU_YEAR'],
      dtype='object')


Unnamed: 0,SK_ID_CURR,TARGET,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,EXT_SOURCE_3,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,M,N,Y,0,202500.0,406597.5,24700.5,351000.0,...,0.139376,2.0,2.0,2.0,2.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,F,N,N,0,270000.0,1293502.5,35698.5,1129500.0,...,,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,M,Y,Y,0,67500.0,135000.0,6750.0,135000.0,...,0.729567,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,F,N,Y,0,135000.0,312682.5,29686.5,297000.0,...,,2.0,0.0,2.0,0.0,,,,,
4,100007,0,M,N,Y,0,121500.0,513000.0,21865.5,513000.0,...,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [179]:
(df.TARGET.value_counts()/df.shape[0])*100

0    91.927118
1     8.072882
Name: TARGET, dtype: float64

We have highly imbalanced dataset.

Undersampling can be defined as removing some observations of the majority class. This is done until the majority and minority class is balanced out.

Undersampling can be a good choice when you have a ton of data -think millions of rows. But a drawback to undersampling is that we are removing information that may be valuable.

Oversampling can be defined as adding more copies to the minority class. Oversampling can be a good choice when you don’t have a ton of data to work with.

A con to consider when undersampling is that it can cause overfitting and poor generalization to your test set.

Therefore, we will consider oversampling technique.

Balance data with the imbalanced-learn python module A number of more sophisticated resampling techniques have been proposed in the scientific literature.

For example, we can cluster the records of the majority class, and do the under-sampling by removing records from each cluster, thus seeking to preserve information. In over-sampling, instead of creating exact copies of the minority class records, we can introduce small variations into those copies, creating more diverse synthetic samples.

Let’s apply some of these resampling techniques, using the Python library imbalanced-learn. It is compatible with scikit-learn and is part of scikit-learn-contrib projects.

# Data Preprocessing

## Dropping Unimportant Columns

In [180]:
df1 = df.copy()

df1.drop(['SK_ID_CURR'],axis=1,inplace=True)

In [181]:
df1.shape

(307511, 29)

In [182]:
df1.dtypes

TARGET                          int64
CODE_GENDER                    object
FLAG_OWN_CAR                   object
FLAG_OWN_REALTY                object
CNT_CHILDREN                    int64
AMT_INCOME_TOTAL              float64
AMT_CREDIT                    float64
AMT_ANNUITY                   float64
AMT_GOODS_PRICE               float64
NAME_INCOME_TYPE               object
NAME_FAMILY_STATUS             object
DAYS_BIRTH                      int64
DAYS_EMPLOYED                   int64
OCCUPATION_TYPE                object
CNT_FAM_MEMBERS               float64
WEEKDAY_APPR_PROCESS_START     object
ORGANIZATION_TYPE              object
EXT_SOURCE_1                  float64
EXT_SOURCE_2                  float64
EXT_SOURCE_3                  float64
OBS_30_CNT_SOCIAL_CIRCLE      float64
DEF_30_CNT_SOCIAL_CIRCLE      float64
OBS_60_CNT_SOCIAL_CIRCLE      float64
DEF_60_CNT_SOCIAL_CIRCLE      float64
AMT_REQ_CREDIT_BUREAU_DAY     float64
AMT_REQ_CREDIT_BUREAU_WEEK    float64
AMT_REQ_CRED

## Feature Engineering

In [183]:
df1['CLIENT_AGE']=(abs(df1['DAYS_BIRTH'])/365).round(0)
df1['YEARS_EMPLOYED']=(abs(df1['DAYS_EMPLOYED'])/365).round(2)

df1.drop(['DAYS_BIRTH','DAYS_EMPLOYED'],axis=1,inplace=True)

#### Removal of Outlier values

In [184]:
df1.select_dtypes('float64').columns

Index(['AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY', 'AMT_GOODS_PRICE',
       'CNT_FAM_MEMBERS', 'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3',
       'OBS_30_CNT_SOCIAL_CIRCLE', 'DEF_30_CNT_SOCIAL_CIRCLE',
       'OBS_60_CNT_SOCIAL_CIRCLE', 'DEF_60_CNT_SOCIAL_CIRCLE',
       'AMT_REQ_CREDIT_BUREAU_DAY', 'AMT_REQ_CREDIT_BUREAU_WEEK',
       'AMT_REQ_CREDIT_BUREAU_MON', 'AMT_REQ_CREDIT_BUREAU_QRT',
       'AMT_REQ_CREDIT_BUREAU_YEAR', 'CLIENT_AGE', 'YEARS_EMPLOYED'],
      dtype='object')

In [185]:
df_float = df1.select_dtypes('float')

df_float.head(2)

Unnamed: 0,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,CNT_FAM_MEMBERS,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,CLIENT_AGE,YEARS_EMPLOYED
0,202500.0,406597.5,24700.5,351000.0,1.0,0.083037,0.262949,0.139376,2.0,2.0,2.0,2.0,0.0,0.0,0.0,0.0,1.0,26.0,1.75
1,270000.0,1293502.5,35698.5,1129500.0,2.0,0.311267,0.622246,,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,46.0,3.25


In [186]:
float_cols = set(df_float.columns)
float_cols

{'AMT_ANNUITY',
 'AMT_CREDIT',
 'AMT_GOODS_PRICE',
 'AMT_INCOME_TOTAL',
 'AMT_REQ_CREDIT_BUREAU_DAY',
 'AMT_REQ_CREDIT_BUREAU_MON',
 'AMT_REQ_CREDIT_BUREAU_QRT',
 'AMT_REQ_CREDIT_BUREAU_WEEK',
 'AMT_REQ_CREDIT_BUREAU_YEAR',
 'CLIENT_AGE',
 'CNT_FAM_MEMBERS',
 'DEF_30_CNT_SOCIAL_CIRCLE',
 'DEF_60_CNT_SOCIAL_CIRCLE',
 'EXT_SOURCE_1',
 'EXT_SOURCE_2',
 'EXT_SOURCE_3',
 'OBS_30_CNT_SOCIAL_CIRCLE',
 'OBS_60_CNT_SOCIAL_CIRCLE',
 'YEARS_EMPLOYED'}

In [187]:
float_cols1 = set(df_float.columns) - set(['AMT_REQ_CREDIT_BUREAU_DAY', 'AMT_REQ_CREDIT_BUREAU_MON', 'AMT_REQ_CREDIT_BUREAU_QRT',
 'AMT_REQ_CREDIT_BUREAU_WEEK', 'AMT_REQ_CREDIT_BUREAU_YEAR'])

print(float_cols1)

{'EXT_SOURCE_3', 'AMT_CREDIT', 'AMT_INCOME_TOTAL', 'EXT_SOURCE_2', 'CLIENT_AGE', 'AMT_GOODS_PRICE', 'OBS_30_CNT_SOCIAL_CIRCLE', 'DEF_60_CNT_SOCIAL_CIRCLE', 'EXT_SOURCE_1', 'YEARS_EMPLOYED', 'DEF_30_CNT_SOCIAL_CIRCLE', 'OBS_60_CNT_SOCIAL_CIRCLE', 'CNT_FAM_MEMBERS', 'AMT_ANNUITY'}


In [188]:
float_cols = set(df_float.columns)
float_cols

{'AMT_ANNUITY',
 'AMT_CREDIT',
 'AMT_GOODS_PRICE',
 'AMT_INCOME_TOTAL',
 'AMT_REQ_CREDIT_BUREAU_DAY',
 'AMT_REQ_CREDIT_BUREAU_MON',
 'AMT_REQ_CREDIT_BUREAU_QRT',
 'AMT_REQ_CREDIT_BUREAU_WEEK',
 'AMT_REQ_CREDIT_BUREAU_YEAR',
 'CLIENT_AGE',
 'CNT_FAM_MEMBERS',
 'DEF_30_CNT_SOCIAL_CIRCLE',
 'DEF_60_CNT_SOCIAL_CIRCLE',
 'EXT_SOURCE_1',
 'EXT_SOURCE_2',
 'EXT_SOURCE_3',
 'OBS_30_CNT_SOCIAL_CIRCLE',
 'OBS_60_CNT_SOCIAL_CIRCLE',
 'YEARS_EMPLOYED'}

In [189]:
df_int = df1.select_dtypes('int64')

int_cols = set(df_int.columns)

print(int_cols)

{'CNT_CHILDREN', 'TARGET'}


In [190]:
df2 = outlier_removal(df1, df_float[float_cols1], limit = 1.5)

In [191]:
df2.shape

(162282, 29)

In [192]:
(df2.TARGET.value_counts()/df2.shape[0])*100

0    91.329907
1     8.670093
Name: TARGET, dtype: float64

In [193]:
df2.select_dtypes('object').columns

Index(['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'NAME_INCOME_TYPE',
       'NAME_FAMILY_STATUS', 'OCCUPATION_TYPE', 'WEEKDAY_APPR_PROCESS_START',
       'ORGANIZATION_TYPE'],
      dtype='object')

There are categorical columns with many categories that have very few data points and would not make sense to keep them as a separate category. Hence are combined together.

In [194]:
for col in df2.select_dtypes('object'):
    print(df2[col].value_counts())

F      102267
M       60013
XNA         2
Name: CODE_GENDER, dtype: int64
N    104868
Y     57414
Name: FLAG_OWN_CAR, dtype: int64
Y    109779
N     52503
Name: FLAG_OWN_REALTY, dtype: int64
Working                 104485
Commercial associate     44098
State servant            13676
Student                     12
Pensioner                    6
Businessman                  3
Maternity leave              2
Name: NAME_INCOME_TYPE, dtype: int64
Married                 101900
Single / not married     27103
Civil marriage           17690
Separated                10996
Widow                     4592
Unknown                      1
Name: NAME_FAMILY_STATUS, dtype: int64
Laborers                 36612
Sales staff              21180
Core staff               17754
Drivers                  11856
Managers                 11477
High skill tech staff     7385
Accountants               6053
Medicine staff            5607
Security staff            4623
Cooking staff             4008
Cleaning staff      

In [195]:
df2['ORGANIZATION_TYPE'].value_counts()*100/df2.shape[0]

Business Entity Type 3    26.684414
Self-employed             15.375704
Other                      6.664941
Medicine                   4.496494
Business Entity Type 2     4.193318
Government                 4.182842
School                     3.602987
Trade: type 7              3.110635
Kindergarten               2.749535
Construction               2.621363
Business Entity Type 1     2.373030
Transport: type 4          2.137021
Trade: type 3              1.458572
Security                   1.369838
Industry: type 3           1.347038
Industry: type 9           1.244131
Housing                    1.159710
Industry: type 11          1.083916
Bank                       0.991484
Military                   0.965603
Agriculture                0.960057
Postal                     0.897204
Transport: type 2          0.895355
Police                     0.873788
Restaurant                 0.738837
Security Ministries        0.732059
Trade: type 2              0.682146
Services                   0

In [196]:
df2['ORGANIZATION_TYPE1'] = np.where(((df2['ORGANIZATION_TYPE'] == 'Industry: type 8') | 
        (df2['ORGANIZATION_TYPE'] == 'Industry: type 13') |
        (df2['ORGANIZATION_TYPE'] == 'Industry: type 10') |
        (df2['ORGANIZATION_TYPE'] == 'Industry: type 6') |
        (df2['ORGANIZATION_TYPE'] == 'Industry: type 12') |
        (df2['ORGANIZATION_TYPE'] == 'Industry: type 2') |
        (df2['ORGANIZATION_TYPE'] == 'Industry: type 5') |
        (df2['ORGANIZATION_TYPE'] == 'Industry: type 4') |
        (df2['ORGANIZATION_TYPE'] == 'Industry: type 7') |
        (df2['ORGANIZATION_TYPE'] == 'Industry: type 1') |
        (df2['ORGANIZATION_TYPE'] == 'Industry: type 11')), 'Industry_Other', 
                            np.where(((df2['ORGANIZATION_TYPE'] == 'Trade: type 4') | 
        (df2['ORGANIZATION_TYPE'] == 'Trade: type 5') |
        (df2['ORGANIZATION_TYPE'] == 'Trade: type 1') |
        (df2['ORGANIZATION_TYPE'] == 'Trade: type 6') |
        (df2['ORGANIZATION_TYPE'] == 'Trade: type 2')), 'Trade_Other', 
            np.where(((df2['ORGANIZATION_TYPE'] == 'Transport: type 1') | 
        (df2['ORGANIZATION_TYPE'] == 'Transport: type 3') |
                     (df2['ORGANIZATION_TYPE'] == 'Transport: type 2')), 'Transport_Other', 
                            np.where(((df2['ORGANIZATION_TYPE'] == 'Religion') | 
        (df2['ORGANIZATION_TYPE'] == 'Cleaning') |
        (df2['ORGANIZATION_TYPE'] == 'Legal Services') |
        (df2['ORGANIZATION_TYPE'] == 'Mobile') |
        (df2['ORGANIZATION_TYPE'] == 'Culture') |
        (df2['ORGANIZATION_TYPE'] == 'Realtor') |
        (df2['ORGANIZATION_TYPE'] == 'Advertising') |
        (df2['ORGANIZATION_TYPE'] == 'Emergency') |
        (df2['ORGANIZATION_TYPE'] == 'Insurance') |
        (df2['ORGANIZATION_TYPE'] == 'Telecom') |
        (df2['ORGANIZATION_TYPE'] == 'Hotel') |
        (df2['ORGANIZATION_TYPE'] == 'Electricity') |
        (df2['ORGANIZATION_TYPE'] == 'University') |
        (df2['ORGANIZATION_TYPE'] == 'Services') |
        (df2['ORGANIZATION_TYPE'] == 'Restaurant') |
        (df2['ORGANIZATION_TYPE'] == 'Security Ministries') |
        (df2['ORGANIZATION_TYPE'] == 'Postal')), 'Other', df2['ORGANIZATION_TYPE']))))

df2['OCCUPATION_TYPE1'] = np.where(((df2['OCCUPATION_TYPE'] == 'Low-skill Laborers') | 
        (df2['OCCUPATION_TYPE'] == 'Waiters/barmen staff') |
        (df2['OCCUPATION_TYPE'] == 'Secretaries') |
        (df2['OCCUPATION_TYPE'] == 'Realty agents') |
        (df2['OCCUPATION_TYPE'] == 'HR staff') |
        (df2['OCCUPATION_TYPE'] == 'IT staff')), 'Other',df2['OCCUPATION_TYPE'])

In [197]:
df2['ORGANIZATION_TYPE1'].value_counts()*100/df2.shape[0]

Business Entity Type 3    26.684414
Self-employed             15.375704
Other                     12.443771
Medicine                   4.496494
Business Entity Type 2     4.193318
Government                 4.182842
School                     3.602987
Trade: type 7              3.110635
Industry_Other             3.049630
Kindergarten               2.749535
Construction               2.621363
Business Entity Type 1     2.373030
Transport: type 4          2.137021
Transport_Other            1.479523
Trade: type 3              1.458572
Security                   1.369838
Industry: type 3           1.347038
Industry: type 9           1.244131
Housing                    1.159710
Trade_Other                1.129515
Bank                       0.991484
Military                   0.965603
Agriculture                0.960057
Police                     0.873788
Name: ORGANIZATION_TYPE1, dtype: float64

In [198]:
df2['OCCUPATION_TYPE1'].value_counts()*100/df2.shape[0]

Laborers                 22.560728
Sales staff              13.051355
Core staff               10.940215
Drivers                   7.305801
Managers                  7.072257
High skill tech staff     4.550720
Accountants               3.729927
Medicine staff            3.455097
Security staff            2.848745
Other                     2.705168
Cooking staff             2.469775
Cleaning staff            1.980503
Private service staff     1.055570
Name: OCCUPATION_TYPE1, dtype: float64

In [199]:
df3 = df2.copy()

df3.drop(['OCCUPATION_TYPE', 'ORGANIZATION_TYPE'], axis = 1, inplace = True)

In [200]:
df3.shape

(162282, 29)

In [201]:
str_cols = df3.select_dtypes('object').columns

df_str = df3[str_cols].astype('str')

from sklearn.preprocessing import LabelEncoder
df_str = df_str.apply(LabelEncoder().fit_transform)
df_str.head()

Unnamed: 0,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,NAME_INCOME_TYPE,NAME_FAMILY_STATUS,WEEKDAY_APPR_PROCESS_START,ORGANIZATION_TYPE1,OCCUPATION_TYPE1
1,0,0,0,4,1,1,16,3
2,1,1,1,6,3,1,6,6
3,0,0,1,6,0,6,4,6
4,1,0,1,6,3,4,14,3
5,1,0,1,4,1,6,14,6


In [202]:
df3[str_cols] = df_str

### Treating Null Values

In [203]:
df3.isnull().sum()*100/df3.shape[0]

TARGET                         0.000000
CODE_GENDER                    0.000000
FLAG_OWN_CAR                   0.000000
FLAG_OWN_REALTY                0.000000
CNT_CHILDREN                   0.000000
AMT_INCOME_TOTAL               0.000000
AMT_CREDIT                     0.000000
AMT_ANNUITY                    0.006778
AMT_GOODS_PRICE                0.112767
NAME_INCOME_TYPE               0.000000
NAME_FAMILY_STATUS             0.000000
CNT_FAM_MEMBERS                0.000616
WEEKDAY_APPR_PROCESS_START     0.000000
EXT_SOURCE_1                  52.468542
EXT_SOURCE_2                   0.224301
EXT_SOURCE_3                  20.445890
OBS_30_CNT_SOCIAL_CIRCLE       0.431964
DEF_30_CNT_SOCIAL_CIRCLE       0.431964
OBS_60_CNT_SOCIAL_CIRCLE       0.431964
DEF_60_CNT_SOCIAL_CIRCLE       0.431964
AMT_REQ_CREDIT_BUREAU_DAY     13.935002
AMT_REQ_CREDIT_BUREAU_WEEK    13.935002
AMT_REQ_CREDIT_BUREAU_MON     13.935002
AMT_REQ_CREDIT_BUREAU_QRT     13.935002
AMT_REQ_CREDIT_BUREAU_YEAR    13.935002


In [204]:
df4=df3.copy()

In [205]:
df4.isnull().sum()*100/df3.shape[0]

TARGET                         0.000000
CODE_GENDER                    0.000000
FLAG_OWN_CAR                   0.000000
FLAG_OWN_REALTY                0.000000
CNT_CHILDREN                   0.000000
AMT_INCOME_TOTAL               0.000000
AMT_CREDIT                     0.000000
AMT_ANNUITY                    0.006778
AMT_GOODS_PRICE                0.112767
NAME_INCOME_TYPE               0.000000
NAME_FAMILY_STATUS             0.000000
CNT_FAM_MEMBERS                0.000616
WEEKDAY_APPR_PROCESS_START     0.000000
EXT_SOURCE_1                  52.468542
EXT_SOURCE_2                   0.224301
EXT_SOURCE_3                  20.445890
OBS_30_CNT_SOCIAL_CIRCLE       0.431964
DEF_30_CNT_SOCIAL_CIRCLE       0.431964
OBS_60_CNT_SOCIAL_CIRCLE       0.431964
DEF_60_CNT_SOCIAL_CIRCLE       0.431964
AMT_REQ_CREDIT_BUREAU_DAY     13.935002
AMT_REQ_CREDIT_BUREAU_WEEK    13.935002
AMT_REQ_CREDIT_BUREAU_MON     13.935002
AMT_REQ_CREDIT_BUREAU_QRT     13.935002
AMT_REQ_CREDIT_BUREAU_YEAR    13.935002


In [206]:

df4.drop(['EXT_SOURCE_1'], axis = 1, inplace = True)

df4.shape

(162282, 28)

In [207]:
str_cols = ['AMT_REQ_CREDIT_BUREAU_DAY', 'AMT_REQ_CREDIT_BUREAU_WEEK',
           'AMT_REQ_CREDIT_BUREAU_MON', 'AMT_REQ_CREDIT_BUREAU_QRT', 'AMT_REQ_CREDIT_BUREAU_YEAR','OBS_30_CNT_SOCIAL_CIRCLE', 'DEF_30_CNT_SOCIAL_CIRCLE',
            'OBS_60_CNT_SOCIAL_CIRCLE', 'DEF_60_CNT_SOCIAL_CIRCLE', 'CNT_FAM_MEMBERS']

df_str = df4[str_cols].fillna(df[str_cols].median())

df4[str_cols] = df_str

In [208]:
int_cols = ['AMT_ANNUITY', 'AMT_GOODS_PRICE','EXT_SOURCE_2', 'EXT_SOURCE_3']

df4[int_cols]

df_int = df4[int_cols].fillna(df[int_cols].mean())

df4[int_cols] = df_int

In [209]:
df4.isnull().sum()*100/df3.shape[0]

TARGET                        0.0
CODE_GENDER                   0.0
FLAG_OWN_CAR                  0.0
FLAG_OWN_REALTY               0.0
CNT_CHILDREN                  0.0
AMT_INCOME_TOTAL              0.0
AMT_CREDIT                    0.0
AMT_ANNUITY                   0.0
AMT_GOODS_PRICE               0.0
NAME_INCOME_TYPE              0.0
NAME_FAMILY_STATUS            0.0
CNT_FAM_MEMBERS               0.0
WEEKDAY_APPR_PROCESS_START    0.0
EXT_SOURCE_2                  0.0
EXT_SOURCE_3                  0.0
OBS_30_CNT_SOCIAL_CIRCLE      0.0
DEF_30_CNT_SOCIAL_CIRCLE      0.0
OBS_60_CNT_SOCIAL_CIRCLE      0.0
DEF_60_CNT_SOCIAL_CIRCLE      0.0
AMT_REQ_CREDIT_BUREAU_DAY     0.0
AMT_REQ_CREDIT_BUREAU_WEEK    0.0
AMT_REQ_CREDIT_BUREAU_MON     0.0
AMT_REQ_CREDIT_BUREAU_QRT     0.0
AMT_REQ_CREDIT_BUREAU_YEAR    0.0
CLIENT_AGE                    0.0
YEARS_EMPLOYED                0.0
ORGANIZATION_TYPE1            0.0
OCCUPATION_TYPE1              0.0
dtype: float64

In [210]:
df4.head()

Unnamed: 0,TARGET,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_INCOME_TYPE,...,DEF_60_CNT_SOCIAL_CIRCLE,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,CLIENT_AGE,YEARS_EMPLOYED,ORGANIZATION_TYPE1,OCCUPATION_TYPE1
1,0,0,0,0,0,270000.0,1293502.5,35698.5,1129500.0,4,...,0.0,0.0,0.0,0.0,0.0,0.0,46.0,3.25,16,3
2,0,1,1,1,0,67500.0,135000.0,6750.0,135000.0,6,...,0.0,0.0,0.0,0.0,0.0,0.0,52.0,0.62,6,6
3,0,0,0,1,0,135000.0,312682.5,29686.5,297000.0,6,...,0.0,0.0,0.0,0.0,0.0,1.0,52.0,8.33,4,6
4,0,1,0,1,0,121500.0,513000.0,21865.5,513000.0,6,...,0.0,0.0,0.0,0.0,0.0,0.0,55.0,8.32,14,3
5,0,1,0,1,0,99000.0,490495.5,27517.5,454500.0,4,...,0.0,0.0,0.0,0.0,1.0,1.0,46.0,4.35,14,6


In [211]:
cor_matrix = df4.corr().abs()

In [212]:
cor_matrix

Unnamed: 0,TARGET,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_INCOME_TYPE,...,DEF_60_CNT_SOCIAL_CIRCLE,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,CLIENT_AGE,YEARS_EMPLOYED,ORGANIZATION_TYPE1,OCCUPATION_TYPE1
TARGET,1.0,0.053544,0.027701,0.006785,0.004506,0.023832,0.018868,0.006962,0.030603,0.03254,...,,0.000691,0.002099,0.014885,0.005807,0.014605,0.067706,0.075103,0.000263,0.009446
CODE_GENDER,0.053544,1.0,0.328245,0.036143,9.7e-05,0.197584,0.013749,0.05277,0.013723,0.023211,...,,0.0012,0.003537,0.001027,0.002992,0.003968,0.0805,0.101387,0.068977,0.055351
FLAG_OWN_CAR,0.027701,0.328245,1.0,0.014218,0.069906,0.18647,0.067407,0.09448,0.071555,0.005511,...,,0.001751,0.000784,0.015825,0.000681,0.017603,0.046993,0.022813,0.011557,0.047817
FLAG_OWN_REALTY,0.006785,0.036143,0.014218,1.0,0.015174,0.025834,0.024998,0.006326,0.032295,0.01183,...,,0.01165,0.004565,0.002087,0.011418,0.047781,0.109377,0.030184,0.005341,0.002919
CNT_CHILDREN,0.004506,9.7e-05,0.069906,0.015174,1.0,0.018328,0.017435,0.006555,0.021591,0.031091,...,,0.000336,0.000763,0.014966,0.000543,0.030162,0.237038,0.056224,0.006292,0.020571
AMT_INCOME_TOTAL,0.023832,0.197584,0.18647,0.025834,0.018328,1.0,0.312363,0.389773,0.315201,0.15563,...,,0.009531,0.016046,0.063904,0.03407,0.079242,0.036141,0.024146,0.04406,0.039827
AMT_CREDIT,0.018868,0.013749,0.067407,0.024998,0.017435,0.312363,1.0,0.7505,0.980365,0.062298,...,,0.004336,8.5e-05,0.044287,0.027379,0.014369,0.156145,0.087943,0.004489,0.017724
AMT_ANNUITY,0.006962,0.05277,0.09448,0.006326,0.006555,0.389773,0.7505,1.0,0.748278,0.078614,...,,0.00056,0.014227,0.02418,0.01768,0.017437,0.078997,0.040962,0.004457,0.017299
AMT_GOODS_PRICE,0.030603,0.013723,0.071555,0.032295,0.021591,0.315201,0.980365,0.748278,1.0,0.061687,...,,0.004893,0.000614,0.046772,0.027701,0.017097,0.152159,0.091276,0.005388,0.018761
NAME_INCOME_TYPE,0.03254,0.023211,0.005511,0.01183,0.031091,0.15563,0.062298,0.078614,0.061687,1.0,...,,0.007249,0.002063,0.0371,0.003894,0.002295,0.008956,0.034736,0.020517,0.017295


## Train_Test Division

In [213]:
print(df4['TARGET'].value_counts())
np.round(df4['TARGET'].value_counts()*100/df4.shape[0],2)

0    148212
1     14070
Name: TARGET, dtype: int64


0    91.33
1     8.67
Name: TARGET, dtype: float64

In [214]:
df4.head()

Unnamed: 0,TARGET,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_INCOME_TYPE,...,DEF_60_CNT_SOCIAL_CIRCLE,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,CLIENT_AGE,YEARS_EMPLOYED,ORGANIZATION_TYPE1,OCCUPATION_TYPE1
1,0,0,0,0,0,270000.0,1293502.5,35698.5,1129500.0,4,...,0.0,0.0,0.0,0.0,0.0,0.0,46.0,3.25,16,3
2,0,1,1,1,0,67500.0,135000.0,6750.0,135000.0,6,...,0.0,0.0,0.0,0.0,0.0,0.0,52.0,0.62,6,6
3,0,0,0,1,0,135000.0,312682.5,29686.5,297000.0,6,...,0.0,0.0,0.0,0.0,0.0,1.0,52.0,8.33,4,6
4,0,1,0,1,0,121500.0,513000.0,21865.5,513000.0,6,...,0.0,0.0,0.0,0.0,0.0,0.0,55.0,8.32,14,3
5,0,1,0,1,0,99000.0,490495.5,27517.5,454500.0,4,...,0.0,0.0,0.0,0.0,1.0,1.0,46.0,4.35,14,6


In [215]:
from sklearn.model_selection import train_test_split

X1 = df4.drop(['TARGET'],axis=1)
y1 = df4.TARGET

X_train,X_test,y_train,y_test=train_test_split(X1, y1, test_size= 0.30, random_state=42)

print(X_train.shape)
print(X_test.shape)


(113597, 27)
(48685, 27)


In [216]:
rs = RobustScaler()

X_scaled = pd.DataFrame(rs.fit_transform(X_train))


X = X_scaled
y = y_train

In [217]:
X_scaled

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,17,18,19,20,21,22,23,24,25,26
0,1.0,1.0,0.0,0.0,-0.150000,0.185476,0.144548,0.010204,-0.4,0.0,...,0.0,0.0,0.0,1.0,0.0,-1.0,1.3125,2.060000,0.076923,0.000000
1,0.0,0.0,0.0,0.0,0.800000,-0.075023,0.642998,-0.102041,-1.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.7500,-0.604615,0.230769,-0.714286
2,0.0,1.0,0.0,2.0,-0.600000,1.743656,0.772048,1.989796,-1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,-0.1875,-0.632308,0.615385,0.714286
3,1.0,1.0,0.0,0.0,-0.350000,0.398822,0.268808,0.510204,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.8750,-0.593846,-0.538462,0.714286
4,0.0,1.0,-1.0,0.0,0.250000,-0.250408,0.707523,-0.204082,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,-1.0,0.4375,1.661538,0.230769,0.285714
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
113592,0.0,0.0,-1.0,1.0,-0.100000,-0.100649,-0.095238,-0.173469,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.6250,-0.336923,0.076923,0.000000
113593,0.0,0.0,0.0,0.0,0.400000,0.629930,1.238377,0.520408,0.0,-1.0,...,0.0,0.0,0.0,0.0,0.0,-1.0,-0.5000,-0.483077,0.230769,-0.428571
113594,1.0,1.0,0.0,2.0,1.792175,1.326294,0.415328,1.530612,-1.0,-1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.5625,-0.335385,-0.538462,1.000000
113595,0.0,0.0,-1.0,0.0,-0.350000,-0.498043,-0.028459,-0.510204,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,-1.0,1.7500,3.375385,-0.615385,1.000000


Since the data is highly imbalanced, Logistic Regression won't give good results. We are using the bagging algorithms like DT and RF for modelling.
Boosting algorithms could be used, but they might highly overfit the imbalanced data.

"The experiments show that the bagging techniques generally outperform boosting, and hence in noisy data environments, bagging is the preferred method for handling class imbalance." taken from the research paper: https://www.researchgate.net/publication/220508931_Comparing_Boosting_and_Bagging_Techniques_With_Noisy_and_Imbalanced_Data#:~:text=The%20experiments%20show%20that%20the,method%20for%20handling%20class%20imbalance.

# Random Forest

In [218]:
randomForestClassifier = RandomForestClassifier(n_estimators = 800, max_depth = 10,
                                                  class_weight = "balanced",
                                                  random_state= 42)

If we increase max_depth, the model highly overfits as can be understood from test precision and recall scores. 

Max_depth can be reduced, but that isn't improving the model to a great extent. Therefore, 10 is chosen as max_depth.

In [220]:
randomForestClassifier.fit(X_scaled, y_train)

RandomForestClassifier(class_weight='balanced', max_depth=10, n_estimators=800,
                       random_state=42)

In [221]:
X_train['pred']  = randomForestClassifier.predict(X_scaled)

print('Accuracy Score ->',round(accuracy_score(y_train, X_train['pred']),3))
print('Precision Score ->',round(precision_score(y_train, X_train['pred']),3))
print('Recall Score ->',round(recall_score(y_train, X_train['pred']),3))
print('\n')
print('***Confusion Matrix***')
confusion_matrix(y_train, X_train['pred'])

Accuracy Score -> 0.77
Precision Score -> 0.23
Recall Score -> 0.71


***Confusion Matrix***


array([[80557, 23238],
       [ 2845,  6957]], dtype=int64)

In [222]:
X_sc_test = pd.DataFrame(rs.transform(X_test))

In [223]:
X_sc_test['pred'] = randomForestClassifier.predict(X_sc_test)

print('Accuracy Score ->',round(accuracy_score(y_test, X_sc_test['pred']),3))
print('Precision Score ->',round(precision_score(y_test, X_sc_test['pred']),3))
print('Recall Score ->',round(recall_score(y_test, X_sc_test['pred']),3))
print('\n')
print('***Confusion Matrix***')
confusion_matrix(y_test, X_sc_test['pred'])

Accuracy Score -> 0.748
Precision Score -> 0.187
Recall Score -> 0.561


***Confusion Matrix***


array([[34023, 10394],
       [ 1874,  2394]], dtype=int64)

# Decision Tree

In [224]:
decision_tree_model = DecisionTreeClassifier(class_weight = "balanced",
                                                  random_state= 42,max_depth=10)

In [225]:
decision_tree_model.fit(X_scaled, y_train)

DecisionTreeClassifier(class_weight='balanced', max_depth=10, random_state=42)

In [226]:
X_train['pred']  = decision_tree_model.predict(X_scaled)

print('Accuracy Score ->',round(accuracy_score(y_train, X_train['pred']),3))
print('Precision Score ->',round(precision_score(y_train, X_train['pred']),3))
print('Recall Score ->',round(recall_score(y_train, X_train['pred']),3))
print('\n')
print('***Confusion Matrix***')
confusion_matrix(y_train, X_train['pred'])

Accuracy Score -> 0.661
Precision Score -> 0.172
Recall Score -> 0.77


***Confusion Matrix***


array([[67501, 36294],
       [ 2256,  7546]], dtype=int64)

In [227]:
X_sc_test = pd.DataFrame(rs.transform(X_test))

In [228]:
X_sc_test['pred'] = decision_tree_model.predict(X_sc_test)

print('Accuracy Score ->',round(accuracy_score(y_test, X_sc_test['pred']),3))
print('Precision Score ->',round(precision_score(y_test, X_sc_test['pred']),3))
print('Recall Score ->',round(recall_score(y_test, X_sc_test['pred']),3))
print('\n')
print('***Confusion Matrix***')
confusion_matrix(y_test, X_sc_test['pred'])

Accuracy Score -> 0.637
Precision Score -> 0.145
Recall Score -> 0.64


***Confusion Matrix***


array([[28300, 16117],
       [ 1535,  2733]], dtype=int64)

In [229]:
X_train.drop('pred',axis=1,inplace=True)
X_sc_test.drop('pred',axis=1,inplace=True)

## Under-Sampling

In [230]:
from imblearn.under_sampling import RandomUnderSampler

In [231]:
under = RandomUnderSampler(sampling_strategy=0.25, random_state=42)

X_resamp, y_resamp = under.fit_resample(X_train,y_train)

print(X_resamp.shape, y_resamp.shape)

print((y_resamp.value_counts()/y_resamp.shape[0])*100)
#print((y_resamp.value_counts()[1]/(y_resamp.value_counts()[0]+y_resamp.value_counts()[1])))
print()
X_train_undr,X_test_undr,y_train_undr,y_test_undr=X_resamp,X_test,y_resamp,y_test


(49010, 27) (49010,)
0    80.0
1    20.0
Name: TARGET, dtype: float64



In [232]:
rs_undr = RobustScaler()

X_scaled_undr = pd.DataFrame(rs_undr.fit_transform(X_train_undr))


In [233]:
randomForestClassifier_undr = RandomForestClassifier(n_estimators = 800, max_depth = 10,
                                                  class_weight = "balanced",
                                                  random_state= 42)

randomForestClassifier_undr.fit(X_scaled_undr, y_train_undr)

RandomForestClassifier(class_weight='balanced', max_depth=10, n_estimators=800,
                       random_state=42)

In [234]:
X_train_undr['pred']  = randomForestClassifier_undr.predict(X_scaled_undr)

print('Accuracy Score ->',round(accuracy_score(y_train_undr, X_train_undr['pred']),3))
print('Precision Score ->',round(precision_score(y_train_undr, X_train_undr['pred']),3))
print('Recall Score ->',round(recall_score(y_train_undr, X_train_undr['pred']),3))
print('\n')
print('***Confusion Matrix***')
confusion_matrix(y_train_undr, X_train_undr['pred'])

Accuracy Score -> 0.765
Precision Score -> 0.446
Recall Score -> 0.728


***Confusion Matrix***


array([[30360,  8848],
       [ 2665,  7137]], dtype=int64)

In [235]:
X_sc_test_undr = pd.DataFrame(rs.transform(X_test_undr))

In [236]:
X_sc_test_undr.shape

(48685, 27)

In [237]:
X_train_undr.shape

(49010, 28)

In [238]:
X_sc_test_undr['pred'] = randomForestClassifier_undr.predict(X_sc_test_undr)

print('Accuracy Score ->',round(accuracy_score(y_test, X_sc_test_undr['pred']),3))
print('Precision Score ->',round(precision_score(y_test, X_sc_test_undr['pred']),3))
print('Recall Score ->',round(recall_score(y_test, X_sc_test_undr['pred']),3))
print('\n')
print('***Confusion Matrix***')
confusion_matrix(y_test, X_sc_test_undr['pred'])

Accuracy Score -> 0.719
Precision Score -> 0.177
Recall Score -> 0.602


***Confusion Matrix***


array([[32455, 11962],
       [ 1697,  2571]], dtype=int64)

In [239]:
X_train_undr.drop('pred',axis=1,inplace=True)
X_sc_test_undr.drop('pred',axis=1,inplace=True)

# Decision Tree

In [240]:
rs_undr = RobustScaler()

X_scaled_undr = pd.DataFrame(rs_undr.fit_transform(X_train_undr))
X_train_undr.shape

(49010, 27)

In [241]:
#Decision Tree

decision_tree_model_undr = DecisionTreeClassifier(class_weight = "balanced",random_state= 42,max_depth=10)

decision_tree_model_undr.fit(X_scaled_undr, y_train_undr)

DecisionTreeClassifier(class_weight='balanced', max_depth=10, random_state=42)

In [242]:
X_scaled_undr.shape

(49010, 27)

In [243]:
X_train_undr['pred']  = decision_tree_model_undr.predict(X_scaled_undr)



In [244]:
print('Accuracy Score ->',round(accuracy_score(y_train_undr,X_train_undr['pred'],3)))
print('Precision Score ->',round(precision_score(y_train_undr, X_train_undr['pred'],3)))
print('Recall Score ->',round(recall_score(y_train_undr, X_train_undr['pred'],3)))
print('\n')
print('***Confusion Matrix***')
confusion_matrix(y_train_undr, X_train_undr['pred'])

Accuracy Score -> 1
Precision Score -> 0
Recall Score -> 1


***Confusion Matrix***


array([[25959, 13249],
       [ 2321,  7481]], dtype=int64)

In [245]:
X_sc_test_undr = pd.DataFrame(rs_undr.transform(X_test_undr))


In [246]:
X_sc_test_undr['pred'] = decision_tree_model_undr.predict(X_sc_test_undr)

print('Accuracy Score ->',round(accuracy_score(y_test, X_sc_test_undr['pred'],3)))
print('Precision Score ->',round(precision_score(y_test, X_sc_test_undr['pred'],3)))
print('Recall Score ->',round(recall_score(y_test, X_sc_test_undr['pred'],3)))
print('\n')
print('***Confusion Matrix***')
confusion_matrix(y_test_undr, X_sc_test_undr['pred'])

Accuracy Score -> 1
Precision Score -> 0
Recall Score -> 1


***Confusion Matrix***


array([[28164, 16253],
       [ 1502,  2766]], dtype=int64)

# Oversampling

In [247]:
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(sampling_strategy=0.25,random_state=42)

X_ros, y_ros = ros.fit_resample(X_train, y_train)

print((y_ros.value_counts()/y_ros.shape[0])*100)
#print((y_resamp.value_counts()[1]/(y_resamp.value_counts()[0]+y_resamp.value_counts()[1])))
print()
X_train_ovr,X_test_ovr,y_train_ovr,y_test_ovr=X_ros,X_test,y_ros,y_test

0    80.000462
1    19.999538
Name: TARGET, dtype: float64



In [248]:
ros_ovr = RobustScaler()

X_scaled_ovr = pd.DataFrame(ros_ovr.fit_transform(X_train_ovr))


In [249]:
randomForestClassifier_ovr = RandomForestClassifier(n_estimators = 800, max_depth = 10,
                                                  class_weight = "balanced",
                                                  random_state= 42)

randomForestClassifier_ovr.fit(X_scaled_ovr, y_train_ovr)

RandomForestClassifier(class_weight='balanced', max_depth=10, n_estimators=800,
                       random_state=42)

In [250]:
X_train_ovr['pred']  = randomForestClassifier_undr.predict(X_scaled_ovr)

print('Accuracy Score ->',round(accuracy_score(y_train_ovr, X_train_ovr['pred'],3)))
print('Precision Score ->',round(precision_score(y_train_ovr, X_train_ovr['pred'],3)))
print('Recall Score ->',round(recall_score(y_train_ovr, X_train_ovr['pred'],3)))
print('\n')
print('***Confusion Matrix***')
confusion_matrix(y_train_ovr, X_train_ovr['pred'])

Accuracy Score -> 1
Precision Score -> 0
Recall Score -> 1


***Confusion Matrix***


array([[78441, 25354],
       [ 7393, 18555]], dtype=int64)

In [251]:
X_sc_test_ovr = pd.DataFrame(ros_ovr.transform(X_test_ovr))
X_sc_test_ovr['pred'] = randomForestClassifier_ovr.predict(X_sc_test_ovr)

print('Accuracy Score ->',round(accuracy_score(y_test_ovr, X_sc_test_ovr['pred'],3)))
print('Precision Score ->',round(precision_score(y_test_ovr, X_sc_test_ovr['pred'],3,)))
print('Recall Score ->',round(recall_score(y_test_ovr, X_sc_test_ovr['pred'],3)))
print('\n')
print('***Confusion Matrix***')
confusion_matrix(y_test_ovr, X_sc_test_ovr['pred'])

Accuracy Score -> 1
Precision Score -> 0
Recall Score -> 1


***Confusion Matrix***


array([[33390, 11027],
       [ 1804,  2464]], dtype=int64)

In [252]:
X_train_ovr.drop('pred',axis=1,inplace=True)
X_sc_test_ovr.drop('pred',axis=1,inplace=True)

# Decision Tree

In [253]:
ros_undr = RobustScaler()

X_scaled_ovr = pd.DataFrame(rs_undr.fit_transform(X_train_ovr))

In [254]:
#Decision Tree

decision_tree_model_ovr = DecisionTreeClassifier(class_weight = "balanced",random_state= 42,max_depth=10)

decision_tree_model_ovr.fit(X_scaled_ovr, y_train_ovr)

DecisionTreeClassifier(class_weight='balanced', max_depth=10, random_state=42)

In [255]:
X_train_ovr.shape

(129743, 27)

In [256]:
X_train_ovr['pred']  = decision_tree_model_ovr.predict(X_scaled_ovr)

In [257]:
print('Accuracy Score ->',round(accuracy_score(y_train_ovr,X_train_ovr['pred'],3)))
print('Precision Score ->',round(precision_score(y_train_ovr,X_train_ovr['pred'],3)))
print('Recall Score ->',round(recall_score(y_train_ovr,X_train_ovr['pred'],3)))
print('\n')
print('***Confusion Matrix***')
confusion_matrix(y_train_ovr,X_train_ovr['pred'])

Accuracy Score -> 1
Precision Score -> 0
Recall Score -> 1


***Confusion Matrix***


array([[68743, 35052],
       [ 5849, 20099]], dtype=int64)

In [258]:
X_sc_test_ovr['pred'] = decision_tree_model_ovr.predict(X_sc_test_ovr)

print('Accuracy Score ->',round(accuracy_score(y_test, X_sc_test_ovr['pred'],3)))
print('Precision Score ->',round(precision_score(y_test, X_sc_test_ovr['pred'],3)))
print('Recall Score ->',round(recall_score(y_test,X_sc_test_ovr['pred'],3)))
print('\n')
print('***Confusion Matrix***')
confusion_matrix(y_test, X_sc_test_ovr['pred'])

Accuracy Score -> 1
Precision Score -> 0
Recall Score -> 1


***Confusion Matrix***


array([[28865, 15552],
       [ 1618,  2650]], dtype=int64)

Under-sampled random forest is giving the best precision and recall scores. Hence, we will choose that model.