In [1]:
##Importing libraries for data manipulation 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import KBinsDiscretizer
import warnings
warnings.filterwarnings('ignore')

In [2]:
### Read data
data = pd.read_excel("C:\\Users\\Himanshu\\Downloads\\Case study excel file.xls")

In [3]:
###Exclude records with negative age values
data = data[data['Age']>0]
data['category'] = data['category'].str.lower()

In [4]:
### View first five records
data.head()

Unnamed: 0,Ref,Credit_Score,Final_Grade,Term,Net_Advance,APR,Loan_Type,Mosaic,Mosaic_Class,Time_at_Address,...,CIFAS_detected,Time_since_most_recent_outstandi,Insurance_Description,PPI,code,prdt_desc,category,PPI_SINGLE,PPI_JOINT,PPI_LCI
0,1,918,A,36,3000.0,14.4,UnSecured,46,8,132,...,N,16,,0,,,,0,0,0
1,3,903,A,120,21000.0,7.9,secured,16,3,288,...,N,99,Life & Critical Illn,1,748.0,LIFE & CRITICAL ILLNESS,lci,0,0,1
2,4,1060,X,78,7200.0,7.9,secured,17,3,276,...,N,50,Joint,1,719.0,LASCI JOINT,joint,0,1,0
3,6,839,B,60,8000.0,16.9,UnSecured,47,8,48,...,N,99,Life & Critical Illn,1,748.0,LIFE & CRITICAL ILLNESS,lci,0,0,1
4,7,1057,X,60,7650.0,7.4,UnSecured,55,10,156,...,N,99,Single,1,718.0,LASCI,single,1,0,0


In [5]:
###Checking target variables
### There seems an imbalance in the target variable but not much. 58% of the customers have PPI insurance
data['PPI'].value_counts()

1    9461
0    6919
Name: PPI, dtype: int64

In [6]:
###Null values: Data is clean and null exist only for description and code which will not affect the data analysis and modelling
data.isnull().sum()

Ref                                    0
Credit_Score                           0
Final_Grade                            0
Term                                   0
Net_Advance                            0
APR                                    0
Loan_Type                              0
Mosaic                                 0
Mosaic_Class                           0
Time_at_Address                        0
Residential_Status                     0
Telephone_Indicator                    0
Number_of_Dependants                   0
Marital_Status                         0
Gender                                 0
Time_in_Employment                     0
Employment_Status                      0
Full_Part_Time_Empl_Ind                0
Perm_Temp_Empl_Ind                     0
Income_Range                           0
Current_Account                        0
ACCESS_Card                            0
VISA_Card                              0
American_Express                       0
Diners_Card     

# Feature Engineering

In [7]:
data.dtypes

Ref                                   int64
Credit_Score                          int64
Final_Grade                          object
Term                                  int64
Net_Advance                         float64
APR                                 float64
Loan_Type                            object
Mosaic                                int64
Mosaic_Class                          int64
Time_at_Address                       int64
Residential_Status                   object
Telephone_Indicator                  object
Number_of_Dependants                  int64
Marital_Status                       object
Gender                               object
Time_in_Employment                    int64
Employment_Status                    object
Full_Part_Time_Empl_Ind              object
Perm_Temp_Empl_Ind                   object
Income_Range                          int64
Current_Account                      object
ACCESS_Card                          object
VISA_Card                       

In [8]:
###Segregating categorical and numerical variables
data_object = data.loc[:, data.dtypes == np.object]
data_cont = data.loc[:, data.dtypes != np.object]

In [9]:
###Preparing a list of categorical variables
cat_feat = data_object.columns.to_list()

In [10]:
def encoder(data,label):
    for feat in label:
        le = LabelEncoder()
        col = data[feat].fillna("UNKNOWN").astype(str).values
        data.loc[:,feat] = le.fit_transform(col)
    return data

In [11]:
data_new = encoder(data,cat_feat)
data_new.head()

Unnamed: 0,Ref,Credit_Score,Final_Grade,Term,Net_Advance,APR,Loan_Type,Mosaic,Mosaic_Class,Time_at_Address,...,CIFAS_detected,Time_since_most_recent_outstandi,Insurance_Description,PPI,code,prdt_desc,category,PPI_SINGLE,PPI_JOINT,PPI_LCI
0,1,918,0,36,3000.0,14.4,0,46,8,132,...,0,16,24,0,,15,0,0,0,0
1,3,903,0,120,21000.0,7.9,1,16,3,288,...,0,99,18,1,748.0,14,2,0,0,1
2,4,1060,9,78,7200.0,7.9,1,17,3,276,...,0,50,12,1,719.0,11,1,0,1,0
3,6,839,1,60,8000.0,16.9,0,47,8,48,...,0,99,18,1,748.0,14,2,0,0,1
4,7,1057,9,60,7650.0,7.4,0,55,10,156,...,0,99,23,1,718.0,10,3,1,0,0


In [12]:
#cont_feat = ['Total_outstanding_balance__mortg','Time_at_Address','Time_in_Employment','Time_with_Bank','Total_Outstanding_Balances','Value_of_Property']

In [13]:
### For creating bins of numerical variables
# for cont in cont_feat:
#     discretizer = KBinsDiscretizer(n_bins = 10,encode = 'ordinal',strategy = 'quantile')
#     feat = cont+"_"+"bins"
#     data_new[feat] = discretizer.fit_transform(data_new[cont].values.reshape(-1,1)).astype(int)

In [14]:
###Checking correlated variables
corr = data_new.corr().abs()
corr = corr.where(np.triu(np.ones(corr.shape), k=1).astype(np.bool))
corr = corr.unstack().transpose().sort_values( ascending=False).dropna()

In [15]:
corr

Total_outstanding_balance__mortg  Total_Outstanding_Balances          0.977628
Time_since_most_recent_outstandi  Time_since_most_recent_Public_In    0.918679
category                          PPI                                 0.917678
Total___outstanding_CCJ_s         Total___Public_Info___CCJ____ban    0.909786
Bureau_Data___Monthly_Other_Co_R  Total_Outstanding_Balances          0.901623
PPI_SINGLE                        category                            0.887345
Total_outstanding_balance__mortg  Bureau_Data___Monthly_Other_Co_R    0.863396
Bankruptcy_Detected__SP_          Total_value__Public_Info___CCJ__    0.857784
Time_since_most_recent_outstandi  Total___outstanding_CCJ_s           0.835013
Time_since_most_recent_Public_In  Total___Public_Info___CCJ____ban    0.824792
__of_status_3_s_L6m               Worst_CUrrent_Status                0.783326
Worst_CUrrent_Status              Worst_status_L6m                    0.778428
Time_since_most_recent_outstandi  Total___Public_Inf

In [16]:
### To check VIF for multicollinearity
from statsmodels.stats.outliers_influence import variance_inflation_factor
X=data.drop(["Insurance_Description","PPI","code","prdt_desc","category"], axis=1)
vif = pd.DataFrame()
vif["ft"] = X.columns
vif["vif_Factor"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

In [17]:
vif.sort_values(by = 'vif_Factor',ascending = False)

Unnamed: 0,ft,vif_Factor
20,Current_Account,640.521252
33,Payment_Method,382.560616
50,Time_since_most_recent_outstandi,253.771421
38,Time_since_most_recent_Public_In,208.137272
1,Credit_Score,185.447054
30,Total_Outstanding_Balances,100.272488
35,Total_outstanding_balance__mortg,80.485833
34,Age,31.152044
3,Term,17.327668
11,Telephone_Indicator,15.144296


In [18]:
###Variable list to drop due to high correlation
drp_feat = ["Total_Outstanding_Balances","Total_outstanding_balance__mortg","Time_since_most_recent_Public_In","Total___outstanding_CCJ_s","Ref","Mosaic_Class","Worst_CUrrent_Status","APR","Current_Account","Payment_Method"]

In [19]:
###Final data preparation
data1 = data.copy(deep = True)
data2 = data.copy(deep = True)
data1 = data1.drop(['Insurance_Description','prdt_desc','code','category','PPI_SINGLE','PPI_JOINT','PPI_LCI'],axis=1)
data1 = data1.drop(drp_feat,axis = 1)

In [20]:
X=data1.drop(['PPI'],axis=1) 
y=data1['PPI']

# Model Building

In [21]:
###Importing required libraries for model building
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import precision_score,recall_score,roc_curve,f1_score,confusion_matrix,roc_auc_score,classification_report


In [22]:
###Oversampling of the data
from imblearn.over_sampling import RandomOverSampler
from collections import Counter

randomsample=  RandomOverSampler()
x_balanced,y_balanced=randomsample.fit_resample(X,y)

from collections import Counter
print('Original dataset shape {}'.format(Counter(y)))
print('Resampled dataset shape {}'.format(Counter(y_balanced)))

Original dataset shape Counter({1: 9461, 0: 6919})
Resampled dataset shape Counter({0: 9461, 1: 9461})


In [23]:
###Splitting data in test and train

x_train,x_test,y_train,y_test = train_test_split(x_balanced,y_balanced,test_size=.20,random_state=100)
print(x_train.shape,x_test.shape,y_train.shape,y_test.shape)

(15137, 41) (3785, 41) (15137,) (3785,)


In [24]:
#feature scaling
scaler=MinMaxScaler()
x_train=scaler.fit_transform(x_train)
x_test=scaler.transform(x_test)

In [25]:
###Logistic Regression

logreg = LogisticRegression(random_state=100, penalty='l2',class_weight='balanced',solver = 'newton-cg')
logreg.fit(x_train,y_train)
ypred_lr = logreg.predict(x_test)

cm=confusion_matrix(y_test,ypred_lr)
auc_lr = roc_auc_score(y_test, ypred_lr)
print("======AUC=======") 
print(auc_lr)
print("======Confusion Matrix=======") 
print(cm)
print("======Classification Report=======") 

print(classification_report(y_test,logreg.predict(x_test)))

0.6504924405516809
[[1233  674]
 [ 649 1229]]
              precision    recall  f1-score   support

           0       0.66      0.65      0.65      1907
           1       0.65      0.65      0.65      1878

    accuracy                           0.65      3785
   macro avg       0.65      0.65      0.65      3785
weighted avg       0.65      0.65      0.65      3785



In [26]:
### XGB classifier
from xgboost import XGBClassifier
xgb = XGBClassifier(random_state = 1,learning_rate =0.08,
                     n_estimators=750,
                     max_depth=15,
                     gamma=0.1,
                     n_jobs=-1,
                     min_samples_split = 5,
                     min_samples_leaf = 1)
                                  

xgb.fit(x_train, y_train)
ypred_xgb = xgb.predict(x_test)
auc_xgb = roc_auc_score(y_test, ypred_xgb)
cm=confusion_matrix(ypred_xgb,y_test)

print("======AUC=======") 
print(auc_xgb)
print("======Confusion Matrix=======") 
print(cm)
print("======Classification Report=======") 

print(classification_report(y_test,logreg.predict(x_test)))

Parameters: { min_samples_leaf, min_samples_split } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


0.739277215884754
[[1399  479]
 [ 508 1399]]
              precision    recall  f1-score   support

           0       0.66      0.65      0.65      1907
           1       0.65      0.65      0.65      1878

    accuracy                           0.65      3785
   macro avg       0.65      0.65      0.65      3785
weighted avg       0.65      0.65      0.65      3785



In [27]:
###Random Forest Classifier

rf=RandomForestClassifier(random_state=5, class_weight = "balanced_subsample",criterion = "entropy",n_estimators = 45,max_features = 5,max_depth = 50)

rf.fit(x_train,y_train)

ypred_rf = rf.predict(x_test)


cm=confusion_matrix(y_test,ypred_rf)
auc_rf = roc_auc_score(y_test, ypred_rf)
print("======AUC=======") 
print(auc_rf)
print("======Confusion Matrix=======") 
print(cm)
print("======Classification Report=======") 

print(classification_report(y_test,rf.predict(x_test)))

0.732940073369063
[[1386  521]
 [ 490 1388]]
              precision    recall  f1-score   support

           0       0.74      0.73      0.73      1907
           1       0.73      0.74      0.73      1878

    accuracy                           0.73      3785
   macro avg       0.73      0.73      0.73      3785
weighted avg       0.73      0.73      0.73      3785



In [28]:
###Important Features of the model
feat_importances = pd.Series(rf.feature_importances_, index=X.columns)
feat_importances.sort_values(ascending = False)

Credit_Score                        0.062824
Time_at_Address                     0.056501
Total_outstanding_balance___excl    0.056142
Bureau_Data___Monthly_Other_Co_R    0.055111
Time_in_Employment                  0.054944
Mosaic                              0.054339
Value_of_Property                   0.054227
Time_with_Bank                      0.053485
Age                                 0.052868
Net_Advance                         0.049447
Outstanding_Mortgage_Bal            0.049292
Years_on_ER_for_SP                  0.039273
Income_Range                        0.033666
Searches___Total___L6m              0.031969
Total___of_accounts                 0.031732
Term                                0.031516
Total_value__CAIS_8_9s              0.028548
Final_Grade                         0.023357
Number_of_Dependants                0.019876
Worst_status_L6m                    0.018450
Employment_Status                   0.017892
Worst_History_CT                    0.014833
Marital_St

# Type of PPI products to target customers

In [29]:
###Data to be used in targeting PPI product
data2 = data2[data2['PPI']==1]
y = data2.category
X = data2.drop(['Ref','Insurance_Description','prdt_desc','code','category','PPI_SINGLE','PPI_JOINT','PPI_LCI'],axis=1)

In [30]:
### Oversampling of the target class
### 1 - "joint", 2- "lci", 3 - "single"
from imblearn.over_sampling import RandomOverSampler
from collections import Counter

randomsample=  RandomOverSampler()
x_balanced,y_balanced=randomsample.fit_resample(X,y)

from collections import Counter
print('Original dataset shape {}'.format(Counter(y)))
print('Resampled dataset shape {}'.format(Counter(y_balanced)))

Original dataset shape Counter({3: 6264, 2: 1972, 1: 1225})
Resampled dataset shape Counter({2: 6264, 1: 6264, 3: 6264})


In [31]:
###Splitting data in test and train

x_train,x_test,y_train,y_test = train_test_split(x_balanced,y_balanced,test_size=.20,random_state=100)
print(x_train.shape,x_test.shape,y_train.shape,y_test.shape)

(15033, 51) (3759, 51) (15033,) (3759,)


In [32]:
print("-------Random Forest Classifier-------")

rf=RandomForestClassifier(random_state=5,criterion = "entropy",class_weight = "balanced_subsample",n_estimators = 40,max_features = 8,max_depth = 25)

rf.fit(x_train,y_train)

ypred_rf = rf.predict(x_test)


cm=confusion_matrix(y_test,ypred_rf,labels=[1,2,3])
print("/n======Confusion Matrix=======\n") 
print(cm)
print("======Classification Report=======") 

print(classification_report(y_test,rf.predict(x_test)))

-------Random Forest Classifier-------

[[1211    0    9]
 [   4 1178   58]
 [  83   99 1117]]
              precision    recall  f1-score   support

           1       0.93      0.99      0.96      1220
           2       0.92      0.95      0.94      1240
           3       0.94      0.86      0.90      1299

    accuracy                           0.93      3759
   macro avg       0.93      0.93      0.93      3759
weighted avg       0.93      0.93      0.93      3759



In [33]:
###Important Features
feat_importances = pd.Series(rf.feature_importances_, index=X.columns)
feat_importances.sort_values(ascending = False)

Age                                 0.050697
Income_Range                        0.049592
Time_in_Employment                  0.047610
Credit_Score                        0.046867
Total_Outstanding_Balances          0.045054
Time_at_Address                     0.044516
Total_outstanding_balance___excl    0.044243
Time_with_Bank                      0.042580
Bureau_Data___Monthly_Other_Co_R    0.040009
Value_of_Property                   0.038286
Net_Advance                         0.037267
Employment_Status                   0.036804
APR                                 0.036407
Mosaic                              0.036233
Outstanding_Mortgage_Bal            0.033656
Years_on_ER_for_SP                  0.030891
Marital_Status                      0.030621
Total_value__CAIS_8_9s              0.029821
Searches___Total___L6m              0.028458
Total_outstanding_balance__mortg    0.026411
Total___of_accounts                 0.023991
Mosaic_Class                        0.021121
Term      