In [101]:
import numpy as np
import pandas as pd
import matplotlib.pylab as plt

from sklearn import preprocessing
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, classification_report
from dmba import classificationSummary

In [102]:
%matplotlib inline

## Part 1: Data exploration and preprocessing

In [103]:
# read in data file
loan_df = pd.read_csv('UniversalBank_unprocessed.csv')

In [104]:
# print the data types of each column
print(loan_df.dtypes)

ID                 int64
Age                int64
Experience       float64
Income           float64
Family             int64
CCAvg            float64
Education         object
Mortgage           int64
CD Account         int64
Online             int64
CreditCard         int64
Personal Loan      int64
dtype: object


In [105]:
# print the shape of the dataframe
print(loan_df.shape)
print(len(loan_df))
print(len(loan_df.columns))
loan_df.head() #we used this command to get the idea about columns in an excel file

(1117, 12)
1117
12


Unnamed: 0,ID,Age,Experience,Income,Family,CCAvg,Education,Mortgage,CD Account,Online,CreditCard,Personal Loan
0,1,39,13.0,58.0,3,2.1,Undergraduate,169,0,1,0,0
1,2,51,25.0,18.0,1,0.3,Advanced,93,0,0,1,0
2,3,43,13.0,38.0,3,2.0,Advanced,0,0,1,0,0
3,4,37,12.0,60.0,4,2.1,Advanced,217,0,1,0,0
4,5,23,,149.0,1,6.33,Undergraduate,305,0,0,1,0


In [106]:
# check for null values
loan_df.isnull().sum()

ID               0
Age              0
Experience       4
Income           3
Family           0
CCAvg            0
Education        0
Mortgage         0
CD Account       0
Online           0
CreditCard       0
Personal Loan    0
dtype: int64

In [107]:
# drop ID
loan_df2 = loan_df.drop(['ID'], axis = 1)
loan_df2.head()

Unnamed: 0,Age,Experience,Income,Family,CCAvg,Education,Mortgage,CD Account,Online,CreditCard,Personal Loan
0,39,13.0,58.0,3,2.1,Undergraduate,169,0,1,0,0
1,51,25.0,18.0,1,0.3,Advanced,93,0,0,1,0
2,43,13.0,38.0,3,2.0,Advanced,0,0,1,0,0
3,37,12.0,60.0,4,2.1,Advanced,217,0,1,0,0
4,23,,149.0,1,6.33,Undergraduate,305,0,0,1,0


In [108]:
predictors_df = loan_df2[['Age','Experience','Income','Family','CCAvg','Education','Mortgage','CD Account','Online','CreditCard']]
response_df_loan = loan_df2['Personal Loan']
print(len(predictors_df.corr())) 
predictors_df.corr()

9


Unnamed: 0,Age,Experience,Income,Family,CCAvg,Mortgage,CD Account,Online,CreditCard
Age,1.0,0.994236,-0.043482,-0.046942,-0.036068,0.020851,0.024537,0.034029,0.024464
Experience,0.994236,1.0,-0.03969,-0.05389,-0.038741,0.026466,0.025706,0.028488,0.033042
Income,-0.043482,-0.03969,1.0,-0.045998,0.629227,0.25096,0.257815,0.016826,-0.013544
Family,-0.046942,-0.05389,-0.045998,1.0,-0.012103,0.029502,0.025519,0.024708,0.012454
CCAvg,-0.036068,-0.038741,0.629227,-0.012103,1.0,0.131017,0.197364,0.011636,0.00341
Mortgage,0.020851,0.026466,0.25096,0.029502,0.131017,1.0,0.14219,0.00424,0.037236
CD Account,0.024537,0.025706,0.257815,0.025519,0.197364,0.14219,1.0,0.26287,0.377198
Online,0.034029,0.028488,0.016826,0.024708,0.011636,0.00424,0.26287,1.0,0.002696
CreditCard,0.024464,0.033042,-0.013544,0.012454,0.00341,0.037236,0.377198,0.002696,1.0


In [109]:
# drop Experience
loan_df2 = predictors_df.drop(['Experience'], axis = 1)
loan_df2.head()

Unnamed: 0,Age,Income,Family,CCAvg,Education,Mortgage,CD Account,Online,CreditCard
0,39,58.0,3,2.1,Undergraduate,169,0,1,0
1,51,18.0,1,0.3,Advanced,93,0,0,1
2,43,38.0,3,2.0,Advanced,0,0,1,0
3,37,60.0,4,2.1,Advanced,217,0,1,0
4,23,149.0,1,6.33,Undergraduate,305,0,0,1


In [110]:
response_df_loan.value_counts()

0    637
1    480
Name: Personal Loan, dtype: int64

In [111]:
loan_df2['Education'].value_counts()

Undergraduate    389
Advanced         383
Masters          345
Name: Education, dtype: int64

In [112]:
# flag categorical varibales
loan_df3 = pd.get_dummies(loan_df2, drop_first = True)
loan_df3.describe()


Unnamed: 0,Age,Income,Family,CCAvg,Mortgage,CD Account,Online,CreditCard,Education_Masters,Education_Undergraduate
count,1117.0,1114.0,1117.0,1117.0,1117.0,1117.0,1117.0,1117.0,1117.0,1117.0
mean,45.45658,99.535009,2.502238,2.695801,72.025067,0.148612,0.601611,0.293644,0.308863,0.348254
std,11.514723,54.566059,1.1474,2.114854,127.799025,0.355865,0.489786,0.455635,0.462231,0.47663
min,23.0,8.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,35.0,50.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,45.0,95.0,3.0,2.2,0.0,0.0,1.0,0.0,0.0,0.0
75%,55.0,145.0,4.0,4.0,114.0,0.0,1.0,1.0,1.0,1.0
max,67.0,218.0,4.0,10.0,617.0,1.0,1.0,1.0,1.0,1.0


## Part 2: $k$-NN

In [113]:
# impute NA values with k-NN imputer 
# in the code below, be sure you change "predictors_df" if you use a different name for your predictors DataFrame

imputer = KNNImputer(n_neighbors=5)
loan_df4 = (imputer.fit_transform(loan_df3))
loan_df4 = pd.DataFrame(loan_df5, columns = loan_df3.columns)
loan_df4.head()

Unnamed: 0,Age,Income,Family,CCAvg,Mortgage,CD Account,Online,CreditCard,Education_Masters,Education_Undergraduate
0,-0.560975,-0.76153,0.434011,-0.281848,0.759148,-0.417796,0.813758,-0.64476,-0.668499,1.368015
1,0.481636,-1.494915,-1.30984,-1.133352,0.164198,-0.417796,-1.228866,1.550964,-0.668499,-0.730986
2,-0.213438,-1.128222,0.434011,-0.329154,-0.563833,-0.417796,0.813758,-0.64476,-0.668499,-0.730986
3,-0.734744,-0.72486,1.305937,-0.281848,1.134906,-0.417796,0.813758,-0.64476,-0.668499,-0.730986
4,-1.951123,0.906923,-1.30984,1.719186,1.823796,-0.417796,-1.228866,1.550964,-0.668499,1.368015


In [114]:
#predictors_df_loan = loan_df4.drop(['Personal Loan'], axis = 1)
#response_df_loan = loan_df4.loc[:,'Personal Loan']
#predictors_df_loan.head()

In [115]:
# partition data into train and test sets

X_classifier = loan_df4
y_classifier = response_df_loan
train_X_classifier, test_X_classifier, train_y_classifier, test_y_classifier = train_test_split(X_classifier, 
                                                            y_classifier, test_size=0.3, random_state=61)

In [116]:
# normalize predictors using standardization

z_score_norm1 = preprocessing.StandardScaler()
z_score_norm1.fit(train_X_classifier)
train_X_classifier = pd.DataFrame(z_score_norm1.transform(train_X_classifier), 
                                          columns = predictors_df_loan.columns)
test_X_classifier = pd.DataFrame(z_score_norm1.transform(test_X_classifier), 
                                          columns = predictors_df_loan.columns)
test_X_classifier

Unnamed: 0,Age,Income,Family,CCAvg,Mortgage,CD Account,Online,CreditCard,Education_Masters,Education_Undergraduate
0,-1.510105,-1.222006,0.430130,-0.778844,-0.559351,-0.423979,-1.249262,-0.640115,1.444630,-0.696269
1,0.935679,-0.453720,0.430130,-0.492155,-0.559351,-0.423979,0.800473,1.562219,-0.692219,-0.696269
2,-0.461912,-1.075666,-0.433447,-0.205466,0.295048,-0.423979,0.800473,-0.640115,-0.692219,-0.696269
3,-0.112514,-1.002496,0.430130,-0.301029,0.877955,2.358607,0.800473,1.562219,-0.692219,-0.696269
4,-1.248056,-0.490305,0.430130,-0.492155,-0.559351,-0.423979,0.800473,-0.640115,-0.692219,1.436226
...,...,...,...,...,...,...,...,...,...,...
331,0.498932,-1.587856,0.430130,-1.065533,-0.559351,2.358607,0.800473,1.562219,-0.692219,1.436226
332,-1.073358,0.387736,-1.297024,0.272348,-0.559351,-0.423979,0.800473,-0.640115,-0.692219,-0.696269
333,0.324233,1.137729,0.430130,1.849137,3.313392,2.358607,-1.249262,-0.640115,1.444630,-0.696269
334,1.634474,0.076763,-1.297024,-0.874407,-0.559351,-0.423979,0.800473,1.562219,-0.692219,-0.696269


In [117]:
# train the k-NN model and look at performance on train data

knn = KNeighborsClassifier(n_neighbors=5).fit(train_X_classifier, train_y_classifier)
predicted_y_training = knn.predict(train_X_classifier)
print("F1 Score: ", f1_score(train_y_classifier, predicted_y_training))

F1 Score:  0.9373134328358209


In [118]:
# performance of k-NN on test data

predicted_y_test = knn.predict(test_X_classifier)
f1_score(test_y_classifier, predicted_y_test)

0.9185185185185185

In [119]:
# train a classifier for different values of k

results = []
for k in range(1, 20):
    knn2 = KNeighborsClassifier(n_neighbors=k).fit(train_X_classifier, train_y_classifier)
    results.append({
        'k': k,
        'f1_score': f1_score(test_y_classifier, knn2.predict(test_X_classifier))
    })

# Convert results to a pandas data frame
results = pd.DataFrame(results)
print(results)

     k  f1_score
0    1  0.907749
1    2  0.870968
2    3  0.912409
3    4  0.895753
4    5  0.918519
5    6  0.909091
6    7  0.909091
7    8  0.917910
8    9  0.918519
9   10  0.924812
10  11  0.933333
11  12  0.920755
12  13  0.933333
13  14  0.920755
14  15  0.929368
15  16  0.913208
16  17  0.929368
17  18  0.921348
18  19  0.925373


## Part 3: Logistic regression and model comparison

In [120]:
# Partition data into train and test sets

X = loan_df4
y = response_df_loan
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.3, random_state=616)

In [121]:
# train the LR model

logistic_model = LogisticRegression()
logistic_model = logistic_model.fit(train_X, train_y)

In [122]:
# Print coefficients
print('intercept ', logistic_model.intercept_)
print(pd.DataFrame({'coeff': logistic_model.coef_[0]}, index=X.columns).transpose())

intercept  [-0.66572239]
           Age    Income    Family     CCAvg  Mortgage  CD Account    Online  \
coeff  0.01066  2.672266  0.536584  0.425468  0.202732    1.233055 -0.345787   

       CreditCard  Education_Masters  Education_Undergraduate  
coeff   -0.457355          -0.041165                -1.568142  


In [124]:
# print performance metrics (i.e., confusion matrix and accuracy) on training set

predicted_y_training = logistic_model.predict(train_X)
classificationSummary(train_y, predicted_y_training)

Confusion Matrix (Accuracy 0.9040)

       Prediction
Actual   0   1
     0 409  38
     1  37 297


In [125]:
# Print the f_1 score

f1_score(train_y, predicted_y_training)

0.8878923766816142

In [126]:
# we can even look at the predicted probabilities

predicted_y_training_proba = logistic_model.predict_proba(train_X)
results = pd.DataFrame({'actual': train_y, 
                             'p(1)': [p[1] for p in predicted_y_training_proba],                       
                             'p(0)': [p[0] for p in predicted_y_training_proba],
                             'predicted': predicted_y_training })
results.head()

Unnamed: 0,actual,p(1),p(0),predicted
363,0,0.109787,0.890213,0
609,0,0.097962,0.902038,0
584,0,0.003535,0.996465,0
482,0,0.081704,0.918296,0
958,1,0.841097,0.158903,1


In [127]:
# changing the cutoff threshold

THRESHOLD = 0.5
predicted_y_training = np.where(logistic_model.predict_proba(train_X)[:,1] > THRESHOLD, 1, 0)

pd.DataFrame(data=[accuracy_score(train_y, predicted_y_training), f1_score(train_y, predicted_y_training)], 
             index=["accuracy", "f1"])

Unnamed: 0,0
accuracy,0.903969
f1,0.887892


In [128]:
# now, let's see how we perform on the test data

predicted_y_test = logistic_model.predict(test_X)  # these are our y_hat values!!
predicted_y_test_proba = logistic_model.predict_proba(test_X)

results2 = pd.DataFrame({'actual': test_y, 
                             'p(1)': [p[1] for p in predicted_y_test_proba],                       
                             'p(0)': [p[0] for p in predicted_y_test_proba],
                             'predicted': predicted_y_test })
results2.head()

Unnamed: 0,actual,p(1),p(0),predicted
540,0,0.150767,0.849233,0
672,0,0.095984,0.904016,0
795,1,0.998375,0.001625,1
218,0,0.663285,0.336715,1
381,0,0.734833,0.265167,1


In [129]:
# how did our model perform in prediction?

classificationSummary(test_y, predicted_y_test)

Confusion Matrix (Accuracy 0.8810)

       Prediction
Actual   0   1
     0 172  18
     1  22 124


In [130]:
# test f_1 score

f1_score(test_y, predicted_y_test)

0.861111111111111