# 1. Import the datasets and libraries, check datatype, statistical summary, shape, null values or incorrect imputation

### Import the datasets and libraries

In [202]:

import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import linear_model
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn import metrics
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
import os,sys
from scipy import stats
from sklearn.metrics import confusion_matrix, recall_score, precision_score, f1_score, roc_auc_score,accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

# Read the customer records from the CSV next to this notebook and preview the first rows.
bank_details = pd.read_csv('Bank_Personal_Loan_Modelling.csv')
bank_details.head(n=5)

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,1,25,1,49,91107,4,1.6,1,0,0,1,0,0,0
1,2,45,19,34,90089,3,1.5,1,0,0,1,0,0,0
2,3,39,15,11,94720,1,1.0,1,0,0,0,0,0,0
3,4,35,9,100,94112,1,2.7,2,0,0,0,0,0,0
4,5,35,8,45,91330,4,1.0,2,0,0,0,0,0,1


### Check datatype

In [3]:
# Column dtypes: every column is numeric in the recorded output (int64, with CCAvg as float64).
bank_details.dtypes

ID                      int64
Age                     int64
Experience              int64
Income                  int64
ZIP Code                int64
Family                  int64
CCAvg                 float64
Education               int64
Mortgage                int64
Personal Loan           int64
Securities Account      int64
CD Account              int64
Online                  int64
CreditCard              int64
dtype: object

### Statistical summary

In [4]:
# Transposed summary statistics (one row per column).
# NOTE(review): the recorded output shows Experience with a minimum of -3 and
# ZIP Code with a 4-digit minimum (9307); both look like data-entry errors and
# are not cleaned in any of the cells shown — confirm and handle before modelling.
bank_details.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
ID,5000.0,2500.5,1443.520003,1.0,1250.75,2500.5,3750.25,5000.0
Age,5000.0,45.3384,11.463166,23.0,35.0,45.0,55.0,67.0
Experience,5000.0,20.1046,11.467954,-3.0,10.0,20.0,30.0,43.0
Income,5000.0,73.7742,46.033729,8.0,39.0,64.0,98.0,224.0
ZIP Code,5000.0,93152.503,2121.852197,9307.0,91911.0,93437.0,94608.0,96651.0
Family,5000.0,2.3964,1.147663,1.0,1.0,2.0,3.0,4.0
CCAvg,5000.0,1.937938,1.747659,0.0,0.7,1.5,2.5,10.0
Education,5000.0,1.881,0.839869,1.0,1.0,2.0,3.0,3.0
Mortgage,5000.0,56.4988,101.713802,0.0,0.0,0.0,101.0,635.0
Personal Loan,5000.0,0.096,0.294621,0.0,0.0,0.0,0.0,1.0


### Shape

In [32]:
# (rows, columns) of the frame.
# NOTE(review): the recorded (5000, 13) appears to come from a run made after 'ID'
# was dropped; the dtypes listing earlier has 14 columns, so a fresh top-to-bottom
# run would presumably report (5000, 14) here.
bank_details.shape

(5000, 13)

### Null values or Incorrect imputation

In [6]:
# True if at least one cell anywhere in the frame is missing
bank_details.isna().to_numpy().any()

False

In [34]:
# Missing-value count per column (all zeros in the recorded output).
bank_details.isnull().sum()

Age                   0
Experience            0
Income                0
ZIP Code              0
Family                0
CCAvg                 0
Education             0
Mortgage              0
Personal Loan         0
Securities Account    0
CD Account            0
Online                0
CreditCard            0
dtype: int64

In [7]:
# Show rows holding at least one non-real (e.g. string) value; an empty frame means none exist
all_real_by_row = bank_details.applymap(np.isreal).all(axis=1)
bank_details[~all_real_by_row]

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard


# 2. EDA: Study the data distribution in each attribute and target variable, share your findings (20 marks) 

## Number of unique in each column?

In [10]:
# Distinct values per column; the low-cardinality columns (Family, Education and the
# two-valued flags) are the ones inspected with value_counts() in the cells that follow.
bank_details.nunique() 

ID                    5000
Age                     45
Experience              47
Income                 162
ZIP Code               467
Family                   4
CCAvg                  108
Education                3
Mortgage               347
Personal Loan            2
Securities Account       2
CD Account               2
Online                   2
CreditCard               2
dtype: int64

## Number of people with zero mortgage? 

In [44]:
# Count the rows whose Mortgage value is exactly zero
no_mortgage = bank_details['Mortgage'] == 0
len(bank_details[no_mortgage])

3462

## Number of people with zero credit card spending per month? 

In [233]:
# Count the rows whose average monthly credit-card spend (CCAvg) is exactly zero
no_cc_spend = bank_details.CCAvg == 0
len(bank_details.loc[no_cc_spend])

106

## Value counts of all categorical columns. 

In [46]:
# Frequency of each Family value
family_counts = bank_details['Family'].value_counts()
print(family_counts)

1    1472
2    1296
4    1222
3    1010
Name: Family, dtype: int64


In [47]:
# Frequency of each Education level
education_counts = bank_details['Education'].value_counts()
print(education_counts)

1    2096
3    1501
2    1403
Name: Education, dtype: int64


In [48]:
# Target distribution. The recorded output shows 480 of 5000 rows with value 1 (9.6%),
# so the two classes are strongly imbalanced.
bank_details['Personal Loan'].value_counts()

0    4520
1     480
Name: Personal Loan, dtype: int64

In [49]:
# Frequency of each value of the two-valued 'Securities Account' column.
bank_details['Securities Account'].value_counts()

0    4478
1     522
Name: Securities Account, dtype: int64

In [50]:
# Frequency of each value of the two-valued 'CD Account' column.
bank_details['CD Account'].value_counts()

0    4698
1     302
Name: CD Account, dtype: int64

In [51]:
# Frequency of each value of the two-valued 'Online' column.
bank_details['Online'].value_counts()

1    2984
0    2016
Name: Online, dtype: int64

In [52]:
# Frequency of each value of the two-valued 'CreditCard' column.
bank_details['CreditCard'].value_counts()

0    3530
1    1470
Name: CreditCard, dtype: int64

In [53]:
# Print the value counts of every object-typed column.
# (The dtypes listing shows only numeric columns, so this loop prints nothing for this data set.)
is_object_col = bank_details.dtypes == 'object'
for col_name in bank_details.columns[is_object_col]:
    print(bank_details[col_name].value_counts())
    print()

## Univariate and Bivariate 

In [56]:
# Drop the ID column: it is unique per row (a pure identifier) and irrelevant to the analysis.
# errors='ignore' keeps this cell re-runnable: once 'ID' is gone, executing the cell
# again is a no-op instead of raising KeyError.
bank_details = bank_details.drop(columns='ID', errors='ignore')

In [None]:
# Univariate analysis of the variable 'Income' using a violin plot.
# (A violin plot of the binary target 'Personal Loan' carries no distribution
# information, so the column named in this cell's description is plotted.)
sns.violinplot(x=bank_details['Income']);

In [None]:
# Histogram of every column, one subplot per column
columns = bank_details.columns.tolist()
bank_details[columns].hist(stacked=False, bins=100, figsize=(12,30), layout=(14,2));

In [None]:
# Bivariate analysis: scatter plot for every pair of columns, distributions on the diagonal
sns.pairplot(data=bank_details)

## Get data model ready 


In [203]:
# Independent variables: every column except the target
x = bank_details.drop(columns=['Personal Loan'])
# Dependent variable as a 1-D Series. scikit-learn estimators expect y with shape
# (n_samples,); a single-column DataFrame triggers a DataConversionWarning (hidden
# here by warnings.filterwarnings('ignore')) and is flattened internally anyway.
y = bank_details['Personal Loan']

# 3. Split the data into training and test set in the ratio of 70:30 respectively (5 marks) 

In [186]:
# Hold out 30% of the rows for testing; the fixed seed makes the split reproducible
split_parts = train_test_split(x, y, test_size=0.30, random_state=1)
x_train, x_test, y_train, y_test = split_parts

In [187]:
# Check that the data was split in the 70:30 ratio
total_rows = len(bank_details.index)
train_pct = (len(x_train)/total_rows) * 100
test_pct = (len(x_test)/total_rows) * 100
print("{0:0.2f}% data is in training set".format(train_pct))
print("{0:0.2f}% data is in test set".format(test_pct))

70.00% data is in training set
30.00% data is in test set


# 4. Use a Logistic Regression model to predict the number of customers buying personal loans. Print all the metrics relevant for evaluating the model performance (15 marks)

## Model building using Logistic Regression from Sklearn


In [205]:
# Baseline model: logistic regression with default hyper-parameters (only the seed is set).
logreg = LogisticRegression(random_state=30)
# The fitted estimator is the cell's last expression, so its repr is displayed as output.
logreg.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=30, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [206]:
# Class predictions of the baseline model on the held-out test set.
y_predict = logreg.predict(x_test)

In [207]:
## function to get confusion matrix in a proper format
def bank_details_cm( actual, predicted ):
    """Plot the confusion matrix of observed vs. predicted labels as a heatmap.

    Parameters
    ----------
    actual : array-like
        Observed class labels (0 or 1).
    predicted : array-like
        Predicted class labels (0 or 1).

    Returns
    -------
    numpy.ndarray
        The 2x2 matrix of counts (rows = observed, columns = predicted), so a
        caller can print or reuse it. The heatmap is drawn as a side effect.
    """
    # Pin the label order so the matrix is always 2x2 and lines up with the
    # hard-coded tick labels, even if one class is absent from the inputs.
    class_labels = [0, 1]
    cm = confusion_matrix(actual, predicted, labels=class_labels)
    # Cells are integer counts: format them as integers rather than '1182.00'.
    sns.heatmap(cm, annot=True, fmt='d', xticklabels=class_labels, yticklabels=class_labels)
    plt.ylabel('Observed')
    plt.xlabel('Predicted')
    plt.show()
    return cm

In [None]:
# Evaluate the baseline model (logreg) on the test set.
print('Confusion Matrix')
print()
# The helper draws the heatmap itself, so it is called directly rather than
# wrapped in print().
bank_details_cm(y_test,y_predict)
print()
print()
print("Training accuracy=",logreg.score(x_train,y_train))
print()
print("Testing accuracy=",logreg.score(x_test, y_test))
print()
print("Recall=",recall_score(y_test,y_predict))
print()
print("Precision=",precision_score(y_test,y_predict))
print()
print("F1 Score=",f1_score(y_test,y_predict))
print()
# ROC AUC is a ranking metric: it needs the positive-class probabilities.
# Hard 0/1 predictions collapse the curve to a single threshold and understate it.
print("Roc Auc Score=",roc_auc_score(y_test,logreg.predict_proba(x_test)[:,1]))

In [209]:
# Test-set accuracy of the baseline model, kept in a variable
# (same call as the "Testing accuracy" printed by the evaluation cell above).
logreg_score = logreg.score(x_test, y_test)
print(logreg_score)

0.944


# 5. Give your reasoning on how can the model perform better? (10 marks) Hint: Check parameter

In [193]:
# Hyper-parameters of the baseline model; these are the knobs tuned in the cells below.
# NOTE(review): the recorded output lists random_state 42 while the model is built with
# random_state=30, so that output presumably comes from an earlier run — re-execute.
logreg.get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': 42,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [210]:
# Compare the available solvers at a fixed 'l2' penalty and C = 0.75.
# All five solvers support 'l2'; 'l1' is supported only by 'liblinear' and 'saga'.
# NOTE(review): 'sag' and 'saga' score noticeably lower in the recorded output;
# presumably they do not converge within the default max_iter on unscaled features
# (convergence warnings are silenced by the global warnings filter) — confirm.

train_score=[]
test_score=[]
solver = ['newton-cg','lbfgs','liblinear','sag','saga']
for solver_name in solver:
    model = LogisticRegression(random_state=30,penalty='l2', C = 0.75,solver=solver_name)  # changing values of solver
    model.fit(x_train, y_train)
    # Only the accuracies are needed here. Predictions are deliberately not stored,
    # so the notebook-level y_predict is not overwritten by the last solver tried.
    train_score.append(round(model.score(x_train, y_train),3))
    test_score.append(round(model.score(x_test, y_test),3))

print(solver)
print()
print(train_score)
print()
print(test_score)

['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']

[0.951, 0.945, 0.951, 0.901, 0.901]

[0.947, 0.943, 0.945, 0.893, 0.894]


In [211]:
# Switch the penalty to l1, using the liblinear solver
model = LogisticRegression(penalty='l1', solver='liblinear', random_state=30)
model.fit(x_train, y_train)
y_predict = model.predict(x_test)

train_accuracy = model.score(x_train, y_train)
test_accuracy = model.score(x_test, y_test)
print("Training accuracy", train_accuracy)
print()
print("Testing accuracy", test_accuracy)

Training accuracy 0.952

Testing accuracy 0.9466666666666667


### Accuracy with the 'l1' penalty (liblinear) is at least as good as with 'l2' for any of the solvers tried above

In [212]:
# Same l1 / liblinear model, now with class weights set to 'balanced'
model = LogisticRegression(
    penalty='l1',
    solver='liblinear',
    class_weight='balanced',
    random_state=30,
)
model.fit(x_train, y_train)
y_predict = model.predict(x_test)

train_accuracy = model.score(x_train, y_train)
test_accuracy = model.score(x_test, y_test)
print("Training accuracy", train_accuracy)
print()
print("Testing accuracy", test_accuracy)

Training accuracy 0.9005714285714286

Testing accuracy 0.8933333333333333


In [217]:
# Compare values of C (inverse regularisation strength) for the balanced l1 / liblinear model.
train_score=[]
test_score=[]
C = [0.01,0.1,0.25,0.5,0.75,1]
for c_value in C:
    model = LogisticRegression(random_state=30,penalty='l1', solver='liblinear', class_weight='balanced',C=c_value)  # changing values of C
    model.fit(x_train, y_train)
    # Only the accuracies are needed here. Predictions are deliberately not stored,
    # so the notebook-level y_predict is not overwritten by the last C tried.
    train_score.append(round(model.score(x_train,y_train),3))
    test_score.append(round(model.score(x_test, y_test),3))

print(C)
print()
print(train_score)
print()
print(test_score)

[0.01, 0.1, 0.25, 0.5, 0.75, 1]

[0.875, 0.895, 0.901, 0.901, 0.901, 0.901]

[0.867, 0.889, 0.894, 0.892, 0.893, 0.893]


#### Best testing accuracy is obtained for C=0.25

In [None]:
# New model: balanced class weights, l1 penalty, C=0.25
model = LogisticRegression(random_state=30,penalty='l1', solver='liblinear', class_weight='balanced',C=0.25)
model.fit(x_train, y_train)
y_predict = model.predict(x_test)
print("Training accuracy",model.score(x_train,y_train))
print()
print("Testing accuracy",model.score(x_test, y_test))
print()
print('Confusion Matrix')
# The helper draws the heatmap itself, so it is called directly rather than
# wrapped in print().
bank_details_cm(y_test,y_predict)
print()
print("Recall:",recall_score(y_test,y_predict))
print()
print("Precision:",precision_score(y_test,y_predict))
print()
print("F1 Score:",f1_score(y_test,y_predict))
print()
# ROC AUC is a ranking metric: it needs the positive-class probabilities.
# Hard 0/1 predictions collapse the curve to a single threshold and understate it.
print("Roc Auc Score:",roc_auc_score(y_test,model.predict_proba(x_test)[:,1]))

### Precision in the final model is lower than in the first (baseline) confusion matrix.
### I would therefore go with the model behind the first confusion matrix above (cell In [208]).

In [None]:
# ROC curve of the baseline model (logreg).
# roc_auc_score and roc_curve come from the imports cell at the top of the notebook.

# Positive-class probabilities drive both the curve and the AUC, so the area
# shown in the legend is the area under the curve that is actually plotted.
test_proba = logreg.predict_proba(x_test)[:,1]
logit_roc_auc = roc_auc_score(y_test, test_proba)
fpr, tpr, thresholds = roc_curve(y_test, test_proba)
plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
plt.plot([0, 1], [0, 1],'r--')  # chance-level diagonal for reference
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show()

# 6. Give Business understanding of your model? (5 marks) 

In [None]:
**Confusion matrix means**

*True Positive (observed=1,predicted=1): 124*

Predicted that liability customers will buy personal loan and the liability customers bought the personal loan

*False Positive (observed=0,predicted=1): 169*

Predicted that liability customers will buy personal loan, but liability customers did not buy the personal loan

*True Negative (observed=0,predicted=0): 1182*

Predicted that liability customers will not buy personal loan and liability customers did not buy any personal loan

*False Negative (observed=1,predicted=0): 25*

Predicted that liability customers will not buy personal loan, but liability customers bought personal loans


Here the bank wants to convert its liability customers into personal loan customers while retaining them as depositors.
True Positives and True Negatives should be high, while False Positives and False Negatives should be low.
Both precision and recall therefore matter, which makes the F1 score the most important metric.


After achieving the desired accuracy, we can deploy the model for practical use.
The bank can then predict which liability customers are likely to purchase a personal loan
and convert as many of them as possible without losing them as depositors.
The model can be used for both existing and future liability customers.