In [21]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import plotly.graph_objs as go
import plotly.offline as pyo

In [22]:
df = pd.read_csv('Bank_Personal_Loan_Modelling.csv')
df.head(10)

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,1,25,1,49,91107,4,1.6,1,0,0,1,0,0,0
1,2,45,19,34,90089,3,1.5,1,0,0,1,0,0,0
2,3,39,15,11,94720,1,1.0,1,0,0,0,0,0,0
3,4,35,9,100,94112,1,2.7,2,0,0,0,0,0,0
4,5,35,8,45,91330,4,1.0,2,0,0,0,0,0,1
5,6,37,13,29,92121,4,0.4,2,155,0,0,0,1,0
6,7,53,27,72,91711,2,1.5,2,0,0,0,0,1,0
7,8,50,24,22,93943,1,0.3,3,0,0,0,0,0,1
8,9,35,10,81,90089,3,0.6,2,104,0,0,0,1,0
9,10,34,9,180,93023,1,8.9,3,0,1,0,0,0,0


**Attribute Information:**

ID : Customer ID

Age : Customer's age in completed years

Experience : #years of professional experience

Income : Annual income of the customer ($000)

ZIP Code : Home Address ZIP code.

Family : Family size of the customer

CCAvg : Avg. spending on credit cards per month ($000)

Education : Education Level.
1: Undergrad;
2: Graduate;
3: Advanced/Professional

Mortgage : Value of house mortgage if any. ($000)

10.Personal Loan : Did this customer accept the personal loan offered in the last campaign?

11.Securities Account : Does the customer have a securities account with the bank?

12.CD Account : Does the customer have a certificate of deposit (CD) account with the bank?

13.Online : Does the customer use internet banking facilities?

14.Credit card : Does the customer use a credit card issued by

### DATA GATHERING AND EXPLORATION

In [23]:
df.describe()

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
count,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0
mean,2500.5,45.3384,20.1046,73.7742,93152.503,2.3964,1.937938,1.881,56.4988,0.096,0.1044,0.0604,0.5968,0.294
std,1443.520003,11.463166,11.467954,46.033729,2121.852197,1.147663,1.747659,0.839869,101.713802,0.294621,0.305809,0.23825,0.490589,0.455637
min,1.0,23.0,-3.0,8.0,9307.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1250.75,35.0,10.0,39.0,91911.0,1.0,0.7,1.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2500.5,45.0,20.0,64.0,93437.0,2.0,1.5,2.0,0.0,0.0,0.0,0.0,1.0,0.0
75%,3750.25,55.0,30.0,98.0,94608.0,3.0,2.5,3.0,101.0,0.0,0.0,0.0,1.0,1.0
max,5000.0,67.0,43.0,224.0,96651.0,4.0,10.0,3.0,635.0,1.0,1.0,1.0,1.0,1.0


In [24]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   ID                  5000 non-null   int64  
 1   Age                 5000 non-null   int64  
 2   Experience          5000 non-null   int64  
 3   Income              5000 non-null   int64  
 4   ZIP Code            5000 non-null   int64  
 5   Family              5000 non-null   int64  
 6   CCAvg               5000 non-null   float64
 7   Education           5000 non-null   int64  
 8   Mortgage            5000 non-null   int64  
 9   Personal Loan       5000 non-null   int64  
 10  Securities Account  5000 non-null   int64  
 11  CD Account          5000 non-null   int64  
 12  Online              5000 non-null   int64  
 13  CreditCard          5000 non-null   int64  
dtypes: float64(1), int64(13)
memory usage: 547.0 KB


In [25]:
df.isna().any()

ID                    False
Age                   False
Experience            False
Income                False
ZIP Code              False
Family                False
CCAvg                 False
Education             False
Mortgage              False
Personal Loan         False
Securities Account    False
CD Account            False
Online                False
CreditCard            False
dtype: bool

**Identifying outliers in the dataset**

In [26]:
df.boxplot(figsize=(20, 6))

<Axes: xlabel='CD Account'>

A number columns have outliers in them such as; Income, CCAvg, Mortgage, Personal Loans, Securities Account, and CD Account. Our next step is to determine how much outliers exist within the above mentioned columns, we will be doing this using boxplot.

Secondly, we will be dropping columns not needed for this projects such as; ID and Zip Code.

In [27]:
#Outliers in the income column
sns.boxplot(data=df, x=df['Income'])

<Axes: xlabel='Income'>

In [28]:
#Outliers in the CCAvg column
sns.boxplot(data=df, x=df['CCAvg'])

<Axes: xlabel='CCAvg'>

**Dropping unnecessary columns from the dataset**

In [29]:
df.columns

Index(['ID', 'Age', 'Experience', 'Income', 'ZIP Code', 'Family', 'CCAvg',
       'Education', 'Mortgage', 'Personal Loan', 'Securities Account',
       'CD Account', 'Online', 'CreditCard'],
      dtype='object')

In [30]:
df = df[['Age','Experience','Income','Family','Education','Mortgage','Personal Loan', 'Securities Account', 'CD Account', 'CreditCard']]

In [31]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype
---  ------              --------------  -----
 0   Age                 5000 non-null   int64
 1   Experience          5000 non-null   int64
 2   Income              5000 non-null   int64
 3   Family              5000 non-null   int64
 4   Education           5000 non-null   int64
 5   Mortgage            5000 non-null   int64
 6   Personal Loan       5000 non-null   int64
 7   Securities Account  5000 non-null   int64
 8   CD Account          5000 non-null   int64
 9   CreditCard          5000 non-null   int64
dtypes: int64(10)
memory usage: 390.8 KB


In [32]:
df.head(10)

Unnamed: 0,Age,Experience,Income,Family,Education,Mortgage,Personal Loan,Securities Account,CD Account,CreditCard
0,25,1,49,4,1,0,0,1,0,0
1,45,19,34,3,1,0,0,1,0,0
2,39,15,11,1,1,0,0,0,0,0
3,35,9,100,1,2,0,0,0,0,0
4,35,8,45,4,2,0,0,0,0,1
5,37,13,29,4,2,155,0,0,0,0
6,53,27,72,2,2,0,0,0,0,0
7,50,24,22,1,3,0,0,0,0,1
8,35,10,81,3,2,104,0,0,0,0
9,34,9,180,1,3,0,1,0,0,0


In [33]:
df['CreditCard'].value_counts()

0    3530
1    1470
Name: CreditCard, dtype: int64

### DATA VISUALIZATION AND MODEL SELECTION

In [34]:
import plotly.graph_objs as go
import plotly.offline as pyo

In [35]:
trace = go.Scatter(x=df['Personal Loan'], y=df['Income'], mode='markers', marker=dict(color=df['CreditCard']))
trace

data = [trace]

layout = go.Layout(title='Income vs Personal Loan',
                   xaxis=dict(title='Personal Loan'),
                   yaxis=dict(title='Income'))

fig = go.Figure(data=data, layout=layout)
fig.show()

The above scatter plot shows that there is an even distribution of people when it comes to not taking loans. While those who take loans tend to earn more income.

In [36]:
trace = go.Scatter(x=df['Personal Loan'], y=df['Age'], mode='markers', marker=dict(color=df['CreditCard']))
trace

data = [trace]

layout = go.Layout(title='Age vs Personal Loan',
                   xaxis=dict(title='Personal Loan'),
                   yaxis=dict(title='Age'))

fig = go.Figure(data=data, layout=layout)
fig.show()

The above scatter plot shows that age does not have a significant impact on taking personal loans as there is an even distribution of age groups between those who take loans and those who do not.

This is a classification problem and I have dedcided to use the following machine learning model;
Decision Tree and Random Forest

I will use this to compare which of the model performed well in predicting customers taking personal loan

### MODEL TRAINING AND TESTING

**Using Decision Tree**

In [37]:
#importing relevant libraries
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [38]:
X = df.drop('Personal Loan',axis=1)

#Load the target variable to y

y = df['Personal Loan']

In [39]:
#from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [40]:
X

Unnamed: 0,Age,Experience,Income,Family,Education,Mortgage,Securities Account,CD Account,CreditCard
0,25,1,49,4,1,0,1,0,0
1,45,19,34,3,1,0,1,0,0
2,39,15,11,1,1,0,0,0,0
3,35,9,100,1,2,0,0,0,0
4,35,8,45,4,2,0,0,0,1
...,...,...,...,...,...,...,...,...,...
4995,29,3,40,1,3,0,0,0,0
4996,30,4,15,4,1,85,0,0,0
4997,63,39,24,2,3,0,0,0,0
4998,65,40,49,3,2,0,0,0,0


In [41]:
#applying tree algorithm
tree = tree.DecisionTreeClassifier()

In [42]:
 #fitting our model
tree.fit(X_train, y_train)

In [43]:
# evaluating our model
y_pred=tree.predict(X_test)
print("score:{}".format(accuracy_score(y_test, y_pred)))

score:0.973


After using the decision tree machine learning model, we got a 97.6% accuracy score which is very high. We will attempt this with Random Forest to see a higher or lower accuracy.

In [44]:
#confusion matrix
confusion_matrix = pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'])
sns.heatmap(confusion_matrix, annot=True)

<Axes: xlabel='Predicted', ylabel='Actual'>

In [45]:
from sklearn.metrics import classification_report
print (classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.99      0.98      0.99       910
           1       0.84      0.87      0.85        90

    accuracy                           0.97      1000
   macro avg       0.91      0.93      0.92      1000
weighted avg       0.97      0.97      0.97      1000



**Using Random Forest**

In [46]:
#importing random forest classifier
from sklearn.ensemble import RandomForestClassifier

In [47]:
#Creating a random forest with 10 decision trees
clf=RandomForestClassifier(n_estimators=10)

In [48]:
 #fitting our model
clf.fit(X_train, y_train)

In [49]:
# evaluating our model
clf_y_pred=clf.predict(X_test)
print("score:{}".format(accuracy_score(y_test, clf_y_pred)))

score:0.987


In [50]:
#confusion matrix
confusion_matrix = pd.crosstab(y_test, clf_y_pred, rownames=['Actual'], colnames=['Predicted'])
sns.heatmap(confusion_matrix, annot=True)

<Axes: xlabel='Predicted', ylabel='Actual'>

### MODEL DEPLOYMENT AND MONITORING

**Based on the machine learning models tested, I will be deploying this project using the results gotten from Random Forest as it gave the highest accuracy**

In [51]:
input_data = (30, 10, 50, 2, 2, 0, 1, 1, 1)

# changing the input_data to numpy array
input_data_as_numpy_array = np.asarray(input_data)

# reshape the array as we are predicting for one instance
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

prediction = clf.predict(input_data_reshaped)
print(prediction)

if (prediction[0] == 0):
  print('The person will take a loan')
else:
  print('The person would not take a loan')

[0]
The person will take a loan



X does not have valid feature names, but RandomForestClassifier was fitted with feature names



**Saving trained model**

In [52]:
import pickle

In [53]:
filename = 'trained_model_loan.sav'
pickle.dump(clf, open(filename, 'wb'))

In [54]:
loaded_model = pickle.load(open('trained_model_loan.sav', 'rb'))

In [55]:
input_data = (30, 10, 50, 2, 2, 0, 1, 1, 1)

# changing the input_data to numpy array
input_data_as_numpy_array = np.array(input_data)

# reshape the array as we are predicting for one instance
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

prediction = loaded_model.predict(input_data_reshaped)
print(prediction)

if (prediction[0] == 0):
  print('The person will take a loan')
else:
  print('The person will not take a loan')

[0]
The person will take a loan



X does not have valid feature names, but RandomForestClassifier was fitted with feature names

