In [1]:
#Basic libraries to be imported
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
#reading dataset
df=pd.read_csv("Bank_Personal_Loan_Modelling.csv")

In [3]:
#to print the dataset
df

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,1,25,1,49,91107,4,1.6,1,0,0,1,0,0,0
1,2,45,19,34,90089,3,1.5,1,0,0,1,0,0,0
2,3,39,15,11,94720,1,1.0,1,0,0,0,0,0,0
3,4,35,9,100,94112,1,2.7,2,0,0,0,0,0,0
4,5,35,8,45,91330,4,1.0,2,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,4996,29,3,40,92697,1,1.9,3,0,0,0,0,1,0
4996,4997,30,4,15,92037,4,0.4,1,85,0,0,0,1,0
4997,4998,63,39,24,93023,2,0.3,3,0,0,0,0,0,0
4998,4999,65,40,49,90034,3,0.5,2,0,0,0,0,1,0


In [4]:
#checking null values in the dataset
df.isnull().sum()

ID                    0
Age                   0
Experience            0
Income                0
ZIP Code              0
Family                0
CCAvg                 0
Education             0
Mortgage              0
Personal Loan         0
Securities Account    0
CD Account            0
Online                0
CreditCard            0
dtype: int64

In [5]:
#As there is no null data found hence we will be checking datatype in the dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   ID                  5000 non-null   int64  
 1   Age                 5000 non-null   int64  
 2   Experience          5000 non-null   int64  
 3   Income              5000 non-null   int64  
 4   ZIP Code            5000 non-null   int64  
 5   Family              5000 non-null   int64  
 6   CCAvg               5000 non-null   float64
 7   Education           5000 non-null   int64  
 8   Mortgage            5000 non-null   int64  
 9   Personal Loan       5000 non-null   int64  
 10  Securities Account  5000 non-null   int64  
 11  CD Account          5000 non-null   int64  
 12  Online              5000 non-null   int64  
 13  CreditCard          5000 non-null   int64  
dtypes: float64(1), int64(13)
memory usage: 547.0 KB


In [6]:
#Column 'ID' ,'ZIP Code' are of No use for the purpose of prediction hence we will be removing the same for the faster performance.
df.drop(["ID","ZIP Code"],axis=1,inplace=True)

In [7]:
#Just to cross check whether the column is been deleted or not
df.head()

Unnamed: 0,Age,Experience,Income,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,25,1,49,4,1.6,1,0,0,1,0,0,0
1,45,19,34,3,1.5,1,0,0,1,0,0,0
2,39,15,11,1,1.0,1,0,0,0,0,0,0
3,35,9,100,1,2.7,2,0,0,0,0,0,0
4,35,8,45,4,1.0,2,0,0,0,0,0,1


In [8]:
#Here Column "Personal Loan" is the Target column

In [9]:
#Initially we will train our dataset with all the features
#We will separate the Independent and Dependent data from the dataset
X=df.drop(['Personal Loan'],axis=1)
Y=df['Personal Loan']

In [10]:
#Importing library for training and testing of the data
from sklearn.model_selection import train_test_split

In [11]:
#Separate train and test the splited data into X and Y
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.3,random_state=1)
#where test size=0.3 means 30% for testing the module and 70% is for training the module,random_states means data will be picked up randomly from the dataset

In [12]:
#Dataset needs to be predicted with the help of Classification Regression(Logistic Regression algo.)
#For this we will be calling library of Logistic regression
from sklearn.linear_model import LogisticRegression

In [13]:
#need to call classification_report
from sklearn.metrics import classification_report

In [14]:
#create object of LogisticRegression class
lr=LogisticRegression()

In [15]:
#create function for the purpose of 
def create_model(model):
    model.fit(X_train,Y_train) #we train the model
    y_pred=model.predict(X_test)  #test the model
    print(classification_report(Y_test,y_pred))
    return model

In [16]:
#call function
create_model(lr)

              precision    recall  f1-score   support

           0       0.95      0.99      0.97      1351
           1       0.82      0.53      0.64       149

    accuracy                           0.94      1500
   macro avg       0.89      0.76      0.81      1500
weighted avg       0.94      0.94      0.94      1500



LogisticRegression()

In [17]:
#The score is very bad and hence we will have to use another algorithm Decision Tree

In [18]:
#calling DecisionTreeClassifier 
from sklearn.tree import DecisionTreeClassifier

In [19]:
#we will create object of DecisionTreeClassifier
dt=DecisionTreeClassifier()

In [20]:
#call function
create_model(dt)

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1351
           1       0.90      0.89      0.90       149

    accuracy                           0.98      1500
   macro avg       0.95      0.94      0.94      1500
weighted avg       0.98      0.98      0.98      1500



DecisionTreeClassifier()

In [21]:
#The score is good but can also increase more and final score can be excellent with the help of other algorithms and techniques..
#We will try Naive Ensembling technique in this dataset
#We will create object of the 3 algorithms which are to be used in Naive Ensembling Technique
lr=LogisticRegression()
dt=DecisionTreeClassifier()
dt1=DecisionTreeClassifier(criterion="entropy")

In [22]:
#We will create Model List first to put all 3 algorithm in a list
model_list=[("LogisticRegression",lr),("DecisionTreeClassifier",dt),("DecisionTreeClassifier-entropy",dt1)]

In [23]:
#we will train Data first with Hard Voting ALgo. method
#Hard Voting

In [24]:
#Need to call Class of the (Naive Ensembling Technique)
from sklearn.ensemble import VotingClassifier

In [25]:
#need to create object for VotingClassifier
vc=VotingClassifier(estimators=model_list)
#estimators means all 3 algorithms which are to be used for the purpose of training and testing of the data.

In [26]:
#Call function
create_model(vc)

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1351
           1       0.95      0.84      0.89       149

    accuracy                           0.98      1500
   macro avg       0.97      0.92      0.94      1500
weighted avg       0.98      0.98      0.98      1500



VotingClassifier(estimators=[('LogisticRegression', LogisticRegression()),
                             ('DecisionTreeClassifier',
                              DecisionTreeClassifier()),
                             ('DecisionTreeClassifier-entropy',
                              DecisionTreeClassifier(criterion='entropy'))])

In [27]:
#As per re-call score we are getting, Hard voting is not helpful for this dataset..
#Now we will try for Soft voting..

In [28]:
#Soft voting
vc1=VotingClassifier(estimators=model_list,voting="soft")

In [29]:
#call function
create_model(vc1)

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1351
           1       0.95      0.85      0.90       149

    accuracy                           0.98      1500
   macro avg       0.97      0.92      0.94      1500
weighted avg       0.98      0.98      0.98      1500



VotingClassifier(estimators=[('LogisticRegression', LogisticRegression()),
                             ('DecisionTreeClassifier',
                              DecisionTreeClassifier()),
                             ('DecisionTreeClassifier-entropy',
                              DecisionTreeClassifier(criterion='entropy'))],
                 voting='soft')

In [30]:
#Soft voting is also not helpful for the increment in the prediction score..
#In this Scenario we will be using othe ensembling technique in order to get good Score
#We will now be using Boosting Ensembling technique for this dataset

In [31]:
#calling class of AdaBoost Ensembling method
from sklearn.ensemble import AdaBoostClassifier

In [32]:
#creating object of AdaBoostClassifier
ada=AdaBoostClassifier(n_estimators=75)

In [33]:
#call function
create_model(ada)

              precision    recall  f1-score   support

           0       0.97      0.99      0.98      1351
           1       0.88      0.76      0.82       149

    accuracy                           0.97      1500
   macro avg       0.93      0.87      0.90      1500
weighted avg       0.96      0.97      0.96      1500



AdaBoostClassifier(n_estimators=75)

In [34]:
#using AdaBoosting Decresing the score accuracy and Hence now we will be using Gradient Boosting Method

In [35]:
#Calling Gradient Boosting Ensembling technique class
from sklearn.ensemble import GradientBoostingClassifier

In [36]:
#Creating object of Gradient Boosting Classifier
gr=GradientBoostingClassifier(n_estimators=70)

In [37]:
#call function
create_model(gr)

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1351
           1       0.96      0.86      0.90       149

    accuracy                           0.98      1500
   macro avg       0.97      0.93      0.95      1500
weighted avg       0.98      0.98      0.98      1500



GradientBoostingClassifier(n_estimators=70)

In [38]:
#Using Gradient Boosting Improving the score but not at that pace what we required and hence now we will be using Extreme Gradient Boosting
#calling class of ExtremeGradient Boosting Classifier
from xgboost import XGBClassifier

In [39]:
#creating object of Extreme GradientBoosting Classifier
xg=XGBClassifier(n_estimators=25,reg_alpha=1)

In [40]:
#call function
create_model(xg)

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1351
           1       0.97      0.87      0.92       149

    accuracy                           0.98      1500
   macro avg       0.98      0.93      0.96      1500
weighted avg       0.98      0.98      0.98      1500



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=25, n_jobs=4, num_parallel_tree=1, random_state=0,
              reg_alpha=1, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [41]:
#Using Xtreme GradientBoosting Method has improved the score but still it can also increase and for that we will be using SVM method.

In [42]:
#calling library of Support vector machine
from sklearn.svm import LinearSVC

In [43]:
#create object of LinearSVC class
svc=LinearSVC(random_state=1)

In [44]:
#call function
create_model(svc)

              precision    recall  f1-score   support

           0       0.99      0.85      0.92      1351
           1       0.41      0.91      0.56       149

    accuracy                           0.86      1500
   macro avg       0.70      0.88      0.74      1500
weighted avg       0.93      0.86      0.88      1500



LinearSVC(random_state=1)

In [45]:
#we add some error on training time(to reduce overfit model)
#create object of LinearSVC class
svc1=LinearSVC(random_state=1,C=0.5)
# C inbuilt parameter, to add error on training time

In [46]:
#call function
create_model(svc1)

              precision    recall  f1-score   support

           0       0.95      0.92      0.94      1351
           1       0.45      0.60      0.52       149

    accuracy                           0.89      1500
   macro avg       0.70      0.76      0.73      1500
weighted avg       0.90      0.89      0.89      1500



LinearSVC(C=0.5, random_state=1)

In [47]:
#calling the class of svm(poly-kernel)
from sklearn.svm import SVC

In [48]:
#create object of SVC class
poly_svc=SVC(random_state=1,kernel="poly")

In [49]:
#function call
create_model(poly_svc)

              precision    recall  f1-score   support

           0       0.91      1.00      0.95      1351
           1       0.95      0.14      0.25       149

    accuracy                           0.91      1500
   macro avg       0.93      0.57      0.60      1500
weighted avg       0.92      0.91      0.88      1500



SVC(kernel='poly', random_state=1)

In [50]:
#radial basis kernel function

In [51]:
#create object of SVC class
r_svc=SVC(random_state=1,kernel="rbf") #rbf means radial basis function

In [52]:
#call function
create_model(r_svc)

              precision    recall  f1-score   support

           0       0.91      0.99      0.95      1351
           1       0.71      0.11      0.20       149

    accuracy                           0.91      1500
   macro avg       0.81      0.55      0.57      1500
weighted avg       0.89      0.91      0.88      1500



SVC(random_state=1)

In [53]:
#Even after using support vector machine the score we got is not a satisfactory and hence now we will be using sampling method 
#to uplift the score

In [54]:
#calling class of sampling method
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

In [55]:
#create object of RandomOverSampler() class
ros=RandomOverSampler()

In [56]:
X_sample1,Y_sample1=ros.fit_sample(X_train,Y_train)

In [57]:
pd.Series(Y_train).value_counts()

0    3169
1     331
Name: Personal Loan, dtype: int64

In [58]:
pd.Series(Y_sample1).value_counts()

1    3169
0    3169
Name: Personal Loan, dtype: int64

In [59]:
#after sampling we use DecisionTreeClassifier
dt3=DecisionTreeClassifier(max_depth=3) #use pruning technique

In [60]:
dt3.fit(X_sample1,Y_sample1) #we train model

DecisionTreeClassifier(max_depth=3)

In [61]:
pd.Series(Y_test).value_counts()

0    1351
1     149
Name: Personal Loan, dtype: int64

In [62]:
X_sample_test,Y_sample_test=ros.fit_sample(X_test,Y_test)

In [63]:
pd.Series(Y_sample_test).value_counts()

1    1351
0    1351
Name: Personal Loan, dtype: int64

In [64]:
y_pred=dt3.predict(X_sample_test)  #test the model

In [65]:
print(classification_report(Y_sample_test,y_pred))

              precision    recall  f1-score   support

           0       0.94      0.95      0.95      1351
           1       0.95      0.94      0.95      1351

    accuracy                           0.95      2702
   macro avg       0.95      0.95      0.95      2702
weighted avg       0.95      0.95      0.95      2702



In [66]:
#Conclusion
#using Sampling method we are able to achieve good score consisting Precision, recall, f1 and accuracy score all score ratio
#is good and hence we can say that the sampling method is good for this particular dataset