# Import the relevant libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's default theme so any matplotlib figures use seaborn styling.
sns.set()

# Importing the Raw Data

In [2]:
# Load the data the model is fitted on; the path is relative to the notebook's working directory.
rawData = pd.read_csv('Bank-data.csv')
rawData

Unnamed: 0.1,Unnamed: 0,interest_rate,credit,march,may,previous,duration,y
0,0,1.334,0.0,1.0,0.0,0.0,117.0,no
1,1,0.767,0.0,0.0,2.0,1.0,274.0,yes
2,2,4.858,0.0,1.0,0.0,0.0,167.0,no
3,3,4.120,0.0,0.0,0.0,0.0,686.0,yes
4,4,4.856,0.0,1.0,0.0,0.0,157.0,no
...,...,...,...,...,...,...,...,...
513,513,1.334,0.0,1.0,0.0,0.0,204.0,no
514,514,0.861,0.0,0.0,2.0,1.0,806.0,yes
515,515,0.879,0.0,0.0,0.0,0.0,290.0,no
516,516,0.877,0.0,0.0,5.0,1.0,473.0,yes


# Data Preprocessing

### Descriptive Statistics

In [3]:
# include='all' also summarises the non-numeric target column `y` (count/unique/top/freq).
rawData.describe(include='all')

Unnamed: 0.1,Unnamed: 0,interest_rate,credit,march,may,previous,duration,y
count,518.0,518.0,518.0,518.0,518.0,518.0,518.0,518
unique,,,,,,,,2
top,,,,,,,,yes
freq,,,,,,,,259
mean,258.5,2.835776,0.034749,0.266409,0.388031,0.127413,382.177606,
std,149.677988,1.876903,0.183321,0.442508,0.814527,0.333758,344.29599,
min,0.0,0.635,0.0,0.0,0.0,0.0,9.0,
25%,129.25,1.04275,0.0,0.0,0.0,0.0,155.0,
50%,258.5,1.466,0.0,0.0,0.0,0.0,266.5,
75%,387.75,4.9565,0.0,1.0,0.0,0.0,482.75,


### Variables of Interest

In [4]:
# List the available column names before choosing which variables to keep.
rawData.columns.values

array(['Unnamed: 0', 'interest_rate', 'credit', 'march', 'may',
       'previous', 'duration', 'y'], dtype=object)

In [5]:
# Target first, followed by the predictors kept for the model.
cols = ['loan', 'interest_rate', 'credit', 'march', 'previous', 'duration']

# Rename the target to "loan", then discard the "Unnamed: 0" column and "may"
# ("may" is dropped because of its high p-value).
rawData = rawData.rename(columns={'y': 'loan'}).drop(columns=['Unnamed: 0', 'may'])

# Put the remaining columns in the order declared above.
rawData = rawData[cols]
rawData.head()

Unnamed: 0,loan,interest_rate,credit,march,previous,duration
0,no,1.334,0.0,1.0,0.0,117.0
1,yes,0.767,0.0,0.0,1.0,274.0
2,no,4.858,0.0,1.0,0.0,167.0
3,yes,4.12,0.0,0.0,0.0,686.0
4,no,4.856,0.0,1.0,0.0,157.0


### Dummy Categorical Variables

In [6]:
# Encode the target as 1 ("yes") / 0 ("no").
# Series.map turns every value that is missing from the dict into NaN, so the
# result is validated BEFORE it overwrites the column: an unexpected label, or an
# accidental re-run of this cell on the already-encoded column, now fails loudly
# instead of silently replacing `loan` with NaN.
loan_encoded = rawData['loan'].map({'yes':1, 'no':0})
assert loan_encoded.notna().all(), "'loan' contains values other than 'yes'/'no'"
rawData['loan'] = loan_encoded
rawData.head()

Unnamed: 0,loan,interest_rate,credit,march,previous,duration
0,0,1.334,0.0,1.0,0.0,117.0
1,1,0.767,0.0,0.0,1.0,274.0
2,0,4.858,0.0,1.0,0.0,167.0
3,1,4.12,0.0,0.0,0.0,686.0
4,0,4.856,0.0,1.0,0.0,157.0


### Descriptive Statistics after preprocessing

In [7]:
# Same summary after preprocessing; `loan` is numeric now, so it gets mean/std/quantiles as well.
rawData.describe(include='all')

Unnamed: 0,loan,interest_rate,credit,march,previous,duration
count,518.0,518.0,518.0,518.0,518.0,518.0
mean,0.5,2.835776,0.034749,0.266409,0.127413,382.177606
std,0.500483,1.876903,0.183321,0.442508,0.333758,344.29599
min,0.0,0.635,0.0,0.0,0.0,9.0
25%,0.0,1.04275,0.0,0.0,0.0,155.0
50%,0.5,1.466,0.0,0.0,0.0,266.5
75%,1.0,4.9565,0.0,1.0,0.0,482.75
max,1.0,4.97,1.0,1.0,1.0,2653.0


In [8]:
# Checkpoint: take an independent copy of the preprocessed frame for the modelling cells.
data = rawData.copy()

# Create the Regression

### Declare the dependent (target) and independent (inputs) variables

In [9]:
# Predictors: every column except the response.
inputs = data.drop(columns=['loan'])
# Response: the 0/1 encoded "loan" column.
target = data['loan']

### Logistic Regression using StatsModels

In [10]:
import statsmodels.api as smapi

In [11]:
# NOTE(review): `x` (the inputs plus a constant column) is built here but never used --
# the model below is fitted on `inputs`, i.e. WITHOUT an intercept, which is why the
# summary table has no `const` row. The later cells also call the model on
# constant-free inputs, so switching the fit to `x` requires adding the constant
# to every frame passed to predict()/confusion_matrix() as well.
x = smapi.add_constant(inputs)
results = smapi.Logit(target, inputs).fit()

Optimization terminated successfully.
         Current function value: 0.336668
         Iterations 7


### Logistic Regression Summary

In [12]:
# Show the coefficient table and fit statistics of the model fitted above.
results.summary()

0,1,2,3
Dep. Variable:,loan,No. Observations:,518.0
Model:,Logit,Df Residuals:,513.0
Method:,MLE,Df Model:,4.0
Date:,"Sat, 16 Oct 2021",Pseudo R-squ.:,0.5143
Time:,10:44:10,Log-Likelihood:,-174.39
converged:,True,LL-Null:,-359.05
Covariance Type:,nonrobust,LLR p-value:,1.185e-78

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
interest_rate,-0.8030,0.079,-10.201,0.000,-0.957,-0.649
credit,2.3459,1.071,2.190,0.029,0.246,4.445
march,-1.8387,0.315,-5.831,0.000,-2.457,-1.221
previous,1.5262,0.478,3.190,0.001,0.588,2.464
duration,0.0069,0.001,10.365,0.000,0.006,0.008


### Confusion Matrix

In [13]:
def confusion_matrix(data, actual_values, model, threshold=0.5):
    """Build the 2x2 confusion matrix of a fitted binary model and its hit rates.

    Parameters
    ----------
    data : data frame or array
        Inputs formatted exactly like the data the model was fitted on
        (same columns in the same order, without the actual values).
    actual_values : data frame column or array
        The observed classes: a single column of 0s and 1s.
    model : fitted model object
        Any object whose ``predict(data)`` returns probabilities in [0, 1],
        e.g. a statsmodels ``LogitResults``.
    threshold : float, default 0.5
        Probability cut-off. Predictions below it count as class 0,
        predictions at or above it count as class 1.

    Returns
    -------
    tuple
        ``(cm, accuracy, misclassification)`` where ``cm[i, j]`` is the number
        of observations with actual class ``i`` and predicted class ``j``, and
        the two rates are rounded to 3 decimals.

    Raises
    ------
    ValueError
        If ``threshold`` is not strictly between 0 and 1.
    """
    if not 0 < threshold < 1:
        raise ValueError("threshold must be strictly between 0 and 1")

    # Predicted probabilities from the fitted model.
    pred_values = model.predict(data)

    # histogram2d does the cross-tabulation: bins are half-open on the right
    # except the last one, so a probability equal to the threshold (or to 1)
    # lands in class 1. The actual classes are always split at 0.5.
    actual_bins = np.array([0, 0.5, 1])
    predicted_bins = np.array([0, threshold, 1])
    cm = np.histogram2d(actual_values, pred_values,
                        bins=[actual_bins, predicted_bins])[0]

    # Diagonal = correct predictions, off-diagonal = errors.
    total = cm.sum()
    accuracy = (cm[0, 0] + cm[1, 1]) / total
    misclassification = (cm[1, 0] + cm[0, 1]) / total
    return cm, accuracy.round(3), misclassification.round(3)

In [14]:
# Label statsmodels' prediction table: rows are the actual classes,
# columns are the predicted classes.
confusion_mat = pd.DataFrame(
    results.pred_table(),
    index=['Actual 0', 'Actual 1'],
    columns=['Predicted 0', 'Predicted 1'],
)

confusion_mat

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,218.0,41.0
Actual 1,28.0,231.0


### Accuracy of the Model

In [15]:
# Evaluate on the data the model was fitted on.
# `cm` is a tuple: (confusion matrix, accuracy, misclassification rate).
cm = confusion_matrix(data=inputs, actual_values=target, model=results)
cm

(array([[218.,  41.],
        [ 28., 231.]]),
 0.867,
 0.133)

In [16]:
# Recompute both rates straight from the labelled confusion-matrix table.
confusion_mat_array = confusion_mat.to_numpy()
total_count = confusion_mat_array.sum()

# Diagonal cells are the correct predictions, off-diagonal cells the errors.
accuracy = np.trace(confusion_mat_array) / total_count
missclassification_rate = (confusion_mat_array[0,1] + confusion_mat_array[1,0]) / total_count

print('Accuracy is: ', accuracy.round(3),
      '\nMissclassification Rate is: ', missclassification_rate.round(3))

Accuracy is:  0.867 
Missclassification Rate is:  0.133


# Testing Dataset

### Importing the testing data

In [17]:
# Load the testing set; the path is relative to the notebook's working directory.
data_test = pd.read_csv('Bank-data-testing.csv')
data_test

Unnamed: 0.1,Unnamed: 0,interest_rate,credit,march,may,previous,duration,y
0,0,1.313,0.0,1.0,0.0,0.0,487.0,no
1,1,4.961,0.0,0.0,0.0,0.0,132.0,no
2,2,4.856,0.0,1.0,0.0,0.0,92.0,no
3,3,4.120,0.0,0.0,0.0,0.0,1468.0,yes
4,4,4.963,0.0,0.0,0.0,0.0,36.0,no
...,...,...,...,...,...,...,...,...
217,217,4.963,0.0,0.0,0.0,0.0,458.0,yes
218,218,1.264,0.0,1.0,1.0,0.0,397.0,yes
219,219,1.281,0.0,1.0,0.0,0.0,34.0,no
220,220,0.739,0.0,0.0,2.0,0.0,233.0,no


### Preprocessing the testing data

##### Variables of Interest

In [18]:
# Apply the same column selection as for the training data: drop the unused
# columns, rename the target, and reuse the `cols` ordering.
data_test = data_test.drop(columns=['Unnamed: 0', 'may']).rename(columns={'y': 'loan'})
data_test = data_test[cols]
data_test.head()

Unnamed: 0,loan,interest_rate,credit,march,previous,duration
0,no,1.313,0.0,1.0,0.0,487.0
1,no,4.961,0.0,0.0,0.0,132.0
2,no,4.856,0.0,1.0,0.0,92.0
3,yes,4.12,0.0,0.0,0.0,1468.0
4,no,4.963,0.0,0.0,0.0,36.0


##### Dummy Categorical Variables

In [19]:
# Encode the testing target as 1 ("yes") / 0 ("no").
# Series.map turns every value that is missing from the dict into NaN, so the
# result is validated BEFORE it overwrites the column: an unexpected label, or an
# accidental re-run of this cell on the already-encoded column, now fails loudly
# instead of silently replacing `loan` with NaN.
loan_test_encoded = data_test['loan'].map({'yes':1, 'no':0})
assert loan_test_encoded.notna().all(), "'loan' contains values other than 'yes'/'no'"
data_test['loan'] = loan_test_encoded
data_test

Unnamed: 0,loan,interest_rate,credit,march,previous,duration
0,0,1.313,0.0,1.0,0.0,487.0
1,0,4.961,0.0,0.0,0.0,132.0
2,0,4.856,0.0,1.0,0.0,92.0
3,1,4.120,0.0,0.0,0.0,1468.0
4,0,4.963,0.0,0.0,0.0,36.0
...,...,...,...,...,...,...
217,1,4.963,0.0,0.0,0.0,458.0
218,1,1.264,0.0,1.0,0.0,397.0
219,0,1.281,0.0,1.0,0.0,34.0
220,0,0.739,0.0,0.0,0.0,233.0


##### Testing Descriptive Statistics

In [20]:
# Summary statistics of the preprocessed testing data.
data_test.describe(include='all')

Unnamed: 0,loan,interest_rate,credit,march,previous,duration
count,222.0,222.0,222.0,222.0,222.0,222.0
mean,0.5,2.922095,0.031532,0.274775,0.099099,398.86036
std,0.50113,1.891766,0.175144,0.44741,0.29947,410.565798
min,0.0,0.639,0.0,0.0,0.0,6.0
25%,0.0,1.04925,0.0,0.0,0.0,144.75
50%,0.5,1.714,0.0,0.0,0.0,255.5
75%,1.0,4.96,0.0,1.0,0.0,525.25
max,1.0,4.968,1.0,1.0,1.0,3643.0


### Logistic Regression prediction

##### Declare the targets and the inputs

In [21]:
# Predictors: every testing column except the response.
inputs_predict = data_test.drop(columns=['loan'])
# Response: the 0/1 encoded "loan" column of the testing data.
target_predict = data_test['loan']

##### Prediction

In [22]:
# Predicted probability of loan == 1 for every testing row, using the model fitted earlier.
results.predict(inputs_predict)

0      0.618883
1      0.044440
2      0.006060
3      0.998966
4      0.023302
         ...   
217    0.308182
218    0.475000
219    0.067139
220    0.735491
221    0.692285
Length: 222, dtype: float64

##### Confusion Matrix

In [23]:
# Evaluate the fitted model on the testing data.
# `cm_test` is a tuple: (confusion matrix, accuracy, misclassification rate).
cm_test = confusion_matrix(data=inputs_predict, actual_values=target_predict, model=results)
cm_test

(array([[93., 18.],
        [13., 98.]]),
 0.86,
 0.14)

In [24]:
# cm_test is (confusion matrix, accuracy, misclassification rate); report the two rates.
test_accuracy, test_missclassification = cm_test[1:]
print('Testing Accuracy is: ', test_accuracy,
      '\nTesting Missclassification Rate is: ', test_missclassification)

Testing Accuracy is:  0.86 
Testing Missclassification Rate is:  0.14
