# Calculating the Accuracy of the Model

Using the same dataset, expand the model by including all other features into the regression. 

Moreover, calculate the accuracy of the model and create a confusion matrix

## Import the relevant libraries

In [3]:
import numpy as np
import pandas as pd
import statsmodels.api as sm

import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

from scipy import stats
stats.chisqprob = lambda chisq, df: stats.chi2.sf(chisq, df)

## Load the data

Load the ‘Bank_data.csv’ dataset.

In [4]:
# Read the training data into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path, and the file name differs
# from the 'Bank_data.csv' named in the text above — confirm the intended file and
# prefer a path relative to the notebook so it can be re-run elsewhere.
raw_data = pd.read_csv(r'C:\Users\Dell\Documents\bankdata.csv')

In [5]:
raw_data.head()

Unnamed: 0.1,Unnamed: 0,interest_rate,credit,march,may,previous,duration,y
0,0,1.334,0.0,1.0,0.0,0.0,117.0,no
1,1,0.767,0.0,0.0,2.0,1.0,274.0,yes
2,2,4.858,0.0,1.0,0.0,0.0,167.0,no
3,3,4.12,0.0,0.0,0.0,0.0,686.0,yes
4,4,4.856,0.0,1.0,0.0,0.0,157.0,no


In [6]:
data = raw_data.copy()

In [7]:
data

Unnamed: 0.1,Unnamed: 0,interest_rate,credit,march,may,previous,duration,y
0,0,1.334,0.0,1.0,0.0,0.0,117.0,no
1,1,0.767,0.0,0.0,2.0,1.0,274.0,yes
2,2,4.858,0.0,1.0,0.0,0.0,167.0,no
3,3,4.120,0.0,0.0,0.0,0.0,686.0,yes
4,4,4.856,0.0,1.0,0.0,0.0,157.0,no
5,5,0.899,0.0,0.0,1.0,0.0,126.0,no
6,6,4.962,0.0,0.0,0.0,0.0,84.0,no
7,7,4.858,0.0,1.0,0.0,0.0,17.0,no
8,8,4.962,0.0,0.0,0.0,0.0,704.0,yes
9,9,4.865,0.0,0.0,0.0,0.0,185.0,no


In [8]:
# Remove the original index column.
# Build `data` from the untouched `raw_data` (instead of overwriting `data` from itself)
# so this cell is idempotent: re-running it neither raises a KeyError on the
# already-dropped column nor maps the already-encoded target to NaN.
data = raw_data.drop(columns=['Unnamed: 0'])

# Encode the target column 'y': 'yes' -> 1, 'no' -> 0
data['y'] = data['y'].map({'yes': 1, 'no': 0})
data

Unnamed: 0,interest_rate,credit,march,may,previous,duration,y
0,1.334,0.0,1.0,0.0,0.0,117.0,0
1,0.767,0.0,0.0,2.0,1.0,274.0,1
2,4.858,0.0,1.0,0.0,0.0,167.0,0
3,4.120,0.0,0.0,0.0,0.0,686.0,1
4,4.856,0.0,1.0,0.0,0.0,157.0,0
5,0.899,0.0,0.0,1.0,0.0,126.0,0
6,4.962,0.0,0.0,0.0,0.0,84.0,0
7,4.858,0.0,1.0,0.0,0.0,17.0,0
8,4.962,0.0,0.0,0.0,0.0,704.0,1
9,4.865,0.0,0.0,0.0,0.0,185.0,0


In [9]:
data.describe()

Unnamed: 0,interest_rate,credit,march,may,previous,duration,y
count,518.0,518.0,518.0,518.0,518.0,518.0,518.0
mean,2.835776,0.034749,0.266409,0.388031,0.127413,382.177606,0.5
std,1.876903,0.183321,0.442508,0.814527,0.333758,344.29599,0.500483
min,0.635,0.0,0.0,0.0,0.0,9.0,0.0
25%,1.04275,0.0,0.0,0.0,0.0,155.0,0.0
50%,1.466,0.0,0.0,0.0,0.0,266.5,0.5
75%,4.9565,0.0,1.0,0.0,0.0,482.75,1.0
max,4.97,1.0,1.0,5.0,1.0,2653.0,1.0


### Declare the dependent and independent variables

Use 'duration' as the independent variable.

In [10]:
y = data['y']
x1 = data ['duration']

### Simple Logistic Regression

Run the regression and graph the scatter plot.

In [11]:
# Add an intercept column to the single predictor, then fit the logit model;
# `result_log` holds the fitted results summarised in the next cell.
x = sm.add_constant(x1)
reg_log = sm.Logit(y,x)
result_log = reg_log.fit()


Optimization terminated successfully.
         Current function value: 0.546118
         Iterations 7


  return ptp(axis=axis, out=out, **kwargs)


In [12]:
result_log.summary()

0,1,2,3
Dep. Variable:,y,No. Observations:,518.0
Model:,Logit,Df Residuals:,516.0
Method:,MLE,Df Model:,1.0
Date:,"Tue, 05 Nov 2019",Pseudo R-squ.:,0.2121
Time:,12:13:55,Log-Likelihood:,-282.89
converged:,True,LL-Null:,-359.05
Covariance Type:,nonrobust,LLR p-value:,5.387e-35

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-1.7001,0.192,-8.863,0.000,-2.076,-1.324
duration,0.0051,0.001,9.159,0.000,0.004,0.006


## Expand the model

We may be omitting many causal factors in our simple logistic model, so we switch to a multivariate logistic regression model instead. Add the ‘interest_rate’, ‘march’, ‘credit’ and ‘previous’ estimators to the model (keeping ‘duration’) and run the regression again. 

### Declare the independent variable(s)

In [13]:
# Predictors of the expanded model; 'duration' from the simple model is kept
# alongside the newly added ones. The same list is reused later to build the test matrix.
estimators =['interest_rate', 'march', 'credit', 'previous', 'duration']
x1 = data[estimators]
# The target is the same 0/1 column used by the simple model
y = data['y']

In [14]:
# Add the intercept column and fit the multivariate logit model.
# `X` and `results_logit` are reused below when computing the confusion matrices.
X = sm.add_constant(x1)
reg_logit = sm.Logit(y,X)
results_logit = reg_logit.fit()
results_logit.summary2()

Optimization terminated successfully.
         Current function value: 0.336664
         Iterations 7


0,1,2,3
Model:,Logit,Pseudo R-squared:,0.514
Dependent Variable:,y,AIC:,360.7836
Date:,2019-11-05 12:14,BIC:,386.2834
No. Observations:,518,Log-Likelihood:,-174.39
Df Model:,5,LL-Null:,-359.05
Df Residuals:,512,LLR p-value:,1.2114e-77
Converged:,1.0000,Scale:,1.0
No. Iterations:,7.0000,,

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
const,-0.0211,0.3113,-0.0677,0.9460,-0.6313,0.5891
interest_rate,-0.8001,0.0895,-8.9434,0.0000,-0.9755,-0.6248
march,-1.8322,0.3297,-5.5563,0.0000,-2.4785,-1.1859
credit,2.3585,1.0875,2.1688,0.0301,0.2271,4.4900
previous,1.5363,0.5010,3.0666,0.0022,0.5544,2.5182
duration,0.0070,0.0007,9.3810,0.0000,0.0055,0.0084


### Confusion Matrix

Create the confusion matrix of the model and estimate its accuracy. 

<i> For convenience we have already provided you with a function that finds the confusion matrix and the model accuracy.</i>

In [15]:
def confusion_matrix(data, actual_values, model, threshold=0.5):
    """Compute the confusion matrix and the accuracy of a fitted binary model.

    Parameters
    ----------
    data : data frame or array
        Input data formatted in the same way as the data the model was fitted
        on (without the actual values), e.g. const, var1, var2, etc.
        Column order is very important!
    actual_values : data frame or array
        The actual outcomes matching ``data`` row by row. For a logistic
        regression this is a single column of 0s and 1s.
    model : fitted model object
        Object exposing ``predict(data)`` that returns predicted probabilities,
        e.g. the ``LogitResults`` stored in ``results_logit``.
    threshold : float, optional
        Cut-off strictly between 0 and 1. Predicted values below it are counted
        as class 0, values at or above it as class 1. Defaults to 0.5, which
        reproduces the previous hard-coded behaviour.

    Returns
    -------
    cm : numpy.ndarray of shape (2, 2)
        ``cm[i, j]`` is the number of observations whose actual class is ``i``
        and whose predicted class is ``j``.
    accuracy : float
        Share of correctly classified observations, or NaN when no observation
        was counted (e.g. empty input).

    Raises
    ------
    ValueError
        If ``threshold`` is not strictly between 0 and 1.
    """
    if not 0 < threshold < 1:
        raise ValueError("threshold must be strictly between 0 and 1")

    # Predict the values using the fitted model
    pred_values = model.predict(data)

    # Bin edges: the actual 0/1 labels are always split at 0.5, the predicted
    # probabilities are split at the requested threshold.
    actual_bins = np.array([0, 0.5, 1])
    predicted_bins = np.array([0, threshold, 1])

    # A 2D histogram over (actual, predicted) with two bins per axis is exactly
    # the confusion matrix: values in the lower bin count as 0, the rest as 1.
    cm = np.histogram2d(actual_values, pred_values, bins=[actual_bins, predicted_bins])[0]

    # Accuracy = correct predictions (main diagonal) / all counted observations.
    # Guard the division so empty input yields NaN without a 0/0 RuntimeWarning.
    total = cm.sum()
    accuracy = (cm[0, 0] + cm[1, 1]) / total if total else np.nan

    # Return the confusion matrix and the accuracy
    return cm, accuracy

In [17]:
confusion_matrix(X,y,results_logit)

(array([[218.,  41.],
        [ 30., 229.]]), 0.862934362934363)

In [None]:
# From the above results, we conclude that:
# 1. our model is 86.3% accurate on the training data
# 2. 447 of the 518 predicted values (218 + 229) are correct


# Test the model

In [20]:
# Load the new data. It comes from the same source file, which was split into TEST and TRAIN parts.
# NOTE(review): the split was described as 10:90, but the confusion matrices in this notebook
# total 222 test rows against 518 training rows (roughly 30:70) — confirm the actual ratio.
# The model has never seen the test data, so we compare its predictions with the actual values
# to estimate the accuracy of our model on unseen observations.
# NOTE(review): absolute, machine-specific path — prefer a path relative to the notebook.
raw_data2 = pd.read_csv(r'C:\Users\Dell\Documents\Bank-data-testing.csv')
data_test = raw_data2.copy()

In [22]:
# Remove the index column.
# Derive the frame from the untouched `raw_data2` (instead of overwriting `data_test`
# from itself) so re-running this cell does not raise a KeyError on the
# already-dropped column.
data_test = raw_data2.drop(columns=['Unnamed: 0'])

In [23]:
# Convert the outcome variable into 1s and 0s again.
# Map from the untouched `raw_data2['y']` column (same row index as `data_test`) so that
# re-running this cell does not turn the already-encoded 0/1 values into NaN.
data_test['y'] = raw_data2['y'].map({'yes':1, 'no':0})
data_test

Unnamed: 0,interest_rate,credit,march,may,previous,duration,y
0,1.313,0.0,1.0,0.0,0.0,487.0,0
1,4.961,0.0,0.0,0.0,0.0,132.0,0
2,4.856,0.0,1.0,0.0,0.0,92.0,0
3,4.120,0.0,0.0,0.0,0.0,1468.0,1
4,4.963,0.0,0.0,0.0,0.0,36.0,0
5,0.697,0.0,1.0,4.0,0.0,131.0,0
6,0.639,1.0,0.0,0.0,0.0,215.0,1
7,4.120,0.0,0.0,0.0,0.0,499.0,0
8,1.281,0.0,1.0,1.0,0.0,809.0,1
9,4.966,0.0,0.0,0.0,0.0,389.0,0


## Declare the dependent and independent variables

In [24]:
# Target and predictors of the test set. Reusing `estimators` selects the same
# columns, in the same order, as the training matrix the model was fitted on.
y_test = data_test['y']
X1_test = data_test [estimators]
# Add the intercept column, as was done for the training data
X_test = sm.add_constant(X1_test)

In [25]:
confusion_matrix(X_test,y_test,results_logit)

(array([[93., 18.],
        [13., 98.]]), 0.8603603603603603)

In [28]:
#Compare these values to the Confusion Matrix and the accuracy of the model with the old data.
# train accuracy
confusion_matrix(X,y, results_logit)

(array([[218.,  41.],
        [ 30., 229.]]), 0.862934362934363)

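Looking at the results, the test accuracy (86.04%) is slightly lower than the training accuracy (86.29%). A small drop like this is expected, because the model was fitted on the training data and is therefore slightly overfitted to it. The gap is only about a quarter of a percentage point, so overall the model keeps a good accuracy on unseen test data.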