#### Import Libraries

In [2]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn 
seaborn.set()

#Apply a fix to the statsmodels library if needed
#from scipy import stats
#stats.chisqprob = lambda chisq, df: stats.chi2.sf(chisq, df)

#### Import Data

In [4]:
# Read the training set from disk and preview its first rows.
raw_data = pd.read_csv(filepath_or_buffer="Bank_data.csv")
raw_data.head(n=5)

Unnamed: 0.1,Unnamed: 0,interest_rate,credit,march,may,previous,duration,y
0,0,1.334,0.0,1.0,0.0,0.0,117.0,no
1,1,0.767,0.0,0.0,2.0,1.0,274.0,yes
2,2,4.858,0.0,1.0,0.0,0.0,167.0,no
3,3,4.12,0.0,0.0,0.0,0.0,686.0,yes
4,4,4.856,0.0,1.0,0.0,0.0,157.0,no


In [5]:
# "Unnamed: 0" only mirrors the row index in the preview above, so it carries
# no information for the model.
# errors="ignore" keeps this cell idempotent: re-running it once the column is
# already gone is a no-op instead of raising KeyError.
raw_data = raw_data.drop(columns=["Unnamed: 0"], errors="ignore")

In [6]:
# Encode the target as 0/1. The identity entries (0 -> 0, 1 -> 1) make the cell
# safe to re-run: a plain {"no": 0, "yes": 1} map would turn the already
# encoded values into NaN on a second execution.
raw_data["y"] = raw_data["y"].map({"no": 0, "yes": 1, 0: 0, 1: 1})
# Any label outside the mapping becomes NaN; fail loudly instead of fitting on it.
assert raw_data["y"].notna().all(), "unexpected label in column 'y'"

In [7]:
# Print NumPy floats with two decimals from here on, then summarise the data.
np.set_printoptions(formatter={"float": "{0:0.2f}".format})
raw_data.describe()

Unnamed: 0,interest_rate,credit,march,may,previous,duration,y
count,518.0,518.0,518.0,518.0,518.0,518.0,518.0
mean,2.835776,0.034749,0.266409,0.388031,0.127413,382.177606,0.5
std,1.876903,0.183321,0.442508,0.814527,0.333758,344.29599,0.500483
min,0.635,0.0,0.0,0.0,0.0,9.0,0.0
25%,1.04275,0.0,0.0,0.0,0.0,155.0,0.0
50%,1.466,0.0,0.0,0.0,0.0,266.5,0.5
75%,4.9565,0.0,1.0,0.0,0.0,482.75,1.0
max,4.97,1.0,1.0,5.0,1.0,2653.0,1.0


## Logistic Regression

### Set up variables

In [10]:
# Target vector, raw regressors (every column except the target) and the
# design matrix with an intercept column added.
y = raw_data["y"]
x1 = raw_data.drop(columns=["y"])
x = sm.add_constant(x1)

### Set the regression

In [12]:
# Declare the logit model on all regressors, fit it and show the summary table.
log_reg = sm.Logit(endog=y, exog=x)
log_results = log_reg.fit()
log_results.summary()

Optimization terminated successfully.
         Current function value: 0.335942
         Iterations 7


0,1,2,3
Dep. Variable:,y,No. Observations:,518.0
Model:,Logit,Df Residuals:,511.0
Method:,MLE,Df Model:,6.0
Date:,"Sat, 26 Oct 2024",Pseudo R-squ.:,0.5153
Time:,18:55:41,Log-Likelihood:,-174.02
converged:,True,LL-Null:,-359.05
Covariance Type:,nonrobust,LLR p-value:,7.579e-77

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-0.1385,0.339,-0.408,0.683,-0.804,0.527
interest_rate,-0.7802,0.092,-8.471,0.000,-0.961,-0.600
credit,2.4028,1.090,2.205,0.027,0.267,4.538
march,-1.8097,0.332,-5.459,0.000,-2.459,-1.160
may,0.1946,0.229,0.849,0.396,-0.255,0.644
previous,1.2746,0.583,2.186,0.029,0.132,2.417
duration,0.0070,0.001,9.386,0.000,0.006,0.008


### Remove insignificant regressors

The coefficient on "may" is not statistically significant (p-value of 0.396, well above the usual 0.05 level), so we drop it and refit the model without it.

In [15]:
# Reduced design matrix: every regressor except "may", plus the intercept column.
x2 = sm.add_constant(
    raw_data[["interest_rate", "credit", "march", "previous", "duration"]]
)
x2

Unnamed: 0,const,interest_rate,credit,march,previous,duration
0,1.0,1.334,0.0,1.0,0.0,117.0
1,1.0,0.767,0.0,0.0,1.0,274.0
2,1.0,4.858,0.0,1.0,0.0,167.0
3,1.0,4.120,0.0,0.0,0.0,686.0
4,1.0,4.856,0.0,1.0,0.0,157.0
...,...,...,...,...,...,...
513,1.0,1.334,0.0,1.0,0.0,204.0
514,1.0,0.861,0.0,0.0,1.0,806.0
515,1.0,0.879,0.0,0.0,0.0,290.0
516,1.0,0.877,0.0,0.0,1.0,473.0


In [16]:
# Refit the logit model on the reduced design matrix and show the summary table.
log_reg = sm.Logit(endog=y, exog=x2)
log_results = log_reg.fit()
log_results.summary()

Optimization terminated successfully.
         Current function value: 0.336664
         Iterations 7


0,1,2,3
Dep. Variable:,y,No. Observations:,518.0
Model:,Logit,Df Residuals:,512.0
Method:,MLE,Df Model:,5.0
Date:,"Sat, 26 Oct 2024",Pseudo R-squ.:,0.5143
Time:,18:55:41,Log-Likelihood:,-174.39
converged:,True,LL-Null:,-359.05
Covariance Type:,nonrobust,LLR p-value:,1.211e-77

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-0.0211,0.311,-0.068,0.946,-0.631,0.589
interest_rate,-0.8001,0.089,-8.943,0.000,-0.975,-0.625
credit,2.3585,1.088,2.169,0.030,0.227,4.490
march,-1.8322,0.330,-5.556,0.000,-2.478,-1.186
previous,1.5363,0.501,3.067,0.002,0.554,2.518
duration,0.0070,0.001,9.381,0.000,0.006,0.008


## Testing the model

### Get the test data

In [38]:
# Load the test set and encode its target as 0/1 in a single chained expression.
test_data = pd.read_csv("Bank_data_testing.csv").assign(
    y=lambda frame: frame["y"].map({"no": 0, "yes": 1})
)
test_data

Unnamed: 0.1,Unnamed: 0,interest_rate,credit,march,may,previous,duration,y
0,0,1.313,0.0,1.0,0.0,0.0,487.0,0
1,1,4.961,0.0,0.0,0.0,0.0,132.0,0
2,2,4.856,0.0,1.0,0.0,0.0,92.0,0
3,3,4.120,0.0,0.0,0.0,0.0,1468.0,1
4,4,4.963,0.0,0.0,0.0,0.0,36.0,0
...,...,...,...,...,...,...,...,...
217,217,4.963,0.0,0.0,0.0,0.0,458.0,1
218,218,1.264,0.0,1.0,1.0,0.0,397.0,1
219,219,1.281,0.0,1.0,0.0,0.0,34.0,0
220,220,0.739,0.0,0.0,2.0,0.0,233.0,0


In [40]:
# Select the regressors by name, in the same order as the training design
# matrix, instead of dropping the unwanted columns. Column order matters when
# predicting, so a reordered column in the test CSV would silently distort the
# predictions and an unexpected extra column would break them.
t_data = test_data[["interest_rate", "credit", "march", "previous", "duration"]]

In [44]:
# Prepend the intercept column so the test matrix lines up with the fitted
# model's parameters (the library defaults are spelled out explicitly).
t_data = sm.add_constant(t_data, prepend=True, has_constant="skip")
t_data

Unnamed: 0,const,interest_rate,credit,march,previous,duration
0,1.0,1.313,0.0,1.0,0.0,487.0
1,1.0,4.961,0.0,0.0,0.0,132.0
2,1.0,4.856,0.0,1.0,0.0,92.0
3,1.0,4.120,0.0,0.0,0.0,1468.0
4,1.0,4.963,0.0,0.0,0.0,36.0
...,...,...,...,...,...,...
217,1.0,4.963,0.0,0.0,0.0,458.0
218,1.0,1.264,0.0,1.0,0.0,397.0
219,1.0,1.281,0.0,1.0,0.0,34.0
220,1.0,0.739,0.0,0.0,0.0,233.0


In [46]:
# Predicted probabilities of y = 1 for each test observation, as a NumPy array.
np.asarray(log_results.predict(t_data))

array([0.62, 0.04, 0.01, 1.00, 0.02, 0.18, 0.97, 0.54, 0.94, 0.22, 0.06,
       0.79, 0.68, 0.31, 0.06, 0.21, 0.88, 0.87, 0.80, 0.02, 0.01, 0.61,
       0.53, 0.37, 0.99, 0.44, 1.00, 0.04, 0.01, 0.45, 0.75, 0.98, 0.73,
       0.24, 0.05, 0.04, 0.70, 1.00, 0.86, 0.81, 0.99, 0.55, 0.07, 0.95,
       0.94, 0.97, 0.08, 0.01, 1.00, 0.55, 0.75, 0.21, 0.29, 0.09, 0.99,
       0.01, 1.00, 0.03, 0.11, 0.07, 0.97, 0.92, 0.97, 0.09, 0.07, 0.07,
       0.05, 0.58, 0.55, 0.13, 0.70, 0.01, 0.67, 0.01, 0.03, 0.03, 0.76,
       0.95, 0.51, 0.99, 0.02, 0.97, 0.10, 0.99, 0.01, 0.03, 0.93, 0.06,
       0.78, 0.93, 0.01, 0.44, 0.47, 0.67, 0.83, 0.92, 0.04, 0.14, 0.55,
       0.93, 0.01, 0.79, 0.95, 0.63, 0.53, 0.45, 0.06, 0.13, 0.66, 0.98,
       0.91, 0.29, 0.85, 0.01, 0.19, 0.03, 0.92, 0.04, 0.07, 0.88, 0.89,
       0.47, 0.86, 0.06, 0.83, 0.82, 0.07, 0.94, 0.03, 0.93, 0.10, 0.68,
       0.81, 0.02, 0.97, 0.03, 0.92, 1.00, 0.91, 0.67, 0.54, 0.99, 0.38,
       0.03, 1.00, 0.60, 0.04, 0.61, 0.41, 0.33, 0.

In [48]:
# Observed outcomes of the test set, for comparison with the predictions.
test_data["y"].to_numpy()

array([0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1,
       0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1,
       0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
       1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1,
       0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0,
       0, 1], dtype=int64)

In [50]:
# Confusion matrix on the data the model was fitted on (default 0.5 cut-off).
log_results.pred_table(threshold=0.5)

array([[218.00, 41.00],
       [30.00, 229.00]])

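The table above is the confusion matrix of the model on the data it was trained on (rows are actual values, columns are predicted values). It corresponds to a training accuracy of (218 + 229) / 518 ≈ 86.3%.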

In [53]:
def confusion_matrix(data, actual_values, model, threshold=0.5):
    """Build the 2x2 confusion matrix and the accuracy of a fitted binary model.

    Parameters
    ----------
    data : data frame or array
        Inputs formatted exactly like the data the model was fitted on
        (without the actual values), e.g. const, var1, var2, etc.
        Column order is very important!
    actual_values : data frame or array
        The observed outcomes, a single column of 0s and 1s.
    model : a LogitResults object
        The fitted model. Only its ``predict(data)`` method is used.
    threshold : float, default 0.5
        Probability cut-off. Predictions in [0, threshold) count as 0 and
        predictions in [threshold, 1] count as 1.

    Returns
    -------
    cm : ndarray of shape (2, 2)
        ``cm[i, j]`` is the number of observations whose actual value is ``i``
        and whose predicted class is ``j``.
    accuracy : float
        Share of observations on the diagonal of ``cm``; NaN when there are
        no observations.

    Raises
    ------
    ValueError
        If ``threshold`` is not strictly between 0 and 1.
    """
    if not 0 < threshold < 1:
        raise ValueError("threshold must be strictly between 0 and 1")

    # Predict the probabilities using the fitted model
    pred_values = model.predict(data)

    # Actual values are already 0/1, so any edge between them separates the
    # two classes; the predictions are split at the requested threshold.
    actual_bins = np.array([0, 0.5, 1])
    pred_bins = np.array([0, threshold, 1])

    # A 2-D histogram over (actual, predicted) with two bins per axis is
    # exactly the confusion matrix.
    cm = np.histogram2d(actual_values, pred_values, bins=[actual_bins, pred_bins])[0]

    # Accuracy = correctly classified / all classified (guard the empty case)
    total = cm.sum()
    accuracy = (cm[0, 0] + cm[1, 1]) / total if total else float("nan")
    return cm, accuracy

In [60]:
# Evaluate the fitted model on the test set; cm holds (matrix, accuracy).
cm = confusion_matrix(
    data=t_data,
    actual_values=test_data["y"],
    model=log_results,
)
cm

(array([[93.00, 18.00],
        [13.00, 98.00]]),
 0.8603603603603603)

In [62]:
# Label the rows and columns of the matrix for readability (display only,
# not needed later on).
cm_df = pd.DataFrame(
    cm[0],
    index=['Actual 0', 'Actual 1'],
    columns=['Predicted 0', 'Predicted 1'],
)
cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,93.0,18.0
Actual 1,13.0,98.0


Our model (which was trained on the training data "x2" and evaluated here on the unseen "t_data") has 86% accuracy on the test data, as seen above.