# Bank Marketing
- Objectives : 
1. Predict the client subscribed a term deposit ? (yes/no)
2. Comparing the prediction with the actual outcome
3. Calculate the accuracy
4. Create a confusion matrix

### Import library

In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

#Apply a fix to the statsmodels library
from scipy import stats
stats.chisqprob = lambda chisq, df: stats.chi2.sf(chisq,df)

### Load the data

In [2]:
raw_data = pd.read_csv('bank-additional-full.csv', sep=';')
raw_data.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


### Data description

#### Bank client Data
- Age (numeric)
- Job : type of job (categorical: 'admin.', 'blue-collar', 'entrepreneur', 'housemaid', 'management', 'retired', 'self-employed', 'services', 'student', 'technician', 'unemployed', 'unknown')
- Marital : marital status (categorical: 'divorced', 'married', 'single', 'unknown' ; note: 'divorced' means divorced or widowed)
- Education (categorical: 'basic.4y', 'basic.6y', 'basic.9y', 'high.school', 'illiterate', 'professional.course', 'university.degree', 'unknown')
- Default: has credit in default? (categorical: 'no', 'yes', 'unknown')
- Housing: has housing loan? (categorical: 'no', 'yes', 'unknown')
- Loan: has personal loan? (categorical: 'no', 'yes', 'unknown')

#### Related with the last contact of the current campaign
- Contact: contact communication type (categorical: 'cellular','telephone')
- Month: last contact month of year (categorical: 'jan', 'feb', 'mar', …, 'nov', 'dec')
- Day_of_week: last contact day of the week (categorical: 'mon','tue','wed','thu','fri')
- Duration: last contact duration, in seconds (numeric). Important note: this attribute highly affects the output target (e.g., if duration=0 then y='no'). Yet, the duration is not known before a call is performed. Also, after the end of the call y is obviously known. Thus, this input should only be included for benchmark purposes and should be discarded if the intention is to have a realistic predictive model.

#### Other attributes 
- Campaign: number of contacts performed during this campaign and for this client (numeric, includes last contact)
- Pdays: number of days that passed by after the client was last contacted from a previous campaign (numeric; 999 means client was not previously contacted)
- Previous: number of contacts performed before this campaign and for this client (numeric)
- Poutcome: outcome of the previous marketing campaign (categorical: 'failure','nonexistent','success')

#### Social and economic context attributes
- Emp.var.rate: employment variation rate - quarterly indicator (numeric)
- Cons.price.idx: consumer price index - monthly indicator (numeric)
- Cons.conf.idx: consumer confidence index - monthly indicator (numeric)
- Euribor3m: euribor 3 month rate - daily indicator (numeric)
- Nr.employed: number of employees - quarterly indicator (numeric)

#### Output variable (Desire target)
- y - has the client subscribed a term deposit? (binary: 'yes', 'no')

### Convert categorical data into integer 

In [3]:
data = raw_data.copy()
data['y'] = data['y'].map({'yes':1, 'no':0})

data['job'] = data['job'].map({'admin.':1, 'blue-collar':2, 'entrepreneur':3, 'housemaid':4, 'management':5, 'retired':6,
                              'self-employed':7, 'services':8, 'student':9, 'technician':10, 'unemployed':0, 'unknown':11})

data['marital'] = data['marital'].map({'divorced':1, 'married':2, 'single':3, 'unknown':4, 'widowed':5})

data['education'] = data['education'].map({'basic.4y':1, 'basic.6y':2, 'basic.9y':3, 'high.school':4, 'illiterate':5,
                                          'professional.course':6, 'university.degree':7, 'unknown':0})

data['default'] = data['default'].map({'no':0, 'yes':1, 'unknown':2})

data['housing'] = data['housing'].map({'no':0, 'yes':1, 'unknown':2})

data['loan'] = data['loan'].map({'no':0, 'yes':1, 'unknown':2})

data['contact'] = data['contact'].map({'cellular': 1, 'telephone':2})

data['month'] = data['month'].map({'jan':1, 'feb':2, 'mar':3, 'apr':4, 'may':5, 'jun':6, 'jul':7, 'aug':8, 'sep':9, 'oct':10,
                                  'nov':11, 'dec':12})

data['day_of_week'] = data['day_of_week'].map({'mon':1, 'tue':2, 'wed':3, 'thu':4, 'fri':5})

data['poutcome'] = data['poutcome'].map({'failure':1, 'nonexistent':2, 'success':3})


In [4]:
data

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,4,2,1,0,0,0,2,5,1,...,1,999,0,2,1.1,93.994,-36.4,4.857,5191.0,0
1,57,8,2,4,2,0,0,2,5,1,...,1,999,0,2,1.1,93.994,-36.4,4.857,5191.0,0
2,37,8,2,4,0,1,0,2,5,1,...,1,999,0,2,1.1,93.994,-36.4,4.857,5191.0,0
3,40,1,2,2,0,0,0,2,5,1,...,1,999,0,2,1.1,93.994,-36.4,4.857,5191.0,0
4,56,8,2,4,0,0,1,2,5,1,...,1,999,0,2,1.1,93.994,-36.4,4.857,5191.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41183,73,6,2,6,0,1,0,1,11,5,...,1,999,0,2,-1.1,94.767,-50.8,1.028,4963.6,1
41184,46,2,2,6,0,0,0,1,11,5,...,1,999,0,2,-1.1,94.767,-50.8,1.028,4963.6,0
41185,56,6,2,7,0,1,0,1,11,5,...,2,999,0,2,-1.1,94.767,-50.8,1.028,4963.6,0
41186,44,10,2,6,0,0,0,1,11,5,...,1,999,0,2,-1.1,94.767,-50.8,1.028,4963.6,1


### Checking the missing values

In [5]:
data.isnull().sum()

age               0
job               0
marital           0
education         0
default           0
housing           0
loan              0
contact           0
month             0
day_of_week       0
duration          0
campaign          0
pdays             0
previous          0
poutcome          0
emp.var.rate      0
cons.price.idx    0
cons.conf.idx     0
euribor3m         0
nr.employed       0
y                 0
dtype: int64

### Variables

In [6]:
y = data['y']
# x1 = data['campaign']
x1 = data.drop(['y'],axis=1)

### Regression

In [7]:
x = sm.add_constant(x1)
reg_log = sm.Logit(y,x)
result_log = reg_log.fit()

Optimization terminated successfully.
         Current function value: 0.215038
         Iterations 9


### Summary

In [8]:
result_log.summary()

0,1,2,3
Dep. Variable:,y,No. Observations:,41188.0
Model:,Logit,Df Residuals:,41167.0
Method:,MLE,Df Model:,20.0
Date:,"Fri, 01 Sep 2023",Pseudo R-squ.:,0.3891
Time:,16:09:49,Log-Likelihood:,-8857.0
converged:,True,LL-Null:,-14499.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-82.4668,18.244,-4.520,0.000,-118.224,-46.710
age,0.0071,0.002,3.778,0.000,0.003,0.011
job,0.0105,0.006,1.851,0.064,-0.001,0.022
marital,0.1100,0.036,3.062,0.002,0.040,0.180
education,0.0458,0.009,4.950,0.000,0.028,0.064
default,-0.2102,0.033,-6.422,0.000,-0.274,-0.146
housing,0.0032,0.039,0.083,0.934,-0.073,0.079
loan,-0.0565,0.046,-1.215,0.224,-0.148,0.035
contact,-1.0225,0.069,-14.757,0.000,-1.158,-0.887


In [9]:
np.exp(-0.9502)

0.38666368298411513

### Prediction

In [10]:
np.set_printoptions(formatter = {'float' : lambda x : "{0:0.2f}".format(x)})
result_log.predict()

array([0.03, 0.01, 0.02, ..., 0.35, 0.62, 0.29])

In [11]:
np.round(result_log.predict())

array([0.00, 0.00, 0.00, ..., 0.00, 1.00, 0.00])

### Comparing the predictions with actual outcomes

In [12]:
data['y_predict'] = np.round(result_log.predict())

In [13]:
data

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y,y_predict
0,56,4,2,1,0,0,0,2,5,1,...,999,0,2,1.1,93.994,-36.4,4.857,5191.0,0,0.0
1,57,8,2,4,2,0,0,2,5,1,...,999,0,2,1.1,93.994,-36.4,4.857,5191.0,0,0.0
2,37,8,2,4,0,1,0,2,5,1,...,999,0,2,1.1,93.994,-36.4,4.857,5191.0,0,0.0
3,40,1,2,2,0,0,0,2,5,1,...,999,0,2,1.1,93.994,-36.4,4.857,5191.0,0,0.0
4,56,8,2,4,0,0,1,2,5,1,...,999,0,2,1.1,93.994,-36.4,4.857,5191.0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41183,73,6,2,6,0,1,0,1,11,5,...,999,0,2,-1.1,94.767,-50.8,1.028,4963.6,1,1.0
41184,46,2,2,6,0,0,0,1,11,5,...,999,0,2,-1.1,94.767,-50.8,1.028,4963.6,0,1.0
41185,56,6,2,7,0,1,0,1,11,5,...,999,0,2,-1.1,94.767,-50.8,1.028,4963.6,0,0.0
41186,44,10,2,6,0,0,0,1,11,5,...,999,0,2,-1.1,94.767,-50.8,1.028,4963.6,1,1.0


### Confusion Matrix

In [14]:
cm_df = pd.DataFrame(result_log.pred_table())
cm_df.columns = ['Predicted 0','Predicted 1']
cm_df = cm_df.rename(index={0:'Actual 0', 1:'Actual 1'})
cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,35623.0,925.0
Actual 1,2758.0,1882.0


In [15]:
print('Missclasification rate : ' + str((2758+925)/41188))

Missclasification rate : 0.08941924832475479


### Calculate accuracy score

In [16]:
cm = np.array(cm_df)
accuracy_train = (cm[0,0] + cm[1,1]) / cm.sum()
accuracy_train

0.9105807516752452