# `Part 4: Building Models`

In [1]:
import pandas as pd
import numpy as np

In [2]:
bank_dataset = pd.read_csv('bank dataset (cleaned).csv')

In [3]:
bank_dataset.columns

Index([u'Unnamed: 0', u'age', u'occupation', u'marital', u'education',
       u'housing_loan', u'personal_loan', u'contact', u'month', u'day',
       u'duration', u'contact_freq', u'days_passed', u'contact_bef',
       u'prev_outcome', u'emp_var_rate', u'cpi_index', u'cci_index', u'e3m',
       u'employees', u'subscription', u'prev_part'],
      dtype='object')

In [4]:
bank_dataset = bank_dataset.drop(labels=['Unnamed: 0','days_passed'],axis=1)

In [5]:
# Create a continuous variable dataframe.
# We would like to conduct a Pearson's correlation to identify for any potential correlation prior to modelling.
# This is a very basic & raw feature selection step.
continuous = {}
for cols in bank_dataset.columns:
        if bank_dataset[cols].dtypes == int:
            continuous[cols] = bank_dataset[cols]
        elif bank_dataset[cols].dtypes == 'float64':
            continuous[cols] = bank_dataset[cols]
        else:
            pass

In [6]:
continuous = pd.DataFrame(continuous)

In [7]:
#Drop numerical categorical columns, except 'subscription'.
continuous = continuous.drop(labels=['prev_part'],axis=1)

In [8]:
continuous['subscription'] = bank_dataset['subscription']

In [9]:
# correlation coefficients.
pearsons_table = continuous.corr(method='pearson')

In [10]:
pearsons_table[(pearsons_table>0.5) | (pearsons_table<-0.5)]

Unnamed: 0,age,cci_index,contact_bef,contact_freq,cpi_index,duration,e3m,emp_var_rate,employees,subscription
age,1.0,,,,,,,,,
cci_index,,1.0,,,,,,,,
contact_bef,,,1.0,,,,,,,
contact_freq,,,,1.0,,,,,,
cpi_index,,,,,1.0,,0.667198,0.765986,,
duration,,,,,,1.0,,,,
e3m,,,,,0.667198,,1.0,0.969408,0.944864,
emp_var_rate,,,,,0.765986,,0.969408,1.0,0.900361,
employees,,,,,,,0.944864,0.900361,1.0,
subscription,,,,,,,,,,1.0


In [11]:
y = bank_dataset['subscription']
X = bank_dataset.drop(labels=['subscription','age','e3m'],axis=1)

In [12]:
# Dummy encode categories with more than 2 outcomes.
X_dummed = pd.get_dummies(X, columns= ['occupation','marital','education','housing_loan','personal_loan','contact','month','day','prev_outcome'], drop_first=True)

In [18]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.combine import SMOTEENN

In [19]:
# Standard Scaler.
ss = StandardScaler()
X_scaled = ss.fit_transform(X_dummed)

In [20]:
# Train-test split.
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.35, random_state=8)

In [21]:
# SMOTEENN combination of over- & under- sampling.
smote_enn = SMOTEENN(random_state=8)
X_trainresam, y_trainresam = smote_enn.fit_sample(X_train, y_train)

In [22]:
# Counting the y output variables.
from collections import Counter
print(sorted(Counter(y_trainresam).items()))

[(0, 13976), (1, 16311)]


### New Baseline Accuracy (following balancing of dataset)

In [23]:
np.mean(y_trainresam)

0.5385478918347806

### Model 6: SVM (with Gridsearch)

In [27]:
from sklearn import svm
from sklearn.model_selection import GridSearchCV

In [None]:
svc_a = svm.SVC()

gamma_range = np.logspace(-5, 2, 5)
C_range = np.logspace(-3, 2, 5)
kernel_range = ['rbf', 'sigmoid', 'linear', 'poly']

param_grid = dict(gamma=gamma_range, C=C_range, kernel=kernel_range)

grid = GridSearchCV(svc_a, param_grid, cv=5, scoring='accuracy', verbose=1)
grid.fit(X_trainresam, y_trainresam)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


In [None]:
print 'best parameters'
print grid.best_params_
print 'best score achieved'
print grid.best_score_

In [None]:
grid.score(X_test, y_test)

In [None]:
y_pred = grid.predict(X_test)

In [None]:
print_cm_cr(y_test, y_pred)