# `Part 4: Building Models`

In [1]:
import pandas as pd
import numpy as np

In [2]:
bank_dataset = pd.read_csv('bank dataset (cleaned).csv')

In [3]:
bank_dataset.columns

Index([u'Unnamed: 0', u'age', u'occupation', u'marital', u'education',
       u'housing_loan', u'personal_loan', u'contact', u'month', u'day',
       u'duration', u'contact_freq', u'days_passed', u'contact_bef',
       u'prev_outcome', u'emp_var_rate', u'cpi_index', u'cci_index', u'e3m',
       u'employees', u'subscription', u'prev_part'],
      dtype='object')

In [4]:
bank_dataset = bank_dataset.drop(labels=['Unnamed: 0','days_passed'],axis=1)

In [5]:
# Create a continuous variable dataframe.
# We would like to conduct a Pearson's correlation to identify for any potential correlation prior to modelling.
# This is a very basic & raw feature selection step.
continuous = {}
for cols in bank_dataset.columns:
        if bank_dataset[cols].dtypes == int:
            continuous[cols] = bank_dataset[cols]
        elif bank_dataset[cols].dtypes == 'float64':
            continuous[cols] = bank_dataset[cols]
        else:
            pass

In [6]:
continuous = pd.DataFrame(continuous)

In [7]:
#Drop numerical categorical columns, except 'subscription'.
continuous = continuous.drop(labels=['prev_part'],axis=1)

In [8]:
continuous['subscription'] = bank_dataset['subscription']

In [9]:
# correlation coefficients.
pearsons_table = continuous.corr(method='pearson')

In [10]:
pearsons_table[(pearsons_table>0.5) | (pearsons_table<-0.5)]

Unnamed: 0,age,cci_index,contact_bef,contact_freq,cpi_index,duration,e3m,emp_var_rate,employees,subscription
age,1.0,,,,,,,,,
cci_index,,1.0,,,,,,,,
contact_bef,,,1.0,,,,,,,
contact_freq,,,,1.0,,,,,,
cpi_index,,,,,1.0,,0.667198,0.765986,,
duration,,,,,,1.0,,,,
e3m,,,,,0.667198,,1.0,0.969408,0.944864,
emp_var_rate,,,,,0.765986,,0.969408,1.0,0.900361,
employees,,,,,,,0.944864,0.900361,1.0,
subscription,,,,,,,,,,1.0


In [11]:
import seaborn as sns

__Comments:__
- As we can see, there is a strong positive correlation between (e3m & emp_var_rate/employees) AND (emp_var_rate & employees).
- This is largely due to these factors having a strong influence on one another by definition.
- We will not drop them just yet (until we have conducted further feature selection).

***
***
***

## Step 1: Preliminary modelling (without up or down sampling)

In [12]:
y = bank_dataset['subscription']
X = bank_dataset.drop(labels='subscription',axis=1)

In [13]:
y.value_counts()

0    26616
1     3858
Name: subscription, dtype: int64

### Baseline model accuracy

In [14]:
# This is what you will be comparing your model accuracy against.
1.0 - np.mean(y)

0.8734002756448119

### Dummy encode, Standard Scale & Train-Test Split

In [15]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [16]:
# Dummy encode categories with more than 2 outcomes.
X_dummed = pd.get_dummies(X, columns= ['occupation','marital','education','housing_loan','personal_loan','contact','month','day','prev_outcome'], drop_first=True)

In [17]:
# Standard Scaler.
ss = StandardScaler()
X_scaled = ss.fit_transform(X_dummed)

In [18]:
# Train-test split.
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.35, random_state=8)

### Model 1: Logistic regression

In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn import metrics

In [20]:
log_reg = LogisticRegression()

In [21]:
accuracy_scores = cross_val_score(log_reg, X_train, y_train, cv=10)
print 'accuracy_scores'
print accuracy_scores
print '--------'
print np.mean(accuracy_scores)

accuracy_scores
[0.89909183 0.89808274 0.89707366 0.90257446 0.89298334 0.90808081
 0.8979798  0.90353535 0.91161616 0.8969697 ]
--------
0.9007987851380148


In [77]:
# Deriving accuracy score on test set.
log_reg.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [78]:
y_pred = log_reg.predict(X_test)
print ('accuracy = {}'.format(metrics.accuracy_score(y_test, y_pred)))

accuracy = 0.900150009376


In [79]:
y_pred = log_reg.predict(X_test)

In [80]:
print classification_report(y_test, y_pred)

             precision    recall  f1-score   support

          0       0.92      0.97      0.94      9271
          1       0.69      0.43      0.53      1395

avg / total       0.89      0.90      0.89     10666



__Comments:__
<br>The 'Accuracy Score' of 0.899 for logistic regression model is slightly better than baseline accuracy.

In [24]:
# Conf matrix, ROC-AUC?

### Model 2: KNN

Definition:
- Simple algorithm based on distances from a stipulated number of 'K' neighbours (e.g. 3, 5, 10 etc.)

In [25]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix

In [26]:
knn = KNeighborsClassifier(n_neighbors=5)

In [27]:
# Comparing accuracy score to baseline.
accuracy_score = cross_val_score(knn, X_train, y_train, cv=10)
print(accuracy_score)
print np.mean(accuracy_score)

[0.88597376 0.88294652 0.87891019 0.88036345 0.8859162  0.88585859
 0.87424242 0.87474747 0.88232323 0.88737374]
0.8818655585552891


In [28]:
# Deriving accuracy score on test set.
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [29]:
y_pred = knn.predict(X_test)
print ('accuracy = {}'.format(metrics.accuracy_score(y_test, y_pred)))

accuracy = 0.883367710482


__Comments:__
<br>The 'Accuracy Score' of 0.88 for knn model is slightly better than baseline accuracy.
<br> Chances are we will not use knn for evaluation.

***
***
***

## Step 2: Up/Down Sampling

In [30]:
from imblearn.combine import SMOTEENN

In [31]:
# SMOTEENN combination of over- & under- sampling.
smote_enn = SMOTEENN(random_state=8)
X_trainresam, y_trainresam = smote_enn.fit_sample(X_train, y_train)

In [32]:
# Counting the y output variables.
from collections import Counter
print(sorted(Counter(y_trainresam).items()))

[(0, 13981), (1, 16638)]


***
***
***

## Step 3: Feature Selection & Gridsearch

### Feature Selection (SelectKBest)

Note: The F-test is explained variance divided by unexplained variance. High numbers will result if our explained variance (what we know) is much greater than our unexplained variance (what we don't know).

In [33]:
cols = list(X_dummed.columns)

In [34]:
from sklearn.feature_selection import SelectKBest, f_classif

skb_f = SelectKBest(f_classif, k=5)
skb_f.fit(X_trainresam, y_trainresam)


kbest = pd.DataFrame([cols, list(skb_f.scores_)], 
                     index=['feature','f_classif']).T.sort_values('f_classif', ascending=False)

kbest

Unnamed: 0,feature,f_classif
8,employees,14519.8
7,e3m,13355.6
4,emp_var_rate,12233.7
1,duration,10326.0
9,prev_part,3873.94
45,prev_outcome_success,3587.35
30,contact_telephone,2993.28
44,prev_outcome_nonexistent,2772.19
3,contact_bef,2747.98
5,cpi_index,1879.09


### Feature Selection (RFE)

In [35]:
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression

selector = RFECV(log_reg, step=1, cv=10)
selector = selector.fit(X_trainresam, y_trainresam)

print selector.support_
print selector.ranking_

[False  True  True  True  True  True  True  True  True  True  True False
  True False  True  True False  True False False  True  True  True False
 False  True  True  True False  True  True  True  True  True  True  True
  True  True  True  True  True  True False False False False]
[10  1  1  1  1  1  1  1  1  1  1  7  1 11  1  1  2  1  6  4  1  1  1  9
 14  1  1  1 13  1  1  1  1  1  1  1  1  1  1  1  1  1  3  8 12  5]


In [36]:
rfecv_columns = np.array(cols)[selector.support_]
rfecv_columns

array(['duration', 'contact_freq', 'contact_bef', 'emp_var_rate',
       'cpi_index', 'cci_index', 'e3m', 'employees', 'prev_part',
       'occupation_blue-collar', 'occupation_housemaid',
       'occupation_retired', 'occupation_self-employed',
       'occupation_student', 'marital_married', 'marital_single',
       'education_basic.6y', 'education_illiterate',
       'education_professional.course', 'education_university.degree',
       'personal_loan_yes', 'contact_telephone', 'month_aug', 'month_dec',
       'month_jul', 'month_jun', 'month_mar', 'month_may', 'month_nov',
       'month_oct', 'month_sep', 'day_mon', 'day_thu'], dtype='|S29')

In [37]:
RFE_excluded = []
for columns in cols:
    if columns not in rfecv_columns:
        RFE_excluded.append(columns)

In [38]:
# Excluded features.
RFE_excluded

['age',
 'occupation_entrepreneur',
 'occupation_management',
 'occupation_services',
 'occupation_technician',
 'occupation_unemployed',
 'education_basic.9y',
 'education_high.school',
 'housing_loan_yes',
 'day_tue',
 'day_wed',
 'prev_outcome_nonexistent',
 'prev_outcome_success']

### Feature Selection (Lasso Penalty)

In [39]:
from sklearn.linear_model import LogisticRegressionCV

log_rcv = LogisticRegressionCV(penalty='l1', Cs=100, cv=10, solver='liblinear')
log_rcv.fit(X_trainresam, y_trainresam)

LogisticRegressionCV(Cs=100, class_weight=None, cv=10, dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=100,
           multi_class='ovr', n_jobs=1, penalty='l1', random_state=None,
           refit=True, scoring=None, solver='liblinear', tol=0.0001,
           verbose=0)

In [40]:
coeffs = pd.DataFrame(log_rcv.coef_, columns=X_dummed.columns)
coeffs_t = coeffs.transpose()
coeffs_t.columns = ['lasso_coefs']
coeffs_abs = coeffs_t.abs().sort_values('lasso_coefs', ascending=False)
coeffs_abs

Unnamed: 0,lasso_coefs
duration,2.842114
employees,1.327891
month_may,0.931404
emp_var_rate,0.927937
prev_part,0.801918
month_oct,0.524976
month_mar,0.381885
month_nov,0.335702
education_university.degree,0.262419
contact_bef,0.232046


__Comments:__
<br> Feature exclusion:
- From the lasso regularisation, we can tell that the main categories 'age' & 'e3m' have been zeroed.
- 'age' & 'prev_outcome' have been excluded via RFECV.

<br> The top 5 features for f_classif:
- 'employees', 'emp_var_rate', 'e3m', 'duration', 'prev_part'

<br> The top 5 features for lasso:
- 'duration', 'employees', 'emp_var_rate', 'month_may', 'prev_part'

<br> Based on the information provided (together with the Pearson's correlation table earlier), the columns age & e3m will be removed.

***
***
***

## Step 4: Remodelling

### Re-engineering the predictor variables following dropping off columns (via Feature Selection)

In [41]:
y = bank_dataset['subscription']
X = bank_dataset.drop(labels=['subscription','age','e3m'],axis=1)

In [42]:
# Dummy encode categories with more than 2 outcomes.
X_dummed = pd.get_dummies(X, columns= ['occupation','marital','education','housing_loan','personal_loan','contact','month','day','prev_outcome'], drop_first=True)

In [43]:
# Standard Scaler.
ss = StandardScaler()
X_scaled = ss.fit_transform(X_dummed)

In [44]:
# Train-test split.
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.35, random_state=8)

In [45]:
# SMOTEENN combination of over- & under- sampling.
smote_enn = SMOTEENN(random_state=8)
X_trainresam, y_trainresam = smote_enn.fit_sample(X_train, y_train)

In [46]:
# Counting the y output variables.
from collections import Counter
print(sorted(Counter(y_trainresam).items()))

[(0, 13976), (1, 16311)]


### New Baseline Accuracy (following balancing of dataset)

In [47]:
np.mean(y_trainresam)

0.5385478918347806

### Model 3: Logistic Regression (after Feature Selection)

In [48]:
log_reg = LogisticRegression()
accuracy_scores = cross_val_score(log_reg, X_trainresam, y_trainresam, cv=10)
print 'accuracy_scores'
print accuracy_scores
print '--------'
print np.mean(accuracy_scores)

accuracy_scores
[0.94323432 0.93430175 0.9385936  0.93958402 0.93826345 0.93892374
 0.94352708 0.93956407 0.94418758 0.94451783]
--------
0.9404697445441386


In [49]:
# Deriving accuracy score on test set.
log_reg.fit(X_trainresam, y_trainresam)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [70]:
y_pred = log_reg.predict(X_test)
print ('accuracy = {}'.format(metrics.accuracy_score(y_test, y_pred)))

accuracy = 0.835552222014


In [71]:
print classification_report(y_test, y_pred)

             precision    recall  f1-score   support

          0       0.99      0.82      0.90      9271
          1       0.44      0.92      0.59      1395

avg / total       0.91      0.84      0.86     10666



__Comment__:
<br> Accuracy score of 0.835 is way higher than the new baseline score 0.538.

### Model 4: Logistic Regression (with Gridsearch)

In [51]:
from sklearn.model_selection import GridSearchCV

In [52]:
gs_params = {
    'penalty':['l1','l2'],
    'solver':['liblinear'],
    'C':np.logspace(-5,0,100)
}

lr_gridsearch = GridSearchCV(LogisticRegression(), gs_params, cv=5, verbose=1)

In [53]:
lr_gridsearch.fit(X_trainresam, y_trainresam)

Fitting 5 folds for each of 200 candidates, totalling 1000 fits


[Parallel(n_jobs=1)]: Done 1000 out of 1000 | elapsed:  7.6min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'penalty': ['l1', 'l2'], 'C': array([1.00000e-05, 1.12332e-05, ..., 8.90215e-01, 1.00000e+00]), 'solver': ['liblinear']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [54]:
lr_gridsearch.best_score_

0.941360979958398

In [55]:
lr_gridsearch.best_params_

{'C': 0.06892612104349695, 'penalty': 'l1', 'solver': 'liblinear'}

In [56]:
best_lr = lr_gridsearch.best_estimator_

In [57]:
best_lr.score(X_test, y_test)

0.8348959309956873

In [66]:
y_pred = best_lr.predict(X_test)

In [67]:
print classification_report(y_test, y_pred)

             precision    recall  f1-score   support

          0       0.99      0.82      0.90      9271
          1       0.44      0.92      0.59      1395

avg / total       0.91      0.83      0.86     10666



__Comment__:
<br> Accuracy score of 0.834 is > than new baseline score 0.538.

### Model 5: SVM

__Note:__
1. A linearly seperable classification method with a "hinge" loss function.
2. Decision boundary is defined by the MMH.

In [64]:
from sklearn import svm
from sklearn.metrics import classification_report

In [59]:
svc = svm.SVC()
accuracy_scores = cross_val_score(svc, X_trainresam, y_trainresam, cv=10, scoring='accuracy')

print 'accuracy_scores'
print accuracy_scores
print '--------'
print np.mean(accuracy_scores)

accuracy_scores
[0.9660066  0.96335424 0.96500495 0.96632552 0.96401453 0.96467481
 0.97060766 0.9656539  0.96664465 0.96796565]
--------
0.9660252514117517


In [60]:
svc.fit(X_trainresam, y_trainresam)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [61]:
svc.score(X_test, y_test)

0.8466154134633415

In [62]:
y_pred = svc.predict(X_test)

In [65]:
print classification_report(y_test, y_pred)

             precision    recall  f1-score   support

          0       0.98      0.84      0.91      9271
          1       0.46      0.88      0.60      1395

avg / total       0.91      0.85      0.87     10666



__Comment__:
<br> Accuracy score of 0.846 is > than the new baseline score 0.538.

### Model 6: SVM (with Gridsearch)

In [None]:
# svc_a = svm.SVC()

# gamma_range = np.logspace(-5, 2, 5)
# C_range = np.logspace(-3, 2, 5)
# kernel_range = ['rbf', 'sigmoid', 'linear', 'poly']

# param_grid = dict(gamma=gamma_range, C=C_range, kernel=kernel_range)

# grid = GridSearchCV(svc_a, param_grid, cv=5, scoring='accuracy', verbose=1)
# grid.fit(X_trainresam, y_trainresam)

In [None]:
# print 'best parameters'
# print grid.best_params_
# print 'best score achieved'
# print grid.best_score_

In [None]:
# grid.score(X_test, y_test)

In [None]:
# y_pred = grid.predict(X_test)

In [None]:
# print_cm_cr(y_test, y_pred)

__Comment__:
<br> Accuracy score of ??? is > than the new baseline score 0.538.

<br> __Currently:__
<br> Log reg (0.835)
<br> log reg, with GS (0.834)
<br> SVM (0.846)
<br> SVM, with GS (???)