# `Part 4: Classification Models`

In [41]:
import pandas as pd
import numpy as np

In [42]:
bank_dataset = pd.read_csv('bank dataset (cleaned).csv')

In [43]:
bank_dataset.columns

Index([u'Unnamed: 0', u'age', u'occupation', u'marital', u'education',
       u'housing_loan', u'personal_loan', u'contact', u'month', u'day',
       u'duration', u'contact_freq', u'days_passed', u'contact_bef',
       u'prev_outcome', u'emp_var_rate', u'cpi_index', u'cci_index', u'e3m',
       u'employees', u'subscription', u'prev_part'],
      dtype='object')

In [44]:
bank_dataset = bank_dataset.drop(labels=['Unnamed: 0','days_passed'],axis=1)

In [45]:
# Create a continuous variable dataframe.
# We would like to conduct a Pearson's correlation to identify for any potential correlation prior to modelling.
# This is a very basic & raw feature selection step.
continuous = {}
for cols in bank_dataset.columns:
        if bank_dataset[cols].dtypes == int:
            continuous[cols] = bank_dataset[cols]
        elif bank_dataset[cols].dtypes == 'float64':
            continuous[cols] = bank_dataset[cols]
        else:
            pass

In [46]:
continuous = pd.DataFrame(continuous)

In [47]:
#Drop numerical categorical columns, except 'subscription'.
continuous = continuous.drop(labels=['prev_part'],axis=1)

In [48]:
continuous['subscription'] = bank_dataset['subscription']

In [49]:
# correlation coefficients.
pearsons_table = continuous.corr(method='pearson')

In [50]:
pearsons_table[(pearsons_table>0.5) | (pearsons_table<-0.5)]

Unnamed: 0,age,cci_index,contact_bef,contact_freq,cpi_index,duration,e3m,emp_var_rate,employees,subscription
age,1.0,,,,,,,,,
cci_index,,1.0,,,,,,,,
contact_bef,,,1.0,,,,,,,
contact_freq,,,,1.0,,,,,,
cpi_index,,,,,1.0,,0.667198,0.765986,,
duration,,,,,,1.0,,,,
e3m,,,,,0.667198,,1.0,0.969408,0.944864,
emp_var_rate,,,,,0.765986,,0.969408,1.0,0.900361,
employees,,,,,,,0.944864,0.900361,1.0,
subscription,,,,,,,,,,1.0


In [51]:
import seaborn as sns

__Comments:__
- As we can see, there is a strong positive correlation between (e3m & emp_var_rate/employees) AND (emp_var_rate & employees).
- This is largely due to these factors having a strong influence on one another by definition.
- We will not drop them just yet (until we have conducted further feature selection).

Things to learn:
- SVM
- CART
- ensemble
- pipelines

***
***
***

## Step 1: Preliminary modelling (without up or down sampling)

In [52]:
y = bank_dataset['subscription']
X = bank_dataset.drop(labels='subscription',axis=1)

In [53]:
y.value_counts()

0    26616
1     3858
Name: subscription, dtype: int64

### Baseline model accuracy

In [54]:
# This is what you will be comparing your model accuracy against.
1.0 - np.mean(y)

0.8734002756448119

### Dummy encode, Standard Scale & Train-Test Split

In [77]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [71]:
# Dummy encode categories with more than 2 outcomes.
X_dummed = pd.get_dummies(X, columns= ['occupation','marital','education','housing_loan','personal_loan','contact','month','day','prev_outcome'], drop_first=True)

In [72]:
# Standard Scaler.
ss = StandardScaler()
X_scaled = ss.fit_transform(X_dummed)

In [78]:
# Train-test split.
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.35, random_state=8)

### Model 1: Logistic regression

In [85]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

In [87]:
log_reg = LogisticRegression()

In [91]:
accuracy_scores = cross_val_score(log_reg, X_train, y_train, cv=10)
print 'accuracy_scores'
print accuracy_scores
print '--------'
print np.mean(accuracy_scores)

accuracy_scores
[0.89909183 0.89808274 0.89707366 0.90257446 0.89298334 0.90808081
 0.8979798  0.90353535 0.91161616 0.8969697 ]
--------
0.9007987851380148


In [None]:
# Conf matrix, ROC-AUC?

__Comments:__
<br>The mean 'Accuracy Score' of 0.90 for a simple logistic regression model is pretty good given that it is higher than the baseline.

### Model 2: KNN

Definition:
- Simple algorithm based on distances from a stipulated number of 'K' neighbours (e.g. 3, 5, 10 etc.)

In [None]:
# Tune for best K

In [98]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.metrics import confusion_matrix

In [99]:
knn = KNeighborsClassifier(n_neighbors=5)

In [100]:
# Comparing accuracy score to baseline.
accuracy_score = cross_val_score(knn, X_train, y_train, cv=10)
print(accuracy_score)
print np.mean(accuracy_score)

[0.88597376 0.88294652 0.87891019 0.88036345 0.8859162  0.88585859
 0.87424242 0.87474747 0.88232323 0.88737374]
0.8818655585552891


In [103]:
# Deriving accuracy score on test set.
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [104]:
y_pred = knn.predict(X_test)
print ('accuracy = {}'.format(metrics.accuracy_score(y_test, y_pred)))

accuracy = 0.883367710482


__Comments:__
<br>The mean 'Accuracy Score' of 0.88 for knn model is slightly better than baseline accuracy.
<br> Chances are we will not use knn for evaluation.

***
***
***

## Step 2: Up/Down Sampling

In [109]:
from imblearn.combine import SMOTEENN

In [110]:
# SMOTEENN combination of over- & under- sampling.
smote_enn = SMOTEENN(random_state=8)
X_trainresam, y_trainresam = smote_enn.fit_sample(X_train, y_train)

In [131]:
# Counting the y output variables.
from collections import Counter
print(sorted(Counter(y_trainresam).items()))

[(0, 13981), (1, 16638)]


***
***
***

## Step 3: Feature Selection & Gridsearch

### Feature Selection

In [None]:
# Feature selection.
from sklearn.feature_selection import SelectKBest, f_classif, chi2

cols = list(X_train.columns)

skb_f = SelectKBest(f_classif, k=5)
skb_chi2 = SelectKBest(chi2, k=5)

skb_f.fit(X_train, y_train)
skb_chi2.fit(X_train, y_train)

kbest = pd.DataFrame([cols, list(skb_f.scores_), list(skb_chi2.scores_)], 
                     index=['feature','f_classif','chi2 score']).T.sort_values('f_classif', ascending=False)
kbest

# Note: The F-test refers to explained variance divided by unexplained variance.

In [None]:
# Feature selection.
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
selector = RFECV(lr, step=1, cv=10)
selector = selector.fit(X, y)

print selector.support_
print selector.ranking_

In [None]:
rfecv_columns = np.array(cols)[selector.support_]
rfecv_columns

In [None]:
# Feature selection through regularisation.
Lasso Penalty

### Gridsearch

***
***
***

## Step 4: Remodelling