In [64]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

### Build a neural network to predict which customers will sign up for a long term deposit


In [65]:
df = pd.read_csv('bank-full.csv')

In [66]:
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [67]:
df.marital.unique()

array(['married', 'single', 'divorced'], dtype=object)

In [68]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df["job_cat"] = le.fit_transform(df['job'])

In [69]:
df.groupby('job')['job_cat'].unique()

job
admin.            [0]
blue-collar       [1]
entrepreneur      [2]
housemaid         [3]
management        [4]
retired           [5]
self-employed     [6]
services          [7]
student           [8]
technician        [9]
unemployed       [10]
unknown          [11]
Name: job_cat, dtype: object

In [70]:
df["martial_cat"] = le.fit_transform(df['marital'])
df.groupby('marital').martial_cat.unique()

marital
divorced    [0]
married     [1]
single      [2]
Name: martial_cat, dtype: object

In [71]:
df['edu_cat'] = le.fit_transform(df['education'])
df.groupby('education')['edu_cat'].unique()

education
primary      [0]
secondary    [1]
tertiary     [2]
unknown      [3]
Name: edu_cat, dtype: object

In [72]:
df.default = np.where(df.default=='yes', 1,0)

In [73]:
df.housing = np.where(df.housing == 'yes', 1, 0)

In [74]:
df.loan = np.where(df.loan == 'yes', 1, 0)

In [75]:
df['month_cat'] = le.fit_transform(df.month)
df.groupby('month')['month_cat'].unique()

month
apr     [0]
aug     [1]
dec     [2]
feb     [3]
jan     [4]
jul     [5]
jun     [6]
mar     [7]
may     [8]
nov     [9]
oct    [10]
sep    [11]
Name: month_cat, dtype: object

In [76]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 21 columns):
age            45211 non-null int64
job            45211 non-null object
marital        45211 non-null object
education      45211 non-null object
default        45211 non-null int64
balance        45211 non-null int64
housing        45211 non-null int64
loan           45211 non-null int64
contact        45211 non-null object
day            45211 non-null int64
month          45211 non-null object
duration       45211 non-null int64
campaign       45211 non-null int64
pdays          45211 non-null int64
previous       45211 non-null int64
poutcome       45211 non-null object
y              45211 non-null object
job_cat        45211 non-null int64
martial_cat    45211 non-null int64
edu_cat        45211 non-null int64
month_cat      45211 non-null int64
dtypes: int64(14), object(7)
memory usage: 7.2+ MB


In [77]:
df.columns

Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'y', 'job_cat', 'martial_cat', 'edu_cat',
       'month_cat'],
      dtype='object')

In [78]:
col_to_use = ['age', 'job_cat', 'martial_cat', 'edu_cat', 'default', 'balance', 'housing',
       'loan', 'day', 'month_cat', 'duration', 'campaign', 'pdays',
       'previous',  'y']


In [79]:
df1 = df[col_to_use]

In [80]:
df1.shape

(45211, 15)

In [81]:
df1.y.value_counts()

no     39922
yes     5289
Name: y, dtype: int64

In [82]:
df1.y = np.where(df1.y == 'yes', 1, 0)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [83]:
#train test split

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df1, df1.y, test_size=.2)

In [84]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 36168 entries, 2410 to 22305
Data columns (total 15 columns):
age            36168 non-null int64
job_cat        36168 non-null int64
martial_cat    36168 non-null int64
edu_cat        36168 non-null int64
default        36168 non-null int64
balance        36168 non-null int64
housing        36168 non-null int64
loan           36168 non-null int64
day            36168 non-null int64
month_cat      36168 non-null int64
duration       36168 non-null int64
campaign       36168 non-null int64
pdays          36168 non-null int64
previous       36168 non-null int64
y              36168 non-null int64
dtypes: int64(15)
memory usage: 4.4 MB


In [85]:
from imblearn.over_sampling import SMOTE
# random SMOTE

sm = SMOTE(random_state=12, ratio = 1.0)
X_train_res, y_train_res = sm.fit_sample(X_train, y_train)

#make sure it worked, should be 50%
print('train class balance: {}%'.format(len(y_train_res[y_train_res == 1]) / len(y_train_res) * 100))


train class balance: 50.0%


In [45]:
## MLP 

from sklearn.neural_network import MLPClassifier
import time

start_time = time.clock()
mlp = MLPClassifier(hidden_layer_sizes=(1000,)).fit(X_train_res, y_train_res)
y_res_pred = mlp.predict(X_train_res) 
print(mlp.score(X_train_res, y_train_res))
print('{} seconds'.format(time.clock() - start_time))

0.9988886801903331
154.226724 seconds


In [46]:
#Training confusion matrix

from sklearn.metrics import confusion_matrix

confusion_matrix(y_train_res, y_res_pred)

array([[31914,    30],
       [   41, 31903]])

In [52]:
# Testing 
y_test_pred = mlp.predict(X_test)

confusion_matrix(y_test, y_test_pred)

array([[7972,    6],
       [  10, 1055]])

In [53]:
# recall 

from sklearn.metrics import recall_score

recall_score(y_train_res, y_res_pred)  

0.9987165038817931

In [55]:
from sklearn.model_selection import cross_val_score
start_time = time.clock()
print('train set cv=5')
print(cross_val_score(mlp, X_train_res, y_train_res, cv=5).mean())
print('{} seconds'.format(time.clock() - start_time))

train set cv=5
0.9908436179003264
877.226159 seconds


In [56]:
# with changed layers

start_time = time.clock()
mlp = MLPClassifier(hidden_layer_sizes=(500,250,250,)
                   ).fit(X_train_res, y_train_res)
y_pred = mlp.predict(X_train_res)
print(mlp.score(X_train_res, y_train_res))
print('Training recall', recall_score(y_train_res, y_pred))
print('{} seconds'.format(time.clock() - start_time))

0.9979964938642625
Training recall 0.995992987728525
657.9019639999999 seconds


In [57]:
# Testing 
y_test_pred = mlp.predict(X_test)
print(confusion_matrix(y_test, y_test_pred))

[[7978    0]
 [  16 1049]]


In [60]:
accuracy_score(y_test, y_test_pred)

0.9982306756607321

In [58]:
# Training cross-validation

start_time = time.clock()
print('train set cv=5')
print(cross_val_score(mlp, X_train_res, y_train_res, cv=5))
print('{} seconds'.format(time.clock() - start_time))

# took 40 minues to run

train set cv=5
[0.97573955 0.98857411 0.99593051 0.99467835 0.94716656]
2440.4903570000006 seconds


In [59]:
from sklearn.metrics import accuracy_score
accuracy_score

In [86]:
# Hyperparameter tuning 

single_hidden_layers = [1500, 2000, 2500, 3000]
double_hidden_layers = [(50, 25,), (500,100,), (1000, 250,), (1000,500)]

# Single layer testing 

for i in single_hidden_layers:
    start_time = time.clock()
    print('layer size: ', i )
    mlp = MLPClassifier(hidden_layer_sizes=(i))
    mlp.fit(X_train_res, y_train_res)
    y_pred = mlp.predict(X_train_res)
    print('Training accuracy:', mlp.score(X_train_res, y_train_res))
    print('Training recall', recall_score(y_train_res, y_pred))
    
    # Testing 
    y_test_pred = mlp.predict(X_test)
    print('Testing accuracy: ', accuracy_score(y_test, y_test_pred))
    print('Testing Conf Matrix: \n', confusion_matrix(y_test, y_test_pred))
    print('{} seconds'.format(time.clock() - start_time))

layer size:  1500
Training accuracy: 0.9984346628263728
Training recall 0.99827812910901
Testing accuracy:  0.9976777618047108
Testing Conf Matrix: 
 [[7966   14]
 [   7 1056]]
219.0891619999993 seconds
layer size:  2000
Training accuracy: 0.9951161480182832
Training recall 0.9999686932565275
Testing accuracy:  0.9913745438460688
Testing Conf Matrix: 
 [[7902   78]
 [   0 1063]]
266.6524419999996 seconds
layer size:  2500
Training accuracy: 0.9960710036941958
Training recall 0.9922672343622816
Testing accuracy:  0.9976777618047108
Testing Conf Matrix: 
 [[7979    1]
 [  20 1043]]
532.8598759999995 seconds
layer size:  3000
Training accuracy: 0.975596393463152
Training recall 0.9999686932565275
Testing accuracy:  0.9598584540528585
Testing Conf Matrix: 
 [[7617  363]
 [   0 1063]]
675.7403679999998 seconds


### Hyperparameter tuning with single layer: 
- 1000 neurons in one layer was performing the best. Increasing neuron size decreases testing accuracy. 

In [87]:
# Hyperparameter tuning 

singe_hidden_layers = [1500, 2000, 2500, 3000]
double_hidden_layers = [(50, 25,), (500,100,), (500, 400,), (1000,500,), (1000,750,)]

# Double layer testing 

for i in double_hidden_layers:
    start_time = time.clock()
    print('layer size: ', i )
    mlp = MLPClassifier(hidden_layer_sizes=i)
    mlp.fit(X_train_res, y_train_res)
    y_pred = mlp.predict(X_train_res)
    print('Training accuracy:', mlp.score(X_train_res, y_train_res))
    print('Training recall', recall_score(y_train_res, y_pred))
    
    # Testing 
    y_test_pred = mlp.predict(X_test)
    print('Testing accuracy: ', accuracy_score(y_test, y_test_pred))
    print('Testing Conf Matrix: \n', confusion_matrix(y_test, y_test_pred))
    print('{} seconds'.format(time.clock() - start_time))

layer size:  (50, 25)
Training accuracy: 0.9950691879030743
Training recall 0.9901383758061486
Testing accuracy:  0.9962401857790556
Testing Conf Matrix: 
 [[7978    2]
 [  32 1031]]
10.213826000000154 seconds
layer size:  (500, 100)
Training accuracy: 0.9959457767203056
Training recall 0.9926742220274247
Testing accuracy:  0.9981200928895279
Testing Conf Matrix: 
 [[7976    4]
 [  13 1050]]
178.59118000000035 seconds
layer size:  (500, 400)
Training accuracy: 0.9986068499154718
Training recall 0.9980589819047023
Testing accuracy:  0.9980095101183236
Testing Conf Matrix: 
 [[7974    6]
 [  12 1051]]
713.9102920000005 seconds
layer size:  (1000, 500)
Training accuracy: 0.9996086657065932
Training recall 0.9994364786174942
Testing accuracy:  0.9993365033727746
Testing Conf Matrix: 
 [[7975    5]
 [   1 1062]]
2039.0881600000002 seconds
layer size:  (1000, 750)
Training accuracy: 0.9995460522196481
Training recall 0.9999686932565275
Testing accuracy:  0.9990047550591618
Testing Conf Matri

### Hyperparameter tuning with 2 layers: 
- (1000,500) improves testing accuracy to 99.93%

In [98]:
mlp = MLPClassifier(hidden_layer_sizes=(500,250,250,)
                   ).fit(X_train_res, y_train_res)
y_pred = mlp.predict(X_train_res)

In [102]:
mlp.fit(X_test, y_test)
#confusion_matrix(y_test, y_test_pred)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(500, 250, 250), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [103]:
y_test_pred = mlp.predict(X_test)

In [104]:
confusion_matrix(y_test, y_test_pred)

array([[7950,   72],
       [ 127,  894]])

In [None]:
# hyperparameter tuning - 

## no three layers, 2 layers and 
# start with single layers with small # of neurons, check accuracy 
# add a second layer, add more, and check accuracy 
#

In [82]:
# Random Forest

from sklearn import ensemble
from sklearn.ensemble import RandomForestClassifier

start_times = time.clock()
rfc = RandomForestClassifier().fit(X_train_res, y_train_res)

# CV
print('train set cv=5')
print(cross_val_score(rfc, X_train_res, y_train_res, cv=5))
print('{} seconds'.format(time.clock() - start_time))


# Test
start_time = time.clock()
print('test set cv=5')
print(cross_val_score(rfc, X_test_res, y_test_res, cv=5))
print('{} seconds'.format(time.clock() - start_time))



train set cv=5
[1. 1. 1. 1. 1.]
726.4921840000006 seconds
test set cv=5
[1. 1. 1. 1. 1.]
0.3316339999992124 seconds


In [95]:
rfc.fit(X_train_res, y_train_res)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [96]:
rfc_pred = rfc.predict(X_test)

confusion_matrix(y_test, rfc_pred)

array([[8022,    0],
       [   0, 1021]])

In [83]:
# Gradient Boosting 

from sklearn.ensemble import GradientBoostingClassifier

start_time = time.clock()
gbc = ensemble.GradientBoostingClassifier().fit(X_train_res, y_train_res)
print('train set cv=5')
print(cross_val_score(gbc, X_train_res, y_train_res, cv=5))
print('{} seconds'.format(time.clock() - start_time))

start_time = time.clock()
print('test set cv=5')
print(cross_val_score(gbc, X_test_res, y_test_res, cv=5))
print('{} seconds'.format(time.clock() - start_time))

train set cv=5
[1. 1. 1. 1. 1.]
10.018150000000787 seconds
test set cv=5
[1. 1. 1. 1. 1.]
1.4591129999998884 seconds


In [97]:
gbc.fit(X_train_res, y_train_res)

gbc_pred = gbc.predict(X_test)

confusion_matrix(y_test, gbc_pred)

array([[8022,    0],
       [   0, 1021]])