In [1]:
# Neural Network for Bank Data
# Data from http://archive.ics.uci.edu/ml/datasets/Bank+Marketing

import pandas as pd
import numpy as np
from keras.models import Sequential
from keras.optimizers import SGD
from keras.layers import Dense
from keras.callbacks import TensorBoard
from sklearn.model_selection import train_test_split
from keras.losses import binary_crossentropy
import matplotlib.pyplot as plt

Using TensorFlow backend.


In [2]:
bank = pd.read_csv("data/bank/bank-full.csv", sep=';')

In [3]:
bank

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no
5,35,management,married,tertiary,no,231,yes,no,unknown,5,may,139,1,-1,0,unknown,no
6,28,management,single,tertiary,no,447,yes,yes,unknown,5,may,217,1,-1,0,unknown,no
7,42,entrepreneur,divorced,tertiary,yes,2,yes,no,unknown,5,may,380,1,-1,0,unknown,no
8,58,retired,married,primary,no,121,yes,no,unknown,5,may,50,1,-1,0,unknown,no
9,43,technician,single,secondary,no,593,yes,no,unknown,5,may,55,1,-1,0,unknown,no


Alright we need to encode this because non-numeric data is as good as garbage

In [4]:
def baggize(values):
    sack = {}
    for v in values:
        if v not in sack.keys():
            sack[v] = len(sack)
    kcas = dict((v, k) for (k, v) in sack.items())
    return {'sack': sack, 'kcas': kcas}

In [5]:
columns = bank.columns
columns

Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'y'],
      dtype='object')

In [6]:
sacks = {}
for c in columns:
    if bank[c].dtype == 'O':
        vals = bank[c].values
        result = baggize(vals)
        sacks[c] = result
        sack = result["sack"]
        bank[c] = [sack[v] for v in vals]

In [7]:
bank

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,0,0,0,0,2143,0,0,0,5,0,261,1,-1,0,0,0
1,44,1,1,1,0,29,0,0,0,5,0,151,1,-1,0,0,0
2,33,2,0,1,0,2,0,1,0,5,0,76,1,-1,0,0,0
3,47,3,0,2,0,1506,0,0,0,5,0,92,1,-1,0,0,0
4,33,4,1,2,0,1,1,0,0,5,0,198,1,-1,0,0,0
5,35,0,0,0,0,231,0,0,0,5,0,139,1,-1,0,0,0
6,28,0,1,0,0,447,0,1,0,5,0,217,1,-1,0,0,0
7,42,2,2,0,1,2,0,0,0,5,0,380,1,-1,0,0,0
8,58,5,0,3,0,121,0,0,0,5,0,50,1,-1,0,0,0
9,43,1,1,1,0,593,0,0,0,5,0,55,1,-1,0,0,0


In [8]:
ys = bank["y"].values.reshape(-1,1)

In [9]:
xs = bank[[c for c in columns if c != "y"]].values

In [10]:
xs, ys

(array([[ 58,   0,   0, ...,  -1,   0,   0],
        [ 44,   1,   1, ...,  -1,   0,   0],
        [ 33,   2,   0, ...,  -1,   0,   0],
        ...,
        [ 72,   5,   0, ..., 184,   3,   3],
        [ 57,   3,   0, ...,  -1,   0,   0],
        [ 37,   2,   0, ..., 188,  11,   2]]), array([[0],
        [0],
        [0],
        ...,
        [1],
        [0],
        [0]]))

In [11]:
X_train, X_test, y_train, y_test = train_test_split(xs, ys, test_size=0.2)

In [12]:
train_len = len(X_train)
inp_shape = X_train[0].shape
out_shape = y_train[0].shape
train_len, inp_shape, out_shape

(36168, (16,), (1,))

In [13]:
model = Sequential()
model.add(Dense(units=16, activation='tanh', input_shape=inp_shape))
model.add(Dense(units=8, activation='tanh', input_shape=(16,)))
model.add(Dense(units=4, activation='tanh', input_shape=(8,)))
model.add(Dense(units=2, activation='tanh', input_shape=(4,)))
model.add(Dense(units=1, activation='tanh', input_shape=(2,)))
tb_callback = TensorBoard(log_dir='./logs/bank')

In [14]:
epochs = 50
model.compile(loss=binary_crossentropy, optimizer='adam', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=epochs, batch_size=96,
          callbacks=[tb_callback])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7fa053b8e5c0>

In [15]:
score = model.evaluate(X_test, y_test, batch_size=64)
score



[0.2723594950381257, 0.8789118655774889]

In [16]:
predicted = model.predict(X_test)

In [17]:
predicted = [0 if p < 0.5 else 1 for p in predicted]

In [18]:
errors = [(y1, y2) for (y1, y2) in zip(predicted, y_test) if y1 != y2]

In [19]:
len(errors)

1094