# Part 3, Multilayer Perceptron Neural Network

In [2]:
import tensorflow as tf

import pandas as pd
import numpy as np
import sys
import h5py
import pickle
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.grid_search import GridSearchCV
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from scipy.sparse import csr_matrix, hstack
from sklearn.cross_validation import KFold, train_test_split
from keras.models import Sequential
from keras.models import save_model, load_model
from keras.layers.advanced_activations import PReLU
from keras.layers import Dense, Dropout, Activation, BatchNormalization
from keras.callbacks import EarlyStopping
import matplotlib.patches as mpatches


%matplotlib inline

Using TensorFlow backend.


In [3]:
from keras import backend as K
# Configure ONE TensorFlow session for Keras.
# Both GPU settings live in a single ConfigProto so that only one Session is created
# (previously a first session was built with allow_growth and then discarded, never closed,
# in favour of a second one that carried only the memory-fraction setting):
#   - allow_growth=True                    -> GPU memory is allocated on demand
#   - per_process_gpu_memory_fraction=0.5  -> this process may use at most half of the GPU memory
gpu_options = tf.GPUOptions(allow_growth=True, per_process_gpu_memory_fraction=0.5)
config = tf.ConfigProto(gpu_options=gpu_options)
sess = tf.Session(config=config)
K.set_session(sess)

In [5]:
# Read the training data and one-hot encode every column whose name contains 'cat'.
train = pd.read_csv('train.csv')
cat_names = [name for name in train.columns if 'cat' in name]
train = pd.get_dummies(data=train, columns=cat_names)

# Model inputs: every column except 'id' and 'loss'.
features = [name for name in train.columns if name not in ['id','loss']]
train_x = np.array(train[features])
ntrain = len(train_x)

# The target is the raw 'loss' column.
# np.log(train['loss'] + 200) provides a better score, but let's keep it simple now.
train_y = np.array(train['loss'])

# Show the shapes of the inputs and of the target.
for arr in (train_x, train_y):
    print (arr.shape)

(188318, 1153)
(188318,)


In [8]:
# VERSION 4. Insights:
# - why not test 4-layer architectures?
# - we need to introduce new optimizers
# - adding batch normalization (https://arxiv.org/abs/1502.03167)

# Search space: every hyperparameter is a categorical choice over a fixed grid.
space = dict(
    # Dropout rate applied after each hidden layer.
    hidden1_dropout=hp.choice('hidden1_dropout', np.linspace(0.4, 0.6, 20)),
    hidden2_dropout=hp.choice('hidden2_dropout', np.linspace(0.2, 0.5, 10)),
    hidden3_dropout=hp.choice('hidden3_dropout', np.linspace(0.1, 0.5, 10)),
    # Width of each hidden layer: 30 evenly spaced int16 candidates per layer.
    hidden1_units=hp.choice('hidden1_units', np.linspace(300, 550, 30, dtype='int16')),
    hidden2_units=hp.choice('hidden2_units', np.linspace(100, 300, 30, dtype='int16')),
    hidden3_units=hp.choice('hidden3_units', np.linspace(20, 80, 30, dtype='int16')),
    # Optimizer name handed to model.compile().
    optimizer=hp.choice('optimizer', ['adam', 'nadam', 'adamax', 'adadelta']),
)

# Implementing a function to minimize
def hyperopt_search(params):
    """Objective function for hyperopt.

    Prints the candidate, defines a builder for a 3-hidden-layer MLP configured by
    `params`, and scores that builder with `cross_validate_mlp` (defined elsewhere
    in the notebook).

    Args:
        params: dict with the keys 'hidden1_units', 'hidden2_units', 'hidden3_units',
            'hidden1_dropout', 'hidden2_dropout', 'hidden3_dropout' and 'optimizer'.

    Returns:
        dict with 'loss' (whatever `cross_validate_mlp` returned for the builder)
        and 'status' (STATUS_OK).
    """
    print ('Model Testing:', params)

    # (units key, dropout key) for each hidden layer, in stacking order.
    layer_keys = (
        ('hidden1_units', 'hidden1_dropout'),
        ('hidden2_units', 'hidden2_dropout'),
        ('hidden3_units', 'hidden3_dropout'),
    )

    def mlp_model():
        """Build and compile the candidate network described by `params`."""
        net = Sequential()
        for position, (units_key, dropout_key) in enumerate(layer_keys):
            # Only the first layer declares the input width.
            extra = {'input_dim': train_x.shape[1]} if position == 0 else {}
            net.add(Dense(params[units_key], **extra))
            # Each hidden block is Dense -> BatchNormalization -> ReLU -> Dropout.
            net.add(BatchNormalization())
            net.add(Activation('relu'))
            net.add(Dropout(params[dropout_key]))

        # Single linear output unit, trained on mean absolute error.
        net.add(Dense(1))
        net.compile(loss='mae', optimizer=params['optimizer'])
        return net

    return {'loss': cross_validate_mlp(mlp_model), 'status': STATUS_OK}

# Run the optimization and see the results.
trials = Trials()

# hyperopt_search prints every candidate it evaluates, so stdout is sent to a log file
# for the duration of the search. The previous stdout is always restored and the log
# file is always closed, even if fmin raises, so the output of later cells does not
# silently disappear into the log.
_stdout_before_search = sys.stdout
with open('hyperopt_v4.log', 'w') as _search_log:
    sys.stdout = _search_log
    try:
        # This launches hyperopt (up to 50 evaluations of hyperopt_search).
        # Comment the next line out to skip the search.
        best = fmin(hyperopt_search, space, algo=tpe.suggest, max_evals=50, trials=trials)
    finally:
        sys.stdout = _stdout_before_search

TypeError: 'generator' object is not subscriptable

## Step 6: The final model

It took several rounds of optimization to narrow down the parameters of the model. Here are the results.

First, the architecture. The final 4-layer model uses dropout for regularization and applies batch normalization in each hidden layer, right before its activation.

<img src="http://cdn.rawgit.com/dnkirill/allstate_capstone/master/images/mlp3.svg">

And this is the model itself:

In [34]:
def hyper_model():
    """Build and compile the final MLP.

    Three hidden blocks (Dense -> BatchNormalization -> ReLU -> Dropout) are followed
    by a single-unit output layer. Every Dense layer uses the 'glorot_normal'
    initializer; the model is compiled with MAE loss and the 'adadelta' optimizer.
    The input width is read from the module-level `train_x`.

    Returns:
        The compiled Sequential model.
    """
    # (units, dropout rate) for each hidden layer, in stacking order.
    hidden_layers = ((351, 0.578947), (293, 0.26666), (46, 0.188888))

    net = Sequential()
    for position, (units, drop_rate) in enumerate(hidden_layers):
        # Only the first layer declares the input width.
        extra = {'input_dim': train_x.shape[1]} if position == 0 else {}
        net.add(Dense(units, init='glorot_normal', **extra))
        net.add(BatchNormalization())
        net.add(Activation('relu'))
        net.add(Dropout(drop_rate))

    net.add(Dense(1, init='glorot_normal'))
    net.compile(loss='mae', optimizer='adadelta')
    return net

In [35]:
if USE_PRETRAINED:
    # Reuse the stored CV score instead of re-running cross-validation.
    # NOTE: pickle.load can execute arbitrary code; only load pickles you trust.
    with open('pretrained/mlp_f_score.pkl', 'rb') as f:
        cv_score = pickle.load(f)
else:
    # Everything printed during cross-validation goes to a log file. The previous
    # stdout is always restored and the log file is always closed, even if
    # cross-validation raises, so later output is not lost in the log.
    _stdout_before_cv = sys.stdout
    with open('mlp_final_out.txt', 'w') as _cv_log:
        sys.stdout = _cv_log
        try:
            cv_score = cross_validate_mlp(hyper_model)
        finally:
            sys.stdout = _stdout_before_cv

In [36]:
# Point sys.stdout at `_stdout`. `_stdout` is not defined in this part of the notebook;
# presumably it holds the notebook's original stdout, saved before output was sent to
# log files. TODO: confirm it is defined before this cell runs.
sys.stdout = _stdout

In [37]:
# Call form with explicit formatting: prints the same text on Python 2 and Python 3
# (the bare `print "..."` statement is a SyntaxError on Python 3) and matches the
# parenthesised prints used in the rest of the notebook.
print ("CV score for the final model: %s" % cv_score)

CV score for the final model: 1150.0096524


This model was not tuned for a mere 30 epochs of training, nor for 3-fold CV (I used 5-fold on Kaggle). Even so, as a single unbagged model cross-validated on only three folds, it achieves a very good score:
`CV = 1150` (your score may vary a little).

By the way, this single model, bagged, 5-fold CVed, scored 1116.28 on Kaggle LB.

As we can see, this model is considerably better than any other model we have had so far. We now take it as the second part of our final ensemble.