# Keras & XGBoost 

In [1]:
import pandas as pd
import numpy as np
import math

In [2]:
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [3]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [4]:
import matplotlib.pyplot as plt
% matplotlib inline
% config InlineBackend.figure_format = 'retina'

## Prep

In [5]:
from sklearn.datasets import load_boston

boston = load_boston()

In [6]:
type(boston.data)

numpy.ndarray

In [7]:
boston.feature_names

array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',
       'TAX', 'PTRATIO', 'B', 'LSTAT'], dtype='<U7')

In [8]:
binput = pd.DataFrame(boston.data, columns=boston.feature_names)
boutput = pd.DataFrame(boston.target, columns=['MEDV'])

In [9]:
binput.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


## Baseline

Simple model that has a single fully connected hidden layer with the same number of neurons as input attributes (13).

There is no activation function for the output layer because it is a regression problem that predicts numbers directly

In [10]:
def baseline_model():
    # create model
    model = Sequential()
    model.add(Dense(13, input_dim=13, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal'))
    # Compile model
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model

 Keras wrappers require a function as an argument. This function that we must define is responsible for creating the neural network model to be evaluated.

In [11]:
seed = 7
np.random.seed(seed)
# evaluate model with standardized dataset
estimator = KerasRegressor(build_fn=baseline_model, epochs=100, batch_size=5, verbose=2)

In [12]:
kfold = KFold(n_splits=10, random_state=seed)
results = cross_val_score(estimator, boston.data, boston.target, cv=kfold)

Epoch 1/100
 - 0s - loss: 286.1793
Epoch 2/100
 - 0s - loss: 120.6839
Epoch 3/100
 - 0s - loss: 91.7128
Epoch 4/100
 - 0s - loss: 75.9937
Epoch 5/100
 - 0s - loss: 72.4468
Epoch 6/100
 - 0s - loss: 70.4586
Epoch 7/100
 - 0s - loss: 67.8199
Epoch 8/100
 - 0s - loss: 67.6025
Epoch 9/100
 - 0s - loss: 64.2023
Epoch 10/100
 - 0s - loss: 61.2838
Epoch 11/100
 - 0s - loss: 59.5452
Epoch 12/100
 - 0s - loss: 57.5418
Epoch 13/100
 - 0s - loss: 56.6248
Epoch 14/100
 - 0s - loss: 54.0365
Epoch 15/100
 - 0s - loss: 52.5800
Epoch 16/100
 - 0s - loss: 51.6805
Epoch 17/100
 - 0s - loss: 51.6152
Epoch 18/100
 - 0s - loss: 49.2717
Epoch 19/100
 - 0s - loss: 48.5599
Epoch 20/100
 - 0s - loss: 47.5780
Epoch 21/100
 - 0s - loss: 45.8057
Epoch 22/100
 - 0s - loss: 44.8748
Epoch 23/100
 - 0s - loss: 43.4026
Epoch 24/100
 - 0s - loss: 42.2140
Epoch 25/100
 - 0s - loss: 41.3707
Epoch 26/100
 - 0s - loss: 40.4519
Epoch 27/100
 - 0s - loss: 39.7363
Epoch 28/100
 - 0s - loss: 38.8586
Epoch 29/100
 - 0s - loss: 

Epoch 36/100
 - 0s - loss: 30.5848
Epoch 37/100
 - 0s - loss: 30.6933
Epoch 38/100
 - 0s - loss: 31.8378
Epoch 39/100
 - 0s - loss: 30.1219
Epoch 40/100
 - 0s - loss: 29.8860
Epoch 41/100
 - 0s - loss: 28.2928
Epoch 42/100
 - 0s - loss: 28.1544
Epoch 43/100
 - 0s - loss: 27.4048
Epoch 44/100
 - 0s - loss: 28.5678
Epoch 45/100
 - 0s - loss: 28.1976
Epoch 46/100
 - 0s - loss: 26.7487
Epoch 47/100
 - 0s - loss: 27.8969
Epoch 48/100
 - 0s - loss: 26.6823
Epoch 49/100
 - 0s - loss: 26.5389
Epoch 50/100
 - 0s - loss: 25.2130
Epoch 51/100
 - 0s - loss: 24.9504
Epoch 52/100
 - 0s - loss: 24.9363
Epoch 53/100
 - 0s - loss: 25.4718
Epoch 54/100
 - 0s - loss: 24.9667
Epoch 55/100
 - 0s - loss: 25.8868
Epoch 56/100
 - 0s - loss: 25.4898
Epoch 57/100
 - 0s - loss: 24.6241
Epoch 58/100
 - 0s - loss: 24.2723
Epoch 59/100
 - 0s - loss: 23.7489
Epoch 60/100
 - 0s - loss: 23.8974
Epoch 61/100
 - 0s - loss: 23.9440
Epoch 62/100
 - 0s - loss: 23.4870
Epoch 63/100
 - 0s - loss: 25.3275
Epoch 64/100
 - 0s -

Epoch 71/100
 - 0s - loss: 22.4675
Epoch 72/100
 - 0s - loss: 23.4344
Epoch 73/100
 - 0s - loss: 22.3482
Epoch 74/100
 - 0s - loss: 22.3661
Epoch 75/100
 - 0s - loss: 22.5714
Epoch 76/100
 - 0s - loss: 22.5889
Epoch 77/100
 - 0s - loss: 21.8273
Epoch 78/100
 - 0s - loss: 21.7139
Epoch 79/100
 - 0s - loss: 21.9219
Epoch 80/100
 - 0s - loss: 22.0298
Epoch 81/100
 - 0s - loss: 22.8017
Epoch 82/100
 - 0s - loss: 21.8991
Epoch 83/100
 - 0s - loss: 21.1762
Epoch 84/100
 - 0s - loss: 22.3484
Epoch 85/100
 - 0s - loss: 21.7749
Epoch 86/100
 - 0s - loss: 20.9510
Epoch 87/100
 - 0s - loss: 23.5903
Epoch 88/100
 - 0s - loss: 21.2699
Epoch 89/100
 - 0s - loss: 21.6409
Epoch 90/100
 - 0s - loss: 22.9086
Epoch 91/100
 - 0s - loss: 21.5667
Epoch 92/100
 - 0s - loss: 21.1920
Epoch 93/100
 - 0s - loss: 21.2483
Epoch 94/100
 - 0s - loss: 20.8247
Epoch 95/100
 - 0s - loss: 20.9715
Epoch 96/100
 - 0s - loss: 20.6429
Epoch 97/100
 - 0s - loss: 20.0224
Epoch 98/100
 - 0s - loss: 20.7620
Epoch 99/100
 - 0s -

Epoch 6/100
 - 0s - loss: 47.9706
Epoch 7/100
 - 0s - loss: 47.2112
Epoch 8/100
 - 0s - loss: 46.3042
Epoch 9/100
 - 0s - loss: 44.8171
Epoch 10/100
 - 0s - loss: 43.1925
Epoch 11/100
 - 0s - loss: 42.7904
Epoch 12/100
 - 0s - loss: 41.9616
Epoch 13/100
 - 0s - loss: 40.1996
Epoch 14/100
 - 0s - loss: 39.9729
Epoch 15/100
 - 0s - loss: 38.6152
Epoch 16/100
 - 0s - loss: 38.6590
Epoch 17/100
 - 0s - loss: 36.7076
Epoch 18/100
 - 0s - loss: 35.5611
Epoch 19/100
 - 0s - loss: 34.0829
Epoch 20/100
 - 0s - loss: 33.7187
Epoch 21/100
 - 0s - loss: 32.6061
Epoch 22/100
 - 0s - loss: 33.7004
Epoch 23/100
 - 0s - loss: 31.7713
Epoch 24/100
 - 0s - loss: 31.0406
Epoch 25/100
 - 0s - loss: 29.9790
Epoch 26/100
 - 0s - loss: 29.1522
Epoch 27/100
 - 0s - loss: 28.9811
Epoch 28/100
 - 0s - loss: 28.3006
Epoch 29/100
 - 0s - loss: 29.3025
Epoch 30/100
 - 0s - loss: 27.0403
Epoch 31/100
 - 0s - loss: 27.3725
Epoch 32/100
 - 0s - loss: 27.2197
Epoch 33/100
 - 0s - loss: 26.4021
Epoch 34/100
 - 0s - los

Epoch 41/100
 - 0s - loss: 29.0260
Epoch 42/100
 - 0s - loss: 28.4670
Epoch 43/100
 - 0s - loss: 28.3753
Epoch 44/100
 - 0s - loss: 27.3459
Epoch 45/100
 - 0s - loss: 27.3771
Epoch 46/100
 - 0s - loss: 26.4127
Epoch 47/100
 - 0s - loss: 26.5873
Epoch 48/100
 - 0s - loss: 25.8330
Epoch 49/100
 - 0s - loss: 25.9345
Epoch 50/100
 - 0s - loss: 25.8937
Epoch 51/100
 - 0s - loss: 24.3007
Epoch 52/100
 - 0s - loss: 25.1006
Epoch 53/100
 - 0s - loss: 23.7821
Epoch 54/100
 - 0s - loss: 25.0616
Epoch 55/100
 - 0s - loss: 24.4397
Epoch 56/100
 - 0s - loss: 24.0987
Epoch 57/100
 - 0s - loss: 23.5930
Epoch 58/100
 - 0s - loss: 22.7152
Epoch 59/100
 - 0s - loss: 22.5361
Epoch 60/100
 - 0s - loss: 22.1291
Epoch 61/100
 - 0s - loss: 23.9818
Epoch 62/100
 - 0s - loss: 22.3175
Epoch 63/100
 - 0s - loss: 22.1315
Epoch 64/100
 - 0s - loss: 21.2791
Epoch 65/100
 - 0s - loss: 22.0629
Epoch 66/100
 - 0s - loss: 22.9266
Epoch 67/100
 - 0s - loss: 22.5441
Epoch 68/100
 - 0s - loss: 21.5285
Epoch 69/100
 - 0s -

### Why the MSE is negative:
greater_is_better : boolean, default=True

Whether score_func is a score function (default), meaning high is good, <br/>
or a loss function, meaning low is good. In the latter case, the scorer <br/>
object will sign-flip the outcome of the score_func. <br/>

In [13]:
# The result reports the mean squared error including the average and standard deviation (average variance) 
    # across all 10 folds of the cross validation evaluation.
print("Baseline: %.2f (%.2f) MSE / %.2f RMSE" % (results.mean(), results.std(), math.sqrt(abs(results.mean()))))

Baseline: -32.92 (23.16) MSE / 5.74 RMSE


In [None]:
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot

SVG(model_to_dot(model).create(prog='dot', format='svg'))

## Standardize Dataset

An important concern with the Boston house price dataset is that the input attributes all vary in their scales because they measure different quantities.

Continuing on from the above baseline model, we can re-evaluate the same model using a standardized version of the input dataset.

We can use scikit-learn’s Pipeline framework to perform the standardization during the model evaluation process, within each fold of the cross validation. This ensures that there is no data leakage from each testset cross validation fold into the training data.

In [None]:
np.random.seed(seed)

estimators = []
estimators.append(('standardize', StandardScaler() ))
estimators.append(( 'mlp', KerasRegressor(build_fn=baseline_model, epochs=50, batch_size=5, verbose=0) ))

pipeline = Pipeline(estimators)
kfold = KFold(n_splits=10, random_state=seed)
results = cross_val_score(pipeline, boston.data, boston.target, cv=kfold)

In [None]:
print("Standardized: %.2f (%.2f) MSE / %.2f RMSE" % (results.mean(), results.std(), math.sqrt(abs(results.mean()))))

Just by standardizing the data, we have improved performance

## Evaluating a Deeper Network

In [None]:
# simply adding another layer from the baseline above
def larger_model():
    # create model
    model = Sequential()
    model.add(Dense(13, input_dim=13, kernel_initializer='normal', activation='relu'))
    model.add(Dense(6, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal'))
    # Compile model
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model

Now apply the same pipeline standardization

In [None]:
np.random.seed(seed)
estimators = []
estimators.append(('standardize', StandardScaler()))
estimators.append(('mlp', KerasRegressor(build_fn=larger_model, epochs=50, batch_size=5, verbose=0)))
pipeline = Pipeline(estimators)
kfold = KFold(n_splits=10, random_state=seed)
results = cross_val_score(pipeline, boston.data, boston.target, cv=kfold)
print("Deeper: %.2f (%.2f) MSE / %.2f RMSE" % (results.mean(), results.std(), math.sqrt(abs(results.mean()))))

## Evaluating a Wider Network

In [None]:
def wider_model():
    # create model
    model = Sequential()
    model.add(Dense(20, input_dim=13, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal'))
    # Compile model
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model

In [None]:
np.random.seed(seed)
estimators = []
estimators.append(('standardize', StandardScaler()))
estimators.append(('mlp', KerasRegressor(build_fn=wider_model, epochs=100, batch_size=5, verbose=0)))
pipeline = Pipeline(estimators)
kfold = KFold(n_splits=10, random_state=seed)
results = cross_val_score(pipeline, boston.data, boston.target, cv=kfold)
print("Wider: %.2f (%.2f) MSE / %.2f RMSE" % (results.mean(), results.std(), math.sqrt(abs(results.mean()))))

## XGBoost

In [17]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error

In [18]:
seed = 7
kf = KFold(n_splits=10, shuffle=True, random_state=seed)

for train_index, test_index in kf.split(boston.data):
    xgb_model = xgb.XGBRegressor().fit(boston.data[train_index], boston.target[train_index])
    predictions = xgb_model.predict(boston.data[test_index])
    actual = boston.target[test_index]
    print( mean_squared_error(actual, predictions) )

5.474144014348979
25.180371828288603
4.37568118282692
9.230398104833037
13.293700199162322
10.049803829731978
9.239822316624457
8.601253687243446
6.0456974135553985
5.680116071490241


In [19]:
rgxb = xgb.XGBRegressor()
estimators = []
estimators.append(( 'standardize', StandardScaler() ))
estimators.append(( 'regressor', rgxb ))
rpipeline = Pipeline(estimators)

In [20]:
kfold = KFold(n_splits=10, random_state=seed)
results = cross_val_score(rpipeline, boston.data, boston.target, cv=kfold)
print("XGBoost: %.2f (%.2f) MSE / %.2f RMSE" % (results.mean(), results.std(), math.sqrt(abs(results.mean()))))

XGBoost: 0.48 (0.38) MSE / 0.69 RMSE


## Parameter Optimization

In [21]:
from sklearn.model_selection import GridSearchCV

In [22]:
xgb_model = xgb.XGBRegressor()
clf = GridSearchCV(xgb_model,
                   {'max_depth': [2,4,6],
                    'n_estimators': [50,100,200]}, verbose=1)
clf.fit(boston.data, boston.target)
print(clf.best_score_)
print(clf.best_params_)

Fitting 3 folds for each of 9 candidates, totalling 27 fits
0.5984879606490934
{'max_depth': 4, 'n_estimators': 100}


[Parallel(n_jobs=1)]: Done  27 out of  27 | elapsed:    1.4s finished
