# Keras & XGBoost 

In [1]:
import pandas as pd
import numpy as np
import math

In [2]:
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [3]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [4]:
import matplotlib.pyplot as plt
% matplotlib inline
% config InlineBackend.figure_format = 'retina'

## Prep

In [7]:
from sklearn.datasets import load_boston

boston = load_boston()

In [8]:
type(boston.data)

numpy.ndarray

In [9]:
boston.feature_names

array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',
       'TAX', 'PTRATIO', 'B', 'LSTAT'], dtype='<U7')

In [10]:
binput = pd.DataFrame(boston.data, columns=boston.feature_names)
boutput = pd.DataFrame(boston.target, columns=['MEDV'])

In [11]:
binput.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


## Baseline

Simple model that has a single fully connected hidden layer with the same number of neurons as input attributes (13).

There is no activation function for the output layer because it is a regression problem that predicts numbers directly

In [None]:
def baseline_model():
    # create model
    model = Sequential()
    model.add(Dense(13, input_dim=13, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal'))
    # Compile model
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model

 Keras wrappers require a function as an argument. This function that we must define is responsible for creating the neural network model to be evaluated.

In [None]:
seed = 7
np.random.seed(seed)
# evaluate model with standardized dataset
estimator = KerasRegressor(build_fn=baseline_model, epochs=100, batch_size=5, verbose=2)

In [None]:
kfold = KFold(n_splits=10, random_state=seed)
results = cross_val_score(estimator, boston.data, boston.target, cv=kfold)

### Why the MSE is negative:
greater_is_better : boolean, default=True

Whether score_func is a score function (default), meaning high is good, <br/>
or a loss function, meaning low is good. In the latter case, the scorer <br/>
object will sign-flip the outcome of the score_func. <br/>

In [None]:
# The result reports the mean squared error including the average and standard deviation (average variance) 
    # across all 10 folds of the cross validation evaluation.
print("Baseline: %.2f (%.2f) MSE / %.2f RMSE" % (results.mean(), results.std(), math.sqrt(abs(results.mean()))))

## Standardize Dataset

An important concern with the Boston house price dataset is that the input attributes all vary in their scales because they measure different quantities.

Continuing on from the above baseline model, we can re-evaluate the same model using a standardized version of the input dataset.

We can use scikit-learn’s Pipeline framework to perform the standardization during the model evaluation process, within each fold of the cross validation. This ensures that there is no data leakage from each testset cross validation fold into the training data.

In [None]:
np.random.seed(seed)

estimators = []
estimators.append(('standardize', StandardScaler() ))
estimators.append(( 'mlp', KerasRegressor(build_fn=baseline_model, epochs=50, batch_size=5, verbose=0) ))

pipeline = Pipeline(estimators)
kfold = KFold(n_splits=10, random_state=seed)
results = cross_val_score(pipeline, boston.data, boston.target, cv=kfold)

In [None]:
print("Standardized: %.2f (%.2f) MSE / %.2f RMSE" % (results.mean(), results.std(), math.sqrt(abs(results.mean()))))

Just by standardizing the data, we have improved performance

## Evaluating a Deeper Network

In [None]:
# simply adding another layer from the baseline above
def larger_model():
    # create model
    model = Sequential()
    model.add(Dense(13, input_dim=13, kernel_initializer='normal', activation='relu'))
    model.add(Dense(6, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal'))
    # Compile model
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model

Now apply the same pipeline standardization

In [None]:
np.random.seed(seed)
estimators = []
estimators.append(('standardize', StandardScaler()))
estimators.append(('mlp', KerasRegressor(build_fn=larger_model, epochs=50, batch_size=5, verbose=0)))
pipeline = Pipeline(estimators)
kfold = KFold(n_splits=10, random_state=seed)
results = cross_val_score(pipeline, boston.data, boston.target, cv=kfold)
print("Deeper: %.2f (%.2f) MSE / %.2f RMSE" % (results.mean(), results.std(), math.sqrt(abs(results.mean()))))

## Evaluating a Wider Network

In [None]:
def wider_model():
    # create model
    model = Sequential()
    model.add(Dense(20, input_dim=13, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal'))
    # Compile model
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model

In [None]:
np.random.seed(seed)
estimators = []
estimators.append(('standardize', StandardScaler()))
estimators.append(('mlp', KerasRegressor(build_fn=wider_model, epochs=100, batch_size=5, verbose=0)))
pipeline = Pipeline(estimators)
kfold = KFold(n_splits=10, random_state=seed)
results = cross_val_score(pipeline, boston.data, boston.target, cv=kfold)
print("Wider: %.2f (%.2f) MSE / %.2f RMSE" % (results.mean(), results.std(), math.sqrt(abs(results.mean()))))

## XGBoost

In [12]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error

In [13]:
seed = 7
kf = KFold(n_splits=10, shuffle=True, random_state=seed)

for train_index, test_index in kf.split(boston.data):
    xgb_model = xgb.XGBRegressor().fit(boston.data[train_index], boston.target[train_index])
    predictions = xgb_model.predict(boston.data[test_index])
    actual = boston.target[test_index]
    print( mean_squared_error(actual, predictions) )

5.474144014348979
25.180371828288603
4.37568118282692
9.230398104833037
13.293700199162322
10.049803829731978
9.239822316624457
8.601253687243446
6.0456974135553985
5.680116071490241


In [14]:
rgxb = xgb.XGBRegressor()
estimators = []
estimators.append(( 'standardize', StandardScaler() ))
estimators.append(( 'classifier', rgxb ))
rpipeline = Pipeline(estimators)

In [15]:
kfold = KFold(n_splits=10, random_state=seed)
results = cross_val_score(rpipeline, boston.data, boston.target, cv=kfold)
print("XGBoost: %.2f (%.2f) MSE / %.2f RMSE" % (results.mean(), results.std(), math.sqrt(abs(results.mean()))))

XGBoost: 0.48 (0.38) MSE / 0.69 RMSE


## Parameter Optimization

In [16]:
from sklearn.model_selection import GridSearchCV

In [17]:
xgb_model = xgb.XGBRegressor()
clf = GridSearchCV(xgb_model,
                   {'max_depth': [2,4,6],
                    'n_estimators': [50,100,200]}, verbose=1)
clf.fit(boston.data, boston.target)
print(clf.best_score_)
print(clf.best_params_)

Fitting 3 folds for each of 9 candidates, totalling 27 fits
0.5984879606490934
{'max_depth': 4, 'n_estimators': 100}


[Parallel(n_jobs=1)]: Done  27 out of  27 | elapsed:    1.4s finished
