In [1]:
from keras.datasets import boston_housing
from keras import models, layers
from sklearn.model_selection import KFold
import numpy as np
import matplotlib.pyplot as plt

Using TensorFlow backend.





In [10]:
from keras import backend as K
K.tensorflow_backend._get_available_gpus()

['/job:localhost/replica:0/task:0/device:GPU:0']

In [2]:
(train_data, train_targets), (test_data, test_targets) = boston_housing.load_data()
train_data.shape

(404, 13)

### Normalize the feature data

In [3]:
mean = train_data.mean(axis=0)
std = train_data.std(axis=0)
train_data = (train_data - mean ) / std

# always use mean and std from TRAIN data
test_data = (test_data - mean) / std
train_data.shape

(404, 13)

### Build model generator

In [4]:
def build_model():
    model = models.Sequential()
    model.add(layers.Dense(64, activation='relu', input_shape=(13,)))
    model.add(layers.Dense(64, activation='relu'))
    model.add(layers.Dense(1))
    model.compile(optimizer='rmsprop', loss='mse', metrics=['mae']) # mean absolute error
    return model

Note that the network ends with a single unit and no activation function, because the layer will be linear. This is because we are solving a regression problem, predicting a singular, continous value. An activation function would squash the range, which we don't want

### K-Fold CV
Because the dataset is small, building a validation set out of the training data would be even smaller, causing a high variance in validation scores. The best practice is to se K-fold CV  
Also, the strategy here is to CV the train data to tune the hyperparameters; then, use the entire training dataset with the tuned hyperparamters for the final model

In [None]:
# although the book rolls their own, we'll use sklearn's KFold
K = 3
NUM_EPOCHS = 500

kf = KFold(n_splits=K)
num_val_samples = len(train_data) // K
all_mae = []

for i, (train, test) in enumerate(kf.split(train_data)):
    print('Running Fold', i+1, '/', K)
    model = build_model()
    history = model.fit(train_data[train], train_targets[train],
                        validation_data=(train_data[test], train_targets[test]),
                        epochs=NUM_EPOCHS, batch_size=16, verbose=0)
    mae_hist = history.history['mean_absolute_error'] # results at each epoch
    all_mae.append(mae_hist)
print('\nMean MAE', np.mean(all_mae))

#todo: find mean of each epoch across all folds; plot!

Running Fold 1 / 3
Running Fold 2 / 3
Running Fold 3 / 3


In [22]:
_, mae = model.evaluate(test_data, test_targets) 
mae



2.7769166628519693

### Notes on batch sizing
The link below discusses batch, stochastic and mini-batch gradient descent
https://machinelearningmastery.com/difference-between-a-batch-and-an-epoch/

One thing the book does when rolling their own k-fold CV procedure is use a very small validation set within each fold.
Also, by coding the CV by hand, it's easy to get per epoch MAE, which can be averaged across all folds and graphed

In [15]:
print('\nMean MAE per fold', np.mean(all_mae, axis=1))


Mean MAE per fold [3.06275904 3.04646065 2.8189587 ]
