# Deep Learning with Python 
# Example 3.3 - Boston Housing Prices 

## Preparing Workspace

In [14]:

from tensorflow.keras.datasets import boston_housing

In [15]:
# Attempting to read the data directly
(train_data, train_targets), (test_data, test_targets) = boston_housing.load_data()

In [16]:
# Examining the data
train_data.shape

(404, 13)

Only 404 training examples. But 13 features per example.

In [17]:
# What about the test data?
test_data.shape

(102, 13)

Only 102 examples, again with 13 features. This is clearly a very small dataset.

In [18]:
# What are the targets?
train_targets.shape 

(404,)

In [19]:
train_targets

array([15.2, 42.3, 50. , 21.1, 17.7, 18.5, 11.3, 15.6, 15.6, 14.4, 12.1,
       17.9, 23.1, 19.9, 15.7,  8.8, 50. , 22.5, 24.1, 27.5, 10.9, 30.8,
       32.9, 24. , 18.5, 13.3, 22.9, 34.7, 16.6, 17.5, 22.3, 16.1, 14.9,
       23.1, 34.9, 25. , 13.9, 13.1, 20.4, 20. , 15.2, 24.7, 22.2, 16.7,
       12.7, 15.6, 18.4, 21. , 30.1, 15.1, 18.7,  9.6, 31.5, 24.8, 19.1,
       22. , 14.5, 11. , 32. , 29.4, 20.3, 24.4, 14.6, 19.5, 14.1, 14.3,
       15.6, 10.5,  6.3, 19.3, 19.3, 13.4, 36.4, 17.8, 13.5, 16.5,  8.3,
       14.3, 16. , 13.4, 28.6, 43.5, 20.2, 22. , 23. , 20.7, 12.5, 48.5,
       14.6, 13.4, 23.7, 50. , 21.7, 39.8, 38.7, 22.2, 34.9, 22.5, 31.1,
       28.7, 46. , 41.7, 21. , 26.6, 15. , 24.4, 13.3, 21.2, 11.7, 21.7,
       19.4, 50. , 22.8, 19.7, 24.7, 36.2, 14.2, 18.9, 18.3, 20.6, 24.6,
       18.2,  8.7, 44. , 10.4, 13.2, 21.2, 37. , 30.7, 22.9, 20. , 19.3,
       31.7, 32. , 23.1, 18.8, 10.9, 50. , 19.6,  5. , 14.4, 19.8, 13.8,
       19.6, 23.9, 24.5, 25. , 19.9, 17.2, 24.6, 13

Training targets is a 404-dimensional vector, where each dimension or column represents the floating-point price of each house in the training set in thousands of dollars. These are 1970s prices that haven't been adjusted for inflation.

In [20]:
# The test set is the same
test_targets.shape

(102,)

In [21]:
test_targets

array([ 7.2, 18.8, 19. , 27. , 22.2, 24.5, 31.2, 22.9, 20.5, 23.2, 18.6,
       14.5, 17.8, 50. , 20.8, 24.3, 24.2, 19.8, 19.1, 22.7, 12. , 10.2,
       20. , 18.5, 20.9, 23. , 27.5, 30.1,  9.5, 22. , 21.2, 14.1, 33.1,
       23.4, 20.1,  7.4, 15.4, 23.8, 20.1, 24.5, 33. , 28.4, 14.1, 46.7,
       32.5, 29.6, 28.4, 19.8, 20.2, 25. , 35.4, 20.3,  9.7, 14.5, 34.9,
       26.6,  7.2, 50. , 32.4, 21.6, 29.8, 13.1, 27.5, 21.2, 23.1, 21.9,
       13. , 23.2,  8.1,  5.6, 21.7, 29.6, 19.6,  7. , 26.4, 18.9, 20.9,
       28.1, 35.4, 10.2, 24.3, 43.1, 17.6, 15.4, 16.2, 27.1, 21.4, 21.5,
       22.4, 25. , 16.6, 18.6, 22. , 42.8, 35.1, 21.5, 36. , 21.9, 24.1,
       50. , 26.7, 25. ])

## Data Preprocessing
Because data is numeric and occupies different ranges, it is a good practice to normalize the data. This ensures that all features occupy roughly the same range of values, and prevents features with exceedingly large values (such as area in square feet, which may be in hundreds or thousands) from influencing the model too much relative to others (such as number of rooms, which has values in range 1 - 10).

This is because we want the neural network to treat all input features as equally important - at least initially. 

For each feature, we subtract the mean of the feature and divide the resulting value by the standard deviation. This makes the data have the same mean value (0) and occupy roughly the same range.

In [27]:
mean = train_data.mean(axis=0) # column-wise mean of all rows in the dataset
std = train_data.std(axis=0)   # column-wise standard deviation of all rows

# Normalize by subtracting mean and dividing by the standard deviation
# Do this for the training data
train_data -= mean
train_data /= std

# Normalize test set, but don't use the test set's mean and standard deviation
# We should never use the test set in any form - directly or indirectly
# Because we're assuming that the test set will not be available to the model in any way.
# This means we can't use values derived from the test set to normalize it.
test_data -= mean
test_data /= std

## Training a Model

Defining a function instantiate a model because we will be building multiple models throughout the course of this notebook.

The data set is very small, so we're making a smaller model i.e. a model with only one hidden layer. Using a smaller model is one way to prevent overfitting.

For the final layer, we are not specifying any activation function. We want to predict a continuous value that should not necessarily be constrained between 0 and 1 (if we used `sigmoid`). The last layer is a linear layer - it will simply be the weighted sum of the outputs of the previous layer.  

## Compiling the Model
- Optimizer is still `rmsprop`.
- For regression problems, `mse` is a good loss quantity. `mse` stands for **Mean Squared Error** - it is the mean of the square of the residuals i.e. the errors between the actual values and the values predicted by our model. 
- Instead of measuring accuracy, we're measuring `mae` - **Mean Absolute Error**. It is the absolute value of the error between the actual value and predicted value. The reason we're using this metric is because it will give an error in the same units as the target variable - i.e. by using `mae`, we will be able to see, on average, by how many thousands of dollars our prediction differed from the actual value.

In [46]:
from tensorflow.keras import models, layers

def build_model():
    model = models.Sequential()
    
    # Getting the number of features per training example programmatically and using
    # it to define the shape of the input vector that the network will receive
    model.add(layers.Dense(64, activation='relu', input_shape=(train_data.shape[1], )))
    model.add(layers.Dense(64, activation='relu'))
    
    # Because we will be predicting a single value, we will use a single hidden unit
    
    model.add(layers.Dense(1))
    
    # Compile the model 
    model.compile(optimizer='rmsprop', loss='mse', metrics=['mae'])
    
    # Return the compiled model
    return model

## K-Fold Cross Validation - Naive Implementation
Because the data set is very small, we can try to minimise the model's variance (and thus improve its generalizing power) by using **k-fold cross validation**. We divide the training data into `k` folds (usually 5 or more), and train `k` different models. Each model sees all but 1 of the folds as a training data, and is validated on the remaining fold. 

This allows models to be trained on different sections or folds of the data and simultaneously validated on different sections of the entire data as well. The validation accuracy/error predicted as the average of all these models presents a more accurate estimate of the actual error than that of any one model.

In [54]:
import numpy as np

k = 4    # Number of folds 
num_val_samples = len(train_data) // k  # INTEGER DIVISION - validation set will have 1/k of the total samples
num_epochs = 500                        # holy shit
all_scores = []                         # empty list will store errors for all models tested
all_mae_histories = []

In [55]:
for i in range(k):
    print('Processing fold #', i)
    val_data = train_data[i * num_val_samples: (i + 1) * num_val_samples]
    val_targets = train_targets[i * num_val_samples : (i + 1) * num_val_samples ]
    
    partial_train_data = np.concatenate(
        [train_data[: i * num_val_samples], 
        train_data[(i + 1) * num_val_samples:]],
        axis=0)
    
    partial_train_targets = np.concatenate(
        [train_targets[:i * num_val_samples], 
        train_targets[(i + 1) * num_val_samples:]],
        axis=0
    )
    
    model = build_model()
    history = model.fit(partial_train_data, partial_train_targets, 
                       epochs=num_epochs, batch_size=1, verbose=0)
    val_mse, val_mae = model.evaluate(val_data, val_targets, verbose=0)
    all_scores.append(val_mae)
    mae_history = history.history['val_mean_absolute_error']
    all_mae_histories.append(mae_history)

Processing fold # 0


KeyError: 'val_mean_absolute_error'

In [53]:
avg_mae_history = [
    np.mean([x[i] for x in all_mae_histories]) for i in range(num_epochs)]

NameError: name 'all_mae_histories' is not defined

## Visualizing Results

In [None]:
import matplotlib.pyplot as plt
plt.plot(range(1, len(avg_mae_history) + 1), average_mae_history)
plt.xlabel('Epochs')
plt.ylabel('Validation MAE')
plt.show()

## Improving Results - Smoothing Data

In [None]:
def smooth_curve(points, factor=0.9):
    smoothed_points=[]
    for point in points:
        if smoothed_points:
            previous = smoothed_points[-1]
            smoothed_points.append(previous * factor + point * (1 - factor))
        else:
            smoothed_points.append(point)
    return smoothed_points

In [None]:
plt.plot(range(1, len(smooth_mae_history) + 1), smooth_mae_history)
plt.xlabel('Epochs')
plt.ylabel('Validation MAE')
plt.grid(True)
plt.show()

## Training the Final Model

In [56]:
model = build_model()
model.fit(train_data, train_targets, epochs=80, batch_size=16, 
         verbose=0)
test_mse_score, test_mae_score = model.evaluate(test_data, test_targets)

