# Building Models with Keras

Involves four stages:

1. Model Specification

2. Compile the model

3. Fit the Model

4. Make the Predictions

### 1. Model Specification

In [None]:
###### WILL NOT RUN LOCALLY AS GPU REQ'D ##########
import keras
from keras.layers import Dense
from keras.models import Sequential

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the wages dataset. The regression target is the 'wage_per_hour' column
# (the column the following cells drop from the predictors and select as the target).
wages = pd.read_csv('./data/wages.txt')
# Report (rows, columns), then preview the first rows as the cell's output.
print(wages.shape)
wages.head()

In [None]:
# Convert the predictors (every column except the target) into a NumPy array.
# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in pandas 1.0;
# the .values attribute yields the same ndarray and exists in every pandas version.
predictors = wages.drop('wage_per_hour', axis=1).values
print(type(predictors))
print(predictors.shape)
predictors

In [None]:
# Target vector as a NumPy array. Series.as_matrix() was removed in pandas 1.0,
# so read the underlying ndarray through .values instead.
target = wages['wage_per_hour'].values
print(target.shape)

In [None]:
# Basic neural net for regression: two hidden layers feeding one output node.

# Number of predictor columns, i.e. how many items each input row carries.
n_cols = predictors.shape[1]

# Declare the whole stack at once by handing the layers to Sequential,
# rather than growing an empty model with repeated .add() calls.
model = Sequential([
    # Hidden layer 1: 50 relu units. input_shape=(n_cols,) means each row of
    # data has n_cols items, and any number of rows is acceptable as input.
    Dense(50, activation='relu', input_shape=(n_cols,)),
    # Hidden layer 2: 32 relu units.
    Dense(32, activation='relu'),
    # Output layer: a single node with NO activation function.
    Dense(1),
])


### 2. Compile the Model

To compile the model, we need to specify the **optimizer** and **loss function** to use. The **Adam optimizer** is generally an excellent choice of optimizer, and we'll use **mean-squared error** as our loss function.

In [None]:
# Compile the model with the Adam optimizer and mean-squared-error loss.
model.compile(optimizer='adam', loss='mean_squared_error')

# Verify that the model now carries the information supplied at compile time.
# NOTE(review): the concatenation assumes model.loss is the string passed
# above -- confirm for the Keras version in use.
print("Loss function: " + model.loss)

### 3. Fit the model

In [None]:
# Train the compiled regression model on the predictor/target arrays,
# leaving every other fit() option at its default.
model.fit(predictors, target)

### Modeling Classification Data

In [None]:
import keras
from keras.layers import Dense
from keras.models import Sequential
from keras.utils import to_categorical

In [1]:
import pandas as pd
import numpy as np

# Load the Titanic passenger data, report its (rows, columns) shape and
# preview the first rows as the cell's output.
titanic = pd.read_csv('./data/titanic.csv')
print(titanic.shape)
titanic.head()

(891, 11)


Unnamed: 0,survived,pclass,age,sibsp,parch,fare,male,age_was_missing,embarked_from_cherbourg,embarked_from_queenstown,embarked_from_southampton
0,0,3,22.0,1,0,7.25,1,False,0,0,1
1,1,1,38.0,1,0,71.2833,0,False,1,0,0
2,1,3,26.0,0,0,7.925,0,False,0,0,1
3,1,1,35.0,1,0,53.1,0,False,0,0,1
4,0,3,35.0,0,0,8.05,1,False,0,0,1


In [None]:
# Every column except the label becomes a predictor.
# - as_matrix() was removed in pandas 1.0, so the ndarray is taken via .values.
# - The preview above shows a boolean column (age_was_missing) next to numeric
#   ones; casting to float first keeps the result a plain numeric array rather
#   than an object-dtype array, which recent Keras/TensorFlow refuse to train on.
predictors = titanic.drop('survived', axis=1).astype(float).values

# Convert df.survived to a categorical variable using the to_categorical() function
target = to_categorical(titanic.survived)

# Number of predictor columns (10 for the 11-column frame previewed above).
n_cols = predictors.shape[1]

print(predictors.shape, target.shape, n_cols)
# The message must be a string literal; the unquoted original was a SyntaxError.
print('Setup complete!')

In [None]:
# Classification network, declared in one go via the Sequential constructor.
model = Sequential([
    # Hidden layer: 32 relu nodes, each input row having n_cols items.
    Dense(32, activation='relu', input_shape=(n_cols,)),
    # Output layer: there are two outcomes, hence 2 units; softmax because
    # this is a classification model.
    Dense(2, activation='softmax'),
])

# Compile with plain stochastic gradient descent and categorical
# cross-entropy; metrics=['accuracy'] reports the fraction of correct
# predictions at the end of each epoch.
model.compile(
    loss='categorical_crossentropy',
    optimizer='sgd',
    metrics=['accuracy'],
)

# Train on the predictor matrix and the categorical target.
model.fit(x=predictors, y=target)

### 4. Make Predictions

In [5]:
pred_data = np.array(
    [
        [2, 34.0, 0, 0, 13.0, 1, False, 0, 0, 1],
        [2, 31.0, 1, 1, 26.25, 0, False, 0, 0, 1],
        [1, 11.0, 1, 2, 120.0, 1, False, 0, 0, 1],
        [3, 0.42, 0, 1, 8.5167, 1, False, 1, 0, 0],
        [3, 27.0, 0, 0, 6.975, 1, False, 0, 0, 1],
        [3, 31.0, 0, 0, 7.775, 1, False, 0, 0, 1],
        [1, 39.0, 0, 0, 0.0, 1, False, 0, 0, 1],
        [3, 18.0, 0, 0, 7.775, 0, False, 0, 0, 1],
        [2, 39.0, 0, 0, 13.0, 1, False, 0, 0, 1],
        [1, 33.0, 1, 0, 53.1, 0, False, 0, 0, 1],
        [3, 26.0, 0, 0, 7.8875, 1, False, 0, 0, 1],
        [3, 39.0, 0, 0, 24.15, 1, False, 0, 0, 1],
        [2, 35.0, 0, 0, 10.5, 1, False, 0, 0, 1],
        [3, 6.0, 4, 2, 31.275, 0, False, 0, 0, 1],
        [3, 30.5, 0, 0, 8.05, 1, False, 0, 0, 1],
        [1, 29.69911764705882, 0, 0, 0.0, 1, True, 0, 0, 1],
        [3, 23.0, 0, 0, 7.925, 0, False, 0, 0, 1],
        [2, 31.0, 1, 1, 37.0042, 1, False, 1, 0, 0],
        [3, 43.0, 0, 0, 6.45, 1, False, 0, 0, 1],
        [3, 10.0, 3, 2, 27.9, 1, False, 0, 0, 1],
        [1, 52.0, 1, 1, 93.5, 0, False, 0, 0, 1],
        [3, 27.0, 0, 0, 8.6625, 1, False, 0, 0, 1],
        [1, 38.0, 0, 0, 0.0, 1, False, 0, 0, 1],
        [3, 27.0, 0, 1, 12.475, 0, False, 0, 0, 1],
        [3, 2.0, 4, 1, 39.6875, 1, False, 0, 0, 1],
        [3, 29.69911764705882, 0, 0, 6.95, 1, True, 0, 1, 0],
        [3, 29.69911764705882, 0, 0, 56.4958, 1, True, 0, 0, 1],
        [2, 1.0, 0, 2, 37.0042, 1, False, 1, 0, 0],
        [3, 29.69911764705882, 0, 0, 7.75, 1, True, 0, 1, 0],
        [1, 62.0, 0, 0, 80.0, 0, False, 0, 0, 0],
        [3, 15.0, 1, 0, 14.4542, 0, False, 1, 0, 0],
        [2, 0.83, 1, 1, 18.75, 1, False, 0, 0, 1],
        [3, 29.69911764705882, 0, 0, 7.2292, 1, True, 1, 0, 0],
        [3, 23.0, 0, 0, 7.8542, 1, False, 0, 0, 1],
        [3, 18.0, 0, 0, 8.3, 1, False, 0, 0, 1],
        [1, 39.0, 1, 1, 83.1583, 0, False, 1, 0, 0],
        [3, 21.0, 0, 0, 8.6625, 1, False, 0, 0, 1],
        [3, 29.69911764705882, 0, 0, 8.05, 1, True, 0, 0, 1],
        [3, 32.0, 0, 0, 56.4958, 1, False, 0, 0, 1],
        [1, 29.69911764705882, 0, 0, 29.7, 1, True, 1, 0, 0],
        [3, 20.0, 0, 0, 7.925, 1, False, 0, 0, 1],
        [2, 16.0, 0, 0, 10.5, 1, False, 0, 0, 1],
        [1, 30.0, 0, 0, 31.0, 0, False, 1, 0, 0],
        [3, 34.5, 0, 0, 6.4375, 1, False, 1, 0, 0],
        [3, 17.0, 0, 0, 8.6625, 1, False, 0, 0, 1],
        [3, 42.0, 0, 0, 7.55, 1, False, 0, 0, 1],
        [3, 29.69911764705882, 8, 2, 69.55, 1, True, 0, 0, 1],
        [3, 35.0, 0, 0, 7.8958, 1, False, 1, 0, 0],
        [2, 28.0, 0, 1, 33.0, 1, False, 0, 0, 1],
        [1, 29.69911764705882, 1, 0, 89.1042, 0, True, 1, 0, 0],
        [3, 4.0, 4, 2, 31.275, 1, False, 0, 0, 1],
        [3, 74.0, 0, 0, 7.775, 1, False, 0, 0, 1],
        [3, 9.0, 1, 1, 15.2458, 0, False, 1, 0, 0],
        [1, 16.0, 0, 1, 39.4, 0, False, 0, 0, 1],
        [2, 44.0, 1, 0, 26.0, 0, False, 0, 0, 1],
        [3, 18.0, 0, 1, 9.35, 0, False, 0, 0, 1],
        [1, 45.0, 1, 1, 164.8667, 0, False, 0, 0, 1],
        [1, 51.0, 0, 0, 26.55, 1, False, 0, 0, 1],
        [3, 24.0, 0, 3, 19.2583, 0, False, 1, 0, 0],
        [3, 29.69911764705882, 0, 0, 7.2292, 1, True, 1, 0, 0],
        [3, 41.0, 2, 0, 14.1083, 1, False, 0, 0, 1],
        [2, 21.0, 1, 0, 11.5, 1, False, 0, 0, 1],
        [1, 48.0, 0, 0, 25.9292, 0, False, 0, 0, 1],
        [3, 29.69911764705882, 8, 2, 69.55, 0, True, 0, 0, 1],
        [2, 24.0, 0, 0, 13.0, 1, False, 0, 0, 1],
        [2, 42.0, 0, 0, 13.0, 0, False, 0, 0, 1],
        [2, 27.0, 1, 0, 13.8583, 0, False, 1, 0, 0],
        [1, 31.0, 0, 0, 50.4958, 1, False, 0, 0, 1],
        [3, 29.69911764705882, 0, 0, 9.5, 1, True, 0, 0, 1],
        [3, 4.0, 1, 1, 11.1333, 1, False, 0, 0, 1],
        [3, 26.0, 0, 0, 7.8958, 1, False, 0, 0, 1],
        [1, 47.0, 1, 1, 52.5542, 0, False, 0, 0, 1],
        [1, 33.0, 0, 0, 5.0, 1, False, 0, 0, 1],
        [3, 47.0, 0, 0, 9.0, 1, False, 0, 0, 1],
        [2, 28.0, 1, 0, 24.0, 0, False, 1, 0, 0],
        [3, 15.0, 0, 0, 7.225, 0, False, 1, 0, 0],
        [3, 20.0, 0, 0, 9.8458, 1, False, 0, 0, 1],
        [3, 19.0, 0, 0, 7.8958, 1, False, 0, 0, 1],
        [3, 29.69911764705882, 0, 0, 7.8958, 1, True, 0, 0, 1],
        [1, 56.0, 0, 1, 83.1583, 0, False, 1, 0, 0],
        [2, 25.0, 0, 1, 26.0, 0, False, 0, 0, 1],
        [3, 33.0, 0, 0, 7.8958, 1, False, 0, 0, 1],
        [3, 22.0, 0, 0, 10.5167, 0, False, 0, 0, 1],
        [2, 28.0, 0, 0, 10.5, 1, False, 0, 0, 1],
        [3, 25.0, 0, 0, 7.05, 1, False, 0, 0, 1],
        [3, 39.0, 0, 5, 29.125, 0, False, 0, 1, 0],
        [2, 27.0, 0, 0, 13.0, 1, False, 0, 0, 1],
        [1, 19.0, 0, 0, 30.0, 0, False, 0, 0, 1],
        [3, 29.69911764705882, 1, 2, 23.45, 0, True, 0, 0, 1],
        [1, 26.0, 0, 0, 30.0, 1, False, 1, 0, 0],
        [3, 32.0, 0, 0, 7.75, 1, False, 0, 1, 0]
    ]
)
print(pred_data.shape)

(91, 10)


In [None]:
# Score pred_data with the fitted classifier via the model's .predict() method.
predictions = model.predict(pred_data)

# Keep only the second column (index 1) of the predictions: with the two-unit
# softmax output this is the predicted probability of survival being True.
predicted_prob_true = predictions[:, 1]

# Show the survival probabilities, one per row of pred_data.
print(predicted_prob_true)

### Changing Optimisation Parameters

Q. What could prevent a model from showing an improved loss in its first few epochs?

A. A learning rate that is either too high or too low, or poor choice of activation function.

We'll try optimizing a model at a very low learning rate, a very high learning rate, and a "just right" learning rate. We'll look at the results after running this exercise, remembering that a low value for the loss function is good.

We want the optimization to start from scratch every time we change the learning rate, to give a fair comparison of how each learning rate did in your results. So we have created a function get_new_model() that creates an unoptimized model to optimize.

In [1]:
# define a nn with two hidden layers for classification, with 10 units in each layer
def get_new_model(input_shape=None):
    """Build a fresh, uncompiled classifier with two 10-unit hidden layers.

    A brand-new model is created on every call, so each experiment optimizes
    an unoptimized model instead of continuing from an earlier one.

    Args:
        input_shape: Shape tuple of one input row. When omitted it defaults
            to ``(n_cols,)``, read from the notebook-level ``n_cols`` at call
            time, which is what the zero-argument form has always used.

    Returns:
        An uncompiled ``Sequential`` model: two ``Dense(10, relu)`` hidden
        layers followed by a ``Dense(2, softmax)`` output layer.
    """
    if input_shape is None:
        # Backward-compatible default: fall back on the global column count.
        input_shape = (n_cols,)

    model = Sequential()
    model.add(Dense(10, activation='relu', input_shape=input_shape))
    model.add(Dense(10, activation='relu'))
    model.add(Dense(2, activation='softmax'))

    return model

In [None]:
# Import the SGD optimizer
from keras.optimizers import SGD

# Learning rates to compare: a very low one, a "just right" one and a very
# high one.
lr_to_test = [0.000001, 0.01, 1.0]

# Loop over learning rates
for lr in lr_to_test:
    print('\n\nTesting model with learning rate: %f\n' % lr)

    # Build new model to test, unaffected by previous models
    model = get_new_model()

    # Create the SGD optimizer with the specified learning rate.
    # The rate is passed positionally because it is the first SGD parameter
    # under both its old keyword name (`lr`) and its current one
    # (`learning_rate`); the `lr=` keyword is deprecated and no longer
    # accepted by recent Keras releases.
    my_optimizer = SGD(lr)

    # Compile the model
    model.compile(optimizer=my_optimizer, loss='categorical_crossentropy')

    # Fit the model
    model.fit(predictors, target)
    

### Evaluating model accuracy on validation dataset

We can use part of the data as a validation set with the `validation_split` argument. K-fold splits are not generally performed, since deep learning typically involves large data sets.

In [None]:
# Column count of the predictor matrix and the matching per-row input shape.
n_cols = predictors.shape[1]
input_shape = (n_cols,)

# Start from a fresh, unoptimized network.
model = get_new_model()

# Adam optimizer, categorical cross-entropy loss, accuracy reported per epoch.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Train while holding out 30% of the data as a validation set.
model.fit(x=predictors, y=target, validation_split=0.3)

### Optimization through Early Stopping

In [None]:
# EarlyStopping is the callback that halts training once progress stalls.
from keras.callbacks import EarlyStopping

# Column count of the predictor matrix and the matching per-row input shape.
n_cols = predictors.shape[1]
input_shape = (n_cols,)

# Start from a fresh, unoptimized network.
model = get_new_model()

# Adam optimizer, categorical cross-entropy loss, accuracy reported per epoch.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# patience=2: stop optimization when the validation loss hasn't improved
# for 2 epochs.
early_stopping_monitor = EarlyStopping(patience=2)

# Fit the model, specifying the maximum number of epochs, the size of the
# validation split and the early-stopping callback.
model.fit(
    x=predictors,
    y=target,
    validation_split=0.3,
    epochs=30,
    callbacks=[early_stopping_monitor],
)

Because optimization will automatically stop when it is no longer helpful, it is okay to specify the maximum number of epochs as `30` rather than using the default of 10 that we've used so far. Here, it seems like the optimization stopped after 7 epochs.

### Comparing models - using more nodes

We'll create a new model called `model_2` which is similar to `model_1`, except it has 100 units in each hidden layer.

After we create `model_2`, both models will be fitted, and a graph showing both models' loss scores at each epoch will be shown. We added the argument `verbose=False` in the fitting commands to print out fewer updates, since we will look at these graphically instead of as text.

In [None]:
# plt is used for the comparison plot below but was never imported anywhere
# in the notebook, so import it here to avoid a NameError.
import matplotlib.pyplot as plt

# Define early_stopping_monitor: stop once the validation loss has failed to
# improve for 2 epochs.
early_stopping_monitor = EarlyStopping(patience=2)

# Create and compile model_1 (the 10-unit baseline from get_new_model)
model_1 = get_new_model()
model_1.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Create and compile model_2: same layout, but 100 units in each hidden layer
model_2 = Sequential()
model_2.add(Dense(100, activation='relu', input_shape=(n_cols,)))
model_2.add(Dense(100, activation='relu'))
model_2.add(Dense(2, activation='softmax'))
model_2.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Fit model_1, keeping the returned training object for its per-epoch history
model_1_training = model_1.fit(predictors, target, epochs=15, validation_split=0.2, callbacks=[early_stopping_monitor], verbose=False)

# Fit model_2 with identical settings so the two runs are comparable
model_2_training = model_2.fit(predictors, target, epochs=15, validation_split=0.2, callbacks=[early_stopping_monitor], verbose=False)

# Plot validation loss per epoch: model_1 in red, model_2 in blue
plt.plot(model_1_training.history['val_loss'], 'r', model_2_training.history['val_loss'], 'b')
plt.xlabel('Epochs')
plt.ylabel('Validation score')
plt.show()


![NN comparison](./img/nn-comparison.svg)

The blue model is `model_2`, the red is the original `model_1`. `model_2` has a lower loss value, so it is the better model. 

### Comparing models - using more layers

Next we'll try a deeper network (more hidden layers).

Once again, we have a baseline model called `model_1` as a starting point. It has 2 hidden layers, with 10 units/nodes each. We can print a summary of that model's structure with `model_1.summary()`. We will create a similar network with 3 hidden layers (still keeping the same number of units in each layer).

In [None]:
# Shape of one input row, consumed by the first hidden layer of model_2.
input_shape = (n_cols,)

# Baseline: the two-hidden-layer network from get_new_model().
model_1 = get_new_model()
model_1.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Deeper challenger: three hidden layers of 10 relu units each, followed by
# the 2-unit softmax output layer.
model_2 = Sequential([
    Dense(10, activation='relu', input_shape=input_shape),
    Dense(10, activation='relu'),
    Dense(10, activation='relu'),
    Dense(2, activation='softmax'),
])
model_2.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Train both networks with identical settings: at most 20 epochs, 40% of the
# data held out for validation, and the early_stopping_monitor callback
# created in an earlier cell.
model_1_training = model_1.fit(
    predictors,
    target,
    validation_split=0.4,
    epochs=20,
    verbose=False,
    callbacks=[early_stopping_monitor],
)
model_2_training = model_2.fit(
    predictors,
    target,
    validation_split=0.4,
    epochs=20,
    verbose=False,
    callbacks=[early_stopping_monitor],
)

# Visualize which model gives better results. For both models, look for the
# best 'val_loss' and 'val_acc', which won't be the last epoch for that model.
# model_1 is drawn in red, model_2 in blue.
plt.plot(
    model_1_training.history['val_loss'], 'r',
    model_2_training.history['val_loss'], 'b',
)
plt.xlabel('Epochs')
plt.ylabel('Validation score')
plt.show()


![nn comparison 2](./img/nn-comparison-2.svg)

The blue model is `model_2` and the red is the original, `model_1`. The model with the lower loss value is the better model.