In [7]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

**Intro to deep learning using Keras**
https://campus.datacamp.com/courses/deep-learning-in-python/fine-tuning-keras-models?ex=13

- Deep Learning With Jupyter Notebooks In The Cloud
   - This step-by-step tutorial will show you how to set up and use Jupyter Notebook on Amazon Web Services (AWS) EC2 GPU for deep learning.
https://www.datacamp.com/community/tutorials/deep-learning-jupyter-aws

> Deep learning models capture interactions

    - linear regression doesn't consider interactions between features
    - Neural networks account for interactions really well



In [2]:
%%html
<h3>deep learning interaction</h3>
<img src="interaction.png" width="300" height="200">

> part 1. forward propagation

In [3]:
%%html
<h3>forward propagation</h3>
<img src="forward.png" width="300" height="200">

In [4]:
%%html
<h3>multiple layers</h3>
<img src="multiple.png" width="300" height="200">

### Forward Propagation
- multiply the inputs by the weights and add them up at each node of the hidden layers
- apply an activation function on the hidden layers
  - the activation function allows the model to capture non-linearity
  - it is applied to node inputs to produce node outputs
  - **ReLU** (Rectified Linear Activation) is the industry standard; tanh() is another option
      - if x <= 0, ReLU(x) = 0
      - if x > 0, ReLU(x) = x
  


### deep learning (also called representation learning): multiple hidden layers
    - deep networks internally build representations of patterns in the data
    - partially replace the need for feature engineering
    - subsequent layers build increasingly sophisticated representations of raw data
  

- example 1: dot product + ReLU for one row

In [None]:
def relu(input):
    '''Rectified linear activation for a single node input.

    Returns `input` unchanged when it is greater than 0, otherwise returns 0.
    '''
    # Taking the max against 0 is the whole activation: negatives clamp to 0.
    return max(0, input)

# Forward pass for one observation through a network with a single hidden
# layer of two nodes. `input_data` and `weights` are not defined in this cell
# and must already exist in the kernel; `weights` is indexed by the keys
# 'node_0', 'node_1' and 'output'.

# Calculate node 0 value: node_0_output
# (multiply inputs by the node's weights element-wise, sum, then apply ReLU)
node_0_input = (input_data * weights['node_0']).sum()
node_0_output = relu(node_0_input)

# Calculate node 1 value: node_1_output
node_1_input = (input_data * weights['node_1']).sum()
node_1_output = relu(node_1_input)

# Put node values into array: hidden_layer_outputs
hidden_layer_outputs = np.array([node_0_output, node_1_output])

# Calculate model output (do not apply relu)
# The output node is a plain weighted sum of the hidden-layer outputs.
model_output = (hidden_layer_outputs * weights['output']).sum()

# Print model output
print(model_output)

- example 2: make a function and predict multiple rows

In [None]:
# Define predict_with_network()
def predict_with_network(input_data_row, weights):
    """Predict one value from a single input row with a two-node hidden layer.

    Parameters
    ----------
    input_data_row : one observation; multiplied element-wise with each
        hidden node's weights.
    weights : mapping with the keys 'node_0', 'node_1' and 'output'.

    Returns
    -------
    The ReLU-activated output node value. Unlike the single-row example
    earlier in this notebook, ReLU is applied to the output node here too.
    """
    # Each hidden node: weighted sum of the inputs followed by ReLU.
    hidden_layer_outputs = np.array([
        relu((input_data_row * weights[node_key]).sum())
        for node_key in ('node_0', 'node_1')
    ])

    # Output node: weighted sum of the hidden outputs, then ReLU again.
    return relu((hidden_layer_outputs * weights['output']).sum())


# Run every row of input_data through the network and collect the predictions
results = [predict_with_network(row, weights) for row in input_data]

# Print results
print(results)
        

- example with 2 hidden layers
%%html
<h3>two layers</h3>
<img src="two_layer.png" width="300" height="200">

In [None]:
def predict_with_network(input_data):
    """Forward pass through a network with two hidden layers of two nodes each.

    Reads the module-level `weights` mapping (keys 'node_0_0', 'node_0_1',
    'node_1_0', 'node_1_1' and 'output') rather than taking it as an argument.
    Note that this definition replaces the earlier two-argument
    `predict_with_network` once the cell is run.

    Returns the output node value; no activation is applied to the output.
    """
    # First hidden layer: weighted sums of the raw inputs, each passed through ReLU.
    hidden_0_outputs = np.array([
        relu((input_data * weights[node_key]).sum())
        for node_key in ('node_0_0', 'node_0_1')
    ])

    # Second hidden layer: same computation, fed by the first layer's outputs.
    hidden_1_outputs = np.array([
        relu((hidden_0_outputs * weights[node_key]).sum())
        for node_key in ('node_1_0', 'node_1_1')
    ])

    # Output node: plain weighted sum of the second hidden layer.
    return (hidden_1_outputs * weights['output']).sum()

# Run the two-hidden-layer forward pass on input_data (defined outside this cell)
output = predict_with_network(input_data)
print(output)


> part 2. Optimization
   - Loss function: aggregates the errors in predictions from many data points into a single number
        - for example, one common loss function for regression is MSE (mean squared error)
   - find the weights that give the lowest value of the loss function: gradient descent
        - solution: learning rate -- update each weight by subtracting learning_rate * slope
   - backpropagation: propagates the prediction error backwards through the network to get the slope for every weight
 

In [9]:

%%html
<h3>backpropagation</h3>
<img src="back.png" width="500" height="400">

In [10]:
%%html
<h3>stochastic gradient descent</h3>
<img src="stok.png" width="500" height="400">

- **gradient descent**

In [18]:
%%html
<h3>slope of weights</h3>
<img src="slope1.png" width="500" height="400">

In [20]:
%%html
<h3>slope of weights</h3>
<img src="back_prop.png" width="800" height="600">

In [None]:
# Slope (gradient) of the squared-error loss with respect to each weight:
#     slope = 2 * input_data * error,   where error = preds - target
# `input_data`, `weights` and `target` must already exist in the kernel.

# Calculate the predictions: preds
preds = (input_data * weights).sum()

# Calculate the error: error
# The sign matters: the weights are updated with `weights - slope * learning_rate`,
# which only moves downhill on the loss when error is prediction minus target.
error = preds - target

# Calculate the slope: slope
slope = 2 * input_data * error

# Print the slope
print(slope)


In [None]:
- example 1: improve model performance

In [None]:
# One step of gradient descent, followed by a before/after comparison of the error.
# `weights`, `input_data` and `target` must already exist in the kernel.

# Set the learning rate: learning_rate
learning_rate = 0.01

# Calculate the predictions: preds
preds = (weights * input_data).sum()

# Calculate the error: error
error = preds - target

# Calculate the slope: slope
# (gradient of the squared error with respect to each weight)
slope = 2 * input_data * error

# Update the weights: weights_updated
# Step against the slope, scaled by the learning rate; the original weights are left untouched.
weights_updated = weights - slope * learning_rate

# Get updated predictions: preds_updated
preds_updated = (weights_updated * input_data).sum()

# Calculate updated error: error_updated
error_updated = preds_updated- target

# Print the original error
print(error)

# Print the updated error
print(error_updated)


- Making multiple updates to weights

In [None]:
# Repeated gradient-descent updates, recording the MSE after each one.
# `get_slope` and `get_mse` are not defined in this part of the notebook and must
# already exist in the kernel, along with `input_data`, `target` and `weights`.
n_updates = 20
mse_hist = []

# Iterate over the number of updates
for i in range(n_updates):
    # Calculate the slope: slope
    slope = get_slope(input_data, target, weights)
    
    # Update the weights: weights
    # 0.01 is the step size (learning rate). `weights` is rebound here, so
    # re-running this cell continues from the already-updated weights.
    weights = weights - slope * 0.01
    
    # Calculate mse with new weights: mse
    mse = get_mse(input_data, target, weights)
    
    # Append the mse to mse_hist
    mse_hist.append(mse)

# Plot the mse history
plt.plot(mse_hist)
plt.xlabel('Iterations')
plt.ylabel('Mean Squared Error')
plt.show()

> **part 3. Keras**

  - create a model
  - compile and fit the model

- 1) create a model

In [9]:
import tensorflow as tf

In [10]:
# Show the installed TensorFlow version
print ("TensorFlow version: " + tf.__version__)

TensorFlow version: 1.12.0


In [11]:
import keras
from keras.layers import Dense

In [None]:
# Import necessary modules
import keras
from keras.layers import Dense
from keras.models import Sequential

# Save the number of columns in predictors: n_cols
# (`predictors` and `target` must already exist in the kernel)
n_cols = predictors.shape[1]

print(n_cols)
# Set up the model: model
model = Sequential()

# Add the first layer (input_shape tells Keras how many features each row has)
model.add(Dense(50, activation='relu', input_shape=(n_cols,)))

# Add the second layer
model.add(Dense(32, activation='relu'))

# Add the output layer (one node, no activation)
model.add(Dense(1))


# compile the model
# Keras looks the loss up by name; the identifier is 'mean_squared_error'.
model.compile(optimizer='adam', loss='mean_squared_error')

# fit
model.fit(predictors, target)


- 2) compile the model
    - specify the optimizer
       - control learning rate
       - many options and mathematically complex
       - 'adam' is a good one to use
    - loss function
       - 'mean_squared_error' is a common one for regression
       - 'categorical_crossentropy' for classification (similar to log-loss): lower is better
        
- 3) fit the model
    - apply backpropagation and gradient descent with your data to update the weights
    - scaling data(standard scaler) before fitting can ease optimization
    

### classification
   - 'categorical_crossentropy' for classification (similar to log-loss): lower is better
   - add metrics = ['accuracy'] to the compile step for easy-to-understand diagnostics
   - the output layer has a separate node for each possible outcome and uses 'softmax' activation
      - softmax activation ensures all predicted outcomes sum to 1, so we can use the predictions as probabilities.
      

In [None]:
# Import necessary modules
import keras
from keras.layers import Dense
from keras.models import Sequential
from keras.utils import to_categorical

# One-hot encode the `survived` column so it lines up with the 2-node softmax output
target = to_categorical(df.survived)

# Build the network in one go: a 32-unit ReLU hidden layer feeding a softmax output.
# `n_cols` is expected to be set by an earlier cell.
model = Sequential([
    Dense(32, activation='relu', input_shape=(n_cols,)),
    Dense(2, activation='softmax'),
])

# Plain SGD optimizer; accuracy is reported alongside the cross-entropy loss
model.compile(
    optimizer='sgd',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Train on the predictors against the encoded target
model.fit(predictors, target)




In [None]:
example 2: optimize the model by testing different learning rates

In [None]:
# Import the SGD optimizer
from keras.optimizers import SGD

# Train one fresh model per learning rate to compare how the step size affects training.
# `get_new_model`, `predictors` and `target` are not defined in this cell and must
# already exist in the kernel.

# Create list of learning rates: lr_to_test
# (spans very small, moderate and very large step sizes)
lr_to_test = [0.000001, 0.01, 1]

# Loop over learning rates
for lr in lr_to_test:
    print('\n\nTesting model with learning rate: %f\n'%lr )
    
    # Build new model to test, unaffected by previous models
    model = get_new_model()
    
    # Create SGD optimizer with specified learning rate: my_optimizer
    my_optimizer = SGD(lr = lr)
    
    # Compile the model
    model.compile(optimizer = my_optimizer
                  , loss = 'categorical_crossentropy')
    
    # Fit the model, holding out a 0.3 fraction of the data for validation
    model.fit(predictors, target, validation_split = 0.3)
    

> ### part 4. tuning the model
   - 1) dying neuron problem
      - once a node starts to get negative inputs, it may continue getting only negative inputs
      - the slope of its weights (the gradient) is then always 0, because the slope of the ReLU activation is 0 for negative inputs
      - if this happens, changing the activation function is one solution.
   - 2) model validation in deep learning
      - commonly use model validation split(train_test_split) rather than cross-validation.
      - deep learning widely used in large datasets
      - single validation score is based on large dataset and it is reliable.
      - repeated training from cross-validation is time-consuming.
   - 3) early stopping
      - early stopping if there is no improvement of metrics
         - use early stopping to stop optimization when it isn't helping any more. Since the optimization stops automatically when it isn't helping, you can also set a high value for epochs in your call to .fit()
      - based on this, we can experiment with different architectures: more/fewer layers, more/fewer nodes, etc.
      - create a great model needs experimentation
  
    

In [None]:
# Import EarlyStopping
from keras.callbacks import EarlyStopping

# The input shape is derived from the number of predictor columns
n_cols = predictors.shape[1]
input_shape = (n_cols,)

# Two 100-unit ReLU hidden layers feeding a 2-node softmax output
model = Sequential([
    Dense(100, activation='relu', input_shape=input_shape),
    Dense(100, activation='relu'),
    Dense(2, activation='softmax'),
])

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Stop training once the monitored quantity has gone 2 epochs without improving
early_stopping_monitor = EarlyStopping(patience=2)

# The epoch count is only an upper bound because the callback can end training sooner
model.fit(
    predictors,
    target,
    validation_split=0.3,
    epochs=30,
    callbacks=[early_stopping_monitor],
)


- example 3: experiment with two models and compare scores

In [None]:
# One early-stopping callback (patience of 2 epochs) is passed to both training runs
early_stopping_monitor = EarlyStopping(patience=2)

# model_2: two hidden layers of 100 units each, then a 2-node softmax output.
# model_1 is not built in this cell; per the original note it has only two layers
# of 10 units. An alternative to try for model_2 is three hidden layers of 50 units.
model_2 = Sequential([
    Dense(100, activation='relu', input_shape=input_shape),
    Dense(100, activation='relu'),
    Dense(2, activation='softmax'),
])

model_2.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Train both models with identical settings so their validation curves are comparable
model_1_training = model_1.fit(
    predictors, target,
    epochs=15, validation_split=0.2,
    callbacks=[early_stopping_monitor], verbose=False,
)
model_2_training = model_2.fit(
    predictors, target,
    epochs=15, validation_split=0.2,
    callbacks=[early_stopping_monitor], verbose=False,
)

# Validation loss per epoch: model_1 in red, model_2 in blue
plt.plot(
    model_1_training.history['val_loss'], 'r',
    model_2_training.history['val_loss'], 'b',
)
plt.xlabel('Epochs')
plt.ylabel('Validation score')
plt.show()


> Model capacity
      - validation score(test data score) is the ultimate measure of a model performance.
      - model capacity --> model complexity 

In [6]:
%%html
<h3>model capacity</h3>
<img src="capacity.png" width="400" height="200">
<h3>deep learning capacity experimentation example</h3>
<img src="cap_example.png" width="400" height="200">

- example 3. Building your own digit recognition model

In [None]:
# Digit classifier: 784 input features, two 50-unit ReLU hidden layers,
# and a 10-node softmax output
model = Sequential([
    Dense(50, activation='relu', input_shape=(784,)),
    Dense(50, activation='relu'),
    Dense(10, activation='softmax'),
])

# Adam optimizer with cross-entropy loss; accuracy is reported during training
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Fit on X and y (defined outside this cell), holding out a 0.3 fraction for validation
model.fit(X, y, validation_split=0.3)
