# Advanced Deep Learning Best Practices

## 7.1.1 Introduction to the functional API

In [None]:
from keras import Input, layers

# A symbolic input tensor.
input_tensor = Input(shape=(32,))

# A layer is a callable object: it behaves like a function.
dense = layers.Dense(units=32, activation='relu')

# A layer may be called on a tensor, and it returns a tensor.
output_tensor = dense(input_tensor)

### Minimal example: a Sequential model and its functional-API equivalent, side by side

In [None]:
from keras.models import Sequential, Model
from keras import layers
from keras import Input

# Sequential version: a plain stack of three Dense layers.
seq_model = Sequential([
    layers.Dense(units=32, activation='relu', input_shape=(64,)),
    layers.Dense(units=32, activation='relu'),
    layers.Dense(units=10, activation='softmax'),
])

# Functional equivalent: each layer is called on the tensor produced by the
# previous one, starting from an explicit Input tensor.
input_tensor = Input(shape=(64,))
x = layers.Dense(units=32, activation='relu')(input_tensor)
x = layers.Dense(units=32, activation='relu')(x)
output_tensor = layers.Dense(units=10, activation='softmax')(x)

# The Model class turns an input tensor and an output tensor into a model.
model = Model(input_tensor, output_tensor)

model.summary()

#### The API is the same as for Sequential models when it comes to compiling, training, or evaluating such a Model instance

In [None]:
# Compiles the model.
model.compile(loss='categorical_crossentropy', optimizer='rmsprop')

import numpy as np

# Generates dummy Numpy data to train on.
x_train = np.random.random(size=(1000, 64))
y_train = np.random.random(size=(1000, 10))

# Trains the model for 10 epochs.
model.fit(x_train, y_train, batch_size=128, epochs=10)

# Evaluates the model.
score = model.evaluate(x_train, y_train)

### Multi-input models

#### The functional API can build models that have multiple inputs. Such models merge their input branches with a Keras merge operation such as keras.layers.add or keras.layers.concatenate.

## Listing 7.1: Functional API implementation of a two-input question-answering model

In [None]:
from keras.models import Model
from keras import layers
from keras import Input

text_vocabulary_size = 10000
question_vocabulary_size = 10000
answer_vocabulary_size = 500

# The text input is a variable-length sequence of integers.
# Note: naming the inputs is optional.
text_input = Input(shape=(None,), dtype='int32', name='text')

# Embeds the inputs into a sequence of vectors of size 64.
# Embedding takes (input_dim, output_dim): the vocabulary size comes first,
# then the size of the embedding vectors.
embedded_text = layers.Embedding(text_vocabulary_size, 64)(text_input)

# Encodes the vectors in a single vector via an LSTM.
encoded_text = layers.LSTM(32)(embedded_text)

# Same process (with different layer instances) for the question.
question_input = Input(shape=(None,),
                       dtype='int32',
                       name='question')

embedded_question = layers.Embedding(question_vocabulary_size, 32)(question_input)

encoded_question = layers.LSTM(16)(embedded_question)

# Concatenates the encoded question and the encoded text.
concatenated = layers.concatenate([encoded_text, encoded_question], axis=-1)

# Adds a softmax classifier on top.
answer = layers.Dense(answer_vocabulary_size, activation='softmax')(concatenated)

# Specify the two inputs and the output at model instantiation.
model = Model([text_input, question_input], answer)

model.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=['acc'])

## Listing 7.2  Feeding Data to a multi-input model

In [None]:
import numpy as np

num_samples = 1000
max_length = 100

# Generates dummy Numpy data.
text = np.random.randint(1, text_vocabulary_size,
                         size=(num_samples, max_length))

question = np.random.randint(1, question_vocabulary_size,
                             size=(num_samples, max_length))

# Answers are one-hot encoded, not integers.
# np.random.randint(0, 1, ...) would only ever produce zeros because the upper
# bound is exclusive, so draw one class index per sample and one-hot encode it.
answer_indices = np.random.randint(0, answer_vocabulary_size, size=num_samples)
answers = np.eye(answer_vocabulary_size)[answer_indices]

# Fitting using a list of inputs.
model.fit([text, question], answers, epochs=10, batch_size=128)

# Fitting using a dictionary of inputs (only if inputs are named).
model.fit({'text': text, 'question': question}, answers,
          epochs=10, batch_size=128)

### 7.1.3 Multi-output models: The functional API can also build models with multiple outputs (multiple heads), e.g., a network that
                           simultaneously predicts different properties of the data, such as taking social media posts and predicting
                           attributes of the author such as age, gender, and income level.

## Listing 7.3: Functional API implementation of a three output model.

In [None]:
from keras import layers
from keras import Input
from keras.models import Model

vocabulary_size = 50000
num_income_groups = 10

# Variable-length sequences of integer word indices.
posts_input = Input(shape=(None,), dtype='int32', name='posts')

# Embedding takes (input_dim, output_dim): the vocabulary size comes first,
# then the size of the embedding vectors.
embedded_posts = layers.Embedding(vocabulary_size, 256)(posts_input)

# Shared 1D convolutional trunk.
x = layers.Conv1D(128, 5, activation='relu')(embedded_posts)
x = layers.MaxPooling1D(5)(x)
x = layers.Conv1D(256, 5, activation='relu')(x)
x = layers.Conv1D(256, 5, activation='relu')(x)
x = layers.MaxPooling1D(5)(x)
x = layers.Conv1D(256, 5, activation='relu')(x)
x = layers.Conv1D(256, 5, activation='relu')(x)
x = layers.GlobalMaxPooling1D()(x)
x = layers.Dense(128, activation='relu')(x)

# Three heads on top of the shared trunk. The output layers are named so that
# losses and targets can later be specified by name.
age_prediction = layers.Dense(1, name='age')(x)
income_prediction = layers.Dense(num_income_groups, activation='softmax', name='income')(x)
gender_prediction = layers.Dense(1, activation='sigmoid', name='gender')(x)

model = Model(posts_input, [age_prediction, income_prediction, gender_prediction])


## Listing 7.4: Compilation options of a multi-output model: multiple losses

In [None]:
# One loss per output, given positionally (same order as the model outputs).
model.compile(
    optimizer='rmsprop',
    loss=['mse', 'categorical_crossentropy', 'binary_crossentropy'],
)

# Equivalent form keyed by output-layer name
# (possible only if you give names to the output layers).
model.compile(
    optimizer='rmsprop',
    loss=dict(age='mse',
              income='categorical_crossentropy',
              gender='binary_crossentropy'),
)


## Listing 7.5: Compilation options of a multi-output model: loss weighting

In [None]:
# Positional form: losses and their weights follow the order of the outputs.
model.compile(
    optimizer='rmsprop',
    loss=['mse', 'categorical_crossentropy', 'binary_crossentropy'],
    loss_weights=[0.25, 1., 10.],
)

# Equivalent form keyed by output-layer name
# (possible only if you give names to the output layers).
model.compile(
    optimizer='rmsprop',
    loss=dict(age='mse',
              income='categorical_crossentropy',
              gender='binary_crossentropy'),
    loss_weights=dict(age=0.25, income=1., gender=10.),
)

### Listing 7.6: Feeding data to a multi-output model

In [None]:
# posts, age_targets, income_targets and gender_targets are assumed to be
# Numpy arrays defined elsewhere.
model.fit(posts, [age_targets, income_targets, gender_targets],
          epochs=10, batch_size=64)

# Equivalent (possible only if you give names to the output layers).
model.fit(posts, {'age': age_targets,
                  'income': income_targets,
                  'gender': gender_targets},
          epochs=10, batch_size=64)

## 7.1.4: Directed acyclic graphs of layers

### Neural networks in Keras are also allowed to be arbitrary directed acyclic graphs of layers.
# These graphs don't have cycles.
# It's impossible for a tensor x to become the input of one of the layers that generated x.
# The only processing loops that are allowed (i.e., recurrent connections) are those internal to recurrent layers.
# Several common neural-network components are implemented as graphs. Two notable ones: (1) Inception modules, (2) residual connections.

### EXAMPLE: MODEL IMPLEMENTATION OF INCEPTION MODULE EXAMPLE USING THE FUNCTIONAL API

In [None]:
from keras import layers

# NOTE(review): assumes x is a 4D image tensor defined elsewhere — confirm.
#
# Every branch downsamples with the same stride value (2), and every layer
# with a window larger than 1 uses 'same' padding. Both are necessary to keep
# all branch outputs the same spatial size so they can be concatenated.
branch_a = layers.Conv2D(128, 1,
                         activation='relu', strides=2)(x)

# In this branch, the striding occurs in the spatial convolution layer.
branch_b = layers.Conv2D(128, 1,
                         activation='relu')(x)
branch_b = layers.Conv2D(128, 3,
                         activation='relu', strides=2, padding='same')(branch_b)

# In this branch, the striding occurs in the average pooling layer.
branch_c = layers.AveragePooling2D(3, strides=2, padding='same')(x)
branch_c = layers.Conv2D(128, 3, activation='relu', padding='same')(branch_c)

# The second convolution must extend branch_d (not overwrite branch_a), and it
# carries this branch's stride.
branch_d = layers.Conv2D(128, 1,
                         activation='relu')(x)
branch_d = layers.Conv2D(128, 3,
                         activation='relu', strides=2, padding='same')(branch_d)

# Concatenates the branch outputs to obtain the module output.
output = layers.concatenate(
    [branch_a, branch_b, branch_c, branch_d], axis=-1)

#### Note: The full Inception V3 architecture is available in Keras as keras.applications.inception_v3.InceptionV3, including weights pretrained on the ImageNet dataset. 'Xception', another model. literally stands for extreme inception, inspired by Inception.

## RESIDUAL CONNECTIONS

### A residual connection consists of making the output of an earlier layer available as input to a later layer, effectively creating a shortcut in a sequential network. The example below assumes the existence of a 4D input tensor x:

In [None]:
from keras import layers

x = ...

# Applies a transformation to x: three stacked 3 x 3 convolutions whose
# 'same' padding keeps the spatial size unchanged.
y = x
for _ in range(3):
    y = layers.Conv2D(filters=128, kernel_size=3,
                      activation='relu', padding='same')(y)

# Adds the original x back to the output features.
y = layers.add([y, x])

The following implements a residual connection when the feature-map sizes differ, using a linear residual connection:

In [None]:
from keras import layers

x = ...

# Main path: two 3 x 3 convolutions followed by a 2x max-pooling downsample.
y = layers.Conv2D(filters=128, kernel_size=3,
                  activation='relu', padding='same')(x)
y = layers.Conv2D(filters=128, kernel_size=3,
                  activation='relu', padding='same')(y)
y = layers.MaxPooling2D(pool_size=2, strides=2)(y)

# Shortcut path: uses a 1 x 1 convolution to linearly downsample the original
# x tensor to the same shape as y.
residual = layers.Conv2D(filters=128, kernel_size=1,
                         strides=2, padding='same')(x)

# Adds the residual tensor back to the output features.
y = layers.add([y, residual])

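Representational bottlenecks in deep learning
In a Sequential model, each successive representation layer is built on top of the previous one, so it only has access
to the information contained in the activations of the previous layer. If one layer is too small, the model is
constrained by how much information fits into that layer's activations. Residual connections, by reinjecting earlier
information downstream, help to mitigate this information loss.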

Vanishing gradients
Just as the LSTM layer in recurrent networks introduces a carry track that propagates information parallel to the
main processing track, residual connections in feedforward deep networks introduce a purely linear information carry
track parallel to the main layer stack, thus helping to propagate gradients through arbitrarily deep stacks of layers.

### 7.1.5 Layer weight sharing

In [None]:
Important feature of the functional API: the ability to reuse a layer instance several times. When you call a layer
                               instance twice, the same weights are used with every call; no new layer is instantiated per call.
                               This lets several branches share the same representations and learn them jointly for different sets of inputs.
                               E.g., a model that takes two sentences as input and outputs a score between 0 and 1 assessing their semantic similarity.
    
    Instead of learning two independent models for processing each input sentence, both sentences are processed
    with a single LSTM layer. The representations (weights) of this LSTM layer are learned based on both inputs simultaneously.
    This is what is called a Siamese LSTM model, or a shared LSTM.

In [None]:
# Siamese LSTM model implementation using layer sharing in the Keras functional API.

In [None]:
from keras import layers
from keras import Input
from keras.models import Model

# Instantiates a single LSTM layer, once.
lstm = layers.LSTM(32)

# Building the left branch of the model: inputs are variable-length sequences
# of vectors of size 128.
left_input = Input(shape=(None, 128))
left_output = lstm(left_input)

# Building the right branch of the model: when you call an existing layer
# instance, you reuse its weights.
right_input = Input(shape=(None, 128))
right_output = lstm(right_input)

# Builds the classifier on top.
merged = layers.concatenate([left_output, right_output], axis=-1)
predictions = layers.Dense(1, activation='sigmoid')(merged)

# Instantiating and training the model: when you train such a model, the
# weights of the LSTM layer are updated based on both inputs.
model = Model([left_input, right_input], predictions)

# A model must be compiled before fit() can be called. Binary cross-entropy
# matches the single sigmoid output.
# NOTE(review): optimizer and loss are a reasonable default — confirm for the task.
model.compile(optimizer='rmsprop', loss='binary_crossentropy')

# left_data, right_data and targets are assumed to be Numpy arrays defined elsewhere.
model.fit([left_data, right_data], targets)
                                 

## 7.1.6: Models as layers

In [None]:
y = model(x)    # Call a model on an input tensor and retrieve an output tensor.

In [None]:
y1, y2 = model[x1, x2]  # When there are multiple input and output tensors, a list of tensors should be called

# When a model instance is called, its weights are reused, exactly as when a layer instance is called.
# E.g., a vision model that uses a dual camera as its input: two parallel cameras, a few centimeters (one inch) apart.
# Such a model can perceive depth, which is useful in many applications. There is no need for two independent models to
# extract visual features from the left and right cameras before merging the two feeds. Such low-level processing can be
# shared across the two inputs, i.e., done via layers that use the same weights and thus share the same representations.

# Implementing a Siamese vision model (shared convolutional base) in Keras

In [None]:
from keras import layers
from keras import applications
from keras import Input

# The base image-processing model is the Xception network
# (convolutional base only).
xception_base = applications.Xception(weights=None,
                                      include_top=False)

# The inputs are 250 x 250 RGB images.
left_input = Input(shape=(250, 250, 3))
right_input = Input(shape=(250, 250, 3))

# Calls the same vision model twice. The features go into their own names so
# that the Input tensors are not overwritten.
left_features = xception_base(left_input)
right_features = xception_base(right_input)

# The merged features contain information from the right visual feed
# and the left visual feed.
merged_features = layers.concatenate(
    [left_features, right_features], axis=-1)

## 7.2.1: Using Callbacks to act on a model during training

Stopping training when the validation loss is no longer improving is achievable through a Keras callback.
A callback is an object (a class instance implementing specific methods) that is passed to the model in the call to fit
and that is called by the model at various points during training.
Examples of ways of using callbacks

Model checkpointing- Saving the current weights of the model at different points during training.

Early stopping- Interrupting training when the validation loss is no longer improving(optimization).

Dynamically adjusting the value of certain parameters during training, such as the learning rate of the optimizer.

Logging training and validation metrics during training, or visualizing the representations learned by the model as they are updated.

The Keras progress bar is a callback

The keras.callbacks module includes a number of built-in callbacks, such as:
    
    keras.callbacks.ModelCheckpoint
    keras.callbacks.EarlyStopping
    keras.callbacks.LearningRateScheduler
    keras.callbacks.ReduceLROnPlateau
    keras.callbacks.CSVLogger


### THE MODEL CHECKPOINT AND EARLYSTOPPING CALLBACKS

The EarlyStopping callback interrupts training as soon as overfitting starts, which avoids having to retrain the model
for a smaller number of epochs. It is typically used in combination with ModelCheckpoint, which continually saves the model
during training (and, optionally, saves only the best model so far: the version that achieved the best performance at the end of an epoch):

In [None]:
import keras

# Callbacks are passed to the model via the callbacks argument in fit,
# which takes a list of callbacks. You can pass any number of callbacks.
callbacks_list = [
    keras.callbacks.EarlyStopping(    # Interrupts training when improvement stops
        monitor='val_acc',            # Monitors the model's validation accuracy ('acc' would watch training accuracy)
        patience=1,                   # Interrupts training when accuracy has stopped improving
                                      # for more than one epoch (that is, two epochs)
    ),
    keras.callbacks.ModelCheckpoint(  # Saves the current weights after every epoch
        filepath='my_model.h5',       # Path to the destination model file
        monitor='val_loss',           # These two arguments mean the model file won't be overwritten unless
        save_best_only=True,          # val_loss has improved, which keeps the best model seen during training
    )
]

# Accuracy is monitored, so it must be part of the model's metrics.
# With metrics=['acc'], the validation metric is logged as 'val_acc'.
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['acc'])

# Note: since the callbacks monitor validation loss and validation accuracy,
# validation_data needs to be passed to the call to fit.
model.fit(x, y,
          epochs=10,
          batch_size=32,
          callbacks=callbacks_list,
          validation_data=(x_val, y_val))

### THE REDUCELRONPLATEAU CALLBACK

Reduce learning rate when validation loss stops improving. 
Reducing or increasing the learning rate in case of a loss plateau is an effective strategy to get out of local
minima during training. Example below

In [None]:
callbacks_list = [
    keras.callbacks.ReduceLROnPlateau(
        monitor='val_loss',            # Monitors the model's validation loss
        factor=0.1,                    # Divides the learning rate by 10 when triggered
        patience=10,                   # The callback is triggered after val_loss has stopped improving for 10 epochs
    )
]

# Note: since the callback monitors the validation loss, validation_data
# needs to be passed to the call to fit.
model.fit(x, y,
          epochs=10,
          batch_size=32,
          callbacks=callbacks_list,
          validation_data=(x_val, y_val))

### WRITING OWN CALLBACK

In [None]:
Callbacks are implemented by subclassing keras.callbacks.Callback.
You can then implement any number of the following methods:
    on_epoch_begin     # Called at the start of every epoch
    on_epoch_end     # Called at the end of every epoch
    
    on_batch_begin     # Called right before processing each batch
    on_batch_end     # Called right after processing each batch
    
    on_train_begin   # Called at the start of training
    on_train_end   # Called at the end of training

These methods are all called with a logs argument, a dict containing information about the previous batch, epoch, or
training run: training and validation metrics, and so on. The callback also has access to the following attributes:
    self.model -- The model instance from which the callback is being called
    self.validation_data -- The value of what was passed to fit as validation data


In [None]:
import keras 
import numpy as np

class ActivationLogger(keras.callbacks.Callback):
    """Callback that saves the activations of every layer at the end of each epoch.

    The activations are computed on the first sample of the validation data
    and written to ``activations_at_epoch_<epoch>.npz`` in the current
    working directory.
    """

    def set_model(self, model):
        """Store the model and build a helper model exposing every layer's output.

        Called by the parent model before training, to inform the callback of
        what model will be calling it.
        """
        self.model = model
        layer_outputs = [layer.output for layer in model.layers]
        # Model instance that returns the activations of every layer.
        self.activations_model = keras.models.Model(model.input,
                                                    layer_outputs)

    def on_epoch_end(self, epoch, logs=None):
        """Compute and save the activations for the first validation sample.

        Raises:
            RuntimeError: if no validation data is available on the callback.
        """
        # getattr: a missing attribute should give the same clear error as None.
        validation_data = getattr(self, 'validation_data', None)
        if validation_data is None:
            raise RuntimeError('Requires validation_data.')
        # Obtains the first input sample of the validation data.
        validation_sample = validation_data[0][0:1]
        activations = self.activations_model.predict(validation_sample)
        # A model with a single output returns one array rather than a list.
        if not isinstance(activations, (list, tuple)):
            activations = [activations]
        # .npz is a binary format, so the file is opened in binary mode; the
        # context manager closes it even if saving fails. Each layer's
        # activations are stored as a separate array (arr_0, arr_1, ...),
        # because per-layer shapes differ and cannot form one regular array.
        with open('activations_at_epoch_' + str(epoch) + '.npz', 'wb') as f:
            np.savez(f, *activations)

## 7.2.2: Introduction to TensorBoard: the TensorFlow visualization framework

A browser-based visualization tool that comes packaged with TensorFlow. It is only available for Keras models when Keras is used with the TensorFlow backend.
TensorBoard gives access to several neat features in your browser, as follows:
    Visualizing monitoring metrics during training
    Visualizing your model architecture
    Visualizing histograms of activations and gradients
    Exploring embeddings in 3D

## LISTING 7.7: Text classification model to use with TensorBoard

In [None]:
import keras
from keras import layers
from keras.datasets import imdb
from keras.preprocessing import sequence

max_features = 2000         # Number of words to consider as features
max_len = 500               # Cuts off texts after this number of words (among the max_features most common words)

(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
x_train = sequence.pad_sequences(x_train, maxlen=max_len)
x_test = sequence.pad_sequences(x_test, maxlen=max_len)

model = keras.models.Sequential()
model.add(layers.Embedding(max_features, 128,
                           input_length=max_len,
                           name='embed'))
model.add(layers.Conv1D(32, 7, activation='relu'))
model.add(layers.MaxPooling1D(5))
model.add(layers.Conv1D(32, 7, activation='relu'))
# Global pooling collapses the time axis so that Dense(1) produces a single
# score per sample; a plain MaxPooling1D would leave a 3D tensor here.
model.add(layers.GlobalMaxPooling1D())
model.add(layers.Dense(1))
model.summary()
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['acc'])

## LISTING 7.8 Creating a directory for TensorBoard log files

In [None]:
import os

# Creates the TensorBoard log directory; exist_ok makes re-running the cell harmless.
os.makedirs('my_log_dir', exist_ok=True)

## LISTING 7.9: Training the model with a TensorBoard callback

In [None]:
# A single TensorBoard callback:
#   log_dir         -- log files will be written at this location
#   histogram_freq  -- records activation histograms every 1 epoch
#   embeddings_freq -- records embedding data every 1 epoch
callbacks = [
    keras.callbacks.TensorBoard(log_dir='my_log_dir',
                                histogram_freq=1,
                                embeddings_freq=1),
]

# Trains with 20% of the training data held out for validation.
history = model.fit(
    x_train,
    y_train,
    batch_size=128,
    epochs=20,
    validation_split=0.2,
    callbacks=callbacks,
)

# Launch the TensorBoard server from the command line with: $ tensorboard --logdir=my_log_dir
# Then browse to http://localhost:6006 to see live graphs of the training and validation metrics, and the Histograms tab.


In [None]:
# The keras.utils.plot_model utility plots a model as a graph of layers.
# Requirements: the Python pydot and pydot-ng libraries, as well as the graphviz library, must be installed.

In [None]:
from keras.utils import plot_model

# Renders the model's graph of layers to an image file.
plot_model(model=model, to_file='model.png')

In [None]:
# Display shape info in graph of layers.

In [None]:
from keras.utils import plot_model

# Same plot, with shape information displayed in the graph of layers.
plot_model(model=model, to_file='model.png', show_shapes=True)

### 7.2.3: WRAPPING UP

Keras callbacks provide a simple way to monitor models during training and automatically take action based on the state of the model.
When you are using TensorFlow, TensorBoard is a great way to visualize model activity in your browser.

### 7.3.1: Advance Architecture Patterns: Batch Normalization and Depthwise Separable Convolution

### BATCH NORMALIZATION

normalized_data = (data - np.mean(data, axis=...)) / np.std(data, axis=...)

The main effect of batch normalization is that it helps with gradient propagation, much like residual connections,
and thus allows for deeper networks. BatchNormalization is used liberally in many of the advanced convnet architectures
that come packaged with Keras, such as ResNet50, Inception V3, and Xception.

The BatchNormalization layer is typically used after a convolution or densely connected layer.

# After a Conv layer
conv_model.add(layers.Conv2D(filters=32, kernel_size=3, activation='relu'))
conv_model.add(layers.BatchNormalization())

# After a Dense layer
dense_model.add(layers.Dense(units=32, activation='relu'))
dense_model.add(layers.BatchNormalization())

The BatchNormalization layer takes an axis argument, which specifies the feature axis to be normalized. This argument
defaults to -1, the last axis of the input tensor. This is the correct value when using Dense layers, Conv1D layers,
RNN layers, and Conv2D layers with data_format set to "channels_last". For Conv2D layers with data_format set to
"channels_first", the features axis is axis 1, so the axis argument of BatchNormalization should be set to 1.

### DEPTHWISE SEPARABLE CONVOLUTION

SeparableConv2D provides a drop-in replacement for Conv2D that makes the model lighter (fewer trainable weight
parameters) and faster (fewer floating-point operations), and can make it perform a few percentage points better on its task.
This layer performs a spatial convolution on each channel of its input, independently, before mixing the output channels
via a pointwise convolution (1 x 1). This is akin to separating the learning of spatial features from the learning of
channel-wise features.

This strategy works when training small models from scratch on limited data for instance, 
a lightweight depthwise separable convnet for an image-classification task(softmax categorical classification)
on a small dataset

In [None]:
from keras.models import Sequential, Model
from keras import layers

height, width, channels = 64, 64, 3
num_classes = 10

# Lightweight depthwise-separable convnet: three pairs of separable
# convolutions, then a small dense classifier with a softmax output.
model = Sequential([
    layers.SeparableConv2D(32, 3,
                           activation='relu',
                           input_shape=(height, width, channels)),
    layers.SeparableConv2D(64, 3, activation='relu'),
    layers.MaxPooling2D(2),

    layers.SeparableConv2D(64, 3, activation='relu'),
    layers.SeparableConv2D(128, 3, activation='relu'),
    layers.MaxPooling2D(2),

    layers.SeparableConv2D(64, 3, activation='relu'),
    layers.SeparableConv2D(128, 3, activation='relu'),
    layers.GlobalAveragePooling2D(),

    layers.Dense(32, activation='relu'),
    layers.Dense(num_classes, activation='softmax'),
])

model.compile(loss='categorical_crossentropy', optimizer='rmsprop')



## 7.3.2 HyperParameter Optimization

In [None]:
Process of optimizing hyperparameters:-
    1. Choose a set of hyperparameters (automatically).
    2. Build the corresponding model.
    3. Fit it to your training data, and measure the final performance on the validation_data.
    4. Choose the next set of hyperparameters to try (automatically).
    5. Repeat.
    6. Eventually, measure performance on your test data.
    

### 7.3.3: Model ensembling

Model ensembling consists of pooling together the predictions of a set of different models to produce better predictions.
It assumes that different good models trained independently are likely to be good for different reasons: each model
looks at different aspects of the data and so gets part of the truth.
The easiest way to pool the predictions of a set of classifiers (to ensemble the classifiers) is to average
their predictions at inference time:

In [None]:
# Use four different models to compute initial predictions.
# model_a..model_d and x_val are assumed to be defined elsewhere.
preds_a = model_a.predict(x_val)
preds_b = model_b.predict(x_val)
preds_c = model_c.predict(x_val)
preds_d = model_d.predict(x_val)

# Plain average of the four predictions; this new prediction array should be
# more accurate than any of the initial ones.
final_preds = 0.25 * (preds_a + preds_b + preds_c + preds_d)

# This works only if the classifiers are more or less equally good. If one of them is significantly worse than the
# others, the final predictions may not be as good as those of the best classifier of the group.

In [None]:
# A smarter way to ensemble classifiers is to do a weighted average, where the
# weights are learned on the validation data: typically, the better classifiers
# are given a higher weight, and the worse classifiers a lower weight.
# model_a..model_d and x_val are assumed to be defined elsewhere.

preds_a = model_a.predict(x_val)
preds_b = model_b.predict(x_val)
preds_c = model_c.predict(x_val)
preds_d = model_d.predict(x_val)

# The weights (0.5, 0.25, 0.1, 0.15) sum to 1 and are assumed to have been
# learned empirically. No constant term is added to the weighted sum.
final_preds = 0.5 * preds_a + 0.25 * preds_b + 0.1 * preds_c + 0.15 * preds_d

# Diversity is the key here