# Import stuff

In [1]:
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import pandas as pd

# Key Idea is that weights have to be randomly initialized
- Glorot -> None, tanh, softmax, logistic
- He -> Relu and Relu variants(RRelu, PRelu, ELU)
- Lecun -> Selu
  - ***Selu can only be used if the model is sequential, the inputs are standardized, kernel_init is lecun_normal, and all layers have to be Dense***

In [None]:
# Use kernel_initializer
tf.keras.layers.Dense(10, activation="selu", kernel_initializer="lecun_uniform")

<tensorflow.python.keras.layers.core.Dense at 0x7fec074a1050>

**SELU > ELU > LeakyRelu(other variants too) > Relu > tanh > logistic**

# To apply activation functions onto models
- Place the activation function right after the layer you want to apply it to for variants of LeakyRelu and LeakyRelu

In [None]:
# LeakyRelus, PRelus
tf.keras.layers.Dense(30)
tf.keras.layers.LeakyReLU(alpha=0.2) # 0.2 is often pretty good

# Selu -> Self-normalizes the whole model so it solves exploding/vanishing gradients
tf.keras.layers.Dense(10, activation="selu", kernel_initializer="lecun_uniform") # Have to have lecun

# Using Batch Normalization
- Always use this because it has a lot of good stuff and has a LOT OF BENEFITS
- Normalizes the inputs of the layers so if you do that right after the input/Flatten, you don't need to use StandardScaler to normalize
  - Has four parameters gamma(output scale), beta(output offset), mu(mean), sigma(std dev)

In [None]:
model = tf.keras.Sequential([
  tf.keras.layers.Flatten(input_shape=(28,28)),
  tf.keras.layers.BatchNormalization(),
  tf.keras.layers.Dense(300, activation="elu", kernel_initializer="he_uniform"),   
  tf.keras.layers.BatchNormalization(),
  tf.keras.layers.Dense(100, activation="elu", kernel_initializer="he_uniform"),
  tf.keras.layers.BatchNormalization(),
  tf.keras.layers.Dense(10, activation="softmax")                           
])

In [None]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_1 (Flatten)          (None, 784)               0         
_________________________________________________________________
batch_normalization_1 (Batch (None, 784)               3136      
_________________________________________________________________
dense_2 (Dense)              (None, 300)               235500    
_________________________________________________________________
batch_normalization_2 (Batch (None, 300)               1200      
_________________________________________________________________
dense_3 (Dense)              (None, 100)               30100     
_________________________________________________________________
batch_normalization_3 (Batch (None, 100)               400       
_________________________________________________________________
dense_4 (Dense)              (None, 10)                1

# Reusing pretrained stuffs
- Already trained a model to classify different fruits and veggies in a picture. Now you want to classify cars. The two models seem pretty similar so cna just reuse part of the first model.

When you want to reuse layers, note that the bottom hidden layers are the most important and as you go further to the output layer, they become less and less important. Also, when you reuse layers, freeze the bottom few layers to prevent those hyperparameters from being tweaked.


Certain situations, though similar to the original model classification, might need you to drop certain layers and freeze others.

In [None]:
# Let's pretend there we already had a good classifier from the MNIST fashion stored in my_model_A.h5
model_A = tf.keras.models.load_model("my_model_A.h5") # Loads model
model_A_clone = tf.keras.models.clone_model(model_A)  # Clone the model because if you don't, when you manipulate the weights of model_B_on_A,
                                                      # you're going to be changing model_A's weights too.
model_A_clone.set_weights(model_A.get_weights())      # When you clone, it only returns the structure, not the weights. So, here you get and set the weights

model_B_on_A = tf.keras.Sequential(model_A_clone.layers[:-1]) # Simply pass in the layers. This basically takes in all layers except for the last one(output)
                                                              # so you are basically reusing the layers.
model_B_on_A.add(tf.keras.layers.Dense(1, activation="sigmoid")) # Add in the output layer.

for layer in model_B_on_A.layers[:-1]:  # Freezes the reused layers
  layer.trainable = False;

model_B_on_A.compile(loss="binary_crossentropy", # Binary_ce because it was the situation at hand
                     otpimizer="sgd",
                     metrics=["accuracy"])

# Then you just fit the ting

## ***Big thing to notice here is that this binary classification is really simple and book says that transfer learning doesn't work well for simple models. USE IT FOR DEEP DEEP NEURAL NETS***

# Situations where you don't have much labeled training data

What you do is you perform an auxiliary task. This means to train your model on a similar task that you have lots of labeled training data for. In this way, this data will provide good lower hidden layers for your model's original task. Then you can just reuse the lower layers from the aux task for your original task

# Fast optimizers

We've alrady explored a few ways to make training faster and better: kernel_init, better activation functions, Batch Normalization, and reusing parts of similar models.

Next we will look at better optimizers than simple SGD/GD
- Best is Adam

## Momentum
- As you go down curve, you can go faster and faster(you input the momentum cap(0.9))
- Can get to the valley a lot quicker
- Can roll past local_minimas

In [None]:
optimizer = tf.keras.optimizers.SGD(lr=0.003, momentum=0.9)

  "The `lr` argument is deprecated, use `learning_rate` instead.")


## Adam
- Generally, always use Adam

In [None]:
optimizer = tf.keras.optimizers.Adam(learning_rate=0.0003, beta_1=0.9, beta_2=0.999)

# Regularizers
- We have already seen Batch Normalization which does also help out with regularization

In [2]:
layer = tf.keras.layers.Dense(300, activation="elu",
                              kernel_initializer="he_uniform",
                              kernel_regularizer=tf.keras.regularizers.l2(0.01))

## You are going to be initializing a lot of activations and layers and because most layers are going to be pretty much the save, you can use functools

In [6]:
from functools import partial

RegularizedDense = partial(tf.keras.layers.Dense,
                           activation="elu",
                           kernel_initializer="he_uniform",
                           kernel_regularizer=tf.keras.regularizers.l2(0.01))

model = tf.keras.Sequential([
  tf.keras.layers.Flatten(input_shape=(28,28)),
  RegularizedDense(300),
  RegularizedDense(100),                            
  RegularizedDense(300, activation="softmax",
                   kernel_initializer="glorot_uniform")
])

## Dropout regularizers
- Dropout works because it makes each neuron stand on its own, making it become more independent, which then makes the whole entire model more robust.
- ***If you are going to be using SELU, use alpha dropout.***
- **If regular Dropout is too strong, only use one dropout and place that right after the last hidden layer. You are only supposed to place it in the top 1-3 layers.**

In [9]:
RegularizedDense = partial(tf.keras.layers.Dense,
                           activation="selu",
                           kernel_initializer="lecun_uniform",
                           kernel_regularizer=tf.keras.regularizers.l2(0.001),
                           )

model = tf.keras.Sequential([
  tf.keras.layers.Flatten(input_shape=(28,28)),
  tf.keras.layers.BatchNormalization(),
  RegularizedDense(300),
  tf.keras.layers.BatchNormalization(),
  RegularizedDense(100),
  tf.keras.layers.AlphaDropout(0.3),
  tf.keras.layers.Dense(10, activation="softmax", kernel_initializer="glorot_uniform")
])

## Monte Carlo
- You should always use it for dropout but i don't know how to implement

In [None]:
#def predict_proba(X, model, num_samples):
#    preds = [model(X, training=True) for _ in range(num_samples)]
#    return np.stack(preds).mean(axis=0)
#     
#def predict_class(X, model, num_samples):
#    proba_preds = predict_proba(X, model, num_samples)
#    return np.argmax(proba_preds, axis=1)
#
#y_pred = predict_class(X_test, model, 100) ----> predicting with MC
#acc = np.mean(y_pred == y_test) ----> Answer

# Notes

- If you need a sparse model, use l1 reg
