# Batch normalisation


In [None]:
# MLP for 28x28 inputs in which a BatchNormalization layer precedes every
# Dense layer (including the output layer).
model = keras.models.Sequential()
model.add(keras.layers.Flatten(input_shape=[28, 28]))
model.add(keras.layers.BatchNormalization())
model.add(keras.layers.Dense(300, activation="elu",
                             kernel_initializer="he_normal"))
model.add(keras.layers.BatchNormalization())
model.add(keras.layers.Dense(100, activation="elu",
                             kernel_initializer="he_normal"))
model.add(keras.layers.BatchNormalization())
# 10-unit softmax output layer.
model.add(keras.layers.Dense(10, activation="softmax"))


# Transfer learning (TL) -> model_B_on_A reuses model A's layers, so training it affects model A as well


In [None]:

model_A = keras.models.load_model("my_model_A.h5")

# Reuse every layer of model A except its output layer. The layer objects
# are shared with model_A, not copied.
reused_layers = model_A.layers[:-1]
model_B_on_A = keras.models.Sequential(reused_layers)

# Attach a fresh single-unit sigmoid output layer for the new task.
new_output_layer = keras.layers.Dense(1, activation="sigmoid")
model_B_on_A.add(new_output_layer)


### Freeze layers: since the new output layer was initialised randomly, its gradients are huge at first and should not be propagated backwards into the reused layers initially


In [None]:

# Freeze every reused layer (all but the new output layer) so that the first
# epochs only train the freshly initialised output layer.
for layer in model_B_on_A.layers[:-1]:
    layer.trainable = False

# Compile once, after the loop. The model must be compiled after changing
# `trainable`, but doing so once per layer is redundant, and an empty slice
# would otherwise leave the model uncompiled.
model_B_on_A.compile(loss="binary_crossentropy", optimizer="sgd",
                     metrics=["accuracy"])


### Unfreezing layers: after a few epochs, we can unfreeze the lower (reused) layers, but should reduce the learning rate


In [None]:
# Phase 1: train for 4 epochs while the reused layers are still frozen.
history = model_B_on_A.fit(X_train_B, y_train_B, epochs=4,
                           validation_data=(X_valid_B, y_valid_B))

# Phase 2: after those 4 epochs, unfreeze the reused layers and continue
# training with a smaller learning rate.
for layer in model_B_on_A.layers[:-1]:
    layer.trainable = True

# `learning_rate` is the supported argument name; `lr` is a deprecated alias
# that newer Keras releases reject. The default learning rate is 1e-2.
optimizer = keras.optimizers.SGD(learning_rate=1e-4)

# Recompile so that the new `trainable` flags and the new optimizer are used.
model_B_on_A.compile(loss="binary_crossentropy", optimizer=optimizer,
                     metrics=["accuracy"])

# NOTE: this rebinds `history`, so the phase-1 History object is discarded.
history = model_B_on_A.fit(X_train_B, y_train_B, epochs=16,
                           validation_data=(X_valid_B, y_valid_B))


In [None]:
# with clone -> This does not affect model A
# The weights are copied across explicitly after cloning, so the clone starts
# from model A's current weights.

model_A_clone = keras.models.clone_model(model_A)
model_A_clone.set_weights(model_A.get_weights())

# Optimizers 

In [None]:
# momentum


# Momentum optimization. `learning_rate` is the supported argument name;
# `lr` is a deprecated alias that newer Keras releases reject.
optimizer = keras.optimizers.SGD(learning_rate=0.001, momentum=0.9)

# nesterov
# Same settings with Nesterov momentum enabled. This rebinds `optimizer`,
# replacing the plain-momentum one above.
optimizer = keras.optimizers.SGD(learning_rate=0.001, momentum=0.9,
                                 nesterov=True)


# Learning rate scheduling



### Power Scheduling

In [None]:
# Power scheduling: lr(step) = lr0 / (1 + decay * step), where decay = 1/s.
# With decay = 1e-4 (s = 10,000) the learning rate is halved after 10,000
# steps (not 1,000).
#
# The legacy `lr=0.01, decay=1e-4` optimizer arguments are rejected by newer
# Keras releases. InverseTimeDecay with decay_steps=1 computes the same
# formula and works on old and new versions alike.
lr_schedule = keras.optimizers.schedules.InverseTimeDecay(
    initial_learning_rate=0.01, decay_steps=1, decay_rate=1e-4)
optimizer = keras.optimizers.SGD(learning_rate=lr_schedule)


### Exponential Decay

In [None]:
def exponential_decay_fn(epoch):
    """Return the learning rate for `epoch`.

    Starts at 0.01 and is divided by 10 every 20 epochs.
    """
    initial_lr = 0.01
    epochs_per_tenfold_drop = 20
    return initial_lr * 0.1 ** (epoch / epochs_per_tenfold_drop)
# If you do not want to hardcode η0 (the initial learning rate) and s (the
# number of epochs per tenfold decay), you can create a function that returns
# a configured function:

# without hardcoding
    
def exponential_decay(lr0, s):
    """Build an exponential-decay schedule function.

    Args:
        lr0: learning rate returned for epoch 0.
        s: number of epochs over which the rate is divided by 10.

    Returns:
        A function mapping an epoch number to ``lr0 * 0.1 ** (epoch / s)``.
    """
    def exponential_decay_fn(epoch):
        exponent = epoch / s
        return lr0 * 0.1 ** exponent

    return exponential_decay_fn


# Schedule configured with the same values as the hardcoded variant.
exponential_decay_fn = exponential_decay(lr0=0.01, s=20)


In [None]:
# using it
# keras.callbacks.LearningRateScheduler -> calls the schedule function and
# updates the optimizer's learning rate at the beginning of each epoch
# NOTE(review): `[...]` looks like a placeholder for the remaining fit()
# arguments (e.g. epochs, validation data) - replace it before running.

lr_scheduler = keras.callbacks.LearningRateScheduler(exponential_decay_fn)
history = model.fit(X_train_scaled, y_train, [...], callbacks= [lr_scheduler])


In [None]:
# Performance scheduling: multiply the learning rate by `factor` whenever the
# monitored metric has stopped improving for `patience` consecutive epochs.

lr_scheduler = keras.callbacks.ReduceLROnPlateau(
    factor=0.5,
    patience=5,
)



# Regularizer 

In [None]:


# L2 regularization is attached to a layer through `kernel_regularizer`.
# The original line was a dangling keyword argument with an unbalanced ")",
# so it is completed here into a full Dense(...) call.
keras.layers.Dense(100, activation="elu", kernel_initializer="he_normal",
                   kernel_regularizer=keras.regularizers.l2(0.01))

# L1 regularization is created the same way. Called without an argument it
# uses the library's default regularization factor.
keras.regularizers.l1()

# Partial Function

In [None]:
# creates thin wrapper for any callable

from functools import partial
# Dense-layer factory with shared defaults: ELU activation, He-normal
# initialisation and L2 (0.01) kernel regularization. Any default can still
# be overridden per call.
RegularizedDense = partial(
    keras.layers.Dense,
    activation="elu",
    kernel_initializer="he_normal",
    kernel_regularizer=keras.regularizers.l2(0.01),
)


model = keras.models.Sequential()
model.add(keras.layers.Flatten(input_shape=[28, 28]))
model.add(RegularizedDense(300))
model.add(RegularizedDense(100))
# The output layer overrides the activation and initializer but keeps the
# L2 regularizer from the partial.
model.add(RegularizedDense(10, activation="softmax",
                           kernel_initializer="glorot_uniform"))

In [None]:
# Dropout: a Dropout layer (rate 0.2) precedes every Dense layer.
model = keras.models.Sequential()
model.add(keras.layers.Flatten(input_shape=[28, 28]))
# Two hidden blocks, each a Dropout layer followed by an ELU Dense layer.
for units in (300, 100):
    model.add(keras.layers.Dropout(rate=0.2))
    model.add(keras.layers.Dense(units, activation="elu",
                                 kernel_initializer="he_normal"))
# Final Dropout followed by the 10-unit softmax output layer.
model.add(keras.layers.Dropout(rate=0.2))
model.add(keras.layers.Dense(10, activation="softmax"))


Since dropout is only active during training, comparing the training loss and the
validation loss can be misleading. In particular, a model may be overfitting the
training set and yet have similar training and validation losses. So make sure to
evaluate the training loss without dropout (e.g., after training).


# Monte Carlo

In [None]:
# Monte Carlo estimate: run 100 forward passes over the test set with
# training=True, stack them along a new leading axis, then average over
# that axis.
mc_passes = [model(X_test_scaled, training=True) for _ in range(100)]
y_probas = np.stack(mc_passes)
y_proba = y_probas.mean(axis=0)


# Max Norm

In [None]:
# Dense layer whose kernel is constrained with max_norm(1.).
keras.layers.Dense(
    100,
    activation="elu",
    kernel_initializer="he_normal",
    kernel_constraint=keras.constraints.max_norm(1.),
)