# Batch normalisation


In [None]:
# Classifier for 28x28 inputs with a BatchNormalization layer placed after
# the flattened input and after each hidden Dense layer.
model = keras.models.Sequential()
model.add(keras.layers.Flatten(input_shape=[28, 28]))
model.add(keras.layers.BatchNormalization())

# Hidden layers: ELU activation paired with He-normal initialisation.
model.add(
    keras.layers.Dense(300, activation="elu", kernel_initializer="he_normal")
)
model.add(keras.layers.BatchNormalization())
model.add(
    keras.layers.Dense(100, activation="elu", kernel_initializer="he_normal")
)
model.add(keras.layers.BatchNormalization())

# Output layer: 10 units with softmax.
model.add(keras.layers.Dense(10, activation="softmax"))


# Transfer learning (TL): reusing model A's layers directly means that training model B also modifies model A


In [None]:

# Load the previously saved model A from disk.
model_A = keras.models.load_model("my_model_A.h5")

# Build model B from every layer of model A except its output layer, followed
# by a new single-unit sigmoid output layer. The reused layers are the very
# same layer objects as in model A (no copy is made here).
model_B_on_A = keras.models.Sequential(
    [
        *model_A.layers[:-1],
        keras.layers.Dense(1, activation="sigmoid"),
    ]
)


### Freeze layers: the new output layer is randomly initialised, so its large initial gradients should not propagate back into the reused layers at first


In [None]:

# Freeze every layer except the last (output) one so that only the output
# layer is updated during the first training phase.
for layer in model_B_on_A.layers[:-1]:
    layer.trainable = False

# Compile once, after all layers have been frozen. Keras requires a compile
# after changing `trainable`; doing it inside the loop would needlessly
# recompile the model once per layer.
model_B_on_A.compile(
    loss="binary_crossentropy",
    optimizer="sgd",
    metrics=["accuracy"],
)


### Unfreezing layers: after a few epochs, we can unfreeze the lower (reused) layers, but with a reduced learning rate


In [None]:
# Phase 1: train for 4 epochs with the model as currently compiled.
history = model_B_on_A.fit(
    X_train_B,
    y_train_B,
    epochs=4,
    validation_data=(X_valid_B, y_valid_B),
)

# Phase 2: make every layer except the output layer trainable again and
# continue training with a smaller learning rate.
for layer in model_B_on_A.layers[:-1]:
    layer.trainable = True

# `learning_rate` is the supported argument name; `lr` is a deprecated alias
# that newer Keras releases no longer accept. The default learning rate is 1e-2.
optimizer = keras.optimizers.SGD(learning_rate=1e-4)

# The model must be recompiled for the `trainable` changes to take effect.
model_B_on_A.compile(
    loss="binary_crossentropy",
    optimizer=optimizer,
    metrics=["accuracy"],
)

# NOTE: this rebinds `history`, so the phase-1 history object is discarded.
history = model_B_on_A.fit(
    X_train_B,
    y_train_B,
    epochs=16,
    validation_data=(X_valid_B, y_valid_B),
)


In [None]:
# With a clone, model A is not affected by later training of the copy.
# clone_model() is followed by an explicit weight copy so that the clone
# carries model A's weights (see the Keras clone_model documentation).
model_A_clone = keras.models.clone_model(model_A)
model_A_clone.set_weights(model_A.get_weights())

# Optimizers 

In [None]:
# SGD with momentum.
# `learning_rate` is the supported argument name; `lr` is a deprecated alias
# that newer Keras releases no longer accept.
optimizer = keras.optimizers.SGD(learning_rate=0.001, momentum=0.9)

# SGD with Nesterov momentum.
# NOTE: this rebinds `optimizer`, replacing the plain-momentum instance above.
optimizer = keras.optimizers.SGD(learning_rate=0.001, momentum=0.9, nesterov=True)
