In [91]:
#import necessary library.
import tensorflow as tf
import keras
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np

In [92]:
# Load Fashion-MNIST and carve a validation set out of the training data.
fashion_mnist = keras.datasets.fashion_mnist
(x_train_full, y_train_full), (x_test, y_test) = fashion_mnist.load_data()

# First 5,000 training samples -> validation set, the remainder -> training set.
# Dividing by 255.0 rescales the 0-255 pixel intensities to the [0, 1] range.
# NOTE: x_test is NOT rescaled in this cell; divide it by 255.0 as well before
# evaluating or predicting on it.
x_valid, x_train = x_train_full[:5000] / 255.0, x_train_full[5000:] / 255.0
y_valid, y_train = y_train_full[:5000], y_train_full[5000:]

# Human-readable name for each integer class id (position i names class i).
classes_name = ['top', 'trouser', 'pullover', 'dress', 'coat',
                'sandal', 'shirt', 'sneaker', 'bag', 'ankle_boot']

In [93]:
# Model building: a fully connected classifier for 28x28 inputs, with batch
# normalization after the flattened input and in each hidden block.
model = keras.models.Sequential([
    keras.layers.Flatten(input_shape=[28, 28]),
    keras.layers.BatchNormalization(),
    # Hidden block 1: Dense -> BatchNormalization -> ELU
    keras.layers.Dense(300, kernel_initializer='he_normal'),
    keras.layers.BatchNormalization(),
    keras.layers.Activation('elu'),
    # Hidden block 2: Dense -> ELU -> BatchNormalization (activation first here)
    keras.layers.Dense(100, kernel_initializer='he_normal'),
    keras.layers.Activation('elu'),
    keras.layers.BatchNormalization(),
    # Output layer: 10 units with a softmax activation
    keras.layers.Dense(10, activation="softmax"),
])

### Why batch normalization?
> To reduce internal covariate shift

The key issue that batch normalisation tackles is internal covariate shift. Internal covariate shift occurs due to the very nature of neural networks. At every epoch of training, the weights are updated and different data is processed, which means that the inputs to a neuron are slightly different every time. As these changes get passed on to the next neuron, the input distribution of every neuron ends up being different at every epoch.

Normally, this is not a big deal, but in deep networks, these small changes in input distribution add up fast and amplify greatly deeper into the network. Ultimately, the input distribution received by the deepest neurons changes greatly between every epoch.

As a result, these neurons need to continuously adapt to the changing input distribution, meaning that their learning capabilities are severely bottlenecked. This constantly changing input distribution is called internal covariate shift.

> To mitigate the vanishing and exploding gradient problems



##### Gradient clipping
> Another popular technique to lessen the exploding gradient problem is to simply clip the gradients during backpropagation so that they never exceed some threshold. This is called **gradient clipping**. It is implemented as follows:


In [110]:
# SGD optimizer with per-component gradient clipping enabled.
optimizer = tf.keras.optimizers.SGD(
    clipvalue=1.0,  # clipping threshold applied to each gradient component
)

> This will clip every component of the gradient vector to a value between -1.0 and 1.0.

In [94]:
# Print the layer-by-layer summary of the model: layer type, output shape
# and parameter count.
model.summary()

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_8 (Flatten)          (None, 784)               0         
_________________________________________________________________
batch_normalization_13 (Batc (None, 784)               3136      
_________________________________________________________________
dense_22 (Dense)             (None, 300)               235500    
_________________________________________________________________
batch_normalization_14 (Batc (None, 300)               1200      
_________________________________________________________________
activation_15 (Activation)   (None, 300)               0         
_________________________________________________________________
dense_23 (Dense)             (None, 100)               30100     
_________________________________________________________________
activation_16 (Activation)   (None, 100)              

In [95]:
# Configure training: Adam optimizer, sparse categorical cross-entropy loss,
# and accuracy reported as the metric.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

> If your targets are one-hot encoded, use categorical_crossentropy.

> But if your targets are integers, use sparse_categorical_crossentropy

In [96]:
# Learning-rate schedule factory for the LearningRateScheduler callback.
# The learning rate decreases exponentially after each epoch.
def decay(lr0, s):
    """Build an exponential-decay schedule: lr(epoch) = lr0 * 0.1 ** (epoch / s).

    Args:
        lr0: learning rate at epoch 0.
        s: number of epochs over which the learning rate shrinks by a factor of 10.

    Returns:
        A function that maps an epoch index to the learning rate for that epoch.
    """
    def exponential_decay(epoch):
        # Power (`**`), not product (`*`): a product would yield 0 at epoch 0
        # and then grow linearly with the epoch instead of decaying.
        return lr0 * 0.1 ** (epoch / s)
    return exponential_decay

exponential_decay_fn = decay(lr0=0.01, s=20)

In [97]:
# Callbacks: stop early after 5 epochs without improvement, and set the
# learning rate at each epoch from the schedule function.
early_stopping_cb = keras.callbacks.EarlyStopping(patience=5)
lr_scheduler_cb = keras.callbacks.LearningRateScheduler(exponential_decay_fn)

# Train for at most 200 epochs, validating on the held-out validation split.
model.fit(
    x_train,
    y_train,
    epochs=200,
    validation_data=(x_valid, y_valid),
    callbacks=[early_stopping_cb, lr_scheduler_cb],
)

Train on 55000 samples, validate on 5000 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200


<keras.callbacks.callbacks.History at 0x208027dc550>

### Learning rate scheduling

#### Performance scheduling
> Performance scheduling uses the `ReduceLROnPlateau` callback. If you pass the following callback to the `fit` method, it will multiply the learning rate by 0.5 whenever the best validation loss does not improve for 5 consecutive epochs.

In [98]:
# Performance-scheduling callback (factor=0.5, patience=5).
# NOTE: the instance is only constructed and displayed here; it is not
# assigned to a name or passed to model.fit in this cell.
keras.callbacks.ReduceLROnPlateau(factor=0.5, patience=5)

<keras.callbacks.callbacks.ReduceLROnPlateau at 0x20802280a90>

#### Piecewise scheduling
> For piecewise constant scheduling, you can use a function like the following one, then create a `LearningRateScheduler` callback with this function and pass it to the `fit` method.

In [99]:
def picewise_fn(epoch):
    """Return the piecewise-constant learning rate for the given epoch.

    Epochs below 5 use 0.01, epochs below 15 use 0.005, and every later
    epoch uses 0.001.
    """
    # (exclusive upper epoch bound, learning rate), checked in ascending order.
    for upper_bound, rate in ((5, 0.01), (15, 0.005)):
        if epoch < upper_bound:
            return rate
    return 0.001

#### Exponential scheduling
> For exponential scheduling, you can use a function like the following one, then create a `LearningRateScheduler` callback with this function and pass it to the `fit` method.

In [109]:
def decay(lr0, s):
    """Build an exponential-decay schedule: lr(epoch) = lr0 * 0.1 ** (epoch / s).

    Args:
        lr0: learning rate at epoch 0.
        s: number of epochs over which the learning rate shrinks by a factor of 10.

    Returns:
        A function that maps an epoch index to the learning rate for that epoch.
    """
    def exponential_decay(epoch):
        # Power (`**`), not product (`*`): a product would yield 0 at epoch 0
        # and then grow linearly with the epoch instead of decaying.
        return lr0 * 0.1 ** (epoch / s)
    return exponential_decay

exponential_decay_fn = decay(lr0=0.01, s=20)

#### or

In [108]:
# ExponentialDecay counts optimizer steps (one per batch), not epochs, so the
# decay period has to be expressed in steps. A value of 10 would divide the
# learning rate by 10 every 10 batches; use 20 epochs' worth of steps instead,
# matching decay(lr0=0.01, s=20).
batch_size = 32  # default batch size of model.fit when none is given
steps_per_epoch = -(-len(x_train) // batch_size)  # ceiling division
s = 20 * steps_per_epoch
lr = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=0.01,
    decay_steps=s,
    decay_rate=0.1,
)
tf.keras.optimizers.SGD(learning_rate=lr)

<tensorflow.python.keras.optimizer_v2.gradient_descent.SGD at 0x2082647f8e0>