In [None]:
'''
Reduce learning rate as the training progresses
    - When training deep neural networks, it is often useful to reduce learning rate as the training progresses
    
    - It can be done using the methods below
        - Learning rate schedules
        - Adaptive learning rate methods
'''

In [None]:
'''
Learning Rate Schedule :

    - Common learning rate schedules
            - Time-based decay
            - Step decay 
            - Exponential decay
'''

In [None]:
'''
    Time Based Decay
        - lr = lr0 * 1/(1 + decay * epoch)
        - lr = learning rate 
        - lr0 = previous or initial learning rate
        - decay = hyperparameter
        - epoch = iteration value


Note : When the decay argument is zero (the default), this has no effect on the learning rate.
'''

In [None]:
'''
    Step Decay or Drop-Based Learning Rate Decay 
        - Step decay schedule drops the learning rate by a factor every few epochs. The mathematical form of step decay is
        - lr = lr0 * DropRate^floor(Epoch / EpochDrop)
            - DropRate is the amount that the learning rate is modified each time it is changed such as 0.5
            - Epoch is the current epoch number and EpochDrop is how often to change the learning rate such as 10

'''

In [None]:
'''
    Exponential Decay  : Exponentially decreases the learning rate over time from starting point
        - lr = lr0 * e^(−decay * t)
            - lr0 = initial learning rate
            - decay = decay hyperparameter
            - t = iteration number
'''

In [None]:
'''
    Polynomial decay
        - decay = (1 - (epoch / float(maxEpochs))) ** power
                - maxEpochs : The total number of epochs we’ll be training for
                - power : The power/exponent of the polynomial

        - lr = lr0 * decay
                -  lr0 = initial learning rate
'''

In [None]:
'''
    Adaptive Learning Rate Methods
        - The challenge of using learning rate schedules is that their hyperparameters have to be defined in 
          advance and they depend heavily on the type of model and problem.
        - Another problem is that the same learning rate is applied to all parameter updates.

    - Optimizers that implement adaptive learning rates
        - Adagrad
        - Adadelta
        - RMSprop
        - Adam
'''

In [None]:
'''
Note 
 - Adaptive learning rate methods often demonstrate better performance than learning rate schedules
 - We can also use LearningRateScheduler in Keras to create custom learning rate schedules
 
 '''

In [None]:
# Create Simple Model

import math

from keras.callbacks import LearningRateScheduler
from keras.layers import Dense
from keras.models import Sequential
from keras.optimizers import SGD
from keras.optimizers.schedules import InverseTimeDecay
from pandas import read_csv
from sklearn.preprocessing import LabelEncoder

# Read the raw table; the CSV has no header row
dataframe = read_csv("ionosphere.csv", header=None)
dataset = dataframe.values

# Columns 0-33 are cast to float and used as inputs; column 34 is the class label
X = dataset[:, :34].astype(float)

# Map the class labels to integer codes
encoder = LabelEncoder()
Y = encoder.fit_transform(dataset[:, 34])

# Network: one 34-unit ReLU hidden layer feeding a single sigmoid output
model = Sequential([
    Dense(34, input_dim=34, activation='relu'),
    Dense(1, activation='sigmoid'),
])

In [None]:
# Constant learning rate
#
# Baseline run: plain SGD with a fixed step size and no schedule.

# Number of training epochs. Defined in this cell so it runs on a fresh
# kernel instead of relying on a value assigned by a later cell.
epochs = 50

# Use the SGD class imported from keras, the same package the model comes from
sgd = SGD(learning_rate=0.01)

# The model ends in a single sigmoid unit, so the matching loss is binary
# cross-entropy (the same loss the other training cells use).
model.compile(optimizer=sgd, loss='binary_crossentropy', metrics=['accuracy'])

# Fit the model, holding out 33% of the rows for validation
model.fit(X, Y, validation_split=0.33, epochs=epochs, batch_size=28, verbose=2)

In [None]:
# Time Based Learning Rate Decay

# Hyperparameters
epochs = 50
learning_rate = 0.1
decay_rate = learning_rate / epochs
momentum = 0.8

# lr = learning_rate / (1 + decay_rate * step), evaluated at every optimizer
# update. With decay_steps=1 this is the same per-update decay that the legacy
# `decay=` optimizer argument applied; current Keras optimizers no longer
# accept `decay=` / `lr=`, so the decay is expressed as a schedule object.
lr_schedule = InverseTimeDecay(
    initial_learning_rate=learning_rate,
    decay_steps=1,
    decay_rate=decay_rate,
)

# Compile model
sgd = SGD(learning_rate=lr_schedule, momentum=momentum, nesterov=False)
model.compile(loss='binary_crossentropy', optimizer=sgd, metrics=['accuracy'])

# Fit the model, holding out 33% of the rows for validation
model.fit(X, Y, validation_split=0.33, epochs=epochs, batch_size=28, verbose=2)

In [None]:
# Drop-Based Learning Rate Decay

# learning rate schedule
def step_decay(epoch):
    """Return the step-decayed learning rate for a given epoch index.

    The rate starts at 0.1 and is multiplied by 0.5 each time
    ``1 + epoch`` crosses a multiple of 10.

    Args:
        epoch: Epoch index passed in by the scheduler callback.

    Returns:
        The learning rate to use for that epoch, as a float.
    """
    base_rate, factor, period = 0.1, 0.5, 10.0
    # Number of drops applied so far; the +1 treats the epoch index as 1-based
    drops_so_far = math.floor((1 + epoch) / period)
    return base_rate * math.pow(factor, drops_so_far)
 
# Compile model
# The initial rate of 0.0 is only a placeholder: the LearningRateScheduler
# callback below sets the optimizer's learning rate from step_decay at the
# start of every epoch.
sgd = SGD(learning_rate=0.0, momentum=0.9)
model.compile(loss='binary_crossentropy', optimizer=sgd, metrics=['accuracy'])

# learning schedule callback
lrate = LearningRateScheduler(step_decay)
callbacks_list = [lrate]

# Fit the model, holding out 33% of the rows for validation
model.fit(X, Y, validation_split=0.33, epochs=50, batch_size=28, callbacks=callbacks_list, verbose=2)

In [None]:
# Exponential decay

initial_learning_rate = 0.01


def lr_exp_decay(epoch, lr):
    """Return the exponentially decayed learning rate for an epoch index.

    Computes ``initial_learning_rate * exp(-0.1 * epoch)``.

    Args:
        epoch: Epoch index passed in by the scheduler callback.
        lr: Current learning rate passed in by the callback; accepted for
            signature compatibility but not used.

    Returns:
        The learning rate to use for that epoch, as a float.
    """
    k = 0.1  # decay constant
    exponent = -k * epoch
    return initial_learning_rate * math.exp(exponent)

# Fit the model to the training data
# Uses the X / Y arrays loaded earlier in the notebook (no X_train / y_train
# split is created anywhere above); validation_split holds out 20% of the rows.
# The callback applies lr_exp_decay at the start of each epoch and, with
# verbose=1, prints the rate it sets.
history_exp_decay = model.fit(
    X,
    Y,
    epochs=100,
    validation_split=0.2,
    batch_size=64,
    callbacks=[LearningRateScheduler(lr_exp_decay, verbose=1)],
)
