<a href="https://colab.research.google.com/github/Vigneshthanga/258-Deep-Learning/blob/master/AdaptiveLearningRate.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##Magic lines to model reload

In [0]:
%matplotlib inline

#To reload all modules before executing a new line
%reload_ext autoreload
%autoreload 2

##Importing Tensorflow 2

In [3]:
%tensorflow_version 2.x
import tensorflow
print(tensorflow.__version__)
from keras.datasets import mnist
from matplotlib import pyplot as plt
import numpy as np

TensorFlow 2.x selected.
2.1.0


Using TensorFlow backend.


##Getting the data from Keras MNIST

In [4]:
(x_train, y_train), (x_test, y_test) = mnist.load_data()

Downloading data from https://s3.amazonaws.com/img-datasets/mnist.npz


##Reshaping the data

In [0]:
x_train = np.float32(x_train[:])
y_train = np.int32(np.array(y_train[:])).reshape(-1, 1)
x_test  = np.float32(x_test[:])
y_test  = np.int32(np.array(y_test[:])).reshape(-1, 1)

In [6]:
y_train.shape

(60000, 1)

In [7]:
import numpy as np


# stack together for next step
X = np.vstack((x_train, x_test))
y = np.vstack((y_train, y_test))

print(X.shape)
# one-hot encoding
digits = 10
examples = y.shape[0]
y = y.reshape(1, examples)
Y_new = np.eye(digits)[y.astype('int32')]
Y_new = Y_new.T.reshape(digits, examples)


# number of training set
m = 60000
m_test = X.shape[0] - m
X_train, X_test = X[:m].T, X[m:].T
Y_train, Y_test = Y_new[:, :m], Y_new[:, m:]

(70000, 28, 28)


In [7]:
Y_train.shape

(10, 60000)


##Defining model hyperparameters using Argparse


In [0]:
import argparse

parser = argparse.ArgumentParser()

parser.add_argument('--lr', type=float, default=0.25, help='learning rate')
parser.add_argument('--epochs', type=int, default=100,
                    help='number of epochs to train')
parser.add_argument('--n_x', type=int, default=784, help='number of inputs')
parser.add_argument('--n_h', type=int, default=784, help='number of hidden units')
parser.add_argument('--beta', type=float, default=0.9,
                    help='parameter for momentum')
parser.add_argument('--batch_size', type=int,
                    default=100, help='input batch size')

opt, unknown = parser.parse_known_args()

In [0]:
params = {"W1": np.random.randn(opt.n_h, opt.n_x) * np.sqrt(1. / opt.n_x),
          "b1": np.zeros((opt.n_h, 1)) * np.sqrt(1. / opt.n_x),
          "W2": np.random.randn(digits, opt.n_h) * np.sqrt(1. / opt.n_h),
          "b2": np.zeros((digits, 1)) * np.sqrt(1. / opt.n_h)}

##Weights

In [54]:
params["W1"]

array([[ 0.0294798 , -0.00161355, -0.03193292, ..., -0.05675496,
        -0.02373124, -0.02421886],
       [ 0.00990358, -0.01155237, -0.00126738, ...,  0.02461497,
         0.00641721,  0.02784202],
       [ 0.00243215,  0.01341848,  0.01726891, ..., -0.00685754,
         0.05479625, -0.04815378],
       ...,
       [ 0.00264246,  0.06827996, -0.02101158, ...,  0.09112652,
         0.0211752 , -0.00939152],
       [-0.00995382, -0.03183652, -0.00815587, ...,  0.02137897,
         0.00063046, -0.03446351],
       [ 0.024663  ,  0.0290985 , -0.04257172, ..., -0.06176446,
        -0.03706899,  0.08575193]])

##Sigmoid activation Function

In [0]:
def sigmoid(z):
    s = 1. / (1. + np.exp(-z))
    return s

##Loss function

In [0]:
def compute_loss(Y, Y_hat):
    L_sum = np.sum(np.multiply(Y, np.log(Y_hat)))
    m = Y.shape[1]
    L = -(1./m) * L_sum

    return L


##Forward Propogation

In [0]:
def feed_forward(X, params):
    cache = {}

    cache["Z1"] = np.matmul(params["W1"], X) + params["b1"]

    cache["A1"] = sigmoid(cache["Z1"])

    cache["Z2"] = np.matmul(params["W2"], cache["A1"]) + params["b2"]

    cache["A2"] = np.exp(cache["Z2"]) / np.sum(np.exp(cache["Z2"]), axis=0)

    return cache

##Back Propagation

In [0]:
def back_propagate(X, Y, params, cache, m_batch):
    dZ2 = cache["A2"] - Y

    dW2 = (1. / m_batch) * np.matmul(dZ2, cache["A1"].T)
    db2 = (1. / m_batch) * np.sum(dZ2, axis=1, keepdims=True)

    dA1 = np.matmul(params["W2"].T, dZ2)
    dZ1 = dA1 * sigmoid(cache["Z1"]) * (1 - sigmoid(cache["Z1"]))

    dW1 = (1. / m_batch) * np.matmul(dZ1, X.T)
    db1 = (1. / m_batch) * np.sum(dZ1, axis=1, keepdims=True)

    grads = {"dW1": dW1, "db1": db1, "dW2": dW2, "db2": db2}

    return grads

##Reshaping the data for number of units in the network

In [0]:
X_train = X_train.reshape(784, 60000)

In [0]:
X_test = X_test.reshape(784, 10000)

In [0]:
batches = int(60000/opt.batch_size)

In [62]:
batches

600

##Initializing the weights adjustment for adaptive learning rate to zero.

In [0]:
dW1 = 0.0
db1 = 0.0
dW2 = 0.0
db2 = 0.0

##Training the model

In [64]:
for i in range(opt.epochs):

    # shuffle training set
    permutation = np.random.permutation(X_train.shape[1])
    X_train_shuffled = X_train[:, permutation]
    Y_train_shuffled = Y_train[:, permutation]

    for j in range(batches):

        # get mini-batch
        begin = j * opt.batch_size
        end = min(begin + opt.batch_size, X_train.shape[1] - 1)
        X = X_train_shuffled[:, begin:end]
        Y = Y_train_shuffled[:, begin:end]
        m_batch = end - begin

        # forward and backward
        cache = feed_forward(X, params)
        grads = back_propagate(X, Y, params, cache, m_batch)

        # with momentum (optional)
        dW1 = (opt.beta * dW1 + (1. - opt.beta) * grads["dW1"])
        db1 = (opt.beta * db1 + (1. - opt.beta) * grads["db1"])
        dW2 = (opt.beta * dW2 + (1. - opt.beta) * grads["dW2"])
        db2 = (opt.beta * db2 + (1. - opt.beta) * grads["db2"])

        # gradient descent
        params["W1"] = params["W1"] - opt.lr * grads["dW1"]
        params["b1"] = params["b1"] - opt.lr * grads["db1"]
        params["W2"] = params["W2"] - opt.lr * grads["dW2"]
        params["b2"] = params["b2"] - opt.lr * grads["db2"]

    # forward pass on training set
    #print(X_train.shape)
    cache = feed_forward(X_train, params)
    train_loss = compute_loss(Y_train, cache["A2"])

    # forward pass on test set
    cache = feed_forward(X_test, params)
    test_loss = compute_loss(Y_test, cache["A2"])
    print("Epoch {}: training loss = {}, test loss = {}".format(
        i + 1, train_loss, test_loss))


  


Epoch 1: training loss = 0.6606738732421875, test loss = 0.6521100732622166
Epoch 2: training loss = 0.5251529244516097, test loss = 0.4976227324384374
Epoch 3: training loss = 1.3128759158967769, test loss = 1.3151056350428825
Epoch 4: training loss = 0.6908824201741226, test loss = 0.6798823345824773
Epoch 5: training loss = 0.6483697163155467, test loss = 0.6312633763675478
Epoch 6: training loss = 0.772048716333725, test loss = 0.7788567492136141
Epoch 7: training loss = 1.1041125122783837, test loss = 1.097442524248876
Epoch 8: training loss = 1.5282761196889587, test loss = 1.5390180586197064
Epoch 9: training loss = 0.7276709808010356, test loss = 0.7107148371475225
Epoch 10: training loss = 0.5920808651427377, test loss = 0.5822078903466904
Epoch 11: training loss = 0.7227974979482082, test loss = 0.7122925718687032
Epoch 12: training loss = 0.47757543335911673, test loss = 0.4674146505852819
Epoch 13: training loss = 0.5025260717205724, test loss = 0.4914527288997278
Epoch 14:

##Model Evaluation using confusion matrix

In [65]:
from sklearn.metrics import classification_report, confusion_matrix
cache = feed_forward(X_test, params)
predictions = np.argmax(cache["A2"], axis=0)
labels = np.argmax(Y_test, axis=0)

print(classification_report(predictions, labels))

              precision    recall  f1-score   support

           0       0.97      0.86      0.91      1099
           1       0.97      0.96      0.96      1152
           2       0.94      0.78      0.85      1250
           3       0.76      0.93      0.84       825
           4       0.85      0.93      0.89       897
           5       0.83      0.90      0.86       829
           6       0.90      0.96      0.93       900
           7       0.87      0.96      0.91       930
           8       0.86      0.79      0.83      1066
           9       0.89      0.85      0.87      1052

    accuracy                           0.89     10000
   macro avg       0.88      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000



  
