<a href="https://colab.research.google.com/github/Sai-sakunthala/Assignment1/blob/main/raw_code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from keras.datasets import fashion_mnist
import numpy as np
import math
import random

In [2]:
# Load Fashion-MNIST and divide the image arrays by 255.
(X_train, Y_train), (X_test, Y_test) = fashion_mnist.load_data()
X_train = X_train/255
# Scale the test *images*. The previous code divided the label vector Y_test
# here, which silently replaced the test images with scaled labels.
X_test = X_test/255
class_names = ['Tshirt', 'Trouser', 'Pullover', 'Dress', 'Coat', 'Sandal', 'Shirt', 'Sneaker', 'bag', 'Ankleboot']
# Number of distinct labels present in the training targets.
classes = len(np.unique(Y_train))
# Keep the first 90% of the training data for fitting and the rest for validation.
split_index = int(0.9 * X_train.shape[0])
x_train_final, x_val_final = X_train[:split_index], X_train[split_index:]
y_train_final, y_val_final = Y_train[:split_index], Y_train[split_index:]

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-labels-idx1-ubyte.gz
[1m29515/29515[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-images-idx3-ubyte.gz
[1m26421880/26421880[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-labels-idx1-ubyte.gz
[1m5148/5148[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-images-idx3-ubyte.gz
[1m4422102/4422102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [None]:
def input_layer(x):
    """Return ``x`` as a NumPy array, flattening each sample of a 3-D input.

    A 3-D array is reshaped to ``(x.shape[0], -1)``; arrays of any other rank
    are returned unchanged (after conversion with ``np.array``).
    """
    samples = np.array(x)
    if samples.ndim != 3:
        return samples
    return samples.reshape(samples.shape[0], -1)

def sigmoid(a_x):
    """Logistic function ``1 / (1 + exp(-a))`` applied element-wise.

    The input is first clipped to ``[-700, 700]`` so ``np.exp`` stays finite.
    """
    bounded = np.clip(a_x, -700, 700)
    denominator = 1 + np.exp(-bounded)
    return 1 / denominator

def der_sigmoid(a_x):
    """Derivative of the logistic function, ``s * (1 - s)`` with ``s = sigmoid(a_x)``."""
    s = sigmoid(a_x)
    return s * (1 - s)

def Relu(a_x):
    """Rectified linear unit whose output is additionally capped at ``1e4``."""
    rectified = np.maximum(0, a_x)
    # The upper bound keeps activations from growing without limit.
    return np.clip(rectified, 0, 1e4)

def der_Relu(a_x):
    """ReLU derivative: ``1.0`` where ``a_x > 0`` and ``0.0`` elsewhere."""
    positive = a_x > 0
    return positive.astype(float)

def tanh(a_x):
    """Hyperbolic tangent computed from exponentials, element-wise.

    The input is clipped to ``[-700, 700]`` before exponentiation so neither
    exponential overflows.
    """
    bounded = np.clip(a_x, -700, 700)
    grow = np.exp(bounded)
    decay = np.exp(-bounded)
    return (grow - decay) / (grow + decay)

def der_tanh(a_x):
    """Derivative of tanh, ``1 - tanh(a_x)**2``, evaluated element-wise.

    Uses ``np.tanh`` instead of a ratio of raw exponentials. The earlier
    formula overflowed for ``|a_x|`` beyond roughly 709 and produced NaN
    (inf / inf); ``np.tanh`` saturates at +/-1, so the derivative correctly
    goes to 0 there.
    """
    t = np.tanh(a_x)
    return 1 - t ** 2

def softmax(a_x):
    """Softmax over all elements of ``a_x``.

    The global maximum is subtracted before exponentiating; both the maximum
    and the normalising sum are taken over the whole array (no axis argument).
    """
    shifted = a_x - np.max(a_x)
    exps = np.exp(shifted)
    return exps / np.sum(exps)

def initialize_weights_random(num_neurons):
    """Build per-layer weights and zero biases from a list of layer widths.

    The global NumPy RNG is re-seeded with 450 on every call. For consecutive
    widths ``(fan_in, fan_out)`` the weight matrix has shape
    ``(fan_out, fan_in)``, is drawn from a standard normal and scaled by
    ``sqrt(1 / fan_in)``; the bias has shape ``(1, fan_out)``.

    Returns:
        ``(weights, biases)`` as two lists with ``len(num_neurons) - 1`` entries.
    """
    np.random.seed(450)
    weights, biases = [], []
    for fan_in, fan_out in zip(num_neurons[:-1], num_neurons[1:]):
        layer_weights = np.random.randn(fan_out, fan_in)*np.sqrt(1 / fan_in)
        weights.append(layer_weights)
        biases.append(np.zeros((1, fan_out)))
    return weights, biases

def initialize_weights_xavier(num_neurons):
    """Xavier/Glorot-normal initialisation for a stack of dense layers.

    The global NumPy RNG is re-seeded with 450 on every call so results are
    reproducible. For consecutive widths ``(fan_in, fan_out)`` the weight
    matrix has shape ``(fan_out, fan_in)`` and is drawn from a normal
    distribution with standard deviation ``sqrt(2 / (fan_in + fan_out))``.
    The earlier version drew unscaled standard-normal weights, i.e. it applied
    no Xavier scaling at all. Biases are zeros of shape ``(1, fan_out)``.

    Returns:
        ``(weights, biases)`` as two lists with ``len(num_neurons) - 1`` entries.
    """
    np.random.seed(450)
    weights = []
    biases = []
    for fan_in, fan_out in zip(num_neurons[:-1], num_neurons[1:]):
        # Glorot & Bengio scaling keeps activation/gradient variance roughly
        # constant across layers.
        std = np.sqrt(2 / (fan_in + fan_out))
        weights.append(np.random.randn(fan_out, fan_in) * std)
        biases.append(np.zeros((1, fan_out)))
    return weights, biases

def pre_activation(h_x, W, b):
    """Affine map of a layer: ``W . h_x^T`` plus the flattened bias ``b``."""
    weighted_sum = np.dot(W, h_x.T)
    return weighted_sum + b.ravel()

def bce_loss_function(h_x, y):
    """Negative log of the probability assigned to the target class.

    ``h_x`` is clipped to ``[1e-8, 1.0]`` before the logarithm, and the target
    index is taken as ``argmax(y)``. Despite the ``bce`` name this is the
    single-label cross-entropy term, not a per-output binary cross-entropy.
    """
    probabilities = np.clip(h_x, 1e-8, 1.0)
    target_index = np.argmax(y)
    return -np.log(probabilities[target_index])

def mse_loss_function(h_x, y):
    """Mean of the squared element-wise differences between ``h_x`` and ``y``."""
    residual = h_x - y
    return np.mean(np.square(residual))

def forward_pass(x, y, weights, biases, activation_func, n_hidden, loss_function):
    """Run one sample through ``n_hidden`` hidden layers plus a softmax output.

    Layers ``0 .. n_hidden - 1`` apply ``activation_func``; layer ``n_hidden``
    applies ``softmax``. The loss is ``loss_function(output, y)``.

    Returns:
        ``(activations, pre_activations, loss)`` where the two lists hold one
        entry per layer, in layer order.
    """
    activations, pre_activations = [], []
    layer_input = x
    for layer in range(n_hidden + 1):
        a_x = pre_activation(layer_input, weights[layer], biases[layer])
        if layer == n_hidden:
            h_x = softmax(a_x)
        else:
            h_x = activation_func(a_x)
        activations.append(h_x)
        pre_activations.append(a_x)
        # The output of this layer feeds the next one.
        layer_input = h_x
    loss = loss_function(h_x, y)
    return activations, pre_activations, loss

def one_hot_encode(y, num_classes):
    """Map integer label(s) ``y`` to rows of the ``num_classes`` identity matrix."""
    identity = np.eye(num_classes)
    return identity[y]

def back_propagation(activations, pre_activations, weights, biases, x, y, y_hat, n_hidden, activation_deriv, loss_function):
    """Compute per-layer weight and bias gradients for a single sample.

    Args:
        activations: per-layer outputs from ``forward_pass`` (index = layer).
        pre_activations: per-layer pre-activation values from ``forward_pass``.
        weights: per-layer weight matrices, indexable by layer number.
        biases: accepted for signature compatibility; not read here.
        x: 1-D input vector of the sample.
        y: 1-D target vector.
        y_hat: 1-D softmax output of the network.
        n_hidden: number of hidden layers; the output layer has index ``n_hidden``.
        activation_deriv: derivative of the hidden-layer activation.
        loss_function: ``bce_loss_function`` or ``mse_loss_function``.

    Returns:
        ``(del_L_w, del_L_b)``: dicts keyed by layer index ``0 .. n_hidden``.

    Raises:
        ValueError: if ``loss_function`` is neither supported loss.

    Fixes relative to the earlier version: the output layer's weight gradient
    is now produced (it was missing, so every optimizer failed with a KeyError
    on ``dw[n_hidden]``), ``n_hidden == 0`` no longer indexes layer ``-1``, and
    the MSE branch uses the full softmax Jacobian together with the ``1/size``
    factor implied by ``np.mean`` in ``mse_loss_function``.
    """
    if loss_function == bce_loss_function:
        # Softmax followed by cross-entropy collapses to y_hat - y.
        delta = y_hat - y
    elif loss_function == mse_loss_function:
        # dL/dy_hat for a mean of squared errors ...
        grad_y_hat = 2 * (y_hat - y) / y_hat.size
        # ... pushed through the softmax Jacobian diag(y_hat) - y_hat y_hat^T.
        delta = y_hat * (grad_y_hat - np.dot(grad_y_hat, y_hat))
    else:
        raise ValueError("back_propagation: unsupported loss_function")

    del_L_w = {}
    del_L_b = {}
    for i in range(n_hidden, -1, -1):
        # Layer 0 is fed by the raw input, every other layer by the previous activation.
        layer_input = x if i == 0 else activations[i-1]
        del_L_w[i] = np.dot(delta[:, np.newaxis], layer_input[np.newaxis, :])
        del_L_b[i] = delta
        if i > 0:
            # Propagate the error signal to the layer below.
            del_L_h = np.matmul(weights[i].T, delta)
            delta = del_L_h * activation_deriv(pre_activations[i-1])
    return del_L_w, del_L_b

def gradient_descent(dw, db, weights, biases, learning_rate, weight_decay):
    """Plain gradient step with L2 weight decay applied to the weights only.

    ``weights`` and ``biases`` are updated in place and also returned.
    """
    for layer in range(len(weights)):
        decayed_grad = dw[layer] + weight_decay * weights[layer]
        weights[layer] -= learning_rate*decayed_grad
        biases[layer] -= learning_rate*db[layer]
    return weights, biases

def momentum_gradient(dw, db, weights, biases, learning_rate, prev_u_w, prev_u_b, weight_decay):
    """Momentum update with a fixed coefficient of 0.9.

    ``weight_decay * weights[i]`` is added to ``dw[i]`` in place. When both
    previous-velocity dicts are empty the velocity is just the scaled
    gradient; otherwise ``0.9 * previous`` is added to it. ``weights`` and
    ``biases`` are modified in place.

    Returns:
        ``(weights, biases, u_w, u_b)`` with the new velocity dicts.
    """
    beta = 0.9
    no_history = prev_u_w == {} and prev_u_b == {}
    u_w, u_b = {}, {}
    for layer in range(len(weights)):
        dw[layer] += weight_decay * weights[layer]
        velocity_w = learning_rate*dw[layer]
        velocity_b = learning_rate*db[layer]
        if not no_history:
            velocity_w = beta*prev_u_w[layer] + velocity_w
            velocity_b = beta*prev_u_b[layer] + velocity_b
        u_w[layer] = velocity_w
        u_b[layer] = velocity_b
        weights[layer] -= velocity_w
        biases[layer] -= velocity_b
    return weights, biases, u_w, u_b

def nestrov_gradient(dw, db, weights, biases, learning_rate, prev_u_w, prev_u_b, weight_decay):
    """Velocity update used for the "nestrov" optimizer (coefficient 0.9).

    The arithmetic here is the same as the momentum update; any look-ahead
    must already be reflected in the ``dw``/``db`` the caller passes in.
    ``weight_decay * weights[i]`` is added to ``dw[i]`` in place, and
    ``weights``/``biases`` are modified in place.

    Returns:
        ``(weights, biases, u_w, u_b)`` with the new velocity dicts.
    """
    beta = 0.9
    u_w = {}
    u_b = {}
    has_history = not (prev_u_w == {} and prev_u_b == {})
    for idx in range(len(weights)):
        dw[idx] += weight_decay * weights[idx]
        if has_history:
            u_w[idx] = beta*prev_u_w[idx] + learning_rate*dw[idx]
            u_b[idx] = beta*prev_u_b[idx] + learning_rate*db[idx]
        else:
            u_w[idx] = learning_rate*dw[idx]
            u_b[idx] = learning_rate*db[idx]
        weights[idx] -= u_w[idx]
        biases[idx] -= u_b[idx]
    return weights, biases, u_w, u_b

def rmsprop_gradient(dw, db, weights, biases, learning_rate, prev_u_w, prev_u_b, weight_decay):
    """RMSProp update with decay 0.9 and ``epsilon = 1e-6`` inside the square root.

    ``weight_decay * weights[i]`` is added to ``dw[i]`` in place. The running
    average of squared gradients starts from ``(1 - beta) * g**2`` when both
    history dicts are empty. ``weights``/``biases`` are modified in place.

    Returns:
        ``(weights, biases, u_w, u_b)`` with the new squared-gradient averages.
    """
    beta = 0.9
    epsilon = 1e-6
    no_history = prev_u_w == {} and prev_u_b == {}
    u_w, u_b = {}, {}
    for layer in range(len(weights)):
        dw[layer] += weight_decay * weights[layer]
        avg_sq_w = (1 - beta) * (dw[layer] ** 2)
        avg_sq_b = (1 - beta) * (db[layer] ** 2)
        if not no_history:
            avg_sq_w = beta * prev_u_w[layer] + avg_sq_w
            avg_sq_b = beta * prev_u_b[layer] + avg_sq_b
        u_w[layer] = avg_sq_w
        u_b[layer] = avg_sq_b

        weights[layer] -= learning_rate * dw[layer] / (np.sqrt(avg_sq_w + epsilon))
        biases[layer] -= learning_rate * db[layer] / (np.sqrt(avg_sq_b + epsilon))

    return weights, biases, u_w, u_b

def adagrad_gradient(dw, db, weights, biases, learning_rate, prev_u_w, prev_u_b, weight_decay):
    """AdaGrad update with ``epsilon = 1e-6`` inside the square root.

    ``weight_decay * weights[i]`` is added to ``dw[i]`` in place. Squared
    gradients are summed onto the previous accumulators (or start from
    ``g**2`` when both history dicts are empty). ``weights``/``biases`` are
    modified in place.

    Returns:
        ``(weights, biases, u_w, u_b)`` with the new accumulators.
    """
    epsilon = 1e-6
    no_history = prev_u_w == {} and prev_u_b == {}
    u_w, u_b = {}, {}
    for layer in range(len(weights)):
        dw[layer] += weight_decay * weights[layer]
        sum_sq_w = (dw[layer] ** 2)
        sum_sq_b = (db[layer] ** 2)
        if not no_history:
            sum_sq_w = prev_u_w[layer] + sum_sq_w
            sum_sq_b = prev_u_b[layer] + sum_sq_b
        u_w[layer] = sum_sq_w
        u_b[layer] = sum_sq_b

        weights[layer] -= learning_rate * dw[layer] / (np.sqrt(sum_sq_w + epsilon))
        biases[layer] -= learning_rate * db[layer] / (np.sqrt(sum_sq_b + epsilon))

    return weights, biases, u_w, u_b

def adadelta_gradient(dw, db, weights, biases, learning_rate, prev_u_w, prev_u_b, prev_v_w, prev_v_b, weight_decay):
    """AdaDelta update with decay 0.9 and ``epsilon = 1e-6``.

    ``v_*`` tracks the running average of squared gradients and ``u_*`` the
    running average of squared updates. ``learning_rate`` is accepted but not
    used by the update. ``weight_decay * weights[i]`` is added to ``dw[i]`` in
    place, and ``weights``/``biases`` are modified in place. When both ``v``
    history dicts are empty the step uses ``sqrt(epsilon)`` as the previous
    update scale.

    Returns:
        ``(weights, biases, u_w, u_b, v_w, v_b)``.
    """
    beta = 0.9
    epsilon = 1e-6
    no_history = prev_v_b == {} and prev_v_w == {}
    u_w, u_b, v_w, v_b = {}, {}, {}, {}

    for layer in range(len(weights)):
        dw[layer] += weight_decay * weights[layer]
        grad_sq_w = (1 - beta) * (dw[layer] ** 2)
        grad_sq_b = (1 - beta) * (db[layer] ** 2)
        if no_history:
            v_w[layer] = grad_sq_w
            v_b[layer] = grad_sq_b
            scale_w = np.sqrt(epsilon)
            scale_b = np.sqrt(epsilon)
        else:
            v_w[layer] = beta * prev_v_w[layer] + grad_sq_w
            v_b[layer] = beta * prev_v_b[layer] + grad_sq_b
            scale_w = np.sqrt(prev_u_w[layer] + epsilon)
            scale_b = np.sqrt(prev_u_b[layer] + epsilon)

        update_w = (scale_w / np.sqrt(v_w[layer] + epsilon)) * dw[layer]
        update_b = (scale_b / np.sqrt(v_b[layer] + epsilon)) * db[layer]
        weights[layer] -= update_w
        biases[layer] -= update_b

        step_sq_w = (1 - beta) * (update_w ** 2)
        step_sq_b = (1 - beta) * (update_b ** 2)
        if no_history:
            u_w[layer] = step_sq_w
            u_b[layer] = step_sq_b
        else:
            u_w[layer] = beta * prev_u_w[layer] + step_sq_w
            u_b[layer] = beta * prev_u_b[layer] + step_sq_b

    return weights, biases, u_w, u_b, v_w, v_b

def adam_gradient(dw, db, weights, biases, learning_rate, prev_m_w, prev_m_b, prev_v_w, prev_v_b, weight_decay, iteration):
    """Adam update with ``beta1 = 0.9``, ``beta2 = 0.999`` and ``epsilon = 1e-6``.

    ``weight_decay * weights[i]`` is added to ``dw[i]`` in place. First and
    second moments start from ``(1 - beta) * g`` / ``(1 - beta) * g**2`` when
    both first-moment history dicts are empty, and are bias-corrected with
    ``1 - beta**iteration`` (so ``iteration`` must be non-zero).
    ``weights``/``biases`` are modified in place.

    Returns:
        ``(weights, biases, m_w, m_b, v_w, v_b)`` with the uncorrected moments.
    """
    beta1 = 0.9
    beta2 = 0.999
    epsilon = 1e-6
    no_history = prev_m_w == {} and prev_m_b == {}
    # The bias-correction denominators depend only on the step number.
    correction1 = 1 - beta1**iteration
    correction2 = 1 - beta2**iteration

    m_w, m_b, v_w, v_b = {}, {}, {}, {}
    for layer in range(len(weights)):
        dw[layer] += weight_decay * weights[layer]
        first_w = (1 - beta1) * dw[layer]
        first_b = (1 - beta1) * db[layer]
        second_w = (1 - beta2) * (dw[layer] ** 2)
        second_b = (1 - beta2) * (db[layer] ** 2)
        if not no_history:
            first_w = beta1 * prev_m_w[layer] + first_w
            first_b = beta1 * prev_m_b[layer] + first_b
            second_w = beta2 * prev_v_w[layer] + second_w
            second_b = beta2 * prev_v_b[layer] + second_b
        m_w[layer], m_b[layer] = first_w, first_b
        v_w[layer], v_b[layer] = second_w, second_b

        m_w_hat = first_w / correction1
        m_b_hat = first_b / correction1
        v_w_hat = second_w / correction2
        v_b_hat = second_b / correction2
        weights[layer] -= learning_rate * m_w_hat / (np.sqrt(v_w_hat) + epsilon)
        biases[layer] -= learning_rate * m_b_hat / (np.sqrt(v_b_hat) + epsilon)

    return weights, biases, m_w, m_b, v_w, v_b

def nadam_gradient(dw, db, weights, biases, learning_rate, prev_m_w, prev_m_b, prev_v_w, prev_v_b, weight_decay, iteration):
    """Nadam update with ``beta1 = 0.9``, ``beta2 = 0.999`` and ``epsilon = 1e-6``.

    Moments are tracked and bias-corrected exactly as in the Adam update; the
    step then uses ``beta1 * m_hat + (1 - beta1) * g / (1 - beta1**iteration)``
    in place of ``m_hat``. ``weight_decay * weights[i]`` is added to ``dw[i]``
    in place and ``weights``/``biases`` are modified in place. ``iteration``
    must be non-zero.

    Returns:
        ``(weights, biases, m_w, m_b, v_w, v_b)`` with the uncorrected moments.
    """
    beta1 = 0.9
    beta2 = 0.999
    epsilon = 1e-6
    has_history = not (prev_m_w == {} and prev_m_b == {})
    correction1 = 1 - beta1**iteration
    correction2 = 1 - beta2**iteration

    m_w, m_b, v_w, v_b = {}, {}, {}, {}
    for layer in range(len(weights)):
        dw[layer] += weight_decay * weights[layer]
        if has_history:
            m_w[layer] = beta1 * prev_m_w[layer] + (1 - beta1) * dw[layer]
            m_b[layer] = beta1 * prev_m_b[layer] + (1 - beta1) * db[layer]
            v_w[layer] = beta2 * prev_v_w[layer] + (1 - beta2) * (dw[layer] ** 2)
            v_b[layer] = beta2 * prev_v_b[layer] + (1 - beta2) * (db[layer] ** 2)
        else:
            m_w[layer] = (1 - beta1) * dw[layer]
            m_b[layer] = (1 - beta1) * db[layer]
            v_w[layer] = (1 - beta2) * (dw[layer] ** 2)
            v_b[layer] = (1 - beta2) * (db[layer] ** 2)

        m_w_hat = m_w[layer] / correction1
        m_b_hat = m_b[layer] / correction1
        v_w_hat = v_w[layer] / correction2
        v_b_hat = v_b[layer] / correction2
        # Blend the corrected momentum with the current (corrected) gradient.
        lookahead_m_w = beta1 * m_w_hat + (1 - beta1) * dw[layer] / correction1
        lookahead_m_b = beta1 * m_b_hat + (1 - beta1) * db[layer] / correction1
        weights[layer] -= learning_rate * lookahead_m_w / (np.sqrt(v_w_hat) + epsilon)
        biases[layer] -= learning_rate * lookahead_m_b / (np.sqrt(v_b_hat) + epsilon)

    return weights, biases, m_w, m_b, v_w, v_b

def validation(x_val, y_val, weights, biases, activation_func, n_hidden, loss_function):
    """Evaluate the network sample by sample on a validation set.

    Each sample is passed through ``forward_pass``; the predicted class is the
    argmax of the final activation and the true class the argmax of the
    corresponding row of ``y_val``.

    Returns:
        ``(mean loss, accuracy)`` over ``len(x_val)`` samples.
    """
    total_loss = 0
    predicted = []
    actual = []
    for index, sample in enumerate(x_val):
        target = y_val[index]
        layer_outputs, _, sample_loss = forward_pass(sample, target, weights, biases, activation_func, n_hidden, loss_function)
        predicted.append(np.argmax(layer_outputs[-1]))
        actual.append(np.argmax(target))
        total_loss = total_loss + sample_loss
    accuracy = np.mean(np.array(predicted) == np.array(actual))
    return total_loss/len(x_val), accuracy

def _accumulate_batch_gradients(x_batch, y_batch, weights, biases, activation_func, activation_deriv, n_hidden, loss_function):
    """Return (mean dw, mean db, summed loss) over the samples of one mini-batch."""
    dw = {}
    db = {}
    batch_loss = 0
    for x, y in zip(x_batch, y_batch):
        activations, pre_activations, loss = forward_pass(x, y, weights, biases, activation_func, n_hidden, loss_function)
        batch_loss += loss
        del_L_w, del_L_b = back_propagation(activations, pre_activations, weights, biases, x, y, activations[-1], n_hidden, activation_deriv, loss_function)
        for key, value in del_L_w.items():
            dw[key] = dw[key] + value if key in dw else value
        for key, value in del_L_b.items():
            db[key] = db[key] + value if key in db else value
    # Average over the samples actually present: the last batch can be shorter
    # than batch_size. Division builds new arrays, so the per-sample gradients
    # returned by back_propagation are never mutated.
    n_samples = len(x_batch)
    dw = {key: value / n_samples for key, value in dw.items()}
    db = {key: value / n_samples for key, value in db.items()}
    return dw, db, batch_loss


def Neuralnet(x_train, y_train, x_val, y_val, n_hidden, n_neurons_hidden, epochs, batch_size, activation, optimization, learning_rate, weight_decay, loss_function, weight_initialization):
    """Train a fully connected softmax classifier with mini-batch updates.

    Args:
        x_train, x_val: inputs; 3-D arrays are flattened per sample by ``input_layer``.
        y_train, y_val: integer labels, one-hot encoded with the module-level ``classes``.
        n_hidden: number of hidden layers.
        n_neurons_hidden: width of every hidden layer.
        epochs: number of passes over the training data.
        batch_size: number of samples per parameter update.
        activation: ``"sigmoid"``, ``"tanh"`` or ``"relu"``.
        optimization: ``"sgd"``, ``"momentum"``, ``"nestrov"``, ``"rmsprop"``,
            ``"adagrad"``, ``"adadelta"``, ``"adam"`` or ``"nadam"``.
        learning_rate: step size handed to the optimizer.
        weight_decay: L2 coefficient handed to the optimizer.
        loss_function: ``"bce"`` or ``"mse"``.
        weight_initialization: ``"random"`` or ``"xavier"``.

    Raises:
        KeyError: if any of the string options is not one of the values above.

    Validation loss/accuracy are printed before training and after every
    epoch together with the mean training loss; nothing is returned.

    Behavioural fixes relative to the earlier triplicated loops:
      * Adam/Nadam now receive the 1-based update count they need for bias
        correction (the call previously omitted it and raised TypeError).
      * The training loss is reset at the start of every epoch for every
        optimizer, not only for SGD.
      * Nesterov gradients are evaluated (forward and backward pass) at
        ``weights - 0.9 * velocity`` whenever a velocity exists; the look-ahead
        was previously scaled by ``weight_decay`` and gated on the batch offset.
        The reported training loss for "nestrov" is therefore measured at the
        look-ahead parameters.
      * Batch gradients are averaged over the actual batch length.
    """
    x_train = input_layer(x_train)
    y_train = one_hot_encode(y_train, classes)
    x_val = input_layer(x_val)
    y_val = one_hot_encode(y_val, classes)
    features = x_train.shape[1]
    num_neurons = [features] + [n_neurons_hidden]*(n_hidden) + [classes]
    initialize_weights = {"random": initialize_weights_random, "xavier": initialize_weights_xavier}[weight_initialization]
    activation_func = {"sigmoid": sigmoid, "tanh": tanh, "relu": Relu}[activation]
    activation_deriv = {"sigmoid": der_sigmoid, "tanh": der_tanh, "relu": der_Relu}[activation]
    optimization_func = {"momentum": momentum_gradient, "sgd": gradient_descent, "nestrov": nestrov_gradient, "rmsprop": rmsprop_gradient, "adagrad": adagrad_gradient, "adadelta": adadelta_gradient, "adam": adam_gradient, "nadam": nadam_gradient}[optimization]
    loss_function = {"bce": bce_loss_function, "mse": mse_loss_function}[loss_function]
    weights, biases = initialize_weights(num_neurons)

    is_sgd = optimization_func == gradient_descent
    is_nesterov = optimization_func == nestrov_gradient
    needs_step_count = optimization_func in (adam_gradient, nadam_gradient)
    uses_two_states = needs_step_count or optimization_func == adadelta_gradient
    # Must match the momentum coefficient hard-coded in nestrov_gradient.
    nesterov_beta = 0.9

    # Optimizer state; the optimizers treat empty dicts as "no history yet".
    prev_u_w, prev_u_b, prev_v_w, prev_v_b = {}, {}, {}, {}
    step = 0

    val_loss, val_accuracy = validation(x_val, y_val, weights, biases, activation_func, n_hidden, loss_function)
    print('Intial values')
    print('Validation Loss:', val_loss, 'Validation Accuracy:', val_accuracy)

    for epoch in range(epochs):
        epoch_loss = 0
        for start in range(0, len(x_train), batch_size):
            x_batch = x_train[start:start + batch_size]
            y_batch = y_train[start:start + batch_size]

            if is_nesterov and prev_u_w:
                # Evaluate the gradient where the accumulated velocity is about to carry the parameters.
                grad_weights = [weights[k] - nesterov_beta * prev_u_w[k] for k in range(len(weights))]
                grad_biases = [biases[k] - nesterov_beta * prev_u_b[k] for k in range(len(biases))]
            else:
                grad_weights, grad_biases = weights, biases

            dw, db, batch_loss = _accumulate_batch_gradients(x_batch, y_batch, grad_weights, grad_biases, activation_func, activation_deriv, n_hidden, loss_function)
            epoch_loss += batch_loss
            step += 1

            if is_sgd:
                weights, biases = optimization_func(dw, db, weights, biases, learning_rate, weight_decay)
            elif needs_step_count:
                weights, biases, prev_u_w, prev_u_b, prev_v_w, prev_v_b = optimization_func(dw, db, weights, biases, learning_rate, prev_u_w, prev_u_b, prev_v_w, prev_v_b, weight_decay, step)
            elif uses_two_states:
                weights, biases, prev_u_w, prev_u_b, prev_v_w, prev_v_b = optimization_func(dw, db, weights, biases, learning_rate, prev_u_w, prev_u_b, prev_v_w, prev_v_b, weight_decay)
            else:
                weights, biases, prev_u_w, prev_u_b = optimization_func(dw, db, weights, biases, learning_rate, prev_u_w, prev_u_b, weight_decay)

        epoch_loss /= len(x_train)
        val_loss, val_accuracy = validation(x_val, y_val, weights, biases, activation_func, n_hidden, loss_function)
        print('Epoch:', epoch, 'Train Loss:', epoch_loss)
        print('Validation Loss:', val_loss, 'Validation Accuracy:', val_accuracy)