<a href="https://colab.research.google.com/github/Sanali138/MyProject/blob/master/COMP5329_Assignment1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import relevant libraries
import numpy as np
import pandas as pd
import time
import matplotlib.pyplot as plt

In [None]:
# Mount Google Drive inside the Colab runtime at /content/gdrive; the dataset
# paths used when loading the .npy files all live under this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Read the four .npy arrays (features and labels for the train and test
# splits) from the mounted Google Drive folder.
train_data = np.load("/content/gdrive/MyDrive/COMP5329/Assignment1/train_data.npy")
train_label = np.load("/content/gdrive/MyDrive/COMP5329/Assignment1/train_label.npy")
test_data = np.load("/content/gdrive/MyDrive/COMP5329/Assignment1/test_data.npy")
test_label = np.load("/content/gdrive/MyDrive/COMP5329/Assignment1/test_label.npy")

# Sanity check: report the shape of every array, one per line.
print(
    train_data.shape,
    train_label.shape,
    test_data.shape,
    test_label.shape,
    sep="\n",
)

(50000, 128)
(50000, 1)
(10000, 128)
(10000, 1)


In [None]:
# Standardise the training features.  No axis is passed to mean/std, so a
# single mean and a single standard deviation are taken over the whole array
# (all samples and all features pooled together).
X = train_data
X_mean, X_std = np.mean(X), np.std(X)
Scaled_data = (X - X_mean) / X_std

# Scaling is element-wise, so both shapes are expected to match.
print(f"{train_data.shape} is the shape of the original data.")
print(f"{Scaled_data.shape} is the shape of the scaled data.")

(50000, 128) is the shape of the original data.
(50000, 128) is the shape of the scaled data.


In [None]:
# Create an activation class.
# Each instance is initialised with one specific activation function,
# for example: f = Activation("tanh") creates a tanh activation function.
# You can define more activation functions yourself, such as relu!

class Activation(object):
    """Pair an element-wise activation function with its derivative.

    After construction ``f`` holds the forward function and ``f_deriv`` its
    derivative.  Every derivative defined here takes the activation *output*
    ``a = f(x)`` as its argument, not the pre-activation ``x``.

    :type activation: string or None
    :param activation: one of ``'tanh'``, ``'logistic'`` or ``'relu'``.
                       ``None`` selects "no activation": both ``f`` and
                       ``f_deriv`` are set to ``None``.

    :raises ValueError: if ``activation`` is any other value.
    """

    def __init__(self, activation='tanh'):
        # Always define both attributes so that callers can test them against
        # None instead of hitting an AttributeError.
        self.f = None
        self.f_deriv = None

        if activation is None:
            # Linear / identity behaviour is represented by f = f_deriv = None.
            return

        if activation == 'logistic':
            self.f = self.__logistic
            self.f_deriv = self.__logistic_deriv
        elif activation == 'tanh':
            self.f = self.__tanh
            self.f_deriv = self.__tanh_deriv
        elif activation == 'relu':
            self.f = self.__relu
            self.f_deriv = self.__relu_deriv
        else:
            # Fail at construction time with a clear message rather than
            # later, when .f or .f_deriv is first accessed.
            raise ValueError(
                f"Unknown activation {activation!r}; expected 'tanh', "
                "'logistic', 'relu' or None."
            )

    def __tanh(self, x):
        return np.tanh(x)

    def __tanh_deriv(self, a):
        # a = np.tanh(x)
        return 1.0 - a**2

    def __logistic(self, x):
        return 1.0 / (1.0 + np.exp(-x))

    def __logistic_deriv(self, a):
        # a = logistic(x)
        return a * (1 - a)

    def __relu(self, x):
        return np.maximum(0, x)

    def __relu_deriv(self, a):
        # a = relu(x), so a <= 0 exactly where the unit is inactive.
        return np.where(a <= 0, 0, 1)


In [None]:
# Now we define the hidden layer for the MLP.
# For example, h1 = HiddenLayer(10, 5, activation="tanh") creates a layer with a 10-dimensional input and a 5-dimensional output that uses the tanh activation function.
# Note: make sure the input size of a hidden layer matches the output size of the previous layer!

class HiddenLayer(object):
    """Fully-connected layer computing ``activation(dot(input, W) + b)``."""

    def __init__(self, n_in, n_out,
                 activation_last_layer='tanh', activation='tanh', W=None, b=None):
        """
        Build the layer, its parameters, gradient buffers and momentum
        (velocity) buffers.

        Weight matrix W is of shape (n_in, n_out) and the bias vector b is of
        shape (n_out,).

        :type n_in: int
        :param n_in: dimensionality of input

        :type n_out: int
        :param n_out: number of units in this layer

        :type activation_last_layer: string or None
        :param activation_last_layer: activation of the layer that feeds this
                           one; its derivative is applied in ``backward``.
                           A falsy value means no derivative is applied and
                           ``backward`` returns its ``delta`` unchanged.

        :type activation: string or None
        :param activation: non-linearity applied by this layer; a falsy value
                           makes the layer linear.

        :type W: array-like or None
        :param W: optional initial weights of shape (n_in, n_out).  When
                  omitted, weights are drawn uniformly from
                  +/- sqrt(6 / (n_in + n_out)), scaled by 4 for 'logistic'.

        :type b: array-like or None
        :param b: optional initial bias of shape (n_out,); zeros when omitted.

        :raises ValueError: if a supplied W or b has the wrong shape.
        """
        self.input = None
        # forward() treats ``self.activation is None`` as a linear layer, so
        # do not look up a function when no activation name is given.
        self.activation = Activation(activation).f if activation else None

        # activation deriv of last layer
        self.activation_deriv = None
        if activation_last_layer:
            self.activation_deriv = Activation(activation_last_layer).f_deriv

        if W is None:
            # we randomly assign small values for the weights as the initialization
            bound = np.sqrt(6. / (n_in + n_out))
            W = np.random.uniform(low=-bound, high=bound, size=(n_in, n_out))
            if activation == 'logistic':
                W *= 4
        else:
            # Copy so later in-place updates never touch the caller's array.
            W = np.array(W, dtype=float)
            if W.shape != (n_in, n_out):
                raise ValueError(
                    f"W has shape {W.shape}, expected {(n_in, n_out)}"
                )
        self.W = W

        # we set the size of bias as the size of output dimension
        if b is None:
            b = np.zeros(n_out,)
        else:
            b = np.array(b, dtype=float)
            if b.shape != (n_out,):
                raise ValueError(f"b has shape {b.shape}, expected {(n_out,)}")
        self.b = b

        # gradients have the same shapes as the parameters they belong to
        self.grad_W = np.zeros(self.W.shape)
        self.grad_b = np.zeros(self.b.shape)

        # velocity buffers used by momentum updates
        self.W_v = np.zeros(self.W.shape)
        self.b_v = np.zeros(self.b.shape)

    # the forward and backward progress (in the hidden layer level) for each training epoch
    # please learn the week2 lec contents carefully to understand these codes.
    def forward(self, input):
        '''
        Compute the layer output and remember the input for ``backward``.

        :type input: numpy.array
        :param input: array whose last dimension is n_in, e.g. a single
                      sample of shape (n_in,) or a batch (batch, n_in)

        :return: the activated (or linear) output, also stored in self.output
        '''
        lin_output = np.dot(input, self.W) + self.b
        self.output = (
            lin_output if self.activation is None
            else self.activation(lin_output)
        )
        self.input = input
        return self.output

    def backward(self, delta, output_layer=False):
        '''
        Store the parameter gradients for ``delta`` and return the delta to
        pass to the preceding layer.

        :param delta: gradient of the loss w.r.t. this layer's linear output,
                      shape (n_out,) for one sample or (batch, n_out)
        :param output_layer: accepted for interface compatibility; not used.

        :return: delta for the preceding layer, or ``delta`` unchanged when
                 no previous-layer activation derivative is configured.
        '''
        delta_2d = np.atleast_2d(delta)
        self.grad_W = np.atleast_2d(self.input).T.dot(delta_2d)
        # Sum over the batch axis so grad_b always has the shape of b; grad_W
        # is likewise summed over the batch by the matrix product above.
        self.grad_b = delta_2d.sum(axis=0)
        if self.activation_deriv:
            # self.input is the previous layer's activation output, which is
            # what the derivative functions expect as their argument.
            delta = delta.dot(self.W.T) * self.activation_deriv(self.input)
        return delta

In [None]:
class MLP:
    """
    Multi-layer perceptron built from a stack of ``HiddenLayer`` objects.

    Provides the forward pass, an MSE criterion that also yields the output
    delta, the backward pass, and a momentum-based parameter update.
    """

    # for initialization, the code will create all layers automatically based on the provided parameters.
    def __init__(self, layers, activation=None, weight_decay=1.0):
        """
        :param layers: A list containing the number of units in each layer.
        Should be at least two values
        :param activation: The activation function to be used in each layer,
        one entry per entry of ``layers`` (the first entry belongs to the
        input and is normally None).  Defaults to
        [None, 'relu', 'relu', 'softmax'].
        :param weight_decay: The value for weight decay.  It is stored on the
        instance but not applied by any method of this class.
        :raises ValueError: if ``activation`` has fewer entries than ``layers``.
        """
        if activation is None:
            # Build the default per instance so it is never shared between
            # networks (a list literal in the signature would be).
            activation = [None, 'relu', 'relu', 'softmax']
        if len(activation) < len(layers):
            raise ValueError(
                f"activation has {len(activation)} entries but layers has "
                f"{len(layers)}; one activation per layer size is required."
            )

        ### initialize layers
        self.layers = []
        self.params = []

        self.activation = activation
        self.weight_decay = weight_decay
        for i in range(len(layers) - 1):
            # Positional order of HiddenLayer is (n_in, n_out,
            # activation_last_layer, activation): activation[i] is the
            # activation feeding this layer, activation[i+1] is its own.
            self.layers.append(
                HiddenLayer(layers[i], layers[i + 1], activation[i], activation[i + 1])
            )

    # forward progress: pass the information through the layers and out the results of final output layer
    def forward(self, input):
        """Feed ``input`` through every layer in order and return the result.

        With no layers the input is returned unchanged.
        """
        output = input
        for layer in self.layers:
            output = layer.forward(output)
        return output

    # define the objective/loss function, we use mean square error (MSE) as the loss
    # you can try other loss, such as cross entropy.
    # when you try to change the loss, you should also consider the backward formula for the new loss as well!
    def criterion_MSE(self, y, y_hat):
        """Return the element-wise squared error and the output-layer delta.

        :param y: target values
        :param y_hat: network output
        :return: ``(loss, delta)`` where ``loss = (y - y_hat)**2`` and
        ``delta = -2 * (y - y_hat) * f'(y_hat)``; ``f'`` is the derivative of
        the last configured activation, taken as 1 when that entry is falsy
        (linear output).
        """
        output_activation = self.activation[-1]
        # MSE
        error = y - y_hat
        loss = error**2
        # calculate the MSE's delta of the output layer
        delta = -2 * error
        if output_activation:
            delta = delta * Activation(output_activation).f_deriv(y_hat)
        # return loss and delta
        return loss, delta

    # backward progress
    def backward(self, delta):
        """Propagate ``delta`` from the output layer back to the first layer.

        Each layer stores its own gradients as a side effect.
        """
        delta = self.layers[-1].backward(delta, output_layer=True)
        for layer in reversed(self.layers[:-1]):
            delta = layer.backward(delta)

    # update the network weights after backward.
    # make sure you run the backward function before the update function!
    def update(self, lr, momentum_of_gamma=0.5):
        """Apply one momentum step to every layer.

        :param lr: learning rate
        :param momentum_of_gamma: fraction of the previous velocity that is
        carried over into the new one
        """
        for layer in self.layers:
            layer.W_v = (momentum_of_gamma * layer.W_v) + (lr * layer.grad_W)
            layer.b_v = (momentum_of_gamma * layer.b_v) + (lr * layer.grad_b)
            layer.W -= layer.W_v
            layer.b -= layer.b_v

In [None]:
 def updates(self, lr, SGD_optim=None):
     '''
     Update the parameters of every layer under Stochastic Gradient Descent (SGD).

     Each layer's weights and bias are moved against their stored gradients,
     either with a plain SGD step or with SGD Momentum.

     Parameters:
     lr (float): The learning rate for the parameter updates.
     SGD_optim (dict or None): None selects plain SGD. Otherwise a dictionary with keys 'Type' (the optimisation type; only 'Momentum' is handled) and 'Parameter' (the momentum coefficient).

     Returns:
     None

     Raises:
     ValueError: If SGD_optim['Type'] is not a supported optimisation type.
     '''

     if SGD_optim is None:
         for layer in self.layers:
             layer.W -= lr * layer.grad_W
             layer.b -= lr * layer.grad_b

     elif SGD_optim['Type'] == 'Momentum':
         gamma = SGD_optim['Parameter']
         for layer in self.layers:
             # The velocity buffers are named W_v / b_v on the layer objects
             # (that is how HiddenLayer creates them and MLP.update uses them).
             layer.W_v = (gamma * layer.W_v) + (lr * layer.grad_W)
             layer.b_v = (gamma * layer.b_v) + (lr * layer.grad_b)
             layer.W = layer.W - layer.W_v
             layer.b = layer.b - layer.b_v

     else:
         # Skipping the update silently would leave the network untrained.
         raise ValueError(f"Unsupported SGD optimisation type: {SGD_optim['Type']!r}")

 def mini_batch(self, X, y, learning_rate=0.1, epochs=100, SGD_optim=None, batch_size=1):
     '''
     Train with mini-batch gradient descent.

     For every epoch the data is reshuffled, cut into consecutive slices of
     at most batch_size rows, and each slice goes through forward pass, MSE
     criterion, backward pass and parameter update. On every tenth epoch
     (starting with the first) the loss returned for the last slice is printed.

     Parameters:
     X: Training inputs; must expose .shape[0] and support slicing.
     y: Training targets, sliced with the same indices as X.
     learning_rate (float): Step size handed to self.updates.
     epochs (int): Number of passes over the data.
     SGD_optim: Optimiser configuration forwarded unchanged to self.updates.
     batch_size (int): Maximum number of rows per slice.

     Returns:
     None
     '''
     for k in range(epochs):
         # NOTE(review): Utils is not defined in the visible code; it is
         # assumed to return X and y permuted consistently -- confirm.
         X, y = Utils.shuffle(X, y)
         total_rows = X.shape[0]
         batch_count = int(np.ceil(total_rows / batch_size))

         for batch_no in range(batch_count):
             lower = batch_no * batch_size
             # The final slice may be shorter than batch_size.
             upper = min(lower + batch_size, total_rows)
             rows = slice(lower, upper)

             prediction = self.forward(X[rows])
             loss, delta = self.criterion_MSE(y[rows], prediction)
             self.backward(delta)
             self.updates(learning_rate, SGD_optim=SGD_optim)

         if k % 10 == 0:
             print(f'Epoch {k+1}/{epochs}, Loss: {loss}')

In [None]:
  def batch_normalize(self, Z):
      """Standardise Z along axis 0, then scale by gamma and shift by beta.

      The mean and variance along axis 0 (presumably the batch axis -- confirm
      against the caller) and the standardised values are stored on the
      instance as ``mean``, ``variance`` and ``normalized_output``.

      :param Z: array exposing ``mean(axis=0)`` and ``var(axis=0)``
      :return: ``self.gamma * Z_norm + self.beta``
      """
      axis_mean, axis_var = Z.mean(axis=0), Z.var(axis=0)
      # The small constant keeps the division finite when the variance is 0.
      standardized = (Z - axis_mean) / np.sqrt(axis_var + 1e-8)

      self.mean, self.variance = axis_mean, axis_var
      self.normalized_output = standardized
      return self.gamma * standardized + self.beta

In [None]:
def gelu(x):
    """Evaluate the tanh-based GELU approximation element-wise.

    Computes ``0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x**3)))``.
    """
    scale = np.sqrt(2 / np.pi)
    cubic_term = x + 0.044715 * np.power(x, 3)
    gate = 1 + np.tanh(scale * cubic_term)
    return 0.5 * x * gate

def gelu_deriv(x):
    """Derivative of the tanh-based GELU approximation with respect to ``x``.

    With ``u = sqrt(2/pi) * (x + 0.044715 * x**3)`` the approximation is
    ``0.5 * x * (1 + tanh(u))``, so by the product and chain rules

        d/dx = 0.5 * (1 + tanh(u)) + 0.5 * x * (1 - tanh(u)**2) * du/dx

    where ``du/dx = sqrt(2/pi) * (1 + 3 * 0.044715 * x**2)``.

    Note that the argument is the pre-activation ``x``, not the activation
    output.
    """
    scale = np.sqrt(2 / np.pi)
    tanh_u = np.tanh(scale * (x + 0.044715 * np.power(x, 3)))
    du_dx = scale * (1 + 3 * 0.044715 * x**2)
    # 1 - tanh(u)**2 is the derivative of tanh; the Gaussian density belongs
    # to the exact GELU and must not be mixed into the tanh approximation.
    return 0.5 * (1 + tanh_u) + 0.5 * x * (1 - tanh_u**2) * du_dx
