In [1]:
import numpy as np
import math



# TASK 1: Activation functions

In [2]:
class Activation(object):

    """
    Interface for activation functions (non-linearities).

    In all implementations, the state attribute must contain the result,
    i.e. the output of forward.
    """

    # No additional work is needed for this class, as it acts like an
    # abstract base class for the others

    # Note that these activation functions are scalar operations. I.e, they
    # shouldn't change the shape of the input.

    def __init__(self):
        # Output of the most recent forward call; None until forward has run.
        self.state = None

    def __call__(self, x):
        """Apply the activation to x; shorthand for ``self.forward(x)``."""
        return self.forward(x)

    def forward(self, x):
        """
        Compute the activation of x. Must be overridden by subclasses.

        Raises:
            NotImplementedError: always, in this base class.
        """
        # NotImplemented is a comparison sentinel, not an exception class;
        # raising it would surface as an unrelated TypeError.
        raise NotImplementedError

    def derivative(self):
        """
        Return the derivative of the activation evaluated at the last
        forward input. Must be overridden by subclasses.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError

In [3]:
class Sigmoid(Activation):

    """
    Logistic sigmoid non-linearity: 1 / (1 + exp(-x)).
    """

    # The method signatures below are kept fixed because AutoLab relies
    # on them.

    def __init__(self):
        super().__init__()

    def forward(self, x):
        """Evaluate the sigmoid of x, cache it in ``self.state`` and return it."""
        denominator = 1 + np.exp(-x)
        activated = 1 / denominator
        # Cached so derivative() can reuse the forward result.
        self.state = activated
        return activated

    def derivative(self):
        """Return sigmoid'(x) = s * (1 - s), where s is the cached forward output."""
        s = self.state
        return s * (1 - s)




In [4]:
class Tanh(Activation):

    """
    Tanh non-linearity
    """

    def __init__(self):
        super(Tanh, self).__init__()

    def forward(self, x):
        """
        Compute tanh(x), cache it in ``self.state`` and return it.

        Argument:
            x (np.array): input of any shape
        Return:
            out (np.array): tanh(x), same shape as x
        """
        self.state = np.tanh(x)
        # The result must be returned: Activation.__call__ and any caller
        # chaining layers use the return value of forward.
        return self.state

    def derivative(self):
        """Return 1 - tanh(x)^2, computed from the cached forward output."""
        return 1.0 - self.state**2


In [5]:
class ReLU(Activation):

    """
    ReLU non-linearity
    """

    def __init__(self):
        super(ReLU, self).__init__()

    def forward(self, x):
        """
        Compute max(0, x) element-wise, cache it in ``self.state`` and return it.

        Argument:
            x (np.array): input of any shape
        Return:
            out (np.array): max(0, x), same shape as x
        """
        self.state = np.maximum(0, x)
        # The result must be returned: Activation.__call__ and any caller
        # chaining layers use the return value of forward.
        return self.state

    def derivative(self):
        """
        Return the ReLU derivative: 1 where the cached output is positive,
        0 elsewhere (the output is positive exactly where the input was).
        """
        return np.where(self.state > 0, 1, 0)

# TASK 2: Loss functions (criteria)

In [6]:
class Criterion(object):
    """
    Interface for loss functions.
    """

    # Nothing needs done to this class, it's used by the following Criterion classes

    def __init__(self):
        # Placeholders that concrete criteria may fill in during forward.
        self.logits = None
        self.labels = None
        self.loss = None

    def __call__(self, x, y):
        """Evaluate the loss; shorthand for ``self.forward(x, y)``."""
        return self.forward(x, y)

    def forward(self, x, y):
        """
        Compute the loss of predictions x against targets y.
        Must be overridden by subclasses.

        Raises:
            NotImplementedError: always, in this base class.
        """
        # NotImplemented is a comparison sentinel, not an exception class;
        # raising it would surface as an unrelated TypeError.
        raise NotImplementedError

    def derivative(self):
        """
        Return the derivative of the loss from the last forward call.
        Must be overridden by subclasses.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError

In [7]:
class SoftmaxCrossEntropy(Criterion):
    """
    Softmax followed by cross-entropy loss, computed per sample.
    """

    def __init__(self):
        super().__init__()
        # Values cached by forward() for use in derivative().
        self.softmax_output = None  # softmax probabilities of the last logits
        self.labels = None  # targets passed to the last forward call
        self.logits = None  # raw inputs passed to the last forward call

    def forward(self, x, y):
        """
        Compute the per-sample cross-entropy between softmax(x) and y.

        Argument:
            x (np.array): (batch size, 10) logits
            y (np.array): (batch size, 10) targets
        Return:
            out (np.array): (batch size, )
        """
        # LogSumExp trick: subtract each row's maximum before exponentiating
        # so the exponentials cannot overflow, then add it back after the log.
        row_max = np.max(x, axis=1, keepdims=True)
        shifted_exp = np.exp(x - row_max)
        log_normalizer = np.log(np.sum(shifted_exp, axis=1, keepdims=True)) + row_max

        log_probs = x - log_normalizer
        self.softmax_output = np.exp(log_probs)

        # Cross-entropy: negative sum over classes of target * log-probability.
        per_sample_loss = -np.sum(y * log_probs, axis=1)
        self.logits = x
        self.labels = y

        return per_sample_loss

    def derivative(self):
        """
        Gradient of the loss with respect to the logits of the last forward call.

        Return:
            out (np.array): (batch size, 10)
        """
        # For softmax + cross-entropy this is the cached softmax output
        # minus the cached labels.
        return self.softmax_output - self.labels


# TASK 3: Linear layer

In [8]:

class Linear():
    """
    Fully connected layer computing ``x W + b``.

    Attributes:
        W (np.array): weights returned by weight_init_fn,
            expected shape (in feature, out feature)
        b (np.array): biases returned by bias_init_fn,
            expected shape (1, out feature)
        dW, db (np.array): gradients, same shapes as W and b
        momentum_W, momentum_b (np.array): momentum buffers, same shapes as W and b
        input (np.array): input of the most recent forward call (None before it)
    """

    def __init__(self, in_feature, out_feature, weight_init_fn, bias_init_fn):
        """
        Argument:
            in_feature (int): passed to weight_init_fn as the first size
            out_feature (int): passed to both initializers
            weight_init_fn (callable): called as weight_init_fn(in_feature, out_feature)
            bias_init_fn (callable): called as bias_init_fn(out_feature)
        """
        # Parameters come straight from the supplied initializers.
        self.W = weight_init_fn(in_feature, out_feature)
        self.b = bias_init_fn(out_feature)

        # Gradient and momentum buffers start at zero, shaped like the parameters.
        self.dW, self.momentum_W = np.zeros_like(self.W), np.zeros_like(self.W)
        self.db, self.momentum_b = np.zeros_like(self.b), np.zeros_like(self.b)

        # Filled in by forward(); backward() needs it to compute dW.
        self.input = None

    def __call__(self, x):
        return self.forward(x)

    def forward(self, x):
        """
        Apply the affine map and remember the input for backward().

        Argument:
            x (np.array): (batch size, in feature)
        Return:
            out (np.array): (batch size, out feature)
        """
        self.input = x
        return np.dot(x, self.W) + self.b

    def backward(self, delta):
        """
        Store batch-averaged parameter gradients and propagate delta to the input.

        Argument:
            delta (np.array): (batch size, out feature) - Derivative of loss w.r.t output
        Return:
            out (np.array): (batch size, in feature) - Derivative of loss w.r.t input
        """
        cached_input = self.input
        batch_size = cached_input.shape[0]

        # Both parameter gradients are divided by the batch size (averaged).
        self.dW = np.dot(cached_input.T, delta) / batch_size
        self.db = np.sum(delta, axis=0, keepdims=True) / batch_size

        # Gradient flowing to the previous layer is not averaged.
        return np.dot(delta, self.W.T)


# TASK 4: Multilayer perceptron

In [9]:


class MLP(object):

    """
    A simple multilayer perceptron: a stack of Linear layers, each followed
    by the activation at the same index in ``activations``.
    """

    def __init__(self, input_size, output_size, hiddens, activations, weight_init_fn,
                 bias_init_fn, criterion, lr):
        """
        Argument:
            input_size (int): number of input features
            output_size (int): number of output features
            hiddens (sequence of int): sizes of the hidden layers, in order
            activations (sequence): one activation object per linear layer
            weight_init_fn (callable): forwarded to every Linear layer
            bias_init_fn (callable): forwarded to every Linear layer
            criterion: loss object providing forward(x, y) and derivative()
            lr (float): learning rate used by step()
        """

        # Don't change this -->
        self.train_mode = True
        self.nlayers = len(hiddens) + 1
        self.input_size = input_size
        self.output_size = output_size
        self.activations = activations
        self.criterion = criterion
        self.lr = lr
        # <---------------------

        # list(...) lets hiddens be any sequence (e.g. a tuple), not only a list.
        layers_sizes = [input_size] + list(hiddens) + [output_size]
        self.linear_layers = [
            Linear(fan_in, fan_out, weight_init_fn, bias_init_fn)
            for fan_in, fan_out in zip(layers_sizes[:-1], layers_sizes[1:])
        ]

        # Populated by forward(); defined here so the attributes always exist.
        self.inputs = []
        self.output = None

    def forward(self, x):
        """
        Argument:
            x (np.array): (batch size, input_size)
        Return:
            out (np.array): (batch size, output_size)
        """
        self.inputs = []  # Input seen by each linear layer during this pass
        out = x
        for i in range(self.nlayers):
            self.inputs.append(out)
            out = self.linear_layers[i].forward(out)  # Linear transformation
            out = self.activations[i].forward(out)  # Activation function
        self.output = out  # Kept for backward(), error() and total_loss()
        return out

    def zero_grads(self):
        """
        Zero out gradients in linear layers.
        """
        for layer in self.linear_layers:
            layer.dW.fill(0.0)
            layer.db.fill(0.0)

    def step(self):
        """
        Apply a gradient descent step to the parameters of the linear layers.
        """
        for layer in self.linear_layers:
            # In-place update: parameter -= learning rate * gradient
            layer.W -= self.lr * layer.dW
            layer.b -= self.lr * layer.db

    def backward(self, labels):
        """
        Backpropagate through the network to compute gradients.

        Must be called after forward(); uses the stored ``self.output``.

        Argument:
            labels (np.array): (batch size, output_size)
        """
        # Criterion.derivative() takes no arguments: the criterion is run
        # forward first so it can cache what its derivative needs.
        self.criterion(self.output, labels)
        delta = self.criterion.derivative()

        for i in reversed(range(self.nlayers)):
            # Chain rule through the activation: its derivative() takes no
            # arguments and is multiplied element-wise into delta.
            delta = delta * self.activations[i].derivative()

            # The linear layer stores its own dW/db and returns the gradient
            # with respect to its input.
            delta = self.linear_layers[i].backward(delta)

    def error(self, labels):
        """
        Count the samples whose arg-max prediction differs from the
        arg-max of the labels, using the output of the last forward pass.
        """
        return (np.argmax(self.output, axis=1) != np.argmax(labels, axis=1)).sum()

    def total_loss(self, labels):
        """
        Sum the criterion's loss over the output of the last forward pass.
        """
        return self.criterion(self.output, labels).sum()

    def __call__(self, x):
        return self.forward(x)

    def train(self):
        """
        Set the model to training mode.
        """
        self.train_mode = True

    def eval(self):
        """
        Set the model to evaluation mode.
        """
        self.train_mode = False
