In [None]:
from keras.datasets import fashion_mnist
import numpy as np
from tensorflow.keras.utils import to_categorical
import wandb

In [None]:
# Start a Weights & Biases run for this notebook
wandb.init(
    entity="da6401-assignments",
    project="assignment1",
)

In [None]:
# Fetch the Fashion-MNIST train/test arrays
(x_train, y_train), (x_test, y_test) = fashion_mnist.load_data()

# Flatten every image into a 784-long row and divide the pixel values by 255
x_train = x_train.reshape(-1, 784) / 255.0
x_test = x_test.reshape(-1, 784) / 255.0

# Convert the integer class labels into one-hot rows
y_train = to_categorical(y_train)
y_test = to_categorical(y_test)

# Index that separates the first 10% of the training rows from the rest
val_split = int(0.1 * x_train.shape[0])

# Rows before the split index become validation data, the remainder stays as training data
# (the right-hand sides are evaluated before any name is rebound)
x_val, x_train = x_train[:val_split], x_train[val_split:]
y_val, y_train = y_train[:val_split], y_train[val_split:]

In [None]:
class FeedforwardNeuralNetwork:
    def __init__(self, input_size, hidden_layers, output_size):
        """
        Initialize the neural network.
        :param input_size: Number of input features.
        :param hidden_layers: List containing the number of neurons in each hidden layer.
        :param output_size: Number of output classes.
        """
        self.layers = []
        self.weights = []
        self.biases = []
        self.velocities_w = []  # For momentum-based optimizers
        self.velocities_b = []
        self.squared_w = []  # For RMSprop/Adam
        self.squared_b = []
        self.moments_w = []  # For Nadam
        self.moments_b = []

        # Input layer to first hidden layer
        prev_size = input_size
        for layer_size in hidden_layers:
            self.weights.append(np.random.randn(prev_size, layer_size) * 0.01)
            self.biases.append(np.zeros((1, layer_size)))
            self.velocities_w.append(np.zeros((prev_size, layer_size)))
            self.velocities_b.append(np.zeros((1, layer_size)))
            self.squared_w.append(np.zeros((prev_size, layer_size)))
            self.squared_b.append(np.zeros((1, layer_size)))
            self.moments_w.append(np.zeros((prev_size, layer_size)))
            self.moments_b.append(np.zeros((1, layer_size)))
            prev_size = layer_size

        # Hidden layers to output layer
        self.weights.append(np.random.randn(prev_size, output_size) * 0.01)
        self.biases.append(np.zeros((1, output_size)))
        self.velocities_w.append(np.zeros((prev_size, output_size)))
        self.velocities_b.append(np.zeros((1, output_size)))
        self.squared_w.append(np.zeros((prev_size, output_size)))
        self.squared_b.append(np.zeros((1, output_size)))
        self.moments_w.append(np.zeros((prev_size, output_size)))
        self.moments_b.append(np.zeros((1, output_size)))
    
    def train(self, x_train, y_train, epochs, learning_rate, optimizer='sgd', batch_size=32):
        """
        Train the neural network.
        :param x_train: Training data.
        :param y_train: Training labels (one-hot encoded).
        :param epochs: Number of training iterations.
        :param learning_rate: Learning rate.
        :param optimizer: Optimization algorithm ('sgd', 'adam', etc.).
        :param batch_size: Size of mini-batches for training.
        """
        for epoch in range(epochs):
            indices = np.arange(x_train.shape[0])
            np.random.shuffle(indices)
            x_train, y_train = x_train[indices], y_train[indices]
            
            for i in range(0, x_train.shape[0], batch_size):
                x_batch = x_train[i:i+batch_size]
                y_batch = y_train[i:i+batch_size]
                self.forward(x_batch)
                self.backpropagation(x_batch, y_batch, learning_rate, optimizer)
            
            if epoch % 10 == 0:
                loss = -np.mean(y_train * np.log(self.forward(x_train) + 1e-8))
                print(f"Epoch {epoch}: Loss = {loss:.4f}")

    def sigmoid(self, x):
        """Sigmoid activation function."""
        return 1 / (1 + np.exp(-x))

    def sigmoid_derivative(self, x):
        """Derivative of the sigmoid function."""
        return x * (1 - x)

    def softmax(self, x):
        """Softmax activation function."""
        exp_x = np.exp(x - np.max(x, axis=1, keepdims=True))
        return exp_x / np.sum(exp_x, axis=1, keepdims=True)

    def forward(self, x):
        """
        Perform a forward pass through the network.
        :param x: Input data.
        :return: Output probabilities.
        """
        self.layers = [x]
        for w, b in zip(self.weights[:-1], self.biases[:-1]):
            x = self.sigmoid(np.dot(x, w) + b)
            self.layers.append(x)

        # Output layer with softmax
        output = self.softmax(np.dot(x, self.weights[-1]) + self.biases[-1])
        self.layers.append(output)
        return output

    def backpropagation(self, x, y, learning_rate, optimizer='sgd', beta1=0.9, beta2=0.999, epsilon=1e-8):
        """
        Perform backpropagation and update weights.
        :param x: Input data.
        :param y: True labels (one-hot encoded).
        :param learning_rate: Learning rate for weight updates.
        :param optimizer: Optimization algorithm ('sgd', 'rmsprop', 'adam', 'nadam', 'nag').
        :param beta1: Momentum parameter for Adam/Nadam.
        :param beta2: RMSprop/Adam decay parameter.
        :param epsilon: Small value to prevent division by zero.
        """
        m = y.shape[0]  # Number of samples
        deltas = [self.layers[-1] - y]  # Output layer error

        # Backpropagate errors for hidden layers
        for i in range(len(self.weights) - 1, 0, -1):
            deltas.append(deltas[-1].dot(self.weights[i].T) * self.sigmoid_derivative(self.layers[i]))
        deltas.reverse()

        # Update weights and biases based on optimizer
        for i in range(len(self.weights)):
            dw = self.layers[i].T.dot(deltas[i]) / m
            db = np.mean(deltas[i], axis=0, keepdims=True)

            if optimizer == 'sgd':
                self.weights[i] -= learning_rate * dw
                self.biases[i] -= learning_rate * db
            
            elif optimizer == 'momentum':
                self.velocities_w[i] = beta1 * self.velocities_w[i] - learning_rate * dw
                self.velocities_b[i] = beta1 * self.velocities_b[i] - learning_rate * db
                self.weights[i] += self.velocities_w[i]
                self.biases[i] += self.velocities_b[i]
            
            elif optimizer == 'nag':
                prev_w = self.velocities_w[i]
                prev_b = self.velocities_b[i]
                self.velocities_w[i] = beta1 * prev_w - learning_rate * dw
                self.velocities_b[i] = beta1 * prev_b - learning_rate * db
                self.weights[i] += -beta1 * prev_w + (1 + beta1) * self.velocities_w[i]
                self.biases[i] += -beta1 * prev_b + (1 + beta1) * self.velocities_b[i]
            
            elif optimizer == 'rmsprop':
                self.squared_w[i] = beta2 * self.squared_w[i] + (1 - beta2) * (dw ** 2)
                self.squared_b[i] = beta2 * self.squared_b[i] + (1 - beta2) * (db ** 2)
                self.weights[i] -= learning_rate * dw / (np.sqrt(self.squared_w[i]) + epsilon)
                self.biases[i] -= learning_rate * db / (np.sqrt(self.squared_b[i]) + epsilon)
            
            elif optimizer == 'adam' or optimizer == 'nadam':
                self.velocities_w[i] = beta1 * self.velocities_w[i] + (1 - beta1) * dw
                self.velocities_b[i] = beta1 * self.velocities_b[i] + (1 - beta1) * db
                self.squared_w[i] = beta2 * self.squared_w[i] + (1 - beta2) * (dw ** 2)
                self.squared_b[i] = beta2 * self.squared_b[i] + (1 - beta2) * (db ** 2)
                if optimizer == 'nadam':
                    dw = beta1 * self.velocities_w[i] + (1 - beta1) * dw
                    db = beta1 * self.velocities_b[i] + (1 - beta1) * db
                self.weights[i] -= learning_rate * dw / (np.sqrt(self.squared_w[i]) + epsilon)
                self.biases[i] -= learning_rate * db / (np.sqrt(self.squared_b[i]) + epsilon)