Q1. Download the fashion-MNIST dataset and plot 1 sample image for each class as shown in the grid below. Use from keras.datasets import fashion_mnist for getting the fashion mnist dataset.

In [1]:
import pandas as pd
import numpy as np
from keras.datasets import fashion_mnist
import wandb

In [2]:
sweep_config = {
    'method': 'random',
    'metric': {'name': 'accuracy', 'goal': 'maximize'},
    'parameters': {
        'learning_rate': {'values': [1e-3,1e-4]},
        'batch_size': {'values': [16, 32, 64]},
        'epochs': {'values': [5, 10]},
        'hidden_layers': {'values': [3,4,5]},
        'hidden_size': {'values': [32, 64,128]},
        'activation': {'values': ['relu', 'sigmoid','tanh']},
        'optimizer': {'values': ['sgd', 'momentum','nesterov','rmsprop','adam','nadam']},
        'weight_init': {'values': ['random', 'xavier']},
        'weight_decay': {"values": [0,0.0005,0.5]}
    },
    "run_cap":100
}

# wandb.init(project="Assignment - 1")  # Ensure WandB is initialized before using config
# wandb.login()
sweep_id = wandb.sweep(sweep_config, project="DLA1")

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Create sweep with ID: fwwuthks
Sweep URL: https://wandb.ai/da24m014-iit-madras/DLA1/sweeps/fwwuthks


In [3]:
# wandb.init(project="DLA1", entity="da24m014-iit-madras")

# # Load the Fashion-MNIST dataset
# (x_train, y_train), (x_test, y_test) = fashion_mnist.load_data()



# # Class names for Fashion-MNIST
# class_names = ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat', 
#                'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot']

# # Log sample images to wandb
# sample_images = []
# unique_classes = np.unique(y_train)

# for cls in unique_classes:
#     sample_idx = np.where(y_train == cls)[0][0]  # Find an example for the class
#     img = x_train[sample_idx]
    
#     sample_images.append(wandb.Image(img, caption=class_names[cls]))

# # Log images to wandb
# wandb.log({"Sample Images": sample_images})

# # Finish wandb run
# wandb.finish()

In [4]:
(x_train, y_train), (x_test, y_test) = fashion_mnist.load_data()
x_train, x_test = x_train / 255.0, x_test / 255.0
x_train, x_test = x_train.reshape(x_train.shape[0], -1), x_test.reshape(x_test.shape[0], -1)
num_classes = 10

In [5]:
def one_hot_encode(y, num_classes):
    encoded = np.zeros((y.size,num_classes))
    encoded[np.arange(y.size),y] = 1
    return encoded

y_train, y_test = one_hot_encode(y_train,num_classes), one_hot_encode(y_test, num_classes)

In [6]:
split_idx = int(0.9*len(x_train))
x_train, x_val = x_train[:split_idx], x_train[split_idx:]
y_train, y_val = y_train[:split_idx], y_train[split_idx:]

In [7]:
# Activation Functions
def relu(Z):
    return np.maximum(0, Z)

def sigmoid(Z):
    Z = np.clip(Z, -500, 500)
    return 1 / (1 + np.exp(-Z))

def tanh(Z):
    return np.tanh(Z)

def softmax(Z):
    expZ = np.exp(Z - np.max(Z, axis=1, keepdims=True))
    return expZ / np.sum(expZ, axis=1, keepdims=True)

activation_functions = {"relu": relu, "sigmoid": sigmoid, "tanh": tanh}

In [8]:
import numpy as np

# Stochastic Gradient Descent (SGD)
def sgd_update(weights, biases, grads_W, grads_b, learning_rate):
    for i in range(len(weights)):
        weights[i] -= learning_rate * grads_W[i]
        biases[i] -= learning_rate * grads_b[i]

# Momentum Optimizer
def momentum_update(weights, biases, grads_W, grads_b, learning_rate, velocity_W, velocity_b, momentum=0.9):
    for i in range(len(weights)):
        velocity_W[i] = momentum * velocity_W[i] - learning_rate * grads_W[i]
        velocity_b[i] = momentum * velocity_b[i] - learning_rate * grads_b[i]  # Corrected

        weights[i] += velocity_W[i]
        biases[i] += velocity_b[i] # Fixed Bias Update

# Nesterov Accelerated Gradient (NAG)
def nesterov_update(weights, biases, grads_W, grads_b, learning_rate, velocity_W,velocity_b, momentum=0.9):
    for i in range(len(weights)):
        # Compute lookahead position
        lookahead_W = weights[i] + momentum * velocity_W[i]
        lookahead_b = biases[i] + momentum * velocity_b[i]  # Corrected

        # Update velocity
        velocity_W[i] = momentum * velocity_W[i] - learning_rate * grads_W[i]
        velocity_b[i] = momentum * velocity_b[i] - learning_rate * grads_b[i]  # Corrected

        # Update weights and biases with corrected lookahead step
        weights[i] = lookahead_W + velocity_W[i]
        biases[i] = lookahead_b + velocity_b[i]

# RMSprop Optimizer
def rmsprop_update(weights, biases, grads_W, grads_b, learning_rate, velocity_W, velocity_b, beta=0.9, epsilon=1e-6):
    for i in range(len(weights)):
        # Update velocity for weights and biases separately
        velocity_W[i] = beta * velocity_W[i] + (1 - beta) * (grads_W[i] ** 2)
        velocity_b[i] = beta * velocity_b[i] + (1 - beta) * (grads_b[i] ** 2)

        # Update weights
        weights[i] -= learning_rate * grads_W[i] / (np.sqrt(velocity_W[i]) + epsilon)

        # Update biases
        biases[i] -= learning_rate * grads_b[i] / (np.sqrt(velocity_b[i]) + epsilon)

# Adam Optimizer
# Adam Optimizer
def adam_update(weights, biases, grads_W, grads_b, learning_rate, velocity_W, velocity_b, moment2_W, moment2_b, beta1=0.9, beta2=0.999, epsilon=1e-6, t=1):
    for i in range(len(weights)):
        # First moment estimate
        velocity_W[i] = beta1 * velocity_W[i] + (1 - beta1) * grads_W[i]
        velocity_b[i] = beta1 * velocity_b[i] + (1 - beta1) * grads_b[i]

        # Second moment estimate
        moment2_W[i] = beta2 * moment2_W[i] + (1 - beta2) * (grads_W[i] ** 2)
        moment2_b[i] = beta2 * moment2_b[i] + (1 - beta2) * (grads_b[i] ** 2)

        # Bias correction
        velocity_W_corrected = velocity_W[i] / (1 - beta1 ** t)
        velocity_b_corrected = velocity_b[i] / (1 - beta1 ** t)

        moment2_W_corrected = moment2_W[i] / (1 - beta2 ** t)
        moment2_b_corrected = moment2_b[i] / (1 - beta2 ** t)

        # Check and correct shape mismatch
        if moment2_b_corrected.shape != biases[i].shape:
            print(f"Shape mismatch at layer {i}: {moment2_b_corrected.shape} vs {biases[i].shape}")
            moment2_b_corrected = np.reshape(moment2_b_corrected, biases[i].shape)

        # Parameter update
        weights[i] -= learning_rate * velocity_W_corrected / (np.sqrt(moment2_W_corrected) + epsilon)
        biases[i] -= learning_rate * velocity_b_corrected / (np.sqrt(moment2_b_corrected) + epsilon)

    return t + 1  # Increment time step

def nadam_update(weights, biases, grads_W, grads_b, learning_rate, velocity_W, velocity_b, moment2_W, moment2_b, beta1=0.9, beta2=0.999, epsilon=1e-6, t=1):
    for i in range(len(weights)):
        # First moment estimate
        velocity_W[i] = beta1 * velocity_W[i] + (1 - beta1) * grads_W[i]
        velocity_b[i] = beta1 * velocity_b[i] + (1 - beta1) * grads_b[i]

        # Second moment estimate
        moment2_W[i] = beta2 * moment2_W[i] + (1 - beta2) * (grads_W[i] ** 2)
        moment2_b[i] = beta2 * moment2_b[i] + (1 - beta2) * (grads_b[i] ** 2)

        # Bias correction
        velocity_W_corrected = (beta1 * velocity_W[i] + (1 - beta1) * grads_W[i]) / (1 - beta1 ** t)
        velocity_b_corrected = (beta1 * velocity_b[i] + (1 - beta1) * grads_b[i]) / (1 - beta1 ** t)

        moment2_W_corrected = moment2_W[i] / (1 - beta2 ** t)
        moment2_b_corrected = moment2_b[i] / (1 - beta2 ** t)

        # Parameter update
        weights[i] -= learning_rate * velocity_W_corrected / (np.sqrt(moment2_W_corrected) + epsilon)
        biases[i] -= learning_rate * velocity_b_corrected / (np.sqrt(moment2_b_corrected) + epsilon)

    return t + 1  # Increment time step


In [9]:
class NeuralNetwork:
    def __init__(self, layers, learning_rate=0.001, activation="relu", optimizer="sgd",
                 weight_init="random", weight_decay=0.0, beta=0.5, beta1=0.5, beta2=0.5, epsilon=1e-6):
        self.layers = layers
        self.activation = activation
        self.optimizer = optimizer
        self.learning_rate = learning_rate
        self.weight_init = weight_init
        self.weight_decay = weight_decay
        self.beta = beta
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        
        self.init_weights(weight_init)
        
        # Initialize optimizer-specific parameters
        self.velocity_W = [np.zeros_like(W) for W in self.weights]
        self.velocity_b = [np.zeros_like(b) for b in self.biases]
        self.moment2_W = [np.zeros_like(W) for W in self.weights]  # First moment estimate
        self.moment2_b = [np.zeros_like(b) for b in self.biases]  # Second moment estimate
        self.t = 0  # Timestep for Adam/Nadam

    def init_weights(self, method):
        self.weights = []
        self.biases = []
        for i in range(len(self.layers) - 1):
            if method == "xavier":
                limit = np.sqrt(6 / (self.layers[i] + self.layers[i+1]))
            else:  # Default to "random"
                limit = 0.1
            W = np.random.uniform(-limit, limit, (self.layers[i], self.layers[i+1]))
            self.weights.append(W)
            self.biases.append(np.zeros((1, self.layers[i+1])))

    def forward(self, X):
        self.A = [X]
        for i in range(len(self.weights) - 1):
            Z = self.A[-1] @ self.weights[i] + self.biases[i]
            A = activation_functions[self.activation](Z)
            self.A.append(A)
        Z = self.A[-1] @ self.weights[-1] + self.biases[-1]
        A = softmax(Z)
        self.A.append(A)
        return A
    
    def compute_loss(self, y_true, y_pred):
        loss = -np.mean(np.sum(y_true * np.log(y_pred + 1e-8), axis=1))
        loss += (self.weight_decay / 2) * sum(np.sum(W**2) for W in self.weights)
        return loss
    
    def backward(self, X, y):
        grads_W, grads_b = [], []
        dA = self.A[-1] - y
        for i in reversed(range(len(self.weights))):
            dW = self.A[i].T @ dA / X.shape[0]
            db = np.sum(dA, axis=0, keepdims=True) / X.shape[0]
            dW += self.weight_decay * self.weights[i]
            grads_W.append(dW)
            grads_b.append(db)
            if i > 0:
                dA = (dA @ self.weights[i].T) * (self.A[i] > 0)
        return grads_W[::-1], grads_b[::-1]
    
    def train(self, X_train, y_train, x_val, y_val, epochs, batch_size):
        num_samples = X_train.shape[0]

        for epoch in range(epochs):
            indices = np.random.permutation(num_samples)
            X_train, y_train = X_train[indices], y_train[indices]

            total_loss, total_acc = 0, 0
            num_batches = num_samples // batch_size

            for i in range(0, num_samples, batch_size):
                X_batch = X_train[i:i + batch_size]
                y_batch = y_train[i:i + batch_size]
                
                # Forward Pass
                y_pred = self.forward(X_batch)
                
                # Compute Loss & Accuracy
                loss = self.compute_loss(y_batch, y_pred)
                acc = np.mean(np.argmax(y_pred, axis=1) == np.argmax(y_batch, axis=1))
                
                total_loss += loss * len(X_batch)
                total_acc += acc * len(X_batch)
                
                # Backward Pass
                grads_W, grads_b = self.backward(X_batch, y_batch)
                
                # Update Weights using the selected optimizer
                self.t += 1  # Increment timestep for Adam/Nadam
                if self.optimizer == "sgd":
                    sgd_update(self.weights, self.biases, grads_W, grads_b, self.learning_rate)
                elif self.optimizer == "momentum":
                    momentum_update(self.weights, self.biases, grads_W, grads_b, self.learning_rate, self.velocity_W,self.velocity_b)
                elif self.optimizer == "nesterov":
                    nesterov_update(self.weights, self.biases, grads_W, grads_b, self.learning_rate, self.velocity_W,self.velocity_b)
                elif self.optimizer == "rmsprop":
                    rmsprop_update(self.weights, self.biases, grads_W, grads_b, self.learning_rate, self.velocity_W,self.velocity_b, self.beta, self.epsilon)
                elif self.optimizer == "adam":
                    adam_update(self.weights, self.biases, grads_W, grads_b, self.learning_rate, self.velocity_W,self.velocity_b,self.moment2_W, self.moment2_b, self.beta1, self.beta2, self.epsilon, self.t)
                elif self.optimizer == "nadam":
                    nadam_update(self.weights, self.biases, grads_W, grads_b, self.learning_rate, self.velocity_W,self.velocity_b,self.moment2_W, self.moment2_b, self.beta1, self.beta2, self.epsilon, self.t)
            
            # Compute average loss and accuracy for the epoch
            avg_loss = total_loss / num_samples
            avg_acc = total_acc / num_samples

            # Validation Metrics
            y_val_pred = self.forward(x_val)
            val_loss = self.compute_loss(y_val, y_val_pred)
            val_acc = np.mean(np.argmax(y_val_pred, axis=1) == np.argmax(y_val, axis=1))

            # Log to Weights & Biases
            wandb.log({"epoch": epoch + 1, "loss": avg_loss, "accuracy": avg_acc, "val_loss": val_loss, "val_accuracy": val_acc})


In [10]:
def train_with_wandb():
    wandb.init(project="DLA1", entity="da24m014") # Ensure WandB is initialized before using config
    config = wandb.config
    run_name = f"hl_{config.hidden_layers}_bs_{config.batch_size}_ac_{config.activation}"
    wandb.run.name = run_name
    model = NeuralNetwork([784] + [config.hidden_size] * config.hidden_layers + [10],
                          learning_rate=config.learning_rate,
                          activation=config.activation,
                          optimizer=config.optimizer,
                          weight_init=config.weight_init,
                          weight_decay=config.weight_decay
                          )
    model.train(x_train, y_train, x_test, y_test, config.epochs, config.batch_size)
    # wandb.finish()
wandb.agent(sweep_id, function=train_with_wandb)


[34m[1mwandb[0m: Agent Starting Run: 0f3ad3tn with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_layers: 5
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: xavier
[34m[1mwandb[0m: Currently logged in as: [33mda24m014[0m ([33mda24m014-iit-madras[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


0,1
accuracy,▁▃▅▇█
epoch,▁▃▅▆█
loss,█▃▂▁▁
val_accuracy,▁▆█▇█
val_loss,█▃▃▁▁

0,1
accuracy,0.5962
epoch,5.0
loss,0.98488
val_accuracy,0.5903
val_loss,0.98973


[34m[1mwandb[0m: Agent Starting Run: ddma95j8 with config:
[34m[1mwandb[0m: 	activation: relu
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_layers: 4
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: xavier


0,1
accuracy,▁▇█████▇▇▇
epoch,▁▂▃▃▄▅▆▆▇█
loss,█▁▁▁▂▃▃▄▅▅
val_accuracy,▁▅█▅▆▃▆▁▇▆
val_loss,▃▂▁▄▅▇▅█▆▆

0,1
accuracy,0.84146
epoch,10.0
loss,0.5364
val_accuracy,0.8392
val_loss,0.59012


[34m[1mwandb[0m: Agent Starting Run: i73ylonv with config:
[34m[1mwandb[0m: 	activation: relu
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_layers: 3
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: random


0,1
accuracy,▁▆▇██
epoch,▁▃▅▆█
loss,█▃▂▁▁
val_accuracy,▁▆▇██
val_loss,█▃▂▁▃

0,1
accuracy,0.86983
epoch,5.0
loss,0.41989
val_accuracy,0.8509
val_loss,0.48631


[34m[1mwandb[0m: Agent Starting Run: xgwurcwi with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_layers: 4
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	optimizer: nesterov
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: random


0,1
accuracy,▁▁▁▁▂▂▄▆██
epoch,▁▂▃▃▄▅▆▆▇█
loss,████████▆▁
val_accuracy,▁▁█▁▁▁▇▇▇▇
val_loss,███████▇▄▁

0,1
accuracy,0.17985
epoch,10.0
loss,2.1126
val_accuracy,0.1813
val_loss,2.08743


[34m[1mwandb[0m: Ctrl + C detected. Stopping sweep.
