Q1. Download the fashion-MNIST dataset and plot 1 sample image for each class as shown in the grid below. Use from keras.datasets import fashion_mnist for getting the fashion mnist dataset.

In [14]:
import pandas as pd
import numpy as np
from keras.datasets import fashion_mnist
import wandb

In [15]:
sweep_config = {
    'method': 'random',
    'metric': {'name': 'accuracy', 'goal': 'maximize'},
    'parameters': {
        'learning_rate': {'values': [1e-3,1e-4]},
        'batch_size': {'values': [16, 32, 64]},
        'epochs': {'values': [5, 10]},
        'hidden_layers': {'values': [3,4,5]},
        'hidden_size': {'values': [32, 64, 128]},
        'activation': {'values': ['relu', 'sigmoid','tanh']},
        'optimizer': {'values': ['sgd', 'momentum','nesterov','rmsprop','adam','nadam']},
        'weight_init': {'values': ['random', 'xavier']},
        'weight_decay': {"values": [0,0.0005,0.5]}
    },
    "run_cap":100
}

# wandb.init(project="Assignment - 1")  # Ensure WandB is initialized before using config
# wandb.login()
sweep_id = wandb.sweep(sweep_config, project="DLA1")

Create sweep with ID: 2devlgbi
Sweep URL: https://wandb.ai/da24m014-iit-madras/DLA1/sweeps/2devlgbi


In [16]:
# wandb.init(project="DLA1", entity="da24m014-iit-madras")

# # Load the Fashion-MNIST dataset
# (x_train, y_train), (x_test, y_test) = fashion_mnist.load_data()



# # Class names for Fashion-MNIST
# class_names = ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat', 
#                'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot']

# # Log sample images to wandb
# sample_images = []
# unique_classes = np.unique(y_train)

# for cls in unique_classes:
#     sample_idx = np.where(y_train == cls)[0][0]  # Find an example for the class
#     img = x_train[sample_idx]
    
#     sample_images.append(wandb.Image(img, caption=class_names[cls]))

# # Log images to wandb
# wandb.log({"Sample Images": sample_images})

# # Finish wandb run
# wandb.finish()

In [17]:
(x_train, y_train), (x_test, y_test) = fashion_mnist.load_data()
x_train, x_test = x_train / 255.0, x_test / 255.0
x_train, x_test = x_train.reshape(x_train.shape[0], -1), x_test.reshape(x_test.shape[0], -1)
num_classes = 10

In [18]:
def one_hot_encode(y, num_classes):
    encoded = np.zeros((y.size,num_classes))
    encoded[np.arange(y.size),y] = 1
    return encoded

y_train, y_test = one_hot_encode(y_train,num_classes), one_hot_encode(y_test, num_classes)

In [19]:
split_idx = int(0.9*len(x_train))
x_train, x_val = x_train[:split_idx], x_train[split_idx:]
y_train, y_val = y_train[:split_idx], y_train[split_idx:]

In [20]:
# Activation Functions
def relu(Z):
    return np.maximum(0, Z)

def sigmoid(Z):
    Z = np.clip(Z, -500, 500)
    return 1 / (1 + np.exp(-Z))

def tanh(Z):
    return np.tanh(Z)

def softmax(Z):
    expZ = np.exp(Z - np.max(Z, axis=1, keepdims=True))
    return expZ / np.sum(expZ, axis=1, keepdims=True)

activation_functions = {"relu": relu, "sigmoid": sigmoid, "tanh": tanh}

def relu_derivative(A):
    return (A > 0).astype(float)

def sigmoid_derivative(A):
    return A * (1 - A)  # Works because in forward pass, A = sigmoid(Z)

def tanh_derivative(A):
    return 1 - A**2  # Works because in forward pass, A = tanh(Z)

activation_derivatives = {
    "relu": relu_derivative,
    "sigmoid": sigmoid_derivative,
    "tanh": tanh_derivative
}


In [None]:
# Stochastic Gradient Descent (SGD)
def sgd_update(weights, biases, grads_W, grads_b, learning_rate):
    for i in range(len(weights)):
        weights[i] -= learning_rate * grads_W[i]
        biases[i] -= learning_rate * grads_b[i]

# Momentum Optimizer
def momentum_update(weights, biases, grads_W, grads_b, learning_rate, velocity_W, velocity_b, momentum=0.5):
    for i in range(len(weights)):
        velocity_W[i] = momentum * velocity_W[i] - learning_rate * grads_W[i]
        velocity_b[i] = momentum * velocity_b[i] - learning_rate * grads_b[i]  # Corrected

        weights[i] += velocity_W[i]
        biases[i] += velocity_b[i] # Fixed Bias Update

# Nesterov Accelerated Gradient (NAG)
def nesterov_update(weights, biases, grads_W, grads_b, learning_rate, velocity_W,velocity_b, momentum=0.5):
    for i in range(len(weights)):
        # Compute lookahead position
        lookahead_W = weights[i] + momentum * velocity_W[i]
        lookahead_b = biases[i] + momentum * velocity_b[i]  # Corrected

        # Update velocity
        velocity_W[i] = momentum * velocity_W[i] - learning_rate * grads_W[i]
        velocity_b[i] = momentum * velocity_b[i] - learning_rate * grads_b[i]  # Corrected

        # Update weights and biases with corrected lookahead step
        weights[i] = lookahead_W + velocity_W[i]
        biases[i] = lookahead_b + velocity_b[i]

# RMSprop Optimizer
def rmsprop_update(weights, biases, grads_W, grads_b, learning_rate, velocity_W, velocity_b, beta=0.5, epsilon=1e-6):
    for i in range(len(weights)):
        # Update velocity for weights and biases separately
        velocity_W[i] = beta * velocity_W[i] + (1 - beta) * (grads_W[i] ** 2)
        velocity_b[i] = beta * velocity_b[i] + (1 - beta) * (grads_b[i] ** 2)

        # Update weights
        weights[i] -= learning_rate * grads_W[i] / (np.sqrt(velocity_W[i]) + epsilon)

        # Update biases
        biases[i] -= learning_rate * grads_b[i] / (np.sqrt(velocity_b[i]) + epsilon)

# Adam Optimizer
# Adam Optimizer
def adam_update(weights, biases, grads_W, grads_b, learning_rate, velocity_W, velocity_b, moment2_W, moment2_b, beta1=0.5, beta2=0.5, epsilon=1e-6, t=1):
    for i in range(len(weights)):
        # First moment estimate
        velocity_W[i] = beta1 * velocity_W[i] + (1 - beta1) * grads_W[i]
        velocity_b[i] = beta1 * velocity_b[i] + (1 - beta1) * grads_b[i]

        # Second moment estimate
        moment2_W[i] = beta2 * moment2_W[i] + (1 - beta2) * (grads_W[i] ** 2)
        moment2_b[i] = beta2 * moment2_b[i] + (1 - beta2) * (grads_b[i] ** 2)

        # Bias correction
        velocity_W_corrected = velocity_W[i] / (1 - beta1 ** t)
        velocity_b_corrected = velocity_b[i] / (1 - beta1 ** t)

        moment2_W_corrected = moment2_W[i] / (1 - beta2 ** t)
        moment2_b_corrected = moment2_b[i] / (1 - beta2 ** t)

        # Check and correct shape mismatch
        if moment2_b_corrected.shape != biases[i].shape:
            print(f"Shape mismatch at layer {i}: {moment2_b_corrected.shape} vs {biases[i].shape}")
            moment2_b_corrected = np.reshape(moment2_b_corrected, biases[i].shape)

        # Parameter update
        weights[i] -= learning_rate * velocity_W_corrected / (np.sqrt(moment2_W_corrected) + epsilon)
        biases[i] -= learning_rate * velocity_b_corrected / (np.sqrt(moment2_b_corrected) + epsilon)

    return t + 1  # Increment time step

def nadam_update(weights, biases, grads_W, grads_b, learning_rate, velocity_W, velocity_b, moment2_W, moment2_b, beta1=0.5, beta2=0.5, epsilon=1e-6, t=1):
    for i in range(len(weights)):
        # First moment estimate
        velocity_W[i] = beta1 * velocity_W[i] + (1 - beta1) * grads_W[i]
        velocity_b[i] = beta1 * velocity_b[i] + (1 - beta1) * grads_b[i]

        # Second moment estimate
        moment2_W[i] = beta2 * moment2_W[i] + (1 - beta2) * (grads_W[i] ** 2)
        moment2_b[i] = beta2 * moment2_b[i] + (1 - beta2) * (grads_b[i] ** 2)

        # Bias correction
        velocity_W_corrected = (beta1 * velocity_W[i] + (1 - beta1) * grads_W[i]) / (1 - beta1 ** t)
        velocity_b_corrected = (beta1 * velocity_b[i] + (1 - beta1) * grads_b[i]) / (1 - beta1 ** t)

        moment2_W_corrected = moment2_W[i] / (1 - beta2 ** t)
        moment2_b_corrected = moment2_b[i] / (1 - beta2 ** t)

        # Parameter update
        weights[i] -= learning_rate * velocity_W_corrected / (np.sqrt(moment2_W_corrected) + epsilon)
        biases[i] -= learning_rate * velocity_b_corrected / (np.sqrt(moment2_b_corrected) + epsilon)

    return t + 1  # Increment time step


In [22]:

# class NeuralNetwork:
#     def __init__(self, layers, learning_rate=0.1, activation="sigmoid", optimizer="sgd",
#                  weight_init="random", weight_decay=0.0, beta=0.5, beta1=0.5, beta2=0.5, epsilon=1e-6):
#         self.layers = layers
#         self.activation = activation
#         self.optimizer = optimizer
#         self.learning_rate = learning_rate
#         self.weight_init = weight_init
#         self.weight_decay = weight_decay
#         self.beta = beta
#         self.beta1 = beta1
#         self.beta2 = beta2
#         self.epsilon = epsilon
        
#         self.init_weights(weight_init)

#         # Optimizer-specific parameters
#         self.velocity_W = [np.zeros_like(W) for W in self.weights]
#         self.velocity_b = [np.zeros_like(b) for b in self.biases]
#         self.moment2_W = [np.zeros_like(W) for W in self.weights]
#         self.moment2_b = [np.zeros_like(b) for b in self.biases]
#         self.t = 0  # Timestep for Adam/Nadam

#         # Store best model
#         global best_model_data
#         best_model_data = {
#             "weights": None,
#             "biases": None,
#             "params": None,
#             "best_accuracy": 0
#         }

#     def init_weights(self, method):
#         self.weights = []
#         self.biases = []
#         for i in range(len(self.layers) - 1):
#             if method == "xavier":
#                 limit = np.sqrt(6 / (self.layers[i] + self.layers[i+1]))
#             else:
#                 limit = 0.1
#             W = np.random.uniform(-limit, limit, (self.layers[i], self.layers[i+1]))
#             self.weights.append(W)
#             self.biases.append(np.zeros((1, self.layers[i+1])))

#     def forward(self, X):
#         self.A = [X]
#         for i in range(len(self.weights) - 1):
#             Z = self.A[-1] @ self.weights[i] + self.biases[i]
#             A = activation_functions[self.activation](Z)
#             self.A.append(A)
#         Z = self.A[-1] @ self.weights[-1] + self.biases[-1]
#         A = softmax(Z)
#         self.A.append(A)
#         return A

#     def compute_loss(self, y_true, y_pred):
#         loss = -np.mean(np.sum(y_true * np.log(y_pred + 1e-8), axis=1))
#         loss += (self.weight_decay / 2) * sum(np.sum(W**2) for W in self.weights)
#         return loss

#     def backward(self, X, y):
#         grads_W, grads_b = [], []
#         dA = self.A[-1] - y
#         for i in reversed(range(len(self.weights))):
#             dW = self.A[i].T @ dA / X.shape[0]
#             db = np.sum(dA, axis=0, keepdims=True) / X.shape[0]
#             dW += self.weight_decay * self.weights[i]
#             grads_W.append(dW)
#             grads_b.append(db)
#             if i > 0:
#                 dA = (dA @ self.weights[i].T) * activation_derivatives[self.activation](self.A[i])        
#         return grads_W[::-1], grads_b[::-1]

#     def train(self, X_train, y_train, x_val, y_val, epochs, batch_size):
#         num_samples = X_train.shape[0]

#         for epoch in range(epochs):
#             indices = np.random.permutation(num_samples)
#             X_train, y_train = X_train[indices], y_train[indices]

#             total_loss, total_acc = 0, 0
#             num_batches = num_samples // batch_size

#             for i in range(0, num_samples, batch_size):
#                 X_batch = X_train[i:i + batch_size]
#                 y_batch = y_train[i:i + batch_size]

#                 # Forward Pass
#                 y_pred = self.forward(X_batch)

#                 # Compute Loss & Accuracy
#                 loss = self.compute_loss(y_batch, y_pred)
#                 acc = np.mean(np.argmax(y_pred, axis=1) == np.argmax(y_batch, axis=1))

#                 total_loss += loss * len(X_batch)
#                 total_acc += acc * len(X_batch)

#                 # Backward Pass
#                 grads_W, grads_b = self.backward(X_batch, y_batch)

#                 # Update Weights using the selected optimizer
#                 self.t += 1
#                 if self.optimizer == "sgd":
#                     sgd_update(self.weights, self.biases, grads_W, grads_b, self.learning_rate)
#                 elif self.optimizer == "momentum":
#                     momentum_update(self.weights, self.biases, grads_W, grads_b, self.learning_rate, self.velocity_W,self.velocity_b)
#                 elif self.optimizer == "nesterov":
#                     nesterov_update(self.weights, self.biases, grads_W, grads_b, self.learning_rate, self.velocity_W,self.velocity_b)
#                 elif self.optimizer == "rmsprop":
#                     rmsprop_update(self.weights, self.biases, grads_W, grads_b, self.learning_rate, self.velocity_W,self.velocity_b, self.beta, self.epsilon)
#                 elif self.optimizer == "adam":
#                     adam_update(self.weights, self.biases, grads_W, grads_b, self.learning_rate, self.velocity_W,self.velocity_b,self.moment2_W, self.moment2_b, self.beta1, self.beta2, self.epsilon, self.t)
#                 elif self.optimizer == "nadam":
#                     nadam_update(self.weights, self.biases, grads_W, grads_b, self.learning_rate, self.velocity_W,self.velocity_b,self.moment2_W, self.moment2_b, self.beta1, self.beta2, self.epsilon, self.t)  # Generic optimizer function

#             # Compute average loss and accuracy for the epoch
#             avg_loss = total_loss / num_samples
#             avg_acc = total_acc / num_samples

#             # Validation Metrics
#             y_val_pred = self.forward(x_val)
#             val_loss = self.compute_loss(y_val, y_val_pred)
#             val_acc = np.mean(np.argmax(y_val_pred, axis=1) == np.argmax(y_val, axis=1))

#             # Save Best Model (Outside Class)
#             global best_model_data
#             if val_acc > best_model_data["best_accuracy"]:
#                 best_model_data = {
#                     "weights": [W.copy() for W in self.weights],
#                     "biases": [b.copy() for b in self.biases],
#                     "params": {
#                         "layers": self.layers,
#                         "learning_rate": self.learning_rate,
#                         "activation": self.activation,
#                         "optimizer": self.optimizer,
#                         "weight_init": self.weight_init,
#                         "weight_decay": self.weight_decay,
#                         "beta": self.beta,
#                         "beta1": self.beta1,
#                         "beta2": self.beta2,
#                         "epsilon": self.epsilon,
#                     },
#                     "best_accuracy": val_acc
#                 }

#             # Log to Weights & Biases
#             wandb.log({"epoch": epoch + 1, "loss": avg_loss, "accuracy": avg_acc, "val_loss": val_loss, "val_accuracy": val_acc})



In [23]:
class NeuralNetwork:
    def __init__(self, layers, learning_rate=0.1, activation="sigmoid", optimizer="sgd",
                 weight_init="random", weight_decay=0.0, beta=0.5, beta1=0.5, beta2=0.5, epsilon=1e-6):
        self.layers = layers
        self.activation = activation
        self.optimizer = optimizer
        self.learning_rate = learning_rate
        self.weight_init = weight_init
        self.weight_decay = weight_decay
        self.beta = beta
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon

        self.init_weights(weight_init)

        # Optimizer-specific parameters
        self.velocity_W = [np.zeros_like(W) for W in self.weights]
        self.velocity_b = [np.zeros_like(b) for b in self.biases]
        self.moment2_W = [np.zeros_like(W) for W in self.weights]
        self.moment2_b = [np.zeros_like(b) for b in self.biases]
        self.t = 0  # Timestep for Adam/Nadam

        # Store best model
        global best_model_data
        best_model_data = {
            "weights": None,
            "biases": None,
            "params": None,
            "best_accuracy": 0
        }

    def init_weights(self, method):
        self.weights = []
        self.biases = []
        for i in range(len(self.layers) - 1):
            if method == "xavier":
                limit = np.sqrt(6 / (self.layers[i] + self.layers[i+1]))
            else:
                limit = 0.1
            W = np.random.uniform(-limit, limit, (self.layers[i], self.layers[i+1]))
            self.weights.append(W)
            self.biases.append(np.zeros((1, self.layers[i+1])))

    def forward(self, X):
        self.A = [X]
        for i in range(len(self.weights) - 1):
            Z = self.A[-1] @ self.weights[i] + self.biases[i]
            A = activation_functions[self.activation](Z)
            self.A.append(A)
        Z = self.A[-1] @ self.weights[-1] + self.biases[-1]
        A = softmax(Z)
        self.A.append(A)
        return A

    def compute_loss(self, y_true, y_pred):
        loss = -np.mean(np.sum(y_true * np.log(y_pred + 1e-8), axis=1))
        loss += (self.weight_decay / 2) * sum(np.sum(W**2) for W in self.weights)
        return loss

    def backward(self, X, y):
        grads_W, grads_b = [], []
        dA = self.A[-1] - y
        for i in reversed(range(len(self.weights))):
            dW = self.A[i].T @ dA / X.shape[0]
            db = np.sum(dA, axis=0, keepdims=True) / X.shape[0]
            dW += self.weight_decay * self.weights[i]
            grads_W.append(dW)
            grads_b.append(db)
            if i > 0:
                dA = (dA @ self.weights[i].T) * activation_derivatives[self.activation](self.A[i])
        return grads_W[::-1], grads_b[::-1]

    def train(self, X_train, y_train, X_val, y_val, epochs, batch_size):
        from sklearn.metrics import confusion_matrix
        import wandb
        
        num_samples = X_train.shape[0]

        for epoch in range(epochs):
            indices = np.random.permutation(num_samples)
            X_train, y_train = X_train[indices], y_train[indices]

            total_loss, total_acc = 0, 0
            num_batches = num_samples // batch_size

            for i in range(0, num_samples, batch_size):
                X_batch = X_train[i:i + batch_size]
                y_batch = y_train[i:i + batch_size]

                # Forward Pass
                y_pred = self.forward(X_batch)

                # Compute Loss & Accuracy
                loss = self.compute_loss(y_batch, y_pred)
                acc = np.mean(np.argmax(y_pred, axis=1) == np.argmax(y_batch, axis=1))

                total_loss += loss * len(X_batch)
                total_acc += acc * len(X_batch)

                # Backward Pass
                grads_W, grads_b = self.backward(X_batch, y_batch)

                # Update Weights using the selected optimizer
                self.t += 1
                if self.optimizer == "sgd":
                    sgd_update(self.weights, self.biases, grads_W, grads_b, self.learning_rate)
                elif self.optimizer == "momentum":
                    momentum_update(self.weights, self.biases, grads_W, grads_b, self.learning_rate, self.velocity_W, self.velocity_b)
                elif self.optimizer == "nesterov":
                    nesterov_update(self.weights, self.biases, grads_W, grads_b, self.learning_rate, self.velocity_W, self.velocity_b)
                elif self.optimizer == "rmsprop":
                    rmsprop_update(self.weights, self.biases, grads_W, grads_b, self.learning_rate, self.velocity_W, self.velocity_b, self.beta, self.epsilon)
                elif self.optimizer == "adam":
                    adam_update(self.weights, self.biases, grads_W, grads_b, self.learning_rate, self.velocity_W, self.velocity_b, self.moment2_W, self.moment2_b, self.beta1, self.beta2, self.epsilon, self.t)
                elif self.optimizer == "nadam":
                    nadam_update(self.weights, self.biases, grads_W, grads_b, self.learning_rate, self.velocity_W, self.velocity_b, self.moment2_W, self.moment2_b, self.beta1, self.beta2, self.epsilon, self.t)

            # Compute average loss and accuracy for the epoch
            avg_loss = total_loss / num_samples
            avg_acc = total_acc / num_samples

            # Validation Metrics
            y_val_pred = self.forward(X_val)
            val_loss = self.compute_loss(y_val, y_val_pred)
            val_acc = np.mean(np.argmax(y_val_pred, axis=1) == np.argmax(y_val, axis=1))

            # Save Best Model
            global best_model_data
            if val_acc > best_model_data["best_accuracy"]:
                best_model_data = {
                    "weights": [W.copy() for W in self.weights],
                    "biases": [b.copy() for b in self.biases],
                    "params": {
                        "layers": self.layers,
                        "learning_rate": self.learning_rate,
                        "activation": self.activation,
                        "optimizer": self.optimizer,
                        "weight_init": self.weight_init,
                        "weight_decay": self.weight_decay,
                        "beta": self.beta,
                        "beta1": self.beta1,
                        "beta2": self.beta2,
                        "epsilon": self.epsilon,
                    },
                    "best_accuracy": val_acc
                }
            
            # Log to Weights & Biases
            wandb.log({
                "epoch": epoch + 1, 
                "loss": avg_loss, 
                "accuracy": avg_acc, 
                "val_loss": val_loss, 
                "val_accuracy": val_acc
            })
            
        # Log confusion matrix at the end of training
        # Get predictions using final model
        y_val_pred = self.forward(X_val)
        y_val_pred_classes = np.argmax(y_val_pred, axis=1)
        y_val_true_classes = np.argmax(y_val, axis=1)
        
        # Compute confusion matrix
        cm = confusion_matrix(y_val_true_classes, y_val_pred_classes)
        
        # Class names for Fashion-MNIST
        class_names = ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat',
                      'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot']
        
        # Log confusion matrix
        wandb.log({
            "confusion_matrix": wandb.plot.confusion_matrix(
                probs=None,
                y_true=y_val_true_classes,
                preds=y_val_pred_classes,
                class_names=class_names
            )
        })



In [None]:
def train_with_wandb():
    wandb.init(project="DLA1", entity="da24m014") # Ensure WandB is initialized before using config
    config = wandb.config
    run_name = f"hl_{config.hidden_layers}_bs_{config.batch_size}_ac_{config.activation}"
    wandb.run.name = run_name
    model = NeuralNetwork([784] + [config.hidden_size] * config.hidden_layers + [10],
                          learning_rate=config.learning_rate,
                          activation=config.activation,
                          optimizer=config.optimizer,
                          weight_init=config.weight_init,
                          weight_decay=config.weight_decay,
                          )
    model.train(x_train, y_train, x_test, y_test, config.epochs, config.batch_size)
    # wandb.finish()
wandb.agent(sweep_id, function=train_with_wandb)


[34m[1mwandb[0m: Agent Starting Run: zf0mkg6x with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_layers: 3
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: random


0,1
accuracy,▁▂▃▃▄▄▅▆▇█
epoch,▁▂▃▃▄▅▆▆▇█
loss,█▇▇▆▅▄▄▃▂▁
val_accuracy,▁▂▂▃▃▄▅▆▇█
val_loss,█▇▇▆▅▅▄▃▂▁

0,1
accuracy,0.36902
epoch,10.0
loss,2.23328
val_accuracy,0.3839
val_loss,2.22736


[34m[1mwandb[0m: Agent Starting Run: nh1vngp3 with config:
[34m[1mwandb[0m: 	activation: relu
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_layers: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: random


0,1
accuracy,▁▃▁▂▄▆▆▆▇█
epoch,▁▂▃▃▄▅▆▆▇█
loss,██▇▇▆▆▅▄▃▁
val_accuracy,▁▂▁▁▄▅▆▇▇█
val_loss,██▇▇▇▆▅▄▃▁

0,1
accuracy,0.33402
epoch,10.0
loss,2.2923
val_accuracy,0.3363
val_loss,2.29057


[34m[1mwandb[0m: Agent Starting Run: js7yvqhh with config:
[34m[1mwandb[0m: 	activation: relu
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_layers: 4
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	optimizer: nesterov
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: xavier


0,1
accuracy,▁▂▄▇█
epoch,▁▃▅▆█
loss,█▇▅▃▁
val_accuracy,▁▂▆██
val_loss,█▆▄▂▁

0,1
accuracy,0.67606
epoch,5.0
loss,0.97796
val_accuracy,0.6869
val_loss,0.89747


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: krm9695f with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_layers: 4
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: random


0,1
accuracy,▁▆▆▇█
epoch,▁▃▅▆█
loss,█▇▄▂▁
val_accuracy,▁▄▄▇█
val_loss,█▆▄▂▁

0,1
accuracy,0.58587
epoch,5.0
loss,1.33201
val_accuracy,0.6037
val_loss,1.26903


[34m[1mwandb[0m: Agent Starting Run: ue5qt47w with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_layers: 3
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: xavier


0,1
accuracy,▁▆▇▇██████
epoch,▁▂▃▃▄▅▆▆▇█
loss,█▂▂▁▁▁▁▁▁▁
val_accuracy,▁▄▅▆▆██▇██
val_loss,█▅▄▂▂▁▂▄▃▄

0,1
accuracy,0.87433
epoch,10.0
loss,0.39339
val_accuracy,0.8587
val_loss,0.47412


[34m[1mwandb[0m: Agent Starting Run: ba45pnpz with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_layers: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: xavier


0,1
accuracy,▁▅▇██
epoch,▁▃▅▆█
loss,█▄▂▂▁
val_accuracy,▁▆▆▄█
val_loss,█▃▄▅▁

0,1
accuracy,0.88139
epoch,5.0
loss,0.32229
val_accuracy,0.8729
val_loss,0.35032


[34m[1mwandb[0m: Agent Starting Run: 1wojwcyj with config:
[34m[1mwandb[0m: 	activation: relu
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_layers: 5
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: random


0,1
accuracy,▁▆▇▇▇▇████
epoch,▁▂▃▃▄▅▆▆▇█
loss,█▂▁▁▁▂▁▁▁▁
val_accuracy,▂▃▁▄▂▄▇█▄▆
val_loss,▃▄▄█▇▅▁▁▅▂

0,1
accuracy,0.85472
epoch,10.0
loss,0.54674
val_accuracy,0.8389
val_loss,0.60087


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 208nhvhv with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_layers: 3
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	optimizer: nesterov
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: xavier


0,1
accuracy,▁▃▆▇█
epoch,▁▃▅▆█
loss,██▇▅▁
val_accuracy,▄▁▆▆█
val_loss,██▇▅▁

0,1
accuracy,0.35144
epoch,5.0
loss,2.05014
val_accuracy,0.4028
val_loss,1.91693


[34m[1mwandb[0m: Agent Starting Run: d1ctt9zq with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_layers: 3
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: random


In [None]:
# import wandb
# from sklearn.metrics import confusion_matrix

# # Define the sweep configuration
# sweep_config = {
#     "method": "grid",
#     "metric": {"name": "step", "goal": "maximize"},
#     "parameters": {"step": {"values": [1]}}
# }

# # Initialize the W&B sweep
# sweep_id = wandb.sweep(sweep_config, project="DLA1")

# # Function to load best model
# def load_best_model(model):
#     """Load stored model weights & biases."""
#     global best_model_data
#     if not best_model_data:
#         raise ValueError("❌ Error: best_model_data is None. Ensure the model was saved correctly.")
    
#     if best_model_data["weights"] is not None and best_model_data["biases"] is not None:
#         model.weights = [W.copy() for W in best_model_data["weights"]]
#         model.biases = [b.copy() for b in best_model_data["biases"]]
#         print("✅ Best model loaded!")

# # Function to compute and log confusion matrix
# def log_confusion_matrix(model, X_test, y_test, step):
#     """Compute & log confusion matrix at different steps."""
#     wandb.init(project="DLA1", name=f"confusion_matrix_step_{step}", reinit=True)

#     # Load best model
#     load_best_model(model)

#     # Forward pass to get predictions
#     y_pred = model.forward(X_test)
#     y_pred_classes = np.argmax(y_pred, axis=1)
#     y_true_classes = np.argmax(y_test, axis=1)

#     # Compute confusion matrix
#     cm = confusion_matrix(y_true_classes, y_pred_classes)

#     # Log confusion matrix & step
#     wandb.log({
#         "step": step,
#         "confusion_matrix": wandb.plot.confusion_matrix(
#             probs=None,
#             y_true=y_true_classes,
#             preds=y_pred_classes,
#             class_names=[str(i) for i in range(cm.shape[0])]
#         )
#     })

#     print(f"✅ Confusion matrix logged for step {step}!")
#     wandb.finish()

# # Function to run the sweep
# def confusion_matrix_sweep():
#     wandb.init()
#     step = wandb.config.step  # Get step from sweep config

#     global best_model_data
#     if best_model_data is None:
#         raise ValueError("❌ Error: best_model_data is None. Load the model before running the sweep.")

#     print(f"🔍 Debug: best_model_data = {best_model_data}")  # Debugging print

#     # Step 1: Retrieve architecture & training parameters
#     params = best_model_data.get("params", {})
    
#     if not params:
#         raise ValueError("❌ Error: No 'params' found in best_model_data.")

#     layers = params.get("layers", None)
#     learning_rate = params["learning_rate"]
#     activation = params["activation"]
#     optimizer = params["optimizer"]
#     weight_init = params["weight_init"]
#     weight_decay = params["weight_decay"]
#     beta = params["beta"]
#     beta1 = params["beta1"]
#     beta2 = params["beta2"]
#     epsilon = params["epsilon"]

#     # Step 2: Recreate the model
#     model = NeuralNetwork(layers, learning_rate, activation, optimizer, weight_init, weight_decay, beta, beta1, beta2, epsilon)

#     # Step 3: Log confusion matrix
#     log_confusion_matrix(model, x_test, y_test, step)

# # Start the W&B sweep agent
# wandb.agent(sweep_id, function=confusion_matrix_sweep, count=5)


Create sweep with ID: u41u1yle
Sweep URL: https://wandb.ai/da24m014-iit-madras/DLA1/sweeps/u41u1yle


[34m[1mwandb[0m: Agent Starting Run: h2na2flp with config:
[34m[1mwandb[0m: 	step: 1


🔍 Debug: best_model_data = {'weights': [array([[-0.05788462,  0.08496019, -0.05119562, ...,  0.0701345 ,
         0.01479301,  0.04767529],
       [ 0.06669954,  0.01728071, -0.0856756 , ...,  0.02924542,
         0.08063394, -0.02059263],
       [ 0.050122  , -0.02708433, -0.06578396, ...,  0.09756137,
        -0.04458298, -0.09899987],
       ...,
       [ 0.08508743,  0.03929655,  0.01109719, ..., -0.06215915,
         0.0931534 , -0.02353104],
       [ 0.04418404, -0.07261056, -0.00472867, ...,  0.04211466,
        -0.06445477,  0.07507458],
       [-0.03740737, -0.06406585, -0.00175301, ..., -0.02057951,
        -0.04539039,  0.01478242]]), array([[-0.00771372, -0.04418907,  0.06296483, ...,  0.09870522,
        -0.027208  , -0.0299819 ],
       [-0.0623371 , -0.07893196, -0.09093062, ...,  0.06639135,
        -0.01731383, -0.03968825],
       [-0.07107343,  0.0255112 ,  0.02029273, ..., -0.06438042,
        -0.02434645, -0.01852116],
       ...,
       [ 0.08711777, -0.10203015, 

✅ Confusion matrix logged for step 1!


0,1
step,▁

0,1
step,1


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Sweep Agent: Exiting.


In [None]:
# def load_best_model(model):
#     """Load the best stored model parameters from global variables into a given model."""
#     global best_model_data
#     if best_model_data["weights"] is not None and best_model_data["biases"] is not None:
#         model.weights = [W.copy() for W in best_model_data["weights"]]
#         model.biases = [b.copy() for b in best_model_data["biases"]]
#         print("✅ Best model loaded!")
#     else:
#         print("⚠️ No best model found. Train the model first.")


In [None]:
# from sklearn.metrics import confusion_matrix

# def log_confusion_matrix(model, X_test, y_test):
#     """
#     Uses the best stored model to compute predictions, create a confusion matrix, 
#     and log it in Weights & Biases.
#     """
#     # Initialize a separate WandB run for logging confusion matrix
#     wandb.init(project="DLA1", name="confusion_matrix_run", reinit=True)

#     # Load best model (from stored variables)
#     load_best_model(model)

#     # Forward pass to get predictions
#     y_pred = model.forward(X_test)
#     y_pred_classes = np.argmax(y_pred, axis=1)
#     y_true_classes = np.argmax(y_test, axis=1)

#     # Compute confusion matrix
#     cm = confusion_matrix(y_true_classes, y_pred_classes)

#     # Log confusion matrix to WandB
#     wandb.log({"confusion_matrix": wandb.plot.confusion_matrix(
#         probs=None,
#         y_true=y_true_classes,
#         preds=y_pred_classes,
#         class_names=[str(i) for i in range(cm.shape[0])]
#     )})

#     print("✅ Confusion matrix logged in WandB!")
    
#     # Finish the WandB run
#     wandb.finish()

In [None]:
# # Step 1: Retrieve the architecture and training parameters from saved model
# layers = best_model_data["params"]["layers"] if "layers" in best_model_data["params"] else None
# learning_rate = best_model_data["params"]["learning_rate"]
# activation = best_model_data["params"]["activation"]
# optimizer = best_model_data["params"]["optimizer"]
# weight_init = best_model_data["params"]["weight_init"]
# weight_decay = best_model_data["params"]["weight_decay"]
# beta = best_model_data["params"]["beta"]
# beta1 = best_model_data["params"]["beta1"]
# beta2 = best_model_data["params"]["beta2"]
# epsilon = best_model_data["params"]["epsilon"]

# # Step 2: Recreate the NeuralNetwork model using the saved architecture
# model = NeuralNetwork(layers, learning_rate, activation, optimizer, weight_init, weight_decay, beta, beta1, beta2, epsilon)

# # Step 3: Load the saved weights and biases into the new model
# def load_best_model(model):
#     global best_model_data
#     if best_model_data["weights"] is not None and best_model_data["biases"] is not None:
#         model.weights = [W.copy() for W in best_model_data["weights"]]
#         model.biases = [b.copy() for b in best_model_data["biases"]]
#         print("✅ Best model loaded!")

# # Load the best model
# load_best_model(model)

# # Step 4: Compute Confusion Matrix and Log Results
# log_confusion_matrix(model, x_test, y_test)


✅ Confusion matrix logged in WandB!
