<a href="https://colab.research.google.com/github/SaiRajesh228/DeepLearningAssignment1/blob/main/DA6401_ASSIGNMENT1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import wandb
from keras.datasets import fashion_mnist

# -----------------------
# Activation functions and their derivatives
def linear(z):
    """Rectified linear unit: element-wise ``max(0, z)``.

    Note: despite its name this is the ReLU activation, not the identity.
    """
    rectified = np.maximum(0, z)
    return rectified
def linear_grad(z):
    """Derivative of ReLU: 1.0 where ``z > 0`` and 0.0 elsewhere (including at 0)."""
    positive = z > 0
    # Multiplying the boolean mask by 1.0 converts it to floats.
    return positive * 1.0
def logistic(z):
    """Logistic sigmoid ``1 / (1 + exp(-z))``, evaluated without overflow.

    The naive formula computes ``exp(-z)``, which overflows to ``inf`` (and
    emits a RuntimeWarning) for large negative ``z``.  Using the identity
    ``sigmoid(z) = exp(-log(1 + exp(-z))) = exp(-logaddexp(0, -z))`` keeps
    every intermediate value finite while giving the same result.

    Args:
        z: NumPy array (or NumPy scalar) of pre-activations.

    Returns:
        Element-wise sigmoid of ``z`` with values in ``[0, 1]``.
    """
    return np.exp(-np.logaddexp(0, -z))
def logistic_grad(z):
    """Derivative of the sigmoid at ``z``: ``sigma(z) * (1 - sigma(z))``."""
    sig = logistic(z)
    complement = 1 - sig
    return sig * complement
def hyperbolic(z):
    """Hyperbolic tangent activation, applied element-wise."""
    squashed = np.tanh(z)
    return squashed
def hyperbolic_grad(z):
    """Derivative of tanh at ``z``: ``1 - tanh(z)^2``."""
    t = np.tanh(z)
    return 1 - t**2

# Lookup table: activation name -> (forward function, derivative function).
neural_ops = dict(
    relu=(linear, linear_grad),
    sigmoid=(logistic, logistic_grad),
    tanh=(hyperbolic, hyperbolic_grad),
)

# -----------------------
# Deep Neural Network Class
class DeepNeuralNet:
    """Fully connected feed-forward classifier with a softmax output layer.

    Hidden layers all share one activation (looked up by name in the
    module-level ``neural_ops`` table); the last layer always applies softmax.

    Attributes:
        layer_count: number of weight layers (hidden layers + output layer).
        activation: name of the hidden-layer activation.
        weights: list of ``(fan_in, fan_out)`` weight matrices, one per layer.
        biases: list of ``(1, fan_out)`` bias rows, one per layer.
        z_records / a_records: pre-activations and activations cached by the
            most recent ``predict`` call; ``compute_gradients`` reads them.
    """

    def __init__(self, input_dim, hidden_dims, output_dim, activation="relu", init_scheme="Xavier"):
        """Create and initialise the layer parameters.

        Args:
            input_dim: number of input features.
            hidden_dims: sequence with the width of each hidden layer
                (any sequence is accepted, e.g. list or tuple; may be empty).
            output_dim: number of output classes.
            activation: hidden activation name ("relu", "sigmoid" or "tanh").
            init_scheme: "Xavier" for variance-scaled initialisation; any
                other value draws weights from N(0, 0.01^2).
        """
        # Copy into a list so tuples (or other sequences) can be concatenated.
        hidden_dims = list(hidden_dims)
        self.layer_count = len(hidden_dims) + 1
        self.activation = activation
        self.weights = []
        self.biases = []

        dims = [input_dim] + hidden_dims + [output_dim]
        for fan_in, fan_out in zip(dims[:-1], dims[1:]):
            if init_scheme == "Xavier":
                if activation in ("tanh", "sigmoid"):
                    # Glorot/Xavier scaling for saturating activations.
                    scale = np.sqrt(2. / (fan_in + fan_out))
                else:
                    # Fan-in (He-style) scaling, used for ReLU.
                    scale = np.sqrt(2. / fan_in)
                W = np.random.randn(fan_in, fan_out) * scale
            else:
                W = np.random.randn(fan_in, fan_out) * 0.01
            self.weights.append(W)
            self.biases.append(np.zeros((1, fan_out)))

    def predict(self, X):
        """Run a forward pass and cache the intermediates for backprop.

        Args:
            X: input batch, one sample per row.

        Returns:
            Softmax class probabilities, one row per sample.
        """
        activate, _ = neural_ops[self.activation]
        self.z_records = []
        self.a_records = [X]
        A = X
        last = self.layer_count - 1
        for idx in range(self.layer_count):
            Z = A.dot(self.weights[idx]) + self.biases[idx]
            self.z_records.append(Z)
            if idx == last:
                # Subtract the row-wise max before exponentiating so the
                # softmax cannot overflow.
                shifted = Z - np.max(Z, axis=1, keepdims=True)
                exp = np.exp(shifted)
                A = exp / np.sum(exp, axis=1, keepdims=True)
            else:
                A = activate(Z)
            self.a_records.append(A)
        return A

    def calculate_cost(self, Y_hat, Y_real, cost_type="cross_entropy"):
        """Return the mean loss over the batch.

        Args:
            Y_hat: predicted probabilities, one row per sample.
            Y_real: one-hot targets with the same shape as ``Y_hat``.
            cost_type: "cross_entropy" or "mean_squared_error".

        Raises:
            ValueError: if ``cost_type`` is not one of the supported names.
        """
        m = Y_real.shape[0]
        if cost_type == "cross_entropy":
            # The small constant guards against log(0).
            return -np.sum(Y_real * np.log(Y_hat + 1e-8)) / m
        if cost_type == "mean_squared_error":
            return np.sum((Y_real - Y_hat)**2) / (2 * m)
        raise ValueError(f"Unknown cost_type: {cost_type!r}")

    def compute_gradients(self, X, Y, cost_type="cross_entropy"):
        """Backpropagate through the activations cached by ``predict``.

        ``predict`` must have been called on the same batch beforehand.

        Args:
            X: input batch (only its row count is used here).
            Y: one-hot targets for the batch.
            cost_type: "cross_entropy" or "mean_squared_error".

        Returns:
            ``(grad_weights, grad_biases)``: lists aligned with
            ``self.weights`` / ``self.biases``, averaged over the batch.

        Raises:
            ValueError: if ``cost_type`` is not one of the supported names.
        """
        m = X.shape[0]
        grad_weights = [None] * self.layer_count
        grad_biases = [None] * self.layer_count

        final_act = self.a_records[-1]
        error = final_act - Y
        if cost_type == "cross_entropy":
            # Softmax + cross-entropy collapses to (prediction - target).
            delta = error
        elif cost_type == "mean_squared_error":
            # Chain the squared-error gradient through the softmax Jacobian:
            # dL/dz_i = a_i * (g_i - sum_j g_j a_j) with g = a - y.
            delta = final_act * (error - np.sum(error * final_act, axis=1, keepdims=True))
        else:
            raise ValueError(f"Unknown cost_type: {cost_type!r}")

        for idx in reversed(range(self.layer_count)):
            if idx == self.layer_count - 1:
                dZ = delta
            else:
                _, grad_func = neural_ops[self.activation]
                dZ = delta * grad_func(self.z_records[idx])
            prev_act = self.a_records[idx]
            grad_weights[idx] = prev_act.T.dot(dZ) / m
            grad_biases[idx] = np.sum(dZ, axis=0, keepdims=True) / m
            if idx > 0:
                # Propagate the error signal to the previous layer's output.
                delta = dZ.dot(self.weights[idx].T)
        return grad_weights, grad_biases

    def _ensure_state(self, states, name, buffers, with_step=False):
        """Return ``states[name]``, creating zeroed per-layer buffers on first use."""
        if name not in states:
            entry = {}
            for prefix in buffers:
                entry[prefix + "_w"] = [np.zeros_like(w) for w in self.weights]
                entry[prefix + "_b"] = [np.zeros_like(b) for b in self.biases]
            if with_step:
                entry["step"] = 0
            states[name] = entry
        return states[name]

    def adjust_params(self, grad_w, grad_b, optim, settings, states):
        """Apply one optimizer step to the weights and biases in place.

        Args:
            grad_w: per-layer weight gradients.
            grad_b: per-layer bias gradients.
            optim: "sgd", "momentum", "nesterov", "rmsprop", "adam" or "nadam".
            settings: object exposing ``learning_rate`` and, optionally,
                ``momentum`` (momentum/nesterov), ``beta`` (rmsprop decay),
                ``beta1`` / ``beta2`` (adam/nadam) and ``eps``.  Missing
                optional attributes fall back to the built-in defaults.
            states: dict holding optimizer buffers between calls; it is
                mutated and returned.

        Returns:
            The (updated) ``states`` dict.

        Raises:
            ValueError: if ``optim`` is not a supported optimizer name.
        """
        lr = settings.learning_rate
        layers = range(self.layer_count)
        # Weights and biases follow identical update rules, so handle both
        # through one loop; the tag selects the matching state buffer.
        groups = ((self.weights, grad_w, "w"), (self.biases, grad_b, "b"))

        if optim == "sgd":
            for params, grads, _ in groups:
                for i in layers:
                    params[i] -= lr * grads[i]

        elif optim == "momentum":
            mu = getattr(settings, 'momentum', 0.5)
            st = self._ensure_state(states, "momentum", ("v",))
            for params, grads, tag in groups:
                velocity = st["v_" + tag]
                for i in layers:
                    velocity[i] = mu * velocity[i] + grads[i]
                    params[i] -= lr * velocity[i]

        elif optim == "nesterov":
            mu = getattr(settings, 'momentum', 0.9)
            st = self._ensure_state(states, "nesterov", ("v",))
            for params, grads, tag in groups:
                velocity = st["v_" + tag]
                for i in layers:
                    velocity[i] = mu * velocity[i] + grads[i]
                    # Look-ahead step: the updated velocity plus the raw gradient.
                    params[i] -= lr * (mu * velocity[i] + grads[i])

        elif optim == "rmsprop":
            gamma = getattr(settings, 'beta', 0.9)
            eps = getattr(settings, 'eps', 1e-8)
            st = self._ensure_state(states, "rmsprop", ("cache",))
            for params, grads, tag in groups:
                cache = st["cache_" + tag]
                for i in layers:
                    cache[i] = gamma * cache[i] + (1 - gamma) * (grads[i]**2)
                    params[i] -= lr * grads[i] / (np.sqrt(cache[i]) + eps)

        elif optim in ("adam", "nadam"):
            beta1 = getattr(settings, 'beta1', 0.9)
            beta2 = getattr(settings, 'beta2', 0.999)
            eps = getattr(settings, 'eps', 1e-8)
            st = self._ensure_state(states, optim, ("m", "v"), with_step=True)
            st["step"] += 1
            t = st["step"]
            # Bias-correction denominators for the first and second moments.
            corr1 = 1 - beta1**t
            corr2 = 1 - beta2**t
            for params, grads, tag in groups:
                first = st["m_" + tag]
                second = st["v_" + tag]
                for i in layers:
                    first[i] = beta1 * first[i] + (1 - beta1) * grads[i]
                    second[i] = beta2 * second[i] + (1 - beta2) * (grads[i]**2)
                    m_hat = first[i] / corr1
                    v_hat = second[i] / corr2
                    if optim == "nadam":
                        # Nesterov correction: blend in the current gradient.
                        m_hat = beta1 * m_hat + ((1 - beta1) * grads[i]) / corr1
                    params[i] -= lr * m_hat / (np.sqrt(v_hat) + eps)

        else:
            raise ValueError(f"Unknown optimizer: {optim!r}")

        return states

# -----------------------
# Helper utilities
def encode_labels(y, num_labels):
    """One-hot encode integer class labels.

    Returns a float array of shape ``(len(y), num_labels)`` whose row ``r``
    has a 1 in column ``y[r]`` and 0 everywhere else.
    """
    sample_count = len(y)
    one_hot = np.zeros((sample_count, num_labels))
    row_ids = np.arange(sample_count)
    one_hot[row_ids, y] = 1
    return one_hot

def get_accuracy(Y_est, Y_actual):
    """Fraction of rows whose arg-max class in ``Y_est`` matches ``Y_actual``.

    Both arguments are 2-D score/one-hot arrays with one row per sample.
    """
    hits = np.argmax(Y_est, axis=1) == np.argmax(Y_actual, axis=1)
    return np.mean(hits)

def log_confusion_matrix(Y_est, y_real, classes):
    """Plot a confusion-matrix heatmap and log it to the active W&B run.

    Args:
        Y_est: per-class scores, one row per sample; the arg-max of each row
            is taken as the predicted label.
        y_real: true integer labels.
        classes: tick labels used on both heatmap axes.
    """
    # Imported lazily so the plotting dependencies are only needed here.
    from sklearn.metrics import confusion_matrix
    import seaborn as sns

    predicted = np.argmax(Y_est, axis=1)
    counts = confusion_matrix(y_real, predicted)

    plt.figure(figsize=(9, 7))
    heatmap_options = dict(
        annot=True,
        fmt="d",
        cmap="Greens",
        xticklabels=classes,
        yticklabels=classes,
    )
    sns.heatmap(counts, **heatmap_options)
    plt.xlabel("Predicted Labels")
    plt.ylabel("True Labels")
    plt.title("Confusion Matrix")

    wandb.log({"Confusion Matrix": wandb.Image(plt)})
    # Close the figure so repeated calls do not accumulate open figures.
    plt.close()

# -----------------------
# Question 1: Sample Images
def log_q1_samples():
    """Log a 2x5 grid with the first training image of each Fashion-MNIST class.

    Loads the dataset, draws one grayscale image per class (titled with the
    class name) and logs the figure to the active W&B run.
    """
    (train_X, train_y), _ = fashion_mnist.load_data()
    class_names = ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat',
                   'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot']

    plt.figure(figsize=(10, 5))
    for label, title in enumerate(class_names):
        # Index of the first training sample carrying this label.
        first_match = np.where(train_y == label)[0][0]
        plt.subplot(2, 5, label + 1)
        plt.imshow(train_X[first_match], cmap='gray')
        plt.title(title)
        plt.axis('off')
    plt.tight_layout()

    wandb.log({"Question 1 Samples": wandb.Image(plt)})
    plt.close()

# -----------------------
# Training procedure
def execute_training():
    """Train and evaluate one model using the hyperparameters in ``wandb.config``.

    Intended as the function executed by a W&B sweep agent.  Steps:

    1. Start a run and read its config (``hiddennodes``, ``hiddenlayers``,
       ``activation_func``, ``initializer``, ``num_epochs``, ``batch_size``,
       ``loss``, ``opt``, ``learning_rate``).
    2. Load Fashion-MNIST, flatten and scale pixels to [0, 1], one-hot encode
       the labels and hold out the last 10% of the training set for validation.
    3. Train with shuffled mini-batches and element-wise gradient clipping,
       logging the mean training loss and validation accuracy every epoch.
    4. Log the test accuracy and a confusion matrix.

    Every training sample is visited once per epoch: the trailing partial
    mini-batch is processed as well, and the epoch loss is a per-sample mean,
    so a ``batch_size`` larger than the training set no longer divides by zero.
    """
    wandb.init()
    cfg = wandb.config

    (train_X, train_y), (test_X, test_y) = fashion_mnist.load_data()
    train_X = train_X.reshape(train_X.shape[0], -1) / 255.0
    test_X = test_X.reshape(test_X.shape[0], -1) / 255.0
    num_classes = 10
    train_y_oh = encode_labels(train_y, num_classes)
    test_y_oh = encode_labels(test_y, num_classes)

    # Hold out the last 10% of the training data for validation.
    val_split = int(0.9 * train_X.shape[0])
    val_X, val_y_oh = train_X[val_split:], train_y_oh[val_split:]
    train_X, train_y_oh = train_X[:val_split], train_y_oh[:val_split]

    input_dim = train_X.shape[1]
    hidden_arch = [cfg.hiddennodes] * cfg.hiddenlayers
    model = DeepNeuralNet(input_dim, hidden_arch, num_classes,
                          activation=cfg.activation_func, init_scheme=cfg.initializer)

    optimizer_states = {}
    grad_clip_value = 1.0
    num_train = train_X.shape[0]
    batch_size = cfg.batch_size

    for epoch in range(cfg.num_epochs):
        # Shuffle by index; gathering rows per batch avoids copying the whole
        # training matrix every epoch.
        order = np.random.permutation(num_train)
        epoch_loss = 0.0

        for start in range(0, num_train, batch_size):
            batch_idx = order[start:start + batch_size]
            X_batch = train_X[batch_idx]
            y_batch = train_y_oh[batch_idx]

            outputs = model.predict(X_batch)
            loss = model.calculate_cost(outputs, y_batch, cost_type=cfg.loss)
            # Weight by batch size so a short final batch is averaged correctly.
            epoch_loss += loss * len(batch_idx)

            grad_w, grad_b = model.compute_gradients(X_batch, y_batch, cost_type=cfg.loss)

            # Element-wise clipping keeps individual gradient entries bounded.
            grad_w = [np.clip(g, -grad_clip_value, grad_clip_value) for g in grad_w]
            grad_b = [np.clip(g, -grad_clip_value, grad_clip_value) for g in grad_b]

            optimizer_states = model.adjust_params(grad_w, grad_b, cfg.opt, cfg, optimizer_states)

        avg_loss = epoch_loss / num_train
        val_outputs = model.predict(val_X)
        val_acc = get_accuracy(val_outputs, val_y_oh)
        wandb.log({"epoch": epoch+1, "loss": avg_loss, "val_accuracy": val_acc})

    test_outputs = model.predict(test_X)
    final_acc = get_accuracy(test_outputs, test_y_oh)
    wandb.log({"test_accuracy": final_acc})
    log_confusion_matrix(test_outputs, test_y, [str(i) for i in range(num_classes)])

# -----------------------
# Sweep configuration: Bayesian search that maximizes validation accuracy
# W&B sweep definition: each entry under "parameters" lists the discrete
# values the search may choose from for that hyperparameter.
sweep_config = {
    "name": "karapa-rajesh",
    "method": "bayes",
    "metric": {"name": "val_accuracy", "goal": "maximize"},
    "parameters": {
        option: {"values": choices}
        for option, choices in (
            ("hiddenlayers", [3, 4, 5]),
            ("num_epochs", [10, 15]),
            ("hiddennodes", [128, 256]),
            ("learning_rate", [1e-3, 5e-4]),
            ("initializer", ["Xavier"]),
            ("batch_size", [64, 128]),
            ("opt", ["adam", "nesterov", "rmsprop", "momentum", "nadam"]),
            ("activation_func", ["relu"]),
            ("loss", ["cross_entropy"]),
        )
    },
}

# -----------------------
# Execution
if __name__ == "__main__":
    # Log Q1 samples
    # A dedicated run named "Q1_Samples" receives the sample-image figure and
    # is finished before the sweep is created.
    wandb.init(project="DeepLearning", name="Q1_Samples", job_type="logging")
    log_q1_samples()
    wandb.finish()

    # Run sweep
    # Register the sweep described by sweep_config in the same project, then
    # start an agent that calls execute_training for each run (count=30).
    sweep_id = wandb.sweep(sweep_config, project="DeepLearning")
    wandb.agent(sweep_id, function=execute_training, count=30)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mkarapa-rajesh[0m ([33mkarapa-rajesh-iit-madras[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-labels-idx1-ubyte.gz
[1m29515/29515[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-images-idx3-ubyte.gz
[1m26421880/26421880[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-labels-idx1-ubyte.gz
[1m5148/5148[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-images-idx3-ubyte.gz
[1m4422102/4422102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


Create sweep with ID: dzg25g50
Sweep URL: https://wandb.ai/karapa-rajesh-iit-madras/DeepLearning/sweeps/dzg25g50


[34m[1mwandb[0m: Agent Starting Run: m3eueaf6 with config:
[34m[1mwandb[0m: 	activation_func: relu
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	hiddenlayers: 5
[34m[1mwandb[0m: 	hiddennodes: 128
[34m[1mwandb[0m: 	initializer: Xavier
[34m[1mwandb[0m: 	learning_rate: 0.0005
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	num_epochs: 15
[34m[1mwandb[0m: 	opt: nadam


0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
loss,█▅▄▄▃▃▃▂▂▂▂▂▁▁▁
test_accuracy,▁
val_accuracy,▁▅▃▆▆▇▆▆▆█▇▅▇██

0,1
epoch,15.0
loss,0.19283
test_accuracy,0.8824
val_accuracy,0.88683


[34m[1mwandb[0m: Agent Starting Run: 6c0joq1m with config:
[34m[1mwandb[0m: 	activation_func: relu
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	hiddenlayers: 4
[34m[1mwandb[0m: 	hiddennodes: 256
[34m[1mwandb[0m: 	initializer: Xavier
[34m[1mwandb[0m: 	learning_rate: 0.0005
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	num_epochs: 15
[34m[1mwandb[0m: 	opt: momentum


0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
loss,█▄▃▂▂▂▂▁▁▁▁▁▁▁▁
test_accuracy,▁
val_accuracy,▁▄▅▆▆▇▇▇▇▇▇████

0,1
epoch,15.0
loss,0.44253
test_accuracy,0.835
val_accuracy,0.842


[34m[1mwandb[0m: Agent Starting Run: al4czcif with config:
[34m[1mwandb[0m: 	activation_func: relu
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	hiddenlayers: 4
[34m[1mwandb[0m: 	hiddennodes: 256
[34m[1mwandb[0m: 	initializer: Xavier
[34m[1mwandb[0m: 	learning_rate: 0.0005
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	num_epochs: 10
[34m[1mwandb[0m: 	opt: momentum


0,1
epoch,▁▂▃▃▄▅▆▆▇█
loss,█▃▂▂▂▁▁▁▁▁
test_accuracy,▁
val_accuracy,▁▄▆▆▇▇▇███

0,1
epoch,10.0
loss,0.47825
test_accuracy,0.8234
val_accuracy,0.833


[34m[1mwandb[0m: Agent Starting Run: 2bmkmaiq with config:
[34m[1mwandb[0m: 	activation_func: relu
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	hiddenlayers: 5
[34m[1mwandb[0m: 	hiddennodes: 128
[34m[1mwandb[0m: 	initializer: Xavier
[34m[1mwandb[0m: 	learning_rate: 0.0005
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	num_epochs: 15
[34m[1mwandb[0m: 	opt: nadam


0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
loss,█▅▄▃▃▃▂▂▂▂▂▁▁▁▁
test_accuracy,▁
val_accuracy,▁▃▆▅▆▆▆▇▇▇▆▇█▇▆

0,1
epoch,15.0
loss,0.18945
test_accuracy,0.8817
val_accuracy,0.88217


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: lofc0gl6 with config:
[34m[1mwandb[0m: 	activation_func: relu
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	hiddenlayers: 5
[34m[1mwandb[0m: 	hiddennodes: 128
[34m[1mwandb[0m: 	initializer: Xavier
[34m[1mwandb[0m: 	learning_rate: 0.0005
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	num_epochs: 15
[34m[1mwandb[0m: 	opt: nadam


0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
loss,█▅▄▄▃▃▃▂▂▂▂▂▁▁▁
test_accuracy,▁
val_accuracy,▁▃▄▅▆▆▆▇▇█▇▇▇▆█

0,1
epoch,15.0
loss,0.19035
test_accuracy,0.8812
val_accuracy,0.892


[34m[1mwandb[0m: Agent Starting Run: mjc3ulsj with config:
[34m[1mwandb[0m: 	activation_func: relu
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	hiddenlayers: 5
[34m[1mwandb[0m: 	hiddennodes: 128
[34m[1mwandb[0m: 	initializer: Xavier
[34m[1mwandb[0m: 	learning_rate: 0.0005
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	num_epochs: 15
[34m[1mwandb[0m: 	opt: nadam


0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
loss,█▅▄▃▃▃▃▂▂▂▂▂▁▁▁
test_accuracy,▁
val_accuracy,▁▃▂▆▆▅▇▇▇███▇█▇

0,1
epoch,15.0
loss,0.1885
test_accuracy,0.8804
val_accuracy,0.88233


[34m[1mwandb[0m: Agent Starting Run: miadaavt with config:
[34m[1mwandb[0m: 	activation_func: relu
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	hiddenlayers: 5
[34m[1mwandb[0m: 	hiddennodes: 128
[34m[1mwandb[0m: 	initializer: Xavier
[34m[1mwandb[0m: 	learning_rate: 0.0005
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	num_epochs: 15
[34m[1mwandb[0m: 	opt: momentum


0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
loss,█▅▃▃▂▂▂▂▁▁▁▁▁▁▁
test_accuracy,▁
val_accuracy,▁▄▅▆▆▇▇▇▇▇█████

0,1
epoch,15.0
loss,0.52682
test_accuracy,0.8081
val_accuracy,0.81317


[34m[1mwandb[0m: Agent Starting Run: hvfrssrk with config:
[34m[1mwandb[0m: 	activation_func: relu
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	hiddenlayers: 3
[34m[1mwandb[0m: 	hiddennodes: 128
[34m[1mwandb[0m: 	initializer: Xavier
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	num_epochs: 15
[34m[1mwandb[0m: 	opt: momentum


0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
loss,█▄▃▂▂▂▂▁▁▁▁▁▁▁▁
test_accuracy,▁
val_accuracy,▁▄▅▆▆▇▇▇▇▇█████

0,1
epoch,15.0
loss,0.48894
test_accuracy,0.8186
val_accuracy,0.83


[34m[1mwandb[0m: Agent Starting Run: pbb91o3h with config:
[34m[1mwandb[0m: 	activation_func: relu
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	hiddenlayers: 3
[34m[1mwandb[0m: 	hiddennodes: 256
[34m[1mwandb[0m: 	initializer: Xavier
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	num_epochs: 15
[34m[1mwandb[0m: 	opt: adam


0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
loss,█▅▄▄▃▃▃▂▂▂▂▁▁▁▁
test_accuracy,▁
val_accuracy,▁▄▄▅▃▆▇▆▇▆▇▇█▆█

0,1
epoch,15.0
loss,0.18771
test_accuracy,0.8873
val_accuracy,0.8945


[34m[1mwandb[0m: Agent Starting Run: uanvnm5j with config:
[34m[1mwandb[0m: 	activation_func: relu
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	hiddenlayers: 5
[34m[1mwandb[0m: 	hiddennodes: 256
[34m[1mwandb[0m: 	initializer: Xavier
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	num_epochs: 10
[34m[1mwandb[0m: 	opt: rmsprop


0,1
epoch,▁▂▃▃▄▅▆▆▇█
loss,█▃▂▂▂▂▁▁▁▁
test_accuracy,▁
val_accuracy,▁▅▄█▇▅▇█▆▆

0,1
epoch,10.0
loss,0.34148
test_accuracy,0.8553
val_accuracy,0.86367


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: jse0ev9q with config:
[34m[1mwandb[0m: 	activation_func: relu
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	hiddenlayers: 5
[34m[1mwandb[0m: 	hiddennodes: 256
[34m[1mwandb[0m: 	initializer: Xavier
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	num_epochs: 15
[34m[1mwandb[0m: 	opt: nadam


0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
loss,█▅▄▄▃▃▃▂▂▂▂▂▁▁▁
test_accuracy,▁
val_accuracy,▁▁▅▅▆▇▆▆▆▇▇█▇██

0,1
epoch,15.0
loss,0.19517
test_accuracy,0.8862
val_accuracy,0.8895


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 9wshp958 with config:
[34m[1mwandb[0m: 	activation_func: relu
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	hiddenlayers: 3
[34m[1mwandb[0m: 	hiddennodes: 128
[34m[1mwandb[0m: 	initializer: Xavier
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	num_epochs: 10
[34m[1mwandb[0m: 	opt: nadam


0,1
epoch,▁▂▃▃▄▅▆▆▇█
loss,█▄▄▃▂▂▂▁▁▁
test_accuracy,▁
val_accuracy,▁▅▄▅▅▇▅██▇

0,1
epoch,10.0
loss,0.22979
test_accuracy,0.8795
val_accuracy,0.885


[34m[1mwandb[0m: Agent Starting Run: t4ef1ric with config:
[34m[1mwandb[0m: 	activation_func: relu
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	hiddenlayers: 3
[34m[1mwandb[0m: 	hiddennodes: 128
[34m[1mwandb[0m: 	initializer: Xavier
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	num_epochs: 10
[34m[1mwandb[0m: 	opt: adam


0,1
epoch,▁▂▃▃▄▅▆▆▇█
loss,█▄▃▃▂▂▂▁▁▁
test_accuracy,▁
val_accuracy,▁▄▆▅▆▆▇█▆▇

0,1
epoch,10.0
loss,0.23798
test_accuracy,0.875
val_accuracy,0.88133


[34m[1mwandb[0m: Agent Starting Run: soh3sfjf with config:
[34m[1mwandb[0m: 	activation_func: relu
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	hiddenlayers: 5
[34m[1mwandb[0m: 	hiddennodes: 128
[34m[1mwandb[0m: 	initializer: Xavier
[34m[1mwandb[0m: 	learning_rate: 0.0005
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	num_epochs: 10
[34m[1mwandb[0m: 	opt: nadam


0,1
epoch,▁▂▃▃▄▅▆▆▇█
loss,█▅▄▃▃▂▂▂▁▁
test_accuracy,▁
val_accuracy,▁▂▆▄▆▇▇██▇

0,1
epoch,10.0
loss,0.22757
test_accuracy,0.8788
val_accuracy,0.88533


[34m[1mwandb[0m: Agent Starting Run: 2awts7mj with config:
[34m[1mwandb[0m: 	activation_func: relu
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	hiddenlayers: 4
[34m[1mwandb[0m: 	hiddennodes: 256
[34m[1mwandb[0m: 	initializer: Xavier
[34m[1mwandb[0m: 	learning_rate: 0.0005
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	num_epochs: 10
[34m[1mwandb[0m: 	opt: nesterov


0,1
epoch,▁▂▃▃▄▅▆▆▇█
loss,█▃▂▂▂▁▁▁▁▁
test_accuracy,▁
val_accuracy,▁▄▅▆▆▇▇███

0,1
epoch,10.0
loss,0.40549
test_accuracy,0.8438
val_accuracy,0.85167


[34m[1mwandb[0m: Agent Starting Run: 5lz8p2gy with config:
[34m[1mwandb[0m: 	activation_func: relu
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	hiddenlayers: 3
[34m[1mwandb[0m: 	hiddennodes: 128
[34m[1mwandb[0m: 	initializer: Xavier
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	num_epochs: 15
[34m[1mwandb[0m: 	opt: rmsprop


0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
loss,█▄▃▃▂▂▂▂▁▁▁▁▁▁▁
test_accuracy,▁
val_accuracy,▁▆▆▇▇▇█▇▇█▇█▇██

0,1
epoch,15.0
loss,0.26672
test_accuracy,0.8781
val_accuracy,0.8825


[34m[1mwandb[0m: Agent Starting Run: 40l609sf with config:
[34m[1mwandb[0m: 	activation_func: relu
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	hiddenlayers: 3
[34m[1mwandb[0m: 	hiddennodes: 128
[34m[1mwandb[0m: 	initializer: Xavier
[34m[1mwandb[0m: 	learning_rate: 0.0005
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	num_epochs: 10
[34m[1mwandb[0m: 	opt: rmsprop


0,1
epoch,▁▂▃▃▄▅▆▆▇█
loss,█▄▃▃▂▂▂▁▁▁
test_accuracy,▁
val_accuracy,▁▃▆▆▆▆▇█▇█

0,1
epoch,10.0
loss,0.24586
test_accuracy,0.8773
val_accuracy,0.88283


[34m[1mwandb[0m: Agent Starting Run: usiffoqe with config:
[34m[1mwandb[0m: 	activation_func: relu
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	hiddenlayers: 5
[34m[1mwandb[0m: 	hiddennodes: 256
[34m[1mwandb[0m: 	initializer: Xavier
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	num_epochs: 15
[34m[1mwandb[0m: 	opt: adam


0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
loss,█▅▄▃▃▃▂▂▂▂▂▁▁▁▁
test_accuracy,▁
val_accuracy,▂▁▃▂▆▆▄▇▇█▆▇█▇█

0,1
epoch,15.0
loss,0.19905
test_accuracy,0.8922
val_accuracy,0.88717


[34m[1mwandb[0m: Agent Starting Run: 4uke6ixs with config:
[34m[1mwandb[0m: 	activation_func: relu
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	hiddenlayers: 3
[34m[1mwandb[0m: 	hiddennodes: 256
[34m[1mwandb[0m: 	initializer: Xavier
[34m[1mwandb[0m: 	learning_rate: 0.0005
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	num_epochs: 10
[34m[1mwandb[0m: 	opt: momentum


0,1
epoch,▁▂▃▃▄▅▆▆▇█
loss,█▄▃▂▂▂▁▁▁▁
test_accuracy,▁
val_accuracy,▁▃▅▅▆▇▇███

0,1
epoch,10.0
loss,0.60872
test_accuracy,0.7984
val_accuracy,0.81083


[34m[1mwandb[0m: Agent Starting Run: dom0u3k0 with config:
[34m[1mwandb[0m: 	activation_func: relu
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	hiddenlayers: 5
[34m[1mwandb[0m: 	hiddennodes: 128
[34m[1mwandb[0m: 	initializer: Xavier
[34m[1mwandb[0m: 	learning_rate: 0.0005
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	num_epochs: 10
[34m[1mwandb[0m: 	opt: adam


0,1
epoch,▁▂▃▃▄▅▆▆▇█
loss,█▄▃▃▂▂▂▁▁▁
test_accuracy,▁
val_accuracy,▁▃▆▇▇█▆█▇█

0,1
epoch,10.0
loss,0.24478
test_accuracy,0.8819
val_accuracy,0.88367


[34m[1mwandb[0m: Agent Starting Run: glr32eho with config:
[34m[1mwandb[0m: 	activation_func: relu
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	hiddenlayers: 5
[34m[1mwandb[0m: 	hiddennodes: 128
[34m[1mwandb[0m: 	initializer: Xavier
[34m[1mwandb[0m: 	learning_rate: 0.0005
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	num_epochs: 15
[34m[1mwandb[0m: 	opt: rmsprop


0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
loss,█▅▄▃▃▃▂▂▂▂▂▁▁▁▁
test_accuracy,▁
val_accuracy,▁▂▂▅▆▅▇▇▇▇▇█▇▇▇

0,1
epoch,15.0
loss,0.20826
test_accuracy,0.88
val_accuracy,0.88433


[34m[1mwandb[0m: Agent Starting Run: 6lb0xihg with config:
[34m[1mwandb[0m: 	activation_func: relu
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	hiddenlayers: 5
[34m[1mwandb[0m: 	hiddennodes: 128
[34m[1mwandb[0m: 	initializer: Xavier
[34m[1mwandb[0m: 	learning_rate: 0.0005
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	num_epochs: 10
[34m[1mwandb[0m: 	opt: nesterov


0,1
epoch,▁▂▃▃▄▅▆▆▇█
loss,█▃▂▂▂▁▁▁▁▁
test_accuracy,▁
val_accuracy,▁▄▆▇▇▇████

0,1
epoch,10.0
loss,0.40538
test_accuracy,0.8438
val_accuracy,0.85317


[34m[1mwandb[0m: Agent Starting Run: hnp6y9z7 with config:
[34m[1mwandb[0m: 	activation_func: relu
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	hiddenlayers: 5
[34m[1mwandb[0m: 	hiddennodes: 256
[34m[1mwandb[0m: 	initializer: Xavier
[34m[1mwandb[0m: 	learning_rate: 0.0005
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	num_epochs: 10
[34m[1mwandb[0m: 	opt: rmsprop


0,1
epoch,▁▂▃▃▄▅▆▆▇█
loss,█▄▃▂▁▁▁▁▁▁
test_accuracy,▁
val_accuracy,▁▄▆▄▆▆▇▇▃█

0,1
epoch,10.0
loss,0.2939
test_accuracy,0.8754
val_accuracy,0.88633


[34m[1mwandb[0m: Agent Starting Run: 4oxyeawu with config:
[34m[1mwandb[0m: 	activation_func: relu
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	hiddenlayers: 3
[34m[1mwandb[0m: 	hiddennodes: 256
[34m[1mwandb[0m: 	initializer: Xavier
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	num_epochs: 10
[34m[1mwandb[0m: 	opt: nadam


0,1
epoch,▁▂▃▃▄▅▆▆▇█
loss,█▅▄▃▃▂▂▂▁▁
test_accuracy,▁
val_accuracy,▁▅▅▅▃▇▇▇██

0,1
epoch,10.0
loss,0.21903
test_accuracy,0.885
val_accuracy,0.89067


[34m[1mwandb[0m: Agent Starting Run: 0v9y2l9z with config:
[34m[1mwandb[0m: 	activation_func: relu
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	hiddenlayers: 3
[34m[1mwandb[0m: 	hiddennodes: 256
[34m[1mwandb[0m: 	initializer: Xavier
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	num_epochs: 15
[34m[1mwandb[0m: 	opt: nadam


0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
loss,█▅▄▄▃▃▃▂▂▂▂▂▁▁▁
test_accuracy,▁
val_accuracy,▁▅▅▆▅▇▆▇███▇█▇█

0,1
epoch,15.0
loss,0.16974
test_accuracy,0.8918
val_accuracy,0.891


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: brylqe0d with config:
[34m[1mwandb[0m: 	activation_func: relu
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	hiddenlayers: 5
[34m[1mwandb[0m: 	hiddennodes: 128
[34m[1mwandb[0m: 	initializer: Xavier
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	num_epochs: 15
[34m[1mwandb[0m: 	opt: nadam


0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
loss,█▅▄▄▃▃▂▂▂▂▂▁▁▁▁
test_accuracy,▁
val_accuracy,▁▄▅▆▆▇▆▆▇▇▇▇███

0,1
epoch,15.0
loss,0.19527
test_accuracy,0.8813
val_accuracy,0.88867


[34m[1mwandb[0m: Agent Starting Run: qnf3fu95 with config:
[34m[1mwandb[0m: 	activation_func: relu
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	hiddenlayers: 3
[34m[1mwandb[0m: 	hiddennodes: 128
[34m[1mwandb[0m: 	initializer: Xavier
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	num_epochs: 10
[34m[1mwandb[0m: 	opt: momentum


0,1
epoch,▁▂▃▃▄▅▆▆▇█
loss,█▄▃▂▂▂▁▁▁▁
test_accuracy,▁
val_accuracy,▁▄▅▆▆▇▇▇██

0,1
epoch,10.0
loss,0.54116
test_accuracy,0.8062
val_accuracy,0.8185


[34m[1mwandb[0m: Agent Starting Run: l9yollu7 with config:
[34m[1mwandb[0m: 	activation_func: relu
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	hiddenlayers: 5
[34m[1mwandb[0m: 	hiddennodes: 256
[34m[1mwandb[0m: 	initializer: Xavier
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	num_epochs: 15
[34m[1mwandb[0m: 	opt: rmsprop


0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
loss,█▄▃▂▂▂▂▂▁▁▁▁▁▁▁
test_accuracy,▁
val_accuracy,▁▅▇▆▇▇█▇█████▇█

0,1
epoch,15.0
loss,0.27489
test_accuracy,0.8712
val_accuracy,0.87883


[34m[1mwandb[0m: Agent Starting Run: f1kah73c with config:
[34m[1mwandb[0m: 	activation_func: relu
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	hiddenlayers: 3
[34m[1mwandb[0m: 	hiddennodes: 128
[34m[1mwandb[0m: 	initializer: Xavier
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	num_epochs: 10
[34m[1mwandb[0m: 	opt: nesterov


0,1
epoch,▁▂▃▃▄▅▆▆▇█
loss,█▃▂▂▂▁▁▁▁▁
test_accuracy,▁
val_accuracy,▁▄▅▆▆▇▇███

0,1
epoch,10.0
loss,0.39626
test_accuracy,0.8488
val_accuracy,0.85717


[34m[1mwandb[0m: Agent Starting Run: kci824ma with config:
[34m[1mwandb[0m: 	activation_func: relu
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	hiddenlayers: 5
[34m[1mwandb[0m: 	hiddennodes: 128
[34m[1mwandb[0m: 	initializer: Xavier
[34m[1mwandb[0m: 	learning_rate: 0.0005
[34m[1mwandb[0m: 	loss: cross_entropy
[34m[1mwandb[0m: 	num_epochs: 10
[34m[1mwandb[0m: 	opt: rmsprop


0,1
epoch,▁▂▃▃▄▅▆▆▇█
loss,█▄▃▃▂▂▂▁▁▁
test_accuracy,▁
val_accuracy,▁▅▆▇▅██▇▆▄

0,1
epoch,10.0
loss,0.25128
test_accuracy,0.8443
val_accuracy,0.85117
