In [1]:
import wandb
wandb.login()


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mda24m015[0m ([33mda24m015-iitm[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [2]:
import wandb
import numpy as np
import tensorflow as tf
from keras.utils import to_categorical
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
from keras.datasets import fashion_mnist

# Initialize wandb
# wandb.init(project="assignment_1", entity="da24m015-iitm")

# Define class names for Fashion MNIST dataset
class_names = ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat',
               'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot']

# Load Fashion MNIST dataset
(x_train, y_train), (x_test, y_test) = fashion_mnist.load_data()

# Preprocess data
x_train = x_train.reshape(x_train.shape[0], 784) / 255.0
x_test = x_test.reshape(x_test.shape[0], 784) / 255.0
y_train = to_categorical(y_train)
y_test = to_categorical(y_test)

# Split into training & validation sets
val_size = int(0.1 * len(x_train))
x_val, y_val = x_train[:val_size], y_train[:val_size]
x_train, y_train = x_train[val_size:], y_train[val_size:]

# Activation functions and their gradients
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def relu(x):
    return np.maximum(0, x)

def tanh(x):
    return np.tanh(x)

def softmax(x):
    e_x = np.exp(x - np.max(x, axis=1, keepdims=True))
    return e_x / e_x.sum(axis=1, keepdims=True)

def grad_sigmoid(x):
    return sigmoid(x) * (1 - sigmoid(x))

def grad_relu(x):
    return (x > 0).astype(float)

def grad_tanh(x):
    return 1 - tanh(x)**2

# Loss function
def cross_entropy_loss(y_pred, y_true):
    return -np.mean(np.sum(y_true * np.log(y_pred + 1e-8), axis=1))

# Forward propagation
def forward_propagation(X, theta, num_hidden_layers, activation_func):
    activations = {"a0": X.T}
    for i in range(1, num_hidden_layers + 2):
        W = theta[f"W{i}"]
        b = theta[f"b{i}"]
        Z = W @ activations[f"a{i-1}"] + b
        
        if i < num_hidden_layers + 1:
            activations[f"a{i}"] = activation_func(Z)
        else:
            activations[f"a{i}"] = softmax(Z.T).T
    
    return activations

# Backward propagation
def backward_propagation(Y_true, activations, theta, num_hidden_layers, activation_grad_func):
    grads = {}
    Y_pred = activations[f"a{num_hidden_layers + 1}"]
    grads[f"da{num_hidden_layers + 1}"] = Y_pred - Y_true.T
    
    for i in range(num_hidden_layers, 0, -1):
        grads[f"dW{i}"] = grads[f"da{i+1}"] @ activations[f"a{i}"].T / Y_true.shape[0]
        grads[f"db{i}"] = np.sum(grads[f"da{i+1}"], axis=1, keepdims=True) / Y_true.shape[0]
        grads[f"da{i}"] = (theta[f"W{i+1}"].T @ grads[f"da{i+1}"]) * activation_grad_func(activations[f"a{i}"])
    
    return grads

# Initialize weights
def initialize_weights(input_size, hidden_sizes, output_size, weight_init):
    theta = {}
    layer_sizes = [input_size] + hidden_sizes + [output_size]
    
    for i in range(1, len(layer_sizes)):
        if weight_init == 'xavier':
            theta[f"W{i}"] = np.random.randn(layer_sizes[i], layer_sizes[i-1]) * np.sqrt(2 / layer_sizes[i-1])
        else:
            theta[f"W{i}"] = np.random.randn(layer_sizes[i], layer_sizes[i-1]) * 0.01
        theta[f"b{i}"] = np.zeros((layer_sizes[i], 1))
    
    return theta

# Optimizers
def sgd(params, grads, learning_rate):
    for key in params:
        if key in grads:
            params[key] -= learning_rate * grads[key]
    return params

def momentum(params, grads, v, learning_rate, beta=0.9):
    for key in params:
        if key in grads:
            v[key] = beta * v[key] + learning_rate * grads[key]
            params[key] -= v[key]
    return params, v

def nesterov(params, grads, v, learning_rate, beta=0.9):
    for key in params:
        if key in grads:
            v_prev = v[key]
            v[key] = beta * v[key] + learning_rate * grads[key]
            params[key] -= beta * v_prev + (1 + beta) * v[key]
    return params, v

def rmsprop(params, grads, s, learning_rate, beta=0.9, epsilon=1e-8):
    for key in params:
        if key in grads:
            s[key] = beta * s[key] + (1 - beta) * (grads[key]**2)
            params[key] -= learning_rate * grads[key] / (np.sqrt(s[key]) + epsilon)
    return params, s

def adam(params, grads, m, v, t, learning_rate, beta1=0.9, beta2=0.999, epsilon=1e-8):
    for key in params:
        if key in grads:
            m[key] = beta1 * m[key] + (1 - beta1) * grads[key]
            v[key] = beta2 * v[key] + (1 - beta2) * (grads[key] ** 2)
            m_corrected = m[key] / (1 - beta1**t)
            v_corrected = v[key] / (1 - beta2**t)
            params[key] -= learning_rate * m_corrected / (np.sqrt(v_corrected) + epsilon)
    return params, m, v

def nadam(params, grads, m, v, t, learning_rate, beta1=0.9, beta2=0.999, epsilon=1e-8):
    for key in params:
        if key in grads:
            m[key] = beta1 * m[key] + (1 - beta1) * grads[key]
            v[key] = beta2 * v[key] + (1 - beta2) * (grads[key] ** 2)
            m_corrected = m[key] / (1 - beta1**t)
            v_corrected = v[key] / (1 - beta2**t)
            params[key] -= learning_rate * (beta1 * m_corrected + (1 - beta1) * grads[key] / (1 - beta1**t)) / (np.sqrt(v_corrected) + epsilon)
    return params, m, v


def train_model(config=None):
    with wandb.init(config=config):
        config = wandb.config
        
        theta = initialize_weights(784, [config.hidden_size] * config.number_hidden, 10, config.weight_init)
        s = {key: np.zeros_like(value) for key, value in theta.items()}  # For RMSprop, Adam, Nadam
        m = {key: np.zeros_like(value) for key, value in theta.items()}  # For Adam, Nadam
        v = {key: np.zeros_like(value) for key, value in theta.items()}  # For Momentum, Nesterov, Adam, Nadam
        t = 0  # Time step for Adam and Nadam
        
        num_samples = x_train.shape[0]  

        for epoch in range(config.epochs):
            # Shuffle training data at the start of each epoch
            indices = np.random.permutation(num_samples)
            x_train_shuffled, y_train_shuffled = x_train[indices], y_train[indices]

            total_loss, total_acc = 0, 0
            num_batches = num_samples // config.batch_size  

            for i in range(0, num_samples, config.batch_size):
                batch_x = x_train_shuffled[i:i + config.batch_size]
                batch_y = y_train_shuffled[i:i + config.batch_size]
                
                # Forward propagation
                activations = forward_propagation(batch_x, theta, config.number_hidden, globals()[config.activation])
                y_pred = activations[f"a{config.number_hidden+1}"].T  # Shape: (batch_size, 10)
                
                # Compute loss & accuracy for batch
                loss = cross_entropy_loss(y_pred, batch_y)
                acc = np.mean(np.argmax(y_pred, axis=1) == np.argmax(batch_y, axis=1))
                
                total_loss += loss * len(batch_x)  # Scale loss by batch size
                total_acc += acc * len(batch_x)  # Scale accuracy by batch size

                # Backward propagation
                grads = backward_propagation(batch_y, activations, theta, config.number_hidden, globals()[f"grad_{config.activation}"])
                
                t += 1  # Increment time step for Adam/Nadam
                
                # Apply optimizer update
                if config.optimizer == "sgd":
                    theta = sgd(theta, grads, config.learning_rate)
                elif config.optimizer == "momentum":
                    theta, v = momentum(theta, grads, v, config.learning_rate)
                elif config.optimizer == "nesterov":
                    theta, v = nesterov(theta, grads, v, config.learning_rate)
                elif config.optimizer == "rmsprop":
                    theta, s = rmsprop(theta, grads, s, config.learning_rate)
                elif config.optimizer == "adam":
                    theta, m, v = adam(theta, grads, m, v, t, config.learning_rate)
                elif config.optimizer == "nadam":
                    theta, m, v = nadam(theta, grads, m, v, t, config.learning_rate)

            # Compute average training loss and accuracy for the epoch
            avg_loss = total_loss / num_samples
            avg_acc = total_acc / num_samples

            # Compute validation loss and accuracy
            val_activations = forward_propagation(x_val, theta, config.number_hidden, globals()[config.activation])
            y_val_pred = val_activations[f"a{config.number_hidden+1}"].T  # Shape: (num_samples, 10)

            val_loss = cross_entropy_loss(y_val_pred, y_val)
            val_acc = np.mean(np.argmax(y_val_pred, axis=1) == np.argmax(y_val, axis=1))

            # Log metrics to Weights & Biases
            wandb.log({"epoch": epoch + 1, "loss": avg_loss, "accuracy": avg_acc, "val_loss": val_loss, "val_accuracy": val_acc})

def train(self, X_train, y_train, x_val, y_val, epochs, batch_size):
        num_samples = X_train.shape[0]

        for epoch in range(epochs):
            indices = np.random.permutation(num_samples)
            X_train, y_train = X_train[indices], y_train[indices]

            total_loss, total_acc = 0, 0
            num_batches = num_samples // batch_size

            for i in range(0, num_samples, batch_size):
                X_batch = X_train[i:i + batch_size]
                y_batch = y_train[i:i + batch_size]
                
                # Forward Pass
                y_pred = self.forward(X_batch)
                
                # Compute Loss & Accuracy
                loss = self.compute_loss(y_batch, y_pred)
                acc = np.mean(np.argmax(y_pred, axis=1) == np.argmax(y_batch, axis=1))
                
                total_loss += loss * len(X_batch)
                total_acc += acc * len(X_batch)
                
                # Backward Pass
                grads_W, grads_b = self.backward(X_batch, y_batch)
                
                # Update Weights using the selected optimizer
                self.t += 1  # Increment timestep for Adam/Nadam
                if self.optimizer == "sgd":
                    sgd_update(self.weights, self.biases, grads_W, grads_b, self.learning_rate)
                elif self.optimizer == "momentum":
                    momentum_update(self.weights, self.biases, grads_W, grads_b, self.learning_rate, self.velocity_W,self.velocity_b)
                elif self.optimizer == "nesterov":
                    nesterov_update(self.weights, self.biases, grads_W, grads_b, self.learning_rate, self.velocity_W,self.velocity_b)
                elif self.optimizer == "rmsprop":
                    rmsprop_update(self.weights, self.biases, grads_W, grads_b, self.learning_rate, self.velocity_W,self.velocity_b, self.beta, self.epsilon)
                elif self.optimizer == "adam":
                    adam_update(self.weights, self.biases, grads_W, grads_b, self.learning_rate, self.velocity_W,self.velocity_b,self.moment2_W, self.moment2_b, self.beta1, self.beta2, self.epsilon, self.t)
                elif self.optimizer == "nadam":
                    nadam_update(self.weights, self.biases, grads_W, grads_b, self.learning_rate, self.velocity_W,self.velocity_b,self.moment2_W, self.moment2_b, self.beta1, self.beta2, self.epsilon, self.t)
            
            # Compute average loss and accuracy for the epoch
            avg_loss = total_loss / num_samples
            avg_acc = total_acc / num_samples

            # Validation Metrics
            y_val_pred = self.forward(x_val)
            val_loss = self.compute_loss(y_val, y_val_pred)
            val_acc = np.mean(np.argmax(y_val_pred, axis=1) == np.argmax(y_val, axis=1))

            # Log to Weights & Biases
            wandb.log({"epoch": epoch + 1, "loss": avg_loss, "accuracy": avg_acc, "val_loss": val_loss, "val_accuracy":val_acc})

# Sweep configuration
sweep_config = {
    'method': 'random',
    'metric': {'name': 'accuracy', 'goal': 'maximize'},
    'parameters': {
        'epochs': {'values': [5, 10]},
        'number_hidden': {'values': [3, 4, 5]},
        'hidden_size': {'values': [32, 64, 128]},
        'learning_rate': {'values': [1e-3, 1e-4]},
        'optimizer': {'values': ['sgd', 'momentum', 'nesterov', 'rmsprop', 'adam', 'nadam']},
        'batch_size': {'values': [16, 32, 64]},
        'activation': {'values': ['sigmoid', 'tanh', 'relu']},
        'weight_init': {'values': ['random', 'xavier']}
    }
}

# Run the sweep
sweep_id = wandb.sweep(sweep_config, project="assignment_1")
wandb.agent(sweep_id, train_model)



# def train_model(config=None):
#     with wandb.init(config=config):
#         config = wandb.config
        
#         theta = initialize_weights(784, [config.hidden_size] * config.number_hidden, 10, config.weight_init)
#         s = {key: np.zeros_like(value) for key, value in theta.items()}  # For RMSprop, Adam, Nadam
#         m = {key: np.zeros_like(value) for key, value in theta.items()}  # For Adam, Nadam
#         v = {key: np.zeros_like(value) for key, value in theta.items()}  # For Momentum, Nesterov, Adam, Nadam
#         t = 0  # Time step for Adam and Nadam
        
#         for epoch in range(config.epochs):
#             for i in range(0, len(x_train), config.batch_size):
#                 batch_x = x_train[i:i+config.batch_size]
#                 batch_y = y_train[i:i+config.batch_size]
                
#                 activations = forward_propagation(batch_x, theta, config.number_hidden, globals()[config.activation])
#                 grads = backward_propagation(batch_y, activations, theta, config.number_hidden, globals()[f"grad_{config.activation}"])
                
#                 t += 1  # Increment time step for Adam & Nadam
                
#                 # Select optimizer dynamically
#                 if config.optimizer == "sgd":
#                     theta = sgd(theta, grads, config.learning_rate)
#                 elif config.optimizer == "momentum":
#                     theta, v = momentum(theta, grads, v, config.learning_rate)
#                 elif config.optimizer == "nesterov":
#                     theta, v = nesterov(theta, grads, v, config.learning_rate)
#                 elif config.optimizer == "rmsprop":
#                     theta, s = rmsprop(theta, grads, s, config.learning_rate)
#                 elif config.optimizer == "adam":
#                     theta, m, v = adam(theta, grads, m, v, t, config.learning_rate)
#                 elif config.optimizer == "nadam":
#                     theta, m, v = nadam(theta, grads, m, v, t, config.learning_rate)
            
#             # Compute training and validation loss & accuracy
#             train_activations = forward_propagation(x_train, theta, config.number_hidden, globals()[config.activation])
#             val_activations = forward_propagation(x_val, theta, config.number_hidden, globals()[config.activation])
            
#             train_loss = cross_entropy_loss(train_activations[f"a{config.number_hidden+1}"].T, y_train)
#             train_acc = np.mean(np.argmax(train_activations[f"a{config.number_hidden+1}"].T, axis=1) == np.argmax(y_train, axis=1))
            
#             val_loss = cross_entropy_loss(val_activations[f"a{config.number_hidden+1}"].T, y_val)
#             val_acc = np.mean(np.argmax(val_activations[f"a{config.number_hidden+1}"].T, axis=1) == np.argmax(y_val, axis=1))

#             wandb.log({"epoch": epoch, "loss": train_loss, "accuracy": train_acc, "val_loss": val_loss, "val_accuracy": val_acc})


# def train_model(config=None):
#     with wandb.init(config=config):
#         config = wandb.config
        
#         theta = initialize_weights(784, [config.hidden_size] * config.number_hidden, 10, config.weight_init)
#         s = {key: np.zeros_like(value) for key, value in theta.items()}  # For RMSprop, Adam, Nadam
#         m = {key: np.zeros_like(value) for key, value in theta.items()}  # For Adam, Nadam
#         v = {key: np.zeros_like(value) for key, value in theta.items()}  # For Momentum, Nesterov, Adam, Nadam
#         t = 0  # Time step for Adam and Nadam
        
#         for epoch in range(config.epochs):
#             for i in range(0, len(x_train), config.batch_size):
#                 batch_x = x_train[i:i+config.batch_size]
#                 batch_y = y_train[i:i+config.batch_size]
                
#                 activations = forward_propagation(batch_x, theta, config.number_hidden, globals()[config.activation])
#                 grads = backward_propagation(batch_y, activations, theta, config.number_hidden, globals()[f"grad_{config.activation}"])
                
#                 t += 1  # Increment time step for Adam & Nadam
                
#                 # Select optimizer dynamically
#                 if config.optimizer == "sgd":
#                     theta = sgd(theta, grads, config.learning_rate)
#                 elif config.optimizer == "momentum":
#                     theta, v = momentum(theta, grads, v, config.learning_rate)
#                 elif config.optimizer == "nesterov":
#                     theta, v = nesterov(theta, grads, v, config.learning_rate)
#                 elif config.optimizer == "rmsprop":
#                     theta, s = rmsprop(theta, grads, s, config.learning_rate)
#                 elif config.optimizer == "adam":
#                     theta, m, v = adam(theta, grads, m, v, t, config.learning_rate)
#                 elif config.optimizer == "nadam":
#                     theta, m, v = nadam(theta, grads, m, v, t, config.learning_rate)
            
#             # Compute training and validation loss & accuracy
#             train_activations = forward_propagation(x_train, theta, config.number_hidden, globals()[config.activation])
#             val_activations = forward_propagation(x_val, theta, config.number_hidden, globals()[config.activation])
            
#             y_train_pred = train_activations[f"a{config.number_hidden+1}"].T  # Shape (num_samples, 10)
#             y_val_pred = val_activations[f"a{config.number_hidden+1}"].T  # Shape (num_samples, 10)

#             train_loss = cross_entropy_loss(y_train_pred, y_train)
#             val_loss = cross_entropy_loss(y_val_pred, y_val)

#             train_acc = np.mean(np.argmax(y_train_pred, axis=1) == np.argmax(y_train, axis=1))
#             val_acc = np.mean(np.argmax(y_val_pred, axis=1) == np.argmax(y_val, axis=1))

#             wandb.log({"epoch": epoch, "loss": train_loss, "accuracy": train_acc, "val_loss": val_loss, "val_accuracy": val_acc})



[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Create sweep with ID: 2jh1wylu
Sweep URL: https://wandb.ai/da24m015-iitm/assignment_1/sweeps/2jh1wylu


[34m[1mwandb[0m: Agent Starting Run: t8q861d0 with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	number_hidden: 3
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_init: xavier
[34m[1mwandb[0m: Currently logged in as: [33mda24m015[0m ([33mda24m015-iitm[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


0,1
accuracy,▁▁▁▁▁
epoch,▁▃▅▆█
loss,▁▁▁▁▁
val_accuracy,▁▁▁▁▁
val_loss,▁▁▁▁▁

0,1
accuracy,0.1143
epoch,5.0
loss,2.50199
val_accuracy,0.11033
val_loss,2.50552


[34m[1mwandb[0m: Agent Starting Run: j3o6n3ym with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	number_hidden: 3
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_init: random


0,1
accuracy,▁▁▁▁▁
epoch,▁▃▅▆█
loss,▁▁▁▁▁
val_accuracy,▁▁▁▁▁
val_loss,▁▁▁▁▁

0,1
accuracy,0.11309
epoch,5.0
loss,2.30258
val_accuracy,0.1115
val_loss,2.30258


[34m[1mwandb[0m: Agent Starting Run: 3kqupzs0 with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	number_hidden: 4
[34m[1mwandb[0m: 	optimizer: momentum
[34m[1mwandb[0m: 	weight_init: random


0,1
accuracy,▁▁▁▁▁
epoch,▁▃▅▆█
loss,▁▁▁▁▁
val_accuracy,▁▁▁▁▁
val_loss,▁▁▁▁▁

0,1
accuracy,0.11281
epoch,5.0
loss,2.30258
val_accuracy,0.11467
val_loss,2.30258


[34m[1mwandb[0m: Agent Starting Run: lbawwv3h with config:
[34m[1mwandb[0m: 	activation: relu
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	number_hidden: 3
[34m[1mwandb[0m: 	optimizer: momentum
[34m[1mwandb[0m: 	weight_init: random


0,1
accuracy,▁▁▁▁▁▁▁▁▁▁
epoch,▁▂▃▃▄▅▆▆▇█
loss,▁▁▁▁▁▁▁▁▁▁
val_accuracy,▁▁▁▁▁▁▁▁▁▁
val_loss,▁▁▁▁▁▁▁▁▁▁

0,1
accuracy,0.10826
epoch,10.0
loss,2.30258
val_accuracy,0.11183
val_loss,2.30258


[34m[1mwandb[0m: Agent Starting Run: fhltdioh with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	number_hidden: 4
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_init: xavier


0,1
accuracy,▁▁▁▁▁
epoch,▁▃▅▆█
loss,▁▁▁▁▁
val_accuracy,▁▁▁▁▁
val_loss,▁▁▁▁▁

0,1
accuracy,0.09215
epoch,5.0
loss,2.52126
val_accuracy,0.092
val_loss,2.52237


[34m[1mwandb[0m: Agent Starting Run: cxdleovb with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	number_hidden: 4
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	weight_init: random


0,1
accuracy,▁▁▁▁▁
epoch,▁▃▅▆█
loss,▁▁▁▁▁
val_accuracy,▁▁▁▁▁
val_loss,▁▁▁▁▁

0,1
accuracy,0.14211
epoch,5.0
loss,2.30258
val_accuracy,0.13817
val_loss,2.30258


[34m[1mwandb[0m: Agent Starting Run: msqkio7k with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	number_hidden: 4
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_init: xavier


0,1
accuracy,▁▁▁▁▁▁▁▁▁▁
epoch,▁▂▃▃▄▅▆▆▇█
loss,▁▁▁▁▁▁▁▁▁▁
val_accuracy,▁▁▁▁▁▁▁▁▁▁
val_loss,▁▁▁▁▁▁▁▁▁▁

0,1
accuracy,0.10019
epoch,10.0
loss,2.49772
val_accuracy,0.09833
val_loss,2.50582


[34m[1mwandb[0m: Agent Starting Run: d06pyr82 with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	number_hidden: 3
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	weight_init: xavier


0,1
accuracy,▁▁▁▁▁
epoch,▁▃▅▆█
loss,▁▁▁▁▁
val_accuracy,▁▁▁▁▁
val_loss,▁▁▁▁▁

0,1
accuracy,0.0992
epoch,5.0
loss,2.58742
val_accuracy,0.10717
val_loss,2.57128


[34m[1mwandb[0m: Agent Starting Run: fha4g28e with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	number_hidden: 5
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_init: random


0,1
accuracy,▁▁▁▁▁▁▁▁▁▁
epoch,▁▂▃▃▄▅▆▆▇█
loss,▁▁▁▁▁▁▁▁▁▁
val_accuracy,▁▁▁▁▁▁▁▁▁▁
val_loss,▁▁▁▁▁▁▁▁▁▁

0,1
accuracy,0.09978
epoch,10.0
loss,2.30333
val_accuracy,0.102
val_loss,2.3034


[34m[1mwandb[0m: Agent Starting Run: wiex6pnm with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	number_hidden: 4
[34m[1mwandb[0m: 	optimizer: nesterov
[34m[1mwandb[0m: 	weight_init: xavier


0,1
accuracy,▁▁▁▁▁
epoch,▁▃▅▆█
loss,▁▁▁▁▁
val_accuracy,▁▁▁▁▁
val_loss,▁▁▁▁▁

0,1
accuracy,0.11467
epoch,5.0
loss,2.45233
val_accuracy,0.11117
val_loss,2.46538


[34m[1mwandb[0m: Agent Starting Run: l4cv7bc9 with config:
[34m[1mwandb[0m: 	activation: relu
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	number_hidden: 4
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_init: random


0,1
accuracy,▁▁▁▁▁▁▁▁▁▁
epoch,▁▂▃▃▄▅▆▆▇█
loss,▁▁▁▁▁▁▁▁▁▁
val_accuracy,▁▁▁▁▁▁▁▁▁▁
val_loss,▁▁▁▁▁▁▁▁▁▁

0,1
accuracy,0.15554
epoch,10.0
loss,2.30258
val_accuracy,0.149
val_loss,2.30258


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: vc1xi5dg with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	number_hidden: 5
[34m[1mwandb[0m: 	optimizer: momentum
[34m[1mwandb[0m: 	weight_init: random


0,1
accuracy,▁▁▁▁▁
epoch,▁▃▅▆█
loss,▁▁▁▁▁
val_accuracy,▁▁▁▁▁
val_loss,▁▁▁▁▁

0,1
accuracy,0.10652
epoch,5.0
loss,2.30258
val_accuracy,0.09983
val_loss,2.30259


[34m[1mwandb[0m: Agent Starting Run: lajk6pzu with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	number_hidden: 5
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_init: random


0,1
accuracy,▁▁▁▁▁▁▁▁▁▁
epoch,▁▂▃▃▄▅▆▆▇█
loss,▁▁▁▁▁▁▁▁▁▁
val_accuracy,▁▁▁▁▁▁▁▁▁▁
val_loss,▁▁▁▁▁▁▁▁▁▁

0,1
accuracy,0.14289
epoch,10.0
loss,2.30258
val_accuracy,0.14
val_loss,2.30258


[34m[1mwandb[0m: Agent Starting Run: 1pp4e0u9 with config:
[34m[1mwandb[0m: 	activation: relu
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	number_hidden: 4
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	weight_init: random


0,1
accuracy,▁▁▁▁▁
epoch,▁▃▅▆█
loss,▁▁▁▁▁
val_accuracy,▁▁▁▁▁
val_loss,▁▁▁▁▁

0,1
accuracy,0.12043
epoch,5.0
loss,2.30258
val_accuracy,0.115
val_loss,2.30258


[34m[1mwandb[0m: Agent Starting Run: qbcmsn05 with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	number_hidden: 5
[34m[1mwandb[0m: 	optimizer: nesterov
[34m[1mwandb[0m: 	weight_init: random


0,1
accuracy,▁▁▁▁▁▁▁▁▁▁
epoch,▁▂▃▃▄▅▆▆▇█
loss,▁▁▁▁▁▁▁▁▁▁
val_accuracy,▁▁▁▁▁▁▁▁▁▁
val_loss,▁▁▁▁▁▁▁▁▁▁

0,1
accuracy,0.09969
epoch,10.0
loss,2.30387
val_accuracy,0.10283
val_loss,2.30319


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: mrra1m8w with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	number_hidden: 4
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	weight_init: random


0,1
accuracy,▁▁▁▁▁
epoch,▁▃▅▆█
loss,▁▁▁▁▁
val_accuracy,▁▁▁▁▁
val_loss,▁▁▁▁▁

0,1
accuracy,0.10202
epoch,5.0
loss,2.30258
val_accuracy,0.101
val_loss,2.30258


[34m[1mwandb[0m: Agent Starting Run: hrd6fbbn with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	number_hidden: 4
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_init: random


0,1
accuracy,▁▁▁▁▁
epoch,▁▃▅▆█
loss,▁▁▁▁▁
val_accuracy,▁▁▁▁▁
val_loss,▁▁▁▁▁

0,1
accuracy,0.08485
epoch,5.0
loss,2.30258
val_accuracy,0.0905
val_loss,2.30258


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 9kyaqsdy with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	number_hidden: 5
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	weight_init: random


0,1
accuracy,▁▁▁▁▁▁▁▁▁▁
epoch,▁▂▃▃▄▅▆▆▇█
loss,▁▁▁▁▁▁▁▁▁▁
val_accuracy,▁▁▁▁▁▁▁▁▁▁
val_loss,▁▁▁▁▁▁▁▁▁▁

0,1
accuracy,0.09572
epoch,10.0
loss,2.30258
val_accuracy,0.09267
val_loss,2.30258


[34m[1mwandb[0m: Agent Starting Run: jlhird9w with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	number_hidden: 4
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	weight_init: random


0,1
accuracy,▁▁▁▁▁▁▁▁▁▁
epoch,▁▂▃▃▄▅▆▆▇█
loss,▁▁▁▁▁▁▁▁▁▁
val_accuracy,▁▁▁▁▁▁▁▁▁▁
val_loss,▁▁▁▁▁▁▁▁▁▁

0,1
accuracy,0.09985
epoch,10.0
loss,2.30307
val_accuracy,0.10133
val_loss,2.30299


[34m[1mwandb[0m: Agent Starting Run: 17ej3q2e with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	number_hidden: 4
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	weight_init: xavier


0,1
accuracy,▁▁▁▁▁▁▁▁▁▁
epoch,▁▂▃▃▄▅▆▆▇█
loss,▁▁▁▁▁▁▁▁▁▁
val_accuracy,▁▁▁▁▁▁▁▁▁▁
val_loss,▁▁▁▁▁▁▁▁▁▁

0,1
accuracy,0.10074
epoch,10.0
loss,2.77349
val_accuracy,0.09333
val_loss,2.78432


[34m[1mwandb[0m: Agent Starting Run: k8jrpm95 with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	number_hidden: 4
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	weight_init: random


0,1
accuracy,▁▁▁▁▁
epoch,▁▃▅▆█
loss,▁▁▁▁▁
val_accuracy,▁▁▁▁▁
val_loss,▁▁▁▁▁

0,1
accuracy,0.10019
epoch,5.0
loss,2.30305
val_accuracy,0.09833
val_loss,2.30327


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: ta253ma5 with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	number_hidden: 4
[34m[1mwandb[0m: 	optimizer: momentum
[34m[1mwandb[0m: 	weight_init: random


0,1
accuracy,▁▁▁▁▁
epoch,▁▃▅▆█
loss,▁▁▁▁▁
val_accuracy,▁▁▁▁▁
val_loss,▁▁▁▁▁

0,1
accuracy,0.10626
epoch,5.0
loss,2.30258
val_accuracy,0.09833
val_loss,2.30258


[34m[1mwandb[0m: Agent Starting Run: hfsysf37 with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	number_hidden: 3
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	weight_init: xavier


0,1
accuracy,▁▁▁▁▁
epoch,▁▃▅▆█
loss,▁▁▁▁▁
val_accuracy,▁▁▁▁▁
val_loss,▁▁▁▁▁

0,1
accuracy,0.10441
epoch,5.0
loss,2.45891
val_accuracy,0.1015
val_loss,2.46697


[34m[1mwandb[0m: Agent Starting Run: vipijyqo with config:
[34m[1mwandb[0m: 	activation: relu
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	number_hidden: 5
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_init: xavier


0,1
accuracy,▁▁▁▁▁▁▁▁▁▁
epoch,▁▂▃▃▄▅▆▆▇█
loss,▁▁▁▁▁▁▁▁▁▁
val_accuracy,▁▁▁▁▁▁▁▁▁▁
val_loss,▁▁▁▁▁▁▁▁▁▁

0,1
accuracy,0.0618
epoch,10.0
loss,2.34256
val_accuracy,0.05867
val_loss,2.34087


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: ghe29bpq with config:
[34m[1mwandb[0m: 	activation: relu
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	number_hidden: 4
[34m[1mwandb[0m: 	optimizer: nesterov
[34m[1mwandb[0m: 	weight_init: xavier


0,1
accuracy,▁▁▁▁▁▁▁▁▁▁
epoch,▁▂▃▃▄▅▆▆▇█
loss,▁▁▁▁▁▁▁▁▁▁
val_accuracy,▁▁▁▁▁▁▁▁▁▁
val_loss,▁▁▁▁▁▁▁▁▁▁

0,1
accuracy,0.04359
epoch,10.0
loss,3.02645
val_accuracy,0.042
val_loss,3.02708


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: v9bv0m7l with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	number_hidden: 5
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	weight_init: xavier


0,1
accuracy,▁▁▁▁▁
epoch,▁▃▅▆█
loss,▁▁▁▁▁
val_accuracy,▁▁▁▁▁
val_loss,▁▁▁▁▁

0,1
accuracy,0.09969
epoch,5.0
loss,2.66804
val_accuracy,0.10283
val_loss,2.65819


[34m[1mwandb[0m: Agent Starting Run: e1y661xx with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	number_hidden: 3
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	weight_init: random


0,1
accuracy,▁▁▁▁▁
epoch,▁▃▅▆█
loss,▁▁▁▁▁
val_accuracy,▁▁▁▁▁
val_loss,▁▁▁▁▁

0,1
accuracy,0.10019
epoch,5.0
loss,2.30287
val_accuracy,0.09833
val_loss,2.3026


[34m[1mwandb[0m: Agent Starting Run: ky6unq34 with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	number_hidden: 4
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_init: random


0,1
accuracy,▁▁▁▁▁
epoch,▁▃▅▆█
loss,▁▁▁▁▁
val_accuracy,▁▁▁▁▁
val_loss,▁▁▁▁▁

0,1
accuracy,0.10019
epoch,5.0
loss,2.30292
val_accuracy,0.09833
val_loss,2.30336


[34m[1mwandb[0m: Agent Starting Run: tquede94 with config:
[34m[1mwandb[0m: 	activation: relu
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	number_hidden: 5
[34m[1mwandb[0m: 	optimizer: nesterov
[34m[1mwandb[0m: 	weight_init: random


0,1
accuracy,▁▁▁▁▁▁▁▁▁▁
epoch,▁▂▃▃▄▅▆▆▇█
loss,▁▁▁▁▁▁▁▁▁▁
val_accuracy,▁▁▁▁▁▁▁▁▁▁
val_loss,▁▁▁▁▁▁▁▁▁▁

0,1
accuracy,0.10385
epoch,10.0
loss,2.30259
val_accuracy,0.10283
val_loss,2.30259


[34m[1mwandb[0m: Ctrl + C detected. Stopping sweep.
