In [1]:
#JV

In [2]:
from train import *
import wandb

In [None]:
"""WandB API Key Masked!!!"""

In [4]:
# Define the W&B sweep (grid over the hyperparameters below) and register it.

# NOTE(review): alias of the Optimiser class; no visible cell reads this name — confirm before removing.
optim = Optimiser

# Candidate values per hyperparameter. Insertion order is kept when this is
# expanded into the {'values': [...]} form that the sweep config expects.
_search_space = {
    'number_of_hidden_layers': [3, 4, 5],
    'hidden_size': [32, 64, 128],
    'activation': ['sigmoid', 'relu', 'tanh'],
    'initialization': ["random", "xavier"],
    'optimiser': ["sgd", "gd_momentum", "gd_nesterov", "rmsprop", "adam", "nadam"],
    'epochs': [5, 10],
    'batch_sizes': [16, 32, 64],
    'lr': [1e-3, 1e-4],
    'weight_decay': [0, 0.0005, 0.5],
    'loss': ['cross entropy'],
}

sweep_config = {
    'method': 'grid',
    'name': 'sweep cross entropy',
    # The sweep is scored on validation accuracy, higher is better.
    'metric': {'name': 'val_accuracy', 'goal': 'maximize'},
    'parameters': {
        param_name: {'values': candidates}
        for param_name, candidates in _search_space.items()
    },
}

sweep_id = wandb.sweep(sweep=sweep_config, project='JV_First_Run')

Create sweep with ID: rl7b1vaw
Sweep URL: https://wandb.ai/tmajestical/JV_First_Run/sweeps/rl7b1vaw


In [5]:
## Train/test data comes from the fashion_mnist loader.
## Each image is flattened into a 1-D vector and scaled by 1/255.
## A random 10% of the flattened training data is held out for validation;
## the remaining 90% is used for training.

(x_train, y_train), (x_test, y_test) = fashion_mnist.load_data()

seed = 76  # single seed reused wherever randomness is needed

# reshape alone flattens each sample; a prior .flatten() would only add a copy
x_train_flattened = x_train.reshape(x_train.shape[0], -1) / 255
x_test_flattened = x_test.reshape(x_test.shape[0], -1) / 255

# Dedicated generator so the split does not depend on NumPy's global RNG state.
random_num_generator = np.random.RandomState(seed)

n_train_total = x_train_flattened.shape[0]
validation_indices = random_num_generator.choice(
    n_train_total, int(0.1 * n_train_total), replace=False
)
# setdiff1d returns the sorted indices that were not drawn for validation,
# without relying on the iteration order of a Python set.
train_indices = np.setdiff1d(np.arange(n_train_total), validation_indices)

x_train_data = x_train_flattened[train_indices]
y_train_data = y_train[train_indices]

x_validation_data = x_train_flattened[validation_indices]
y_validation_data = y_train[validation_indices]


In [6]:
def create_nnet_and_train(config):
    """Build a network from one hyperparameter configuration and train it.

    Args:
        config: object indexable by the keys 'number_of_hidden_layers',
            'hidden_size', 'initialization', 'activation', 'loss',
            'optimiser', 'lr', 'epochs', 'batch_sizes' and 'weight_decay'.

    Uses the module-level `seed`, `x_train_data`, `y_train_data`,
    `x_validation_data` and `y_validation_data`. Returns None.
    """
    network = NeuralNetwork(seed=seed)

    # Architecture: every hidden layer gets the same number of neurons.
    depth = config['number_of_hidden_layers']
    network.createNNet(
        number_of_hidden_layers=depth,
        neurons_per_hidden_layer=[config['hidden_size']] * depth,
        initialization=config['initialization'],
        activation=config['activation'],
    )

    trainer = Optimiser()

    # Keyword options forwarded to Optimiser.train; note that the config keys
    # 'batch_sizes' and 'weight_decay' map to `batch_size` and `l2_param`.
    training_options = {
        'optimiser': config['optimiser'],
        'lr': config['lr'],
        'epochs': config['epochs'],
        'batch_size': config['batch_sizes'],
        'l2_param': config['weight_decay'],
        'print_val_accuracy': False,
        'loss_type': config['loss'],
        'log_wandb_data': True,
    }

    trainer.train(
        network,
        [x_train_data, y_train_data],
        [x_validation_data, y_validation_data],
        **training_options
    )

In [None]:
def main():
    '''
    Entry point that the W&B sweep agent calls once per hyperparameter combination.

    The combination chosen for the current run is read from wandb.config, used
    to build a descriptive run name, and then passed to create_nnet_and_train.
    '''
    with wandb.init():
        cfg = wandb.config

        # (tag, value) pairs in the order they appear in the run name.
        name_fields = (
            ("hl", cfg.number_of_hidden_layers),
            ("hs", cfg.hidden_size),
            ("init", cfg.initialization),
            ("ac", cfg.activation),
            ("optim", cfg.optimiser),
            ("lr", cfg.lr),
            ("epochs", cfg.epochs),
            ("bs", cfg.batch_sizes),
            ("reg", cfg.weight_decay),
        )

        # Each field is rendered as "-<tag>_<value>" and the pieces are concatenated.
        wandb.run.name = "".join("-{}_{}".format(tag, value) for tag, value in name_fields)

        create_nnet_and_train(cfg)
        

# Start a sweep agent that calls main once per run, for at most `count` runs.
# NOTE(review): sweep_config describes a grid of 11,664 combinations
# (3*3*3*2*6*2*3*2*3*1), so 100 runs cover only a small part of it — confirm
# that this is intended.
wandb.agent(
    sweep_id,
    function=main,
    count=100,
)
wandb.finish()

[34m[1mwandb[0m: Agent Starting Run: lbb9moai with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_sizes: 16
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	initialization: random
[34m[1mwandb[0m: 	loss: cross entropy
[34m[1mwandb[0m: 	lr: 0.001
[34m[1mwandb[0m: 	number_of_hidden_layers: 3
[34m[1mwandb[0m: 	optimiser: sgd
[34m[1mwandb[0m: 	weight_decay: 0


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/54000 [00:00<?, ?it/s]

  return 1/(1+np.exp(-x))


  0%|          | 0/54000 [00:00<?, ?it/s]

  0%|          | 0/54000 [00:00<?, ?it/s]

  0%|          | 0/54000 [00:00<?, ?it/s]

  0%|          | 0/54000 [00:00<?, ?it/s]

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

[34m[1mwandb[0m: Agent Starting Run: dewt5ck4 with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_sizes: 16
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	initialization: random
[34m[1mwandb[0m: 	loss: cross entropy
[34m[1mwandb[0m: 	lr: 0.001
[34m[1mwandb[0m: 	number_of_hidden_layers: 3
[34m[1mwandb[0m: 	optimiser: sgd
[34m[1mwandb[0m: 	weight_decay: 0.0005


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/54000 [00:00<?, ?it/s]

  return 1/(1+np.exp(-x))


  0%|          | 0/54000 [00:00<?, ?it/s]

  0%|          | 0/54000 [00:00<?, ?it/s]

  0%|          | 0/54000 [00:00<?, ?it/s]

  0%|          | 0/54000 [00:00<?, ?it/s]

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 0v4n385y with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_sizes: 16
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	initialization: random
[34m[1mwandb[0m: 	loss: cross entropy
[34m[1mwandb[0m: 	lr: 0.001
[34m[1mwandb[0m: 	number_of_hidden_layers: 3
[34m[1mwandb[0m: 	optimiser: sgd
[34m[1mwandb[0m: 	weight_decay: 0.5


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/54000 [00:00<?, ?it/s]

  return 1/(1+np.exp(-x))


  0%|          | 0/54000 [00:00<?, ?it/s]

  0%|          | 0/54000 [00:00<?, ?it/s]

  0%|          | 0/54000 [00:00<?, ?it/s]

  0%|          | 0/54000 [00:00<?, ?it/s]

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

[34m[1mwandb[0m: Agent Starting Run: ij70cmev with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_sizes: 16
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	initialization: random
[34m[1mwandb[0m: 	loss: cross entropy
[34m[1mwandb[0m: 	lr: 0.001
[34m[1mwandb[0m: 	number_of_hidden_layers: 3
[34m[1mwandb[0m: 	optimiser: gd_momentum
[34m[1mwandb[0m: 	weight_decay: 0


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/54000 [00:00<?, ?it/s]

  return 1/(1+np.exp(-x))


  0%|          | 0/54000 [00:00<?, ?it/s]

  0%|          | 0/54000 [00:00<?, ?it/s]

  0%|          | 0/54000 [00:00<?, ?it/s]

  0%|          | 0/54000 [00:00<?, ?it/s]

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

[34m[1mwandb[0m: Agent Starting Run: mk2lo3td with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_sizes: 16
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	initialization: random
[34m[1mwandb[0m: 	loss: cross entropy
[34m[1mwandb[0m: 	lr: 0.001
[34m[1mwandb[0m: 	number_of_hidden_layers: 3
[34m[1mwandb[0m: 	optimiser: gd_momentum
[34m[1mwandb[0m: 	weight_decay: 0.0005


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/54000 [00:00<?, ?it/s]

  return 1/(1+np.exp(-x))


  0%|          | 0/54000 [00:00<?, ?it/s]

  0%|          | 0/54000 [00:00<?, ?it/s]

  0%|          | 0/54000 [00:00<?, ?it/s]

  0%|          | 0/54000 [00:00<?, ?it/s]

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

[34m[1mwandb[0m: Agent Starting Run: 6a2e5uoe with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_sizes: 16
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	initialization: random
[34m[1mwandb[0m: 	loss: cross entropy
[34m[1mwandb[0m: 	lr: 0.001
[34m[1mwandb[0m: 	number_of_hidden_layers: 3
[34m[1mwandb[0m: 	optimiser: gd_momentum
[34m[1mwandb[0m: 	weight_decay: 0.5


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/54000 [00:00<?, ?it/s]

  return 1/(1+np.exp(-x))


  0%|          | 0/54000 [00:00<?, ?it/s]

  0%|          | 0/54000 [00:00<?, ?it/s]

  0%|          | 0/54000 [00:00<?, ?it/s]

  0%|          | 0/54000 [00:00<?, ?it/s]

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

[34m[1mwandb[0m: Agent Starting Run: hrlkh26j with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_sizes: 16
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	initialization: random
[34m[1mwandb[0m: 	loss: cross entropy
[34m[1mwandb[0m: 	lr: 0.001
[34m[1mwandb[0m: 	number_of_hidden_layers: 3
[34m[1mwandb[0m: 	optimiser: gd_nesterov
[34m[1mwandb[0m: 	weight_decay: 0


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/54000 [00:00<?, ?it/s]

  return 1/(1+np.exp(-x))


  0%|          | 0/54000 [00:00<?, ?it/s]

  0%|          | 0/54000 [00:00<?, ?it/s]

  0%|          | 0/54000 [00:00<?, ?it/s]

  0%|          | 0/54000 [00:00<?, ?it/s]

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

[34m[1mwandb[0m: Agent Starting Run: 3zteqdz6 with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_sizes: 16
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	initialization: random
[34m[1mwandb[0m: 	loss: cross entropy
[34m[1mwandb[0m: 	lr: 0.001
[34m[1mwandb[0m: 	number_of_hidden_layers: 3
[34m[1mwandb[0m: 	optimiser: gd_nesterov
[34m[1mwandb[0m: 	weight_decay: 0.0005


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/54000 [00:00<?, ?it/s]

  return 1/(1+np.exp(-x))


  0%|          | 0/54000 [00:00<?, ?it/s]

  0%|          | 0/54000 [00:00<?, ?it/s]

  0%|          | 0/54000 [00:00<?, ?it/s]

  0%|          | 0/54000 [00:00<?, ?it/s]

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 14f8ppr9 with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_sizes: 16
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	initialization: random
[34m[1mwandb[0m: 	loss: cross entropy
[34m[1mwandb[0m: 	lr: 0.001
[34m[1mwandb[0m: 	number_of_hidden_layers: 3
[34m[1mwandb[0m: 	optimiser: gd_nesterov
[34m[1mwandb[0m: 	weight_decay: 0.5


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/54000 [00:00<?, ?it/s]

  return 1/(1+np.exp(-x))


  0%|          | 0/54000 [00:00<?, ?it/s]

  0%|          | 0/54000 [00:00<?, ?it/s]

  0%|          | 0/54000 [00:00<?, ?it/s]

  0%|          | 0/54000 [00:00<?, ?it/s]

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

[34m[1mwandb[0m: Agent Starting Run: n9ozc0si with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_sizes: 16
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	initialization: random
[34m[1mwandb[0m: 	loss: cross entropy
[34m[1mwandb[0m: 	lr: 0.001
[34m[1mwandb[0m: 	number_of_hidden_layers: 3
[34m[1mwandb[0m: 	optimiser: rmsprop
[34m[1mwandb[0m: 	weight_decay: 0


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/54000 [00:00<?, ?it/s]

  return 1/(1+np.exp(-x))


  0%|          | 0/54000 [00:00<?, ?it/s]

  0%|          | 0/54000 [00:00<?, ?it/s]

  0%|          | 0/54000 [00:00<?, ?it/s]

  0%|          | 0/54000 [00:00<?, ?it/s]

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

[34m[1mwandb[0m: Agent Starting Run: 1qqdggoo with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_sizes: 16
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	initialization: random
[34m[1mwandb[0m: 	loss: cross entropy
[34m[1mwandb[0m: 	lr: 0.001
[34m[1mwandb[0m: 	number_of_hidden_layers: 3
[34m[1mwandb[0m: 	optimiser: rmsprop
[34m[1mwandb[0m: 	weight_decay: 0.0005


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/54000 [00:00<?, ?it/s]

  return 1/(1+np.exp(-x))


  0%|          | 0/54000 [00:00<?, ?it/s]

  0%|          | 0/54000 [00:00<?, ?it/s]

  0%|          | 0/54000 [00:00<?, ?it/s]

  0%|          | 0/54000 [00:00<?, ?it/s]

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

[34m[1mwandb[0m: Agent Starting Run: ry2yhiwm with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_sizes: 16
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	initialization: random
[34m[1mwandb[0m: 	loss: cross entropy
[34m[1mwandb[0m: 	lr: 0.001
[34m[1mwandb[0m: 	number_of_hidden_layers: 3
[34m[1mwandb[0m: 	optimiser: rmsprop
[34m[1mwandb[0m: 	weight_decay: 0.5


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/54000 [00:00<?, ?it/s]

  return 1/(1+np.exp(-x))


  0%|          | 0/54000 [00:00<?, ?it/s]

  0%|          | 0/54000 [00:00<?, ?it/s]

  0%|          | 0/54000 [00:00<?, ?it/s]

  0%|          | 0/54000 [00:00<?, ?it/s]

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

[34m[1mwandb[0m: Agent Starting Run: pnri1nw5 with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_sizes: 16
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	initialization: random
[34m[1mwandb[0m: 	loss: cross entropy
[34m[1mwandb[0m: 	lr: 0.001
[34m[1mwandb[0m: 	number_of_hidden_layers: 3
[34m[1mwandb[0m: 	optimiser: adam
[34m[1mwandb[0m: 	weight_decay: 0


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/54000 [00:00<?, ?it/s]

  m_w_hat = m_w/(1-np.power(beta1,update_count))
  m_b_hat = m_b/(1-np.power(beta1,update_count))


  0%|          | 0/54000 [00:00<?, ?it/s]

  0%|          | 0/54000 [00:00<?, ?it/s]

  0%|          | 0/54000 [00:00<?, ?it/s]

  0%|          | 0/54000 [00:00<?, ?it/s]

VBox(children=(Label(value='0.001 MB of 0.014 MB uploaded\r'), FloatProgress(value=0.08081021949325994, max=1.…

[34m[1mwandb[0m: Agent Starting Run: vdk3uwx7 with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_sizes: 16
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	initialization: random
[34m[1mwandb[0m: 	loss: cross entropy
[34m[1mwandb[0m: 	lr: 0.001
[34m[1mwandb[0m: 	number_of_hidden_layers: 3
[34m[1mwandb[0m: 	optimiser: adam
[34m[1mwandb[0m: 	weight_decay: 0.0005


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/54000 [00:00<?, ?it/s]

  m_w_hat = m_w/(1-np.power(beta1,update_count))
  m_b_hat = m_b/(1-np.power(beta1,update_count))


  0%|          | 0/54000 [00:00<?, ?it/s]

  0%|          | 0/54000 [00:00<?, ?it/s]

  0%|          | 0/54000 [00:00<?, ?it/s]

  0%|          | 0/54000 [00:00<?, ?it/s]

VBox(children=(Label(value='0.001 MB of 0.014 MB uploaded\r'), FloatProgress(value=0.0807817129956258, max=1.0…

[34m[1mwandb[0m: Agent Starting Run: drpyi133 with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_sizes: 16
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	initialization: random
[34m[1mwandb[0m: 	loss: cross entropy
[34m[1mwandb[0m: 	lr: 0.001
[34m[1mwandb[0m: 	number_of_hidden_layers: 3
[34m[1mwandb[0m: 	optimiser: adam
[34m[1mwandb[0m: 	weight_decay: 0.5


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/54000 [00:00<?, ?it/s]

  m_w_hat = m_w/(1-np.power(beta1,update_count))
  m_b_hat = m_b/(1-np.power(beta1,update_count))


  0%|          | 0/54000 [00:00<?, ?it/s]

  0%|          | 0/54000 [00:00<?, ?it/s]

  0%|          | 0/54000 [00:00<?, ?it/s]

  0%|          | 0/54000 [00:00<?, ?it/s]

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

[34m[1mwandb[0m: Agent Starting Run: qati81ck with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_sizes: 16
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	initialization: random
[34m[1mwandb[0m: 	loss: cross entropy
[34m[1mwandb[0m: 	lr: 0.001
[34m[1mwandb[0m: 	number_of_hidden_layers: 3
[34m[1mwandb[0m: 	optimiser: nadam
[34m[1mwandb[0m: 	weight_decay: 0


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/54000 [00:00<?, ?it/s]

  m_w_hat = m_w/(1-np.power(beta1,update_count))
  m_b_hat = m_b/(1-np.power(beta1,update_count))
  mw_numerator = beta1*m_w_hat + ((1-beta1)/(1-np.power(beta1,update_count)))*np.array(dw,dtype=object)
  mb_numerator = beta1*m_b_hat + ((1-beta1)/(1-np.power(beta1,update_count)))*np.array(db,dtype=object)


  0%|          | 0/54000 [00:00<?, ?it/s]

  0%|          | 0/54000 [00:00<?, ?it/s]

  0%|          | 0/54000 [00:00<?, ?it/s]

  0%|          | 0/54000 [00:00<?, ?it/s]

VBox(children=(Label(value='0.001 MB of 0.006 MB uploaded\r'), FloatProgress(value=0.17904612978889758, max=1.…

[34m[1mwandb[0m: Agent Starting Run: g47wqe2q with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_sizes: 16
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	initialization: random
[34m[1mwandb[0m: 	loss: cross entropy
[34m[1mwandb[0m: 	lr: 0.001
[34m[1mwandb[0m: 	number_of_hidden_layers: 3
[34m[1mwandb[0m: 	optimiser: nadam
[34m[1mwandb[0m: 	weight_decay: 0.0005


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/54000 [00:00<?, ?it/s]

  m_w_hat = m_w/(1-np.power(beta1,update_count))
  m_b_hat = m_b/(1-np.power(beta1,update_count))
  mw_numerator = beta1*m_w_hat + ((1-beta1)/(1-np.power(beta1,update_count)))*np.array(dw,dtype=object)
  mb_numerator = beta1*m_b_hat + ((1-beta1)/(1-np.power(beta1,update_count)))*np.array(db,dtype=object)


  0%|          | 0/54000 [00:00<?, ?it/s]

  0%|          | 0/54000 [00:00<?, ?it/s]

  0%|          | 0/54000 [00:00<?, ?it/s]

  0%|          | 0/54000 [00:00<?, ?it/s]