In [1]:
import tqdm
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.datasets import cifar10

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
import wandb
from wandb.keras import WandbCallback

In [3]:
# Load the MNIST train/test split bundled with Keras.
(x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data()

# Flatten every sample into a 784-element row and divide by 255 so the
# pixel intensities are rescaled (the division promotes the arrays to float).
x_train = x_train.reshape(-1, 784) / 255.0
x_test = x_test.reshape(-1, 784) / 255.0

In [4]:
def Model():
    """Build the MLP classifier used in the sweep.

    The network takes a flat 784-feature input named "digits", passes it
    through two 64-unit ReLU dense layers and ends in a 10-unit dense layer
    named "predictions" with no activation (i.e. it emits raw logits).

    Returns:
        An uncompiled ``keras.Model``.
    """
    digits = keras.Input(shape=(784,), name="digits")

    # Two identical hidden layers, stacked one after the other.
    hidden = digits
    for _ in range(2):
        hidden = keras.layers.Dense(64, activation="relu")(hidden)

    logits = keras.layers.Dense(10, name="predictions")(hidden)
    return keras.Model(inputs=digits, outputs=logits)

    
def train_step(x, y, model, optimizer, loss_fn, train_acc_metric):
    """Run one optimisation step on a single batch.

    Args:
        x: batch of model inputs.
        y: batch of targets matching ``x``.
        model: callable model exposing ``trainable_weights``.
        optimizer: optimizer whose ``apply_gradients`` updates the weights.
        loss_fn: callable ``loss_fn(y, logits)`` returning the batch loss.
        train_acc_metric: metric updated with ``(y, logits)`` for this batch.

    Returns:
        The loss value computed for this batch (before the weight update).
    """
    # Record the forward pass so the loss can be differentiated afterwards.
    with tf.GradientTape() as tape:
        predictions = model(x, training=True)
        batch_loss = loss_fn(y, predictions)

    weights = model.trainable_weights
    gradients = tape.gradient(batch_loss, weights)
    optimizer.apply_gradients(zip(gradients, weights))

    train_acc_metric.update_state(y, predictions)
    return batch_loss

    
def test_step(x, y, model, loss_fn, val_acc_metric):
    val_logits = model(x, training=False)
    loss_value = loss_fn(y, val_logits)
    val_acc_metric.update_state(y, val_logits)

    return loss_value

In [5]:
def train(train_dataset,
          val_dataset,
          model,
          optimizer,
          loss_fn,
          train_acc_metric,
          val_acc_metric,
          epochs=10,
          log_step=200,
          val_log_step=50):
    """Custom training loop that logs per-epoch metrics to Weights & Biases.

    For every epoch the function runs ``train_step`` over all training
    batches, ``test_step`` over all validation batches, prints both
    accuracies, resets the metrics and sends one ``wandb.log`` record with
    the keys ``epochs``, ``loss``, ``acc``, ``val_loss`` and ``val_acc``.

    Args:
        train_dataset: iterable of ``(x, y)`` training batches; must support
            ``len()`` because it is used for the progress-bar total.
        val_dataset: iterable of ``(x, y)`` validation batches.
        model: model to train.
        optimizer: optimizer passed through to ``train_step``.
        loss_fn: loss callable passed through to both step functions.
        train_acc_metric: metric accumulating training accuracy.
        val_acc_metric: metric accumulating validation accuracy.
        epochs: number of passes over ``train_dataset``.
        log_step: accepted for interface compatibility; currently unused.
        val_log_step: accepted for interface compatibility; currently unused.
    """
    for epoch in range(epochs):
        print("\nStart of epoch %d" % (epoch,))

        train_loss = []
        val_loss = []

        # Iterate over the batches of the dataset
        for x_batch_train, y_batch_train in tqdm.tqdm(train_dataset,
                                                      total=len(train_dataset)):
            loss_value = train_step(x_batch_train, y_batch_train,
                                    model, optimizer,
                                    loss_fn, train_acc_metric)
            train_loss.append(float(loss_value))

        # Run a validation loop at the end of each epoch
        for x_batch_val, y_batch_val in val_dataset:
            val_loss_value = test_step(x_batch_val, y_batch_val,
                                       model, loss_fn,
                                       val_acc_metric)
            val_loss.append(float(val_loss_value))

        # Display metrics at the end of each epoch
        train_acc = train_acc_metric.result()
        print("Training acc over epoch: %.4f" % (float(train_acc),))

        val_acc = val_acc_metric.result()
        print("Validation acc: %.4f" % (float(val_acc),))

        # Reset metrics at the end of each epoch. `reset_states` is the
        # deprecated spelling (and is missing from newer Keras releases), so
        # prefer `reset_state` and only fall back when it does not exist.
        for metric in (train_acc_metric, val_acc_metric):
            reset = getattr(metric, "reset_state", None)
            if reset is None:
                reset = metric.reset_states
            reset()

        # log metrics using wandb.log
        wandb.log({'epochs': epoch,
                   'loss': np.mean(train_loss),
                   'acc': float(train_acc),
                   'val_loss': np.mean(val_loss),
                   'val_acc': float(val_acc)})

In [6]:
# Sweep definition handed to wandb.sweep().
sweep_config = {
    # Sample hyperparameter combinations at random.
    "method": "random",
    # Objective: the 'val_loss' value, to be minimised.
    "metric": {"name": "val_loss", "goal": "minimize"},
    # NOTE(review): min_iter is 5 while sweep_train sets epochs to 2 and
    # train() logs once per epoch — confirm early termination can trigger.
    "early_terminate": {"type": "hyperband", "min_iter": 5},
    # Discrete search space.
    "parameters": {
        "batch_size": {"values": [32, 64, 128, 256]},
        "learning_rate": {"values": [0.01, 0.005, 0.001, 0.0005, 0.0001]},
    },
}

In [7]:
def sweep_train(config_defaults=None):
    """Entry point for one sweep run: configure wandb, build data/model, train.

    Args:
        config_defaults: optional dict of default hyperparameters. Its
            entries are layered on top of the built-in defaults
            (``batch_size=64``, ``learning_rate=0.01``); when omitted the
            built-in defaults are used as-is.
    """
    defaults = {
        "batch_size": 64,
        "learning_rate": 0.01
    }
    # Honour caller-supplied defaults instead of silently discarding them.
    if config_defaults:
        defaults.update(config_defaults)

    # Initialize wandb; these defaults are overwritten during a sweep
    wandb.init(config=defaults)

    # Specify the other hyperparameters to the configuration, if any
    wandb.config.epochs = 2
    wandb.config.log_step = 20
    wandb.config.val_log_step = 50
    wandb.config.architecture_name = "MLP"
    wandb.config.dataset_name = "MNIST"

    # build input pipeline using tf.data
    train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
    train_dataset = (train_dataset.shuffle(buffer_size=1024)
                                  .batch(wandb.config.batch_size)
                                  .prefetch(buffer_size=tf.data.AUTOTUNE))

    val_dataset = tf.data.Dataset.from_tensor_slices((x_test, y_test))
    val_dataset = (val_dataset.batch(wandb.config.batch_size)
                              .prefetch(buffer_size=tf.data.AUTOTUNE))

    # initialize
    model = Model()

    # Instantiate an optimizer to train the model.
    optimizer = keras.optimizers.SGD(learning_rate=wandb.config.learning_rate)
    # Instantiate a loss function; the model outputs raw logits.
    loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True)

    # Prepare the metrics
    train_acc_metric = keras.metrics.SparseCategoricalAccuracy()
    val_acc_metric = keras.metrics.SparseCategoricalAccuracy()

    train(train_dataset,
          val_dataset,
          model,
          optimizer,
          loss_fn,
          train_acc_metric,
          val_acc_metric,
          epochs=wandb.config.epochs,
          # Previously passed wandb.config.epochs here by mistake.
          log_step=wandb.config.log_step,
          val_log_step=wandb.config.val_log_step)

In [8]:
# Register the sweep with W&B and keep its id for the agent.
sweep_id = wandb.sweep(
    sweep_config,
    project="sweeps-tensorflow",
)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Create sweep with ID: vdgz7m7k
Sweep URL: https://wandb.ai/uj_fasci/sweeps-tensorflow/sweeps/vdgz7m7k


In [9]:
# Launch an agent that executes sweep_train for up to 10 runs of the sweep.
wandb.agent(
    sweep_id,
    function=sweep_train,
    count=10,
)

[34m[1mwandb[0m: Agent Starting Run: ctfkqjl1 with config:
[34m[1mwandb[0m: 	batch_size: 256
[34m[1mwandb[0m: 	learning_rate: 0.0005
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33muj_fasci[0m. Use [1m`wandb login --relogin`[0m to force relogin


Metal device set to: Apple M1 Max

systemMemory: 32.00 GB
maxCacheSize: 10.67 GB


Start of epoch 0


100%|██████████| 235/235 [00:02<00:00, 102.99it/s]


Training acc over epoch: 0.1138
Validation acc: 0.1264

Start of epoch 1


100%|██████████| 235/235 [00:02<00:00, 110.48it/s]


Training acc over epoch: 0.1414
Validation acc: 0.1586


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
acc,▁█
epochs,▁█
loss,█▁
val_acc,▁█
val_loss,█▁

0,1
acc,0.14137
epochs,1.0
loss,2.2944
val_acc,0.1586
val_loss,2.26607


[34m[1mwandb[0m: Agent Starting Run: eaackpo3 with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.005
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.



Start of epoch 0


100%|██████████| 938/938 [00:07<00:00, 120.01it/s]


Training acc over epoch: 0.6404
Validation acc: 0.8340

Start of epoch 1


100%|██████████| 938/938 [00:07<00:00, 120.07it/s]


Training acc over epoch: 0.8597
Validation acc: 0.8842


VBox(children=(Label(value='0.001 MB of 0.004 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.230750…

0,1
acc,▁█
epochs,▁█
loss,█▁
val_acc,▁█
val_loss,█▁

0,1
acc,0.85968
epochs,1.0
loss,0.53235
val_acc,0.8842
val_loss,0.41948


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: vm8pidbs with config:
[34m[1mwandb[0m: 	batch_size: 256
[34m[1mwandb[0m: 	learning_rate: 0.0001
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.



Start of epoch 0


100%|██████████| 235/235 [00:02<00:00, 109.33it/s]


Training acc over epoch: 0.1421
Validation acc: 0.1454

Start of epoch 1


100%|██████████| 235/235 [00:02<00:00, 111.71it/s]


Training acc over epoch: 0.1539
Validation acc: 0.1604


0,1
acc,▁█
epochs,▁█
loss,█▁
val_acc,▁█
val_loss,█▁

0,1
acc,0.1539
epochs,1.0
loss,2.30806
val_acc,0.1604
val_loss,2.30557


[34m[1mwandb[0m: Agent Starting Run: jkdm4z5o with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.01
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.



Start of epoch 0


100%|██████████| 1875/1875 [00:15<00:00, 119.99it/s]


Training acc over epoch: 0.8210
Validation acc: 0.9065

Start of epoch 1


100%|██████████| 1875/1875 [00:15<00:00, 117.88it/s]


Training acc over epoch: 0.9137
Validation acc: 0.9223


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
acc,▁█
epochs,▁█
loss,█▁
val_acc,▁█
val_loss,█▁

0,1
acc,0.91375
epochs,1.0
loss,0.30173
val_acc,0.9223
val_loss,0.2686


[34m[1mwandb[0m: Agent Starting Run: tes56nk9 with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.



Start of epoch 0


100%|██████████| 469/469 [00:04<00:00, 116.90it/s]


Training acc over epoch: 0.2187
Validation acc: 0.2803

Start of epoch 1


100%|██████████| 469/469 [00:04<00:00, 115.01it/s]


Training acc over epoch: 0.3772
Validation acc: 0.4753


0,1
acc,▁█
epochs,▁█
loss,█▁
val_acc,▁█
val_loss,█▁

0,1
acc,0.37723
epochs,1.0
loss,2.07161
val_acc,0.4753
val_loss,1.96322


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: rsx3tsxn with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.0001
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.



Start of epoch 0


100%|██████████| 1875/1875 [00:15<00:00, 122.30it/s]


Training acc over epoch: 0.1783
Validation acc: 0.2114

Start of epoch 1


100%|██████████| 1875/1875 [00:15<00:00, 122.63it/s]


Training acc over epoch: 0.2337
Validation acc: 0.2536


0,1
acc,▁█
epochs,▁█
loss,█▁
val_acc,▁█
val_loss,█▁

0,1
acc,0.23367
epochs,1.0
loss,2.2219
val_acc,0.2536
val_loss,2.19281


[34m[1mwandb[0m: Agent Starting Run: 770etvi9 with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.001
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.



Start of epoch 0


100%|██████████| 1875/1875 [00:15<00:00, 122.93it/s]


Training acc over epoch: 0.4449
Validation acc: 0.6695

Start of epoch 1


100%|██████████| 1875/1875 [00:15<00:00, 123.16it/s]


Training acc over epoch: 0.7478
Validation acc: 0.8135


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
acc,▁█
epochs,▁█
loss,█▁
val_acc,▁█
val_loss,█▁

0,1
acc,0.74777
epochs,1.0
loss,1.07801
val_acc,0.8135
val_loss,0.78753


[34m[1mwandb[0m: Agent Starting Run: pdbknk3q with config:
[34m[1mwandb[0m: 	batch_size: 256
[34m[1mwandb[0m: 	learning_rate: 0.005
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.



Start of epoch 0


100%|██████████| 235/235 [00:02<00:00, 114.12it/s]


Training acc over epoch: 0.3427
Validation acc: 0.5530

Start of epoch 1


100%|██████████| 235/235 [00:02<00:00, 113.67it/s]


Training acc over epoch: 0.6444
Validation acc: 0.7327


0,1
acc,▁█
epochs,▁█
loss,█▁
val_acc,▁█
val_loss,█▁

0,1
acc,0.64443
epochs,1.0
loss,1.52265
val_acc,0.7327
val_loss,1.19916


[34m[1mwandb[0m: Agent Starting Run: dq7746ie with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.005
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.



Start of epoch 0


100%|██████████| 938/938 [00:07<00:00, 126.00it/s]


Training acc over epoch: 0.6408
Validation acc: 0.8336

Start of epoch 1


100%|██████████| 938/938 [00:07<00:00, 121.49it/s]


Training acc over epoch: 0.8599
Validation acc: 0.8845


0,1
acc,▁█
epochs,▁█
loss,█▁
val_acc,▁█
val_loss,█▁

0,1
acc,0.85993
epochs,1.0
loss,0.53169
val_acc,0.8845
val_loss,0.42006


[34m[1mwandb[0m: Agent Starting Run: 5mlgkjo2 with config:
[34m[1mwandb[0m: 	batch_size: 256
[34m[1mwandb[0m: 	learning_rate: 0.01
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.



Start of epoch 0


100%|██████████| 235/235 [00:02<00:00, 111.66it/s]


Training acc over epoch: 0.4909
Validation acc: 0.7272

Start of epoch 1


100%|██████████| 235/235 [00:02<00:00, 112.45it/s]


Training acc over epoch: 0.7850
Validation acc: 0.8376


0,1
acc,▁█
epochs,▁█
loss,█▁
val_acc,▁█
val_loss,█▁

0,1
acc,0.785
epochs,1.0
loss,0.89198
val_acc,0.8376
val_loss,0.64944
