# Weights and Biases

Notebook with code from [https://theaisummer.com/weights-and-biases-tutorial/](https://theaisummer.com/weights-and-biases-tutorial/)

In [1]:
# install and load wandb
!pip install --quiet wandb
import wandb

In [2]:
# define example net
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms

transform = transforms.Compose(
    [transforms.ToTensor(),
transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
                                        download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=4,
                                          shuffle=True, num_workers=2)
testset = torchvision.datasets.CIFAR10(root='./data', train=False,
                                       download=True, transform=transform)
testloader = torch.utils.data.DataLoader(testset, batch_size=4,
                                         shuffle=False, num_workers=2)

class Net(nn.Module):

    def __init__(self, fc_layer_size=84):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, fc_layer_size)
        self.fc3 = nn.Linear(fc_layer_size, 10)

    def forward(self, x):

        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(-1, 16 * 5 * 5)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


100%|██████████| 170498071/170498071 [00:06<00:00, 25172550.39it/s]


Extracting ./data/cifar-10-python.tar.gz to ./data
Files already downloaded and verified


In [3]:
# set up device
if torch.cuda.is_available():
 dev = "cuda:0"
else:
 dev = "cpu"
print(dev)
device = torch.device(dev)

cuda:0


In [4]:
# init wandb
run = wandb.init(project='test')

# set config
config = wandb.config
config.learning_rate = 0.01
config.epochs = 5

# instatiate net
net = Net()
net.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(net.parameters(),lr=config.learning_rate)

# train
for epoch in range(config.epochs):
    running_loss = 0.0

    for i, data in enumerate(trainloader, 0):

        inputs, labels = data[0].to(device), data[1].to(device)
        optimizer.zero_grad()
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

        if i % 2000 == 1999:    # print every 2000 mini-batches
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 2000))
            wandb.log({'epoch': epoch+1, 'loss': running_loss/2000})
            wandb.watch(net, criterion, log="all") # Hook into the torch model to collect gradients and the topology.
            running_loss = 0.0

print('Finished Training')

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/shiling/.netrc


[1,  2000] loss: 2.171
[1,  4000] loss: 1.854
[1,  6000] loss: 1.680
[1,  8000] loss: 1.581
[1, 10000] loss: 1.504
[1, 12000] loss: 1.470
[2,  2000] loss: 1.390
[2,  4000] loss: 1.379
[2,  6000] loss: 1.374
[2,  8000] loss: 1.338
[2, 10000] loss: 1.318
[2, 12000] loss: 1.322
[3,  2000] loss: 1.222
[3,  4000] loss: 1.247
[3,  6000] loss: 1.215
[3,  8000] loss: 1.224
[3, 10000] loss: 1.198
[3, 12000] loss: 1.221
[4,  2000] loss: 1.117
[4,  4000] loss: 1.138
[4,  6000] loss: 1.118
[4,  8000] loss: 1.138
[4, 10000] loss: 1.139
[4, 12000] loss: 1.128
[5,  2000] loss: 1.052
[5,  4000] loss: 1.043
[5,  6000] loss: 1.040
[5,  8000] loss: 1.081
[5, 10000] loss: 1.052
[5, 12000] loss: 1.060
Finished Training


# Visualization

In [5]:
classes = ('plane', 'car', 'bird', 'cat',
           'deer', 'dog', 'frog', 'horse', 'ship', 'truck')

columns=['image','label']
data = []

for i, batch in enumerate(trainloader, 0):
    inputs, labels = batch[0], batch[1]
    for j, image in enumerate(inputs,0):
        data.append([wandb.Image(image),classes[labels[j].item()]]) # Format images for logging to W&B.
    break

table= wandb.Table(data=data, columns=columns) # The Table class used to display and analyze tabular data.
run.log({"cifar10_images": table})

# Artifacts

In [6]:
# https://docs.wandb.ai/guides/artifacts
# You can use an artifact to log the dataset used to train the model as input and the resulting model checkpoints as outputs.
cifar10_artifact = wandb.Artifact("cifar10", type="dataset")
file_path = './data/cifar-10-batches-py'
cifar10_artifact.add_dir(file_path)
run.log_artifact(cifar10_artifact)

[34m[1mwandb[0m: Adding directory to artifact (./data/cifar-10-batches-py)... Done. 0.1s


<Artifact cifar10>

# Sweeps

In [7]:
import math
sweep_config = {
                'method': 'random',
                'metric': {'goal': 'minimize', 'name': 'loss'},
                'parameters': {
                    'batch_size': {
                        'distribution': 'q_log_uniform_values',
                        'max': 256,
                        'min': 32
                    },
                    'epochs': {'value': 5},
                    'fc_layer_size': {'values': [128, 256, 512]},
                    'learning_rate': {'distribution': 'uniform',
                                      'max': 0.1,
                                      'min': 0},
                    'optimizer': {'values': ['adam', 'sgd']}
                }
 }

sweep_id = wandb.sweep(sweep_config, project="test")

Create sweep with ID: xpscklyl
Sweep URL: https://wandb.ai/adl_shilingdeng/test/sweeps/xpscklyl


In [8]:
def train(config=None):

    with wandb.init(project='test', entity='serkar', config=config):
        config = wandb.config
        transform = transforms.Compose(
            [transforms.ToTensor(),
             transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
        trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
                                                download=True, transform=transform)
        trainloader = torch.utils.data.DataLoader(trainset, batch_size=config.batch_size,
                                                  shuffle=True, num_workers=2)
        testset = torchvision.datasets.CIFAR10(root='./data', train=False,
                                               download=True, transform=transform)

        net = Net(config.fc_layer_size)
        net.to(device)
        criterion = nn.CrossEntropyLoss()
        optimizer = torch.optim.SGD(net.parameters(), lr=config.learning_rate)

        if config.optimizer == "sgd":
            optimizer = torch.optim.SGD(net.parameters(),
                                  lr=config.learning_rate, momentum=0.9)
        elif optimizer == "adam":
            optimizer = torch.optim.Adam(net.parameters(),
                                   lr=config.learning_rate)

        wandb.watch(net, criterion, log="all")

        for epoch in range(config.epochs):  # loop over the dataset multiple times

            running_loss = 0.0

            for i, data in enumerate(trainloader, 0):

                inputs, labels = data[0].to(device), data[1].to(device)
                optimizer.zero_grad()
                outputs = net(inputs)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()
                running_loss += loss.item()

            print('[%d, %5d] loss: %.3f' %
                (epoch + 1, i + 1, running_loss / len(trainloader)))

            wandb.log({'epoch': epoch + 1, 'loss': running_loss / len(trainloader)})

        print('Finished Training')

In [9]:
# run sweep
wandb.agent(sweep_id, function=train, count=5)



VBox(children=(Label(value='177.601 MB of 177.601 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))
<IPython.core.display.HTML object>
<IPython.core.display.HTML object>
<IPython.core.display.HTML object>


[34m[1mwandb[0m: Agent Starting Run: udhv7va0 with config:
[34m[1mwandb[0m: 	batch_size: 177
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	fc_layer_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.0637738261520032
[34m[1mwandb[0m: 	optimizer: adam
Exception in thread ChkStopThr:
Traceback (most recent call last):
  File "/home/shiling/anaconda3/lib/python3.11/threading.py", line 1045, in _bootstrap_inner
Exception in thread NetStatThr:
Traceback (most recent call last):
  File "/home/shiling/anaconda3/lib/python3.11/threading.py", line 1045, in _bootstrap_inner
    self.run()
  File "/home/shiling/anaconda3/lib/python3.11/threading.py", line 982, in run
    self.run()
  File "/home/shiling/anaconda3/lib/python3.11/threading.py", line 982, in run
Exception in thread IntMsgThr:
Traceback (most recent call last):
  File "/home/shiling/anaconda3/lib/python3.11/threading.py", line 1045, in _bootstrap_inner
    self._target(*self._args, **self._kwargs)
  File "/home/shiling/ana

Files already downloaded and verified
Files already downloaded and verified
[1,   283] loss: 2.215
[2,   283] loss: 1.832
[3,   283] loss: 1.621
[4,   283] loss: 1.511
[5,   283] loss: 1.439
Finished Training


VBox(children=(Label(value='0.003 MB of 0.003 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▃▅▆█
loss,█▅▃▂▁

0,1
epoch,5.0
loss,1.43857


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 3wf1ybzl with config:
[34m[1mwandb[0m: 	batch_size: 132
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	fc_layer_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.05618474237073891
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: Currently logged in as: [33mshilingdeng7187[0m. Use [1m`wandb login --relogin`[0m to force relogin


Files already downloaded and verified
Files already downloaded and verified
[1,   379] loss: 2.107
[2,   379] loss: 1.731
[3,   379] loss: 1.558
[4,   379] loss: 1.444
[5,   379] loss: 1.359
Finished Training


VBox(children=(Label(value='0.003 MB of 0.003 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▃▅▆█
loss,█▄▃▂▁

0,1
epoch,5.0
loss,1.35912


[34m[1mwandb[0m: Agent Starting Run: zr78f1nm with config:
[34m[1mwandb[0m: 	batch_size: 212
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	fc_layer_size: 512
[34m[1mwandb[0m: 	learning_rate: 0.056205771469695257
[34m[1mwandb[0m: 	optimizer: sgd


Files already downloaded and verified
Files already downloaded and verified
[1,   236] loss: 1.810
[2,   236] loss: 1.384
[3,   236] loss: 1.213
[4,   236] loss: 1.107
[5,   236] loss: 1.033
Finished Training


VBox(children=(Label(value='0.003 MB of 0.003 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▃▅▆█
loss,█▄▃▂▁

0,1
epoch,5.0
loss,1.03338


[34m[1mwandb[0m: Agent Starting Run: hsvkj4wh with config:
[34m[1mwandb[0m: 	batch_size: 38
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	fc_layer_size: 256
[34m[1mwandb[0m: 	learning_rate: 0.05953399638148348
[34m[1mwandb[0m: 	optimizer: sgd


Files already downloaded and verified
Files already downloaded and verified
[1,  1316] loss: 1.858
[2,  1316] loss: 1.750
[3,  1316] loss: 1.754
[4,  1316] loss: 1.758
[5,  1316] loss: 1.773
Finished Training


VBox(children=(Label(value='0.003 MB of 0.003 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▃▅▆█
loss,█▁▁▂▃

0,1
epoch,5.0
loss,1.77325


[34m[1mwandb[0m: Agent Starting Run: 0fiocwo7 with config:
[34m[1mwandb[0m: 	batch_size: 141
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	fc_layer_size: 512
[34m[1mwandb[0m: 	learning_rate: 0.07391521927287067
[34m[1mwandb[0m: 	optimizer: adam


Files already downloaded and verified
Files already downloaded and verified
[1,   355] loss: 2.083
[2,   355] loss: 1.681
[3,   355] loss: 1.512
[4,   355] loss: 1.407
[5,   355] loss: 1.321
Finished Training


VBox(children=(Label(value='0.003 MB of 0.003 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▃▅▆█
loss,█▄▃▂▁

0,1
epoch,5.0
loss,1.32141


Error in callback <bound method _WandbInit._pause_backend of <wandb.sdk.wandb_init._WandbInit object at 0x7fe7a266f990>> (for post_run_cell), with arguments args (<ExecutionResult object at 7fe6247f7550, execution_count=9 error_before_exec=None error_in_exec=None info=<ExecutionInfo object at 7fe6247f7f50, raw_cell="# run sweep
wandb.agent(sweep_id, function=train, .." store_history=True silent=False shell_futures=True cell_id=vscode-notebook-cell:/home/shiling/git/university_of_copenhagen/advanced_deep_learning/assignment1/WandB.ipynb#X15sZmlsZQ%3D%3D> result=None>,),kwargs {}:


BrokenPipeError: [Errno 32] Broken pipe