In [None]:
!git clone https://github.com/Vaishnav-Jayaram/CS6886w_Assignment1.git

Cloning into 'CS6886w_Assignment1'...
remote: Enumerating objects: 3, done.[K
remote: Counting objects: 100% (3/3), done.[K
remote: Compressing objects: 100% (2/2), done.[K
remote: Total 3 (delta 0), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (3/3), done.


In [None]:
%cd CS6886w_Assignment1/

/content/CS6886w_Assignment1


Setup & installs

In [1]:
# GPU check + installs (PyTorch CUDA is preinstalled on Colab)
import torch, platform, sys, importlib.metadata as im
print("Python:", sys.version)
print("Platform:", platform.platform())
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0), "| CUDA:", torch.version.cuda, "| cuDNN:", torch.backends.cudnn.version())

%pip -q install wandb==0.17.9 matplotlib==3.9.2 rich==13.9.2 -U scikit-learn
print("wandb:", im.version("wandb"))
print("sklearn:", im.version("scikit-learn"))
print("matplotlib:", im.version("matplotlib"))
print("rich:", im.version("rich"))

Python: 3.12.12 (main, Oct 10 2025, 08:52:57) [GCC 11.4.0]
Platform: Linux-6.6.105+-x86_64-with-glibc2.35
CUDA available: True
GPU: Tesla T4 | CUDA: 12.6 | cuDNN: 91002
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.4/9.4 MB[0m [31m133.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.3/8.3 MB[0m [31m144.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m242.1/242.1 kB[0m [31m23.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.5/9.5 MB[0m [31m135.2 MB/s[0m eta [36m0:00:00[0m
[?25hwandb: 0.17.9
sklearn: 1.7.2
matplotlib: 3.9.2
rich: 13.9.2


 Imports, seeding, device

In [2]:
import os, random, json, time, math, datetime
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, random_split
from torchvision import datasets, transforms
import wandb

def set_seed(seed:int=42):
    random.seed(seed); np.random.seed(seed)
    torch.manual_seed(seed); torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", DEVICE)

def nowstamp():
    return datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

set_seed(42)

Using device: cuda


Data downloading & preprocessing (train aug, eval norm)

In [3]:
# CIFAR-10 normalization stats
CIFAR10_MEAN = (0.4914, 0.4822, 0.4465)
CIFAR10_STD  = (0.2470, 0.2435, 0.2616)

# Train aug + norm
train_transform = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(CIFAR10_MEAN, CIFAR10_STD),
])

# Val/Test norm
test_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(CIFAR10_MEAN, CIFAR10_STD),
])

data_root = "./data"
train_full = datasets.CIFAR10(root=data_root, train=True,  download=True, transform=train_transform)
testset    = datasets.CIFAR10(root=data_root, train=False, download=True, transform=test_transform)

val_frac = 0.1
val_size = int(len(train_full)*val_frac)
train_size = len(train_full) - val_size
trainset, valset = random_split(train_full, [train_size, val_size], generator=torch.Generator().manual_seed(42))
print(f"Train/Val/Test sizes: {len(trainset)}/{len(valset)}/{len(testset)}")

100%|██████████| 170M/170M [00:44<00:00, 3.85MB/s]


Train/Val/Test sizes: 45000/5000/10000


Model: VGG6 (configurable activation)

In [4]:
ACTIVATIONS = {
    "relu": nn.ReLU(inplace=True),
    "sigmoid": nn.Sigmoid(),
    "tanh": nn.Tanh(),
    "silu": nn.SiLU(inplace=True),
    "gelu": nn.GELU(),
}

class VGG6(nn.Module):
    # 6 convs (64,64)→MP, (128,128)→MP, (256,256)→MP, then GAP+Linear
    def __init__(self, activation: str = "relu", num_classes: int = 10):
        super().__init__()
        act = ACTIVATIONS[activation]
        self.features = nn.Sequential(
            nn.Conv2d(3, 64, 3, padding=1), act,
            nn.Conv2d(64, 64, 3, padding=1), act,
            nn.MaxPool2d(2),
            nn.Conv2d(64, 128, 3, padding=1), act,
            nn.Conv2d(128, 128, 3, padding=1), act,
            nn.MaxPool2d(2),
            nn.Conv2d(128, 256, 3, padding=1), act,
            nn.Conv2d(256, 256, 3, padding=1), act,
            nn.MaxPool2d(2),
        )
        self.classifier = nn.Sequential(
            nn.AdaptiveAvgPool2d(1),
            nn.Flatten(),
            nn.Linear(256, num_classes),
        )
    def forward(self, x):
        return self.classifier(self.features(x))

Optimizers

In [6]:
def build_optimizer(name:str, params, lr:float, weight_decay:float=5e-4, momentum:float=0.9):
    name = name.lower()
    if name == "sgd":
        return optim.SGD(params, lr=lr, momentum=momentum, weight_decay=weight_decay, nesterov=False)
    if name == "nesterov-sgd":
        return optim.SGD(params, lr=lr, momentum=momentum, weight_decay=weight_decay, nesterov=True)
    if name == "adam":
        return optim.Adam(params, lr=lr, weight_decay=weight_decay)
    if name == "adamw":
        return optim.AdamW(params, lr=lr, weight_decay=weight_decay)
    if name == "adagrad":
        return optim.Adagrad(params, lr=lr, weight_decay=weight_decay)
    if name == "rmsprop":
        return optim.RMSprop(params, lr=lr, momentum=momentum, weight_decay=weight_decay)
    if name == "nadam":
        return optim.NAdam(params, lr=lr, weight_decay=weight_decay)
    raise ValueError(f"Unknown optimizer: {name}")

Dataloaders & train/eval utilities

In [7]:
def make_loaders(batch_size:int=128, num_workers:int=2):
    tr = DataLoader(trainset, batch_size=batch_size, shuffle=True,  num_workers=num_workers, pin_memory=True)
    va = DataLoader(valset,   batch_size=batch_size, shuffle=False, num_workers=num_workers, pin_memory=True)
    te = DataLoader(testset,  batch_size=batch_size, shuffle=False, num_workers=num_workers, pin_memory=True)
    return tr, va, te

def accuracy_from_logits(logits, y):
    return (logits.argmax(dim=1) == y).float().mean().item()

def run_epoch(model, loader, criterion, optimizer=None):
    train_mode = optimizer is not None
    model.train(train_mode)
    losses, accs = [], []
    for xb, yb in loader:
        xb, yb = xb.to(DEVICE), yb.to(DEVICE)
        logits = model(xb)
        loss = criterion(logits, yb)
        if train_mode:
            optimizer.zero_grad(); loss.backward(); optimizer.step()
        losses.append(loss.item())
        accs.append(accuracy_from_logits(logits, yb))
    return float(np.mean(losses)), float(np.mean(accs))

@torch.no_grad()
def evaluate(model, loader, criterion):
    model.eval()
    losses, accs = [], []
    for xb, yb in loader:
        xb, yb = xb.to(DEVICE), yb.to(DEVICE)
        logits = model(xb)
        loss = criterion(logits, yb)
        losses.append(loss.item())
        accs.append(accuracy_from_logits(logits, yb))
    return float(np.mean(losses)), float(np.mean(accs))

Baseline training (saves timestamped best weight in weights/, logs to W&B)

In [8]:
wandb.login()  # paste key or set WANDB_API_KEY before

default_config = {
    "activation": "relu",
    "optimizer": "nesterov-sgd",
    "batch_size": 128,
    "epochs": 30,
    "lr": 0.05,
    "momentum": 0.9,
    "weight_decay": 5e-4,
    "seed": 42,
}

from pathlib import Path
Path("weights").mkdir(exist_ok=True, parents=True)
Path("runs").mkdir(exist_ok=True, parents=True)

def train_one_run(config=None, project="vgg6-cifar10"):
    run_name = f"vgg6_{nowstamp()}"
    with wandb.init(project=project, config=config, name=run_name) as run:
        cfg = wandb.config
        set_seed(cfg.seed)

        # per-run meta
        run_dir = Path("runs") / f"{run.name or run.id}_{nowstamp()}"
        run_dir.mkdir(parents=True, exist_ok=True)
        (run_dir / "config.json").write_text(json.dumps(dict(cfg), indent=2))

        tr, va, te = make_loaders(cfg.batch_size)
        model = VGG6(activation=cfg.activation).to(DEVICE)
        criterion = nn.CrossEntropyLoss()
        opt = build_optimizer(cfg.optimizer, model.parameters(), lr=cfg.lr,
                              weight_decay=cfg.weight_decay, momentum=cfg.momentum)

        best_val, best_path = 0.0, None
        for epoch in range(cfg.epochs):
            tr_loss, tr_acc = run_epoch(model, tr, criterion, opt)
            va_loss, va_acc = evaluate(model, va, criterion)
            wandb.log({"epoch": epoch,
                       "train_loss": tr_loss, "train_acc": tr_acc*100.0,
                       "val_loss": va_loss,   "val_acc": va_acc*100.0})
            if va_acc > best_val:
                best_val = va_acc
                best_path = Path("weights") / f"{run.name or run.id}_{nowstamp()}_best.pt"
                torch.save(model.state_dict(), best_path)

        # test on best
        if best_path is not None and best_path.exists():
            model.load_state_dict(torch.load(best_path, map_location=DEVICE))
        te_loss, te_acc = evaluate(model, te, criterion)
        wandb.summary["test_acc"] = te_acc*100.0
        wandb.summary["test_loss"] = te_loss
        print(f"[{run.name}] Best Val={best_val*100:.2f}% | Test={te_acc*100:.2f}%")
        print("Best weight:", best_path)

        # upload weight artifact to W&B
        if best_path is not None and best_path.exists():
            art = wandb.Artifact(f"{run.name}_best", type="model",
                                 metadata={"val_acc": float(best_val), "test_acc": float(te_acc)})
            art.add_file(str(best_path))
            wandb.log_artifact(art)

# run baseline
train_one_run(default_config)

  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mee24d032[0m ([33mee24d032-iitm-india[0m). Use [1m`wandb login --relogin`[0m to force relogin


[vgg6_20251025-183155] Best Val=86.78% | Test=85.83%
Best weight: weights/vgg6_20251025-183155_20251025-184111_best.pt


VBox(children=(Label(value='4.387 MB of 4.401 MB uploaded\r'), FloatProgress(value=0.9967940806862756, max=1.0…

0,1
epoch,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
train_acc,▁▃▄▅▆▆▆▇▇▇▇▇▇▇████████████████
train_loss,█▆▅▄▄▃▃▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val_acc,▁▃▄▅▅▆▆▇▇▇▇▇▇██▇███▇▇██▇▇█████
val_loss,█▆▅▄▄▃▃▂▂▂▂▂▂▁▁▂▂▁▁▂▂▁▁▂▂▁▁▁▁▁

0,1
epoch,29.0
test_acc,85.82872
test_loss,0.46008
train_acc,90.39319
train_loss,0.27841
val_acc,86.01562
val_loss,0.41203


W&B Sweep (creates Parallel Coordinates plot automatically). Set count=22.

In [9]:
sweep_config = {
    "method": "random",
    "metric": {"name": "val_acc", "goal": "maximize"},
    "parameters": {
        "activation":   {"values": ["relu", "silu", "gelu", "tanh", "sigmoid"]},
        "optimizer":    {"values": ["sgd", "nesterov-sgd", "adam", "adagrad", "rmsprop", "nadam"]},
        "batch_size":   {"values": [64, 128, 256]},
        "epochs":       {"values": [20, 30]},
        "lr":           {"values": [0.01, 0.02, 0.05, 0.1]},
        "momentum":     {"values": [0.0, 0.9]},
        "weight_decay": {"values": [0.0, 5e-4, 5e-3]},
        "seed":         {"values": [1, 2, 3]},
    }
}

wandb.login()
sweep_id = wandb.sweep(sweep_config, project="vgg6-cifar10")
def sweep_train(): train_one_run(project="vgg6-cifar10")
# EXACTLY 25 runs:
wandb.agent(sweep_id, function=sweep_train, count=22)

Create sweep with ID: cfaixdo8
Sweep URL: https://wandb.ai/ee24d032-iitm-india/vgg6-cifar10/sweeps/cfaixdo8


[34m[1mwandb[0m: Agent Starting Run: 8cjqt4me with config:
[34m[1mwandb[0m: 	activation: relu
[34m[1mwandb[0m: 	batch_size: 256
[34m[1mwandb[0m: 	epochs: 30
[34m[1mwandb[0m: 	lr: 0.01
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	seed: 2
[34m[1mwandb[0m: 	weight_decay: 0.005


[vgg6_20251025-184503] Best Val=19.39% | Test=19.65%
Best weight: weights/vgg6_20251025-184503_20251025-185416_best.pt


VBox(children=(Label(value='4.387 MB of 4.387 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
train_acc,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▄▆█
train_loss,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█▆▁▁▁▁▁▁▁▁▁▁▁▁
val_acc,▁▁▁▂▁▂▁▁▂▁▁▁▁▁▁▁▂▁▁▁▁▁▂▁▂▂▂▄▆█
val_loss,███████████████████████████▇▅▁

0,1
epoch,29.0
test_acc,19.64844
test_loss,2.03794
train_acc,16.92605
train_loss,2.14867
val_acc,19.38764
val_loss,2.0512


[34m[1mwandb[0m: Agent Starting Run: btj6ea39 with config:
[34m[1mwandb[0m: 	activation: gelu
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	epochs: 30
[34m[1mwandb[0m: 	lr: 0.05
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	seed: 1
[34m[1mwandb[0m: 	weight_decay: 0.0005


[vgg6_20251025-185435] Best Val=10.68% | Test=9.96%
Best weight: weights/vgg6_20251025-185435_20251025-190128_best.pt


VBox(children=(Label(value='4.388 MB of 4.401 MB uploaded\r'), FloatProgress(value=0.9970272477727654, max=1.0…

0,1
epoch,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
train_acc,▃▆▇▄▇▃▇▁▆▆▇▅▂▂▆▄▁▅▆▆▆▃▂▃▅▇▆█▇▃
train_loss,█▁▁▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val_acc,▄▂▅█▄█▂▆▂▆▁▄▅▂▄█▄█▅▄▅█▆█▄▆▁▁▆█
val_loss,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█▁▁▁▁▁▁▁

0,1
epoch,29.0
test_acc,9.95847
test_loss,50.48078
train_acc,9.83394
train_loss,3800949750.16909
val_acc,10.68359
val_loss,3.70006


[34m[1mwandb[0m: Agent Starting Run: vmjznbz0 with config:
[34m[1mwandb[0m: 	activation: gelu
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	epochs: 20
[34m[1mwandb[0m: 	lr: 0.05
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	seed: 2
[34m[1mwandb[0m: 	weight_decay: 0


[vgg6_20251025-190416] Best Val=86.84% | Test=85.97%
Best weight: weights/vgg6_20251025-190416_20251025-190944_best.pt


VBox(children=(Label(value='4.387 MB of 4.387 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
train_acc,▁▃▅▅▆▇▇▇▇▇▇█████████
train_loss,█▆▅▄▃▃▂▂▂▂▂▁▁▁▁▁▁▁▁▁
val_acc,▁▄▅▆▆▇▇▇▇▇██████████
val_loss,█▆▄▃▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁

0,1
epoch,19.0
test_acc,85.96717
test_loss,0.44795
train_acc,91.90366
train_loss,0.23577
val_acc,85.01953
val_loss,0.48166


[34m[1mwandb[0m: Agent Starting Run: ncrelb93 with config:
[34m[1mwandb[0m: 	activation: silu
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	epochs: 20
[34m[1mwandb[0m: 	lr: 0.1
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	optimizer: nesterov-sgd
[34m[1mwandb[0m: 	seed: 2
[34m[1mwandb[0m: 	weight_decay: 0.005


[vgg6_20251025-191059] Best Val=10.68% | Test=9.96%
Best weight: weights/vgg6_20251025-191059_20251025-191507_best.pt


VBox(children=(Label(value='4.387 MB of 4.388 MB uploaded\r'), FloatProgress(value=0.9997422288855355, max=1.0…

0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
train_acc,▆▆▇█▆▁▃▅▇▄▅▆▄▆▅██▂▅▆
train_loss,▄▂▃▆▄▇▂▇▂█▂▃▆▄▂▃▂▅▇▁
val_acc,▄▃▄▅▃▅▃▇▃▃▃▃█▁▅▇▃▃▃▁
val_loss,▅▇▆▆▂▁█▃▅▇▃▄▃▂▃▅▃▂▆▃

0,1
epoch,19.0
test_acc,9.95847
test_loss,2.30353
train_acc,9.99916
train_loss,2.30399
val_acc,9.49219
val_loss,2.30334


[34m[1mwandb[0m: Agent Starting Run: fttyg1z8 with config:
[34m[1mwandb[0m: 	activation: gelu
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 20
[34m[1mwandb[0m: 	lr: 0.01
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	seed: 1
[34m[1mwandb[0m: 	weight_decay: 0.005


[vgg6_20251025-191737] Best Val=10.66% | Test=9.98%
Best weight: weights/vgg6_20251025-191737_20251025-192001_best.pt


VBox(children=(Label(value='4.387 MB of 4.387 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
train_acc,▂▅▂▆▂█▇█▆▅▅▆▃▃█▅▅▃▁▂
train_loss,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█▁▁
val_acc,▅▃▁▅▇▅██▇█▅▇▅▄▇▂▅▅▃▂
val_loss,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█▁▁▁

0,1
epoch,19.0
test_acc,9.98209
test_loss,2.69875
train_acc,9.83443
train_loss,1875575.95295
val_acc,9.45411
val_loss,2.39842


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: l8vu4vas with config:
[34m[1mwandb[0m: 	activation: silu
[34m[1mwandb[0m: 	batch_size: 256
[34m[1mwandb[0m: 	epochs: 20
[34m[1mwandb[0m: 	lr: 0.02
[34m[1mwandb[0m: 	momentum: 0
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	seed: 2
[34m[1mwandb[0m: 	weight_decay: 0.005


[vgg6_20251025-192446] Best Val=10.67% | Test=9.91%
Best weight: weights/vgg6_20251025-192446_20251025-192656_best.pt


VBox(children=(Label(value='4.387 MB of 4.388 MB uploaded\r'), FloatProgress(value=0.9997428807527443, max=1.0…

0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
train_acc,▃▇▇█▄▄▂▅▆▃▆▇▅▄▅▆▅▃▅▁
train_loss,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val_acc,▆▂▆▅▁▁█▅▄▄▄▅▅█▅▄▅▅▁▅
val_loss,▂▂▃▂▂▂▁▁▃▁▃▄▃▂▁▁▂█▂▁

0,1
epoch,19.0
test_acc,9.91211
test_loss,2.32515
train_acc,9.69398
train_loss,4.41403
val_acc,9.98966
val_loss,2.31466


[34m[1mwandb[0m: Agent Starting Run: 1w81q76w with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 20
[34m[1mwandb[0m: 	lr: 0.02
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	seed: 3
[34m[1mwandb[0m: 	weight_decay: 0


[vgg6_20251025-193113] Best Val=80.83% | Test=80.42%
Best weight: weights/vgg6_20251025-193113_20251025-193657_best.pt


VBox(children=(Label(value='4.401 MB of 4.401 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
train_acc,▁▄▅▆▆▆▇▇▇▇▇▇████████
train_loss,█▆▄▄▃▃▃▂▂▂▂▂▂▁▁▁▁▁▁▁
val_acc,▁▃▄▅▆▆▇▆▇▇▇▇█▇▇█████
val_loss,█▆▅▄▃▃▂▂▂▂▂▂▁▂▂▁▁▁▁▁

0,1
epoch,19.0
test_acc,80.42396
test_loss,0.56975
train_acc,83.54492
train_loss,0.46904
val_acc,80.22152
val_loss,0.56676


[34m[1mwandb[0m: Agent Starting Run: j25z8geo with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 30
[34m[1mwandb[0m: 	lr: 0.05
[34m[1mwandb[0m: 	momentum: 0
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	seed: 1
[34m[1mwandb[0m: 	weight_decay: 0


[vgg6_20251025-193812] Best Val=10.66% | Test=9.98%
Best weight: weights/vgg6_20251025-193812_20251025-194113_best.pt


VBox(children=(Label(value='4.387 MB of 4.401 MB uploaded\r'), FloatProgress(value=0.9967962414780976, max=1.0…

0,1
epoch,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
train_acc,▆▆▅▆▂▆▆▂▆▅▇▅▆▆▆▆▆▃▅▁▃▅▄▁▇▅▆█▅▆
train_loss,█▁▂▂▂▁▂▁▂▁▁▂▁▁▁▁▂▂▁▁▂▁▂▁▂▂▁▁▁▁
val_acc,▁▅▇▅▃▅▅▇██▂▅▂▄▄▅▂▅▅▃▄▁▄▁▅▄▇▅▁▅
val_loss,▃▂▂▅▄▇▃▂▃▆▃▂▂▄▃▃▃▆▃▃▄▄▄█▄▃▃▁▃▄

0,1
epoch,29.0
test_acc,9.98209
test_loss,9.54935
train_acc,10.11186
train_loss,7.26518
val_acc,10.16614
val_loss,12.1904


[34m[1mwandb[0m: Agent Starting Run: gql7btbc with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 30
[34m[1mwandb[0m: 	lr: 0.05
[34m[1mwandb[0m: 	momentum: 0
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	seed: 1
[34m[1mwandb[0m: 	weight_decay: 0


[vgg6_20251025-194825] Best Val=10.66% | Test=9.98%
Best weight: weights/vgg6_20251025-194825_20251025-195108_best.pt


VBox(children=(Label(value='4.387 MB of 4.387 MB uploaded\r'), FloatProgress(value=0.9999334797214235, max=1.0…

0,1
epoch,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
train_acc,▅▇▂▆▂▁▆▆▅▄▄▃▆▇▅▇▄▂▃▃▂█▇▃▄▅▂▇█▁
train_loss,█▇▆▅▄▄▃▃▃▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁
val_acc,▁▅▇▅▁▅▅███▅▇▅▇▄▅█▅█▃▄▁▃▁▅▁▇▃▁▅
val_loss,▃▇█▆▃▅▄▁▂▃▂▂▁▁▁▁▂▃▁▁▃▁▁▄▂▃▁▂▁▁

0,1
epoch,29.0
test_acc,9.98209
test_loss,3.20634
train_acc,9.74121
train_loss,2.62269
val_acc,10.16614
val_loss,3.19151


[34m[1mwandb[0m: Agent Starting Run: m3bf9cph with config:
[34m[1mwandb[0m: 	activation: relu
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	epochs: 20
[34m[1mwandb[0m: 	lr: 0.01
[34m[1mwandb[0m: 	momentum: 0
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	seed: 2
[34m[1mwandb[0m: 	weight_decay: 0


[vgg6_20251025-195844] Best Val=10.35% | Test=10.10%
Best weight: weights/vgg6_20251025-195844_20251025-200115_best.pt


VBox(children=(Label(value='4.387 MB of 4.387 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
train_acc,▄▃▆▅▄▁▄▆▆▁█▇▃▄▅█▆▃▇▅
train_loss,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val_acc,▁▅▆▇▅▇▁█▅▁▅█▃▃▅▅▅▅▅▅
val_loss,▄▇▆▄▁▂█▄▄▆▃▃▃▂▂▃▃▂▂▄

0,1
epoch,19.0
test_acc,10.09691
test_loss,2.30304
train_acc,9.91556
train_loss,2.30322
val_acc,9.84375
val_loss,2.3034


[34m[1mwandb[0m: Agent Starting Run: 0345lunl with config:
[34m[1mwandb[0m: 	activation: relu
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 20
[34m[1mwandb[0m: 	lr: 0.01
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	optimizer: adagrad
[34m[1mwandb[0m: 	seed: 2
[34m[1mwandb[0m: 	weight_decay: 0.0005


[vgg6_20251025-200521] Best Val=63.09% | Test=62.28%
Best weight: weights/vgg6_20251025-200521_20251025-201104_best.pt


VBox(children=(Label(value='4.387 MB of 4.401 MB uploaded\r'), FloatProgress(value=0.9967945120103948, max=1.0…

0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
train_acc,▁▃▄▄▅▅▆▆▆▇▇▇▇▇▇█████
train_loss,█▄▃▃▃▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁
val_acc,▁▃▄▄▅▅▄▅▆▆▄▅▇▆▆▅█▅▇█
val_loss,█▆▅▆▄▄▅▄▄▄▅▄▃▃▄▅▁▆▂▁

0,1
epoch,19.0
test_acc,62.28105
test_loss,1.03487
train_acc,65.68493
train_loss,0.96174
val_acc,61.23418
val_loss,1.07464


[34m[1mwandb[0m: Agent Starting Run: i7orp7ll with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 20
[34m[1mwandb[0m: 	lr: 0.01
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	optimizer: adagrad
[34m[1mwandb[0m: 	seed: 1
[34m[1mwandb[0m: 	weight_decay: 0.005


[vgg6_20251025-201221] Best Val=10.66% | Test=9.98%
Best weight: weights/vgg6_20251025-201221_20251025-201506_best.pt


VBox(children=(Label(value='4.387 MB of 4.401 MB uploaded\r'), FloatProgress(value=0.9967942973902985, max=1.0…

0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
train_acc,▆▇▇█▇▅▇█▄▆▅▅▄▆▆▅▆▅▁▅
train_loss,█▃▃▂▂▂▂▁▂▁▁▁▁▁▁▁▁▁▁▁
val_acc,▅▅▇▃▅▅▅█▄█▅▅▁▄▄▅▄█▃▃
val_loss,▂█▃▄▄▂▂▁▁▂▄▁▂▂▁▁▁▁▁▂

0,1
epoch,19.0
test_acc,9.98209
test_loss,2.30731
train_acc,9.88548
train_loss,2.30417
val_acc,9.79035
val_loss,2.30615


[34m[1mwandb[0m: Agent Starting Run: esbit8ec with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 30
[34m[1mwandb[0m: 	lr: 0.1
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	seed: 1
[34m[1mwandb[0m: 	weight_decay: 0.0005


[vgg6_20251025-201924] Best Val=75.14% | Test=75.34%
Best weight: weights/vgg6_20251025-201924_20251025-202924_best.pt


VBox(children=(Label(value='4.387 MB of 4.388 MB uploaded\r'), FloatProgress(value=0.9997420115966548, max=1.0…

0,1
epoch,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
train_acc,▁▃▄▅▅▆▆▆▇▇▇▇▇▇▇███████████████
train_loss,█▆▅▄▄▃▃▃▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val_acc,▁▃▄▄▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇█▇▇▇▇▇▇█
val_loss,█▆▆▅▄▄▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▁▂▂▂▂▂▂▁

0,1
epoch,29.0
test_acc,75.33838
test_loss,0.70168
train_acc,73.49077
train_loss,0.76013
val_acc,75.13845
val_loss,0.71754


[34m[1mwandb[0m: Agent Starting Run: wehcq0w0 with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	epochs: 30
[34m[1mwandb[0m: 	lr: 0.02
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	seed: 1
[34m[1mwandb[0m: 	weight_decay: 0.005


[vgg6_20251025-202941] Best Val=21.09% | Test=22.57%
Best weight: weights/vgg6_20251025-202941_20251025-203312_best.pt


VBox(children=(Label(value='4.387 MB of 4.387 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
train_acc,▆▅▃▅▆▅▁▅▅▅▆▇▆▆▅▅▅█▇▅▅▄▅▇▇▅▇▄▅▆
train_loss,▅▃▄█▂▃▆▅▂▄▃▃▅▃▃▄▃▂▂▄▅▃▄▃▁▄▃▅▃▃
val_acc,▇▇▄▇▅▆▆▅▅▆██▅▅▅▇▆██▇▇▇██▅▄▇▅▇▁
val_loss,▂▂█▂▃▂▂▃▂▅▂▁▂▂▂▂▃▁▂▁▂▂▁▂▃▅▂▄▂▄

0,1
epoch,29.0
test_acc,22.56725
test_loss,2.20237
train_acc,18.69229
train_loss,2.22229
val_acc,9.51172
val_loss,2.39237


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: e2xpdleb with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	epochs: 20
[34m[1mwandb[0m: 	lr: 0.02
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	seed: 2
[34m[1mwandb[0m: 	weight_decay: 0.0005


[vgg6_20251025-203932] Best Val=10.68% | Test=9.96%
Best weight: weights/vgg6_20251025-203932_20251025-204031_best.pt


VBox(children=(Label(value='4.387 MB of 4.387 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
train_acc,▄▂▅▃▄▃▃▂█▅▃▅▂▂▄▃▄▃▃▁
train_loss,▃▂▂▁▁▂▂▃▁▁▁█▅▂▁▁▁▁▂▁
val_acc,█▄█▄██▁▁▄▆▅▂█▄▄▅▅▂▁▆
val_loss,▃▁▂▁▃▄▂▁▁▁▁█▂▁▁▁▂▂▁▂

0,1
epoch,19.0
test_acc,9.95847
test_loss,2.3672
train_acc,9.76119
train_loss,2.34058
val_acc,10.35156
val_loss,2.39944


[34m[1mwandb[0m: Agent Starting Run: ilt22oav with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	epochs: 30
[34m[1mwandb[0m: 	lr: 0.02
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	seed: 1
[34m[1mwandb[0m: 	weight_decay: 0


[vgg6_20251025-204611] Best Val=10.68% | Test=9.96%
Best weight: weights/vgg6_20251025-204611_20251025-204748_best.pt


VBox(children=(Label(value='4.387 MB of 4.401 MB uploaded\r'), FloatProgress(value=0.9967971055195537, max=1.0…

0,1
epoch,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
train_acc,▅▆▂▆▆▃▂▄▂▁▄▂▃▃▄▅▄▃▆▇▂▃▄▆▆▂▄█▇▃
train_loss,█▂▁▂▁▂▂▁▂▁▂▁▁▁▁▁▁▁▁▁▂▁▁▂▁▁▁▁▂▁
val_acc,▄▄▄▂█▅▁▄▄▆▄▆▄▁▆█▅▁▂▁▆▆▆▅▄█▄▄▆▆
val_loss,▅▃▃█▅▃▄▅▄▅▃▂▅▃▃▃▇▅▃▃▃▃▂▃▄▂▃▁▃▁

0,1
epoch,29.0
test_acc,9.95847
test_loss,9.28261
train_acc,9.88696
train_loss,6.43783
val_acc,10.19531
val_loss,3.3289


[34m[1mwandb[0m: Agent Starting Run: wiimdn29 with config:
[34m[1mwandb[0m: 	activation: silu
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	epochs: 20
[34m[1mwandb[0m: 	lr: 0.01
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	seed: 3
[34m[1mwandb[0m: 	weight_decay: 0.0005


[vgg6_20251025-205552] Best Val=10.68% | Test=9.96%
Best weight: weights/vgg6_20251025-205552_20251025-210100_best.pt


VBox(children=(Label(value='4.387 MB of 4.387 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
train_acc,▃▇▂▅▂▅▇▆█▄█▄▅▆▆▄▅▃▅▁
train_loss,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val_acc,▆▂▁█▄█▄▆▄██▆▆█▄█▆▄█▄
val_loss,▁▁▁▁▁▁▁▁▁▁▁▆▁▁▁▁▁▁█▁

0,1
epoch,19.0
test_acc,9.95847
test_loss,2.40419
train_acc,9.71606
train_loss,129712.79372
val_acc,9.78516
val_loss,3.92372


[34m[1mwandb[0m: Agent Starting Run: xcn6vosq with config:
[34m[1mwandb[0m: 	activation: silu
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	epochs: 30
[34m[1mwandb[0m: 	lr: 0.1
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	seed: 2
[34m[1mwandb[0m: 	weight_decay: 0.005


[vgg6_20251025-210235] Best Val=10.68% | Test=9.96%
Best weight: weights/vgg6_20251025-210235_20251025-210643_best.pt


VBox(children=(Label(value='4.387 MB of 4.387 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
train_acc,▆▆▅▆▅▁▂▂▆▃▄▆▅▅▄█▅▁▄▄▇▃▆▇▇▃▂▁▃▆
train_loss,▄▂▃▆▃▆▂▆▂█▂▄▅▄▂▃▃▄▇▁▃▅▄▃▂▄▅▅▆▂
val_acc,▄▃▄▅▃▅▃▇▃▃▃▃█▁▃▇▃▃▃▃▅▅▃▆▃▅▃▆▁▆
val_loss,▅▇▆▆▂▁█▃▅▇▂▄▃▂▃▅▄▃▆▃▄▂▃▂▅▁▇▂█▄

0,1
epoch,29.0
test_acc,9.95847
test_loss,2.30359
train_acc,10.09632
train_loss,2.30416
val_acc,10.35156
val_loss,2.30422


[34m[1mwandb[0m: Agent Starting Run: z8nb352k with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 20
[34m[1mwandb[0m: 	lr: 0.1
[34m[1mwandb[0m: 	momentum: 0
[34m[1mwandb[0m: 	optimizer: nesterov-sgd
[34m[1mwandb[0m: 	seed: 1
[34m[1mwandb[0m: 	weight_decay: 0.005


Traceback (most recent call last):
  File "/tmp/ipython-input-301630850.py", line 32, in train_one_run
    opt = build_optimizer(cfg.optimizer, model.parameters(), lr=cfg.lr,
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/tmp/ipython-input-4211864865.py", line 6, in build_optimizer
    return optim.SGD(params, lr=lr, momentum=momentum, weight_decay=weight_decay, nesterov=True)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/torch/optim/sgd.py", line 64, in __init__
    raise ValueError("Nesterov momentum requires a momentum and zero dampening")
ValueError: Nesterov momentum requires a momentum and zero dampening


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

Run z8nb352k errored:
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/wandb/agents/pyagent.py", line 306, in _run_job
    self._function()
  File "/tmp/ipython-input-221680006.py", line 18, in sweep_train
    def sweep_train(): train_one_run(project="vgg6-cifar10")
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/tmp/ipython-input-301630850.py", line 32, in train_one_run
    opt = build_optimizer(cfg.optimizer, model.parameters(), lr=cfg.lr,
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/tmp/ipython-input-4211864865.py", line 6, in build_optimizer
    return optim.SGD(params, lr=lr, momentum=momentum, weight_decay=weight_decay, nesterov=True)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/torch/optim/sgd.py", line 64, in __init__
    raise ValueError("Nesterov momentum requires a momentum and zero dam

[vgg6_20251025-211232] Best Val=84.18% | Test=84.29%
Best weight: weights/vgg6_20251025-211232_20251025-211848_best.pt


VBox(children=(Label(value='4.387 MB of 4.401 MB uploaded\r'), FloatProgress(value=0.9967955927535799, max=1.0…

0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
train_acc,▁▂▃▄▄▅▆▆▆▇▇▇▇███████
train_loss,██▆▅▅▄▄▃▃▂▂▂▂▂▁▁▁▁▁▁
val_acc,▁▂▃▄▅▅▆▆▇▇▇▇████████
val_loss,█▇▆▅▄▄▃▃▂▂▂▂▂▁▁▁▁▁▁▁

0,1
epoch,19.0
test_acc,84.28711
test_loss,0.46914
train_acc,87.06179
train_loss,0.37551
val_acc,84.17854
val_loss,0.43997


[34m[1mwandb[0m: Agent Starting Run: a5qiqt1w with config:
[34m[1mwandb[0m: 	activation: silu
[34m[1mwandb[0m: 	batch_size: 256
[34m[1mwandb[0m: 	epochs: 30
[34m[1mwandb[0m: 	lr: 0.01
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	seed: 3
[34m[1mwandb[0m: 	weight_decay: 0.0005


[vgg6_20251025-211905] Best Val=10.37% | Test=10.21%
Best weight: weights/vgg6_20251025-211905_20251025-212214_best.pt


VBox(children=(Label(value='4.387 MB of 4.387 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
train_acc,█▅▁▃▃▃▆▆▄▄▄▅▃▃▄▄▄▃▃▄▃▃▅▃▅▄▁▃▄▃
train_loss,▁▁▁▁▁▁▁▁▁▁▁█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val_acc,▁▇█▇▆█▇█▇██▇▇█▇▇▇▆▇▆▇█▆▇▇▆▆▆▆▆
val_loss,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
epoch,29.0
test_acc,10.20508
test_loss,2.3039
train_acc,9.8149
train_loss,2.30322
val_acc,9.38879
val_loss,2.30292


[34m[1mwandb[0m: Agent Starting Run: t9r3i4rb with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 30
[34m[1mwandb[0m: 	lr: 0.05
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	seed: 1
[34m[1mwandb[0m: 	weight_decay: 0.005


[vgg6_20251025-212835] Best Val=10.66% | Test=9.98%
Best weight: weights/vgg6_20251025-212835_20251025-213118_best.pt


VBox(children=(Label(value='4.387 MB of 4.388 MB uploaded\r'), FloatProgress(value=0.9997415770191768, max=1.0…

0,1
epoch,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
train_acc,▅▆▄▅▆▃▅██▇▄▅▂▄▃▄▆▅▄▄█▆▆▁█▆▄▃▇▃
train_loss,█▁▂▂▂▃▂▁▁▁▂▂▂▂▂▂▂▂▂▂▂▁▂▂▂▁▂▂▂▂
val_acc,▅▅▁▅▂▅▂█▅█▅▅▁▄█▅▅▇▇▃▂▇▄▂▅▁▅▄▇█
val_loss,▂▅▄▆▇▃▄▁▃▃█▂▅▅▃▂▃▂▁▂▆▂▄▄▃▄▃▅▂▂

0,1
epoch,29.0
test_acc,9.98209
test_loss,2.30434
train_acc,9.84553
train_loss,2.3081
val_acc,10.6606
val_loss,2.30537


Re-run the best configuration (from W&B) & keep the weight

In [None]:
best_config = {
    "activation": "silu",  # <-- replace from W&B best run
    "optimizer": "adamw",
    "batch_size": 128,
    "epochs": 30,
    "lr": 0.02,
    "momentum": 0.9,
    "weight_decay": 5e-4,
    "seed": 2,
}
# train_one_run(best_config)  # optional confirmatory run