[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1dDaaTAipo6C-Z-lkf-mJvHzG0z8G6gjL#scrollTo=lWOsAvobfEdK)

In [None]:
# %pip (unlike !pip) installs into the environment of the running kernel; -q keeps the cell output short.
%pip install -q wandb

Collecting wandb
  Downloading wandb-0.15.12-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m21.4 MB/s[0m eta [36m0:00:00[0m
Collecting GitPython!=3.1.29,>=1.0.0 (from wandb)
  Downloading GitPython-3.1.40-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.6/190.6 kB[0m [31m19.8 MB/s[0m eta [36m0:00:00[0m
Collecting sentry-sdk>=1.0.0 (from wandb)
  Downloading sentry_sdk-1.34.0-py2.py3-none-any.whl (243 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m243.9/243.9 kB[0m [31m23.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting docker-pycreds>=0.4.0 (from wandb)
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)
Collecting pathtools (from wandb)
  Downloading pathtools-0.1.2.tar.gz (11 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting setproctitle (from wandb)
  Downloading setproctitle-1.3.3-cp310-cp310-manylinux_2_5_x86_64.m

In [None]:
import torch
from torchvision import datasets, transforms
from tqdm import tqdm
from torchvision.transforms import v2
from torchvision.transforms import Lambda
# Preprocessing for the statistics pass: float image scaled to [0, 1], resized to 28x28,
# converted to a single grayscale channel and flattened to a vector.
transform = v2.Compose([
    v2.ToImage(),
    v2.ToDtype(torch.float32, scale=True),
    v2.Resize((28, 28), antialias=True),
    v2.Grayscale(),
    Lambda(torch.flatten),
])

# CIFAR-10 training split, preprocessed on access
train_dataset = datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)

# Running totals of the per-image mean and per-image mean of squares.
# Every image has the same size after Resize, so averaging these per-image
# values over the dataset gives the global first and second moments.
sum_intensity = 0.0
sum_squared_intensity = 0.0
num_elements = 0

for data, _ in tqdm(train_dataset, desc='Computing mean and std'):
    sum_intensity = sum_intensity + data.mean()
    sum_squared_intensity = sum_squared_intensity + data.pow(2).mean()
    num_elements = num_elements + 1

# std = sqrt(E[x^2] - E[x]^2)
mean = sum_intensity / num_elements
std = (sum_squared_intensity / num_elements - mean ** 2) ** 0.5

print(f'Mean: {mean}')
print(f'Std: {std}')


Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


100%|██████████| 170498071/170498071 [00:10<00:00, 16396738.04it/s]


Extracting ./data/cifar-10-python.tar.gz to ./data


Computing mean and std: 100%|██████████| 50000/50000 [00:43<00:00, 1145.02it/s]


Mean: 0.48075222969055176
Std: 0.23171930015087128


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torchvision.transforms import v2
from torchvision.transforms import Lambda
import wandb

# Search space and objective for the W&B hyperparameter sweep.
sweep_config = dict(
    name='my-sweep3',
    method='random',  # can be grid, random, or bayes
    metric=dict(goal='maximize', name='epoch_validation_accuracy'),
    parameters=dict(
        optimizer=dict(values=['sgd', 'adam', 'rmsprop', 'adagrad', 'sgd_sam']),
        learning_rate=dict(values=[1e-3, 1e-4]),  # 1e-5 is currently left out
        batch_size=dict(values=[16, 32, 64]),
        weight_decay=dict(values=[0, 1e-4, 1e-2]),  # 1e-3 is currently left out
        # Only read for the 'sgd' and 'sgd_sam' optimizers; 0.95 is currently left out
        momentum=dict(values=[0.9, 0.99]),
        # Not swept at the moment (add them here to enable):
        #   nesterov: [False, True]      (typically used for SGD)
        #   rho:      [0.05, 0.1, 0.2]   (SAM-specific hyperparameter)
    ),
)



# Model definition (MLP) and the batch accuracy helper
class MLP(nn.Module):
    """Fully connected classifier: [Linear -> BatchNorm1d -> ReLU -> Dropout] per hidden layer, then a Linear head.

    Args:
        input_dim: number of features per sample after flattening.
        hidden_layers: sequence of hidden layer widths. May be empty, in which
            case the network is a single Linear layer from input to output.
        output_dim: number of output logits.
        dropout_prob: dropout probability used after every hidden activation.
    """

    def __init__(self, input_dim=784, hidden_layers=(1024, 512), output_dim=10, dropout_prob=0.3):
        super().__init__()

        layers = []
        in_features = input_dim
        for width in hidden_layers:
            layers.append(nn.Linear(in_features, width))
            layers.append(nn.BatchNorm1d(width))  # normalise pre-activations
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(dropout_prob))
            in_features = width
        # `in_features` is still `input_dim` when there are no hidden layers
        layers.append(nn.Linear(in_features, output_dim))

        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Flatten everything but the first (batch) dimension and return the logits."""
        # reshape (rather than view) also accepts non-contiguous inputs
        x = x.reshape(x.size(0), -1)
        return self.model(x)

def accuracy(outputs, labels):
    """Return the fraction of rows of `outputs` whose arg-max along dim 1 equals `labels`, as a 0-dim tensor."""
    predicted = torch.max(outputs, dim=1).indices
    n_correct = (predicted == labels).sum().item()
    return torch.tensor(n_correct / len(predicted))

class SGD_SAM(torch.optim.Optimizer):
    """Sharpness-Aware Minimization (SAM) wrapper around a base optimizer (SGD by default).

    One update needs two forward/backward passes:

        1. backward at the current weights ``w``, then ``first_step()`` moves the
           weights to ``w + e(w)`` with ``e(w) = rho * g / ||g||``;
        2. backward at the perturbed weights, then ``second_step()`` restores ``w``
           and lets the base optimizer update it with the gradient taken at ``w + e(w)``.

    Args:
        params: parameters (or parameter groups) to optimize.
        base_optimizer: optimizer class that performs the actual update.
        rho: radius of the perturbation; must be non-negative.
        **kwargs: forwarded to ``base_optimizer`` (e.g. ``lr``, ``momentum``,
            ``weight_decay``). Weight decay is applied by the base optimizer only.
    """

    def __init__(self, params, base_optimizer=torch.optim.SGD, rho=0.05, **kwargs):
        assert rho >= 0.0, f"Invalid rho, should be non-negative: {rho}"

        # Initialise the Optimizer base class so that `state`, `zero_grad` and the
        # hook machinery exist on this wrapper.
        super().__init__(params, dict(kwargs))

        # The base optimizer works on the very same param-group dicts, so learning-rate
        # schedulers attached to this wrapper also affect the base optimizer.
        self.base_optimizer = base_optimizer(self.param_groups, **kwargs)
        self.defaults = self.base_optimizer.defaults
        self.param_groups = self.base_optimizer.param_groups
        self.rho = rho

    @torch.no_grad()
    def first_step(self, zero_grad=False):
        """Move every parameter that has a gradient to ``w + e(w)`` and remember ``w``.

        Args:
            zero_grad: if True, clear the gradients afterwards so that the next
                backward pass yields only the gradient at the perturbed weights.
        """
        grad_norm = self._grad_norm()
        scale = self.rho / (grad_norm + 1e-12)  # epsilon guards against a zero gradient

        for group in self.param_groups:
            for param in group['params']:
                if param.grad is None:
                    continue

                # Keep an exact copy of w; recomputing the perturbation later from a
                # different gradient would not bring the weights back.
                self.state[param]['old_p'] = param.detach().clone()
                param.add_(param.grad * scale)  # climb to the local maximum "w + e(w)"

        if zero_grad:
            self.zero_grad()

    @torch.no_grad()
    def second_step(self, zero_grad=False):
        """Restore the weights saved by ``first_step`` and run the base optimizer step.

        The gradients present when this is called (expected to be those computed at
        the perturbed weights) are the ones used for the update.

        Args:
            zero_grad: if True, clear the gradients after the update.
        """
        for group in self.param_groups:
            for param in group['params']:
                saved = self.state.get(param)
                if not saved or 'old_p' not in saved:
                    continue  # this parameter was not perturbed
                param.copy_(saved.pop('old_p'))  # back to the original "w"

        # The actual "sharpness-aware" update; weight decay (if any) is handled here
        # by the base optimizer, scaled by its learning rate.
        self.base_optimizer.step()

        if zero_grad:
            self.zero_grad()

    def _grad_norm(self):
        """Return the L2 norm of all gradients taken together as a 0-dim tensor (0.0 if there are none)."""
        norms = [
            p.grad.norm(p=2)
            for group in self.param_groups
            for p in group['params']
            if p.grad is not None
        ]
        if not norms:
            return torch.tensor(0.0)
        return torch.norm(torch.stack(norms), p=2)

    def step(self, closure=None):
        """Not supported: SAM needs two passes, see ``first_step`` and ``second_step``."""
        raise RuntimeError('SAM doesn’t implement step() function. Use first_step() and second_step().')

# Load CIFAR-10. The official test split (train=False) is bound to `val_dataset`.
train_dataset = datasets.CIFAR10(root='./data', train=True, download=True)
val_dataset = datasets.CIFAR10(root='./data', train=False, download=True)

# Keep a random 80% subset of the training split for training; the remaining 20%
# is discarded here (bound to `_`). The split is seeded so that every kernel
# restart, and therefore every sweep, trains on the same subset.
SPLIT_SEED = 42
train_size = int(0.8 * len(train_dataset))
val_size = len(train_dataset) - train_size
train_dataset, _ = torch.utils.data.random_split(
    train_dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)


# Grayscale mean / std used for normalisation (single channel)
GRAY_MEAN = (0.48075222969055176,)
GRAY_STD = (0.23171930015087128,)


def _deterministic_tail():
    """Return fresh instances of the non-random steps shared by the training and validation pipelines."""
    return [
        v2.ToDtype(torch.float32, scale=True),
        v2.Resize((28, 28), antialias=True),
        v2.Grayscale(),
        v2.Normalize(GRAY_MEAN, GRAY_STD),
        Lambda(lambda x: torch.flatten(x)),
    ]


# Training pipeline: random flip and rotation first, then the shared deterministic steps.
# Disabled augmentations: v2.RandomCrop(32, padding=4),
#                         v2.ColorJitter(brightness=0.1, contrast=0.1, saturation=0.1, hue=0.1)
train_transforms = v2.Compose([
    v2.ToImage(),
    v2.RandomHorizontalFlip(),
    v2.RandomRotation(degrees=15),  # rotate by a random angle within +/- 15 degrees
    *_deterministic_tail(),
])

# Validation pipeline: the shared deterministic steps only, no random augmentation
val_transforms = v2.Compose([
    v2.ToImage(),
    *_deterministic_tail(),
])

# Custom dataset class that applies the transformations
class TransformDataset(torch.utils.data.Dataset):
    """Wrap an indexable dataset of (image, label) pairs and apply `transform` to the image on every access."""

    def __init__(self, dataset, transform=None):
        self.dataset, self.transform = dataset, transform

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        sample, target = self.dataset[idx]
        # A falsy transform (e.g. None) leaves the sample untouched
        return (self.transform(sample) if self.transform else sample), target


class CachedDataset(torch.utils.data.Dataset):
    """Dataset wrapper that memoises each (image, label) pair in a dict the first time its index is requested.

    If `transform` is given (and truthy) it is applied to the image once, before
    the pair is stored; later accesses return the stored pair.
    """

    def __init__(self, dataset, transform=None):
        self.dataset = dataset
        self.transform = transform
        self.cache = {}  # idx -> (image, label)

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        try:
            return self.cache[idx]
        except KeyError:
            pass  # first access: compute and store below

        image, label = self.dataset[idx]
        if self.transform:
            image = self.transform(image)
        self.cache[idx] = (image, label)
        return image, label

# Training samples are transformed on every access (no caching), so the random
# augmentations are re-drawn each time a sample is read.
train_dataset = TransformDataset(dataset=train_dataset, transform=train_transforms)

# Validation samples go through the validation transforms once and are then
# served from the cache on later accesses.
val_dataset = CachedDataset(dataset=TransformDataset(dataset=val_dataset, transform=val_transforms))


# Training loop. Hyperparameters are supplied by the caller, which reads them from wandb.config.

def _grad_l2_norm(model):
    """Return the L2 norm (Python float) over the gradients of all parameters of `model` that have one."""
    squared = sum(p.grad.data.norm(2).item() ** 2 for p in model.parameters() if p.grad is not None)
    return squared ** 0.5


def train(model, train_loader, val_loader, criterion, optimizer, epochs, scheduler, use_sam=False, early_stopping_patience=5):
    """Train `model`, validate after every epoch and log batch and epoch metrics to wandb.

    Args:
        model: network to train; called as ``model(inputs)``.
        train_loader: iterable of ``(inputs, targets)`` training batches; must support ``len()``.
        val_loader: iterable of ``(inputs, targets)`` validation batches.
        criterion: loss callable, ``criterion(outputs, targets)``.
        optimizer: a regular optimizer (``step()``), or, when ``use_sam`` is True,
            an optimizer exposing ``first_step`` / ``second_step``.
        epochs: maximum number of epochs.
        scheduler: learning-rate scheduler stepped once per epoch, or None.
        use_sam: run the two-pass SAM update instead of a single ``optimizer.step()``.
        early_stopping_patience: stop after this many consecutive epochs without a
            new best average validation loss.

    Returns:
        The same `model` object after training.
    """
    best_val_loss = float('inf')
    epochs_no_improve = 0

    for epoch in range(epochs):
        model.train()
        train_losses = []
        train_accuracies = []

        for batch_idx, (inputs, targets) in enumerate(train_loader):
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            acc = accuracy(outputs, targets)
            train_losses.append(loss.item())
            train_accuracies.append(acc.item())

            optimizer.zero_grad()
            loss.backward()

            if use_sam:
                # Ascent step. The gradients are cleared here so that the second
                # backward pass yields only the gradient at the perturbed weights
                # instead of accumulating on top of the first one.
                optimizer.first_step(zero_grad=True)

                # Second forward/backward pass at the perturbed weights
                loss_second = criterion(model(inputs), targets)
                loss_second.backward()

                # Measure the gradient norm before second_step clears the gradients
                batch_gradients_norm = _grad_l2_norm(model)
                optimizer.second_step(zero_grad=True)
            else:
                optimizer.step()
                batch_gradients_norm = _grad_l2_norm(model)

            # `loss` is the loss of the first (unperturbed) forward pass in both branches
            wandb.log({
                "batch_training_loss": loss.item(),
                "batch_gradients_norm": batch_gradients_norm,
                "batch_idx": epoch * len(train_loader) + batch_idx
            })
            # writer.add_scalar('Loss/Train_batch', loss.item(), epoch * len(train_loader) + batch_idx)
            # writer.add_scalar('Gradients/Norm', batch_gradients_norm, epoch * len(train_loader) + batch_idx)

        # Per-batch averages for the epoch
        avg_train_loss = sum(train_losses) / len(train_losses)
        avg_train_accuracy = sum(train_accuracies) / len(train_accuracies)

        # L2 norm of all model parameters taken together
        model_norm = sum(p.data.norm(2).item() ** 2 for p in model.parameters()) ** 0.5

        # Validation pass
        model.eval()
        val_losses = []
        val_accuracies = []
        with torch.no_grad():
            for inputs, targets in val_loader:
                outputs = model(inputs)
                loss = criterion(outputs, targets)
                acc = accuracy(outputs, targets)
                val_losses.append(loss.item())
                val_accuracies.append(acc.item())

        avg_val_loss = sum(val_losses) / len(val_losses)
        avg_val_accuracy = sum(val_accuracies) / len(val_accuracies)

        # Without a scheduler, report the learning rate of the first param group
        if scheduler is not None:
            current_lr = scheduler.get_last_lr()[0]
        else:
            current_lr = optimizer.param_groups[0]['lr']

        # Log epoch metrics to wandb
        wandb.log({
            "epoch": epoch,
            "epoch_training_loss": avg_train_loss,
            "epoch_training_accuracy": avg_train_accuracy,
            "epoch_validation_loss": avg_val_loss,
            "epoch_validation_accuracy": avg_val_accuracy,
            "model_norm": model_norm,
            "learning_rate": current_lr,
            "batch_size": train_loader.batch_size,
            "optimizer": optimizer.__class__.__name__
        })

        # Update learning rate if scheduler is provided
        if scheduler is not None:
            scheduler.step()

        # Print metrics before the early-stopping check so the last epoch is reported too
        print(f"Epoch {epoch}: avg_train_loss={avg_train_loss}, avg_train_accuracy={avg_train_accuracy}, avg_val_loss={avg_val_loss}, avg_val_accuracy={avg_val_accuracy}")

        # Early stopping on the average validation loss
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            epochs_no_improve = 0
        else:
            epochs_no_improve += 1

        if epochs_no_improve >= early_stopping_patience:
            print(f"No improvement in validation loss for {early_stopping_patience} consecutive epochs. Stopping early.")
            break

    return model

# The main function to be called by the sweep
def train_with_wandb():
    """Run one sweep trial: read hyperparameters from ``wandb.config``, build loaders, model and optimizer, and train.

    Raises:
        ValueError: if ``wandb.config.optimizer`` is not one of
            'sgd', 'sgd_sam', 'adam', 'rmsprop' or 'adagrad'.
    """
    #writer = SummaryWriter(log_dir='runs/my_experiment2')
    # The run is used as a context manager so it is finished even if training raises.
    with wandb.init():
        config = wandb.config

        # Load data
        train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True, num_workers=0)
        val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=config.batch_size, shuffle=False, num_workers=0)

        # Initialize model and loss function
        model = MLP()
        criterion = nn.CrossEntropyLoss()

        # Hyperparameters shared by all optimizers
        weight_decay = config.weight_decay
        optimizer_name = config.optimizer
        lr = config.learning_rate

        # Only the SAM wrapper needs the two-pass update in train()
        use_sam = optimizer_name == 'sgd_sam'

        if optimizer_name == 'sgd':
            momentum = config.get('momentum', 0.9)  # default if the sweep does not provide it
            optimizer = optim.SGD(model.parameters(), lr=lr, momentum=momentum, weight_decay=weight_decay)
        elif optimizer_name == 'sgd_sam':
            momentum = config.get('momentum', 0.9)
            rho = config.get('rho', 0.05)
            optimizer = SGD_SAM(model.parameters(), base_optimizer=torch.optim.SGD, rho=rho, lr=lr, momentum=momentum, weight_decay=weight_decay)
        elif optimizer_name == 'adam':
            optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
        elif optimizer_name == 'rmsprop':
            optimizer = optim.RMSprop(model.parameters(), lr=lr, weight_decay=weight_decay)
        elif optimizer_name == 'adagrad':
            if 'momentum' in config:
                print("Warning: Momentum is not used by the Adagrad optimizer and will be ignored.")
            optimizer = optim.Adagrad(model.parameters(), lr=lr, weight_decay=weight_decay)
        else:
            # Fail with a clear message instead of a NameError further down
            raise ValueError(f"Unsupported optimizer: {optimizer_name!r}")

        # Learning rate is multiplied by 0.1 every 10 epochs
        scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)

        # Train the model
        train(model, train_loader, val_loader, criterion, optimizer, epochs=30, scheduler=scheduler, use_sam=use_sam, early_stopping_patience=5)

        #writer.close()


# Start the sweep: register `sweep_config` under the "wandb-final9" project and keep the returned id,
# then let an agent in this process call `train_with_wandb` for the runs it is handed.
# NOTE(review): no `count` is passed to the agent; with method 'random' this presumably keeps
# launching runs until interrupted - confirm against the W&B sweep documentation.
sweep_id = wandb.sweep(sweep=sweep_config, project="wandb-final9")
wandb.agent(sweep_id, train_with_wandb)


Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


100%|██████████| 170498071/170498071 [00:02<00:00, 74448098.68it/s]


Extracting ./data/cifar-10-python.tar.gz to ./data
Files already downloaded and verified


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Create sweep with ID: bygi5msd
Sweep URL: https://wandb.ai/marius-workspace/wandb-final9/sweeps/bygi5msd


[34m[1mwandb[0m: Agent Starting Run: xlxbawi0 with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	momentum: 0.99
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: Currently logged in as: [33mmariusmarin[0m ([33mmarius-workspace[0m). Use [1m`wandb login --relogin`[0m to force relogin


Epoch 0: avg_train_loss=1.9211829748153686, avg_train_accuracy=0.3124, avg_val_loss=1.7685720153674958, avg_val_accuracy=0.37072054140127386
Epoch 1: avg_train_loss=1.7967604555130006, avg_train_accuracy=0.359725, avg_val_loss=1.6966977742067568, avg_val_accuracy=0.3903264331210191
Epoch 2: avg_train_loss=1.7438571701049805, avg_train_accuracy=0.376825, avg_val_loss=1.6587576091669167, avg_val_accuracy=0.4049562101910828
Epoch 3: avg_train_loss=1.7085986618041993, avg_train_accuracy=0.39315, avg_val_loss=1.6139548318401264, avg_val_accuracy=0.42038216560509556
Epoch 4: avg_train_loss=1.6819992765426637, avg_train_accuracy=0.396825, avg_val_loss=1.5822485336072885, avg_val_accuracy=0.4341162420382166
Epoch 5: avg_train_loss=1.6527046825408935, avg_train_accuracy=0.4092, avg_val_loss=1.5775778225273083, avg_val_accuracy=0.43998805732484075
Epoch 6: avg_train_loss=1.6375436595916748, avg_train_accuracy=0.417375, avg_val_loss=1.5429289599133145, avg_val_accuracy=0.45232882165605093
Epoch 7

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
batch_gradients_norm,█▆▆▆▅▅▄▄▃▃▂▃▁▃▃▁▁▂▁▂▁▂▁▁▂▂▂▂▂▃▁▂▁▂▂▂▂▂▂▂
batch_idx,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
batch_size,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
batch_training_loss,▇▇█▄▅▇▅▃▄▄▆▆▄▅▆▃▁▄▄▄▂▃▁▁▆▃▃▂▃▄▃▃▃▄▄▅▂▂▅▃
epoch,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
epoch_training_accuracy,▁▃▄▄▄▅▅▆▆▆▇▇▇▇▇▇██████████████
epoch_training_loss,█▆▅▅▄▄▄▃▃▃▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁
epoch_validation_accuracy,▁▂▃▄▅▅▆▅▆▆▇▇▇▇▇███████████████
epoch_validation_loss,█▇▆▅▄▄▃▃▃▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
learning_rate,██████████▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁

0,1
batch_gradients_norm,1.22391
batch_idx,18749
batch_size,64
batch_training_loss,1.41528
epoch,29
epoch_training_accuracy,0.48352
epoch_training_loss,1.45731
epoch_validation_accuracy,0.49254
epoch_validation_loss,1.43263
learning_rate,1e-05


[34m[1mwandb[0m: Agent Starting Run: 2vbo0sz3 with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	weight_decay: 0.01


Epoch 0: avg_train_loss=1.9748666962623596, avg_train_accuracy=0.29065, avg_val_loss=1.7925007529913808, avg_val_accuracy=0.36920926517571884
Epoch 1: avg_train_loss=1.8474985013008118, avg_train_accuracy=0.338225, avg_val_loss=1.7317774105376709, avg_val_accuracy=0.3889776357827476
Epoch 2: avg_train_loss=1.798098823451996, avg_train_accuracy=0.35895, avg_val_loss=1.6914805474753578, avg_val_accuracy=0.3955670926517572
Epoch 3: avg_train_loss=1.7622465158462524, avg_train_accuracy=0.372075, avg_val_loss=1.6668519790941916, avg_val_accuracy=0.4094448881789137
Epoch 4: avg_train_loss=1.7432459069252013, avg_train_accuracy=0.37975, avg_val_loss=1.650978029345552, avg_val_accuracy=0.41074281150159747
Epoch 5: avg_train_loss=1.733339656639099, avg_train_accuracy=0.381625, avg_val_loss=1.633722245883637, avg_val_accuracy=0.4213258785942492
Epoch 6: avg_train_loss=1.7139184894561768, avg_train_accuracy=0.3943, avg_val_loss=1.643429005488801, avg_val_accuracy=0.413538338658147
Epoch 7: avg_tr

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
batch_gradients_norm,▃▂▂▂▁▃▃▂▂▂▃▁▃▃▂▅▂▃▅▃▅▄▃▇▆▆▇▆█▅▄▄▅▅▄▇▄▅█▄
batch_idx,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
batch_size,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
batch_training_loss,▇█▇▇▄▅▆▃▆▃▅▅▇▅▅▅▅▄▅▅▅▃▂▄▅▄▅▆▇▄▂▁▃▄▂▄▄▃▅▁
epoch,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
epoch_training_accuracy,▁▃▄▅▅▅▆▆▆▆▇▇▇▇▇▇██████████████
epoch_training_loss,█▆▅▄▄▄▃▃▃▃▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁
epoch_validation_accuracy,▁▂▃▄▄▅▄▅▅▅▇▇▇▇▇▇▇▇████████████
epoch_validation_loss,█▆▅▅▄▄▄▄▃▄▂▂▂▂▂▂▁▂▁▁▁▁▁▁▁▁▁▁▁▁
learning_rate,██████████▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁

0,1
batch_gradients_norm,9.00556
batch_idx,37499
batch_size,32
batch_training_loss,1.58637
epoch,29
epoch_training_accuracy,0.44105
epoch_training_loss,1.59395
epoch_validation_accuracy,0.46066
epoch_validation_loss,1.52122
learning_rate,1e-05


[34m[1mwandb[0m: Agent Starting Run: m812y960 with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	momentum: 0.99
[34m[1mwandb[0m: 	optimizer: adagrad
[34m[1mwandb[0m: 	weight_decay: 0


Epoch 0: avg_train_loss=1.9519203490257264, avg_train_accuracy=0.3037, avg_val_loss=1.7991696916068325, avg_val_accuracy=0.36661341853035145
Epoch 1: avg_train_loss=1.8620277520179749, avg_train_accuracy=0.33595, avg_val_loss=1.7492659846052956, avg_val_accuracy=0.38448482428115016
Epoch 2: avg_train_loss=1.8276525023460388, avg_train_accuracy=0.350675, avg_val_loss=1.7254972450268535, avg_val_accuracy=0.389676517571885
Epoch 3: avg_train_loss=1.8056496024131774, avg_train_accuracy=0.359625, avg_val_loss=1.7019317180584794, avg_val_accuracy=0.402555910543131
Epoch 4: avg_train_loss=1.7913982444763183, avg_train_accuracy=0.361425, avg_val_loss=1.6884325902682904, avg_val_accuracy=0.4054512779552716
Epoch 5: avg_train_loss=1.7773830418586731, avg_train_accuracy=0.369475, avg_val_loss=1.6721431131180102, avg_val_accuracy=0.40964456869009586
Epoch 6: avg_train_loss=1.763389328098297, avg_train_accuracy=0.371725, avg_val_loss=1.6617616140804352, avg_val_accuracy=0.41623402555910544
Epoch 7:

0,1
batch_gradients_norm,▄▁▅▅▃▂▄▃▄▂▅▄▅▅▅▆▄▅▂▄█▇▇▆▅▆▅▆▆▄▅▄▆▄▄▆█▆▇▇
batch_idx,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
batch_size,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
batch_training_loss,▆▃█▆▅▃▄▄▄▂▇▆▂▄▄▇▃▆▁▂▆█▇▆▃▂▁▅▅▂▂▄▆▂▃▃▅▅▆▅
epoch,▁▁▂▂▃▃▄▄▅▅▅▆▆▇▇██
epoch_training_accuracy,▁▄▅▅▆▆▆▇▇▇███████
epoch_training_loss,█▅▄▄▃▃▂▂▂▁▁▁▁▁▁▁▁
epoch_validation_accuracy,▁▃▄▅▅▆▇▇▇████████
epoch_validation_loss,█▆▅▄▃▃▂▂▂▁▁▁▁▁▁▁▁
learning_rate,██████████▁▁▁▁▁▁▁

0,1
batch_gradients_norm,6.22429
batch_idx,21249
batch_size,32
batch_training_loss,1.79987
epoch,16
epoch_training_accuracy,0.3914
epoch_training_loss,1.72044
epoch_validation_accuracy,0.42901
epoch_validation_loss,1.63062
learning_rate,0.0001


[34m[1mwandb[0m: Agent Starting Run: k32xuu8x with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	optimizer: adagrad
[34m[1mwandb[0m: 	weight_decay: 0


Epoch 0: avg_train_loss=2.144220999336243, avg_train_accuracy=0.22375, avg_val_loss=2.0157729774523694, avg_val_accuracy=0.2975716560509554
Epoch 1: avg_train_loss=2.060842402267456, avg_train_accuracy=0.263375, avg_val_loss=1.9748236009269764, avg_val_accuracy=0.3153861464968153
Epoch 2: avg_train_loss=2.0321864501953124, avg_train_accuracy=0.27535, avg_val_loss=1.9517152036071583, avg_val_accuracy=0.3249402866242038
Epoch 3: avg_train_loss=2.014566238975525, avg_train_accuracy=0.28045, avg_val_loss=1.9363015267499692, avg_val_accuracy=0.32961783439490444
Epoch 4: avg_train_loss=1.9985106517791749, avg_train_accuracy=0.285375, avg_val_loss=1.92307713001397, avg_val_accuracy=0.334593949044586
Epoch 5: avg_train_loss=1.989540412902832, avg_train_accuracy=0.292525, avg_val_loss=1.9104902562062451, avg_val_accuracy=0.33638535031847133
Epoch 6: avg_train_loss=1.9782711380004883, avg_train_accuracy=0.2942, avg_val_loss=1.9016037435288642, avg_val_accuracy=0.34026671974522293
Epoch 7: avg_tr

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
batch_gradients_norm,█▄▅▅▅▄▆▃▄▄▃█▄▇▃▅█▆▁▃▂▁▃▂▃▅▂▅▃▆▃▄▇▄▆▃▅▃▄▂
batch_idx,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
batch_size,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
batch_training_loss,█▅▇▅▄▄▄▃▂▄▅▇▃▅▄▅▅▄▂▃▃▃▂▃▁▄▃▅▄▄▃▃▄▁▅▃▄▃▅▃
epoch,▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇██
epoch_training_accuracy,▁▄▅▆▆▇▇▇▇███████████████████
epoch_training_loss,█▅▄▃▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
epoch_validation_accuracy,▁▃▅▅▆▆▇▇██▇███▇██▇█▇████████
epoch_validation_loss,█▆▅▄▃▃▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
learning_rate,██████████▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁

0,1
batch_gradients_norm,4.68576
batch_idx,17499
batch_size,64
batch_training_loss,1.91614
epoch,27
epoch_training_accuracy,0.30555
epoch_training_loss,1.95244
epoch_validation_accuracy,0.34863
epoch_validation_loss,1.87638
learning_rate,0.0


[34m[1mwandb[0m: Agent Starting Run: 3zot21y3 with config:
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0


Epoch 0: avg_train_loss=2.0025831336021422, avg_train_accuracy=0.283025, avg_val_loss=1.8203652694702148, avg_val_accuracy=0.3555
Epoch 1: avg_train_loss=1.8912965383052827, avg_train_accuracy=0.3258, avg_val_loss=1.7458866386413574, avg_val_accuracy=0.379
Epoch 2: avg_train_loss=1.841972592639923, avg_train_accuracy=0.3428, avg_val_loss=1.7084318891525268, avg_val_accuracy=0.3938
Epoch 3: avg_train_loss=1.8043928775310516, avg_train_accuracy=0.35285, avg_val_loss=1.6586644828796386, avg_val_accuracy=0.4122
Epoch 4: avg_train_loss=1.7834322164535523, avg_train_accuracy=0.36395, avg_val_loss=1.6340480676651001, avg_val_accuracy=0.417
Epoch 5: avg_train_loss=1.7537826076507568, avg_train_accuracy=0.376425, avg_val_loss=1.6173215795516969, avg_val_accuracy=0.4262
Epoch 6: avg_train_loss=1.7406096604824066, avg_train_accuracy=0.384675, avg_val_loss=1.5991134656906127, avg_val_accuracy=0.4345
Epoch 7: avg_train_loss=1.7243033504962921, avg_train_accuracy=0.38645, avg_val_loss=1.580283206367

VBox(children=(Label(value='0.002 MB of 0.014 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.115113…

0,1
batch_gradients_norm,█▆▄▄▃▃▂▂▂▂▂▂▂▁▂▂▁▁▂▁▂▂▁▂▁▁▁▂▁▂▂▂▂▂▂▁▁▂▁▂
batch_idx,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
batch_size,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
batch_training_loss,▇▄▄▆▃▆▂▅▆▄▆▃▃▅▆▅▅▄▆▄▅▄▃▄▁▄▃▆▃▇▅▅▅▅▄▁▃█▂▄
epoch,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
epoch_training_accuracy,▁▃▄▄▅▅▆▆▆▆▇▇▇▇▇▇██████████████
epoch_training_loss,█▆▅▅▄▄▃▃▃▃▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁
epoch_validation_accuracy,▁▂▃▄▄▅▅▆▆▆▇▇▇▇▇█▇██▇██████████
epoch_validation_loss,█▆▆▅▄▄▄▃▃▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
learning_rate,██████████▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁

0,1
batch_gradients_norm,1.44591
batch_idx,74999
batch_size,16
batch_training_loss,1.66221
epoch,29
epoch_training_accuracy,0.43375
epoch_training_loss,1.6009
epoch_validation_accuracy,0.4743
epoch_validation_loss,1.4737
learning_rate,1e-05


[34m[1mwandb[0m: Agent Starting Run: vwbf243o with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	weight_decay: 0.01


Epoch 0: avg_train_loss=2.1468979077339174, avg_train_accuracy=0.220875, avg_val_loss=1.9810821804375693, avg_val_accuracy=0.31639376996805113
Epoch 1: avg_train_loss=2.020074793243408, avg_train_accuracy=0.278675, avg_val_loss=1.9085218369389494, avg_val_accuracy=0.3373602236421725
Epoch 2: avg_train_loss=1.9688426562309265, avg_train_accuracy=0.2975, avg_val_loss=1.8684099737447672, avg_val_accuracy=0.34804313099041534
Epoch 3: avg_train_loss=1.932113517665863, avg_train_accuracy=0.31125, avg_val_loss=1.8375167286814973, avg_val_accuracy=0.3608226837060703
Epoch 4: avg_train_loss=1.9079335852622985, avg_train_accuracy=0.320475, avg_val_loss=1.813208953260233, avg_val_accuracy=0.36351837060702874
Epoch 5: avg_train_loss=1.8951293829917908, avg_train_accuracy=0.3251, avg_val_loss=1.7981777141650264, avg_val_accuracy=0.37030750798722045
Epoch 6: avg_train_loss=1.873510555934906, avg_train_accuracy=0.33475, avg_val_loss=1.7778354887002574, avg_val_accuracy=0.3766972843450479
Epoch 7: avg

VBox(children=(Label(value='0.001 MB of 0.014 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.072398…

0,1
batch_gradients_norm,█▆▆▅▄▅▅▅▄█▆▄▂▃▁▄▄▇▂▃▂▅▆▂▅▄▃▆▃▄▁▅▂▃▄▃▇▅▄▄
batch_idx,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
batch_size,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
batch_training_loss,██▅▆▆▇█▃▃▇▇▅▁▃▂▄▄▆▃▃▄▃▅▂▄▃▄▆▃▄▁▆▁▃▂▂▅▄▇▂
epoch,▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇██
epoch_training_accuracy,▁▄▅▆▆▆▇▇▇▇██████████████████
epoch_training_loss,█▅▄▄▃▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
epoch_validation_accuracy,▁▃▄▅▅▆▆▆▇▇▇▇▇▇▇▇▇▇██████████
epoch_validation_loss,█▆▅▄▃▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
learning_rate,██████████▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁

0,1
batch_gradients_norm,6.51685
batch_idx,34999
batch_size,32
batch_training_loss,1.9484
epoch,27
epoch_training_accuracy,0.35635
epoch_training_loss,1.81319
epoch_validation_accuracy,0.39796
epoch_validation_loss,1.73052
learning_rate,0.0


[34m[1mwandb[0m: Agent Starting Run: v6jzs3ok with config:
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	optimizer: adagrad
[34m[1mwandb[0m: 	weight_decay: 0.0001


Epoch 0: avg_train_loss=1.9844622875213622, avg_train_accuracy=0.2862, avg_val_loss=1.8109985893249512, avg_val_accuracy=0.3665
Epoch 1: avg_train_loss=1.8992413528442382, avg_train_accuracy=0.322275, avg_val_loss=1.7680368766784669, avg_val_accuracy=0.3822
Epoch 2: avg_train_loss=1.8695953820705413, avg_train_accuracy=0.3334, avg_val_loss=1.748933808708191, avg_val_accuracy=0.3852
Epoch 3: avg_train_loss=1.849860858297348, avg_train_accuracy=0.341725, avg_val_loss=1.7231419982910157, avg_val_accuracy=0.394
Epoch 4: avg_train_loss=1.834820036315918, avg_train_accuracy=0.34825, avg_val_loss=1.7126068822860718, avg_val_accuracy=0.3992
Epoch 5: avg_train_loss=1.8234820241451264, avg_train_accuracy=0.35275, avg_val_loss=1.6959029375076293, avg_val_accuracy=0.4053
Epoch 6: avg_train_loss=1.8099659974098206, avg_train_accuracy=0.35605, avg_val_loss=1.6826316854476928, avg_val_accuracy=0.4105
Epoch 7: avg_train_loss=1.8031092384815217, avg_train_accuracy=0.36275, avg_val_loss=1.68541798686981

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
batch_gradients_norm,▃▂▃▂▁▁▂▃▅▄▄▄▄▅▂▆▄▃▂▅▅▆█▅▁▃▄▂▇▃▃▄▅▃▆▃▂▅▅▂
batch_idx,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
batch_size,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
batch_training_loss,▄▅▄▃▂▃▃▅▄▃▅▃▅▅▃▇▄▂▃▃▅▅█▄▂▄▄▁▄▂▃▄▃▃▄▅▂▆▅▃
epoch,▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇██
epoch_training_accuracy,▁▄▅▅▆▆▇▇▇▇▇██▇██████████████
epoch_training_loss,█▅▄▄▃▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
epoch_validation_accuracy,▁▃▃▄▅▆▇▆▇█▇█▇▇▇▇▇███████▇▇██
epoch_validation_loss,█▆▅▄▄▃▃▃▂▂▁▁▂▂▂▁▁▁▁▁▁▁▁▁▁▂▁▁
learning_rate,██████████▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁

0,1
batch_gradients_norm,7.95503
batch_idx,69999
batch_size,16
batch_training_loss,1.80507
epoch,27
epoch_training_accuracy,0.37125
epoch_training_loss,1.77429
epoch_validation_accuracy,0.421
epoch_validation_loss,1.64986
learning_rate,1e-05


[34m[1mwandb[0m: Agent Starting Run: utgrlvsf with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	optimizer: adagrad
[34m[1mwandb[0m: 	weight_decay: 0.01


Epoch 0: avg_train_loss=1.9392815183639527, avg_train_accuracy=0.30725, avg_val_loss=1.7918402009708867, avg_val_accuracy=0.3752985668789809
Epoch 1: avg_train_loss=1.835306092643738, avg_train_accuracy=0.344025, avg_val_loss=1.7398913149621076, avg_val_accuracy=0.38763933121019106
Epoch 2: avg_train_loss=1.798819645690918, avg_train_accuracy=0.36295, avg_val_loss=1.7071100040605873, avg_val_accuracy=0.4004777070063694
Epoch 3: avg_train_loss=1.7685292854309083, avg_train_accuracy=0.376375, avg_val_loss=1.6906340243710074, avg_val_accuracy=0.40545382165605093
Epoch 4: avg_train_loss=1.7461925310134887, avg_train_accuracy=0.38155, avg_val_loss=1.6701053069655303, avg_val_accuracy=0.4133160828025478
Epoch 5: avg_train_loss=1.7326012533187867, avg_train_accuracy=0.386325, avg_val_loss=1.6519031312055648, avg_val_accuracy=0.41620222929936307
Epoch 6: avg_train_loss=1.7183638195037843, avg_train_accuracy=0.391175, avg_val_loss=1.6399446088037672, avg_val_accuracy=0.4197850318471338
Epoch 7:

0,1
batch_gradients_norm,▁▁▁▃▃▂▃▃▅▃▄▄▆▃▅▄▅▄▅▆▅▄▆▇▅▅▅▇▆▆▆▆▄█▆▆▅▅▆▇
batch_idx,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
batch_size,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
batch_training_loss,▇▇▆▆█▅▆▆▆▃▅▄▄▁▄▂▄▃▄▅▃▁▅▅▄▃▃▄▅▃▅▆▂▇▄▄▃▃▅▅
epoch,▁▁▂▂▂▃▃▃▃▄▄▄▅▅▅▆▆▆▆▇▇▇██
epoch_training_accuracy,▁▃▅▅▆▆▆▇▇▇██████████████
epoch_training_loss,█▅▅▄▃▃▃▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁
epoch_validation_accuracy,▁▂▄▄▅▆▆▆▇▇██████████████
epoch_validation_loss,█▆▅▄▄▃▃▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
learning_rate,██████████▂▂▂▂▂▂▂▂▂▂▁▁▁▁

0,1
batch_gradients_norm,5.23296
batch_idx,14999
batch_size,64
batch_training_loss,1.54199
epoch,23
epoch_training_accuracy,0.4155
epoch_training_loss,1.65839
epoch_validation_accuracy,0.43491
epoch_validation_loss,1.59434
learning_rate,1e-05


[34m[1mwandb[0m: Agent Starting Run: loclfeso with config:
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	momentum: 0.99
[34m[1mwandb[0m: 	optimizer: sgd_sam
[34m[1mwandb[0m: 	weight_decay: 0




Epoch 0: avg_train_loss=2.1186165175914766, avg_train_accuracy=0.261525, avg_val_loss=1.825413081741333, avg_val_accuracy=0.3454
Epoch 1: avg_train_loss=1.8983627789497375, avg_train_accuracy=0.319525, avg_val_loss=1.7585902500152588, avg_val_accuracy=0.3765
Epoch 2: avg_train_loss=1.8520708891391755, avg_train_accuracy=0.3406, avg_val_loss=1.6915560089111328, avg_val_accuracy=0.4013
Epoch 3: avg_train_loss=1.8195586098194123, avg_train_accuracy=0.355175, avg_val_loss=1.6632763549804688, avg_val_accuracy=0.4128
Epoch 4: avg_train_loss=1.7920364712238313, avg_train_accuracy=0.363275, avg_val_loss=1.649209031867981, avg_val_accuracy=0.417
Epoch 5: avg_train_loss=1.7757310054302216, avg_train_accuracy=0.36605, avg_val_loss=1.6212818183898925, avg_val_accuracy=0.4264
Epoch 6: avg_train_loss=1.7562786778926849, avg_train_accuracy=0.37545, avg_val_loss=1.6057978370666504, avg_val_accuracy=0.425
Epoch 7: avg_train_loss=1.7458308727264404, avg_train_accuracy=0.3787, avg_val_loss=1.621095071792

0,1
batch_gradients_norm,█▄▄▃▃▃▃▃▂▁▃▄▂▄▂▃▄▄▆▄▂▂▆▄▃▃▂▄▄▁▄▂▂▂▅▁▃▃▂▂
batch_idx,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
batch_size,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
batch_training_loss,▇▅▆▅▂▃▅▅▄▁▄▅▄▃▂▄▄▃▆▅▂▂█▃▃▃▃▅▂▁▄▂▃▂▅▁▃▃▂▃
epoch,▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇██
epoch_training_accuracy,▁▃▄▅▅▅▆▆▆▆▇▇▇▇██████████████
epoch_training_loss,█▅▄▄▃▃▃▃▃▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
epoch_validation_accuracy,▁▃▄▅▅▅▅▅▆▆▆▇▇▇▇██▇██████████
epoch_validation_loss,█▇▅▄▄▄▃▄▃▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
learning_rate,██████████▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁

0,1
batch_gradients_norm,3.13205
batch_idx,69999
batch_size,16
batch_training_loss,1.54785
epoch,27
epoch_training_accuracy,0.42352
epoch_training_loss,1.62213
epoch_validation_accuracy,0.4696
epoch_validation_loss,1.5122
learning_rate,1e-05


[34m[1mwandb[0m: Agent Starting Run: we29v6py with config:
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	optimizer: adagrad
[34m[1mwandb[0m: 	weight_decay: 0.01


Epoch 0: avg_train_loss=2.158554561710358, avg_train_accuracy=0.216275, avg_val_loss=2.027114991378784, avg_val_accuracy=0.297
Epoch 1: avg_train_loss=2.0888321989059446, avg_train_accuracy=0.248425, avg_val_loss=1.9910029399871827, avg_val_accuracy=0.3105
Epoch 2: avg_train_loss=2.0660802223205565, avg_train_accuracy=0.25675, avg_val_loss=1.9681295053482055, avg_val_accuracy=0.3207
Epoch 3: avg_train_loss=2.049905586576462, avg_train_accuracy=0.265375, avg_val_loss=1.9521289520263672, avg_val_accuracy=0.3255
Epoch 4: avg_train_loss=2.034146576356888, avg_train_accuracy=0.274125, avg_val_loss=1.9396829616546631, avg_val_accuracy=0.3325
Epoch 5: avg_train_loss=2.0265257658958435, avg_train_accuracy=0.27655, avg_val_loss=1.9249458200454712, avg_val_accuracy=0.3351
Epoch 6: avg_train_loss=2.0161430682659147, avg_train_accuracy=0.2794, avg_val_loss=1.9189359958648682, avg_val_accuracy=0.3391
Epoch 7: avg_train_loss=2.0093755424976347, avg_train_accuracy=0.282675, avg_val_loss=1.91711791229

VBox(children=(Label(value='0.001 MB of 0.012 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.083716…

0,1
batch_gradients_norm,█▅▆▂▆▂▃▃▄▃▄▃▃▆▄▅▇▅▅▄▄▃▁▆▄▅▇▄▂▄▅▅▄▅▅▄▇▆▂▅
batch_idx,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
batch_size,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
batch_training_loss,█▆▆▂▇▃▄▅▄▂▅▄▃▇▃▄█▄▄▄▃▃▁▄▃▅▆▃▂▅▄▅▄▄▅▄▅▆▂▅
epoch,▁▁▂▂▃▃▃▄▄▅▅▅▆▆▆▇▇██
epoch_training_accuracy,▁▄▅▆▆▇▇▇▇██████████
epoch_training_loss,█▅▄▃▃▃▂▂▂▁▁▁▁▁▁▁▁▁▁
epoch_validation_accuracy,▁▃▄▅▆▆▇▇▇█▇█████▇██
epoch_validation_loss,█▆▅▄▃▃▂▂▂▁▁▁▁▁▁▁▁▁▁
learning_rate,██████████▁▁▁▁▁▁▁▁▁

0,1
batch_gradients_norm,9.45903
batch_idx,47499
batch_size,16
batch_training_loss,2.09304
epoch,18
epoch_training_accuracy,0.28928
epoch_training_loss,1.99268
epoch_validation_accuracy,0.3479
epoch_validation_loss,1.89752
learning_rate,1e-05


[34m[1mwandb[0m: Agent Starting Run: 2h81smp6 with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	optimizer: sgd_sam
[34m[1mwandb[0m: 	weight_decay: 0


Epoch 0: avg_train_loss=1.9551361936569214, avg_train_accuracy=0.303175, avg_val_loss=1.7792913610008871, avg_val_accuracy=0.3718152866242038
Epoch 1: avg_train_loss=1.8220781675338744, avg_train_accuracy=0.34985, avg_val_loss=1.7112187951993032, avg_val_accuracy=0.39331210191082805
Epoch 2: avg_train_loss=1.7723300443649292, avg_train_accuracy=0.3639, avg_val_loss=1.6849189207052728, avg_val_accuracy=0.3942078025477707
Epoch 3: avg_train_loss=1.7421503929138185, avg_train_accuracy=0.376025, avg_val_loss=1.641846105551264, avg_val_accuracy=0.4166003184713376
Epoch 4: avg_train_loss=1.7161072061538696, avg_train_accuracy=0.3881, avg_val_loss=1.6300936785473186, avg_val_accuracy=0.4151074840764331
Epoch 5: avg_train_loss=1.692825012779236, avg_train_accuracy=0.397575, avg_val_loss=1.612406512734237, avg_val_accuracy=0.4224721337579618
Epoch 6: avg_train_loss=1.6787788934707641, avg_train_accuracy=0.401575, avg_val_loss=1.5952296674631203, avg_val_accuracy=0.43341958598726116
Epoch 7: avg

0,1
batch_gradients_norm,█▅▄▄▅▅▄▅▄▅▄▃▅▆▅▅▃▄▄▃▂▄▃▅▃▂▃▂▄▃▁▂▃▃▂▄▂▃▁▂
batch_idx,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
batch_size,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
batch_training_loss,█▅▄▅▆▄▂▅▄▄▃▃▅▅▄▄▄▅▄▄▄▅▃▃▃▂▃▃▃▄▁▃▃▄▄▄▃▃▃▄
epoch,▁▁▂▂▂▂▃▃▃▄▄▄▅▅▅▅▆▆▆▇▇▇▇██
epoch_training_accuracy,▁▄▄▅▆▆▆▇▇▇███████████████
epoch_training_loss,█▅▄▄▃▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
epoch_validation_accuracy,▁▃▃▅▅▅▆▇▇▇███▇██▇██████▇█
epoch_validation_loss,█▆▅▄▃▃▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
learning_rate,██████████▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁

0,1
batch_gradients_norm,5.43106
batch_idx,15624
batch_size,64
batch_training_loss,1.56588
epoch,24
epoch_training_accuracy,0.42605
epoch_training_loss,1.62359
epoch_validation_accuracy,0.44875
epoch_validation_loss,1.56216
learning_rate,1e-05


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: vnb7kghp with config:
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	weight_decay: 0.0001


Epoch 0: avg_train_loss=2.110703943729401, avg_train_accuracy=0.234725, avg_val_loss=1.9248292343139648, avg_val_accuracy=0.3292
Epoch 1: avg_train_loss=1.9881125504016877, avg_train_accuracy=0.287975, avg_val_loss=1.8480755693435669, avg_val_accuracy=0.3524
Epoch 2: avg_train_loss=1.9435253315448762, avg_train_accuracy=0.3046, avg_val_loss=1.8134901176452636, avg_val_accuracy=0.3644
Epoch 3: avg_train_loss=1.9112230302333832, avg_train_accuracy=0.3161, avg_val_loss=1.778035392189026, avg_val_accuracy=0.3765
Epoch 4: avg_train_loss=1.8891508202552796, avg_train_accuracy=0.32435, avg_val_loss=1.7604339281082153, avg_val_accuracy=0.3795
Epoch 5: avg_train_loss=1.8671650192260743, avg_train_accuracy=0.334375, avg_val_loss=1.738027557182312, avg_val_accuracy=0.3877
Epoch 6: avg_train_loss=1.8513435415744781, avg_train_accuracy=0.340525, avg_val_loss=1.718791558074951, avg_val_accuracy=0.3939
Epoch 7: avg_train_loss=1.8430246886730195, avg_train_accuracy=0.34305, avg_val_loss=1.714870619392

VBox(children=(Label(value='0.001 MB of 0.014 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.072568…

0,1
batch_gradients_norm,▆▄▄▄▄▆▄▃▄▆▅▃▇▂▅▅▂▁▃▄▄▂▇▇▆▃▂▆▄▄▄▆▇▇▇▅█▇▄▄
batch_idx,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
batch_size,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
batch_training_loss,▆▅▂▄▂█▅▃▃▆▄▄▅▁▅▂▁▁▂▂▂▂▃▆▄▂▂▅▃▃▃▄▅▅▅▂▅▄▂▁
epoch,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
epoch_training_accuracy,▁▄▅▅▆▆▇▇▇▇████████████████████
epoch_training_loss,█▅▄▄▃▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
epoch_validation_accuracy,▁▃▄▅▅▆▆▆▇▇▇▇▇▇▇█████████████▇▇
epoch_validation_loss,█▆▅▄▄▃▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
learning_rate,██████████▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁

0,1
batch_gradients_norm,10.48486
batch_idx,74999
batch_size,16
batch_training_loss,1.81315
epoch,29
epoch_training_accuracy,0.36145
epoch_training_loss,1.78901
epoch_validation_accuracy,0.4106
epoch_validation_loss,1.66833
learning_rate,0.0


[34m[1mwandb[0m: Agent Starting Run: 00lcb8hx with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	weight_decay: 0


Epoch 0: avg_train_loss=2.1457447368621825, avg_train_accuracy=0.218475, avg_val_loss=1.9845556611070236, avg_val_accuracy=0.3036142172523962
Epoch 1: avg_train_loss=2.0188039816856382, avg_train_accuracy=0.277975, avg_val_loss=1.9048102998885865, avg_val_accuracy=0.336361821086262
Epoch 2: avg_train_loss=1.9704465908050537, avg_train_accuracy=0.295975, avg_val_loss=1.8669478203922796, avg_val_accuracy=0.3462460063897764
Epoch 3: avg_train_loss=1.9371714337348938, avg_train_accuracy=0.309175, avg_val_loss=1.8299255767188514, avg_val_accuracy=0.35932507987220447
Epoch 4: avg_train_loss=1.9116581741333007, avg_train_accuracy=0.316475, avg_val_loss=1.8133794233060112, avg_val_accuracy=0.365814696485623
Epoch 5: avg_train_loss=1.890558503627777, avg_train_accuracy=0.3253, avg_val_loss=1.7897772282457198, avg_val_accuracy=0.3735023961661342
Epoch 6: avg_train_loss=1.8763642684936523, avg_train_accuracy=0.3355, avg_val_loss=1.7758168206809046, avg_val_accuracy=0.3778953674121406
Epoch 7: avg

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
batch_gradients_norm,▆▇▅▄▄▃▇▅▅▃▄▄▃▄▅▃▅▃█▄▄▄▇▂▂▆▅▁█▆▂▅▇▄▄▇▃▆▇▆
batch_idx,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
batch_size,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
batch_training_loss,▇█▆▅▄▃▅▆▆▄▅▅▄▅▃▃▆▂▆▃▄▃▆▂▂▅▄▁▅▆▄▅▅▃▃▄▃▇▇▅
epoch,▁▁▂▂▂▂▃▃▃▄▄▄▅▅▅▅▆▆▆▇▇▇▇██
epoch_training_accuracy,▁▄▅▆▆▆▇▇▇▇███████████████
epoch_training_loss,█▅▄▄▃▃▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
epoch_validation_accuracy,▁▃▄▅▆▆▇▇▇████████████████
epoch_validation_loss,█▆▅▄▃▃▃▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
learning_rate,██████████▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁

0,1
batch_gradients_norm,7.06122
batch_idx,31249
batch_size,32
batch_training_loss,1.81015
epoch,24
epoch_training_accuracy,0.35615
epoch_training_loss,1.817
epoch_validation_accuracy,0.39457
epoch_validation_loss,1.7231
learning_rate,0.0


[34m[1mwandb[0m: Agent Starting Run: zkg1xluq with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	weight_decay: 0.01


Epoch 0: avg_train_loss=2.204359808540344, avg_train_accuracy=0.1885, avg_val_loss=2.0442419150832354, avg_val_accuracy=0.28164808917197454
Epoch 1: avg_train_loss=2.075755178642273, avg_train_accuracy=0.25275, avg_val_loss=1.9729314131341922, avg_val_accuracy=0.3088176751592357
Epoch 2: avg_train_loss=2.0227744661331175, avg_train_accuracy=0.27685, avg_val_loss=1.9330365528726274, avg_val_accuracy=0.3271297770700637
Epoch 3: avg_train_loss=1.9868614681243897, avg_train_accuracy=0.289025, avg_val_loss=1.9006171993389251, avg_val_accuracy=0.33638535031847133
Epoch 4: avg_train_loss=1.95922592086792, avg_train_accuracy=0.300675, avg_val_loss=1.877625387185698, avg_val_accuracy=0.3453423566878981
Epoch 5: avg_train_loss=1.9445637340545654, avg_train_accuracy=0.304525, avg_val_loss=1.8611131602791464, avg_val_accuracy=0.35071656050955413
Epoch 6: avg_train_loss=1.9248072639465332, avg_train_accuracy=0.314325, avg_val_loss=1.8440970159639978, avg_val_accuracy=0.35529458598726116
Epoch 7: av

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
batch_gradients_norm,█▇▅▆▆▅▄▄▄▃▆▅▇▇▅▄▁▅▄▅▃▃▄▅▆▇▅▃▇▅▄▆▄▆▅▄▆▄▅▆
batch_idx,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
batch_size,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
batch_training_loss,█▇▆▅▅▅▃▃▃▂█▅▅▅▃▃▃▃▃▃▂▂▅▅▄▆▄▁▅▆▄▄▃▄▂▂▆▃▃▂
epoch,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
epoch_training_accuracy,▁▄▅▆▆▆▇▇▇█████████████████████
epoch_training_loss,█▅▄▄▃▃▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
epoch_validation_accuracy,▁▃▄▅▆▆▆▇▇█████████████████████
epoch_validation_loss,█▆▅▄▃▃▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
learning_rate,██████████▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁

0,1
batch_gradients_norm,4.25645
batch_idx,18749
batch_size,64
batch_training_loss,1.68911
epoch,29
epoch_training_accuracy,0.33682
epoch_training_loss,1.86658
epoch_validation_accuracy,0.3758
epoch_validation_loss,1.79421
learning_rate,0.0


[34m[1mwandb[0m: Agent Starting Run: xu4fual7 with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	weight_decay: 0


Epoch 0: avg_train_loss=2.0084440118789675, avg_train_accuracy=0.28165, avg_val_loss=1.8189706194932294, avg_val_accuracy=0.3635549363057325
Epoch 1: avg_train_loss=1.8722025884628295, avg_train_accuracy=0.33665, avg_val_loss=1.7549172609475008, avg_val_accuracy=0.3825636942675159
Epoch 2: avg_train_loss=1.8173207098007202, avg_train_accuracy=0.3538, avg_val_loss=1.7097824773970682, avg_val_accuracy=0.39599920382165604
Epoch 3: avg_train_loss=1.7826253116607667, avg_train_accuracy=0.36695, avg_val_loss=1.6756563847232018, avg_val_accuracy=0.40356289808917195
Epoch 4: avg_train_loss=1.7565526208877564, avg_train_accuracy=0.374225, avg_val_loss=1.6481646071573732, avg_val_accuracy=0.41132563694267515
Epoch 5: avg_train_loss=1.7360044303894042, avg_train_accuracy=0.38245, avg_val_loss=1.627909007345795, avg_val_accuracy=0.4215764331210191
Epoch 6: avg_train_loss=1.7188143585205078, avg_train_accuracy=0.386875, avg_val_loss=1.610686918732467, avg_val_accuracy=0.431031050955414
Epoch 7: avg

0,1
batch_gradients_norm,▄▁▃▂▄▂▁▇▆▅▄▅▆▅▅▁▅▄▇▄▁▅▇▃▄▇▇▄▃▇▅▆▆▇▇▄▃▆▆█
batch_idx,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
batch_size,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
batch_training_loss,▇▆▇▄▇▄▄▆▆▅▃▅▆▅▃▃▅▄▅▄▁▃▄▄▄█▆▄▃▂▆▆▆▇▆▄▃▅▃▅
epoch,▁▁▂▂▂▂▃▃▃▃▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇██
epoch_training_accuracy,▁▄▅▅▆▆▆▇▇▇▇████████████████
epoch_training_loss,█▆▅▄▃▃▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
epoch_validation_accuracy,▁▂▃▄▅▅▆▆▇▇▇▇▇▇▇▇███████████
epoch_validation_loss,█▆▅▄▄▃▃▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
learning_rate,██████████▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁

0,1
batch_gradients_norm,4.34988
batch_idx,16874
batch_size,64
batch_training_loss,1.47861
epoch,26
epoch_training_accuracy,0.42092
epoch_training_loss,1.62888
epoch_validation_accuracy,0.45382
epoch_validation_loss,1.53829
learning_rate,1e-05


[34m[1mwandb[0m: Agent Starting Run: qqbhh1dd with config:
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	optimizer: adagrad
[34m[1mwandb[0m: 	weight_decay: 0.0001


Epoch 0: avg_train_loss=2.1684671776771545, avg_train_accuracy=0.2109, avg_val_loss=2.0286991151809692, avg_val_accuracy=0.2937
Epoch 1: avg_train_loss=2.090413843536377, avg_train_accuracy=0.246575, avg_val_loss=1.9855188802719117, avg_val_accuracy=0.3117
Epoch 2: avg_train_loss=2.0632320981502534, avg_train_accuracy=0.25945, avg_val_loss=1.9624335815429688, avg_val_accuracy=0.3167
Epoch 3: avg_train_loss=2.047587134742737, avg_train_accuracy=0.26505, avg_val_loss=1.9465469173431396, avg_val_accuracy=0.3266
Epoch 4: avg_train_loss=2.034316085958481, avg_train_accuracy=0.272725, avg_val_loss=1.9391524225234986, avg_val_accuracy=0.3266
Epoch 5: avg_train_loss=2.0254062935829165, avg_train_accuracy=0.27235, avg_val_loss=1.923827307510376, avg_val_accuracy=0.332
Epoch 6: avg_train_loss=2.018606778240204, avg_train_accuracy=0.27735, avg_val_loss=1.9171840738296508, avg_val_accuracy=0.3339
Epoch 7: avg_train_loss=2.009051835155487, avg_train_accuracy=0.283325, avg_val_loss=1.910006291580200

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
batch_gradients_norm,▇▄▆█▃█▅▃▁▆▆▄▃▃▄▂▃▃▄▇▆▅▂▅▄▅▁▂▁▆▃▄▇▄▄▃▇▅▄▃
batch_idx,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
batch_size,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
batch_training_loss,▅▅▇▇▄█▆▄▂▅▆▄▃▃▄▄▃▄▄▅▅▃▃▄▃▅▁▃▂▅▄▅▇▄▃▄█▅▅▄
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
epoch_training_accuracy,▁▄▅▆▆▆▇▇██████▇█████
epoch_training_loss,█▅▄▃▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁
epoch_validation_accuracy,▁▃▄▅▅▆▆▇▇▇█▇▇█▇▇▇███
epoch_validation_loss,█▆▅▄▄▃▂▂▂▁▁▁▁▁▁▁▁▂▁▁
learning_rate,██████████▁▁▁▁▁▁▁▁▁▁

0,1
batch_gradients_norm,9.33021
batch_idx,49999
batch_size,16
batch_training_loss,1.88334
epoch,19
epoch_training_accuracy,0.28875
epoch_training_loss,1.98912
epoch_validation_accuracy,0.3417
epoch_validation_loss,1.88953
learning_rate,1e-05


[34m[1mwandb[0m: Agent Starting Run: hx2trfk3 with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	momentum: 0.99
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	weight_decay: 0.0001


Epoch 0: avg_train_loss=1.9332335603713988, avg_train_accuracy=0.30905, avg_val_loss=1.7693532591412782, avg_val_accuracy=0.3586783439490446
Epoch 1: avg_train_loss=1.8021872406005859, avg_train_accuracy=0.355125, avg_val_loss=1.6808897917437706, avg_val_accuracy=0.4002786624203822
Epoch 2: avg_train_loss=1.744268451690674, avg_train_accuracy=0.37615, avg_val_loss=1.6442828428973058, avg_val_accuracy=0.41033041401273884
Epoch 3: avg_train_loss=1.7135756076812745, avg_train_accuracy=0.3888, avg_val_loss=1.6222052893061547, avg_val_accuracy=0.41580414012738853
Epoch 4: avg_train_loss=1.6897655521392823, avg_train_accuracy=0.3984, avg_val_loss=1.6037163157371959, avg_val_accuracy=0.42744824840764334
Epoch 5: avg_train_loss=1.672332616043091, avg_train_accuracy=0.402675, avg_val_loss=1.6001504424271311, avg_val_accuracy=0.4231687898089172
Epoch 6: avg_train_loss=1.6590792999267578, avg_train_accuracy=0.410075, avg_val_loss=1.581830536483959, avg_val_accuracy=0.43670382165605093
Epoch 7: av

0,1
batch_gradients_norm,█▇▆▅▄▃▄▃▂▃▂▁▁▂▂▁▂▃▁▂▂▁▃▂▂▃▄▄▂▂▁▂▃▃▃▃▂▃▃▂
batch_idx,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
batch_size,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
batch_training_loss,▆█▆▅▆▄▅▆▃▇█▂▅▄▆▅▃▅▃▄▁▂▆▃▄▄▅▆▄▄▁▃▂▅▅▃▃▇▃▃
epoch,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
epoch_training_accuracy,▁▃▄▄▅▅▅▅▆▆▇▇▇▇▇▇▇█████████████
epoch_training_loss,█▆▅▅▄▄▄▄▃▃▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
epoch_validation_accuracy,▁▃▄▄▅▄▅▆▅▆▇▇▇▇████████████████
epoch_validation_loss,█▆▅▅▅▄▄▄▄▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
learning_rate,██████████▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁

0,1
batch_gradients_norm,1.02802
batch_idx,18749
batch_size,64
batch_training_loss,1.04939
epoch,29
epoch_training_accuracy,0.48052
epoch_training_loss,1.46837
epoch_validation_accuracy,0.49114
epoch_validation_loss,1.43712
learning_rate,1e-05


[34m[1mwandb[0m: Agent Starting Run: jtf3as10 with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	optimizer: sgd_sam
[34m[1mwandb[0m: 	weight_decay: 0.0001


Epoch 0: avg_train_loss=2.0853281734466553, avg_train_accuracy=0.246425, avg_val_loss=1.9287664746023288, avg_val_accuracy=0.3301154458598726
Epoch 1: avg_train_loss=1.9621783756256104, avg_train_accuracy=0.30075, avg_val_loss=1.8731529978430195, avg_val_accuracy=0.3484275477707006
Epoch 2: avg_train_loss=1.9152925886154175, avg_train_accuracy=0.3188, avg_val_loss=1.848520743619105, avg_val_accuracy=0.3560907643312102
Epoch 3: avg_train_loss=1.8908769289016725, avg_train_accuracy=0.33225, avg_val_loss=1.8285315553094172, avg_val_accuracy=0.3590764331210191
Epoch 4: avg_train_loss=1.8762400741577148, avg_train_accuracy=0.3345, avg_val_loss=1.8147555635233594, avg_val_accuracy=0.36673964968152867
Epoch 5: avg_train_loss=1.8599299215316774, avg_train_accuracy=0.3426, avg_val_loss=1.8087546514098052, avg_val_accuracy=0.3721138535031847
Epoch 6: avg_train_loss=1.8514777139663696, avg_train_accuracy=0.344325, avg_val_loss=1.8026174451135526, avg_val_accuracy=0.37420382165605093
Epoch 7: avg_

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
batch_gradients_norm,█▇█▆▆▅▅▆▄▄▅▄▅▄▄▄▄▄▄▄▃▃▃▂▃▃▂▃▃▂▂▂▂▁▂▁▂▁▁▁
batch_idx,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
batch_size,▁▁▁▁▁▁▁▁▁▁▁▁▁▁
batch_training_loss,█▇█▃▃▄▃▆▄▄▅▄▅▅▄▄▄▃▅▃▃▃▃▂▂▂▁▃▃▂▂▄▅▂▃▄▄▄▄▃
epoch,▁▂▂▃▃▄▄▅▅▆▆▇▇█
epoch_training_accuracy,▁▅▆▇▇▇▇███████
epoch_training_loss,█▅▃▃▂▂▁▁▁▁▁▂▂▂
epoch_validation_accuracy,▁▄▅▅▇▇█████▇▆▇
epoch_validation_loss,█▅▄▃▂▂▂▁▁▁▂▂▃▃
learning_rate,██████████▁▁▁▁

0,1
batch_gradients_norm,3.56556
batch_idx,8749
batch_size,64
batch_training_loss,2.07019
epoch,13
epoch_training_accuracy,0.34978
epoch_training_loss,1.86589
epoch_validation_accuracy,0.36793
epoch_validation_loss,1.83363
learning_rate,1e-05


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: epke2rin with config:
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0.01


Epoch 0: avg_train_loss=2.0906876187324523, avg_train_accuracy=0.2416, avg_val_loss=2.0440503578186036, avg_val_accuracy=0.267
Epoch 1: avg_train_loss=2.0815526272773743, avg_train_accuracy=0.245275, avg_val_loss=2.0205244909286497, avg_val_accuracy=0.2731
Epoch 2: avg_train_loss=2.0743685405731203, avg_train_accuracy=0.2481, avg_val_loss=2.0092275680541993, avg_val_accuracy=0.2876
Epoch 3: avg_train_loss=2.068874038696289, avg_train_accuracy=0.245175, avg_val_loss=1.9913550842285157, avg_val_accuracy=0.2958
Epoch 4: avg_train_loss=2.070186266040802, avg_train_accuracy=0.249025, avg_val_loss=1.9939391542434692, avg_val_accuracy=0.2932
Epoch 5: avg_train_loss=2.0693383916378023, avg_train_accuracy=0.2501, avg_val_loss=1.9910883670806885, avg_val_accuracy=0.3
Epoch 6: avg_train_loss=2.066299139070511, avg_train_accuracy=0.24755, avg_val_loss=1.9958801200866698, avg_val_accuracy=0.2919
Epoch 7: avg_train_loss=2.0653687777996064, avg_train_accuracy=0.2525, avg_val_loss=1.9959843992233277, 