In [1]:
import torch
import torchvision
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.tensorboard import SummaryWriter
from network import Network

# Creating Variables For Our Hyperparameters

In [31]:
# Baseline hyperparameters shared by the cells below.
learning_rate = 0.01  # step size passed to the optimizer
batch_size = 100      # number of samples per mini-batch
shuffle = True        # shuffle flag for data loading

In [3]:
# Instantiate the model, then hand its parameters to Adam using the configured learning rate.
network = Network()
optimizer = torch.optim.Adam(network.parameters(), lr=learning_rate)

In [32]:
# Fashion-MNIST training split, stored under ./data, with every image converted to a tensor.
# download=True fetches the files only when they are not already present in `root`,
# so this cell also works on a machine that has never downloaded the dataset.
train_set = torchvision.datasets.FashionMNIST(
    root='./data',
    train=True,
    download=True,
    transform=torchvision.transforms.Compose([
        torchvision.transforms.ToTensor()
    ])
)

In [5]:
# Serve the training set in mini-batches of `batch_size`, in dataset order (no shuffling).
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=False)

In [6]:
# TensorBoard writer for this run; the comment string records the hyperparameters in use.
tb = SummaryWriter(
    comment=f'batch_size={batch_size} lr={learning_rate}'
)

# Calculate Loss With Different Batch Sizes
Since we'll be varying our batch sizes now, we'll need to make a change to the way we are calculating and accumulating
the loss. Instead of just summing the loss returned by the loss function, we'll adjust it to account for the batch size.

    total_loss += loss.item() * batch_size

Why do this? The cross_entropy loss function averages the loss values that are produced by the batch and then returns
this average loss. This is why we need to account for the batch size.

There is a parameter that the cross_entropy function accepts called reduction that we could also use.

The reduction parameter optionally accepts a string as an argument. This parameter specifies the reduction to apply to
the output of the loss function.
    1.  'none' - no reduction will be applied.
    2.  'mean' - the sum of the output will be divided by the number of elements in the output.
    3.  'sum' - the output will be summed.

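  Note that the default is 'mean'. This is why loss.item() * batch_size works, as long as every batch contains exactly
  batch_size samples; for a shorter final batch, multiply by the actual number of samples in that batch instead.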

# Experimenting With Hyperparameter Values
All we need to do is create some lists and some loops, and we can run the code and sit back and wait for all the
combinations to run.

# Parameter Lists

In [7]:
# Candidate values for the grid search: every batch size is paired with every learning rate.
batch_size_list = [10, 100, 1_000, 10_000]
learning_rate_list = [1e-2, 1e-3, 1e-4, 1e-5]

In [8]:
# Grid search: train for 5 epochs with every (batch_size, learning_rate) combination and
# log each combination to its own TensorBoard run.
for batch_size in batch_size_list:
    for learning_rate in learning_rate_list:
        # Start every combination from a newly constructed network. Reusing one network for
        # the whole sweep would let each run continue from the weights left by the previous
        # run, so the runs would not be comparable.
        network = Network()
        optimizer = torch.optim.Adam(
            params=network.parameters(),
            lr=learning_rate
        )
        train_loader = DataLoader(
            dataset=train_set,
            batch_size=batch_size,
            shuffle=False
        )

        # One sample batch is used to log an image grid and the model graph.
        images, labels = next(
            iter(train_loader)
        )
        grid = torchvision.utils.make_grid(images)

        comment = f'batch_size={batch_size} lr={learning_rate}'
        tb = SummaryWriter(comment=comment)
        # try/finally guarantees the writer is closed even if training raises part-way through.
        try:
            tb.add_image('images', grid)
            tb.add_graph(network, images)

            for epoch in range(5):
                total_loss_per_epoch = 0
                total_correct_per_epoch = 0
                for batch in train_loader:
                    images, labels = batch # 1. Get Batch
                    predictions = network(images) # 2. Pass Batch to Network

                    loss = torch.nn.functional.cross_entropy(
                        input=predictions,
                        target=labels,
                    ) # 3. Calculate loss function

                    optimizer.zero_grad() # 4A. Zero out the Gradients
                    loss.backward() # 4B. Calculate the Gradients
                    optimizer.step() # 5. Update the Weights

                    # cross_entropy returns the batch mean by default, so scale it back up by
                    # the number of samples actually in this batch (the last batch may be
                    # shorter than batch_size).
                    total_loss_per_epoch += loss.item() * labels.size(0)
                    total_correct_per_epoch += predictions.argmax(dim=1).eq(labels).sum().item()

                accuracy_per_epoch = total_correct_per_epoch / len(train_set)
                tb.add_scalar('Loss', total_loss_per_epoch, epoch)
                tb.add_scalar('Correct', total_correct_per_epoch, epoch)
                tb.add_scalar('Accuracy', accuracy_per_epoch, epoch)

                for name, param in network.named_parameters():
                    tb.add_histogram(name, param, epoch)

                print('Epoch: {},'
                      '\nLoss: {},'
                      '\nCorrect: {},'
                      '\nAccuracy: {}'
                      '\n'.format(
                    epoch+1, total_loss_per_epoch, total_correct_per_epoch, accuracy_per_epoch
                      )
                )
        finally:
            tb.close()

Epoch: 1,
Loss: 38609.35492885299,
Correct: 45213,
Accuracy: 0.75355

Epoch: 2,
Loss: 33503.85079354048,
Correct: 47448,
Accuracy: 0.7908

Epoch: 3,
Loss: 33265.30728791375,
Correct: 47552,
Accuracy: 0.7925333333333333

Epoch: 4,
Loss: 32964.24246055074,
Correct: 47606,
Accuracy: 0.7934333333333333

Epoch: 5,
Loss: 32869.254753100686,
Correct: 47877,
Accuracy: 0.79795

Epoch: 1,
Loss: 34610.122762969695,
Correct: 46954,
Accuracy: 0.7825666666666666

Epoch: 2,
Loss: 23130.320638203993,
Correct: 51441,
Accuracy: 0.85735

Epoch: 3,
Loss: 20049.986519434024,
Correct: 52562,
Accuracy: 0.8760333333333333

Epoch: 4,
Loss: 18439.105176299345,
Correct: 53104,
Accuracy: 0.8850666666666667

Epoch: 5,
Loss: 17286.888727564365,
Correct: 53593,
Accuracy: 0.8932166666666667

Epoch: 1,
Loss: 50552.06475857645,
Correct: 41166,
Accuracy: 0.6861

Epoch: 2,
Loss: 35434.13332019001,
Correct: 46255,
Accuracy: 0.7709166666666667

Epoch: 3,
Loss: 31554.784537684172,
Correct: 47791,
Accuracy: 0.796516666666666

# Adding More Hyperparameters Without Nesting
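Nesting one more loop for every additional hyperparameter quickly becomes hard to read and maintain. There is a
solution: we can create a set of parameters for each run, and package all of them up in a single iterable.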

If we have a list of parameters, we can package them up into a set for each of our runs using the Cartesian product.
For this we'll use the product function from the itertools library.

In [9]:
from itertools import product

In [14]:
# Reset the single-run defaults before building the parameter grid.
learning_rate, batch_size, shuffle = 0.01, 100, True

In [15]:
# Search space for the sweep: hyperparameter name -> list of values to try.
# The keys are string names rather than the current values of the `learning_rate`,
# `batch_size` and `shuffle` variables. Value-based keys change whenever those variables are
# rebound (for example by a loop) and can silently collide, because True == 1 and False == 0
# when used as dictionary keys.
parameters = {
    'learning_rate': [0.01, 0.001],
    'batch_size': [100, 1000],
    'shuffle': [True, False],
}

In [16]:
# Collect the value lists, in dictionary order, ready to be unpacked into product().
param_values = list(parameters.values())

In [17]:
# Preview every hyperparameter combination produced by the Cartesian product.
for learning_rate, batch_size, shuffle in product(*param_values):
    print(f'{learning_rate} {batch_size} {shuffle}')

0.01 100 True
0.01 100 False
0.01 1000 True
0.01 1000 False
0.001 100 True
0.001 100 False
0.001 1000 True
0.001 1000 False


In [20]:
# Sweep over the Cartesian product of the hyperparameter lists; every combination gets its own
# network, optimizer, data loader and TensorBoard run, and is trained for 5 epochs.
for learning_rate, batch_size, shuffle in product(*param_values):
    train_loader = DataLoader(
        dataset=train_set,
        batch_size=batch_size,
        shuffle=shuffle
    )

    network = Network()

    optimizer = torch.optim.Adam(
        params=network.parameters(),
        lr=learning_rate
    )

    # One sample batch is used to log an image grid and the model graph.
    images, labels = next(iter(train_loader))

    comment = f'batch_size {batch_size} lr {learning_rate} shuffle {shuffle}'
    tb = SummaryWriter(comment=comment)
    # try/finally guarantees the writer is closed even if training raises part-way through.
    try:
        grid = torchvision.utils.make_grid(images)
        tb.add_image('images', grid)
        tb.add_graph(network, images)

        for epoch in range(5):
            total_loss_per_epoch = 0
            total_correct_per_epoch = 0
            for batch in train_loader:
                images, labels = batch # 1. Get Batch
                predictions = network(images) # 2. Pass Images into Network

                loss = torch.nn.functional.cross_entropy(
                    input=predictions,
                    target=labels,
                    reduction='mean'
                ) # 3. Calculate Loss Function

                optimizer.zero_grad() # 4A. Zero out the Gradients
                loss.backward() # 4B. Calculate Gradients
                optimizer.step() # 5. Update the weights

                # reduction='mean' yields the per-sample average for this batch, so multiply by
                # the number of samples actually in the batch; the nominal batch_size would
                # over-count whenever the final batch is shorter.
                total_loss_per_epoch += loss.item() * labels.size(0)
                total_correct_per_epoch += predictions.argmax(dim=1).eq(labels).sum().item()

            accuracy_per_epoch = total_correct_per_epoch / len(train_set)

            tb.add_scalar('Loss', total_loss_per_epoch, epoch)
            tb.add_scalar('Correct', total_correct_per_epoch, epoch)
            tb.add_scalar('Accuracy', accuracy_per_epoch, epoch)

            # Log the weights and their most recent gradients for every parameter.
            for name, param in network.named_parameters():
                tb.add_histogram(name, param, epoch)
                tb.add_histogram(f'{name}.grad', param.grad, epoch)

            print('Epoch: {}, '
                  '\nLoss: {},'
                  '\nAccuracy: {}'.format(
                epoch + 1,
                total_loss_per_epoch,
                accuracy_per_epoch
                  ))
    finally:
        tb.close() # runs after all epochs, and also if an epoch fails

RuntimeError: grad can be implicitly created only for scalar outputs

# Using The RunBuilder Class

In [3]:
from collections import OrderedDict, namedtuple
from itertools import product

This product() function is the one we saw last time that computes a Cartesian product given multiple list inputs.

In [17]:
class RunBuilder:
    """Expands a dictionary of hyperparameter options into one record per combination."""

    @staticmethod
    def get_runs(param_dict):
        """Return a list of ``Run`` namedtuples, one for each combination of values.

        ``param_dict`` maps each hyperparameter name to the values to try. The names become
        the fields of ``Run``; the combinations are generated by ``itertools.product`` over
        the dictionary's values, in that order.
        """
        Run = namedtuple('Run', param_dict.keys())
        return [Run(*combination) for combination in product(*param_dict.values())]

In [18]:
# Hyperparameter options for the RunBuilder sweep, kept in insertion order.
# A shuffle option ([True, False]) is currently left out of the sweep.
params = OrderedDict([
    ('learning_rate', [.01, 0.0001]),
    ('batch_size', [100, 1000]),
])

In [19]:
# Expand the parameter options into one Run per combination.
runs = RunBuilder.get_runs(params)

In [20]:
# Show every generated run.
print(runs)

[Run(learning_rate=0.01, batch_size=100, shuffle=True), Run(learning_rate=0.01, batch_size=100, shuffle=False), Run(learning_rate=0.01, batch_size=1000, shuffle=True), Run(learning_rate=0.01, batch_size=1000, shuffle=False), Run(learning_rate=0.0001, batch_size=100, shuffle=True), Run(learning_rate=0.0001, batch_size=100, shuffle=False), Run(learning_rate=0.0001, batch_size=1000, shuffle=True), Run(learning_rate=0.0001, batch_size=1000, shuffle=False)]


In [22]:
# Inspect a single run; each one is a namedtuple whose fields are the hyperparameter names.
print('First run:', runs[0])

First run: Run(learning_rate=0.01, batch_size=100, shuffle=True)


In [27]:
# The dictionary's keys supply the Run field names; its values supply the options to combine.
print(params.keys())
print(params.values())

odict_keys(['learning_rate', 'batch_size', 'shuffle'])
odict_values([[0.01, 0.0001], [100, 1000], [True, False]])


Before:

    for lr, batch_size, shuffle in product(*param_values):
        comment = f' batch_size={batch_size} lr={lr} shuffle={shuffle}'
        # Training process given the set of parameters

After:

    for run in RunBuilder.get_runs(params):
        comment = f'-{run}'
        # Training process given the set of parameters

In [35]:
# Train once per Run produced by the RunBuilder, logging each run to its own TensorBoard writer.
for run in RunBuilder.get_runs(params):
    network = Network()

    # Read the hyperparameters from the current run. Using the module-level `batch_size` and
    # `learning_rate` names here would train every run with whatever values those globals
    # happened to hold, so the runs would differ only in their labels.
    train_loader = DataLoader(
        dataset=train_set,
        batch_size=run.batch_size,
        shuffle=getattr(run, 'shuffle', True), # shuffle unless the sweep specifies otherwise
        drop_last=True # drops the last batch if it is smaller than the others
    )
    optimizer = torch.optim.Adam(
        params=network.parameters(),
        lr=run.learning_rate
    )

    # One sample batch is used to log an image grid and the model graph.
    images, labels = next(iter(train_loader))
    grid = torchvision.utils.make_grid(images)

    comment = f'-{run}'
    tb = SummaryWriter(comment=comment)
    # try/finally guarantees the writer is closed even if training raises part-way through.
    try:
        tb.add_image('images', grid)
        tb.add_graph(network, images)

        for epoch in range(5):
            total_loss_per_epoch = 0
            total_correct_per_epoch = 0
            total_samples_per_epoch = 0
            for batch in train_loader:
                images, labels = batch
                predictions = network(images)
                loss = torch.nn.functional.cross_entropy(
                    input=predictions,
                    target=labels,
                    reduction='mean'
                )

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                # reduction='mean' returns the per-sample average, so scale by the number of
                # samples actually in this batch to accumulate a summed loss.
                samples_in_batch = labels.size(0)
                total_loss_per_epoch += loss.item() * samples_in_batch
                total_correct_per_epoch += predictions.argmax(dim=1).eq(labels).sum().item()
                total_samples_per_epoch += samples_in_batch

            # With drop_last=True some samples may never be seen in an epoch, so divide by the
            # number of samples actually evaluated rather than by len(train_set).
            accuracy_per_epoch = total_correct_per_epoch / total_samples_per_epoch

            tb.add_scalar('loss', total_loss_per_epoch, epoch)
            tb.add_scalar('correct', total_correct_per_epoch, epoch)
            tb.add_scalar('accuracy', accuracy_per_epoch, epoch)

            # Log the weights and their most recent gradients for every parameter.
            for name, param in network.named_parameters():
                tb.add_histogram(name, param, epoch)
                tb.add_histogram(f'{name}.grad', param.grad, epoch)

            print(f'Epoch: {epoch}, '
                  f'Loss: {total_loss_per_epoch}, '
                  f'Accuracy: {accuracy_per_epoch}\n')
    finally:
        tb.close()

Epoch: 0, Loss: 32700.03606379032, Accuracy: 0.7940333333333334



MemoryError: Unable to allocate 1.09 MiB for an array with shape (392, 242, 3) and data type float32