In [142]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import torchvision
import torchvision.transforms as transforms

# Let printed tensors use up to 120 characters per line before wrapping.
torch.set_printoptions(linewidth=120)
# Make gradient tracking explicit; autograd is already enabled unless it was turned off earlier.
torch.set_grad_enabled(True)    #True by default

<torch.autograd.grad_mode.set_grad_enabled at 0x27d460e1208>

In [143]:
class Network(nn.Module):
    """Small convolutional classifier: two conv/pool stages followed by three linear layers.

    The first linear layer expects 12 feature maps of size 4x4, which is what
    the two 5x5 convolutions and two 2x2 max-pools produce for a single-channel
    28x28 image. The last layer emits 10 raw scores (logits) per sample; no
    softmax is applied here.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5)
        self.conv2 = nn.Conv2d(in_channels=6, out_channels=12, kernel_size=5)

        self.fc1 = nn.Linear(in_features=12*4*4, out_features=120)
        self.fc2 = nn.Linear(in_features=120, out_features=60)
        self.out = nn.Linear(in_features=60, out_features=10)

    def forward(self, t):
        """Run the forward pass.

        Args:
            t: image tensor with one channel, e.g. shape (batch, 1, 28, 28).

        Returns:
            Tensor with one row of 10 unnormalised class scores per input image.

        Raises:
            ValueError: if the feature maps left after the second pooling stage
                are not 12x4x4, i.e. the input images have an unsupported size.
        """
        t = F.relu(self.conv1(t))
        t = F.max_pool2d(t, kernel_size=2, stride=2)

        t = F.relu(self.conv2(t))
        t = F.max_pool2d(t, kernel_size=2, stride=2)

        # reshape(-1, 192) would happily regroup elements across samples when the
        # per-sample feature count is not 192 (e.g. 16 images of 32x32 become 25
        # rows), so reject mis-sized inputs instead of returning a wrong batch.
        if tuple(t.shape[-3:]) != (12, 4, 4):
            raise ValueError(
                "expected 12x4x4 feature maps before the linear layers, got "
                f"{tuple(t.shape[-3:])}; check the input image size"
            )

        t = t.reshape(-1, 12*4*4)
        t = F.relu(self.fc1(t))
        t = F.relu(self.fc2(t))
        t = self.out(t)

        return t

In [144]:
# Create a fresh, untrained instance of the network defined above.
network = Network()

In [145]:
def num_correct(preds, labels):
    """Count the rows of ``preds`` whose highest-scoring column equals the matching entry of ``labels``.

    The count is returned as a plain Python number rather than a tensor.
    """
    predicted_classes = torch.argmax(preds, dim=1)
    hits = torch.eq(predicted_classes, labels)
    return hits.sum().item()

In [146]:
# FashionMNIST training split, downloaded to ./data/FashionMNIST if it is not
# already there; every image is converted to a tensor when it is accessed.
train_set = torchvision.datasets.FashionMNIST(
    root='./data/FashionMNIST',
    train=True,
    download=True,
    transform=transforms.Compose([transforms.ToTensor()]),
)

In [147]:
# Wrap the training set in a loader that yields batches of 100 samples.
train_loader = torch.utils.data.DataLoader(train_set, batch_size=100)
# Take only the first batch; the rest of the walkthrough reuses this one batch.
batch = next(iter(train_loader))
# Each batch unpacks into an images tensor and the matching labels tensor.
images, labels = batch

In [148]:
# Forward pass: one row of 10 class scores per image in the batch.
preds = network(images)
# Cross-entropy between the raw scores and the true labels, as a scalar tensor.
loss = F.cross_entropy(preds, labels)

In [149]:
# Display the loss tensor itself; its repr also shows the grad_fn that ties it to the autograd graph.
loss

tensor(2.2931, grad_fn=<NllLossBackward>)

In [150]:
# .item() pulls the loss out of the tensor as a plain Python float.
loss.item()

2.293059825897217

here the cross entropy function from the PyTorch functional library has been used to calculate the loss from the prediction tensor for the batch and the batch's true labels

the loss measures how far the network's predictions are from the true labels of the training data, so it should be as low as possible

In [151]:
# No backward pass has run yet, so conv1's weight gradient is still None.
print(network.conv1.weight.grad)

None


In [152]:
# Backpropagate from the loss: fills the .grad attribute of every network parameter.
loss.backward()

In [153]:
# After backward() the gradient exists; its shape is (out_channels, in_channels, kernel, kernel), like the conv1 weights.
network.conv1.weight.grad.shape

torch.Size([6, 1, 5, 5])

here, after the loss has been calculated from the prediction tensor, the gradient of the loss with respect to the weights inside each layer can be computed

this is achieved using the back propagation method applied to the loss tensor we calculated earlier

before having applied this backward function you can see that the gradient on our first convolutional layer is None

after applying this function the same layer is seen to have a gradient tensor with the same shape as its weight tensor, because an individual gradient is calculated for every value in the weights

In [154]:
# Adam optimiser over all of the network's parameters, with a learning rate of 0.01.
optimiser = optim.Adam(network.parameters(), lr=0.01)

here we have made an optimiser object.

this has the purpose of updating the weights within the layers after the loss and its gradients have been calculated. this is intended to change the network's calculations when processing the batch, and potentially decrease the loss

here the Adam optimiser has been used. there are many optimisers to choose from; this is the one we're using.

we have to pass the parameters of the network (the weights and biases) and the learning rate (lr for short)

the learning rate is a hyperparameter, and the value passed is from trial and error

In [155]:
# Loss before the optimiser step, for comparison with the value after it.
loss.item()

2.293059825897217

In [156]:
# Number of correct predictions in the batch before the weights are updated.
num_correct(preds, labels)

11

In [157]:
# Update every parameter using the gradients stored by loss.backward().
# NOTE: the gradients are not cleared here. PyTorch accumulates gradients across
# backward() calls, so call optimiser.zero_grad() before the next loss.backward().
optimiser.step()

In [158]:
# Run the same batch through the updated network and recompute the loss.
preds = network(images)
loss = F.cross_entropy(preds, labels)

In [159]:
# Loss on the same batch after one optimiser step.
loss.item()

2.265202045440674

In [160]:
# Number of correct predictions on the same batch after one optimiser step.
num_correct(preds, labels)

22

the step function is called on the optimiser object; this steps in the direction of the loss function's minimum

this applies the change in the weights to the layers in the network

from this we can re-calculate the prediction tensor, and the loss inside this network

the loss should decrease, and potentially the number of correct predictions should increase. this is seen to be true (the loss fell from 2.2931 to 2.2652 and the correct predictions rose from 11 to 22), therefore the optimiser step has worked

when stepping in the direction of the loss function's minimum, the learning rate determines how far you step along the gradient

the learning rate must be low enough to move accurately, and find the most accurate value of the minimum, this uses more epochs

the learning rate must be high enough to move quickly, and produce a perceivable difference in the loss function, this uses fewer epochs