# Pytorch Notes
This file contains examples and explanations of pytorch code

In [34]:
# Imports

import torch # Base package
from torch import nn # Neural Network package
from torch.utils.data import DataLoader # Useful tool for loading data and passing it to models
from torchvision.transforms import ToTensor # Converts image data to tensors (Which are what pytorch uses) 
from torchvision.transforms import Lambda # Allows us to apply our own transformations using lambda functions
from torchvision.transforms import Compose # Used to chain together transformations

from torchvision import datasets # Free data to tinker with
print("All imports OK")

All imports OK


## Loading data

In [35]:
# We'll just use some freely available data
training_data = datasets.FashionMNIST(
    root="data", # Data folder location
    train=True, # This will be our training data
    download=False, # Can replace with False after the first run
    transform=ToTensor(), # Pytorch uses tensor objects, so may as well transform the data now
)

test_data = datasets.FashionMNIST(
    root="data",
    train=False, # This will not be our training data
    download=False,
    transform=ToTensor(),
)


# Pytorch's DataLoaders allow us to, you guessed it, load data
# Typical usage is...
size_of_batch = 100 # However many number of data samples you use in each batch of data
train_dataloader = DataLoader(training_data, batch_size=size_of_batch)
test_dataloader = DataLoader(test_data, batch_size=size_of_batch)

In [36]:
# We can find out about the shape of our data like so
# X is the conventional name for data, y for the target result
for X, y in test_dataloader:
    print("Shape of X [Size of each batch, Colours, Height, Width]: \n", X.shape)
    print("Shape of y: ", y.shape, y.dtype)
    break # We only need to print this info once

Shape of X [Size of each batch, Colours, Height, Width]: 
 torch.Size([100, 1, 28, 28])
Shape of y:  torch.Size([100]) torch.int64


## Custom data loaders

In [37]:
# Classes can be used to make our own data loaders,
# This becomes increasing necessary as we deal with large and more complex problems
# as we will be unable to just load everything into our machine's memory at once.
# It also lets us make methods useful for a specific problem.

# Todo : find some data and make a custom loader for it

## Preprocessing
Preprocessing depends heavily on the dataset you are using, but possible choices are:
- Batch Normalisation, practically always helpful to do
- Image cropping
- Image translation
- Image rotations/flips
- Contrast changes

In [38]:
# Let's just do batch normalisation
# Todo: figure our how to do that

## Models

In [None]:
# Cuda info
print(f"Cuda Available: {torch.cuda.is_available()}")
print(f"Current Device: {torch.cuda.current_device()}")
device_id = torch.cuda.current_device()
print(f"--> {torch.cuda.get_device_name(device_id)}")
print(f"Memory Allocated: {torch.cuda.memory_allocated(device_id)}")
print(f"Memory Reserved: {torch.cuda.memory_reserved()}")

In [39]:
# If you have a nVidia graphics card with cuda capabilities, pytorch can make use of it
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Currently using: {device}")

# The typical practice with pytorch is to make a child class from the nn.Module
# and then define whatever features your problem requires
class NeuralNetwork(nn.Module):
    def __init__(self):
        super(NeuralNetwork, self).__init__() # super() gives us all the methods from the default neural network class
        self.flatten = nn.Flatten() # Flattening tensors makes them 1 dimensional, often needed https://pytorch.org/docs/stable/generated/torch.flatten.html
        # Here's the actual network, Sequential() will apply operations one after another (sequentially)
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(28*28, 512), # A list of network layer sizes is typically better practice than hard coding values
            nn.ReLU(), # Some activation function, ReLU is the current go-to but feel free to use sigmoid, tanh etc.
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Linear(256,128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64,32),
            nn.ReLU(),
            nn.Linear(32,10),
            nn.ReLU(),
        )

    def forward(self, x):
        x = self.flatten(x)
        logits = self.linear_relu_stack(x) # Logits are a models raw predictions, not yet a usable guess
        return logits

model = NeuralNetwork().to(device)
print(model)

Currently using: cuda
NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=784, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=256, bias=True)
    (3): ReLU()
    (4): Linear(in_features=256, out_features=128, bias=True)
    (5): ReLU()
    (6): Linear(in_features=128, out_features=64, bias=True)
    (7): ReLU()
    (8): Linear(in_features=64, out_features=32, bias=True)
    (9): ReLU()
    (10): Linear(in_features=32, out_features=10, bias=True)
    (11): ReLU()
  )
)


## Training
In order to train a model, we need a loss function as well as an optimizer<br>
The loss function will give us a value for how well the model is doing,<br>
and the optimizer will attempt to improve this value

In [40]:
# Cross Entropy is useful for classification tasks https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html
# tl;dr: input our logits, receive a score
loss_function = nn.CrossEntropyLoss()

# SGD is stochastic gradient descent, this is a pretty basic algorithm in the ML world but still works fine here
# Comparison of optimizers: https://mlfromscratch.com/optimizers-explained/
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

In [41]:
# Now to make the training loop
# In each training step, the model will make a prediction on the training data,
# calculate the error, then backpropagates that to adjust it's parameters
def train(dataloader, model, loss_function, optimizer):
    # X is the data, y is the target
    for batch, (X, y) in enumerate(dataloader):
        # This can be left out if you aren't using cuda, it just moves the tensors to GPU for processing
        X, y = X.to(device), y.to(device)
        # Make some predictions
        predictions = model(X)
        # Compute the loss
        loss = loss_function(predictions, y)
        # Backpropogate
        # Reset gradients, this is needed so that we are recalculating them each time we do this
        optimizer.zero_grad
        # Although we didn't define a backward() method, the model inherited it thanks to super()!
        loss.backward()
        # Update the parameters (weights and biases)
        optimizer.step()

        # Print some information so we can see what's happening
        size = len(dataloader.dataset)
        if batch % 100 == 0:
            loss, current = loss.item(), batch * len(X) # loss.item is the loss for the batch divided by batch size
            print(f"Loss: {loss}  [{current}/{size}")

In [42]:
# We also need to test the model to verify it is generalising rather than overfitting
def test(dataloader, model):
    model.eval() # This sets the model to "evaluation mode", if we were using dropout this is required to test correctly
    test_loss, correct = 0, 0
    with torch.no_grad(): # This stops calculating the gradients, saves on memory usage for testing
        for X, y in dataloader:
            X, y = X.to(device), y.to(device) # Cuda stuff again
            predictions = model(X) # Make predictions
            test_loss += loss_function(predictions, y).item()
            correct += (predictions.argmax(1) == y).type(torch.float).sum().item() # I think this counts the correct predictions

        size = len(dataloader.dataset)
        test_loss /= size
        correct /= size
        print(f"Test Error:\nAccuracy: {(100*correct)}%\nAverage Loss: {test_loss}\n")

In [43]:
# Finally let's give it a whirl
epochs = 3 # How many training cycles we do
for t in range(epochs):
    print(f"Epoch #{t+1}\n"+ ("-"*10))
    train(train_dataloader, model, loss_function, optimizer)
    test(test_dataloader, model)

print("Done!")

Epoch #1
----------
Loss: 2.3079710006713867  [0/60000
Loss: 2.296825885772705  [10000/60000
Loss: 2.2952146530151367  [20000/60000
Loss: 2.250478506088257  [30000/60000
Loss: 1.6190952062606812  [40000/60000
Loss: 1.3212895393371582  [50000/60000
Test Error:
Accuracy: 53.080000000000005%
Average Loss: 0.01047793996334076

Epoch #2
----------
Loss: 1.0316749811172485  [0/60000
Loss: 0.9102623462677002  [10000/60000
Loss: 0.8483622074127197  [20000/60000
Loss: 0.9862223863601685  [30000/60000
Loss: 0.8516178131103516  [40000/60000
Loss: 1.051077961921692  [50000/60000
Test Error:
Accuracy: 74.35000000000001%
Average Loss: 0.00722424253821373

Epoch #3
----------
Loss: 0.6108499765396118  [0/60000
Loss: 0.6426427960395813  [10000/60000
Loss: 0.6533907055854797  [20000/60000
Loss: 0.7465096116065979  [30000/60000
Loss: 0.8007423281669617  [40000/60000
Loss: 0.949354887008667  [50000/60000
Test Error:
Accuracy: 73.8%
Average Loss: 0.00830358789563179

Done!


## Saving and loading models

In [None]:
# Saving is very easy!
torch.save(model.state_dict(), "my_model.mod")
# Loading is almost as easy
some_model = NeuralNetwork()
some_model.load_state_dict(torch.load("my_model.mod"))
