<a href="https://colab.research.google.com/github/Tensor-Reloaded/Neural-Networks-Template-2024/blob/main/Lab09/PyTorch-tutorial-MNIST.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Datasets in PyTorch

https://pytorch.org/docs/stable/data.html#torch.utils.data.Dataset

In [1]:
from torch.utils.data import Dataset


# A dataset in PyTorch should implement the following methods:
# __len__ (optional in some rare cases) and __getitem__ (mandatory)
class SimpleDataset(Dataset):
    """Minimal map-style dataset pairing each data item with its label.

    Args:
        data: Indexable, sized collection of samples.
        labels: Indexable, sized collection holding one label per sample.

    Raises:
        ValueError: If ``data`` and ``labels`` do not have the same length.
    """

    def __init__(self, data, labels):
        # Fail early on mismatched inputs: __len__ only looks at `data`, so a shorter
        # `labels` would otherwise surface as an IndexError for some late index
        # (which a plain `for` loop over the dataset treats as the end of iteration).
        if len(data) != len(labels):
            raise ValueError(
                f"data and labels must have the same length, got {len(data)} and {len(labels)}"
            )
        self.data = data
        self.labels = labels

    def __len__(self):
        # The number of samples; DataLoader uses it to know how many indices exist
        return len(self.data)

    def __getitem__(self, i):
        # In __getitem__ we usually load the data from the disk or from memory, apply transformations and return it
        return self.data[i], self.labels[i]


# Build a toy dataset of four (data, label) string pairs
dataset = SimpleDataset(data=["apple", "cucumber", "pear", "orange"],
                        labels=["fruit", "vegetable", "fruit", "fruit"])
print(len(dataset))  # uses SimpleDataset.__len__
print(dataset[0])  # uses SimpleDataset.__getitem__

# Iterating yields the (data, label) tuples returned by SimpleDataset.__getitem__, one index at a time
for data, label in dataset:
    print(data, label)

4
('apple', 'fruit')
apple fruit
cucumber vegetable
pear fruit
orange fruit


## DataLoaders in PyTorch

https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader

In [2]:
from torch.utils.data import DataLoader
# We use DataLoaders to automatically load the samples of a Dataset and group them into batches

# 4 samples with batch_size=2 give 2 batches; with shuffle=True the samples come in a shuffled order (see the output below)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

print(len(dataloader))  # number of batches, not number of samples
for data_batched, labels_batched in dataloader:
    print(data_batched, labels_batched)

2
('cucumber', 'pear') ('vegetable', 'fruit')
('orange', 'apple') ('fruit', 'fruit')


In [3]:
dataloader = DataLoader(dataset, batch_size=3, shuffle=True)
# If the batch size does not divide the size of the dataset, the size of the last batch will be len(dataset) % batch_size
# (here 4 % 3 == 1, so the second batch holds a single sample)

print(len(dataloader))  # the incomplete batch still counts as a batch
for data_batched, labels_batched in dataloader:
    print(data_batched, labels_batched)

2
('apple', 'cucumber', 'orange') ('fruit', 'vegetable', 'fruit')
('pear',) ('fruit',)


In [4]:
dataloader = DataLoader(dataset, batch_size=3, shuffle=True, drop_last=True)
# We can drop the last batch when it is incomplete (drop_last=True), so only full batches are yielded;
# here that leaves a single batch of 3 samples

print(len(dataloader))  # the dropped batch is not counted
for data_batched, labels_batched in dataloader:
    print(data_batched, labels_batched)

1
('apple', 'cucumber', 'pear') ('fruit', 'vegetable', 'fruit')


Also see https://pytorch.org/tutorials/beginner/basics/data_tutorial.html.


## Defining a model in PyTorch

https://pytorch.org/docs/stable/generated/torch.nn.Module.html

In [5]:
from torch import nn
from torch import Tensor

class MyModel(nn.Module):
    """Two-layer fully connected network: Linear -> ReLU -> Linear.

    Args:
        input_size: number of features of each input sample.
        hidden_size: number of units of the hidden layer.
        output_size: number of values produced for each sample.
    """

    def __init__(self, input_size, hidden_size, output_size):
        # The parent constructor has to run before any sub-module is assigned
        super().__init__()
        # Linear layers: https://pytorch.org/docs/stable/generated/torch.nn.Linear.html
        self.layer_1 = nn.Linear(input_size, hidden_size)
        self.layer_2 = nn.Linear(hidden_size, output_size)

    # Only the forward step is written by hand; the Autograd engine derives the backward step from it.
    def forward(self, x: Tensor):
        """Map a [batch size, input_size] tensor to a [batch size, output_size] tensor."""
        hidden = self.layer_1(x).relu()  # [batch size, hidden_size]; any activation function could be used here
        return self.layer_2(hidden)  # [batch size, output_size]


# 784 input features (one flattened MNIST image, see transforms() further down), 100 hidden units, 10 outputs
model = MyModel(input_size=784, hidden_size=100, output_size=10)
print(model)  # the printed summary lists the layers assigned in MyModel.__init__

MyModel(
  (layer_1): Linear(in_features=784, out_features=100, bias=True)
  (layer_2): Linear(in_features=100, out_features=10, bias=True)
)


## Devices

https://pytorch.org/docs/stable/tensor_attributes.html#torch-device

In [6]:
import torch


def get_device():
    """Pick the best available device: CUDA first, then Apple MPS, then CPU.

    Returns:
        torch.device: ``cuda`` if a CUDA GPU is usable, else ``mps`` if the MPS
        backend is usable, else ``cpu``.
    """
    if torch.cuda.is_available():
        return torch.device('cuda')  # On multi-gpu workstation we can select cuda:0, cuda:1, ...
    # The documented availability check lives in torch.backends.mps; torch.mps.is_available is not
    # present in every PyTorch release. getattr guards builds that ship without the MPS backend module.
    mps_backend = getattr(torch.backends, 'mps', None)
    if mps_backend is not None and mps_backend.is_available():
        return torch.device('mps')
    return torch.device('cpu')


device = get_device()
print(device)

# Move the model to the selected device; train() and val() move every batch to the same device
model = model.to(device)

cuda


## Optimizers in PyTorch

https://pytorch.org/docs/main/optim.html

In [7]:
# Optimizers apply the gradients calculated by the Autograd engine to the weights, using their own optimization technique
# Two alternatives are shown; the second assignment replaces the first, so AdamW is the optimizer used from here on
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9, nesterov=True, weight_decay=0.001)  # SGD with Nesterov momentum and weight decay
optimizer = torch.optim.AdamW(model.parameters(), lr=0.001, weight_decay=0.001)  # Adam with Weight Decay


# Schedulers change the learning rate during training, which can speed up training
# main() calls scheduler.step() once per epoch, so one "step" below corresponds to one epoch
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)  # Learning rate scheduler, halves the learning rate every 10 steps

## Loss functions in PyTorch

https://pytorch.org/docs/stable/nn.html#loss-functions

In [8]:
# Loss function shared by train() and val(); it is called as criterion(outputs, labels) on the model outputs
criterion = nn.CrossEntropyLoss()  # https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html

## Training in PyTorch

In [9]:
def train(model, train_dataloader, criterion, optimizer, device):
    """Train `model` for one full pass over `train_dataloader`.

    Every batch is moved to `device`, passed through the model, scored with
    `criterion` and used for one optimizer update.

    Returns:
        The sum of the per-batch losses divided by len(train_dataloader).
    """
    model.train()  # training mode: needed so that layers such as dropout & batch norm behave as during training

    show_gradients = False  # switch to True to print the gradient shapes of the last layer
    loss_sum = 0.0

    for inputs, targets in train_dataloader:
        # Move the batch to the device. Bonus: this can be done asynchronously using non_blocking and pin_memory
        inputs, targets = inputs.to(device), targets.to(device)

        # Forward pass, then the loss of the predictions against the targets
        predictions = model(inputs)
        batch_loss = criterion(predictions, targets)

        # Backpropagation: the gradient of every weight and bias is stored in its .grad attribute
        batch_loss.backward()

        if show_gradients:
            # e.g. the gradients of the last layer are in layer.weight.grad and layer.bias.grad
            weight_grad = model.layer_2.weight.grad
            bias_grad = model.layer_2.bias.grad
            print(f"Last layer gradient: {weight_grad.shape}")
            print(f"Last layer gradient: {bias_grad.shape}")

        optimizer.step()  # apply the gradients to the weights
        optimizer.zero_grad()  # clear the gradients before the next batch

        loss_sum += batch_loss.item()

    return loss_sum / len(train_dataloader)

## Validation in PyTorch

In [10]:
def val_1(model, val_dataloader, criterion, device):
    """Evaluate `model` on `val_dataloader`, disabling Autograd with a torch.no_grad() block.

    Returns:
        The sum of the per-batch losses divided by len(val_dataloader).
    """
    model.eval()  # evaluation mode: needed so that layers such as dropout & batch norm behave as during inference

    loss_sum = 0.0

    for inputs, targets in val_dataloader:
        inputs, targets = inputs.to(device), targets.to(device)

        # No gradients are needed during validation, so Autograd is disabled for the forward pass and the loss.
        # torch.inference_mode() is a better context manager: it also disables the version counter for tensors,
        # and tensors created without version counter can never be used in an operation that requires gradient.
        with torch.no_grad():
            predictions = model(inputs)
            batch_loss = criterion(predictions, targets)

        loss_sum += batch_loss.item()

    return loss_sum / len(val_dataloader)

@torch.inference_mode()  # decorating the whole function with torch.inference_mode or torch.no_grad is the preferred form
def val(model, val_dataloader, criterion, device):
    """Evaluate `model` on `val_dataloader` with Autograd disabled for the whole call.

    Returns:
        The sum of the per-batch losses divided by len(val_dataloader).
    """
    model.eval()

    batch_losses = []
    for inputs, targets in val_dataloader:
        inputs, targets = inputs.to(device), targets.to(device)

        predictions = model(inputs)
        batch_losses.append(criterion(predictions, targets).item())

    # Summing from 0.0 in batch order reproduces a running float accumulator
    return sum(batch_losses, 0.0) / len(val_dataloader)

## Training loop

In [11]:
from tqdm import tqdm


def main(model, train_dataloader, val_dataloader, criterion, optimizer, device, epochs, scheduler=None):
    """Run the training loop for `epochs` epochs, reporting the losses on a progress bar.

    Every epoch trains on `train_dataloader`, evaluates on `val_dataloader` and then
    advances the learning rate scheduler by one step.

    Args:
        model: The module to train.
        train_dataloader: Batches used by train().
        val_dataloader: Batches used by val().
        criterion: Loss function passed to train() and val().
        optimizer: Optimizer passed to train().
        device: Device the batches are moved to.
        epochs: Number of epochs to run.
        scheduler: Optional learning rate scheduler, stepped once per epoch. When omitted,
            a module-level `scheduler` is used if one is defined (what this function relied on
            before the parameter existed); if there is none, no scheduler is stepped.
    """
    if scheduler is None:
        # Backward compatibility with callers that rely on a global `scheduler`
        scheduler = globals().get("scheduler")

    # range() has a length, so tqdm can show the total without materialising a tuple
    with tqdm(range(epochs)) as tbar:
        for _ in tbar:
            train_loss = train(model, train_dataloader, criterion, optimizer, device)
            val_loss = val(model, val_dataloader, criterion, device)
            if scheduler is not None:
                scheduler.step()
            tbar.set_description(f"Train loss: {train_loss:.3f} | Val loss: {val_loss:.3f}")

In [15]:
from torchvision.datasets import MNIST
import numpy as np
from torchvision.transforms import v2


def transforms():
    """Build the transform applied to every image of the dataset.

    Returns:
        A callable that converts an image to a float32 array, flattens it,
        divides the values by 255 and returns the result as a torch tensor.
    """
    def to_flat_tensor(image):
        pixels = np.array(image, dtype=np.float32).flatten()
        return torch.from_numpy(pixels / 255)

    return to_flat_tensor

# MNIST training split and test split (used here for validation), downloaded to ./data;
# every image goes through the callable returned by transforms()
train_dataset = MNIST(root='./data', train=True, download=True, transform=transforms())
val_dataset = MNIST(root='./data', train=False, download=True, transform=transforms())
# Training batches are shuffled and an incomplete last batch is dropped; validation batches are not shuffled
train_dataloader = DataLoader(train_dataset, batch_size=128, shuffle=True, drop_last=True)
val_dataloader = DataLoader(val_dataset, batch_size=500, shuffle=False)

# Train for 10 epochs with the model, criterion, optimizer and device defined in the earlier cells
main(model, train_dataloader, val_dataloader, criterion, optimizer, device, 10)

Train loss: 0.020 | Val loss: 0.064: 100%|██████████| 10/10 [00:47<00:00,  4.74s/it]
