<a href="https://colab.research.google.com/github/Tensor-Reloaded/AI-Learning-Hub/blob/main/resources/beginner_pytorch/03_simple_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 03. Simple Training

Consider the following problem:
* Create a DL model able to classify 8-bit strings by the parity of the number of 1s in the bit string: even or odd.

In [None]:
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm


In [None]:
def number_to_bits(x: int) -> torch.Tensor:
    """Encode an integer as a float32 tensor of its 8 bits.

    The value is stored as a single ``uint8`` and expanded with
    ``np.unpackbits``, so the result has 8 elements, each 0.0 or 1.0,
    with the most significant bit first (NumPy's default bit order).

    Args:
        x: Integer to encode; it must fit in an unsigned 8-bit value.

    Returns:
        A 1-D ``torch.float32`` tensor of length 8.
    """
    packed = np.array([x], dtype=np.uint8)
    bits = torch.from_numpy(np.unpackbits(packed))
    return bits.float()


def parity_label(x: torch.Tensor) -> int:
    """Return the parity of the number of set bits in an encoded bit string.

    An element counts as a set bit when it is strictly positive. This makes
    the label independent of how the bits are encoded: 0/1, -1/1 and
    -0.5/0.5 inputs all yield the same label for the same underlying number.
    Taking the parity of the *sum* of the encoded values instead is wrong for
    the -1/1 encoding, where 8 values always sum to ``2 * ones - 8`` (even),
    so every sample would be labelled 0.

    Args:
        x: Tensor holding the encoded bits; set bits must be encoded as a
            positive value and cleared bits as a non-positive value.

    Returns:
        ``1`` if the number of set bits is odd, ``0`` if it is even.
    """
    ones = (x > 0).sum().item()  # integer count of set bits
    return int(ones % 2)

In [None]:
class BitsParityDataset(Dataset):
    """Dataset of the integers ``start..end`` (both inclusive) for the parity task.

    Each sample is produced on demand: the integer is encoded with
    ``number_to_bits`` and labelled with ``parity_label`` of that encoding.
    """

    def __init__(self, start: int, end: int):
        """Store the bounds and materialise the list of integers to serve.

        Args:
            start: First integer of the dataset.
            end: Last integer of the dataset (included).
        """
        self.start, self.end = start, end
        # ``end`` is inclusive, hence the + 1.
        self.items = [*range(start, end + 1)]

    def __len__(self) -> int:
        """Return the number of integers in the dataset."""
        return len(self.items)

    def __getitem__(self, idx: int):
        """Return ``(bits, label)`` for the integer stored at position ``idx``.

        ``bits`` is the output of ``number_to_bits`` and ``label`` is a scalar
        ``torch.long`` tensor holding ``parity_label(bits)``.
        """
        bits = number_to_bits(self.items[idx])
        label = torch.tensor(parity_label(bits), dtype=torch.long)
        return bits, label

In [None]:
class MLP(nn.Module):
    """Two-layer perceptron producing 2 logits per input vector.

    Args:
        width: Number of input features.
        hidden: Number of units in the hidden layer.
    """

    def __init__(self, width: int = 8, hidden: int = 4):
        super().__init__()
        layers = [
            nn.Linear(in_features=width, out_features=hidden),
            nn.ReLU(),
            nn.Linear(in_features=hidden, out_features=2),
        ]
        self.block = nn.Sequential(*layers)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the layer stack to ``x`` and return the resulting logits."""
        logits = self.block(x)
        return logits

In [None]:
class Trainer:
    """Train and validate a classifier on the 8-bit parity task.

    The trainer builds its own data loaders from ``BitsParityDataset``:
    the integers 0..100 for training and 101..127 for validation.

    Args:
        model: Network to optimise; it is moved to the selected device.
        optimizer: Optimizer used to update the model after each batch.
        criterion: Loss module, called as ``criterion(predicted, target)``.
        batch_size: Batch size of the training loader.
        val_batch_size: Batch size of the validation loader.
        use_cpu: When true, force the CPU. Otherwise use the current
            accelerator if one is available at runtime, else the CPU.
    """

    def __init__(
            self,
            model: nn.Module,
            optimizer: torch.optim.Optimizer,
            criterion: nn.Module,
            batch_size: int = 16,
            val_batch_size: int = 32,
            use_cpu: bool = False,
    ):
        self.batch_size = batch_size
        self.val_batch_size = val_batch_size  # We can use a bigger batch size for validation

        self.device = self._select_device(use_cpu)
        print(f"Using device: {self.device}")

        train_set = BitsParityDataset(0, 100)
        val_set = BitsParityDataset(101, 127)
        self.train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
        self.val_loader = DataLoader(val_set, batch_size=val_batch_size, shuffle=False)
        # We don't need to shuffle the validation set

        self.model = model.to(self.device)  # The model must be on the same device
        self.criterion = criterion.to(self.device)  # Required for some loss functions
        self.optimizer = optimizer

    @staticmethod
    def _select_device(use_cpu: bool) -> torch.device:
        """Pick the device to run on; always returns a real ``torch.device``.

        ``torch.accelerator.current_accelerator()`` detects accelerators such
        as CUDA or MPS, but per the PyTorch documentation it returns ``None``
        when there is no accelerator, so the CPU fallback must be explicit.
        """
        cpu = torch.device("cpu")
        if use_cpu:
            return cpu
        # Older PyTorch releases do not ship ``torch.accelerator``; use the CPU there.
        accelerator = getattr(torch, "accelerator", None)
        if accelerator is None or not accelerator.is_available():
            return cpu
        return accelerator.current_accelerator()

    def train(self):
        """Run one training epoch over the training loader.

        Returns:
            A ``(mean_loss, accuracy)`` tuple, both averaged over all samples
            seen during the epoch.
        """
        self.model.train()

        total = 0
        correct = 0
        total_loss = 0

        for data, target in tqdm(self.train_loader, desc="Training", leave=False):
            # We must move the data to the same device as the model
            data = data.to(self.device)
            target = target.to(self.device)
            # We can also use non_blocking=True to speed up the transfer for large tensors
            # data = data.to(self.device, non_blocking=True)
            # but this is useful only for pinned memory transfers (CPU-to-GPU)
            # In most cases, the improvement is negligible

            predicted = self.model(data)
            loss = self.criterion(predicted, target)
            loss.backward()

            self.optimizer.step()
            # Clear the gradients so they do not accumulate into the next batch
            self.optimizer.zero_grad()

            correct += (predicted.argmax(dim=1) == target).sum().item()
            total += data.size(0)
            # The loss is weighted by the batch size before being accumulated,
            # so that the final division by ``total`` is a per-sample average
            total_loss += loss.item() * data.size(0)

        return total_loss / total, correct / total

    # @torch.no_grad()  # This is what you usually see in tutorials
    @torch.inference_mode()  # This is the recommended way to do this
    def val(self):
        """Evaluate the model on the validation loader without tracking gradients.

        Returns:
            A ``(mean_loss, accuracy)`` tuple, both averaged over all
            validation samples.
        """
        self.model.eval()

        total = 0
        correct = 0
        total_loss = 0

        for data, target in tqdm(self.val_loader, desc="Validation", leave=False):
            data = data.to(self.device)
            target = target.to(self.device)

            predicted = self.model(data)
            loss = self.criterion(predicted, target)

            correct += (predicted.argmax(dim=1) == target).sum().item()
            total += data.size(0)
            total_loss += loss.item() * data.size(0)

        return total_loss / total, correct / total

    def run(self, epochs: int):
        """Alternate training and validation for ``epochs`` epochs.

        The latest metrics are shown on the progress bar and the last
        validation accuracy is printed at the end (``None`` if ``epochs`` is 0).
        """
        print(f"Running {epochs} epochs")
        va_acc = None  # Stays None when no epoch runs, so the final print cannot fail
        with tqdm(range(epochs), desc="Training") as pbar:
            for _ in pbar:
                tr_loss, tr_acc = self.train()
                va_loss, va_acc = self.val()
                pbar.set_postfix(train_loss=tr_loss, train_acc=tr_acc, val_loss=va_loss, val_acc=va_acc)
        print("Last validation accuracy: ", va_acc)
        print()

In [None]:
def main(epochs: int, use_cpu: bool):
    """Train a freshly initialised ``MLP`` on the parity task.

    The model is optimised with SGD (learning rate 0.1) and a cross-entropy
    loss, driven by a ``Trainer``.

    Args:
        epochs: Number of epochs to run.
        use_cpu: Forwarded to ``Trainer``; forces training on the CPU when true.
    """
    net = MLP()
    trainer = Trainer(
        net,
        torch.optim.SGD(net.parameters(), lr=0.1),
        nn.CrossEntropyLoss(),
        use_cpu=use_cpu,
    )
    trainer.run(epochs)


In [None]:
if __name__ == '__main__':
    # Three independent 100-epoch CPU runs; each call builds a new model
    for _ in range(3):
        main(100, True)


Using device: cpu
Running 100 epochs


Training: 100%|██████████| 100/100 [00:02<00:00, 48.72it/s, train_acc=0.485, train_loss=0.689, val_acc=0.519, val_loss=0.692]


Last validation accuracy:  0.5185185185185185

Using device: cpu
Running 100 epochs


Training: 100%|██████████| 100/100 [00:01<00:00, 51.23it/s, train_acc=0.554, train_loss=0.681, val_acc=0.593, val_loss=0.72]


Last validation accuracy:  0.5925925925925926

Using device: cpu
Running 100 epochs


Training: 100%|██████████| 100/100 [00:01<00:00, 53.27it/s, train_acc=0.644, train_loss=0.676, val_acc=0.667, val_loss=0.674]

Last validation accuracy:  0.6666666666666666






---
Why is it hard for the model to perfectly learn the rule?

Can we use a rule-based algorithm to perfectly solve this problem?

Can we adapt the data to help the neural network learn better?

In [None]:
def number_to_bits(x: int) -> torch.Tensor:
    """Encode an integer as a float32 tensor of its 8 bits, using -1/1.

    Cleared bits become -1.0 and set bits become 1.0, most significant bit
    first (NumPy's default bit order for ``np.unpackbits``).

    Note that the 8 values always sum to ``2 * ones - 8``, an even number,
    so the parity of the encoded sum is the same for every input.

    Args:
        x: Integer to encode; it must fit in an unsigned 8-bit value.

    Returns:
        A 1-D ``torch.float32`` tensor of length 8 with values in {-1.0, 1.0}.
    """
    packed = np.array([x], dtype=np.uint8)
    bits = torch.from_numpy(np.unpackbits(packed))
    centered = bits.to(torch.float32) - 0.5
    return centered * 2


if __name__ == '__main__':
    # Repeat the 100-epoch CPU experiment three times with the -1/1 encoding
    for _ in range(3):
        main(100, True)

# Wait! What? What did I just do?
# Why is it easier for the model to learn a mapping between the binary representation if we replace it with -1, 1?
# Hint: before crediting the model, check which labels parity_label produces for this encoding.
# Knowing the answer to this question will be helpful in the long run. Keep it in mind and revisit it when you have more experience.

Using device: cpu
Running 100 epochs


Training: 100%|██████████| 100/100 [00:01<00:00, 51.42it/s, train_acc=1, train_loss=0.00153, val_acc=1, val_loss=0.00374]


Last validation accuracy:  1.0

Using device: cpu
Running 100 epochs


Training: 100%|██████████| 100/100 [00:01<00:00, 52.42it/s, train_acc=1, train_loss=0.000737, val_acc=1, val_loss=0.00102]


Last validation accuracy:  1.0

Using device: cpu
Running 100 epochs


Training: 100%|██████████| 100/100 [00:01<00:00, 54.68it/s, train_acc=1, train_loss=0.000894, val_acc=1, val_loss=0.00333]

Last validation accuracy:  1.0






In [None]:
def number_to_bits(x: int) -> torch.Tensor:
    """Encode an integer as a float32 tensor of its 8 bits, using -0.5/0.5.

    Cleared bits become -0.5 and set bits become 0.5, most significant bit
    first (NumPy's default bit order for ``np.unpackbits``).

    Args:
        x: Integer to encode; it must fit in an unsigned 8-bit value.

    Returns:
        A 1-D ``torch.float32`` tensor of length 8 with values in {-0.5, 0.5}.
    """
    packed = np.array([x], dtype=np.uint8)
    bits = torch.from_numpy(np.unpackbits(packed))
    return bits.float() - 0.5

if __name__ == '__main__':
    # Repeat the 100-epoch CPU experiment three times with the -0.5/0.5 encoding
    for _ in range(3):
        main(100, True)
# If I use (-0.5, 0.5), the model is not able to learn the rule anymore. Why?

Using device: cpu
Running 100 epochs


Training: 100%|██████████| 100/100 [00:02<00:00, 49.13it/s, train_acc=0.594, train_loss=0.674, val_acc=0.63, val_loss=0.66] 


Last validation accuracy:  0.6296296296296297

Using device: cpu
Running 100 epochs


Training: 100%|██████████| 100/100 [00:02<00:00, 49.31it/s, train_acc=0.564, train_loss=0.691, val_acc=0.444, val_loss=0.691]


Last validation accuracy:  0.4444444444444444

Using device: cpu
Running 100 epochs


Training: 100%|██████████| 100/100 [00:01<00:00, 51.15it/s, train_acc=0.604, train_loss=0.686, val_acc=0.556, val_loss=0.682]


Last validation accuracy:  0.5555555555555556



In [None]:
if __name__ == '__main__':
    # Some more experiments to evaluate the effect of the number of epochs and device
    # For each epoch budget, run on the CPU first and then on the accelerator
    for epochs in (200, 400, 800):
        for use_cpu in (True, False):
            main(epochs, use_cpu)



Using device: cpu
Running 200 epochs


Training: 100%|██████████| 200/200 [00:03<00:00, 55.75it/s, train_acc=0.653, train_loss=0.657, val_acc=0.593, val_loss=0.662]


Last validation accuracy:  0.5925925925925926

Using device: cuda
Running 200 epochs


Training: 100%|██████████| 200/200 [00:05<00:00, 38.57it/s, train_acc=0.604, train_loss=0.683, val_acc=0.407, val_loss=0.695]


Last validation accuracy:  0.4074074074074074

Using device: cpu
Running 400 epochs


Training: 100%|██████████| 400/400 [00:07<00:00, 54.61it/s, train_acc=0.525, train_loss=0.692, val_acc=0.407, val_loss=0.691]


Last validation accuracy:  0.4074074074074074

Using device: cuda
Running 400 epochs


Training: 100%|██████████| 400/400 [00:10<00:00, 39.41it/s, train_acc=0.693, train_loss=0.639, val_acc=0.63, val_loss=0.722] 


Last validation accuracy:  0.6296296296296297

Using device: cpu
Running 800 epochs


Training: 100%|██████████| 800/800 [00:14<00:00, 54.19it/s, train_acc=0.683, train_loss=0.63, val_acc=0.704, val_loss=0.602] 


Last validation accuracy:  0.7037037037037037

Using device: cuda
Running 800 epochs


Training: 100%|██████████| 800/800 [00:20<00:00, 39.72it/s, train_acc=0.535, train_loss=0.662, val_acc=0.444, val_loss=0.707]

Last validation accuracy:  0.4444444444444444






---

What you should learn from this:
* CUDA is slower than the CPU for small models. This is due to the overhead of transferring data between host and device (RAM and VRAM), which outweighs the computation itself when the model is this small
* Training longer usually leads to better results, but can also lead to overfitting
* Results depend on network initialization

Data is usually the most important factor when training a neural network.