# IPR

In [1]:
import torch
from torch import nn

## Data

In [2]:
from torchvision import transforms
from torchvision.datasets import FashionMNIST, MNIST
from torch.utils.data import DataLoader, random_split, Subset
from torch import Generator

def load_data(
    Mnist: type[MNIST],
    batch_size: int,
    split: list[int],
    resize: int | None = None,
    root: str = "../data",
    num_workers: int = 2,
    seed: int = 42,
) -> tuple[DataLoader, DataLoader, DataLoader]:
    """Download an MNIST-style dataset and wrap it in train/validation/test loaders.

    Args:
        Mnist: Dataset *class* to instantiate (e.g. ``MNIST`` or ``FashionMNIST``);
            it is called with ``root``, ``train``, ``transform`` and ``download``.
        batch_size: Batch size used by all three loaders.
        split: Lengths handed to ``random_split`` to divide the training set into
            a training part and a validation part, in that order.
        resize: If truthy, every image is resized to this size before being
            converted to a tensor.
        root: Directory the dataset is downloaded to / read from.
        num_workers: Number of worker processes used by each loader.
        seed: Seed of the generator that drives the train/validation split, so the
            split is reproducible between runs.

    Returns:
        ``(train_loader, val_loader, test_loader)``. Only the training loader
        shuffles its samples.
    """
    steps: list = [transforms.ToTensor()]
    if resize:
        # Inserted at the front so the resize runs ahead of the tensor conversion.
        steps.insert(0, transforms.Resize(resize))
    transform: transforms.Compose = transforms.Compose(steps)

    train_full = Mnist(root=root, train=True, transform=transform, download=True)
    mnist_test = Mnist(root=root, train=False, transform=transform, download=True)

    # A dedicated, seeded generator keeps the split independent of the global RNG state.
    mnist_train, mnist_val = random_split(
        train_full, split, generator=Generator().manual_seed(seed)
    )

    return (
        DataLoader(mnist_train, batch_size, shuffle=True, num_workers=num_workers),
        DataLoader(mnist_val, batch_size, shuffle=False, num_workers=num_workers),
        DataLoader(mnist_test, batch_size, shuffle=False, num_workers=num_workers),
    )

## Device

In [10]:
from torch import device


def try_gpu(i=0) -> device:
    """Return the CUDA device with index ``i`` if it exists, otherwise the CPU device.

    Args:
        i: Zero-based index of the requested GPU.

    Returns:
        ``torch.device('cuda:<i>')`` when ``0 <= i < torch.cuda.device_count()``,
        else ``torch.device('cpu')``.
    """
    # The lower bound matters: a negative index would otherwise pass the count
    # check and produce an invalid 'cuda:-1' device string.
    if 0 <= i < torch.cuda.device_count():
        return torch.device(f'cuda:{i}')
    return torch.device('cpu')

## Training

In [11]:
from typing import Callable
from torch import Tensor


def evaluate_accuracy(
        net: nn.Module,
        data_iter: DataLoader,
        loss: Callable[..., Tensor],
        device: device) -> tuple[float, float]:
    """Compute the mean batch loss and the accuracy of a model on a dataset.

    The model is switched to evaluation mode and gradients are disabled while
    the data is processed.

    Args:
        net: Model to evaluate.
        data_iter: Iterable of ``(X, y)`` batches that also supports ``len()``.
        loss: Callable mapping ``(predictions, targets)`` to a scalar loss.
        device: Device the batches are moved to before the forward pass.

    Returns:
        ``(mean_loss, accuracy_percent)`` as plain Python floats. The loss is the
        average of the per-batch loss values; the accuracy is the share of
        correctly classified samples, scaled to 0-100.
    """
    net.eval()  # Set the model to evaluation mode

    total_loss: float = 0.0
    total_hits: int = 0
    total_samples: int = 0

    with torch.no_grad():
        for X, y in data_iter:
            X, y = X.to(device), y.to(device)
            # One forward pass per batch; the same predictions feed loss and accuracy.
            y_hat = net(X)

            total_loss += float(loss(y_hat, y))
            # Tensor-side reduction converted to int keeps the counter a Python
            # number instead of turning it into a tensor.
            total_hits += int((y_hat.argmax(dim=1).type(y.dtype) == y).sum())
            total_samples += y.numel()

    return total_loss / len(data_iter), total_hits / total_samples * 100

In [14]:
from termcolor import colored
from torch.optim.optimizer import Optimizer
from torch import optim

def train(
    net: nn.Module,
    train_iter: DataLoader,
    val_iter: DataLoader,
    num_epochs: int,
    learning_rate: float,
    loss: Callable[..., Tensor],
    optimizer: Optimizer,
    device) -> tuple[list[float], list[float], list[float], list[float]]:
    """Train a model."""
    
    def train_epoch() -> tuple[float, float]:  
        # Set the model to training mode
        net.train()
        
        # Sum of training loss, sum of training correct predictions, no. of examples
        total_loss: int = 0
        total_hits: int = 0
        total_samples: int = 0
        
        for X, y in train_iter:
            # Compute gradients and update parameters
            X, y = X.to(device), y.to(device)
            y_hat = net(X)
            
            l = loss(y_hat, y)
            # Using PyTorch built-in optimizer & loss criterion
            optimizer.zero_grad()
            l.backward()
            optimizer.step()
            
            total_loss += float(l)
            total_hits += sum(y_hat.argmax(axis=1).type(y.dtype) == y)
            total_samples += y.numel()
            
        # Return training loss and training accuracy
        return total_loss / len(train_iter), total_hits / total_samples  * 100
    
    def print_epoch_summary(epoch: int, train_loss: float, train_acc: float, val_loss: float, val_acc: float, last_values: tuple[float, float, float, float]):
        def diff_color(diff: float, is_pos: bool) -> str:
            if (diff > 0.0 and is_pos) or (diff < 0.0 and not is_pos):
                return "green"
            return "red"
        
        if epoch == 0:
            print(f'{epoch + 1:5} | {train_loss:12.2f} | {train_acc:13.2f}% | {val_loss:15.2f} | {val_acc:18.2f}%')
        else:
            train_loss_diff = train_loss - last_values[0]
            train_acc_diff = train_acc - last_values[1]
            val_loss_diff = val_loss - last_values[2]
            val_acc_diff = val_acc - last_values[3]
            print(f'{epoch + 1:5} | ({colored(f"{train_loss_diff:+3.2f}", diff_color(train_loss_diff, False))}) {train_loss:4.2f} ', end='')
            print(f'| ({colored(f"{train_acc_diff:+3.2f}", diff_color(train_acc_diff, True))}) {train_acc:5.2f}% ', end='')
            print(f'|    ({colored(f"{val_loss_diff:+3.2f}", diff_color(val_loss_diff, False))}) {val_loss:3.2f} ', end='')
            print(f'|      ({colored(f"{val_acc_diff:+3.2f}", diff_color(val_acc_diff, True))}) {val_acc:5.2f}%') 
    
    train_loss_all: list[float] = []
    train_acc_all: list[float] = []
    val_loss_all: list[float] = []
    val_acc_all: list[float] = []
    
    def init_weights(m):
        if type(m) == nn.Linear or type(m) == nn.Conv2d:
            nn.init.xavier_uniform_(m.weight)
            
    net.apply(init_weights)
    
    print('Training on', device)
    net.to(device)
    
    optimizer = torch.optim.SGD(net.parameters(), lr=learning_rate)
    loss = nn.CrossEntropyLoss()
    
    last_values: tuple[float, float, float, float] = (0.0, 0.0, 0.0, 0.0)
    print(f'Epoch |   Train Loss | Train Accuracy | Validation Loss | Validation Accuracy')
    for epoch in range(num_epochs):
        train_loss, train_acc = train_epoch()
        train_loss_all.append(train_loss)
        train_acc_all.append(train_acc)
        
        val_loss, val_acc = evaluate_accuracy(net, val_iter, loss, device)
        val_loss_all.append(val_loss)
        val_acc_all.append(val_acc)
        
        print_epoch_summary(epoch, train_loss, train_acc, val_loss, val_acc, last_values)
        last_values = (train_loss, train_acc, val_loss, val_acc)

    return train_loss_all, train_acc_all, val_loss_all, val_acc_all

## Plotting

In [23]:
import plotly.express as px
import plotly.graph_objects as go

In [31]:
def plot_loss(train_loss_all: list[float], val_loss_all: list[float]):
    """Show the per-epoch training and validation loss as two lines in one figure.

    Epochs are numbered from 1 and their count is taken from ``train_loss_all``.
    The figure uses a transparent background with white text.
    """
    epochs = [index + 1 for index in range(len(train_loss_all))]
    curves = (
        ('Training loss', train_loss_all),
        ('Validation loss', val_loss_all),
    )

    fig = go.Figure()
    for label, values in curves:
        fig.add_trace(go.Scatter(x=epochs, y=values, name=label, line={'width': 4}))

    layout = {
        'title': 'Training and validation loss',
        'paper_bgcolor': 'rgba(0,0,0,0)',
        'plot_bgcolor': 'rgba(0,0,0,0)',
        'font': {'size': 18, 'color': 'white'},
        'xaxis': {'title': 'Epochs'},
        'yaxis': {'title': 'Loss'},
    }
    fig.update_layout(**layout)
    fig.show()

In [32]:
def plot_accuracy(train_acc_all: list[float], val_acc_all: list[float]):
    """Show the per-epoch training and validation accuracy as two lines in one figure.

    Epochs are numbered from 1 and their count is taken from ``train_acc_all``.
    The figure uses a transparent background with white text.
    """
    epochs = [index + 1 for index in range(len(train_acc_all))]
    curves = (
        ('Training accuracy', train_acc_all),
        ('Validation accuracy', val_acc_all),
    )

    fig = go.Figure()
    for label, values in curves:
        fig.add_trace(go.Scatter(x=epochs, y=values, name=label, line={'width': 4}))

    layout = {
        'title': 'Training and validation accuracy',
        'paper_bgcolor': 'rgba(0,0,0,0)',
        'plot_bgcolor': 'rgba(0,0,0,0)',
        'font': {'size': 18, 'color': 'white'},
        'xaxis': {'title': 'Epochs'},
        'yaxis': {'title': 'Accuracy'},
    }
    fig.update_layout(**layout)
    fig.show()

## Models


In [7]:
def init_weights(std: float):
    def init(m: nn.Module | nn.Linear):
        if type(m) == nn.Linear:
            nn.init.normal_(m.weight, std=std)
            
    return init

In [3]:
def test_net(net: nn.Module, x: torch.Tensor):
    for layer in net:
        x: Tensor = layer(x)
        print(f'{layer.__class__.__name__: <15}\t->\t{x.shape}')

### Softmax Regression

In [None]:
# Softmax regression: a single linear layer on the flattened image.
# The network outputs raw class scores; it contains no softmax layer itself.
softmax_net = nn.Sequential(
    nn.Flatten(),        # 1 x 28 x 28 -> 784
    nn.Linear(784, 10),  # 784 -> 10 class scores
)

### Multilayer Perceptron

In [None]:
# Multilayer perceptron with one hidden layer of 256 ReLU units.
multi_layer_net = nn.Sequential(
    nn.Flatten(),          # 1 x 28 x 28 -> 784
    nn.Linear(784, 256),   # 784 -> 256
    nn.ReLU(),
    nn.Linear(256, 10),    # 256 -> 10 class scores
)

### Convolutional Neural Network

In [None]:
# Convolutional network with sigmoid activations and average pooling.
# Shape comments are channels x height x width for a 1 x 28 x 28 input.
conv_net = nn.Sequential(
    nn.Conv2d(1, 6, kernel_size=5, padding=2),  # 1 x 28 x 28 -> 6 x 28 x 28
    nn.Sigmoid(),
    nn.AvgPool2d(kernel_size=2, stride=2),      # 6 x 28 x 28 -> 6 x 14 x 14
    nn.Conv2d(6, 16, kernel_size=5),            # 6 x 14 x 14 -> 16 x 10 x 10
    nn.Sigmoid(),
    nn.AvgPool2d(kernel_size=2, stride=2),      # 16 x 10 x 10 -> 16 x 5 x 5
    nn.Flatten(),                               # 16 x 5 x 5 -> 400
    nn.Linear(16 * 5 * 5, 120),                 # 400 -> 120
    nn.Sigmoid(),
    nn.Linear(120, 84),                         # 120 -> 84
    nn.Sigmoid(),
    nn.Linear(84, 10),                          # 84 -> 10 class scores
)

### Deep Convolutional Neural Network (AlexNet)

In [49]:
# AlexNet-style network for single-channel images.
# Shape comments are channels x height x width for a 1 x 224 x 224 input.
alex_net = nn.Sequential(
    # Feature extractor
    nn.Conv2d(1, 96, kernel_size=11, stride=4, padding=1),  # 1 x 224 x 224 -> 96 x 54 x 54
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=3, stride=2),                  # 96 x 54 x 54 -> 96 x 26 x 26
    nn.Conv2d(96, 256, kernel_size=5, padding=2),           # 96 x 26 x 26 -> 256 x 26 x 26
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=3, stride=2),                  # 256 x 26 x 26 -> 256 x 12 x 12
    nn.Conv2d(256, 384, kernel_size=3, padding=1),          # 256 x 12 x 12 -> 384 x 12 x 12
    nn.ReLU(),
    nn.Conv2d(384, 384, kernel_size=3, padding=1),          # 384 x 12 x 12 -> 384 x 12 x 12
    nn.ReLU(),
    nn.Conv2d(384, 256, kernel_size=3, padding=1),          # 384 x 12 x 12 -> 256 x 12 x 12
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=3, stride=2),                  # 256 x 12 x 12 -> 256 x 5 x 5
    nn.Flatten(),                                           # 256 x 5 x 5 -> 6400
    # Classifier
    nn.Linear(6400, 4096),                                  # 6400 -> 4096
    nn.ReLU(),
    nn.Dropout(p=0.5),
    nn.Linear(4096, 4096),                                  # 4096 -> 4096
    nn.ReLU(),
    nn.Dropout(p=0.5),
    nn.Linear(4096, 10),                                    # 4096 -> 10 class scores
)

### Batch Normalization

In [12]:
# Convolutional network with batch normalization ahead of every sigmoid.
# Shape comments are channels x height x width for a 1 x 28 x 28 input.
batch_norm_net = nn.Sequential(
    nn.Conv2d(1, 6, kernel_size=5),         # 1 x 28 x 28 -> 6 x 24 x 24
    nn.BatchNorm2d(6),
    nn.Sigmoid(),
    nn.AvgPool2d(kernel_size=2, stride=2),  # 6 x 24 x 24 -> 6 x 12 x 12
    nn.Conv2d(6, 16, kernel_size=5),        # 6 x 12 x 12 -> 16 x 8 x 8
    nn.BatchNorm2d(16),
    nn.Sigmoid(),
    nn.AvgPool2d(kernel_size=2, stride=2),  # 16 x 8 x 8 -> 16 x 4 x 4
    nn.Flatten(),                           # 16 x 4 x 4 -> 256
    nn.Linear(256, 120),                    # 256 -> 120
    nn.BatchNorm1d(120),
    nn.Sigmoid(),
    nn.Linear(120, 84),                     # 120 -> 84
    nn.BatchNorm1d(84),
    nn.Sigmoid(),
    nn.Linear(84, 10),                      # 84 -> 10 class scores
)

In [77]:
class ConvBlock(nn.Module):
    """Two-branch convolutional block for single-channel input.

    The left branch is conv(1 -> 64, padded) -> batch norm -> ReLU ->
    conv(64 -> 3, unpadded) -> ReLU; the right branch is a single unpadded
    conv(1 -> 3). Both branches shrink each spatial dimension by 2 and produce
    3 channels, and the block returns their element-wise sum.
    """

    def __init__(self):
        super().__init__()

        # Left branch
        self.lconv1 = nn.Conv2d(1, 64, kernel_size=3, padding=1)
        self.lbatch1 = nn.BatchNorm2d(64)
        self.lrelu1 = nn.ReLU()

        self.lconv2 = nn.Conv2d(64, 3, kernel_size=3)
        self.lrelu2 = nn.ReLU()

        # Right branch
        self.rconv1 = nn.Conv2d(1, 3, kernel_size=3)

    def forward(self, x):
        """Return the sum of the right-branch and left-branch outputs for ``x``."""
        left = self.lconv1(x)
        left = self.lbatch1(left)
        left = self.lrelu1(left)
        left = self.lconv2(left)
        left = self.lrelu2(left)

        right = self.rconv1(x)

        return right + left

# ConvBlock turns a 1 x H x W image into 3 x (H - 2) x (W - 2), because its unpadded
# 3x3 convolutions trim one pixel from every side. The 3 * 30 * 30 input size of the
# Linear layer therefore corresponds to 32 x 32 images.
# NOTE(review): 28 x 28 images would flatten to 3 * 26 * 26 = 2028 features and fail in
# the Linear layer — confirm this model is only fed data resized to 32 x 32.
combined_net = nn.Sequential(
    ConvBlock(),
    nn.Flatten(),
    nn.Linear(3 * 30 * 30, 10)
)

# Testing

In [3]:
# MNIST in batches of 150; the training set is divided into 45,000 training and
# 15,000 validation samples.
data = load_data(MNIST, batch_size=150, split=[45000, 15000])
data_train, data_val, data_test = data

In [19]:
# Multilayer perceptron with two hidden ReLU layers (512 and 256 units).
lab4_net = nn.Sequential(
    nn.Flatten(),          # 1 x 28 x 28 -> 784
    nn.Linear(784, 512),   # 784 -> 512
    nn.ReLU(),
    nn.Linear(512, 256),   # 512 -> 256
    nn.ReLU(),
    nn.Linear(256, 10),    # 256 -> 10 class scores
)

In [37]:
%pip install prettytable

Collecting prettytable
  Downloading prettytable-3.6.0-py3-none-any.whl (27 kB)
Installing collected packages: prettytable
Successfully installed prettytable-3.6.0
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip available: 22.3 -> 22.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [41]:
from prettytable import PrettyTable


def count_parameters(model):
    """Print a table of ``model``'s trainable parameters and return their total count.

    Every named parameter with ``requires_grad`` set contributes one row (name and
    number of elements); parameters that do not require gradients are left out of
    both the table and the total.
    """
    table = PrettyTable(["Modules", "Parameters"])
    trainable = [
        (name, parameter.numel())
        for name, parameter in model.named_parameters()
        if parameter.requires_grad
    ]
    for name, size in trainable:
        table.add_row([name, size])
    total_params = sum(size for _, size in trainable)

    print(table)
    print(f"Total Trainable Params: {total_params}")
    return total_params

# nn.LSTM(input_size, hidden_size, num_layers):
#   3 input features per time step
#   4 hidden units per layer
#   2 stacked LSTM layers (not a batch size)
net = nn.LSTM(input_size=3, hidden_size=4, num_layers=2)


count_parameters(net)

# x = torch.tensor([
#     [4, 3],
#     [0, 0],
#     [0, 0]
# ])
# 
# w = torch.tensor([
#     [1, 1, 1],
#     [-1, -1, 0],
# ])
# 
# (w @ x)

+--------------+------------+
|   Modules    | Parameters |
+--------------+------------+
| weight_ih_l0 |     48     |
| weight_hh_l0 |     64     |
|  bias_ih_l0  |     16     |
|  bias_hh_l0  |     16     |
| weight_ih_l1 |     64     |
| weight_hh_l1 |     64     |
|  bias_ih_l1  |     16     |
|  bias_hh_l1  |     16     |
+--------------+------------+
Total Trainable Params: 304


304

In [20]:
# Trace one random 1 x 1 x 28 x 28 batch through lab4_net and print each layer's output shape.
test_net(lab4_net, torch.randn(1, 1, 28, 28))

Flatten        	->	torch.Size([1, 784])
Linear         	->	torch.Size([1, 512])
ReLU           	->	torch.Size([1, 512])
Linear         	->	torch.Size([1, 256])
ReLU           	->	torch.Size([1, 256])
Linear         	->	torch.Size([1, 10])


In [8]:
# Re-draw every nn.Linear weight of lab4_net from a normal distribution with std=0.01.
# NOTE(review): train() applies Xavier-uniform initialisation to all Linear/Conv2d layers
# of the model it receives, so this initialisation is overwritten once training starts.
lab4_net.apply(init_weights(0.01))

Sequential(
  (0): Linear(in_features=784, out_features=512, bias=True)
  (1): ReLU()
  (2): Linear(in_features=512, out_features=256, bias=True)
  (3): ReLU()
  (4): Linear(in_features=256, out_features=10, bias=True)
)

In [21]:
# Cross-entropy loss and plain SGD (lr=0.05) for a 10-epoch run of lab4_net.
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.SGD(lab4_net.parameters(), lr=0.05)

train_loss, train_acc, val_loss, val_acc = train(
    net=lab4_net,
    train_iter=data_train,
    val_iter=data_val,
    num_epochs=10,
    learning_rate=0.05,
    loss=loss_fn,
    optimizer=optimizer,
    device=try_gpu(),
)

Training on cpu
Epoch |   Train Loss | Train Accuracy | Validation Loss | Validation Accuracy
    1 |         0.76 |         82.04% |            0.41 |              88.83%
    2 | ([32m-0.42[0m) 0.34 | ([32m+8.54[0m) 90.58% |    ([32m-0.08[0m) 0.33 |      ([32m+1.99[0m) 90.82%
    3 | ([32m-0.06[0m) 0.28 | ([32m+1.41[0m) 92.00% |    ([32m-0.04[0m) 0.28 |      ([32m+1.31[0m) 92.13%
    4 | ([32m-0.03[0m) 0.25 | ([32m+0.98[0m) 92.98% |    ([32m-0.03[0m) 0.26 |      ([32m+0.50[0m) 92.63%
    5 | ([32m-0.02[0m) 0.23 | ([32m+0.59[0m) 93.57% |    ([32m-0.02[0m) 0.23 |      ([32m+0.86[0m) 93.49%
    6 | ([32m-0.02[0m) 0.21 | ([32m+0.62[0m) 94.19% |    ([32m-0.01[0m) 0.22 |      ([32m+0.33[0m) 93.82%
    7 | ([32m-0.02[0m) 0.19 | ([32m+0.48[0m) 94.68% |    ([32m-0.02[0m) 0.20 |      ([32m+0.48[0m) 94.30%
    8 | ([32m-0.01[0m) 0.18 | ([32m+0.34[0m) 95.02% |    ([32m-0.01[0m) 0.19 |      ([32m+0.07[0m) 94.37%
    9 | ([32m-0.01[0m) 0.16 

In [33]:
# Per-epoch accuracy curves returned by train() above.
plot_accuracy(train_acc, val_acc)

In [34]:
# Per-epoch loss curves returned by train() above.
plot_loss(train_loss, val_loss)