<a href="https://colab.research.google.com/github/PietroZamberlan/dino-neuromatch/blob/joel_example/resnet_example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import copy

import ipywidgets as widgets
import matplotlib.pyplot as plt
import numpy as np

import time
import torch
import torchvision
import torchvision.datasets as datasets
import torch.nn.functional as F
import torch.nn as nn
import torch.optim as optim
from tqdm.auto import tqdm

In [None]:
def load_mnist_data(change_tensors=False, download=True):
  """
  Build the MNIST train and test datasets, standardised with the mean and
  standard deviation of the *training* pixels.

  Args:
    change_tensors: Bool
      If True, the `.data` tensors of both datasets are overwritten with
      standardised float values and the datasets keep a plain `ToTensor`
      transform. If False, `.data` is left untouched and the datasets are
      re-created with a `ToTensor` + `Normalize` transform instead.
    download: Bool
      Forwarded to `datasets.MNIST` (fetch the files if they are missing)

  Returns:
    train_set: datasets.MNIST
      Training split
    test_set: datasets.MNIST
      Test split
  """
  transforms = torchvision.transforms

  def _mnist(train, transform):
    # Single place that knows where the dataset lives on disk
    return datasets.MNIST(root='.', train=train, download=download,
                          transform=transform)

  train_set = _mnist(True, transforms.ToTensor())
  test_set = _mnist(False, transforms.ToTensor())

  # Statistics come from the training pixels only (raw values, range [0, 255])
  mean = train_set.data.float().mean()
  std = train_set.data.float().std()

  if change_tensors:
    # Standardise the stored tensors themselves
    # NOTE(review): `.data` becomes a float tensor here; indexing the dataset
    # afterwards may not behave like stock MNIST - confirm before using it
    # through a DataLoader.
    train_set.data = (train_set.data.float() - mean) / std
    test_set.data = (test_set.data.float() - mean) / std
    return train_set, test_set

  # ToTensor rescales pixels to [0, 1], hence the statistics are divided by 255
  tform = transforms.Compose([transforms.ToTensor(),
                              transforms.Normalize(mean=[mean / 255.], std=[std / 255.])
                              ])
  return _mnist(True, tform), _mnist(False, tform)


train_set, test_set = load_mnist_data(change_tensors=True)

100%|██████████| 9.91M/9.91M [00:02<00:00, 4.94MB/s]
100%|██████████| 28.9k/28.9k [00:00<00:00, 131kB/s]
100%|██████████| 1.65M/1.65M [00:01<00:00, 1.25MB/s]
100%|██████████| 4.54k/4.54k [00:00<00:00, 2.97MB/s]


In [None]:
def eval_model(model, loss_fn, data_loader, num_batches=np.inf, device='cpu'):
  """
  Evaluate a model on (a prefix of) a data loader without tracking gradients.

  Args:
    model: nn.Module derived class
      The model which is to be evaluated. It is moved to `device` and put in
      eval mode for the duration of the call; its previous train/eval mode is
      restored before returning.
    loss_fn: Callable
      Called as `loss_fn(logits, labels)` with the raw model outputs, which is
      what `nn.CrossEntropyLoss` expects. Must return a scalar tensor.
    data_loader: Iterable
      Yields batches whose first two items are the inputs and the labels
    num_batches: Integer
      Maximum number of minibatches to evaluate (default: all of them)
    device: String
      Device on which the evaluation runs

  Returns:
    Mean of the per-batch losses and mean of the per-batch accuracies
    (unweighted averages over the evaluated minibatches)
  """

  loss_log, acc_log = [], []
  model.to(device=device)

  # Remember the current mode so that evaluating in the middle of training
  # does not leave the model stuck in eval mode.
  was_training = model.training
  model.eval()
  try:
    # We are just evaluating the model, no need to compute gradients
    with torch.no_grad():
      for batch_id, batch in enumerate(data_loader):
        # batch_id is 0-based, so `>=` stops after exactly num_batches batches
        if batch_id >= num_batches:
          break
        # Extract minibatch data
        data, labels = batch[0].to(device), batch[1].to(device)
        # Evaluate model and loss on minibatch. The loss receives the logits:
        # applying softmax first would normalise twice with CrossEntropyLoss.
        logits = model(data)
        loss_log.append(loss_fn(logits, labels).item())
        # argmax of the logits equals argmax of the softmax probabilities
        acc_log.append(torch.mean(1. * (logits.argmax(dim=1) == labels)).item())
  finally:
    model.train(was_training)

  return np.mean(loss_log), np.mean(acc_log)

In [None]:
# @title Set device (GPU or CPU). Execute `set_device()`
# especially if torch modules used.

# Inform the user if the notebook uses GPU or CPU.

def set_device():
  """
  Pick the compute device (CUDA if available, CPU otherwise) and print
  which one was selected.

  Args:
    None

  Returns:
    device: String
      "cuda" or "cpu"
  """
  if torch.cuda.is_available():
    print("GPU is enabled in this notebook.")
    return "cuda"

  # No GPU: tell the user how to enable one in Colab
  print("WARNING: For this notebook to perform best, "
        "if possible, in the menu under `Runtime` -> "
        "`Change runtime type.`  select `GPU` ")
  return "cpu"

In [None]:
# @title Set random seed

# @markdown Executing `set_seed(seed=seed)` you are setting the seed

# For DL it's critical to set the random seed so that students have a
# baseline against which to compare their results.
# Read more here: https://pytorch.org/docs/stable/notes/randomness.html

# Call `set_seed` function in the exercises to ensure reproducibility.
import random
import torch

def set_seed(seed=None, seed_torch=True):
  """
  Function that controls randomness.
  NumPy and random modules must be imported.

  Args:
    seed : Integer
      A non-negative integer that defines the random state. Default is `None`,
      in which case a seed in [0, 2**32) is drawn from NumPy's global RNG.
    seed_torch : Boolean
      If `True` sets the random seed for pytorch tensors, so pytorch module
      must be imported. Default is `True`.

  Returns:
    Nothing.
  """
  if seed is None:
    # Ask for a 64-bit draw explicitly: 2**32 does not fit in a 32-bit default
    # integer. Convert to a plain Python int for the seeding calls below.
    seed = int(np.random.randint(0, 2 ** 32, dtype=np.int64))
  random.seed(seed)
  np.random.seed(seed)
  if seed_torch:
    torch.manual_seed(seed)
    # Seeds every CUDA device, which includes the current one
    torch.cuda.manual_seed_all(seed)
    # Trade cuDNN autotuning for run-to-run determinism
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

  print(f'Random seed {seed} has been set.')


# In case that `DataLoader` is used
def seed_worker(worker_id):
  """
  Seed NumPy and Python's `random` from torch's initial seed, reduced
  modulo 2**32. Intended to be passed as `worker_init_fn` to a DataLoader.
  Refer: https://pytorch.org/docs/stable/data.html#data-loading-randomness for more details

  Args:
    worker_id: integer
      ID of the worker being initialised (not used by this function)

  Returns:
    Nothing
  """
  derived_seed = torch.initial_seed() % (2 ** 32)
  random.seed(derived_seed)
  np.random.seed(derived_seed)

In [None]:
# Global seed reused by later cells (training cell and DataLoader generator)
SEED = 2021
set_seed(seed=SEED)
# "cuda" or "cpu", as returned by set_device()
DEVICE = set_device()

Random seed 2021 has been set.
GPU is enabled in this notebook.


In [None]:
class ResidualLinearBlock(nn.Module):
  """
  Residual block computing (Linear(ReLU(x)) + x) / sqrt(2).

  Args:
    units: Integer
      Width of the block; input and output both have `units` features
  """

  def __init__(self, units):
    super().__init__()
    self.relu = nn.ReLU()
    self.fc1 = nn.Linear(units, units)

  def forward(self, x):
    # Residual branch: activation first, then the linear map
    branch = self.fc1(self.relu(x))
    # Skip connection, then rescale the sum by 1/sqrt(2)
    return (branch + x) * (1 / np.sqrt(2))

In [None]:
class ResidualConvBlock(nn.Module):
  """
  Residual block: conv -> batchnorm -> ReLU -> conv -> batchnorm, plus a skip
  connection, optional 2x2 max-pooling, and a final ReLU.

  Args:
    in_channels: Integer
      Number of channels of the input
    out_channels: Integer
      Number of channels produced by both convolutions
    kernel_size: Integer
      Kernel size of both convolutions
    stride: Integer
      Stride passed to both convolutions. They are built with
      `padding='same'`, which PyTorch documents as unsupported for strides
      other than 1.
    downsample: Bool
      If True, a `MaxPool2d(2, 2)` is applied after the skip connection is added

  When `in_channels != out_channels` the input cannot be added to the branch
  output directly, so the skip path goes through a 1x1 convolution that
  matches the channel count. With equal channel counts the skip path is the
  plain identity and the block has no extra parameters.
  """

  def __init__(self, in_channels, out_channels:int, kernel_size:int, stride:int, downsample:bool=False):
    super(ResidualConvBlock, self).__init__()
    self.relu = nn.ReLU(inplace=True) #inplace = True optimises by performing the ReLu in-place (no new Tensor is created)
    self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding='same')
    self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size, stride, padding='same')

    self.batchnorm1 = nn.BatchNorm2d(out_channels)
    self.batchnorm2 = nn.BatchNorm2d(out_channels)

    self.downsample = downsample
    if self.downsample:
      self.downsampler = nn.MaxPool2d(2, 2)

    # Only create the projection when it is needed, so blocks with matching
    # channel counts keep exactly the same parameters as before.
    if in_channels != out_channels:
      self.projection = nn.Conv2d(in_channels, out_channels, kernel_size=1)
    else:
      self.projection = None

  def forward(self, x):

    # Skip path: identity, or 1x1 projection when the channel counts differ
    identity = x if self.projection is None else self.projection(x)

    # Go forward on the blocks
    x = self.conv1(x)
    x = self.batchnorm1(x)
    x = self.relu(x)

    x = self.conv2(x)
    x = self.batchnorm2(x)

    # Add the residual to the output of the blocks
    x += identity

    if self.downsample:
      x = self.downsampler(x)

    x = self.relu(x)

    return x

In [None]:
# He initialization of weights
def weights_init(layer_in):
  """
  He (Kaiming) normal initialisation for fully-connected layers, meant to be
  used with `module.apply(weights_init)`.

  Args:
    layer_in: nn.Module
      Any module. Only `nn.Linear` instances are modified: their weights are
      re-drawn with `kaiming_normal_` (fan_in, ReLU gain) and their bias, if
      they have one, is set to zero. Every other module type is left as is.

  Returns:
    Nothing
  """
  if isinstance(layer_in, nn.Linear):
    nn.init.kaiming_normal_(layer_in.weight, mode='fan_in', nonlinearity='relu')
    # Layers built with bias=False have `bias is None`
    if layer_in.bias is not None:
      nn.init.zeros_(layer_in.bias)

In [None]:
class ResidualMLP(nn.Module):
  """
  Fully-connected residual network: a linear input layer, `depth` residual
  linear blocks, then ReLU and a linear output layer.

  Args:
    in_dim: Integer
      Number of input features after flattening
    out_dim: Integer
      Number of outputs
    depth: Integer
      Number of `ResidualLinearBlock`s in the hidden stack
    units_per_block: Integer
      Width of the hidden representation
  """

  def __init__(self, in_dim=784, out_dim=10, depth=3, units_per_block=128):
    super().__init__()

    self.in_dim = in_dim
    self.initial_layer = nn.Linear(in_dim, units_per_block)
    self.hidden_layers = nn.Sequential(
        *(ResidualLinearBlock(units_per_block) for _ in range(depth))
    )
    self.final_layer = nn.Sequential(
        nn.ReLU(),
        nn.Linear(units_per_block, out_dim)
    )

  def forward(self, x):
    # Flatten to (batch_dim, in_dim); -1 lets .view infer the batch dimension
    flat = x.view(-1, self.in_dim)
    return self.final_layer(self.hidden_layers(self.initial_layer(flat)))

In [None]:
class ResidualCNNClassifer(nn.Module):
  """
  Residual classifier.

  NOTE(review): despite its name, this class currently builds the same
  fully-connected architecture as `ResidualMLP` (linear input layer, `depth`
  `ResidualLinearBlock`s, ReLU + linear output) and contains no convolutions.

  Args:
    in_dim: Integer
      Number of input features after flattening
    out_dim: Integer
      Number of outputs
    depth: Integer
      Number of residual blocks in the hidden stack
    units_per_block: Integer
      Width of the hidden representation
  """

  def __init__(self, in_dim=784, out_dim=10, depth=3, units_per_block=128):
    # Must initialise *this* class: super(ResidualMLP, self) raised a TypeError
    # because ResidualCNNClassifer is not a subclass of ResidualMLP.
    super().__init__()

    self.in_dim = in_dim

    self.initial_layer = nn.Linear(in_dim, units_per_block)

    blocks_list = []
    for i in range(depth):
      blocks_list.append(ResidualLinearBlock((units_per_block)))
    self.hidden_layers = nn.Sequential(*blocks_list)

    self.final_layer = nn.Sequential(
        nn.ReLU(),
        nn.Linear(units_per_block, out_dim)
    )

  def forward(self, x):
    transformed_x = x.view(-1, self.in_dim) # flatten the input into (batch_dim, in_dim), using a -1 to tell .view to just make the math work

    hidden_out = self.initial_layer(transformed_x)
    hidden_out = self.hidden_layers(hidden_out)
    output = self.final_layer(hidden_out)

    return output

In [None]:
# Training Settings

MAX_EPOCHS = 2
LR = 6e-4 * np.sqrt(2)
BATCH_SIZE = 32

# Use the device selected by `set_device()` so this cell also runs on
# CPU-only runtimes instead of failing on a hard-coded 'cuda'.
my_model = ResidualMLP(depth=10).to(DEVICE)


optimizer = torch.optim.AdamW(my_model.parameters(), lr=LR, weight_decay=1e-3)

# CrossEntropyLoss expects raw (unnormalised) logits as its input
lossfn = nn.CrossEntropyLoss()

In [None]:
set_seed(seed=SEED)
# Print training stats every LOG_FREQ minibatches
LOG_FREQ = 200
# Frequency for evaluating the validation metrics
VAL_FREQ = 200
# Load data using a Pytorch Dataset
train_set_orig, test_set_orig = load_mnist_data(change_tensors=False)

# We separate 10,000 training samples to create a validation set
train_set_orig, val_set_orig = torch.utils.data.random_split(train_set_orig, [50000, 10000])

# Create the corresponding DataLoaders for training and test
g_seed = torch.Generator()
g_seed.manual_seed(SEED)

train_loader = torch.utils.data.DataLoader(train_set_orig,
                                           shuffle=True,
                                           batch_size=BATCH_SIZE,
                                           num_workers=2,
                                           worker_init_fn=seed_worker,
                                           generator=g_seed)
val_loader = torch.utils.data.DataLoader(val_set_orig,
                                         shuffle=True,
                                         batch_size=256,
                                         num_workers=2,
                                         worker_init_fn=seed_worker,
                                         generator=g_seed)
test_loader = torch.utils.data.DataLoader(test_set_orig,
                                          batch_size=256,
                                          num_workers=2,
                                          worker_init_fn=seed_worker,
                                          generator=g_seed)

# Run training
metrics = {'train_loss':[],
           'train_acc':[],
           'val_loss':[],
           'val_acc':[],
           'val_idx':[]}

step_idx = 0
for epoch in tqdm(range(MAX_EPOCHS)):

  running_loss, running_acc = 0., 0.

  for batch_id, batch in enumerate(train_loader):
    step_idx += 1
    # Extract minibatch data and labels
    data, labels = batch[0].to(DEVICE), batch[1].to(DEVICE)
    # Refresh gradient accumulators.
    # Note that this is a method of the optimizer.
    optimizer.zero_grad()
    # Evaluate model and loss on minibatch

    logits = my_model(data)

    # nn.CrossEntropyLoss applies log-softmax itself, so it must be given the
    # raw logits. Feeding it softmax probabilities normalises twice, which
    # flattens the gradients and keeps the loss from approaching zero.
    loss = lossfn(logits, labels)
    acc = torch.mean(1.0 * (logits.argmax(dim=1) == labels))

    # Compute gradients
    loss.backward()
    # Update parameters
    # Note how all the magic in the update of the parameters is encapsulated by
    # the optimizer class.
    optimizer.step()
    # Log metrics for plotting (read each scalar back from the device once)
    loss_value = loss.item()
    acc_value = acc.item()
    metrics['train_loss'].append(loss_value)
    metrics['train_acc'].append(acc_value)

    if batch_id % VAL_FREQ == (VAL_FREQ - 1):
      # Get an estimate of the validation accuracy with up to 100 batches
      val_loss, val_acc = eval_model(my_model, lossfn, val_loader,
                                     num_batches=100,
                                     device=DEVICE)
      metrics['val_idx'].append(step_idx)
      metrics['val_loss'].append(val_loss)
      metrics['val_acc'].append(val_acc)

      print(f"[VALID] Epoch {epoch + 1} - Batch {batch_id + 1} - "
            f"Loss: {val_loss:.3f} - Acc: {100*val_acc:.3f}%")

    # print statistics
    running_loss += loss_value
    running_acc += acc_value
    # Print every LOG_FREQ minibatches
    if batch_id % LOG_FREQ == (LOG_FREQ-1):
      print(f"[TRAIN] Epoch {epoch + 1} - Batch {batch_id + 1} - "
            f"Loss: {running_loss / LOG_FREQ:.3f} - "
            f"Acc: {100 * running_acc / LOG_FREQ:.3f}%")

      running_loss, running_acc = 0., 0.

Random seed 2021 has been set.


  0%|          | 0/2 [00:00<?, ?it/s]

[VALID] Epoch 1 - Batch 200 - Loss: 1.725 - Acc: 73.301%
[TRAIN] Epoch 1 - Batch 200 - Loss: 1.885 - Acc: 57.781%
[VALID] Epoch 1 - Batch 400 - Loss: 1.600 - Acc: 86.309%
[TRAIN] Epoch 1 - Batch 400 - Loss: 1.651 - Acc: 81.297%
[VALID] Epoch 1 - Batch 600 - Loss: 1.599 - Acc: 86.172%
[TRAIN] Epoch 1 - Batch 600 - Loss: 1.608 - Acc: 85.344%
[VALID] Epoch 1 - Batch 800 - Loss: 1.580 - Acc: 88.047%
[TRAIN] Epoch 1 - Batch 800 - Loss: 1.597 - Acc: 86.500%
[VALID] Epoch 1 - Batch 1000 - Loss: 1.564 - Acc: 89.873%
[TRAIN] Epoch 1 - Batch 1000 - Loss: 1.582 - Acc: 87.922%
[VALID] Epoch 1 - Batch 1200 - Loss: 1.562 - Acc: 89.912%
[TRAIN] Epoch 1 - Batch 1200 - Loss: 1.580 - Acc: 88.141%
[VALID] Epoch 1 - Batch 1400 - Loss: 1.587 - Acc: 87.520%
[TRAIN] Epoch 1 - Batch 1400 - Loss: 1.577 - Acc: 88.422%
[VALID] Epoch 2 - Batch 200 - Loss: 1.575 - Acc: 88.564%
[TRAIN] Epoch 2 - Batch 200 - Loss: 1.573 - Acc: 88.797%
[VALID] Epoch 2 - Batch 400 - Loss: 1.578 - Acc: 88.408%
[TRAIN] Epoch 2 - Batch 4

# Various Testing Functions


In [None]:
# testing fns
blocks_list = []
for i in range(1,101):
  blocks_list.append(ResidualLinearBlock((256)))
BigBoy = nn.Sequential(*blocks_list)

BigBoy.apply(weights_init)
BigBoy.to('cuda')

Sequential(
  (0): ResidualLinearBlock(
    (relu): ReLU()
    (fc1): Linear(in_features=256, out_features=256, bias=True)
  )
  (1): ResidualLinearBlock(
    (relu): ReLU()
    (fc1): Linear(in_features=256, out_features=256, bias=True)
  )
  (2): ResidualLinearBlock(
    (relu): ReLU()
    (fc1): Linear(in_features=256, out_features=256, bias=True)
  )
  (3): ResidualLinearBlock(
    (relu): ReLU()
    (fc1): Linear(in_features=256, out_features=256, bias=True)
  )
  (4): ResidualLinearBlock(
    (relu): ReLU()
    (fc1): Linear(in_features=256, out_features=256, bias=True)
  )
  (5): ResidualLinearBlock(
    (relu): ReLU()
    (fc1): Linear(in_features=256, out_features=256, bias=True)
  )
  (6): ResidualLinearBlock(
    (relu): ReLU()
    (fc1): Linear(in_features=256, out_features=256, bias=True)
  )
  (7): ResidualLinearBlock(
    (relu): ReLU()
    (fc1): Linear(in_features=256, out_features=256, bias=True)
  )
  (8): ResidualLinearBlock(
    (relu): ReLU()
    (fc1): Linear(in_

In [None]:
# Standard-normal input (unit variance) on the same device as the model
batch = torch.randn(65536, 256).to(DEVICE)
print(f'Variance of the input({batch.var()})')

Variance of the input(1.0000959634780884)


In [None]:
# Measurement only: skip autograd bookkeeping so the activations of all
# stacked blocks are not kept in memory.
with torch.no_grad():
  out = BigBoy(batch)
print(f'Variance of the output({out.var()})')

Variance of the output(0.4467712342739105)


In [None]:
#testing conv blocks
blocks_list = []
for i in range(1,100):
  downsample = True if i%2==0 else False
  blocks_list.append(ResidualConvBlock(2, 2, 3, 1, False))
ConvBlocks = nn.Sequential(*blocks_list)

ConvBlocks.apply(weights_init)
ConvBlocks.to('cuda')

Sequential(
  (0): ResidualConvBlock(
    (relu): ReLU(inplace=True)
    (conv1): Conv2d(2, 2, kernel_size=(3, 3), stride=(1, 1), padding=same)
    (conv2): Conv2d(2, 2, kernel_size=(3, 3), stride=(1, 1), padding=same)
    (batchnorm1): BatchNorm2d(2, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (batchnorm2): BatchNorm2d(2, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (1): ResidualConvBlock(
    (relu): ReLU(inplace=True)
    (conv1): Conv2d(2, 2, kernel_size=(3, 3), stride=(1, 1), padding=same)
    (conv2): Conv2d(2, 2, kernel_size=(3, 3), stride=(1, 1), padding=same)
    (batchnorm1): BatchNorm2d(2, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (batchnorm2): BatchNorm2d(2, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (2): ResidualConvBlock(
    (relu): ReLU(inplace=True)
    (conv1): Conv2d(2, 2, kernel_size=(3, 3), stride=(1, 1), padding=same)
    (conv2): Conv2d(2, 2, kernel_size=(3, 3), 

In [None]:
# Standard-normal input images (batch=2, channels=2, 256x256) on the model's device
batch = torch.randn(2, 2, 256, 256).to(DEVICE)
print(f'Variance of the input({batch.var()})')

Variance of the input(0.9947458505630493)


In [None]:
# Measurement only: skip autograd bookkeeping so the activations of all
# stacked blocks are not kept in memory.
with torch.no_grad():
  out = ConvBlocks(batch)
print(f'Variance of the output({out.var()})')

Variance of the output(38.918663024902344)


another commit


In [None]:
# added a change
