<a href="https://colab.research.google.com/github/Squirrelcoding/mini-projects/blob/main/ResNet_performance_visualizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from torch import nn

In [2]:
!pip install torch



In [3]:
import torch

In [4]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [5]:
import requests
from pathlib import Path

# Download helper functions from Learn PyTorch repo (if not already downloaded)
if Path("helper_functions.py").is_file():
  print("helper_functions.py already exists, skipping download")
else:
  print("Downloading helper_functions.py")
  # Note: you need the "raw" GitHub URL for this to work
  request = requests.get("https://raw.githubusercontent.com/mrdbourke/pytorch-deep-learning/main/helper_functions.py")
  with open("helper_functions.py", "wb") as f:
    f.write(request.content)

helper_functions.py already exists, skipping download


In [6]:
# !wget http://cs231n.stanford.edu/tiny-imagenet-200.zip
# !unzip -q tiny-imagenet-200.zip


URL transformed to HTTPS due to an HSTS policy
--2025-08-07 23:12:20--  https://cs231n.stanford.edu/tiny-imagenet-200.zip
Resolving cs231n.stanford.edu (cs231n.stanford.edu)... 171.64.64.64
Connecting to cs231n.stanford.edu (cs231n.stanford.edu)|171.64.64.64|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 248100043 (237M) [application/zip]
Saving to: ‘tiny-imagenet-200.zip.1’

tiny-imagenet-200.z  12%[=>                  ]  28.41M  7.77MB/s    eta 27s    ^C
replace tiny-imagenet-200/words.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: N


In [7]:
import torchvision.datasets as datasets
import torchvision.transforms as transforms

mnist_train = datasets.MNIST(root='./data', train=True, download=True, transform=transforms.ToTensor())
mnist_test = datasets.MNIST(root='./data', train=False, download=True, transform=transforms.ToTensor())

print(f"Train size: {len(mnist_train)}")
print(f"Test size: {len(mnist_test)}")


Train size: 60000
Test size: 10000


In [8]:
class BasicBlock(nn.Module):
  def __init__(self, channels: int, out_channels: int, downsampling=False) -> None:
    super().__init__()

    stride = 2 if downsampling else 1

    self.channels = channels
    # First convolutional layer. If downsampling, we set the stride to 2.
    self.conv1 = nn.Conv2d(channels, out_channels, kernel_size=3, stride=stride, padding=1)
    # batch normalization
    self.bn1 = nn.BatchNorm2d(num_features=out_channels)
    # ReLU. Not really necessary but helps with keeping track of stuff.
    self.relu1 = nn.ReLU()
    # second convolutional layer that increases channels if downsampling
    self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
    # batch normalization
    self.bn2 = nn.BatchNorm2d(num_features=out_channels)
    self.relu2 = nn.ReLU()

    self.downsampling = None
    if downsampling:
      # double the channels to keep time complexity of layers while halving the input
      self.downsampling = nn.Sequential(nn.Conv2d(in_channels=self.channels,
                                    out_channels=self.channels * 2,
                                    kernel_size=1,
                                    padding=0,
                                    stride=2), nn.BatchNorm2d(num_features=out_channels))

  def forward(self, x):
    x_copy = x
    x = self.conv1(x)
    x = self.bn1(x)
    x = self.relu1(x)
    x = self.conv2(x)
    x = self.bn2(x)

    # downsample stuff to match dimensions of x and x_copy
    if self.downsampling:
      x_copy = self.downsampling(x_copy)

    x = x + x_copy
    x = self.relu2(x)
    return x

In [9]:
import torch

class ResNet18(nn.Module):
  def __init__(self, in_channels: int) -> None:
    super().__init__()

    # Deepnet part
    self.conv1 = nn.Conv2d(in_channels, out_channels=64, stride=2, kernel_size=7, padding=1)
    self.conv10 = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

    self.conv2_1 = BasicBlock(64, 64)
    self.conv2_2 = BasicBlock(64, 64)
    self.conv3_1 = BasicBlock(64, 128, downsampling=True)
    self.conv3_2 = BasicBlock(128, 128)
    self.conv4_1 = BasicBlock(128,256, downsampling=True)
    self.conv4_2 = BasicBlock(256, 256)
    self.conv5_1 = BasicBlock(256, 512, downsampling=True)
    self.conv5_2 = BasicBlock(512, 512)

    # Classification head

    # Average pooling
    self.avg_pool = nn.AdaptiveAvgPool2d((1, 1))

    self.fc_layer = nn.Linear(512, 200)

  def forward(self, x):
    x = self.conv1(x)
    x = self.conv10(x)
    x = self.conv2_1(x)
    x = self.conv2_2(x)
    x = self.conv3_1(x)
    x = self.conv3_2(x)
    x = self.conv4_1(x)
    x = self.conv4_2(x)
    x = self.conv5_1(x)
    x = self.conv5_2(x)

    x = self.avg_pool(x)

    x = torch.flatten(x, 1)

    x = self.fc_layer(x)
    return x

In [10]:
from PIL import Image


model = ResNet18(in_channels=3).to(device)


In [11]:
import os
def find_classes(root_dir: str):
  folder_classes = {}
  for i, folder in enumerate(os.listdir(root_dir)):
    folder_classes[folder] = i
  return [x for x in folder_classes], folder_classes

find_classes("tiny-imagenet-200/train")

(['n03584254',
  'n04465501',
  'n02793495',
  'n03970156',
  'n06596364',
  'n02094433',
  'n03026506',
  'n03770439',
  'n04149813',
  'n03976657',
  'n07695742',
  'n03355925',
  'n04501370',
  'n02927161',
  'n03649909',
  'n04456115',
  'n04133789',
  'n03100240',
  'n04398044',
  'n03014705',
  'n02814860',
  'n03255030',
  'n02917067',
  'n02841315',
  'n01644900',
  'n04254777',
  'n04376876',
  'n03160309',
  'n07711569',
  'n04560804',
  'n02730930',
  'n02321529',
  'n03617480',
  'n02165456',
  'n09193705',
  'n02113799',
  'n04532670',
  'n02099601',
  'n01768244',
  'n04597913',
  'n02815834',
  'n02106662',
  'n07614500',
  'n02206856',
  'n02843684',
  'n07873807',
  'n09256479',
  'n02415577',
  'n09428293',
  'n02403003',
  'n03637318',
  'n02791270',
  'n02950826',
  'n07615774',
  'n03977966',
  'n02123045',
  'n07749582',
  'n02074367',
  'n01784675',
  'n02480495',
  'n02909870',
  'n07753592',
  'n02948072',
  'n02802426',
  'n03447447',
  'n02437312',
  'n034002

In [12]:
from torchvision import datasets
from torchvision.transforms import ToTensor

train_data = datasets.ImageFolder(root="tiny-imagenet-200/train", # target folder of images
                                  transform=ToTensor(), # transforms to perform on data (images)
                                  target_transform=None) # transforms to perform on labels (if necessary)
test_data = datasets.ImageFolder(root="tiny-imagenet-200/test", # target folder of images
                                  transform=ToTensor(), # transforms to perform on data (images)
                                  target_transform=None) # transforms to perform on labels (if necessary)
val_data = datasets.ImageFolder(root="tiny-imagenet-200/val", # target folder of images
                                  transform=ToTensor(), # transforms to perform on data (images)
                                  target_transform=None) # transforms to perform on labels (if necessary)

In [30]:
from torch.utils.data import DataLoader

# Setup the batch size hyperparameter
BATCH_SIZE = 256

# Turn datasets into iterables (batches)
train_dataloader = DataLoader(train_data, # dataset to turn into iterable
    batch_size=BATCH_SIZE, # how many samples per batch?
    shuffle=True # shuffle data every epoch?
)

test_dataloader = DataLoader(test_data,
    batch_size=BATCH_SIZE,
    shuffle=False # don't necessarily have to shuffle the testing data
)

val_dataloader = DataLoader(val_data,
    batch_size=BATCH_SIZE,
    shuffle=False # don't necessarily have to shuffle the testing data
)
# Let's check out what we've created
print(f"Dataloaders: {train_dataloader, val_dataloader}")
print(f"Length of train dataloader: {len(train_dataloader)} batches of {BATCH_SIZE}")
print(f"Length of val dataloader: {len(val_dataloader)} batches of {BATCH_SIZE}")

Dataloaders: (<torch.utils.data.dataloader.DataLoader object at 0x7a7745f7c910>, <torch.utils.data.dataloader.DataLoader object at 0x7a776c163790>)
Length of train dataloader: 391 batches of 256
Length of val dataloader: 40 batches of 256


In [31]:
import torch
from torch.optim.lr_scheduler import ReduceLROnPlateau

loss_fn = nn.CrossEntropyLoss()


# Your existing setup
# model = ... ResNet18 ...
# loss_fn = nn.CrossEntropyLoss()

# 1. Define the optimizer (use a higher initial LR like the paper, e.g., 0.1)
# Note: 0.01 is also a fine starting point for a smaller dataset like Tiny ImageNet.
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9, weight_decay=1e-4)

# 2. Define the scheduler
# This will decrease the LR by a factor of 10 at epoch 5 and epoch 8.
scheduler = ReduceLROnPlateau(
    optimizer,
    mode='min',
    factor=0.1,
    patience=5 # A common starting value; you may need to tune this
)

In [32]:
from helper_functions import accuracy_fn # Note: could also use torchmetrics.Accuracy(task = 'multiclass', num_classes=len(class_names)).to(device)

from timeit import default_timer as timer
def print_train_time(start: float, end: float, device: torch.device = None):
    """Prints difference between start and end time.

    Args:
        start (float): Start time of computation (preferred in timeit format).
        end (float): End time of computation.
        device ([type], optional): Device that compute is running on. Defaults to None.

    Returns:
        float: time between start and end in seconds (higher is longer).
    """
    total_time = end - start
    print(f"Train time on {device}: {total_time:.3f} seconds")
    return total_time

In [33]:
# Import tqdm for progress bar
from tqdm.auto import tqdm
from timeit import default_timer as timer
import torch
import torch.optim as optim
import torch.nn as nn
from torch.optim.lr_scheduler import ReduceLROnPlateau

# Assuming model, loss_fn, optimizer, accuracy_fn, train_dataloader, test_dataloader, and device are defined

# Set the seed and start the timer
torch.manual_seed(42)
train_time_start_on_cpu = timer()

# Define the learning rate scheduler as specified in the paper
# It divides the learning rate by 10 (factor=0.1) when the error plateaus.
# 'mode' is 'min' because we're monitoring validation loss.
# 'patience' determines how many epochs to wait for improvement before reducing LR.
# The paper doesn't specify patience, so a common starting value like 5 is used.
scheduler = ReduceLROnPlateau(
    optimizer,
    mode='min',
    factor=0.1,
    patience=5,
    verbose=True # To see when the learning rate is adjusted
)

# ResNet paper trains for up to 60 * 10^4 iterations (mini-batches).
# We'll set a high number of epochs and break the loop if iterations are met,
# or let the scheduler reduce LR until convergence.
# A typical ResNet training can take 90-120 epochs, but we'll adapt to iterations.
max_iterations = 60 * 10**4
current_iterations = 0
epochs = 120 # Set a sufficiently high number of epochs to reach max_iterations or convergence

# Create training and testing loop
for epoch in tqdm(range(epochs)):
    print(f"Epoch: {epoch}\n-------")
    ### Training
    train_loss = 0
    model.train() # Set the model to training mode

    # Add a loop to loop through training batches
    for batch_idx, (X, y) in enumerate(train_dataloader):
        # Break if max_iterations is reached
        if current_iterations >= max_iterations:
            print(f"Reached max_iterations ({max_iterations}). Stopping training.")
            break

        X, y = X.to(device), y.to(device)

        # 1. Forward pass
        y_pred = model(X)

        # 2. Calculate loss (per batch)
        loss = loss_fn(y_pred, y)
        train_loss += loss.item() # Use .item() to get a standard Python number for summation

        # 3. Optimizer zero grad
        optimizer.zero_grad()

        # 4. Loss backward
        loss.backward()

        # 5. Optimizer step
        optimizer.step()

        current_iterations += 1 # Increment iteration count

        # Print out how many samples have been seen
        if current_iterations % 1000 == 0: # Print less frequently to avoid spamming console
            print(f"Iteration: {current_iterations} | Looked at {batch_idx * len(X)}/{len(train_dataloader.dataset)} samples")

    # If max_iterations was reached within the inner loop, break the outer loop too
    if current_iterations >= max_iterations:
        break

    # Divide total train loss by length of train dataloader (average loss per batch per epoch)
    train_loss /= len(train_dataloader)

    ### Testing (Validation)
    # Setup variables for accumulatively adding up loss and accuracy
    test_loss, test_acc = 0, 0
    model.eval() # Set the model to evaluation mode
    with torch.inference_mode():
        for X, y in val_dataloader:
            X, y = X.to(device), y.to(device)
            # 1. Forward pass
            test_pred = model(X)

            # 2. Calculate loss (accumulatively)
            test_loss += loss_fn(test_pred, y).item() # Use .item()

            # 3. Calculate accuracy (preds need to be same as y_true)
            test_acc += accuracy_fn(y_true=y, y_pred=test_pred.argmax(dim=1))

        # Calculations on test metrics need to happen inside torch.inference_mode()
        # Divide total test loss by length of test dataloader (per batch)
        test_loss /= len(val_dataloader)

        # Divide total accuracy by length of test dataloader (per batch)
        test_acc /= len(val_dataloader)

    ## Print out what's happening
    # Step the scheduler based on the validation loss (test_loss)
    scheduler.step(test_loss)
    current_lr = optimizer.param_groups[0]['lr'] # Get the current learning rate from the optimizer

    print(f"\nTrain loss: {train_loss:.5f} | Test loss: {test_loss:.5f}, Test acc: {test_acc:.2f}%, Current LR: {current_lr:.6f}\n")
    print(f"Saving model at epoch {epoch} and iteration {current_iterations}...")
    torch.save(model.state_dict(), f"model_epoch{epoch}_iter{current_iterations}.pth")

# Calculate training time
train_time_end_on_cpu = timer()
total_train_time_model_0 = print_train_time(start=train_time_start_on_cpu,
                                           end=train_time_end_on_cpu,
                                           device=str(next(model.parameters()).device))

  0%|          | 0/120 [00:00<?, ?it/s]

Epoch: 0
-------

Train loss: 0.02432 | Test loss: 17.02022, Test acc: 0.63%, Current LR: 0.010000

Saving model at epoch 0 and iteration 391...
Epoch: 1
-------

Train loss: 0.01496 | Test loss: 17.08986, Test acc: 0.62%, Current LR: 0.010000

Saving model at epoch 1 and iteration 782...
Epoch: 2
-------
Iteration: 1000 | Looked at 55552/100000 samples

Train loss: 0.01231 | Test loss: 17.22180, Test acc: 0.59%, Current LR: 0.010000

Saving model at epoch 2 and iteration 1173...
Epoch: 3
-------

Train loss: 0.01127 | Test loss: 17.18932, Test acc: 0.61%, Current LR: 0.010000

Saving model at epoch 3 and iteration 1564...
Epoch: 4
-------

Train loss: 0.01003 | Test loss: 17.26799, Test acc: 0.61%, Current LR: 0.010000

Saving model at epoch 4 and iteration 1955...
Epoch: 5
-------
Iteration: 2000 | Looked at 11264/100000 samples

Train loss: 0.00922 | Test loss: 17.38690, Test acc: 0.59%, Current LR: 0.010000

Saving model at epoch 5 and iteration 2346...
Epoch: 6
-------


KeyboardInterrupt: 