In [2]:
%load_ext lab_black
%load_ext autoreload
%autoreload 2
%matplotlib inline

The lab_black extension is already loaded. To reload it, use:
  %reload_ext lab_black


# Load dataset

In [22]:
import numpy as np
import torch as ch
import torch.nn as nn

from pathlib import Path

from ffcv.pipeline.operation import Operation
from ffcv.loader import Loader, OrderOption
from ffcv.transforms import (
    ToTensor,
    ToDevice,
    Squeeze,
    NormalizeImage,
    RandomHorizontalFlip,
    ToTorchImage,
)
from ffcv.fields.rgb_image import (
    CenterCropRGBImageDecoder,
    RandomResizedCropRGBImageDecoder,
)
from ffcv.fields.basics import IntDecoder

from tqdm import tqdm

In [23]:
def get_resolution(epoch, min_res=160, max_res=192, end_ramp=76, start_ramp=65):
    """Return the training image resolution to use at a given epoch.

    The resolution is held at ``min_res`` up to and including ``start_ramp``,
    is ``max_res`` from ``end_ramp`` onwards, and in between is linearly
    interpolated and then rounded to the nearest multiple of 32.

    Args:
        epoch: Epoch index (int or float).
        min_res: Resolution used before the ramp starts.
        max_res: Resolution used once the ramp has finished.
        end_ramp: Epoch at which ``max_res`` is reached.
        start_ramp: Last epoch that still uses ``min_res``.

    Returns:
        int: Side length, in pixels, of the square training crop.

    Raises:
        ValueError: If ``min_res`` is greater than ``max_res``.
    """
    # Explicit exception instead of `assert`, which is stripped under `python -O`.
    if min_res > max_res:
        raise ValueError(
            f"min_res ({min_res}) must not be greater than max_res ({max_res})"
        )

    if epoch <= start_ramp:
        return min_res

    if epoch >= end_ramp:
        return max_res

    # Linear interpolation between the two ramp endpoints, then snap to a
    # multiple of 32.
    interp = np.interp([epoch], [start_ramp, end_ramp], [min_res, max_res])
    final_res = int(np.round(interp[0] / 32)) * 32
    return final_res

In [24]:
# Per-channel mean / std handed to NormalizeImage in both image pipelines.
# The 0-1 values are multiplied by 255 — presumably because the decoded images
# carry 0-255 pixel values at that point (TODO confirm against FFCV docs).
IMAGENET_MEAN = np.array([0.485, 0.456, 0.406]) * 255
IMAGENET_STD = np.array([0.229, 0.224, 0.225]) * 255
# Ratio given to the validation CenterCropRGBImageDecoder (224 / 256 = 0.875).
DEFAULT_CROP_RATIO = 224 / 256

In [27]:
# --- Training loader configuration -----------------------------------------
# NOTE(review): absolute, user-specific path; consider deriving it from a
# configurable data directory so the notebook runs on other machines.
train_dataset = "/home/soroush1/projects/def-kohitij/soroush1/training_fast_publish_faster/data/imagenet_train_256.ffcv"
num_workers = 10
batch_size = 512
distributed = 0
in_memory = 1

device = "cuda:0" if ch.cuda.is_available() else "cpu"
# `train_path` is only used for this existence check; the Loader below is
# given the original string `train_dataset`.
train_path = Path(train_dataset)
assert train_path.is_file()

# Resolution is fixed at the epoch-0 value; nothing in this cell updates the
# decoder for later epochs.
res = get_resolution(epoch=0)
decoder = RandomResizedCropRGBImageDecoder((res, res))
# Image pipeline: decode with a random resized crop, random horizontal flip,
# convert to tensor, move to `device`, convert to torch image layout, then
# normalize with the ImageNet statistics and emit float16.
image_pipeline = [
    decoder,
    RandomHorizontalFlip(),
    ToTensor(),
    ToDevice(ch.device(device), non_blocking=True),
    ToTorchImage(),
    NormalizeImage(IMAGENET_MEAN, IMAGENET_STD, np.float16),
]

# Label pipeline: decode the integer label, convert to tensor, squeeze, and
# move to `device`.
label_pipeline = [
    IntDecoder(),
    ToTensor(),
    Squeeze(),
    ToDevice(ch.device(device), non_blocking=True),
]

# RANDOM ordering when running distributed, QUASI_RANDOM otherwise.
order = OrderOption.RANDOM if distributed else OrderOption.QUASI_RANDOM

tr_loader = Loader(
    train_dataset,
    batch_size=batch_size,
    num_workers=num_workers,
    order=order,
    os_cache=in_memory,
    drop_last=True,
    pipelines={"image": image_pipeline, "label": label_pipeline},
    distributed=distributed,
)

# One full pass over the loader with an empty body: no training happens here.
# Presumably a data-loading throughput check (see the tqdm timing in the output).
total_tr_data = len(tr_loader)
for i, (imgs, lbls) in tqdm(enumerate(tr_loader), total=total_tr_data):
    pass

100%|██████████| 2502/2502 [05:53<00:00,  7.09it/s]


In [29]:
# --- Validation loader configuration ---------------------------------------
# NOTE(review): absolute, user-specific path; consider deriving it from a
# configurable data directory so the notebook runs on other machines.
val_dataset = "/home/soroush1/projects/def-kohitij/soroush1/training_fast_publish_faster/data/imagenet_validation_256.ffcv"
num_workers = 10
batch_size = 512
distributed = 0
# NOTE(review): `in_memory` is assigned here but not passed to the Loader below.
in_memory = 1
resolution = 256

device = "cuda:0" if ch.cuda.is_available() else "cpu"
# `val_dataset` is rebound from a str to a Path here; the Path object is what
# the Loader below receives.
val_dataset = Path(val_dataset)
assert val_dataset.is_file()

res_tuple = (resolution, resolution)
cropper = CenterCropRGBImageDecoder(res_tuple, ratio=DEFAULT_CROP_RATIO)

# Image pipeline: center-crop decode, convert to tensor, move to `device`,
# convert to torch image layout, then normalize and emit float16.
# No random augmentation is applied for validation.
image_pipeline = [
    cropper,
    ToTensor(),
    ToDevice(ch.device(device), non_blocking=True),
    ToTorchImage(),
    NormalizeImage(IMAGENET_MEAN, IMAGENET_STD, np.float16),
]

# Label pipeline: decode the integer label, convert to tensor, squeeze, and
# move to `device`.
label_pipeline = [
    IntDecoder(),
    ToTensor(),
    Squeeze(),
    ToDevice(ch.device(device), non_blocking=True),
]

# Sequential order and drop_last=False so every validation sample is visited.
val_loader = Loader(
    val_dataset,
    batch_size=batch_size,
    num_workers=num_workers,
    order=OrderOption.SEQUENTIAL,
    drop_last=False,
    pipelines={"image": image_pipeline, "label": label_pipeline},
    distributed=distributed,
)

# One full pass over the loader with an empty body: no evaluation happens here.
# Presumably a data-loading throughput check (see the tqdm timing in the output).
total_val_data = len(val_loader)
for i, (imgs, lbls) in tqdm(enumerate(val_loader), total=total_val_data):
    pass

100%|██████████| 98/98 [00:14<00:00,  6.89it/s]


# Create model and gradient scaler

In [34]:
from torchvision import models
from torch.cuda.amp import GradScaler

In [35]:
class BlurPoolConv2d(ch.nn.Module):
    """Convolution preceded by a fixed 3x3 blur of its input.

    The input is first filtered per channel (depthwise, ``groups=in_channels``)
    with the kernel ``[[1, 2, 1], [2, 4, 2], [1, 2, 1]] / 16`` at stride 1 and
    padding 1, so the spatial size is unchanged; the wrapped convolution is
    then applied to the blurred tensor.

    Motivation: low-pass filtering before a strided convolution reduces
    aliasing, which occurs when high-frequency content is sampled too
    sparsely during downsampling.

    Args:
        conv: The ``ch.nn.Conv2d`` to wrap. It is stored as the ``conv``
            submodule, so its parameters remain part of the model.
    """

    def __init__(self, conv):
        super().__init__()
        default_filter = ch.tensor([[[[1, 2, 1], [2, 4, 2], [1, 2, 1]]]]) / 16.0
        # One copy of the kernel per input channel: shape (in_channels, 1, 3, 3).
        filt = default_filter.repeat(conv.in_channels, 1, 1, 1)
        self.conv = conv
        # Buffer rather than parameter: it follows the module across devices
        # but is not returned by parameters(), so the optimizer never sees it.
        self.register_buffer("blur_filter", filt)

    def forward(self, x):
        """Blur ``x`` channel-wise, then apply the wrapped convolution."""
        # `ch.nn.functional` is referenced explicitly because no `F` alias is
        # imported in this notebook; the bare `F.conv2d` raised NameError.
        blurred = ch.nn.functional.conv2d(
            x,
            self.blur_filter,
            stride=1,
            padding=(1, 1),
            groups=self.conv.in_channels,
            bias=None,
        )
        # Call the module (not `.forward`) so registered hooks still run.
        return self.conv(blurred)


def apply_blurpool(mod: ch.nn.Module):
    """Recursively wrap qualifying convolutions of ``mod`` in ``BlurPoolConv2d``.

    A direct child is replaced in place when it is a ``ch.nn.Conv2d`` whose
    largest stride exceeds 1 and which has at least 16 input channels. Every
    other child is descended into instead. Nothing is returned; ``mod`` is
    modified in place.
    """
    for child_name, submodule in mod.named_children():
        needs_blur = (
            isinstance(submodule, ch.nn.Conv2d)
            and np.max(submodule.stride) > 1
            and submodule.in_channels >= 16
        )
        if not needs_blur:
            # Not a target itself: look for targets among its own children.
            apply_blurpool(submodule)
            continue
        setattr(mod, child_name, BlurPoolConv2d(submodule))

In [39]:
# --- Model configuration ---------------------------------------------------
arch = "alexnet"  # name of the constructor looked up on `torchvision.models`
weights = None  # forwarded as the constructor's `weights` argument
use_blurpool = True
# NOTE(review): `checkpoint` is assigned but not used anywhere in this cell.
checkpoint = None
device = "cuda:0" if ch.cuda.is_available() else "cpu"

# Gradient scaler for mixed-precision training: the loss is scaled before
# backward() so that small float16 gradients do not underflow.
scaler = GradScaler()  # since we're using float16, that's why we need to scale our loss
model = getattr(models, arch)(weights=weights)


# NOTE(review): apply_blurpool only wraps Conv2d layers with stride > 1 AND at
# least 16 input channels. In the AlexNet printed below the only strided conv
# has 3 input channels, so no layer is replaced for this architecture.
if use_blurpool:
    apply_blurpool(model)

# Switch to channels_last memory format, then move the model to `device`.
model = model.to(memory_format=ch.channels_last)
model = model.to(device)

In [44]:
model

AlexNet(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(11, 11), stride=(4, 4), padding=(2, 2))
    (1): ReLU(inplace=True)
    (2): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(64, 192, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (4): ReLU(inplace=True)
    (5): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Conv2d(192, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (7): ReLU(inplace=True)
    (8): Conv2d(384, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): ReLU(inplace=True)
    (10): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (avgpool): AdaptiveAvgPool2d(output_size=(6, 6))
  (classifier): Sequential(
    (0): Dropout(p=0.5, inplace=False)
    (1): Linear(in_features=9216, out_features=4096, bias=True)
 

# Optimization, Losses, Scheduler

In [51]:
import torchmetrics

In [52]:
# SGD optimizer. Weight decay is disabled for batch-norm parameters (detected
# by "bn" appearing in the parameter name) and applied to all other parameters.
momentum = 0.9  # pytorch documentation
weight_decay = 1e-4  # pytorch documentation weight decay
label_smoothing = 0.1  # ffcv documentation
lr = 0.01
lr_warmup_epochs = 0
lr_warmup_decay = 0.01
lr_step_size = 30
lr_gamma = 0.1
lr_warmup_method = "linear"
device = "cuda:0" if ch.cuda.is_available() else "cpu"

# Split the parameters into the two weight-decay groups in a single pass.
all_params = list(model.named_parameters())
bn_params = []
other_params = []
for k, v in all_params:
    print(f"{k =}")
    if "bn" in k:
        bn_params.append(v)
    else:
        other_params.append(v)

param_groups = [
    {"params": bn_params, "weight_decay": 0.0},
    {"params": other_params, "weight_decay": weight_decay},
]

optimizer = ch.optim.SGD(param_groups, lr=lr, momentum=momentum)
loss = ch.nn.CrossEntropyLoss(label_smoothing=label_smoothing)
top1 = torchmetrics.Accuracy(task="multiclass", num_classes=1000).to(device)
top5 = torchmetrics.Accuracy(
    task="multiclass",
    num_classes=1000,
    top_k=5,
).to(device)


# Scheduler: multiply the learning rate by `lr_gamma` every `lr_step_size`
# scheduler steps, optionally preceded by a linear warm-up phase.
main_lr_scheduler = ch.optim.lr_scheduler.StepLR(
    optimizer, step_size=lr_step_size, gamma=lr_gamma
)

if lr_warmup_epochs > 0:
    # Fail loudly on an unsupported method. Previously this fell through to a
    # NameError, or silently reused a stale `warmup_lr_scheduler` left in the
    # kernel by an earlier run.
    if lr_warmup_method != "linear":
        raise ValueError(
            f"Unsupported lr_warmup_method {lr_warmup_method!r}; "
            "only 'linear' is supported."
        )

    warmup_lr_scheduler = ch.optim.lr_scheduler.LinearLR(
        optimizer, start_factor=lr_warmup_decay, total_iters=lr_warmup_epochs
    )

    lr_scheduler = ch.optim.lr_scheduler.SequentialLR(
        optimizer,
        schedulers=[warmup_lr_scheduler, main_lr_scheduler],
        milestones=[lr_warmup_epochs],
    )
else:
    lr_scheduler = main_lr_scheduler

k ='features.0.weight'
k ='features.0.bias'
k ='features.3.weight'
k ='features.3.bias'
k ='features.6.weight'
k ='features.6.bias'
k ='features.8.weight'
k ='features.8.bias'
k ='features.10.weight'
k ='features.10.bias'
k ='classifier.1.weight'
k ='classifier.1.bias'
k ='classifier.4.weight'
k ='classifier.4.bias'
k ='classifier.6.weight'
k ='classifier.6.bias'


# Training 

In [60]:
from torch.cuda.amp import autocast

In [62]:
def train_loop(epoch, model, tr_loader, optimizer, loss, scaler, log_level):
    """Run one mixed-precision training epoch and return the mean batch loss.

    Args:
        epoch: Epoch index; only used in the progress-bar description.
        model: Module to train; it is switched to training mode.
        tr_loader: Iterable yielding ``(images, target)`` batches.
        optimizer: Optimizer stepped once per batch through ``scaler``.
        loss: Callable computing the loss from ``(output, target)``.
        scaler: Gradient scaler used to scale the loss and step the optimizer.
        log_level: When greater than 0, per-batch loss, top-1 and top-5
            accuracy are shown in the progress bar. The accuracies come from
            the notebook-level ``top1`` / ``top5`` metric objects.

    Returns:
        float: Mean of the per-batch losses, or ``nan`` if the loader yielded
        no batches. The mean is computed regardless of ``log_level``.
    """
    model.train()
    batch_losses = []

    iterator = tqdm(tr_loader)
    for ix, (images, target) in enumerate(iterator):
        optimizer.zero_grad(set_to_none=True)

        with autocast():
            output = model(images)
            loss_train = loss(output, target)

        scaler.scale(loss_train).backward()
        scaler.step(optimizer)
        scaler.update()

        # Always record the loss so the returned mean is meaningful even with
        # logging disabled. Kept on-device to avoid a forced sync per step.
        batch_losses.append(loss_train.detach().float())

        if log_level > 0:
            group_lrs = [f'{group["lr"]:.3f}' for group in optimizer.param_groups]

            top1_acc = top1(output, target)
            top5_acc = top5(output, target)

            names = ["ep", "iter", "shape", "lrs"]
            values = [epoch, ix, tuple(images.shape), group_lrs]
            names += ["loss", "top1", "top5"]
            values += [
                f"{loss_train.item():.3f}",
                f"{top1_acc.item():.3f}",
                f"{top5_acc.item():.3f}",
            ]

            msg = ", ".join(f"{n}={v}" for n, v in zip(names, values))
            iterator.set_description(msg)

    if not batch_losses:
        return float("nan")
    return ch.stack(batch_losses).mean().item()


def val_loop(epoch, model, tr_loader, optimizer, loss, scaler, log_level):
    """Placeholder for a validation pass; not implemented yet.

    Takes the same arguments as ``train_loop``, ignores all of them, and
    returns ``None``.
    """
    # TODO: implement evaluation.
    return None

In [63]:
epochs = 2
log_level = 1

for epoch in range(epochs):
    train_loss = train_loop(
        epoch=epoch,
        model=model,
        tr_loader=tr_loader,
        optimizer=optimizer,
        loss=loss,
        scaler=scaler,
        log_level=log_level,
    )
    # Advance the learning-rate schedule once per epoch, after that epoch's
    # optimizer steps. Without this call the scheduler configured earlier never
    # changes the learning rate.
    lr_scheduler.step()



ep=0, iter=2501, shape=(512, 3, 160, 160), lrs=['0.010', '0.010'], loss=5.960, top1=0.037, top5=0.086: 100%|██████████| 2502/2502 [10:04<00:00,  4.14it/s]
ep=1, iter=2501, shape=(512, 3, 160, 160), lrs=['0.010', '0.010'], loss=5.385, top1=0.072, top5=0.184: 100%|██████████| 2502/2502 [04:33<00:00,  9.15it/s]
