In [11]:
import os
import urllib.request
from types import SimpleNamespace
from urllib.error import HTTPError

import lightning as L
import matplotlib
import matplotlib.pyplot as plt
import matplotlib_inline.backend_inline
import numpy as np
import seaborn as sns
import tabulate
import torch
import torch.nn as nn
import torch.optim as optim
import torch.optim.lr_scheduler as lr_scheduler
import torch.nn.functional as F
import torch.utils.data as data
import torchvision

%matplotlib inline
from IPython.display import HTML, display
from lightning.pytorch.callbacks import LearningRateMonitor, ModelCheckpoint, EarlyStopping, RichProgressBar
from torchvision.models.vision_transformer import vit_h_14, vit_l_32, vit_b_32
import lightning.pytorch.loggers as pl_loggers 
from PIL import Image
from torchvision import transforms
from torchvision.datasets import CIFAR10

matplotlib_inline.backend_inline.set_matplotlib_formats("svg", "pdf")  # For export
matplotlib.rcParams["lines.linewidth"] = 2.0
sns.reset_orig()

# PyTorch
# Torchvision

In [12]:
# Setting the seed
L.seed_everything(42)

Seed set to 42


42

In [13]:
# Path to the folder where the datasets are/should be downloaded (e.g. CIFAR10)
DATASET_PATH = os.environ.get("PATH_DATASETS", "data/")
# Path to the folder where the pretrained models are saved
CHECKPOINT_PATH = os.environ.get("PATH_CHECKPOINT", "results/saved_models/ConvNets")


# Function for setting the seed
L.seed_everything(42)

# Ensure that all operations are deterministic on GPU (if used) for reproducibility
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

device = torch.("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

Seed set to 42


In [14]:
train_dataset = CIFAR10(root=DATASET_PATH, train=True, download=True)
DATA_MEANS = (train_dataset.data / 255.0).mean(axis=(0, 1, 2))
DATA_STD = (train_dataset.data / 255.0).std(axis=(0, 1, 2))
print("Data mean", DATA_MEANS)
print("Data std", DATA_STD)

Files already downloaded and verified
Data mean [0.49139968 0.48215841 0.44653091]
Data std [0.24703223 0.24348513 0.26158784]


In [15]:
from lightning.pytorch.utilities.types import TRAIN_DATALOADERS
import torch.utils
import torch.utils.data


"""test_transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Resize((224, 224)),
        transforms.Normalize(DATA_MEANS, DATA_STD),
    ]
)
# For training, we add some augmentation. Networks are too powerful and would overfit.
train_transform = transforms.Compose(
    [
        transforms.RandomHorizontalFlip(),
        transforms.RandomResizedCrop((32, 32), scale=(0.8, 1.0), ratio=(0.9, 1.1)),
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(DATA_MEANS, DATA_STD),
    ]
)
# Loading the training dataset. We need to split it into a training and validation part
# We need to do a little trick because the validation set should not use the augmentation.
train_dataset = CIFAR10(
    root=DATASET_PATH, train=True, transform=train_transform, download=True
)
val_dataset = CIFAR10(
    root=DATASET_PATH, train=True, transform=test_transform, download=True
)
L.seed_everything(42)
train_set, _ = torch.utils.data.random_split(train_dataset, [45000, 5000])
L.seed_everything(42)
_, val_set = torch.utils.data.random_split(val_dataset, [45000, 5000])

# Loading the test set
test_set = CIFAR10(
    root=DATASET_PATH, train=False, transform=test_transform, download=True
)

# We define a set of data loaders that we can use for various purposes later.
train_loader = data.DataLoader(
    train_set,
    batch_size=128,
    shuffle=True,
    drop_last=True,
    pin_memory=True,
    num_workers=4,
)
val_loader = data.DataLoader(
    val_set, batch_size=2048, shuffle=False, drop_last=False, num_workers=4
)
test_loader = data.DataLoader(
    test_set, batch_size=2048, shuffle=False, drop_last=False, num_workers=4
)"""


class CIFARLitDataModule(L.LightningDataModule):
    def __init__(self) -> None:
        super().__init__()

        self.test_transform = transforms.Compose(
            [
                transforms.ToTensor(),
                transforms.Resize((224, 224)),
                transforms.Normalize(DATA_MEANS, DATA_STD),
            ]
        )
        # For training, we add some augmentation. Networks are too powerful and would overfit.
        self.train_transform = transforms.Compose(
            [
                transforms.RandomHorizontalFlip(),
                transforms.RandomResizedCrop(
                    (32, 32), scale=(0.8, 1.0), ratio=(0.9, 1.1)
                ),
                transforms.Resize((224, 224)),
                transforms.ToTensor(),
                transforms.Normalize(DATA_MEANS, DATA_STD),
            ]
        )

        self.train_set: torch.utils.data.Dataset
        self.val_set: torch.utils.data.Dataset
        self.test_set: torch.utils.data.Dataset

    def prepare_data(self) -> None:
        train_dataset = CIFAR10(
            root=DATASET_PATH, train=True, transform=self.train_transform, download=True
        )
        val_dataset = CIFAR10(
            root=DATASET_PATH, train=True, transform=self.test_transform, download=True
        )
        L.seed_everything(42)
        self.train_set, _ = torch.utils.data.random_split(train_dataset, [45000, 5000])
        L.seed_everything(42)
        _, self.val_set = torch.utils.data.random_split(val_dataset, [45000, 5000])

        # Loading the test set
        self.test_set = CIFAR10(
            root=DATASET_PATH, train=False, transform=self.test_transform, download=True
        )

    def train_dataloader(self) -> torch.utils.data.DataLoader:
        return data.DataLoader(
            self.train_set,
            batch_size=128,
            shuffle=True,
            drop_last=True,
            pin_memory=True,
            num_workers=4,
        )

    def val_dataloader(self) -> torch.utils.data.DataLoader:
        return data.DataLoader(
            self.val_set,
            batch_size=2048,
            shuffle=False,
            drop_last=False,
            num_workers=4,
        )

    def test_dataloader(self) -> torch.utils.data.DataLoader:
        return data.DataLoader(
            self.test_set,
            batch_size=2048,
            shuffle=False,
            drop_last=False,
            num_workers=4,
        )


datamodule = CIFARLitDataModule()

In [16]:
# NUM_IMAGES = 4
# images = [train_dataset[idx][0] for idx in range(NUM_IMAGES)]
# orig_images = [Image.fromarray(train_dataset.data[idx]) for idx in range(NUM_IMAGES)]
# orig_images = [datamodule.test_transform(img) for img in orig_images]
# 
# img_grid = torchvision.utils.make_grid(
#     torch.stack(images + orig_images, dim=0), nrow=4, normalize=True, pad_value=0.5
# )
# img_grid = img_grid.permute(1, 2, 0)
# 
# plt.figure(figsize=(8, 8))
# plt.title("Augmentation examples on CIFAR10")
# plt.imshow(img_grid)
# plt.axis("off")
# plt.show()
# plt.close()

In [17]:
class Classifier(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = torch.flatten(x, 1)  # flatten all dimensions except batch
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x
    
def count_params(model: nn.Module):
    params = []
    for param in model.parameters():
        params.append(param.flatten())
    return len(torch.cat(params))

class ClassifierViT(nn.Module):
    def __init__(self, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        # self.model = vit_h_14()  # ~ 600 M params
        # self.model = vit_l_32()  # ~ 308 M params
        self.model = vit_b_32()  # ~ 88 M params

    def forward(self, x):
        return self.model.forward(x)


# count_params(vit_h_14()),  count_params(vit_l_32()), count_params(vit_b_32())

In [18]:
class CIFARModule(L.LightningModule):
    def __init__(
        self, optimizer_name, optimizer_hparams, lr_scheduler_name, lr_scheduler_hparams
    ):
        """CIFARModule.

        Args:
            optimizer_name: Name of the optimizer to use. Currently supported: Adam, SGD
            optimizer_hparams: Hyperparameters for the optimizer, as dictionary. This includes learning rate, weight decay, etc.
        """
        super().__init__()
        # Exports the hyperparameters to a YAML file, and create "self.hparams" namespace
        self.save_hyperparameters()
        # Create model
        # self.model = Classifier()  # create_model(model_name, model_hparams)
        self.model = ClassifierViT()  # create_model(model_name, model_hparams)
        # Create loss module
        self.loss_module = nn.CrossEntropyLoss()
        # Example input for visualizing the graph in Tensorboard
        self.example_input_array = torch.zeros((1, 3, 32, 32), dtype=torch.float32)
        self.example_input_array = torch.zeros((1, 3, 224, 224), dtype=torch.float32)

    def forward(self, imgs):
        # Forward function that is run when visualizing the graph
        return self.model(imgs)

    def configure_optimizers(self):
        # AdamW is Adam with a correct implementation of weight decay (see here
        # for details: https://arxiv.org/pdf/1711.05101.pdf)
        try:
            optimizer_cls = getattr(optim, self.hparams.optimizer_name)
            optimizer = optimizer_cls(self.parameters(), **self.hparams.optimizer_hparams)
        except AttributeError:
            assert False, f'Unknown optimizer: "{self.hparams.optimizer_name}"'        

        try:
            scheduler_cls = getattr(lr_scheduler, self.hparams.lr_scheduler_name)
            scheduler = scheduler_cls(optimizer=optimizer, **self.hparams.lr_scheduler_hparams)
        except AttributeError:
            assert False, f'Unknown lr_scheduler: "{self.hparams.lr_scheduler_name}"'
        
        return [optimizer], [scheduler]

    def training_step(self, batch, batch_idx):
        # "batch" is the output of the training data loader.
        imgs, labels = batch
        # print(imgs.shape)
        preds = self.model(imgs)
        loss = self.loss_module(preds, labels)
        acc = (preds.argmax(dim=-1) == labels).float().mean()

        # Logs the accuracy per epoch to tensorboard (weighted average over batches)
        self.log("train_acc", acc, on_step=True, on_epoch=True)
        self.log("train_loss", loss, on_step=True, on_epoch=True)
        return loss  # Return tensor to call ".backward" on
    

    def validation_step(self, batch, batch_idx):
        imgs, labels = batch
        preds = self.model(imgs).argmax(dim=-1)
        acc = (labels == preds).float().mean()
        # By default logs it per epoch (weighted average over batches)
        self.log("val_acc", acc, on_step=True, on_epoch=True)

    def test_step(self, batch, batch_idx):
        imgs, labels = batch
        preds = self.model(imgs).argmax(dim=-1)
        acc = (labels == preds).float().mean()
        # By default logs it per epoch (weighted average over batches), and returns it afterwards
        self.log("test_acc", acc)


In [19]:
def train_model(save_name: str =None, **model_kwargs):
    """Train model.

    Args:
        save_name (optional): If specified, this name will be used for creating the checkpoint and logging directory.
    """
    if save_name is None:
        save_name = "BaselineClassifier"
    
    # Create a PyTorch Lightning trainer with the generation callback
    log_path = os.path.join(
            CHECKPOINT_PATH, save_name
        )
    datamodule = CIFARLitDataModule()

    trainer = L.Trainer(
        default_root_dir=log_path,  # Where to save models
        # We run on a single GPU (if possible)
        accelerator="auto",
        devices=1,
        # How many epochs to train for if no patience is set
        max_epochs=20,
        callbacks=[
            ModelCheckpoint(
                save_weights_only=True, mode="max", monitor="val_acc_epoch"
            ),  # Save the best checkpoint based on the maximum val_acc recorded. Saves only weights and not optimizer
            LearningRateMonitor("epoch"),
        ],  # Log learning rate every epoch
        logger=[pl_loggers.CSVLogger(log_path), pl_loggers.TensorBoardLogger(log_path)]
    )  # In case your notebook crashes due to the progress bar, consider increasing the refresh rate
    trainer.logger._log_graph = (
        True  # If True, we plot the computation graph in tensorboard
    )
    trainer.logger._default_hp_metric = (
        None  # Optional logging argument that we don't need
    )
    model = CIFARModule(**model_kwargs)
    # trainer.fit(model, train_loader, val_loader)
    trainer.fit(model, datamodule=datamodule)
    
    model = CIFARModule.load_from_checkpoint(
        trainer.checkpoint_callback.best_model_path
    )  # Load best checkpoint after training
    # Test best model on validation and test set
    val_result = trainer.test(model, datamodule=datamodule, verbose=False)
    # val_result = trainer.test(model, dataloaders=val_loader, verbose=False)
    test_result = trainer.test(model, datamodule=datamodule, verbose=False)
    # test_result = trainer.test(model, dataloaders=test_loader, verbose=False)
    result = {"test": test_result[0]["test_acc"], "val": val_result[0]["test_acc"]}

    return model, result



In [20]:
constant_lr_scheduler_config = [
    {
        "optimizer_name": "AdamW",
        # "optimizer_hparams": {"lr": learning_rate, "momentum": 0.9, "weight_decay": 1e-4},
        "optimizer_hparams": {"lr": learning_rate, "weight_decay": 1e-4},
        "lr_scheduler_name": "ConstantLR",
        "lr_scheduler_hparams": {"factor": 1},
    }
    for learning_rate in np.linspace(1e-6, 1e-1, 20)
]

train_model(None, **constant_lr_scheduler_config[0])

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


Files already downloaded and verified
Files already downloaded and verified


Seed set to 42
Seed set to 42


Files already downloaded and verified



  | Name        | Type             | Params | In sizes         | Out sizes
--------------------------------------------------------------------------------
0 | model       | ClassifierViT    | 88.2 M | [1, 3, 224, 224] | [1, 1000]
1 | loss_module | CrossEntropyLoss | 0      | ?                | ?        
--------------------------------------------------------------------------------
88.2 M    Trainable params
0         Non-trainable params
88.2 M    Total params
352.897   Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/home/robin/miniconda3/envs/DLRK/lib/python3.11/site-packages/lightning/pytorch/trainer/call.py:54: Detected KeyboardInterrupt, attempting graceful shutdown...


IsADirectoryError: [Errno 21] Is a directory: '/home/robin/projects/DeepLearningResearchKitchen/project/'

Do Baseline experiments:
- Constant LR scheduler
- different lr and observe test error 

Adam experiments:
- observe lr for Adam across training runs 