# Q&A
* How do we determine the data split between the train, validation and test sets? Since we cannot use the test set for any observations, is it even possible?  

# Setup

## Libraries

In [1]:
%matplotlib inline

In [2]:
!pip install matplotlib torch torchvision numpy pandas scikit-learn wandb



In [3]:
import os
import time

import matplotlib.pyplot as plt
import numpy as np
from IPython.display import clear_output
from tqdm.auto import tqdm

import torch
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
from torch import nn

import wandb

  from .autonotebook import tqdm as notebook_tqdm


## Config

In [4]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    _device = torch.device("cuda")
else:
    _device = torch.device("cpu")
print(f"Using device: {_device}")

Using device: cuda


## Import data
**About data:** The dataset consists of 102 flower categories, and each class has between 40 and 258 images. The images have large scale, pose, and light variations. In addition, there are categories that have large variations within the category and several very similar categories.  
The default split of the dataset is 1020, 1020 and 6149 images for training, validation and test sets respectively.
If you can handle the bigger training dataset, you can experiment by taking up to 80% of the test set for training.

In [5]:
class InMemDataLoader(object):
    """
    A data loader that keeps all data in CPU or GPU memory.

    Every sample of the wrapped dataset is read once at construction time,
    converted to tensors and stacked into a ``torch.utils.data.TensorDataset``.
    Iterating the loader afterwards only indexes those stacked tensors, so no
    per-epoch decoding or transformation takes place.
    """

    # Becomes True at the end of __init__; __setattr__ uses it to freeze
    # batch_size / sampler / drop_last afterwards.
    __initialized = False

    def __init__(
        self,
        dataset,
        batch_size=1,
        shuffle=False,
        sampler=None,
        batch_sampler=None,
        drop_last=False,
    ):
        """
        A torch dataloader that fetches data from memory.

        Args:
            dataset: indexable object with ``len()`` whose items are sequences
                of fields (tensors or values ``torch.as_tensor`` accepts).
            batch_size: number of samples per batch.
            shuffle: draw samples with a ``RandomSampler`` instead of
                sequentially.
            sampler: optional custom sampler (mutually exclusive with
                ``shuffle``).
            batch_sampler: optional sampler yielding whole index batches
                (mutually exclusive with ``batch_size``, ``shuffle``,
                ``sampler`` and ``drop_last``).
            drop_last: drop the final incomplete batch.

        Raises:
            ValueError: if mutually exclusive options are combined.
        """
        samples = []
        for i in tqdm(range(len(dataset))):
            # torch.tensor() on an existing tensor copy-constructs it and emits
            # a UserWarning; as_tensor() reuses it. torch.stack below creates
            # the actual copy, and detach() keeps the stored data out of any
            # autograd graph, as torch.tensor() did.
            samples.append([torch.as_tensor(field).detach() for field in dataset[i]])
        # Transpose "list of samples" into "one stacked tensor per field".
        tensors = [torch.stack(column) for column in zip(*samples)]
        dataset = torch.utils.data.TensorDataset(*tensors)
        self.dataset = dataset
        self.batch_size = batch_size
        self.drop_last = drop_last

        if batch_sampler is not None:
            if batch_size > 1 or shuffle or sampler is not None or drop_last:
                raise ValueError(
                    "batch_sampler option is mutually exclusive "
                    "with batch_size, shuffle, sampler, and "
                    "drop_last"
                )
            self.batch_size = None
            self.drop_last = None

        if sampler is not None and shuffle:
            raise ValueError("sampler option is mutually exclusive with " "shuffle")

        if batch_sampler is None:
            if sampler is None:
                if shuffle:
                    sampler = torch.utils.data.RandomSampler(dataset)
                else:
                    sampler = torch.utils.data.SequentialSampler(dataset)
            batch_sampler = torch.utils.data.BatchSampler(
                sampler, batch_size, drop_last
            )

        self.sampler = sampler
        self.batch_sampler = batch_sampler
        self.__initialized = True

    def __setattr__(self, attr, val):
        """Reject changes to the batching configuration once initialised."""
        if self.__initialized and attr in ("batch_size", "sampler", "drop_last"):
            raise ValueError(
                "{} attribute should not be set after {} is "
                "initialized".format(attr, self.__class__.__name__)
            )

        super(InMemDataLoader, self).__setattr__(attr, val)

    def __iter__(self):
        """Yield one tuple of tensors per batch of indices from the batch sampler."""
        for batch_indices in self.batch_sampler:
            yield self.dataset[batch_indices]

    def __len__(self):
        """Return the number of batches produced per iteration."""
        return len(self.batch_sampler)

    def to(self, device):
        """Move all stored tensors to ``device`` and return ``self`` for chaining."""
        self.dataset.tensors = tuple(t.to(device) for t in self.dataset.tensors)
        return self

In [6]:
def load_flowers(batch_size=64, test_train_valid_percent=(0.1, 0.8, 0.1), transform=None, Loader=torch.utils.data.DataLoader):
    """
    Build train/valid/test loaders for the Flowers102 dataset.

    Args:
        batch_size: batch size handed to every loader.
        test_train_valid_percent: fractions, in the order (test, train, valid),
            of the corresponding torchvision split to keep. Each subset is the
            leading part of its split (indices ``0 .. int(len * fraction) - 1``).
        transform: torchvision transform applied to every image; when None the
            images are resized to 224x224, converted to tensors and normalised
            with mean 0.5 / std 0.5 per channel.
        Loader: loader class, called as ``Loader(dataset, batch_size=..., shuffle=...)``.

    Returns:
        dict with keys 'train', 'valid' and 'test'; only the train loader shuffles.
    """
    if transform is None:
        default_steps = [
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
        ]
        transform = transforms.Compose(default_steps)

    test_fraction, train_fraction, valid_fraction = test_train_valid_percent

    def _leading_subset(split_name, fraction):
        # Download (if needed) one official split and keep its first `fraction` part.
        full_split = torchvision.datasets.Flowers102(
            root='./data', split=split_name, download=True, transform=transform
        )
        keep = int(len(full_split) * fraction)
        return torch.utils.data.Subset(full_split, range(keep))

    test_subset = _leading_subset('test', test_fraction)
    train_subset = _leading_subset('train', train_fraction)
    valid_subset = _leading_subset('val', valid_fraction)

    return {
        'train': Loader(train_subset, batch_size=batch_size, shuffle=True),
        'valid': Loader(valid_subset, batch_size=batch_size, shuffle=False),
        'test': Loader(test_subset, batch_size=batch_size, shuffle=False),
    }


In [7]:
# Keep every sample of all three splits (fractions of 1) and hold them in memory.
batch_size = 64
data_loaders = load_flowers(
    batch_size=batch_size,
    test_train_valid_percent=(1, 1, 1),
    Loader=InMemDataLoader,
)

  batch = [torch.tensor(t) for t in dataset[i]]
100%|██████████| 1020/1020 [00:10<00:00, 100.07it/s]
100%|██████████| 1020/1020 [00:09<00:00, 104.65it/s]
100%|██████████| 6149/6149 [00:59<00:00, 103.79it/s]


# Solution

## Task 1

### Custom layers

### Model

In [8]:
class Model1(nn.Module):
    """Thin ``nn.Module`` wrapper that chains the given layers in an ``nn.Sequential``."""

    def __init__(self, *args, **kwargs):
        """Store ``nn.Sequential(*args, **kwargs)`` as ``self.layers``."""
        super().__init__()
        self.layers = nn.Sequential(*args, **kwargs)

    def forward(self, x):
        """Feed ``x`` through the stored layers and return the result."""
        return self.layers(x)

In [9]:
def evaluate(model, data_loader):
    """
    Compute the mean cross-entropy loss and the accuracy of ``model``.

    The model is switched to evaluation mode for the duration of the call, so
    Dropout is disabled and BatchNorm uses (and does not update) its running
    statistics. The previous training/eval mode is restored before returning,
    so calling this in the middle of training does not change the training
    setup.

    Args:
        model: module mapping a batch of inputs to per-class logits.
        data_loader: iterable of ``(inputs, labels)`` batches that exposes the
            underlying dataset as ``data_loader.dataset``.

    Returns:
        ``(loss, accuracy)``: summed loss and number of correct predictions,
        both divided by ``len(data_loader.dataset)``.
    """
    total_loss = 0
    correct = 0
    # Sum (not mean) per batch so that dividing by the dataset size below
    # gives a per-sample average even when the last batch is smaller.
    loss_fn = nn.CrossEntropyLoss(
        reduction='sum',
    )

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for inputs, labels in data_loader:
                inputs, labels = inputs.to(_device), labels.to(_device)

                outputs = model(inputs)
                total_loss += loss_fn(outputs, labels).item()
                predictions = outputs.argmax(dim=1)
                correct += (predictions == labels).sum().item()
    finally:
        # Hand the model back in whatever mode the caller had it in.
        model.train(was_training)

    num_samples = len(data_loader.dataset)
    return total_loss / num_samples, correct / num_samples

In [10]:
def train_step(model, optimizer, loss_fn):
    """
    Run one optimisation pass over ``data_loaders['train']``.

    For every mini-batch: move it to ``_device``, clear the gradients, compute
    ``loss_fn`` on the model output, back-propagate and step ``optimizer``.
    The batch loss and the batch index are logged to wandb after each step.
    Returns None.
    """
    train_loader = data_loaders['train']
    for step, (images, targets) in enumerate(train_loader):
        images = images.to(_device)
        targets = targets.to(_device)

        # Gradients accumulate across backward() calls, so reset them first.
        optimizer.zero_grad()
        batch_loss = loss_fn(model(images), targets)
        batch_loss.backward()
        optimizer.step()

        wandb.log({"loss": batch_loss.item(), "batch_idx": step})

In [18]:
def init_weights(model):
    """
    Initialize weights of the model using Kaiming uniform distribution.

    Every ``nn.Conv2d`` and ``nn.Linear`` found in ``model.modules()`` gets its
    weight drawn with ``nn.init.kaiming_uniform_`` (ReLU gain) and its bias,
    when present, set to zero. All other module types (e.g. BatchNorm) keep
    their current parameters. The model is modified in place; nothing is
    returned.
    """
    for layer in model.modules():
        # Linear layers share the Kaiming path with the convolutions; a
        # separate Linear-only branch placed after this check could never run.
        if isinstance(layer, (nn.Conv2d, nn.Linear)):
            nn.init.kaiming_uniform_(layer.weight, nonlinearity='relu')
            if layer.bias is not None:
                nn.init.zeros_(layer.bias)

### Train

In [None]:
# version 1 of model - too simple
# Two conv/pool stages followed by a two-layer classifier head.
model_type = "v1"

model = Model1(
    # Stage 1: 3 -> 64 channels, then halve the spatial size.
    nn.Conv2d(in_channels=3, out_channels=64, kernel_size=3, padding=1),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=2),
    nn.Dropout(p=0.25),

    # Stage 2: 64 -> 128 channels, then halve the spatial size again.
    nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, padding=1),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=2),
    nn.Dropout(p=0.25),

    # Classifier head: the Linear input size corresponds to 56x56 feature maps,
    # i.e. 224x224 inputs after the two 2x poolings.
    nn.Flatten(),
    nn.Linear(in_features=128 * 56 * 56, out_features=512),
    nn.ReLU(),
    nn.Dropout(p=0.5),
    nn.Linear(in_features=512, out_features=102),
)

In [19]:
# version 2
# Large strided first convolution followed by three 3x3 convolutions.
# Spatial sizes in the comments assume 224x224 inputs.
model_type = "v2"

model = Model1(
    nn.Conv2d(3, 128, kernel_size=11, stride=4, padding=2),  # 224 -> 55
    nn.ReLU(),

    nn.MaxPool2d(kernel_size=5, stride=2),                   # 55 -> 26
    nn.Dropout(p=0.25),

    nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1),
    nn.ReLU(),

    nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1),
    # This convolution was the only one without an activation; give it the
    # same ReLU every other convolution in the network has.
    nn.ReLU(),

    nn.MaxPool2d(kernel_size=2, stride=2),                   # 26 -> 13
    nn.Dropout(p=0.25),

    nn.Conv2d(256, 256, kernel_size=3),                      # 13 -> 11
    nn.ReLU(),

    nn.MaxPool2d(kernel_size=2, stride=2),                   # 11 -> 5
    nn.Dropout(p=0.25),

    nn.Flatten(),                                            # 256 * 5 * 5 = 6400
    nn.Linear(6400, 512),
    nn.ReLU(),

    nn.Dropout(p=0.5),
    nn.Linear(512, 102),
)

In [69]:
# stacked 3x3 convs
# Conv + BatchNorm + ReLU blocks in two groups, each closed by pooling and dropout.
model_type = "v3_stack3x3"

model = Model1(
    # Group 1
    nn.Conv2d(in_channels=3, out_channels=64, kernel_size=5, stride=3, padding=2),
    nn.BatchNorm2d(num_features=64),
    nn.ReLU(),
    nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3),
    nn.BatchNorm2d(num_features=128),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=3),
    nn.Dropout(p=0.25),

    # Group 2
    nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=2),
    nn.BatchNorm2d(num_features=256),
    nn.ReLU(),
    nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3),
    nn.BatchNorm2d(num_features=256),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=3),
    nn.Dropout(p=0.25),

    # Classifier head (2304 = 256 channels * 3 * 3 for 224x224 inputs)
    nn.Flatten(),
    nn.Linear(in_features=2304, out_features=1024),
    nn.ReLU(),
    nn.Linear(in_features=1024, out_features=512),
    nn.ReLU(),
    nn.Dropout(p=0.5),
    nn.Linear(in_features=512, out_features=102),
)

In [42]:
# version 3 = alexnet
# AlexNet-style stack: five conv blocks with BatchNorm, then three Linear layers.
model_type = "alexnet"

model = Model1(
    # Conv block 1
    nn.Conv2d(in_channels=3, out_channels=96, kernel_size=11, stride=4),
    nn.BatchNorm2d(num_features=96),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=3, stride=2),
    nn.Dropout(p=0.25),

    # Conv block 2
    nn.Conv2d(in_channels=96, out_channels=256, kernel_size=5, padding=2),
    nn.BatchNorm2d(num_features=256),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=3, stride=2),
    nn.Dropout(p=0.25),

    # Conv blocks 3-5 (size-preserving 3x3 convolutions)
    nn.Conv2d(in_channels=256, out_channels=384, kernel_size=3, padding=1),
    nn.BatchNorm2d(num_features=384),
    nn.ReLU(),
    nn.Dropout(p=0.25),
    nn.Conv2d(in_channels=384, out_channels=384, kernel_size=3, padding=1),
    nn.BatchNorm2d(num_features=384),
    nn.ReLU(),
    nn.Dropout(p=0.25),
    nn.Conv2d(in_channels=384, out_channels=256, kernel_size=3, padding=1),
    nn.BatchNorm2d(num_features=256),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=3, stride=2),
    nn.Dropout(p=0.25),

    # Classifier head (6400 = 256 channels * 5 * 5 for 224x224 inputs)
    nn.Flatten(),
    nn.Linear(in_features=6400, out_features=4096),
    nn.ReLU(),
    nn.Linear(in_features=4096, out_features=4096),
    nn.ReLU(),
    nn.Dropout(p=0.5),
    # No Softmax at the end: the network outputs raw logits.
    nn.Linear(in_features=4096, out_features=102),
)

In [70]:
# Train the most recently defined `model`, track the run in wandb and report
# the final test metrics.

# Hyperparameters
epochs = 100

learning_rate = 0.0001
momentum = 0.9  # only used by the commented-out SGD alternative below
weight_decay = 0.0005
betas = (0.9, 0.999)

model.to(_device)
# (Re)initialise the weights before the optimizer is created so that
# re-running this cell always starts from a fresh model.
init_weights(model)

optimizer = torch.optim.Adam(
    model.parameters(),
    lr=learning_rate,
    betas=betas,
    weight_decay=weight_decay,
)
# optimizer = torch.optim.SGD(
#     model.parameters(),
#     lr=learning_rate,
#     momentum=momentum,
#     weight_decay=weight_decay,
# )

loss_fn = nn.CrossEntropyLoss()

run = wandb.init(
    entity = "fejowo5522-",
    project= "NN_list3_OxFlow",
    name = "Task1_" + str(int(time.time())),
    config = {
        "task": 1,
        "batch_size": batch_size,
        "epochs": epochs,
        # Taken from the optimizer object so the logged name cannot drift
        # from the optimizer that is actually used.
        "optimizer": type(optimizer).__name__,
        "learning_rate": learning_rate,
        # "momentum": momentum,
        "betas": betas,
        "weight_decay": weight_decay,
        "loss_fn": "cross_entropy",
        "model": model_type,
    }
)

try:
    for epoch in tqdm(range(epochs), desc="Training", leave=False):
        # Dropout / BatchNorm must be in training mode while optimising ...
        model.train()
        train_step(model, optimizer, loss_fn)

        # ... and in evaluation mode while measuring, otherwise the metrics
        # are noisy and BatchNorm statistics absorb validation data.
        model.eval()
        epoch_metrics = {"epoch": epoch}
        for split in ("train", "valid"):
            loss, accuracy = evaluate(model, data_loaders[split])
            epoch_metrics[f"{split}_loss"] = loss
            epoch_metrics[f"{split}_accuracy"] = accuracy
        # One log call per epoch keeps train and valid metrics on the same step.
        wandb.log(epoch_metrics)

    model.eval()
    loss, accuracy = evaluate(model, data_loaders['test'])
    wandb.log({
        "test_loss": loss,
        "test_accuracy": accuracy,
    })
    print(
        "Test set: Average loss: {:8.6f}, Accuracy: ({:4.1f}%)".format(
            loss,
            100.0 * accuracy,
        )
    )
finally:
    # Close the wandb run even if training fails (e.g. CUDA out of memory).
    run.finish()

                                                          

RuntimeError: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
