## Install Libraries


In [1]:
%pip install "ray[tune]"
# %pip install torch torchvision torchaudio # Uncomment this to install PyTorch 2.0+ on ilab (required version to run this program)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting ray[tune]
  Downloading ray-2.3.1-cp39-cp39-manylinux2014_x86_64.whl (58.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.6/58.6 MB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting virtualenv>=20.0.24
  Downloading virtualenv-20.21.0-py3-none-any.whl (8.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.7/8.7 MB[0m [31m57.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting frozenlist
  Downloading frozenlist-1.3.3-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (158 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m158.8/158.8 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
Collecting aiosignal
  Downloading aiosignal-1.3.1-py3-none-any.whl (7.6 kB)
Collecting tensorboardX>=1.9
  Downloading tensorboardX-2.6-py2.py3-none-any.whl (114 kB)
[2K     [90m━━━━━━━━━

## Import Libraries


In [3]:
from autoaugment import CIFAR10Policy, ImageNetPolicy
from filelock import FileLock
from ray import tune
from ray.air import session
from ray.air.checkpoint import Checkpoint
from torch.utils.data import random_split, ConcatDataset
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
import numpy as np

## Data loaders


In [4]:
# new for project
class Cutout:
    def __init__(self, n_holes, length):
        self.n_holes = n_holes
        self.length = length

    def __call__(self, img):
        h = img.shape[0]
        w = img.shape[1]
        d = img.shape[2]
        mask = np.ones((h, w, d), np.float32)

        for n in range(self.n_holes):
            y = np.random.randint(h)
            x = np.random.randint(w)

            y1 = np.clip(y - self.length // 2, 0, h)
            y2 = np.clip(y + self.length // 2, 0, h)
            x1 = np.clip(x - self.length // 2, 0, w)
            x2 = np.clip(x + self.length // 2, 0, w)

            mask[y1:y2, x1:x2, 0:d] = 0.0

        mask = torch.from_numpy(mask)
        mask = mask.expand_as(img)
        img *= mask

        return img


# from deepaugment.deepaugment import DeepAugment


# Define mixup function
def mixup_data(x, y, alpha=1.0):
    lam = np.random.beta(alpha, alpha)
    batch_size = x.size()[0]
    index = torch.randperm(batch_size)
    mixed_x = lam * x + (1 - lam) * x[index, :]
    y_a, y_b = y, y[index]
    return mixed_x, y_a, y_b, lam

In [5]:
def load_data(config, data_dir="./data"):
    transform_aug = transforms.Compose(
        [
            # ImageNetPolicy(),
            CIFAR10Policy(),
            transforms.ToTensor(),
            transforms.Normalize(
                (0.5, 0.5, 0.5), (0.5, 0.5, 0.5)
            ),  # to be consistent with others
            # Cutout(n_holes=1, length=16),
        ]
    )

    transform = transforms.Compose(
        [transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]
    )

    with FileLock(os.path.expanduser("~/.data.lock")):
        trainset_orig = torchvision.datasets.CIFAR10(
            root=data_dir, train=True, download=True, transform=transform
        )

        trainset_aug = torchvision.datasets.CIFAR10(
            root=data_dir, train=True, download=True, transform=transform_aug
        )

        testset = torchvision.datasets.CIFAR10(
            root=data_dir, train=False, download=True, transform=transform
        )

    return trainset_orig, trainset_aug, testset

## Configurable neural network


In [6]:
def flatten(x, start_dim=1, end_dim=-1):
    return x.flatten(start_dim=start_dim, end_dim=end_dim)

In [7]:
# Will need to modify the parameters and the network depending on what you are experimenting with

C, H, W = 3, 32, 32
num_classes = 10

channel_1 = 32
channel_2 = 64
channel_3 = 128
pool_kernel_size = 2

kernel_size_1 = 3
kernel_size_2 = 5
kernel_size_3 = 7

pad_size_1 = 2
pad_size_2 = 3
pad_size_3 = 3

fc_count_1 = 1024


class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(3, channel_1, kernel_size_1, padding=pad_size_1)
        self.norm1 = nn.BatchNorm2d(channel_1)
        self.pool1 = nn.MaxPool2d(pool_kernel_size)
        self.conv2 = nn.Conv2d(channel_1, channel_2, kernel_size_2, padding=pad_size_2)
        self.norm2 = nn.BatchNorm2d(channel_2)
        self.pool2 = nn.MaxPool2d(pool_kernel_size)
        self.conv3 = nn.Conv2d(channel_2, channel_3, kernel_size_3, padding=pad_size_3)
        self.norm3 = nn.BatchNorm2d(channel_3)
        self.pool3 = nn.MaxPool2d(pool_kernel_size)

        H_after_conv1 = int(H + 2 * pad_size_1 - 1 * (kernel_size_1 - 1) - 1 + 1)
        W_after_conv1 = int(W + 2 * pad_size_1 - 1 * (kernel_size_1 - 1) - 1 + 1)
        H_after_pool1 = int(
            (H_after_conv1 + 2 * 0 - 1 * (pool_kernel_size - 1) - 1) / pool_kernel_size
            + 1
        )
        W_after_pool1 = int(
            (W_after_conv1 + 2 * 0 - 1 * (pool_kernel_size - 1) - 1) / pool_kernel_size
            + 1
        )
        H_after_conv2 = int(
            H_after_pool1 + 2 * pad_size_2 - 1 * (kernel_size_2 - 1) - 1 + 1
        )
        W_after_conv2 = int(
            W_after_pool1 + 2 * pad_size_2 - 1 * (kernel_size_2 - 1) - 1 + 1
        )
        H_after_pool2 = int(
            (H_after_conv2 + 2 * 0 - 1 * (pool_kernel_size - 1) - 1) / pool_kernel_size
            + 1
        )
        W_after_pool2 = int(
            (W_after_conv2 + 2 * 0 - 1 * (pool_kernel_size - 1) - 1) / pool_kernel_size
            + 1
        )
        H_after_conv3 = int(
            H_after_pool2 + 2 * pad_size_3 - 1 * (kernel_size_3 - 1) - 1 + 1
        )
        W_after_conv3 = int(
            W_after_pool2 + 2 * pad_size_3 - 1 * (kernel_size_3 - 1) - 1 + 1
        )
        H_after_pool3 = int(
            (H_after_conv3 + 2 * 0 - 1 * (pool_kernel_size - 1) - 1) / pool_kernel_size
            + 1
        )
        W_after_pool3 = int(
            (W_after_conv3 + 2 * 0 - 1 * (pool_kernel_size - 1) - 1) / pool_kernel_size
            + 1
        )

        self.fc1 = nn.Linear(H_after_pool3 * W_after_pool3 * channel_3, fc_count_1)
        self.fc2 = nn.Linear(fc_count_1, fc_count_1)
        self.fc3 = nn.Linear(fc_count_1, 10)

    def forward(self, x):
        x = self.conv1(x)
        x = self.norm1(x)
        x = F.relu(x)
        x = self.pool1(x)
        x = self.conv2(x)
        x = self.norm2(x)
        x = F.relu(x)
        x = self.pool2(x)
        x = self.conv3(x)
        x = self.norm3(x)
        x = F.relu(x)
        x = self.pool3(x)
        x = flatten(x)
        x = self.fc1(x)
        # x = F.relu(x)
        x = self.fc2(x)
        # x = F.relu(x)
        x = self.fc3(x)

        return x

## The train function


In [23]:
def train_cifar(config):
    net = (
        Net()
    )  # Will need to modify the parameters depending on what you are experimenting with

    learning_rate = 0.015221
    momentum = 0.900000
    weight_decay = 0.001

    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda:0"
        if torch.cuda.device_count() > 1:
            net = nn.DataParallel(net)
    net.to(device)

    criterion = nn.CrossEntropyLoss()

    optimizer = optim.SGD(
        net.parameters(),
        lr=learning_rate,
        weight_decay=weight_decay,
        momentum=momentum,
        nesterov=True,
    )

    # Do not modify any code below!
    loaded_checkpoint = session.get_checkpoint()
    if loaded_checkpoint:
        with loaded_checkpoint.as_directory() as loaded_checkpoint_dir:
            model_state, optimizer_state = torch.load(
                os.path.join(loaded_checkpoint_dir, "checkpoint.pt")
            )
            net.load_state_dict(model_state)
            optimizer.load_state_dict(optimizer_state)

    data_dir = os.path.abspath("./data")
    trainset_orig, trainset_aug, testset = load_data(config, data_dir)

    test_abs = int(len(trainset_orig) * 0.8)
    train_subset_orig, val_subset = random_split(
        trainset_orig, [test_abs, len(trainset_orig) - test_abs]
    )

    # Get the indices used for the split
    # Use the same indices to reproduce the split
    train_subset_aug = torch.utils.data.Subset(trainset_aug, train_subset_orig.indices)

    train_subset = ConcatDataset([train_subset_orig, train_subset_aug])

    trainloader = torch.utils.data.DataLoader(
        train_subset, batch_size=64, shuffle=True, num_workers=1
    )
    valloader = torch.utils.data.DataLoader(
        val_subset, batch_size=64, shuffle=True, num_workers=1
    )

    for epoch in range(10):
        running_loss = 0.0
        epoch_steps = 0
        for i, data in enumerate(trainloader, 0):
            inputs, labels = data
            inputs, labels = inputs.to(device), labels.to(device)

            # mixup
            if config["mixup"] == True:
                x, y_a, y_b, lam = mixup_data(inputs, labels)
                x, y_a, y_b = map(torch.autograd.Variable, (x, y_a, y_b))

                scores = net(x)

                loss = lam * F.cross_entropy(scores, y_a) + (1 - lam) * F.cross_entropy(
                    scores, y_b
                )
            else:
                outputs = net(inputs)
                loss = criterion(outputs, labels)

            optimizer.zero_grad()

            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            epoch_steps += 1
            if i % 2000 == 1999:
                print(
                    f"[{epoch + 1}, {i + 1:>5} loss: {running_loss / epoch_steps:.3f}]"
                )
                running_loss = 0.0

        val_loss = 0.0
        val_steps = 0
        total = 0
        correct = 0
        for i, data in enumerate(valloader, 0):
            with torch.no_grad():
                inputs, labels = data
                inputs, labels = inputs.to(device), labels.to(device)

                outputs = net(inputs)
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

                loss = criterion(outputs, labels)
                val_loss += loss.cpu().numpy()
                val_steps += 1

        os.makedirs("my_model", exist_ok=True)
        torch.save((net.state_dict(), optimizer.state_dict()), "my_model/checkpoint.pt")
        checkpoint = Checkpoint.from_directory("my_model")
        session.report(
            {"loss": (val_loss / val_steps), "accuracy": correct / total},
            checkpoint=checkpoint,
        )
    print("Finished Training")

## Test set accuracy


In [14]:
def test_best_model(best_result):
    best_trained_model = (
        Net()
    )  # Will need to modify the parameters depending on what you are experimenting with

    # Do not modify any code below!
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    best_trained_model.to(device)

    checkpoint_path = os.path.join(
        best_result.checkpoint.to_directory(), "checkpoint.pt"
    )

    model_state, optimizer_state = torch.load(checkpoint_path)
    best_trained_model.load_state_dict(model_state)

    trainset_orig, trainset_aug, testset = load_data(config)

    testloader = torch.utils.data.DataLoader(
        testset, batch_size=4, shuffle=False, num_workers=2
    )

    correct = 0
    total = 0
    with torch.no_grad():
        for data in testloader:
            images, labels = data
            images, labels = images.to(device), labels.to(device)
            outputs = best_trained_model(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    print(f"Best trial test set accuracy: {correct / total}")

## Configuring the search space


In [12]:
# You will be experimenting with the hyperparameters here
# Use tune.grid_search to specify what values you want to experiment with a hyperparameter
config = {
    "add_aug": tune.grid_search([True]),
    "increase_data": tune.grid_search([True]),
    "mixup": tune.grid_search([False]),
}

In [24]:
# Do not modify any code below!
def main(config):
    tuner = tune.Tuner(
        tune.with_resources(
            tune.with_parameters(train_cifar),
            resources={"gpu": 1},
        ),
        param_space=config,
    )
    results = tuner.fit()

    best_result = results.get_best_result("accuracy", "max")

    print(f"Best trial config: {best_result.config}")
    print(f"Best trial final validation loss: {best_result.metrics['loss']}")
    print(f"Best trial final validation accuracy: {best_result.metrics['accuracy']}")

    test_best_model(best_result)

    return results.get_dataframe("accuracy", "max")


results_df = main(config)

0,1
Current time:,2023-04-14 00:44:07
Running for:,00:08:17.07
Memory:,7.0/12.7 GiB

Trial name,status,loc,add_aug,increase_data,mixup,iter,total time (s),loss,accuracy
train_cifar_4e8ff_00000,TERMINATED,172.28.0.12:13366,True,True,False,10,493.693,0.697313,0.7823


[2m[36m(train_cifar pid=13366)[0m Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to /root/ray_results/train_cifar_2023-04-14_00-35-50/train_cifar_4e8ff_00000_0_add_aug=True,increase_data=True,mixup=False_2023-04-14_00-35-50/data/cifar-10-python.tar.gz


[2m[36m(train_cifar pid=13366)[0m   0%|          | 0/170498071 [00:00<?, ?it/s]
  0%|          | 65536/170498071 [00:00<04:42, 602789.21it/s]
  0%|          | 229376/170498071 [00:00<02:30, 1128783.81it/s]
  1%|          | 917504/170498071 [00:00<00:48, 3478900.99it/s]
  2%|▏         | 3702784/170498071 [00:00<00:13, 12087514.04it/s]
  6%|▌         | 9863168/170498071 [00:00<00:05, 27898630.82it/s]
  9%|▉         | 15990784/170498071 [00:00<00:04, 37366439.31it/s]
 13%|█▎        | 22118400/170498071 [00:00<00:03, 43402270.38it/s]
 16%|█▌        | 27688960/170498071 [00:00<00:03, 47028950.82it/s]
 19%|█▉        | 32440320/170498071 [00:00<00:02, 46587525.20it/s]
 23%|██▎       | 38502400/170498071 [00:01<00:02, 50740807.52it/s]
 26%|██▌       | 44269568/170498071 [00:01<00:02, 52607383.97it/s]
 29%|██▉       | 49577984/170498071 [00:01<00:02, 51591534.71it/s]
 32%|███▏      | 54853632/170498071 [00:01<00:02, 51667811.44it/s]
 36%|███▌      | 60588032/170498071 [00:01<00:02, 53317425

[2m[36m(train_cifar pid=13366)[0m Extracting /root/ray_results/train_cifar_2023-04-14_00-35-50/train_cifar_4e8ff_00000_0_add_aug=True,increase_data=True,mixup=False_2023-04-14_00-35-50/data/cifar-10-python.tar.gz to /root/ray_results/train_cifar_2023-04-14_00-35-50/train_cifar_4e8ff_00000_0_add_aug=True,increase_data=True,mixup=False_2023-04-14_00-35-50/data
[2m[36m(train_cifar pid=13366)[0m Files already downloaded and verified
[2m[36m(train_cifar pid=13366)[0m Files already downloaded and verified


Trial name,accuracy,date,done,episodes_total,experiment_id,experiment_tag,hostname,iterations_since_restore,loss,node_ip,pid,should_checkpoint,time_since_restore,time_this_iter_s,time_total_s,timestamp,timesteps_since_restore,timesteps_total,training_iteration,trial_id,warmup_time
train_cifar_4e8ff_00000,0.7823,2023-04-14_00-44-07,True,,42e4609b300c4417835391fc24b2f6d7,"0_add_aug=True,increase_data=True,mixup=False",bb9231d73d2c,10,0.697313,172.28.0.12,13366,True,493.693,45.3231,493.693,1681433047,0,,10,4e8ff_00000,0.00386429


2023-04-14 00:44:07,913	INFO tune.py:798 -- Total run time: 497.10 seconds (497.05 seconds for the tuning loop).


Best trial config: {'add_aug': True, 'increase_data': True, 'mixup': False}
Best trial final validation loss: 0.6973125864366058
Best trial final validation accuracy: 0.7823
[2m[36m(train_cifar pid=13366)[0m Finished Training
Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified
Best trial test set accuracy: 0.7434


In [25]:
final_df = results_df.drop(
    columns=[
        "should_checkpoint",
        "timesteps_total",
        "episodes_total",
        "training_iteration",
        "trial_id",
        "experiment_id",
        "date",
        "timestamp",
        "pid",
        "hostname",
        "node_ip",
        "time_since_restore",
        "timesteps_since_restore",
        "iterations_since_restore",
        "warmup_time",
        "logdir",
    ]
)
final_df.sort_values(by=["accuracy"], ascending=False).head(10)

Unnamed: 0,loss,accuracy,time_this_iter_s,done,time_total_s,config/add_aug,config/increase_data,config/mixup
0,0.694631,0.7823,48.219459,False,400.545705,True,True,False


In [26]:
final_df.sort_values(by=["accuracy"]).head(10)

Unnamed: 0,loss,accuracy,time_this_iter_s,done,time_total_s,config/add_aug,config/increase_data,config/mixup
0,0.694631,0.7823,48.219459,False,400.545705,True,True,False
