## Install Libraries


In [1]:
#!pip install git+https://github.com/yfw/starter-code
%pip install ray[tune] bayesian-optimization==1.2.0 scikit-optimize
# %pip install torch torchvision torchaudio # Uncomment this to install PyTorch 2.0+ on ilab (required version to run this program)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting ray[tune]
  Downloading ray-2.3.1-cp39-cp39-manylinux2014_x86_64.whl (58.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.6/58.6 MB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting bayesian-optimization==1.2.0
  Downloading bayesian-optimization-1.2.0.tar.gz (14 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting aiosignal
  Downloading aiosignal-1.3.1-py3-none-any.whl (7.6 kB)
Collecting virtualenv>=20.0.24
  Downloading virtualenv-20.21.0-py3-none-any.whl (8.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.7/8.7 MB[0m [31m63.3 MB/s[0m eta [36m0:00:00[0m
Collecting frozenlist
  Downloading frozenlist-1.3.3-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (158 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m158.8/158.8 kB[0m [31m3.6 

## Import Libraries


In [2]:
from filelock import FileLock
from ray import tune
from ray.air import session
from ray.air.checkpoint import Checkpoint
from torch.utils.data import random_split


import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from collections import OrderedDict


from ray.tune.search.bayesopt import BayesOptSearch
from ray.tune.search.skopt import SkOptSearch

from ray.tune.search import ConcurrencyLimiter

## Data loaders


In [3]:
def load_data(data_dir="./data"):
    transform = transforms.Compose(
        [transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]
    )

    with FileLock(os.path.expanduser("~/.data.lock")):
        trainset = torchvision.datasets.CIFAR10(
            root=data_dir, train=True, download=True, transform=transform
        )

        testset = torchvision.datasets.CIFAR10(
            root=data_dir, train=False, download=True, transform=transform
        )

    return trainset, testset


## Configurable neural network


In [4]:
def flatten(x, start_dim=1, end_dim=-1):
    return x.flatten(start_dim=start_dim, end_dim=end_dim)


In [5]:
# Will need to modify the parameters and the network depending on what you are experimenting with

C, H, W = 3, 32, 32
num_classes = 10

channel_1 = 32
channel_2 = 64
channel_3 = 128
pool_kernel_size = 2

kernel_size_1 = 3
kernel_size_2 = 5
kernel_size_3 = 7

pad_size_1 = 2
pad_size_2 = 3
pad_size_3 = 3

fc_count_1 = 1024

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(3, channel_1, kernel_size_1, padding=pad_size_1)
        self.norm1 = nn.BatchNorm2d(channel_1)
        self.pool1 = nn.MaxPool2d(pool_kernel_size)
        self.conv2 = nn.Conv2d(channel_1, channel_2, kernel_size_2, padding=pad_size_2)
        self.norm2 = nn.BatchNorm2d(channel_2)
        self.pool2 = nn.MaxPool2d(pool_kernel_size)
        self.conv3 = nn.Conv2d(channel_2, channel_3, kernel_size_3, padding=pad_size_3)
        self.norm3 = nn.BatchNorm2d(channel_3)
        self.pool3 = nn.MaxPool2d(pool_kernel_size)        

        H_after_conv1 = int(H + 2*pad_size_1 - 1*(kernel_size_1-1) - 1 + 1)
        W_after_conv1 = int(W + 2*pad_size_1 - 1*(kernel_size_1-1) - 1 + 1)
        H_after_pool1 = int((H_after_conv1 + 2*0 - 1*(pool_kernel_size-1) - 1) / pool_kernel_size + 1)
        W_after_pool1 = int((W_after_conv1 + 2*0 - 1*(pool_kernel_size-1) - 1) / pool_kernel_size + 1)
        H_after_conv2 = int(H_after_pool1 + 2*pad_size_2 - 1*(kernel_size_2-1) - 1 + 1)
        W_after_conv2 = int(W_after_pool1 + 2*pad_size_2 - 1*(kernel_size_2-1) - 1 + 1)
        H_after_pool2 = int((H_after_conv2 + 2*0 - 1*(pool_kernel_size-1) - 1) / pool_kernel_size + 1)
        W_after_pool2 = int((W_after_conv2 + 2*0 - 1*(pool_kernel_size-1) - 1) / pool_kernel_size + 1)  
        H_after_conv3 = int(H_after_pool2 + 2*pad_size_3 - 1*(kernel_size_3-1) - 1 + 1)
        W_after_conv3 = int(W_after_pool2 + 2*pad_size_3 - 1*(kernel_size_3-1) - 1 + 1)
        H_after_pool3 = int((H_after_conv3 + 2*0 - 1*(pool_kernel_size-1) - 1) / pool_kernel_size + 1)
        W_after_pool3 = int((W_after_conv3 + 2*0 - 1*(pool_kernel_size-1) - 1) / pool_kernel_size + 1)

        self.fc1 = nn.Linear(H_after_pool3*W_after_pool3*channel_3, fc_count_1)
        self.fc2 = nn.Linear(fc_count_1, fc_count_1)
        self.fc3 = nn.Linear(fc_count_1, 10)


    def forward(self, x):
        x = self.conv1(x)
        x = self.norm1(x)
        x = F.relu(x)
        x = self.pool1(x)
        x = self.conv2(x)
        x = self.norm2(x)
        x = F.relu(x)
        x = self.pool2(x)
        x = self.conv3(x)
        x = self.norm3(x)
        x = F.relu(x)
        x = self.pool3(x)
        x = flatten(x)
        x = self.fc1(x)
        #x = F.relu(x)
        x = self.fc2(x)
        #x = F.relu(x)
        x = self.fc3(x)

        return x



## The train function


In [10]:
def train_cifar(config):
    net = Net()  # Will need to modify the parameters depending on what you are experimenting with

    learning_rate = config['learning_rate']
    momentum = config['momentum']
    weight_decay = 0.001

    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda:0"
        if torch.cuda.device_count() > 1:
            net = nn.DataParallel(net)
    net.to(device)

    

    criterion = nn.CrossEntropyLoss()

    optimizer = optim.SGD(net.parameters(), lr=learning_rate, 
                weight_decay=weight_decay,
                momentum=momentum, nesterov=True)    




    # Do not modify any code below!
    loaded_checkpoint = session.get_checkpoint()
    if loaded_checkpoint:
        with loaded_checkpoint.as_directory() as loaded_checkpoint_dir:
            model_state, optimizer_state = torch.load(
                os.path.join(loaded_checkpoint_dir, "checkpoint.pt")
            )
            net.load_state_dict(model_state)
            optimizer.load_state_dict(optimizer_state)

    data_dir = os.path.abspath("./data")
    trainset, testset = load_data(data_dir)

    test_abs = int(len(trainset) * 0.8)
    train_subset, val_subset = random_split(
        trainset, [test_abs, len(trainset) - test_abs]
    )

    trainloader = torch.utils.data.DataLoader(
        train_subset, batch_size=64, shuffle=True, num_workers=1
    )
    valloader = torch.utils.data.DataLoader(
        val_subset, batch_size=64, shuffle=True, num_workers=1
    )

    for epoch in range(10):
        running_loss = 0.0
        epoch_steps = 0
        for i, data in enumerate(trainloader, 0):
            inputs, labels = data
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()

            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            epoch_steps += 1
            if i % 2000 == 1999:
                print(
                    f"[{epoch + 1}, {i + 1:>5} loss: {running_loss / epoch_steps:.3f}]"
                )
                running_loss = 0.0

        val_loss = 0.0
        val_steps = 0
        total = 0
        correct = 0
        for i, data in enumerate(valloader, 0):
            with torch.no_grad():
                inputs, labels = data
                inputs, labels = inputs.to(device), labels.to(device)

                outputs = net(inputs)
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

                loss = criterion(outputs, labels)
                val_loss += loss.cpu().numpy()
                val_steps += 1

        os.makedirs("my_model", exist_ok=True)
        torch.save((net.state_dict(), optimizer.state_dict()), "my_model/checkpoint.pt")
        checkpoint = Checkpoint.from_directory("my_model")
        session.report(
            {"loss": (val_loss / val_steps), "accuracy": correct / total},
            checkpoint=checkpoint,
        )
    print("Finished Training")


## Test set accuracy


In [7]:
def test_best_model(best_result):
    best_trained_model = Net(
     )  # Will need to modify the parameters depending on what you are experimenting with

    # Do not modify any code below!
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    best_trained_model.to(device)

    checkpoint_path = os.path.join(
        best_result.checkpoint.to_directory(), "checkpoint.pt"
    )

    model_state, optimizer_state = torch.load(checkpoint_path)
    best_trained_model.load_state_dict(model_state)

    trainset, testset = load_data()

    testloader = torch.utils.data.DataLoader(
        testset, batch_size=4, shuffle=False, num_workers=2
    )

    correct = 0
    total = 0
    with torch.no_grad():
        for data in testloader:
            images, labels = data
            images, labels = images.to(device), labels.to(device)
            outputs = best_trained_model(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    print(f"Best trial test set accuracy: {correct / total}")


## Configuring the search space


In [8]:
# You will be experimenting with the hyperparameters here
# Use tune.grid_search to specify what values you want to experiment with a hyperparameter
config = { 
          "learning_rate": tune.uniform(1e-3, 1e-1),
          "momentum": tune.uniform(0.001, 0.9),
          }



In [9]:
# Do not modify any code below!
def main(config):

  algo = BayesOptSearch(utility_kwargs={"kind": "ucb", "kappa": 2.5, "xi": 0.0})

  # current_best_params = [
  #   {
  #       "learning_rate": 0.053306,
  #       "momentum": 0.570568,
  #   },
  #   {
  #       "learning_rate": 0.028210,
  #       "momentum": 0.338465,
  #   },
  #   {
  #       "learning_rate": 0.013342,
  #       "momentum": 0.605203,
  #   }
  # ]

  # algo = SkOptSearch(
  #   metric="accuracy",
  #   mode="max",
  #   points_to_evaluate=current_best_params)

  algo = ConcurrencyLimiter(algo, max_concurrent=1)

  tuner = tune.Tuner(
    tune.with_resources(
      tune.with_parameters(train_cifar),
      resources={"gpu": 1},
    ),
    tune_config=tune.TuneConfig(
      metric="accuracy",
      mode="max",
      num_samples=10,
      search_alg=algo,
    ),
    param_space=config,
    )
  
  results = tuner.fit()

  best_result = results.get_best_result("accuracy", "max")

  print(f"Best trial config: {best_result.config}")
  print(f"Best trial final validation loss: {best_result.metrics['loss']}")
  print(f"Best trial final validation accuracy: {best_result.metrics['accuracy']}")

  test_best_model(best_result)

  return results.get_dataframe("accuracy", "max")


results_df = main(config)


2023-04-14 01:16:55,699	INFO worker.py:1553 -- Started a local Ray instance.


0,1
Current time:,2023-04-14 01:49:56
Running for:,00:32:58.70
Memory:,5.1/12.7 GiB

Trial name,status,loc,learning_rate,momentum,iter,total time (s),loss,accuracy
train_cifar_571f383b,TERMINATED,172.28.0.12:1261,0.0380795,0.860571,10,204.102,0.842126,0.7459
train_cifar_993d625b,TERMINATED,172.28.0.12:1261,0.0734674,0.578927,10,191.976,,0.0996
train_cifar_e23d027a,TERMINATED,172.28.0.12:1261,0.0164458,0.224796,10,205.079,0.770198,0.7628
train_cifar_3199b712,TERMINATED,172.28.0.12:1261,0.00675028,0.792941,10,196.923,0.928022,0.7525
train_cifar_a8183bd1,TERMINATED,172.28.0.12:1261,0.0605104,0.666458,10,196.977,,0.1058
train_cifar_8417558c,TERMINATED,172.28.0.12:1261,0.00303786,0.875928,10,194.805,0.988081,0.744
train_cifar_32ce26a2,TERMINATED,172.28.0.12:1261,0.0834118,0.269871,10,197.958,0.832356,0.7584
train_cifar_6b9336f6,TERMINATED,172.28.0.12:1261,0.0190007,0.246724,10,195.105,0.790836,0.7624
train_cifar_6d5fb91e,TERMINATED,172.28.0.12:1261,0.03112,0.519805,10,192.479,0.904954,0.7426
train_cifar_57d2028d,TERMINATED,172.28.0.12:1261,0.0437626,0.332983,10,199.586,0.795054,0.7694


[2m[36m(train_cifar pid=1261)[0m Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to /root/ray_results/train_cifar_2023-04-14_01-16-53/train_cifar_571f383b_1_learning_rate=0.0381,momentum=0.8606_2023-04-14_01-16-57/data/cifar-10-python.tar.gz


[2m[36m(train_cifar pid=1261)[0m   0%|          | 0/170498071 [00:00<?, ?it/s]
  0%|          | 458752/170498071 [00:00<00:39, 4322173.84it/s]
  4%|▍         | 7503872/170498071 [00:00<00:03, 42205837.32it/s]
 11%|█         | 19136512/170498071 [00:00<00:02, 75504864.09it/s]
 18%|█▊        | 30769152/170498071 [00:00<00:01, 91300749.11it/s]
 25%|██▍       | 42467328/170498071 [00:00<00:01, 100483482.03it/s]
 32%|███▏      | 54099968/170498071 [00:00<00:01, 105777069.08it/s]
 39%|███▊      | 65732608/170498071 [00:00<00:00, 109183420.81it/s]
 45%|████▍     | 76677120/170498071 [00:00<00:01, 87331374.28it/s] 
 53%|█████▎    | 89882624/170498071 [00:00<00:00, 99168145.18it/s]
 59%|█████▉    | 100466688/170498071 [00:01<00:00, 99145121.73it/s]
 65%|██████▌   | 110854144/170498071 [00:01<00:00, 87085516.15it/s]
 71%|███████   | 120553472/170498071 [00:01<00:00, 89611458.98it/s]
 77%|███████▋  | 130908160/170498071 [00:01<00:00, 87134689.80it/s]
 82%|████████▏ | 139952128/170498071 [00:0

[2m[36m(train_cifar pid=1261)[0m Extracting /root/ray_results/train_cifar_2023-04-14_01-16-53/train_cifar_571f383b_1_learning_rate=0.0381,momentum=0.8606_2023-04-14_01-16-57/data/cifar-10-python.tar.gz to /root/ray_results/train_cifar_2023-04-14_01-16-53/train_cifar_571f383b_1_learning_rate=0.0381,momentum=0.8606_2023-04-14_01-16-57/data
[2m[36m(train_cifar pid=1261)[0m Files already downloaded and verified




Trial name,accuracy,date,done,episodes_total,experiment_id,experiment_tag,hostname,iterations_since_restore,loss,node_ip,pid,should_checkpoint,time_since_restore,time_this_iter_s,time_total_s,timestamp,timesteps_since_restore,timesteps_total,training_iteration,trial_id,warmup_time
train_cifar_3199b712,0.7525,2023-04-14_01-30-19,True,,d548b5ead44b483ab53a1b40edcd1933,"4_learning_rate=0.0068,momentum=0.7929",19340ad46eca,10,0.928022,172.28.0.12,1261,True,196.923,17.8808,196.923,1681435819,0,,10,3199b712,0.00380731
train_cifar_32ce26a2,0.7584,2023-04-14_01-40-09,True,,d548b5ead44b483ab53a1b40edcd1933,"7_learning_rate=0.0834,momentum=0.2699",19340ad46eca,10,0.832356,172.28.0.12,1261,True,197.958,17.8717,197.958,1681436409,0,,10,32ce26a2,0.00380731
train_cifar_571f383b,0.7459,2023-04-14_01-20-25,True,,d548b5ead44b483ab53a1b40edcd1933,"1_learning_rate=0.0381,momentum=0.8606",19340ad46eca,10,0.842126,172.28.0.12,1261,True,204.102,18.1535,204.102,1681435225,0,,10,571f383b,0.00380731
train_cifar_57d2028d,0.7694,2023-04-14_01-49-56,True,,d548b5ead44b483ab53a1b40edcd1933,"10_learning_rate=0.0438,momentum=0.3330",19340ad46eca,10,0.795054,172.28.0.12,1261,True,199.586,21.1145,199.586,1681436996,0,,10,57d2028d,0.00380731
train_cifar_6b9336f6,0.7624,2023-04-14_01-43-24,True,,d548b5ead44b483ab53a1b40edcd1933,"8_learning_rate=0.0190,momentum=0.2467",19340ad46eca,10,0.790836,172.28.0.12,1261,True,195.105,18.2513,195.105,1681436604,0,,10,6b9336f6,0.00380731
train_cifar_6d5fb91e,0.7426,2023-04-14_01-46-36,True,,d548b5ead44b483ab53a1b40edcd1933,"9_learning_rate=0.0311,momentum=0.5198",19340ad46eca,10,0.904954,172.28.0.12,1261,True,192.479,21.5827,192.479,1681436796,0,,10,6d5fb91e,0.00380731
train_cifar_8417558c,0.744,2023-04-14_01-36-51,True,,d548b5ead44b483ab53a1b40edcd1933,"6_learning_rate=0.0030,momentum=0.8759",19340ad46eca,10,0.988081,172.28.0.12,1261,True,194.805,17.7239,194.805,1681436211,0,,10,8417558c,0.00380731
train_cifar_993d625b,0.0996,2023-04-14_01-23-37,True,,d548b5ead44b483ab53a1b40edcd1933,"2_learning_rate=0.0735,momentum=0.5789",19340ad46eca,10,,172.28.0.12,1261,True,191.976,17.8238,191.976,1681435417,0,,10,993d625b,0.00380731
train_cifar_a8183bd1,0.1058,2023-04-14_01-33-36,True,,d548b5ead44b483ab53a1b40edcd1933,"5_learning_rate=0.0605,momentum=0.6665",19340ad46eca,10,,172.28.0.12,1261,True,196.977,17.7874,196.977,1681436016,0,,10,a8183bd1,0.00380731
train_cifar_e23d027a,0.7628,2023-04-14_01-27-02,True,,d548b5ead44b483ab53a1b40edcd1933,"3_learning_rate=0.0164,momentum=0.2248",19340ad46eca,10,0.770198,172.28.0.12,1261,True,205.079,20.6025,205.079,1681435622,0,,10,e23d027a,0.00380731


[2m[36m(train_cifar pid=1261)[0m Finished Training
[2m[36m(train_cifar pid=1261)[0m Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to /root/ray_results/train_cifar_2023-04-14_01-16-53/train_cifar_993d625b_2_learning_rate=0.0735,momentum=0.5789_2023-04-14_01-20-25/data/cifar-10-python.tar.gz


[2m[36m(train_cifar pid=1261)[0m   0%|          | 0/170498071 [00:00<?, ?it/s]
  0%|          | 458752/170498071 [00:00<00:38, 4406799.67it/s]
  5%|▍         | 7831552/170498071 [00:00<00:03, 44470430.77it/s]
 11%|█         | 18415616/170498071 [00:00<00:02, 72204901.84it/s]
 17%|█▋        | 29786112/170498071 [00:00<00:01, 88394766.92it/s]
 24%|██▍       | 40992768/170498071 [00:00<00:01, 96864920.26it/s]
 31%|███       | 52428800/170498071 [00:00<00:01, 100996159.33it/s]
 37%|███▋      | 62554112/170498071 [00:00<00:01, 93060888.35it/s] 
 43%|████▎     | 73990144/170498071 [00:00<00:00, 99183000.40it/s]
 50%|█████     | 85360640/170498071 [00:00<00:00, 103476453.49it/s]
 56%|█████▋    | 96239616/170498071 [00:01<00:00, 104987100.25it/s]
 63%|██████▎   | 107249664/170498071 [00:01<00:00, 106342564.95it/s]
 69%|██████▉   | 117964800/170498071 [00:01<00:00, 102086789.31it/s]
 75%|███████▌  | 128253952/170498071 [00:01<00:00, 101667414.02it/s]
 82%|████████▏ | 139526144/170498071 [00

[2m[36m(train_cifar pid=1261)[0m Extracting /root/ray_results/train_cifar_2023-04-14_01-16-53/train_cifar_993d625b_2_learning_rate=0.0735,momentum=0.5789_2023-04-14_01-20-25/data/cifar-10-python.tar.gz to /root/ray_results/train_cifar_2023-04-14_01-16-53/train_cifar_993d625b_2_learning_rate=0.0735,momentum=0.5789_2023-04-14_01-20-25/data
[2m[36m(train_cifar pid=1261)[0m Files already downloaded and verified
[2m[36m(train_cifar pid=1261)[0m Finished Training
[2m[36m(train_cifar pid=1261)[0m Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to /root/ray_results/train_cifar_2023-04-14_01-16-53/train_cifar_e23d027a_3_learning_rate=0.0164,momentum=0.2248_2023-04-14_01-23-37/data/cifar-10-python.tar.gz


[2m[36m(train_cifar pid=1261)[0m   0%|          | 0/170498071 [00:00<?, ?it/s]
  0%|          | 360448/170498071 [00:00<00:47, 3552704.40it/s]
  2%|▏         | 3014656/170498071 [00:00<00:09, 16884647.22it/s]
  4%|▍         | 7045120/170498071 [00:00<00:05, 27325365.91it/s]
  6%|▌         | 10190848/170498071 [00:00<00:05, 28926432.50it/s]
  8%|▊         | 14090240/170498071 [00:00<00:04, 32530763.95it/s]
 10%|█         | 17367040/170498071 [00:00<00:04, 32191525.89it/s]
 12%|█▏        | 21135360/170498071 [00:00<00:04, 33957672.62it/s]
 14%|█▍        | 24707072/170498071 [00:00<00:04, 34441399.52it/s]
 17%|█▋        | 28180480/170498071 [00:00<00:04, 34165847.20it/s]
 19%|█▉        | 32112640/170498071 [00:01<00:03, 35642407.77it/s]
 21%|██        | 35684352/170498071 [00:01<00:03, 34064559.22it/s]
 23%|██▎       | 39518208/170498071 [00:01<00:03, 35191720.71it/s]
 25%|██▌       | 43057152/170498071 [00:01<00:03, 35031397.72it/s]
 27%|██▋       | 46596096/170498071 [00:01<00:03, 3

[2m[36m(train_cifar pid=1261)[0m Extracting /root/ray_results/train_cifar_2023-04-14_01-16-53/train_cifar_e23d027a_3_learning_rate=0.0164,momentum=0.2248_2023-04-14_01-23-37/data/cifar-10-python.tar.gz to /root/ray_results/train_cifar_2023-04-14_01-16-53/train_cifar_e23d027a_3_learning_rate=0.0164,momentum=0.2248_2023-04-14_01-23-37/data
[2m[36m(train_cifar pid=1261)[0m Files already downloaded and verified
[2m[36m(train_cifar pid=1261)[0m Finished Training
[2m[36m(train_cifar pid=1261)[0m Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to /root/ray_results/train_cifar_2023-04-14_01-16-53/train_cifar_3199b712_4_learning_rate=0.0068,momentum=0.7929_2023-04-14_01-27-02/data/cifar-10-python.tar.gz


  0%|          | 0/170498071 [00:00<?, ?it/s]
  0%|          | 458752/170498071 [00:00<00:40, 4193060.94it/s]
  3%|▎         | 4784128/170498071 [00:00<00:06, 26233658.99it/s]
  6%|▋         | 10715136/170498071 [00:00<00:03, 40848161.94it/s]
 10%|▉         | 16384000/170498071 [00:00<00:03, 46721609.11it/s]
 13%|█▎        | 22675456/170498071 [00:00<00:02, 52297724.68it/s]
 17%|█▋        | 28540928/170498071 [00:00<00:02, 54315421.74it/s]
 21%|██        | 35291136/170498071 [00:00<00:02, 58525397.60it/s]
 24%|██▍       | 41189376/170498071 [00:00<00:02, 57133224.80it/s]
 28%|██▊       | 47448064/170498071 [00:00<00:02, 58625239.44it/s]
 31%|███▏      | 53346304/170498071 [00:01<00:02, 58386008.94it/s]
 35%|███▍      | 59441152/170498071 [00:01<00:01, 58977996.00it/s]
 38%|███▊      | 65372160/170498071 [00:01<00:01, 58714514.53it/s]
 42%|████▏     | 71303168/170498071 [00:01<00:01, 58833905.20it/s]
 45%|████▌     | 77332480/170498071 [00:01<00:01, 59261424.33it/s]
 49%|████▉     | 836

[2m[36m(train_cifar pid=1261)[0m Extracting /root/ray_results/train_cifar_2023-04-14_01-16-53/train_cifar_3199b712_4_learning_rate=0.0068,momentum=0.7929_2023-04-14_01-27-02/data/cifar-10-python.tar.gz to /root/ray_results/train_cifar_2023-04-14_01-16-53/train_cifar_3199b712_4_learning_rate=0.0068,momentum=0.7929_2023-04-14_01-27-02/data
[2m[36m(train_cifar pid=1261)[0m Files already downloaded and verified
[2m[36m(train_cifar pid=1261)[0m Finished Training
[2m[36m(train_cifar pid=1261)[0m Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to /root/ray_results/train_cifar_2023-04-14_01-16-53/train_cifar_a8183bd1_5_learning_rate=0.0605,momentum=0.6665_2023-04-14_01-30-19/data/cifar-10-python.tar.gz


[2m[36m(train_cifar pid=1261)[0m   0%|          | 0/170498071 [00:00<?, ?it/s]
  0%|          | 458752/170498071 [00:00<00:39, 4311462.49it/s]
  3%|▎         | 5537792/170498071 [00:00<00:05, 30877865.56it/s]
  7%|▋         | 11763712/170498071 [00:00<00:03, 44917399.95it/s]
 10%|█         | 17629184/170498071 [00:00<00:03, 50251530.73it/s]
 14%|█▍        | 23527424/170498071 [00:00<00:02, 53298151.12it/s]
 17%|█▋        | 28901376/170498071 [00:00<00:02, 50423039.65it/s]
 20%|██        | 34471936/170498071 [00:00<00:02, 52048601.31it/s]
 24%|██▍       | 40501248/170498071 [00:00<00:02, 54460009.29it/s]
 27%|██▋       | 46235648/170498071 [00:00<00:02, 55329101.53it/s]
 31%|███       | 52559872/170498071 [00:01<00:02, 57613145.67it/s]
 34%|███▍      | 58621952/170498071 [00:01<00:01, 58496757.75it/s]
 38%|███▊      | 64552960/170498071 [00:01<00:01, 58733622.40it/s]
 42%|████▏     | 70778880/170498071 [00:01<00:01, 59679661.71it/s]
 45%|████▌     | 76775424/170498071 [00:01<00:01, 

[2m[36m(train_cifar pid=1261)[0m Extracting /root/ray_results/train_cifar_2023-04-14_01-16-53/train_cifar_a8183bd1_5_learning_rate=0.0605,momentum=0.6665_2023-04-14_01-30-19/data/cifar-10-python.tar.gz to /root/ray_results/train_cifar_2023-04-14_01-16-53/train_cifar_a8183bd1_5_learning_rate=0.0605,momentum=0.6665_2023-04-14_01-30-19/data
[2m[36m(train_cifar pid=1261)[0m Files already downloaded and verified
[2m[36m(train_cifar pid=1261)[0m Finished Training
[2m[36m(train_cifar pid=1261)[0m Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to /root/ray_results/train_cifar_2023-04-14_01-16-53/train_cifar_8417558c_6_learning_rate=0.0030,momentum=0.8759_2023-04-14_01-33-36/data/cifar-10-python.tar.gz


[2m[36m(train_cifar pid=1261)[0m   0%|          | 0/170498071 [00:00<?, ?it/s]
  0%|          | 393216/170498071 [00:00<00:43, 3886573.45it/s]
  3%|▎         | 4620288/170498071 [00:00<00:06, 26330182.50it/s]
  6%|▌         | 10584064/170498071 [00:00<00:03, 41489854.42it/s]
 10%|▉         | 16646144/170498071 [00:00<00:03, 48972908.98it/s]
 13%|█▎        | 22675456/170498071 [00:00<00:02, 52914375.49it/s]
 17%|█▋        | 28704768/170498071 [00:00<00:02, 55248380.62it/s]
 20%|██        | 34603008/170498071 [00:00<00:02, 56415576.89it/s]
 24%|██▍       | 40665088/170498071 [00:00<00:02, 57726213.90it/s]
 27%|██▋       | 46759936/170498071 [00:00<00:02, 58563049.65it/s]
 31%|███       | 52658176/170498071 [00:01<00:02, 58646501.07it/s]
 34%|███▍      | 58523648/170498071 [00:01<00:01, 58489695.61it/s]
 38%|███▊      | 64552960/170498071 [00:01<00:01, 59016101.82it/s]
 41%|████▏     | 70615040/170498071 [00:01<00:01, 59347972.13it/s]
 45%|████▍     | 76578816/170498071 [00:01<00:01, 

[2m[36m(train_cifar pid=1261)[0m Extracting /root/ray_results/train_cifar_2023-04-14_01-16-53/train_cifar_8417558c_6_learning_rate=0.0030,momentum=0.8759_2023-04-14_01-33-36/data/cifar-10-python.tar.gz to /root/ray_results/train_cifar_2023-04-14_01-16-53/train_cifar_8417558c_6_learning_rate=0.0030,momentum=0.8759_2023-04-14_01-33-36/data
[2m[36m(train_cifar pid=1261)[0m Files already downloaded and verified
[2m[36m(train_cifar pid=1261)[0m Finished Training
[2m[36m(train_cifar pid=1261)[0m Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to /root/ray_results/train_cifar_2023-04-14_01-16-53/train_cifar_32ce26a2_7_learning_rate=0.0834,momentum=0.2699_2023-04-14_01-36-51/data/cifar-10-python.tar.gz


[2m[36m(train_cifar pid=1261)[0m   0%|          | 0/170498071 [00:00<?, ?it/s]
  0%|          | 458752/170498071 [00:00<00:41, 4143257.48it/s]
  4%|▍         | 7176192/170498071 [00:00<00:04, 39610807.38it/s]
 10%|█         | 17793024/170498071 [00:00<00:02, 69180463.99it/s]
 16%|█▋        | 27787264/170498071 [00:00<00:01, 81063339.60it/s]
 22%|██▏       | 38043648/170498071 [00:00<00:01, 88731988.25it/s]
 28%|██▊       | 48398336/170498071 [00:00<00:01, 93713047.16it/s]
 34%|███▍      | 58458112/170498071 [00:00<00:01, 95936080.53it/s]
 40%|████      | 68616192/170498071 [00:00<00:01, 97683563.30it/s]
 46%|████▋     | 78872576/170498071 [00:00<00:00, 99137513.42it/s]
 52%|█████▏    | 88834048/170498071 [00:01<00:00, 98915235.75it/s]
 58%|█████▊    | 99287040/170498071 [00:01<00:00, 100576090.94it/s]
 64%|██████▍   | 109379584/170498071 [00:01<00:00, 99195906.63it/s]
 70%|███████   | 119898112/170498071 [00:01<00:00, 100880841.43it/s]
 76%|███████▋  | 130023424/170498071 [00:01<00

[2m[36m(train_cifar pid=1261)[0m Extracting /root/ray_results/train_cifar_2023-04-14_01-16-53/train_cifar_32ce26a2_7_learning_rate=0.0834,momentum=0.2699_2023-04-14_01-36-51/data/cifar-10-python.tar.gz to /root/ray_results/train_cifar_2023-04-14_01-16-53/train_cifar_32ce26a2_7_learning_rate=0.0834,momentum=0.2699_2023-04-14_01-36-51/data
[2m[36m(train_cifar pid=1261)[0m Files already downloaded and verified
[2m[36m(train_cifar pid=1261)[0m Finished Training
[2m[36m(train_cifar pid=1261)[0m Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to /root/ray_results/train_cifar_2023-04-14_01-16-53/train_cifar_6b9336f6_8_learning_rate=0.0190,momentum=0.2467_2023-04-14_01-40-09/data/cifar-10-python.tar.gz


[2m[36m(train_cifar pid=1261)[0m   0%|          | 0/170498071 [00:00<?, ?it/s]
  0%|          | 458752/170498071 [00:00<00:40, 4250761.28it/s]
  4%|▍         | 7602176/170498071 [00:00<00:03, 42494605.96it/s]
 11%|█         | 18776064/170498071 [00:00<00:02, 73538239.37it/s]
 17%|█▋        | 28868608/170498071 [00:00<00:01, 84189406.69it/s]
 24%|██▎       | 40140800/170498071 [00:00<00:01, 94333921.84it/s]
 30%|███       | 51216384/170498071 [00:00<00:01, 99830052.49it/s]
 37%|███▋      | 62652416/170498071 [00:00<00:01, 104485344.46it/s]
 43%|████▎     | 73728000/170498071 [00:00<00:00, 106462778.76it/s]
 50%|████▉     | 85065728/170498071 [00:00<00:00, 108556063.38it/s]
 56%|█████▋    | 95944704/170498071 [00:01<00:00, 104464751.15it/s]
 63%|██████▎   | 107216896/170498071 [00:01<00:00, 106861959.48it/s]
 69%|██████▉   | 118358016/170498071 [00:01<00:00, 108146464.58it/s]
 76%|███████▌  | 129204224/170498071 [00:01<00:00, 103159949.41it/s]
 82%|████████▏ | 140509184/170498071 [00

[2m[36m(train_cifar pid=1261)[0m Extracting /root/ray_results/train_cifar_2023-04-14_01-16-53/train_cifar_6b9336f6_8_learning_rate=0.0190,momentum=0.2467_2023-04-14_01-40-09/data/cifar-10-python.tar.gz to /root/ray_results/train_cifar_2023-04-14_01-16-53/train_cifar_6b9336f6_8_learning_rate=0.0190,momentum=0.2467_2023-04-14_01-40-09/data
[2m[36m(train_cifar pid=1261)[0m Files already downloaded and verified
[2m[36m(train_cifar pid=1261)[0m Finished Training
[2m[36m(train_cifar pid=1261)[0m Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to /root/ray_results/train_cifar_2023-04-14_01-16-53/train_cifar_6d5fb91e_9_learning_rate=0.0311,momentum=0.5198_2023-04-14_01-43-24/data/cifar-10-python.tar.gz


[2m[36m(train_cifar pid=1261)[0m   0%|          | 0/170498071 [00:00<?, ?it/s]
[2m[36m(train_cifar pid=1261)[0m   0%|          | 393216/170498071 [00:00<00:44, 3847719.95it/s]
  3%|▎         | 5701632/170498071 [00:00<00:05, 32484695.62it/s]
  7%|▋         | 12615680/170498071 [00:00<00:03, 48955551.54it/s]
 11%|█         | 18382848/170498071 [00:00<00:02, 52345134.14it/s]
 14%|█▍        | 23658496/170498071 [00:00<00:03, 45709732.24it/s]
 17%|█▋        | 28377088/170498071 [00:00<00:03, 45238729.60it/s]
 21%|██        | 35422208/170498071 [00:00<00:02, 52847024.95it/s]
 25%|██▌       | 42893312/170498071 [00:00<00:02, 59408811.92it/s]
 29%|██▉       | 50102272/170498071 [00:01<00:02, 53840819.12it/s]
 33%|███▎      | 56885248/170498071 [00:01<00:01, 57511675.19it/s]
 37%|███▋      | 62849024/170498071 [00:01<00:02, 52181084.69it/s]
 40%|████      | 68288512/170498071 [00:01<00:01, 52656948.92it/s]
 44%|████▍     | 75661312/170498071 [00:01<00:01, 58376464.05it/s]
 49%|████▉   

[2m[36m(train_cifar pid=1261)[0m Extracting /root/ray_results/train_cifar_2023-04-14_01-16-53/train_cifar_6d5fb91e_9_learning_rate=0.0311,momentum=0.5198_2023-04-14_01-43-24/data/cifar-10-python.tar.gz to /root/ray_results/train_cifar_2023-04-14_01-16-53/train_cifar_6d5fb91e_9_learning_rate=0.0311,momentum=0.5198_2023-04-14_01-43-24/data
[2m[36m(train_cifar pid=1261)[0m Files already downloaded and verified
[2m[36m(train_cifar pid=1261)[0m Finished Training
[2m[36m(train_cifar pid=1261)[0m Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to /root/ray_results/train_cifar_2023-04-14_01-16-53/train_cifar_57d2028d_10_learning_rate=0.0438,momentum=0.3330_2023-04-14_01-46-36/data/cifar-10-python.tar.gz


[2m[36m(train_cifar pid=1261)[0m   0%|          | 0/170498071 [00:00<?, ?it/s]
  0%|          | 458752/170498071 [00:00<00:40, 4186774.68it/s]
  3%|▎         | 5865472/170498071 [00:00<00:05, 32323396.73it/s]
  7%|▋         | 12648448/170498071 [00:00<00:03, 48088320.60it/s]
 12%|█▏        | 19857408/170498071 [00:00<00:02, 57416074.95it/s]
 16%|█▌        | 26869760/170498071 [00:00<00:02, 61913541.11it/s]
 24%|██▍       | 41189376/170498071 [00:00<00:01, 66976305.67it/s]
 29%|██▉       | 49545216/170498071 [00:00<00:01, 72233505.02it/s]
 33%|███▎      | 56819712/170498071 [00:00<00:01, 72317970.56it/s]
 38%|███▊      | 64978944/170498071 [00:01<00:01, 75101565.00it/s]
 43%|████▎     | 72515584/170498071 [00:01<00:01, 74520047.51it/s]
 47%|████▋     | 80707584/170498071 [00:01<00:01, 76601026.95it/s]
 52%|█████▏    | 88375296/170498071 [00:01<00:01, 72301928.73it/s]
 56%|█████▌    | 95715328/170498071 [00:01<00:01, 72531915.96it/s]
 61%|██████    | 103415808/170498071 [00:01<00:00,

[2m[36m(train_cifar pid=1261)[0m Extracting /root/ray_results/train_cifar_2023-04-14_01-16-53/train_cifar_57d2028d_10_learning_rate=0.0438,momentum=0.3330_2023-04-14_01-46-36/data/cifar-10-python.tar.gz to /root/ray_results/train_cifar_2023-04-14_01-16-53/train_cifar_57d2028d_10_learning_rate=0.0438,momentum=0.3330_2023-04-14_01-46-36/data
[2m[36m(train_cifar pid=1261)[0m Files already downloaded and verified


2023-04-14 01:49:56,469	INFO tune.py:798 -- Total run time: 1979.06 seconds (1978.69 seconds for the tuning loop).


[2m[36m(train_cifar pid=1261)[0m Finished Training
Best trial config: {'learning_rate': 0.04376255684556946, 'momentum': 0.3329833121584336}
Best trial final validation loss: 0.7950538981492352
Best trial final validation accuracy: 0.7694
Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


100%|██████████| 170498071/170498071 [00:01<00:00, 87240625.40it/s]


Extracting ./data/cifar-10-python.tar.gz to ./data
Files already downloaded and verified
Best trial test set accuracy: 0.7408


In [11]:
final_df = results_df.drop(
    columns=[
        "should_checkpoint",
        "timesteps_total",
        "episodes_total",
        "training_iteration",
        "trial_id",
        "experiment_id",
        "date",
        "timestamp",
        "pid",
        "hostname",
        "node_ip",
        "time_since_restore",
        "timesteps_since_restore",
        "iterations_since_restore",
        "warmup_time",
        "logdir",
    ]
)
final_df.sort_values(by=['accuracy'], ascending=False).head(10)

Unnamed: 0,loss,accuracy,time_this_iter_s,done,time_total_s,config/learning_rate,config/momentum
9,0.691863,0.7699,20.274218,False,123.951019,0.043763,0.332983
7,0.696732,0.7676,18.06458,False,137.223111,0.019001,0.246724
6,0.783483,0.7641,21.025885,False,180.08632,0.083412,0.269871
2,0.770198,0.7628,20.602467,False,205.079264,0.016446,0.224796
8,0.777695,0.7602,17.986834,False,152.805428,0.03112,0.519805
3,0.783662,0.7575,17.602912,False,140.393131,0.00675,0.792941
0,0.768786,0.7533,17.687043,False,166.263623,0.038079,0.860571
5,0.818956,0.7471,17.793905,False,138.655107,0.003038,0.875928
4,,0.1058,25.813243,False,25.813243,0.06051,0.666458
1,,0.0996,23.572686,False,23.572686,0.073467,0.578927


In [12]:
final_df.sort_values(by=['accuracy']).head(10)

Unnamed: 0,loss,accuracy,time_this_iter_s,done,time_total_s,config/learning_rate,config/momentum
1,,0.0996,23.572686,False,23.572686,0.073467,0.578927
4,,0.1058,25.813243,False,25.813243,0.06051,0.666458
5,0.818956,0.7471,17.793905,False,138.655107,0.003038,0.875928
0,0.768786,0.7533,17.687043,False,166.263623,0.038079,0.860571
3,0.783662,0.7575,17.602912,False,140.393131,0.00675,0.792941
8,0.777695,0.7602,17.986834,False,152.805428,0.03112,0.519805
2,0.770198,0.7628,20.602467,False,205.079264,0.016446,0.224796
6,0.783483,0.7641,21.025885,False,180.08632,0.083412,0.269871
7,0.696732,0.7676,18.06458,False,137.223111,0.019001,0.246724
9,0.691863,0.7699,20.274218,False,123.951019,0.043763,0.332983
