## Install Libraries


In [1]:
%pip install "ray[tune]"
# %pip install torch torchvision torchaudio # Uncomment this to install PyTorch 2.0+ on ilab (required version to run this program)

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


## Import Libraries


In [2]:
from collections import OrderedDict
from filelock import FileLock
from ray import tune
from ray.air import session
from ray.air.checkpoint import Checkpoint
from torch.utils.data import random_split
import os
import ray
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms


In [3]:
# Bring up the Ray runtime for this notebook, declaring 4 CPUs and 4 GPUs.
# ignore_reinit_error lets this cell be re-run in the same kernel session.
ray.init(
    num_cpus=4,
    num_gpus=4,
    ignore_reinit_error=True,
)
print("success")


2023-04-08 02:15:12,252	INFO worker.py:1553 -- Started a local Ray instance.


success


## Data loaders


In [4]:
def load_data(data_dir="./data"):
    """Return the CIFAR-10 training and test datasets as a ``(trainset, testset)`` pair.

    Args:
        data_dir: Root directory handed to ``torchvision.datasets.CIFAR10``;
            the dataset is downloaded there when requested by ``download=True``.

    Returns:
        Tuple ``(trainset, testset)``. Both datasets share one transform:
        ``ToTensor`` followed by per-channel normalisation with mean 0.5 and
        std 0.5.
    """
    half = (0.5, 0.5, 0.5)
    preprocessing = transforms.Compose(
        [
            transforms.ToTensor(),
            transforms.Normalize(half, half),
        ]
    )

    # A file lock in the home directory serialises dataset construction
    # (and any download) across callers that use the same lock file.
    lock_path = os.path.expanduser("~/.data.lock")
    with FileLock(lock_path):
        trainset, testset = [
            torchvision.datasets.CIFAR10(
                root=data_dir,
                train=is_train,
                download=True,
                transform=preprocessing,
            )
            for is_train in (True, False)
        ]

    return trainset, testset


## Configurable neural network


In [5]:
def flatten(x, start_dim=1, end_dim=-1):
    """Collapse dimensions ``start_dim`` through ``end_dim`` of ``x`` into one.

    With the defaults, everything after the first dimension is merged, so a
    tensor of shape ``(N, ...)`` becomes ``(N, prod(...))``.
    """
    flattened = torch.flatten(x, start_dim=start_dim, end_dim=end_dim)
    return flattened


In [6]:
# Will need to modify the parameters and the network depending on what you are experimenting with
class Net(nn.Module):
    """Minimal image classifier: one convolution, ReLU, then one linear layer.

    The convolution maps 3 channels to 3 channels with ``padding="same"``, so
    the spatial size is unchanged. The linear layer expects ``3 * 32 * 32``
    features per sample (i.e. 3x32x32 inputs) and produces 10 outputs.

    Args:
        kernel_size: Kernel size passed to the convolution layer.
    """

    def __init__(self, kernel_size):
        super().__init__()
        channels = 3
        self.conv = nn.Conv2d(channels, channels, kernel_size, padding="same")
        self.fc = nn.Linear(channels * 32 * 32, 10)

    def forward(self, x):
        """Return the 10 class scores for a batch ``x``."""
        activated = F.relu(self.conv(x))
        return self.fc(flatten(activated))


## The train function


In [7]:
def train_cifar(config):
    """Train ``Net`` for one Ray Tune trial and report validation metrics every epoch.

    The training set returned by ``load_data`` is split 80/20 into training
    and validation subsets. For each of the 10 epochs the function trains with
    SGD (lr=1e-3), evaluates on the validation subset, writes the model and
    optimizer state to ``my_model/checkpoint.pt`` and reports ``loss`` and
    ``accuracy`` together with that checkpoint through ``session.report``.

    Args:
        config: Trial hyperparameters; only ``config["kernel_size"]`` is read.
    """
    net = Net(
        config["kernel_size"]
    )  # Will need to modify the parameters depending on what you are experimenting with

    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda:0"
        if torch.cuda.device_count() > 1:
            # DataParallel prefixes every state-dict key with "module.".
            net = nn.DataParallel(net)
    net.to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(), lr=1e-3)

    # Do not modify any code below!
    # Resume model and optimizer state when the session provides a checkpoint.
    loaded_checkpoint = session.get_checkpoint()
    if loaded_checkpoint:
        with loaded_checkpoint.as_directory() as loaded_checkpoint_dir:
            model_state, optimizer_state = torch.load(
                os.path.join(loaded_checkpoint_dir, "checkpoint.pt")
            )
            net.load_state_dict(model_state)
            optimizer.load_state_dict(optimizer_state)

    data_dir = os.path.abspath("./data")
    trainset, testset = load_data(data_dir)

    # 80% of the training set is used for training, the rest for validation.
    test_abs = int(len(trainset) * 0.8)
    train_subset, val_subset = random_split(
        trainset, [test_abs, len(trainset) - test_abs]
    )

    trainloader = torch.utils.data.DataLoader(
        train_subset, batch_size=64, shuffle=True, num_workers=8
    )
    valloader = torch.utils.data.DataLoader(
        val_subset, batch_size=64, shuffle=True, num_workers=8
    )

    for epoch in range(10):
        # Explicit train mode keeps the loop correct if the network later
        # gains layers that behave differently in training and evaluation.
        net.train()
        running_loss = 0.0
        epoch_steps = 0
        for i, (inputs, labels) in enumerate(trainloader, 0):
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()

            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            epoch_steps += 1
            if i % 2000 == 1999:
                print(
                    f"[{epoch + 1}, {i + 1:>5} loss: {running_loss / epoch_steps:.3f}]"
                )
                # Reset the step counter together with the accumulated loss so
                # the next printed value is the mean over the steps since this
                # print, not a partial sum divided by all steps of the epoch.
                running_loss = 0.0
                epoch_steps = 0

        val_loss = 0.0
        val_steps = 0
        total = 0
        correct = 0
        net.eval()
        with torch.no_grad():
            for inputs, labels in valloader:
                inputs, labels = inputs.to(device), labels.to(device)

                outputs = net(inputs)
                _, predicted = torch.max(outputs, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

                # .item() yields a plain Python float, matching how the
                # training loss is accumulated above.
                val_loss += criterion(outputs, labels).item()
                val_steps += 1

        os.makedirs("my_model", exist_ok=True)
        torch.save((net.state_dict(), optimizer.state_dict()), "my_model/checkpoint.pt")
        checkpoint = Checkpoint.from_directory("my_model")
        session.report(
            {"loss": (val_loss / val_steps), "accuracy": correct / total},
            checkpoint=checkpoint,
        )
    print("Finished Training")


## Test set accuracy


In [8]:
def test_best_model(best_result):
    """Evaluate the best trial's checkpoint on the test set and print its accuracy.

    Args:
        best_result: Tune result object; ``best_result.config["kernel_size"]``
            is used to rebuild ``Net`` and ``best_result.checkpoint`` supplies
            the directory containing ``checkpoint.pt``.
    """
    best_trained_model = Net(
        best_result.config["kernel_size"]
    )  # Will need to modify the parameters depending on what you are experimenting with

    # Do not modify any code below!
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    best_trained_model.to(device)

    checkpoint_path = os.path.join(
        best_result.checkpoint.to_directory(), "checkpoint.pt"
    )

    # map_location lets a checkpoint written on a GPU be loaded on whatever
    # device this process uses. The optimizer state is not needed here.
    model_state, _ = torch.load(checkpoint_path, map_location=device)

    # The checkpoint keys carry a "module." prefix only when training wrapped
    # the network in nn.DataParallel. Strip the prefix when present and leave
    # other keys untouched, so single-device checkpoints also load.
    prefix = "module."
    new_model_state = OrderedDict(
        (key[len(prefix):] if key.startswith(prefix) else key, value)
        for key, value in model_state.items()
    )

    best_trained_model.load_state_dict(new_model_state)
    # Evaluation mode: relevant once the network contains layers that behave
    # differently during training and inference.
    best_trained_model.eval()

    _, testset = load_data()

    testloader = torch.utils.data.DataLoader(
        testset, batch_size=4, shuffle=False, num_workers=2
    )

    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in testloader:
            images, labels = images.to(device), labels.to(device)
            outputs = best_trained_model(images)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    print(f"Best trial test set accuracy: {correct / total}")


## Configuring the search space


In [9]:
# You will be experimenting with the hyperparameters here
# Use tune.grid_search to specify what values you want to experiment with a hyperparameter
# Candidate convolution kernel sizes; each value is tried by the grid search.
kernel_sizes = [1, 2, 3, 5, 10, 15, 32]
config = {"kernel_size": tune.grid_search(kernel_sizes)}


In [10]:
# Do not modify any code below!
def main(config):
    """Run the Tune experiment, report the best trial, and return the results table.

    Args:
        config: Search space passed to ``tune.Tuner`` as ``param_space``.

    Returns:
        The dataframe produced by ``results.get_dataframe("accuracy", "max")``.
    """
    # Each trial is allotted 4 CPUs and 4 GPUs.
    trainable = tune.with_parameters(train_cifar)
    trainable = tune.with_resources(trainable, resources={"cpu": 4, "gpu": 4})

    results = tune.Tuner(trainable, param_space=config).fit()

    # The best trial is the one with the highest validation accuracy.
    best_result = results.get_best_result("accuracy", "max")

    print(f"Best trial config: {best_result.config}")
    print(f"Best trial final validation loss: {best_result.metrics['loss']}")
    print(f"Best trial final validation accuracy: {best_result.metrics['accuracy']}")

    test_best_model(best_result)

    summary = results.get_dataframe("accuracy", "max")
    return summary


results_df = main(config)


0,1
Current time:,2023-04-08 02:23:40
Running for:,00:08:27.23
Memory:,120.8/1511.6 GiB

Trial name,status,loc,kernel_size,iter,total time (s),loss,accuracy
train_cifar_b8ee4_00000,TERMINATED,128.6.4.148:1801667,1,10,89.0534,2.03016,0.3049
train_cifar_b8ee4_00001,TERMINATED,128.6.4.148:1801667,2,10,88.4936,1.90522,0.3382
train_cifar_b8ee4_00002,TERMINATED,128.6.4.148:1801667,3,10,62.68,1.8127,0.367
train_cifar_b8ee4_00003,TERMINATED,128.6.4.148:1801667,5,10,64.5858,1.73498,0.4027
train_cifar_b8ee4_00004,TERMINATED,128.6.4.148:1801667,10,10,63.342,1.71815,0.3955
train_cifar_b8ee4_00005,TERMINATED,128.6.4.148:1801667,15,10,64.8978,1.64808,0.4178
train_cifar_b8ee4_00006,TERMINATED,128.6.4.148:1801667,32,10,69.6977,1.62944,0.4252


0it [00:00, ?it/s]far pid=1801667)[0m 
  0%|          | 8192/170498071 [00:00<41:15, 68864.23it/s]


[2m[36m(train_cifar pid=1801667)[0m Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to /common/home/vig4/ray_results/train_cifar_2023-04-08_02-15-13/train_cifar_b8ee4_00000_0_kernel_size=1_2023-04-08_02-15-13/data/cifar-10-python.tar.gz


  0%|          | 319488/170498071 [00:00<01:43, 1649852.08it/s]
  1%|          | 1646592/170498071 [00:00<00:25, 6605169.86it/s]
  2%|▏         | 3153920/170498071 [00:00<00:19, 8411005.63it/s]
  3%|▎         | 5251072/170498071 [00:00<00:15, 10489991.14it/s]
  4%|▍         | 7348224/170498071 [00:00<00:14, 11588170.52it/s]
  6%|▌         | 9445376/170498071 [00:00<00:13, 12218829.73it/s]
  7%|▋         | 11542528/170498071 [00:01<00:12, 12762765.48it/s]
  8%|▊         | 13639680/170498071 [00:01<00:11, 13185775.10it/s]
  9%|▉         | 15736832/170498071 [00:01<00:11, 13054386.02it/s]
 10%|█         | 17833984/170498071 [00:01<00:11, 13199912.66it/s]
 12%|█▏        | 19931136/170498071 [00:01<00:11, 13370268.11it/s]
 13%|█▎        | 22028288/170498071 [00:01<00:10, 13522474.13it/s]
 14%|█▍        | 24125440/170498071 [00:02<00:10, 13687592.04it/s]
 15%|█▌        | 26222592/170498071 [00:02<00:10, 13682130.79it/s]
 17%|█▋        | 28319744/170498071 [00:02<00:10, 13725505.55it/s]
 18%|

[2m[36m(train_cifar pid=1801667)[0m Files already downloaded and verified


[2m[36m(train_cifar pid=1801667)[0m   img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes()))
[2m[36m(train_cifar pid=1801667)[0m   img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes()))
[2m[36m(train_cifar pid=1801667)[0m   img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes()))
[2m[36m(train_cifar pid=1801667)[0m   img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes()))
[2m[36m(train_cifar pid=1801667)[0m   img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes()))
[2m[36m(train_cifar pid=1801667)[0m   img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes()))
[2m[36m(train_cifar pid=1801667)[0m   img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes()))
[2m[36m(train_cifar pid=1801667)[0m   img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes()))
170500096it [00:30, 13343691.10it/s]                               
170500096it [00:33, 5020793.95it/s] 0m 
[2m[36m(tr

Trial name,accuracy,date,done,episodes_total,experiment_id,experiment_tag,hostname,iterations_since_restore,loss,node_ip,pid,should_checkpoint,time_since_restore,time_this_iter_s,time_total_s,timestamp,timesteps_since_restore,timesteps_total,training_iteration,trial_id,warmup_time
train_cifar_b8ee4_00000,0.3049,2023-04-08_02-16-46,True,,7a3beb6ec4864b71b354f8d2ce0995e6,0_kernel_size=1,rlab1.cs.rutgers.edu,10,2.03016,128.6.4.148,1801667,True,89.0534,5.52512,89.0534,1680934606,0,,10,b8ee4_00000,0.0058682
train_cifar_b8ee4_00001,0.3382,2023-04-08_02-18-15,True,,7a3beb6ec4864b71b354f8d2ce0995e6,1_kernel_size=2,rlab1.cs.rutgers.edu,10,1.90522,128.6.4.148,1801667,True,88.4936,6.40203,88.4936,1680934695,0,,10,b8ee4_00001,0.0058682
train_cifar_b8ee4_00002,0.367,2023-04-08_02-19-17,True,,7a3beb6ec4864b71b354f8d2ce0995e6,2_kernel_size=3,rlab1.cs.rutgers.edu,10,1.8127,128.6.4.148,1801667,True,62.68,5.82265,62.68,1680934757,0,,10,b8ee4_00002,0.0058682
train_cifar_b8ee4_00003,0.4027,2023-04-08_02-20-22,True,,7a3beb6ec4864b71b354f8d2ce0995e6,3_kernel_size=5,rlab1.cs.rutgers.edu,10,1.73498,128.6.4.148,1801667,True,64.5858,6.12053,64.5858,1680934822,0,,10,b8ee4_00003,0.0058682
train_cifar_b8ee4_00004,0.3955,2023-04-08_02-21-25,True,,7a3beb6ec4864b71b354f8d2ce0995e6,4_kernel_size=10,rlab1.cs.rutgers.edu,10,1.71815,128.6.4.148,1801667,True,63.342,5.80842,63.342,1680934885,0,,10,b8ee4_00004,0.0058682
train_cifar_b8ee4_00005,0.4178,2023-04-08_02-22-30,True,,7a3beb6ec4864b71b354f8d2ce0995e6,5_kernel_size=15,rlab1.cs.rutgers.edu,10,1.64808,128.6.4.148,1801667,True,64.8978,6.32728,64.8978,1680934950,0,,10,b8ee4_00005,0.0058682
train_cifar_b8ee4_00006,0.4252,2023-04-08_02-23-40,True,,7a3beb6ec4864b71b354f8d2ce0995e6,6_kernel_size=32,rlab1.cs.rutgers.edu,10,1.62944,128.6.4.148,1801667,True,69.6977,5.65356,69.6977,1680935020,0,,10,b8ee4_00006,0.0058682


[2m[36m(train_cifar pid=1801667)[0m   img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes()))
[2m[36m(train_cifar pid=1801667)[0m   img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes()))
[2m[36m(train_cifar pid=1801667)[0m   img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes()))
[2m[36m(train_cifar pid=1801667)[0m   img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes()))
[2m[36m(train_cifar pid=1801667)[0m   img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes()))
[2m[36m(train_cifar pid=1801667)[0m   img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes()))
[2m[36m(train_cifar pid=1801667)[0m   img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes()))
[2m[36m(train_cifar pid=1801667)[0m   img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes()))
[2m[36m(train_cifar pid=1801667)[0m   img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes()))
[2m[36m(

[2m[36m(train_cifar pid=1801667)[0m Finished Training
[2m[36m(train_cifar pid=1801667)[0m Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to /common/home/vig4/ray_results/train_cifar_2023-04-08_02-15-13/train_cifar_b8ee4_00001_1_kernel_size=2_2023-04-08_02-16-46/data/cifar-10-python.tar.gz


  0%|          | 8192/170498071 [00:00<42:59, 66099.43it/s]
  0%|          | 319488/170498071 [00:00<01:45, 1617254.72it/s]
  1%|          | 1859584/170498071 [00:00<00:22, 7427353.71it/s]
  2%|▏         | 3153920/170498071 [00:00<00:19, 8384159.38it/s]
  3%|▎         | 5251072/170498071 [00:00<00:14, 11225778.87it/s]
  4%|▍         | 7348224/170498071 [00:00<00:12, 12842289.27it/s]
  6%|▌         | 9445376/170498071 [00:00<00:11, 13804368.10it/s]
  7%|▋         | 11542528/170498071 [00:01<00:10, 14476686.67it/s]
  8%|▊         | 13639680/170498071 [00:01<00:10, 14765415.45it/s]
  9%|▉         | 15736832/170498071 [00:01<00:10, 15006525.84it/s]
 10%|█         | 17833984/170498071 [00:01<00:10, 15222082.20it/s]
 12%|█▏        | 19931136/170498071 [00:01<00:09, 15397452.44it/s]
 13%|█▎        | 22028288/170498071 [00:01<00:09, 15475895.53it/s]
 14%|█▍        | 24125440/170498071 [00:01<00:09, 15611896.66it/s]
 15%|█▌        | 26222592/170498071 [00:01<00:09, 15594845.36it/s]
 17%|█▋     

[2m[36m(train_cifar pid=1801667)[0m Files already downloaded and verified


[2m[36m(train_cifar pid=1801667)[0m   img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes()))
[2m[36m(train_cifar pid=1801667)[0m   img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes()))
170500096it [00:27, 6207029.17it/s] 0m 
[2m[36m(train_cifar pid=1801667)[0m   img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes()))
[2m[36m(train_cifar pid=1801667)[0m   img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes()))
[2m[36m(train_cifar pid=1801667)[0m   img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes()))
[2m[36m(train_cifar pid=1801667)[0m   img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes()))
[2m[36m(train_cifar pid=1801667)[0m   img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes()))
170500096it [00:27, 6200578.32it/s] 0m 
170500096it [00:27, 6197661.57it/s] 0m 
170500096it [00:27, 6196513.42it/s] 0m 
[2m[36m(train_cifar pid=1801667)[0m   img = torch.ByteTensor(torch.

[2m[36m(train_cifar pid=1801667)[0m Finished Training
[2m[36m(train_cifar pid=1801667)[0m Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to /common/home/vig4/ray_results/train_cifar_2023-04-08_02-15-13/train_cifar_b8ee4_00002_2_kernel_size=3_2023-04-08_02-18-15/data/cifar-10-python.tar.gz


  0%|          | 294912/170498071 [00:00<01:54, 1480028.59it/s]
  2%|▏         | 3874816/170498071 [00:00<00:10, 15254600.81it/s]
  6%|▌         | 10289152/170498071 [00:00<00:05, 31972903.43it/s]
 10%|█         | 17063936/170498071 [00:00<00:03, 43960379.95it/s]
 14%|█▎        | 23371776/170498071 [00:00<00:02, 50148605.80it/s]
 18%|█▊        | 30842880/170498071 [00:00<00:02, 57608846.83it/s]
 22%|██▏       | 38248448/170498071 [00:00<00:02, 62705731.04it/s]
 27%|██▋       | 45735936/170498071 [00:00<00:01, 66430291.36it/s]
 32%|███▏      | 54009856/170498071 [00:01<00:01, 71393510.83it/s]
 37%|███▋      | 62382080/170498071 [00:01<00:01, 75121649.16it/s]
 42%|████▏     | 71688192/170498071 [00:01<00:01, 80519805.07it/s]
 47%|████▋     | 80543744/170498071 [00:01<00:01, 82938723.62it/s]
 53%|█████▎    | 90144768/170498071 [00:01<00:00, 86772087.89it/s]
 58%|█████▊    | 99729408/170498071 [00:01<00:00, 89487090.22it/s]
 64%|██████▍   | 109649920/170498071 [00:01<00:00, 92337977.61it/s

[2m[36m(train_cifar pid=1801667)[0m Files already downloaded and verified


[2m[36m(train_cifar pid=1801667)[0m   img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes()))
[2m[36m(train_cifar pid=1801667)[0m   img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes()))
[2m[36m(train_cifar pid=1801667)[0m   img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes()))
170500096it [00:05, 30433506.09it/s]                                
[2m[36m(train_cifar pid=1801667)[0m   img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes()))
[2m[36m(train_cifar pid=1801667)[0m   img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes()))
[2m[36m(train_cifar pid=1801667)[0m   img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes()))
[2m[36m(train_cifar pid=1801667)[0m   img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes()))
170500096it [00:05, 30315266.93it/s]                                
170500096it [00:05, 30300793.29it/s]                                
[2m[36m(train_cifar p

[2m[36m(train_cifar pid=1801667)[0m Finished Training
[2m[36m(train_cifar pid=1801667)[0m Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to /common/home/vig4/ray_results/train_cifar_2023-04-08_02-15-13/train_cifar_b8ee4_00003_3_kernel_size=5_2023-04-08_02-19-17/data/cifar-10-python.tar.gz


  0%|          | 0/170498071 [00:01<?, ?it/s]
  0%|          | 155648/170498071 [00:01<01:58, 1434704.51it/s]
  1%|▏         | 2228224/170498071 [00:01<00:13, 12396142.76it/s]
  5%|▍         | 7716864/170498071 [00:01<00:05, 31430776.19it/s]
  9%|▊         | 14688256/170498071 [00:01<00:03, 46335060.75it/s]
 14%|█▎        | 23076864/170498071 [00:01<00:02, 59719884.10it/s]
 18%|█▊        | 30965760/170498071 [00:01<00:02, 66172827.71it/s]
 24%|██▍       | 40902656/170498071 [00:01<00:01, 76852068.49it/s]
 29%|██▉       | 49061888/170498071 [00:01<00:01, 78349540.72it/s]
 34%|███▍      | 58040320/170498071 [00:02<00:01, 81909187.65it/s]
 39%|███▉      | 67043328/170498071 [00:02<00:01, 84400265.98it/s]
 44%|████▍     | 75423744/170498071 [00:02<00:01, 82858352.19it/s]
 50%|█████     | 85549056/170498071 [00:02<00:00, 88354242.03it/s]
 55%|█████▌    | 94363648/170498071 [00:02<00:00, 85931983.43it/s]
 61%|██████    | 103383040/170498071 [00:02<00:00, 87182389.72it/s]
 66%|██████▌   | 112

[2m[36m(train_cifar pid=1801667)[0m Files already downloaded and verified


[2m[36m(train_cifar pid=1801667)[0m   img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes()))
[2m[36m(train_cifar pid=1801667)[0m   img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes()))
[2m[36m(train_cifar pid=1801667)[0m 
[2m[36m(train_cifar pid=1801667)[0m   img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes()))
[2m[36m(train_cifar pid=1801667)[0m   img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes()))
[2m[36m(train_cifar pid=1801667)[0m   img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes()))
[2m[36m(train_cifar pid=1801667)[0m   img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes()))
[2m[36m(train_cifar pid=1801667)[0m   img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes()))
170500096it [00:06, 25733464.98it/s]                               
170500096it [00:06, 25704113.67it/s]                               
170500096it [00:06, 25690964.19it/s]                  

[2m[36m(train_cifar pid=1801667)[0m Finished Training
[2m[36m(train_cifar pid=1801667)[0m Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to /common/home/vig4/ray_results/train_cifar_2023-04-08_02-15-13/train_cifar_b8ee4_00004_4_kernel_size=10_2023-04-08_02-20-22/data/cifar-10-python.tar.gz


  0%|          | 319488/170498071 [00:00<01:45, 1612717.12it/s]
  2%|▏         | 4005888/170498071 [00:00<00:10, 15812634.45it/s]
  6%|▌         | 10444800/170498071 [00:00<00:04, 33583366.55it/s]
  9%|▉         | 15949824/170498071 [00:00<00:03, 40938306.14it/s]
 13%|█▎        | 22331392/170498071 [00:00<00:03, 48251830.48it/s]
 17%|█▋        | 28983296/170498071 [00:00<00:02, 54064526.25it/s]
 21%|██        | 35258368/170498071 [00:00<00:02, 56771462.58it/s]
 25%|██▍       | 42328064/170498071 [00:00<00:02, 61059870.06it/s]
 29%|██▊       | 48726016/170498071 [00:01<00:01, 61930393.17it/s]
 33%|███▎      | 56008704/170498071 [00:01<00:01, 65242149.79it/s]
 37%|███▋      | 63176704/170498071 [00:01<00:01, 67190983.27it/s]
 41%|████▏     | 70483968/170498071 [00:01<00:01, 68888633.25it/s]
 46%|████▌     | 78340096/170498071 [00:01<00:01, 71796741.78it/s]
 51%|█████     | 86261760/170498071 [00:01<00:01, 74026611.51it/s]
 56%|█████▌    | 95019008/170498071 [00:01<00:00, 78093603.64it/s]

[2m[36m(train_cifar pid=1801667)[0m Files already downloaded and verified


[2m[36m(train_cifar pid=1801667)[0m   img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes()))
[2m[36m(train_cifar pid=1801667)[0m   img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes()))
[2m[36m(train_cifar pid=1801667)[0m   img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes()))
170500096it [00:05, 29169221.63it/s]                               
[2m[36m(train_cifar pid=1801667)[0m   img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes()))
[2m[36m(train_cifar pid=1801667)[0m   img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes()))
[2m[36m(train_cifar pid=1801667)[0m   img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes()))
[2m[36m(train_cifar pid=1801667)[0m   img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes()))
170500096it [00:05, 29022582.62it/s]                               
170500096it [00:05, 28997727.87it/s]                               
170500096it [00:05, 289725

[2m[36m(train_cifar pid=1801667)[0m Finished Training
[2m[36m(train_cifar pid=1801667)[0m Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to /common/home/vig4/ray_results/train_cifar_2023-04-08_02-15-13/train_cifar_b8ee4_00005_5_kernel_size=15_2023-04-08_02-21-25/data/cifar-10-python.tar.gz


0it [00:00, ?it/s]far pid=1801667)[0m 
  0%|          | 8192/170498071 [00:00<42:52, 66265.28it/s]
  0%|          | 204800/170498071 [00:00<02:45, 1030676.65it/s]
  0%|          | 425984/170498071 [00:00<01:55, 1468023.63it/s]
  2%|▏         | 2646016/170498071 [00:00<00:19, 8764409.59it/s]
  5%|▌         | 9142272/170498071 [00:00<00:06, 26781779.96it/s]
  9%|▉         | 15736832/170498071 [00:00<00:03, 39127885.26it/s]
 13%|█▎        | 21553152/170498071 [00:00<00:03, 45051994.51it/s]
 16%|█▋        | 28024832/170498071 [00:00<00:02, 50950230.97it/s]
 21%|██        | 35045376/170498071 [00:00<00:02, 56800895.95it/s]
 25%|██▍       | 42057728/170498071 [00:01<00:02, 60741028.23it/s]
 29%|██▉       | 49848320/170498071 [00:01<00:01, 65916689.87it/s]
 34%|███▎      | 57434112/170498071 [00:01<00:01, 68906228.16it/s]
 39%|███▊      | 65716224/170498071 [00:01<00:01, 73058013.85it/s]
 43%|████▎     | 74121216/170498071 [00:01<00:01, 76347296.69it/s]
 49%|████▊     | 82837504/170498071 [0

[2m[36m(train_cifar pid=1801667)[0m Files already downloaded and verified


[2m[36m(train_cifar pid=1801667)[0m   img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes()))
[2m[36m(train_cifar pid=1801667)[0m   img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes()))
[2m[36m(train_cifar pid=1801667)[0m   img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes()))
[2m[36m(train_cifar pid=1801667)[0m 
[2m[36m(train_cifar pid=1801667)[0m   img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes()))
[2m[36m(train_cifar pid=1801667)[0m   img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes()))
[2m[36m(train_cifar pid=1801667)[0m   img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes()))
[2m[36m(train_cifar pid=1801667)[0m   img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes()))
170500096it [00:05, 29676493.16it/s]                               
170500096it [00:05, 29655172.45it/s]                               
170500096it [00:05, 29627763.34it/s]                  

[2m[36m(train_cifar pid=1801667)[0m Finished Training
[2m[36m(train_cifar pid=1801667)[0m Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to /common/home/vig4/ray_results/train_cifar_2023-04-08_02-15-13/train_cifar_b8ee4_00006_6_kernel_size=32_2023-04-08_02-22-30/data/cifar-10-python.tar.gz


0it [00:00, ?it/s]far pid=1801667)[0m 
  0%|          | 0/170498071 [00:01<?, ?it/s]
  0%|          | 147456/170498071 [00:01<02:05, 1352798.86it/s]
  1%|          | 2105344/170498071 [00:01<00:14, 11682604.69it/s]
  5%|▍         | 7798784/170498071 [00:01<00:05, 31925149.59it/s]
  9%|▉         | 14966784/170498071 [00:01<00:03, 47421011.62it/s]
 14%|█▍        | 24346624/170498071 [00:01<00:02, 63843572.35it/s]
 18%|█▊        | 31293440/170498071 [00:01<00:02, 65737817.60it/s]
 24%|██▍       | 40902656/170498071 [00:01<00:01, 75390265.71it/s]
 28%|██▊       | 48267264/170498071 [00:01<00:01, 74760739.03it/s]
 34%|███▎      | 57376768/170498071 [00:02<00:01, 79839064.21it/s]
 38%|███▊      | 65437696/170498071 [00:02<00:01, 79919268.07it/s]
 43%|████▎     | 74072064/170498071 [00:02<00:01, 81834672.29it/s]
 48%|████▊     | 82608128/170498071 [00:02<00:01, 82898515.57it/s]
 53%|█████▎    | 90873856/170498071 [00:02<00:00, 82783837.06it/s]
 58%|█████▊    | 99131392/170498071 [00:02<00:00

[2m[36m(train_cifar pid=1801667)[0m Files already downloaded and verified


[2m[36m(train_cifar pid=1801667)[0m   img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes()))
[2m[36m(train_cifar pid=1801667)[0m   img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes()))
[2m[36m(train_cifar pid=1801667)[0m 
[2m[36m(train_cifar pid=1801667)[0m   img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes()))
[2m[36m(train_cifar pid=1801667)[0m   img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes()))
[2m[36m(train_cifar pid=1801667)[0m   img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes()))
[2m[36m(train_cifar pid=1801667)[0m   img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes()))
[2m[36m(train_cifar pid=1801667)[0m   img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes()))
170500096it [00:06, 24908570.19it/s]                               
[2m[36m(train_cifar pid=1801667)[0m   img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes()))
170500096it 

Best trial config: {'kernel_size': 32}
Best trial final validation loss: 1.629443556639799
Best trial final validation accuracy: 0.4252
[2m[36m(train_cifar pid=1801667)[0m Finished Training
Files already downloaded and verified
Files already downloaded and verified


  img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes()))
  img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes()))
  return F.conv2d(input, weight, bias, self.stride,


Best trial test set accuracy: 0.4379


In [11]:
# Hide Ray Tune bookkeeping columns so only the metrics, timings and the
# searched hyperparameter remain in the displayed table.
# errors="ignore" keeps this cell working when some of these columns are
# absent from the results dataframe instead of raising KeyError.
results_df.drop(
    columns=[
        "should_checkpoint",
        "done",
        "timesteps_total",
        "episodes_total",
        "training_iteration",
        "trial_id",
        "experiment_id",
        "date",
        "timestamp",
        "pid",
        "hostname",
        "node_ip",
        "time_since_restore",
        "timesteps_since_restore",
        "iterations_since_restore",
        "warmup_time",
        "logdir",
    ],
    errors="ignore",
)


Unnamed: 0,loss,accuracy,time_this_iter_s,time_total_s,config/kernel_size
0,2.030164,0.3049,5.525124,89.053396,1
1,1.911807,0.3401,6.023555,82.091532,2
2,1.812696,0.367,5.822653,62.679997,3
3,1.73498,0.4027,6.120526,64.585817,5
4,1.718148,0.3955,5.808425,63.341961,10
5,1.64808,0.4178,6.327283,64.897759,15
6,1.629444,0.4252,5.653556,69.697712,32
