## Ray-tune for Hyperparameter Turning

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/BoostcampAITech/lecture-note-python-basics-for-ai/blob/main/codes/pytorch/07_torch-study/ray-tune/ray_tune.ipynb)

In [1]:
! pip install ray

Collecting ray
  Downloading ray-2.3.0-cp310-cp310-manylinux2014_x86_64.whl (58.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.5/58.5 MB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
Collecting filelock
  Downloading filelock-3.9.1-py3-none-any.whl (9.7 kB)
Collecting virtualenv>=20.0.24
  Downloading virtualenv-20.21.0-py3-none-any.whl (8.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.7/8.7 MB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting aiosignal
  Downloading aiosignal-1.3.1-py3-none-any.whl (7.6 kB)
Collecting protobuf!=3.19.5,>=3.15.3
  Downloading protobuf-4.22.1-cp37-abi3-manylinux2014_x86_64.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.4/302.4 kB[0m [31m15.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting msgpack<2.0.0,>=1.0.0
  Downloading msgpack-1.0.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (316 kB)
[2K     [90m━━━━━━━━━━━━━━━

In [2]:
!pip install tensorboardX

Collecting tensorboardX
  Downloading tensorboardX-2.6-py2.py3-none-any.whl (114 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m114.5/114.5 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
Collecting protobuf<4,>=3.8.0
  Downloading protobuf-3.20.3-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: protobuf, tensorboardX
  Attempting uninstall: protobuf
    Found existing installation: protobuf 4.22.1
    Uninstalling protobuf-4.22.1:
      Successfully uninstalled protobuf-4.22.1
Successfully installed protobuf-3.20.3 tensorboardX-2.6


In [3]:
!pip install wandb

Collecting wandb
  Downloading wandb-0.14.0-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m00:01[0mm00:01[0m
Collecting docker-pycreds>=0.4.0
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)
Collecting setproctitle
  Downloading setproctitle-1.3.2-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (30 kB)
Collecting GitPython!=3.1.29,>=1.0.0
  Downloading GitPython-3.1.31-py3-none-any.whl (184 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m184.3/184.3 kB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
Collecting sentry-sdk>=1.0.0
  Downloading sentry_sdk-1.16.0-py2.py3-none-any.whl (184 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m184.3/184.3 kB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting appdirs>=1.4.3
  Downloading appdirs-1.4.4-py2.py3-none-any.whl (9.6 kB)
Collecting path

In [7]:
!pip install ray[tune]

Collecting tabulate
  Downloading tabulate-0.9.0-py3-none-any.whl (35 kB)
Installing collected packages: tabulate
Successfully installed tabulate-0.9.0


In [8]:
from functools import partial
import numpy as np
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import random_split
import torchvision
import torchvision.transforms as transforms
from ray import tune
from ray.tune import CLIReporter
from ray.tune.schedulers import ASHAScheduler

import wandb

In [9]:
# 데이터 잘라내기
def load_data(data_dir="./data"):
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])

    trainset = torchvision.datasets.CIFAR10(
        root=data_dir, train=True, download=True, transform=transform)

    testset = torchvision.datasets.CIFAR10(
        root=data_dir, train=False, download=True, transform=transform)

    return trainset, testset


In [10]:
class Net(nn.Module):
    # l1, l2: Elastic net에서의 l1, l2 loss가 아님
    # 단순히 마지막 Linear layer의 크기 설정을 위한 hyperparameter
    def __init__(self, l1=120, l2=84):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 5 * 5, l1)
        self.fc2 = nn.Linear(l1, l2)
        self.fc3 = nn.Linear(l2, 10)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(-1, 16 * 5 * 5)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

In [12]:
# train의 과정이 반드시 하나의 함수로 선언되어 있어야 함
# 그래야만 ray가 이것을 학습 함수로 불러와서 tuning이 가능
def train_cifar(config, checkpoint_dir=None, data_dir=None):
    net = Net(config["l1"], config["l2"])

    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda:0"
        if torch.cuda.device_count() > 1:
            net = nn.DataParallel(net)
    net.to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(), lr=config["lr"], momentum=0.9)

    if checkpoint_dir:
        model_state, optimizer_state = torch.load(
            os.path.join(checkpoint_dir, "checkpoint"))
        net.load_state_dict(model_state)
        optimizer.load_state_dict(optimizer_state)

    trainset, testset = load_data(data_dir)

    test_abs = int(len(trainset) * 0.8)
    train_subset, val_subset = random_split(
        trainset, [test_abs, len(trainset) - test_abs])

    trainloader = torch.utils.data.DataLoader(
        train_subset,
        batch_size=int(config["batch_size"]),
        shuffle=True,
        num_workers=8)
    valloader = torch.utils.data.DataLoader(
        val_subset,
        batch_size=int(config["batch_size"]),
        shuffle=True,
        num_workers=8)
    
    # WandB로 monitoring까지 하면 더 좋음 (필수는 아님)
    wandb.init(project='torch-turn', entity='teamlab')
    wandb.watch(net)

    for epoch in range(10):  # loop over the dataset multiple times
        running_loss = 0.0
        epoch_steps = 0
        for i, data in enumerate(trainloader, 0):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data
            inputs, labels = inputs.to(device), labels.to(device)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()
            epoch_steps += 1
            if i % 2000 == 1999:  # print every 2000 mini-batches
                print("[%d, %5d] loss: %.3f" % (epoch + 1, i + 1,
                                                running_loss / epoch_steps))
                running_loss = 0.0

        # Validation loss
        val_loss = 0.0
        val_steps = 0
        total = 0
        correct = 0
        for i, data in enumerate(valloader, 0):
            with torch.no_grad():
                inputs, labels = data
                inputs, labels = inputs.to(device), labels.to(device)

                outputs = net(inputs)
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

                loss = criterion(outputs, labels)
                val_loss += loss.cpu().numpy()
                val_steps += 1

        wandb.log({"val_loss": val_loss})
        wandb.log({"loss": loss})

        with tune.checkpoint_dir(epoch) as checkpoint_dir:
            path = os.path.join(checkpoint_dir, "checkpoint")
            torch.save((net.state_dict(), optimizer.state_dict()), path)

        tune.report(loss=(val_loss / val_steps), accuracy=correct / total)
    print("Finished Training")

In [13]:
def test_accuracy(net, device="cpu"):
    trainset, testset = load_data()

    testloader = torch.utils.data.DataLoader(
        testset, batch_size=4, shuffle=False, num_workers=2)

    correct = 0
    total = 0
    with torch.no_grad():
        for data in testloader:
            images, labels = data
            images, labels = images.to(device), labels.to(device)
            outputs = net(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    return correct / total

## Main Hyperparameter Tuning Functions

In [15]:
torch.cuda.is_available()

True

In [14]:
from ray.tune.suggest.bayesopt import BayesOptSearch
from ray.tune.suggest.hyperopt import HyperOptSearch

def main(num_samples=10, max_num_epochs=10, gpus_per_trial=2):
    
    data_dir = os.path.abspath("./data")
    load_data(data_dir)
    
    # (중요도 1순위) config에 search space를 지정
    config = {
        "l1": tune.sample_from(lambda _: 2 ** np.random.randint(2, 9)),
        "l2": tune.sample_from(lambda _: 2 ** np.random.randint(2, 9)),
        "lr": tune.loguniform(1e-4, 1e-1),
        "batch_size": tune.choice([2, 4, 8, 16])
    }
    
    # (중요도 2순위) 학습 스케쥴링 알고리즘 지정
    scheduler = ASHAScheduler(
        metric="loss",
        mode="min",
        max_t=max_num_epochs,
        grace_period=1,
        reduction_factor=2)
    
    # 결과 출력 양식 지정
    reporter = CLIReporter(
        # parameter_columns=["l1", "l2", "lr", "batch_size"],
        metric_columns=["loss", "accuracy", "training_iteration"])
    
    # (중요도 3순위) 병렬 처리 방식으로 학습 수행하도록 설정
    # 위는 결국 전부 이 tune.run()을 실행하기 위한 과정들
    result = tune.run(
        partial(train_cifar, data_dir=data_dir),
        resources_per_trial={"cpu": 2, "gpu": gpus_per_trial},
        config=config,
        num_samples=num_samples,
        scheduler=scheduler,
        progress_reporter=reporter)

    best_trial = result.get_best_trial("loss", "min", "last")
    print("Best trial config: {}".format(best_trial.config))
    print("Best trial final validation loss: {}".format(
        best_trial.last_result["loss"]))
    print("Best trial final validation accuracy: {}".format(
        best_trial.last_result["accuracy"]))

    best_trained_model = Net(best_trial.config["l1"], best_trial.config["l2"])
    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda:0"
        if gpus_per_trial > 1:
            best_trained_model = nn.DataParallel(best_trained_model)
    best_trained_model.to(device)

    best_checkpoint_dir = best_trial.checkpoint.value
    model_state, optimizer_state = torch.load(os.path.join(
        best_checkpoint_dir, "checkpoint"))
    best_trained_model.load_state_dict(model_state)

    test_acc = test_accuracy(best_trained_model, device)
    print("Best trial test set accuracy: {}".format(test_acc))


if __name__ == "__main__":
    # You can change the number of GPUs per trial here:
    wandb.login(key="0a25ae829bf4e2a6cd2acfdd4e65e6a26cd9927e")
    main(num_samples=10, max_num_epochs=10, gpus_per_trial=0)

  from ray.tune.suggest.bayesopt import BayesOptSearch
  from ray.tune.suggest.bayesopt import BayesOptSearch
  from ray.tune.suggest.hyperopt import HyperOptSearch
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/kingstar/.netrc


Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to /home/kingstar/workspace/pytorch_practice/data/cifar-10-python.tar.gz


100.0%


Extracting /home/kingstar/workspace/pytorch_practice/data/cifar-10-python.tar.gz to /home/kingstar/workspace/pytorch_practice/data
Files already downloaded and verified


2023-03-15 11:17:12,343	INFO worker.py:1553 -- Started a local Ray instance.

from ray.air import session

def train(config):
    # ...
    session.report({"metric": metric}, checkpoint=checkpoint)

For more information please see https://docs.ray.io/en/master/tune/api_docs/trainable.html



== Status ==
Current time: 2023-03-15 11:17:12 (running for 00:00:00.05)
Memory usage on this node: 2.3/15.6 GiB 
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Resources requested: 2.0/12 CPUs, 0/0 GPUs, 0.0/8.34 GiB heap, 0.0/4.17 GiB objects
Result logdir: /home/kingstar/ray_results/train_cifar_2023-03-15_11-17-12
Number of trials: 10/10 (9 PENDING, 1 RUNNING)
+-------------------------+----------+---------------------+--------------+------+------+-------------+
| Trial name              | status   | loc                 |   batch_size |   l1 |   l2 |          lr |
|-------------------------+----------+---------------------+--------------+------+------+-------------|
| train_cifar_7f5ac_00000 | RUNNING  | 172.23.199.201:3668 |            8 |   32 |    8 | 0.000178243 |
| train_cifar_7f5ac_00001 | PENDING  |                     |            8 |    8 |  256 | 0.000102774 |
| train_cifar_7f5ac_00002 | PENDING  |    

[2m[36m(func pid=3668)[0m wandb: Currently logged in as: teamlab. Use `wandb login --relogin` to force relogin


[2m[36m(func pid=3733)[0m Files already downloaded and verified
[2m[36m(func pid=3731)[0m Files already downloaded and verified
[2m[36m(func pid=3729)[0m Files already downloaded and verified
[2m[36m(func pid=3737)[0m Files already downloaded and verified
[2m[36m(func pid=3735)[0m Files already downloaded and verified
[2m[36m(func pid=3735)[0m Files already downloaded and verified
[2m[36m(func pid=3733)[0m Files already downloaded and verified
[2m[36m(func pid=3731)[0m Files already downloaded and verified
[2m[36m(func pid=3729)[0m Files already downloaded and verified
[2m[36m(func pid=3737)[0m Files already downloaded and verified


[2m[36m(func pid=3733)[0m wandb: Currently logged in as: teamlab. Use `wandb login --relogin` to force relogin
[2m[36m(func pid=3729)[0m wandb: Currently logged in as: teamlab. Use `wandb login --relogin` to force relogin
[2m[36m(func pid=3737)[0m wandb: Currently logged in as: teamlab. Use `wandb login --relogin` to force relogin
[2m[36m(func pid=3731)[0m wandb: Currently logged in as: teamlab. Use `wandb login --relogin` to force relogin
[2m[36m(func pid=3735)[0m wandb: Currently logged in as: teamlab. Use `wandb login --relogin` to force relogin
[2m[36m(func pid=3668)[0m wandb: Tracking run with wandb version 0.14.0
[2m[36m(func pid=3668)[0m wandb: Run data is saved locally in /home/kingstar/ray_results/train_cifar_2023-03-15_11-17-12/train_cifar_7f5ac_00000_0_batch_size=8,l1=32,l2=8,lr=0.0002_2023-03-15_11-17-12/wandb/run-20230315_111716-v00mdj86
[2m[36m(func pid=3668)[0m wandb: Run `wandb offline` to turn off syncing.
[2m[36m(func pid=3668)[0m wandb: Syn

== Status ==
Current time: 2023-03-15 11:17:19 (running for 00:00:06.58)
Memory usage on this node: 5.3/15.6 GiB 
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Resources requested: 12.0/12 CPUs, 0/0 GPUs, 0.0/8.34 GiB heap, 0.0/4.17 GiB objects
Result logdir: /home/kingstar/ray_results/train_cifar_2023-03-15_11-17-12
Number of trials: 10/10 (4 PENDING, 6 RUNNING)
+-------------------------+----------+---------------------+--------------+------+------+-------------+
| Trial name              | status   | loc                 |   batch_size |   l1 |   l2 |          lr |
|-------------------------+----------+---------------------+--------------+------+------+-------------|
| train_cifar_7f5ac_00000 | RUNNING  | 172.23.199.201:3668 |            8 |   32 |    8 | 0.000178243 |
| train_cifar_7f5ac_00001 | RUNNING  | 172.23.199.201:3729 |            8 |    8 |  256 | 0.000102774 |
| train_cifar_7f5ac_00002 | RUNNING  | 17

[2m[36m(func pid=3737)[0m wandb: Tracking run with wandb version 0.14.0
[2m[36m(func pid=3737)[0m wandb: Run data is saved locally in /home/kingstar/ray_results/train_cifar_2023-03-15_11-17-12/train_cifar_7f5ac_00005_5_batch_size=8,l1=256,l2=16,lr=0.0002_2023-03-15_11-17-14/wandb/run-20230315_111718-47lxc7g1
[2m[36m(func pid=3737)[0m wandb: Run `wandb offline` to turn off syncing.
[2m[36m(func pid=3737)[0m wandb: Syncing run wandering-dream-653
[2m[36m(func pid=3737)[0m wandb: ⭐️ View project at https://wandb.ai/teamlab/torch-turn
[2m[36m(func pid=3737)[0m wandb: 🚀 View run at https://wandb.ai/teamlab/torch-turn/runs/47lxc7g1
[2m[36m(func pid=3735)[0m wandb: Tracking run with wandb version 0.14.0
[2m[36m(func pid=3735)[0m wandb: Run data is saved locally in /home/kingstar/ray_results/train_cifar_2023-03-15_11-17-12/train_cifar_7f5ac_00004_4_batch_size=8,l1=32,l2=8,lr=0.0967_2023-03-15_11-17-14/wandb/run-20230315_111718-pol4dxjk
[2m[36m(func pid=3735)[0m wandb

== Status ==
Current time: 2023-03-15 11:17:24 (running for 00:00:11.59)
Memory usage on this node: 5.7/15.6 GiB 
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Resources requested: 12.0/12 CPUs, 0/0 GPUs, 0.0/8.34 GiB heap, 0.0/4.17 GiB objects
Result logdir: /home/kingstar/ray_results/train_cifar_2023-03-15_11-17-12
Number of trials: 10/10 (4 PENDING, 6 RUNNING)
+-------------------------+----------+---------------------+--------------+------+------+-------------+
| Trial name              | status   | loc                 |   batch_size |   l1 |   l2 |          lr |
|-------------------------+----------+---------------------+--------------+------+------+-------------|
| train_cifar_7f5ac_00000 | RUNNING  | 172.23.199.201:3668 |            8 |   32 |    8 | 0.000178243 |
| train_cifar_7f5ac_00001 | RUNNING  | 172.23.199.201:3729 |            8 |    8 |  256 | 0.000102774 |
| train_cifar_7f5ac_00002 | RUNNING  | 17

Trial name,accuracy,date,done,episodes_total,experiment_id,hostname,iterations_since_restore,loss,node_ip,pid,should_checkpoint,time_since_restore,time_this_iter_s,time_total_s,timestamp,timesteps_since_restore,timesteps_total,training_iteration,trial_id,warmup_time
train_cifar_7f5ac_00000,0.0977,2023-03-15_11-17-41,True,,e8db405cdecc465f9c9ffa7d35e23ca0,Kingstar-Desktop,1,2.30576,172.23.199.201,3668,True,27.0859,27.0859,27.0859,1678846661,0,,1,7f5ac_00000,0.00225472
train_cifar_7f5ac_00001,0.2231,2023-03-15_11-18-10,True,,b18f88a018ea4bff95b25b534b5f43c3,Kingstar-Desktop,2,2.07262,172.23.199.201,3729,True,53.8996,25.5668,53.8996,1678846690,0,,2,7f5ac_00001,0.00249505
train_cifar_7f5ac_00002,0.5002,2023-03-15_11-18-53,False,,ba7f4749d5fb48d98d5d01df6f0e14c1,Kingstar-Desktop,6,1.39431,172.23.199.201,3731,True,96.7456,14.9573,96.7456,1678846733,0,,6,7f5ac_00002,0.00201845
train_cifar_7f5ac_00003,0.4036,2023-03-15_11-18-40,False,,f691bb3bc71b4588821332d81069e606,Kingstar-Desktop,1,1.66019,172.23.199.201,3733,True,84.2293,84.2293,84.2293,1678846720,0,,1,7f5ac_00003,0.00234413
train_cifar_7f5ac_00004,0.0992,2023-03-15_11-17-44,True,,279dc4323d6a4447b45be682234ddf16,Kingstar-Desktop,1,2.3294,172.23.199.201,3735,True,28.2683,28.2683,28.2683,1678846664,0,,1,7f5ac_00004,0.00282693
train_cifar_7f5ac_00005,0.3787,2023-03-15_11-18-38,False,,6885adb3ab5944baa27855abf637855b,Kingstar-Desktop,3,1.71061,172.23.199.201,3737,True,82.4263,25.3445,82.4263,1678846718,0,,3,7f5ac_00005,0.00226235
train_cifar_7f5ac_00006,0.288,2023-03-15_11-18-26,False,,e8db405cdecc465f9c9ffa7d35e23ca0,Kingstar-Desktop,1,1.80031,172.23.199.201,3668,True,44.5802,44.5802,44.5802,1678846706,0,,1,7f5ac_00006,0.00225472


[2m[36m(func pid=3729)[0m [1,  4000] loss: 1.148
[2m[36m(func pid=3735)[0m [1,  4000] loss: 1.166
[2m[36m(func pid=3737)[0m [1,  4000] loss: 1.147
== Status ==
Current time: 2023-03-15 11:17:41 (running for 00:00:28.33)
Memory usage on this node: 5.7/15.6 GiB 
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: -2.1763066513061524
Resources requested: 12.0/12 CPUs, 0/0 GPUs, 0.0/8.34 GiB heap, 0.0/4.17 GiB objects
Result logdir: /home/kingstar/ray_results/train_cifar_2023-03-15_11-17-12
Number of trials: 10/10 (4 PENDING, 6 RUNNING)
+-------------------------+----------+---------------------+--------------+------+------+-------------+---------+------------+----------------------+
| Trial name              | status   | loc                 |   batch_size |   l1 |   l2 |          lr |    loss |   accuracy |   training_iteration |
|-------------------------+----------+---------------------+--------------+------+------+-

[2m[36m(func pid=3735)[0m [1,  2000] loss: 2.318
[2m[36m(func pid=3729)[0m [2,  2000] loss: 2.235
[2m[36m(func pid=3733)[0m [1, 10000] loss: 0.345
[2m[36m(func pid=3737)[0m [2,  2000] loss: 2.165
== Status ==
Current time: 2023-03-15 11:17:57 (running for 00:00:44.86)
Memory usage on this node: 5.8/15.6 GiB 
Using AsyncHyperBand: num_stopped=2
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: -1.825210302734375 | Iter 1.000: -2.271267781639099
Resources requested: 12.0/12 CPUs, 0/0 GPUs, 0.0/8.34 GiB heap, 0.0/4.17 GiB objects
Result logdir: /home/kingstar/ray_results/train_cifar_2023-03-15_11-17-12
Number of trials: 10/10 (2 PENDING, 6 RUNNING, 2 TERMINATED)
+-------------------------+------------+---------------------+--------------+------+------+-------------+---------+------------+----------------------+
| Trial name              | status     | loc                 |   batch_size |   l1 |   l2 |          lr |    loss |   accuracy |   training_iteration |
|------

[2m[36m(func pid=3735)[0m [1,  6000] loss: 0.772
[2m[36m(func pid=3733)[0m [1, 14000] loss: 0.240
[2m[36m(func pid=3729)[0m Files already downloaded and verified
[2m[36m(func pid=3729)[0m Files already downloaded and verified
[2m[36m(func pid=3668)[0m [1,  8000] loss: 0.507
== Status ==
Current time: 2023-03-15 11:18:13 (running for 00:01:00.70)
Memory usage on this node: 5.7/15.6 GiB 
Using AsyncHyperBand: num_stopped=3
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: -1.9317257415771485 | Iter 1.000: -2.271267781639099
Resources requested: 12.0/12 CPUs, 0/0 GPUs, 0.0/8.34 GiB heap, 0.0/4.17 GiB objects
Result logdir: /home/kingstar/ray_results/train_cifar_2023-03-15_11-17-12
Number of trials: 10/10 (1 PENDING, 6 RUNNING, 3 TERMINATED)
+-------------------------+------------+---------------------+--------------+------+------+-------------+---------+------------+----------------------+
| Trial name              | status     | loc                 |   batch_size 

[2m[36m(func pid=3729)[0m [1,  4000] loss: 0.959
[2m[36m(func pid=3733)[0m [1, 20000] loss: 0.167
[2m[36m(func pid=3735)[0m [1, 12000] loss: 0.386
[2m[36m(func pid=3737)[0m [3,  4000] loss: 0.895
== Status ==
Current time: 2023-03-15 11:18:31 (running for 00:01:18.29)
Memory usage on this node: 5.9/15.6 GiB 
Using AsyncHyperBand: num_stopped=3
Bracket: Iter 8.000: None | Iter 4.000: -1.5010592198371888 | Iter 2.000: -1.9317257415771485 | Iter 1.000: -2.2568849164009093
Resources requested: 12.0/12 CPUs, 0/0 GPUs, 0.0/8.34 GiB heap, 0.0/4.17 GiB objects
Result logdir: /home/kingstar/ray_results/train_cifar_2023-03-15_11-17-12
Number of trials: 10/10 (1 PENDING, 6 RUNNING, 3 TERMINATED)
+-------------------------+------------+---------------------+--------------+------+------+-------------+---------+------------+----------------------+
| Trial name              | status     | loc                 |   batch_size |   l1 |   l2 |          lr |    loss |   accuracy |   training_it

[2m[36m(func pid=3668)[0m [2,  6000] loss: 0.534
[2m[36m(func pid=3733)[0m [2,  2000] loss: 1.624
[2m[36m(func pid=3737)[0m [4,  2000] loss: 1.675
[2m[36m(func pid=3729)[0m [1, 10000] loss: 0.347
[2m[36m(func pid=3735)[0m [1, 18000] loss: 0.257
[2m[36m(func pid=3731)[0m [6,  2000] loss: 1.370
== Status ==
Current time: 2023-03-15 11:18:50 (running for 00:01:37.84)
Memory usage on this node: 5.8/15.6 GiB 
Using AsyncHyperBand: num_stopped=3
Bracket: Iter 8.000: None | Iter 4.000: -1.5010592198371888 | Iter 2.000: -1.9317257415771485 | Iter 1.000: -2.24250205116272
Resources requested: 12.0/12 CPUs, 0/0 GPUs, 0.0/8.34 GiB heap, 0.0/4.17 GiB objects
Result logdir: /home/kingstar/ray_results/train_cifar_2023-03-15_11-17-12
Number of trials: 10/10 (1 PENDING, 6 RUNNING, 3 TERMINATED)
+-------------------------+------------+---------------------+--------------+------+------+-------------+---------+------------+----------------------+
| Trial name              | status     |

2023-03-15 11:18:53,237	ERROR tune.py:794 -- Trials did not complete: [train_cifar_7f5ac_00002, train_cifar_7f5ac_00003, train_cifar_7f5ac_00005, train_cifar_7f5ac_00006, train_cifar_7f5ac_00007, train_cifar_7f5ac_00008, train_cifar_7f5ac_00009]
2023-03-15 11:18:53,238	INFO tune.py:798 -- Total run time: 100.37 seconds (100.34 seconds for the tuning loop).
[2m[36m(func pid=3737)[0m 2023-03-15 11:18:53,253	ERROR worker.py:772 -- Worker exits with an exit code 1.
[2m[36m(func pid=3737)[0m Traceback (most recent call last):
[2m[36m(func pid=3737)[0m   File "python/ray/_raylet.pyx", line 1166, in ray._raylet.task_execution_handler
[2m[36m(func pid=3737)[0m   File "python/ray/_raylet.pyx", line 1072, in ray._raylet.execute_task_with_cancellation_handler
[2m[36m(func pid=3737)[0m   File "python/ray/_raylet.pyx", line 805, in ray._raylet.execute_task
[2m[36m(func pid=3737)[0m   File "python/ray/_raylet.pyx", line 850, in ray._raylet.execute_task
[2m[36m(func pid=3737)[0m 

== Status ==
Current time: 2023-03-15 11:18:53 (running for 00:01:40.35)
Memory usage on this node: 5.8/15.6 GiB 
Using AsyncHyperBand: num_stopped=3
Bracket: Iter 8.000: None | Iter 4.000: -1.5010592198371888 | Iter 2.000: -1.9317257415771485 | Iter 1.000: -2.24250205116272
Resources requested: 12.0/12 CPUs, 0/0 GPUs, 0.0/8.34 GiB heap, 0.0/4.17 GiB objects
Result logdir: /home/kingstar/ray_results/train_cifar_2023-03-15_11-17-12
Number of trials: 10/10 (1 PENDING, 6 RUNNING, 3 TERMINATED)
+-------------------------+------------+---------------------+--------------+------+------+-------------+---------+------------+----------------------+
| Trial name              | status     | loc                 |   batch_size |   l1 |   l2 |          lr |    loss |   accuracy |   training_iteration |
|-------------------------+------------+---------------------+--------------+------+------+-------------+---------+------------+----------------------|
| train_cifar_7f5ac_00002 | RUNNING    | 172.23.

AttributeError: '_TrackedCheckpoint' object has no attribute 'value'