In [1]:
!git clone https://github.com/RichardStaszkiewicz/CNN-Hyperparameter-Tuning.git
!mv /content/CNN-Hyperparameter-Tuning/modules /content/modules
!mv /content/CNN-Hyperparameter-Tuning/model /content/model
!mkdir logs
!pip install "ray[tune]" torch torchvision pytorch-lightning

Cloning into 'CNN-Hyperparameter-Tuning'...
remote: Enumerating objects: 40, done.[K
remote: Counting objects: 100% (40/40), done.[K
remote: Compressing objects: 100% (32/32), done.[K
remote: Total 40 (delta 12), reused 30 (delta 5), pack-reused 0[K
Receiving objects: 100% (40/40), 88.77 KiB | 1.67 MiB/s, done.
Resolving deltas: 100% (12/12), done.
Collecting ray[tune]
  Downloading ray-2.6.3-cp310-cp310-manylinux2014_x86_64.whl (56.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.9/56.9 MB[0m [31m17.5 MB/s[0m eta [36m0:00:00[0m
Collecting pytorch-lightning
  Downloading pytorch_lightning-2.0.8-py3-none-any.whl (727 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m727.0/727.0 kB[0m [31m46.3 MB/s[0m eta [36m0:00:00[0m
Collecting tensorboardX>=1.9 (from ray[tune])
  Downloading tensorboardX-2.6.2.2-py2.py3-none-any.whl (101 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.7/101.7 kB[0m [31m13.0 MB/s[0m eta [3

In [2]:
from modules import plmodules as plm
from ray import tune, air
from ray.tune import CLIReporter
from ray.tune.schedulers import ASHAScheduler, PopulationBasedTraining
from ray.tune.integration.pytorch_lightning import TuneReportCallback, TuneReportCheckpointCallback
import pytorch_lightning as pl
import yaml

## Config

In [4]:
with open("/content/model/configs/model.yaml", 'r') as stream:
      default_config=yaml.safe_load(stream)

default_config = default_config['model']

In [5]:
search_config = {
    "batch_size": tune.grid_search([64, 128, 256]),
    "lr": tune.grid_search([0.01, 0.1]),
    "mlp_out_l0": tune.grid_search([64, 128]),
    "mlp_af_l0": tune.grid_search(['relu', 'None']),
    "mlp_bn_l0": tune.grid_search([True, False]),
    "mlp_do_l0": tune.grid_search([0.1, 0.3])
}

In [6]:
config = default_config | search_config

## Scheduers

In [21]:
scheduler_asha = ASHAScheduler(
    time_attr="time_total_s",
    max_t=300,
    grace_period=100,
    reduction_factor=2,
)

## Callbacks

In [8]:
tune_report_callback = TuneReportCheckpointCallback(
    metrics={
    "ptl/train_loss": "ptl/train_loss",
    'ptl/train_accuracy': 'ptl/train_accuracy',
    'ptl/val_loss': 'ptl/val_loss',
    'ptl/val_accuracy': 'ptl/val_accuracy',
    'time_total_s': 'time_total_s',
    },
    filename="ray_ckpt",
    on="validation_end",
)

## Trainable

In [9]:
def actualise_config(config):
  mlp = [int(k.replace("mlp_out_l", '')) for k in config.keys() if "mlp_out_l" in k]
  for l in mlp:
    config["mlp_config"]['block_list'][l]['out_size'] = config[f"mlp_out_l{l}"]
    config["mlp_config"]['block_list'][l+1]['in_size'] = config[f"mlp_out_l{l}"]
  mlp = [int(k.replace("mlp_af_l", '')) for k in config.keys() if "mlp_af_l" in k]
  for af in mlp:
    config["mlp_config"]['block_list'][af]['activation_fun'] = config[f"mlp_af_l{af}"]
  mlp = [int(k.replace("mlp_bn_l", '')) for k in config.keys() if "mlp_bn_l" in k]
  for bn in mlp:
    config["mlp_config"]['block_list'][bn]['batch_norm'] = config[f"mlp_bn_l{bn}"]
  mlp = [int(k.replace("mlp_do_l", '')) for k in config.keys() if "mlp_do_l" in k]
  for do in mlp:
    config["mlp_config"]['block_list'][do]['dropout'] = config[f"mlp_do_l{do}"]
  return config

In [10]:
def run_with_tune(config, epochs=50):
    config = actualise_config(config)
    model = plm.MNISTClassifier(config)
    dm = plm.MNISTDataModule(config['batch_size'])
    trainer = pl.Trainer(
        max_epochs=epochs,
        fast_dev_run=False,
        callbacks=[
            tune_report_callback,
        ],
    )
    trainer.fit(model, dm)

## Reporter

In [11]:
reporter = CLIReporter(
    parameter_columns=[search_config.keys()],
    metric_columns=[ "time_total_s", "ptl/train_accuracy", "ptl/val_loss", "ptl/val_accuracy", "training_iteration"]
)

## Trial

In [12]:
train_fn_with_parameters = tune.with_parameters(run_with_tune, epochs=50)

In [18]:
resources_per_trial = {"cpu": 2, "gpu": 1}

In [22]:
tuner = tune.Tuner(
        tune.with_resources(
            train_fn_with_parameters,
            resources=resources_per_trial
        ),
        tune_config=tune.TuneConfig(
            metric="ptl/val_loss",
            mode="min",
            search_alg=tune.search.BasicVariantGenerator(),
            #time_budget_s=600,
            scheduler=scheduler_asha,
            #num_samples=-1,
        ),
        run_config=air.RunConfig(
            name="tune_mnist_asha",
            progress_reporter=reporter,
        ),
        param_space=config,
    )
results = tuner.fit()

2023-09-02 15:59:37,161	INFO tune.py:666 -- [output] This will use the new output engine with verbosity 1. To disable the new output and use the legacy output engine, set the environment variable RAY_AIR_NEW_OUTPUT=0. For more information, please see https://github.com/ray-project/ray/issues/36949


[2m[36m(run_with_tune pid=2421)[0m 
[2m[36m(run_with_tune pid=2421)[0m Validation DataLoader 0:  11%|█         | 20/188 [00:00<00:02, 58.06it/s][A
+----------------------------------------------------------+
| Configuration for experiment     tune_mnist_asha         |
+----------------------------------------------------------+
| Search algorithm                 BasicVariantGenerator   |
| Scheduler                        AsyncHyperBandScheduler |
| Number of trials                 96                      |
+----------------------------------------------------------+

View detailed results here: /root/ray_results/tune_mnist_asha
To visualize your results with TensorBoard, run: `tensorboard --logdir /root/ray_results/tune_mnist_asha`

Trial status: 16 PENDING
Current time: 2023-09-02 15:59:37. Total running time: 0s
Logical resource usage: 0/2 CPUs, 0/1 GPUs
+---------------------------------------------------------------------------------------------------------------------+
| 

[2m[36m(run_with_tune pid=2421)[0m Metric time_total_s does not exist in `trainer.callback_metrics.
Resume experiment with: Tuner.restore(path="/root/ray_results/tune_mnist_asha", trainable=...)
- /root/ray_results/tune_mnist_asha/run_with_tune_b7795_00000_0_batch_size=64,lr=0.0100,mlp_af_l0=relu,mlp_bn_l0=True,mlp_do_l0=0.1000,mlp_out_l0=64_2023-09-02_15-59-37
- /root/ray_results/tune_mnist_asha/run_with_tune_b7795_00001_1_batch_size=128,lr=0.0100,mlp_af_l0=relu,mlp_bn_l0=True,mlp_do_l0=0.1000,mlp_out_l0=64_2023-09-02_15-59-37
- /root/ray_results/tune_mnist_asha/run_with_tune_b7795_00002_2_batch_size=256,lr=0.0100,mlp_af_l0=relu,mlp_bn_l0=True,mlp_do_l0=0.1000,mlp_out_l0=64_2023-09-02_15-59-37
- /root/ray_results/tune_mnist_asha/run_with_tune_b7795_00003_3_batch_size=64,lr=0.1000,mlp_af_l0=relu,mlp_bn_l0=True,mlp_do_l0=0.1000,mlp_out_l0=64_2023-09-02_15-59-37
- /root/ray_results/tune_mnist_asha/run_with_tune_b7795_00004_4_batch_size=128,lr=0.1000,mlp_af_l0=relu,mlp_bn_l0=True,mlp_d

Trial status: 16 PENDING
Current time: 2023-09-02 15:59:59. Total running time: 22s
Logical resource usage: 0/2 CPUs, 0/1 GPUs
+---------------------------------------------------------------------------------------------------------------------+
| Trial name                  status       batch_size     lr     mlp_out_l0   mlp_af_l0     mlp_bn_l0       mlp_do_l0 |
+---------------------------------------------------------------------------------------------------------------------+
| run_with_tune_b7795_00000   PENDING              64   0.01             64   relu          True                  0.1 |
| run_with_tune_b7795_00001   PENDING             128   0.01             64   relu          True                  0.1 |
| run_with_tune_b7795_00002   PENDING             256   0.01             64   relu          True                  0.1 |
| run_with_tune_b7795_00003   PENDING              64   0.1              64   relu          True                  0.1 |
| run_with_tune_b7795_00004   PEN