In [1]:
import torch
import torch.nn as nn
from torch import optim
from torchvision import transforms
from torch import Tensor
import numpy as np

from ray import train as ray_train
from ray import tune
from ray.tune.search import ConcurrencyLimiter
from ray.tune.search.optuna import OptunaSearch

from pathlib import Path

import params.learning
from params.learning import LEARNING
from dataset import get_dataloader
from model import ResNet18Velocity_Regression_Alt
from train import train
from validate import validate
from test import test
from result import parameters_table, generate_log

from custom_transforms import (
    Cutout,
    Shadowcasting,
)

from params import PROJECT_PATH

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Hyperparameters list

In [2]:
# Define ablation range of data augmentation
#
# Each list below is a pool of candidate transforms. Entry i of a list is
# switched on or off per trial by the boolean hyperparameter "<prefix>_<i>"
# (see the search-space cells and `get_multiple_choice`), so the list order
# fixes both the hyperparameter index and the order in which the enabled
# transforms are composed.

# Candidates toggled by "image_augmentation_<i>"; the resulting pipeline is
# passed to get_dataloader as `image_augmentation_transform`.
image_augmentation_transforms = [
    # Random brightness / contrast / saturation / hue perturbation.
    transforms.ColorJitter(
        brightness=0.9, contrast=0.3, saturation=1, hue=0.1
    ),
    # Gaussian blur with kernel size 3 and kernel size 7, toggled independently.
    transforms.GaussianBlur(3),
    transforms.GaussianBlur(7),
    # Random solarization with threshold 0.5.
    transforms.RandomSolarize(0.5),
    # Project-local transforms from custom_transforms, default arguments.
    Cutout(),
    Shadowcasting(),
]
# Candidates toggled by "augmentation_<i>"; the resulting pipeline is passed
# to get_dataloader as `augmentation_transform`.
augmentation_transforms = [
    transforms.RandomHorizontalFlip(),
    # Random rotation by an angle in [-30, 30] degrees.
    transforms.RandomRotation(30),
    # Random crop covering 20%-100% of the area at a fixed aspect ratio of 3,
    # resized to IMAGE_SHAPE.
    transforms.RandomResizedCrop(
        params.learning.IMAGE_SHAPE,
        scale=(0.2, 1.0),
        ratio=(3, 3),
        antialias=True,
    ),
    # Per-channel normalization: 7 means followed by 7 standard deviations.
    # NOTE(review): presumably dataset statistics for a 7-channel multimodal
    # input -- confirm where these values come from.
    transforms.Normalize(
        Tensor([0.4333, 0.4610, 0.4413, 0.0926, 0.4989, 0.5924, 0.8239]),
        Tensor([0.2223, 0.2146, 0.2154, 0.0817, 0.2162, 0.2504, 0.1614]),
    ),
]

In [3]:
# Optimizer hyperparameters
# Candidate optimizer classes, sampled with tune.choice in the search space.
# Only Adam is enabled, so the choice is effectively fixed; the commented-out
# entries are alternatives that are currently excluded from the search.
optimizers = [
    optim.Adam,
    # optim.SGD,  # Clearly less performant than Adam on previous runs
    # optim.RMSprop,
    # optim.AdamW,
    # optim.Adamax,
    # optim.ASGD,
    # optim.LBFGS,
]
# [lower, upper] bounds, unpacked into tune.loguniform for the learning rate.
learning_rates = [1e-5, 1e-3]

In [4]:
# One boolean switch per candidate transform: key "<prefix>_<index>" enables
# or disables entry `index` of the corresponding transform list.
image_augmentation_search_space = {
    f"image_augmentation_{index}": tune.choice([True, False])
    for index in range(len(image_augmentation_transforms))
}

augmentation_search_space = {
    f"augmentation_{index}": tune.choice([True, False])
    for index in range(len(augmentation_transforms))
}

In [5]:
# Complete search space handed to the tuner: a fixed epoch budget, a
# log-uniform learning rate, the optimizer class, then one on/off switch per
# candidate transform (image augmentations first, general augmentations last).
search_space = dict(
    nb_epochs=200,
    learning_rate=tune.loguniform(*learning_rates),
    optimizer=tune.choice(optimizers),
)
search_space.update(image_augmentation_search_space)
search_space.update(augmentation_search_space)

# Black-box optimizer

In [7]:
def get_multiple_choice(
    config: dict, prefix: str, choices: list
) -> transforms.Compose:
    """Compose the transforms that a trial configuration switches on.

    Args:
        config: Trial configuration. Entry ``f"{prefix}_{i}"`` is read as the
            on/off switch for ``choices[i]``; a missing key counts as off.
        prefix: Key prefix identifying the group of switches to read.
        choices: Candidate transforms, in the order they should be applied.

    Returns:
        A ``transforms.Compose`` holding the enabled transforms in their
        original order (an empty pipeline when none is enabled).
    """
    enabled = [
        aug
        for i, aug in enumerate(choices)
        # .get() treats an absent switch exactly like a False one.
        if config.get(f"{prefix}_{i}", False)
    ]
    return transforms.Compose(enabled)

In [13]:
# Regression loss shared by the training and validation loops of every trial.
criterion_regression = nn.MSELoss()


def trial(config):
    """Train and validate one sampled configuration, reporting to Ray Tune.

    Args:
        config: Sampled hyperparameters. Reads "nb_epochs", "learning_rate",
            "optimizer" (an optimizer class) and the boolean
            "image_augmentation_<i>" / "augmentation_<i>" switches.

    Reports "train_loss" and "val_loss" after every epoch; returns nothing.
    """
    # Build both transform pipelines from the switches enabled in `config`.
    image_pipeline = get_multiple_choice(
        config, "image_augmentation", image_augmentation_transforms
    )
    general_pipeline = get_multiple_choice(
        config, "augmentation", augmentation_transforms
    )

    model = ResNet18Velocity_Regression_Alt().to(device)

    resize = transforms.Resize(params.learning.IMAGE_SHAPE, antialias=True)
    # The third value returned by get_dataloader is not used here.
    train_loader, val_loader, _ = get_dataloader(
        params.learning.DATASET,
        image_augmentation_transform=image_pipeline,
        augmentation_transform=general_pipeline,
        multimodal_transform=resize,
        batch_size=params.learning.LEARNING["batch_size"],
    )

    # config["optimizer"] holds the optimizer class itself, not an instance.
    optimizer_cls = config["optimizer"]
    optimizer = optimizer_cls(model.parameters(), lr=config["learning_rate"])

    nb_epochs = config["nb_epochs"]
    scheduler = optim.lr_scheduler.LinearLR(
        optimizer=optimizer, total_iters=nb_epochs
    )

    for epoch in range(nb_epochs):
        train_regression_loss = train(
            model,
            device,
            train_loader,
            optimizer,
            scheduler,
            criterion_regression,
            epoch,
        )
        val_regression_loss = validate(
            model, device, val_loader, criterion_regression, epoch
        )

        # One report per epoch so the tuner can track "val_loss".
        epoch_metrics = {
            "train_loss": train_regression_loss,
            "val_loss": val_regression_loss,
        }
        ray_train.report(epoch_metrics)

In [14]:
# Optuna proposes the configurations; every trial reserves 12 CPUs and 1 GPU.
algo = OptunaSearch()
gpu_trial = tune.with_resources(trial, dict(cpu=12, gpu=1))

# Run 100 sampled configurations, minimising the reported "val_loss".
tuner = tune.Tuner(
    gpu_trial,
    param_space=search_space,
    tune_config=tune.TuneConfig(
        search_alg=algo,
        num_samples=100,
        metric="val_loss",
        mode="min",
    ),
)
results = tuner.fit()

[2m[36m(trial pid=507369)[0m   0%|          | 0/30 [00:00<?, ?batch/s]
[2m[36m(trial pid=507369)[0m   0%|          | 0/30 [00:00<?, ?batch/s]
[2m[36m(trial pid=507369)[0m   0%|          | 0/30 [00:00<?, ?batch/s]


2023-10-18 12:11:11,189	INFO tune.py:1143 -- Total run time: 113.29 seconds (103.26 seconds for the tuning loop).
Resume experiment with: Tuner.restore(path="/home/g_thomas/ray_results/trial_2023-10-18_12-09-17", trainable=...)
- trial_0bfded91: FileNotFoundError('Could not fetch metrics for trial_0bfded91: both result.json and progress.csv were not found at /home/g_thomas/ray_results/trial_2023-10-18_12-09-17/trial_0bfded91_2_augmentation_0=True,augmentation_1=True,augmentation_2=False,augmentation_3=False,batch_size=134.0042,image_augme_2023-10-18_12-09-22')


In [8]:
# Configuration of the best trial; get_best_result() is called without
# arguments, so it relies on the metric/mode set in the TuneConfig
# ("val_loss", "min").
# NOTE(review): the saved output below lists keys ('batch_size',
# 'image_augmentation', 'augmentation') that the current search space does not
# define -- it appears to come from an earlier experiment; re-run to refresh.
print("Best hyperparameters found were: ", results.get_best_result().config)

Best hyperparameters found were:  {'nb_epochs': 200, 'batch_size': 64, 'learning_rate': 0.0009467398972249038, 'optimizer': <class 'torch.optim.adam.Adam'>, 'image_augmentation': 16, 'augmentation': 2}


In [9]:
# Full record of the best trial: metrics, path, filesystem and checkpoint.
print(results.get_best_result())

Result(
  metrics={'iterations': 21, 'train_loss': 0.6919330770986668, 'val_loss': 0.620546210805575},
  path='/home/g_thomas/ray_results/trial_2023-10-16_17-55-44/trial_5f204acb_48_augmentation=2,batch_size=64,image_augmentation=16,learning_rate=0.0009,nb_epochs=200,optimizer=ref_ph_7bdb82fb_2023-10-17_06-13-13',
  filesystem='local',
  checkpoint=None
)


In [11]:
# Collect the trial results into a DataFrame and list the available columns
# (reported metrics, Ray bookkeeping fields and "config/..." hyperparameters).
df = results.get_dataframe()
df.columns

Index(['iterations', 'train_loss', 'val_loss', 'timestamp', 'done',
       'training_iteration', 'trial_id', 'date', 'time_this_iter_s',
       'time_total_s', 'pid', 'hostname', 'node_ip', 'time_since_restore',
       'iterations_since_restore', 'checkpoint_dir_name', 'config/nb_epochs',
       'config/batch_size', 'config/learning_rate', 'config/optimizer',
       'config/image_augmentation', 'config/augmentation', 'logdir'],
      dtype='object')

In [15]:
# Directory where Ray stored this experiment's results.
# NOTE(review): the saved output below is dated 2023-10-16, whereas the tuner
# log above is from 2023-10-18 -- the output looks stale; re-run to refresh.
results.experiment_path

'/home/g_thomas/ray_results/trial_2023-10-16_17-55-44'

[2m[33m(raylet)[0m [2023-10-18 11:15:58,487 E 35227 35227] (raylet) node_manager.cc:3007: 1 Workers (tasks / actors) killed due to memory pressure (OOM), 0 Workers crashed due to other reasons at node (ID: 16de75f6b059170fc6343450c045b4dc49c26bd59454d217191e897a, IP: 147.250.35.113) over the last time period. To see more information about the Workers killed on this node, use `ray logs raylet.out -ip 147.250.35.113`
[2m[33m(raylet)[0m 
[2m[33m(raylet)[0m Refer to the documentation on how to address the out of memory issue: https://docs.ray.io/en/latest/ray-core/scheduling/ray-oom-prevention.html. Consider provisioning more memory on this node or reducing task parallelism by requesting more CPUs per task. To adjust the kill threshold, set the environment variable `RAY_memory_usage_threshold` when starting Ray. To disable worker killing, set the environment variable `RAY_memory_monitor_refresh_ms` to zero.
