In [1]:
import logging
import os
from datetime import datetime as dt
from pathlib import Path
from typing import Any
from typing import Dict

import ray
import torch
from ray import train, tune
from ray.tune.schedulers import ASHAScheduler
from torchvision import transforms

from engine import (load_enet_best, get_loss_optimizer, create_writer, pretty_json, val_step, train_step)
from load_data_functions import create_cv_datasets, create_dataloaders

In [2]:
# Current working directory of the notebook process. tune_enet resolves the data folder
# relative to it and passes it on to load_enet_best and create_writer.
project_path = os.getcwd()

def tune_enet(config: Dict[str, Any], start_time_tuning: str) -> None:
    """
    Tune the pretrained EfficientNet model based on the given hyperparameter configuration and
    log the results using ray train.report and Tensorboard.

    The model is trained for a fixed number of epochs on the training split of one fixed fold
    ("fold_1") and evaluated on that fold's validation split after every epoch. Each epoch's
    validation metrics are reported to Ray Tune. After the last epoch, the metrics of the epoch
    with the highest validation CCC are logged together with the hyperparameters.

    :param config:  Dictionary containing hyperparameters for tuning.
        - "bs" (int): Batch size.
        - "do" (float): Dropout probability for the dropout layer.
        - "loss_fn" (str): Name of the loss function.
        - "lr" (float): Learning rate.
        - "aug" (bool): Whether to apply data augmentation to increase the training set size.
    :param start_time_tuning: Start time of the tuning run for logging purposes; it is embedded
        in the run name handed to create_writer.
    :return: None. Results are delivered through train.report and the summary writer.
    """
    # Extract hyperparameters from the config
    batch_size = config["bs"]
    dropout_p = config["do"]
    loss_fn_str = config["loss_fn"]
    learning_rate = config["lr"]
    augmentation = config["aug"]

    # fixed parameters and other variables
    fold_num = 5
    split_path = Path(project_path) / "data/train_test_split"
    # derive the fold directories from fold_num so the list and the logged count cannot disagree
    fold_list = [split_path / f"fold_{i}" for i in range(1, fold_num + 1)]
    used_fold = "fold_1"
    img_size = 224
    model_name = "pretrained_enet"
    optimizer_str = "Adam"
    num_epochs = 20
    train_percentage = 1.0

    # setup device agnostic code
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(device)

    # load model
    model = load_enet_best(device, dropout=dropout_p, project_path=project_path)
    model.to(device)

    # get loss function and optimizer
    loss_fn, optimizer = get_loss_optimizer(loss_fn_str, optimizer_str, model, learning_rate)

    # define data transforms
    data_transforms = transforms.Compose([
        transforms.Resize((img_size, img_size)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])

    augment_transform = transforms.Compose([
        transforms.Resize(size=(img_size, img_size)),  # resize the images
        # randomly add Gaussian blur
        transforms.RandomApply([transforms.GaussianBlur(kernel_size=3)], p=0.7),
        # randomly perform a horizontal flip
        transforms.RandomHorizontalFlip(p=0.7),
        # turn the images into a torch.Tensor
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])

    # create training and validation datasets
    datasets = create_cv_datasets(fold_list, data_transforms, select_one=False,
                                  train_percentage=train_percentage,
                                  augmentation=augmentation, augment_transform=augment_transform)
    train_data = datasets[used_fold]["train"]
    validation_data = datasets[used_fold]["validation"]
    print(f"Size of the training set: {len(train_data)}")
    print(f"Size of the validation set: {len(validation_data)}")
    train_loader, val_loader = create_dataloaders(train_data, validation_data, device, batch_size)

    # create a writer to track training results
    writer = create_writer("hp_tuning", model_name, f"{used_fold}_start_{start_time_tuning}", project_path)

    # Everything that uses the writer runs inside try/finally so the writer is closed
    # even when a training or validation step raises.
    try:
        # track hyperparameters and other relevant information
        info = {
            "model_name": model_name,
            "batch_size": batch_size,
            "dropout_p": dropout_p,
            "loss_fn_str": loss_fn_str,
            "learning_rate": learning_rate,
            "augmentation": augmentation,
            "split_path": str(split_path),
            "fold_list": [str(fold) for fold in fold_list],
            "validation_fold": used_fold,
            "fold_num": fold_num,
            "img_size": img_size,
            "num_epochs": num_epochs,
            "optimizer_str": optimizer_str,
            "percentage": train_percentage,
            "cross_validation": False
        }

        writer.add_text("Miscellaneous", pretty_json(info))

        # validation history, needed after the loop to pick the best epoch
        fold_val_cccs = []
        fold_val_pccs = []
        fold_val_loss = []

        # train the model
        for epoch in range(num_epochs):
            train_loss, train_ccc, train_pcc = train_step(model=model,
                                                          dataloader=train_loader,
                                                          loss_fn=loss_fn,
                                                          optimizer=optimizer,
                                                          device=device)

            val_loss, val_ccc, val_pcc = val_step(model=model,
                                                  dataloader=val_loader,
                                                  loss_fn=loss_fn,
                                                  device=device)

            fold_val_loss.append(val_loss)
            fold_val_cccs.append(val_ccc)
            fold_val_pccs.append(val_pcc)

            print(
                f"Epoch: {epoch + 1} | "
                f"train_loss: {train_loss:.4f} | "
                f"train_ccc: {train_ccc:.4f} | "
                f"train_pcc: {train_pcc:.4f} | "
                f"val_loss: {val_loss:.4f} | "
                f"val_ccc: {val_ccc:.4f} | "
                f"val_pcc: {val_pcc:.4f}"
            )

            # Add loss, CCC and PCC results to the SummaryWriter
            writer.add_scalars(main_tag=f"Loss_{used_fold}",
                               tag_scalar_dict={"train_loss": train_loss,
                                                "val_loss": val_loss},
                               global_step=epoch)

            writer.add_scalars(main_tag=f"CCC_{used_fold}",
                               tag_scalar_dict={"train_ccc": train_ccc,
                                                "val_ccc": val_ccc},
                               global_step=epoch)
            writer.add_scalars(main_tag=f"PCC_{used_fold}",
                               tag_scalar_dict={"train_pcc": train_pcc,
                                                "val_pcc": val_pcc},
                               global_step=epoch)

            # report the results
            train.report({"val_ccc": val_ccc, "val_pcc": val_pcc, "val_loss": val_loss})

        # evaluate results
        # NOTE(review): trials that the scheduler stops early may never reach this summary;
        # confirm against Ray Tune's function-trainable semantics.
        best_ccc = max(fold_val_cccs)
        best_epoch = fold_val_cccs.index(best_ccc)  # 0-based, matches global_step above
        best_epoch_pcc = fold_val_pccs[best_epoch]
        best_epoch_loss = fold_val_loss[best_epoch]

        # log hyperparameters and metrics
        hparam_dict = {f"{key}": value for key, value in config.items()}
        hparam_dict["best_epoch"] = best_epoch
        metric_dict = {"hparam/best_val_ccc": best_ccc,
                       "hparam/val_pcc": best_epoch_pcc,
                       "hparam/val_loss": best_epoch_loss}
        writer.add_hparams(hparam_dict, metric_dict)
    finally:
        writer.close()

In [3]:
# Hyperparameter tuning, round 1: batch size, dropout, loss function and learning rate
# are sampled; augmentation is held fixed.
search_space = dict(
    bs=tune.choice([16, 32, 64, 128, 256]),
    do=tune.uniform(0.0, 0.5),
    loss_fn=tune.choice(["MAE", "MSE"]),
    lr=tune.qloguniform(1e-4, 1e-1, 5e-5),
    aug=False,  # not tuned in this round
)

# switch off Ray Tune's automatically created logger callbacks
os.environ["TUNE_DISABLE_AUTO_CALLBACK_LOGGERS"] = "1"

# stop any Ray instance that is still running, then start a new one with a single GPU
ray.shutdown()
ray.init(num_gpus=1, logging_level=logging.ERROR)

# every trial requests one GPU
resources = {"gpu": 1}
trainable_with_resources = tune.with_resources(tune_enet, resources)

# time stamp that tune_enet embeds in the run name of its writer
start_tune = dt.now().strftime("%H-%M-%S")

parameterized_trainable = tune.with_parameters(
    trainable_with_resources,
    start_time_tuning=start_tune,
)
asha_scheduler = ASHAScheduler(metric="val_ccc", mode="max")

tuner = tune.Tuner(
    parameterized_trainable,
    tune_config=tune.TuneConfig(num_samples=20, scheduler=asha_scheduler),
    param_space=search_space,
)

results = tuner.fit()

0,1
Current time:,2024-05-26 22:41:58
Running for:,03:25:39.50
Memory:,27.2/63.7 GiB

Trial name,status,loc,bs,do,loss_fn,lr,iter,total time (s),val_ccc,val_pcc,val_loss
tune_enet_aa95f_00000,TERMINATED,127.0.0.1:23156,256,0.371305,MSE,0.054,20,1483.12,0.289464,0.386843,0.181286
tune_enet_aa95f_00001,TERMINATED,127.0.0.1:26948,256,0.11096,MAE,0.00235,20,1623.92,0.33012,0.366141,0.17963
tune_enet_aa95f_00002,TERMINATED,127.0.0.1:2744,256,0.18713,MSE,0.0347,1,83.8888,0.129258,0.167679,0.132193
tune_enet_aa95f_00003,TERMINATED,127.0.0.1:7140,64,0.0447068,MSE,0.0001,4,319.175,0.250378,0.401975,0.0507523
tune_enet_aa95f_00004,TERMINATED,127.0.0.1:3352,64,0.0444027,MAE,0.0281,1,84.1908,0.155129,0.292617,0.482046
tune_enet_aa95f_00005,TERMINATED,127.0.0.1:2904,256,0.0907329,MAE,0.00945,4,325.378,0.0221671,0.0152968,0.241325
tune_enet_aa95f_00006,TERMINATED,127.0.0.1:30200,16,0.265858,MAE,0.00015,1,84.5605,0.078357,0.159623,0.178246
tune_enet_aa95f_00007,TERMINATED,127.0.0.1:2288,128,0.184449,MAE,0.00895,4,320.517,-0.0233024,-0.0509643,0.295654
tune_enet_aa95f_00008,TERMINATED,127.0.0.1:25092,16,0.369446,MAE,0.00785,1,84.6233,0.0612178,0.130723,0.314655
tune_enet_aa95f_00009,TERMINATED,127.0.0.1:29132,128,0.29515,MAE,0.0088,16,1264.7,0.326764,0.377313,0.199


[36m(tune_enet pid=23156)[0m cuda
[36m(tune_enet pid=23156)[0m [INFO] Created a new pretrained EfficientNet-B0 model.
[36m(tune_enet pid=23156)[0m Size of the training set: 19803
[36m(tune_enet pid=23156)[0m Size of the validation set: 4915
[36m(tune_enet pid=23156)[0m [INFO] Created SummaryWriter, saving to: C:\local\AIProject\runs\hp_tuning\pretrained_enet\fold_1_start_19-16-18\2024-05-26_19-16-24...
[36m(tune_enet pid=23156)[0m Epoch: 1 | train_loss: 0.7220 | train_ccc: 0.1955 | train_pcc: 0.4009 | val_loss: 0.7403 | val_ccc: 0.1457 | val_pcc: 0.3299
[36m(tune_enet pid=23156)[0m Epoch: 2 | train_loss: 0.5861 | train_ccc: 0.2661 | train_pcc: 0.4778 | val_loss: 0.4555 | val_ccc: 0.1726 | val_pcc: 0.3899
[36m(tune_enet pid=23156)[0m Epoch: 3 | train_loss: 0.3779 | train_ccc: 0.3217 | train_pcc: 0.4801 | val_loss: 0.3281 | val_ccc: 0.2518 | val_pcc: 0.4286
[36m(tune_enet pid=23156)[0m Epoch: 4 | train_loss: 0.2883 | train_ccc: 0.3550 | train_pcc: 0.4866 | val_loss: 0.2

Observations
* Larger batch sizes yield better performance than smaller batch sizes.
* Learning rates below 0.001 yield better performance than higher learning rates.
* MSE and MAE yield similar performance, so the following tuning run uses only the MSE.

In [4]:
# Hyperparameter tuning, round 2: batch size and loss function are fixed; dropout,
# learning rate and augmentation are sampled.
search_space = dict(
    bs=256,
    do=tune.uniform(0.0, 0.5),
    loss_fn="MSE",
    lr=tune.quniform(1e-4, 1e-3, 5e-5),
    aug=tune.choice([True, False]),
)

# switch off Ray Tune's automatically created logger callbacks
os.environ["TUNE_DISABLE_AUTO_CALLBACK_LOGGERS"] = "1"

# stop any Ray instance that is still running, then start a new one with a single GPU
ray.shutdown()
ray.init(num_gpus=1, logging_level=logging.ERROR)

# every trial requests one GPU
resources = {"gpu": 1}
trainable_with_resources = tune.with_resources(tune_enet, resources)

# time stamp that tune_enet embeds in the run name of its writer
start_tune = dt.now().strftime("%H-%M-%S")

parameterized_trainable = tune.with_parameters(
    trainable_with_resources,
    start_time_tuning=start_tune,
)
asha_scheduler = ASHAScheduler(metric="val_ccc", mode="max")

tuner = tune.Tuner(
    parameterized_trainable,
    tune_config=tune.TuneConfig(num_samples=20, scheduler=asha_scheduler),
    param_space=search_space,
)

results = tuner.fit()

0,1
Current time:,2024-05-27 00:44:44
Running for:,02:02:39.05
Memory:,27.2/63.7 GiB

Trial name,status,loc,aug,do,lr,iter,total time (s),val_ccc,val_pcc,val_loss
tune_enet_69e10_00000,TERMINATED,127.0.0.1:25948,False,0.224225,0.00065,20,1622.2,0.410174,0.474734,0.0509481
tune_enet_69e10_00001,TERMINATED,127.0.0.1:7860,False,0.0883411,0.00095,4,326.474,0.399903,0.468529,0.0501419
tune_enet_69e10_00002,TERMINATED,127.0.0.1:22456,False,0.267191,0.00045,1,84.5168,0.380439,0.4804,0.0534174
tune_enet_69e10_00003,TERMINATED,127.0.0.1:25772,False,0.498996,0.0006,1,84.9139,0.400056,0.485991,0.0608402
tune_enet_69e10_00004,TERMINATED,127.0.0.1:28256,False,0.300071,0.0009,1,85.314,0.394811,0.492886,0.0500898
tune_enet_69e10_00005,TERMINATED,127.0.0.1:27428,False,0.0624864,0.0007,1,85.0934,0.399146,0.494481,0.0507884
tune_enet_69e10_00006,TERMINATED,127.0.0.1:30592,False,0.117185,0.00035,1,84.8116,0.367164,0.465123,0.0549372
tune_enet_69e10_00007,TERMINATED,127.0.0.1:2644,True,0.43187,0.00095,16,2397.2,0.43985,0.494635,0.0515981
tune_enet_69e10_00008,TERMINATED,127.0.0.1:29420,True,0.228223,0.00055,4,603.402,0.400823,0.4651,0.0503155
tune_enet_69e10_00009,TERMINATED,127.0.0.1:6912,False,0.013492,0.0002,1,84.0438,0.329708,0.424603,0.0587024


[36m(tune_enet pid=25948)[0m cuda
[36m(tune_enet pid=25948)[0m [INFO] Created a new pretrained EfficientNet-B0 model.
[36m(tune_enet pid=25948)[0m Size of the training set: 19803
[36m(tune_enet pid=25948)[0m Size of the validation set: 4915
[36m(tune_enet pid=25948)[0m [INFO] Created SummaryWriter, saving to: C:\local\AIProject\runs\hp_tuning\pretrained_enet\fold_1_start_22-42-05\2024-05-26_22-42-11...
[36m(tune_enet pid=25948)[0m Epoch: 1 | train_loss: 0.0614 | train_ccc: 0.5205 | train_pcc: 0.5425 | val_loss: 0.0540 | val_ccc: 0.4019 | val_pcc: 0.4940
[36m(tune_enet pid=25948)[0m Epoch: 2 | train_loss: 0.0421 | train_ccc: 0.6639 | train_pcc: 0.6872 | val_loss: 0.0529 | val_ccc: 0.4388 | val_pcc: 0.5101
[36m(tune_enet pid=25948)[0m Epoch: 3 | train_loss: 0.0390 | train_ccc: 0.6949 | train_pcc: 0.7174 | val_loss: 0.0555 | val_ccc: 0.4499 | val_pcc: 0.5143
[36m(tune_enet pid=25948)[0m Epoch: 4 | train_loss: 0.0363 | train_ccc: 0.7230 | train_pcc: 0.7428 | val_loss: 0.0

Observations
* All tested hyperparameter combinations achieve similar performance.
* This will be validated in the next step using cross-validation.