# Bayesian Optimisation with Ray Tune
- This file performs hyperparameter optimisation with BOHB (Bayesian Optimisation combined with HyperBand), using Ray Tune's `TuneBOHB` search algorithm together with the `HyperBandForBOHB` scheduler.
- It tunes a Vision Transformer model over a manually defined configuration space built with ConfigSpace.

In [1]:
import os
from datetime import datetime
import logging
import ConfigSpace as CS
import ray
from ray import tune
from ray.tune import Tuner, TuneConfig, with_resources
from ray.tune.schedulers.hb_bohb import HyperBandForBOHB
from ray.tune.search.bohb import TuneBOHB
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
from torchvision.datasets import ImageFolder
import torch.nn.functional as F

In [2]:
# Constants
# Maximum number of training iterations (epochs) per trial; also used as the
# horizon for the learning-rate schedules and the HyperBand scheduler.
EPOCHS = 30
# Number of target classes; default size of the classification head and of the
# per-class statistics in the balanced-accuracy metric.
CLASSES = 3
# Root folder of the dataset; the "train" and "test" sub-folders are read from it.
# NOTE: this is an absolute path on the author's machine - adjust before running.
DATA_DIR = "/home/sur06423/wacv_paper/wacv_paper/data/pizza_steak_sushi"

In [3]:
# Setup logging
# Every record goes both to a timestamped results file (named once, when this
# cell runs) and to the console.
_log_file_handler = logging.FileHandler(f"hyperopt_results_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt")
_log_console_handler = logging.StreamHandler()
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[_log_file_handler, _log_console_handler],
)

In [4]:
class VisionTransformerModel:
    """Factory for the pretrained Vision Transformer used in the experiments."""

    @staticmethod
    def define_model(use_gpu, num_classes=CLASSES):
        """Build a ViT-B/16 classifier whose pretrained backbone is frozen.

        The default pretrained weights are loaded, every existing parameter is
        frozen, and the classification head is replaced by a new linear layer,
        which is therefore the only trainable part of the model.

        Args:
            use_gpu (bool): Accepted for interface compatibility; this method
                does not read it (the caller moves the model to a device).
            num_classes (int): Number of output units of the new head.

        Returns:
            Tuple[torch.nn.Module, callable]: The model and the preprocessing
            transforms bundled with the pretrained weights.
        """
        weights = torchvision.models.ViT_B_16_Weights.DEFAULT
        vit = torchvision.models.vit_b_16(weights=weights)

        # Freeze the backbone; the head created below is trainable by default.
        for frozen_param in vit.parameters():
            frozen_param.requires_grad = False

        # 768 is the width of the features fed to the head in this code.
        vit.heads = nn.Linear(in_features=768, out_features=num_classes)

        preprocessing = weights.transforms()
        return vit, preprocessing


In [5]:
class DataLoaderFactory:
    """Builds the training and validation data loaders from the image folders."""

    @staticmethod
    def get_data_loaders(transform, batch_size=256):
        """Create the data loaders for the "train" and "test" splits under DATA_DIR.

        Args:
            transform (callable): Transform applied to every loaded image.
            batch_size (int): Number of images per batch for both loaders.

        Returns:
            Tuple[torch.utils.data.DataLoader, torch.utils.data.DataLoader]:
            The shuffled training loader and the unshuffled validation loader.
        """
        splits = ("train", "test")
        # Both datasets are created before any loader is built.
        datasets = {
            split: ImageFolder(root=os.path.join(DATA_DIR, split), transform=transform)
            for split in splits
        }
        # Only the training split is shuffled.
        train_loader, val_loader = (
            torch.utils.data.DataLoader(
                datasets[split], batch_size=batch_size, shuffle=(split == "train")
            )
            for split in splits
        )
        return train_loader, val_loader

In [6]:
class Metrics:
    """Provides methods to calculate evaluation metrics for model performance."""

    @staticmethod
    def calculate_balanced_accuracy(y_pred, y_true, num_classes=CLASSES):
        """Calculates the balanced accuracy (mean per-class recall).

        Only classes that actually occur in ``y_true`` contribute to the mean.
        A class without any true sample has no defined recall; counting it as
        zero would bias the score downwards.

        Args:
            y_pred (torch.Tensor): Predicted class indices.
            y_true (torch.Tensor): Ground-truth class indices, same shape as ``y_pred``.
            num_classes (int): Number of different classes in the dataset.

        Returns:
            float: Balanced accuracy score, or 0.0 when ``y_true`` contains no
            sample of any class in ``range(num_classes)``.
        """
        correct_per_class = torch.zeros(num_classes, device=y_pred.device)
        total_per_class = torch.zeros(num_classes, device=y_pred.device)
        for c in range(num_classes):
            is_class = y_true == c
            correct_per_class[c] = (is_class & (y_pred == c)).sum()
            total_per_class[c] = is_class.sum()

        # Restrict the average to classes with at least one true sample.
        present = total_per_class > 0
        if not bool(present.any()):
            return 0.0
        recall_per_class = correct_per_class[present] / total_per_class[present]
        return recall_per_class.mean().item()

In [7]:
def create_config_space():
    """Build the ConfigSpace search space sampled by the hyperparameter search.

    Returns:
        CS.ConfigurationSpace: Space containing the optimizer and scheduler
        choices, the learning rate and final learning rate (both log-scaled),
        momentum, weight decay, and a single-valued "use_gpu" flag.
    """
    hyperparameters = [
        CS.CategoricalHyperparameter("optimizer", ["adam", "sgd", "adamw"]),
        CS.CategoricalHyperparameter(
            "scheduler", ["constant", "cosineannealinglr", "linearinterpolationlr"]
        ),
        CS.UniformFloatHyperparameter("lr", lower=1e-5, upper=1e-1, log=True),
        CS.UniformFloatHyperparameter("momentum", lower=0.8, upper=0.99),
        CS.UniformFloatHyperparameter("weight_decay", lower=0, upper=0.1),
        CS.UniformFloatHyperparameter("end_lr", lower=1e-3, upper=1e-2, log=True),
        # Single-choice categorical: "use_gpu" is always True in sampled configs.
        CS.CategoricalHyperparameter("use_gpu", [True]),
    ]
    search_space = CS.ConfigurationSpace()
    search_space.add(hyperparameters)
    return search_space

# Class API for Hyperparameter Optimisation with Ray Tune

In [8]:
class TrainViT(tune.Trainable):
    """A trainable class for Ray Tune that handles the training and validation of a Vision Transformer model.

    One call to ``step`` corresponds to one epoch of training followed by one
    pass over the validation set. Checkpoints contain the model, optimizer and
    (when one is configured) learning-rate scheduler state, so that a trial
    restored from a checkpoint continues its schedule instead of restarting it.
    """

    def setup(self, config):
        """Prepares the model, data loaders, optimizer, and scheduler for training based on the configuration provided.

        Args:
            config (dict): Configuration dictionary containing hyperparameters and model settings.
        """
        self.device = torch.device("cuda" if torch.cuda.is_available() and config["use_gpu"] else "cpu")
        self.model, transforms = VisionTransformerModel.define_model(config["use_gpu"])
        self.model.to(self.device)
        self.train_loader, self.val_loader = DataLoaderFactory.get_data_loaders(transforms)
        self.optimizer = self._initialize_optimizer(config)
        self.scheduler = self._initialize_scheduler(config)

    def _initialize_optimizer(self, config):
        """Initializes the optimizer based on the configuration.

        Args:
            config (dict): Configuration dictionary specifying the optimizer type and parameters.

        Returns:
            torch.optim.Optimizer: Initialized optimizer.

        Raises:
            KeyError: If ``config["optimizer"]`` is not one of "adam", "sgd" or "adamw".
        """
        # Factories are lazy so that only the keys needed by the selected
        # optimizer (e.g. "momentum" for SGD) are read from the config.
        optimizer_map = {
            "adam": lambda: optim.Adam(self.model.parameters(), lr=config["lr"], weight_decay=config["weight_decay"]),
            "sgd": lambda: optim.SGD(self.model.parameters(), lr=config["lr"], momentum=config["momentum"], weight_decay=config["weight_decay"]),
            "adamw": lambda: optim.AdamW(self.model.parameters(), lr=config["lr"], weight_decay=config["weight_decay"]),
        }
        return optimizer_map[config["optimizer"]]()

    def _initialize_scheduler(self, config):
        """Initializes the learning rate scheduler based on the configuration.

        Args:
            config (dict): Configuration dictionary specifying the scheduler type and parameters.

        Returns:
            Optional[torch.optim.lr_scheduler._LRScheduler]: Initialized scheduler,
            or None for the "constant" setting.

        Raises:
            KeyError: If ``config["scheduler"]`` is not a supported scheduler name.
        """

        def linear_factor(epoch):
            # Multiplicative factor moving the learning rate linearly from
            # config["lr"] (epoch 0) to config["end_lr"] (epoch EPOCHS); the
            # progress is clamped so the factor stays at its final value afterwards.
            progress = min(float(epoch) / EPOCHS, 1.0)
            return (1 - progress) + progress * config["end_lr"] / config["lr"]

        scheduler_map = {
            "constant": lambda: None,
            "cosineannealinglr": lambda: optim.lr_scheduler.CosineAnnealingLR(self.optimizer, T_max=EPOCHS),
            "linearinterpolationlr": lambda: optim.lr_scheduler.LambdaLR(self.optimizer, lr_lambda=linear_factor),
        }
        return scheduler_map[config["scheduler"]]()

    def step(self):
        """Executes a single step of training and validation.

        Returns:
            dict: A dictionary containing training and validation loss and accuracy.
        """
        train_loss, train_acc = self._train_one_epoch()
        if self.scheduler is not None:
            self.scheduler.step()
        val_loss, val_acc = self._validate_one_epoch()
        return {"loss": train_loss, "accuracy": train_acc, "val_loss": val_loss, "val_acc": val_acc}

    def _run_epoch(self, loader, training):
        """Runs one pass over ``loader`` and collects loss and balanced accuracy.

        Args:
            loader (torch.utils.data.DataLoader): Batches of (inputs, labels).
            training (bool): When True, a backward pass and optimizer update
                are performed for every batch.

        Returns:
            Tuple[float, float]: Sample-weighted mean loss and balanced accuracy.
        """
        running_loss = 0.0
        all_predictions, all_labels = [], []
        for inputs, labels in loader:
            inputs, labels = inputs.to(self.device), labels.to(self.device)
            if training:
                self.optimizer.zero_grad()
            outputs = self.model(inputs)
            loss = F.cross_entropy(outputs, labels)
            if training:
                loss.backward()
                self.optimizer.step()
            # Weight the batch-mean loss by the batch size so the final
            # average is per sample even when the last batch is smaller.
            running_loss += loss.item() * inputs.size(0)
            # argmax over the logits selects the same class as argmax over
            # their softmax, so the softmax is not needed here.
            all_predictions.append(torch.argmax(outputs, dim=1))
            all_labels.append(labels)
        avg_loss = running_loss / len(loader.dataset)
        balanced_acc = Metrics.calculate_balanced_accuracy(torch.cat(all_predictions), torch.cat(all_labels))
        return avg_loss, balanced_acc

    def _train_one_epoch(self):
        """Conducts a single epoch of training on the entire training dataset.

        Returns:
            Tuple[float, float]: Training loss and Balanced accuracy.
        """
        self.model.train()
        return self._run_epoch(self.train_loader, training=True)

    def _validate_one_epoch(self):
        """Conducts validation on the entire validation dataset and computes loss and accuracy.

        Returns:
            Tuple[float, float]: Validation loss and Balanced accuracy.
        """
        self.model.eval()
        with torch.no_grad():
            return self._run_epoch(self.val_loader, training=False)

    def save_checkpoint(self, checkpoint_dir):
        """Saves the current model, optimizer and scheduler state to a checkpoint.

        Args:
            checkpoint_dir (str): Directory path to save the checkpoint.

        Returns:
            str: The checkpoint directory that was passed in; the state is
            written to "checkpoint.pth" inside it.
        """
        checkpoint_path = os.path.join(checkpoint_dir, "checkpoint.pth")
        torch.save({
            "model_state_dict": self.model.state_dict(),
            "optimizer_state_dict": self.optimizer.state_dict(),
            # Stored so a restored trial resumes its learning-rate schedule
            # at the right epoch; None when no scheduler is configured.
            "scheduler_state_dict": self.scheduler.state_dict() if self.scheduler is not None else None,
        }, checkpoint_path)
        return checkpoint_dir

    def load_checkpoint(self, checkpoint_dir):
        """Loads the model, optimizer and scheduler state from a checkpoint.

        Args:
            checkpoint_dir (str): Directory path from which to load the checkpoint.
        """
        checkpoint_path = os.path.join(checkpoint_dir, "checkpoint.pth")
        # map_location lets a checkpoint be restored on this trial's device
        # even if it was written from a different one.
        checkpoint = torch.load(checkpoint_path, map_location=self.device)
        self.model.load_state_dict(checkpoint["model_state_dict"])
        self.optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
        # .get keeps checkpoints written without scheduler state loadable.
        scheduler_state = checkpoint.get("scheduler_state_dict")
        if self.scheduler is not None and scheduler_state is not None:
            self.scheduler.load_state_dict(scheduler_state)

In [9]:
def setup_library_paths():
    """Configures additional library paths required for GPU computation.

    Note: These paths are specific to the author's local server.
          You might not need to define them.

    Prepends the library paths below to the LD_LIBRARY_PATH environment
    variable unless they are already present as entries. Existing entries keep
    their order; empty entries are dropped, so the result never has a leading,
    trailing or doubled ':' (the dynamic loader treats an empty entry as the
    current working directory). The resulting value is printed.
    """
    library_paths = [
        "/usr/lib/xorg-nvidia-525.116.04/lib/x86_64-linux-gnu",
        "/usr/lib/xorg/lib/x86_64-linux-gnu",
        "/usr/lib/xorg-nvidia-535.113.01/lib/x86_64-linux-gnu"
    ]
    # Current LD_LIBRARY_PATH from the environment, split into its entries
    current_ld_library_path = os.environ.get('LD_LIBRARY_PATH', '')
    existing_entries = [entry for entry in current_ld_library_path.split(':') if entry]
    # Compare against whole entries: a substring test on the raw string would
    # wrongly treat a path as present when it is only a prefix of another entry.
    new_paths = [path for path in library_paths if path not in existing_entries]
    # Update the environment variable
    os.environ['LD_LIBRARY_PATH'] = ':'.join(new_paths + existing_entries)
    print("Updated LD_LIBRARY_PATH:")
    print(os.environ['LD_LIBRARY_PATH'])

In [10]:
def main():
    """Main function to set up and execute the hyperparameter tuning.

    Starts a local Ray instance, runs the BOHB search (TuneBOHB search
    algorithm with the HyperBandForBOHB scheduler) over the ConfigSpace search
    space, logs the best trial's configuration and validation accuracy, and
    shuts Ray down again even if tuning fails.
    """
    ray.shutdown()
    # Setup GPU library paths before Ray starts, so that the processes Ray
    # launches can inherit the updated environment.
    setup_library_paths()
    # Initialise the Ray cluster with dashboard included
    ray.init(num_cpus=40, num_gpus=4, include_dashboard=True)

    try:
        # Use ConfigSpace directly with TuneBOHB
        config_space = create_config_space()
        bohb_hyperband = HyperBandForBOHB(time_attr="training_iteration", max_t=EPOCHS, reduction_factor=2, stop_last_trials=False)

        # Set metric and mode in TuneBOHB
        bohb_search = TuneBOHB(config_space, metric="val_acc", mode="max")  # Specify metric and mode
        bohb_search = tune.search.ConcurrencyLimiter(bohb_search, max_concurrent=4)

        run_config = ray.train.RunConfig(
            name="Bayesian_Trial_Exp",
            stop={"training_iteration": EPOCHS},
            checkpoint_config=ray.train.CheckpointConfig(checkpoint_frequency=1, checkpoint_at_end=True)
        )

        tuner = Tuner(
            trainable=with_resources(
                TrainViT,
                # One GPU per trial when the sampled config asks for it.
                resources=lambda config: {"gpu": 1, "cpu": 3} if config.get("use_gpu", False) else {"cpu": 3}
            ),
            param_space={},  # Leave param_space empty as TuneBOHB uses config_space
            tune_config=TuneConfig(
                metric="val_acc",
                mode="max",
                scheduler=bohb_hyperband,
                search_alg=bohb_search,
                num_samples=32
            ),
            run_config=run_config
        )

        results = tuner.fit()
        best_result = results.get_best_result(metric="val_acc", mode="max")
        logging.info("Best trial config: %s", best_result.config)
        logging.info("Best trial final validation accuracy: %s", best_result.metrics["val_acc"])
    finally:
        # Always release the Ray resources, including when tuning raises.
        ray.shutdown()

In [11]:
# Run the hyperparameter search; the text that follows this cell is its captured output.
main()

0,1
Current time:,2024-11-02 11:54:03
Running for:,00:33:59.24
Memory:,22.5/503.4 GiB

Trial name,status,loc,end_lr,lr,momentum,optimizer,scheduler,use_gpu,weight_decay,iter,total time (s),loss,accuracy,val_loss
TrainViT_e8b9dbef,TERMINATED,10.56.7.46:2067308,0.00859156,0.000137338,0.95441,adam,constant,True,0.0702793,1,29.2324,1.1514,0.282222,1.04254
TrainViT_1b154b15,TERMINATED,10.56.7.46:2068273,0.00746483,2.41354e-05,0.88815,adamw,linearinterpola_3a50,True,0.0646759,1,28.0405,1.09805,0.426222,1.07918
TrainViT_dfa71a53,TERMINATED,10.56.7.46:2068390,0.00664807,1.0345e-05,0.979274,sgd,constant,True,0.0149964,1,27.2705,1.11672,0.322222,1.10355
TrainViT_5bc1a2ff,TERMINATED,10.56.7.46:2068493,0.00645562,0.00014916,0.913602,sgd,constant,True,0.0345147,1,28.1918,1.09194,0.371111,1.01704
TrainViT_c4960fb4,TERMINATED,10.56.7.46:2068599,0.00575781,6.40682e-05,0.867219,sgd,cosineannealinglr,True,0.0240613,1,28.8873,1.10373,0.327556,1.06689
TrainViT_b84013fd,TERMINATED,10.56.7.46:2074646,0.00939692,0.0265329,0.962829,adamw,cosineannealinglr,True,0.0675284,8,231.448,0.00780827,0.998667,0.118852
TrainViT_d634c15b,TERMINATED,10.56.7.46:2071537,0.00330186,0.00374415,0.885066,adamw,cosineannealinglr,True,0.0501659,2,54.644,0.138685,0.948444,0.0802933
TrainViT_f10a3b05,TERMINATED,10.56.7.46:2068971,0.00846438,1.71086e-05,0.911234,adam,cosineannealinglr,True,0.0683006,1,27.8968,1.03582,0.504444,1.02502
TrainViT_315372e7,TERMINATED,10.56.7.46:2069123,0.0054966,6.61444e-05,0.976973,adamw,linearinterpola_6070,True,0.00864234,1,27.9164,1.10641,0.390222,1.04539
TrainViT_376966ef,TERMINATED,10.56.7.46:2069232,0.0091004,2.1922e-05,0.945915,adam,constant,True,0.0146524,1,26.7716,1.1652,0.304,1.14829


[36m(TrainViT pid=2067308)[0m   return F.conv2d(input, weight, bias, self.stride,
[36m(TrainViT pid=2068273)[0m   return F.conv2d(input, weight, bias, self.stride,
[36m(TrainViT pid=2068390)[0m   return F.conv2d(input, weight, bias, self.stride,
[36m(TrainViT pid=2068493)[0m   return F.conv2d(input, weight, bias, self.stride,
[36m(TrainViT pid=2067308)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/sur06423/ray_results/Bayesian_Trial_Exp/TrainViT_e8b9dbef_1_end_lr=0.0086,lr=0.0001,momentum=0.9544,optimizer=adam,scheduler=constant,use_gpu=True,weight_decay=0.0703_2024-11-02_11-20-03/checkpoint_000000)
[36m(TrainViT pid=2068273)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/sur06423/ray_results/Bayesian_Trial_Exp/TrainViT_1b154b15_2_end_lr=0.0075,lr=0.0000,momentum=0.8881,optimizer=adamw,scheduler=linearinterpolationlr,use_gpu=True,weight_de_2024-11-02_11-20-18/checkpoint_000000)
[36m(TrainViT pid=2068599)[0m 