In [1]:
import sys
# append the path of the parent directory
sys.path.append("..")

# External library imports
import datetime
import torch
import torch.nn as nn
import numpy as np
import random
import gc
from torch.optim import Adam
from torch.utils.data import DataLoader, Subset, random_split
import matplotlib.pyplot as plt


# internal library imports
from dataset import DebrisStatePairsDataset
from model import CNN, UNet
from train import TrainerPairs, CustomDebrisLoss, SparseLoss, AdaptiveSparseLoss
from util.setting_utils import set_seed

import optuna
from optuna.exceptions import TrialPruned
import optuna.visualization as vis

In [2]:
# Experiment configuration

# -- Filesystem locations: checkpoint output directory and dataset root --
checkpoint_dir = r'/home/tom/repos/dyna-landslide-surrogate/checkpoints'
root_dir = r'/home/tom/repos/dyna-landslide-surrogate/data_experiment_prune'

# -- Model I/O --
# 3 input channels (e.g., terrain, velocity, thickness) and
# 2 output channels (e.g., next velocity, next thickness).
in_channels, out_channels = 3, 2

# -- Training schedule --
# NOTE: objective() samples its own batch_size and timestep_interval per trial,
# so the two module-level values below are not what the Optuna trials use.
epochs, batch_size = 15, 32
split_proportions = (0.7, 0.15, 0.15)
timestep_interval = 5

# -- Run label --
experiment = "base_model_compar_adaploss_batch32_timestep5_lr1e-4_l2reg_drop0-2_expdata_prune_MedUNet"

In [3]:
# Pick the compute device: the GPU when CUDA is usable, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Report the choice so the cell output records where training will run.
print(f"Training on device: {device}.")


Training on device: cuda.


In [4]:
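# NOTE: superseded earlier draft of `objective`, kept commented out for reference only;
# the active definition follows this commented block.
# def objective(trial):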
#     # Define hyperparameters to tune
#     features_str = trial.suggest_categorical('features', ('64,128,256', '64,128,256,512'))
#     dropout_rate = trial.suggest_float('dropout_rate', 0.1, 0.5)
#     learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-3, log=True)
#     weight_decay = trial.suggest_float('weight_decay', 1e-6, 1e-4, log=True)
#     timestep_interval = trial.suggest_int('timestep_interval', 1, 10)
#     batch_size = trial.suggest_categorical('batch_size', (16, 32, 64, 128))

#     features = list(map(int, features_str.split(',')))

#     # Create the dataset with the sampled timestep_interval
#     dataset = DebrisStatePairsDataset(root_dir, array_size=256, apply_scaling=True, timestep_interval=timestep_interval)

#     try:
#         # Split dataset into train, validation, and test sets and create dataloaders
#         train_loader, val_loader, test_loader = dataset.create_dataloaders(split_proportions, batch_size, random_state=42)

#         # Create the model with the sampled hyperparameters
#         model = UNet(in_channels=in_channels, out_channels=out_channels, features=features, dropout_rate=dropout_rate)

#         # Move model to the appropriate device
#         model.to(device)

#         # Define the loss function and optimizer with the sampled hyperparameters
#         criterion = AdaptiveSparseLoss()
#         optimizer = Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

#         # Initialize the trainer
#         model_name = f"trial_{trial.number}"
#         trainer = TrainerPairs(model, optimizer, criterion, device, model_name=model_name, checkpoint_dir=checkpoint_dir, patience=10)

#         # Train the model
#         trainer.train(train_loader, val_loader, epochs=epochs)

#         # Get the best validation loss as the objective value
#         best_val_loss = min(trainer.validation_losses)

#     except RuntimeError as e:
#         if 'CUDA out of memory' in str(e):
#             # CUDA memory issue encountered, prune the trial
#             raise TrialPruned(f"Trial {trial.number} pruned due to CUDA out of memory error.")
#         else:
#             # Other runtime error, re-raise the exception
#             raise

#     # Clean-up
#     del model
#     del optimizer
#     del trainer
#     torch.cuda.empty_cache()  # Clear memory cache
#     gc.collect()  # Collect garbage

#     return best_val_loss

def _is_cuda_oom(error):
    """Return True when a RuntimeError looks like a CUDA out-of-memory failure."""
    # Newer PyTorch releases expose a dedicated exception type; fall back to the
    # message text so wordings such as "CUDA error: out of memory" are caught too.
    oom_type = getattr(torch.cuda, "OutOfMemoryError", None)
    if oom_type is not None and isinstance(error, oom_type):
        return True
    message = str(error).lower()
    return "cuda" in message and "out of memory" in message


def objective(trial):
    """Optuna objective: train one UNet configuration and return its best validation loss.

    Samples the feature widths, dropout rate, learning rate, weight decay,
    timestep interval and batch size from ``trial``, builds the dataset and
    dataloaders for that configuration, trains for ``epochs`` epochs and reports
    the smallest recorded validation loss.

    Args:
        trial: Optuna trial used to sample hyperparameters; ``trial.number`` is
            embedded in the model name handed to the trainer.

    Returns:
        The minimum of ``trainer.validation_losses`` after training.

    Raises:
        TrialPruned: if building or training the model fails with a CUDA
            out-of-memory ``RuntimeError``.
        RuntimeError: any other runtime error is re-raised unchanged.
    """
    study_name = 'attempt2'

    # Define hyperparameters to tune
    features_str = trial.suggest_categorical('features', ('64,128,256', '64,128,256,512'))
    dropout_rate = trial.suggest_float('dropout_rate', 0.1, 0.5)
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-3, log=True)
    weight_decay = trial.suggest_float('weight_decay', 1e-6, 1e-4, log=True)
    timestep_interval = trial.suggest_int('timestep_interval', 1, 10)
    batch_size = trial.suggest_categorical('batch_size', (16, 32, 64, 128))

    # Feature widths are sampled as a comma-separated string and parsed here.
    features = [int(width) for width in features_str.split(',')]
    dataset = DebrisStatePairsDataset(root_dir, array_size=256, apply_scaling=True, timestep_interval=timestep_interval)
    # The test split is not needed during the hyperparameter search.
    train_loader, val_loader, _ = dataset.create_dataloaders(split_proportions, batch_size, random_state=42)

    # Bind the names before the try block so the cleanup in `finally` cannot fail
    # with UnboundLocalError (which would mask TrialPruned or the real error) when
    # the failure happens before all three objects have been created.
    model = optimizer = trainer = None
    try:
        # Model creation
        model = UNet(in_channels=in_channels, out_channels=out_channels, features=features, dropout_rate=dropout_rate)
        model.to(device)

        # Loss and optimizer
        criterion = AdaptiveSparseLoss()
        optimizer = Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

        # Trainer initialization and training execution.
        # The trailing positional arguments are presumably model_name,
        # checkpoint_dir and patience - TODO confirm against TrainerPairs.
        trainer = TrainerPairs(model, optimizer, criterion, device, f"{study_name}_trial_{trial.number}", checkpoint_dir, 10)
        trainer.train(train_loader, val_loader, epochs=epochs)
        best_val_loss = min(trainer.validation_losses)

    except RuntimeError as e:
        if _is_cuda_oom(e):
            # CUDA memory issue encountered, prune the trial
            print(f"Trial {trial.number} pruned due to CUDA out of memory error.")
            raise TrialPruned()
        # Other runtime error, re-raise the exception
        raise

    finally:
        # Cleanup to prevent memory leaks: drop the references, collect garbage
        # first so cyclically-referenced tensors are freed, then release the
        # cached CUDA blocks.
        del model, optimizer, trainer
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    return best_val_loss

In [5]:
# Create an Optuna study whose objective value (best validation loss) is minimised.
study = optuna.create_study(study_name="attempt2", direction='minimize')

# Optimize the objective function. gc_after_trial asks Optuna to run garbage
# collection after every trial, in addition to the cleanup done inside objective().
study.optimize(objective, n_trials=50, gc_after_trial=True)

# Print the best hyperparameters and best value.
# study.best_params / study.best_value raise ValueError when no trial has
# completed (for example when every trial was pruned), so check first.
completed_trials = study.get_trials(deepcopy=False, states=(optuna.trial.TrialState.COMPLETE,))
if completed_trials:
    print("Best hyperparameters:", study.best_params)
    print("Best value:", study.best_value)
else:
    print("No trial completed successfully; there are no best hyperparameters to report.")

[I 2024-04-26 21:12:01,137] A new study created in memory with name: attempt2
[I 2024-04-26 21:12:05,696] Trial 0 pruned. 


Trial 0 pruned due to CUDA out of memory error.


[I 2024-04-26 21:12:08,671] Trial 1 pruned. 


Trial 1 pruned due to CUDA out of memory error.


[I 2024-04-26 21:12:12,373] Trial 2 pruned. 


Trial 2 pruned due to CUDA out of memory error.


[I 2024-04-26 21:12:15,288] Trial 3 pruned. 


Trial 3 pruned due to CUDA out of memory error.


[I 2024-04-26 21:12:18,368] Trial 4 pruned. 


Trial 4 pruned due to CUDA out of memory error.


[I 2024-04-26 21:12:21,310] Trial 5 pruned. 


Trial 5 pruned due to CUDA out of memory error.


[I 2024-04-26 21:12:24,825] Trial 6 pruned. 


Trial 6 pruned due to CUDA out of memory error.


[I 2024-04-26 21:12:28,002] Trial 7 pruned. 


Trial 7 pruned due to CUDA out of memory error.
Epoch [1/15], Loss: 0.1153
Validation Loss: 0.0730
Current Learning Rate: [1.0025657040303674e-05]


[W 2024-04-26 21:14:48,048] Trial 8 failed with parameters: {'features': '64,128,256,512', 'dropout_rate': 0.24521299023398427, 'learning_rate': 1.0025657040303674e-05, 'weight_decay': 1.854882896440581e-06, 'timestep_interval': 6, 'batch_size': 16} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/home/tom/anaconda3/envs/cuda/lib/python3.12/site-packages/optuna/study/_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/tmp/ipykernel_754392/3865638753.py", line 82, in objective
    trainer.train(train_loader, val_loader, epochs=epochs)
  File "/home/tom/repos/dyna-landslide-surrogate/LandSlideDyna/model/train.py", line 103, in train
    loss = self.criterion(predictions, next_state)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/tom/anaconda3/envs/cuda/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._c

KeyboardInterrupt: 

In [None]:
import os

# Set NCCL_DEBUG=INFO in this process's environment.
os.environ.update({"NCCL_DEBUG": "INFO"})

In [None]:
# Visualise the finished study.
# A notebook cell only auto-renders its final expression, so the figure returned
# by each plot_* call is shown explicitly; otherwise all but the last are
# silently discarded.
study_plots = (
    # Plot optimization history
    (vis.plot_optimization_history, {}),
    # Plot intermediate values
    # NOTE(review): objective() never calls trial.report, so this plot is
    # presumably empty - confirm whether it is still wanted.
    (vis.plot_intermediate_values, {}),
    # Plot parallel coordinate
    (vis.plot_parallel_coordinate, {}),
    # Plot contour (example with 'learning_rate' and 'weight_decay' hyperparameters)
    (vis.plot_contour, {'params': ['learning_rate', 'weight_decay']}),
    # Plot slice (example with 'dropout_rate' hyperparameter)
    (vis.plot_slice, {'params': ['dropout_rate']}),
    # Plot parameter importances
    (vis.plot_param_importances, {}),
)

for plot_function, plot_kwargs in study_plots:
    # Build and render one figure at a time so earlier plots are still shown
    # if a later one fails.
    plot_function(study, **plot_kwargs).show()