In [1]:
# Standard library imports
import datetime
import gc
import os
import random
import sys

# External library imports
import numpy as np
import torch
import torch.nn as nn
from torch.optim import Adam
from torch.utils.data import DataLoader, Subset, random_split

# append the path of the parent directory (must run before the internal imports below)
sys.path.append("..")

# internal library imports
from dataset import DebrisStatePairsDataset
from model import CNN, UNet
from train import TrainerPairs, CustomDebrisLoss
from util.setting_utils import set_seed




In [2]:
# Fix the random seed up front (via the project's set_seed helper) so that
# repeated runs of the notebook are repeatable.
SEED = 42
set_seed(SEED)

In [3]:
# Parameters

# Repository root that holds the `data` and `checkpoints` folders.
# Defaults to the original development checkout; export DYNA_LANDSLIDE_ROOT to
# run the notebook from another machine or checkout without editing this cell.
# An unset or empty variable falls back to the default.
repo_root = (
    os.environ.get('DYNA_LANDSLIDE_ROOT')
    or '/home/tom/repos/dyna-landslide-surrogate'
).rstrip('/')

root_dir = f'{repo_root}/data'                 # passed to the dataset constructor
checkpoint_dir = f'{repo_root}/checkpoints'    # passed to the trainer for checkpoint output
batch_size = 32
split_proportions = (0.7, 0.15, 0.15)  # handed to create_dataloaders (train / validation / test loaders)
epochs = 50

in_channels = 3  # Number of input channels (e.g., terrain, velocity, thickness)
out_channels = 2  # Number of output channels (e.g., next velocity, next thickness)

In [4]:
# Data set up

# Build the state-pair dataset from root_dir with scaling switched on.
dataset = DebrisStatePairsDataset(
    root_dir,
    array_size=256,
    apply_scaling=True,
    timestep_interval=5,
)

# Let the dataset carve itself into train / validation / test dataloaders.
train_loader, val_loader, test_loader = dataset.create_dataloaders(
    split_proportions,
    batch_size,
    random_state=42,
)

# Dataset stats: overall size first, then the size of each split on one line.
print(f"Total dataset size: {len(dataset)}")
print(", ".join(
    f"{label} size: {len(loader.dataset)}"
    for label, loader in (
        ("Train", train_loader),
        ("Validation", val_loader),
        ("Test", test_loader),
    )
))

Total dataset size: 54190
Train size: 38302, Validation size: 7788, Test size: 8100


In [5]:
# pair_filenames = dataset.get_pair_filenames()

# # Print the filenames for each pair
# for i, filenames in enumerate(pair_filenames):
#     print(f"Pair {i+1}:")
#     print("Current velocity:", filenames['current_velocity'])
#     print("Next velocity:", filenames['next_velocity'])
#     print("Current thickness:", filenames['current_thickness'])
#     print("Next thickness:", filenames['next_thickness'])
#     print("Terrain:", filenames['terrain'])
#     print()

In [6]:
# def compute_min_max(dataset):
#     min_elevation = np.inf
#     max_elevation = -np.inf
#     min_velocity = np.inf
#     max_velocity = -np.inf
#     min_thickness = np.inf
#     max_thickness = -np.inf

#     for cnn_input, _ in dataset:
#         elevation = cnn_input[0, :, :].numpy()
#         thickness = cnn_input[1, :, :].numpy()
#         velocity = cnn_input[2, :, :].numpy()

#         min_elevation = min(min_elevation, elevation.min())
#         max_elevation = max(max_elevation, elevation.max())
#         min_velocity = min(min_velocity, velocity.min())
#         max_velocity = max(max_velocity, velocity.max())
#         min_thickness = min(min_thickness, thickness.min())
#         max_thickness = max(max_thickness, thickness.max())

#     return min_elevation, max_elevation, min_velocity, max_velocity, min_thickness, max_thickness

In [7]:
# # Check the range of values in each dataset
# train_min_max = compute_min_max(train_loader.dataset)
# val_min_max = compute_min_max(val_loader.dataset)
# test_min_max = compute_min_max(test_loader.dataset)

# print("Train dataset range:")
# print("Elevation: [{:.2f}, {:.2f}]".format(*train_min_max[:2]))
# print("Velocity: [{:.2f}, {:.2f}]".format(*train_min_max[2:4]))
# print("Thickness: [{:.2f}, {:.2f}]".format(*train_min_max[4:]))

# print("\nValidation dataset range:")
# print("Elevation: [{:.2f}, {:.2f}]".format(*val_min_max[:2]))
# print("Velocity: [{:.2f}, {:.2f}]".format(*val_min_max[2:4]))
# print("Thickness: [{:.2f}, {:.2f}]".format(*val_min_max[4:]))

# print("\nTest dataset range:")
# print("Elevation: [{:.2f}, {:.2f}]".format(*test_min_max[:2]))
# print("Velocity: [{:.2f}, {:.2f}]".format(*test_min_max[2:4]))
# print("Thickness: [{:.2f}, {:.2f}]".format(*test_min_max[4:]))

In [8]:
# Model setup

# Every run name is prefixed with today's date and the experiment tag.
today = datetime.date.today()
date = f"{today:%Y-%m-%d}"
# NOTE(review): "timstep5" looks like a typo for "timestep5"; it is left as-is
# because the string is part of the checkpoint directory names.
experiment = "base_model_comparison_l1Loss_batch32_timstep5_lr1e-4_l2reg_drop0-25"

# UNet variants differ only in their feature list; listed in build order.
unet_variants = (
    ('SmallUNet', [64, 128, 256]),
    ('MedUNet', [64, 128, 256, 512]),
    ('LargeUNet', [64, 128, 256, 512, 1024]),
)

# The CNN baseline is built first, followed by the UNets in the order above.
models = [{'model': CNN(dropout_rate=0.25), 'name': f'{date}_{experiment}_CNN'}]
models.extend(
    {
        'model': UNet(
            in_channels=in_channels,
            out_channels=out_channels,
            features=feature_sizes,
            dropout_rate=0.25,
        ),
        'name': f'{date}_{experiment}_{variant_label}',
    }
    for variant_label, feature_sizes in unet_variants
)


In [9]:
# # Create an instance of each architecture
# cnn = CNN()
# small_unet = UNet(in_channels=3, out_channels=2, features=[64, 128, 256])
# med_unet = UNet(in_channels=3, out_channels=2, features=[64, 128, 256, 512])
# large_unet = UNet(in_channels=3, out_channels=2, features=[64, 128, 256, 512, 1024])

# # Calculate the number of parameters for each architecture
# cnn_params = sum(p.numel() for p in cnn.parameters())
# small_unet_params = sum(p.numel() for p in small_unet.parameters())
# med_unet_params = sum(p.numel() for p in med_unet.parameters())
# large_unet_params = sum(p.numel() for p in large_unet.parameters())

# # Print the number of parameters for each architecture
# print("CNN parameters:", cnn_params)
# print("SmallUNet parameters:", small_unet_params)
# print("MedUNet parameters:", med_unet_params)
# print("LargeUNet parameters:", large_unet_params)

In [10]:
# Set up CUDA
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Training on device: {device}.")

# Train each model in turn, releasing its GPU memory before the next one starts.
for model_info in models:

    model_name = model_info['name']

    # Keep a handle on the unwrapped module. The `models` list still references
    # it after this iteration, and `.to(device)` moves a module in place, so it
    # must be moved back to the CPU explicitly or its weights stay on the GPU.
    base_model = model_info['model']
    model = base_model

    # Check if multiple GPUs are available and wrap the model using nn.DataParallel
    if torch.cuda.device_count() > 1:
        print(f"Using {torch.cuda.device_count()} GPUs!")
        # This will wrap the model for use with multiple GPUs
        model = torch.nn.DataParallel(base_model)

    # Pre-bind so the clean-up below is valid even if set-up fails part-way.
    optimizer = None
    trainer = None

    try:
        # Move model to the appropriate device
        model.to(device)

        # Define the loss function and optimizer
        # criterion = nn.MSELoss()
        criterion = nn.L1Loss()
        # criterion = CustomDebrisLoss(loss_fn_zero=nn.SmoothL1Loss(), loss_fn_debris=nn.SmoothL1Loss(), debris_weight=0.66)
        optimizer = Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)

        # Initialize the trainer
        trainer = TrainerPairs(model, optimizer, criterion, device, model_name=model_name, checkpoint_dir=checkpoint_dir)

        # Train the model
        print(f"Training {model_name}...")
        trainer.train(train_loader, val_loader, epochs=epochs, checkpoint_interval=5)

        print(f"Finished training {model_name}.")
    finally:
        # Clean-up runs even when training raises or is interrupted, so a
        # failed run does not leave this model's tensors on the GPU.
        base_model.cpu()  # parameters/buffers leave the GPU although `models` still holds the module
        del model
        del optimizer
        del trainer
        gc.collect()  # Collect garbage first so dropped tensors are actually freed
        torch.cuda.empty_cache()  # Then hand the now-unused cached blocks back to the driver

        print(f"GPU memory cleared after training {model_name}.")

Training on device: cuda.
Using 2 GPUs!
Training 2024-04-24_base_model_comparison_l1Loss_batch32_timstep5_lr1e-4_l2reg_drop0-25_CNN...
Epoch [1/50], Loss: 0.0144
Validation Loss: 0.0009
Epoch [2/50], Loss: 0.0007
Validation Loss: 0.0005
Epoch [3/50], Loss: 0.0006
Validation Loss: 0.0005
Epoch [4/50], Loss: 0.0005
Validation Loss: 0.0005
Epoch [5/50], Loss: 0.0005
Validation Loss: 0.0005
Model saved to /home/tom/repos/dyna-landslide-surrogate/checkpoints/2024-04-24_base_model_comparison_l1Loss_batch32_timstep5_lr1e-4_l2reg_drop0-25_CNN/model_epoch_5.pth
Losses saved to /home/tom/repos/dyna-landslide-surrogate/checkpoints/2024-04-24_base_model_comparison_l1Loss_batch32_timstep5_lr1e-4_l2reg_drop0-25_CNN/losses_epoch_5.json
Epoch [6/50], Loss: 0.0005
Validation Loss: 0.0004
Epoch [7/50], Loss: 0.0005
Validation Loss: 0.0004


In [None]:
# # Plot the training and validation losses
# print("Plotting losses...")
# trainer.plot_losses()

In [None]:
# # Evaluate the model on the test set
# print("Evaluating the model on the test set...")
# trainer.test(test_loader)


In [None]:
# trainer.plot_predictions(test_loader, num_predictions=15)