# Flood Model Training Notebook

Train a flood ConvLSTM model using the `usl_models` library.

In [1]:
%load_ext autoreload
%autoreload 2
import tensorflow as tf
import keras_tuner
import time
import keras
import logging
from usl_models.flood_ml import constants
from usl_models.flood_ml.model import FloodModel
from usl_models.flood_ml.model_params import FloodModelParams
from usl_models.flood_ml.dataset import load_dataset_windowed, load_dataset

# Raise the root logger threshold so only warnings and errors are shown.
root_logger = logging.getLogger()
root_logger.setLevel(logging.WARNING)

# Fixed seed passed to Keras so repeated runs start from the same random state.
keras.utils.set_random_seed(812)

# Run identifier embedded in the log/project directory names below.
timestamp = time.strftime("%Y%m%d-%H%M%S")

# Simulations to train on.
sim_names = [
    "Manhattan-Manhattan_config/Rainfall_Data_1.txt",
]

2025-04-16 17:41:16.080778: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-16 17:41:16.813544: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-04-16 17:41:16.813632: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-04-16 17:41:16.946517: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-04-16 17:41:17.207515: I tensorflow/core/platform/cpu_feature_guar

In [2]:
# Load the windowed 'train' and 'val' splits (in that order) with identical
# settings; only the split name differs between the two calls.
train_dataset, validation_data = [
    load_dataset_windowed(
        sim_names=sim_names,
        batch_size=4,
        dataset_split=split,
    )
    for split in ("train", "val")
]

2025-04-16 17:41:30.148650: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1929] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 38380 MB memory:  -> device: 0, name: NVIDIA A100-SXM4-40GB, pci bus id: 0000:00:04.0, compute capability: 8.0


In [3]:
# Search space: each argument lists the candidate values for one
# hyperparameter. n_flood_maps and m_rainfall have a single candidate, so
# they are effectively fixed during the search.
hypermodel = FloodModel.get_hypermodel(
    lstm_units=[32, 64, 128],
    lstm_kernel_size=[3, 5],
    lstm_dropout=[0.2, 0.3],
    lstm_recurrent_dropout=[0.2, 0.3],
    n_flood_maps=[5],
    m_rainfall=[6],
)

# Bayesian-optimization tuner using validation loss as the objective,
# limited to 10 trials; results are stored under a timestamped project name.
tuner = keras_tuner.BayesianOptimization(
    hypermodel,
    objective="val_loss",
    max_trials=10,
    project_name=f"logs/htune_project_{timestamp}",
)

tuner.search_space_summary()


2025-04-16 17:41:33.240818: I external/local_tsl/tsl/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory


Search space summary
Default search space size: 6
lstm_units (Choice)
{'default': 32, 'conditions': [], 'values': [32, 64, 128], 'ordered': True}
lstm_kernel_size (Choice)
{'default': 3, 'conditions': [], 'values': [3, 5], 'ordered': True}
lstm_dropout (Choice)
{'default': 0.2, 'conditions': [], 'values': [0.2, 0.3], 'ordered': True}
lstm_recurrent_dropout (Choice)
{'default': 0.2, 'conditions': [], 'values': [0.2, 0.3], 'ordered': True}
n_flood_maps (Choice)
{'default': 5, 'conditions': [], 'values': [5], 'ordered': True}
m_rainfall (Choice)
{'default': 6, 'conditions': [], 'values': [6], 'ordered': True}


In [4]:
# TensorBoard logs for this search go to a timestamped directory.
log_dir = f"logs/htune_{timestamp}"
print(log_dir)
tb_callback = keras.callbacks.TensorBoard(log_dir=log_dir)

# Run the hyperparameter search with epochs=100 per trial, evaluating on the
# validation split.
tuner.search(
    train_dataset,
    epochs=100,
    validation_data=validation_data,
    callbacks=[tb_callback],
)

# Keep the first (top-ranked) model and hyperparameter set from the tuner.
best_model = tuner.get_best_models()[0]
best_hp = tuner.get_best_hyperparameters()[0]
best_hp.values

logs/htune_20250416-174122

Search: Running Trial #1

Value             |Best Value So Far |Hyperparameter
64                |64                |lstm_units
3                 |3                 |lstm_kernel_size
0.3               |0.3               |lstm_dropout
0.3               |0.3               |lstm_recurrent_dropout
5                 |5                 |n_flood_maps
6                 |6                 |m_rainfall

Epoch 1/100


2025-04-16 17:41:39.551319: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:961] layout failed: INVALID_ARGUMENT: Size of values 0 does not match size of permutation 4 @ fanin shape inflood_conv_lstm/conv_lstm/conv_lstm2d/while/body/_1/flood_conv_lstm/conv_lstm/conv_lstm2d/while/dropout_7/SelectV2-2-TransposeNHWCToNCHW-LayoutOptimizer
2025-04-16 17:41:42.644872: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8904
2025-04-16 17:41:43.046821: I external/local_tsl/tsl/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2025-04-16 17:41:47.208893: I external/local_xla/xla/service/service.cc:168] XLA service 0x7f5b51200b10 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2025-04-16 17:41:47.208927: I external/local_xla/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA A100-SXM4-40GB, Compute Capability 8.0
2025-04-16 17:41:47.233635: I tensorflow/comp

     43/Unknown - 52s 798ms/step - loss: 0.0041 - mean_absolute_error: 0.0133 - root_mean_squared_error: 0.0641

2025-04-16 17:42:27.737433: I tensorflow/core/framework/local_rendezvous.cc:421] Local rendezvous recv item cancelled. Key hash: 11096560215919195309
2025-04-16 17:42:27.737478: I tensorflow/core/framework/local_rendezvous.cc:421] Local rendezvous recv item cancelled. Key hash: 5649978391134141741
2025-04-16 17:42:27.737492: I tensorflow/core/framework/local_rendezvous.cc:421] Local rendezvous recv item cancelled. Key hash: 6152201312481187555
2025-04-16 17:42:41.039862: I tensorflow/core/framework/local_rendezvous.cc:421] Local rendezvous recv item cancelled. Key hash: 5697303379528365099
2025-04-16 17:42:41.039902: I tensorflow/core/framework/local_rendezvous.cc:421] Local rendezvous recv item cancelled. Key hash: 864897617394993133
2025-04-16 17:42:41.039914: I tensorflow/core/framework/local_rendezvous.cc:421] Local rendezvous recv item cancelled. Key hash: 11673756230953316518
2025-04-16 17:42:41.039920: I tensorflow/core/framework/local_rendezvous.cc:421] Local rendezvous recv it

Epoch 2/100
 3/43 [=>............................] - ETA: 9s - loss: 0.0081 - mean_absolute_error: 0.0214 - root_mean_squared_error: 0.0899     

KeyboardInterrupt: 

In [None]:
# Build a new model from the best hyperparameters returned by the tuner.
# NOTE: relies on `best_hp` and `log_dir` from the tuner-search cell above.
final_params = FloodModel.Params(**best_hp.values)
model = FloodModel(params=final_params)

# Train with a TensorBoard callback writing to the same log directory,
# then save the trained model next to the logs.
tb_callback = keras.callbacks.TensorBoard(log_dir=log_dir)
model.fit(
    train_dataset,
    validation_data,
    epochs=2000,
    callbacks=[tb_callback],
)
model.save_model(f"{log_dir}/model")


In [None]:
# Smoke test: run the model once on the first training batch and show the
# shape of what comes back.
first_batch = next(iter(train_dataset))
inputs, labels_ = first_batch
prediction = model.call(inputs)
prediction.shape

In [None]:
# Smoke test for `call_n`: load the non-windowed dataset one example per
# batch, take the first batch, and request n=4 predictions.
full_dataset = load_dataset(sim_names=sim_names, batch_size=1)
first_example = next(iter(full_dataset))
inputs, labels = first_example
predictions = model.call_n(inputs, n=4)
predictions.shape

In [1]:
%load_ext autoreload
%autoreload 2
import tensorflow as tf
import time
import keras
import logging
from usl_models.flood_ml import constants
from usl_models.flood_ml.model import FloodModel
from usl_models.flood_ml.dataset import load_dataset_windowed, load_dataset

# Configure GPU: enable memory growth on every GPU TensorFlow can see.
# With no GPUs the loop body simply never runs.
gpus = tf.config.list_physical_devices('GPU')
try:
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
except RuntimeError as e:
    # Best-effort: report the problem and carry on with default settings.
    print(e)

# Only show warnings and errors from the root logger.
root_logger = logging.getLogger()
root_logger.setLevel(logging.WARNING)

# Fixed seed passed to Keras so repeated runs start from the same random state.
keras.utils.set_random_seed(812)

# ===== DATA LOADING =====
def remove_elevation_features(input_dict, label):
    """Drop the elevation features (geospatial channels 0 and 1).

    Intended to be passed to ``Dataset.map``. The caller's ``input_dict`` is
    left untouched: a shallow copy is returned with only the ``'geospatial'``
    entry replaced, so applying the function more than once to the same dict
    cannot strip additional channels by accident.

    Args:
        input_dict: Mapping of model inputs. Must contain a ``'geospatial'``
            entry whose last axis is the feature/channel axis.
        label: Label for the example; returned unchanged.

    Returns:
        A ``(inputs, label)`` tuple where ``inputs`` is a new dict whose
        ``'geospatial'`` entry keeps every channel from index 2 onward.
    """
    trimmed_inputs = dict(input_dict)
    # Slice only the last axis; all leading axes are preserved.
    trimmed_inputs['geospatial'] = input_dict['geospatial'][..., 2:]
    return trimmed_inputs, label

# Run identifier embedded in the log directory name.
timestamp = time.strftime("%Y%m%d-%H%M%S")
sim_names = ["Atlanta-Atlanta_config/Rainfall_Data_1.txt"]

# Load datasets: the 'train' and 'val' splits (in that order) are built with
# the same settings, and both have the elevation channels stripped.
train_dataset, validation_data = [
    load_dataset_windowed(
        sim_names=sim_names,
        batch_size=4,
        dataset_split=split,
    ).map(remove_elevation_features)
    for split in ('train', 'val')
]

# Must match the number of geospatial features left after removal.
constants.GEO_FEATURES = 7

# ===== MODEL SETUP =====
# Fixed hyperparameters for this run. `num_features` is taken from
# constants.GEO_FEATURES so the model input matches the trimmed geospatial data.
standard_params = FloodModel.Params(
    num_features=constants.GEO_FEATURES,
    n_flood_maps=5,
    m_rainfall=6,
    lstm_units=64,
    lstm_kernel_size=3,
    lstm_dropout=0.2,
    lstm_recurrent_dropout=0.2,
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
)

model = FloodModel(params=standard_params)

# ===== TRAINING =====
log_dir = f"logs/training_{timestamp}"
print(f"Training with {constants.GEO_FEATURES} features in {log_dir}")

# Verify data loading: pull one batch and print the shape of each input so a
# mismatch with the model configuration is visible before training starts.
try:
    sample = next(iter(train_dataset))
    print("Sample input shapes:")
    # The expected channel count comes from constants.GEO_FEATURES rather than
    # a hard-coded number, so the hint stays correct after feature removal.
    print(
        f"Geospatial: {sample[0]['geospatial'].shape} "
        f"(should be (4, 1000, 1000, {constants.GEO_FEATURES}))"
    )
    print(f"Temporal: {sample[0]['temporal'].shape}")
    print(f"Spatiotemporal: {sample[0]['spatiotemporal'].shape}")
except Exception as e:
    # Add context to the output, then re-raise so the cell still fails loudly.
    print(f"Data loading error: {str(e)}")
    raise

# Train using the underlying Keras model.
# The validation split loaded above is passed in so that validation loss and
# metrics are computed at the end of every epoch and appear in TensorBoard
# alongside the training curves.
history = model._model.fit(
    train_dataset,
    validation_data=validation_data,
    epochs=500,
    callbacks=[keras.callbacks.TensorBoard(log_dir)]
)

# ===== EVALUATION =====
# Save the trained model next to this run's TensorBoard logs.
model.save_model(log_dir + "/model")

# # Manual validation
# val_sample = next(iter(validation_data))
# val_pred = model.call(val_sample[0])
# val_loss = tf.keras.losses.MeanSquaredError()(val_sample[1], val_pred)
# print(f"Validation loss: {val_loss.numpy():.4f}")

# # Prediction test
# test_dataset = load_dataset(sim_names=sim_names, batch_size=1,dataset_split='test').map(remove_elevation_features)
# test_input, _ = next(iter(test_dataset))
# predictions = model.call_n(test_input, n=4)
# print("Autoregressive predictions shape:", predictions.shape)

2025-04-17 18:21:07.280995: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-17 18:21:07.332892: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-04-17 18:21:07.332923: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-04-17 18:21:07.334224: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-04-17 18:21:07.342724: I tensorflow/core/platform/cpu_feature_guar

Training with 7 features in logs/training_20250417-182109
Sample input shapes:
Geospatial: (4, 1000, 1000, 7) (should be (4, 1000, 1000, 9))
Temporal: (4, 5, 6)
Spatiotemporal: (4, 5, 1000, 1000, 1)
Epoch 1/500


2025-04-17 18:21:21.660114: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:961] layout failed: INVALID_ARGUMENT: Size of values 0 does not match size of permutation 4 @ fanin shape inflood_conv_lstm/conv_lstm/conv_lstm2d/while/body/_1/flood_conv_lstm/conv_lstm/conv_lstm2d/while/dropout_7/SelectV2-2-TransposeNHWCToNCHW-LayoutOptimizer
2025-04-17 18:21:24.671928: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8904
2025-04-17 18:21:24.841348: I external/local_tsl/tsl/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2025-04-17 18:21:28.869499: I external/local_xla/xla/service/service.cc:168] XLA service 0x7ff2211a5850 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2025-04-17 18:21:28.869536: I external/local_xla/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA A100-SXM4-40GB, Compute Capability 8.0
2025-04-17 18:21:28.877115: I tensorflow/comp

      5/Unknown - 18s 209ms/step - loss: 0.0086 - mean_absolute_error: 0.0166 - root_mean_squared_error: 0.0927

KeyboardInterrupt: 