In [None]:
# Import necessary packages
import pickle
import random
import sys
import time

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader

sys.path.append("..")
# Import classes and functions from other files
from hy2dl.aux_functions.functions_training import nse_basin_averaged
from hy2dl.aux_functions.functions_evaluation import forecast_NSE, forecast_PNSE
from hy2dl.datasetzoo import get_dataset
from hy2dl.modelzoo import get_model
from hy2dl.utils.config import Config
from hy2dl.utils.optimizer import Optimizer
from hy2dl.utils.utils import set_random_seed, upload_to_device, write_report

# Colour-blind-friendly colours, keyed by the kind of series being drawn
color_palette = {
    "observed": "#377eb8",
    "simulated": "#4daf4a",
}

Part 1. Initialize information

In [None]:
# Every option of the experiment is collected in a single dictionary, which is later
# passed to Config. Keys are grouped by topic; insertion order is kept as-is.
experiment_settings = {
    # ---- experiment name ----
    "experiment_name": "TestErase",

    # ---- paths to the data and to the basin lists of each split ----
    "path_data": "../data/luxemburg",
    "path_entities_training": "../data/luxemburg/basins_lux_25.txt",
    "path_entities_validation": "../data/luxemburg/basins_lux_27.txt",
    "path_entities_testing": "../data/luxemburg/basins_lux_27.txt",

    # ---- variables ----
    # dynamic forcings for the hindcast period
    "dynamic_input": ["precipitation", "rel_humidity", "temperature", "radiation", "pressure", "wind"],
    # dynamic forcings for the forecast period
    "forecast_input": ["precipitation", "rel_humidity", "temperature", "radiation", "pressure", "wind"],
    # target
    "target": ["discharge"],
    # static attributes; use an empty list when no static inputs are wanted
    "static_input": [
        "MNQ", "MQ", "MHQ", "area", "elev_mean", "slope_mean", "p_mean", "pet_mean", "aridity", "t_mean",
        "snow_cover", "forest_frac", "crop_frac", "urban_frac", "clay_frac", "silt_frac", "sand_frac",
    ],

    # ---- time periods ([start, end]) ----
    # NOTE(review): the validation period (2015-2017) lies inside the training period
    # (2002-2017) — confirm that this overlap is intended.
    "training_period": ["2002-01-01 00:00:00", "2017-12-31 23:00:00"],
    "validation_period": ["2015-01-01 00:00:00", "2017-12-31 23:00:00"],
    "testing_period": ["2018-01-01 00:00:00", "2022-12-31 23:00:00"],

    # ---- model / training configuration ----
    "hidden_size": 128,
    "batch_size_training": 256,
    "batch_size_evaluation": 1024,
    "epochs": 10,
    "dropout_rate": 0.4,
    # presumably {epoch: learning rate applied from that epoch on} — confirm against Optimizer
    "learning_rate": {1: 1e-3, 5: 5e-4, 7: 1e-4},
    "validate_every": 1,
    # zero or a negative value means "validate on all basins" (see the training loop)
    "validate_n_random_basins": -1,

    # ---- sequence lengths ----
    "seq_length_hindcast": 364 * 24,
    # n_steps * freq_factor of both entries adds up to seq_length_hindcast:
    # 351 * 24 + 13 * 24 = 364 * 24
    "custom_seq_processing": {
        "hc_1D": {"n_steps": 351, "freq_factor": 24},
        "hc_1h": {"n_steps": (365 - 352) * 24, "freq_factor": 1},
    },
    "seq_length_forecast": 24,
    "predict_last_n": 24,

    "unique_prediction_blocks": False,
    "dynamic_embedding": {"hiddens": [10, 10, 6]},
    "custom_seq_processing_flag": True,

    # ---- device used to train the model ----
    "device": "gpu",
    "num_workers": 4,

    # ---- random seed ----
    "random_seed": 110,

    # ---- dataset ----
    "dataset": "luxemburg",

    # ---- model ----
    "model": "CudaLSTM",
    "initial_forget_bias": 3.0,
}

In [None]:
# Build the experiment configuration object from the settings dictionary defined above
config = Config(experiment_settings)
config.dump() # save the configuration to a YAML file for reproducibility

# Resolve the dataset class for this configuration. The class (not an instance) is
# returned; it is instantiated further below for the training, validation and testing periods.
Dataset = get_dataset(config)

Part 2. Create datasets and dataloaders used to train/validate the model

In [None]:
# Training dataset: instantiate the dataset class for the "training" time period
training_dataset = Dataset(cfg= config, time_period= "training")

# The statistics are computed before standardize_data() is called.
# presumably the per-basin std is what the training loss receives as sample["std_basin"] — TODO confirm
training_dataset.calculate_basin_std()
# presumably fills training_dataset.scaler (reused by the validation/testing datasets) and,
# with save_scaler=True, also stores it on disk — TODO confirm
training_dataset.calculate_global_statistics(save_scaler=True)
training_dataset.standardize_data()

In [None]:
# Training dataloader: shuffled batches, incomplete last batch dropped
train_loader = DataLoader(
    dataset=training_dataset,
    batch_size=config.batch_size_training,
    shuffle=True,
    drop_last=True,
    collate_fn=training_dataset.collate_fn,
    num_workers=config.num_workers,
)

# Sanity check: draw one batch and list the shape of everything it contains
print("Number of batches in training: ", len(train_loader))
print("\nSample batch details")
print(f"\n{'Key':<30} | {'Shape':<20}")
print("-" * 55)
for name, content in next(iter(train_loader)).items():
    if not name.startswith(("x_d", "x_conceptual")):
        # plain entry: report its shape directly
        print(f"{name:<30} | {str(content.shape):<20}")
    else:
        # these entries are dictionaries themselves: report one indented line per element
        print(f"{name}")
        for sub_name, element in content.items():
            print(f"{'':<4}{sub_name:<26} | {str(element.shape):<20}")
    print()

In [None]:
# Evaluation (validation and testing) uses one individual dataset per basin
entities_ids = np.loadtxt(config.path_entities_validation, dtype="str").tolist()
if isinstance(entities_ids, str):
    # tolist() gave back a bare string instead of a list: wrap it so the loop still works
    entities_ids = [entities_ids]

# basin id -> dataset restricted to that basin for the validation period
validation_dataset = {}
for entity in entities_ids:
    dataset = Dataset(
        cfg=config,
        time_period="validation",
        check_NaN=False,
        entities_ids=entity,
    )

    # Reuse the scaler of the training dataset before standardizing.
    # standardize_output=False: presumably the target stays in original units — TODO confirm
    dataset.scaler = training_dataset.scaler
    dataset.standardize_data(standardize_output=False)
    validation_dataset[entity] = dataset

Part 3. Train model

In [None]:
# Model: fix the random seed, then build the network and move it to the configured device
set_random_seed(cfg=config)
model = get_model(config).to(config.device)

# Optimizer wrapper: gives access to the underlying optimizer, to gradient clipping + step
# and to the learning-rate update used at the end of each epoch
optimizer = Optimizer(cfg=config, model=model)

training_time = time.time()
# Loop through the different epochs
for epoch in range(1, config.epochs + 1):
    epoch_start_time = time.time()
    total_loss = []
    # Training -------------------------------------------------------------------------------------------------------
    model.train()
    for idx, sample in enumerate(train_loader):

        # stop once the maximum number of updates per epoch has been reached
        if config.max_updates_per_epoch is not None and idx >= config.max_updates_per_epoch:
            break

        sample = upload_to_device(sample, config.device)  # upload tensors to device
        optimizer.optimizer.zero_grad()  # sets gradients of weights and biases to zero
        pred = model(sample)  # forward call

        loss = nse_basin_averaged(y_sim=pred["y_hat"],
                                  y_obs=sample["y_obs"],
                                  per_basin_target_std=sample["std_basin"])

        loss.backward()  # backpropagates

        optimizer.clip_grad_and_step(epoch, idx)  # clip gradients and update weights

        total_loss.append(loss.item())

        # remove from cuda
        del sample, pred
        torch.cuda.empty_cache()

    # training report
    report = f'Epoch: {epoch:<2} | Loss training: {"%.3f "% (np.mean(total_loss))}'

    # Validation -----------------------------------------------------------------------------------------------------
    if epoch % config.validate_every == 0:
        model.eval()
        validation_results = {}
        with torch.no_grad():
            # If we define validate_n_random_basins as 0 or negative, we take all the basins
            if config.validate_n_random_basins <= 0:
                validation_basin_ids = validation_dataset.keys()
            else:
                keys = list(validation_dataset.keys())
                validation_basin_ids = random.sample(keys, config.validate_n_random_basins)

            # Go through each basin that will be used for validation
            for basin in validation_basin_ids:
                basin_dataset = validation_dataset[basin]

                # Scaler of the basin that is being evaluated. It is read from the basin's own
                # dataset (not from a variable left over by another cell) and moved to the
                # device once per basin instead of once per batch.
                y_std = basin_dataset.scaler["y_std"].to(config.device)
                y_mean = basin_dataset.scaler["y_mean"].to(config.device)

                loader = DataLoader(
                    dataset=basin_dataset,
                    batch_size=config.batch_size_evaluation,
                    shuffle=False,
                    drop_last=False,
                    collate_fn=basin_dataset.collate_fn,
                    num_workers=config.num_workers
                )

                dates, simulated_values, observed_values = [], [], []
                for i, sample in enumerate(loader):
                    sample = upload_to_device(sample, config.device)  # upload tensors to device
                    pred = model(sample)
                    # back-transform the prediction to the original scale of the target
                    y_sim = pred["y_hat"] * y_std + y_mean

                    # Join the results from the different batches: one row per forecast issue date
                    dates.extend(sample["date_issue_fc"])
                    observed_values.extend(sample["persistent_q"].cpu().detach().numpy())
                    simulated_values.append(y_sim.cpu().detach().numpy())
                    if i == len(loader) - 1:
                        # after the last batch, also append the dates and observations of the
                        # forecast window of the very last sample
                        dates.extend(sample["date"][-1, :])
                        observed_values.extend(sample["y_obs"][-1, :].cpu().detach().numpy())

                    # remove from cuda
                    del sample, pred, y_sim
                    torch.cuda.empty_cache()

                # Construct dataframe with observed and simulated values
                df = pd.DataFrame(index=dates)
                df["Observed"] = np.concatenate(observed_values, axis=0)
                y_sim = np.squeeze(np.concatenate(simulated_values, axis=0), -1)
                # The rows appended after the last batch have no forecast: pad them with NaN
                # (assumes that window has as many steps as there are lead times — TODO confirm)
                y_sim = np.concatenate((y_sim, np.full([y_sim.shape[1], y_sim.shape[1]], np.nan)), axis=0)
                df[[f"lead_time_{i + 1}" for i in range(y_sim.shape[1])]] = y_sim

                validation_results[basin] = df

            # validation metric: median over the forecast_NSE result, then the mean of those medians
            loss_validation = forecast_NSE(results=validation_results).median().mean()
            report += f'| NSE validation: {"%.3f "% (loss_validation)}'

    # save model after every epoch
    torch.save(model.state_dict(), config.path_save_folder / "model" / f"model_epoch_{epoch}")

    # print epoch report
    report += (
        f'| Epoch time: {"%.1f "% (time.time()-epoch_start_time)} s | '
        f'LR:{"%.5f "% (optimizer.optimizer.param_groups[0]["lr"])}'
    )
    print(report)
    write_report(cfg = config, text=report)
    # modify learning rate
    optimizer.update_optimizer_lr(epoch=epoch)

# print final report
report = f'Total training time: {"%.1f "% (time.time()-training_time)} s'
print(report)
write_report(cfg = config, text=report)

Part 4. Test model

In [None]:
# Optional: instead of using the model trained above, re-construct a previously trained
# model from a saved checkpoint. Uncomment the two lines below and pick the epoch to load.
#model = get_model(config).to(config.device)
#model.load_state_dict(torch.load(config.path_save_folder / "model" / "model_epoch_5", map_location=config.device))

# Scaler used to standardize the testing data. By default it is taken from the training
# dataset built in this session; alternatively, a previously stored scaler can be read from
# disk by uncommenting the lines below (only unpickle files that come from a trusted source).
scaler = training_dataset.scaler
#with open(config.path_save_folder / "scaler.pickle", "rb") as file:
#    scaler = pickle.load(file)

In [None]:
# Evaluation (validation and testing) uses one individual dataset per basin, which gives
# more flexibility
entities_ids = np.loadtxt(config.path_entities_testing, dtype="str").tolist()
if isinstance(entities_ids, str):
    # tolist() gave back a bare string instead of a list: wrap it so the loop still works
    entities_ids = [entities_ids]

# basin id -> dataset restricted to that basin for the testing period
testing_dataset = {}
for entity in entities_ids:
    dataset = Dataset(
        cfg=config,
        time_period="testing",
        check_NaN=False,
        entities_ids=entity,
    )

    # Apply the selected scaler before standardizing.
    # standardize_output=False: presumably the target stays in original units — TODO confirm
    dataset.scaler = scaler
    dataset.standardize_data(standardize_output=False)
    testing_dataset[entity] = dataset

In [None]:
# Run the model on every testing basin and collect one results table per basin
model.eval()
test_results = {}
with torch.no_grad():
    for basin, basin_dataset in testing_dataset.items():
        # Scaler of the basin that is being evaluated. It is read from the basin's own
        # dataset (not from a variable left over by another cell) and moved to the device
        # once per basin instead of once per batch.
        y_std = basin_dataset.scaler["y_std"].to(config.device)
        y_mean = basin_dataset.scaler["y_mean"].to(config.device)

        loader = DataLoader(
            dataset=basin_dataset,
            batch_size=config.batch_size_evaluation,
            shuffle=False,
            drop_last=False,
            collate_fn=basin_dataset.collate_fn,
            num_workers=config.num_workers
        )

        dates, simulated_values, observed_values = [], [], []
        for i, sample in enumerate(loader):
            sample = upload_to_device(sample, config.device)  # upload tensors to device
            pred = model(sample)
            # back-transform the prediction to the original scale of the target
            y_sim = pred["y_hat"] * y_std + y_mean

            # Join the results from the different batches: one row per forecast issue date
            dates.extend(sample["date_issue_fc"])
            observed_values.extend(sample["persistent_q"].cpu().detach().numpy())
            simulated_values.append(y_sim.cpu().detach().numpy())
            if i == len(loader) - 1:
                # after the last batch, also append the dates and observations of the
                # forecast window of the very last sample
                dates.extend(sample["date"][-1, :])
                observed_values.extend(sample["y_obs"][-1, :].cpu().detach().numpy())

            # remove from cuda
            del sample, pred, y_sim
            torch.cuda.empty_cache()

        # Construct dataframe with observed and simulated values
        df = pd.DataFrame(index=dates)
        df["Observed"] = np.concatenate(observed_values, axis=0)
        y_sim = np.squeeze(np.concatenate(simulated_values, axis=0), -1)
        # The rows appended after the last batch have no forecast: pad them with NaN
        # (assumes that window has as many steps as there are lead times — TODO confirm)
        y_sim = np.concatenate((y_sim, np.full([y_sim.shape[1], y_sim.shape[1]], np.nan)), axis=0)
        df[[f"lead_time_{i + 1}" for i in range(y_sim.shape[1])]] = y_sim

        # Save the dataframe in a basin-indexed dictionary
        test_results[basin] = df

# Save results as a pickle file
with open(config.path_save_folder / "test_results.pickle", "wb") as f:
    pickle.dump(test_results, f)

Part 5. Initial analysis

In [None]:
# NSE of the testing results: one row per basin, one column per lead time
df_NSE = forecast_NSE(results=test_results)

# One line per basin; the legend entry shows the basin id and its mean NSE over all lead times
fig, ax = plt.subplots(figsize=(12, 6))
for basin_id, nse_per_lead_time in df_NSE.iterrows():
    lead_times = np.arange(1, len(nse_per_lead_time) + 1)
    ax.plot(lead_times, nse_per_lead_time, label=f"B{basin_id:<2} ({nse_per_lead_time.mean():.2f})")

# Titles, labels and legend (placed outside the axes, to the right)
ax.set_title("NSE per lead time", fontsize=18, fontweight="bold")
ax.set_xlabel("Lead time [h]", fontsize=14, fontweight="bold")
ax.set_ylabel("NSE", fontsize=16, fontweight="bold")
ax.tick_params(axis="both", labelsize=14)
ax.legend(loc="center left", bbox_to_anchor=(1.01, 0.5), ncol=1, fontsize=10)
fig.tight_layout()
plt.show()

In [None]:
# Basin and time window to inspect
basin_of_interest = "17"
period_of_interest = ["2020-02-02 00:00:00", "2020-02-06 23:00:00"]

# Keep only the rows of the selected basin that fall inside the window
df_period_of_interest = test_results[basin_of_interest].loc[period_of_interest[0] : period_of_interest[1], :]

fig, ax1 = plt.subplots(figsize=(15, 7.5))

# Observed series
ax1.plot(
    df_period_of_interest["Observed"],
    label="Observed discharge",
    color=color_palette["observed"],
    linewidth=3,
    marker="o",
)

# Forecasted series: the forecast stored in row k is drawn as a dashed line that starts at
# the timestamp of row k + 1 and advances hourly, one step per lead-time column.
# The first column ("Observed") is not part of the forecast.
n_lead_times = df_period_of_interest.shape[1] - 1
for row_position, first_timestamp in enumerate(df_period_of_interest.index[1:]):
    time_slide = pd.date_range(start=first_timestamp, periods=n_lead_times, freq="h")
    forecast = df_period_of_interest.iloc[row_position, 1:].values
    ax1.plot(time_slide, forecast, alpha=0.5, linestyle="--")

# Format plot
ax1.set_title("Forecasted discharge", fontsize=18, fontweight="bold")
ax1.set_xlabel("Date", fontsize=14, fontweight="bold")
ax1.set_ylabel("Discharge [mm/h]", fontsize=16, fontweight="bold")
ax1.tick_params(axis="x", labelsize=14)
ax1.tick_params(axis="y", labelsize=14)

# Show the figure (uncomment the savefig line to also store it on disk)
plt.tight_layout()
#plt.savefig("../results/LuxCamels_seed_110/forecasted_discharge.png", dpi=300)
plt.show()