This script preprocesses climate datasets for seasonal machine learning. It calculates selected climate indicators (e.g., rx90p, pr, txx), generates additional features, scales the data, and saves the results as CSV files. It processes ERA5, CMIP6 scenario data, and a test dataset (if specified), while automatically excluding known invalid datasets.

In [1]:
from warnings import warn
import sys

sys.path.append('/home/vgarcia/notebooks')
from preprocessing_functions import *
from experiments_functions import *

In [None]:
# Output base directory: the processing loop writes each dataset to
# <out_preprocess_basepath>/<dataset>/<dataset>.csv
out_preprocess_basepath = "/data/dl20-data/climate_operational/Victor_data/preprocessed_datasets_ML_monthly"
# Forwarded as `test=` to calculate_indicators in the processing loop
test_mode = False

# Define datasets to process
use_era5 = True
process_test_dataset = True

cmip6_models = [
    "access-cm2",
    "cmcc-esm2",
    "inm-cm4-8",
    "inm-cm5-0",
    "miroc-es2l",
    "mpi-esm1-2-lr",
    "mri-esm2-0",
    "noresm2-mm",
]

scenarios = ["historical", "ssp126", "ssp245", "ssp585"]

# Scenario/model combinations that are known not to exist and must be skipped
invalid_datasets = ("ssp585_mri-esm2-0",)

# List all datasets to process: every scenario for each model, grouped by model
datasets = [
    f"{scenario}_{model}"
    for model in cmip6_models
    for scenario in scenarios
]

for invalid in invalid_datasets:
    if invalid in datasets:
        print(f"Removed invalid dataset: {invalid}")
        datasets.remove(invalid)

# Optional datasets are prepended, so the final order is
# ["test", "era5", <cmip6 datasets>...] when both flags are enabled.
if use_era5:
    print("Added era5")
    datasets.insert(0, "era5")

if process_test_dataset:
    print("Added test dataset")
    datasets.insert(0, "test")

Added era5


In [None]:
import os  # cell-local import: only needed to create the output directories

# Preprocess every configured dataset and write one CSV per dataset to
# <out_preprocess_basepath>/<dataset>/<dataset>.csv
for dataset in datasets:
    print("Preprocessing:", dataset)

    # Make the output directory if it does not exist. DataFrame.to_csv does not
    # create missing parent directories, so without this the save step fails
    # for any dataset that has not been written before.
    out_path = out_preprocess_basepath + f"/{dataset}/{dataset}.csv"
    os.makedirs(os.path.dirname(out_path), exist_ok=True)

    # Compute the base indicators, then derive extra features from them
    # (lags=3 presumably adds lagged copies -- confirm in preprocessing_functions).
    indicators_dict = calculate_indicators(
        indicators=["rx90p", "pr", "txx"],
        dataset=dataset,
        test=test_mode,
    )
    indicators_dict = indicators_calculate_extra_features(indicators_dict, lags=3)

    # Flatten to a table, append the index variables, then encode and scale.
    df = dict_to_dataframe(indicators_dict, frequency=False)
    df_merged = df_add_index_variables(df, mean=False, trend=False, lags=0)
    df_scaled = encode_and_scale(df_merged)

    # The txx_anom column is excluded from the saved output.
    df_scaled = df_scaled.drop(["txx_anom"], axis=1)

    # Save to CSV
    df_scaled.to_csv(out_path, index=False)