In [1]:
import tensorflow as tf
import matplotlib.pyplot as plt
import xarray as xr
import fnmatch
import numpy as np
import os
import keras
from surgeNN.io import load_predictand,load_predictors
from surgeNN.preprocessing import split_predictand_and_predictors_chronological,split_predictand_and_predictors_stratified_years
from surgeNN.preprocessing import generate_batched_windowed_filtered_tf_input, generate_windowed_filtered_np_input, deseasonalize_da, deseasonalize_df_var
from surgeNN.evaluation import add_error_metrics_to_prediction_ds,rmse
from surgeNN.preprocessing import standardize_predictand_splits, standardize_predictor_splits
from surgeNN.models import train_gssr_mlr, predict_gssr_mlr
from sklearn.metrics import confusion_matrix
from tqdm import tqdm
import matplotlib.pyplot as plt

2024-11-01 13:04:14.768938: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-11-01 13:04:14.843272: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


Configure the script:

In [2]:
# Tide-gauge predictand files that can be processed (one csv per station).
tgs = [
    'stavanger-svg-nor-nhs.csv',
    'wick-wic-gbr-bodc.csv',
    'esbjerg-esb-dnk-dmi.csv',
    'immingham-imm-gbr-bodc.csv',
    'den_helder-denhdr-nld-rws.csv',
    'fishguard-fis-gbr-bodc.csv',
    'brest-822a-fra-uhslc.csv',
    'vigo-vigo-esp-ieo.csv',
    'alicante_i_outer_harbour-alio-esp-da_mm.csv',
]
# This run overrides the list above and only processes a single station.
tgs = ['alicante_i_outer_harbour-alio-esp-da_mm.csv']

temp_freq = 3  # [hours] temporal frequency to use
n_cells = 5    # n x n grid cells of predictor data around the tide gauge to use

this_n_steps = 9  # passed as the n_steps argument to the split/windowing/prediction helpers

# Predictor variables; names ending in '_sqd' / '_cbd' are derived from their base variable in the loading cell.
var_names = [
    'msl', 'u10', 'v10',
    'u10_sqd', 'v10_sqd',
    'u10_cbd', 'v10_cbd',
]

architecture = 'mlr'  # used as prefix of the output file names

input_dir = '/home/jovyan/test_surge_models/input/'  # directory with predictor & predictand data
output_dir = '/home/jovyan/test_surge_models/results/mlr_4p5x4p5/'  # where to store the results

# Fractions of the data per split. The original note lists them as "train, test, val";
# the last two are equal here, so the order of those two does not affect this run.
split_fractions = [0.6, 0.2, 0.2]

Load & preprocess the data, then fit, evaluate and store the MLR model for each tide gauge:

In [3]:
# For every tide gauge in `tgs`: load and preprocess predictors and predictand, split and
# standardize them, fit the MLR model, store its coefficients and PCA components, predict the
# validation and test splits, and store the predictions with their error metrics.
#
# Everything that is written to disk is produced INSIDE the loop, so each tide gauge gets its own
# set of output files (the file names contain the tide-gauge name).
# Loop variables (e.g. `predictors`, `y_train`) keep the values of the last tide gauge after the
# loop and are inspected by later cells.

# Kept from the original cell; nothing in this cell fills these lists.
tg_lons = []
tg_lats = []

# Create the output folders up front so that writing results cannot fail after a long computation.
model_dir = os.path.join(output_dir,'mlr_models')
my_path = os.path.join(output_dir,'performance')
for out_dir in (model_dir,my_path):
    os.makedirs(out_dir,exist_ok=True)

for tg in tqdm(tgs): #loop over tide gauges (TGs)
    #---- load & process predictors ----
    predictors = load_predictors('gs://leap-persistent/timh37/era5_predictors/'+str(temp_freq)+'hourly',tg,n_cells) #open predictor xarray dataset
    #keep 1979 through 2017; the original note relates the end of this period to the end year of the GTSM simulations used as benchmark
    predictors = predictors.sel(time=slice('1979','2017'))

    #keep grid indices 1..18 (18 x 18 cells) around the tide gauge
    #NOTE(review): presumably 4.5 x 4.5 degrees, matching the '4p5x4p5' tag in the output names - confirm the grid resolution
    predictors = predictors.isel(lon_around_tg = np.arange(1,19),lat_around_tg = np.arange(1,19))

    for var in var_names: #add higher order predictors derived from the base variable (name before the first '_')
        if '_sqd' in var:
            predictors[var] = predictors[var.split('_')[0]]**2
        elif '_cbd' in var:
            predictors[var] = predictors[var.split('_')[0]]**3

    for var in var_names: #remove the annual mean of each year, then deseasonalize
        yearly_groups = predictors[var].groupby(predictors.time.dt.year)
        predictors[var] = yearly_groups - yearly_groups.mean('time')
        predictors[var] = deseasonalize_da(predictors[var])

    predictors['stacked'] = predictors[var_names].to_array(dim="var") #put predictor variables into one array
    predictors = predictors[['stacked']]
    predictors['stacked'] = predictors['stacked'].transpose("time","var","lon_around_tg",...)

    #---- load & process predictand ----
    #read from the configured input directory instead of repeating the hard-coded path
    predictand = load_predictand(os.path.join(input_dir,'t_tide_'+str(temp_freq)+'h_hourly_deseasoned_predictands'),tg) #open predictand csv
    #only use predictands in the period for which we also have predictor values
    predictand = predictand[(predictand['date']>=predictors.time.isel(time=0).values) & (predictand['date']<=predictors.time.isel(time=-1).values)]

    predictand = deseasonalize_df_var(predictand,'surge','date')

    #insert nans where timesteps are missing
    #NOTE(review): Resampler.fillna is deprecated in recent pandas; .asfreq() looks like the replacement - confirm before upgrading
    predictand = predictand.set_index('date').resample(str(temp_freq)+'h').fillna(method=None)
    predictand = predictand.reset_index()[['surge','date','lon','lat']]

    #---- split & standardize ----
    #stratified-by-year split; the chronological alternative is
    #split_predictand_and_predictors_chronological(predictand,predictors,split_fractions,this_n_steps)
    idx_train,idx_val,idx_test,x_train,x_val,x_test,y_train,y_val,y_test = split_predictand_and_predictors_stratified_years(predictand,predictors,split_fractions,this_n_steps,7,seed=0,how='99pct')

    #standardize the input based on the mean & sd of the train split
    y_train,y_val,y_test,y_train_mean,y_train_sd = standardize_predictand_splits(y_train,y_val,y_test,output_transform = True)
    x_train,x_val,x_test = standardize_predictor_splits(x_train,x_val,x_test)

    #timestamps of the finite observations in each split, to label the predictions with
    t_train = predictand['date'].values[idx_train][np.isfinite(y_train)]
    t_val = predictand['date'].values[idx_val][np.isfinite(y_val)]
    t_test = predictand['date'].values[idx_test][np.isfinite(y_test)]

    #get windowed predictors & filter rows with nan observations from inputs
    x_train,y_train = generate_windowed_filtered_np_input(x_train.stacked.values,y_train,this_n_steps)
    x_val,y_val = generate_windowed_filtered_np_input(x_val.stacked.values,y_val,this_n_steps)
    x_test,y_test = generate_windowed_filtered_np_input(x_test.stacked.values,y_test,this_n_steps)

    #flatten everything but the sample axis: (samples, ...) -> (samples, features)
    x_train,x_val,x_test = [np.reshape(k,(k.shape[0],np.prod(k.shape[1::]))) for k in [x_train,x_val,x_test]]

    o_train,o_val,o_test = [y_train_sd * k + y_train_mean for k in [y_train,y_val,y_test]] #back-transform observations

    #---- fit & store the model ----
    mlr_coefs,train_components = train_gssr_mlr(x_train,y_train)

    my_fn = architecture+'_4p5x4p5_'+str(temp_freq)+'h_'+tg.replace('.csv','')

    #store regression coefficients
    coef_ds = xr.Dataset(
            data_vars=dict(
                mlr_coefs=(['tg','coef'],mlr_coefs[np.newaxis,:])
                ),
            coords=dict(
                coef=np.arange(len(mlr_coefs)),
                tg=[tg]
            ),
            )
    coef_ds.to_netcdf(os.path.join(model_dir,my_fn+'_gssr_mlr_coefs.nc'))

    #store pc spatial patterns into netcdf
    components_ds = xr.Dataset(
        data_vars=dict(
            component=(['tg','pc','f'],train_components[np.newaxis,:,:])
            ),
        coords=dict(
            tg=[tg],
            pc=np.arange(train_components.shape[0]),
            f=np.arange(x_train.shape[-1])
        ),
        )
    components_ds.to_netcdf(os.path.join(model_dir,my_fn+'_gssr_mlr_pca_components.nc'))

    #---- predict validation & test splits ----
    prediction_val,prediction_val_components = predict_gssr_mlr(x_val,mlr_coefs,train_components,len(predictors.lon_around_tg),var_names,this_n_steps)
    prediction_test,prediction_test_components = predict_gssr_mlr(x_test,mlr_coefs,train_components,len(predictors.lon_around_tg),var_names,this_n_steps)

    #back-transform predictions with the train-split statistics
    yhat_val = prediction_val*y_train_sd + y_train_mean
    yhat_test = prediction_test*y_train_sd + y_train_mean

    #store observations & predictions into xr datasets
    ds_val = xr.Dataset(data_vars=dict(o=(["time"], o_val),yhat=(["time"], yhat_val)),
    coords=dict(time=t_val,),
    attrs=dict(description="MLR prediction performance."),)

    ds_test = xr.Dataset(data_vars=dict(o=(["time"], o_test),yhat=(["time"], yhat_test)),
    coords=dict(time=t_test,),
    attrs=dict(description="MLR prediction performance."),)

    ds_i = xr.concat((ds_val,ds_test),dim='split',coords='different') #concatenate results for each split
    ds_i = ds_i.assign_coords(split = ['val','test'])

    #---- compute statistics & store the performance of THIS tide gauge ----
    #(previously done once after the loop, which only ever saved the last tide gauge)
    out_ds = add_error_metrics_to_prediction_ds(ds_i,[.95,.98,.99,.995])

    out_ds = out_ds.assign_coords(tg = np.array([tg]))

    out_ds = out_ds.assign_coords(lon = ('tg',np.array([predictand['lon'].values[0]])))
    out_ds = out_ds.assign_coords(lat = ('tg',np.array([predictand['lat'].values[0]])))

    out_ds.attrs['temp_freq'] = temp_freq
    out_ds.attrs['n_cells'] = n_cells

    out_ds.to_netcdf(os.path.join(my_path,my_fn+'.nc'),mode='w')

100%|██████████| 1/1 [18:23<00:00, 1103.85s/it]


In [5]:
# Display the predictor dataset restricted to grid indices 1..18 along both spatial dimensions.
# NOTE(review): the loop cell already applies this same selection to `predictors`, so on a fresh
# run index 18 looks out of bounds here, and the stored output (a shape tuple) does not belong to
# this expression - confirm whether this cell is still needed.
predictors.isel({'lon_around_tg': np.arange(1,19), 'lat_around_tg': np.arange(1,19)})

(66485, 25200)

In [6]:
# Sanity check: shape of the windowed, nan-filtered training targets left over from the last tide gauge of the loop cell.
y_train.shape

(66485,)