In [1]:
import xarray as xr
import numpy as np
import os
from surgeNN import io, preprocessing
from surgeNN.evaluation import add_error_metrics_to_prediction_ds
from surgeNN.models import train_gssr_mlr, predict_gssr_mlr
from tqdm import tqdm
#MLR version of train_and_predict.py for neural networks

2025-09-26 07:40:25.536439: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-09-26 07:40:25.609020: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


Configure the script:

In [2]:
# Configuration for the MLR train-and-predict run.

# All tide-gauge predictand files available for this experiment (kept for reference;
# set `tgs = all_tgs` to process every station).
all_tgs = ['stavanger-svg-nor-nhs.csv','wick-wic-gbr-bodc.csv','esbjerg-esb-dnk-dmi.csv',
           'immingham-imm-gbr-bodc.csv','den_helder-denhdr-nld-rws.csv', 'fishguard-fis-gbr-bodc.csv',
           'brest-822a-fra-uhslc.csv', 'vigo-vigo-esp-ieo.csv',  'alicante_i_outer_harbour-alio-esp-da_mm.csv']

# Tide gauges actually processed in this run (a subset of all_tgs).
tgs = ['brest-822a-fra-uhslc.csv', 'vigo-vigo-esp-ieo.csv',  'alicante_i_outer_harbour-alio-esp-da_mm.csv']

temp_freq = 3 # [hours] temporal frequency to use
predictor_degrees = 4.5 # [degrees] size of the predictor box around the tide gauge; converted to grid cells downstream

# Passed to the splitting/windowing/prediction helpers as the number of steps
# (presumably the number of time steps per input window - TODO confirm).
this_n_steps = 9

# Predictor variables to use; '<var>_sqd' / '<var>_cbd' are the squared / cubed versions of '<var>'.
predictor_vars = ['msl','u10','v10',
                  'u10_sqd','v10_sqd',
                  'u10_cbd','v10_cbd']

predictor_path  = 'gs://leap-persistent/timh37/era5_predictors/3hourly/'
predictand_path = '/home/jovyan/test_surge_models/input/t_tide_3h_hourly_deseasoned_predictands'
output_dir = '/home/jovyan/test_surge_models/results/mlr_4p5x4p5/' #where to store the results

# Fractions of the data per split. The original note gives the order as "train, test, val";
# TODO confirm against the order expected by split_stratified.
split_fractions = [.6,.2,.2]

Load & preprocess the data, fit the MLR model, and store the predictions for each tide gauge:

In [3]:
# Train a multi-linear regression (MLR) model per tide gauge, predict the validation and
# test splits, and write the predictions plus error metrics to one netCDF file per gauge.

# Number of grid cells around the TG to use (era5 resolution = 0.25 degree, i.e. 4 cells per degree).
# This does not depend on the tide gauge, so it is computed once.
n_cells = int(predictor_degrees * (4/1))

# Make sure the output directory exists before spending time on fitting; to_netcdf does not create it.
my_path = os.path.join(output_dir,'performance')
os.makedirs(my_path,exist_ok=True)

for tg in tqdm(tgs): #loop over TGs
    #load & process predictors
    predictors = io.Predictor(predictor_path)
    predictors.open_dataset(tg,['msl','u10','v10'],n_cells)
    predictors.trim_years(1979,2017)

    #add higher order predictors: '<var>_sqd' = var**2, '<var>_cbd' = var**3
    for var in predictor_vars:
        base_var = var.split('_')[0]
        if '_sqd' in var:
            predictors.data[var] = predictors.data[base_var]**2
        elif '_cbd' in var:
            predictors.data[var] = predictors.data[base_var]**3

    predictors.subtract_annual_means()
    predictors.deseasonalize()

    #load & process predictands, restricted to the time span covered by the predictors
    predictand = io.Predictand(predictand_path)
    predictand.open_dataset(tg)
    predictand.trim_dates(predictors.data.time.isel(time=0).values,predictors.data.time.isel(time=-1).values)
    predictand.deseasonalize()
    predictand.resample_fillna(str(temp_freq)+'h')

    model_input = preprocessing.trainingInput(predictors,predictand)
    model_input.predictors['stacked'] = model_input.predictors['stacked'].transpose("time","var","lon_around_tg",...)

    # NOTE(review): the meaning of the positional arguments 7, 0 and '99pct' is not visible here - confirm against split_stratified.
    model_input.split_stratified(split_fractions,this_n_steps,7,0,'99pct')
    y_train_mean,y_train_sd = model_input.standardize()

    #model_input.compute_denseloss_weights(0) #generate the Denseloss weights for each split

    #generate windowed input for each split
    x_train,y_train = model_input.get_windowed_filtered_np_input('train',this_n_steps)
    x_val,y_val     = model_input.get_windowed_filtered_np_input('val',this_n_steps)
    x_test,y_test   = model_input.get_windowed_filtered_np_input('test',this_n_steps)

    #flatten everything but the first (sample) axis, so each sample becomes one feature vector
    x_train,x_val,x_test = [np.reshape(k,(k.shape[0],np.prod(k.shape[1::]))) for k in [x_train,x_val,x_test]]
    o_train,o_val,o_test = [y_train_sd * k + y_train_mean for k in [y_train,y_val,y_test]] #back-transform observations

    mlr_coefs,train_components = train_gssr_mlr(x_train,y_train)

    #file-name stem shared by all outputs for this tide gauge
    my_fn = 'mlr'+'_'+str(predictor_degrees).replace('.','p')+'x'+str(predictor_degrees).replace('.','p')+'_'+str(temp_freq)+'h_'+tg.replace('.csv','')

    #model coefficients as a dataset (writing it to disk is currently disabled)
    coef_ds = xr.Dataset(data_vars=dict(mlr_coefs=(['tg','coef'],mlr_coefs[np.newaxis,:])),
            coords=dict(coef=np.arange(len(mlr_coefs)),tg=[tg]),)
    #coef_ds.to_netcdf(os.path.join(output_dir,'mlr_models',my_fn+'_gssr_mlr_coefs.nc'))

    #pc spatial patterns as a dataset (writing it to disk is currently disabled)
    components_ds = xr.Dataset(data_vars=dict(component=(['tg','pc','f'],train_components[np.newaxis,:,:])),
        coords=dict(tg=[tg],pc=np.arange(train_components.shape[0]),f=np.arange(x_train.shape[-1])),)
    #components_ds.to_netcdf(os.path.join(output_dir,'mlr_models',my_fn+'_gssr_mlr_pca_components.nc'))

    #make predictions for the validation and test splits
    prediction_val,prediction_val_components = predict_gssr_mlr(x_val,mlr_coefs,train_components,predictor_vars,this_n_steps)
    prediction_test,prediction_test_components = predict_gssr_mlr(x_test,mlr_coefs,train_components,predictor_vars,this_n_steps)

    #back-transform predictions with the training mean and standard deviation
    yhat_val = prediction_val*y_train_sd + y_train_mean
    yhat_test = prediction_test*y_train_sd + y_train_mean

    #store observations (o) and predictions (yhat) into xr datasets
    ds_val = xr.Dataset(data_vars=dict(o=(["time"], o_val),yhat=(["time"], yhat_val)),
    coords=dict(time=model_input.t_val,),attrs=dict(description="MLR prediction performance."),)

    ds_test = xr.Dataset(data_vars=dict(o=(["time"], o_test),yhat=(["time"], yhat_test)),
    coords=dict(time=model_input.t_test,),attrs=dict(description="MLR prediction performance."),)

    out_ds = xr.concat((ds_val,ds_test),dim='split',coords='different') #concatenate results for each split
    out_ds = out_ds.assign_coords(split = ['val','test'])

    #compute error metrics
    # NOTE(review): the list is presumably a set of quantile thresholds; the meaning of the trailing 3 is not visible here - confirm.
    out_ds = add_error_metrics_to_prediction_ds(out_ds,[.95,.98,.99,.995],3)

    #label the results with the tide gauge and its location
    out_ds = out_ds.assign_coords(tg = np.array([tg]))

    out_ds = out_ds.assign_coords(lon = ('tg',np.array([predictand.data['lon'].values[0]])))
    out_ds = out_ds.assign_coords(lat = ('tg',np.array([predictand.data['lat'].values[0]])))

    #record the configuration used
    out_ds.attrs['temp_freq'] = temp_freq
    out_ds.attrs['n_cells'] = n_cells
    out_ds.attrs['predictor_vars'] = predictor_vars

    out_ds.to_netcdf(os.path.join(my_path,my_fn+'.nc'),mode='w')

100%|██████████| 3/3 [56:10<00:00, 1123.42s/it]


In [10]:
# Inspect the predictor dataset; `predictors` is whatever the last iteration of the
# loop over `tgs` left behind, so this cell only works after that loop has run.
predictors.data

Unnamed: 0,Array,Chunk
Bytes,72 B,72 B
Shape,"(18,)","(18,)"
Dask graph,1 chunks in 3 graph layers,1 chunks in 3 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 72 B 72 B Shape (18,) (18,) Dask graph 1 chunks in 3 graph layers Data type float32 numpy.ndarray",18  1,

Unnamed: 0,Array,Chunk
Bytes,72 B,72 B
Shape,"(18,)","(18,)"
Dask graph,1 chunks in 3 graph layers,1 chunks in 3 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,72 B,72 B
Shape,"(18,)","(18,)"
Dask graph,1 chunks in 3 graph layers,1 chunks in 3 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 72 B 72 B Shape (18,) (18,) Dask graph 1 chunks in 3 graph layers Data type float32 numpy.ndarray",18  1,

Unnamed: 0,Array,Chunk
Bytes,72 B,72 B
Shape,"(18,)","(18,)"
Dask graph,1 chunks in 3 graph layers,1 chunks in 3 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,140.85 MiB,92.54 MiB
Shape,"(113960, 18, 18)","(74875, 18, 18)"
Dask graph,2 chunks in 32 graph layers,2 chunks in 32 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 140.85 MiB 92.54 MiB Shape (113960, 18, 18) (74875, 18, 18) Dask graph 2 chunks in 32 graph layers Data type float32 numpy.ndarray",18  18  113960,

Unnamed: 0,Array,Chunk
Bytes,140.85 MiB,92.54 MiB
Shape,"(113960, 18, 18)","(74875, 18, 18)"
Dask graph,2 chunks in 32 graph layers,2 chunks in 32 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,140.85 MiB,92.54 MiB
Shape,"(113960, 18, 18)","(74875, 18, 18)"
Dask graph,2 chunks in 32 graph layers,2 chunks in 32 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 140.85 MiB 92.54 MiB Shape (113960, 18, 18) (74875, 18, 18) Dask graph 2 chunks in 32 graph layers Data type float32 numpy.ndarray",18  18  113960,

Unnamed: 0,Array,Chunk
Bytes,140.85 MiB,92.54 MiB
Shape,"(113960, 18, 18)","(74875, 18, 18)"
Dask graph,2 chunks in 32 graph layers,2 chunks in 32 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,140.85 MiB,92.54 MiB
Shape,"(113960, 18, 18)","(74875, 18, 18)"
Dask graph,2 chunks in 32 graph layers,2 chunks in 32 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 140.85 MiB 92.54 MiB Shape (113960, 18, 18) (74875, 18, 18) Dask graph 2 chunks in 32 graph layers Data type float32 numpy.ndarray",18  18  113960,

Unnamed: 0,Array,Chunk
Bytes,140.85 MiB,92.54 MiB
Shape,"(113960, 18, 18)","(74875, 18, 18)"
Dask graph,2 chunks in 32 graph layers,2 chunks in 32 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,140.85 MiB,92.54 MiB
Shape,"(113960, 18, 18)","(74875, 18, 18)"
Dask graph,2 chunks in 33 graph layers,2 chunks in 33 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 140.85 MiB 92.54 MiB Shape (113960, 18, 18) (74875, 18, 18) Dask graph 2 chunks in 33 graph layers Data type float32 numpy.ndarray",18  18  113960,

Unnamed: 0,Array,Chunk
Bytes,140.85 MiB,92.54 MiB
Shape,"(113960, 18, 18)","(74875, 18, 18)"
Dask graph,2 chunks in 33 graph layers,2 chunks in 33 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,140.85 MiB,92.54 MiB
Shape,"(113960, 18, 18)","(74875, 18, 18)"
Dask graph,2 chunks in 33 graph layers,2 chunks in 33 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 140.85 MiB 92.54 MiB Shape (113960, 18, 18) (74875, 18, 18) Dask graph 2 chunks in 33 graph layers Data type float32 numpy.ndarray",18  18  113960,

Unnamed: 0,Array,Chunk
Bytes,140.85 MiB,92.54 MiB
Shape,"(113960, 18, 18)","(74875, 18, 18)"
Dask graph,2 chunks in 33 graph layers,2 chunks in 33 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,140.85 MiB,92.54 MiB
Shape,"(113960, 18, 18)","(74875, 18, 18)"
Dask graph,2 chunks in 33 graph layers,2 chunks in 33 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 140.85 MiB 92.54 MiB Shape (113960, 18, 18) (74875, 18, 18) Dask graph 2 chunks in 33 graph layers Data type float32 numpy.ndarray",18  18  113960,

Unnamed: 0,Array,Chunk
Bytes,140.85 MiB,92.54 MiB
Shape,"(113960, 18, 18)","(74875, 18, 18)"
Dask graph,2 chunks in 33 graph layers,2 chunks in 33 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,140.85 MiB,92.54 MiB
Shape,"(113960, 18, 18)","(74875, 18, 18)"
Dask graph,2 chunks in 33 graph layers,2 chunks in 33 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 140.85 MiB 92.54 MiB Shape (113960, 18, 18) (74875, 18, 18) Dask graph 2 chunks in 33 graph layers Data type float32 numpy.ndarray",18  18  113960,

Unnamed: 0,Array,Chunk
Bytes,140.85 MiB,92.54 MiB
Shape,"(113960, 18, 18)","(74875, 18, 18)"
Dask graph,2 chunks in 33 graph layers,2 chunks in 33 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


In [9]:
# Rough sanity check of the expected number of time steps: 39 years (1979-2017, the range
# passed to trim_years) x 365.25 days x 8 three-hourly steps per day.
39*365.25*8

113958.0