In [1]:
'''script to apply data-driven surge models to HighResMIP predictors and store the resulting surge predictions'''
import numpy as np
import xarray as xr
import dask
import os
import intake
import pandas as pd
from sklearn.metrics import confusion_matrix
from collections import defaultdict
from tqdm.autonotebook import tqdm
from xmip.utils import google_cmip_col
from xmip.postprocessing import combine_datasets,_concat_sorted_time, merge_variables
from cmip_catalogue_operations import reduce_cat_to_max_num_realizations, drop_vars_from_cat, drop_older_versions
from cmip_ds_dict_operations import select_period, pr_flux_to_m, drop_duplicate_timesteps, drop_coords, drop_incomplete
import xesmf as xe
import gcsfs
import keras
from surgeNN.preprocessing import deseasonalize_da, generate_windowed_filtered_np_input, stack_predictors_for_lstm, stack_predictors_for_convlstm
from surgeNN.io import load_predictors
from scipy.stats import rankdata
from surgeNN.models import train_gssr_mlr, predict_gssr_mlr
fs = gcsfs.GCSFileSystem() #Google Cloud Storage filesystem handle, used to list the available predictor stores
from datetime import datetime
import fnmatch
#NOTE: keras & tensorflow versions must align with the versions used to train the models (see req.txt)

  from tqdm.autonotebook import tqdm
2024-11-11 10:59:07.510180: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-11-11 10:59:07.611285: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


In [2]:
#configure the script
highresmip_model = 'HadGEM3-GC31-HM' #input global climate model
predictor_dir = 'gs://leap-persistent/timh37/HighResMIP/surgeNN_predictors/' #path to predictor data derived from highresmip model

output_dir = '/home/jovyan/test_surge_models/results/nn_tests/highresmip_predictions'

var_names = ['msl','u10','v10','w'] #must align with what was used in training
n_steps = 9 #must align with what was used in training

model_type = 'mlr' #which data-driven model to apply ('lstm', 'convlstm' or 'mlr')

#paths belonging to the selected model type; the alternatives are kept commented out
#instead of being assigned and silently overwritten a line later:
#model_performance_dir = '/home/jovyan/test_surge_models/results/nn_tests/performance/lstm/'
#model_dir = '/home/jovyan/test_surge_models/results/nn_tests/keras_models/lstm'
model_performance_dir = '/home/jovyan/test_surge_models/results/mlr_4p5x4p5/performance/'
model_dir = '/home/jovyan/test_surge_models/results/mlr_4p5x4p5/mlr_models' #path to the saved models

model_selection = 'combined_rank' #how to select the model of type 'model_type'

#all tide gauges for which predictor stores are available
all_output_tgs = ['stavanger-svg-nor-nhs.csv','wick-wic-gbr-bodc.csv','esbjerg-esb-dnk-dmi.csv',
                  'immingham-imm-gbr-bodc.csv','den_helder-denhdr-nld-rws.csv', 'fishguard-fis-gbr-bodc.csv',
                  'brest-822a-fra-uhslc.csv', 'vigo-vigo-esp-ieo.csv',  'alicante_i_outer_harbour-alio-esp-da_mm.csv']
output_tgs = ['alicante_i_outer_harbour-alio-esp-da_mm.csv'] #tide gauges to process in this run (set to all_output_tgs to process every gauge)
out_split = 'val' #performance split used when ranking the models
out_q = .99 #performance quantile used when ranking the models

standardize_predictors_with_era5 = 0 #standardize predictors with era5 data? (kept as 0/1 because the value is also written to the output attributes)

In [3]:
#list the predictor stores that are available under predictor_dir
fs.ls(predictor_dir)

['leap-persistent/timh37/HighResMIP/surgeNN_predictors/predictors_HadGEM3-GC31-HM_1950_2050_alicante_i_outer_harbour-alio-esp-da_mm.zarr',
 'leap-persistent/timh37/HighResMIP/surgeNN_predictors/predictors_HadGEM3-GC31-HM_1950_2050_brest-822a-fra-uhslc.zarr',
 'leap-persistent/timh37/HighResMIP/surgeNN_predictors/predictors_HadGEM3-GC31-HM_1950_2050_den_helder-denhdr-nld-rws.zarr',
 'leap-persistent/timh37/HighResMIP/surgeNN_predictors/predictors_HadGEM3-GC31-HM_1950_2050_esbjerg-esb-dnk-dmi.zarr',
 'leap-persistent/timh37/HighResMIP/surgeNN_predictors/predictors_HadGEM3-GC31-HM_1950_2050_fishguard-fis-gbr-bodc.zarr',
 'leap-persistent/timh37/HighResMIP/surgeNN_predictors/predictors_HadGEM3-GC31-HM_1950_2050_immingham-imm-gbr-bodc.zarr',
 'leap-persistent/timh37/HighResMIP/surgeNN_predictors/predictors_HadGEM3-GC31-HM_1950_2050_stavanger-svg-nor-nhs.zarr',
 'leap-persistent/timh37/HighResMIP/surgeNN_predictors/predictors_HadGEM3-GC31-HM_1950_2050_vigo-vigo-esp-ieo.zarr',
 'leap-persiste

In [10]:
#Apply the selected data-driven model to the HighResMIP predictors of each tide gauge in
#output_tgs, backtransform the predictions and collect them as one dataset per gauge.
surge_output = [] #initialize list of per-tide-gauge prediction datasets

is_nn = model_type.lower() in ('lstm','convlstm')
is_mlr = model_type.lower()=='mlr'

for tg in output_tgs:
    print('processing: '+tg)
    tg_name = tg.replace('.csv','') #tide gauge identifier without the file extension
    predictors = xr.open_dataset(os.path.join(predictor_dir,'predictors_'+highresmip_model+'_1950_2050_'+tg_name+'.zarr'),engine='zarr') #load highresmip predictors

    #open the data-driven model to use
    if is_nn:
        model_performances = xr.open_mfdataset(os.path.join(model_performance_dir,'*'+tg_name+'_mse_hp1_it*'),combine='nested',concat_dim='it').load()
        #model_performances = model_performances.where(model_performances.hyperparameters.sel(p='dl_alpha') == 0)

        if model_selection == 'combined_rank':
            tg_performance = model_performances.sel(tg=tg)
            rmse_da = tg_performance.rmse_extremes.sel(quantile=out_q).sel(split=out_split)
            f1_da = tg_performance.f1.sel(quantile=out_q).sel(split=out_split)
            rmses = rmse_da.values.flatten()
            f1s = f1_da.values.flatten()
            #find highest combined ranked model: high f1 (hence the sign flip) and low rmse both rank first
            best_ranked_model = np.nanargmin(rankdata(f1s*-1,nan_policy='omit')+rankdata(rmses,nan_policy='omit'))
            #NOTE(review): the unpacking order assumes the performance arrays are ordered (it, i) - confirm
            selected_it,selected_i = np.unravel_index(best_ranked_model,rmse_da.shape)
        else:
            raise Exception('selection criterion not recognized')

        #fix: the '_i' slot of the filename previously also received selected_it, so selected_i was never used
        model_pattern = '*'+tg_name+'_mse_hp1_i'+str(selected_i)+'_it'+str(selected_it)+'.keras'
        matching_models = fnmatch.filter(os.listdir(model_dir),model_pattern)
        if len(matching_models)==0: #fail with a clear message instead of an IndexError
            raise FileNotFoundError('no saved model matching '+model_pattern+' in '+model_dir)
        model = keras.models.load_model(os.path.join(model_dir,matching_models[0]))

    elif is_mlr:
        model_performances = xr.open_mfdataset(os.path.join(model_performance_dir,'*'+tg_name+'.nc')).load()
        mlr_coefs = xr.open_dataset(os.path.join(model_dir,'mlr_4p5x4p5_3h_'+tg_name+'_gssr_mlr_coefs.nc')) #load MLR coefficients at TGs
        pcs = xr.open_dataset(os.path.join(model_dir,'mlr_4p5x4p5_3h_'+tg_name+'_gssr_mlr_pca_components.nc'))

    else:
        raise Exception('model type not recognized')

    #observations of the train split, used for the (back)transforms below
    if is_mlr: #original author note: this is not working yet & needs fixing (the MLR performance files may lack a train split)
        train_obs = model_performances.o.sel(split='train')
    else:
        train_obs = model_performances.isel(i=selected_i,it=selected_it).o.sel(split='train')

    if standardize_predictors_with_era5: #retrieve x_train_mean and _sd from ERA5
        #NOTE(review): tg already ends in '.csv', so this passes '<name>.csv.csv' - confirm what load_predictors expects
        era5_predictors = load_predictors('gs://leap-persistent/timh37/era5_predictors/'+'3hourly',tg+'.csv',5)
        era5_predictors = era5_predictors.sel(time=slice('1979','2017'))

        if 'w' in var_names and 'w' not in era5_predictors.variables: #add wind speed
            #fix: this was a comparison ('==') instead of an assignment, so 'w' was never added
            era5_predictors['w'] = np.sqrt((era5_predictors.u10**2+era5_predictors.v10**2))

        for var in var_names: #preprocess
            era5_predictors[var] = era5_predictors[var].groupby(era5_predictors.time.dt.year) - era5_predictors[var].groupby(era5_predictors.time.dt.year).mean('time') #remove annual means
            era5_predictors[var] = deseasonalize_da(era5_predictors[var]) #remove mean seasonal cycle

        #fix: train_obs is used here so that model_type 'mlr' no longer hits the undefined selected_i/selected_it
        #NOTE(review): .where() receives the observation values themselves as its condition - confirm that this selects the intended train timesteps
        x_train = era5_predictors.sel(time=model_performances.time.where(train_obs))
        x_train_mean = x_train.mean(dim='time') #skips nan by default
        x_train_sd = x_train.std(dim='time',ddof=0)

    #derive backtransform
    y_train_mean = np.nanmean(train_obs)
    y_train_sd = np.nanstd(train_obs,ddof=0)

    #preprocess predictors:
    if 'w' in var_names and 'w' not in predictors.variables:
        predictors['w'] = np.sqrt((predictors.u10**2+predictors.v10**2))

    for var in var_names: #preprocess
        predictors[var] = predictors[var].groupby(predictors.time.dt.year) - predictors[var].groupby(predictors.time.dt.year).mean('time') #remove annual means
        predictors[var] = deseasonalize_da(predictors[var]) #remove mean seasonal cycle

    if standardize_predictors_with_era5:
        predictors = (predictors - x_train_mean)/x_train_sd #standardize with era5 transform
    else:
        predictors = (predictors - predictors.mean(dim='time'))/predictors.std(dim='time',ddof=0) #standardize normally

    if is_nn:
        if model_type.lower()=='convlstm':
            x = stack_predictors_for_convlstm(predictors,var_names) #put into right format
        else:
            x = stack_predictors_for_lstm(predictors,var_names)
        #the zeros are a placeholder target with one entry per window of n_steps timesteps
        x,y = generate_windowed_filtered_np_input(x,np.zeros(x.shape[0]-n_steps+1),n_steps)
        y_out = model.predict(x)

    elif is_mlr:
        predictors = predictors.isel(lon_around_tg = np.arange(1,19),lat_around_tg = np.arange(1,19))
        predictors['stacked'] = predictors[var_names].to_array(dim="var") #put predictor variables into one array
        predictors = predictors[['stacked']]
        predictors['stacked'] = predictors['stacked'].transpose("time","var","lon_around_tg",...)#.stack(f=['var','lon_around_tg','lat_around_tg'],create_index=False)
        x,y = generate_windowed_filtered_np_input(predictors.stacked.values,np.zeros(len(predictors.time)-n_steps+1),n_steps)
        x = np.reshape(x,(x.shape[0],np.prod(x.shape[1::]))) #flatten everything but the sample dimension

        y_out,y_components = predict_gssr_mlr(x,mlr_coefs.mlr_coefs.values.flatten(),pcs.component.values[0,:,:],len(predictors.lon_around_tg),var_names,n_steps)

    yhat = y_out*y_train_sd + y_train_mean #backtransform

    #NOTE(review): 'surge' is declared with dims (time, tg), which assumes yhat is 2-D with a length-1 second axis - confirm for both model types
    tg_output = xr.Dataset(
        data_vars=dict(
            surge=(["time","tg"], yhat),
            y_train_mean = (["tg"],[y_train_mean]),
            y_train_sd = (["tg"],[y_train_sd])
        ),
        coords=dict(
            time=predictors.time.isel(time=np.arange(n_steps-1,len(predictors.time))), #the first n_steps-1 timesteps have no complete window
            lon=model_performances.lon,
            lat=model_performances.lat,
        ),
        attrs=dict(description="Predictions of data-driven model applied to HighResMIP data",
                   highresmip_model=highresmip_model,
                   model=model_type,
                   selection=model_selection,
                   out_q=out_q,
                   out_split = out_split,
                   standardization_with_era5 = standardize_predictors_with_era5),
    )

    #fix: these attributes were previously set on the list 'surge_output' (which has no .attrs) after the loop
    #NOTE(review): which attrs survive xr.merge depends on its combine_attrs setting - confirm for multiple gauges
    if not is_mlr:
        tg_output.attrs['model_i']=selected_i
        tg_output.attrs['model_it']=selected_it

    surge_output.append(tg_output)

todays_date = datetime.today().strftime('%m-%d-%Y')
surge_ds = xr.merge(surge_output)
#surge_ds.to_netcdf(os.path.join(output_dir,highresmip_model+'_'+model_type+'_predictions_'+todays_date+'.nc'),mode='w')


processing: alicante_i_outer_harbour-alio-esp-da_mm.csv


NameError: name 'selected_i' is not defined

In [9]:
model_performances #TODO: the MLR performance files contain no train split; how can the training mean/sd be retrieved without having to open the lstm/convlstm files?
#TODO: include the train split in the MLR performance calculation