In [1]:
'''script to apply trained LSTM surge models to HighResMIP predictors at tide gauges and store the predictions'''
import numpy as np
import xarray as xr
import dask
import os
import intake
import pandas as pd
from sklearn.metrics import confusion_matrix
from collections import defaultdict
from tqdm.autonotebook import tqdm
from xmip.utils import google_cmip_col
from xmip.postprocessing import combine_datasets,_concat_sorted_time, merge_variables
from cmip_catalogue_operations import reduce_cat_to_max_num_realizations, drop_vars_from_cat, drop_older_versions
from cmip_ds_dict_operations import select_period, pr_flux_to_m, drop_duplicate_timesteps, drop_coords, drop_incomplete
import xesmf as xe
import gcsfs
import keras
from open_era5_predictors import get_era5_around_tgs
from surgeNN.preprocessing import deseasonalize_da
from surgeNN.io import load_predictors
fs = gcsfs.GCSFileSystem() #list stores, stripp zarr from filename, load 

def generate_windowed_filtered_np_input(x,y,n_steps,w=None):
    '''
    Generate numpy arrays of windowed, nan-filtered input data.

    Window k holds predictor timesteps k to k+n_steps-1 and is paired with
    predictand k, so y (and w) must have x.shape[0]-n_steps+1 entries.

    Input:
        x: predictors, with time along the first axis
        y: predictands, one per window
        n_steps: number of timesteps to use predictors at (>=1)
        w: sample weights of predictands, optional
    Output:
        x_out: windowed, nan-filtered predictors, shape (n_samples, n_steps, ...)
        y_out: nan-filtered predictands
        w_out: nan-filtered sample weights (only returned if w is not None)
    Raises:
        ValueError: if n_steps < 1 or x has fewer than n_steps timesteps
    '''
    n_windows = x.shape[0] - n_steps + 1
    if n_steps < 1 or n_windows < 1:
        raise ValueError('n_steps must be >=1 and <= the number of timesteps in x')

    #create windowed predictor array (x(t=-n_steps to t=0) to predict y(t=0));
    #only complete windows are generated, which also works for n_steps=1
    x_out = np.stack([x[k:k+n_steps,...] for k in range(n_windows)],axis=0)

    #filter where y is nan
    where_y_is_finite = np.isfinite(y)
    x_out = x_out[where_y_is_finite,...]
    y_out = y[where_y_is_finite]

    if w is not None: #do the same for the weights, if any
        w_out = w[where_y_is_finite]
        return x_out,y_out,w_out
    else:
        return x_out,y_out
        
def stack_predictors_for_lstm(predictors,var_names):
    ''' flatten the requested predictor variables to a (time, features) array for lstm input

    Input:
        predictors: object providing predictors[name].values for each name in var_names,
                    and time, latitude and longitude attributes that support len()
        var_names: names of the predictor variables to stack
    Output:
        array of shape (n_time, n_latitude * n_longitude * n_variables)
    '''
    n_time = len(predictors.time)
    n_features = len(predictors.latitude) * len(predictors.longitude) * len(var_names)

    stacked = np.stack([predictors[name].values for name in var_names],axis=-1) #variables along a new last axis
    return stacked.reshape((n_time,n_features)) #merge grid cells & variables into one feature axis

def stack_predictors_for_convlstm(predictors,var_names):
    ''' stack the requested predictor variables along a new last axis for convlstm input

    Input:
        predictors: object providing predictors[name].values for each name in var_names
        var_names: names of the predictor variables to stack
    Output:
        array with the shape of a single variable plus a trailing variable axis
    '''
    per_variable = []
    for name in var_names:
        per_variable.append(predictors[name].values)
    return np.stack(per_variable,axis=-1) #stack variables


def deseasonalize_da(da):
    '''remove the long-term monthly means from a variable while keeping its time mean

    NOTE: this definition rebinds the name `deseasonalize_da` that is also
    imported from surgeNN.preprocessing at the top of this cell.

    Input:
        da: data array with a datetime-like `time` coordinate
    Output:
        da with the mean of each calendar month subtracted, shifted so that
        its mean over time equals that of the input
    '''
    month_of_timestep = da.time.dt.month
    monthly_means = da.groupby(month_of_timestep).mean('time')
    anomalies = da.groupby(month_of_timestep) - monthly_means

    #add back the difference between the original and the deseasonalized time mean
    mean_offset = da.mean(dim='time') - anomalies.mean(dim='time')
    return anomalies + mean_offset

  from tqdm.autonotebook import tqdm
2024-10-18 13:20:44.722942: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-10-18 13:20:44.829643: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


In [2]:
# Configuration of the prediction run.
highresmip_model = 'HadGEM3-GC31-HM' #model name, used in the predictor store and output file names
predictor_dir = 'gs://leap-persistent/timh37/HighResMIP/surgeNN_predictors/' #location of the per-tide-gauge predictor zarr stores
nn_model_dir = '/home/jovyan/test_surge_models/results/nn_tests/keras_models/lstm' #location of the trained .keras models

output_dir = '/home/jovyan/test_surge_models/results/nn_tests/highresmip_predictions' #directory where the prediction netCDF is stored
var_names = ['msl','u10','v10','w'] #predictor variables; 'w' is computed from u10 and v10 when it is not in the predictor dataset
n_steps = 9 #number of predictor timesteps per input window

only_alpha0 = False #if True, only consider models with hyperparameter dl_alpha == 0 when selecting the best model
standardize_predictors_with_era5 = True #if True, standardize with the ERA5-derived mean and sd instead of the predictors' own

In [3]:
#list the predictor stores that are available for the configured predictor directory
fs.ls(predictor_dir)

['leap-persistent/timh37/HighResMIP/surgeNN_predictors/predictors_HadGEM3-GC31-HM_1950_2050_alicante_i_outer_harbour-alio-esp-da_mm.zarr',
 'leap-persistent/timh37/HighResMIP/surgeNN_predictors/predictors_HadGEM3-GC31-HM_1950_2050_brest-822a-fra-uhslc.zarr',
 'leap-persistent/timh37/HighResMIP/surgeNN_predictors/predictors_HadGEM3-GC31-HM_1950_2050_den_helder-denhdr-nld-rws.zarr',
 'leap-persistent/timh37/HighResMIP/surgeNN_predictors/predictors_HadGEM3-GC31-HM_1950_2050_esbjerg-esb-dnk-dmi.zarr',
 'leap-persistent/timh37/HighResMIP/surgeNN_predictors/predictors_HadGEM3-GC31-HM_1950_2050_fishguard-fis-gbr-bodc.zarr',
 'leap-persistent/timh37/HighResMIP/surgeNN_predictors/predictors_HadGEM3-GC31-HM_1950_2050_immingham-imm-gbr-bodc.zarr',
 'leap-persistent/timh37/HighResMIP/surgeNN_predictors/predictors_HadGEM3-GC31-HM_1950_2050_stavanger-svg-nor-nhs.zarr',
 'leap-persistent/timh37/HighResMIP/surgeNN_predictors/predictors_HadGEM3-GC31-HM_1950_2050_vigo-vigo-esp-ieo.zarr',
 'leap-persiste

In [4]:
def _prepare_predictors(ds,var_names):
    '''add wind speed 'w' (if requested and missing) and remove the annual means and mean seasonal cycle of each variable in var_names (modifies and returns ds)'''
    if 'w' in var_names and 'w' not in ds.variables:
        ds['w'] = np.sqrt((ds.u10**2+ds.v10**2))

    for var in var_names:
        years = ds.time.dt.year
        ds[var] = ds[var].groupby(years) - ds[var].groupby(years).mean('time') #remove annual means
        ds[var] = deseasonalize_da(ds[var]) #remove mean seasonal cycle
    return ds

#For each tide gauge: load the HighResMIP predictors, select the trained LSTM with the lowest
#validation RMSE of the extremes, preprocess the predictors and store the predicted surges.
surge_output = []
for output_tg in ['stavanger-svg-nor-nhs.csv','wick-wic-gbr-bodc.csv','esbjerg-esb-dnk-dmi.csv',
                  'immingham-imm-gbr-bodc.csv','den_helder-denhdr-nld-rws.csv', 'fishguard-fis-gbr-bodc.csv',  
                  'brest-822a-fra-uhslc.csv', 'vigo-vigo-esp-ieo.csv',  'alicante_i_outer_harbour-alio-esp-da_mm.csv']:#tg_coords.tg.values:
    print('processing: '+output_tg)
    output_tg = output_tg.replace('.csv','')
    
    predictors = xr.open_dataset(os.path.join(predictor_dir,'predictors_'+highresmip_model+'_1950_2050_'+output_tg+'.zarr'),engine='zarr')

    #find best lstm and load it
    lstm = xr.open_mfdataset('/home/jovyan/test_surge_models/results/nn_tests/performance/lstm/lstm_3h*'+output_tg+'_mse_hp1_it*',combine='nested',concat_dim='it').load()
    
    if only_alpha0:
        lstm = lstm.where(lstm.hyperparameters.sel(p='dl_alpha') == 0)
        
    idxmin = lstm.rmse_extremes.sel(split='val').sel(quantile=.99).argmin(dim=['i','it']) #lowest rmse extremes, could also use f1?
    best_i = idxmin['i'].values
    best_it = idxmin['it'].values
    o_train = lstm.isel(i=best_i,it=best_it).o.sel(split='train') #`o` of the selected model in the train split
    
    ###### derive original x_train_mean and sd (not sure how best to standardize CMIP predictors)
    era5_predictors = load_predictors('gs://leap-persistent/timh37/era5_predictors/'+'3hourly',output_tg+'.csv',5) 
    #NOTE(review): the original comment referred to 2018 as the end year of the GTSM benchmark
    #simulations, but this slice ends in (and includes) 2017 - confirm the intended period
    era5_predictors = era5_predictors.sel(time=slice('1979','2017'))
    era5_predictors = _prepare_predictors(era5_predictors,var_names) #assign 'w' (was a no-op comparison '==' before) & remove means

    #NOTE(review): `where` treats NaN as truthy, so if `o` is NaN outside the train split this
    #selects all timesteps rather than only the training ones - confirm whether
    #np.isfinite(o_train) with drop=True was intended
    x_train = era5_predictors.sel(time=lstm.time.where(o_train))
    
    x_train_mean = x_train.mean(dim='time') #skips nan by default
    x_train_sd = x_train.std(dim='time',ddof=0) #skips nan by default
    ###### 

    #derive backtransform
    y_train_mean = np.nanmean(o_train)
    y_train_sd = np.nanstd(o_train,ddof=0)
    
    model = keras.models.load_model(os.path.join(nn_model_dir,'lstm_3h_'+output_tg+'_mse_hp1_i'+str(best_i)+'_it'+str(best_it)+'.keras'))
    
    #preprocess predictors:
    predictors = _prepare_predictors(predictors,var_names)
    
    if standardize_predictors_with_era5:
        predictors = (predictors - x_train_mean)/x_train_sd
    else:
        predictors = (predictors - predictors.mean(dim='time'))/predictors.std(dim='time',ddof=0) #standardize

    #(a branch for model_type=='mlr' was removed here: `model_type` is not defined in this notebook and
    # the branch dropped the variables that stack_predictors_for_lstm needs directly below)
    x = stack_predictors_for_lstm(predictors,var_names) #put into right format
    x,_ = generate_windowed_filtered_np_input(x,np.zeros(x.shape[0]-n_steps+1),n_steps) #dummy predictands: nothing is filtered
    y_out = model.predict(x)
    yhat = y_out*y_train_sd + y_train_mean #backtransform to original units
    
    surge_output.append(
        xr.Dataset(
            data_vars=dict(
                surge=(["time","tg"], yhat),
                y_train_mean = (["tg"],[y_train_mean]),
                y_train_sd = (["tg"],[y_train_sd])
            ),
            coords=dict(
                time=predictors.time.isel(time=np.arange(n_steps-1,len(predictors.time))), #each window predicts its last timestep
                lon=lstm.lon,
                lat=lstm.lat,
                i_lstm = (["tg"],[best_i]),
                it_lstm = (["tg"],[best_it]),
            ),
            attrs=dict(description="LSTM with lowest RMSE above the observed 99th percentile applied to HighResMIP data",model=highresmip_model),
        )
    )
    
surge_ds = xr.merge(surge_output)

#the file name encodes which options were used for this run
output_fn = (highresmip_model+'_lstm_minRMSE_predictions'
             +('_alpha0' if only_alpha0 else '')
             +('_standardized_x_with_era5' if standardize_predictors_with_era5 else '')
             +'.nc')
surge_ds.to_netcdf(os.path.join(output_dir,output_fn),mode='w')
    

processing: stavanger-svg-nor-nhs.csv


2024-10-18 13:21:11.232410: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: :/usr/local/nvidia/lib64
2024-10-18 13:21:11.232436: W tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:265] failed call to cuInit: UNKNOWN ERROR (303)


processing: wick-wic-gbr-bodc.csv
processing: esbjerg-esb-dnk-dmi.csv
processing: immingham-imm-gbr-bodc.csv
processing: den_helder-denhdr-nld-rws.csv
processing: fishguard-fis-gbr-bodc.csv
processing: brest-822a-fra-uhslc.csv
processing: vigo-vigo-esp-ieo.csv
processing: alicante_i_outer_harbour-alio-esp-da_mm.csv
