In [1]:
import numpy as np
import fnmatch
import xarray as xr
import os
from collections import defaultdict
from tqdm.autonotebook import tqdm  # Fancy progress bars for our loops!
from xmip.postprocessing import combine_datasets,_concat_sorted_time
from sklearn.decomposition import PCA
import gcsfs
fs = gcsfs.GCSFileSystem() # equivalent to fsspec.fs('gs')

  from tqdm.autonotebook import tqdm  # Fancy progress bars for our loops!


In [2]:
grid_size = 9 #n by n degree grid around each TG
cmip6_resolution = 1.5

num_grid_cells = int(grid_size/cmip6_resolution)

mlr_coefs = xr.open_dataset('/home/jovyan/CMIP6cex/cmip6_processing/gssr_mlr_coefs_1p5_9deg_gesla2.nc') #load MLR coefficients at TGs
era5_pcs = xr.open_dataset('/home/jovyan/CMIP6cex/cmip6_processing/era5_pca_components_1p5_9deg_gesla2.nc')

Loop over timeseries of `psl` & `sfcWind` at common 1.5 by 1.5 degree grid and open them:

In [3]:
#in_dir = '/home/jovyan/CMIP6cf/output/subsetted_forcing/'
var1 = 'psl'
var2 = 'sfcWind'

var1_dir = 'leap-persistent/timh37/CMIP6/timeseries/'+var1+'_europe'
var2_dir = 'leap-persistent/timh37/CMIP6/timeseries/'+var2+'_europe'

output_dir = 'leap-persistent/timh37/CMIP6/timeseries/surge_tgs'

var1_models = [k.split('/')[-1] for k in fs.ls(var1_dir) if k.startswith('.')==False]
var2_models = [k.split('/')[-1] for k in fs.ls(var2_dir) if k.startswith('.')==False]

models = [k for k in var1_models if k in var2_models]
ddict = defaultdict(dict)

for source_id in models:
    var1_model_path = os.path.join(var1_dir,source_id)
    var2_model_path = os.path.join(var2_dir,source_id)
    
    var1_exps = [s.split('/')[-1].split('_')[-1][0:-5] for s in fs.ls(var1_model_path) if s.startswith('.')==False] 
    var2_exps = [s.split('/')[-1].split('_')[-1][0:-5] for s in fs.ls(var2_model_path) if s.startswith('.')==False]
    experiment_ids = list(set(var1_exps) & set(var2_exps))
    
    for experiment_id in set(experiment_ids): #for each experiment_id, open the datasets, concatenating all realizations:
        #load data:
        fn = fnmatch.filter(fs.ls(var1_model_path),'*'+experiment_id+'*')[0]
        fn = fn.split('/')[-1]
        
        var1_var2_data = xr.open_mfdataset((os.path.join('gs://',var1_model_path,fn),os.path.join('gs://',var2_model_path,fn)),engine='zarr',chunks={'member_id':1,'time':100000,'longitude':5})
        
        ddict[fn.replace('.zarr','')] = var1_var2_data
        

Generate predictor data and multiply with regression coefficients:

In [4]:
ddict_predictors = defaultdict(dict)

for key,ds in tqdm(ddict.items()):
   
    for m,member in tqdm(enumerate(ds.member_id.values)):
        ds_mem = ds.sel(member_id=member)
        
        if key == list(ddict.keys())[0]: #common grid, so only need to generate these grids for the first ds
            lat_ranges = np.zeros((len(mlr_coefs.tg),int(grid_size/cmip6_resolution)))
            lon_ranges = np.zeros((len(mlr_coefs.tg),int(grid_size/cmip6_resolution)))

            for t,tg in enumerate(mlr_coefs.tg.values):
                lat_ranges[t,:] = ds_mem.latitude[((ds_mem.latitude>=(mlr_coefs.sel(tg=tg).lat-grid_size/2)) & (ds_mem.latitude<=(mlr_coefs.sel(tg=tg).lat+grid_size/2)))][0:int(grid_size/cmip6_resolution)]
                lon_ranges[t,:] = ds_mem.longitude[((ds_mem.longitude>=(mlr_coefs.sel(tg=tg).lon-grid_size/2)) & (ds_mem.longitude<=(mlr_coefs.sel(tg=tg).lon+grid_size/2)))][0:int(grid_size/cmip6_resolution)]
        
            lons_da = xr.DataArray(lon_ranges,dims=['tg','lon_around_tg'],coords={'tg':mlr_coefs.tg,'lon_around_tg':np.arange(0,int(grid_size/cmip6_resolution))})
            lats_da = xr.DataArray(lat_ranges,dims=['tg','lat_around_tg'],coords={'tg':mlr_coefs.tg,'lat_around_tg':np.arange(0,int(grid_size/cmip6_resolution))})
        
        
        #sanity check timeseries length
        num_days = (ds_mem.time[-1]-ds_mem.time[0]).dt.days
        assert (len(ds_mem.time) > .9*num_days) & (len(ds_mem.time) < 1.1*num_days)
        
        predictors = ds_mem
        
        #generate predictors
        predictors['sfcWind_sqd'] = predictors['sfcWind']**2 #add wind squared
        predictors['sfcWind_cbd'] = predictors['sfcWind']**3 #add wind cubed
        
        predictors = (predictors-predictors.mean(dim='time'))/predictors.std(dim='time',ddof=0) #normalize predictor variables (ignores nan by default?)
        predictors = predictors.sel(latitude=lats_da,longitude=lons_da).load() #takes a lot of memory, but is much more efficient than loading per TG?
        
        #concatenate & stack normalized forcing variables to data array with shape (time,(4 variables * grid_size * grid_size))
        predictors['predictors'] = predictors[["psl", "sfcWind", "sfcWind_sqd","sfcWind_cbd"]].to_array(dim="predictor_var") 
        predictors['predictors'] = predictors['predictors'].transpose("time","predictor_var","lon_around_tg",...).stack(f=['predictor_var','lon_around_tg','lat_around_tg'],create_index=False)
            
        #compute surges from predictors
        if m==0: #initialize
            surge_ds = xr.Dataset(data_vars=dict(surge=(['member_id','time','tg'], np.nan*np.zeros( (len(ds.member_id),len(ds.time),len(mlr_coefs.tg))) )),
                            coords=dict(member_id=ds.member_id,time=ds.time,tg=mlr_coefs.tg)) #initialize output dataset per model
        
        for t,tg in enumerate(mlr_coefs.tg):
            predictors_at_tg = predictors.sel(tg=tg)
            mlr_coefs_at_tg = mlr_coefs.mlr_coefs.sel(tg=tg)
            
            num_pcs = int(np.sum(np.isfinite(mlr_coefs_at_tg)))-1 #number of mlr coefs = number of PCs to derive, intercept doesn't count
            idx_timesteps_w_data = np.argwhere((np.isfinite(predictors_at_tg.predictors).all(axis=1)).values).flatten() #omit timesteps with NaN if any
            
            #get principal components (using sklearn to keep deterministic signs consistent)
            pca = PCA(num_pcs)
            pca.fit(predictors_at_tg.predictors.isel(time=idx_timesteps_w_data)) #remove missing values for PCA
            pcs = pca.transform(predictors_at_tg.predictors.isel(time=idx_timesteps_w_data))
            
            components = xr.DataArray(data=pca.components_,dims=['pc','f'],coords=dict(pc=np.arange(num_pcs),f=predictors_at_tg.f))
            
            #compute RMSEs with ERA5 principal components, only considering the pressure part of the forcing (first num_grid_cells**2)
            rmses = np.sqrt(((components.isel(f=np.arange(num_grid_cells**2))-era5_pcs.sel(tg=tg).isel(f=np.arange(num_grid_cells**2)).isel(pc=np.arange(num_pcs)).component)**2).mean(dim='f')) #original sign
            rmses_flipped = np.sqrt(((components.isel(f=np.arange(num_grid_cells**2))--era5_pcs.isel(f=np.arange(num_grid_cells**2)).sel(tg=tg).isel(pc=np.arange(num_pcs)).component)**2).mean(dim='f')) #opposite sign

            s = (rmses<rmses_flipped).astype('int') #flip pcs if rmse of flipped pc is lower
            s[s==0]=-1
            pcs = pcs * s.values
            
            #multiply with ERA5 regression coefficients to compute surges
            surge_ds['surge'][m,idx_timesteps_w_data,t] = np.sum(mlr_coefs_at_tg[np.isfinite(mlr_coefs_at_tg)].values * np.column_stack((np.ones(pcs.shape[0]),pcs)),axis=1) 
            
    #surge_ds.to_zarr(os.path.join('gs://',output_dir,ds.source_id,key+'.zarr'),mode='w')
    #surge_ds.close()

  0%|          | 0/48 [00:00<?, ?it/s]

0it [00:00, ?it/s]

KeyboardInterrupt: 