**Need a lot of memory to run this?**

In [1]:
import numpy as np
import xarray as xr
import os
from collections import defaultdict
from tqdm.autonotebook import tqdm  # Fancy progress bars for our loops!
from xmip.postprocessing import combine_datasets,_concat_sorted_time
from sklearn.decomposition import PCA
import gcsfs
fs = gcsfs.GCSFileSystem() # equivalent to fsspec.fs('gs')

  from tqdm.autonotebook import tqdm  # Fancy progress bars for our loops!


Loop over subsetted `psl` & `sfcWind` datasets and open them:

In [16]:
#in_dir = '/home/jovyan/CMIP6cf/output/subsetted_forcing/'
in_dir = 'leap-persistent/timh37/CMIP6/subsetted_data/forcing_gssr_tgs'
models = [k.split('/')[-1] for k in fs.ls(in_dir)]

ddict = defaultdict(dict)
#for source_id in [s for s in os.listdir(in_dir) if s.startswith('.')==False]:
for source_id in ['CESM2']:#[s for s in models if s.startswith('.')==False]:
    
    experiments = [s.split('/')[-1].split('_')[2] for s in fs.ls(os.path.join(in_dir,source_id))]
    experiment_ids = [s for s in experiments if s.startswith('.')==False]
    #experiment_ids = [s.split('_')[2] for s in os.listdir(os.path.join(in_dir,source_id)) if s.startswith('.')==False]
    
    for experiment_id in set(experiment_ids): #for each experiment_id, open the datasets, concatenating all realizations:
        
        #source_ds = xr.open_mfdataset(os.path.join(in_dir,source_id,'*'+experiment_id+'*.nc'),join='outer',combine='nested',
        #                              compat='override',coords='minimal',concat_dim='member_id') #need to test this for large np. of realizations, like EC-Earth3
        source_ds = xr.open_mfdataset(os.path.join('gs://',in_dir,source_id,'*'+experiment_id+'*.zarr'),engine='zarr',chunks={},join='outer',combine='nested',
                                      compat='override',coords='minimal',concat_dim='member_id') #need to test this for large np. of realizations, like EC-Earth3
        
        ddict[source_ds.original_key.rsplit('.',1)[0]] = source_ds
        

Append SSP runs to historical runs for each SSP:

In [18]:
ssps = set([k.split('.')[2] for k in ddict.keys() if 'ssp' in k])

ddict_concat = defaultdict(dict)

for ssp in ssps:
    ddict_ssp = defaultdict(dict)
    
    for k in ddict.keys():
        if ((ssp in k) or ('historical' in k)):
            if k.replace('historical',ssp) in ddict.keys(): #only consider historical if there's also ssp
                ddict_ssp[k] = ddict[k]
            
    #append SSP to historical, only for realizations for which both experiments are provided (join=inner)
    hist_ssp = combine_datasets(ddict_ssp,
                                _concat_sorted_time,
                                match_attrs =['source_id', 'grid_label','table_id'],combine_func_kwargs={'join':'inner'})
    
    for key,ds in hist_ssp.items(): #put back together in dictionary
        ddict_concat[key+'.'+ssp] = ds

Sanity-check timeseries length:

In [19]:
for k,v in ddict_concat.items():
    num_days = (v.time[-1]-v.time[0]).dt.days
    assert (len(v.time) > .9*num_days) & (len(v.time) < 1.1*num_days)

Generate forcing data:

In [20]:
#generate forcing to compute surges with
for k,v in ddict_concat.items():
    attrs = v.attrs
    
    v['sfcWind_sqd'] = v['sfcWind']**2 #add wind squared
    v['sfcWind_cbd'] = v['sfcWind']**3 #add wind cubed
    
    v = (v-v.mean(dim='time'))/v.std(dim='time',ddof=0) #normalize (ignores nan by default?)
    v.attrs = attrs
    
    #concatenate & stack normalized forcing variables to data array with shape (time,(4 variables * num_degr * num_degr))
    v['forcing'] = v[["psl", "sfcWind", "sfcWind_sqd","sfcWind_cbd"]].to_array(dim="forcing_var") 
    v['forcing'] = v['forcing'].transpose("time","forcing_var","lon_around_tg",...).stack(f=['forcing_var','lon_around_tg','lat_around_tg'],create_index=False)
    ddict_concat[k]=v

Derive the principal components and multiply with regression coefficients derived from ERA5:

In [21]:
mlrcoefs = xr.open_dataset('/home/jovyan/CMIP6cf/gssr_coefs_1degRes_forcing.nc') #contains coordinates of and MLR coefficients at TGs

In [25]:
for k,ds in tqdm(ddict_concat.items()): #loop over datasets
    print('Deriving surges from forcing for: '+k)
 
    #generate path for storing output
    #model_path = os.path.join('/home/jovyan/CMIP6cf/output/surge_timeseries/',ds.source_id)
    #output_fn = os.path.join(model_path,k.replace('.','_')+'.nc')
    
    model_path = os.path.join('leap-persistent/timh37/CMIP6/timeseries/surge_tgs',ds.source_id)
    output_fn = os.path.join(model_path,k.replace('.','_')+'.zarr')
    
    #if not os.path.exists(model_path):
    #    os.mkdir(model_path)
    
    #derive surges from forcing
    surge_ds = xr.Dataset(data_vars=dict(surge=(['member_id','time','tg'], np.nan*np.zeros( (len(ds.member_id),len(ds.time),len(ds.tg))) )),
                            coords=dict(member_id=ds.member_id,time=ds.time,tg=ds.tg)) #initialize output
    
    for i_member,member in enumerate(ds.member_id):
        forcing_mem = ds.forcing.sel(member_id=member).copy(deep=True).load() #load forcing data array into memory (for all tg for current dataset and member)
        
        
        for i_tg,tg in enumerate(ds.tg):
            #get model forcing at TG
            forcing_tg = forcing_mem.sel(tg=tg) 
            
            #get MLR coefficients at TG
            tg_coefs = mlrcoefs.mlrcoefs.sel(tg=tg)
            num_pcs = int(np.sum(np.isfinite(tg_coefs)))-1 #number of coefs = number of PCs to derive, intercept doesn't count
            
            i_timesteps_w_data = np.argwhere(np.isfinite(forcing_tg.data).all(axis=1)).flatten()
            
            #get principal components (using sklearn to keep deterministic signs consistent)
            pca = PCA(num_pcs)
            pca.fit(forcing_tg.isel(time=i_timesteps_w_data).data) #remove missing values for PCA
            pcs = pca.transform(forcing_tg.isel(time=i_timesteps_w_data).data)
            
            #multiply with ERA5 regression coefficients to compute surges
            surge_ds['surge'][i_member,i_timesteps_w_data,i_tg] = np.sum(tg_coefs[np.isfinite(tg_coefs)].values * np.column_stack((np.ones(pcs.shape[0]),pcs)),axis=1) 

    surge_ds['surge'] = surge_ds['surge'].assign_coords(lon=('tg', mlrcoefs.lon.data),lat=('tg', mlrcoefs.lat.data)).assign_attrs(ds.attrs)
    
    #store:
    #surge_ds['surge'].to_netcdf(output_fn,mode='w')
    surge_ds.to_zarr(os.path.join('gs://',output_fn),mode='w')
    surge_ds.close()
    

  0%|          | 0/2 [00:00<?, ?it/s]

Deriving surges from forcing for: CESM2.gn.day.ssp585


AttributeError: 'DataArray' object has no attribute 'to_zarr'