In [1]:
import numpy as np
import pandas as pd
import xarray as xr
import xesmf as xe
import dask
import intake
import os
from collections import defaultdict
from tqdm.autonotebook import tqdm
from xmip.postprocessing import combine_datasets,_concat_sorted_time
import gcsfs
fs = gcsfs.GCSFileSystem() # file-system handle used below to list the input directories; equivalent to fsspec.filesystem('gs')

'''script to concatenate historical and ssp runs from different CMIP6 models and members'''

  from tqdm.autonotebook import tqdm


'script to concatenate historical and ssp runs from different CMIP6 models and members'

In [2]:
variable = 'psl' #variable to process
domain = 'europe' #europe, tgs

in_dir = 'leap-persistent/timh37/CMIP6/subsetted_data/'+variable+'_'+domain+'/' #where to open
out_dir = 'leap-persistent/timh37/CMIP6/timeseries/'+variable+'_'+domain+'/' #where to store

#find models for variable. fs.ls returns full paths (starting with the bucket name), so hidden
#entries have to be recognised from the last path component rather than from the full path.
models = [k.split('/')[-1] for k in fs.ls(in_dir) if not k.split('/')[-1].startswith('.')]

ddict = defaultdict(dict) #initialize dictionary to store datasets in

#NOTE: only a single model is processed here; replace the list by `models` to process all models found above
for source_id in ['MPI-ESM1-2-HR']:#models:
    #names of the entries stored for this model (last path component only)
    store_names = [s.split('/')[-1] for s in fs.ls(os.path.join(in_dir,source_id))]

    #find experiments available for model: the experiment is the third '_'-separated field of the name.
    #Hidden entries are dropped *before* splitting, because they need not contain that field.
    experiment_ids = [name.split('_')[2] for name in store_names if not name.startswith('.')]

    #for each experiment_id, open the datasets, concatenating all variants along 'member_id'
    #(sorted so that the order in which datasets are added to ddict is reproducible)
    for experiment_id in sorted(set(experiment_ids)):
        source_ds = xr.open_mfdataset(os.path.join('gs://',in_dir,source_id,'*'+experiment_id+'*.zarr'),join='outer',combine='nested',
                                      compat='override',coords='minimal',concat_dim='member_id',engine='zarr',chunks={}) #need to test this for large no. of realizations, like EC-Earth3

        #key the combined dataset by its 'original_key' attribute with the last '.'-separated field removed
        ddict[source_ds.original_key.rsplit('.',1)[0]] = source_ds

Append SSP runs to historical runs for each SSP:

In [3]:
#unique SSP identifiers: third '.'-separated field of every dictionary key containing 'ssp'
ssps = {name.split('.')[2] for name in ddict.keys() if 'ssp' in name}

ddict_concat = defaultdict(dict)

for ssp in ssps: #loop over SSPs
    ddict_ssp = defaultdict(dict)

    for name, dataset in ddict.items():
        #skip everything that is neither this SSP nor a historical run
        if not (ssp in name or 'historical' in name):
            continue
        #a historical run is only kept if the key with 'historical' swapped for this SSP exists as well
        if name.replace('historical',ssp) in ddict:
            ddict_ssp[name] = dataset

    #append SSP to historical; join='inner' is passed on to the concatenation so that only
    #realizations for which both experiments are provided are kept
    hist_ssp = combine_datasets(ddict_ssp,_concat_sorted_time,match_attrs =['source_id', 'grid_label','table_id'],combine_func_kwargs={'join':'inner'})

    #put back together in dictionary, tagging every key with the SSP it was combined with
    ddict_concat.update({combined_key+'.'+ssp: combined for combined_key, combined in hist_ssp.items()})

Store per SSP:

In [4]:
for key,ds in tqdm(ddict_concat.items()):
    #one output folder per model; the store is named after the key with '.' replaced by '_'
    model_path = os.path.join(out_dir,ds.source_id)
    store_url = os.path.join('gs://',model_path,key.replace('.','_')+'.zarr')

    #something wrong with encoding object types in zarr, casting to str is the work-around
    if 'tg' in ds.coords:
        ds['tg'] = ds['tg'].astype('str')
    ds['member_id'] = ds['member_id'].astype('str')

    #something wrong with encoding of chunks for saving to zarr, dropping the entry is the work-around
    ds[variable].encoding.pop('chunks', None)

    #chunk specification: datasets with a 'longitude' coordinate are additionally chunked along it
    if 'longitude' in ds.coords:
        chunk_spec = {'member_id':1,'longitude':5,'time':100000}
    else:
        chunk_spec = {'member_id':1,'time':100000}

    ds.chunk(chunk_spec).to_zarr(store_url,mode='w')

    ds.close()

  0%|          | 0/2 [00:00<?, ?it/s]