In [1]:
import numpy as np
import xarray as xr
import dask
import intake
import pandas as pd
import os
from collections import defaultdict
from tqdm.autonotebook import tqdm
from xmip.utils import google_cmip_col
from xmip.postprocessing import combine_datasets, _match_datasets,_concat_sorted_time
from cmip_catalogue_operations import reduce_cat_to_max_num_realizations, drop_vars_from_cat
from cmip_ds_dict_operations import preselect_years, pr_flux_to_m, drop_duplicate_timesteps, drop_coords
import xesmf as xe
import gcsfs
fs = gcsfs.GCSFileSystem() #gcsfs filesystem handle: list stores, strip '.zarr' from file names, load them

#NOTE: bare string rather than a comment; as the last expression of the cell it is echoed as the cell output
'''script to regrid CMIP6 datatsets to target grid and store them'''

  from tqdm.autonotebook import tqdm


'script to regrid CMIP6 datatsets to target grid and store them'

In [2]:
#combine_datasets() work around to merge variables into datasets that have not exactly matching coordinates but are supposed to have the same grid
def align_lonlat(ds_list):
    """Align every dataset in ``ds_list`` against the first dataset of the list.

    Each dataset is aligned pairwise with ``ds_list[0]`` using ``join='override'``
    while excluding the ``time`` and ``member_id`` dimensions; the aligned version
    of the dataset (second output of ``xr.align``) is kept.

    Parameters
    ----------
    ds_list : list
        Datasets that are supposed to share the same grid.

    Returns
    -------
    list
        The aligned datasets, in the same order as ``ds_list`` (empty if
        ``ds_list`` is empty).
    """
    #pairwise alignment: the list as a whole can't seem to be passed to xr.align instead
    return [
        xr.align(ds_list[0], ds, join='override', exclude=['time', 'member_id'])[1]
        for ds in ds_list
    ]

def merge_variables_aligning_lonlat(ds_list):
    """Merge the datasets in ``ds_list`` after aligning them with ``align_lonlat``.

    The lon/lat of same-dimension datasets are overridden prior to merging, which
    ensures that lon/lats are not padded by the outer join.

    Parameters
    ----------
    ds_list : list
        Datasets to merge.

    Returns
    -------
    The result of ``xr.merge`` with ``join='outer'`` and ``compat='override'``.
    """
    return xr.merge(align_lonlat(ds_list), join='outer', compat='override')

Query simulations & manipulate data catalogue:

In [3]:
#variable to obtain data for
variable = 'sfcWind'
#variables models simulations are required to provide
query_vars = ['sfcWind', 'pr', 'psl']

''' models with relatively high resolution can be queried using: source_id=highRes_Models in col.search()
highRes_models = ['BCC-CSM2-MR','CESM2','CESM2-WACCM','CMCC-ESM2','CMCC-CM2-SR5','EC-Earth3',
                'GFDL-CM4','GFDL-ESM4','HadGEM3-GC31-MM','MIROC6','MPI-ESM1-2-HR','MRI-ESM2-0',
                'NorESM2-MM','TaiESM1']
'''
col = google_cmip_col() #xmip wrapper

#search arguments shared by both scenario queries
#NOTE(review): require_all_on=['source_id', 'member_id','grid_label'] is not passed (it was commented out),
#so presumably the search does not enforce that each instance provides all query_vars - confirm
shared_search_args = dict(
    source_id=['HadGEM3-GC31-LL'],
    table_id='day',
    variable_id=query_vars,
)

#the search is done for each SSP separately as availability may differ between them
cat_data_ssp245 = col.search(experiment_id=['historical', 'ssp245'], **shared_search_args) #historical & ssp245 experiments
cat_data_ssp585 = col.search(experiment_id=['historical', 'ssp585'], **shared_search_args) #historical & ssp585 experiments

#reuse the ssp585 catalogue object and replace its dataframe by the de-duplicated union of both search results
cat_data = cat_data_ssp585
combined_df = pd.concat([cat_data_ssp245.df, cat_data_ssp585.df], ignore_index=True)
cat_data.esmcat._df = combined_df.drop_duplicates(ignore_index=True)

#per model, select grid and 'ipf' combination providing most realizations
cat_data = reduce_cat_to_max_num_realizations(cat_data)

#query_vars.remove(variable)
#cat_data = drop_vars_from_cat(cat_data,query_vars) #only process in data for 'variable'

In [6]:
#preview the first 50 ssp245 catalogue entries, ordered by member_id
cat_data.df.loc[cat_data.df.experiment_id == 'ssp245'].sort_values(by='member_id').head(50)

Unnamed: 0,activity_id,institution_id,source_id,experiment_id,member_id,table_id,variable_id,grid_label,zstore,dcpp_init_year,version
0,ScenarioMIP,CAS,FGOALS-g3,ssp245,r1i1p1f1,day,pr,gn,gs://cmip6/CMIP6/ScenarioMIP/CAS/FGOALS-g3/ssp...,,20190818
5,ScenarioMIP,CAS,FGOALS-g3,ssp245,r2i1p1f1,day,pr,gn,gs://cmip6/CMIP6/ScenarioMIP/CAS/FGOALS-g3/ssp...,,20191212
4,ScenarioMIP,CAS,FGOALS-g3,ssp245,r3i1p1f1,day,pr,gn,gs://cmip6/CMIP6/ScenarioMIP/CAS/FGOALS-g3/ssp...,,20191212
6,ScenarioMIP,CAS,FGOALS-g3,ssp245,r4i1p1f1,day,pr,gn,gs://cmip6/CMIP6/ScenarioMIP/CAS/FGOALS-g3/ssp...,,20191212


Gist to load in additional CMIP6 data residing in Julius' buckets:

```python
#ddict_manual = {k.split('/')[-1].replace('.zarr',''):xr.open_dataset(fs.get_mapper(k),engine='zarr',chunks={}) for k in fs.ls('leap-persistent/jbusecke/data/CMIP6/manual_test')}
ddict_manual = {k.split('/')[-1].replace('.zarr',''):xr.open_dataset(fs.get_mapper(k),engine='zarr',chunks={}) for k in fs.ls('leap-persistent/jbusecke/data/CMIP6/dataflow_test_production')}
```

Open datasets into dictionary:

In [4]:
#work-around for the aggregate=false bug (https://github.com/intake/intake-esm/issues/496):
#with groupby_attrs emptied, aggregate=True doesn't actually aggregate
cat_data.esmcat.aggregation_control.groupby_attrs = []

#keyword arguments for generating dictionary of datasets from cmip6 catalogue
kwargs = dict(
    zarr_kwargs=dict(consolidated=True, use_cftime=True),
    aggregate=True,
)

#open datasets into dictionary
ddict = cat_data.to_dataset_dict(**kwargs)


--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.member_id.table_id.variable_id.grid_label.zstore.dcpp_init_year.version'


  ddict = cat_data.to_dataset_dict(**kwargs) #open datasets into dictionary


In [5]:
#convert pr flux to accumulated pr (only when precipitation is the processed variable)
if variable == 'pr':
    ddict = pr_flux_to_m(ddict)

#clean-up steps, applied to the dictionary of datasets in this fixed order
for cleanup_step, step_args in (
    (drop_duplicate_timesteps, ()),          #CESM2-WACCM has duplicate timesteps
    (preselect_years, (1850, 2100)),         #some models have time series until post-2100, we exclude those here
    (drop_coords, (['bnds', 'nbnd'],)),      #drop the 'bnds' and 'nbnd' coordinates
):
    ddict = cleanup_step(ddict, *step_args)

In [6]:
#common grid used for analysis and to derive the MLR coefficients
#(approximately average grid resolution of included CMIP6 models); 1.5 degree spacing in both directions
target_grid = xr.Dataset(
    data_vars={
        "longitude": (["longitude"], np.arange(-30, 22.5, 1.5), {"units": "degrees_east"}),
        "latitude": (["latitude"], np.arange(30, 70.5, 1.5), {"units": "degrees_north"}),
    }
)

In [7]:
#join=outer pads NaNs which result in large chunks for timeseries that differ in length,
#hence large chunks are split while slicing
with dask.config.set({'array.slicing.split_large_chunks': True}):
    #group datasets of same model: datasets matching on all attributes below are merged
    ddict_merged = combine_datasets(
        ddict,
        merge_variables_aligning_lonlat,
        match_attrs=['source_id', 'grid_label', 'experiment_id', 'table_id', 'variant_label'],
    )

Do the subsetting at the target grid:

In [8]:
#special case for MPI-ESM1-2-HR: regrid its r1i1p1f1 ssp585 & historical simulations
#to the grid of the merged CanESM5 historical dataset and store the result
for key,ds in ddict_merged.items():
    if key not in ['MPI-ESM1-2-HR.gn.ssp585.day.r1i1p1f1','MPI-ESM1-2-HR.gn.historical.day.r1i1p1f1']:
        continue #all other datasets are left untouched here

    #define regridder for this model/grid (from the dataset before dcpp_init_year is dropped)
    regridder = xe.Regridder(
        ds,
        ddict_merged['CanESM5.gn.historical.day.r1i1p1f1'],
        'bilinear',
        ignore_degenerate=True,
        periodic=True,
    )

    ds = ds.isel(dcpp_init_year=0,drop=True) #get rid of dcpp_init_year dimension
    regridded_ds = regridder(ds,keep_attrs=True) #do the regridding
    #NOTE(review): rename_dims renames the dimensions only - confirm the lat/lon coordinates are meant to keep their names
    regridded_ds = regridded_ds.rename_dims({'lat':'latitude','lon':'longitude'})

    #store to leap-persistent as .zarr, in a directory named after the model
    model_path = os.path.join('leap-persistent/timh37/CMIP6/subsetted_data/'+variable+'_europe/',ds.source_id+'_CanESM5_grid')
    store_url = os.path.join('gs://',model_path,key.replace('.','_')+'.zarr')
    rechunked_ds = regridded_ds.chunk({'member_id':1,'longitude':5,'time':100000})
    rechunked_ds.to_zarr(store_url,mode='w') #mode='w' overwrites an existing store
    regridded_ds.close()
  

In [9]:
#wrap the longitudes of each merged dataset to [-180,180), sort them and remember the dictionary key
for key,ds in ddict_merged.items():
    lon_coord = [dim for dim in ds.dims if 'lon' in dim][0] #find lon coordinate name of CMIP6 model

    #the wrapping & sorting below may not be necessary if periodic=True in xe.Regridder?
    wrapped_lon = ((ds.coords[lon_coord] + 180) % 360) - 180 #wrap around 0
    ds.coords[lon_coord] = wrapped_lon
    ds = ds.reindex({lon_coord: np.sort(ds[lon_coord])}) #put the wrapped longitudes in ascending order

    ds.attrs["original_key"] = key #add key information to attributes
    ddict_merged[key] = ds

#new dictionary holding data for European subdomain
ddict_eu = defaultdict(dict)
#shallow copy that is emptied while grouping, so that ddict_merged itself stays intact
ds_dict = dict(ddict_merged)

while ds_dict: #<- copied from xmip's combine_datasets
    k = next(iter(ds_dict)) #first remaining key
    ds = ds_dict.pop(k)

    #find datasets belonging to same model/grid
    #(presumably the returned list starts with ds itself - confirm against xmip)
    matched_datasets = _match_datasets(ds, ds_dict, ['source_id', 'grid_label'], pop=True)
    reference_ds = matched_datasets[0]

    #define regridder for this model/grid, once per group
    regridder = xe.Regridder(reference_ds,target_grid,'bilinear',ignore_degenerate=True,periodic=True)

    for matched_ds in matched_datasets:
        #makes sure lon/lat coordinates of each model simulation are exactly the same
        aligned_ds = xr.align(
            reference_ds,
            matched_ds,
            join='override',
            exclude=['time','member_id','dcpp_init_year'],
        )[1]

        aligned_ds = aligned_ds.isel(dcpp_init_year=0,drop=True) #get rid of dcpp_init_year dimension
        ddict_eu[matched_ds.original_key] = regridder(aligned_ds,keep_attrs=True) #do the regridding

Store the dataset to leap-persistent share (directories structured per model):

In [10]:
#write every regridded dataset to its own .zarr store on leap-persistent (one directory per model)
for key,ds in tqdm(ddict_eu.items()):
    model_path = os.path.join('leap-persistent/timh37/CMIP6/subsetted_data/'+variable+'_europe/',ds.source_id)
    store_url = os.path.join('gs://',model_path,key.replace('.','_')+'.zarr') #dots in the key are replaced by underscores

    rechunked_ds = ds.chunk({'member_id':1,'longitude':5,'time':100000})
    rechunked_ds.to_zarr(store_url,mode='w') #mode='w' overwrites an existing store
    ds.close()

  0%|          | 0/395 [00:00<?, ?it/s]