In [1]:
'''Script to regrid CMIP6 datasets to a target grid and store them.'''

import numpy as np
import xarray as xr
import dask
import intake
import pandas as pd
import os
from collections import defaultdict
from tqdm.autonotebook import tqdm
from xmip.utils import google_cmip_col
from xmip.postprocessing import combine_datasets, _match_datasets,_concat_sorted_time
from cmip_catalogue_operations import reduce_cat_to_max_num_realizations, drop_vars_from_cat, drop_older_versions
from cmip_ds_dict_operations import select_period, pr_flux_to_m, drop_duplicate_timesteps, drop_coords, drop_incomplete
import xesmf as xe
import gcsfs
fs = gcsfs.GCSFileSystem() #Google Cloud Storage filesystem handle; notes: list stores, strip zarr from filename, load

  from tqdm.autonotebook import tqdm


#list the entries under the ACCESS-CM2 directory of the subsetted 'pr_europe' data
fs.ls('leap-persistent/timh37/CMIP6/subsetted_data/pr_europe/ACCESS-CM2')

#open one zarr store from that directory as an xarray dataset
test = xr.open_dataset(
    'gs://leap-persistent/timh37/CMIP6/subsetted_data/pr_europe/ACCESS-CM2/ACCESS-CM2_gn_historical_day_r1i1p1f1.zarr',
    engine='zarr',
)

In [2]:
#settings for this notebook

#whether or not to process files for which output already exists (to-do: implement)
overwrite_existing = True

#grid to interpolate CMIP6 simulations to:
#longitude runs from -30 up to (but excluding) 22.5 and latitude from 70 down to (but excluding) 30, both in steps of 1.5
target_grid = xr.Dataset(
    data_vars=dict(
        longitude=(["longitude"], np.arange(-30, 22.5, 1.5), {"units": "degrees_east"}),
        latitude=(["latitude"], np.arange(70, 30, -1.5), {"units": "degrees_north"}),
    )
)

#variables to process
query_vars = ['sfcWind', 'pr', 'psl']
#variables that included models should provide
required_vars = ['sfcWind', 'pr', 'psl']

#scenarios to process (each is combined with the historical experiment)
ssps = ['ssp245', 'ssp585']

In [3]:
#query simulations & manipulate data catalogue:
col = google_cmip_col() #google cloud catalogue
lcol = intake.open_esm_datastore("https://storage.googleapis.com/leap-persistent-ro/data-library/catalogs/cmip6-test/leap-pangeo-cmip6-test.json") #temporary pangeo-leap-forge catalogue
col.esmcat._df = pd.concat([col.df,lcol.df],ignore_index=True) #merge these catalogues

ssp_cats = defaultdict(dict)

#search catalogue per ssp (need to do this for each SSP separately as availability may differ between them)
for ssp in ssps:
    #find instances providing all required_vars for both historical & ssp experiments
    ssp_cats[ssp] = col.search(
        experiment_id=['historical',ssp],
        table_id='day',
        variable_id=required_vars,
        require_all_on=['source_id', 'member_id','grid_label'])

#merge catalogues for all ssps, and drop duplicate historical simulations
#the catalogue of the last ssp is reused as the container for the merged table (its dataframe is overwritten in place);
#it is selected explicitly through ssps[-1] rather than through the loop variable left over from the loop above
ssp_cats_merged = ssp_cats[ssps[-1]]
ssp_cats_merged.esmcat._df = pd.concat([cat.df for cat in ssp_cats.values()],ignore_index=True).drop_duplicates(ignore_index=True)

ssp_cats_merged = drop_older_versions(ssp_cats_merged) #if google cloud and leap-pangeo catalogues provide duplicate datasets, keep the newest version, and if the versions are identical, keep the leap-pangeo dataset
ssp_cats_merged = reduce_cat_to_max_num_realizations(ssp_cats_merged) #per model, select grid and 'ipf' combination providing most realizations (needs to be applied to both SSPs together to ensure the same variants are used under both scenarios)
ssp_cats_merged = drop_vars_from_cat(ssp_cats_merged,[k for k in required_vars if k not in query_vars]) #out of required variables only process query variables

In [4]:
for ssp in tqdm(ssps): #for each ssp (iterating over the list itself, so that tqdm knows the total number of steps):
    #select historical and ssp data in merged catalogue for this particular ssp
    #search on query_vars rather than required_vars: required variables that are not queried have already been
    #dropped from the merged catalogue, so requiring them here (require_all_on) would leave no matches
    cat_to_open = ssp_cats_merged.search(
        experiment_id=['historical',ssp],
        table_id='day',
        variable_id=query_vars,
        require_all_on=['source_id', 'member_id','grid_label'])

    #open datasets into dictionary
    cat_to_open.esmcat.aggregation_control.groupby_attrs = [] #to circumvent aggregate=false bug

    #to avoid this issue: https://github.com/intake/intake-esm/issues/496
        #doesn't actually aggregate if we set cmip6_cat.esmcat.aggregation_control.groupby_attrs = []
    kwargs = {'zarr_kwargs':{'consolidated':True,'use_cftime':True},'aggregate':True} #keyword arguments for generating dictionary of datasets from cmip6 catalogue
    ddict = cat_to_open.to_dataset_dict(**kwargs) #open datasets into dictionary

    #preprocess datasets in dictionary
    ddict = pr_flux_to_m(ddict) #convert pr flux to accumulated pr
    ddict = drop_duplicate_timesteps(ddict) #remove duplicate timesteps if datasets have them
    #ddict = select_period(ddict,1850,2100) #preselect time periods, do this at later stage in the chain?
    ddict = drop_coords(ddict,['bnds','nbnd','height']) #remove some unused auxiliary coordinates

    #concatenate historical and ssp datasets in time
    with dask.config.set(**{'array.slicing.split_large_chunks': True}):
        hist_ssp = combine_datasets(ddict,_concat_sorted_time,match_attrs =['source_id', 'grid_label','table_id','variant_label','variable_id'],combine_func_kwargs={'join':'inner','coords':'minimal'})

    #keep only the concatenated datasets whose first and last timestamps have the same dtype
    #(probably a better way to do this, but for approx. 1 file the time units are inconsistent between historical and ssp)
    hist_ssp_ = defaultdict(dict)
    for k,v in hist_ssp.items():
        if v.time[-1].values.dtype != v.time[0].values.dtype:
            print('dropping ' + k +' due to inconsistent timestamps in historical and ssp runs')
            continue
        hist_ssp_[k] = v

    hist_ssp_ = drop_duplicate_timesteps(hist_ssp_) #remove overlap between historical and ssp experiments which sometimes exists
    hist_ssp_complete = drop_incomplete(hist_ssp_) #remove historical+ssp timeseries which are not monotonically increasing or have large timegaps (based on Julius Busecke's rudimentary testing in CMIP6-LEAP-feedstock)

    #regrid these datasets to the target grid
    hist_ssp_eu = defaultdict(dict)
    for key,ds in tqdm(hist_ssp_complete.items()):
        ds.attrs["time_concat_key"] = key #add current key information to attributes
        ds = ds.isel(dcpp_init_year=0,drop=True) #remove this coordinate

        regridder = xe.Regridder(ds,target_grid,'bilinear',ignore_degenerate=True,periodic=True) #create regridder for this dataset
        try:
            hist_ssp_eu[key] = regridder(ds,keep_attrs=True) #apply regridder
        except Exception: #issue with 1 dataset that is chunked along two dimensions, rechunk that (KeyboardInterrupt/SystemExit are deliberately not caught)
            hist_ssp_eu[key] = regridder(ds.chunk({'time':100,'lat':1000,'lon':1000}),keep_attrs=True)

    #storage (to-do..)
    #for key,ds in tqdm(ddict_eu.items()):
    #    model_path = os.path.join('leap-persistent/timh37/CMIP6/subsetted_data/'+variable+'_europe/',ds.source_id) #store to leap-persistent
    #    ds.chunk({'member_id':1,'longitude':5,'time':100000}).to_zarr(os.path.join('gs://',model_path,key.replace('.','_')+'.zarr'),mode='w') #store to leap-persistent as .zarr
    #    ds.close()
    

0it [00:00, ?it/s]


--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.member_id.table_id.variable_id.grid_label.zstore.dcpp_init_year.version'


Dropping duplicate timesteps for:ScenarioMIP.NCAR.CESM2-WACCM.ssp585.r1i1p1f1.day.psl.gn.gs://cmip6/CMIP6/ScenarioMIP/NCAR/CESM2-WACCM/ssp585/r1i1p1f1/day/psl/gn/v20200702/.20200702
Dropping duplicate timesteps for:ScenarioMIP.NCAR.CESM2-WACCM.ssp585.r1i1p1f1.day.pr.gn.gs://cmip6/CMIP6/ScenarioMIP/NCAR/CESM2-WACCM/ssp585/r1i1p1f1/day/pr/gn/v20200702/.20200702
Dropping duplicate timesteps for:ScenarioMIP.NCAR.CESM2-WACCM.ssp585.r1i1p1f1.day.sfcWind.gn.gs://cmip6/CMIP6/ScenarioMIP/NCAR/CESM2-WACCM/ssp585/r1i1p1f1/day/sfcWind/gn/v20200702/.20200702
Dropping duplicate timesteps for:CMIP.NCAR.CESM2-WACCM.historical.r3i1p1f1.day.psl.gn.gs://cmip6/CMIP6/CMIP/NCAR/CESM2-WACCM/historical/r3i1p1f1/day/psl/gn/v20190227/.20190227
Dropping duplicate timesteps for:CMIP.NCAR.CESM2-WACCM.historical.r2i1p1f1.day.psl.gn.gs://cmip6/CMIP6/CMIP/NCAR/CESM2-WACCM/historical/r2i1p1f1/day/psl/gn/v20190227/.20190227
Dropping duplicate timesteps for:CESM2.gn.day.r11i1p1f1.sfcWind
Dropping duplicate timesteps for

  0%|          | 0/605 [00:00<?, ?it/s]

Calculate total size of datasets

#add up the size of every dataset in hist_ssp, expressed in units of 1e9 bytes
x = sum(ds.nbytes/1000000000 for ds in hist_ssp.values())
x

#group datasets of same model: entries of ddict that share the attributes listed in match_attrs are combined
#NOTE(review): merge_variables_aligning_lonlat is not among the imports visible at the top of this notebook - confirm where it is defined
with dask.config.set(**{'array.slicing.split_large_chunks': True}): #join=outer pads NaNs which result in large chunks for timeseries that differ in length
    ddict_merged = combine_datasets(
        ddict,
        merge_variables_aligning_lonlat,
        match_attrs=['source_id', 'grid_label', 'experiment_id', 'table_id','variant_label'],
    )

#build the data catalogue: google cloud catalogue extended with the rows of the temporary pangeo-leap-forge catalogue
col = google_cmip_col()
lcol = intake.open_esm_datastore("https://storage.googleapis.com/leap-persistent-ro/data-library/catalogs/cmip6-test/leap-pangeo-cmip6-test.json")
col.esmcat._df = pd.concat([col.df,lcol.df],ignore_index=True)

#search arguments shared by both scenarios: daily data providing all required_vars per model, member and grid
common_search = dict(
    table_id='day',
    variable_id=required_vars,
    require_all_on=['source_id', 'member_id','grid_label'],
)

#search per SSP (done separately as availability may differ between them), each together with the historical experiment
cat_data_ssp245 = col.search(experiment_id=['historical','ssp245'], **common_search)
cat_data_ssp585 = col.search(experiment_id=['historical','ssp585'], **common_search)

#merge SSPs and remove duplicate historical simulations
#(cat_data is the same object as cat_data_ssp585; its table is replaced by the merged one)
cat_data = cat_data_ssp585
merged_rows = pd.concat([cat_data_ssp245.df,cat_data_ssp585.df],ignore_index=True)
cat_data.esmcat._df = merged_rows.drop_duplicates(ignore_index=True)

#if google cloud and leap-pangeo catalogues provide duplicate datasets, keep the newest version, and if the versions are identical, keep the leap-pangeo dataset
cat_data = drop_older_versions(cat_data)
#per model, select grid and 'ipf' combination providing most realizations (needs to be applied to both SSPs together to ensure the same variants are used under both scenarios)
cat_data = reduce_cat_to_max_num_realizations(cat_data)
#out of required variables only process query variables
unprocessed_vars = [k for k in required_vars if k not in query_vars]
cat_data = drop_vars_from_cat(cat_data,unprocessed_vars)

Store the dataset to leap-persistent share (directories structured per model):

#NOTE(review): ddict_eu and variable are not defined in the cells visible in this notebook - confirm where they come from
for key,ds in tqdm(ddict_eu.items()):
    #one directory per model (source_id) inside the '<variable>_europe' directory on leap-persistent
    model_path = os.path.join('leap-persistent/timh37/CMIP6/subsetted_data/'+variable+'_europe/',ds.source_id)
    #store name: the dictionary key with '.' replaced by '_', with a '.zarr' suffix
    store_url = os.path.join('gs://',model_path,key.replace('.','_')+'.zarr')
    rechunked = ds.chunk({'member_id':1,'longitude':5,'time':100000})
    rechunked.to_zarr(store_url,mode='w') #store to leap-persistent as .zarr
    ds.close()