In [1]:
import numpy as np
import xarray as xr
import dask
import intake
import pandas as pd
import os
from collections import defaultdict
from tqdm.autonotebook import tqdm
from xmip.utils import google_cmip_col
from cmip_catalogue_operations import reduce_cat_to_max_num_realizations, drop_vars_from_cat
import typing
import itertools

  from tqdm.autonotebook import tqdm


In [2]:
#combine_datasets() work around to merge variables into datasets that have not exactly matching coordinates but are supposed to have the same grid
def align_lonlat(ds_list):
    """Align every dataset in ``ds_list`` against the first dataset.

    Each dataset is passed pairwise with ``ds_list[0]`` to ``xr.align`` using
    ``join='override'``; the ``time`` and ``member_id`` dimensions are
    excluded from the alignment.

    Parameters
    ----------
    ds_list : list
        Datasets that are supposed to share the same grid.

    Returns
    -------
    list
        The aligned version of each entry of ``ds_list``, in the same order.
    """
    # Alignment is done pair by pair (the original author noted that passing
    # the whole list to xr.align at once did not seem to work); only the
    # second object returned by xr.align, i.e. the aligned `ds`, is kept.
    return [
        xr.align(ds_list[0], ds, join='override', exclude=['time', 'member_id'])[1]
        for ds in ds_list
    ]

def merge_variables_aligning_lonlat(ds_list):
    """Merge the datasets in ``ds_list`` after aligning them with ``align_lonlat``.

    The datasets are first aligned onto the first entry of ``ds_list`` so that
    same-dimension lon/lat coordinates are overridden rather than padded, and
    are then combined with ``xr.merge(..., join='outer', compat='override')``.

    Parameters
    ----------
    ds_list : list
        Datasets to merge.

    Returns
    -------
    The object returned by ``xr.merge``.
    """
    return xr.merge(align_lonlat(ds_list), join='outer', compat='override')

In [3]:
#borrowed from intake-esm so that I can apply this on the concatenation of different data catalogues
def _query_value_as_tuple(value):
    """Return a query value as a tuple, wrapping scalars in a 1-tuple."""
    # Strings are iterable, but `tuple('day')` would yield ('d', 'a', 'y');
    # treat them (and bytes) as single values.
    if isinstance(value, (str, bytes)):
        return (value,)
    try:
        return tuple(value)
    except TypeError:  # non-iterable scalar, e.g. an int
        return (value,)


def search_apply_require_all_on(
    *,
    df: pd.DataFrame,
    query: dict[str, typing.Any],
    require_all_on: typing.Union[str, list[typing.Any]],
    columns_with_iterables: typing.Optional[set] = None,
) -> pd.DataFrame:
    """Keep only the groups of ``df`` that provide every requested combination.

    ``df`` is grouped by the ``require_all_on`` column(s). A group is kept
    only if, for the query keys that are not part of ``require_all_on``, the
    group contains every element of the Cartesian product of the queried
    values.

    Parameters
    ----------
    df : pd.DataFrame
        Catalogue rows to filter.
    query : dict
        Maps column names to the requested value(s). A value may be a
        list-like or a single scalar (e.g. ``table_id='day'``).
    require_all_on : str or list
        Column name(s) to group by. Keys of ``query`` listed here are ignored
        when building the required combinations.
    columns_with_iterables : set, optional
        Columns whose cells hold iterables; those that are also query keys are
        unpacked (one row per element) before testing a group.

    Returns
    -------
    pd.DataFrame
        The concatenated rows of all complete groups with a fresh integer
        index, or an empty frame with the columns of ``df`` if no group
        qualifies.
    """
    # A bare column name must be handled as one column; iterating over the
    # string would walk through its characters instead.
    if isinstance(require_all_on, str):
        require_all_on = [require_all_on]
    else:
        require_all_on = list(require_all_on)

    # Columns that define the groups must not take part in the product below.
    _query = {key: value for key, value in query.items() if key not in require_all_on}

    keys = list(_query)
    values = [_query_value_as_tuple(value) for value in _query.values()]
    condition = set(itertools.product(*values))
    iterable_keys = set(columns_with_iterables or ()).intersection(keys)
    query_results = []

    for _, group in df.groupby(require_all_on):
        if keys:
            group_for_index = group
            # Unpack iterables to get a testable index.
            for column in iterable_keys:
                group_for_index = group_for_index.explode(column)

            index = group_for_index.set_index(keys).index
            if isinstance(index, pd.MultiIndex):
                present = set(index.to_list())
            else:
                present = {(element,) for element in index.to_list()}
        else:
            # No query keys left: the only required combination is the empty
            # tuple, which every group trivially provides.
            present = {()}

        if condition.issubset(present):  # with iterables we could have more than requested
            query_results.append(group)

    if query_results:
        return pd.concat(query_results).reset_index(drop=True)

    return pd.DataFrame(columns=df.columns)

In [4]:
gcol = google_cmip_col() #xmip wrapper
ccol = intake.open_esm_datastore("https://storage.googleapis.com/leap-persistent-ro/data-library/catalogs/cmip6-test/leap-pangeo-cmip6-test.json")

variable = 'sfcWind' #variable to obtain data for
query_vars = ['sfcWind','pr','psl'] #variables models simulations are required to provide
experiments = ['historical','ssp245'] # NOTE(review): not referenced by the code in this cell

def build_ssp_catalogue(ssp, required_vars):
    """Build the combined catalogue for 'historical' plus one SSP experiment.

    Both data catalogues (``gcol`` and ``ccol``) are searched for daily data of
    ``required_vars``. Their dataframes are concatenated and de-duplicated, and
    only the source_id/member_id/grid_label groups that provide every
    requested experiment/variable combination are kept.

    Parameters
    ----------
    ssp : str
        Scenario experiment_id, e.g. 'ssp245'.
    required_vars : list of str
        variable_ids that each retained group must provide.

    Returns
    -------
    The search result of ``gcol`` with its dataframe replaced by the filtered,
    combined dataframe.
    """
    ssp_experiments = ['historical', ssp]
    search_kwargs = dict(experiment_id=ssp_experiments, table_id='day', variable_id=required_vars)

    ssp_cat = gcol.search(**search_kwargs)
    extra_cat = ccol.search(**search_kwargs)

    #combine the dataframes of the two catalogues
    ssp_cat.esmcat._df = pd.concat([ssp_cat.df, extra_cat.df], ignore_index=True).drop_duplicates(ignore_index=True)
    #the SSPs are treated separately so that the 'require all on' requirement is applied per scenario
    ssp_cat.esmcat._df = search_apply_require_all_on(
        df=ssp_cat.df,
        query=dict(experiment_id=ssp_experiments, table_id=['day'], variable_id=required_vars),
        require_all_on=['source_id', 'member_id', 'grid_label'],
    )
    return ssp_cat

cat_ssp245 = build_ssp_catalogue('ssp245', query_vars)
cat_ssp585 = build_ssp_catalogue('ssp585', query_vars)

#merge the per-scenario catalogues; 'historical' rows occur in both, hence the drop_duplicates
cat = cat_ssp585
cat.esmcat._df = pd.concat([cat_ssp245.df,cat_ssp585.df],ignore_index=True).drop_duplicates(ignore_index=True)
cat = reduce_cat_to_max_num_realizations(cat) #per model, select grid and 'ipf' combination providing most realizations

#report the number of members available per model and scenario
cat_df = cat.df
for source_id in cat_df.source_id.unique():
    print(source_id)
    for ssp in ['ssp245','ssp585']:
        print(ssp)
        in_model_and_ssp = (cat_df.source_id==source_id) & (cat_df.experiment_id==ssp)
        print(len(cat_df[in_model_and_ssp].member_id.unique()))

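To deal with duplicate entries that differ only in their version, keep the newest version of each. This is done by converting the versions to integers, sorting the catalogue by version in descending order, and dropping duplicates, which keeps the first (i.e. newest) entry of each dataset.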

In [5]:
def _keep_newest_versions(df):
    """Return ``df`` with integer versions, keeping the newest version per dataset.

    Version entries that are not integers already (e.g. 'v20190429') are
    converted by stripping the 'v' and casting to int. Rows are then sorted by
    version in descending order and de-duplicated on the columns identifying a
    dataset, so the highest version of each dataset is the one kept. The
    original index labels and their order are restored at the end.

    Parameters
    ----------
    df : pd.DataFrame
        Catalogue dataframe containing a 'version' column and the identifying
        columns listed below.

    Returns
    -------
    pd.DataFrame
        New dataframe; ``df`` itself is not modified.
    """
    def _to_int(version):
        # numpy integers are not instances of the built-in int, so they are
        # checked explicitly instead of being sent to str.replace().
        if isinstance(version, (int, np.integer)):
            return int(version)
        return int(version.replace('v',''))

    id_columns = ['activity_id','institution_id','source_id','experiment_id','member_id','table_id','variable_id','grid_label']

    # Column-wise conversion: does not depend on the index being 0..n-1,
    # unlike a positional loop over .loc[i, 'version'].
    versioned = df.assign(version=df['version'].map(_to_int))

    # mergesort is a stable sort, so the choice between duplicates that share
    # the same version is reproducible between runs.
    return (
        versioned
        .sort_values(by='version', ascending=False, kind='mergesort')
        .drop_duplicates(subset=id_columns)
        .sort_index()
    )

cat.esmcat._df = _keep_newest_versions(cat.df)

In [None]:
#test loading in data for a single variable
#rebuild the list instead of calling query_vars.remove(variable): remove() raises a
#ValueError once 'variable' is gone, i.e. whenever this cell is run a second time
query_vars = [query_var for query_var in query_vars if query_var != variable]
cat = drop_vars_from_cat(cat,query_vars) #only process data for 'variable'

#to circumvent the aggregate=False bug (https://github.com/intake/intake-esm/issues/496):
#with an empty groupby_attrs list nothing is actually aggregated, even though aggregate=True is passed below
cat.esmcat.aggregation_control.groupby_attrs = []

kwargs = {'zarr_kwargs':{'consolidated':True,'use_cftime':True},'aggregate':True} #keyword arguments for generating dictionary of datasets from cmip6 catalogue
ddict = cat.to_dataset_dict(**kwargs) #open datasets into dictionary


  ddict = cat.to_dataset_dict(**kwargs) #open datasets into dictionary



--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.member_id.table_id.variable_id.grid_label.zstore.dcpp_init_year.version'


Loading in works.