In [1]:
import numpy as np
import xarray as xr
import dask
import intake
import pandas as pd
import os
from collections import defaultdict
from tqdm.autonotebook import tqdm
from xmip.utils import google_cmip_col
from cmip_catalogue_operations import reduce_cat_to_max_num_realizations, drop_vars_from_cat
import typing
import itertools

  from tqdm.autonotebook import tqdm


In [3]:
# Borrowed from intake-esm so that it can be applied to the concatenation of
# different data catalogues.
def search_apply_require_all_on(
    *,
    df: pd.DataFrame,
    query: dict[str, typing.Any],
    require_all_on: typing.Union[str, list[typing.Any]],
    columns_with_iterables: set = None,
) -> pd.DataFrame:
    """Keep only the groups of ``df`` that satisfy every combination in ``query``.

    ``df`` is grouped by the ``require_all_on`` column(s). A group is kept
    when, for the remaining query columns, every combination of requested
    values (their Cartesian product) occurs in at least one row of the group.

    Parameters
    ----------
    df : pd.DataFrame
        Catalogue rows to filter.
    query : dict
        Mapping of column name to the requested value(s). A scalar value
        (including a string) is treated as a single requested value.
    require_all_on : str or list
        Column name(s) defining the groups. Any of these columns that also
        appear in ``query`` are ignored when building the combinations.
    columns_with_iterables : set, optional
        Columns whose cells hold iterables; these are expanded to one row
        per element before the membership test.

    Returns
    -------
    pd.DataFrame
        The rows of all matching groups with a fresh 0..n-1 index, or an
        empty frame with the columns of ``df`` when no group matches.
    """
    # A bare string would otherwise be iterated character by character below,
    # leaving the grouping column inside the query so that no group can match.
    if isinstance(require_all_on, str):
        require_all_on = [require_all_on]
    else:
        require_all_on = list(require_all_on)

    # The grouping columns must not take part in the combination test: each
    # group only ever holds a single value for them.
    _query = query.copy()
    for column in require_all_on:
        _query.pop(column, None)

    keys = list(_query.keys())
    grouped = df.groupby(require_all_on)
    # Wrap scalars so that e.g. "pr" is one requested value, not ("p", "r").
    values = [
        tuple(v) if pd.api.types.is_list_like(v) else (v,)
        for v in _query.values()
    ]
    condition = set(itertools.product(*values))
    iterable_keys = set(columns_with_iterables or ()).intersection(keys)
    query_results = []

    for _, group in grouped:
        group_for_index = group
        # Unpack iterables to get a testable index (one row per element).
        for column in iterable_keys:
            group_for_index = group_for_index.explode(column)

        if keys:
            index = group_for_index.set_index(keys).index
            if isinstance(index, pd.MultiIndex):
                present = set(index.to_list())
            else:
                present = {(element,) for element in index.to_list()}
        else:
            # Nothing left to require: the only combination is the empty one.
            present = {()}

        # With iterables a group may contain more than was requested.
        if condition.issubset(present):
            query_results.append(group)

    if query_results:
        return pd.concat(query_results).reset_index(drop=True)

    return pd.DataFrame(columns=df.columns)

In [4]:
# Open the LEAP/Pangeo CMIP6 test catalogue.
ccol = intake.open_esm_datastore(
    "https://storage.googleapis.com/leap-persistent-ro/data-library/catalogs/cmip6-test/leap-pangeo-cmip6-noqc-test.json"
)
variable = 'sfcWind' #variable to obtain data for
query_vars = ['sfcWind','pr','psl'] #variables models simulations are required to provide
experiments = ['historical','ssp585']

# Daily-table entries for the experiments listed above. The list is taken from
# `experiments` so the two cannot drift apart.
# NOTE(review): despite its name, `cat_ssp245` holds historical + ssp585 entries;
# the name is kept because later cells reference it.
cat_ssp245 = ccol.search(experiment_id=experiments,table_id='day')


In [5]:
# Display the search result as a DataFrame (one row per catalogue entry).
cat_ssp245.df

Unnamed: 0,activity_id,institution_id,source_id,experiment_id,member_id,table_id,variable_id,grid_label,zstore,dcpp_init_year,version
0,CMIP,MIROC,MIROC6,historical,r35i1p1f1,day,psl,gn,gs://leap-persistent-ro/data-library/cmip6-tes...,,v20200519
1,ScenarioMIP,EC-Earth-Consortium,EC-Earth3,ssp585,r132i1p1f1,day,pr,gr,gs://leap-persistent-ro/data-library/cmip6-tes...,,v20200412
2,ScenarioMIP,MPI-M,MPI-ESM1-2-LR,ssp585,r29i1p1f1,day,psl,gn,gs://leap-persistent-ro/data-library/cmip6-tes...,,v20210901
3,CMIP,EC-Earth-Consortium,EC-Earth3,historical,r108i1p1f1,day,pr,gr,gs://leap-persistent-ro/data-library/cmip6-tes...,,v20200412
4,CMIP,EC-Earth-Consortium,EC-Earth3,historical,r139i1p1f1,day,psl,gr,gs://leap-persistent-ro/data-library/cmip6-tes...,,v20200412
...,...,...,...,...,...,...,...,...,...,...,...
310,CMIP,EC-Earth-Consortium,EC-Earth3,historical,r121i1p1f1,day,psl,gr,gs://leap-persistent-ro/data-library/cmip6-tes...,,v20200412
311,CMIP,EC-Earth-Consortium,EC-Earth3,historical,r139i1p1f1,day,pr,gr,gs://leap-persistent-ro/data-library/cmip6-tes...,,v20200412
312,ScenarioMIP,MIROC,MIROC-ES2L,ssp585,r1i1p1f2,day,sfcWind,gn,gs://leap-persistent-ro/data-library/cmip6-tes...,,v20220530
313,CMIP,MPI-M,MPI-ESM1-2-LR,historical,r21i1p1f1,day,sfcWind,gn,gs://leap-persistent-ro/data-library/cmip6-tes...,,v20210901


In [6]:
# Keyword arguments for generating the dictionary of datasets from the catalogue.
kwargs = {
    'zarr_kwargs': {
        'consolidated': True,
        'use_cftime': True,
    },
    'aggregate': True,
}

# Workaround for the aggregate=False bug, see
# https://github.com/intake/intake-esm/issues/496
# With an empty groupby_attrs list nothing is actually aggregated, even though
# aggregate=True is passed.
cat_ssp245.esmcat.aggregation_control.groupby_attrs = []

# Open the datasets into a dictionary.
ddict = cat_ssp245.to_dataset_dict(**kwargs)



--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.member_id.table_id.variable_id.grid_label.zstore.dcpp_init_year.version'


In [7]:
# Inspect the dictionary of opened datasets (dataset key -> xarray.Dataset).
ddict

{'CMIP.EC-Earth-Consortium.EC-Earth3.historical.r134i1p1f1.day.sfcWind.gr.gs://leap-persistent-ro/data-library/cmip6-testing/a618127503-6112919295-1/CMIP6.CMIP.EC-Earth-Consortium.EC-Earth3.historical.r134i1p1f1.day.sfcWind.gr.v20200412.zarr.v20200412': <xarray.Dataset>
 Dimensions:         (lat: 256, bnds: 2, lon: 512, member_id: 1,
                      dcpp_init_year: 1, time: 16436)
 Coordinates:
     height          float64 ...
   * lat             (lat) float64 -89.46 -88.77 -88.07 ... 88.07 88.77 89.46
     lat_bnds        (lat, bnds) float64 dask.array<chunksize=(256, 2), meta=np.ndarray>
   * lon             (lon) float64 0.0 0.7031 1.406 2.109 ... 357.9 358.6 359.3
     lon_bnds        (lon, bnds) float64 dask.array<chunksize=(512, 2), meta=np.ndarray>
   * time            (time) object 1970-01-01 12:00:00 ... 2014-12-31 12:00:00
     time_bnds       (time, bnds) object dask.array<chunksize=(288, 2), meta=np.ndarray>
   * member_id       (member_id) object 'r134i1p1f1'
   * d

In [31]:
# Print the global attributes of every opened dataset.
for key in ddict:
    ds = ddict[key]
    print(ds.attrs)

{'Conventions': 'CF-1.7 CMIP-6.2', 'activity_id': 'CMIP', 'branch_method': 'standard', 'branch_time_in_child': 0.0, 'branch_time_in_parent': 134685.0, 'cmor_version': '3.5.0', 'contact': 'Lijuan Li (ljli@mail.iap.ac.cn)', 'data_specs_version': '01.00.31', 'experiment': 'all-forcing simulation of the recent past', 'experiment_id': 'historical', 'external_variables': 'areacella', 'forcing_index': 1, 'frequency': 'day', 'further_info_url': 'https://furtherinfo.es-doc.org/CMIP6.CAS.FGOALS-g3.historical.none.r3i1p1f1', 'grid': 'native atmosphere area-weighted latxlon grid (80x180 latxlon)', 'grid_label': 'gn', 'initialization_index': 1, 'institution': 'Chinese Academy of Sciences, Beijing 100029, China', 'institution_id': 'CAS', 'license': 'CMIP6 model data produced by Lawrence Livermore PCMDI is licensed under a Creative Commons Attribution ShareAlike 4.0 International License (https://creativecommons.org/licenses). Consult https://pcmdi.llnl.gov/CMIP6/TermsOfUse for terms of use governing

In [22]:
# Inspect the dictionary of opened datasets again.
# NOTE(review): this cell's execution count is lower than the previous cell's,
# so the saved output may stem from an earlier kernel state - confirm with
# Restart Kernel -> Run All.
ddict

{'CMIP.CAS.FGOALS-g3.historical.r4i1p1f1.day.psl.gn.gs://leap-persistent-ro/data-library/cmip6-testing/a618127503-6091056444-4/CMIP6.CMIP.CAS.FGOALS-g3.historical.r4i1p1f1.day.psl.gn.v20191029.zarr.v20191029': <xarray.Dataset>
 Dimensions:         (lat: 80, bnds: 2, lon: 180, member_id: 1,
                      dcpp_init_year: 1, time: 60225)
 Coordinates:
   * lat             (lat) float64 -90.0 -84.82 -80.72 ... 80.72 84.82 90.0
     lat_bnds        (lat, bnds) float64 dask.array<chunksize=(80, 2), meta=np.ndarray>
   * lon             (lon) float64 0.0 2.0 4.0 6.0 ... 352.0 354.0 356.0 358.0
     lon_bnds        (lon, bnds) float64 dask.array<chunksize=(180, 2), meta=np.ndarray>
   * time            (time) object 1850-01-01 12:00:00 ... 2014-12-31 12:00:00
     time_bnds       (time, bnds) object dask.array<chunksize=(2409, 2), meta=np.ndarray>
   * member_id       (member_id) object 'r4i1p1f1'
   * dcpp_init_year  (dcpp_init_year) float64 nan
 Dimensions without coordinates: bnds
 