In [6]:
import numpy as np
import xarray as xr
import dask
import intake
import pandas as pd
import os
from collections import defaultdict
from tqdm.autonotebook import tqdm
from xmip.utils import google_cmip_col
from cmip_catalogue_operations import reduce_cat_to_max_num_realizations, drop_vars_from_cat
from cmip_ds_dict_operations import drop_incomplete
import typing
import itertools
import gcsfs
# Google Cloud Storage filesystem handle; it is not used by the cells visible in this part of the notebook.
fs = gcsfs.GCSFileSystem() #list stores, strip zarr from filename, load

In [3]:
gcol = google_cmip_col() #xmip wrapper
ccol = intake.open_esm_datastore("https://storage.googleapis.com/leap-persistent-ro/data-library/catalogs/cmip6-test/leap-pangeo-cmip6-test.json")
# Extend the catalogue returned by google_cmip_col() with the rows of the LEAP test catalogue.
# NOTE: this overwrites the private `_df` attribute of the catalogue model.
gcol.esmcat._df = pd.concat([gcol.df,ccol.df],ignore_index=True)

variable = 'sfcWind' #variable to obtain data for
query_vars = ['sfcWind','pr','psl'] #variables models simulations are required to provide
experiments = ['historical','ssp245']


def search_daily_with_historical(scenario):
    """Search `gcol` for daily data of all `query_vars` in 'historical' and `scenario`.

    Parameters
    ----------
    scenario : str
        Future experiment_id to pair with 'historical' (e.g. 'ssp245').

    Returns
    -------
    The catalogue returned by `gcol.search`, restricted to instances
    (source_id, member_id, grid_label) providing all required query_vars for
    both the historical and the `scenario` experiment.
    """
    return gcol.search(
        experiment_id=['historical', scenario],
        table_id='day',
        variable_id=query_vars,
        require_all_on=['source_id', 'member_id', 'grid_label'])


cat_ssp245 = search_daily_with_historical('ssp245') #instances with historical & ssp245
cat_ssp585 = search_daily_with_historical('ssp585') #instances with historical & ssp585

# `cat` is the same object as `cat_ssp245`, so the assignment below also replaces
# the dataframe seen through `cat_ssp245`.
cat = cat_ssp245
combined_df = pd.concat([cat_ssp245.df, cat_ssp585.df], ignore_index=True)
cat.esmcat._df = combined_df.drop_duplicates(ignore_index=True) #combine the dataframes of the two catalogues
cat = reduce_cat_to_max_num_realizations(cat) #per model, select grid and 'ipf' combination providing most realizations

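To deal with duplicate entries that differ only in their version, keep the newest version of each. This is done by converting the versions to integers, sorting them in descending order, and dropping duplicates so that only the first (newest) entry of each dataset is kept.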

In [4]:
def _version_to_int(version):
    """Return a catalogue version as a plain integer (e.g. 'v20190429' -> 20190429).

    Integer versions (Python or NumPy integers) are passed through; anything
    else is converted to a string, stripped of 'v' and parsed with int().
    """
    if isinstance(version, (int, np.integer)):
        return int(version)
    return int(str(version).replace('v',''))


# Columns that identify a dataset irrespective of its version.
dedup_keys = ['activity_id','institution_id','source_id','experiment_id','member_id','table_id','variable_id','grid_label']

# Convert all versions in one vectorised pass; unlike a positional loop with
# .loc[i, ...] this does not require the index labels to be 0..n-1.
versions_as_int = cat.df['version'].map(_version_to_int)

# Sort newest first so that drop_duplicates (which keeps the first occurrence)
# retains the newest version of each dataset, then restore the original row order.
cat.esmcat._df = (
    cat.df
    .assign(version=versions_as_int)
    .sort_values(by='version', ascending=False)
    .drop_duplicates(subset=dedup_keys)
    .sort_index()
)

In [5]:
# Print, for every model, the number of distinct members available per scenario.
for model_name in cat.df.source_id.unique():
    print(model_name)
    model_rows = cat.df[cat.df.source_id == model_name]
    for scenario in ('ssp245', 'ssp585'):
        scenario_members = model_rows.loc[model_rows.experiment_id == scenario, 'member_id']
        print(scenario)
        print(scenario_members.nunique(dropna=False))

ACCESS-CM2
ssp245
5
ssp585
7
ACCESS-ESM1-5
ssp245
40
ssp585
40
CESM2
ssp245
2
ssp585
2
CESM2-WACCM
ssp245
3
ssp585
3
CMCC-CM2-SR5
ssp245
1
ssp585
1
CMCC-ESM2
ssp245
1
ssp585
1
CanESM5
ssp245
25
ssp585
25
EC-Earth3
ssp245
63
ssp585
26
EC-Earth3-Veg
ssp245
1
ssp585
0
FGOALS-g3
ssp245
2
ssp585
0
GFDL-CM4
ssp245
1
ssp585
1
GFDL-ESM4
ssp245
1
ssp585
1
HadGEM3-GC31-LL
ssp245
5
ssp585
4
HadGEM3-GC31-MM
ssp245
0
ssp585
4
IITM-ESM
ssp245
1
ssp585
1
INM-CM4-8
ssp245
1
ssp585
1
INM-CM5-0
ssp245
1
ssp585
1
IPSL-CM6A-LR
ssp245
11
ssp585
7
KACE-1-0-G
ssp245
3
ssp585
3
MIROC-ES2L
ssp245
10
ssp585
1
MIROC6
ssp245
45
ssp585
50
MPI-ESM1-2-HR
ssp245
2
ssp585
2
MPI-ESM1-2-LR
ssp245
24
ssp585
24
MRI-ESM2-0
ssp245
5
ssp585
5
NorESM2-LM
ssp245
3
ssp585
1
NorESM2-MM
ssp245
2
ssp585
1
TaiESM1
ssp245
1
ssp585
1
UKESM1-0-LL
ssp245
5
ssp585
5


In [6]:
#test loading in data for a single variable
# Rebind query_vars to the variables to drop instead of calling list.remove():
# the end state is the same, but re-running this cell no longer raises ValueError.
query_vars = [var for var in query_vars if var != variable]
cat = drop_vars_from_cat(cat,query_vars) #only process data for 'variable'

# Work-around for the aggregate=False bug, see https://github.com/intake/intake-esm/issues/496:
# with an empty groupby_attrs list nothing is actually aggregated, even though 'aggregate' is True below.
cat.esmcat.aggregation_control.groupby_attrs = []

kwargs = {'zarr_kwargs':{'consolidated':True,'use_cftime':True},'aggregate':True} #keyword arguments for generating dictionary of datasets from cmip6 catalogue
ddict = cat.to_dataset_dict(**kwargs) #open datasets into dictionary



--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.member_id.table_id.variable_id.grid_label.zstore.dcpp_init_year.version'


Loading in works.