In [1]:
import numpy as np
import xarray as xr
import dask
import intake
import pandas as pd
import os
from collections import defaultdict
from tqdm.autonotebook import tqdm
from xmip.utils import google_cmip_col
from xmip.postprocessing import combine_datasets, _match_datasets,_concat_sorted_time
from cmip_catalogue_operations import reduce_cat_to_max_num_realizations, drop_vars_from_cat
from cmip_ds_dict_operations import preselect_years, pr_flux_to_m, drop_duplicate_timesteps, drop_coords
import gcsfs
fs = gcsfs.GCSFileSystem() #Google Cloud Storage file system handle; original TODO note: list stores, strip 'zarr' from filename, load

'''script to get CMIP6 datatsets at grid cells nearest to tide gauges and store them'''

  from tqdm.autonotebook import tqdm


'script to get CMIP6 datatsets at grid cells nearest to tide gauges and store them'

In [3]:
#list the entries found under the 'pr_tgs' timeseries directory in the bucket
fs.ls('gs://leap-persistent/timh37/CMIP6/timeseries/pr_tgs')

['leap-persistent/timh37/CMIP6/timeseries/pr_tgs/CESM2',
 'leap-persistent/timh37/CMIP6/timeseries/pr_tgs/CESM2-WACCM',
 'leap-persistent/timh37/CMIP6/timeseries/pr_tgs/CMCC-CM2-SR5',
 'leap-persistent/timh37/CMIP6/timeseries/pr_tgs/CMCC-ESM2',
 'leap-persistent/timh37/CMIP6/timeseries/pr_tgs/EC-Earth3',
 'leap-persistent/timh37/CMIP6/timeseries/pr_tgs/GFDL-CM4',
 'leap-persistent/timh37/CMIP6/timeseries/pr_tgs/GFDL-ESM4',
 'leap-persistent/timh37/CMIP6/timeseries/pr_tgs/HadGEM3-GC31-MM',
 'leap-persistent/timh37/CMIP6/timeseries/pr_tgs/MIROC6',
 'leap-persistent/timh37/CMIP6/timeseries/pr_tgs/MPI-ESM1-2-HR',
 'leap-persistent/timh37/CMIP6/timeseries/pr_tgs/MRI-ESM2-0',
 'leap-persistent/timh37/CMIP6/timeseries/pr_tgs/NorESM2-MM',
 'leap-persistent/timh37/CMIP6/timeseries/pr_tgs/TaiESM1']

In [3]:
#workaround for combine_datasets(): merge variables into datasets whose coordinates do not match exactly but that are supposed to be on the same grid
def align_lonlat(ds_list):
    '''
    Align every dataset in ds_list with the first dataset of the list.

    Each dataset is aligned pairwise against ds_list[0] using join='override',
    leaving the 'time' and 'member_id' dimensions out of the alignment.
    Returns a new list with the aligned datasets, in the original order.
    '''
    #pairwise alignment against the first dataset; only the aligned second element is kept
    return [
        xr.align(ds_list[0], other, join='override', exclude=['time','member_id'])[1]
        for other in ds_list
    ]

def merge_variables_aligning_lonlat(ds_list):
    '''
    Merge the datasets in ds_list into one dataset after aligning them with align_lonlat().

    The alignment overrides same-dimension lon/lat coordinates before merging, so that
    the outer join does not pad lon/lat.
    '''
    return xr.merge(align_lonlat(ds_list), join='outer', compat='override')

In [4]:
def select_gridcells_nearest_to_tgs(tg_ds,ds):
    '''
    Subset a gridded dataset at the grid cells nearest to a set of tide gauges.

    tg_ds = xr.DataSet containing 'lon' and 'lat' coordinates of tide gauges
    ds    = xr.DataSet containing CMIP6 data to subset; its longitude/latitude dimensions
            are identified as the first dimensions with 'lon'/'lat' in their names

    Returns ds sampled at the grid cell nearest to each tide gauge. The coordinates of
    the selected grid cells are kept as 'gridcell_lon'/'gridcell_lat', and 'lon'/'lat'
    are set to the tide gauge coordinates.
    '''

    lon_name = list(k for k in ds.dims if 'lon' in k)[0] #find lon/lat coordinate names
    lat_name = list(k for k in ds.dims if 'lat' in k)[0]

    #compute distances between TG coordinates and grid cell centers
    #(haversine central angle in radians; inputs are converted from degrees with pi/180)
    distances = 2*np.arcsin( np.sqrt(
        np.sin( (np.pi/180) * 0.5*(ds[lat_name]-tg_ds.lat) )**2 +
        np.cos((np.pi/180)*tg_ds.lat)*np.cos((np.pi/180)*ds[lat_name])*np.sin((np.pi/180)*0.5*(ds[lon_name]-tg_ds.lon))**2) )

    idx_nearest = distances.argmin(dim=[lon_name,lat_name]) #find indices of nearest grid cells (dict: dim name -> index array)
    ds_subsetted = ds.isel(idx_nearest) #subset ds at nearest grid cells

    #keep coordinates of nearest grid cells; rename via the detected names so that grids
    #whose dimensions are not literally called 'lon'/'lat' are handled as well
    ds_subsetted = ds_subsetted.rename_vars({lon_name:'gridcell_lon',lat_name:'gridcell_lat'})
    ds_subsetted = ds_subsetted.assign_coords(lon=tg_ds.lon,lat=tg_ds.lat) #replace coordinates with TG coordinates

    return ds_subsetted

Query simulations & manipulate data catalogue:

In [5]:
variable = 'pr' #variable to obtain data for
query_vars = ['sfcWind','pr','psl'] #variables that model simulations are required to provide

''' models with relatively high resolution can be queried using: source_id=highRes_Models in col.search()
highRes_models = ['BCC-CSM2-MR','CESM2','CESM2-WACCM','CMCC-ESM2','CMCC-CM2-SR5','EC-Earth3',
                'GFDL-CM4','GFDL-ESM4','HadGEM3-GC31-MM','MIROC6','MPI-ESM1-2-HR','MRI-ESM2-0',
                'NorESM2-MM','TaiESM1']
'''
col = google_cmip_col() #xmip wrapper

#search criteria shared by the two scenario queries below
shared_search = dict(
    table_id='day',
    variable_id=query_vars,
    require_all_on=['source_id', 'member_id','grid_label'])

#the SSPs are queried separately because availability may differ between them:
#each search finds instances providing all query_vars for both historical and the given SSP
cat_data_ssp245 = col.search(experiment_id=['historical','ssp245'], **shared_search)
cat_data_ssp585 = col.search(experiment_id=['historical','ssp585'], **shared_search)

#union of both search results without duplicate rows
combined_df = pd.concat(
    [cat_data_ssp245.df,cat_data_ssp585.df],
    ignore_index=True,
).drop_duplicates(ignore_index=True)

cat_data = cat_data_ssp585 #same catalogue object as cat_data_ssp585; its dataframe is replaced on the next line
cat_data.esmcat._df = combined_df
cat_data = reduce_cat_to_max_num_realizations(cat_data) #per model, select grid and 'ipf' combination providing most realizations

query_vars.remove(variable) #from here on query_vars only holds the variables that are NOT processed
cat_data = drop_vars_from_cat(cat_data,query_vars) #only keep catalogue entries for 'variable'

Open datasets into dictionary:

In [6]:
#Workaround for the aggregate=False bug (https://github.com/intake/intake-esm/issues/496):
#'aggregate' stays True in kwargs below, but with groupby_attrs emptied the catalogue
#doesn't actually aggregate anything when the datasets are opened.
cat_data.esmcat.aggregation_control.groupby_attrs = []

kwargs = {'zarr_kwargs':{'consolidated':True,'use_cftime':True},'aggregate':True} #keyword arguments for generating dictionary of datasets from cmip6 catalogue
ddict = cat_data.to_dataset_dict(**kwargs) #open datasets into dictionary

  ddict = cat_data.to_dataset_dict(**kwargs) #open datasets into dictionary



--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.member_id.table_id.variable_id.grid_label.zstore.dcpp_init_year.version'


**NB: I don't seem to need any preprocessing. If I turn it on I get a lot of renaming failed warnings.**

In [7]:
#NOTE(review): every step below rebinds `ddict`, so re-running this cell applies the steps
#to already-processed data (presumably converting pr a second time) - run it once after opening the datasets; TODO confirm
if variable=='pr':
    ddict = pr_flux_to_m(ddict) #convert pr flux to accumulated pr
ddict = drop_duplicate_timesteps(ddict) #CESM2-WACCM has duplicate timesteps
ddict = preselect_years(ddict,1850,2100) #some models have time series until post-2100, we exclude those here
ddict = drop_coords(ddict,['bnds','nbnd']) #drop the 'bnds' and 'nbnd' coordinates

Dropping duplicate timesteps for:ScenarioMIP.NCAR.CESM2-WACCM.ssp585.r1i1p1f1.day.sfcWind.gn.gs://cmip6/CMIP6/ScenarioMIP/NCAR/CESM2-WACCM/ssp585/r1i1p1f1/day/sfcWind/gn/v20200702/.nan.20200702


In [8]:
#join=outer pads NaNs, which results in large chunks for timeseries that differ in length,
#so large chunks are split while merging
split_large_chunks = {'array.slicing.split_large_chunks': True}
with dask.config.set(**split_large_chunks):
    #group datasets of same model
    ddict_merged = combine_datasets(
        ddict,
        merge_variables_aligning_lonlat,
        match_attrs=['source_id', 'grid_label', 'experiment_id', 'table_id','variant_label'],
    )

Do the subsetting at grid cells nearest to the tide gauges:

In [9]:
tg_coords = xr.open_dataset('/home/jovyan/CMIP6cex/cmip6_processing/gssr_mlr_coefs_1p5_9deg_codec.nc') #contains TG coordinates
ddict_at_tgs = defaultdict(dict) #subsetted datasets, stored under the same keys as ddict_merged

for merged_key,merged_ds in tqdm(ddict_merged.items()):
    single_init_ds = merged_ds.isel(dcpp_init_year=0,drop=True) #get rid of dcpp_init_year dimension
    single_init_ds.attrs["original_key"] = merged_key #remember which merged entry this dataset came from
    ddict_at_tgs[merged_key] = select_gridcells_nearest_to_tgs(tg_coords,single_init_ds)

  0%|          | 0/395 [00:00<?, ?it/s]

Store the datasets (directories structured per model):

In [None]:
#Write every subsetted dataset to its own zarr store, in one directory per model (source_id).
#The store URL is assembled with explicit '/' separators: it is a bucket URL rather than a
#local file system path, so it must not depend on the platform's path separator.
for key,ds in tqdm(ddict_at_tgs.items()):
    model_path = f"leap-persistent/timh37/CMIP6/subsetted_data/{variable}_tgs/{ds.source_id}"
    store_url = f"gs://{model_path}/{key.replace('.','_')}.zarr" #dots in the key are replaced by underscores
    ds.chunk({'member_id':1,'time':100000}).to_zarr(store_url,mode='w') #mode='w' overwrites an existing store
    ds.close()

  0%|          | 0/395 [00:00<?, ?it/s]

^ The storing step above takes about an hour for historical + ssp245 + ssp585, excluding EC-Earth3.