In [1]:
import numpy as np
import pandas as pd
import xarray as xr
import xesmf as xe
import dask
import intake
import fsspec
import os
from collections import defaultdict
from tqdm.autonotebook import tqdm  # Fancy progress bars for our loops!
from xmip.utils import google_cmip_col
from xmip.preprocessing import combined_preprocessing
from xmip.postprocessing import merge_variables, combine_datasets


  from tqdm.autonotebook import tqdm  # Fancy progress bars for our loops!


In [2]:
def regrid_to_era5(ds,era5_grid):
    """Bilinearly regrid ``ds`` onto ``era5_grid`` using xESMF.

    A new ``xe.Regridder`` is built from ``ds`` and ``era5_grid`` on every call
    (with ``ignore_degenerate=True``) and then applied to ``ds``.

    Parameters
    ----------
    ds : dataset to regrid (also used to define the source grid).
    era5_grid : dataset describing the target grid.

    Returns
    -------
    The regridded version of ``ds``, with attributes kept (``keep_attrs=True``).
    """
    to_era5 = xe.Regridder(ds, era5_grid, 'bilinear', ignore_degenerate=True)
    regridded = to_era5(ds, keep_attrs=True)
    return regridded

def shorten_ssp_runs(ddict,end_year):
    """Return a new dictionary in which the scenario ('ssp') runs stop at ``end_year``.

    Parameters
    ----------
    ddict : dict
        Maps dataset keys to datasets. Entries whose key contains ``'ssp'`` must
        support ``.sel(time=slice(...))``.
    end_year : int or str
        Upper bound of the time selection; it is passed as ``str(end_year)``.

    Returns
    -------
    dict
        A new dictionary with the same keys in the same order. Values whose key
        contains ``'ssp'`` are replaced by ``v.sel(time=slice(None, str(end_year)))``;
        all other values are the original objects. ``ddict`` itself is not
        modified, so the cell calling this function can safely be re-run.
    """
    cutoff = slice(None, str(end_year))
    return {
        key: (ds.sel(time=cutoff) if 'ssp' in key else ds)
        for key, ds in ddict.items()
    }

def concat_realizations_most_common_ipf(ds_list):
    '''Concatenate only the realizations that share the most common 'ipf' combination.

    The 'ipf' part of a member id is everything from its first ``'i'`` onwards
    (e.g. ``'r3i1p1f2'`` -> ``'i1p1f2'``). If several 'ipf' combinations are
    equally common, the one that sorts first (string order) is used.

    Parameters
    ----------
    ds_list : list
        Datasets whose ``member_id`` holds the member id of the dataset as its
        first element (``ds.member_id.data[0]``).

    Returns
    -------
    The datasets with the selected 'ipf', in their original order, concatenated
    along ``member_id``.

    Raises
    ------
    ValueError
        If ``ds_list`` is empty.
    '''
    from collections import Counter

    if not ds_list:
        raise ValueError("ds_list is empty: there are no realizations to concatenate")

    # separate 'ipf' from 'r', keeping one entry per dataset so that the
    # selection below can compare whole 'ipf' strings
    ipf_ids = []
    for ds in ds_list:
        member_id = ds.member_id.data[0]
        ipf_ids.append(member_id[member_id.find('i'):])

    # most common 'ipf'; ties are broken by taking the first one in sorted order
    ipf_counts = Counter(ipf_ids)
    top_count = max(ipf_counts.values())
    most_common_ipf = min(ipf for ipf, count in ipf_counts.items() if count == top_count)

    # pick only the matching datasets from the list; an exact comparison is used
    # because a substring test would also match e.g. 'i1p1f1' inside 'r1i1p1f11'
    ds_pick = [ds for ds, ipf in zip(ds_list, ipf_ids) if ipf == most_common_ipf]

    return xr.concat(ds_pick, dim='member_id', join='outer', coords='minimal',compat='override')

In [3]:

# CMIP6 models to search for. Every entry needs its own comma: Python joins
# adjacent string literals, so a missing comma silently turns two model names
# into one non-existent name and both models drop out of the search.
my_models = ['BCC-CSM2-MR','CESM2','CESM2-WACCM','CMCC-ESM2','CMCC-CM2-SR5','EC-Earth3',
                'GFDL-CM4','GFDL-ESM4','HadGEM3-GC31-MM','MIROC6','MPI-ESM1-2-HR','MRI-ESM2-0',
                'NorESM2-MM','TaiESM1']

# For a quick test with a single model, uncomment the next line:
# my_models = ['EC-Earth3']

col = google_cmip_col()
experiment_id='ssp585' # scenario searched for in addition to the historical runs
source_id = my_models
kwargs = {
    'zarr_kwargs':{
        'consolidated':True,
        'use_cftime':True
    },
    'aggregate':False
}

# Daily sea-level pressure and surface wind speed for the historical and scenario runs
cat_data = col.search(
    source_id=source_id,
    experiment_id=['historical',experiment_id],
    table_id='day',
    variable_id=['psl','sfcWind'],
    require_all_on=['source_id', 'member_id','grid_label']
)
ddict = cat_data.to_dataset_dict(**kwargs)
#ddict = cat_data.to_dataset_dict(**kwargs,preprocess=combined_preprocessing) # a lot of 'renaming failed' warnings here

#NB: No preprocessing/renaming is applied here at the moment because the regridding seems to work fine without it. The atmospheric fields are typically a bit more straightforward to handle.

  ddict = cat_data.to_dataset_dict(**kwargs)



--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.member_id.table_id.variable_id.grid_label.zstore.dcpp_init_year.version'


In [4]:
# Cut the 'ssp' runs off at the year 2100; all other runs are kept as they are
ddict_ = shorten_ssp_runs(ddict, end_year=2100)

In [5]:
# Merge the variables of each model run with an outer join. The dask option is
# switched off for this step because the merge produces large chunks.
large_chunk_opts = {'array.slicing.split_large_chunks': False}
with dask.config.set(**large_chunk_opts):
    ddict_merged = merge_variables(
        ddict_,
        merge_kwargs={'join':'outer'},
    )

In [7]:
# Datasets that agree on these attributes are combined into one dataset, using
# only the realizations with the most common 'ipf' combination.
attrs_to_match = ['source_id', 'grid_label', 'experiment_id', 'table_id']
with dask.config.set(**{'array.slicing.split_large_chunks': False}):
    ddict_concat = combine_datasets(
        ddict_merged, concat_realizations_most_common_ipf, match_attrs=attrs_to_match
    )
#NB: This leaves multiple datasets for the same model when it has different grid labels.
# TODO: probably need a function that keeps the grid label with the most variants.
# Since this occurs only for a few models it is probably OK to just save these and remove them later?


In [8]:
#contains coordinates of and MLR coefficients around TGs (tide gauges)
mlrcoefs = xr.open_dataset('/home/jovyan/CMIP6cf/gssr_coefs_1degRes_forcing.nc')

#the ERA5 grid used to derive the MLR coefficients: 1-degree cells, coordinates at the half degrees
era5_lons = np.arange(-40,30,1)+1/2
era5_lats = np.arange(70,10,-1)-1/2 #descending
era5_grid = xr.Dataset(
        {
            "longitude": (["longitude"], era5_lons, {"units": "degrees_east"}),
            "latitude": (["latitude"], era5_lats, {"units": "degrees_north"}),
        }
    )

#get coordinates of n x n degree grids around each tide gauge
num_degr = 2
half_width = num_degr/2
n_tg = len(mlrcoefs.tg)
lat_ranges = np.zeros((n_tg,2)) #two grid coordinates per tide gauge
lon_ranges = np.zeros((n_tg,2))

grid_lat = era5_grid.latitude
grid_lon = era5_grid.longitude
for t,tg in enumerate(mlrcoefs.tg.values):
    tg_coefs = mlrcoefs.sel(tg=tg)

    #grid coordinates within +/- half_width of the tide gauge; the first two are kept
    lat_in_box = (grid_lat>=(tg_coefs.lat-half_width)) & (grid_lat<=(tg_coefs.lat+half_width))
    lon_in_box = (grid_lon>=(tg_coefs.lon-half_width)) & (grid_lon<=(tg_coefs.lon+half_width))
    lat_ranges[t,:] = grid_lat[lat_in_box][0:2]
    lon_ranges[t,:] = grid_lon[lon_in_box][0:2]

#create da's to index the CMIP6 files with:
lons_da = xr.DataArray(
    lon_ranges,
    dims=['tg','lon_around_tg'],
    coords={'tg':mlrcoefs.tg,'lon_around_tg':[0,1]},
)
lats_da = xr.DataArray(
    lat_ranges,
    dims=['tg','lat_around_tg'],
    coords={'tg':mlrcoefs.tg,'lat_around_tg':[0,1]},
)

In [9]:
# Collect the results in a NEW dictionary. Assigning ddict_concat to another name
# would only alias it, so the loop would overwrite the concatenated datasets and
# the cell could not be re-run.
ddict_subsetted = {}
for key in tqdm(ddict_concat):
    ds = ddict_concat[key]

    #find the name of the longitude dimension
    lon_dims = [k for k in ds.dims if 'lon' in k]
    if not lon_dims:
        raise ValueError(f"{key}: no dimension with 'lon' in its name found among {list(ds.dims)}")
    lon_coord = lon_dims[0]

    #change longitude coordinates to -180 -> 180 (avoids getting NaNs at the 0-meridian)
    wrapped_lon = ((ds.coords[lon_coord] + 180) % 360) - 180 #wrap around 0
    ds = ds.assign_coords({lon_coord: wrapped_lon}) #returns a new object, the dataset in ddict_concat is left as it is
    ds = ds.reindex({ lon_coord : np.sort(ds[lon_coord])}) #put the wrapped longitudes in ascending order

    regridded_ds = regrid_to_era5(ds,era5_grid) #regrid to the ERA5 grid bilinearly
    ds_around_tgs = regridded_ds.sel(latitude=lats_da,longitude=lons_da) #subset at n x n degree grids around TGs
    ddict_subsetted[key] = ds_around_tgs
    #ds_around_tgs.to_netcdf(key+'.nc',mode='w') #test storing the subsetted output, takes ~3-4 min per file

  0%|          | 0/24 [00:00<?, ?it/s]

In [12]:
# Keys of the subsetted datasets
list(ddict_subsetted)

['EC-Earth3.gr.historical.day',
 'MIROC6.gn.historical.day',
 'EC-Earth3.gr.ssp585.day',
 'GFDL-CM4.gr1.historical.day',
 'NorESM2-MM.gn.ssp585.day',
 'MPI-ESM1-2-HR.gn.historical.day',
 'MRI-ESM2-0.gn.ssp585.day',
 'HadGEM3-GC31-MM.gn.ssp585.day',
 'TaiESM1.gn.historical.day',
 'CMCC-ESM2.gn.historical.day',
 'HadGEM3-GC31-MM.gn.historical.day',
 'TaiESM1.gn.ssp585.day',
 'CMCC-CM2-SR5.gn.ssp585.day',
 'MPI-ESM1-2-HR.gn.ssp585.day',
 'GFDL-ESM4.gr1.ssp585.day',
 'NorESM2-MM.gn.historical.day',
 'CMCC-ESM2.gn.ssp585.day',
 'GFDL-CM4.gr1.ssp585.day',
 'MIROC6.gn.ssp585.day',
 'GFDL-CM4.gr2.ssp585.day',
 'GFDL-CM4.gr2.historical.day',
 'GFDL-ESM4.gr1.historical.day',
 'MRI-ESM2-0.gn.historical.day',
 'CMCC-CM2-SR5.gn.historical.day']