In [1]:
import numpy as np
import pandas as pd
import xarray as xr
import xesmf as xe
import dask
import intake
import fsspec
from collections import defaultdict
from tqdm.autonotebook import tqdm  # Fancy progress bars for our loops!
from xmip.utils import google_cmip_col
from xmip.preprocessing import combined_preprocessing


  from tqdm.autonotebook import tqdm  # Fancy progress bars for our loops!


In [2]:
def regrid_to_era5(ds,era5_grid):
    """wrapper around xesmf regridding"""
    regridder = xe.Regridder(ds,era5_grid,'bilinear')
    
    return regridder(ds)


In [173]:
my_models = ['BCC-CSM2-MR',
                'CESM2'
                'CESM2-WACCM',
                'CMCC-ESM2',
                'CMCC-CM2-SR5',
                'EC-Earth3',
                'GFDL-CM4',
                'GFDL-ESM4',
                'HadGEM3-GC31-MM',
                'MIROC6',
                'MPI-ESM1-2-HR',
                'MRI-ESM2-0',
                'NorESM2-MM',
                'TaiESM1']

col = google_cmip_col()
experiment_id='ssp585'
source_id = my_models
kwargs = {
    'zarr_kwargs':{
        'consolidated':True,
        'use_cftime':True
    },
    'aggregate':False
}

cat_data = col.search(
    source_id=source_id,
    experiment_id=experiment_id,
    table_id='day',
    variable_id=['psl','sfcWind']
)
ddict = cat_data.to_dataset_dict(**kwargs)
list(ddict.keys())

  ddict = cat_data.to_dataset_dict(**kwargs)



--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.member_id.table_id.variable_id.grid_label.zstore.dcpp_init_year.version'


['ScenarioMIP.NOAA-GFDL.GFDL-CM4.ssp585.r1i1p1f1.day.psl.gr1.gs://cmip6/CMIP6/ScenarioMIP/NOAA-GFDL/GFDL-CM4/ssp585/r1i1p1f1/day/psl/gr1/v20180701/.nan.20180701',
 'ScenarioMIP.EC-Earth-Consortium.EC-Earth3.ssp585.r109i1p1f1.day.sfcWind.gr.gs://cmip6/CMIP6/ScenarioMIP/EC-Earth-Consortium/EC-Earth3/ssp585/r109i1p1f1/day/sfcWind/gr/v20200412/.nan.20200412',
 'ScenarioMIP.EC-Earth-Consortium.EC-Earth3.ssp585.r117i1p1f1.day.psl.gr.gs://cmip6/CMIP6/ScenarioMIP/EC-Earth-Consortium/EC-Earth3/ssp585/r117i1p1f1/day/psl/gr/v20200412/.nan.20200412',
 'ScenarioMIP.EC-Earth-Consortium.EC-Earth3.ssp585.r116i1p1f1.day.sfcWind.gr.gs://cmip6/CMIP6/ScenarioMIP/EC-Earth-Consortium/EC-Earth3/ssp585/r116i1p1f1/day/sfcWind/gr/v20200412/.nan.20200412',
 'ScenarioMIP.EC-Earth-Consortium.EC-Earth3.ssp585.r108i1p1f1.day.sfcWind.gr.gs://cmip6/CMIP6/ScenarioMIP/EC-Earth-Consortium/EC-Earth3/ssp585/r108i1p1f1/day/sfcWind/gr/v20200412/.nan.20200412',
 'ScenarioMIP.EC-Earth-Consortium.EC-Earth3.ssp585.r123i1p1f1.day

In [174]:
from xmip.postprocessing import merge_variables

ddict_merged = merge_variables(ddict)

list(ddict_merged.keys())



['GFDL-CM4.gr1.ssp585.day.r1i1p1f1',
 'EC-Earth3.gr.ssp585.day.r109i1p1f1',
 'MIROC6.gn.ssp585.day.r14i1p1f1',
 'MIROC6.gn.ssp585.day.r31i1p1f1',
 'EC-Earth3.gr.ssp585.day.r3i1p1f1',
 'GFDL-ESM4.gr1.ssp585.day.r1i1p1f1',
 'MIROC6.gn.ssp585.day.r1i1p1f1',
 'MPI-ESM1-2-HR.gn.ssp585.day.r1i1p1f1',
 'MIROC6.gn.ssp585.day.r9i1p1f1',
 'EC-Earth3.gr.ssp585.day.r132i1p1f1',
 'MIROC6.gn.ssp585.day.r46i1p1f1',
 'MIROC6.gn.ssp585.day.r13i1p1f1',
 'NorESM2-MM.gn.ssp585.day.r1i1p1f1',
 'BCC-CSM2-MR.gn.ssp585.day.r1i1p1f1',
 'MIROC6.gn.ssp585.day.r2i1p1f1',
 'HadGEM3-GC31-MM.gn.ssp585.day.r1i1p1f3',
 'EC-Earth3.gr.ssp585.day.r11i1p1f1',
 'EC-Earth3.gr.ssp585.day.r129i1p1f1',
 'MIROC6.gn.ssp585.day.r39i1p1f1',
 'EC-Earth3.gr.ssp585.day.r147i1p1f1',
 'EC-Earth3.gr.ssp585.day.r104i1p1f1',
 'MIROC6.gn.ssp585.day.r28i1p1f1',
 'EC-Earth3.gr.ssp585.day.r138i1p1f1',
 'MIROC6.gn.ssp585.day.r4i1p1f1',
 'HadGEM3-GC31-MM.gn.ssp585.day.r3i1p1f3',
 'EC-Earth3.gr.ssp585.day.r141i1p1f1',
 'MIROC6.gn.ssp585.day.r36i

In [177]:
reqVars = ['sfcWind','psl']
ddict_filtered = {k: v for k, v in ddict_merged.items() if set(reqVars).issubset(list(ddict_merged[k].variables))}
list(ddict_filtered.keys())

['GFDL-CM4.gr1.ssp585.day.r1i1p1f1',
 'EC-Earth3.gr.ssp585.day.r3i1p1f1',
 'GFDL-ESM4.gr1.ssp585.day.r1i1p1f1',
 'MIROC6.gn.ssp585.day.r1i1p1f1',
 'MPI-ESM1-2-HR.gn.ssp585.day.r1i1p1f1',
 'NorESM2-MM.gn.ssp585.day.r1i1p1f1',
 'BCC-CSM2-MR.gn.ssp585.day.r1i1p1f1',
 'HadGEM3-GC31-MM.gn.ssp585.day.r1i1p1f3',
 'CMCC-ESM2.gn.ssp585.day.r1i1p1f1',
 'EC-Earth3.gr.ssp585.day.r1i1p1f1',
 'MPI-ESM1-2-HR.gn.ssp585.day.r2i1p1f1',
 'MRI-ESM2-0.gn.ssp585.day.r1i1p1f1',
 'EC-Earth3.gr.ssp585.day.r4i1p1f1',
 'HadGEM3-GC31-MM.gn.ssp585.day.r4i1p1f3',
 'GFDL-CM4.gr2.ssp585.day.r1i1p1f1',
 'CMCC-CM2-SR5.gn.ssp585.day.r1i1p1f1',
 'TaiESM1.gn.ssp585.day.r1i1p1f1']

In [91]:
"""
member_ids = ['r1i2p1f1','r1i1p1f1']
member_ids.sort()
ipf_ids = [s[s.find('i'):] for s in member_ids]#find unique ipf combinations

from collections import Counter

Counter(ipf_ids).most_common()[0][0]
"""

'i1p1f1'

In [180]:
def concat_realizations_most_common_ipf(ds_list):
    member_ids = [ds.member_id.data[0] for ds in ds_list]
    
    member_ids.sort() #often i1 is the baseline?
    
    ipf_ids = [s[s.find('i'):] for s in member_ids] #separate 'ipf' from 'r'
    from collections import Counter

    most_common_ipf = Counter(ipf_ids).most_common()[0][0]

    # find unique members and decide which values of 'ipf' give the most members/variants?
    # pick only the matching datasets from the list
    ds_pick = [ds for ds in ds_list if ((most_common_ipf in ds.member_id.data[0]) & ('sfcWind' in ds.variables) & ('psl' in ds.variables)) ]
    
    return xr.concat(ds_pick, dim='member_id')

from xmip.postprocessing import combine_datasets


ddict_concat = combine_datasets(
    ddict_filtered,
    concat_realizations_most_common_ipf,
    match_attrs=['source_id', 'grid_label', 'experiment_id', 'table_id']
)



In [179]:
list(ddict_concat.keys())

['GFDL-CM4.gr1.ssp585.day',
 'EC-Earth3.gr.ssp585.day',
 'GFDL-ESM4.gr1.ssp585.day',
 'MIROC6.gn.ssp585.day',
 'MPI-ESM1-2-HR.gn.ssp585.day',
 'NorESM2-MM.gn.ssp585.day',
 'BCC-CSM2-MR.gn.ssp585.day',
 'HadGEM3-GC31-MM.gn.ssp585.day',
 'CMCC-ESM2.gn.ssp585.day',
 'MRI-ESM2-0.gn.ssp585.day',
 'GFDL-CM4.gr2.ssp585.day',
 'CMCC-CM2-SR5.gn.ssp585.day',
 'TaiESM1.gn.ssp585.day']

In [23]:
'''
from xmip.postprocessing import concat_members

ddict_concat = concat_members(ddict_merged)
print(list(ddict_concat.keys()))
'''



['MPI-ESM1-2-HR.gn.ssp245.day', 'MRI-ESM2-0.gn.ssp245.day']


In [181]:
mlrcoefs = xr.open_dataset('/home/jovyan/CMIP6cf/gssr_coefs_1degRes_forcing.nc') #contains coordinates of and MLR coefficients around TGs

era5_grid = xr.Dataset(
        {
            "longitude": (["longitude"], np.arange(-40,30,1)+1/2, {"units": "degrees_east"}),
            "latitude": (["latitude"], np.arange(70,10,-1)-1/2, {"units": "degrees_north"}),
        }
    ) #grid of the ERA5 forcing used to derive the MLR coefficients

#get coordinates of 2x2 degree grids around each tide gauge
num_degr = 2
lat_ranges = np.zeros((len(mlrcoefs.tg),2))
lon_ranges = np.zeros((len(mlrcoefs.tg),2))

for t,tg in enumerate(mlrcoefs.tg.values):
    lat_ranges[t,:] = era5_grid.latitude[((era5_grid.latitude>=(mlrcoefs.sel(tg=tg).lat-num_degr/2)) & (era5_grid.latitude<=(mlrcoefs.sel(tg=tg).lat+num_degr/2)))][0:2]
    lon_ranges[t,:] = era5_grid.longitude[((era5_grid.longitude>=(mlrcoefs.sel(tg=tg).lon-num_degr/2)) & (era5_grid.longitude<=(mlrcoefs.sel(tg=tg).lon+num_degr/2)))][0:2]

#create da's to index the CMIP6 simulations with
lons_da = xr.DataArray(lon_ranges,dims=['tg','lon_around_tg'],coords={'tg':mlrcoefs.tg,'lon_around_tg':[0,1]})
lats_da = xr.DataArray(lat_ranges,dims=['tg','lat_around_tg'],coords={'tg':mlrcoefs.tg,'lat_around_tg':[0,1]})

In [186]:
'''
ddict_subsetted = ddict_concat
for key in ddict_concat:
    ds = ddict_concat[key]
    print(ds.coords)
    #change longitude coordinates (avoids getting NaNs at the 0-meridian)
    lon_coord = list(k for k in ds.dims if 'lon' in k)[0] #find lon/lat coordinate names

    ds.coords[lon_coord] = ((ds.coords[lon_coord] + 180) % 360) - 180 #wrap around 0
    ds = ds.reindex({ lon_coord : np.sort(ds[lon_coord])})

    regridded_ds = regrid_to_era5(ds,era5_grid) #regrid to same grid as ERA5
    ds_around_tgs = regridded_ds.sel(latitude=lats_da,longitude=lons_da) #subset at num_degr by num_degr grids around TGs
    #ds_around_tgs.to_netcdf('test.nc',mode='w') #save into single file (can save to individual tg files as well, just for testing)
    ddict_subsetted[key] = ds_around_tgs'''

"\nddict_subsetted = ddict_concat\nfor key in ddict_concat:\n    ds = ddict_concat[key]\n    print(ds.coords)\n    #change longitude coordinates (avoids getting NaNs at the 0-meridian)\n    lon_coord = list(k for k in ds.dims if 'lon' in k)[0] #find lon/lat coordinate names\n\n    ds.coords[lon_coord] = ((ds.coords[lon_coord] + 180) % 360) - 180 #wrap around 0\n    ds = ds.reindex({ lon_coord : np.sort(ds[lon_coord])})\n\n    regridded_ds = regrid_to_era5(ds,era5_grid) #regrid to same grid as ERA5\n    ds_around_tgs = regridded_ds.sel(latitude=lats_da,longitude=lons_da) #subset at num_degr by num_degr grids around TGs\n    #ds_around_tgs.to_netcdf('test.nc',mode='w') #save into single file (can save to individual tg files as well, just for testing)\n    ddict_subsetted[key] = ds_around_tgs"