In [1]:
import numpy as np
import pandas as pd
import xarray as xr
import xesmf as xe
import dask
import intake
import fsspec
from collections import defaultdict
from tqdm.autonotebook import tqdm  # Fancy progress bars for our loops!

  from tqdm.autonotebook import tqdm  # Fancy progress bars for our loops!


In [2]:
def drop_all_bounds(ds):
    """Drop bounds coordinates (e.g. 'time_bounds', 'lat_bnds') from a dataset.

    Such coordinates can lead to issues when merging datasets.

    Parameters
    ----------
    ds : xarray.Dataset
        Dataset whose coordinate names are scanned.

    Returns
    -------
    xarray.Dataset
        Result of dropping every coordinate whose name contains
        '_bounds' or '_bnds'.
    """
    bounds_names = [vname for vname in ds.coords
                    if '_bounds' in vname or '_bnds' in vname]
    # `Dataset.drop` is deprecated for removing variables; `drop_vars` is the
    # explicit replacement.
    return ds.drop_vars(bounds_names)

def open_dsets(df):
    """Open datasets from cloud storage and return one merged xarray dataset.

    Parameters
    ----------
    df : pandas.DataFrame
        Catalogue rows; every entry of ``df.zstore`` is the URL of a zarr store.

    Returns
    -------
    xarray.Dataset or None
        All stores merged with ``join='exact'`` after their bounds coordinates
        were dropped, or ``None`` when the merge raises ``ValueError``. A
        warning naming the stores is emitted in that case so the failure is
        not silent.
    """
    import warnings

    store_urls = list(df.zstore)
    dsets = [xr.open_zarr(fsspec.get_mapper(ds_url), consolidated=True)
             .pipe(drop_all_bounds)
             for ds_url in store_urls]
    try:
        return xr.merge(dsets, join='exact')
    except ValueError as err:
        # Keep the best-effort contract (return None) but tell the user which
        # stores could not be combined.
        warnings.warn(f"Could not merge datasets from {store_urls}: {err}")
        return None

def open_delayed(df):
    """Schedule `open_dsets(df)` lazily through dask.delayed.

    Returning a delayed object instead of the opened dataset allows
    many datasets to be opened in parallel later on."""
    lazy_open = dask.delayed(open_dsets)
    return lazy_open(df)

def regrid_to_era5(ds,era5_grid):
    """Regrid `ds` onto `era5_grid` using a bilinear xesmf regridder."""
    bilinear_regridder = xe.Regridder(ds, era5_grid, 'bilinear')
    regridded = bilinear_regridder(ds)
    return regridded

In [3]:
# contains coordinates of and MLR coefficients around the tide gauges (TGs)
mlrcoefs = xr.open_dataset('CMIP6cf/gssr_coefs_1degRes_forcing.nc')

# grid of the ERA5 forcing used to derive the MLR coefficients
era5_grid = xr.Dataset(
    {
        "longitude": (
            ["longitude"],
            np.arange(-40,30,1)+1/2,
            {"units": "degrees_east"},
        ),
        "latitude": (
            ["latitude"],
            np.arange(70,10,-1)-1/2,
            {"units": "degrees_north"},
        ),
    }
)

# get coordinates of the num_degr x num_degr degree grids around each tide gauge
num_degr = 2
lat_ranges = np.zeros((len(mlrcoefs.tg),2))
lon_ranges = np.zeros((len(mlrcoefs.tg),2))

for t,tg in enumerate(mlrcoefs.tg.values):
    tg_point = mlrcoefs.sel(tg=tg)
    half_window = num_degr/2

    # grid cells whose centre lies within half a window of the gauge position
    lat_mask = ((era5_grid.latitude>=(tg_point.lat-half_window))
                & (era5_grid.latitude<=(tg_point.lat+half_window)))
    lon_mask = ((era5_grid.longitude>=(tg_point.lon-half_window))
                & (era5_grid.longitude<=(tg_point.lon+half_window)))

    # keep the first two matching grid coordinates per gauge
    lat_ranges[t,:] = era5_grid.latitude[lat_mask][0:2]
    lon_ranges[t,:] = era5_grid.longitude[lon_mask][0:2]

# create da's to index the CMIP6 simulations with
lons_da = xr.DataArray(
    lon_ranges,
    dims=['tg','lon_around_tg'],
    coords={'tg':mlrcoefs.tg,'lon_around_tg':[0,1]},
)
lats_da = xr.DataArray(
    lat_ranges,
    dims=['tg','lat_around_tg'],
    coords={'tg':mlrcoefs.tg,'lat_around_tg':[0,1]},
)

In [4]:
#open CMIP6 files
# Read the CMIP6 cloud catalogue, once as a plain table and once as an
# intake-esm datastore that can be searched.
df = pd.read_csv('https://storage.googleapis.com/cmip6/cmip6-zarr-consolidated-stores.csv')
df.head()
col = intake.open_esm_datastore("https://storage.googleapis.com/cmip6/pangeo-cmip6.json")

# Broad query: daily sfcWind and psl for ssp585, any model or member.
query0 = {
    'experiment_id': ['ssp585'],
    'table_id': 'day',
    'variable_id': ['sfcWind','psl'],
}

col_subset = col.search(**query0)
col_subset.df.groupby("source_id")[["experiment_id", "variable_id", "table_id"]].nunique()

# Narrow query: same variables, restricted to two members of two models.
query = {
    'experiment_id': ['ssp585'],
    'table_id': 'day',
    'variable_id': ['sfcWind','psl'],
    'member_id': ['r1i1p1f1','r2i1p1f1'],
    'source_id': ['MPI-ESM1-2-HR','MIROC6'],
}
"""
#display all available (it would be nice to have a function that shows only those instances for which all queried aspects are available
col_subset.df.groupby("source_id")[
    ["experiment_id", "variable_id", "table_id","member_id"]
].nunique()
"""

'\n#display all available (it would be nice to have a function that shows only those instances for which all queried aspects are available\ncol_subset.df.groupby("source_id")[\n    ["experiment_id", "variable_id", "table_id","member_id"]\n].nunique()\n'

In [5]:
# Search the catalogue with the narrow `query` and display the matching rows.
col_subset = col.search(**query)
overview = col_subset.df
overview

Unnamed: 0,activity_id,institution_id,source_id,experiment_id,member_id,table_id,variable_id,grid_label,zstore,dcpp_init_year,version
0,ScenarioMIP,DKRZ,MPI-ESM1-2-HR,ssp585,r1i1p1f1,day,sfcWind,gn,gs://cmip6/CMIP6/ScenarioMIP/DKRZ/MPI-ESM1-2-H...,,20190710
1,ScenarioMIP,DKRZ,MPI-ESM1-2-HR,ssp585,r1i1p1f1,day,psl,gn,gs://cmip6/CMIP6/ScenarioMIP/DKRZ/MPI-ESM1-2-H...,,20190710
2,ScenarioMIP,DWD,MPI-ESM1-2-HR,ssp585,r2i1p1f1,day,sfcWind,gn,gs://cmip6/CMIP6/ScenarioMIP/DWD/MPI-ESM1-2-HR...,,20190710
3,ScenarioMIP,DWD,MPI-ESM1-2-HR,ssp585,r2i1p1f1,day,psl,gn,gs://cmip6/CMIP6/ScenarioMIP/DWD/MPI-ESM1-2-HR...,,20190710
4,ScenarioMIP,MIROC,MIROC6,ssp585,r1i1p1f1,day,psl,gn,gs://cmip6/CMIP6/ScenarioMIP/MIROC/MIROC6/ssp5...,,20191016
5,ScenarioMIP,MIROC,MIROC6,ssp585,r1i1p1f1,day,sfcWind,gn,gs://cmip6/CMIP6/ScenarioMIP/MIROC/MIROC6/ssp5...,,20200323
6,ScenarioMIP,MIROC,MIROC6,ssp585,r2i1p1f1,day,sfcWind,gn,gs://cmip6/CMIP6/ScenarioMIP/MIROC/MIROC6/ssp5...,,20200323


In [47]:
#open datasets
dsets = defaultdict(dict)

for group, df in col_subset.df.groupby(by=['source_id', 'experiment_id']):
    dsets[group[0]][group[1]] = open_delayed(df)
dsets 

defaultdict(dict,
            {'MIROC6': {'ssp585': Delayed('open_dsets-35109469-a5f8-497e-b41c-be76193758ed')},
             'MPI-ESM1-2-HR': {'ssp585': Delayed('open_dsets-0603db0e-ea69-4c52-bbe5-6c247d14658b')}})

In [None]:
# Evaluate all delayed openers in a single dask.compute call.
# here I run into memory problems if not separating the variants
(dsets_,) = dask.compute(dict(dsets))
dsets_

In [None]:
# Regrid every opened dataset to the ERA5 grid and subset it around the tide
# gauges. Results are stored per model and experiment so that earlier
# iterations are not overwritten by later ones.
dsets_around_tgs = defaultdict(dict)

for source_id, expt_dsets in tqdm(dsets_.items()):
    for experiment_id, ds in expt_dsets.items():
        if ds is None:
            # open_dsets returns None when the stores could not be merged
            continue

        # find the name of the longitude dimension
        lon_coord = next((dim for dim in ds.dims if 'lon' in dim), None)
        if lon_coord is None:
            raise ValueError(
                f"No longitude dimension found for {source_id} {experiment_id}: {list(ds.dims)}"
            )

        # change longitude coordinates (avoids getting NaNs at the 0-meridian):
        # wrap around 0 and sort. `assign_coords` returns a new dataset, so the
        # object stored in `dsets_` is left unmodified.
        wrapped_lon = ((ds[lon_coord] + 180) % 360) - 180
        ds = ds.assign_coords({lon_coord: wrapped_lon}).sortby(lon_coord)

        regridded_ds = regrid_to_era5(ds,era5_grid) #regrid to same grid as ERA5
        ds_around_tgs = regridded_ds.sel(latitude=lats_da,longitude=lons_da) #subset at num_degr by num_degr grids around TGs
        dsets_around_tgs[source_id][experiment_id] = ds_around_tgs
        #ds_around_tgs.to_netcdf('test.nc',mode='w') #save into single file (can save to individual tg files as well, just for testing)

In [None]:
# Disabled scratch code kept as a bare string literal: nothing inside it is
# executed. It applies the longitude wrap / regrid / tide-gauge subset steps
# to a single dataset named `forcing`.
# NOTE(review): `forcing` is not defined in the visible cells — confirm its
# origin before re-enabling this code.
"""
#change longitude coordinates (avoids getting NaNs at the 0-meridian)
lon_coord = list(k for k in forcing.dims if 'lon' in k)[0] #find lon/lat coordinate names
            
forcing.coords[lon_coord] = ((forcing.coords[lon_coord] + 180) % 360) - 180 #wrap around 0
forcing = forcing.reindex({ lon_coord : np.sort(forcing[lon_coord])})

regridded_forcing = regrid_to_era5(forcing,era5_grid) #regrid to same grid as ERA5
forcing_around_tgs = regridded_forcing.sel(latitude=lats_da,longitude=lons_da) #subset at num_degr by num_degr grids around TGs

#test
#normalized_forcing = normalized_forcing.stack(coord=['lon_around_tg','lat_around_tg'])
"""