In [7]:
import numpy as np
import xarray as xr
import dask
import cftime
import os
import intake
import pandas as pd
from collections import defaultdict
from tqdm.autonotebook import tqdm
from xmip.utils import google_cmip_col
from xmip.preprocessing import rename_cmip6, promote_empty_dims, correct_coordinates, broadcast_lonlat, correct_lon, correct_units, fix_metadata,_drop_coords
from xmip.postprocessing import combine_datasets,_concat_sorted_time, match_metrics
from cmip_catalogue_operations import reduce_cat_to_max_num_realizations, drop_older_versions, search_cloud
from cmip_ds_dict_operations import generate_dict_of_datasets, drop_duplicate_timesteps, drop_coords, drop_incomplete, drop_vars, create_regridder_dict, regrid_datasets_in_ddict,select_period, create_land_mask_dict, fix_inconsistent_calendars
import xesmf as xe
import gcsfs
fs = gcsfs.GCSFileSystem() #Google Cloud Storage filesystem handle; used further down (fs.exists) to check whether an output store has already been written

Helper functions for preprocessing and cleaning up the datasets:

In [2]:
def partial_combined_preprocessing(ds):
    """Apply a subset of xmip's 'combined_preprocessing' to a single dataset.

    The full 'combined_preprocessing' from xmip is problematic for some
    datasets, so only the steps below are applied, in this order:

      1. rename_cmip6         - fix naming
      2. promote_empty_dims   - promote empty dims to actual coordinates
      3. correct_coordinates  - demote coordinates from data_variables
      4. broadcast_lonlat     - broadcast lon/lat
      5. correct_lon          - shift all lons to consistent 0-360
      6. correct_units        - fix the units
      7. fix_metadata

    The bounds/vertex steps that are part of xmip's combined preprocessing are
    deliberately skipped:
      parse_lon_lat_bounds, sort_vertex_order,
      maybe_convert_bounds_to_vertex, maybe_convert_vertex_to_bounds

    Finally, the coordinates listed in xmip's `_drop_coords` are dropped
    (silently ignoring those that are not present).

    Parameters
    ----------
    ds : dataset to preprocess (passed through the xmip functions above).

    Returns
    -------
    The preprocessed dataset.
    """
    pipeline = (
        rename_cmip6,
        promote_empty_dims,
        correct_coordinates,
        broadcast_lonlat,
        correct_lon,
        correct_units,
        fix_metadata,
    )
    for step in pipeline:
        ds = step(ds)

    return ds.drop_vars(_drop_coords, errors="ignore")

def cleanup_datasets_in_dict(ddict):
    """Clean up a dictionary of datasets and drop known-problematic entries.

    Steps:
      * remove duplicate timesteps and the 'vertices_latitude' /
        'vertices_longitude' coordinates and variables (via the project helpers);
      * select the first 'dcpp_init_year' of each dataset if that dimension exists;
      * for 'INM-CM4-8', mask out everything south of 79S and two hard-coded
        lat/lon boxes (the 'russia' and 'us' blobs);
      * drop member 'r2i1p1f1' of 'MPI-ESM1-2-HR' (its grid differs from
        r1i1p1f1 despite the same label, which causes issues with regridding);
      * drop datasets whose 'x' or 'y' has length 0.

    Parameters
    ----------
    ddict : dictionary mapping keys (strings) to datasets.

    Returns
    -------
    defaultdict(dict) containing the datasets that were kept, under their
    original keys.
    """
    vertex_names = ('vertices_latitude', 'vertices_longitude')

    ddict = drop_duplicate_timesteps(ddict)  # remove duplicate timesteps if present
    ddict = drop_coords(ddict, list(vertex_names))  # remove coords ...
    ddict = drop_vars(ddict, list(vertex_names))  # ... & variables

    kept = defaultdict(dict)
    for key, ds in ddict.items():

        ds = ds.isel(dcpp_init_year=0, drop=True, missing_dims='ignore')

        if ds.source_id == 'INM-CM4-8':
            # hard-coded regions that are masked out for this model
            russia_box = (ds.lat >= 40) & (ds.lat <= 70) & (ds.lon >= 65) & (ds.lon <= 120)
            us_box = (ds.lat >= 40) & (ds.lat <= 50) & (ds.lon >= 260) & (ds.lon <= 290)
            ds = ds.where(ds.lat >= -79).where(~russia_box).where(~us_box)

        if ds.source_id == 'MPI-ESM1-2-HR' and ds.member_id == 'r2i1p1f1':
            print('Grid r1i1p1f1 and r2i1p1f1 are different despite same label, causes issues with regridding, therefore dropping: '+key)
            continue

        # a dataset is unusable if either horizontal dimension is empty
        if any(dim in ds and len(ds[dim]) == 0 for dim in ('x', 'y')):
            print('Longitude and/or latitude dimensions have length 0, dropping: '+key)
            continue

        kept[key] = ds

    return kept

Configure the script:

In [3]:
# --- data query ---------------------------------------------------------------
query_var = 'psl'  # variable to query and process
query_ssps = ['ssp126', 'ssp245', 'ssp370', 'ssp585']  # SSPs to use for the data query

ssps_to_process = ['ssp126', 'ssp245', 'ssp370', 'ssp585']  # SSPs to process data for

models_to_exclude = ['AWI-CM-1-1-MR', 'KIOST-ESM']  # models to exclude a-priori because of preprocessing/data issues

min_pic_numYears = 150  # not referenced in the cells of this notebook shown here

# --- regridding ---------------------------------------------------------------
regrid = True
target_grid = xr.Dataset(
    {
        "lat": (["lat"], np.arange(-90, 90, 1), {"units": "degrees_north"}),
        "lon": (["lon"], np.arange(0, 360, 1), {"units": "degrees_east"}),
    },
    attrs={'name': '1x1'},  # target grid assumed to be regular; the name is used in the output path
)

# --- input/output -------------------------------------------------------------
zos_path = 'gs://leap-persistent/timh37/CMIP6/zos_1x1'  # 'zos' files used to derive the ocean/land masks

output_period = ['1980', '2500']  # [first, last] year of the period to select and store

output_path = 'gs://leap-persistent/timh37/CMIP6/'

overwrite_existing = False  # if False, stores that already exist at the output location are not rewritten

Query datasets, put into dictionaries of datasets, and preprocess:

In [4]:
#Search the cloud catalogue and build, per SSP, a dictionary of combined historical+SSP datasets
ssp_cats = defaultdict(dict)
for s,ssp in enumerate(query_ssps):
    cat = search_cloud(query_var,['historical',ssp],'Amon',['source_id', 'member_id','grid_label']) #done per SSP because availability may be different
    ssp_cats[ssp] = cat

#Put the SSP catalogues together. AFAIK there is no other way than to take an existing catalogue and assign the concatenation of the dataframes of the separate catalogues as its new dataframe.
#NOTE: the next line takes a reference to the first SSP's catalogue, not a copy, so that catalogue object is modified in place when its private '_df' is overwritten.
ssp_cats_merged = ssp_cats[query_ssps[0]] 
ssp_cats_merged.esmcat._df = pd.concat([v.df for k,v in ssp_cats.items()],ignore_index=True).drop_duplicates(ignore_index=True)

#TODO: potentially first throw out members for which zos is unavailable? For now, let's see the overlap with 'zos' if we don't do this.
ssp_cats_merged = reduce_cat_to_max_num_realizations(ssp_cats_merged) #per model, select grid and 'ipf' combination providing most realizations (needs to be applied to all SSPs together to ensure the same variants are used under every scenario)

ssp_ddicts = defaultdict(dict) #maps each SSP to its dictionary of datasets (original note: not sure whether a defaultdict is needed here)
for s,ssp in enumerate(ssps_to_process):
    print(ssp)
    ssp_cat = ssp_cats_merged.search(experiment_id=['historical',ssp],table_id='Amon',variable_id=query_var,require_all_on=['source_id', 'member_id','grid_label']) #retrieve the catalogue for this SSP from the reduced catalogue

    ssp_ddict = {}
    ssp_ddict = generate_dict_of_datasets(ssp_cat,models_to_exclude,partial_combined_preprocessing) #open the datasets, applying 'partial_combined_preprocessing' defined above
    ssp_ddict = cleanup_datasets_in_dict(ssp_ddict)

    with dask.config.set(**{'array.slicing.split_large_chunks': True}): #concatenate historical and SSP experiments along time, matching datasets on the listed attributes
        ssp_ddict = combine_datasets(ssp_ddict,_concat_sorted_time,match_attrs =['source_id', 'grid_label','table_id','variant_label','variable_id'],combine_func_kwargs={'join':'inner','coords':'minimal','compat':'override'})

    ssp_ddict = drop_duplicate_timesteps(ssp_ddict) #remove overlap between historical and ssp experiments, which sometimes exists, again using 'drop_duplicate_timesteps'
    ssp_ddict = fix_inconsistent_calendars(ssp_ddict)
    ssp_ddict = select_period(ssp_ddict,output_period[0],output_period[-1]) #select requested output period
    ssp_ddict = drop_incomplete(ssp_ddict) #remove historical+ssp timeseries which are not monotonically increasing or have large time gaps (based on checks in CMIP6-LEAP-feedstock)

    ssp_ddicts[ssp] = ssp_ddict #add to dictionary of dictionaries of datasets

if regrid: #'regridder_dict' is only defined when regridding is enabled
    regridder_dict = create_regridder_dict(ssp_ddicts,target_grid) #generate xesmf regridders per model-grid combination  
    #TO-DO: develop option to regrid to tide gauges/list of coordinates
mask_dict = create_land_mask_dict(ssp_ddicts,zos_path) #find the matching ocean/land mask from the corresponding 'zos' files; these are needed because IBE is based on pressure anomalies relative to the ocean-area weighted mean

ssp126

--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.member_id.table_id.variable_id.grid_label.zstore.dcpp_init_year.version'


Grid r1i1p1f1 and r2i1p1f1 are different despite same label, causes issues with regridding, therefore dropping: CMIP.MPI-M.MPI-ESM1-2-HR.historical.r2i1p1f1.Amon.psl.gn.gs://cmip6/CMIP6/CMIP/MPI-M/MPI-ESM1-2-HR/historical/r2i1p1f1/Amon/psl/gn/v20190710/.20190710
Grid r1i1p1f1 and r2i1p1f1 are different despite same label, causes issues with regridding, therefore dropping: ScenarioMIP.DWD.MPI-ESM1-2-HR.ssp126.r2i1p1f1.Amon.psl.gn.gs://cmip6/CMIP6/ScenarioMIP/DWD/MPI-ESM1-2-HR/ssp126/r2i1p1f1/Amon/psl/gn/v20190710/.20190710
Dropping duplicate timesteps for:FGOALS-g3.gn.Amon.r4i1p1f1.psl
Dropping duplicate timesteps for:FGOALS-g3.gn.Amon.r2i1p1f1.psl
Dropping duplicate timesteps for:FGOALS-g3.gn.Amon.r3i1p1f1.psl
Dropping duplicate timesteps for:FGOALS-g3.gn.Amon.r1i1p1f1.psl
ssp245

--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.member_id.table_id.variable_id.grid_label.zstore.dcpp_init_year.version'


Grid r1i1p1f1 and r2i1p1f1 are different despite same label, causes issues with regridding, therefore dropping: ScenarioMIP.DKRZ.MPI-ESM1-2-HR.ssp245.r2i1p1f1.Amon.psl.gn.gs://cmip6/CMIP6/ScenarioMIP/DKRZ/MPI-ESM1-2-HR/ssp245/r2i1p1f1/Amon/psl/gn/v20190710/.20190710
Grid r1i1p1f1 and r2i1p1f1 are different despite same label, causes issues with regridding, therefore dropping: CMIP.MPI-M.MPI-ESM1-2-HR.historical.r2i1p1f1.Amon.psl.gn.gs://cmip6/CMIP6/CMIP/MPI-M/MPI-ESM1-2-HR/historical/r2i1p1f1/Amon/psl/gn/v20190710/.20190710
Dropping duplicate timesteps for:FGOALS-g3.gn.Amon.r4i1p1f1.psl
Dropping duplicate timesteps for:FGOALS-g3.gn.Amon.r3i1p1f1.psl
Dropping duplicate timesteps for:FGOALS-g3.gn.Amon.r1i1p1f1.psl
Dropping duplicate timesteps for:EC-Earth3-Veg.gr.Amon.r5i1p1f1.psl
Dropping duplicate timesteps for:FGOALS-g3.gn.Amon.r2i1p1f1.psl
ssp370

--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.membe

Grid r1i1p1f1 and r2i1p1f1 are different despite same label, causes issues with regridding, therefore dropping: CMIP.MPI-M.MPI-ESM1-2-HR.historical.r2i1p1f1.Amon.psl.gn.gs://cmip6/CMIP6/CMIP/MPI-M/MPI-ESM1-2-HR/historical/r2i1p1f1/Amon/psl/gn/v20190710/.20190710
Grid r1i1p1f1 and r2i1p1f1 are different despite same label, causes issues with regridding, therefore dropping: ScenarioMIP.DKRZ.MPI-ESM1-2-HR.ssp370.r2i1p1f1.Amon.psl.gn.gs://cmip6/CMIP6/ScenarioMIP/DKRZ/MPI-ESM1-2-HR/ssp370/r2i1p1f1/Amon/psl/gn/v20190710/.20190710
Dropping duplicate timesteps for:FGOALS-g3.gn.Amon.r5i1p1f1.psl
Dropping duplicate timesteps for:FGOALS-g3.gn.Amon.r4i1p1f1.psl
Dropping duplicate timesteps for:FGOALS-g3.gn.Amon.r1i1p1f1.psl
Dropping duplicate timesteps for:FGOALS-g3.gn.Amon.r2i1p1f1.psl
Dropping duplicate timesteps for:FGOALS-g3.gn.Amon.r3i1p1f1.psl
ssp585

--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.member_id

Grid r1i1p1f1 and r2i1p1f1 are different despite same label, causes issues with regridding, therefore dropping: ScenarioMIP.DWD.MPI-ESM1-2-HR.ssp585.r2i1p1f1.Amon.psl.gn.gs://cmip6/CMIP6/ScenarioMIP/DWD/MPI-ESM1-2-HR/ssp585/r2i1p1f1/Amon/psl/gn/v20190710/.20190710
Grid r1i1p1f1 and r2i1p1f1 are different despite same label, causes issues with regridding, therefore dropping: CMIP.MPI-M.MPI-ESM1-2-HR.historical.r2i1p1f1.Amon.psl.gn.gs://cmip6/CMIP6/CMIP/MPI-M/MPI-ESM1-2-HR/historical/r2i1p1f1/Amon/psl/gn/v20190710/.20190710
Dropping duplicate timesteps for:FGOALS-g3.gn.Amon.r4i1p1f1.psl
Dropping duplicate timesteps for:FGOALS-g3.gn.Amon.r3i1p1f1.psl
Dropping duplicate timesteps for:FGOALS-g3.gn.Amon.r1i1p1f1.psl
Dropping duplicate timesteps for:FGOALS-g3.gn.Amon.r2i1p1f1.psl


  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
  return np.asarray(self.get_duck_array(), dtype=dtype, copy=copy)
  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
  return np.asarray(self.get_duck_array(), dtype=dtype, copy=copy)
  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
  return np.asarray(self.get_duck_array(), dtype=dtype, copy=copy)


Regrid the datasets to the target grid (only if `regrid` is enabled):

In [8]:
# Regrid every dataset of every SSP with the regridders generated earlier.
# NOTE: the result is written back into 'ssp_ddicts', so re-running this cell
# on an already regridded 'ssp_ddicts' would apply the regridders a second time.
for ssp in ssps_to_process:
    print(ssp)
    ssp_ddicts[ssp] = (
        regrid_datasets_in_ddict(ssp_ddicts[ssp], regridder_dict)
        if regrid
        else ssp_ddicts[ssp]
    )

ssp126


  0%|          | 0/289 [00:00<?, ?it/s]

ssp245


  0%|          | 0/408 [00:00<?, ?it/s]

ssp370


  0%|          | 0/364 [00:00<?, ?it/s]

ssp585


  0%|          | 0/338 [00:00<?, ?it/s]

Compute the inverse barometer effect (IBE):

In [9]:
# Latitude-based area weights (cosine of latitude) on the regular 1x1 target grid.
# LAT/LON are 2D arrays from meshgrid of the target-grid lon and lat.
LON, LAT = np.meshgrid(target_grid.lon, target_grid.lat)
aweights = np.cos(np.deg2rad(LAT))

In [11]:
# Compute the inverse barometer effect (IBE) for every dataset of every SSP:
#     ibe = -(psl - <psl>) / (g * rho)          (from Stammer 2008)
# where <psl> is the area-weighted mean of 'psl' over the unmasked (ocean) cells.
# Datasets of models without a land mask in 'mask_dict' are skipped.
#
# The input datasets in 'ssp_ddicts' are left untouched: masking and weighting
# are done on local objects, and the global attributes are copied, not shared.
ibe_ddicts = defaultdict(dict)

# presumably g = 9.81 m/s^2 and a reference seawater density of 1025 kg/m^3 -- TODO confirm against Stammer 2008
inv_g_rho = 1/(9.81 * 1025)

# latitude-based area weights as a DataArray, so that they can be used without adding them to each dataset
area_weights = xr.DataArray(aweights, dims=('lat','lon'))

for ssp in ssps_to_process:
    print(ssp)
    ibe_ddict = defaultdict(dict)

    for key,ds in tqdm(ssp_ddicts[ssp].items()):
        if ds.source_id not in mask_dict: #no matching land mask available for this model
            continue

        psl_ocean = ds['psl'].where(mask_dict[ds.source_id],np.nan) #apply land mask based on matching preprocessed zos file
        psl_ocean_mean = psl_ocean.weighted(area_weights).mean(('lon','lat')) #weighted mean over the unmasked cells

        ibe = (inv_g_rho * -(psl_ocean - psl_ocean_mean)).to_dataset(name='ibe') #turn into new dataset
        ibe.attrs = dict(ds.attrs) #copy, so that editing the output attributes cannot alter the input dataset

        ibe_ddict[key] = ibe #put into dictionary
    ibe_ddicts[ssp] = ibe_ddict

ssp126


  0%|          | 0/289 [00:00<?, ?it/s]

ssp245


  0%|          | 0/408 [00:00<?, ?it/s]

ssp370


  0%|          | 0/364 [00:00<?, ?it/s]

ssp585


  0%|          | 0/338 [00:00<?, ?it/s]

Store output:

In [13]:
# Write every IBE dataset to a zarr store at
#     <output_path>/ibe[_<target grid name>]/<source_id>/<key>.hist_<ssp>.<first year>-<last year>
# Existing stores are only rewritten when 'overwrite_existing' is True.
for ssp in ssps_to_process:
    print(ssp)

    # 'ibe', or 'ibe_<target grid name>' when the data were regridded
    variable_dir = 'ibe_'+target_grid.attrs['name'] if regrid else 'ibe'

    for key,ds in tqdm(ibe_ddicts[ssp].items()):

        ds = ds.sel(time=slice(output_period[0],output_period[-1])) #select output period
        ds = ds[['ibe']] #get rid of 'area' that is a variable in some datasets

        if ds.time.size == 0: #the first/last year cannot be determined for the file name without timesteps
            print('No timesteps within the requested output period, skipping: '+key)
            continue

        ds_name = key+'.hist_'+ssp+'.'+str(ds.time[0].dt.year.values)+'-'+str(ds.time[-1].dt.year.values) #generate file name
        output_fn = os.path.join(output_path,variable_dir,ds.source_id,ds_name)

        if not overwrite_existing and fs.exists(output_fn):
            continue

        #store:
        try:
            ds.to_zarr(output_fn,mode='w') #fails if chunks are not uniform due to time concatenation
        except Exception as err: #not a bare 'except', so that KeyboardInterrupt/SystemExit still stop the loop
            print('Writing '+output_fn+' failed ('+repr(err)+'), retrying after rechunking along time.')
            ds['ibe'] = ds['ibe'].chunk({'time':'auto'})
            ds.to_zarr(output_fn,mode='w')

ssp126


  0%|          | 0/280 [00:00<?, ?it/s]

ssp245


  0%|          | 0/399 [00:00<?, ?it/s]

ssp370


  0%|          | 0/348 [00:00<?, ?it/s]

ssp585


  0%|          | 0/323 [00:00<?, ?it/s]