In [1]:
import numpy as np
import xarray as xr
import dask
import cftime
import os
import intake
import pandas as pd
from collections import defaultdict
from tqdm.autonotebook import tqdm
from xmip.utils import google_cmip_col
from xmip.preprocessing import rename_cmip6, promote_empty_dims, correct_coordinates, broadcast_lonlat, correct_lon, correct_units, fix_metadata,_drop_coords
from xmip.postprocessing import combine_datasets,_concat_sorted_time, match_metrics
from cmip_catalogue_operations import reduce_cat_to_max_num_realizations, drop_older_versions, search_cloud
from cmip_ds_dict_operations import generate_dict_of_datasets, drop_duplicate_timesteps, drop_coords, drop_incomplete, drop_vars, create_regridder_dict, get_availability_from_ddicts
import xesmf as xe
import gcsfs
fs = gcsfs.GCSFileSystem() #Google Cloud Storage filesystem handle; used further down to list stores (fs.ls) and to check whether an output store already exists (fs.exists)

  from tqdm.autonotebook import tqdm


Helper functions for preprocessing and cleaning up the datasets:

In [2]:
def partial_combined_preprocessing(ds): #'combined_preprocessing' from xmip is problematic for some datasets
    """Run a reduced version of xmip's preprocessing chain on one dataset.

    The steps are applied in the order listed in `preprocessing_steps`, after
    which the variables named in `_drop_coords` are dropped (names that are not
    present are ignored).

    The following steps, which are part of xmip's combined preprocessing, are
    deliberately NOT applied here:
        parse_lon_lat_bounds            (rename the `bounds` according to their style, bound or vertex)
        sort_vertex_order               (sort vertices in a consistent manner)
        maybe_convert_bounds_to_vertex  (convert vertex into bounds and vice versa,
        maybe_convert_vertex_to_bounds   so both are available)

    Parameters
    ----------
    ds : dataset to preprocess.

    Returns
    -------
    The preprocessed dataset.
    """
    preprocessing_steps = (
        rename_cmip6,        # fix naming
        promote_empty_dims,  # promote empty dims to actual coordinates
        correct_coordinates, # demote coordinates from data_variables
        broadcast_lonlat,    # broadcast lon/lat
        correct_lon,         # shift all lons to consistent 0-360
        correct_units,       # fix the units
        fix_metadata,
    )
    for step in preprocessing_steps:
        ds = step(ds)
    return ds.drop_vars(_drop_coords, errors="ignore")

def cleanup_datasets_in_dict(ddict):
    """Clean up every dataset in a dictionary of datasets.

    Steps, in order:
      1. drop duplicate timesteps (if present),
      2. drop 'vertices_latitude' / 'vertices_longitude', first via `drop_coords`
         and then via `drop_vars`,
      3. for datasets containing 'dcpp_init_year', keep only its first entry and
         drop the resulting scalar coordinate.

    Parameters
    ----------
    ddict : dictionary mapping keys to datasets.

    Returns
    -------
    The dictionary returned by the helper functions, with the entries from
    step 3 replaced in place.
    """
    vertex_names = ('vertices_latitude','vertices_longitude')

    ddict = drop_duplicate_timesteps(ddict)
    ddict = drop_coords(ddict,list(vertex_names))
    ddict = drop_vars(ddict,list(vertex_names))

    # only existing keys are reassigned, so the dictionary does not change size
    for name in list(ddict):
        dataset = ddict[name]
        if 'dcpp_init_year' in dataset:
            ddict[name] = dataset.isel(dcpp_init_year=0,drop=True)
    return ddict

Configure the script:

In [3]:
query_var = 'psl' #CMIP6 variable_id to query; the IBE further down is computed from this pressure field
#alternative for a quick run with a single scenario: ssps = ['ssp585']
ssps = ['ssp126','ssp245','ssp370','ssp585'] #SSPs to process; each one is combined with 'historical' (TODO: streamline the per-SSP loops)

#NOTE: in the visible cells this flag only selects the output folder name ('ibe' vs 'ibe_<grid name>');
#the regridding cells themselves do not check it and always regrid to target_grid
regrid = True
#regular target grid: lat = -90..89 and lon = 0..359 in steps of 1 degree
target_grid = xr.Dataset(
    {"lat": (["lat"], np.arange(-90, 90, 1), {"units": "degrees_north"}),
     "lon": (["lon"], np.arange(0, 360, 1), {"units": "degrees_east"}),})
target_grid.attrs['name'] = '1x1' #target grid assumed to be regular; the name is used in the output folder name

#location of the 'zos' stores (one folder per model) from which the ocean/land masks are derived
zos_path = 'gs://leap-persistent/timh37/CMIP6/zos_1x1'

#models to exclude a priori because of preprocessing issues (to be sorted out?)
models_to_exclude = ['AWI-CM-1-1-MR','AWI-ESM-1-1-LR','AWI-CM-1-1-LR','KIOST-ESM']

#not referenced by any of the visible cells -- presumably a minimum piControl length in years (TODO confirm or remove)
min_pic_numYears = 150

#[start, end] of the time slice that is selected before storing
output_period = ['1950','2500']

#root folder under which the output stores are written
output_path = 'gs://leap-persistent/timh37/CMIP6/'

#if False, stores that already exist at the output location are not written again
overwrite_existing = False

Query datasets, put into dictionaries of datasets, and preprocess:

In [4]:
#search & load hist+ssp dictionary of datasets
ssp_cats = defaultdict(dict)
for ssp in ssps:
    #done per SSP because availability may be different
    ssp_cats[ssp] = search_cloud(query_var,['historical',ssp],'Amon',['source_id', 'member_id','grid_label'])

#put ssp cats together (AFAIK no other way but to reuse an existing catalog and to assign the concatenation of the dataframes inside each separate catalogue as the new dataframe)
#NOTE: this is the same object as ssp_cats[ssps[0]] (not a copy), so that catalogue's dataframe is overwritten here as well
ssp_cats_merged = ssp_cats[ssps[0]]
ssp_cats_merged.esmcat._df = pd.concat([cat.df for cat in ssp_cats.values()],ignore_index=True).drop_duplicates(ignore_index=True)
ssp_cats_merged = reduce_cat_to_max_num_realizations(ssp_cats_merged) #per model, select grid and 'ipf' combination providing most realizations (needs to be applied to all SSPs together to ensure the same variants are used under all scenarios)

ssp_ddicts = defaultdict(dict) #dictionary (per SSP) of dictionaries of datasets
for ssp in ssps:
    ssp_cat = ssp_cats_merged.search(experiment_id=['historical',ssp],table_id='Amon',variable_id=query_var,require_all_on=['source_id', 'member_id','grid_label']) #retrieve ssp cat from reduced catalogue
    ssp_ddict = generate_dict_of_datasets(ssp_cat,models_to_exclude,partial_combined_preprocessing)
    ssp_ddict = cleanup_datasets_in_dict(ssp_ddict)

    with dask.config.set(**{'array.slicing.split_large_chunks': True}): #concatenate historical and SSP
        ssp_ddict = combine_datasets(ssp_ddict,_concat_sorted_time,match_attrs =['source_id', 'grid_label','table_id','variant_label','variable_id'],combine_func_kwargs={'join':'inner','coords':'minimal','compat':'override'})

    ssp_ddict = drop_duplicate_timesteps(ssp_ddict) #remove overlap between historical and ssp experiments, which sometimes exists, again using 'drop_duplicate_timesteps'

    #intermediate step to unify calendars before dropping incomplete time series for hist+ssp, to-do: put in a separate function?
    #if the historical and SSP experiments have different calendars, subtracting the first from the last timestep fails, which causes issues later on
    for k,v in ssp_ddict.items():
        try:
            v.time[-1] - v.time[0]
        except Exception: #unify calendars (not a bare 'except', so that e.g. a keyboard interrupt is not swallowed)
            not_prolgreg = np.where(np.array([type(i) for i in v.time.values]) != cftime._cftime.DatetimeProlepticGregorian)[0] #find where calendar is not proleptic gregorian
            converted_time = v.isel(time=not_prolgreg).convert_calendar('proleptic_gregorian',use_cftime=True).time #convert at these indices
            newtime = np.array(v.time.values,copy=True) #work on a copy, so that the values backing the existing time coordinate are not modified in place
            newtime[not_prolgreg] = converted_time.values
            ssp_ddict[k]['time'] = newtime #replace old time index with new values

    ssp_ddict = drop_incomplete(ssp_ddict) #remove historical+ssp timeseries which are not monotonically increasing or have large timegaps (based on checks in CMIP6-LEAP-feedstock)
    #grid of r1i1p1f1 and r2i1p1f1 are different despite same label, causes issues with regridding
    #(default of None: this member is not necessarily present for every SSP, and its absence should not raise a KeyError)
    ssp_ddict.pop('MPI-ESM1-2-HR.gn.Amon.r2i1p1f1.psl',None)
    ssp_ddicts[ssp] = ssp_ddict #add to dictionary of dictionaries of datasets


--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.member_id.table_id.variable_id.grid_label.zstore.dcpp_init_year.version'


Dropping duplicate timesteps for:FGOALS-g3.gn.Amon.r3i1p1f1.psl
Dropping duplicate timesteps for:FGOALS-g3.gn.Amon.r1i1p1f1.psl
Dropping duplicate timesteps for:FGOALS-g3.gn.Amon.r4i1p1f1.psl
Dropping duplicate timesteps for:FGOALS-g3.gn.Amon.r2i1p1f1.psl

--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.member_id.table_id.variable_id.grid_label.zstore.dcpp_init_year.version'


Dropping duplicate timesteps for:EC-Earth3-Veg.gr.Amon.r5i1p1f1.psl
Dropping duplicate timesteps for:FGOALS-g3.gn.Amon.r2i1p1f1.psl
Dropping duplicate timesteps for:FGOALS-g3.gn.Amon.r4i1p1f1.psl
Dropping duplicate timesteps for:FGOALS-g3.gn.Amon.r1i1p1f1.psl
Dropping duplicate timesteps for:FGOALS-g3.gn.Amon.r3i1p1f1.psl


/srv/conda/envs/notebook/lib/python3.12/site-packages/pydantic/main.py:1114: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.9/migration/
  return self.raw_function(**d, **var_kwargs)



--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.member_id.table_id.variable_id.grid_label.zstore.dcpp_init_year.version'


Dropping duplicate timesteps for:FGOALS-g3.gn.Amon.r5i1p1f1.psl
Dropping duplicate timesteps for:FGOALS-g3.gn.Amon.r1i1p1f1.psl
Dropping duplicate timesteps for:FGOALS-g3.gn.Amon.r4i1p1f1.psl
Dropping duplicate timesteps for:FGOALS-g3.gn.Amon.r2i1p1f1.psl
Dropping duplicate timesteps for:FGOALS-g3.gn.Amon.r3i1p1f1.psl


/srv/conda/envs/notebook/lib/python3.12/site-packages/pydantic/main.py:1114: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.9/migration/
  return self.raw_function(**d, **var_kwargs)



--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.member_id.table_id.variable_id.grid_label.zstore.dcpp_init_year.version'


Dropping duplicate timesteps for:FGOALS-g3.gn.Amon.r4i1p1f1.psl
Dropping duplicate timesteps for:FGOALS-g3.gn.Amon.r1i1p1f1.psl
Dropping duplicate timesteps for:FGOALS-g3.gn.Amon.r2i1p1f1.psl
Dropping duplicate timesteps for:FGOALS-g3.gn.Amon.r3i1p1f1.psl


Regrid:

In [5]:
#build the regridders from the datasets in ssp_ddicts and the target grid; the result is looked up by source_id in the regridding loop
#(the former 'regridder_dict = {}' initialisation was immediately overwritten and has been removed)
regridder_dict = create_regridder_dict(ssp_ddicts,target_grid)

  0%|          | 0/50 [00:00<?, ?it/s]

In [6]:
#regrid all datasets of all scenarios, replacing each entry of ssp_ddicts by its regridded version
for scenario in ssps:
    print(scenario)
    scenario_datasets = ssp_ddicts[scenario]
    for name,native_ds in tqdm(scenario_datasets.items()):
        model_regridder = regridder_dict[native_ds.attrs['source_id']] #regridders are stored per source_id
        ssp_ddicts[scenario][name] = model_regridder(native_ds, keep_attrs=True) #regrid, keeping the attributes

ssp126


  0%|          | 0/288 [00:00<?, ?it/s]

ssp245


  0%|          | 0/404 [00:00<?, ?it/s]

ssp370


  0%|          | 0/343 [00:00<?, ?it/s]

ssp585


  0%|          | 0/334 [00:00<?, ?it/s]

Find the matching ocean/land mask from the corresponding 'zos' files. These masks are needed because the inverse barometer effect (IBE) is based on pressure anomalies relative to the ocean-area-weighted mean:

In [24]:
#ocean masks per model (True where zos is finite), keyed by source_id
model_masks = defaultdict(dict)
#all source_ids that occur in any of the scenarios
psl_models = np.unique(np.hstack([[ds.attrs['source_id'] for ds in dataset_dict.values()] for dataset_dict in ssp_ddicts.values()]))

models_without_mask = [] #models for which no mask could be derived, with the reason
for model in tqdm(psl_models):
    try:
        fns = fs.ls(os.path.join(zos_path,model))
        zos_ds = xr.open_dataset('gs://'+fns[0],engine='zarr') #use the first store listed for this model
        model_masks[str(model)] = np.isfinite(zos_ds.zos.isel(time=0)).isel(member_id=0,drop=True) #mask from first timestep of first member
    except Exception as err: #still best-effort, but keep track of what is skipped instead of silently continuing
        models_without_mask.append(str(model)+' ('+type(err).__name__+')')

#models without a mask are left out of the IBE computation, so report them
if models_without_mask:
    print('No ocean mask could be derived for: '+', '.join(models_without_mask))

  0%|          | 0/50 [00:00<?, ?it/s]

  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
  return np.asarray(self.get_duck_array(), dtype=dtype, copy=copy)


Compute IBE:

In [26]:
#area weights for the regular target grid: cosine of latitude, as a 2D array matching the (lat, lon) layout of the grid
LON, LAT = np.meshgrid(target_grid.lon, target_grid.lat)
lat_in_radians = np.deg2rad(LAT)
aweights = np.cos(lat_in_radians)

In [28]:
#per scenario, a dictionary of IBE datasets with the same keys as the psl datasets they are derived from
ibe_ddicts = defaultdict(dict)

for scenario in ssps:
    print(scenario)
    scenario_ibe = defaultdict(dict)

    for name,psl_ds in tqdm(ssp_ddicts[scenario].items()):
        model = psl_ds.source_id
        if model not in model_masks: #no ocean mask available for this model, so it cannot be processed
            continue

        #NOTE: the two assignments below modify the datasets stored in ssp_ddicts
        psl_ds['psl'] = psl_ds['psl'].where(model_masks[model],np.nan) #add land mask based on matching preprocessed zos file
        psl_ds['aweights'] = (('lat','lon'),aweights) #add latitude-based area weights

        #pressure anomaly relative to the area-weighted mean over the unmasked grid cells, scaled by 1/(9.81 * 1025) and with the sign flipped
        weighted_mean_psl = psl_ds['psl'].weighted(psl_ds.aweights).mean(('lon','lat'))
        ibe_da = (1/(9.81 * 1025)) * -(psl_ds['psl'] - weighted_mean_psl) #from Stammer 2008

        ibe_ds = ibe_da.to_dataset().rename({'psl':'ibe'}) #turn into new dataset holding variable 'ibe'
        ibe_ds.attrs = psl_ds.attrs

        scenario_ibe[name] = ibe_ds #put into dictionary
    ibe_ddicts[scenario] = scenario_ibe

ssp126


  0%|          | 0/288 [00:00<?, ?it/s]

ssp245


  0%|          | 0/404 [00:00<?, ?it/s]

ssp370


  0%|          | 0/343 [00:00<?, ?it/s]

ssp585


  0%|          | 0/334 [00:00<?, ?it/s]

Print number of available models & members:

availability = get_availability_from_ddicts(ibe_ddicts)
#for every entry of 'availability': print its key, followed by one count per model (models in sorted order)
#NOTE(review): assumes each entry can be indexed by source_id and yields something with a length -- confirm against get_availability_from_ddicts
for label,entries_per_model in availability.items():
    print()
    print(label)
    source_ids_per_ssp = [[ds.attrs['source_id'] for ds in dataset_dict.values()] for dataset_dict in ibe_ddicts.values()]
    for model in np.unique(np.hstack(source_ids_per_ssp)):
        print(len(entries_per_model[model]))


Store output:

In [15]:
#output folder name: 'ibe_<grid name>' if regrid is set, otherwise 'ibe'
output_dir_name = 'ibe_'+target_grid.attrs['name'] if regrid else 'ibe'

for ssp in ssps:
    print(ssp)
    ibe_ddict = ibe_ddicts[ssp]
    for key,ds in tqdm(ibe_ddict.items()):

        ds = ds.sel(time=slice(output_period[0],output_period[1])) #select output period
        if ds.time.size == 0: #nothing to store, and the first/last year needed for the file name would not exist
            print('Skipping '+key+': no timesteps within the output period')
            continue

        ds = ds[['ibe']] #get rid of 'area' that is a variable in some datasets
        ds_name = key+'.hist_'+ssp+'.'+str(ds.time[0].dt.year.values)+'-'+str(ds.time[-1].dt.year.values) #generate file name: <key>.hist_<ssp>.<first year>-<last year>

        output_fn = os.path.join(output_path,output_dir_name,ds.source_id,ds_name)

        if overwrite_existing or not fs.exists(output_fn):
            #store:
            try:
                ds.to_zarr(output_fn,mode='w') #fails if chunks are not uniform due to time concatenation
            except Exception: #retry once after rechunking along time; not a bare 'except', so that e.g. a keyboard interrupt is not answered with a second write
                ds['ibe'] = ds['ibe'].chunk({'time':'auto'})
                ds.to_zarr(output_fn,mode='w')

ssp585


  0%|          | 0/320 [00:00<?, ?it/s]

In [6]:
#list the model folders that have been written; the folder name is passed as a separate path component so that this does not depend on output_path ending in '/'
fs.ls(os.path.join(output_path,'ibe_1x1'))

['leap-persistent/timh37/CMIP6/ibe_1x1/ACCESS-CM2',
 'leap-persistent/timh37/CMIP6/ibe_1x1/ACCESS-ESM1-5',
 'leap-persistent/timh37/CMIP6/ibe_1x1/CAMS-CSM1-0',
 'leap-persistent/timh37/CMIP6/ibe_1x1/CAS-ESM2-0',
 'leap-persistent/timh37/CMIP6/ibe_1x1/CESM2',
 'leap-persistent/timh37/CMIP6/ibe_1x1/CESM2-FV2',
 'leap-persistent/timh37/CMIP6/ibe_1x1/CESM2-WACCM',
 'leap-persistent/timh37/CMIP6/ibe_1x1/CIESM',
 'leap-persistent/timh37/CMIP6/ibe_1x1/CMCC-CM2-SR5',
 'leap-persistent/timh37/CMIP6/ibe_1x1/CMCC-ESM2',
 'leap-persistent/timh37/CMIP6/ibe_1x1/CNRM-CM6-1',
 'leap-persistent/timh37/CMIP6/ibe_1x1/CNRM-CM6-1-HR',
 'leap-persistent/timh37/CMIP6/ibe_1x1/CNRM-ESM2-1',
 'leap-persistent/timh37/CMIP6/ibe_1x1/CanESM5',
 'leap-persistent/timh37/CMIP6/ibe_1x1/CanESM5-CanOE',
 'leap-persistent/timh37/CMIP6/ibe_1x1/EC-Earth3',
 'leap-persistent/timh37/CMIP6/ibe_1x1/EC-Earth3-CC',
 'leap-persistent/timh37/CMIP6/ibe_1x1/EC-Earth3-Veg',
 'leap-persistent/timh37/CMIP6/ibe_1x1/EC-Earth3-Veg-LR',
 'l