In [2]:
import numpy as np
import xarray as xr
import dask
import os
import intake
import pandas as pd
from collections import defaultdict
from tqdm.autonotebook import tqdm
from xmip.preprocessing import rename_cmip6, promote_empty_dims, correct_coordinates, broadcast_lonlat, correct_lon, correct_units, fix_metadata,_drop_coords
from xmip.postprocessing import combine_datasets,_concat_sorted_time, match_metrics
from cmip_catalogue_operations import reduce_cat_to_max_num_realizations, reduce_areacello_cat, search_cloud
from cmip_ds_dict_operations import generate_dict_of_datasets, drop_duplicate_timesteps, drop_coords, fix_inconsistent_calendars, find_matching_pic_datasets, store_matched_pic_linfit, subtract_pic_linfit
from cmip_ds_dict_operations import drop_incomplete, drop_vars, create_regridder_dict, subtract_ocean_awMean, get_availability_from_ddicts,select_period, regrid_datasets_in_ddict
import xesmf as xe
import gcsfs
fs = gcsfs.GCSFileSystem() #list stores, stripp zarr from filename, load 

  from tqdm.autonotebook import tqdm


Various functionalities:

In [2]:
def partial_combined_preprocessing(ds): #'combined_preprocessing' from xmip is problematic for some datasets
    ds = rename_cmip6(ds) # fix naming
    ds = promote_empty_dims(ds) # promote empty dims to actual coordinates
    ds = correct_coordinates(ds) # demote coordinates from data_variables
    ds = broadcast_lonlat(ds) # broadcast lon/lat
    ds = correct_lon(ds) # shift all lons to consistent 0-360
    ds = correct_units(ds) # fix the units
    ''' part of combined preprocessing
    ds = parse_lon_lat_bounds(ds) # rename the `bounds` according to their style (bound or vertex)
    ds = sort_vertex_order(ds) # sort verticies in a consistent manner
    ds = maybe_convert_bounds_to_vertex(ds) # convert vertex into bounds and vice versa, so both are available
    ds = maybe_convert_vertex_to_bounds(ds)
    '''
    ds = fix_metadata(ds)
    ds = ds.drop_vars(_drop_coords, errors="ignore")
    return ds

def cleanup_datasets_in_dict(ddict):
    ddict = drop_duplicate_timesteps(ddict) #remove duplicate timesteps if present
    ddict = drop_coords(ddict,['vertices_latitude','vertices_longitude']) #remove coords & variables
    ddict = drop_vars(ddict,['vertices_latitude','vertices_longitude'])

    ddict_out = defaultdict(dict)
    for k,v in ddict.items():
        
        v = v.isel(dcpp_init_year=0,drop=True,missing_dims='ignore')
        
        if v.source_id=='INM-CM4-8':
            in_russia_blob = ((v.lat>=40)&(v.lat<=70)&(v.lon>=65)&(v.lon<=120))
            in_us_blob = ((v.lat>=40)&(v.lat<=50)&(v.lon>=260)&(v.lon<=290))
            v = v.where(v.lat>=-79).where(in_russia_blob==False).where(in_us_blob==False) 

        if v.source_id=='FGOALS-g3':
            if v.member_id.values in ['r4i1p1f1','r5i1p1f1']:
                print('Large jump between "zos" of FGOALS-g3 from historical to SSP, dropping: '+k)
                continue
                
        if 'x' in v:
            if len(v['x'])==0:
                print('Longitude and/or latitude dimensions have length 0, dropping: '+k)
                continue
        if 'y' in v:
            if len(v['y'])==0:
                print('Longitude and/or latitude dimensions have length 0, dropping: '+k)
                continue
     
        ddict_out[k] = v

    return ddict_out

Configure the script:

In [3]:
query_var = 'zos' #variable to use for data query
query_ssps = ['ssp126','ssp245','ssp370','ssp585'] #SSPs to use for data query

ssps_to_process = ['ssp126','ssp245','ssp370','ssp585'] #SSPs to process data for 

regrid = True
target_grid = xr.Dataset(
    {"lat": (["lat"], np.arange(-90, 90, 1), {"units": "degrees_north"}),
     "lon": (["lon"], np.arange(0, 360, 1), {"units": "degrees_east"}),})
target_grid.attrs['name'] = '1x1'

models_to_exclude = ['AWI-CM-1-1-MR','KIOST-ESM'] #models to exclude a-priori becaue of preprocessing/data issues

min_pic_numYears = 150

output_period = ['1980','2500']

output_path = 'gs://leap-persistent/timh37/CMIP6/'

overwrite_existing = False

Query datasets, tidy up, put into dictionaries of datasets, prepare for preprocessing:

In [4]:
#search & generate piControl dictionary of datasets
pic_cat = search_cloud(query_var,'piControl','Omon',['source_id', 'member_id','grid_label']) #done separately from hist/SSPs because parent variant (i.e., piControl variant) is not necessarily the same as historical/SSPs variant
pic_ddict = generate_dict_of_datasets(pic_cat,models_to_exclude,partial_combined_preprocessing)
pic_ddict = cleanup_datasets_in_dict(pic_ddict)
pic_ddict = drop_incomplete(pic_ddict) #remove timeseries which are not montonically increasing or have large timegaps (based on checks in CMIP6-LEAP-feadstock)

#search & generate areacello dictionary of datasets
areacello_cat = search_cloud(variable_id='areacello')
areacello_cat = reduce_areacello_cat(areacello_cat) #remove duplicates of model-grid combinations
areacello_ddict = generate_dict_of_datasets(areacello_cat,models_to_exclude,partial_combined_preprocessing)
areacello_ddict = cleanup_datasets_in_dict(areacello_ddict)

for k,v in areacello_ddict.items():
    areacello_ddict[k] = v.isel(member_id=0,drop=True).isel(dcpp_init_year=0,drop=True,missing_dims='ignore') #member_id is irrelevant here and causes issues upon match metrics
    
#search & generate hist+SSPs dictionary of datasets
ssp_cats = defaultdict(dict)
for s,ssp in enumerate(query_ssps):
    cat = search_cloud(query_var,['historical',ssp],'Omon',['source_id', 'member_id','grid_label']) #done per SSP because availability may be different
    ssp_cats[ssp] = cat

#put ssp cats together (AFAIK no other way but to copy an existing catalog and to assign the concatenation of the dataframes inside each separate catalogue as the new dataframe)   
ssp_cats_merged = ssp_cats[query_ssps[0]] 
ssp_cats_merged.esmcat._df = pd.concat([v.df for k,v in ssp_cats.items()],ignore_index=True).drop_duplicates(ignore_index=True)

#remove hist/SSP catalogue entries without areacello and pic source_id/grid_label combinations:
where_no_auxdata = [i for i,val in enumerate(list(ssp_cats_merged.df.source_id+ssp_cats_merged.df.grid_label)) if ( (val in list(areacello_cat.df.source_id+areacello_cat.df.grid_label)) & (val in list(pic_cat.df.source_id+pic_cat.df.grid_label)) ) ]
ssp_cats_merged.esmcat._df = ssp_cats_merged.df.iloc[where_no_auxdata]

ssp_cats_merged = reduce_cat_to_max_num_realizations(ssp_cats_merged) #per model, select grid and 'ipf' combination providing most realizations (needs to be applied to both SSPs together to ensure the same variants are used under both scenarios)

ssp_ddicts = defaultdict(dict) #initialize dictionary to hold ssp dictionaries of datasets
for s,ssp in enumerate(ssps_to_process):
    print(ssp)
    ssp_cat = ssp_cats_merged.search(experiment_id=['historical',ssp],table_id='Omon',variable_id=query_var,require_all_on=['source_id', 'member_id','grid_label']) #retrieve ssp cat from reduced catalogue
    
    ssp_ddict = {}
    ssp_ddict = generate_dict_of_datasets(ssp_cat,models_to_exclude,partial_combined_preprocessing)
    ssp_ddict = cleanup_datasets_in_dict(ssp_ddict)    
    
    with dask.config.set(**{'array.slicing.split_large_chunks': True}): #concatenate historical and SSP
        ssp_ddict = combine_datasets(ssp_ddict,_concat_sorted_time,match_attrs =['source_id', 'grid_label','table_id','variant_label','variable_id'],combine_func_kwargs={'join':'inner','coords':'minimal','compat':'override'})    
    
    ssp_ddict = drop_duplicate_timesteps(ssp_ddict) #remove overlap between historical and ssp experiments, which sometimes exists, again using 'drop_duplicate_timesteps'
    ssp_ddict = fix_inconsistent_calendars(ssp_ddict) #through concatenating hist/SSP, sometimes inconsistencies in time calendar arise
    ssp_ddict = select_period(ssp_ddict,output_period[0],output_period[-1]) #select requested output period
    ssp_ddict = drop_incomplete(ssp_ddict) #remove historical+ssp timeseries which are not montonically increasing or have large timegaps (based on checks in CMIP6-LEAP-feadstock)
    
    ssp_ddict = match_metrics(ssp_ddict,areacello_ddict,['areacello']) #add 'areacello' metric for computing area-weighted mean, if available
    ssp_ddict, datasets_without_pic = find_matching_pic_datasets(ssp_ddict,pic_ddict,query_var,min_pic_numYears) #add matched_pic_ds key to attributes, if available

    ssp_ddict = store_matched_pic_linfit(ssp_ddict,pic_ddict,query_var,os.path.join(output_path,str(query_var)+'_piControl_linfit')) #apply linear fits to matched piControl & store output, if not already stored
    
    ssp_ddicts[ssp] = ssp_ddict #add dictionaries of datasets to dictionary of SSPs

if regrid: #if regridding
    regridder_dict = create_regridder_dict(ssp_ddicts,target_grid) #generate xesmf regridders per model-grid combination  
    #TO-DO: develop option to regrid to tide gauges/list of coordinates
    


--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.member_id.table_id.variable_id.grid_label.zstore.dcpp_init_year.version'





--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.member_id.table_id.variable_id.grid_label.zstore.dcpp_init_year.version'




ssp126

--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.member_id.table_id.variable_id.grid_label.zstore.dcpp_init_year.version'




Dropping duplicate timesteps for:ScenarioMIP.CNRM-CERFACS.CNRM-ESM2-1.ssp126.r4i1p1f2.Omon.zos.gn.gs://cmip6/CMIP6/ScenarioMIP/CNRM-CERFACS/CNRM-ESM2-1/ssp126/r4i1p1f2/Omon/zos/gn/v20190410/.20190410
Could not determine unique timesteps in: CESM2-WACCM.gn.Omon.r1i1p1f1.zos, dropping dataset.


  0%|          | 0/310 [00:00<?, ?it/s]

  0%|          | 0/310 [00:00<?, ?it/s]

ssp245

--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.member_id.table_id.variable_id.grid_label.zstore.dcpp_init_year.version'




Dropping duplicate timesteps for:EC-Earth3-Veg.gn.Omon.r5i1p1f1.zos


  0%|          | 0/404 [00:00<?, ?it/s]

  0%|          | 0/404 [00:00<?, ?it/s]

ssp370

--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.member_id.table_id.variable_id.grid_label.zstore.dcpp_init_year.version'




  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

ssp585

--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.member_id.table_id.variable_id.grid_label.zstore.dcpp_init_year.version'




Dropping duplicate timesteps for:CMIP.E3SM-Project.E3SM-1-0.historical.r5i1p1f1.Omon.zos.gr.gs://cmip6/CMIP6/CMIP/E3SM-Project/E3SM-1-0/historical/r5i1p1f1/Omon/zos/gr/v20200429/.20200429


  0%|          | 0/317 [00:00<?, ?it/s]

  0%|          | 0/317 [00:00<?, ?it/s]

  0%|          | 0/41 [00:00<?, ?it/s]

Carry out preprocessing:

In [5]:
for s,ssp in enumerate(ssps_to_process):
    print(ssp)
    ssp_ddict = ssp_ddicts[ssp]
    
    ssp_ddict = subtract_pic_linfit(ssp_ddict,query_var,os.path.join(output_path,str(query_var)+'_piControl_linfit')) #correct for drift
    ssp_ddict = subtract_ocean_awMean(ssp_ddict,query_var) #remove ocean area-weighted mean

    if regrid:
        ssp_ddict = regrid_datasets_in_ddict(ssp_ddict,regridder_dict)
    ssp_ddicts[ssp] = ssp_ddict

ssp126


  0%|          | 0/310 [00:00<?, ?it/s]

  0%|          | 0/254 [00:00<?, ?it/s]

  0%|          | 0/254 [00:00<?, ?it/s]

ssp245


  0%|          | 0/404 [00:00<?, ?it/s]

  0%|          | 0/348 [00:00<?, ?it/s]

  0%|          | 0/348 [00:00<?, ?it/s]

ssp370


  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/233 [00:00<?, ?it/s]

  0%|          | 0/233 [00:00<?, ?it/s]

ssp585


  0%|          | 0/317 [00:00<?, ?it/s]

  0%|          | 0/260 [00:00<?, ?it/s]

  0%|          | 0/260 [00:00<?, ?it/s]

Store output:

In [6]:
for s,ssp in enumerate(ssps_to_process):
    print(ssp)
    ssp_ddict = ssp_ddicts[ssp]
    
    for key,ds in tqdm(ssp_ddict.items()):
        ds_name = key+'.hist_'+ssp+'.'+str(ds.time[0].dt.year.values)+'-'+str(ds.time[-1].dt.year.values) #generate file name

        output_fn = os.path.join(output_path,query_var+['','_'+target_grid.attrs['name']][regrid],ds.source_id,ds_name)
        
        if overwrite_existing or not fs.exists(output_fn):   
            ds = ds[[query_var]] #get rid of 'area' that is a variable in some datasets
            ds[query_var] = ds[query_var].chunk({'time':200,'member_id':1,'lat':len(target_grid.lat),'lon':len(target_grid.lon)})
            print('storing: '+output_fn)
            ds.to_zarr(output_fn,mode='w') 
            ds.close()
        else:
            #print(output_fn+' already exists in output directory, moving on.')

ssp126


  0%|          | 0/254 [00:00<?, ?it/s]

gs://leap-persistent/timh37/CMIP6/zos_1x1/CNRM-ESM2-1/CNRM-ESM2-1.gn.Omon.r1i1p1f2.zos.hist_ssp126.1980-2100 already exists in output directory, moving on.
gs://leap-persistent/timh37/CMIP6/zos_1x1/MPI-ESM1-2-LR/MPI-ESM1-2-LR.gn.Omon.r46i1p1f1.zos.hist_ssp126.1980-2100 already exists in output directory, moving on.
gs://leap-persistent/timh37/CMIP6/zos_1x1/MIROC-ES2L/MIROC-ES2L.gn.Omon.r7i1p1f2.zos.hist_ssp126.1980-2100 already exists in output directory, moving on.
gs://leap-persistent/timh37/CMIP6/zos_1x1/CanESM5/CanESM5.gn.Omon.r19i1p1f1.zos.hist_ssp126.1980-2100 already exists in output directory, moving on.
gs://leap-persistent/timh37/CMIP6/zos_1x1/MPI-ESM1-2-LR/MPI-ESM1-2-LR.gn.Omon.r9i1p1f1.zos.hist_ssp126.1980-2100 already exists in output directory, moving on.
gs://leap-persistent/timh37/CMIP6/zos_1x1/MIROC6/MIROC6.gn.Omon.r33i1p1f1.zos.hist_ssp126.1980-2100 already exists in output directory, moving on.
gs://leap-persistent/timh37/CMIP6/zos_1x1/FIO-ESM-2-0/FIO-ESM-2-0.gn.Omon

  0%|          | 0/348 [00:00<?, ?it/s]

gs://leap-persistent/timh37/CMIP6/zos_1x1/CNRM-CM6-1/CNRM-CM6-1.gn.Omon.r7i1p1f2.zos.hist_ssp245.1980-2020 already exists in output directory, moving on.
gs://leap-persistent/timh37/CMIP6/zos_1x1/CNRM-CM6-1/CNRM-CM6-1.gn.Omon.r6i1p1f2.zos.hist_ssp245.1980-2100 already exists in output directory, moving on.
gs://leap-persistent/timh37/CMIP6/zos_1x1/CNRM-ESM2-1/CNRM-ESM2-1.gn.Omon.r3i1p1f2.zos.hist_ssp245.1980-2100 already exists in output directory, moving on.
gs://leap-persistent/timh37/CMIP6/zos_1x1/CAMS-CSM1-0/CAMS-CSM1-0.gn.Omon.r2i1p1f1.zos.hist_ssp245.1980-2099 already exists in output directory, moving on.
gs://leap-persistent/timh37/CMIP6/zos_1x1/MPI-ESM1-2-LR/MPI-ESM1-2-LR.gn.Omon.r37i1p1f1.zos.hist_ssp245.1980-2100 already exists in output directory, moving on.
gs://leap-persistent/timh37/CMIP6/zos_1x1/MIROC-ES2L/MIROC-ES2L.gn.Omon.r7i1p1f2.zos.hist_ssp245.1980-2100 already exists in output directory, moving on.
gs://leap-persistent/timh37/CMIP6/zos_1x1/HadGEM3-GC31-LL/HadGEM3

  0%|          | 0/233 [00:00<?, ?it/s]

gs://leap-persistent/timh37/CMIP6/zos_1x1/CanESM5/CanESM5.gn.Omon.r19i1p1f1.zos.hist_ssp370.1980-2100 already exists in output directory, moving on.
gs://leap-persistent/timh37/CMIP6/zos_1x1/GISS-E2-1-G/GISS-E2-1-G.gn.Omon.r9i1p1f2.zos.hist_ssp370.1980-2100 already exists in output directory, moving on.
gs://leap-persistent/timh37/CMIP6/zos_1x1/ACCESS-ESM1-5/ACCESS-ESM1-5.gn.Omon.r23i1p1f1.zos.hist_ssp370.1980-2100 already exists in output directory, moving on.
gs://leap-persistent/timh37/CMIP6/zos_1x1/EC-Earth3-Veg/EC-Earth3-Veg.gn.Omon.r2i1p1f1.zos.hist_ssp370.1980-2100 already exists in output directory, moving on.
gs://leap-persistent/timh37/CMIP6/zos_1x1/ACCESS-ESM1-5/ACCESS-ESM1-5.gn.Omon.r8i1p1f1.zos.hist_ssp370.1980-2100 already exists in output directory, moving on.
gs://leap-persistent/timh37/CMIP6/zos_1x1/CAMS-CSM1-0/CAMS-CSM1-0.gn.Omon.r2i1p1f1.zos.hist_ssp370.1980-2099 already exists in output directory, moving on.
gs://leap-persistent/timh37/CMIP6/zos_1x1/CanESM5/CanESM5.

  0%|          | 0/260 [00:00<?, ?it/s]

gs://leap-persistent/timh37/CMIP6/zos_1x1/EC-Earth3-Veg/EC-Earth3-Veg.gn.Omon.r4i1p1f1.zos.hist_ssp585.1980-2100 already exists in output directory, moving on.
gs://leap-persistent/timh37/CMIP6/zos_1x1/MIROC6/MIROC6.gn.Omon.r44i1p1f1.zos.hist_ssp585.1980-2100 already exists in output directory, moving on.
gs://leap-persistent/timh37/CMIP6/zos_1x1/CanESM5/CanESM5.gn.Omon.r19i1p1f1.zos.hist_ssp585.1980-2100 already exists in output directory, moving on.
gs://leap-persistent/timh37/CMIP6/zos_1x1/EC-Earth3/EC-Earth3.gn.Omon.r11i1p1f1.zos.hist_ssp585.1980-2100 already exists in output directory, moving on.
gs://leap-persistent/timh37/CMIP6/zos_1x1/MPI-ESM1-2-LR/MPI-ESM1-2-LR.gn.Omon.r17i1p1f1.zos.hist_ssp585.1980-2100 already exists in output directory, moving on.
gs://leap-persistent/timh37/CMIP6/zos_1x1/MPI-ESM1-2-LR/MPI-ESM1-2-LR.gn.Omon.r38i1p1f1.zos.hist_ssp585.1980-2100 already exists in output directory, moving on.
gs://leap-persistent/timh37/CMIP6/zos_1x1/MPI-ESM1-2-LR/MPI-ESM1-2-LR