# Save warming year annual temperature and precipitation datasets

#### Create datasets of mean annual temperature or annual precipitation for every model/member/scenario combination available on Google Cloud, for each warming year identified via a warming-level and temperature-tolerance approach, and for each lat/lon point on a common grid. This results in one dataset per model/member/scenario combination. From these datasets, also create aggregated datasets that take the median across warming years and then the mean across members (resulting in one file for each model and scenario).

In [1]:
import xarray as xr
import pandas as pd
import numpy as np
import os.path
import dask
from warming_years import calc_warming_years_temperature_window, get_cmip6_data 
from warming_years import get_cmip6_data_at_warming_years
from file_control import gmst_table_dir, data_dir, model_table_dir
from IPython.display import clear_output

In [2]:
# Target warming level (the 'C' in the output names) and the temperature
# tolerance passed to the warming-year calculation.
target_wl = 2
temp_tol = 0.25

# CMIP6 variable to process. Only 'tas' or 'pr' work right now because of how
# the temporal averaging is done.
cmip6_variable = 'tas'

# Root output directory for this warming level / variable combination.
savedir = os.path.join(
    data_dir, f'zarr_{target_wl}C_{cmip6_variable}_annual_data')

In [3]:
# Load the GMST table.
tab = pd.read_csv(os.path.join(gmst_table_dir, 'CMIP6_GMST_table_all.csv'))

# Unique model/member/scenario combinations to loop over. 'historical' rows
# are excluded because historical data is handled in addition to each
# scenario, not as a scenario of its own.
mms = tab[['model', 'member', 'scenario']].drop_duplicates()
mms = mms[mms['scenario'].ne('historical')]

# Warming-years table. dropna() removes the model/member/scenario
# combinations without warming years because the target warming level was
# never reached.
# NOTE(review): the meaning of the trailing 3000 argument is not visible in
# this notebook -- confirm against warming_years.py.
wltable = calc_warming_years_temperature_window(
    mms, target_wl, temp_tol, 3000).dropna()
wltable

Unnamed: 0,model,member,scenario,warming_year
0,GFDL-CM4,r1i1p1f1,ssp245,2043.0
1,GFDL-CM4,r1i1p1f1,ssp245,2044.0
2,GFDL-CM4,r1i1p1f1,ssp245,2045.0
3,GFDL-CM4,r1i1p1f1,ssp245,2046.0
4,GFDL-CM4,r1i1p1f1,ssp245,2047.0
...,...,...,...,...
23570,ACCESS-CM2,r5i1p1f1,ssp585,2034.0
23571,ACCESS-CM2,r5i1p1f1,ssp585,2035.0
23572,ACCESS-CM2,r5i1p1f1,ssp585,2036.0
23573,ACCESS-CM2,r5i1p1f1,ssp585,2037.0


In [4]:
# There are a couple of model/member/scenario combos for which tas is available
# (so they show up in the gmst table and warming year table), but pr is not
# usable. Remove these from mms to avoid errors in the for loop below.
if cmip6_variable == 'pr':
    pr_excluded = [
        # pr is not available for these
        ('NorESM2-LM', 'r1i1p1f1', 'ssp585'),
        ('ACCESS-ESM1-5', 'r30i1p1f1', 'ssp585'),
        ('ACCESS-ESM1-5', 'r24i1p1f1', 'ssp585'),
        ('ACCESS-ESM1-5', 'r38i1p1f1', 'ssp585'),
        ('ACCESS-ESM1-5', 'r31i1p1f1', 'ssp585'),
        # these only have precipitation data through 2039, resulting in no
        # warming years for a 2C warming level
        ('MIROC6', 'r22i1p1f1', 'ssp245'),
        ('MIROC6', 'r50i1p1f1', 'ssp245'),
    ]

    # Filter with a boolean mask instead of drop(... .index[0]) so that a
    # combination that is absent from the table is skipped rather than
    # raising an IndexError.
    for bad_model, bad_member, bad_scenario in pr_excluded:
        is_excluded = ((mms['model'] == bad_model) &
                       (mms['member'] == bad_member) &
                       (mms['scenario'] == bad_scenario))
        mms = mms.loc[~is_excluded]


In [5]:
# Common 0.5-degree grid that every dataset is interpolated to:
# latitudes -90 ... 90 (both poles included), longitudes -180 ... 179.5.
newlats = np.linspace(-90, 90, 361)
newlons = np.linspace(-180, 179.5, 720)

In [6]:
# Build and save one dataset per model/member/scenario combination.
n_combos = mms.shape[0]
combos = zip(mms['model'], mms['member'], mms['scenario'])
for i, (model, member, scenario) in enumerate(combos):
    print(f'running file {i + 1} of {n_combos}', end='\r')

    # warming years for this combination only (one-row slice of mms)
    wltable = calc_warming_years_temperature_window(
        mms.iloc[i:i + 1], target_wl, temp_tol, 3000).dropna()
    yrs = wltable['warming_year'].to_numpy()

    # no warming years for this model/member/scenario: go to the next one
    if len(yrs) == 0:
        continue

    # data at the warming years
    zz = get_cmip6_data_at_warming_years(
        model, member, scenario, cmip6_variable, yrs,
        year_window=0, outfilename=None)

    # fill nans by interpolating along longitude
    zz[cmip6_variable] = zz[cmip6_variable].interpolate_na(dim='lon')

    # put the data on the common spatial grid
    zz = zz.interp(lon=newlons, lat=newlats, method="linear")

    # write the dataset for this combination
    out_name = (f'{model}_{member}_{scenario}_{target_wl}C_'
                f'{cmip6_variable}_all_years.nc')
    zz.to_netcdf(os.path.join(savedir, 'model_member_scenario_years',
                              out_name))

clear_output()

In [7]:
# Load the table of first and last available years.
flyear_csv = os.path.join(model_table_dir,
                          'all_zarr_models_first_last_year.csv')
flyear = pd.read_csv(flyear_csv)


In [8]:
# Set dask's 'array.slicing.split_large_chunks' option to False to remove
# the warnings about chunk sizes.
chunk_opts = {'array.slicing.split_large_chunks': False}
dask.config.set(**chunk_opts)

<dask.config.set at 0x7fa8cb0dc290>

In [9]:
# Create aggregated files from the per-member files written above: take the
# median across warming years, then the mean across members, so that there is
# one file for each model and scenario.

# all combinations of model and scenario that have warming years available:
wltable = calc_warming_years_temperature_window(mms, target_wl, temp_tol, 3000
                                               ).dropna()
ms = wltable[['model','scenario']].drop_duplicates()

# Input directory holding the per-member files.
member_dir = os.path.join(savedir, 'model_member_scenario_years')

# Output directory. savedir has no trailing separator, so the subdirectory
# must be joined (not string-concatenated) to end up inside savedir. Create
# it if it does not exist yet so to_netcdf does not fail.
agg_dir = os.path.join(savedir, 'model_scenario')
os.makedirs(agg_dir, exist_ok=True)

for m, s in zip(ms['model'], ms['scenario']):
    print(m,s, end = '\r')

    # Match the exact per-member file naming
    # (<model>_<member>_<scenario>_<wl>C_<variable>_all_years.nc) so that a
    # scenario cannot also pick up files of a longer scenario name that merely
    # contains it.
    member_files = os.path.join(member_dir,
                                m + '_*_' + s + '_' + str(target_wl) + 'C_' +
                                cmip6_variable + '_all_years.nc')

    try:
        xx = xr.open_mfdataset(member_files)
    except ValueError:
        # some of these model/scenarios give an error
        # ('ValueError: Resulting object does not have monotonic global indexes
        # along dimension year') when using the above approach, but explicitly
        # concatenating along member works
        xx = xr.open_mfdataset(member_files,
                               combine='nested', concat_dim='member')

    # find members that don't have data for the full 21st century (2015
    # through 2099)
    # NOTE(review): completeness is judged from the tas_* columns even when
    # cmip6_variable is 'pr' -- confirm that this is intended.
    thisflyear = flyear.loc[(flyear['model'] == m ) &
                            (flyear['scenario'] == s) &
                            (flyear['member'].isin(xx['member'].values))]
    toremove = thisflyear.loc[(thisflyear['tas_first_year']>2015) |
                              (thisflyear['tas_last_year']<2099)]['member']

    # if all members have incomplete timeseries, then don't save the dataset
    if len(xx['member'].values) == len(toremove):
        print(m + ' ' + s +
              ' was not saved because all members had incomplete timeseries')
        continue

    xx = xx.drop_sel(member = toremove)

    # pick only years <= 2100
    xx = xx.where(xx.year <= 2100, drop=True)

    # aggregate: median across years, then mean across members
    xm = xx.median('year').mean('member')

    # get rid of unnecessary variables (drop_vars replaces the deprecated
    # Dataset.drop)
    xm = xm.drop_vars(['areacella'], errors='ignore')

    # save the file
    xm.to_netcdf(os.path.join(agg_dir,
                              m + '_' + s + '_' + str(target_wl) + 'C_' +
                              cmip6_variable + '.nc'))


IPSL-CM6A-LR ssp534-over was not saved because all members had incomplete timeseries
MRI-ESM2-0 ssp534-over was not saved because all members had incomplete timeseries
MIROC6 ssp534-over was not saved because all members had incomplete timeseries
MPI-ESM-1-2-HAM ssp370 was not saved because all members had incomplete timeseries
CESM2-WACCM ssp534-over was not saved because all members had incomplete timeseries
GISS-E2-1-G ssp534-over was not saved because all members had incomplete timeseries
IITM-ESM ssp370 was not saved because all members had incomplete timeseries
ACCESS-ESM1-5 ssp245r05