In [1]:
import numpy as np
import xarray as xr
import os
import fnmatch
from tqdm.autonotebook import tqdm
import dask
import random
import sys
sys.path.insert(0, '/home/jovyan/CMIP6cf/cmip6cf/')
import gcsfs
from dependence_metrics import kendallstau, utdc_at_threshold, utdc_cfg

  from tqdm.autonotebook import tqdm


In [2]:
def pot(da,threshold,dim):
    """Peaks over threshold: keep only the values of `da` above its own quantile.

    Parameters:
        da: array-like object providing `.quantile(q, dim=...)` and `.where(cond)`
            (used with xr.DataArray in this notebook).
        threshold: quantile level, must satisfy 0 <= threshold < 1.
        dim: name of the dimension along which the quantile is computed.

    Returns:
        The result of `da.where(...)` with the condition "strictly greater than the
        `threshold` quantile along `dim`".

    Raises:
        AssertionError: if `threshold` is outside [0, 1).
    """
    level_is_valid = (threshold>=0) & (threshold<1)
    assert level_is_valid

    quantile_value = da.quantile(threshold,dim=dim)
    exceeds_quantile = da>quantile_value
    return da.where(exceeds_quantile)

def declustered_peaks(da,threshold,window_len,dim):
    """Peaks above the `threshold` quantile of `da`, declustered with a rolling window.

    The exceedances returned by `pot` are compared with their centred rolling maximum
    (window of `window_len` steps along `dim`, `min_periods=1`); only exceedances equal
    to that rolling maximum are kept.

    Parameters:
        da: input array (xr.DataArray in this notebook).
        threshold: quantile level passed on to `pot`.
        window_len: length of the declustering window, in steps along `dim`.
        dim: name of the dimension along which to threshold and decluster.

    Returns:
        The exceedances with everything that is not a rolling-window maximum masked
        out by `.where`.
    """
    exceedances = pot(da,threshold,dim)

    window_maximum = exceedances.rolling({dim:window_len},center=True,min_periods=1).max(skipna=True)
    is_window_maximum = exceedances==window_maximum

    return exceedances.where(is_window_maximum)

def rolling_max(da,window_len,dim):
    """Centred rolling maximum of `da` over `window_len` steps along `dim`.

    `min_periods=1` is passed to `.rolling`, so a window needs only one valid value
    to produce a result.
    """
    window_spec = {dim:window_len}
    roller = da.rolling(window_spec,center=True,min_periods=1)
    return roller.max()

def compute_ktau_in_windows(da1,da2,dim):
    """Apply `kendallstau` to `da1` and `da2` along `dim`, looping over all other dimensions.

    Parameters:
        da1, da2: arrays sharing the core dimension `dim`.
        dim: name of the core dimension that is consumed by `kendallstau`.

    Returns:
        Result of `xr.apply_ufunc` with a new dimension "statistic" of length 2
        (tau and p, according to the original comments in this notebook).
    """
    ufunc_options = dict(
        input_core_dims=[[dim],[dim]], #core dimension is consumed, the others are looped over
        output_core_dims=[["statistic"]], #outputs tau and p
        vectorize=True,
        dask='allowed', #allow calculating in chunks (dask='parallelized' doesn't work)
        output_dtypes=[float],
        output_sizes={"statistic": 2}, #output must be numpy array
    )
    return xr.apply_ufunc(kendallstau,da1,da2,**ufunc_options)

def compute_utdc_in_windows(da1,da2,dim,estimator):
    """Estimate the upper tail dependence coefficient (UTDC) of `da1` and `da2` along `dim`.

    Parameters:
        da1, da2: arrays sharing the core dimension `dim`.
        dim: name of the core dimension that is consumed by the estimator.
        estimator: 'cfg' (uses `utdc_cfg`) or 'threshold' (uses `utdc_at_threshold`
            with its default threshold; .95 according to the original comment).

    Returns:
        Result of `xr.apply_ufunc` with a new dimension "statistic" of length 2.

    Raises:
        ValueError: if `estimator` is not one of the implemented options. The original
            code tried to raise via the undefined name `Except`, which produced a
            NameError instead of the intended message.
    """
    #select the estimator first, so that an invalid name fails before any computation
    if estimator=='cfg':
        utdc_estimator = utdc_cfg
    elif estimator=='threshold':
        utdc_estimator = utdc_at_threshold #uses .95% by default
    else:
        raise ValueError('Estimator '+ repr(estimator) +' not implemented.')

    utdc = xr.apply_ufunc(
                    utdc_estimator, da1, da2,
                    input_core_dims=[[dim], [dim]], #core dimension: time, loop over the others
                    output_core_dims=[["statistic"]], #two values per estimate
                    vectorize=True,
                    dask='allowed', #allow calculating in chunks (dask='parallelized' doesn't work)
                    output_dtypes=[float],
                    output_sizes={"statistic": 2}, #output must be numpy array
                    )

    #to-do: implement p-value for utdc!

    return utdc

def break_0_ties(da):
    """Placeholder for breaking ties among zero values in `da` (not implemented yet).

    The original stub returned the undefined name `da_no_ties`, so any call failed with
    a NameError. Until the tie-breaking scheme is decided, fail explicitly instead.

    Raises:
        NotImplementedError: always.
    """
    raise NotImplementedError('break_0_ties is not implemented yet.')

Configure the bivariate sampling:

In [3]:
# Settings of the bivariate sampling; all names below are read by the analysis cell.
max_lag = 0 #days; co-occurrence is evaluated with a rolling window of max_lag*2+1 days
declus_window_len = 1 #days; length of the rolling window used to decluster peaks
threshold = .98 #quantile above which values count as peaks

output_yrs = np.arange(1880,2100,20) #labels of the analysis windows (used as the 'window' coordinate)
window_len=40 #years per window; may need to increase? indicate settings in output folder?
utdc_threshold = .95 #NOTE(review): not referenced by the code shown in this notebook - confirm whether it should be passed to utdc_at_threshold

season = 'year' #'DJF', 'JJA', 'year'

overwrite_output = True #if False, model/experiment combinations with an existing output file are skipped

random.seed(10) #seeds Python's `random` module only; the commented-out tie-breaking in the analysis cell uses np.random, which this does not seed

Analyze sfcWind & pr, looping over models & experiments:

In [None]:
# Compute dependence metrics between daily sfcWind and pr per model, experiment and member,
# and store them as one netCDF file per model & experiment.
#
# Input: zarr stores on Google Cloud Storage (the timeseries were previously read from local
# folders under /home/jovyan/CMIP6cf/output/timeseries/ with os.listdir).
fs = gcsfs.GCSFileSystem() # equivalent to fsspec.fs('gs')

sfcWind_dir = 'leap-persistent/timh37/CMIP6/timeseries/sfcWind_europe'
pr_dir = 'leap-persistent/timh37/CMIP6/timeseries/pr_europe'

#derive the output folder from the sampling configuration so that results cannot end up under
#a label that does not match the settings (gives '40yr_p98_lag0d_declus1d' for 40/.98/0/1)
settings_label = '{:g}yr_p{:g}_lag{:g}d_declus{:g}d'.format(window_len,threshold*100,max_lag,declus_window_len)
output_dir = os.path.join('/home/jovyan/CMIP6cf/output/dependence/sfcWind_pr_europe',settings_label)

def _experiments_in(path):
    #experiment id = last '_'-separated token of the store name without its 5-character suffix ('.zarr');
    #hidden entries are recognized by their base name (fs.ls returns full paths, which never start with '.')
    store_names = [s.split('/')[-1] for s in fs.ls(path)]
    return [name.split('_')[-1][0:-5] for name in store_names if not name.startswith('.')]

models_sfcWind = [k.split('/')[-1] for k in fs.ls(sfcWind_dir)]
models_pr = [k.split('/')[-1] for k in fs.ls(pr_dir)]
source_ids = sorted(list(set(models_sfcWind) & set(models_pr))) #intersection of models

for source_id in ['CESM2']:#[k for k in source_ids if not k.startswith('.')]: #loop over models
    
    sfcWind_path = os.path.join(sfcWind_dir,source_id)
    pr_path = os.path.join(pr_dir,source_id)
    
    #experiments available for both variables (sorted for a reproducible processing order)
    experiment_ids = sorted(set(_experiments_in(sfcWind_path)) & set(_experiments_in(pr_path)))

    for experiment_id in experiment_ids: #loop over experiments
        #load data (the pr store is assumed to have the same name as the sfcWind store)
        fn = fnmatch.filter(fs.ls(sfcWind_path),'*'+experiment_id+'*')[0]
        fn = fn.split('/')[-1]
        print('Processing file: '+fn)
        sfcWind_pr = xr.open_mfdataset((os.path.join('gs://',sfcWind_path,fn),os.path.join('gs://',pr_path,fn)),engine='zarr',chunks={'member_id':1,'time':100000,'longitude':5})#.sel(longitude=np.arange(-25,11))
        
        #generate output paths & check if output already exists
        #NOTE(review): fn keeps the name of the input store (ending in '.zarr') although the output is written with to_netcdf
        model_path = os.path.join(output_dir,sfcWind_pr.source_id)
        output_fn = os.path.join(model_path,fn)

        os.makedirs(model_path,exist_ok=True) #also creates missing parent folders

        if not overwrite_output: #if not overwriting existing output
            if os.path.exists(output_fn):
                print('Output already exists for this instance.')
                continue
        
        #construct time window indices
        if len(np.unique(sfcWind_pr.time.resample(time='1Y').count()))>1: #remove leap days so that each computation window has the same length
            with dask.config.set(**{'array.slicing.split_large_chunks': True}):
                sfcWind_pr = sfcWind_pr.sel(time=~((sfcWind_pr.time.dt.month == 2) & (sfcWind_pr.time.dt.day == 29))) #^probably (hopefully) only has a small effect on the results
        
        days_in_year = int(sfcWind_pr.time.resample(time='1Y').count()[0])
        
        #positions within a window are the same for odd and even window lengths; only the start differs
        #(the start indices are counted from the year 1850 - assumes the time axis starts there, TODO confirm)
        first_window_idx = np.arange(0*days_in_year,window_len*days_in_year)
        if window_len%2 !=0: #odd
            window_start_idx = days_in_year*(output_yrs-1850-int(np.floor(window_len/2)))
        else: #even
            window_start_idx = days_in_year*(output_yrs-1850-int(window_len/2)+1)
        
        window_time_idx = first_window_idx[:,np.newaxis]+window_start_idx[np.newaxis,:]
        
        if np.max(window_time_idx)>=len(sfcWind_pr.time):
            raise Exception('Windows exceed simulation length.')
            
        window_idx = xr.DataArray( #indices of windows
            data=window_time_idx,
            dims=["time_in_window_idx","window"],
            coords=dict(
                time_in_window_idx=first_window_idx,
                window=output_yrs
            ),
        )
        
        #wrap the iterable (not the enumerate object) so that tqdm knows the number of members
        for m,member in enumerate(tqdm(sfcWind_pr.member_id)): #loop over members to compute the dependence
            sfcWind_pr_mem = sfcWind_pr.sel(member_id=member).copy(deep=True).load()
   
            #break ties (where pr=0, ranks are not well defined) #TO_DO!!
            
            #sfcWind_pr_mem['pr'][np.where(sfcWind_pr_mem.pr==0)] = sfcWind_pr_mem['pr'][np.where(sfcWind_pr_mem.pr==0)] + np.random.uniform(0,sfcWind_pr_mem.pr[sfcWind_pr_mem.pr>0].min(dim='time'),(sfcWind_pr_mem.pr==0).sum(dim='time'))
            
            sfcWind_pr_wdws = sfcWind_pr_mem.isel(time=window_idx) #select windows
            
            #optionally restrict each window to one season (months taken from the first window)
            if season == 'DJF':
                idx = [np.where((sfcWind_pr_wdws.time.isel(window=0).dt.month == month)) for month in [1,2,12]]
                season_idx = np.sort(np.hstack(idx)[0])
                sfcWind_pr_wdws = sfcWind_pr_wdws.isel(time_in_window_idx = season_idx)
            elif season == 'JJA':
                idx = [np.where((sfcWind_pr_wdws.time.isel(window=0).dt.month == month)) for month in [6,7,8]]
                season_idx = np.sort(np.hstack(idx)[0])
                sfcWind_pr_wdws = sfcWind_pr_wdws.isel(time_in_window_idx = season_idx)
            
            
            data_is_complete = np.isfinite(sfcWind_pr_wdws.sfcWind).all(dim='time_in_window_idx') * np.isfinite(sfcWind_pr_wdws.pr).all(dim='time_in_window_idx')
            
            #peaks above the per-window quantile, declustered
            #(alternative that was tried before and is not used: take the thresholds from the quantiles of window=1980 for all windows)
            pr_peaks_declustered = declustered_peaks(sfcWind_pr_wdws['pr'],threshold,declus_window_len,dim='time_in_window_idx')
            sfcWind_peaks_declustered = declustered_peaks(sfcWind_pr_wdws['sfcWind'],threshold,declus_window_len,dim='time_in_window_idx')
            
            #compute rank correlations with lag
            ktau_pr_cdon_sfcWind = compute_ktau_in_windows(sfcWind_peaks_declustered,
                                                         rolling_max(sfcWind_pr_wdws['pr'],max_lag*2+1,dim='time_in_window_idx'),
                                                         dim="time_in_window_idx")

            ktau_sfcWind_cdon_pr = compute_ktau_in_windows(pr_peaks_declustered,
                                                         rolling_max(sfcWind_pr_wdws['sfcWind'],max_lag*2+1,dim='time_in_window_idx'),
                                                         dim="time_in_window_idx")

            ktau_both_peaks = compute_ktau_in_windows(sfcWind_peaks_declustered,
                                                         rolling_max(pr_peaks_declustered,max_lag*2+1,dim='time_in_window_idx'),
                                                         dim="time_in_window_idx")
            
            #compute UTDC's from daily pairs
            cfg_utdc = compute_utdc_in_windows(sfcWind_pr_wdws['sfcWind'],sfcWind_pr_wdws['pr'],dim="time_in_window_idx",estimator='cfg')
            
            threshold_utdc = compute_utdc_in_windows(sfcWind_pr_wdws['sfcWind'],sfcWind_pr_wdws['pr'],dim="time_in_window_idx",estimator='threshold')
            
            dependence_mem = xr.concat([ktau_pr_cdon_sfcWind,ktau_sfcWind_cdon_pr,ktau_both_peaks],dim='extreme_variate').to_dataset(name='ktau')
            dependence_mem['utdc'] = xr.concat([cfg_utdc,threshold_utdc],dim='estimator')
            dependence_mem['complete_window'] = data_is_complete #store where windows miss data
            
            #count occurrences of peaks per calendar month (one concatenation per variable instead of growing the arrays month by month)
            co_occurring = np.isfinite((rolling_max(pr_peaks_declustered,max_lag*2+1,dim='time_in_window_idx')*sfcWind_peaks_declustered))
            
            months = np.arange(1,13)
            
            num_co_occurring_pmonth = xr.concat(
                [co_occurring.where(co_occurring.time.dt.month==month).sum(dim='time_in_window_idx') for month in months],
                dim='month')
            num_pr_peaks_pmonth = xr.concat(
                [np.isfinite(pr_peaks_declustered.where(pr_peaks_declustered.time.dt.month==month)).sum(dim='time_in_window_idx') for month in months],
                dim='month')
            num_sfcWind_peaks_pmonth = xr.concat(
                [np.isfinite(sfcWind_peaks_declustered.where(sfcWind_peaks_declustered.time.dt.month==month)).sum(dim='time_in_window_idx') for month in months],
                dim='month')
                    
            num_co_occurring_pmonth = num_co_occurring_pmonth.assign_coords({'month':months})
            num_pr_peaks_pmonth = num_pr_peaks_pmonth.assign_coords({'month':months})
            num_sfcWind_peaks_pmonth = num_sfcWind_peaks_pmonth.assign_coords({'month':months})
            
            dependence_mem['num_co_occurring'] = num_co_occurring_pmonth
            dependence_mem['num_pr_peaks'] = num_pr_peaks_pmonth
            dependence_mem['num_sfcWind_peaks'] = num_sfcWind_peaks_pmonth
            
            
            #dependence_mem['num_co_occurring'] = np.isfinite((rolling_max(pr_peaks_declustered,max_lag*2+1,dim='time_in_window_idx')*sfcWind_peaks_declustered)).sum(dim='time_in_window_idx')
            #^possibly expand this to extreme of one and say 80th percentile of other
            
            dependence_mem['sfcWind_thresholds'] = sfcWind_pr_wdws['sfcWind'].quantile(np.arange(.9,1,.01),dim='time_in_window_idx')
            dependence_mem['pr_thresholds'] = sfcWind_pr_wdws['pr'].quantile(np.arange(.9,1,.01),dim='time_in_window_idx')
            
            dependence_mem = dependence_mem.expand_dims(dim={"member_id": 1}) #add coordinates & dimensions
            dependence_mem = dependence_mem.assign_coords({'extreme_variate':['sfcWind','pr','both'],'statistic':['coef','p'],'estimator':['cfg','.95']})
            
            if m==0: #concatenate results over member_id's
                dependence = dependence_mem
            else:
                dependence = xr.concat((dependence,dependence_mem),dim='member_id')    
            
            dependence.attrs = sfcWind_pr.attrs #keep original attributes and add information on the extremes analysis
            dependence.attrs['window_length'] = str(window_len)
            dependence.attrs['declustering'] = 'Rolling window of '+str(declus_window_len)+' days'
            dependence.attrs['allowed_lag'] = str(max_lag)
            
            #store for all members of this model & experiment processed so far
            #(the file is rewritten after every member, so it always holds the members completed up to this point)
            dependence.to_netcdf(output_fn,mode='w')
            dependence.close()

Processing file: CESM2_gn_day_ssp245.zarr


0it [00:00, ?it/s]

In [6]:
# Inspect the dataset assigned in the last iteration of the model/experiment loop above
# (the name only exists after that loop cell has run).
sfcWind_pr

Unnamed: 0,Array,Chunk
Bytes,2.37 GiB,118.82 MiB
Shape,"(2, 91615, 34, 51)","(1, 91615, 34, 5)"
Dask graph,22 chunks in 2 graph layers,22 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 2.37 GiB 118.82 MiB Shape (2, 91615, 34, 51) (1, 91615, 34, 5) Dask graph 22 chunks in 2 graph layers Data type float64 numpy.ndarray",2  1  51  34  91615,

Unnamed: 0,Array,Chunk
Bytes,2.37 GiB,118.82 MiB
Shape,"(2, 91615, 34, 51)","(1, 91615, 34, 5)"
Dask graph,22 chunks in 2 graph layers,22 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.18 GiB,59.41 MiB
Shape,"(2, 91615, 34, 51)","(1, 91615, 34, 5)"
Dask graph,22 chunks in 2 graph layers,22 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 1.18 GiB 59.41 MiB Shape (2, 91615, 34, 51) (1, 91615, 34, 5) Dask graph 22 chunks in 2 graph layers Data type float32 numpy.ndarray",2  1  51  34  91615,

Unnamed: 0,Array,Chunk
Bytes,1.18 GiB,59.41 MiB
Shape,"(2, 91615, 34, 51)","(1, 91615, 34, 5)"
Dask graph,22 chunks in 2 graph layers,22 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
