In [1]:
'''
A selection of traditional statistical metrics for comparing against d-score components
'''

import numpy as np

def nse(obs, mod):
    """
    Calculate the Nash-Sutcliffe Efficiency (NSE)
    (https://www.sciencedirect.com/science/article/pii/0022169470902556?via%3Dihub)
    
    Args:
        obs: numpy array of observed values
        mod: numpy array of modeled values
    Returns:
        Nash-Sutcliffe Efficiency
    """
    return 1 - (mse(obs, mod) / np.var(obs))


def mse(obs, mod):
    """
    Calculate the mean squared error (MSE)
    
    Args:
        obs: numpy array of observed values
        mod: numpy array of modeled values
    Returns:
        mean squared error
    """
    return np.mean((obs - mod) ** 2)


def pbias(obs, mod):
    """
    Calculate the percent bias
    
    Args:
        obs: numpy array of observed values
        mod: numpy array of modeled values
    Returns:
        Percent bias
    """
    return 100 * ((np.sum(mod - obs)) / (np.sum(obs)))


def pbias_percentile(obs, model, percentile, fun):
    """
    Calculate the percent bias for a percentile bin
    
    Args:
        obs: numpy array of observed values
        mod: numpy array of modeled values
        percentile: float
        fun: comparison function (e.g., np.greater)
    Returns:
        Percent bias for bin
    """
    threshold = np.percentile(obs, q=percentile)
    i = fun(obs, threshold)
    
    return pbias(obs[i], model[i])
    


def pearson_r(obs, mod):
    """
    Calculate Pearson's r
    
    Args:
        obs: numpy array of observed values
        mod: numpy array of modeled values
    Returns:
        Pearson's r
    """
    #return np.cov(mod, obs) / np.sqrt( np.var(mod) * np.var(obs))
    return np.corrcoef(mod, obs)[0,1]


def spearman_r(obs, mod):
    """
    Calculate Spearman's r
    
    Args:
        obs: numpy array of observed values
        mod: numpy array of modeled values
    Returns:
        Spearman's r
    """
    return pearson_r(np.argsort(mod), np.argsort(obs))


def kge(obs, mod):
    """
    Calculate the Kling-Gupta Efficiency (KGE)
    (https://www.sciencedirect.com/science/article/pii/S0022169409004843)
    
    Args:
        obs: numpy array of observed values
        mod: numpy array of modeled values
    Returns:
        Kling-Gupta Efficiency
    """
    #d_obs = obs - np.mean(obs)
    #d_mod = mod - np.mean(mod)
    #r = np.sum(d_obs * d_mod) / np.sqrt(np.sum(d_mod ** 2) * np.sum(d_obs ** 2))
    r = pearson_r(obs, mod)
    #alpha = np.std(mod) / np.std(obs)
    alpha = sd_ratio(obs, mod)
    beta = np.sum(mod) / np.sum(obs)

    ED = np.sqrt((r - 1) ** 2 + (alpha - 1) ** 2 + (beta - 1) ** 2)
    return 1 - ED

def sd_ratio(obs, mod):
    """
    Calculate the standard deviation ratio of the model predictions and observations
    
    Args:
        obs: numpy array of observed values
        mod: numpy array of modeled values
    Returns:
        Standard deviation ratio   
    """
    return np.std(mod) / np.std(obs)


In [2]:
# Start cluster

In [3]:

from dask_jobqueue import SLURMCluster
from dask.distributed import Client, progress

## Denali setup
#cluster = SLURMCluster() #TOH: my config defaults to Denali, but this won't work for other users

# Tallgrass setup
cluster = SLURMCluster(queue='cpu', cores=1, interface='ib0',
                       job_extra=['--nodes=1', '--ntasks-per-node=1', '--cpus-per-task=1'],
                       scheduler_options={'dashboard_address':36999},
                       #cores=1, extra=['--resources processes=1']
                       memory='6GB')

## PC setup
#import os
#n_cores = os.cpu_count() # set to match your machine
#client = Client(threads_per_worker=1, n_workers=n_cores - 1)

client = Client(cluster)
client

  from distributed.utils import tmpfile


0,1
Connection method: Cluster object,Cluster type: dask_jobqueue.SLURMCluster
Dashboard: http://172.22.0.2:36999/status,

0,1
Dashboard: http://172.22.0.2:36999/status,Workers: 0
Total threads: 0,Total memory: 0 B

0,1
Comm: tcp://172.22.0.2:45151,Workers: 0
Dashboard: http://172.22.0.2:36999/status,Total threads: 0
Started: Just now,Total memory: 0 B


In [4]:
cluster.adapt(maximum_jobs=100)

<distributed.deploy.adaptive.Adaptive at 0x2aaab634baf0>

In [5]:

#client.close()

# Intake catalog
We use an Intake catalog to help manage the various datasets that might be used in an evaluation.

In [6]:
import intake 
url = 'https://raw.githubusercontent.com/nhm-usgs/data-pipeline-helpers/main/hytest/hytest_intake_catalog.yml'
cat = intake.open_catalog(url)
print(list(cat))

['conus404-40year-onprem', 'conus404-2017-onprem', 'conus404-2017-cloud', 'nwis-streamflow-usgs-gages-onprem', 'nwis-streamflow-usgs-gages-cloud', 'nwm21-streamflow-usgs-gages-onprem', 'nwm21-streamflow-usgs-gages-cloud', 'nwm21-streamflow-cloud']


In [7]:
# read in intake
obs_ds = cat['nwis-streamflow-usgs-gages-onprem'].to_dask()
model_ds = cat['nwm21-streamflow-usgs-gages-onprem'].to_dask()


obs = obs_ds['streamflow']
mod = model_ds['streamflow'].astype('float32')

obs.name = 'observed'
mod.name = 'predicted'

In [8]:
#obs_ds['streamflow'].resample({'time':'1D'}, loffset='5h').mean()
obs_ds['streamflow']

Unnamed: 0,Array,Chunk
Bytes,466.87 MiB,59.80 kiB
Shape,"(15310, 7994)","(15310, 1)"
Count,7995 Tasks,7994 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 466.87 MiB 59.80 kiB Shape (15310, 7994) (15310, 1) Count 7995 Tasks 7994 Chunks Type float32 numpy.ndarray",7994  15310,

Unnamed: 0,Array,Chunk
Bytes,466.87 MiB,59.80 kiB
Shape,"(15310, 7994)","(15310, 1)"
Count,7995 Tasks,7994 Chunks
Type,float32,numpy.ndarray


In [9]:
# selecting a single gage is fast
gage_id = 'USGS-01030350'
x = obs.sel(gage_id=gage_id)


In [10]:
import pandas as pd
import numpy as np

def compute_metrics(gage_id, obs, model):
    # select the data for the given gage_id
    # TODO the selection may be distributed, but can we force it onto a single node? Maybe by allocating 2 cores?
    obs = obs.sel(gage_id=gage_id).to_series()
    model = model.sel(gage_id=gage_id).to_series().resample('1D', offset='5h').mean() # Resampling could be done in preanalysis
    # make sure the indices match
    obs.index = obs.index.to_period('D')
    model.index = model.index.to_period('D')


    # merge obs and predictions and drop nans.
    df = pd.merge(obs, model, left_index=True, right_index=True).dropna(how='any')
    obs = df['observed']
    model = df['predicted']
    
    # compute log flow for use in log NSE
    threshold = 0.01
    log_obs = np.log(obs.where(obs > threshold, threshold))
    log_model = np.log(model.where(model > threshold, threshold))
    
    scores = pd.Series(dtype='float')
    scores['nse'] = nse(obs, model)
    scores['log_nse'] = nse(log_obs, log_model)
    scores['kge'] = kge(obs, model)
    
    scores['pbias'] = pbias(obs, model)
    scores['pearson_r'] = pearson_r(obs, model)
    scores['spearman_r'] = spearman_r(obs, model)
    scores['sd_ratio'] = sd_ratio(obs, model)
    
    # compute high flow and low flow bias
    high_percentile = 98
    low_percentile = 30
    
    scores['pbias_q' + str(high_percentile)] = pbias_percentile(obs, model, high_percentile, np.greater)
    scores['pbias_q' + str(low_percentile)] = pbias_percentile(obs, model, high_percentile, np.less_equal)
    scores.name = gage_id
    return scores

In [11]:
# TODO why so slow to resample?

In [12]:
%%time
# to_series() is slow, but much faster than resampling the xarray
model = mod.sel(gage_id=gage_id).to_series()
#model.resample('1D', offset='5h').mean() # Resampling could be done in preanalysis

CPU times: user 203 ms, sys: 173 ms, total: 376 ms
Wall time: 3.5 s


In [13]:
%%time
# slow to resample xarray; note chunking by day
mod.sel(gage_id=gage_id).resample({'time':'1D'}, loffset='5h').mean()

CPU times: user 15.6 s, sys: 445 ms, total: 16.1 s
Wall time: 15.9 s


Unnamed: 0,Array,Chunk
Bytes,59.80 kiB,4 B
Shape,"(15310,)","(1,)"
Count,77230 Tasks,15310 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 59.80 kiB 4 B Shape (15310,) (1,) Count 77230 Tasks 15310 Chunks Type float32 numpy.ndarray",15310  1,

Unnamed: 0,Array,Chunk
Bytes,59.80 kiB,4 B
Shape,"(15310,)","(1,)"
Count,77230 Tasks,15310 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,4 B,4.0 B
Shape,(),()
Count,4 Tasks,1 Chunks
Type,float32,numpy.ndarray
Array Chunk Bytes 4 B 4.0 B Shape () () Count 4 Tasks 1 Chunks Type float32 numpy.ndarray,,

Unnamed: 0,Array,Chunk
Bytes,4 B,4.0 B
Shape,(),()
Count,4 Tasks,1 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,4 B,4.0 B
Shape,(),()
Count,4 Tasks,1 Chunks
Type,float32,numpy.ndarray
Array Chunk Bytes 4 B 4.0 B Shape () () Count 4 Tasks 1 Chunks Type float32 numpy.ndarray,,

Unnamed: 0,Array,Chunk
Bytes,4 B,4.0 B
Shape,(),()
Count,4 Tasks,1 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,4 B,4.0 B
Shape,(),()
Count,4 Tasks,1 Chunks
Type,float32,numpy.ndarray
Array Chunk Bytes 4 B 4.0 B Shape () () Count 4 Tasks 1 Chunks Type float32 numpy.ndarray,,

Unnamed: 0,Array,Chunk
Bytes,4 B,4.0 B
Shape,(),()
Count,4 Tasks,1 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,4 B,4.0 B
Shape,(),()
Count,4 Tasks,1 Chunks
Type,int32,numpy.ndarray
Array Chunk Bytes 4 B 4.0 B Shape () () Count 4 Tasks 1 Chunks Type int32 numpy.ndarray,,

Unnamed: 0,Array,Chunk
Bytes,4 B,4.0 B
Shape,(),()
Count,4 Tasks,1 Chunks
Type,int32,numpy.ndarray


In [56]:
#TODO can we do the merging in xarray? No
# selecting a single gage is fast
import xarray as xr

gage_id = 'USGS-01030350'
x = obs.sel(gage_id=gage_id)

y = mod.sel(gage_id=gage_id).resample({'time':'1D'}, loffset='5h').mean()

# gives wrong number of values (for dataframe had to reindex both using to_period
test = xr.merge([x,y], join='inner') 
test.dropna(dim='time')

Unnamed: 0,Array,Chunk
Bytes,4 B,4.0 B
Shape,(),()
Count,4 Tasks,1 Chunks
Type,float32,numpy.ndarray
Array Chunk Bytes 4 B 4.0 B Shape () () Count 4 Tasks 1 Chunks Type float32 numpy.ndarray,,

Unnamed: 0,Array,Chunk
Bytes,4 B,4.0 B
Shape,(),()
Count,4 Tasks,1 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,4 B,4.0 B
Shape,(),()
Count,4 Tasks,1 Chunks
Type,float32,numpy.ndarray
Array Chunk Bytes 4 B 4.0 B Shape () () Count 4 Tasks 1 Chunks Type float32 numpy.ndarray,,

Unnamed: 0,Array,Chunk
Bytes,4 B,4.0 B
Shape,(),()
Count,4 Tasks,1 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,4 B,4.0 B
Shape,(),()
Count,4 Tasks,1 Chunks
Type,float32,numpy.ndarray
Array Chunk Bytes 4 B 4.0 B Shape () () Count 4 Tasks 1 Chunks Type float32 numpy.ndarray,,

Unnamed: 0,Array,Chunk
Bytes,4 B,4.0 B
Shape,(),()
Count,4 Tasks,1 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,4 B,4.0 B
Shape,(),()
Count,4 Tasks,1 Chunks
Type,int32,numpy.ndarray
Array Chunk Bytes 4 B 4.0 B Shape () () Count 4 Tasks 1 Chunks Type int32 numpy.ndarray,,

Unnamed: 0,Array,Chunk
Bytes,4 B,4.0 B
Shape,(),()
Count,4 Tasks,1 Chunks
Type,int32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,6.20 kiB,6.20 kiB
Shape,"(1586,)","(1586,)"
Count,7998 Tasks,1 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 6.20 kiB 6.20 kiB Shape (1586,) (1586,) Count 7998 Tasks 1 Chunks Type float32 numpy.ndarray",1586  1,

Unnamed: 0,Array,Chunk
Bytes,6.20 kiB,6.20 kiB
Shape,"(1586,)","(1586,)"
Count,7998 Tasks,1 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,6.20 kiB,4 B
Shape,"(1586,)","(1,)"
Count,85180 Tasks,1586 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 6.20 kiB 4 B Shape (1586,) (1,) Count 85180 Tasks 1586 Chunks Type float32 numpy.ndarray",1586  1,

Unnamed: 0,Array,Chunk
Bytes,6.20 kiB,4 B
Shape,"(1586,)","(1,)"
Count,85180 Tasks,1586 Chunks
Type,float32,numpy.ndarray


In [57]:
%%time
# run for a single site using 1 core
gage_id = 'USGS-01030350'
compute_metrics(gage_id, obs, mod)

CPU times: user 175 ms, sys: 45.5 ms, total: 221 ms
Wall time: 4 s


nse            0.610186
log_nse        0.437533
kge            0.581806
pbias        -12.679163
pearson_r      0.799410
spearman_r    -0.003219
sd_ratio       0.655655
pbias_q98    -43.865916
pbias_q30     -7.173589
Name: USGS-01030350, dtype: float64

In [12]:
# distributed not working yet

In [58]:
%%time
import dask.bag as db
b = db.from_sequence(obs.gage_id[0:3].to_numpy().tolist())#, npartitions=100)

b = b.map(compute_metrics, obs=obs, model=mod)
#results = pd.DataFrame().append(b.compute())
#results.index.name = 'site_no'

CPU times: user 27.1 ms, sys: 4.9 ms, total: 32 ms
Wall time: 29.1 ms


In [60]:
b.visualize()

KeyboardInterrupt: 