metrics: add CRPSS (#605)
* metrics: add CRPSS

Add a skill score version of the CRPS metric, which shows the relative
improvement (in terms of CRPS) of a forecast over a reference forecast
(see the numeric sketch after these notes).

* metrics: add test for negative CRPSS value

Add a test for the situation where the CRPS of the forecast is worse than
that of the reference forecast, making the CRPSS negative.

* docs/api: list CRPSS metric (for prob fx)

* docs/whatsnew/1.0.0rc4: add CRPS skill score mention

* tests: flake8 corrections

* tests/calculator: handle CRPSS requiring ref dist fx

Revisions to handle CRPSS requiring reference forecasts that are full
distribution forecasts (not just single probability forecasts). Still a
work in progress, but these changes move things forward.

* CRPS: add checks to ensure forecasts have proper dimensions

Add checks to ensure that the forecasts given to the CRPS function have
the correct dimensions. If there is an issue, raise a ValueError.

* CRPS: more revisions to improve robustness

Revisions to further improve robustness of inputs to CRPS and CRPSS.
Particularly for the reference forecasts.

* calculator tests: revise for CRPSS changes

* CRPSS: modify to avoid divide by zero

Modify the CRPSS calculation to avoid divide by zero (same pattern as in
`deterministic.forecast_skill`).
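
As a numeric sketch of the behaviors described in these notes (illustrative
values only; the guard mirrors the function body added in the diff below):

    import numpy as np

    def crpss_sketch(crps_fx, crps_ref):
        # Same guard structure as crps_skill_score below (minus input checks).
        if crps_fx == crps_ref:
            return 0.0
        elif crps_ref == 0.0:
            return -np.inf  # perfect reference; avoid dividing by zero
        return 1 - crps_fx / crps_ref

    print(crpss_sketch(10.0, 20.0))  # 0.5: forecast halves the reference CRPS
    print(crpss_sketch(20.0, 10.0))  # -1.0: forecast worse than the reference
    print(crpss_sketch(5.0, 0.0))    # -inf: reference is perfect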
dplarson committed Oct 29, 2020
1 parent c943b9b commit e12a37b
Showing 5 changed files with 159 additions and 19 deletions.
1 change: 1 addition & 0 deletions docs/source/api.rst
@@ -580,6 +580,7 @@ Functions to compute forecast probabilistic performance metrics:
metrics.probabilistic.uncertainty
metrics.probabilistic.sharpness
metrics.probabilistic.continuous_ranked_probability_score
metrics.probabilistic.crps_skill_score

Event
-----
2 changes: 2 additions & 0 deletions docs/source/whatsnew/1.0.0rc4.rst
@@ -36,6 +36,8 @@ API Changes
from the BSRN network (:issue:`541`) (:pull:`604`)
* Removed ``preprocessing.VALIDATION_RESULT_TOTAL_STRING`` and
``preprocessing.UNDEFINED_DATA_STRING``.
* Added CRPS skill score metric :py:func:`metrics.probabilistic.crps_skill_score`
(:issue:`494`) (:pull:`605`)


Enhancements
73 changes: 70 additions & 3 deletions solarforecastarbiter/metrics/probabilistic.py
@@ -494,6 +494,14 @@ def continuous_ranked_probability_score(obs, fx, fx_prob):
crps : float
The Continuous Ranked Probability Score [unitless].
Raises
------
ValueError
If the forecasts have incorrect dimensions: either a) the forecasts are
for a single sample (n=1) with d CDF intervals but are given as a 1D
array with d values or b) the forecasts are given as 2D arrays (n,d)
but do not contain at least 2 CDF intervals (i.e. d < 2).
Examples
--------
@@ -514,7 +522,14 @@ def continuous_ranked_probability_score(obs, fx, fx_prob):
"""

# match observations to fx shape: (n,) => (n, d)
obs = np.tile(obs, (fx.shape[1], 1)).T
if np.ndim(fx) < 2:
raise ValueError("forecasts must be 2D arrays (expected (n,d), got"
f"{np.shape(fx)})")
elif np.shape(fx)[1] < 2:
raise ValueError("forecasts must have d >= 2 CDF intervals "
f"(expected >= 2, got {np.shape(fx)[1]})")
else:
obs = np.tile(obs, (fx.shape[1], 1)).T

# event: 0=did not happen, 1=did happen
o = np.where(obs <= fx, 1.0, 0.0)
@@ -529,6 +544,57 @@ def continuous_ranked_probability_score(obs, fx, fx_prob):
return crps
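
To make the new shape checks and the tiling step concrete, a small sketch
with hypothetical values:

    import numpy as np

    fx = np.array([[10, 20, 30], [10, 20, 30]])  # (n, d) = (2, 3): accepted
    # np.array([10, 20, 30])  is (3,): 1D input, raises ValueError
    # np.array([[10], [20]])  is (2, 1): d < 2, raises ValueError

    # The tiling above broadcasts obs from (n,) to (n, d) for comparison:
    obs = np.array([8, 9])
    obs_2d = np.tile(obs, (fx.shape[1], 1)).T
    # array([[8, 8, 8],
    #        [9, 9, 9]])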


def crps_skill_score(obs, fx, fx_prob, ref, ref_prob):
"""CRPS skill score.
CRPSS = 1 - CRPS_fx / CRPS_ref
where CRPS_fx is the CRPS of the evaluated forecast and CRPS_ref is the
CRPS of a reference forecast.
Parameters
----------
obs : (n,) array_like
Observations (physical unit).
fx : (n, d) array_like
Forecasts (physical units) of the right-hand-side of a CDF with d
intervals (d >= 2), e.g., fx = [10 MW, 20 MW, 30 MW] is interpreted as
<= 10 MW, <= 20 MW, <= 30 MW.
fx_prob : (n, d) array_like
Probability [%] associated with the forecasts.
ref : (n, d) array_like
Reference forecasts (physical units) of the right-hand-side of a CDF
with d intervals (d >= 2), e.g., ref = [10 MW, 20 MW, 30 MW] is
interpreted as <= 10 MW, <= 20 MW, <= 30 MW.
ref_prob : (n, d) array_like
Probability [%] associated with the reference forecast.
Returns
-------
skill : float
The CRPS skill score [unitless].
See Also
--------
:py:func:`solarforecastarbiter.metrics.probabilistic.continuous_ranked_probability_score`
"""

if np.isscalar(ref):
return np.nan
else:
crps_fx = continuous_ranked_probability_score(obs, fx, fx_prob)
crps_ref = continuous_ranked_probability_score(obs, ref, ref_prob)

if crps_fx == crps_ref:
return 0.0
elif crps_ref == 0.0:
# avoid divide by zero
return np.NINF
else:
return 1 - crps_fx / crps_ref


# Add new metrics to this map to map shorthand to function
_MAP = {
'bs': (brier_score, 'BS'),
@@ -540,18 +606,19 @@ def continuous_ranked_probability_score(obs, fx, fx_prob):
'qss': (quantile_skill_score, 'QSS'),
# 'sh': (sharpness, 'SH'), # TODO
'crps': (continuous_ranked_probability_score, 'CRPS'),
'crpss': (crps_skill_score, 'CRPSS'),
}

__all__ = [m[0].__name__ for m in _MAP.values()]

# Functions that require a reference forecast
_REQ_REF_FX = ['bss', 'qss']
_REQ_REF_FX = ['bss', 'qss', 'crpss']

# Functions that require normalized factor
_REQ_NORM = []

# Functions that require full distribution forecasts (as 2dim)
_REQ_DIST = ['crps']
_REQ_DIST = ['crps', 'crpss']

# TODO: Functions that require two forecasts (e.g., sharpness)
# _REQ_FX_FX = ['sh']
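
Since 'crpss' appears in both _REQ_REF_FX and _REQ_DIST, the calculator must
supply a reference forecast that is itself a full-distribution forecast. A
minimal sketch of that intersection (illustrative; not necessarily the
calculator's exact logic):

    _REQ_REF_FX = ['bss', 'qss', 'crpss']  # metrics needing a reference forecast
    _REQ_DIST = ['crps', 'crpss']          # metrics needing 2D distribution forecasts
    print(set(_REQ_REF_FX) & set(_REQ_DIST))  # {'crpss'}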
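
A usage sketch of the new function, built from the first parametrized CRPSS
test case in the test diff below (the signature and import path come from
this commit):

    import numpy as np
    from solarforecastarbiter.metrics import probabilistic as prob

    obs = np.array([8, 8])                              # (n,) observations
    fx = np.array([[10, 20, 30], [10, 20, 30]])         # (n, d) CDF values
    fx_prob = np.array([[0, 100, 100], [0, 100, 100]])  # probabilities [%]
    ref = np.array([[10, 20, 30], [10, 20, 30]])
    ref_prob = np.array([[0, 0, 100], [0, 0, 100]])

    skill = prob.crps_skill_score(obs, fx, fx_prob, ref, ref_prob)
    print(skill)  # 0.5, i.e. 1 - 10/20 per the test's expected value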
28 changes: 15 additions & 13 deletions solarforecastarbiter/metrics/tests/test_calculator.py
@@ -303,10 +303,9 @@ def test_calculate_metrics_with_probablistic(single_observation,
PROB_NO_REF)
assert len(result) == 1
assert isinstance(result[0], datamodel.MetricResult)
verify_metric_result(result[0],
proc_prfx_obs,
LIST_OF_CATEGORIES,
probabilistic._REQ_DIST)
verify_metric_result(
result[0], proc_prfx_obs, LIST_OF_CATEGORIES,
set(probabilistic._REQ_DIST) - set(probabilistic._REQ_REF_FX))

# With reference
ref_fx_values = fx_values + .5
@@ -336,11 +335,11 @@ def test_calculate_metrics_with_probablistic(single_observation,

expected = {
0: ('total', 'crps', '0', 17.247),
1: ('year', 'crps', '2019', 17.247),
2: ('season', 'crps', 'JJA', 17.247),
3: ('month', 'crps', 'Aug', 17.247),
4: ('hour', 'crps', '0', 19.801000000000002),
5: ('hour', 'crps', '1', 19.405)
2: ('year', 'crps', '2019', 17.247),
4: ('season', 'crps', 'JJA', 17.247),
6: ('month', 'crps', 'Aug', 17.247),
8: ('hour', 'crps', '0', 19.801000000000002),
9: ('hour', 'crps', '1', 19.405)
}
attr_order = ('category', 'metric', 'index', 'value')
for k, expected_attrs in expected.items():
@@ -384,9 +383,10 @@ def test_calculate_metrics_with_probablistic(single_observation,

# Distribution and constant values
all_results = calculator.calculate_metrics(
[proc_prfx_obs] + cv_proc_ref_prfx_obs, LIST_OF_CATEGORIES,
[proc_ref_prfx_obs] + cv_proc_ref_prfx_obs, LIST_OF_CATEGORIES,
# reverse to ensure order output is independent
PROBABILISTIC_METRICS[::-1])

assert all_results[0] == dist_results[0]
assert len(all_results[1:]) == len(cv_results)
for a, b in zip(all_results[1:], cv_results):
@@ -589,7 +589,8 @@ def test_calculate_probabilistic_metrics_missing_ref(
pd.DataFrame(np.random.randn(10, 3), index=create_dt_index(10)),
pd.Series(np.random.randn(10), index=create_dt_index(10)))
result = calculator.calculate_probabilistic_metrics(
proc_fxobs, LIST_OF_CATEGORIES, probabilistic._REQ_REF_FX
proc_fxobs, LIST_OF_CATEGORIES,
set(probabilistic._REQ_REF_FX) - set(probabilistic._REQ_DIST)
)
assert isinstance(result, datamodel.MetricResult)
assert result.values == ()
@@ -606,7 +607,8 @@ def test_calculate_probabilistic_metrics_no_reference_data(
pd.Series(np.random.randn(10), index=create_dt_index(10)),
ref_values=None)
result = calculator.calculate_probabilistic_metrics(
proc_fxobs, LIST_OF_CATEGORIES, probabilistic._REQ_REF_FX
proc_fxobs, LIST_OF_CATEGORIES,
set(probabilistic._REQ_REF_FX) - set(probabilistic._REQ_DIST)
)
assert isinstance(result, datamodel.MetricResult)
assert result.values == ()
@@ -955,7 +957,7 @@ def test_apply_deterministic_bad_metric_func():
('unc', [1, 1, 1], [100, 100, 100], [1, 1, 1], None, None, 0.),
# CRPS single forecast
('crps', [[1]], [[100]], [[0]], None, None, 0.),
('crps', [[1, 1]], [[100, 100]], [[0, 0]], None, None, 0.),
# CRPS multiple forecasts
('crps', [[1, 1, 1], [2, 2, 2], [3, 3, 3]],
[[100, 100, 100], [100, 100, 100], [100, 100, 100]],
74 changes: 71 additions & 3 deletions solarforecastarbiter/metrics/tests/test_probabilistic.py
@@ -218,10 +218,10 @@ def test_sharpness(fx_lower, fx_upper, value):


@pytest.mark.parametrize("fx,fx_prob,obs,value", [
# 1 sample, 1 CDF interval
# 1 sample, 2 CDF intervals
(
np.array([[10]]),
np.array([[100]]),
np.array([[10, 20]]),
np.array([[100, 100]]),
np.array([8]),
0.0,
),
@@ -283,7 +283,75 @@ def test_sharpness(fx_lower, fx_upper, value):
np.array([8, 8]),
(1.0 ** 2) * 10 + (0.5 ** 2) * 10,
),
# fail: only 1 CDF interval
pytest.param(
np.array([[10], [20]]),
np.array([[100], [100]]),
np.array([8, 8]),
0.0,
marks=pytest.mark.xfail(strict=True, raises=ValueError),
),
# fail: forecast as 1D array (instead of 2D)
pytest.param(
np.array([10, 20]),
np.array([100, 100]),
np.array([8, 8]),
0.0,
marks=pytest.mark.xfail(strict=True, raises=ValueError),
),
])
def test_crps(fx, fx_prob, obs, value):
crps = prob.continuous_ranked_probability_score(obs, fx, fx_prob)
assert crps == value


@pytest.mark.parametrize("fx,fx_prob,ref,ref_prob,obs,value", [
# 2 samples, 3 CDF intervals
(
np.array([[10, 20, 30], [10, 20, 30]]), # fx
np.array([[0, 100, 100], [0, 100, 100]]), # fx_prob
np.array([[10, 20, 30], [10, 20, 30]]), # ref
np.array([[0, 0, 100], [0, 0, 100]]), # ref_prob
np.array([8, 8]), # obs
1 - 10 / 20,
),
(
np.array([[10, 20, 30], [10, 20, 30]]), # fx
np.array([[0, 0, 100], [0, 0, 100]]), # fx_prob
np.array([[10, 20, 30], [10, 20, 30]]), # ref
np.array([[0, 100, 100], [0, 100, 100]]), # ref_prob
np.array([8, 8]), # obs
1 - 20 / 10,
),
(
np.array([[10, 20, 30], [10, 20, 30]]), # fx
np.array([[0, 100, 100], [0, 100, 100]]), # fx_prob
np.array([[10, 20, 30], [10, 20, 30]]), # ref
np.array([[0, 100, 100], [0, 100, 100]]), # ref_prob
np.array([8, 8]), # obs
0.0
),
# 2 samples, 2 CDF intervals
(
np.array([[10, 20], [10, 20]]), # fx
np.array([[0, 100], [0, 100]]), # fx_prob
np.array([[10, 20], [10, 20]]), # ref
np.array([[100, 100], [100, 100]]), # ref_prob
np.array([8, 8]), # obs
np.NINF,
),
])
def test_crps_skill_score(obs, fx, fx_prob, ref, ref_prob, value):
crpss = prob.crps_skill_score(obs, fx, fx_prob, ref, ref_prob)
assert crpss == value
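
The first two expected values above reduce to simple arithmetic on the
per-case CRPS totals (an illustrative check, not part of the diff):

    assert 1 - 10 / 20 == 0.5    # forecast better than reference
    assert 1 - 20 / 10 == -1.0   # forecast worse than reference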
