Merge pull request #553 from SALib/rsa

Add Regional Sensitivity Analysis
SALib · Mar 17, 2023 · 1223b5e · 1223b5e
2 parents bbf0bfc + 353bf45
commit 1223b5e
Show file tree

Hide file tree

Showing 6 changed files with 297 additions and 8 deletions.
diff --git a/README.rst b/README.rst
@@ -49,7 +49,9 @@ Included methods
 * High-Dimensional Model Representation (HDMR)
   (`Rabitz et al. 1999 <https://doi.org/10.1016/S0010-4655(98)00152-0>`__, `Li et al. 2010 <https://doi.org/10.1021/jp9096919>`__)
 
-* PAWN (`Pianosi and Wagener 2018 <10.1016/j.envsoft.2018.07.019>`__, `Pianosi and Wagener 2015 <https://doi.org/10.1016/j.envsoft.2015.01.004>`__)
+* PAWN (`Pianosi and Wagener 2018 <https://dx.doi.org/10.1016/j.envsoft.2018.07.019>`__, `Pianosi and Wagener 2015 <https://doi.org/10.1016/j.envsoft.2015.01.004>`__)
+
+* Regional Sensitivity Analysis (based on `Saltelli et al. 2008 <https://dx.doi.org/10.1002/9780470725184>`__, `Pianosi et al., 2016 <https://dx.doi.org/10.1016/j.envsoft.2016.02.008>`__)
 
 
 **Contributing:** see `here <CONTRIBUTING.md>`__

diff --git a/docs/index.rst b/docs/index.rst
@@ -5,9 +5,13 @@ SALib - Sensitivity Analysis Library in Python
 .. image:: https://zenodo.org/badge/DOI/10.5281/zenodo.160164.svg
    :target: https://doi.org/10.5281/zenodo.160164
 
+.. image:: https://img.shields.io/badge/DOI-10.18174%2Fsesmo.18155-blue
+  :target: https://sesmo.org/article/view/18155
+
 .. image:: http://joss.theoj.org/papers/431262803744581c1d4b6a95892d3343/status.svg
   :target: http://joss.theoj.org/papers/431262803744581c1d4b6a95892d3343
 
+
 Python implementations of commonly used sensitivity analysis methods, including
 Sobol, Morris, and FAST methods. Useful in systems modeling to calculate the
 effects of model inputs or exogenous factors on outputs of interest.
@@ -31,7 +35,9 @@ Supported Methods
 * High Dimensional Model Representation
   (`Li et al. 2010 <https://pubs.acs.org/doi/pdf/10.1021/jp9096919>`_)
 * PAWN
-  (`Pianosi and Wagener 2018 <10.1016/j.envsoft.2018.07.019>`__, `Pianosi and Wagener 2015 <https://doi.org/10.1016/j.envsoft.2015.01.004>`__)
+  (`Pianosi and Wagener 2018 <https://dx.doi.org/10.1016/j.envsoft.2018.07.019>`__, `Pianosi and Wagener 2015 <https://doi.org/10.1016/j.envsoft.2015.01.004>`__)
+* * Regional Sensitivity Analysis
+  (based on `Saltelli et al. 2008 <https://dx.doi.org/10.1002/9780470725184>`__, `Pianosi et al., 2016 <https://dx.doi.org/10.1016/j.envsoft.2016.02.008>`__)
 
 
 .. toctree::

diff --git a/pyproject.toml b/pyproject.toml
@@ -34,7 +34,7 @@ dependencies = [
   "numpy>=1.20.3",
   "scipy>=1.7.3",
   "matplotlib>=3.2.2",
-  "pandas>=1.1.2",
+  "pandas>=1.2",
   "multiprocess",
 ]
 

diff --git a/src/SALib/analyze/rsa.py b/src/SALib/analyze/rsa.py
@@ -0,0 +1,281 @@
+from typing import Dict
+from types import MethodType
+import warnings
+
+import numpy as np
+import pandas as pd
+from scipy.stats import anderson_ksamp
+
+from . import common_args
+from ..util import read_param_file, ResultDict, extract_group_names, _check_groups
+
+
+def analyze(
+    problem: Dict,
+    X: np.ndarray,
+    Y: np.ndarray,
+    bins: int = 20,
+    target: str = "Y",
+    print_to_console: bool = False,
+    seed: int = None,
+):
+    """
+    Perform Regional Sensitivity Analysis (RSA), also known as Monte Carlo Filtering.
+
+    In a usual RSA, a desirable region of output space is defined. Outputs which fall
+    within this region is categorized as being "behavioral" (:math:`B`), and those
+    outside are described as being "non-behavioral" (:math:`\\bar{B}`). The input
+    factors are also partitioned into behavioral and non-behavioral subsets, such that
+    :math:`f(X_{i}|B) -> (Y|B)` and :math:`f(X_{i}|\\bar{B}) -> (Y|\\bar{B})`. The
+    distribution between the two sub-samples are compared for each factor. The greater
+    the difference between the two distributions, the more important the given factor
+    is in driving model outputs.
+
+    The approach implemented in SALib partitions factor or output space into :math:`b`
+    bins (default: 20) according to their percentile values. Output space is targeted
+    for analysis by default (`target="Y"`), such that :math:`(Y|b_{i})` is mapped back
+    to :math:`X_{i}|b_{i}`. In other words, we treat outputs falling within a given bin
+    (:math:`b_{i}`) corresponding to their inputs as behavioral, and those outside the
+    bin as non-behavioral. This aids in answering the question "Which :math:`X_{i}`
+    contributes most toward a given range of outputs?". Factor space can also be
+    assessed (`target="X"`), such that :math:`f(X_{i}|b_{i}) -> (Y|b_{i})` and
+    :math:`f(X_{i}|b_{~i}) -> (Y|b_{~i})`. This aids in answering the question "where
+    in factor space are outputs most sensitive to?"
+
+    The $k$-sample Anderson-Darling test is used to compare distributions. Results of
+    the analysis are normalized so that values will be ::math:`\\in [0, 1]`, and
+    indicate relative sensitivity across factor/output space. Larger values indicate
+    greater dissimilarity (thus, sensitivity).
+
+    Notes
+    -----
+    Compatible with:
+        all samplers
+
+    When applied to grouped factors, the analysis is conducted on each factor
+    individually, and the mean of the results for a group are reported.
+
+    Increasing the value of :math:`S` increases the granularity of the analysis
+    (across factor space), but necessitates larger sample sizes.
+
+    This analysis will produce NaNs, indicating areas of factor space that did not have
+    any samples, or for which the outputs were constant.
+
+    Parameters
+    ----------
+    problem : dict
+        The problem definition
+    X : numpy.array
+        A NumPy array containing the model inputs
+    Y : numpy.array
+        A NumPy array containing the model outputs
+    bins : int
+        The number of bins to use (default: 20)
+    target : str
+        Assess factor space ("X") or output space ("Y")
+        (default: "Y")
+    print_to_console : bool
+        Print results directly to console (default False)
+    seed : int
+        Seed value to ensure deterministic results
+        Unused, but defined to maintain compatibility.
+
+    References
+    ----------
+    1. Pianosi, F., K. Beven, J. Freer, J. W. Hall, J. Rougier, D. B. Stephenson, and
+    T. Wagener. 2016.
+    Sensitivity analysis of environmental models:
+    A systematic review with practical workflow.
+    Environmental Modelling & Software 79:214-232.
+    https://dx.doi.org/10.1016/j.envsoft.2016.02.008
+
+    2. Saltelli, A., M. Ratto, T. Andres, F. Campolongo, J. Cariboni, D. Gatelli,
+    M. Saisana, and S. Tarantola. 2008.
+    Global Sensitivity Analysis: The Primer.
+    Wiley, West Sussex, U.K.
+    https://dx.doi.org/10.1002/9780470725184
+    Accessible at:
+    http://www.andreasaltelli.eu/file/repository/Primer_Corrected_2022.pdf
+    """
+    groups = _check_groups(problem)
+    if not groups:
+        var_names = problem["names"]
+    else:
+        var_names, _ = extract_group_names(problem.get("groups", []))
+
+    results = rsa(X, Y, bins, target)
+
+    if groups:
+        groups = np.array(groups)
+        unique_grps = [*dict.fromkeys(groups)]
+        tmp = np.full((bins, len(unique_grps)), np.nan)
+
+        # Take the mean of effects from parameters that are grouped together
+        for grp_id, grp in enumerate(unique_grps):
+            tmp[:, grp_id] = np.nanmean(results[:, groups == grp], axis=1)
+
+        results = tmp
+        tmp = None
+
+    Si = ResultDict([(g, results[:, i]) for i, g in enumerate(var_names)])
+    Si["names"] = var_names
+    Si["bins"] = np.arange(0.0, 1.0, (1.0 / bins))
+    Si["target"] = target
+
+    # Attach object methods specific to this sensitivity method to ResultDict.
+    Si.to_df = MethodType(to_df, Si)
+    Si._plot = Si.plot
+    Si.plot = MethodType(plot, Si)
+
+    if print_to_console:
+        print(Si.to_df())
+
+    return Si
+
+
+def rsa(X: np.ndarray, y: np.ndarray, bins: int = 10, target="X") -> np.ndarray:
+    N, D = X.shape
+
+    # Pre-allocated arrays to store data/results
+    seq = np.append(np.arange(0.0, 1.0, (1 / bins)), 1.0)
+    X_di = np.empty(N)  # store of factor values
+    r_s = np.full((bins, D), np.nan)  # results
+
+    if target == "X":
+        t_arr = X_di  # target factor space for analysis
+        m_arr = y  # map behavioral region of factor space to output space
+    elif target == "Y":
+        t_arr = y  # target output space for analysis
+        m_arr = X_di  # map outputs back to factor space
+
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore")
+        for d_i in range(D):
+            X_di[:] = X[:, d_i]
+
+            # Assess first bin separately, making sure that the
+            # bin edges are inclusive of the lower bound
+            quants = np.quantile(t_arr, seq)
+            b = (quants[0] <= t_arr) & (t_arr <= quants[1])
+            if _has_samples(y, b):
+                r_s[0, d_i] = anderson_ksamp((m_arr[b], m_arr[~b])).statistic
+
+            # Then assess the other bins
+            for s in range(1, bins):
+                b = (quants[s] < t_arr) & (t_arr <= quants[s + 1])
+
+                if _has_samples(y, b):
+                    r_s[s, d_i] = anderson_ksamp((m_arr[b], m_arr[~b])).statistic
+
+    min_val = np.nanmin(r_s)
+    return (r_s - min_val) / (np.nanmax(r_s) - min_val)
+
+
+def _has_samples(y, sel):
+    """Check if the given region of factor space has > 0 samples.
+
+    Returns
+    -------
+    bool, true if > 0 non-unique samples are found, false otherwise.
+    """
+    return (
+        (np.count_nonzero(sel) != 0)
+        and (len(y[~sel]) != 0)
+        and np.unique(y[sel]).size > 1
+    )
+
+
+def to_df(self):
+    """Conversion to Pandas DataFrame specific to Regional Sensitivity Analysis results.
+
+    Overrides the conversion method attached to the SALib problem spec.
+
+    Returns
+    -------
+    Pandas DataFrame
+    """
+    names = self["names"]
+
+    # Only convert these elements in dict to DF
+    new_spec = {k: v for k, v in self.items() if k not in ["names", "bins", "target"]}
+
+    return pd.DataFrame(new_spec, columns=names, index=self["bins"])
+
+
+def plot(self, factors=None):
+    """Plotting for Regional Sensitivity Analysis.
+
+    Overrides the plot method attached to the SALib problem spec.
+    """
+    import matplotlib.pyplot as plt
+
+    Si_df = self.to_df()
+    fig, ax = plt.subplots()
+
+    xlabel = f"${self['target']}$ (Percentile)"
+    ylabel = "Relative $S_{i}$"
+
+    if factors is None:
+        factors = slice(None)
+
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore")
+        ax = Si_df.loc[:, factors].plot(
+            ax=ax,
+            xlabel=xlabel,
+            ylabel=ylabel,
+            subplots=True,
+            legend=False,
+            sharey=True,
+            sharex=True,
+        )
+
+    fig.legend(bbox_to_anchor=(1.1, 1.0))
+    fig.tight_layout()
+
+    return ax
+
+
+def cli_parse(parser):
+    parser.add_argument(
+        "-X", "--model-input-file", type=str, required=True, help="Model input file"
+    )
+
+    parser.add_argument(
+        "-b",
+        "--bins",
+        type=int,
+        required=False,
+        help="Number of bins to partition target space into",
+    )
+
+    parser.add_argument(
+        "-t",
+        "--target",
+        type=str,
+        required=False,
+        help="Target input space (X) or output space (Y)",
+    )
+
+    return parser
+
+
+def cli_action(args):
+    problem = read_param_file(args.paramfile)
+    X = np.loadtxt(args.model_input_file, delimiter=args.delimiter)
+    Y = np.loadtxt(
+        args.model_output_file, delimiter=args.delimiter, usecols=(args.column,)
+    )
+    analyze(
+        problem,
+        X,
+        Y,
+        bins=args.bins,
+        target=args.target,
+        print_to_console=True,
+        seed=args.seed,
+    )
+
+
+if __name__ == "__main__":
+    common_args.run_cli(cli_parse, cli_action)
diff --git a/src/SALib/util/problem.py b/src/SALib/util/problem.py
@@ -469,7 +469,7 @@ def to_df(self):
 
         raise RuntimeError("Analysis not yet conducted")
 
-    def plot(self):
+    def plot(self, **kwargs):
         """Plot results as a bar chart.
 
         Returns
@@ -481,7 +481,7 @@ def plot(self):
 
         num_rows = len(self["outputs"])
         if num_rows == 1:
-            return self._analysis.plot()
+            return self._analysis.plot(**kwargs)
 
         try:
             plt
@@ -500,7 +500,7 @@ def plot(self):
             num_rows, num_cols, sharey=True, figsize=(p_width, p_height)
         )
         for res, ax in zip(self._analysis, axes):
-            self._analysis[res].plot(ax=ax)
+            self._analysis[res].plot(ax=ax, **kwargs)
 
             try:
                 ax[0].set_title(res)

diff --git a/tests/test_problem_spec.py b/tests/test_problem_spec.py
@@ -166,10 +166,10 @@ def test_parallel_multi_output():
     )
 
     assert (
-        np.testing.assert_allclose(sp.results, psp.results, atol=1e-10) is None
+        np.testing.assert_equal(sp.results, psp.results) is None
     ), "Model results not equal!"
     assert (
-        np.testing.assert_allclose(sp.analysis, psp.analysis, atol=1e-10) is None
+        np.testing.assert_equal(sp.analysis, psp.analysis) is None
     ), "Analysis results not equal!"