From 0f73abd12a2c819c23558935a59a185e5f7beccc Mon Sep 17 00:00:00 2001 From: Sebastian Gsell Date: Tue, 2 May 2023 17:10:14 +0200 Subject: [PATCH] Address comments and speed up tests --- .../benchmarking/benchmark_reports.py | 64 ++++--- src/estimagic/visualization/profile_plot.py | 4 +- tests/benchmarking/test_benchmark_reports.py | 166 +++++++++++++----- tests/visualization/test_profile_plot.py | 10 +- 4 files changed, 164 insertions(+), 80 deletions(-) diff --git a/src/estimagic/benchmarking/benchmark_reports.py b/src/estimagic/benchmarking/benchmark_reports.py index fed8bf81d..dfedd23f1 100644 --- a/src/estimagic/benchmarking/benchmark_reports.py +++ b/src/estimagic/benchmarking/benchmark_reports.py @@ -3,6 +3,8 @@ process_benchmark_results, ) +from estimagic.visualization.profile_plot import create_solution_times + def convergence_report( problems, results, *, stopping_criterion="y", x_precision=1e-4, y_precision=1e-4 @@ -31,7 +33,7 @@ def convergence_report( Returns: pandas.DataFrame: columns are the algorithms and the dimensionality of the benchmark problems, indexes are the problems. For the algorithms columns, - the values are strings that are either "success" "failed", or "error". + the values are strings that are either "success", "failed", or "error". For the dimensionality column, the values denote the number of dimensions of the problem. @@ -44,11 +46,7 @@ def convergence_report( y_precision=y_precision, ) - convergence_report = converged_info.replace({True: "success", False: "failed"}) - - for key, value in results.items(): - if isinstance(value["solution"], str): - convergence_report.at[key] = "error" + convergence_report = _get_success_info(results, converged_info) dim = {problem: len(problems[problem]["inputs"]["params"]) for problem in problems} convergence_report["dimensionality"] = convergence_report.index.map(dim) @@ -61,7 +59,6 @@ def rank_report( results, *, runtime_measure="n_evaluations", - normalize_runtime=False, stopping_criterion="y", x_precision=1e-4, y_precision=1e-4, @@ -80,10 +77,6 @@ def rank_report( This is the runtime until the desired convergence was reached by an algorithm. This is called performance measure by Moré and Wild (2009). Default is "n_evaluations". - normalize_runtime (bool): If True the runtime each algorithm needed for each - problem is scaled by the time the fastest algorithm needed. If True, the - resulting plot is what Moré and Wild (2009) called data profiles. - Default is False. stopping_criterion (str): one of "x_and_y", "x_or_y", "x", "y". Determines how convergence is determined from the two precisions. 
x_precision (float or None): how close an algorithm must have gotten to the @@ -111,22 +104,15 @@ def rank_report( x_precision=x_precision, y_precision=y_precision, ) - scenarios = list({algo[1] for algo in results.keys()}) - success_info = converged_info.replace({True: "success", False: "failed"}) - for key, value in results.items(): - if isinstance(value["solution"], str): - success_info.at[key] = "error" - - solution_times = histories.groupby(["problem", "algorithm"])[runtime_measure].max() + solution_times = create_solution_times(histories, runtime_measure, converged_info) + solution_times = solution_times.stack().reset_index() + solution_times = solution_times.rename( + columns={solution_times.columns[2]: runtime_measure} + ) - if normalize_runtime: - solution_times = solution_times.unstack() - solution_times = solution_times.divide(solution_times.min(axis=1), axis=0) - solution_times = solution_times.stack(dropna=False) - solution_times.name = runtime_measure + success_info = _get_success_info(results, converged_info) - solution_times = solution_times.reset_index() solution_times["rank"] = ( solution_times.groupby("problem")[runtime_measure].rank( method="dense", ascending=True @@ -135,8 +121,8 @@ def rank_report( ).astype("Int64") df_wide = solution_times.pivot(index="problem", columns="algorithm", values="rank") - rank_report = df_wide.astype(str)[scenarios] - rank_report[~converged_info] = success_info[scenarios] + rank_report = df_wide.astype(str) + rank_report[~converged_info] = success_info return rank_report @@ -169,3 +155,29 @@ def traceback_report(results): traceback_report = pd.DataFrame.from_dict(tracebacks, orient="columns") return traceback_report + + +def _get_success_info(results, converged_info): + """Create a DataFrame with information on whether an algorithm succeeded or not. + + Args: + results (dict): estimagic benchmarking results dictionary. Keys are + tuples of the form (problem, algorithm), values are dictionaries of the + collected information on the benchmark run, including 'criterion_history' + and 'time_history'. + converged_info (pandas.DataFrame): columns are the algorithms, indexes are the + problems. The values are boolean and True when the algorithm arrived at + the solution with the desired precision. + + Returns: + pandas.DataFrame: columns are the algorithms, indexes are the problems. + values are strings that are either "success", "failed", or "error". + + """ + success_info = converged_info.replace({True: "success", False: "failed"}) + + for key, value in results.items(): + if isinstance(value["solution"], str): + success_info.at[key] = "error" + + return success_info diff --git a/src/estimagic/visualization/profile_plot.py b/src/estimagic/visualization/profile_plot.py index a2081260b..7c96b13d0 100644 --- a/src/estimagic/visualization/profile_plot.py +++ b/src/estimagic/visualization/profile_plot.py @@ -86,7 +86,7 @@ def profile_plot( y_precision=y_precision, ) - solution_times = _create_solution_times( + solution_times = create_solution_times( df, runtime_measure=runtime_measure, converged_info=converged_info, @@ -139,7 +139,7 @@ def profile_plot( return fig -def _create_solution_times(df, runtime_measure, converged_info): +def create_solution_times(df, runtime_measure, converged_info): """Find the solution time for each algorithm and problem. 
Args: diff --git a/tests/benchmarking/test_benchmark_reports.py b/tests/benchmarking/test_benchmark_reports.py index aa5ca6df9..67a1229ef 100644 --- a/tests/benchmarking/test_benchmark_reports.py +++ b/tests/benchmarking/test_benchmark_reports.py @@ -1,31 +1,131 @@ import pytest from itertools import product +import numpy as np -from estimagic import get_benchmark_problems -from estimagic.benchmarking.run_benchmark import run_benchmark from estimagic.benchmarking.benchmark_reports import ( convergence_report, rank_report, traceback_report, ) +from estimagic import OptimizeResult @pytest.fixture def benchmark_example(): - problems = get_benchmark_problems("example") - stop_after_10 = { + _stop_after_10 = { "stopping_max_criterion_evaluations": 10, "stopping_max_iterations": 10, } optimizers = { - "lbfgsb": {"algorithm": "scipy_lbfgsb", "algo_options": stop_after_10}, - "nm": {"algorithm": "scipy_neldermead", "algo_options": stop_after_10}, + "lbfgsb": {"algorithm": "scipy_lbfgsb", "algo_options": _stop_after_10}, + "nm": {"algorithm": "scipy_neldermead", "algo_options": _stop_after_10}, + } + + problems = { + "bard_good_start": { + "inputs": {"params": np.array([1, 1, 1])}, + "solution": { + "params": np.array([0.08241056, 1.13303608, 2.34369519]), + "value": 0.00821487730657897, + }, + "noisy": False, + "info": {}, + "start_criterion": 41.6817, + }, + "box_3d": { + "inputs": {"params": np.array([0, 10, 20])}, + "solution": {"params": np.array([1, 10, 1]), "value": 0}, + "noisy": False, + "info": {}, + "start_criterion": 1031.154, + }, + } + + results = { + ("bard_good_start", "lbfgsb"): { + "params_history": [ + [1.0, 1.0, 1.0], + [0.48286315298120086, 1.6129119244711858, 1.5974181569859445], + [0.09754340799557773, 1.7558262514618663, 1.7403560082627973], + ], + "criterion_history": np.array( + [ + 4.16816959e01, + 3.20813118e00, + 9.97263708e-03, + ] + ), + "time_history": [ + 0.0, + 0.0003762839987757616, + 0.0007037959985609632, + ], + "batches_history": [0, 1, 2], + "solution": OptimizeResult, # success + }, + ("box_3d", "lbfgsb"): { + "params_history": [ + [0.0, 10.0, 20.0], + [-0.6579976970071755, 10.014197643614924, 19.247113914560085], + [-3.2899884850358774, 10.070988218074623, 16.235569572800433], + ], + "criterion_history": np.array( + [ + 1.03115381e03, + 8.73640769e02, + 9.35093416e02, + ] + ), + "time_history": [ + 0.0, + 0.000555748996703187, + 0.0009771709992492106, + ], + "batches_history": [0, 1, 2], + "solution": OptimizeResult, # failed + }, + ("bard_good_start", "nm"): { + "params_history": [ + [1.0, 1.0, 1.0], + [1.05, 1.0, 1.0], + [0.7999999999999998, 1.1999999999999993, 1.0499999999999994], + [0.08241056, 1.13303608, 2.34369519], + ], + "criterion_history": np.array( + [ + 41.68169586, + 43.90748158, + 23.92563745, + 0.00821487730657897, + ] + ), + "time_history": [ + 0.0, + 3.603900040616281e-05, + 0.0004506860022956971, + 0.00015319500016630627, + ], + "batches_history": [0, 1, 2, 4], + "solution": OptimizeResult, # success + }, + ("box_3d", "nm"): { + "params_history": [ + [0.0, 10.0, 20.0], + [0.025, 10.0, 20.0], + [0.0, 10.5, 20.0], + ], + "criterion_history": np.array( + [1031.15381061, 1031.17836473, 1030.15033678] + ), + "time_history": [ + 0.0, + 5.73799989069812e-05, + 0.00010679600018193014, + ], + "batches_history": [0, 1, 2], + "solution": "some traceback", # error + }, } - results = run_benchmark( - problems, - optimizers, - n_cores=1, # must be 1 for the test to work - ) return problems, optimizers, results @@ -54,29 +154,18 @@ def 
test_convergence_report(options, benchmark_example): assert df.shape == (len(problems), len(expected_columns)) assert set(df.columns) == set(expected_columns) - -def test_convergence_report_with_failed_and_error(benchmark_example): - problems, _, results = benchmark_example - failed_problem = ("bard_good_start", "nm") - error_problem = ("box_3d", "nm") - results[error_problem]["solution"] = "some traceback" - - df = convergence_report(problems=problems, results=results) - - assert df[failed_problem[1]].loc[failed_problem[0]] == "failed" - assert df[error_problem[1]].loc[error_problem[0]] == "error" + assert df["lbfgsb"].loc["box_3d"] == "failed" + assert df["nm"].loc["box_3d"] == "error" # ==================================================================================== # Rank report # ==================================================================================== -keys = ["runtime_measure", "normalize_runtime", "stopping_criterion"] +keys = ["runtime_measure", "stopping_criterion"] runtime_measure = ["n_evaluations", "walltime", "n_batches"] -normalize_runtime = [True, False] RANK_REPORT_OPTIONS = [ - dict(zip(keys, value)) - for value in product(runtime_measure, normalize_runtime, stopping_criterion) + dict(zip(keys, value)) for value in product(runtime_measure, stopping_criterion) ] @@ -89,17 +178,8 @@ def test_rank_report(options, benchmark_example): assert df.shape == (len(problems), len(optimizers)) assert set(df.columns) == set(optimizers.keys()) - -def test_rank_report_with_failed_and_error(benchmark_example): - problems, _, results = benchmark_example - failed_problem = ("bard_good_start", "nm") - error_problem = ("box_3d", "nm") - results[error_problem]["solution"] = "some traceback" - - df = rank_report(problems=problems, results=results) - - assert df[failed_problem[1]].loc[failed_problem[0]] == "failed" - assert df[error_problem[1]].loc[error_problem[0]] == "error" + assert df["lbfgsb"].loc["box_3d"] == "failed" + assert df["nm"].loc["box_3d"] == "error" # ==================================================================================== @@ -107,16 +187,10 @@ def test_rank_report_with_failed_and_error(benchmark_example): # ==================================================================================== -def test_traceback_report_no_error(benchmark_example): +def test_traceback_report(benchmark_example): _, optimizers, results = benchmark_example df = traceback_report(results=results) - assert df.shape == (0, len(optimizers)) - -def test_traceback_report_with_error(benchmark_example): - *_, results = benchmark_example - results[("box_3d", "nm")]["solution"] = "some traceback" - - df = traceback_report(results=results) - assert df.shape[0] > 0 + assert df.shape == (1, len(optimizers)) + assert np.isnan(df.at["box_3d", "lbfgsb"]) diff --git a/tests/visualization/test_profile_plot.py b/tests/visualization/test_profile_plot.py index de9e9db55..2d3a7fabb 100644 --- a/tests/visualization/test_profile_plot.py +++ b/tests/visualization/test_profile_plot.py @@ -4,7 +4,7 @@ from estimagic import get_benchmark_problems from estimagic.benchmarking.run_benchmark import run_benchmark from estimagic.visualization.profile_plot import ( - _create_solution_times, + create_solution_times, _determine_alpha_grid, _find_switch_points, profile_plot, @@ -64,7 +64,7 @@ def test_create_solution_times_n_evaluations(): ) expected.columns.name = "algorithm" - res = _create_solution_times( + res = create_solution_times( df=df, runtime_measure="n_evaluations", converged_info=info ) 
pd.testing.assert_frame_equal(res, expected) @@ -102,9 +102,7 @@ def test_create_solution_times_n_batches(): ) expected.columns.name = "algorithm" - res = _create_solution_times( - df=df, runtime_measure="n_batches", converged_info=info - ) + res = create_solution_times(df=df, runtime_measure="n_batches", converged_info=info) pd.testing.assert_frame_equal(res, expected) @@ -140,7 +138,7 @@ def test_create_solution_times_walltime(): ) expected.columns.name = "algorithm" - res = _create_solution_times(df=df, runtime_measure="walltime", converged_info=info) + res = create_solution_times(df=df, runtime_measure="walltime", converged_info=info) pd.testing.assert_frame_equal(res, expected)
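
For reference, a minimal usage sketch of the reworked report functions. This is not part of the patch itself; the problem set and optimizer configurations are taken from the run_benchmark-based fixture that this commit replaces, and the keyword arguments shown are the defaults from the signatures above.

    from estimagic import get_benchmark_problems
    from estimagic.benchmarking.run_benchmark import run_benchmark
    from estimagic.benchmarking.benchmark_reports import (
        convergence_report,
        rank_report,
        traceback_report,
    )

    # Small, fast setup: the "example" problem set and two scipy optimizers,
    # each capped at 10 criterion evaluations / iterations.
    stop_after_10 = {
        "stopping_max_criterion_evaluations": 10,
        "stopping_max_iterations": 10,
    }
    optimizers = {
        "lbfgsb": {"algorithm": "scipy_lbfgsb", "algo_options": stop_after_10},
        "nm": {"algorithm": "scipy_neldermead", "algo_options": stop_after_10},
    }
    problems = get_benchmark_problems("example")
    results = run_benchmark(problems, optimizers, n_cores=1)

    # Cells hold "success" / "failed" / "error" per (problem, algorithm),
    # plus a "dimensionality" column with the number of parameters per problem.
    convergence = convergence_report(problems=problems, results=results)

    # Ranks are now derived from create_solution_times; the former
    # normalize_runtime switch no longer exists.
    ranks = rank_report(
        problems=problems, results=results, runtime_measure="n_evaluations"
    )

    # One row per problem that produced a traceback, one column per algorithm,
    # with NaN where a run did not error (see test_traceback_report above).
    tracebacks = traceback_report(results=results)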