From 0f73abd12a2c819c23558935a59a185e5f7beccc Mon Sep 17 00:00:00 2001 From: Sebastian Gsell Date: Tue, 2 May 2023 17:10:14 +0200 Subject: [PATCH] Address comments and speed up tests --- .../benchmarking/benchmark_reports.py | 64 ++++--- src/estimagic/visualization/profile_plot.py | 4 +- tests/benchmarking/test_benchmark_reports.py | 166 +++++++++++++----- tests/visualization/test_profile_plot.py | 10 +- 4 files changed, 164 insertions(+), 80 deletions(-) diff --git a/src/estimagic/benchmarking/benchmark_reports.py b/src/estimagic/benchmarking/benchmark_reports.py index fed8bf81d..dfedd23f1 100644 --- a/src/estimagic/benchmarking/benchmark_reports.py +++ b/src/estimagic/benchmarking/benchmark_reports.py @@ -3,6 +3,8 @@ process_benchmark_results, ) +from estimagic.visualization.profile_plot import create_solution_times + def convergence_report( problems, results, *, stopping_criterion="y", x_precision=1e-4, y_precision=1e-4 @@ -31,7 +33,7 @@ def convergence_report( Returns: pandas.DataFrame: columns are the algorithms and the dimensionality of the benchmark problems, indexes are the problems. For the algorithms columns, - the values are strings that are either "success" "failed", or "error". + the values are strings that are either "success", "failed", or "error". For the dimensionality column, the values denote the number of dimensions of the problem. @@ -44,11 +46,7 @@ def convergence_report( y_precision=y_precision, ) - convergence_report = converged_info.replace({True: "success", False: "failed"}) - - for key, value in results.items(): - if isinstance(value["solution"], str): - convergence_report.at[key] = "error" + convergence_report = _get_success_info(results, converged_info) dim = {problem: len(problems[problem]["inputs"]["params"]) for problem in problems} convergence_report["dimensionality"] = convergence_report.index.map(dim) @@ -61,7 +59,6 @@ def rank_report( results, *, runtime_measure="n_evaluations", - normalize_runtime=False, stopping_criterion="y", x_precision=1e-4, y_precision=1e-4, @@ -80,10 +77,6 @@ def rank_report( This is the runtime until the desired convergence was reached by an algorithm. This is called performance measure by Moré and Wild (2009). Default is "n_evaluations". - normalize_runtime (bool): If True the runtime each algorithm needed for each - problem is scaled by the time the fastest algorithm needed. If True, the - resulting plot is what Moré and Wild (2009) called data profiles. - Default is False. stopping_criterion (str): one of "x_and_y", "x_or_y", "x", "y". Determines how convergence is determined from the two precisions. 
x_precision (float or None): how close an algorithm must have gotten to the @@ -111,22 +104,15 @@ def rank_report( x_precision=x_precision, y_precision=y_precision, ) - scenarios = list({algo[1] for algo in results.keys()}) - success_info = converged_info.replace({True: "success", False: "failed"}) - for key, value in results.items(): - if isinstance(value["solution"], str): - success_info.at[key] = "error" - - solution_times = histories.groupby(["problem", "algorithm"])[runtime_measure].max() + solution_times = create_solution_times(histories, runtime_measure, converged_info) + solution_times = solution_times.stack().reset_index() + solution_times = solution_times.rename( + columns={solution_times.columns[2]: runtime_measure} + ) - if normalize_runtime: - solution_times = solution_times.unstack() - solution_times = solution_times.divide(solution_times.min(axis=1), axis=0) - solution_times = solution_times.stack(dropna=False) - solution_times.name = runtime_measure + success_info = _get_success_info(results, converged_info) - solution_times = solution_times.reset_index() solution_times["rank"] = ( solution_times.groupby("problem")[runtime_measure].rank( method="dense", ascending=True @@ -135,8 +121,8 @@ def rank_report( ).astype("Int64") df_wide = solution_times.pivot(index="problem", columns="algorithm", values="rank") - rank_report = df_wide.astype(str)[scenarios] - rank_report[~converged_info] = success_info[scenarios] + rank_report = df_wide.astype(str) + rank_report[~converged_info] = success_info return rank_report @@ -169,3 +155,29 @@ def traceback_report(results): traceback_report = pd.DataFrame.from_dict(tracebacks, orient="columns") return traceback_report + + +def _get_success_info(results, converged_info): + """Create a DataFrame with information on whether an algorithm succeeded or not. + + Args: + results (dict): estimagic benchmarking results dictionary. Keys are + tuples of the form (problem, algorithm), values are dictionaries of the + collected information on the benchmark run, including 'criterion_history' + and 'time_history'. + converged_info (pandas.DataFrame): columns are the algorithms, indexes are the + problems. The values are boolean and True when the algorithm arrived at + the solution with the desired precision. + + Returns: + pandas.DataFrame: columns are the algorithms, indexes are the problems. + values are strings that are either "success", "failed", or "error". + + """ + success_info = converged_info.replace({True: "success", False: "failed"}) + + for key, value in results.items(): + if isinstance(value["solution"], str): + success_info.at[key] = "error" + + return success_info diff --git a/src/estimagic/visualization/profile_plot.py b/src/estimagic/visualization/profile_plot.py index a2081260b..7c96b13d0 100644 --- a/src/estimagic/visualization/profile_plot.py +++ b/src/estimagic/visualization/profile_plot.py @@ -86,7 +86,7 @@ def profile_plot( y_precision=y_precision, ) - solution_times = _create_solution_times( + solution_times = create_solution_times( df, runtime_measure=runtime_measure, converged_info=converged_info, @@ -139,7 +139,7 @@ def profile_plot( return fig -def _create_solution_times(df, runtime_measure, converged_info): +def create_solution_times(df, runtime_measure, converged_info): """Find the solution time for each algorithm and problem. 
Args: diff --git a/tests/benchmarking/test_benchmark_reports.py b/tests/benchmarking/test_benchmark_reports.py index aa5ca6df9..67a1229ef 100644 --- a/tests/benchmarking/test_benchmark_reports.py +++ b/tests/benchmarking/test_benchmark_reports.py @@ -1,31 +1,131 @@ import pytest from itertools import product +import numpy as np -from estimagic import get_benchmark_problems -from estimagic.benchmarking.run_benchmark import run_benchmark from estimagic.benchmarking.benchmark_reports import ( convergence_report, rank_report, traceback_report, ) +from estimagic import OptimizeResult @pytest.fixture def benchmark_example(): - problems = get_benchmark_problems("example") - stop_after_10 = { + _stop_after_10 = { "stopping_max_criterion_evaluations": 10, "stopping_max_iterations": 10, } optimizers = { - "lbfgsb": {"algorithm": "scipy_lbfgsb", "algo_options": stop_after_10}, - "nm": {"algorithm": "scipy_neldermead", "algo_options": stop_after_10}, + "lbfgsb": {"algorithm": "scipy_lbfgsb", "algo_options": _stop_after_10}, + "nm": {"algorithm": "scipy_neldermead", "algo_options": _stop_after_10}, + } + + problems = { + "bard_good_start": { + "inputs": {"params": np.array([1, 1, 1])}, + "solution": { + "params": np.array([0.08241056, 1.13303608, 2.34369519]), + "value": 0.00821487730657897, + }, + "noisy": False, + "info": {}, + "start_criterion": 41.6817, + }, + "box_3d": { + "inputs": {"params": np.array([0, 10, 20])}, + "solution": {"params": np.array([1, 10, 1]), "value": 0}, + "noisy": False, + "info": {}, + "start_criterion": 1031.154, + }, + } + + results = { + ("bard_good_start", "lbfgsb"): { + "params_history": [ + [1.0, 1.0, 1.0], + [0.48286315298120086, 1.6129119244711858, 1.5974181569859445], + [0.09754340799557773, 1.7558262514618663, 1.7403560082627973], + ], + "criterion_history": np.array( + [ + 4.16816959e01, + 3.20813118e00, + 9.97263708e-03, + ] + ), + "time_history": [ + 0.0, + 0.0003762839987757616, + 0.0007037959985609632, + ], + "batches_history": [0, 1, 2], + "solution": OptimizeResult, # success + }, + ("box_3d", "lbfgsb"): { + "params_history": [ + [0.0, 10.0, 20.0], + [-0.6579976970071755, 10.014197643614924, 19.247113914560085], + [-3.2899884850358774, 10.070988218074623, 16.235569572800433], + ], + "criterion_history": np.array( + [ + 1.03115381e03, + 8.73640769e02, + 9.35093416e02, + ] + ), + "time_history": [ + 0.0, + 0.000555748996703187, + 0.0009771709992492106, + ], + "batches_history": [0, 1, 2], + "solution": OptimizeResult, # failed + }, + ("bard_good_start", "nm"): { + "params_history": [ + [1.0, 1.0, 1.0], + [1.05, 1.0, 1.0], + [0.7999999999999998, 1.1999999999999993, 1.0499999999999994], + [0.08241056, 1.13303608, 2.34369519], + ], + "criterion_history": np.array( + [ + 41.68169586, + 43.90748158, + 23.92563745, + 0.00821487730657897, + ] + ), + "time_history": [ + 0.0, + 3.603900040616281e-05, + 0.0004506860022956971, + 0.00015319500016630627, + ], + "batches_history": [0, 1, 2, 4], + "solution": OptimizeResult, # success + }, + ("box_3d", "nm"): { + "params_history": [ + [0.0, 10.0, 20.0], + [0.025, 10.0, 20.0], + [0.0, 10.5, 20.0], + ], + "criterion_history": np.array( + [1031.15381061, 1031.17836473, 1030.15033678] + ), + "time_history": [ + 0.0, + 5.73799989069812e-05, + 0.00010679600018193014, + ], + "batches_history": [0, 1, 2], + "solution": "some traceback", # error + }, } - results = run_benchmark( - problems, - optimizers, - n_cores=1, # must be 1 for the test to work - ) return problems, optimizers, results @@ -54,29 +154,18 @@ def 
test_convergence_report(options, benchmark_example): assert df.shape == (len(problems), len(expected_columns)) assert set(df.columns) == set(expected_columns) - -def test_convergence_report_with_failed_and_error(benchmark_example): - problems, _, results = benchmark_example - failed_problem = ("bard_good_start", "nm") - error_problem = ("box_3d", "nm") - results[error_problem]["solution"] = "some traceback" - - df = convergence_report(problems=problems, results=results) - - assert df[failed_problem[1]].loc[failed_problem[0]] == "failed" - assert df[error_problem[1]].loc[error_problem[0]] == "error" + assert df["lbfgsb"].loc["box_3d"] == "failed" + assert df["nm"].loc["box_3d"] == "error" # ==================================================================================== # Rank report # ==================================================================================== -keys = ["runtime_measure", "normalize_runtime", "stopping_criterion"] +keys = ["runtime_measure", "stopping_criterion"] runtime_measure = ["n_evaluations", "walltime", "n_batches"] -normalize_runtime = [True, False] RANK_REPORT_OPTIONS = [ - dict(zip(keys, value)) - for value in product(runtime_measure, normalize_runtime, stopping_criterion) + dict(zip(keys, value)) for value in product(runtime_measure, stopping_criterion) ] @@ -89,17 +178,8 @@ def test_rank_report(options, benchmark_example): assert df.shape == (len(problems), len(optimizers)) assert set(df.columns) == set(optimizers.keys()) - -def test_rank_report_with_failed_and_error(benchmark_example): - problems, _, results = benchmark_example - failed_problem = ("bard_good_start", "nm") - error_problem = ("box_3d", "nm") - results[error_problem]["solution"] = "some traceback" - - df = rank_report(problems=problems, results=results) - - assert df[failed_problem[1]].loc[failed_problem[0]] == "failed" - assert df[error_problem[1]].loc[error_problem[0]] == "error" + assert df["lbfgsb"].loc["box_3d"] == "failed" + assert df["nm"].loc["box_3d"] == "error" # ==================================================================================== @@ -107,16 +187,10 @@ def test_rank_report_with_failed_and_error(benchmark_example): # ==================================================================================== -def test_traceback_report_no_error(benchmark_example): +def test_traceback_report(benchmark_example): _, optimizers, results = benchmark_example df = traceback_report(results=results) - assert df.shape == (0, len(optimizers)) - -def test_traceback_report_with_error(benchmark_example): - *_, results = benchmark_example - results[("box_3d", "nm")]["solution"] = "some traceback" - - df = traceback_report(results=results) - assert df.shape[0] > 0 + assert df.shape == (1, len(optimizers)) + assert np.isnan(df.at["box_3d", "lbfgsb"]) diff --git a/tests/visualization/test_profile_plot.py b/tests/visualization/test_profile_plot.py index de9e9db55..2d3a7fabb 100644 --- a/tests/visualization/test_profile_plot.py +++ b/tests/visualization/test_profile_plot.py @@ -4,7 +4,7 @@ from estimagic import get_benchmark_problems from estimagic.benchmarking.run_benchmark import run_benchmark from estimagic.visualization.profile_plot import ( - _create_solution_times, + create_solution_times, _determine_alpha_grid, _find_switch_points, profile_plot, @@ -64,7 +64,7 @@ def test_create_solution_times_n_evaluations(): ) expected.columns.name = "algorithm" - res = _create_solution_times( + res = create_solution_times( df=df, runtime_measure="n_evaluations", converged_info=info ) 
pd.testing.assert_frame_equal(res, expected) @@ -102,9 +102,7 @@ def test_create_solution_times_n_batches(): ) expected.columns.name = "algorithm" - res = _create_solution_times( - df=df, runtime_measure="n_batches", converged_info=info - ) + res = create_solution_times(df=df, runtime_measure="n_batches", converged_info=info) pd.testing.assert_frame_equal(res, expected) @@ -140,7 +138,7 @@ def test_create_solution_times_walltime(): ) expected.columns.name = "algorithm" - res = _create_solution_times(df=df, runtime_measure="walltime", converged_info=info) + res = create_solution_times(df=df, runtime_measure="walltime", converged_info=info) pd.testing.assert_frame_equal(res, expected)
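
For reference, a minimal usage sketch of the reworked report functions. This is not part of the patch itself; the problem set and optimizer configurations are taken from the run_benchmark-based fixture that this commit replaces, and the keyword arguments shown are the defaults from the signatures above.

    from estimagic import get_benchmark_problems
    from estimagic.benchmarking.run_benchmark import run_benchmark
    from estimagic.benchmarking.benchmark_reports import (
        convergence_report,
        rank_report,
        traceback_report,
    )

    # Small, fast setup: the "example" problem set and two scipy optimizers,
    # each capped at 10 criterion evaluations / iterations.
    stop_after_10 = {
        "stopping_max_criterion_evaluations": 10,
        "stopping_max_iterations": 10,
    }
    optimizers = {
        "lbfgsb": {"algorithm": "scipy_lbfgsb", "algo_options": stop_after_10},
        "nm": {"algorithm": "scipy_neldermead", "algo_options": stop_after_10},
    }
    problems = get_benchmark_problems("example")
    results = run_benchmark(problems, optimizers, n_cores=1)

    # Cells hold "success" / "failed" / "error" per (problem, algorithm),
    # plus a "dimensionality" column with the number of parameters per problem.
    convergence = convergence_report(problems=problems, results=results)

    # Ranks are now derived from create_solution_times; the former
    # normalize_runtime switch no longer exists.
    ranks = rank_report(
        problems=problems, results=results, runtime_measure="n_evaluations"
    )

    # One row per problem that produced a traceback, one column per algorithm,
    # with NaN where a run did not error (see test_traceback_report above).
    tracebacks = traceback_report(results=results)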