diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index b489eba036..a8247a247b 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -16,7 +16,9 @@ jobs: IRIS_TEST_DATA_PATH: benchmarks/iris-test-data IRIS_TEST_DATA_VERSION: "2.5" # Lets us manually bump the cache to rebuild + ENV_CACHE_BUILD: "0" TEST_DATA_CACHE_BUILD: "2" + PY_VER: 3.8 steps: # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it @@ -32,19 +34,15 @@ jobs: run: | pip install nox - - name: Cache .nox and .asv/env directories + - name: Cache environment directories id: cache-env-dir uses: actions/cache@v2 with: path: | .nox benchmarks/.asv/env - # Make sure GHA never gets an exact cache match by using the unique - # github.sha. This means it will always store this run as a new - # cache (Nox may have made relevant changes during run). Cache - # restoration still succeeds via the partial restore-key match. - key: ${{ runner.os }}-${{ github.sha }} - restore-keys: ${{ runner.os }} + $CONDA/pkgs + key: ${{ runner.os }}-${{ hashFiles('requirements/') }}-${{ env.ENV_CACHE_BUILD }} - name: Cache test data directory id: cache-test-data @@ -62,7 +60,7 @@ jobs: unzip -q iris-test-data.zip mkdir --parents ${GITHUB_WORKSPACE}/${IRIS_TEST_DATA_LOC_PATH} mv iris-test-data-${IRIS_TEST_DATA_VERSION} ${GITHUB_WORKSPACE}/${IRIS_TEST_DATA_PATH} - + - name: Set test data var run: | echo "OVERRIDE_TEST_DATA_REPOSITORY=${GITHUB_WORKSPACE}/${IRIS_TEST_DATA_PATH}/test_data" >> $GITHUB_ENV diff --git a/benchmarks/benchmarks/__init__.py b/benchmarks/benchmarks/__init__.py index 2e741c3da0..4a964a648d 100644 --- a/benchmarks/benchmarks/__init__.py +++ b/benchmarks/benchmarks/__init__.py @@ -5,45 +5,4 @@ # licensing details. """Common code for benchmarks.""" -import os -from pathlib import Path - -# Environment variable names -_ASVDIR_VARNAME = "ASV_DIR" # As set in nightly script "asv_nightly/asv.sh" -_DATADIR_VARNAME = "BENCHMARK_DATA" # For local runs - ARTIFICIAL_DIM_SIZE = int(10e3) # For all artificial cubes, coords etc. - -# Work out where the benchmark data dir is. -asv_dir = os.environ.get("ASV_DIR", None) -if asv_dir: - # For an overnight run, this comes from the 'ASV_DIR' setting. - benchmark_data_dir = Path(asv_dir) / "data" -else: - # For a local run, you set 'BENCHMARK_DATA'. - benchmark_data_dir = os.environ.get(_DATADIR_VARNAME, None) - if benchmark_data_dir is not None: - benchmark_data_dir = Path(benchmark_data_dir) - - -def testdata_path(*path_names): - """ - Return the path of a benchmark test data file. - - These are based from a test-data location dir, which is either - ${}/data (for overnight tests), or ${} for local testing. - - If neither of these were set, an error is raised. - - """.format( - _ASVDIR_VARNAME, _DATADIR_VARNAME - ) - if benchmark_data_dir is None: - msg = ( - "Benchmark data dir is not defined : " - 'Either "${}" or "${}" must be set.' - ) - raise (ValueError(msg.format(_ASVDIR_VARNAME, _DATADIR_VARNAME))) - path = benchmark_data_dir.joinpath(*path_names) - path = str(path) # Because Iris doesn't understand Path objects yet. - return path diff --git a/benchmarks/benchmarks/generate_data/__init__.py b/benchmarks/benchmarks/generate_data/__init__.py new file mode 100644 index 0000000000..a56f2e4623 --- /dev/null +++ b/benchmarks/benchmarks/generate_data/__init__.py @@ -0,0 +1,94 @@ +# Copyright Iris contributors +# +# This file is part of Iris and is released under the LGPL license. 
+# See COPYING and COPYING.LESSER in the root of the repository for full +# licensing details. +""" +Scripts for generating supporting data for benchmarking. + +Data generated using Iris should use :func:`run_function_elsewhere`, which +means that data is generated using a fixed version of Iris and a fixed +environment, rather than those that get changed when the benchmarking run +checks out a new commit. + +Downstream use of data generated 'elsewhere' requires saving; usually in a +NetCDF file. Could also use pickling but there is a potential risk if the +benchmark sequence runs over two different Python versions. + +""" +from inspect import getsource +from os import environ +from pathlib import Path +from subprocess import CalledProcessError, check_output, run +from textwrap import dedent + +#: Python executable used by :func:`run_function_elsewhere`, set via env +#: variable of same name. Must be path of Python within an environment that +#: includes Iris (including dependencies and test modules) and Mule. +try: + DATA_GEN_PYTHON = environ["DATA_GEN_PYTHON"] + _ = check_output([DATA_GEN_PYTHON, "-c", "a = True"]) +except KeyError: + error = "Env variable DATA_GEN_PYTHON not defined." + raise KeyError(error) +except (CalledProcessError, FileNotFoundError, PermissionError): + error = ( + "Env variable DATA_GEN_PYTHON not a runnable python executable path." + ) + raise ValueError(error) + +# The default location of data files used in benchmarks. Used by CI. +default_data_dir = (Path(__file__).parents[2] / ".data").resolve() +# Optionally override the default data location with environment variable. +BENCHMARK_DATA = Path(environ.get("BENCHMARK_DATA", default_data_dir)) +if BENCHMARK_DATA == default_data_dir: + BENCHMARK_DATA.mkdir(exist_ok=True) +elif not BENCHMARK_DATA.is_dir(): + message = f"Not a directory: {BENCHMARK_DATA} ." + raise ValueError(message) + +# Manual flag to allow the rebuilding of synthetic data. +# False forces a benchmark run to re-make all the data files. +REUSE_DATA = True + + +def run_function_elsewhere(func_to_run, *args, **kwargs): + """ + Run a given function using the :const:`DATA_GEN_PYTHON` executable. + + This structure allows the function to be written natively. + + Parameters + ---------- + func_to_run : FunctionType + The function object to be run. + NOTE: the function must be completely self-contained, i.e. perform all + its own imports (within the target :const:`DATA_GEN_PYTHON` + environment). + *args : tuple, optional + Function call arguments. Must all be expressible as simple literals, + i.e. the ``repr`` must be a valid literal expression. + **kwargs: dict, optional + Function call keyword arguments. All values must be expressible as + simple literals (see ``*args``). + + Returns + ------- + str + The ``stdout`` from the run. 
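+
+    Examples
+    --------
+    A minimal sketch of intended use; the function and argument here are
+    hypothetical, not part of the benchmark suite::
+
+        def _print_sum(n):
+            # Self-contained: performs its own imports.
+            import numpy as np
+
+            print(np.arange(n).sum())
+
+        output = run_function_elsewhere(_print_sum, 5)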
+ + """ + func_string = dedent(getsource(func_to_run)) + func_string = func_string.replace("@staticmethod\n", "") + func_call_term_strings = [repr(arg) for arg in args] + func_call_term_strings += [ + f"{name}={repr(val)}" for name, val in kwargs.items() + ] + func_call_string = ( + f"{func_to_run.__name__}(" + ",".join(func_call_term_strings) + ")" + ) + python_string = "\n".join([func_string, func_call_string]) + result = run( + [DATA_GEN_PYTHON, "-c", python_string], capture_output=True, check=True + ) + return result.stdout diff --git a/benchmarks/benchmarks/generate_data/um_files.py b/benchmarks/benchmarks/generate_data/um_files.py new file mode 100644 index 0000000000..8792fcc48b --- /dev/null +++ b/benchmarks/benchmarks/generate_data/um_files.py @@ -0,0 +1,215 @@ +# Copyright Iris contributors +# +# This file is part of Iris and is released under the LGPL license. +# See COPYING and COPYING.LESSER in the root of the repository for full +# licensing details. +""" +Generate FF, PP and NetCDF files based on a minimal synthetic FF file. + +NOTE: uses the Mule package, so depends on an environment with Mule installed. +""" + + +def _create_um_files( + len_x: int, len_y: int, len_z: int, len_t: int, compress, save_paths: dict +) -> None: + """ + Generate an FF object of given shape and compression, save to FF/PP/NetCDF. + + This is run externally + (:func:`benchmarks.generate_data.run_function_elsewhere`), so all imports + are self-contained and input parameters are simple types. + """ + from copy import deepcopy + from datetime import datetime + from tempfile import NamedTemporaryFile + + from mo_pack import compress_wgdos as mo_pack_compress + from mule import ArrayDataProvider, Field3, FieldsFile + from mule.pp import fields_to_pp_file + import numpy as np + + from iris import load_cube + from iris import save as save_cube + + def packing_patch(*compress_args, **compress_kwargs) -> bytes: + """ + Force conversion from returned :class:`memoryview` to :class:`bytes`. + + Downstream uses of :func:`mo_pack.compress_wgdos` were written + for the ``Python2`` behaviour, where the returned buffer had a + different ``__len__`` value to the current :class:`memoryview`. + Unable to fix directly in Mule, so monkey patching for now. + """ + return mo_pack_compress(*compress_args, **compress_kwargs).tobytes() + + import mo_pack + + mo_pack.compress_wgdos = packing_patch + + ######## + + template = { + "fixed_length_header": {"dataset_type": 3, "grid_staggering": 3}, + "integer_constants": { + "num_p_levels": len_z, + "num_cols": len_x, + "num_rows": len_y, + }, + "real_constants": {}, + "level_dependent_constants": {"dims": (len_z + 1, None)}, + } + new_ff = FieldsFile.from_template(deepcopy(template)) + + data_array = np.arange(len_x * len_y).reshape(len_x, len_y) + array_provider = ArrayDataProvider(data_array) + + def add_field(level_: int, time_step_: int) -> None: + """ + Add a minimal field to the new :class:`~mule.FieldsFile`. + + Includes the minimum information to allow Mule saving and Iris + loading, as well as incrementation for vertical levels and time + steps to allow generation of z and t dimensions. + """ + new_field = Field3.empty() + # To correspond to the header-release 3 class used. + new_field.lbrel = 3 + # Mule uses the first element of the lookup to test for + # unpopulated fields (and skips them), so the first element should + # be set to something. The year will do. + new_field.raw[1] = datetime.now().year + + # Horizontal. 
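+            # (For context: lbcode=1 declares a regular lat-lon grid;
+            # bzx/bzy set the grid origin relative to the file's start
+            # lon/lat, offset back by half a grid spacing.)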
+ new_field.lbcode = 1 + new_field.lbnpt = len_x + new_field.lbrow = len_y + new_field.bdx = new_ff.real_constants.col_spacing + new_field.bdy = new_ff.real_constants.row_spacing + new_field.bzx = new_ff.real_constants.start_lon - 0.5 * new_field.bdx + new_field.bzy = new_ff.real_constants.start_lat - 0.5 * new_field.bdy + + # Hemisphere. + new_field.lbhem = 32 + # Processing. + new_field.lbproc = 0 + + # Vertical. + # Hybrid height values by simulating sequences similar to those in a + # theta file. + new_field.lbvc = 65 + if level_ == 0: + new_field.lblev = 9999 + else: + new_field.lblev = level_ + + level_1 = level_ + 1 + six_rec = 20 / 3 + three_rec = six_rec / 2 + + new_field.blev = level_1 ** 2 * six_rec - six_rec + new_field.brsvd1 = ( + level_1 ** 2 * six_rec + (six_rec * level_1) - three_rec + ) + + brsvd2_simulated = np.linspace(0.995, 0, len_z) + shift = min(len_z, 2) + bhrlev_simulated = np.concatenate( + [np.ones(shift), brsvd2_simulated[:-shift]] + ) + new_field.brsvd2 = brsvd2_simulated[level_] + new_field.bhrlev = bhrlev_simulated[level_] + + # Time. + new_field.lbtim = 11 + + new_field.lbyr = time_step_ + for attr_name in ["lbmon", "lbdat", "lbhr", "lbmin", "lbsec"]: + setattr(new_field, attr_name, 0) + + new_field.lbyrd = time_step_ + 1 + for attr_name in ["lbmond", "lbdatd", "lbhrd", "lbmind", "lbsecd"]: + setattr(new_field, attr_name, 0) + + # Data and packing. + new_field.lbuser1 = 1 + new_field.lbpack = int(compress) + new_field.bacc = 0 + new_field.bmdi = -1 + new_field.lbext = 0 + new_field.set_data_provider(array_provider) + + new_ff.fields.append(new_field) + + for time_step in range(len_t): + for level in range(len_z): + add_field(level, time_step + 1) + + ff_path = save_paths.get("FF", None) + pp_path = save_paths.get("PP", None) + nc_path = save_paths.get("NetCDF", None) + + if ff_path: + new_ff.to_file(ff_path) + if pp_path: + fields_to_pp_file(str(pp_path), new_ff.fields) + if nc_path: + temp_ff_path = None + # Need an Iris Cube from the FF content. + if ff_path: + # Use the existing file. + ff_cube = load_cube(ff_path) + else: + # Make a temporary file. + temp_ff_path = NamedTemporaryFile() + new_ff.to_file(temp_ff_path.name) + ff_cube = load_cube(temp_ff_path.name) + + save_cube(ff_cube, nc_path, zlib=compress) + if temp_ff_path: + temp_ff_path.close() + + +FILE_EXTENSIONS = {"FF": "", "PP": ".pp", "NetCDF": ".nc"} + + +def create_um_files( + len_x: int, + len_y: int, + len_z: int, + len_t: int, + compress: bool, + file_types: list, +) -> dict: + """ + Generate FF-based FF / PP / NetCDF files with specified shape and compression. + + All files representing a given shape are saved in a dedicated directory. A + dictionary of the saved paths is returned. + + If the required files exist, they are re-used, unless + :const:`benchmarks.REUSE_DATA` is ``False``. + """ + # Self contained imports to avoid linting confusion with _create_um_files(). + from . 
import BENCHMARK_DATA, REUSE_DATA, run_function_elsewhere + + save_name_sections = ["UM", len_x, len_y, len_z, len_t] + save_name = "_".join(str(section) for section in save_name_sections) + save_dir = BENCHMARK_DATA / save_name + if not save_dir.is_dir(): + save_dir.mkdir(parents=True) + + save_paths = {} + files_exist = True + for file_type in file_types: + file_ext = FILE_EXTENSIONS[file_type] + save_path = (save_dir / f"{compress}").with_suffix(file_ext) + files_exist = files_exist and save_path.is_file() + save_paths[file_type] = str(save_path) + + if not REUSE_DATA or not files_exist: + _ = run_function_elsewhere( + _create_um_files, len_x, len_y, len_z, len_t, compress, save_paths + ) + + return save_paths diff --git a/benchmarks/benchmarks/loading.py b/benchmarks/benchmarks/loading.py new file mode 100644 index 0000000000..4558c3b5cb --- /dev/null +++ b/benchmarks/benchmarks/loading.py @@ -0,0 +1,185 @@ +# Copyright Iris contributors +# +# This file is part of Iris and is released under the LGPL license. +# See COPYING and COPYING.LESSER in the root of the repository for full +# licensing details. +""" +File loading benchmark tests. + +Where applicable benchmarks should be parameterised for two sizes of input data: + * minimal: enables detection of regressions in parts of the run-time that do + NOT scale with data size. + * large: large enough to exclusively detect regressions in parts of the + run-time that scale with data size. Size should be _just_ large + enough - don't want to bloat benchmark runtime. + +""" + +from iris import AttributeConstraint, Constraint, load, load_cube +from iris.cube import Cube +from iris.fileformats.um import structured_um_loading + +from .generate_data import BENCHMARK_DATA, REUSE_DATA, run_function_elsewhere +from .generate_data.um_files import create_um_files + + +class LoadAndRealise: + params = [ + [(2, 2, 2), (1280, 960, 5)], + [False, True], + ["FF", "PP", "NetCDF"], + ] + param_names = ["xyz", "compressed", "file_format"] + + def setup_cache(self) -> dict: + file_type_args = self.params[2] + file_path_dict = {} + for xyz in self.params[0]: + file_path_dict[xyz] = {} + x, y, z = xyz + for compress in self.params[1]: + file_path_dict[xyz][compress] = create_um_files( + x, y, z, 1, compress, file_type_args + ) + return file_path_dict + + def setup( + self, + file_path_dict: dict, + xyz: tuple, + compress: bool, + file_format: str, + ) -> None: + self.file_path = file_path_dict[xyz][compress][file_format] + self.cube = self.load() + + def load(self) -> Cube: + return load_cube(self.file_path) + + def time_load(self, _, __, ___, ____) -> None: + _ = self.load() + + def time_realise(self, _, __, ___, ____) -> None: + # Don't touch cube.data - permanent realisation plays badly with ASV's + # re-run strategy. + assert self.cube.has_lazy_data() + self.cube.core_data().compute() + + +class STASHConstraint: + # xyz sizes mimic LoadAndRealise to maximise file re-use. 
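+    # create_um_files will find the files already generated for
+    # LoadAndRealise on disk and re-use them rather than regenerating.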
+ params = [[(2, 2, 2), (1280, 960, 5)], ["FF", "PP"]] + param_names = ["xyz", "file_format"] + + def setup_cache(self) -> dict: + file_type_args = self.params[1] + file_path_dict = {} + for xyz in self.params[0]: + x, y, z = xyz + file_path_dict[xyz] = create_um_files( + x, y, z, 1, False, file_type_args + ) + return file_path_dict + + def setup( + self, file_path_dict: dict, xyz: tuple, file_format: str + ) -> None: + self.file_path = file_path_dict[xyz][file_format] + + def time_stash_constraint(self, _, __, ___) -> None: + _ = load_cube(self.file_path, AttributeConstraint(STASH="m??s??i901")) + + +class TimeConstraint: + params = [[3, 20], ["FF", "PP", "NetCDF"]] + param_names = ["time_dim_len", "file_format"] + + def setup_cache(self) -> dict: + file_type_args = self.params[1] + file_path_dict = {} + for time_dim_len in self.params[0]: + file_path_dict[time_dim_len] = create_um_files( + 20, 20, 5, time_dim_len, False, file_type_args + ) + return file_path_dict + + def setup( + self, file_path_dict: dict, time_dim_len: int, file_format: str + ) -> None: + self.file_path = file_path_dict[time_dim_len][file_format] + self.time_constr = Constraint(time=lambda cell: cell.point.year < 3) + + def time_time_constraint(self, _, __, ___) -> None: + _ = load_cube(self.file_path, self.time_constr) + + +class ManyVars: + FILE_PATH = BENCHMARK_DATA / "many_var_file.nc" + + @staticmethod + def _create_file(save_path: str) -> None: + """Is run externally - everything must be self-contained.""" + import numpy as np + + from iris import save + from iris.coords import AuxCoord + from iris.cube import Cube + + data_len = 8 + data = np.arange(data_len) + cube = Cube(data, units="unknown") + extra_vars = 80 + names = ["coord_" + str(i) for i in range(extra_vars)] + for name in names: + coord = AuxCoord(data, long_name=name, units="unknown") + cube.add_aux_coord(coord, 0) + save(cube, save_path) + + def setup_cache(self) -> None: + if not REUSE_DATA or not self.FILE_PATH.is_file(): + # See :mod:`benchmarks.generate_data` docstring for full explanation. + _ = run_function_elsewhere( + self._create_file, + str(self.FILE_PATH), + ) + + def time_many_var_load(self) -> None: + _ = load(str(self.FILE_PATH)) + + +class StructuredFF: + """ + Test structured loading of a large-ish fieldsfile. + + Structured load of the larger size should show benefit over standard load, + avoiding the cost of merging. 
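+
+    A sketch of the toggled path (``ff_path`` is illustrative)::
+
+        with structured_um_loading():
+            cubes = load(ff_path)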
+ """ + + params = [[(2, 2, 2), (1280, 960, 5)], [False, True]] + param_names = ["xyz", "structured_loading"] + + def setup_cache(self) -> dict: + file_path_dict = {} + for xyz in self.params[0]: + x, y, z = xyz + file_path_dict[xyz] = create_um_files(x, y, z, 1, False, ["FF"]) + return file_path_dict + + def setup(self, file_path_dict, xyz, structured_load): + self.file_path = file_path_dict[xyz]["FF"] + self.structured_load = structured_load + + def load(self): + """Load the whole file (in fact there is only 1 cube).""" + + def _load(): + _ = load(self.file_path) + + if self.structured_load: + with structured_um_loading(): + _load() + else: + _load() + + def time_structured_load(self, _, __, ___): + self.load() diff --git a/noxfile.py b/noxfile.py index 6367b74aef..0600540c5b 100755 --- a/noxfile.py +++ b/noxfile.py @@ -289,7 +289,7 @@ def linkcheck(session: nox.sessions.Session): ) -@nox.session(python=PY_VER[-1], venv_backend="conda") +@nox.session(python=PY_VER, venv_backend="conda") @nox.parametrize( ["ci_mode"], [True, False], @@ -297,7 +297,7 @@ def linkcheck(session: nox.sessions.Session): ) def benchmarks(session: nox.sessions.Session, ci_mode: bool): """ - Perform esmf-regrid performance benchmarks (using Airspeed Velocity). + Perform Iris performance benchmarks (using Airspeed Velocity). Parameters ---------- @@ -315,6 +315,47 @@ def benchmarks(session: nox.sessions.Session, ci_mode: bool): """ session.install("asv", "nox") + + data_gen_var = "DATA_GEN_PYTHON" + if data_gen_var in os.environ: + print("Using existing data generation environment.") + else: + print("Setting up the data generation environment...") + # Get Nox to build an environment for the `tests` session, but don't + # run the session. Will re-use a cached environment if appropriate. + session.run_always( + "nox", + "--session=tests", + "--install-only", + f"--python={session.python}", + ) + # Find the environment built above, set it to be the data generation + # environment. + data_gen_python = next( + Path(".nox").rglob(f"tests*/bin/python{session.python}") + ).resolve() + session.env[data_gen_var] = data_gen_python + + mule_dir = data_gen_python.parents[1] / "resources" / "mule" + if not mule_dir.is_dir(): + print("Installing Mule into data generation environment...") + session.run_always( + "git", + "clone", + "https://github.com/metomi/mule.git", + str(mule_dir), + external=True, + ) + session.run_always( + str(data_gen_python), + "-m", + "pip", + "install", + str(mule_dir / "mule"), + external=True, + ) + + print("Running ASV...") session.cd("benchmarks") # Skip over setup questions for a new machine. session.run("asv", "machine", "--yes")
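
For orientation, a rough sketch of how the new pieces fit together in a local
run; the interpreter path and shape values below are illustrative, not taken
from the diff:

    import os

    # Any Python with Iris (plus its test dependencies) and Mule installed.
    # The nox `benchmarks` session sets this itself, via the `tests` session,
    # when the variable is unset.
    os.environ["DATA_GEN_PYTHON"] = "/path/to/env/bin/python"  # illustrative

    from benchmarks.generate_data.um_files import create_um_files

    # Writes (or, while REUSE_DATA is True, re-uses) UM_2_2_2_1/False,
    # False.pp and False.nc under BENCHMARK_DATA, returning string paths:
    # {"FF": ..., "PP": ..., "NetCDF": ...}.
    paths = create_um_files(2, 2, 2, 1, False, ["FF", "PP", "NetCDF"])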