Loading Benchmarks #4477

Merged: 24 commits, Feb 14, 2022

Commits
bdc7175
Synthetic FF PP NetCDF and loading benchmarks.
trexfeathers Dec 17, 2021
eb3d772
Remove legacy benchmark data directory handling.
trexfeathers Jan 5, 2022
94c855b
GitHub benchmark action fixed PY_VER.
trexfeathers Jan 5, 2022
1e7b6a0
Missing licence headers.
trexfeathers Jan 5, 2022
83eef27
Cache generated benchmark data.
trexfeathers Jan 5, 2022
94aff73
ALWAYS cache benchmark generated data.
trexfeathers Jan 5, 2022
fe82b24
Also add StructuredFF benchmark.
trexfeathers Jan 5, 2022
7fe7d5c
Revert "ALWAYS cache benchmark generated data."
trexfeathers Jan 6, 2022
0a806f1
Revert "Cache generated benchmark data."
trexfeathers Jan 6, 2022
7561195
Improved benchmark GHA env caching (2min faster?)
trexfeathers Jan 6, 2022
d503ce6
[pre-commit.ci] pre-commit autoupdate (#4560)
pre-commit-ci[bot] Feb 1, 2022
81c4bcf
Merge remote-tracking branch 'upstream/main' into all_benchmarks
trexfeathers Feb 1, 2022
ca169af
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 1, 2022
73b8d84
Kick Cirrus.
trexfeathers Feb 2, 2022
8658bde
Merge remote-tracking branch 'upstream/main' into all_benchmarks
trexfeathers Feb 10, 2022
fc9a1cb
Revert "Merge remote-tracking branch 'upstream/main' into all_benchma…
trexfeathers Feb 10, 2022
56ae140
Revert "Revert "Merge remote-tracking branch 'upstream/main' into all…
trexfeathers Feb 10, 2022
585f266
Revert "[pre-commit.ci] pre-commit autoupdate (#4560)"
trexfeathers Feb 10, 2022
d9cef30
Revert "[pre-commit.ci] auto fixes from pre-commit.com hooks"
trexfeathers Feb 10, 2022
39f086c
Fix inconsistency with upstream in dev.rst.template.
trexfeathers Feb 10, 2022
9fbcb33
Loading benchmark review clarifications.
trexfeathers Feb 11, 2022
79a8228
ManyVars benchmark use setup_cache.
trexfeathers Feb 11, 2022
b69a388
Clarify file re-use in benchmarks um_files generator.
trexfeathers Feb 11, 2022
f96480d
Benchmarking better strategy for not permanently realising arrays.
trexfeathers Feb 14, 2022
14 changes: 6 additions & 8 deletions .github/workflows/benchmark.yml
@@ -16,7 +16,9 @@ jobs:
IRIS_TEST_DATA_PATH: benchmarks/iris-test-data
IRIS_TEST_DATA_VERSION: "2.5"
# Lets us manually bump the cache to rebuild
ENV_CACHE_BUILD: "0"
TEST_DATA_CACHE_BUILD: "2"
PY_VER: 3.8

steps:
# Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
@@ -32,19 +34,15 @@
run: |
pip install nox

- name: Cache .nox and .asv/env directories
- name: Cache environment directories
id: cache-env-dir
uses: actions/cache@v2
with:
path: |
.nox
benchmarks/.asv/env
# Make sure GHA never gets an exact cache match by using the unique
# github.sha. This means it will always store this run as a new
# cache (Nox may have made relevant changes during run). Cache
# restoration still succeeds via the partial restore-key match.
key: ${{ runner.os }}-${{ github.sha }}
restore-keys: ${{ runner.os }}
$CONDA/pkgs
key: ${{ runner.os }}-${{ hashFiles('requirements/') }}-${{ env.ENV_CACHE_BUILD }}

- name: Cache test data directory
id: cache-test-data
@@ -62,7 +60,7 @@
unzip -q iris-test-data.zip
mkdir --parents ${GITHUB_WORKSPACE}/${IRIS_TEST_DATA_LOC_PATH}
mv iris-test-data-${IRIS_TEST_DATA_VERSION} ${GITHUB_WORKSPACE}/${IRIS_TEST_DATA_PATH}

- name: Set test data var
run: |
echo "OVERRIDE_TEST_DATA_REPOSITORY=${GITHUB_WORKSPACE}/${IRIS_TEST_DATA_PATH}/test_data" >> $GITHUB_ENV
41 changes: 0 additions & 41 deletions benchmarks/benchmarks/__init__.py
@@ -5,45 +5,4 @@
# licensing details.
"""Common code for benchmarks."""

import os
from pathlib import Path

# Environment variable names
_ASVDIR_VARNAME = "ASV_DIR" # As set in nightly script "asv_nightly/asv.sh"
_DATADIR_VARNAME = "BENCHMARK_DATA" # For local runs

ARTIFICIAL_DIM_SIZE = int(10e3) # For all artificial cubes, coords etc.

# Work out where the benchmark data dir is.
asv_dir = os.environ.get("ASV_DIR", None)
if asv_dir:
    # For an overnight run, this comes from the 'ASV_DIR' setting.
    benchmark_data_dir = Path(asv_dir) / "data"
else:
    # For a local run, you set 'BENCHMARK_DATA'.
    benchmark_data_dir = os.environ.get(_DATADIR_VARNAME, None)
    if benchmark_data_dir is not None:
        benchmark_data_dir = Path(benchmark_data_dir)


def testdata_path(*path_names):
    """
    Return the path of a benchmark test data file.

    These are based from a test-data location dir, which is either
    ${}/data (for overnight tests), or ${} for local testing.

    If neither of these were set, an error is raised.

    """.format(
        _ASVDIR_VARNAME, _DATADIR_VARNAME
    )
    if benchmark_data_dir is None:
        msg = (
            "Benchmark data dir is not defined : "
            'Either "${}" or "${}" must be set.'
        )
        raise (ValueError(msg.format(_ASVDIR_VARNAME, _DATADIR_VARNAME)))
    path = benchmark_data_dir.joinpath(*path_names)
    path = str(path)  # Because Iris doesn't understand Path objects yet.
    return path
91 changes: 91 additions & 0 deletions benchmarks/benchmarks/generate_data/__init__.py
@@ -0,0 +1,91 @@
# Copyright Iris contributors
#
# This file is part of Iris and is released under the LGPL license.
# See COPYING and COPYING.LESSER in the root of the repository for full
# licensing details.
"""
Scripts for generating supporting data for benchmarking.

Data generation that uses Iris should go through :func:`run_function_elsewhere`,
so that data is generated with a fixed version of Iris in a fixed environment,
rather than with whatever the benchmarking run has checked out for the commit
under test.

Downstream use of data generated 'elsewhere' requires saving it, usually to a
NetCDF file. Pickling could also be used, but carries a potential risk if the
benchmark sequence runs over two different Python versions.

"""
from inspect import getsource
from os import environ
from pathlib import Path
from subprocess import CalledProcessError, check_output, run
from textwrap import dedent

#: Python executable used by :func:`run_function_elsewhere`, set via env
#: variable of same name. Must be path of Python within an environment that
#: includes Iris (including dependencies and test modules) and Mule.
try:
    DATA_GEN_PYTHON = environ["DATA_GEN_PYTHON"]
    _ = check_output([DATA_GEN_PYTHON, "-c", "a = True"])
except KeyError:
    error = "Env variable DATA_GEN_PYTHON not defined."
    raise KeyError(error)
except (CalledProcessError, FileNotFoundError, PermissionError):
    error = (
        "Env variable DATA_GEN_PYTHON not a runnable python executable path."
    )
    raise ValueError(error)

default_data_dir = (Path(__file__).parents[2] / ".data").resolve()
BENCHMARK_DATA = Path(environ.get("BENCHMARK_DATA", default_data_dir))
if BENCHMARK_DATA == default_data_dir:
    BENCHMARK_DATA.mkdir(exist_ok=True)
elif not BENCHMARK_DATA.is_dir():
    message = f"Not a directory: {BENCHMARK_DATA} ."
    raise ValueError(message)

# Manual flag controlling re-use of synthetic data (set False to force rebuilding).
REUSE_DATA = True


def run_function_elsewhere(func_to_run, *args, **kwargs):
    """
    Run a given function using the :const:`DATA_GEN_PYTHON` executable.

    This structure allows the function to be written natively.

    Parameters
    ----------
    func_to_run : FunctionType
        The function object to be run.
        NOTE: the function must be completely self-contained, i.e. perform all
        its own imports (within the target :const:`DATA_GEN_PYTHON`
        environment).
    *args : tuple, optional
        Function call arguments. Must all be expressible as simple literals,
        i.e. the ``repr`` must be a valid literal expression.
    **kwargs : dict, optional
        Function call keyword arguments. All values must be expressible as
        simple literals (see ``*args``).

    Returns
    -------
    str
        The ``stdout`` from the run.

    """
    func_string = dedent(getsource(func_to_run))
    func_string = func_string.replace("@staticmethod\n", "")
    func_call_term_strings = [repr(arg) for arg in args]
    func_call_term_strings += [
        f"{name}={repr(val)}" for name, val in kwargs.items()
    ]
    func_call_string = (
        f"{func_to_run.__name__}(" + ",".join(func_call_term_strings) + ")"
    )
    python_string = "\n".join([func_string, func_call_string])
    result = run(
        [DATA_GEN_PYTHON, "-c", python_string], capture_output=True, check=True
    )
    return result.stdout
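
For context (not part of this PR's diff): a minimal sketch of how a benchmark module might use run_function_elsewhere, following the constraints described above. The helper name, the cube it writes, and the import path are illustrative assumptions, not code from this PR.

from benchmarks.generate_data import BENCHMARK_DATA, run_function_elsewhere


def _make_synthetic_cube_file(n_points, save_path):
    # Completely self-contained: all imports happen inside the function,
    # because only this function's source is re-executed by DATA_GEN_PYTHON.
    import numpy as np

    from iris import save
    from iris.cube import Cube

    cube = Cube(np.zeros(n_points, dtype=np.float32), var_name="synthetic")
    save(cube, save_path)


# Arguments are passed via repr(), so they must be simple literals.
target = str(BENCHMARK_DATA / "synthetic.nc")
run_function_elsewhere(_make_synthetic_cube_file, 1000, save_path=target)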
212 changes: 212 additions & 0 deletions benchmarks/benchmarks/generate_data/um_files.py
@@ -0,0 +1,212 @@
# Copyright Iris contributors
#
# This file is part of Iris and is released under the LGPL license.
# See COPYING and COPYING.LESSER in the root of the repository for full
# licensing details.
"""
Generate FF, PP and NetCDF files based on a minimal synthetic FF file.

NOTE: uses the Mule package, so depends on an environment with Mule installed.
"""


def _create_um_files(
    len_x: int, len_y: int, len_z: int, len_t: int, compress, save_paths: dict
) -> None:
    """
    Generate an FF object of given shape and compression, save to FF/PP/NetCDF.

    This is run externally
    (:func:`benchmarks.generate_data.run_function_elsewhere`), so all imports
    are self-contained and input parameters are simple types.
    """
    from copy import deepcopy
    from datetime import datetime
    from tempfile import NamedTemporaryFile

    from mo_pack import compress_wgdos as mo_pack_compress
    from mule import ArrayDataProvider, Field3, FieldsFile
    from mule.pp import fields_to_pp_file
    import numpy as np

    from iris import load_cube
    from iris import save as save_cube

    def packing_patch(*compress_args, **compress_kwargs) -> bytes:
        """
        Force conversion from returned :class:`memoryview` to :class:`bytes`.

        Downstream uses of :func:`mo_pack.compress_wgdos` were written
        for the ``Python2`` behaviour, where the returned buffer had a
        different ``__len__`` value to the current :class:`memoryview`.
        Unable to fix directly in Mule, so monkey patching for now.
        """
        return mo_pack_compress(*compress_args, **compress_kwargs).tobytes()

    import mo_pack

    mo_pack.compress_wgdos = packing_patch

    ########

    template = {
        "fixed_length_header": {"dataset_type": 3, "grid_staggering": 3},
        "integer_constants": {
            "num_p_levels": len_z,
            "num_cols": len_x,
            "num_rows": len_y,
        },
        "real_constants": {},
        "level_dependent_constants": {"dims": (len_z + 1, None)},
    }
    new_ff = FieldsFile.from_template(deepcopy(template))

    data_array = np.arange(len_x * len_y).reshape(len_x, len_y)
    array_provider = ArrayDataProvider(data_array)

    def add_field(level_: int, time_step_: int) -> None:
        """
        Add a minimal field to the new :class:`~mule.FieldsFile`.

        Includes the minimum information to allow Mule saving and Iris
        loading, as well as incrementation for vertical levels and time
        steps to allow generation of z and t dimensions.
        """
        new_field = Field3.empty()
        # To correspond to the header-release 3 class used.
        new_field.lbrel = 3
        # Mule uses the first element of the lookup to test for
        # unpopulated fields (and skips them), so the first element should
        # be set to something. The year will do.
        new_field.raw[1] = datetime.now().year

        # Horizontal.
        new_field.lbcode = 1
        new_field.lbnpt = len_x
        new_field.lbrow = len_y
        new_field.bdx = new_ff.real_constants.col_spacing
        new_field.bdy = new_ff.real_constants.row_spacing
        new_field.bzx = new_ff.real_constants.start_lon - 0.5 * new_field.bdx
        new_field.bzy = new_ff.real_constants.start_lat - 0.5 * new_field.bdy

        # Hemisphere.
        new_field.lbhem = 32
        # Processing.
        new_field.lbproc = 0

        # Vertical.
        # Hybrid height values by simulating sequences similar to those in a
        # theta file.
        new_field.lbvc = 65
        if level_ == 0:
            new_field.lblev = 9999
        else:
            new_field.lblev = level_

        level_1 = level_ + 1
        six_rec = 20 / 3
        three_rec = six_rec / 2

        new_field.blev = level_1 ** 2 * six_rec - six_rec
        new_field.brsvd1 = (
            level_1 ** 2 * six_rec + (six_rec * level_1) - three_rec
        )

        brsvd2_simulated = np.linspace(0.995, 0, len_z)
        shift = min(len_z, 2)
        bhrlev_simulated = np.concatenate(
            [np.ones(shift), brsvd2_simulated[:-shift]]
        )
        new_field.brsvd2 = brsvd2_simulated[level_]
        new_field.bhrlev = bhrlev_simulated[level_]

        # Time.
        new_field.lbtim = 11

        new_field.lbyr = time_step_
        for attr_name in ["lbmon", "lbdat", "lbhr", "lbmin", "lbsec"]:
            setattr(new_field, attr_name, 0)

        new_field.lbyrd = time_step_ + 1
        for attr_name in ["lbmond", "lbdatd", "lbhrd", "lbmind", "lbsecd"]:
            setattr(new_field, attr_name, 0)

        # Data and packing.
        new_field.lbuser1 = 1
        new_field.lbpack = int(compress)
        new_field.bacc = 0
        new_field.bmdi = -1
        new_field.lbext = 0
        new_field.set_data_provider(array_provider)

        new_ff.fields.append(new_field)

    for time_step in range(len_t):
        for level in range(len_z):
            add_field(level, time_step + 1)

    ff_path = save_paths.get("FF", None)
    pp_path = save_paths.get("PP", None)
    nc_path = save_paths.get("NetCDF", None)

    if ff_path:
        new_ff.to_file(ff_path)
    if pp_path:
        fields_to_pp_file(str(pp_path), new_ff.fields)
    if nc_path:
        temp_ff_path = None
        # Need an Iris Cube from the FF content.
        if ff_path:
            # Use the existing file.
            ff_cube = load_cube(ff_path)
        else:
            # Make a temporary file.
            temp_ff_path = NamedTemporaryFile()
            new_ff.to_file(temp_ff_path.name)
            ff_cube = load_cube(temp_ff_path.name)

        save_cube(ff_cube, nc_path, zlib=compress)
        if temp_ff_path:
            temp_ff_path.close()


FILE_EXTENSIONS = {"FF": "", "PP": ".pp", "NetCDF": ".nc"}


def create_um_files(
    len_x: int,
    len_y: int,
    len_z: int,
    len_t: int,
    compress: bool,
    file_types: list,
) -> dict:
    """
    Generate FF-based FF / PP / NetCDF files with specified shape and compression.

    Saved to a directory shared by all files of that shape. A dictionary of
    the saved paths is returned.
    """
    # Self contained imports to avoid linting confusion with _create_um_files().
    from . import BENCHMARK_DATA, REUSE_DATA, run_function_elsewhere

    save_name_sections = ["UM", len_x, len_y, len_z, len_t]
    save_name = "_".join(str(section) for section in save_name_sections)
    save_dir = BENCHMARK_DATA / save_name
    if not save_dir.is_dir():
        save_dir.mkdir(parents=True)

    save_paths = {}
    files_exist = True
    for file_type in file_types:
        file_ext = FILE_EXTENSIONS[file_type]
        save_path = (save_dir / f"{compress}").with_suffix(file_ext)
        files_exist = files_exist and save_path.is_file()
        save_paths[file_type] = str(save_path)

    if not REUSE_DATA or not files_exist:
        _ = run_function_elsewhere(
            _create_um_files, len_x, len_y, len_z, len_t, compress, save_paths
        )

    return save_paths
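
For context (not part of this PR's diff): a sketch of how a loading benchmark could drive create_um_files from an asv setup_cache method, so the synthetic file is generated once (or re-used, per REUSE_DATA) rather than for every repeat. The class name, shape values and import path are illustrative assumptions.

from iris import load_cube

from benchmarks.generate_data.um_files import create_um_files


class FFLoadingExample:
    def setup_cache(self) -> str:
        # 50 columns x 50 rows, 10 levels, 5 time steps, uncompressed FF only.
        paths = create_um_files(50, 50, 10, 5, False, ["FF"])
        return paths["FF"]

    def time_load(self, ff_path: str) -> None:
        # asv passes the setup_cache return value as the first argument.
        _ = load_cube(ff_path)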