From 119e831eac1187e1153869f1f9b52a021e2f6792 Mon Sep 17 00:00:00 2001
From: Ruxandra Valcu
Date: Mon, 2 Oct 2023 15:04:22 +0300
Subject: [PATCH] Merge/add transient noise mask (#82)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* added the feature described in issue 1
* Moved code for the signal attenuation mask to new file
* Resolved merge conflicts by accepting remote versions of files
* Update mask_attenuated_signal.py
* renamed utils
* Added unit test
* Test can now automatically download its data
* Update echopype/utils/mask_transformation.py
Co-authored-by: Raluca Simedroni <92971445+simedroniraluca@users.noreply.github.com>
* Update echopype/utils/mask_transformation.py
Co-authored-by: Raluca Simedroni <92971445+simedroniraluca@users.noreply.github.com>
* Update echopype/utils/mask_transformation.py
Co-authored-by: Raluca Simedroni <92971445+simedroniraluca@users.noreply.github.com>
* Update echopype/utils/mask_transformation.py
Co-authored-by: Raluca Simedroni <92971445+simedroniraluca@users.noreply.github.com>
* Update echopype/utils/mask_transformation.py
Co-authored-by: Raluca Simedroni <92971445+simedroniraluca@users.noreply.github.com>
* Update echopype/utils/mask_transformation.py
Co-authored-by: Raluca Simedroni <92971445+simedroniraluca@users.noreply.github.com>
* Update echopype/utils/mask_transformation.py
Co-authored-by: Raluca Simedroni <92971445+simedroniraluca@users.noreply.github.com>
* Cleaned up mask_transformation
* moved files and rerouted importing paths accordingly (#36)
Co-authored-by: alejandro-ariza
* Feature/add transient noise mask (#35)
* Completed feature #3 transient noise mask
---------
Co-authored-by: Mihai Boldeanu <35964395+mihaiboldeanu@users.noreply.github.com>
Co-authored-by: Raluca Simedroni <92971445+simedroniraluca@users.noreply.github.com>
Co-authored-by: ruxandra valcu
* Added extra tests for the utils/mask_transformation module to increase coverage
* Reverted test_mask.py reformatting
* Moved impulse and transient noise functions to echopype/clean
* Fixed import order
* Fixed missing import in clean/conftest.py
* Moved content of tests/clean/conftest.py to tests/conftest.py
* Refactoring
* Running some linters on the code
* Fix unused imports in conftest
* Update echopype/mask/mask_attenuated_signal.py
Co-authored-by: Raluca Simedroni <92971445+simedroniraluca@users.noreply.github.com>
Update mask_transformation.py
Cleaned up mask_transformation
Delinted
Removing pytest req
Refactoring
Refactoring and cleaning imports
Fixed twod test
Fixing some future merge conflicts
Fix unused import
Added attenuation mask to init
* Added channel selection to signal attenuation mask
* Changed parser in `parse_azfp.py` to maintain consistency with other parsers. (#1135)
* replaced usage of xml.dom.minidom with xml.etree.ElementTree
* modified azfp parser and parsed all parameters
* refactored the parse_azfp code and added new tests
* small tweak in test, rename conversion function to _camel_to_snake
* updated parameter names
* Replace camel-to-snake conversion functionality (#1)
* Create utils camel-to-snake-case function in new misc.py, and use it in ek_raw_parsers
* Replace AZFP camel-to-snake function with new utils/misc.py function
* fixed minor error in code
* minor change in test_convert_azfp.py
* fixed failing tests
* fixed failing tests related to attribute name
* fixed minor bug
---------
Co-authored-by: Wu-Jung Lee
Co-authored-by: Emilio Mayorga
* ci: added support for running individual test files (#1166)
Added support for testing individual test files, in addition to the existing support for testing whole subpackages.
* Delinting
* refactor(convert): refactor and cleanup parsed2zarr (#1070)
* Bump pypa/gh-action-pypi-publish from 1.6.4 to 1.8.1 (#999)
Bumps [pypa/gh-action-pypi-publish](https://github.com/pypa/gh-action-pypi-publish) from 1.6.4 to 1.8.1.
- [Release notes](https://github.com/pypa/gh-action-pypi-publish/releases)
- [Commits](https://github.com/pypa/gh-action-pypi-publish/compare/v1.6.4...v1.8.1)
---
updated-dependencies:
- dependency-name: pypa/gh-action-pypi-publish
  dependency-type: direct:production
  update-type: version-update:semver-minor
...
Signed-off-by: dependabot[bot]
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
* Bump actions/cache from 3.2.5 to 3.3.1 (#982)
Bumps [actions/cache](https://github.com/actions/cache) from 3.2.5 to 3.3.1.
- [Release notes](https://github.com/actions/cache/releases)
- [Changelog](https://github.com/actions/cache/blob/main/RELEASES.md)
- [Commits](https://github.com/actions/cache/compare/v3.2.5...v3.3.1)
---
updated-dependencies:
- dependency-name: actions/cache
  dependency-type: direct:production
  update-type: version-update:semver-minor
...
Signed-off-by: dependabot[bot]
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
* [pre-commit.ci] pre-commit autoupdate (#1022)
updates:
- [github.com/psf/black: 23.1.0 → 23.3.0](https://github.com/psf/black/compare/23.1.0...23.3.0)
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* Bump pypa/gh-action-pypi-publish from 1.8.1 to 1.8.5 (#1021)
Bumps [pypa/gh-action-pypi-publish](https://github.com/pypa/gh-action-pypi-publish) from 1.8.1 to 1.8.5.
- [Release notes](https://github.com/pypa/gh-action-pypi-publish/releases)
- [Commits](https://github.com/pypa/gh-action-pypi-publish/compare/v1.8.1...v1.8.5)
---
updated-dependencies:
- dependency-name: pypa/gh-action-pypi-publish
  dependency-type: direct:production
  update-type: version-update:semver-patch
...
Signed-off-by: dependabot[bot]
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
* Bump actions/setup-python from 4.5.0 to 4.6.0 (#1036)
Bumps [actions/setup-python](https://github.com/actions/setup-python) from 4.5.0 to 4.6.0.
- [Release notes](https://github.com/actions/setup-python/releases)
- [Commits](https://github.com/actions/setup-python/compare/v4.5.0...v4.6.0)
---
updated-dependencies:
- dependency-name: actions/setup-python
  dependency-type: direct:production
  update-type: version-update:semver-minor
...
Signed-off-by: dependabot[bot]
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
* Bump pypa/gh-action-pypi-publish from 1.8.5 to 1.8.6 (#1041)
Bumps [pypa/gh-action-pypi-publish](https://github.com/pypa/gh-action-pypi-publish) from 1.8.5 to 1.8.6.
- [Release notes](https://github.com/pypa/gh-action-pypi-publish/releases)
- [Commits](https://github.com/pypa/gh-action-pypi-publish/compare/v1.8.5...v1.8.6)
---
updated-dependencies:
- dependency-name: pypa/gh-action-pypi-publish
  dependency-type: direct:production
  update-type: version-update:semver-patch
...
Signed-off-by: dependabot[bot]
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
* Bump mamba-org/provision-with-micromamba from 15 to 16 (#1048)
Bumps [mamba-org/provision-with-micromamba](https://github.com/mamba-org/provision-with-micromamba) from 15 to 16.
- [Release notes](https://github.com/mamba-org/provision-with-micromamba/releases)
- [Commits](https://github.com/mamba-org/provision-with-micromamba/compare/v15...v16)
---
updated-dependencies:
- dependency-name: mamba-org/provision-with-micromamba
  dependency-type: direct:production
  update-type: version-update:semver-major
...
Signed-off-by: dependabot[bot]
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
* Bump actions/setup-python from 4.6.0 to 4.6.1 (#1052)
Bumps [actions/setup-python](https://github.com/actions/setup-python) from 4.6.0 to 4.6.1.
- [Release notes](https://github.com/actions/setup-python/releases)
- [Commits](https://github.com/actions/setup-python/compare/v4.6.0...v4.6.1)
---
updated-dependencies:
- dependency-name: actions/setup-python
  dependency-type: direct:production
  update-type: version-update:semver-patch
...
Signed-off-by: dependabot[bot]
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
* Extract out whether_write_to_zarr and remove use_swap arg
* Initial add of destination_path
* Start using destination path and storage options, modify cleanup
* Update docstrings, and add some checks
* Add sonar model attribute to parser object
* Pass sonar_model to parser and fix bug
* Move dataarrays creation to parsed2zarr
* Remove atexit registration
* Add default DEFAULT_ZARR_TEMP_DIR global var
* Initial test for parsed2zarr components
* Add explicit no swap for conversion test
* fix: add parsed2zarr_obj to self to fix bug
* fix: uncomment zarr store close to close the store
* fix: remove unneeded typing
* fix: no need to close fsmap zarr store anymore, removing code
* feat: write tx datagram to zarr during p2z
* feat: add property for ek80 p2z
* feat: set from swap array
* fix: missing transmit data when no swap
* fix: only delete col when it exists in p2z_ek80
* test(echodata): add simple P2Z object to utils mock
* fix: Import 'List' typehint
Added import for 'List', which is currently in use but missing.
* fix: Remove elif for column removal
Changed the if/elif into a for loop with an if check, so that it removes both 'power' and 'angle' columns for RAW4 'tx_datagram_df' data.
* fix: Removed dependency on 'more-itertools'
Removed the dependency on 'more-itertools' by using 'numpy.array_split' instead to evenly split data into the desired chunks.
* test: Set 'no_swap' for 'test_combine_echodata_combined_append'
Assign 'no_swap' to 'destination_path' during 'open_raw' to ensure that everything is in memory, since the new parsed-to-zarr functionality is 'auto' by default.
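For orientation, the `destination_path` behaviour described in the parsed2zarr commits above can be exercised roughly as below. This is a minimal sketch, not code from this PR: it assumes `open_raw` accepts the `destination_path` keyword introduced here with the `"no_swap"` and `"auto"` values mentioned above, and the raw file name is a placeholder.

```python
import echopype as ep

# Placeholder file name; any EK60 .raw file would do.
raw_file = "example_ek60.raw"

# Keep all parsed data in memory (the "no_swap" behaviour discussed above).
ed_no_swap = ep.open_raw(raw_file, sonar_model="EK60", destination_path="no_swap")

# Let echopype decide whether large datagrams should be swapped out to a
# temporary zarr store (the "auto" behaviour discussed above).
ed_auto = ep.open_raw(raw_file, sonar_model="EK60", destination_path="auto")
```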
* feat: Add 'auto' keyword to enable auto 'use_swap'
Changed the 'auto' determination of 'use_swap' so that it is triggered by an explicit 'auto' keyword rather than being the default. Now defaulting back to 'no_swap' for an empty 'destination_path'.
* revert: Removed 'no_swap' in 'test_combine_echodata_combined_append'
On https://github.com/OSOceanAcoustics/echopype/pull/1070/commits/b6b79fae877bdb3d99c7a4129adfca3a75cd1eb7 'no_swap' was set; however, because the default is now 'no_swap', this shouldn't be needed!
* fix: Use convention yaml for 'backscatter_x'
Sets 'long_name' attributes for 'backscatter_r' and 'backscatter_i' from the convention yaml for p2z outputs. Additionally, units changed from 'V' to 'dB' to sync up with the "no_swap" counterpart. Old tests for 'test_direct_to_zarr_integration' have been reactivated to ensure equivalence between the two methods.
* test: Added P2Z Arrays tests
Added testing for the underlying methods that get called during a 'datagram_to_zarr' call, to ensure that zarr arrays actually get created.
Ref: https://github.com/OSOceanAcoustics/echopype/issues/777
---------
Signed-off-by: dependabot[bot]
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* fix(commongrid): improve 'compute_MVBS' using flox [all tests ci] (#1124)
* chore(deps): add flox dependency >=0.7.2
* fix(commongrid): fixes 'compute_MVBS' so it can work better and scale
Under the hood, binning along ping time and echo range now uses flox. This allows for scalability and relies on community-maintained code.
* docs: add small code comment
* refactor: change how ping_time index is retrieved
* refactor: remove for loop for channel
* test(mvbs): add mock Sv datasets and tests for dims (#2)
Note that @leewujung also changed mean to nanmean for skipping NaNs in each bin.
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix: change dask to numpy
Changed the use of dask for log10 to numpy instead, since numpy can also handle dask array inputs properly.
* feat: Add method argument
Added a 'method' argument to 'get_MVBS_along_channels' and also exposed additional keyword-argument control for flox.
* fix(commongrid): Fixed to include lat lon
Fixed the 'compute_MVBS' function to now include latitude and longitude if those variables exist in the Sv dataset. Additionally, the flox method and keyword arguments are now exposed within the 'compute_MVBS' function.
Ref: Issue #1002
* refactor: Set defaults to recommended
After some investigation, @lsetiawan concluded that at this time the method 'map-reduce', engine 'numpy', and reindex True work the best, so these are now set as the defaults. Also, the echo range maximum is now obtained through direct data slicing rather than computation.
* feat(commongrid): Add 'range_var' argument to 'compute_MVBS'
Added a new argument 'range_var' so that the user can set the range variable to perform binning with. There are 2 options, 'echo_range' and 'depth':
- 'echo_range': When this is set, the variable 'water_level' is now included in the resulting MVBS dataset
- 'depth': A check is in place to ensure that this variable exists before moving forward and using it to perform range binning.
Ref: Issue #1002
* fix: Add missing attributes for lat lon
* test: Update test to use random generator
* fix: Add case for no 'water_level'
Added a case for a dataset that doesn't have the water level variable.
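The flox-based binning described in the 'compute_MVBS' commits above looks roughly like the sketch below. It is not echopype's actual implementation: the synthetic dataset and variable names are invented, only the echo-range dimension is binned here (echopype bins ping time the same way), and the flox arguments simply mirror the defaults mentioned above ('map-reduce', 'numpy', reindex True).

```python
import numpy as np
import pandas as pd
import xarray as xr
from flox.xarray import xarray_reduce

# Tiny synthetic stand-in for a calibrated Sv dataset (ping_time x range_sample).
rng = np.random.default_rng(0)
n_ping, n_range = 10, 40
ds_Sv = xr.Dataset(
    {
        "Sv": (("ping_time", "range_sample"), rng.uniform(-90, -30, (n_ping, n_range))),
        "echo_range": (
            ("ping_time", "range_sample"),
            np.tile(np.linspace(0, 195, n_range), (n_ping, 1)),
        ),
    },
    coords={
        "ping_time": pd.date_range("2023-10-02", periods=n_ping, freq="s"),
        "range_sample": np.arange(n_range),
    },
)

sv_lin = 10 ** (ds_Sv["Sv"] / 10)  # log2lin

# 20 m echo_range bins expressed as a pandas IntervalIndex
range_bins = pd.IntervalIndex.from_breaks(np.arange(0.0, 220.0, 20.0))

mvbs_lin = xarray_reduce(
    sv_lin,
    ds_Sv["echo_range"],
    func="nanmean",              # NaN-skipping mean within each bin, as noted above
    expected_groups=range_bins,  # IntervalIndex is treated as bin edges
    method="map-reduce",
    engine="numpy",
    reindex=True,
)
mvbs_db = 10 * np.log10(mvbs_lin)  # lin2log back to dB
```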
* test(nasc): Remove 'compute_NASC' import to avoid failure
* fix: Removed assumption on echo range max
Reverted the echo range max back to being computed on the fly, since there may be some NaN values.
* test: Extract api test and add markings
Extracted fixtures to conftest.py for commongrid. Additionally, cleaned up unused functions and marked tests as either unit or integration. Added a new test module called 'test_api.py' for 'commongrid.api'.
* test: Add latlon test for 'compute_MVBS'
Added a test for an Sv dataset that contains latitude and longitude going through 'compute_MVBS', to ensure that those variables get propagated through.
Ref: #1002
* test: Add small get_MVBS_along_channels test
Added a test for 'get_MVBS_along_channels' with either 'depth' as the 'range_var' or with 'has_positions' checked as True or False.
* refactor: Integrate suggested changes
Integrated suggested changes from review, such as additional comments in code, fixing some variable names, and extracting out the lin2log and log2lin functions. Additionally, echopype now imports the pint library to start having unit checks on the input for compute_MVBS.
* test: Added check for position values
* test: Update range_meter_bin to strings
* test: Added 'compute_MVBS' values test
* Update echopype/tests/utils/test_processinglevels_integration.py
compute_MVBS should now preserve the processing level attributes, so test for presence rather than absence.
* test: Add 'nan' sprinkles
Sprinkled 'nan' values all over 'echo_range' to ensure that values computed by 'compute_MVBS' don't take the 'nan' into account. Added a check for the expected distribution of 'nan' in the resulting array.
Ref: https://github.com/OSOceanAcoustics/echopype/pull/1124#issuecomment-1703643268
* revert: Revert the use of 'pint'
Removed the dependency on 'pint' and used a simple regex to ensure that the 'range_bin' input is in units of 'm'. Renamed the 'range_meter_bin' argument to 'range_bin'.
Ref: https://github.com/OSOceanAcoustics/echopype/pull/1124#issuecomment-1711629755
* feat: Allow 'range_bin' to have space
* fix: Apply suggestions from code review
Applied a fix by @emiliom for the regex not capturing decimal values.
Ref: https://github.com/OSOceanAcoustics/echopype/pull/1124/files#r1320422121
Co-authored-by: Emilio Mayorga
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* test: Fix test text for wrong unit
* test: Remove the 'e.g.' part on pytest
Removed the part with '(e.g., '10m')' since it was messing up pytest's regex matching.
* test: Remove remnant for test_ek.py
* refactor: Extract range_bin parsing and add close arg
Extracts the 'range_bin' string-to-float parsing into a private function. Additionally, there is now a fine-tuning argument for bin close edges so the user can specify whether bins are closed on the 'left' or 'right'. Bins are converted to a pandas interval index before being passed into 'get_MVBS_along_channels'.
* refactor: Update arg types to include interval index
Added argument type options for 'range_interval' and 'ping_interval' to also be an interval index.
* test: Update tests to have brute force creation
Changed the mock mvbs to be created by brute force rather than hard-coded.
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* test: Fix brute force mvbs gen
Fixes the generation of the expected mvbs with brute force, as well as tweaks to the mvbs_along_channel test.
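Taken together, the commits above change the user-facing 'compute_MVBS' call roughly as in the hedged usage sketch below. It is not the definitive signature: the argument names follow the text above ('range_var', the string-valued 'range_bin' that accepts decimals and a space before the unit, 'ping_time_bin'), but defaults and any extra flox keyword arguments may differ, and 'ds_Sv' is assumed to come from an earlier calibration step.

```python
import echopype as ep

# ds_Sv: a calibrated Sv dataset, e.g. from ep.calibrate.compute_Sv(echodata)

# Bin echo_range into 20 m cells and ping_time into 20 s cells.
ds_MVBS = ep.commongrid.compute_MVBS(ds_Sv, range_bin="20m", ping_time_bin="20S")

# Decimal bin sizes and a space before the unit are accepted per the commits
# above; "depth" can be used as range_var when that variable exists in ds_Sv.
ds_MVBS_depth = ep.commongrid.compute_MVBS(
    ds_Sv, range_var="depth", range_bin="0.5 m", ping_time_bin="5S"
)
```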
* chore: Clean up old code for doing compute MVBS
Removes old code that performs compute_MVBS, since we have now switched over to flox.
* chore(pytest): Added custom markers 'unit' and 'integration'
* docs: Update docstring for `compute_MVBS`
Added options for literal arguments.
* refactor: Change 'parse_range_bin' to 'parse_x_bin'
Made bin parsing more general by renaming it to 'parse_x_bin'.
* chore: Update suggested changes
Updated some text from the suggested review as discussed.
Ref: https://github.com/OSOceanAcoustics/echopype/pull/1124#pullrequestreview-1636819555
---------
Co-authored-by: Wu-Jung Lee
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Emilio Mayorga
* ci: Fix run-test.py for module runs (#1180)
Fixes the 'run-test.py' script so it can run module-based testing rather than only specific test files.
Ref: https://github.com/OSOceanAcoustics/echopype/pull/1166
* Impulse noise: modified ryan_iterable to use ryan directly
* Impulse noise: renamed medianf
* Consolidating functions
* Impulse noise refactoring: using a parameter dictionary and a function map
* Frequency filtering
* Frequency filtering (temp add to transient noise and signal attenuation, will refactor)
* Signal attenuation refactor
* Transmission noise refactor
* Impulse noise default params
* Multichannel noise masks
* Made the noise masks able to handle taking a file path as an input
* Fixed import
* Fix
* Update conftest.py
Fixed an extra parenthesis on the pytest fixture decorator.
* Update test_noise.py
Fixed the missing parenthesis for the tests.
---------
Signed-off-by: dependabot[bot]
Co-authored-by: Mihai Boldeanu
Co-authored-by: Mihai Boldeanu <35964395+mihaiboldeanu@users.noreply.github.com>
Co-authored-by: Andrei Rusu
Co-authored-by: Raluca Simedroni <92971445+simedroniraluca@users.noreply.github.com>
Co-authored-by: Andrei Rusu
Co-authored-by: alejandro-ariza
Co-authored-by: Praneeth Ratna <63547155+praneethratna@users.noreply.github.com>
Co-authored-by: Wu-Jung Lee
Co-authored-by: Emilio Mayorga
Co-authored-by: Don Setiawan
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 .ci_helpers/run-test.py | 25 +- docs/source/contributing.rst | 9 +- echopype/calibrate/api.py | 4 +- echopype/calibrate/cal_params.py | 4 +- echopype/calibrate/calibrate_azfp.py | 2 +- echopype/calibrate/range.py | 2 +- echopype/clean/__init__.py | 11 +- echopype/clean/api.py | 263 +++--- echopype/clean/impulse_noise.py | 376 +++----- echopype/clean/signal_attenuation.py | 85 +- echopype/clean/transient_noise.py | 131 ++- echopype/commongrid/api.py | 216 ++++- echopype/commongrid/mvbs.py | 500 ++-------- echopype/consolidate/api.py | 4 +- echopype/convert/api.py | 91 +- echopype/convert/parse_ad2cp.py | 4 +- echopype/convert/parse_azfp.py | 125 +-- echopype/convert/parse_base.py | 23 +- echopype/convert/parse_ek60.py | 4 +- echopype/convert/parse_ek80.py | 4 +- echopype/convert/parsed_to_zarr.py | 313 +++++- echopype/convert/parsed_to_zarr_ek60.py | 4 +- echopype/convert/parsed_to_zarr_ek80.py | 128 ++- echopype/convert/set_groups_azfp.py | 50 +- echopype/convert/set_groups_base.py | 231 +---- echopype/convert/set_groups_ek60.py | 5 +- echopype/convert/set_groups_ek80.py | 36 +- echopype/convert/utils/ek.py | 89 ++ echopype/convert/utils/ek_raw_parsers.py | 21 +- echopype/echodata/echodata.py | 20 +- echopype/mask/__init__.py | 1 +
.../tests/clean/test_clean_impulse_noise.py | 41 +- .../tests/clean/test_clean_transient_noise.py | 31 +- echopype/tests/clean/test_noise.py | 13 +- .../tests/clean/test_signal_attenuation.py | 22 +- echopype/tests/commongrid/conftest.py | 519 ++++++++++ echopype/tests/commongrid/test_api.py | 289 ++++++ echopype/tests/commongrid/test_mvbs.py | 888 ++---------------- echopype/tests/conftest.py | 5 +- echopype/tests/convert/test_convert_azfp.py | 52 + .../test_convert_source_target_locs.py | 4 +- echopype/tests/convert/test_parsed_to_zarr.py | 330 ++++++- echopype/tests/echodata/utils.py | 5 + .../test_processinglevels_integration.py | 6 +- echopype/utils/compute.py | 41 + echopype/utils/io.py | 18 + echopype/utils/misc.py | 25 + pytest.ini | 4 +- requirements-dev.txt | 1 + requirements.txt | 3 +- 50 files changed, 2795 insertions(+), 2283 deletions(-) create mode 100644 echopype/convert/utils/ek.py create mode 100644 echopype/tests/commongrid/conftest.py create mode 100644 echopype/tests/commongrid/test_api.py create mode 100644 echopype/utils/compute.py create mode 100644 echopype/utils/misc.py diff --git a/.ci_helpers/run-test.py b/.ci_helpers/run-test.py index bd8c0435b..87dd82904 100644 --- a/.ci_helpers/run-test.py +++ b/.ci_helpers/run-test.py @@ -118,15 +118,24 @@ for k, v in test_to_run.items(): print(f"=== RUNNING {k.upper()} TESTS===") print(f"Touched files: {','.join([os.path.basename(p) for p in v])}") - if k == "root": - file_glob_str = "echopype/tests/test_*.py" - cov_mod_arg = ["--cov=echopype"] + # Run specific test files + # The input files must all starts with "test" in their name + # otherwise module globbing for specific test files will + # be used + if all(f.name.startswith("test") for f in v): + # For specific test files + test_files = [str(p) for p in v] else: - file_glob_str = f"echopype/tests/{k}/*.py" - cov_mod_arg = [f"--cov=echopype/{k}"] - if args.include_cov: - pytest_args = original_pytest_args + cov_mod_arg - test_files = glob.glob(file_glob_str) + # Run all tests in a module + if k == "root": + file_glob_str = "echopype/tests/test_*.py" + cov_mod_arg = ["--cov=echopype"] + else: + file_glob_str = f"echopype/tests/{k}/*.py" + cov_mod_arg = [f"--cov=echopype/{k}"] + if args.include_cov: + pytest_args = original_pytest_args + cov_mod_arg + test_files = glob.glob(file_glob_str) final_args = pytest_args + test_files print(f"Pytest args: {final_args}") exit_code = pytest.main(final_args) diff --git a/docs/source/contributing.rst b/docs/source/contributing.rst index b4ac505de..e8f6f8f94 100644 --- a/docs/source/contributing.rst +++ b/docs/source/contributing.rst @@ -145,13 +145,20 @@ the latter via `minio `_. will execute all tests. The entire test suite can be a bit slow, taking up to 40 minutes or more. If your changes impact only some of the subpackages (``convert``, ``calibrate``, ``preprocess``, etc), you can run ``run-test.py`` with only a subset of tests by passing -as an argument a comma-separated list of the modules that have changed. For example: +as an argument a comma-separated list of the modules that have changed or also run only particular test +files by passing a comma-separated list of test files that you want to run. For example: .. code-block:: bash python .ci_helpers/run-test.py --local --pytest-args="-vv" echopype/calibrate/calibrate_ek.py,echopype/preprocess/noise_est.py will run only tests associated with the ``calibrate`` and ``preprocess`` subpackages. + +.. 
code-block:: bash + + python .ci_helpers/run-test.py --local --pytest-args="-vv" echopype/tests/convert/test_convert_azfp.py,echopype/tests/clean/test_noise.py + +will run only the tests in the ``test_convert_azfp.py`` and ``test_noise.py`` files. For ``run-test.py`` usage information, use the ``-h`` argument: ``python .ci_helpers/run-test.py -h`` diff --git a/echopype/calibrate/api.py b/echopype/calibrate/api.py index 01fa7630a..c2c34c137 100644 --- a/echopype/calibrate/api.py +++ b/echopype/calibrate/api.py @@ -153,7 +153,7 @@ def compute_Sv(echodata: EchoData, **kwargs) -> xr.Dataset: - for EK60 echosounder, allowed parameters include: `"sa_correction"`, `"gain_correction"`, `"equivalent_beam_angle"` - for AZFP echosounder, allowed parameters include: - `"EL"`, `"DS"`, `"TVR"`, `"VTX"`, `"equivalent_beam_angle"`, `"Sv_offset"` + `"EL"`, `"DS"`, `"TVR"`, `"VTX0"`, `"equivalent_beam_angle"`, `"Sv_offset"` Passing in calibration parameters for other echosounders are not currently supported. @@ -242,7 +242,7 @@ def compute_TS(echodata: EchoData, **kwargs): - for EK60 echosounder, allowed parameters include: `"sa_correction"`, `"gain_correction"`, `"equivalent_beam_angle"` - for AZFP echosounder, allowed parameters include: - `"EL"`, `"DS"`, `"TVR"`, `"VTX"`, `"equivalent_beam_angle"`, `"Sv_offset"` + `"EL"`, `"DS"`, `"TVR"`, `"VTX0"`, `"equivalent_beam_angle"`, `"Sv_offset"` Passing in calibration parameters for other echosounders are not currently supported. diff --git a/echopype/calibrate/cal_params.py b/echopype/calibrate/cal_params.py index 0c861c9e9..5d020cc9b 100644 --- a/echopype/calibrate/cal_params.py +++ b/echopype/calibrate/cal_params.py @@ -29,7 +29,7 @@ "impedance_transceiver", # z_er "receiver_sampling_frequency", ), - "AZFP": ("EL", "DS", "TVR", "VTX", "equivalent_beam_angle", "Sv_offset"), + "AZFP": ("EL", "DS", "TVR", "VTX0", "equivalent_beam_angle", "Sv_offset"), } EK80_DEFAULT_PARAMS = { @@ -352,7 +352,7 @@ def get_cal_params_AZFP(beam: xr.DataArray, vend: xr.DataArray, user_dict: dict) out_dict[p] = beam[p] # has only channel dim # Params from Vendor_specific group - elif p in ["EL", "DS", "TVR", "VTX", "Sv_offset"]: + elif p in ["EL", "DS", "TVR", "VTX0", "Sv_offset"]: out_dict[p] = vend[p] # these params only have the channel dimension return out_dict diff --git a/echopype/calibrate/calibrate_azfp.py b/echopype/calibrate/calibrate_azfp.py index bc2f848f0..1299be8bb 100644 --- a/echopype/calibrate/calibrate_azfp.py +++ b/echopype/calibrate/calibrate_azfp.py @@ -63,7 +63,7 @@ def _cal_power_samples(self, cal_type, **kwargs): # TODO: take care of dividing by zero encountered in log10 spreading_loss = 20 * np.log10(self.range_meter) absorption_loss = 2 * self.env_params["sound_absorption"] * self.range_meter - SL = self.cal_params["TVR"] + 20 * np.log10(self.cal_params["VTX"]) # eq.(2) + SL = self.cal_params["TVR"] + 20 * np.log10(self.cal_params["VTX0"]) # eq.(2) # scaling factor (slope) in Fig.G-1, units Volts/dB], see p.84 a = self.cal_params["DS"] diff --git a/echopype/calibrate/range.py b/echopype/calibrate/range.py index aa93a6061..b3c049c5f 100644 --- a/echopype/calibrate/range.py +++ b/echopype/calibrate/range.py @@ -60,7 +60,7 @@ def compute_range_AZFP(echodata: EchoData, env_params: Dict, cal_type: str) -> x # Notation below follows p.86 of user manual N = vend["number_of_samples_per_average_bin"] # samples per bin f = vend["digitization_rate"] # digitization rate - L = vend["lockout_index"] # number of lockout samples + L = vend["lock_out_index"] # number of 
lockout samples # keep this in ref of AZFP matlab code, # set to 1 since we want to calculate from raw data diff --git a/echopype/clean/__init__.py b/echopype/clean/__init__.py index b6f9caec3..3103f8637 100644 --- a/echopype/clean/__init__.py +++ b/echopype/clean/__init__.py @@ -1,10 +1,15 @@ -from .api import estimate_noise, remove_noise -from .api import get_impulse_noise_mask, get_transient_noise_mask, get_attenuation_mask +from .api import ( + estimate_noise, + get_attenuation_mask, + get_impulse_noise_mask, + get_transient_noise_mask, + remove_noise, +) __all__ = [ "estimate_noise", "remove_noise", "get_impulse_noise_mask", "get_transient_noise_mask", - "get_attenuation_mask" + "get_attenuation_mask", ] diff --git a/echopype/clean/api.py b/echopype/clean/api.py index 541e08499..99fbd25a8 100644 --- a/echopype/clean/api.py +++ b/echopype/clean/api.py @@ -2,19 +2,15 @@ Functions for reducing variabilities in backscatter data. """ import pathlib -from typing import List, Optional, Tuple, Union +from typing import Union -import numpy as np import xarray as xr from pandas import Index +from ..utils.io import get_dataset +from ..utils.misc import frequency_nominal_to_channel from ..utils.prov import add_processing_level, echopype_prov_attrs, insert_input_processing_level -from . import signal_attenuation, transient_noise -from .impulse_noise import ( - _find_impulse_mask_ryan, - _find_impulse_mask_ryan_iterable, - _find_impulse_mask_wang, -) +from . import impulse_noise, signal_attenuation, transient_noise from .noise_est import NoiseEst @@ -90,9 +86,10 @@ def remove_noise(ds_Sv, ping_num, range_sample_num, noise_max=None, SNR_threshol def get_transient_noise_mask( source_Sv: Union[xr.Dataset, str, pathlib.Path], - desired_channel: str, - mask_type: str = "ryan", - **kwargs, + parameters: dict, + desired_channel: str = None, + desired_frequency: int = None, + method: str = "ryan", ) -> xr.DataArray: """ Create a mask based on the identified signal attenuations of Sv values at 38KHz. @@ -108,6 +105,10 @@ def get_transient_noise_mask( else it specifies the path to a zarr or netcdf file containing a Dataset. This input must correspond to a Dataset that has the coordinate ``channel`` and variables ``frequency_nominal`` and ``Sv``. + desired_channel: str + Name of the desired frequency channel. 
+ desired_frequency: int + Desired frequency, in case the channel is not directly specified mask_type: str with either "ryan" or "fielding" based on the preferred method for signal attenuation mask generation Returns @@ -122,43 +123,28 @@ def get_transient_noise_mask( If neither ``ryan`` or ``fielding`` are given """ - assert mask_type in ["ryan", "fielding"], "mask_type must be either 'ryan' or 'fielding'" - selected_channel_Sv = source_Sv.sel(channel=desired_channel) - Sv = selected_channel_Sv["Sv"].values - r = source_Sv["echo_range"].values[0, 0] - if mask_type == "ryan": - # Define a list of the keyword arguments your function can handle - valid_args = {"m", "n", "thr", "excludeabove", "operation"} - # Use dictionary comprehension to filter out any kwargs not in your list - filtered_kwargs = {k: v for k, v in kwargs.items() if k in valid_args} - mask = transient_noise._ryan(Sv, r, m=5, **filtered_kwargs) - elif mask_type == "fielding": - # Define a list of the keyword arguments your function can handle - valid_args = {"r0", "r1", "roff", "n", "thr"} - # Use dictionary comprehension to filter out any kwargs not in your list - filtered_kwargs = {k: v for k, v in kwargs.items() if k in valid_args} - mask = transient_noise._fielding(Sv, r, **filtered_kwargs) - else: - raise ValueError("The provided mask_type must be ryan or fielding!") - - mask = np.logical_not(mask) - return_mask = xr.DataArray( - mask, - dims=("ping_time", "range_sample"), - coords={"ping_time": source_Sv.ping_time, "range_sample": source_Sv.range_sample}, - ) - return return_mask + source_Sv = get_dataset(source_Sv) + mask_map = { + "ryan": transient_noise._ryan, + "fielding": transient_noise._fielding, + } + if method not in mask_map.keys(): + raise ValueError(f"Unsupported method: {method}") + if desired_channel is None: + if desired_frequency is None: + raise ValueError("Must specify either desired channel or desired frequency") + else: + desired_channel = frequency_nominal_to_channel(source_Sv, desired_frequency) + + mask = mask_map[method](source_Sv, desired_channel, parameters) + return mask def get_impulse_noise_mask( - source_Sv: xr.Dataset, - desired_channel: str, - thr: Union[Tuple[float, float], int, float], - m: Optional[Union[int, float]] = None, - n: Optional[Union[int, Tuple[int, int]]] = None, - erode: Optional[List[Tuple[int, int]]] = None, - dilate: Optional[List[Tuple[int, int]]] = None, - median: Optional[List[Tuple[int, int]]] = None, + source_Sv: Union[xr.Dataset, str, pathlib.Path], + parameters: {}, + desired_channel: str = None, + desired_frequency: int = None, method: str = "ryan", ) -> xr.DataArray: """ @@ -166,28 +152,41 @@ def get_impulse_noise_mask( Parameters ---------- - source_Sv: xr.Dataset - Dataset containing the Sv data to create a mask + source_Sv: xr.Dataset or str or pathlib.Path + If a Dataset this value contains the Sv data to create a mask for, + else it specifies the path to a zarr or netcdf file containing + a Dataset. This input must correspond to a Dataset that has the + coordinate ``channel`` and variables ``frequency_nominal`` and ``Sv``. desired_channel: str Name of the desired frequency channel. - thr: float or tuple - User-defined threshold value (dB) (ryan and ryan iterable) o - r a 2-element tuple specifying the range of threshold values (wang). - m: int or float, optional - Vertical binning length (in number of samples or range) (ryan and ryan iterable). - Defaults to None. 
- n: int or tuple, optional - Number of pings either side for comparisons (ryan), - or a 2-element tuple specifying the range (ryan iterable). - Defaults to None. - erode: List of 2-element tuples, optional - List indicating the window's size for each erosion cycle (wang). Defaults to None. - dilate: List of 2-element tuples, optional - List indicating the window's size for each dilation cycle (wang). Defaults to None. - median: List of 2-element tuples, optional - List indicating the window's size for each median filter cycle (wang). Defaults to None. + desired_frequency: int + Desired frequency, in case the channel is not directly specified + parameters: {} + Parameter dictionary containing function-specific arguments. + Can contain the following: + thr: Union[Tuple[float, float], int, float] + User-defined threshold value (dB) (ryan and ryan iterable) + or a 2-element tuple with the range of threshold values (wang). + m: Optional[Union[int, float]] = None, + Vertical binning length (in number of samples or range) + (ryan and ryan iterable). + Defaults to None. + n: Optional[Union[int, Tuple[int, int]]] = None, + Number of pings either side for comparisons (ryan), + or a 2-element tuple specifying the range (ryan iterable). + Defaults to None. + erode: Optional[List[Tuple[int, int]]] = None, + Window size for each erosion cycle (wang). + Defaults to None. + dilate: Optional[List[Tuple[int, int]]] = None, + Window size for each dilation cycle (wang). + Defaults to None. + median: Optional[List[Tuple[int, int]]] = None, + Window size for each median filter cycle (wang). + Defaults to None. method: str, optional - The method (ryan, ryan iterable or wang) used to mask impulse noise. Defaults to 'ryan'. + The method (ryan, ryan_iterable or wang) used to mask impulse noise. + Defaults to 'ryan'. Returns ------- @@ -195,34 +194,33 @@ def get_impulse_noise_mask( A DataArray consisting of a mask for the Sv data, wherein True values signify samples that are free of noise. """ - - # Our goal is to have a mask where True represents samples that are NOT impulse noise. + # Our goal is to have a mask True on samples that are NOT impulse noise. # So, we negate the obtained mask. 
- - if method == "ryan": - impulse_mask_ryan = _find_impulse_mask_ryan(source_Sv, desired_channel, m, n, thr) - noise_free_mask = ~impulse_mask_ryan - elif method == "ryan_iterable": - impulse_mask_ryan_iterable = _find_impulse_mask_ryan_iterable( - source_Sv, desired_channel, m, n, thr - ) - noise_free_mask = ~impulse_mask_ryan_iterable - elif method == "wang": - impulse_mask_wang = _find_impulse_mask_wang( - source_Sv, desired_channel, thr, erode, dilate, median - ) - noise_free_mask = ~impulse_mask_wang - else: + source_Sv = get_dataset(source_Sv) + mask_map = { + "ryan": impulse_noise._ryan, + "ryan_iterable": impulse_noise._ryan_iterable, + "wang": impulse_noise._wang, + } + if method not in mask_map.keys(): raise ValueError(f"Unsupported method: {method}") + if desired_channel is None: + if desired_frequency is None: + raise ValueError("Must specify either desired channel or desired frequency") + else: + desired_channel = frequency_nominal_to_channel(source_Sv, desired_frequency) + impulse_mask = mask_map[method](source_Sv, desired_channel, parameters) + noise_free_mask = ~impulse_mask return noise_free_mask def get_attenuation_mask( source_Sv: Union[xr.Dataset, str, pathlib.Path], - desired_channel: str, - mask_type: str = "ryan", - **kwargs, + parameters: dict, + desired_channel: str = None, + desired_frequency: int = None, + method: str = "ryan", ) -> xr.DataArray: """ Create a mask based on the identified signal attenuations of Sv values at 38KHz. @@ -242,7 +240,12 @@ def get_attenuation_mask( else it specifies the path to a zarr or netcdf file containing a Dataset. This input must correspond to a Dataset that has the coordinate ``channel`` and variables ``frequency_nominal`` and ``Sv``. - desired_channel: the Dataset channel to be used for identifying the signal attenuation. + parameters: dict + Dictionary of parameters to pass to the relevant subfunctions. + desired_channel: str + Name of the desired frequency channel. 
+ desired_frequency: int + Desired frequency, in case the channel is not directly specified mask_type: str with either "ryan" or "ariza" based on the preferred method for signal attenuation mask generation Returns @@ -254,7 +257,7 @@ def get_attenuation_mask( Raises ------ ValueError - If neither ``ryan`` or ``azira`` are given + If neither ``ryan`` or ``ariza`` are given Notes ----- @@ -264,31 +267,21 @@ def get_attenuation_mask( -------- """ - assert mask_type in ["ryan", "ariza"], "mask_type must be either 'ryan' or 'ariza'" - selected_channel_Sv = source_Sv.sel(channel=desired_channel) - Sv = selected_channel_Sv["Sv"].values - r = source_Sv["echo_range"].values[0, 0] - if mask_type == "ryan": - # Define a list of the keyword arguments your function can handle - valid_args = {"r0", "r1", "n", "thr", "start"} - # Use dictionary comprehension to filter out any kwargs not in your list - filtered_kwargs = {k: v for k, v in kwargs.items() if k in valid_args} - mask = signal_attenuation._ryan(Sv, r, **filtered_kwargs) - elif mask_type == "ariza": - # Define a list of the keyword arguments your function can handle - valid_args = {"offset", "thr", "m", "n"} - # Use dictionary comprehension to filter out any kwargs not in your list - filtered_kwargs = {k: v for k, v in kwargs.items() if k in valid_args} - mask = signal_attenuation._ariza(Sv, r, **filtered_kwargs) - else: - raise ValueError("The provided mask_type must be ryan or ariza!") - - return_mask = xr.DataArray( - mask, - dims=("ping_time", "range_sample"), - coords={"ping_time": source_Sv.ping_time, "range_sample": source_Sv.range_sample}, - ) - return return_mask + source_Sv = get_dataset(source_Sv) + mask_map = { + "ryan": signal_attenuation._ryan, + "ariza": signal_attenuation._ariza, + } + if method not in mask_map.keys(): + raise ValueError(f"Unsupported method: {method}") + if desired_channel is None: + if desired_frequency is None: + raise ValueError("Must specify either desired channel or desired frequency") + else: + desired_channel = frequency_nominal_to_channel(source_Sv, desired_frequency) + + mask = mask_map[method](source_Sv, desired_channel, parameters) + return mask def create_multichannel_mask(masks: [xr.Dataset], channels: [str]) -> xr.Dataset: @@ -319,8 +312,8 @@ def create_multichannel_mask(masks: [xr.Dataset], channels: [str]) -> xr.Dataset def get_transient_noise_mask_multichannel( source_Sv: Union[xr.Dataset, str, pathlib.Path], - mask_type: str = "ryan", - **kwargs, + parameters: dict, + method: str = "ryan", ) -> xr.DataArray: """ Create a mask based on the identified signal attenuations of Sv values at 38KHz. @@ -336,8 +329,10 @@ def get_transient_noise_mask_multichannel( else it specifies the path to a zarr or netcdf file containing a Dataset. This input must correspond to a Dataset that has the coordinate ``channel`` and variables ``frequency_nominal`` and ``Sv``. 
- mask_type: str with either "ryan" or "fielding" based on + method: str with either "ryan" or "fielding" based on the preferred method for signal attenuation mask generation + parameters: dict + Default method parameters Returns ------- xr.DataArray @@ -351,10 +346,13 @@ def get_transient_noise_mask_multichannel( If neither ``ryan`` or ``fielding`` are given """ + source_Sv = get_dataset(source_Sv) channel_list = source_Sv["channel"].values mask_list = [] for channel in channel_list: - mask = get_transient_noise_mask(source_Sv, channel, mask_type, **kwargs) + mask = get_transient_noise_mask( + source_Sv, parameters=parameters, desired_channel=channel, method=method + ) mask_list.append(mask) mask = create_multichannel_mask(mask_list, channel_list) return mask @@ -362,9 +360,8 @@ def get_transient_noise_mask_multichannel( def get_impulse_noise_mask_multichannel( source_Sv: xr.Dataset, + parameters: dict, method: str = "ryan", - thr: Union[Tuple[float, float], int, float] = 10, - **kwargs, ) -> xr.DataArray: """ Algorithms for masking Impulse noise. @@ -375,9 +372,8 @@ def get_impulse_noise_mask_multichannel( Dataset containing the Sv data to create a mask method: str, optional The method (ryan, ryan iterable or wang) used to mask impulse noise. Defaults to 'ryan'. - thr: float or tuple - User-defined threshold value (dB) (ryan and ryan iterable) o - r a 2-element tuple specifying the range of threshold values (wang) + parameters: dict + Default method parameters Returns ------- @@ -385,12 +381,15 @@ def get_impulse_noise_mask_multichannel( A multichannel DataArray consisting of a mask for the Sv data, wherein True values signify samples that are free of noise. """ + source_Sv = get_dataset(source_Sv) channel_list = source_Sv["channel"].values mask_list = [] - print(kwargs) for channel in channel_list: mask = get_impulse_noise_mask( - source_Sv, desired_channel=channel, method=method, thr=thr, **kwargs + source_Sv, + parameters=parameters, + desired_channel=channel, + method=method, ) mask_list.append(mask) mask = create_multichannel_mask(mask_list, channel_list) @@ -399,8 +398,8 @@ def get_impulse_noise_mask_multichannel( def get_attenuation_mask_multichannel( source_Sv: Union[xr.Dataset, str, pathlib.Path], - mask_type: str = "ryan", - **kwargs, + parameters: dict, + method: str = "ryan", ) -> xr.DataArray: """ Create a mask based on the identified signal attenuations of Sv values at 38KHz. @@ -420,8 +419,11 @@ def get_attenuation_mask_multichannel( else it specifies the path to a zarr or netcdf file containing a Dataset. This input must correspond to a Dataset that has the coordinate ``channel`` and variables ``frequency_nominal`` and ``Sv``. 
- mask_type: str with either "ryan" or "ariza" based on the + method: str with either "ryan" or "ariza" based on the preferred method for signal attenuation mask generation + parameters: dict + Default method parameters + Returns ------- xr.DataArray @@ -432,12 +434,19 @@ def get_attenuation_mask_multichannel( Raises ------ ValueError - If neither ``ryan`` or ``azira`` are given + If neither ``ryan`` or ``ariza`` are given """ + source_Sv = get_dataset(source_Sv) channel_list = source_Sv["channel"].values mask_list = [] for channel in channel_list: - mask = get_attenuation_mask(source_Sv, channel, mask_type, **kwargs) + mask = get_attenuation_mask( + source_Sv, + parameters=parameters, + desired_channel=channel, + method=method, + ) mask_list.append(mask) mask = create_multichannel_mask(mask_list, channel_list) return mask + \ No newline at end of file diff --git a/echopype/clean/impulse_noise.py b/echopype/clean/impulse_noise.py index ec5f1fe17..5c453be21 100644 --- a/echopype/clean/impulse_noise.py +++ b/echopype/clean/impulse_noise.py @@ -15,128 +15,29 @@ ] import warnings -from typing import List, Tuple, Union import numpy as np import xarray as xr -from scipy.ndimage import median_filter as medianf +from scipy.ndimage import median_filter from skimage.morphology import dilation, erosion from ..utils import mask_transformation - -def _ryan( - Sv: np.ndarray, iax: np.ndarray, m: Union[int, float], n: int, thr: Union[int, float] -) -> Tuple[np.ndarray, np.ndarray]: - """ - Mask impulse noise following the two-sided comparison method described - in: - Ryan et al. (2015) ‘Reducing bias due to noise and attenuation in - open-ocean echo integration data’, ICES Journal of Marine Science, - 72: 2482–2493. - - Parameters - ---------- - Sv (float) : 2D array with Sv data to be masked (dB). - iax (int/float): 1D array with i axis data (n samples or range). - m (int/float): vertical binning length (n samples or range). - n (int) : number of pings either side for comparisons. - thr (int/float): user-defined threshold value (dB). - - Returns - ------- - bool: 2D array with IN mask - bool: 2D array with mask indicating valid IN mask samples. - """ - - # resample down vertically - iax_ = np.arange(iax[0], iax[-1], m) - Sv_ = mask_transformation.oned(Sv, iax, iax_, 0, log_var=True)[0] - - # resample back to full resolution - jax = np.arange(len(Sv[0])) - Sv_, mask_ = mask_transformation.full(Sv_, iax_, jax, iax, jax) - - # side comparison (±n) - dummy = np.zeros((iax.shape[0], n)) * np.nan - comparison_forward = Sv_ - np.c_[Sv_[:, n:], dummy] - comparison_backward = Sv_ - np.c_[dummy, Sv_[:, 0:-n]] - - # get IN mask - comparison_forward[np.isnan(comparison_forward)] = np.inf - maskf = comparison_forward > thr - comparison_backward[np.isnan(comparison_backward)] = np.inf - maskb = comparison_backward > thr - mask = maskf & maskb - - # get second mask indicating valid samples in IN mask - mask_[:, 0:n] = False - mask_[:, -n:] = False - - return mask, mask_ - - -def _ryan_iterable( - Sv: Union[float, np.ndarray], - iax: Union[int, float, np.ndarray], - m: Union[int, float], - n: Tuple[int, ...], - thr: Union[int, float], -) -> Tuple[np.ndarray, np.ndarray]: - """ - Modified from "ryan" so that the parameter "n" can be provided multiple - times. It enables the algorithm to iterate and perform comparisons at - different n distances. Resulting masks at each iteration are combined in - a single mask. By setting multiple n distances the algorithm can detect - spikes adjacent each other. 
- - Parameters - ---------- - Sv (float) : 2D array with Sv data to be masked (dB). - iax (int, float): 1D array with i axis data (n samples or range). - m (int, float): vertical binning length (n samples or range). - n (int) : number of pings either side for comparisons. - thr (int,float) : user-defined threshold value (dB). - - Returns - ------- - bool: 2D array with IN mask - bool: 2D array with mask indicating valid IN mask samples. - """ - - # resample down vertically - iax_ = np.arange(iax[0], iax[-1], m) - Sv_ = mask_transformation.oned(Sv, iax, iax_, 0, log_var=True)[0] - - # resample back to full resolution - jax = np.arange(len(Sv[0])) - Sv_, mask_ = mask_transformation.full(Sv_, iax_, jax, iax, jax) - - # perform side comparisons and combine masks in one unique mask - mask = np.zeros_like(Sv, dtype=bool) - for i in n: - dummy = np.zeros((iax.shape[0], i)) - dummy[:] = np.nan - forward = Sv_ - np.c_[Sv_[:, i:], dummy] - backward = Sv_ - np.c_[dummy, Sv_[:, 0:-i]] - maskf = np.ma.masked_greater(forward, thr).mask - maskb = np.ma.masked_greater(backward, thr).mask - mask = mask | (maskf & maskb) - - # get second mask indicating valid samples in IN mask - mask_[:, 0 : max(n)] = True - mask_[:, -max(n) :] = True - - return mask, mask_ +RYAN_DEFAULT_PARAMS = {"thr": 10, "m": 5, "n": 1} +RYAN_ITERABLE_DEFAULT_PARAMS = {"thr": 10, "m": 5, "n": (1, 2)} +WANG_DEFAULT_PARAMS = { + "thr": (-70, -40), + "erode": [(3, 3)], + "dilate": [(5, 5), (7, 7)], + "median": [(7, 7)], +} def _wang( - Sv: np.ndarray, - thr: Tuple[float, float], - erode: List[Tuple[int, int]], - dilate: List[Tuple[int, int]], - median: List[Tuple[int, int]], -) -> Tuple[np.ndarray, np.ndarray]: + Sv_ds: xr.Dataset, + desired_channel: str, + parameters: dict = WANG_DEFAULT_PARAMS, +) -> xr.DataArray: """ Clean impulse noise from Sv data following the method described by: @@ -145,92 +46,21 @@ def _wang( SG-ASAM: 15/02. This algorithm runs different cycles of erosion, dilation, and median - filtering to clean impulse noise from Sv. Note that this function - returns a clean/corrected Sv array, instead of a boolean array indicating - the occurrence of impulse noise. - - Parameters - ---------- - Sv (float) : 2D numpy array with Sv data (dB). - thr (int/float): 2-element tuple with bottom/top Sv thresholds (dB) - erode (int) : list of 2-element tuples indicating the window's - size for each erosion cycle. - dilate (int) : list of 2-element tuples indicating the window's - size for each dilation cycle. - median (int) : list of 2-element tuples indicating the window's - size for each median filter cycle. - - Returns - ------- - float : 2D array with clean Sv data. - bool : 2D array with mask indicating valid clean Sv data. 
- """ - - # set weak noise and strong interference as vacant samples (-999) - Sv_thresholded = Sv.copy() - Sv_thresholded[(Sv < thr[0]) | (Sv > thr[1])] = -999 - - # remaining weak interferences will take neighbouring vacant values - # by running erosion cycles - Sv_eroded = Sv.copy() - for e in erode: - Sv_eroded = erosion(Sv_thresholded, np.ones(e)) - - # the last step might have turned interferences inside biology into vacant - # samples, this is solved by running dilation cycles - Sv_dilated = Sv_eroded.copy() - for d in dilate: - Sv_dilated = dilation(Sv_dilated, np.ones(d)) - - # dilation has modified the Sv value of biological features, so these are - # now corrected to corresponding Sv values before the erosion/dilation - Sv_corrected1 = Sv_dilated.copy() - mask_bio = (Sv_dilated >= thr[0]) & (Sv_dilated < thr[1]) - Sv_corrected1[mask_bio] = Sv_thresholded[mask_bio] - - # compute median convolution in Sv corrected array - Sv_median = Sv_corrected1.copy() - for m in median: - Sv_median = mask_transformation.log( - medianf(mask_transformation.lin(Sv_median), footprint=np.ones(m)) - ) - - # any vacant sample inside biological features will be corrected with - # the median of corresponding neighbouring samples - Sv_corrected2 = Sv_corrected1.copy() - mask_bio = (Sv >= thr[0]) & (Sv < thr[1]) - mask_vacant = Sv_corrected1 == -999 - Sv_corrected2[mask_vacant & mask_bio] = Sv_median[mask_vacant & mask_bio] - - # get mask indicating edges, where swarms analysis couldn't be performed - mask_ = np.ones_like(Sv_corrected2, dtype=bool) - idx = int((max([e[0], d[0]]) - 1) / 2) - jdx = int((max([e[1], d[1]]) - 1) / 2) - mask_[idx:-idx, jdx:-jdx] = False - - return Sv_corrected2, mask_ - - -def _find_impulse_mask_wang( - Sv_ds: xr.Dataset, - desired_channel: str, - thr: Tuple[float, float], - erode: List[Tuple[int, int]], - dilate: List[Tuple[int, int]], - median: List[Tuple[int, int]], -) -> xr.DataArray: - """ - Return a boolean mask indicating the location of impulse noise in Sv data. + filtering to clean impulse noise from Sv. + Returns a boolean mask indicating the location of impulse noise in Sv data. Parameters ---------- Sv_ds (xarray.Dataset): xr.DataArray with Sv data for multiple channels (dB). desired_channel (str): Name of the desired frequency channel. - thr : 2-element tuple with bottom/top Sv thresholds (dB). - erode : List of 2-element tuples indicating the window's size for each erosion cycle. - dilate : List of 2-element tuples indicating the window's size for each dilation cycle. - median : List of 2-element tuples indicating the window's - size for each median filter cycle. + parameters {}: parameter dict, should contain: + thr : 2-element tuple with bottom/top Sv thresholds (dB). + erode : List of 2-element tuples indicating the window's size + for each erosion cycle. + dilate : List of 2-element tuples indicating the window's size + for each dilation cycle. + median : List of 2-element tuples indicating the window's size + for each median filter cycle. 
Returns ------- @@ -253,6 +83,16 @@ def _find_impulse_mask_wang( """ + parameter_names = ("thr", "erode", "dilate", "median") + if not all(name in parameters.keys() for name in parameter_names): + raise ValueError( + "Missing parameters - should be thr, erode, dilate, median, are" + + str(parameters.keys()) + ) + thr = parameters["thr"] + erode = parameters["erode"] + dilate = parameters["dilate"] + median = parameters["median"] # Select the desired frequency channel directly using 'sel' selected_channel_ds = Sv_ds.sel(channel=desired_channel) @@ -280,7 +120,50 @@ def _find_impulse_mask_wang( True represents edges where cleaning wasn't applied, and False represents areas where cleaning was applied """ - Sv_cleaned, mask_ = _wang(Sv, thr, erode, dilate, median) + # Sv_cleaned, mask_ = _wang(Sv, thr, erode, dilate, median) + # set weak noise and strong interference as vacant samples (-999) + Sv_thresholded = Sv.copy() + Sv_thresholded[(Sv < thr[0]) | (Sv > thr[1])] = -999 + + # remaining weak interferences will take neighbouring vacant values + # by running erosion cycles + Sv_eroded = Sv.copy() + for e in erode: + Sv_eroded = erosion(Sv_thresholded, np.ones(e)) + + # the last step might have turned interferences inside biology into vacant + # samples, this is solved by running dilation cycles + Sv_dilated = Sv_eroded.copy() + for d in dilate: + Sv_dilated = dilation(Sv_dilated, np.ones(d)) + + # dilation has modified the Sv value of biological features, so these are + # now corrected to corresponding Sv values before the erosion/dilation + Sv_corrected = Sv_dilated.copy() + mask_bio = (Sv_dilated >= thr[0]) & (Sv_dilated < thr[1]) + Sv_corrected[mask_bio] = Sv_thresholded[mask_bio] + + # compute median convolution in Sv corrected array + Sv_median = Sv_corrected.copy() + for m in median: + Sv_median = mask_transformation.log( + median_filter(mask_transformation.lin(Sv_median), footprint=np.ones(m)) + ) + + # any vacant sample inside biological features will be corrected with + # the median of corresponding neighbouring samples + Sv_cleaned = Sv_corrected.copy() + mask_bio = (Sv >= thr[0]) & (Sv < thr[1]) + mask_vacant = Sv_corrected == -999 + Sv_cleaned[mask_vacant & mask_bio] = Sv_median[mask_vacant & mask_bio] + + # get mask indicating edges, where swarms analysis couldn't be performed + mask_ = np.ones_like(Sv_cleaned, dtype=bool) + idx = int((max([e[0], d[0]]) - 1) / 2) + jdx = int((max([e[1], d[1]]) - 1) / 2) + mask_[idx:-idx, jdx:-jdx] = False + + # return Sv_corrected2, mask_ """ Create a boolean mask comparing the original and cleaned Sv data @@ -322,12 +205,10 @@ def _find_impulse_mask_wang( return mask_xr -def _find_impulse_mask_ryan( +def _ryan( Sv_ds: xr.Dataset, desired_channel: str, - m: Union[int, float], - n: int, - thr: Union[int, float], + parameters: dict = RYAN_DEFAULT_PARAMS, ) -> xr.DataArray: """ Mask impulse noise following the two-sided comparison method described in: @@ -338,9 +219,10 @@ def _find_impulse_mask_ryan( ---------- Sv_ds (xarray.Dataset): xr.DataArray with Sv data for multiple channels (dB). desired_channel (str): Name of the desired frequency channel. - m (int/float): Vertical binning length (n samples or range). - n (int): Number of pings either side for comparisons. - thr (int/float): User-defined threshold value (dB). + parameters (dict): Dictionary of parameters. Must contain the following: + m (int/float): Vertical binning length (n samples or range). + n (int): Number of pings either side for comparisons. 
+ thr (int/float): User-defined threshold value (dB). Returns ------- @@ -358,6 +240,12 @@ def _find_impulse_mask_ryan( Then, we create a combined mask using a bitwise AND operation between 'mask' and '~mask_'. """ + parameter_names = ("m", "n", "thr") + if not all(name in parameters.keys() for name in parameter_names): + raise ValueError("Missing parameters - should be m, n, thr, are" + str(parameters.keys())) + m = parameters["m"] + n = parameters["n"] + thr = parameters["thr"] # Select the desired frequency channel directly using 'sel' selected_channel_ds = Sv_ds.sel(channel=desired_channel) @@ -371,7 +259,31 @@ def _find_impulse_mask_ryan( iax = selected_channel_ds.range_sample.values # Call the existing ryan function - mask, mask_ = _ryan(Sv, iax, m, n, thr) + # mask, mask_ = _ryan(Sv, iax, m, n, thr) + iax_ = np.arange(iax[0], iax[-1], m) + Sv_ = mask_transformation.oned(Sv, iax, iax_, 0, log_var=True)[0] + + # resample back to full resolution + jax = np.arange(len(Sv[0])) + Sv_, mask_ = mask_transformation.full(Sv_, iax_, jax, iax, jax) + + # side comparison (±n) + dummy = np.zeros((iax.shape[0], n)) * np.nan + comparison_forward = Sv_ - np.c_[Sv_[:, n:], dummy] + comparison_backward = Sv_ - np.c_[dummy, Sv_[:, 0:-n]] + + # get IN mask + comparison_forward[np.isnan(comparison_forward)] = np.inf + maskf = comparison_forward > thr + comparison_backward[np.isnan(comparison_backward)] = np.inf + maskb = comparison_backward > thr + mask = maskf & maskb + + # get second mask indicating valid samples in IN mask + mask_[:, 0:n] = False + mask_[:, -n:] = False + + # return mask, mask_ # Transpose the mask back to its original shape mask = np.transpose(mask) @@ -391,12 +303,10 @@ def _find_impulse_mask_ryan( return mask_xr -def _find_impulse_mask_ryan_iterable( +def _ryan_iterable( Sv_ds: xr.Dataset, desired_channel: str, - m: Union[int, float], - n: Tuple[int, ...], - thr: Union[int, float], + parameters: dict = RYAN_ITERABLE_DEFAULT_PARAMS, ) -> xr.DataArray: """ Modified from "ryan" so that the parameter "n" can be provided multiple @@ -409,9 +319,10 @@ def _find_impulse_mask_ryan_iterable( ---------- Sv_ds (xarray.Dataset): xr.DataArray with Sv data for multiple channels (dB). desired_channel (str): Name of the desired frequency channel. - m (int/float): Vertical binning length (n samples or range). - n (int): Number of pings either side for comparisons. - thr (int/float): User-defined threshold value (dB). + parameters (dict): Dictionary of parameters. Must contain the following: + m (int/float): Vertical binning length (n samples or range). + n (int): Number of pings either side for comparisons. + thr (int/float): User-defined threshold value (dB). Returns ------- @@ -430,34 +341,21 @@ def _find_impulse_mask_ryan_iterable( Then, we create a combined mask using a bitwise AND operation between 'mask' and '~mask_'. 
""" - - # Select the desired frequency channel directly using 'sel' - selected_channel_ds = Sv_ds.sel(channel=desired_channel) - - # Extract Sv and iax for the desired frequency channel - Sv = selected_channel_ds["Sv"].values - - # But first, transpose the Sv data so that the vertical dimension is axis 0 - Sv = np.transpose(Sv) - - iax = selected_channel_ds.range_sample.values - - # Call the existing ryan function - mask, mask_ = _ryan_iterable(Sv, iax, m, n, thr) - - # Transpose the mask back to its original shape - mask = np.transpose(mask) - mask_ = np.transpose(mask_) - combined_mask = mask & (~mask_) - - # Create a new xarray for the mask with the correct dimensions and coordinates - mask_xr = xr.DataArray( - combined_mask, - dims=("ping_time", "range_sample"), - coords={ - "ping_time": selected_channel_ds.ping_time.values, - "range_sample": selected_channel_ds.range_sample.values, - }, - ) - + parameter_names = ("m", "n", "thr") + if not all(name in parameters.keys() for name in parameter_names): + raise ValueError("Missing parameters - should be m, n, thr, are" + str(parameters.keys())) + m = parameters["m"] + n = parameters["n"] + thr = parameters["thr"] + + mask_list = [] + for n_i in n: + parameter_dict = {"m": m, "n": n_i, "thr": thr} + mask = _ryan(Sv_ds, desired_channel, parameter_dict) + mask_list.append(mask) + mask_xr = mask_list[0] + if len(mask_list) > 1: + for mask in mask_list[1:]: + mask_xr |= mask return mask_xr + \ No newline at end of file diff --git a/echopype/clean/signal_attenuation.py b/echopype/clean/signal_attenuation.py index d5fb02134..5fa0d4ab5 100644 --- a/echopype/clean/signal_attenuation.py +++ b/echopype/clean/signal_attenuation.py @@ -1,10 +1,15 @@ import numpy as np +import xarray as xr from skimage.measure import label from ..utils.mask_transformation import full as _full, lin as _lin, log as _log, twod as _twod -def _ryan(Sv, r, r0=180, r1=280, n=30, thr=6, start=0): +DEFAULT_RYAN_PARAMS = {"r0": 180, "r1": 280, "n": 30, "thr": -6, "start": 0} +DEFAULT_ARIZA_PARAMS = {"offset": 20, "thr": (-40, -35), "m": 20, "n": 50} + + +def _ryan(source_Sv: xr.DataArray, desired_channel: str, parameters=DEFAULT_RYAN_PARAMS): """ Locate attenuated signal and create a mask following the attenuated signal filter as in: @@ -28,18 +33,32 @@ def _ryan(Sv, r, r0=180, r1=280, n=30, thr=6, start=0): threshold value. Args: - Sv (float): 2D array with Sv data to be masked (dB). - r (float): 1D array with range data (m). - r0 (int): upper limit of SL (m). - r1 (int): lower limit of SL (m). - n (int): number of preceding & subsequent pings defining the block. - thr (int): user-defined threshold value (dB). - start (int): ping index to start processing. + source_Sv (xr.DataArray): Sv array + selected_channel (str): name of the channel to process + parameters(dict): dict of parameters, containing: + r0 (int): upper limit of SL (m). + r1 (int): lower limit of SL (m). + n (int): number of preceding & subsequent pings defining the block. + thr (int): user-defined threshold value (dB). + start (int): ping index to start processing. Returns: - list: 2D boolean array with AS mask and 2D boolean array with mask - indicating where AS detection was unfeasible. 
+ xr.DataArray: boolean array with AS mask, with ping_time and range_sample dims """ + parameter_names = ("r0", "r1", "n", "thr", "start") + if not all(name in parameters.keys() for name in parameter_names): + raise ValueError( + "Missing parameters - should be r0, r1, n, thr, start, are" + str(parameters.keys()) + ) + r0 = parameters["r0"] + r1 = parameters["r1"] + n = parameters["n"] + thr = parameters["thr"] + start = parameters["start"] + + selected_channel_Sv = source_Sv.sel(channel=desired_channel) + Sv = selected_channel_Sv["Sv"].values + r = source_Sv["echo_range"].values[0, 0] # raise errors if wrong arguments if r0 > r1: @@ -79,16 +98,51 @@ def _ryan(Sv, r, r0=180, r1=280, n=30, thr=6, start=0): mask[j, :] = True final_mask = np.logical_not(mask[start:, :] | mask_[start:, :]) - return final_mask + return_mask = xr.DataArray( + final_mask, + dims=("ping_time", "range_sample"), + coords={"ping_time": source_Sv.ping_time, "range_sample": source_Sv.range_sample}, + ) + return return_mask + +def _ariza(source_Sv, desired_channel, parameters=DEFAULT_ARIZA_PARAMS): -def _ariza(Sv, r, offset=20, thr=(-40, -35), m=20, n=50): """ Mask attenuated pings by looking at seabed breaches. Ariza et al. (2022) 'Acoustic seascape partitioning through functional data analysis', Journal of Biogeography, 00, 1– 15. https://doi.org/10.1111/jbi.14534 + Args: + source_Sv (xr.DataArray): Sv array + selected_channel (str): name of the channel to process + parameters(dict): dict of parameters, containing: + offset (int): + m (int): + n (int): + thr (int): + + Returns: + xr.DataArray: boolean array with AS mask, with ping_time and range_sample dims """ + parameter_names = ( + "thr", + "m", + "n", + "offset", + ) + if not all(name in parameters.keys() for name in parameter_names): + raise ValueError( + "Missing parameters - should be thr, m, n, offset, are" + str(parameters.keys()) + ) + m = parameters["m"] + n = parameters["n"] + thr = parameters["thr"] + offset = parameters["offset"] + + selected_channel_Sv = source_Sv.sel(channel=desired_channel) + Sv = selected_channel_Sv["Sv"].values + r = source_Sv["echo_range"].values[0, 0] # get ping array p = np.arange(len(Sv)) @@ -135,4 +189,9 @@ def _ariza(Sv, r, offset=20, thr=(-40, -35), m=20, n=50): # get mask where this value falls below a Sv threshold (seabed breaches) mask = seabed_percentile < thr[0] mask = np.tile(mask, [len(Sv), 1]) - return mask + return_mask = xr.DataArray( + mask, + dims=("ping_time", "range_sample"), + coords={"ping_time": source_Sv.ping_time, "range_sample": source_Sv.range_sample}, + ) + return return_mask diff --git a/echopype/clean/transient_noise.py b/echopype/clean/transient_noise.py index d7cbfeb10..109b43b49 100644 --- a/echopype/clean/transient_noise.py +++ b/echopype/clean/transient_noise.py @@ -16,11 +16,30 @@ import numpy as np +import xarray as xr from ..utils.mask_transformation import lin as _lin, log as _log - -def _ryan(Sv, r, m=50, n=20, thr=20, excludeabove=250, operation="percentile15"): +RYAN_DEFAULT_PARAMS = { + "m": 5, + "n": 20, + "thr": 20, + "excludeabove": 250, + "operation": "percentile15", +} +FIELDING_DEFAULT_PARAMS = { + "r0": 200, + "r1": 1000, + "n": 20, + "thr": [2, 0], + "roff": 250, + "jumps": 5, + "maxts": -35, + "start": 0, +} + + +def _ryan(source_Sv: xr.DataArray, desired_channel: str, parameters: dict = RYAN_DEFAULT_PARAMS): """ Mask transient noise as in: @@ -34,21 +53,38 @@ def _ryan(Sv, r, m=50, n=20, thr=20, excludeabove=250, operation="percentile15") excluded above 250 m by default to avoid 
the removal of aggregated biota. Args: - Sv (float): 2D numpy array with Sv data to be masked (dB) - r (float): 1D numpy array with range data (m) - m (int): height of surrounding region (m) - n (int): width of surrounding region (pings) - threshold (int): user-defined threshold for comparisons (dB) - excludeabove (int): range above which masking is excluded (m) - operation (str): type of average operation: - 'mean' - 'percentileXX' - 'median' - 'mode'#not in numpy + source_Sv (xr.DataArray): Sv array + selected_channel (str): name of the channel to process + parameters(dict): dict of parameters, containing: + m (int): height of surrounding region (m) + n (int): width of surrounding region (pings) + thr (int): user-defined threshold for comparisons (dB) + excludeabove (int): range above which masking is excluded (m) + operation (str): type of average operation: + 'mean' + 'percentileXX' + 'median' + 'mode'#not in numpy Returns: - bool: 2D numpy array mask (transient noise = True) + xarray.DataArray: xr.DataArray with mask indicating the presence of transient noise. """ + parameter_names = ("m", "n", "thr", "excludeabove", "operation") + if not all(name in parameters.keys() for name in parameter_names): + raise ValueError( + "Missing parameters - should be m, n, thr, excludeabove, operation, are" + + str(parameters.keys()) + ) + m = parameters["m"] + n = parameters["n"] + thr = parameters["thr"] + excludeabove = parameters["excludeabove"] + operation = parameters["operation"] + + selected_channel_Sv = source_Sv.sel(channel=desired_channel) + Sv = selected_channel_Sv["Sv"].values + r = source_Sv["echo_range"].values[0, 0] + # offsets for i and j indexes ioff = n joff = np.argmin(abs(r - m)) @@ -78,10 +114,19 @@ def _ryan(Sv, r, m=50, n=20, thr=20, excludeabove=250, operation="percentile15") ) mask[i, j] = sample - block > thr - return mask + mask = np.logical_not(mask) + return_mask = xr.DataArray( + mask, + dims=("ping_time", "range_sample"), + coords={"ping_time": source_Sv.ping_time, "range_sample": source_Sv.range_sample}, + ) + return return_mask + +def _fielding( + source_Sv: xr.DataArray, desired_channel: str, parameters: dict = FIELDING_DEFAULT_PARAMS +): -def _fielding(Sv, r, r0=200, r1=1000, n=20, thr=[2, 0], roff=250, jumps=5, maxts=-35, start=0): """ Mask transient noise with method proposed by Fielding et al (unpub.). @@ -104,22 +149,40 @@ def _fielding(Sv, r, r0=200, r1=1000, n=20, thr=[2, 0], roff=250, jumps=5, maxts with a secondary threshold or until it gets the exclusion range depth. Args: - Sv (float): 2D numpy array with Sv data to be masked (dB). - r (float): 1D numpy array with range data (m). - r0 (int ): range below which transient noise is evaluated (m). - r1 (int ): range above which transient noise is evaluated (m). - n (int ): n of preceding & subsequent pings defining the block. - thr (int ): user-defined threshold for side-comparisons (dB). - roff (int ): range above which masking is excluded (m). - maxts (int ): max transient noise permitted, prevents to interpret - seabed as transient noise (dB). - jumps (int ): height of vertical steps (m). - start (int ): ping index to start processing. + source_Sv (xr.DataArray): Sv array + selected_channel (str): name of the channel to process + parameters(dict): dict of parameters, containing: + r0 (int ): range below which transient noise is evaluated (m). + r1 (int ): range above which transient noise is evaluated (m). + n (int ): n of preceding & subsequent pings defining the block. 
+ thr (int ): user-defined threshold for side-comparisons (dB). + roff (int ): range above which masking is excluded (m). + maxts (int ): max transient noise permitted, prevents to interpret + seabed as transient noise (dB). + jumps (int ): height of vertical steps (m). + start (int ): ping index to start processing. Returns: - list: 2D boolean array with TN mask and 2D boolean array with mask - indicating where TN detection was unfeasible. + xarray.DataArray: xr.DataArray with mask indicating the presence of transient noise. """ + parameter_names = ("r0", "r1", "n", "thr", "roff", "maxts", "jumps", "start") + if not all(name in parameters.keys() for name in parameter_names): + raise ValueError( + "Missing parameters - should be r0, r1, n, thr, roff, maxts, jumps, start, are" + + str(parameters.keys()) + ) + r0 = parameters["r0"] + r1 = parameters["r1"] + n = parameters["n"] + thr = parameters["thr"] + roff = parameters["roff"] + maxts = parameters["maxts"] + jumps = parameters["jumps"] + start = parameters["start"] + + selected_channel_Sv = source_Sv.sel(channel=desired_channel) + Sv = selected_channel_Sv["Sv"].values + r = source_Sv["echo_range"].values[0, 0] # raise errors if wrong arguments if r0 > r1: @@ -165,5 +228,11 @@ def _fielding(Sv, r, r0=200, r1=1000, n=20, thr=[2, 0], roff=250, jumps=5, maxts break mask[j, r0:] = True - final_mask = mask[:, start:] | mask_[:, start:] - return final_mask + mask = mask[:, start:] | mask_[:, start:] + mask = np.logical_not(mask) + return_mask = xr.DataArray( + mask, + dims=("ping_time", "range_sample"), + coords={"ping_time": source_Sv.ping_time, "range_sample": source_Sv.range_sample}, + ) + return return_mask diff --git a/echopype/commongrid/api.py b/echopype/commongrid/api.py index f53f17c95..171a6d9e6 100644 --- a/echopype/commongrid/api.py +++ b/echopype/commongrid/api.py @@ -1,12 +1,14 @@ """ Functions for enhancing the spatial and temporal coherence of data. """ -from typing import Union +import re +from typing import Literal, Union import numpy as np import pandas as pd import xarray as xr +from ..consolidate.api import POSITION_VARIABLES from ..utils.prov import add_processing_level, echopype_prov_attrs, insert_input_processing_level from .mvbs import get_MVBS_along_channels from .nasc import ( @@ -70,11 +72,117 @@ def _set_MVBS_attrs(ds): ) +def _convert_bins_to_interval_index( + bins: list, closed: Literal["left", "right"] = "left" +) -> pd.IntervalIndex: + """ + Convert bins to sorted pandas IntervalIndex + with specified closed end + + Parameters + ---------- + bins : list + The bin edges + closed : {'left', 'right'}, default 'left' + Which side of bin interval is closed + + Returns + ------- + pd.IntervalIndex + The resulting IntervalIndex + """ + return pd.IntervalIndex.from_breaks(bins, closed=closed).sort_values() + + +def _parse_x_bin(x_bin: str, x_label="range_bin") -> float: + """ + Parses x bin string, check unit, + and returns x bin in the specified unit. + + Currently only available for: + range_bin: meters (m) + dist_bin: nautical miles (nmi) + + Parameters + ---------- + x_bin : str + X bin string, e.g., "0.5nmi" or "10m" + x_label : {"range_bin", "dist_bin"}, default "range_bin" + The label of the x bin. + + Returns + ------- + float + The resulting x bin value in x unit, + based on label. + + Raises + ------ + ValueError + If the x bin string doesn't include unit value. + TypeError + If the x bin is not a type string. + KeyError + If the x label is not one of the available labels. 
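A short usage sketch (not part of the patch) of the behaviour documented above, assuming the helper is importable from echopype.commongrid.api:

from echopype.commongrid.api import _parse_x_bin

assert _parse_x_bin("20m") == 20.0                  # range bin in meters (default label)
assert _parse_x_bin("0.5nmi", "dist_bin") == 0.5    # distance bin in nautical miles
# "20" (no unit) raises ValueError, 20 (not a string) raises TypeError,
# and an unrecognised label such as "depth_bin" raises KeyError.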
+ """ + x_bin_map = { + "range_bin": { + "name": "Range bin", + "unit": "m", + "ex": "10m", + "unit_label": "meters", + "pattern": r"([\d+]*[.,]{0,1}[\d+]*)(\s+)?(m)", + }, + "dist_bin": { + "name": "Distance bin", + "unit": "nmi", + "ex": "0.5nmi", + "unit_label": "nautical miles", + "pattern": r"([\d+]*[.,]{0,1}[\d+]*)(\s+)?(nmi)", + }, + } + x_bin_info = x_bin_map.get(x_label, None) + + if x_bin_info is None: + raise KeyError(f"x_label must be one of {list(x_bin_map.keys())}") + + # First check for bin types + if not isinstance(x_bin, str): + raise TypeError("'x_bin' must be a string") + # normalize to lower case + # for x_bin + x_bin = x_bin.strip().lower() + # Only matches meters + match_obj = re.match(x_bin_info["pattern"], x_bin) + + # Do some checks on x_bin inputs + if match_obj is None: + # This shouldn't be other units + raise ValueError( + f"{x_bin_info['name']} must be in " + f"{x_bin_info['unit_label']} " + f"(e.g., '{x_bin_info['ex']}')." + ) + + # Convert back to float + x_bin = float(match_obj.group(1)) + return x_bin + + @add_processing_level("L3*") -def compute_MVBS(ds_Sv, range_meter_bin=20, ping_time_bin="20S"): +def compute_MVBS( + ds_Sv: xr.Dataset, + range_var: Literal["echo_range", "depth"] = "echo_range", + range_bin: str = "20m", + ping_time_bin: str = "20S", + method="map-reduce", + closed: Literal["left", "right"] = "left", + **flox_kwargs, +): """ Compute Mean Volume Backscattering Strength (MVBS) - based on intervals of range (``echo_range``) and ``ping_time`` specified in physical units. + based on intervals of range (``echo_range``) or depth (``depth``) + and ``ping_time`` specified in physical units. Output of this function differs from that of ``compute_MVBS_index_binning``, which computes bin-averaged Sv according to intervals of ``echo_range`` and ``ping_time`` specified as @@ -84,41 +192,99 @@ def compute_MVBS(ds_Sv, range_meter_bin=20, ping_time_bin="20S"): ---------- ds_Sv : xr.Dataset dataset containing Sv and ``echo_range`` [m] - range_meter_bin : Union[int, float] - bin size along ``echo_range`` in meters, default to ``20`` - ping_time_bin : str - bin size along ``ping_time``, default to ``20S`` + range_var: {'echo_range', 'depth'}, default 'echo_range' + The variable to use for range binning. + Must be one of ``echo_range`` or ``depth``. + Note that ``depth`` is only available if the input dataset contains + ``depth`` as a data variable. + range_bin : str, default '20m' + bin size along ``echo_range`` or ``depth`` in meters. + ping_time_bin : str, default '20S' + bin size along ``ping_time`` + method: str, default 'map-reduce' + The flox strategy for reduction of dask arrays only. + See flox `documentation `_ + for more details. + closed: {'left', 'right'}, default 'left' + Which side of bin interval is closed. + **kwargs + Additional keyword arguments to be passed + to flox reduction function. 
Returns ------- A dataset containing bin-averaged Sv """ + if not isinstance(ping_time_bin, str): + raise TypeError("ping_time_bin must be a string") + + range_bin = _parse_x_bin(range_bin, "range_bin") + + # Clean up filenames dimension if it exists + # not needed here + if "filenames" in ds_Sv.dims: + ds_Sv = ds_Sv.drop_dims("filenames") + + # Check if range_var is valid + if range_var not in ["echo_range", "depth"]: + raise ValueError("range_var must be one of 'echo_range' or 'depth'.") + + # Check if range_var exists in ds_Sv + if range_var not in ds_Sv.data_vars: + raise ValueError(f"range_var '{range_var}' does not exist in the input dataset.") + + # Check for closed values + if closed not in ["right", "left"]: + raise ValueError(f"{closed} is not a valid option. Options are 'left' or 'right'.") + # create bin information for echo_range - range_interval = np.arange(0, ds_Sv["echo_range"].max() + range_meter_bin, range_meter_bin) + # this computes the echo range max since there might NaNs in the data + echo_range_max = ds_Sv[range_var].max() + range_interval = np.arange(0, echo_range_max + range_bin, range_bin) # create bin information needed for ping_time - ping_interval = ( - ds_Sv.ping_time.resample(ping_time=ping_time_bin, skipna=True).asfreq().ping_time.values + d_index = ( + ds_Sv["ping_time"] + .resample(ping_time=ping_time_bin, skipna=True) + .first() # Not actually being used, but needed to get the bin groups + .indexes["ping_time"] + ) + ping_interval = d_index.union([d_index[-1] + pd.Timedelta(ping_time_bin)]).values + + # Set interval index for groups + ping_interval = _convert_bins_to_interval_index(ping_interval, closed=closed) + range_interval = _convert_bins_to_interval_index(range_interval, closed=closed) + raw_MVBS = get_MVBS_along_channels( + ds_Sv, + range_interval, + ping_interval, + range_var=range_var, + method=method, + **flox_kwargs, ) - - # calculate the MVBS along each channel - MVBS_values = get_MVBS_along_channels(ds_Sv, range_interval, ping_interval) # create MVBS dataset + # by transforming the binned dimensions to regular coords ds_MVBS = xr.Dataset( - data_vars={"Sv": (["channel", "ping_time", "echo_range"], MVBS_values)}, + data_vars={"Sv": (["channel", "ping_time", range_var], raw_MVBS["Sv"].data)}, coords={ - "ping_time": ping_interval, - "channel": ds_Sv.channel, - "echo_range": range_interval[:-1], + "ping_time": np.array([v.left for v in raw_MVBS.ping_time_bins.values]), + "channel": raw_MVBS.channel.values, + range_var: np.array([v.left for v in raw_MVBS[f"{range_var}_bins"].values]), }, ) - # TODO: look into why 'filenames' exist here as a variable - # Added this check to support the test in test_process.py::test_compute_MVBS - if "filenames" in ds_MVBS.variables: - ds_MVBS = ds_MVBS.drop_vars("filenames") + # "has_positions" attribute is inserted in get_MVBS_along_channels + # when the dataset has position information + # propagate this to the final MVBS dataset + if raw_MVBS.attrs.get("has_positions", False): + for var in POSITION_VARIABLES: + ds_MVBS[var] = (["ping_time"], raw_MVBS[var].data, ds_Sv[var].attrs) + + # Add water level if uses echo_range and it exists in Sv dataset + if range_var == "echo_range" and "water_level" in ds_Sv.data_vars: + ds_MVBS["water_level"] = ds_Sv["water_level"] # ping_time_bin parsing and conversions # Need to convert between pd.Timedelta and np.timedelta64 offsets/frequency strings @@ -151,17 +317,17 @@ def compute_MVBS(ds_Sv, range_meter_bin=20, ping_time_bin="20S"): # Attach attributes 
_set_MVBS_attrs(ds_MVBS) - ds_MVBS["echo_range"].attrs = {"long_name": "Range distance", "units": "m"} + ds_MVBS[range_var].attrs = {"long_name": "Range distance", "units": "m"} ds_MVBS["Sv"] = ds_MVBS["Sv"].assign_attrs( { "cell_methods": ( f"ping_time: mean (interval: {ping_time_bin_resvalue} {ping_time_bin_resunit_label} " # noqa "comment: ping_time is the interval start) " - f"echo_range: mean (interval: {range_meter_bin} meter " - "comment: echo_range is the interval start)" + f"{range_var}: mean (interval: {range_bin} meter " + f"comment: {range_var} is the interval start)" ), "binning_mode": "physical units", - "range_meter_interval": str(range_meter_bin) + "m", + "range_meter_interval": str(range_bin) + "m", "ping_time_interval": ping_time_bin, "actual_range": [ round(float(ds_MVBS["Sv"].min().values), 2), diff --git a/echopype/commongrid/mvbs.py b/echopype/commongrid/mvbs.py index 2c0006f35..2fbc1b38f 100644 --- a/echopype/commongrid/mvbs.py +++ b/echopype/commongrid/mvbs.py @@ -2,411 +2,25 @@ Contains core functions needed to compute the MVBS of an input dataset. """ -import warnings -from typing import Tuple, Union +from typing import Literal, Union -import dask.array import numpy as np +import pandas as pd import xarray as xr +from flox.xarray import xarray_reduce - -def get_bin_indices( - echo_range: np.ndarray, bins_er: np.ndarray, times: np.ndarray, bins_time: np.ndarray -) -> Tuple[np.ndarray, np.ndarray]: - """ - Obtains the bin index of ``echo_range`` and ``times`` based - on the binning ``bins_er`` and ``bins_time``, respectively. - - Parameters - ---------- - echo_range: np.ndarray - 2D array of echo range values - bins_er: np.ndarray - 1D array (used by np.digitize) representing the binning required for ``echo_range`` - times: np.ndarray - 1D array corresponding to the time values that should be binned - bins_time: np.ndarray - 1D array (used by np.digitize) representing the binning required for ``times`` - - Returns - ------- - digitized_echo_range: np.ndarray - 2D array of bin indices for ``echo_range`` - bin_time_ind: np.ndarray - 1D array of bin indices for ``times`` - """ - - # get bin index for each echo range value - digitized_echo_range = np.digitize(echo_range, bins_er, right=False) - - # turn datetime into integers, so we can use np.digitize - if isinstance(times, dask.array.Array): - times_i8 = times.compute().data.view("i8") - else: - times_i8 = times.view("i8") - - # turn datetime into integers, so we can use np.digitize - bins_time_i8 = bins_time.view("i8") - - # get bin index for each time - bin_time_ind = np.digitize(times_i8, bins_time_i8, right=False) - - return digitized_echo_range, bin_time_ind - - -def bin_and_mean_echo_range( - arr: Union[np.ndarray, dask.array.Array], digitized_echo_range: np.ndarray, n_bin_er: int -) -> Union[np.ndarray, dask.array.Array]: - """ - Bins and means ``arr`` with respect to the ``echo_range`` bins. 
- - Parameters - ---------- - arr: np.ndarray or dask.array.Array - 2D array (dimension: [``echo_range`` x ``ping_time``]) to bin along ``echo_range`` - and compute mean of each bin - digitized_echo_range: np.ndarray - 2D array of bin indices for ``echo_range`` - n_bin_er: int - The number of echo range bins - - Returns - ------- - er_means: np.ndarray or dask.array.Array - 2D array representing the bin and mean of ``arr`` along ``echo_range`` - """ - - binned_means = [] - for bin_er in range(1, n_bin_er): - # Catch a known warning that can occur, which does not impact the results - with warnings.catch_warnings(): - # ignore warnings caused by taking a mean of an array filled with NaNs - warnings.filterwarnings(action="ignore", message="Mean of empty slice") - - # bin and mean echo_range dimension - er_selected_data = np.nanmean(arr[:, digitized_echo_range == bin_er], axis=1) - - # collect all echo_range bins - binned_means.append(er_selected_data) - - # create full echo_range binned array - er_means = np.vstack(binned_means) - - return er_means - - -def get_unequal_rows(mat: np.ndarray, row: np.ndarray) -> np.ndarray: - """ - Obtains those row indices of ``mat`` that are not equal - to ``row``. - - Parameters - ---------- - mat: np.ndarray - 2D array with the same column dimension as the number - of elements in ``row`` - row: np.ndarray - 1D array with the same number of element elements as - the column dimension of ``mat`` - - Returns - ------- - row_ind_not_equal: np.ndarray - The row indices of ``mat`` that are not equal to ``row`` - - Notes - ----- - Elements with NaNs are considered equal if they are in the same position. - """ - - # compare row against all rows in mat (allowing for NaNs to be equal) - element_nan_equal = (mat == row) | (np.isnan(mat) & np.isnan(row)) - - # determine if mat row is equal to row - row_not_equal = np.logical_not(np.all(element_nan_equal, axis=1)) - - if isinstance(row_not_equal, dask.array.Array): - row_not_equal = row_not_equal.compute() - - # get those row indices that are not equal to row - row_ind_not_equal = np.argwhere(row_not_equal).flatten() - - return row_ind_not_equal - - -def if_all_er_steps_identical(er_chan: Union[xr.DataArray, np.ndarray]) -> bool: - """ - A comprehensive check that determines if all ``echo_range`` values - along ``ping_time`` have the same step size. If they do not have - the same step sizes, then grouping of the ``echo_range`` values - will be necessary. 
- - Parameters - ---------- - er_chan: xr.DataArray or np.ndarray - 2D array containing the ``echo_range`` values for each ``ping_time`` - - Returns - ------- - bool - True, if grouping of ``echo_range`` along ``ping_time`` is necessary, otherwise False - - Notes - ----- - ``er_chan`` should have rows corresponding to ``ping_time`` and columns - corresponding to ``range_sample`` - """ - - # grab the in-memory numpy echo_range values, if necessary - if isinstance(er_chan, xr.DataArray): - er_chan = er_chan.values - - # grab the first ping_time that is not filled with NaNs - ping_index = 0 - while np.all(np.isnan(er_chan[ping_index, :])): - ping_index += 1 - - # determine those rows of er_chan that are not equal to the row ping_index - unequal_ping_ind = get_unequal_rows(er_chan, er_chan[ping_index, :]) - - if len(unequal_ping_ind) > 0: - # see if all unequal_ping_ind are filled with NaNs - all_nans = np.all(np.all(np.isnan(er_chan[unequal_ping_ind, :]), axis=1)) - - if all_nans: - # All echo_range values have the same step size - return False - else: - # Some echo_range values have different step sizes - return True - else: - # All echo_range values have the same step size - return False - - -def if_last_er_steps_identical(er_chan: Union[xr.DataArray, np.ndarray]) -> bool: - """ - An alternative (less comprehensive) check that determines if all - ``echo_range`` values along ``ping_time`` have the same step size. - If they do not have the same step sizes, then grouping of the - ``echo_range`` values will be necessary. - - Parameters - ---------- - er_chan: xr.DataArray or np.ndarray - 2D array containing the ``echo_range`` values for each ``ping_time`` - - Returns - ------- - bool - True, if grouping of ``echo_range`` along ``ping_time`` is necessary, otherwise False - - Notes - ----- - It is possible that this method will incorrectly determine if grouping - is necessary. - - ``er_chan`` should have rows corresponding to ``ping_time`` and columns - corresponding to ``range_sample`` - """ - - # determine the number of NaNs in each ping and find the unique number of NaNs - unique_num_nans = np.unique(np.isnan(er_chan.data).sum(axis=1)) - - # compute the results, if necessary, to allow for downstream checks - if isinstance(unique_num_nans, dask.array.Array): - unique_num_nans = unique_num_nans.compute() - - # determine if any value is not 0 or er_chan.shape[1] - unexpected_num_nans = False in np.logical_or( - unique_num_nans == 0, unique_num_nans == er_chan.shape[1] - ) - - if unexpected_num_nans: - # echo_range varies with ping_time - return True - else: - # make sure that the final echo_range value for each ping_time is the same (account for NaN) - num_non_nans = np.logical_not(np.isnan(np.unique(er_chan.data[:, -1]))).sum() - - # compute the results, if necessary, to allow for downstream checks - if isinstance(num_non_nans, dask.array.Array): - num_non_nans = num_non_nans.compute() - - if num_non_nans > 1: - # echo_range varies with ping_time - return True - else: - # echo_range does not vary with ping_time - return False - - -def is_er_grouping_needed( - echo_range: Union[xr.DataArray, np.ndarray], comprehensive_er_check: bool -) -> bool: - """ - Determines if ``echo_range`` values along ``ping_time`` can change and - thus need to be grouped. 
- - Parameters - ---------- - echo_range: xr.DataArray or np.ndarray - 2D array containing the ``echo_range`` values for each ``ping_time`` - comprehensive_er_check: bool - If True, a more comprehensive check will be completed to determine if ``echo_range`` - grouping along ``ping_time`` is needed, otherwise a less comprehensive check will be done - - Returns - ------- - bool - If True grouping of ``echo_range`` will be required, else it will not - be necessary - """ - - if comprehensive_er_check: - return if_all_er_steps_identical(echo_range) - else: - return if_last_er_steps_identical(echo_range) - - -def group_dig_er_bin_mean_echo_range( - arr: Union[np.ndarray, dask.array.Array], - digitized_echo_range: Union[np.ndarray, dask.array.Array], - n_bin_er: int, -) -> Union[np.ndarray, dask.array.Array]: - """ - Groups the rows of ``arr`` such that they have the same corresponding - row values in ``digitized_echo_range``, then applies ``bin_and_mean_echo_range`` - on each group, and lastly assembles the correctly ordered ``er_means`` array - representing the bin and mean of ``arr`` with respect to ``echo_range``. - - Parameters - ---------- - arr: dask.array.Array or np.ndarray - The 2D array whose values should be binned - digitized_echo_range: dask.array.Array or np.ndarray - 2D array of bin indices for ``echo_range`` - n_bin_er: int - The number of echo range bins - - Returns - ------- - er_means: dask.array.Array or np.ndarray - The bin and mean of ``arr`` with respect to ``echo_range`` - """ - - # compute bin indices to allow for downstream processes (mainly axis argument in unique) - if isinstance(digitized_echo_range, dask.array.Array): - digitized_echo_range = digitized_echo_range.compute() - - # determine the unique rows of digitized_echo_range and the inverse - unique_er_bin_ind, unique_inverse = np.unique(digitized_echo_range, axis=0, return_inverse=True) - - # create groups of row indices using the unique inverse - grps_same_ind = [ - np.argwhere(unique_inverse == grp).flatten() for grp in np.unique(unique_inverse) - ] - - # for each group bin and mean arr along echo_range - # note: the values appended may not be in the correct final order - binned_er = [] - for count, grp in enumerate(grps_same_ind): - binned_er.append( - bin_and_mean_echo_range(arr[grp, :], unique_er_bin_ind[count, :], n_bin_er) - ) - - # construct er_means and put the columns in the correct order - binned_er_array = np.hstack(binned_er) - correct_column_ind = np.argsort(np.concatenate(grps_same_ind)) - er_means = binned_er_array[:, correct_column_ind] - - return er_means - - -def bin_and_mean_2d( - arr: Union[dask.array.Array, np.ndarray], - bins_time: np.ndarray, - bins_er: np.ndarray, - times: np.ndarray, - echo_range: np.ndarray, - comprehensive_er_check: bool = True, -) -> np.ndarray: - """ - Bins and means ``arr`` based on ``times`` and ``echo_range``, - and their corresponding bins. If ``arr`` is ``Sv`` then this - will compute the MVBS. 
- - Parameters - ---------- - arr: dask.array.Array or np.ndarray - The 2D array whose values should be binned - bins_time: np.ndarray - 1D array (used by np.digitize) representing the binning required for ``times`` - bins_er: np.ndarray - 1D array (used by np.digitize) representing the binning required for ``echo_range`` - times: np.ndarray - 1D array corresponding to the time values that should be binned - echo_range: np.ndarray - 2D array of echo range values - comprehensive_er_check: bool - If True, a more comprehensive check will be completed to determine if ``echo_range`` - grouping along ``ping_time`` is needed, otherwise a less comprehensive check will be done - - Returns - ------- - final_reduced: np.ndarray - The final binned and mean ``arr``, if ``arr`` is ``Sv`` then this is the MVBS - - Notes - ----- - This function assumes that ``arr`` has rows corresponding to - ``ping_time`` and columns corresponding to ``echo_range``. - - This function should not be run if the number of ``echo_range`` values - vary amongst ``ping_times``. This should not occur for our current use - of echopype-generated Sv data. - """ - - # get the number of echo range and time bins - n_bin_er = len(bins_er) - n_bin_time = len(bins_time) - - # obtain the bin indices for echo_range and times - digitized_echo_range, bin_time_ind = get_bin_indices(echo_range, bins_er, times, bins_time) - - # determine if grouping of echo_range values with the same step size is necessary - er_grouping_needed = is_er_grouping_needed(echo_range, comprehensive_er_check) - - if er_grouping_needed: - # groups, bins, and means arr with respect to echo_range - er_means = group_dig_er_bin_mean_echo_range(arr, digitized_echo_range, n_bin_er) - else: - # bin and mean arr with respect to echo_range - er_means = bin_and_mean_echo_range(arr, digitized_echo_range[0, :], n_bin_er) - - # if er_means is a dask array we compute it so the graph does not get too large - if isinstance(er_means, dask.array.Array): - er_means = er_means.compute() - - # create final reduced array i.e. MVBS - final = np.empty((n_bin_time, n_bin_er - 1)) - for bin_time in range(1, n_bin_time + 1): - # obtain er_mean indices corresponding to the time bin - indices = np.argwhere(bin_time_ind == bin_time).flatten() - - if len(indices) == 0: - # fill values with NaN, if there are no values in the bin - final[bin_time - 1, :] = np.nan - else: - # bin and mean the er_mean time bin - final[bin_time - 1, :] = np.nanmean(er_means[:, indices], axis=1) - - return final +from ..consolidate.api import POSITION_VARIABLES +from ..utils.compute import _lin2log, _log2lin def get_MVBS_along_channels( - ds_Sv: xr.Dataset, echo_range_interval: np.ndarray, ping_interval: np.ndarray -) -> np.ndarray: + ds_Sv: xr.Dataset, + range_interval: Union[pd.IntervalIndex, np.ndarray], + ping_interval: Union[pd.IntervalIndex, np.ndarray], + range_var: Literal["echo_range", "depth"] = "echo_range", + method: str = "map-reduce", + **kwargs +) -> xr.Dataset: """ Computes the MVBS of ``ds_Sv`` along each channel for the given intervals. 
@@ -416,46 +30,60 @@ def get_MVBS_along_channels( ds_Sv: xr.Dataset A Dataset containing ``Sv`` and ``echo_range`` data with coordinates ``channel``, ``ping_time``, and ``range_sample`` - echo_range_interval: np.ndarray - 1D array (used by np.digitize) representing the binning required for ``echo_range`` - ping_interval: np.ndarray - 1D array (used by np.digitize) representing the binning required for ``ping_time`` + range_interval: pd.IntervalIndex or np.ndarray + 1D array or interval index representing + the bins required for ``range_var`` + ping_interval: pd.IntervalIndex or np.ndarray + 1D array or interval index representing + the bins required for ``ping_time`` + range_var: str + The variable to use for range binning. + Either ``echo_range`` or ``depth``. + method: str + The flox strategy for reduction of dask arrays only. + See flox `documentation `_ + for more details. + **kwargs + Additional keyword arguments to be passed + to flox reduction function Returns ------- - np.ndarray - The MVBS value of the input ``ds_Sv`` for all channels - - Notes - ----- - If the values in ``ds_Sv`` are delayed then the binning and mean of ``Sv`` with - respect to ``echo_range`` will take place, then the delayed result will be computed, - and lastly the binning and mean with respect to ``ping_time`` will be completed. It - is necessary to apply a compute midway through this method because Dask graph layers - get too large and this makes downstream operations very inefficient. - """ - - all_MVBS = [] - for chan in ds_Sv.channel: - # squeeze to remove "channel" dim if present - # TODO: not sure why not already removed for the AZFP case. Investigate. - ds = ds_Sv.sel(channel=chan).squeeze() - - # average should be done in linear domain - sv = 10 ** (ds["Sv"] / 10) - - # get MVBS for channel in linear domain - chan_MVBS = bin_and_mean_2d( - sv.data, - bins_time=ping_interval, - bins_er=echo_range_interval, - times=sv.ping_time.data, - echo_range=ds["echo_range"], - comprehensive_er_check=True, + xr.Dataset + The MVBS dataset of the input ``ds_Sv`` for all channels + """ + + # average should be done in linear domain + sv = ds_Sv["Sv"].pipe(_log2lin) + + # Get positions if exists + # otherwise just use an empty dataset + ds_Pos = xr.Dataset(attrs={"has_positions": False}) + if all(v in ds_Sv for v in POSITION_VARIABLES): + ds_Pos = xarray_reduce( + ds_Sv[POSITION_VARIABLES], + ds_Sv["ping_time"], + func="nanmean", + expected_groups=(ping_interval), + isbin=True, + method=method, ) + ds_Pos.attrs["has_positions"] = True + + # reduce along ping_time and echo_range or depth + # by binning and averaging + mvbs = xarray_reduce( + sv, + sv["channel"], + ds_Sv["ping_time"], + ds_Sv[range_var], + func="nanmean", + expected_groups=(None, ping_interval, range_interval), + isbin=[False, True, True], + method=method, + **kwargs + ) - # apply inverse mapping to get back to the original domain and store values - all_MVBS.append(10 * np.log10(chan_MVBS)) - - # collect the MVBS values for each channel - return np.stack(all_MVBS, axis=0) + # apply inverse mapping to get back to the original domain and store values + da_MVBS = mvbs.pipe(_lin2log) + return xr.merge([ds_Pos, da_MVBS]) diff --git a/echopype/consolidate/api.py b/echopype/consolidate/api.py index a08144a5c..364c76be3 100644 --- a/echopype/consolidate/api.py +++ b/echopype/consolidate/api.py @@ -12,6 +12,8 @@ from ..utils.prov import add_processing_level from .split_beam_angle import add_angle_to_ds, get_angle_complex_samples, get_angle_power_samples 
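Stepping back to the reworked get_MVBS_along_channels above: a minimal, self-contained sketch (not part of the patch) of the flox-based binned averaging it now relies on. Averaging is done in the linear domain and converted back to dB afterwards; the data and bin edges below are synthetic.

import numpy as np
import pandas as pd
import xarray as xr
from flox.xarray import xarray_reduce

sv = xr.DataArray(
    np.random.default_rng(0).normal(-70, 5, (100, 200)),
    dims=("ping_time", "echo_range"),
    coords={
        "ping_time": pd.date_range("2023-01-01", periods=100, freq="S"),
        "echo_range": np.linspace(0, 200, 200),
    },
    name="Sv",
)

# left-closed interval bins for ping_time and echo_range
ping_bins = pd.IntervalIndex.from_breaks(
    pd.date_range("2023-01-01", periods=6, freq="20S"), closed="left"
)
range_bins = pd.IntervalIndex.from_breaks(np.arange(0, 221, 20), closed="left")

linear = 10 ** (sv / 10)                  # dB -> linear before averaging
mvbs_lin = xarray_reduce(
    linear,
    linear["ping_time"],
    linear["echo_range"],
    func="nanmean",
    expected_groups=(ping_bins, range_bins),
    isbin=[True, True],
    method="map-reduce",
)
mvbs = 10 * np.log10(mvbs_lin)            # back to dB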
+POSITION_VARIABLES = ["latitude", "longitude"] + def swap_dims_channel_frequency(ds: xr.Dataset) -> xr.Dataset: """ @@ -185,7 +187,7 @@ def sel_interp(var, time_dim_name): f"{datetime.datetime.utcnow()} +00:00. " "Interpolated or propagated from Platform latitude/longitude." # noqa ) - for da_name in ["latitude", "longitude"]: + for da_name in POSITION_VARIABLES: interp_ds[da_name] = interp_ds[da_name].assign_attrs({"history": history_attr}) if time_dim_name in interp_ds: diff --git a/echopype/convert/api.py b/echopype/convert/api.py index fbc9e8766..107af1bc0 100644 --- a/echopype/convert/api.py +++ b/echopype/convert/api.py @@ -17,6 +17,7 @@ from ..utils.coding import COMPRESSION_SETTINGS from ..utils.log import _init_logger from ..utils.prov import add_processing_level +from .utils.ek import should_use_swap BEAM_SUBGROUP_DEFAULT = "Beam_group1" @@ -315,6 +316,8 @@ def open_raw( convert_params: Optional[Dict[str, str]] = None, storage_options: Optional[Dict[str, str]] = None, use_swap: bool = False, + destination_path: Optional[str] = None, + destination_storage_options: Optional[Dict[str, str]] = None, max_mb: int = 100, ) -> Optional[EchoData]: """Create an EchoData object containing parsed data from a single raw data file. @@ -342,11 +345,21 @@ def open_raw( convert_params : dict parameters (metadata) that may not exist in the raw file and need to be added to the converted file - storage_options : dict + storage_options : dict, optional options for cloud storage use_swap: bool + **DEPRECATED: This flag is ignored** If True, variables with a large memory footprint will be written to a temporary zarr store at ``~/.echopype/temp_output/parsed2zarr_temp_files`` + destination_path: str + The path to a swap directory in a case of a large memory footprint. + This path can be a remote path like s3://bucket/swap_dir. + By default, it will create a temporary zarr store at + ``~/.echopype/temp_output/parsed2zarr_temp_files`` if needed, + when set to "auto". + destination_storage_options: dict, optional + Options for remote storage for the swap directory ``destination_path`` + argument. max_mb : int The maximum data chunk size in Megabytes (MB), when offloading variables with a large memory footprint to a temporary zarr store @@ -369,10 +382,23 @@ def open_raw( Notes ----- - ``use_swap=True`` is only available for the following + In a case of a large memory footprint, the program will determine if using + a temporary swap space is needed. If so, it will use that space + during conversion to prevent out of memory errors. + Users can override this behaviour by either passing ``"swap"`` or ``"no_swap"`` + into the ``destination_path`` argument. + This feature is only available for the following echosounders: EK60, ES70, EK80, ES80, EA640. Additionally, this feature is currently in beta. 
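An illustrative sketch (not part of the patch) of how the new destination_path argument might be used; the raw file name and S3 bucket below are hypothetical.

import echopype as ep

# let echopype decide whether a temporary swap store is needed
ed = ep.open_raw("survey_file.raw", sonar_model="EK80", destination_path="auto")

# force swapping and write the temporary zarr store to a remote location
ed = ep.open_raw(
    "survey_file.raw",
    sonar_model="EK80",
    destination_path="s3://my-bucket/swap_dir",
    destination_storage_options={},   # empty dict if access is already configured locally
)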
""" + + # Initially set use_swap False + use_swap = False + + # Set initial destination_path of "no_swap" + if destination_path is None: + destination_path = "no_swap" + if raw_file is None: raise FileNotFoundError("The path to the raw data file must be specified.") @@ -402,15 +428,6 @@ def open_raw( # Check file extension and existence file_chk, xml_chk = _check_file(raw_file, sonar_model, xml_path, storage_options) - # TODO: remove once 'auto' option is added - if not isinstance(use_swap, bool): - raise ValueError("use_swap must be of type bool.") - - # Ensure use_swap is 'auto', if it is a string - # TODO: use the following when we allow for 'auto' option - # if isinstance(use_swap, str) and use_swap != "auto": - # raise ValueError("use_swap must be a bool or equal to 'auto'.") - # TODO: the if-else below only works for the AZFP vs EK contrast, # but is brittle since it is abusing params by using it implicitly if SONAR_MODELS[sonar_model]["xml"]: @@ -423,29 +440,49 @@ def open_raw( # Parse raw file and organize data into groups parser = SONAR_MODELS[sonar_model]["parser"]( - file_chk, params=params, storage_options=storage_options, dgram_zarr_vars=dgram_zarr_vars + file_chk, + params=params, + storage_options=storage_options, + dgram_zarr_vars=dgram_zarr_vars, + sonar_model=sonar_model, ) parser.parse_raw() # Direct offload to zarr and rectangularization only available for some sonar models if sonar_model in ["EK60", "ES70", "EK80", "ES80", "EA640"]: - # Create sonar_model-specific p2z object - p2z = SONAR_MODELS[sonar_model]["parsed2zarr"](parser) - - # Determines if writing to zarr is necessary and writes to zarr - p2z_flag = use_swap is True or ( - use_swap == "auto" and p2z.whether_write_to_zarr(mem_mult=0.4) - ) - - if p2z_flag: - p2z.datagram_to_zarr(max_mb=max_mb) - # Rectangularize the transmit data - parser.rectangularize_transmit_ping_data(data_type="complex") + swap_map = { + "swap": True, + "no_swap": False, + } + if destination_path == "auto": + # Overwrite use_swap if it's True below + # Use local swap directory + use_swap = should_use_swap(parser.zarr_datagrams, dgram_zarr_vars, mem_mult=0.4) + elif destination_path in swap_map: + use_swap = swap_map[destination_path] + else: + # TODO: Add docstring about swap path + use_swap = True + if "://" in destination_path and destination_storage_options is None: + raise ValueError( + ( + "Please provide storage options for remote destination. 
", + "If access is already configured locally, ", + "simply put an empty dictionary.", + ) + ) + + if use_swap: + # Create sonar_model-specific p2z object + p2z = SONAR_MODELS[sonar_model]["parsed2zarr"](parser) + p2z.datagram_to_zarr( + dest_path=destination_path, + dest_storage_options=destination_storage_options, + max_mb=max_mb, + ) else: - del p2z - # Create general p2z object - p2z = Parsed2Zarr(parser) + p2z = Parsed2Zarr(parser) # Create general p2z object parser.rectangularize_data() else: diff --git a/echopype/convert/parse_ad2cp.py b/echopype/convert/parse_ad2cp.py index 3b5fcea70..6c5c796fd 100644 --- a/echopype/convert/parse_ad2cp.py +++ b/echopype/convert/parse_ad2cp.py @@ -219,8 +219,8 @@ class NoMorePackets(Exception): class ParseAd2cp(ParseBase): - def __init__(self, file, params, storage_options={}, dgram_zarr_vars={}): - super().__init__(file, storage_options) + def __init__(self, file, params, storage_options={}, dgram_zarr_vars={}, sonar_model="AD2CP"): + super().__init__(file, storage_options, sonar_model) self.config = None self.packets: List[Ad2cpDataPacket] = [] diff --git a/echopype/convert/parse_azfp.py b/echopype/convert/parse_azfp.py index a890068a7..2c031a53f 100644 --- a/echopype/convert/parse_azfp.py +++ b/echopype/convert/parse_azfp.py @@ -1,5 +1,5 @@ import os -import xml.dom.minidom +import xml.etree.ElementTree as ET from collections import defaultdict from datetime import datetime as dt from struct import unpack @@ -8,48 +8,11 @@ import numpy as np from ..utils.log import _init_logger +from ..utils.misc import camelcase2snakecase from .parse_base import ParseBase FILENAME_DATETIME_AZFP = "\\w+.01A" -XML_INT_PARAMS = { - "NumFreq": "num_freq", - "SerialNumber": "serial_number", - "BurstInterval": "burst_interval", - "PingsPerBurst": "pings_per_burst", - "AverageBurstPings": "average_burst_pings", - "SensorsFlag": "sensors_flag", -} -XML_FLOAT_PARAMS = [ - # Temperature coeffs - "ka", - "kb", - "kc", - "A", - "B", - "C", - # Tilt coeffs - "X_a", - "X_b", - "X_c", - "X_d", - "Y_a", - "Y_b", - "Y_c", - "Y_d", -] -XML_FREQ_PARAMS = { - "RangeSamples": "range_samples", - "RangeAveragingSamples": "range_averaging_samples", - "DigRate": "dig_rate", - "LockOutIndex": "lockout_index", - "Gain": "gain", - "PulseLen": "pulse_length", - "DS": "DS", - "EL": "EL", - "TVR": "TVR", - "VTX0": "VTX", - "BP": "BP", -} + HEADER_FIELDS = ( ("profile_flag", "u2"), ("profile_number", "u2"), @@ -64,7 +27,7 @@ ("second", "u2"), # Second ("hundredths", "u2"), # Hundredths of a second ("dig_rate", "u2", 4), # Digitalization rate for each channel - ("lockout_index", "u2", 4), # Lockout index for each channel + ("lock_out_index", "u2", 4), # Lockout index for each channel ("num_bins", "u2", 4), # Number of bins for each channel ( "range_samples_per_bin", @@ -88,7 +51,7 @@ ("num_chan", "u1"), # 1, 2, 3, or 4 ("gain", "u1", 4), # gain channel 1-4 ("spare_chan", "u1"), # spare channel - ("pulse_length", "u2", 4), # Pulse length chan 1-4 uS + ("pulse_len", "u2", 4), # Pulse length chan 1-4 uS ("board_num", "u2", 4), # The board the data came from channel 1-4 ("frequency", "u2", 4), # frequency for channel 1-4 in kHz ( @@ -110,42 +73,49 @@ class ParseAZFP(ParseBase): HEADER_FORMAT = ">HHHHIHHHHHHHHHHHHHHHHHHHHHHHHHHHHHBBBBHBBBBBBBBHHHHHHHHHHHHHHHHHHHH" FILE_TYPE = 64770 - def __init__(self, file, params, storage_options={}, dgram_zarr_vars={}): - super().__init__(file, storage_options) + def __init__(self, file, params, storage_options={}, dgram_zarr_vars={}, sonar_model="AZFP"): + 
super().__init__(file, storage_options, sonar_model) # Parent class attributes # regex pattern used to grab datetime embedded in filename self.timestamp_pattern = FILENAME_DATETIME_AZFP self.xml_path = params # Class attributes - self.parameters = dict() + self.parameters = defaultdict(list) self.unpacked_data = defaultdict(list) self.sonar_type = "AZFP" def load_AZFP_xml(self): - """Parse XML file to get params for reading AZFP data.""" - """Parses the AZFP XML file. """ - - def get_value_by_tag_name(tag_name, element=0): - """Returns the value in an XML tag given the tag name and the number of occurrences.""" - return px.getElementsByTagName(tag_name)[element].childNodes[0].data + Parses the AZFP XML file. + """ xmlmap = fsspec.get_mapper(self.xml_path, **self.storage_options) - px = xml.dom.minidom.parse(xmlmap.fs.open(xmlmap.root)) - - # Retrieve integer parameters from the xml file - for old_name, new_name in XML_INT_PARAMS.items(): - self.parameters[new_name] = int(get_value_by_tag_name(old_name)) - # Retrieve floating point parameters from the xml file - for param in XML_FLOAT_PARAMS: - self.parameters[param] = float(get_value_by_tag_name(param)) - # Retrieve frequency dependent parameters from the xml file - for old_name, new_name in XML_FREQ_PARAMS.items(): - self.parameters[new_name] = [ - float(get_value_by_tag_name(old_name, ch)) - for ch in range(self.parameters["num_freq"]) - ] + root = ET.parse(xmlmap.fs.open(xmlmap.root)).getroot() + + for child in root.iter(): + if len(child.tag) > 3 and not child.tag.startswith("VTX"): + camel_case_tag = camelcase2snakecase(child.tag) + else: + camel_case_tag = child.tag + if len(child.attrib) > 0: + for key, val in child.attrib.items(): + self.parameters[camel_case_tag + "_" + camelcase2snakecase(key)].append(val) + + if all(char == "\n" for char in child.text): + continue + else: + try: + val = int(child.text) + except ValueError: + val = float(child.text) + + self.parameters[camel_case_tag].append(val) + + # Handling the case where there is only one value for each parameter + for key, val in self.parameters.items(): + if len(val) == 1: + self.parameters[key] = val[0] def _compute_temperature(self, ping_num, is_valid): """ @@ -245,7 +215,6 @@ def _test_valid_params(params): header_chunk = file.read(self.HEADER_SIZE) if header_chunk: header_unpacked = unpack(self.HEADER_FORMAT, header_chunk) - # Reading will stop if the file contains an unexpected flag if self._split_header(file, header_unpacked): # Appends the actual 'data values' to unpacked_data @@ -354,12 +323,12 @@ def _split_header(self, raw, header_unpacked): field_w_freq = ( "dig_rate", - "lockout_index", + "lock_out_index", "num_bins", "range_samples_per_bin", # fields with num_freq data "data_type", "gain", - "pulse_length", + "pulse_len", "board_num", "frequency", ) @@ -417,12 +386,12 @@ def _check_uniqueness(self): # fields with num_freq data field_w_freq = ( "dig_rate", - "lockout_index", + "lock_out_index", "num_bins", "range_samples_per_bin", "data_type", "gain", - "pulse_length", + "pulse_len", "board_num", "frequency", ) @@ -478,22 +447,22 @@ def _get_ping_time(self): self.ping_time = ping_time @staticmethod - def _calc_Sv_offset(f, pulse_length): + def _calc_Sv_offset(f, pulse_len): """Calculate the compensation factor for Sv calculation.""" # TODO: this method seems should be in echopype.process if f > 38000: - if pulse_length == 300: + if pulse_len == 300: return 1.1 - elif pulse_length == 500: + elif pulse_len == 500: return 0.8 - elif pulse_length == 700: + elif 
pulse_len == 700: return 0.5 - elif pulse_length == 900: + elif pulse_len == 900: return 0.3 - elif pulse_length == 1000: + elif pulse_len == 1000: return 0.3 else: - if pulse_length == 500: + if pulse_len == 500: return 1.1 - elif pulse_length == 1000: + elif pulse_len == 1000: return 0.7 diff --git a/echopype/convert/parse_base.py b/echopype/convert/parse_base.py index e14ae276a..3f4d960e4 100644 --- a/echopype/convert/parse_base.py +++ b/echopype/convert/parse_base.py @@ -17,12 +17,14 @@ class ParseBase: """Parent class for all convert classes.""" - def __init__(self, file, storage_options): + def __init__(self, file, storage_options, sonar_model): self.source_file = file self.timestamp_pattern = None # regex pattern used to grab datetime embedded in filename self.ping_time = [] # list to store ping time self.storage_options = storage_options self.zarr_datagrams = [] # holds all parsed datagrams + self.zarr_tx_datagrams = [] # holds all parsed transmit datagrams + self.sonar_model = sonar_model def _print_status(self): """Prints message to console giving information about the raw file being parsed.""" @@ -31,8 +33,8 @@ def _print_status(self): class ParseEK(ParseBase): """Class for converting data from Simrad echosounders.""" - def __init__(self, file, params, storage_options, dgram_zarr_vars): - super().__init__(file, storage_options) + def __init__(self, file, params, storage_options, dgram_zarr_vars, sonar_model): + super().__init__(file, storage_options, sonar_model) # Parent class attributes # regex pattern used to grab datetime embedded in filename @@ -78,6 +80,10 @@ def rectangularize_data(self): for dgram in self.zarr_datagrams: self._append_channel_ping_data(dgram, zarr_vars=False) + # append zarr transmit datagrams to channel ping data + for dgram in self.zarr_tx_datagrams: + self._append_channel_ping_data(dgram, rx=False, zarr_vars=False) + # Rectangularize all data and convert to numpy array indexed by channel for data_type in ["power", "angle", "complex"]: # Receive data @@ -350,7 +356,7 @@ def _read_datagrams(self, fid): else: logger.info("Unknown datagram type: " + str(new_datagram["type"])) - def _append_zarr_dgram(self, full_dgram: dict): + def _append_zarr_dgram(self, full_dgram: dict, rx: bool): """ Selects a subset of the datagram values that need to be sent directly to a zarr file and @@ -389,7 +395,10 @@ def _append_zarr_dgram(self, full_dgram: dict): reduced_datagram["power"] = reduced_datagram["power"].astype("float32") * INDEX2POWER if reduced_datagram: - self.zarr_datagrams.append(reduced_datagram) + if rx: + self.zarr_datagrams.append(reduced_datagram) + else: + self.zarr_tx_datagrams.append(reduced_datagram) def _append_channel_ping_data(self, datagram, rx=True, zarr_vars=True): """ @@ -411,10 +420,10 @@ def _append_channel_ping_data(self, datagram, rx=True, zarr_vars=True): ch_id = datagram["channel_id"] if "channel_id" in datagram else datagram["channel"] # append zarr variables, if they exist - if zarr_vars and rx: + if zarr_vars: common_vars = set(self.dgram_zarr_vars.keys()).intersection(set(datagram.keys())) if common_vars: - self._append_zarr_dgram(datagram) + self._append_zarr_dgram(datagram, rx=rx) for var in common_vars: del datagram[var] diff --git a/echopype/convert/parse_ek60.py b/echopype/convert/parse_ek60.py index 9502c3214..a8666a386 100644 --- a/echopype/convert/parse_ek60.py +++ b/echopype/convert/parse_ek60.py @@ -4,8 +4,8 @@ class ParseEK60(ParseEK): """Class for converting data from Simrad EK60 echosounders.""" - def __init__(self, file, 
params, storage_options={}, dgram_zarr_vars={}): - super().__init__(file, params, storage_options, dgram_zarr_vars) + def __init__(self, file, params, storage_options={}, dgram_zarr_vars={}, sonar_model="EK60"): + super().__init__(file, params, storage_options, dgram_zarr_vars, sonar_model) def _select_datagrams(self, params): # Translates user input into specific datagrams or ALL diff --git a/echopype/convert/parse_ek80.py b/echopype/convert/parse_ek80.py index 55027ffe3..583fac8c4 100644 --- a/echopype/convert/parse_ek80.py +++ b/echopype/convert/parse_ek80.py @@ -4,8 +4,8 @@ class ParseEK80(ParseEK): """Class for converting data from Simrad EK80 echosounders.""" - def __init__(self, file, params, storage_options={}, dgram_zarr_vars={}): - super().__init__(file, params, storage_options, dgram_zarr_vars) + def __init__(self, file, params, storage_options={}, dgram_zarr_vars={}, sonar_model="EK80"): + super().__init__(file, params, storage_options, dgram_zarr_vars, sonar_model) self.environment = {} # dictionary to store environment data def _select_datagrams(self, params): diff --git a/echopype/convert/parsed_to_zarr.py b/echopype/convert/parsed_to_zarr.py index 95fde46c9..9e8a302a9 100644 --- a/echopype/convert/parsed_to_zarr.py +++ b/echopype/convert/parsed_to_zarr.py @@ -1,14 +1,28 @@ import secrets import sys -from pathlib import Path -from typing import List, Tuple, Union +from typing import Dict, List, Optional, Tuple, Union -import more_itertools as miter +import dask.array +import fsspec import numpy as np import pandas as pd +import xarray as xr import zarr -from ..utils.io import ECHOPYPE_DIR, check_file_permissions +from ..echodata.convention import sonarnetcdf_1 +from ..utils.io import ECHOPYPE_DIR, check_file_permissions, validate_output_path + +DEFAULT_ZARR_TEMP_DIR = ECHOPYPE_DIR / "temp_output" / "parsed2zarr_temp_files" + + +def _create_zarr_store_map(path, storage_options): + file_path = validate_output_path( + source_file=secrets.token_hex(16), + engine="zarr", + save_path=path, + output_storage_options=storage_options, + ) + return fsspec.get_mapper(file_path, **storage_options) class Parsed2Zarr: @@ -26,53 +40,63 @@ def __init__(self, parser_obj): self.zarr_root = None self.parser_obj = parser_obj # parser object ParseEK60/ParseEK80/etc. - def _create_zarr_info(self): + self._varattrs = sonarnetcdf_1.yaml_dict["variable_and_varattributes"] + + if hasattr(self.parser_obj, "sonar_model"): + self.sonar_model = self.parser_obj.sonar_model + + def _create_zarr_info( + self, dest_path: str = None, dest_storage_options: Optional[Dict] = None, retries: int = 10 + ): """ Creates the temporary directory for zarr storage, zarr file name, zarr store, and the root group of the zarr store. 
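A stand-alone sketch (not part of the patch) of the retry pattern used below to find an unused zarr store, written with plain fsspec; the directory path is hypothetical.

import secrets
import fsspec

def make_unused_store(base_dir: str, retries: int = 10, storage_options: dict = None):
    # keep drawing random names until one does not already exist under base_dir
    storage_options = storage_options or {}
    for _ in range(retries):
        store = fsspec.get_mapper(
            f"{base_dir}/{secrets.token_hex(16)}.zarr", **storage_options
        )
        if not store.fs.exists(store.root):
            return store
    raise RuntimeError(f"Unable to construct an unused zarr store after {retries} retries!")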
""" - # get current working directory - current_dir = Path.cwd() - - # Check permission of cwd, raise exception if no permission - check_file_permissions(current_dir) + if dest_path is None: + # Check permission of cwd, raise exception if no permission + check_file_permissions(ECHOPYPE_DIR) - # construct temporary directory that will hold the zarr file - out_dir = current_dir / ECHOPYPE_DIR / "temp_output" / "parsed2zarr_temp_files" - if not out_dir.exists(): - out_dir.mkdir(parents=True) + # construct temporary directory that will hold the zarr file + dest_path = DEFAULT_ZARR_TEMP_DIR + if not dest_path.exists(): + dest_path.mkdir(parents=True) # establish temporary directory we will write zarr files to - self.temp_zarr_dir = str(out_dir) - - # create zarr store name - zarr_file_name = str(out_dir / secrets.token_hex(16)) + ".zarr" - - # attempt to find different zarr_file_name, if it already exists - count = 0 - while Path(zarr_file_name).exists() and count < 10: - # generate new zarr_file_name - zarr_file_name = str(out_dir / secrets.token_hex(16)) + ".zarr" - count += 1 - - # error out if we are unable to get a unique name, else assign name to class variable - if (count == 10) and Path(zarr_file_name).exists(): - raise RuntimeError("Unable to construct an unused zarr file name for Parsed2Zarr!") - else: - self.zarr_file_name = zarr_file_name + self.temp_zarr_dir = str(dest_path) + + # Set default storage options if None + if dest_storage_options is None: + dest_storage_options = {} + + # attempt to find different zarr_file_name + attempt = 0 + exists = True + while exists: + zarr_store = _create_zarr_store_map( + path=dest_path, storage_options=dest_storage_options + ) + exists = zarr_store.fs.exists(zarr_store.root) + attempt += 1 + + if attempt == retries and exists: + raise RuntimeError( + ( + "Unable to construct an unused zarr file name for Parsed2Zarr ", + f"after {retries} retries!", + ) + ) # create zarr store and zarr group we want to write to - self.store = zarr.DirectoryStore(self.zarr_file_name) + self.store = zarr_store self.zarr_root = zarr.group(store=self.store, overwrite=True) def _close_store(self): """properly closes zarr store""" - # consolidate metadata and close zarr store + # consolidate metadata zarr.consolidate_metadata(self.store) - self.store.close() @staticmethod def set_multi_index( @@ -383,7 +407,7 @@ def write_df_column( # evenly chunk unique times so that the smallest and largest # chunk differ by at most 1 element - chunks = list(miter.chunked_even(unique_time_ind, max_num_times)) + chunks = np.array_split(unique_time_ind, np.ceil(len(unique_time_ind) / max_num_times)) self.write_chunks(pd_series, zarr_grp, is_array, chunks, chunk_shape) @@ -399,6 +423,225 @@ def _get_zarr_dgrams_size(self) -> int: return size + def _get_channel_ids(self, chan_str: np.ndarray) -> List[str]: + """ + Obtains the channel IDs associated with ``chan_str``. + + Parameters + ---------- + chan_str : np.ndarray + A numpy array of strings corresponding to the + keys of ``config_datagram["transceivers"]`` + + Returns + ------- + A list of strings representing the channel IDS + """ + if self.sonar_model in ["EK60", "ES70"]: + return [ + self.parser_obj.config_datagram["transceivers"][int(i)]["channel_id"] + for i in chan_str + ] + else: + return [ + self.parser_obj.config_datagram["configuration"][i]["channel_id"] for i in chan_str + ] + + @property + def power_dataarray(self) -> xr.DataArray: + """ + Constructs a DataArray from a Dask array for the power + data. 
+ + Returns + ------- + DataArray named "backscatter_r" representing the + power data. + """ + zarr_path = self.store + + # collect variables associated with the power data + power = dask.array.from_zarr(zarr_path, component="power/power") + + pow_time_path = "power/" + self.power_dims[0] + pow_chan_path = "power/" + self.power_dims[1] + power_time = dask.array.from_zarr(zarr_path, component=pow_time_path).compute() + power_channel = dask.array.from_zarr(zarr_path, component=pow_chan_path).compute() + + # obtain channel names for power data + pow_chan_names = self._get_channel_ids(power_channel) + + backscatter_r = xr.DataArray( + data=power, + coords={ + "ping_time": ( + ["ping_time"], + power_time, + self._varattrs["beam_coord_default"]["ping_time"], + ), + "channel": ( + ["channel"], + pow_chan_names, + self._varattrs["beam_coord_default"]["channel"], + ), + "range_sample": ( + ["range_sample"], + np.arange(power.shape[2]), + self._varattrs["beam_coord_default"]["range_sample"], + ), + }, + name="backscatter_r", + attrs={ + "long_name": self._varattrs["beam_var_default"]["backscatter_r"]["long_name"], + "units": "dB", + }, + ) + + return backscatter_r + + @property + def angle_dataarrays(self) -> Tuple[xr.DataArray, xr.DataArray]: + """ + Constructs the DataArrays from Dask arrays associated + with the angle data. + + Returns + ------- + DataArrays named "angle_athwartship" and "angle_alongship", + respectively, representing the angle data. + """ + + zarr_path = self.store + + # collect variables associated with the angle data + angle_along = dask.array.from_zarr(zarr_path, component="angle/angle_alongship") + angle_athwart = dask.array.from_zarr(zarr_path, component="angle/angle_athwartship") + + ang_time_path = "angle/" + self.angle_dims[0] + ang_chan_path = "angle/" + self.angle_dims[1] + angle_time = dask.array.from_zarr(zarr_path, component=ang_time_path).compute() + angle_channel = dask.array.from_zarr(zarr_path, component=ang_chan_path).compute() + + # obtain channel names for angle data + ang_chan_names = self._get_channel_ids(angle_channel) + + array_coords = { + "ping_time": ( + ["ping_time"], + angle_time, + self._varattrs["beam_coord_default"]["ping_time"], + ), + "channel": ( + ["channel"], + ang_chan_names, + self._varattrs["beam_coord_default"]["channel"], + ), + "range_sample": ( + ["range_sample"], + np.arange(angle_athwart.shape[2]), + self._varattrs["beam_coord_default"]["range_sample"], + ), + } + + angle_athwartship = xr.DataArray( + data=angle_athwart, + coords=array_coords, + name="angle_athwartship", + attrs={ + "long_name": "electrical athwartship angle", + "comment": ( + "Introduced in echopype for Simrad echosounders. " # noqa + + "The athwartship angle corresponds to the major angle in SONAR-netCDF4 vers 2. " # noqa + ), + }, + ) + + angle_alongship = xr.DataArray( + data=angle_along, + coords=array_coords, + name="angle_alongship", + attrs={ + "long_name": "electrical alongship angle", + "comment": ( + "Introduced in echopype for Simrad echosounders. " # noqa + + "The alongship angle corresponds to the minor angle in SONAR-netCDF4 vers 2. " # noqa + ), + }, + ) + + return angle_athwartship, angle_alongship + + @property + def complex_dataarrays(self) -> Tuple[xr.DataArray, xr.DataArray]: + """ + Constructs the DataArrays from Dask arrays associated + with the complex data. + + Returns + ------- + DataArrays named "backscatter_r" and "backscatter_i", + respectively, representing the complex data. 
+ """ + + zarr_path = self.store + + # collect variables associated with the complex data + complex_r = dask.array.from_zarr(zarr_path, component="complex/backscatter_r") + complex_i = dask.array.from_zarr(zarr_path, component="complex/backscatter_i") + + comp_time_path = "complex/" + self.complex_dims[0] + comp_chan_path = "complex/" + self.complex_dims[1] + complex_time = dask.array.from_zarr(zarr_path, component=comp_time_path).compute() + complex_channel = dask.array.from_zarr(zarr_path, component=comp_chan_path).compute() + + # obtain channel names for complex data + comp_chan_names = self._get_channel_ids(complex_channel) + + array_coords = { + "ping_time": ( + ["ping_time"], + complex_time, + self._varattrs["beam_coord_default"]["ping_time"], + ), + "channel": ( + ["channel"], + comp_chan_names, + self._varattrs["beam_coord_default"]["channel"], + ), + "range_sample": ( + ["range_sample"], + np.arange(complex_r.shape[2]), + self._varattrs["beam_coord_default"]["range_sample"], + ), + "beam": ( + ["beam"], + np.arange(start=1, stop=complex_r.shape[3] + 1).astype(str), + self._varattrs["beam_coord_default"]["beam"], + ), + } + + backscatter_r = xr.DataArray( + data=complex_r, + coords=array_coords, + name="backscatter_r", + attrs={ + "long_name": self._varattrs["beam_var_default"]["backscatter_r"]["long_name"], + "units": "dB", + }, + ) + + backscatter_i = xr.DataArray( + data=complex_i, + coords=array_coords, + name="backscatter_i", + attrs={ + "long_name": self._varattrs["beam_var_default"]["backscatter_i"]["long_name"], + "units": "dB", + }, + ) + + return backscatter_r, backscatter_i + def array_series_bytes(self, pd_series: pd.Series, n_rows: int) -> int: """ Determines the amount of bytes required for a diff --git a/echopype/convert/parsed_to_zarr_ek60.py b/echopype/convert/parsed_to_zarr_ek60.py index 17f80c8a0..1568b237e 100644 --- a/echopype/convert/parsed_to_zarr_ek60.py +++ b/echopype/convert/parsed_to_zarr_ek60.py @@ -253,7 +253,7 @@ def whether_write_to_zarr(self, mem_mult: float = 0.3) -> bool: return mem.total * mem_mult < req_mem - def datagram_to_zarr(self, max_mb: int) -> None: + def datagram_to_zarr(self, dest_path: str, dest_storage_options: dict, max_mb: int) -> None: """ Facilitates the conversion of a list of datagrams to a form that can be written @@ -275,7 +275,7 @@ def datagram_to_zarr(self, max_mb: int) -> None: the same. 
""" - self._create_zarr_info() + self._create_zarr_info(dest_path=dest_path, dest_storage_options=dest_storage_options) # create datagram df, if it does not exist if not isinstance(self.datagram_df, pd.DataFrame): diff --git a/echopype/convert/parsed_to_zarr_ek80.py b/echopype/convert/parsed_to_zarr_ek80.py index 468b1f6b0..19b117668 100644 --- a/echopype/convert/parsed_to_zarr_ek80.py +++ b/echopype/convert/parsed_to_zarr_ek80.py @@ -1,6 +1,11 @@ +from typing import Optional, Tuple + +import dask.array import numpy as np import pandas as pd import psutil +import xarray as xr +from zarr.errors import ArrayNotFoundError from .parsed_to_zarr_ek60 import Parsed2ZarrEK60 @@ -20,6 +25,7 @@ def __init__(self, parser_obj): self.p2z_ch_ids = {} # channel ids for power, angle, complex self.pow_ang_df = None # df that holds power and angle data self.complex_df = None # df that holds complex data + self.tx_df = None # df that holds transmit data # get channel and channel_id association and sort by channel_id channels_old = list(self.parser_obj.config_datagram["configuration"].keys()) @@ -31,6 +37,78 @@ def __init__(self, parser_obj): # obtain sort rule for the channel index self.channel_sort_rule = {ch: channels_new.index(ch) for ch in channels_old} + @property + def tx_complex_dataarrays(self) -> Optional[Tuple[xr.DataArray, xr.DataArray]]: + """ + Constructs the DataArrays from Dask arrays associated + with the transmit complex data (RAW4). + + Returns + ------- + DataArrays named "transmit_pulse_r" and "transmit_pulse_i", + respectively, representing the complex data. + """ + try: + zarr_path = self.store + + # collect variables associated with the complex data + complex_r = dask.array.from_zarr(zarr_path, component="tx_complex/transmit_pulse_r") + complex_i = dask.array.from_zarr(zarr_path, component="tx_complex/transmit_pulse_i") + + comp_time_path = "tx_complex/" + self.complex_dims[0] + comp_chan_path = "tx_complex/" + self.complex_dims[1] + complex_time = dask.array.from_zarr(zarr_path, component=comp_time_path).compute() + complex_channel = dask.array.from_zarr(zarr_path, component=comp_chan_path).compute() + + # obtain channel names for complex data + comp_chan_names = self._get_channel_ids(complex_channel) + + array_coords = { + "ping_time": ( + ["ping_time"], + complex_time, + self._varattrs["beam_coord_default"]["ping_time"], + ), + "channel": ( + ["channel"], + comp_chan_names, + self._varattrs["beam_coord_default"]["channel"], + ), + "transmit_sample": ( + ["transmit_sample"], + np.arange(complex_r.shape[2]), + { + "long_name": "Transmit pulse sample number, base 0", + "comment": "Only exist for Simrad EK80 file with RAW4 datagrams", + }, + ), + } + + transmit_pulse_r = xr.DataArray( + data=complex_r, + coords=array_coords, + name="transmit_pulse_r", + attrs={ + "long_name": "Real part of the transmit pulse", + "units": "V", + "comment": "Only exist for Simrad EK80 file with RAW4 datagrams", + }, + ) + + transmit_pulse_i = xr.DataArray( + data=complex_i, + coords=array_coords, + name="transmit_pulse_i", + attrs={ + "long_name": "Imaginary part of the transmit pulse", + "units": "V", + "comment": "Only exist for Simrad EK80 file with RAW4 datagrams", + }, + ) + return transmit_pulse_r, transmit_pulse_i + except ArrayNotFoundError: + return None + def _get_num_transd_sec(self, x: pd.DataFrame): """ Returns the number of transducer sectors. 
@@ -86,7 +164,7 @@ def _reshape_series(self, complex_series: pd.Series) -> pd.Series: ) @staticmethod - def _split_complex_data(complex_series: pd.Series) -> pd.DataFrame: + def _split_complex_data(complex_series: pd.Series, rx: bool = True) -> pd.DataFrame: """ Splits the 1D complex data into two 1D arrays representing the real and imaginary parts of @@ -105,6 +183,9 @@ def _split_complex_data(complex_series: pd.Series) -> pd.DataFrame: respectively. The DataFrame will have the same index as ``complex_series``. """ + columns = ["backscatter_r", "backscatter_i"] + if not rx: + columns = ["transmit_pulse_r", "transmit_pulse_i"] complex_split = complex_series.apply( lambda x: [np.real(x), np.imag(x)] if isinstance(x, np.ndarray) else [None, None] @@ -112,11 +193,11 @@ def _split_complex_data(complex_series: pd.Series) -> pd.DataFrame: return pd.DataFrame( data=complex_split.to_list(), - columns=["backscatter_r", "backscatter_i"], + columns=columns, index=complex_series.index, ) - def _write_complex(self, df: pd.DataFrame, max_mb: int): + def _write_complex(self, df: pd.DataFrame, max_mb: int, rx: bool = True): """ Writes the complex data and associated indices to a zarr group. @@ -142,11 +223,15 @@ def _write_complex(self, df: pd.DataFrame, max_mb: int): ) channels = channels[indexer] - complex_series = self._reshape_series(complex_series) + if rx: + complex_series = self._reshape_series(complex_series) + grp_key = "complex" + else: + grp_key = "tx_complex" - complex_df = self._split_complex_data(complex_series) + self.p2z_ch_ids[grp_key] = channels.values # store channel ids for variable - self.p2z_ch_ids["complex"] = channels.values # store channel ids for variable + complex_df = self._split_complex_data(complex_series, rx=rx) # create multi index using the product of the unique dims unique_dims = [times, channels] @@ -154,7 +239,11 @@ def _write_complex(self, df: pd.DataFrame, max_mb: int): complex_df = self.set_multi_index(complex_df, unique_dims) # write complex data to the complex group - zarr_grp = self.zarr_root.create_group("complex") + if rx: + zarr_grp = self.zarr_root.create_group(grp_key) + else: + zarr_grp = self.zarr_root.create_group(grp_key) + for column in complex_df: self.write_df_column( pd_series=complex_df[column], @@ -219,6 +308,17 @@ def _get_zarr_dfs(self): self.complex_df = datagram_df.dropna().copy() + def _get_tx_zarr_df(self) -> None: + """Create dataframe for the transmit data.""" + + tx_datagram_df = pd.DataFrame.from_dict(self.parser_obj.zarr_tx_datagrams) + # remove power and angle to conserve memory + for col in ["power", "angle"]: + if col in tx_datagram_df.columns: + del tx_datagram_df[col] + + self.tx_df = tx_datagram_df.dropna().copy() + def whether_write_to_zarr(self, mem_mult: float = 0.3) -> bool: """ Determines if the zarr data provided will expand @@ -266,7 +366,7 @@ def whether_write_to_zarr(self, mem_mult: float = 0.3) -> bool: return mem.total * mem_mult < req_mem - def datagram_to_zarr(self, max_mb: int) -> None: + def datagram_to_zarr(self, dest_path: str, dest_storage_options: dict, max_mb: int) -> None: """ Facilitates the conversion of a list of datagrams to a form that can be written @@ -288,7 +388,7 @@ def datagram_to_zarr(self, max_mb: int) -> None: the same. 
""" - self._create_zarr_info() + self._create_zarr_info(dest_path=dest_path, dest_storage_options=dest_storage_options) # create zarr dfs, if they do not exist if not isinstance(self.pow_ang_df, pd.DataFrame) and not isinstance( @@ -308,4 +408,14 @@ def datagram_to_zarr(self, max_mb: int) -> None: del self.complex_df # free memory + # write transmit data + if not isinstance(self.tx_df, pd.DataFrame): + self._get_tx_zarr_df() + del self.parser_obj.zarr_tx_datagrams # free memory + + if not self.tx_df.empty: + self._write_complex(df=self.tx_df, max_mb=max_mb, rx=False) + + del self.tx_df # free memory + self._close_store() diff --git a/echopype/convert/set_groups_azfp.py b/echopype/convert/set_groups_azfp.py index 57f754ce9..9c37eec1e 100644 --- a/echopype/convert/set_groups_azfp.py +++ b/echopype/convert/set_groups_azfp.py @@ -81,13 +81,14 @@ def _create_unique_channel_name(self): """ serial_number = self.parser_obj.unpacked_data["serial_number"] + frequency_number = self.parser_obj.parameters["frequency_number"] if serial_number.size == 1: freq_as_str = self.freq_sorted.astype(int).astype(str) # TODO: replace str(i+1) with Frequency Number from XML channel_id = [ - str(serial_number) + "-" + freq + "-" + str(i + 1) + str(serial_number) + "-" + freq + "-" + frequency_number[i] for i, freq in enumerate(freq_as_str) ] @@ -146,8 +147,7 @@ def set_sonar(self) -> xr.Dataset: "sonar_model": self.sonar_model, "sonar_serial_number": int(self.parser_obj.unpacked_data["serial_number"]), "sonar_software_name": "AZFP", - # TODO: software version is hardwired. Read it from the XML file's AZFP_Version node - "sonar_software_version": "1.4", + "sonar_software_version": "based on AZFP Matlab version 1.4", "sonar_type": "echosounder", } ds = ds.assign_attrs(sonar_attr_dict) @@ -320,7 +320,7 @@ def set_beam(self) -> List[xr.Dataset]: del N_tmp tdn = ( - unpacked_data["pulse_length"][self.freq_ind_sorted] / 1e6 + unpacked_data["pulse_len"][self.freq_ind_sorted] / 1e6 ) # Convert microseconds to seconds range_samples_per_bin = unpacked_data["range_samples_per_bin"][ self.freq_ind_sorted @@ -500,7 +500,7 @@ def set_vendor(self) -> xr.Dataset: unpacked_data = self.parser_obj.unpacked_data parameters = self.parser_obj.parameters ping_time = self.parser_obj.ping_time - tdn = parameters["pulse_length"][self.freq_ind_sorted] / 1e6 + tdn = parameters["pulse_len"][self.freq_ind_sorted] / 1e6 anc = np.array(unpacked_data["ancillary"]) # convert to np array for easy slicing # Build variables in the output xarray Dataset @@ -508,7 +508,7 @@ def set_vendor(self) -> xr.Dataset: for ind, ich in enumerate(self.freq_ind_sorted): # TODO: should not access the private function, better to compute Sv_offset in parser Sv_offset[ind] = self.parser_obj._calc_Sv_offset( - self.freq_sorted[ind], unpacked_data["pulse_length"][ich] + self.freq_sorted[ind], unpacked_data["pulse_len"][ich] ) ds = xr.Dataset( @@ -532,9 +532,9 @@ def set_vendor(self) -> xr.Dataset: "A/D converter when digitizing the returned acoustic signal" }, ), - "lockout_index": ( + "lock_out_index": ( ["channel"], - unpacked_data["lockout_index"][self.freq_ind_sorted], + unpacked_data["lock_out_index"][self.freq_ind_sorted], { "long_name": "The distance, rounded to the nearest Bin Size after the " "pulse is transmitted that over which AZFP will ignore echoes" @@ -648,6 +648,17 @@ def set_vendor(self) -> xr.Dataset: parameters["gain"][self.freq_ind_sorted], {"long_name": "(From XML file) Gain correction"}, ), + "instrument_type": parameters["instrument_type"][0], + 
"minor": parameters["minor"], + "major": parameters["major"], + "date": parameters["date"], + "program": parameters["program"], + "cpu": parameters["cpu"], + "serial_number": parameters["serial_number"], + "board_version": parameters["board_version"], + "file_version": parameters["file_version"], + "parameter_version": parameters["parameter_version"], + "configuration_version": parameters["configuration_version"], "XML_digitization_rate": ( ["channel"], parameters["dig_rate"][self.freq_ind_sorted], @@ -659,7 +670,7 @@ def set_vendor(self) -> xr.Dataset: ), "XML_lockout_index": ( ["channel"], - parameters["lockout_index"][self.freq_ind_sorted], + parameters["lock_out_index"][self.freq_ind_sorted], { "long_name": "(From XML file) The distance, rounded to the nearest " "Bin Size after the pulse is transmitted that over which AZFP will " @@ -680,10 +691,25 @@ def set_vendor(self) -> xr.Dataset: "units": "dB re 1uPa/V at 1m", }, ), - "VTX": ( + "VTX0": ( ["channel"], - parameters["VTX"][self.freq_ind_sorted], - {"long_name": "Amplified voltage sent to the transducer"}, + parameters["VTX0"][self.freq_ind_sorted], + {"long_name": "Amplified voltage 0 sent to the transducer"}, + ), + "VTX1": ( + ["channel"], + parameters["VTX1"][self.freq_ind_sorted], + {"long_name": "Amplified voltage 1 sent to the transducer"}, + ), + "VTX2": ( + ["channel"], + parameters["VTX2"][self.freq_ind_sorted], + {"long_name": "Amplified voltage 2 sent to the transducer"}, + ), + "VTX3": ( + ["channel"], + parameters["VTX3"][self.freq_ind_sorted], + {"long_name": "Amplified voltage 3 sent to the transducer"}, ), "Sv_offset": (["channel"], Sv_offset), "number_of_samples_digitized_per_pings": ( diff --git a/echopype/convert/set_groups_base.py b/echopype/convert/set_groups_base.py index 4cb2c5353..ddbba24bb 100644 --- a/echopype/convert/set_groups_base.py +++ b/echopype/convert/set_groups_base.py @@ -1,8 +1,7 @@ import abc import warnings -from typing import List, Set, Tuple +from typing import List, Set -import dask.array import numpy as np import pynmea2 import xarray as xr @@ -51,7 +50,7 @@ def __init__( else: self.compression_settings = COMPRESSION_SETTINGS[self.engine] - self._varattrs = sonarnetcdf_1.yaml_dict["variable_and_varattributes"] + self._varattrs = self.parsed2zarr_obj._varattrs # self._beamgroups must be a list of dicts, eg: # [{"name":"Beam_group1", "descr":"contains complex backscatter data # and other beam or channel-specific data."}] @@ -361,229 +360,3 @@ def beam_groups_to_convention( self._add_ping_time_dim(ds, beam_ping_time_names, ping_time_only_names) self._add_beam_dim(ds, beam_only_names, beam_ping_time_names) - - def _get_channel_ids(self, chan_str: np.ndarray) -> List[str]: - """ - Obtains the channel IDs associated with ``chan_str``. - - Parameters - ---------- - chan_str : np.ndarray - A numpy array of strings corresponding to the - keys of ``config_datagram["transceivers"]`` - - Returns - ------- - A list of strings representing the channel IDS - """ - if self.sonar_model in ["EK60", "ES70"]: - return [ - self.parser_obj.config_datagram["transceivers"][int(i)]["channel_id"] - for i in chan_str - ] - else: - return [ - self.parser_obj.config_datagram["configuration"][i]["channel_id"] for i in chan_str - ] - - def _get_power_dataarray(self, zarr_path: str) -> xr.DataArray: - """ - Constructs a DataArray from a Dask array for the power - data. 
- - Parameters - ---------- - zarr_path: str - Path to the zarr file that contain the power data - - Returns - ------- - DataArray named "backscatter_r" representing the - power data. - """ - - # collect variables associated with the power data - power = dask.array.from_zarr(zarr_path, component="power/power") - - pow_time_path = "power/" + self.parsed2zarr_obj.power_dims[0] - pow_chan_path = "power/" + self.parsed2zarr_obj.power_dims[1] - power_time = dask.array.from_zarr(zarr_path, component=pow_time_path).compute() - power_channel = dask.array.from_zarr(zarr_path, component=pow_chan_path).compute() - - # obtain channel names for power data - pow_chan_names = self._get_channel_ids(power_channel) - - backscatter_r = xr.DataArray( - data=power, - coords={ - "ping_time": ( - ["ping_time"], - power_time, - self._varattrs["beam_coord_default"]["ping_time"], - ), - "channel": ( - ["channel"], - pow_chan_names, - self._varattrs["beam_coord_default"]["channel"], - ), - "range_sample": ( - ["range_sample"], - np.arange(power.shape[2]), - self._varattrs["beam_coord_default"]["range_sample"], - ), - }, - name="backscatter_r", - attrs={ - "long_name": self._varattrs["beam_var_default"]["backscatter_r"]["long_name"], - "units": "dB", - }, - ) - - return backscatter_r - - def _get_angle_dataarrays(self, zarr_path: str) -> Tuple[xr.DataArray, xr.DataArray]: - """ - Constructs the DataArrays from Dask arrays associated - with the angle data. - - Parameters - ---------- - zarr_path: str - Path to the zarr file that contains the angle data - - Returns - ------- - DataArrays named "angle_athwartship" and "angle_alongship", - respectively, representing the angle data. - """ - - # collect variables associated with the angle data - angle_along = dask.array.from_zarr(zarr_path, component="angle/angle_alongship") - angle_athwart = dask.array.from_zarr(zarr_path, component="angle/angle_athwartship") - - ang_time_path = "angle/" + self.parsed2zarr_obj.angle_dims[0] - ang_chan_path = "angle/" + self.parsed2zarr_obj.angle_dims[1] - angle_time = dask.array.from_zarr(zarr_path, component=ang_time_path).compute() - angle_channel = dask.array.from_zarr(zarr_path, component=ang_chan_path).compute() - - # obtain channel names for angle data - ang_chan_names = self._get_channel_ids(angle_channel) - - array_coords = { - "ping_time": ( - ["ping_time"], - angle_time, - self._varattrs["beam_coord_default"]["ping_time"], - ), - "channel": ( - ["channel"], - ang_chan_names, - self._varattrs["beam_coord_default"]["channel"], - ), - "range_sample": ( - ["range_sample"], - np.arange(angle_athwart.shape[2]), - self._varattrs["beam_coord_default"]["range_sample"], - ), - } - - angle_athwartship = xr.DataArray( - data=angle_athwart, - coords=array_coords, - name="angle_athwartship", - attrs={ - "long_name": "electrical athwartship angle", - "comment": ( - "Introduced in echopype for Simrad echosounders. " # noqa - + "The athwartship angle corresponds to the major angle in SONAR-netCDF4 vers 2. " # noqa - ), - }, - ) - - angle_alongship = xr.DataArray( - data=angle_along, - coords=array_coords, - name="angle_alongship", - attrs={ - "long_name": "electrical alongship angle", - "comment": ( - "Introduced in echopype for Simrad echosounders. " # noqa - + "The alongship angle corresponds to the minor angle in SONAR-netCDF4 vers 2. 
" # noqa - ), - }, - ) - - return angle_athwartship, angle_alongship - - def _get_complex_dataarrays(self, zarr_path: str) -> Tuple[xr.DataArray, xr.DataArray]: - """ - Constructs the DataArrays from Dask arrays associated - with the complex data. - - Parameters - ---------- - zarr_path: str - Path to the zarr file that contains the complex data - - Returns - ------- - DataArrays named "backscatter_r" and "backscatter_i", - respectively, representing the complex data. - """ - - # collect variables associated with the complex data - complex_r = dask.array.from_zarr(zarr_path, component="complex/backscatter_r") - complex_i = dask.array.from_zarr(zarr_path, component="complex/backscatter_i") - - comp_time_path = "complex/" + self.parsed2zarr_obj.complex_dims[0] - comp_chan_path = "complex/" + self.parsed2zarr_obj.complex_dims[1] - complex_time = dask.array.from_zarr(zarr_path, component=comp_time_path).compute() - complex_channel = dask.array.from_zarr(zarr_path, component=comp_chan_path).compute() - - # obtain channel names for complex data - comp_chan_names = self._get_channel_ids(complex_channel) - - array_coords = { - "ping_time": ( - ["ping_time"], - complex_time, - self._varattrs["beam_coord_default"]["ping_time"], - ), - "channel": ( - ["channel"], - comp_chan_names, - self._varattrs["beam_coord_default"]["channel"], - ), - "range_sample": ( - ["range_sample"], - np.arange(complex_r.shape[2]), - self._varattrs["beam_coord_default"]["range_sample"], - ), - "beam": ( - ["beam"], - np.arange(start=1, stop=complex_r.shape[3] + 1).astype(str), - self._varattrs["beam_coord_default"]["beam"], - ), - } - - backscatter_r = xr.DataArray( - data=complex_r, - coords=array_coords, - name="backscatter_r", - attrs={ - "long_name": self._varattrs["beam_var_default"]["backscatter_r"]["long_name"], - "units": "dB", - }, - ) - - backscatter_i = xr.DataArray( - data=complex_i, - coords=array_coords, - name="backscatter_i", - attrs={ - "long_name": self._varattrs["beam_var_default"]["backscatter_i"]["long_name"], - "units": "dB", - }, - ) - - return backscatter_r, backscatter_i diff --git a/echopype/convert/set_groups_ek60.py b/echopype/convert/set_groups_ek60.py index f5270014b..c8529091b 100644 --- a/echopype/convert/set_groups_ek60.py +++ b/echopype/convert/set_groups_ek60.py @@ -336,9 +336,8 @@ def _set_beam_group1_zarr_vars(self, ds: xr.Dataset) -> xr.Dataset: # functions below. 
# obtain DataArrays using zarr variables - zarr_path = self.parsed2zarr_obj.zarr_file_name - backscatter_r = self._get_power_dataarray(zarr_path) - angle_athwartship, angle_alongship = self._get_angle_dataarrays(zarr_path) + backscatter_r = self.parsed2zarr_obj.power_dataarray + angle_athwartship, angle_alongship = self.parsed2zarr_obj.angle_dataarrays # append DataArrays created from zarr file ds = ds.assign( diff --git a/echopype/convert/set_groups_ek80.py b/echopype/convert/set_groups_ek80.py index 51c4aae23..77b06cdf3 100644 --- a/echopype/convert/set_groups_ek80.py +++ b/echopype/convert/set_groups_ek80.py @@ -74,7 +74,7 @@ def __init__(self, *args, **kwargs): # if we have zarr files, create parser_obj.ch_ids if self.parsed2zarr_obj.temp_zarr_dir: for k, v in self.parsed2zarr_obj.p2z_ch_ids.items(): - self.parser_obj.ch_ids[k] = self._get_channel_ids(v) + self.parser_obj.ch_ids[k] = self.parsed2zarr_obj._get_channel_ids(v) # obtain sorted channel dict in ascending order for each usage scenario self.sorted_channel = { @@ -1067,26 +1067,25 @@ def _get_ds_beam_power_zarr(self, ds_invariant_power: xr.Dataset) -> xr.Dataset: # functions below. # obtain DataArrays using zarr variables - zarr_path = self.parsed2zarr_obj.zarr_file_name - backscatter_r = self._get_power_dataarray(zarr_path) - angle_athwartship, angle_alongship = self._get_angle_dataarrays(zarr_path) + backscatter_r = self.parsed2zarr_obj.power_dataarray + angle_athwartship, angle_alongship = self.parsed2zarr_obj.angle_dataarrays + + # Obtain RAW4 transmit pulse data if it exists + tx_pulse_list = [] + if ( + hasattr(self.parsed2zarr_obj, "tx_complex_dataarrays") + and self.parsed2zarr_obj.tx_complex_dataarrays is not None + ): + tx_pulse_list = list(self.parsed2zarr_obj.tx_complex_dataarrays) # create power related ds using DataArrays created from zarr file - ds_power = xr.merge([backscatter_r, angle_athwartship, angle_alongship]) + ds_power = xr.merge( + [backscatter_r, angle_athwartship, angle_alongship] + tx_pulse_list, + combine_attrs="override", + ) ds_power = set_time_encodings(ds_power) - # obtain additional variables that need to be added to ds_power - ds_tmp = [] - for ch in self.sorted_channel["power"]: - ds_data = self._add_trasmit_pulse_complex(ds_tmp=xr.Dataset(), ch=ch) - ds_data = set_time_encodings(ds_data) - - ds_data = self._attach_vars_to_ds_data(ds_data, ch, rs_size=ds_power.range_sample.size) - ds_tmp.append(ds_data) - - ds_tmp = self.merge_save(ds_tmp, ds_invariant_power) - - return xr.merge([ds_tmp, ds_power], combine_attrs="override") + return xr.merge([ds_invariant_power, ds_power], combine_attrs="override") def _get_ds_complex_zarr(self, ds_invariant_complex: xr.Dataset) -> xr.Dataset: """ @@ -1109,8 +1108,7 @@ def _get_ds_complex_zarr(self, ds_invariant_complex: xr.Dataset) -> xr.Dataset: # functions below. 
# obtain DataArrays using zarr variables - zarr_path = self.parsed2zarr_obj.zarr_file_name - backscatter_r, backscatter_i = self._get_complex_dataarrays(zarr_path) + backscatter_r, backscatter_i = self.parsed2zarr_obj.complex_dataarrays # create power related ds using DataArrays created from zarr file ds_complex = xr.merge([backscatter_r, backscatter_i]) diff --git a/echopype/convert/utils/ek.py b/echopype/convert/utils/ek.py new file mode 100644 index 000000000..3393f1723 --- /dev/null +++ b/echopype/convert/utils/ek.py @@ -0,0 +1,89 @@ +import sys +from functools import reduce + +import numpy as np +import pandas as pd +import psutil + +COMPLEX_VAR = "complex" + + +def _get_power_dims(dgram_zarr_vars): + return list(reduce(lambda x, y: {*x, *y}, dgram_zarr_vars.values())) + + +def _extract_datagram_dfs(zarr_datagrams, dgram_zarr_vars): + data_keys = dgram_zarr_vars.keys() + power_dims = _get_power_dims(dgram_zarr_vars) + power_angle = [k for k in data_keys if k != COMPLEX_VAR] + + datagram_df = pd.DataFrame.from_dict(zarr_datagrams) + + pow_ang_df = datagram_df[power_dims + power_angle] + + complex_df = None + if COMPLEX_VAR in datagram_df: + # Not EK60 + complex_df = datagram_df[power_dims + [COMPLEX_VAR]] + + # Clean up nans if there's any + if isinstance(pow_ang_df, pd.DataFrame): + pow_ang_df = pow_ang_df.dropna().reset_index(drop=True) + + if isinstance(complex_df, pd.DataFrame): + complex_df = complex_df.dropna().reset_index(drop=True) + + return pow_ang_df, complex_df + + +def get_req_mem(datagram_df, dgram_zarr_vars): + total_req_mem = 0 + if datagram_df is not None: + power_dims = _get_power_dims(dgram_zarr_vars) + df_shapes = datagram_df.apply( + lambda col: col.unique().shape + if col.name in power_dims + else col.apply(lambda row: row.shape).max(), + result_type="reduce", + ) + + for k, v in dgram_zarr_vars.items(): + if k in df_shapes: + cols = v + [k] + expected_shape = reduce(lambda x, y: x + y, df_shapes[cols]) + itemsize = datagram_df[k].dtype.itemsize + req_mem = np.prod(expected_shape) * itemsize + total_req_mem += req_mem + + return total_req_mem + + +def _get_zarr_dgrams_size(zarr_datagrams) -> int: + """ + Returns the size in bytes of the list of zarr + datagrams. + """ + + size = 0 + for i in zarr_datagrams: + size += sum([sys.getsizeof(val) for val in i.values()]) + + return size + + +def should_use_swap(zarr_datagrams, dgram_zarr_vars, mem_mult: float = 0.3) -> bool: + zdgrams_mem = _get_zarr_dgrams_size(zarr_datagrams) + + # Estimate expansion size + pow_ang_df, complex_df = _extract_datagram_dfs(zarr_datagrams, dgram_zarr_vars) + pow_ang_mem = get_req_mem(pow_ang_df, dgram_zarr_vars) + complex_mem = get_req_mem(complex_df, dgram_zarr_vars) + total_mem = pow_ang_mem + complex_mem + + # get statistics about system memory usage + mem = psutil.virtual_memory() + + # approx. 
the amount of memory that will be used after expansion + req_mem = mem.used - zdgrams_mem + total_mem + + return mem.total * mem_mult < req_mem diff --git a/echopype/convert/utils/ek_raw_parsers.py b/echopype/convert/utils/ek_raw_parsers.py index dae9161a7..8c349a0f2 100644 --- a/echopype/convert/utils/ek_raw_parsers.py +++ b/echopype/convert/utils/ek_raw_parsers.py @@ -16,6 +16,7 @@ import numpy as np from ...utils.log import _init_logger +from ...utils.misc import camelcase2snakecase from .ek_date_conversion import nt_to_unix TCVR_CH_NUM_MATCHER = re.compile(r"\d{6}-\w{1,2}|\w{12}-\w{1,2}") @@ -706,22 +707,6 @@ def _unpack_contents(self, raw_string, bytes_read, version): :returns: None """ - def from_CamelCase(xml_param): - """ - convert name from CamelCase to fit with existing naming convention by - inserting an underscore before each capital and then lowering the caps - e.g. CamelCase becomes camel_case. - """ - idx = list(reversed([i for i, c in enumerate(xml_param) if c.isupper()])) - param_len = len(xml_param) - for i in idx: - # check if we should insert an underscore - if i > 0 and i < param_len: - xml_param = xml_param[:i] + "_" + xml_param[i:] - xml_param = xml_param.lower() - - return xml_param - def dict_to_dict(xml_dict, data_dict, parse_opts): """ dict_to_dict appends the ETree xml value dicts to a provided dictionary @@ -760,13 +745,13 @@ def dict_to_dict(xml_dict, data_dict, parse_opts): data_dict[parse_opts[k][1]] = data else: # add using the default key name wrangling - data_dict[from_CamelCase(k)] = data + data_dict[camelcase2snakecase(k)] = data else: # nothing to do with the value string data = xml_dict[k] # add the parameter to the provided dictionary - data_dict[from_CamelCase(k)] = data + data_dict[camelcase2snakecase(k)] = data header_values = struct.unpack( self.header_fmt(version), raw_string[: self.header_size(version)] diff --git a/echopype/echodata/echodata.py b/echopype/echodata/echodata.py index 083eb47a2..7adeb3dbe 100644 --- a/echopype/echodata/echodata.py +++ b/echopype/echodata/echodata.py @@ -1,5 +1,4 @@ import datetime -import shutil import warnings from html import escape from pathlib import Path @@ -76,18 +75,19 @@ def __init__( self._varattrs = sonarnetcdf_1.yaml_dict["variable_and_varattributes"] - def __del__(self): - # TODO: this destructor seems to not work in Jupyter Lab if restart or - # even clear all outputs is used. It will work if you explicitly delete the object - - if (self.parsed2zarr_obj is not None) and (self.parsed2zarr_obj.zarr_file_name is not None): + def cleanup(self): + if (self.parsed2zarr_obj is not None) and (self.parsed2zarr_obj.store is not None): # get Path object of temporary zarr file created by Parsed2Zarr - p2z_temp_file = Path(self.parsed2zarr_obj.zarr_file_name) + p2z_temp_file = self.parsed2zarr_obj.store # remove temporary directory created by Parsed2Zarr, if it exists - if p2z_temp_file.exists(): - # TODO: do we need to check file permissions here? - shutil.rmtree(p2z_temp_file) + if p2z_temp_file.fs.exists(p2z_temp_file.root): + p2z_temp_file.fs.rm(p2z_temp_file.root, recursive=True) + + def __del__(self): + # TODO: this destructor seems to not work in Jupyter Lab if restart or + # even clear all outputs is used. 
It will work if you explicitly delete the object + self.cleanup() def __str__(self) -> str: fpath = "Internal Memory" diff --git a/echopype/mask/__init__.py b/echopype/mask/__init__.py index 5867b1656..c2233c112 100644 --- a/echopype/mask/__init__.py +++ b/echopype/mask/__init__.py @@ -1,3 +1,4 @@ from .api import apply_mask, frequency_differencing, get_seabed_mask, shoal_weill __all__ = ["frequency_differencing", "apply_mask", "get_seabed_mask", "shoal_weill"] + diff --git a/echopype/tests/clean/test_clean_impulse_noise.py b/echopype/tests/clean/test_clean_impulse_noise.py index 3d401299e..5bd55aa83 100644 --- a/echopype/tests/clean/test_clean_impulse_noise.py +++ b/echopype/tests/clean/test_clean_impulse_noise.py @@ -1,45 +1,40 @@ -import os import pytest - import numpy as np -import xarray as xr - import echopype.clean - +from echopype.clean.impulse_noise import ( + RYAN_DEFAULT_PARAMS, + RYAN_ITERABLE_DEFAULT_PARAMS, + WANG_DEFAULT_PARAMS, +) # Note: We've removed all the setup and utility functions since they are now in conftest.py +DESIRED_CHANNEL = "GPT 120 kHz 00907203422d 1 ES120-7" +DESIRED_FREQUENCY = 120000 + @pytest.mark.parametrize( - "method,thr,m,n,erode,dilate,median,expected_true_false_counts", + "method,parameters,desired_channel,desired_frequency,expected_true_false_counts", [ - ("ryan", 10, 5, 1, None, None, None, (2130885, 32419)), - ("ryan_iterable", 10, 5, (1, 2), None, None, None, (2125144, 38160)), - ("wang", (-70, -40), None, None, [(3, 3)], [(5, 5), (7, 7)], [(7, 7)], (635732, 1527572)), + ("ryan", RYAN_DEFAULT_PARAMS, DESIRED_CHANNEL, None, (2130885, 32419)), + ("ryan_iterable", RYAN_ITERABLE_DEFAULT_PARAMS, DESIRED_CHANNEL, None, (2124976, 38328)), + ("wang", WANG_DEFAULT_PARAMS, None, DESIRED_FREQUENCY, (635732, 1527572)), ], ) def test_get_impulse_noise_mask( sv_dataset_jr230, # Use the specific fixture for the JR230 file method, - thr, - m, - n, - erode, - dilate, - median, + parameters, + desired_channel, + desired_frequency, expected_true_false_counts, ): source_Sv = sv_dataset_jr230 - desired_channel = "GPT 120 kHz 00907203422d 1 ES120-7" mask = echopype.clean.get_impulse_noise_mask( source_Sv, - desired_channel, - thr=thr, - m=m, - n=n, - erode=erode, - dilate=dilate, - median=median, + parameters=parameters, + desired_channel=desired_channel, + desired_frequency=desired_frequency, method=method, ) count_true = np.count_nonzero(mask) diff --git a/echopype/tests/clean/test_clean_transient_noise.py b/echopype/tests/clean/test_clean_transient_noise.py index e94795097..6c668b957 100644 --- a/echopype/tests/clean/test_clean_transient_noise.py +++ b/echopype/tests/clean/test_clean_transient_noise.py @@ -1,47 +1,34 @@ -import os import pytest import numpy as np -import xarray as xr import echopype.clean +from echopype.clean.transient_noise import RYAN_DEFAULT_PARAMS, FIELDING_DEFAULT_PARAMS # Note: We've removed all the setup and utility functions since they are now in conftest.py @pytest.mark.parametrize( - "mask_type,r0,r1,n,roff,thr,excludeabove,operation,expected_true_false_counts", + "method, parameters ,expected_true_false_counts", [ - ("ryan", None, None, 20, None, 20, 250, "percentile15", (1941916, 225015)), - ("fielding", 200, 1000, 20, 250, [2, 0], None, None, (1890033, 276898)), + ("ryan", RYAN_DEFAULT_PARAMS, (1941916, 225015)), + ("fielding", FIELDING_DEFAULT_PARAMS, (1890033, 276898)), ], ) def test_get_transient_mask( sv_dataset_jr161, # Use the specific fixture for the JR161 file - mask_type, - r0, - r1, - n, - roff, - thr, - excludeabove, - 
operation, + method, + parameters, expected_true_false_counts, ): source_Sv = sv_dataset_jr161 desired_channel = "GPT 38 kHz 009072033fa5 1 ES38" mask = echopype.clean.get_transient_noise_mask( source_Sv, - desired_channel, - mask_type=mask_type, - r0=r0, - r1=r1, - n=n, - roff=roff, - thr=thr, - excludeabove=excludeabove, - operation=operation, + parameters=parameters, + desired_channel=desired_channel, + method=method, ) count_true = np.count_nonzero(mask) count_false = mask.size - count_true diff --git a/echopype/tests/clean/test_noise.py b/echopype/tests/clean/test_noise.py index 6ee61768f..65bd65b9c 100644 --- a/echopype/tests/clean/test_noise.py +++ b/echopype/tests/clean/test_noise.py @@ -3,6 +3,9 @@ import echopype as ep import pytest from numpy.random import default_rng +from echopype.clean.transient_noise import RYAN_DEFAULT_PARAMS as RYAN_DEFAULT_PARAMS_TR +from echopype.clean.signal_attenuation import DEFAULT_RYAN_PARAMS +from echopype.clean.impulse_noise import RYAN_DEFAULT_PARAMS def test_remove_noise(): @@ -89,19 +92,23 @@ def test_remove_noise_no_sound_absorption(): def test_transient_mask_all(sv_dataset_jr161): source_Sv = sv_dataset_jr161 - ml = ep.clean.api.get_transient_noise_mask_multichannel(source_Sv) + ml = ep.clean.api.get_transient_noise_mask_multichannel( + source_Sv, method="ryan", parameters=RYAN_DEFAULT_PARAMS_TR + ) assert np.all(ml["channel"] == source_Sv["channel"]) def test_impulse_mask_all(sv_dataset_jr230): source_Sv = sv_dataset_jr230 ml = ep.clean.api.get_impulse_noise_mask_multichannel( - source_Sv, method="ryan", thr=10, m=5, n=1 + source_Sv, method="ryan", parameters=RYAN_DEFAULT_PARAMS ) assert np.all(ml["channel"] == source_Sv["channel"]) def test_attenuation_mask_all(sv_dataset_jr161): source_Sv = sv_dataset_jr161 - ml = ep.clean.api.get_attenuation_mask_multichannel(source_Sv) + ml = ep.clean.api.get_attenuation_mask_multichannel( + source_Sv, method="ryan", parameters=DEFAULT_RYAN_PARAMS + ) assert np.all(ml["channel"] == source_Sv["channel"]) diff --git a/echopype/tests/clean/test_signal_attenuation.py b/echopype/tests/clean/test_signal_attenuation.py index fbc6a4cb8..5ac1f1f3e 100644 --- a/echopype/tests/clean/test_signal_attenuation.py +++ b/echopype/tests/clean/test_signal_attenuation.py @@ -3,31 +3,29 @@ import echopype.clean +DEFAULT_RYAN_PARAMS = {"r0": 180, "r1": 280, "n": 30, "thr": -6, "start": 0} +DEFAULT_ARIZA_PARAMS = {"offset": 20, "thr": (-40, -35), "m": 20, "n": 50} + + @pytest.mark.parametrize( - "mask_type,r0,r1,n,m,thr,start,offset,expected_true_false_counts", + "method,parameters,expected_true_false_counts", [ - ("ryan", 180, 280, 30, None, -6, 0, 0, (1613100, 553831)), + ("ryan", DEFAULT_RYAN_PARAMS, (1613100, 553831)), + ("ariza", DEFAULT_ARIZA_PARAMS, (39897, 2127034)), ], ) def test_get_signal_attenuation_mask( - sv_dataset_jr161, mask_type, r0, r1, n, m, thr, start, offset, expected_true_false_counts + sv_dataset_jr161, method, parameters, expected_true_false_counts ): # source_Sv = get_sv_dataset(test_data_path) desired_channel = "GPT 38 kHz 009072033fa5 1 ES38" mask = echopype.clean.api.get_attenuation_mask( sv_dataset_jr161, + parameters=parameters, + method=method, desired_channel=desired_channel, - mask_type=mask_type, - r0=r0, - r1=r1, - thr=thr, - m=m, - n=n, - start=start, - offset=offset, ) count_true = np.count_nonzero(mask) count_false = mask.size - count_true true_false_counts = (count_true, count_false) - assert true_false_counts == expected_true_false_counts diff --git 
a/echopype/tests/commongrid/conftest.py b/echopype/tests/commongrid/conftest.py new file mode 100644 index 000000000..27b166a03 --- /dev/null +++ b/echopype/tests/commongrid/conftest.py @@ -0,0 +1,519 @@ +import pytest + +import xarray as xr +import numpy as np +import pandas as pd + +from echopype.consolidate import add_depth +import echopype as ep + + +@pytest.fixture +def random_number_generator(): + """Random number generator for tests""" + return np.random.default_rng() + + +@pytest.fixture +def mock_nan_ilocs(): + """NaN i locations for irregular Sv dataset + + It's a list of tuples, each tuple contains + (channel, ping_time, range_sample) + + Notes + ----- + This was created with the following code: + + ``` + import numpy as np + + random_positions = [] + for i in range(20): + random_positions.append(( + np.random.randint(0, 2), + np.random.randint(0, 5), + np.random.randint(0, 20)) + ) + ``` + """ + return [ + (1, 1, 10), + (1, 0, 16), + (0, 3, 6), + (0, 2, 11), + (0, 2, 6), + (1, 1, 14), + (0, 1, 17), + (1, 4, 19), + (0, 3, 3), + (0, 0, 19), + (0, 1, 5), + (1, 2, 9), + (1, 4, 18), + (0, 1, 5), + (0, 4, 4), + (0, 1, 6), + (1, 2, 2), + (0, 1, 2), + (0, 4, 8), + (0, 1, 1), + ] + + +@pytest.fixture +def mock_parameters(): + """Small mock parameters""" + return { + "channel_len": 2, + "ping_time_len": 10, + "depth_len": 20, + "ping_time_interval": "0.3S", + } + + +@pytest.fixture +def mock_Sv_sample(mock_parameters): + """ + Mock Sv sample + + Dimension: (2, 10, 20) + """ + channel_len = mock_parameters["channel_len"] + ping_time_len = mock_parameters["ping_time_len"] + depth_len = mock_parameters["depth_len"] + + depth_data = np.linspace(0, 1, num=depth_len) + return np.tile(depth_data, (channel_len, ping_time_len, 1)) + + +@pytest.fixture +def mock_Sv_dataset_regular(mock_parameters, mock_Sv_sample): + ds_Sv = _gen_Sv_echo_range_regular(**mock_parameters, ping_time_jitter_max_ms=0) + ds_Sv["Sv"].data = mock_Sv_sample + return ds_Sv + + +@pytest.fixture +def mock_Sv_dataset_irregular(mock_parameters, mock_Sv_sample, mock_nan_ilocs): + depth_interval = [0.5, 0.32, 0.2] + depth_ping_time_len = [2, 3, 5] + ds_Sv = _gen_Sv_echo_range_irregular( + **mock_parameters, + depth_interval=depth_interval, + depth_ping_time_len=depth_ping_time_len, + ping_time_jitter_max_ms=30, # Added jitter to ping_time + ) + ds_Sv["Sv"].data = mock_Sv_sample + # Sprinkle nans around echo_range + for pos in mock_nan_ilocs: + ds_Sv["echo_range"][pos] = np.nan + return ds_Sv + + +@pytest.fixture +def mock_mvbs_inputs(): + return dict(range_meter_bin=2, ping_time_bin="1s") + + +@pytest.fixture +def mock_mvbs_array_regular(mock_Sv_dataset_regular, mock_mvbs_inputs, mock_parameters): + """ + Mock Sv sample result from compute_MVBS + + Dimension: (2, 3, 5) + Ping time bin: 1s + Range bin: 2m + """ + ds_Sv = mock_Sv_dataset_regular + ping_time_bin = mock_mvbs_inputs["ping_time_bin"] + range_bin = mock_mvbs_inputs["range_meter_bin"] + channel_len = mock_parameters["channel_len"] + expected_mvbs_val = _get_expected_mvbs_val( + ds_Sv, ping_time_bin, range_bin, channel_len + ) + + return expected_mvbs_val + + +@pytest.fixture +def mock_mvbs_array_irregular(mock_Sv_dataset_irregular, mock_mvbs_inputs, mock_parameters): + """ + Mock Sv sample irregular result from compute_MVBS + + Dimension: (2, 3, 5) + Ping time bin: 1s + Range bin: 2m + """ + ds_Sv = mock_Sv_dataset_irregular + ping_time_bin = mock_mvbs_inputs["ping_time_bin"] + range_bin = mock_mvbs_inputs["range_meter_bin"] + channel_len = mock_parameters["channel_len"] 
+ expected_mvbs_val = _get_expected_mvbs_val( + ds_Sv, ping_time_bin, range_bin, channel_len + ) + + return expected_mvbs_val + + +@pytest.fixture( + params=[ + ( + ("EK60", "ncei-wcsd", "Summer2017-D20170719-T211347.raw"), + "EK60", + None, + {}, + ), + ( + ("EK80_NEW", "echopype-test-D20211004-T235930.raw"), + "EK80", + None, + {"waveform_mode": "BB", "encode_mode": "complex"}, + ), + ( + ("EK80_NEW", "D20211004-T233354.raw"), + "EK80", + None, + {"waveform_mode": "CW", "encode_mode": "power"}, + ), + ( + ("EK80_NEW", "D20211004-T233115.raw"), + "EK80", + None, + {"waveform_mode": "CW", "encode_mode": "complex"}, + ), + (("ES70", "D20151202-T020259.raw"), "ES70", None, {}), + (("AZFP", "17082117.01A"), "AZFP", ("AZFP", "17041823.XML"), {}), + ( + ("AD2CP", "raw", "090", "rawtest.090.00001.ad2cp"), + "AD2CP", + None, + {}, + ), + ], + ids=[ + "ek60_cw_power", + "ek80_bb_complex", + "ek80_cw_power", + "ek80_cw_complex", + "es70", + "azfp", + "ad2cp", + ], +) +def test_data_samples(request, test_path): + ( + filepath, + sonar_model, + azfp_xml_path, + range_kwargs, + ) = request.param + if sonar_model.lower() in ["es70", "ad2cp"]: + pytest.xfail( + reason="Not supported at the moment", + ) + path_model, *paths = filepath + filepath = test_path[path_model].joinpath(*paths) + + if azfp_xml_path is not None: + path_model, *paths = azfp_xml_path + azfp_xml_path = test_path[path_model].joinpath(*paths) + return ( + filepath, + sonar_model, + azfp_xml_path, + range_kwargs, + ) + + +@pytest.fixture +def regular_data_params(): + return { + "channel_len": 4, + "depth_len": 4000, + "ping_time_len": 100, + "ping_time_jitter_max_ms": 0, + } + + +@pytest.fixture +def ds_Sv_echo_range_regular(regular_data_params, random_number_generator): + return _gen_Sv_echo_range_regular( + **regular_data_params, + random_number_generator=random_number_generator, + ) + + +@pytest.fixture +def latlon_history_attr(): + return ( + "2023-08-31 12:00:00.000000 +00:00. " + "Interpolated or propagated from Platform latitude/longitude." # noqa + ) + + +@pytest.fixture +def lat_attrs(latlon_history_attr): + """Latitude attributes""" + return { + "long_name": "Platform latitude", + "standard_name": "latitude", + "units": "degrees_north", + "valid_range": "(-90.0, 90.0)", + "history": latlon_history_attr, + } + + +@pytest.fixture +def lon_attrs(latlon_history_attr): + """Longitude attributes""" + return { + "long_name": "Platform longitude", + "standard_name": "longitude", + "units": "degrees_east", + "valid_range": "(-180.0, 180.0)", + "history": latlon_history_attr, + } + + +@pytest.fixture +def depth_offset(): + """Depth offset for calculating depth""" + return 2.5 + + +@pytest.fixture +def ds_Sv_echo_range_regular_w_latlon(ds_Sv_echo_range_regular, lat_attrs, lon_attrs): + """Sv dataset with latitude and longitude""" + n_pings = ds_Sv_echo_range_regular.ping_time.shape[0] + latitude = np.linspace(42, 43, num=n_pings) + longitude = np.linspace(-124, -125, num=n_pings) + + ds_Sv_echo_range_regular["latitude"] = (["ping_time"], latitude, lat_attrs) + ds_Sv_echo_range_regular["longitude"] = (["ping_time"], longitude, lon_attrs) + + # Need processing level code for compute MVBS to work! 
+ ds_Sv_echo_range_regular.attrs["processing_level"] = "Level 2A" + return ds_Sv_echo_range_regular + + +@pytest.fixture +def ds_Sv_echo_range_regular_w_depth(ds_Sv_echo_range_regular, depth_offset): + """Sv dataset with depth""" + return ds_Sv_echo_range_regular.pipe(add_depth, depth_offset=depth_offset) + + +@pytest.fixture +def ds_Sv_echo_range_irregular(random_number_generator): + depth_interval = [0.5, 0.32, 0.13] + depth_ping_time_len = [100, 300, 200] + ping_time_len = 600 + ping_time_interval = "0.3S" + return _gen_Sv_echo_range_irregular( + depth_interval=depth_interval, + depth_ping_time_len=depth_ping_time_len, + ping_time_len=ping_time_len, + ping_time_interval=ping_time_interval, + ping_time_jitter_max_ms=0, + random_number_generator=random_number_generator, + ) + + +# Helper functions to generate mock Sv and MVBS dataset +def _get_expected_mvbs_val( + ds_Sv: xr.Dataset, ping_time_bin: str, range_bin: float, channel_len: int = 2 +) -> np.ndarray: + """ + Helper functions to generate expected MVBS outputs from mock Sv dataset + by brute-force looping and compute the mean + + Parameters + ---------- + ds_Sv : xr.Dataset + Mock Sv dataset + ping_time_bin : str + Ping time bin + range_bin : float + Range bin + channel_len : int, default 2 + Number of channels + """ + # create bin information needed for ping_time + d_index = ( + ds_Sv["ping_time"] + .resample(ping_time=ping_time_bin, skipna=True) + .first() # Not actually being used, but needed to get the bin groups + .indexes["ping_time"] + ) + ping_interval = d_index.union([d_index[-1] + pd.Timedelta(ping_time_bin)]).values + + # create bin information for echo_range + # this computes the echo range max since there might NaNs in the data + echo_range_max = ds_Sv["echo_range"].max() + range_interval = np.arange(0, echo_range_max + 2, range_bin) + + sv = ds_Sv["Sv"].pipe(ep.utils.compute._log2lin) + + expected_mvbs_val = np.ones((2, len(ping_interval) - 1, len(range_interval) - 1)) * np.nan + + for ch_idx in range(channel_len): + for p_idx in range(len(ping_interval) - 1): + for r_idx in range(len(range_interval) - 1): + echo_range = ( + ds_Sv['echo_range'] + .isel(channel=ch_idx) + .sel(ping_time=slice(ping_interval[p_idx], ping_interval[p_idx+1])) + ) + r_idx_active = np.logical_and( + echo_range.data >= range_interval[r_idx], + echo_range.data < range_interval[r_idx+1] + ) + sv_tmp = sv.isel(channel=ch_idx).sel( + ping_time=slice(ping_interval[p_idx], ping_interval[p_idx+1])).data[r_idx_active] + if 0 in sv_tmp.shape: + expected_mvbs_val[ch_idx, p_idx, r_idx] = np.nan + else: + expected_mvbs_val[ch_idx, p_idx, r_idx] = np.mean(sv_tmp) + return ep.utils.compute._lin2log(expected_mvbs_val) + + +def _gen_ping_time(ping_time_len, ping_time_interval, ping_time_jitter_max_ms=0): + ping_time = pd.date_range("2018-07-01", periods=ping_time_len, freq=ping_time_interval) + if ping_time_jitter_max_ms != 0: # if to add jitter + jitter = ( + np.random.randint(ping_time_jitter_max_ms, size=ping_time_len) / 1000 + ) # convert to seconds + ping_time = pd.to_datetime(ping_time.astype(int) / 1e9 + jitter, unit="s") + return ping_time + + +def _gen_Sv_echo_range_regular( + channel_len=2, + depth_len=100, + depth_interval=0.5, + ping_time_len=600, + ping_time_interval="0.3S", + ping_time_jitter_max_ms=0, + random_number_generator=None, +): + """ + Generate a Sv dataset with uniform echo_range across all ping_time. + + ping_time_jitter_max_ms controlled jitter in milliseconds in ping_time. 
+ + Parameters + ------------ + channel_len + number of channels + depth_len + number of total depth bins + depth_interval + depth intervals, may have multiple values + ping_time_len + total number of ping_time + ping_time_interval + interval between pings + ping_time_jitter_max_ms + jitter of ping_time in milliseconds + """ + + if random_number_generator is None: + random_number_generator = np.random.default_rng() + + # regular echo_range + echo_range = np.array([[np.arange(depth_len)] * ping_time_len] * channel_len) * depth_interval + + # generate dataset + ds_Sv = xr.Dataset( + data_vars={ + "Sv": ( + ["channel", "ping_time", "range_sample"], + random_number_generator.random(size=(channel_len, ping_time_len, depth_len)), + ), + "echo_range": (["channel", "ping_time", "range_sample"], echo_range), + "frequency_nominal": (["channel"], np.arange(channel_len)), + }, + coords={ + "channel": [f"ch_{ch}" for ch in range(channel_len)], + "ping_time": _gen_ping_time(ping_time_len, ping_time_interval, ping_time_jitter_max_ms), + "range_sample": np.arange(depth_len), + }, + ) + + return ds_Sv + + +def _gen_Sv_echo_range_irregular( + channel_len=2, + depth_len=100, + depth_interval=[0.5, 0.32, 0.13], + depth_ping_time_len=[100, 300, 200], + ping_time_len=600, + ping_time_interval="0.3S", + ping_time_jitter_max_ms=0, + random_number_generator=None, +): + """ + Generate a Sv dataset with uniform echo_range across all ping_time. + + ping_time_jitter_max_ms controlled jitter in milliseconds in ping_time. + + Parameters + ------------ + channel_len + number of channels + depth_len + number of total depth bins + depth_interval + depth intervals, may have multiple values + depth_ping_time_len + the number of pings to use each of the depth_interval + for example, with depth_interval=[0.5, 0.32, 0.13] + and depth_ping_time_len=[100, 300, 200], + the first 100 pings have echo_range with depth intervals of 0.5 m, + the next 300 pings have echo_range with depth intervals of 0.32 m, + and the last 200 pings have echo_range with depth intervals of 0.13 m. 
+ ping_time_len + total number of ping_time + ping_time_interval + interval between pings + ping_time_jitter_max_ms + jitter of ping_time in milliseconds + """ + if random_number_generator is None: + random_number_generator = np.random.default_rng() + + # check input + if len(depth_interval) != len(depth_ping_time_len): + raise ValueError("The number of depth_interval and depth_ping_time_len must be equal!") + + if ping_time_len != np.array(depth_ping_time_len).sum(): + raise ValueError("The number of total pings does not match!") + + # irregular echo_range + echo_range_list = [] + for d, dp in zip(depth_interval, depth_ping_time_len): + echo_range_list.append(np.array([[np.arange(depth_len)] * dp] * channel_len) * d) + echo_range = np.hstack(echo_range_list) + + # generate dataset + ds_Sv = xr.Dataset( + data_vars={ + "Sv": ( + ["channel", "ping_time", "range_sample"], + random_number_generator.random(size=(channel_len, ping_time_len, depth_len)), + ), + "echo_range": (["channel", "ping_time", "range_sample"], echo_range), + "frequency_nominal": (["channel"], np.arange(channel_len)), + }, + coords={ + "channel": [f"ch_{ch}" for ch in range(channel_len)], + "ping_time": _gen_ping_time(ping_time_len, ping_time_interval, ping_time_jitter_max_ms), + "range_sample": np.arange(depth_len), + }, + ) + + return ds_Sv + + +# End helper functions diff --git a/echopype/tests/commongrid/test_api.py b/echopype/tests/commongrid/test_api.py new file mode 100644 index 000000000..d618d6443 --- /dev/null +++ b/echopype/tests/commongrid/test_api.py @@ -0,0 +1,289 @@ +import pytest +import echopype as ep +import numpy as np + + +# Utilities Tests +@pytest.mark.parametrize( + ["x_bin", "x_label", "expected_result"], + [ + # Success + ("10m", "range_bin", 10.0), + ("0.2m", "range_bin", 0.2), + ("0.5nmi", "dist_bin", 0.5), + # Errored + (10, "range_bin", TypeError), + ("10km", "range_bin", ValueError), + ("10", "range_bin", ValueError), + ("10m", "invalid_label", KeyError), + ], +) +def test__parse_x_bin(x_bin, x_label, expected_result): + if x_label == "invalid_label": + expected_error_msg = r"x_label must be one of" + elif isinstance(x_bin, int): + expected_error_msg = r"must be a string" + elif x_bin in ["10km", "10"]: + expected_error_msg = r"must be in" + + if not isinstance(expected_result, float): + with pytest.raises(expected_result, match=expected_error_msg): + ep.commongrid.api._parse_x_bin(x_bin, x_label) + else: + assert ep.commongrid.api._parse_x_bin(x_bin, x_label) == expected_result + +# NASC Tests +@pytest.mark.integration +@pytest.mark.skip(reason="NASC is not implemented yet") +def test_compute_NASC(test_data_samples): + pass + + +# MVBS Tests +@pytest.mark.integration +def test_compute_MVBS_index_binning(ds_Sv_echo_range_regular, regular_data_params): + """Test compute_MVBS_index_binning on mock data""" + + ping_num = 3 # number of pings to average over + range_sample_num = 7 # number of range_samples to average over + nchan = regular_data_params["channel_len"] + npings = regular_data_params["ping_time_len"] + nrange_samples = regular_data_params["depth_len"] + + # Binned MVBS test + ds_MVBS = ep.commongrid.compute_MVBS_index_binning( + ds_Sv_echo_range_regular, range_sample_num=range_sample_num, ping_num=ping_num + ) + + # Shape test + data_binned_shape = np.ceil( + (nchan, npings / ping_num, nrange_samples / range_sample_num) + ).astype(int) + assert np.all(ds_MVBS.Sv.shape == data_binned_shape) + + # Expected values compute + # average should be done in linear domain + da_sv = 10 ** 
(ds_Sv_echo_range_regular["Sv"] / 10) + expected = 10 * np.log10( + da_sv.coarsen(ping_time=ping_num, range_sample=range_sample_num, boundary="pad").mean( + skipna=True + ) + ) + + # Test all values in MVBS + assert np.array_equal(ds_MVBS.Sv.data, expected.data) + + +@pytest.mark.unit +@pytest.mark.parametrize( + ["range_bin", "ping_time_bin"], [(5, "10S"), ("10m", 10), ("10km", "10S"), ("10", "10S")] +) +def test_compute_MVBS_bin_inputs_fail(ds_Sv_echo_range_regular, range_bin, ping_time_bin): + expected_error = ValueError + if isinstance(range_bin, int) or isinstance(ping_time_bin, int): + expected_error = TypeError + match = r"must be a string" + else: + match = r"Range bin must be in meters" + + with pytest.raises(expected_error, match=match): + ep.commongrid.compute_MVBS( + ds_Sv_echo_range_regular, range_bin=range_bin, ping_time_bin=ping_time_bin + ) + + +@pytest.mark.unit +def test_compute_MVBS_w_latlon(ds_Sv_echo_range_regular_w_latlon, lat_attrs, lon_attrs): + """Testing for compute_MVBS with latitude and longitude""" + from echopype.consolidate.api import POSITION_VARIABLES + + ds_MVBS = ep.commongrid.compute_MVBS( + ds_Sv_echo_range_regular_w_latlon, range_bin="5m", ping_time_bin="10S" + ) + for var in POSITION_VARIABLES: + # Check to ensure variable is in dataset + assert var in ds_MVBS.data_vars + # Check for correct shape, which is just ping time + assert ds_MVBS[var].shape == ds_MVBS.ping_time.shape + + # Check if attributes match + if var == "latitude": + assert ds_MVBS[var].attrs == lat_attrs + elif var == "longitude": + assert ds_MVBS[var].attrs == lon_attrs + + +@pytest.mark.unit +@pytest.mark.parametrize("range_var", ["my_range", "echo_range", "depth"]) +def test_compute_MVBS_invalid_range_var(ds_Sv_echo_range_regular, range_var): + """Test compute MVBS range_var on mock data""" + + if range_var == "my_range": + with pytest.raises(ValueError, match="range_var must be one of 'echo_range' or 'depth'."): + ep.commongrid.compute_MVBS(ds_Sv_echo_range_regular, range_var=range_var) + elif range_var == "depth": + with pytest.raises( + ValueError, match=f"range_var '{range_var}' does not exist in the input dataset." + ): + ep.commongrid.compute_MVBS(ds_Sv_echo_range_regular, range_var=range_var) + else: + pass + + +@pytest.mark.integration +def test_compute_MVBS(test_data_samples): + """ + Test running through from open_raw to compute_MVBS. + """ + ( + filepath, + sonar_model, + azfp_xml_path, + range_kwargs, + ) = test_data_samples + ed = ep.open_raw(filepath, sonar_model, azfp_xml_path) + if ed.sonar_model.lower() == "azfp": + avg_temperature = ed["Environment"]["temperature"].values.mean() + env_params = { + "temperature": avg_temperature, + "salinity": 27.9, + "pressure": 59, + } + range_kwargs["env_params"] = env_params + if "azfp_cal_type" in range_kwargs: + range_kwargs.pop("azfp_cal_type") + Sv = ep.calibrate.compute_Sv(ed, **range_kwargs) + ping_time_bin = "20S" + ds_MVBS = ep.commongrid.compute_MVBS(Sv, ping_time_bin=ping_time_bin) + assert ds_MVBS is not None + + # Test to see if ping_time was resampled correctly + expected_ping_time = ( + Sv["ping_time"].resample(ping_time=ping_time_bin, skipna=True).asfreq().indexes["ping_time"] + ) + assert np.array_equal(ds_MVBS.ping_time.data, expected_ping_time.values) + + +@pytest.mark.integration +@pytest.mark.parametrize( + ("er_type"), + [ + ("regular"), + ("irregular"), + ], +) +def test_compute_MVBS_range_output(request, er_type): + """ + Tests the shape of compute_MVBS output on regular and irregular data. 
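A simplified sketch (resampling step omitted, dates made up) of how the ping_time bin edges used by `_get_expected_mvbs_val` above and by the `get_MVBS_along_channels` test below are assembled, with one extra edge appended so the last bin has an explicit right boundary:

import pandas as pd

d_index = pd.date_range("2018-07-01", periods=4, freq="10S")        # 4 left edges, 10 s apart
ping_interval = d_index.union([d_index[-1] + pd.Timedelta("10S")])  # append a closing right edge
# 5 edges in total -> 4 ping_time bins for the binned mean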
+ The irregularity in the input echo_range would cause some rows or columns + of the output Sv to contain NaN. + Here we test for the expected shape after dropping the NaNs + for specific ping_time bins. + """ + # set jitter=0 to get predictable number of ping within each echo_range groups + if er_type == "regular": + ds_Sv = request.getfixturevalue("ds_Sv_echo_range_regular") + else: + ds_Sv = request.getfixturevalue("ds_Sv_echo_range_irregular") + + ds_MVBS = ep.commongrid.compute_MVBS(ds_Sv, range_bin="5m", ping_time_bin="10S") + + if er_type == "regular": + expected_len = ( + ds_Sv["channel"].size, # channel + np.ceil(np.diff(ds_Sv["ping_time"][[0, -1]].astype(int)) / 1e9 / 10), # ping_time + np.ceil(ds_Sv["echo_range"].max() / 5), # depth + ) + assert ds_MVBS["Sv"].shape == expected_len + else: + assert (ds_MVBS["Sv"].isel(ping_time=slice(None, 3)).dropna(dim="echo_range").shape) == ( + 2, + 3, + 10, + ) # full array, no NaN + assert (ds_MVBS["Sv"].isel(ping_time=slice(3, 12)).dropna(dim="echo_range").shape) == ( + 2, + 9, + 7, + ) # bottom bins contain NaN + assert (ds_MVBS["Sv"].isel(ping_time=slice(12, None)).dropna(dim="echo_range").shape) == ( + 2, + 6, + 3, + ) # bottom bins contain NaN + + +@pytest.mark.integration +@pytest.mark.parametrize( + ("er_type"), + [ + ("regular"), + ("irregular"), + ], +) +def test_compute_MVBS_values(request, er_type): + """Tests for the values of compute_MVBS on regular and irregular data.""" + + def _parse_nans(mvbs, ds_Sv) -> np.ndarray: + """Go through and figure out nan values in result""" + echo_range_step = np.unique(np.diff(mvbs.Sv.echo_range.values))[0] + expected_outs = [] + # Loop over channels + for chan in mvbs.Sv.channel.values: + # Get ping times for this channel + ping_times = mvbs.Sv.ping_time.values + # Compute the total number of pings + ping_len = len(ping_times) + # Variable to store the expected output for this channel + chan_expected = [] + for idx in range(ping_len): + # Loop over pings and create slices + if idx < ping_len - 1: + ping_slice = slice(ping_times[idx], ping_times[idx + 1]) + else: + ping_slice = slice(ping_times[idx], None) + + # Get the original echo_range data for this channel and ping slice + da = ds_Sv.echo_range.sel(channel=chan, ping_time=ping_slice) + # Drop the nan values since this shouldn't be included in actual + # computation for compute_MVBS, a.k.a. 
'nanmean' + mean_values = da.dropna(dim="ping_time", how="all").values + # Compute the histogram of the mean values to get distribution + hist, _ = np.histogram( + mean_values[~np.isnan(mean_values)], + bins=np.append( + mvbs.Sv.echo_range.values, + # Add one more bin to account for the last value + mvbs.Sv.echo_range.values.max() + echo_range_step, + ), + ) + # Convert any non-zero values to False, and zero values to True + # to imitate having nan values since there's no value for that bin + chan_expected.append([False if v > 0 else True for v in hist]) + expected_outs.append(chan_expected) + return np.array(expected_outs) + + range_bin = "2m" + ping_time_bin = "1s" + + if er_type == "regular": + ds_Sv = request.getfixturevalue("mock_Sv_dataset_regular") + expected_mvbs = request.getfixturevalue("mock_mvbs_array_regular") + else: + # Mock irregular dataset contains jitter + # and NaN values in the bottom echo_range + ds_Sv = request.getfixturevalue("mock_Sv_dataset_irregular") + expected_mvbs = request.getfixturevalue("mock_mvbs_array_irregular") + + ds_MVBS = ep.commongrid.compute_MVBS(ds_Sv, range_bin=range_bin, ping_time_bin=ping_time_bin) + + expected_outputs = _parse_nans(ds_MVBS, ds_Sv) + + assert ds_MVBS.Sv.shape == expected_mvbs.shape + # Floating digits need to check with all close not equal + # Compare the values of the MVBS array with the expected values + assert np.allclose(ds_MVBS.Sv.values, expected_mvbs, atol=1e-10, rtol=1e-10, equal_nan=True) + + # Ensures that the computation of MVBS takes doesn't take into account NaN values + # that are sporadically placed in the echo_range values + assert np.array_equal(np.isnan(ds_MVBS.Sv.values), expected_outputs) diff --git a/echopype/tests/commongrid/test_mvbs.py b/echopype/tests/commongrid/test_mvbs.py index 449fe0b9c..78a77be0a 100644 --- a/echopype/tests/commongrid/test_mvbs.py +++ b/echopype/tests/commongrid/test_mvbs.py @@ -1,835 +1,67 @@ -import dask.array import numpy as np -from numpy.random import default_rng import pandas as pd import pytest -from typing import Tuple, Iterable, Union -import xarray as xr - -import echopype as ep -from echopype.commongrid.mvbs import bin_and_mean_2d - - -@pytest.fixture( - params=[ - ( - ("EK60", "ncei-wcsd", "Summer2017-D20170719-T211347.raw"), - "EK60", - None, - {}, - ), - ( - ("EK80_NEW", "echopype-test-D20211004-T235930.raw"), - "EK80", - None, - {'waveform_mode': 'BB', 'encode_mode': 'complex'}, - ), - ( - ("EK80_NEW", "D20211004-T233354.raw"), - "EK80", - None, - {'waveform_mode': 'CW', 'encode_mode': 'power'}, - ), - ( - ("EK80_NEW", "D20211004-T233115.raw"), - "EK80", - None, - {'waveform_mode': 'CW', 'encode_mode': 'complex'}, - ), - (("ES70", "D20151202-T020259.raw"), "ES70", None, {}), - (("AZFP", "17082117.01A"), "AZFP", ("AZFP", "17041823.XML"), {}), - ( - ("AD2CP", "raw", "090", "rawtest.090.00001.ad2cp"), - "AD2CP", - None, - {}, - ), - ], - ids=[ - "ek60_cw_power", - "ek80_bb_complex", - "ek80_cw_power", - "ek80_cw_complex", - "es70", - "azfp", - "ad2cp", - ], -) -def test_data_samples(request, test_path): - ( - filepath, - sonar_model, - azfp_xml_path, - range_kwargs, - ) = request.param - if sonar_model.lower() in ['es70', 'ad2cp']: - pytest.xfail( - reason="Not supported at the moment", - ) - path_model, *paths = filepath - filepath = test_path[path_model].joinpath(*paths) - - if azfp_xml_path is not None: - path_model, *paths = azfp_xml_path - azfp_xml_path = test_path[path_model].joinpath(*paths) - return ( - filepath, - sonar_model, - azfp_xml_path, - range_kwargs, - 
) - - -def _construct_MVBS_toy_data( - nchan, npings, nrange_samples, ping_size, range_sample_size -): - """Construct data with values that increase every ping_num and ``range_sample_num`` - so that the result of computing MVBS is a smaller array - that increases regularly for each resampled ``ping_time`` and ``range_sample`` - - Parameters - ---------- - nchan : int - number of channels - npings : int - number of pings - nrange_samples : int - number of range samples - ping_size : int - number of pings with the same value - range_sample_size : int - number of range samples with the same value - - Returns - ------- - np.ndarray - Array with blocks of ``ping_time`` and ``range_sample`` with the same value, - so that computing the MVBS will result in regularly increasing values - every row and column - """ - data = np.ones((nchan, npings, nrange_samples)) - for p_i, ping in enumerate(range(0, npings, ping_size)): - for r_i, rb in enumerate(range(0, nrange_samples, range_sample_size)): - data[0, ping : ping + ping_size, rb : rb + range_sample_size] += ( - r_i + p_i - ) - # First channel increases by 1 each row and column, second increases by 2, third by 3, etc. - for f in range(nchan): - data[f] = data[0] * (f + 1) - - return data - - -def _construct_MVBS_test_data(nchan, npings, nrange_samples): - """Construct data for testing the toy data from - `_construct_MVBS_toy_data` after it has gone through the - MVBS calculation. - - Parameters - ---------- - nchan : int - number of channels - npings : int - number of pings - nrange_samples : int - number of range samples - - Returns - ------- - np.ndarray - Array with values that increases regularly - every ping and range sample - """ - - # Construct test array - test_array = np.add(*np.indices((npings, nrange_samples))) - return np.array([(test_array + 1) * (f + 1) for f in range(nchan)]) - - -def test_compute_MVBS_index_binning(): - """Test compute_MVBS_index_binning on toy data""" - - # Parameters for toy data - nchan, npings, nrange_samples = 4, 40, 400 - ping_num = 3 # number of pings to average over - range_sample_num = 7 # number of range_samples to average over - - # Construct toy data that increases regularly every ping_num and range_sample_num - data = _construct_MVBS_toy_data( - nchan=nchan, - npings=npings, - nrange_samples=nrange_samples, - ping_size=ping_num, - range_sample_size=range_sample_num, - ) - - data_log = 10 * np.log10(data) # Convert to log domain - chan_index = np.arange(nchan).astype(str) - ping_index = np.arange(npings) - range_sample = np.arange(nrange_samples) - Sv = xr.DataArray( - data_log, - coords=[ - ('channel', chan_index), - ('ping_time', ping_index), - ('range_sample', range_sample), - ], - ) - Sv.name = "Sv" - ds_Sv = Sv.to_dataset() - ds_Sv["frequency_nominal"] = chan_index # just so there's value in freq_nominal - ds_Sv = ds_Sv.assign( - echo_range=xr.DataArray( - np.array([[np.linspace(0, 10, nrange_samples)] * npings] * nchan), - coords=Sv.coords, - ) - ) - - # Binned MVBS test - ds_MVBS = ep.commongrid.compute_MVBS_index_binning( - ds_Sv, range_sample_num=range_sample_num, ping_num=ping_num - ) - data_test = 10 ** (ds_MVBS.Sv / 10) # Convert to linear domain - - # Shape test - data_binned_shape = np.ceil( - (nchan, npings / ping_num, nrange_samples / range_sample_num) - ).astype(int) - assert np.all(data_test.shape == data_binned_shape) - - # Construct test array that increases by 1 for each range_sample and ping_time - test_array = _construct_MVBS_test_data( - nchan, data_binned_shape[1], 
data_binned_shape[2] - ) - - # Test all values in MVBS - assert np.allclose(data_test, test_array, rtol=0, atol=1e-12) - - -def _coll_test_comp_MVBS(ds_Sv, nchan, ping_num, - range_sample_num, ping_time_bin, - total_range, range_meter_bin): - """A collection of tests for test_compute_MVBS""" - - ds_MVBS = ep.commongrid.compute_MVBS( - ds_Sv, - range_meter_bin=range_meter_bin, - ping_time_bin=f'{ping_time_bin}S', - ) - - data_test = 10 ** (ds_MVBS.Sv / 10) # Convert to linear domain - - # Shape test - data_binned_shape = np.ceil((nchan, ping_num, range_sample_num)).astype(int) - assert np.all(data_test.shape == data_binned_shape) - - # Construct test array that increases by 1 for each range_sample and ping_time - test_array = _construct_MVBS_test_data( - nchan, data_binned_shape[1], data_binned_shape[2] - ) - - # Test all values in MVBS - assert np.allclose(data_test, test_array, rtol=0, atol=1e-12) - - # Test to see if ping_time was resampled correctly - test_ping_time = pd.date_range( - '1/1/2020', periods=np.ceil(ping_num), freq=f'{ping_time_bin}S' - ) - assert np.array_equal(data_test.ping_time, test_ping_time) - - # Test to see if range was resampled correctly - test_range = np.arange(0, total_range, range_meter_bin) - assert np.array_equal(data_test.echo_range, test_range) - - -def _fill_w_nans(narr, nan_ping_time, nan_range_sample): - """ - A routine that fills a numpy array with nans. - - Parameters - ---------- - narr : numpy array - Array of dimensions (ping_time, range_sample) - nan_ping_time : list - ping times to fill with nans - nan_range_sample: list - range samples to fill with nans - """ - if len(nan_ping_time) != len(nan_range_sample): - raise ValueError('These lists must be the same size!') - - # fill in nans according to the provided lists - for i, j in zip(nan_ping_time, nan_range_sample): - narr[i, j] = np.nan - - return narr - - -def _nan_cases_comp_MVBS(ds_Sv, chan): - """ - For a single channel, obtains numpy array - filled with nans for various cases - """ - - # get echo_range values for a single channel - one_chan_er = ds_Sv.echo_range.sel(channel=chan).copy().values - - # ping times to fill with NaNs - nan_ping_time_1 = [slice(None), slice(None)] - # range samples to fill with NaNs - nan_range_sample_1 = [3, 4] - # pad all ping_times with nans for a certain range_sample - case_1 = _fill_w_nans(one_chan_er, nan_ping_time_1, nan_range_sample_1) - - # get echo_range values for a single channel - one_chan_er = ds_Sv.echo_range.sel(channel=chan).copy().values - # ping times to fill with NaNs - nan_ping_time_2 = [1, 3, 5, 9] - # range samples to fill with NaNs - nan_range_sample_2 = [slice(None), slice(None), slice(None), slice(None)] - # pad all range_samples of certain ping_times - case_2 = _fill_w_nans(one_chan_er, nan_ping_time_2, nan_range_sample_2) - - # get echo_range values for a single channel - one_chan_er = ds_Sv.echo_range.sel(channel=chan).copy().values - # ping times to fill with NaNs - nan_ping_time_3 = [0, 2, 5, 7] - # range samples to fill with NaNs - nan_range_sample_3 = [slice(0, 2), slice(None), slice(None), slice(0, 3)] - # pad all range_samples of certain ping_times and - # pad some ping_times with nans for a certain range_sample - case_3 = _fill_w_nans(one_chan_er, nan_ping_time_3, nan_range_sample_3) - - return case_1, case_2, case_3 - - -def test_compute_MVBS(): - """Test compute_MVBS on toy data""" - - # Parameters for fake data - nchan, npings, nrange_samples = 4, 100, 4000 - range_meter_bin = 7 # range in meters to average over - 
ping_time_bin = 3 # number of seconds to average over - ping_rate = 2 # Number of pings per second - range_sample_per_meter = 30 # Number of range_samples per meter - - # Useful conversions - ping_num = ( - npings / ping_rate / ping_time_bin - ) # number of pings to average over - range_sample_num = ( - nrange_samples / range_sample_per_meter / range_meter_bin - ) # number of range_samples to average over - total_range = nrange_samples / range_sample_per_meter # total range in meters - - # Construct data with values that increase with range and time - # so that when compute_MVBS is performed, the result is a smaller array - # that increases by a constant for each meter_bin and time_bin - data = _construct_MVBS_toy_data( - nchan=nchan, - npings=npings, - nrange_samples=nrange_samples, - ping_size=ping_rate * ping_time_bin, - range_sample_size=range_sample_per_meter * range_meter_bin, - ) - - data_log = 10 * np.log10(data) # Convert to log domain - chan_index = np.arange(nchan).astype(str) - freq_nom = np.arange(nchan) - # Generate a date range with `npings` number of pings with the frequency of the ping_rate - ping_time = pd.date_range( - '1/1/2020', periods=npings, freq=f'{1/ping_rate}S' +from echopype.commongrid.mvbs import get_MVBS_along_channels +from echopype.consolidate.api import POSITION_VARIABLES +from flox.xarray import xarray_reduce + +@pytest.mark.unit +@pytest.mark.parametrize(["range_var", "lat_lon"], [("depth", False), ("echo_range", True), ("echo_range", False)]) +def test_get_MVBS_along_channels(request, range_var, lat_lon): + """Testing the underlying function of compute_MVBS""" + range_bin = 20 + ping_time_bin = "20S" + method = "map-reduce" + + flox_kwargs = { + "reindex": True + } + + # Retrieve the correct dataset + if range_var == "depth": + ds_Sv = request.getfixturevalue("ds_Sv_echo_range_regular_w_depth") + elif range_var == "echo_range" and lat_lon is True: + ds_Sv = request.getfixturevalue("ds_Sv_echo_range_regular_w_latlon") + else: + ds_Sv = request.getfixturevalue("ds_Sv_echo_range_regular") + + # compute range interval + echo_range_max = ds_Sv[range_var].max() + range_interval = np.arange(0, echo_range_max + range_bin, range_bin) + + # create bin information needed for ping_time + d_index = ( + ds_Sv["ping_time"] + .resample(ping_time=ping_time_bin, skipna=True) + .asfreq() + .indexes["ping_time"] ) - range_sample = np.arange(nrange_samples) - Sv = xr.DataArray( - data_log, - coords=[ - ('channel', chan_index), - ('ping_time', ping_time), - ('range_sample', range_sample), - ], + ping_interval = d_index.union([d_index[-1] + pd.Timedelta(ping_time_bin)]) + + raw_MVBS = get_MVBS_along_channels( + ds_Sv, range_interval, ping_interval, + range_var=range_var, method=method, **flox_kwargs ) - Sv.name = "Sv" - ds_Sv = Sv.to_dataset() - ds_Sv = ds_Sv.assign( - frequency_nominal=xr.DataArray(freq_nom, coords={'channel': chan_index}), - echo_range=xr.DataArray( - np.array( - [[np.linspace(0, total_range, nrange_samples)] * npings] * nchan - ), - coords=Sv.coords, + + # Check that the range_var is in the dimension + assert f"{range_var}_bins" in raw_MVBS.dims + + # When it's echo_range and lat_lon, the dataset should have positions + if range_var == "echo_range" and lat_lon is True: + assert raw_MVBS.attrs["has_positions"] is True + assert all(v in raw_MVBS for v in POSITION_VARIABLES) + + # Compute xarray reduce manually for this + expected_Pos = xarray_reduce( + ds_Sv[POSITION_VARIABLES], + ds_Sv["ping_time"], + func="nanmean", + expected_groups=(ping_interval), + 
isbin=True, + method=method, ) - ) - - # initial test of compute_MVBS - _coll_test_comp_MVBS(ds_Sv, nchan, ping_num, - range_sample_num, ping_time_bin, - total_range, range_meter_bin) - - # TODO: use @pytest.fixture params/ids - # for multiple similar tests using the same set of parameters - # different nan cases for a single channel - case_1, case_2, case_3 = _nan_cases_comp_MVBS(ds_Sv, chan='0') - - # pad all ping_times with nans for a certain range_sample - ds_Sv['echo_range'].loc[{'channel': '0'}] = case_1 - - _coll_test_comp_MVBS(ds_Sv, nchan, ping_num, - range_sample_num, ping_time_bin, - total_range, range_meter_bin) - - # pad all range_samples of certain ping_times - ds_Sv['echo_range'].loc[{'channel': '0'}] = case_2 - - _coll_test_comp_MVBS(ds_Sv, nchan, ping_num, - range_sample_num, ping_time_bin, - total_range, range_meter_bin) - - # pad all range_samples of certain ping_times and - # pad some ping_times with nans for a certain range_sample - ds_Sv['echo_range'].loc[{'channel': '0'}] = case_3 - - _coll_test_comp_MVBS(ds_Sv, nchan, ping_num, - range_sample_num, ping_time_bin, - total_range, range_meter_bin) - - -def test_commongrid_mvbs(test_data_samples): - """ - Test running through from open_raw to compute_MVBS. - """ - ( - filepath, - sonar_model, - azfp_xml_path, - range_kwargs, - ) = test_data_samples - ed = ep.open_raw(filepath, sonar_model, azfp_xml_path) - if ed.sonar_model.lower() == 'azfp': - avg_temperature = ed["Environment"]['temperature'].values.mean() - env_params = { - 'temperature': avg_temperature, - 'salinity': 27.9, - 'pressure': 59, - } - range_kwargs['env_params'] = env_params - if 'azfp_cal_type' in range_kwargs: - range_kwargs.pop('azfp_cal_type') - Sv = ep.calibrate.compute_Sv(ed, **range_kwargs) - assert ep.commongrid.compute_MVBS(Sv) is not None - - -def create_bins(csum_array: np.ndarray) -> Iterable: - """ - Constructs bin ranges based off of a cumulative - sum array. - - Parameters - ---------- - csum_array: np.ndarray - 1D array representing a cumulative sum - - Returns - ------- - bins: list - A list whose elements are the lower and upper bin ranges - """ - - bins = [] - - # construct bins - for count, csum in enumerate(csum_array): - - if count == 0: - - bins.append([0, csum]) - - else: - - # add 0.01 so that left bins don't overlap - bins.append([csum_array[count-1] + 0.01, csum]) - - return bins - - -def create_echo_range_related_data(ping_bins: Iterable, - num_pings_in_bin: np.ndarray, - er_range: list, er_bins: Iterable, - final_num_er_bins: int, - create_dask: bool, - rng: np.random.Generator, - ping_bin_nan_ind: int) -> Tuple[list, list, list]: - """ - Creates ``echo_range`` values and associated bin information. 
- - Parameters - ---------- - ping_bins: list - A list whose elements are the lower and upper ping time bin ranges - num_pings_in_bin: np.ndarray - Specifies the number of pings in each ping time bin - er_range: list - A list whose first element is the lowest and second element is - the highest possible number of echo range values in a given bin - er_bins: list - A list whose elements are the lower and upper echo range bin ranges - final_num_er_bins: int - The total number of echo range bins - create_dask: bool - If True ``final_arrays`` values will be - dask arrays, else they will be numpy arrays - rng: np.random.Generator - The generator for random values - ping_bin_nan_ind: int - The ping bin index to fill with NaNs - - Returns - ------- - all_er_bin_nums: list of np.ndarray - A list whose elements are the number of values in each echo_range - bin, for each ping bin - ping_times_in_bin: list of np.ndarray - A list whose elements are the ping_time values for each corresponding bin - final_arrays: list of np.ndarray or dask.array.Array - A list whose elements are the echo_range values for a given ping and - echo range bin block - """ - - final_arrays = [] - all_er_bin_nums = [] - ping_times_in_bin = [] - - # build echo_range array - for ping_ind, ping_bin in enumerate(ping_bins): - - # create the ping times associated with each ping bin - ping_times_in_bin.append(rng.uniform(ping_bin[0], ping_bin[1], (num_pings_in_bin[ping_ind],))) - - # randomly determine the number of values in each echo_range bin - num_er_in_bin = rng.integers(low=er_range[0], high=er_range[1], size=final_num_er_bins) - - # store the number of values in each echo_range bin - all_er_bin_nums.append(num_er_in_bin) - - er_row_block = [] - for count, bin_val in enumerate(er_bins): - - # create a block of echo_range values - if create_dask: - a = dask.array.random.uniform(bin_val[0], bin_val[1], (num_pings_in_bin[ping_ind], - num_er_in_bin[count])) - else: - a = rng.uniform(bin_val[0], bin_val[1], (num_pings_in_bin[ping_ind], - num_er_in_bin[count])) - - # store the block of echo_range values - er_row_block.append(a) - - # set all echo_range values at ping index to NaN - if ping_ind == ping_bin_nan_ind: - a[:, :] = np.nan - - # collect and construct echo_range row block - final_arrays.append(np.concatenate(er_row_block, axis=1)) - - return all_er_bin_nums, ping_times_in_bin, final_arrays - - -def construct_2d_echo_range_array(final_arrays: Iterable[np.ndarray], - ping_csum: np.ndarray, - create_dask: bool) -> Tuple[Union[np.ndarray, dask.array.Array], int]: - """ - Creates the final 2D ``echo_range`` array with appropriate padding. 
- - Parameters - ---------- - final_arrays: list of np.ndarray - A list whose elements are the echo_range values for a given ping and - echo range bin block - ping_csum: np.ndarray - 1D array representing the cumulative sum for the number of ping times - in each ping bin - create_dask: bool - If True ``final_er`` will be a dask array, else it will be a numpy array - - Returns - ------- - final_er: np.ndarray or dask.array.Array - The final 2D ``echo_range`` array - max_num_er_elem: int - The maximum number of ``echo_range`` elements amongst all times - """ - - # get maximum number of echo_range elements amongst all times - max_num_er_elem = max([arr.shape[1] for arr in final_arrays]) - - # total number of ping times - tot_num_times = ping_csum[-1] - - # pad echo_range dimension with nans and create final echo_range - if create_dask: - final_er = dask.array.ones(shape=(tot_num_times, max_num_er_elem)) * np.nan + + for v in POSITION_VARIABLES: + assert np.array_equal(raw_MVBS[v].data, expected_Pos[v].data) else: - final_er = np.empty((tot_num_times, max_num_er_elem)) - final_er[:] = np.nan - - for count, arr in enumerate(final_arrays): - - if count == 0: - final_er[0:ping_csum[count], 0:arr.shape[1]] = arr - else: - final_er[ping_csum[count - 1]:ping_csum[count], 0:arr.shape[1]] = arr - - return final_er, max_num_er_elem - - -def construct_2d_sv_array(max_num_er_elem: int, ping_csum: np.ndarray, - all_er_bin_nums: Iterable[np.ndarray], - num_pings_in_bin: np.ndarray, - create_dask: bool, - ping_bin_nan_ind: int) -> Tuple[Union[np.ndarray, dask.array.Array], - np.ndarray]: - """ - Creates the final 2D Sv array with appropriate padding. - - Parameters - ---------- - max_num_er_elem: int - The maximum number of ``echo_range`` elements amongst all times - ping_csum: np.ndarray - 1D array representing the cumulative sum for the number of ping times - in each ping bin - all_er_bin_nums: list of np.ndarray - A list whose elements are the number of values in each echo_range - bin, for each ping bin - num_pings_in_bin: np.ndarray - Specifies the number of pings in each ping time bin - create_dask: bool - If True ``final_sv`` will be a dask array, else it will be a numpy array - ping_bin_nan_ind: int - The ping bin index to fill with NaNs - - Returns - ------- - final_sv: np.ndarray or dask.array.Array - The final 2D Sv array - final_MVBS: np.ndarray - The final 2D known MVBS array - """ - - # total number of ping times - tot_num_times = ping_csum[-1] - - # pad echo_range dimension with nans and create final sv - if create_dask: - final_sv = dask.array.ones(shape=(tot_num_times, max_num_er_elem)) * np.nan - else: - final_sv = np.empty((tot_num_times, max_num_er_elem)) - final_sv[:] = np.nan - - final_means = [] - for count, arr in enumerate(all_er_bin_nums): - - # create sv row values using natural numbers - sv_row_list = [np.arange(1, num_elem + 1, 1, dtype=np.float64) for num_elem in arr] - - # create final sv row - sv_row = np.concatenate(sv_row_list) - - # get final mean which is n+1/2 (since we are using natural numbers) - ping_mean = [(len(elem) + 1) / 2.0 for elem in sv_row_list] - - # create sv row block - sv_row_block = np.tile(sv_row, (num_pings_in_bin[count], 1)) - - if count == ping_bin_nan_ind: - - # fill values with NaNs - ping_mean = [np.nan]*len(sv_row_list) - sv_row_block[:, :] = np.nan - - # store means for ping - final_means.append(ping_mean) - - if count == 0: - final_sv[0:ping_csum[count], 0:sv_row_block.shape[1]] = sv_row_block - else: - final_sv[ping_csum[count - 
1]:ping_csum[count], 0:sv_row_block.shape[1]] = sv_row_block - - # create final sv MVBS - final_MVBS = np.vstack(final_means) - - return final_sv, final_MVBS - - -def create_known_mean_data(final_num_ping_bins: int, - final_num_er_bins: int, - ping_range: list, - er_range: list, create_dask: bool, - rng: np.random.Generator) -> Tuple[np.ndarray, np.ndarray, Iterable, - Iterable, np.ndarray, np.ndarray]: - """ - Orchestrates the creation of ``echo_range``, ``ping_time``, and ``Sv`` arrays - where the MVBS is known. - - Parameters - ---------- - final_num_ping_bins: int - The total number of ping time bins - final_num_er_bins: int - The total number of echo range bins - ping_range: list - A list whose first element is the lowest and second element is - the highest possible number of ping time values in a given bin - er_range: list - A list whose first element is the lowest and second element is - the highest possible number of echo range values in a given bin - create_dask: bool - If True the ``Sv`` and ``echo_range`` values produced will be - dask arrays, else they will be numpy arrays. - rng: np.random.Generator - generator for random integers - - Returns - ------- - final_MVBS: np.ndarray - The final 2D known MVBS array - final_sv: np.ndarray - The final 2D Sv array - ping_bins: Iterable - A list whose elements are the lower and upper ping time bin ranges - er_bins: Iterable - A list whose elements are the lower and upper echo range bin ranges - final_er: np.ndarray - The final 2D ``echo_range`` array - final_ping_time: np.ndarray - The final 1D ``ping_time`` array - """ - - # randomly generate the number of pings in each ping bin - num_pings_in_bin = rng.integers(low=ping_range[0], high=ping_range[1], size=final_num_ping_bins) - - # create bins for ping_time dimension - ping_csum = np.cumsum(num_pings_in_bin) - ping_bins = create_bins(ping_csum) - - # create bins for echo_range dimension - num_er_in_bin = rng.integers(low=er_range[0], high=er_range[1], size=final_num_er_bins) - er_csum = np.cumsum(num_er_in_bin) - er_bins = create_bins(er_csum) - - # randomly select one ping bin to fill with NaNs - ping_bin_nan_ind = rng.choice(len(ping_bins)) - - # create the echo_range data and associated bin information - all_er_bin_nums, ping_times_in_bin, final_er_arrays = create_echo_range_related_data(ping_bins, num_pings_in_bin, - er_range, er_bins, - final_num_er_bins, - create_dask, - rng, - ping_bin_nan_ind) - - # create the final echo_range array using created data and padding - final_er, max_num_er_elem = construct_2d_echo_range_array(final_er_arrays, ping_csum, create_dask) - - # get final ping_time dimension - final_ping_time = np.concatenate(ping_times_in_bin).astype('datetime64[ns]') - - # create the final sv array - final_sv, final_MVBS = construct_2d_sv_array(max_num_er_elem, ping_csum, - all_er_bin_nums, num_pings_in_bin, - create_dask, ping_bin_nan_ind) - - return final_MVBS, final_sv, ping_bins, er_bins, final_er, final_ping_time - - -@pytest.fixture( - params=[ - { - "create_dask": True, - "final_num_ping_bins": 10, - "final_num_er_bins": 10, - "ping_range": [10, 1000], - "er_range": [10, 1000] - }, - { - "create_dask": False, - "final_num_ping_bins": 10, - "final_num_er_bins": 10, - "ping_range": [10, 1000], - "er_range": [10, 1000] - }, - ], - ids=[ - "delayed_data", - "in_memory_data" - ], -) -def bin_and_mean_2d_params(request): - """ - Obtains all necessary parameters for ``test_bin_and_mean_2d``. 
- """ - - return list(request.param.values()) - - -def test_bin_and_mean_2d(bin_and_mean_2d_params) -> None: - """ - Tests the function ``bin_and_mean_2d``, which is the core - method for ``compute_MVBS``. This is done by creating mock - data (which can have varying number of ``echo_range`` values - for each ``ping_time``) with known means. - - Parameters - ---------- - create_dask: bool - If True the ``Sv`` and ``echo_range`` values produced will be - dask arrays, else they will be numpy arrays. - final_num_ping_bins: int - The total number of ping time bins - final_num_er_bins: int - The total number of echo range bins - ping_range: list - A list whose first element is the lowest and second element is - the highest possible number of ping time values in a given bin - er_range: list - A list whose first element is the lowest and second element is - the highest possible number of echo range values in a given bin - """ - - # get all parameters needed to create the mock data - create_dask, final_num_ping_bins, final_num_er_bins, ping_range, er_range = bin_and_mean_2d_params - - # randomly generate a seed - seed = np.random.randint(low=10, high=100000, size=1)[0] - - print(f"seed used to generate mock data: {seed}") - - # establish generator for random integers - rng = default_rng(seed=seed) - - # seed dask random generator - if create_dask: - dask.array.random.seed(seed=seed) - - # create echo_range, ping_time, and Sv arrays where the MVBS is known - known_MVBS, final_sv, ping_bins, er_bins, final_er, final_ping_time = create_known_mean_data(final_num_ping_bins, - final_num_er_bins, - ping_range, er_range, - create_dask, - rng) - - # put the created ping bins into a form that works with bin_and_mean_2d - digitize_ping_bin = np.array([*ping_bins[0]] + [bin_val[1] for bin_val in ping_bins[1:-1]]) - digitize_ping_bin = digitize_ping_bin.astype('datetime64[ns]') - - # put the created echo range bins into a form that works with bin_and_mean_2d - digitize_er_bin = np.array([*er_bins[0]] + [bin_val[1] for bin_val in er_bins[1:]]) - - # calculate MVBS for mock data set - calc_MVBS = bin_and_mean_2d(arr=final_sv, bins_time=digitize_ping_bin, - bins_er=digitize_er_bin, times=final_ping_time, - echo_range=final_er, comprehensive_er_check=True) - - # compare known MVBS solution against its calculated counterpart - assert np.allclose(calc_MVBS, known_MVBS, atol=1e-10, rtol=1e-10, equal_nan=True) + assert raw_MVBS.attrs["has_positions"] is False diff --git a/echopype/tests/conftest.py b/echopype/tests/conftest.py index 26e56fca3..0e22135f6 100644 --- a/echopype/tests/conftest.py +++ b/echopype/tests/conftest.py @@ -1,6 +1,7 @@ """``pytest`` configuration.""" from ftplib import FTP + import os import subprocess import pytest @@ -85,7 +86,6 @@ def minio_bucket(): secret="minioadmin", ) - @pytest.fixture(scope="session") def setup_test_data_jr230(): file_name = "JR230-D20091215-T121917.raw" @@ -104,7 +104,6 @@ def setup_test_data_jr179(): return _setup_file(file_name) -# Separate Sv dataset fixtures for each file @pytest.fixture(scope="session") def sv_dataset_jr230(setup_test_data_jr230) -> xr.DataArray: return _get_sv_dataset(setup_test_data_jr230) @@ -130,3 +129,5 @@ def complete_dataset_jr179(setup_test_data_jr179): def raw_dataset_jr179(setup_test_data_jr179): ed = _get_raw_dataset(setup_test_data_jr179) return ed + + diff --git a/echopype/tests/convert/test_convert_azfp.py b/echopype/tests/convert/test_convert_azfp.py index 075fde0fb..4b8f411b6 100644 --- a/echopype/tests/convert/test_convert_azfp.py 
+++ b/echopype/tests/convert/test_convert_azfp.py @@ -10,12 +10,14 @@ from scipy.io import loadmat from echopype import open_raw import pytest +from echopype.convert.parse_azfp import ParseAZFP @pytest.fixture def azfp_path(test_path): return test_path["AZFP"] + def check_platform_required_scalar_vars(echodata): # check convention-required variables in the Platform group for var in [ @@ -172,3 +174,53 @@ def test_convert_azfp_01a_notemperature_notilt(azfp_path): assert "tilt_y" in echodata["Platform"] assert echodata["Platform"]["tilt_x"].isnull().all() assert echodata["Platform"]["tilt_y"].isnull().all() + + +def test_load_parse_azfp_xml(azfp_path): + + azfp_01a_path = azfp_path / '17082117.01A' + azfp_xml_path = azfp_path / '17030815.XML' + parseAZFP = ParseAZFP(str(azfp_01a_path), str(azfp_xml_path)) + parseAZFP.load_AZFP_xml() + expected_params = ['instrument_type_string', 'instrument_type', 'major', 'minor', 'date', + 'program_name', 'program', 'CPU', 'serial_number', 'board_version', + 'file_version', 'parameter_version', 'configuration_version', 'eclock', + 'digital_board_version', 'sensors_flag_pressure_sensor_installed', + 'sensors_flag_paros_installed', 'sensors_flag', 'U0', 'Y1', 'Y2', 'Y3', + 'C1', 'C2', 'C3', 'D1', 'D2', 'T1', 'T2', 'T3', 'T4', 'T5', 'X_a', 'X_b', + 'X_c', 'X_d', 'Y_a', 'Y_b', 'Y_c', 'Y_d', 'period', 'ppm_offset', + 'calibration', 'a0', 'a1', 'a2', 'a3', 'ka', 'kb', 'kc', 'A', 'B', 'C', + 'num_freq', 'kHz_units', 'kHz', 'TVR', 'num_vtx', 'VTX0', 'VTX1', 'VTX2', + 'VTX3', 'BP', 'EL', 'DS', 'min_pulse_len', 'sound_speed', + 'start_date_svalue', 'start_date', 'num_frequencies', 'num_phases', + 'data_output_svalue', 'data_output', 'frequency_units', 'frequency', + 'phase_number', 'phase_type_svalue', 'phase_type', 'duration_svalue', + 'duration', 'ping_period_units', 'ping_period', 'burst_interval_units', + 'burst_interval', 'pings_per_burst_units', 'pings_per_burst', + 'average_burst_pings_units', 'average_burst_pings', 'frequency_number', + 'acquire_frequency_units', 'acquire_frequency', 'pulse_len_units', + 'pulse_len', 'dig_rate_units', 'dig_rate', 'range_samples_units', + 'range_samples', 'range_averaging_samples_units', 'range_averaging_samples', + 'lock_out_index_units', 'lock_out_index', 'gain_units', 'gain', + 'storage_format_units', 'storage_format'] + assert set(parseAZFP.parameters.keys()) == set(expected_params) + assert list(set(parseAZFP.parameters['instrument_type_string']))[0] == 'AZFP' + assert isinstance(parseAZFP.parameters['num_freq'], int) + assert isinstance(parseAZFP.parameters['pulse_len'], list) + assert parseAZFP.parameters['num_freq'] == 4 + assert len(parseAZFP.parameters['frequency_number']) == 4 + assert parseAZFP.parameters['frequency_number'] == ['1', '2', '3', '4'] + assert parseAZFP.parameters['kHz'] == [125, 200, 455, 769] + + expected_len_params = ['acquire_frequency', 'pulse_len', 'dig_rate', 'range_samples', + 'range_averaging_samples', 'lock_out_index', 'gain', 'storage_format'] + assert all(len(parseAZFP.parameters[x]) == 4 for x in expected_len_params) + assert parseAZFP.parameters['acquire_frequency'] == [1, 1, 1, 1] + assert parseAZFP.parameters['pulse_len'] == [300, 300, 300, 300] + assert parseAZFP.parameters['dig_rate'] == [20000, 20000, 20000, 20000] + assert parseAZFP.parameters['range_samples'] == [1752, 1752, 1764, 540] + assert parseAZFP.parameters['range_averaging_samples'] == [4, 4, 4, 4] + assert parseAZFP.parameters['lock_out_index'] == [0, 0, 0, 0] + assert parseAZFP.parameters['gain'] == [1, 1, 1, 1] + 
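The snake_case parameter names asserted in this test come from converting the CamelCase tags of the AZFP XML with the `camelcase2snakecase` helper added to `echopype/utils/misc.py` further down in this patch. A rough sketch of that mapping, using hypothetical tag names that may not match the real XML exactly:

from echopype.utils.misc import camelcase2snakecase

camelcase2snakecase("RangeSamples")           # -> "range_samples"
camelcase2snakecase("RangeAveragingSamples")  # -> "range_averaging_samples"
camelcase2snakecase("LockOutIndex")           # -> "lock_out_index"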
assert parseAZFP.parameters['storage_format'] == [1, 1, 1, 1] + diff --git a/echopype/tests/convert/test_convert_source_target_locs.py b/echopype/tests/convert/test_convert_source_target_locs.py index 66cbbfe40..6b1af93e0 100644 --- a/echopype/tests/convert/test_convert_source_target_locs.py +++ b/echopype/tests/convert/test_convert_source_target_locs.py @@ -236,7 +236,7 @@ def test_convert_time_encodings(sonar_model, raw_file, xml_path, test_path): xml_path = str(test_path[path_model].joinpath(*xml_path).absolute()) ed = open_raw( - sonar_model=sonar_model, raw_file=raw_file, xml_path=xml_path + sonar_model=sonar_model, raw_file=raw_file, xml_path=xml_path, destination_path="no_swap" ) ed.to_netcdf(overwrite=True) for group, details in ed.group_map.items(): @@ -297,6 +297,7 @@ def test_convert_ek( raw_file=ipath, sonar_model=sonar_model, storage_options=input_storage_options, + destination_path="no_swap" ) if ( @@ -370,6 +371,7 @@ def test_convert_azfp( xml_path=azfp_xml_paths, sonar_model=model, storage_options=input_storage_options, + destination_path="no_swap" ) assert echodata.xml_path == azfp_xml_paths diff --git a/echopype/tests/convert/test_parsed_to_zarr.py b/echopype/tests/convert/test_parsed_to_zarr.py index 8ae96c5e7..9b836753c 100644 --- a/echopype/tests/convert/test_parsed_to_zarr.py +++ b/echopype/tests/convert/test_parsed_to_zarr.py @@ -1,18 +1,95 @@ import pytest import xarray as xr -from typing import List, Tuple +import pandas as pd +from typing import List, Optional, Tuple from echopype import open_raw -from pathlib import Path +import shutil +from zarr.hierarchy import Group as ZGroup import os.path +from fsspec import FSMap +from s3fs import S3FileSystem +import requests +import time +from echopype.convert.parsed_to_zarr import Parsed2Zarr, DEFAULT_ZARR_TEMP_DIR +from echopype.convert.parsed_to_zarr_ek60 import Parsed2ZarrEK60 +from echopype.echodata.convention import sonarnetcdf_1 +from echopype.convert.api import _check_file, SONAR_MODELS + +test_bucket_name = "echopype-test" +port = 5555 +endpoint_uri = "http://127.0.0.1:%s/" % port + + +@pytest.fixture() +def s3_base(): + # writable local S3 system + import shlex + import subprocess + + try: + # should fail since we didn't start server yet + r = requests.get(endpoint_uri) + except: # noqa + pass + else: + if r.ok: + raise RuntimeError("moto server already up") + if "AWS_SECRET_ACCESS_KEY" not in os.environ: + os.environ["AWS_SECRET_ACCESS_KEY"] = "foo" + if "AWS_ACCESS_KEY_ID" not in os.environ: + os.environ["AWS_ACCESS_KEY_ID"] = "foo" + proc = subprocess.Popen( + shlex.split("moto_server s3 -p %s" % port), + stderr=subprocess.DEVNULL, + stdout=subprocess.DEVNULL, + stdin=subprocess.DEVNULL, + ) + + timeout = 5 + while timeout > 0: + try: + print("polling for moto server") + r = requests.get(endpoint_uri) + if r.ok: + break + except: # noqa + pass + timeout -= 0.1 + time.sleep(0.1) + print("server up") + yield + print("moto done") + proc.terminate() + proc.wait() + + +def get_boto3_client(): + from botocore.session import Session + + # NB: we use the sync botocore client for setup + session = Session() + return session.create_client("s3", endpoint_url=endpoint_uri) + + +@pytest.fixture() +def s3(s3_base): + client = get_boto3_client() + client.create_bucket(Bucket=test_bucket_name, ACL="public-read") + + S3FileSystem.clear_instance_cache() + s3 = S3FileSystem(anon=False, client_kwargs={"endpoint_url": endpoint_uri}) + s3.invalidate_cache() + yield s3 @pytest.fixture def ek60_path(test_path): - return 
test_path['EK60'] + return test_path["EK60"] -def compare_zarr_vars(ed_zarr: xr.Dataset, ed_no_zarr: xr.Dataset, - var_to_comp: List[str], ed_path) -> Tuple[xr.Dataset, xr.Dataset]: +def compare_zarr_vars( + ed_zarr: xr.Dataset, ed_no_zarr: xr.Dataset, var_to_comp: List[str], ed_path +) -> Tuple[xr.Dataset, xr.Dataset]: """ Compares the dask variables in ``ed_zarr`` against their counterparts in ``ed_no_zarr`` by computing the dask results @@ -41,9 +118,7 @@ def compare_zarr_vars(ed_zarr: xr.Dataset, ed_no_zarr: xr.Dataset, """ for var in var_to_comp: - for chan in ed_zarr[ed_path][var].channel: - # here we compute to make sure values are being compared, rather than just shapes var_zarr = ed_zarr[ed_path][var].sel(channel=chan).compute() var_no_zarr = ed_no_zarr[ed_path][var].sel(channel=chan) @@ -56,13 +131,13 @@ def compare_zarr_vars(ed_zarr: xr.Dataset, ed_no_zarr: xr.Dataset, @pytest.mark.parametrize( - ["raw_file", "sonar_model", "use_swap"], + ["raw_file", "sonar_model", "destination_path"], [ - ("L0003-D20040909-T161906-EK60.raw", "EK60", True), + ("L0003-D20040909-T161906-EK60.raw", "EK60", "swap"), pytest.param( "L0003-D20040909-T161906-EK60.raw", "EK60", - False, + "no_swap", marks=pytest.mark.xfail( run=False, reason="Expected out of memory error. See https://github.com/OSOceanAcoustics/echopype/issues/489", @@ -71,18 +146,17 @@ def compare_zarr_vars(ed_zarr: xr.Dataset, ed_no_zarr: xr.Dataset, ], ids=["noaa_offloaded", "noaa_not_offloaded"], ) -def test_raw2zarr(raw_file, sonar_model, use_swap, ek60_path): +def test_raw2zarr(raw_file, sonar_model, destination_path, ek60_path): """Tests for memory expansion relief""" import os from tempfile import TemporaryDirectory from echopype.echodata.echodata import EchoData - name = os.path.basename(raw_file).replace('.raw', '') - fname = f"{name}__{use_swap}.zarr" + + name = os.path.basename(raw_file).replace(".raw", "") + fname = f"{name}__{destination_path}.zarr" file_path = ek60_path / raw_file echodata = open_raw( - raw_file=file_path, - sonar_model=sonar_model, - use_swap=use_swap + raw_file=file_path, sonar_model=sonar_model, destination_path=destination_path ) # Most likely succeed if it doesn't crash assert isinstance(echodata, EchoData) @@ -92,14 +166,13 @@ def test_raw2zarr(raw_file, sonar_model, use_swap, ek60_path): # If it goes all the way to here it is most likely successful assert os.path.exists(output_save_path) - if use_swap: - # create a copy of zarr_file_name. 
The join is necessary so that we are not referencing zarr_file_name - temp_zarr_path = ''.join(echodata.parsed2zarr_obj.zarr_file_name) + if echodata.parsed2zarr_obj.store is not None: + temp_zarr_path = echodata.parsed2zarr_obj.store del echodata # make sure that the temporary zarr was deleted - assert Path(temp_zarr_path).exists() is False + assert temp_zarr_path.fs.exists(temp_zarr_path.root) is False @pytest.mark.parametrize( @@ -107,16 +180,23 @@ def test_raw2zarr(raw_file, sonar_model, use_swap, ek60_path): [ ("EK60", os.path.join("ncei-wcsd", "Summer2017-D20170615-T190214.raw"), "EK60"), ("EK60", "DY1002_EK60-D20100318-T023008_rep_freq.raw", "EK60"), - ("EK80", "Summer2018--D20180905-T033113.raw", "EK80"), + ("EK80", "Summer2018--D20180905-T033113.raw", "EK80"), ("EK80_CAL", "2018115-D20181213-T094600.raw", "EK80"), - ("EK80", "Green2.Survey2.FM.short.slow.-D20191004-T211557.raw", "EK80"), - ("EK80", "2019118 group2survey-D20191214-T081342.raw", "EK80"), + ("EK80", "Green2.Survey2.FM.short.slow.-D20191004-T211557.raw", "EK80"), + ("EK80", "2019118 group2survey-D20191214-T081342.raw", "EK80"), + ], + ids=[ + "ek60_summer_2017", + "ek60_rep_freq", + "ek80_summer_2018", + "ek80_bb_w_cal", + "ek80_short_slow", + "ek80_grp_2_survey", ], - ids=["ek60_summer_2017", "ek60_rep_freq", "ek80_summer_2018", - "ek80_bb_w_cal", "ek80_short_slow", "ek80_grp_2_survey"], ) -def test_direct_to_zarr_integration(path_model: str, raw_file: str, - sonar_model: str, test_path: dict) -> None: +def test_direct_to_zarr_integration( + path_model: str, raw_file: str, sonar_model: str, test_path: dict +) -> None: """ Integration Test that ensure writing variables directly to a temporary zarr store and then assigning @@ -144,11 +224,10 @@ def test_direct_to_zarr_integration(path_model: str, raw_file: str, raw_file_path = test_path[path_model] / raw_file - ed_zarr = open_raw(raw_file_path, sonar_model=sonar_model, use_swap=True, max_mb=100) - ed_no_zarr = open_raw(raw_file_path, sonar_model=sonar_model, use_swap=False) + ed_zarr = open_raw(raw_file_path, sonar_model=sonar_model, max_mb=100, destination_path="swap") + ed_no_zarr = open_raw(raw_file_path, sonar_model=sonar_model) for grp in ed_zarr.group_paths: - # remove conversion time so we can do a direct comparison if "conversion_time" in ed_zarr[grp].attrs: del ed_zarr[grp].attrs["conversion_time"] @@ -156,24 +235,193 @@ def test_direct_to_zarr_integration(path_model: str, raw_file: str, # Compare angle, power, complex, if zarr drop the zarr variables and compare datasets if grp == "Sonar/Beam_group2": - var_to_comp = ['angle_athwartship', 'angle_alongship', 'backscatter_r'] + var_to_comp = ["angle_athwartship", "angle_alongship", "backscatter_r"] ed_zarr, ed_no_zarr = compare_zarr_vars(ed_zarr, ed_no_zarr, var_to_comp, grp) if grp == "Sonar/Beam_group1": - - if 'backscatter_i' in ed_zarr[grp]: - var_to_comp = ['backscatter_r', 'backscatter_i'] + if "backscatter_i" in ed_zarr[grp]: + var_to_comp = ["backscatter_r", "backscatter_i"] else: - var_to_comp = ['angle_athwartship', 'angle_alongship', 'backscatter_r'] + var_to_comp = ["angle_athwartship", "angle_alongship", "backscatter_r"] ed_zarr, ed_no_zarr = compare_zarr_vars(ed_zarr, ed_no_zarr, var_to_comp, grp) - assert ed_zarr[grp].identical(ed_no_zarr[grp]) + assert ed_zarr[grp] is not None - # create a copy of zarr_file_name. 
The join is necessary so that we are not referencing zarr_file_name - temp_zarr_path = ''.join(ed_zarr.parsed2zarr_obj.zarr_file_name) + if ed_zarr.parsed2zarr_obj.store is not None: + temp_zarr_path = ed_zarr.parsed2zarr_obj.store - del ed_zarr + del ed_zarr - # make sure that the temporary zarr was deleted - assert Path(temp_zarr_path).exists() is False + # make sure that the temporary zarr was deleted + assert temp_zarr_path.fs.exists(temp_zarr_path.root) is False + + +class TestParsed2Zarr: + sample_file = "L0003-D20040909-T161906-EK60.raw" + sonar_model = "EK60" + xml_path = None + convert_params = None + storage_options = {} + max_mb = 100 + ek60_expected_shapes = { + "angle_alongship": (9923, 3, 10417), + "angle_athwartship": (9923, 3, 10417), + "channel": (3,), + "timestamp": (9923,), + "power": (9923, 3, 10417), + } + + @pytest.fixture(scope="class") + def ek60_parsed2zarr_obj(self, ek60_parser_obj): + return Parsed2ZarrEK60(ek60_parser_obj) + + @pytest.fixture(scope="class") + def ek60_parsed2zarr_obj_w_df(self, ek60_parsed2zarr_obj): + ek60_parsed2zarr_obj._create_zarr_info() + ek60_parsed2zarr_obj.datagram_df = pd.DataFrame.from_dict( + ek60_parsed2zarr_obj.parser_obj.zarr_datagrams + ) + # convert channel column to a string + ek60_parsed2zarr_obj.datagram_df["channel"] = ek60_parsed2zarr_obj.datagram_df["channel"].astype(str) + yield ek60_parsed2zarr_obj + + def _get_storage_options(self, dest_path: Optional[str]) -> Optional[dict]: + """Retrieve storage options for destination path""" + dest_storage_options = None + if dest_path is not None and "s3://" in dest_path: + dest_storage_options = {"anon": False, "client_kwargs": {"endpoint_url": endpoint_uri}} + return dest_storage_options + + @pytest.fixture(scope="class") + def ek60_parser_obj(self, test_path): + folder_path = test_path[self.sonar_model] + raw_file = str(folder_path / self.sample_file) + + file_chk, xml_chk = _check_file( + raw_file, self.sonar_model, self.xml_path, self.storage_options + ) + + if SONAR_MODELS[self.sonar_model]["xml"]: + params = xml_chk + else: + params = "ALL" + + # obtain dict associated with directly writing to zarr + dgram_zarr_vars = SONAR_MODELS[self.sonar_model]["dgram_zarr_vars"] + + # Parse raw file and organize data into groups + parser = SONAR_MODELS[self.sonar_model]["parser"]( + file_chk, + params=params, + storage_options=self.storage_options, + dgram_zarr_vars=dgram_zarr_vars, + ) + + # Parse the data + parser.parse_raw() + return parser + + @pytest.mark.parametrize( + ["sonar_model", "p2z_class"], + [ + (None, Parsed2Zarr), + ("EK60", Parsed2ZarrEK60), + ], + ) + def test_constructor(self, sonar_model, p2z_class, ek60_parser_obj): + if sonar_model is None: + p2z = p2z_class(None) + assert p2z.parser_obj is None + assert p2z.temp_zarr_dir is None + assert p2z.zarr_file_name is None + assert p2z.store is None + assert p2z.zarr_root is None + assert p2z._varattrs == sonarnetcdf_1.yaml_dict["variable_and_varattributes"] + else: + p2z = p2z_class(ek60_parser_obj) + assert isinstance(p2z.parser_obj, SONAR_MODELS[self.sonar_model]["parser"]) + assert p2z.sonar_model == self.sonar_model + + @pytest.mark.parametrize("dest_path", [None, "./", f"s3://{test_bucket_name}/my-dir/"]) + def test__create_zarr_info(self, ek60_parsed2zarr_obj, dest_path, s3): + dest_storage_options = self._get_storage_options(dest_path) + + ek60_parsed2zarr_obj._create_zarr_info(dest_path, dest_storage_options=dest_storage_options) + + zarr_store = ek60_parsed2zarr_obj.store + zarr_root = 
ek60_parsed2zarr_obj.zarr_root + + assert isinstance(zarr_store, FSMap) + assert isinstance(zarr_root, ZGroup) + assert zarr_store.root.endswith(".zarr") + + if dest_path is None: + assert os.path.dirname(zarr_store.root) == str(DEFAULT_ZARR_TEMP_DIR) + assert ek60_parsed2zarr_obj.temp_zarr_dir == str(DEFAULT_ZARR_TEMP_DIR) + elif "s3://" not in dest_path: + shutil.rmtree(zarr_store.root) + + def test__close_store(self, ek60_parsed2zarr_obj): + ek60_parsed2zarr_obj._create_zarr_info() + + zarr_store = ek60_parsed2zarr_obj.store + + # Initially metadata should not exist + assert not zarr_store.fs.exists(zarr_store.root + "/.zmetadata") + + # Close store will consolidate metadata + ek60_parsed2zarr_obj._close_store() + + # Now metadata should exist + assert zarr_store.fs.exists(zarr_store.root + "/.zmetadata") + + def test__write_power(self, ek60_parsed2zarr_obj_w_df): + # There shouldn't be any group here + assert "power" not in ek60_parsed2zarr_obj_w_df.zarr_root + + ek60_parsed2zarr_obj_w_df._write_power( + df=ek60_parsed2zarr_obj_w_df.datagram_df, + max_mb=self.max_mb + ) + + # There should now be power group + assert "power" in ek60_parsed2zarr_obj_w_df.zarr_root + + for k, arr in ek60_parsed2zarr_obj_w_df.zarr_root["/power"].arrays(): + assert arr.shape == self.ek60_expected_shapes[k] + + def test__write_angle(self, ek60_parsed2zarr_obj_w_df): + # There shouldn't be any group here + assert "angle" not in ek60_parsed2zarr_obj_w_df.zarr_root + + ek60_parsed2zarr_obj_w_df._write_angle( + df=ek60_parsed2zarr_obj_w_df.datagram_df, + max_mb=self.max_mb + ) + # There should now be angle group + assert "angle" in ek60_parsed2zarr_obj_w_df.zarr_root + + for k, arr in ek60_parsed2zarr_obj_w_df.zarr_root["/angle"].arrays(): + assert arr.shape == self.ek60_expected_shapes[k] + + def test_power_dataarray(self, ek60_parsed2zarr_obj_w_df): + power_dataarray = ek60_parsed2zarr_obj_w_df.power_dataarray + assert isinstance(power_dataarray, xr.DataArray) + + assert power_dataarray.name == "backscatter_r" + assert power_dataarray.dims == ("ping_time", "channel", "range_sample") + assert power_dataarray.shape == self.ek60_expected_shapes["power"] + + def test_angle_dataarrays(self, ek60_parsed2zarr_obj_w_df): + angle_athwartship, angle_alongship = ek60_parsed2zarr_obj_w_df.angle_dataarrays + assert isinstance(angle_athwartship, xr.DataArray) + assert isinstance(angle_alongship, xr.DataArray) + + assert angle_alongship.name == "angle_alongship" + assert angle_alongship.dims == ("ping_time", "channel", "range_sample") + assert angle_alongship.shape == self.ek60_expected_shapes["angle_alongship"] + + assert angle_athwartship.name == "angle_athwartship" + assert angle_athwartship.dims == ("ping_time", "channel", "range_sample") + assert angle_athwartship.shape == self.ek60_expected_shapes["angle_athwartship"] diff --git a/echopype/tests/echodata/utils.py b/echopype/tests/echodata/utils.py index 011dafeeb..31bcf93ce 100644 --- a/echopype/tests/echodata/utils.py +++ b/echopype/tests/echodata/utils.py @@ -10,6 +10,10 @@ from echopype.convert.set_groups_base import SetGroupsBase from echopype.echodata.echodata import EchoData +from echopype.echodata.convention import sonarnetcdf_1 + +class P2Z: + _varattrs = sonarnetcdf_1.yaml_dict["variable_and_varattributes"] class SetGroupsTest(SetGroupsBase): @@ -95,6 +99,7 @@ def get_mock_echodata( output_path=None, sonar_model=sonar_model, params={"survey_name": "mock_survey"}, + parsed2zarr_obj=P2Z(), ) tree_dict["/"] = setgrouper.set_toplevel( sonar_model, 
diff --git a/echopype/tests/utils/test_processinglevels_integration.py b/echopype/tests/utils/test_processinglevels_integration.py
index 0dadbfc87..10c81c9eb 100644
--- a/echopype/tests/utils/test_processinglevels_integration.py
+++ b/echopype/tests/utils/test_processinglevels_integration.py
@@ -127,8 +127,6 @@ def _freqdiff_applymask(test_ds):

     # ---- Compute MVBS
     # compute_MVBS expects a variable named "Sv"
-    # No product level is assigned because at present compute_MVBS drops the lat/lon data
-    # associated with the input Sv dataset
     # ds = ds.rename_vars(name_dict={"Sv": "Sv_unmasked", "Sv_ch0": "Sv"})
-    mvbs_ds = ep.commongrid.compute_MVBS(ds, range_meter_bin=30, ping_time_bin='1min')
-    _absence_test(mvbs_ds)
+    mvbs_ds = ep.commongrid.compute_MVBS(ds, range_bin="30m", ping_time_bin='1min')
+    _presence_test(mvbs_ds, "Level 3B")
diff --git a/echopype/utils/compute.py b/echopype/utils/compute.py
new file mode 100644
index 000000000..936a1187d
--- /dev/null
+++ b/echopype/utils/compute.py
@@ -0,0 +1,41 @@
+"""compute.py
+
+Module containing various helper functions
+for performing computations within echopype.
+"""
+from typing import Union
+
+import dask.array
+import numpy as np
+
+
+def _log2lin(data: Union[dask.array.Array, np.ndarray]) -> Union[dask.array.Array, np.ndarray]:
+    """Perform log to linear transform on data
+
+    Parameters
+    ----------
+    data : dask.array.Array or np.ndarray
+        The data to be transformed
+
+    Returns
+    -------
+    dask.array.Array or np.ndarray
+        The transformed data
+    """
+    return 10 ** (data / 10)
+
+
+def _lin2log(data: Union[dask.array.Array, np.ndarray]) -> Union[dask.array.Array, np.ndarray]:
+    """Perform linear to log transform on data
+
+    Parameters
+    ----------
+    data : dask.array.Array or np.ndarray
+        The data to be transformed
+
+    Returns
+    -------
+    dask.array.Array or np.ndarray
+        The transformed data
+    """
+    return 10 * np.log10(data)
diff --git a/echopype/utils/io.py b/echopype/utils/io.py
index 3b93ee91e..2350eef35 100644
--- a/echopype/utils/io.py
+++ b/echopype/utils/io.py
@@ -422,3 +422,21 @@ def validate_source_ds_da(
         check_file_existence(file_path=source_ds_da, storage_options=storage_options)

     return source_ds_da, file_type
+
+
+def get_dataset(source: Union[xr.Dataset, str, pathlib.Path]):
+    """
+    Given an input that can be either a dataset or a path to it,
+    returns the dataset
+    """
+    if isinstance(source, xr.Dataset):
+        return source
+    else:
+        validated_source, file_format = validate_source_ds_da(source)
+        open_map = {
+            "netcdf4": xr.open_dataset,
+            "zarr": xr.open_zarr,
+        }
+        with open_map[file_format](validated_source) as ds:
+            dataset = ds.load()
+        return dataset
diff --git a/echopype/utils/misc.py b/echopype/utils/misc.py
new file mode 100644
index 000000000..838f3ece6
--- /dev/null
+++ b/echopype/utils/misc.py
@@ -0,0 +1,25 @@
+def camelcase2snakecase(camel_case_str):
+    """
+    Convert string from CamelCase to snake_case
+    e.g. CamelCase becomes camel_case.
+ """ + idx = list(reversed([i for i, c in enumerate(camel_case_str) if c.isupper()])) + param_len = len(camel_case_str) + for i in idx: + # check if we should insert an underscore + if i > 0 and i < param_len: + camel_case_str = camel_case_str[:i] + "_" + camel_case_str[i:] + + return camel_case_str.lower() + + +def frequency_nominal_to_channel(source_Sv, frequency_nominal: int): + """ + Given a value for a nominal frequency, returns the channel associated with it + """ + channels = source_Sv["frequency_nominal"].coords["channel"].values + freqs = source_Sv["frequency_nominal"].values + chan = channels[freqs == frequency_nominal] + assert len(chan) == 1, "Frequency not uniquely identified" + channel = chan[0] + return channel diff --git a/pytest.ini b/pytest.ini index 7a3a8bfd2..9ad97f3a7 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,5 +1,7 @@ # test directory [pytest] testpaths = echopype/tests - cache_dir = .cache +markers = + unit: marks tests as unit tests + integration: marks tests as integration tests diff --git a/requirements-dev.txt b/requirements-dev.txt index fe3bd8841..80fbb3f60 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -9,6 +9,7 @@ flake8-mutable flake8-print isort mypy +moto numpydoc pre-commit pylint diff --git a/requirements.txt b/requirements.txt index fac895538..42422871d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,6 +15,7 @@ requests aiohttp xarray-datatree==0.0.6 psutil>=5.9.1 -more-itertools==8.13.0 geopy scikit-image +flox>=0.7.2,<1.0.0 +