Add Further Selection Functionality for Regions2D (#97)
* add further selection functionality

* add selection functionality for depth range
* add selection functionality for time range
* create test to check for both time and depth functionality
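
A minimal sketch of how the new time/depth selection could be exercised; the file path and values are illustrative only and mirror the test added further down in this commit:

import pandas as pd
import echoregions as er

# Hypothetical path; mirrors the x1.evr fixture used in the tests below.
r2d = er.read_evr("data/x1.evr")

# Keep only regions whose point times all fall inside a two-day window
# and whose depths all lie between 0 and 100.
subset = r2d.select_region(
    time_range=[
        pd.to_datetime("2017-06-24T16:31:36"),
        pd.to_datetime("2017-06-26T16:31:40"),
    ],
    depth_range=[0.0, 100.0],
)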

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* remove option for dataframe and series

* update select_region by removing the option for region_id to be a Series or DataFrame
* update test to test for "bad" list values
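
A sketch of how the "bad" list value case might be checked, assuming pytest and the same x1.evr fixture used elsewhere in the test suite:

import pytest
import echoregions as er

r2d = er.read_evr("data/x1.evr")  # hypothetical path for illustration

# Entries in a region_id list must each be a float, int, or str;
# anything else (e.g. a nested list) should raise a TypeError.
with pytest.raises(TypeError):
    r2d.select_region(region_id=[1, [2, 3]])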

* add timestamp

* add timestamp import

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* convert numpy num to python num

* convert numpy numeric values to basic python float values
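
For reference, a minimal sketch of the conversion pattern (``.item()`` on a NumPy scalar returns the equivalent built-in Python number):

import numpy as np

values = [np.int64(5), np.float64(7.5)]
# .item() turns each NumPy scalar into a plain Python int/float.
plain_values = [v.item() for v in values]  # -> [5, 7.5]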

* formatting changes

* make description changes for precision
* change multiple isinstance to single isinstance
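
The isinstance consolidation referred to above, sketched in isolation:

value = 3.0

# Before: several isinstance calls chained with `or`.
is_valid = isinstance(value, float) or isinstance(value, int) or isinstance(value, str)

# After: a single isinstance call with a tuple of types (same behaviour).
is_valid = isinstance(value, (float, int, str))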

* initialize region first; remove untouched

* make copy default to True
* initialize region with the copy value at the beginning
* remove untouched
* remove check for region being None
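
A simplified sketch of the restructuring these bullets describe (the real method also applies the region_id/time_range/depth_range filters before returning):

def select_region(self, region_id=None, copy=True):
    # Initialize the working DataFrame up front: a copy by default,
    # or the original object when copy=False; no separate "untouched"
    # frame or `region is None` check is needed.
    region = self.data.copy() if copy else self.data
    # ... region_id / time_range / depth_range filtering happens here ...
    return region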

* rework iterrows with .apply and lambda

* for selecting rows with "valid" time and depth values, exchange the iterrows implementation with a .apply and lambda implementation
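
A side-by-side sketch of that change with illustrative data (the real method filters on list-valued "time" and "depth" columns):

import pandas as pd

df = pd.DataFrame({"region_id": [1, 2], "time": [[1, 2], [3, 9]]})
lo, hi = 0, 5

# Old style: iterate row by row with iterrows and collect matching indices.
keep = [idx for idx, row in df.iterrows() if all(lo <= t <= hi for t in row["time"])]
old_result = df.loc[keep]

# New style: build a boolean mask in one pass with .apply and a lambda.
new_result = df[df["time"].apply(lambda arr: all(lo <= t <= hi for t in arr))]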

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
ctuguinay and pre-commit-ci[bot] authored Jul 3, 2023
1 parent 1c1510e commit 3a3251b
Showing 2 changed files with 146 additions and 29 deletions.
145 changes: 116 additions & 29 deletions echoregions/regions2d/regions2d.py
@@ -6,7 +6,7 @@
import numpy as np
import regionmask
import xarray as xr
from pandas import DataFrame, Series
from pandas import DataFrame, Series, Timestamp
from xarray import DataArray

from ..utils.io import validate_path
@@ -73,47 +73,134 @@ def to_json(self, save_path: str = None) -> None:
"""

def select_region(
self, region: Union[float, str, list, Series, DataFrame] = None, copy=False
self,
region_id: Union[float, int, str, List[Union[float, int, str]]] = None,
time_range: List[Timestamp] = None,
depth_range: List[Union[float, int]] = None,
copy=True,
) -> DataFrame:
"""Ensure that region is a DataFrame.
"""Selects a subset of this Region2D object's dataframe.
Parameters
----------
region : float, str, list, Series, DataFrame, ``None``
A region id provided as a number, string, list of these,
or a DataFrame/Series containing the region_id column name.
region_id : float, int, str, list, ``None``
A region id provided as a number, a string, or list of these.
time_range : list of 2 Pandas Timestamps
Datetime range for the subselected DataFrame. The 1st index value
must be later than the 0th index value.
depth_range : list of 2 floats
Depth range for the subselected DataFrame. The 1st index value
must be larger than the 0th index value.
copy : bool
Return a copy of the `data` DataFrame
Returns
-------
DataFrame
A DataFrame subselected from Regions2D.data.
There is a row for each region id provided by the region parameter.
There is a row for each region id provided by the ``region_id`` parameter,
and each row has time and depth within or on the boundaries passed
in by the ``time_range`` and ``depth_range`` values.
"""
if region is not None:
if isinstance(region, DataFrame):
region = list(region.region_id)
elif isinstance(region, Series):
region = [region.region_id]
elif (
isinstance(region, float)
or isinstance(region, int)
or isinstance(region, str)
):
region = [region]
elif not isinstance(region, list):
# Make copy of original dataframe; else, use original dataframe in selection.
if copy:
region = self.data.copy()
else:
region = self.data
if region_id is not None:
if isinstance(region_id, (float, int, str)):
region_id = [region_id]
elif not isinstance(region_id, list):
raise TypeError(
f"Invalid Region Type: {type(region)}. Must be \
of type float, str, list, Series, DataFrame, ``None``"
f"Invalid region_id type: {type(region_id)}. Must be \
of type float, int, str, list, ``None``."
)
# Select row by column id
region = self.data[self.data["region_id"].isin(region)]
else:
region = self.data
if copy:
return region.copy()
else:
return region
for value in region_id:
if not isinstance(value, (float, int, str)):
raise TypeError(
f"Invalid element in list region_id. Is of \
type: {type(value)}Must be \
of type float, int, str."
)
region = self.data[self.data["region_id"].isin(region_id)]
if time_range is not None:
if isinstance(time_range, List):
if len(time_range) == 2:
if isinstance(time_range[0], Timestamp) and isinstance(
time_range[1], Timestamp
):
if time_range[0] < time_range[1]:
# Select rows with time values that are all within time range
region = region[
region["time"].apply(
lambda time_array: all(
time_range[0] <= Timestamp(x) <= time_range[1]
for x in time_array
)
)
]
else:
raise ValueError(
f"1st index value must be later than 0th index \
value. Currently 0th index value is {time_range[0]} \
and 1st index value is {time_range[1]}"
)
else:
raise TypeError(
f"Invalid time_range value types: \
{type(time_range[0])} and {type(time_range[1])}. Must \
be both of type Timestamp."
)
else:
raise ValueError(
f"Invalid time_range size: {len(time_range)}. \
Must be of size 2."
)
else:
raise TypeError(
f"Invalid time_range type: {type(time_range)}. Must be \
of type List."
)
if depth_range is not None:
if isinstance(depth_range, List):
if len(depth_range) == 2:
if isinstance(depth_range[0], (float, int)) and isinstance(
depth_range[1], (float, int)
):
if depth_range[0] < depth_range[1]:
# Select rows with depth values that are all within depth range
region = region[
region["time"].apply(
lambda depth_array: all(
depth_range[0] <= float(x) <= depth_range[1]
for x in depth_array
)
)
]
else:
raise ValueError(
f"1st index value must be later than 0th index \
value. Currently 0th index value is {depth_range[0]} \
and 1st index value is {depth_range[1]}"
)
else:
raise TypeError(
f"Invalid depth_range value types: \
{type(depth_range[0])} and {type(depth_range[1])}. Must \
be both of type either float or int."
)
else:
raise ValueError(
f"Invalid depth_range size: {len(depth_range)}. \
Must be of size 2."
)
else:
raise TypeError(
f"Invalid depth_range type: {type(depth_range)}. Must be \
of type List."
)
return region

def close_region(
self, region: Union[float, str, List, Series, DataFrame] = None
30 changes: 30 additions & 0 deletions echoregions/tests/test_r2d.py
@@ -2,6 +2,7 @@
from datetime import timedelta

import numpy as np
import pandas as pd
import pytest
import xarray as xr
from xarray import DataArray, Dataset
@@ -150,6 +151,33 @@ def test_select_sonar_file():
assert raw == ["Summer2017-D20170625-T195927.nc"]


def test_select_region():
"""
tests select region functionality
"""
evr_path = data_dir + "x1.evr"
r2d = er.read_evr(evr_path)
region_id = 2
time_range = [
pd.to_datetime("2017-06-24T16:31:36.338500000"),
pd.to_datetime("2017-06-26T16:31:40.211500000"),
]
depth_range = [-10000.0, 10000.0]
df_1 = r2d.select_region(region_id=region_id)
df_2 = r2d.select_region(time_range=time_range)
df_3 = r2d.select_region(depth_range=depth_range)
for df_region_id in df_1["region_id"]:
assert df_region_id == region_id
for time_array in df_2["time"]:
for time in time_array:
assert time >= time_range[0]
assert time <= time_range[1]
for depth_array in df_3["depth"]:
for depth in depth_array:
assert depth >= depth_range[0]
assert depth <= depth_range[1]


@pytest.mark.filterwarnings("ignore:No gridpoint belongs to any region")
def test_mask_no_overlap():
"""
@@ -186,6 +214,8 @@ def test_mask_correct_labels():
r2d = er.read_evr(evr_path)
region_ids = r2d.data.region_id.values # Output is that of IntegerArray
region_ids = list(region_ids) # Convert to List
# Convert numpy numeric values to basic Python float values
region_ids = [region_id.item() for region_id in region_ids]
da_Sv = xr.open_dataset(os.path.join(data_dir, "x1_test.nc")).Sv
M = r2d.mask(da_Sv, region_ids, mask_labels=region_ids)
# it matches only the 11th region because x1_test.nc is a chunk around that region only
