Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Partial collapse of multi-dimensional string coords #4294

Open
wants to merge 14 commits into
base: main
Choose a base branch
from
3 changes: 3 additions & 0 deletions docs/src/whatsnew/latest.rst
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,9 @@ This document explains the changes made to Iris for this release
#. `@rcomer`_ made the :obj:`~iris.analysis.WPERCENTILE` aggregator work with
:func:`~iris.cube.Cube.rolling_window`. (:issue:`5777`, :pull:`5825`)

#. `@rcomer`_ enabled partial collapse of multi-dimensional string coordinates,
fixing :issue:`3653`. (:pull:`4294`)


💣 Incompatible Changes
=======================
Expand Down
64 changes: 44 additions & 20 deletions lib/iris/coords.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,14 @@
from functools import lru_cache
from itertools import zip_longest
import operator
from typing import Iterable, Optional, Union
import warnings
import zlib

import dask.array as da
import numpy as np
import numpy.ma as ma
import numpy.typing as npt

from iris._data_manager import DataManager
import iris._lazy_data as _lazy
Expand All @@ -32,11 +34,15 @@
import iris.exceptions
import iris.time
import iris.util
import iris.warnings

#: The default value for ignore_axis which controls guess_coord_axis' behaviour
DEFAULT_IGNORE_AXIS = False

# Define some typing aliases.
Dims = Union[int, Iterable[int]]
RealData = Union[np.ndarray, ma.MaskedArray]
RealOrLazyData = Union[RealData, da.Array]


class _DimensionalMetadata(CFVariableMixin, metaclass=ABCMeta):
"""Superclass for dimensional metadata."""
Expand Down Expand Up @@ -238,7 +244,7 @@ def _lazy_values(self):
"""Return a lazy array representing the dimensional metadata values."""
return self._values_dm.lazy_data()

def _core_values(self):
def _core_values(self) -> RealOrLazyData:
"""Value array of this dimensional metadata which may be a NumPy array or a dask array."""
result = self._values_dm.core_data()
if not _lazy.is_lazy_data(result):
Expand Down Expand Up @@ -771,7 +777,7 @@ def dtype(self):
return self._values_dm.dtype

@property
def ndim(self):
def ndim(self) -> int:
"""Return the number of dimensions of the current dimensional metadata object."""
return self._values_dm.ndim

Expand Down Expand Up @@ -1584,7 +1590,7 @@ def points(self, points):
self._values = points

@property
def bounds(self):
def bounds(self) -> Optional[RealData]:
"""Coordinate bounds values.

The coordinate bounds values, as a NumPy array,
Expand Down Expand Up @@ -1716,11 +1722,11 @@ def lazy_bounds(self):
lazy_bounds = self._bounds_dm.lazy_data()
return lazy_bounds

def core_points(self):
def core_points(self) -> RealOrLazyData:
"""Core points array at the core of this coord, which may be a NumPy array or a dask array."""
return super()._core_values()

def core_bounds(self):
def core_bounds(self) -> Optional[RealOrLazyData]:
"""Core bounds. The points array at the core of this coord, which may be a NumPy array or a dask array."""
result = None
if self.has_bounds():
Expand Down Expand Up @@ -2099,7 +2105,7 @@ def cell(self, index):

return Cell(point, bound)

def collapsed(self, dims_to_collapse=None):
def collapsed(self, dims_to_collapse: Optional[Dims] = None) -> "Coord":
"""Return a copy of this coordinate, which has been collapsed along the specified dimensions.

Replaces the points & bounds with a simple bounded region.
Expand All @@ -2108,28 +2114,46 @@ def collapsed(self, dims_to_collapse=None):
# through to numpy
if isinstance(dims_to_collapse, (int, np.integer)):
dims_to_collapse = (dims_to_collapse,)
if isinstance(dims_to_collapse, list):
if isinstance(dims_to_collapse, Iterable):
dims_to_collapse = tuple(dims_to_collapse)

if np.issubdtype(self.dtype, np.str_):
# Collapse the coordinate by serializing the points and
# bounds as strings.
def serialize(x):
return "|".join([str(i) for i in x.flatten()])
def serialize(
    x: npt.NDArray[np.str_], axis: Optional[Iterable[int]]
) -> Union[npt.NDArray[np.str_], str]:
    """Join the values of ``x`` with "|", over all of it or over given axes.

    Parameters
    ----------
    x :
        Array of values. Every element is converted with ``str`` before
        being joined.
    axis :
        ``None`` to join every element of ``x`` (in flattened order) into a
        single string, or an iterable of the dimensions of ``x`` to join
        over. An empty iterable joins over nothing.

    Returns
    -------
    str or numpy.ndarray
        A single string when ``axis`` is ``None``. Otherwise an array of
        joined strings whose shape is that of ``x`` with the ``axis``
        dimensions removed.

    """
    if axis is None:
        return "|".join(str(i) for i in x.flatten())

    # The annotation allows any iterable (including one-shot generators),
    # but the length is needed below, so materialise it exactly once.
    axes = tuple(axis)

    # Count the retained leading dims explicitly: slicing with
    # ``[: -len(axes)]`` degenerates to ``[:0]`` when ``axes`` is empty,
    # which would wrongly collapse everything.
    n_keep = x.ndim - len(axes)

    # np.apply_along_axis does not work with str.join, so we need to loop
    # through the array directly. First move the (possibly multiple) axes
    # of interest to the trailing dim(s), then make a 2D array we can loop
    # through: one row per output element.
    work_array = np.moveaxis(x, axes, range(n_keep, x.ndim))
    out_shape = work_array.shape[:n_keep]
    work_array = work_array.reshape(np.prod(out_shape, dtype=int), -1)

    joined = [serialize(arr_slice, None) for arr_slice in work_array]

    return np.array(joined).reshape(out_shape)

bounds = None
if self.has_bounds():
shape = self._bounds_dm.shape[1:]
bounds = []
for index in np.ndindex(shape):
index_slice = (slice(None),) + tuple(index)
bounds.append(serialize(self.bounds[index_slice]))
Copy link
Member Author

@rcomer rcomer Aug 24, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If I've understood the old slicing method, it and the new method are consistent for the basic case of 2D bounds (n_points, n_bounds). For higher dimensions, the old method doesn't look right to me: if we had a 2D (n, m) coordinate with bounds (n, m, n_bounds), it looks like we would get m * n_bounds elements in the list. But I think we would have wanted just n_bounds elements.

I didn't even know bounded string coords were a thing!

dtype = np.dtype("U{}".format(max(map(len, bounds))))
bounds = np.array(bounds, dtype=dtype).reshape((1,) + shape)
points = serialize(self.points)
dtype = np.dtype("U{}".format(len(points)))
# Express dims_to_collapse as non-negative integers.
if dims_to_collapse is None:
dims_to_collapse = range(self.ndim)
else:
dims_to_collapse = tuple(
dim % self.ndim for dim in dims_to_collapse
)
bounds = serialize(self.bounds, dims_to_collapse)

points = serialize(self.points, dims_to_collapse)
# Create the new collapsed coordinate.
coord = self.copy(points=np.array(points, dtype=dtype), bounds=bounds)
coord = self.copy(points=np.array(points), bounds=bounds)
else:
# Collapse the coordinate by calculating the bounded extremes.
if self.ndim > 1:
Expand Down
83 changes: 83 additions & 0 deletions lib/iris/tests/unit/coords/test_Coord.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

import dask.array as da
import numpy as np
import numpy.ma as ma
import pytest

import iris
Expand Down Expand Up @@ -542,6 +543,88 @@ def test_lazy_3_bounds(self):
self.assertArrayAlmostEqual(collapsed_coord.points, da.array([2.0]))
self.assertArrayAlmostEqual(collapsed_coord.bounds, da.array([[0.0, 4.0]]))

def test_string_masked(self):
    """Check a masked string point is rendered as "--" in the joined result."""
    masked_points = ma.array(["foo", "bar", "bing"], mask=[0, 1, 0], dtype=str)
    source = AuxCoord(masked_points)

    result = source.collapsed(0)

    self.assertEqual(result.points, "foo|--|bing")

def test_string_nd_first(self):
    """Check collapsing dimension 0 of a 2D string coord joins along that dimension."""
    self.setupTestArrays((3, 4))
    string_points = self.pts_real.astype(str)

    result = AuxCoord(string_points).collapsed(0)

    # One joined string per position along the remaining (second) dimension.
    self.assertArrayEqual(
        result.points,
        [
            "0.0|40.0|80.0",
            "10.0|50.0|90.0",
            "20.0|60.0|100.0",
            "30.0|70.0|110.0",
        ],
    )

def test_string_nd_second(self):
    """Check collapsing dimension 1 of a 2D string coord joins along that dimension."""
    self.setupTestArrays((3, 4))
    string_points = self.pts_real.astype(str)

    result = AuxCoord(string_points).collapsed(1)

    # One joined string per position along the remaining (first) dimension.
    self.assertArrayEqual(
        result.points,
        [
            "0.0|10.0|20.0|30.0",
            "40.0|50.0|60.0|70.0",
            "80.0|90.0|100.0|110.0",
        ],
    )

def test_string_nd_bounds_first(self):
    """Check bounds of a 2D string coord are joined per bound when collapsing dimension 0."""
    self.setupTestArrays((3, 4))
    string_points = self.pts_real.astype(str)
    string_bounds = self.bds_real.astype(str)

    result = AuxCoord(string_points, bounds=string_bounds).collapsed(0)

    # The points behave as in the unbounded tests, so only the bounds are
    # verified here: index 0 of the last bounds dimension, then index 1.
    self.assertArrayEqual(
        result.bounds[:, 0],
        [
            "-2.0|38.0|78.0",
            "8.0|48.0|88.0",
            "18.0|58.0|98.0",
            "28.0|68.0|108.0",
        ],
    )
    self.assertArrayEqual(
        result.bounds[:, 1],
        [
            "2.0|42.0|82.0",
            "12.0|52.0|92.0",
            "22.0|62.0|102.0",
            "32.0|72.0|112.0",
        ],
    )

def test_string_nd_bounds_second(self):
    """Check bounds of a 2D string coord are joined per bound when collapsing dimension 1."""
    self.setupTestArrays((3, 4))
    string_points = self.pts_real.astype(str)
    string_bounds = self.bds_real.astype(str)

    result = AuxCoord(string_points, bounds=string_bounds).collapsed(1)

    # The points behave as in the unbounded tests, so only the bounds are
    # verified here: index 0 of the last bounds dimension, then index 1.
    self.assertArrayEqual(
        result.bounds[:, 0],
        [
            "-2.0|8.0|18.0|28.0",
            "38.0|48.0|58.0|68.0",
            "78.0|88.0|98.0|108.0",
        ],
    )
    self.assertArrayEqual(
        result.bounds[:, 1],
        [
            "2.0|12.0|22.0|32.0",
            "42.0|52.0|62.0|72.0",
            "82.0|92.0|102.0|112.0",
        ],
    )


class Test_is_compatible(tests.IrisTest):
def setUp(self):
Expand Down
Loading