From 7d557c21d92d85dcb6fc886b1f8974a113b2cceb Mon Sep 17 00:00:00 2001 From: FBumann <117816358+FBumann@users.noreply.github.com> Date: Thu, 4 Jun 2026 13:40:45 +0200 Subject: [PATCH 1/9] fix(groupby): group by the name of a non-dimension coordinate `LinearExpression.groupby` could not group by an attached non-dimension coordinate. `expr.groupby("period").sum()` raised `ValueError: period already exists as coordinate or variable name`, and passing the coordinate `DataArray` (`groupby(period)`) raised because the fast path dropped only the dimension index, then renamed the group dim onto a name still held by the attached coordinate. Fix both paths: - `sum()` resolves a string group naming an existing coordinate to that coordinate so it takes the fast path, and drops every coordinate aligned to the grouped dimension (index, MultiIndex levels, auxiliary coords) before reshaping, since collapsing the dimension invalidates them all. - The `groupby` property detaches an attached non-dimension coordinate used as the group before handing it to xarray, so xarray does not try to re-expand it when recombining groups (the `use_fallback=True` path). `expr.groupby("period").sum()` now mirrors `xarray.Dataset.groupby`. Closes #750 Co-Authored-By: Claude Opus 4.8 (1M context) --- doc/release_notes.rst | 1 + linopy/expressions.py | 38 +++++++++++++---- test/test_linear_expression.py | 74 ++++++++++++++++++++++++++++++++++ 3 files changed, 106 insertions(+), 7 deletions(-) diff --git a/doc/release_notes.rst b/doc/release_notes.rst index b2ff60de..dd8e5e10 100644 --- a/doc/release_notes.rst +++ b/doc/release_notes.rst @@ -58,6 +58,7 @@ Most users should keep calling ``model.solve(...)``. If you want more control, y * ``add_variables`` / ``add_constraints``: extends 0.7.0's coords-as-truth rule to ``lower``, ``upper`` and ``mask`` for every bound type and dim order. Pandas ``Series`` / ``DataFrame`` bounds or masks missing a dimension are broadcast to ``coords`` instead of being silently dropped (`#709 `__); the variable's dimension order always follows ``coords`` (`#706 `__); bare-tuple coord entries (``coords=[(0, 1, 2)]``) now behave like lists. Mismatched values or extra dims raise ``ValueError`` with a labelled message; sparse-coord masks (formerly a v0.6.3 ``FutureWarning``, #580) raise ``ValueError``, and masks with dims not in the data raise ``ValueError`` instead of ``AssertionError``. * Pandas inputs whose index names *levels* of a stacked-``MultiIndex`` ``coords`` dimension are now projected onto that dimension: a level subset broadcasts across the others, the full set aligns element-wise. This fixes PyPSA multi-investment arithmetic (e.g. an expression over a ``(period, timestep)`` ``snapshot`` MultiIndex times a ``period``-indexed weighting). In ``add_variables`` / ``add_constraints`` the input must provide a value for every level combination of the MultiIndex or a ``ValueError`` is raised (the error lists the missing combinations). **Implicit level projections are deprecated**: they emit an ``EvolvingAPIWarning`` everywhere — in arithmetic *and* in ``add_variables`` / ``add_constraints`` — and will raise under the upcoming v1 convention. Project the input onto the dimension explicitly (select with the dimension's level values) to keep current behavior. Aligning the full level set with full coverage stays silent. Strict validation also rejects a ``MultiIndex`` input with *unnamed* levels whose combinations don't match ``coords`` (previously a silent bypass, as such inputs can't be projected by level name). +* ``LinearExpression.groupby`` now accepts the *name* of an attached non-dimension coordinate (``expr.groupby("period").sum()``), and grouping by a ``DataArray`` whose name matches an attached coordinate no longer raises ``ValueError: ... already exists``. Both the fast path and the ``use_fallback=True`` path are fixed, and any auxiliary coordinate aligned to the grouped dimension is correctly dropped instead of breaking the reshape (`#750 `__). * ``add_piecewise_formulation`` now produces a reproducible dimension order in the broadcast breakpoint array. The previous set-based expansion gave a hash-randomized order that varied between processes. * SOS constraints on masked variables no longer cause solver-specific failures (Gurobi ``IndexError``, Xpress ``?404 Invalid column number``, LP parse errors, silent set corruption). ``Model.solve()`` and ``Model.to_file()`` now raise a clear ``NotImplementedError`` referring users to `#688 `__; pass ``reformulate_sos=True`` as a workaround. * ``Model.solve(..., reformulate_sos=True)`` now actually reformulates SOS constraints even when the solver supports them natively. Previously it was silently ignored with a warning. diff --git a/linopy/expressions.py b/linopy/expressions.py index b0515ea2..3e00c075 100644 --- a/linopy/expressions.py +++ b/linopy/expressions.py @@ -168,7 +168,18 @@ def groupby(self) -> xarray.core.groupby.DatasetGroupBy: else: group = self.group # type: ignore - return self.data.groupby(group=group, **self.kwargs) + # Detach a non-dimension coordinate used as the group so xarray does + # not try to re-expand it while recombining the groups (GH #750); the + # group values are still supplied through ``group`` itself. + data = self.data + if isinstance(group, str) and group in data.coords and group not in data.dims: + group = data[group] + if isinstance(group, DataArray) and group.name in set(data.coords) - set( + data.dims + ): + data = data.drop_vars([group.name]) + + return data.groupby(group=group, **self.kwargs) def map( self, @@ -226,9 +237,15 @@ def sum(self, use_fallback: bool = False, **kwargs: Any) -> LinearExpression: LinearExpression The sum of the groupby object. """ + group = self.group + # A string selects an existing coordinate, mirroring xarray's + # ``Dataset.groupby("name")``. Resolve it to that coordinate so it + # takes the fast path below instead of the slower xarray fallback. + if isinstance(group, str) and group in self.data.coords: + group = self.data[group] + non_fallback_types = (pd.Series, pd.DataFrame, xr.DataArray) - if isinstance(self.group, non_fallback_types) and not use_fallback: - group: pd.Series | pd.DataFrame | xr.DataArray = self.group + if isinstance(group, non_fallback_types) and not use_fallback: if isinstance(group, pd.DataFrame): # dataframes do not have a name, so we need to set it final_group_name = "group" @@ -254,10 +271,17 @@ def sum(self, use_fallback: bool = False, **kwargs: Any) -> LinearExpression: arrays = [group, group.groupby(group).cumcount()] idx = pd.MultiIndex.from_arrays(arrays, names=[GROUP_DIM, GROUPED_TERM_DIM]) new_coords = Coordinates.from_pandas_multiindex(idx, group_dim) - coords = self.data.indexes[group_dim] - names_to_drop = [coords.name] - if isinstance(coords, pd.MultiIndex): - names_to_drop += list(coords.names) + # Collapsing ``group_dim`` into groups invalidates every coordinate + # aligned to it: the dimension index, any MultiIndex levels, and + # auxiliary (non-dimension) coords such as the one being grouped by. + # Drop them all before reshaping, otherwise they clash with the + # regrouped dimension or with the final rename onto + # ``final_group_name`` (GH #750). + names_to_drop = [ + name + for name, coord in self.data.coords.items() + if group_dim in coord.dims + ] ds = self.data.drop_vars(names_to_drop).assign_coords(new_coords) ds = ds.unstack(group_dim, fill_value=LinearExpression._fill_value) ds = LinearExpression._sum(ds, dim=GROUPED_TERM_DIM) diff --git a/test/test_linear_expression.py b/test/test_linear_expression.py index 2580f033..315aad7d 100644 --- a/test/test_linear_expression.py +++ b/test/test_linear_expression.py @@ -1384,6 +1384,80 @@ def test_linear_expression_groupby_on_same_name_as_target_dim( assert grouped.nterm == 10 +class TestGroupbyByAttachedCoordinate: + """GH #750: group by an attached non-dimension coordinate (name or array).""" + + # Such a coordinate, referenced by its name (like xarray's + # ``Dataset.groupby("name")``) or as the coordinate ``DataArray`` itself, + # previously raised ``ValueError: ... already exists`` / ``KeyError``. + + @pytest.fixture + def period(self, v: Variable) -> xr.DataArray: + return xr.DataArray([2020] * 10 + [2030] * 10, coords=v.coords, name="period") + + @pytest.fixture + def expr(self, v: Variable, period: xr.DataArray) -> LinearExpression: + return (1 * v).assign_coords(period=period) + + @pytest.fixture(params=["name", "dataarray"]) + def group(self, request: Any, period: xr.DataArray) -> str | xr.DataArray: + # the same coordinate referenced either by name or as the DataArray + return "period" if request.param == "name" else period + + @pytest.mark.parametrize("use_fallback", [True, False]) + def test_groups_like_detached( + self, + expr: LinearExpression, + period: xr.DataArray, + group: str | xr.DataArray, + use_fallback: bool, + ) -> None: + grouped = expr.groupby(group).sum(use_fallback=use_fallback) + + assert "period" in grouped.dims + assert (grouped.data.period == [2020, 2030]).all() + assert grouped.nterm == 10 + # same as detaching the coord and grouping by the array explicitly + expected = ( + expr.drop_vars("period").groupby(period).sum(use_fallback=use_fallback) + ) + assert_linequal(grouped, expected) + + def test_drops_other_aux_coords( + self, v: Variable, expr: LinearExpression, group: str | xr.DataArray + ) -> None: + # A second auxiliary coord on the grouped dimension must not break the + # reshape (it raised ``KeyError: 'timestep'`` before the fix); the fast + # path drops coords invalidated by collapsing the dimension. + other = xr.DataArray(list("ab" * 10), coords=v.coords, name="timestep") + grouped = expr.assign_coords(timestep=other).groupby(group).sum() + + assert "period" in grouped.dims + assert "timestep" not in grouped.coords + assert_linequal(grouped, expr.groupby(group).sum()) + + @pytest.mark.parametrize("by", ["name", "dataarray"]) + def test_keeps_other_dim(self, by: str) -> None: + # On a 2-D expression, grouping one dimension by an aux coord must keep + # the other dimension intact and sum the right terms. + m = Model() + snapshot = pd.RangeIndex(4, name="snapshot") + gen = pd.Index(["g1", "g2"], name="gen") + period = xr.DataArray( + [2020, 2020, 2030, 2030], coords=[snapshot], name="period" + ) + y = m.add_variables(coords=[snapshot, gen], name="y") + expr = (1 * y).assign_coords(period=period) + group = "period" if by == "name" else period + + grouped = expr.groupby(group).sum() + + assert {"period", "gen"} <= set(grouped.dims) + assert grouped.sizes["gen"] == 2 + assert (grouped.data.period == [2020, 2030]).all() + assert_linequal(grouped, expr.drop_vars("period").groupby(period).sum()) + + @pytest.mark.parametrize("use_fallback", [True]) def test_linear_expression_groupby_ndim(z: Variable, use_fallback: bool) -> None: # TODO: implement fallback for n-dim groupby, see https://github.com/PyPSA/linopy/issues/299 From ce1a8f5fdfbeef0619383549bbd02cfb15ab2b05 Mon Sep 17 00:00:00 2001 From: FBumann <117816358+FBumann@users.noreply.github.com> Date: Thu, 4 Jun 2026 14:03:54 +0200 Subject: [PATCH 2/9] test(groupby): assert grouping against hard-coded results Reorganize the #750 coverage into TestGroupbyByAttachedCoordinate, a parametrized matrix that asserts the grouped expression against hard-coded `vars`/`coeffs` literals (on a deterministic 4- and 8-variable model) instead of comparing to a sibling computation that could share the same bug. Covers single-key (name / DataArray x use_fallback), multi-key (list / tuple x use_fallback), an extra auxiliary coord on the grouped dimension, and a 2-D variable that must keep its other dimension. Co-Authored-By: Claude Opus 4.8 (1M context) --- test/test_linear_expression.py | 127 +++++++++++++++++++++++---------- 1 file changed, 89 insertions(+), 38 deletions(-) diff --git a/test/test_linear_expression.py b/test/test_linear_expression.py index 315aad7d..a02bb03d 100644 --- a/test/test_linear_expression.py +++ b/test/test_linear_expression.py @@ -1385,77 +1385,128 @@ def test_linear_expression_groupby_on_same_name_as_target_dim( class TestGroupbyByAttachedCoordinate: - """GH #750: group by an attached non-dimension coordinate (name or array).""" + """GH #750: group by an attached non-dimension coordinate.""" # Such a coordinate, referenced by its name (like xarray's # ``Dataset.groupby("name")``) or as the coordinate ``DataArray`` itself, # previously raised ``ValueError: ... already exists`` / ``KeyError``. + # + # Results are asserted against hard-coded ``vars``/``coeffs`` so a + # regression in the grouping is caught directly -- not relative to another + # (possibly equally wrong) computation. The base expression is four + # variables labelled ``0..3`` on dim ``t`` with coefficient ``2.0`` and two + # attached level coords; the 2-D case uses an eight-variable grid (0..7). @pytest.fixture - def period(self, v: Variable) -> xr.DataArray: - return xr.DataArray([2020] * 10 + [2030] * 10, coords=v.coords, name="period") + def t(self) -> pd.RangeIndex: + return pd.RangeIndex(4, name="t") @pytest.fixture - def expr(self, v: Variable, period: xr.DataArray) -> LinearExpression: - return (1 * v).assign_coords(period=period) + def period(self, t: pd.RangeIndex) -> xr.DataArray: + return xr.DataArray( + [2020, 2020, 2030, 2030], dims="t", coords={"t": t}, name="period" + ) + + @pytest.fixture + def season(self, t: pd.RangeIndex) -> xr.DataArray: + return xr.DataArray(list("wsws"), dims="t", coords={"t": t}, name="season") - @pytest.fixture(params=["name", "dataarray"]) - def group(self, request: Any, period: xr.DataArray) -> str | xr.DataArray: - # the same coordinate referenced either by name or as the DataArray - return "period" if request.param == "name" else period + @pytest.fixture + def expr( + self, t: pd.RangeIndex, period: xr.DataArray, season: xr.DataArray + ) -> LinearExpression: + m = Model() + x = m.add_variables(coords=[t], name="x") + return (2.0 * x).assign_coords(period=period, season=season) @pytest.mark.parametrize("use_fallback", [True, False]) - def test_groups_like_detached( + @pytest.mark.parametrize("by", ["name", "dataarray"]) + def test_single_key( self, expr: LinearExpression, period: xr.DataArray, - group: str | xr.DataArray, + by: str, use_fallback: bool, ) -> None: + group = "period" if by == "name" else period + grouped = expr.groupby(group).sum(use_fallback=use_fallback) - assert "period" in grouped.dims - assert (grouped.data.period == [2020, 2030]).all() - assert grouped.nterm == 10 - # same as detaching the coord and grouping by the array explicitly - expected = ( - expr.drop_vars("period").groupby(period).sum(use_fallback=use_fallback) - ) - assert_linequal(grouped, expected) + assert grouped.data.period.values.tolist() == [2020, 2030] + assert grouped.vars.transpose("period", TERM_DIM).values.tolist() == [ + [0, 1], + [2, 3], + ] + assert grouped.coeffs.transpose("period", TERM_DIM).values.tolist() == [ + [2.0, 2.0], + [2.0, 2.0], + ] - def test_drops_other_aux_coords( - self, v: Variable, expr: LinearExpression, group: str | xr.DataArray + @pytest.mark.parametrize("use_fallback", [True, False]) + @pytest.mark.parametrize("spelling", [list, tuple], ids=["list", "tuple"]) + def test_multi_key( + self, expr: LinearExpression, spelling: type, use_fallback: bool + ) -> None: + group = spelling(["period", "season"]) + + grouped = expr.groupby(group).sum(use_fallback=use_fallback) + + assert dict(grouped.sizes) == {"period": 2, "season": 2, TERM_DIM: 1} + assert grouped.data.period.values.tolist() == [2020, 2030] + assert grouped.data.season.values.tolist() == ["s", "w"] + assert grouped.vars.transpose("period", "season", TERM_DIM).values.tolist() == [ + [[1], [0]], + [[3], [2]], + ] + assert (grouped.coeffs == 2.0).all() + + def test_extra_aux_coord_does_not_change_result( + self, t: pd.RangeIndex, period: xr.DataArray ) -> None: - # A second auxiliary coord on the grouped dimension must not break the - # reshape (it raised ``KeyError: 'timestep'`` before the fix); the fast - # path drops coords invalidated by collapsing the dimension. - other = xr.DataArray(list("ab" * 10), coords=v.coords, name="timestep") - grouped = expr.assign_coords(timestep=other).groupby(group).sum() + # A second auxiliary coord on the grouped dimension must neither break + # the reshape (it raised ``KeyError`` before the fix) nor change the sum. + m = Model() + x = m.add_variables(coords=[t], name="x") + timestep = xr.DataArray( + list("abab"), dims="t", coords={"t": t}, name="timestep" + ) + expr = (2.0 * x).assign_coords(period=period, timestep=timestep) + + grouped = expr.groupby("period").sum() - assert "period" in grouped.dims assert "timestep" not in grouped.coords - assert_linequal(grouped, expr.groupby(group).sum()) + assert grouped.vars.transpose("period", TERM_DIM).values.tolist() == [ + [0, 1], + [2, 3], + ] + assert (grouped.coeffs == 2.0).all() @pytest.mark.parametrize("by", ["name", "dataarray"]) - def test_keeps_other_dim(self, by: str) -> None: - # On a 2-D expression, grouping one dimension by an aux coord must keep - # the other dimension intact and sum the right terms. + def test_two_dimensional(self, by: str) -> None: + # Grouping one dimension of a 2-D variable by an aux coord must keep the + # other dimension intact and pair up the right variable labels. m = Model() snapshot = pd.RangeIndex(4, name="snapshot") gen = pd.Index(["g1", "g2"], name="gen") + y = m.add_variables(coords=[snapshot, gen], name="y") # labels 0..7 period = xr.DataArray( - [2020, 2020, 2030, 2030], coords=[snapshot], name="period" + [2020, 2020, 2030, 2030], + dims="snapshot", + coords={"snapshot": snapshot}, + name="period", ) - y = m.add_variables(coords=[snapshot, gen], name="y") - expr = (1 * y).assign_coords(period=period) + expr = (1.0 * y).assign_coords(period=period) group = "period" if by == "name" else period grouped = expr.groupby(group).sum() - assert {"period", "gen"} <= set(grouped.dims) - assert grouped.sizes["gen"] == 2 - assert (grouped.data.period == [2020, 2030]).all() - assert_linequal(grouped, expr.drop_vars("period").groupby(period).sum()) + assert grouped.data.period.values.tolist() == [2020, 2030] + assert grouped.data.gen.values.tolist() == ["g1", "g2"] + assert grouped.vars.transpose("period", "gen", TERM_DIM).values.tolist() == [ + [[0, 2], [1, 3]], + [[4, 6], [5, 7]], + ] + assert (grouped.coeffs == 1.0).all() @pytest.mark.parametrize("use_fallback", [True]) From cf3716ef1237a50189bd78c9be0820bf6cb2204a Mon Sep 17 00:00:00 2001 From: FBumann <117816358+FBumann@users.noreply.github.com> Date: Thu, 4 Jun 2026 14:08:35 +0200 Subject: [PATCH 3/9] fix(groupby): support single-element key list; document multi-key contract A single-element key list (`groupby(["period"])`) now groups like the scalar key, matching xarray -- it is unwrapped in both `sum()` and the `groupby` property. Multi-key grouping must be spelled with names (`["period", "season"]`); a list of `DataArray`s is unhashable and raises in xarray itself, so linopy mirrors that (covered by an explicit `pytest.raises` row in the matrix). Also shorten the test class docstring to house style. Co-Authored-By: Claude Opus 4.8 (1M context) --- doc/release_notes.rst | 2 +- linopy/expressions.py | 19 ++++++++++------- test/test_linear_expression.py | 38 ++++++++++++++++++++++++---------- 3 files changed, 40 insertions(+), 19 deletions(-) diff --git a/doc/release_notes.rst b/doc/release_notes.rst index dd8e5e10..12154c3c 100644 --- a/doc/release_notes.rst +++ b/doc/release_notes.rst @@ -58,7 +58,7 @@ Most users should keep calling ``model.solve(...)``. If you want more control, y * ``add_variables`` / ``add_constraints``: extends 0.7.0's coords-as-truth rule to ``lower``, ``upper`` and ``mask`` for every bound type and dim order. Pandas ``Series`` / ``DataFrame`` bounds or masks missing a dimension are broadcast to ``coords`` instead of being silently dropped (`#709 `__); the variable's dimension order always follows ``coords`` (`#706 `__); bare-tuple coord entries (``coords=[(0, 1, 2)]``) now behave like lists. Mismatched values or extra dims raise ``ValueError`` with a labelled message; sparse-coord masks (formerly a v0.6.3 ``FutureWarning``, #580) raise ``ValueError``, and masks with dims not in the data raise ``ValueError`` instead of ``AssertionError``. * Pandas inputs whose index names *levels* of a stacked-``MultiIndex`` ``coords`` dimension are now projected onto that dimension: a level subset broadcasts across the others, the full set aligns element-wise. This fixes PyPSA multi-investment arithmetic (e.g. an expression over a ``(period, timestep)`` ``snapshot`` MultiIndex times a ``period``-indexed weighting). In ``add_variables`` / ``add_constraints`` the input must provide a value for every level combination of the MultiIndex or a ``ValueError`` is raised (the error lists the missing combinations). **Implicit level projections are deprecated**: they emit an ``EvolvingAPIWarning`` everywhere — in arithmetic *and* in ``add_variables`` / ``add_constraints`` — and will raise under the upcoming v1 convention. Project the input onto the dimension explicitly (select with the dimension's level values) to keep current behavior. Aligning the full level set with full coverage stays silent. Strict validation also rejects a ``MultiIndex`` input with *unnamed* levels whose combinations don't match ``coords`` (previously a silent bypass, as such inputs can't be projected by level name). -* ``LinearExpression.groupby`` now accepts the *name* of an attached non-dimension coordinate (``expr.groupby("period").sum()``), and grouping by a ``DataArray`` whose name matches an attached coordinate no longer raises ``ValueError: ... already exists``. Both the fast path and the ``use_fallback=True`` path are fixed, and any auxiliary coordinate aligned to the grouped dimension is correctly dropped instead of breaking the reshape (`#750 `__). +* ``LinearExpression.groupby`` now accepts the *name* of an attached non-dimension coordinate (``expr.groupby("period").sum()``), and grouping by a ``DataArray`` whose name matches an attached coordinate no longer raises ``ValueError: ... already exists``. Both the fast path and the ``use_fallback=True`` path are fixed, a single-element key list (``groupby(["period"])``) groups like the scalar key, and any auxiliary coordinate aligned to the grouped dimension is correctly dropped instead of breaking the reshape (`#750 `__). * ``add_piecewise_formulation`` now produces a reproducible dimension order in the broadcast breakpoint array. The previous set-based expansion gave a hash-randomized order that varied between processes. * SOS constraints on masked variables no longer cause solver-specific failures (Gurobi ``IndexError``, Xpress ``?404 Invalid column number``, LP parse errors, silent set corruption). ``Model.solve()`` and ``Model.to_file()`` now raise a clear ``NotImplementedError`` referring users to `#688 `__; pass ``reformulate_sos=True`` as a workaround. * ``Model.solve(..., reformulate_sos=True)`` now actually reformulates SOS constraints even when the solver supports them natively. Previously it was silently ignored with a warning. diff --git a/linopy/expressions.py b/linopy/expressions.py index 3e00c075..4421cd20 100644 --- a/linopy/expressions.py +++ b/linopy/expressions.py @@ -158,15 +158,17 @@ def groupby(self) -> xarray.core.groupby.DatasetGroupBy: xarray.core.groupby.DataArrayGroupBy The groupby object. """ - if isinstance(self.group, pd.DataFrame): + group = self.group + if isinstance(group, (list, tuple)) and len(group) == 1: + # a single-element key groups like the scalar key (xarray parity) + group = group[0] + + if isinstance(group, pd.DataFrame): raise ValueError( "Grouping by a DataFrame only supported for `sum` operation with `use_fallback=False`." ) - if isinstance(self.group, pd.Series): - group_name = self.group.name or "group" - group = DataArray(self.group, name=group_name) - else: - group = self.group # type: ignore + if isinstance(group, pd.Series): + group = DataArray(group, name=group.name or "group") # Detach a non-dimension coordinate used as the group so xarray does # not try to re-expand it while recombining the groups (GH #750); the @@ -179,7 +181,7 @@ def groupby(self) -> xarray.core.groupby.DatasetGroupBy: ): data = data.drop_vars([group.name]) - return data.groupby(group=group, **self.kwargs) + return data.groupby(group=group, **self.kwargs) # type: ignore[arg-type] def map( self, @@ -238,6 +240,9 @@ def sum(self, use_fallback: bool = False, **kwargs: Any) -> LinearExpression: The sum of the groupby object. """ group = self.group + if isinstance(group, (list, tuple)) and len(group) == 1: + # a single-element key groups like the scalar key (xarray parity) + group = group[0] # A string selects an existing coordinate, mirroring xarray's # ``Dataset.groupby("name")``. Resolve it to that coordinate so it # takes the fast path below instead of the slower xarray fallback. diff --git a/test/test_linear_expression.py b/test/test_linear_expression.py index a02bb03d..9d9ffcee 100644 --- a/test/test_linear_expression.py +++ b/test/test_linear_expression.py @@ -1385,17 +1385,11 @@ def test_linear_expression_groupby_on_same_name_as_target_dim( class TestGroupbyByAttachedCoordinate: - """GH #750: group by an attached non-dimension coordinate.""" - - # Such a coordinate, referenced by its name (like xarray's - # ``Dataset.groupby("name")``) or as the coordinate ``DataArray`` itself, - # previously raised ``ValueError: ... already exists`` / ``KeyError``. - # - # Results are asserted against hard-coded ``vars``/``coeffs`` so a - # regression in the grouping is caught directly -- not relative to another - # (possibly equally wrong) computation. The base expression is four - # variables labelled ``0..3`` on dim ``t`` with coefficient ``2.0`` and two - # attached level coords; the 2-D case uses an eight-variable grid (0..7). + """ + GH #750: group by an attached non-dimension coordinate. + + Asserts grouping against hard-coded ``vars``/``coeffs`` to catch regressions. + """ @pytest.fixture def t(self) -> pd.RangeIndex: @@ -1508,6 +1502,28 @@ def test_two_dimensional(self, by: str) -> None: ] assert (grouped.coeffs == 1.0).all() + @pytest.mark.parametrize("use_fallback", [True, False]) + def test_single_element_list_groups_like_scalar( + self, expr: LinearExpression, use_fallback: bool + ) -> None: + # ``groupby(["period"])`` groups like the scalar key, mirroring xarray. + grouped = expr.groupby(["period"]).sum(use_fallback=use_fallback) + + assert grouped.data.period.values.tolist() == [2020, 2030] + assert grouped.vars.transpose("period", TERM_DIM).values.tolist() == [ + [0, 1], + [2, 3], + ] + assert (grouped.coeffs == 2.0).all() + + def test_multi_key_dataarrays_unsupported( + self, expr: LinearExpression, period: xr.DataArray, season: xr.DataArray + ) -> None: + # Multi-key grouping must be spelled with names; a list of DataArrays + # is unhashable and raises in xarray itself, so linopy mirrors that. + with pytest.raises(TypeError, match="unhashable"): + expr.groupby([period, season]).sum() + @pytest.mark.parametrize("use_fallback", [True]) def test_linear_expression_groupby_ndim(z: Variable, use_fallback: bool) -> None: From 91462184e03aea7bf07339a015129b54f89df1c7 Mon Sep 17 00:00:00 2001 From: FBumann <117816358+FBumann@users.noreply.github.com> Date: Thu, 4 Jun 2026 14:09:06 +0200 Subject: [PATCH 4/9] docs: make groupby #750 release note concise and user-facing Co-Authored-By: Claude Opus 4.8 (1M context) --- doc/release_notes.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/release_notes.rst b/doc/release_notes.rst index 12154c3c..0a4c8e66 100644 --- a/doc/release_notes.rst +++ b/doc/release_notes.rst @@ -58,7 +58,7 @@ Most users should keep calling ``model.solve(...)``. If you want more control, y * ``add_variables`` / ``add_constraints``: extends 0.7.0's coords-as-truth rule to ``lower``, ``upper`` and ``mask`` for every bound type and dim order. Pandas ``Series`` / ``DataFrame`` bounds or masks missing a dimension are broadcast to ``coords`` instead of being silently dropped (`#709 `__); the variable's dimension order always follows ``coords`` (`#706 `__); bare-tuple coord entries (``coords=[(0, 1, 2)]``) now behave like lists. Mismatched values or extra dims raise ``ValueError`` with a labelled message; sparse-coord masks (formerly a v0.6.3 ``FutureWarning``, #580) raise ``ValueError``, and masks with dims not in the data raise ``ValueError`` instead of ``AssertionError``. * Pandas inputs whose index names *levels* of a stacked-``MultiIndex`` ``coords`` dimension are now projected onto that dimension: a level subset broadcasts across the others, the full set aligns element-wise. This fixes PyPSA multi-investment arithmetic (e.g. an expression over a ``(period, timestep)`` ``snapshot`` MultiIndex times a ``period``-indexed weighting). In ``add_variables`` / ``add_constraints`` the input must provide a value for every level combination of the MultiIndex or a ``ValueError`` is raised (the error lists the missing combinations). **Implicit level projections are deprecated**: they emit an ``EvolvingAPIWarning`` everywhere — in arithmetic *and* in ``add_variables`` / ``add_constraints`` — and will raise under the upcoming v1 convention. Project the input onto the dimension explicitly (select with the dimension's level values) to keep current behavior. Aligning the full level set with full coverage stays silent. Strict validation also rejects a ``MultiIndex`` input with *unnamed* levels whose combinations don't match ``coords`` (previously a silent bypass, as such inputs can't be projected by level name). -* ``LinearExpression.groupby`` now accepts the *name* of an attached non-dimension coordinate (``expr.groupby("period").sum()``), and grouping by a ``DataArray`` whose name matches an attached coordinate no longer raises ``ValueError: ... already exists``. Both the fast path and the ``use_fallback=True`` path are fixed, a single-element key list (``groupby(["period"])``) groups like the scalar key, and any auxiliary coordinate aligned to the grouped dimension is correctly dropped instead of breaking the reshape (`#750 `__). +* You can now group a ``LinearExpression`` by a coordinate that is not a dimension, e.g. ``expr.groupby("period").sum()``, matching ``xarray.Dataset.groupby`` (`#750 `__). * ``add_piecewise_formulation`` now produces a reproducible dimension order in the broadcast breakpoint array. The previous set-based expansion gave a hash-randomized order that varied between processes. * SOS constraints on masked variables no longer cause solver-specific failures (Gurobi ``IndexError``, Xpress ``?404 Invalid column number``, LP parse errors, silent set corruption). ``Model.solve()`` and ``Model.to_file()`` now raise a clear ``NotImplementedError`` referring users to `#688 `__; pass ``reformulate_sos=True`` as a workaround. * ``Model.solve(..., reformulate_sos=True)`` now actually reformulates SOS constraints even when the solver supports them natively. Previously it was silently ignored with a warning. From ccf36d01885ebe14ea3c5c230449357776edc8b3 Mon Sep 17 00:00:00 2001 From: FBumann <117816358+FBumann@users.noreply.github.com> Date: Thu, 4 Jun 2026 14:23:29 +0200 Subject: [PATCH 5/9] fix(groupby): don't detach a MultiIndex level in the fallback path The GH #750 detach must only drop *free* (non-indexed) coordinates. The earlier change also dropped a MultiIndex level when grouping by it via `use_fallback=True`, leaving the dimension without an index (`('snapshot',) are not coordinates with an index`). Guard the detach with `group.name not in data.xindexes` so MultiIndex levels are left intact. Grouping by a MultiIndex level now works on both paths (the pydata/xarray 6836 case, fixed upstream). Add a parametrized regression test over both levels and both paths. Co-Authored-By: Claude Opus 4.8 (1M context) --- linopy/expressions.py | 10 +++++++--- test/test_linear_expression.py | 24 ++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 3 deletions(-) diff --git a/linopy/expressions.py b/linopy/expressions.py index 4421cd20..0979affd 100644 --- a/linopy/expressions.py +++ b/linopy/expressions.py @@ -172,12 +172,16 @@ def groupby(self) -> xarray.core.groupby.DatasetGroupBy: # Detach a non-dimension coordinate used as the group so xarray does # not try to re-expand it while recombining the groups (GH #750); the - # group values are still supplied through ``group`` itself. + # group values are still supplied through ``group`` itself. Only free + # (non-indexed) coords are detached -- never a MultiIndex level, whose + # removal would leave its dimension without an index. data = self.data if isinstance(group, str) and group in data.coords and group not in data.dims: group = data[group] - if isinstance(group, DataArray) and group.name in set(data.coords) - set( - data.dims + if ( + isinstance(group, DataArray) + and group.name in set(data.coords) - set(data.dims) + and group.name not in data.xindexes ): data = data.drop_vars([group.name]) diff --git a/test/test_linear_expression.py b/test/test_linear_expression.py index 9d9ffcee..e096ec1e 100644 --- a/test/test_linear_expression.py +++ b/test/test_linear_expression.py @@ -1524,6 +1524,30 @@ def test_multi_key_dataarrays_unsupported( with pytest.raises(TypeError, match="unhashable"): expr.groupby([period, season]).sum() + @pytest.mark.parametrize("use_fallback", [True, False]) + @pytest.mark.parametrize( + "level, values, vars_", + [ + ("period", [2020, 2030], [[0, 1, 2], [3, 4, 5]]), + ("timestep", ["t1", "t2", "t3"], [[0, 3], [1, 4], [2, 5]]), + ], + ) + def test_multiindex_level( + self, level: str, values: list, vars_: list, use_fallback: bool + ) -> None: + # Grouping by a level of a real ``MultiIndex`` dimension (the + # pydata/xarray#6836 case, fixed upstream) works through linopy. + m = Model() + mi = pd.MultiIndex.from_product( + [[2020, 2030], ["t1", "t2", "t3"]], names=["period", "timestep"] + ) + x = m.add_variables(coords={"snapshot": mi}, name="x") # labels 0..5 + + grouped = (1 * x).groupby(level).sum(use_fallback=use_fallback) + + assert grouped.data[level].values.tolist() == values + assert grouped.vars.transpose(level, TERM_DIM).values.tolist() == vars_ + @pytest.mark.parametrize("use_fallback", [True]) def test_linear_expression_groupby_ndim(z: Variable, use_fallback: bool) -> None: From 5a9b2e57b56b091fe2c78e286ba74e1855592062 Mon Sep 17 00:00:00 2001 From: FBumann <117816358+FBumann@users.noreply.github.com> Date: Thu, 4 Jun 2026 14:37:05 +0200 Subject: [PATCH 6/9] refactor(groupby): extract _resolve_group helper; address review - Extract the single-element-list unwrap + coordinate-name resolution shared by `sum()` and the `groupby` property into one `_resolve_group` helper, removing the duplication (and the drift between the two that caused the earlier MultiIndex-level regression). - Drop the now-stale `(GH #750)` references from code comments; the link lives in the release notes. - Add a test for grouping by a dimension coordinate name (the fast-path broadening), and note it in the release note. - Simplify `test_multi_key`: a multi-key group always uses the xarray fallback, so drop the redundant `use_fallback` parametrization. Co-Authored-By: Claude Opus 4.8 (1M context) --- doc/release_notes.rst | 2 +- linopy/expressions.py | 50 +++++++++++++++++++--------------- test/test_linear_expression.py | 27 ++++++++++++++---- 3 files changed, 51 insertions(+), 28 deletions(-) diff --git a/doc/release_notes.rst b/doc/release_notes.rst index 0a4c8e66..8fdc39ac 100644 --- a/doc/release_notes.rst +++ b/doc/release_notes.rst @@ -58,7 +58,7 @@ Most users should keep calling ``model.solve(...)``. If you want more control, y * ``add_variables`` / ``add_constraints``: extends 0.7.0's coords-as-truth rule to ``lower``, ``upper`` and ``mask`` for every bound type and dim order. Pandas ``Series`` / ``DataFrame`` bounds or masks missing a dimension are broadcast to ``coords`` instead of being silently dropped (`#709 `__); the variable's dimension order always follows ``coords`` (`#706 `__); bare-tuple coord entries (``coords=[(0, 1, 2)]``) now behave like lists. Mismatched values or extra dims raise ``ValueError`` with a labelled message; sparse-coord masks (formerly a v0.6.3 ``FutureWarning``, #580) raise ``ValueError``, and masks with dims not in the data raise ``ValueError`` instead of ``AssertionError``. * Pandas inputs whose index names *levels* of a stacked-``MultiIndex`` ``coords`` dimension are now projected onto that dimension: a level subset broadcasts across the others, the full set aligns element-wise. This fixes PyPSA multi-investment arithmetic (e.g. an expression over a ``(period, timestep)`` ``snapshot`` MultiIndex times a ``period``-indexed weighting). In ``add_variables`` / ``add_constraints`` the input must provide a value for every level combination of the MultiIndex or a ``ValueError`` is raised (the error lists the missing combinations). **Implicit level projections are deprecated**: they emit an ``EvolvingAPIWarning`` everywhere — in arithmetic *and* in ``add_variables`` / ``add_constraints`` — and will raise under the upcoming v1 convention. Project the input onto the dimension explicitly (select with the dimension's level values) to keep current behavior. Aligning the full level set with full coverage stays silent. Strict validation also rejects a ``MultiIndex`` input with *unnamed* levels whose combinations don't match ``coords`` (previously a silent bypass, as such inputs can't be projected by level name). -* You can now group a ``LinearExpression`` by a coordinate that is not a dimension, e.g. ``expr.groupby("period").sum()``, matching ``xarray.Dataset.groupby`` (`#750 `__). +* You can now group a ``LinearExpression`` by a coordinate name, e.g. ``expr.groupby("period").sum()`` -- including coordinates that are not dimensions and levels of a ``MultiIndex`` -- matching ``xarray.Dataset.groupby`` (`#750 `__). * ``add_piecewise_formulation`` now produces a reproducible dimension order in the broadcast breakpoint array. The previous set-based expansion gave a hash-randomized order that varied between processes. * SOS constraints on masked variables no longer cause solver-specific failures (Gurobi ``IndexError``, Xpress ``?404 Invalid column number``, LP parse errors, silent set corruption). ``Model.solve()`` and ``Model.to_file()`` now raise a clear ``NotImplementedError`` referring users to `#688 `__; pass ``reformulate_sos=True`` as a workaround. * ``Model.solve(..., reformulate_sos=True)`` now actually reformulates SOS constraints even when the solver supports them natively. Previously it was silently ignored with a warning. diff --git a/linopy/expressions.py b/linopy/expressions.py index 0979affd..2121593e 100644 --- a/linopy/expressions.py +++ b/linopy/expressions.py @@ -136,6 +136,22 @@ def _expr_unwrap( logger = logging.getLogger(__name__) +def _resolve_group(group: Any, data: Dataset) -> Any: + """ + Normalize a groupby key. + + Unwrap a single-element key list to the scalar key, and resolve a string + naming a coordinate to that coordinate -- so ``groupby("name")`` behaves + like ``groupby(data["name"])``, mirroring xarray. Other inputs (Series, + DataFrame, DataArray, multi-key lists) are returned unchanged. + """ + if isinstance(group, (list, tuple)) and len(group) == 1: + group = group[0] + if isinstance(group, str) and group in data.coords: + group = data[group] + return group + + @dataclass @forward_as_properties(groupby=["dims", "groups"]) class LinearExpressionGroupby: @@ -158,10 +174,8 @@ def groupby(self) -> xarray.core.groupby.DatasetGroupBy: xarray.core.groupby.DataArrayGroupBy The groupby object. """ - group = self.group - if isinstance(group, (list, tuple)) and len(group) == 1: - # a single-element key groups like the scalar key (xarray parity) - group = group[0] + data = self.data + group = _resolve_group(self.group, data) if isinstance(group, pd.DataFrame): raise ValueError( @@ -171,13 +185,11 @@ def groupby(self) -> xarray.core.groupby.DatasetGroupBy: group = DataArray(group, name=group.name or "group") # Detach a non-dimension coordinate used as the group so xarray does - # not try to re-expand it while recombining the groups (GH #750); the - # group values are still supplied through ``group`` itself. Only free - # (non-indexed) coords are detached -- never a MultiIndex level, whose - # removal would leave its dimension without an index. - data = self.data - if isinstance(group, str) and group in data.coords and group not in data.dims: - group = data[group] + # not try to re-expand it while recombining the groups; the group + # values are still supplied through ``group`` itself. Only free + # (non-indexed) coords are detached -- never a MultiIndex level (whose + # removal would leave its dimension without an index) and never the + # dimension's own coordinate. if ( isinstance(group, DataArray) and group.name in set(data.coords) - set(data.dims) @@ -185,7 +197,7 @@ def groupby(self) -> xarray.core.groupby.DatasetGroupBy: ): data = data.drop_vars([group.name]) - return data.groupby(group=group, **self.kwargs) # type: ignore[arg-type] + return data.groupby(group=group, **self.kwargs) def map( self, @@ -243,15 +255,9 @@ def sum(self, use_fallback: bool = False, **kwargs: Any) -> LinearExpression: LinearExpression The sum of the groupby object. """ - group = self.group - if isinstance(group, (list, tuple)) and len(group) == 1: - # a single-element key groups like the scalar key (xarray parity) - group = group[0] - # A string selects an existing coordinate, mirroring xarray's - # ``Dataset.groupby("name")``. Resolve it to that coordinate so it - # takes the fast path below instead of the slower xarray fallback. - if isinstance(group, str) and group in self.data.coords: - group = self.data[group] + # Resolving a coordinate name to its coordinate lets ``groupby("name")`` + # take the fast path below instead of the slower xarray fallback. + group = _resolve_group(self.group, self.data) non_fallback_types = (pd.Series, pd.DataFrame, xr.DataArray) if isinstance(group, non_fallback_types) and not use_fallback: @@ -285,7 +291,7 @@ def sum(self, use_fallback: bool = False, **kwargs: Any) -> LinearExpression: # auxiliary (non-dimension) coords such as the one being grouped by. # Drop them all before reshaping, otherwise they clash with the # regrouped dimension or with the final rename onto - # ``final_group_name`` (GH #750). + # ``final_group_name``. names_to_drop = [ name for name, coord in self.data.coords.items() diff --git a/test/test_linear_expression.py b/test/test_linear_expression.py index e096ec1e..3e3bff74 100644 --- a/test/test_linear_expression.py +++ b/test/test_linear_expression.py @@ -1436,14 +1436,13 @@ def test_single_key( [2.0, 2.0], ] - @pytest.mark.parametrize("use_fallback", [True, False]) @pytest.mark.parametrize("spelling", [list, tuple], ids=["list", "tuple"]) - def test_multi_key( - self, expr: LinearExpression, spelling: type, use_fallback: bool - ) -> None: + def test_multi_key(self, expr: LinearExpression, spelling: type) -> None: + # A multi-key group always goes through the xarray fallback (a list is + # not a fast-path type), so there is no separate use_fallback case. group = spelling(["period", "season"]) - grouped = expr.groupby(group).sum(use_fallback=use_fallback) + grouped = expr.groupby(group).sum() assert dict(grouped.sizes) == {"period": 2, "season": 2, TERM_DIM: 1} assert grouped.data.period.values.tolist() == [2020, 2030] @@ -1502,6 +1501,24 @@ def test_two_dimensional(self, by: str) -> None: ] assert (grouped.coeffs == 1.0).all() + @pytest.mark.parametrize("use_fallback", [True, False]) + def test_dimension_coordinate_by_name(self, use_fallback: bool) -> None: + # A dimension coordinate may also be grouped by name; it collapses that + # dimension and keeps the other one. + m = Model() + snapshot = pd.RangeIndex(4, name="snapshot") + gen = pd.Index(["g1", "g2"], name="gen") + y = m.add_variables(coords=[snapshot, gen], name="y") # labels 0..7 + + grouped = (1 * y).groupby("gen").sum(use_fallback=use_fallback) + + assert grouped.data.gen.values.tolist() == ["g1", "g2"] + assert grouped.sizes["snapshot"] == 4 + assert grouped.vars.transpose("gen", "snapshot", TERM_DIM).values.tolist() == [ + [[0], [2], [4], [6]], + [[1], [3], [5], [7]], + ] + @pytest.mark.parametrize("use_fallback", [True, False]) def test_single_element_list_groups_like_scalar( self, expr: LinearExpression, use_fallback: bool From e506d85be6de6495b607ea5bb0397365a77786f3 Mon Sep 17 00:00:00 2001 From: FBumann <117816358+FBumann@users.noreply.github.com> Date: Thu, 4 Jun 2026 14:44:33 +0200 Subject: [PATCH 7/9] docs: scope groupby #750 note to non-dimension coordinates Grouping by a dimension coordinate or a MultiIndex level by name already worked; only the non-dimension (free) coordinate case was broken. Correct the release note, which had overclaimed. Co-Authored-By: Claude Opus 4.8 (1M context) --- doc/release_notes.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/release_notes.rst b/doc/release_notes.rst index 8fdc39ac..61335735 100644 --- a/doc/release_notes.rst +++ b/doc/release_notes.rst @@ -58,7 +58,7 @@ Most users should keep calling ``model.solve(...)``. If you want more control, y * ``add_variables`` / ``add_constraints``: extends 0.7.0's coords-as-truth rule to ``lower``, ``upper`` and ``mask`` for every bound type and dim order. Pandas ``Series`` / ``DataFrame`` bounds or masks missing a dimension are broadcast to ``coords`` instead of being silently dropped (`#709 `__); the variable's dimension order always follows ``coords`` (`#706 `__); bare-tuple coord entries (``coords=[(0, 1, 2)]``) now behave like lists. Mismatched values or extra dims raise ``ValueError`` with a labelled message; sparse-coord masks (formerly a v0.6.3 ``FutureWarning``, #580) raise ``ValueError``, and masks with dims not in the data raise ``ValueError`` instead of ``AssertionError``. * Pandas inputs whose index names *levels* of a stacked-``MultiIndex`` ``coords`` dimension are now projected onto that dimension: a level subset broadcasts across the others, the full set aligns element-wise. This fixes PyPSA multi-investment arithmetic (e.g. an expression over a ``(period, timestep)`` ``snapshot`` MultiIndex times a ``period``-indexed weighting). In ``add_variables`` / ``add_constraints`` the input must provide a value for every level combination of the MultiIndex or a ``ValueError`` is raised (the error lists the missing combinations). **Implicit level projections are deprecated**: they emit an ``EvolvingAPIWarning`` everywhere — in arithmetic *and* in ``add_variables`` / ``add_constraints`` — and will raise under the upcoming v1 convention. Project the input onto the dimension explicitly (select with the dimension's level values) to keep current behavior. Aligning the full level set with full coverage stays silent. Strict validation also rejects a ``MultiIndex`` input with *unnamed* levels whose combinations don't match ``coords`` (previously a silent bypass, as such inputs can't be projected by level name). -* You can now group a ``LinearExpression`` by a coordinate name, e.g. ``expr.groupby("period").sum()`` -- including coordinates that are not dimensions and levels of a ``MultiIndex`` -- matching ``xarray.Dataset.groupby`` (`#750 `__). +* ``LinearExpression.groupby`` now accepts a **non-dimension** coordinate as the key -- by name (``expr.groupby("period").sum()``, where ``period`` labels another dimension) or as the coordinate ``DataArray`` -- which previously raised ``ValueError: ... already exists``. Grouping by a dimension or a ``MultiIndex`` level already worked (`#750 `__). * ``add_piecewise_formulation`` now produces a reproducible dimension order in the broadcast breakpoint array. The previous set-based expansion gave a hash-randomized order that varied between processes. * SOS constraints on masked variables no longer cause solver-specific failures (Gurobi ``IndexError``, Xpress ``?404 Invalid column number``, LP parse errors, silent set corruption). ``Model.solve()`` and ``Model.to_file()`` now raise a clear ``NotImplementedError`` referring users to `#688 `__; pass ``reformulate_sos=True`` as a workaround. * ``Model.solve(..., reformulate_sos=True)`` now actually reformulates SOS constraints even when the solver supports them natively. Previously it was silently ignored with a warning. From ae23a3ca5e0956a960dc2e4844abdc24316db9d7 Mon Sep 17 00:00:00 2001 From: FBumann <117816358+FBumann@users.noreply.github.com> Date: Thu, 4 Jun 2026 14:51:47 +0200 Subject: [PATCH 8/9] style(groupby): trim inline comments to one-liners Drop the verbose comment blocks; the rationale lives in the _resolve_group docstring and the regression tests enforce the invariants (e.g. test_multiindex_level guards the xindexes detach guard). Co-Authored-By: Claude Opus 4.8 (1M context) --- linopy/expressions.py | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/linopy/expressions.py b/linopy/expressions.py index 2121593e..07c2217f 100644 --- a/linopy/expressions.py +++ b/linopy/expressions.py @@ -184,12 +184,7 @@ def groupby(self) -> xarray.core.groupby.DatasetGroupBy: if isinstance(group, pd.Series): group = DataArray(group, name=group.name or "group") - # Detach a non-dimension coordinate used as the group so xarray does - # not try to re-expand it while recombining the groups; the group - # values are still supplied through ``group`` itself. Only free - # (non-indexed) coords are detached -- never a MultiIndex level (whose - # removal would leave its dimension without an index) and never the - # dimension's own coordinate. + # detach an attached free coordinate (never an indexed/level coord) if ( isinstance(group, DataArray) and group.name in set(data.coords) - set(data.dims) @@ -255,8 +250,6 @@ def sum(self, use_fallback: bool = False, **kwargs: Any) -> LinearExpression: LinearExpression The sum of the groupby object. """ - # Resolving a coordinate name to its coordinate lets ``groupby("name")`` - # take the fast path below instead of the slower xarray fallback. group = _resolve_group(self.group, self.data) non_fallback_types = (pd.Series, pd.DataFrame, xr.DataArray) @@ -286,12 +279,7 @@ def sum(self, use_fallback: bool = False, **kwargs: Any) -> LinearExpression: arrays = [group, group.groupby(group).cumcount()] idx = pd.MultiIndex.from_arrays(arrays, names=[GROUP_DIM, GROUPED_TERM_DIM]) new_coords = Coordinates.from_pandas_multiindex(idx, group_dim) - # Collapsing ``group_dim`` into groups invalidates every coordinate - # aligned to it: the dimension index, any MultiIndex levels, and - # auxiliary (non-dimension) coords such as the one being grouped by. - # Drop them all before reshaping, otherwise they clash with the - # regrouped dimension or with the final rename onto - # ``final_group_name``. + # collapsing group_dim invalidates every coordinate aligned to it names_to_drop = [ name for name, coord in self.data.coords.items() From 3cc774ffd5cc1ff9aae6a1f71fc4fdb37414fb93 Mon Sep 17 00:00:00 2001 From: FBumann <117816358+FBumann@users.noreply.github.com> Date: Thu, 4 Jun 2026 16:31:33 +0200 Subject: [PATCH 9/9] feat(groupby): fast path for multi-key groupby([names]) `groupby(["a","b"]).sum()` previously dropped to the slow xarray fallback. Resolve a list of coordinate names (1-D, same dim) to a value frame so it rides the existing reindex fast path, then unstack the stacked result back into one dimension per key -- byte-identical to the fallback, sparse fill cells included. The DataFrame grouper is untouched and stays compact (stacked MultiIndex over observed combinations only), so this is non-breaking. One dimension per key is a dense cartesian grid, so a sparse key crossing materialises mostly-fill cells. Warn (pointing at the DataFrame grouper) when the grid is much larger than the observed combinations; the check reads the collapsed MultiIndex levels, so it is O(observed) and fires before unstack allocates. See #753; sparse-representation follow-ups tracked against #740. Co-Authored-By: Claude Opus 4.8 (1M context) --- linopy/expressions.py | 33 +++++++++++++++ test/test_linear_expression.py | 77 +++++++++++++++++++++++++++++++++- 2 files changed, 109 insertions(+), 1 deletion(-) diff --git a/linopy/expressions.py b/linopy/expressions.py index 07c2217f..b15e2156 100644 --- a/linopy/expressions.py +++ b/linopy/expressions.py @@ -252,6 +252,20 @@ def sum(self, use_fallback: bool = False, **kwargs: Any) -> LinearExpression: """ group = _resolve_group(self.group, self.data) + # a list of coord names rides the fast path, then unstacks to one dim per key + unstack_multikey = False + if ( + not use_fallback + and isinstance(group, (list, tuple)) + and len(group) > 1 + and all(isinstance(g, str) and g in self.data.coords for g in group) + ): + coord_dims = {self.data[g].dims for g in group} + if len(coord_dims) == 1 and len(next(iter(coord_dims))) == 1: + names = list(group) + group = self.data[names].to_dataframe()[names] + unstack_multikey = True + non_fallback_types = (pd.Series, pd.DataFrame, xr.DataArray) if isinstance(group, non_fallback_types) and not use_fallback: if isinstance(group, pd.DataFrame): @@ -297,6 +311,25 @@ def sum(self, use_fallback: bool = False, **kwargs: Any) -> LinearExpression: ds = ds.assign_coords(new_coords) ds = ds.rename({GROUP_DIM: final_group_name}) + if unstack_multikey: + # warn before allocating the grid when most cells would be fill + mi = ds.indexes[final_group_name].remove_unused_levels() + observed = len(mi) + grid = int(np.prod([len(level) for level in mi.levels])) + if grid > 2 * observed and grid - observed > 10_000: + warn( + f"Grouping a LinearExpression by {names} produces a dense " + f"{grid:,}-cell grid, but only {observed:,} of those " + f"combinations occur -- the {grid - observed:,} absent ones " + f"are materialised as fill values. Group by a `pd.DataFrame` " + f"of these keys instead to keep the result compact over only " + f"the observed combinations.", + UserWarning, + stacklevel=2, + ) + ds = ds.unstack( + final_group_name, fill_value=LinearExpression._fill_value + ) return LinearExpression(ds, self.model) def func(ds: Dataset) -> Dataset: diff --git a/test/test_linear_expression.py b/test/test_linear_expression.py index 3e3bff74..a8f184fa 100644 --- a/test/test_linear_expression.py +++ b/test/test_linear_expression.py @@ -7,6 +7,7 @@ from __future__ import annotations +import warnings from typing import Any import numpy as np @@ -1384,9 +1385,83 @@ def test_linear_expression_groupby_on_same_name_as_target_dim( assert grouped.nterm == 10 +class TestMultiKeyFastPath: + """ + Group a LinearExpression by a list of coordinate names: takes the fast + reindex path and returns one dimension per key, like the xarray fallback. + """ + + @staticmethod + def _expr(period_vals: list, season_vals: list) -> LinearExpression: + n = len(period_vals) + s = pd.RangeIndex(n, name="s") + m = Model() + x = m.add_variables(coords=[s], name="x") + return (1.0 * x).assign_coords( + period=xr.DataArray(period_vals, dims="s", coords={"s": s}, name="period"), + season=xr.DataArray(season_vals, dims="s", coords={"s": s}, name="season"), + ) + + @pytest.mark.parametrize("spelling", [list, tuple], ids=["list", "tuple"]) + def test_matches_fallback(self, spelling: type) -> None: + # the fast path must equal the slow fallback, sparse cells included + expr = self._expr([2020, 2020, 2030, 2030, 2030], list("wswws")) + group = spelling(["period", "season"]) + + fast = expr.groupby(group).sum() + slow = expr.groupby(group).sum(use_fallback=True) + + assert_linequal(fast, slow) + + def test_separate_dims_not_stacked(self) -> None: + # built via a stacked index internally, but returns one dim per key + expr = self._expr([2020, 2020, 2030, 2030], list("wsws")) + + grouped = expr.groupby(["period", "season"]).sum() + + assert {"period", "season"} <= set(grouped.dims) + assert "group" not in grouped.dims + assert not isinstance(grouped.data.indexes.get("period"), pd.MultiIndex) + + def test_sparse_combination_filled(self) -> None: + # (2020, "s") never occurs -> empty term in the grid + expr = self._expr([2020, 2020, 2030, 2030], list("wwws")) + + grouped = expr.groupby(["period", "season"]).sum() + + cell = grouped.sel(period=2020, season="s") + assert (cell.vars == -1).all() + assert cell.coeffs.isnull().all() + + def test_dataframe_grouper_stays_compact(self) -> None: + # the DataFrame grouper keeps the stacked observed-only group dim + expr = self._expr([2020, 2020, 2030, 2030], list("wwws")) + df = expr.data[["period", "season"]].to_dataframe()[["period", "season"]] + + grouped = expr.groupby(df).sum() + + assert "group" in grouped.dims + assert isinstance(grouped.data.indexes["group"], pd.MultiIndex) + assert grouped.sizes["group"] == 3 # observed, not the 2x2=4 grid + + def test_blowup_warns_when_sparse(self) -> None: + # 200 observed combos, 200x200 grid -> nudge toward the DataFrame grouper + expr = self._expr(list(range(200)), list(range(200))) + + with pytest.warns(UserWarning, match="dense .* grid"): + expr.groupby(["period", "season"]).sum() + + def test_no_warning_when_dense(self) -> None: + expr = self._expr([2020, 2020, 2030, 2030], list("wsws")) + + with warnings.catch_warnings(): + warnings.simplefilter("error") + expr.groupby(["period", "season"]).sum() + + class TestGroupbyByAttachedCoordinate: """ - GH #750: group by an attached non-dimension coordinate. + Group by an attached non-dimension coordinate. Asserts grouping against hard-coded ``vars``/``coeffs`` to catch regressions. """