REGR: Expand ValueError catching in series aggregate

Closes pandas-dev#31802. This "fixes" pandas-dev#31802 by expanding the number of cases where we swallow an exception in libreduction. Currently, we're creating an invalid Series in SeriesBinGrouper, where the `.mgr_locs` doesn't match the values. See pandas-dev#31802 (comment) for more. For now, we simply catch more cases and fall back to the pure-Python path. I've gone with a minimal change which addresses only issues hitting this exact exception. We might want to go broader, but it's not clear that we should.
TomAugspurger · Mar 9, 2020 · 5007426 · 5007426
1 parent 787dc8a
commit 5007426
Show file tree

Hide file tree

Showing 3 changed files with 36 additions and 1 deletion.
diff --git a/doc/source/whatsnew/v1.0.2.rst b/doc/source/whatsnew/v1.0.2.rst
@@ -20,6 +20,7 @@ Fixed regressions
 - Fixed regression in :meth:`pandas.core.groupby.RollingGroupby.apply` where the ``raw`` parameter was ignored (:issue:`31754`)
 - Fixed regression in :meth:`rolling(..).corr() <pandas.core.window.Rolling.corr>` when using a time offset (:issue:`31789`)
 - Fixed regression in :meth:`DataFrameGroupBy.nunique` which was modifying the original values if ``NaN`` values were present (:issue:`31950`)
+- Fixed regression in ``DataFrame.groupby`` raising a ``ValueError`` from an internal operation (:issue:`31802`)
 - Fixed regression where :func:`read_pickle` raised a ``UnicodeDecodeError`` when reading a py27 pickle with :class:`MultiIndex` column (:issue:`31988`).
 - Fixed regression in :class:`DataFrame` arithmetic operations with mis-matched columns (:issue:`31623`)
 - Fixed regression in :meth:`GroupBy.agg` calling a user-provided function an extra time on an empty input (:issue:`31760`)

diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
@@ -639,9 +639,16 @@ def agg_series(self, obj: Series, func):
         try:
             return self._aggregate_series_fast(obj, func)
         except ValueError as err:
-            if "Function does not reduce" in str(err):
+            msg = str(err)
+            if "Function does not reduce" in msg:
                 # raised in libreduction
                 pass
+            elif "Wrong number of items" in msg:
+                # https://github.com/pandas-dev/pandas/issues/31802
+                # libreduction.SeriesGrouper can create invalid Series /
+                # Blocks, which might raise arbitrary exceptions when
+                # operated upon.
+                pass
             else:
                 raise
         return self._aggregate_series_pure_python(obj, func)

diff --git a/pandas/tests/groupby/test_bin_groupby.py b/pandas/tests/groupby/test_bin_groupby.py
@@ -5,6 +5,7 @@
 
 from pandas.core.dtypes.common import ensure_int64
 
+import pandas as pd
 from pandas import Index, Series, isna
 import pandas._testing as tm
 
@@ -51,6 +52,32 @@ def test_series_bin_grouper():
     tm.assert_almost_equal(counts, exp_counts)
 
 
+def assert_block_lengths(x):
+    assert len(x) == len(x._data.blocks[0].mgr_locs)
+    return 0
+
+
+def cumsum_max(x):
+    x.cumsum().max()  # triggers the ValueError when creating a block
+    return 0
+
+
+@pytest.mark.parametrize(
+    "func",
+    [
+        cumsum_max,
+        pytest.param(assert_block_lengths, marks=pytest.mark.xfail(reason="debatable")),
+    ],
+)
+def test_operation_on_invalid_block_passes(func):
+    # https://github.com/pandas-dev/pandas/issues/31802
+    # SeriesBinGrouper creates an invalid block, which may
+    # raise arbitrary exceptions.
+    df = pd.DataFrame({"A": ["a", "a", "a"], "B": ["a", "b", "b"], "C": [1, 1, 1]})
+    result = df.groupby(["A", "B"]).agg(func)
+    assert isinstance(result, pd.DataFrame)
+
+
 @pytest.mark.parametrize(
     "binner,closed,expected",
     [