DEPR: dropping nuisance columns in DataFrame reductions (pandas-dev#41480)
TLouf · Jun 1, 2021 · 993934e · 993934e
1 parent 65f4aec
commit 993934e
Show file tree

Hide file tree

Showing 14 changed files with 199 additions and 48 deletions.
diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
@@ -381,6 +381,7 @@ this pathological behavior (:issue:`37827`):
 *New behavior*:
 
 .. ipython:: python
+   :okwarning:
 
     df.mean()
 
@@ -394,6 +395,7 @@ instead of casting to a NumPy array which may have different semantics (:issue:`
 :issue:`28949`, :issue:`21020`).
 
 .. ipython:: python
+   :okwarning:
 
     ser = pd.Series([0, 1], dtype="category", name="A")
     df = ser.to_frame()
@@ -411,6 +413,7 @@ instead of casting to a NumPy array which may have different semantics (:issue:`
 *New behavior*:
 
 .. ipython:: python
+   :okwarning:
 
     df.any()
 

diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
@@ -679,6 +679,47 @@ Deprecations
 - Deprecated passing arguments (apart from ``value``) as positional in :meth:`DataFrame.fillna` and :meth:`Series.fillna` (:issue:`41485`)
 - Deprecated construction of :class:`Series` or :class:`DataFrame` with ``DatetimeTZDtype`` data and ``datetime64[ns]`` dtype.  Use ``Series(data).dt.tz_localize(None)`` instead (:issue:`41555`,:issue:`33401`)
 
+.. _whatsnew_130.deprecations.nuisance_columns:
+
+Deprecated Dropping Nuisance Columns in DataFrame Reductions and DataFrameGroupBy Operations
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+When calling a reduction (.min, .max, .sum, ...) on a :class:`DataFrame` with
+``numeric_only=None`` (the default), columns on which the reduction raises ``TypeError``
+are silently ignored and dropped from the result.
+
+This behavior is deprecated. In a future version, the ``TypeError`` will be raised,
+and users will need to select only valid columns before calling the function.
+
+For example:
+
+.. ipython:: python
+
+   df = pd.DataFrame({"A": [1, 2, 3, 4], "B": pd.date_range("2016-01-01", periods=4)})
+   df
+
+*Old behavior*:
+
+.. code-block:: ipython
+
+    In [3]: df.prod()
+    Out[3]:
+    Out[3]:
+    A    24
+    dtype: int64
+
+*Future behavior*:
+
+.. code-block:: ipython
+
+    In [4]: df.prod()
+    ...
+    TypeError: 'DatetimeArray' does not implement reduction 'prod'
+
+    In [5]: df[["A"]].prod()
+    Out[5]:
+    A    24
+    dtype: int64
+
 .. ---------------------------------------------------------------------------
 
 

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -9854,6 +9854,21 @@ def _get_data() -> DataFrame:
                 # Even if we are object dtype, follow numpy and return
                 #  float64, see test_apply_funcs_over_empty
                 out = out.astype(np.float64)
+
+            if numeric_only is None and out.shape[0] != df.shape[1]:
+                # columns have been dropped GH#41480
+                arg_name = "numeric_only"
+                if name in ["all", "any"]:
+                    arg_name = "bool_only"
+                warnings.warn(
+                    "Dropping of nuisance columns in DataFrame reductions "
+                    f"(with '{arg_name}=None') is deprecated; in a future "
+                    "version this will raise TypeError.  Select only valid "
+                    "columns before calling the reduction.",
+                    FutureWarning,
+                    stacklevel=5,
+                )
+
             return out
 
         assert numeric_only is None
@@ -9874,6 +9889,19 @@ def _get_data() -> DataFrame:
             with np.errstate(all="ignore"):
                 result = func(values)
 
+            # columns have been dropped GH#41480
+            arg_name = "numeric_only"
+            if name in ["all", "any"]:
+                arg_name = "bool_only"
+            warnings.warn(
+                "Dropping of nuisance columns in DataFrame reductions "
+                f"(with '{arg_name}=None') is deprecated; in a future "
+                "version this will raise TypeError.  Select only valid "
+                "columns before calling the reduction.",
+                FutureWarning,
+                stacklevel=5,
+            )
+
         if hasattr(result, "dtype"):
             if filter_type == "bool" and notna(result).all():
                 result = result.astype(np.bool_)

diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py
@@ -1209,7 +1209,10 @@ def test_nuiscance_columns():
     )
     tm.assert_frame_equal(result, expected)
 
-    result = df.agg("sum")
+    with tm.assert_produces_warning(
+        FutureWarning, match="Select only valid", check_stacklevel=False
+    ):
+        result = df.agg("sum")
     expected = Series([6, 6.0, "foobarbaz"], index=["A", "B", "C"])
     tm.assert_series_equal(result, expected)
 
@@ -1426,8 +1429,9 @@ def test_apply_datetime_tz_issue():
 @pytest.mark.parametrize("method", ["min", "max", "sum"])
 def test_consistency_of_aggregates_of_columns_with_missing_values(df, method):
     # GH 16832
-    none_in_first_column_result = getattr(df[["A", "B"]], method)()
-    none_in_second_column_result = getattr(df[["B", "A"]], method)()
+    with tm.assert_produces_warning(FutureWarning, match="Select only valid"):
+        none_in_first_column_result = getattr(df[["A", "B"]], method)()
+        none_in_second_column_result = getattr(df[["B", "A"]], method)()
 
     tm.assert_series_equal(none_in_first_column_result, none_in_second_column_result)
 

diff --git a/pandas/tests/apply/test_invalid_arg.py b/pandas/tests/apply/test_invalid_arg.py
@@ -342,6 +342,7 @@ def test_transform_wont_agg_series(string_series, func):
 @pytest.mark.parametrize(
     "op_wrapper", [lambda x: x, lambda x: [x], lambda x: {"A": x}, lambda x: {"A": [x]}]
 )
+@pytest.mark.filterwarnings("ignore:.*Select only valid:FutureWarning")
 def test_transform_reducer_raises(all_reductions, frame_or_series, op_wrapper):
     # GH 35964
     op = op_wrapper(all_reductions)

diff --git a/pandas/tests/frame/methods/test_quantile.py b/pandas/tests/frame/methods/test_quantile.py
@@ -56,7 +56,8 @@ def test_quantile(self, datetime_frame):
         # non-numeric exclusion
         df = DataFrame({"col1": ["A", "A", "B", "B"], "col2": [1, 2, 3, 4]})
         rs = df.quantile(0.5)
-        xp = df.median().rename(0.5)
+        with tm.assert_produces_warning(FutureWarning, match="Select only valid"):
+            xp = df.median().rename(0.5)
         tm.assert_series_equal(rs, xp)
 
         # axis

diff --git a/pandas/tests/frame/methods/test_rank.py b/pandas/tests/frame/methods/test_rank.py
@@ -248,6 +248,7 @@ def test_rank_methods_frame(self):
 
     @td.skip_array_manager_not_yet_implemented
     @pytest.mark.parametrize("dtype", ["O", "f8", "i8"])
+    @pytest.mark.filterwarnings("ignore:.*Select only valid:FutureWarning")
     def test_rank_descending(self, method, dtype):
 
         if "i" in dtype:

diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py
@@ -1021,6 +1021,7 @@ def test_zero_len_frame_with_series_corner_cases():
     tm.assert_frame_equal(result, expected)
 
 
+@pytest.mark.filterwarnings("ignore:.*Select only valid:FutureWarning")
 def test_frame_single_columns_object_sum_axis_1():
     # GH 13758
     data = {