Skip to content

Commit

Permalink
DEPR: dropping nuisance columns in DataFrame reductions (pandas-dev#4…
Browse files Browse the repository at this point in the history
  • Loading branch information
jbrockmendel authored and TLouf committed Jun 1, 2021
1 parent 65f4aec commit 993934e
Show file tree
Hide file tree
Showing 14 changed files with 199 additions and 48 deletions.
3 changes: 3 additions & 0 deletions doc/source/whatsnew/v1.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -381,6 +381,7 @@ this pathological behavior (:issue:`37827`):
*New behavior*:

.. ipython:: python
:okwarning:
df.mean()
Expand All @@ -394,6 +395,7 @@ instead of casting to a NumPy array which may have different semantics (:issue:`
:issue:`28949`, :issue:`21020`).

.. ipython:: python
:okwarning:
ser = pd.Series([0, 1], dtype="category", name="A")
df = ser.to_frame()
Expand All @@ -411,6 +413,7 @@ instead of casting to a NumPy array which may have different semantics (:issue:`
*New behavior*:

.. ipython:: python
:okwarning:
df.any()
Expand Down
41 changes: 41 additions & 0 deletions doc/source/whatsnew/v1.3.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -679,6 +679,47 @@ Deprecations
- Deprecated passing arguments (apart from ``value``) as positional in :meth:`DataFrame.fillna` and :meth:`Series.fillna` (:issue:`41485`)
- Deprecated construction of :class:`Series` or :class:`DataFrame` with ``DatetimeTZDtype`` data and ``datetime64[ns]`` dtype. Use ``Series(data).dt.tz_localize(None)`` instead (:issue:`41555`, :issue:`33401`)

.. _whatsnew_130.deprecations.nuisance_columns:

Deprecated Dropping Nuisance Columns in DataFrame Reductions and DataFrameGroupBy Operations
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
When calling a reduction (.min, .max, .sum, ...) on a :class:`DataFrame` with
``numeric_only=None`` (the default), columns on which the reduction raises ``TypeError``
are silently ignored and dropped from the result.

This behavior is deprecated. In a future version, the ``TypeError`` will be raised,
and users will need to select only valid columns before calling the function.

For example:

.. ipython:: python
df = pd.DataFrame({"A": [1, 2, 3, 4], "B": pd.date_range("2016-01-01", periods=4)})
df
*Old behavior*:

.. code-block:: ipython
In [3]: df.prod()
Out[3]:
A 24
dtype: int64
*Future behavior*:

.. code-block:: ipython
In [4]: df.prod()
...
TypeError: 'DatetimeArray' does not implement reduction 'prod'
In [5]: df[["A"]].prod()
Out[5]:
A 24
dtype: int64
.. ---------------------------------------------------------------------------
Expand Down
28 changes: 28 additions & 0 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -9854,6 +9854,21 @@ def _get_data() -> DataFrame:
# Even if we are object dtype, follow numpy and return
# float64, see test_apply_funcs_over_empty
out = out.astype(np.float64)

if numeric_only is None and out.shape[0] != df.shape[1]:
# columns have been dropped GH#41480
arg_name = "numeric_only"
if name in ["all", "any"]:
arg_name = "bool_only"
warnings.warn(
"Dropping of nuisance columns in DataFrame reductions "
f"(with '{arg_name}=None') is deprecated; in a future "
"version this will raise TypeError. Select only valid "
"columns before calling the reduction.",
FutureWarning,
stacklevel=5,
)

return out

assert numeric_only is None
Expand All @@ -9874,6 +9889,19 @@ def _get_data() -> DataFrame:
with np.errstate(all="ignore"):
result = func(values)

# columns have been dropped GH#41480
arg_name = "numeric_only"
if name in ["all", "any"]:
arg_name = "bool_only"
warnings.warn(
"Dropping of nuisance columns in DataFrame reductions "
f"(with '{arg_name}=None') is deprecated; in a future "
"version this will raise TypeError. Select only valid "
"columns before calling the reduction.",
FutureWarning,
stacklevel=5,
)

if hasattr(result, "dtype"):
if filter_type == "bool" and notna(result).all():
result = result.astype(np.bool_)
Expand Down
10 changes: 7 additions & 3 deletions pandas/tests/apply/test_frame_apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -1209,7 +1209,10 @@ def test_nuiscance_columns():
)
tm.assert_frame_equal(result, expected)

result = df.agg("sum")
with tm.assert_produces_warning(
FutureWarning, match="Select only valid", check_stacklevel=False
):
result = df.agg("sum")
expected = Series([6, 6.0, "foobarbaz"], index=["A", "B", "C"])
tm.assert_series_equal(result, expected)

Expand Down Expand Up @@ -1426,8 +1429,9 @@ def test_apply_datetime_tz_issue():
@pytest.mark.parametrize("method", ["min", "max", "sum"])
def test_consistency_of_aggregates_of_columns_with_missing_values(df, method):
# GH 16832
none_in_first_column_result = getattr(df[["A", "B"]], method)()
none_in_second_column_result = getattr(df[["B", "A"]], method)()
with tm.assert_produces_warning(FutureWarning, match="Select only valid"):
none_in_first_column_result = getattr(df[["A", "B"]], method)()
none_in_second_column_result = getattr(df[["B", "A"]], method)()

tm.assert_series_equal(none_in_first_column_result, none_in_second_column_result)

Expand Down
1 change: 1 addition & 0 deletions pandas/tests/apply/test_invalid_arg.py
Original file line number Diff line number Diff line change
Expand Up @@ -342,6 +342,7 @@ def test_transform_wont_agg_series(string_series, func):
@pytest.mark.parametrize(
"op_wrapper", [lambda x: x, lambda x: [x], lambda x: {"A": x}, lambda x: {"A": [x]}]
)
@pytest.mark.filterwarnings("ignore:.*Select only valid:FutureWarning")
def test_transform_reducer_raises(all_reductions, frame_or_series, op_wrapper):
# GH 35964
op = op_wrapper(all_reductions)
Expand Down
3 changes: 2 additions & 1 deletion pandas/tests/frame/methods/test_quantile.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,8 @@ def test_quantile(self, datetime_frame):
# non-numeric exclusion
df = DataFrame({"col1": ["A", "A", "B", "B"], "col2": [1, 2, 3, 4]})
rs = df.quantile(0.5)
xp = df.median().rename(0.5)
with tm.assert_produces_warning(FutureWarning, match="Select only valid"):
xp = df.median().rename(0.5)
tm.assert_series_equal(rs, xp)

# axis
Expand Down
1 change: 1 addition & 0 deletions pandas/tests/frame/methods/test_rank.py
Original file line number Diff line number Diff line change
Expand Up @@ -248,6 +248,7 @@ def test_rank_methods_frame(self):

@td.skip_array_manager_not_yet_implemented
@pytest.mark.parametrize("dtype", ["O", "f8", "i8"])
@pytest.mark.filterwarnings("ignore:.*Select only valid:FutureWarning")
def test_rank_descending(self, method, dtype):

if "i" in dtype:
Expand Down
1 change: 1 addition & 0 deletions pandas/tests/frame/test_arithmetic.py
Original file line number Diff line number Diff line change
Expand Up @@ -1021,6 +1021,7 @@ def test_zero_len_frame_with_series_corner_cases():
tm.assert_frame_equal(result, expected)


@pytest.mark.filterwarnings("ignore:.*Select only valid:FutureWarning")
def test_frame_single_columns_object_sum_axis_1():
# GH 13758
data = {
Expand Down

0 comments on commit 993934e

Please sign in to comment.