REF: Decouple Series.apply from Series.agg (pandas-dev#53400)

WillAyd · Jun 5, 2023 · d9c3777 · d9c3777
1 parent 7c6b54f
commit d9c3777
Show file tree

Hide file tree

Showing 4 changed files with 227 additions and 94 deletions.
diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst
@@ -101,6 +101,7 @@ Other enhancements
 - :meth:`DataFrame.unstack` gained the ``sort`` keyword to dictate whether the resulting :class:`MultiIndex` levels are sorted (:issue:`15105`)
 - :meth:`SeriesGroupby.agg` and :meth:`DataFrameGroupby.agg` now support passing in multiple functions for ``engine="numba"`` (:issue:`53486`)
 - Added ``engine_kwargs`` parameter to :meth:`DataFrame.to_excel` (:issue:`53220`)
+- Added a new parameter ``by_row`` to :meth:`Series.apply`. When set to ``False``, the supplied callables will always operate on the whole Series (:issue:`53400`).
 - Many read/to_* functions, such as :meth:`DataFrame.to_pickle` and :func:`read_csv`, support forwarding compression arguments to lzma.LZMAFile (:issue:`52979`)
 - Performance improvement in :func:`concat` with homogeneous ``np.float64`` or ``np.float32`` dtypes (:issue:`52685`)
 - Performance improvement in :meth:`DataFrame.filter` when ``items`` is given (:issue:`52941`)

diff --git a/pandas/core/apply.py b/pandas/core/apply.py
@@ -16,6 +16,7 @@
     Iterable,
     Iterator,
     List,
+    Literal,
     Sequence,
     cast,
 )
@@ -288,6 +289,11 @@ def agg_list_like(self) -> DataFrame | Series:
         -------
         Result of aggregation.
         """
+        return self.agg_or_apply_list_like(op_name="agg")
+
+    def agg_or_apply_list_like(
+        self, op_name: Literal["agg", "apply"]
+    ) -> DataFrame | Series:
         from pandas.core.groupby.generic import (
             DataFrameGroupBy,
             SeriesGroupBy,
@@ -296,6 +302,9 @@ def agg_list_like(self) -> DataFrame | Series:
 
         obj = self.obj
         func = cast(List[AggFuncTypeBase], self.func)
+        kwargs = self.kwargs
+        if op_name == "apply":
+            kwargs = {**kwargs, "by_row": False}
 
         if getattr(obj, "axis", 0) == 1:
             raise NotImplementedError("axis other than 0 is not supported")
@@ -313,8 +322,6 @@ def agg_list_like(self) -> DataFrame | Series:
         keys = []
 
         is_groupby = isinstance(obj, (DataFrameGroupBy, SeriesGroupBy))
-        is_ser_or_df = isinstance(obj, (ABCDataFrame, ABCSeries))
-        this_args = [self.axis, *self.args] if is_ser_or_df else self.args
 
         context_manager: ContextManager
         if is_groupby:
@@ -323,12 +330,19 @@ def agg_list_like(self) -> DataFrame | Series:
             context_manager = com.temp_setattr(obj, "as_index", True)
         else:
             context_manager = nullcontext()
+
+        def include_axis(colg) -> bool:
+            return isinstance(colg, ABCDataFrame) or (
+                isinstance(colg, ABCSeries) and op_name == "agg"
+            )
+
         with context_manager:
             # degenerate case
             if selected_obj.ndim == 1:
                 for a in func:
                     colg = obj._gotitem(selected_obj.name, ndim=1, subset=selected_obj)
-                    new_res = colg.aggregate(a, *this_args, **self.kwargs)
+                    args = [self.axis, *self.args] if include_axis(colg) else self.args
+                    new_res = getattr(colg, op_name)(a, *args, **kwargs)
                     results.append(new_res)
 
                     # make sure we find a good name
@@ -339,7 +353,8 @@ def agg_list_like(self) -> DataFrame | Series:
                 indices = []
                 for index, col in enumerate(selected_obj):
                     colg = obj._gotitem(col, ndim=1, subset=selected_obj.iloc[:, index])
-                    new_res = colg.aggregate(func, *this_args, **self.kwargs)
+                    args = [self.axis, *self.args] if include_axis(colg) else self.args
+                    new_res = getattr(colg, op_name)(func, *args, **kwargs)
                     results.append(new_res)
                     indices.append(index)
                 keys = selected_obj.columns.take(indices)
@@ -366,15 +381,23 @@ def agg_dict_like(self) -> DataFrame | Series:
         -------
         Result of aggregation.
         """
+        return self.agg_or_apply_dict_like(op_name="agg")
+
+    def agg_or_apply_dict_like(
+        self, op_name: Literal["agg", "apply"]
+    ) -> DataFrame | Series:
         from pandas import Index
         from pandas.core.groupby.generic import (
             DataFrameGroupBy,
             SeriesGroupBy,
         )
         from pandas.core.reshape.concat import concat
 
+        assert op_name in ["agg", "apply"]
+
         obj = self.obj
         func = cast(AggFuncTypeDict, self.func)
+        kwargs = {"by_row": False} if op_name == "apply" else {}
 
         if getattr(obj, "axis", 0) == 1:
             raise NotImplementedError("axis other than 0 is not supported")
@@ -387,7 +410,7 @@ def agg_dict_like(self) -> DataFrame | Series:
             selected_obj = obj._selected_obj
             selection = obj._selection
 
-        func = self.normalize_dictlike_arg("agg", selected_obj, func)
+        func = self.normalize_dictlike_arg(op_name, selected_obj, func)
 
         is_groupby = isinstance(obj, (DataFrameGroupBy, SeriesGroupBy))
         context_manager: ContextManager
@@ -404,17 +427,18 @@ def agg_dict_like(self) -> DataFrame | Series:
         )
 
         # Numba Groupby engine/engine-kwargs passthrough
-        kwargs = {}
         if is_groupby:
             engine = self.kwargs.get("engine", None)
             engine_kwargs = self.kwargs.get("engine_kwargs", None)
-            kwargs = {"engine": engine, "engine_kwargs": engine_kwargs}
+            kwargs.update({"engine": engine, "engine_kwargs": engine_kwargs})
 
         with context_manager:
             if selected_obj.ndim == 1:
                 # key only used for output
                 colg = obj._gotitem(selection, ndim=1)
-                result_data = [colg.agg(how, **kwargs) for _, how in func.items()]
+                result_data = [
+                    getattr(colg, op_name)(how, **kwargs) for _, how in func.items()
+                ]
                 result_index = list(func.keys())
             elif is_non_unique_col:
                 # key used for column selection and output
@@ -429,7 +453,9 @@ def agg_dict_like(self) -> DataFrame | Series:
                         label_to_indices[label].append(index)
 
                     key_data = [
-                        selected_obj._ixs(indice, axis=1).agg(how, **kwargs)
+                        getattr(selected_obj._ixs(indice, axis=1), op_name)(
+                            how, **kwargs
+                        )
                         for label, indices in label_to_indices.items()
                         for indice in indices
                     ]
@@ -439,7 +465,7 @@ def agg_dict_like(self) -> DataFrame | Series:
             else:
                 # key used for column selection and output
                 result_data = [
-                    obj._gotitem(key, ndim=1).agg(how, **kwargs)
+                    getattr(obj._gotitem(key, ndim=1), op_name)(how, **kwargs)
                     for key, how in func.items()
                 ]
                 result_index = list(func.keys())
@@ -535,7 +561,7 @@ def apply_str(self) -> DataFrame | Series:
                     self.kwargs["axis"] = self.axis
         return self._apply_str(obj, func, *self.args, **self.kwargs)
 
-    def apply_multiple(self) -> DataFrame | Series:
+    def apply_list_or_dict_like(self) -> DataFrame | Series:
         """
         Compute apply in case of a list-like or dict-like.
 
@@ -551,9 +577,9 @@ def apply_multiple(self) -> DataFrame | Series:
         kwargs = self.kwargs
 
         if is_dict_like(func):
-            result = self.agg_dict_like()
+            result = self.agg_or_apply_dict_like(op_name="apply")
         else:
-            result = self.agg_list_like()
+            result = self.agg_or_apply_list_like(op_name="apply")
 
         result = reconstruct_and_relabel_result(result, func, **kwargs)
 
@@ -692,9 +718,9 @@ def values(self):
 
     def apply(self) -> DataFrame | Series:
         """compute the results"""
-        # dispatch to agg
+        # dispatch to handle list-like or dict-like
         if is_list_like(self.func):
-            return self.apply_multiple()
+            return self.apply_list_or_dict_like()
 
         # all empty
         if len(self.columns) == 0 and len(self.index) == 0:
@@ -1041,13 +1067,15 @@ def infer_to_same_shape(self, results: ResType, res_index: Index) -> DataFrame:
 class SeriesApply(NDFrameApply):
     obj: Series
     axis: AxisInt = 0
+    by_row: bool  # only relevant for apply()
 
     def __init__(
         self,
         obj: Series,
         func: AggFuncType,
         *,
         convert_dtype: bool | lib.NoDefault = lib.no_default,
+        by_row: bool = True,
         args,
         kwargs,
     ) -> None:
@@ -1062,6 +1090,7 @@ def __init__(
                 stacklevel=find_stack_level(),
             )
         self.convert_dtype = convert_dtype
+        self.by_row = by_row
 
         super().__init__(
             obj,
@@ -1078,9 +1107,9 @@ def apply(self) -> DataFrame | Series:
         if len(obj) == 0:
             return self.apply_empty_result()
 
-        # dispatch to agg
+        # dispatch to handle list-like or dict-like
         if is_list_like(self.func):
-            return self.apply_multiple()
+            return self.apply_list_or_dict_like()
 
         if isinstance(self.func, str):
             # if we are a string, try to dispatch
@@ -1126,6 +1155,8 @@ def apply_standard(self) -> DataFrame | Series:
         if isinstance(func, np.ufunc):
             with np.errstate(all="ignore"):
                 return func(obj, *self.args, **self.kwargs)
+        elif not self.by_row:
+            return func(obj, *self.args, **self.kwargs)
 
         if self.args or self.kwargs:
             # _map_values does not support args/kwargs

diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -4496,6 +4496,8 @@ def apply(
         func: AggFuncType,
         convert_dtype: bool | lib.NoDefault = lib.no_default,
         args: tuple[Any, ...] = (),
+        *,
+        by_row: bool = True,
         **kwargs,
     ) -> DataFrame | Series:
         """
@@ -4523,6 +4525,12 @@ def apply(
                 instead if you want ``convert_dtype=False``.
         args : tuple
             Positional arguments passed to func after the series value.
+        by_row : bool, default True
+            If False, the func will be passed the whole Series at once.
+            If True, the func will be passed each element of the Series, like
+            Series.map (backward compatible).
+
+            .. versionadded:: 2.1.0
         **kwargs
             Additional keyword arguments passed to func.
 
@@ -4611,7 +4619,12 @@ def apply(
         dtype: float64
         """
         return SeriesApply(
-            self, func, convert_dtype=convert_dtype, args=args, kwargs=kwargs
+            self,
+            func,
+            convert_dtype=convert_dtype,
+            by_row=by_row,
+            args=args,
+            kwargs=kwargs,
         ).apply()
 
     def _reindex_indexer(