Skip to content

Commit

Permalink
REF: Decouple Series.apply from Series.agg (pandas-dev#53400)
Browse files Browse the repository at this point in the history
  • Loading branch information
topper-123 committed Jun 5, 2023
1 parent 7c6b54f commit d9c3777
Show file tree
Hide file tree
Showing 4 changed files with 227 additions and 94 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,7 @@ Other enhancements
- :meth:`DataFrame.unstack` gained the ``sort`` keyword to dictate whether the resulting :class:`MultiIndex` levels are sorted (:issue:`15105`)
- :meth:`SeriesGroupby.agg` and :meth:`DataFrameGroupby.agg` now support passing in multiple functions for ``engine="numba"`` (:issue:`53486`)
- Added ``engine_kwargs`` parameter to :meth:`DataFrame.to_excel` (:issue:`53220`)
- Added a new parameter ``by_row`` to :meth:`Series.apply`. When set to ``False``, the supplied callables will always operate on the whole Series (:issue:`53400`).
- Many read/to_* functions, such as :meth:`DataFrame.to_pickle` and :func:`read_csv`, support forwarding compression arguments to lzma.LZMAFile (:issue:`52979`)
- Performance improvement in :func:`concat` with homogeneous ``np.float64`` or ``np.float32`` dtypes (:issue:`52685`)
- Performance improvement in :meth:`DataFrame.filter` when ``items`` is given (:issue:`52941`)
Expand Down
65 changes: 48 additions & 17 deletions pandas/core/apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
Iterable,
Iterator,
List,
Literal,
Sequence,
cast,
)
Expand Down Expand Up @@ -288,6 +289,11 @@ def agg_list_like(self) -> DataFrame | Series:
-------
Result of aggregation.
"""
return self.agg_or_apply_list_like(op_name="agg")

def agg_or_apply_list_like(
self, op_name: Literal["agg", "apply"]
) -> DataFrame | Series:
from pandas.core.groupby.generic import (
DataFrameGroupBy,
SeriesGroupBy,
Expand All @@ -296,6 +302,9 @@ def agg_list_like(self) -> DataFrame | Series:

obj = self.obj
func = cast(List[AggFuncTypeBase], self.func)
kwargs = self.kwargs
if op_name == "apply":
kwargs = {**kwargs, "by_row": False}

if getattr(obj, "axis", 0) == 1:
raise NotImplementedError("axis other than 0 is not supported")
Expand All @@ -313,8 +322,6 @@ def agg_list_like(self) -> DataFrame | Series:
keys = []

is_groupby = isinstance(obj, (DataFrameGroupBy, SeriesGroupBy))
is_ser_or_df = isinstance(obj, (ABCDataFrame, ABCSeries))
this_args = [self.axis, *self.args] if is_ser_or_df else self.args

context_manager: ContextManager
if is_groupby:
Expand All @@ -323,12 +330,19 @@ def agg_list_like(self) -> DataFrame | Series:
context_manager = com.temp_setattr(obj, "as_index", True)
else:
context_manager = nullcontext()

def include_axis(colg) -> bool:
return isinstance(colg, ABCDataFrame) or (
isinstance(colg, ABCSeries) and op_name == "agg"
)

with context_manager:
# degenerate case
if selected_obj.ndim == 1:
for a in func:
colg = obj._gotitem(selected_obj.name, ndim=1, subset=selected_obj)
new_res = colg.aggregate(a, *this_args, **self.kwargs)
args = [self.axis, *self.args] if include_axis(colg) else self.args
new_res = getattr(colg, op_name)(a, *args, **kwargs)
results.append(new_res)

# make sure we find a good name
Expand All @@ -339,7 +353,8 @@ def agg_list_like(self) -> DataFrame | Series:
indices = []
for index, col in enumerate(selected_obj):
colg = obj._gotitem(col, ndim=1, subset=selected_obj.iloc[:, index])
new_res = colg.aggregate(func, *this_args, **self.kwargs)
args = [self.axis, *self.args] if include_axis(colg) else self.args
new_res = getattr(colg, op_name)(func, *args, **kwargs)
results.append(new_res)
indices.append(index)
keys = selected_obj.columns.take(indices)
Expand All @@ -366,15 +381,23 @@ def agg_dict_like(self) -> DataFrame | Series:
-------
Result of aggregation.
"""
return self.agg_or_apply_dict_like(op_name="agg")

def agg_or_apply_dict_like(
self, op_name: Literal["agg", "apply"]
) -> DataFrame | Series:
from pandas import Index
from pandas.core.groupby.generic import (
DataFrameGroupBy,
SeriesGroupBy,
)
from pandas.core.reshape.concat import concat

assert op_name in ["agg", "apply"]

obj = self.obj
func = cast(AggFuncTypeDict, self.func)
kwargs = {"by_row": False} if op_name == "apply" else {}

if getattr(obj, "axis", 0) == 1:
raise NotImplementedError("axis other than 0 is not supported")
Expand All @@ -387,7 +410,7 @@ def agg_dict_like(self) -> DataFrame | Series:
selected_obj = obj._selected_obj
selection = obj._selection

func = self.normalize_dictlike_arg("agg", selected_obj, func)
func = self.normalize_dictlike_arg(op_name, selected_obj, func)

is_groupby = isinstance(obj, (DataFrameGroupBy, SeriesGroupBy))
context_manager: ContextManager
Expand All @@ -404,17 +427,18 @@ def agg_dict_like(self) -> DataFrame | Series:
)

# Numba Groupby engine/engine-kwargs passthrough
kwargs = {}
if is_groupby:
engine = self.kwargs.get("engine", None)
engine_kwargs = self.kwargs.get("engine_kwargs", None)
kwargs = {"engine": engine, "engine_kwargs": engine_kwargs}
kwargs.update({"engine": engine, "engine_kwargs": engine_kwargs})

with context_manager:
if selected_obj.ndim == 1:
# key only used for output
colg = obj._gotitem(selection, ndim=1)
result_data = [colg.agg(how, **kwargs) for _, how in func.items()]
result_data = [
getattr(colg, op_name)(how, **kwargs) for _, how in func.items()
]
result_index = list(func.keys())
elif is_non_unique_col:
# key used for column selection and output
Expand All @@ -429,7 +453,9 @@ def agg_dict_like(self) -> DataFrame | Series:
label_to_indices[label].append(index)

key_data = [
selected_obj._ixs(indice, axis=1).agg(how, **kwargs)
getattr(selected_obj._ixs(indice, axis=1), op_name)(
how, **kwargs
)
for label, indices in label_to_indices.items()
for indice in indices
]
Expand All @@ -439,7 +465,7 @@ def agg_dict_like(self) -> DataFrame | Series:
else:
# key used for column selection and output
result_data = [
obj._gotitem(key, ndim=1).agg(how, **kwargs)
getattr(obj._gotitem(key, ndim=1), op_name)(how, **kwargs)
for key, how in func.items()
]
result_index = list(func.keys())
Expand Down Expand Up @@ -535,7 +561,7 @@ def apply_str(self) -> DataFrame | Series:
self.kwargs["axis"] = self.axis
return self._apply_str(obj, func, *self.args, **self.kwargs)

def apply_multiple(self) -> DataFrame | Series:
def apply_list_or_dict_like(self) -> DataFrame | Series:
"""
Compute apply in case of a list-like or dict-like.
Expand All @@ -551,9 +577,9 @@ def apply_multiple(self) -> DataFrame | Series:
kwargs = self.kwargs

if is_dict_like(func):
result = self.agg_dict_like()
result = self.agg_or_apply_dict_like(op_name="apply")
else:
result = self.agg_list_like()
result = self.agg_or_apply_list_like(op_name="apply")

result = reconstruct_and_relabel_result(result, func, **kwargs)

Expand Down Expand Up @@ -692,9 +718,9 @@ def values(self):

def apply(self) -> DataFrame | Series:
"""compute the results"""
# dispatch to agg
# dispatch to handle list-like or dict-like
if is_list_like(self.func):
return self.apply_multiple()
return self.apply_list_or_dict_like()

# all empty
if len(self.columns) == 0 and len(self.index) == 0:
Expand Down Expand Up @@ -1041,13 +1067,15 @@ def infer_to_same_shape(self, results: ResType, res_index: Index) -> DataFrame:
class SeriesApply(NDFrameApply):
obj: Series
axis: AxisInt = 0
by_row: bool # only relevant for apply()

def __init__(
self,
obj: Series,
func: AggFuncType,
*,
convert_dtype: bool | lib.NoDefault = lib.no_default,
by_row: bool = True,
args,
kwargs,
) -> None:
Expand All @@ -1062,6 +1090,7 @@ def __init__(
stacklevel=find_stack_level(),
)
self.convert_dtype = convert_dtype
self.by_row = by_row

super().__init__(
obj,
Expand All @@ -1078,9 +1107,9 @@ def apply(self) -> DataFrame | Series:
if len(obj) == 0:
return self.apply_empty_result()

# dispatch to agg
# dispatch to handle list-like or dict-like
if is_list_like(self.func):
return self.apply_multiple()
return self.apply_list_or_dict_like()

if isinstance(self.func, str):
# if we are a string, try to dispatch
Expand Down Expand Up @@ -1126,6 +1155,8 @@ def apply_standard(self) -> DataFrame | Series:
if isinstance(func, np.ufunc):
with np.errstate(all="ignore"):
return func(obj, *self.args, **self.kwargs)
elif not self.by_row:
return func(obj, *self.args, **self.kwargs)

if self.args or self.kwargs:
# _map_values does not support args/kwargs
Expand Down
15 changes: 14 additions & 1 deletion pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -4496,6 +4496,8 @@ def apply(
func: AggFuncType,
convert_dtype: bool | lib.NoDefault = lib.no_default,
args: tuple[Any, ...] = (),
*,
by_row: bool = True,
**kwargs,
) -> DataFrame | Series:
"""
Expand Down Expand Up @@ -4523,6 +4525,12 @@ def apply(
instead if you want ``convert_dtype=False``.
args : tuple
Positional arguments passed to func after the series value.
by_row : bool, default True
If False, the func will be passed the whole Series at once.
If True, the func will be passed each element of the Series, like
Series.map (backward compatible).
.. versionadded:: 2.1.0
**kwargs
Additional keyword arguments passed to func.
Expand Down Expand Up @@ -4611,7 +4619,12 @@ def apply(
dtype: float64
"""
return SeriesApply(
self, func, convert_dtype=convert_dtype, args=args, kwargs=kwargs
self,
func,
convert_dtype=convert_dtype,
by_row=by_row,
args=args,
kwargs=kwargs,
).apply()

def _reindex_indexer(
Expand Down
Loading

0 comments on commit d9c3777

Please sign in to comment.