Skip to content

Commit

Permalink
BUG: Handle fill_value in Categorical.take
Browse files Browse the repository at this point in the history
  • Loading branch information
TomAugspurger committed Oct 23, 2018
1 parent 104ccfd commit 8ec7746
Show file tree
Hide file tree
Showing 4 changed files with 113 additions and 14 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.24.0.txt
Expand Up @@ -973,6 +973,7 @@ Categorical
- Bug in :meth:`Categorical.sort_values` where ``NaN`` values were always positioned in front regardless of ``na_position`` value. (:issue:`22556`).
- Bug when indexing with a boolean-valued ``Categorical``. Now a boolean-valued ``Categorical`` is treated as a boolean mask (:issue:`22665`)
- Constructing a :class:`CategoricalIndex` with empty values and boolean categories was raising a ``ValueError`` after a change to dtype coercion (:issue:`22702`).
- Bug in :meth:`Categorical.take` with a user-provided ``fill_value`` not encoding the ``fill_value``, which could result in a ``ValueError``, incorrect results, or a segmentation fault (:issue:`23296`).

Datetimelike
^^^^^^^^^^^^
Expand Down
75 changes: 61 additions & 14 deletions pandas/core/arrays/categorical.py
Expand Up @@ -1019,15 +1019,7 @@ def add_categories(self, new_categories, inplace=False):
set_categories
"""
inplace = validate_bool_kwarg(inplace, 'inplace')
if not is_list_like(new_categories):
new_categories = [new_categories]
already_included = set(new_categories) & set(self.dtype.categories)
if len(already_included) != 0:
msg = ("new categories must not include old categories: "
"{already_included!s}")
raise ValueError(msg.format(already_included=already_included))
new_categories = list(self.dtype.categories) + list(new_categories)
new_dtype = CategoricalDtype(new_categories, self.ordered)
new_dtype = self.dtype._add_categories(new_categories)

cat = self if inplace else self.copy()
cat._dtype = new_dtype
Expand Down Expand Up @@ -1768,8 +1760,10 @@ def take_nd(self, indexer, allow_fill=None, fill_value=None):
Parameters
----------
indexer : sequence of integers
allow_fill : bool, default None.
indexer : sequence of int
The indices in `self` to take. The meaning of negative values in
`indexer` depends on the value of `allow_fill`.
allow_fill : bool, default None
How to handle negative values in `indexer`.
* False: negative values in `indices` indicate positional indices
Expand All @@ -1786,26 +1780,79 @@ def take_nd(self, indexer, allow_fill=None, fill_value=None):
default is ``True``. In the future, this will change to
``False``.
fill_value : object
The value to use for `indices` that are missing (-1), when
``allow_fill=True``. This should be the category, i.e. a value
in ``self.categories``, not a code.
Specifying a `fill_value` that's not in ``self.categories`` is
allowed. The new category is added to the end of the existing
categories.
Returns
-------
Categorical
This Categorical will have the same categories and ordered as
`self`.
See Also
--------
Series.take : Similar method for Series.
numpy.ndarray.take : Similar method for NumPy arrays.
Examples
--------
>>> cat = pd.Categorical(['a', 'a', 'b'])
>>> cat
[a, a, b]
Categories (2, object): [a, b]
Specify ``allow_fill==False`` to have negative indices mean indexing
from the right.
>>> cat.take([0, -1, -2], allow_fill=False)
[a, b, a]
Categories (2, object): [a, b]
With ``allow_fill=True``, indices equal to ``-1`` mean "missing"
values that should be filled with the `fill_value`, which is
``np.nan`` by default.
>>> cat.take([0, -1, -1], allow_fill=True)
[a, NaN, NaN]
Categories (2, object): [a, b]
The fill value can be specified. Notice that if the `fill_value` was
not previously present in ``self.categories``, it is added to the end
of the categories in the output Categorical.
>>> cat.take([0, -1, -1], allow_fill=True, fill_value='c')
[a, c, c]
Categories (3, object): [a, b, c]
"""
indexer = np.asarray(indexer, dtype=np.intp)
if allow_fill is None:
if (indexer < 0).any():
warn(_take_msg, FutureWarning, stacklevel=2)
allow_fill = True

dtype = self.dtype

if isna(fill_value):
# For categorical, any NA value is considered a user-facing
# NA value. Our storage NA value is -1.
fill_value = -1
elif allow_fill and fill_value is not None:
# convert user-provided `fill_value` to codes
if fill_value in self.categories:
fill_value = self.categories.get_loc(fill_value)
else:
dtype = self.dtype._add_categories(fill_value)
fill_value = dtype.categories.get_loc(fill_value)

codes = take(self._codes, indexer, allow_fill=allow_fill,
fill_value=fill_value)
result = self._constructor(codes, dtype=self.dtype, fastpath=True)
result = type(self).from_codes(codes,
categories=dtype.categories,
ordered=dtype.ordered)
return result

take = take_nd
Expand Down
17 changes: 17 additions & 0 deletions pandas/core/dtypes/dtypes.py
Expand Up @@ -469,6 +469,23 @@ def _is_boolean(self):

return is_bool_dtype(self.categories)

def _add_categories(self, new_categories):
"""
Return a new CategoricalDtype with new categories added at the end.
"""
from pandas.core.dtypes.common import is_list_like

if not is_list_like(new_categories):
new_categories = [new_categories]
already_included = set(new_categories) & set(self.categories)
if len(already_included) != 0:
msg = ("new categories must not include old categories: "
"{already_included!s}")
raise ValueError(msg.format(already_included=already_included))
new_categories = list(self.categories) + list(new_categories)
return CategoricalDtype(new_categories, self.ordered)


class DatetimeTZDtype(PandasExtensionDtype):

Expand Down
34 changes: 34 additions & 0 deletions pandas/tests/arrays/categorical/test_algos.py
Expand Up @@ -111,3 +111,37 @@ def test_positional_take_unobserved(self, ordered):
expected = pd.Categorical(['b', 'a'], categories=cat.categories,
ordered=ordered)
tm.assert_categorical_equal(result, expected)

def test_take_allow_fill(self):
cat = pd.Categorical(['a', 'a', 'b'])
result = cat.take([0, -1, -1], allow_fill=True)
expected = pd.Categorical(['a', np.nan, np.nan],
categories=['a', 'b'])
tm.assert_categorical_equal(result, expected)

def test_take_fill_with_negative_one(self):
# -1 was a category
cat = pd.Categorical([-1, 0, 1])
result = cat.take([0, -1, 1], allow_fill=True, fill_value=-1)
expected = pd.Categorical([-1, -1, 0], categories=[-1, 0, 1])
tm.assert_categorical_equal(result, expected)

# -1 was not a category
cat = pd.Categorical([0, 1])
result = cat.take([0, -1, 1], allow_fill=True, fill_value=-1)
expected = pd.Categorical([0, -1, 1], categories=[0, 1, -1])
tm.assert_categorical_equal(result, expected)

def test_take_fill_value(self):
# https://github.com/pandas-dev/pandas/issues/23296
cat = pd.Categorical(['a', 'b', 'c'])
result = cat.take([0, 1, -1], fill_value='a', allow_fill=True)
expected = pd.Categorical(['a', 'b', 'a'], categories=['a', 'b', 'c'])
tm.assert_categorical_equal(result, expected)

def test_take_fill_value_adds_categories(self):
# https://github.com/pandas-dev/pandas/issues/23296
cat = pd.Categorical(['a', 'b', 'c'])
result = cat.take([0, 1, -1], fill_value='d', allow_fill=True)
expected = pd.Categorical(['a', 'b', 'd'], categories=['a', 'b', 'c', 'd'])
tm.assert_categorical_equal(result, expected)

0 comments on commit 8ec7746

Please sign in to comment.