From e1e6314a946b9058ffeb02f7bb904792ddecf04a Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 11 Sep 2018 14:12:38 -0500 Subject: [PATCH] BUG: EA-backed boolean indexers Closes https://github.com/pandas-dev/pandas/issues/22665 Closes https://github.com/pandas-dev/pandas/issues/22326 --- doc/source/whatsnew/v0.24.0.txt | 3 +++ pandas/core/common.py | 9 ++++++--- pandas/core/dtypes/common.py | 12 ++++++++++++ pandas/tests/arrays/categorical/test_indexing.py | 12 +++++++++++- 4 files changed, 32 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 3660c1e843f6cf..070f1c2614c443 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -484,6 +484,7 @@ ExtensionType Changes - ``ExtensionArray`` has gained the abstract methods ``.dropna()`` (:issue:`21185`) - ``ExtensionDtype`` has gained the ability to instantiate from string dtypes, e.g. ``decimal`` would instantiate a registered ``DecimalDtype``; furthermore the ``ExtensionDtype`` has gained the method ``construct_array_type`` (:issue:`21185`) +- An ``ExtensionArray`` with a boolean dtype now works correctly as a boolean indexer. :meth:`pandas.api.types.is_bool_dtype` now properly considers them boolean (:issue:`22326`) - Added ``ExtensionDtype._is_numeric`` for controlling whether an extension dtype is considered numeric (:issue:`22290`). - The ``ExtensionArray`` constructor, ``_from_sequence`` now take the keyword arg ``copy=False`` (:issue:`21185`) - Bug in :meth:`Series.get` for ``Series`` using ``ExtensionArray`` and integer index (:issue:`21257`) @@ -491,6 +492,7 @@ ExtensionType Changes - :meth:`Series.combine()` works correctly with :class:`~pandas.api.extensions.ExtensionArray` inside of :class:`Series` (:issue:`20825`) - :meth:`Series.combine()` with scalar argument now works for any function type (:issue:`21248`) - :meth:`Series.astype` and :meth:`DataFrame.astype` now dispatch to :meth:`ExtensionArray.astype` (:issue:`21185:`). +- Added :meth:`pandas.api.types.register_extension_dtype` to register an extension type with pandas (:issue:`22664`) .. _whatsnew_0240.api.incompatibilities: @@ -608,6 +610,7 @@ Categorical ^^^^^^^^^^^ - Bug in :meth:`Categorical.from_codes` where ``NaN`` values in ``codes`` were silently converted to ``0`` (:issue:`21767`). In the future this will raise a ``ValueError``. Also changes the behavior of ``.from_codes([1.1, 2.0])``. +- Bug when indexing with a boolean-valued ``Categorical``. Now categoricals are treated as a boolean mask (:issue:`22665`) Datetimelike ^^^^^^^^^^^^ diff --git a/pandas/core/common.py b/pandas/core/common.py index a3fba762509f15..5d52724fd96192 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -15,7 +15,9 @@ from pandas import compat from pandas.compat import iteritems, PY36, OrderedDict from pandas.core.dtypes.generic import ABCSeries, ABCIndex, ABCIndexClass -from pandas.core.dtypes.common import is_integer +from pandas.core.dtypes.common import ( + is_integer, is_bool_dtype, is_extension_array_dtype, is_array_like +) from pandas.core.dtypes.inference import _iterable_not_string from pandas.core.dtypes.missing import isna, isnull, notnull # noqa from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike @@ -100,7 +102,8 @@ def maybe_box_datetimelike(value): def is_bool_indexer(key): - if isinstance(key, (ABCSeries, np.ndarray, ABCIndex)): + if (isinstance(key, (ABCSeries, np.ndarray, ABCIndex)) or + (is_array_like(key) and is_extension_array_dtype(key.dtype))): if key.dtype == np.object_: key = np.asarray(values_from_object(key)) @@ -110,7 +113,7 @@ def is_bool_indexer(key): 'NA / NaN values') return False return True - elif key.dtype == np.bool_: + elif is_bool_dtype(key.dtype): return True elif isinstance(key, list): try: diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index b8cbb41501dd19..da5130d99e8c77 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -1608,6 +1608,8 @@ def is_bool_dtype(arr_or_dtype): False >>> is_bool_dtype(np.array([True, False])) True + >>> is_bool_dtype(pd.Categorical([True, False])) + True """ if arr_or_dtype is None: @@ -1618,6 +1620,13 @@ def is_bool_dtype(arr_or_dtype): # this isn't even a dtype return False + if isinstance(arr_or_dtype, (ABCCategorical, ABCCategoricalIndex)): + arr_or_dtype = arr_or_dtype.dtype + + if isinstance(arr_or_dtype, CategoricalDtype): + arr_or_dtype = arr_or_dtype.categories + # now we use the special definition for Index + if isinstance(arr_or_dtype, ABCIndexClass): # TODO(jreback) @@ -1626,6 +1635,9 @@ def is_bool_dtype(arr_or_dtype): # guess this return (arr_or_dtype.is_object and arr_or_dtype.inferred_type == 'boolean') + elif is_extension_array_dtype(arr_or_dtype): + dtype = getattr(arr_or_dtype, 'dtype', arr_or_dtype) + return issubclass(dtype.type, np.bool_) return issubclass(tipo, np.bool_) diff --git a/pandas/tests/arrays/categorical/test_indexing.py b/pandas/tests/arrays/categorical/test_indexing.py index b54ac2835bee3b..5044b522a96f1c 100644 --- a/pandas/tests/arrays/categorical/test_indexing.py +++ b/pandas/tests/arrays/categorical/test_indexing.py @@ -5,7 +5,8 @@ import numpy as np import pandas.util.testing as tm -from pandas import Categorical, Index, CategoricalIndex, PeriodIndex +from pandas import Categorical, Index, CategoricalIndex, PeriodIndex, Series +from pandas.core.common import is_bool_indexer from pandas.tests.arrays.categorical.common import TestCategorical @@ -121,3 +122,12 @@ def test_get_indexer_non_unique(self, idx_values, key_values, key_class): tm.assert_numpy_array_equal(expected, result) tm.assert_numpy_array_equal(exp_miss, res_miss) + + +def test_mask_with_boolean(): + s = Series(range(3)) + idx = CategoricalIndex([True, False, True]) + assert is_bool_indexer(idx) + result = s[idx] + expected = s[idx.astype('object')] + tm.assert_series_equal(result, expected)