Skip to content

Commit

Permalink
PERF: add method Categorical.__contains__ (pandas-dev#21508)
Browse files Browse the repository at this point in the history
  • Loading branch information
topper-123 authored and victor committed Sep 30, 2018
1 parent 4d3f504 commit cc51575
Show file tree
Hide file tree
Showing 5 changed files with 88 additions and 29 deletions.
10 changes: 7 additions & 3 deletions asv_bench/benchmarks/categoricals.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,7 +202,11 @@ class Contains(object):
def setup(self):
N = 10**5
self.ci = tm.makeCategoricalIndex(N)
self.cat = self.ci.categories[0]
self.c = self.ci.values
self.key = self.ci.categories[0]

def time_contains(self):
self.cat in self.ci
def time_categorical_index_contains(self):
self.key in self.ci

def time_categorical_contains(self):
self.key in self.c
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v0.23.2.txt
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ Performance Improvements

- Improved performance of membership checks in :class:`CategoricalIndex`
(i.e. ``x in ci``-style checks are much faster). :meth:`CategoricalIndex.contains`
is likewise much faster (:issue:`21369`)
is likewise much faster (:issue:`21369`, :issue:`21508`)
- Improved performance of :meth:`MultiIndex.is_unique` (:issue:`21522`)
-

Expand Down
59 changes: 59 additions & 0 deletions pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,57 @@ def _maybe_to_categorical(array):
return array


def contains(cat, key, container):
    """
    Helper for membership check for ``key`` in ``cat``.

    This is a helper method for :meth:`Categorical.__contains__`
    and :meth:`CategoricalIndex.__contains__`.

    Returns True if ``key`` is in ``cat.categories`` and the
    location of ``key`` in ``categories`` is in ``container``.

    Parameters
    ----------
    cat : :class:`Categorical` or :class:`CategoricalIndex`
    key : a hashable object
        The key to check membership for.
    container : Container (e.g. list-like or mapping)
        The container to check for membership in.

    Returns
    -------
    is_in : bool
        True if ``key`` is in ``cat.categories`` and location of
        ``key`` in ``categories`` is in ``container``, else False.

    Raises
    ------
    TypeError
        If ``key`` is not hashable.

    Notes
    -----
    This method does not check for NaN values. Do that separately
    before calling this method.
    """
    # Unhashable keys are an error, not a "not found": let the
    # TypeError from hash() propagate to the caller.
    hash(key)

    # get location of key in categories.
    # If the lookup fails, the key isn't in categories, so logically
    # it can't be in container either. A categories index may reject
    # a hashable key of an incompatible type with a TypeError rather
    # than a KeyError; for a membership check both mean "not present".
    try:
        loc = cat.categories.get_loc(key)
    except (KeyError, TypeError):
        return False

    # loc is the location of key in categories, but also the *value*
    # for key in container. So, `key` may be in categories,
    # but still not in `container`. Example ('b' in categories,
    # but not in values):
    # 'b' in Categorical(['a'], categories=['a', 'b'])  # False
    if is_scalar(loc):
        return loc in container

    # if categories is an IntervalIndex, loc is an array; the key is
    # present if any of the matched locations occurs in container.
    return any(loc_ in container for loc_ in loc)


_codes_doc = """The category codes of this categorical.
Level codes are an array if integer which are the positions of the real
Expand Down Expand Up @@ -1846,6 +1897,14 @@ def __iter__(self):
"""Returns an Iterator over the values of this Categorical."""
return iter(self.get_values().tolist())

def __contains__(self, key):
    """
    Returns True if `key` is in this Categorical.

    Parameters
    ----------
    key : a hashable object
        The key to check membership for.

    Returns
    -------
    bool
        For a NaN-like ``key``, whether any value of this Categorical
        is missing; otherwise the result of the ``contains`` helper
        applied to the codes of this Categorical.

    Raises
    ------
    TypeError
        If ``key`` is not hashable.
    """
    # Reject unhashable keys before the NaN check. ``isna`` applied to a
    # list-like returns an array, so without this guard ``[np.nan] in cat``
    # would be answered as if the key were a scalar NaN, and a longer list
    # would fail on the ambiguous truth value of that array instead of
    # raising TypeError.
    hash(key)

    # if key is a NaN, check if any NaN is in self.
    if isna(key):
        return self.isna().any()

    return contains(self, key, container=self._codes)

def _tidy_repr(self, max_vals=10, footer=True):
""" a short repr displaying only max_vals and an optional (but default
footer)
Expand Down
29 changes: 4 additions & 25 deletions pandas/core/indexes/category.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
import pandas.core.common as com
import pandas.core.missing as missing
import pandas.core.indexes.base as ibase
from pandas.core.arrays.categorical import Categorical, contains

_index_doc_kwargs = dict(ibase._index_doc_kwargs)
_index_doc_kwargs.update(dict(target_klass='CategoricalIndex'))
Expand Down Expand Up @@ -125,7 +126,6 @@ def _create_from_codes(self, codes, categories=None, ordered=None,
CategoricalIndex
"""

from pandas.core.arrays import Categorical
if categories is None:
categories = self.categories
if ordered is None:
Expand Down Expand Up @@ -162,7 +162,6 @@ def _create_categorical(self, data, categories=None, ordered=None,
if not isinstance(data, ABCCategorical):
if ordered is None and dtype is None:
ordered = False
from pandas.core.arrays import Categorical
data = Categorical(data, categories=categories, ordered=ordered,
dtype=dtype)
else:
Expand Down Expand Up @@ -323,32 +322,14 @@ def _reverse_indexer(self):

@Appender(_index_shared_docs['__contains__'] % _index_doc_kwargs)
def __contains__(self, key):
hash(key)

if isna(key): # if key is a NaN, check if any NaN is in self.
# if key is a NaN, check if any NaN is in self.
if isna(key):
return self.hasnans

# is key in self.categories? Then get its location.
# If not (i.e. KeyError), it logically can't be in self either
try:
loc = self.categories.get_loc(key)
except KeyError:
return False

# loc is the location of key in self.categories, but also the value
# for key in self.codes and in self._engine. key may be in categories,
# but still not in self, check this. Example:
# 'b' in CategoricalIndex(['a'], categories=['a', 'b']) # False
if is_scalar(loc):
return loc in self._engine
else:
# if self.categories is IntervalIndex, loc is an array
# check if any scalar of the array is in self._engine
return any(loc_ in self._engine for loc_ in loc)
return contains(self, key, container=self._engine)

@Appender(_index_shared_docs['contains'] % _index_doc_kwargs)
def contains(self, key):
hash(key)
return key in self

def __array__(self, dtype=None):
Expand Down Expand Up @@ -479,7 +460,6 @@ def where(self, cond, other=None):
other = self._na_value
values = np.where(cond, self.values, other)

from pandas.core.arrays import Categorical
cat = Categorical(values,
categories=self.categories,
ordered=self.ordered)
Expand Down Expand Up @@ -862,7 +842,6 @@ def _delegate_method(self, name, *args, **kwargs):
def _add_accessors(cls):
""" add in Categorical accessor methods """

from pandas.core.arrays import Categorical
CategoricalIndex._add_delegate_accessors(
delegate=Categorical, accessors=["rename_categories",
"reorder_categories",
Expand Down
17 changes: 17 additions & 0 deletions pandas/tests/categorical/test_operators.py
Original file line number Diff line number Diff line change
Expand Up @@ -291,3 +291,20 @@ def test_numeric_like_ops(self):

# invalid ufunc
pytest.raises(TypeError, lambda: np.log(s))

def test_contains(self):
    # GH21508
    values = list('aabbca')
    categories = list('cab')
    cat = pd.Categorical(values, categories=categories)

    # membership is decided on the values, not on the categories alone
    assert 'b' in cat
    assert 'z' not in cat

    # no missing values present, so NaN is not a member
    assert np.nan not in cat

    # unhashable keys raise instead of returning False
    with pytest.raises(TypeError):
        assert [1] in cat

    # the integer codes themselves must NOT count as members
    for code in (0, 1):
        assert code not in cat

    # once a missing value is present, NaN is a member
    cat_with_nan = pd.Categorical(values + [np.nan], categories=categories)
    assert np.nan in cat_with_nan

0 comments on commit cc51575

Please sign in to comment.