From a7eb835ed79c77658f048ab03a56db1b6b3e318b Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 24 Aug 2017 16:04:02 -0500 Subject: [PATCH] ENH: Parametrized CategoricalDtype We extended the CategoricalDtype to accept optional categories and ordered arguments. ```python pd.CategoricalDtype(categories=['a', 'b'], ordered=True) ``` CategoricalDtype is now part of the public API. This allows users to specify the desired categories and orderedness of an operation ahead of time. The current behavior, which is still possible with categories=None, the default, is to infer the categories from whatever is present. This change will make it easy to implement support for specifying categories that are known ahead of time in other places e.g. .astype, .read_csv, and the Series constructor. Closes #14711 Closes #15078 Closes #14676 --- doc/source/advanced.rst | 2 +- doc/source/categorical.rst | 78 ++++++++- doc/source/merging.rst | 8 +- doc/source/whatsnew/v0.21.0.txt | 24 +++ pandas/core/api.py | 1 + pandas/core/categorical.py | 202 +++++++++------------- pandas/core/dtypes/common.py | 19 ++- pandas/core/dtypes/dtypes.py | 203 +++++++++++++++++++++-- pandas/core/indexes/base.py | 15 +- pandas/core/indexes/category.py | 2 +- pandas/core/indexes/interval.py | 3 +- pandas/core/internals.py | 20 ++- pandas/core/series.py | 3 +- pandas/core/sorting.py | 3 +- pandas/core/util/hashing.py | 2 +- pandas/tests/api/test_api.py | 3 +- pandas/tests/frame/test_analytics.py | 3 + pandas/tests/series/test_analytics.py | 11 +- pandas/tests/series/test_constructors.py | 20 +++ pandas/tests/series/test_dtypes.py | 33 +++- pandas/tests/test_categorical.py | 25 +++ 21 files changed, 510 insertions(+), 170 deletions(-) diff --git a/doc/source/advanced.rst b/doc/source/advanced.rst index 711c3e9a95d05d..90cb4d42d30214 100644 --- a/doc/source/advanced.rst +++ b/doc/source/advanced.rst @@ -654,7 +654,7 @@ setting the index of a ``DataFrame/Series`` with a ``category`` dtype would conv df = 
pd.DataFrame({'A': np.arange(6), 'B': list('aabbca')}) - df['B'] = df['B'].astype('category', categories=list('cab')) + df['B'] = df['B'].astype(pd.CategoricalDtype(list('cab'))) df df.dtypes df.B.cat.categories diff --git a/doc/source/categorical.rst b/doc/source/categorical.rst index 02d7920bc4a84e..29079da67cfa32 100644 --- a/doc/source/categorical.rst +++ b/doc/source/categorical.rst @@ -96,12 +96,19 @@ By passing a :class:`pandas.Categorical` object to a `Series` or assigning it to df["B"] = raw_cat df -You can also specify differently ordered categories or make the resulting data ordered, by passing these arguments to ``astype()``: +Anywhere above we passed a keyword ``dtype='category'``, we used the default behavior of + +1. categories are inferred from the data +2. categories are unordered. + +To control those behaviors, instead of passing ``'category'``, use an instance +of :class:`CategoricalDtype`. .. ipython:: python - s = pd.Series(["a","b","c","a"]) - s_cat = s.astype("category", categories=["b","c","d"], ordered=False) + s = pd.Series(["a", "b", "c", "a"]) + cat_type = pd.CategoricalDtype(categories=["b", "c", "d"], ordered=False) + s_cat = s.astype(cat_type) s_cat Categorical data has a specific ``category`` :ref:`dtype `: @@ -140,6 +147,61 @@ constructor to save the factorize step during normal constructor mode: splitter = np.random.choice([0,1], 5, p=[0.5,0.5]) s = pd.Series(pd.Categorical.from_codes(splitter, categories=["train", "test"])) +CategoricalDtype +---------------- + +.. versionchanged:: 0.21.0 + +A categorical's type is fully described by 1.) its categories (an iterable with +unique values and no missing values), and 2.) its orderedness (a boolean). +This information can be stored in a :class:`~pandas.CategoricalDtype`. +The ``categories`` argument is optional, which implies that the actual categories +should be inferred from whatever is present in the data when the +:class:`pandas.Categorical` is created. + +.. 
ipython:: python + + pd.CategoricalDtype(['a', 'b', 'c']) + pd.CategoricalDtype(['a', 'b', 'c'], ordered=True) + pd.CategoricalDtype() + +A :class:`~pandas.CategoricalDtype` can be used in any place pandas expects a +`dtype`. For example :func:`pandas.read_csv`, :func:`pandas.DataFrame.astype`, +or the Series constructor. + +As a convenience, you can use the string `'category'` in place of a +:class:`pandas.CategoricalDtype` when you want the default behavior of +the categories being unordered, and equal to the set of values present in the array. +In other words, ``dtype='category'`` is equivalent to ``dtype=pd.CategoricalDtype()``. + +Equality Semantics +~~~~~~~~~~~~~~~~~~ + +Two instances of :class:`pandas.CategoricalDtype` compare equal whenever they have +the same categories and orderedness. When comparing two unordered categoricals, the +order of the ``categories`` is not considered. + +.. ipython:: python + + c1 = pd.CategoricalDtype(['a', 'b', 'c'], ordered=False) + # Equal, since order is not considered when ordered=False + c1 == pd.CategoricalDtype(['b', 'c', 'a'], ordered=False) + # Unequal, since the second CategoricalDtype is ordered + c1 == pd.CategoricalDtype(['a', 'b', 'c'], ordered=True) + +All instances of ``CategoricalDtype`` compare equal to the string ``'category'``. + +.. ipython:: python + + c1 == 'category' + + +.. warning:: + + Since ``dtype='category'`` is essentially ``CategoricalDtype(None, False)``, + and since all instances of ``CategoricalDtype`` compare equal to ``'category'``, + all instances of ``CategoricalDtype`` compare equal to a ``CategoricalDtype(None)``. + Description ----------- @@ -189,7 +251,7 @@ It's also possible to pass in the categories in a specific order: .. ipython:: python - s = pd.Series(list('babc')).astype('category', categories=list('abcd')) + s = pd.Series(list('babc')).astype(pd.CategoricalDtype(list('abcd'))) s # categories @@ -306,7 +368,7 @@ meaning and certain operations are possible. 
If the categorical is unordered, `` s = pd.Series(pd.Categorical(["a","b","c","a"], ordered=False)) s.sort_values(inplace=True) - s = pd.Series(["a","b","c","a"]).astype('category', ordered=True) + s = pd.Series(["a","b","c","a"]).astype(pd.CategoricalDtype(ordered=True)) s.sort_values(inplace=True) s s.min(), s.max() @@ -406,9 +468,9 @@ categories or a categorical with any list-like object, will raise a TypeError. .. ipython:: python - cat = pd.Series([1,2,3]).astype("category", categories=[3,2,1], ordered=True) - cat_base = pd.Series([2,2,2]).astype("category", categories=[3,2,1], ordered=True) - cat_base2 = pd.Series([2,2,2]).astype("category", ordered=True) + cat = pd.Series([1,2,3]).astype(pd.CategoricalDtype([3, 2, 1], ordered=True)) + cat_base = pd.Series([2,2,2]).astype(pd.CategoricalDtype([3, 2, 1], ordered=True)) + cat_base2 = pd.Series([2,2,2]).astype(pd.CategoricalDtype(ordered=True)) cat cat_base diff --git a/doc/source/merging.rst b/doc/source/merging.rst index d956f1ca54e6b8..c1f9a7214f2040 100644 --- a/doc/source/merging.rst +++ b/doc/source/merging.rst @@ -831,7 +831,7 @@ The left frame. .. ipython:: python X = pd.Series(np.random.choice(['foo', 'bar'], size=(10,))) - X = X.astype('category', categories=['foo', 'bar']) + X = X.astype(pd.CategoricalDtype(categories=['foo', 'bar'])) left = pd.DataFrame({'X': X, 'Y': np.random.choice(['one', 'two', 'three'], size=(10,))}) @@ -842,8 +842,10 @@ The right frame. .. 
ipython:: python - right = pd.DataFrame({'X': pd.Series(['foo', 'bar']).astype('category', categories=['foo', 'bar']), - 'Z': [1, 2]}) + right = pd.DataFrame({ + 'X': pd.Series(['foo', 'bar'], dtype=pd.CategoricalDtype(['foo', 'bar'])), + 'Z': [1, 2] + }) right right.dtypes diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 2e841bb233cb8d..bdff219bdcd99b 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -22,6 +22,8 @@ Check the :ref:`API Changes ` and :ref:`deprecations New features ~~~~~~~~~~~~ +- New user-facing :class:`CategoricalDtype` for specifying categorical independent + of the data (:issue:`14711`, :issue:`15078`) - Support for `PEP 519 -- Adding a file system path protocol `_ on most readers and writers (:issue:`13823`) - Added ``__fspath__`` method to :class:`~pandas.HDFStore`, :class:`~pandas.ExcelFile`, @@ -106,6 +108,28 @@ This does not permit that column to be accessed as an attribute: Both of these now raise a ``UserWarning`` about the potential for unexpected behavior. See :ref:`Attribute Access `. +.. _whatsnew_0210.enhancements.categorical_dtype: + +``CategoricalDtype`` for specifying categoricals +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:class:`CategoricalDtype` has been added to the public API and expanded to +include the ``categories`` and ``ordered`` attributes. A ``CategoricalDtype`` +can be used to specify the set of categories and orderedness of an array, +independent of the data themselves. This can be useful, e.g., when converting +string data to a ``Categorical``: + +.. ipython:: python + + s = pd.Series(['a', 'b', 'c', 'a']) # strings + dtype = pd.CategoricalDtype(categories=['a', 'b', 'c', 'd'], ordered=True) + s.astype(dtype) + +The ``.dtype`` property of a ``Categorical``, ``CategoricalIndex`` or a +``Series`` with categorical type will now return an instance of ``CategoricalDtype``. + +See :ref:`CategoricalDtype ` for more. + .. 
_whatsnew_0210.enhancements.other: Other Enhancements diff --git a/pandas/core/api.py b/pandas/core/api.py index 086fedd7d7cf89..039b79bde6ce0d 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -6,6 +6,7 @@ from pandas.core.algorithms import factorize, unique, value_counts from pandas.core.dtypes.missing import isna, isnull, notna, notnull +from pandas.core.dtypes.dtypes import CategoricalDtype from pandas.core.categorical import Categorical from pandas.core.groupby import Grouper from pandas.io.formats.format import set_eng_float_format diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index 1c2a29333001ca..94fd59598f286f 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -23,7 +23,7 @@ is_datetimelike, is_categorical, is_categorical_dtype, - is_integer_dtype, is_bool, + is_integer_dtype, is_list_like, is_sequence, is_scalar) from pandas.core.common import is_null_slice @@ -228,7 +228,7 @@ class Categorical(PandasObject): >>> a.min() 'c' """ - dtype = CategoricalDtype() + _dtype = CategoricalDtype() """The dtype (always "category")""" """Whether or not this Categorical is ordered. 
@@ -250,20 +250,15 @@ class Categorical(PandasObject): def __init__(self, values, categories=None, ordered=False, fastpath=False): - self._validate_ordered(ordered) - if fastpath: - # fast path + self._dtype = CategoricalDtype(categories, ordered) self._codes = coerce_indexer_dtype(values, categories) - self._categories = self._validate_categories( - categories, fastpath=isinstance(categories, ABCIndexClass)) - self._ordered = ordered return # sanitize input if is_categorical_dtype(values): - # we are either a Series or a CategoricalIndex + # we are either a Series, CategoricalIndex or CategoricalDtype if isinstance(values, (ABCSeries, ABCCategoricalIndex)): values = values._values @@ -313,7 +308,7 @@ def __init__(self, values, categories=None, ordered=False, fastpath=False): raise NotImplementedError("> 1 ndim Categorical are not " "supported at this time") - categories = self._validate_categories(categories) + dtype = CategoricalDtype(categories, ordered) else: # there were two ways if categories are present @@ -325,12 +320,13 @@ def __init__(self, values, categories=None, ordered=False, fastpath=False): # make sure that we always have the same type here, no matter what # we get passed in - categories = self._validate_categories(categories) - codes = _get_codes_for_values(values, categories) + dtype = CategoricalDtype(categories, ordered) + codes = _get_codes_for_values(values, dtype.categories) # TODO: check for old style usage. These warnings should be removes # after 0.18/ in 2016 - if is_integer_dtype(values) and not is_integer_dtype(categories): + if (is_integer_dtype(values) and + not is_integer_dtype(dtype.categories)): warn("Values and categories have different dtypes. 
Did you " "mean to use\n'Categorical.from_codes(codes, " "categories)'?", RuntimeWarning, stacklevel=2) @@ -341,9 +337,29 @@ def __init__(self, values, categories=None, ordered=False, fastpath=False): "mean to use\n'Categorical.from_codes(codes, " "categories)'?", RuntimeWarning, stacklevel=2) - self.set_ordered(ordered or False, inplace=True) - self._categories = categories - self._codes = coerce_indexer_dtype(codes, categories) + self._dtype = dtype + self._codes = coerce_indexer_dtype(codes, dtype.categories) + + @property + def categories(self): + return self.dtype.categories + + @categories.setter + def categories(self, categories): + new_dtype = CategoricalDtype(categories, ordered=self.ordered) + if (self.dtype.categories is not None and + len(self.dtype.categories) != len(new_dtype.categories)): + raise ValueError("new categories need to have the same number of " + "items as the old categories!") + self._dtype = new_dtype + + @property + def ordered(self): + return self.dtype.ordered + + @property + def dtype(self): + return self._dtype def __dir__(self): # Avoid IPython warnings for deprecated properties @@ -474,13 +490,16 @@ def from_codes(cls, codes, categories, ordered=False): categorical. If not given, the resulting categorical will be unordered. """ + from pandas import Index + try: codes = np.asarray(codes, np.int64) except: raise ValueError( "codes need to be convertible to an arrays of integers") - categories = cls._validate_categories(categories) + # have to use the instance, not property + categories = cls._dtype._validate_categories(Index(categories)) if len(codes) and (codes.max() >= len(categories) or codes.min() < -1): raise ValueError("codes need to be between -1 and " @@ -523,69 +542,6 @@ def _get_labels(self): labels = property(fget=_get_labels, fset=_set_codes) - _categories = None - - @classmethod - def _validate_ordered(cls, ordered): - """ - Validates that we have a valid ordered parameter. 
If - it is not a boolean, a TypeError will be raised. - - Parameters - ---------- - ordered : object - The parameter to be verified. - - Raises - ------ - TypeError - If 'ordered' is not a boolean. - """ - if not is_bool(ordered): - raise TypeError("'ordered' must either be 'True' or 'False'") - - @classmethod - def _validate_categories(cls, categories, fastpath=False): - """ - Validates that we have good categories - - Parameters - ---------- - fastpath : boolean (default: False) - Don't perform validation of the categories for uniqueness or nulls - - """ - if not isinstance(categories, ABCIndexClass): - dtype = None - if not hasattr(categories, "dtype"): - if not is_list_like(categories): - raise TypeError("`categories` must be list-like. " - "Got {} instead".format(repr(categories))) - categories = _convert_to_list_like(categories) - # On categories with NaNs, int values would be converted to - # float. Use "object" dtype to prevent this. - if isna(categories).any(): - without_na = np.array([x for x in categories - if notna(x)]) - with_na = np.array(categories) - if with_na.dtype != without_na.dtype: - dtype = "object" - - from pandas import Index - categories = Index(categories, dtype=dtype) - - if not fastpath: - - # Categories cannot contain NaN. - if categories.hasnans: - raise ValueError('Categorial categories cannot be null') - - # Categories must be unique. 
- if not categories.is_unique: - raise ValueError('Categorical categories must be unique') - - return categories - def _set_categories(self, categories, fastpath=False): """ Sets new categories @@ -596,21 +552,13 @@ def _set_categories(self, categories, fastpath=False): """ - categories = self._validate_categories(categories, fastpath=fastpath) - if (not fastpath and self._categories is not None and - len(categories) != len(self._categories)): + new = CategoricalDtype(categories, self.ordered, fastpath) + if (not fastpath and self.dtype.categories is not None and + len(new.categories) != len(self.dtype.categories)): raise ValueError("new categories need to have the same number of " "items than the old categories!") - self._categories = categories - - def _get_categories(self): - """ Gets the categories """ - # categories is an Index, which is immutable -> no need to copy - return self._categories - - categories = property(fget=_get_categories, fset=_set_categories, - doc=_categories_doc) + self._dtype = new def _codes_for_groupby(self, sort): """ @@ -652,8 +600,6 @@ def _codes_for_groupby(self, sort): return self.reorder_categories(cat.categories) - _ordered = None - def set_ordered(self, value, inplace=False): """ Sets the ordered attribute to the boolean value @@ -667,9 +613,9 @@ def set_ordered(self, value, inplace=False): of this categorical with ordered set to the value """ inplace = validate_bool_kwarg(inplace, 'inplace') - self._validate_ordered(value) + new = CategoricalDtype(self.categories, ordered=value) cat = self if inplace else self.copy() - cat._ordered = value + cat._dtype = new if not inplace: return cat @@ -699,12 +645,6 @@ def as_unordered(self, inplace=False): inplace = validate_bool_kwarg(inplace, 'inplace') return self.set_ordered(False, inplace=inplace) - def _get_ordered(self): - """ Gets the ordered attribute """ - return self._ordered - - ordered = property(fget=_get_ordered) - def set_categories(self, new_categories, ordered=None, 
rename=False, inplace=False): """ Sets the categories to the specified new_categories. @@ -757,21 +697,20 @@ def set_categories(self, new_categories, ordered=None, rename=False, remove_unused_categories """ inplace = validate_bool_kwarg(inplace, 'inplace') - new_categories = self._validate_categories(new_categories) + if ordered is None: + ordered = self.dtype.ordered + new_dtype = CategoricalDtype(new_categories, ordered=ordered) + cat = self if inplace else self.copy() if rename: - if (cat._categories is not None and - len(new_categories) < len(cat._categories)): + if (cat.dtype.categories is not None and + len(new_dtype.categories) < len(cat.dtype.categories)): # remove all _codes which are larger and set to -1/NaN - self._codes[self._codes >= len(new_categories)] = -1 + self._codes[self._codes >= len(new_dtype.categories)] = -1 else: values = cat.__array__() - cat._codes = _get_codes_for_values(values, new_categories) - cat._categories = new_categories - - if ordered is None: - ordered = self.ordered - cat.set_ordered(ordered, inplace=True) + cat._codes = _get_codes_for_values(values, new_dtype.categories) + cat._dtype = new_dtype if not inplace: return cat @@ -851,7 +790,7 @@ def reorder_categories(self, new_categories, ordered=None, inplace=False): set_categories """ inplace = validate_bool_kwarg(inplace, 'inplace') - if set(self._categories) != set(new_categories): + if set(self.dtype.categories) != set(new_categories): raise ValueError("items in new_categories are not the same as in " "old categories") return self.set_categories(new_categories, ordered=ordered, @@ -892,15 +831,17 @@ def add_categories(self, new_categories, inplace=False): inplace = validate_bool_kwarg(inplace, 'inplace') if not is_list_like(new_categories): new_categories = [new_categories] - already_included = set(new_categories) & set(self._categories) + already_included = set(new_categories) & set(self.dtype.categories) if len(already_included) != 0: msg = ("new categories must not 
include old categories: %s" % str(already_included)) raise ValueError(msg) - new_categories = list(self._categories) + list(new_categories) + new_categories = list(self.dtype.categories) + list(new_categories) + new_dtype = CategoricalDtype(new_categories, self.ordered) + cat = self if inplace else self.copy() - cat._categories = self._validate_categories(new_categories) - cat._codes = coerce_indexer_dtype(cat._codes, new_categories) + cat._dtype = new_dtype + cat._codes = coerce_indexer_dtype(cat._codes, new_dtype.categories) if not inplace: return cat @@ -940,8 +881,9 @@ def remove_categories(self, removals, inplace=False): removals = [removals] removal_set = set(list(removals)) - not_included = removal_set - set(self._categories) - new_categories = [c for c in self._categories if c not in removal_set] + not_included = removal_set - set(self.dtype.categories) + new_categories = [c for c in self.dtype.categories + if c not in removal_set] # GH 10156 if any(isna(removals)): @@ -983,8 +925,11 @@ def remove_unused_categories(self, inplace=False): if idx.size != 0 and idx[0] == -1: # na sentinel idx, inv = idx[1:], inv - 1 - cat._categories = cat.categories.take(idx) - cat._codes = coerce_indexer_dtype(inv, self._categories) + new_categories = cat.dtype.categories.take(idx) + new_dtype = CategoricalDtype(new_categories, ordered=self.ordered, + fastpath=True) + cat._dtype = new_dtype + cat._codes = coerce_indexer_dtype(inv, new_dtype.categories) if not inplace: return cat @@ -1085,7 +1030,7 @@ def __setstate__(self, state): # Provide compatibility with pre-0.15.0 Categoricals. 
if '_categories' not in state and '_levels' in state: - state['_categories'] = self._validate_categories(state.pop( + state['_categories'] = self.dtype._validate_categories(state.pop( '_levels')) if '_codes' not in state and 'labels' in state: state['_codes'] = coerce_indexer_dtype( @@ -1100,6 +1045,11 @@ def __setstate__(self, state): else: state['_ordered'] = False + # 0.21.0 CategoricalDtype change + if '_dtype' not in state: + state['_dtype'] = CategoricalDtype(state['_categories'], + state['_ordered']) + for k, v in compat.iteritems(state): setattr(self, k, v) @@ -1109,7 +1059,7 @@ def T(self): @property def nbytes(self): - return self._codes.nbytes + self._categories.values.nbytes + return self._codes.nbytes + self.dtype.categories.values.nbytes def memory_usage(self, deep=False): """ @@ -1134,7 +1084,8 @@ def memory_usage(self, deep=False): -------- numpy.ndarray.nbytes """ - return self._codes.nbytes + self._categories.memory_usage(deep=deep) + return self._codes.nbytes + self.dtype.categories.memory_usage( + deep=deep) @Substitution(klass='Categorical') @Appender(_shared_docs['searchsorted']) @@ -1978,8 +1929,7 @@ def is_dtype_equal(self, other): """ try: - return (self.categories.equals(other.categories) and - self.ordered == other.ordered) + return hash(self.dtype) == hash(other.dtype) except (AttributeError, TypeError): return False diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index c47e61dc446be2..1ac9f37c3e3847 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -692,6 +692,21 @@ def is_dtype_equal(source, target): return False +def _is_dtype_union_equal(source, target): + """ + Check whether two arrays have compatible dtypes to do a unoin. + numpy types are checked with ``is_dtype_equal``. Extension types are + checked separately. 
+ """ + source = _get_dtype(source) + target = _get_dtype(target) + if is_categorical_dtype(source) and is_categorical_dtype(target): + # ordered False for both + return source.ordered is target.ordered + else: + return is_dtype_equal(source, target) + + def is_any_int_dtype(arr_or_dtype): """ DEPRECATED: This function will be removed in a future version. @@ -1671,7 +1686,9 @@ def _coerce_to_dtype(dtype): """ if is_categorical_dtype(dtype): - dtype = CategoricalDtype() + categories = getattr(dtype, 'categories', None) + ordered = getattr(dtype, 'ordered', False) + dtype = CategoricalDtype(categories=categories, ordered=ordered) elif is_datetime64tz_dtype(dtype): dtype = DatetimeTZDtype(dtype) elif is_period_dtype(dtype): diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index dc2c56ea476f9d..d07dc2fcd87385 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -1,5 +1,6 @@ """ define extension dtypes """ +import weakref import re import numpy as np from pandas import compat @@ -110,37 +111,150 @@ class CategoricalDtypeType(type): class CategoricalDtype(ExtensionDtype): """ - A np.dtype duck-typed class, suitable for holding a custom categorical - dtype. - - THIS IS NOT A REAL NUMPY DTYPE, but essentially a sub-class of np.object + Type for categorical data with the categories and orderedness, + but not the values. + + .. versionadded:: 0.20.0 + + Parameters + ---------- + categories : list or None + ordered : bool, default False + + Examples + -------- + >>> t = CategoricalDtype(categories=['b', 'a'], ordered=True) + >>> s = Series(['a', 'a', 'b', 'b', 'a']) + >>> s.astype(t) + 0 a + 1 a + 2 b + 3 b + 4 a + dtype: category + Categories (2, object): [b < a] """ + # TODO: Document public vs. 
private API name = 'category' type = CategoricalDtypeType kind = 'O' str = '|O08' base = np.dtype('O') - _metadata = [] - _cache = {} - - def __new__(cls): + _metadata = ['categories', 'ordered'] + _cache = weakref.WeakValueDictionary() + + def __new__(cls, categories=None, ordered=False, fastpath=False): + from pandas.core.indexes.base import Index + if categories is not None: + categories = Index(categories, tupleize_cols=False) + # validation + cls._validate_categories(categories, fastpath=fastpath) + cls._validate_ordered(ordered) + # We have a choice when hashing *unordered* categoricals: + # Should the two values `[a, b, c]` and `[b, a, c]` + # hash the same when both are unordered? + # + # Ignoring the order can cause some confusion when combined with + # our caching of CategoricalDtypes to have singletons (per params). + # If they first do CategoricalDtype(['b', 'a']) then + # CategoricalDtype(['a', 'b']) they get CategoricalDtype(['b','a']) + # which is surprising. For this reason, we choose to include order + # in the hashing, even if it's unordered + + hashed = cls._hash_categories(categories, ordered=True) + else: + hashed = None try: - return cls._cache[cls.name] + return cls._cache[(hashed, ordered)] except KeyError: - c = object.__new__(cls) - cls._cache[cls.name] = c - return c + categorical = object.__new__(cls) + categorical._categories = categories + categorical._ordered = ordered + cls._cache[(hashed, ordered)] = categorical + return categorical def __hash__(self): - # make myself hashable - return hash(str(self)) + # _hash_categories returns a uint64, so use the negative + # space for when we have unknown categories to avoid a conflict + if self.categories is None: + if self.ordered: + return -1 + else: + return -2 + # We *do* want to include the real self.ordered here + return int(self._hash_categories(self.categories, self.ordered)) def __eq__(self, other): if isinstance(other, compat.string_types): return other == self.name - return 
isinstance(other, CategoricalDtype) + if not (hasattr(other, 'ordered') and hasattr(other, 'categories')): + return False + elif self.categories is None or other.categories is None: + # We're forced into a suboptimal corner thanks to math and + # backwards compatibility. We require that `CDT(...) == 'category'` + # for all CDTs **including** `CDT(None, ...)`. Therefore, *all* + # CDT(., .) = CDT(None, False) and *all* + # CDT(., .) = CDT(None, True). + return True + elif self.ordered: + return other.ordered and self.categories.equals(other.categories) + elif other.ordered: + return False + else: + # both unordered; this could probably be optimized / cached + return hash(self) == hash(other) + + def __unicode__(self): + tpl = 'CategoricalDtype({}ordered={})' + if self.categories is None: + data = "None, " + else: + data = self.categories._format_data(name=self.__class__.__name__) + return tpl.format(data, self.ordered) + + def __repr__(self): + return str(self) + + def __getnewargs__(self): + return (self.categories, self.ordered) + + @staticmethod + def _hash_categories(categories, ordered=True): + from pandas.core.util.hashing import ( + hash_array, _combine_hash_arrays, hash_tuples + ) + + categories = np.asarray(categories) + if len(categories) and isinstance(categories[0], tuple): + # assumes if any individual category is a tuple, then all our. ATM + # I don't really want to support just some of the categories being + # tuples. + categories = list(categories) # breaks if a np.array of categories + cat_array = hash_tuples(categories) + else: + if categories.dtype == 'O': + types = [type(x) for x in categories] + if not len(set(types)) == 1: + # TODO: hash_array doesn't handle mixed types. 
It casts + # everything to a str first, which means we treat + # {'1', '2'} the same as {'1', 2} + # find a better solution + cat_array = np.array([hash(x) for x in categories]) + hashed = hash((tuple(categories), ordered)) + return hashed + cat_array = hash_array(np.asarray(categories), categorize=False) + if ordered: + cat_array = np.vstack([ + cat_array, np.arange(len(cat_array), dtype=cat_array.dtype) + ]) + else: + cat_array = [cat_array] + hashed = _combine_hash_arrays(iter(cat_array), + num_items=len(cat_array)) + hashed = np.bitwise_xor.reduce(hashed) + return hashed @classmethod def construct_from_string(cls, string): @@ -154,6 +268,65 @@ def construct_from_string(cls, string): raise TypeError("cannot construct a CategoricalDtype") + @staticmethod + def _validate_ordered(ordered): + """ + Validates that we have a valid ordered parameter. If + it is not a boolean, a TypeError will be raised. + + Parameters + ---------- + ordered : object + The parameter to be verified. + + Raises + ------ + TypeError + If 'ordered' is not a boolean. 
+ """ + from pandas.core.dtypes.common import is_bool + if not is_bool(ordered): + raise TypeError("'ordered' must either be 'True' or 'False'") + + @staticmethod + def _validate_categories(categories, fastpath=False): + """ + Validates that we have good categories + + Parameters + ---------- + categories : array-like + fastpath : bool + Whether to skip nan and uniqueness checks + + Returns + ------- + categories : Index + """ + from pandas.core.dtypes.generic import ABCIndexClass + from pandas import Index + + if not isinstance(categories, ABCIndexClass): + categories = Index(categories) + + if not fastpath: + + if categories.hasnans: + raise ValueError('Categorial categories cannot be null') + + if not categories.is_unique: + raise ValueError('Categorical categories must be unique') + + return categories + + @property + def categories(self): + return self._categories + + @property + def ordered(self): + return self._ordered + class DatetimeTZDtypeType(type): """ diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 5c098fc8e26863..d6ddb59bf3fa6b 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -27,6 +27,7 @@ is_integer, is_float, is_dtype_equal, + _is_dtype_union_equal, is_object_dtype, is_categorical_dtype, is_interval_dtype, @@ -848,7 +849,7 @@ def _formatter_func(self): """ return default_pprint - def _format_data(self): + def _format_data(self, name=None): """ Return the formatted data as a unicode string """ @@ -857,9 +858,11 @@ def _format_data(self): display_width, _ = get_console_size() if display_width is None: display_width = get_option('display.width') or 80 + if name is None: + name = self.__class__.__name__ - space1 = "\n%s" % (' ' * (len(self.__class__.__name__) + 1)) - space2 = "\n%s" % (' ' * (len(self.__class__.__name__) + 2)) + space1 = "\n%s" % (' ' * (len(name) + 1)) + space2 = "\n%s" % (' ' * (len(name) + 2)) n = len(self) sep = ',' @@ -2174,7 +2177,11 @@ def union(self, other): if len(self) 
== 0: return other._get_consensus_name(self) - if not is_dtype_equal(self.dtype, other.dtype): + # TODO: _is_dtype_union_equal is a hack around lack of + # 1. buggy Multiset joins + # 2. CategoricalIndex lacking setops + # I'd like to fix those before merging CategoricalDtype + if not _is_dtype_union_equal(self.dtype, other.dtype): this = self.astype('O') other = other.astype('O') return this.union(other) diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index f22407308e0944..a12348d5e7fc3f 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -234,7 +234,7 @@ def _format_attrs(self): ('ordered', self.ordered)] if self.name is not None: attrs.append(('name', ibase.default_pprint(self.name))) - attrs.append(('dtype', "'%s'" % self.dtype)) + attrs.append(('dtype', "'%s'" % self.dtype.name)) max_seq_items = get_option('display.max_seq_items') or len(self) if len(self) > max_seq_items: attrs.append(('length', len(self))) diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index e0ed6c7ea35c0c..265f39a9f05223 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -944,9 +944,10 @@ def _format_native_types(self, na_rep='', quoting=None, **kwargs): na_rep=na_rep, justify='all').get_result() - def _format_data(self): + def _format_data(self, name=None): # TODO: integrate with categorical and make generic + # name argument is unused here; just for compat with base / categorical n = len(self) max_seq_items = min((get_option( 'display.max_seq_items') or n) // 10, 10) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 83b382ec0ed723..e510ca87e44aa7 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -139,14 +139,14 @@ def is_categorical_astype(self, dtype): validate that we have a astypeable to categorical, returns a boolean if we are a categorical """ - if is_categorical_dtype(dtype): - if dtype == CategoricalDtype(): - 
return True - + if dtype is Categorical or dtype is CategoricalDtype: # this is a pd.Categorical, but is not # a valid type for astypeing raise TypeError("invalid type {0} for astype".format(dtype)) + elif is_categorical_dtype(dtype): + return True + return False def external_values(self, dtype=None): @@ -548,6 +548,18 @@ def _astype(self, dtype, copy=False, errors='raise', values=None, # may need to convert to categorical # this is only called for non-categoricals if self.is_categorical_astype(dtype): + if (('categories' in kwargs or 'ordered' in kwargs) and + isinstance(dtype, CategoricalDtype)): + raise TypeError("Cannot specify a CategoricalDtype and also " + "`categories` or `ordered`. Use " + "`dtype=CategoricalDtype(categories, ordered)`" + " instead.") + kwargs = kwargs.copy() + categories = getattr(dtype, 'categories', None) + ordered = getattr(dtype, 'ordered', False) + + kwargs.setdefault('categories', categories) + kwargs.setdefault('ordered', ordered) return self.make_block(Categorical(self.values, **kwargs)) # astype processing diff --git a/pandas/core/series.py b/pandas/core/series.py index 75dc3d6403650c..323a1493531bdf 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3011,7 +3011,8 @@ def _try_cast(arr, take_fast_path): subarr = np.array(subarr, dtype=dtype, copy=copy) except (ValueError, TypeError): if is_categorical_dtype(dtype): - subarr = Categorical(arr) + subarr = Categorical(arr, dtype.categories, + ordered=dtype.ordered) elif dtype is not None and raise_cast_failure: raise else: diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 12e8d8aba91779..27252b9616a445 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -2,7 +2,6 @@ import numpy as np from pandas.compat import long, string_types, PY3 -from pandas.core.categorical import Categorical from pandas.core.dtypes.common import ( _ensure_platform_int, _ensure_int64, @@ -183,6 +182,8 @@ def indexer_from_factorized(labels, shape, compress=True): 
def lexsort_indexer(keys, orders=None, na_position='last'): + from pandas.core.categorical import Categorical + labels = [] shape = [] if isinstance(orders, bool): diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py index 07e993d7ef5092..0c82773b75c289 100644 --- a/pandas/core/util/hashing.py +++ b/pandas/core/util/hashing.py @@ -260,7 +260,7 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True): # For categoricals, we hash the categories, then remap the codes to the # hash values. (This check is above the complex check so that we don't ask - # numpy if categorical is a subdtype of complex, as it will choke. + # numpy if categorical is a subdtype of complex, as it will choke). if is_categorical_dtype(dtype): return _hash_categorical(vals, encoding, hash_key) diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index 09cccd54b74f85..4f7ad2b608a867 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -48,7 +48,8 @@ class TestPDApi(Base): 'Period', 'PeriodIndex', 'RangeIndex', 'UInt64Index', 'Series', 'SparseArray', 'SparseDataFrame', 'SparseSeries', 'TimeGrouper', 'Timedelta', - 'TimedeltaIndex', 'Timestamp', 'Interval', 'IntervalIndex'] + 'TimedeltaIndex', 'Timestamp', 'Interval', 'IntervalIndex', + 'CategoricalDtype'] # these are already deprecated; awaiting removal deprecated_classes = ['WidePanel', 'Panel4D', diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index 93514a8a422151..6e9b531dec566d 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -2082,6 +2082,9 @@ def test_n_error(self, df_main_dtypes, method, columns): df = df_main_dtypes error_msg = self.dtype_error_msg_template.format( column=columns[1], method=method, dtype=df[columns[1]].dtype) + # escape some characters that may be in the repr + error_msg = (error_msg.replace('(', '\\(').replace(")", "\\)") + .replace("[", "\\[").replace("]", 
"\\]")) with tm.assert_raises_regex(TypeError, error_msg): getattr(df, method)(2, columns) diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index f1d044f7a11325..914181dc941549 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -1756,7 +1756,6 @@ class TestNLargestNSmallest(object): # not supported on some archs # Series([3., 2, 1, 2, 5], dtype='complex256'), Series([3., 2, 1, 2, 5], dtype='complex128'), - Series(list('abcde'), dtype='category'), Series(list('abcde'))]) def test_error(self, r): dt = r.dtype @@ -1768,6 +1767,16 @@ def test_error(self, r): with tm.assert_raises_regex(TypeError, msg): method(arg) + def test_error_categorical_dtype(self): + # same as test_error, but regex hard to escape properly + msg = ("Cannot use method 'n(larg|small)est' with dtype " + "CategoricalDtype.+") + with tm.assert_raises_regex(TypeError, msg): + Series(list('ab'), dtype='category').nlargest(2) + + with tm.assert_raises_regex(TypeError, msg): + Series(list('ab'), dtype='category').nsmallest(2) + @pytest.mark.parametrize( "s", [v for k, v in s_main_dtypes().iteritems()]) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 3b95c2803dd9e6..adeb871edb2cbb 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -157,6 +157,26 @@ def test_constructor_categorical(self): assert is_categorical_dtype(s) assert is_categorical_dtype(s.dtype) + def test_constructor_categorical_dtype(self): + result = pd.Series(['a', 'b'], + dtype=pd.CategoricalDtype(['a', 'b', 'c'], + ordered=True)) + assert is_categorical_dtype(result) is True + tm.assert_index_equal(result.cat.categories, pd.Index(['a', 'b', 'c'])) + assert result.cat.ordered + + result = pd.Series(['a', 'b'], dtype=pd.CategoricalDtype(['b', 'a'])) + assert is_categorical_dtype(result) + tm.assert_index_equal(result.cat.categories, pd.Index(['b', 
'a'])) + assert result.cat.ordered is False + + def test_unordered_compare_equal(self): + left = pd.Series(['a', 'b', 'c'], + dtype=pd.CategoricalDtype(['a', 'b'])) + right = pd.Series(pd.Categorical(['a', 'b', np.nan], + categories=['a', 'b'])) + tm.assert_series_equal(left, right) + def test_constructor_maskedarray(self): data = ma.masked_all((3, ), dtype=float) result = Series(data) diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py index c214280ee8386a..bce0a7da4e7a23 100644 --- a/pandas/tests/series/test_dtypes.py +++ b/pandas/tests/series/test_dtypes.py @@ -12,7 +12,10 @@ from numpy import nan import numpy as np -from pandas import Series, Timestamp, Timedelta, DataFrame, date_range +from pandas import ( + Series, Timestamp, Timedelta, DataFrame, date_range, + Categorical, CategoricalDtype, Index +) from pandas.compat import lrange, range, u from pandas import compat @@ -182,6 +185,34 @@ def test_astype_dict_like(self, dtype_class): with pytest.raises(KeyError): s.astype(dt5) + def test_astype_categoricaldtype(self): + s = Series(['a', 'b', 'a']) + result = s.astype(CategoricalDtype(['a', 'b'], ordered=True)) + expected = Series(Categorical(['a', 'b', 'a'], ordered=True)) + tm.assert_series_equal(result, expected) + + result = s.astype(CategoricalDtype(['a', 'b'], ordered=False)) + expected = Series(Categorical(['a', 'b', 'a'], ordered=False)) + tm.assert_series_equal(result, expected) + + result = s.astype(CategoricalDtype(['a', 'b', 'c'], ordered=False)) + expected = Series(Categorical(['a', 'b', 'a'], + categories=['a', 'b', 'c'], + ordered=False)) + tm.assert_series_equal(result, expected) + tm.assert_index_equal(result.cat.categories, Index(['a', 'b', 'c'])) + + def test_astype_categoricaldtype_with_args(self): + s = Series(['a', 'b']) + type_ = CategoricalDtype(['a', 'b']) + + with pytest.raises(TypeError): + s.astype(type_, ordered=True) + with pytest.raises(TypeError): + s.astype(type_, categories=['a', 'b']) + with 
pytest.raises(TypeError): + s.astype(type_, categories=['a', 'b'], ordered=False) + def test_astype_generic_timestamp_deprecated(self): # see gh-15524 data = [1] diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index 7bbe220378993b..ac90e867a9f92a 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -122,6 +122,26 @@ def test_constructor_empty(self): expected = pd.Int64Index([1, 2, 3]) tm.assert_index_equal(c.categories, expected) + def test_constructor_tuples(self): + values = np.array([(1,), (1, 2), (1,), (1, 2)], dtype=object) + result = Categorical(values) + expected = Index([(1,), (1, 2)], tupleize_cols=False) + tm.assert_index_equal(result.categories, expected) + assert result.ordered is False + + def test_constructor_tuples_datetimes(self): + # numpy will auto reshape when all of the tuples are the + # same len, so add an extra one with 2 items and slice it off + values = np.array([(Timestamp('2010-01-01'),), + (Timestamp('2010-01-02'),), + (Timestamp('2010-01-01'),), + (Timestamp('2010-01-02'),), + ('a', 'b')], dtype=object)[:-1] + result = Categorical(values) + expected = Index([(Timestamp('2010-01-01'),), + (Timestamp('2010-01-02'),)], tupleize_cols=False) + tm.assert_index_equal(result.categories, expected) + def test_constructor_unsortable(self): # it works! @@ -623,6 +643,11 @@ def test_categories_none(self): 'a', 'c', 'c', 'c'], ordered=True) tm.assert_categorical_equal(factor, self.factor) + def test_set_categories_inplace(self): + cat = self.factor.copy() + cat.set_categories(['a', 'b', 'c', 'd'], inplace=True) + tm.assert_index_equal(cat.categories, pd.Index(['a', 'b', 'c', 'd'])) + def test_describe(self): # string type desc = self.factor.describe()