From 7314570a0e26616cb69ef69f2c6935dcf0bb02c9 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 24 Aug 2017 16:04:02 -0500 Subject: [PATCH] ENH: Parametrized CategoricalDtype We extended the CategoricalDtype to accept optional categories and ordered arguments. ```python pd.CategoricalDtype(categories=['a', 'b'], ordered=True) ``` CategoricalDtype is now part of the public API. This allows users to specify the desired categories and orderedness of an operation ahead of time. The current behavior, which is still possible with categories=None, the default, is to infer the categories from whatever is present. This change will make it easy to implement support for specifying categories that are known ahead of time in other places e.g. .astype, .read_csv, and the Series constructor. Closes #14711 Closes #15078 Closes #14676 --- doc/source/advanced.rst | 4 +- doc/source/api.rst | 5 +- doc/source/categorical.rst | 98 +++++++- doc/source/merging.rst | 11 +- doc/source/whatsnew/v0.21.0.txt | 26 ++ pandas/core/categorical.py | 232 ++++++++---------- pandas/core/dtypes/common.py | 38 ++- pandas/core/dtypes/dtypes.py | 200 +++++++++++++-- pandas/core/indexes/base.py | 14 +- pandas/core/indexes/category.py | 53 ++-- pandas/core/indexes/interval.py | 3 +- pandas/core/indexes/multi.py | 2 +- pandas/core/indexes/range.py | 2 +- pandas/core/internals.py | 20 +- pandas/core/series.py | 3 +- pandas/core/sorting.py | 3 +- pandas/core/util/hashing.py | 2 +- pandas/tests/dtypes/test_common.py | 9 +- pandas/tests/dtypes/test_dtypes.py | 106 +++++++- pandas/tests/frame/test_analytics.py | 3 + pandas/tests/indexes/test_category.py | 10 +- .../tests/io/json/test_json_table_schema.py | 5 +- pandas/tests/io/test_parquet.py | 3 + pandas/tests/io/test_pytables.py | 10 +- pandas/tests/reshape/test_merge.py | 4 +- pandas/tests/series/test_analytics.py | 11 +- pandas/tests/series/test_constructors.py | 21 ++ pandas/tests/series/test_dtypes.py | 34 ++- pandas/tests/test_algos.py | 72 +++--- 
pandas/tests/test_categorical.py | 126 +++++++++- pandas/util/testing.py | 7 +- 31 files changed, 887 insertions(+), 250 deletions(-) diff --git a/doc/source/advanced.rst b/doc/source/advanced.rst index 3f145cf9556645..5ee8938df119ba 100644 --- a/doc/source/advanced.rst +++ b/doc/source/advanced.rst @@ -638,9 +638,11 @@ and allows efficient indexing and storage of an index with a large number of dup .. ipython:: python + from pandas.api.types import CategoricalDtype + df = pd.DataFrame({'A': np.arange(6), 'B': list('aabbca')}) - df['B'] = df['B'].astype('category', categories=list('cab')) + df['B'] = df['B'].astype(CategoricalDtype(list('cab'))) df df.dtypes df.B.cat.categories diff --git a/doc/source/api.rst b/doc/source/api.rst index 1541bbccefe214..399ebec3c83a57 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -646,7 +646,10 @@ strings and apply several methods to it. These can be accessed like Categorical ~~~~~~~~~~~ -If the Series is of dtype ``category``, ``Series.cat`` can be used to change the the categorical +.. autoclass:: api.types.CategoricalDtype + :members: categories, ordered + +If the Series is of dtype ``CategoricalDtype``, ``Series.cat`` can be used to change the categorical data. This accessor is similar to the ``Series.dt`` or ``Series.str`` and has the following usable methods and properties: diff --git a/doc/source/categorical.rst b/doc/source/categorical.rst index 8835c4a1533d0c..2071d86ab753de 100644 --- a/doc/source/categorical.rst +++ b/doc/source/categorical.rst @@ -89,12 +89,22 @@ By passing a :class:`pandas.Categorical` object to a `Series` or assigning it to df["B"] = raw_cat df -You can also specify differently ordered categories or make the resulting data ordered, by passing these arguments to ``astype()``: +Anywhere above we passed a keyword ``dtype='category'``, we used the default behavior of + +1. categories are inferred from the data +2. categories are unordered. 
+ +To control those behaviors, instead of passing ``'category'``, use an instance +of :class:`~pandas.api.types.CategoricalDtype`. .. ipython:: python - s = pd.Series(["a","b","c","a"]) - s_cat = s.astype("category", categories=["b","c","d"], ordered=False) + from pandas.api.types import CategoricalDtype + + s = pd.Series(["a", "b", "c", "a"]) + cat_type = CategoricalDtype(categories=["b", "c", "d"], + ordered=False) + s_cat = s.astype(cat_type) s_cat Categorical data has a specific ``category`` :ref:`dtype `: @@ -133,6 +143,70 @@ constructor to save the factorize step during normal constructor mode: splitter = np.random.choice([0,1], 5, p=[0.5,0.5]) s = pd.Series(pd.Categorical.from_codes(splitter, categories=["train", "test"])) +.. _categorical.categoricaldtype: + +CategoricalDtype +---------------- + +.. versionchanged:: 0.21.0 + +A categorical's type is fully described by + +1. its categories: a sequence of unique values and no missing values +2. its orderedness: a boolean + +This information can be stored in a :class:`~pandas.api.types.CategoricalDtype`. +The ``categories`` argument is optional, which implies that the actual categories +should be inferred from whatever is present in the data when the +:class:`pandas.Categorical` is created. + +.. ipython:: python + + from pandas.api.types import CategoricalDtype + + CategoricalDtype(['a', 'b', 'c']) + CategoricalDtype(['a', 'b', 'c'], ordered=True) + CategoricalDtype() + +A :class:`~pandas.api.types.CategoricalDtype` can be used in any place pandas +expects a `dtype`. For example :func:`pandas.read_csv`, +:func:`pandas.DataFrame.astype`, or in the Series constructor. + +As a convenience, you can use the string ``'category'`` in place of a +:class:`~pandas.api.types.CategoricalDtype` when you want the default behavior of +the categories being unordered, and equal to the set of values present in the +array. In other words, ``dtype='category'`` is equivalent to +``dtype=CategoricalDtype()``. 
+ +Equality Semantics +~~~~~~~~~~~~~~~~~~ + +Two instances of :class:`~pandas.api.types.CategoricalDtype` compare equal whenever they have +the same categories and orderedness. When comparing two unordered categoricals, the +order of the ``categories`` is not considered. + +.. ipython:: python + + c1 = CategoricalDtype(['a', 'b', 'c'], ordered=False) + + # Equal, since order is not considered when ordered=False + c1 == CategoricalDtype(['b', 'c', 'a'], ordered=False) + + # Unequal, since the second CategoricalDtype is ordered + c1 == CategoricalDtype(['a', 'b', 'c'], ordered=True) + +All instances of ``CategoricalDtype`` compare equal to the string ``'category'``. + +.. ipython:: python + + c1 == 'category' + +.. warning:: + + Since ``dtype='category'`` is essentially ``CategoricalDtype(None, False)``, + and since all instances of ``CategoricalDtype`` compare equal to ``'category'``, + all instances of ``CategoricalDtype`` compare equal to a ``CategoricalDtype(None)``. + Description ----------- @@ -182,7 +256,7 @@ It's also possible to pass in the categories in a specific order: .. ipython:: python - s = pd.Series(list('babc')).astype('category', categories=list('abcd')) + s = pd.Series(list('babc')).astype(CategoricalDtype(list('abcd'))) s # categories @@ -295,7 +369,9 @@ meaning and certain operations are possible. If the categorical is unordered, `` s = pd.Series(pd.Categorical(["a","b","c","a"], ordered=False)) s.sort_values(inplace=True) - s = pd.Series(["a","b","c","a"]).astype('category', ordered=True) + s = pd.Series(["a","b","c","a"]).astype( + CategoricalDtype(ordered=True) + ) s.sort_values(inplace=True) s s.min(), s.max() @@ -395,9 +471,15 @@ categories or a categorical with any list-like object, will raise a TypeError. .. 
ipython:: python - cat = pd.Series([1,2,3]).astype("category", categories=[3,2,1], ordered=True) - cat_base = pd.Series([2,2,2]).astype("category", categories=[3,2,1], ordered=True) - cat_base2 = pd.Series([2,2,2]).astype("category", ordered=True) + cat = pd.Series([1,2,3]).astype( + CategoricalDtype([3, 2, 1], ordered=True) + ) + cat_base = pd.Series([2,2,2]).astype( + CategoricalDtype([3, 2, 1], ordered=True) + ) + cat_base2 = pd.Series([2,2,2]).astype( + CategoricalDtype(ordered=True) + ) cat cat_base diff --git a/doc/source/merging.rst b/doc/source/merging.rst index a5ee1b1a9384cc..ace89bcbaa0afe 100644 --- a/doc/source/merging.rst +++ b/doc/source/merging.rst @@ -830,8 +830,10 @@ The left frame. .. ipython:: python + from pandas.api.types import CategoricalDtype + X = pd.Series(np.random.choice(['foo', 'bar'], size=(10,))) - X = X.astype('category', categories=['foo', 'bar']) + X = X.astype(CategoricalDtype(categories=['foo', 'bar'])) left = pd.DataFrame({'X': X, 'Y': np.random.choice(['one', 'two', 'three'], size=(10,))}) @@ -842,8 +844,11 @@ The right frame. .. ipython:: python - right = pd.DataFrame({'X': pd.Series(['foo', 'bar']).astype('category', categories=['foo', 'bar']), - 'Z': [1, 2]}) + right = pd.DataFrame({ + 'X': pd.Series(['foo', 'bar'], + dtype=CategoricalDtype(['foo', 'bar'])), + 'Z': [1, 2] + }) right right.dtypes diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 52e056103cbdc3..44c6c2742534b9 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -10,6 +10,8 @@ users upgrade to this version. Highlights include: - Integration with `Apache Parquet `__, including a new top-level :func:`read_parquet` and :func:`DataFrame.to_parquet` method, see :ref:`here `. +- New user-facing :class:`pandas.api.types.CategoricalDtype` for specifying + categoricals independent of the data (:issue:`14711`, :issue:`15078`) Check the :ref:`API Changes ` and :ref:`deprecations ` before updating. 
@@ -88,6 +90,30 @@ This does not raise any obvious exceptions, but also does not create a new colum Setting a list-like data structure into a new attribute now raise a ``UserWarning`` about the potential for unexpected behavior. See :ref:`Attribute Access `. +.. _whatsnew_0210.enhancements.categorical_dtype: + +``CategoricalDtype`` for specifying categoricals +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:class:`pandas.api.types.CategoricalDtype` has been added to the public API and +expanded to include the ``categories`` and ``ordered`` attributes. A +``CategoricalDtype`` can be used to specify the set of categories and +orderedness of an array, independent of the data themselves. This can be useful, +e.g., when converting string data to a ``Categorical``: + +.. ipython:: python + + from pandas.api.types import CategoricalDtype + + s = pd.Series(['a', 'b', 'c', 'a']) # strings + dtype = CategoricalDtype(categories=['a', 'b', 'c', 'd'], ordered=True) + s.astype(dtype) + +The ``.dtype`` property of a ``Categorical``, ``CategoricalIndex`` or a +``Series`` with categorical type will now return an instance of ``CategoricalDtype``. + +See :ref:`CategoricalDtype ` for more. + .. _whatsnew_0210.enhancements.other: Other Enhancements diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index e67ce2936819f5..372de1d8af9a42 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -23,7 +23,7 @@ is_datetimelike, is_categorical, is_categorical_dtype, - is_integer_dtype, is_bool, + is_integer_dtype, is_list_like, is_sequence, is_scalar) from pandas.core.common import is_null_slice, _maybe_box_datetimelike @@ -202,6 +202,7 @@ class Categorical(PandasObject): categorical, read only. ordered : boolean Whether or not this Categorical is ordered. 
+ dtype : CategoricalDtype Raises ------ @@ -228,7 +229,7 @@ class Categorical(PandasObject): >>> a.min() 'c' """ - dtype = CategoricalDtype() + _dtype = CategoricalDtype() """The dtype (always "category")""" """Whether or not this Categorical is ordered. @@ -248,22 +249,30 @@ class Categorical(PandasObject): __array_priority__ = 1000 _typ = 'categorical' - def __init__(self, values, categories=None, ordered=False, fastpath=False): + def __init__(self, values, categories=None, ordered=None, dtype=None, + fastpath=False): - self._validate_ordered(ordered) + if dtype is not None: + if categories is not None or ordered is not None: + raise ValueError("Cannot specify both `dtype` and `categories`" + " or `ordered`.") + categories = dtype.categories + ordered = dtype.ordered + + if ordered is None: + ordered = False if fastpath: - # fast path + if dtype is None: + dtype = CategoricalDtype(categories, ordered) self._codes = coerce_indexer_dtype(values, categories) - self._categories = self._validate_categories( - categories, fastpath=isinstance(categories, ABCIndexClass)) - self._ordered = ordered + self._dtype = dtype return # sanitize input if is_categorical_dtype(values): - # we are either a Series or a CategoricalIndex + # we are either a Series, CategoricalIndex if isinstance(values, (ABCSeries, ABCCategoricalIndex)): values = values._values @@ -313,7 +322,8 @@ def __init__(self, values, categories=None, ordered=False, fastpath=False): raise NotImplementedError("> 1 ndim Categorical are not " "supported at this time") - categories = self._validate_categories(categories) + if dtype is None or isinstance(dtype, str): + dtype = CategoricalDtype(categories, ordered) else: # there were two ways if categories are present @@ -325,12 +335,15 @@ def __init__(self, values, categories=None, ordered=False, fastpath=False): # make sure that we always have the same type here, no matter what # we get passed in - categories = self._validate_categories(categories) - codes = 
_get_codes_for_values(values, categories) + if dtype is None or isinstance(dtype, str): + dtype = CategoricalDtype(categories, ordered) + + codes = _get_codes_for_values(values, dtype.categories) # TODO: check for old style usage. These warnings should be removes # after 0.18/ in 2016 - if is_integer_dtype(values) and not is_integer_dtype(categories): + if (is_integer_dtype(values) and + not is_integer_dtype(dtype.categories)): warn("Values and categories have different dtypes. Did you " "mean to use\n'Categorical.from_codes(codes, " "categories)'?", RuntimeWarning, stacklevel=2) @@ -341,9 +354,29 @@ def __init__(self, values, categories=None, ordered=False, fastpath=False): "mean to use\n'Categorical.from_codes(codes, " "categories)'?", RuntimeWarning, stacklevel=2) - self.set_ordered(ordered or False, inplace=True) - self._categories = categories - self._codes = coerce_indexer_dtype(codes, categories) + self._dtype = dtype + self._codes = coerce_indexer_dtype(codes, dtype.categories) + + @property + def categories(self): + return self.dtype.categories + + @categories.setter + def categories(self, categories): + new_dtype = CategoricalDtype(categories, ordered=self.ordered) + if (self.dtype.categories is not None and + len(self.dtype.categories) != len(new_dtype.categories)): + raise ValueError("new categories need to have the same number of " + "items as the old categories!") + self._dtype = new_dtype + + @property + def ordered(self): + return self.dtype.ordered + + @property + def dtype(self): + return self._dtype def __dir__(self): # Avoid IPython warnings for deprecated properties @@ -492,7 +525,7 @@ def from_codes(cls, codes, categories, ordered=False): raise ValueError( "codes need to be convertible to an arrays of integers") - categories = cls._validate_categories(categories) + categories = CategoricalDtype._validate_categories(categories) if len(codes) and (codes.max() >= len(categories) or codes.min() < -1): raise ValueError("codes need to be between -1 
and " @@ -535,69 +568,6 @@ def _get_labels(self): labels = property(fget=_get_labels, fset=_set_codes) - _categories = None - - @classmethod - def _validate_ordered(cls, ordered): - """ - Validates that we have a valid ordered parameter. If - it is not a boolean, a TypeError will be raised. - - Parameters - ---------- - ordered : object - The parameter to be verified. - - Raises - ------ - TypeError - If 'ordered' is not a boolean. - """ - if not is_bool(ordered): - raise TypeError("'ordered' must either be 'True' or 'False'") - - @classmethod - def _validate_categories(cls, categories, fastpath=False): - """ - Validates that we have good categories - - Parameters - ---------- - fastpath : boolean (default: False) - Don't perform validation of the categories for uniqueness or nulls - - """ - if not isinstance(categories, ABCIndexClass): - dtype = None - if not hasattr(categories, "dtype"): - if not is_list_like(categories): - raise TypeError("`categories` must be list-like. " - "Got {} instead".format(repr(categories))) - categories = _convert_to_list_like(categories) - # On categories with NaNs, int values would be converted to - # float. Use "object" dtype to prevent this. - if isna(categories).any(): - without_na = np.array([x for x in categories - if notna(x)]) - with_na = np.array(categories) - if with_na.dtype != without_na.dtype: - dtype = "object" - - from pandas import Index - categories = Index(categories, dtype=dtype) - - if not fastpath: - - # Categories cannot contain NaN. - if categories.hasnans: - raise ValueError('Categorial categories cannot be null') - - # Categories must be unique. 
- if not categories.is_unique: - raise ValueError('Categorical categories must be unique') - - return categories - def _set_categories(self, categories, fastpath=False): """ Sets new categories @@ -608,21 +578,13 @@ def _set_categories(self, categories, fastpath=False): """ - categories = self._validate_categories(categories, fastpath=fastpath) - if (not fastpath and self._categories is not None and - len(categories) != len(self._categories)): + new_dtype = CategoricalDtype(categories, self.ordered, fastpath) + if (not fastpath and self.dtype.categories is not None and + len(new_dtype.categories) != len(self.dtype.categories)): raise ValueError("new categories need to have the same number of " "items than the old categories!") - self._categories = categories - - def _get_categories(self): - """ Gets the categories """ - # categories is an Index, which is immutable -> no need to copy - return self._categories - - categories = property(fget=_get_categories, fset=_set_categories, - doc=_categories_doc) + self._dtype = new_dtype def _codes_for_groupby(self, sort): """ @@ -664,7 +626,21 @@ def _codes_for_groupby(self, sort): return self.reorder_categories(cat.categories) - _ordered = None + def _set_dtype(self, dtype): + """Internal method for directly updating the CategoricalDtype + + Parameters + ---------- + dtype : CategoricalDtype + + Notes + ----- + We don't do any validation here. It's assumed that the dtype is + a (valid) instance of `CategoricalDtype`. 
+ """ + codes = _recode_for_categories(self.codes, self.categories, + dtype.categories) + return type(self)(codes, dtype=dtype, fastpath=True) def set_ordered(self, value, inplace=False): """ @@ -679,9 +655,9 @@ def set_ordered(self, value, inplace=False): of this categorical with ordered set to the value """ inplace = validate_bool_kwarg(inplace, 'inplace') - self._validate_ordered(value) + new_dtype = CategoricalDtype(self.categories, ordered=value) cat = self if inplace else self.copy() - cat._ordered = value + cat._dtype = new_dtype if not inplace: return cat @@ -711,12 +687,6 @@ def as_unordered(self, inplace=False): inplace = validate_bool_kwarg(inplace, 'inplace') return self.set_ordered(False, inplace=inplace) - def _get_ordered(self): - """ Gets the ordered attribute """ - return self._ordered - - ordered = property(fget=_get_ordered) - def set_categories(self, new_categories, ordered=None, rename=False, inplace=False): """ Sets the categories to the specified new_categories. @@ -769,22 +739,21 @@ def set_categories(self, new_categories, ordered=None, rename=False, remove_unused_categories """ inplace = validate_bool_kwarg(inplace, 'inplace') - new_categories = self._validate_categories(new_categories) + if ordered is None: + ordered = self.dtype.ordered + new_dtype = CategoricalDtype(new_categories, ordered=ordered) + cat = self if inplace else self.copy() if rename: - if (cat._categories is not None and - len(new_categories) < len(cat._categories)): + if (cat.dtype.categories is not None and + len(new_dtype.categories) < len(cat.dtype.categories)): # remove all _codes which are larger and set to -1/NaN - self._codes[self._codes >= len(new_categories)] = -1 + self._codes[self._codes >= len(new_dtype.categories)] = -1 else: codes = _recode_for_categories(self.codes, self.categories, - new_categories) + new_dtype.categories) cat._codes = codes - cat._categories = new_categories - - if ordered is None: - ordered = self.ordered - cat.set_ordered(ordered, 
inplace=True) + cat._dtype = new_dtype if not inplace: return cat @@ -864,7 +833,7 @@ def reorder_categories(self, new_categories, ordered=None, inplace=False): set_categories """ inplace = validate_bool_kwarg(inplace, 'inplace') - if set(self._categories) != set(new_categories): + if set(self.dtype.categories) != set(new_categories): raise ValueError("items in new_categories are not the same as in " "old categories") return self.set_categories(new_categories, ordered=ordered, @@ -905,15 +874,17 @@ def add_categories(self, new_categories, inplace=False): inplace = validate_bool_kwarg(inplace, 'inplace') if not is_list_like(new_categories): new_categories = [new_categories] - already_included = set(new_categories) & set(self._categories) + already_included = set(new_categories) & set(self.dtype.categories) if len(already_included) != 0: msg = ("new categories must not include old categories: %s" % str(already_included)) raise ValueError(msg) - new_categories = list(self._categories) + list(new_categories) + new_categories = list(self.dtype.categories) + list(new_categories) + new_dtype = CategoricalDtype(new_categories, self.ordered) + cat = self if inplace else self.copy() - cat._categories = self._validate_categories(new_categories) - cat._codes = coerce_indexer_dtype(cat._codes, new_categories) + cat._dtype = new_dtype + cat._codes = coerce_indexer_dtype(cat._codes, new_dtype.categories) if not inplace: return cat @@ -953,8 +924,9 @@ def remove_categories(self, removals, inplace=False): removals = [removals] removal_set = set(list(removals)) - not_included = removal_set - set(self._categories) - new_categories = [c for c in self._categories if c not in removal_set] + not_included = removal_set - set(self.dtype.categories) + new_categories = [c for c in self.dtype.categories + if c not in removal_set] # GH 10156 if any(isna(removals)): @@ -996,8 +968,11 @@ def remove_unused_categories(self, inplace=False): if idx.size != 0 and idx[0] == -1: # na sentinel idx, inv 
= idx[1:], inv - 1 - cat._categories = cat.categories.take(idx) - cat._codes = coerce_indexer_dtype(inv, self._categories) + new_categories = cat.dtype.categories.take(idx) + new_dtype = CategoricalDtype(new_categories, ordered=self.ordered, + fastpath=True) + cat._dtype = new_dtype + cat._codes = coerce_indexer_dtype(inv, new_dtype.categories) if not inplace: return cat @@ -1098,7 +1073,7 @@ def __setstate__(self, state): # Provide compatibility with pre-0.15.0 Categoricals. if '_categories' not in state and '_levels' in state: - state['_categories'] = self._validate_categories(state.pop( + state['_categories'] = self.dtype._validate_categories(state.pop( '_levels')) if '_codes' not in state and 'labels' in state: state['_codes'] = coerce_indexer_dtype( @@ -1113,6 +1088,11 @@ def __setstate__(self, state): else: state['_ordered'] = False + # 0.21.0 CategoricalDtype change + if '_dtype' not in state: + state['_dtype'] = CategoricalDtype(state['_categories'], + state['_ordered']) + for k, v in compat.iteritems(state): setattr(self, k, v) @@ -1122,7 +1102,7 @@ def T(self): @property def nbytes(self): - return self._codes.nbytes + self._categories.values.nbytes + return self._codes.nbytes + self.dtype.categories.values.nbytes def memory_usage(self, deep=False): """ @@ -1147,7 +1127,8 @@ def memory_usage(self, deep=False): -------- numpy.ndarray.nbytes """ - return self._codes.nbytes + self._categories.memory_usage(deep=deep) + return self._codes.nbytes + self.dtype.categories.memory_usage( + deep=deep) @Substitution(klass='Categorical') @Appender(_shared_docs['searchsorted']) @@ -1278,7 +1259,7 @@ def value_counts(self, dropna=True): count = bincount(np.where(mask, code, ncat)) ix = np.append(ix, -1) - ix = self._constructor(ix, categories=cat, ordered=obj.ordered, + ix = self._constructor(ix, dtype=self.dtype, fastpath=True) return Series(count, index=CategoricalIndex(ix), dtype='int64') @@ -1991,8 +1972,7 @@ def is_dtype_equal(self, other): """ try: - return 
(self.categories.equals(other.categories) and - self.ordered == other.ordered) + return hash(self.dtype) == hash(other.dtype) except (AttributeError, TypeError): return False diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index c47e61dc446be2..f60c0d5ffdca0b 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -692,6 +692,40 @@ def is_dtype_equal(source, target): return False +def is_dtype_union_equal(source, target): + """ + Check whether two arrays have compatible dtypes to do a union. + numpy types are checked with ``is_dtype_equal``. Extension types are + checked separately. + + Parameters + ---------- + source : The first dtype to compare + target : The second dtype to compare + + Returns + ---------- + boolean : Whether or not the two dtypes are equal. + + >>> is_dtype_equal("int", int) + True + + >>> is_dtype_equal(CategoricalDtype(['a', 'b'], + ... CategoricalDtype(['b', 'c'])) + True + + >>> is_dtype_equal(CategoricalDtype(['a', 'b'], + ... CategoricalDtype(['b', 'c'], ordered=True)) + False + """ + source = _get_dtype(source) + target = _get_dtype(target) + if is_categorical_dtype(source) and is_categorical_dtype(target): + # ordered False for both + return source.ordered is target.ordered + return is_dtype_equal(source, target) + + def is_any_int_dtype(arr_or_dtype): """ DEPRECATED: This function will be removed in a future version. 
@@ -1671,7 +1705,9 @@ def _coerce_to_dtype(dtype): """ if is_categorical_dtype(dtype): - dtype = CategoricalDtype() + categories = getattr(dtype, 'categories', None) + ordered = getattr(dtype, 'ordered', False) + dtype = CategoricalDtype(categories=categories, ordered=ordered) elif is_datetime64tz_dtype(dtype): dtype = DatetimeTZDtype(dtype) elif is_period_dtype(dtype): diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index dc2c56ea476f9d..ddcaff5bf945f9 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -3,6 +3,7 @@ import re import numpy as np from pandas import compat +from pandas.core.dtypes.generic import ABCIndexClass class ExtensionDtype(object): @@ -110,37 +111,144 @@ class CategoricalDtypeType(type): class CategoricalDtype(ExtensionDtype): """ - A np.dtype duck-typed class, suitable for holding a custom categorical - dtype. - - THIS IS NOT A REAL NUMPY DTYPE, but essentially a sub-class of np.object + Type for categorical data with the categories and orderedness + + .. versionchanged:: 0.21.0 + + Parameters + ---------- + categories : sequence, optional + Must be unique, and must not contain any nulls. + ordered : bool, default False + + Notes + ----- + This class is useful for specifying the type of a ``Categorical`` + independent of the values. See :ref:`categorical.categoricaldtype` + for more. + + Examples + -------- + >>> t = CategoricalDtype(categories=['b', 'a'], ordered=True) + >>> s = Series(['a', 'a', 'b', 'b', 'a'], dtype=t) + >>> pd.Series(['a', 'b', 'a', 'c'], dtype=t) + 0 a + 1 b + 2 a + 3 NaN + dtype: category + Categories (2, object): [b < a] + + See Also + -------- + Categorical """ + # TODO: Document public vs. 
private API name = 'category' type = CategoricalDtypeType kind = 'O' str = '|O08' base = np.dtype('O') - _metadata = [] + _metadata = ['categories', 'ordered'] _cache = {} - def __new__(cls): + def __new__(cls, categories=None, ordered=False, fastpath=False): + from pandas.core.indexes.base import Index - try: - return cls._cache[cls.name] - except KeyError: - c = object.__new__(cls) - cls._cache[cls.name] = c - return c + if categories is not None: + categories = Index(categories, tupleize_cols=False) + # validation + cls._validate_categories(categories, fastpath=fastpath) + cls._validate_ordered(ordered) + categorical = object.__new__(cls) + categorical._categories = categories + categorical._ordered = ordered + return categorical def __hash__(self): - # make myself hashable - return hash(str(self)) + # _hash_categories returns a uint64, so use the negative + # space for when we have unknown categories to avoid a conflict + if self.categories is None: + if self.ordered: + return -1 + else: + return -2 + # We *do* want to include the real self.ordered here + return int(self._hash_categories(self.categories, self.ordered)) def __eq__(self, other): if isinstance(other, compat.string_types): return other == self.name - return isinstance(other, CategoricalDtype) + if not (hasattr(other, 'ordered') and hasattr(other, 'categories')): + return False + elif self.categories is None or other.categories is None: + # We're forced into a suboptimal corner thanks to math and + # backwards compatibility. We require that `CDT(...) == 'category'` + # for all CDTs **including** `CDT(None, ...)`. Therefore, *all* + # CDT(., .) = CDT(None, False) and *all* + # CDT(., .) = CDT(None, True). 
+ return True + elif self.ordered: + return other.ordered and self.categories.equals(other.categories) + elif other.ordered: + return False + else: + # both unordered; this could probably be optimized / cached + return hash(self) == hash(other) + + def __unicode__(self): + tpl = u'CategoricalDtype({}ordered={})' + if self.categories is None: + data = u"None, " + else: + data = self.categories._format_data(name=self.__class__.__name__) + return tpl.format(data, self.ordered) + + def __repr__(self): + return str(self) + + def __getnewargs__(self): + return (self.categories, self.ordered) + + @staticmethod + def _hash_categories(categories, ordered=True): + from pandas.core.util.hashing import ( + hash_array, _combine_hash_arrays, hash_tuples + ) + + if len(categories) and isinstance(categories[0], tuple): + # assumes if any individual category is a tuple, then all our. ATM + # I don't really want to support just some of the categories being + # tuples. + categories = list(categories) # breaks if a np.array of categories + cat_array = hash_tuples(categories) + else: + if categories.dtype == 'O': + types = [type(x) for x in categories] + if not len(set(types)) == 1: + # TODO: hash_array doesn't handle mixed types. It casts + # everything to a str first, which means we treat + # {'1', '2'} the same as {'1', 2} + # find a better solution + cat_array = np.array([hash(x) for x in categories]) + hashed = hash((tuple(categories), ordered)) + return hashed + cat_array = hash_array(np.asarray(categories), categorize=False) + if ordered: + cat_array = np.vstack([ + cat_array, np.arange(len(cat_array), dtype=cat_array.dtype) + ]) + else: + cat_array = [cat_array] + hashed = _combine_hash_arrays(iter(cat_array), + num_items=len(cat_array)) + if len(hashed) == 0: + # bug in Numpy<1.12 for length 0 arrays. 
Just return the correct + # value of 0 + return 0 + else: + return np.bitwise_xor.reduce(hashed) @classmethod def construct_from_string(cls, string): @@ -154,6 +262,68 @@ def construct_from_string(cls, string): raise TypeError("cannot construct a CategoricalDtype") + @staticmethod + def _validate_ordered(ordered): + """ + Validates that we have a valid ordered parameter. If + it is not a boolean, a TypeError will be raised. + + Parameters + ---------- + ordered : object + The parameter to be verified. + + Raises + ------ + TypeError + If 'ordered' is not a boolean. + """ + from pandas.core.dtypes.common import is_bool + if not is_bool(ordered): + raise TypeError("'ordered' must either be 'True' or 'False'") + + @staticmethod + def _validate_categories(categories, fastpath=False): + """ + Validates that we have good categories + + Parameters + ---------- + categories : array-like + fastpath : bool + Whether to skip nan and uniqueness checks + + Returns + ------- + categories : Index + """ + from pandas import Index + + if not isinstance(categories, ABCIndexClass): + categories = Index(categories) + + if not fastpath: + + if categories.hasnans: + raise ValueError('Categorial categories cannot be null') + + if not categories.is_unique: + raise ValueError('Categorical categories must be unique') + + return categories + + @property + def categories(self): + """ + An ``Index`` containing the unique categories allowed. 
+ """ + return self._categories + + @property + def ordered(self): + """Whether the categories have an ordered relationship""" + return self._ordered + class DatetimeTZDtypeType(type): """ diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 008828cf4f309a..e24faeb338c999 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -27,6 +27,7 @@ is_integer, is_float, is_dtype_equal, + is_dtype_union_equal, is_object_dtype, is_categorical_dtype, is_interval_dtype, @@ -847,7 +848,7 @@ def _formatter_func(self): """ return default_pprint - def _format_data(self): + def _format_data(self, name=None): """ Return the formatted data as a unicode string """ @@ -856,9 +857,11 @@ def _format_data(self): display_width, _ = get_console_size() if display_width is None: display_width = get_option('display.width') or 80 + if name is None: + name = self.__class__.__name__ - space1 = "\n%s" % (' ' * (len(self.__class__.__name__) + 1)) - space2 = "\n%s" % (' ' * (len(self.__class__.__name__) + 2)) + space1 = "\n%s" % (' ' * (len(name) + 1)) + space2 = "\n%s" % (' ' * (len(name) + 2)) n = len(self) sep = ',' @@ -2170,7 +2173,10 @@ def union(self, other): if len(self) == 0: return other._get_consensus_name(self) - if not is_dtype_equal(self.dtype, other.dtype): + # TODO: is_dtype_union_equal is a hack around + # 1. buggy joins with duplicates + # 2. 
CategoricalIndex lacking setops + if not is_dtype_union_equal(self.dtype, other.dtype): this = self.astype('O') other = other.astype('O') return this.union(other) diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index ef1dc4d971f37f..86258f6a365309 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -58,16 +58,18 @@ def __new__(cls, data=None, categories=None, ordered=None, dtype=None, copy=False, name=None, fastpath=False, **kwargs): if fastpath: - return cls._simple_new(data, name=name) + return cls._simple_new(data, name=name, dtype=dtype) if name is None and hasattr(data, 'name'): name = data.name if isinstance(data, ABCCategorical): - data = cls._create_categorical(cls, data, categories, ordered) + data = cls._create_categorical(cls, data, categories, ordered, + dtype) elif isinstance(data, CategoricalIndex): data = data._data - data = cls._create_categorical(cls, data, categories, ordered) + data = cls._create_categorical(cls, data, categories, ordered, + dtype) else: # don't allow scalars @@ -114,7 +116,8 @@ def _create_from_codes(self, codes, categories=None, ordered=None, return CategoricalIndex(cat, name=name) @staticmethod - def _create_categorical(self, data, categories=None, ordered=None): + def _create_categorical(self, data, categories=None, ordered=None, + dtype=None): """ *this is an internal non-public method* @@ -125,6 +128,7 @@ def _create_categorical(self, data, categories=None, ordered=None): data : data for new Categorical categories : optional categories, defaults to existing ordered : optional ordered attribute, defaults to existing + dtype : CategoricalDtype, defaults to existing Returns ------- @@ -135,22 +139,30 @@ def _create_categorical(self, data, categories=None, ordered=None): data = data.values if not isinstance(data, ABCCategorical): - ordered = False if ordered is None else ordered + if ordered is None and dtype is None: + ordered = False from pandas.core.categorical 
import Categorical - data = Categorical(data, categories=categories, ordered=ordered) + data = Categorical(data, categories=categories, ordered=ordered, + dtype=dtype) else: + from pandas.core.dtypes.dtypes import CategoricalDtype + if categories is not None: - data = data.set_categories(categories) - if ordered is not None: + data = data.set_categories(categories, ordered=ordered) + elif ordered is not None and ordered != data.ordered: data = data.set_ordered(ordered) + if isinstance(dtype, CategoricalDtype): + # we want to silently ignore dtype='category' + data = data._set_dtype(dtype) return data @classmethod def _simple_new(cls, values, name=None, categories=None, ordered=None, - **kwargs): + dtype=None, **kwargs): result = object.__new__(cls) - values = cls._create_categorical(cls, values, categories, ordered) + values = cls._create_categorical(cls, values, categories, ordered, + dtype=dtype) result._data = values result.name = name for k, v in compat.iteritems(kwargs): @@ -161,16 +173,27 @@ def _simple_new(cls, values, name=None, categories=None, ordered=None, @Appender(_index_shared_docs['_shallow_copy']) def _shallow_copy(self, values=None, categories=None, ordered=None, - **kwargs): + dtype=None, **kwargs): # categories and ordered can't be part of attributes, # as these are properties + # we want to reuse self.dtype if possible, i.e. neither are + # overridden. 
+ if dtype is not None and (categories is not None or + ordered is not None): + raise TypeError + + if categories is None and ordered is None: + dtype = self.dtype if dtype is None else dtype + return super(CategoricalIndex, self)._shallow_copy( + values=values, dtype=dtype, **kwargs) if categories is None: categories = self.categories if ordered is None: ordered = self.ordered - return super(CategoricalIndex, - self)._shallow_copy(values=values, categories=categories, - ordered=ordered, **kwargs) + + return super(CategoricalIndex, self)._shallow_copy( + values=values, categories=categories, + ordered=ordered, **kwargs) def _is_dtype_compat(self, other): """ @@ -236,7 +259,7 @@ def _format_attrs(self): ('ordered', self.ordered)] if self.name is not None: attrs.append(('name', ibase.default_pprint(self.name))) - attrs.append(('dtype', "'%s'" % self.dtype)) + attrs.append(('dtype', "'%s'" % self.dtype.name)) max_seq_items = get_option('display.max_seq_items') or len(self) if len(self) > max_seq_items: attrs.append(('length', len(self))) diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 6e80f6c900386d..0cc5515e5cbc27 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -950,9 +950,10 @@ def _format_native_types(self, na_rep='', quoting=None, **kwargs): na_rep=na_rep, justify='all').get_result() - def _format_data(self): + def _format_data(self, name=None): # TODO: integrate with categorical and make generic + # name argument is unused here; just for compat with base / categorical n = len(self) max_seq_items = min((get_option( 'display.max_seq_items') or n) // 10, 10) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 8b2cf0e7c0b407..f8c141b7e2462e 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -490,7 +490,7 @@ def _format_attrs(self): def _format_space(self): return "\n%s" % (' ' * (len(self.__class__.__name__) + 1)) - def 
_format_data(self): + def _format_data(self, name=None): # we are formatting thru the attributes return None diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index b759abaed4e564..81600f1baa842b 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -189,7 +189,7 @@ def _format_attrs(self): attrs.append(('name', ibase.default_pprint(self.name))) return attrs - def _format_data(self): + def _format_data(self, name=None): # we are formatting thru the attributes return None diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 83b382ec0ed723..e510ca87e44aa7 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -139,14 +139,14 @@ def is_categorical_astype(self, dtype): validate that we have a astypeable to categorical, returns a boolean if we are a categorical """ - if is_categorical_dtype(dtype): - if dtype == CategoricalDtype(): - return True - + if dtype is Categorical or dtype is CategoricalDtype: # this is a pd.Categorical, but is not # a valid type for astypeing raise TypeError("invalid type {0} for astype".format(dtype)) + elif is_categorical_dtype(dtype): + return True + return False def external_values(self, dtype=None): @@ -548,6 +548,18 @@ def _astype(self, dtype, copy=False, errors='raise', values=None, # may need to convert to categorical # this is only called for non-categoricals if self.is_categorical_astype(dtype): + if (('categories' in kwargs or 'ordered' in kwargs) and + isinstance(dtype, CategoricalDtype)): + raise TypeError("Cannot specify a CategoricalDtype and also " + "`categories` or `ordered`. 
Use " + "`dtype=CategoricalDtype(categories, ordered)`" + " instead.") + kwargs = kwargs.copy() + categories = getattr(dtype, 'categories', None) + ordered = getattr(dtype, 'ordered', False) + + kwargs.setdefault('categories', categories) + kwargs.setdefault('ordered', ordered) return self.make_block(Categorical(self.values, **kwargs)) # astype processing diff --git a/pandas/core/series.py b/pandas/core/series.py index ac11c5f908fdcf..bc84bd09f0b443 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2987,7 +2987,8 @@ def _try_cast(arr, take_fast_path): subarr = np.array(subarr, dtype=dtype, copy=copy) except (ValueError, TypeError): if is_categorical_dtype(dtype): - subarr = Categorical(arr) + subarr = Categorical(arr, dtype.categories, + ordered=dtype.ordered) elif dtype is not None and raise_cast_failure: raise else: diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 12e8d8aba91779..27252b9616a445 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -2,7 +2,6 @@ import numpy as np from pandas.compat import long, string_types, PY3 -from pandas.core.categorical import Categorical from pandas.core.dtypes.common import ( _ensure_platform_int, _ensure_int64, @@ -183,6 +182,8 @@ def indexer_from_factorized(labels, shape, compress=True): def lexsort_indexer(keys, orders=None, na_position='last'): + from pandas.core.categorical import Categorical + labels = [] shape = [] if isinstance(orders, bool): diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py index 07e993d7ef5092..0c82773b75c289 100644 --- a/pandas/core/util/hashing.py +++ b/pandas/core/util/hashing.py @@ -260,7 +260,7 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True): # For categoricals, we hash the categories, then remap the codes to the # hash values. (This check is above the complex check so that we don't ask - # numpy if categorical is a subdtype of complex, as it will choke. 
+ # numpy if categorical is a subdtype of complex, as it will choke). if is_categorical_dtype(dtype): return _hash_categorical(vals, encoding, hash_key) diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index 8a36f234484b4a..7827001c3f94c0 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -545,10 +545,11 @@ def test_is_complex_dtype(): (pd.Index([1, 2]), np.dtype('int64')), (pd.Index(['a', 'b']), np.dtype(object)), ('category', 'category'), - (pd.Categorical(['a', 'b']).dtype, CategoricalDtype()), - (pd.Categorical(['a', 'b']), CategoricalDtype()), - (pd.CategoricalIndex(['a', 'b']).dtype, CategoricalDtype()), - (pd.CategoricalIndex(['a', 'b']), CategoricalDtype()), + (pd.Categorical(['a', 'b']).dtype, CategoricalDtype(['a', 'b'])), + (pd.Categorical(['a', 'b']), CategoricalDtype(['a', 'b'])), + (pd.CategoricalIndex(['a', 'b']).dtype, CategoricalDtype(['a', 'b'])), + (pd.CategoricalIndex(['a', 'b']), CategoricalDtype(['a', 'b'])), + (CategoricalDtype(), CategoricalDtype()), (pd.DatetimeIndex([1, 2]), np.dtype('