diff --git a/doc/source/advanced.rst b/doc/source/advanced.rst index 3f145cf9556645..3f53db48ea27c1 100644 --- a/doc/source/advanced.rst +++ b/doc/source/advanced.rst @@ -640,7 +640,7 @@ and allows efficient indexing and storage of an index with a large number of dup df = pd.DataFrame({'A': np.arange(6), 'B': list('aabbca')}) - df['B'] = df['B'].astype('category', categories=list('cab')) + df['B'] = df['B'].astype(pd.api.types.CategoricalDtype(list('cab'))) df df.dtypes df.B.cat.categories diff --git a/doc/source/categorical.rst b/doc/source/categorical.rst index 8835c4a1533d0c..f74d5a8205a178 100644 --- a/doc/source/categorical.rst +++ b/doc/source/categorical.rst @@ -89,12 +89,20 @@ By passing a :class:`pandas.Categorical` object to a `Series` or assigning it to df["B"] = raw_cat df -You can also specify differently ordered categories or make the resulting data ordered, by passing these arguments to ``astype()``: +Anywhere above we passed a keyword ``dtype='category'``, we used the default behavior of + +1. categories are inferred from the data +2. categories are unordered. + +To control those behaviors, instead of passing ``'category'``, use an instance +of :class:`~pd.api.types.CategoricalDtype`. .. ipython:: python - s = pd.Series(["a","b","c","a"]) - s_cat = s.astype("category", categories=["b","c","d"], ordered=False) + s = pd.Series(["a", "b", "c", "a"]) + cat_type = pd.api.types.CategoricalDtype(categories=["b", "c", "d"], + ordered=False) + s_cat = s.astype(cat_type) s_cat Categorical data has a specific ``category`` :ref:`dtype `: @@ -133,6 +141,62 @@ constructor to save the factorize step during normal constructor mode: splitter = np.random.choice([0,1], 5, p=[0.5,0.5]) s = pd.Series(pd.Categorical.from_codes(splitter, categories=["train", "test"])) +CategoricalDtype +---------------- + +.. versionchanged:: 0.21.0 + +A categorical's type is fully described by 1.) its categories (an iterable with +unique values and no missing values), and 2.) 
its orderedness (a boolean). +This information can be stored in a :class:`~pandas.api.types.CategoricalDtype`. +The ``categories`` argument is optional, which implies that the actual categories +should be inferred from whatever is present in the data when the +:class:`pandas.Categorical` is created. + +.. ipython:: python + + pd.api.types.CategoricalDtype(['a', 'b', 'c']) + pd.api.types.CategoricalDtype(['a', 'b', 'c'], ordered=True) + pd.api.types.CategoricalDtype() + +A :class:`~pandas.api.types.CategoricalDtype` can be used in any place pandas +expects a ``dtype``. For example :func:`pandas.read_csv`, +:func:`pandas.DataFrame.astype`, or the Series constructor. + +As a convenience, you can use the string ``'category'`` in place of a +:class:`~pandas.api.types.CategoricalDtype` when you want the default behavior of +the categories being unordered, and equal to the set of values present in the +array. In other words, ``dtype='category'`` is equivalent to +``dtype=pd.api.types.CategoricalDtype()``. + +Equality Semantics +~~~~~~~~~~~~~~~~~~ + +Two instances of :class:`~pandas.api.types.CategoricalDtype` compare equal whenever they have +the same categories and orderedness. When comparing two unordered categoricals, the +order of the ``categories`` is not considered. + +.. ipython:: python + + c1 = pd.api.types.CategoricalDtype(['a', 'b', 'c'], ordered=False) + # Equal, since order is not considered when ordered=False + c1 == pd.api.types.CategoricalDtype(['b', 'c', 'a'], ordered=False) + # Unequal, since the second CategoricalDtype is ordered + c1 == pd.api.types.CategoricalDtype(['a', 'b', 'c'], ordered=True) + +All instances of ``CategoricalDtype`` compare equal to the string ``'category'``. + +.. ipython:: python + + c1 == 'category' + + +.. 
warning:: + + Since ``dtype='category'`` is essentially ``CategoricalDtype(None, False)``, + and since all instances of ``CategoricalDtype`` compare equal to ``'category'``, + all instances of ``CategoricalDtype`` compare equal to a ``CategoricalDtype(None)``. + Description ----------- @@ -182,7 +246,9 @@ It's also possible to pass in the categories in a specific order: .. ipython:: python - s = pd.Series(list('babc')).astype('category', categories=list('abcd')) + s = pd.Series(list('babc')).astype( + pd.api.types.CategoricalDtype(list('abcd')) + ) s # categories @@ -295,7 +361,9 @@ meaning and certain operations are possible. If the categorical is unordered, `` s = pd.Series(pd.Categorical(["a","b","c","a"], ordered=False)) s.sort_values(inplace=True) - s = pd.Series(["a","b","c","a"]).astype('category', ordered=True) + s = pd.Series(["a","b","c","a"]).astype( + pd.api.types.CategoricalDtype(ordered=True) + ) s.sort_values(inplace=True) s s.min(), s.max() @@ -395,9 +463,15 @@ categories or a categorical with any list-like object, will raise a TypeError. .. ipython:: python - cat = pd.Series([1,2,3]).astype("category", categories=[3,2,1], ordered=True) - cat_base = pd.Series([2,2,2]).astype("category", categories=[3,2,1], ordered=True) - cat_base2 = pd.Series([2,2,2]).astype("category", ordered=True) + cat = pd.Series([1,2,3]).astype( + pd.api.types.CategoricalDtype([3, 2, 1], ordered=True) + ) + cat_base = pd.Series([2,2,2]).astype( + pd.api.types.CategoricalDtype([3, 2, 1], ordered=True) + ) + cat_base2 = pd.Series([2,2,2]).astype( + pd.api.types.CategoricalDtype(ordered=True) + ) cat cat_base diff --git a/doc/source/merging.rst b/doc/source/merging.rst index a5ee1b1a9384cc..44e086e79b1ece 100644 --- a/doc/source/merging.rst +++ b/doc/source/merging.rst @@ -831,7 +831,7 @@ The left frame. .. 
ipython:: python X = pd.Series(np.random.choice(['foo', 'bar'], size=(10,))) - X = X.astype('category', categories=['foo', 'bar']) + X = X.astype(pd.api.types.CategoricalDtype(categories=['foo', 'bar'])) left = pd.DataFrame({'X': X, 'Y': np.random.choice(['one', 'two', 'three'], size=(10,))}) @@ -842,8 +842,13 @@ The right frame. .. ipython:: python - right = pd.DataFrame({'X': pd.Series(['foo', 'bar']).astype('category', categories=['foo', 'bar']), - 'Z': [1, 2]}) + from pandas.api.types import CategoricalDtype + + right = pd.DataFrame({ + 'X': pd.Series(['foo', 'bar'], + dtype=CategoricalDtype(['foo', 'bar'])), + 'Z': [1, 2] + }) right right.dtypes diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 6ffa903c741500..0c26f725dc20c5 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -22,6 +22,8 @@ Check the :ref:`API Changes ` and :ref:`deprecations New features ~~~~~~~~~~~~ +- New user-facing :class:`pandas.api.types.CategoricalDtype` for specifying + categoricals independent of the data (:issue:`14711`, :issue:`15078`) - Support for `PEP 519 -- Adding a file system path protocol `_ on most readers and writers (:issue:`13823`) - Added ``__fspath__`` method to :class:`~pandas.HDFStore`, :class:`~pandas.ExcelFile`, @@ -88,6 +90,30 @@ This does not raise any obvious exceptions, but also does not create a new colum Setting a list-like data structure into a new attribute now raise a ``UserWarning`` about the potential for unexpected behavior. See :ref:`Attribute Access `. +.. _whatsnew_0210.enhancements.categorical_dtype: + +``CategoricalDtype`` for specifying categoricals +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:class:`pandas.api.types.CategoricalDtype` has been added to the public API and +expanded to include the ``categories`` and ``ordered`` attributes. A +``CategoricalDtype`` can be used to specify the set of categories and +orderedness of an array, independent of the data themselves. 
This can be useful, +e.g., when converting string data to a ``Categorical``: + +.. ipython:: python + + from pandas.api.types import CategoricalDtype + + s = pd.Series(['a', 'b', 'c', 'a']) # strings + dtype = CategoricalDtype(categories=['a', 'b', 'c', 'd'], ordered=True) + s.astype(dtype) + +The ``.dtype`` property of a ``Categorical``, ``CategoricalIndex`` or a +``Series`` with categorical type will now return an instance of ``CategoricalDtype``. + +See :ref:`CategoricalDtype ` for more. + .. _whatsnew_0210.enhancements.other: Other Enhancements diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index dbd2a79b7e46d9..adf278cd27daad 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -23,7 +23,7 @@ is_datetimelike, is_categorical, is_categorical_dtype, - is_integer_dtype, is_bool, + is_integer_dtype, is_list_like, is_sequence, is_scalar) from pandas.core.common import is_null_slice @@ -228,7 +228,7 @@ class Categorical(PandasObject): >>> a.min() 'c' """ - dtype = CategoricalDtype() + _dtype = CategoricalDtype() """The dtype (always "category")""" """Whether or not this Categorical is ordered. 
@@ -250,20 +250,15 @@ class Categorical(PandasObject): def __init__(self, values, categories=None, ordered=False, fastpath=False): - self._validate_ordered(ordered) - if fastpath: - # fast path + self._dtype = CategoricalDtype(categories, ordered) self._codes = coerce_indexer_dtype(values, categories) - self._categories = self._validate_categories( - categories, fastpath=isinstance(categories, ABCIndexClass)) - self._ordered = ordered return # sanitize input if is_categorical_dtype(values): - # we are either a Series or a CategoricalIndex + # we are either a Series, CategoricalIndex or CategoricalDtype if isinstance(values, (ABCSeries, ABCCategoricalIndex)): values = values._values @@ -313,7 +308,7 @@ def __init__(self, values, categories=None, ordered=False, fastpath=False): raise NotImplementedError("> 1 ndim Categorical are not " "supported at this time") - categories = self._validate_categories(categories) + dtype = CategoricalDtype(categories, ordered) else: # there were two ways if categories are present @@ -325,12 +320,13 @@ def __init__(self, values, categories=None, ordered=False, fastpath=False): # make sure that we always have the same type here, no matter what # we get passed in - categories = self._validate_categories(categories) - codes = _get_codes_for_values(values, categories) + dtype = CategoricalDtype(categories, ordered) + codes = _get_codes_for_values(values, dtype.categories) # TODO: check for old style usage. These warnings should be removes # after 0.18/ in 2016 - if is_integer_dtype(values) and not is_integer_dtype(categories): + if (is_integer_dtype(values) and + not is_integer_dtype(dtype.categories)): warn("Values and categories have different dtypes. 
Did you " "mean to use\n'Categorical.from_codes(codes, " "categories)'?", RuntimeWarning, stacklevel=2) @@ -341,9 +337,29 @@ def __init__(self, values, categories=None, ordered=False, fastpath=False): "mean to use\n'Categorical.from_codes(codes, " "categories)'?", RuntimeWarning, stacklevel=2) - self.set_ordered(ordered or False, inplace=True) - self._categories = categories - self._codes = coerce_indexer_dtype(codes, categories) + self._dtype = dtype + self._codes = coerce_indexer_dtype(codes, dtype.categories) + + @property + def categories(self): + return self.dtype.categories + + @categories.setter + def categories(self, categories): + new_dtype = CategoricalDtype(categories, ordered=self.ordered) + if (self.dtype.categories is not None and + len(self.dtype.categories) != len(new_dtype.categories)): + raise ValueError("new categories need to have the same number of " + "items as the old categories!") + self._dtype = new_dtype + + @property + def ordered(self): + return self.dtype.ordered + + @property + def dtype(self): + return self._dtype def __dir__(self): # Avoid IPython warnings for deprecated properties @@ -480,13 +496,16 @@ def from_codes(cls, codes, categories, ordered=False): categorical. If not given, the resulting categorical will be unordered. """ + from pandas import Index + try: codes = np.asarray(codes, np.int64) except: raise ValueError( "codes need to be convertible to an arrays of integers") - categories = cls._validate_categories(categories) + # have to use the instance, not property + categories = cls._dtype._validate_categories(Index(categories)) if len(codes) and (codes.max() >= len(categories) or codes.min() < -1): raise ValueError("codes need to be between -1 and " @@ -529,69 +548,6 @@ def _get_labels(self): labels = property(fget=_get_labels, fset=_set_codes) - _categories = None - - @classmethod - def _validate_ordered(cls, ordered): - """ - Validates that we have a valid ordered parameter. 
If - it is not a boolean, a TypeError will be raised. - - Parameters - ---------- - ordered : object - The parameter to be verified. - - Raises - ------ - TypeError - If 'ordered' is not a boolean. - """ - if not is_bool(ordered): - raise TypeError("'ordered' must either be 'True' or 'False'") - - @classmethod - def _validate_categories(cls, categories, fastpath=False): - """ - Validates that we have good categories - - Parameters - ---------- - fastpath : boolean (default: False) - Don't perform validation of the categories for uniqueness or nulls - - """ - if not isinstance(categories, ABCIndexClass): - dtype = None - if not hasattr(categories, "dtype"): - if not is_list_like(categories): - raise TypeError("`categories` must be list-like. " - "Got {} instead".format(repr(categories))) - categories = _convert_to_list_like(categories) - # On categories with NaNs, int values would be converted to - # float. Use "object" dtype to prevent this. - if isna(categories).any(): - without_na = np.array([x for x in categories - if notna(x)]) - with_na = np.array(categories) - if with_na.dtype != without_na.dtype: - dtype = "object" - - from pandas import Index - categories = Index(categories, dtype=dtype) - - if not fastpath: - - # Categories cannot contain NaN. - if categories.hasnans: - raise ValueError('Categorial categories cannot be null') - - # Categories must be unique. 
- if not categories.is_unique: - raise ValueError('Categorical categories must be unique') - - return categories - def _set_categories(self, categories, fastpath=False): """ Sets new categories @@ -602,21 +558,13 @@ def _set_categories(self, categories, fastpath=False): """ - categories = self._validate_categories(categories, fastpath=fastpath) - if (not fastpath and self._categories is not None and - len(categories) != len(self._categories)): + new = CategoricalDtype(categories, self.ordered, fastpath) + if (not fastpath and self.dtype.categories is not None and + len(new.categories) != len(self.dtype.categories)): raise ValueError("new categories need to have the same number of " "items than the old categories!") - self._categories = categories - - def _get_categories(self): - """ Gets the categories """ - # categories is an Index, which is immutable -> no need to copy - return self._categories - - categories = property(fget=_get_categories, fset=_set_categories, - doc=_categories_doc) + self._dtype = new def _codes_for_groupby(self, sort): """ @@ -658,8 +606,6 @@ def _codes_for_groupby(self, sort): return self.reorder_categories(cat.categories) - _ordered = None - def set_ordered(self, value, inplace=False): """ Sets the ordered attribute to the boolean value @@ -673,9 +619,9 @@ def set_ordered(self, value, inplace=False): of this categorical with ordered set to the value """ inplace = validate_bool_kwarg(inplace, 'inplace') - self._validate_ordered(value) + new = CategoricalDtype(self.categories, ordered=value) cat = self if inplace else self.copy() - cat._ordered = value + cat._dtype = new if not inplace: return cat @@ -705,12 +651,6 @@ def as_unordered(self, inplace=False): inplace = validate_bool_kwarg(inplace, 'inplace') return self.set_ordered(False, inplace=inplace) - def _get_ordered(self): - """ Gets the ordered attribute """ - return self._ordered - - ordered = property(fget=_get_ordered) - def set_categories(self, new_categories, ordered=None, 
rename=False, inplace=False): """ Sets the categories to the specified new_categories. @@ -763,21 +703,20 @@ def set_categories(self, new_categories, ordered=None, rename=False, remove_unused_categories """ inplace = validate_bool_kwarg(inplace, 'inplace') - new_categories = self._validate_categories(new_categories) + if ordered is None: + ordered = self.dtype.ordered + new_dtype = CategoricalDtype(new_categories, ordered=ordered) + cat = self if inplace else self.copy() if rename: - if (cat._categories is not None and - len(new_categories) < len(cat._categories)): + if (cat.dtype.categories is not None and + len(new_dtype.categories) < len(cat.dtype.categories)): # remove all _codes which are larger and set to -1/NaN - self._codes[self._codes >= len(new_categories)] = -1 + self._codes[self._codes >= len(new_dtype.categories)] = -1 else: values = cat.__array__() - cat._codes = _get_codes_for_values(values, new_categories) - cat._categories = new_categories - - if ordered is None: - ordered = self.ordered - cat.set_ordered(ordered, inplace=True) + cat._codes = _get_codes_for_values(values, new_dtype.categories) + cat._dtype = new_dtype if not inplace: return cat @@ -857,7 +796,7 @@ def reorder_categories(self, new_categories, ordered=None, inplace=False): set_categories """ inplace = validate_bool_kwarg(inplace, 'inplace') - if set(self._categories) != set(new_categories): + if set(self.dtype.categories) != set(new_categories): raise ValueError("items in new_categories are not the same as in " "old categories") return self.set_categories(new_categories, ordered=ordered, @@ -898,15 +837,17 @@ def add_categories(self, new_categories, inplace=False): inplace = validate_bool_kwarg(inplace, 'inplace') if not is_list_like(new_categories): new_categories = [new_categories] - already_included = set(new_categories) & set(self._categories) + already_included = set(new_categories) & set(self.dtype.categories) if len(already_included) != 0: msg = ("new categories must not 
include old categories: %s" % str(already_included)) raise ValueError(msg) - new_categories = list(self._categories) + list(new_categories) + new_categories = list(self.dtype.categories) + list(new_categories) + new_dtype = CategoricalDtype(new_categories, self.ordered) + cat = self if inplace else self.copy() - cat._categories = self._validate_categories(new_categories) - cat._codes = coerce_indexer_dtype(cat._codes, new_categories) + cat._dtype = new_dtype + cat._codes = coerce_indexer_dtype(cat._codes, new_dtype.categories) if not inplace: return cat @@ -946,8 +887,9 @@ def remove_categories(self, removals, inplace=False): removals = [removals] removal_set = set(list(removals)) - not_included = removal_set - set(self._categories) - new_categories = [c for c in self._categories if c not in removal_set] + not_included = removal_set - set(self.dtype.categories) + new_categories = [c for c in self.dtype.categories + if c not in removal_set] # GH 10156 if any(isna(removals)): @@ -989,8 +931,11 @@ def remove_unused_categories(self, inplace=False): if idx.size != 0 and idx[0] == -1: # na sentinel idx, inv = idx[1:], inv - 1 - cat._categories = cat.categories.take(idx) - cat._codes = coerce_indexer_dtype(inv, self._categories) + new_categories = cat.dtype.categories.take(idx) + new_dtype = CategoricalDtype(new_categories, ordered=self.ordered, + fastpath=True) + cat._dtype = new_dtype + cat._codes = coerce_indexer_dtype(inv, new_dtype.categories) if not inplace: return cat @@ -1091,7 +1036,7 @@ def __setstate__(self, state): # Provide compatibility with pre-0.15.0 Categoricals. 
if '_categories' not in state and '_levels' in state: - state['_categories'] = self._validate_categories(state.pop( + state['_categories'] = self.dtype._validate_categories(state.pop( '_levels')) if '_codes' not in state and 'labels' in state: state['_codes'] = coerce_indexer_dtype( @@ -1106,6 +1051,11 @@ def __setstate__(self, state): else: state['_ordered'] = False + # 0.21.0 CategoricalDtype change + if '_dtype' not in state: + state['_dtype'] = CategoricalDtype(state['_categories'], + state['_ordered']) + for k, v in compat.iteritems(state): setattr(self, k, v) @@ -1115,7 +1065,7 @@ def T(self): @property def nbytes(self): - return self._codes.nbytes + self._categories.values.nbytes + return self._codes.nbytes + self.dtype.categories.values.nbytes def memory_usage(self, deep=False): """ @@ -1140,7 +1090,8 @@ def memory_usage(self, deep=False): -------- numpy.ndarray.nbytes """ - return self._codes.nbytes + self._categories.memory_usage(deep=deep) + return self._codes.nbytes + self.dtype.categories.memory_usage( + deep=deep) @Substitution(klass='Categorical') @Appender(_shared_docs['searchsorted']) @@ -1984,8 +1935,7 @@ def is_dtype_equal(self, other): """ try: - return (self.categories.equals(other.categories) and - self.ordered == other.ordered) + return hash(self.dtype) == hash(other.dtype) except (AttributeError, TypeError): return False diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index c47e61dc446be2..1ac9f37c3e3847 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -692,6 +692,21 @@ def is_dtype_equal(source, target): return False +def _is_dtype_union_equal(source, target): + """ + Check whether two arrays have compatible dtypes to do a unoin. + numpy types are checked with ``is_dtype_equal``. Extension types are + checked separately. 
+ """ + source = _get_dtype(source) + target = _get_dtype(target) + if is_categorical_dtype(source) and is_categorical_dtype(target): + # ordered False for both + return source.ordered is target.ordered + else: + return is_dtype_equal(source, target) + + def is_any_int_dtype(arr_or_dtype): """ DEPRECATED: This function will be removed in a future version. @@ -1671,7 +1686,9 @@ def _coerce_to_dtype(dtype): """ if is_categorical_dtype(dtype): - dtype = CategoricalDtype() + categories = getattr(dtype, 'categories', None) + ordered = getattr(dtype, 'ordered', False) + dtype = CategoricalDtype(categories=categories, ordered=ordered) elif is_datetime64tz_dtype(dtype): dtype = DatetimeTZDtype(dtype) elif is_period_dtype(dtype): diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index dc2c56ea476f9d..e40f6995657f73 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -110,37 +110,144 @@ class CategoricalDtypeType(type): class CategoricalDtype(ExtensionDtype): """ - A np.dtype duck-typed class, suitable for holding a custom categorical - dtype. - - THIS IS NOT A REAL NUMPY DTYPE, but essentially a sub-class of np.object + Type for categorical data with the categories and orderedness + + .. versionchanged:: 0.21.0 + + Parameters + ---------- + categories : sequence, optional + Must be unique, and must not contain any nulls. + ordered : bool, default False + + Notes + ----- + This class is useful for specifying the type of a ``Categorical`` + independent of the values. + + Examples + -------- + >>> t = CategoricalDtype(categories=['b', 'a'], ordered=True) + >>> s = Series(['a', 'a', 'b', 'b', 'a'], dtype=t) + >>> pd.Series(['a', 'b', 'a', 'c'], dtype=t) + 0 a + 1 b + 2 a + 3 NaN + dtype: category + Categories (2, object): [b < a] + + See Also + -------- + Categorical """ + # TODO: Document public vs. 
private API name = 'category' type = CategoricalDtypeType kind = 'O' str = '|O08' base = np.dtype('O') - _metadata = [] + _metadata = ['categories', 'ordered'] _cache = {} - def __new__(cls): + def __new__(cls, categories=None, ordered=False, fastpath=False): + from pandas.core.indexes.base import Index - try: - return cls._cache[cls.name] - except KeyError: - c = object.__new__(cls) - cls._cache[cls.name] = c - return c + if categories is not None: + categories = Index(categories, tupleize_cols=False) + # validation + cls._validate_categories(categories, fastpath=fastpath) + cls._validate_ordered(ordered) + categorical = object.__new__(cls) + categorical._categories = categories + categorical._ordered = ordered + return categorical def __hash__(self): - # make myself hashable - return hash(str(self)) + # _hash_categories returns a uint64, so use the negative + # space for when we have unknown categories to avoid a conflict + if self.categories is None: + if self.ordered: + return -1 + else: + return -2 + # We *do* want to include the real self.ordered here + return int(self._hash_categories(self.categories, self.ordered)) def __eq__(self, other): if isinstance(other, compat.string_types): return other == self.name - return isinstance(other, CategoricalDtype) + if not (hasattr(other, 'ordered') and hasattr(other, 'categories')): + return False + elif self.categories is None or other.categories is None: + # We're forced into a suboptimal corner thanks to math and + # backwards compatibility. We require that `CDT(...) == 'category'` + # for all CDTs **including** `CDT(None, ...)`. Therefore, *all* + # CDT(., .) = CDT(None, False) and *all* + # CDT(., .) = CDT(None, True). 
+ return True + elif self.ordered: + return other.ordered and self.categories.equals(other.categories) + elif other.ordered: + return False + else: + # both unordered; this could probably be optimized / cached + return hash(self) == hash(other) + + def __unicode__(self): + tpl = u'CategoricalDtype({}ordered={})' + if self.categories is None: + data = u"None, " + else: + data = self.categories._format_data(name=self.__class__.__name__) + return tpl.format(data, self.ordered) + + def __repr__(self): + return str(self) + + def __getnewargs__(self): + return (self.categories, self.ordered) + + @staticmethod + def _hash_categories(categories, ordered=True): + from pandas.core.util.hashing import ( + hash_array, _combine_hash_arrays, hash_tuples + ) + + categories = np.asarray(categories) + if len(categories) and isinstance(categories[0], tuple): + # assumes if any individual category is a tuple, then all our. ATM + # I don't really want to support just some of the categories being + # tuples. + categories = list(categories) # breaks if a np.array of categories + cat_array = hash_tuples(categories) + else: + if categories.dtype == 'O': + types = [type(x) for x in categories] + if not len(set(types)) == 1: + # TODO: hash_array doesn't handle mixed types. It casts + # everything to a str first, which means we treat + # {'1', '2'} the same as {'1', 2} + # find a better solution + cat_array = np.array([hash(x) for x in categories]) + hashed = hash((tuple(categories), ordered)) + return hashed + cat_array = hash_array(np.asarray(categories), categorize=False) + if ordered: + cat_array = np.vstack([ + cat_array, np.arange(len(cat_array), dtype=cat_array.dtype) + ]) + else: + cat_array = [cat_array] + hashed = _combine_hash_arrays(iter(cat_array), + num_items=len(cat_array)) + if len(hashed) == 0: + # bug in Numpy<1.12 for length 0 arrays. 
Just return the correct + # value of 0 + return 0 + else: + return np.bitwise_xor.reduce(hashed) @classmethod def construct_from_string(cls, string): @@ -154,6 +261,65 @@ def construct_from_string(cls, string): raise TypeError("cannot construct a CategoricalDtype") + @staticmethod + def _validate_ordered(ordered): + """ + Validates that we have a valid ordered parameter. If + it is not a boolean, a TypeError will be raised. + + Parameters + ---------- + ordered : object + The parameter to be verified. + + Raises + ------ + TypeError + If 'ordered' is not a boolean. + """ + from pandas.core.dtypes.common import is_bool + if not is_bool(ordered): + raise TypeError("'ordered' must either be 'True' or 'False'") + + @staticmethod + def _validate_categories(categories, fastpath=False): + """ + Validates that we have good categories + + Parameters + ---------- + categories : array-like + fastpath : bool + Whether to skip nan and uniqueness checks + + Returns + ------- + categories : Index + """ + from pandas.core.dtypes.generic import ABCIndexClass + from pandas import Index + + if not isinstance(categories, ABCIndexClass): + categories = Index(categories) + + if not fastpath: + + if categories.hasnans: + raise ValueError('Categorial categories cannot be null') + + if not categories.is_unique: + raise ValueError('Categorical categories must be unique') + + return categories + + @property + def categories(self): + return self._categories + + @property + def ordered(self): + return self._ordered + class DatetimeTZDtypeType(type): """ diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 008828cf4f309a..326f8876468fb4 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -27,6 +27,7 @@ is_integer, is_float, is_dtype_equal, + _is_dtype_union_equal, is_object_dtype, is_categorical_dtype, is_interval_dtype, @@ -847,7 +848,7 @@ def _formatter_func(self): """ return default_pprint - def _format_data(self): + def _format_data(self, 
name=None): """ Return the formatted data as a unicode string """ @@ -856,9 +857,11 @@ def _format_data(self): display_width, _ = get_console_size() if display_width is None: display_width = get_option('display.width') or 80 + if name is None: + name = self.__class__.__name__ - space1 = "\n%s" % (' ' * (len(self.__class__.__name__) + 1)) - space2 = "\n%s" % (' ' * (len(self.__class__.__name__) + 2)) + space1 = "\n%s" % (' ' * (len(name) + 1)) + space2 = "\n%s" % (' ' * (len(name) + 2)) n = len(self) sep = ',' @@ -2170,7 +2173,11 @@ def union(self, other): if len(self) == 0: return other._get_consensus_name(self) - if not is_dtype_equal(self.dtype, other.dtype): + # TODO: _is_dtype_union_equal is a hack around lack of + # 1. buggy Multiset joins + # 2. CategoricalIndex lacking setops + # I'd like to fix those before merging CategoricalDtype + if not _is_dtype_union_equal(self.dtype, other.dtype): this = self.astype('O') other = other.astype('O') return this.union(other) diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index baa3ebce6abbcc..ad14fcc6a03991 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -232,7 +232,7 @@ def _format_attrs(self): ('ordered', self.ordered)] if self.name is not None: attrs.append(('name', ibase.default_pprint(self.name))) - attrs.append(('dtype', "'%s'" % self.dtype)) + attrs.append(('dtype', "'%s'" % self.dtype.name)) max_seq_items = get_option('display.max_seq_items') or len(self) if len(self) > max_seq_items: attrs.append(('length', len(self))) diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index e0ed6c7ea35c0c..265f39a9f05223 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -944,9 +944,10 @@ def _format_native_types(self, na_rep='', quoting=None, **kwargs): na_rep=na_rep, justify='all').get_result() - def _format_data(self): + def _format_data(self, name=None): # TODO: integrate with categorical and 
make generic + # name argument is unused here; just for compat with base / categorical n = len(self) max_seq_items = min((get_option( 'display.max_seq_items') or n) // 10, 10) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 8b2cf0e7c0b407..f8c141b7e2462e 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -490,7 +490,7 @@ def _format_attrs(self): def _format_space(self): return "\n%s" % (' ' * (len(self.__class__.__name__) + 1)) - def _format_data(self): + def _format_data(self, name=None): # we are formatting thru the attributes return None diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index b759abaed4e564..81600f1baa842b 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -189,7 +189,7 @@ def _format_attrs(self): attrs.append(('name', ibase.default_pprint(self.name))) return attrs - def _format_data(self): + def _format_data(self, name=None): # we are formatting thru the attributes return None diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 83b382ec0ed723..e510ca87e44aa7 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -139,14 +139,14 @@ def is_categorical_astype(self, dtype): validate that we have a astypeable to categorical, returns a boolean if we are a categorical """ - if is_categorical_dtype(dtype): - if dtype == CategoricalDtype(): - return True - + if dtype is Categorical or dtype is CategoricalDtype: # this is a pd.Categorical, but is not # a valid type for astypeing raise TypeError("invalid type {0} for astype".format(dtype)) + elif is_categorical_dtype(dtype): + return True + return False def external_values(self, dtype=None): @@ -548,6 +548,18 @@ def _astype(self, dtype, copy=False, errors='raise', values=None, # may need to convert to categorical # this is only called for non-categoricals if self.is_categorical_astype(dtype): + if (('categories' in kwargs or 'ordered' in kwargs) and + 
isinstance(dtype, CategoricalDtype)): + raise TypeError("Cannot specify a CategoricalDtype and also " + "`categories` or `ordered`. Use " + "`dtype=CategoricalDtype(categories, ordered)`" + " instead.") + kwargs = kwargs.copy() + categories = getattr(dtype, 'categories', None) + ordered = getattr(dtype, 'ordered', False) + + kwargs.setdefault('categories', categories) + kwargs.setdefault('ordered', ordered) return self.make_block(Categorical(self.values, **kwargs)) # astype processing diff --git a/pandas/core/series.py b/pandas/core/series.py index ac11c5f908fdcf..bc84bd09f0b443 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2987,7 +2987,8 @@ def _try_cast(arr, take_fast_path): subarr = np.array(subarr, dtype=dtype, copy=copy) except (ValueError, TypeError): if is_categorical_dtype(dtype): - subarr = Categorical(arr) + subarr = Categorical(arr, dtype.categories, + ordered=dtype.ordered) elif dtype is not None and raise_cast_failure: raise else: diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 12e8d8aba91779..27252b9616a445 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -2,7 +2,6 @@ import numpy as np from pandas.compat import long, string_types, PY3 -from pandas.core.categorical import Categorical from pandas.core.dtypes.common import ( _ensure_platform_int, _ensure_int64, @@ -183,6 +182,8 @@ def indexer_from_factorized(labels, shape, compress=True): def lexsort_indexer(keys, orders=None, na_position='last'): + from pandas.core.categorical import Categorical + labels = [] shape = [] if isinstance(orders, bool): diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py index 07e993d7ef5092..0c82773b75c289 100644 --- a/pandas/core/util/hashing.py +++ b/pandas/core/util/hashing.py @@ -260,7 +260,7 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True): # For categoricals, we hash the categories, then remap the codes to the # hash values. 
(This check is above the complex check so that we don't ask - # numpy if categorical is a subdtype of complex, as it will choke. + # numpy if categorical is a subdtype of complex, as it will choke). if is_categorical_dtype(dtype): return _hash_categorical(vals, encoding, hash_key) diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index 8a36f234484b4a..7827001c3f94c0 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -545,10 +545,11 @@ def test_is_complex_dtype(): (pd.Index([1, 2]), np.dtype('int64')), (pd.Index(['a', 'b']), np.dtype(object)), ('category', 'category'), - (pd.Categorical(['a', 'b']).dtype, CategoricalDtype()), - (pd.Categorical(['a', 'b']), CategoricalDtype()), - (pd.CategoricalIndex(['a', 'b']).dtype, CategoricalDtype()), - (pd.CategoricalIndex(['a', 'b']), CategoricalDtype()), + (pd.Categorical(['a', 'b']).dtype, CategoricalDtype(['a', 'b'])), + (pd.Categorical(['a', 'b']), CategoricalDtype(['a', 'b'])), + (pd.CategoricalIndex(['a', 'b']).dtype, CategoricalDtype(['a', 'b'])), + (pd.CategoricalIndex(['a', 'b']), CategoricalDtype(['a', 'b'])), + (CategoricalDtype(), CategoricalDtype()), (pd.DatetimeIndex([1, 2]), np.dtype('