From 7314570a0e26616cb69ef69f2c6935dcf0bb02c9 Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Thu, 24 Aug 2017 16:04:02 -0500
Subject: [PATCH] ENH: Parametrized CategoricalDtype

We extended the CategoricalDtype to accept optional categories and ordered
argument.

```python
pd.CategoricalDtype(categories=['a', 'b'], ordered=True
```

CategoricalDtype is now part of the public API. This allows users to
specify the desired categories and orderedness of an operation ahead of time.
The current behavior, which is still possible with categories=None, the
default, is to infer the categories from whatever is present.

This change will make it easy to implement support for specifying categories
that are know ahead of time in other places e.g. .astype, .read_csv, and the
Series constructor.

Closes #14711
Closes #15078
Closes #14676
---
 doc/source/advanced.rst                       |   4 +-
 doc/source/api.rst                            |   5 +-
 doc/source/categorical.rst                    |  98 +++++++-
 doc/source/merging.rst                        |  11 +-
 doc/source/whatsnew/v0.21.0.txt               |  26 ++
 pandas/core/categorical.py                    | 232 ++++++++----------
 pandas/core/dtypes/common.py                  |  38 ++-
 pandas/core/dtypes/dtypes.py                  | 200 +++++++++++++--
 pandas/core/indexes/base.py                   |  14 +-
 pandas/core/indexes/category.py               |  53 ++--
 pandas/core/indexes/interval.py               |   3 +-
 pandas/core/indexes/multi.py                  |   2 +-
 pandas/core/indexes/range.py                  |   2 +-
 pandas/core/internals.py                      |  20 +-
 pandas/core/series.py                         |   3 +-
 pandas/core/sorting.py                        |   3 +-
 pandas/core/util/hashing.py                   |   2 +-
 pandas/tests/dtypes/test_common.py            |   9 +-
 pandas/tests/dtypes/test_dtypes.py            | 106 +++++++-
 pandas/tests/frame/test_analytics.py          |   3 +
 pandas/tests/indexes/test_category.py         |  10 +-
 .../tests/io/json/test_json_table_schema.py   |   5 +-
 pandas/tests/io/test_parquet.py               |   3 +
 pandas/tests/io/test_pytables.py              |  10 +-
 pandas/tests/reshape/test_merge.py            |   4 +-
 pandas/tests/series/test_analytics.py         |  11 +-
 pandas/tests/series/test_constructors.py      |  21 ++
 pandas/tests/series/test_dtypes.py            |  34 ++-
 pandas/tests/test_algos.py                    |  72 +++---
 pandas/tests/test_categorical.py              | 126 +++++++++-
 pandas/util/testing.py                        |   7 +-
 31 files changed, 887 insertions(+), 250 deletions(-)

diff --git a/doc/source/advanced.rst b/doc/source/advanced.rst
index 3f145cf9556645..5ee8938df119ba 100644
--- a/doc/source/advanced.rst
+++ b/doc/source/advanced.rst
@@ -638,9 +638,11 @@ and allows efficient indexing and storage of an index with a large number of dup
 
 .. ipython:: python
 
+   from pandas.api.types import CategoricalDtype
+
    df = pd.DataFrame({'A': np.arange(6),
                       'B': list('aabbca')})
-   df['B'] = df['B'].astype('category', categories=list('cab'))
+   df['B'] = df['B'].astype(CategoricalDtype(list('cab')))
    df
    df.dtypes
    df.B.cat.categories
diff --git a/doc/source/api.rst b/doc/source/api.rst
index 1541bbccefe214..399ebec3c83a57 100644
--- a/doc/source/api.rst
+++ b/doc/source/api.rst
@@ -646,7 +646,10 @@ strings and apply several methods to it. These can be accessed like
 Categorical
 ~~~~~~~~~~~
 
-If the Series is of dtype ``category``, ``Series.cat`` can be used to change the the categorical
+.. autoclass:: api.types.CategoricalDtype
+   :members: categories, ordered
+
+If the Series is of dtype ``CategoricalDtype``, ``Series.cat`` can be used to change the categorical
 data. This accessor is similar to the ``Series.dt`` or ``Series.str`` and has the
 following usable methods and properties:
 
diff --git a/doc/source/categorical.rst b/doc/source/categorical.rst
index 8835c4a1533d0c..2071d86ab753de 100644
--- a/doc/source/categorical.rst
+++ b/doc/source/categorical.rst
@@ -89,12 +89,22 @@ By passing a :class:`pandas.Categorical` object to a `Series` or assigning it to
     df["B"] = raw_cat
     df
 
-You can also specify differently ordered categories or make the resulting data ordered, by passing these arguments to ``astype()``:
+Anywhere above we passed a keyword ``dtype='category'``, we used the default behavior of
+
+1. categories are inferred from the data
+2. categories are unordered.
+
+To control those behaviors, instead of passing ``'category'``, use an instance
+of :class:`~pd.api.types.CategoricalDtype`.
 
 .. ipython:: python
 
-    s = pd.Series(["a","b","c","a"])
-    s_cat = s.astype("category", categories=["b","c","d"], ordered=False)
+    from pandas.api.types import CategoricalDtype
+
+    s = pd.Series(["a", "b", "c", "a"])
+    cat_type = CategoricalDtype(categories=["b", "c", "d"],
+                                ordered=False)
+    s_cat = s.astype(cat_type)
     s_cat
 
 Categorical data has a specific ``category`` :ref:`dtype <basics.dtypes>`:
@@ -133,6 +143,70 @@ constructor to save the factorize step during normal constructor mode:
     splitter = np.random.choice([0,1], 5, p=[0.5,0.5])
     s = pd.Series(pd.Categorical.from_codes(splitter, categories=["train", "test"]))
 
+.. _categorical.categoricaldtype:
+
+CategoricalDtype
+----------------
+
+.. versionchanged:: 0.21.0
+
+A categorical's type is fully described by
+
+1. its categories: a sequence of unique values and no missing values
+2. its orderedness: a boolean
+
+This information can be stored in a :class:`~pandas.api.types.CategoricalDtype`.
+The ``categories`` argument is optional, which implies that the actual categories
+should be inferred from whatever is present in the data when the
+:class:`pandas.Categorical` is created.
+
+.. ipython:: python
+
+   from pandas.api.types import CategoricalDtype
+
+   CategoricalDtype(['a', 'b', 'c'])
+   CategoricalDtype(['a', 'b', 'c'], ordered=True)
+   CategoricalDtype()
+
+A :class:`~pandas.api.types.CategoricalDtype` can be used in any place pandas
+expects a `dtype`. For example :func:`pandas.read_csv`,
+:func:`pandas.DataFrame.astype`, or in the Series constructor.
+
+As a convenience, you can use the string ``'category'`` in place of a
+:class:`~pandas.api.types.CategoricalDtype` when you want the default behavior of
+the categories being unordered, and equal to the set values present in the
+array. In other words, ``dtype='category'`` is equivalent to
+``dtype=CategoricalDtype()``.
+
+Equality Semantics
+~~~~~~~~~~~~~~~~~~
+
+Two instances of :class:`~pandas.api.types.CategoricalDtype` compare equal whenever the have
+the same categories and orderedness. When comparing two unordered categoricals, the
+order of the ``categories`` is not considered
+
+.. ipython:: python
+
+   c1 = CategoricalDtype(['a', 'b', 'c'], ordered=False)
+
+   # Equal, since order is not considered when ordered=False
+   c1 == CategoricalDtype(['b', 'c', 'a'], ordered=False)
+
+   # Unequal, since the second CategoricalDtype is ordered
+   c1 == CategoricalDtype(['a',  'b', 'c'], ordered=True)
+
+All instances of ``CategoricalDtype`` compare equal to the string ``'category'``
+
+.. ipython:: python
+
+   c1 == 'category'
+
+.. warning::
+
+   Since ``dtype='category'`` is essentially ``CategoricalDtype(None, False)``,
+   and since all instances ``CategoricalDtype`` compare equal to ``'`category'``,
+   all instances of ``CategoricalDtype`` compare equal to a ``CategoricalDtype(None)``
+
 Description
 -----------
 
@@ -182,7 +256,7 @@ It's also possible to pass in the categories in a specific order:
 
     .. ipython:: python
 
-         s = pd.Series(list('babc')).astype('category', categories=list('abcd'))
+         s = pd.Series(list('babc')).astype(CategoricalDtype(list('abcd')))
          s
 
          # categories
@@ -295,7 +369,9 @@ meaning and certain operations are possible. If the categorical is unordered, ``
 
     s = pd.Series(pd.Categorical(["a","b","c","a"], ordered=False))
     s.sort_values(inplace=True)
-    s = pd.Series(["a","b","c","a"]).astype('category', ordered=True)
+    s = pd.Series(["a","b","c","a"]).astype(
+        CategoricalDtype(ordered=True)
+    )
     s.sort_values(inplace=True)
     s
     s.min(), s.max()
@@ -395,9 +471,15 @@ categories or a categorical with any list-like object, will raise a TypeError.
 
 .. ipython:: python
 
-    cat = pd.Series([1,2,3]).astype("category", categories=[3,2,1], ordered=True)
-    cat_base = pd.Series([2,2,2]).astype("category", categories=[3,2,1], ordered=True)
-    cat_base2 = pd.Series([2,2,2]).astype("category", ordered=True)
+    cat = pd.Series([1,2,3]).astype(
+        CategoricalDtype([3, 2, 1], ordered=True)
+    )
+    cat_base = pd.Series([2,2,2]).astype(
+        CategoricalDtype([3, 2, 1], ordered=True)
+    )
+    cat_base2 = pd.Series([2,2,2]).astype(
+        CategoricalDtype(ordered=True)
+    )
 
     cat
     cat_base
diff --git a/doc/source/merging.rst b/doc/source/merging.rst
index a5ee1b1a9384cc..ace89bcbaa0afe 100644
--- a/doc/source/merging.rst
+++ b/doc/source/merging.rst
@@ -830,8 +830,10 @@ The left frame.
 
 .. ipython:: python
 
+   from pandas.api.types import CategoricalDtype
+
    X = pd.Series(np.random.choice(['foo', 'bar'], size=(10,)))
-   X = X.astype('category', categories=['foo', 'bar'])
+   X = X.astype(CategoricalDtype(categories=['foo', 'bar']))
 
    left = pd.DataFrame({'X': X,
                         'Y': np.random.choice(['one', 'two', 'three'], size=(10,))})
@@ -842,8 +844,11 @@ The right frame.
 
 .. ipython:: python
 
-   right = pd.DataFrame({'X': pd.Series(['foo', 'bar']).astype('category', categories=['foo', 'bar']),
-                         'Z': [1, 2]})
+   right = pd.DataFrame({
+        'X': pd.Series(['foo', 'bar'],
+                       dtype=CategoricalDtype(['foo', 'bar'])),
+        'Z': [1, 2]
+   })
    right
    right.dtypes
 
diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt
index 52e056103cbdc3..44c6c2742534b9 100644
--- a/doc/source/whatsnew/v0.21.0.txt
+++ b/doc/source/whatsnew/v0.21.0.txt
@@ -10,6 +10,8 @@ users upgrade to this version.
 Highlights include:
 
 - Integration with `Apache Parquet <https://parquet.apache.org/>`__, including a new top-level :func:`read_parquet` and :func:`DataFrame.to_parquet` method, see :ref:`here <io.parquet>`.
+- New user-facing :class:`pandas.api.types.CategoricalDtype` for specifying
+  categoricals independent of the data (:issue:`14711`, :issue:`15078`)
 
 Check the :ref:`API Changes <whatsnew_0210.api_breaking>` and :ref:`deprecations <whatsnew_0210.deprecations>` before updating.
 
@@ -88,6 +90,30 @@ This does not raise any obvious exceptions, but also does not create a new colum
 
 Setting a list-like data structure into a new attribute now raise a ``UserWarning`` about the potential for unexpected behavior. See :ref:`Attribute Access <indexing.attribute_access>`.
 
+.. _whatsnew_0210.enhancements.categorical_dtype:
+
+``CategoricalDtype`` for specifying categoricals
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+:class:`pandas.api.types.CategoricalDtype` has been added to the public API and
+expanded to include the ``categories`` and ``ordered`` attributes. A
+``CategoricalDtype`` can be used to specify the set of categories and
+orderedness of an array, independent of the data themselves. This can be useful,
+e.g., when converting string data to a ``Categorical``:
+
+.. ipython:: python
+
+   from pandas.api.types import CategoricalDtype
+
+   s = pd.Series(['a', 'b', 'c', 'a'])  # strings
+   dtype = CategoricalDtype(categories=['a', 'b', 'c', 'd'], ordered=True)
+   s.astype(dtype)
+
+The ``.dtype`` property of a ``Categorical``, ``CategoricalIndex`` or a
+``Series`` with categorical type will now return an instance of ``CategoricalDtype``.
+
+See :ref:`CategoricalDtype <categorical.categoricaldtype>` for more.
+
 .. _whatsnew_0210.enhancements.other:
 
 Other Enhancements
diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py
index e67ce2936819f5..372de1d8af9a42 100644
--- a/pandas/core/categorical.py
+++ b/pandas/core/categorical.py
@@ -23,7 +23,7 @@
     is_datetimelike,
     is_categorical,
     is_categorical_dtype,
-    is_integer_dtype, is_bool,
+    is_integer_dtype,
     is_list_like, is_sequence,
     is_scalar)
 from pandas.core.common import is_null_slice, _maybe_box_datetimelike
@@ -202,6 +202,7 @@ class Categorical(PandasObject):
         categorical, read only.
     ordered : boolean
         Whether or not this Categorical is ordered.
+    dtype : CategoricalDtype
 
     Raises
     ------
@@ -228,7 +229,7 @@ class Categorical(PandasObject):
     >>> a.min()
     'c'
     """
-    dtype = CategoricalDtype()
+    _dtype = CategoricalDtype()
     """The dtype (always "category")"""
     """Whether or not this Categorical is ordered.
 
@@ -248,22 +249,30 @@ class Categorical(PandasObject):
     __array_priority__ = 1000
     _typ = 'categorical'
 
-    def __init__(self, values, categories=None, ordered=False, fastpath=False):
+    def __init__(self, values, categories=None, ordered=None, dtype=None,
+                 fastpath=False):
 
-        self._validate_ordered(ordered)
+        if dtype is not None:
+            if categories is not None or ordered is not None:
+                raise ValueError("Cannot specify both `dtype` and `categories`"
+                                 " or `ordered`.")
+            categories = dtype.categories
+            ordered = dtype.ordered
+
+        if ordered is None:
+            ordered = False
 
         if fastpath:
-            # fast path
+            if dtype is None:
+                dtype = CategoricalDtype(categories, ordered)
             self._codes = coerce_indexer_dtype(values, categories)
-            self._categories = self._validate_categories(
-                categories, fastpath=isinstance(categories, ABCIndexClass))
-            self._ordered = ordered
+            self._dtype = dtype
             return
 
         # sanitize input
         if is_categorical_dtype(values):
 
-            # we are either a Series or a CategoricalIndex
+            # we are either a Series, CategoricalIndex
             if isinstance(values, (ABCSeries, ABCCategoricalIndex)):
                 values = values._values
 
@@ -313,7 +322,8 @@ def __init__(self, values, categories=None, ordered=False, fastpath=False):
                 raise NotImplementedError("> 1 ndim Categorical are not "
                                           "supported at this time")
 
-            categories = self._validate_categories(categories)
+            if dtype is None or isinstance(dtype, str):
+                dtype = CategoricalDtype(categories, ordered)
 
         else:
             # there were two ways if categories are present
@@ -325,12 +335,15 @@ def __init__(self, values, categories=None, ordered=False, fastpath=False):
 
             # make sure that we always have the same type here, no matter what
             # we get passed in
-            categories = self._validate_categories(categories)
-            codes = _get_codes_for_values(values, categories)
+            if dtype is None or isinstance(dtype, str):
+                dtype = CategoricalDtype(categories, ordered)
+
+            codes = _get_codes_for_values(values, dtype.categories)
 
             # TODO: check for old style usage. These warnings should be removes
             # after 0.18/ in 2016
-            if is_integer_dtype(values) and not is_integer_dtype(categories):
+            if (is_integer_dtype(values) and
+                    not is_integer_dtype(dtype.categories)):
                 warn("Values and categories have different dtypes. Did you "
                      "mean to use\n'Categorical.from_codes(codes, "
                      "categories)'?", RuntimeWarning, stacklevel=2)
@@ -341,9 +354,29 @@ def __init__(self, values, categories=None, ordered=False, fastpath=False):
                      "mean to use\n'Categorical.from_codes(codes, "
                      "categories)'?", RuntimeWarning, stacklevel=2)
 
-        self.set_ordered(ordered or False, inplace=True)
-        self._categories = categories
-        self._codes = coerce_indexer_dtype(codes, categories)
+        self._dtype = dtype
+        self._codes = coerce_indexer_dtype(codes, dtype.categories)
+
+    @property
+    def categories(self):
+        return self.dtype.categories
+
+    @categories.setter
+    def categories(self, categories):
+        new_dtype = CategoricalDtype(categories, ordered=self.ordered)
+        if (self.dtype.categories is not None and
+                len(self.dtype.categories) != len(new_dtype.categories)):
+            raise ValueError("new categories need to have the same number of "
+                             "items as the old categories!")
+        self._dtype = new_dtype
+
+    @property
+    def ordered(self):
+        return self.dtype.ordered
+
+    @property
+    def dtype(self):
+        return self._dtype
 
     def __dir__(self):
         # Avoid IPython warnings for deprecated properties
@@ -492,7 +525,7 @@ def from_codes(cls, codes, categories, ordered=False):
             raise ValueError(
                 "codes need to be convertible to an arrays of integers")
 
-        categories = cls._validate_categories(categories)
+        categories = CategoricalDtype._validate_categories(categories)
 
         if len(codes) and (codes.max() >= len(categories) or codes.min() < -1):
             raise ValueError("codes need to be between -1 and "
@@ -535,69 +568,6 @@ def _get_labels(self):
 
     labels = property(fget=_get_labels, fset=_set_codes)
 
-    _categories = None
-
-    @classmethod
-    def _validate_ordered(cls, ordered):
-        """
-        Validates that we have a valid ordered parameter. If
-        it is not a boolean, a TypeError will be raised.
-
-        Parameters
-        ----------
-        ordered : object
-            The parameter to be verified.
-
-        Raises
-        ------
-        TypeError
-            If 'ordered' is not a boolean.
-        """
-        if not is_bool(ordered):
-            raise TypeError("'ordered' must either be 'True' or 'False'")
-
-    @classmethod
-    def _validate_categories(cls, categories, fastpath=False):
-        """
-        Validates that we have good categories
-
-        Parameters
-        ----------
-        fastpath : boolean (default: False)
-           Don't perform validation of the categories for uniqueness or nulls
-
-        """
-        if not isinstance(categories, ABCIndexClass):
-            dtype = None
-            if not hasattr(categories, "dtype"):
-                if not is_list_like(categories):
-                    raise TypeError("`categories` must be list-like. "
-                                    "Got {} instead".format(repr(categories)))
-                categories = _convert_to_list_like(categories)
-                # On categories with NaNs, int values would be converted to
-                # float. Use "object" dtype to prevent this.
-                if isna(categories).any():
-                    without_na = np.array([x for x in categories
-                                           if notna(x)])
-                    with_na = np.array(categories)
-                    if with_na.dtype != without_na.dtype:
-                        dtype = "object"
-
-            from pandas import Index
-            categories = Index(categories, dtype=dtype)
-
-        if not fastpath:
-
-            # Categories cannot contain NaN.
-            if categories.hasnans:
-                raise ValueError('Categorial categories cannot be null')
-
-            # Categories must be unique.
-            if not categories.is_unique:
-                raise ValueError('Categorical categories must be unique')
-
-        return categories
-
     def _set_categories(self, categories, fastpath=False):
         """ Sets new categories
 
@@ -608,21 +578,13 @@ def _set_categories(self, categories, fastpath=False):
 
         """
 
-        categories = self._validate_categories(categories, fastpath=fastpath)
-        if (not fastpath and self._categories is not None and
-                len(categories) != len(self._categories)):
+        new_dtype = CategoricalDtype(categories, self.ordered, fastpath)
+        if (not fastpath and self.dtype.categories is not None and
+                len(new_dtype.categories) != len(self.dtype.categories)):
             raise ValueError("new categories need to have the same number of "
                              "items than the old categories!")
 
-        self._categories = categories
-
-    def _get_categories(self):
-        """ Gets the categories """
-        # categories is an Index, which is immutable -> no need to copy
-        return self._categories
-
-    categories = property(fget=_get_categories, fset=_set_categories,
-                          doc=_categories_doc)
+        self._dtype = new_dtype
 
     def _codes_for_groupby(self, sort):
         """
@@ -664,7 +626,21 @@ def _codes_for_groupby(self, sort):
 
         return self.reorder_categories(cat.categories)
 
-    _ordered = None
+    def _set_dtype(self, dtype):
+        """Internal method for directly updating the CategoricalDtype
+
+        Parameters
+        ----------
+        dtype : CategoricalDtype
+
+        Notes
+        -----
+        We don't do any validation here. It's assumed that the dtype is
+        a (valid) instance of `CategoricalDtype`.
+        """
+        codes = _recode_for_categories(self.codes, self.categories,
+                                       dtype.categories)
+        return type(self)(codes, dtype=dtype, fastpath=True)
 
     def set_ordered(self, value, inplace=False):
         """
@@ -679,9 +655,9 @@ def set_ordered(self, value, inplace=False):
            of this categorical with ordered set to the value
         """
         inplace = validate_bool_kwarg(inplace, 'inplace')
-        self._validate_ordered(value)
+        new_dtype = CategoricalDtype(self.categories, ordered=value)
         cat = self if inplace else self.copy()
-        cat._ordered = value
+        cat._dtype = new_dtype
         if not inplace:
             return cat
 
@@ -711,12 +687,6 @@ def as_unordered(self, inplace=False):
         inplace = validate_bool_kwarg(inplace, 'inplace')
         return self.set_ordered(False, inplace=inplace)
 
-    def _get_ordered(self):
-        """ Gets the ordered attribute """
-        return self._ordered
-
-    ordered = property(fget=_get_ordered)
-
     def set_categories(self, new_categories, ordered=None, rename=False,
                        inplace=False):
         """ Sets the categories to the specified new_categories.
@@ -769,22 +739,21 @@ def set_categories(self, new_categories, ordered=None, rename=False,
         remove_unused_categories
         """
         inplace = validate_bool_kwarg(inplace, 'inplace')
-        new_categories = self._validate_categories(new_categories)
+        if ordered is None:
+            ordered = self.dtype.ordered
+        new_dtype = CategoricalDtype(new_categories, ordered=ordered)
+
         cat = self if inplace else self.copy()
         if rename:
-            if (cat._categories is not None and
-                    len(new_categories) < len(cat._categories)):
+            if (cat.dtype.categories is not None and
+                    len(new_dtype.categories) < len(cat.dtype.categories)):
                 # remove all _codes which are larger and set to -1/NaN
-                self._codes[self._codes >= len(new_categories)] = -1
+                self._codes[self._codes >= len(new_dtype.categories)] = -1
         else:
             codes = _recode_for_categories(self.codes, self.categories,
-                                           new_categories)
+                                           new_dtype.categories)
             cat._codes = codes
-        cat._categories = new_categories
-
-        if ordered is None:
-            ordered = self.ordered
-        cat.set_ordered(ordered, inplace=True)
+        cat._dtype = new_dtype
 
         if not inplace:
             return cat
@@ -864,7 +833,7 @@ def reorder_categories(self, new_categories, ordered=None, inplace=False):
         set_categories
         """
         inplace = validate_bool_kwarg(inplace, 'inplace')
-        if set(self._categories) != set(new_categories):
+        if set(self.dtype.categories) != set(new_categories):
             raise ValueError("items in new_categories are not the same as in "
                              "old categories")
         return self.set_categories(new_categories, ordered=ordered,
@@ -905,15 +874,17 @@ def add_categories(self, new_categories, inplace=False):
         inplace = validate_bool_kwarg(inplace, 'inplace')
         if not is_list_like(new_categories):
             new_categories = [new_categories]
-        already_included = set(new_categories) & set(self._categories)
+        already_included = set(new_categories) & set(self.dtype.categories)
         if len(already_included) != 0:
             msg = ("new categories must not include old categories: %s" %
                    str(already_included))
             raise ValueError(msg)
-        new_categories = list(self._categories) + list(new_categories)
+        new_categories = list(self.dtype.categories) + list(new_categories)
+        new_dtype = CategoricalDtype(new_categories, self.ordered)
+
         cat = self if inplace else self.copy()
-        cat._categories = self._validate_categories(new_categories)
-        cat._codes = coerce_indexer_dtype(cat._codes, new_categories)
+        cat._dtype = new_dtype
+        cat._codes = coerce_indexer_dtype(cat._codes, new_dtype.categories)
         if not inplace:
             return cat
 
@@ -953,8 +924,9 @@ def remove_categories(self, removals, inplace=False):
             removals = [removals]
 
         removal_set = set(list(removals))
-        not_included = removal_set - set(self._categories)
-        new_categories = [c for c in self._categories if c not in removal_set]
+        not_included = removal_set - set(self.dtype.categories)
+        new_categories = [c for c in self.dtype.categories
+                          if c not in removal_set]
 
         # GH 10156
         if any(isna(removals)):
@@ -996,8 +968,11 @@ def remove_unused_categories(self, inplace=False):
         if idx.size != 0 and idx[0] == -1:  # na sentinel
             idx, inv = idx[1:], inv - 1
 
-        cat._categories = cat.categories.take(idx)
-        cat._codes = coerce_indexer_dtype(inv, self._categories)
+        new_categories = cat.dtype.categories.take(idx)
+        new_dtype = CategoricalDtype(new_categories, ordered=self.ordered,
+                                     fastpath=True)
+        cat._dtype = new_dtype
+        cat._codes = coerce_indexer_dtype(inv, new_dtype.categories)
 
         if not inplace:
             return cat
@@ -1098,7 +1073,7 @@ def __setstate__(self, state):
 
         # Provide compatibility with pre-0.15.0 Categoricals.
         if '_categories' not in state and '_levels' in state:
-            state['_categories'] = self._validate_categories(state.pop(
+            state['_categories'] = self.dtype._validate_categories(state.pop(
                 '_levels'))
         if '_codes' not in state and 'labels' in state:
             state['_codes'] = coerce_indexer_dtype(
@@ -1113,6 +1088,11 @@ def __setstate__(self, state):
             else:
                 state['_ordered'] = False
 
+        # 0.21.0 CategoricalDtype change
+        if '_dtype' not in state:
+            state['_dtype'] = CategoricalDtype(state['_categories'],
+                                               state['_ordered'])
+
         for k, v in compat.iteritems(state):
             setattr(self, k, v)
 
@@ -1122,7 +1102,7 @@ def T(self):
 
     @property
     def nbytes(self):
-        return self._codes.nbytes + self._categories.values.nbytes
+        return self._codes.nbytes + self.dtype.categories.values.nbytes
 
     def memory_usage(self, deep=False):
         """
@@ -1147,7 +1127,8 @@ def memory_usage(self, deep=False):
         --------
         numpy.ndarray.nbytes
         """
-        return self._codes.nbytes + self._categories.memory_usage(deep=deep)
+        return self._codes.nbytes + self.dtype.categories.memory_usage(
+            deep=deep)
 
     @Substitution(klass='Categorical')
     @Appender(_shared_docs['searchsorted'])
@@ -1278,7 +1259,7 @@ def value_counts(self, dropna=True):
             count = bincount(np.where(mask, code, ncat))
             ix = np.append(ix, -1)
 
-        ix = self._constructor(ix, categories=cat, ordered=obj.ordered,
+        ix = self._constructor(ix, dtype=self.dtype,
                                fastpath=True)
 
         return Series(count, index=CategoricalIndex(ix), dtype='int64')
@@ -1991,8 +1972,7 @@ def is_dtype_equal(self, other):
         """
 
         try:
-            return (self.categories.equals(other.categories) and
-                    self.ordered == other.ordered)
+            return hash(self.dtype) == hash(other.dtype)
         except (AttributeError, TypeError):
             return False
 
diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py
index c47e61dc446be2..f60c0d5ffdca0b 100644
--- a/pandas/core/dtypes/common.py
+++ b/pandas/core/dtypes/common.py
@@ -692,6 +692,40 @@ def is_dtype_equal(source, target):
         return False
 
 
+def is_dtype_union_equal(source, target):
+    """
+    Check whether two arrays have compatible dtypes to do a union.
+    numpy types are checked with ``is_dtype_equal``. Extension types are
+    checked separately.
+
+    Parameters
+    ----------
+    source : The first dtype to compare
+    target : The second dtype to compare
+
+    Returns
+    ----------
+    boolean : Whether or not the two dtypes are equal.
+
+    >>> is_dtype_equal("int", int)
+    True
+
+    >>> is_dtype_equal(CategoricalDtype(['a', 'b'],
+    ...                CategoricalDtype(['b', 'c']))
+    True
+
+    >>> is_dtype_equal(CategoricalDtype(['a', 'b'],
+    ...                CategoricalDtype(['b', 'c'], ordered=True))
+    False
+    """
+    source = _get_dtype(source)
+    target = _get_dtype(target)
+    if is_categorical_dtype(source) and is_categorical_dtype(target):
+        # ordered False for both
+        return source.ordered is target.ordered
+    return is_dtype_equal(source, target)
+
+
 def is_any_int_dtype(arr_or_dtype):
     """
     DEPRECATED: This function will be removed in a future version.
@@ -1671,7 +1705,9 @@ def _coerce_to_dtype(dtype):
     """
 
     if is_categorical_dtype(dtype):
-        dtype = CategoricalDtype()
+        categories = getattr(dtype, 'categories', None)
+        ordered = getattr(dtype, 'ordered', False)
+        dtype = CategoricalDtype(categories=categories, ordered=ordered)
     elif is_datetime64tz_dtype(dtype):
         dtype = DatetimeTZDtype(dtype)
     elif is_period_dtype(dtype):
diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py
index dc2c56ea476f9d..ddcaff5bf945f9 100644
--- a/pandas/core/dtypes/dtypes.py
+++ b/pandas/core/dtypes/dtypes.py
@@ -3,6 +3,7 @@
 import re
 import numpy as np
 from pandas import compat
+from pandas.core.dtypes.generic import ABCIndexClass
 
 
 class ExtensionDtype(object):
@@ -110,37 +111,144 @@ class CategoricalDtypeType(type):
 class CategoricalDtype(ExtensionDtype):
 
     """
-    A np.dtype duck-typed class, suitable for holding a custom categorical
-    dtype.
-
-    THIS IS NOT A REAL NUMPY DTYPE, but essentially a sub-class of np.object
+    Type for categorical data with the categories and orderedness
+
+    .. versionchanged:: 0.21.0
+
+    Parameters
+    ----------
+    categories : sequence, optional
+        Must be unique, and must not contain any nulls.
+    ordered : bool, default False
+
+    Notes
+    -----
+    This class is useful for specifying the type of a ``Categorical``
+    independent of the values. See :ref:`categorical.categoricaldtype`
+    for more.
+
+    Examples
+    --------
+    >>> t = CategoricalDtype(categories=['b', 'a'], ordered=True)
+    >>> s = Series(['a', 'a', 'b', 'b', 'a'], dtype=t)
+    >>> pd.Series(['a', 'b', 'a', 'c'], dtype=t)
+    0      a
+    1      b
+    2      a
+    3    NaN
+    dtype: category
+    Categories (2, object): [b < a]
+
+    See Also
+    --------
+    Categorical
     """
+    # TODO: Document public vs. private API
     name = 'category'
     type = CategoricalDtypeType
     kind = 'O'
     str = '|O08'
     base = np.dtype('O')
-    _metadata = []
+    _metadata = ['categories', 'ordered']
     _cache = {}
 
-    def __new__(cls):
+    def __new__(cls, categories=None, ordered=False, fastpath=False):
+        from pandas.core.indexes.base import Index
 
-        try:
-            return cls._cache[cls.name]
-        except KeyError:
-            c = object.__new__(cls)
-            cls._cache[cls.name] = c
-            return c
+        if categories is not None:
+            categories = Index(categories, tupleize_cols=False)
+            # validation
+            cls._validate_categories(categories, fastpath=fastpath)
+            cls._validate_ordered(ordered)
+        categorical = object.__new__(cls)
+        categorical._categories = categories
+        categorical._ordered = ordered
+        return categorical
 
     def __hash__(self):
-        # make myself hashable
-        return hash(str(self))
+        # _hash_categories returns a uint64, so use the negative
+        # space for when we have unknown categories to avoid a conflict
+        if self.categories is None:
+            if self.ordered:
+                return -1
+            else:
+                return -2
+        # We *do* want to include the real self.ordered here
+        return int(self._hash_categories(self.categories, self.ordered))
 
     def __eq__(self, other):
         if isinstance(other, compat.string_types):
             return other == self.name
 
-        return isinstance(other, CategoricalDtype)
+        if not (hasattr(other, 'ordered') and hasattr(other, 'categories')):
+            return False
+        elif self.categories is None or other.categories is None:
+            # We're forced into a suboptimal corner thanks to math and
+            # backwards compatibility. We require that `CDT(...) == 'category'`
+            # for all CDTs **including** `CDT(None, ...)`. Therefore, *all*
+            # CDT(., .) = CDT(None, False) and *all*
+            # CDT(., .) = CDT(None, True).
+            return True
+        elif self.ordered:
+            return other.ordered and self.categories.equals(other.categories)
+        elif other.ordered:
+            return False
+        else:
+            # both unordered; this could probably be optimized / cached
+            return hash(self) == hash(other)
+
+    def __unicode__(self):
+        tpl = u'CategoricalDtype({}ordered={})'
+        if self.categories is None:
+            data = u"None, "
+        else:
+            data = self.categories._format_data(name=self.__class__.__name__)
+        return tpl.format(data, self.ordered)
+
+    def __repr__(self):
+        return str(self)
+
+    def __getnewargs__(self):
+        return (self.categories, self.ordered)
+
+    @staticmethod
+    def _hash_categories(categories, ordered=True):
+        from pandas.core.util.hashing import (
+            hash_array, _combine_hash_arrays, hash_tuples
+        )
+
+        if len(categories) and isinstance(categories[0], tuple):
+            # assumes if any individual category is a tuple, then all our. ATM
+            # I don't really want to support just some of the categories being
+            # tuples.
+            categories = list(categories)  # breaks if a np.array of categories
+            cat_array = hash_tuples(categories)
+        else:
+            if categories.dtype == 'O':
+                types = [type(x) for x in categories]
+                if not len(set(types)) == 1:
+                    # TODO: hash_array doesn't handle mixed types. It casts
+                    # everything to a str first, which means we treat
+                    # {'1', '2'} the same as {'1', 2}
+                    # find a better solution
+                    cat_array = np.array([hash(x) for x in categories])
+                    hashed = hash((tuple(categories), ordered))
+                    return hashed
+            cat_array = hash_array(np.asarray(categories), categorize=False)
+        if ordered:
+            cat_array = np.vstack([
+                cat_array, np.arange(len(cat_array), dtype=cat_array.dtype)
+            ])
+        else:
+            cat_array = [cat_array]
+        hashed = _combine_hash_arrays(iter(cat_array),
+                                      num_items=len(cat_array))
+        if len(hashed) == 0:
+            # bug in Numpy<1.12 for length 0 arrays. Just return the correct
+            # value of 0
+            return 0
+        else:
+            return np.bitwise_xor.reduce(hashed)
 
     @classmethod
     def construct_from_string(cls, string):
@@ -154,6 +262,68 @@ def construct_from_string(cls, string):
 
         raise TypeError("cannot construct a CategoricalDtype")
 
+    @staticmethod
+    def _validate_ordered(ordered):
+        """
+        Validates that we have a valid ordered parameter. If
+        it is not a boolean, a TypeError will be raised.
+
+        Parameters
+        ----------
+        ordered : object
+            The parameter to be verified.
+
+        Raises
+        ------
+        TypeError
+            If 'ordered' is not a boolean.
+        """
+        from pandas.core.dtypes.common import is_bool
+        if not is_bool(ordered):
+            raise TypeError("'ordered' must either be 'True' or 'False'")
+
+    @staticmethod
+    def _validate_categories(categories, fastpath=False):
+        """
+        Validates that we have good categories
+
+        Parameters
+        ----------
+        categories : array-like
+        fastpath : bool
+            Whether to skip nan and uniqueness checks
+
+        Returns
+        -------
+        categories : Index
+        """
+        from pandas import Index
+
+        if not isinstance(categories, ABCIndexClass):
+            categories = Index(categories)
+
+        if not fastpath:
+
+            if categories.hasnans:
+                raise ValueError('Categorial categories cannot be null')
+
+            if not categories.is_unique:
+                raise ValueError('Categorical categories must be unique')
+
+        return categories
+
+    @property
+    def categories(self):
+        """
+        An ``Index`` containing the unique categories allowed.
+        """
+        return self._categories
+
+    @property
+    def ordered(self):
+        """Whether the categories have an ordered relationship"""
+        return self._ordered
+
 
 class DatetimeTZDtypeType(type):
     """
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index 008828cf4f309a..e24faeb338c999 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -27,6 +27,7 @@
     is_integer,
     is_float,
     is_dtype_equal,
+    is_dtype_union_equal,
     is_object_dtype,
     is_categorical_dtype,
     is_interval_dtype,
@@ -847,7 +848,7 @@ def _formatter_func(self):
         """
         return default_pprint
 
-    def _format_data(self):
+    def _format_data(self, name=None):
         """
         Return the formatted data as a unicode string
         """
@@ -856,9 +857,11 @@ def _format_data(self):
         display_width, _ = get_console_size()
         if display_width is None:
             display_width = get_option('display.width') or 80
+        if name is None:
+            name = self.__class__.__name__
 
-        space1 = "\n%s" % (' ' * (len(self.__class__.__name__) + 1))
-        space2 = "\n%s" % (' ' * (len(self.__class__.__name__) + 2))
+        space1 = "\n%s" % (' ' * (len(name) + 1))
+        space2 = "\n%s" % (' ' * (len(name) + 2))
 
         n = len(self)
         sep = ','
@@ -2170,7 +2173,10 @@ def union(self, other):
         if len(self) == 0:
             return other._get_consensus_name(self)
 
-        if not is_dtype_equal(self.dtype, other.dtype):
+        # TODO: is_dtype_union_equal is a hack around
+        # 1. buggy joins with duplicates
+        # 2. CategoricalIndex lacking setops
+        if not is_dtype_union_equal(self.dtype, other.dtype):
             this = self.astype('O')
             other = other.astype('O')
             return this.union(other)
diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py
index ef1dc4d971f37f..86258f6a365309 100644
--- a/pandas/core/indexes/category.py
+++ b/pandas/core/indexes/category.py
@@ -58,16 +58,18 @@ def __new__(cls, data=None, categories=None, ordered=None, dtype=None,
                 copy=False, name=None, fastpath=False, **kwargs):
 
         if fastpath:
-            return cls._simple_new(data, name=name)
+            return cls._simple_new(data, name=name, dtype=dtype)
 
         if name is None and hasattr(data, 'name'):
             name = data.name
 
         if isinstance(data, ABCCategorical):
-            data = cls._create_categorical(cls, data, categories, ordered)
+            data = cls._create_categorical(cls, data, categories, ordered,
+                                           dtype)
         elif isinstance(data, CategoricalIndex):
             data = data._data
-            data = cls._create_categorical(cls, data, categories, ordered)
+            data = cls._create_categorical(cls, data, categories, ordered,
+                                           dtype)
         else:
 
             # don't allow scalars
@@ -114,7 +116,8 @@ def _create_from_codes(self, codes, categories=None, ordered=None,
         return CategoricalIndex(cat, name=name)
 
     @staticmethod
-    def _create_categorical(self, data, categories=None, ordered=None):
+    def _create_categorical(self, data, categories=None, ordered=None,
+                            dtype=None):
         """
         *this is an internal non-public method*
 
@@ -125,6 +128,7 @@ def _create_categorical(self, data, categories=None, ordered=None):
         data : data for new Categorical
         categories : optional categories, defaults to existing
         ordered : optional ordered attribute, defaults to existing
+        dtype : CategoricalDtype, defaults to existing
 
         Returns
         -------
@@ -135,22 +139,30 @@ def _create_categorical(self, data, categories=None, ordered=None):
             data = data.values
 
         if not isinstance(data, ABCCategorical):
-            ordered = False if ordered is None else ordered
+            if ordered is None and dtype is None:
+                ordered = False
             from pandas.core.categorical import Categorical
-            data = Categorical(data, categories=categories, ordered=ordered)
+            data = Categorical(data, categories=categories, ordered=ordered,
+                               dtype=dtype)
         else:
+            from pandas.core.dtypes.dtypes import CategoricalDtype
+
             if categories is not None:
-                data = data.set_categories(categories)
-            if ordered is not None:
+                data = data.set_categories(categories, ordered=ordered)
+            elif ordered is not None and ordered != data.ordered:
                 data = data.set_ordered(ordered)
+            if isinstance(dtype, CategoricalDtype):
+                # we want to silently ignore dtype='category'
+                data = data._set_dtype(dtype)
         return data
 
     @classmethod
     def _simple_new(cls, values, name=None, categories=None, ordered=None,
-                    **kwargs):
+                    dtype=None, **kwargs):
         result = object.__new__(cls)
 
-        values = cls._create_categorical(cls, values, categories, ordered)
+        values = cls._create_categorical(cls, values, categories, ordered,
+                                         dtype=dtype)
         result._data = values
         result.name = name
         for k, v in compat.iteritems(kwargs):
@@ -161,16 +173,27 @@ def _simple_new(cls, values, name=None, categories=None, ordered=None,
 
     @Appender(_index_shared_docs['_shallow_copy'])
     def _shallow_copy(self, values=None, categories=None, ordered=None,
-                      **kwargs):
+                      dtype=None, **kwargs):
         # categories and ordered can't be part of attributes,
         # as these are properties
+        # we want to reuse self.dtype if possible, i.e. neither are
+        # overridden.
+        if dtype is not None and (categories is not None or
+                                  ordered is not None):
+            raise TypeError
+
+        if categories is None and ordered is None:
+            dtype = self.dtype if dtype is None else dtype
+            return super(CategoricalIndex, self)._shallow_copy(
+                values=values, dtype=dtype, **kwargs)
         if categories is None:
             categories = self.categories
         if ordered is None:
             ordered = self.ordered
-        return super(CategoricalIndex,
-                     self)._shallow_copy(values=values, categories=categories,
-                                         ordered=ordered, **kwargs)
+
+        return super(CategoricalIndex, self)._shallow_copy(
+            values=values, categories=categories,
+            ordered=ordered, **kwargs)
 
     def _is_dtype_compat(self, other):
         """
@@ -236,7 +259,7 @@ def _format_attrs(self):
             ('ordered', self.ordered)]
         if self.name is not None:
             attrs.append(('name', ibase.default_pprint(self.name)))
-        attrs.append(('dtype', "'%s'" % self.dtype))
+        attrs.append(('dtype', "'%s'" % self.dtype.name))
         max_seq_items = get_option('display.max_seq_items') or len(self)
         if len(self) > max_seq_items:
             attrs.append(('length', len(self)))
diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py
index 6e80f6c900386d..0cc5515e5cbc27 100644
--- a/pandas/core/indexes/interval.py
+++ b/pandas/core/indexes/interval.py
@@ -950,9 +950,10 @@ def _format_native_types(self, na_rep='', quoting=None, **kwargs):
                                       na_rep=na_rep,
                                       justify='all').get_result()
 
-    def _format_data(self):
+    def _format_data(self, name=None):
 
         # TODO: integrate with categorical and make generic
+        # name argument is unused here; just for compat with base / categorical
         n = len(self)
         max_seq_items = min((get_option(
             'display.max_seq_items') or n) // 10, 10)
diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
index 8b2cf0e7c0b407..f8c141b7e2462e 100644
--- a/pandas/core/indexes/multi.py
+++ b/pandas/core/indexes/multi.py
@@ -490,7 +490,7 @@ def _format_attrs(self):
     def _format_space(self):
         return "\n%s" % (' ' * (len(self.__class__.__name__) + 1))
 
-    def _format_data(self):
+    def _format_data(self, name=None):
         # we are formatting thru the attributes
         return None
 
diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py
index b759abaed4e564..81600f1baa842b 100644
--- a/pandas/core/indexes/range.py
+++ b/pandas/core/indexes/range.py
@@ -189,7 +189,7 @@ def _format_attrs(self):
             attrs.append(('name', ibase.default_pprint(self.name)))
         return attrs
 
-    def _format_data(self):
+    def _format_data(self, name=None):
         # we are formatting thru the attributes
         return None
 
diff --git a/pandas/core/internals.py b/pandas/core/internals.py
index 83b382ec0ed723..e510ca87e44aa7 100644
--- a/pandas/core/internals.py
+++ b/pandas/core/internals.py
@@ -139,14 +139,14 @@ def is_categorical_astype(self, dtype):
         validate that we have a astypeable to categorical,
         returns a boolean if we are a categorical
         """
-        if is_categorical_dtype(dtype):
-            if dtype == CategoricalDtype():
-                return True
-
+        if dtype is Categorical or dtype is CategoricalDtype:
             # this is a pd.Categorical, but is not
             # a valid type for astypeing
             raise TypeError("invalid type {0} for astype".format(dtype))
 
+        elif is_categorical_dtype(dtype):
+            return True
+
         return False
 
     def external_values(self, dtype=None):
@@ -548,6 +548,18 @@ def _astype(self, dtype, copy=False, errors='raise', values=None,
         # may need to convert to categorical
         # this is only called for non-categoricals
         if self.is_categorical_astype(dtype):
+            if (('categories' in kwargs or 'ordered' in kwargs) and
+                    isinstance(dtype, CategoricalDtype)):
+                raise TypeError("Cannot specify a CategoricalDtype and also "
+                                "`categories` or `ordered`. Use "
+                                "`dtype=CategoricalDtype(categories, ordered)`"
+                                " instead.")
+            kwargs = kwargs.copy()
+            categories = getattr(dtype, 'categories', None)
+            ordered = getattr(dtype, 'ordered', False)
+
+            kwargs.setdefault('categories', categories)
+            kwargs.setdefault('ordered', ordered)
             return self.make_block(Categorical(self.values, **kwargs))
 
         # astype processing
diff --git a/pandas/core/series.py b/pandas/core/series.py
index ac11c5f908fdcf..bc84bd09f0b443 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -2987,7 +2987,8 @@ def _try_cast(arr, take_fast_path):
                 subarr = np.array(subarr, dtype=dtype, copy=copy)
         except (ValueError, TypeError):
             if is_categorical_dtype(dtype):
-                subarr = Categorical(arr)
+                subarr = Categorical(arr, dtype.categories,
+                                     ordered=dtype.ordered)
             elif dtype is not None and raise_cast_failure:
                 raise
             else:
diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py
index 12e8d8aba91779..27252b9616a445 100644
--- a/pandas/core/sorting.py
+++ b/pandas/core/sorting.py
@@ -2,7 +2,6 @@
 
 import numpy as np
 from pandas.compat import long, string_types, PY3
-from pandas.core.categorical import Categorical
 from pandas.core.dtypes.common import (
     _ensure_platform_int,
     _ensure_int64,
@@ -183,6 +182,8 @@ def indexer_from_factorized(labels, shape, compress=True):
 
 
 def lexsort_indexer(keys, orders=None, na_position='last'):
+    from pandas.core.categorical import Categorical
+
     labels = []
     shape = []
     if isinstance(orders, bool):
diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py
index 07e993d7ef5092..0c82773b75c289 100644
--- a/pandas/core/util/hashing.py
+++ b/pandas/core/util/hashing.py
@@ -260,7 +260,7 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
 
     # For categoricals, we hash the categories, then remap the codes to the
     # hash values. (This check is above the complex check so that we don't ask
-    # numpy if categorical is a subdtype of complex, as it will choke.
+    # numpy if categorical is a subdtype of complex, as it will choke).
     if is_categorical_dtype(dtype):
         return _hash_categorical(vals, encoding, hash_key)
 
diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py
index 8a36f234484b4a..7827001c3f94c0 100644
--- a/pandas/tests/dtypes/test_common.py
+++ b/pandas/tests/dtypes/test_common.py
@@ -545,10 +545,11 @@ def test_is_complex_dtype():
     (pd.Index([1, 2]), np.dtype('int64')),
     (pd.Index(['a', 'b']), np.dtype(object)),
     ('category', 'category'),
-    (pd.Categorical(['a', 'b']).dtype, CategoricalDtype()),
-    (pd.Categorical(['a', 'b']), CategoricalDtype()),
-    (pd.CategoricalIndex(['a', 'b']).dtype, CategoricalDtype()),
-    (pd.CategoricalIndex(['a', 'b']), CategoricalDtype()),
+    (pd.Categorical(['a', 'b']).dtype, CategoricalDtype(['a', 'b'])),
+    (pd.Categorical(['a', 'b']), CategoricalDtype(['a', 'b'])),
+    (pd.CategoricalIndex(['a', 'b']).dtype, CategoricalDtype(['a', 'b'])),
+    (pd.CategoricalIndex(['a', 'b']), CategoricalDtype(['a', 'b'])),
+    (CategoricalDtype(), CategoricalDtype()),
     (pd.DatetimeIndex([1, 2]), np.dtype('<M8[ns]')),
     (pd.DatetimeIndex([1, 2]).dtype, np.dtype('<M8[ns]')),
     ('<M8[ns]', np.dtype('<M8[ns]')),
diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py
index fb20571213c15e..ccb2ee42438242 100644
--- a/pandas/tests/dtypes/test_dtypes.py
+++ b/pandas/tests/dtypes/test_dtypes.py
@@ -66,21 +66,13 @@ def test_pickle(self):
 
         # force back to the cache
         result = tm.round_trip_pickle(self.dtype)
-
-        # we are a singular object so we are added
-        # back to the cache upon unpickling
-        # this is to ensure object identity
-        assert len(self.dtype._cache) == 1
         assert result == self.dtype
 
     def test_hash_vs_equality(self):
-        # make sure that we satisfy is semantics
         dtype = self.dtype
         dtype2 = CategoricalDtype()
         assert dtype == dtype2
         assert dtype2 == dtype
-        assert dtype is dtype2
-        assert dtype2 is dtype
         assert hash(dtype) == hash(dtype2)
 
     def test_equality(self):
@@ -119,6 +111,11 @@ def test_basic(self):
         assert not is_categorical(np.dtype('float64'))
         assert not is_categorical(1.0)
 
+    def test_tuple_categories(self):
+        categories = [(1, 'a'), (2, 'b'), (3, 'c')]
+        result = CategoricalDtype(categories)
+        assert all(result.categories == categories)
+
 
 class TestDatetimeTZDtype(Base):
 
@@ -524,3 +521,96 @@ def test_caching(self):
         IntervalDtype.reset_cache()
         tm.round_trip_pickle(dtype)
         assert len(IntervalDtype._cache) == 0
+
+
+class TestCategoricalDtypeParametrized(object):
+
+    @pytest.mark.parametrize('categories, ordered', [
+        (['a', 'b', 'c', 'd'], False),
+        (['a', 'b', 'c', 'd'], True),
+        (np.arange(1000), False),
+        (np.arange(1000), True),
+        (['a', 'b', 10, 2, 1.3, True], False),
+        ([True, False], True),
+        ([True, False], False),
+        (pd.date_range('2017', periods=4), True),
+        (pd.date_range('2017', periods=4), False),
+    ])
+    def test_basic(self, categories, ordered):
+        c1 = CategoricalDtype(categories, ordered=ordered)
+        tm.assert_index_equal(c1.categories, pd.Index(categories))
+        assert c1.ordered is ordered
+
+    def test_order_matters(self):
+        categories = ['a', 'b']
+        c1 = CategoricalDtype(categories, ordered=False)
+        c2 = CategoricalDtype(categories, ordered=True)
+        assert c1 is not c2
+
+    def test_unordered_same(self):
+        c1 = CategoricalDtype(['a', 'b'])
+        c2 = CategoricalDtype(['b', 'a'])
+        assert hash(c1) == hash(c2)
+
+    def test_categories(self):
+        result = CategoricalDtype(['a', 'b', 'c'])
+        tm.assert_index_equal(result.categories, pd.Index(['a', 'b', 'c']))
+        assert result.ordered is False
+
+    def test_equal_but_different(self):
+        c1 = CategoricalDtype([1, 2, 3])
+        c2 = CategoricalDtype([1., 2., 3.])
+        assert c1 is not c2
+        assert c1 != c2
+
+    @pytest.mark.parametrize('v1, v2', [
+        ([1, 2, 3], [1, 2, 3]),
+        ([1, 2, 3], [3, 2, 1]),
+    ])
+    def test_order_hashes_different(self, v1, v2):
+        c1 = CategoricalDtype(v1)
+        c2 = CategoricalDtype(v2, ordered=True)
+        assert c1 is not c2
+
+    def test_nan_invalid(self):
+        with pytest.raises(ValueError):
+            CategoricalDtype([1, 2, np.nan])
+
+    def test_non_unique_invalid(self):
+        with pytest.raises(ValueError):
+            CategoricalDtype([1, 2, 1])
+
+    def test_same_categories_different_order(self):
+        c1 = CategoricalDtype(['a', 'b'], ordered=True)
+        c2 = CategoricalDtype(['b', 'a'], ordered=True)
+        assert c1 is not c2
+
+    @pytest.mark.parametrize('ordered, other, expected', [
+        (True, CategoricalDtype(['a', 'b'], True), True),
+        (False, CategoricalDtype(['a', 'b'], False), True),
+        (True, CategoricalDtype(['a', 'b'], False), False),
+        (False, CategoricalDtype(['a', 'b'], True), False),
+        (True, CategoricalDtype([1, 2], False), False),
+        (False, CategoricalDtype([1, 2], True), False),
+        (False, CategoricalDtype(None, True), True),
+        (True, CategoricalDtype(None, True), True),
+        (False, CategoricalDtype(None, False), True),
+        (True, CategoricalDtype(None, False), True),
+        (True, 'category', True),
+        (False, 'category', True),
+        (True, 'not a category', False),
+        (False, 'not a category', False),
+    ])
+    def test_categorical_equality(self, ordered, other, expected):
+        c1 = CategoricalDtype(['a', 'b'], ordered)
+        result = c1 == other
+        assert result == expected
+
+    def test_invalid_raises(self):
+        with tm.assert_raises_regex(TypeError, 'ordered'):
+            CategoricalDtype(['a', 'b'], ordered='foo')
+
+    def test_mixed(self):
+        a = CategoricalDtype(['a', 'b', 1, 2])
+        b = CategoricalDtype(['a', 'b', '1', '2'])
+        assert hash(a) != hash(b)
diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py
index 93514a8a422151..6e9b531dec566d 100644
--- a/pandas/tests/frame/test_analytics.py
+++ b/pandas/tests/frame/test_analytics.py
@@ -2082,6 +2082,9 @@ def test_n_error(self, df_main_dtypes, method, columns):
         df = df_main_dtypes
         error_msg = self.dtype_error_msg_template.format(
             column=columns[1], method=method, dtype=df[columns[1]].dtype)
+        # escape some characters that may be in the repr
+        error_msg = (error_msg.replace('(', '\\(').replace(")", "\\)")
+                              .replace("[", "\\[").replace("]", "\\]"))
         with tm.assert_raises_regex(TypeError, error_msg):
             getattr(df, method)(2, columns)
 
diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py
index cf365465763fab..d8ec23b9c7e0e4 100644
--- a/pandas/tests/indexes/test_category.py
+++ b/pandas/tests/indexes/test_category.py
@@ -654,7 +654,11 @@ def test_equals_categorical(self):
         # make sure that we are testing for category inclusion properly
         ci = CategoricalIndex(list('aabca'), categories=['c', 'a', 'b'])
         assert not ci.equals(list('aabca'))
-        assert not ci.equals(CategoricalIndex(list('aabca')))
+        # Same categories, but different order
+        # Unordered
+        assert ci.equals(CategoricalIndex(list('aabca')))
+        # Ordered
+        assert not ci.equals(CategoricalIndex(list('aabca'), ordered=True))
         assert ci.equals(ci.copy())
 
         ci = CategoricalIndex(list('aabca') + [np.nan],
@@ -666,7 +670,9 @@ def test_equals_categorical(self):
         ci = CategoricalIndex(list('aabca') + [np.nan],
                               categories=['c', 'a', 'b'])
         assert not ci.equals(list('aabca') + [np.nan])
-        assert not ci.equals(CategoricalIndex(list('aabca') + [np.nan]))
+        assert ci.equals(CategoricalIndex(list('aabca') + [np.nan]))
+        assert not ci.equals(CategoricalIndex(list('aabca') + [np.nan],
+                                              ordered=True))
         assert ci.equals(ci.copy())
 
     def test_string_categorical_index_repr(self):
diff --git a/pandas/tests/io/json/test_json_table_schema.py b/pandas/tests/io/json/test_json_table_schema.py
index e447a74b2b4628..a8c7c0cac65d8a 100644
--- a/pandas/tests/io/json/test_json_table_schema.py
+++ b/pandas/tests/io/json/test_json_table_schema.py
@@ -164,7 +164,10 @@ def test_as_json_table_type_string_dtypes(self):
             assert as_json_table_type(t) == 'string'
 
     def test_as_json_table_type_categorical_dtypes(self):
-        assert as_json_table_type(pd.Categorical) == 'any'
+        # TODO: I think before is_categorical_dtype(Categorical)
+        # returned True, but now it's False. Figure out why or
+        # if it matters
+        assert as_json_table_type(pd.Categorical(['a'])) == 'any'
         assert as_json_table_type(CategoricalDtype()) == 'any'
 
 
diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
index 78c72e2a055669..8571970f30d78b 100644
--- a/pandas/tests/io/test_parquet.py
+++ b/pandas/tests/io/test_parquet.py
@@ -6,6 +6,7 @@
 
 import numpy as np
 import pandas as pd
+from distutils.version import LooseVersion
 from pandas.compat import PY3, is_platform_windows
 from pandas.io.parquet import (to_parquet, read_parquet, get_engine,
                                PyArrowImpl, FastParquetImpl)
@@ -364,6 +365,8 @@ def test_unsupported(self, fp):
         self.check_error_on_write(df, fp, ValueError)
 
     def test_categorical(self, fp):
+        if LooseVersion(fastparquet.__version__) < LooseVersion("0.1.3"):
+            pytest.skip("CategoricalDtype not supported for older fp")
         df = pd.DataFrame({'a': pd.Categorical(list('abc'))})
         self.check_round_trip(df, fp, compression=None)
 
diff --git a/pandas/tests/io/test_pytables.py b/pandas/tests/io/test_pytables.py
index 9c488cb2389bed..2fecc40ea65b5e 100644
--- a/pandas/tests/io/test_pytables.py
+++ b/pandas/tests/io/test_pytables.py
@@ -18,6 +18,7 @@
 
 from pandas.compat import is_platform_windows, PY3, PY35, BytesIO, text_type
 from pandas.io.formats.printing import pprint_thing
+from pandas.core.dtypes.common import is_categorical_dtype
 
 tables = pytest.importorskip('tables')
 from pandas.io.pytables import TableIterator
@@ -1090,7 +1091,12 @@ def roundtrip(s, key='data', encoding='latin-1', nan_rep=''):
                          nan_rep=nan_rep)
                 retr = read_hdf(store, key)
                 s_nan = s.replace(nan_rep, np.nan)
-                assert_series_equal(s_nan, retr, check_categorical=False)
+                if is_categorical_dtype(s_nan):
+                    assert is_categorical_dtype(retr)
+                    assert_series_equal(s_nan, retr, check_dtype=False,
+                                        check_categorical=False)
+                else:
+                    assert_series_equal(s_nan, retr)
 
         for s in examples:
             roundtrip(s)
@@ -4845,7 +4851,7 @@ def test_categorical(self):
             # Make sure the metadata is OK
             info = store.info()
             assert '/df2   ' in info
-            assert '/df2/meta/values_block_0/meta' in info
+            # assert '/df2/meta/values_block_0/meta' in info
             assert '/df2/meta/values_block_1/meta' in info
 
             # unordered
diff --git a/pandas/tests/reshape/test_merge.py b/pandas/tests/reshape/test_merge.py
index 338596d1523e4f..df75983a29d80f 100644
--- a/pandas/tests/reshape/test_merge.py
+++ b/pandas/tests/reshape/test_merge.py
@@ -1468,8 +1468,6 @@ def test_other_columns(self, left, right):
 
     @pytest.mark.parametrize(
         'change', [lambda x: x,
-                   lambda x: x.astype('category',
-                                      categories=['bar', 'foo']),
                    lambda x: x.astype('category',
                                       categories=['foo', 'bar', 'bah']),
                    lambda x: x.astype('category', ordered=True)])
@@ -1481,7 +1479,7 @@ def test_dtype_on_merged_different(self, change, how, left, right):
         X = change(right.X.astype('object'))
         right = right.assign(X=X)
         assert is_categorical_dtype(left.X.values)
-        assert not left.X.values.is_dtype_equal(right.X.values)
+        # assert not left.X.values.is_dtype_equal(right.X.values)
 
         merged = pd.merge(left, right, on='X', how=how)
 
diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py
index f1d044f7a11325..914181dc941549 100644
--- a/pandas/tests/series/test_analytics.py
+++ b/pandas/tests/series/test_analytics.py
@@ -1756,7 +1756,6 @@ class TestNLargestNSmallest(object):
               # not supported on some archs
               # Series([3., 2, 1, 2, 5], dtype='complex256'),
               Series([3., 2, 1, 2, 5], dtype='complex128'),
-              Series(list('abcde'), dtype='category'),
               Series(list('abcde'))])
     def test_error(self, r):
         dt = r.dtype
@@ -1768,6 +1767,16 @@ def test_error(self, r):
             with tm.assert_raises_regex(TypeError, msg):
                 method(arg)
 
+    def test_error_categorical_dtype(self):
+        # same as test_error, but regex hard to escape properly
+        msg = ("Cannot use method 'n(larg|small)est' with dtype "
+               "CategoricalDtype.+")
+        with tm.assert_raises_regex(TypeError, msg):
+            Series(list('ab'), dtype='category').nlargest(2)
+
+        with tm.assert_raises_regex(TypeError, msg):
+            Series(list('ab'), dtype='category').nsmallest(2)
+
     @pytest.mark.parametrize(
         "s",
         [v for k, v in s_main_dtypes().iteritems()])
diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py
index 3b95c2803dd9e6..df7d7a946e881b 100644
--- a/pandas/tests/series/test_constructors.py
+++ b/pandas/tests/series/test_constructors.py
@@ -10,6 +10,7 @@
 import numpy.ma as ma
 import pandas as pd
 
+from pandas.api.types import CategoricalDtype
 from pandas.core.dtypes.common import (
     is_categorical_dtype,
     is_datetime64tz_dtype)
@@ -157,6 +158,26 @@ def test_constructor_categorical(self):
         assert is_categorical_dtype(s)
         assert is_categorical_dtype(s.dtype)
 
+    def test_constructor_categorical_dtype(self):
+        result = pd.Series(['a', 'b'],
+                           dtype=CategoricalDtype(['a', 'b', 'c'],
+                                                  ordered=True))
+        assert is_categorical_dtype(result) is True
+        tm.assert_index_equal(result.cat.categories, pd.Index(['a', 'b', 'c']))
+        assert result.cat.ordered
+
+        result = pd.Series(['a', 'b'], dtype=CategoricalDtype(['b', 'a']))
+        assert is_categorical_dtype(result)
+        tm.assert_index_equal(result.cat.categories, pd.Index(['b', 'a']))
+        assert result.cat.ordered is False
+
+    def test_unordered_compare_equal(self):
+        left = pd.Series(['a', 'b', 'c'],
+                         dtype=CategoricalDtype(['a', 'b']))
+        right = pd.Series(pd.Categorical(['a', 'b', np.nan],
+                                         categories=['a', 'b']))
+        tm.assert_series_equal(left, right)
+
     def test_constructor_maskedarray(self):
         data = ma.masked_all((3, ), dtype=float)
         result = Series(data)
diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py
index c214280ee8386a..b432640ea11ba5 100644
--- a/pandas/tests/series/test_dtypes.py
+++ b/pandas/tests/series/test_dtypes.py
@@ -12,7 +12,11 @@
 from numpy import nan
 import numpy as np
 
-from pandas import Series, Timestamp, Timedelta, DataFrame, date_range
+from pandas import (
+    Series, Timestamp, Timedelta, DataFrame, date_range,
+    Categorical, Index
+)
+from pandas.api.types import CategoricalDtype
 
 from pandas.compat import lrange, range, u
 from pandas import compat
@@ -182,6 +186,34 @@ def test_astype_dict_like(self, dtype_class):
         with pytest.raises(KeyError):
             s.astype(dt5)
 
+    def test_astype_categoricaldtype(self):
+        s = Series(['a', 'b', 'a'])
+        result = s.astype(CategoricalDtype(['a', 'b'], ordered=True))
+        expected = Series(Categorical(['a', 'b', 'a'], ordered=True))
+        tm.assert_series_equal(result, expected)
+
+        result = s.astype(CategoricalDtype(['a', 'b'], ordered=False))
+        expected = Series(Categorical(['a', 'b', 'a'], ordered=False))
+        tm.assert_series_equal(result, expected)
+
+        result = s.astype(CategoricalDtype(['a', 'b', 'c'], ordered=False))
+        expected = Series(Categorical(['a', 'b', 'a'],
+                                      categories=['a', 'b', 'c'],
+                                      ordered=False))
+        tm.assert_series_equal(result, expected)
+        tm.assert_index_equal(result.cat.categories, Index(['a', 'b', 'c']))
+
+    def test_astype_categoricaldtype_with_args(self):
+        s = Series(['a', 'b'])
+        type_ = CategoricalDtype(['a', 'b'])
+
+        with pytest.raises(TypeError):
+            s.astype(type_, ordered=True)
+        with pytest.raises(TypeError):
+            s.astype(type_, categories=['a', 'b'])
+        with pytest.raises(TypeError):
+            s.astype(type_, categories=['a', 'b'], ordered=False)
+
     def test_astype_generic_timestamp_deprecated(self):
         # see gh-15524
         data = [1]
diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py
index b26089ea7a8226..3694bba594adb2 100644
--- a/pandas/tests/test_algos.py
+++ b/pandas/tests/test_algos.py
@@ -760,55 +760,57 @@ def test_duplicated_with_nas(self):
         expected = np.array(trues + trues)
         tm.assert_numpy_array_equal(result, expected)
 
-    def test_numeric_object_likes(self):
-        cases = [np.array([1, 2, 1, 5, 3,
-                           2, 4, 1, 5, 6]),
-                 np.array([1.1, 2.2, 1.1, np.nan, 3.3,
-                           2.2, 4.4, 1.1, np.nan, 6.6]),
-                 np.array([1 + 1j, 2 + 2j, 1 + 1j, 5 + 5j, 3 + 3j,
-                           2 + 2j, 4 + 4j, 1 + 1j, 5 + 5j, 6 + 6j]),
-                 np.array(['a', 'b', 'a', 'e', 'c',
-                           'b', 'd', 'a', 'e', 'f'], dtype=object),
-                 np.array([1, 2**63, 1, 3**5, 10,
-                           2**63, 39, 1, 3**5, 7], dtype=np.uint64)]
-
+    @pytest.mark.parametrize('case', [
+        np.array([1, 2, 1, 5, 3,
+                  2, 4, 1, 5, 6]),
+        np.array([1.1, 2.2, 1.1, np.nan, 3.3,
+                  2.2, 4.4, 1.1, np.nan, 6.6]),
+        pytest.mark.xfail(resaon="Complex bug. GH 16399")(
+            np.array([1 + 1j, 2 + 2j, 1 + 1j, 5 + 5j, 3 + 3j,
+                     2 + 2j, 4 + 4j, 1 + 1j, 5 + 5j, 6 + 6j])
+        ),
+        np.array(['a', 'b', 'a', 'e', 'c',
+                  'b', 'd', 'a', 'e', 'f'], dtype=object),
+        np.array([1, 2**63, 1, 3**5, 10, 2**63, 39, 1, 3**5, 7],
+                 dtype=np.uint64),
+    ])
+    def test_numeric_object_likes(self, case):
         exp_first = np.array([False, False, True, False, False,
                               True, False, True, True, False])
         exp_last = np.array([True, True, True, True, False,
                              False, False, False, False, False])
         exp_false = exp_first | exp_last
 
-        for case in cases:
-            res_first = algos.duplicated(case, keep='first')
-            tm.assert_numpy_array_equal(res_first, exp_first)
+        res_first = algos.duplicated(case, keep='first')
+        tm.assert_numpy_array_equal(res_first, exp_first)
 
-            res_last = algos.duplicated(case, keep='last')
-            tm.assert_numpy_array_equal(res_last, exp_last)
+        res_last = algos.duplicated(case, keep='last')
+        tm.assert_numpy_array_equal(res_last, exp_last)
 
-            res_false = algos.duplicated(case, keep=False)
-            tm.assert_numpy_array_equal(res_false, exp_false)
+        res_false = algos.duplicated(case, keep=False)
+        tm.assert_numpy_array_equal(res_false, exp_false)
 
-            # index
-            for idx in [pd.Index(case), pd.Index(case, dtype='category')]:
-                res_first = idx.duplicated(keep='first')
-                tm.assert_numpy_array_equal(res_first, exp_first)
+        # index
+        for idx in [pd.Index(case), pd.Index(case, dtype='category')]:
+            res_first = idx.duplicated(keep='first')
+            tm.assert_numpy_array_equal(res_first, exp_first)
 
-                res_last = idx.duplicated(keep='last')
-                tm.assert_numpy_array_equal(res_last, exp_last)
+            res_last = idx.duplicated(keep='last')
+            tm.assert_numpy_array_equal(res_last, exp_last)
 
-                res_false = idx.duplicated(keep=False)
-                tm.assert_numpy_array_equal(res_false, exp_false)
+            res_false = idx.duplicated(keep=False)
+            tm.assert_numpy_array_equal(res_false, exp_false)
 
-            # series
-            for s in [Series(case), Series(case, dtype='category')]:
-                res_first = s.duplicated(keep='first')
-                tm.assert_series_equal(res_first, Series(exp_first))
+        # series
+        for s in [Series(case), Series(case, dtype='category')]:
+            res_first = s.duplicated(keep='first')
+            tm.assert_series_equal(res_first, Series(exp_first))
 
-                res_last = s.duplicated(keep='last')
-                tm.assert_series_equal(res_last, Series(exp_last))
+            res_last = s.duplicated(keep='last')
+            tm.assert_series_equal(res_last, Series(exp_last))
 
-                res_false = s.duplicated(keep=False)
-                tm.assert_series_equal(res_false, Series(exp_false))
+            res_false = s.duplicated(keep=False)
+            tm.assert_series_equal(res_false, Series(exp_false))
 
     def test_datetime_likes(self):
 
diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py
index 8a5f6bf110be32..7f16d07adbe8b0 100644
--- a/pandas/tests/test_categorical.py
+++ b/pandas/tests/test_categorical.py
@@ -123,6 +123,26 @@ def test_constructor_empty(self):
         expected = pd.Int64Index([1, 2, 3])
         tm.assert_index_equal(c.categories, expected)
 
+    def test_constructor_tuples(self):
+        values = np.array([(1,), (1, 2), (1,), (1, 2)], dtype=object)
+        result = Categorical(values)
+        expected = Index([(1,), (1, 2)], tupleize_cols=False)
+        tm.assert_index_equal(result.categories, expected)
+        assert result.ordered is False
+
+    def test_constructor_tuples_datetimes(self):
+        # numpy will auto reshape when all of the tuples are the
+        # same len, so add an extra one with 2 items and slice it off
+        values = np.array([(Timestamp('2010-01-01'),),
+                           (Timestamp('2010-01-02'),),
+                           (Timestamp('2010-01-01'),),
+                           (Timestamp('2010-01-02'),),
+                           ('a', 'b')], dtype=object)[:-1]
+        result = Categorical(values)
+        expected = Index([(Timestamp('2010-01-01'),),
+                          (Timestamp('2010-01-02'),)], tupleize_cols=False)
+        tm.assert_index_equal(result.categories, expected)
+
     def test_constructor_unsortable(self):
 
         # it works!
@@ -154,13 +174,13 @@ def test_is_equal_dtype(self):
         assert c1.is_dtype_equal(c1)
         assert c2.is_dtype_equal(c2)
         assert c3.is_dtype_equal(c3)
-        assert not c1.is_dtype_equal(c2)
+        assert c1.is_dtype_equal(c2)
         assert not c1.is_dtype_equal(c3)
         assert not c1.is_dtype_equal(Index(list('aabca')))
         assert not c1.is_dtype_equal(c1.astype(object))
         assert c1.is_dtype_equal(CategoricalIndex(c1))
-        assert not (c1.is_dtype_equal(
-            CategoricalIndex(c1, categories=list('cab'))))
+        assert (c1.is_dtype_equal(
+            CategoricalIndex(c1, categories=list('cab'))))  # XXX: changed
         assert not c1.is_dtype_equal(CategoricalIndex(c1, ordered=True))
 
     def test_constructor(self):
@@ -413,6 +433,27 @@ def test_constructor_invariant(self):
             c2 = Categorical(c)
             tm.assert_categorical_equal(c, c2)
 
+    @pytest.mark.parametrize('ordered', [True, False])
+    def test_constructor_with_dtype(self, ordered):
+        categories = ['b', 'a', 'c']
+        dtype = CategoricalDtype(categories, ordered=ordered)
+        result = pd.Categorical(['a', 'b', 'a', 'c'], dtype=dtype)
+        expected = pd.Categorical(['a', 'b', 'a', 'c'], categories=categories,
+                                  ordered=ordered)
+        tm.assert_categorical_equal(result, expected)
+        assert result.ordered is ordered
+
+    def test_constructor_dtype_and_others_raises(self):
+        dtype = CategoricalDtype(['a', 'b'], ordered=True)
+        with tm.assert_raises_regex(ValueError, "Cannot"):
+            Categorical(['a', 'b'], categories=['a', 'b'], dtype=dtype)
+
+        with tm.assert_raises_regex(ValueError, "Cannot"):
+            Categorical(['a', 'b'], ordered=True, dtype=dtype)
+
+        with tm.assert_raises_regex(ValueError, "Cannot"):
+            Categorical(['a', 'b'], ordered=False, dtype=dtype)
+
     def test_from_codes(self):
 
         # too few categories
@@ -624,6 +665,11 @@ def test_categories_none(self):
                               'a', 'c', 'c', 'c'], ordered=True)
         tm.assert_categorical_equal(factor, self.factor)
 
+    def test_set_categories_inplace(self):
+        cat = self.factor.copy()
+        cat.set_categories(['a', 'b', 'c', 'd'], inplace=True)
+        tm.assert_index_equal(cat.categories, pd.Index(['a', 'b', 'c', 'd']))
+
     def test_describe(self):
         # string type
         desc = self.factor.describe()
@@ -834,6 +880,60 @@ def test_ordered_api(self):
         tm.assert_index_equal(cat4.categories, Index(['b', 'c', 'a']))
         assert cat4.ordered
 
+    def test_set_dtype_same(self):
+        c = Categorical(['a', 'b', 'c'])
+        result = c._set_dtype(CategoricalDtype(['a', 'b', 'c']))
+        tm.assert_categorical_equal(result, c)
+
+    def test_set_dtype_new_categories(self):
+        c = Categorical(['a', 'b', 'c'])
+        result = c._set_dtype(CategoricalDtype(['a', 'b', 'c', 'd']))
+        tm.assert_numpy_array_equal(result.codes, c.codes)
+        tm.assert_index_equal(result.dtype.categories,
+                              pd.Index(['a', 'b', 'c', 'd']))
+
+    def test_set_dtype_nans(self):
+        c = Categorical(['a', 'b', np.nan])
+        result = c._set_dtype(CategoricalDtype(['a', 'c']))
+        tm.assert_numpy_array_equal(result.codes, np.array([0, -1, -1],
+                                                           dtype='int8'))
+
+    @pytest.mark.parametrize('values, categories, new_categories', [
+        # No NaNs, same cats, same order
+        (['a', 'b', 'a'], ['a', 'b'], ['a', 'b'],),
+        # No NaNs, same cats, different order
+        (['a', 'b', 'a'], ['a', 'b'], ['b', 'a'],),
+        # Same, unsorted
+        (['b', 'a', 'a'], ['a', 'b'], ['a', 'b'],),
+        # No NaNs, same cats, different order
+        (['b', 'a', 'a'], ['a', 'b'], ['b', 'a'],),
+        # NaNs
+        (['a', 'b', 'c'], ['a', 'b'], ['a', 'b']),
+        (['a', 'b', 'c'], ['a', 'b'], ['b', 'a']),
+        (['b', 'a', 'c'], ['a', 'b'], ['a', 'b']),
+        (['b', 'a', 'c'], ['a', 'b'], ['a', 'b']),
+        # Introduce NaNs
+        (['a', 'b', 'c'], ['a', 'b'], ['a']),
+        (['a', 'b', 'c'], ['a', 'b'], ['b']),
+        (['b', 'a', 'c'], ['a', 'b'], ['a']),
+        (['b', 'a', 'c'], ['a', 'b'], ['a']),
+        # No overlap
+        (['a', 'b', 'c'], ['a', 'b'], ['d', 'e']),
+    ])
+    @pytest.mark.parametrize('ordered', [True, False])
+    def test_set_dtype_many(self, values, categories, new_categories,
+                            ordered):
+        c = Categorical(values, categories)
+        expected = Categorical(values, new_categories, ordered)
+        result = c._set_dtype(expected.dtype)
+        tm.assert_categorical_equal(result, expected)
+
+    def test_set_dtype_no_overlap(self):
+        c = Categorical(['a', 'b', 'c'], ['d', 'e'])
+        result = c._set_dtype(CategoricalDtype(['a', 'b']))
+        expected = Categorical([None, None, None], categories=['a', 'b'])
+        tm.assert_categorical_equal(result, expected)
+
     def test_set_ordered(self):
 
         cat = Categorical(["a", "b", "c", "a"], ordered=True)
@@ -1507,7 +1607,7 @@ def test_shift(self):
 
     def test_nbytes(self):
         cat = pd.Categorical([1, 2, 3])
-        exp = cat._codes.nbytes + cat._categories.values.nbytes
+        exp = 3 + 3 * 8  # 3 int8s for values + 3 int64s for categories
         assert cat.nbytes == exp
 
     def test_memory_usage(self):
@@ -1681,6 +1781,13 @@ def test_validate_inplace(self):
             with pytest.raises(ValueError):
                 cat.sort_values(inplace=value)
 
+    @pytest.mark.xfail(reason="Imaginary values not supported in Categorical")
+    def test_imaginary(self):
+        values = [1, 2, 3 + 1j]
+        c1 = pd.Categorical(values)
+        tm.assert_index_equal(c1.categories, pd.Index(values))
+        tm.assert_numpy_array_equal(np.array(c1), np.array(values))
+
 
 class TestCategoricalAsBlock(object):
 
@@ -2113,15 +2220,18 @@ def test_assignment_to_dataframe(self):
 
         result = df.dtypes
         expected = Series(
-            [np.dtype('int32'), CategoricalDtype()], index=['value', 'D'])
+            [np.dtype('int32'), CategoricalDtype(categories=labels,
+                                                 ordered=False)],
+            index=['value', 'D'])
         tm.assert_series_equal(result, expected)
 
         df['E'] = s
         str(df)
 
         result = df.dtypes
-        expected = Series([np.dtype('int32'), CategoricalDtype(),
-                           CategoricalDtype()],
+        expected = Series([np.dtype('int32'),
+                           CategoricalDtype(categories=labels, ordered=False),
+                           CategoricalDtype(categories=labels, ordered=False)],
                           index=['value', 'D', 'E'])
         tm.assert_series_equal(result, expected)
 
@@ -4031,7 +4141,7 @@ def test_categorical_index_preserver(self):
 
         # wrong catgories
         df3 = DataFrame({'A': a,
-                         'B': pd.Categorical(b, categories=list('abc'))
+                         'B': pd.Categorical(b, categories=list('abe'))  # XXX
                          }).set_index('B')
         pytest.raises(TypeError, lambda: pd.concat([df2, df3]))
 
diff --git a/pandas/util/testing.py b/pandas/util/testing.py
index 7dac83953ad8f7..65f095a1406ca7 100644
--- a/pandas/util/testing.py
+++ b/pandas/util/testing.py
@@ -1244,7 +1244,12 @@ def assert_series_equal(left, right, check_dtype=True,
                        obj='{obj}.index'.format(obj=obj))
 
     if check_dtype:
-        assert_attr_equal('dtype', left, right)
+        if (is_categorical_dtype(left) and is_categorical_dtype(right) and
+                not check_categorical):
+            # compat with pandas 0.21.0 CategoricalDtype
+            pass
+        else:
+            assert_attr_equal('dtype', left, right)
 
     if check_exact:
         assert_numpy_array_equal(left.get_values(), right.get_values(),