DEPR: __array__ for tz-aware Series/Index
This deprecates the current behavior when converting tz-aware Series
or Index to an ndarray. Previously, we converted to M8[ns], throwing
away the timezone information. In the future, we will return an
object-dtype array filled with Timestamps, each of which has the correct
tz.

```python
In [1]: import pandas as pd; import numpy as np

In [2]: ser = pd.Series(pd.date_range('2000', periods=2, tz="CET"))

In [3]: np.asarray(ser)
/bin/ipython:1: FutureWarning: Converting timezone-aware DatetimeArray to timezone-naive ndarray with 'datetime64[ns]' dtype. In the future, this will return an ndarray with 'object' dtype where each element is a 'pandas.Timestamp' with the correct 'tz'.
        To accept the future behavior, pass 'dtype=object'.
        To keep the old behavior, pass 'dtype="datetime64[ns]"'.
  #!/Users/taugspurger/Envs/pandas-dev/bin/python3
Out[3]:
array(['1999-12-31T23:00:00.000000000', '2000-01-01T23:00:00.000000000'],
      dtype='datetime64[ns]')
```
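
Either behavior can already be selected explicitly, which also silences the warning; a short sketch following the message above (continuing the session, so `ser` and `np` are the ones defined there):

```python
# Future behavior: object-dtype ndarray of tz-aware Timestamps
np.asarray(ser, dtype=object)

# Old behavior: timezone-naive datetime64[ns] values (converted to UTC)
np.asarray(ser, dtype="datetime64[ns]")
```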

xref pandas-dev#23569
TomAugspurger committed Jan 3, 2019
1 parent 62506ca commit 9aa0413
Showing 16 changed files with 301 additions and 27 deletions.
56 changes: 55 additions & 1 deletion doc/source/whatsnew/v0.24.0.rst
@@ -1228,7 +1228,7 @@ Deprecations
.. _whatsnew_0240.deprecations.datetimelike_int_ops:

Integer Addition/Subtraction with Datetimes and Timedeltas is Deprecated
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

In the past, users could—in some cases—add or subtract integers or integer-dtype
arrays from :class:`Timestamp`, :class:`DatetimeIndex` and :class:`TimedeltaIndex`.
@@ -1266,6 +1266,60 @@ the object's ``freq`` attribute (:issue:`21939`, :issue:`23878`).
    dti = pd.date_range('2001-01-01', periods=2, freq='7D')
    dti + pd.Index([1 * dti.freq, 2 * dti.freq])

.. _whatsnew_0240.deprecations.tz_aware_array:

Converting Timezone-Aware Series and Index to NumPy Arrays
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

The conversion from a :class:`Series` or :class:`Index` with timezone-aware
datetime data will change to preserve timezones by default (:issue:`23569`).

NumPy doesn't have a dedicated dtype for timezone-aware datetimes.
In the past, a :class:`Series` or :class:`DatetimeIndex` with
timezone-aware datetimes was converted to a NumPy array by

1. converting the tz-aware data to UTC
2. dropping the timezone information
3. returning a :class:`numpy.ndarray` with ``datetime64[ns]`` dtype

Future versions of pandas will preserve the timezone information by returning an
object-dtype NumPy array where each value is a :class:`Timestamp` with the correct
timezone attached.

.. ipython:: python

    ser = pd.Series(pd.date_range('2000', periods=2, tz="CET"))
    ser

The default behavior remains the same, but issues a warning:

.. code-block:: python

    In [8]: np.asarray(ser)
    /bin/ipython:1: FutureWarning: Converting timezone-aware DatetimeArray to timezone-naive
            ndarray with 'datetime64[ns]' dtype. In the future, this will return an ndarray
            with 'object' dtype where each element is a 'pandas.Timestamp' with the correct 'tz'.
            To accept the future behavior, pass 'dtype=object'.
            To keep the old behavior, pass 'dtype="datetime64[ns]"'.
      #!/bin/python3
    Out[8]:
    array(['1999-12-31T23:00:00.000000000', '2000-01-01T23:00:00.000000000'],
          dtype='datetime64[ns]')

The old or new behavior can be obtained by specifying the ``dtype``:

.. ipython:: python
    :okwarning:

    # Old behavior
    np.asarray(ser, dtype='datetime64[ns]')

    # New behavior
    np.asarray(ser, dtype=object)

.. _whatsnew_0240.prior_deprecations:

Removal of prior version deprecations/changes
2 changes: 1 addition & 1 deletion pandas/core/arrays/datetimes.py
@@ -522,7 +522,7 @@ def _resolution(self):
    # Array-Like / EA-Interface Methods

    def __array__(self, dtype=None):
        if is_object_dtype(dtype):
        if is_object_dtype(dtype) or (dtype is None and self.tz):
            return np.array(list(self), dtype=object)
        elif is_int64_dtype(dtype):
            return self.asi8
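
For orientation, the array-level effect of this one-line change (it mirrors test_array_interface_tz added below): with no explicit dtype, a tz-aware DatetimeArray now converts to an object-dtype array of tz-aware Timestamps, while an explicit 'M8[ns]' keeps the old tz-naive values. A minimal sketch; the import path is the 0.24-era internal location and is an assumption:

```python
import numpy as np
import pandas as pd
from pandas.core.arrays import DatetimeArray  # internal API at the time; assumed import path

arr = DatetimeArray(pd.date_range('2017', periods=2, tz='US/Central'))

# dtype=None on a tz-aware array -> object dtype, each Timestamp keeps its tz
np.asarray(arr)

# explicit 'M8[ns]' -> old behavior: tz dropped, values converted to UTC
np.asarray(arr, dtype='M8[ns]')
```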
2 changes: 1 addition & 1 deletion pandas/core/dtypes/cast.py
@@ -1020,7 +1020,7 @@ def maybe_cast_to_datetime(value, dtype, errors='raise'):
                    # datetime64tz is assumed to be naive which should
                    # be localized to the timezone.
                    is_dt_string = is_string_dtype(value)
                    value = to_datetime(value, errors=errors)
                    value = to_datetime(value, errors=errors).array
                    if is_dt_string:
                        # Strings here are naive, so directly localize
                        value = value.tz_localize(dtype.tz)
6 changes: 6 additions & 0 deletions pandas/core/dtypes/dtypes.py
@@ -318,6 +318,7 @@ def _hash_categories(categories, ordered=True):
        from pandas.core.util.hashing import (
            hash_array, _combine_hash_arrays, hash_tuples
        )
        from pandas.core.dtypes.common import is_datetime64tz_dtype, _NS_DTYPE

        if len(categories) and isinstance(categories[0], tuple):
            # assumes if any individual category is a tuple, then all our. ATM
@@ -335,6 +336,11 @@ def _hash_categories(categories, ordered=True):
            # find a better solution
            hashed = hash((tuple(categories), ordered))
            return hashed

        if is_datetime64tz_dtype(categories.dtype):
            # Avoid future warning.
            categories = categories.astype(_NS_DTYPE)

        cat_array = hash_array(np.asarray(categories), categorize=False)
        if ordered:
            cat_array = np.vstack([
23 changes: 20 additions & 3 deletions pandas/core/groupby/groupby.py
@@ -26,7 +26,8 @@ class providing the base-class of operations.

from pandas.core.dtypes.cast import maybe_downcast_to_dtype
from pandas.core.dtypes.common import (
    ensure_float, is_extension_array_dtype, is_numeric_dtype, is_scalar)
    _NS_DTYPE, ensure_float, is_datetime64tz_dtype, is_extension_array_dtype,
    is_numeric_dtype, is_scalar)
from pandas.core.dtypes.missing import isna, notna

import pandas.core.algorithms as algorithms
@@ -1269,10 +1270,18 @@ def f(self, **kwargs):
    return f

def first_compat(x, axis=0):
    # This is a bit strange.
    # We only hit this block when grouping a DatetimeTZBlock *and*
    # a categorical. Something strange going on with first for
    # categorical data.
    if is_datetime64tz_dtype(x.dtype):
        dtype = _NS_DTYPE
    else:
        dtype = None

    def first(x):

        x = np.asarray(x)
        x = np.asarray(x, dtype=dtype)
        x = x[notna(x)]
        if len(x) == 0:
            return np.nan
@@ -1284,10 +1293,18 @@ def first(x):
        return first(x)

def last_compat(x, axis=0):
    # This is a bit strange.
    # We only hit this block when grouping a DatetimeTZBlock *and*
    # a categorical. Something strange going on with first for
    # categorical data.
    if is_datetime64tz_dtype(x.dtype):
        dtype = _NS_DTYPE
    else:
        dtype = None

    def last(x):

        x = np.asarray(x)
        x = np.asarray(x, dtype=dtype)
        x = x[notna(x)]
        if len(x) == 0:
            return np.nan
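
The situation those comments describe — a grouped frame holding both a tz-aware datetime column and a categorical column — looks roughly like the sketch below (hypothetical data; that .first()/.last() reach first_compat/last_compat here is an assumption based on the comment above):

```python
import pandas as pd

df = pd.DataFrame({
    'key': ['a', 'a', 'b'],
    'cat': pd.Categorical(['x', 'y', 'x']),
    'ts': pd.date_range('2000', periods=3, tz='US/Central'),
})

# With the change above, the tz-aware column is coerced to 'M8[ns]' inside
# first_compat/last_compat, so np.asarray no longer triggers the new warning.
df.groupby('key').first()
df.groupby('key').last()
```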
16 changes: 15 additions & 1 deletion pandas/core/indexes/datetimes.py
@@ -339,6 +339,21 @@ def _simple_new(cls, values, name=None, freq=None, tz=None, dtype=None):

    # --------------------------------------------------------------------

    def __array__(self, dtype=None):
        if dtype is None and isinstance(self._data, DatetimeArray)\
                and getattr(self.dtype, 'tz', None):
            msg = (
                "Converting timezone-aware DatetimeArray to timezone-naive "
                "ndarray with 'datetime64[ns]' dtype. In the future, this "
                "will return an ndarray with 'object' dtype where each "
                "element is a 'pandas.Timestamp' with the correct 'tz'.\n\t"
                "To accept the future behavior, pass 'dtype=object'.\n\t"
                "To keep the old behavior, pass 'dtype=\"datetime64[ns]\"'."
            )
            warnings.warn(msg, FutureWarning, stacklevel=3)
            dtype = 'M8[ns]'
        return np.asarray(self._data, dtype=dtype)

    @property
    def dtype(self):
        return self._eadata.dtype
@@ -1114,7 +1129,6 @@ def slice_indexer(self, start=None, end=None, step=None, kind=None):

    strftime = ea_passthrough(DatetimeArray.strftime)
    _has_same_tz = ea_passthrough(DatetimeArray._has_same_tz)
    __array__ = ea_passthrough(DatetimeArray.__array__)

    @property
    def offset(self):
7 changes: 6 additions & 1 deletion pandas/core/indexing.py
@@ -581,7 +581,12 @@ def can_do_equal_len():
                        setter(item, v)

                # we have an equal len ndarray/convertible to our labels
                elif np.array(value).ndim == 2:
                # hasattr first, to avoid coercing to ndarray without reason.
                # But we may be relying on the ndarray coercion to check ndim.
                # Why not just convert to an ndarray earlier on if needed?
                elif ((hasattr(value, 'ndim') and value.ndim == 2)
                      or (not hasattr(value, 'ndim') and
                          np.array(value).ndim) == 2):

                    # note that this coerces the dtype if we are mixed
                    # GH 7551
40 changes: 40 additions & 0 deletions pandas/core/internals/blocks.py
@@ -2400,6 +2400,12 @@ def get_values(self, dtype=None):
            values = values.reshape(1, -1)
        return values

    def to_dense(self):
        # we request M8[ns] dtype here, even though it discards tzinfo,
        # as lots of code (e.g. anything using values_from_object)
        # expects that behavior.
        return np.asarray(self.values, dtype=_NS_DTYPE)

    def _slice(self, slicer):
        """ return a slice of my values """
        if isinstance(slicer, tuple):
@@ -2544,6 +2550,40 @@ def setitem(self, indexer, value):
                          klass=ObjectBlock,)
        return newb.setitem(indexer, value)

    def quantile(self, qs, interpolation='linear', axis=0, axes=None):
        # TODO: Add quantile as a reduction method.
        # We can't just use Block.quantile, as that converts the DTA
        # to an ndarray[object] via get_values.
        # This method
        # 1. Convert DatetimeTZBlock -> DatetimeBlock
        # 2. Perform the op via Block.quantile
        # 3. Converts back to tz-aware
        # Alternatively, we could special case the call to `get_values`
        # in Block.quantile for DatetimeTZ.

        new_values = np.asarray(self.values, dtype=_NS_DTYPE)
        if self.ndim == 2:
            new_values = new_values[None, :]

        new_block = DatetimeBlock(new_values, placement=self.mgr_locs)

        ax, naive = new_block.quantile(qs, interpolation=interpolation,
                                       axis=axis, axes=axes)

        ndim = getattr(naive, 'ndim', None) or 0
        if ndim == 0:
            return ax, self.make_block_scalar(
                tslibs.Timestamp(naive.values.value, tz=self.values.tz)
            )
        else:
            naive = naive.values.ravel()

        result = DatetimeArray(naive, dtype=self.values.dtype)

        return ax, make_block(result,
                              placement=np.arange(len(result)),
                              ndim=ndim)


class TimeDeltaBlock(DatetimeLikeBlockMixin, IntBlock):
    __slots__ = ()
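The new DatetimeTZBlock.quantile keeps tz-aware quantiles working by round-tripping through a naive DatetimeBlock, as the numbered comment explains. At the user level that corresponds to something like this sketch (hypothetical data; that DataFrame.quantile with numeric_only=False dispatches to this block method is an assumption):

```python
import pandas as pd

df = pd.DataFrame({'ts': pd.date_range('2000', periods=4, tz='CET')})

# Internally: cast to 'M8[ns]', run the naive Block.quantile,
# then rebuild a DatetimeArray with the original tz-aware dtype.
df.quantile(0.5, numeric_only=False)
```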
6 changes: 4 additions & 2 deletions pandas/core/internals/construction.py
@@ -34,6 +34,7 @@
from pandas.core.indexes import base as ibase
from pandas.core.internals import (
    create_block_manager_from_arrays, create_block_manager_from_blocks)
from pandas.core.internals.arrays import extract_array

# ---------------------------------------------------------------------
# BlockManager Interface
@@ -539,7 +540,6 @@ def sanitize_array(data, index, dtype=None, copy=False,
    Sanitize input data to an ndarray, copy if specified, coerce to the
    dtype if specified.
    """

    if dtype is not None:
        dtype = pandas_dtype(dtype)

@@ -551,8 +551,10 @@
    else:
        data = data.copy()

    data = extract_array(data, extract_numpy=True)

    # GH#846
    if isinstance(data, (np.ndarray, Index, ABCSeries)):
    if isinstance(data, np.ndarray):

        if dtype is not None:
            subarr = np.array(data, copy=False)
7 changes: 5 additions & 2 deletions pandas/core/reshape/tile.py
@@ -8,7 +8,7 @@
from pandas._libs.lib import infer_dtype

from pandas.core.dtypes.common import (
    ensure_int64, is_categorical_dtype, is_datetime64_dtype,
    _NS_DTYPE, ensure_int64, is_categorical_dtype, is_datetime64_dtype,
    is_datetime64tz_dtype, is_datetime_or_timedelta_dtype, is_integer,
    is_scalar, is_timedelta64_dtype)
from pandas.core.dtypes.missing import isna
@@ -226,7 +226,10 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3,
            raise ValueError('Overlapping IntervalIndex is not accepted.')

    else:
        bins = np.asarray(bins)
        if is_datetime64tz_dtype(bins):
            bins = np.asarray(bins, dtype=_NS_DTYPE)
        else:
            bins = np.asarray(bins)
        bins = _convert_bin_to_numeric_type(bins, dtype)
        if (np.diff(bins) < 0).any():
            raise ValueError('bins must increase monotonically.')
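
For reference, the new branch matters when pd.cut is handed timezone-aware bins; a hedged sketch of that case (hypothetical data, and that pd.cut accepts tz-aware datetime bins the same way it accepts tz-naive ones is an assumption):

```python
import pandas as pd

x = pd.Series(pd.date_range('2000-01-01', periods=4, tz='CET'))
bins = pd.date_range('1999-12-31', periods=3, freq='2D', tz='CET')

# The explicit 'M8[ns]' cast above keeps np.asarray(bins) from emitting
# the new FutureWarning while preserving the old numeric bin edges.
pd.cut(x, bins)
```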
18 changes: 16 additions & 2 deletions pandas/core/series.py
@@ -21,7 +21,8 @@
    is_extension_array_dtype, is_extension_type, is_hashable, is_integer,
    is_iterator, is_list_like, is_scalar, is_string_like, is_timedelta64_dtype)
from pandas.core.dtypes.generic import (
    ABCDataFrame, ABCDatetimeIndex, ABCSeries, ABCSparseArray, ABCSparseSeries)
    ABCDataFrame, ABCDatetimeArray, ABCDatetimeIndex, ABCSeries,
    ABCSparseArray, ABCSparseSeries)
from pandas.core.dtypes.missing import (
    isna, na_value_for_dtype, notna, remove_na_arraylike)

@@ -665,7 +666,20 @@ def __array__(self, result=None):
"""
The array interface, return my values.
"""
return self.get_values()
# TODO: change the keyword name from result to dtype?
if (result is None and isinstance(self.array, ABCDatetimeArray)
and getattr(self.dtype, 'tz', None)):
msg = (
"Converting timezone-aware DatetimeArray to timezone-naive "
"ndarray with 'datetime64[ns]' dtype. In the future, this "
"will return an ndarray with 'object' dtype where each "
"element is a 'pandas.Timestamp' with the correct 'tz'.\n\t"
"To accept the future behavior, pass 'dtype=object'.\n\t"
"To keep the old behavior, pass 'dtype=\"datetime64[ns]\"'."
)
warnings.warn(msg, FutureWarning, stacklevel=3)
result = 'M8[ns]'
return np.asarray(self.array, result)

def __array_wrap__(self, result, context=None):
"""
4 changes: 2 additions & 2 deletions pandas/tests/arrays/test_datetimelike.py
@@ -264,11 +264,11 @@ def test_array(self, tz_naive_fixture):
        arr = DatetimeArray(dti)

        expected = dti.asi8.view('M8[ns]')
        result = np.array(arr)
        result = np.array(arr, dtype='M8[ns]')
        tm.assert_numpy_array_equal(result, expected)

        # check that we are not making copies when setting copy=False
        result = np.array(arr, copy=False)
        result = np.array(arr, dtype='M8[ns]', copy=False)
        assert result.base is expected.base
        assert result.base is not None

33 changes: 33 additions & 0 deletions pandas/tests/arrays/test_datetimes.py
@@ -178,6 +178,39 @@ def test_fillna_preserves_tz(self, method):
        assert arr[2] is pd.NaT
        assert dti[2] == pd.Timestamp('2000-01-03', tz='US/Central')

    def test_array_interface_tz(self):
        tz = "US/Central"
        data = DatetimeArray(pd.date_range('2017', periods=2, tz=tz))
        result = np.asarray(data)

        expected = np.array([pd.Timestamp('2017-01-01T00:00:00', tz=tz),
                             pd.Timestamp('2017-01-02T00:00:00', tz=tz)],
                            dtype=object)
        tm.assert_numpy_array_equal(result, expected)

        result = np.asarray(data, dtype=object)
        tm.assert_numpy_array_equal(result, expected)

        result = np.asarray(data, dtype='M8[ns]')

        expected = np.array(['2017-01-01T06:00:00',
                             '2017-01-02T06:00:00'], dtype="M8[ns]")
        tm.assert_numpy_array_equal(result, expected)

    def test_array_interface(self):
        data = DatetimeArray(pd.date_range('2017', periods=2))
        expected = np.array(['2017-01-01T00:00:00', '2017-01-02T00:00:00'],
                            dtype='datetime64[ns]')

        result = np.asarray(data)
        tm.assert_numpy_array_equal(result, expected)

        result = np.asarray(data, dtype=object)
        expected = np.array([pd.Timestamp('2017-01-01T00:00:00'),
                             pd.Timestamp('2017-01-02T00:00:00')],
                            dtype=object)
        tm.assert_numpy_array_equal(result, expected)


class TestSequenceToDT64NS(object):
