BUG/REF: TimedeltaIndex.__new__ (pandas-dev#23539)

Pingviinituutti · Feb 28, 2019 · 13a3054 · 13a3054
1 parent 3d6ab98
commit 13a3054
Show file tree

Hide file tree

Showing 12 changed files with 330 additions and 103 deletions.
diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
@@ -247,6 +247,7 @@ Backwards incompatible API changes
 
 - A newly constructed empty :class:`DataFrame` with integer as the ``dtype`` will now only be cast to ``float64`` if ``index`` is specified (:issue:`22858`)
 - :meth:`Series.str.cat` will now raise if `others` is a `set` (:issue:`23009`)
+- Passing scalar values to :class:`DatetimeIndex` or :class:`TimedeltaIndex` will now raise ``TypeError`` instead of ``ValueError`` (:issue:`23539`)
 
 .. _whatsnew_0240.api_breaking.deps:
 
@@ -969,6 +970,7 @@ Deprecations
 - The class ``FrozenNDArray`` has been deprecated. When unpickling, ``FrozenNDArray`` will be unpickled to ``np.ndarray`` once this class is removed (:issue:`9031`)
 - Deprecated the `nthreads` keyword of :func:`pandas.read_feather` in favor of
   `use_threads` to reflect the changes in pyarrow 0.11.0. (:issue:`23053`)
+- Constructing a :class:`TimedeltaIndex` from ``datetime64``-dtyped data is deprecated and will raise ``TypeError`` in a future version (:issue:`23539`)
 
 .. _whatsnew_0240.deprecations.datetimelike_int_ops:
 

diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py
@@ -234,9 +234,7 @@ def __new__(cls, values, freq=None, tz=None, dtype=None):
 
         result = cls._simple_new(values, freq=freq, tz=tz)
         if freq_infer:
-            inferred = result.inferred_freq
-            if inferred:
-                result.freq = to_offset(inferred)
+            result.freq = to_offset(result.inferred_freq)
 
         # NB: Among other things not yet ported from the DatetimeIndex
         # constructor, this does not call _deepcopy_if_needed

diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py
@@ -1,18 +1,28 @@
 # -*- coding: utf-8 -*-
 from datetime import timedelta
+import warnings
 
 import numpy as np
 
 from pandas._libs import tslibs
-from pandas._libs.tslibs import Timedelta, Timestamp, NaT
+from pandas._libs.tslibs import Timedelta, Timestamp, NaT, iNaT
 from pandas._libs.tslibs.fields import get_timedelta_field
-from pandas._libs.tslibs.timedeltas import array_to_timedelta64
+from pandas._libs.tslibs.timedeltas import (
+    array_to_timedelta64, parse_timedelta_unit)
 
 from pandas import compat
 
 from pandas.core.dtypes.common import (
-    _TD_DTYPE, is_list_like)
-from pandas.core.dtypes.generic import ABCSeries
+    _TD_DTYPE,
+    is_object_dtype,
+    is_string_dtype,
+    is_float_dtype,
+    is_integer_dtype,
+    is_timedelta64_dtype,
+    is_datetime64_dtype,
+    is_list_like,
+    ensure_int64)
+from pandas.core.dtypes.generic import ABCSeries, ABCTimedeltaIndex
 from pandas.core.dtypes.missing import isna
 
 import pandas.core.common as com
@@ -139,9 +149,7 @@ def __new__(cls, values, freq=None):
 
         result = cls._simple_new(values, freq=freq)
         if freq_infer:
-            inferred = result.inferred_freq
-            if inferred:
-                result.freq = to_offset(inferred)
+            result.freq = to_offset(result.inferred_freq)
 
         return result
 
@@ -397,6 +405,163 @@ def f(x):
 # ---------------------------------------------------------------------
 # Constructor Helpers
 
+def sequence_to_td64ns(data, copy=False, unit="ns", errors="raise"):
+    """
+    Convert list-like data to a timedelta64[ns] ndarray, also returning the
+    freq carried by the input when it is a TimedeltaIndex/TimedeltaArray.
+
+    Parameters
+    ----------
+    data : list-like
+        Anything without a ``dtype`` attribute is passed through
+        ``np.array``; zero-dimensional inputs (e.g. generators) are first
+        materialized with ``list``.
+    copy : bool, default False
+        Whether to copy the result.  Skipped when the conversion itself
+        already produced a new array.
+    unit : str, default "ns"
+        Unit that integer and float data are treated as multiples of; also
+        forwarded to the object/string conversion.  Passed through
+        ``parse_timedelta_unit`` before use.
+    errors : {"raise", "coerce", "ignore"}, default "raise"
+        Forwarded to ``objects_to_td64ns``; only consulted for object-dtyped
+        or string-dtyped data.
+
+    Returns
+    -------
+    converted : ndarray[timedelta64[ns]]
+    inferred_freq : Tick or None
+        ``data.freq`` when ``data`` is a TimedeltaIndex or
+        TimedeltaArrayMixin, otherwise None.
+
+    Raises
+    ------
+    ValueError : data cannot be converted to timedelta64[ns]
+    TypeError : data has a dtype that none of the branches below handles
+
+    Notes
+    -----
+    Unlike `pandas.to_timedelta`, setting `errors="ignore"` does not cause
+    errors to be ignored here; they are caught and subsequently ignored at
+    a higher level.
+    """
+    inferred_freq = None
+    unit = parse_timedelta_unit(unit)
+
+    # Unwrap whatever we have into a np.ndarray
+    if not hasattr(data, 'dtype'):
+        # e.g. list, tuple
+        if np.ndim(data) == 0:
+            # i.e. generator; materialize it so np.array sees the elements
+            data = list(data)
+        data = np.array(data, copy=False)
+    elif isinstance(data, ABCSeries):
+        data = data._values
+    elif isinstance(data, (ABCTimedeltaIndex, TimedeltaArrayMixin)):
+        # keep the freq the input already carries; it is returned alongside
+        # the converted values
+        inferred_freq = data.freq
+        data = data._data
+
+    # Convert whatever we have into timedelta64[ns] dtype
+    if is_object_dtype(data) or is_string_dtype(data):
+        # no need to make a copy, need to convert if string-dtyped
+        data = objects_to_td64ns(data, unit=unit, errors=errors)
+        copy = False
+
+    elif is_integer_dtype(data):
+        # treat as multiples of the given unit
+        data, copy_made = ints_to_td64ns(data, unit=unit)
+        # only copy below if the conversion did not already do so
+        copy = copy and not copy_made
+
+    elif is_float_dtype(data):
+        # treat as multiples of the given unit.  If after converting to nanos,
+        #  there are fractional components left, these are truncated
+        #  (i.e. NOT rounded)
+        # record the NaN positions before the integer cast discards them
+        mask = np.isnan(data)
+        # number of nanoseconds in one `unit`
+        coeff = np.timedelta64(1, unit) / np.timedelta64(1, 'ns')
+        data = (coeff * data).astype(np.int64).view('timedelta64[ns]')
+        # restore missing values at the positions that were NaN
+        data[mask] = iNaT
+        # the multiplication/astype above already produced a new array
+        copy = False
+
+    elif is_timedelta64_dtype(data):
+        # data that already has _TD_DTYPE passes through unchanged here; the
+        # requested copy (if any) is made by the final np.array call
+        if data.dtype != _TD_DTYPE:
+            # non-nano unit
+            # TODO: watch out for overflows
+            data = data.astype(_TD_DTYPE)
+            copy = False
+
+    elif is_datetime64_dtype(data):
+        # GH#23539
+        warnings.warn("Passing datetime64-dtype data to TimedeltaIndex is "
+                      "deprecated, will raise a TypeError in a future "
+                      "version",
+                      FutureWarning, stacklevel=3)
+        # NOTE(review): presumably reinterprets the underlying int64 values
+        #  as _TD_DTYPE without any unit conversion -- confirm against
+        #  ensure_int64
+        data = ensure_int64(data).view(_TD_DTYPE)
+
+    else:
+        raise TypeError("dtype {dtype} cannot be converted to timedelta64[ns]"
+                        .format(dtype=data.dtype))
+
+    # honor the (possibly downgraded) copy request in a single place
+    data = np.array(data, copy=copy)
+    # internal invariant: every branch above must have produced m8[ns]
+    assert data.dtype == 'm8[ns]', data
+    return data, inferred_freq
+
+
+def ints_to_td64ns(data, unit="ns"):
+    """
+    Convert an ndarray with integer-dtype to timedelta64[ns] dtype, treating
+    the integers as multiples of the given timedelta unit.
+
+    Parameters
+    ----------
+    data : np.ndarray with integer-dtype
+    unit : str or None, default "ns"
+        None is treated the same as "ns".
+
+    Returns
+    -------
+    converted : ndarray[timedelta64[ns]]
+    copy_made : bool
+        Whether a copy was made during the conversion, so the caller can
+        avoid copying a second time.
+    """
+    copy_made = False
+    # fall back to nanoseconds when no unit is given
+    unit = unit if unit is not None else "ns"
+
+    if data.dtype != np.int64:
+        # converting to int64 makes a copy, so we can avoid
+        # re-copying later
+        data = data.astype(np.int64)
+        copy_made = True
+
+    if unit != "ns":
+        # reinterpret the int64 values as timedeltas in the requested unit ...
+        dtype_str = "timedelta64[{unit}]".format(unit=unit)
+        data = data.view(dtype_str)
+
+        # ... then convert those to nanosecond resolution
+        # TODO: watch out for overflows when converting from lower-resolution
+        data = data.astype("timedelta64[ns]")
+        # the astype conversion makes a copy, so we can avoid re-copying later
+        copy_made = True
+
+    else:
+        # already nanoseconds: a view suffices, so copy_made is left as set
+        # by the int64 conversion above
+        data = data.view("timedelta64[ns]")
+
+    return data, copy_made
+
+
+def objects_to_td64ns(data, unit="ns", errors="raise"):
+    """
+    Convert an object-dtyped or string-dtyped array into a
+    timedelta64[ns]-dtyped array.
+
+    Parameters
+    ----------
+    data : ndarray or Index
+    unit : str, default "ns"
+        Forwarded to ``array_to_timedelta64``.
+    errors : {"raise", "coerce", "ignore"}, default "raise"
+        Forwarded to ``array_to_timedelta64``.
+
+    Returns
+    -------
+    ndarray[timedelta64[ns]]
+
+    Raises
+    ------
+    ValueError : data cannot be converted to timedelta64[ns]
+
+    Notes
+    -----
+    Unlike `pandas.to_timedelta`, setting `errors="ignore"` does not cause
+    errors to be ignored here; they are caught and subsequently ignored at
+    a higher level.
+    """
+    # coerce Index to np.ndarray, converting string-dtype if necessary
+    values = np.array(data, dtype=np.object_, copy=False)
+
+    result = array_to_timedelta64(values,
+                                  unit=unit, errors=errors)
+    # expose the converted values with timedelta64[ns] dtype
+    return result.view('timedelta64[ns]')
+
+
 def _generate_regular_range(start, end, periods, offset):
     stride = offset.nanos
     if periods is None:

diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py
@@ -239,6 +239,21 @@ def __new__(cls, data=None,
                 dayfirst=False, yearfirst=False, dtype=None,
                 copy=False, name=None, verify_integrity=True):
 
+        if data is None:
+            # TODO: Remove this block and associated kwargs; GH#20535
+            result = cls._generate_range(start, end, periods,
+                                         freq=freq, tz=tz, normalize=normalize,
+                                         closed=closed, ambiguous=ambiguous)
+            result.name = name
+            return result
+
+        if is_scalar(data):
+            raise TypeError("{cls}() must be called with a "
+                            "collection of some kind, {data} was passed"
+                            .format(cls=cls.__name__, data=repr(data)))
+
+        # - Cases checked above all return/raise before reaching here - #
+
         # This allows to later ensure that the 'copy' parameter is honored:
         if isinstance(data, Index):
             ref_to_data = data._data
@@ -253,20 +268,8 @@ def __new__(cls, data=None,
         # if dtype has an embedded tz, capture it
         tz = dtl.validate_tz_from_dtype(dtype, tz)
 
-        if data is None:
-            # TODO: Remove this block and associated kwargs; GH#20535
-            result = cls._generate_range(start, end, periods,
-                                         freq=freq, tz=tz, normalize=normalize,
-                                         closed=closed, ambiguous=ambiguous)
-            result.name = name
-            return result
-
         if not isinstance(data, (np.ndarray, Index, ABCSeries,
                                  DatetimeArrayMixin)):
-            if is_scalar(data):
-                raise ValueError('DatetimeIndex() must be called with a '
-                                 'collection of some kind, %s was passed'
-                                 % repr(data))
             # other iterable of some kind
             if not isinstance(data, (list, tuple)):
                 data = list(data)
@@ -328,9 +331,7 @@ def __new__(cls, data=None,
                 cls._validate_frequency(subarr, freq, ambiguous=ambiguous)
 
         if freq_infer:
-            inferred = subarr.inferred_freq
-            if inferred:
-                subarr.freq = to_offset(inferred)
+            subarr.freq = to_offset(subarr.inferred_freq)
 
         return subarr._deepcopy_if_needed(ref_to_data, copy)
 

diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py
@@ -15,7 +15,8 @@
 from pandas.core.dtypes.missing import isna
 
 from pandas.core.arrays.timedeltas import (
-    TimedeltaArrayMixin, _is_convertible_to_td, _to_m8)
+    TimedeltaArrayMixin, _is_convertible_to_td, _to_m8,
+    sequence_to_td64ns)
 from pandas.core.arrays import datetimelike as dtl
 
 from pandas.core.indexes.base import Index
@@ -33,10 +34,9 @@
     TimelikeOps, DatetimeIndexOpsMixin, wrap_arithmetic_op,
     wrap_array_method, wrap_field_accessor)
 from pandas.core.tools.timedeltas import (
-    to_timedelta, _coerce_scalar_to_timedelta_type)
+    _coerce_scalar_to_timedelta_type)
 from pandas._libs import (lib, index as libindex,
                           join as libjoin, Timedelta, NaT)
-from pandas._libs.tslibs.timedeltas import array_to_timedelta64
 
 
 class TimedeltaIndex(TimedeltaArrayMixin, DatetimeIndexOpsMixin,
@@ -139,12 +139,6 @@ def __new__(cls, data=None, unit=None, freq=None, start=None, end=None,
                 periods=None, closed=None, dtype=None, copy=False,
                 name=None, verify_integrity=True):
 
-        if isinstance(data, TimedeltaIndex) and freq is None and name is None:
-            if copy:
-                return data.copy()
-            else:
-                return data._shallow_copy()
-
         freq, freq_infer = dtl.maybe_infer_freq(freq)
 
         if data is None:
@@ -154,32 +148,31 @@ def __new__(cls, data=None, unit=None, freq=None, start=None, end=None,
             result.name = name
             return result
 
-        if unit is not None:
-            data = to_timedelta(data, unit=unit, box=False)
-
         if is_scalar(data):
-            raise ValueError('TimedeltaIndex() must be called with a '
-                             'collection of some kind, {data} was passed'
-                             .format(data=repr(data)))
-
-        # convert if not already
-        if getattr(data, 'dtype', None) != _TD_DTYPE:
-            data = to_timedelta(data, unit=unit, box=False)
-        elif copy:
-            data = np.array(data, copy=True)
-
-        data = np.array(data, copy=False)
-        if data.dtype == np.object_:
-            data = array_to_timedelta64(data)
-        if data.dtype != _TD_DTYPE:
-            if is_timedelta64_dtype(data):
-                # non-nano unit
-                # TODO: watch out for overflows
-                data = data.astype(_TD_DTYPE)
+            raise TypeError('{cls}() must be called with a '
+                            'collection of some kind, {data} was passed'
+                            .format(cls=cls.__name__, data=repr(data)))
+
+        if isinstance(data, TimedeltaIndex) and freq is None and name is None:
+            if copy:
+                return data.copy()
             else:
-                data = ensure_int64(data).view(_TD_DTYPE)
+                return data._shallow_copy()
 
-        assert data.dtype == 'm8[ns]', data.dtype
+        # - Cases checked above all return/raise before reaching here - #
+
+        data, inferred_freq = sequence_to_td64ns(data, copy=copy, unit=unit)
+        if inferred_freq is not None:
+            if freq is not None and freq != inferred_freq:
+                raise ValueError('Inferred frequency {inferred} from passed '
+                                 'values does not conform to passed frequency '
+                                 '{passed}'
+                                 .format(inferred=inferred_freq,
+                                         passed=freq.freqstr))
+            elif freq_infer:
+                freq = inferred_freq
+                freq_infer = False
+            verify_integrity = False
 
         subarr = cls._simple_new(data, name=name, freq=freq)
         # check that we are matching freqs
@@ -188,9 +181,7 @@ def __new__(cls, data=None, unit=None, freq=None, start=None, end=None,
                 cls._validate_frequency(subarr, freq)
 
         if freq_infer:
-            inferred = subarr.inferred_freq
-            if inferred:
-                subarr.freq = to_offset(inferred)
+            subarr.freq = to_offset(subarr.inferred_freq)
 
         return subarr