From 125ca0b7cf5796b806e76c2c38d780e55ea12176 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 25 Apr 2018 08:37:07 -0500 Subject: [PATCH] Simplify Upcasting is still broken --- pandas/core/algorithms.py | 2 +- pandas/core/arrays/base.py | 119 +++++++++--------- pandas/core/dtypes/cast.py | 6 +- pandas/core/frame.py | 2 +- pandas/core/generic.py | 9 +- pandas/core/internals.py | 8 +- pandas/core/series.py | 7 +- pandas/tests/extension/base/getitem.py | 2 +- pandas/tests/extension/decimal/array.py | 1 + .../tests/extension/decimal/test_decimal.py | 3 +- pandas/tests/extension/json/array.py | 6 +- 11 files changed, 88 insertions(+), 77 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index d1fbaf91f3365..39342fc78db3b 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1510,7 +1510,7 @@ def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None, # TODO(EA): Remove these if / elifs as datetimeTZ, interval, become EAs # dispatch to internal type takes if is_extension_array_dtype(arr): - return arr.take(indexer, fill_value=fill_value) + return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill) elif is_datetimetz(arr): return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill) elif is_interval_dtype(arr): diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 1942a848c57bf..bd65483904ae4 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -5,71 +5,14 @@ This is an experimental API and subject to breaking changes without warning. """ -import textwrap - import numpy as np from pandas.errors import AbstractMethodError from pandas.compat.numpy import function as nv -from pandas.util._decorators import Appender, Substitution _not_implemented_message = "{} does not implement {}." -_take_docstring = textwrap.dedent("""\ -Take elements from an array. - -Parameters ----------- -%(arr)s\ -indexer : sequence of integers - Indices to be taken. See Notes for how negative indicies - are handled. -fill_value : any, optional - Fill value to use for NA-indicies. This has a few behaviors. - - * fill_value is not specified : triggers NumPy's semantics - where negative values in `indexer` mean slices from the end. - * fill_value is NA : Fill positions where `indexer` is ``-1`` - with ``self.dtype.na_value``. Anything considered NA by - :func:`pandas.isna` will result in ``self.dtype.na_value`` - being used to fill. - * fill_value is not NA : Fill positions where `indexer` is ``-1`` - with `fill_value`. - -Returns -------- -ExtensionArray - -Raises ------- -IndexError - When the indexer is out of bounds for the array. -ValueError - When the indexer contains negative values other than ``-1`` - and `fill_value` is specified. - -Notes ------ -The meaning of negative values in `indexer` depends on the -`fill_value` argument. By default, we follow the behavior -:meth:`numpy.take` of where negative indices indicate slices -from the end. - -When `fill_value` is specified, we follow pandas semantics of ``-1`` -indicating a missing value. In this case, positions where `indexer` -is ``-1`` will be filled with `fill_value` or the default NA value -for this type. - -ExtensionArray.take is called by ``Series.__getitem__``, ``.loc``, -``iloc``, when the indexer is a sequence of values. Additionally, -it's called by :meth:`Series.reindex` with a `fill_value`. - -See Also --------- -numpy.take""") - - class ExtensionArray(object): """Abstract base class for custom 1-D array types. @@ -532,15 +475,66 @@ def _values_for_take(self): """ return self.astype(object) - @Substitution(arr='') - @Appender(_take_docstring) def take(self, indexer, fill_value=None, allow_fill=None): - # type: (Sequence[int], Optional[Any]) -> ExtensionArray - # assert fill_value is not np.nan + # type: (Sequence[int], Optional[Any], Optional[bool]) -> ExtensionArray + """Take elements from an array. + + Parameters + ---------- + indexer : sequence of integers + Indices to be taken. See Notes for how negative indicies + are handled. + fill_value : any, optional + Fill value to use for NA-indicies. This has a few behaviors. + + * fill_value is not specified : triggers NumPy's semantics + where negative values in `indexer` mean slices from the end. + * fill_value is NA : Fill positions where `indexer` is ``-1`` + with ``self.dtype.na_value``. Anything considered NA by + :func:`pandas.isna` will result in ``self.dtype.na_value`` + being used to fill. + * fill_value is not NA : Fill positions where `indexer` is ``-1`` + with `fill_value`. + + Returns + ------- + ExtensionArray + + Raises + ------ + IndexError + When the indexer is out of bounds for the array. + ValueError + When the indexer contains negative values other than ``-1`` + and `fill_value` is specified. + + Notes + ----- + The meaning of negative values in `indexer` depends on the + `fill_value` argument. By default, we follow the behavior + :meth:`numpy.take` of where negative indices indicate slices + from the end. + + When `fill_value` is specified, we follow pandas semantics of ``-1`` + indicating a missing value. In this case, positions where `indexer` + is ``-1`` will be filled with `fill_value` or the default NA value + for this type. + + ExtensionArray.take is called by ``Series.__getitem__``, ``.loc``, + ``iloc``, when the indexer is a sequence of values. Additionally, + it's called by :meth:`Series.reindex` with a `fill_value`. + + See Also + -------- + numpy.take + """ from pandas.core.algorithms import take data = self._values_for_take() - result = take(data, indexer, fill_value=fill_value) + if allow_fill and fill_value is None: + fill_value = self.dtype.na_value + + result = take(data, indexer, fill_value=fill_value, allow_fill=allow_fill) return self._from_sequence(result) def copy(self, deep=False): @@ -605,4 +599,3 @@ def _ndarray_values(self): used for interacting with our indexers. """ return np.array(self) - diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 8deb777ca7d73..3d5c7fa6ab81e 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -256,7 +256,11 @@ def changeit(): def maybe_promote(dtype, fill_value=np.nan): # if we passed an array here, determine the fill value by dtype - if isinstance(fill_value, np.ndarray): + if is_extension_array_dtype(dtype): + # XXX: verify this change + fill_value = dtype.na_value + + elif isinstance(fill_value, np.ndarray): if issubclass(fill_value.dtype.type, (np.datetime64, np.timedelta64)): fill_value = iNaT else: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index de6985ef3b4ea..82d5a0286b117 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3476,7 +3476,7 @@ def _reindex_index(self, new_index, method, copy, level, fill_value=np.nan, allow_dups=False) def _reindex_columns(self, new_columns, method, copy, level, - fill_value=np.nan, limit=None, tolerance=None): + fill_value=None, limit=None, tolerance=None): new_columns, indexer = self.columns.reindex(new_columns, method=method, level=level, limit=limit, tolerance=tolerance) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 86342b6996abf..508fa14a325b3 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3660,7 +3660,7 @@ def reindex(self, *args, **kwargs): copy = kwargs.pop('copy', True) limit = kwargs.pop('limit', None) tolerance = kwargs.pop('tolerance', None) - fill_value = kwargs.pop('fill_value', np.nan) + fill_value = kwargs.pop('fill_value', None) # Series.reindex doesn't use / need the axis kwarg # We pop and ignore it here, to make writing Series/Frame generic code @@ -3776,7 +3776,7 @@ def _reindex_multi(self, axes, copy, fill_value): @Appender(_shared_docs['reindex_axis'] % _shared_doc_kwargs) def reindex_axis(self, labels, axis=0, method=None, level=None, copy=True, - limit=None, fill_value=np.nan): + limit=None, fill_value=None): msg = ("'.reindex_axis' is deprecated and will be removed in a future " "version. Use '.reindex' instead.") self._consolidate_inplace() @@ -3790,7 +3790,7 @@ def reindex_axis(self, labels, axis=0, method=None, level=None, copy=True, return self._reindex_with_indexers({axis: [new_index, indexer]}, fill_value=fill_value, copy=copy) - def _reindex_with_indexers(self, reindexers, fill_value=np.nan, copy=False, + def _reindex_with_indexers(self, reindexers, fill_value=None, copy=False, allow_dups=False): """allow_dups indicates an internal call here """ @@ -7252,7 +7252,7 @@ def align(self, other, join='outer', axis=None, level=None, copy=True, raise TypeError('unsupported type: %s' % type(other)) def _align_frame(self, other, join='outer', axis=None, level=None, - copy=True, fill_value=np.nan, method=None, limit=None, + copy=True, fill_value=None, method=None, limit=None, fill_axis=0): # defaults join_index, join_columns = None, None @@ -7420,6 +7420,7 @@ def _where(self, cond, other=np.nan, inplace=False, axis=None, level=None, if other.ndim <= self.ndim: _, other = self.align(other, join='left', axis=axis, + # XXX level=level, fill_value=np.nan) # if we are NOT aligned, raise as we cannot where index diff --git a/pandas/core/internals.py b/pandas/core/internals.py index e98899b2f5c1a..3e9219d4bc449 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -1888,6 +1888,11 @@ def _holder(self): # For extension blocks, the holder is values-dependent. return type(self.values) + @property + def fill_value(self): + # Used in reindex_indexer + return self.values.dtype.na_value + @property def _can_hold_na(self): # The default ExtensionArray._can_hold_na is True @@ -1951,7 +1956,8 @@ def take_nd(self, indexer, axis=0, new_mgr_locs=None, fill_tuple=None): # axis doesn't matter; we are really a single-dim object # but are passed the axis depending on the calling routing # if its REALLY axis 0, then this will be a reindex and not a take - new_values = self.values.take(indexer, fill_value=fill_value) + new_values = self.values.take(indexer, fill_value=fill_value, + allow_fill=True) # if we are a 1-dim object, then always place at 0 if self.ndim == 1: diff --git a/pandas/core/series.py b/pandas/core/series.py index f2ee225f50514..53ca38157dab2 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2185,7 +2185,7 @@ def _binop(self, other, func, level=None, fill_value=None): result.name = None return result - def combine(self, other, func, fill_value=np.nan): + def combine(self, other, func, fill_value=None): """ Perform elementwise binary operation on two Series using given function with optional fill value when an index is missing from one Series or @@ -3216,7 +3216,10 @@ def _reindex_indexer(self, new_index, indexer, copy): return self.copy() return self - new_values = algorithms.take_1d(self._values, indexer) + # TODO: determine if we want EA to handle fill_value=None + # if not, then we have to determine this here. + new_values = algorithms.take_1d(self._values, indexer, + fill_value=None, allow_fill=True) return self._constructor(new_values, index=new_index) def _needs_reindex_multi(self, axes, method, level): diff --git a/pandas/tests/extension/base/getitem.py b/pandas/tests/extension/base/getitem.py index 796879d658be8..e4ff919146a7c 100644 --- a/pandas/tests/extension/base/getitem.py +++ b/pandas/tests/extension/base/getitem.py @@ -155,7 +155,7 @@ def test_take_non_na_fill_value(self, data_missing): def test_take_pandas_style_negative_raises(self, data, na_value): with pytest.raises(ValueError): - data.take([0, -2], fill_value=na_value) + data.take([0, -2], fill_value=na_value, allow_fill=True) @pytest.mark.xfail(reason="Series.take with extension array buggy for -1") def test_take_series(self, data): diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py index 8aedf7d763833..c2375f57900aa 100644 --- a/pandas/tests/extension/decimal/array.py +++ b/pandas/tests/extension/decimal/array.py @@ -28,6 +28,7 @@ class DecimalArray(ExtensionArray): dtype = DecimalDtype() def __init__(self, values): + assert all(isinstance(v, decimal.Decimal) for v in values) values = np.asarray(values, dtype=object) self._data = values diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index 7ca92dd057a61..1cb6c3b7dfe44 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -119,7 +119,8 @@ def test_take_basic(self): decimal.Decimal('3')]) self.assert_extension_array_equal(result, expected) - result = ea.take([1, 2, -1], fill_value=ea.dtype.na_value) + result = ea.take([1, 2, -1], fill_value=ea.dtype.na_value, + allow_fill=True) expected = DecimalArray([decimal.Decimal('2'), decimal.Decimal('3'), decimal.Decimal('NaN')]) diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py index be0c904891a2c..c353cc9959765 100644 --- a/pandas/tests/extension/json/array.py +++ b/pandas/tests/extension/json/array.py @@ -94,7 +94,7 @@ def nbytes(self): def isna(self): return np.array([x == self.dtype.na_value for x in self.data]) - def take(self, indexer, fill_value=None): + def take(self, indexer, fill_value=None, allow_fill=None): # re-implement here, since NumPy has trouble setting # sized objects like UserDicts into scalar slots of # an ndarary. @@ -102,12 +102,14 @@ def take(self, indexer, fill_value=None): msg = ("Index is out of bounds or cannot do a " "non-empty take from an empty array.") - if fill_value is None: + if allow_fill is None: try: output = [self.data[loc] for loc in indexer] except IndexError: raise IndexError(msg) else: + if fill_value is None: + fill_value = self.dtype.na_value # bounds check if (indexer < -1).any(): raise ValueError