Skip to content

Commit

Permalink
Updates
Browse files Browse the repository at this point in the history
* indexer -> indices
* doc user-facing vs physical
* assert na_cmps
* test reindex w/ non-NA fill_value
  • Loading branch information
TomAugspurger committed Apr 26, 2018
1 parent 741f284 commit fbc4425
Show file tree
Hide file tree
Showing 7 changed files with 76 additions and 39 deletions.
32 changes: 17 additions & 15 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -1448,8 +1448,9 @@ def func(arr, indexer, out, fill_value=np.nan):
return func


def take(arr, indexer, allow_fill=False, fill_value=None):
"""Take elements from an array.
def take(arr, indices, allow_fill=False, fill_value=None):
"""
Take elements from an array.
.. versionadded:: 0.23.0
Expand All @@ -1458,22 +1459,23 @@ def take(arr, indexer, allow_fill=False, fill_value=None):
arr : sequence
Non array-likes (sequences without a dtype) are coerced
to an ndarray.
indexer : sequence of integers
indices : sequence of integers
Indices to be taken.
allow_fill : bool, default False
How to handle negative values in `indexer`.
How to handle negative values in `indices`.
* False: negative values in `indexer` indicate
slices from the right (the default)
* False: negative values in `indices` indicate indexing from
the right (the default). This is similar to :func:`numpy.take`.
* True: negative values in `indexer` indicate
* True: negative values in `indices` indicate
missing values. These values are set to `fill_value`. Any other
negative values raise a ``ValueError``.
fill_value : any, optional
Fill value to use for NA-indices when `allow_fill` is True.
This may be ``None``, in which case the default NA value for
the type, ``self.dtype.na_value``, is used.
the type is used. For ndarrays, :attr:`numpy.nan` is used. For
ExtensionArrays, a different value may be used.
Returns
-------
Expand All @@ -1483,17 +1485,17 @@ def take(arr, indexer, allow_fill=False, fill_value=None):
Raises
------
IndexError
When the indexer is out of bounds for the array.
When `indices` is out of bounds for the array.
ValueError
When `indices` contains negative values other than ``-1``
and `allow_fill` is True.
Notes
-----
When `allow_fill` is False, `indexer` may be whatever dimensionality
When `allow_fill` is False, `indices` may be whatever dimensionality
is accepted by NumPy for `arr`.
When `allow_fill` is True, `indexer` should be 1-D.
When `allow_fill` is True, `indices` should be 1-D.
See Also
--------
Expand Down Expand Up @@ -1524,15 +1526,15 @@ def take(arr, indexer, allow_fill=False, fill_value=None):
arr = np.asarray(arr)

# Do we require int64 or intp here?
indexer = np.asarray(indexer, dtype='int')
indices = np.asarray(indices, dtype='int')

if allow_fill:
# Pandas style, -1 means NA
validate_indices(indexer, len(arr))
result = take_1d(arr, indexer, allow_fill=True, fill_value=fill_value)
validate_indices(indices, len(arr))
result = take_1d(arr, indices, allow_fill=True, fill_value=fill_value)
else:
# NumPy style
result = arr.take(indexer)
result = arr.take(indices)
return result


Expand Down
29 changes: 19 additions & 10 deletions pandas/core/arrays/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -463,45 +463,51 @@ def factorize(self, na_sentinel=-1):
# Indexing methods
# ------------------------------------------------------------------------

def take(self, indexer, allow_fill=False, fill_value=None):
def take(self, indices, allow_fill=False, fill_value=None):
# type: (Sequence[int], bool, Optional[Any]) -> ExtensionArray
"""Take elements from an array.
Parameters
----------
indexer : sequence of integers
indices : sequence of integers
Indices to be taken. See Notes for how negative indices
are handled.
allow_fill : bool, default False
How to handle negative values in `indexer`.
How to handle negative values in `indices`.
For False values (the default), negative values in `indexer`
For False values (the default), negative values in `indices`
indicate slices from the right.
For True values, indices where `indexer` is ``-1`` indicate
For True values, indices where `indices` is ``-1`` indicate
missing values. These values are set to `fill_value`. Any other
negative value should raise a ``ValueError``.
fill_value : any, optional
Fill value to use for NA-indices when `allow_fill` is True.
This may be ``None``, in which case the default NA value for
the type, ``self.dtype.na_value``, is used.
For many ExtensionArrays, there will be two representations of
`fill_value`: a user-facing "boxed" scalar, and a low-level
physical NA value. `fill_value` should be the user-facing version,
and the implementation should handle translating that to the
physical version for processing the take if necessary.
Returns
-------
ExtensionArray
Raises
------
IndexError
When the indexer is out of bounds for the array.
When the indices are out of bounds for the array.
ValueError
When the indexer contains negative values other than ``-1``
When `indices` contains negative values other than ``-1``
and `allow_fill` is True.
Notes
-----
ExtensionArray.take is called by ``Series.__getitem__``, ``.loc``,
``iloc``, when the indexer is a sequence of values. Additionally,
``iloc``, when `indices` is a sequence of values. Additionally,
it's called by :meth:`Series.reindex`, or any other method
that causes realignment, with a `fill_value`.
Expand All @@ -518,14 +524,17 @@ def take(self, indexer, allow_fill=False, fill_value=None):
.. code-block:: python
def take(self, indexer, allow_fill=False, fill_value=None):
def take(self, indices, allow_fill=False, fill_value=None):
from pandas.core.algorithms import take
# If the ExtensionArray is backed by an ndarray, then
# just pass that here instead of coercing to object.
data = self.astype(object)
if allow_fill and fill_value is None:
fill_value = self.dtype.na_value
result = take(data, indexer, fill_value=fill_value,
result = take(data, indices, fill_value=fill_value,
allow_fill=allow_fill)
return self._from_sequence(result)
"""
Expand Down
3 changes: 2 additions & 1 deletion pandas/core/dtypes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@ class _DtypeOpsMixin(object):
# class's methods can be moved to ExtensionDtype and removed.

# na_value is the default NA value to use for this type. This is used in
# e.g. ExtensionArray.take.
# e.g. ExtensionArray.take. This should be the user-facing "boxed" version
# of the NA value, not the physical NA value for storage.
na_value = np.nan

def __eq__(self, other):
Expand Down
3 changes: 0 additions & 3 deletions pandas/core/internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -5405,9 +5405,6 @@ def concatenate_block_managers(mgrs_indexers, axes, concat_axis, copy):

for placement, join_units in concat_plan:

# The issue: we have a join unit (or maybe several) that needs to be
# reindexed.

if len(join_units) == 1 and not join_units[0].indexers:
b = join_units[0].block
values = b.values
Expand Down
20 changes: 17 additions & 3 deletions pandas/tests/extension/base/getitem.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,11 @@ def test_take(self, data, na_value, na_cmp):
result = data.take([0, -1])
assert result.dtype == data.dtype
assert result[0] == data[0]
na_cmp(result[1], na_value)
assert result[1] == data[-1]

result = data.take([0, -1], allow_fill=True, fill_value=na_value)
assert result[0] == data[0]
assert na_cmp(result[1], na_value)

with tm.assert_raises_regex(IndexError, "out of bounds"):
data.take([len(data) + 1])
Expand All @@ -136,7 +140,7 @@ def test_take_empty(self, data, na_value, na_cmp):
empty = data[:0]

result = empty.take([-1], allow_fill=True)
na_cmp(result[0], na_value)
assert na_cmp(result[0], na_value)

with pytest.raises(IndexError):
empty.take([-1])
Expand Down Expand Up @@ -170,7 +174,6 @@ def test_take_out_of_bounds_raises(self, data, allow_fill):
with pytest.raises(IndexError):
arr.take(np.asarray([0, 3]), allow_fill=allow_fill)

@pytest.mark.xfail(reason="Series.take with extension array buggy for -1")
def test_take_series(self, data):
s = pd.Series(data)
result = s.take([0, -1])
Expand All @@ -196,3 +199,14 @@ def test_reindex(self, data, na_value):
expected = pd.Series(data._from_sequence([na_value, na_value]),
index=[n, n + 1])
self.assert_series_equal(result, expected)

def test_reindex_non_na_fill_value(self, data_missing):
    """Reindex with a non-NA ``fill_value`` and check the new label gets it.

    A two-element Series is built from the first two elements of
    ``data_missing`` (element 0 is used as the missing scalar and element 1
    as the valid scalar), then reindexed onto ``[0, 1, 2]`` with the valid
    scalar as ``fill_value``. The extra label ``2`` is expected to hold
    that valid scalar rather than an NA.
    """
    valid_scalar, na_scalar = data_missing[1], data_missing[0]

    source = pd.Series(
        data_missing._from_sequence([na_scalar, valid_scalar]))
    reindexed = source.reindex([0, 1, 2], fill_value=valid_scalar)

    wanted_values = data_missing._from_sequence(
        [na_scalar, valid_scalar, valid_scalar])
    wanted = pd.Series(wanted_values)

    self.assert_series_equal(reindexed, wanted)
10 changes: 9 additions & 1 deletion pandas/tests/extension/decimal/test_decimal.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,15 @@ class TestReshaping(BaseDecimal, base.BaseReshapingTests):


class TestGetitem(BaseDecimal, base.BaseGetitemTests):
pass

def test_take_na_value_other_decimal(self):
    """``take`` with ``allow_fill=True`` accepts a non-NA Decimal fill value.

    Position ``-1`` is treated as "missing" when ``allow_fill`` is True, so
    the second element of the result is expected to be the supplied
    ``fill_value`` (``Decimal('-1.0')``), not the last array element.
    """
    fill = decimal.Decimal('-1.0')
    source = DecimalArray([decimal.Decimal('1.0'), decimal.Decimal('2.0')])

    taken = source.take([0, -1], allow_fill=True, fill_value=fill)

    wanted = DecimalArray([decimal.Decimal('1.0'), fill])
    self.assert_extension_array_equal(taken, wanted)


class TestMissing(BaseDecimal, base.BaseMissingTests):
Expand Down
18 changes: 12 additions & 6 deletions pandas/tests/extension/json/array.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,15 @@
"""Test extension array for storing nested data in a pandas container.
The JSONArray stores lists of dictionaries. The storage mechanism is a list,
not an ndarray.
Note:
We currently store lists of UserDicts (Py3 only). Pandas has a few places
internally that specifically check for dicts, and does non-scalar things
in that case. We *want* the dictionaries to be treated as scalars, so we
hack around pandas by using UserDicts.
"""
import collections
import itertools
import numbers
Expand Down Expand Up @@ -125,12 +137,6 @@ def take(self, indexer, allow_fill=False, fill_value=None):

return self._from_sequence(output)

# def astype(self, dtype, copy=True):
# # NumPy has issues when all the dicts are the same length.
# # np.array([UserDict(...), UserDict(...)]) fails,
# # but np.array([{...}, {...}]) works, so cast.
# return np.array([dict(x) for x in self], dtype=dtype, copy=copy)

def copy(self, deep=False):
    """Return a new instance of the same type wrapping a copy of ``data``.

    ``self.data`` is duplicated with a full slice, so the new instance gets
    its own container while the contained elements are shared. The `deep`
    argument is accepted but not consulted.
    """
    duplicated = self.data[:]
    cls = type(self)
    return cls(duplicated)

Expand Down

0 comments on commit fbc4425

Please sign in to comment.