From e9190bf325311c717d4dfb977ccdf67fdece4db3 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 26 Apr 2018 07:18:14 -0500 Subject: [PATCH 1/2] TST: Fixed failures in JSON asserts (#20827) Fixes an occasional failure in the json tests. They'd fail when the Series held objects of equal length. --- pandas/tests/extension/json/array.py | 6 ++ pandas/tests/extension/json/test_json.py | 73 ++++++++++++++++++++---- 2 files changed, 68 insertions(+), 11 deletions(-) diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py index 95f868e89ac39..2e75bb3b8c326 100644 --- a/pandas/tests/extension/json/array.py +++ b/pandas/tests/extension/json/array.py @@ -105,6 +105,12 @@ def take(self, indexer, allow_fill=True, fill_value=None): def copy(self, deep=False): return type(self)(self.data[:]) + def astype(self, dtype, copy=True): + # NumPy has issues when all the dicts are the same length. + # np.array([UserDict(...), UserDict(...)]) fails, + # but np.array([{...}, {...}]) works, so cast. + return np.array([dict(x) for x in self], dtype=dtype, copy=copy) + def unique(self): # Parent method doesn't work since np.array will try to infer # a 2-dim object. diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py index dcf08440738e7..0ef34c3b0f679 100644 --- a/pandas/tests/extension/json/test_json.py +++ b/pandas/tests/extension/json/test_json.py @@ -1,8 +1,10 @@ import operator +import collections import pytest - +import pandas as pd +import pandas.util.testing as tm from pandas.compat import PY2, PY36 from pandas.tests.extension import base @@ -59,27 +61,76 @@ def data_for_grouping(): ]) -class TestDtype(base.BaseDtypeTests): +class BaseJSON(object): + # NumPy doesn't handle an array of equal-length UserDicts. + # The default assert_series_equal eventually does a + # Series.values, which raises. We work around it by + # converting the UserDicts to dicts. + def assert_series_equal(self, left, right, **kwargs): + if left.dtype.name == 'json': + assert left.dtype == right.dtype + left = pd.Series(JSONArray(left.values.astype(object)), + index=left.index, name=left.name) + right = pd.Series(JSONArray(right.values.astype(object)), + index=right.index, name=right.name) + tm.assert_series_equal(left, right, **kwargs) + + def assert_frame_equal(self, left, right, *args, **kwargs): + tm.assert_index_equal( + left.columns, right.columns, + exact=kwargs.get('check_column_type', 'equiv'), + check_names=kwargs.get('check_names', True), + check_exact=kwargs.get('check_exact', False), + check_categorical=kwargs.get('check_categorical', True), + obj='{obj}.columns'.format(obj=kwargs.get('obj', 'DataFrame'))) + + jsons = (left.dtypes == 'json').index + + for col in jsons: + self.assert_series_equal(left[col], right[col], + *args, **kwargs) + + left = left.drop(columns=jsons) + right = right.drop(columns=jsons) + tm.assert_frame_equal(left, right, *args, **kwargs) + + +class TestDtype(BaseJSON, base.BaseDtypeTests): pass -class TestInterface(base.BaseInterfaceTests): - pass +class TestInterface(BaseJSON, base.BaseInterfaceTests): + def test_custom_asserts(self): + # This would always trigger the KeyError from trying to put + # an array of equal-length UserDicts inside an ndarray. + data = JSONArray([collections.UserDict({'a': 1}), + collections.UserDict({'b': 2}), + collections.UserDict({'c': 3})]) + a = pd.Series(data) + self.assert_series_equal(a, a) + self.assert_frame_equal(a.to_frame(), a.to_frame()) + + b = pd.Series(data.take([0, 0, 1])) + with pytest.raises(AssertionError): + self.assert_series_equal(a, b) + + with pytest.raises(AssertionError): + self.assert_frame_equal(a.to_frame(), b.to_frame()) -class TestConstructors(base.BaseConstructorsTests): +class TestConstructors(BaseJSON, base.BaseConstructorsTests): pass -class TestReshaping(base.BaseReshapingTests): +class TestReshaping(BaseJSON, base.BaseReshapingTests): pass -class TestGetitem(base.BaseGetitemTests): +class TestGetitem(BaseJSON, base.BaseGetitemTests): pass -class TestMissing(base.BaseMissingTests): +class TestMissing(BaseJSON, base.BaseMissingTests): @pytest.mark.xfail(reason="Setting a dict as a scalar") def test_fillna_series(self): """We treat dictionaries as a mapping in fillna, not a scalar.""" @@ -94,7 +145,7 @@ def test_fillna_frame(self): reason="Dictionary order unstable") -class TestMethods(base.BaseMethodsTests): +class TestMethods(BaseJSON, base.BaseMethodsTests): @unhashable def test_value_counts(self, all_data, dropna): pass @@ -126,7 +177,7 @@ def test_sort_values_missing(self, data_missing_for_sorting, ascending): data_missing_for_sorting, ascending) -class TestCasting(base.BaseCastingTests): +class TestCasting(BaseJSON, base.BaseCastingTests): @pytest.mark.xfail def test_astype_str(self): """This currently fails in NumPy on np.array(self, dtype=str) with @@ -139,7 +190,7 @@ def test_astype_str(self): # internals has trouble setting sequences of values into scalar positions. -class TestGroupby(base.BaseGroupbyTests): +class TestGroupby(BaseJSON, base.BaseGroupbyTests): @unhashable def test_groupby_extension_transform(self): From 6cacdde5630c593999059833b516e1fec60aaf72 Mon Sep 17 00:00:00 2001 From: "Dr. Irv" Date: Thu, 26 Apr 2018 11:34:45 -0400 Subject: [PATCH 2/2] Change _can_hold_na to a class attribute and document that it shouldn't be changed (#20819) --- pandas/core/arrays/base.py | 26 ++++++++++-------------- pandas/tests/extension/base/interface.py | 3 ++- pandas/tests/extension/base/missing.py | 5 +---- pandas/tests/extension/conftest.py | 2 +- 4 files changed, 15 insertions(+), 21 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 9958be47267ee..f1a81b5eefddd 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -38,10 +38,9 @@ class ExtensionArray(object): * copy * _concat_same_type - Some additional methods are available to satisfy pandas' internal, private - block API: + An additional method is available to satisfy pandas' internal, + private block API. - * _can_hold_na * _formatting_values Some methods require casting the ExtensionArray to an ndarray of Python @@ -399,7 +398,8 @@ def _values_for_factorize(self): Returns ------- values : ndarray - An array suitable for factoraization. This should maintain order + + An array suitable for factorization. This should maintain order and be a supported dtype (Float64, Int64, UInt64, String, Object). By default, the extension array is cast to object dtype. na_value : object @@ -422,7 +422,7 @@ def factorize(self, na_sentinel=-1): Returns ------- labels : ndarray - An interger NumPy array that's an indexer into the original + An integer NumPy array that's an indexer into the original ExtensionArray. uniques : ExtensionArray An ExtensionArray containing the unique values of `self`. @@ -566,16 +566,12 @@ def _concat_same_type(cls, to_concat): """ raise AbstractMethodError(cls) - @property - def _can_hold_na(self): - # type: () -> bool - """Whether your array can hold missing values. True by default. - - Notes - ----- - Setting this to false will optimize some operations like fillna. - """ - return True + # The _can_hold_na attribute is set to True so that pandas internals + # will use the ExtensionDtype.na_value as the NA value in operations + # such as take(), reindex(), shift(), etc. In addition, those results + # will then be of the ExtensionArray subclass rather than an array + # of objects + _can_hold_na = True @property def _ndarray_values(self): diff --git a/pandas/tests/extension/base/interface.py b/pandas/tests/extension/base/interface.py index 9b60652fbace3..8ef8debbdc666 100644 --- a/pandas/tests/extension/base/interface.py +++ b/pandas/tests/extension/base/interface.py @@ -21,7 +21,8 @@ def test_ndim(self, data): assert data.ndim == 1 def test_can_hold_na_valid(self, data): - assert data._can_hold_na in {True, False} + # GH-20761 + assert data._can_hold_na is True def test_memory_usage(self, data): s = pd.Series(data) diff --git a/pandas/tests/extension/base/missing.py b/pandas/tests/extension/base/missing.py index f6cee9af0b722..32cf29818e069 100644 --- a/pandas/tests/extension/base/missing.py +++ b/pandas/tests/extension/base/missing.py @@ -9,10 +9,7 @@ class BaseMissingTests(BaseExtensionTests): def test_isna(self, data_missing): - if data_missing._can_hold_na: - expected = np.array([True, False]) - else: - expected = np.array([False, False]) + expected = np.array([True, False]) result = pd.isna(data_missing) tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/extension/conftest.py b/pandas/tests/extension/conftest.py index 4cb4ea21d9be3..bbd31c4071b91 100644 --- a/pandas/tests/extension/conftest.py +++ b/pandas/tests/extension/conftest.py @@ -57,7 +57,7 @@ def na_cmp(): Should return a function of two arguments that returns True if both arguments are (scalar) NA for your type. - By default, uses ``operator.or`` + By default, uses ``operator.is_`` """ return operator.is_