From 43989fd7cb9917e885c2b55a172c4f9f3838d59d Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 10 May 2016 10:53:37 -0400 Subject: [PATCH 01/14] DOC: xref #13112, add back lexsorting example --- doc/source/advanced.rst | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/doc/source/advanced.rst b/doc/source/advanced.rst index 7c7895a95310d..e50e792201d26 100644 --- a/doc/source/advanced.rst +++ b/doc/source/advanced.rst @@ -528,6 +528,13 @@ return a copy of the data rather than a view: jim joe 1 z 0.64094 +Furthermore if you try to index something that is not fully lexsorted, this can raise: + +.. code-block:: ipython + + In [5]: dfm.loc[(0,'y'):(1, 'z')] + KeyError: 'Key length (2) was greater than MultiIndex lexsort depth (1)' + The ``is_lexsorted()`` method on an ``Index`` show if the index is sorted, and the ``lexsort_depth`` property returns the sort depth: .. ipython:: python @@ -542,6 +549,12 @@ The ``is_lexsorted()`` method on an ``Index`` show if the index is sorted, and t dfm.index.is_lexsorted() dfm.index.lexsort_depth +And now selection works as expected. + +.. ipython:: python + + dfm.loc[(0,'y'):(1, 'z')] + Take Methods ------------ From f0e47a9c9350e0d8fc0fe00a1ca0237582437e9d Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 11 May 2016 09:32:25 -0400 Subject: [PATCH 02/14] COMPAT: boto import issues Author: Jeff Reback Closes #13136 from jreback/boto and squashes the following commits: dcb02d2 [Jeff Reback] COMPAT: boto import issues --- pandas/io/common.py | 109 ++-------------------------------- pandas/io/s3.py | 112 +++++++++++++++++++++++++++++++++++ pandas/io/tests/test_data.py | 3 - pandas/io/tests/test_s3.py | 14 +++++ 4 files changed, 130 insertions(+), 108 deletions(-) create mode 100644 pandas/io/s3.py create mode 100644 pandas/io/tests/test_s3.py diff --git a/pandas/io/common.py b/pandas/io/common.py index dc7c483c1fb68..cf4bba6e97afb 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -104,85 +104,6 @@ def __next__(self): BaseIterator.next = lambda self: self.__next__() -try: - from boto.s3 import key - - class BotoFileLikeReader(key.Key): - """boto Key modified to be more file-like - - This modification of the boto Key will read through a supplied - S3 key once, then stop. The unmodified boto Key object will repeatedly - cycle through a file in S3: after reaching the end of the file, - boto will close the file. Then the next call to `read` or `next` will - re-open the file and start reading from the beginning. - - Also adds a `readline` function which will split the returned - values by the `\n` character. - """ - - def __init__(self, *args, **kwargs): - encoding = kwargs.pop("encoding", None) # Python 2 compat - super(BotoFileLikeReader, self).__init__(*args, **kwargs) - # Add a flag to mark the end of the read. 
- self.finished_read = False - self.buffer = "" - self.lines = [] - if encoding is None and compat.PY3: - encoding = "utf-8" - self.encoding = encoding - self.lines = [] - - def next(self): - return self.readline() - - __next__ = next - - def read(self, *args, **kwargs): - if self.finished_read: - return b'' if compat.PY3 else '' - return super(BotoFileLikeReader, self).read(*args, **kwargs) - - def close(self, *args, **kwargs): - self.finished_read = True - return super(BotoFileLikeReader, self).close(*args, **kwargs) - - def seekable(self): - """Needed for reading by bz2""" - return False - - def readline(self): - """Split the contents of the Key by '\n' characters.""" - if self.lines: - retval = self.lines[0] - self.lines = self.lines[1:] - return retval - if self.finished_read: - if self.buffer: - retval, self.buffer = self.buffer, "" - return retval - else: - raise StopIteration - - if self.encoding: - self.buffer = "{}{}".format( - self.buffer, self.read(8192).decode(self.encoding)) - else: - self.buffer = "{}{}".format(self.buffer, self.read(8192)) - - split_buffer = self.buffer.split("\n") - self.lines.extend(split_buffer[:-1]) - self.buffer = split_buffer[-1] - - return self.readline() -except ImportError: - # boto is only needed for reading from S3. - pass -except TypeError: - # boto/boto3 issues - # GH11915 - pass - - def _is_url(url): """Check to see if a URL has a valid protocol. @@ -319,32 +240,10 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None, return tuple(to_return) if _is_s3_url(filepath_or_buffer): - try: - import boto - except: - raise ImportError("boto is required to handle s3 files") - # Assuming AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY and AWS_S3_HOST - # are environment variables - parsed_url = parse_url(filepath_or_buffer) - s3_host = os.environ.get('AWS_S3_HOST', 's3.amazonaws.com') - - try: - conn = boto.connect_s3(host=s3_host) - except boto.exception.NoAuthHandlerFound: - conn = boto.connect_s3(host=s3_host, anon=True) - - b = conn.get_bucket(parsed_url.netloc, validate=False) - if compat.PY2 and (compression == 'gzip' or - (compression == 'infer' and - filepath_or_buffer.endswith(".gz"))): - k = boto.s3.key.Key(b, parsed_url.path) - filepath_or_buffer = BytesIO(k.get_contents_as_string( - encoding=encoding)) - else: - k = BotoFileLikeReader(b, parsed_url.path, encoding=encoding) - k.open('r') # Expose read errors immediately - filepath_or_buffer = k - return filepath_or_buffer, None, compression + from pandas.io.s3 import get_filepath_or_buffer + return get_filepath_or_buffer(filepath_or_buffer, + encoding=encoding, + compression=compression) # It is a pathlib.Path/py.path.local or string filepath_or_buffer = _stringify_path(filepath_or_buffer) diff --git a/pandas/io/s3.py b/pandas/io/s3.py new file mode 100644 index 0000000000000..df8f1d9187031 --- /dev/null +++ b/pandas/io/s3.py @@ -0,0 +1,112 @@ +""" s3 support for remote file interactivity """ + +import os +from pandas import compat +from pandas.compat import BytesIO + +try: + import boto + from boto.s3 import key +except: + raise ImportError("boto is required to handle s3 files") + +if compat.PY3: + from urllib.parse import urlparse as parse_url +else: + from urlparse import urlparse as parse_url + + +class BotoFileLikeReader(key.Key): + """boto Key modified to be more file-like + + This modification of the boto Key will read through a supplied + S3 key once, then stop. 
The unmodified boto Key object will repeatedly + cycle through a file in S3: after reaching the end of the file, + boto will close the file. Then the next call to `read` or `next` will + re-open the file and start reading from the beginning. + + Also adds a `readline` function which will split the returned + values by the `\n` character. + """ + + def __init__(self, *args, **kwargs): + encoding = kwargs.pop("encoding", None) # Python 2 compat + super(BotoFileLikeReader, self).__init__(*args, **kwargs) + # Add a flag to mark the end of the read. + self.finished_read = False + self.buffer = "" + self.lines = [] + if encoding is None and compat.PY3: + encoding = "utf-8" + self.encoding = encoding + self.lines = [] + + def next(self): + return self.readline() + + __next__ = next + + def read(self, *args, **kwargs): + if self.finished_read: + return b'' if compat.PY3 else '' + return super(BotoFileLikeReader, self).read(*args, **kwargs) + + def close(self, *args, **kwargs): + self.finished_read = True + return super(BotoFileLikeReader, self).close(*args, **kwargs) + + def seekable(self): + """Needed for reading by bz2""" + return False + + def readline(self): + """Split the contents of the Key by '\n' characters.""" + if self.lines: + retval = self.lines[0] + self.lines = self.lines[1:] + return retval + if self.finished_read: + if self.buffer: + retval, self.buffer = self.buffer, "" + return retval + else: + raise StopIteration + + if self.encoding: + self.buffer = "{}{}".format( + self.buffer, self.read(8192).decode(self.encoding)) + else: + self.buffer = "{}{}".format(self.buffer, self.read(8192)) + + split_buffer = self.buffer.split("\n") + self.lines.extend(split_buffer[:-1]) + self.buffer = split_buffer[-1] + + return self.readline() + + +def get_filepath_or_buffer(filepath_or_buffer, encoding=None, + compression=None): + + # Assuming AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY and AWS_S3_HOST + # are environment variables + parsed_url = parse_url(filepath_or_buffer) + s3_host = os.environ.get('AWS_S3_HOST', 's3.amazonaws.com') + + try: + conn = boto.connect_s3(host=s3_host) + except boto.exception.NoAuthHandlerFound: + conn = boto.connect_s3(host=s3_host, anon=True) + + b = conn.get_bucket(parsed_url.netloc, validate=False) + if compat.PY2 and (compression == 'gzip' or + (compression == 'infer' and + filepath_or_buffer.endswith(".gz"))): + k = boto.s3.key.Key(b, parsed_url.path) + filepath_or_buffer = BytesIO(k.get_contents_as_string( + encoding=encoding)) + else: + k = BotoFileLikeReader(b, parsed_url.path, encoding=encoding) + k.open('r') # Expose read errors immediately + filepath_or_buffer = k + return filepath_or_buffer, None, compression diff --git a/pandas/io/tests/test_data.py b/pandas/io/tests/test_data.py index d9c09fa788332..6845eb009df5d 100644 --- a/pandas/io/tests/test_data.py +++ b/pandas/io/tests/test_data.py @@ -472,9 +472,6 @@ def test_options_source_warning(self): class TestDataReader(tm.TestCase): - def test_is_s3_url(self): - from pandas.io.common import _is_s3_url - self.assertTrue(_is_s3_url("s3://pandas/somethingelse.com")) @network def test_read_yahoo(self): diff --git a/pandas/io/tests/test_s3.py b/pandas/io/tests/test_s3.py new file mode 100644 index 0000000000000..8058698a906ea --- /dev/null +++ b/pandas/io/tests/test_s3.py @@ -0,0 +1,14 @@ +import nose +from pandas.util import testing as tm + +from pandas.io.common import _is_s3_url + + +class TestS3URL(tm.TestCase): + def test_is_s3_url(self): + self.assertTrue(_is_s3_url("s3://pandas/somethingelse.com")) + 
self.assertFalse(_is_s3_url("s4://pandas/somethingelse.com")) + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) From d0734ba4d0f4c228110dc3974943ce4ec2adeea4 Mon Sep 17 00:00:00 2001 From: Yadunandan Date: Wed, 11 May 2016 18:13:30 -0400 Subject: [PATCH 03/14] BUG: Added checks for NaN in __call__ of EngFormatter closes #11981 Author: Yadunandan Closes #13124 from yaduart/bugfix-11981 and squashes the following commits: 8de1f64 [Yadunandan] BUG: Added checks for Nan in __call__ of EngFormatter --- doc/source/whatsnew/v0.18.2.txt | 1 + pandas/formats/format.py | 3 +++ pandas/tests/formats/test_format.py | 19 +++++++++++++++++-- 3 files changed, 21 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index fa426aa30bc65..5ffbce9867121 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -132,3 +132,4 @@ Bug Fixes - Bug in ``NaT`` - ``Period`` raises ``AttributeError`` (:issue:`13071`) - Bug in ``Period`` addition raises ``TypeError`` if ``Period`` is on right hand side (:issue:`13069`) +- Bug in ``pd.set_eng_float_format()`` that would prevent NaN's from formatting (:issue:`11981`) diff --git a/pandas/formats/format.py b/pandas/formats/format.py index c3ffc018d1031..70b506a1415c1 100644 --- a/pandas/formats/format.py +++ b/pandas/formats/format.py @@ -2590,6 +2590,9 @@ def __call__(self, num): import math dnum = decimal.Decimal(str(num)) + if decimal.Decimal.is_nan(dnum): + return 'NaN' + sign = 1 if dnum < 0: # pragma: no cover diff --git a/pandas/tests/formats/test_format.py b/pandas/tests/formats/test_format.py index 4fcee32c46067..96770a86ff383 100644 --- a/pandas/tests/formats/test_format.py +++ b/pandas/tests/formats/test_format.py @@ -3087,11 +3087,11 @@ def test_to_csv_doublequote(self): def test_to_csv_escapechar(self): df = DataFrame({'col': ['a"a', '"bb"']}) - expected = """\ + expected = '''\ "","col" "0","a\\"a" "1","\\"bb\\"" -""" +''' with tm.ensure_clean('test.csv') as path: # QUOTE_ALL df.to_csv(path, quoting=1, doublequote=False, escapechar='\\') @@ -3925,6 +3925,21 @@ def test_rounding(self): result = formatter(0) self.assertEqual(result, u(' 0.000')) + def test_nan(self): + # Issue #11981 + + formatter = fmt.EngFormatter(accuracy=1, use_eng_prefix=True) + result = formatter(np.nan) + self.assertEqual(result, u('NaN')) + + df = pd.DataFrame({'a':[1.5, 10.3, 20.5], + 'b':[50.3, 60.67, 70.12], + 'c':[100.2, 101.33, 120.33]}) + pt = df.pivot_table(values='a', index='b', columns='c') + fmt.set_eng_float_format(accuracy=1) + result = pt.to_string() + self.assertTrue('NaN' in result) + self.reset_display_options() def _three_digit_exp(): return '%.4g' % 1.7e8 == '1.7e+008' From 2a99394bf96415a5b525e6db206a04d3d2ff68c3 Mon Sep 17 00:00:00 2001 From: sinhrks Date: Wed, 11 May 2016 18:41:01 -0400 Subject: [PATCH 04/14] TST: fix assert_categorical_equal message stage 1 of #13076 Author: sinhrks Closes #13080 from sinhrks/test_categorical_message and squashes the following commits: 81172ce [sinhrks] TST: fix assert_categorical_equal message --- pandas/core/categorical.py | 2 +- pandas/tests/series/test_datetime_values.py | 2 - pandas/tests/test_testing.py | 72 ++++++++++++++++----- pandas/util/testing.py | 19 +++--- 4 files changed, 65 insertions(+), 30 deletions(-) diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index 4f80c610c1126..44c91862227d8 100644 --- a/pandas/core/categorical.py +++ 
b/pandas/core/categorical.py @@ -985,7 +985,7 @@ def __setstate__(self, state): # Provide compatibility with pre-0.15.0 Categoricals. if '_codes' not in state and 'labels' in state: - state['_codes'] = state.pop('labels') + state['_codes'] = state.pop('labels').astype(np.int8) if '_categories' not in state and '_levels' in state: state['_categories'] = self._validate_categories(state.pop( '_levels')) diff --git a/pandas/tests/series/test_datetime_values.py b/pandas/tests/series/test_datetime_values.py index 5b12baf6c6fc5..6e82f81f901a9 100644 --- a/pandas/tests/series/test_datetime_values.py +++ b/pandas/tests/series/test_datetime_values.py @@ -320,8 +320,6 @@ def test_strftime(self): expected = np.array(['2015/03/01', '2015/03/02', '2015/03/03', '2015/03/04', '2015/03/05'], dtype=np.object_) # dtype may be S10 or U10 depending on python version - print(result) - print(expected) self.assert_numpy_array_equal(result, expected, check_dtype=False) period_index = period_range('20150301', periods=5) diff --git a/pandas/tests/test_testing.py b/pandas/tests/test_testing.py index 9294bccce013f..357d53cb58c72 100644 --- a/pandas/tests/test_testing.py +++ b/pandas/tests/test_testing.py @@ -65,9 +65,8 @@ def test_assert_almost_equal_dicts(self): self._assert_almost_equal_both({'a': 1, 'b': 2}, {'a': 1, 'b': 2}) self._assert_not_almost_equal_both({'a': 1, 'b': 2}, {'a': 1, 'b': 3}) - self._assert_not_almost_equal_both( - {'a': 1, 'b': 2}, {'a': 1, 'b': 2, 'c': 3} - ) + self._assert_not_almost_equal_both({'a': 1, 'b': 2}, + {'a': 1, 'b': 2, 'c': 3}) self._assert_not_almost_equal_both({'a': 1}, 1) self._assert_not_almost_equal_both({'a': 1}, 'abc') self._assert_not_almost_equal_both({'a': 1}, [1, ]) @@ -215,11 +214,11 @@ def test_numpy_array_equal_message(self): \\[right\\]: \\[1\\.0, nan, 3\\.0\\]""" with assertRaisesRegexp(AssertionError, expected): - assert_numpy_array_equal( - np.array([np.nan, 2, 3]), np.array([1, np.nan, 3])) + assert_numpy_array_equal(np.array([np.nan, 2, 3]), + np.array([1, np.nan, 3])) with assertRaisesRegexp(AssertionError, expected): - assert_almost_equal( - np.array([np.nan, 2, 3]), np.array([1, np.nan, 3])) + assert_almost_equal(np.array([np.nan, 2, 3]), + np.array([1, np.nan, 3])) expected = """numpy array are different @@ -339,8 +338,8 @@ def test_index_equal_message(self): labels=\\[\\[0, 0, 1, 1\\], \\[0, 1, 2, 3\\]\\]\\)""" idx1 = pd.Index([1, 2, 3]) - idx2 = pd.MultiIndex.from_tuples([('A', 1), ('A', 2), ('B', 3), ('B', 4 - )]) + idx2 = pd.MultiIndex.from_tuples([('A', 1), ('A', 2), + ('B', 3), ('B', 4)]) with assertRaisesRegexp(AssertionError, expected): assert_index_equal(idx1, idx2, exact=False) @@ -350,10 +349,10 @@ def test_index_equal_message(self): \\[left\\]: Int64Index\\(\\[2, 2, 3, 4\\], dtype='int64'\\) \\[right\\]: Int64Index\\(\\[1, 2, 3, 4\\], dtype='int64'\\)""" - idx1 = pd.MultiIndex.from_tuples([('A', 2), ('A', 2), ('B', 3), ('B', 4 - )]) - idx2 = pd.MultiIndex.from_tuples([('A', 1), ('A', 2), ('B', 3), ('B', 4 - )]) + idx1 = pd.MultiIndex.from_tuples([('A', 2), ('A', 2), + ('B', 3), ('B', 4)]) + idx2 = pd.MultiIndex.from_tuples([('A', 1), ('A', 2), + ('B', 3), ('B', 4)]) with assertRaisesRegexp(AssertionError, expected): assert_index_equal(idx1, idx2) with assertRaisesRegexp(AssertionError, expected): @@ -434,10 +433,10 @@ def test_index_equal_message(self): \\[left\\]: Int64Index\\(\\[2, 2, 3, 4\\], dtype='int64'\\) \\[right\\]: Int64Index\\(\\[1, 2, 3, 4\\], dtype='int64'\\)""" - idx1 = pd.MultiIndex.from_tuples([('A', 2), ('A', 2), ('B', 3), ('B', 
4 - )]) - idx2 = pd.MultiIndex.from_tuples([('A', 1), ('A', 2), ('B', 3), ('B', 4 - )]) + idx1 = pd.MultiIndex.from_tuples([('A', 2), ('A', 2), + ('B', 3), ('B', 4)]) + idx2 = pd.MultiIndex.from_tuples([('A', 1), ('A', 2), + ('B', 3), ('B', 4)]) with assertRaisesRegexp(AssertionError, expected): assert_index_equal(idx1, idx2) with assertRaisesRegexp(AssertionError, expected): @@ -674,6 +673,45 @@ def test_notisinstance(self): tm.assertNotIsInstance(pd.Series([1]), pd.Series) +class TestAssertCategoricalEqual(unittest.TestCase): + _multiprocess_can_split_ = True + + def test_categorical_equal_message(self): + + expected = """Categorical\\.categories are different + +Categorical\\.categories values are different \\(25\\.0 %\\) +\\[left\\]: Int64Index\\(\\[1, 2, 3, 4\\], dtype='int64'\\) +\\[right\\]: Int64Index\\(\\[1, 2, 3, 5\\], dtype='int64'\\)""" + + a = pd.Categorical([1, 2, 3, 4]) + b = pd.Categorical([1, 2, 3, 5]) + with assertRaisesRegexp(AssertionError, expected): + tm.assert_categorical_equal(a, b) + + expected = """Categorical\\.codes are different + +Categorical\\.codes values are different \\(50\\.0 %\\) +\\[left\\]: \\[0, 1, 3, 2\\] +\\[right\\]: \\[0, 1, 2, 3\\]""" + + a = pd.Categorical([1, 2, 4, 3], categories=[1, 2, 3, 4]) + b = pd.Categorical([1, 2, 3, 4], categories=[1, 2, 3, 4]) + with assertRaisesRegexp(AssertionError, expected): + tm.assert_categorical_equal(a, b) + + expected = """Categorical are different + +Attribute "ordered" are different +\\[left\\]: False +\\[right\\]: True""" + + a = pd.Categorical([1, 2, 3, 4], ordered=False) + b = pd.Categorical([1, 2, 3, 4], ordered=True) + with assertRaisesRegexp(AssertionError, expected): + tm.assert_categorical_equal(a, b) + + class TestRNGContext(unittest.TestCase): def test_RNGContext(self): diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 3ea4a09c453ee..8682302b542be 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -903,18 +903,17 @@ def assertNotIsInstance(obj, cls, msg=''): raise AssertionError(err_msg.format(msg, cls)) -def assert_categorical_equal(res, exp): - assertIsInstance(res, pd.Categorical, '[Categorical] ') - assertIsInstance(exp, pd.Categorical, '[Categorical] ') +def assert_categorical_equal(left, right, check_dtype=True, + obj='Categorical'): + assertIsInstance(left, pd.Categorical, '[Categorical] ') + assertIsInstance(right, pd.Categorical, '[Categorical] ') - assert_index_equal(res.categories, exp.categories) + assert_index_equal(left.categories, right.categories, + obj='{0}.categories'.format(obj)) + assert_numpy_array_equal(left.codes, right.codes, check_dtype=check_dtype, + obj='{0}.codes'.format(obj)) - if not array_equivalent(res.codes, exp.codes): - raise AssertionError( - 'codes not equivalent: {0} vs {1}.'.format(res.codes, exp.codes)) - - if res.ordered != exp.ordered: - raise AssertionError("ordered not the same") + assert_attr_equal('ordered', left, right, obj=obj) def raise_assert_detail(obj, message, left, right): From 4aa6323e7d72fe00417d8aab783a5f78cf497018 Mon Sep 17 00:00:00 2001 From: sinhrks Date: Tue, 3 May 2016 16:30:40 +0900 Subject: [PATCH 05/14] BUG: Series ops with object dtype may incorrectly fail closes #13043 closes #13072 --- doc/source/whatsnew/v0.18.2.txt | 14 ++++++ pandas/core/ops.py | 24 ++++++++-- pandas/tseries/tests/test_period.py | 62 +++++++++++++++++++++++++ pandas/tseries/tests/test_timedeltas.py | 32 +++++++++++++ 4 files changed, 128 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v0.18.2.txt 
b/doc/source/whatsnew/v0.18.2.txt index 5ffbce9867121..34bd2956319fc 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -128,8 +128,22 @@ Bug Fixes +- Bug in ``Series`` arithmetic raises ``TypeError`` if it contains datetime-like as ``object`` dtype (:issue:`13043`) + - Bug in ``NaT`` - ``Period`` raises ``AttributeError`` (:issue:`13071`) - Bug in ``Period`` addition raises ``TypeError`` if ``Period`` is on right hand side (:issue:`13069`) - Bug in ``pd.set_eng_float_format()`` that would prevent NaN's from formatting (:issue:`11981`) + + + + + + + + + + + + diff --git a/pandas/core/ops.py b/pandas/core/ops.py index 63fea71895da2..b02f94cc92e22 100644 --- a/pandas/core/ops.py +++ b/pandas/core/ops.py @@ -19,6 +19,7 @@ from pandas.tslib import iNaT from pandas.compat import bind_method import pandas.core.missing as missing +import pandas.algos as _algos import pandas.core.algorithms as algos from pandas.core.common import (is_list_like, notnull, isnull, _values_from_object, _maybe_match_name, @@ -600,6 +601,21 @@ def na_op(x, y): result = missing.fill_zeros(result, x, y, name, fill_zeros) return result + def safe_na_op(lvalues, rvalues): + try: + return na_op(lvalues, rvalues) + except Exception: + if isinstance(rvalues, ABCSeries): + if is_object_dtype(rvalues): + # if dtype is object, try elementwise op + return _algos.arrmap_object(rvalues, + lambda x: op(lvalues, x)) + else: + if is_object_dtype(lvalues): + return _algos.arrmap_object(lvalues, + lambda x: op(x, rvalues)) + raise + def wrapper(left, right, name=name, na_op=na_op): if isinstance(right, pd.DataFrame): @@ -638,9 +654,8 @@ def wrapper(left, right, name=name, na_op=na_op): if ridx is not None: rvalues = algos.take_1d(rvalues, ridx) - arr = na_op(lvalues, rvalues) - - return left._constructor(wrap_results(arr), index=index, + result = wrap_results(safe_na_op(lvalues, rvalues)) + return left._constructor(result, index=index, name=name, dtype=dtype) else: # scalars @@ -648,7 +663,8 @@ def wrapper(left, right, name=name, na_op=na_op): not isinstance(lvalues, pd.DatetimeIndex)): lvalues = lvalues.values - return left._constructor(wrap_results(na_op(lvalues, rvalues)), + result = wrap_results(safe_na_op(lvalues, rvalues)) + return left._constructor(result, index=left.index, name=left.name, dtype=dtype) diff --git a/pandas/tseries/tests/test_period.py b/pandas/tseries/tests/test_period.py index 740a158c52f87..4217cc9a299a3 100644 --- a/pandas/tseries/tests/test_period.py +++ b/pandas/tseries/tests/test_period.py @@ -4151,6 +4151,68 @@ def test_intercept_astype_object(self): result = df.values.squeeze() self.assertTrue((result[:, 0] == expected.values).all()) + def test_ops_series_timedelta(self): + # GH 13043 + s = pd.Series([pd.Period('2015-01-01', freq='D'), + pd.Period('2015-01-02', freq='D')], name='xxx') + self.assertEqual(s.dtype, object) + + exp = pd.Series([pd.Period('2015-01-02', freq='D'), + pd.Period('2015-01-03', freq='D')], name='xxx') + tm.assert_series_equal(s + pd.Timedelta('1 days'), exp) + tm.assert_series_equal(pd.Timedelta('1 days') + s, exp) + + tm.assert_series_equal(s + pd.tseries.offsets.Day(), exp) + tm.assert_series_equal(pd.tseries.offsets.Day() + s, exp) + + def test_ops_series_period(self): + # GH 13043 + s = pd.Series([pd.Period('2015-01-01', freq='D'), + pd.Period('2015-01-02', freq='D')], name='xxx') + self.assertEqual(s.dtype, object) + + p = pd.Period('2015-01-10', freq='D') + # dtype will be object because of original dtype + exp = pd.Series([9, 8], name='xxx', 
dtype=object) + tm.assert_series_equal(p - s, exp) + tm.assert_series_equal(s - p, -exp) + + s2 = pd.Series([pd.Period('2015-01-05', freq='D'), + pd.Period('2015-01-04', freq='D')], name='xxx') + self.assertEqual(s2.dtype, object) + + exp = pd.Series([4, 2], name='xxx', dtype=object) + tm.assert_series_equal(s2 - s, exp) + tm.assert_series_equal(s - s2, -exp) + + def test_ops_frame_period(self): + # GH 13043 + df = pd.DataFrame({'A': [pd.Period('2015-01', freq='M'), + pd.Period('2015-02', freq='M')], + 'B': [pd.Period('2014-01', freq='M'), + pd.Period('2014-02', freq='M')]}) + self.assertEqual(df['A'].dtype, object) + self.assertEqual(df['B'].dtype, object) + + p = pd.Period('2015-03', freq='M') + # dtype will be object because of original dtype + exp = pd.DataFrame({'A': np.array([2, 1], dtype=object), + 'B': np.array([14, 13], dtype=object)}) + tm.assert_frame_equal(p - df, exp) + tm.assert_frame_equal(df - p, -exp) + + df2 = pd.DataFrame({'A': [pd.Period('2015-05', freq='M'), + pd.Period('2015-06', freq='M')], + 'B': [pd.Period('2015-05', freq='M'), + pd.Period('2015-06', freq='M')]}) + self.assertEqual(df2['A'].dtype, object) + self.assertEqual(df2['B'].dtype, object) + + exp = pd.DataFrame({'A': np.array([4, 4], dtype=object), + 'B': np.array([16, 16], dtype=object)}) + tm.assert_frame_equal(df2 - df, exp) + tm.assert_frame_equal(df - df2, -exp) + if __name__ == '__main__': import nose diff --git a/pandas/tseries/tests/test_timedeltas.py b/pandas/tseries/tests/test_timedeltas.py index c764f34b697c1..8474bbbc91931 100644 --- a/pandas/tseries/tests/test_timedeltas.py +++ b/pandas/tseries/tests/test_timedeltas.py @@ -413,6 +413,38 @@ def test_ops_series(self): tm.assert_series_equal(expected, td * other) tm.assert_series_equal(expected, other * td) + def test_ops_series_object(self): + # GH 13043 + s = pd.Series([pd.Timestamp('2015-01-01', tz='US/Eastern'), + pd.Timestamp('2015-01-01', tz='Asia/Tokyo')], + name='xxx') + self.assertEqual(s.dtype, object) + + exp = pd.Series([pd.Timestamp('2015-01-02', tz='US/Eastern'), + pd.Timestamp('2015-01-02', tz='Asia/Tokyo')], + name='xxx') + tm.assert_series_equal(s + pd.Timedelta('1 days'), exp) + tm.assert_series_equal(pd.Timedelta('1 days') + s, exp) + + # object series & object series + s2 = pd.Series([pd.Timestamp('2015-01-03', tz='US/Eastern'), + pd.Timestamp('2015-01-05', tz='Asia/Tokyo')], + name='xxx') + self.assertEqual(s2.dtype, object) + exp = pd.Series([pd.Timedelta('2 days'), pd.Timedelta('4 days')], + name='xxx') + tm.assert_series_equal(s2 - s, exp) + tm.assert_series_equal(s - s2, -exp) + + s = pd.Series([pd.Timedelta('01:00:00'), pd.Timedelta('02:00:00')], + name='xxx', dtype=object) + self.assertEqual(s.dtype, object) + + exp = pd.Series([pd.Timedelta('01:30:00'), pd.Timedelta('02:30:00')], + name='xxx') + tm.assert_series_equal(s + pd.Timedelta('00:30:00'), exp) + tm.assert_series_equal(pd.Timedelta('00:30:00') + s, exp) + def test_compare_timedelta_series(self): # regresssion test for GH5963 s = pd.Series([timedelta(days=1), timedelta(days=2)]) From 4de83d25d751d8ca102867b2d46a5547c01d7248 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 12 May 2016 09:08:51 -0400 Subject: [PATCH 06/14] PERF: quantile now operates per block boosting perf / fix quantile with nan closes #11623 closes #13098 Author: Jeff Reback Closes #13122 from jreback/quantile and squashes the following commits: aad72cb [Jeff Reback] PERF: quantile now operates per block boosting perf REGR: series quantile with nan --- asv_bench/benchmarks/frame_methods.py 
| 13 +- codecov.yml | 3 +- doc/source/whatsnew/v0.18.1.txt | 1 - doc/source/whatsnew/v0.18.2.txt | 16 +- pandas/core/frame.py | 34 ++-- pandas/core/internals.py | 273 ++++++++++++++++++++++---- pandas/core/series.py | 21 +- pandas/io/pytables.py | 10 +- pandas/src/inference.pyx | 27 ++- pandas/tests/frame/test_quantile.py | 55 ++++-- pandas/tests/series/test_quantile.py | 8 + pandas/tests/test_groupby.py | 6 +- 12 files changed, 352 insertions(+), 115 deletions(-) diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py index 9367c42f8d39a..5c5a1df4ea1f8 100644 --- a/asv_bench/benchmarks/frame_methods.py +++ b/asv_bench/benchmarks/frame_methods.py @@ -423,7 +423,7 @@ class frame_get_dtype_counts(object): goal_time = 0.2 def setup(self): - self.df = pandas.DataFrame(np.random.randn(10, 10000)) + self.df = DataFrame(np.random.randn(10, 10000)) def time_frame_get_dtype_counts(self): self.df.get_dtype_counts() @@ -985,3 +985,14 @@ def setup(self): def time_series_string_vector_slice(self): self.s.str[:5] + + +class frame_quantile_axis1(object): + goal_time = 0.2 + + def setup(self): + self.df = DataFrame(np.random.randn(1000, 3), + columns=list('ABC')) + + def time_frame_quantile_axis1(self): + self.df.quantile([0.1, 0.5], axis=1) diff --git a/codecov.yml b/codecov.yml index edf2d821e07e5..86e7dd55c9550 100644 --- a/codecov.yml +++ b/codecov.yml @@ -9,4 +9,5 @@ coverage: branches: null changes: default: - branches: null + branches: + - master diff --git a/doc/source/whatsnew/v0.18.1.txt b/doc/source/whatsnew/v0.18.1.txt index 7f837bef5251c..51982c42499ff 100644 --- a/doc/source/whatsnew/v0.18.1.txt +++ b/doc/source/whatsnew/v0.18.1.txt @@ -563,7 +563,6 @@ Performance Improvements - Improved speed of SAS reader (:issue:`12656`, :issue:`12961`) - Performance improvements in ``.groupby(..).cumcount()`` (:issue:`11039`) - Improved memory usage in ``pd.read_csv()`` when using ``skiprows=an_integer`` (:issue:`13005`) - - Improved performance of ``DataFrame.to_sql`` when checking case sensitivity for tables. Now only checks if table has been created correctly when table name is not lower case. (:issue:`12876`) - Improved performance of ``Period`` construction and time series plotting (:issue:`12903`, :issue:`11831`). 
- Improved performance of ``.str.encode()`` and ``.str.decode()`` methods (:issue:`13008`) diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index 34bd2956319fc..85209c0dfa03d 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -97,6 +97,9 @@ Performance Improvements - Improved performance of sparse ``IntIndex.intersect`` (:issue:`13082`) - Improved performance of sparse arithmetic with ``BlockIndex`` when the number of blocks are large, though recommended to use ``IntIndex`` in such cases (:issue:`13082`) +- increased performance of ``DataFrame.quantile()`` as it now operates per-block (:issue:`11623`) + + @@ -110,6 +113,7 @@ Bug Fixes +- Regression in ``Series.quantile`` with nans (also shows up in ``.median()`` and ``.describe()``); furthermore now names the ``Series`` with the quantile (:issue:`13098`, :issue:`13146`) @@ -135,15 +139,3 @@ Bug Fixes - Bug in ``NaT`` - ``Period`` raises ``AttributeError`` (:issue:`13071`) - Bug in ``Period`` addition raises ``TypeError`` if ``Period`` is on right hand side (:issue:`13069`) - Bug in ``pd.set_eng_float_format()`` that would prevent NaN's from formatting (:issue:`11981`) - - - - - - - - - - - - diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b209b6d6ec543..3bf442349ef04 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4989,31 +4989,27 @@ def quantile(self, q=0.5, axis=0, numeric_only=True, 0.5 2.5 55.0 """ self._check_percentile(q) - if not com.is_list_like(q): - q = [q] - squeeze = True - else: - squeeze = False data = self._get_numeric_data() if numeric_only else self axis = self._get_axis_number(axis) + is_transposed = axis == 1 - def _quantile(series): - res = series.quantile(q, interpolation=interpolation) - return series.name, res - - if axis == 1: + if is_transposed: data = data.T - # unable to use DataFrame.apply, becasuse data may be empty - result = dict(_quantile(s) for (_, s) in data.iteritems()) - result = self._constructor(result, columns=data.columns) - if squeeze: - if result.shape == (1, 1): - result = result.T.iloc[:, 0] # don't want scalar - else: - result = result.T.squeeze() - result.name = None # For groupby, so it can set an index name + result = data._data.quantile(qs=q, + axis=1, + interpolation=interpolation, + transposed=is_transposed) + + if result.ndim == 2: + result = self._constructor(result) + else: + result = self._constructor_sliced(result, name=q) + + if is_transposed: + result = result.T + return result def to_timestamp(self, freq=None, how='start', axis=0, copy=True): diff --git a/pandas/core/internals.py b/pandas/core/internals.py index abfc5c989056e..97df81ad6be48 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -40,7 +40,7 @@ from pandas.util.decorators import cache_readonly from pandas.tslib import Timedelta -from pandas import compat +from pandas import compat, _np_version_under1p9 from pandas.compat import range, map, zip, u from pandas.lib import BlockPlacement @@ -84,7 +84,7 @@ def __init__(self, values, placement, ndim=None, fastpath=False): self.mgr_locs = placement self.values = values - if len(self.mgr_locs) != len(self.values): + if ndim and len(self.mgr_locs) != len(self.values): raise ValueError('Wrong number of items passed %d, placement ' 'implies %d' % (len(self.values), len(self.mgr_locs))) @@ -180,6 +180,12 @@ def make_block(self, values, placement=None, ndim=None, **kwargs): return make_block(values, placement=placement, ndim=ndim, **kwargs) + def 
make_block_scalar(self, values, **kwargs): + """ + Create a ScalarBlock + """ + return ScalarBlock(values) + def make_block_same_class(self, values, placement=None, fastpath=True, **kwargs): """ Wrap given values in a block of same type as self. """ @@ -324,7 +330,8 @@ def apply(self, func, mgr=None, **kwargs): """ result = func(self.values, **kwargs) if not isinstance(result, Block): - result = self.make_block(values=_block_shape(result)) + result = self.make_block(values=_block_shape(result, + ndim=self.ndim)) return result @@ -1260,32 +1267,117 @@ def equals(self, other): return False return array_equivalent(self.values, other.values) - def quantile(self, qs, mgr=None, **kwargs): + def quantile(self, qs, interpolation='linear', axis=0, mgr=None): """ compute the quantiles of the Parameters ---------- - qs : a scalar or list of the quantiles to be computed + qs: a scalar or list of the quantiles to be computed + interpolation: type of interpolation, default 'linear' + axis: axis to compute, default 0 + + Returns + ------- + tuple of (axis, block) + """ + if _np_version_under1p9: + if interpolation != 'linear': + raise ValueError("Interpolation methods other than linear " + "are not supported in numpy < 1.9.") + + kw = {} + if not _np_version_under1p9: + kw.update({'interpolation': interpolation}) values = self.get_values() - values, mask, _, _ = self._try_coerce_args(values, values) + values, _, _, _ = self._try_coerce_args(values, values) + mask = isnull(self.values) if not lib.isscalar(mask) and mask.any(): - values = values[~mask] - if len(values) == 0: - if com.is_list_like(qs): - result = np.array([self.fill_value]) + # even though this could be a 2-d mask it appears + # as a 1-d result + mask = mask.reshape(values.shape) + result_shape = tuple([values.shape[0]] + [-1] * (self.ndim - 1)) + values = _block_shape(values[~mask], ndim=self.ndim) + if self.ndim > 1: + values = values.reshape(result_shape) + + from pandas import Float64Index + is_empty = values.shape[axis] == 0 + if com.is_list_like(qs): + ax = Float64Index(qs) + + if is_empty: + if self.ndim == 1: + result = self._na_value + else: + # create the array of na_values + # 2d len(values) * len(qs) + result = np.repeat(np.array([self._na_value] * len(qs)), + len(values)).reshape(len(values), + len(qs)) else: - result = self._na_value - elif com.is_list_like(qs): - values = [_quantile(values, x * 100, **kwargs) for x in qs] - result = np.array(values) + + try: + result = _quantile(values, np.array(qs) * 100, + axis=axis, **kw) + except ValueError: + + # older numpies don't handle an array for q + result = [_quantile(values, q * 100, + axis=axis, **kw) for q in qs] + + result = np.array(result, copy=False) + if self.ndim > 1: + result = result.T + else: - result = _quantile(values, qs * 100, **kwargs) - return self._try_coerce_result(result) + if self.ndim == 1: + ax = Float64Index([qs]) + else: + ax = mgr.axes[0] + + if is_empty: + if self.ndim == 1: + result = self._na_value + else: + result = np.array([self._na_value] * len(self)) + else: + result = _quantile(values, qs * 100, axis=axis, **kw) + + ndim = getattr(result, 'ndim', None) or 0 + result = self._try_coerce_result(result) + if lib.isscalar(result): + return ax, self.make_block_scalar(result) + return ax, make_block(result, + placement=np.arange(len(result)), + ndim=ndim) + + +class ScalarBlock(Block): + """ + a scalar compat Block + """ + __slots__ = ['_mgr_locs', 'values', 'ndim'] + + def __init__(self, values): + self.ndim = 0 + self.mgr_locs = [0] + self.values = 
values + + @property + def dtype(self): + return type(self.values) + + @property + def shape(self): + return tuple([0]) + + def __len__(self): + return 0 class NonConsolidatableMixIn(object): @@ -1378,6 +1470,8 @@ def putmask(self, mask, new, align=True, inplace=False, axis=0, if isinstance(new, np.ndarray) and len(new) == len(mask): new = new[mask] + + mask = mask.reshape(new_values.shape) new_values[mask] = new new_values = self._try_coerce_result(new_values) return [self.make_block(values=new_values)] @@ -1676,6 +1770,7 @@ def convert(self, *args, **kwargs): can return multiple blocks! """ + if args: raise NotImplementedError by_item = True if 'by_item' not in kwargs else kwargs['by_item'] @@ -1706,8 +1801,13 @@ def convert(self, *args, **kwargs): for i, rl in enumerate(self.mgr_locs): values = self.iget(i) - values = fn(values.ravel(), **fn_kwargs).reshape(values.shape) - values = _block_shape(values, ndim=self.ndim) + shape = values.shape + values = fn(values.ravel(), **fn_kwargs) + try: + values = values.reshape(shape) + values = _block_shape(values, ndim=self.ndim) + except AttributeError: + pass newb = make_block(values, ndim=self.ndim, placement=[rl]) blocks.append(newb) @@ -2115,7 +2215,10 @@ def _try_coerce_result(self, result): """ reverse of try_coerce_args """ if isinstance(result, np.ndarray): if result.dtype.kind in ['i', 'f', 'O']: - result = result.astype('M8[ns]') + try: + result = result.astype('M8[ns]') + except ValueError: + pass elif isinstance(result, (np.integer, np.float, np.datetime64)): result = self._box_func(result) return result @@ -2219,11 +2322,6 @@ def to_object_block(self, mgr): kwargs['placement'] = [0] return self.make_block(values, klass=ObjectBlock, **kwargs) - def replace(self, *args, **kwargs): - # if we are forced to ObjectBlock, then don't coerce (to UTC) - kwargs['convert'] = False - return super(DatetimeTZBlock, self).replace(*args, **kwargs) - def _slice(self, slicer): """ return a slice of my values """ if isinstance(slicer, tuple): @@ -2246,8 +2344,8 @@ def _try_coerce_args(self, values, other): ------- base-type values, values mask, base-type other, other mask """ - values_mask = isnull(values) - values = values.tz_localize(None).asi8 + values_mask = _block_shape(isnull(values), ndim=self.ndim) + values = _block_shape(values.tz_localize(None).asi8, ndim=self.ndim) other_mask = False if isinstance(other, ABCSeries): @@ -2283,6 +2381,9 @@ def _try_coerce_result(self, result): elif isinstance(result, (np.integer, np.float, np.datetime64)): result = lib.Timestamp(result).tz_localize(self.values.tz) if isinstance(result, np.ndarray): + # allow passing of > 1dim if its trivial + if result.ndim > 1: + result = result.reshape(len(result)) result = self._holder(result).tz_localize(self.values.tz) return result @@ -2809,7 +2910,7 @@ def _verify_integrity(self): len(self.items), tot_items)) def apply(self, f, axes=None, filter=None, do_integrity_check=False, - consolidate=True, raw=False, **kwargs): + consolidate=True, **kwargs): """ iterate over the blocks, collect and create a new block manager @@ -2823,7 +2924,6 @@ def apply(self, f, axes=None, filter=None, do_integrity_check=False, integrity check consolidate: boolean, default True. Join together blocks having same dtype - raw: boolean, default False. 
Return the raw returned results Returns ------- @@ -2890,17 +2990,102 @@ def apply(self, f, axes=None, filter=None, do_integrity_check=False, applied = getattr(b, f)(**kwargs) result_blocks = _extend_blocks(applied, result_blocks) - if raw: - if self._is_single_block: - return result_blocks[0] - return result_blocks - elif len(result_blocks) == 0: + if len(result_blocks) == 0: return self.make_empty(axes or self.axes) bm = self.__class__(result_blocks, axes or self.axes, do_integrity_check=do_integrity_check) bm._consolidate_inplace() return bm + def reduction(self, f, axis=0, consolidate=True, transposed=False, + **kwargs): + """ + iterate over the blocks, collect and create a new block manager. + This routine is intended for reduction type operations and + will do inference on the generated blocks. + + Parameters + ---------- + f: the callable or function name to operate on at the block level + axis: reduction axis, default 0 + consolidate: boolean, default True. Join together blocks having same + dtype + transposed: boolean, default False + we are holding transposed data + + Returns + ------- + Block Manager (new object) + + """ + + if consolidate: + self._consolidate_inplace() + + axes, blocks = [], [] + for b in self.blocks: + kwargs['mgr'] = self + axe, block = getattr(b, f)(axis=axis, **kwargs) + + axes.append(axe) + blocks.append(block) + + # note that some DatetimeTZ, Categorical are always ndim==1 + ndim = set([b.ndim for b in blocks]) + + if 2 in ndim: + + new_axes = list(self.axes) + + # multiple blocks that are reduced + if len(blocks) > 1: + new_axes[1] = axes[0] + + # reset the placement to the original + for b, sb in zip(blocks, self.blocks): + b.mgr_locs = sb.mgr_locs + + else: + new_axes[axis] = Index(np.concatenate( + [ax.values for ax in axes])) + + if transposed: + new_axes = new_axes[::-1] + blocks = [b.make_block(b.values.T, + placement=np.arange(b.shape[1]) + ) for b in blocks] + + return self.__class__(blocks, new_axes) + + # 0 ndim + if 0 in ndim and 1 not in ndim: + values = np.array([b.values for b in blocks]) + if len(values) == 1: + return values.item() + blocks = [make_block(values, ndim=1)] + axes = Index([ax[0] for ax in axes]) + + # single block + values = _concat._concat_compat([b.values for b in blocks]) + + # compute the orderings of our original data + if len(self.blocks) > 1: + + indexer = np.empty(len(self.axes[0]), dtype='int64') + i = 0 + for b in self.blocks: + for j in b.mgr_locs: + indexer[j] = i + i = i + 1 + + values = values.take(indexer) + + return SingleBlockManager( + [make_block(values, + ndim=1, + placement=np.arange(len(values)))], + axes[0]) + def isnull(self, **kwargs): return self.apply('apply', **kwargs) @@ -2911,7 +3096,7 @@ def eval(self, **kwargs): return self.apply('eval', **kwargs) def quantile(self, **kwargs): - return self.apply('quantile', raw=True, **kwargs) + return self.reduction('quantile', **kwargs) def setitem(self, **kwargs): return self.apply('setitem', **kwargs) @@ -3068,7 +3253,6 @@ def combine(self, blocks, copy=True): indexer = np.sort(np.concatenate([b.mgr_locs.as_array for b in blocks])) inv_indexer = lib.get_reverse_indexer(indexer, self.shape[0]) - new_items = self.items.take(indexer) new_blocks = [] for b in blocks: @@ -3077,9 +3261,10 @@ def combine(self, blocks, copy=True): axis=0, allow_fill=False) new_blocks.append(b) - new_axes = list(self.axes) - new_axes[0] = new_items - return self.__class__(new_blocks, new_axes, do_integrity_check=False) + axes = list(self.axes) + axes[0] = self.items.take(indexer) 
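+        # the sorted ``indexer`` covers exactly the items belonging to the
+        # passed blocks, so the new axis keeps those columns in their
+        # original order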
+ + return self.__class__(new_blocks, axes, do_integrity_check=False) def get_slice(self, slobj, axis=0): if axis >= self.ndim: @@ -3829,6 +4014,16 @@ def _block(self): def _values(self): return self._block.values + @property + def _blknos(self): + """ compat with BlockManager """ + return None + + @property + def _blklocs(self): + """ compat with BlockManager """ + return None + def reindex(self, new_axis, indexer=None, method=None, fill_value=None, limit=None, copy=True): # if we are the same and don't copy, just return @@ -4317,7 +4512,7 @@ def _extend_blocks(result, blocks=None): def _block_shape(values, ndim=1, shape=None): """ guarantee the shape of the values to be at least 1 d """ - if values.ndim <= ndim: + if values.ndim < ndim: if shape is None: shape = values.shape values = values.reshape(tuple((1, ) + shape)) diff --git a/pandas/core/series.py b/pandas/core/series.py index 58e983ad904ba..43b4ba3a51212 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -57,8 +57,6 @@ from pandas.core.config import get_option -from pandas import _np_version_under1p9 - __all__ = ['Series'] _shared_doc_kwargs = dict( @@ -1349,21 +1347,12 @@ def quantile(self, q=0.5, interpolation='linear'): self._check_percentile(q) - if _np_version_under1p9: - if interpolation != 'linear': - raise ValueError("Interpolation methods other than linear " - "are not supported in numpy < 1.9.") - - kwargs = dict() - if not _np_version_under1p9: - kwargs.update({'interpolation': interpolation}) + result = self._data.quantile(qs=q, interpolation=interpolation) - result = self._data.quantile(qs=q, **kwargs) - - if com.is_list_like(result): - # explicitly use Float64Index to coerce empty result to float dtype - index = Float64Index(q) - return self._constructor(result, index=index, name=self.name) + if com.is_list_like(q): + return self._constructor(result, + index=Float64Index(q), + name=self.name) else: # scalar return result diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index dff2c6f0df7b1..318fd17b8f88e 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -3826,24 +3826,24 @@ def write_data(self, chunksize, dropna=False): nrows = self.nrows_expected # if dropna==True, then drop ALL nan rows + masks = [] if dropna: - masks = [] for a in self.values_axes: # figure the mask: only do if we can successfully process this # column, otherwise ignore the mask mask = com.isnull(a.data).all(axis=0) - masks.append(mask.astype('u1', copy=False)) + if isinstance(mask, np.ndarray): + masks.append(mask.astype('u1', copy=False)) - # consolidate masks + # consolidate masks + if len(masks): mask = masks[0] for m in masks[1:]: mask = mask & m mask = mask.ravel() - else: - mask = None # broadcast the indexes if needed diff --git a/pandas/src/inference.pyx b/pandas/src/inference.pyx index 843031fafa1a9..3ccc1c4f9336c 100644 --- a/pandas/src/inference.pyx +++ b/pandas/src/inference.pyx @@ -642,6 +642,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, bint seen_float = 0 bint seen_complex = 0 bint seen_datetime = 0 + bint seen_datetimetz = 0 bint seen_timedelta = 0 bint seen_int = 0 bint seen_bool = 0 @@ -675,6 +676,15 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, if val is None: seen_null = 1 floats[i] = complexes[i] = fnan + elif val is NaT: + if convert_datetime: + idatetimes[i] = iNaT + seen_datetime = 1 + if convert_timedelta: + itimedeltas[i] = iNaT + seen_timedelta = 1 + if not (convert_datetime or convert_timedelta): + seen_object = 1 elif 
util.is_bool_object(val): seen_bool = 1 bools[i] = val @@ -710,9 +720,15 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, complexes[i] = val seen_complex = 1 elif PyDateTime_Check(val) or util.is_datetime64_object(val): + + # if we have an tz's attached then return the objects if convert_datetime: - seen_datetime = 1 - idatetimes[i] = convert_to_tsobject(val, None, None, 0, 0).value + if getattr(val, 'tzinfo', None) is not None: + seen_datetimetz = 1 + break + else: + seen_datetime = 1 + idatetimes[i] = convert_to_tsobject(val, None, None, 0, 0).value else: seen_object = 1 break @@ -731,6 +747,13 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, seen_numeric = seen_complex or seen_float or seen_int + # we try to coerce datetime w/tz but must all have the same tz + if seen_datetimetz: + if len(set([ getattr(val, 'tz', None) for val in objects ])) == 1: + from pandas import DatetimeIndex + return DatetimeIndex(objects) + seen_object = 1 + if not seen_object: if not safe: diff --git a/pandas/tests/frame/test_quantile.py b/pandas/tests/frame/test_quantile.py index d883363812ddb..52e8697abe850 100644 --- a/pandas/tests/frame/test_quantile.py +++ b/pandas/tests/frame/test_quantile.py @@ -28,9 +28,12 @@ def test_quantile(self): q = self.tsframe.quantile(0.1, axis=0) self.assertEqual(q['A'], percentile(self.tsframe['A'], 10)) + tm.assert_index_equal(q.index, self.tsframe.columns) + q = self.tsframe.quantile(0.9, axis=1) - q = self.intframe.quantile(0.1) - self.assertEqual(q['A'], percentile(self.intframe['A'], 10)) + self.assertEqual(q['2000-01-17'], + percentile(self.tsframe.loc['2000-01-17'], 90)) + tm.assert_index_equal(q.index, self.tsframe.index) # test degenerate case q = DataFrame({'x': [], 'y': []}).quantile(0.1, axis=0) @@ -39,13 +42,13 @@ def test_quantile(self): # non-numeric exclusion df = DataFrame({'col1': ['A', 'A', 'B', 'B'], 'col2': [1, 2, 3, 4]}) rs = df.quantile(0.5) - xp = df.median() + xp = df.median().rename(0.5) assert_series_equal(rs, xp) # axis df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3]) result = df.quantile(.5, axis=1) - expected = Series([1.5, 2.5, 3.5], index=[1, 2, 3]) + expected = Series([1.5, 2.5, 3.5], index=[1, 2, 3], name=0.5) assert_series_equal(result, expected) result = df.quantile([.5, .75], axis=1) @@ -59,9 +62,25 @@ def test_quantile(self): df = DataFrame([[1, 2, 3], ['a', 'b', 4]]) result = df.quantile(.5, axis=1) - expected = Series([3., 4.], index=[0, 1]) + expected = Series([3., 4.], index=[0, 1], name=0.5) assert_series_equal(result, expected) + def test_quantile_axis_mixed(self): + + # mixed on axis=1 + df = DataFrame({"A": [1, 2, 3], + "B": [2., 3., 4.], + "C": pd.date_range('20130101', periods=3), + "D": ['foo', 'bar', 'baz']}) + result = df.quantile(.5, axis=1) + expected = Series([1.5, 2.5, 3.5], name=0.5) + assert_series_equal(result, expected) + + # must raise + def f(): + df.quantile(.5, axis=1, numeric_only=False) + self.assertRaises(TypeError, f) + def test_quantile_axis_parameter(self): # GH 9543/9544 @@ -69,7 +88,7 @@ def test_quantile_axis_parameter(self): result = df.quantile(.5, axis=0) - expected = Series([2., 3.], index=["A", "B"]) + expected = Series([2., 3.], index=["A", "B"], name=0.5) assert_series_equal(result, expected) expected = df.quantile(.5, axis="index") @@ -77,7 +96,7 @@ def test_quantile_axis_parameter(self): result = df.quantile(.5, axis=1) - expected = Series([1.5, 2.5, 3.5], index=[1, 2, 3]) + expected = Series([1.5, 2.5, 3.5], index=[1, 2, 3], name=0.5) 
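+        # GH 13146: quantile with a scalar ``q`` now names the resulting
+        # Series with the quantile value, hence ``name=0.5`` here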
assert_series_equal(result, expected) result = df.quantile(.5, axis="columns") @@ -107,22 +126,23 @@ def test_quantile_interpolation(self): # interpolation method other than default linear df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3]) result = df.quantile(.5, axis=1, interpolation='nearest') - expected = Series([1, 2, 3], index=[1, 2, 3]) + expected = Series([1, 2, 3], index=[1, 2, 3], name=0.5) assert_series_equal(result, expected) + # cross-check interpolation=nearest results in original dtype exp = np.percentile(np.array([[1, 2, 3], [2, 3, 4]]), .5, axis=0, interpolation='nearest') - expected = Series(exp, index=[1, 2, 3], dtype='int64') + expected = Series(exp, index=[1, 2, 3], name=0.5, dtype='int64') assert_series_equal(result, expected) # float df = DataFrame({"A": [1., 2., 3.], "B": [2., 3., 4.]}, index=[1, 2, 3]) result = df.quantile(.5, axis=1, interpolation='nearest') - expected = Series([1., 2., 3.], index=[1, 2, 3]) + expected = Series([1., 2., 3.], index=[1, 2, 3], name=0.5) assert_series_equal(result, expected) exp = np.percentile(np.array([[1., 2., 3.], [2., 3., 4.]]), .5, axis=0, interpolation='nearest') - expected = Series(exp, index=[1, 2, 3], dtype='float64') + expected = Series(exp, index=[1, 2, 3], name=0.5, dtype='float64') assert_series_equal(result, expected) # axis @@ -217,7 +237,8 @@ def test_quantile_datetime(self): # datetime result = df.quantile(.5, numeric_only=False) expected = Series([Timestamp('2010-07-02 12:00:00'), 2.5], - index=['a', 'b']) + index=['a', 'b'], + name=0.5) assert_series_equal(result, expected) # datetime w/ multi @@ -231,7 +252,8 @@ def test_quantile_datetime(self): result = df[['a', 'c']].quantile(.5, axis=1, numeric_only=False) expected = Series([Timestamp('2010-07-02 12:00:00'), Timestamp('2011-07-02 12:00:00')], - index=[0, 1]) + index=[0, 1], + name=0.5) assert_series_equal(result, expected) result = df[['a', 'c']].quantile([.5], axis=1, numeric_only=False) @@ -256,12 +278,13 @@ def test_quantile_box(self): 'C': [pd.Timedelta('1 days'), pd.Timedelta('2 days'), pd.Timedelta('3 days')]}) + res = df.quantile(0.5, numeric_only=False) - # when squeezed, result.name is explicitly reset + exp = pd.Series([pd.Timestamp('2011-01-02'), pd.Timestamp('2011-01-02', tz='US/Eastern'), pd.Timedelta('2 days')], - name=None, index=['A', 'B', 'C']) + name=0.5, index=['A', 'B', 'C']) tm.assert_series_equal(res, exp) res = df.quantile([0.5], numeric_only=False) @@ -305,7 +328,7 @@ def test_quantile_box(self): pd.Timestamp('2011-01-02', tz='US/Eastern'), pd.Timedelta('2 days'), pd.Timedelta('2 days')], - name=None, index=list('AaBbCc')) + name=0.5, index=list('AaBbCc')) tm.assert_series_equal(res, exp) res = df.quantile([0.5], numeric_only=False) diff --git a/pandas/tests/series/test_quantile.py b/pandas/tests/series/test_quantile.py index f538fa4e90401..e0bff7fbd39e4 100644 --- a/pandas/tests/series/test_quantile.py +++ b/pandas/tests/series/test_quantile.py @@ -126,6 +126,14 @@ def test_quantile_interpolation_np_lt_1p9(self): interpolation='higher') def test_quantile_nan(self): + + # GH 13098 + s = pd.Series([1, 2, 3, 4, np.nan]) + result = s.quantile(0.5) + expected = 2.5 + self.assertEqual(result, expected) + + # all nan/empty cases = [Series([]), Series([np.nan, np.nan])] for s in cases: diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 5bd5c80f18386..583b1c7aea270 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -2676,7 +2676,7 @@ def f(x, q=None, axis=0): trans_expected 
= ts_grouped.transform(g) assert_series_equal(apply_result, agg_expected) - assert_series_equal(agg_result, agg_expected) + assert_series_equal(agg_result, agg_expected, check_names=False) assert_series_equal(trans_result, trans_expected) agg_result = ts_grouped.agg(f, q=80) @@ -2692,11 +2692,11 @@ def f(x, q=None, axis=0): apply_result = df_grouped.apply(DataFrame.quantile, .8) expected = df_grouped.quantile(.8) assert_frame_equal(apply_result, expected) - assert_frame_equal(agg_result, expected) + assert_frame_equal(agg_result, expected, check_names=False) agg_result = df_grouped.agg(f, q=80) apply_result = df_grouped.apply(DataFrame.quantile, q=.8) - assert_frame_equal(agg_result, expected) + assert_frame_equal(agg_result, expected, check_names=False) assert_frame_equal(apply_result, expected) def test_size(self): From c9ffd7891dadd6e5590695e142f77a3476b5c4e3 Mon Sep 17 00:00:00 2001 From: dsm054 Date: Fri, 13 May 2016 10:47:06 +0200 Subject: [PATCH 07/14] DOC: Fix delim_whitespace regex typo. Minor typo in the explanation of delim_whitespace which tripped up a user on SO (although the user should probably have been using `delim_whitespace=True` directly anyhow.) Author: dsm054 Closes #13165 from dsm054/fix-delim_whitespace-regex and squashes the following commits: c8f13d2 [dsm054] DOC: Fix delim_whitespace regex typo. --- doc/source/io.rst | 2 +- pandas/io/parsers.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index cc51fbd1e30ab..af8bca14e5d6f 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -99,7 +99,7 @@ delimiter : str, default ``None`` Alternative argument name for sep. delim_whitespace : boolean, default False Specifies whether or not whitespace (e.g. ``' '`` or ``'\t'``) - will be used as the delimiter. Equivalent to setting ``sep='\+s'``. + will be used as the delimiter. Equivalent to setting ``sep='\s+'``. If this option is set to True, nothing should be passed in for the ``delimiter`` parameter. diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index f4527df56db88..25639984e4ccf 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -55,7 +55,7 @@ Alternative argument name for sep. delim_whitespace : boolean, default False Specifies whether or not whitespace (e.g. ``' '`` or ``'\t'``) will be - used as the sep. Equivalent to setting ``sep='\+s'``. If this option + used as the sep. Equivalent to setting ``sep='\s+'``. If this option is set to True, nothing should be passed in for the ``delimiter`` parameter. 
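As a quick illustration of the equivalence the corrected docs describe, ``delim_whitespace=True`` and ``sep='\s+'`` (rather than the old typo ``'\+s'``) parse the same frame. This is a minimal sketch, not part of the patch; the inline sample data and the Python 3 ``StringIO`` import are assumptions for illustration:

.. code-block:: python

   from io import StringIO

   import pandas as pd

   data = "a b  c\n1 2  3\n4 5  6"

   # two equivalent spellings of whitespace-delimited parsing
   left = pd.read_csv(StringIO(data), delim_whitespace=True)
   right = pd.read_csv(StringIO(data), sep=r'\s+')

   assert left.equals(right)
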
From e5c18b4383bd49b7a6f42f9e3c299c8746b5a347 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=A1bor=20Lipt=C3=A1k?= Date: Fri, 13 May 2016 09:14:18 -0400 Subject: [PATCH 08/14] BUG: Correct KeyError from matplotlib when processing Series yerr MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit closes #11858 Author: Gábor Lipták Closes #13114 from gliptak/yerr1 and squashes the following commits: 926329a [Gábor Lipták] Correct KeyError from matplotlib when processing Series xerr/yerr --- codecov.yml | 4 ---- doc/source/whatsnew/v0.18.2.txt | 2 +- pandas/tools/plotting.py | 4 ++++ pandas/tseries/tests/test_plotting.py | 7 +++++++ 4 files changed, 12 insertions(+), 5 deletions(-) diff --git a/codecov.yml b/codecov.yml index 86e7dd55c9550..45a6040c6a50d 100644 --- a/codecov.yml +++ b/codecov.yml @@ -7,7 +7,3 @@ coverage: default: target: '50' branches: null - changes: - default: - branches: - - master diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index 85209c0dfa03d..0bab6c2ff74e0 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -33,7 +33,6 @@ Other enhancements - .. _whatsnew_0182.api: API changes @@ -108,6 +107,7 @@ Performance Improvements Bug Fixes ~~~~~~~~~ - Bug in ``SparseDataFrame`` in which ``axis=None`` did not default to ``axis=0`` (:issue:`13048`) +- Bug when passing a not-default-indexed ``Series`` as ``xerr`` or ``yerr`` in ``.plot()`` (:issue:`11858`) diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py index 808c9d22c53c8..baca8045f0cc1 100644 --- a/pandas/tools/plotting.py +++ b/pandas/tools/plotting.py @@ -1331,6 +1331,10 @@ def _plot(cls, ax, x, y, style=None, is_errorbar=False, **kwds): x = x._mpl_repr() if is_errorbar: + if 'xerr' in kwds: + kwds['xerr'] = np.array(kwds.get('xerr')) + if 'yerr' in kwds: + kwds['yerr'] = np.array(kwds.get('yerr')) return ax.errorbar(x, y, **kwds) else: # prevent style kwarg from going to errorbar, where it is diff --git a/pandas/tseries/tests/test_plotting.py b/pandas/tseries/tests/test_plotting.py index 9fab9c0990ef0..0284df9e58933 100644 --- a/pandas/tseries/tests/test_plotting.py +++ b/pandas/tseries/tests/test_plotting.py @@ -76,6 +76,13 @@ def test_frame_inferred(self): df = DataFrame(np.random.randn(len(idx), 3), index=idx) _check_plot_works(df.plot) + def test_is_error_nozeroindex(self): + # GH11858 + i = np.array([1, 2, 3]) + a = DataFrame(i, index=i) + _check_plot_works(a.plot, xerr=a) + _check_plot_works(a.plot, yerr=a) + def test_nonnumeric_exclude(self): import matplotlib.pyplot as plt From 00d4ec3e7b7fa68d5cf226f7b63a5eea23167b45 Mon Sep 17 00:00:00 2001 From: sinhrks Date: Fri, 13 May 2016 09:20:23 -0400 Subject: [PATCH 09/14] BUG: Misc fixes for SparseSeries indexing with MI closes #13144 Author: sinhrks Closes #13163 from sinhrks/sparse_multi and squashes the following commits: eb24102 [sinhrks] BUG: Misc fixes for SparseSeries indexing with MI --- doc/source/whatsnew/v0.18.2.txt | 3 + pandas/indexes/multi.py | 4 +- pandas/sparse/series.py | 20 ++-- pandas/sparse/tests/test_format.py | 60 +++++++++++ pandas/sparse/tests/test_indexing.py | 142 ++++++++++++++++++++++++--- pandas/sparse/tests/test_series.py | 9 ++ pandas/tests/formats/test_format.py | 19 ---- 7 files changed, 214 insertions(+), 43 deletions(-) create mode 100644 pandas/sparse/tests/test_format.py diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index 0bab6c2ff74e0..bae8b1358826b 100644 --- 
a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -106,6 +106,9 @@ Performance Improvements Bug Fixes ~~~~~~~~~ + +- Bug in ``SparseSeries`` with ``MultiIndex`` ``[]`` indexing may raise ``IndexError`` (:issue:`13144`) +- Bug in ``SparseSeries`` with ``MultiIndex`` ``[]`` indexing result may have normal ``Index`` (:issue:`13144`) - Bug in ``SparseDataFrame`` in which ``axis=None`` did not default to ``axis=0`` (:issue:`13048`) - Bug when passing a not-default-indexed ``Series`` as ``xerr`` or ``yerr`` in ``.plot()`` (:issue:`11858`) diff --git a/pandas/indexes/multi.py b/pandas/indexes/multi.py index 3effc9b1315e6..db2f80ae78446 100644 --- a/pandas/indexes/multi.py +++ b/pandas/indexes/multi.py @@ -592,7 +592,6 @@ def fillna(self, value=None, downcast=None): def get_value(self, series, key): # somewhat broken encapsulation from pandas.core.indexing import maybe_droplevels - from pandas.core.series import Series # Label-based s = _values_from_object(series) @@ -604,7 +603,8 @@ def _try_mi(k): new_values = series._values[loc] new_index = self[loc] new_index = maybe_droplevels(new_index, k) - return Series(new_values, index=new_index, name=series.name) + return series._constructor(new_values, index=new_index, + name=series.name).__finalize__(self) try: return self._engine.get_value(s, k) diff --git a/pandas/sparse/series.py b/pandas/sparse/series.py index a783a7c596955..519068b97a010 100644 --- a/pandas/sparse/series.py +++ b/pandas/sparse/series.py @@ -5,14 +5,13 @@ # pylint: disable=E1101,E1103,W0231 -from numpy import nan, ndarray import numpy as np import warnings import operator from pandas.compat.numpy import function as nv from pandas.core.common import isnull, _values_from_object, _maybe_match_name -from pandas.core.index import Index, _ensure_index +from pandas.core.index import Index, _ensure_index, InvalidIndexError from pandas.core.series import Series from pandas.core.frame import DataFrame from pandas.core.internals import SingleBlockManager @@ -135,7 +134,7 @@ def __init__(self, data=None, index=None, sparse_index=None, kind='block', if is_sparse_array: fill_value = data.fill_value else: - fill_value = nan + fill_value = np.nan if is_sparse_array: if isinstance(data, SparseSeries) and index is None: @@ -393,8 +392,10 @@ def _get_val_at(self, loc): def __getitem__(self, key): try: - return self._get_val_at(self.index.get_loc(key)) + return self.index.get_value(self, key) + except InvalidIndexError: + pass except KeyError: if isinstance(key, (int, np.integer)): return self._get_val_at(key) @@ -406,13 +407,12 @@ def __getitem__(self, key): # Could not hash item, must be array-like? pass - # is there a case where this would NOT be an ndarray? 
- # need to find an example, I took out the case for now - key = _values_from_object(key) - dataSlice = self.values[key] - new_index = Index(self.index.view(ndarray)[key]) - return self._constructor(dataSlice, index=new_index).__finalize__(self) + if self.index.nlevels > 1 and isinstance(key, tuple): + # to handle MultiIndex labels + key = self.index.get_loc(key) + return self._constructor(self.values[key], + index=self.index[key]).__finalize__(self) def _get_values(self, indexer): try: diff --git a/pandas/sparse/tests/test_format.py b/pandas/sparse/tests/test_format.py new file mode 100644 index 0000000000000..2981e0f4af0bf --- /dev/null +++ b/pandas/sparse/tests/test_format.py @@ -0,0 +1,60 @@ +# -*- coding: utf-8 -*- +from __future__ import print_function + +import numpy as np +import pandas as pd + +import pandas.util.testing as tm +from pandas.compat import (is_platform_windows, + is_platform_32bit) +from pandas.core.config import option_context + + +use_32bit_repr = is_platform_windows() or is_platform_32bit() + + +class TestSeriesFormatting(tm.TestCase): + + _multiprocess_can_split_ = True + + def test_sparse_max_row(self): + s = pd.Series([1, np.nan, np.nan, 3, np.nan]).to_sparse() + result = repr(s) + dtype = '' if use_32bit_repr else ', dtype=int32' + exp = ("0 1.0\n1 NaN\n2 NaN\n3 3.0\n" + "4 NaN\ndtype: float64\nBlockIndex\n" + "Block locations: array([0, 3]{0})\n" + "Block lengths: array([1, 1]{0})".format(dtype)) + self.assertEqual(result, exp) + + with option_context("display.max_rows", 3): + # GH 10560 + result = repr(s) + exp = ("0 1.0\n ... \n4 NaN\n" + "dtype: float64\nBlockIndex\n" + "Block locations: array([0, 3]{0})\n" + "Block lengths: array([1, 1]{0})".format(dtype)) + self.assertEqual(result, exp) + + def test_sparse_mi_max_row(self): + idx = pd.MultiIndex.from_tuples([('A', 0), ('A', 1), ('B', 0), + ('C', 0), ('C', 1), ('C', 2)]) + s = pd.Series([1, np.nan, np.nan, 3, np.nan, np.nan], + index=idx).to_sparse() + result = repr(s) + dtype = '' if use_32bit_repr else ', dtype=int32' + exp = ("A 0 1.0\n 1 NaN\nB 0 NaN\n" + "C 0 3.0\n 1 NaN\n 2 NaN\n" + "dtype: float64\nBlockIndex\n" + "Block locations: array([0, 3], dtype=int32)\n" + "Block lengths: array([1, 1]{0})".format(dtype)) + self.assertEqual(result, exp) + + with option_context("display.max_rows", 3): + # GH 13144 + result = repr(s) + exp = ("A 0 1.0\n ... 
\nC 2 NaN\n" + "dtype: float64\nBlockIndex\n" + "Block locations: array([0, 3], dtype=int32)\n" + "Block lengths: array([1, 1]{0})".format(dtype)) + self.assertEqual(result, exp) diff --git a/pandas/sparse/tests/test_indexing.py b/pandas/sparse/tests/test_indexing.py index ca2996941aef7..1f88d22bd8f93 100644 --- a/pandas/sparse/tests/test_indexing.py +++ b/pandas/sparse/tests/test_indexing.py @@ -10,9 +10,13 @@ class TestSparseSeriesIndexing(tm.TestCase): _multiprocess_can_split_ = True + def setUp(self): + self.orig = pd.Series([1, np.nan, np.nan, 3, np.nan]) + self.sparse = self.orig.to_sparse() + def test_getitem(self): - orig = pd.Series([1, np.nan, np.nan, 3, np.nan]) - sparse = orig.to_sparse() + orig = self.orig + sparse = self.sparse self.assertEqual(sparse[0], 1) self.assertTrue(np.isnan(sparse[1])) @@ -33,8 +37,9 @@ def test_getitem(self): tm.assert_sp_series_equal(result, exp) def test_getitem_slice(self): - orig = pd.Series([1, np.nan, np.nan, 3, np.nan]) - sparse = orig.to_sparse() + orig = self.orig + sparse = self.sparse + tm.assert_sp_series_equal(sparse[:2], orig[:2].to_sparse()) tm.assert_sp_series_equal(sparse[4:2], orig[4:2].to_sparse()) tm.assert_sp_series_equal(sparse[::2], orig[::2].to_sparse()) @@ -84,8 +89,8 @@ def test_getitem_slice_fill_value(self): orig[-5:].to_sparse(fill_value=0)) def test_loc(self): - orig = pd.Series([1, np.nan, np.nan, 3, np.nan]) - sparse = orig.to_sparse() + orig = self.orig + sparse = self.sparse self.assertEqual(sparse.loc[0], 1) self.assertTrue(np.isnan(sparse.loc[1])) @@ -154,10 +159,17 @@ def test_loc_index_fill_value(self): tm.assert_sp_series_equal(result, exp) def test_loc_slice(self): - orig = pd.Series([1, np.nan, np.nan, 3, np.nan]) - sparse = orig.to_sparse() + orig = self.orig + sparse = self.sparse tm.assert_sp_series_equal(sparse.loc[2:], orig.loc[2:].to_sparse()) + def test_loc_slice_index_fill_value(self): + orig = pd.Series([1, np.nan, 0, 3, 0], index=list('ABCDE')) + sparse = orig.to_sparse(fill_value=0) + + tm.assert_sp_series_equal(sparse.loc['C':], + orig.loc['C':].to_sparse(fill_value=0)) + def test_loc_slice_fill_value(self): orig = pd.Series([1, np.nan, 0, 3, 0]) sparse = orig.to_sparse(fill_value=0) @@ -165,8 +177,8 @@ def test_loc_slice_fill_value(self): orig.loc[2:].to_sparse(fill_value=0)) def test_iloc(self): - orig = pd.Series([1, np.nan, np.nan, 3, np.nan]) - sparse = orig.to_sparse() + orig = self.orig + sparse = self.sparse self.assertEqual(sparse.iloc[3], 3) self.assertTrue(np.isnan(sparse.iloc[2])) @@ -234,8 +246,9 @@ def test_at_fill_value(self): self.assertEqual(sparse.at['e'], orig.at['e']) def test_iat(self): - orig = pd.Series([1, np.nan, np.nan, 3, np.nan]) - sparse = orig.to_sparse() + orig = self.orig + sparse = self.sparse + self.assertEqual(sparse.iat[0], orig.iat[0]) self.assertTrue(np.isnan(sparse.iat[1])) self.assertTrue(np.isnan(sparse.iat[2])) @@ -356,6 +369,111 @@ def test_reindex_fill_value(self): tm.assert_sp_series_equal(res, exp) +class TestSparseSeriesMultiIndexing(TestSparseSeriesIndexing): + + _multiprocess_can_split_ = True + + def setUp(self): + # Mi with duplicated values + idx = pd.MultiIndex.from_tuples([('A', 0), ('A', 1), ('B', 0), + ('C', 0), ('C', 1)]) + self.orig = pd.Series([1, np.nan, np.nan, 3, np.nan], index=idx) + self.sparse = self.orig.to_sparse() + + def test_getitem_multi(self): + orig = self.orig + sparse = self.sparse + + self.assertEqual(sparse[0], orig[0]) + self.assertTrue(np.isnan(sparse[1])) + self.assertEqual(sparse[3], orig[3]) + + 
tm.assert_sp_series_equal(sparse['A'], orig['A'].to_sparse()) + tm.assert_sp_series_equal(sparse['B'], orig['B'].to_sparse()) + + result = sparse[[1, 3, 4]] + exp = orig[[1, 3, 4]].to_sparse() + tm.assert_sp_series_equal(result, exp) + + # dense array + result = sparse[orig % 2 == 1] + exp = orig[orig % 2 == 1].to_sparse() + tm.assert_sp_series_equal(result, exp) + + # sparse array (actually it coerces to a normal Series) + result = sparse[sparse % 2 == 1] + exp = orig[orig % 2 == 1].to_sparse() + tm.assert_sp_series_equal(result, exp) + + def test_getitem_multi_tuple(self): + orig = self.orig + sparse = self.sparse + + self.assertEqual(sparse['C', 0], orig['C', 0]) + self.assertTrue(np.isnan(sparse['A', 1])) + self.assertTrue(np.isnan(sparse['B', 0])) + + def test_getitems_slice_multi(self): + orig = self.orig + sparse = self.sparse + + tm.assert_sp_series_equal(sparse[2:], orig[2:].to_sparse()) + tm.assert_sp_series_equal(sparse.loc['B':], orig.loc['B':].to_sparse()) + tm.assert_sp_series_equal(sparse.loc['C':], orig.loc['C':].to_sparse()) + + tm.assert_sp_series_equal(sparse.loc['A':'B'], + orig.loc['A':'B'].to_sparse()) + tm.assert_sp_series_equal(sparse.loc[:'B'], orig.loc[:'B'].to_sparse()) + + def test_loc(self): + # needs to be overridden to use different labels + orig = self.orig + sparse = self.sparse + + tm.assert_sp_series_equal(sparse.loc['A'], + orig.loc['A'].to_sparse()) + tm.assert_sp_series_equal(sparse.loc['B'], + orig.loc['B'].to_sparse()) + + result = sparse.loc[[1, 3, 4]] + exp = orig.loc[[1, 3, 4]].to_sparse() + tm.assert_sp_series_equal(result, exp) + + # exceeds the bounds + result = sparse.loc[[1, 3, 4, 5]] + exp = orig.loc[[1, 3, 4, 5]].to_sparse() + tm.assert_sp_series_equal(result, exp) + + # dense array + result = sparse.loc[orig % 2 == 1] + exp = orig.loc[orig % 2 == 1].to_sparse() + tm.assert_sp_series_equal(result, exp) + + # sparse array (actually it coerces to a normal Series) + result = sparse.loc[sparse % 2 == 1] + exp = orig.loc[orig % 2 == 1].to_sparse() + tm.assert_sp_series_equal(result, exp) + + def test_loc_multi_tuple(self): + orig = self.orig + sparse = self.sparse + + self.assertEqual(sparse.loc['C', 0], orig.loc['C', 0]) + self.assertTrue(np.isnan(sparse.loc['A', 1])) + self.assertTrue(np.isnan(sparse.loc['B', 0])) + + def test_loc_slice(self): + orig = self.orig + sparse = self.sparse + tm.assert_sp_series_equal(sparse.loc['A':], orig.loc['A':].to_sparse()) + tm.assert_sp_series_equal(sparse.loc['B':], orig.loc['B':].to_sparse()) + tm.assert_sp_series_equal(sparse.loc['C':], orig.loc['C':].to_sparse()) + + tm.assert_sp_series_equal(sparse.loc['A':'B'], + orig.loc['A':'B'].to_sparse()) + tm.assert_sp_series_equal(sparse.loc[:'B'], orig.loc[:'B'].to_sparse()) + + class TestSparseDataFrameIndexing(tm.TestCase): _multiprocess_can_split_ = True diff --git a/pandas/sparse/tests/test_series.py b/pandas/sparse/tests/test_series.py index 44bc51077ef3e..5cbc509b836db 100644 --- a/pandas/sparse/tests/test_series.py +++ b/pandas/sparse/tests/test_series.py @@ -1019,6 +1019,15 @@ def test_from_coo_nodense_index(self): check = check.dropna().to_sparse() tm.assert_sp_series_equal(ss, check) + def test_from_coo_long_repr(self): + # GH 13114 + # test it doesn't raise an error. 
Formatting is tested in test_format + tm._skip_if_no_scipy() + import scipy.sparse + + sparse = SparseSeries.from_coo(scipy.sparse.rand(350, 18)) + repr(sparse) + def _run_test(self, ss, kwargs, check): results = ss.to_coo(**kwargs) self._check_results_to_coo(results, check) diff --git a/pandas/tests/formats/test_format.py b/pandas/tests/formats/test_format.py index 96770a86ff383..7a806280916f1 100644 --- a/pandas/tests/formats/test_format.py +++ b/pandas/tests/formats/test_format.py @@ -3758,25 +3758,6 @@ def test_to_string_header(self): exp = '0 0\n ..\n9 9' self.assertEqual(res, exp) - def test_sparse_max_row(self): - s = pd.Series([1, np.nan, np.nan, 3, np.nan]).to_sparse() - result = repr(s) - dtype = '' if use_32bit_repr else ', dtype=int32' - exp = ("0 1.0\n1 NaN\n2 NaN\n3 3.0\n" - "4 NaN\ndtype: float64\nBlockIndex\n" - "Block locations: array([0, 3]{0})\n" - "Block lengths: array([1, 1]{0})".format(dtype)) - self.assertEqual(result, exp) - - with option_context("display.max_rows", 3): - # GH 10560 - result = repr(s) - exp = ("0 1.0\n ... \n4 NaN\n" - "dtype: float64\nBlockIndex\n" - "Block locations: array([0, 3]{0})\n" - "Block lengths: array([1, 1]{0})".format(dtype)) - self.assertEqual(result, exp) - class TestEngFormatter(tm.TestCase): _multiprocess_can_split_ = True From 82f54bd1dd53cb031e5d801405b34f062155d823 Mon Sep 17 00:00:00 2001 From: sinhrks Date: Fri, 13 May 2016 09:20:23 -0400 Subject: [PATCH 10/14] ENH/BUG: str.extractall doesn't support index closes #10008 Author: sinhrks Closes #13156 from sinhrks/str_extractall and squashes the following commits: ed854ef [sinhrks] ENH/BUG: str.extractall doesn't support index --- doc/source/text.rst | 13 ++++++++++- doc/source/whatsnew/v0.18.2.txt | 6 ++++++ pandas/core/strings.py | 38 +++++++++++++++++++++------------ pandas/tests/test_strings.py | 28 ++++++++++++++++++++++-- 4 files changed, 68 insertions(+), 17 deletions(-) diff --git a/doc/source/text.rst b/doc/source/text.rst index 16b16a320f75b..3822c713d7f85 100644 --- a/doc/source/text.rst +++ b/doc/source/text.rst @@ -281,7 +281,7 @@ Unlike ``extract`` (which returns only the first match), .. ipython:: python - s = pd.Series(["a1a2", "b1", "c1"], ["A", "B", "C"]) + s = pd.Series(["a1a2", "b1", "c1"], index=["A", "B", "C"]) s two_groups = '(?P<letter>[a-z])(?P<digit>[0-9])' s.str.extract(two_groups, expand=True) @@ -313,6 +313,17 @@ then ``extractall(pat).xs(0, level='match')`` gives the same result as extractall_result extractall_result.xs(0, level="match") +``Index`` also supports ``.str.extractall``. It returns a ``DataFrame`` which has the +same result as a ``Series.str.extractall`` with a default index (starts from 0). + +.. versionadded:: 0.18.2 + +.. ipython:: python + + pd.Index(["a1a2", "b1", "c1"]).str.extractall(two_groups) + + pd.Series(["a1a2", "b1", "c1"]).str.extractall(two_groups) + Testing for Strings that Match or Contain a Pattern --------------------------------------------------- diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index bae8b1358826b..b86a7a81625e2 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -31,7 +31,12 @@ Other enhancements - The ``.tz_localize()`` method of ``DatetimeIndex`` and ``Timestamp`` has gained the ``errors`` keyword, so you can potentially coerce nonexistent timestamps to ``NaT``. 
The default behaviour remains to raise a ``NonExistentTimeError`` (:issue:`13057`) +- ``Index`` now supports ``.str.extractall()`` which returns a ``DataFrame``, see :ref:`Extract all matches in each subject (extractall) <text.extractall>` (:issue:`10008`, :issue:`13156`) + .. ipython:: python + + idx = pd.Index(["a1a2", "b1", "c1"]) + idx.str.extractall("[ab](?P<digit>\d)") .. _whatsnew_0182.api: @@ -120,6 +125,7 @@ Bug Fixes +- Bug in ``Series.str.extractall()`` with ``str`` index raises ``ValueError`` (:issue:`13156`) - Bug in ``PeriodIndex`` and ``Period`` subtraction raises ``AttributeError`` (:issue:`13071`) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 524c0205d7f73..5b1b8bd05af42 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -8,6 +8,7 @@ from pandas.core.algorithms import take_1d import pandas.compat as compat from pandas.core.base import AccessorProperty, NoNewAttributesMixin +from pandas.types import api as gt from pandas.util.decorators import Appender, deprecate_kwarg import re import pandas.lib as lib @@ -148,12 +149,10 @@ def _na_map(f, arr, na_result=np.nan, dtype=object): def _map(f, arr, na_mask=False, na_value=np.nan, dtype=object): - from pandas.core.series import Series - if not len(arr): return np.ndarray(0, dtype=dtype) - if isinstance(arr, Series): + if isinstance(arr, gt.ABCSeries): arr = arr.values if not isinstance(arr, np.ndarray): arr = np.asarray(arr, dtype=object) @@ -687,33 +686,42 @@ def str_extractall(arr, pat, flags=0): C 0 NaN 1 """ - from pandas import DataFrame, MultiIndex + regex = re.compile(pat, flags=flags) # the regex must contain capture groups. if regex.groups == 0: raise ValueError("pattern contains no capture groups") + + if isinstance(arr, gt.ABCIndex): + arr = arr.to_series().reset_index(drop=True) + names = dict(zip(regex.groupindex.values(), regex.groupindex.keys())) columns = [names.get(1 + i, i) for i in range(regex.groups)] match_list = [] index_list = [] + is_mi = arr.index.nlevels > 1 + for subject_key, subject in arr.iteritems(): if isinstance(subject, compat.string_types): - try: - key_list = list(subject_key) - except TypeError: - key_list = [subject_key] + + if not is_mi: + subject_key = (subject_key, ) + for match_i, match_tuple in enumerate(regex.findall(subject)): - na_tuple = [ - np.NaN if group == "" else group for group in match_tuple] + na_tuple = [np.NaN if group == "" else group + for group in match_tuple] match_list.append(na_tuple) - result_key = tuple(key_list + [match_i]) + result_key = tuple(subject_key + (match_i, )) index_list.append(result_key) + if 0 < len(index_list): + from pandas import MultiIndex index = MultiIndex.from_tuples( index_list, names=arr.index.names + ["match"]) else: index = None - result = DataFrame(match_list, index, columns) + result = arr._constructor_expanddim(match_list, index=index, + columns=columns) return result @@ -1804,9 +1812,9 @@ class StringAccessorMixin(object): # string methods def _make_str_accessor(self): - from pandas.core.series import Series from pandas.core.index import Index - - if (isinstance(self, Series) and + + if (isinstance(self, gt.ABCSeries) and not ((is_categorical_dtype(self.dtype) and is_object_dtype(self.values.categories)) or (is_object_dtype(self.dtype)))): @@ -1819,6 +1827,8 @@ def _make_str_accessor(self): "values, which use np.object_ dtype in " "pandas") elif isinstance(self, Index): + # can't use ABCIndex to exclude non-str + # see src/inference.pyx; an Index can contain string values allowed_types = ('string', 'unicode', 'mixed', 
'mixed-integer') if self.inferred_type not in allowed_types: diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 4179949bc49a6..05525acedc245 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -982,6 +982,30 @@ def test_extractall_no_matches(self): "second"]) tm.assert_frame_equal(r, e) + def test_extractall_stringindex(self): + s = Series(["a1a2", "b1", "c1"], name='xxx') + res = s.str.extractall("[ab](?P<digit>\d)") + exp_idx = MultiIndex.from_tuples([(0, 0), (0, 1), (1, 0)], + names=[None, 'match']) + exp = DataFrame({'digit': ["1", "2", "1"]}, index=exp_idx) + tm.assert_frame_equal(res, exp) + + # an index should return the same result as a default index without a name, + # thus index.name doesn't affect the result + for idx in [Index(["a1a2", "b1", "c1"]), + Index(["a1a2", "b1", "c1"], name='xxx')]: + + res = idx.str.extractall("[ab](?P<digit>\d)") + tm.assert_frame_equal(res, exp) + + s = Series(["a1a2", "b1", "c1"], name='s_name', + index=Index(["XX", "yy", "zz"], name='idx_name')) + res = s.str.extractall("[ab](?P<digit>\d)") + exp_idx = MultiIndex.from_tuples([("XX", 0), ("XX", 1), ("yy", 0)], + names=["idx_name", 'match']) + exp = DataFrame({'digit': ["1", "2", "1"]}, index=exp_idx) + tm.assert_frame_equal(res, exp) + def test_extractall_errors(self): # Does not make sense to use extractall with a regex that has # no capture groups. (it returns DataFrame with one column for @@ -991,8 +1015,8 @@ def test_extractall_errors(self): s.str.extractall(r'[a-z]') def test_extract_index_one_two_groups(self): - s = Series( - ['a3', 'b3', 'd4c2'], ["A3", "B3", "D4"], name='series_name') + s = Series(['a3', 'b3', 'd4c2'], index=["A3", "B3", "D4"], + name='series_name') r = s.index.str.extract(r'([A-Z])', expand=True) e = DataFrame(['A', "B", "D"]) tm.assert_frame_equal(r, e) From 01dd11109a0d1def8bc3b03d06c533817cc273f2 Mon Sep 17 00:00:00 2001 From: Sanjiv Lobo Date: Fri, 13 May 2016 19:12:43 -0400 Subject: [PATCH 11/14] DOC: Fix additional join examples in "10 Minutes to pandas" #13029 - [x] closes #13029 Author: Sanjiv Lobo Closes #13171 from Xndr7/fix-additional-join-examples-in-"10-Minutes-to-pandas"-#13029 and squashes the following commits: 633c7ff [Sanjiv Lobo] fixed docs for issue #13029 --- doc/source/10min.rst | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/doc/source/10min.rst b/doc/source/10min.rst index d51290b2a983b..54bcd76855f32 100644 --- a/doc/source/10min.rst +++ b/doc/source/10min.rst @@ -483,6 +483,17 @@ SQL style merges. See the :ref:`Database style joining <merging.join>` right pd.merge(left, right, on='key') +Here is another example: + +.. ipython:: python + + left = pd.DataFrame({'key': ['foo', 'bar'], 'lval': [1, 2]}) + right = pd.DataFrame({'key': ['foo', 'bar'], 'rval': [4, 5]}) + left + right + pd.merge(left, right, on='key') + + Append ~~~~~~ From feee089e41cc2dd5ff88e1068a5ca5595b6ff2f6 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 14 May 2016 08:00:41 -0400 Subject: [PATCH 12/14] BUG: Bug in .groupby(..).resample(..) when the same object is called multiple times closes #13174 Author: Jeff Reback Closes #13175 from jreback/resample and squashes the following commits: 56b405e [Jeff Reback] BUG: Bug in .groupby(..).resample(..) 
when the same object is called multiple times --- doc/source/whatsnew/v0.18.2.txt | 2 +- pandas/tests/test_window.py | 14 ++++++++++++++ pandas/tseries/resample.py | 3 ++- pandas/tseries/tests/test_resample.py | 19 +++++++++++++++++++ 4 files changed, 36 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index b86a7a81625e2..e92cb8cef4432 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -118,7 +118,7 @@ Bug Fixes - Bug when passing a not-default-indexed ``Series`` as ``xerr`` or ``yerr`` in ``.plot()`` (:issue:`11858`) - +- Bug in ``.groupby(..).resample(..)`` when the same object is called multiple times (:issue:`13174`) - Regression in ``Series.quantile`` with nans (also shows up in ``.median()`` and ``.describe()``); furthermore now names the ``Series`` with the quantile (:issue:`13098`, :issue:`13146`) diff --git a/pandas/tests/test_window.py b/pandas/tests/test_window.py index 22ac583a3b808..a043e92bd2c76 100644 --- a/pandas/tests/test_window.py +++ b/pandas/tests/test_window.py @@ -2835,6 +2835,20 @@ def test_getitem(self): result = self.frame.B.groupby(self.frame.A).rolling(2).mean() assert_series_equal(result, expected) + def test_getitem_multiple(self): + + # GH 13174 + g = self.frame.groupby('A') + r = g.rolling(2) + g_mutated = self.frame.groupby('A', mutated=True) + expected = g_mutated.B.apply(lambda x: x.rolling(2).count()) + + result = r.B.count() + assert_series_equal(result, expected) + + result = r.B.count() + assert_series_equal(result, expected) + def test_rolling(self): g = self.frame.groupby('A') r = g.rolling(window=4) diff --git a/pandas/tseries/resample.py b/pandas/tseries/resample.py index a0f08a93a07d9..bb7915e978c3e 100644 --- a/pandas/tseries/resample.py +++ b/pandas/tseries/resample.py @@ -1,6 +1,7 @@ from datetime import timedelta import numpy as np import warnings +import copy import pandas as pd from pandas.core.base import AbstractMethodError, GroupByMixin @@ -592,7 +593,7 @@ def __init__(self, obj, *args, **kwargs): self._groupby = groupby self._groupby.mutated = True self._groupby.grouper.mutated = True - self.groupby = parent.groupby + self.groupby = copy.copy(parent.groupby) def _apply(self, f, **kwargs): """ diff --git a/pandas/tseries/tests/test_resample.py b/pandas/tseries/tests/test_resample.py index 77396c3e38c93..5dd2368db2cb8 100644 --- a/pandas/tseries/tests/test_resample.py +++ b/pandas/tseries/tests/test_resample.py @@ -2519,6 +2519,25 @@ def test_getitem(self): result = g.resample('2s').mean().B assert_series_equal(result, expected) + def test_getitem_multiple(self): + + # GH 13174 + # multiple calls after selection causing an issue with aliasing + data = [{'id': 1, 'buyer': 'A'}, {'id': 2, 'buyer': 'B'}] + df = pd.DataFrame(data, index=pd.date_range('2016-01-01', periods=2)) + r = df.groupby('id').resample('1D') + result = r['buyer'].count() + expected = pd.Series([1, 1], + index=pd.MultiIndex.from_tuples( + [(1, pd.Timestamp('2016-01-01')), + (2, pd.Timestamp('2016-01-02'))], + names=['id', None]), + name='buyer') + assert_series_equal(result, expected) + + result = r['buyer'].count() + assert_series_equal(result, expected) + def test_methods(self): g = self.frame.groupby('A') r = g.resample('2s') From b38579999f7385cf3be59d6be7f3bb40990d12d1 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Sat, 14 May 2016 08:01:54 -0400 Subject: [PATCH 13/14] DOC: Clarify Categorical Crosstab Behaviour Follow-on to #13073 by explaining the `Categorical` behaviour 
in the documentation. Author: gfyoung Closes #13177 from gfyoung/crosstab-categorical-explain and squashes the following commits: 11ebb94 [gfyoung] DOC: Clarify Categorical Crosstab Behaviour --- doc/source/reshaping.rst | 10 ++++++++++ pandas/tools/pivot.py | 16 +++++++++++++++- 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/doc/source/reshaping.rst b/doc/source/reshaping.rst index 21765b3f621ce..9ed2c42610b69 100644 --- a/doc/source/reshaping.rst +++ b/doc/source/reshaping.rst @@ -445,6 +445,16 @@ If ``crosstab`` receives only two Series, it will provide a frequency table. pd.crosstab(df.A, df.B) +Any input passed containing ``Categorical`` data will have **all** of its +categories included in the cross-tabulation, even if the actual data does +not contain any instances of a particular category. + +.. ipython:: python + + foo = pd.Categorical(['a', 'b'], categories=['a', 'b', 'c']) + bar = pd.Categorical(['d', 'e'], categories=['d', 'e', 'f']) + pd.crosstab(foo, bar) + Normalization ~~~~~~~~~~~~~ diff --git a/pandas/tools/pivot.py b/pandas/tools/pivot.py index de79e54e22270..a4e6cc404a457 100644 --- a/pandas/tools/pivot.py +++ b/pandas/tools/pivot.py @@ -410,7 +410,11 @@ def crosstab(index, columns, values=None, rownames=None, colnames=None, Notes ----- Any Series passed will have their name attributes used unless row or column - names for the cross-tabulation are specified + names for the cross-tabulation are specified. + + Any input passed containing Categorical data will have **all** of its + categories included in the cross-tabulation, even if the actual data does + not contain any instances of a particular category. In the event that there aren't overlapping indexes an empty DataFrame will be returned. @@ -434,6 +438,16 @@ def crosstab(index, columns, values=None, rownames=None, colnames=None, bar 1 2 1 0 foo 2 2 1 2 + >>> foo = pd.Categorical(['a', 'b'], categories=['a', 'b', 'c']) + >>> bar = pd.Categorical(['d', 'e'], categories=['d', 'e', 'f']) + >>> crosstab(foo, bar) # 'c' and 'f' are not represented in the data, + # but they still will be counted in the output + col_0 d e f + row_0 + a 1 0 0 + b 0 1 0 + c 0 0 0 + Returns ------- crosstab : DataFrame From 2de2884a7e7abf64f9967f6d8bc05a2d45f59bb4 Mon Sep 17 00:00:00 2001 From: Ka Wo Chen Date: Sat, 14 May 2016 08:02:48 -0400 Subject: [PATCH 14/14] BUG: GH12896 where extra elements are returned in MultiIndex slicing closes #12896 Author: Ka Wo Chen Closes #13117 from kawochen/BUG-FIX-12896 and squashes the following commits: 7d49346 [Ka Wo Chen] BUG: GH12896 where extra elements are returned in MultiIndex slicing --- doc/source/whatsnew/v0.18.2.txt | 2 +- pandas/indexes/multi.py | 3 ++- pandas/tests/indexing/test_indexing.py | 12 ++++++++++++ 3 files changed, 15 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index e92cb8cef4432..3ac466158276f 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -134,7 +134,7 @@ Bug Fixes - +- Bug in ``MultiIndex`` slicing where extra elements were returned when level is non-unique (:issue:`12896`) diff --git a/pandas/indexes/multi.py b/pandas/indexes/multi.py index db2f80ae78446..6f3360cdf82a7 100644 --- a/pandas/indexes/multi.py +++ b/pandas/indexes/multi.py @@ -1761,7 +1761,8 @@ def convert_indexer(start, stop, step, indexer=indexer, labels=labels): else: m = np.zeros(len(labels), dtype=bool) - m[np.in1d(labels, r, assume_unique=True)] = True + m[np.in1d(labels, r, + 
assume_unique=Index(labels).is_unique)] = True return m diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index 4b8b5ae2571d0..fdc9d3599e8ac 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -2334,6 +2334,18 @@ def test_multiindex_slicers_non_unique(self): self.assertFalse(result.index.is_unique) assert_frame_equal(result, expected) + # GH12896 + # numpy-implementation dependent bug + ints = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 12, 13, 14, 14, 16, + 17, 18, 19, 200000, 200000] + n = len(ints) + idx = MultiIndex.from_arrays([['a'] * n, ints]) + result = Series([1] * n, index=idx) + result = result.sort_index() + result = result.loc[(slice(None), slice(100000))] + expected = Series([1] * (n - 2), index=idx[:-2]).sort_index() + assert_series_equal(result, expected) + def test_multiindex_slicers_datetimelike(self): # GH 7429