Skip to content

Commit

Permalink
Groupby tuples
Browse files Browse the repository at this point in the history
xref pandas-dev#17996

Author: Pietro Battiston <me@pietrobattiston.it>

Closes pandas-dev#18249 from toobaz/groupby_tuples and squashes the following commits:

dafc838 [Pietro Battiston] DOC: Clarification of groupby(by=) argument
e0bdfa7 [Pietro Battiston] TST: Test for tuples in columns, fixes to previous tests
74f91e0 [Pietro Battiston] TST: Fix tests which used tuples to pass multiple keys
201a4fe [Pietro Battiston] BUG: Never interpret a tuple as a list of keys
  • Loading branch information
toobaz authored and No-Stream committed Nov 28, 2017
1 parent b678d80 commit c4d438f
Show file tree
Hide file tree
Showing 7 changed files with 19 additions and 15 deletions.
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v0.22.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ Indexing
^^^^^^^^

- Bug in :func:`Series.truncate` which raises ``TypeError`` with a monotonic ``PeriodIndex`` (:issue:`17717`)
- Bug in :func:`DataFrame.groupby` where key as tuple in a ``MultiIndex`` were interpreted as a list of keys (:issue:`17979`)
- Bug in :func:`DataFrame.groupby` where tuples were interpreted as lists of keys rather than as keys (:issue:`17979`, :issue:`18249`)
-
-

Expand Down
7 changes: 4 additions & 3 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -5092,14 +5092,15 @@ def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True,
Parameters
----------
by : mapping, function, str, or iterable
by : mapping, function, label, or list of labels
Used to determine the groups for the groupby.
If ``by`` is a function, it's called on each value of the object's
index. If a dict or Series is passed, the Series or dict VALUES
will be used to determine the groups (the Series' values are first
aligned; see ``.align()`` method). If an ndarray is passed, the
values are used as-is determine the groups. A str or list of strs
may be passed to group by the columns in ``self``
values are used as-is to determine the groups. A label or list of
labels may be passed to group by the columns in ``self``. Notice
that a tuple is interpreted as a (single) key.
axis : int, default 0
level : int, level name, or sequence of such, default None
If the axis is a MultiIndex (hierarchical), group by a particular
Expand Down
6 changes: 2 additions & 4 deletions pandas/core/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -2756,7 +2756,6 @@ def _get_grouper(obj, key=None, axis=0, level=None, sort=True,
"""
group_axis = obj._get_axis(axis)
is_axis_multiindex = isinstance(obj._info_axis, MultiIndex)

# validate that the passed single level is compatible with the passed
# axis of the object
Expand Down Expand Up @@ -2817,9 +2816,8 @@ def _get_grouper(obj, key=None, axis=0, level=None, sort=True,
elif isinstance(key, BaseGrouper):
return key, [], obj

# when MultiIndex, allow tuple to be a key
if not isinstance(key, (tuple, list)) or \
(isinstance(key, tuple) and is_axis_multiindex):
# Everything which is not a list is a key (including tuples):
if not isinstance(key, list):
keys = [key]
match_axis_length = False
else:
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/groupby/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,7 +264,7 @@ def test_len(self):
df = pd.DataFrame(dict(a=[np.nan] * 3, b=[1, 2, 3]))
assert len(df.groupby(('a'))) == 0
assert len(df.groupby(('b'))) == 3
assert len(df.groupby(('a', 'b'))) == 3
assert len(df.groupby(['a', 'b'])) == 3

def test_basic_regression(self):
# regression
Expand Down
13 changes: 9 additions & 4 deletions pandas/tests/groupby/test_grouping.py
Original file line number Diff line number Diff line change
Expand Up @@ -366,13 +366,18 @@ def test_groupby_multiindex_tuple(self):
result = df.groupby(('b', 1)).groups
tm.assert_dict_equal(expected, result)

df2 = pd.DataFrame([[1, 2, 3, 4], [3, 4, 5, 6], [1, 4, 2, 3]],
df2 = pd.DataFrame(df.values,
columns=pd.MultiIndex.from_arrays(
[['a', 'b', 'b', 'c'],
['d', 'd', 'e', 'e']]))
df2.groupby([('b', 'd')]).groups
expected = df.groupby([('b', 'd')]).groups
result = df.groupby(('b', 'd')).groups
expected = df2.groupby([('b', 'd')]).groups
result = df.groupby(('b', 1)).groups
tm.assert_dict_equal(expected, result)

df3 = pd.DataFrame(df.values,
columns=[('a', 'd'), ('b', 'd'), ('b', 'e'), 'c'])
expected = df3.groupby([('b', 'd')]).groups
result = df.groupby(('b', 1)).groups
tm.assert_dict_equal(expected, result)

@pytest.mark.parametrize('sort', [True, False])
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/groupby/test_nth.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,7 +202,7 @@ def test_nth(self):
freq='B')
df = DataFrame(1, index=business_dates, columns=['a', 'b'])
# get the first, fourth and last two business days for each month
key = (df.index.year, df.index.month)
key = [df.index.year, df.index.month]
result = df.groupby(key, as_index=False).nth([0, 3, -2, -1])
expected_dates = pd.to_datetime(
['2014/4/1', '2014/4/4', '2014/4/29', '2014/4/30', '2014/5/1',
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/groupby/test_value_counts.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ def seed_df(seed_nans, n, m):

df = seed_df(seed_nans, n, m)
bins = None, np.arange(0, max(5, df['3rd'].max()) + 1, 2)
keys = '1st', '2nd', ('1st', '2nd')
keys = '1st', '2nd', ['1st', '2nd']
for k, b in product(keys, bins):
binned.append((df, k, b, n, m))
ids.append("{}-{}-{}".format(k, n, m))
Expand Down

0 comments on commit c4d438f

Please sign in to comment.