Skip to content

Commit

Permalink
BUG:Time Grouper bug fix when applied for list groupers (pandas-dev#1…
Browse files Browse the repository at this point in the history
  • Loading branch information
ruiann authored and No-Stream committed Nov 28, 2017
1 parent e76612f commit 01d1820
Show file tree
Hide file tree
Showing 4 changed files with 116 additions and 50 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.21.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -704,6 +704,7 @@ Groupby/Resample/Rolling
- Bug in ``DataFrame.groupby`` where index and column keys were not recognized correctly when the number of keys equaled the number of elements on the groupby axis (:issue:`16859`)
- Bug in ``groupby.nunique()`` with ``TimeGrouper`` which cannot handle ``NaT`` correctly (:issue:`17575`)
- Bug in ``DataFrame.groupby`` where a single level selection from a ``MultiIndex`` unexpectedly sorts (:issue:`17537`)
- Bug in ``TimeGrouper`` giving different results when passed as a list and as a scalar (:issue:`17530`)

Sparse
^^^^^^
Expand Down
119 changes: 94 additions & 25 deletions pandas/core/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -256,11 +256,13 @@ def __init__(self, key=None, level=None, freq=None, axis=0, sort=False):
def ax(self):
return self.grouper

def _get_grouper(self, obj):
def _get_grouper(self, obj, validate=True):
"""
Parameters
----------
obj : the subject object
validate : boolean, default True
if True, validate the grouper
Returns
-------
Expand All @@ -271,7 +273,8 @@ def _get_grouper(self, obj):
self.grouper, exclusions, self.obj = _get_grouper(self.obj, [self.key],
axis=self.axis,
level=self.level,
sort=self.sort)
sort=self.sort,
validate=validate)
return self.binner, self.grouper, self.obj

def _set_grouper(self, obj, sort=False):
Expand Down Expand Up @@ -326,12 +329,6 @@ def _set_grouper(self, obj, sort=False):
self.grouper = ax
return self.grouper

def _get_binner_for_grouping(self, obj):
""" default to the standard binner here """
group_axis = obj._get_axis(self.axis)
return Grouping(group_axis, None, obj=obj, name=self.key,
level=self.level, sort=self.sort, in_axis=False)

@property
def groups(self):
return self.grouper.groups
Expand Down Expand Up @@ -1733,16 +1730,34 @@ class BaseGrouper(object):
"""
This is an internal Grouper class, which actually holds
the generated groups
Parameters
----------
axis : int
the axis to group
groupings : array of grouping
all the grouping instances to handle in this grouper
for example for grouper list to groupby, need to pass the list
sort : boolean, default True
whether this grouper will give sorted result or not
group_keys : boolean, default True
mutated : boolean, default False
indexer : intp array, optional
the indexer created by Grouper
some groupers (TimeGrouper) will sort their axis, and their
group_info is sorted accordingly, so the indexer is needed to reorder
the result back to the original axis order
"""

def __init__(self, axis, groupings, sort=True, group_keys=True,
mutated=False):
mutated=False, indexer=None):
self._filter_empty_groups = self.compressed = len(groupings) != 1
self.axis = axis
self.groupings = groupings
self.sort = sort
self.group_keys = group_keys
self.mutated = mutated
self.indexer = indexer

@property
def shape(self):
Expand Down Expand Up @@ -1888,6 +1903,15 @@ def group_info(self):
comp_ids = _ensure_int64(comp_ids)
return comp_ids, obs_group_ids, ngroups

@cache_readonly
def label_info(self):
    # Labels of the items as they appear in the original grouped axis;
    # when self.indexer is set (the grouper sorted its axis), undo that
    # sort so labels line up with the original item order.
    labels = self.group_info[0]
    if self.indexer is None:
        return labels
    order = np.lexsort((labels, self.indexer))
    return labels[order]

def _get_compressed_labels(self):
all_labels = [ping.labels for ping in self.groupings]
if len(all_labels) > 1:
Expand Down Expand Up @@ -2288,11 +2312,42 @@ def generate_bins_generic(values, binner, closed):

class BinGrouper(BaseGrouper):

def __init__(self, bins, binlabels, filter_empty=False, mutated=False):
"""
This is an internal Grouper class
Parameters
----------
bins : the split index of binlabels to group the item of axis
binlabels : the label list
filter_empty : boolean, default False
mutated : boolean, default False
indexer : an intp array, optional
Examples
--------
bins: [2, 4, 6, 8, 10]
binlabels: DatetimeIndex(['2005-01-01', '2005-01-03',
'2005-01-05', '2005-01-07', '2005-01-09'],
dtype='datetime64[ns]', freq='2D')
the group_info, which contains the label of each item in grouped
axis, the index of label in label list, group number, is
(array([0, 0, 1, 1, 2, 2, 3, 3, 4, 4]), array([0, 1, 2, 3, 4]), 5)
means that, the grouped axis has 10 items, can be grouped into 5
labels, the first and second items belong to the first label, the
third and fourth items belong to the second label, and so on
"""

def __init__(self, bins, binlabels, filter_empty=False, mutated=False,
             indexer=None):
    # bins: split positions into the grouped axis, one per bin
    self.bins = _ensure_int64(bins)
    # binlabels: the label for each bin (e.g. a DatetimeIndex)
    self.binlabels = _ensure_index(binlabels)
    self._filter_empty_groups = filter_empty
    self.mutated = mutated
    # indexer reorders group_info back to the original axis order when
    # the grouper (e.g. TimeGrouper) sorted its axis
    self.indexer = indexer

@cache_readonly
def groups(self):
Expand Down Expand Up @@ -2460,6 +2515,19 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None,
self.grouper, self._labels, self._group_index = \
index._get_grouper_for_level(self.grouper, level)

# a passed Grouper like, directly get the grouper in the same way
# as single grouper groupby, use the group_info to get labels
elif isinstance(self.grouper, Grouper):
# get the new grouper; we already have disambiguated
# what key/level refer to exactly, don't need to
# check again as we have by this point converted these
# to an actual value (rather than a pd.Grouper)
_, grouper, _ = self.grouper._get_grouper(self.obj, validate=False)
if self.name is None:
self.name = grouper.result_index.name
self.obj = self.grouper.obj
self.grouper = grouper

else:
if self.grouper is None and self.name is not None:
self.grouper = self.obj[self.name]
Expand All @@ -2482,16 +2550,6 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None,
categories=c,
ordered=self.grouper.ordered))

# a passed Grouper like
elif isinstance(self.grouper, Grouper):

# get the new grouper
grouper = self.grouper._get_binner_for_grouping(self.obj)
self.obj = self.grouper.obj
self.grouper = grouper
if self.name is None:
self.name = grouper.name

# we are done
if isinstance(self.grouper, Grouping):
self.grouper = self.grouper.grouper
Expand Down Expand Up @@ -2536,6 +2594,10 @@ def ngroups(self):

@cache_readonly
def indices(self):
    # we have a list of groupers: the grouper is already an internal
    # BaseGrouper (e.g. built from a pd.Grouper passed inside a list),
    # so delegate to its indices rather than re-factorizing
    if isinstance(self.grouper, BaseGrouper):
        return self.grouper.indices

    values = _ensure_categorical(self.grouper)
    return values._reverse_indexer()

Expand All @@ -2553,9 +2615,14 @@ def group_index(self):

def _make_labels(self):
    # Lazily compute and cache the integer labels and the unique group
    # index for this grouping.
    if self._labels is None or self._group_index is None:
        # we have a list of groupers: labels and uniques were already
        # computed by the inner BaseGrouper, so reuse them
        if isinstance(self.grouper, BaseGrouper):
            labels = self.grouper.label_info
            uniques = self.grouper.result_index
        else:
            labels, uniques = algorithms.factorize(
                self.grouper, sort=self.sort)
            uniques = Index(uniques, name=self.name)
        self._labels = labels
        self._group_index = uniques

Expand All @@ -2566,7 +2633,7 @@ def groups(self):


def _get_grouper(obj, key=None, axis=0, level=None, sort=True,
mutated=False):
mutated=False, validate=True):
"""
create and return a BaseGrouper, which is an internal
mapping of how to create the grouper indexers.
Expand All @@ -2583,6 +2650,8 @@ def _get_grouper(obj, key=None, axis=0, level=None, sort=True,
are and then creates a Grouping for each one, combined into
a BaseGrouper.
If validate, then check for key/level overlaps
"""
group_axis = obj._get_axis(axis)

Expand Down Expand Up @@ -2707,7 +2776,7 @@ def is_in_obj(gpr):

elif is_in_axis(gpr): # df.groupby('name')
if gpr in obj:
if gpr in obj.index.names:
if validate and gpr in obj.index.names:
warnings.warn(
("'%s' is both a column name and an index level.\n"
"Defaulting to column but "
Expand Down
27 changes: 2 additions & 25 deletions pandas/core/resample.py
Original file line number Diff line number Diff line change
Expand Up @@ -250,7 +250,7 @@ def _get_binner(self):
"""

binner, bins, binlabels = self._get_binner_for_time()
bin_grouper = BinGrouper(bins, binlabels)
bin_grouper = BinGrouper(bins, binlabels, indexer=self.groupby.indexer)
return binner, bin_grouper

def _assure_grouper(self):
Expand Down Expand Up @@ -1105,35 +1105,12 @@ def _get_resampler(self, obj, kind=None):
"TimedeltaIndex or PeriodIndex, "
"but got an instance of %r" % type(ax).__name__)

def _get_grouper(self, obj, validate=True):
    # create the resampler and return our binner
    # NOTE: validate is accepted for interface compatibility with
    # Grouper._get_grouper; it is not used by the resampling path
    r = self._get_resampler(obj)
    r._set_binner()
    return r.binner, r.grouper, r.obj

def _get_binner_for_grouping(self, obj):
# return an ordering of the transformed group labels,
# suitable for multi-grouping, e.g the labels for
# the resampled intervals
binner, grouper, obj = self._get_grouper(obj)

l = []
for key, group in grouper.get_iterator(self.ax):
l.extend([key] * len(group))

if isinstance(self.ax, PeriodIndex):
grouper = binner.__class__(l, freq=binner.freq, name=binner.name)
else:
# resampling causes duplicated values, specifying freq is invalid
grouper = binner.__class__(l, name=binner.name)

# since we may have had to sort
# may need to reorder groups here
if self.indexer is not None:
indexer = self.indexer.argsort(kind='quicksort')
grouper = grouper.take(indexer)
return grouper

def _get_time_bins(self, ax):
if not isinstance(ax, DatetimeIndex):
raise TypeError('axis must be a DatetimeIndex, but got '
Expand Down
19 changes: 19 additions & 0 deletions pandas/tests/groupby/test_timegrouper.py
Original file line number Diff line number Diff line change
Expand Up @@ -623,3 +623,22 @@ def test_nunique_with_timegrouper_and_nat(self):
result = test.groupby(grouper)['data'].nunique()
expected = test[test.time.notnull()].groupby(grouper)['data'].nunique()
tm.assert_series_equal(result, expected)

def test_scalar_call_versus_list_call(self):
# Issue: 17530
data_frame = {
'location': ['shanghai', 'beijing', 'shanghai'],
'time': pd.Series(['2017-08-09 13:32:23', '2017-08-11 23:23:15',
'2017-08-11 22:23:15'],
dtype='datetime64[ns]'),
'value': [1, 2, 3]
}
data_frame = pd.DataFrame(data_frame).set_index('time')
grouper = pd.Grouper(freq='D')

grouped = data_frame.groupby(grouper)
result = grouped.count()
grouped = data_frame.groupby([grouper])
expected = grouped.count()

assert_frame_equal(result, expected)

0 comments on commit 01d1820

Please sign in to comment.