Skip to content

Commit

Permalink
Merge pull request #326 from modilabs/latest
Browse files Browse the repository at this point in the history
Latest
  • Loading branch information
Peter Lubell-Doughtie committed Dec 22, 2012
2 parents 42c932a + a658fc4 commit 6cb8267
Show file tree
Hide file tree
Showing 6 changed files with 271 additions and 190 deletions.
70 changes: 64 additions & 6 deletions bamboo/core/aggregations.py
@@ -1,5 +1,7 @@
from pandas import concat, DataFrame, Series from pandas import concat, DataFrame, Series


from bamboo.lib.utils import parse_float



class Aggregation(object): class Aggregation(object):
"""Abstract class for all aggregations. """Abstract class for all aggregations.
Expand All @@ -25,15 +27,19 @@ def eval(self, columns):


def group(self): def group(self):
"""For when aggregation is called with a group parameter.""" """For when aggregation is called with a group parameter."""
groupby = self.dframe[self.groups].join( return self._groupby().agg(self.formula_name)
self.column).groupby(self.groups, as_index=False)

return groupby.agg(self.formula_name)


def agg(self): def agg(self):
"""For when aggregation is called without a group parameter.""" """For when aggregation is called without a group parameter."""
result = float(self.column.__getattribute__(self.formula_name)()) result = float(self.column.__getattribute__(self.formula_name)())
return DataFrame({self.name: Series([result])}) return self._value_to_dframe(result)

def _value_to_dframe(self, value):
return DataFrame({self.name: Series([value])})

def _groupby(self):
return self.dframe[self.groups].join(
self.column).groupby(self.groups, as_index=False)




class MultiColumnAggregation(Aggregation): class MultiColumnAggregation(Aggregation):
Expand Down Expand Up @@ -95,10 +101,62 @@ class MaxAggregation(Aggregation):
formula_name = 'max' formula_name = 'max'




class ArgMaxAggregation(Aggregation):
    """Return the index for the maximum of a column.

    Written as ``argmax(FORMULA)``. Where `FORMULA` is a valid formula.
    """

    formula_name = 'argmax'

    def group(self):
        """Compute, per group, the index of the row holding the group maximum.

        Returns a DataFrame with the argmax column (named ``self.name``)
        joined to the group columns.
        """
        # Values that cannot be converted to float are passed through
        # unchanged, because the value itself is given as the default.
        as_floats = self.column.apply(
            lambda value: parse_float(value, value))

        # Lookup table from column value to original row index.
        # NOTE(review): relies on the column being named ``self.name`` and on
        # ``reset_index`` producing a column called 'index' -- confirm.
        index_by_value = as_floats.reset_index().set_index(self.name)

        group_maxima = self._groupby().max()

        def _largest_index(row):
            # When several rows share the maximum value, keep the largest
            # index among them.
            matches = index_by_value.get_value(row[self.name], 'index')
            return matches.max()

        argmax_column = group_maxima.apply(_largest_index, axis=1)
        argmax_column.name = self.name

        return DataFrame(argmax_column).join(group_maxima[self.groups])


class NewestAggregation(MultiColumnAggregation):
    """For the newest index column get the value column.

    Picks the entry of the value column at the position where the index
    column reaches its maximum.
    """

    formula_name = 'newest'

    # Positions of the two operands within ``self.columns``.
    index_column = 0
    value_column = 1

    def agg(self):
        """Return the value at the argmax of the index column, as a DataFrame."""
        ordering = self.columns[self.index_column]
        values = self.columns[self.value_column]

        return self._value_to_dframe(values.ix[ordering.argmax()])

    def group(self):
        """Return, per group, the value at that group's argmax index.

        Delegates the per-group argmax to ``ArgMaxAggregation``, then swaps
        the argmax column for the matching entries of the value column.
        """
        grouped_argmax = ArgMaxAggregation(
            self.name, self.groups, self.dframe).eval(self.columns)
        # Remove the argmax column; what remains is joined with the values.
        newest_positions = grouped_argmax.pop(self.name)

        newest_values = self.columns[self.value_column][newest_positions]
        # Re-label so the selected values line up with the grouped rows.
        newest_values.index = grouped_argmax.index

        return grouped_argmax.join(newest_values)



class MeanAggregation(MultiColumnAggregation): class MeanAggregation(MultiColumnAggregation):
"""Calculate the arithmetic mean. """Calculate the arithmetic mean.
Written as ``mean(FORMULA)``. Where `FORMULA` is a valid formula. Written as ``mean(FORMULA)``. Where `FORMULA` is a valid formula.
Because mean is irreducible this inherits from `MultiColumnAggregation` to
use its reduce generic implementation.
""" """


formula_name = 'mean' formula_name = 'mean'
Expand Down Expand Up @@ -214,7 +272,7 @@ def agg(self):
else: else:
result = len(self.dframe) result = len(self.dframe)


return DataFrame({self.name: Series([result])}) return self._value_to_dframe(result)




# dict of formula names to aggregation classes # dict of formula names to aggregation classes
Expand Down
10 changes: 9 additions & 1 deletion bamboo/lib/utils.py
Expand Up @@ -8,8 +8,16 @@




def parse_int(value, default):
    """Return ``value`` converted to ``int``, or ``default`` if it cannot be."""
    return _parse_type(int, value, default)


def parse_float(value, default):
    """Return ``value`` converted to ``float``, or ``default`` if it cannot be."""
    return _parse_type(float, value, default)


def _parse_type(_type, value, default):
    """Convert ``value`` with ``_type``, falling back to ``default``.

    :param _type: callable performing the conversion (e.g. ``int``, ``float``).
    :param value: the value to convert.
    :param default: returned when the conversion is not possible.
    """
    try:
        return _type(value)
    except (ValueError, TypeError):
        # ValueError covers malformed strings; TypeError covers inputs such
        # as ``None`` or arbitrary objects, which should also yield the
        # default rather than propagate.
        return default


Expand Down
4 changes: 4 additions & 0 deletions bamboo/tests/core/test_aggregations.py
Expand Up @@ -23,6 +23,8 @@ class TestAggregations(TestCalculator):
'ratio(risk_factor in ["low_risk"], 1)': 18.0 / 19, 'ratio(risk_factor in ["low_risk"], 1)': 18.0 / 19,
'count()': 19.0, 'count()': 19.0,
'count(risk_factor in ["low_risk"])': 18.0, 'count(risk_factor in ["low_risk"])': 18.0,
'argmax(submit_date)': 18.0,
'newest(submit_date, amount)': 28.0,
} }


GROUP_TO_RESULTS = { GROUP_TO_RESULTS = {
Expand Down Expand Up @@ -51,6 +53,8 @@ def setUp(self):
'ratio(risk_factor in ["low_risk"], 1)', 'ratio(risk_factor in ["low_risk"], 1)',
'count(risk_factor in ["low_risk"])', 'count(risk_factor in ["low_risk"])',
'count()', 'count()',
'argmax(submit_date)',
'newest(submit_date, amount)',
] ]
self.expected_length = defaultdict(int) self.expected_length = defaultdict(int)
self.groups_list = None self.groups_list = None
Expand Down
3 changes: 1 addition & 2 deletions bamboo/tests/core/test_frame.py
Expand Up @@ -13,8 +13,7 @@ class TestFrame(TestBase):


def setUp(self):
    """Run the base set-up, then expose the 'good_eats.csv' test data both
    as ``self.dframe`` and wrapped in a ``BambooFrame`` as ``self.bframe``.
    """
    TestBase.setUp(self)

    good_eats = self.test_data['good_eats.csv']
    self.dframe = good_eats
    self.bframe = BambooFrame(self.dframe)


def _add_bamboo_reserved_keys(self, value=1): def _add_bamboo_reserved_keys(self, value=1):
Expand Down

0 comments on commit 6cb8267

Please sign in to comment.