Merge pull request #326 from modilabs/latest

Latest
SEL-Columbia · Dec 22, 2012 · 6cb8267 · 6cb8267
2 parents 42c932a + a658fc4
commit 6cb8267
Show file tree

Hide file tree

Showing 6 changed files with 271 additions and 190 deletions.
diff --git a/bamboo/core/aggregations.py b/bamboo/core/aggregations.py
@@ -1,5 +1,7 @@
 from pandas import concat, DataFrame, Series
 
+from bamboo.lib.utils import parse_float
+
 
 class Aggregation(object):
     """Abstract class for all aggregations.
@@ -25,15 +27,19 @@ def eval(self, columns):
 
     def group(self):
         """For when aggregation is called with a group parameter."""
-        groupby = self.dframe[self.groups].join(
+        return self._groupby().agg(self.formula_name)
-            self.column).groupby(self.groups, as_index=False)
-
-        return groupby.agg(self.formula_name)
 
     def agg(self):
         """For when aggregation is called without a group parameter."""
         result = float(self.column.__getattribute__(self.formula_name)())
-        return DataFrame({self.name: Series([result])})
+        return self._value_to_dframe(result)
+
+    def _value_to_dframe(self, value):
+        return DataFrame({self.name: Series([value])})
+
+    def _groupby(self):
+        return self.dframe[self.groups].join(
+            self.column).groupby(self.groups, as_index=False)
 
 
 class MultiColumnAggregation(Aggregation):
@@ -95,10 +101,62 @@ class MaxAggregation(Aggregation):
     formula_name = 'max'
 
 
+class ArgMaxAggregation(Aggregation):
+    """Return the index for the maximum of a column.
+
+    Written as ``argmax(FORMULA)``. Where `FORMULA` is a valid formula.
+    """
+
+    formula_name = 'argmax'
+
+    def group(self):
+        """For when aggregation is called with a group parameter."""
+        indices = self.column.apply(lambda value: parse_float(value, value)
+                                    ).reset_index().set_index(self.name)
+
+        def max_index_for_row(row):
+            return indices.get_value(row[self.name], 'index').max()
+
+        groupby_max = self._groupby().max()
+        column = groupby_max.apply(max_index_for_row, axis=1)
+        column.name = self.name
+
+        return DataFrame(column).join(groupby_max[self.groups])
+
+
+class NewestAggregation(MultiColumnAggregation):
+    """Get the value column entry where the index column is newest."""
+
+    formula_name = 'newest'
+
+    index_column = 0
+    value_column = 1
+
+    def agg(self):
+        idx = self.columns[self.index_column].argmax()
+        result = self.columns[self.value_column].ix[idx]
+
+        return self._value_to_dframe(result)
+
+    def group(self):
+        argmax_agg = ArgMaxAggregation(self.name, self.groups, self.dframe)
+        argmax_df = argmax_agg.eval(self.columns)
+        indices = argmax_df.pop(self.name)
+
+        newest_col = self.columns[self.value_column][indices]
+        newest_col.index = argmax_df.index
+
+        return argmax_df.join(newest_col)
+
+
+
 class MeanAggregation(MultiColumnAggregation):
     """Calculate the arithmetic mean.
 
     Written as ``mean(FORMULA)``. Where `FORMULA` is a valid formula.
+
+    Because mean is irreducible, this inherits from `MultiColumnAggregation`
+    to use its generic reduce implementation.
     """
 
     formula_name = 'mean'
@@ -214,7 +272,7 @@ def agg(self):
         else:
             result = len(self.dframe)
 
-        return DataFrame({self.name: Series([result])})
+        return self._value_to_dframe(result)
 
 
 # dict of formula names to aggregation classes

diff --git a/bamboo/lib/utils.py b/bamboo/lib/utils.py
@@ -8,8 +8,16 @@
 
 
 def parse_int(value, default):
+    return _parse_type(int, value, default)
+
+
+def parse_float(value, default):
+    return _parse_type(float, value, default)
+
+
+def _parse_type(_type, value, default):
     try:
-        return int(value)
+        return _type(value)
     except ValueError:
         return default
 

diff --git a/bamboo/tests/core/test_aggregations.py b/bamboo/tests/core/test_aggregations.py
@@ -23,6 +23,8 @@ class TestAggregations(TestCalculator):
         'ratio(risk_factor in ["low_risk"], 1)': 18.0 / 19,
         'count()': 19.0,
         'count(risk_factor in ["low_risk"])': 18.0,
+        'argmax(submit_date)': 18.0,
+        'newest(submit_date, amount)': 28.0,
     }
 
     GROUP_TO_RESULTS = {
@@ -51,6 +53,8 @@ def setUp(self):
             'ratio(risk_factor in ["low_risk"], 1)',
             'count(risk_factor in ["low_risk"])',
             'count()',
+            'argmax(submit_date)',
+            'newest(submit_date, amount)',
         ]
         self.expected_length = defaultdict(int)
         self.groups_list = None

diff --git a/bamboo/tests/core/test_frame.py b/bamboo/tests/core/test_frame.py
@@ -13,8 +13,7 @@ class TestFrame(TestBase):
 
     def setUp(self):
         TestBase.setUp(self)
-        self.dframe = self.test_data['good_eats.csv'
+        self.dframe = self.test_data['good_eats.csv']
-            ]
         self.bframe = BambooFrame(self.dframe)
 
     def _add_bamboo_reserved_keys(self, value=1):