From d81f84fbdcfe2bc450b9b70402dcf0384c039cd5 Mon Sep 17 00:00:00 2001
From: Matt Davis <jiffyclub@gmail.com>
Date: Thu, 10 Apr 2014 10:11:46 -0700
Subject: [PATCH] Adding a couple safety checks in the hedonic modeling

- Verify that prediction results have the same length as the input
  data. A mismatch usually indicates that there are null values in
  the input data, something that statsmodels apparently does not
  check.
- Raise an error if a user tries to get a model prediction before
  fitting the model.
---
 urbansim/exceptions.py                |  2 ++
 urbansim/models/hedonic.py            | 11 +++++++++++
 urbansim/models/tests/test_hedonic.py | 17 +++++++++++++++++
 3 files changed, 30 insertions(+)
 create mode 100644 urbansim/exceptions.py

diff --git a/urbansim/exceptions.py b/urbansim/exceptions.py
new file mode 100644
index 00000000..f27cb68b
--- /dev/null
+++ b/urbansim/exceptions.py
@@ -0,0 +1,2 @@
+class ModelEvaluationError(Exception):
+    """Raised when model predictions do not line up with the input data."""
diff --git a/urbansim/models/hedonic.py b/urbansim/models/hedonic.py
index c9578853..c5ea708e 100644
--- a/urbansim/models/hedonic.py
+++ b/urbansim/models/hedonic.py
@@ -2,6 +2,8 @@
 import pandas as pd
 import statsmodels.formula.api as smf
 
+from .. exceptions import ModelEvaluationError
+
 
 def apply_filter_query(df, filters=None):
     """
@@ -81,6 +83,13 @@ def predict(df, filters, model_fit, ytransform=None):
     """
     df = apply_filter_query(df, filters)
     sim_data = model_fit.predict(df)
+
+    if len(sim_data) != len(df):
+        raise ModelEvaluationError(
+            'Predicted data does not have the same length as input. '
+            'This suggests there are null values in one or more of '
+            'the input columns.')
+
     if ytransform:
         sim_data = ytransform(sim_data)
     return pd.Series(sim_data, index=df.index)
@@ -159,5 +168,7 @@ def predict(self, data):
             after applying filters.
 
         """
+        if not self.model_fit:
+            raise RuntimeError('Model has not been fit.')
         return predict(
             data, self.predict_filters, self.model_fit, self.ytransform)
diff --git a/urbansim/models/tests/test_hedonic.py b/urbansim/models/tests/test_hedonic.py
index 3ee5bd46..c0668ae7 100644
--- a/urbansim/models/tests/test_hedonic.py
+++ b/urbansim/models/tests/test_hedonic.py
@@ -5,6 +5,7 @@
 from statsmodels.regression.linear_model import RegressionResultsWrapper
 
 from .. import hedonic
+from ...exceptions import ModelEvaluationError
 
 
 @pytest.fixture
@@ -77,6 +78,18 @@ def test_predict_ytransform(test_df):
     pdt.assert_series_equal(predicted, expected)
 
 
+def test_predict_with_nans():
+    """Check that hedonic.predict raises ModelEvaluationError on null input."""
+    df = pd.DataFrame(
+        {'col1': range(5), 'col2': [5, 6, float('nan'), 8, 9]},
+        index=['a', 'b', 'c', 'd', 'e'])
+    fit = hedonic.fit_model(df.loc[['a', 'b', 'e']], None, 'col1 ~ col2')
+
+    # Row 'c' carries the NaN; rows 'a', 'b', 'e' used for fitting are clean.
+    with pytest.raises(ModelEvaluationError):
+        hedonic.predict(df.loc[['c', 'd']], None, fit)
+
+
 def test_HedonicModel(test_df):
     fit_filters = ['col1 in [0, 2, 4]']
     predict_filters = ['col1 in [1, 3]']
@@ -93,6 +106,10 @@ def test_HedonicModel(test_df):
     assert model.name == name
     assert model.model_fit is None
 
+    # verify there's an error if there isn't a model fit yet
+    with pytest.raises(RuntimeError):
+        model.predict(test_df)
+
     fit = model.fit_model(test_df)
     assert isinstance(fit, RegressionResultsWrapper)
     assert isinstance(model.model_fit, RegressionResultsWrapper)