From d81f84fbdcfe2bc450b9b70402dcf0384c039cd5 Mon Sep 17 00:00:00 2001 From: Matt Davis Date: Thu, 10 Apr 2014 10:11:46 -0700 Subject: [PATCH] Adding a couple safety checks in the hedonic modeling - Verify that prediction results have the same length as in the input data. A mismatch usually indicates that there are null values in the input data, something that statsmodels apparently does not check. - Raise an error if a user tries to get a model prediction before fitting the model. --- urbansim/exceptions.py | 2 ++ urbansim/models/hedonic.py | 11 +++++++++++ urbansim/models/tests/test_hedonic.py | 17 +++++++++++++++++ 3 files changed, 30 insertions(+) create mode 100644 urbansim/exceptions.py diff --git a/urbansim/exceptions.py b/urbansim/exceptions.py new file mode 100644 index 00000000..f27cb68b --- /dev/null +++ b/urbansim/exceptions.py @@ -0,0 +1,2 @@ +class ModelEvaluationError(Exception): + pass diff --git a/urbansim/models/hedonic.py b/urbansim/models/hedonic.py index c9578853..c5ea708e 100644 --- a/urbansim/models/hedonic.py +++ b/urbansim/models/hedonic.py @@ -2,6 +2,8 @@ import pandas as pd import statsmodels.formula.api as smf +from .. exceptions import ModelEvaluationError + def apply_filter_query(df, filters=None): """ @@ -81,6 +83,13 @@ def predict(df, filters, model_fit, ytransform=None): """ df = apply_filter_query(df, filters) sim_data = model_fit.predict(df) + + if len(sim_data) != len(df): + raise ModelEvaluationError( + 'Predicted data does not have the same length as input. ' + 'This suggests there are null values in one or more of ' + 'the input columns.') + if ytransform: sim_data = ytransform(sim_data) return pd.Series(sim_data, index=df.index) @@ -159,5 +168,7 @@ def predict(self, data): after applying filters. """ + if not self.model_fit: + raise RuntimeError('Model has not been fit.') return predict( data, self.predict_filters, self.model_fit, self.ytransform) diff --git a/urbansim/models/tests/test_hedonic.py b/urbansim/models/tests/test_hedonic.py index 3ee5bd46..c0668ae7 100644 --- a/urbansim/models/tests/test_hedonic.py +++ b/urbansim/models/tests/test_hedonic.py @@ -5,6 +5,7 @@ from statsmodels.regression.linear_model import RegressionResultsWrapper from .. import hedonic +from ...exceptions import ModelEvaluationError @pytest.fixture @@ -77,6 +78,18 @@ def test_predict_ytransform(test_df): pdt.assert_series_equal(predicted, expected) +def test_predict_with_nans(): + df = pd.DataFrame( + {'col1': range(5), + 'col2': [5, 6, pd.np.nan, 8, 9]}, + index=['a', 'b', 'c', 'd', 'e']) + fit = hedonic.fit_model(df.loc[['a', 'b', 'e']], None, 'col1 ~ col2') + + with pytest.raises(ModelEvaluationError): + hedonic.predict( + df.loc[['c', 'd']], None, fit) + + def test_HedonicModel(test_df): fit_filters = ['col1 in [0, 2, 4]'] predict_filters = ['col1 in [1, 3]'] @@ -93,6 +106,10 @@ def test_HedonicModel(test_df): assert model.name == name assert model.model_fit is None + # verify there's an error if there isn't a model fit yet + with pytest.raises(RuntimeError): + model.predict(test_df) + fit = model.fit_model(test_df) assert isinstance(fit, RegressionResultsWrapper) assert isinstance(model.model_fit, RegressionResultsWrapper)