UDST · fscottfoti · Jul 30, 2014 · Jul 18, 2014 · Jul 20, 2014 · Jul 20, 2014
diff --git a/urbansim/developer/sqftproforma.py b/urbansim/developer/sqftproforma.py
@@ -554,7 +554,6 @@ def lookup(self, form, df, only_built=True):
             df['max_far_from_dua'] = df.max_dua * df.ave_unit_size / self.config.building_efficiency
             df['min_max_fars'] = df[['min_max_fars', 'max_far_from_dua']].min(axis=1)
 
-        df['min_max_fars'] = df.min_max_fars.fillna(0)
         if only_built:
             df = df.query('min_max_fars > 0 and parcel_size > 0')
 

diff --git a/urbansim/models/lcm.py b/urbansim/models/lcm.py
@@ -14,6 +14,7 @@
 import toolz
 
 from . import util
+from ..exceptions import ModelEvaluationError
 from ..urbanchoice import interaction, mnl
 from ..utils import yamlio
 from ..utils.logutil import log_start_finish
@@ -238,6 +239,13 @@ def fit(self, choosers, alternatives, current_choice):
             choosers, alternatives, self.sample_size, current_choice)
         model_design = dmatrix(
             self.str_model_expression, data=merged, return_type='dataframe')
+
+        if len(merged) != model_design.as_matrix().shape[0]:
+            raise ModelEvaluationError(
+                'Estimated data does not have the same length as input.  '
+                'This suggests there are null values in one or more of '
+                'the input columns.')
+
         self.log_likelihoods, self.fit_parameters = mnl.mnl_estimate(
             model_design.as_matrix(), chosen, self.sample_size)
         self.fit_parameters.index = model_design.columns
@@ -336,6 +344,12 @@ def predict(self, choosers, alternatives, debug=False):
         model_design = dmatrix(
             self.str_model_expression, data=merged, return_type='dataframe')
 
+        if len(merged) != model_design.as_matrix().shape[0]:
+            raise ModelEvaluationError(
+                'Simulated data does not have the same length as input.  '
+                'This suggests there are null values in one or more of '
+                'the input columns.')
+
         coeffs = [self.fit_parameters['Coefficient'][x]
                   for x in model_design.columns]
 
@@ -445,6 +459,83 @@ def columns_used(self):
             self.alts_columns_used(),
             self.interaction_columns_used())))
 
+    @classmethod
+    def fit_from_cfg(cls, choosers, chosen_fname, alternatives, cfgname):
+        """
+        Build a model from a yaml config, fit it, and write it back out.
+
+        The model is created with ``cls.from_yaml``, fit against the
+        current choices found in ``choosers[chosen_fname]``, reported via
+        ``report_fit`` and then passed to ``to_yaml`` with the same
+        ``cfgname`` it was read from.
+
+        Parameters
+        ----------
+        choosers : DataFrame
+            Rows of agents which already have a location assigned.
+        chosen_fname : string
+            Name of the column in `choosers` holding the location each
+            chooser has chosen.
+        alternatives : DataFrame
+            Locations to choose among.  It should contain the chosen
+            locations from `choosers` as well as other locations from
+            which to sample; values in ``choosers[chosen_fname]`` should
+            index into this dataframe.
+        cfgname : string
+            The name of the yaml config file from which to read the
+            location choice model.
+
+        Returns
+        -------
+        lcm : MNLLocationChoiceModel which was used to fit
+        """
+        logger.debug('start: fit from configuration {}'.format(cfgname))
+
+        model = cls.from_yaml(str_or_buffer=cfgname)
+        current_choice = choosers[chosen_fname]
+        model.fit(choosers, alternatives, current_choice)
+        model.report_fit()
+        model.to_yaml(str_or_buffer=cfgname)
+
+        logger.debug('finish: fit from configuration {}'.format(cfgname))
+        return model
+
+    @classmethod
+    def predict_from_cfg(cls, movers, locations, cfgname,
+                         location_ratio=2.0, debug=False):
+        """
+        Simulate the location choices for the specified choosers.
+
+        The model is read from `cfgname` with ``cls.from_yaml``.  When there
+        are more than ``len(movers) * location_ratio`` locations, a random
+        subset of the locations is drawn (without replacement) before
+        calling ``predict``.
+
+        Parameters
+        ----------
+        movers : DataFrame
+            A dataframe of agents doing the choosing.
+        locations : DataFrame
+            A dataframe of locations which the choosers are located in and
+            which have a supply.
+        cfgname : string
+            The name of the yaml config file from which to read the location
+            choice model.
+        location_ratio : float
+            Above the location ratio (default of 2.0) of locations to choosers, the
+            locations will be sampled to meet this ratio (for performance reasons).
+        debug : boolean, optional (default False)
+            Whether to generate debug information on the model.
+
+        Returns
+        -------
+        choices : pandas.Series
+            Mapping of chooser ID to alternative ID. Some choosers
+            will map to a nan value when there are not enough alternatives
+            for all the choosers.
+        lcm : MNLLocationChoiceModel which was used to predict
+        """
+        logger.debug('start: predict from configuration {}'.format(cfgname))
+        lcm = cls.from_yaml(str_or_buffer=cfgname)
+
+        if len(locations) > len(movers) * location_ratio:
+            logger.info("Location ratio exceeded: %d locations and only %d choosers" %
+                        (len(locations), len(movers)))
+            # np.random.choice requires an integer sample size, but
+            # len(movers) * location_ratio is a float for a float ratio
+            # (including the default 2.0), so truncate it explicitly.  The
+            # guard above guarantees the size never exceeds len(locations).
+            sample_size = int(len(movers) * location_ratio)
+            idxes = np.random.choice(locations.index, size=sample_size,
+                                     replace=False)
+            locations = locations.loc[idxes]
+            logger.info("  after sampling %d locations are available\n" % len(locations))
+
+        new_units = lcm.predict(movers, locations, debug=debug)
+        print("Assigned %d choosers to new units" % len(new_units.index))
+        logger.debug('finish: predict from configuration {}'.format(cfgname))
+        return new_units, lcm
+
 
 class MNLLocationChoiceModelGroup(object):
     """
@@ -1033,4 +1124,82 @@ def columns_used(self):
         return list(toolz.unique(toolz.concatv(
             self.choosers_columns_used(),
             self.alts_columns_used(),
-            self.interaction_columns_used())))
+            self.interaction_columns_used(),
+            [self.segmentation_col])))
+
+    @classmethod
+    def fit_from_cfg(cls, choosers, chosen_fname, alternatives, cfgname):
+        """
+        Build a segmented model from a yaml config, fit it, and write it
+        back out.
+
+        After fitting, ``report_fit`` is called on every per-segment model
+        held in ``_group.models``, each preceded by a printed header naming
+        the segment.  The model is then passed to ``to_yaml`` with the same
+        ``cfgname`` it was read from.
+
+        Parameters
+        ----------
+        choosers : DataFrame
+            Rows of agents which already have a location assigned.
+        chosen_fname : string
+            Name of the column in `choosers` holding the location each
+            chooser has chosen.
+        alternatives : DataFrame
+            Locations to choose among.  It should contain the chosen
+            locations from `choosers` as well as other locations from
+            which to sample; values in ``choosers[chosen_fname]`` should
+            index into this dataframe.
+        cfgname : string
+            The name of the yaml config file from which to read the
+            location choice model.
+
+        Returns
+        -------
+        lcm : SegmentedMNLLocationChoiceModel which was used to fit
+        """
+        logger.debug('start: fit from configuration {}'.format(cfgname))
+
+        model = cls.from_yaml(str_or_buffer=cfgname)
+        current_choice = choosers[chosen_fname]
+        model.fit(choosers, alternatives, current_choice)
+
+        for segment, segment_model in model._group.models.items():
+            header = "LCM RESULTS FOR SEGMENT %s\n" % str(segment)
+            print(header)
+            segment_model.report_fit()
+
+        model.to_yaml(str_or_buffer=cfgname)
+
+        logger.debug('finish: fit from configuration {}'.format(cfgname))
+        return model
+
+    @classmethod
+    def predict_from_cfg(cls, movers, locations, cfgname,
+                         location_ratio=2.0, debug=False):
+        """
+        Simulate the location choices for the specified choosers.
+
+        The model is read from `cfgname` with ``cls.from_yaml``.  When there
+        are more than ``len(movers) * location_ratio`` locations, a random
+        subset of the locations is drawn (without replacement) before
+        calling ``predict``.
+
+        Parameters
+        ----------
+        movers : DataFrame
+            A dataframe of agents doing the choosing.
+        locations : DataFrame
+            A dataframe of locations which the choosers are located in and
+            which have a supply.
+        cfgname : string
+            The name of the yaml config file from which to read the location
+            choice model.
+        location_ratio : float
+            Above the location ratio (default of 2.0) of locations to choosers, the
+            locations will be sampled to meet this ratio (for performance reasons).
+        debug : boolean, optional (default False)
+            Passed through unchanged to the model's ``predict`` method.
+
+        Returns
+        -------
+        choices : pandas.Series
+            Mapping of chooser ID to alternative ID. Some choosers
+            will map to a nan value when there are not enough alternatives
+            for all the choosers.
+        lcm : SegmentedMNLLocationChoiceModel which was used to predict
+        """
+        logger.debug('start: predict from configuration {}'.format(cfgname))
+        lcm = cls.from_yaml(str_or_buffer=cfgname)
+
+        if len(locations) > len(movers) * location_ratio:
+            logger.info("Location ratio exceeded: %d locations and only %d choosers" %
+                        (len(locations), len(movers)))
+            # np.random.choice requires an integer sample size, but
+            # len(movers) * location_ratio is a float for a float ratio
+            # (including the default 2.0), so truncate it explicitly.  The
+            # guard above guarantees the size never exceeds len(locations).
+            sample_size = int(len(movers) * location_ratio)
+            idxes = np.random.choice(locations.index, size=sample_size,
+                                     replace=False)
+            locations = locations.loc[idxes]
+            logger.info("  after sampling %d locations are available\n" % len(locations))
+
+        new_units = lcm.predict(movers, locations, debug=debug)
+        print("Assigned %d choosers to new units" % len(new_units.index))
+        logger.debug('finish: predict from configuration {}'.format(cfgname))
+        return new_units, lcm
diff --git a/urbansim/models/regression.py b/urbansim/models/regression.py
@@ -42,6 +42,13 @@ def fit_model(df, filters, model_expression):
     """
     df = util.apply_filter_query(df, filters)
     model = smf.ols(formula=model_expression, data=df)
+
+    if len(model.exog) != len(df):
+        raise ModelEvaluationError(
+            'Estimated data does not have the same length as input.  '
+            'This suggests there are null values in one or more of '
+            'the input columns.')
+
     with log_start_finish('statsmodels OLS fit', logger):
         return model.fit()
 
@@ -325,6 +332,9 @@ class instance for use during prediction.
         self.fit_parameters = _model_fit_to_table(fit)
         if debug:
             index = util.apply_filter_query(data, self.fit_filters).index
+            assert len(fit.model.exog) == len(index), (
+                "The estimate data is unequal in length to the original "
+                "dataframe, usually caused by nans")
             df = pd.DataFrame(
                 fit.model.exog, columns=fit.model.exog_names, index=index)
             df[fit.model.endog_names] = fit.model.endog
@@ -456,6 +466,57 @@ def columns_used(self):
             util.columns_in_filters(self.predict_filters),
             util.columns_in_formula(self.model_expression))))
 
+    @classmethod
+    def fit_from_cfg(cls, df, cfgname, debug=False):
+        """
+        Build a model from a yaml config, fit it, and write it back out.
+
+        The model is created with ``cls.from_yaml``, fit on `df`, the
+        summary of the fit result is printed, and the model is passed to
+        ``to_yaml`` with the same ``cfgname`` it was read from.
+
+        Parameters
+        ----------
+        df : DataFrame
+            The dataframe which contains the columns to use for the estimation.
+        cfgname : string
+            The name of the yaml config file which describes the hedonic model.
+        debug : boolean, optional (default False)
+            Whether to generate debug information on the model.
+
+        Returns
+        -------
+        RegressionModel which was used to fit
+        """
+        logger.debug('start: fit from configuration {}'.format(cfgname))
+        hm = cls.from_yaml(str_or_buffer=cfgname)
+        ret = hm.fit(df, debug=debug)
+        # Call form of print: identical output for a single argument on
+        # Python 2 and valid syntax on Python 3.
+        print(ret.summary())
+        hm.to_yaml(str_or_buffer=cfgname)
+        # Closing message must say 'finish' so start/finish pairs line up
+        # in the log.
+        logger.debug('finish: fit from configuration {}'.format(cfgname))
+        return hm
+
+    @classmethod
+    def predict_from_cfg(cls, df, cfgname):
+        """
+        Build a model from a yaml config and use it to predict on `df`.
+
+        The model is created with ``cls.from_yaml`` and the result of
+        ``describe()`` on the prediction is printed.
+
+        Parameters
+        ----------
+        df : DataFrame
+            The dataframe which contains the columns to use for the
+            prediction.
+        cfgname : string
+            The name of the yaml config file which describes the hedonic model.
+
+        Returns
+        -------
+        predicted : pandas.Series
+            Predicted data in a pandas Series. Will have the index of `df`
+            after applying filters and minus any groups that do not have
+            models.
+        hm : RegressionModel which was used to predict
+        """
+        logger.debug('start: predict from configuration {}'.format(cfgname))
+        hm = cls.from_yaml(str_or_buffer=cfgname)
+
+        price_or_rent = hm.predict(df)
+        # Call form of print: identical output for a single argument on
+        # Python 2 and valid syntax on Python 3.
+        print(price_or_rent.describe())
+
+        # Closing message must say 'finish' so start/finish pairs line up
+        # in the log.
+        logger.debug('finish: predict from configuration {}'.format(cfgname))
+        return price_or_rent, hm
+
 
 class RegressionModelGroup(object):
     """
@@ -896,4 +957,66 @@ def columns_used(self):
         return list(toolz.unique(toolz.concatv(
             util.columns_in_filters(self.fit_filters),
             util.columns_in_filters(self.predict_filters),
-            self._group.columns_used())))
+            self._group.columns_used(),
+            [self.segmentation_col])))
+
+    @classmethod
+    def fit_from_cfg(cls, df, cfgname, debug=False, min_segment_size=None):
+        """
+        Build a segmented model from a yaml config, fit it, and write it
+        back out.
+
+        The model is created with ``cls.from_yaml`` and fit on `df`.  For
+        every (segment, fit result) pair returned by ``fit`` a header and
+        the result's summary are printed.  The model is then passed to
+        ``to_yaml`` with the same ``cfgname`` it was read from.
+
+        Parameters
+        ----------
+        df : DataFrame
+            The dataframe which contains the columns to use for the estimation.
+        cfgname : string
+            The name of the yaml config file which describes the hedonic model.
+        debug : boolean, optional (default False)
+            Whether to generate debug information on the model.
+        min_segment_size : int, optional
+            When given (and truthy), overrides the ``min_segment_size``
+            attribute of the loaded model before fitting.
+
+        Returns
+        -------
+        hm : SegmentedRegressionModel which was used to fit
+        """
+        logger.debug('start: fit from configuration {}'.format(cfgname))
+        hm = cls.from_yaml(str_or_buffer=cfgname)
+        if min_segment_size:
+            hm.min_segment_size = min_segment_size
+
+        for k, v in hm.fit(df, debug=debug).items():
+            # Call form of print: identical output for a single argument on
+            # Python 2 and valid syntax on Python 3.
+            print("REGRESSION RESULTS FOR SEGMENT %s\n" % str(k))
+            print(v.summary())
+        hm.to_yaml(str_or_buffer=cfgname)
+        logger.debug('finish: fit from configuration {}'.format(cfgname))
+        return hm
+
+    @classmethod
+    def predict_from_cfg(cls, df, cfgname, min_segment_size=None):
+        """
+        Build a segmented model from a yaml config and use it to predict
+        on `df`.
+
+        The model is created with ``cls.from_yaml`` and the result of
+        ``describe()`` on the prediction is printed.
+
+        Parameters
+        ----------
+        df : DataFrame
+            The dataframe which contains the columns to use for the
+            prediction.
+        cfgname : string
+            The name of the yaml config file which describes the hedonic model.
+        min_segment_size : int, optional
+            When given (and truthy), overrides the ``min_segment_size``
+            attribute of the loaded model before predicting.
+
+        Returns
+        -------
+        predicted : pandas.Series
+            Predicted data in a pandas Series. Will have the index of `df`
+            after applying filters and minus any groups that do not have
+            models.
+        hm : SegmentedRegressionModel which was used to predict
+        """
+        logger.debug('start: predict from configuration {}'.format(cfgname))
+        hm = cls.from_yaml(str_or_buffer=cfgname)
+        if min_segment_size:
+            hm.min_segment_size = min_segment_size
+
+        price_or_rent = hm.predict(df)
+        # Call form of print: identical output for a single argument on
+        # Python 2 and valid syntax on Python 3.
+        print(price_or_rent.describe())
+        logger.debug('finish: predict from configuration {}'.format(cfgname))
+
+        return price_or_rent, hm