Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

This is the set of updates to finish moving urbansim to the new simulation framework #67

Merged
merged 31 commits into from
Jul 30, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
97ad7f2
checking in some intermediate changes testing new simulation framework
fscottfoti Jul 18, 2014
1ac8330
changes to yamlmodelrunner for new dataset style
fscottfoti Jul 20, 2014
8aef593
small changes after looking at diff
fscottfoti Jul 20, 2014
40863f3
when adding rows, need to be able to get only the local columns
fscottfoti Jul 20, 2014
9ee676b
small change - only keep local columns in transition model
fscottfoti Jul 20, 2014
e6faf4b
performance bug in yamlmodelrunner
fscottfoti Jul 20, 2014
4d21b45
fixing some important bugs in yamlmodelrunner
fscottfoti Jul 22, 2014
943b327
remove dataset after we update client repos
fscottfoti Jul 28, 2014
7f3a03f
need newline at end of file
fscottfoti Jul 28, 2014
d164f27
addressing matt's comments
fscottfoti Jul 28, 2014
d7dbb95
Merge branch 'master' into new-simulation-testing
fscottfoti Jul 28, 2014
ebb7b7c
Merge branch 'master' into new-simulation-testing
fscottfoti Jul 28, 2014
907dcb6
Merge branch 'master' into new-simulation-testing
fscottfoti Jul 29, 2014
f6f9bbf
working towards getting rid of yamlmodelrunner
fscottfoti Jul 29, 2014
f9ce17c
moving things that were in yamlmodelrunner into regression
fscottfoti Jul 29, 2014
78e20f0
moving from yamlmodelrunner to lcm
fscottfoti Jul 30, 2014
462d0c1
Merge branch 'master' into new-simulation-testing
fscottfoti Jul 30, 2014
f03663c
I added the segmentation_col to the set of columns_used
fscottfoti Jul 30, 2014
b21ebd1
pep8
fscottfoti Jul 30, 2014
e27b651
Trying to minimize changes so that I can merge this into master
fscottfoti Jul 30, 2014
905f1f1
looks like both matt and I added local_columns
fscottfoti Jul 30, 2014
2e7f293
adding tests to regression
fscottfoti Jul 30, 2014
d9b8fb7
test for partial_update
fscottfoti Jul 30, 2014
a51dcfc
pep8
fscottfoti Jul 30, 2014
2c35710
also pep8
fscottfoti Jul 30, 2014
0b03a82
testing fit_from_cfg for lcm
fscottfoti Jul 30, 2014
ca34f90
yep, pep8
fscottfoti Jul 30, 2014
be30e2a
Merge branch 'master' into new-simulation-testing
fscottfoti Jul 30, 2014
1623217
addressing most of matt's issues discussed on github
fscottfoti Jul 30, 2014
c8e36d3
pep8
fscottfoti Jul 30, 2014
32ca53d
Merge branch 'master' into new-simulation-testing
fscottfoti Jul 30, 2014
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion urbansim/developer/sqftproforma.py
Original file line number Diff line number Diff line change
Expand Up @@ -554,7 +554,6 @@ def lookup(self, form, df, only_built=True):
df['max_far_from_dua'] = df.max_dua * df.ave_unit_size / self.config.building_efficiency
df['min_max_fars'] = df[['min_max_fars', 'max_far_from_dua']].min(axis=1)

df['min_max_fars'] = df.min_max_fars.fillna(0)
if only_built:
df = df.query('min_max_fars > 0 and parcel_size > 0')

Expand Down
171 changes: 170 additions & 1 deletion urbansim/models/lcm.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import toolz

from . import util
from ..exceptions import ModelEvaluationError
from ..urbanchoice import interaction, mnl
from ..utils import yamlio
from ..utils.logutil import log_start_finish
Expand Down Expand Up @@ -238,6 +239,13 @@ def fit(self, choosers, alternatives, current_choice):
choosers, alternatives, self.sample_size, current_choice)
model_design = dmatrix(
self.str_model_expression, data=merged, return_type='dataframe')

if len(merged) != model_design.as_matrix().shape[0]:
raise ModelEvaluationError(
'Estimated data does not have the same length as input. '
'This suggests there are null values in one or more of '
'the input columns.')

self.log_likelihoods, self.fit_parameters = mnl.mnl_estimate(
model_design.as_matrix(), chosen, self.sample_size)
self.fit_parameters.index = model_design.columns
Expand Down Expand Up @@ -336,6 +344,12 @@ def predict(self, choosers, alternatives, debug=False):
model_design = dmatrix(
self.str_model_expression, data=merged, return_type='dataframe')

if len(merged) != model_design.as_matrix().shape[0]:
raise ModelEvaluationError(
'Simulated data does not have the same length as input. '
'This suggests there are null values in one or more of '
'the input columns.')

coeffs = [self.fit_parameters['Coefficient'][x]
for x in model_design.columns]

Expand Down Expand Up @@ -445,6 +459,83 @@ def columns_used(self):
self.alts_columns_used(),
self.interaction_columns_used())))

@classmethod
def fit_from_cfg(cls, choosers, chosen_fname, alternatives, cfgname):
    """
    Fit a location choice model that is described by a yaml config file.

    The model is loaded from `cfgname`, fit against the supplied data,
    asked to report its fit, and then written back to `cfgname`.

    Parameters
    ----------
    choosers : DataFrame
        A dataframe of rows of agents which have locations assigned.
    chosen_fname : string
        Name of the column in `choosers` holding the location each
        chooser has chosen.
    alternatives : DataFrame
        A dataframe of locations which should include the chosen locations
        from the choosers dataframe as well as some other locations from
        which to sample.  Values in choosers[chosen_fname] should index
        into the alternatives dataframe.
    cfgname : string
        The name of the yaml config file from which to read the location
        choice model.

    Returns
    -------
    lcm : MNLLocationChoiceModel which was used to fit
    """
    logger.debug('start: fit from configuration {}'.format(cfgname))

    model = cls.from_yaml(str_or_buffer=cfgname)

    # The column named by chosen_fname is passed to fit() as the
    # current choice of every chooser.
    current_choice = choosers[chosen_fname]
    model.fit(choosers, alternatives, current_choice)
    model.report_fit()

    # Write the model back to the same config it was loaded from.
    model.to_yaml(str_or_buffer=cfgname)

    logger.debug('finish: fit from configuration {}'.format(cfgname))
    return model

@classmethod
def predict_from_cfg(cls, movers, locations, cfgname,
                     location_ratio=2.0, debug=False):
    """
    Simulate the location choices for the specified choosers

    Parameters
    ----------
    movers : DataFrame
        A dataframe of agents doing the choosing.
    locations : DataFrame
        A dataframe of locations which the choosers are located in and
        which have a supply.
    cfgname : string
        The name of the yaml config file from which to read the location
        choice model.
    location_ratio : float
        Above the location ratio (default of 2.0) of locations to choosers,
        the locations will be sampled to meet this ratio (for performance
        reasons).
    debug : boolean, optional (default False)
        Whether to generate debug information on the model.

    Returns
    -------
    choices : pandas.Series
        Mapping of chooser ID to alternative ID. Some choosers
        will map to a nan value when there are not enough alternatives
        for all the choosers.
    lcm : MNLLocationChoiceModel which was used to predict
    """
    logger.debug('start: predict from configuration {}'.format(cfgname))
    lcm = cls.from_yaml(str_or_buffer=cfgname)

    if len(locations) > len(movers) * location_ratio:
        logger.info(
            "Location ratio exceeded: %d locations and only %d choosers",
            len(locations), len(movers))
        # location_ratio is a float, so the product is a float as well;
        # np.random.choice needs an integer sample size.
        sample_size = int(len(movers) * location_ratio)
        idxes = np.random.choice(
            locations.index, size=sample_size, replace=False)
        locations = locations.loc[idxes]
        logger.info(
            " after sampling %d locations are available\n", len(locations))

    new_units = lcm.predict(movers, locations, debug=debug)
    print("Assigned %d choosers to new units" % len(new_units.index))
    logger.debug('finish: predict from configuration {}'.format(cfgname))
    return new_units, lcm


class MNLLocationChoiceModelGroup(object):
"""
Expand Down Expand Up @@ -1033,4 +1124,82 @@ def columns_used(self):
return list(toolz.unique(toolz.concatv(
self.choosers_columns_used(),
self.alts_columns_used(),
self.interaction_columns_used())))
self.interaction_columns_used(),
[self.segmentation_col])))

@classmethod
def fit_from_cfg(cls, choosers, chosen_fname, alternatives, cfgname):
    """
    Fit a segmented location choice model described by a yaml config file.

    The model is loaded from `cfgname`, fit against the supplied data,
    each per-segment model is asked to report its fit, and the model is
    then written back to `cfgname`.

    Parameters
    ----------
    choosers : DataFrame
        A dataframe of rows of agents which have locations assigned.
    chosen_fname : string
        Name of the column in `choosers` holding the location each
        chooser has chosen.
    alternatives : DataFrame
        A dataframe of locations which should include the chosen locations
        from the choosers dataframe as well as some other locations from
        which to sample.  Values in choosers[chosen_fname] should index
        into the alternatives dataframe.
    cfgname : string
        The name of the yaml config file from which to read the location
        choice model.

    Returns
    -------
    lcm : SegmentedMNLLocationChoiceModel which was used to fit
    """
    logger.debug('start: fit from configuration {}'.format(cfgname))

    segmented = cls.from_yaml(str_or_buffer=cfgname)
    current_choice = choosers[chosen_fname]
    segmented.fit(choosers, alternatives, current_choice)

    # Report the fit of every model held by the underlying group,
    # labelled with the key it is stored under.
    for segment, submodel in segmented._group.models.items():
        print("LCM RESULTS FOR SEGMENT %s\n" % str(segment))
        submodel.report_fit()

    # Write the model back to the same config it was loaded from.
    segmented.to_yaml(str_or_buffer=cfgname)

    logger.debug('finish: fit from configuration {}'.format(cfgname))
    return segmented

@classmethod
def predict_from_cfg(cls, movers, locations, cfgname,
                     location_ratio=2.0, debug=False):
    """
    Simulate the location choices for the specified choosers

    Parameters
    ----------
    movers : DataFrame
        A dataframe of agents doing the choosing.
    locations : DataFrame
        A dataframe of locations which the choosers are located in and
        which have a supply.
    cfgname : string
        The name of the yaml config file from which to read the location
        choice model.
    location_ratio : float
        Above the location ratio (default of 2.0) of locations to choosers,
        the locations will be sampled to meet this ratio (for performance
        reasons).
    debug : boolean, optional (default False)
        Passed through to the model's `predict` call.

    Returns
    -------
    choices : pandas.Series
        Mapping of chooser ID to alternative ID. Some choosers
        will map to a nan value when there are not enough alternatives
        for all the choosers.
    lcm : SegmentedMNLLocationChoiceModel which was used to predict
    """
    logger.debug('start: predict from configuration {}'.format(cfgname))
    lcm = cls.from_yaml(str_or_buffer=cfgname)

    if len(locations) > len(movers) * location_ratio:
        logger.info(
            "Location ratio exceeded: %d locations and only %d choosers",
            len(locations), len(movers))
        # location_ratio is a float, so the product is a float as well;
        # np.random.choice needs an integer sample size.
        sample_size = int(len(movers) * location_ratio)
        idxes = np.random.choice(
            locations.index, size=sample_size, replace=False)
        locations = locations.loc[idxes]
        logger.info(
            " after sampling %d locations are available\n", len(locations))

    new_units = lcm.predict(movers, locations, debug=debug)
    print("Assigned %d choosers to new units" % len(new_units.index))
    logger.debug('finish: predict from configuration {}'.format(cfgname))
    return new_units, lcm
125 changes: 124 additions & 1 deletion urbansim/models/regression.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,13 @@ def fit_model(df, filters, model_expression):
"""
df = util.apply_filter_query(df, filters)
model = smf.ols(formula=model_expression, data=df)

if len(model.exog) != len(df):
raise ModelEvaluationError(
'Estimated data does not have the same length as input. '
'This suggests there are null values in one or more of '
'the input columns.')

with log_start_finish('statsmodels OLS fit', logger):
return model.fit()

Expand Down Expand Up @@ -325,6 +332,9 @@ class instance for use during prediction.
self.fit_parameters = _model_fit_to_table(fit)
if debug:
index = util.apply_filter_query(data, self.fit_filters).index
assert len(fit.model.exog) == len(index), (
"The estimate data is unequal in length to the original "
"dataframe, usually caused by nans")
df = pd.DataFrame(
fit.model.exog, columns=fit.model.exog_names, index=index)
df[fit.model.endog_names] = fit.model.endog
Expand Down Expand Up @@ -456,6 +466,57 @@ def columns_used(self):
util.columns_in_filters(self.predict_filters),
util.columns_in_formula(self.model_expression))))

@classmethod
def fit_from_cfg(cls, df, cfgname, debug=False):
    """
    Fit a regression model described by a yaml config file.

    The model is loaded from `cfgname`, fit on `df`, the fit summary is
    printed, and the model is written back to `cfgname`.

    Parameters
    ----------
    df : DataFrame
        The dataframe which contains the columns to use for the estimation.
    cfgname : string
        The name of the yaml config file which describes the hedonic model.
    debug : boolean, optional (default False)
        Whether to generate debug information on the model.

    Returns
    -------
    RegressionModel which was used to fit
    """
    logger.debug('start: fit from configuration {}'.format(cfgname))
    hm = cls.from_yaml(str_or_buffer=cfgname)
    ret = hm.fit(df, debug=debug)
    # Call form of print: valid on Python 2 and 3 for a single argument.
    print(ret.summary())
    hm.to_yaml(str_or_buffer=cfgname)
    logger.debug('finish: fit from configuration {}'.format(cfgname))
    return hm

@classmethod
def predict_from_cfg(cls, df, cfgname):
    """
    Predict with a regression model described by a yaml config file.

    The model is loaded from `cfgname`, used to predict on `df`, and a
    description of the predicted values is printed.

    Parameters
    ----------
    df : DataFrame
        The dataframe which contains the columns to use for the prediction.
    cfgname : string
        The name of the yaml config file which describes the hedonic model.

    Returns
    -------
    predicted : pandas.Series
        Predicted data in a pandas Series. Will have the index of `df`
        after applying filters and minus any groups that do not have
        models.
    hm : RegressionModel which was used to predict
    """
    logger.debug('start: predict from configuration {}'.format(cfgname))
    hm = cls.from_yaml(str_or_buffer=cfgname)

    price_or_rent = hm.predict(df)
    # Call form of print: valid on Python 2 and 3 for a single argument.
    print(price_or_rent.describe())

    logger.debug('finish: predict from configuration {}'.format(cfgname))
    return price_or_rent, hm


class RegressionModelGroup(object):
"""
Expand Down Expand Up @@ -896,4 +957,66 @@ def columns_used(self):
return list(toolz.unique(toolz.concatv(
util.columns_in_filters(self.fit_filters),
util.columns_in_filters(self.predict_filters),
self._group.columns_used())))
self._group.columns_used(),
[self.segmentation_col])))

@classmethod
def fit_from_cfg(cls, df, cfgname, debug=False, min_segment_size=None):
    """
    Fit a segmented regression model described by a yaml config file.

    The model is loaded from `cfgname`, fit on `df`, the fit summary of
    every segment is printed, and the model is written back to `cfgname`.

    Parameters
    ----------
    df : DataFrame
        The dataframe which contains the columns to use for the estimation.
    cfgname : string
        The name of the yaml config file which describes the hedonic model.
    debug : boolean, optional (default False)
        Whether to generate debug information on the model.
    min_segment_size : int, optional
        Set attribute on the model.  A falsy value (None or 0) leaves
        the value loaded from the config untouched.

    Returns
    -------
    hm : SegmentedRegressionModel which was used to fit
    """
    logger.debug('start: fit from configuration {}'.format(cfgname))
    hm = cls.from_yaml(str_or_buffer=cfgname)
    if min_segment_size:
        hm.min_segment_size = min_segment_size

    # Call form of print: valid on Python 2 and 3 for a single argument.
    for k, v in hm.fit(df, debug=debug).items():
        print("REGRESSION RESULTS FOR SEGMENT %s\n" % str(k))
        print(v.summary())
    hm.to_yaml(str_or_buffer=cfgname)
    logger.debug('finish: fit from configuration {}'.format(cfgname))
    return hm

@classmethod
def predict_from_cfg(cls, df, cfgname, min_segment_size=None):
    """
    Predict with a segmented regression model described by a yaml config.

    The model is loaded from `cfgname`, used to predict on `df`, and a
    description of the predicted values is printed.

    Parameters
    ----------
    df : DataFrame
        The dataframe which contains the columns to use for the prediction.
    cfgname : string
        The name of the yaml config file which describes the hedonic model.
    min_segment_size : int, optional
        Set attribute on the model.  A falsy value (None or 0) leaves
        the value loaded from the config untouched.

    Returns
    -------
    predicted : pandas.Series
        Predicted data in a pandas Series. Will have the index of `df`
        after applying filters and minus any groups that do not have
        models.
    hm : SegmentedRegressionModel which was used to predict
    """
    logger.debug('start: predict from configuration {}'.format(cfgname))
    hm = cls.from_yaml(str_or_buffer=cfgname)
    if min_segment_size:
        hm.min_segment_size = min_segment_size

    price_or_rent = hm.predict(df)
    # Call form of print: valid on Python 2 and 3 for a single argument.
    print(price_or_rent.describe())
    logger.debug('finish: predict from configuration {}'.format(cfgname))

    return price_or_rent, hm
Loading