diff --git a/.gitignore b/.gitignore index 8a96515..56f8495 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ +data/ + # Jupyter checkpoints **/.ipynb_checkpoints .pytest_cache/* diff --git a/.readthedocs.yml b/.readthedocs.yml deleted file mode 100644 index c6ad5c7..0000000 --- a/.readthedocs.yml +++ /dev/null @@ -1,10 +0,0 @@ -formats: - - none - -conda: - file: docs/environment.yml - -python: - version: 3 - setup_py_install: true - \ No newline at end of file diff --git a/.travis.yml b/.travis.yml index a7337d6..799b0ad 100644 --- a/.travis.yml +++ b/.travis.yml @@ -5,29 +5,23 @@ python: - "3.5" - "3.6" -before_install: - - pip install --upgrade pip - - pip install --upgrade wheel - - wget http://bit.ly/miniconda -O miniconda.sh - - bash miniconda.sh -b -p $HOME/miniconda - - export PATH="$HOME/miniconda/bin:$PATH" - - hash -r - - conda config --set always_yes yes --set show_channel_urls true - - conda update conda - - conda config --add channels conda-forge --force - - conda config --add channels udst --force - - conda create --quiet --name TESTENV python=$TRAVIS_PYTHON_VERSION --file requirements.txt --file requirements-dev.txt - - source activate TESTENV - - conda info --all - - conda list +matrix: + include: + - python: "3.7" # temp solution until travis supports python 3.7 more cleanly + dist: xenial + sudo: true install: - pip install . + - pip install -r requirements-dev.txt + - # extra tests run if urbansim is present, but it can't install with python 3.7 + - if [ "$TRAVIS_PYTHON_VERSION" != "3.7" ]; then pip install urbansim; fi + - pip list - pip show choicemodels script: - - coverage run --source choicemodels -m pytest --verbose + - coverage run --source choicemodels --module pytest --verbose after_success: - - coverage report -m + - coverage report --show-missing - coveralls diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..fa70b79 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,75 @@ +# ChoiceModels change log + +### 0.2.1 (2019-01-30) + +- fixes a distribution error that excluded the LICENSE.txt file + +### 0.2 (2019-01-25) + +- production release + +### 0.2.dev10 (2019-01-25) + +- moves the `choicemodels.tools.distancematrix` functions directly into `choicemodels.tools` + +### 0.2.dev9 (2019-01-22) + +- improves documentation and packaging + +### 0.2.dev8 (2019-01-21) + +- prevents an infinite loop in `iterative_lottery_choices()` when none of the remaining alternatives can accommodate any of the remaining choosers + +### 0.2.dev7 (2018-12-12) + +- adds a check to the `MergedChoiceTable` constructor to make sure there aren't any column names that overlap between the observations and alternatives tables + +### 0.2.dev6 (2018-11-23) + +- resolves deprecation warnings from older code + +- removes `choicemodels.tools.mnl_simulate()` (originally from `urbansim.urbanchoice.mnl`), because this functionality has been fully replaced + +- removes `choicemodels.Logit`, which wrapped a StatsModels estimator as proof of concept for MNL and didn't provide much value on its own + +### 0.2.dev5 (2018-11-12) + +- adds a `chooser_batch_size` parameter to `iterative_lottery_choices()`, to support batch simulation for very large datasets + +### 0.2.dev4 (2018-10-15) + +- adds a function `choicemodels.tools.iterative_lottery_choices()` for simulation of choices where the alternatives have limited capacity and choosers have varying probability distributions over the alternatives + +- in `MergedChoiceTable`, empty choosers or alternatives now produce an empty
choice table (rather than an exception) + +- adds support for multiple tables of interaction terms in `MergedChoiceTable` + +### 0.2.dev3 (2018-10-03) + +- adds a function `choicemodels.tools.monte_carlo_choices()` for efficient simulation of choices for a list of scenarios that have differing probability distributions, but no capacity constraints on the alternatives + +### 0.2.dev2 (2018-09-12) + +- adds a `probabilities()` method to the `MultinomialLogitResults` class, which uses the fitted model coefficients to generate predicted probabilities for a table of choice scenarios + +- adds a required `model_expression` parameter to the `MultinomialLogitResults` constructor + +### 0.2.dev1 (2018-08-06) + +- improves the reliability of the native MNL estimator: (a) reduces the chance of a memory overflow when exponentiating utilities and (b) reports warnings from SciPy if the likelihood maximization algorithm may not have converged correctly + +- adds substantial functionality to the `MergedChoiceTable` utility: sampling of alternatives with or without replacement, alternative-specific weights, interaction weights that apply to combinations of choosers and alternatives, automatic joining of interaction terms onto the merged table, non-sampling (all the alternatives available for each chooser), and estimation/simulation support for all combinations + +- `LargeMultinomialLogit` class now optionally accepts a `MergedChoiceTable` as input + +### 0.2.dev0 (2018-07-09) + +- adds additional information to the summary table for the native MNL estimator: number of observations, df of the model, df of the residuals, rho-squared, rho-bar-squared, BIC, AIC, p values, timestamp + +### 0.1.1 (2018-03-08) + +- packaging improvements + +### 0.1 (2018-03-08) + +- initial release \ No newline at end of file diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..bcb1f71 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,94 @@ +Thanks for using ChoiceModels! + +This is an open source project that's part of the Urban Data Science Toolkit. Development and maintenance are a collaboration between UrbanSim Inc and U.C. Berkeley's Urban Analytics Lab. + +You can contact Sam Maurer, the lead developer, at `maurer@urbansim.com`. + + +## If you have a problem: + +- Take a look at the [open issues](https://github.com/UDST/choicemodels/issues) and [closed issues](https://github.com/UDST/choicemodels/issues?q=is%3Aissue+is%3Aclosed) to see if there's already a related discussion + +- Open a new issue describing the problem -- if possible, include any error messages, the operating system and version of python you're using, and versions of any libraries that may be relevant + + +## Feature proposals: + +- Take a look at the [open issues](https://github.com/UDST/choicemodels/issues) and [closed issues](https://github.com/UDST/choicemodels/issues?q=is%3Aissue+is%3Aclosed) to see if there's already a related discussion + +- Post your proposal as a new issue, so we can discuss it (some proposals may not be a good fit for the project) + + +## Contributing code: + +- Create a new branch of `UDST/choicemodels`, or fork the repository to your own account + +- Make your changes, following the existing styles for code and inline documentation + +- Add [tests](https://github.com/UDST/choicemodels/tree/master/tests) if possible!
+ +- Open a pull request to the `UDST/choicemodels` master branch, including a writeup of your changes -- take a look at some of the closed PRs for examples + +- Current maintainers will review the code, suggest changes, and hopefully merge it! + + +## Updating the version number: + +- Each pull request that changes substantive code should increment the development version number, e.g. from `0.2.dev7` to `0.2.dev8`, so that users know exactly which version they're running + +- It works best to do this just before merging (in case other PRs are merged first, and so you know the release date for the changelog and documentation) + +- There are three places where the version number needs to be changed: + - `setup.py` + - `choicemodels/__init__.py` + - `docs/source/index.rst` + +- Please also add a section to `CHANGELOG.md` describing the changes! + + +## Updating the documentation: + +- See instructions in `docs/README.md` + + +## Preparing a production release: + +- Make a new branch for release prep + +- Update the version number and `CHANGELOG.md` + +- Make sure all the tests are passing, and check if updates are needed to `README.md` or to the documentation + +- Open a pull request to the master branch and merge it + +- Tag the release on Github + + +## Distributing a release on PyPI (for pip installation): + +- Register an account at https://pypi.org, ask one of the current maintainers to add you to the project, and `pip install twine` + +- Run `python setup.py sdist bdist_wheel --universal` + +- This should create a `dist` directory containing two package files -- delete any old ones before the next step + +- Run `twine upload dist/*` -- this will prompt you for your pypi.org credentials + +- Check https://pypi.org/project/choicemodels/ for the new version + + +## Distributing a release on Conda Forge (for conda installation): + +- Make a fork of the [conda-forge/choicemodels-feedstock](https://github.com/conda-forge/choicemodels-feedstock) repository -- there may already be a fork in udst + +- Edit `recipe/meta.yaml`: + - update the version number + - paste a new hash matching the tar.gz file that was uploaded to pypi (it's available on the pypi.org project page) + +- Check that the run requirements still match `requirements.txt` + +- Open a pull request to the `conda-forge/choicemodels-feedstock` master branch + +- Automated tests will run, and after they pass one of the current project maintainers will be able to merge the PR -- you can add your Github user name to the maintainers list in `meta.yaml` for the next update + +- Check https://anaconda.org/conda-forge/choicemodels for the new version (may take a few minutes for it to appear) diff --git a/LICENSE b/LICENSE.txt similarity index 95% rename from LICENSE rename to LICENSE.txt index 15ac3d8..39e65bb 100644 --- a/LICENSE +++ b/LICENSE.txt @@ -1,4 +1,4 @@ -Copyright (c) 2018, Urban Analytics Lab. All rights reserved. +Copyright (c) 2019, Urban Analytics Lab. All rights reserved.
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: diff --git a/MANIFEST.in b/MANIFEST.in index 096dd50..c88f159 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,2 +1,2 @@ -include LICENSE +include LICENSE.txt include requirements.txt diff --git a/README.md b/README.md index 52bab4b..98b145f 100644 --- a/README.md +++ b/README.md @@ -1,44 +1,40 @@ [![Build Status](https://travis-ci.org/UDST/choicemodels.svg?branch=master)](https://travis-ci.org/UDST/choicemodels) [![Coverage Status](https://coveralls.io/repos/github/UDST/choicemodels/badge.svg?branch=master)](https://coveralls.io/github/UDST/choicemodels?branch=master) +[![Docs Status](https://readthedocs.org/projects/choicemodels/badge/?version=latest)](https://choicemodels.readthedocs.io) # ChoiceModels -This is a package for discrete choice model estimation and simulation, with an emphasis on large choice sets and behavioral refinements to multinomial models. Most of these models are not available in Statsmodels or Scikit-learn. +ChoiceModels is a Python library for discrete choice modeling, with utilities for sampling, simulation, and other ancillary tasks. It's part of the [Urban Data Science Toolkit](https://docs.udst.org) (UDST). -The underlying estimation routines come from two main places: (1) the `urbanchoice` codebase, which has been moved into ChoiceModels, and (2) Timothy Brathwaite's PyLogit package, which handles more flexible model specifications. +### Features +The library currently focuses on tools to help integrate discrete choice models into larger workflows, drawing on other packages such as the excellent [PyLogit](https://github.com/timothyb0912/pylogit) for most model estimation. -## Documentation +ChoiceModels can automate the creation of choice tables for estimation or simulation, using uniform or weighted random sampling of alternatives, as well as interaction terms or cartesian merges. -Package documentation is available on [readthedocs](https://choicemodels.readthedocs.io/). +It also provides general-purpose tools for Monte Carlo simulation of choices given probability distributions from fitted models, with fast algorithms for independent or capacity-constrained choices. +ChoiceModels includes a custom engine for Multinomial Logit estimation that's optimized for fast performance with large numbers of alternatives. -## Installation +### Installation -Install with pip: +ChoiceModels can be installed using the Pip or Conda package managers: -`pip install choicemodels` +``` +pip install choicemodels +``` -or with conda-forge. +``` +conda install choicemodels --channel conda-forge +``` +### Documentation -## Current functionality +See the online documentation for much more: https://choicemodels.readthedocs.io -`choicemodels.tools.MergedChoiceTable()` +Some additional documentation is available within the repo in `CHANGELOG.md`, `CONTRIBUTING.md`, `/docs/README.md`, and `/tests/README.md`. -- Generates a merged long-format table of choosers and alternatives. - -`choicemodels.MultinomialLogit()` - -- Fits MNL models, using either the ChoiceModels or PyLogit estimation engines. - -`chociemodels.MultinomialLogitResults()` - -- Stores and reports fitted MNL models. - -There's documentation in these classes' docstrings, and a usage demo in a Jupyter notebook.
- -https://github.com/udst/choicemodels/blob/master/notebooks/Destination-choice-models-02.ipynb +There's discussion of current and planned features in the [Pull requests](https://github.com/udst/choicemodels/pulls?utf8=✓&q=is%3Apr) and [Issues](https://github.com/udst/choicemodels/issues?utf8=✓&q=is%3Aissue), both open and closed. diff --git a/choicemodels/__init__.py b/choicemodels/__init__.py index 2711294..f1505de 100644 --- a/choicemodels/__init__.py +++ b/choicemodels/__init__.py @@ -3,4 +3,4 @@ from .mnl import MultinomialLogit, MultinomialLogitResults -version = __version__ = '0.2.dev7' +version = __version__ = '0.2.1' diff --git a/choicemodels/mnl.py b/choicemodels/mnl.py index 9235d4b..552d42b 100644 --- a/choicemodels/mnl.py +++ b/choicemodels/mnl.py @@ -76,9 +76,6 @@ class MultinomialLogit(object): and the alternatives. Attributes of a particular alternative may vary for different choosers (distance, for example), but this must be set up manually in the input data. - [TO DO: comparison of the estimation engines] - [TO DO: testing and input validation] - Note that prediction methods are in a separate class: see MultinomialLogitResults(). Parameters @@ -250,7 +247,7 @@ class MultinomialLogitResults(object): If not provided, these will be extracted from the raw results. estimation_engine : str, optional - 'ChoiceModels' (default) or 'PyLogit'. # TO DO - infer from model_expression? + 'ChoiceModels' (default) or 'PyLogit'. """ def __init__(self, model_expression, results=None, fitted_parameters=None, @@ -287,11 +284,6 @@ def probabilities(self, data): Generate predicted probabilities for a table of choice scenarios, using the fitted parameters stored in the results object. - TO DO - make sure this handles pylogit case - - TO DO - does MergedChoiceTable guarantee that alternatives for a single scenario - are consecutive? seems like a requirement here; should document it - Parameters ---------- data : choicemodels.tools.MergedChoiceTable @@ -307,6 +299,11 @@ def probabilities(self, data): pandas.Series with indexes matching the input """ + # TO DO - make sure this handles pylogit case + + # TO DO - does MergedChoiceTable guarantee that alternatives for a single scenario + # are consecutive? seems like a requirement here; should document it + df = data.to_frame() numalts = data.sample_size # TO DO - make this an official MCT param diff --git a/choicemodels/tools/__init__.py b/choicemodels/tools/__init__.py index a4e3e3a..d8d1769 100644 --- a/choicemodels/tools/__init__.py +++ b/choicemodels/tools/__init__.py @@ -1,5 +1,6 @@ # ChoiceModels # See full license in LICENSE +from .distancematrix import * from .mergedchoicetable import * from .simulation import * \ No newline at end of file diff --git a/choicemodels/tools/mergedchoicetable.py b/choicemodels/tools/mergedchoicetable.py index 378a7df..7a19fd9 100644 --- a/choicemodels/tools/mergedchoicetable.py +++ b/choicemodels/tools/mergedchoicetable.py @@ -133,15 +133,11 @@ def __init__(self, observations, alternatives, chosen_alternatives=None, # Check for duplicate column names obs_cols = list(observations.columns) + list(observations.index.names) alt_cols = list(alternatives.columns) + list(alternatives.index.names) - dupes = [c for c in obs_cols if c in alt_cols] - - if len(dupes) == 1: - raise ValueError("Column '{}' appears in both input tables. Please ensure " - "column names are unique before merging".format(dupes[0])) - elif len(dupes) > 1: - raise ValueError("Columns '{}' appear in both input tables. 
Please ensure " "column names are unique before merging"\ .format("', '".join(dupes))) + dupes = set(obs_cols) & set(alt_cols) + + if len(dupes) > 0: + raise ValueError("Both input tables contain column {}. Please ensure " + "column names are unique before merging".format(dupes)) # Normalize weights to a pd.Series if (weights is not None) & isinstance(weights, str): diff --git a/choicemodels/tools/simulation.py b/choicemodels/tools/simulation.py index 2568959..925f332 100644 --- a/choicemodels/tools/simulation.py +++ b/choicemodels/tools/simulation.py @@ -142,9 +142,9 @@ def iterative_lottery_choices(choosers, alternatives, mct_callable, probs_callab all choosers are matched or no alternatives remain. chooser_batch_size : int or None, optional - Size of the batches for processing smaller groups of choosers one at a time. Useful - when the anticipated size of the merged choice tables (choosers X alternatives - X covariates) will be too large for python/pandas to handle. + Size of the batches for processing smaller groups of choosers one at a time. + Useful when the anticipated size of the merged choice tables (choosers X + alternatives X covariates) will be too large for python/pandas to handle. Returns ------- @@ -177,6 +177,10 @@ def iterative_lottery_choices(choosers, alternatives, mct_callable, probs_callab if max_iter is not None: if (iter > max_iter): break + if alts[capacity].max() < choosers[size].min(): + print("{} choosers cannot be allocated.".format(len(choosers))) + print("\nRemaining capacity on alternatives, but not enough to accommodate choosers' sizes") + break if chooser_batch_size is None or chooser_batch_size > len(choosers): mct = mct_callable(choosers.sample(frac=1), alts) else: diff --git a/data/.gitignore b/data/.gitignore deleted file mode 100644 index ca893bc..0000000 --- a/data/.gitignore +++ /dev/null @@ -1,4 +0,0 @@ -*.zip -*.csv -*.pdf -tl_2010_06_tract10 \ No newline at end of file diff --git a/data/README.md b/data/README.md deleted file mode 100644 index 44c779a..0000000 --- a/data/README.md +++ /dev/null @@ -1,9 +0,0 @@ -The demo notebooks use data from the 2010-2012 California Household Travel Survey. - -Information about the survey: http://www.dot.ca.gov/hq/tpp/offices/omsp/statewide_travel_analysis/chts.html - -Data download: https://www.nrel.gov/transportation/secure-transportation-data.html - -The data is open access, but you will need to fill out a registration form. Download the file named `caltrans_full_survey.zip`, 233.2 MB, and place it in this directory. - -You can download the California census tracts shapefile `tl_2010_06_tract10` from: https://www.census.gov/cgi-bin/geo/shapefiles/index.php?year=2010&layergroup=Census+Tracts diff --git a/docs/Makefile b/docs/Makefile deleted file mode 100644 index 385fa29..0000000 --- a/docs/Makefile +++ /dev/null @@ -1,20 +0,0 @@ -# Minimal makefile for Sphinx documentation -# - -# You can set these variables from the command line. -SPHINXOPTS = -SPHINXBUILD = python -msphinx -SPHINXPROJ = ChoiceModels -SOURCEDIR = source -BUILDDIR = build - -# Put it first so that "make" without argument is like "make help". -help: - @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) - -.PHONY: help Makefile - -# Catch-all target: route all unknown targets to Sphinx using the new -# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
-%: Makefile - @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) \ No newline at end of file diff --git a/docs/README.md b/docs/README.md new file mode 100644 index 0000000..a52d67a --- /dev/null +++ b/docs/README.md @@ -0,0 +1,29 @@ +This folder generates the ChoiceModels online documentation, hosted at https://choicemodels.readthedocs.io and https://docs.udst.org. + + +### How it works + +Read the Docs builds and hosts the Sphinx pages defined in `docs/source/`. Builds are triggered when the master branch of this Github repo changes. + + +### Updating the documentation + +The pages are built using the Sphinx documentation generator. Here's a [good tutorial](https://pythonhosted.org/an_example_pypi_project/sphinx.html). Edit the `.rst` files and `conf.py` to change what appears in the rendered documentation. + + +### Previewing changes locally + +Install `sphinx` and `sphinx_rtd_theme`. + +From the `docs` directory, run `sphinx-build -b html source build` to build the documentation. HTML files will show up in `docs/build/`. + +The build files won't be committed to the repo; Read the Docs will create a separate copy when the master branch is updated. + + +### Hosting and troubleshooting + +Hosting settings are at https://readthedocs.org/projects/choicemodels/. If you don't have access, create a Read the Docs account and ask one of the existing maintainers to add you. + +The DNS settings for the docs.udst.org subdomain contain a CNAME record redirecting traffic to readthedocs.io. Read the Docs handles the SSL certificate for https. + +The docs.udst.org landing page is generated by the `UDST/udst-docs` repository, with hosting settings at https://readthedocs.org/projects/udst-docs/. diff --git a/docs/environment.yml b/docs/environment.yml deleted file mode 100644 index 0d95b48..0000000 --- a/docs/environment.yml +++ /dev/null @@ -1,16 +0,0 @@ -name: CHOICEMODELS_DOCS - -channels: - - defaults - - conda-forge - -dependencies: - - python=3 - - future - - numpy - - pandas - - patsy - - pylogit - - scipy - - statsmodels - \ No newline at end of file diff --git a/docs/make.bat b/docs/make.bat deleted file mode 100644 index 6f9d24b..0000000 --- a/docs/make.bat +++ /dev/null @@ -1,36 +0,0 @@ -@ECHO OFF - -pushd %~dp0 - -REM Command file for Sphinx documentation - -if "%SPHINXBUILD%" == "" ( - set SPHINXBUILD=python -msphinx -) -set SOURCEDIR=source -set BUILDDIR=build -set SPHINXPROJ=ChoiceModels - -if "%1" == "" goto help - -%SPHINXBUILD% >NUL 2>NUL -if errorlevel 9009 ( - echo. - echo.The Sphinx module was not found. Make sure you have Sphinx installed, - echo.then set the SPHINXBUILD environment variable to point to the full - echo.path of the 'sphinx-build' executable. Alternatively you may add the - echo.Sphinx directory to PATH. - echo. - echo.If you don't have Sphinx installed, grab it from - echo.http://sphinx-doc.org/ - exit /b 1 -) - -%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% -goto end - -:help -%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% - -:end -popd diff --git a/docs/source/choice-table-utilities.rst b/docs/source/choice-table-utilities.rst new file mode 100644 index 0000000..2917850 --- /dev/null +++ b/docs/source/choice-table-utilities.rst @@ -0,0 +1,29 @@ +Choice table utilities API +========================== + +Working with discrete choice models can require a lot of data preparation. 
Each chooser has to be matched with hypothetical alternatives, either to simulate choice probabilities or to compare them with the chosen alternative for model estimation. + +ChoiceModels includes a class called ``MergedChoiceTable`` that automates this. To build a merged table, create an instance of the class and pass it one ``pd.DataFrame`` of choosers and another of alternatives, with whatever other arguments are needed (see below for full API). + +The merged data table can be output to a DataFrame, or passed directly to other ChoiceModels tools as a ``MergedChoiceTable`` object. (This retains metadata about indexes and other special columns.) + +.. code-block:: python + + mct = choicemodels.tools.MergedChoiceTable(obs, alts, ..) + df = mct.to_frame() + +This tool is designed especially for models that need to sample from large numbers of alternatives. It supports: + +- uniform random sampling of alternatives, with or without replacement +- weighted random sampling based either on characteristics of the alternatives or on combinations of chooser and alternative +- interaction terms to be merged onto the final data table +- cartesian merging of all the choosers with all the alternatives, without sampling + +All of the sampling procedures work for both estimation (where the chosen alternative is known) and simulation (where it is not). + + +MergedChoiceTable +----------------- + +.. autoclass:: choicemodels.tools.MergedChoiceTable + :members: \ No newline at end of file diff --git a/docs/source/choicemodels.rst b/docs/source/choicemodels.rst deleted file mode 100644 index 13d1355..0000000 --- a/docs/source/choicemodels.rst +++ /dev/null @@ -1,54 +0,0 @@ -choicemodels package -==================== - -Submodules ----------- - -choicemodels.choicemodels module --------------------------------- - -.. automodule:: choicemodels.choicemodels - :members: - :undoc-members: - :show-inheritance: - -choicemodels.mnl module ------------------------ - -.. automodule:: choicemodels.mnl - :members: - :undoc-members: - :show-inheritance: - -choicemodels.tools.distancematrix module ----------------------------------------- - -.. automodule:: choicemodels.tools.distancematrix - :members: - :undoc-members: - :show-inheritance: - -choicemodels.tools.interaction module -------------------------------------- - -.. automodule:: choicemodels.tools.interaction - :members: - :undoc-members: - :show-inheritance: - -choicemodels.tools.pmat module ------------------------------- - -.. automodule:: choicemodels.tools.pmat - :members: - :undoc-members: - :show-inheritance: - - -Module contents ---------------- - -.. automodule:: choicemodels - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/conf.py b/docs/source/conf.py index d8c3503..704bfec 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -23,6 +23,8 @@ # go up two levels from /docs/source to the package root sys.path.insert(0, os.path.abspath('../..')) +import sphinx_rtd_theme + # -- General configuration ------------------------------------------------ @@ -34,7 +36,7 @@ # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = ['sphinx.ext.autodoc', - #'sphinx.ext.napoleon', + 'sphinx.ext.napoleon', 'sphinx.ext.viewcode', 'sphinx.ext.githubpages'] @@ -52,7 +54,7 @@ # General information about the project. 
project = 'ChoiceModels' -copyright = '2018, Urban Data Science Toolkit' +copyright = '2019, Urban Data Science Toolkit' author = 'Urban Data Science Toolkit' # The version info for the project you're documenting, acts as replacement for @@ -90,7 +92,7 @@ # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # -html_theme = 'alabaster' +html_theme = 'sphinx_rtd_theme' # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the @@ -103,21 +105,6 @@ # so a file named "default.css" will overwrite the builtin "default.css". html_static_path = ['_static'] -# Custom sidebar templates, must be a dictionary that maps document names -# to template names. -# -# This is required for the alabaster theme -# refs: http://alabaster.readthedocs.io/en/latest/installation.html#sidebars -html_sidebars = { - '**': [ - 'about.html', - 'navigation.html', - 'relations.html', # needs 'show_related': True theme option to display - 'searchbox.html', - 'donate.html', - ] -} - # -- Options for HTMLHelp output ------------------------------------------ diff --git a/docs/source/distance-utilities.rst b/docs/source/distance-utilities.rst new file mode 100644 index 0000000..378705f --- /dev/null +++ b/docs/source/distance-utilities.rst @@ -0,0 +1,20 @@ +Distance utilities API +====================== + +ChoiceModels also includes tools for constructing pairwise distance matrices and calculating which geographies are within various distance bands of some reference geography. + + +Distance matrices +----------------- + +.. autofunction:: choicemodels.tools.great_circle_distance_matrix + +.. autofunction:: choicemodels.tools.euclidean_distance_matrix + +.. autofunction:: choicemodels.tools.distance_matrix + + +Distance bands +-------------- + +.. autofunction:: choicemodels.tools.distance_bands diff --git a/docs/source/getting-started.rst b/docs/source/getting-started.rst new file mode 100644 index 0000000..4981d84 --- /dev/null +++ b/docs/source/getting-started.rst @@ -0,0 +1,99 @@ +Getting started +=============== + +Intro +----- + +ChoiceModels is a Python library for discrete choice modeling, with utilities for sampling, simulation, and other ancillary tasks. It's part of the `Urban Data Science Toolkit <https://docs.udst.org>`__ (UDST). + +The library currently focuses on tools to help integrate discrete choice models into larger workflows, drawing on other packages such as the excellent `PyLogit <https://github.com/timothyb0912/pylogit>`__ for most model estimation. ChoiceModels can automate the creation of choice tables for estimation or simulation, using uniform or weighted random sampling of alternatives, as well as interaction terms or cartesian merges. It also provides general-purpose tools for Monte Carlo simulation of choices given probability distributions from fitted models, with fast algorithms for independent or capacity-constrained choices. ChoiceModels includes a custom engine for Multinomial Logit estimation that's optimized for fast performance with large numbers of alternatives. + +ChoiceModels is `hosted on Github <https://github.com/UDST/choicemodels>`__ with a BSD 3-Clause open source license. The code repository includes some material not found in this documentation: a `change log <https://github.com/UDST/choicemodels/blob/master/CHANGELOG.md>`__, a `contributor's guide <https://github.com/UDST/choicemodels/blob/master/CONTRIBUTING.md>`__, and instructions for `running the tests <https://github.com/UDST/choicemodels/blob/master/tests/README.md>`__, `updating the documentation <https://github.com/UDST/choicemodels/blob/master/docs/README.md>`__, and `creating a new release <https://github.com/UDST/choicemodels/blob/master/CONTRIBUTING.md>`__. Another useful resource is the `issues <https://github.com/UDST/choicemodels/issues>`__ and `pull requests <https://github.com/UDST/choicemodels/pulls>`__ on Github, which include detailed feature proposals and other discussions.
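A minimal sketch may help make the capacity-constrained workflow concrete, since it combines the merged-table, probability, and lottery utilities described in this Intro. The sketch assumes ``choosers`` and ``alternatives`` DataFrames and a fitted ``results`` object already exist; the helper bodies and ``sample_size=10`` are illustrative, and the capacity and size options are left at their defaults (see the ``iterative_lottery_choices()`` docstring for those parameters).

.. code-block:: python

    import choicemodels

    def mct_callable(obs, alts):
        # Build a fresh merged table for the choosers and alternatives
        # still in play; sample_size=10 is an arbitrary illustration
        return choicemodels.tools.MergedChoiceTable(obs, alts, sample_size=10)

    def probs_callable(mct):
        # 'results' is a fitted MultinomialLogitResults object, assumed
        # to exist from an earlier estimation step
        return results.probabilities(mct)

    choices = choicemodels.tools.iterative_lottery_choices(
        choosers, alternatives, mct_callable, probs_callable)

Passing callables rather than fixed tables lets the lottery re-merge and re-score only the choosers and alternatives that remain as capacity is used up.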
+ +ChoiceModels was created in 2016, with contributions from Sam Maurer (maurer@urbansim.com), Timothy Brathwaite, Geoff Boeing, Paul Waddell, Max Gardner, Eddie Janowicz, Arezoo Besharati Zadeh, Jacob Finkelman, Catalina Vanoli, and others. It includes earlier code written by Matt Davis, Fletcher Foti, and Paul Sohn. + + +Installation +------------ + +ChoiceModels is tested with Python 2.7, 3.5, 3.6, and 3.7. It should run on any platform. + + +Production releases +~~~~~~~~~~~~~~~~~~~ + +ChoiceModels can be installed using the Pip or Conda package managers. We recommend Conda because it resolves dependency conflicts better. + +.. code-block:: python + + pip install choicemodels + +.. code-block:: python + + conda install choicemodels --channel conda-forge + + +When new production releases of ChoiceModels come out, you can upgrade like this: + +.. code-block:: python + + pip install choicemodels --upgrade + +.. code-block:: python + + conda update choicemodels --channel conda-forge + + +Developer pre-releases +~~~~~~~~~~~~~~~~~~~~~~ + +Developer pre-releases of ChoiceModels can be installed using the Github URL. Additional information about the developer releases can be found in Github `pull requests <https://github.com/UDST/choicemodels/pulls>`__. + +.. code-block:: python + + pip install git+git://github.com/udst/choicemodels.git + +You can use the same command to upgrade. + + +Cloning the repository +~~~~~~~~~~~~~~~~~~~~~~ + +You can also install ChoiceModels by cloning the Github repository, which is the best way to do it if you'll be modifying the code. The main branch contains the latest developer release. + +.. code-block:: python + + git clone https://github.com/udst/choicemodels.git + cd choicemodels + python setup.py develop + +Update it with ``git pull``. + + +Basic usage +----------- + +You can use components of ChoiceModels individually, or combine them to streamline model estimation and simulation workflows. Other UDST libraries like UrbanSim Templates use ChoiceModels objects as inputs and outputs. + +If you have choosers and alternatives as Pandas DataFrames, you can prepare them for model estimation like this: + +.. code-block:: python + + mct = choicemodels.tools.MergedChoiceTable(obs, alts, chosen_alternatives='chosen', + sample_size=10, ..) + +Then, you can estimate a Multinomial Logit model like this: + +.. code-block:: python + + results = choicemodels.MultinomialLogit(mct, model_expression='x1 + x2 + x3').fit() + +This provides a ``choicemodels.MultinomialLogitResults`` object, from which you can obtain probability distributions for out-of-sample choice scenarios in order to generate simulated choices. + +.. code-block:: python + + mct2 = choicemodels.tools.MergedChoiceTable(obs2, alts, sample_size=10, ..) + probs = results.probabilities(mct2) + choices = choicemodels.tools.monte_carlo_choices(probs) + + diff --git a/docs/source/index.rst b/docs/source/index.rst index def04e7..c03f19b 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -3,19 +3,22 @@ You can adapt this file completely to your liking, but it should at least contain the root `toctree` directive. -Welcome to ChoiceModels's documentation! -======================================== - -.. toctree:: - :maxdepth: 2 - :caption: Contents: +ChoiceModels +============ +ChoiceModels is a Python library for discrete choice modeling, with utilities for sampling, simulation, and other ancillary tasks. It's part of the `Urban Data Science Toolkit <https://docs.udst.org>`__ (UDST).
+v0.2.1, released January 30, 2019 -Indices and tables -================== +Contents +-------- -* :ref:`genindex` -* :ref:`modindex` -* :ref:`search` +.. toctree:: + :maxdepth: 2 + + getting-started + choice-table-utilities + multinomial-logit + simulation-utilities + distance-utilities diff --git a/docs/source/multinomial-logit.rst b/docs/source/multinomial-logit.rst new file mode 100644 index 0000000..2200e80 --- /dev/null +++ b/docs/source/multinomial-logit.rst @@ -0,0 +1,20 @@ +Multinomial Logit API +===================== + +ChoiceModels has built-in functionality for Multinomial Logit estimation and simulation. This can use either the `PyLogit <https://github.com/timothyb0912/pylogit>`__ MNL estimation engine or a custom engine optimized for fast performance with large numbers of alternatives. The custom engine is originally from ``urbansim.urbanchoice``. + +Fitting a model yields a results object that can generate choice probabilities for out-of-sample scenarios. + + +MultinomialLogit +---------------- + +.. autoclass:: choicemodels.MultinomialLogit + :members: + + +MultinomialLogitResults +----------------------- + +.. autoclass:: choicemodels.MultinomialLogitResults + :members: diff --git a/docs/source/simulation-utilities.rst b/docs/source/simulation-utilities.rst index 484b86c..4233d8b 100644 --- a/docs/source/simulation-utilities.rst +++ b/docs/source/simulation-utilities.rst @@ -9,7 +9,6 @@ ChoiceModels provides general-purpose tools for Monte Carlo simulation of choice ``parallel_lottery_choices()`` works functionally the same as the above but the batches run in parallel rather than sequentially. - Independent choices ------------------- @@ -19,10 +18,11 @@ Independent choices Capacity-constrained choices ---------------------------- + .. autofunction:: choicemodels.tools.iterative_lottery_choices Parallelized capacity-constrained choices ---------------------------- -.. autofunction:: choicemodels.tools.parallel_lottery_choices \ No newline at end of file +.. autofunction:: choicemodels.tools.parallel_lottery_choices diff --git a/ez_setup.py b/ez_setup.py deleted file mode 100644 index 3762bd8..0000000 --- a/ez_setup.py +++ /dev/null @@ -1,391 +0,0 @@ -#!/usr/bin/env python - -""" -Setuptools bootstrapping installer. - -Run this script to install or upgrade setuptools. -""" - -import os -import shutil -import sys -import tempfile -import zipfile -import optparse -import subprocess -import platform -import textwrap -import contextlib -import warnings - -from distutils import log - -try: - from urllib.request import urlopen -except ImportError: - from urllib2 import urlopen - -try: - from site import USER_SITE -except ImportError: - USER_SITE = None - -DEFAULT_VERSION = "18.0.1" -DEFAULT_URL = "https://pypi.python.org/packages/source/s/setuptools/" -DEFAULT_SAVE_DIR = os.curdir - - -def _python_cmd(*args): - """ - Execute a command. - - Return True if the command succeeded.
- """ - args = (sys.executable,) + args - return subprocess.call(args) == 0 - - -def _install(archive_filename, install_args=()): - """Install Setuptools.""" - with archive_context(archive_filename): - # installing - log.warn('Installing Setuptools') - if not _python_cmd('setup.py', 'install', *install_args): - log.warn('Something went wrong during the installation.') - log.warn('See the error message above.') - # exitcode will be 2 - return 2 - - -def _build_egg(egg, archive_filename, to_dir): - """Build Setuptools egg.""" - with archive_context(archive_filename): - # building an egg - log.warn('Building a Setuptools egg in %s', to_dir) - _python_cmd('setup.py', '-q', 'bdist_egg', '--dist-dir', to_dir) - # returning the result - log.warn(egg) - if not os.path.exists(egg): - raise IOError('Could not build the egg.') - - -class ContextualZipFile(zipfile.ZipFile): - - """Supplement ZipFile class to support context manager for Python 2.6.""" - - def __enter__(self): - return self - - def __exit__(self, type, value, traceback): - self.close() - - def __new__(cls, *args, **kwargs): - """Construct a ZipFile or ContextualZipFile as appropriate.""" - if hasattr(zipfile.ZipFile, '__exit__'): - return zipfile.ZipFile(*args, **kwargs) - return super(ContextualZipFile, cls).__new__(cls) - - -@contextlib.contextmanager -def archive_context(filename): - """ - Unzip filename to a temporary directory, set to the cwd. - - The unzipped target is cleaned up after. - """ - tmpdir = tempfile.mkdtemp() - log.warn('Extracting in %s', tmpdir) - old_wd = os.getcwd() - try: - os.chdir(tmpdir) - with ContextualZipFile(filename) as archive: - archive.extractall() - - # going in the directory - subdir = os.path.join(tmpdir, os.listdir(tmpdir)[0]) - os.chdir(subdir) - log.warn('Now working in %s', subdir) - yield - - finally: - os.chdir(old_wd) - shutil.rmtree(tmpdir) - - -def _do_download(version, download_base, to_dir, download_delay): - """Download Setuptools.""" - egg = os.path.join(to_dir, 'setuptools-%s-py%d.%d.egg' - % (version, sys.version_info[0], sys.version_info[1])) - if not os.path.exists(egg): - archive = download_setuptools(version, download_base, - to_dir, download_delay) - _build_egg(egg, archive, to_dir) - sys.path.insert(0, egg) - - # Remove previously-imported pkg_resources if present (see - # https://bitbucket.org/pypa/setuptools/pull-request/7/ for details). - if 'pkg_resources' in sys.modules: - del sys.modules['pkg_resources'] - - import setuptools - setuptools.bootstrap_install_from = egg - - -def use_setuptools( - version=DEFAULT_VERSION, download_base=DEFAULT_URL, - to_dir=DEFAULT_SAVE_DIR, download_delay=15): - """ - Ensure that a setuptools version is installed. - - Return None. Raise SystemExit if the requested version - or later cannot be installed. - """ - to_dir = os.path.abspath(to_dir) - - # prior to importing, capture the module state for - # representative modules. 
- rep_modules = 'pkg_resources', 'setuptools' - imported = set(sys.modules).intersection(rep_modules) - - try: - import pkg_resources - pkg_resources.require("setuptools>=" + version) - # a suitable version is already installed - return - except ImportError: - # pkg_resources not available; setuptools is not installed; download - pass - except pkg_resources.DistributionNotFound: - # no version of setuptools was found; allow download - pass - except pkg_resources.VersionConflict as VC_err: - if imported: - _conflict_bail(VC_err, version) - - # otherwise, unload pkg_resources to allow the downloaded version to - # take precedence. - del pkg_resources - _unload_pkg_resources() - - return _do_download(version, download_base, to_dir, download_delay) - - -def _conflict_bail(VC_err, version): - """ - Setuptools was imported prior to invocation, so it is - unsafe to unload it. Bail out. - """ - conflict_tmpl = textwrap.dedent(""" - The required version of setuptools (>={version}) is not available, - and can't be installed while this script is running. Please - install a more recent version first, using - 'easy_install -U setuptools'. - - (Currently using {VC_err.args[0]!r}) - """) - msg = conflict_tmpl.format(**locals()) - sys.stderr.write(msg) - sys.exit(2) - - -def _unload_pkg_resources(): - del_modules = [ - name for name in sys.modules - if name.startswith('pkg_resources') - ] - for mod_name in del_modules: - del sys.modules[mod_name] - - -def _clean_check(cmd, target): - """ - Run the command to download target. - - If the command fails, clean up before re-raising the error. - """ - try: - subprocess.check_call(cmd) - except subprocess.CalledProcessError: - if os.access(target, os.F_OK): - os.unlink(target) - raise - - -def download_file_powershell(url, target): - """ - Download the file at url to target using Powershell. - - Powershell will validate trust. - Raise an exception if the command cannot complete. 
- """ - target = os.path.abspath(target) - ps_cmd = ( - "[System.Net.WebRequest]::DefaultWebProxy.Credentials = " - "[System.Net.CredentialCache]::DefaultCredentials; " - "(new-object System.Net.WebClient).DownloadFile(%(url)r, %(target)r)" - % vars() - ) - cmd = [ - 'powershell', - '-Command', - ps_cmd, - ] - _clean_check(cmd, target) - - -def has_powershell(): - """Determine if Powershell is available.""" - if platform.system() != 'Windows': - return False - cmd = ['powershell', '-Command', 'echo test'] - with open(os.path.devnull, 'wb') as devnull: - try: - subprocess.check_call(cmd, stdout=devnull, stderr=devnull) - except Exception: - return False - return True -download_file_powershell.viable = has_powershell - - -def download_file_curl(url, target): - cmd = ['curl', url, '--silent', '--output', target] - _clean_check(cmd, target) - - -def has_curl(): - cmd = ['curl', '--version'] - with open(os.path.devnull, 'wb') as devnull: - try: - subprocess.check_call(cmd, stdout=devnull, stderr=devnull) - except Exception: - return False - return True -download_file_curl.viable = has_curl - - -def download_file_wget(url, target): - cmd = ['wget', url, '--quiet', '--output-document', target] - _clean_check(cmd, target) - - -def has_wget(): - cmd = ['wget', '--version'] - with open(os.path.devnull, 'wb') as devnull: - try: - subprocess.check_call(cmd, stdout=devnull, stderr=devnull) - except Exception: - return False - return True -download_file_wget.viable = has_wget - - -def download_file_insecure(url, target): - """Use Python to download the file, without connection authentication.""" - src = urlopen(url) - try: - # Read all the data in one block. - data = src.read() - finally: - src.close() - - # Write all the data in one block to avoid creating a partial file. - with open(target, "wb") as dst: - dst.write(data) -download_file_insecure.viable = lambda: True - - -def get_best_downloader(): - downloaders = ( - download_file_powershell, - download_file_curl, - download_file_wget, - download_file_insecure, - ) - viable_downloaders = (dl for dl in downloaders if dl.viable()) - return next(viable_downloaders, None) - - -def download_setuptools( - version=DEFAULT_VERSION, download_base=DEFAULT_URL, - to_dir=DEFAULT_SAVE_DIR, delay=15, - downloader_factory=get_best_downloader): - """ - Download setuptools from a specified location and return its filename. - - `version` should be a valid setuptools version number that is available - as an sdist for download under the `download_base` URL (which should end - with a '/'). `to_dir` is the directory where the egg will be downloaded. - `delay` is the number of seconds to pause before an actual download - attempt. - - ``downloader_factory`` should be a function taking no arguments and - returning a function for downloading a URL to a target. - """ - # making sure we use the absolute path - to_dir = os.path.abspath(to_dir) - zip_name = "setuptools-%s.zip" % version - url = download_base + zip_name - saveto = os.path.join(to_dir, zip_name) - if not os.path.exists(saveto): # Avoid repeated downloads - log.warn("Downloading %s", url) - downloader = downloader_factory() - downloader(url, saveto) - return os.path.realpath(saveto) - - -def _build_install_args(options): - """ - Build the arguments to 'python setup.py install' on the setuptools package. - - Returns list of command line arguments. 
- """ - return ['--user'] if options.user_install else [] - - -def _parse_args(): - """Parse the command line for options.""" - parser = optparse.OptionParser() - parser.add_option( - '--user', dest='user_install', action='store_true', default=False, - help='install in user site package (requires Python 2.6 or later)') - parser.add_option( - '--download-base', dest='download_base', metavar="URL", - default=DEFAULT_URL, - help='alternative URL from where to download the setuptools package') - parser.add_option( - '--insecure', dest='downloader_factory', action='store_const', - const=lambda: download_file_insecure, default=get_best_downloader, - help='Use internal, non-validating downloader' - ) - parser.add_option( - '--version', help="Specify which version to download", - default=DEFAULT_VERSION, - ) - parser.add_option( - '--to-dir', - help="Directory to save (and re-use) package", - default=DEFAULT_SAVE_DIR, - ) - options, args = parser.parse_args() - # positional arguments are ignored - return options - - -def _download_args(options): - """Return args for download_setuptools function from cmdline args.""" - return dict( - version=options.version, - download_base=options.download_base, - downloader_factory=options.downloader_factory, - to_dir=options.to_dir, - ) - - -def main(): - """Install or upgrade setuptools and EasyInstall.""" - options = _parse_args() - archive = download_setuptools(**_download_args(options)) - return _install(archive, _build_install_args(options)) - -if __name__ == '__main__': - sys.exit(main()) diff --git a/notebooks/CHTS-exploration-02.ipynb b/notebooks/CHTS-exploration-02.ipynb deleted file mode 100644 index 41bd82a..0000000 --- a/notebooks/CHTS-exploration-02.ipynb +++ /dev/null @@ -1,2164 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Exploring the public CHTS data\n", - "\n", - "Sam Maurer, August 2017 | Python 3.6\n", - "\n", - "Original version June 2017 (v01) \n", - "Updated Aug 2017 (v02) to look into data type issues in the raw data" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "%matplotlib inline\n", - "\n", - "import matplotlib\n", - "import numpy as np\n", - "import pandas as pd\n", - "import zipfile" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "# See ../data/README.md for instructions about how to get the data\n", - "\n", - "z = zipfile.ZipFile('../data/caltrans_full_survey.zip')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Households\n", - "\n", - "Households that participated in the travel diary survey" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "42426" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "households = pd.read_csv(z.open('caltrans_full_survey/survey_households.csv'), low_memory=False)\n", - "\n", - "len(households)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "9715" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Limit to the Bay Area\n", - "\n", - "households_ba = households[households.home_county_id.isin([1, 13, 41, 55, 75, 81, 85, 95, 
97])]\n", - "\n", - "len(households_ba)" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "SAN FRANCISCO 1076\n", - "SAN JOSE 939\n", - "OAKLAND 459\n", - "SANTA ROSA 321\n", - "BERKELEY 251\n", - "NAPA 228\n", - "PALO ALTO 218\n", - "SUNNYVALE 200\n", - "SAN MATEO 197\n", - "FREMONT 177\n", - "WALNUT CREEK 173\n", - "REDWOOD CITY 170\n", - "FAIRFIELD 159\n", - "CONCORD 158\n", - "SAN RAFAEL 158\n", - "Name: home_city, dtype: int64" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Top home locations\n", - "\n", - "households_ba.home_city.value_counts()[:15]" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "count 42421.000000\n", - "mean 2.571462\n", - "std 1.373733\n", - "min 1.000000\n", - "25% 2.000000\n", - "50% 2.000000\n", - "75% 3.000000\n", - "max 8.000000\n", - "Name: persons_count, dtype: float64" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "households.persons_count.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "count 42421.000000\n", - "mean 0.999955\n", - "std 0.704667\n", - "min 0.003498\n", - "25% 0.447392\n", - "50% 0.915924\n", - "75% 1.376790\n", - "max 5.400840\n", - "Name: hhwgt, dtype: float64" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "households.hhwgt.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "count 42421.000000\n", - "mean 293.007784\n", - "std 206.482227\n", - "min 1.025146\n", - "25% 131.095416\n", - "50% 268.385115\n", - "75% 403.428487\n", - "max 1582.559559\n", - "Name: exphhwgt, dtype: float64" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "households.exphhwgt.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "count 4.242100e+04\n", - "mean 6.056293e+09\n", - "std 2.944557e+07\n", - "min 6.001400e+09\n", - "25% 6.037207e+09\n", - "50% 6.059042e+09\n", - "75% 6.079011e+09\n", - "max 6.115041e+09\n", - "Name: home_tract_id, dtype: float64" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "households.home_tract_id.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Persons" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "109113" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "persons = pd.read_csv(z.open('caltrans_full_survey/survey_person.csv'), low_memory=False)\n", - "\n", - "len(persons)" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", 
- "text": [ - " sampno perno travel_date gender relation education race1\n", - "0 7128119 1 2013-01-27 1 1 6 1.0\n", - "1 7128119 3 2013-01-27 2 3 1 1.0\n", - "2 7128138 1 2012-11-05 2 1 5 1.0\n", - "3 7128262 1 2012-12-21 2 1 1 1.0\n", - "4 7128262 3 2012-12-21 2 3 2 1.0\n", - "5 7128262 2 2012-12-21 1 2 1 1.0\n", - "6 7128288 2 2013-01-22 1 3 3 1.0\n", - "7 7128288 1 2013-01-22 2 1 5 1.0\n", - "8 7128316 1 2012-12-29 2 1 4 1.0\n", - "9 7128372 1 2012-12-29 2 1 6 1.0\n" - ] - } - ], - "source": [ - "print(persons[['sampno', 'perno', 'travel_date', 'gender', 'relation', \n", - " 'education', 'race1']].head(10))" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "count 108776.000000\n", - "mean 3.233838\n", - "std 2.954577\n", - "min 0.000000\n", - "25% 1.000000\n", - "50% 2.000000\n", - "75% 5.000000\n", - "max 33.000000\n", - "Name: person_trips, dtype: float64" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# What is `person_trips`? -- not sure, but it looks related to the `tripno` field\n", - "\n", - "persons.person_trips.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "count 4.311100e+04\n", - "mean 6.241008e+09\n", - "std 3.120183e+09\n", - "min 2.614000e+03\n", - "25% 6.037238e+09\n", - "50% 6.059064e+09\n", - "75% 6.079011e+09\n", - "max 1.000000e+11\n", - "Name: empl_tract_id, dtype: float64" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "persons.empl_tract_id.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "count 109113\n", - "unique 2\n", - "top False\n", - "freq 66002\n", - "Name: empl_tract_id, dtype: object" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "persons.empl_tract_id.notnull().describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "count 2.543800e+04\n", - "mean 6.342777e+09\n", - "std 3.678070e+09\n", - "min 4.005001e+09\n", - "25% 6.037233e+09\n", - "50% 6.059063e+09\n", - "75% 6.079010e+09\n", - "max 1.000000e+11\n", - "Name: school_tract_id, dtype: float64" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "persons.school_tract_id.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "count 109113\n", - "unique 2\n", - "top False\n", - "freq 83675\n", - "Name: school_tract_id, dtype: object" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "persons.school_tract_id.notnull().describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "count 109113.000000\n", - "mean 0.999999\n", - "std 0.962373\n", - "min 0.000568\n", - "25% 0.322230\n", - "50% 0.717519\n", - "75% 1.329846\n", - "max 5.060089\n", - "Name: perwgt, dtype: float64" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": 
"execute_result" - } - ], - "source": [ - "persons.perwgt.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Places\n", - "\n", - "Each record represents a single visit to a place" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "460524" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "places = pd.read_csv(z.open('caltrans_full_survey/survey_place.csv'), low_memory=False)\n", - "\n", - "len(places)" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Index(['sampno', 'perno', 'plano', 'vehno', 'tripno', 'place_name',\n", - " 'travel_date', 'arr_time', 'dep_time', 'mode', 'trip_distance_miles',\n", - " 'air_trip_distance_miles', 'tripdistanceflag', 'prev_trip_duration_min',\n", - " 'act_dur', 'act_cnt', 'block_id', 'tract_id', 'county_id', 'state_id',\n", - " 'place_primarycity', 'city', 'zipcode', 'state', 'parked_loc_type',\n", - " 'parked_other_loc_type', 'parked_address', 'parked_minutes',\n", - " 'parked_payed', 'parked_amount_payed', 'parked_unit', 'parked_pay_type',\n", - " 'parked_other_pay_type', 'parked_paymen_ne', 'got_out_vehicle',\n", - " 'transit_system', 'transit_system_other', 'perwgt', 'expperwgt', 'tcf',\n", - " 'tcfperwgt', 'exptcfperwgt', 'tottr', 'hhmem', 'lon', 'lat',\n", - " 'non_hh_members', 'route', 'per1', 'per2', 'per3', 'per4', 'per5',\n", - " 'geom'],\n", - " dtype='object')\n" - ] - } - ], - "source": [ - "print(places.columns)" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "b'1031985,1,1,,,REDACTED,2012-05-01,03:00:00,08:00:00,,,,,,340.0,3.0,REDACTED,252202.0,95.0,6.0,VALLEJO,VALLEJO,94591,CA,,,REDACTED,,,,,,,,,,,0.052086,17.647568,,,,,,REDACTED,REDACTED,,,,,,,,REDACTED\\r\\n'\n", - "b'1031985,1,2,97.0,1.0,REDACTED,2012-05-01,09:00:00,12:00:00,6.0,13.428271,7.647539,,22.0,231.0,1.0,REDACTED,252108.0,95.0,6.0,BENICIA,BENICIA,94510,CA,,,REDACTED,,,,,,,,1.0,,,0.052086,17.647568,0.969788,0.050512,17.114408,2.0,0.0,REDACTED,REDACTED,1.0,,,,,,,REDACTED\\r\\n'\n" - ] - } - ], - "source": [ - "# Print some raw data to make sure the pandas type inferences are reasonable\n", - "\n", - "with z.open('caltrans_full_survey/survey_place.csv', 'r') as f:\n", - " _ = f.readline() # discard column headers\n", - " print(f.readline())\n", - " print(f.readline())" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "117345" - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Filter for places visited by people who live in the Bay Area (may want to do use a\n", - "# different filter depending on the application)\n", - "\n", - "places_ba = places[places.sampno.isin(households_ba.sampno)]\n", - "\n", - "len(places_ba)" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " sampno perno plano 
tripno\n", - "0 1031985 1 1 NaN\n", - "1 1031985 1 2 1.0\n", - "2 1031985 1 3 2.0\n", - "3 1031985 2 1 NaN\n", - "4 1031985 2 2 1.0\n", - "5 1031985 2 3 2.0\n", - "118 1033944 1 1 NaN\n", - "119 1033944 1 2 1.0\n", - "120 1033944 1 3 2.0\n", - "121 1033944 1 4 3.0\n" - ] - } - ], - "source": [ - "# Is there a unique identifier?\n", - "\n", - "# Might need to use combination of `sampno` (household), `perno` (person within hh),\n", - "# `plano` (place within person's travel diary)\n", - "\n", - "# What's `tripno`? (\"unlinked trip ID\" - maybe representing transfer between modes)\n", - "\n", - "print(places_ba[['sampno', 'perno', 'plano', 'tripno']].head(10))" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "117345" - ] - }, - "execution_count": 23, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Is every combination of `sampno`, `perno`, `plano` unique? -- Yes\n", - "\n", - "len(places_ba.groupby(['sampno', 'perno', 'plano']))" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "93406" - ] - }, - "execution_count": 24, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# How many places have a `tripno`? -- about 80%\n", - "\n", - "places_ba.tripno.count()" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "117345" - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Is the `tripno` ever repeated? -- No\n", - "\n", - "len(places_ba.groupby(['sampno', 'perno', 'plano', 'tripno']))" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "count 93406.000000\n", - "mean 3.817185\n", - "std 2.841705\n", - "min 1.000000\n", - "25% 2.000000\n", - "50% 3.000000\n", - "75% 5.000000\n", - "max 32.000000\n", - "Name: tripno, dtype: float64" - ] - }, - "execution_count": 26, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "places_ba.tripno.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "0 REDACTED\n", - "1 REDACTED\n", - "2 REDACTED\n", - "3 REDACTED\n", - "4 REDACTED\n", - "Name: place_name, dtype: object" - ] - }, - "execution_count": 27, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Can we see the place names? 
-- No\n", - "\n", - "places_ba.place_name.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "SAN FRANCISCO 15680\n", - "SAN JOSE 11414\n", - "OAKLAND 5455\n", - "SANTA ROSA 3441\n", - "BERKELEY 3185\n", - "PALO ALTO 2664\n", - "SUNNYVALE 2440\n", - "SAN MATEO 2190\n", - "NAPA 2160\n", - "FREMONT 2126\n", - "REDWOOD CITY 2067\n", - "MOUNTAIN VIEW 1948\n", - "WALNUT CREEK 1896\n", - "SANTA CLARA 1816\n", - "CONCORD 1800\n", - "Name: city, dtype: int64" - ] - }, - "execution_count": 28, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "places_ba.city.value_counts().head(15)" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "image/png": "[base64 PNG data omitted -- histogram of places_ba.trip_distance_miles, 20 bins over 0-15 miles]\n", - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "places_ba.trip_distance_miles.plot.hist(bins=20, range=(0,15));" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "2296" - ] - }, - "execution_count": 30, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Most detailed spatial identifier in public data is tract_id\n", - "\n", - "# How many different tracts are visited?\n", - "places_ba.tract_id.unique().shape[0]" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "9715" - ] - }, - 
VwjIqILrfVUbG8oPzcDN1HNiWwqw1qUn5tL82Fgdu30WcCGceKjP2up7UHbgwMDA01fSkREFK0U\nFUlvkvSWkW3gBOAhYAUwcgfXIuDmsr0COLPcBXYM8JMyPHYrcIKk6WWC/oQSi4iIFrQ1/HUgcJOk\nkRy+bvvvJN0NXCfpLOAp4MOl/UrgZGAIeBH4KIDtrZI+C9xd2n3G9tbJu4yIiKhrpajYfgJ4V4f4\ns8BxHeIGzhnjvZYBy5rOMSIidt5Uu6U4IiL6WIpKREQ0Js/+ip7Jc8Midj/pqURERGNSVCIiojEp\nKhER0ZgUlYiIaEyKSkRENCZFJSIiGpOiEhERjUlRiYiIxqSoREREY1JUIiKiMSkqERHRmDz7K6ak\nXXluGOTZYRFtSU8lIiIaM+lFRdJsSd+TtFbSw5LOK/GLJD0j6f7yOrl2zgWShiQ9JunEWnxBiQ1J\nWjLZ1xIREa/WxvDXNuATtu8t69TfI2lVOXaZ7c/VG0s6FDgdOAx4K3CbpLeXw18EjgeGgbslrbD9\nyKRcRUREbGfSi4rtjcDGsv2CpLXAzHFOWQhca/sl4ElJQ8DR5dhQWZoYSdeWtikqEREtaXWiXtJc\n4Ajgh8CxwLmSzgTWUPVmnqMqOHfWThvml0Xo6VHxd4/xOYuBxQBz5sxp7gJiysoCYRHtaG2iXtKb\ngRuA820/D1wBvA2YT9WT+fxI0w6ne5z49kF7qe1B24MDAwO7nHtERHTWSk9F0uupCsrVtm8EsL2p\ndvzLwLfL7jAwu3b6LGBD2R4rHhERLWjj7i8BXwXW2r60Fp9Ra3Ya8FDZXgGcLukNkg4C5gF3AXcD\n8yQdJGlPqsn8FZNxDRER0VkbPZVjgY8AD0q6v8Q+BZwhaT7VENZ64GMAth+WdB3VBPw24BzbrwBI\nOhe4FZgGLLP98GReSLw2ZT4mYuLauPvr7+k8H7JynHMuBi7uEF853nkRETG58o36iIhoTJ79FdGg\nDJ3F7i49lYiIaEx6KhFTRHo58VqQohLxGpClAmKqyPBXREQ0Jj2ViMjQWzQmRSUidkkKUtSlqERE\na3Z1LmhXpKD1RuZUIiKiMempRMRuKcN2vZGiEhGxk1KQxpaiEhExidqaR5qsYpY5lYiIaEyKSkRE\nNKbvi4qkBZIekzQkaUnb+URE7M76uqhImgZ8ETgJOJRq9chD280qImL31ddFBTgaGLL9hO2XgWuB\nhS3nFBGx2+r3ojITeLq2P1xiERHRgn6/pbjTWvferpG0GFhcdv9Z0mMT/Lz9gR9P8NzJMtVznOr5\nwdTPcarnB8mxCY3mpz/b5bf4tW4a9XtRGQZm1/ZnARtGN7K9FFi6qx8maY3twV19n16a6jlO9fxg\n6uc41fOD5NiEqZ7fWPp9+OtuYJ6kgyTtCZwOrGg5p4iI3VZf91Rsb5N0LnArMA1YZvvhltOKiNht\n9XVRAbC9Elg5SR+3y0Nok2Cq5zjV84Opn+NUzw+SYxOmen4dyd5uXjsiImJC+n1OJSIippAUlS5N\n5cfBSJot6XuS1kp6WNJ5bec0FknTJN0n6dtt5zKapH0lXS/p0fJn+W/azmk0Sf+t/Dd+SNI1kt44\nBXJaJmmzpIdqsf0krZK0rvycPsXy+/Py3/kBSTdJ2ret/MbKsXbsf0iypP3byG1npah0oQ8eB7MN\n+ITtdwDHAOdMsfzqzgPWtp3EGL4A/J3tXwfexRTLU9JM4A+BQduHU92ccnq7WQFwJbBgVGwJsNr2\nPGB12W/LlWyf3yrgcNvvBP4BuGCykxrlSrbPEUmzgeOBpyY7oYlKUenOlH4cjO2Ntu8t2y9Q/WU4\n5Z4sIGkW8EHgK23nMpqkfYD3Al8FsP2y7X9qN6uO9gD2krQHsDcdvpc12WzfDmwdFV4ILC/by4FT\nJzWpmk752f6u7W1l906q77i1Zow/Q4DLgP9Jhy91T1UpKt3pm8fBSJoLHAH8sN1MOvoLql+Qn7ed\nSAcHA1uAvy7Dc1+R9Ka2k6qz/QzwOap/tW4EfmL7u+1mNaYDbW+E6h89wAEt5zOe3wduaTuJ0SSd\nAjxj+0dt57IzUlS609XjYNom6c3ADcD5tp9vO586SR8CNtu+p+1cxrAHcCRwhe0jgJ/S7pDNdsq8\nxELgIOCtwJsk/ad2s+pvkj5NNXx8ddu51EnaG/g08Mdt57KzUlS609XjYNok6fVUBeVq2ze2nU8H\nxwKnSFpPNXz4fkl/025KrzIMDNse6eFdT1VkppIPAE/a3mL7X4AbgX/bck5j2SRpBkD5ubnlfLYj\naRHwIeB3PfW+W/E2qn88/Kj8zswC7pX0q61m1YUUle5M6cfBSBLVXMBa25e2nU8nti+wPcv2XKo/\nv/9je8r8K9v2/wWelnRICR0HPNJiSp08BRwjae/y3/w4ptjNBDUrgEVlexFwc4u5bEfSAuCTwCm2\nX2w7n9FsP2j7ANtzy+/MMHBk+f90SktR6UKZ0Bt5HMxa4Lop9jiYY4GPUP3r//7yOrntpPrQfwWu\nlvQAMB/405bzeZXSi7oeuBd4kOr3t/VvXUu6BrgDOETSsKSzgEuA4yWto7p76ZIplt9fAm8BVpXf\nly+1ld84OfalfKM+IiIak55KREQ0JkUlIiIak6ISERGNSVGJiIjGpKhERERjUlQiIqIxKSoREdGY\nFJWIiGjM/wfQ3LAf3y77kwAAAABJRU5ErkJggg==\n", - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "places_ba.trip_distance_miles.plot.hist(bins=20, range=(0,15));" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "2296" - ] - }, - "execution_count": 30, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Most detailed spatial identifier in public data is tract_id\n", - "\n", - "# How many different tracts are visited?\n", - "places_ba.tract_id.unique().shape[0]" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "9715" - ] - }, - 
"execution_count": 31, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# How many different households?\n", - "places_ba.sampno.unique().shape[0]" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "23939" - ] - }, - "execution_count": 32, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# How many different people?\n", - "len(places_ba.groupby(['sampno','perno']))" - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " sampno perno plano tract_id county_id state_id\n", - "0 1031985 1 1 252202.0 95.0 6.0\n", - "1 1031985 1 2 252108.0 95.0 6.0\n", - "2 1031985 1 3 252202.0 95.0 6.0\n" - ] - } - ], - "source": [ - "# How are the ID's encoded?\n", - "\n", - "print(places_ba[['sampno','perno','plano','tract_id','county_id','state_id']].head(3))" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " tract_id county_id state_id\n", - "count 117344.0 117344.0 117344.0\n", - "mean 0.0 0.0 0.0\n", - "std 0.0 0.0 0.0\n", - "min 0.0 0.0 0.0\n", - "25% 0.0 0.0 0.0\n", - "50% 0.0 0.0 0.0\n", - "75% 0.0 0.0 0.0\n", - "max 0.0 0.0 0.0\n" - ] - } - ], - "source": [ - "# Do the floating point decimals encode anything, or is it just a mistake in the source\n", - "# data that they're not all stored as ints? (Looks like a mistake, and my guess is it\n", - "# happened because int columns can't have missing values in certain database systems.)\n", - "\n", - "asfloat = places_ba[['tract_id','county_id','state_id']].dropna()\n", - "asint = places_ba[['tract_id','county_id','state_id']].dropna().astype(int)\n", - "\n", - "print((asfloat - asint).describe())" - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mode 23939\n", - "dtype: int64\n", - " mode\n", - "count 93406.000000\n", - "mean 5.279147\n", - "std 4.039473\n", - "min 1.000000\n", - "25% 5.000000\n", - "50% 5.000000\n", - "75% 6.000000\n", - "max 29.000000\n" - ] - } - ], - "source": [ - "# What does the travel mode data look like? We can replace null values with zero\n", - "\n", - "print(places_ba[['mode']].isnull().sum())\n", - "print(places_ba[['mode']].describe())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Census identifiers" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# Suppress scientific notation\n", - "\n", - "pd.set_option('display.float_format', lambda x: '%.0f' % x)" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "6912\n", - "14388\n" - ] - } - ], - "source": [ - "# Is the mapping between census tracts and city names consistent? 
-- No\n", - "\n", - "print(places.tract_id.drop_duplicates().shape[0])\n", - "print(places[['tract_id', 'city']].drop_duplicates().shape[0])" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "count 460519\n", - "mean 191724\n", - "std 242716\n", - "min 100\n", - "25% 5911\n", - "50% 43317\n", - "75% 402800\n", - "max 999999\n", - "Name: tract_id, dtype: float64" - ] - }, - "execution_count": 38, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "places.tract_id.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "count 460519\n", - "mean 58\n", - "std 50\n", - "min 1\n", - "25% 37\n", - "50% 59\n", - "75% 79\n", - "max 999\n", - "Name: county_id, dtype: float64" - ] - }, - "execution_count": 39, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "places.county_id.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 40, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "count 460523\n", - "mean 6\n", - "std 5\n", - "min 1\n", - "25% 6\n", - "50% 6\n", - "75% 6\n", - "max 99\n", - "Name: state_id, dtype: float64" - ] - }, - "execution_count": 40, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "places.state_id.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 41, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "6 455641\n", - "99 1064\n", - "32 957\n", - "41 454\n", - "4 412\n", - "Name: state_id, dtype: int64" - ] - }, - "execution_count": 41, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "places.state_id.value_counts().head(5)" - ] - }, - { - "cell_type": "code", - "execution_count": 42, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "# How to deal with this? I think `tract_id` is an integer representation\n", - "# of the 4-digit tract ID within the county plus the 2-digit suffix. 
\n", - "\n", - "# So the full unique identifier is `state_id` + `county_id` (3 digits) + `tract_id` (6 digits)\n", - "\n", - "places['_full_tract_id'] = places.state_id * 1e9 + places.county_id * 1e6 + places.tract_id\n", - "\n", - "# Presumably the all-9 entries reflect missing data, but documentation doesn't specify\n", - "\n", - "places.loc[(places.tract_id == 999999) |\n", - " (places.county_id == 999) |\n", - " (places.state_id == 99), '_full_tract_id'] = np.nan" - ] - }, - { - "cell_type": "code", - "execution_count": 43, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "9098\n", - "14194\n" - ] - } - ], - "source": [ - "print(places._full_tract_id.drop_duplicates().shape[0])\n", - "print(places[['_full_tract_id', 'city']].drop_duplicates().shape[0])" - ] - }, - { - "cell_type": "code", - "execution_count": 44, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "6115041100 14\n", - "6091010000 12\n", - "6027000800 11\n", - "6107000100 10\n", - "6097154303 10\n", - "Name: _full_tract_id, dtype: int64" - ] - }, - "execution_count": 44, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "places[['_full_tract_id', 'city']].drop_duplicates().\\\n", - " _full_tract_id.value_counts().head(5)" - ] - }, - { - "cell_type": "code", - "execution_count": 45, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " _full_tract_id city\n", - "3238 6115041100 BROWNSVILLE\n", - "18952 6115041100 MARYSVILLE\n", - "33913 6115041100 NORTH SAN JUAN\n", - "44697 6115041100 DOBBINS\n", - "44705 6115041100 YUBA\n", - "100194 6115041100 BANGOR\n", - "160254 6115041100 CAMPTONVILLE\n", - "178724 6115041100 STRAWBERRY VALLEY\n", - "271235 6115041100 CHALLENGE-BROWNSVILLE\n", - "271250 6115041100 OREGON HOUSE\n", - "300021 6115041100 FORBESTOWN\n", - "317626 6115041100 CHALLENGE-BROWNSVILL\n", - "402446 6115041100 BROWNS VALLEY\n", - "403959 6115041100 RACKERBY\n" - ] - } - ], - "source": [ - "print(places[['_full_tract_id', 'city']].drop_duplicates().\\\n", - " loc[places._full_tract_id == 6115041100])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": true - }, - "source": [ - "So, there are still many census tracts that correspond to more than one city. I think we probably just want to use the census tracts as our unit of analysis. \n", - "\n", - "For descriptive purposes we can map each census tract to its most common corresponding city." 
- ] - }, - { - "cell_type": "code", - "execution_count": 46, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " city\n", - "_full_tract_id \n", - "1015000800 ANNISTON\n", - "1101001500 MONTGOMERY\n", - "1161400100 SEVILLA\n", - "2020001000 ANCHORAGE\n", - "2020001100 ANCHORAGE\n" - ] - } - ], - "source": [ - "# Map each tract to its most common corresponding city\n", - "\n", - "tracts = places[['_full_tract_id', 'city']].groupby('_full_tract_id').\\\n", - " agg(lambda x:x.value_counts().index[0])\n", - " \n", - "print(tracts.head())" - ] - }, - { - "cell_type": "code", - "execution_count": 47, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "9098\n", - "9097\n" - ] - } - ], - "source": [ - "print(places._full_tract_id.drop_duplicates().shape[0])\n", - "print(tracts.shape[0])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Activities\n", - "\n", - "\"The activity reported is for a single travel day and contains the highest level of detail about the survey participants' travel purpose\" (data dictionary)\n", - "\n", - "So, there can be multiple \"activities\" at each \"place\" visited as part of a trip." - ] - }, - { - "cell_type": "code", - "execution_count": 48, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "604711" - ] - }, - "execution_count": 48, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "activities = pd.read_csv(z.open('caltrans_full_survey/survey_activity.csv'), low_memory=False)\n", - "\n", - "len(activities)" - ] - }, - { - "cell_type": "code", - "execution_count": 49, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "157011" - ] - }, - "execution_count": 49, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# TO DO - fix to reflect households\n", - "\n", - "activities_ba = activities[activities.county_id.isin([1, 13, 41, 55, 75, 81, 85, 95, 97])]\n", - "\n", - "len(activities_ba)" - ] - }, - { - "cell_type": "code", - "execution_count": 50, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " sampno perno plano actno tripno\n", - "1 1041766 3 1 1 nan\n", - "4 1051203 1 9 1 8\n", - "8 1065929 1 1 1 nan\n", - "14 1097949 1 1 1 nan\n", - "22 1124271 1 5 1 4\n", - "27 1126030 2 1 1 nan\n", - "30 1127449 2 1 1 nan\n", - "32 1127626 1 1 1 nan\n", - "35 1128657 1 1 1 nan\n", - "37 1129482 1 1 1 nan\n" - ] - } - ], - "source": [ - "# What do the identifiers look like? 
\n", - "\n", - "print(activities_ba[['sampno', 'perno', 'plano', 'actno', 'tripno']].head(10))" - ] - }, - { - "cell_type": "code", - "execution_count": 51, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "118271\n", - "118271\n", - "117345\n" - ] - } - ], - "source": [ - "# Each place occurs in the activities table at least once\n", - "\n", - "print((activities_ba.actno == 1).sum()) # number of activities with id 1\n", - "\n", - "print(len(activities_ba.groupby(['sampno', 'perno', 'plano']))) # unique places referenced\n", - "\n", - "print(len(places_ba)) # records in places table" - ] - }, - { - "cell_type": "code", - "execution_count": 52, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "count 604711\n", - "mean 2624572\n", - "std 1695612\n", - "min 1031985\n", - "25% 1662824\n", - "50% 1979173\n", - "75% 2797238\n", - "max 7212388\n", - "Name: sampno, dtype: float64" - ] - }, - "execution_count": 52, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "activities.sampno.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 53, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "count 604711\n", - "mean 2\n", - "std 1\n", - "min 1\n", - "25% 1\n", - "50% 2\n", - "75% 3\n", - "max 8\n", - "Name: perno, dtype: float64" - ] - }, - "execution_count": 53, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "activities.perno.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 54, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "count 604711\n", - "mean 3\n", - "std 3\n", - "min 1\n", - "25% 1\n", - "50% 3\n", - "75% 5\n", - "max 34\n", - "Name: plano, dtype: float64" - ] - }, - "execution_count": 54, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "activities.plano.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Trips\n", - "\n", - "What's the correct way to aggregate places into trips?\n", - "\n", - "It seems like each person recorded their travel for a single day as a sequence of places visited, without explicit classification into trips or tours. So that's up to us to do by applying whatever rules seem appropriate. \n", - "\n", - "Probably it's not even possible to identify tours with certainty from the anonymized data, because the place names and precise locations are redacted." 
- ] - }, - { - "cell_type": "code", - "execution_count": 55, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "sampno perno\n", - "1031985 1 3\n", - " 2 3\n", - "1033944 1 16\n", - "1035274 1 8\n", - " 2 6\n", - "1037952 1 3\n", - " 2 1\n", - "1039620 1 5\n", - " 2 5\n", - "1041076 1 4\n", - "Name: plano, dtype: int64" - ] - }, - "execution_count": 55, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Dig into `tripno` some more\n", - "\n", - "places_ba.groupby(['sampno', 'perno']).plano.max().head(10)" - ] - }, - { - "cell_type": "code", - "execution_count": 56, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "count 19512\n", - "mean 1\n", - "std 0\n", - "min 1\n", - "25% 1\n", - "50% 1\n", - "75% 1\n", - "max 1\n", - "dtype: float64" - ] - }, - "execution_count": 56, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Do any respondents have multiple trip sequences? -- No!\n", - "\n", - "plano_counts = places_ba.groupby(['sampno', 'perno']).plano.max()\n", - "tripno_counts = places_ba.groupby(['sampno', 'perno']).tripno.max()\n", - "\n", - "(plano_counts - tripno_counts).describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 57, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "count 93406\n", - "mean 1\n", - "std 0\n", - "min 1\n", - "25% 1\n", - "50% 1\n", - "75% 1\n", - "max 1\n", - "dtype: float64" - ] - }, - "execution_count": 57, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "(places_ba.plano - places_ba.tripno).describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 58, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
travel_datearr_timedep_timetract_idcitymodetrip_distance_milesprev_trip_duration_minact_dur
1522012-07-1703:00:0010:00:00509000SUNNYVALEnannannan425
1532012-07-1710:00:0010:00:00509000SUNNYVALE511030
1542012-07-1711:00:0011:00:00508504SUNNYVALE52151
1552012-07-1711:00:0011:00:00508504SUNNYVALE1059
1562012-07-1711:00:0013:00:00508504SUNNYVALE105105
1572012-07-1713:00:0014:00:00509000SUNNYVALE521060
1582012-07-1714:00:0015:00:00500100SAN JOSE582025
1592012-07-1715:00:0002:00:00509000SUNNYVALE5920699
\n", - "
" - ], - "text/plain": [ - " travel_date arr_time dep_time tract_id city mode \\\n", - "152 2012-07-17 03:00:00 10:00:00 509000 SUNNYVALE nan \n", - "153 2012-07-17 10:00:00 10:00:00 509000 SUNNYVALE 5 \n", - "154 2012-07-17 11:00:00 11:00:00 508504 SUNNYVALE 5 \n", - "155 2012-07-17 11:00:00 11:00:00 508504 SUNNYVALE 1 \n", - "156 2012-07-17 11:00:00 13:00:00 508504 SUNNYVALE 1 \n", - "157 2012-07-17 13:00:00 14:00:00 509000 SUNNYVALE 5 \n", - "158 2012-07-17 14:00:00 15:00:00 500100 SAN JOSE 5 \n", - "159 2012-07-17 15:00:00 02:00:00 509000 SUNNYVALE 5 \n", - "\n", - " trip_distance_miles prev_trip_duration_min act_dur \n", - "152 nan nan 425 \n", - "153 1 10 30 \n", - "154 2 15 1 \n", - "155 0 5 9 \n", - "156 0 5 105 \n", - "157 2 10 60 \n", - "158 8 20 25 \n", - "159 9 20 699 " - ] - }, - "execution_count": 58, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# What does a sequence of places look like?\n", - "\n", - "varlist = ['travel_date', 'arr_time', 'dep_time', 'tract_id', 'city', 'mode', \n", - " 'trip_distance_miles', 'prev_trip_duration_min', 'act_dur']\n", - "\n", - "places_ba.loc[(places_ba.sampno == 1035274) & (places_ba.perno == 1), varlist]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "So, it looks like the key to identifying trip/tour semantics involves looking at the trip purposes in the activities table. Transfers are noted as a particular purpose, and those trip legs need to be aggregated together. \n", - "\n", - "The first and last activities of the day probably take place at home, but we can't verify using the public data.\n", - "\n", - "It looks like the arrival and departure times, and trip durations, are approximate based on people's recollections, but distances are precise because they come from the Google Maps interface." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Travel modes" - ] - }, - { - "cell_type": "code", - "execution_count": 59, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "5 50139\n", - "6 18632\n", - "1 15924\n", - "2 2244\n", - "15 1635\n", - "24 1444\n", - "7 566\n", - "26 459\n", - "8 299\n", - "25 293\n", - "Name: mode, dtype: int64" - ] - }, - "execution_count": 59, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# What are the travel modes?\n", - "\n", - "places_ba['mode'].value_counts().head(10)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "FROM DATA DICTIONARY\n", - "\n", - "Travel mode:\n", - "\n", - "- 1- Walk; \n", - "- 2- Bike; \n", - "- 3- Wheelchair/mobility scooter; \n", - "- 4- Other non-motorized; \n", - "- 5- Auto/van/truck driver; \n", - "- 6- Auto/van/truck passenger; \n", - "- 7- Carpool/vanpool; \n", - "- 8- Motorcycle/scooter/moped; \n", - "- 9- Taxi/hired car/limo; \n", - "- 10- Rental car/vehicle; \n", - "- 11- Private shuttle (Super shuttle, employer, hotel, etc.); \n", - "- 12- Greyhound bus; \n", - "- 13- Plane; \n", - "- 14- Other private transit; \n", - "- 15- Local bus, rapid bus; \n", - "- 16- Express bus/commuter bus (AC Transbay, Golden Gate Transit, etc.); \n", - "- 17- Premium bus (Metro Orange/Silver Line); \n", - "- 18- School bus; \n", - "- 19- Public transit shuttle (DASH, Emery Go Round, etc.); \n", - "- 20- AirBART/LAX FlyAway; \n", - "- 21- Dial-a-ride/paratransit (access services, etc.); \n", - "- 22- Amtrak bus; \n", - "- 23- Other bus; \n", - "- 24- BART, Metro Red/Purple Line; \n", - "- 25- ACE, Amtrak, Caltrain, Coaster, Metrolink; \n", - "- 26- Metro Blue/Green/Gold Line, Muni Metro, Sacramento Light Rail, San Diego Sprinter/Trolley/Orange/Blue/Green, VTA light rail; \n", - "- 27- Streetcar/cable car, \n", - "- 28- Other rail; \n", - "- 29- Ferry/boat; \n", - "- 99- RF" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Trip purposes" - ] - }, - { - "cell_type": "code", - "execution_count": 60, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "1 47241\n", - "2 16700\n", - "21 9523\n", - "9 9151\n", - "27 8583\n", - "22 7250\n", - "8 6151\n", - "7 5792\n", - "37 5040\n", - "31 4737\n", - "39 3484\n", - "17 3105\n", - "25 3039\n", - "34 2701\n", - "29 2541\n", - "Name: purpose, dtype: int64" - ] - }, - "execution_count": 60, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# What are the trip purposes?\n", - "\n", - "activities_ba.purpose.value_counts().head(15)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": true - }, - "source": [ - "FROM DATA DICTIONARY\n", - "\n", - "[Somewhere there's a `ptype` key indicating categories of purposes, probably based on the home/work/school locations, but I can't find it in these data tables.]\n", - "\n", - "Activity purpose: \n", - "\n", - "[These look like activities at home]\n", - "\n", - "- 1- Personal activities (sleeping, personal care, leisure, chores); \n", - "- 2- Preparing meals/eating; \n", - "- 3- Hosting visitors/entertaining guests; \n", - "- 4- Exercise (with or without 
equipment)/playing sports; \n", - "- 5- Study/schoolwork; \n", - "- 6- Work for pay at home using telecommunications equipment; \n", - "- 7- Using computer/telephone/cell or smart phone, or other communications device for personal activities; \n", - "- 8- All other activities at home; \n", - "\n", - "[These look like activities at work]\n", - "\n", - "- 9- Work/job duties; \n", - "- 10- Training; \n", - "- 11- Meals at work; \n", - "- 12- Work-sponsored social activities (holiday/birthday celebrations, etc.); \n", - "- 13- Non-work-related activities (social clubs, etc.); \n", - "- 14- Exercise/sports; \n", - "- 15- Volunteer work/activities, \n", - "- 16- All other work-related activities at work; \n", - "\n", - "[These look like activities at school]\n", - "\n", - "- 17- School/classroom/laboratory; \n", - "- 18- Meals at school/college; \n", - "- 19- After-school or non-class-related sports/physical activities; \n", - "- 20- All other after-school or non-class-related activities (library, music rehearsal, clubs, etc.); \n", - "\n", - "[These look like transport-related]\n", - "\n", - "- 21- Change type of transportation/transfer (walk to bus, walk to/from parked car); \n", - "- 22- pick up/drop off passenger(s); \n", - "\n", - "[These look like activities at non-home, non-work, non-school locations]\n", - "\n", - "- 23- Drive-through meals (snacks, coffee, etc.) (show if PTYPE <> 1 [Home]); \n", - "- 24- Drive-through other (ATM, bank, etc.) (show if PTYPE <> 1); \n", - "- 25- Work-related (meetings, sales calls, deliveries); \n", - "- 26- Service private vehicle (gas, oil, lubes, repairs), \n", - "- 27- Routine shopping (groceries, clothing, convenience store, household maintenance, etc.); \n", - "- 28- Shopping for major purchases or specialty items (appliance, electronics, new vehicles, major household repairs, etc.); \n", - "- 29- Household errands (bank, dry cleaning, etc.); \n", - "- 30- Personal business (visit government office, attorney, accountant, etc.); \n", - "- 31- Eat meal at restaurant/diner; \n", - "- 32- Health care (doctor, dentist, eye care, chiropractor, veterinarian, etc.); \n", - "- 33- Civic/religious activities; \n", - "- 34- Outdoor exercise (outdoor sports, jogging, bicycling, walking the dog, etc.); \n", - "- 35- Indoor exercise (gym, yoga, etc.); \n", - "- 36- Entertainment (movies, sporting events, etc.); \n", - "- 37- Social/visiting friends and relatives; \n", - "- 38- Other (specify), \n", - "\n", - "[Misc]\n", - "\n", - "- 39- Loop trip (for interviewer only - not listed on diary), \n", - "- 99- DK/RF" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.0" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/notebooks/ChoiceModels progress update 2017-06-09.pdf b/notebooks/ChoiceModels progress update 2017-06-09.pdf deleted file mode 
100644 index d298540..0000000 Binary files a/notebooks/ChoiceModels progress update 2017-06-09.pdf and /dev/null differ diff --git a/notebooks/Data-prep-02.ipynb b/notebooks/Data-prep-02.ipynb deleted file mode 100644 index 32bf1e2..0000000 --- a/notebooks/Data-prep-02.ipynb +++ /dev/null @@ -1,779 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Data prep for estimating models\n", - "\n", - "Sam Maurer, August 2017 | Python 3.6\n", - "\n", - "Original version June 2017 (v01) \n", - "Updated Aug 2017 (v02) to fix int/float issues\n", - "\n", - "This notebook generates the data tables that are used in the model estimation demos. For more about the California Household Travel Survey source data, you can refer to the \"CHTS-exploration\" notebook." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "import numpy as np\n", - "import pandas as pd\n", - "import zipfile" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Load raw CHTS tables\n", - "\n", - "This requires the file named caltrans_full_survey.zip. You can download it by following the instructions in the \"data\" directory." - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "z = zipfile.ZipFile('../data/caltrans_full_survey.zip')" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "42426" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "households = pd.read_csv(z.open('caltrans_full_survey/survey_households.csv'), low_memory=False)\n", - "len(households)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "109113" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "persons = pd.read_csv(z.open('caltrans_full_survey/survey_person.csv'), low_memory=False)\n", - "len(persons)" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "460524" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "places = pd.read_csv(z.open('caltrans_full_survey/survey_place.csv'), low_memory=False)\n", - "len(places)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "604711" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "activities = pd.read_csv(z.open('caltrans_full_survey/survey_activity.csv'), low_memory=False)\n", - "len(activities)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Clean up the places table and generate tract identifiers" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": 
"stream", - "text": [ - "460516\n" - ] - } - ], - "source": [ - "# Discard places with missing identifiers, and convert ID components to ints. \n", - "# (Some identifiers are stored as floats in the source table, but the \n", - "# \"CHTS-exploration\" notebook confirms that the decimal vlaues don't encode anything.)\n", - "\n", - "places.dropna(subset=['state_id','county_id','tract_id','city'], inplace=True)\n", - "\n", - "places['state_id'] = places.state_id.astype(int)\n", - "places['county_id'] = places.county_id.astype(int)\n", - "places['tract_id'] = places.tract_id.astype(int)\n", - "\n", - "print(len(places))" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "460514\n" - ] - } - ], - "source": [ - "# Other missing values are encoded as nines; discard those as well\n", - "\n", - "places.drop((places.tract_id == 999999) | \n", - " (places.county_id == 999) | \n", - " (places.state_id == 99), inplace=True)\n", - "\n", - "print(len(places))" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2 6\n", - "3 0\n", - "4 5\n", - "Name: mode, dtype: int64\n" - ] - } - ], - "source": [ - "# Clean up other data fields\n", - "\n", - "# Replace null travel mode with zero and encode as int (mode seems to be a protected\n", - "# keyword, so we have to use places['mode'] rather than places.mode)\n", - "\n", - "places['mode'] = places['mode'].fillna(0).astype(int)\n", - "\n", - "print(places['mode'].head(3))" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "6098941414\n", - "6\n", - "98\n" - ] - } - ], - "source": [ - "# Define functions to move back and forth between full numerical tract ID and its components\n", - "\n", - "def full_tract_id(state_id, county_id, tract_id):\n", - " return state_id * 10**9 + county_id * 10**6 + tract_id\n", - "\n", - "def state_id(full_tract_id):\n", - " return full_tract_id // 10**9\n", - "\n", - "def county_id(full_tract_id):\n", - " _county_tract = np.fmod(full_tract_id, 10**9)\n", - " return _county_tract // 10**6\n", - "\n", - "print(full_tract_id(6, 98, 941414))\n", - "print(state_id(6098141414))\n", - "print(county_id(6098141414))" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " state_id county_id tract_id city full_tract_id\n", - "2 6 95 252202 VALLEJO 6095252202\n", - "3 6 95 252202 VALLEJO 6095252202\n", - "4 6 95 251902 VALLEJO 6095251902\n" - ] - } - ], - "source": [ - "# Generate full tract identifiers\n", - "\n", - "places['full_tract_id'] = full_tract_id(places.state_id, places.county_id, places.tract_id)\n", - "\n", - "print(places[['state_id','county_id','tract_id','city','full_tract_id']].head(3))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Build a master table of census tracts" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - 
"9310\n", - " city\n", - "full_tract_id \n", - "1015000800 ANNISTON\n", - "1101001500 MONTGOMERY\n", - "1161400100 SEVILLA\n", - "2020001000 ANCHORAGE\n", - "2020001100 ANCHORAGE\n" - ] - } - ], - "source": [ - "# Generate a master list of census tracts, keeping the city name most commonly \n", - "# associated with each tract\n", - "\n", - "tracts = places[['full_tract_id','city']].groupby('full_tract_id').\\\n", - " agg(lambda x:x.value_counts().index[0])\n", - "\n", - "print(tracts.shape[0])\n", - "print(tracts.head())" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "1583\n", - " city\n", - "full_tract_id \n", - "6001008309 TIJUANA\n", - "6001400100 BERKELEY\n", - "6001400200 OAKLAND\n", - "6001400300 OAKLAND\n", - "6001400400 OAKLAND\n" - ] - } - ], - "source": [ - "# Limit to the 9-county San Francisco Bay Area\n", - "\n", - "tracts = tracts[(state_id(tracts.index).isin([6])) & \n", - " (county_id(tracts.index).\\\n", - " isin([1, 13, 41, 55, 75, 81, 85, 95, 97]))].copy()\n", - "\n", - "print(tracts.shape[0])\n", - "print(tracts.head())" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "85 371\n", - "1 360\n", - "13 207\n", - "75 195\n", - "81 158\n", - "97 99\n", - "95 97\n", - "41 55\n", - "55 41\n", - "Name: full_tract_id, dtype: int64\n" - ] - } - ], - "source": [ - "print(county_id(tracts.index).value_counts())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Calculate some tract-level covariates\n", - "\n", - "Residential density, school/employment density" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# Note: the `home_tract_id` in the households table is already a full 11-digit\n", - "# identifier, with the same format that we generated for the places table.\n", - "# Same with `empl_tract_id` and `school_tract_id` in the persons table." 
- ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "# Residential density = sum of weighted household sizes by census tract of home\n", - "\n", - "households['_weighted_persons_count'] = households.persons_count * households.hhwgt\n", - "\n", - "home_density = households.groupby('home_tract_id')._weighted_persons_count.sum().\\\n", - " rename('home_density').to_frame()" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "# Employment density = sum of person weights by census tract of work location\n", - "\n", - "work_density = persons.groupby('empl_tract_id').perwgt.sum().\\\n", - " rename('work_density').to_frame()" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "# School density = sum of person weights by census tract of school location\n", - "\n", - "school_density = persons.groupby('school_tract_id').perwgt.sum().\\\n", - " rename('school_density').to_frame()" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " city home_density work_density school_density\n", - "full_tract_id \n", - "6001008309 TIJUANA 0.000000 0.000000 0.000000\n", - "6001400100 BERKELEY 13.437961 13.130867 13.511570\n", - "6001400200 OAKLAND 11.089638 4.248928 0.894794\n", - "6001400300 OAKLAND 28.878399 7.671554 0.000000\n", - "6001400400 OAKLAND 16.884910 4.063805 8.150402\n" - ] - } - ], - "source": [ - "# Merge these into the census tracts table, only keeping Bay Area tracts\n", - "\n", - "tracts = pd.merge(tracts, home_density, how='left', left_index=True, right_index=True)\n", - "tracts = pd.merge(tracts, work_density, how='left', left_index=True, right_index=True)\n", - "tracts = pd.merge(tracts, school_density, how='left', left_index=True, right_index=True)\n", - "tracts = tracts.fillna(0) # fill missing values with zero\n", - "\n", - "print(tracts.head())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Generate a table of trips\n", - "\n", - "For now, this is a table of places visited for non-school, non-work activities" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# - trip destinations are in `places.full_tract_id` (sometimes missing)\n", - "# - trip purposes are in `activities.purpose`, and we want 23 thru 38\n", - "# - places and activities are linked by `sampno`, `perno`, `plano`, and there \n", - "# can be multiple activities per place" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "10417660312\n" - ] - } - ], - "source": [ - "# Function to generate a single unique ID for places\n", - "\n", - "def place_id(sampno, perno, plano):\n", - " return sampno * 10**4 + perno * 10**2 + plano\n", - "\n", - "print(place_id(1041766, 3, 12))" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# Add place_id to places table and activities table\n", - "\n", - 
"places['place_id'] = place_id(places.sampno, places.perno, places.plano)\n", - "activities['place_id'] = place_id(activities.sampno, activities.perno, activities.plano)" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "# Get list of places that have a secondary activity\n", - "\n", - "_secondary_activity_places = activities.loc[activities.purpose.isin(range(23, 38+1)),\n", - " 'place_id'].drop_duplicates()" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "147004\n", - " full_tract_id mode trip_distance_miles\n", - "place_id \n", - "10319850202 6095251902 5 5.125960\n", - "10320360102 6073017051 5 3.619056\n", - "10320360104 6073009304 5 19.351620\n", - "10320360105 6073008511 5 6.451126\n", - "10320360202 6073020211 6 10.466616\n" - ] - } - ], - "source": [ - "# Generate a table of those places with some covariates\n", - "\n", - "trips = places.loc[places.place_id.isin(_secondary_activity_places) &\n", - " places.full_tract_id.notnull(),\n", - " ['place_id', 'full_tract_id', 'mode', \n", - " 'trip_distance_miles']].set_index('place_id')\n", - "\n", - "print(trips.shape[0])\n", - "print(trips.head())" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "36764\n", - " full_tract_id mode trip_distance_miles\n", - "place_id \n", - "10319850202 6095251902 5 5.125960\n", - "10335860102 6085511915 6 156.370628\n", - "10335860103 6085512027 6 1.615535\n", - "10335860104 6085512027 6 0.375708\n", - "10335860105 6085511915 6 0.894730\n" - ] - } - ], - "source": [ - "# Limit to destinations in the 9-county San Francisco Bay Area\n", - "\n", - "trips = trips[(state_id(trips.full_tract_id).isin([6])) & \n", - " (county_id(trips.full_tract_id).\\\n", - " isin([1, 13, 41, 55, 75, 81, 85, 95, 97]))].copy()\n", - "\n", - "print(trips.shape[0])\n", - "print(trips.head())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Save estimaton data to disk" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "tracts.to_csv('../data/tracts_v02.csv')" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "trips.to_csv('../data/trips_v02.csv')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# TO DO\n", - "# - for a mode choice model, could probably generate average travel times between\n", - "# tracts just from the observed data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { 
- "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.0" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/notebooks/Destination-choice-models-02.ipynb b/notebooks/Destination-choice-models-02.ipynb deleted file mode 100644 index 861a722..0000000 --- a/notebooks/Destination-choice-models-02.ipynb +++ /dev/null @@ -1,652 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Exploring destination choice models\n", - "\n", - "Sam Maurer, August 2017 | Python 3.6\n", - "\n", - "Original version June 2017 (v01) \n", - "Updated Aug 2017 (v02) to use new version of the estimation data (see \"Data-prep-02\" notebook)" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/maurer/anaconda/lib/python3.6/site-packages/statsmodels/compat/pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.\n", - " from pandas.core import datetools\n" - ] - } - ], - "source": [ - "import numpy as np\n", - "import pandas as pd\n", - "\n", - "from patsy import dmatrix\n", - "from urbansim.urbanchoice import interaction, mnl\n", - "\n", - "from choicemodels import MultinomialLogit" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Load estimation data from disk" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "1583\n", - " city home_density work_density school_density\n", - "full_tract_id \n", - "6001008309 TIJUANA 0.000000 0.000000 0.000000\n", - "6001400100 BERKELEY 13.437961 13.130867 13.511570\n", - "6001400200 OAKLAND 11.089638 4.248928 0.894794\n", - "6001400300 OAKLAND 28.878399 7.671554 0.000000\n", - "6001400400 OAKLAND 16.884910 4.063805 8.150402\n" - ] - } - ], - "source": [ - "tracts = pd.read_csv('../data/tracts_v02.csv').set_index('full_tract_id')\n", - "\n", - "print(tracts.shape[0])\n", - "print(tracts.head())" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "36764\n", - " full_tract_id mode trip_distance_miles\n", - "place_id \n", - "10319850202 6095251902 5 5.125960\n", - "10335860102 6085511915 6 156.370628\n", - "10335860103 6085512027 6 1.615535\n", - "10335860104 6085512027 6 0.375708\n", - "10335860105 6085511915 6 0.894730\n" - ] - } - ], - "source": [ - "trips = pd.read_csv('../data/trips_v02.csv').set_index('place_id')\n", - "\n", - "print(trips.shape[0])\n", - "print(trips.head())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## MNL destination choice using urbansim.urbanchoice" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# - each trip is a realized choice of a 
particular census tract\n", - "# - we can randomly sample alternative census tracts and build a model\n", - "# of destination choice" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# `interaction.mnl_interaction_dataset()` is not documented very well, but \n", - "# this is how it seems to work\n", - "\n", - "# Takes following input:\n", - "# - choosers: pandas.DataFrame with unique index\n", - "# - alternatives: pandas.DataFrame with unique index\n", - "# - SAMPLE_SIZE: number of alternatives for each choice scenario\n", - "# - chosenalts: list containing the alternative id chosen by each chooser?\n", - "\n", - "# Returns following output:\n", - "# - full list of alternatives that were sampled\n", - "# - long-format DataFrame merging the two tables\n", - "# - numchoosers X SAMPLE_SIZE matrix representing chosen alternatives" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Start with a sample of ~500 trips for easier computation" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "483\n", - " full_tract_id mode trip_distance_miles\n", - "place_id \n", - "72006700102 6085503326 5 0.574042\n", - "71863140102 6085503108 5 1.718151\n", - "24974540206 6013307201 6 2.446018\n", - "70163300110 6075017102 1 0.038407\n", - "71669940202 6001403100 5 3.793155\n" - ] - } - ], - "source": [ - "choosers = trips.loc[np.random.choice(trips.index, 500, replace=False)]\n", - "choosers = choosers.loc[choosers.trip_distance_miles.notnull()]\n", - "\n", - "print(choosers.shape[0])\n", - "print(choosers.head())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Sample 100 alternatives for each and set up a long-format data table" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "48300\n", - "(483, 100)\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/maurer/Dropbox/Git-imac/udst/urbansim/urbansim/urbanchoice/interaction.py:83: SettingWithCopyWarning: \n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", - " alts_sample['join_index'] = np.repeat(choosers.index.values, SAMPLE_SIZE)\n" - ] - } - ], - "source": [ - "numalts = 100\n", - "\n", - "_, merged, chosen = interaction.mnl_interaction_dataset(\n", - " choosers=choosers, alternatives=tracts, SAMPLE_SIZE=numalts, \n", - " chosenalts=choosers.full_tract_id)\n", - "\n", - "print(merged.shape[0])\n", - "print(chosen.shape)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Use Patsy to generate the design matrix" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " Intercept home_density work_density school_density\n", - "full_tract_id \n", - "6085503326 1.0 33.103403 3.018663 13.646608\n", - "6041104102 1.0 17.376936 4.465194 3.304285\n", - "6001440304 1.0 3.324621 0.672532 0.000000\n", - "6013364002 1.0 12.594876 0.788063 0.762573\n", - 
"6095253107 1.0 26.588450 0.425587 4.469490\n" - ] - } - ], - "source": [ - "model_expression = \"home_density + work_density + school_density\"\n", - "\n", - "model_design = dmatrix(model_expression, data=merged, return_type='dataframe')\n", - "\n", - "print(model_design.head())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Fit the model using mnl_estimate()" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'null': -2224.297199832249, 'convergence': -2183.4783133690826, 'ratio': 0.018351363507648655}\n", - " Coefficient Std. Error T-Score\n", - "0 -2.539330e-18 0.084172 -3.016817e-17\n", - "1 1.486461e-02 0.004156 3.576700e+00\n", - "2 1.106214e-02 0.001507 7.340532e+00\n", - "3 1.349303e-02 0.003850 3.504669e+00\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/maurer/Dropbox/Git-imac/udst/urbansim/urbansim/urbanchoice/pmat.py:48: RuntimeWarning: overflow encountered in exp\n", - " return PMAT(np.exp(self.mat))\n" - ] - } - ], - "source": [ - "log_likelihoods, fit_parameters = mnl.mnl_estimate(\n", - " model_design.as_matrix(), chosen, numalts=numalts)\n", - "\n", - "print(log_likelihoods)\n", - "print(fit_parameters)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## NEW -- Same process in ChoiceModels" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "from choicemodels import MultinomialLogit\n", - "from choicemodels.tools import MergedChoiceTable" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "483\n" - ] - } - ], - "source": [ - "# Start with the same sample of trips\n", - "\n", - "print(choosers.shape[0])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Merge choosers and alternatives using a new ChoiceModels interface" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "48300\n" - ] - } - ], - "source": [ - "merged = MergedChoiceTable(observations = choosers, \n", - " alternatives = tracts, \n", - " chosen_alternatives = choosers.full_tract_id, \n", - " sample_size = numalts)\n", - "\n", - "print(type(merged))\n", - "print(merged.to_frame().shape[0])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Estimate a model using the ChoiceModels engine" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/maurer/Dropbox/Git-imac/udst/choicemodels/choicemodels/tools/pmat.py:48: RuntimeWarning: overflow encountered in exp\n", - " return PMAT(np.exp(self.mat))\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " CHOICEMODELS ESTIMATION RESULTS \n", - "===================================================================\n", - "Dep. Var.: chosen No. 
Observations: \n", - "Model: Multinomial Logit Df Residuals: \n", - "Method: Maximum Likelihood Df Model: \n", - "Date: Pseudo R-squ.: \n", - "Time: Pseudo R-bar-squ.: \n", - "AIC: Log-Likelihood: -2,187.181\n", - "BIC: LL-Null: -2,224.297\n", - "===================================================================\n", - " coef std err z P>|z| Conf. Int.\n", - "-------------------------------------------------------------------\n", - "home_density 0.0139 0.003 5.298 \n", - "work_density 0.0094 0.001 6.361 \n", - "school_density 0.0151 0.004 3.963 \n", - "===================================================================\n", - "CPU times: user 219 ms, sys: 14 ms, total: 233 ms\n", - "Wall time: 70.9 ms\n" - ] - } - ], - "source": [ - "%%time\n", - "model_expression = \"home_density + work_density + school_density - 1\"\n", - "\n", - "model = MultinomialLogit(data = merged.to_frame(), \n", - " observation_id_col = merged.observation_id_col, \n", - " choice_col = merged.choice_col,\n", - " model_expression = model_expression)\n", - "\n", - "results = model.fit()\n", - "print(results)" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], - "source": [ - "print(type(results))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Estimate a model using the PyLogit engine\n", - "\n", - "Usage is the same, but with an OrderedDict model expression" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "from collections import OrderedDict" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Log-likelihood at zero: -2,224.2972\n", - "Initial Log-likelihood: -2,224.2972\n", - "Estimation Time: 0.06 seconds.\n", - "Final log-likelihood: -2,187.1807\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/maurer/anaconda/lib/python3.6/site-packages/scipy/optimize/_minimize.py:385: RuntimeWarning: Method BFGS does not use Hessian information (hess).\n", - " RuntimeWarning)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " Multinomial Logit Model Regression Results \n", - "===================================================================================\n", - "Dep. Variable: chosen No. 
Observations: 483\n", - "Model: Multinomial Logit Model Df Residuals: 480\n", - "Method: MLE Df Model: 3\n", - "Date: Thu, 10 Aug 2017 Pseudo R-squ.: 0.017\n", - "Time: 13:39:29 Pseudo R-bar-squ.: 0.015\n", - "AIC: 4,380.361 Log-Likelihood: -2,187.181\n", - "BIC: 4,392.901 LL-Null: -2,224.297\n", - "==================================================================================\n", - " coef std err z P>|z| [0.025 0.975]\n", - "----------------------------------------------------------------------------------\n", - "home_density 0.0139 0.004 3.330 0.001 0.006 0.022\n", - "work_density 0.0094 0.001 7.850 0.000 0.007 0.012\n", - "school_density 0.0151 0.004 3.818 0.000 0.007 0.023\n", - "==================================================================================\n", - "CPU times: user 12.6 s, sys: 12.4 s, total: 25.1 s\n", - "Wall time: 10.5 s\n" - ] - } - ], - "source": [ - "%%time\n", - "model_expression = OrderedDict([('home_density', 'all_same'),\n", - " ('work_density', 'all_same'),\n", - " ('school_density', 'all_same')])\n", - "\n", - "model = MultinomialLogit(data = merged.to_frame(),\n", - " observation_id_col = merged.observation_id_col,\n", - " alternative_id_col = merged.alternative_id_col,\n", - " choice_col = merged.choice_col,\n", - " model_expression = model_expression)\n", - "\n", - "results = model.fit()\n", - "print(results)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.0" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/notebooks/MNL-prediction-demo-02.ipynb b/notebooks/MNL-prediction-demo-02.ipynb deleted file mode 100644 index 2b3a783..0000000 --- a/notebooks/MNL-prediction-demo-02.ipynb +++ /dev/null @@ -1,766 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## MNL prediction demo\n", - "\n", - "Sam Maurer, August 2017 | Python 3.6\n", - "\n", - "Original version July 2017 (v01) \n", - "Updated July 2017 (v02) to include probabilities \n", - "Updated Aug 2017 (v02) to fix int/float problems \n", - "\n", - "### Summary\n", - "\n", - "This notebook demonstrates how to fit a model using the ChoiceModels interface and then use the UrbanSim MNL functions to generate probabilities and predictions. 
\n", - "\n", - "Eventually, a prediction interface will be incorporated into the `MultinomialLogitResults` object, but it's not there yet!\n", - "\n", - "This demo uses the estimation data that's set up in the `Data-prep-02` notebook." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/maurer/anaconda/lib/python3.6/site-packages/statsmodels/compat/pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.\n", - " from pandas.core import datetools\n" - ] - } - ], - "source": [ - "import numpy as np\n", - "import pandas as pd\n", - "\n", - "from patsy import dmatrix\n", - "\n", - "from choicemodels import mnl # could also import form urbansim.urbanchoice\n", - "from choicemodels import MultinomialLogit\n", - "from choicemodels.tools import MergedChoiceTable" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Load data from disk" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "1566\n", - " city home_density work_density school_density\n", - "full_tract_id \n", - "6001400100 BERKELEY 13.437961 13.130867 13.511570\n", - "6001400200 OAKLAND 11.089638 4.248928 0.894794\n", - "6001400300 OAKLAND 28.878399 7.671554 0.000000\n" - ] - } - ], - "source": [ - "tracts = pd.read_csv('../data/tracts_v02.csv').set_index('full_tract_id')\n", - "tracts = tracts.loc[(tracts.home_density > 0) | (tracts.work_density > 0) | (tracts.school_density > 0)]\n", - "\n", - "print(tracts.shape[0])\n", - "print(tracts.head(3))" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "35786\n", - " full_tract_id mode trip_distance_miles\n", - "place_id \n", - "10319850202 6095251902 5 5.125960\n", - "10335860102 6085511915 6 156.370628\n", - "10335860103 6085512027 6 1.615535\n" - ] - } - ], - "source": [ - "trips = pd.read_csv('../data/trips_v02.csv').set_index('place_id')\n", - "trips = trips.loc[trips.trip_distance_miles.notnull()]\n", - "\n", - "print(trips.shape[0])\n", - "print(trips.head(3))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Set up estimation table\n", - "\n", - "Each observed trip is a realized choice of a particular destination census tract. We can randomly sample alternative census tracts to build a model of destination choice.\n", - "\n", - "We'll divide the trips into a training set and a testing set, fit an MNL model using the training data, use it to generate predicted choices for the testing data, and compare the predicted to the actual choices." 
- ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "(100000, 9)\n", - "(3473300, 9)\n" - ] - } - ], - "source": [ - "training_observations = trips.iloc[:1000]\n", - "training = MergedChoiceTable(observations = training_observations,\n", - " alternatives = tracts,\n", - " chosen_alternatives = training_observations.full_tract_id,\n", - " sample_size = 100)\n", - "\n", - "testing_observations = trips.iloc[1000:]\n", - "testing = MergedChoiceTable(observations = testing_observations,\n", - " alternatives = tracts,\n", - " chosen_alternatives = testing_observations.full_tract_id,\n", - " sample_size = 100)\n", - "\n", - "print(training.to_frame().shape)\n", - "print(testing.to_frame().shape)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Fit a model using the training observations" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " CHOICEMODELS ESTIMATION RESULTS \n", - "===================================================================\n", - "Dep. Var.: chosen No. Observations: \n", - "Model: Multinomial Logit Df Residuals: \n", - "Method: Maximum Likelihood Df Model: \n", - "Date: Pseudo R-squ.: \n", - "Time: Pseudo R-bar-squ.: \n", - "AIC: Log-Likelihood: -4,506.216\n", - "BIC: LL-Null: -4,605.170\n", - "===================================================================\n", - " coef std err z P>|z| Conf. Int.\n", - "-------------------------------------------------------------------\n", - "home_density 0.0113 0.002 6.051 \n", - "work_density 0.0119 0.001 14.909 \n", - "school_density 0.0069 0.004 1.916 \n", - "===================================================================\n", - "CPU times: user 96.6 ms, sys: 55.7 ms, total: 152 ms\n", - "Wall time: 145 ms\n" - ] - } - ], - "source": [ - "%%time\n", - "model_expression = \"home_density + work_density + school_density - 1\"\n", - "\n", - "model = MultinomialLogit(data = training.to_frame(), \n", - " observation_id_col = training.observation_id_col, \n", - " choice_col = training.choice_col,\n", - " model_expression = model_expression)\n", - "\n", - "results = model.fit()\n", - "print(results)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Predict destination choices for the testing observations\n", - "\n", - "We'll use the UrbanSim MNL functions directly, because this hasn't been integrated into the ChoiceModels results classes yet. 
https://github.com/UDST/choicemodels/blob/master/choicemodels/mnl.py#L536" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "0 0.011321\n", - "1 0.011928\n", - "2 0.006929\n", - "Name: Coefficient, dtype: float64\n" - ] - } - ], - "source": [ - "# Pull the coefs out of the results object (the PyLogit syntax would be different)\n", - "\n", - "coefs = results.get_raw_results()['fit_parameters']['Coefficient']\n", - "print(coefs)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "(3473300, 3)\n", - " home_density work_density school_density\n", - "full_tract_id \n", - "6097150607 10.659461 6.868701 7.16003\n", - "6075020700 20.952573 4.410758 0.00000\n", - "6013319000 21.324330 9.745037 1.26180\n" - ] - } - ], - "source": [ - "# The data columns for prediction need to align with the coefficients; \n", - "# you can do this manually or with patsy, as here\n", - "\n", - "df = testing.to_frame().set_index('full_tract_id')\n", - "\n", - "testing_df = dmatrix(model_expression, data=df, return_type='dataframe')\n", - "print(testing_df.shape)\n", - "print(testing_df.head(3))" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "34733\n", - "[28 55 95 61 6]\n" - ] - } - ], - "source": [ - "# Simulate a destination choice for each testing observation\n", - "\n", - "choices = mnl.mnl_simulate(testing_df, coefs, numalts=100, returnprobs=False)\n", - "\n", - "print(len(choices))\n", - "print(choices[:5])" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "['a', 'd']\n" - ] - } - ], - "source": [ - "# Annoyingly, that identifies the choices by position rather than by id;\n", - "# here's a function to get the id's\n", - "\n", - "def get_chosen_ids(ids, positions):\n", - " \"\"\"\n", - " We observe N choice scenarios. In each, one of J alternatives is chosen.\n", - " We have a long (len N * J) list of the available alternatives. We have a \n", - " list (len N) of which alternatives were chosen, but it identifies them \n", - " by POSITION and we want their ID. 
\n", - " \n", - " Parameters\n", - " ----------\n", - " ids : list or list-like\n", - " List of alternative ID's (len N * J).\n", - " \n", - " positions : list or list-like\n", - " List of chosen alternatives by position (len N), where each entry is\n", - " an int in range [0, J)\n", - " \n", - " Returns\n", - " -------\n", - " chosen_ids : list\n", - " List of chosen alternatives by ID (len N)\n", - " \n", - " \"\"\"\n", - " N = len(positions)\n", - " J = len(ids) // N\n", - " \n", - " ids_by_obs = np.reshape(ids, (N,J))\n", - " return [ids_by_obs[i][positions[i]] for i in range(N)]\n", - " \n", - "\n", - "print(get_chosen_ids(['a','b','c','d'], [0,1]))" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "34733\n", - "[6013352102, 6081604200, 6075042800, 6001435601, 6001424001]\n" - ] - } - ], - "source": [ - "# Get tract id's for the simulated choices\n", - "\n", - "predicted_tracts = get_chosen_ids(testing_df.index.tolist(), choices)\n", - "\n", - "print(len(predicted_tracts))\n", - "print(predicted_tracts[:5])" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "34733\n", - "[6097150607, 6097153200, 6097151402, 6097151402, 6097151204]\n" - ] - } - ], - "source": [ - "# Get tract id's for observed choices\n", - "\n", - "df = testing.to_frame()\n", - "observed_tracts = df.loc[df.chosen == 1, 'full_tract_id'].tolist()\n", - "\n", - "print(len(observed_tracts))\n", - "print(observed_tracts[:5])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Compare the predicted choices to the observed ones\n", - "\n", - "Multinomial models are kind of tricky to validate. We don't expect the actual choices to match, because there are so many alternatives, but we do expect the characteristics of the predicted choices to be similar to the characteristics of the observed choices. \n", - "\n", - "Choose your own metric for this depending on what you're trying to evaluate! It's even plausible that the metric could be something not directly in the model, like the distance between the predicted and actual destination choices." - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "0.016439697118\n" - ] - } - ], - "source": [ - "# What portion of predicted destination choices were a perfect match?\n", - "# With an uninformative model we would expect 0.01, given that the \n", - "# observed choice is included in the 100 available alternatives.\n", - "\n", - "perfect_match = np.equal(predicted_tracts, observed_tracts)\n", - "print(sum(perfect_match)/len(perfect_match))" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "0.130483901201\n" - ] - } - ], - "source": [ - "# What's the correlation between employment density of the predicted and \n", - "# observed destinations? 
With an uninformative model we would expect 0.\n", - "\n", - "density_1 = pd.Series([tracts.loc[t,'work_density'] for t in predicted_tracts])\n", - "density_2 = pd.Series([tracts.loc[t,'work_density'] for t in observed_tracts])\n", - "\n", - "print(density_1.corr(density_2))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### How does UrbanSim generate household location choices?\n", - "\n", - "These three class methods collectively set up the choosers and alternatives according to various parameters like the sample size, prediction filters, \"probability mode,\" and \"choice mode\" (aggregate or individual):\n", - "\n", - "- `urbansim.models.MNLDiscreteChoiceModel.probabilities()` \n", - "- `urbansim.models.MNLDiscreteChoiceModel.summed_probabilities()` \n", - "- `urbansim.models.MNLDiscreteChoiceModel.predict()` \n", - "\n", - "https://github.com/UDST/urbansim/blob/master/urbansim/models/dcm.py#L474\n", - "\n", - "Then this lower-level function generates a table of probabilities for each alternative, which is passed back to the `MNLDiscreteChoiceModel` class for further processing:\n", - "\n", - "- `urbansim.urbanchoice.mnl.mnl_simulate()`\n", - "\n", - "https://github.com/UDST/urbansim/blob/master/urbansim/urbanchoice/mnl.py#L121" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": true - }, - "source": [ - "### Generate probabilities instead of predictions" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "0 0.011688\n", - "1 0.012314\n", - "2 0.007539\n", - "Name: Coefficient, dtype: float64\n", - "(3473300, 3)\n", - " home_density work_density school_density\n", - "full_tract_id \n", - "6097150607 10.659461 6.868701 7.16003\n", - "6075020700 20.952573 4.410758 0.00000\n", - "6013319000 21.324330 9.745037 1.26180\n" - ] - } - ], - "source": [ - "# Use coefs and testing dataset from above\n", - "\n", - "print(coefs)\n", - "print(testing_df.shape)\n", - "print(testing_df.head(3))" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "(34733, 100)\n", - "[[ 0.00971082 0.01006746 0.01090096 0.00783025 0.00947018]\n", - " [ 0.01075279 0.01775576 0.01048519 0.0089396 0.00777349]\n", - " [ 0.01134458 0.00807226 0.00781357 0.00986128 0.00917031]\n", - " [ 0.01152731 0.00816036 0.00913801 0.01034887 0.01152651]\n", - " [ 0.00783021 0.02165223 0.00972678 0.01013919 0.02457639]]\n" - ] - } - ], - "source": [ - "probs = mnl.mnl_simulate(testing_df, coefs, numalts=100, returnprobs=True)\n", - "\n", - "print(probs.shape)\n", - "print(probs[:5,:5])" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "chooser_id alternative_id\n", - "11485050104 6097150607 0.009711\n", - " 6075020700 0.010067\n", - " 6013319000 0.010901\n", - " 6075017902 0.007830\n", - " 6075042800 0.009470\n", - "dtype: float64\n" - ] - } - ], - "source": [ - "# Join probabilities to a multi-index of chooser and alternative id's\n", - "# 
Code adapted from UrbanSim: \n", - "# https://github.com/UDST/urbansim/blob/master/urbansim/models/dcm.py#L549-L556\n", - "\n", - "mi = pd.MultiIndex.from_arrays(\n", - " [testing.to_frame()[testing.observation_id_col], \n", - " testing.to_frame()[testing.alternative_id_col]],\n", - " names=('chooser_id', 'alternative_id'))\n", - "\n", - "probs_df = pd.Series(probs.flatten(), index=mi)\n", - "\n", - "print(probs_df.head())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Sum the probabilities\n", - "\n", - "Calculate the total probability associated with each alternative. This approach is adapted from UrbanSim. \n", - "\n", - "https://github.com/UDST/urbansim/blob/master/urbansim/models/dcm.py#L562-L597\n", - "\n", - "Conceptually, the fitted model implies a probability density function (PDF) for each agent choosing among a set of alternatives. Here we're summing the densities across agents to get a single density function that can serve as a proxy for the aggregate appeal of the alternatives.\n", - "\n", - "Important note! What we're actually creating here (I think) is PDFs over the alternatives sampled for each chooser. With random sampling, the sum will approximate a PDF over all the alternatives. Non-random sampling will alter the interpretation -- it's still a measure of aggregate appeal, but conditioned on the sampling procedure." - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "alternative_id\n", - "6001400100 0.364101\n", - "6001400200 0.213724\n", - "6001400300 0.352288\n", - "6001400400 0.332770\n", - "6001400500 0.258811\n", - "dtype: float64\n" - ] - } - ], - "source": [ - "# Code adapted from UrbanSim - For each chooser, normalize the probabilities so\n", - "# they sum to 1 (is this really necessary?). Then sum the probabilities associated\n", - "# with each alternative. 
I'm using the first 500 choosers for efficiency.\n", - "\n", - "def normalize(s):\n", - " return s / s.sum()\n", - "\n", - "summed_probs = probs_df[:50000].groupby(level=0).apply(normalize).groupby(level=1).sum()\n", - "\n", - "print(summed_probs.head())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.4" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/notebooks/Sampling-correction-01.ipynb b/notebooks/Sampling-correction-01.ipynb deleted file mode 100644 index 0c4d60a..0000000 --- a/notebooks/Sampling-correction-01.ipynb +++ /dev/null @@ -1,1110 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Sampling correction for large choice sets" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Sam Maurer, Nov. 21, 2016 (updated Dec. 6 to remove errors)\n", - "\n", - "1. Replicate synthetic data from Guevara & Ben-Akiva 2013\n", - "2. Do MNL with and without sampling correction\n", - "3. Check whether parameter estimates deviate from true values\n", - "4. Extend to Mixed Logit" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": true - }, - "source": [ - "## 1. 
Generate synthetic data set" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- N = 1000 observations\n", - "- J = 1000 alternatives for all observations (C_n = C)\n", - "- X = single attribute distributed Uniform(-2,1) for the first 500 alternatives and Uniform(-1,2) for the second half\n", - "- beta = generic linear taste coefficient, distributed Normal(mu=1.5, sigma=0.8) across the 1000 observations\n", - "- systematic utility = beta * X\n", - "- epsilon = error term distributed ExtremeValue(0,1)\n", - "- random utility = beta * X + epsilon\n", - "\n", - "Utility of alternative i for agent n:\n", - "$$ U_{in} = V_{in} + \\varepsilon_{in} = \\beta_n x_{i} + \\varepsilon_{in} $$\n", - "\n", - "Probability that agent n will choose alternative i:\n", - "$$ L_n(i \\mid \\beta_n, x_n,C_n) = \\frac {e^{V_{in}}} {\\sum_{j \\epsilon C_n} e^{V_{jn}}} $$" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "import numpy as np\n", - "import pandas as pd" - ] - }, - { - "cell_type": "code", - "execution_count": 187, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "50\n", - "[-1.53751147 0.22014909 -1.21005495 -0.39878182 -1.95627511]\n" - ] - } - ], - "source": [ - "# Generate attribute x for each of J alternatives\n", - "\n", - "# Set a seed for reproducibility\n", - "np.random.seed(12)\n", - "\n", - "# Start with J << 1000 to speed up runtimes\n", - "\n", - "J = 50 # alternatives\n", - "\n", - "Xa = 3 * np.random.rand(J/2) - 2 # uniform distribution over [-2, 1]\n", - "Xb = 3 * np.random.rand(J/2) - 1 # uniform distribution over [-1, 2]\n", - "\n", - "X = np.concatenate((Xa, Xb))\n", - "\n", - "print len(X)\n", - "print X[:5]" - ] - }, - { - "cell_type": "code", - "execution_count": 188, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "1000\n", - "[ 1.5 1.5 1.5 1.5 1.5]\n" - ] - } - ], - "source": [ - "# Generate taste coefficient beta for each of N agents \n", - "\n", - "# For regular MNL, i think we need to use a single value, instead of a \n", - "# distribution as Guevara & Ben-Akiva used for the mixture model\n", - "\n", - "N = 1000 # agents/observations\n", - "\n", - "beta = np.zeros(1000) + 1.5\n", - "# beta = 0.8 * np.random.randn(N) + 1.5\n", - "\n", - "print len(beta)\n", - "print beta[:5]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "print pd.DataFrame(beta).describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 190, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "(1000, 50)\n" - ] - } - ], - "source": [ - "# Generate probability matrix for N agents choosing among J alternatives\n", - "\n", - "def probs(n):\n", - " ''' \n", - " Return list of J probabilities for agent n\n", - " '''\n", - " b = beta[n]\n", - " exps = [np.exp(b*x) for x in X]\n", - " sum_exps = np.sum(exps)\n", - " return [exp/sum_exps for exp in exps]\n", - "\n", - "P = np.array([probs(n) for n in range(N)])\n", - " \n", - "print P.shape" - ] - }, - { - "cell_type": "code", - "execution_count": 191, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - 
"name": "stdout", - "output_type": "stream", - "text": [ - "[ 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n" - ] - } - ], - "source": [ - "# Check that each row sums to 1\n", - "\n", - "print np.sum(P, axis=1)[:10]" - ] - }, - { - "cell_type": "code", - "execution_count": 192, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "1000\n", - "[12, 41, 37, 5, 30, 27, 8, 35, 33, 6]\n" - ] - } - ], - "source": [ - "# Simulate a choice from J alternatives for each of N agents\n", - "\n", - "C = [np.random.choice(range(J), p=p) for p in P]\n", - "\n", - "print len(C)\n", - "print C[:10]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": true - }, - "source": [ - "#### Now we have data:\n", - "\n", - "- N agents/observations with true taste coefficients in array \"beta\"\n", - "- J alternatives with single attributes in array \"X\"\n", - "- N choice outcomes in array \"C\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 2. Estimate beta without sampling, using PyLogit MNL" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "import pylogit\n", - "from collections import OrderedDict" - ] - }, - { - "cell_type": "code", - "execution_count": 233, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "50000\n" - ] - } - ], - "source": [ - "# Set up an estimation dataset in long format\n", - "\n", - "d = [[n, i, int(C[n]==i), X[i]] for n in range(N) for i in range(J)]\n", - "\n", - "print len(d)" - ] - }, - { - "cell_type": "code", - "execution_count": 234, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " obs_id alt_id choice x\n", - "0 0 0 0 -1.537511\n", - "1 0 1 0 0.220149\n", - "2 0 2 0 -1.210055\n", - "3 0 3 0 -0.398782\n", - "4 0 4 0 -1.956275 \n", - "\n", - " obs_id alt_id choice x\n", - "count 50000.000000 50000.000000 50000.000000 50000.000000\n", - "mean 499.500000 24.500000 0.020000 0.014570\n", - "std 288.677877 14.431014 0.140001 1.116965\n", - "min 0.000000 0.000000 0.000000 -1.993222\n", - "25% 249.750000 12.000000 0.000000 -0.894495\n", - "50% 499.500000 24.500000 0.000000 0.220035\n", - "75% 749.250000 37.000000 0.000000 0.832675\n", - "max 999.000000 49.000000 1.000000 1.985414\n" - ] - } - ], - "source": [ - "df = pd.DataFrame(d, columns=['obs_id', 'alt_id', 'choice', 'x'])\n", - "\n", - "print df.head(), '\\n'\n", - "print df.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 235, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "# Set up model spec\n", - "\n", - "spec = OrderedDict([\n", - " ('x', [range(J)])\n", - " ])\n", - "\n", - "labels = OrderedDict([\n", - " ('x', ['beta_x'])\n", - " ])" - ] - }, - { - "cell_type": "code", - "execution_count": 236, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Log-likelihood at zero: -3,912.0230\n", - "Initial Log-likelihood: -3,912.0230\n", - "Estimation Time: 0.07 seconds.\n", - "Final log-likelihood: -3,065.1983\n", - " Multinomial Logit Model Regression Results \n", - "===================================================================================\n", - 
"Dep. Variable: choice No. Observations: 1,000\n", - "Model: Multinomial Logit Model Df Residuals: 999\n", - "Method: MLE Df Model: 1\n", - "Date: Mon, 21 Nov 2016 Pseudo R-squ.: 0.216\n", - "Time: 13:52:41 Pseudo R-bar-squ.: 0.216\n", - "converged: True Log-Likelihood: -3,065.198\n", - " LL-Null: -3,912.023\n", - "==============================================================================\n", - " coef std err z P>|z| [95.0% Conf. Int.]\n", - "------------------------------------------------------------------------------\n", - "beta_x 1.5324 0.046 33.649 0.000 1.443 1.622\n", - "==============================================================================\n", - "CPU times: user 7.7 s, sys: 14.1 s, total: 21.8 s\n", - "Wall time: 14.4 s\n" - ] - } - ], - "source": [ - "%%time\n", - "m = pylogit.create_choice_model(data = df, \n", - " alt_id_col = 'alt_id', \n", - " obs_id_col = 'obs_id', \n", - " choice_col = 'choice', \n", - " specification = spec, \n", - " model_type = \"MNL\", \n", - " names = labels)\n", - "\n", - "m.fit_mle(init_vals = np.array([0]))\n", - "print m.get_statsmodels_summary()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 3. Try with UrbanSim MNL instead of PyLogit\n", - "\n", - "Model class: https://github.com/UDST/urbansim/blob/master/urbansim/models/dcm.py\n", - "\n", - "Estimation algorithms: https://github.com/UDST/urbansim/blob/master/urbansim/urbanchoice/mnl.py" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "from urbansim.models import MNLDiscreteChoiceModel" - ] - }, - { - "cell_type": "code", - "execution_count": 178, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "1000\n" - ] - } - ], - "source": [ - "# Choosers should be a DataFrame of characteristics, with index as identifier\n", - "\n", - "d = [[n, C[n]] for n in range(N)]\n", - "\n", - "choosers = pd.DataFrame(d, columns=['id', 'choice']).set_index('id')\n", - "\n", - "print len(choosers)" - ] - }, - { - "cell_type": "code", - "execution_count": 179, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "50\n" - ] - } - ], - "source": [ - "# Alternatives should be a DataFrame of characteristics, with index as identifier\n", - "\n", - "d = [[i, X[i]] for i in range(J)]\n", - "\n", - "alts = pd.DataFrame(d, columns=['id', 'x']).set_index('id')\n", - "\n", - "print len(alts)" - ] - }, - { - "cell_type": "code", - "execution_count": 180, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Null Log-liklihood: -3891.820\n", - "Log-liklihood at convergence: -3077.869\n", - "Log-liklihood Ratio: 0.209\n", - "\n", - "+-----------+-------------+------------+---------+\n", - "| Component | Coefficient | Std. 
Error | T-Score |\n", - "+-----------+-------------+------------+---------+\n", - "| x | 1.527 | 0.022 | 69.267 |\n", - "+-----------+-------------+------------+---------+\n", - "CPU times: user 104 ms, sys: 9.03 ms, total: 113 ms\n", - "Wall time: 89.4 ms\n" - ] - } - ], - "source": [ - "%%time\n", - "\n", - "# It seems like this implementation *requires* us to sample the alternatives, \n", - "# so here i'm estimating the model with J-1 alts\n", - "\n", - "m = MNLDiscreteChoiceModel(model_expression = 'x',\n", - " sample_size = J-1)\n", - "\n", - "m.fit(choosers = choosers,\n", - " alternatives = alts,\n", - " current_choice = 'choice')\n", - "\n", - "m.report_fit()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": true - }, - "source": [ - "## 4. MNL, sampling alternatives without correction\n", - "\n", - "(NB - with random sampling, no correction is needed)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# In the estimation dataset, for each observation include a row for the\n", - "# chosen alternative, plus K-1 other alternatives sampled randomly\n", - "# without replacement, where K < J." - ] - }, - { - "cell_type": "code", - "execution_count": 194, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[12, 23, 7]\n" - ] - } - ], - "source": [ - "K = 3\n", - "\n", - "def alts(obs_id):\n", - " \"\"\"\n", - " Sample alternatives for observation `obs_id`. Expects `J` total\n", - " alts, `K` sampled alts, list `C` of choice outcomes. Returns list \n", - " of K alt id's including the chosen one.\n", - " \"\"\"\n", - " chosen = C[obs_id] # id of chosen alternative\n", - " unchosen = [i for i in range(J) if chosen != i] # id's of J-1 unchosen alts\n", - " sample_unchosen = np.random.choice(unchosen, size=K-1, replace=False)\n", - " return [chosen] + sample_unchosen.tolist()\n", - " \n", - "print alts(0)" - ] - }, - { - "cell_type": "code", - "execution_count": 195, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "3000\n" - ] - } - ], - "source": [ - "# Set up the estimation dataset\n", - "\n", - "d = [[n, i, int(C[n]==i), X[i]] for n in range(N) for i in alts(n)]\n", - "\n", - "print len(d)" - ] - }, - { - "cell_type": "code", - "execution_count": 196, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " obs_id alt_id choice x\n", - "0 0 12 1 0.832675\n", - "1 0 3 0 -0.398782\n", - "2 0 35 0 1.850941\n", - "3 1 41 1 1.985414\n", - "4 1 45 0 0.272157 \n", - "\n", - " obs_id alt_id choice x\n", - "count 3000.000000 3000.000000 3000.000000 3000.000000\n", - "mean 499.500000 26.898000 0.333333 0.446787\n", - "std 288.723115 13.974669 0.471483 1.170677\n", - "min 0.000000 0.000000 0.000000 -1.993222\n", - "25% 249.750000 14.000000 0.000000 -0.181750\n", - "50% 499.500000 29.000000 0.000000 0.413689\n", - "75% 749.250000 38.000000 1.000000 1.448505\n", - "max 999.000000 49.000000 1.000000 1.985414\n" - ] - } - ], - "source": [ - "df = pd.DataFrame(d, columns=['obs_id', 'alt_id', 'choice', 'x'])\n", - "\n", - "print df.head(), '\\n'\n", - "print df.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 150, - "metadata": { - 
"collapsed": true - }, - "outputs": [], - "source": [ - "# Same model spec as before\n", - "\n", - "spec = OrderedDict([\n", - " ('x', [range(J)])\n", - " ])\n", - "\n", - "labels = OrderedDict([\n", - " ('x', ['beta_x'])\n", - " ])" - ] - }, - { - "cell_type": "code", - "execution_count": 151, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Log-likelihood at zero: -1,098.6123\n", - "Initial Log-likelihood: -1,098.6123\n", - "Estimation Time: 0.02 seconds.\n", - "Final log-likelihood: -585.7551\n", - " Multinomial Logit Model Regression Results \n", - "===================================================================================\n", - "Dep. Variable: choice No. Observations: 1,000\n", - "Model: Multinomial Logit Model Df Residuals: 999\n", - "Method: MLE Df Model: 1\n", - "Date: Sun, 20 Nov 2016 Pseudo R-squ.: 0.467\n", - "Time: 14:37:24 Pseudo R-bar-squ.: 0.466\n", - "converged: True Log-Likelihood: -585.755\n", - " LL-Null: -1,098.612\n", - "==============================================================================\n", - " coef std err z P>|z| [95.0% Conf. Int.]\n", - "------------------------------------------------------------------------------\n", - "beta_x 1.6151 0.077 20.888 0.000 1.464 1.767\n", - "==============================================================================\n", - "CPU times: user 303 ms, sys: 41.3 ms, total: 344 ms\n", - "Wall time: 226 ms\n" - ] - } - ], - "source": [ - "%%time\n", - "m = pylogit.create_choice_model(data = df, \n", - " alt_id_col = 'alt_id', \n", - " obs_id_col = 'obs_id', \n", - " choice_col = 'choice', \n", - " specification = spec, \n", - " model_type = \"MNL\", \n", - " names = labels)\n", - "\n", - "m.fit_mle(init_vals = np.array([0]))\n", - "print m.get_statsmodels_summary()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Run 1000x with different samples" - ] - }, - { - "cell_type": "code", - "execution_count": 152, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "def estimate_beta():\n", - " d = [[n, i, int(C[n]==i), X[i]] for n in range(N) for i in alts(n)]\n", - " df = pd.DataFrame(d, columns=['obs_id', 'alt_id', 'choice', 'x'])\n", - " m = pylogit.create_choice_model(df, 'alt_id', 'obs_id', 'choice', spec, 'MNL', names=labels)\n", - " m.fit_mle(init_vals = np.array([0]))\n", - " return m.params.beta_x" - ] - }, - { - "cell_type": "code", - "execution_count": 218, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "%%capture\n", - "\n", - "beta = []\n", - "for i in range(1000):\n", - " beta.append(estimate_beta())" - ] - }, - { - "cell_type": "code", - "execution_count": 219, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "count 1000.000000\n", - "mean 1.508913\n", - "std 0.052854\n", - "min 1.322523\n", - "25% 1.471155\n", - "50% 1.507724\n", - "75% 1.545232\n", - "max 1.674443\n", - "dtype: float64\n" - ] - } - ], - "source": [ - "print pd.Series(beta).describe()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 5. 
MNL with sampling correction" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Utility of alternative j:\n", - "$$ V_{j} = \\beta x_{j} $$\n", - "\n", - "With sampling, we have to account for the restricted choice set (from Eq 6 in Guevara & Ben-Akiva 2013):\n", - "\n", - "$$ V_j = \\beta x_j + \\ln \\pi(D \\mid j) $$\n", - "\n", - "Where pi is the conditional probability that we would construct the choice set D given that alternative j was chosen. This goes into the likelihood function in both the numerator and denominator.\n", - "\n", - "$$ L_n = \\frac {exp(\\beta x_i + \\ln \\pi(D_n \\mid i))} {\\sum_{j \\epsilon D_n} exp(\\beta x_j + \\ln \\pi(D_n \\mid j))} $$\n", - "\n", - "How to calculate pi? From the original formulation of this in McFadden 1978: \"Suppose D is comprized of i plus a sample of alternatives from the set C\\\\{i}, obtained by considering each element of this set independently, and including it with probability p. Then, the probability of D will depend solely on the number of elements K it contains.\"\n", - "\n", - "$$ \\pi(D) = p^{K-1} (1 - p)^{J-K} $$\n", - "\n", - "(?? Without replacement, i think it should be the n-choose-k binomial coefficient, where n=J-1 and k=K-1)\n", - "\n", - "$$ \\pi(D) = {n \\choose k} = \\frac {(K-1)!(J-K)!} {(J-1)!} $$\n" - ] - }, - { - "cell_type": "code", - "execution_count": 197, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " obs_id alt_id choice x const\n", - "0 0 12 1 0.832675 1\n", - "1 0 24 0 -1.070307 1\n", - "2 0 4 0 -1.956275 1\n", - "3 1 41 1 1.985414 1\n", - "4 1 26 0 0.413689 1 \n", - "\n", - " obs_id alt_id choice x const\n", - "count 3000.000000 3000.000000 3000.000000 3000.000000 3000.0\n", - "mean 499.500000 26.777667 0.333333 0.438763 1.0\n", - "std 288.723115 13.883149 0.471483 1.180510 0.0\n", - "min 0.000000 0.000000 0.000000 -1.993222 1.0\n", - "25% 249.750000 15.000000 0.000000 -0.343887 1.0\n", - "50% 499.500000 29.000000 0.000000 0.413689 1.0\n", - "75% 749.250000 38.000000 1.000000 1.448505 1.0\n", - "max 999.000000 49.000000 1.000000 1.985414 1.0\n" - ] - } - ], - "source": [ - "# Add a column in the estimation data for the constant\n", - "\n", - "d = [[n, i, int(C[n]==i), X[i], 1] for n in range(N) for i in alts(n)]\n", - "\n", - "df = pd.DataFrame(d, columns=['obs_id', 'alt_id', 'choice', 'x', 'const'])\n", - "\n", - "print df.head(), '\\n'\n", - "print df.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 198, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "spec2 = OrderedDict([\n", - " ('x', [range(J)]),\n", - " ('const', [range(J)])\n", - " ])\n", - "\n", - "labels2 = OrderedDict([\n", - " ('x', ['beta_x']),\n", - " ('const', ['constant'])\n", - " ])" - ] - }, - { - "cell_type": "code", - "execution_count": 232, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "0.5" - ] - }, - "execution_count": 232, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Try binomial formula\n", - "\n", - "j=3\n", - "k=2\n", - "\n", - "fact = np.math.factorial\n", - "\n", - "float(fact(k-1)*fact(j-k))/fact(j-1)" - ] - }, - { - "cell_type": "code", - "execution_count": 225, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Log-likelihood at zero: -1,098.6123\n", - "Initial Log-likelihood: -1,098.6123\n", - "Estimation Time: 0.02 
seconds.\n", - "Final log-likelihood: -613.3560\n", - " Multinomial Logit Model Regression Results \n", - "===================================================================================\n", - "Dep. Variable: choice No. Observations: 1,000\n", - "Model: Multinomial Logit Model Df Residuals: 998\n", - "Method: MLE Df Model: 2\n", - "Date: Mon, 21 Nov 2016 Pseudo R-squ.: 0.442\n", - "Time: 13:47:43 Pseudo R-bar-squ.: 0.440\n", - "converged: True Log-Likelihood: -613.356\n", - " LL-Null: -1,098.612\n", - "==============================================================================\n", - " coef std err z P>|z| [95.0% Conf. Int.]\n", - "------------------------------------------------------------------------------\n", - "beta_x 1.5376 0.075 20.586 0.000 1.391 1.684\n", - "constant -7.0699 1.31e+07 -5.39e-07 1.000 -2.57e+07 2.57e+07\n", - "==============================================================================\n", - "CPU times: user 325 ms, sys: 29.7 ms, total: 355 ms\n", - "Wall time: 237 ms\n" - ] - } - ], - "source": [ - "%%time\n", - "m = pylogit.create_choice_model(data = df, \n", - " alt_id_col = 'alt_id', \n", - " obs_id_col = 'obs_id', \n", - " choice_col = 'choice', \n", - " specification = spec2, \n", - " model_type = \"MNL\", \n", - " names = labels2)\n", - "\n", - "# p = float(K-1)/(J-1)\n", - "# const = np.log(p**(K-1) * (1-p)**(J-K))\n", - "\n", - "const = np.log(float(fact(K-1)*fact(J-K))/fact(J-1))\n", - "\n", - "# Add an initial value for the constant and constrain it to that\n", - "m.fit_mle(init_vals = np.array([0, const]), \n", - " constrained_pos=[1])\n", - "\n", - "print m.get_statsmodels_summary()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Run 1000x with different samples" - ] - }, - { - "cell_type": "code", - "execution_count": 213, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# try binomial formula\n", - "const = np.log(float(fact(K-1)*fact(J-K))/fact(J-1))\n", - "\n", - "def estimate_beta_with_correction():\n", - " d = [[n, i, int(C[n]==i), X[i], 1] for n in range(N) for i in alts(n)]\n", - " df = pd.DataFrame(d, columns=['obs_id', 'alt_id', 'choice', 'x', 'const'])\n", - " m = pylogit.create_choice_model(df, 'alt_id', 'obs_id', 'choice', spec2, 'MNL', names=labels2)\n", - " m.fit_mle(init_vals = np.array([0, const]), constrained_pos=[1])\n", - " return m.params.beta_x" - ] - }, - { - "cell_type": "code", - "execution_count": 216, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "%%capture\n", - "\n", - "beta = []\n", - "for i in range(1000):\n", - " beta.append(estimate_beta_with_correction())" - ] - }, - { - "cell_type": "code", - "execution_count": 217, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "count 1000.000000\n", - "mean 1.513490\n", - "std 0.051725\n", - "min 1.354507\n", - "25% 1.477341\n", - "50% 1.512756\n", - "75% 1.548081\n", - "max 1.736557\n", - "dtype: float64\n" - ] - } - ], - "source": [ - "print pd.Series(beta).describe()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "NB - the correction isn't needed for the random sampling case, but we can adapt this code for stratified sampling later on" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - 
"source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 2", - "language": "python", - "name": "python2" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 2 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.11" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} diff --git a/notebooks/Sampling-correction-02.ipynb b/notebooks/Sampling-correction-02.ipynb deleted file mode 100644 index 75ee428..0000000 --- a/notebooks/Sampling-correction-02.ipynb +++ /dev/null @@ -1,1141 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Sampling correction for large choice sets" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Sam Maurer, Dec. 1, 2016\n", - "\n", - "1. Replicate synthetic data from Guevara & Ben-Akiva 2013\n", - "2. Do MNL with and without sampling correction\n", - "3. Check whether parameter estimates deviate from true values\n", - "4. Extend to Mixed Logit" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": true - }, - "source": [ - "## 1. Generate synthetic data set" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- numobs (N) = 1000 observations\n", - "- numalts (J) = 1000 alternatives for all observations (choiceset_n = choiceset, C_n = C)\n", - "- X = single attribute distributed Uniform(-2,1) for the first 500 alternatives and Uniform(-1,2) for the second half\n", - "- beta = generic linear taste coefficient, distributed Normal(mu=1.5, sigma=0.8) across the 1000 observations\n", - "- systematic utility = beta * X\n", - "- epsilon = error term distributed ExtremeValue(0,1)\n", - "- random utility = beta * X + epsilon\n", - "\n", - "Utility of alternative i for agent n:\n", - "$$ U_{in} = V_{in} + \\varepsilon_{in} = \\beta_n x_{i} + \\varepsilon_{in} $$\n", - "\n", - "Probability that agent n will choose alternative i:\n", - "$$ L_n(i \\mid \\beta_n, x_n,C_n) = \\frac {e^{V_{in}}} {\\sum_{j \\epsilon C_n} e^{V_{jn}}} $$" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 43, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "import numpy as np\n", - "import pandas as pd\n", - "\n", - "# Set a seed so that the random numbers will be reproducible\n", - "np.random.seed(12)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Generate attributes x1, x2 for each of numalts (J) alternatives" - ] - }, - { - "cell_type": "code", - "execution_count": 134, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "# For now, J << 1000 alternatives to speed up runtimes\n", - "numalts = 50\n", - "\n", - "def rand(len, min, max):\n", - " \"\"\" Generate `len` random floats uniformly distributed from `min` to `max` \"\"\"\n", - " return (max - min) * np.random.rand(len) + min\n", - "\n", - "# Attribute x is uniformly distributed over [-2, 1] for half the alternatives\n", - "# and over [-1, 2] for the other half, as in Guevara & Ben-Akiva\n", - "\n", - "# 
X = np.concatenate((rand(numalts/2, -2, 1), rand(numalts/2, -1, 2)))\n", - "\n", - "# Or, attribute x is uniformly distributed over [0, 10] for half the alternatives\n", - "# and over [100, 110] for the other half, to induce bias in estimation\n", - "\n", - "X = np.concatenate((rand(numalts/2, 0, 10), rand(numalts/2, 100, 110)))" - ] - }, - { - "cell_type": "code", - "execution_count": 135, - "metadata": { - "collapsed": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " 0\n", - "count 25.000000\n", - "mean 5.782730\n", - "std 2.903692\n", - "min 0.233499\n", - "25% 3.436553\n", - "50% 6.561771\n", - "75% 7.527760\n", - "max 9.956289\n", - " 0\n", - "count 25.000000\n", - "mean 105.373296\n", - "std 3.202724\n", - "min 100.563383\n", - "25% 102.109451\n", - "50% 105.497276\n", - "75% 108.062617\n", - "max 109.884905\n" - ] - } - ], - "source": [ - "print pd.DataFrame(X[:numalts/2]).describe()\n", - "print pd.DataFrame(X[numalts/2:]).describe()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Generate taste coefficient beta for each of numobs (N) agents " - ] - }, - { - "cell_type": "code", - "execution_count": 57, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "# For regular MNL, use a single value instead of a distribution as \n", - "# Guevara & Ben-Akiva used for the mixture model\n", - "\n", - "numobs = 1000 # agents/observations\n", - "\n", - "beta = np.zeros(1000) + 1.5\n", - "# beta = 0.8 * np.random.randn(numobs) + 1.5" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "print pd.DataFrame(beta).describe()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Simulate a choice from numalts (J) alternatives for each of numobs (N) agents" - ] - }, - { - "cell_type": "code", - "execution_count": 136, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "1000\n", - "50\n" - ] - } - ], - "source": [ - "# Generate a utility matrix for N agents choosing among J alternatives\n", - "\n", - "U = [[beta[n]*x + np.random.gumbel() for x in X] for n in range(numobs)]\n", - " \n", - "print len(U)\n", - "print len(U[0])" - ] - }, - { - "cell_type": "code", - "execution_count": 137, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "1000\n", - "[28, 32, 33, 33, 33, 43, 32, 43, 32, 37]\n" - ] - } - ], - "source": [ - "# Each agent chooses the alternative with highest utility\n", - "\n", - "choices = [np.argmax(a) for a in U]\n", - "\n", - "print len(choices)\n", - "print choices[:10]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": true - }, - "source": [ - "#### Now we have data:\n", - "\n", - "- N agents/observations with true taste coefficients in array \"`beta`\"\n", - "- J alternatives with single attributes in array \"`X`\"\n", - "- N choice outcomes in array \"`choices`\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 2. 
Estimate beta without sampling, using PyLogit MNL" - ] - }, - { - "cell_type": "code", - "execution_count": 49, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "import pylogit\n", - "from collections import OrderedDict" - ] - }, - { - "cell_type": "code", - "execution_count": 174, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "# Set up the estimation dataset in long format\n", - "\n", - "d = [[n, i, int(choices[n]==i), X[i]] for n in range(numobs) for i in range(numalts)]\n", - "df = pd.DataFrame(d, columns=['obs_id', 'alt_id', 'chosen', 'x'])" - ] - }, - { - "cell_type": "code", - "execution_count": 97, - "metadata": { - "collapsed": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " obs_id alt_id chosen x\n", - "0 0 0 0 1.699728\n", - "1 0 1 0 2.530486\n", - "2 0 2 0 7.104747\n", - "3 0 3 0 5.721117\n", - "4 0 4 0 6.125892 \n", - "\n", - " obs_id alt_id chosen x\n", - "count 50000.000000 50000.000000 50000.000000 50000.000000\n", - "mean 499.500000 24.500000 0.020000 55.555265\n", - "std 288.677877 14.431014 0.140001 50.140919\n", - "min 0.000000 0.000000 0.000000 1.611696\n", - "25% 249.750000 12.000000 0.000000 6.127162\n", - "50% 499.500000 24.500000 0.000000 54.888537\n", - "75% 749.250000 37.000000 0.000000 105.748896\n", - "max 999.000000 49.000000 1.000000 109.778827\n" - ] - } - ], - "source": [ - "print df.head(), '\\n'\n", - "print df.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 168, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "# Set up reusable model spec\n", - "\n", - "spec = OrderedDict([('x', 'all_same')])\n", - "labels = OrderedDict([('x', 'beta_x')])" - ] - }, - { - "cell_type": "code", - "execution_count": 172, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# Set up reusable code to estimate a model\n", - "\n", - "def estimate_model(init_val):\n", - " \"\"\"\n", - " Initialize and fit a model, returning it as an object. Will use the \n", - " current values of `df`, `spec`, and `labels`.\n", - " \"\"\"\n", - " m = pylogit.create_choice_model(data = df, \n", - " alt_id_col = 'alt_id', \n", - " obs_id_col = 'obs_id', \n", - " choice_col = 'chosen', \n", - " specification = spec, \n", - " model_type = \"MNL\", \n", - " names = labels)\n", - "\n", - " m.fit_mle(init_vals = np.array([init_val]))\n", - " return m" - ] - }, - { - "cell_type": "code", - "execution_count": 175, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Log-likelihood at zero: -3,912.0230\n", - "Initial Log-likelihood: -1,823.3647\n", - "Estimation Time: 0.17 seconds.\n", - "Final log-likelihood: -1,813.8248\n", - " Multinomial Logit Model Regression Results \n", - "===================================================================================\n", - "Dep. Variable: chosen No. Observations: 1,000\n", - "Model: Multinomial Logit Model Df Residuals: 999\n", - "Method: MLE Df Model: 1\n", - "Date: Sun, 11 Dec 2016 Pseudo R-squ.: 0.536\n", - "Time: 19:42:20 Pseudo R-bar-squ.: 0.536\n", - "converged: True Log-Likelihood: -1,813.825\n", - " LL-Null: -3,912.023\n", - "==============================================================================\n", - " coef std err z P>|z| [95.0% Conf. 
Int.]\n", - "------------------------------------------------------------------------------\n", - "beta_x 1.4422 0.060 24.067 0.000 1.325 1.560\n", - "==============================================================================\n", - "CPU times: user 8.37 s, sys: 15.4 s, total: 23.7 s\n", - "Wall time: 15.1 s\n" - ] - } - ], - "source": [ - "%%time\n", - "m = estimate_model(init_val = 1.2)\n", - "print m.get_statsmodels_summary()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This looks good: it's very close to the true beta of 1.5." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 3a. Estimate beta with random sampling of alternatives\n", - "\n", - "This should produce an unbiased estimate of beta." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# In the estimation dataset, for each observation include a row for the\n", - "# chosen alternative, plus K-1 other alternatives sampled randomly\n", - "# without replacement, where K < J.\n", - "\n", - "# Some more notation:\n", - "# - true choice set C = range(J)\n", - "# - restricted choice set D_n is a subset of C, where len(D_n) = K" - ] - }, - { - "cell_type": "code", - "execution_count": 154, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[ 3 12 13 28 38]\n" - ] - } - ], - "source": [ - "# TO DO - rewrite to use sampling weights\n", - "\n", - "def alts(obs_id, C, K):\n", - " \"\"\"\n", - " This function generates a restricted choice set D for a particular\n", - " observation. Expects list `C` of alternatives to sample from (either\n", - " the full choice set or a stratum), int `K` alternatives to sample,\n", - " and list `choices` of the alt_id chosen for each obs_id. 
Returns list \n", - " of K alt_id's including the chosen one.\n", - " \"\"\"\n", - " chosen = choices[obs_id] # id of chosen alternative\n", - " unchosen = [i for i in C if chosen != i] # id's of unchosen alts\n", - " sample_unchosen = np.random.choice(unchosen, size=K-1, replace=False).tolist()\n", - " return np.sort([chosen] + sample_unchosen)\n", - " \n", - "print alts(0, range(numalts), 5)" - ] - }, - { - "cell_type": "code", - "execution_count": 176, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "# Set up the estimation dataset, which can use the same spec as earlier\n", - "\n", - "C = range(numalts) # choice set to sample from\n", - "K = 10\n", - "\n", - "d = [[n, i, int(choices[n]==i), X[i]] for n in range(numobs) for i in alts(n, C, K)]\n", - "df = pd.DataFrame(d, columns=['obs_id', 'alt_id', 'chosen', 'x'])" - ] - }, - { - "cell_type": "code", - "execution_count": 146, - "metadata": { - "collapsed": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " obs_id alt_id chosen x\n", - "0 0 2 0 8.730367\n", - "1 0 11 0 2.855760\n", - "2 0 18 0 9.956289\n", - "3 0 28 1 108.045363\n", - "4 0 30 0 105.386425 \n", - "\n", - " obs_id alt_id chosen x\n", - "count 10000.000000 10000.00000 10000.000000 10000.000000\n", - "mean 499.500000 25.51300 0.100000 60.170736\n", - "std 288.689425 14.29645 0.300015 50.009368\n", - "min 0.000000 0.00000 0.000000 0.233499\n", - "25% 249.750000 13.00000 0.000000 6.797005\n", - "50% 499.500000 27.00000 0.000000 100.970300\n", - "75% 749.250000 37.00000 0.000000 106.140668\n", - "max 999.000000 49.00000 1.000000 109.884905\n" - ] - } - ], - "source": [ - "print df.head(), '\\n'\n", - "print df.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 177, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Log-likelihood at zero: -2,302.5851\n", - "Initial Log-likelihood: -585.5314\n", - "Estimation Time: 0.01 seconds.\n", - "Final log-likelihood: -578.0528\n", - " Multinomial Logit Model Regression Results \n", - "===================================================================================\n", - "Dep. Variable: chosen No. Observations: 1,000\n", - "Model: Multinomial Logit Model Df Residuals: 999\n", - "Method: MLE Df Model: 1\n", - "Date: Sun, 11 Dec 2016 Pseudo R-squ.: 0.749\n", - "Time: 19:42:57 Pseudo R-bar-squ.: 0.749\n", - "converged: True Log-Likelihood: -578.053\n", - " LL-Null: -2,302.585\n", - "==============================================================================\n", - " coef std err z P>|z| [95.0% Conf. 
Int.]\n", - "------------------------------------------------------------------------------\n", - "beta_x 1.4855 0.081 18.243 0.000 1.326 1.645\n", - "==============================================================================\n", - "CPU times: user 594 ms, sys: 380 ms, total: 974 ms\n", - "Wall time: 657 ms\n" - ] - } - ], - "source": [ - "%%time\n", - "m = estimate_model(init_val = 1.2)\n", - "print m.get_statsmodels_summary()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Run 100x with different samples of alternatives" - ] - }, - { - "cell_type": "code", - "execution_count": 178, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 1min 33s, sys: 38 s, total: 2min 11s\n", - "Wall time: 1min 14s\n" - ] - } - ], - "source": [ - "%%time\n", - "%%capture\n", - "\n", - "beta = []\n", - "C = range(numalts)\n", - "K = 10\n", - "\n", - "for i in range(100):\n", - " d = [[n, i, int(choices[n]==i), X[i]] for n in range(numobs) for i in alts(n, C, K)]\n", - " df = pd.DataFrame(d, columns=['obs_id', 'alt_id', 'chosen', 'x'])\n", - " m = estimate_model(init_val = 1.2)\n", - " beta.append(m.params.beta_x)" - ] - }, - { - "cell_type": "code", - "execution_count": 179, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "count 100.000000\n", - "mean 1.453900\n", - "std 0.042923\n", - "min 1.371329\n", - "25% 1.426357\n", - "50% 1.450013\n", - "75% 1.484759\n", - "max 1.557182\n", - "dtype: float64\n" - ] - } - ], - "source": [ - "print pd.Series(beta).describe()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Looks unbiased, as expected. It's very close to the true beta of 1.5." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": true - }, - "source": [ - "## 3b. Estimate beta with over-sampling of *irrelevant* alternatives\n", - "\n", - "This should produce a biased estimate of beta, until we add a correction to the estimation procedure." - ] - }, - { - "cell_type": "code", - "execution_count": 187, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "# Recall that half the values of x are in the range [0, 10] and half are\n", - "# in the range [100, 110]. The taste coefficient is positive, so the first\n", - "# set of alternatives is much less relevant than the second set. 
\n", - "\n", - "C = range(numalts/2) # alternatives to sample from\n", - "K = 10\n", - "\n", - "d = [[n, i, int(choices[n]==i), X[i]] for n in range(numobs) for i in alts(n, C, K)]\n", - "df = pd.DataFrame(d, columns=['obs_id', 'alt_id', 'chosen', 'x'])" - ] - }, - { - "cell_type": "code", - "execution_count": 188, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " obs_id alt_id chosen x\n", - "0 0 1 0 7.427010\n", - "1 0 3 0 9.930585\n", - "2 0 5 0 3.436553\n", - "3 0 9 0 6.558043\n", - "4 0 11 0 2.855760 \n", - "\n", - " obs_id alt_id chosen x\n", - "count 10000.000000 10000.000000 10000.000000 10000.000000\n", - "mean 499.500000 14.463900 0.100000 16.177211\n", - "std 288.689425 10.195033 0.300015 31.248750\n", - "min 0.000000 0.000000 0.000000 0.233499\n", - "25% 249.750000 6.000000 0.000000 3.436553\n", - "50% 499.500000 13.000000 0.000000 6.797005\n", - "75% 749.250000 20.000000 0.000000 8.371036\n", - "max 999.000000 45.000000 1.000000 109.884905\n" - ] - } - ], - "source": [ - "print df.head(), '\\n'\n", - "print df.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 189, - "metadata": { - "collapsed": false, - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Log-likelihood at zero: -2,302.5851\n", - "Initial Log-likelihood: 0.0000\n", - "Estimation Time: 0.00 seconds.\n", - "Final log-likelihood: 0.0000\n", - " Multinomial Logit Model Regression Results \n", - "===================================================================================\n", - "Dep. Variable: chosen No. Observations: 1,000\n", - "Model: Multinomial Logit Model Df Residuals: 999\n", - "Method: MLE Df Model: 1\n", - "Date: Sun, 11 Dec 2016 Pseudo R-squ.: 1.000\n", - "Time: 20:00:33 Pseudo R-bar-squ.: 1.000\n", - "converged: True Log-Likelihood: 0.000\n", - " LL-Null: -2,302.585\n", - "==============================================================================\n", - " coef std err z P>|z| [95.0% Conf. 
Int.]\n", - "------------------------------------------------------------------------------\n", - "beta_x 1.5000 nan nan nan nan nan\n", - "==============================================================================\n", - "CPU times: user 635 ms, sys: 373 ms, total: 1.01 s\n", - "Wall time: 674 ms\n" - ] - } - ], - "source": [ - "%%time\n", - "m = estimate_model(init_val = 1.5)\n", - "print m.get_statsmodels_summary()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 5. MNL with sampling correction" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Utility of alternative j:\n", - "$$ V_{j} = \\beta x_{j} $$\n", - "\n", - "With sampling, we have to account for the restricted choice set (from Eq 6 in Guevara & Ben-Akiva 2013):\n", - "\n", - "$$ V_j = \\beta x_j + \\ln \\pi(D \\mid j) $$\n", - "\n", - "Where pi is the conditional probability that we would construct the choice set D given that alternative j was chosen. This goes into the likelihood function in both the numerator and denominator.\n", - "\n", - "$$ L_n = \\frac {exp(\\beta x_i + \\ln \\pi(D_n \\mid i))} {\\sum_{j \\epsilon D_n} exp(\\beta x_j + \\ln \\pi(D_n \\mid j))} $$\n", - "\n", - "How to calculate pi? From the original formulation of this in McFadden 1978: \"Suppose D is comprized of i plus a sample of alternatives from the set C\\\\{i}, obtained by considering each element of this set independently, and including it with probability p. Then, the probability of D will depend solely on the number of elements K it contains.\"\n", - "\n", - "$$ \\pi(D) = p^{K-1} (1 - p)^{J-K} $$\n", - "\n", - "(?? 
Without replacement, I think it should be one over the n-choose-k binomial coefficient, where n=J-1 and k=K-1)\n", - "\n", - "$$ \\pi(D) = {n \\choose k}^{-1} = \\frac {(K-1)!(J-K)!} {(J-1)!} $$\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "# Add a column in the estimation data for the constant\n", - "\n", - "d = [[n, i, int(C[n]==i), X[i], 1] for n in range(N) for i in alts(n)]\n", - "\n", - "df = pd.DataFrame(d, columns=['obs_id', 'alt_id', 'choice', 'x', 'const'])\n", - "\n", - "print df.head(), '\\n'\n", - "print df.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "spec2 = OrderedDict([\n", - " ('x', [range(J)]),\n", - " ('const', [range(J)])\n", - " ])\n", - "\n", - "labels2 = OrderedDict([\n", - " ('x', ['beta_x']),\n", - " ('const', ['constant'])\n", - " ])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "# Try binomial formula\n", - "\n", - "j=3\n", - "k=2\n", - "\n", - "fact = np.math.factorial\n", - "\n", - "float(fact(k-1)*fact(j-k))/fact(j-1)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "%%time\n", - "m = pylogit.create_choice_model(data = df, \n", - " alt_id_col = 'alt_id', \n", - " obs_id_col = 'obs_id', \n", - " choice_col = 'choice', \n", - " specification = spec2, \n", - " model_type = \"MNL\", \n", - " names = labels2)\n", - "\n", - "# p = float(K-1)/(J-1)\n", - "# const = np.log(p**(K-1) * (1-p)**(J-K))\n", - "\n", - "const = np.log(float(fact(K-1)*fact(J-K))/fact(J-1))\n", - "\n", - "# Add an initial value for the constant and constrain it to that\n", - "m.fit_mle(init_vals = np.array([0, const]), \n", - " constrained_pos=[1])\n", - "\n", - "print m.get_statsmodels_summary()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Run 1000x with different samples" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# Try binomial formula\n", - "const = np.log(float(fact(K-1)*fact(J-K))/fact(J-1))\n", - "\n", - "def estimate_beta_with_correction():\n", - " d = [[n, i, int(C[n]==i), X[i], 1] for n in range(N) for i in alts(n)]\n", - " df = pd.DataFrame(d, columns=['obs_id', 'alt_id', 'choice', 'x', 'const'])\n", - " m = pylogit.create_choice_model(df, 'alt_id', 'obs_id', 'choice', spec2, 'MNL', names=labels2)\n", - " m.fit_mle(init_vals = np.array([0, const]), constrained_pos=[1])\n", - " return m.params.beta_x" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "%%time\n", - "%%capture\n", - "\n", - "beta = []\n", - "for i in range(1000):\n", - " beta.append(estimate_beta_with_correction())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "print pd.Series(beta).describe()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false 
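To see why the correction is inert under simple random sampling (the point of the NB at the end of the first notebook), note that ln pi(D|j) is the same constant for every j in D, and adding a constant to all utilities leaves the MNL probabilities unchanged. A standalone sketch; the values of J, K, and beta are hypothetical:

```python
import numpy as np
from math import factorial as fact

J, K, beta = 50, 10, 1.5                  # hypothetical choice-set sizes and taste coefficient
x = np.random.uniform(0, 10, size=K)      # attribute values for one sampled choice set D

# Under simple random sampling, ln pi(D | j) is the same for every j in D
log_pi = np.log(float(fact(K - 1) * fact(J - K)) / fact(J - 1))

def mnl_probs(v):
    """Stabilized softmax over a vector of utilities."""
    e = np.exp(v - v.max())
    return e / e.sum()

# Adding the same constant to every utility leaves the probabilities
# unchanged, so the correction only matters when pi(D | j) varies across
# alternatives (e.g. under stratified sampling)
assert np.allclose(mnl_probs(beta * x), mnl_probs(beta * x + log_pi))
```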
- }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 2", - "language": "python", - "name": "python2" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 2 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.11" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} diff --git a/notebooks/_archive/CHTS-exploration-01.ipynb b/notebooks/_archive/CHTS-exploration-01.ipynb deleted file mode 100644 index 652d11a..0000000 --- a/notebooks/_archive/CHTS-exploration-01.ipynb +++ /dev/null @@ -1,2051 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Exploring the public CHTS data\n", - "\n", - "Sam Maurer, June 2017\n", - "\n", - "Python 3.6" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "%matplotlib inline\n", - "\n", - "import matplotlib\n", - "import numpy as np\n", - "import pandas as pd\n", - "import zipfile" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "# See ../data/README.md for instructions about how to get the data\n", - "\n", - "z = zipfile.ZipFile('../data/caltrans_full_survey.zip')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Households\n", - "\n", - "Households that participated in the travel diary survey" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "42426" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "households = pd.read_csv(z.open('caltrans_full_survey/survey_households.csv'), low_memory=False)\n", - "\n", - "len(households)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "9715" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Limit to the Bay Area\n", - "\n", - "households_ba = households[households.home_county_id.isin([1, 13, 41, 55, 75, 81, 85, 95, 97])]\n", - "\n", - "len(households_ba)" - ] - }, - { - "cell_type": "code", - "execution_count": 75, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "SAN FRANCISCO 1076\n", - "SAN JOSE 939\n", - "OAKLAND 459\n", - "SANTA ROSA 321\n", - "BERKELEY 251\n", - "NAPA 228\n", - "PALO ALTO 218\n", - "SUNNYVALE 200\n", - "SAN MATEO 197\n", - "FREMONT 177\n", - "WALNUT CREEK 173\n", - "REDWOOD CITY 170\n", - "FAIRFIELD 159\n", - "CONCORD 158\n", - "SAN RAFAEL 158\n", - "Name: 
home_city, dtype: int64" - ] - }, - "execution_count": 75, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Top home locations\n", - "\n", - "households_ba.home_city.value_counts()[:15]" - ] - }, - { - "cell_type": "code", - "execution_count": 40, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "count 42421.000000\n", - "mean 2.571462\n", - "std 1.373733\n", - "min 1.000000\n", - "25% 2.000000\n", - "50% 2.000000\n", - "75% 3.000000\n", - "max 8.000000\n", - "Name: persons_count, dtype: float64" - ] - }, - "execution_count": 40, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "households.persons_count.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 41, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "count 42421.000000\n", - "mean 0.999955\n", - "std 0.704667\n", - "min 0.003498\n", - "25% 0.447392\n", - "50% 0.915924\n", - "75% 1.376790\n", - "max 5.400840\n", - "Name: hhwgt, dtype: float64" - ] - }, - "execution_count": 41, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "households.hhwgt.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 42, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "count 42421.000000\n", - "mean 293.007784\n", - "std 206.482227\n", - "min 1.025146\n", - "25% 131.095416\n", - "50% 268.385115\n", - "75% 403.428487\n", - "max 1582.559559\n", - "Name: exphhwgt, dtype: float64" - ] - }, - "execution_count": 42, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "households.exphhwgt.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 79, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "count 42421\n", - "mean 6056293182\n", - "std 29445570\n", - "min 6001400100\n", - "25% 6037207301\n", - "50% 6059042114\n", - "75% 6079011200\n", - "max 6115041100\n", - "Name: home_tract_id, dtype: float64" - ] - }, - "execution_count": 79, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "households.home_tract_id.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Persons" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "109113" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "persons = pd.read_csv(z.open('caltrans_full_survey/survey_person.csv'), low_memory=False)\n", - "\n", - "len(persons)" - ] - }, - { - "cell_type": "code", - "execution_count": 94, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " sampno perno travel_date gender relation education race1\n", - "0 7128119 1 2013-01-27 1 1 6 1\n", - "1 7128119 3 2013-01-27 2 3 1 1\n", - "2 7128138 1 2012-11-05 2 1 5 1\n", - "3 7128262 1 2012-12-21 2 1 1 1\n", - "4 7128262 3 2012-12-21 2 3 2 1\n", - "5 7128262 2 2012-12-21 1 2 1 1\n", - "6 7128288 2 2013-01-22 1 3 3 1\n", - "7 7128288 1 2013-01-22 2 1 5 1\n", - "8 7128316 1 2012-12-29 2 1 4 1\n", - "9 7128372 1 2012-12-29 2 1 6 1\n" - ] - } - ], - "source": [ - "print(persons[['sampno', 'perno', 
'travel_date', 'gender', 'relation', \n", - " 'education', 'race1']].head(10))" - ] - }, - { - "cell_type": "code", - "execution_count": 85, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "count 108776.000000\n", - "mean 3.233838\n", - "std 2.954577\n", - "min 0.000000\n", - "25% 1.000000\n", - "50% 2.000000\n", - "75% 5.000000\n", - "max 33.000000\n", - "Name: person_trips, dtype: float64" - ] - }, - "execution_count": 85, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# What is `person_trips`? -- not sure, but it looks related to the `tripno` field\n", - "\n", - "persons.person_trips.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 80, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "count 43111\n", - "mean 6241008094\n", - "std 3120182944\n", - "min 2614\n", - "25% 6037238200\n", - "50% 6059063907\n", - "75% 6079011001\n", - "max 99999999999\n", - "Name: empl_tract_id, dtype: float64" - ] - }, - "execution_count": 80, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "persons.empl_tract_id.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 47, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "count 109113\n", - "unique 2\n", - "top False\n", - "freq 66002\n", - "Name: empl_tract_id, dtype: object" - ] - }, - "execution_count": 47, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "persons.empl_tract_id.notnull().describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 81, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "count 25438\n", - "mean 6342776654\n", - "std 3678070397\n", - "min 4005001000\n", - "25% 6037232875\n", - "50% 6059062642\n", - "75% 6079010205\n", - "max 99999999999\n", - "Name: school_tract_id, dtype: float64" - ] - }, - "execution_count": 81, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "persons.school_tract_id.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 49, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "count 109113\n", - "unique 2\n", - "top False\n", - "freq 83675\n", - "Name: school_tract_id, dtype: object" - ] - }, - "execution_count": 49, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "persons.school_tract_id.notnull().describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 48, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "count 109113.000000\n", - "mean 0.999999\n", - "std 0.962373\n", - "min 0.000568\n", - "25% 0.322230\n", - "50% 0.717519\n", - "75% 1.329846\n", - "max 5.060089\n", - "Name: perwgt, dtype: float64" - ] - }, - "execution_count": 48, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "persons.perwgt.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Places\n", - "\n", - "Each record represents a single visit to a place" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "460524" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": 
"execute_result" - } - ], - "source": [ - "places = pd.read_csv(z.open('caltrans_full_survey/survey_place.csv'), low_memory=False)\n", - "\n", - "len(places)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "117345" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Filter for places visited by people who live in the Bay Area (may want to do use a\n", - "# different filter depending on the application)\n", - "\n", - "places_ba = places[places.sampno.isin(households_ba.sampno)]\n", - "\n", - "len(places_ba)" - ] - }, - { - "cell_type": "code", - "execution_count": 93, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " sampno perno plano tripno\n", - "0 1031985 1 1 nan\n", - "1 1031985 1 2 1\n", - "2 1031985 1 3 2\n", - "3 1031985 2 1 nan\n", - "4 1031985 2 2 1\n", - "5 1031985 2 3 2\n", - "118 1033944 1 1 nan\n", - "119 1033944 1 2 1\n", - "120 1033944 1 3 2\n", - "121 1033944 1 4 3\n" - ] - } - ], - "source": [ - "# Is there a unique identifier?\n", - "\n", - "# Might need to use combination of `sampno` (household), `perno` (person within hh),\n", - "# `plano` (place within person's travel diary)\n", - "\n", - "# What's `tripno`? (\"unlinked trip ID\" - maybe representing transfer between modes)\n", - "\n", - "print(places_ba[['sampno', 'perno', 'plano', 'tripno']].head(10))" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "117345" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Is every combination of `sampno`, `perno`, `plano` unique? -- Yes\n", - "\n", - "len(places_ba.groupby(['sampno', 'perno', 'plano']))" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "93406" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# How many places have a `tripno`? -- about 80%\n", - "\n", - "places_ba.tripno.count()" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "117345" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Is the `tripno` ever repeated? 
-- No\n", - "\n", - "len(places_ba.groupby(['sampno', 'perno', 'plano', 'tripno']))" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "count 93406.000000\n", - "mean 3.817185\n", - "std 2.841705\n", - "min 1.000000\n", - "25% 2.000000\n", - "50% 3.000000\n", - "75% 5.000000\n", - "max 32.000000\n", - "Name: tripno, dtype: float64" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "places_ba.tripno.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "0 REDACTED\n", - "1 REDACTED\n", - "2 REDACTED\n", - "3 REDACTED\n", - "4 REDACTED\n", - "Name: place_name, dtype: object" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Can we see the place names? -- No\n", - "\n", - "places_ba.place_name.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "SAN FRANCISCO 15680\n", - "SAN JOSE 11414\n", - "OAKLAND 5455\n", - "SANTA ROSA 3441\n", - "BERKELEY 3185\n", - "PALO ALTO 2664\n", - "SUNNYVALE 2440\n", - "SAN MATEO 2190\n", - "NAPA 2160\n", - "FREMONT 2126\n", - "REDWOOD CITY 2067\n", - "MOUNTAIN VIEW 1948\n", - "WALNUT CREEK 1896\n", - "SANTA CLARA 1816\n", - "CONCORD 1800\n", - "Name: city, dtype: int64" - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "places_ba.city.value_counts().head(15)" - ] - }, - { - "cell_type": "code", - "execution_count": 56, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAZUAAAD8CAYAAAC/1zkdAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAGWJJREFUeJzt3X+0XWV95/H3R2IRVJAfkdL86A0l0gkZDeQOKzOM1ja1\npGINdqENqyNxyiIyMFamrNUG6qp21sosmKq0zAyxUWgCRSDyQzIVOgZwyXQtA15oSiBIuUiUew0k\nAkP8AcHAZ/44z7Gby73JSbLP3eeQz2uts85zvns/e38PEL7Z+3nOfmSbiIiIOryh6QQiIuL1I0Ul\nIiJqk6ISERG1SVGJiIjapKhERERtUlQiIqI2KSoREVGbFJWIiKhNikpERNRmStMJTLajjz7aAwMD\nTacREdFX7r///h/anrqn/Q64ojIwMMDQ0FDTaURE9BVJ3+tkv9z+ioiI2qSoREREbVJUIiKiNikq\nERFRmxSViIioTYpKRETUJkUlIiJqk6ISERG1SVGJiIjaHHC/qN8fA8u/tl/9t1x6ek2ZRET0plyp\nREREbVJUIiKiNikqERFRmxSViIioTYpKRETUJkUlIiJqk6ISERG16VpRkTRD0jckbZb0sKRPlviR\nktZLeqy8H1Hpc7GkYUmPSjqtEp8vaVPZdoUklfjBkm4s8XslDXTr+0RExJ5180plF3CR7TnAAuAC\nSXOA5cBdtmcDd5XPlG1LgBOBRcCVkg4qx1oJnAvMLq9FJX4O8Jzt44HLgcu6+H0iImIPulZUbG+1\n/UBp/wh4BJgGLAbWlN3WAGeU9mLgBts7bT8BDAOnSDoWOMz2BtsGrhnTp32sm4CF7auYiIiYfJMy\nplJuS50E3AscY3tr2fQUcExpTwOerHQbKbFppT02/qo+tncBzwNH1f4FIiKiI10vKpLeAtwMXGh7\nR3VbufLwJOSwTNKQpKHt27d3+3QREQesrhYVSW+kVVCus31LCT9dbmlR3reV+Cgwo9J9eomNlvbY\n+Kv6SJoCHA48MzYP26tsD9oenDp1ah1fLSIixtHN2V8CrgIesf35yqZ1wNLSXgrcVokvKTO6ZtEa\nkL+v3CrbIWlBOebZY/q0j3UmcHe5+omIiAZ089H3pwIfBTZJ2lhilwCXAmslnQN8D/gIgO2HJa0F\nNtOaOXaB7ZdLv/OB1cAhwB3lBa2ida2kYeBZWrPHIiKiIV0rKrb/AZhoJtbCCfqsAFaMEx8C5o4T\nfxH48H6kGRERNcov6iMiojYpKhERUZsUlYiIqE2KSkRE1CZFJSIiapOiEhERtUlRiYiI2qSoRERE\nbVJUIiKiNikqERFRmxSViIioTYpKRETUJkUlIiJqk6ISERG1SVGJiIjapKhERERturmc8NWStkl6\nqBK7UdLG8trSXhFS0oCkFyrbvlDpM1/SJknDkq4oSwpTlh2+scTvlTTQre8SERGd6eaVympgUTVg\n+/dsz7M9D7gZuKWy+fH2NtvnVeIrgXNprVk/u3LMc4DnbB8PXA5c1p2vERERnepaUbF9D61141+j\nXG18BLh+d8eQdCxwmO0Ntg1cA5xRNi8G1pT2TcDC9lVMREQ0o6kxlXcDT9t+rBKbVW59fVPSu0ts\nGjBS2WekxNrbngSwvQt4HjhqvJNJWiZpSNLQ9u3b6/weERFR0VRROYtXX6VsBWaW22J/BHxZ0mF1\nncz2KtuDtgenTp1a12EjImKMKZN9QklTgN8F5rdjtncCO0v7fkmPA+8ARoHple7TS4zyPgMYKcc8\nHHim618gIiIm1MSVym8C37H989takqZKOqi0j6M1IP9d21uBHZIWlPGSs4HbSrd1wNLSPhO4u4y7\nREREQ7o5pfh64FvACZJGJJ1TNi3htQP07wEeLFOMbwLOs90e5D8f+BIwDDwO3FHiVwFHSRqmdcts\nebe+S0REdKZrt79snzVB/GPjxG6mNcV4vP2HgLnjxF8EPrx/WUZERJ3yi/qIiKhNikpERNQmRSUi\nImqTohIREbVJUYmIiNqkqERERG1SVCIiojYpKhERUZsUlYiIqE2KSkRE1CZFJSIiapOiEhERtUlR\niYiI2qSoREREbVJUIiKiNt1cpOtqSdskPVSJfUbSqKSN5fX+yraLJQ1LelTSaZX4fEmbyrYrygqQ\nSDpY0o0lfq+kgW59l4iI6Ew3r1RWA4vGiV9ue1553Q4gaQ6tFSFPLH2ubC8vDKwEzqW1xPDsyjHP\nAZ6zfTxwOXBZt75IRER0pmtFxfY9wLN73LFlMXCD7Z22n6C1dPApko4FDrO9oaw/fw1wRqXPmtK+\nCVjYvoqJiIhmNDGm8glJD5bbY0eU2DTgyco+IyU2rbTHxl/Vx/Yu4HngqG4mHhERuzfZRWUlcBww\nD9gKfG4yTippmaQhSUPbt2+fjFNGRByQJrWo2H7a9su2XwG+CJxSNo0CMyq7Ti+x0dIeG39VH0lT\ngMOBZyY47yrbg7YHp06dWtfXiYiIMSa1qJQxkrYPAe2ZYeuAJWVG1yxaA/L32d4K7JC0oIyXnA3c\nVumztLTPBO4u4y4REdGQKd06sKTrgfcCR0saAT4NvFfSPMDAFuDjALYflrQW2AzsAi6w/XI51Pm0\nZpIdAtxRXgBXAddKGqY1IWBJt75LRER0pmtFxfZZ44Sv2s3+K4AV48SHgLnjxF8EPrw/OUZERL3y\ni/qIiKhNikpERNQmRSUiImqTohIREbVJUYmIiNqkqERERG1SVCIiojZd+51KvNbA8q/tc98tl55e\nYyYREd3R0ZWKpH/d7UQiIqL/dXr760pJ90k6X9LhXc0oIiL6VkdFxfa7gd+n9VTg+yV9WdL7uppZ\nRET0nY4H6m0/BnwK+BPg14ArJH1H0u92K7mIiOgvnY6pvFPS5cAjwG8Av2P7X5X25V3MLyIi+kin\ns7/+B/Al4BLbL7SDtn8g6VNdySwiIvpOp0XldOCF9honkt4AvMn2T21f27XsIiKir3Q6pnInrUWy\n2g4tsQlJulrSNkkPVWJ/UcZhHpR0q6S3lfiApBckbSyvL1T6zJe0SdKwpCvKCpCUVSJvLPF7JQ10\n+F0iIqJLOi0qb7L94/aH0j50D31WA4vGxNYDc22/E/hn4OLKtsdtzyuv8yrxlcC5tJYYnl055jnA\nc7aPpzWuc1mH3yUiIrqk06LyE0kntz9Img+8sJv9sX0PrWV+q7Gv295VPm4Apu/uGGVN+8Nsbyjr\nz18DnFE2LwbWlPZNwML2VUxERDSj0zGVC4GvSPoBIOAXgd/bz3P/AXBj5fMsSRuB54FP2f6/wDRg\npLLPSIlR3p8EsL1L0vPAUcAP9zOviIjYRx0VFdvflvSrwAkl9Kjtn+3rSSX9KbALuK6EtgIzbT9T\nroK+KunEfT3+OOdbBiwDmDlzZl2HjYiIMfbmgZL/BhgofU6WhO1r9vaEkj4GfABYWG5pYXsnsLO0\n75f0OPAOYJRX3yKbXmKU9xnAiKQpwOHAM+Od0/YqYBXA
4OCg9zbniIjoTEdFRdK1wK8AG4GXS7g9\nxtExSYuAPwZ+zfZPK/GpwLO2X5Z0HK0B+e/aflbSDkkLgHuBs2n9ZgZgHbAU+BZwJnB3u0hFREQz\nOr1SGQTm7M3/tCVdD7wXOFrSCPBpWrO9DgbWlzH1DWWm13uA/yrpZ8ArwHm224P859OaSXYIcEd5\nAVwFXCtpmNaEgCWd5hYREd3RaVF5iNbg/NZOD2z7rHHCV02w783AzRNsGwLmjhN/Efhwp/lERET3\ndVpUjgY2S7qPMvYBYPuDXckqIiL6UqdF5TPdTCIiIl4fOp1S/E1JvwzMtn2npEOBg7qbWkRE9JtO\nH31/Lq1frf91CU0DvtqtpCIioj91+piWC4BTgR3w8wW73t6tpCIioj91WlR22n6p/aH82DC/CYmI\niFfptKh8U9IlwCFlbfqvAP+7e2lFREQ/6rSoLAe2A5uAjwO301qvPiIi4uc6nf31CvDF8oqIiBhX\np8/+eoJxxlBsH1d7RhER0bf25tlfbW+i9XiUI+tPJyIi+llHYyq2n6m8Rm3/JXB6l3OLiIg+0+nt\nr5MrH99A68plb9ZiiYiIA0CnheFzlfYuYAvwkdqziYiIvtbp7K9f73YiERHR/zq9/fVHu9tu+/P1\npBMREf2s0x8/DgL/idaDJKcB5wEnA28tr9eQdLWkbZIeqsSOlLRe0mPl/YjKtoslDUt6VNJplfh8\nSZvKtitUloyUdLCkG0v8XkkDe/fVIyKibp0WlenAybYvsn0RMB+YafvPbf/5BH1WA4vGxJYDd9me\nDdxVPiNpDq3lgE8sfa6U1H60/krgXFrr1s+uHPMc4DnbxwOXA5d1+F0iIqJLOi0qxwAvVT6/VGIT\nsn0PrbXjqxYDa0p7DXBGJX6D7Z22nwCGgVMkHQscZnuDbQPXjOnTPtZNwML2VUxERDSj09lf1wD3\nSbq1fD6Df/kf+t44xnZ7nfun+JfCNA3YUNlvpMR+Vtpj4+0+TwLY3iXpeeAo4IdjTyppGbAMYObM\nmfuQdkREdKLT2V8rJN0BvLuE/qPtf9yfE9u2pEl5fL7tVcAqgMHBwb58ZP/A8q/tc98tl+Z3qhEx\nOTq9/QVwKLDD9l8BI5Jm7cP5ni63tCjv20p8FJhR2W96iY2W9tj4q/qU9V0OB57Zh5wiIqImnS4n\n/GngT4CLS+iNwN/uw/nWAUtLeylwWyW+pMzomkVrQP6+cqtsh6QFZbzk7DF92sc6E7i7jLtERERD\nOh1T+RBwEvAAgO0fSBp3KnGbpOuB9wJHSxoBPg1cCqyVdA7wPcqv8m0/LGktsJnWL/YvsP1yOdT5\ntGaSHQLcUV4AVwHXShqmNSFgSYffJSIiuqTTovJSdQxE0pv31MH2WRNsWjjB/iuAFePEh4C548Rf\npPW05IiI6BGdjqmslfTXwNsknQvcSRbsioiIMTqd/fXZsjb9DuAE4M9sr+9qZhER0Xf2WFTKL9vv\nLA+VTCGJiIgJ7fH2Vxkwf0XS4ZOQT0RE9LFOB+p/DGyStB74STto+w+7klVERPSlTovKLeUVEREx\nod0WFUkzbX/f9r485ysiIg4wexpT+Wq7IenmLucSERF9bk9Fpfoo+eO6mUhERPS/PRUVT9COiIh4\njT0N1L9L0g5aVyyHlDbls20f1tXsIiKir+y2qNg+aHfbIyIiqvZmPZWIiIjdSlGJiIjapKhERERt\nJr2oSDpB0sbKa4ekCyV9RtJoJf7+Sp+LJQ1LelTSaZX4fEmbyrYryuqQERHRkEkvKrYftT3P9jxg\nPvBT4Nay+fL2Ntu3A0iaQ2tVxxOBRcCV5cnJACuBc2ktPzy7bI+IiIY0fftrIfC47e/tZp/FwA22\nd9p+AhgGTpF0LHCY7Q1lbfprgDO6n3JEREyk6aKyBLi+8vkTkh6UdLWkI0psGvBkZZ+REptW2mPj\nERHRkMaKiqRfAD4IfKWEVtJ6FMw8YCvwuRrPtUzSkKSh7du313XYiIgYo9NH33fDbwMP2H4aoP0O\nIOmLwN+Vj6PAjEq/6SU2Wtpj469hexWwCmBwcPCAe9zMwPKv7XPfLZeeXmMmEfF61+Ttr7Oo3Poq\nYyRtHwIeKu11wBJJB0uaRWtA/j7bW4EdkhaUWV9nA7dNTuoRETGeRq5UJL0ZeB/w8Ur4v0uaR+vB\nlVva22w/LGktsBnYBVxQljgGOB9YDRwC3FFeERHRkEaKiu2fAEeNiX10N/uvAFaMEx8C5taeYERE\n7JOmZ39FRMTrSIpKRETUJkUlIiJqk6ISERG1SVGJiIjapKhERERtUlQiIqI2KSoREVGbFJWIiKhN\nikpERNQmRSUiImqTohIREbVJUYmIiNqkqERERG2aXPkx+kBWjYyIvdHIlYqkLZI2SdooaajEjpS0\nXtJj5f2Iyv4XSxqW9Kik0yrx+eU4w5KuKCtARkREQ5q8/fXrtufZHiyflwN32Z4N3FU+I2kOsAQ4\nEVgEXCnpoNJnJXAurSWGZ5ftERHRkF4aU1kMrCntNcAZlfgNtnfafgIYBk4pa9ofZnuDbQPXVPpE\nREQDmioqBu6UdL+kZSV2jO2tpf0UcExpTwOerPQdKbFppT02HhERDWlqoP7f2x6V9HZgvaTvVDfa\ntiTXdbJSuJYBzJw5s67DRkTEGI1cqdgeLe/bgFuBU4Cnyy0tyvu2svsoMKPSfXqJjZb22Ph451tl\ne9D24NSpU+v8KhERUTHpRUXSmyW9td0Gfgt4CFgHLC27LQVuK+11wBJJB0uaRWtA/r5yq2yHpAVl\n1tfZlT4REdGAJm5/HQPcWmb/TgG+bPvvJX0bWCvpHOB7wEcAbD8saS2wGdgFXGD75XKs84HVwCHA\nHeUVERENmfSiYvu7wLvGiT8DLJygzwpgxTjxIWBu3TlGRMS+6aUpxRER0edSVCIiojZ59ld0zf48\nNwzy7LCIfpQrlYiIqE2KSkRE1CZFJSIiapOiEhERtUlRiYiI2qSoREREbVJUIiKiNikqERFRmxSV\niIioTYpKRETUJkUlIiJqk2d/Rc/an2eH5blhEc3IlUpERNSmieWEZ0j6hqTNkh6W9MkS/4ykUUkb\ny+v9lT4XSxqW9Kik0yrx+ZI2lW1XlGWFIyKiIU3c/toFXGT7gbJW/f2S1pdtl9v+bHVnSXOAJcCJ\nwC8Bd0p6R1lSeCVwLnAvcDuwiCwpHBHRmEm/UrG91fYDpf0j4BFg2m66LAZusL3T9hPAMHCKpGOB\nw2xvsG3gGuCMLqcfERG70ehAvaQB4CRaVxqnAp+QdDYwROtq5jlaBWdDpdtIif2stMfGxzvPMmAZ\nwMyZM2v9DtGbMsgf0YzGBuolvQW4GbjQ9g5at7KOA+YBW4HP1XUu26tsD9oenDp1al2HjYiIMRop\nKpLeSKugXGf7FgDbT9t+2fYrwBeBU8ruo8CMSvfpJTZa2mPjERHRkCZmfwm4CnjE9ucr8WMru30I\neKi01wFLJB0
saRYwG7jP9lZgh6QF5ZhnA7dNypeIiIhxNTGmcirwUWCTpI0ldglwlqR5gIEtwMcB\nbD8saS2wmdbMsQvKzC+A84HVwCG0Zn1l5lfst4zHROy7SS8qtv8BGO/3JLfvps8KYMU48SFgbn3Z\nRUTE/sgv6iMiojZ59ldEjXLrLA50uVKJiIja5Eolokfsz1UO5EonekOKSsTrRG69RS/I7a+IiKhN\nrlQiIlc5UZsUlYjYLylIUZWiEhGN2d/JCfsjBa07MqYSERG1yZVKRByQctuuO1JUIiL2UgrSxFJU\nIiIm0et9HCljKhERUZsUlYiIqE3fFxVJiyQ9KmlY0vKm84mIOJD1dVGRdBDwv4DfBubQWj1yTrNZ\nRUQcuPq6qACnAMO2v2v7JeAGYHHDOUVEHLD6vahMA56sfB4psYiIaMABMaVY0jJgWfn4Y0mP7uOh\njgZ+WE9WXdPrOfZ6fpAc69Dr+UHv51h7frpsv7r/cic79XtRGQVmVD5PL7FXsb0KWLW/J5M0ZHtw\nf4/TTb2eY6/nB8mxDr2eH/R+jr2e30T6/fbXt4HZkmZJ+gVgCbCu4ZwiIg5YfX2lYnuXpP8M/B/g\nIOBq2w83nFZExAGrr4sKgO3bgdsn6XT7fQttEvR6jr2eHyTHOvR6ftD7OfZ6fuOS7aZziIiI14l+\nH1OJiIgekqLSoV5+HIykGZK+IWmzpIclfbLpnCYi6SBJ/yjp75rOZTyS3ibpJknfkfSIpH/bdE5V\nkv5L+Xf8kKTrJb2pB3K6WtI2SQ9VYkdKWi/psfJ+RA/m+Bfl3/ODkm6V9LZeyq+y7SJJlnR0E7nt\nrRSVDvTB42B2ARfZngMsAC7osfyqPgk80nQSu/FXwN/b/lXgXfRQrpKmAX8IDNqeS2tyypJmswJg\nNbBoTGw5cJft2cBd5XOTVvPaHNcDc22/E/hn4OLJTqpiNa/ND0kzgN8Cvj/ZCe2rFJXO9PTjYGxv\ntf1Aaf+I1v8Ie+7JApKmA6cDX2o6l/FIOhx4D3AVgO2XbP+/ZrN6jSnAIZKmAIcCP2g4H2zfAzw7\nJrwYWFPaa4AzJjWpMcbL0fbXbe8qHzfQ+p1bIyb4ZwhwOfDHQN8MfqeodKZvHgcjaQA4Cbi32UzG\n9Ze0/oC80nQiE5gFbAf+ptyi+5KkNzedVJvtUeCztP7WuhV43vbXm81qQsfY3lraTwHHNJlMB/4A\nuKPpJKokLQZGbf9T07nsjRSV1xFJbwFuBi60vaPpfKokfQDYZvv+pnPZjSnAycBK2ycBP6H52zY/\nV8YlFtMqfr8EvFnSf2g2qz1za4ppz/5NW9Kf0rqFfF3TubRJOhS4BPizpnPZWykqnenocTBNkvRG\nWgXlOtu3NJ3POE4FPihpC63bh78h6W+bTek1RoAR2+2rvJtoFZle8ZvAE7a32/4ZcAvw7xrOaSJP\nSzoWoLxvazifcUn6GPAB4PfdW7+v+BVaf3n4p/JnZjrwgKRfbDSrDqSodKanHwcjSbTGAR6x/fmm\n8xmP7YttT7c9QOuf3922e+pv2bafAp6UdEIJLQQ2N5jSWN8HFkg6tPw7X0gPTSQYYx2wtLSXArc1\nmMu4JC2idTv2g7Z/2nQ+VbY32X677YHyZ2YEOLn8N9rTUlQ6UAbz2o+DeQRY22OPgzkV+Citv/1v\nLK/3N51Un/oEcJ2kB4F5wH9rOJ+fK1dQNwEPAJto/flt/FfXkq4HvgWcIGlE0jnApcD7JD1G6wrr\n0h7M8X8CbwXWlz8zX+ix/PpSflEfERG1yZVKRETUJkUlIiJqk6ISERG1SVGJiIjapKhERERtUlQi\nIqI2KSoREVGbFJWIiKjN/we9J82/i0JuNwAAAABJRU5ErkJggg==\n", - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "places_ba.trip_distance_miles.plot.hist(bins=20, range=(0,15));" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "2296" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Most detailed spatial identifier in public data is tract_id\n", - "\n", - "# How many different tracts are visited?\n", - "places_ba.tract_id.unique().shape[0]" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "9715" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# How many different households?\n", - "places_ba.sampno.unique().shape[0]" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "23939" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# How many different people?\n", - "len(places_ba.groupby(['sampno','perno']))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Census identifiers" - ] - }, - { - "cell_type": "code", - "execution_count": 52, - "metadata": { - "collapsed": 
true - }, - "outputs": [], - "source": [ - "# Suppress scientific notation\n", - "\n", - "pd.set_option('display.float_format', lambda x: '%.0f' % x)" - ] - }, - { - "cell_type": "code", - "execution_count": 52, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "6912\n", - "14388\n" - ] - } - ], - "source": [ - "# Is the mapping between census tracts and city names consistent? -- No\n", - "\n", - "print(places.tract_id.drop_duplicates().shape[0])\n", - "print(places[['tract_id', 'city']].drop_duplicates().shape[0])" - ] - }, - { - "cell_type": "code", - "execution_count": 53, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "count 460519\n", - "mean 191724\n", - "std 242716\n", - "min 100\n", - "25% 5911\n", - "50% 43317\n", - "75% 402800\n", - "max 999999\n", - "Name: tract_id, dtype: float64" - ] - }, - "execution_count": 53, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "places.tract_id.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 54, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "count 460519\n", - "mean 58\n", - "std 50\n", - "min 1\n", - "25% 37\n", - "50% 59\n", - "75% 79\n", - "max 999\n", - "Name: county_id, dtype: float64" - ] - }, - "execution_count": 54, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "places.county_id.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 58, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "count 460523\n", - "mean 6\n", - "std 5\n", - "min 1\n", - "25% 6\n", - "50% 6\n", - "75% 6\n", - "max 99\n", - "Name: state_id, dtype: float64" - ] - }, - "execution_count": 58, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "places.state_id.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 57, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "6 455641\n", - "99 1064\n", - "32 957\n", - "41 454\n", - "4 412\n", - "Name: state_id, dtype: int64" - ] - }, - "execution_count": 57, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "places.state_id.value_counts().head(5)" - ] - }, - { - "cell_type": "code", - "execution_count": 77, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "# How to deal with this? I think `tract_id` is an integer representation\n", - "# of the 4-digit tract ID within the county plus the 2-digit suffix. 
\n", - "\n", - "# So the full unique identifier is `state_id` + `county_id` (3 digits) + `tract_id` (6 digits)\n", - "\n", - "places['_full_tract_id'] = places.state_id * 1e9 + places.county_id * 1e6 + places.tract_id\n", - "\n", - "# Presumably the all-9 entries reflect missing data, but documentation doesn't specify\n", - "\n", - "places.ix[(places.tract_id == 999999) |\n", - " (places.county_id == 999) |\n", - " (places.state_id == 99), '_full_tract_id'] = np.nan" - ] - }, - { - "cell_type": "code", - "execution_count": 78, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "9098\n", - "14194\n" - ] - } - ], - "source": [ - "print(places._full_tract_id.drop_duplicates().shape[0])\n", - "print(places[['_full_tract_id', 'city']].drop_duplicates().shape[0])" - ] - }, - { - "cell_type": "code", - "execution_count": 73, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "6115041100 14\n", - "6091010000 12\n", - "6027000800 11\n", - "6107000100 10\n", - "6097154303 10\n", - "Name: _full_tract_id, dtype: int64" - ] - }, - "execution_count": 73, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "places[['_full_tract_id', 'city']].drop_duplicates().\\\n", - " _full_tract_id.value_counts().head(5)" - ] - }, - { - "cell_type": "code", - "execution_count": 72, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " _full_tract_id city\n", - "3238 6115041100 BROWNSVILLE\n", - "18952 6115041100 MARYSVILLE\n", - "33913 6115041100 NORTH SAN JUAN\n", - "44697 6115041100 DOBBINS\n", - "44705 6115041100 YUBA\n", - "100194 6115041100 BANGOR\n", - "160254 6115041100 CAMPTONVILLE\n", - "178724 6115041100 STRAWBERRY VALLEY\n", - "271235 6115041100 CHALLENGE-BROWNSVILLE\n", - "271250 6115041100 OREGON HOUSE\n", - "300021 6115041100 FORBESTOWN\n", - "317626 6115041100 CHALLENGE-BROWNSVILL\n", - "402446 6115041100 BROWNS VALLEY\n", - "403959 6115041100 RACKERBY\n" - ] - } - ], - "source": [ - "print(places[['_full_tract_id', 'city']].drop_duplicates().\\\n", - " loc[places._full_tract_id == 6115041100])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": true - }, - "source": [ - "So, there are still many census tracts that correspond to more than one city. I think we probably just want to use the census tracts as our unit of analysis. \n", - "\n", - "For descriptive purposes we can map each census tract to its most common corresponding city." 
- ] - }, - { - "cell_type": "code", - "execution_count": 75, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " city\n", - "_full_tract_id \n", - "1015000800 ANNISTON\n", - "1101001500 MONTGOMERY\n", - "1161400100 SEVILLA\n", - "2020001000 ANCHORAGE\n", - "2020001100 ANCHORAGE\n" - ] - } - ], - "source": [ - "# Map each tract to its most common corresponding city\n", - "\n", - "tracts = places[['_full_tract_id', 'city']].groupby('_full_tract_id').\\\n", - " agg(lambda x:x.value_counts().index[0])\n", - " \n", - "print(tracts.head())" - ] - }, - { - "cell_type": "code", - "execution_count": 76, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "9098\n", - "9097\n" - ] - } - ], - "source": [ - "print(places._full_tract_id.drop_duplicates().shape[0])\n", - "print(tracts.shape[0])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Activities\n", - "\n", - "\"The activity reported is for a single travel day and contains the highest level of detail about the survey participants' travel purpose\" (data dictionary)\n", - "\n", - "So, there can be multiple \"activities\" at each \"place\" visited as part of a trip." - ] - }, - { - "cell_type": "code", - "execution_count": 84, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "604711" - ] - }, - "execution_count": 84, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "activities = pd.read_csv(z.open('caltrans_full_survey/survey_activity.csv'), low_memory=False)\n", - "\n", - "len(activities)" - ] - }, - { - "cell_type": "code", - "execution_count": 85, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "157011" - ] - }, - "execution_count": 85, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# TO DO - fix to reflect households\n", - "\n", - "activities_ba = activities[activities.county_id.isin([1, 13, 41, 55, 75, 81, 85, 95, 97])]\n", - "\n", - "len(activities_ba)" - ] - }, - { - "cell_type": "code", - "execution_count": 95, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " sampno perno plano actno tripno\n", - "1 1041766 3 1 1 nan\n", - "4 1051203 1 9 1 8\n", - "8 1065929 1 1 1 nan\n", - "14 1097949 1 1 1 nan\n", - "22 1124271 1 5 1 4\n", - "27 1126030 2 1 1 nan\n", - "30 1127449 2 1 1 nan\n", - "32 1127626 1 1 1 nan\n", - "35 1128657 1 1 1 nan\n", - "37 1129482 1 1 1 nan\n" - ] - } - ], - "source": [ - "# What do the identifiers look like? 
\n", - "\n", - "print(activities_ba[['sampno', 'perno', 'plano', 'actno', 'tripno']].head(10))" - ] - }, - { - "cell_type": "code", - "execution_count": 72, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "118271\n", - "118271\n", - "118271\n" - ] - } - ], - "source": [ - "# Each place occurs in the activities table at least once\n", - "\n", - "print((activities_ba.actno == 1).sum()) # number of activities with id 1\n", - "\n", - "print(len(activities_ba.groupby(['sampno', 'perno', 'plano']))) # unique places referenced\n", - "\n", - "print(len(places_ba)) # records in places table" - ] - }, - { - "cell_type": "code", - "execution_count": 87, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "count 604711\n", - "mean 2624572\n", - "std 1695612\n", - "min 1031985\n", - "25% 1662824\n", - "50% 1979173\n", - "75% 2797238\n", - "max 7212388\n", - "Name: sampno, dtype: float64" - ] - }, - "execution_count": 87, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "activities.sampno.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 88, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "count 604711\n", - "mean 2\n", - "std 1\n", - "min 1\n", - "25% 1\n", - "50% 2\n", - "75% 3\n", - "max 8\n", - "Name: perno, dtype: float64" - ] - }, - "execution_count": 88, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "activities.perno.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 89, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "count 604711\n", - "mean 3\n", - "std 3\n", - "min 1\n", - "25% 1\n", - "50% 3\n", - "75% 5\n", - "max 34\n", - "Name: plano, dtype: float64" - ] - }, - "execution_count": 89, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "activities.plano.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Trips\n", - "\n", - "What's the correct way to aggregate places into trips?\n", - "\n", - "It seems like each person recorded their travel for a single day as a sequence of places visited, without explicit classification into trips or tours. So that's up to us to do by applying whatever rules seem appropriate. \n", - "\n", - "Probably it's not even possible to identify tours with certainty from the anonymized data, because the place names and precise locations are redacted." 
- ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "sampno perno\n", - "1031985 1 3\n", - " 2 3\n", - "1033944 1 16\n", - "1035274 1 8\n", - " 2 6\n", - "1037952 1 3\n", - " 2 1\n", - "1039620 1 5\n", - " 2 5\n", - "1041076 1 4\n", - "Name: plano, dtype: int64" - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Dig into `tripno` some more\n", - "\n", - "places_ba.groupby(['sampno', 'perno']).plano.max().head(10)" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "count 19512.0\n", - "mean 1.0\n", - "std 0.0\n", - "min 1.0\n", - "25% 1.0\n", - "50% 1.0\n", - "75% 1.0\n", - "max 1.0\n", - "dtype: float64" - ] - }, - "execution_count": 23, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Do any respondents have multiple trip sequences? -- No!\n", - "\n", - "plano_counts = places_ba.groupby(['sampno', 'perno']).plano.max()\n", - "tripno_counts = places_ba.groupby(['sampno', 'perno']).tripno.max()\n", - "\n", - "(plano_counts - tripno_counts).describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "count 93406.0\n", - "mean 1.0\n", - "std 0.0\n", - "min 1.0\n", - "25% 1.0\n", - "50% 1.0\n", - "75% 1.0\n", - "max 1.0\n", - "dtype: float64" - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "(places_ba.plano - places_ba.tripno).describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 100, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
travel_datearr_timedep_timetract_idcitymodetrip_distance_milesprev_trip_duration_minact_dur
1522012-07-1703:00:0010:00:00509000SUNNYVALEnannannan425
1532012-07-1710:00:0010:00:00509000SUNNYVALE511030
1542012-07-1711:00:0011:00:00508504SUNNYVALE52151
1552012-07-1711:00:0011:00:00508504SUNNYVALE1059
1562012-07-1711:00:0013:00:00508504SUNNYVALE105105
1572012-07-1713:00:0014:00:00509000SUNNYVALE521060
1582012-07-1714:00:0015:00:00500100SAN JOSE582025
1592012-07-1715:00:0002:00:00509000SUNNYVALE5920699
\n", - "
" - ], - "text/plain": [ - " travel_date arr_time dep_time tract_id city mode \\\n", - "152 2012-07-17 03:00:00 10:00:00 509000 SUNNYVALE nan \n", - "153 2012-07-17 10:00:00 10:00:00 509000 SUNNYVALE 5 \n", - "154 2012-07-17 11:00:00 11:00:00 508504 SUNNYVALE 5 \n", - "155 2012-07-17 11:00:00 11:00:00 508504 SUNNYVALE 1 \n", - "156 2012-07-17 11:00:00 13:00:00 508504 SUNNYVALE 1 \n", - "157 2012-07-17 13:00:00 14:00:00 509000 SUNNYVALE 5 \n", - "158 2012-07-17 14:00:00 15:00:00 500100 SAN JOSE 5 \n", - "159 2012-07-17 15:00:00 02:00:00 509000 SUNNYVALE 5 \n", - "\n", - " trip_distance_miles prev_trip_duration_min act_dur \n", - "152 nan nan 425 \n", - "153 1 10 30 \n", - "154 2 15 1 \n", - "155 0 5 9 \n", - "156 0 5 105 \n", - "157 2 10 60 \n", - "158 8 20 25 \n", - "159 9 20 699 " - ] - }, - "execution_count": 100, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# What does a sequence of places look like?\n", - "\n", - "varlist = ['travel_date', 'arr_time', 'dep_time', 'tract_id', 'city', 'mode', \n", - " 'trip_distance_miles', 'prev_trip_duration_min', 'act_dur']\n", - "\n", - "places_ba.loc[(places_ba.sampno == 1035274) & (places_ba.perno == 1), varlist]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "So, it looks like the key to identifying trip/tour semantics involves looking at the trip purposes in the activities table. Transfers are noted as a particular purpose, and those trip legs need to be aggregated together. \n", - "\n", - "The first and last activities of the day probably take place at home, but we can't verify using the public data.\n", - "\n", - "It looks like the arrival and departure times, and trip durations, are approximate based on people's recollections, but distances are precise because they come from the Google Maps interface." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Travel modes" - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "5.0 50139\n", - "6.0 18632\n", - "1.0 15924\n", - "2.0 2244\n", - "15.0 1635\n", - "24.0 1444\n", - "7.0 566\n", - "26.0 459\n", - "8.0 299\n", - "25.0 293\n", - "Name: mode, dtype: int64" - ] - }, - "execution_count": 35, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# What are the travel modes?\n", - "\n", - "places_ba['mode'].value_counts().head(10)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "FROM DATA DICTIONARY\n", - "\n", - "Travel mode:\n", - "\n", - "- 1- Walk; \n", - "- 2- Bike; \n", - "- 3- Wheelchair/mobility scooter; \n", - "- 4- Other non-motorized; \n", - "- 5- Auto/van/truck driver; \n", - "- 6- Auto/van/truck passenger; \n", - "- 7- Carpool/vanpool; \n", - "- 8- Motorcycle/scooter/moped; \n", - "- 9- Taxi/hired car/limo; \n", - "- 10- Rental car/vehicle; \n", - "- 11- Private shuttle (Super shuttle, employer, hotel, etc.); \n", - "- 12- Greyhound bus; \n", - "- 13- Plane; \n", - "- 14- Other private transit; \n", - "- 15- Local bus, rapid bus; \n", - "- 16- Express bus/commuter bus (AC Transbay, Golden Gate Transit, etc.); \n", - "- 17- Premium bus (Metro Orange/Silver Line); \n", - "- 18- School bus; \n", - "- 19- Public transit shuttle (DASH, Emery Go Round, etc.); \n", - "- 20- AirBART/LAX FlyAway; \n", - "- 21- Dial-a-ride/paratransit (access services, etc.); \n", - "- 22- Amtrak bus; \n", - "- 23- Other bus; \n", - "- 24- BART, Metro Red/Purple Line; \n", - "- 25- ACE, Amtrak, Caltrain, Coaster, Metrolink; \n", - "- 26- Metro Blue/Green/Gold Line, Muni Metro, Sacramento Light Rail, San Diego Sprinter/Trolley/Orange/Blue/Green, VTA light rail; \n", - "- 27- Streetcar/cable car; \n", - "- 28- Other rail; \n", - "- 29- Ferry/boat; \n", - "- 99- RF" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Trip purposes" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "1 47241\n", - "2 16700\n", - "21 9523\n", - "9 9151\n", - "27 8583\n", - "22 7250\n", - "8 6151\n", - "7 5792\n", - "37 5040\n", - "31 4737\n", - "39 3484\n", - "17 3105\n", - "25 3039\n", - "34 2701\n", - "29 2541\n", - "Name: purpose, dtype: int64" - ] - }, - "execution_count": 39, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# What are the trip purposes?\n", - "\n", - "activities_ba.purpose.value_counts().head(15)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": true - }, - "source": [ - "FROM DATA DICTIONARY\n", - "\n", - "[Somewhere there's a `ptype` key indicating categories of purposes, probably based on the home/work/school locations, but I can't find it in these data tables.]\n", - "\n", - "Activity purpose: \n", - "\n", - "[These look like activities at home]\n", - "\n", - "- 1- Personal activities (sleeping, personal care, leisure, chores); \n", - "- 2- Preparing meals/eating; \n", - "- 3- Hosting visitors/entertaining guests; \n", - "- 4- Exercise 
(with or without equipment)/playing sports; \n", - "- 5- Study/schoolwork; \n", - "- 6- Work for pay at home using telecommunications equipment; \n", - "- 7- Using computer/telephone/cell or smart phone, or other communications device for personal activities; \n", - "- 8- All other activities at home; \n", - "\n", - "[These look like activities at work]\n", - "\n", - "- 9- Work/job duties; \n", - "- 10- Training; \n", - "- 11- Meals at work; \n", - "- 12- Work-sponsored social activities (holiday/birthday celebrations, etc.); \n", - "- 13- Non-work-related activities (social clubs, etc.); \n", - "- 14- Exercise/sports; \n", - "- 15- Volunteer work/activities; \n", - "- 16- All other work-related activities at work; \n", - "\n", - "[These look like activities at school]\n", - "\n", - "- 17- School/classroom/laboratory; \n", - "- 18- Meals at school/college; \n", - "- 19- After-school or non-class-related sports/physical activities; \n", - "- 20- All other after-school or non-class-related activities (library, music rehearsal, clubs, etc.); \n", - "\n", - "[These look like transport-related]\n", - "\n", - "- 21- Change type of transportation/transfer (walk to bus, walk to/from parked car); \n", - "- 22- Pick up/drop off passenger(s); \n", - "\n", - "[These look like activities at non-home, non-work, non-school locations]\n", - "\n", - "- 23- Drive-through meals (snacks, coffee, etc.) (show if PTYPE <> 1 [Home]); \n", - "- 24- Drive-through other (ATM, bank, etc.) (show if PTYPE <> 1); \n", - "- 25- Work-related (meetings, sales calls, deliveries); \n", - "- 26- Service private vehicle (gas, oil, lubes, repairs); \n", - "- 27- Routine shopping (groceries, clothing, convenience store, household maintenance, etc.); \n", - "- 28- Shopping for major purchases or specialty items (appliance, electronics, new vehicles, major household repairs, etc.); \n", - "- 29- Household errands (bank, dry cleaning, etc.); \n", - "- 30- Personal business (visit government office, attorney, accountant, etc.); \n", - "- 31- Eat meal at restaurant/diner; \n", - "- 32- Health care (doctor, dentist, eye care, chiropractor, veterinarian, etc.); \n", - "- 33- Civic/religious activities; \n", - "- 34- Outdoor exercise (outdoor sports, jogging, bicycling, walking the dog, etc.); \n", - "- 35- Indoor exercise (gym, yoga, etc.); \n", - "- 36- Entertainment (movies, sporting events, etc.); \n", - "- 37- Social/visiting friends and relatives; \n", - "- 38- Other (specify); \n", - "\n", - "[Misc]\n", - "\n", - "- 39- Loop trip (for interviewer only - not listed on diary); \n", - "- 99- DK/RF" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# TO DO\n", - "\n", - "# - set up destination choice model\n", - "# - make two tables: (1) trips, (2) destinations\n", - "# - write a function to generate choice set\n", - "\n", - "# - for covariates, calculate home/work/etc density endogenously\n", - "\n", - "# - can probably generate average travel time between tracts, by mode\n", - "# - then can use that to build a mode choice model\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - 
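A small sketch of how the purpose codes listed above might be collapsed into the bracketed groupings. The exact code ranges are an assumption read off the list (99 = DK/RF is left as missing), not part of the notebook:

```python
import pandas as pd

PURPOSE_GROUPS = {
    'home': range(1, 9),         # codes 1-8
    'work': range(9, 17),        # codes 9-16
    'school': range(17, 21),     # codes 17-20
    'transport': range(21, 23),  # codes 21-22
    'other': range(23, 40),      # codes 23-38, plus 39 (loop trip)
}
code_to_group = {c: g for g, r in PURPOSE_GROUPS.items() for c in r}

purposes = pd.Series([1, 21, 27, 39, 99])  # sample of `purpose` codes
print(purposes.map(code_to_group))          # 99 maps to NaN
```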
"cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.0" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/notebooks/_archive/ChoiceModels-demo.ipynb b/notebooks/_archive/ChoiceModels-demo.ipynb deleted file mode 100644 index c759ca6..0000000 --- a/notebooks/_archive/ChoiceModels-demo.ipynb +++ /dev/null @@ -1,468 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## ChoiceModels usage demo\n", - "\n", - "Sam Maurer, October 10, 2016" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "%load_ext autoreload\n", - "%aimport choicemodels\n", - "%autoreload 1" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "import choicemodels\n", - "import numpy as np\n", - "import pandas as pd\n", - "from collections import OrderedDict" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Binary Logit" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# Set up estimation data\n", - "\n", - "endog = np.random.randint(2, size=50) # 50x1 vector of random 0's and 1's\n", - "exog = np.random.rand(50, 5) # 50x5 matrix of random floats" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Optimization terminated successfully.\n", - " Current function value: 0.635509\n", - " Iterations 5\n" - ] - } - ], - "source": [ - "# Estimate a model\n", - "\n", - "m = choicemodels.Logit(endog, exog)\n", - "results = m.fit()" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " Logit Regression Results \n", - "==============================================================================\n", - "Dep. Variable: y No. Observations: 50\n", - "Model: Logit Df Residuals: 45\n", - "Method: MLE Df Model: 4\n", - "Date: Fri, 07 Oct 2016 Pseudo R-squ.: 0.07890\n", - "Time: 16:31:07 Log-Likelihood: -31.775\n", - "converged: True LL-Null: -34.497\n", - " LLR p-value: 0.2447\n", - "==============================================================================\n", - " coef std err z P>|z| [95.0% Conf. 
Int.]\n", - "------------------------------------------------------------------------------\n", - "x1 0.0305 0.899 0.034 0.973 -1.731 1.792\n", - "x2 1.4040 0.977 1.436 0.151 -0.512 3.320\n", - "x3 -2.2294 1.034 -2.156 0.031 -4.256 -0.202\n", - "x4 0.0607 0.996 0.061 0.951 -1.892 2.013\n", - "x5 0.5010 0.995 0.503 0.615 -1.450 2.452\n", - "==============================================================================\n" - ] - } - ], - "source": [ - "# Show estimation results\n", - "\n", - "print(results.summary())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Multinomial Logit" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "# Load some real data\n", - "\n", - "path = '../../timothyb0912/pylogit/examples/data/swissmetro.dat'\n", - "swissmetro = pd.read_table(path, sep='\\t')\n", - "\n", - "include = (swissmetro.PURPOSE.isin([1, 3]) & (swissmetro.CHOICE != 0))\n", - "swissmetro = swissmetro.loc[include]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "swissmetro.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/smmaurer/Dropbox/Git-rMBP/timothyb0912/pylogit/pylogit/choice_tools.py:431: UserWarning: Note, there are 29 variables in wide_data but the inputs ind_vars, alt_specific_vars, and subset_specific_vars only account for 28 variables.\n", - " msg_2 + msg_3.format(num_vars_accounted_for))\n" - ] - } - ], - "source": [ - "# Convert to long format\n", - "\n", - "ind_vars = swissmetro.columns.tolist()[:15]\n", - "\n", - "alt_varying_vars = {'travel_time': dict([(1, 'TRAIN_TT'), (2, 'SM_TT'), (3, 'CAR_TT')]),\n", - " 'travel_cost': dict([(1, 'TRAIN_CO'), (2, 'SM_CO'), (3, 'CAR_CO')]),\n", - " 'headway': dict([(1, 'TRAIN_HE'), (2, 'SM_HE')])}\n", - "\n", - "availability_vars = {1: 'TRAIN_AV', 2: 'SM_AV', 3: 'CAR_AV'}\n", - "\n", - "alt_id_col = 'mode_id'\n", - "\n", - "swissmetro['custom_id'] = np.arange(swissmetro.shape[0], dtype=int) + 1\n", - "obs_id_col = 'custom_id'\n", - "\n", - "choice_col = 'CHOICE'\n", - "\n", - "data = choicemodels.convert_wide_to_long(swissmetro, ind_vars, alt_varying_vars, \n", - " availability_vars, obs_id_col, choice_col, new_alt_id_name=alt_id_col)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "data.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "# Rescale variables\n", - "\n", - "data[\"travel_time_hrs\"] = data[\"travel_time\"] / 60.0\n", - "data[\"headway_hrs\"] = data[\"headway\"] / 60.0\n", - "data[\"travel_cost_scaled\"] = data[\"travel_cost\"] / 100.0" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "# Set up specification\n", - "\n", - "spec = OrderedDict()\n", - "labels = OrderedDict()\n", - "\n", - "spec[\"intercept\"] = [1, 2]\n", - "labels[\"intercept\"] = ['ASC Train', 'ASC Swissmetro']\n", - "\n", - "spec[\"travel_time_hrs\"] = [[1, 2,], 3]\n", - "labels[\"travel_time_hrs\"] = ['Travel 
Time (Train/SM)', 'Travel Time (Car)']\n", - "\n", - "spec[\"travel_cost_scaled\"] = [1, 2, 3]\n", - "labels[\"travel_cost_scaled\"] = ['Travel Cost (Train)', 'Travel Cost (Swissmetro)', \n", - " 'Travel Cost (Car)']\n", - "\n", - "spec[\"headway_hrs\"] = [1, 2]\n", - "labels[\"headway_hrs\"] = [\"Headway (Train)\", \"Headway (Swissmetro)\"]" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Log-likelihood at zero: -6,964.6630\n", - "Initial Log-likelihood: -6,964.6630\n", - "Estimation Time: 0.09 seconds.\n", - "Final log-likelihood: -5,359.1984\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/smmaurer/anaconda/lib/python2.7/site-packages/scipy/optimize/_minimize.py:385: RuntimeWarning: Method BFGS does not use Hessian information (hess).\n", - " RuntimeWarning)\n" - ] - } - ], - "source": [ - "# Set up and estimate the model\n", - "\n", - "m = choicemodels.MNLogit(data, alt_id_col, obs_id_col, choice_col, spec, names=labels)\n", - "\n", - "results = m.fit_mle(np.zeros(9))" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " Multinomial Logit Model Regression Results \n", - "===================================================================================\n", - "Dep. Variable: CHOICE No. Observations: 6,768\n", - "Model: Multinomial Logit Model Df Residuals: 6,759\n", - "Method: MLE Df Model: 9\n", - "Date: Fri, 07 Oct 2016 Pseudo R-squ.: 0.231\n", - "Time: 16:31:26 Pseudo R-bar-squ.: 0.229\n", - "converged: False Log-Likelihood: -5,359.198\n", - " LL-Null: -6,964.663\n", - "============================================================================================\n", - " coef std err z P>|z| [95.0% Conf. Int.]\n", - "--------------------------------------------------------------------------------------------\n", - "ASC Train -0.4710 0.128 -3.674 0.000 -0.722 -0.220\n", - "ASC Swissmetro 0.2597 0.104 2.504 0.012 0.056 0.463\n", - "Travel Time (Train/SM) -0.7459 0.041 -18.011 0.000 -0.827 -0.665\n", - "Travel Time (Car) -0.5572 0.043 -13.065 0.000 -0.641 -0.474\n", - "Travel Cost (Train) 0.0637 0.004 14.386 0.000 0.055 0.072\n", - "Travel Cost (Swissmetro) 0.0096 0.003 2.969 0.003 0.003 0.016\n", - "Travel Cost (Car) -0.2327 0.091 -2.546 0.011 -0.412 -0.054\n", - "Headway (Train) -0.3592 0.064 -5.590 0.000 -0.485 -0.233\n", - "Headway (Swissmetro) -0.4353 0.192 -2.265 0.023 -0.812 -0.059\n", - "============================================================================================\n" - ] - } - ], - "source": [ - "# Show results\n", - "\n", - "print(results.summary())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": true - }, - "source": [ - "### Alternate syntax for setting up a multinomial specification\n", - "\n", - "This section is speculative -- not yet implemented!" 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# StatsModels allows the following syntax:\n", - "\n", - "spec = 'outcome ~ const + var1 + np.log(var2)'\n", - "\n", - "m = choicemodels.Logit.from_formula(spec, data)\n", - "results = m.fit_mle()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# It would be nice to enable something similar for multinomial models,\n", - "# so that the user interface follows the utility functions more closely\n", - "\n", - "spec = {\n", - " '1': 'choice ~ ASC_t + btt * time_t/60 + bct * cost_t/100 + bht * headway_t/60',\n", - " \n", - " '2': 'choice ~ ASC_sm + btt * time_sm/60 + bcs * cost_sm/100 + bhs * headway_sm/60',\n", - " \n", - " '3': 'choice ~ btc * time_c/60 + bcc * cost_c/100' }\n", - "\n", - "labels = {\n", - " 'ASC_t': \"ASC Train\",\n", - " 'ASC_sm': \"ASC Swissmetro\", \n", - " 'btt': \"Travel Time (Train/SM)\", \n", - " 'btc': \"Travel Time (Car)\", \n", - " 'bct': \"Travel Cost (Train)\", \n", - " 'bcs': \"Travel Cost (Swissmetro)\", \n", - " 'bht': \"Headway (Train)\", \n", - " 'bhs': \"Headway (Swissmetro)\", }\n", - "\n", - "m = choicemodels.MNL.from_formula(spec, data, alt_id_col)\n", - "results = m.fit_mle()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 2", - "language": "python", - "name": "python2" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 2 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.11" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} diff --git a/notebooks/_archive/Data-prep-01.ipynb b/notebooks/_archive/Data-prep-01.ipynb deleted file mode 100644 index 43b5f97..0000000 --- a/notebooks/_archive/Data-prep-01.ipynb +++ /dev/null @@ -1,666 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Data prep for estimating models\n", - "\n", - "Sam Maurer, June 2017\n", - "\n", - "Python 3.6" - ] - }, - { - "cell_type": "code", - "execution_count": 88, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "import numpy as np\n", - "import pandas as pd\n", - "import zipfile" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Load raw CHTS tables\n", - "\n", - "This requires the file named caltrans_full_survey.zip. You can download it by following the instructions in the \"data\" directory." 
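(Not in the original notebook:) once the archive is downloaded, its layout can be confirmed before reading any tables. A quick sketch, assuming the same path used in the cells that follow:

```python
import zipfile

# Assumes caltrans_full_survey.zip has already been downloaded to ../data/
z = zipfile.ZipFile('../data/caltrans_full_survey.zip')
print(z.namelist()[:4])  # expect paths like 'caltrans_full_survey/survey_households.csv'
```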
- ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "z = zipfile.ZipFile('../data/caltrans_full_survey.zip')" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "42426" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "households = pd.read_csv(z.open('caltrans_full_survey/survey_households.csv'), low_memory=False)\n", - "len(households)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "109113" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "persons = pd.read_csv(z.open('caltrans_full_survey/survey_person.csv'), low_memory=False)\n", - "len(persons)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "460524" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "places = pd.read_csv(z.open('caltrans_full_survey/survey_place.csv'), low_memory=False)\n", - "len(places)" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "604711" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "activities = pd.read_csv(z.open('caltrans_full_survey/survey_activity.csv'), low_memory=False)\n", - "len(activities)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Build a master table of census tracts\n", - "\n", - "Generate a table of census tracts in the 9-county Bay Area, for use in destination choice models." 
- ] - }, - { - "cell_type": "code", - "execution_count": 86, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# Suppress scientific notation in the Pandas display output\n", - "\n", - "pd.set_option('display.float_format', lambda x: '%.0f' % x)" - ] - }, - { - "cell_type": "code", - "execution_count": 141, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "6098141414.0\n", - "6.0\n", - "98.0\n" - ] - } - ], - "source": [ - "# Functions to move back and forth between full numerical tract ID and its components\n", - "\n", - "# TO DO - it would be better to generate ints than floats, but it's not obvious\n", - "# to me how to do this in a way that works smoothly with arrays\n", - "\n", - "def full_tract_id(state_id, county_id, tract_id):\n", - " return state_id * 1e9 + county_id * 1e6 + tract_id\n", - "\n", - "def state_id(full_tract_id):\n", - " return np.floor(full_tract_id / 1e9)\n", - "\n", - "def county_id(full_tract_id):\n", - " _county_tract = np.fmod(full_tract_id, 1e9)\n", - " return np.floor(_county_tract / 1e6)\n", - "\n", - "print(full_tract_id(6, 98, 141414))\n", - "print(state_id(6098141414))\n", - "print(county_id(6098141414))" - ] - }, - { - "cell_type": "code", - "execution_count": 142, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "# Generate full tract identifiers for the `places` table\n", - "\n", - "places['full_tract_id'] = full_tract_id(places.state_id, places.county_id, places.tract_id)\n", - "\n", - "# Replace missing identifiers with NaN's\n", - "\n", - "places.loc[(places.tract_id == 999999) |\n", - " (places.county_id == 999) |\n", - " (places.state_id == 99), 'full_tract_id'] = np.nan" - ] - }, - { - "cell_type": "code", - "execution_count": 122, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "9097\n", - " city\n", - "full_tract_id \n", - "1015000800 ANNISTON\n", - "1101001500 MONTGOMERY\n", - "1161400100 SEVILLA\n", - "2020001000 ANCHORAGE\n", - "2020001100 ANCHORAGE\n" - ] - } - ], - "source": [ - "# Generate a master list of census tracts from the `places` table, keeping the\n", - "# city name most commonly associated with each tract\n", - "\n", - "tracts = places[['full_tract_id', 'city']].groupby('full_tract_id').\\\n", - " agg(lambda x:x.value_counts().index[0])\n", - "\n", - "print(tracts.shape[0])\n", - "print(tracts.head())" - ] - }, - { - "cell_type": "code", - "execution_count": 123, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "1583\n", - " city\n", - "full_tract_id \n", - "6001008309 TIJUANA\n", - "6001400100 BERKELEY\n", - "6001400200 OAKLAND\n", - "6001400300 OAKLAND\n", - "6001400400 OAKLAND\n" - ] - } - ], - "source": [ - "# Limit to the 9-county San Francisco Bay Area\n", - "\n", - "tracts = tracts[(state_id(tracts.index).isin([6])) & \n", - " (county_id(tracts.index).\\\n", - " isin([1, 13, 41, 55, 75, 81, 85, 95, 97]))].copy()\n", - "\n", - "print(tracts.shape[0])\n", - "print(tracts.head())" - ] - }, - { - "cell_type": "code", - "execution_count": 116, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "85 371\n", - "1 360\n", - "13 207\n", - "75 195\n", - "81 158\n", - "97 99\n", - "95 97\n", - "41 55\n", - "55 41\n", - "Name: full_tract_id, dtype: int64\n" - ] - } - ], - "source": [ - 
"print(county_id(tracts.index).value_counts())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Calculate some tract-level covariates\n", - "\n", - "Residential density, school/employment density" - ] - }, - { - "cell_type": "code", - "execution_count": 98, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# Note: the `home_tract_id` in the households table is already a full 11-digit\n", - "# identifier, with the same format that we generated for the places table.\n", - "# Same with `empl_tract_id` and `school_tract_id` in the persons table." - ] - }, - { - "cell_type": "code", - "execution_count": 120, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "# Residential density = sum of weighted household sizes by census tract of home\n", - "\n", - "households['_weighted_persons_count'] = households.persons_count * households.hhwgt\n", - "\n", - "home_density = households.groupby('home_tract_id')._weighted_persons_count.sum().\\\n", - " rename('home_density').to_frame()" - ] - }, - { - "cell_type": "code", - "execution_count": 109, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "# Employment density = sum of person weights by census tract of work location\n", - "\n", - "work_density = persons.groupby('empl_tract_id').perwgt.sum().\\\n", - " rename('work_density').to_frame()" - ] - }, - { - "cell_type": "code", - "execution_count": 124, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "# School density = sum of person weights by census tract of school location\n", - "\n", - "school_density = persons.groupby('school_tract_id').perwgt.sum().\\\n", - " rename('school_density').to_frame()" - ] - }, - { - "cell_type": "code", - "execution_count": 125, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " city home_density work_density school_density\n", - "full_tract_id \n", - "6001008309 TIJUANA 0 0 0\n", - "6001400100 BERKELEY 13 13 14\n", - "6001400200 OAKLAND 11 4 1\n", - "6001400300 OAKLAND 29 8 0\n", - "6001400400 OAKLAND 17 4 8\n" - ] - } - ], - "source": [ - "# Merge these into the census tracts table, only keeping Bay Area tracts\n", - "\n", - "tracts = pd.merge(tracts, home_density, how='left', left_index=True, right_index=True)\n", - "tracts = pd.merge(tracts, work_density, how='left', left_index=True, right_index=True)\n", - "tracts = pd.merge(tracts, school_density, how='left', left_index=True, right_index=True)\n", - "tracts = tracts.fillna(0) # fill missing values with zero\n", - "\n", - "print(tracts.head())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Generate a table of trips\n", - "\n", - "For now, this is a table of places visited for non-school, non-work activities" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# - trip destinations are in `places.full_tract_id` (sometimes missing)\n", - "# - trip purposes are in `activities.purpose`, and we want 23 thru 38\n", - "# - places and acitivities are linked by `sampno`, `perno`, `plano`, and there \n", - "# can be multiple activities per place" - ] - 
}, - { - "cell_type": "code", - "execution_count": 126, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "10417660312.0\n" - ] - } - ], - "source": [ - "# Function to generate a single unique ID for places\n", - "\n", - "def place_id(sampno, perno, plano):\n", - " return sampno * 1e4 + perno * 1e2 + plano\n", - "\n", - "print(place_id(1041766, 3, 12))" - ] - }, - { - "cell_type": "code", - "execution_count": 127, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# Add place_id to places table and activities table\n", - "\n", - "places['place_id'] = place_id(places.sampno, places.perno, places.plano)\n", - "activities['place_id'] = place_id(activities.sampno, activities.perno, activities.plano)" - ] - }, - { - "cell_type": "code", - "execution_count": 131, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "# Get list of places that have a secondary activity\n", - "\n", - "_secondary_activity_places = activities.loc[activities.purpose.isin(range(23, 38+1)),\n", - " 'place_id'].drop_duplicates()" - ] - }, - { - "cell_type": "code", - "execution_count": 144, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "145993\n", - " full_tract_id mode trip_distance_miles\n", - "place_id \n", - "10319850102 6095252108 6 13\n", - "10319850202 6095251902 5 5\n", - "10320360102 6073017051 5 4\n", - "10320360104 6073009304 5 19\n", - "10320360105 6073008511 5 6\n" - ] - } - ], - "source": [ - "# Generate a table of those places with some covariates\n", - "\n", - "trips = places.loc[places.place_id.isin(_secondary_activity_places) &\n", - " places.full_tract_id.notnull(),\n", - " ['place_id', 'full_tract_id', 'mode', \n", - " 'trip_distance_miles']].set_index('place_id')\n", - "\n", - "print(trips.shape[0])\n", - "print(trips.head())" - ] - }, - { - "cell_type": "code", - "execution_count": 145, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "36765\n", - " full_tract_id mode trip_distance_miles\n", - "place_id \n", - "10319850102 6095252108 6 13\n", - "10319850202 6095251902 5 5\n", - "10335860102 6085511915 6 156\n", - "10335860103 6085512027 6 2\n", - "10335860104 6085512027 6 0\n" - ] - } - ], - "source": [ - "# Limit to destinations in the 9-county San Francisco Bay Area\n", - "\n", - "trips = trips[(state_id(trips.full_tract_id).isin([6])) & \n", - " (county_id(trips.full_tract_id).\\\n", - " isin([1, 13, 41, 55, 75, 81, 85, 95, 97]))].copy()\n", - "\n", - "print(trips.shape[0])\n", - "print(trips.head())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Save estimation data to disk" - ] - }, - { - "cell_type": "code", - "execution_count": 134, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "tracts.to_csv('../data/tracts.csv')" - ] - }, - { - "cell_type": "code", - "execution_count": 146, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "trips.to_csv('../data/trips.csv')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": 
[] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.0" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/notebooks/_archive/Destination-choice-models-01.ipynb b/notebooks/_archive/Destination-choice-models-01.ipynb deleted file mode 100644 index 45d91bb..0000000 --- a/notebooks/_archive/Destination-choice-models-01.ipynb +++ /dev/null @@ -1,634 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Exploring destination choice models\n", - "\n", - "Sam Maurer, June 2017\n", - "\n", - "Python 3.6\n", - "\n", - "## Plan\n", - "\n", - "- Set up a simple MNL destination choice model using the `urbansim.urbanchoice` interface\n", - "\n", - "- Refactor the code, using this notebook for ad-hoc testing\n", - "\n", - "- Set up more complex models as needed" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "import numpy as np\n", - "import pandas as pd\n", - "\n", - "from patsy import dmatrix\n", - "from urbansim.urbanchoice import interaction, mnl\n", - "\n", - "from choicemodels import MultinomialLogit" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# Suppress deprecation warnings\n", - "\n", - "import warnings; warnings.simplefilter('ignore')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Load estimation data from disk" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# Suppress scientific notation in the Pandas display output\n", - "\n", - "pd.set_option('display.float_format', lambda x: '%.3f' % x)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "1583\n", - " city home_density work_density school_density\n", - "full_tract_id \n", - "6001008309.000 TIJUANA 0.000 0.000 0.000\n", - "6001400100.000 BERKELEY 13.438 13.131 13.512\n", - "6001400200.000 OAKLAND 11.090 4.249 0.895\n", - "6001400300.000 OAKLAND 28.878 7.672 0.000\n", - "6001400400.000 OAKLAND 16.885 4.064 8.150\n" - ] - } - ], - "source": [ - "tracts = pd.read_csv('../data/tracts.csv').set_index('full_tract_id')\n", - "\n", - "print(tracts.shape[0])\n", - "print(tracts.head())" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "36765\n", - " full_tract_id mode trip_distance_miles\n", - "place_id \n", - "10319850102.000 6095252108.000 6.000 13.428\n", - "10319850202.000 6095251902.000 5.000 5.126\n", - "10335860102.000 6085511915.000 6.000 156.371\n", - "10335860103.000 6085512027.000 6.000 1.616\n", - "10335860104.000 6085512027.000 6.000 0.376\n" - ] - } - ], - "source": [ - "trips = pd.read_csv('../data/trips.csv').set_index('place_id')\n", - "\n", - "print(trips.shape[0])\n", - 
"print(trips.head())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## MNL destination choice using urbansim.urbanchoice" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# - each trip is a realized choice of a particular census tract\n", - "# - we can randomly sample alternative census tracts and build a model\n", - "# of destination choice" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# `interaction.mnl_interaction_dataset()` is not documented very well, but \n", - "# this is how it seems to work\n", - "\n", - "# Takes following input:\n", - "# - choosers: pandas.DataFrame with unique index\n", - "# - alternatives: pandas.DataFrame with unique index\n", - "# - SAMPLE_SIZE: number of alternatives for each choice scenario\n", - "# - chosenalts: list containing the alternative id chosen by each chooser?\n", - "\n", - "# Returns following output:\n", - "# - full list of alternatives that were sampled\n", - "# - long-format DataFrame merging the two tables\n", - "# - numchoosers X SAMPLE_SIZE matrix representing chosen alternatives" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Start with a sample of ~500 trips for easier computation" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "490\n", - " full_tract_id mode trip_distance_miles\n", - "place_id \n", - "71720050203.000 6055201402.000 6.000 3.080\n", - "19678330204.000 6095253404.000 6.000 15.400\n", - "30057980204.000 6001408600.000 6.000 7.070\n", - "30002610307.000 6001433400.000 5.000 1.371\n", - "30208410103.000 6085503601.000 5.000 7.498\n" - ] - } - ], - "source": [ - "choosers = trips.loc[np.random.choice(trips.index, 500, replace=False)]\n", - "choosers = choosers.loc[choosers.trip_distance_miles.notnull()]\n", - "\n", - "print(choosers.shape[0])\n", - "print(choosers.head())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Sample 100 alternatives for each and set up a long-format data table" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "49000\n", - "(490, 100)\n" - ] - } - ], - "source": [ - "numalts = 100\n", - "\n", - "_, merged, chosen = interaction.mnl_interaction_dataset(\n", - " choosers=choosers, alternatives=tracts, SAMPLE_SIZE=numalts, \n", - " chosenalts=choosers.full_tract_id)\n", - "\n", - "print(merged.shape[0])\n", - "print(chosen.shape)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Use Patsy to generate the design matrix" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " Intercept home_density work_density school_density\n", - "full_tract_id \n", - "6055201402.000 1.000 13.406 1.692 0.000\n", - "6013308001.000 1.000 8.448 0.828 2.252\n", - "6085500901.000 1.000 6.060 32.747 110.417\n", - "6085503712.000 1.000 16.097 6.792 0.000\n", - "6097153801.000 1.000 48.146 3.061 8.313\n" - ] - } - ], - 
"source": [ - "model_expression = \"home_density + work_density + school_density\"\n", - "\n", - "model_design = dmatrix(model_expression, data=merged, return_type='dataframe')\n", - "\n", - "print(model_design.head())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Fit the model using mnl_estimate()" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'convergence': -2209.5185606064615, 'null': -2256.5333911341672, 'ratio': 0.02083498108755011}\n", - " Coefficient Std. Error T-Score\n", - "0 -0.000 0.084 -0.000\n", - "1 0.013 0.004 3.049\n", - "2 0.012 0.001 9.855\n", - "3 0.011 0.005 2.170\n" - ] - } - ], - "source": [ - "log_likelihoods, fit_parameters = mnl.mnl_estimate(\n", - " model_design.as_matrix(), chosen, numalts=numalts)\n", - "\n", - "print(log_likelihoods)\n", - "print(fit_parameters)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## NEW -- Same process in ChoiceModels" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "from choicemodels import MultinomialLogit\n", - "from choicemodels.tools import MergedChoiceTable" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "490\n" - ] - } - ], - "source": [ - "# Start with the same sample of trips\n", - "\n", - "print(choosers.shape[0])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Merge choosers and alternatives using a new ChoiceModels interface" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "49000\n" - ] - } - ], - "source": [ - "merged = MergedChoiceTable(observations = choosers, \n", - " alternatives = tracts, \n", - " chosen_alternatives = choosers.full_tract_id, \n", - " sample_size = numalts)\n", - "\n", - "print(type(merged))\n", - "print(merged.to_frame().shape[0])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Estimate a model using the ChoiceModels engine" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " CHOICEMODELS ESTIMATION RESULTS \n", - "===================================================================\n", - "Dep. Var.: chosen No. Observations: \n", - "Model: Multinomial Logit Df Residuals: \n", - "Method: Maximum Likelihood Df Model: \n", - "Date: Pseudo R-squ.: \n", - "Time: Pseudo R-bar-squ.: \n", - "AIC: Log-Likelihood: -2,206.414\n", - "BIC: LL-Null: -2,256.533\n", - "===================================================================\n", - " coef std err z P>|z| Conf. 
Int.\n", - "-------------------------------------------------------------------\n", - "home_density 0.0123 0.003 4.574 \n", - "work_density 0.0128 0.001 10.993 \n", - "school_density 0.0097 0.005 2.018 \n", - "===================================================================\n", - "CPU times: user 125 ms, sys: 34.8 ms, total: 160 ms\n", - "Wall time: 110 ms\n" - ] - } - ], - "source": [ - "%%time\n", - "model_expression = \"home_density + work_density + school_density - 1\"\n", - "\n", - "model = MultinomialLogit(data = merged.to_frame(), \n", - " observation_id_col = merged.observation_id_col, \n", - " choice_col = merged.choice_col,\n", - " model_expression = model_expression)\n", - "\n", - "results = model.fit()\n", - "print(results)" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], - "source": [ - "print(type(results))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Estimate a model using the PyLogit engine\n", - "\n", - "Usage is the same, but with an OrderedDict model expression" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "from collections import OrderedDict" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Log-likelihood at zero: -2,256.5334\n", - "Initial Log-likelihood: -2,256.5334\n", - "Estimation Time: 0.15 seconds.\n", - "Final log-likelihood: -2,206.4141\n", - " Multinomial Logit Model Regression Results \n", - "===================================================================================\n", - "Dep. Variable: chosen No. 
Observations: 490\n", - "Model: Multinomial Logit Model Df Residuals: 487\n", - "Method: MLE Df Model: 3\n", - "Date: Tue, 27 Jun 2017 Pseudo R-squ.: 0.022\n", - "Time: 19:51:07 Pseudo R-bar-squ.: 0.021\n", - "converged: True Log-Likelihood: -2,206.414\n", - " LL-Null: -2,256.533\n", - "==================================================================================\n", - " coef std err z P>|z| [0.025 0.975]\n", - "----------------------------------------------------------------------------------\n", - "home_density 0.0123 0.004 2.942 0.003 0.004 0.020\n", - "work_density 0.0128 0.001 11.104 0.000 0.011 0.015\n", - "school_density 0.0097 0.004 2.191 0.028 0.001 0.018\n", - "==================================================================================\n", - "CPU times: user 15.5 s, sys: 13.7 s, total: 29.2 s\n", - "Wall time: 21 s\n" - ] - } - ], - "source": [ - "%%time\n", - "model_expression = OrderedDict([('home_density', 'all_same'),\n", - " ('work_density', 'all_same'),\n", - " ('school_density', 'all_same')])\n", - "\n", - "model = MultinomialLogit(data = merged.to_frame(),\n", - " observation_id_col = merged.observation_id_col,\n", - " alternative_id_col = merged.alternative_id_col,\n", - " choice_col = merged.choice_col,\n", - " model_expression = model_expression)\n", - "\n", - "results = model.fit()\n", - "print(results)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 2", - "language": "python", - "name": "python2" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 2 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.11" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/notebooks/_archive/MNL-prediction-demo-01.ipynb b/notebooks/_archive/MNL-prediction-demo-01.ipynb deleted file mode 100644 index 75e3cdc..0000000 --- a/notebooks/_archive/MNL-prediction-demo-01.ipynb +++ /dev/null @@ -1,623 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## MNL prediction demo\n", - "\n", - "Sam Maurer, July 2017\n", - "\n", - "Python 3.6\n", - "\n", - "### Summary\n", - "\n", - "This notebook demonstrates how to fit a model using the ChoiceModels interface and then use the UrbanSim MNL functions to generate predictions. \n", - "\n", - "Eventually, a prediction interface will be incorporated into the `MultinomialLogitResults` object, but it's not there yet!\n", - "\n", - "This demo uses the estimation data that's set up in the `Data-prep-01` notebook." 
- ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "import numpy as np\n", - "import pandas as pd\n", - "\n", - "from patsy import dmatrix\n", - "\n", - "from choicemodels import mnl # could also import form urbansim.urbanchoice\n", - "from choicemodels import MultinomialLogit\n", - "from choicemodels.tools import MergedChoiceTable" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# Suppress deprecation warnings\n", - "import warnings; warnings.simplefilter('ignore')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Load data from disk" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "1566\n", - " city home_density work_density school_density\n", - "full_tract_id \n", - "6.001400e+09 BERKELEY 13.437961 13.130867 13.511570\n", - "6.001400e+09 OAKLAND 11.089638 4.248928 0.894794\n", - "6.001400e+09 OAKLAND 28.878399 7.671554 0.000000\n" - ] - } - ], - "source": [ - "tracts = pd.read_csv('../data/tracts.csv').set_index('full_tract_id')\n", - "tracts = tracts.loc[(tracts.home_density > 0) | (tracts.work_density > 0) | (tracts.school_density > 0)]\n", - "\n", - "print(tracts.shape[0])\n", - "print(tracts.head(3))" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "35787\n", - " full_tract_id mode trip_distance_miles\n", - "place_id \n", - "1.031985e+10 6.095252e+09 6.0 13.428271\n", - "1.031985e+10 6.095252e+09 5.0 5.125960\n", - "1.033586e+10 6.085512e+09 6.0 156.370628\n" - ] - } - ], - "source": [ - "trips = pd.read_csv('../data/trips.csv').set_index('place_id')\n", - "trips = trips.loc[trips.trip_distance_miles.notnull()]\n", - "\n", - "print(trips.shape[0])\n", - "print(trips.head(3))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Set up estimation table\n", - "\n", - "Each observed trip is a realized choice of a particular destination census tract. We can randomly sample alternative census tracts to build a model of destination choice.\n", - "\n", - "We'll divide the trips into a training set and a testing set, fit an MNL model using the training data, use it to generate predicted choices for the testing data, and compare the predicted to the actual choices." 
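[Note: with `sample_size = 100`, the merged choice table contains one row per chooser–alternative pair, so the shapes printed below follow directly: 1,000 training trips × 100 sampled alternatives = 100,000 rows, and 34,734 testing trips × 100 = 3,473,400 rows.]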
- ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "(100000, 9)\n", - "(3473400, 9)\n" - ] - } - ], - "source": [ - "training_observations = trips.iloc[:1000]\n", - "training = MergedChoiceTable(observations = training_observations,\n", - " alternatives = tracts,\n", - " chosen_alternatives = training_observations.full_tract_id,\n", - " sample_size = 100)\n", - "\n", - "testing_observations = trips.iloc[1000:]\n", - "testing = MergedChoiceTable(observations = testing_observations,\n", - " alternatives = tracts,\n", - " chosen_alternatives = testing_observations.full_tract_id,\n", - " sample_size = 100)\n", - "\n", - "print(training.to_frame().shape)\n", - "print(testing.to_frame().shape)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Fit a model using the training observations" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " CHOICEMODELS ESTIMATION RESULTS \n", - "===================================================================\n", - "Dep. Var.: chosen No. Observations: \n", - "Model: Multinomial Logit Df Residuals: \n", - "Method: Maximum Likelihood Df Model: \n", - "Date: Pseudo R-squ.: \n", - "Time: Pseudo R-bar-squ.: \n", - "AIC: Log-Likelihood: -4,504.887\n", - "BIC: LL-Null: -4,605.170\n", - "===================================================================\n", - " coef std err z P>|z| Conf. Int.\n", - "-------------------------------------------------------------------\n", - "home_density 0.0109 0.002 5.848 \n", - "work_density 0.0122 0.001 15.221 \n", - "school_density 0.0071 0.004 1.976 \n", - "===================================================================\n", - "CPU times: user 499 ms, sys: 46.8 ms, total: 546 ms\n", - "Wall time: 192 ms\n" - ] - } - ], - "source": [ - "%%time\n", - "model_expression = \"home_density + work_density + school_density - 1\"\n", - "\n", - "model = MultinomialLogit(data = training.to_frame(), \n", - " observation_id_col = training.observation_id_col, \n", - " choice_col = training.choice_col,\n", - " model_expression = model_expression)\n", - "\n", - "results = model.fit()\n", - "print(results)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Predict destination choices for the testing observations\n", - "\n", - "We'll use the UrbanSim MNL functions directly, because this hasn't been integrated into the ChoiceModels results classes yet. 
https://github.com/UDST/choicemodels/blob/master/choicemodels/mnl.py#L536" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "0 0.010935\n", - "1 0.012232\n", - "2 0.007140\n", - "Name: Coefficient, dtype: float64\n" - ] - } - ], - "source": [ - "# Pull the coefs out of the results object (the PyLogit syntax would be different)\n", - "\n", - "coefs = results.get_raw_results()['fit_parameters']['Coefficient']\n", - "print(coefs)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "(3473400, 3)\n", - " home_density work_density school_density\n", - "full_tract_id \n", - "6.097151e+09 10.659461 6.868701 7.160030\n", - "6.085512e+09 34.971081 5.483731 2.181334\n", - "6.013326e+09 21.491132 0.153325 1.326145\n" - ] - } - ], - "source": [ - "# The data columns for prediction need to align with the coefficients; \n", - "# you can do this manually or with patsy, as here\n", - "\n", - "df = testing.to_frame().set_index('full_tract_id')\n", - "\n", - "testing_df = dmatrix(model_expression, data=df, return_type='dataframe')\n", - "print(testing_df.shape)\n", - "print(testing_df.head(3))" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "34734\n", - "[90 24 75 80 70]\n" - ] - } - ], - "source": [ - "# Simulate a destination choice for each testing observation\n", - "\n", - "choices = mnl.mnl_simulate(testing_df, coefs, numalts=100, returnprobs=False)\n", - "\n", - "print(len(choices))\n", - "print(choices[:5])" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "['a', 'd']\n" - ] - } - ], - "source": [ - "# Annoyingly, that identifies the choices by position rather than by id;\n", - "# here's a function to get the id's\n", - "\n", - "def get_chosen_ids(ids, positions):\n", - " \"\"\"\n", - " We observe N choice scenarios. In each, one of J alternatives is chosen.\n", - " We have a long (len N * J) list of the available alternatives. We have a \n", - " list (len N) of which alternatives were chosen, but it identifies them \n", - " by POSITION and we want their ID. 
\n", - " \n", - " Parameters\n", - " ----------\n", - " ids : list or list-like\n", - " List of alternative ID's (len N * J).\n", - " \n", - " positions : list or list-like\n", - " List of chosen alternatives by position (len N), where each entry is\n", - " an int in range [0, J)\n", - " \n", - " Returns\n", - " -------\n", - " chosen_ids : list\n", - " List of chosen alternatives by ID (len N)\n", - " \n", - " \"\"\"\n", - " N = len(positions)\n", - " J = len(ids) / N\n", - " \n", - " ids_by_obs = np.reshape(ids, (N,J))\n", - " return [ids_by_obs[i][positions[i]] for i in range(N)]\n", - " \n", - "\n", - "print(get_chosen_ids(['a','b','c','d'], [0,1]))" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "34734\n", - "[6085500400.0, 6085512020.0, 6013355115.0, 6085505008.0, 6075016802.0]\n" - ] - } - ], - "source": [ - "# Get tract id's for the simulated choices\n", - "\n", - "predicted_tracts = get_chosen_ids(testing_df.index.tolist(), choices)\n", - "\n", - "print(len(predicted_tracts))\n", - "print(predicted_tracts[:5])" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "34734\n", - "[6097150607.0, 6097150607.0, 6097153200.0, 6097151402.0, 6097151402.0]\n" - ] - } - ], - "source": [ - "# Get tract id's for observed choices\n", - "\n", - "df = testing.to_frame()\n", - "observed_tracts = df.loc[df.chosen == 1, 'full_tract_id'].tolist()\n", - "\n", - "print(len(observed_tracts))\n", - "print(observed_tracts[:5])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Compare the predicted choices to the observed ones\n", - "\n", - "Multinomial models are kind of tricky to validate. We don't expect the actual choices to match, because there are so many alternatives, but we do expect the characteristics of the predicted choices to be similar to the characteristics of the observed choices. \n", - "\n", - "Choose your own metric for this depending on what you're trying to evaluate! It's even plausible that the metric could be something not directly in the model, like the distance between the predicted and actual destination choices." - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "0.0154603558473\n" - ] - } - ], - "source": [ - "# What portion of predicted destination choices were a perfect match?\n", - "# With an uninformative model we would expect 0.01, given that the \n", - "# observed choice is included in the 100 available alternatives.\n", - "\n", - "perfect_match = np.equal(predicted_tracts, observed_tracts)\n", - "print(sum(perfect_match)/len(perfect_match))" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "0.145854426158\n" - ] - } - ], - "source": [ - "# What's the correlation between employment density of the predicted and \n", - "# observed destinations? 
With an uninformative model we would expect 0.\n", - "\n", - "density_1 = pd.Series([tracts.loc[t,'work_density'] for t in predicted_tracts])\n", - "density_2 = pd.Series([tracts.loc[t,'work_density'] for t in observed_tracts])\n", - "\n", - "print(density_1.corr(density_2))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### How does UrbanSim generate household location choices?\n", - "\n", - "These three class methods collectively set up the choosers and alternatives according to various parameters like the sample size, prediction filters, \"probability mode,\" and \"choice mode\" (aggregate or individual):\n", - "\n", - "- `urbansim.models.MNLDiscreteChocieModel.probabilities()` \n", - "- `urbansim.models.MNLDiscreteChocieModel.summed_probabilities()` \n", - "- `urbansim.models.MNLDiscreteChocieModel.predict()` \n", - "\n", - "https://github.com/UDST/urbansim/blob/master/urbansim/models/dcm.py#L474\n", - "\n", - "Then this lower-level function generates a table of probabilities for each alternative, which is passed back to the `MNLDiscreteChoiceModel` class for further processing:\n", - "\n", - "- `urbansim.urbanchoice.mnl.mnl_simulate()`\n", - "\n", - "https://github.com/UDST/urbansim/blob/master/urbansim/urbanchoice/mnl.py#L121" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.0" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/notebooks/_archive/Sampling-correction-01--Tim-edits.html b/notebooks/_archive/Sampling-correction-01--Tim-edits.html deleted file mode 100644 index 5f4ec19..0000000 --- a/notebooks/_archive/Sampling-correction-01--Tim-edits.html +++ /dev/null @@ -1,13193 +0,0 @@ - - - -Sampling-correction--Tim-edits - - - - - - - - - - - - - - - - - - - - - -
[Deleted file: notebooks/_archive/Sampling-correction-01--Tim-edits.html — a 13,193-line HTML export of an archived exploratory notebook; its recoverable content follows.]

## Sampling correction for large choice sets

1. Replicate synthetic data from Guevara & Ben-Akiva 2013
2. Do MNL with and without sampling correction
3. Check whether parameter estimates deviate from true values
4. Extend to Mixed Logit

### 1. Generate synthetic data set

- N = 1000 observations
- J = 1000 alternatives for all observations (C_n = C)
- X = single attribute distributed Uniform(-2,1) for the first 500 alternatives and Uniform(-1,2) for the second half
- beta = generic linear taste coefficient, distributed Normal(mu=1.5, sigma=0.8) across the 1000 observations
- systematic utility = beta * X
- epsilon = error term distributed ExtremeValue(0,1)
- random utility = beta * X + epsilon

Utility of alternative i for agent n:

$$ U_{in} = V_{in} + \varepsilon_{in} = \beta_n x_{i} + \varepsilon_{in} $$

Probability that agent n will choose alternative i:

$$ L_n(i \mid \beta_n, x_n, C_n) = \frac{e^{V_{in}}}{\sum_{j \in C_n} e^{V_{jn}}} $$
In [1]:

```python
import numpy as np
import pandas as pd
```

In [162]:

```python
# Generate attribute x for each of J alternatives

# Start with J << 1000 to speed up runtimes

J = 50  # alternatives

# Set a seed for reproducibility
np.random.seed(12)

Xa = 3 * np.random.rand(J/2) - 2  # uniform distribution over [-2, 1]
Xb = 3 * np.random.rand(J/2) - 1  # uniform distribution over [-1, 2]

X = np.concatenate((Xa, Xb))

print len(X)
print X[:5]
```

```
50
[-1.53751147  0.22014909 -1.21005495 -0.39878182 -1.95627511]
```

In [163]:

```python
# Generate taste coefficient beta for each of N agents

# For regular MNL, I think we need to use a single value, instead of a
# distribution as Guevara & Ben-Akiva used for the mixture model

N = 1000  # agents/observations

beta = np.zeros(N) + 1.5
# beta = 0.8 * np.random.randn(N) + 1.5

print len(beta)
print beta[:5]
```

```
1000
[ 1.5  1.5  1.5  1.5  1.5]
```

In [164]:

```python
print pd.DataFrame(beta).describe()
```

```
            0
count  1000.0
mean      1.5
std       0.0
min       1.5
25%       1.5
50%       1.5
75%       1.5
max       1.5
```

In [165]:

```python
# Generate probability matrix for N agents choosing among J alternatives

def probs(n):
    '''
    Return list of J probabilities for agent n
    '''
    b = beta[n]
    exps = [np.exp(b*x) for x in X]
    sum_exps = np.sum(exps)
    return [exp/sum_exps for exp in exps]

P = np.array([probs(n) for n in range(N)])

print P.shape
```

```
(1000, 50)
```
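The list-based `probs()` above is easy to read but loops in Python. A vectorized equivalent — a sketch assuming the same `beta` and `X` arrays, not code from the original notebook — produces the identical N × J matrix:

```python
# Vectorized alternative to probs(): build the N x J utility matrix,
# then normalize each row (a row-wise softmax)
U = np.outer(beta, X)                                 # U[n, j] = beta[n] * X[j]
P = np.exp(U) / np.exp(U).sum(axis=1, keepdims=True)  # each row sums to 1
```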
In [166]:

```python
# Check that each row sums to 1

print np.sum(P, axis=1)[:10]
```

```
[ 1.  1.  1.  1.  1.  1.  1.  1.  1.  1.]
```

In [167]:

```python
# Simulate a choice from J alternatives for each of N agents

C = [np.random.choice(range(J), p=p) for p in P]

print len(C)
print C[:10]
```

```
1000
[12, 41, 37, 5, 30, 27, 8, 35, 33, 6]
```

Now we have data:

- N agents/observations with true taste coefficients in array "beta"
- J alternatives with single attributes in array "X"
- N choice outcomes in array "C"

### 2. Estimate beta using PyLogit MNL
In [168]:

```python
import pylogit
from collections import OrderedDict
```

In [169]:

```python
# Set up an estimation dataset in long format

d = [[n, i, int(C[n]==i), X[i]] for i in range(J) for n in range(N)]

print len(d)
```

```
50000
```

In [170]:

```python
df = pd.DataFrame(d, columns=['obs_id', 'alt_id', 'choice', 'x'])

print df.describe()
```

```
             obs_id        alt_id        choice             x
count  50000.000000  50000.000000  50000.000000  50000.000000
mean     499.500000     24.500000      0.020000      0.014570
std      288.677877     14.431014      0.140001      1.116965
min        0.000000      0.000000      0.000000     -1.993222
25%      249.750000     12.000000      0.000000     -0.894495
50%      499.500000     24.500000      0.000000      0.220035
75%      749.250000     37.000000      0.000000      0.832675
max      999.000000     49.000000      1.000000      1.985414
```
In [171]:

```python
# Set up model spec

spec = OrderedDict([
        ('x', [range(J)])
    ])

labels = OrderedDict([
        ('x', ['beta_x'])
    ])
```

In [172]:

```python
m = pylogit.create_choice_model(data = df,
                                alt_id_col = 'alt_id',
                                obs_id_col = 'obs_id',
                                choice_col = 'choice',
                                specification = spec,
                                model_type = "MNL",
                                names = labels)

m.fit_mle(init_vals = np.array([0]))
print m.get_statsmodels_summary()
```

```
Log-likelihood at zero: -3,912.0230
Initial Log-likelihood: -3,912.0230
Estimation Time: 0.04 seconds.
Final log-likelihood: -3,065.1983
                     Multinomial Logit Model Regression Results                    
===================================================================================
Dep. Variable:                      choice   No. Observations:                1,000
Model:             Multinomial Logit Model   Df Residuals:                      999
Method:                                MLE   Df Model:                            1
Date:                     Tue, 15 Nov 2016   Pseudo R-squ.:                   0.216
Time:                             17:35:26   Pseudo R-bar-squ.:               0.216
converged:                            True   Log-Likelihood:             -3,065.198
                                             LL-Null:                    -3,912.023
==============================================================================
                 coef    std err          z      P>|z|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
beta_x         1.5324        nan        nan        nan           nan       nan
==============================================================================
```

In [173]:

```python
m.hessian
```

```
Out[173]:
        beta_x
beta_x  5.826086e-13
```

Notes:

1. Clearly pylogit is struggling to create the hessian. The kernel dies on my computer when it attempts to create the hessian with 100 alternatives.
2. However, when using 50 alternatives per person, the issue of NaN standard errors can still be reproduced. In this setting, one can see that the hessian is positive, albeit very small (i.e. near zero). A positive hessian is indicative of a local minimum instead of a local maximum. Given that the model has converged, though — and converged essentially to the true value of 1.5 — it is more likely that the calculation of the hessian is experiencing numerical problems. A hessian near zero indicates a log-likelihood that is essentially flat.
3. When using only 10 alternatives per person, we can still see issues. The standard error is still reported as being huge and the calculated hessian is extremely small. From plotting the log-likelihood function (below) around the estimated value, we can see that the function is not close to flat. This means that the hessian has been calculated incorrectly.
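A quick, plotting-free way to test that suspicion is a central finite difference on the log-likelihood at the estimate. A minimal sketch, assuming `loglik` is any callable that returns the model's log-likelihood at a scalar beta (this helper is illustrative, not part of PyLogit):

```python
def numerical_second_deriv(loglik, beta_hat, h=1e-4):
    # Central-difference approximation to the second derivative of
    # the log-likelihood, i.e. the 1x1 hessian for this model
    return (loglik(beta_hat + h) - 2.0 * loglik(beta_hat) + loglik(beta_hat - h)) / h**2
```

At a genuine maximum this should come out clearly negative; a value near zero would mean the log-likelihood really is flat there.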
In [174]:

```python
# Use a more recent version of the mnl code for convenience
import integrated_mnl_2 as mnl_module

# Import libraries for plotting
import seaborn
import matplotlib.pyplot as plt

%matplotlib inline
```

In [175]:

```python
# Recreate the estimation object to use its convenient log-likelihood function
mnl_estimator = mnl_module.MNLEstimator(model_obj=m,
                                        mapping_dict=m.get_mappings_for_fit(),
                                        ridge=0,
                                        zero_vector=np.zeros(1),
                                        split_params=mnl_module.split_param_vec)
```

In [176]:

```python
# Create a function to plot the second order taylor series based on the
# estimated gradient and hessian, to see if the estimated values are correct
def plot_2nd_order_taylor_series(center_value,
                                 original_y,
                                 x_line,
                                 first_deriv,
                                 second_deriv,
                                 line_format_string="-k",
                                 line_label="2nd Order Taylor Series"):
    # Determine the value of each x minus the value
    # around which the series will be centered.
    diff_from_center = x_line - center_value
    # Create the "y-values" to be plotted
    y_vals = (original_y +
              first_deriv * diff_from_center +
              (second_deriv / 2.0) * np.square(diff_from_center))
    # Make the plot
    plt.plot(x_line, y_vals, line_format_string, label=line_label)

    return None
```

In [177]:

```python
# Plot the log likelihood as a function of beta, around the estimated value
estimated_beta = m.params.values[0]
interval_width = 0.5
beta_line = np.linspace(estimated_beta - interval_width,
                        estimated_beta + interval_width,
                        num=500)

log_likelihoods = [mnl_estimator.convenience_calc_log_likelihood(np.array(test_beta))
                   for test_beta in beta_line]

plt.plot(beta_line, log_likelihoods, label='log-likelihoods')
plt.vlines(estimated_beta,
           min(log_likelihoods),
           m.log_likelihood,
           linestyles='dashed',
           label='max log-likelihood')

# Plot a second order taylor series to see how well we estimated
# the hessian of the log-likelihood function
plot_2nd_order_taylor_series(estimated_beta,
                             m.log_likelihood,
                             beta_line,
                             m.gradient.values[0],
                             m.hessian.values[0, 0],
                             line_format_string='-r')

plt.legend(loc='best')
plt.xlabel(r"$\beta$", fontsize=15)
plt.ylabel("Log-likelihood", fontsize=15)
plt.title(r"Log-likelihood versus $\beta$", fontsize=15)
plt.ylim(ymax=m.log_likelihood + 1)
plt.show()
```

[Figure: the log-likelihood plotted against beta around the estimate, with the second-order Taylor series implied by the reported gradient and hessian overlaid in red; the Taylor curve visibly fails to track the log-likelihood.]
Clearly we've done a poor job.

### Why is the hessian being calculated incorrectly?

#### First, how should the Hessian be calculated?

Note that:

1. the design matrix only consists of variables that remain constant across the dataset
2. the coefficients are constant across the population
3. the choice set is constant across individuals

As a result of all of this, the estimated probabilities of choosing each alternative will be the same for each person.

Now, the hessian is the sum of the second derivatives of the log-likelihood for each observation. So we should see N * hessian_1, where hessian_1 is the second derivative of the log-likelihood for the first observation.
In [179]:

```python
# Get the estimated probabilities of choosing each alternative
# This will be constant across individuals
estimated_probs = [m.long_fitted_probs[N * idx] for idx in range(J)]

# Calculate the derivative of the probabilities with respect
# to s = X*B
dp_ds = np.diag(estimated_probs) - np.outer(estimated_probs,
                                            estimated_probs)

# Calculate the hessian for a single observation
hessian_1 = (-1 * X[None, :]).dot(dp_ds.dot(X[:, None]))

# Calculate the hessian for the entire dataset that we expect to
# recover. Note this multiplication only works because of the
# special set up of this simulation/residential choice problem
expected_hessian = N * hessian_1
print "The hessian we expect to see is:", expected_hessian

# Calculate the standard error that we expect for this dataset
expected_std_error = np.diag(np.linalg.inv(-1 *
                                           expected_hessian))**0.5
print "The standard error we expect to see is:", expected_std_error
```

```
The hessian we expect to see is: [[-482.17324461]]
The standard error we expect to see is: [ 0.04554057]
```

#### Now, what is wrong with the current hessian calculation?

Long story short, the matrix equations I use to calculate the hessian create blocks of dp_ds, so the matrices that multiply and are multiplied by dp_ds need contiguous columns (rows) for a given observation. The simulation data from Sam does not meet this criterion, and I do not yet warn the user when it is violated. Essentially, the input data should place all rows for a given observation together.
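The fix below reorders the rows using PyLogit's mapping matrices. For a table like this one, an equivalent and simpler remedy — a sketch, not what the notebook actually ran — is to sort the long-format data so that each observation's rows are contiguous before creating the model:

```python
# Group each observation's rows together before estimation
ordered_df = df.sort_values(['obs_id', 'alt_id']).reset_index(drop=True)
```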
In [180]:

```python
# Get the mapping matrices from the first model object
mapping_matrices = m.get_mappings_for_fit()

# Order the dataframe by its observation ids
sub_dfs = [df.loc[indices] for indices in
           pylogit.choice_calcs.create_matrix_block_indices(mapping_matrices["rows_to_obs"])]
ordered_df = pd.concat(sub_dfs, axis=0, ignore_index=True)
```

In [181]:

```python
m2 = pylogit.create_choice_model(data = ordered_df,
                                 alt_id_col = 'alt_id',
                                 obs_id_col = 'obs_id',
                                 choice_col = 'choice',
                                 specification = spec,
                                 model_type = "MNL",
                                 names = labels)

m2.fit_mle(init_vals = np.array([0]))
print m2.get_statsmodels_summary()
```

```
Log-likelihood at zero: -3,912.0230
Initial Log-likelihood: -3,912.0230
Estimation Time: 0.04 seconds.
Final log-likelihood: -3,065.1983
                     Multinomial Logit Model Regression Results                    
===================================================================================
Dep. Variable:                      choice   No. Observations:                1,000
Model:             Multinomial Logit Model   Df Residuals:                      999
Method:                                MLE   Df Model:                            1
Date:                     Tue, 15 Nov 2016   Pseudo R-squ.:                   0.216
Time:                             17:35:41   Pseudo R-bar-squ.:               0.216
converged:                            True   Log-Likelihood:             -3,065.198
                                             LL-Null:                    -3,912.023
==============================================================================
                 coef    std err          z      P>|z|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
beta_x         1.5324      0.046     33.649      0.000         1.443     1.622
==============================================================================
```
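The corrected standard error (0.046) matches the value expected from the analytic hessian computed above (0.0455), confirming that the non-contiguous row ordering was the culprit; the coefficient estimate itself (1.5324) is unchanged.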
### Estimate beta using UrbanSim MNL

In [52]:

```python
from urbansim.models import MNLDiscreteChoiceModel
```

In [97]:

```python
# Choosers should be a DataFrame of characteristics, with index as identifier

d = [[n, C[n]] for n in range(N)]

choosers = pd.DataFrame(d, columns=['id', 'choice']).set_index('id')

print len(choosers)
```

```
1000
```

In [98]:

```python
# Alternatives should be a DataFrame of characteristics, with index as identifier

d = [[i, X[i]] for i in range(J)]

alts = pd.DataFrame(d, columns=['id', 'x']).set_index('id')

print len(alts)
```

```
100
```

In [84]:

```python
# It seems like this implementation *requires* us to sample the alternatives,
# so here I'm estimating the model with J-1 alts

m = MNLDiscreteChoiceModel(model_expression = 'x',
                           sample_size = J-1)

m.fit(choosers = choosers,
      alternatives = alts,
      current_choice = 'choice')

m.report_fit()
```

```
Null Log-liklihood: -4595.120
Log-liklihood at convergence: -3793.079
Log-liklihood Ratio: 0.175

+-----------+-------------+------------+---------+
| Component | Coefficient | Std. Error | T-Score |
+-----------+-------------+------------+---------+
| x         |    1.544    |   0.023    |  68.242 |
+-----------+-------------+------------+---------+
```
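With J-1 sampled alternatives per chooser, the UrbanSim estimator recovers a coefficient of 1.544 (s.e. 0.023), close to the true value of 1.5 used to generate the data.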
In [ ]:

```python
# To do
# - look through PyLogit and LCCM code
# - in many-alternative scenarios, attributes of the alternatives will
#   usually be in a separate data table - what helper functions do we need?
```
- - diff --git a/notebooks/_archive/mnl_refactoring.py b/notebooks/_archive/mnl_refactoring.py deleted file mode 100644 index 52bb2ee..0000000 --- a/notebooks/_archive/mnl_refactoring.py +++ /dev/null @@ -1,48 +0,0 @@ -import numpy as np -import pandas as pd - -from choicemodels import MultinomialLogit -from choicemodels.tools import MergedChoiceTable -from collections import OrderedDict - - -tracts = pd.read_csv('../data/tracts.csv').set_index('full_tract_id') -trips = pd.read_csv('../data/trips.csv').set_index('place_id') - -pd.set_option('display.float_format', lambda x: '%.3f' % x) - -choosers = trips.loc[np.random.choice(trips.index, 500, replace=False)] -choosers = choosers.loc[choosers.trip_distance_miles.notnull()] - -numalts = 10 - -merged = MergedChoiceTable(observations = choosers, - alternatives = tracts, - chosen_alternatives = choosers.full_tract_id, - sample_size = numalts) - -model_expression = "home_density + work_density + school_density" - -model = MultinomialLogit(merged.to_frame(), - merged.observation_id_col, - merged.choice_col, - model_expression) - -results = model.fit() - -results.report_fit() - -""" -model_expression = OrderedDict([('home_density', 'all_same'), - ('work_density', 'all_same'), - ('school_density', 'all_same')]) - -model = MultinomialLogit(data = merged.to_frame(), - observation_id_col = merged.observation_id_col, - alternative_id_col = merged.alternative_id_col, - choice_col = merged.choice_col, - model_expression = model_expression) - -results = model.fit() -print(results.print_summaries()) -""" diff --git a/notebooks/make_distance_bands.ipynb b/notebooks/make_distance_bands.ipynb deleted file mode 100644 index 046bd5f..0000000 --- a/notebooks/make_distance_bands.ipynb +++ /dev/null @@ -1,162 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "C:\\Anaconda\\envs\\cm\\lib\\site-packages\\statsmodels\\compat\\pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. 
Please use the pandas.tseries module instead.\n", - " from pandas.core import datetools\n" - ] - } - ], - "source": [ - "import pandas as pd, numpy as np\n", - "from choicemodels.tools import distancematrix as dm\n", - "\n", - "# define distance bands in meters\n", - "distances = [0, 3000, 10000, 20000, np.inf]\n", - "\n", - "# specify input/output file locations\n", - "distance_matrix_file = '../data/bay_tracts_distance_matrix.csv'\n", - "distance_bands_file = '../data/bay_tracts_distance_bands.csv'" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "from to \n", - "06001400100 06001400100 0\n", - " 06001400200 2659\n", - " 06001400300 3595\n", - " 06001400400 3111\n", - " 06001400500 3579\n", - "Name: distance, dtype: int64" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# keep to and from geography IDs as string to preserve preceding zeros in tract IDs\n", - "dtypes = {0:str, 1:str}\n", - "dist_matrix = pd.read_csv(distance_matrix_file, header=None, dtype=dtypes, encoding='utf-8')\n", - "dist_matrix = dist_matrix.rename(columns={0:'from', 1:'to', 2:'distance'})\n", - "dist_matrix = dist_matrix.set_index(['from', 'to'])\n", - "dist_vector = dist_matrix['distance']\n", - "dist_vector.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Wall time: 4.17 s\n" - ] - } - ], - "source": [ - "%%time\n", - "db = dm.distance_bands(dist_vector, distances)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "06001400100 0 [06001400100, 06001400200, 06001404300, 060014...\n", - " 1 [06001400300, 06001400400, 06001400500, 060014...\n", - " 2 [06001406100, 06001407200, 06001407300, 060014...\n", - " 3 [06001430101, 06001430200, 06001430300, 060014...\n", - "06001400200 0 [06001400100, 06001400200, 06001400300, 060014...\n", - " 1 [06001401300, 06001401400, 06001401500, 060014...\n", - " 2 [06001407300, 06001408100, 06001408200, 060014...\n", - " 3 [06001430101, 06001430200, 06001430300, 060014...\n", - "06001400300 0 [06001400200, 06001400300, 06001400400, 060014...\n", - " 1 [06001400100, 06001401500, 06001401600, 060014...\n", - " 2 [06001408100, 06001408200, 06001408300, 060014...\n", - " 3 [06001430101, 06001430200, 06001430300, 060014...\n", - "06001400400 0 [06001400200, 06001400300, 06001400400, 060014...\n", - " 1 [06001400100, 06001401300, 06001401400, 060014...\n", - " 2 [06001407300, 06001407500, 06001408100, 060014...\n", - " 3 [06001430101, 06001430200, 06001430300, 060014...\n", - "06001400500 0 [06001400200, 06001400300, 06001400400, 060014...\n", - " 1 [06001400100, 06001401300, 06001401400, 060014...\n", - " 2 [06001407300, 06001407400, 06001407500, 060014...\n", - " 3 [06001430101, 06001430200, 06001430300, 060014...\n", - "dtype: object" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "db.head(20)" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# save to csv for now... 
should store in database\n", - "db.to_csv(distance_bands_file, index=True, encoding='utf-8')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 2", - "language": "python", - "name": "python2" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 2 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.13" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/notebooks/make_distance_matrix.ipynb b/notebooks/make_distance_matrix.ipynb deleted file mode 100644 index 38bee16..0000000 --- a/notebooks/make_distance_matrix.ipynb +++ /dev/null @@ -1,249 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "C:\\Anaconda\\envs\\cm\\lib\\site-packages\\statsmodels\\compat\\pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.\n", - " from pandas.core import datetools\n" - ] - } - ], - "source": [ - "import pandas as pd, numpy as np\n", - "from choicemodels.tools import distancematrix as dm\n", - "\n", - "tract_centroids_file = '../data/bay_tract_centroids.csv'\n", - "distance_matrix_file = '../data/bay_tracts_distance_matrix.csv'" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Load the data" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "1588" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# use double-precision floating points to ensure sufficient significant digits\n", - "dtypes = {'GEOID10':str, 'lat':np.float64, 'lng':np.float64}\n", - "df = pd.read_csv(tract_centroids_file, dtype=dtypes, encoding='utf-8').sort_values(by='GEOID10')\n", - "len(df)" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# alternatively, create a randomized dataframe of length n to test performance relative to size\n", - "#n = 5000\n", - "#df = pd.DataFrame({'GEOID10':range(n), 'lng':np.random.random(n), 'lat':np.random.random(n)})" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# index the dataframe by place identifier (i.e., census tract ID)\n", - "df = df.set_index('GEOID10')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Calculate distance matrices, reindexed as multi-index vectors\n", - "\n", - "#### First, the euclidean distance vector in units of degrees" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Wall time: 72 ms\n" - ] - } - ], - "source": [ - "%%time\n", - "df_eu_dm = dm.distance_matrix(df, method='euclidean')" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "(2521744L,)\n" - ] - }, - { - "data": { - "text/plain": [ - "06001400100 06001400100 0.000000\n", - " 06001400200 0.026261\n", - " 
06001400300 0.035165\n", - " 06001400400 0.032078\n", - " 06001400500 0.037980\n", - "dtype: float64" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "print(df_eu_dm.shape)\n", - "df_eu_dm.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Next, the great-circle distance vector in units of meters" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "g:\\geoff\\dropbox\\documents\\school\\phd\\work\\2017-summer\\paul\\code\\choicemodels\\choicemodels\\tools\\distancematrix.py:47: RuntimeWarning: invalid value encountered in arccos\n", - " arc = np.arccos(cos)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Wall time: 1.68 s\n" - ] - } - ], - "source": [ - "%%time\n", - "df_gc_dm = dm.distance_matrix(df, method='greatcircle')" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "(2521744L,)\n" - ] - }, - { - "data": { - "text/plain": [ - "06001400100 06001400100 0\n", - " 06001400200 2659\n", - " 06001400300 3595\n", - " 06001400400 3111\n", - " 06001400500 3579\n", - "dtype: int32" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "print(df_gc_dm.shape)\n", - "df_gc_dm.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# save distance matrix to disk\n", - "df_gc_dm.to_csv(distance_matrix_file, index=True, encoding='utf-8')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 2", - "language": "python", - "name": "python2" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 2 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.13" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/notebooks/make_tract_centroids.ipynb b/notebooks/make_tract_centroids.ipynb deleted file mode 100644 index 484675d..0000000 --- a/notebooks/make_tract_centroids.ipynb +++ /dev/null @@ -1,155 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "import pandas as pd, geopandas as gpd" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# specify location of census tracts shapefile and centroids output file\n", - "cal_tracts_file = '../data/tl_2010_06_tract10/tl_2010_06_tract10.shp'\n", - "tract_centroids_file = '../data/bay_tract_centroids.csv'\n", - "\n", - "# identify bay area counties by fips code\n", - "counties = {'Alameda':'001',\n", - " 'Contra Costa':'013',\n", - " 'Marin':'041',\n", - " 'Napa':'055',\n", - " 'San Francisco':'075',\n", - " 'San Mateo':'081',\n", - " 'Santa Clara':'085',\n", - " 'Solano':'095',\n", - " 'Sonoma':'097'}" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "8057" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": 
"execute_result" - } - ], - "source": [ - "# load the tracts shapefile\n", - "gdf_cal = gpd.read_file(cal_tracts_file)\n", - "len(gdf_cal)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "1588" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# retain only those tracts that are in the bay area counties\n", - "gdf_cal['county_fips'] = gdf_cal['GEOID10'].str.slice(start=2, stop=5)\n", - "gdf_bay = gdf_cal[gdf_cal['county_fips'].isin(counties.values())]\n", - "len(gdf_bay)" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# calculate the centroid of each tract polygon then extract lat and lng coordinates\n", - "centroids = gdf_bay.centroid\n", - "lng = centroids.apply(lambda point: point.x)\n", - "lat = centroids.apply(lambda point: point.y)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "1588" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# assemble into a dataframe to save\n", - "df_save = pd.DataFrame({'GEOID10':gdf_bay['GEOID10'],\n", - " 'lat':lat,\n", - " 'lng':lng})\n", - "len(df_save)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# save to disk\n", - "df_save.to_csv(tract_centroids_file, index=False, encoding='utf-8')" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 2", - "language": "python", - "name": "python2" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 2 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.13" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/requirements-dev.txt b/requirements-dev.txt index 45557ee..c089214 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,4 +1,8 @@ +# requirements for development and testing + coverage >= 4.5 coveralls >= 1.3 pytest >= 3.4 -urbansim >= 3.1 \ No newline at end of file +# urbansim >= 3.1 # additional tests will run if urbansim is installed +sphinx +sphinx_rtd_theme \ No newline at end of file diff --git a/setup.py b/setup.py index 92048c3..7f37d6f 100644 --- a/setup.py +++ b/setup.py @@ -1,8 +1,4 @@ -try: - from setuptools import setup -except ImportError: - from distutils.core import setup - +from setuptools import setup # read README as the long description with open('README.md', 'r') as f: @@ -14,8 +10,8 @@ setup( name='choicemodels', - version='0.2.dev7', - description='Tools for discrete choice estimation', + version='0.2.1', + description='Tools for discrete choice modeling', long_description=long_description, author='UDST', url='https://github.com/udst/choicemodels', diff --git a/tests/README.md b/tests/README.md new file mode 100644 index 0000000..201f69a --- /dev/null +++ b/tests/README.md @@ -0,0 +1 @@ +Run tests from this folder using `pytest *.py -s`. 
\ No newline at end of file diff --git a/tests/test_mnl_new.py b/tests/test_mnl_new.py index ce2602e..b54c260 100644 --- a/tests/test_mnl_new.py +++ b/tests/test_mnl_new.py @@ -9,7 +9,6 @@ from pandas.testing import assert_frame_equal from patsy import dmatrix -from urbansim.urbanchoice.mnl import mnl_estimate, mnl_simulate from choicemodels import MultinomialLogit from choicemodels.tools import MergedChoiceTable @@ -43,9 +42,15 @@ def test_mnl(obs, alts): def test_mnl_estimation(obs, alts): """ Confirm that estimated params from the new interface match urbansim.urbanchoice. + Only runs if the urbansim package has been installed. """ - + try: + from urbansim.urbanchoice.mnl import mnl_estimate + except: + print("Comparison of MNL estimation results skipped because urbansim is not installed") + return + model_expression = 'obsval + altval - 1' mct = MergedChoiceTable(obs, alts, 'choice') @@ -67,8 +72,15 @@ def test_mnl_estimation(obs, alts): def test_mnl_prediction(obs, alts): """ Confirm that fitted probabilities in the new codebase match urbansim.urbanchoice. + Only runs if the urbansim package has been installed. """ + try: + from urbansim.urbanchoice.mnl import mnl_simulate + except: + print("Comparison of MNL simulation results skipped because urbansim is not installed") + return + # produce a fitted model mct = MergedChoiceTable(obs, alts, 'choice', 5) m = MultinomialLogit(mct, model_expression='obsval + altval - 1') diff --git a/tests/test_mnl_urbansim.py b/tests/test_mnl_urbansim.py index f4309c0..01dac31 100644 --- a/tests/test_mnl_urbansim.py +++ b/tests/test_mnl_urbansim.py @@ -1,6 +1,5 @@ """ -Test data and results for this are generated -by the R script at data/mnl_tests.R. +These tests were brought over from UrbanSim. """ from __future__ import division diff --git a/tests/test_simulation.py b/tests/test_simulation.py index e0ece16..a4fb20d 100644 --- a/tests/test_simulation.py +++ b/tests/test_simulation.py @@ -202,6 +202,17 @@ def test_max_iter(obs, alts, mct, probs): obs['size'] = 2 # (alts have capacity of 1) choices = iterative_lottery_choices(obs, alts, mct, probs, chooser_size='size', max_iter=5) + + +def test_capacity_break(obs, alts, mct, probs): + """ + Confirm that if alts[capacity].max() < choosers[size].min() will prevent infinite loop. + + """ + obs['size'] = 2 + alts['capacity'] = np.random.choice([3,5], size=len(alts)) # alt capacity left but not enough to host one obs + choices = iterative_lottery_choices(obs, alts, mct, probs, + chooser_size='size', alt_capacity='capacity') def test_parallel_lottery_choices(obs, alts, mct, probs):
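The try/except-and-return pattern added above works, but it silently records a pass when urbansim is absent. An alternative worth noting — a sketch using pytest's standard helper, not what this diff implements — marks the test as skipped instead:

```python
import pytest

def test_mnl_estimation(obs, alts):
    # Skips this test (visibly, in the pytest report) if urbansim is missing
    mnl = pytest.importorskip('urbansim.urbanchoice.mnl')
    # ... then proceed with mnl.mnl_estimate, as in the version above
```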