# Exploring destination choice models

Sam Maurer, June 2017

Python 3.6

## Plan

- Set up a simple MNL destination choice model using the `urbansim.urbanchoice` interface

- Refactor the code, using this notebook for ad-hoc testing

- Set up more complex models as needed

- Add support for PyLogit MNL through an alternate constructor (class method)

In [1]:
import numpy as np
import pandas as pd

from patsy import dmatrix
from urbansim.urbanchoice import interaction, mnl

from choicemodels import MultinomialLogit

In [2]:
# Suppress deprecation warnings

import warnings; warnings.simplefilter('ignore')

## Load estimation data from disk

In [3]:
# Show floats with three fixed decimals in Pandas output rather than
# scientific notation

pd.set_option('display.float_format', '{:.3f}'.format)

In [4]:
# Census tracts, indexed by full_tract_id; these serve as the alternatives
# in the destination choice models below
tracts = (
    pd.read_csv('../data/tracts.csv')
    .set_index('full_tract_id')
)

print(len(tracts))
print(tracts.head())

1583
                    city  home_density  work_density  school_density
full_tract_id                                                       
6001008309.000   TIJUANA         0.000         0.000           0.000
6001400100.000  BERKELEY        13.438        13.131          13.512
6001400200.000   OAKLAND        11.090         4.249           0.895
6001400300.000   OAKLAND        28.878         7.672           0.000
6001400400.000   OAKLAND        16.885         4.064           8.150


In [5]:
# Trips, indexed by place_id; each row's full_tract_id is the census tract
# that was chosen as the destination
trips = (
    pd.read_csv('../data/trips.csv')
    .set_index('place_id')
)

print(len(trips))
print(trips.head())

36765
                 full_tract_id  mode  trip_distance_miles
place_id                                                 
10319850102.000 6095252108.000 6.000               13.428
10319850202.000 6095251902.000 5.000                5.126
10335860102.000 6085511915.000 6.000              156.371
10335860103.000 6085512027.000 6.000                1.616
10335860104.000 6085512027.000 6.000                0.376


## MNL destination choice using urbansim.urbanchoice

In [6]:
# - each trip is a realized choice of a particular census tract
# - we can randomly sample alternative census tracts and build a model
#   of destination choice

In [7]:
# `interaction.mnl_interaction_dataset()` is not documented very well, but 
# this is how it seems to work

# Takes following input:
# - choosers: pandas.DataFrame with unique index
# - alternatives: pandas.DataFrame with unique index
# - SAMPLE_SIZE: number of alternatives for each choice scenario
# - chosenalts: list containing the alternative id chosen by each chooser?

# Returns following output:
# - full list of alternatives that were sampled
# - long-format DataFrame merging the two tables
# - numchoosers X SAMPLE_SIZE matrix representing chosen alternatives

### Start with a sample of ~500 trips for easier computation

In [24]:
# Draw the chooser sample from a dedicated, seeded generator so the same 500
# trips (and therefore the same row counts below) come back on every
# Restart & Run All. Only this draw is seeded here.
sample_rng = np.random.RandomState(42)
sampled_ids = sample_rng.choice(trips.index, 500, replace=False)

choosers = trips.loc[sampled_ids]

# Drop trips with no recorded distance, so the final count can be below 500
choosers = choosers.loc[choosers.trip_distance_miles.notnull()]

print(choosers.shape[0])
print(choosers.head())

487
                 full_tract_id  mode  trip_distance_miles
place_id                                                 
13426430210.000 6013339001.000 5.000                7.799
14587590203.000 6081607200.000 5.000                2.093
70164780108.000 6075012402.000 1.000                0.359
17514320402.000 6001427100.000 1.000                0.064
13924110405.000 6013303203.000 6.000                3.712


### Sample alternatives and set up a long-format data table

In [26]:
# Number of alternatives in each choice scenario
numalts = 100

# Returns (sampled alternatives, long-format merged table, chosen matrix);
# the first element is not used in this notebook
_, merged, chosen = interaction.mnl_interaction_dataset(
    choosers=choosers,
    alternatives=tracts,
    SAMPLE_SIZE=numalts,
    chosenalts=choosers.full_tract_id,
)

print(len(merged))
print(chosen.shape)

48700
(487, 100)


### Use Patsy to generate the design matrix

In [27]:
# Right-hand-side terms only; the resulting design matrix also carries an
# Intercept column
model_expression = "home_density + work_density + school_density"

model_design = dmatrix(
    model_expression,
    data=merged,
    return_type='dataframe',
)

print(model_design.head())

                Intercept  home_density  work_density  school_density
full_tract_id                                                        
6013339001.000      1.000         4.646        26.256           4.856
6085502910.000      1.000         5.573         8.856           0.000
6001441700.000      1.000        19.828         2.202           2.164
6001437200.000      1.000        14.579         4.993          46.697
6001441524.000      1.000         0.000         0.471           3.121


### Fit the model using mnl_estimate()

In [28]:
# Pass the design matrix to mnl_estimate() as a NumPy array. `.values` is the
# supported equivalent of `.as_matrix()`, which is deprecated and no longer
# exists in pandas 1.0+.
log_likelihoods, fit_parameters = mnl.mnl_estimate(
    model_design.values, chosen, numalts=numalts)

print(log_likelihoods)
print(fit_parameters)

{'null': -2242.7178805762023, 'convergence': -2182.3458970189504, 'ratio': 0.026919116345450078}
   Coefficient  Std. Error  T-Score
0       -0.000       0.083   -0.000
1        0.016       0.004    3.841
2        0.012       0.001   10.011
3        0.014       0.005    2.801


## NEW -- Same process in ChoiceModels

In [12]:
from choicemodels import MultinomialLogit
from choicemodels.tools import MergedChoiceTable

In [23]:
# Reuse the chooser sample drawn earlier: the 500 sampled trips minus any
# rows with a missing trip_distance_miles

print(len(choosers))

483


### Merge choosers and alternatives using a new ChoiceModels interface

In [14]:
# Build the merged choice table; to_frame() gives the long-format DataFrame
# (the row count printed below is observations x sample_size)
merged = MergedChoiceTable(
    observations=choosers,
    alternatives=tracts,
    chosen_alternatives=choosers.full_tract_id,
    sample_size=numalts,
)

print(type(merged))
print(len(merged.to_frame()))

<class 'choicemodels.tools.interaction.MergedChoiceTable'>
48300


### Estimate a model using the ChoiceModels engine

In [22]:
%%time
model_expression = "home_density + work_density + school_density"

model = MultinomialLogit(data = merged.to_frame(), 
                         observation_id_col = merged.observation_id_col, 
                         choice_col = merged.choice_col,
                         model_expression = model_expression)

results = model.fit()
print(results)

                  CHOICEMODELS ESTIMATION RESULTS                  
Dep. Var.:                chosen   No. Observations:               
Model:         Multinomial Logit   Df Residuals:                   
Method:       Maximum Likelihood   Df Model:                       
Date:                              Pseudo R-squ.:                  
Time:                              Pseudo R-bar-squ.:              
AIC:                               Log-Likelihood:       -2,187.539
BIC:                               LL-Null:              -2,224.297
                    coef   std err         z     P>|z|   Conf. Int.
-------------------------------------------------------------------
Intercept        -0.0000     0.086    -0.000                       
home_density      0.0150     0.004     3.626                       
work_density      0.0100     0.001     6.770                       
school_density    0.0121     0.004     3.194                       
CPU times: user 228 ms, sys: 5.53 ms, total: 234

In [20]:
# Show the class of the object returned by model.fit()
print(type(results))

<class 'choicemodels.mnl.MultinomialLogitResults'>


### Estimate a model using the PyLogit engine

Usage is the same, but with an OrderedDict model expression and an additional `alternative_id_col` argument

In [17]:
from collections import OrderedDict

In [18]:
%%time
model_expression = OrderedDict([('home_density', 'all_same'),
                                ('work_density', 'all_same'),
                                ('school_density', 'all_same')])

model = MultinomialLogit(data = merged.to_frame(),
                         observation_id_col = merged.observation_id_col,
                         alternative_id_col = merged.alternative_id_col,
                         choice_col = merged.choice_col,
                         model_expression = model_expression)

results = model.fit()
print(results)

Log-likelihood at zero: -2,224.2972
Initial Log-likelihood: -2,224.2972
Estimation Time: 0.05 seconds.
Final log-likelihood: -2,187.5391
                     Multinomial Logit Model Regression Results                    
Dep. Variable:                      chosen   No. Observations:                  483
Model:             Multinomial Logit Model   Df Residuals:                      480
Method:                                MLE   Df Model:                            3
Date:                     Mon, 26 Jun 2017   Pseudo R-squ.:                   0.017
Time:                             23:07:24   Pseudo R-bar-squ.:               0.015
AIC:                             4,381.078   Log-Likelihood:             -2,187.539
BIC:                             4,393.618   LL-Null:                    -2,224.297
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
home_density       0.0150