# Exploring destination choice models

Sam Maurer, August 2017 | Python 3.6

Original version June 2017  
Updated Aug 2017 to use new version of the estimation data (see "Data-prep-02" notebook)

In [1]:
import numpy as np
import pandas as pd

from patsy import dmatrix
from urbansim.urbanchoice import interaction, mnl

from choicemodels import MultinomialLogit

  from pandas.core import datetools


## Load estimation data from disk

In [3]:
# Suppress scientific notation in the Pandas display output

# pd.set_option('display.float_format', lambda x: '%.3f' % x)

In [2]:
tracts = pd.read_csv('../data/tracts_v02.csv').set_index('full_tract_id')

print(tracts.shape[0])
print(tracts.head())

1583
                   city  home_density  work_density  school_density
full_tract_id                                                      
6001008309      TIJUANA      0.000000      0.000000        0.000000
6001400100     BERKELEY     13.437961     13.130867       13.511570
6001400200      OAKLAND     11.089638      4.248928        0.894794
6001400300      OAKLAND     28.878399      7.671554        0.000000
6001400400      OAKLAND     16.884910      4.063805        8.150402


In [3]:
trips = pd.read_csv('../data/trips_v02.csv').set_index('place_id')

print(trips.shape[0])
print(trips.head())

36764
             full_tract_id  mode  trip_distance_miles
place_id                                             
10319850202     6095251902     5             5.125960
10335860102     6085511915     6           156.370628
10335860103     6085512027     6             1.615535
10335860104     6085512027     6             0.375708
10335860105     6085511915     6             0.894730


## MNL destination choice using urbansim.urbanchoice

In [6]:
# - each trip is a realized choice of a particular census tract
# - we can randomly sample alternative census tracts and build a model
#   of destination choice

In [7]:
# `interaction.mnl_interaction_dataset()` is not documented very well, but 
# this is how it seems to work

# Takes following input:
# - choosers: pandas.DataFrame with unique index
# - alternatives: pandas.DataFrame with unique index
# - SAMPLE_SIZE: number of alternatives for each choice scenario
# - chosenalts: list containing the alternative id chosen by each chooser?

# Returns following output:
# - full list of alternatives that were sampled
# - long-format DataFrame merging the two tables
# - numchoosers X SAMPLE_SIZE matrix representing chosen alternatives

### Start with a sample of ~500 trips for easier computation

In [4]:
choosers = trips.loc[np.random.choice(trips.index, 500, replace=False)]
choosers = choosers.loc[choosers.trip_distance_miles.notnull()]

print(choosers.shape[0])
print(choosers.head())

487
             full_tract_id  mode  trip_distance_miles
place_id                                             
25870990106     6097153902     6             1.538361
25255000203     6085505402     5             3.051930
23645200205     6081613700     6             3.040237
71638550104     6075013300     5             2.607826
13820900103     6075017000     5             2.906653


### Sample 100 alternatives for each and set up a long-format data table

In [5]:
numalts = 100

_, merged, chosen = interaction.mnl_interaction_dataset(
    choosers=choosers, alternatives=tracts, SAMPLE_SIZE=numalts, 
    chosenalts=choosers.full_tract_id)

print(merged.shape[0])
print(chosen.shape)

48700
(487, 100)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  alts_sample['join_index'] = np.repeat(choosers.index.values, SAMPLE_SIZE)


### Use Patsy to generate the design matrix

In [6]:
model_expression = "home_density + work_density + school_density"

model_design = dmatrix(model_expression, data=merged, return_type='dataframe')

print(model_design.head())

               Intercept  home_density  work_density  school_density
full_tract_id                                                       
6097153902           1.0     26.727745     14.451364        6.962221
6081609800           1.0     23.900920      6.500372        0.000000
6001407101           1.0     30.019729      0.000000        0.000000
6001433700           1.0      0.487064      5.110116        3.144844
6085504001           1.0      5.083191      0.724509        0.000000


### Fit the model using mnl_estimate()

In [7]:
log_likelihoods, fit_parameters = mnl.mnl_estimate(
    model_design.as_matrix(), chosen, numalts=numalts)

print(log_likelihoods)
print(fit_parameters)

{'null': -2242.7178805762023, 'convergence': -2182.299404558532, 'ratio': 0.026939846755111074}
    Coefficient  Std. Error       T-Score
0 -3.816825e-18    0.088074 -4.333652e-17
1  1.934374e-02    0.004179  4.629110e+00
2  1.107013e-02    0.001238  8.939178e+00
3  1.658034e-02    0.004241  3.909594e+00


  return PMAT(np.exp(self.mat))


## NEW -- Same process in ChoiceModels

In [8]:
from choicemodels import MultinomialLogit
from choicemodels.tools import MergedChoiceTable

In [9]:
# Start with the same sample of trips

print(choosers.shape[0])

487


### Merge choosers and alternatives using a new ChoiceModels interface

In [10]:
merged = MergedChoiceTable(observations = choosers, 
                           alternatives = tracts, 
                           chosen_alternatives = choosers.full_tract_id, 
                           sample_size = numalts)

print(type(merged))
print(merged.to_frame().shape[0])

<class 'choicemodels.tools.interaction.MergedChoiceTable'>
48700


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  alts_sample['join_index'] = np.repeat(choosers.index.values, SAMPLE_SIZE)


### Estimate a model using the ChoiceModels engine

In [11]:
%%time
model_expression = "home_density + work_density + school_density - 1"

model = MultinomialLogit(data = merged.to_frame(), 
                         observation_id_col = merged.observation_id_col, 
                         choice_col = merged.choice_col,
                         model_expression = model_expression)

results = model.fit()
print(results)

                  CHOICEMODELS ESTIMATION RESULTS                  
Dep. Var.:                chosen   No. Observations:               
Model:         Multinomial Logit   Df Residuals:                   
Method:       Maximum Likelihood   Df Model:                       
Date:                              Pseudo R-squ.:                  
Time:                              Pseudo R-bar-squ.:              
AIC:                               Log-Likelihood:       -2,183.890
BIC:                               LL-Null:              -2,242.718
                    coef   std err         z     P>|z|   Conf. Int.
-------------------------------------------------------------------
home_density      0.0196     0.003     7.675                       
work_density      0.0111     0.001     8.993                       
school_density    0.0158     0.004     3.826                       
CPU times: user 146 ms, sys: 4.39 ms, total: 151 ms
Wall time: 48.6 ms


  return PMAT(np.exp(self.mat))


In [12]:
print(type(results))

<class 'choicemodels.mnl.MultinomialLogitResults'>


### Estimate a model using the PyLogit engine

Usage is the same, but with an OrderedDict model expression

In [13]:
from collections import OrderedDict

In [14]:
%%time
model_expression = OrderedDict([('home_density', 'all_same'),
                                ('work_density', 'all_same'),
                                ('school_density', 'all_same')])

model = MultinomialLogit(data = merged.to_frame(),
                         observation_id_col = merged.observation_id_col,
                         alternative_id_col = merged.alternative_id_col,
                         choice_col = merged.choice_col,
                         model_expression = model_expression)

results = model.fit()
print(results)

Log-likelihood at zero: -2,242.7179
Initial Log-likelihood: -2,242.7179
Estimation Time: 0.06 seconds.
Final log-likelihood: -2,183.8902




                     Multinomial Logit Model Regression Results                    
Dep. Variable:                      chosen   No. Observations:                  487
Model:             Multinomial Logit Model   Df Residuals:                      484
Method:                                MLE   Df Model:                            3
Date:                     Thu, 10 Aug 2017   Pseudo R-squ.:                   0.026
Time:                             11:14:28   Pseudo R-bar-squ.:               0.025
AIC:                             4,373.780   Log-Likelihood:             -2,183.890
BIC:                             4,386.345   LL-Null:                    -2,242.718
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
home_density       0.0196      0.004      4.820      0.000       0.012       0.028
work_density       0.0111      0.001      9.964      0.000       0.009       0.