# Exploring destination choice models

Sam Maurer, August 2017 | Python 3.6

Original version June 2017 (v01)  
Updated Aug 2017 (v02) to use new version of the estimation data (see "Data-prep-02" notebook)

In [1]:
import numpy as np
import pandas as pd

from patsy import dmatrix
from urbansim.urbanchoice import interaction, mnl

from choicemodels import MultinomialLogit

  from pandas.core import datetools


## Load estimation data from disk

In [2]:
tracts = pd.read_csv('../data/tracts_v02.csv').set_index('full_tract_id')

print(tracts.shape[0])
print(tracts.head())

1583
                   city  home_density  work_density  school_density
full_tract_id                                                      
6001008309      TIJUANA      0.000000      0.000000        0.000000
6001400100     BERKELEY     13.437961     13.130867       13.511570
6001400200      OAKLAND     11.089638      4.248928        0.894794
6001400300      OAKLAND     28.878399      7.671554        0.000000
6001400400      OAKLAND     16.884910      4.063805        8.150402


In [3]:
trips = pd.read_csv('../data/trips_v02.csv').set_index('place_id')

print(trips.shape[0])
print(trips.head())

36764
             full_tract_id  mode  trip_distance_miles
place_id                                             
10319850202     6095251902     5             5.125960
10335860102     6085511915     6           156.370628
10335860103     6085512027     6             1.615535
10335860104     6085512027     6             0.375708
10335860105     6085511915     6             0.894730


## MNL destination choice using urbansim.urbanchoice

In [4]:
# - each trip is a realized choice of a particular census tract
# - we can randomly sample alternative census tracts and build a model
#   of destination choice

In [5]:
# `interaction.mnl_interaction_dataset()` is not documented very well, but 
# this is how it seems to work

# Takes following input:
# - choosers: pandas.DataFrame with unique index
# - alternatives: pandas.DataFrame with unique index
# - SAMPLE_SIZE: number of alternatives for each choice scenario
# - chosenalts: list containing the alternative id chosen by each chooser?

# Returns following output:
# - full list of alternatives that were sampled
# - long-format DataFrame merging the two tables
# - numchoosers X SAMPLE_SIZE matrix representing chosen alternatives

### Start with a sample of ~500 trips for easier computation

In [6]:
choosers = trips.loc[np.random.choice(trips.index, 500, replace=False)]
choosers = choosers.loc[choosers.trip_distance_miles.notnull()]

print(choosers.shape[0])
print(choosers.head())

490
             full_tract_id  mode  trip_distance_miles
place_id                                             
19752330105     6085504307     1             0.180822
29323050204     6013353002     5            98.121255
22399690108     6085511301     1             0.439090
71966460103     6013364002     6             2.981047
13433100102     6013369001     5             1.989993


### Sample 100 alternatives for each and set up a long-format data table

In [7]:
numalts = 100

_, merged, chosen = interaction.mnl_interaction_dataset(
    choosers=choosers, alternatives=tracts, SAMPLE_SIZE=numalts, 
    chosenalts=choosers.full_tract_id)

print(merged.shape[0])
print(chosen.shape)

49000
(490, 100)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  alts_sample['join_index'] = np.repeat(choosers.index.values, SAMPLE_SIZE)


### Use Patsy to generate the design matrix

In [8]:
model_expression = "home_density + work_density + school_density"

model_design = dmatrix(model_expression, data=merged, return_type='dataframe')

print(model_design.head())

               Intercept  home_density  work_density  school_density
full_tract_id                                                       
6085504307           1.0     24.146421      0.000000        3.920454
6001436300           1.0     12.043594      2.008506        2.059304
6075011800           1.0      0.000000      5.268535        0.000000
6097151203           1.0     12.129010      0.321506        0.000000
6013312000           1.0     16.447524      5.608611        0.000000


### Fit the model using mnl_estimate()

In [9]:
log_likelihoods, fit_parameters = mnl.mnl_estimate(
    model_design.as_matrix(), chosen, numalts=numalts)

print(log_likelihoods)
print(fit_parameters)

{'null': -2256.5333911341672, 'convergence': -2204.0167766232917, 'ratio': 0.023273138663585158}
    Coefficient  Std. Error       T-Score
0 -2.783576e-16    0.085960 -3.238205e-15
1  1.294233e-02    0.004293  3.014600e+00
2  1.105461e-02    0.001206  9.165657e+00
3  1.325068e-02    0.004052  3.270132e+00


  return PMAT(np.exp(self.mat))


## NEW -- Same process in ChoiceModels

In [10]:
from choicemodels import MultinomialLogit
from choicemodels.tools import MergedChoiceTable

In [11]:
# Start with the same sample of trips

print(choosers.shape[0])

490


### Merge choosers and alternatives using a new ChoiceModels interface

In [12]:
merged = MergedChoiceTable(observations = choosers, 
                           alternatives = tracts, 
                           chosen_alternatives = choosers.full_tract_id, 
                           sample_size = numalts)

print(type(merged))
print(merged.to_frame().shape[0])

<class 'choicemodels.tools.interaction.MergedChoiceTable'>
49000


### Estimate a model using the ChoiceModels engine

In [13]:
%%time
model_expression = "home_density + work_density + school_density - 1"

model = MultinomialLogit(data = merged.to_frame(), 
                         observation_id_col = merged.observation_id_col, 
                         choice_col = merged.choice_col,
                         model_expression = model_expression)

results = model.fit()
print(results)

  return PMAT(np.exp(self.mat))


                  CHOICEMODELS ESTIMATION RESULTS                  
Dep. Var.:                chosen   No. Observations:               
Model:         Multinomial Logit   Df Residuals:                   
Method:       Maximum Likelihood   Df Model:                       
Date:                              Pseudo R-squ.:                  
Time:                              Pseudo R-bar-squ.:              
AIC:                               Log-Likelihood:       -2,200.702
BIC:                               LL-Null:              -2,256.533
                    coef   std err         z     P>|z|   Conf. Int.
-------------------------------------------------------------------
home_density      0.0136     0.003     5.154                       
work_density      0.0119     0.001     9.987                       
school_density    0.0129     0.004     3.251                       
CPU times: user 248 ms, sys: 8.24 ms, total: 256 ms
Wall time: 65.5 ms


In [14]:
print(type(results))

<class 'choicemodels.mnl.MultinomialLogitResults'>


### Estimate a model using the PyLogit engine

Usage is the same, but with an OrderedDict model expression

In [15]:
from collections import OrderedDict

In [16]:
%%time
model_expression = OrderedDict([('home_density', 'all_same'),
                                ('work_density', 'all_same'),
                                ('school_density', 'all_same')])

model = MultinomialLogit(data = merged.to_frame(),
                         observation_id_col = merged.observation_id_col,
                         alternative_id_col = merged.alternative_id_col,
                         choice_col = merged.choice_col,
                         model_expression = model_expression)

results = model.fit()
print(results)

Log-likelihood at zero: -2,256.5334
Initial Log-likelihood: -2,256.5334
Estimation Time: 0.09 seconds.
Final log-likelihood: -2,200.7021




                     Multinomial Logit Model Regression Results                    
Dep. Variable:                      chosen   No. Observations:                  490
Model:             Multinomial Logit Model   Df Residuals:                      487
Method:                                MLE   Df Model:                            3
Date:                     Thu, 10 Aug 2017   Pseudo R-squ.:                   0.025
Time:                             12:50:46   Pseudo R-bar-squ.:               0.023
AIC:                             4,407.404   Log-Likelihood:             -2,200.702
BIC:                             4,419.987   LL-Null:                    -2,256.533
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
home_density       0.0136      0.004      3.295      0.001       0.006       0.022
work_density       0.0119      0.001     11.001      0.000       0.010       0.