# Estimating External Worker Location Model

This notebook re-estimates the ActivitySim external worker location model using Larch. The model predicts which external station an external worker will travel to for an external tour.

# Load libraries

In [160]:
import os
import larch  # !conda install larch -c conda-forge # for estimation
import pandas as pd
import numpy as np
from larch import P, X
import matplotlib.pyplot as plt
import shutil

Set the working directory to the folder that holds the estimation data bundle (EDB) for external worker location choice.

In [161]:
# Work from inside the external workplace location EDB folder so that the CSV
# file names used in the following cells can be given relative to it.
edb_working_dir = '/projects/SANDAG/2017 On-Call Modeling Services/Area B/TO 05 - ABM3/estimation/output/estimation_data_bundle/external_workplace_location'
os.chdir(edb_working_dir)

# Drop duplicate person records

In [162]:
# Read the untouched alternatives table plus the household and person override tables.
# The "_orig" backup is created here when it does not exist yet, so this cell also
# works on a fresh EDB; previously the backup was only made in a later cell, after
# this read had already been attempted.
alts_file = "external_workplace_location_alternatives_combined.csv"
alts_orig_file = "external_workplace_location_alternatives_combined_orig.csv"
if not os.path.exists(alts_orig_file):
    # copy2 leaves the working file in place; the backup is never overwritten
    shutil.copy2(alts_file, alts_orig_file)
alts_combined_data = pd.read_csv(alts_orig_file)
household_data = pd.read_csv("../override_households.csv")
person_data = pd.read_csv("../override_persons.csv")
alts_combined_data.shape

(10605, 11)

In [163]:
# Show the household table's dimensions as (rows, columns).
household_data.shape

(49762, 26)

In [164]:
# Attach each person's household_id and PNUM to the alternatives rows.
# The left join keeps every alternatives row, matched or not.
person_keys = person_data[['person_id','household_id','PNUM']]
alts_combined_per_data = alts_combined_data.merge(person_keys, on="person_id", how='left')
alts_combined_per_data.shape

(10605, 13)

In [165]:
# Attach HH_ID from the household table, matching on household_id.
# pandas' default inner join is kept, here spelled out explicitly.
household_keys = household_data[['household_id','HH_ID']]
alts_combined_per_hh_data = alts_combined_per_data.merge(household_keys, on="household_id", how="inner")
alts_combined_per_hh_data.shape

(10605, 14)

In [166]:
# Keep the first row for each (HH_ID, PNUM, variable) combination, then remove
# the helper columns that the two merges added.
alts_combined_new_data = (
    alts_combined_per_hh_data
    .drop_duplicates(subset=['HH_ID','PNUM','variable'])
    .drop(columns=['household_id','PNUM','HH_ID'])
)
alts_combined_new_data.shape

(2205, 11)

In [167]:
# Back up the original alternatives file once (an existing backup is never
# overwritten), then replace the working file with the de-duplicated table.
alts_csv = 'external_workplace_location_alternatives_combined.csv'
alts_backup_csv = "external_workplace_location_alternatives_combined_orig.csv"
if not os.path.exists(alts_backup_csv):
    shutil.copy2(alts_csv, alts_backup_csv)
alts_combined_new_data.to_csv(alts_csv, index=False)

In [168]:
# Read the untouched choosers table.
# The "_orig" backup is created here when it does not exist yet, so this cell also
# works on a fresh EDB; previously the backup was only made in a later cell, after
# this read had already been attempted.
choosers_file = "external_workplace_location_choosers_combined.csv"
choosers_orig_file = "external_workplace_location_choosers_combined_orig.csv"
if not os.path.exists(choosers_orig_file):
    # copy2 leaves the working file in place; the backup is never overwritten
    shutil.copy2(choosers_file, choosers_orig_file)
choosers_combined_data = pd.read_csv(choosers_orig_file)
choosers_combined_data.shape

(707, 6)

In [169]:
# Attach each person's PNUM to the chooser rows.
# The left join keeps every chooser row, matched or not.
person_pnum = person_data[['person_id','PNUM']]
choosers_combined_per_data = choosers_combined_data.merge(person_pnum, on="person_id", how='left')
choosers_combined_per_data.shape

(707, 7)

In [170]:
# Attach HH_ID from the household table to the chooser rows (already joined with
# the person file), matching on household_id.
# pandas' default inner join is kept, here spelled out explicitly.
household_ids = household_data[['household_id','HH_ID']]
choosers_combined_per_hh_data = choosers_combined_per_data.merge(household_ids, on="household_id", how="inner")
choosers_combined_per_hh_data.shape

(707, 8)

In [171]:
# Keep only the first chooser row for each (HH_ID, PNUM) pair.
# Unlike the alternatives table, the merged columns are not dropped here.
person_key_columns = ['HH_ID','PNUM']
choosers_combined_new_data = choosers_combined_per_hh_data.drop_duplicates(subset=person_key_columns, keep='first')
choosers_combined_new_data.shape

(147, 8)

In [172]:
# recode workers who chose station 24333 (TAZ 11) to 24327 (TAZ 12); coded incorrectly
# The frame comes from drop_duplicates(), so assigning a column on it in place can
# raise pandas' SettingWithCopyWarning. Building a new frame with assign() avoids
# that and lets the cell be re-run with the same result.
miscoded_station = 24333
corrected_station = 24327
choosers_combined_new_data = choosers_combined_new_data.assign(
    override_choice=choosers_combined_new_data['override_choice'].replace(miscoded_station, corrected_station)
)

In [173]:
# Back up the original choosers file once (an existing backup is never
# overwritten), then replace the working file with the de-duplicated, recoded table.
choosers_csv = 'external_workplace_location_choosers_combined.csv'
choosers_backup_csv = "external_workplace_location_choosers_combined_orig.csv"
if not os.path.exists(choosers_backup_csv):
    shutil.copy2(choosers_csv, choosers_backup_csv)
choosers_combined_new_data.to_csv(choosers_csv, index=False)

In [174]:
# Switch to the estimation root folder before building the model.
estimation_root = '/projects/SANDAG/2017 On-Call Modeling Services/Area B/TO 05 - ABM3/estimation'
os.chdir(estimation_root)

from activitysim.estimation.larch import component_model

# Build the Larch model for this component; return_data=True also returns the
# EDB contents that the following cells inspect.
modelname = "external_workplace_location"
model, data = component_model(modelname, return_data=True)

# Review data loaded from the EDB

The next step is to read the EDB, including the coefficients, model settings, utilities specification, and chooser and alternative data.

### Coefficients

In [175]:
# Display the coefficients table read from the EDB.
data.coefficients

Unnamed: 0_level_0,value,constrain
coefficient_name,Unnamed: 1_level_1,Unnamed: 2_level_1
coef_mode_logsum,0,F
coef_dist_0_1,0,T
coef_dist_1_2,0,T
coef_dist_2_5,0,T
coef_dist_5_15,0,T
coef_dist_15_up,0,T
coef_dist,0,F
coef_dist_lt_2p5,0,T
coef_size,1,T


#### Utility specification

In [176]:
# Display the utility specification read from the EDB.
data.spec

Unnamed: 0,Label,Description,Expression,external_work
0,util_dist,,@_DIST,coef_dist
1,util_dist_lt_2p5,"Distance, piecewise linear from 0 to 1 miles","@np.where(_DIST<2.5,1,0)",coef_dist_lt_2p5
2,util_dist_1_2,"Distance, piecewise linear from 1 to 2 miles","@(_DIST-1).clip(0,1)",coef_dist_1_2
3,util_dist_2_5,"Distance, piecewise linear from 2 to 5 miles","@(_DIST-2).clip(0,3)",coef_dist_2_5
4,util_dist_5_15,"Distance, piecewise linear from 5 to 15 miles","@(_DIST-5).clip(0,10)",coef_dist_5_15
5,util_dist_15_up,"Distance, piecewise linear for 15+ miles",@(_DIST-15.0).clip(0),coef_dist_15_up
6,util_no_attractions,No attractions,@df['size_term']==0,-999
7,mode_choice_logsum,Mode choice logsum,mode_choice_logsum,coef_mode_logsum


## Explore data

In [177]:
# Display the chooser table read from the EDB.
data.chooser_data

Unnamed: 0,person_id,model_choice,override_choice,home_zone_id,household_id,is_external_worker,PNUM,HH_ID
0,38,24327,24327,24303,23,True,2,161004975
1,108,24326,24327,16387,55,True,2,161008944
2,764,24326,24323,5391,387,True,2,161067863
3,962,24326,24327,6641,482,True,1,161080976
4,1536,24327,24327,15411,786,True,2,161135536
...,...,...,...,...,...,...,...,...
142,95534,24330,24327,16268,49046,True,1,22068544
143,95747,24326,24327,15110,49151,True,1,22078307
144,96195,24327,24323,4413,49395,True,3,22098638
145,96371,24326,24327,312,49494,True,2,22106128


In [178]:
#pd.crosstab(data.chooser_data.closest_external_zone, data.chooser_data.override_choice, margins=True)

In [179]:
#pd.crosstab(data.chooser_data.industry, data.chooser_data.override_choice, margins=True, normalize='index')

In [180]:
#pd.crosstab(data.chooser_data.industry, data.chooser_data.override_choice, margins=True)

In [181]:
#data.chooser_data['external_worker'] = np.where(data.chooser_data["override_choice"]==1,0,1)

In [182]:
#plt.hist(data.chooser_data['distance_int'],range=(0, data.chooser_data['distance_int'].max()), bins=data.chooser_data['distance_int'].max() + 1)
#plot_df = data.chooser_data.groupby('distance_int')['external_worker'].mean().mul(100).reindex(range(data.chooser_data.distance_int.min()-1,data.chooser_data.distance_int.max()+1), fill_value=0)

#ax = plot_df.plot(kind='bar',rot = 0,title='Share of External Workers by Distance to Closest External Station',ylim=[0, 30], xlabel="Distance (mi)", ylabel="Percent",  figsize=(20, 5))

#plt.show()

In [183]:
#chooser_data_lt1mi= data.chooser_data[data.chooser_data["distance_int"]<=2]
#pd.crosstab(chooser_data_lt1mi.closest_external_zone, chooser_data_lt1mi.override_choice, margins=True)

# Set Coefficients

In [184]:
# Print the utility expression generated from the EDB spec.
# The earlier bare dir(model) call was removed: its result was discarded because
# it was not the last expression in the cell.
print(model.utility_ca)

  P.coef_dist * X('util_dist*(is_external_worker==True)')
+ P.coef_dist_lt_2p5 * X('util_dist_lt_2p5*(is_external_worker==True)')
+ P.coef_dist_1_2 * X('util_dist_1_2*(is_external_worker==True)')
+ P.coef_dist_2_5 * X('util_dist_2_5*(is_external_worker==True)')
+ P.coef_dist_5_15 * X('util_dist_5_15*(is_external_worker==True)')
+ P.coef_dist_15_up * X('util_dist_15_up*(is_external_worker==True)')
+ P('-999') * X('util_no_attractions*(is_external_worker==True)')
+ P.coef_mode_logsum * X('mode_choice_logsum*(is_external_worker==True)')


In [185]:
# Capping distance at 10 miles.
# Replaces the generated utility expression with the same list of terms, except
# that the coef_dist variable is wrapped in fmin(util_dist,10) so distances
# above 10 contribute the same as a distance of 10.
# Every data expression is still multiplied by (is_external_worker==True).
# NOTE(review): the expression strings are evaluated by Larch against the loaded
# data; confirm that fmin is available in that evaluation context if this is changed.
model.utility_ca =   (P.coef_dist * X('fmin(util_dist,10)*(is_external_worker==True)')
+ P.coef_dist_lt_2p5 * X('util_dist_lt_2p5*(is_external_worker==True)')
+ P.coef_dist_1_2 * X('util_dist_1_2*(is_external_worker==True)')
+ P.coef_dist_2_5 * X('util_dist_2_5*(is_external_worker==True)')
+ P.coef_dist_5_15 * X('util_dist_5_15*(is_external_worker==True)')
+ P.coef_dist_15_up * X('util_dist_15_up*(is_external_worker==True)')
+ P('-999') * X('util_no_attractions*(is_external_worker==True)')
+ P.coef_mode_logsum * X('mode_choice_logsum*(is_external_worker==True)'))

# Estimate

With the model set up for estimation, the next step is to estimate the model coefficients.  Make sure to use a sufficiently large household sample and set of zones to avoid an over-specified model, which does not have a numerically stable likelihood-maximizing solution.  Larch has built-in estimation methods, including BHHH, and also offers access to more advanced general-purpose non-linear optimizers in the `scipy` package, including SLSQP, which allows for bounds and constraints on parameters.  BHHH is the default and typically runs faster, but does not follow constraints on parameters.

In [186]:
# Load the model's data ahead of the estimation call in the next cell.
model.load_data()
# Optional data check, left disabled.
#model.doctor(repair_ch_av="-")

req_data does not request avail_ca or avail_co but it is set and being provided


In [187]:
# Estimate with SLSQP, allowing up to 1000 iterations.
slsqp_options = {"maxiter": 1000}
model.maximize_loglike(method="SLSQP", options=slsqp_options)


Unnamed: 0,value,initvalue,nullvalue,minimum,maximum,holdfast,note,best
-999,-999.0,-999.0,-999.0,-999.0,-999.0,1,,-999.0
coef_dist,-0.52677,0.0,0.0,-25.0,25.0,0,,-0.52677
coef_dist_15_up,0.0,0.0,0.0,0.0,0.0,1,,0.0
coef_dist_1_2,0.0,0.0,0.0,0.0,0.0,1,,0.0
coef_dist_2_5,0.0,0.0,0.0,0.0,0.0,1,,0.0
coef_dist_5_15,0.0,0.0,0.0,0.0,0.0,1,,0.0
coef_dist_lt_2p5,0.0,0.0,0.0,0.0,0.0,1,,0.0
coef_mode_logsum,0.059338,0.0,0.0,-25.0,25.0,0,,0.059338
external_work_external_work,0.0,0.0,0.0,0.0,0.0,1,,0.0


Unnamed: 0_level_0,0
Unnamed: 0_level_1,0
-999,-999.000000
coef_dist,-0.526770
coef_dist_15_up,0.000000
coef_dist_1_2,0.000000
coef_dist_2_5,0.000000
coef_dist_5_15,0.000000
coef_dist_lt_2p5,0.000000
coef_mode_logsum,0.059338
external_work_external_work,0.000000
-999,0.000000

Unnamed: 0,0
-999,-999.0
coef_dist,-0.52677
coef_dist_15_up,0.0
coef_dist_1_2,0.0
coef_dist_2_5,0.0
coef_dist_5_15,0.0
coef_dist_lt_2p5,0.0
coef_mode_logsum,0.059338
external_work_external_work,0.0

Unnamed: 0,0
-999,0.0
coef_dist,0.002387
coef_dist_15_up,0.0
coef_dist_1_2,0.0
coef_dist_2_5,0.0
coef_dist_5_15,0.0
coef_dist_lt_2p5,0.0
coef_mode_logsum,-0.002007
external_work_external_work,0.0


### Estimated coefficients

In [188]:
# Compute the parameter covariance first, then write the Excel estimation report.
model.calculate_parameter_covariance()

# result_dir ends with '/', so plain concatenation yields the full file path.
result_dir='/projects/SANDAG/2017 On-Call Modeling Services/Area B/TO 05 - ABM3/estimation/'
xlsx_report_path = result_dir+"ext_worker_location_004.xlsx"
model.to_xlsx(xlsx_report_path, data_statistics=True)

  xl = ExcelWriter(filename, engine='xlsxwriter_larch', model=model, **kwargs)


<larch.util.excel.ExcelWriter at 0x242aba67340>

# Output Estimation Results

In [189]:
from activitysim.estimation.larch import update_coefficients

# Write the estimated coefficients into the "estimated" folder of the EDB.
# Note that result_dir is rebound here from the string used for the Excel report.
result_dir = data.edb_directory/"estimated"
coefficients_csv_name = f"{modelname}_coefficients_004.csv"
update_coefficients(model, data, result_dir, output_file=coefficients_csv_name);

In [190]:
#larch.__version__

In [191]:
#result_dir

### Write the model estimation report, including coefficient t-statistic and log likelihood

# Next Steps

The final step is to copy the estimated coefficients file written above (`external_workplace_location_coefficients_004.csv`, in the EDB's `estimated` folder) to the configs folder, either manually or automatically, rename it to `*_coefficients.csv`, and run ActivitySim in simulation mode.

In [192]:
#pd.read_csv(result_dir/f"{modelname}_coefficients_revised.csv")