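# Estimating External Non-Mandatory Tour Destination Choice Model

This notebook re-estimates the ActivitySim external non-mandatory tour destination choice model (`external_non_mandatory_destination`) in Larch. It predicts which external station a non-mandatory tour (escort, shopping, eating out, maintenance, social, or discretionary) will travel to.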

# Load libraries

In [230]:
import os
import larch  # !conda install larch -c conda-forge # for estimation
import pandas as pd
import numpy as np
from larch import P, X
import matplotlib.pyplot as plt
import shutil

In [231]:
#The directory with the estimation data bundles for external non-mandatory destination choice.

In [232]:
#os.chdir('/projects/SANDAG/2017 On-Call Modeling Services/Area B/TO 05 - ABM3/estimation/output/estimation_data_bundle/external_non_mandatory_destination')

In [233]:
# Don't need to drop any duplicates; this is a tour file

In [234]:
#alts_combined_data = pd.read_csv("external_non_mandatory_destination_alternatives_combined_orig.csv")
#household_data = pd.read_csv("../override_households.csv")
#person_data = pd.read_csv("../override_persons.csv")
#alts_combined_data.shape

In [235]:
#household_data.shape

In [236]:
#merge alts_combined data with person file
#alts_combined_per_data = pd.merge(alts_combined_data, person_data[['person_id','household_id','PNUM']], on=["person_id"], how='left')
#alts_combined_per_data.shape

In [237]:
#merge alts_combined data (with person file) with household file
#alts_combined_per_hh_data = pd.merge(alts_combined_per_data, household_data[['household_id','HH_ID']], on=["household_id"])
#alts_combined_per_hh_data.shape

In [238]:
#drop duplicate person records and merged columns
#alts_combined_new_data = alts_combined_per_hh_data.drop_duplicates(subset=['HH_ID','PNUM','variable'])
#alts_combined_new_data = alts_combined_new_data.drop(columns=['household_id','PNUM','HH_ID'])
#alts_combined_new_data.shape

In [239]:
#write data to file
#if os.path.exists("external_workplace_location_alternatives_combined_orig.csv")!=True:
#  shutil.copy2('external_workplace_location_alternatives_combined.csv','external_workplace_location_alternatives_combined_orig.csv')
#alts_combined_new_data.to_csv("external_workplace_location_alternatives_combined.csv", index=False)

In [240]:
#choosers_combined_data = pd.read_csv("external_workplace_location_choosers_combined_orig.csv")
#choosers_combined_data.shape

In [241]:
#merge choosers_combined data with person file
#choosers_combined_per_data = pd.merge(choosers_combined_data, person_data[['person_id','PNUM']], on=["person_id"], how='left')
#choosers_combined_per_data.shape

In [242]:
#merge choosers_combined data (with person file) with household file
#choosers_combined_per_hh_data = pd.merge(choosers_combined_per_data, household_data[['household_id','HH_ID']], on=["household_id"])
#choosers_combined_per_hh_data.shape

In [243]:
#drop duplicate person records and merged columns
#choosers_combined_new_data = choosers_combined_per_hh_data.drop_duplicates(subset=['HH_ID','PNUM'])
#choosers_combined_new_data.shape

In [244]:
# recode workers who chose station 24333 (TAZ 11) to 24327 (TAZ 12); coded incorrectly
#choosers_combined_new_data['override_choice'] = np.where(choosers_combined_new_data['override_choice']==24333, 24327, choosers_combined_new_data['override_choice'])
#choosers_combined_new_data['is_external_worker']=True

In [245]:
#write data to file
#if os.path.exists("external_workplace_location_choosers_combined_orig.csv")!=True:
#  shutil.copy2('external_workplace_location_choosers_combined.csv','external_workplace_location_choosers_combined_orig.csv')
#choosers_combined_new_data.to_csv("external_workplace_location_choosers_combined.csv", index=False)

In [246]:
# Switch the working directory to the estimation root before loading the model.
# NOTE(review): this is an absolute, machine-specific path -- adjust it before
# running the notebook anywhere else.
os.chdir('/projects/SANDAG/2017 On-Call Modeling Services/Area B/TO 05 - ABM3/estimation')
# Name of the model component handed to component_model() below.
modelname = "external_non_mandatory_destination"

from activitysim.estimation.larch import component_model
# Build the Larch model for `modelname`; return_data=True also returns the
# accompanying data object (coefficients, spec, chooser data) inspected below.
# NOTE(review): presumably the estimation data bundle is located relative to
# the working directory set above -- confirm.
model, data = component_model(modelname, return_data=True)

# Review data loaded from the EDB

The next step is to read the EDB, including the coefficients, model settings, utilities specification, and chooser and alternative data.

### Coefficients

In [247]:
# Display the coefficient table loaded from the EDB (value + constrain flag).
# NOTE(review): the rendered table lists coef_mode_logsum twice -- check the
# coefficients file for a duplicated row.
data.coefficients

Unnamed: 0_level_0,value,constrain
coefficient_name,Unnamed: 1_level_1,Unnamed: 2_level_1
coef_mode_logsum,0,F
coef_dist_escort,0,F
coef_dist_shopping,0,F
coef_dist_eatout,0,F
coef_dist_maint,0,F
coef_dist_social,0,F
coef_dist_discr,0,F
coef_mode_logsum,0,F
coef_size,1,T


#### Utility specification

In [248]:
# Display the utility specification: one row per utility term and one
# coefficient-name column per tour type.
data.spec

Unnamed: 0,Label,Description,Expression,escort,shopping,eatout,othmaint,social,othdiscr
0,util_dist,dist,@_DIST,coef_dist_escort,coef_dist_shopping,coef_dist_eatout,coef_dist_maint,coef_dist_social,coef_dist_discr
1,util_size_term,Size variable,@df['size_term'].apply(np.log1p),coef_size,coef_size,coef_size,coef_size,coef_size,coef_size
2,util_mode_choice_logsum,Mode choice logsum,mode_choice_logsum,coef_mode_logsum,coef_mode_logsum,coef_mode_logsum,coef_mode_logsum,coef_mode_logsum,coef_mode_logsum


## Explore data

In [249]:
# Display the chooser table (one row per tour), including the simulated
# model_choice and the override_choice columns.
data.chooser_data

Unnamed: 0,tour_id,model_choice,override_choice,person_id,tour_type,tour_type_count,tour_type_num,age,female,home_zone_id,income,income_segment
0,2729,24323,24326.0,54,othmaint,1,1,61,False,18011,91738.75,3
1,28829,24329,24327.0,576,othmaint,1,1,41,False,4040,45818.75,2
2,65079,24326,24328.0,1301,othmaint,1,1,35,False,8135,58526.25,2
3,69375,24328,24327.0,1387,othdiscr,1,1,30,True,14725,39376.25,2
4,71525,24325,24328.0,1430,othdiscr,1,1,71,False,1710,2125.00,1
...,...,...,...,...,...,...,...,...,...,...,...,...
277,4790529,24326,24327.0,95810,othmaint,1,1,71,True,16535,78035.00,3
278,4798479,24329,24326.0,95969,othmaint,1,1,68,False,13380,37012.00,2
279,4814629,24330,24326.0,96292,othmaint,1,1,40,True,1252,98191.00,3
280,4814856,24326,24327.0,96297,eatout,1,1,74,False,13105,58591.00,2


In [250]:
#pd.crosstab(data.chooser_data.closest_external_zone, data.chooser_data.override_choice, margins=True)

In [251]:
#pd.crosstab(data.chooser_data.industry, data.chooser_data.override_choice, margins=True, normalize='index')

In [252]:
#pd.crosstab(data.chooser_data.industry, data.chooser_data.override_choice, margins=True)

In [253]:
#data.chooser_data['external_worker'] = np.where(data.chooser_data["override_choice"]==1,0,1)

In [254]:
#plt.hist(data.chooser_data['distance_int'],range=(0, data.chooser_data['distance_int'].max()), bins=data.chooser_data['distance_int'].max() + 1)
#plot_df = data.chooser_data.groupby('distance_int')['external_worker'].mean().mul(100).reindex(range(data.chooser_data.distance_int.min()-1,data.chooser_data.distance_int.max()+1), fill_value=0)

#ax = plot_df.plot(kind='bar',rot = 0,title='Share of External Workers by Distance to Closest External Station',ylim=[0, 30], xlabel="Distance (mi)", ylabel="Percent",  figsize=(20, 5))

#plt.show()

In [255]:
#chooser_data_lt1mi= data.chooser_data[data.chooser_data["distance_int"]<=2]
#pd.crosstab(chooser_data_lt1mi.closest_external_zone, chooser_data_lt1mi.override_choice, margins=True)

# Set Coefficients

In [256]:
# Print the utility function that component_model() assembled, so the terms
# and coefficient names can be checked before estimation.
# (A leftover `dir(model)` call was removed: it was not the last expression of
# the cell, so its result was silently discarded.)
print(model.utility_ca)

  P.coef_dist_shopping * X('util_dist*(tour_type=='shopping')')
+ P.coef_size * X('util_size_term*(tour_type=='shopping')')
+ P.coef_mode_logsum * X('util_mode_choice_logsum*(tour_type=='shopping')')
+ P.coef_dist_maint * X('util_dist*(tour_type=='othmaint')')
+ P.coef_size * X('util_size_term*(tour_type=='othmaint')')
+ P.coef_mode_logsum * X('util_mode_choice_logsum*(tour_type=='othmaint')')
+ P.coef_dist_discr * X('util_dist*(tour_type=='othdiscr')')
+ P.coef_size * X('util_size_term*(tour_type=='othdiscr')')
+ P.coef_mode_logsum * X('util_mode_choice_logsum*(tour_type=='othdiscr')')
+ P.coef_dist_eatout * X('util_dist*(tour_type=='eatout')')
+ P.coef_size * X('util_size_term*(tour_type=='eatout')')
+ P.coef_mode_logsum * X('util_mode_choice_logsum*(tour_type=='eatout')')
+ P.coef_dist_social * X('util_dist*(tour_type=='social')')
+ P.coef_size * X('util_size_term*(tour_type=='social')')
+ P.coef_mode_logsum * X('util_mode_choice_logsum*(tour_type=='social')')
+ P.coef_dist_escort *

In [257]:
#capping distance at 10 miles
#model.utility_ca =   (P.coef_dist * X('fmin(util_dist,10)*(is_external_worker==True)')
#+ P.coef_dist_lt_2p5 * X('util_dist_lt_2p5*(is_external_worker==True)')
#+ P.coef_dist_1_2 * X('util_dist_1_2*(is_external_worker==True)')
#+ P.coef_dist_2_5 * X('util_dist_2_5*(is_external_worker==True)')
#+ P.coef_dist_5_15 * X('util_dist_5_15*(is_external_worker==True)')
#+ P.coef_dist_15_up * X('util_dist_15_up*(is_external_worker==True)')
#+ P('-999') * X('util_no_attractions*(is_external_worker==True)')
#+ P.coef_mode_logsum * X('mode_choice_logsum*(is_external_worker==True)'))

# Estimate

With the model set up for estimation, the next step is to estimate the model coefficients.  Make sure to use a sufficiently large household sample and set of zones to avoid an over-specified model, which does not have a numerically stable likelihood-maximizing solution.  Larch has built-in estimation methods, including BHHH, and also offers access to more advanced general-purpose non-linear optimizers in the `scipy` package, including SLSQP, which allows for bounds and constraints on parameters.  BHHH is the default and typically runs faster, but does not follow constraints on parameters.

In [258]:
# Load the model's data ahead of the estimation run in the next cell.
model.load_data()
# Optional diagnostic step, currently disabled:
#model.doctor(repair_ch_av="-")

req_data does not request avail_ca or avail_co but it is set and being provided


In [259]:
# Estimate with SLSQP rather than the default BHHH: as noted in the text
# above, SLSQP honours bounds and constraints on parameters.
# The iteration limit is raised to 1000 via the optimizer options.
model.maximize_loglike(
    method="SLSQP",
    options={"maxiter": 1000},
)


Unnamed: 0,value,initvalue,nullvalue,minimum,maximum,holdfast,note,best
coef_dist_discr,-0.345476,0.0,0.0,-25.0,25.0,0,,-0.345476
coef_dist_eatout,-0.311656,0.0,0.0,-25.0,25.0,0,,-0.311656
coef_dist_escort,-0.356001,0.0,0.0,-25.0,25.0,0,,-0.356001
coef_dist_maint,-0.179535,0.0,0.0,-25.0,25.0,0,,-0.179535
coef_dist_shopping,-0.420035,0.0,0.0,-25.0,25.0,0,,-0.420035
coef_dist_social,-0.273091,0.0,0.0,-25.0,25.0,0,,-0.273091
coef_mode_logsum,0.270403,0.0,0.0,-25.0,25.0,0,,0.270403
coef_size,1.0,1.0,0.0,1.0,1.0,1,,1.0
eatout_external_nonwork,0.0,0.0,0.0,0.0,0.0,1,,0.0
escort_external_nonwork,0.0,0.0,0.0,0.0,0.0,1,,0.0


Unnamed: 0_level_0,0
Unnamed: 0_level_1,0
coef_dist_discr,-0.345476
coef_dist_eatout,-0.311656
coef_dist_escort,-0.356001
coef_dist_maint,-0.179535
coef_dist_shopping,-0.420035
coef_dist_social,-0.273091
coef_mode_logsum,0.270403
coef_size,1.000000
eatout_external_nonwork,0.000000
escort_external_nonwork,0.000000

Unnamed: 0,0
coef_dist_discr,-0.345476
coef_dist_eatout,-0.311656
coef_dist_escort,-0.356001
coef_dist_maint,-0.179535
coef_dist_shopping,-0.420035
coef_dist_social,-0.273091
coef_mode_logsum,0.270403
coef_size,1.0
eatout_external_nonwork,0.0
escort_external_nonwork,0.0

Unnamed: 0,0
coef_dist_discr,-0.000401
coef_dist_eatout,-7.9e-05
coef_dist_escort,0.000196
coef_dist_maint,0.004984
coef_dist_shopping,0.000363
coef_dist_social,0.003931
coef_mode_logsum,-0.005863
coef_size,0.0
eatout_external_nonwork,0.0
escort_external_nonwork,0.0


### Estimated coefficients

In [260]:
# Compute the parameter covariance before writing the report.
# NOTE(review): presumably required so the report can include standard
# errors / t-statistics -- confirm.
model.calculate_parameter_covariance()

# Output folder for the Excel report. The trailing slash matters because the
# file name is appended directly to this string.
# NOTE(review): absolute, machine-specific path.
result_dir = '/projects/SANDAG/2017 On-Call Modeling Services/Area B/TO 05 - ABM3/estimation/'

# Write the estimation report (with data statistics) to an Excel workbook.
model.to_xlsx(f"{result_dir}external_nonmandatory_destination_01.xlsx", data_statistics=True)

  xl = ExcelWriter(filename, engine='xlsxwriter_larch', model=model, **kwargs)


<larch.util.excel.ExcelWriter at 0x1de712eabb0>

# Output Estimation Results

In [261]:
from activitysim.estimation.larch import update_coefficients

# Target file for the parameter summary table, inside the EDB directory.
coef_results_path = data.edb_directory/'estimated/coefficient_results.csv'

# DataFrame.to_csv does not create missing parent folders. The
# update_coefficients() call below is commented out, so make sure the
# "estimated" folder exists here before writing.
os.makedirs(os.path.dirname(coef_results_path), exist_ok=True)

# Export the estimated parameter summary as CSV.
model.parameter_summary().data.to_csv(coef_results_path)

#result_dir = data.edb_directory/"estimated"
#update_coefficients(
#    model, data, result_dir,
#    output_file=f"{modelname}_coefficients_001.csv",
#);

In [262]:
#larch.__version__

In [263]:
#result_dir

### Write the model estimation report, including coefficient t-statistic and log likelihood

# Next Steps

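The final step is to either manually or automatically copy the `*_coefficients_revised.csv` file to the configs folder, rename it to `*_coefficients.csv`, and run ActivitySim in simulation mode. Note that the `update_coefficients` call is commented out in this notebook, so only `estimated/coefficient_results.csv` and the Excel report are written; re-enable that call (or transfer the estimated values by hand) to produce the coefficients file.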

In [264]:
#pd.read_csv(result_dir/f"{modelname}_coefficients_revised.csv")