# Estimating External Worker Location Model

This notebook re-estimates the ActivitySim external worker location model in Larch. The model predicts which external station an external worker travels to on an external tour.

# Load libraries

In [1]:
import os
import larch  # !conda install larch -c conda-forge # for estimation
import pandas as pd
import numpy as np
from larch import P, X
import matplotlib.pyplot as plt
import shutil

Change to the directory that holds the estimation data bundle (EDB) for external worker location choice.

In [2]:
# Work from inside the external workplace location EDB folder so that the
# relative CSV paths used in the following cells resolve against it.
edb_dir = '/projects/SANDAG/2017 On-Call Modeling Services/Area B/TO 05 - ABM3/estimation/output/estimation_data_bundle/external_workplace_location'
os.chdir(edb_dir)

# Drop duplicate person records

In [3]:
# The "_orig" file is a one-time backup of the alternatives file from the EDB.
# The de-duplicated alternatives are later written over the working file, so
# the untouched backup is always the input to this step.
alts_file = "external_workplace_location_alternatives_combined.csv"
alts_orig_file = "external_workplace_location_alternatives_combined_orig.csv"

# Create the backup *before* reading it; otherwise the first run on a fresh
# EDB fails with FileNotFoundError because the backup does not exist yet.
if not os.path.exists(alts_orig_file):
    shutil.copy2(alts_file, alts_orig_file)

alts_combined_data = pd.read_csv(alts_orig_file)
household_data = pd.read_csv("../override_households.csv")
person_data = pd.read_csv("../override_persons.csv")
alts_combined_data.shape

(7070, 11)

In [4]:
# (rows, columns) of the household override file read from ../override_households.csv
household_data.shape

(49762, 26)

In [5]:
# Attach household_id and PNUM from the person file to every alternatives row.
# A left join keeps all alternatives rows even if a person_id has no match.
person_keys = person_data[['person_id', 'household_id', 'PNUM']]
alts_combined_per_data = alts_combined_data.merge(person_keys, how='left', on="person_id")
alts_combined_per_data.shape

(7070, 13)

In [6]:
# Attach HH_ID from the household file to the alternatives rows
# (default inner join on household_id).
household_keys = household_data[['household_id', 'HH_ID']]
alts_combined_per_hh_data = alts_combined_per_data.merge(household_keys, on="household_id")
alts_combined_per_hh_data.shape

(7070, 14)

In [7]:
# Keep the first row for each (HH_ID, PNUM, variable) combination, then remove
# the helper columns that were merged in only to identify duplicates.
alts_dedup_keys = ['HH_ID', 'PNUM', 'variable']
alts_helper_cols = ['household_id', 'PNUM', 'HH_ID']
alts_combined_new_data = (
    alts_combined_per_hh_data
    .drop_duplicates(subset=alts_dedup_keys)
    .drop(columns=alts_helper_cols)
)
alts_combined_new_data.shape

(1470, 11)

In [8]:
# Back up the EDB alternatives file once (only if no backup exists yet), then
# overwrite the working file with the de-duplicated alternatives.
alts_backup_exists = os.path.exists("external_workplace_location_alternatives_combined_orig.csv")
if not alts_backup_exists:
    shutil.copy2(
        'external_workplace_location_alternatives_combined.csv',
        'external_workplace_location_alternatives_combined_orig.csv',
    )
alts_combined_new_data.to_csv("external_workplace_location_alternatives_combined.csv", index=False)

In [9]:
# The "_orig" file is a one-time backup of the choosers file from the EDB.
# The de-duplicated choosers are later written over the working file, so the
# untouched backup is always the input to this step.
choosers_file = "external_workplace_location_choosers_combined.csv"
choosers_orig_file = "external_workplace_location_choosers_combined_orig.csv"

# Create the backup *before* reading it; otherwise the first run on a fresh
# EDB fails with FileNotFoundError because the backup does not exist yet.
if not os.path.exists(choosers_orig_file):
    shutil.copy2(choosers_file, choosers_orig_file)

choosers_combined_data = pd.read_csv(choosers_orig_file)
choosers_combined_data.shape

(707, 5)

In [10]:
# Attach PNUM from the person file to every chooser row (left join keeps all choosers).
choosers_combined_per_data = choosers_combined_data.merge(
    person_data[['person_id', 'PNUM']],
    how='left',
    on="person_id",
)
choosers_combined_per_data.shape

(707, 6)

In [11]:
# Attach HH_ID from the household file to the chooser rows
# (default inner join on household_id).
chooser_household_keys = household_data[['household_id', 'HH_ID']]
choosers_combined_per_hh_data = choosers_combined_per_data.merge(
    chooser_household_keys,
    on="household_id",
)
choosers_combined_per_hh_data.shape

(707, 7)

In [12]:
# Keep the first row for each (HH_ID, PNUM) pair. Unlike the alternatives
# table, the merged helper columns (PNUM, HH_ID) are kept on the choosers.
chooser_dedup_keys = ['HH_ID', 'PNUM']
choosers_combined_new_data = choosers_combined_per_hh_data.drop_duplicates(subset=chooser_dedup_keys)
choosers_combined_new_data.shape

(147, 7)

In [13]:
# recode workers who chose station 24333 (TAZ 11) to 24327 (TAZ 12); coded incorrectly
# Every remaining chooser is also flagged as an external worker.
#
# .assign() returns a new DataFrame instead of setting columns on the frame
# returned by drop_duplicates(), which avoids pandas' SettingWithCopyWarning
# and keeps this cell safe to re-run.
choosers_combined_new_data = choosers_combined_new_data.assign(
    override_choice=np.where(
        choosers_combined_new_data['override_choice'] == 24333,
        24327,
        choosers_combined_new_data['override_choice'],
    ),
    is_external_worker=True,
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  choosers_combined_new_data['override_choice'] = np.where(choosers_combined_new_data['override_choice']==24333, 24327, choosers_combined_new_data['override_choice'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  choosers_combined_new_data['is_external_worker']=True


In [14]:
# Back up the EDB choosers file once (only if no backup exists yet), then
# overwrite the working file with the de-duplicated, recoded choosers.
choosers_backup_exists = os.path.exists("external_workplace_location_choosers_combined_orig.csv")
if not choosers_backup_exists:
    shutil.copy2(
        'external_workplace_location_choosers_combined.csv',
        'external_workplace_location_choosers_combined_orig.csv',
    )
choosers_combined_new_data.to_csv("external_workplace_location_choosers_combined.csv", index=False)

In [15]:
# Move up to the estimation root before building the model.
# NOTE(review): presumably component_model locates the EDB relative to the
# current working directory - confirm before changing this path.
estimation_dir = '/projects/SANDAG/2017 On-Call Modeling Services/Area B/TO 05 - ABM3/estimation'
os.chdir(estimation_dir)
modelname = "external_workplace_location"

from activitysim.estimation.larch import component_model

# return_data=True yields both the Larch model and the EDB data object.
model_and_data = component_model(modelname, return_data=True)
model, data = model_and_data

# Review data loaded from the EDB

The next step is to read the EDB, including the coefficients, model settings, utilities specification, and chooser and alternative data.

### Coefficients

In [16]:
# Coefficients table loaded from the EDB (name, starting value, constrain flag)
data.coefficients

Unnamed: 0_level_0,value,constrain
coefficient_name,Unnamed: 1_level_1,Unnamed: 2_level_1
coef_mode_logsum,0,F
coef_dist_capped,0,F
coef_size,1,T


#### Utility specification

In [17]:
# Utility specification loaded from the EDB (label, expression, coefficient)
data.spec

Unnamed: 0,Label,Description,Expression,external_workplace
0,util_dist,,"@np.minimum(_DIST,10)",coef_dist_capped
1,util_no_attractions,No attractions,@df['size_term']==0,-999
2,mode_choice_logsum,Mode choice logsum,@df.mode_choice_logsum if 'mode_choice_logsum'...,coef_mode_logsum


## Explore data

In [18]:
# Chooser table as loaded from the EDB, including the override_choice and
# is_external_worker columns written by the de-duplication step above
data.chooser_data

Unnamed: 0,person_id,model_choice,override_choice,home_zone_id,household_id,PNUM,HH_ID,is_external_worker
0,38,24327,24327.0,24303,23,2,161004975,True
1,108,24326,24327.0,16387,55,2,161008944,True
2,764,24326,24323.0,5391,387,2,161067863,True
3,962,24326,24327.0,6641,482,1,161080976,True
4,1536,24327,24327.0,15411,786,2,161135536,True
...,...,...,...,...,...,...,...,...
142,95534,24330,24327.0,16268,49046,1,22068544,True
143,95747,24326,24327.0,15110,49151,1,22078307,True
144,96195,24328,24323.0,4413,49395,3,22098638,True
145,96371,24326,24327.0,312,49494,2,22106128,True


In [19]:
#pd.crosstab(data.chooser_data.closest_external_zone, data.chooser_data.override_choice, margins=True)

In [20]:
#pd.crosstab(data.chooser_data.industry, data.chooser_data.override_choice, margins=True, normalize='index')

In [21]:
#pd.crosstab(data.chooser_data.industry, data.chooser_data.override_choice, margins=True)

In [22]:
#data.chooser_data['external_worker'] = np.where(data.chooser_data["override_choice"]==1,0,1)

In [23]:
#plt.hist(data.chooser_data['distance_int'],range=(0, data.chooser_data['distance_int'].max()), bins=data.chooser_data['distance_int'].max() + 1)
#plot_df = data.chooser_data.groupby('distance_int')['external_worker'].mean().mul(100).reindex(range(data.chooser_data.distance_int.min()-1,data.chooser_data.distance_int.max()+1), fill_value=0)

#ax = plot_df.plot(kind='bar',rot = 0,title='Share of External Workers by Distance to Closest External Station',ylim=[0, 30], xlabel="Distance (mi)", ylabel="Percent",  figsize=(20, 5))

#plt.show()

In [24]:
#chooser_data_lt1mi= data.chooser_data[data.chooser_data["distance_int"]<=2]
#pd.crosstab(chooser_data_lt1mi.closest_external_zone, chooser_data_lt1mi.override_choice, margins=True)

# Set Coefficients

In [25]:
# Show the alternative-specific (utility_ca) terms of the model as loaded from the EDB.
# The earlier bare dir(model) call was removed: its result was discarded, so it
# was a leftover debugging statement with no effect on the output.
print(model.utility_ca)

  P.coef_dist_capped * X('util_dist*(is_external_worker==True)')
+ P('-999') * X('util_no_attractions*(is_external_worker==True)')
+ P.coef_mode_logsum * X('mode_choice_logsum*(is_external_worker==True)')


In [26]:
#capping distance at 10 miles
#model.utility_ca =   (P.coef_dist * X('fmin(util_dist,10)*(is_external_worker==True)')
#+ P.coef_dist_lt_2p5 * X('util_dist_lt_2p5*(is_external_worker==True)')
#+ P.coef_dist_1_2 * X('util_dist_1_2*(is_external_worker==True)')
#+ P.coef_dist_2_5 * X('util_dist_2_5*(is_external_worker==True)')
#+ P.coef_dist_5_15 * X('util_dist_5_15*(is_external_worker==True)')
#+ P.coef_dist_15_up * X('util_dist_15_up*(is_external_worker==True)')
#+ P('-999') * X('util_no_attractions*(is_external_worker==True)')
#+ P.coef_mode_logsum * X('mode_choice_logsum*(is_external_worker==True)'))

# Estimate

With the model set up for estimation, the next step is to estimate the model coefficients.  Make sure to use a sufficiently large household sample and set of zones to avoid an over-specified model, which does not have a numerically stable likelihood-maximizing solution.  Larch has built-in estimation methods including BHHH, and also offers access to more advanced general-purpose non-linear optimizers in the `scipy` package, including SLSQP, which allows for bounds and constraints on parameters.  BHHH is the default and typically runs faster, but does not follow constraints on parameters.  This notebook uses SLSQP.

In [27]:
# Load data into the Larch model ahead of estimation.
model.load_data()
# NOTE(review): the doctor() call below is left disabled; presumably it checks or
# repairs choice/availability inconsistencies - confirm before re-enabling.
#model.doctor(repair_ch_av="-")

req_data does not request avail_ca or avail_co but it is set and being provided


In [28]:
# Estimate with SLSQP (the scipy optimizer that allows bounds and constraints
# on parameters), with the optimizer's "maxiter" option set to 1000.
model.maximize_loglike(method="SLSQP", options={"maxiter": 1000})


Unnamed: 0,value,initvalue,nullvalue,minimum,maximum,holdfast,note,best
-999,-999.0,-999.0,-999.0,-999.0,-999.0,1,,-999.0
coef_dist_capped,-0.519016,0.0,0.0,-25.0,25.0,0,,-0.519016
coef_mode_logsum,0.064105,0.0,0.0,-25.0,25.0,0,,0.064105
external_workplace_external_work,0.0,0.0,0.0,0.0,0.0,1,,0.0


Unnamed: 0_level_0,0
Unnamed: 0_level_1,0
-999,-999.000000
coef_dist_capped,-0.519016
coef_mode_logsum,0.064105
external_workplace_external_work,0.000000
-999,0.000000
coef_dist_capped,0.001745
coef_mode_logsum,-0.000942
external_workplace_external_work,0.000000
key,value
x,0  -999  -999.000000  coef_dist_capped  -0.519016  coef_mode_logsum  0.064105  external_workplace_external_work  0.000000

Unnamed: 0,0
-999,-999.0
coef_dist_capped,-0.519016
coef_mode_logsum,0.064105
external_workplace_external_work,0.0

Unnamed: 0,0
-999,0.0
coef_dist_capped,0.001745
coef_mode_logsum,-0.000942
external_workplace_external_work,0.0


### Estimated coefficients

In [29]:
# Compute the parameter covariance first, then write the estimation report
# (with data statistics) to an Excel workbook.
model.calculate_parameter_covariance()

result_dir='/projects/SANDAG/2017 On-Call Modeling Services/Area B/TO 05 - ABM3/estimation/'
report_path = result_dir + "ext_worker_location_004.xlsx"
model.to_xlsx(report_path, data_statistics=True)

  xl = ExcelWriter(filename, engine='xlsxwriter_larch', model=model, **kwargs)


<larch.util.excel.ExcelWriter at 0x22631bcaf40>

# Output Estimation Results

In [30]:
from activitysim.estimation.larch import update_coefficients

# Write the estimated coefficients to the "estimated" folder inside the EDB
# directory; the trailing semicolon suppresses the cell's output.
coefficients_file = f"{modelname}_coefficients_004.csv"
result_dir = data.edb_directory/"estimated"
update_coefficients(model, data, result_dir, output_file=coefficients_file);

In [31]:
#larch.__version__

In [32]:
#result_dir

### Write the model estimation report, including coefficient t-statistic and log likelihood

# Next Steps

The final step is to either manually or automatically copy the estimated coefficients file (written above as `external_workplace_location_coefficients_004.csv` in the EDB `estimated` folder) to the configs folder, rename it to `*_coefficients.csv`, and run ActivitySim in simulation mode.

In [33]:
#pd.read_csv(result_dir/f"{modelname}_coefficients_revised.csv")