# Estimating Transponder Ownership Model

This notebook re-estimates the ActivitySim transponder ownership model using Larch.

# Load libraries

In [119]:
import os
import shutil

import larch  # !conda install larch -c conda-forge # for estimation
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from larch import P, X

In [120]:
# Work from the transponder ownership folder of the estimation data bundle so
# the CSV file names used in the following cells can stay relative.
edb_component_dir = '/projects/SANDAG/2017 On-Call Modeling Services/Area B/TO 05 - ABM3/estimation/output/estimation_data_bundle/transponder_ownership'
os.chdir(edb_component_dir)

In [121]:
# Build the estimation input table for the transponder ownership model.
#
# The write cell further down overwrites "transponder_ownership_values_combined.csv",
# so the input is always read from the "_orig" copy. Make sure that copy exists
# *before* reading it: on a fresh data bundle only the un-suffixed file is
# present and the read below would otherwise raise FileNotFoundError.
if not os.path.exists("transponder_ownership_values_combined_orig.csv"):
    shutil.copy2('transponder_ownership_values_combined.csv','transponder_ownership_values_combined_orig.csv')

alts_combined_data = pd.read_csv(
    "transponder_ownership_values_combined_orig.csv",
    dtype={'household_id': np.int64},
    low_memory=False,
)
# Derived columns go on a deep copy so the frame as read stays untouched.
alts_combined_new_data = alts_combined_data.copy(deep=True)

# Recode the choice columns to 0/1 (anything that is not True becomes 0).
alts_combined_new_data['model_choice'] = np.where(alts_combined_data['model_choice'].eq(True), 1, 0)
alts_combined_new_data['override_choice'] = np.where(alts_combined_data['override_choice'].eq(True), 1, 0)

# Distance to the nearest ML enters the utility unchanged.
alts_combined_new_data['util_ml_dist'] = alts_combined_data['ml_dist']

# Household income indicators. No indicator is built for 100k-150k, which
# leaves that band as the omitted category.
alts_combined_new_data['util_hh_income_verylow'] = np.where(alts_combined_data['income'] < 15000, 1, 0)
alts_combined_new_data['util_hh_income_low'] = np.where(
    (alts_combined_data['income'] >= 15000) & (alts_combined_data['income'] < 50000), 1, 0
)
alts_combined_new_data['util_hh_income_mid'] = np.where(
    (alts_combined_data['income'] >= 50000) & (alts_combined_data['income'] < 100000), 1, 0
)
alts_combined_new_data['util_hh_income_veryhigh'] = np.where(alts_combined_data['income'] >= 150000, 1, 0)

# Remaining household indicators: 2+ autos, zero workers, 2016 survey year.
alts_combined_new_data['util_autos_2plus'] = np.where(alts_combined_data['auto_ownership'] > 1, 1, 0)
alts_combined_new_data['util_workers_0'] = np.where(alts_combined_data['num_workers'] == 0, 1, 0)
alts_combined_new_data['util_2016'] = np.where(alts_combined_data['survey_year'] == 2016, 1, 0)

# Keep the first row for each HH_ID.
# NOTE(review): the read above types `household_id` while de-duplication keys
# on `HH_ID` -- confirm both columns exist and that HH_ID is the intended key.
alts_combined_new_data = alts_combined_new_data.drop_duplicates(subset=['HH_ID'])


In [122]:
#write data to file
# Before overwriting the combined values file, make sure a copy of the file as
# it currently stands is kept under the "_orig" name. The copy is only taken
# when no "_orig" file exists yet, so an existing backup is never replaced.
# (shutil is imported in the notebook's import cell; the original cell used it
# without importing it.)
if not os.path.exists("transponder_ownership_values_combined_orig.csv"):
    shutil.copy2('transponder_ownership_values_combined.csv','transponder_ownership_values_combined_orig.csv')
# index=False: the frame's positional index is not written as a column.
alts_combined_new_data.to_csv("transponder_ownership_values_combined.csv", index=False)

# Load data and prep model for estimation

In [123]:
# Switch to the estimation root before building the model.
# NOTE(review): component_model presumably locates the data bundle relative to
# the working directory -- confirm before moving or reordering these lines.
os.chdir('/projects/SANDAG/2017 On-Call Modeling Services/Area B/TO 05 - ABM3/estimation')
modelname = "transponder_ownership"

from activitysim.estimation.larch import component_model
# return_data=True returns the data bundle alongside the model; both are used
# by the cells that follow (`model` for estimation, `data` for review/output).
model, data = component_model(modelname, return_data=True)

# Review data loaded from the EDB

The next step is to read the EDB, including the coefficients, model settings, utilities specification, and chooser and alternative data.

### Coefficients

In [124]:
# Coefficient table from the loaded data bundle: name, starting value, constrain flag.
data.coefficients

Unnamed: 0_level_0,value,constrain
coefficient_name,Unnamed: 1_level_1,Unnamed: 2_level_1
coef_pct_multiple_auto,0,F
coef_Expected_time_savings_up_to_03_min,0,F
coef_Distance_above_two_miles,0,F
coef_Percent_detour_10_to_20,0,F
coef_Transit_accessibility,0,F
coef_Transit_accessibility_is_zero,0,F
coef_Constant,0,F
coef_total_hh_toll_travel_time_savings,0,F
coef_total_hh_toll_dist,0,F
coef_total_hh_toll_cost,0,T


#### Utility specification

In [125]:
# Utility specification: one row per term (label, description, expression) with
# the coefficient applied to each alternative ("No Pass" / "Pass").
data.spec

Unnamed: 0,Label,Description,Expression,No Pass,Pass
0,util_auto_ownership,auto_ownership zero would result in unavailabi...,@(df.auto_ownership==0) * (df.everyone_owns==0),,-999
1,util_ml_dist,Distance to nearest ML,df.ml_dist,,coef_distance_to_ML
2,util_Constant,Constant,1,,coef_Constant
3,util_total_hh_toll_dist,Sum of toll distance for all workers in hh,total_hh_toll_dist,,coef_total_hh_toll_dist
4,util_total_hh_toll_cost,Sum of toll cost for all workers in hh,total_hh_toll_cost,,coef_total_hh_toll_cost
5,util_hh_income_verylow,household income <15k,@(df.income<15000),,coef_income_vlow
6,util_hh_income_low,household income 15k-50k,@((df.income>=15000) & (df.income<50000)),,coef_income_low
7,util_hh_income_mid,household income 50-100k,@((df.income>=50000) & (df.income<100000)),,coef_income_med
8,util_hh_income_veryhigh,household income 150k+,@(df.income>=150000),,coef_income_vhigh
9,util_autos_2plus,Two or more household vehicles,@(df.auto_ownership>1),,coef_autos_2plus


## Explore data

In [126]:
# Chooser table indexed by household_id: choice columns plus the util_* inputs.
data.chooser_data

Unnamed: 0_level_0,model_choice,override_choice,util_auto_ownership,util_pct_multiple_auto,util_Expected_time_savings_up_to_03_min,util_Distance_above_two_miles,util_Percent_detour_10_to_20,util_Constant,util_total_hh_toll_travel_time_savings,util_total_hh_toll_dist,...,total_hh_toll_cost,util_ml_dist,util_hh_income_verylow,util_hh_income_low,util_hh_income_mid,util_hh_income_veryhigh,util_autos_2plus,util_workers_0,util_2016,override_choice_code
household_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0.0,0.0,0.30000,0.0000,0.0,1.0,0.000000,0.0,...,0.0,0.4980,0,0,1,0,1,0,1,1
2,0,0,0.0,1.0,0.16163,5.2306,0.1,1.0,0.000000,0.0,...,0.0,7.2306,0,1,0,0,1,1,1,1
3,0,0,0.0,0.0,0.23092,7.0316,0.0,1.0,0.000000,0.0,...,0.0,9.0316,1,0,0,0,0,1,1,1
4,0,0,0.0,0.0,0.23069,1.6817,0.0,1.0,0.146070,0.0,...,0.0,3.6817,0,1,0,0,1,0,1,1
5,0,0,0.0,0.0,0.22326,3.3084,0.0,1.0,0.000000,0.0,...,0.0,5.3084,0,1,0,0,0,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49757,0,0,0.0,0.0,0.22083,18.5877,0.0,1.0,0.000000,0.0,...,0.0,20.5877,0,1,0,0,0,0,0,1
49758,0,0,0.0,0.0,0.21980,17.6129,0.0,1.0,0.000000,0.0,...,0.0,19.6129,0,1,0,0,0,0,0,1
49759,0,0,0.0,0.0,0.22397,10.2910,0.0,1.0,0.647308,0.0,...,0.0,12.2910,0,1,0,0,0,0,0,1
49760,0,0,0.0,0.0,0.22072,17.4927,0.0,1.0,0.000000,0.0,...,0.0,19.4927,0,0,1,0,0,0,0,1


In [127]:
# Observed choice counts by HHT, with row/column totals ("All").
pd.crosstab(
    index=data.chooser_data["HHT"],
    columns=data.chooser_data["override_choice"],
    margins=True,
)

override_choice,0,1,All
HHT,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,3614,742,4356
2,267,36,303
3,532,60,592
4,1465,166,1631
6,1899,199,2098
All,7777,1203,8980


In [128]:
#pd.crosstab(data.chooser_data.num_workers, data.chooser_data.override_choice, margins=True)

In [129]:
#plt.hist(data.chooser_data['distance_int'],range=(0, data.chooser_data['distance_int'].max()), bins=data.chooser_data['distance_int'].max() + 1)
#plot_df = data.chooser_data.groupby('distance_int')['external_worker'].mean().mul(100).reindex(range(data.chooser_data.distance_int.min()-1,data.chooser_data.distance_int.max()+1), fill_value=0)

#ax = plot_df.plot(kind='bar',rot = 0,title='Share of External Workers by Distance to Closest External Station',ylim=[0, 30], xlabel="Distance (mi)", ylabel="Percent",  figsize=(20, 5))

#plt.show()

# Set Coefficients

In [130]:
# Print the per-alternative utility functions currently set on the model.
# (The bare `dir(model)` that used to precede this line was dropped: its result
# was discarded, so it had no effect on the cell's output.)
print(model.utility_co)

DictOfLinearFunction_C({1: <Empty LinearFunction_C>, 2:   P('-999') * X.util_auto_ownership
+ P.coef_distance_to_ML * X.util_ml_dist
+ P.coef_Constant * X.util_Constant
+ P.coef_total_hh_toll_dist * X.util_total_hh_toll_dist
+ P.coef_total_hh_toll_cost * X.util_total_hh_toll_cost
+ P.coef_income_vlow * X.util_hh_income_verylow
+ P.coef_income_low * X.util_hh_income_low
+ P.coef_income_med * X.util_hh_income_mid
+ P.coef_income_vhigh * X.util_hh_income_veryhigh
+ P.coef_autos_2plus * X.util_autos_2plus
+ P.coef_workers_0 * X.util_workers_0
+ P.coef_2016 * X.util_2016})


In [131]:
#model.utility_co = {0: P.coef_dist_to_nearest_ext_station * X.util_dist_to_nearest_ext_station
#+ P.coef_size_of_nearest_ext_station * X.util_size_of_nearest_ext_station
#+ P.coef_part_time * X.parttime
#+ P.coef_agriculture * X.agriculture
#+ P.coef_business_srv * X.business_srv
#+ P.coef_construction * X.construction
#+ P.coef_education * X.education
#+ P.coef_entertainment * X.entertainment
#+ P.coef_food_srv * X.food_srv                   
#+ P.coef_government * X.government
#+ P.coef_healthcare * X.healthcare                   
#+ P.coef_manufacturing * X.manufacturing
#+ P.coef_mgmt_srv * X.mgmt_srv
#+ P.coef_military * X.military
#+ P.coef_retail * X.retail    
#+ P.coef_inc_lt15 * X.income_less15K
#+ P.coef_inc_15_25 * X.income_15_25 
#+ P.coef_inc_25_50 * X.income_25_50 
#+ P.coef_inc_100_150 * X.income_100_150 
#+ P.coef_inc_150_250 * X.income_150_250 
#+ P.coef_inc_250plus * X.income_250plus
#+ P.asc_external_2016 * X.year_2016         
#+ P.coef_dist_lt_2p5 * X.distance_lt_2p5                    
#+ P.asc_external_worker * X.util_asc_placeholder, 1: 0}

# Estimate

With the model set up for estimation, the next step is to estimate the model coefficients.  Make sure to use a sufficiently large household sample and set of zones to avoid an over-specified model, which does not have a numerically stable likelihood-maximizing solution.  Larch has built-in estimation methods including BHHH, and also offers access to more advanced general-purpose non-linear optimizers in the `scipy` package, including SLSQP, which allows for bounds and constraints on parameters.  BHHH is the default and typically runs faster, but does not follow constraints on parameters.

In [132]:
# Load the data into the model ahead of the maximize_loglike call below.
model.load_data()
# Left disabled: larch's doctor pass with repair_ch_av (presumably a repair of
# chosen-but-unavailable cases -- confirm before enabling).
#model.doctor(repair_ch_av="-")

req_data does not request avail_ca or avail_co but it is set and being provided
converting data_co to <class 'numpy.float64'>


In [133]:
# Estimate with SLSQP instead of the default BHHH: per the notes above, SLSQP
# allows bounds and constraints on parameters, which BHHH does not follow.
# maxiter caps the optimizer at 1000 iterations.
model.maximize_loglike(method="SLSQP", options={"maxiter": 1000})


Unnamed: 0,value,initvalue,nullvalue,minimum,maximum,holdfast,note,best
-999,-999.0,-999.0,-999.0,-999.0,-999.0,1,,-999.0
coef_2016,-0.023949,0.0,0.0,,,0,,-0.023949
coef_Constant,-1.620224,0.0,0.0,,,0,,-1.620224
coef_autos_2plus,0.252353,0.0,0.0,,,0,,0.252353
coef_distance_to_ML,0.033172,0.0,0.0,,,0,,0.033172
coef_income_low,-1.066171,0.0,0.0,,,0,,-1.066171
coef_income_med,-0.312499,0.0,0.0,,,0,,-0.312499
coef_income_vhigh,0.513927,0.0,0.0,,,0,,0.513927
coef_income_vlow,-1.642832,0.0,0.0,,,0,,-1.642832
coef_total_hh_toll_cost,0.0,0.0,0.0,0.0,0.0,1,,0.0


Unnamed: 0_level_0,0
Unnamed: 0_level_1,0
-999,-999.000000
coef_2016,-0.023949
coef_Constant,-1.620224
coef_autos_2plus,0.252353
coef_distance_to_ML,0.033172
coef_income_low,-1.066171
coef_income_med,-0.312499
coef_income_vhigh,0.513927
coef_income_vlow,-1.642832
coef_total_hh_toll_cost,0.000000

Unnamed: 0,0
-999,-999.0
coef_2016,-0.023949
coef_Constant,-1.620224
coef_autos_2plus,0.252353
coef_distance_to_ML,0.033172
coef_income_low,-1.066171
coef_income_med,-0.312499
coef_income_vhigh,0.513927
coef_income_vlow,-1.642832
coef_total_hh_toll_cost,0.0

Unnamed: 0,0
-999,0.0
coef_2016,0.001353
coef_Constant,0.000268
coef_autos_2plus,0.000755
coef_distance_to_ML,-0.006753
coef_income_low,-0.001936
coef_income_med,0.002301
coef_income_vhigh,-0.000431
coef_income_vlow,0.000637
coef_total_hh_toll_cost,0.0


### Estimated coefficients

In [134]:
# Compute the parameter covariance, then export the estimation workbook
# (including data statistics) to the estimation root folder.
model.calculate_parameter_covariance()
result_dir='/projects/SANDAG/2017 On-Call Modeling Services/Area B/TO 05 - ABM3/estimation/'
# result_dir ends with "/", so plain concatenation yields the full file path.
report_path = result_dir+"transponder_ownership_003.xlsx"
model.to_xlsx(report_path, data_statistics=True)

  xl = ExcelWriter(filename, engine='xlsxwriter_larch', model=model, **kwargs)


<larch.util.excel.ExcelWriter at 0x2052155ac40>

# Output Estimation Results

In [135]:
from activitysim.estimation.larch import update_coefficients
# Write the estimated coefficients into the "estimated" sub-folder of the data
# bundle directory. Note that result_dir is rebound here from the string used
# in the report cell to a path object built from data.edb_directory.
result_dir = data.edb_directory/"estimated"
# Output file name carries the same "_003" run suffix as the xlsx report.
# The trailing ";" suppresses the cell's return-value display.
update_coefficients(
    model, data, result_dir,
    output_file=f"{modelname}_coefficients_003.csv",
);

In [136]:
#larch.__version__

In [137]:
#result_dir

### Write the model estimation report, including coefficient t-statistic and log likelihood

# Next Steps

The final step is to either manually or automatically copy the estimated coefficients file (written above as `transponder_ownership_coefficients_003.csv` in the EDB's `estimated` folder) to the configs folder, rename it to `*_coefficients.csv`, and run ActivitySim in simulation mode.

In [138]:
#pd.read_csv(result_dir/f"{modelname}_coefficients_revised.csv")