### SANDAG Commercial Vehicle Model
#### Task 3 -- Data Exploration for Design Decisions

#### Establishment TNC Use

In [40]:
import numpy as np
import pandas as pd
import os
# from matplotlib import pyplot as plt
# from matplotlib import ticker

In [41]:
# Show every row when a DataFrame is displayed (None disables the row limit).
pd.set_option("display.max_rows", None)

In [42]:
# Import file paths from script
# NOTE(review): the wildcard import hides which names FilePaths provides; this notebook
# relies on at least root_dir, raw_data_dir, proc_data_dir and lookup_dir.
from FilePaths import *
# Echo the configured directories so the saved output records which locations were used.
print("root_dir = \n", root_dir, "\n")
print("raw_data_dir = \n", raw_data_dir,"\n")
print("proc_data_dir = \n", proc_data_dir,"\n")
print("lookup_dir = \n", lookup_dir,"\n")

root_dir = 
 C:\Users\jgliebe\OneDrive - Cambridge Systematics\Documents - PROJ SANDAG Commercial Vehicle & Heavy Truck Model Update\_Shared_CSTeam 

raw_data_dir = 
 Task03_DataID_Review 

proc_data_dir = 
 Task04_DataProcessing\Data\proc data 

lookup_dir = 
 Task04_DataProcessing\Data\Lookups 



#### Read in establishment file and create expansion weights

In [43]:
# Read input data -- establishment file
# Loads the "Establishment Data" sheet of the CVS workbook into df_estab.
df_estab = pd.read_excel(
    os.path.join(root_dir,
                 raw_data_dir,
                 r"CVS\SANDAG 2022 CV DataBase & Dictionaires_03_03_2023.xlsx"),
    sheet_name=r"Establishment Data")

# company_id intentionally stays a regular column: DataFrame.set_index returns a new
# frame, so calling it without assigning the result never changed df_estab.
df_estab.head()

Unnamed: 0,company_id,"Industry Group_Size Code (Group from Column D; Size 1=0-9 emp, 2=10+)",company_name,base_location_Industry Group,company_location_address,company_location_city,company_location_state,company_location_zipcode,company_location_latitude,company_location_longitude,...,LCV Owned or Leased,LCV other,LCV Total,SUT Owned or Leased,SUT other,SUT Total,MUT Owned or Leased,MUT other,MUT Total,TOTAL CVs
0,100002,1_2,ALTAR,1,4370 LA JOLLA VILLAGE DR # 655,SAN DIEGO,CA,92122,32.873754,-117.210198,...,0,0,0,0,0,0,0,0,0,0
1,100003,1_1,John Baker Property Mgmt,1,405 W 9TH AVE,ESCONDIDO,CA,92025,33.112208,-117.080743,...,0,1,1,0,0,0,0,0,0,1
2,100004,1_1,OWB RANCHES LLC,1,512 VIA DE LA VALLE # 310,SOLANA BEACH,CA,92075,32.98037,-117.26082,...,0,0,0,0,0,0,0,0,0,0
3,100005,1_2,"United Sportfishers of San Diego, Inc.",1,2803 Emerson St,San Diego,CA,92106,32.723522,-117.227602,...,0,0,0,0,1,1,1,0,1,2
4,100007,1_1,SCRIPPS COASTAL RESERVE,1,9500 GILMAN DR,LA JOLLA,CA,92093,32.877189,-117.237422,...,0,1,1,0,0,0,0,0,0,1


In [44]:
# Total employment per establishment = full-time count + part-time count.
df_estab['emp_total'] = df_estab['employees_fulltime_count'].add(df_estab['employees_parttime_count'])

#### Read in file from CVS survey weighting report -- Most likely number of establishments by Industry

In [45]:
# Establishment counts by sector and size class, read from the expansion workbook.
df_expand = pd.read_excel(
    os.path.join(root_dir,
                 raw_data_dir,
                 r"CVS\CVS_EstabExpansion.xlsx"),
    sheet_name=r"CVS_Expansion_Totals")
df_expand.info()

# Give the row without a SectorID the key 0. The result is assigned back to the frame:
# an in-place fillna on an attribute-accessed column works on an intermediate object
# and does not update df_expand when pandas Copy-on-Write is active.
# NOTE(review): the row keyed 0 presumably holds the all-sector totals -- confirm.
df_expand['SectorID'] = df_expand['SectorID'].fillna(0)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12 entries, 0 to 11
Data columns (total 5 columns):
SectorID    11 non-null float64
Sector      12 non-null object
Emp_0_9     12 non-null int64
Emp_10p     12 non-null int64
All         12 non-null int64
dtypes: float64(1), int64(3), object(1)
memory usage: 608.0+ bytes


In [46]:
# Display the expansion-target table for inspection.
df_expand

Unnamed: 0,SectorID,Sector,Emp_0_9,Emp_10p,All
0,1.0,Agriculture/Mining,334,88,422
1,2.0,Manufacturing,2723,1664,4387
2,3.0,Industrial/Utilties,115,77,192
3,4.0,Retail,7124,3272,10396
4,5.0,Wholesale,3355,990,4345
5,6.0,Construction,6162,1586,7748
6,7.0,Transportation,1371,496,1867
7,8.0,Info FlRE/Professional services,24539,4310,28849
8,9.0,Education/Public/Other services,11416,3233,14649
9,10.0,Medical/Health Services,19634,2404,22038


In [47]:
# Create dictionary for establishment expansion targets
# Outer key: size-class column name; inner dict: SectorID -> target number of establishments.
expandEstab = {
    size_col: dict(zip(df_expand['SectorID'], df_expand[size_col]))
    for size_col in ('Emp_0_9', 'Emp_10p')
}
expandEstab

{'Emp_0_9': {1.0: 334,
  2.0: 2723,
  3.0: 115,
  4.0: 7124,
  5.0: 3355,
  6.0: 6162,
  7.0: 1371,
  8.0: 24539,
  9.0: 11416,
  10.0: 19634,
  11.0: 4742,
  0.0: 81515},
 'Emp_10p': {1.0: 88,
  2.0: 1664,
  3.0: 77,
  4.0: 3272,
  5.0: 990,
  6.0: 1586,
  7.0: 496,
  8.0: 4310,
  9.0: 3233,
  10.0: 2404,
  11.0: 4492,
  0.0: 22612}}

In [48]:
# Add establishment population-level estimates (targets)
# Size classes: 9 or fewer employees vs. more than 9.
small_estab = df_estab['emp_total'] <= 9
large_estab = df_estab['emp_total'] > 9
industry_group = df_estab['base_location_Industry Group']

# Flag small establishments (1) vs. larger ones (0).
df_estab.loc[small_estab, 'emp_lt10'] = 1
df_estab.loc[large_estab, 'emp_lt10'] = 0
df_estab['emp_lt10'] = df_estab['emp_lt10'].astype(int)

# Attach the regional target count that matches each establishment's industry and size class.
df_estab.loc[small_estab, 'region_estab_wght'] = industry_group.map(expandEstab['Emp_0_9'])
df_estab.loc[large_estab, 'region_estab_wght'] = industry_group.map(expandEstab['Emp_10p'])

# The cast fails if any establishment was left without a target (NaN).
df_estab['region_estab_wght'] = df_estab['region_estab_wght'].astype('int64')

In [49]:
# Calculate Establishment Weights
# Expansion weight = regional target count / number of sampled establishments that carry
# that target. The lookup is keyed by the target count itself, so it is only valid when
# every industry/size cell has a distinct target; otherwise two cells would be pooled
# and both would receive a wrong weight. Fail loudly instead of computing it silently.
cell_targets = df_estab[['base_location_Industry Group', 'emp_lt10', 'region_estab_wght']].drop_duplicates()
if cell_targets['region_estab_wght'].duplicated().any():
    raise ValueError("Two industry/size cells share the same regional establishment target; "
                     "weights keyed by target value would pool their samples.")

estab_weights = df_estab.groupby(['region_estab_wght']).size().reset_index(name='num_estabs')
estab_weights['estab_exp_weight'] = estab_weights['region_estab_wght'] / estab_weights['num_estabs']
# Final form: {regional target count -> expansion weight per sampled establishment}
estab_weights = dict(zip(estab_weights['region_estab_wght'],estab_weights['estab_exp_weight']))

In [50]:
# Attach each establishment's expansion weight via its regional target count.
df_estab.loc[:,'estab_exp_weight'] = df_estab['region_estab_wght'].map(estab_weights)

# Capture the preview before the helper column is dropped. A bare expression in the
# middle of a cell is evaluated and discarded, so the preview was never shown.
estab_weight_preview = df_estab[['emp_total','base_location_Industry Group','emp_lt10','region_estab_wght','estab_exp_weight']].head(10)

# The target count was only needed as the lookup key.
df_estab = df_estab.drop(['region_estab_wght'], axis=1)
estab_weight_preview

In [51]:
# Finalize Establishment Weights in Table
# Mean weight per industry group and size flag, rounded to 3 decimals.
estab_wts = (
    df_estab
    .groupby(['base_location_Industry Group', 'emp_lt10'])['estab_exp_weight']
    .mean()
    .reset_index()
    .round(3)
)
print(estab_wts.to_string(index=False))

 base_location_Industry Group  emp_lt10  estab_exp_weight
                            1         0             2.514
                            1         1             5.475
                            2         0            13.206
                            2         1            22.320
                            3         0             2.962
                            3         1             7.188
                            4         0            25.562
                            4         1            48.463
                            5         0            10.879
                            5         1            24.312
                            6         0            14.685
                            6         1            42.205
                            7         0             9.725
                            7         1            16.926
                            8         0            34.480
                            8         1            81.255
              

#### Find TNC Usage Rates by Establishment Industries

In [52]:
# Industry group code (values of 'base_location_Industry Group') -> descriptive label
# used in the report tables.
indus_lookup = {
    1: "Agriculture/Mining",
    2: "Manufacturing",
    3: "Industrial/Utilities",
    4: "Retail",
    5: "Wholesale",
    6: "Construction",
    7: "Transportation",
    8: "Info/Finance/Insurance/Real Estate/Professional services",
    9: "Education/Public/Other services",
    10: "Medical/Health Services",
    11: "Leisure/Accommodations and Food",
    96: "Other/Non-Classified"
}


In [53]:
# Create short names for industries
# Industry group code -> three-letter abbreviation; entries are listed alphabetically by
# the long industry name rather than by code.
ind_abrv = {
    1: "AGM",
    6: "CON",
    9: "EPO",
    3: "IUT",
    8: "IFR",
    11: "LAF",
    2: "MFG",
    10: "MHS",
    4: "RET",
    7: "TRN",
    5: "WHL",
    96: "ONC"
}

In [54]:
# Unweighted deliveries reported by establishments that use TNCs, by industry group.
# The column is selected before aggregating so only it is summed; summing the whole
# frame first also aggregates every text column and can fail on non-numeric data.
delivTNC = (
    df_estab[df_estab['is_use_tnc']==1]
    .groupby(['base_location_Industry Group'])['no_of_from_deliveries_pcsuvpu']
    .sum()
    .reset_index(name='delivTNC')
)
delivTNC

Unnamed: 0,base_location_Industry Group,delivTNC
0,1,1.0
1,2,7.0
2,3,0.0
3,4,133.0
4,5,2.0
5,6,4.0
6,7,3.0
7,8,2.0
8,9,6.0
9,10,1.0


In [55]:
# TNC Usage Rates (raw)
# Unweighted counts by industry group: establishments using TNCs, their reported
# deliveries, and all surveyed establishments.
tnc_users = df_estab[df_estab['is_use_tnc']==1]
useTNC = tnc_users.groupby(['base_location_Industry Group']).size().reset_index(name='useTNC')
# Select the delivery column before summing so text columns are never aggregated.
delivTNC = (tnc_users.groupby(['base_location_Industry Group'])['no_of_from_deliveries_pcsuvpu']
            .sum().reset_index(name='delivTNC').astype('int64'))

allEstab = df_estab.groupby(['base_location_Industry Group']).size().reset_index(name='totEstab')
allEstab["IndustryGroup"] = allEstab['base_location_Industry Group'].map(indus_lookup)
# Left joins on the industry code: industries without any TNC user get NaN counts.
TNC_Industry = allEstab.set_index('base_location_Industry Group').join(useTNC.set_index('base_location_Industry Group'))
TNC_Industry = TNC_Industry.join(delivTNC.set_index('base_location_Industry Group'))

# Reorder columns (copy so the rate columns are added to an independent frame, not a slice)
TNC_Industry = TNC_Industry[['IndustryGroup', 'useTNC', 'delivTNC', 'totEstab']].copy()

# Calculate rates; each one is formatted to text right away for printing.
TNC_Industry['pctUseTNC'] = TNC_Industry['useTNC'] / TNC_Industry['totEstab']
TNC_Industry['pctUseTNC'] = TNC_Industry['pctUseTNC'].map('{:.2%}'.format)
TNC_Industry['rateDelTNC'] = TNC_Industry['delivTNC'] / TNC_Industry['totEstab']
TNC_Industry['rateDelTNC'] = TNC_Industry['rateDelTNC'].map('{:5.3f}'.format)

print("Unweighted Establishment Observations")
print("\nNote: 'rateDelTNC' = delivery trips per day per establishment in industry\n")
print(TNC_Industry.to_string(index=False))

Unweighted Establishment Observations

Note: 'rateDelTNC' = delivery trips per day per establishment in industry

                                     IndustryGroup  useTNC  delivTNC  totEstab pctUseTNC rateDelTNC
                                Agriculture/Mining       1         1        96     1.04%      0.010
                                     Manufacturing       6         7       248     2.42%      0.028
                              Industrial/Utilities       1         0        42     2.38%      0.000
                                            Retail      26       133       275     9.45%      0.484
                                         Wholesale       4         2       229     1.75%      0.009
                                      Construction       8         4       254     3.15%      0.016
                                    Transportation       7         3       132     5.30%      0.023
 Info/Finance/Insurance/Real Estate/Professiona...       8         2       427     1.8

In [56]:
# TNC Usage Rates (weighted)
# Same tabulation as the raw rates, but each establishment counts with its expansion
# weight; weighted employment is added so deliveries can also be expressed per employee.
industry_col = 'base_location_Industry Group'
is_tnc_user = df_estab['is_use_tnc']==1

# Weighted deliveries are only filled for TNC users; weighted employment for everyone.
df_estab.loc[is_tnc_user, 'wghtd_from_deliveries_pcsuvpu'] = df_estab['estab_exp_weight'] * df_estab['no_of_from_deliveries_pcsuvpu']
df_estab['wghtd_employment'] = df_estab['estab_exp_weight'] * df_estab['emp_total']

# Select each column before summing so text columns are never aggregated.
tnc_by_industry = df_estab[is_tnc_user].groupby([industry_col])
all_by_industry = df_estab.groupby([industry_col])

useTNC = tnc_by_industry['estab_exp_weight'].sum().reset_index(name='useTNC')
# NOTE(review): astype('int64') truncates the weighted sums before any rate is computed
# from them, so the rates below are based on whole numbers.
delivTNC = tnc_by_industry['wghtd_from_deliveries_pcsuvpu'].sum().reset_index(name='delivTNC').astype('int64')
empIfUseTNC = tnc_by_industry['wghtd_employment'].sum().reset_index(name='empIfUseTNC').astype('int64')
empAll = all_by_industry['wghtd_employment'].sum().reset_index(name='empAll').astype('int64')

allEstab = all_by_industry['estab_exp_weight'].sum().reset_index(name='totEstab')
allEstab["IndustryGroup"] = allEstab[industry_col].map(indus_lookup)

# Left joins on the industry code: industries without any TNC user get NaN.
TNC_Industry = allEstab.set_index(industry_col).join(useTNC.set_index(industry_col))
TNC_Industry = TNC_Industry.join(delivTNC.set_index(industry_col))
TNC_Industry = TNC_Industry.join(empIfUseTNC.set_index(industry_col))
TNC_Industry = TNC_Industry.join(empAll.set_index(industry_col))
TNC_Industry.loc[:,'IndusAbbrv'] = TNC_Industry.index.map(ind_abrv)

# Reorder columns (copy so new columns are added to an independent frame, not a slice)
TNC_Industry = TNC_Industry[['IndusAbbrv', 'useTNC', 'delivTNC', 'empIfUseTNC', 'empAll', 'totEstab']].copy()

count_cols = ['useTNC', 'delivTNC', 'empIfUseTNC', 'empAll', 'totEstab']
rate_cols = ['pctUseTNC', 'rateDelTNC', 'delivEmpIfTNC']

TNC_Industry['pctUseTNC'] = TNC_Industry['useTNC'] / TNC_Industry['totEstab']
# Region-wide totals over all industries, formatted for printing.
TNC_Totals = pd.DataFrame(TNC_Industry[count_cols].sum().map('{:.0f}'.format)).T
TNC_Industry['rateDelTNC'] = TNC_Industry['delivTNC'] / TNC_Industry['totEstab']
TNC_Industry['delivEmpIfTNC'] = TNC_Industry['delivTNC'] / TNC_Industry['empIfUseTNC']

# Pool every industry except Retail (4) and Leisure/Accommodations and Food (11),
# then recompute the three rates from the pooled totals.
NonRestRetl = pd.DataFrame(TNC_Industry[(TNC_Industry.index!=4) & (TNC_Industry.index!=11)]\
                             [count_cols].copy().sum()).T
NonRestRetl['pctUseTNC'] = NonRestRetl['useTNC'] / NonRestRetl['totEstab']
NonRestRetl['rateDelTNC'] = NonRestRetl['delivTNC'] / NonRestRetl['totEstab']
NonRestRetl['delivEmpIfTNC'] = NonRestRetl['delivTNC'] / NonRestRetl['empIfUseTNC']

# Convert both tables to display strings: counts as whole numbers, rates with 5 decimals.
# After this point the columns hold text, not numbers.
for table in (TNC_Industry, NonRestRetl):
    for col in count_cols:
        table[col] = table[col].map('{:.0f}'.format)
    for col in rate_cols:
        table[col] = table[col].map('{:.5f}'.format)
NonRestRetl['type'] = 'NonRestRetl'

print("Weighted Establishment Observations")
print("\nNote: 'rateDelTNC' = delivery trips per day per establishment in industry\n")
print(TNC_Industry.to_string(index=False))
print("\nAll Establishment Totals:")
print(TNC_Totals.to_string(index=False))

Weighted Establishment Observations

Note: 'rateDelTNC' = delivery trips per day per establishment in industry

IndusAbbrv useTNC delivTNC empIfUseTNC  empAll totEstab pctUseTNC rateDelTNC delivEmpIfTNC
       AGM      3        2          25    3378      422   0.00596    0.00474       0.08000
       MFG     88      101        3998   92184     4387   0.02014    0.02302       0.02526
       IUT      3        0          35    3327      192   0.01542    0.00000       0.00000
       RET    825     3766       26808  168206    10396   0.07935    0.36225       0.14048
       WHL     70       48         594   38724     4345   0.01620    0.01105       0.08081
       CON    173       58        3250   71600     7748   0.02227    0.00749       0.01785
       TRN     90       29        2301   22314     1867   0.04803    0.01553       0.01260
       IFR    556      162       10946  271083    28849   0.01929    0.00562       0.01480
       EPO    159      161        3883  241807    14649   0.01084    

In [57]:
# Label the weighted-employment table with the descriptive industry names.
empAll = empAll.assign(IndustryGroup=empAll['base_location_Industry Group'].map(indus_lookup))
empAll

Unnamed: 0,base_location_Industry Group,empAll,IndustryGroup
0,1,3378,Agriculture/Mining
1,2,92184,Manufacturing
2,3,3327,Industrial/Utilities
3,4,168206,Retail
4,5,38724,Wholesale
5,6,71600,Construction
6,7,22314,Transportation
7,8,271083,Info/Finance/Insurance/Real Estate/Professiona...
8,9,241807,Education/Public/Other services
9,10,230367,Medical/Health Services


In [58]:
# Create table for Restaurant and Non-Restaurant Types
# Pull the single-industry rows out of the weighted table: code 4 is Retail and
# code 11 (Leisure/Accommodations and Food) is labelled 'Restaurant'.
usecols = ['useTNC', 'delivTNC', 'empIfUseTNC', 'empAll', 'totEstab', 'pctUseTNC', 'rateDelTNC', 'delivEmpIfTNC']

Retail = TNC_Industry.loc[TNC_Industry.index == 4, usecols].reset_index(drop=True)
Retail['type'] = 'Retail'

Restaurant = TNC_Industry.loc[TNC_Industry.index == 11, usecols].reset_index(drop=True)
Restaurant['type'] = 'Restaurant'

In [59]:
# Create combined table of usage rates by three industry groups
usecols = ['type', 'useTNC', 'delivTNC', 'empIfUseTNC','empAll','totEstab', 'pctUseTNC', 'rateDelTNC', 'delivEmpIfTNC'] #re-order
# Stack the three one-row tables with a fresh 0..n-1 index, then key the result by 'type'.
stacked_groups = pd.concat([NonRestRetl, Restaurant, Retail], axis=0, sort=False, ignore_index=True)
TNC_Industry3 = stacked_groups[usecols].set_index('type')

print("TNC Usage Rates by Three Primary Groups\n")
print(TNC_Industry3.to_string(index=True))

TNC Usage Rates by Three Primary Groups

            useTNC delivTNC empIfUseTNC  empAll totEstab pctUseTNC rateDelTNC delivEmpIfTNC
type                                                                                       
NonRestRetl   1449      575       27532  974784    84497   0.01715    0.00680       0.02088
Restaurant    3132     8184       78791  374445     9234   0.33914    0.88629       0.10387
Retail         825     3766       26808  168206    10396   0.07935    0.36225       0.14048


In [60]:
# Scratch calculation: 84497 and 0.01715 are totEstab and pctUseTNC of the NonRestRetl
# row in the table above.
# NOTE(review): the origin of the 0.44129 factor is not shown in this notebook -- TODO document.
.44129 * 84497 * 0.01715

639.4837313795

#### Read in TNC establishment-level records (driver summaries)

In [61]:
# Read input data -- TNC establishment file
df_tnc_driver = pd.read_excel(
    os.path.join(root_dir, raw_data_dir,
                 r"TNC\TNC Travel Survey_Data Submittal_1-19-23.xlsx"),
    sheet_name=r"Establishment Data")

# company_id intentionally stays a regular column (later cells select it and merge on it):
# DataFrame.set_index returns a new frame, so an un-assigned call never changed df_tnc_driver.
# Transposed preview: one column per record, one row per field.
df_tnc_driver.head(3).T

Unnamed: 0,0,1,2
company_id,600026,600027,600028
"Industry Group_Size Code (Group from Column D; Size 1=0-9 emp, 2=10+)",,,
company_name,Uber Eats,Amazon Flex,Senpex
vehicle_purpose,,,
number_of_trips,14,26,4
base_location_Industry Group,,,
company_location_address,1052 Woodlawn Ave,3980 Hatton St,6881 Alvarado Rd
company_location_city,Chula Vista,San Diego,San Diego
company_location_state,California,California,California
company_location_zipcode,91911,92111,92120


In [62]:
# Keep only the id, the company name and the trip count; the company name is the
# delivery app, so it is renamed to client_app.
df_tnc_driver = (
    df_tnc_driver
    .loc[:, ['company_id', 'company_name', 'number_of_trips']]
    .rename(columns={'company_name': 'client_app'})
)
df_tnc_driver.head()

Unnamed: 0,company_id,client_app,number_of_trips
0,600026,Uber Eats,14
1,600027,Amazon Flex,26
2,600028,Senpex,4
3,600029,Grub Hub,9
4,600031,Uber Eats,30


In [63]:
# Read input data -- lookup table for TNC client categories
tnc_lookup_path = os.path.join(root_dir, lookup_dir, r"Lookups_v8.xlsx")
df_lookup = pd.read_excel(tnc_lookup_path, sheet_name=r"TNC Categories")
df_lookup.head()

Unnamed: 0,company_name,company_name_alt,Estb (ie veh),Trips,Trips/Veh,Tnc_Cat,TNC_IndCat3
0,amazon,Amazon,34.0,748.0,22.0,Package/Other,NonRestRetl
1,amazonflex,Amazon Flex,15.0,259.0,17.266667,Package/Other,NonRestRetl
2,amazonfresh,Amazon Fresh,1.0,12.0,12.0,Grocery,Retail
3,axhire,Axle Hire,4.0,61.0,15.25,Package/Other,NonRestRetl
4,axle,Axle Hire,,,15.25,Package/Other,NonRestRetl


In [64]:
# Normalised join key built from the alternate company name: trimmed, lower-cased, spaces removed.
alt_name_clean = df_lookup["company_name_alt"].str.strip().str.lower()
df_lookup["company_name2"] = alt_name_clean.str.replace(" ","")

In [65]:
# Normalised app name -> three-way industry category (NonRestRetl / Restaurant / Retail).
# When a key occurs more than once, the last row wins.
TNC_IndCat = {
    app_key: category
    for app_key, category in zip(df_lookup['company_name2'], df_lookup['TNC_IndCat3'])
}
TNC_IndCat

{'amazon': 'NonRestRetl',
 'amazonflex': 'NonRestRetl',
 'amazonfresh': 'Retail',
 'axlehire': 'NonRestRetl',
 'doordash': 'Restaurant',
 'fantuan': 'Restaurant',
 'gopuff': 'Retail',
 'grubhub': 'Restaurant',
 'instacart': 'Retail',
 'pointpickup': 'Retail',
 'postmates': 'Restaurant',
 'returnmates': 'NonRestRetl',
 'returnme': 'NonRestRetl',
 'returntorunway': 'NonRestRetl',
 'roadie': 'NonRestRetl',
 'senpex': 'NonRestRetl',
 'shipt': 'Retail',
 'spark': 'Retail',
 'ubereats': 'Restaurant'}

In [66]:
# Normalise the app name the same way as the lookup key, then attach the industry category.
# Apps that are missing from the lookup end up with NaN in IndCat3.
app_name_clean = df_tnc_driver['client_app'].str.strip().str.lower()
df_tnc_driver['client_app2'] = app_name_clean.str.replace(" ","")
df_tnc_driver['IndCat3'] = df_tnc_driver['client_app2'].map(TNC_IndCat)
df_tnc_driver.head()

Unnamed: 0,company_id,client_app,number_of_trips,client_app2,IndCat3
0,600026,Uber Eats,14,ubereats,Restaurant
1,600027,Amazon Flex,26,amazonflex,NonRestRetl
2,600028,Senpex,4,senpex,NonRestRetl
3,600029,Grub Hub,9,grubhub,Restaurant
4,600031,Uber Eats,30,ubereats,Restaurant


In [67]:
# Read input data -- TNC trips
df_tnc_trips = pd.read_excel(
    os.path.join(root_dir, raw_data_dir,
                 r"TNC\TNC Travel Survey_Data Submittal_1-19-23.xlsx"),
    sheet_name=r"Trips")

# company_id intentionally stays a regular column (later cells group by it):
# DataFrame.set_index returns a new frame, so an un-assigned call never changed df_tnc_trips.
df_tnc_trips.head()

Unnamed: 0,company_id,vehicle_id,driver_id,trip_number,trip_load_status,trip_load_weight,activity_type,activity_type_other,placetype,placetype_other,...,cargo_do_weight,travel_date,arrival_time,departure_time,used_other_vehicle,participation_type,vehicle_id.1,Lower Estimate Weight Factor,Most Likely Estimate Weight Factor,Upper Estimate Weight Factor
0,600026,1371,1744,0,3.0,,14,,11,,...,,2022-08-29,,13:39:00,2,Smartphone,,,,
1,600026,1371,1744,1,,,6,,6,,...,,2022-08-29,14:03:00,14:33:00,2,Smartphone,,,,
2,600026,1371,1744,2,,,5,,11,,...,11.0,2022-08-29,14:39:00,14:42:00,2,Smartphone,,,,
3,600026,1371,1744,3,,,5,,11,,...,89.0,2022-08-29,14:54:03,15:01:21,2,Smartphone,,,,
4,600026,1371,1744,4,,,6,,5,,...,,2022-08-29,15:09:00,15:12:00,2,Smartphone,,,,


In [68]:
# Sum number of client pickup stops for each driver (total unique clients)
# Step 1: one row per distinct (driver, place name, address) among activity_type == 6 records.
pickup_records = df_tnc_trips[df_tnc_trips['activity_type'] == 6]
temp = (pickup_records
        .groupby(['company_id', 'location_placename', 'location_address'])
        .size()
        .reset_index(name='clientPickups'))
# Step 2: count those distinct places per driver.
clientPickups = temp.groupby(['company_id'])['clientPickups'].count().reset_index()
clientPickups.head()

Unnamed: 0,company_id,clientPickups
0,600026,3
1,600027,4
2,600028,1
3,600029,3
4,600031,4


In [69]:
# Sum number of home dropoffs for each driver
# Dropoff records are activity_type == 5; placetype == 11 is treated as a home location.
is_dropoff = df_tnc_trips['activity_type'] == 5
at_home = df_tnc_trips['placetype'] == 11
homeDropoffs = df_tnc_trips[is_dropoff & at_home].groupby(['company_id']).size().reset_index(name='homeDropoffs')
homeDropoffs.head()

Unnamed: 0,company_id,homeDropoffs
0,600026,7
1,600027,12
2,600028,1
3,600031,10
4,600035,71


In [70]:
# Sum number of business dropoffs for each driver
# Every dropoff (activity_type == 5) whose placetype is anything other than 11,
# which includes records with a missing placetype.
is_dropoff = df_tnc_trips['activity_type'] == 5
not_home = df_tnc_trips['placetype'].ne(11)
bizDropoffs = df_tnc_trips[is_dropoff & not_home].groupby(['company_id']).size().reset_index(name='businessDropoffs')
bizDropoffs.head()

Unnamed: 0,company_id,businessDropoffs
0,600026,3
1,600027,2
2,600029,2
3,600031,3
4,600032,2


In [71]:
# Attach pickup counts to the driver table; drivers without pickup records get NaN.
df_tnc_driver = pd.merge(df_tnc_driver, clientPickups, how='left', on='company_id')
df_tnc_driver.head()

Unnamed: 0,company_id,client_app,number_of_trips,client_app2,IndCat3,clientPickups
0,600026,Uber Eats,14,ubereats,Restaurant,3.0
1,600027,Amazon Flex,26,amazonflex,NonRestRetl,4.0
2,600028,Senpex,4,senpex,NonRestRetl,1.0
3,600029,Grub Hub,9,grubhub,Restaurant,3.0
4,600031,Uber Eats,30,ubereats,Restaurant,4.0


In [72]:
# Attach home-dropoff counts to the driver table; drivers without any get NaN.
df_tnc_driver = pd.merge(df_tnc_driver, homeDropoffs, how='left', on='company_id')
df_tnc_driver.head()

Unnamed: 0,company_id,client_app,number_of_trips,client_app2,IndCat3,clientPickups,homeDropoffs
0,600026,Uber Eats,14,ubereats,Restaurant,3.0,7.0
1,600027,Amazon Flex,26,amazonflex,NonRestRetl,4.0,12.0
2,600028,Senpex,4,senpex,NonRestRetl,1.0,1.0
3,600029,Grub Hub,9,grubhub,Restaurant,3.0,
4,600031,Uber Eats,30,ubereats,Restaurant,4.0,10.0


In [73]:
# Attach business-dropoff counts to the driver table; drivers without any get NaN.
df_tnc_driver = pd.merge(df_tnc_driver, bizDropoffs, how='left', on='company_id')
df_tnc_driver.head()

Unnamed: 0,company_id,client_app,number_of_trips,client_app2,IndCat3,clientPickups,homeDropoffs,businessDropoffs
0,600026,Uber Eats,14,ubereats,Restaurant,3.0,7.0,3.0
1,600027,Amazon Flex,26,amazonflex,NonRestRetl,4.0,12.0,2.0
2,600028,Senpex,4,senpex,NonRestRetl,1.0,1.0,
3,600029,Grub Hub,9,grubhub,Restaurant,3.0,,2.0
4,600031,Uber Eats,30,ubereats,Restaurant,4.0,10.0,3.0


In [74]:
# Calculate TNC Driver Pickups by Industry Category
# Only the pickup column is summed; summing the whole frame first would also
# aggregate the text columns (app names).
tnc_by_client = df_tnc_driver.groupby('IndCat3')['clientPickups'].sum().reset_index(name='clientPickups')
# Displayed with the category as index; tnc_by_client itself keeps IndCat3 as a column.
tnc_by_client.set_index('IndCat3')

Unnamed: 0_level_0,clientPickups
IndCat3,Unnamed: 1_level_1
NonRestRetl,152.0
Restaurant,636.0
Retail,197.0


In [75]:
# Calculate TNC Home Dropoffs by Industry Category
# Only the dropoff column is summed; summing the whole frame first would also
# aggregate the text columns (app names).
tnc_by_homedel = df_tnc_driver.groupby('IndCat3')['homeDropoffs'].sum().reset_index(name='homeDropoffs')
# Displayed with the category as index; tnc_by_homedel itself keeps IndCat3 as a column.
tnc_by_homedel.set_index('IndCat3')

Unnamed: 0_level_0,homeDropoffs
IndCat3,Unnamed: 1_level_1
NonRestRetl,719.0
Restaurant,709.0
Retail,508.0


In [76]:
# Calculate TNC Business Dropoffs by Industry Category
# Only the dropoff column is summed; summing the whole frame first would also
# aggregate the text columns (app names).
tnc_by_bizdel = df_tnc_driver.groupby('IndCat3')['businessDropoffs'].sum().reset_index(name='businessDropoffs')
# Displayed with the category as index; tnc_by_bizdel itself keeps IndCat3 as a column.
tnc_by_bizdel.set_index('IndCat3')

Unnamed: 0_level_0,businessDropoffs
IndCat3,Unnamed: 1_level_1
NonRestRetl,292.0
Restaurant,313.0
Retail,90.0


In [77]:
# Calculate TNC Weights
# Expansion weight per category = weighted deliveries reported by establishments
# (delivTNC) / client pickups observed in the driver survey.
tnc_weights = tnc_by_client.merge(TNC_Industry3, left_on='IndCat3', right_on='type')

# delivTNC is stored as display text, so it is converted back to a number first.
expanded_deliveries = tnc_weights['delivTNC'].str.replace(",","").astype(float)
observed_pickups = tnc_weights['clientPickups'].astype(float)
tnc_weights['tnc_expwght'] = (expanded_deliveries / observed_pickups).round(5)

# Pickups become display text only after the weight has been computed.
tnc_weights['clientPickups'] = tnc_weights['clientPickups'].map('{:,.0f}'.format)

print(tnc_weights[['IndCat3','clientPickups','delivTNC','tnc_expwght']].to_string(index=False))

     IndCat3 clientPickups delivTNC  tnc_expwght
 NonRestRetl           152      575      3.78289
  Restaurant           636     8184     12.86792
      Retail           197     3766     19.11675


In [78]:
# Create TNC expansion weights dictionary
# Category -> expansion weight, then attached to every driver record by its category.
tnc_expwght = {
    category: weight
    for category, weight in zip(tnc_weights['IndCat3'], tnc_weights['tnc_expwght'])
}
df_tnc_driver['tnc_expwght'] = df_tnc_driver['IndCat3'].map(tnc_expwght)

In [79]:
# Display the category -> expansion weight lookup.
tnc_expwght

{'NonRestRetl': 3.78289, 'Restaurant': 12.86792, 'Retail': 19.11675}

In [40]:
# Calculate impact on total trips
# Expanded trips per driver = reported trips x the driver's category weight.
df_tnc_driver['wghtd_trips'] = df_tnc_driver['number_of_trips'] * df_tnc_driver['tnc_expwght']

# One-row summary of raw vs. expanded trip totals, formatted with thousands separators.
trip_totals = df_tnc_driver[['number_of_trips','wghtd_trips']].sum()
tnc_summary = trip_totals.to_frame().T
for total_col in ('number_of_trips', 'wghtd_trips'):
    tnc_summary[total_col] = tnc_summary[total_col].map('{:,.0f}'.format)
tnc_summary = tnc_summary.rename(columns={'number_of_trips': 'raw_trips'})

print("Impact of TNC weights expansion on Total TNC trips\n")
print(tnc_summary.to_string(index=False))

Impact of TNC weights expansion on Total TNC trips

raw_trips wghtd_trips
    5,253      60,640


In [41]:
# Save the weighted driver table as CSV (row index included).
# NOTE(review): the file is written under raw_data_dir rather than proc_data_dir, and it
# is saved before the weighted pickup/dropoff columns are added -- confirm both are intended.
tnc_weighted_csv = os.path.join(root_dir, raw_data_dir, r"TNC\TNC_weighted.csv")
df_tnc_driver.to_csv(tnc_weighted_csv)

In [42]:
# Get weighted number of trips and average per driver-route
# Each observed stop count is multiplied by the driver's expansion weight.
for weighted_col, observed_col in (('wghtd_client_picks', 'clientPickups'),
                                   ('wghtd_home_delivs', 'homeDropoffs'),
                                   ('wghtd_biz_delivs', 'businessDropoffs')):
    df_tnc_driver[weighted_col] = df_tnc_driver[observed_col] * df_tnc_driver['tnc_expwght']

In [43]:
# Client pickups by TNCs
# Weighted pickups per category; the column is selected before summing so the text
# columns of the driver table are never aggregated.
clientPickups = df_tnc_driver.groupby(['IndCat3'])['wghtd_client_picks'].sum()
clientPickups = pd.DataFrame(clientPickups.map('{:,.0f}'.format))
print(clientPickups.T.to_string(index=False))

NonRestRetl Restaurant Retail
        575      8,184  3,766


In [44]:
# Calculate deliveries to households by TNCs -- food (restaurant+groceries), non-food parcels
# The column is selected before summing so the text columns of the driver table are
# never aggregated.
homeDeliveries = df_tnc_driver.groupby(['IndCat3'])['wghtd_home_delivs'].sum()
# Food total must be taken while the values are still numeric (they become text below).
food = homeDeliveries['Restaurant']+homeDeliveries['Retail']
homeDeliveries = pd.DataFrame(homeDeliveries.map('{:,.0f}'.format))

print(homeDeliveries.T.to_string(index=False))
print("\nFood Deliveries = Restaurant + Retail")
print(f"\t{food:,.0f}")

NonRestRetl Restaurant Retail
      2,720      9,123  9,711

Food Deliveries = Restaurant + Retail
	18,835


In [45]:
# Calculate deliveries to businesses by TNCs
# NOTE(review): the result is stored in `clientPickups` although it holds weighted
# business dropoffs; the name is kept so the notebook's variables stay unchanged.
# The column is selected before summing so text columns are never aggregated.
clientPickups = df_tnc_driver.groupby(['IndCat3'])['wghtd_biz_delivs'].sum()
clientPickups = pd.DataFrame(clientPickups.map('{:,.0f}'.format))
print(clientPickups.T.to_string(index=False))

NonRestRetl Restaurant Retail
      1,105      4,028  1,721


In [46]:
# Clients per Driver Route
# Each driver record carries its category weight, so the sum of weights is the expanded
# number of driver routes. Columns are selected before summing so text columns are
# never aggregated, and the grouping is built once instead of three times.
drivers_by_category = df_tnc_driver.groupby(['IndCat3'])
weighted_routes = drivers_by_category['tnc_expwght'].sum()

clientPickups = drivers_by_category['wghtd_client_picks'].sum() / weighted_routes
clientPickups = pd.DataFrame(clientPickups.map('{:,.4f}'.format))
print("Clients per driver route:")
print(clientPickups.T.to_string(index=False))

# Driver routes
# A 'Total' entry is appended before the values are turned into display text.
driverRoutes = weighted_routes.copy()
driverRoutes['Total'] = driverRoutes.sum()
driverRoutes = pd.DataFrame(driverRoutes.map('{:.0f}'.format))
print("\nDriver routes (weighted)")
print(driverRoutes.T.to_string(index=False))

Clients per driver route:
NonRestRetl Restaurant  Retail
     1.6344     2.8778  2.0957

Driver routes (weighted)
NonRestRetl Restaurant Retail Total
        352       2844   1797  4993


#### Get TNC Route Generation (routes = drivers)

In [47]:
# Driver Routes Per Client Establishment using TNCs
# Expanded driver routes / expanded client pickups, by category. Columns are selected
# before summing so the text columns of the driver table are never aggregated.
drivers_by_category = df_tnc_driver.groupby(['IndCat3'])
driversPerClient = drivers_by_category['tnc_expwght'].sum() / drivers_by_category['wghtd_client_picks'].sum()
driversPerClient = pd.DataFrame(driversPerClient.map('{:.5f}'.format))
# The ratio of two differently named Series has no name, so the column is labelled 0.
driversPerClient.rename(columns={0:'drivers_per_client'}, inplace=True)

print("Driver Routes per Client Establishment using TNCs:")
print(driversPerClient.T.to_string(index=False))

Driver Routes per Client Establishment using TNCs:
NonRestRetl Restaurant   Retail
    0.61184    0.34748  0.47716


In [48]:
# Merge with establishment data
# Both tables hold display text, so they are converted back to floats before joining
# on the category name.
drivers_numeric = driversPerClient.astype('float').reset_index()
usage_numeric = TNC_Industry3.astype('float').reset_index()
tncRouteGen = drivers_numeric.merge(usage_numeric, how='left', left_on='IndCat3', right_on='type')

tncRouteGen = tncRouteGen.loc[:, ['IndCat3','pctUseTNC','rateDelTNC', 'delivEmpIfTNC','drivers_per_client']]

In [49]:
# Display the route-generation inputs by category.
tncRouteGen

Unnamed: 0,IndCat3,pctUseTNC,rateDelTNC,delivEmpIfTNC,drivers_per_client
0,NonRestRetl,0.01715,0.0068,0.02088,0.61184
1,Restaurant,0.33914,0.88629,0.10387,0.34748
2,Retail,0.07935,0.36225,0.14048,0.47716


In [50]:
# Display the three-group usage table again for reference.
TNC_Industry3

Unnamed: 0_level_0,useTNC,delivTNC,empIfUseTNC,empAll,totEstab,pctUseTNC,rateDelTNC,delivEmpIfTNC
type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
NonRestRetl,1449,575,27532,974784,84497,0.01715,0.0068,0.02088
Restaurant,3132,8184,78791,374445,9234,0.33914,0.88629,0.10387
Retail,825,3766,26808,168206,10396,0.07935,0.36225,0.14048


In [51]:
# Hand check for NonRestRetl: empIfUseTNC (27532) x delivEmpIfTNC (0.02088) x
# drivers_per_client (0.61184); the product is close to the 352 weighted driver routes.
27532 * 0.02088 * 0.61184 

351.7273350144

In [52]:
# Re-print the weighted driver routes next to the hand check above for comparison.
print("\nDriver routes (weighted)")
print(driverRoutes.T.to_string(index=False))


Driver routes (weighted)
NonRestRetl Restaurant Retail Total
        352       2844   1797  4993


In [53]:
# Create employment based rates
# Join weighted driver routes with weighted employment per category. Both inputs hold
# display text, hence the float conversion. The inner join drops the 'Total' entry of
# driverRoutes because TNC_Industry3 has no matching type.
routeGenRates = pd.merge(driverRoutes.astype('float').reset_index(),
                         TNC_Industry3[['empAll']].astype('float').reset_index(),
                         how='inner', left_on='IndCat3', right_on='type').drop(columns=['type'])
routeGenRates = routeGenRates.rename(columns={'tnc_expwght': 'routes_wtd', 'empAll': 'employment_wtd'})

# Build the totals row from the numeric columns only. Summing the whole frame would also
# "sum" the category labels (string concatenation) before they are overwritten.
# Row label 4 is kept so the table's index matches the previously saved output.
totals_row = pd.DataFrame({'IndCat3': ["Totals"],
                           'routes_wtd': [routeGenRates['routes_wtd'].sum()],
                           'employment_wtd': [routeGenRates['employment_wtd'].sum()]},
                          index=[4])
routeGenRates = pd.concat([routeGenRates, totals_row], sort=False)

routeGenRates['routes_per_emp'] = routeGenRates['routes_wtd'] / routeGenRates['employment_wtd']
routeGenRates

Unnamed: 0,IndCat3,routes_wtd,employment_wtd,routes_per_emp
0,NonRestRetl,352.0,974784.0,0.000361
1,Restaurant,2844.0,374445.0,0.007595
2,Retail,1797.0,168206.0,0.010683
4,Totals,4993.0,1517435.0,0.00329


#### Calibrate Route Gen

In [54]:
# Read land use data 
mgra_input_csv = os.path.join(root_dir, raw_data_dir, r"Land_Use\mgra15_based_input_2022_02_cvm.csv")
df_MGRA = pd.read_csv(mgra_input_csv)
df_MGRA.head()

Unnamed: 0,mgra,taz,LUZ,pop,hhp,hs,hs_sf,hs_mf,hs_mh,hh,...,hotelroomtotal,parkactive,openspaceparkpreserve,beachactive,district27,milestocoast,acre,landacre,effective_acres,truckregiontype
0,1,3010,10,440,440,176,84,92,0,174,...,0,0.0,0.0,0.0,9,4.35,18.837621,18.837621,18.837621,1
1,2,1797,28,130,68,56,0,56,0,48,...,0,0.0,0.0,0.0,15,0.64,2.87233,2.87233,2.87233,1
2,3,4361,239,549,549,200,23,177,0,192,...,0,0.0,0.0,0.0,13,12.22,25.713898,25.713898,25.713898,1
3,4,340,151,5,5,3,3,0,0,2,...,0,0.0,0.0,0.0,2,0.17,2.678374,2.678374,2.678374,1
4,5,388,151,90,90,43,43,0,0,36,...,0,0.0,0.0,0.0,2,0.47,4.057765,4.057765,4.057765,1


In [55]:
# Create short names for industries
# Model sector number -> three-letter abbreviation (12 = military).
indus_abrv = {
    1: "AGM",
    2: "MFG",
    3: "IUT",
    4: "RET",
    5: "WHL",
    6: "CON",
    7: "TRN",
    8: "IFR",
    9: "EPO",
    10: "MHS",
    11: "LAF",
    12: "MIL"
}

# Reverse lookup: abbreviation -> sector number
abrv_indus = {abbrev: sector_no for sector_no, abbrev in indus_abrv.items()}

# Cross walk between MGRA employment and model employment categories
# Sector number -> list of MGRA employment columns that belong to it.
emp_mgra_to_model = {
    1:["emp_ag_min"],
    2:["emp_mnf"],
    3:["emp_utl"],
    4:["emp_ret"],
    5:["emp_whl"],
    6:["emp_con"],
    7:["emp_trn_wrh"],
    8:["emp_fin_res_mgm","emp_bus_svcs"],
    9:["emp_educ","emp_gov","emp_oth","emp_ent","emp_accm"],
    10:["emp_hlth"],
    11:["emp_food"],
    12:["emp_mil"]
}

# Reverse lookup: MGRA employment column -> sector number
emp_survey_to_mgra = {
    emp_field: sector_no
    for sector_no, emp_fields in emp_mgra_to_model.items()
    for emp_field in emp_fields
}

In [56]:
# Bucket round function
def bucketRound(arr, thr=0.5):
    """Round a 1-D sequence to integers while carrying fractional parts forward.

    Each value is floored, and its fractional part is added to a running
    "bucket". Whenever the bucket exceeds ``thr`` the current element is
    bumped up by one and one whole unit is removed from the bucket, so the
    unused remainder keeps accumulating. This keeps the sum of the output
    close to the sum of the input instead of drifting upward.

    Parameters
    ----------
    arr : list, tuple, numpy.ndarray or pandas.Series
        One-dimensional numeric values to round.
    thr : float, default 0.5
        Bucket level above which an element is rounded up.

    Returns
    -------
    numpy.ndarray of int64, or None
        Rounded values in input order. For any other input type an error
        message is printed and ``None`` is returned.

    Raises
    ------
    ValueError, OverflowError
        If a value is NaN or infinite (raised by the ``int`` conversion).
    """
    if not isinstance(arr, (list, tuple, np.ndarray, pd.Series)):
        print(arr)
        print("Error: Function requires inputs as an array or list of values.")
        return None

    values = np.asarray(arr).astype('float')
    out = np.zeros(len(values)).astype('int64')
    bucket = float(0)
    for i, value in enumerate(values):
        # Floor (not truncate) so the fractional part is in [0, 1) for
        # negative values as well.
        whole = int(np.floor(value))
        out[i] = whole
        bucket += value - whole
        if bucket > thr:
            out[i] += 1
            # Remove exactly the unit just handed out; resetting to zero
            # would throw away the carried remainder and inflate the total.
            bucket -= 1.0
    return out

In [57]:
# Select only relevant employment fields (any column whose name contains 'emp_').
# Fields such as emp_tot and emp_non_ws_* also match; they are not in the
# crosswalk, so they end up in group 0 below.
use_columns = [name for name in df_MGRA.columns if 'emp_' in name]

# Total jobs per employment field, tagged with model sector number and label
df_MGRA_emp = df_MGRA[use_columns].sum().to_frame('mgra_jobs')
df_MGRA_emp['emp_group_no'] = df_MGRA_emp.index.map(emp_survey_to_mgra)
df_MGRA_emp['emp_mod_sector'] = df_MGRA_emp['emp_group_no'].map(indus_abrv)
# Fields missing from the crosswalk come back as NaN; recode them to 0
df_MGRA_emp = df_MGRA_emp.fillna(0)

# Identify TNC3 sectors: everything defaults to NonRestRetl, then the
# restaurant (11), retail (4) and unmapped (0) groups are relabelled.
df_MGRA_emp['IndCat3'] = 'NonRestRetl'
for group_no, tnc3_label in {11: 'Restaurant', 4: 'Retail', 0: 'None'}.items():
    df_MGRA_emp.loc[df_MGRA_emp['emp_group_no'] == group_no, 'IndCat3'] = tnc3_label
df_MGRA_emp

Unnamed: 0,mgra_jobs,emp_group_no,emp_mod_sector,IndCat3
emp_gov,121320,9.0,EPO,NonRestRetl
emp_mil,110000,12.0,MIL,NonRestRetl
emp_ag_min,3305,1.0,AGM,NonRestRetl
emp_bus_svcs,267270,8.0,IFR,NonRestRetl
emp_fin_res_mgm,105702,8.0,IFR,NonRestRetl
emp_educ,130918,9.0,EPO,NonRestRetl
emp_hlth,199896,10.0,MHS,NonRestRetl
emp_ret,144185,4.0,RET,Retail
emp_trn_wrh,32237,7.0,TRN,NonRestRetl
emp_con,77331,6.0,CON,NonRestRetl


In [58]:
# Collect jobs summaries by model sectors (fields in group 0 are excluded)
mapped_fields = df_MGRA_emp.loc[df_MGRA_emp['emp_group_no'] > 0]
mgraEmpCat3 = mapped_fields.groupby('IndCat3')['mgra_jobs'].sum().reset_index()

# Append a row indexed 'Totals'. The scalar is broadcast to every column, so
# the IndCat3 cell of this row holds the job total rather than a label.
mgraEmpCat3.loc['Totals'] = mgraEmpCat3['mgra_jobs'].sum()
mgraEmpCat3

Unnamed: 0,IndCat3,mgra_jobs
0,NonRestRetl,1324253
1,Restaurant,140151
2,Retail,144185
Totals,1608589,1608589


In [59]:
# Adjust Rates to match MGRA mix of employment, which is different from CVS x TNC survey
# routeGenRates is merged back onto itself, so first drop the columns this cell
# creates. Without this, re-running the cell yields mgra_jobs_x / mgra_jobs_y
# columns and a KeyError on 'mgra_jobs'. On a first run nothing is dropped.
routeGenRates = (
    routeGenRates
    .drop(columns=['mgra_jobs', 'routes_per_emp_adj1', 'routes_calibrated'], errors='ignore')
    .merge(mgraEmpCat3, how='inner', on='IndCat3')
)

# Rescale the survey rate by (survey employment / MGRA jobs) so that applying
# it to MGRA jobs reproduces the weighted survey routes.
routeGenRates['routes_per_emp_adj1'] = (
    routeGenRates['routes_per_emp'] * routeGenRates['employment_wtd'] / routeGenRates['mgra_jobs']
)
routeGenRates['routes_calibrated'] = routeGenRates['routes_per_emp_adj1'] * routeGenRates['mgra_jobs']
routeGenRates

Unnamed: 0,IndCat3,routes_wtd,employment_wtd,routes_per_emp,mgra_jobs,routes_per_emp_adj1,routes_calibrated
0,NonRestRetl,352.0,974784.0,0.000361,1324253,0.000266,352.0
1,Restaurant,2844.0,374445.0,0.007595,140151,0.020292,2844.0
2,Retail,1797.0,168206.0,0.010683,144185,0.012463,1797.0


In [60]:
# Apply the retail rate to employment summed by LUZ and adjust for rounding
# Locate the Retail row by label rather than by position, so the cell does not
# depend on the row order of routeGenRates.
retail_row = routeGenRates['IndCat3'] == 'Retail'
retail_rate_adj1 = routeGenRates.loc[retail_row, 'routes_per_emp_adj1'].iloc[0]
retail_target = routeGenRates.loc[retail_row, 'routes_wtd'].iloc[0]

# Column subsets on a groupby must be passed as a list; the bare-tuple form
# was deprecated in pandas 1.0 and is rejected by pandas 2.x.
test_retail = df_MGRA.groupby(['LUZ'])[['emp_ret', 'emp_non_ws_wfh', 'emp_non_ws_oth']].sum().reset_index()
test_retail['tnc_routes'] = np.round(retail_rate_adj1 * test_retail['emp_ret'])

# First pass: ratio of the survey route total to the rounded LUZ-level total
out_retail = test_retail['tnc_routes'].sum()
factor = retail_target / out_retail

# Create the calibrated-rate column as float so rates can be stored in it
# without an int -> float upcast.
routeGenRates["routes_per_emp_calib"] = 0.0
routeGenRates.loc[retail_row, "routes_per_emp_calib"] = retail_rate_adj1 * factor

# Second pass: re-apply the scaled rate and round again
retail_rate_calib = routeGenRates.loc[retail_row, "routes_per_emp_calib"].iloc[0]
test_retail['tnc_routes'] = np.round(retail_rate_calib * test_retail['emp_ret'])

routeGenRates.loc[retail_row, "routes_calibrated"] = test_retail['tnc_routes'].sum()
test_retail['tnc_routes'].sum()

1794.0

In [61]:
# Restaurant employment base: food-service jobs only (emp_ent and emp_accm are left out)
df_MGRA['emp_laf'] = df_MGRA['emp_food']

# Locate the Restaurant row by label rather than by position, so the cell does
# not depend on the row order of routeGenRates.
restaurant_row = routeGenRates['IndCat3'] == 'Restaurant'
restaurant_rate_adj1 = routeGenRates.loc[restaurant_row, 'routes_per_emp_adj1'].iloc[0]
restaurant_target = routeGenRates.loc[restaurant_row, 'routes_wtd'].iloc[0]

# Column subsets on a groupby must be passed as a list; the bare-tuple form
# was deprecated in pandas 1.0 and is rejected by pandas 2.x.
test_restaurant = df_MGRA.groupby(['LUZ'])[['emp_laf', 'emp_non_ws_wfh', 'emp_non_ws_oth']].sum().reset_index()
test_restaurant['tnc_routes'] = np.round(restaurant_rate_adj1 * test_restaurant['emp_laf'])

# First pass: ratio of the survey route total to the rounded LUZ-level total
out_restaurant = test_restaurant['tnc_routes'].sum()
factor = restaurant_target / out_restaurant
routeGenRates.loc[restaurant_row, "routes_per_emp_calib"] = restaurant_rate_adj1 * factor

# Second pass: re-apply the scaled rate and round again
restaurant_rate_calib = routeGenRates.loc[restaurant_row, "routes_per_emp_calib"].iloc[0]
test_restaurant['tnc_routes'] = np.round(restaurant_rate_calib * test_restaurant['emp_laf'])

routeGenRates.loc[restaurant_row, "routes_calibrated"] = test_restaurant['tnc_routes'].sum()
test_restaurant['tnc_routes'].sum()

2841.0

In [62]:
# Non-restaurant / non-retail employment base: total employment less the
# restaurant, retail and the two emp_non_ws_* fields.
df_MGRA['emp_NonRR'] = (
    df_MGRA['emp_tot']
    - df_MGRA['emp_laf']
    - df_MGRA['emp_ret']
    - df_MGRA['emp_non_ws_wfh']
    - df_MGRA['emp_non_ws_oth']
)

# Locate the NonRestRetl row by label rather than by position, so the cell
# does not depend on the row order of routeGenRates.
nonrr_row = routeGenRates['IndCat3'] == 'NonRestRetl'
nonrr_rate_adj1 = routeGenRates.loc[nonrr_row, 'routes_per_emp_adj1'].iloc[0]
nonrr_target = routeGenRates.loc[nonrr_row, 'routes_wtd'].iloc[0]

# Column subsets on a groupby must be passed as a list; the bare-tuple form
# was deprecated in pandas 1.0 and is rejected by pandas 2.x.
test_nonrr = df_MGRA.groupby(['LUZ'])[['emp_NonRR', 'emp_non_ws_wfh', 'emp_non_ws_oth']].sum().reset_index()
test_nonrr['tnc_routes'] = np.round(nonrr_rate_adj1 * test_nonrr['emp_NonRR'])

# First pass: ratio of the survey route total to the rounded LUZ-level total
out_nonrr = test_nonrr['tnc_routes'].sum()
factor = nonrr_target / out_nonrr
routeGenRates.loc[nonrr_row, "routes_per_emp_calib"] = nonrr_rate_adj1 * factor

# Second pass: re-apply the scaled rate and round again
nonrr_rate_calib = routeGenRates.loc[nonrr_row, "routes_per_emp_calib"].iloc[0]
test_nonrr['tnc_routes'] = np.round(nonrr_rate_calib * test_nonrr['emp_NonRR'])

routeGenRates.loc[nonrr_row, "routes_calibrated"] = test_nonrr['tnc_routes'].sum()
test_nonrr['tnc_routes'].sum()

357.0

In [63]:
# Send to CSV
model_path = "Task05_Estimation_Calibration/Estimation/Route_Gen"

# Store the calibrated rates at 7 decimal places before export
routeGenRates['routes_per_emp_calib'] = routeGenRates['routes_per_emp_calib'].round(7)

rates_csv_path = os.path.join(root_dir, model_path, "TNC_RouteGenRates.csv")
export_columns = ['IndCat3', 'routes_wtd', 'routes_per_emp_calib']
routeGenRates[export_columns].to_csv(rates_csv_path, index=False)