## SANDAG CV and TNC Survey Analysis
### CV: Commercial Vehicles
### TNC: Transportation Network Companies

#### In the cell below, specify which data set to process:
* 'CV' for Establishment surveys
* 'TNC' for TNC surveys

In [6]:
# Indicate what dataset is to be processed. It should be either 'CV' or 'TNC'.
# dataset = 'CV'
dataset = 'TNC'

# Specify value of time for different trucks, [Light, Medium, Heavy]. Unit is $ per hour.
vot = [67, 68, 89]

#### Import Libraries 

In [7]:
import os, glob
import pandas as pd
import numpy as np
import openpyxl
import datetime as dt
import collections
import matplotlib.pyplot as plt
import openmatrix as omx
import gc

# from matplotlib.ticker import PercentFormatter
# from scipy import stats  # to get inverse of '.quantile()'

In [8]:
from datetime import datetime
current_date = datetime.now().strftime('%Y%m%d')

In [9]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

#### Define Trip Class

In [10]:
class Trip:
    registry = []

    def __init__(self):
        Trip.registry.append(self)
        self.index = -1
        self.o_act = 0
        self.o_place_type = 0
        self.o_place_name = ''
        self.o_address = ''
        self.o_lon = 0
        self.o_lat = 0
        self.o_taz = -1
        self.d_act = 0
        self.d_place_type = 0
        self.d_place_name = ''
        self.d_address = ''
        self.d_lon = 0
        self.d_lat = 0
        self.d_taz = -1
        self.trip_dist = 0
        self.o_dt = dt.datetime(1982, 1, 1, 0, 0, 0)
        self.d_at = dt.datetime(1982, 1, 1, 0, 0, 0)
        self.nt_dt = dt.datetime(1982, 1, 1, 0, 0, 0)
        self.travel_time = 0    # in seconds
        self.stop_duration = 0  # in seconds
        self.cargo_pickup = -1
        self.cargo_delivery = -1
        self.travel_date = dt.date(1982, 1, 1)
        self.last_trip = None

#### Set File Paths

In [11]:
project_path = os.getcwd().replace("\\02 Scripts", "")

# To run the script on your computer, update 'project path' above with script folder address on your computer and
# then uncomment it.
in_data_dir  = '01 Inputs'
out_data_dir = '03 Outputs'

lu_file = 'Lookups_v8.xlsx'
lu_path = os.path.join(project_path, in_data_dir, lu_file)

skims_dir = r'C:\Users\jgliebe\OneDrive - Cambridge Systematics\Documents - PROJ SANDAG Commercial Vehicle & Heavy Truck Model Update\_Shared_CSTeam\Task03_DataID_Review\ABM3\Skims'

if dataset == 'TNC':
    in_data_file = 'TNC Travel Survey_Data Submittal_1-19-23.xlsx'
else:
    in_data_file = 'SANDAG 2022 CV DataBase & Dictionaires_03_03_2023.xlsx'    
    in_data_file = 'SANDAG 2022 CV DataBase, Revised.xlsx'
in_data_path = os.path.join(project_path, in_data_dir, in_data_file)

#### Read the Lookup Tables

In [12]:
lookups = pd.read_excel(lu_path, header=0, sheet_name=None)

In [13]:
lookups['Segment Codes']

Unnamed: 0,Activities,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Places,Unnamed: 6
0,act_seg_code,act_seg_let,act_seg_name,,,plc_seg_code,plc_seg_name
1,1,S,Service,,,1,Residential
2,2,P,Goods_Pickup,,,2,Office
3,3,D,Goods_Delivery,,,3,Warehouse
4,4,M,Maintenance/Other,,,4,Other
5,5,B,Base,,,5,Retail and Restaurant
6,6,H,Home,,,6,Gas
7,,,,,,7,"Industrial, Agriculture, or Construction"
8,,,,,,8,Truck Terminal or Parking


In [14]:
lookup_at = lookups['Activity Type']  # .copy()
# lookup_at.drop(columns='count', inplace=True)

asl = {1: 'S', 2: 'P', 3: 'D', 4: 'M', 5: 'B', 6: 'H'} # asl = Activity Segment Letter
lookup_at['act_seg_let'] = lookup_at['act_seg_code'].map(asl)

dic_at = {}
for i, row in lookup_at.iterrows():
    dic_at[row['activity_type_code']] = row['act_seg_let'], row['act_seg_name']

lookup_at

Unnamed: 0,activity_type_code,count,activity_type_name,act_seg_name,act_seg_code,act_seg_let
0,10,3066,"Providing professional services (legal, medica...",Service,1,S
1,1,2766,Returning to Base Location,Base,5,B
2,5,1948,Delivering cargo - (including grocery and rest...,Goods_Delivery,3,D
3,8,1181,Providing Installation / Maintenance / Repair ...,Service,1,S
4,96,941,Other Activity (specify),Maintenance/Other,4,M
5,3,663,"Driver Needs (lunch, restroom, etc)",Maintenance/Other,4,M
6,2,589,"Vehicle Maintenance (fuel/charging, etc)",Maintenance/Other,4,M
7,14,531,Home,Home,6,H
8,6,226,Picking up cargo-(including grocery and restau...,Goods_Pickup,2,P
9,11,191,Shopping for Business (i.e. business/office su...,Maintenance/Other,4,M


In [15]:
lookup_pt = lookups['Place Type']   
# lookup_pt.drop(columns='count', inplace=True)

dic_pt = {}
dic_pt2 = {}
for i, row in lookup_pt.iterrows():
    dic_pt[row['place_type_code']] = row['plc_seg_code'], row['plc_seg_name']
    dic_pt2[row['plc_seg_code']] = row['plc_seg_name']
    
lookup_pt # .head(2)

Unnamed: 0,place_type_code,count,place_type_name,plc_seg_name,plc_seg_code
0,11,3335,Residential / Home,Residential,1
1,1,2607,Office Building (Non-Government),Office,2
2,14,1179,Warehouse,Warehouse,3
3,96,1175,Other (specify):,Other,4
4,6,972,Retail / Shopping,Retail and Restaurant,5
5,16,578,Gas station,Gas,6
6,5,470,Restaurant,Retail and Restaurant,5
7,7,460,Industrial / Manufacturing,"Industrial, Agriculture, or Construction",7
8,2,317,Government Office Building,Office,2
9,8,293,Medical / Hospital / Dental,Office,2


In [16]:
lookup_ind = lookups['Industries']
lookup_ind #.head(2)

Unnamed: 0,df_column_name,industry_code,industry_group,industry_code.1
0,base_location_Industry Group,1,Agriculture/Mining,1.0
1,base_location_Industry Group,2,Manufacturing,2.0
2,base_location_Industry Group,3,Industrial/Utilities,3.0
3,base_location_Industry Group,4,Retail,4.0
4,base_location_Industry Group,5,Wholesale,5.0
5,base_location_Industry Group,6,Construction,6.0
6,base_location_Industry Group,7,Transportation,7.0
7,base_location_Industry Group,8,Info/Finance/Insurance/Real Estate/Professiona...,8.0
8,base_location_Industry Group,9,Education/Other public services,9.0
9,base_location_Industry Group,10,Medical/Health Services,10.0


In [17]:
lookup_tnc = lookups['TNC Categories']  # .copy()

dic_tnc_names = {}
dic_tnc_categories = {}
for i, row in lookup_tnc.iterrows():
    dic_tnc_names[row['company_name']] = row['company_name_alt']   
    dic_tnc_categories[row['company_name_alt']] = row['TNC_IndCat3']   

lookup_tnc

Unnamed: 0,company_name,company_name_alt,Estb (ie veh),Trips,Trips/Veh,Tnc_Cat,TNC_IndCat3
0,amazon,Amazon,34.0,748.0,22.0,Package/Other,NonRestRetl
1,amazonflex,Amazon Flex,15.0,259.0,17.266667,Package/Other,NonRestRetl
2,amazonfresh,Amazon Fresh,1.0,12.0,12.0,Grocery,Retail
3,axhire,Axle Hire,4.0,61.0,15.25,Package/Other,NonRestRetl
4,axle,Axle Hire,,,15.25,Package/Other,NonRestRetl
5,axlehire,Axle Hire,,,15.25,Package/Other,NonRestRetl
6,doordash,Door Dash,90.0,907.0,10.077778,Restaurant,Restaurant
7,fantuan,Fantuan,1.0,23.0,23.0,Restaurant,Restaurant
8,gopuff,Go Puff,2.0,38.0,19.0,Food,Retail
9,grubhub,Grub Hub,43.0,576.0,13.395349,Restaurant,Restaurant


In [18]:
lookup_cv_estab = lookups['CV Estab TAZ']
lookup_cv_estab.head()

Unnamed: 0,OBJECTID,Join_Count,TARGET_FID,company_id,company_name,company_location_address,company_location_city,company_location_state,company_location_zipcode,company_location_latitude,company_location_longitude,estab_taz
0,1,1,1,100002,ALTAR,4370 LA JOLLA VILLAGE DR # 655,SAN DIEGO,CA,92122,32.873754,-117.210198,1244
1,2,1,2,100003,John Baker Property Mgmt,405 W 9TH AVE,ESCONDIDO,CA,92025,33.112208,-117.080743,3159
2,3,1,3,100004,OWB RANCHES LLC,512 VIA DE LA VALLE # 310,SOLANA BEACH,CA,92075,32.98037,-117.26082,536
3,4,1,4,100005,"United Sportfishers of San Diego, Inc.",2803 Emerson St,San Diego,CA,92106,32.723522,-117.227602,1073
4,5,1,5,100007,SCRIPPS COASTAL RESERVE,9500 GILMAN DR,LA JOLLA,CA,92093,32.877189,-117.237422,952


In [19]:
lookup_tnc_estab = lookups['TNC Estab TAZ']  # .copy()
lookup_tnc_estab.loc[lookup_tnc_estab['estab_taz'].isnull(), 'estab_taz'] = -1 # Could've been done with fillna too.
lookup_tnc_estab['estab_taz'] = lookup_tnc_estab['estab_taz'].astype(int)
lookup_tnc_estab.head()

Unnamed: 0,OBJECTID,Join_Count,TARGET_FID,company_id,company_name,company_location_address,company_location_city,company_location_state,company_location_zipcode,company_location_latitude,company_location_longitude,estab_taz
0,1,1,1,600026,Uber Eats,1052 Woodlawn Ave,Chula Vista,California,91911,32.610634,-117.086326,2958
1,2,1,2,600027,Amazon Flex,3980 Hatton St,San Diego,California,92111,32.816592,-117.165559,1787
2,3,1,3,600028,Senpex,6881 Alvarado Rd,San Diego,California,92120,32.775523,-117.050791,3502
3,4,1,4,600029,Grub Hub,4437 39th St,San Diego,California,92116,32.757872,-117.110067,2538
4,5,1,5,600031,Uber Eats,5741 Carnegie St,San Diego,California,92122,32.846406,-117.21743,1207


In [20]:
lookup_cv_estab_replace = lookups['CV Rt Rplcmnt Estab TAZ'] #.copy()
lookup_cv_estab_replace.insert(1, 'Date', 0)
lookup_cv_estab_replace.insert(2, 'Veh', 0)
lookup_cv_estab_replace['Date'] = pd.to_datetime(lookup_cv_estab_replace['route_id'].astype(str).str[:8]) #.astype(int)
lookup_cv_estab_replace['Veh'] = lookup_cv_estab_replace['route_id'].astype(str).str[-4:].astype(int)

lookup_cv_estab_replace

Unnamed: 0,route_id,Date,Veh,NewEstab_TAZ,Revision,Unnamed: 3,Unnamed: 4
0,202206280030,2022-06-28,30,3262,OK,,Note: New Estab TAZes have been assigned to ro...
1,202206280035,2022-06-28,35,3163,OK,,"This means, according to this sheet, two vehic..."
2,202206290038,2022-06-29,38,1184,OK,,Example: Routes 202206280030 and 202206280035 ...
3,202207190119,2022-07-19,119,3607,OK,,
4,202207200224,2022-07-20,224,124,OK,,
5,202207200242,2022-07-20,242,1180,OK,,
6,202207270317,2022-07-27,317,73,OK,,
7,202208020513,2022-08-02,513,1248,OK,,
8,202208030497,2022-08-03,497,3431,OK,,
9,202208030501,2022-08-03,501,3431,OK,,


In [21]:
# Get LogisticsNodes
lookup_logistics = lookups['LogisticsNodes']
lookup_logistics

Unnamed: 0,FreightCode,Category,TAZ,Description,Label
0,1,Sea Port,2086,Tenth Ave Marine Terminal and BNSF Freight Yard,Seaport
1,1,Sea Port,1154,Tenth Ave Marine Terminal and BNSF Freight Yard,Seaport
2,1,Sea Port,2497,National City Marine Terminal,Seaport
3,1,Sea Port,2193,Ship Building,Seaport
4,2,Airport Cargo,1294,Airport Infrastructure,Airport
5,2,Airport Cargo,1338,Airport Infrastructure,Airport
6,2,Airport Cargo,1457,Airport Infrastructure,Airport
7,2,Airport Cargo,1476,"UPS, FedEx, DHL",Airport
8,2,Airport Cargo,1485,Air Cargo,Airport
9,2,Airport Cargo,1520,Airport Infrastructure,Airport


### Read Skim Matrices -- two methods, choose one
1. Read OMX files, process, and save as pickle files -- do this only if the pickle files do not already exist or new raw data skims are wanted.
2. Read the pickle files directly, assuming they exist. This is 100 times faster.

In [26]:
# Note: There are 4,947 TAZs and 24,321 MGRAs in the shapefiles.
# The OMX files contain info about TAZs, not MGRAs.
dic_veh_size = {1: 'L', 2: 'M', 3: 'H'}
dic_tod = {1: 'EA', 2: 'AM', 3: 'MD', 4: 'PM', 5: 'EV'}
fns = [f'traffic_skims_{v}.omx' for v in dic_tod.values()] # fns = file names
# print(fns)

# Periods:
# Early:   3AM     6AM
# AM Peak: 6AM     9AM
# Midday:  9AM     3:30PM
# PM Peak: 3:30PM  7PM
# Late:    7PM     3AM

dist_dfs = [[None for j in range(len(dic_veh_size))] for i in range(len(dic_tod))]
# 'dist_dfs' is a list of lists that stores distance dataframes. 
# Each row is for a time of day, and each column is a vehicle size.
# Note: Indices start from zero.

time_dfs = [[None for j in range(len(dic_veh_size))] for i in range(len(dic_tod))]
toll_dfs = [[None for j in range(len(dic_veh_size))] for i in range(len(dic_tod))]
g_tt_dfs = [[None for j in range(len(dic_veh_size))] for i in range(len(dic_tod))]

#### Method 1: Read skims for OMX files, process, and save as pickle files. (Slow)
Change below cell to "code" before running

#### Method 2: Read already processed skims from pickle files. (Fast)
Change below cell to "code" before running

In [27]:
# Read skim matrices from pickle files (fast load)
for i in range(len(dic_tod)):
    for j in range(len(dic_veh_size)):
        in_file = f'{3*i+j+1:02}_{dic_tod[i+1]}_{dic_veh_size[j+1]}_10th_of_miles.pkl'
        in_path = os.path.join(project_path, out_data_dir, 'Distance Skims')
        if not os.path.exists(in_path):
           print(f"Input skims file path not found: \n{in_path}")
        in_path = os.path.join(project_path, out_data_dir, 'Distance Skims', in_file)
        dist_dfs[i][j] = pd.read_pickle(in_path)
        
        if i == 2:
            in_file = f'{3*i+j+1:02}_{dic_tod[i+1]}_{dic_veh_size[j+1]}_TT_Minutes.pkl'
            in_path = os.path.join(project_path, out_data_dir, 'Travel Time Skims')
            if not os.path.exists(in_path):
               print(f"Input skims file path not found: \n{in_path}")
            in_path = os.path.join(project_path, out_data_dir, 'Travel Time Skims', in_file)
            time_dfs[i][j] = pd.read_pickle(in_path)
            
            in_file = f'{3*i+j+1:02}_{dic_tod[i+1]}_{dic_veh_size[j+1]}_Toll_Cents.pkl'
            in_path = os.path.join(project_path, out_data_dir, 'Toll Skims')
            if not os.path.exists(in_path):
               print(f"Input skims file path not found: \n{in_path}")
            in_path = os.path.join(project_path, out_data_dir, 'Toll Skims', in_file)
            toll_dfs[i][j] = pd.read_pickle(in_path)
            
            in_file = f'{3*i+j+1:02}_{dic_tod[i+1]}_{dic_veh_size[j+1]}_GenTT_Minutes.pkl'
            in_path = os.path.join(project_path, out_data_dir, 'Generalized TT Skims')
            if not os.path.exists(in_path):
               print(f"Input skims file path not found: \n{in_path}")
            in_path = os.path.join(project_path, out_data_dir, 'Generalized TT Skims', in_file)
            g_tt_dfs[i][j] = pd.read_pickle(in_path)

In [28]:
print(f'Example 1: Distance from TAZ 5 to TAZ 19 in AM peak for medium trucks is: {dist_dfs[1][1].at[320, 344]/10:} miles\n')
print(f'Example 2: Distance from TAZ 321 to TAZ 345 in mid-day for light trucks is: {dist_dfs[2][0].iloc[320, 344]/10:} miles,',
      '\n', f'          Travel time is: {time_dfs[2][0].iloc[320, 344]} minutes, and', '\n',
      f'          toll is: {toll_dfs[2][0].iloc[320, 344]} cents.\n')
print(f'Example 3: Fastest route between TAZ 3901 and TAZ 4233 uses expressway 125, which is a FasTrak toll road.',
      f'\n           Distance from TAZ 3901 to TAZ 4233 in mid-day for heavy trucks is: {dist_dfs[2][2].iloc[3900, 4232]/10:} miles,',
      '\n', f'          Travel time is: {time_dfs[2][2].iloc[3900, 4232]} minutes,\n',
      f'          Toll is: {toll_dfs[2][2].iloc[3900, 4232]} cents, and\n'
      f'           Generalized travel time is: {g_tt_dfs[2][2].iloc[3900, 4232]} minutes.')

Example 1: Distance from TAZ 5 to TAZ 19 in AM peak for medium trucks is: 0.8 miles

Example 2: Distance from TAZ 321 to TAZ 345 in mid-day for light trucks is: 0.9 miles, 
           Travel time is: 2.8 minutes, and 
           toll is: 0 cents.

Example 3: Fastest route between TAZ 3901 and TAZ 4233 uses expressway 125, which is a FasTrak toll road. 
           Distance from TAZ 3901 to TAZ 4233 in mid-day for heavy trucks is: 7.9 miles, 
           Travel time is: 13.63 minutes,
           Toll is: 375 cents, and
           Generalized travel time is: 16.15 minutes.


#### Read Stop, Establishment, and Vehicle Data

In [29]:
# Read the survey data.
df_original = pd.read_excel(in_data_path, header=0, sheet_name=None)
# df_original is a dictionary of Dfs. Keys are sheetnames, and values are the dataframes in those worksheets.

In [30]:
# # If all we were to do was to read the stop data:
# df_stops = pd.read_excel(in_data_path, header=0, sheet_name='Trip Data')
# df_stops.head(2)

In [31]:
df_stops = df_original['Trip Data'].copy()
df_estab = df_original['Establishment Data'].copy()
df_veh = df_original['Vehicle Data'].copy()
df_stops.head()

Unnamed: 0,unique_id,company_id,vehicle_id,driver_id,trip_number,trip_load_status,trip_load_weight,activity_type,activity_type_other,placetype,placetype_other,location_placename,location_address,location_city,location_state,location_zip,location_latitude,location_longitude,taz,cargo_pickup,cargo_other_pickup,cargo_delivery,cargo_other_delivery,cargo_pu_weight,cargo_do_weight,travel_date,arrival_time,departure_time,used_other_vehicle,participation_type,vehicle_id.1,Lower Estimate Weight Factor,Most Likely Estimate Weight Factor,Upper Estimate Weight Factor
0,1,600026,1371,1744,0,3.0,,14,,11,,HOME,1052 Woodlawn Ave,Chula Vista,California,91911,32.610644,-117.086283,2958,,,,,,,2022-08-29,,13:39:00,2,Smartphone,,,,
1,2,600026,1371,1744,1,,,6,,6,,WALMART,13487 Camino Canada,El Cajon,California,92021,32.822468,-116.901892,4617,5.0,,,,100.0,,2022-08-29,14:03:00,14:33:00,2,Smartphone,,,,
2,3,600026,1371,1744,2,,,5,,11,,HOUSE,12651 Julian Ave,Lakeside,California,92040,32.855417,-116.91758,4564,,,10.0,,,11.0,2022-08-29,14:39:00,14:42:00,2,Smartphone,,,,
3,4,600026,1371,1744,3,,,5,,11,,DROP OFF CUSTOMER,12143 Rockcrest Rd,Lakeside,California,92040,32.846187,-116.92999,4508,,,10.0,,,89.0,2022-08-29,14:54:03,15:01:21,2,Smartphone,,,,
4,5,600026,1371,1744,4,,,6,,5,,RESTAURANT,12038 Woodside Ave,Lakeside,California,92040,32.856083,-116.932653,4465,5.0,,,,45.0,,2022-08-29,15:09:00,15:12:00,2,Smartphone,,,,


In [32]:
df_estab.head()

Unnamed: 0,company_id,"Industry Group_Size Code (Group from Column D; Size 1=0-9 emp, 2=10+)",company_name,vehicle_purpose,number_of_trips,base_location_Industry Group,company_location_address,company_location_city,company_location_state,company_location_zipcode,company_location_latitude,company_location_longitude,employees_fulltime_count,employees_parttime_count,no_of_emp_work,"Number of Employess (1=<10, 2=10+)",total_telecommute_from_home,is_use_tnc,no_of_deliveries,no_of_deliveries_pcsuvpu,no_of_deliveries_su,no_of_deliveries_cu,per_of_deliveries_warehouse,per_of_deliveries_airport,per_of_deliveries_manufacture,per_of_deliveries_retail,per_of_deliveries_service,per_of_deliveries_other,per_of_deliveries_dontknow,per_of_deliveries_text_other,no_of_from_deliveries,no_of_from_deliveries_pcsuvpu,no_of_from_deliveries_su,no_of_from_deliveries_cu,per_of_from_deliveries_warehouse,per_of_from_deliveries_airport,per_of_from_deliveries_manufacture,per_of_from_deliveries_retail,per_of_from_deliveries_service,per_of_from_deliveries_other,per_of_from_deliveries_dontknow,per_of_from_deliveries_text_other,has_vehicles,vehicle_count_total,vehicle_axle_cargotruck,vehicle_semi_cargo,vehicle_cars_count,vehicle_pickuptrucks_count,vehicle_cargovans_count,vehicle_servicevans_count,use_personal_vehicle,vehicle_other_count_total,vehicle_other_axle_cargotruck,vehicle_other_semi_cargo,vehicle_other_cars_count,vehicle_other_pickuptrucks_count,vehicle_other_cargovans_count,vehicle_other_servicevans_count,vehicle_toll_trans_count,Participated in Travel Survey?,LCV Owned or Leased,LCV other,LCV Total,SUT Owned or Leased,SUT other,SUT Total,MUT Owned or Leased,MUT other,MUT Total,TOTAL CVs,number_of_trips.1
0,600026,,Uber Eats,,14,,1052 Woodlawn Ave,Chula Vista,California,91911,32.610634,-117.086326,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,600027,,Amazon Flex,,26,,3980 Hatton St,San Diego,California,92111,32.816592,-117.165559,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,600028,,Senpex,,4,,6881 Alvarado Rd,San Diego,California,92120,32.775523,-117.050791,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,600029,,Grub Hub,,9,,4437 39th St,San Diego,California,92116,32.757872,-117.110067,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,600031,,Uber Eats,,30,,5741 Carnegie St,San Diego,California,92122,32.846406,-117.21743,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [33]:
df_veh.head()

Unnamed: 0,id,company_id,veh_make_other,vehicle_model,vehicle_year,vehicle_type,vehicle_fuel_type,vehicle_miles_per_gallon,has_vehicle_transponder,vehicle_classification,vehicle_weight,odometer,ending_odometer,number_of_trips,Industry Group,"Employee Size (1=0-9, 2=10+)",Vehicle Type,Vehicle Type_Industry_Size Category Code,Lower Estimate Weight Factor,Most Likely Estimate Weight Factor,Upper Estimate Weight Factor
0,1371,600026,TOYOTA,COROLLA,2004,3,1.0,28.0,2.0,1,2524,98110,98188,14,,,,,,,
1,1443,600027,CHEVROLET,VOLT,2017,3,6.0,,2.0,1,3519,77785,77855,26,,,,,,,
2,1395,600028,HYUNDAI,SONATA,2011,3,1.0,27.0,2.0,1,4299,254009,254019,4,,,,,,,
3,1393,600029,TOYOTA,PRIUS,2020,3,6.0,,2.0,1,3040,10000,10031,9,,,,,,,
4,1451,600031,KIA,NIRO,2018,3,6.0,,2.0,1,3200,100600,100644,30,,,,,,,


#### Explore the Vehicle Dataframe

In [34]:
# 'vehicle_classification' codes according to data dictionary:
# 1: Passenger Car or Motorcycle
# 2: Pick-up Truck (4 wheels)
# 3: Van (Cargo/Minivan) (4 wheels)
# 4: Buses 
# 5: Single Unit 2-axle
# 6: Single Unit 3-axle
# 7: Single Unit 4-axle
# 8: Semi (all Tractor-Trailer combinations)
# 96:Other (please specify)

In [35]:
df_veh['vehicle_classification'].unique()

array([1, 3, 2, 4], dtype=int64)

In [36]:
if dataset == 'CV':
    print(df_veh['Vehicle Type'].unique())

In [37]:
if dataset == 'CV':
    pvt_axle_vtype = pd.pivot_table(df_veh, values='id', index='vehicle_classification', columns='Vehicle Type',
                             aggfunc='count') #, sort=True) For sort, default is True.
    print(pvt_axle_vtype)

In [38]:
# Create a conversion dictionary from veh_classification to veh_size.
# This is necessary because the TNC dataset doesn't have the 'Vehicle Type' column - this script
# needs to work for both CV and the TNC datasets.
d1 = {i: 'LCV' for i in range(1, 4)}
d2 = {i: 'SUT' for i in range(4, 8)}
d3 = {8: 'MUT'}
dic_veh_size = {**d1, **d2, **d3}
dic_veh_size

{1: 'LCV',
 2: 'LCV',
 3: 'LCV',
 4: 'SUT',
 5: 'SUT',
 6: 'SUT',
 7: 'SUT',
 8: 'MUT'}

In [39]:
df_veh['veh_size'] = df_veh['vehicle_classification'].map(dic_veh_size)
df_veh[['veh_size', 'vehicle_classification']]

Unnamed: 0,veh_size,vehicle_classification
0,LCV,1
1,LCV,1
2,LCV,1
3,LCV,1
4,LCV,1
5,LCV,3
6,LCV,1
7,LCV,1
8,LCV,1
9,LCV,1


#### Clean the Data

In [40]:
# Decide which columns to keep and rearrange.
cols = ['company_id', 'vehicle_id', 'driver_id', 'trip_number', 'activity_type', 'placetype',
        'location_placename', 'location_address', 'location_longitude', 'location_latitude', 'taz',
        'arrival_time', 'departure_time', 'cargo_pickup', 'cargo_delivery', 'travel_date', 
        'participation_type', 'Most Likely Estimate Weight Factor']
        # 'Lower Estimate Weight Factor', 
        # 'Upper Estimate Weight Factor'

df_stops = df_stops[cols]

# Sort the dataframe
df_stops.sort_values(by=['company_id', 'vehicle_id', 'travel_date', 'trip_number'],
                     inplace=True, ascending=True)
df_stops.head()

Unnamed: 0,company_id,vehicle_id,driver_id,trip_number,activity_type,placetype,location_placename,location_address,location_longitude,location_latitude,taz,arrival_time,departure_time,cargo_pickup,cargo_delivery,travel_date,participation_type,Most Likely Estimate Weight Factor
0,600026,1371,1744,0,14,11,HOME,1052 Woodlawn Ave,-117.086283,32.610644,2958,,13:39:00,,,2022-08-29,Smartphone,
1,600026,1371,1744,1,6,6,WALMART,13487 Camino Canada,-116.901892,32.822468,4617,14:03:00,14:33:00,5.0,,2022-08-29,Smartphone,
2,600026,1371,1744,2,5,11,HOUSE,12651 Julian Ave,-116.91758,32.855417,4564,14:39:00,14:42:00,,10.0,2022-08-29,Smartphone,
3,600026,1371,1744,3,5,11,DROP OFF CUSTOMER,12143 Rockcrest Rd,-116.92999,32.846187,4508,14:54:03,15:01:21,,10.0,2022-08-29,Smartphone,
4,600026,1371,1744,4,6,5,RESTAURANT,12038 Woodside Ave,-116.932653,32.856083,4465,15:09:00,15:12:00,5.0,,2022-08-29,Smartphone,


#### Attach Vehicle Type Info

In [41]:
df_stops = df_stops.merge(df_veh[['id', 'veh_size']], how='left', left_on='vehicle_id', right_on='id')
df_stops.rename(columns={'veh_size': 'veh_type'}, inplace=True)
df_stops.drop(columns='id', inplace=True)
df_stops.head()

Unnamed: 0,company_id,vehicle_id,driver_id,trip_number,activity_type,placetype,location_placename,location_address,location_longitude,location_latitude,taz,arrival_time,departure_time,cargo_pickup,cargo_delivery,travel_date,participation_type,Most Likely Estimate Weight Factor,veh_type
0,600026,1371,1744,0,14,11,HOME,1052 Woodlawn Ave,-117.086283,32.610644,2958,,13:39:00,,,2022-08-29,Smartphone,,LCV
1,600026,1371,1744,1,6,6,WALMART,13487 Camino Canada,-116.901892,32.822468,4617,14:03:00,14:33:00,5.0,,2022-08-29,Smartphone,,LCV
2,600026,1371,1744,2,5,11,HOUSE,12651 Julian Ave,-116.91758,32.855417,4564,14:39:00,14:42:00,,10.0,2022-08-29,Smartphone,,LCV
3,600026,1371,1744,3,5,11,DROP OFF CUSTOMER,12143 Rockcrest Rd,-116.92999,32.846187,4508,14:54:03,15:01:21,,10.0,2022-08-29,Smartphone,,LCV
4,600026,1371,1744,4,6,5,RESTAURANT,12038 Woodside Ave,-116.932653,32.856083,4465,15:09:00,15:12:00,5.0,,2022-08-29,Smartphone,,LCV


#### Attach Establishment TAZ Info

In [42]:
lookup_estab = lookup_cv_estab if dataset == 'CV' else lookup_tnc_estab
df_stops = df_stops.merge(lookup_estab[['company_id', 'estab_taz']], how='left', on='company_id')
df_stops.head()

Unnamed: 0,company_id,vehicle_id,driver_id,trip_number,activity_type,placetype,location_placename,location_address,location_longitude,location_latitude,taz,arrival_time,departure_time,cargo_pickup,cargo_delivery,travel_date,participation_type,Most Likely Estimate Weight Factor,veh_type,estab_taz
0,600026,1371,1744,0,14,11,HOME,1052 Woodlawn Ave,-117.086283,32.610644,2958,,13:39:00,,,2022-08-29,Smartphone,,LCV,2958
1,600026,1371,1744,1,6,6,WALMART,13487 Camino Canada,-116.901892,32.822468,4617,14:03:00,14:33:00,5.0,,2022-08-29,Smartphone,,LCV,2958
2,600026,1371,1744,2,5,11,HOUSE,12651 Julian Ave,-116.91758,32.855417,4564,14:39:00,14:42:00,,10.0,2022-08-29,Smartphone,,LCV,2958
3,600026,1371,1744,3,5,11,DROP OFF CUSTOMER,12143 Rockcrest Rd,-116.92999,32.846187,4508,14:54:03,15:01:21,,10.0,2022-08-29,Smartphone,,LCV,2958
4,600026,1371,1744,4,6,5,RESTAURANT,12038 Woodside Ave,-116.932653,32.856083,4465,15:09:00,15:12:00,5.0,,2022-08-29,Smartphone,,LCV,2958


#### Update establishment TAZ info for stops that have a replacement establishment TAZ

In [43]:
if dataset == 'CV':
    for i, rowStop in df_stops.iterrows():
        for j, rowRep in lookup_cv_estab_replace.iterrows():
            if rowStop['travel_date']==rowRep['Date'] and rowStop['vehicle_id']==rowRep['Veh']:
#                 print(i, j,
#                       f"{df_stops.at[i, 'estab_taz']} was replaced by {lookup_cv_estab_replace.at[j, 'NewEstab_TAZ']}")
                df_stops.at[i, 'estab_taz'] = lookup_cv_estab_replace.at[j, 'NewEstab_TAZ']                
                break     

#### Create the Trip Dataframe Using the Stop Dataframe

In [44]:
df_temp = df_stops.copy()
rename_dic = {
    'activity_type':      'd_act',
    'placetype':          'd_place_type',
    'location_placename': 'd_place_name',
    'location_address':   'd_address',
    'location_longitude': 'd_lon',
    'location_latitude':  'd_lat',
    'taz':                'd_taz',
    'arrival_time':       'd_arr_time',
    'departure_time':     'next_trip_dep_time',
    'Most Likely Estimate Weight Factor': 'expnsn_factor'
}
df_temp.rename(columns=rename_dic, inplace=True)
df_temp.head()

Unnamed: 0,company_id,vehicle_id,driver_id,trip_number,d_act,d_place_type,d_place_name,d_address,d_lon,d_lat,d_taz,d_arr_time,next_trip_dep_time,cargo_pickup,cargo_delivery,travel_date,participation_type,expnsn_factor,veh_type,estab_taz
0,600026,1371,1744,0,14,11,HOME,1052 Woodlawn Ave,-117.086283,32.610644,2958,,13:39:00,,,2022-08-29,Smartphone,,LCV,2958
1,600026,1371,1744,1,6,6,WALMART,13487 Camino Canada,-116.901892,32.822468,4617,14:03:00,14:33:00,5.0,,2022-08-29,Smartphone,,LCV,2958
2,600026,1371,1744,2,5,11,HOUSE,12651 Julian Ave,-116.91758,32.855417,4564,14:39:00,14:42:00,,10.0,2022-08-29,Smartphone,,LCV,2958
3,600026,1371,1744,3,5,11,DROP OFF CUSTOMER,12143 Rockcrest Rd,-116.92999,32.846187,4508,14:54:03,15:01:21,,10.0,2022-08-29,Smartphone,,LCV,2958
4,600026,1371,1744,4,6,5,RESTAURANT,12038 Woodside Ave,-116.932653,32.856083,4465,15:09:00,15:12:00,5.0,,2022-08-29,Smartphone,,LCV,2958


In [45]:
# For TNCs, set missing expansion factors to 1
df_temp['expnsn_factor'] = df_temp['expnsn_factor'].fillna(1)

In [46]:
# Create a dictionary of new columns that should be added.
cols_to_add_1 = ['o_act', 'o_place_type', 'o_lon', 'o_lat', 'o_taz', 'o_dep_time',
                 'travel_time', 'stop_duration',
                 'o_act_seg', 'o_plc_seg', 
                 'd_act_seg', 'd_plc_seg',
                 'trip_dist', 'orgn_to_hq_dist',
                 'hq_taz'
                ]
d1 = dict.fromkeys(cols_to_add_1, 0)

cols_to_add_2 = ['o_place_name', 'o_address',
                 'o_act_seg_name', 'o_plc_seg_name',
                 'd_act_seg_name', 'd_plc_seg_name',
                 'headquarters'
                ]
d2 = dict.fromkeys(cols_to_add_2, "")

d = {**d1, **d2}
d

{'o_act': 0,
 'o_place_type': 0,
 'o_lon': 0,
 'o_lat': 0,
 'o_taz': 0,
 'o_dep_time': 0,
 'travel_time': 0,
 'stop_duration': 0,
 'o_act_seg': 0,
 'o_plc_seg': 0,
 'd_act_seg': 0,
 'd_plc_seg': 0,
 'trip_dist': 0,
 'orgn_to_hq_dist': 0,
 'hq_taz': 0,
 'o_place_name': '',
 'o_address': '',
 'o_act_seg_name': '',
 'o_plc_seg_name': '',
 'd_act_seg_name': '',
 'd_plc_seg_name': '',
 'headquarters': ''}

In [47]:
# Add the new columns with them being initialized by the values of the above dictionary.
df_temp = df_temp.assign(**d)
df_temp.head()

Unnamed: 0,company_id,vehicle_id,driver_id,trip_number,d_act,d_place_type,d_place_name,d_address,d_lon,d_lat,d_taz,d_arr_time,next_trip_dep_time,cargo_pickup,cargo_delivery,travel_date,participation_type,expnsn_factor,veh_type,estab_taz,o_act,o_place_type,o_lon,o_lat,o_taz,o_dep_time,travel_time,stop_duration,o_act_seg,o_plc_seg,d_act_seg,d_plc_seg,trip_dist,orgn_to_hq_dist,hq_taz,o_place_name,o_address,o_act_seg_name,o_plc_seg_name,d_act_seg_name,d_plc_seg_name,headquarters
0,600026,1371,1744,0,14,11,HOME,1052 Woodlawn Ave,-117.086283,32.610644,2958,,13:39:00,,,2022-08-29,Smartphone,1.0,LCV,2958,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,,,,
1,600026,1371,1744,1,6,6,WALMART,13487 Camino Canada,-116.901892,32.822468,4617,14:03:00,14:33:00,5.0,,2022-08-29,Smartphone,1.0,LCV,2958,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,,,,
2,600026,1371,1744,2,5,11,HOUSE,12651 Julian Ave,-116.91758,32.855417,4564,14:39:00,14:42:00,,10.0,2022-08-29,Smartphone,1.0,LCV,2958,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,,,,
3,600026,1371,1744,3,5,11,DROP OFF CUSTOMER,12143 Rockcrest Rd,-116.92999,32.846187,4508,14:54:03,15:01:21,,10.0,2022-08-29,Smartphone,1.0,LCV,2958,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,,,,
4,600026,1371,1744,4,6,5,RESTAURANT,12038 Woodside Ave,-116.932653,32.856083,4465,15:09:00,15:12:00,5.0,,2022-08-29,Smartphone,1.0,LCV,2958,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,,,,


In [48]:
# Rearrange.
cols = ['company_id', 'vehicle_id', 'driver_id', 'trip_number',
        'o_act_seg', 'o_act_seg_name', 'o_plc_seg', 'o_plc_seg_name',
        'd_act_seg', 'd_act_seg_name', 'd_plc_seg', 'd_plc_seg_name', 
        'o_act', 'o_place_type', 'o_place_name', 'o_address', 'o_lon', 'o_lat', 'o_taz',
        'd_act', 'd_place_type', 'd_place_name', 'd_address', 'd_lon', 'd_lat', 'd_taz',
        'trip_dist', 'headquarters', 'hq_taz', 'orgn_to_hq_dist',
        'o_dep_time', 'd_arr_time', 'next_trip_dep_time', 'travel_time', 'stop_duration',        
        'veh_type', 'cargo_pickup', 'cargo_delivery', 'travel_date',
        'participation_type', 'expnsn_factor', 'estab_taz'
       ]
df_temp = df_temp[cols]
df_temp.head()

Unnamed: 0,company_id,vehicle_id,driver_id,trip_number,o_act_seg,o_act_seg_name,o_plc_seg,o_plc_seg_name,d_act_seg,d_act_seg_name,d_plc_seg,d_plc_seg_name,o_act,o_place_type,o_place_name,o_address,o_lon,o_lat,o_taz,d_act,d_place_type,d_place_name,d_address,d_lon,d_lat,d_taz,trip_dist,headquarters,hq_taz,orgn_to_hq_dist,o_dep_time,d_arr_time,next_trip_dep_time,travel_time,stop_duration,veh_type,cargo_pickup,cargo_delivery,travel_date,participation_type,expnsn_factor,estab_taz
0,600026,1371,1744,0,0,,0,,0,,0,,0,0,,,0,0,0,14,11,HOME,1052 Woodlawn Ave,-117.086283,32.610644,2958,0,,0,0,0,,13:39:00,0,0,LCV,,,2022-08-29,Smartphone,1.0,2958
1,600026,1371,1744,1,0,,0,,0,,0,,0,0,,,0,0,0,6,6,WALMART,13487 Camino Canada,-116.901892,32.822468,4617,0,,0,0,0,14:03:00,14:33:00,0,0,LCV,5.0,,2022-08-29,Smartphone,1.0,2958
2,600026,1371,1744,2,0,,0,,0,,0,,0,0,,,0,0,0,5,11,HOUSE,12651 Julian Ave,-116.91758,32.855417,4564,0,,0,0,0,14:39:00,14:42:00,0,0,LCV,,10.0,2022-08-29,Smartphone,1.0,2958
3,600026,1371,1744,3,0,,0,,0,,0,,0,0,,,0,0,0,5,11,DROP OFF CUSTOMER,12143 Rockcrest Rd,-116.92999,32.846187,4508,0,,0,0,0,14:54:03,15:01:21,0,0,LCV,,10.0,2022-08-29,Smartphone,1.0,2958
4,600026,1371,1744,4,0,,0,,0,,0,,0,0,,,0,0,0,6,5,RESTAURANT,12038 Woodside Ave,-116.932653,32.856083,4465,0,,0,0,0,15:09:00,15:12:00,0,0,LCV,5.0,,2022-08-29,Smartphone,1.0,2958


In [49]:
# Add origin information from the previous row.
dic_read_last = {
    'o_act':        'd_act',
    'o_place_type': 'd_place_type',
    'o_place_name': 'd_place_name',
    'o_address':    'd_address',
    'o_lon':        'd_lon',
    'o_lat':        'd_lat',
    'o_taz':        'd_taz',
    'o_dep_time':   'next_trip_dep_time',
    'travel_date':  'travel_date'
}

for k, v in dic_read_last.items():
    df_temp.loc[df_temp['trip_number']!=0, k] = df_temp[v].shift(1)

df_temp['o_taz'] = df_temp['o_taz'].astype('int64')

df_temp.head()

Unnamed: 0,company_id,vehicle_id,driver_id,trip_number,o_act_seg,o_act_seg_name,o_plc_seg,o_plc_seg_name,d_act_seg,d_act_seg_name,d_plc_seg,d_plc_seg_name,o_act,o_place_type,o_place_name,o_address,o_lon,o_lat,o_taz,d_act,d_place_type,d_place_name,d_address,d_lon,d_lat,d_taz,trip_dist,headquarters,hq_taz,orgn_to_hq_dist,o_dep_time,d_arr_time,next_trip_dep_time,travel_time,stop_duration,veh_type,cargo_pickup,cargo_delivery,travel_date,participation_type,expnsn_factor,estab_taz
0,600026,1371,1744,0,0,,0,,0,,0,,0.0,0.0,,,0.0,0.0,0,14,11,HOME,1052 Woodlawn Ave,-117.086283,32.610644,2958,0,,0,0,0,,13:39:00,0,0,LCV,,,2022-08-29,Smartphone,1.0,2958
1,600026,1371,1744,1,0,,0,,0,,0,,14.0,11.0,HOME,1052 Woodlawn Ave,-117.086283,32.610644,2958,6,6,WALMART,13487 Camino Canada,-116.901892,32.822468,4617,0,,0,0,13:39:00,14:03:00,14:33:00,0,0,LCV,5.0,,2022-08-29,Smartphone,1.0,2958
2,600026,1371,1744,2,0,,0,,0,,0,,6.0,6.0,WALMART,13487 Camino Canada,-116.901892,32.822468,4617,5,11,HOUSE,12651 Julian Ave,-116.91758,32.855417,4564,0,,0,0,14:33:00,14:39:00,14:42:00,0,0,LCV,,10.0,2022-08-29,Smartphone,1.0,2958
3,600026,1371,1744,3,0,,0,,0,,0,,5.0,11.0,HOUSE,12651 Julian Ave,-116.91758,32.855417,4564,5,11,DROP OFF CUSTOMER,12143 Rockcrest Rd,-116.92999,32.846187,4508,0,,0,0,14:42:00,14:54:03,15:01:21,0,0,LCV,,10.0,2022-08-29,Smartphone,1.0,2958
4,600026,1371,1744,4,0,,0,,0,,0,,5.0,11.0,DROP OFF CUSTOMER,12143 Rockcrest Rd,-116.92999,32.846187,4508,6,5,RESTAURANT,12038 Woodside Ave,-116.932653,32.856083,4465,0,,0,0,15:01:21,15:09:00,15:12:00,0,0,LCV,5.0,,2022-08-29,Smartphone,1.0,2958


In [50]:
# Remove the first record of each vehicle as it doesn't represent a trip, rather the 
# initial origin of the vehicle, whose critical info has already been stored in the next record.
df_temp = df_temp.loc[df_temp['trip_number']!=0]
df_temp.head()

Unnamed: 0,company_id,vehicle_id,driver_id,trip_number,o_act_seg,o_act_seg_name,o_plc_seg,o_plc_seg_name,d_act_seg,d_act_seg_name,d_plc_seg,d_plc_seg_name,o_act,o_place_type,o_place_name,o_address,o_lon,o_lat,o_taz,d_act,d_place_type,d_place_name,d_address,d_lon,d_lat,d_taz,trip_dist,headquarters,hq_taz,orgn_to_hq_dist,o_dep_time,d_arr_time,next_trip_dep_time,travel_time,stop_duration,veh_type,cargo_pickup,cargo_delivery,travel_date,participation_type,expnsn_factor,estab_taz
1,600026,1371,1744,1,0,,0,,0,,0,,14.0,11.0,HOME,1052 Woodlawn Ave,-117.086283,32.610644,2958,6,6,WALMART,13487 Camino Canada,-116.901892,32.822468,4617,0,,0,0,13:39:00,14:03:00,14:33:00,0,0,LCV,5.0,,2022-08-29,Smartphone,1.0,2958
2,600026,1371,1744,2,0,,0,,0,,0,,6.0,6.0,WALMART,13487 Camino Canada,-116.901892,32.822468,4617,5,11,HOUSE,12651 Julian Ave,-116.91758,32.855417,4564,0,,0,0,14:33:00,14:39:00,14:42:00,0,0,LCV,,10.0,2022-08-29,Smartphone,1.0,2958
3,600026,1371,1744,3,0,,0,,0,,0,,5.0,11.0,HOUSE,12651 Julian Ave,-116.91758,32.855417,4564,5,11,DROP OFF CUSTOMER,12143 Rockcrest Rd,-116.92999,32.846187,4508,0,,0,0,14:42:00,14:54:03,15:01:21,0,0,LCV,,10.0,2022-08-29,Smartphone,1.0,2958
4,600026,1371,1744,4,0,,0,,0,,0,,5.0,11.0,DROP OFF CUSTOMER,12143 Rockcrest Rd,-116.92999,32.846187,4508,6,5,RESTAURANT,12038 Woodside Ave,-116.932653,32.856083,4465,0,,0,0,15:01:21,15:09:00,15:12:00,0,0,LCV,5.0,,2022-08-29,Smartphone,1.0,2958
5,600026,1371,1744,5,0,,0,,0,,0,,6.0,5.0,RESTAURANT,12038 Woodside Ave,-116.932653,32.856083,4465,5,1,OFFICE,11980 Woodside Ave UNIT 5,-116.934696,32.855876,4465,0,,0,0,15:12:00,15:15:00,15:19:00,0,0,LCV,,5.0,2022-08-29,Smartphone,1.0,2958


In [51]:
# Fill the activity and place segment fields.
df_temp['o_act_seg']      = df_temp['o_act'].map(lambda x: dic_at[x][0])
df_temp['o_act_seg_name'] = df_temp['o_act'].map(lambda x: dic_at[x][1])
df_temp['d_act_seg']      = df_temp['d_act'].apply(lambda x: dic_at[x][0])
df_temp['d_act_seg_name'] = df_temp['d_act'].apply(lambda x: dic_at[x][1])

df_temp['o_plc_seg']      = df_temp['o_place_type'].map(lambda x: dic_pt[x][0])
df_temp['o_plc_seg_name'] = df_temp['o_place_type'].map(lambda x: dic_pt[x][1])
df_temp['d_plc_seg']      = df_temp['d_place_type'].apply(lambda x: dic_pt[x][0])
df_temp['d_plc_seg_name'] = df_temp['d_place_type'].apply(lambda x: dic_pt[x][1])

df_temp.head()

Unnamed: 0,company_id,vehicle_id,driver_id,trip_number,o_act_seg,o_act_seg_name,o_plc_seg,o_plc_seg_name,d_act_seg,d_act_seg_name,d_plc_seg,d_plc_seg_name,o_act,o_place_type,o_place_name,o_address,o_lon,o_lat,o_taz,d_act,d_place_type,d_place_name,d_address,d_lon,d_lat,d_taz,trip_dist,headquarters,hq_taz,orgn_to_hq_dist,o_dep_time,d_arr_time,next_trip_dep_time,travel_time,stop_duration,veh_type,cargo_pickup,cargo_delivery,travel_date,participation_type,expnsn_factor,estab_taz
1,600026,1371,1744,1,H,Home,1,Residential,P,Goods_Pickup,5,Retail and Restaurant,14.0,11.0,HOME,1052 Woodlawn Ave,-117.086283,32.610644,2958,6,6,WALMART,13487 Camino Canada,-116.901892,32.822468,4617,0,,0,0,13:39:00,14:03:00,14:33:00,0,0,LCV,5.0,,2022-08-29,Smartphone,1.0,2958
2,600026,1371,1744,2,P,Goods_Pickup,5,Retail and Restaurant,D,Goods_Delivery,1,Residential,6.0,6.0,WALMART,13487 Camino Canada,-116.901892,32.822468,4617,5,11,HOUSE,12651 Julian Ave,-116.91758,32.855417,4564,0,,0,0,14:33:00,14:39:00,14:42:00,0,0,LCV,,10.0,2022-08-29,Smartphone,1.0,2958
3,600026,1371,1744,3,D,Goods_Delivery,1,Residential,D,Goods_Delivery,1,Residential,5.0,11.0,HOUSE,12651 Julian Ave,-116.91758,32.855417,4564,5,11,DROP OFF CUSTOMER,12143 Rockcrest Rd,-116.92999,32.846187,4508,0,,0,0,14:42:00,14:54:03,15:01:21,0,0,LCV,,10.0,2022-08-29,Smartphone,1.0,2958
4,600026,1371,1744,4,D,Goods_Delivery,1,Residential,P,Goods_Pickup,5,Retail and Restaurant,5.0,11.0,DROP OFF CUSTOMER,12143 Rockcrest Rd,-116.92999,32.846187,4508,6,5,RESTAURANT,12038 Woodside Ave,-116.932653,32.856083,4465,0,,0,0,15:01:21,15:09:00,15:12:00,0,0,LCV,5.0,,2022-08-29,Smartphone,1.0,2958
5,600026,1371,1744,5,P,Goods_Pickup,5,Retail and Restaurant,D,Goods_Delivery,2,Office,6.0,5.0,RESTAURANT,12038 Woodside Ave,-116.932653,32.856083,4465,5,1,OFFICE,11980 Woodside Ave UNIT 5,-116.934696,32.855876,4465,0,,0,0,15:12:00,15:15:00,15:19:00,0,0,LCV,,5.0,2022-08-29,Smartphone,1.0,2958


In [52]:
# df_temp.loc[(df_temp['d_act_seg']=='B')&(df_temp['d_taz']!=df_temp['estab_taz'])].head(1000)

In [53]:
df_temp.groupby(['d_act_seg', 'd_act_seg_name']).size()

d_act_seg  d_act_seg_name   
B          Base                  168
D          Goods_Delivery       2656
H          Home                  154
M          Maintenance/Other    1123
P          Goods_Pickup         1053
S          Service                99
dtype: int64

##### Fix Activity Type 'Base' If Stop's TAZ Is Not Equal to Establishment TAZ

In [54]:
df_temp.loc[(df_temp['o_act_seg']=='B')&
            (df_temp['o_taz']!=df_temp['estab_taz'])&
            (df_temp['o_act'].isin([12])), ['o_act_seg', 'o_act_seg_name']
           ] = 'P', 'Goods_Pickup'

df_temp.loc[(df_temp['o_act_seg']=='B')&
            (df_temp['o_taz']!=df_temp['estab_taz'])&
            (df_temp['o_act'].isin([13])), ['o_act_seg', 'o_act_seg_name']
           ] = 'D', 'Goods_Delivery'

df_temp.loc[(df_temp['o_act_seg']=='B')&
            (df_temp['o_taz']!=df_temp['estab_taz'])&
            (~df_temp['o_act'].isin([12, 13])), ['o_act_seg', 'o_act_seg_name']
           ] = 'M', 'Maintenance/Other'

df_temp.loc[(df_temp['d_act_seg']=='B')&
            (df_temp['d_taz']!=df_temp['estab_taz'])&
            (df_temp['d_act'].isin([12])), ['d_act_seg', 'd_act_seg_name']
           ] = 'P', 'Goods_Pickup'

df_temp.loc[(df_temp['d_act_seg']=='B')&
            (df_temp['d_taz']!=df_temp['estab_taz'])&
            (df_temp['d_act'].isin([13])), ['d_act_seg', 'd_act_seg_name']
           ] = 'D', 'Goods_Delivery'

df_temp.loc[(df_temp['d_act_seg']=='B')&
            (df_temp['d_taz']!=df_temp['estab_taz'])&
            (~df_temp['d_act'].isin([12, 13])), ['d_act_seg', 'd_act_seg_name']
           ] = 'M', 'Maintenance/Other'

In [55]:
# df_temp.loc[df_temp['d_taz']!=df_temp['estab_taz']].head(1000)

In [56]:
df_temp.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5253 entries, 1 to 5663
Data columns (total 42 columns):
company_id            5253 non-null int64
vehicle_id            5253 non-null int64
driver_id             5253 non-null int64
trip_number           5253 non-null int64
o_act_seg             5253 non-null object
o_act_seg_name        5253 non-null object
o_plc_seg             5253 non-null int64
o_plc_seg_name        5253 non-null object
d_act_seg             5253 non-null object
d_act_seg_name        5253 non-null object
d_plc_seg             5253 non-null int64
d_plc_seg_name        5253 non-null object
o_act                 5253 non-null float64
o_place_type          5253 non-null float64
o_place_name          5253 non-null object
o_address             5253 non-null object
o_lon                 5253 non-null float64
o_lat                 5253 non-null float64
o_taz                 5253 non-null int64
d_act                 5253 non-null int64
d_place_type          5253 non-null i

##### Include the industry of establishments

In [57]:
if dataset == 'CV':
    df_temp = df_temp.merge(df_estab[['company_id', 'base_location_Industry Group']], how='left', on='company_id')
    df_temp.rename(columns={'base_location_Industry Group': 'industry_code'}, inplace=True)
    df_temp = df_temp.merge(lookup_ind[['industry_code', 'industry_group']], how='left', on='industry_code')
    temp = df_temp.pop('industry_group')
    df_temp.insert(1, 'industry_group', temp) 

In [58]:
def omit_spaces(s):
    return s.replace(' ', '')

if dataset == 'TNC':
    df_temp = df_temp.merge(df_estab[['company_id', 'company_name']], how='left', on='company_id')
    
    # Some of the company names have leading or lagging spaces in their names.
    df_temp['company_name'] = df_temp['company_name'].apply(str.strip).apply(str.lower).apply(omit_spaces)
    df_temp['company_name'] = df_temp['company_name'].map(dic_tnc_names)
        
    # Figure out the industry groups.
    df_temp.insert(1, 'industry_group', "")
    df_temp['industry_group'] = df_temp['company_name'].map(dic_tnc_categories)
    
    df_temp['industry_group'] = 'TNC_' + df_temp['industry_group']

In [59]:
df_temp.head()

Unnamed: 0,company_id,industry_group,vehicle_id,driver_id,trip_number,o_act_seg,o_act_seg_name,o_plc_seg,o_plc_seg_name,d_act_seg,d_act_seg_name,d_plc_seg,d_plc_seg_name,o_act,o_place_type,o_place_name,o_address,o_lon,o_lat,o_taz,d_act,d_place_type,d_place_name,d_address,d_lon,d_lat,d_taz,trip_dist,headquarters,hq_taz,orgn_to_hq_dist,o_dep_time,d_arr_time,next_trip_dep_time,travel_time,stop_duration,veh_type,cargo_pickup,cargo_delivery,travel_date,participation_type,expnsn_factor,estab_taz,company_name
0,600026,TNC_Restaurant,1371,1744,1,H,Home,1,Residential,P,Goods_Pickup,5,Retail and Restaurant,14.0,11.0,HOME,1052 Woodlawn Ave,-117.086283,32.610644,2958,6,6,WALMART,13487 Camino Canada,-116.901892,32.822468,4617,0,,0,0,13:39:00,14:03:00,14:33:00,0,0,LCV,5.0,,2022-08-29,Smartphone,1.0,2958,Uber Eats
1,600026,TNC_Restaurant,1371,1744,2,P,Goods_Pickup,5,Retail and Restaurant,D,Goods_Delivery,1,Residential,6.0,6.0,WALMART,13487 Camino Canada,-116.901892,32.822468,4617,5,11,HOUSE,12651 Julian Ave,-116.91758,32.855417,4564,0,,0,0,14:33:00,14:39:00,14:42:00,0,0,LCV,,10.0,2022-08-29,Smartphone,1.0,2958,Uber Eats
2,600026,TNC_Restaurant,1371,1744,3,D,Goods_Delivery,1,Residential,D,Goods_Delivery,1,Residential,5.0,11.0,HOUSE,12651 Julian Ave,-116.91758,32.855417,4564,5,11,DROP OFF CUSTOMER,12143 Rockcrest Rd,-116.92999,32.846187,4508,0,,0,0,14:42:00,14:54:03,15:01:21,0,0,LCV,,10.0,2022-08-29,Smartphone,1.0,2958,Uber Eats
3,600026,TNC_Restaurant,1371,1744,4,D,Goods_Delivery,1,Residential,P,Goods_Pickup,5,Retail and Restaurant,5.0,11.0,DROP OFF CUSTOMER,12143 Rockcrest Rd,-116.92999,32.846187,4508,6,5,RESTAURANT,12038 Woodside Ave,-116.932653,32.856083,4465,0,,0,0,15:01:21,15:09:00,15:12:00,0,0,LCV,5.0,,2022-08-29,Smartphone,1.0,2958,Uber Eats
4,600026,TNC_Restaurant,1371,1744,5,P,Goods_Pickup,5,Retail and Restaurant,D,Goods_Delivery,2,Office,6.0,5.0,RESTAURANT,12038 Woodside Ave,-116.932653,32.856083,4465,5,1,OFFICE,11980 Woodside Ave UNIT 5,-116.934696,32.855876,4465,0,,0,0,15:12:00,15:15:00,15:19:00,0,0,LCV,,5.0,2022-08-29,Smartphone,1.0,2958,Uber Eats


In [60]:
df_trips = df_temp.copy()
del df_temp
# gc.collect()

##### Clean the time fields

Glossary:<br>
|Field          | Meaning|
|:---------------|:--------------------|
|str_td | Travel Date as String | 
|str_o_dt|      Departure Time from the Origin as String|
|str_d_at |     Arrival Time at the Destination as String|
|str_nt_dt |    Next Trip Departure Time as String|
|o_dt       |   Departure Time from the Origin as TimeStamp|
|d_at        |  Arrival Time at the Destination as TimeStamp|
|nt_dt        | Next Trip Departure Time as TimeStamp|

In [61]:
print(df_trips['travel_date'].dtypes)
print(df_trips['o_dep_time'].dtypes)
print(df_trips['d_arr_time'].dtypes)
print(df_trips['next_trip_dep_time'].dtypes)

datetime64[ns]
object
object
object


In [62]:
df_trips['str_td'] = df_trips['travel_date'].dt.strftime('%Y-%m-%d')    # str_td = travel date as string
df_trips.head()

Unnamed: 0,company_id,industry_group,vehicle_id,driver_id,trip_number,o_act_seg,o_act_seg_name,o_plc_seg,o_plc_seg_name,d_act_seg,d_act_seg_name,d_plc_seg,d_plc_seg_name,o_act,o_place_type,o_place_name,o_address,o_lon,o_lat,o_taz,d_act,d_place_type,d_place_name,d_address,d_lon,d_lat,d_taz,trip_dist,headquarters,hq_taz,orgn_to_hq_dist,o_dep_time,d_arr_time,next_trip_dep_time,travel_time,stop_duration,veh_type,cargo_pickup,cargo_delivery,travel_date,participation_type,expnsn_factor,estab_taz,company_name,str_td
0,600026,TNC_Restaurant,1371,1744,1,H,Home,1,Residential,P,Goods_Pickup,5,Retail and Restaurant,14.0,11.0,HOME,1052 Woodlawn Ave,-117.086283,32.610644,2958,6,6,WALMART,13487 Camino Canada,-116.901892,32.822468,4617,0,,0,0,13:39:00,14:03:00,14:33:00,0,0,LCV,5.0,,2022-08-29,Smartphone,1.0,2958,Uber Eats,2022-08-29
1,600026,TNC_Restaurant,1371,1744,2,P,Goods_Pickup,5,Retail and Restaurant,D,Goods_Delivery,1,Residential,6.0,6.0,WALMART,13487 Camino Canada,-116.901892,32.822468,4617,5,11,HOUSE,12651 Julian Ave,-116.91758,32.855417,4564,0,,0,0,14:33:00,14:39:00,14:42:00,0,0,LCV,,10.0,2022-08-29,Smartphone,1.0,2958,Uber Eats,2022-08-29
2,600026,TNC_Restaurant,1371,1744,3,D,Goods_Delivery,1,Residential,D,Goods_Delivery,1,Residential,5.0,11.0,HOUSE,12651 Julian Ave,-116.91758,32.855417,4564,5,11,DROP OFF CUSTOMER,12143 Rockcrest Rd,-116.92999,32.846187,4508,0,,0,0,14:42:00,14:54:03,15:01:21,0,0,LCV,,10.0,2022-08-29,Smartphone,1.0,2958,Uber Eats,2022-08-29
3,600026,TNC_Restaurant,1371,1744,4,D,Goods_Delivery,1,Residential,P,Goods_Pickup,5,Retail and Restaurant,5.0,11.0,DROP OFF CUSTOMER,12143 Rockcrest Rd,-116.92999,32.846187,4508,6,5,RESTAURANT,12038 Woodside Ave,-116.932653,32.856083,4465,0,,0,0,15:01:21,15:09:00,15:12:00,0,0,LCV,5.0,,2022-08-29,Smartphone,1.0,2958,Uber Eats,2022-08-29
4,600026,TNC_Restaurant,1371,1744,5,P,Goods_Pickup,5,Retail and Restaurant,D,Goods_Delivery,2,Office,6.0,5.0,RESTAURANT,12038 Woodside Ave,-116.932653,32.856083,4465,5,1,OFFICE,11980 Woodside Ave UNIT 5,-116.934696,32.855876,4465,0,,0,0,15:12:00,15:15:00,15:19:00,0,0,LCV,,5.0,2022-08-29,Smartphone,1.0,2958,Uber Eats,2022-08-29


In [63]:
df_trips['o_dep_time'].describe()

count         5253
unique        2129
top       15:00:00
freq            14
Name: o_dep_time, dtype: object

In [64]:
if dataset == 'CV':
    df_trips['str_o_dt'] =  df_trips['o_dep_time'].astype(str)
    print(f"Max length of column str_o_dt: {max(df_trips['str_o_dt'].str.len())}")
    print(f"Min length of column str_o_dt: {min(df_trips['str_o_dt'].str.len())}\n")

    df_trips['str_d_at'] =  df_trips['d_arr_time'].astype(str)
    print(f"Max length of column str_d_at: {max(df_trips['str_d_at'].str.len())}")
    print(f"Min length of column str_d_at: {min(df_trips['str_d_at'].str.len())}")
    df_trips.loc[df_trips['str_d_at'].str.len()>8]
    df_trips['str_d_at'] =df_trips['str_d_at'].str[-8:]
    print(f"Max length of column str_d_at: {max(df_trips['str_d_at'].str.len())}, after cleaning")
    print(f"Min length of column str_d_at: {min(df_trips['str_d_at'].str.len())}, after cleaning\n")

    df_trips['str_nt_dt'] =  df_trips['next_trip_dep_time'].astype(str)
    print(f"Max length of column str_nt_dt: {max(df_trips['str_nt_dt'].str.len())}")
    print(f"Min length of column str_nt_dt: {min(df_trips['str_nt_dt'].str.len())}")
    df_trips.loc[df_trips['str_nt_dt'].str.len()<8].head(2)

In [65]:
if dataset == 'TNC':
    df_trips['str_o_dt'] =  df_trips['o_dep_time'].astype(str)
    print('Before:\n', df_trips['str_o_dt'].loc[~df_trips['str_o_dt'].str.len().isin([8, 15])])
    df_trips['str_o_dt'] = df_trips['str_o_dt'].\
    apply(lambda x: x[11:] if x[:10]=='1900-01-01' else x)
    print('\nAfter:\n', df_trips['str_o_dt'].loc[~df_trips['str_o_dt'].str.len().isin([8, 15])])
    df_trips['str_o_dt'].loc[df_trips['str_o_dt'].str.len()!=8]
    df_trips['str_o_dt'].loc[df_trips['str_o_dt'].str.len()==8]
    df_trips.loc[df_trips['str_o_dt'].str.len()!=8, 'str_o_dt'] = df_trips['str_o_dt'].str[:-7]
    print(f"\n'o_dt' data are now clean: " +
          f"{len(df_trips.loc[df_trips['str_o_dt'].str.len()==8]) == len(df_trips)}\n\n")
    
    df_trips['str_d_at'] =  df_trips['d_arr_time'].astype(str)
    print('Before:\n', df_trips['str_d_at'].loc[~df_trips['str_d_at'].str.len().isin([8, 15])])
    df_trips['str_d_at'] = df_trips['str_d_at'].\
    apply(lambda x: x[11:] if x[:10]=='1900-01-01' else x)
    print('\nAfter:\n', df_trips['str_d_at'].loc[~df_trips['str_d_at'].str.len().isin([8, 15])])
    df_trips['str_d_at'].loc[df_trips['str_d_at'].str.len()!=8]
    df_trips['str_d_at'].loc[df_trips['str_d_at'].str.len()==8]
    df_trips.loc[df_trips['str_d_at'].str.len()!=8, 'str_d_at'] = df_trips['str_d_at'].str[:-7]
    print(f"\n'd_at' data are now clean: " +
          f"{len(df_trips.loc[df_trips['str_d_at'].str.len()==8]) == len(df_trips)}")

    df_trips['str_nt_dt'] =  df_trips['next_trip_dep_time'].astype(str)
    print('Before:\n', df_trips['str_nt_dt'].loc[~df_trips['str_nt_dt'].str.len().isin([3, 8, 15])])
    df_trips['str_nt_dt'] = df_trips['str_nt_dt'].\
    apply(lambda x: x[11:] if x[:10]=='1900-01-01' else x)
    print('\nAfter:\n', df_trips['str_nt_dt'].loc[~df_trips['str_nt_dt'].str.len().isin([3, 8, 15])])
    df_trips['str_nt_dt'].loc[~df_trips['str_nt_dt'].str.len().isin([3, 8])]
    df_trips['str_nt_dt'].loc[df_trips['str_nt_dt'].str.len()==8]
    df_trips.loc[df_trips['str_nt_dt'].str.len()==15, 'str_nt_dt'] = df_trips['str_nt_dt'].str[:-7]
    actual_times = len(df_trips.loc[df_trips['str_nt_dt'].str.len()==8])
    nans = len(df_trips.loc[df_trips['str_nt_dt'].str.len()==3])    
    print(f"\n'nt_dt' data are now clean: " +
          f"{actual_times+nans==len(df_trips)}\n")
    print(f'Number of recorded times: {actual_times}')
    print(f'Number of NANs: {nans}')
    print(f'Total trips in the dataframe: {len(df_trips)}')

Before:
 1861           1900-01-01 00:04:24
1862    1900-01-01 00:10:09.600000
Name: str_o_dt, dtype: object

After:
 Series([], Name: str_o_dt, dtype: object)

'o_dt' data are now clean: True


Before:
 401            1900-01-01 00:00:00
881            1900-01-01 00:17:00
1013           1900-01-01 00:00:00
1860    1900-01-01 00:01:31.200000
1861    1900-01-01 00:07:16.800000
1862    1900-01-01 00:23:02.400000
2050           1900-01-01 00:00:00
3124           1900-01-01 00:27:00
3209           1900-01-01 00:16:59
3641           1900-01-01 00:02:59
3726           1900-01-01 00:00:00
4679           1900-01-01 00:03:59
4947           1900-01-01 00:04:59
5188           1900-01-01 00:00:59
Name: str_d_at, dtype: object

After:
 Series([], Name: str_d_at, dtype: object)

'd_at' data are now clean: True
Before:
 1860           1900-01-01 00:04:24
1861    1900-01-01 00:10:09.600000
Name: str_nt_dt, dtype: object

After:
 Series([], Name: str_nt_dt, dtype: object)

'nt_dt' data are now clean: T

In [66]:
df_trips['o_dt'] = df_trips['str_td'] + ' ' + df_trips['str_o_dt']
df_trips['o_dt'] = pd.to_datetime(df_trips['o_dt'])
print(type(df_trips['o_dt']))
print(type(df_trips['o_dt'][0]))
df_trips.head()

<class 'pandas.core.series.Series'>
<class 'pandas._libs.tslibs.timestamps.Timestamp'>


Unnamed: 0,company_id,industry_group,vehicle_id,driver_id,trip_number,o_act_seg,o_act_seg_name,o_plc_seg,o_plc_seg_name,d_act_seg,d_act_seg_name,d_plc_seg,d_plc_seg_name,o_act,o_place_type,o_place_name,o_address,o_lon,o_lat,o_taz,d_act,d_place_type,d_place_name,d_address,d_lon,d_lat,d_taz,trip_dist,headquarters,hq_taz,orgn_to_hq_dist,o_dep_time,d_arr_time,next_trip_dep_time,travel_time,stop_duration,veh_type,cargo_pickup,cargo_delivery,travel_date,participation_type,expnsn_factor,estab_taz,company_name,str_td,str_o_dt,str_d_at,str_nt_dt,o_dt
0,600026,TNC_Restaurant,1371,1744,1,H,Home,1,Residential,P,Goods_Pickup,5,Retail and Restaurant,14.0,11.0,HOME,1052 Woodlawn Ave,-117.086283,32.610644,2958,6,6,WALMART,13487 Camino Canada,-116.901892,32.822468,4617,0,,0,0,13:39:00,14:03:00,14:33:00,0,0,LCV,5.0,,2022-08-29,Smartphone,1.0,2958,Uber Eats,2022-08-29,13:39:00,14:03:00,14:33:00,2022-08-29 13:39:00
1,600026,TNC_Restaurant,1371,1744,2,P,Goods_Pickup,5,Retail and Restaurant,D,Goods_Delivery,1,Residential,6.0,6.0,WALMART,13487 Camino Canada,-116.901892,32.822468,4617,5,11,HOUSE,12651 Julian Ave,-116.91758,32.855417,4564,0,,0,0,14:33:00,14:39:00,14:42:00,0,0,LCV,,10.0,2022-08-29,Smartphone,1.0,2958,Uber Eats,2022-08-29,14:33:00,14:39:00,14:42:00,2022-08-29 14:33:00
2,600026,TNC_Restaurant,1371,1744,3,D,Goods_Delivery,1,Residential,D,Goods_Delivery,1,Residential,5.0,11.0,HOUSE,12651 Julian Ave,-116.91758,32.855417,4564,5,11,DROP OFF CUSTOMER,12143 Rockcrest Rd,-116.92999,32.846187,4508,0,,0,0,14:42:00,14:54:03,15:01:21,0,0,LCV,,10.0,2022-08-29,Smartphone,1.0,2958,Uber Eats,2022-08-29,14:42:00,14:54:03,15:01:21,2022-08-29 14:42:00
3,600026,TNC_Restaurant,1371,1744,4,D,Goods_Delivery,1,Residential,P,Goods_Pickup,5,Retail and Restaurant,5.0,11.0,DROP OFF CUSTOMER,12143 Rockcrest Rd,-116.92999,32.846187,4508,6,5,RESTAURANT,12038 Woodside Ave,-116.932653,32.856083,4465,0,,0,0,15:01:21,15:09:00,15:12:00,0,0,LCV,5.0,,2022-08-29,Smartphone,1.0,2958,Uber Eats,2022-08-29,15:01:21,15:09:00,15:12:00,2022-08-29 15:01:21
4,600026,TNC_Restaurant,1371,1744,5,P,Goods_Pickup,5,Retail and Restaurant,D,Goods_Delivery,2,Office,6.0,5.0,RESTAURANT,12038 Woodside Ave,-116.932653,32.856083,4465,5,1,OFFICE,11980 Woodside Ave UNIT 5,-116.934696,32.855876,4465,0,,0,0,15:12:00,15:15:00,15:19:00,0,0,LCV,,5.0,2022-08-29,Smartphone,1.0,2958,Uber Eats,2022-08-29,15:12:00,15:15:00,15:19:00,2022-08-29 15:12:00


In [67]:
df_trips['d_at'] = df_trips['str_td'] + ' ' + df_trips['str_d_at']
df_trips['d_at'] = pd.to_datetime(df_trips['d_at'])
print(type(df_trips['d_at']))
print(type(df_trips['d_at'][0]))
print(f"Number of NAs in this column: {df_trips['d_at'].isna().sum()}")
df_trips.head()

<class 'pandas.core.series.Series'>
<class 'pandas._libs.tslibs.timestamps.Timestamp'>
Number of NAs in this column: 0


Unnamed: 0,company_id,industry_group,vehicle_id,driver_id,trip_number,o_act_seg,o_act_seg_name,o_plc_seg,o_plc_seg_name,d_act_seg,d_act_seg_name,d_plc_seg,d_plc_seg_name,o_act,o_place_type,o_place_name,o_address,o_lon,o_lat,o_taz,d_act,d_place_type,d_place_name,d_address,d_lon,d_lat,d_taz,trip_dist,headquarters,hq_taz,orgn_to_hq_dist,o_dep_time,d_arr_time,next_trip_dep_time,travel_time,stop_duration,veh_type,cargo_pickup,cargo_delivery,travel_date,participation_type,expnsn_factor,estab_taz,company_name,str_td,str_o_dt,str_d_at,str_nt_dt,o_dt,d_at
0,600026,TNC_Restaurant,1371,1744,1,H,Home,1,Residential,P,Goods_Pickup,5,Retail and Restaurant,14.0,11.0,HOME,1052 Woodlawn Ave,-117.086283,32.610644,2958,6,6,WALMART,13487 Camino Canada,-116.901892,32.822468,4617,0,,0,0,13:39:00,14:03:00,14:33:00,0,0,LCV,5.0,,2022-08-29,Smartphone,1.0,2958,Uber Eats,2022-08-29,13:39:00,14:03:00,14:33:00,2022-08-29 13:39:00,2022-08-29 14:03:00
1,600026,TNC_Restaurant,1371,1744,2,P,Goods_Pickup,5,Retail and Restaurant,D,Goods_Delivery,1,Residential,6.0,6.0,WALMART,13487 Camino Canada,-116.901892,32.822468,4617,5,11,HOUSE,12651 Julian Ave,-116.91758,32.855417,4564,0,,0,0,14:33:00,14:39:00,14:42:00,0,0,LCV,,10.0,2022-08-29,Smartphone,1.0,2958,Uber Eats,2022-08-29,14:33:00,14:39:00,14:42:00,2022-08-29 14:33:00,2022-08-29 14:39:00
2,600026,TNC_Restaurant,1371,1744,3,D,Goods_Delivery,1,Residential,D,Goods_Delivery,1,Residential,5.0,11.0,HOUSE,12651 Julian Ave,-116.91758,32.855417,4564,5,11,DROP OFF CUSTOMER,12143 Rockcrest Rd,-116.92999,32.846187,4508,0,,0,0,14:42:00,14:54:03,15:01:21,0,0,LCV,,10.0,2022-08-29,Smartphone,1.0,2958,Uber Eats,2022-08-29,14:42:00,14:54:03,15:01:21,2022-08-29 14:42:00,2022-08-29 14:54:03
3,600026,TNC_Restaurant,1371,1744,4,D,Goods_Delivery,1,Residential,P,Goods_Pickup,5,Retail and Restaurant,5.0,11.0,DROP OFF CUSTOMER,12143 Rockcrest Rd,-116.92999,32.846187,4508,6,5,RESTAURANT,12038 Woodside Ave,-116.932653,32.856083,4465,0,,0,0,15:01:21,15:09:00,15:12:00,0,0,LCV,5.0,,2022-08-29,Smartphone,1.0,2958,Uber Eats,2022-08-29,15:01:21,15:09:00,15:12:00,2022-08-29 15:01:21,2022-08-29 15:09:00
4,600026,TNC_Restaurant,1371,1744,5,P,Goods_Pickup,5,Retail and Restaurant,D,Goods_Delivery,2,Office,6.0,5.0,RESTAURANT,12038 Woodside Ave,-116.932653,32.856083,4465,5,1,OFFICE,11980 Woodside Ave UNIT 5,-116.934696,32.855876,4465,0,,0,0,15:12:00,15:15:00,15:19:00,0,0,LCV,,5.0,2022-08-29,Smartphone,1.0,2958,Uber Eats,2022-08-29,15:12:00,15:15:00,15:19:00,2022-08-29 15:12:00,2022-08-29 15:15:00


In [68]:
df_trips['nt_dt'] = df_trips['str_td'] + ' ' + df_trips['str_nt_dt']
df_trips['nt_dt'] = pd.to_datetime(df_trips['nt_dt'], errors='coerce')
print(type(df_trips['nt_dt']))
print(type(df_trips['nt_dt'][0]))
print(f"Number of NaTs in this column: {df_trips['nt_dt'].isna().sum()}")
df_trips.head()

<class 'pandas.core.series.Series'>
<class 'pandas._libs.tslibs.timestamps.Timestamp'>
Number of NaTs in this column: 411


Unnamed: 0,company_id,industry_group,vehicle_id,driver_id,trip_number,o_act_seg,o_act_seg_name,o_plc_seg,o_plc_seg_name,d_act_seg,d_act_seg_name,d_plc_seg,d_plc_seg_name,o_act,o_place_type,o_place_name,o_address,o_lon,o_lat,o_taz,d_act,d_place_type,d_place_name,d_address,d_lon,d_lat,d_taz,trip_dist,headquarters,hq_taz,orgn_to_hq_dist,o_dep_time,d_arr_time,next_trip_dep_time,travel_time,stop_duration,veh_type,cargo_pickup,cargo_delivery,travel_date,participation_type,expnsn_factor,estab_taz,company_name,str_td,str_o_dt,str_d_at,str_nt_dt,o_dt,d_at,nt_dt
0,600026,TNC_Restaurant,1371,1744,1,H,Home,1,Residential,P,Goods_Pickup,5,Retail and Restaurant,14.0,11.0,HOME,1052 Woodlawn Ave,-117.086283,32.610644,2958,6,6,WALMART,13487 Camino Canada,-116.901892,32.822468,4617,0,,0,0,13:39:00,14:03:00,14:33:00,0,0,LCV,5.0,,2022-08-29,Smartphone,1.0,2958,Uber Eats,2022-08-29,13:39:00,14:03:00,14:33:00,2022-08-29 13:39:00,2022-08-29 14:03:00,2022-08-29 14:33:00
1,600026,TNC_Restaurant,1371,1744,2,P,Goods_Pickup,5,Retail and Restaurant,D,Goods_Delivery,1,Residential,6.0,6.0,WALMART,13487 Camino Canada,-116.901892,32.822468,4617,5,11,HOUSE,12651 Julian Ave,-116.91758,32.855417,4564,0,,0,0,14:33:00,14:39:00,14:42:00,0,0,LCV,,10.0,2022-08-29,Smartphone,1.0,2958,Uber Eats,2022-08-29,14:33:00,14:39:00,14:42:00,2022-08-29 14:33:00,2022-08-29 14:39:00,2022-08-29 14:42:00
2,600026,TNC_Restaurant,1371,1744,3,D,Goods_Delivery,1,Residential,D,Goods_Delivery,1,Residential,5.0,11.0,HOUSE,12651 Julian Ave,-116.91758,32.855417,4564,5,11,DROP OFF CUSTOMER,12143 Rockcrest Rd,-116.92999,32.846187,4508,0,,0,0,14:42:00,14:54:03,15:01:21,0,0,LCV,,10.0,2022-08-29,Smartphone,1.0,2958,Uber Eats,2022-08-29,14:42:00,14:54:03,15:01:21,2022-08-29 14:42:00,2022-08-29 14:54:03,2022-08-29 15:01:21
3,600026,TNC_Restaurant,1371,1744,4,D,Goods_Delivery,1,Residential,P,Goods_Pickup,5,Retail and Restaurant,5.0,11.0,DROP OFF CUSTOMER,12143 Rockcrest Rd,-116.92999,32.846187,4508,6,5,RESTAURANT,12038 Woodside Ave,-116.932653,32.856083,4465,0,,0,0,15:01:21,15:09:00,15:12:00,0,0,LCV,5.0,,2022-08-29,Smartphone,1.0,2958,Uber Eats,2022-08-29,15:01:21,15:09:00,15:12:00,2022-08-29 15:01:21,2022-08-29 15:09:00,2022-08-29 15:12:00
4,600026,TNC_Restaurant,1371,1744,5,P,Goods_Pickup,5,Retail and Restaurant,D,Goods_Delivery,2,Office,6.0,5.0,RESTAURANT,12038 Woodside Ave,-116.932653,32.856083,4465,5,1,OFFICE,11980 Woodside Ave UNIT 5,-116.934696,32.855876,4465,0,,0,0,15:12:00,15:15:00,15:19:00,0,0,LCV,,5.0,2022-08-29,Smartphone,1.0,2958,Uber Eats,2022-08-29,15:12:00,15:15:00,15:19:00,2022-08-29 15:12:00,2022-08-29 15:15:00,2022-08-29 15:19:00


In [69]:
# Some tests:
print(df_trips['d_at'][0] - df_trips['o_dt'][0])
print(df_trips['d_at'][0] > df_trips['o_dt'][0])
print(df_trips['nt_dt'][3] - df_trips['d_at'][3])
print(df_trips['nt_dt'][0] + pd.to_timedelta(1, unit='D'))

0 days 00:24:00
True
0 days 00:03:00
2022-08-30 14:33:00


In [70]:
# Ensure events are in chronological order.
print(len(df_trips.loc[df_trips['d_at']<df_trips['o_dt']]))
df_trips.loc[df_trips['d_at']<df_trips['o_dt'], 'd_at'] = df_trips['d_at'] + pd.to_timedelta(1, unit='days')
print(len(df_trips.loc[df_trips['d_at']<df_trips['o_dt']]), '\n')

print(len(df_trips.loc[df_trips['nt_dt']<df_trips['d_at']]))
df_trips.loc[df_trips['nt_dt']<df_trips['d_at'], 'nt_dt'] = df_trips['nt_dt'] + pd.to_timedelta(1, unit='days')
print(len(df_trips.loc[df_trips['nt_dt']<df_trips['d_at']]))

12
0 

1
0


In [71]:
# Find travel time and stop duration in minutes.
# df_trips['travel_time'] = ((df_trips['d_at'] - df_trips['o_dt']).dt.seconds).astype(int)
# df_trips['stop_duration'] = ((df_trips['nt_dt'] - df_trips['d_at']).dt.seconds).round().astype('Int32') #, errors='ignore'
df_trips['travel_time'] = (((df_trips['d_at'] - df_trips['o_dt']).dt.seconds) / 60).astype(int)
df_trips['stop_duration'] = (((df_trips['nt_dt'] - df_trips['d_at']).dt.seconds) / 60).round().astype('Int32') #, errors='ignore'
# Note the capitalized 'Int32' in the line above - as opposed to 'int32'.
print(f"Now, the datatype for the 'stop_duration' column is kind of weird: {df_trips['stop_duration'].dtypes}, rather than simply int32.")
print('We need to live with this because we want to allow <NA> values in this column.')
df_trips.head()

Now, the datatype for the 'stop_duration' column is kind of weird: Int32, rather than simply int32.
We need to live with this because we want to allow <NA> values in this column.


Unnamed: 0,company_id,industry_group,vehicle_id,driver_id,trip_number,o_act_seg,o_act_seg_name,o_plc_seg,o_plc_seg_name,d_act_seg,d_act_seg_name,d_plc_seg,d_plc_seg_name,o_act,o_place_type,o_place_name,o_address,o_lon,o_lat,o_taz,d_act,d_place_type,d_place_name,d_address,d_lon,d_lat,d_taz,trip_dist,headquarters,hq_taz,orgn_to_hq_dist,o_dep_time,d_arr_time,next_trip_dep_time,travel_time,stop_duration,veh_type,cargo_pickup,cargo_delivery,travel_date,participation_type,expnsn_factor,estab_taz,company_name,str_td,str_o_dt,str_d_at,str_nt_dt,o_dt,d_at,nt_dt
0,600026,TNC_Restaurant,1371,1744,1,H,Home,1,Residential,P,Goods_Pickup,5,Retail and Restaurant,14.0,11.0,HOME,1052 Woodlawn Ave,-117.086283,32.610644,2958,6,6,WALMART,13487 Camino Canada,-116.901892,32.822468,4617,0,,0,0,13:39:00,14:03:00,14:33:00,24,30,LCV,5.0,,2022-08-29,Smartphone,1.0,2958,Uber Eats,2022-08-29,13:39:00,14:03:00,14:33:00,2022-08-29 13:39:00,2022-08-29 14:03:00,2022-08-29 14:33:00
1,600026,TNC_Restaurant,1371,1744,2,P,Goods_Pickup,5,Retail and Restaurant,D,Goods_Delivery,1,Residential,6.0,6.0,WALMART,13487 Camino Canada,-116.901892,32.822468,4617,5,11,HOUSE,12651 Julian Ave,-116.91758,32.855417,4564,0,,0,0,14:33:00,14:39:00,14:42:00,6,3,LCV,,10.0,2022-08-29,Smartphone,1.0,2958,Uber Eats,2022-08-29,14:33:00,14:39:00,14:42:00,2022-08-29 14:33:00,2022-08-29 14:39:00,2022-08-29 14:42:00
2,600026,TNC_Restaurant,1371,1744,3,D,Goods_Delivery,1,Residential,D,Goods_Delivery,1,Residential,5.0,11.0,HOUSE,12651 Julian Ave,-116.91758,32.855417,4564,5,11,DROP OFF CUSTOMER,12143 Rockcrest Rd,-116.92999,32.846187,4508,0,,0,0,14:42:00,14:54:03,15:01:21,12,7,LCV,,10.0,2022-08-29,Smartphone,1.0,2958,Uber Eats,2022-08-29,14:42:00,14:54:03,15:01:21,2022-08-29 14:42:00,2022-08-29 14:54:03,2022-08-29 15:01:21
3,600026,TNC_Restaurant,1371,1744,4,D,Goods_Delivery,1,Residential,P,Goods_Pickup,5,Retail and Restaurant,5.0,11.0,DROP OFF CUSTOMER,12143 Rockcrest Rd,-116.92999,32.846187,4508,6,5,RESTAURANT,12038 Woodside Ave,-116.932653,32.856083,4465,0,,0,0,15:01:21,15:09:00,15:12:00,7,3,LCV,5.0,,2022-08-29,Smartphone,1.0,2958,Uber Eats,2022-08-29,15:01:21,15:09:00,15:12:00,2022-08-29 15:01:21,2022-08-29 15:09:00,2022-08-29 15:12:00
4,600026,TNC_Restaurant,1371,1744,5,P,Goods_Pickup,5,Retail and Restaurant,D,Goods_Delivery,2,Office,6.0,5.0,RESTAURANT,12038 Woodside Ave,-116.932653,32.856083,4465,5,1,OFFICE,11980 Woodside Ave UNIT 5,-116.934696,32.855876,4465,0,,0,0,15:12:00,15:15:00,15:19:00,3,4,LCV,,5.0,2022-08-29,Smartphone,1.0,2958,Uber Eats,2022-08-29,15:12:00,15:15:00,15:19:00,2022-08-29 15:12:00,2022-08-29 15:15:00,2022-08-29 15:19:00


In [72]:
# Throw away unnecessary time fields.
rmv_fields = ['o_dep_time', 'd_arr_time', 'next_trip_dep_time',
              'str_td', 'str_o_dt', 'str_d_at', 'str_nt_dt']
all_fields = df_trips.columns.to_list()
keep_cols = [f for f in all_fields if not f in rmv_fields]        

df_trips = df_trips[keep_cols]
df_trips.head()

Unnamed: 0,company_id,industry_group,vehicle_id,driver_id,trip_number,o_act_seg,o_act_seg_name,o_plc_seg,o_plc_seg_name,d_act_seg,d_act_seg_name,d_plc_seg,d_plc_seg_name,o_act,o_place_type,o_place_name,o_address,o_lon,o_lat,o_taz,d_act,d_place_type,d_place_name,d_address,d_lon,d_lat,d_taz,trip_dist,headquarters,hq_taz,orgn_to_hq_dist,travel_time,stop_duration,veh_type,cargo_pickup,cargo_delivery,travel_date,participation_type,expnsn_factor,estab_taz,company_name,o_dt,d_at,nt_dt
0,600026,TNC_Restaurant,1371,1744,1,H,Home,1,Residential,P,Goods_Pickup,5,Retail and Restaurant,14.0,11.0,HOME,1052 Woodlawn Ave,-117.086283,32.610644,2958,6,6,WALMART,13487 Camino Canada,-116.901892,32.822468,4617,0,,0,0,24,30,LCV,5.0,,2022-08-29,Smartphone,1.0,2958,Uber Eats,2022-08-29 13:39:00,2022-08-29 14:03:00,2022-08-29 14:33:00
1,600026,TNC_Restaurant,1371,1744,2,P,Goods_Pickup,5,Retail and Restaurant,D,Goods_Delivery,1,Residential,6.0,6.0,WALMART,13487 Camino Canada,-116.901892,32.822468,4617,5,11,HOUSE,12651 Julian Ave,-116.91758,32.855417,4564,0,,0,0,6,3,LCV,,10.0,2022-08-29,Smartphone,1.0,2958,Uber Eats,2022-08-29 14:33:00,2022-08-29 14:39:00,2022-08-29 14:42:00
2,600026,TNC_Restaurant,1371,1744,3,D,Goods_Delivery,1,Residential,D,Goods_Delivery,1,Residential,5.0,11.0,HOUSE,12651 Julian Ave,-116.91758,32.855417,4564,5,11,DROP OFF CUSTOMER,12143 Rockcrest Rd,-116.92999,32.846187,4508,0,,0,0,12,7,LCV,,10.0,2022-08-29,Smartphone,1.0,2958,Uber Eats,2022-08-29 14:42:00,2022-08-29 14:54:03,2022-08-29 15:01:21
3,600026,TNC_Restaurant,1371,1744,4,D,Goods_Delivery,1,Residential,P,Goods_Pickup,5,Retail and Restaurant,5.0,11.0,DROP OFF CUSTOMER,12143 Rockcrest Rd,-116.92999,32.846187,4508,6,5,RESTAURANT,12038 Woodside Ave,-116.932653,32.856083,4465,0,,0,0,7,3,LCV,5.0,,2022-08-29,Smartphone,1.0,2958,Uber Eats,2022-08-29 15:01:21,2022-08-29 15:09:00,2022-08-29 15:12:00
4,600026,TNC_Restaurant,1371,1744,5,P,Goods_Pickup,5,Retail and Restaurant,D,Goods_Delivery,2,Office,6.0,5.0,RESTAURANT,12038 Woodside Ave,-116.932653,32.856083,4465,5,1,OFFICE,11980 Woodside Ave UNIT 5,-116.934696,32.855876,4465,0,,0,0,3,4,LCV,,5.0,2022-08-29,Smartphone,1.0,2958,Uber Eats,2022-08-29 15:12:00,2022-08-29 15:15:00,2022-08-29 15:19:00


##### Mark the last trips

In [73]:
df_trips['last_trip'] = False
df_trips.loc[df_trips['trip_number'].shift(-1)==1, 'last_trip'] = True
df_trips.at[df_trips.index[-1], 'last_trip'] = True
df_trips.tail()
df_trips.head(50)

Unnamed: 0,company_id,industry_group,vehicle_id,driver_id,trip_number,o_act_seg,o_act_seg_name,o_plc_seg,o_plc_seg_name,d_act_seg,d_act_seg_name,d_plc_seg,d_plc_seg_name,o_act,o_place_type,o_place_name,o_address,o_lon,o_lat,o_taz,d_act,d_place_type,d_place_name,d_address,d_lon,d_lat,d_taz,trip_dist,headquarters,hq_taz,orgn_to_hq_dist,travel_time,stop_duration,veh_type,cargo_pickup,cargo_delivery,travel_date,participation_type,expnsn_factor,estab_taz,company_name,o_dt,d_at,nt_dt,last_trip
0,600026,TNC_Restaurant,1371,1744,1,H,Home,1,Residential,P,Goods_Pickup,5,Retail and Restaurant,14.0,11.0,HOME,1052 Woodlawn Ave,-117.086283,32.610644,2958,6,6,WALMART,13487 Camino Canada,-116.901892,32.822468,4617,0,,0,0,24,30.0,LCV,5.0,,2022-08-29,Smartphone,1.0,2958,Uber Eats,2022-08-29 13:39:00,2022-08-29 14:03:00,2022-08-29 14:33:00,False
1,600026,TNC_Restaurant,1371,1744,2,P,Goods_Pickup,5,Retail and Restaurant,D,Goods_Delivery,1,Residential,6.0,6.0,WALMART,13487 Camino Canada,-116.901892,32.822468,4617,5,11,HOUSE,12651 Julian Ave,-116.91758,32.855417,4564,0,,0,0,6,3.0,LCV,,10.0,2022-08-29,Smartphone,1.0,2958,Uber Eats,2022-08-29 14:33:00,2022-08-29 14:39:00,2022-08-29 14:42:00,False
2,600026,TNC_Restaurant,1371,1744,3,D,Goods_Delivery,1,Residential,D,Goods_Delivery,1,Residential,5.0,11.0,HOUSE,12651 Julian Ave,-116.91758,32.855417,4564,5,11,DROP OFF CUSTOMER,12143 Rockcrest Rd,-116.92999,32.846187,4508,0,,0,0,12,7.0,LCV,,10.0,2022-08-29,Smartphone,1.0,2958,Uber Eats,2022-08-29 14:42:00,2022-08-29 14:54:03,2022-08-29 15:01:21,False
3,600026,TNC_Restaurant,1371,1744,4,D,Goods_Delivery,1,Residential,P,Goods_Pickup,5,Retail and Restaurant,5.0,11.0,DROP OFF CUSTOMER,12143 Rockcrest Rd,-116.92999,32.846187,4508,6,5,RESTAURANT,12038 Woodside Ave,-116.932653,32.856083,4465,0,,0,0,7,3.0,LCV,5.0,,2022-08-29,Smartphone,1.0,2958,Uber Eats,2022-08-29 15:01:21,2022-08-29 15:09:00,2022-08-29 15:12:00,False
4,600026,TNC_Restaurant,1371,1744,5,P,Goods_Pickup,5,Retail and Restaurant,D,Goods_Delivery,2,Office,6.0,5.0,RESTAURANT,12038 Woodside Ave,-116.932653,32.856083,4465,5,1,OFFICE,11980 Woodside Ave UNIT 5,-116.934696,32.855876,4465,0,,0,0,3,4.0,LCV,,5.0,2022-08-29,Smartphone,1.0,2958,Uber Eats,2022-08-29 15:12:00,2022-08-29 15:15:00,2022-08-29 15:19:00,False
5,600026,TNC_Restaurant,1371,1744,6,D,Goods_Delivery,2,Office,D,Goods_Delivery,1,Residential,5.0,1.0,OFFICE,11980 Woodside Ave UNIT 5,-116.934696,32.855876,4465,5,11,HOUSE,10928 Valle Vista Rd,-116.936786,32.875659,4467,0,,0,0,11,1.0,LCV,,6.0,2022-08-29,Smartphone,1.0,2958,Uber Eats,2022-08-29 15:19:00,2022-08-29 15:30:00,2022-08-29 15:31:00,False
6,600026,TNC_Restaurant,1371,1744,7,D,Goods_Delivery,1,Residential,D,Goods_Delivery,1,Residential,5.0,11.0,HOUSE,10928 Valle Vista Rd,-116.936786,32.875659,4467,5,11,DROP OFF,9333 Spectrum Center Blvd,-117.130098,32.825059,2332,0,,0,0,29,3.0,LCV,,10.0,2022-08-29,Smartphone,1.0,2958,Uber Eats,2022-08-29 15:31:00,2022-08-29 16:00:00,2022-08-29 16:02:52,False
7,600026,TNC_Restaurant,1371,1744,8,D,Goods_Delivery,1,Residential,D,Goods_Delivery,1,Residential,5.0,11.0,DROP OFF,9333 Spectrum Center Blvd,-117.130098,32.825059,2332,5,11,FARM,11434 Rocoso Rd,-116.947734,32.884509,4401,0,,0,0,29,3.0,LCV,,6.0,2022-08-29,Smartphone,1.0,2958,Uber Eats,2022-08-29 16:02:52,2022-08-29 16:32:00,2022-08-29 16:34:52,False
8,600026,TNC_Restaurant,1371,1744,9,D,Goods_Delivery,1,Residential,D,Goods_Delivery,2,Office,5.0,11.0,FARM,11434 Rocoso Rd,-116.947734,32.884509,4401,5,1,YMCA DELIVERY & WAIT FOR NEXT JOB,10167 Riverwalk Dr,-116.976491,32.85127,4168,0,,0,0,16,100.0,LCV,,6.0,2022-08-29,Smartphone,1.0,2958,Uber Eats,2022-08-29 16:34:52,2022-08-29 16:51:45,2022-08-29 18:32:00,False
9,600026,TNC_Restaurant,1371,1744,10,D,Goods_Delivery,2,Office,P,Goods_Pickup,5,Retail and Restaurant,5.0,1.0,YMCA DELIVERY & WAIT FOR NEXT JOB,10167 Riverwalk Dr,-116.976491,32.85127,4168,6,5,RESTURANT,4637 Convoy St Ste 101,-117.154685,32.825124,2051,0,,0,0,19,16.0,LCV,6.0,,2022-08-29,Smartphone,1.0,2958,Uber Eats,2022-08-29 18:32:00,2022-08-29 18:51:00,2022-08-29 19:07:00,False


##### Find the trip distance

In [74]:
df_trips['tod'] = ""

In [75]:
# Indicate at what time of day the trip has started.
def find_tod(timestamp):
    if timestamp.time() < dt.time(3):
        return 'EV' # Late: 7PM to 3AM
    elif timestamp.time() < dt.time(6):
        return 'EA' # Early: 3AM to 6AM
    elif timestamp.time() < dt.time(9):
        return 'AM' # AM Peak: 6AM to 9AM
    elif timestamp.time() < dt.time(15, 30):
        return 'MD' # Midday: 9AM to 3:30PM
    elif timestamp.time() < dt.time(19):
        return 'PM' # PM Peak: 3:30PM to 7pm
    elif timestamp.time() >= dt.time(19):
        return 'EV' # Late: 7PM to 3AM

df_trips['tod'] = df_trips['o_dt'].apply(find_tod) 
# df_trips[['o_dt', 'tod']].loc[(df_trips['o_dt'].dt.time>=dt.time(0)) & (df_trips['o_dt'].dt.time<dt.time(3))].head(10)
# df_trips[['o_dt', 'tod']].loc[(df_trips['o_dt'].dt.time>=dt.time(3)) & (df_trips['o_dt'].dt.time<dt.time(6))].head(10)
# df_trips[['o_dt', 'tod']].loc[(df_trips['o_dt'].dt.time>=dt.time(6)) & (df_trips['o_dt'].dt.time<dt.time(9))].head(10)
# df_trips[['o_dt', 'tod']].loc[(df_trips['o_dt'].dt.time>=dt.time(9)) & (df_trips['o_dt'].dt.time<dt.time(15, 30))].head(10)
# df_trips[['o_dt', 'tod']].loc[(df_trips['o_dt'].dt.time>=dt.time(15,30)) & (df_trips['o_dt'].dt.time<dt.time(19))].head(10)
df_trips[['o_dt', 'tod']].loc[(df_trips['o_dt'].dt.time>=dt.time(19)) & (df_trips['o_dt'].dt.time<dt.time(23, 59, 59, 999999))].head(10)

Unnamed: 0,o_dt,tod
10,2022-08-29 19:07:00,EV
11,2022-08-29 19:21:00,EV
12,2022-08-29 19:32:00,EV
13,2022-08-29 19:45:00,EV
35,2022-09-01 19:35:00,EV
36,2022-09-01 19:54:00,EV
37,2022-09-01 20:51:00,EV
38,2022-09-01 21:17:00,EV
39,2022-09-01 21:37:00,EV
51,2022-08-30 19:00:00,EV


In [76]:
print(f'Example: Distance from TAZ 5 to TAZ 19 in AM peak for medium trucks is: {dist_dfs[1][1].at[4,18]/10:} miles')

Example: Distance from TAZ 5 to TAZ 19 in AM peak for medium trucks is: 103.2 miles


In [77]:
dic_v = {'LCV': 0, 'SUT': 1, 'MUT': 2}
dic_tod = {'EA': 0, 'AM': 1, 'MD':2, 'PM':3, 'EV':4}

def find_dist(o_taz, d_taz, vs, tod): # vs is vehicle size and tod is time of day.
    if o_taz==-1 or d_taz==-1: return None
    return dist_dfs[tod][vs].at[o_taz-1, d_taz-1] / 10

df_trips['trip_dist'] = df_trips.apply(lambda x: find_dist(x['o_taz'], x['d_taz'],
                                                           dic_v[x['veh_type']],
                                                           dic_tod[x['tod']]), axis=1)
    
df_trips.head()

Unnamed: 0,company_id,industry_group,vehicle_id,driver_id,trip_number,o_act_seg,o_act_seg_name,o_plc_seg,o_plc_seg_name,d_act_seg,d_act_seg_name,d_plc_seg,d_plc_seg_name,o_act,o_place_type,o_place_name,o_address,o_lon,o_lat,o_taz,d_act,d_place_type,d_place_name,d_address,d_lon,d_lat,d_taz,trip_dist,headquarters,hq_taz,orgn_to_hq_dist,travel_time,stop_duration,veh_type,cargo_pickup,cargo_delivery,travel_date,participation_type,expnsn_factor,estab_taz,company_name,o_dt,d_at,nt_dt,last_trip,tod
0,600026,TNC_Restaurant,1371,1744,1,H,Home,1,Residential,P,Goods_Pickup,5,Retail and Restaurant,14.0,11.0,HOME,1052 Woodlawn Ave,-117.086283,32.610644,2958,6,6,WALMART,13487 Camino Canada,-116.901892,32.822468,4617,23.9,,0,0,24,30,LCV,5.0,,2022-08-29,Smartphone,1.0,2958,Uber Eats,2022-08-29 13:39:00,2022-08-29 14:03:00,2022-08-29 14:33:00,False,MD
1,600026,TNC_Restaurant,1371,1744,2,P,Goods_Pickup,5,Retail and Restaurant,D,Goods_Delivery,1,Residential,6.0,6.0,WALMART,13487 Camino Canada,-116.901892,32.822468,4617,5,11,HOUSE,12651 Julian Ave,-116.91758,32.855417,4564,2.7,,0,0,6,3,LCV,,10.0,2022-08-29,Smartphone,1.0,2958,Uber Eats,2022-08-29 14:33:00,2022-08-29 14:39:00,2022-08-29 14:42:00,False,MD
2,600026,TNC_Restaurant,1371,1744,3,D,Goods_Delivery,1,Residential,D,Goods_Delivery,1,Residential,5.0,11.0,HOUSE,12651 Julian Ave,-116.91758,32.855417,4564,5,11,DROP OFF CUSTOMER,12143 Rockcrest Rd,-116.92999,32.846187,4508,1.8,,0,0,12,7,LCV,,10.0,2022-08-29,Smartphone,1.0,2958,Uber Eats,2022-08-29 14:42:00,2022-08-29 14:54:03,2022-08-29 15:01:21,False,MD
3,600026,TNC_Restaurant,1371,1744,4,D,Goods_Delivery,1,Residential,P,Goods_Pickup,5,Retail and Restaurant,5.0,11.0,DROP OFF CUSTOMER,12143 Rockcrest Rd,-116.92999,32.846187,4508,6,5,RESTAURANT,12038 Woodside Ave,-116.932653,32.856083,4465,1.1,,0,0,7,3,LCV,5.0,,2022-08-29,Smartphone,1.0,2958,Uber Eats,2022-08-29 15:01:21,2022-08-29 15:09:00,2022-08-29 15:12:00,False,MD
4,600026,TNC_Restaurant,1371,1744,5,P,Goods_Pickup,5,Retail and Restaurant,D,Goods_Delivery,2,Office,6.0,5.0,RESTAURANT,12038 Woodside Ave,-116.932653,32.856083,4465,5,1,OFFICE,11980 Woodside Ave UNIT 5,-116.934696,32.855876,4465,0.1,,0,0,3,4,LCV,,5.0,2022-08-29,Smartphone,1.0,2958,Uber Eats,2022-08-29 15:12:00,2022-08-29 15:15:00,2022-08-29 15:19:00,False,MD


In [78]:
df_trips[['o_taz','d_taz','veh_type','tod']].head()

Unnamed: 0,o_taz,d_taz,veh_type,tod
0,2958,4617,LCV,MD
1,4617,4564,LCV,MD
2,4564,4508,LCV,MD
3,4508,4465,LCV,MD
4,4465,4465,LCV,MD


##### Find the trip generalized travel time from the establishment base to the trip destination

In [79]:
print(f'Example: Generalized travel time from TAZ 3901 to TAZ 4233 in mid-day for heavy trucks is: {g_tt_dfs[2][2].iloc[3900, 4232]} minutes.')

Example: Generalized travel time from TAZ 3901 to TAZ 4233 in mid-day for heavy trucks is: 16.15 minutes.


In [80]:
dic_v = {'LCV': 0, 'SUT': 1, 'MUT': 2}
for i, row in df_trips.iterrows():    
    if row['estab_taz'] == -1 or row['d_taz'] == -1: continue
    df_trips.at[i, 'toll_in_cents_from_base'] = toll_dfs[2][dic_v[row['veh_type']]].at[row['estab_taz']-1, row['d_taz']-1]
    df_trips.at[i, 'gen_tt_from_base'] = g_tt_dfs[2][dic_v[row['veh_type']]].at[row['estab_taz']-1, row['d_taz']-1]

In [81]:
out_file = f'Generalized TT for {dataset}.xlsx'
out_path = os.path.join(project_path, out_data_dir, out_file)
df_trips.to_excel(out_path, engine='openpyxl')

##### Calculate average generalized travel time by industry by destination activity

In [82]:
print(len(df_trips.loc[df_trips['gen_tt_from_base'].isnull()]))
df_trips.loc[df_trips['gen_tt_from_base'].isnull()].head(2)

261


Unnamed: 0,company_id,industry_group,vehicle_id,driver_id,trip_number,o_act_seg,o_act_seg_name,o_plc_seg,o_plc_seg_name,d_act_seg,d_act_seg_name,d_plc_seg,d_plc_seg_name,o_act,o_place_type,o_place_name,o_address,o_lon,o_lat,o_taz,d_act,d_place_type,d_place_name,d_address,d_lon,d_lat,d_taz,trip_dist,headquarters,hq_taz,orgn_to_hq_dist,travel_time,stop_duration,veh_type,cargo_pickup,cargo_delivery,travel_date,participation_type,expnsn_factor,estab_taz,company_name,o_dt,d_at,nt_dt,last_trip,tod,toll_in_cents_from_base,gen_tt_from_base
211,600042,TNC_Restaurant,1478,1877,1,H,Home,1,Residential,M,Maintenance/Other,2,Office,14.0,11.0,MURRIETA MEADOWS,24850 Hancock Ave,-117.18509,33.572444,-1,3,8,PEDIATRIC PARTNERS,27699 Jefferson Ave Suite #309,-117.159853,33.514176,-1,,,0,0,8,51,LCV,,,2022-09-08,Smartphone,1.0,-1,Uber Eats,2022-09-08 08:13:00,2022-09-08 08:21:00,2022-09-08 09:12:00,False,AM,,
212,600042,TNC_Restaurant,1478,1877,2,M,Maintenance/Other,2,Office,S,Service,5,Retail and Restaurant,3.0,8.0,PEDIATRIC PARTNERS,27699 Jefferson Ave Suite #309,-117.159853,33.514176,-1,10,5,JERSEY MIKES SUBS PICK UP/ORDER CANCELLED,2534 Murrieta Hot Springs Rd,-117.196641,33.551981,-1,,,0,0,7,1,LCV,,,2022-09-08,Smartphone,1.0,-1,Uber Eats,2022-09-08 09:12:00,2022-09-08 09:19:00,2022-09-08 09:20:00,False,MD,,


In [83]:
cols = ['industry_group', 'd_act_seg', 'd_plc_seg_name', 'veh_type', 'expnsn_factor', 'gen_tt_from_base']
df_gtt = df_trips[cols].copy()
df_gtt = df_gtt.loc[~df_gtt['gen_tt_from_base'].isnull()]
df_gtt = df_gtt.loc[df_gtt['d_act_seg'].isin(['P', 'D', 'S'])]
df_gtt.rename(columns={'d_act_seg': 'purpose'}, inplace=True)
df_gtt['purpose'] = df_gtt['purpose'].apply(lambda x: 'Service' if x=='S' else 'Goods')
# df_gtt.insert(4, 'count', 1)
df_gtt.insert(3, 'customer', 'Business')
df_gtt.loc[df_gtt['d_plc_seg_name']=='Residential', 'customer'] = 'Resident'
df_gtt.drop(columns='d_plc_seg_name', inplace=True)
df_gtt.head(2)

Unnamed: 0,industry_group,purpose,customer,veh_type,expnsn_factor,gen_tt_from_base
0,TNC_Restaurant,Goods,Business,LCV,1.0,28.29
1,TNC_Restaurant,Goods,Resident,LCV,1.0,31.61


###### Unweighted generalized travel time

In [84]:
df_gtt_unw1 = df_gtt.groupby(['industry_group', 'purpose', 'customer', 'veh_type']).size().rename('sample_size').reset_index()
df_gtt_unw1.head()

Unnamed: 0,industry_group,purpose,customer,veh_type,sample_size
0,TNC_NonRestRetl,Goods,Business,LCV,453
1,TNC_NonRestRetl,Goods,Business,SUT,3
2,TNC_NonRestRetl,Goods,Resident,LCV,672
3,TNC_NonRestRetl,Service,Business,LCV,11
4,TNC_NonRestRetl,Service,Resident,LCV,3


In [85]:
df_gtt_unw2 = df_gtt.groupby(['industry_group', 'purpose', 'customer', 'veh_type'])['gen_tt_from_base']\
.mean().reset_index()
df_gtt_unw2.head()

Unnamed: 0,industry_group,purpose,customer,veh_type,gen_tt_from_base
0,TNC_NonRestRetl,Goods,Business,LCV,19.419669
1,TNC_NonRestRetl,Goods,Business,SUT,22.48
2,TNC_NonRestRetl,Goods,Resident,LCV,23.50628
3,TNC_NonRestRetl,Service,Business,LCV,20.405455
4,TNC_NonRestRetl,Service,Resident,LCV,14.023333


In [86]:
df_gtt_unw = pd.merge(df_gtt_unw1, df_gtt_unw2, on=['industry_group', 'purpose', 'customer', 'veh_type']).dropna()
df_gtt_unw['product'] = df_gtt_unw['sample_size'] * df_gtt_unw['gen_tt_from_base']
df_gtt_unw.head()

Unnamed: 0,industry_group,purpose,customer,veh_type,sample_size,gen_tt_from_base,product
0,TNC_NonRestRetl,Goods,Business,LCV,453,19.419669,8797.11
1,TNC_NonRestRetl,Goods,Business,SUT,3,22.48,67.44
2,TNC_NonRestRetl,Goods,Resident,LCV,672,23.50628,15796.22
3,TNC_NonRestRetl,Service,Business,LCV,11,20.405455,224.46
4,TNC_NonRestRetl,Service,Resident,LCV,3,14.023333,42.07


In [87]:
s = df_gtt_unw.groupby(['industry_group', 'purpose', 'customer'])['sample_size'].sum()
df_avg_gtt_unw = ((df_gtt_unw.groupby(['industry_group', 'purpose', 'customer'])['product'].sum()/s)\
                  .rename('avg_gen_tt_unweighted')).reset_index()
df_avg_gtt_unw = s.reset_index().merge(df_avg_gtt_unw, on=['industry_group', 'purpose', 'customer'] )
df_avg_gtt_unw.head()

Unnamed: 0,industry_group,purpose,customer,sample_size,avg_gen_tt_unweighted
0,TNC_NonRestRetl,Goods,Business,456,19.439803
1,TNC_NonRestRetl,Goods,Resident,672,23.50628
2,TNC_NonRestRetl,Service,Business,11,20.405455
3,TNC_NonRestRetl,Service,Resident,3,14.023333
4,TNC_Restaurant,Goods,Business,921,17.085364


In [88]:
out_file = f'Generalized TT for {dataset} Segments, Unweighted.xlsx'
out_path = os.path.join(project_path, out_data_dir, out_file)
df_avg_gtt_unw.to_excel(out_path, engine='openpyxl')

###### Weighted generalized travel time

In [89]:
df_gtt2 = df_gtt.copy()
df_gtt2['expanded_tt'] = df_gtt2.expnsn_factor * df_gtt2.gen_tt_from_base
df_gtt2.head(1)

Unnamed: 0,industry_group,purpose,customer,veh_type,expnsn_factor,gen_tt_from_base,expanded_tt
0,TNC_Restaurant,Goods,Business,LCV,1.0,28.29,28.29


In [90]:
df_gtt2_wei0 = df_gtt2.groupby(['industry_group', 'purpose', 'customer', 'veh_type']).size().rename('sample_size').reset_index()
df_gtt2_wei0.head()

Unnamed: 0,industry_group,purpose,customer,veh_type,sample_size
0,TNC_NonRestRetl,Goods,Business,LCV,453
1,TNC_NonRestRetl,Goods,Business,SUT,3
2,TNC_NonRestRetl,Goods,Resident,LCV,672
3,TNC_NonRestRetl,Service,Business,LCV,11
4,TNC_NonRestRetl,Service,Resident,LCV,3


In [91]:
df_gtt2_wei1 = df_gtt2.groupby(['industry_group', 'purpose', 'customer', 'veh_type'])['expnsn_factor']\
.sum().rename('pop_size').reset_index()
df_gtt2_wei1.head()

Unnamed: 0,industry_group,purpose,customer,veh_type,pop_size
0,TNC_NonRestRetl,Goods,Business,LCV,453.0
1,TNC_NonRestRetl,Goods,Business,SUT,3.0
2,TNC_NonRestRetl,Goods,Resident,LCV,672.0
3,TNC_NonRestRetl,Service,Business,LCV,11.0
4,TNC_NonRestRetl,Service,Resident,LCV,3.0


In [92]:
df_gtt2_wei2 = df_gtt2.groupby(['industry_group', 'purpose', 'customer', 'veh_type'])['expanded_tt']\
.sum().reset_index()
df_gtt2_wei2.head()

Unnamed: 0,industry_group,purpose,customer,veh_type,expanded_tt
0,TNC_NonRestRetl,Goods,Business,LCV,8797.11
1,TNC_NonRestRetl,Goods,Business,SUT,67.44
2,TNC_NonRestRetl,Goods,Resident,LCV,15796.22
3,TNC_NonRestRetl,Service,Business,LCV,224.46
4,TNC_NonRestRetl,Service,Resident,LCV,42.07


In [93]:
df_gtt2_wei = pd.merge(df_gtt2_wei0, df_gtt2_wei1, on=['industry_group', 'purpose', 'customer', 'veh_type'])
df_gtt2_wei = pd.merge(df_gtt2_wei, df_gtt2_wei2, on=['industry_group', 'purpose', 'customer', 'veh_type']).dropna()
# df_gtt2_wei['product'] = df_gtt2_wei['pop_size'] * df_gtt2_wei['expanded_tt']
df_gtt2_wei.head()

Unnamed: 0,industry_group,purpose,customer,veh_type,sample_size,pop_size,expanded_tt
0,TNC_NonRestRetl,Goods,Business,LCV,453,453.0,8797.11
1,TNC_NonRestRetl,Goods,Business,SUT,3,3.0,67.44
2,TNC_NonRestRetl,Goods,Resident,LCV,672,672.0,15796.22
3,TNC_NonRestRetl,Service,Business,LCV,11,11.0,224.46
4,TNC_NonRestRetl,Service,Resident,LCV,3,3.0,42.07


In [94]:
sz= df_gtt2_wei.groupby(['industry_group', 'purpose', 'customer'])['sample_size'].sum()
s = df_gtt2_wei.groupby(['industry_group', 'purpose', 'customer'])['pop_size'].sum()
df_avg_gtt2_wei = ((df_gtt2_wei.groupby(['industry_group', 'purpose', 'customer'])['expanded_tt'].sum()/s)\
                  .rename('avg_gen_tt_weighted')).reset_index()
s = sz.reset_index().merge(s, on=['industry_group', 'purpose', 'customer'] )
df_avg_gtt2_wei = s.reset_index().merge(df_avg_gtt2_wei, on=['industry_group', 'purpose', 'customer'] )
df_avg_gtt2_wei.head()

Unnamed: 0,index,industry_group,purpose,customer,sample_size,pop_size,avg_gen_tt_weighted
0,0,TNC_NonRestRetl,Goods,Business,456,456.0,19.439803
1,1,TNC_NonRestRetl,Goods,Resident,672,672.0,23.50628
2,2,TNC_NonRestRetl,Service,Business,11,11.0,20.405455
3,3,TNC_NonRestRetl,Service,Resident,3,3.0,14.023333
4,4,TNC_Restaurant,Goods,Business,921,921.0,17.085364


In [95]:
out_file = f'Generalized TT for {dataset} Segments, Weighted.xlsx'
out_path = os.path.join(project_path, out_data_dir, out_file)
df_avg_gtt2_wei.to_excel(out_path, engine='openpyxl')

#### Create Summaries from the Trip Dataframe

In [96]:
# Unweighted:
AvgStop1 = df_trips[df_trips['stop_duration']>0].groupby(['d_act_seg_name', 'industry_group']).size().rename('count')
AvgStop2 = df_trips[df_trips['stop_duration']>0].groupby(['d_act_seg_name', 'industry_group'])['stop_duration']\
.mean().rename('stop_duration_minutes').round()

# Weighted:
df_trips.loc[df_trips['stop_duration']>0, 'dur_by_weight'] = df_trips['stop_duration'] * df_trips['expnsn_factor']
AvgStop1w = df_trips[df_trips['stop_duration']>0].groupby(['d_act_seg_name', 'industry_group'])['expnsn_factor'].sum().rename('weighted_count')
AvgStop2w = (df_trips[df_trips['stop_duration']>0].groupby(['d_act_seg_name', 'industry_group'])['dur_by_weight'].sum()/\
            df_trips[df_trips['stop_duration']>0].groupby(['d_act_seg_name', 'industry_group'])['expnsn_factor'].sum()).\
            rename('stop_duration_minutes').round()

In [97]:
# df_trips[df_trips['stop_duration']>0][['stop_duration','expnsn_factor']].sort_values(['expnsn_factor'], ascending=True)

In [98]:
AvgStop = pd.concat([AvgStop1, AvgStop2], axis=1)
AvgStop_w = pd.concat([AvgStop1w, AvgStop2w], axis=1)
AvgStop
AvgStop_w

Unnamed: 0_level_0,Unnamed: 1_level_0,weighted_count,stop_duration_minutes
d_act_seg_name,industry_group,Unnamed: 2_level_1,Unnamed: 3_level_1
Base,TNC_NonRestRetl,12.0,59.0
Base,TNC_Restaurant,39.0,82.0
Base,TNC_Retail,17.0,81.0
Goods_Delivery,TNC_NonRestRetl,1010.0,8.0
Goods_Delivery,TNC_Restaurant,963.0,15.0
Goods_Delivery,TNC_Retail,591.0,10.0
Goods_Pickup,TNC_NonRestRetl,174.0,27.0
Goods_Pickup,TNC_Restaurant,653.0,17.0
Goods_Pickup,TNC_Retail,212.0,26.0
Home,TNC_NonRestRetl,14.0,116.0


In [99]:
i = 'industry_group'  
pvt_ind_act = pd.pivot_table(df_trips, values='company_id', index=i,
                             columns='d_act_seg_name', aggfunc='count').fillna(0) #, sort=True) For sort, default is True. 
pvt_ind_act_w = pd.pivot_table(df_trips, values='expnsn_factor', index=i,
                             columns='d_act_seg_name', aggfunc='sum').fillna(0).round() #, sort=True) For sort, default is True.    
pvt_ind_act
pvt_ind_act_w

d_act_seg_name,Base,Goods_Delivery,Goods_Pickup,Home,Maintenance/Other,Service
industry_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
TNC_NonRestRetl,32.0,1028.0,176.0,37.0,259.0,14.0
TNC_Restaurant,89.0,1022.0,663.0,77.0,587.0,68.0
TNC_Retail,44.0,606.0,214.0,40.0,280.0,17.0


In [100]:
pvt_act_plc = pd.pivot_table(df_trips, values='company_id', index='d_act_seg_name',
                             columns='d_plc_seg_name', aggfunc='count').fillna(0) #, sort=True) For sort, default is True.
pvt_act_plc_w = pd.pivot_table(df_trips, values='expnsn_factor', index='d_act_seg_name',
                             columns='d_plc_seg_name', aggfunc='sum').fillna(0).round() #, sort=True) For sort, default is True.
pvt_act_plc
# pvt_act_plc_w

d_plc_seg_name,Gas,"Industrial, Agriculture, or Construction",Office,Other,Residential,Retail and Restaurant,Truck Terminal or Parking,Warehouse
d_act_seg_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Base,0.0,1.0,6.0,2.0,148.0,2.0,0.0,6.0
Goods_Delivery,16.0,117.0,252.0,24.0,1936.0,253.0,8.0,50.0
Goods_Pickup,16.0,16.0,27.0,9.0,18.0,889.0,2.0,76.0
Home,0.0,0.0,0.0,1.0,153.0,0.0,0.0,0.0
Maintenance/Other,145.0,11.0,116.0,90.0,332.0,382.0,9.0,41.0
Service,0.0,0.0,10.0,14.0,23.0,39.0,5.0,8.0


In [101]:
out_file = dataset + f'_TripSummaries_{current_date}.xlsx'
out_path = os.path.join(project_path, out_data_dir)
if not os.path.exists(out_path):
   os.makedirs(out_path)
out_path = os.path.join(project_path, out_data_dir, out_file)
xl_writer = pd.ExcelWriter(out_path, engine='openpyxl')

In [102]:
sn1 = 'StopDur_byAct&Ind'
sn2 = 'StopDur_byAct&Ind_w'
sn3 = 'Ind-DesAct'
sn4 = 'Ind-DesAct_w'
sn5 = 'DesAct-DesPlc'
sn6 = 'DesAct-DesPlc_w'
AvgStop.to_excel(xl_writer, sheet_name=sn1) # , index_label='index'
AvgStop_w.to_excel(xl_writer, sheet_name=sn2) # , index_label='index'
pvt_ind_act.to_excel(xl_writer, sheet_name=sn3) # , index_label='index'
pvt_ind_act_w.to_excel(xl_writer, sheet_name=sn4) # , index_label='index'
pvt_act_plc.to_excel(xl_writer, sheet_name=sn5) # , index_label='index'
pvt_act_plc_w.to_excel(xl_writer, sheet_name=sn6) # , index_label='index'
xl_writer.close()

try:
    df_trips.drop(columns='dur_by_weight', inplace=True)
except:
    pass

#### Start Creating the Routes

In [103]:
cols = ['route_id', 'industry_group', 'veh_type',        
        'trip_count', 'g_stops', 's_stops', 'm_stops', 'b_stops', 'h_stops',        
        'primary_purp', 'customer_type',
        'start_tod', 'end_tod', 'route_dur_hr', 'cumlv_dur', 'durations_match',
        'start_activity', 'end_activity',
        'start_plc_seg', 'end_plc_seg',
        'act_seg_seq', 'plc_seg_seq',
        'headquarters', 'hq_taz',
        'tot_distance',
        'company_id', 'vehicle_id', 'driver_id',
        'participation_type', 'expnsn_factor', 'estab_taz', 'trips']
# g_stops = goods stops, s for services, m for maintenance/other, b for base, h for home. (These are super activities.)
# Primary purpose: Goods, Services, Maintenance/Other
# Customer type: Residential, Non-residential, Mixed
# start_tod = starting time of day, it is the first departure; end_tod = ending time of day, it is the last arrival.
# route_dur_hr = Duration of the route based on its start and end times.
# cumlv_dur = Duration of the route based on trip and stay durations.
# durations_match = True if route durations from the two methods match; else False.
# start_activity or end_activity: Starting/ending activity at O/D
# start_plc_seg or end_plc_seg: Starting/ending place (Base, Home, Warehouse/DC (distribution center)/Transport Node, Other)
# headquarters: Primary point of return. Most of the time it is the base.
# hq_taz: TAZ of the primary point of return.

if dataset=='TNC': cols.insert(-1, 'company_name')
    
df_routes = pd.DataFrame(columns = cols)

In [105]:
# Iterate through the trip dataframe.
for i, row in df_trips.iterrows():
    
    # If this trip is the first trip of the route, initialize a route for it.
    if row['trip_number'] == 1:
        df_temp_rt = pd.DataFrame(columns=cols, index=[0])
        
        # Generate a route ID.
        date = str(df_trips.at[i, 'travel_date'])[:10]
        date = date.replace('-', '')
        date = int(date) * 10000
        df_temp_rt['route_id'] = date + row['vehicle_id']
        
        # Copy common attributes from the trip row to the temporary route dataframe.
        for c in df_temp_rt.columns.to_list():
            if c in df_trips.columns.to_list():
                df_temp_rt[c] = row[c]
        
        # Initialize route variables.
        trips = []
        activities = [row['o_act_seg']]
        places = [row['o_plc_seg']]
        
        # Set route variables that are known.
        df_temp_rt['start_activity'] = row['o_act_seg']
        df_temp_rt['start_plc_seg'] = dic_pt2[row['o_plc_seg']]
        df_temp_rt['start_tod'] = row['o_dt']
    
    # Initialize a trip and store the current row values into its attributes.
    t = Trip()    
    t.index = i
    for attr in list(vars(t).keys()):
        if attr == 'index': continue
        #  print(row[str(attr)])
        #  print(getattr(t, attr))
        setattr(t, attr, row[attr])
    
    trips.append(t)
    # Add trip info to the route variables.    
    activities.append(row['d_act_seg'])
    places.append(row['d_plc_seg'])
    
    # Finalize the route if this is the last trip of the route.
    if row['last_trip']:
        df_temp_rt.at[0, 'act_seg_seq'] = activities
        df_temp_rt.at[0, 'plc_seg_seq'] = places
        df_temp_rt.at[0, 'trips'] = trips
                
        # Set route variables that are known.        
        df_temp_rt['end_activity'] = row['d_act_seg']
        df_temp_rt['end_plc_seg'] = dic_pt2[row['d_plc_seg']]
        df_temp_rt['end_tod'] = row['d_at']
        
        # Attach the completed route to the route dataframe.
        df_routes = pd.concat([df_routes, df_temp_rt], axis=0, ignore_index=True)
        
#     if i == 300: break
df_routes.tail()

Unnamed: 0,route_id,industry_group,veh_type,trip_count,g_stops,s_stops,m_stops,b_stops,h_stops,primary_purp,customer_type,start_tod,end_tod,route_dur_hr,cumlv_dur,durations_match,start_activity,end_activity,start_plc_seg,end_plc_seg,act_seg_seq,plc_seg_seq,headquarters,hq_taz,tot_distance,company_id,vehicle_id,driver_id,participation_type,expnsn_factor,estab_taz,company_name,trips
817,202212222500,TNC_Restaurant,LCV,,,,,,,,,2022-12-22 14:41:00,2022-12-22 19:16:25,,,,S,D,Other,Warehouse,"[S, P, D, H, D]","[4, 2, 2, 1, 3]",,0,,600652,2500,3060,Smartphone,1.0,811,Uber Eats,"[<__main__.Trip object at 0x0000019986C14448>,..."
818,202212222502,TNC_Restaurant,LCV,,,,,,,,,2022-12-22 14:51:11,2022-12-22 21:29:16,,,,M,M,Office,Residential,"[M, M, P, D, M]","[2, 1, 5, 2, 1]",,0,,600653,2502,3062,Smartphone,1.0,4472,Door Dash,"[<__main__.Trip object at 0x0000019986C0F2C8>,..."
819,202212222504,TNC_Restaurant,LCV,,,,,,,,,2022-12-22 16:49:00,2022-12-22 20:19:00,,,,H,D,Residential,Residential,"[H, S, S, P, D, P, D, M, P, D]","[1, 5, 2, 5, 1, 5, 1, 5, 5, 1]",,0,,600654,2504,3065,Smartphone,1.0,284,Door Dash,"[<__main__.Trip object at 0x0000019986C19308>,..."
820,202212222501,TNC_Restaurant,LCV,,,,,,,,,2022-12-22 14:36:00,2022-12-22 15:49:00,,,,S,H,Other,Residential,"[S, P, D, D, H]","[4, 2, 2, 3, 1]",,0,,600655,2501,3061,Smartphone,1.0,4270,Postmates,"[<__main__.Trip object at 0x0000019986C19248>,..."
821,202212232506,TNC_Retail,LCV,,,,,,,,,2022-12-23 09:51:13,2022-12-23 16:40:38,,,,H,M,Residential,Residential,"[H, P, D, M]","[1, 5, 5, 1]",,0,,600656,2506,3067,Smartphone,1.0,4383,Instacart,"[<__main__.Trip object at 0x0000019986C19888>,..."


In [106]:
for i, row in df_routes.iterrows():
    # Identify number of stops by type of stop.        
    df_routes.at[i, 'trip_count'] = len(row['trips'])
    counts = collections.Counter(row['act_seg_seq'][1:])
    df_routes.at[i, 'g_stops'] = counts['P'] + counts['D']
    df_routes.at[i, 's_stops'] = counts['S']
    df_routes.at[i, 'm_stops'] = counts['M']
    df_routes.at[i, 'b_stops'] = counts['B']
    df_routes.at[i, 'h_stops'] = counts['H']
    
    # Identify the primary purpose of the route.
    if 'P' in row['act_seg_seq'][1:]:
        purp = 'Goods'
    elif 'D' in row['act_seg_seq'][1:]:
        purp = 'Goods'        
    elif 'S' in row['act_seg_seq'][1:]:
        purp = 'Service'
    else:
        purp = 'Maintenance/Other'
    df_routes.at[i, 'primary_purp'] = purp    
    
    # Find total distance traveled on the route.
    dist = 0
    for t in row['trips']:
        dist += t.trip_dist
    df_routes.at[i, 'tot_distance'] = dist
    
df_routes.head()

Unnamed: 0,route_id,industry_group,veh_type,trip_count,g_stops,s_stops,m_stops,b_stops,h_stops,primary_purp,customer_type,start_tod,end_tod,route_dur_hr,cumlv_dur,durations_match,start_activity,end_activity,start_plc_seg,end_plc_seg,act_seg_seq,plc_seg_seq,headquarters,hq_taz,tot_distance,company_id,vehicle_id,driver_id,participation_type,expnsn_factor,estab_taz,company_name,trips
0,202208291371,TNC_Restaurant,LCV,14,13,0,0,0,1,Goods,,2022-08-29 13:39:00,2022-08-29 20:25:00,,,,H,H,Residential,Residential,"[H, P, D, D, P, D, D, D, D, D, P, D, D, D, H]","[1, 5, 1, 1, 5, 2, 1, 1, 1, 2, 5, 1, 1, 2, 1]",,0,105.4,600026,1371,1744,Smartphone,1.0,2958,Uber Eats,"[<__main__.Trip object at 0x0000019980481288>,..."
1,202209011443,TNC_NonRestRetl,LCV,26,18,3,1,3,1,Goods,,2022-09-01 09:51:00,2022-09-01 21:49:00,,,,B,B,Residential,Residential,"[B, S, B, P, D, S, H, P, D, D, D, D, D, D, D, ...","[1, 2, 1, 5, 6, 5, 1, 2, 1, 1, 1, 1, 1, 1, 1, ...",,0,101.5,600027,1443,1745,Smartphone,1.0,1787,Amazon Flex,"[<__main__.Trip object at 0x00000199804819C8>,..."
2,202208311395,TNC_NonRestRetl,LCV,4,2,0,1,1,0,Goods,,2022-08-31 11:05:00,2022-08-31 12:17:00,,,,B,B,Residential,Residential,"[B, P, D, M, B]","[1, 2, 1, 5, 1]",,0,15.1,600028,1395,1746,Smartphone,1.0,3502,Senpex,"[<__main__.Trip object at 0x00000199804850C8>,..."
3,202208301393,TNC_Restaurant,LCV,9,5,0,2,1,1,Goods,,2022-08-30 16:00:00,2022-08-30 19:17:00,,,,B,H,Residential,Residential,"[B, B, D, P, D, P, M, M, P, H]","[1, 1, 3, 5, 2, 5, 6, 5, 5, 1]",,0,40.8,600029,1393,1747,Smartphone,1.0,2538,Grub Hub,"[<__main__.Trip object at 0x00000199874E9FC8>,..."
4,202209021451,TNC_Restaurant,LCV,30,18,1,8,2,1,Goods,,2022-09-02 08:57:00,2022-09-02 20:11:57,,,,B,B,Residential,Residential,"[B, M, M, M, M, M, P, D, P, D, P, D, D, D, P, ...","[1, 4, 4, 1, 5, 1, 5, 4, 5, 2, 5, 1, 1, 1, 5, ...",,0,65.5,600031,1451,1857,Smartphone,1.0,1207,Uber Eats,"[<__main__.Trip object at 0x00000199874EAFC8>,..."


In [108]:
for i, t in enumerate(df_routes.at[0, 'trips']):    
    print(f'\nTrip #{i+1}:')
    for e in zip(list(vars(t).keys()), list(vars(t).values())):
        print(e)    


Trip #1:
('index', 0)
('o_act', 14.0)
('o_place_type', 11.0)
('o_place_name', 'HOME')
('o_address', '1052 Woodlawn Ave')
('o_lon', -117.0862828)
('o_lat', 32.610644)
('o_taz', 2958)
('d_act', 6)
('d_place_type', 6)
('d_place_name', 'WALMART')
('d_address', '13487 Camino Canada')
('d_lon', -116.901892)
('d_lat', 32.822468)
('d_taz', 4617)
('trip_dist', 23.9)
('o_dt', Timestamp('2022-08-29 13:39:00'))
('d_at', Timestamp('2022-08-29 14:03:00'))
('nt_dt', Timestamp('2022-08-29 14:33:00'))
('travel_time', 24)
('stop_duration', 30)
('cargo_pickup', 5.0)
('cargo_delivery', nan)
('travel_date', Timestamp('2022-08-29 00:00:00'))
('last_trip', False)

Trip #2:
('index', 1)
('o_act', 6.0)
('o_place_type', 6.0)
('o_place_name', 'WALMART')
('o_address', '13487 Camino Canada')
('o_lon', -116.901892)
('o_lat', 32.822468)
('o_taz', 4617)
('d_act', 5)
('d_place_type', 11)
('d_place_name', 'HOUSE')
('d_address', '12651 Julian Ave')
('d_lon', -116.9175804)
('d_lat', 32.855417)
('d_taz', 4564)
('trip_dis

In [109]:
# Determine customer type.

# Customer Types at Stops (only applies to routes with Goods and/or Service purposes):
# a. Residential Only (households, including multi-family buildings)
# b. Non-residential Only (commercial, public/government)
# c. Mixed Residential and Non-residential
dic_cstmr_typ = {
    'ro' : 'Residential Only',
    'nro': 'Non-Residential Only',
    'm'  : 'Mixed Residential and Non-residential',
    'nc' : 'No Customer'
}

# plc_seg_code: plc_seg_name
# 1: Residential
# 2: Office
# 3: Warehouse
# 4: Other   A closer look at the trip table indicated most of these places are non-residential.
# 5: Retail and Restaurant
# 6: Gas
# 7: Industrial, Agriculture, or Construction
# 8: Truck Terminal or Parking

lst_res = [1]
lst_non_res = [2, 3, 4, 5, 6, 7, 8]

# Logic: if no relevant place is visited: No Customer;
#        else if every visited place is res: Residential Only;
#        else if there's at least one residential place that has been visited: Mixed;
#        else: Non-Residential Only.

def identify_customer(lrvp):  # lrvp is the list of relevant, visited places.    
    if not lrvp: return 'nc'  # if input list is empty, no customer has been served, or no
                              # good has been delivered or picked up.
    result = 'ro'
    for plc in lrvp:
        if plc not in lst_res:
            result = ''
    if result == 'ro': return result
    result = 'm'
    for plc in lrvp:
        if plc not in lst_non_res:
            return result
    result = 'nro'
    return result
    

for i, route in df_routes.iterrows():
    acts = route['act_seg_seq']
    plcs = route['plc_seg_seq']
    plcs_cleaned = []
    for j, act in enumerate(acts):
        if act in ['S', 'P', 'D']: plcs_cleaned.append(plcs[j])
    cstmr = identify_customer(plcs_cleaned)
    df_routes.at[i, 'customer_type'] = dic_cstmr_typ[cstmr]

In [110]:
df_routes[['act_seg_seq', 'plc_seg_seq', 'customer_type']]

Unnamed: 0,act_seg_seq,plc_seg_seq,customer_type
0,"[H, P, D, D, P, D, D, D, D, D, P, D, D, D, H]","[1, 5, 1, 1, 5, 2, 1, 1, 1, 2, 5, 1, 1, 2, 1]",Mixed Residential and Non-residential
1,"[B, S, B, P, D, S, H, P, D, D, D, D, D, D, D, ...","[1, 2, 1, 5, 6, 5, 1, 2, 1, 1, 1, 1, 1, 1, 1, ...",Mixed Residential and Non-residential
2,"[B, P, D, M, B]","[1, 2, 1, 5, 1]",Mixed Residential and Non-residential
3,"[B, B, D, P, D, P, M, M, P, H]","[1, 1, 3, 5, 2, 5, 6, 5, 5, 1]",Non-Residential Only
4,"[B, M, M, M, M, M, P, D, P, D, P, D, D, D, P, ...","[1, 4, 4, 1, 5, 1, 5, 4, 5, 2, 5, 1, 1, 1, 5, ...",Mixed Residential and Non-residential
5,"[B, P, D, B, P, D, M, M, B]","[1, 5, 5, 1, 5, 2, 4, 1, 1]",Non-Residential Only
6,"[B, D, D, D, D, D, D, M, D, D, D, D, D, D, D, ...","[3, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1, ...",Mixed Residential and Non-residential
7,"[S, P, M, P, P, P, P, D, D, D, P, B, M, P, D, ...","[5, 5, 5, 5, 5, 5, 5, 1, 1, 5, 5, 1, 5, 5, 1, ...",Mixed Residential and Non-residential
8,"[H, M, S, M, M, D, M, P, M]","[1, 2, 5, 5, 6, 2, 1, 5, 1]",Non-Residential Only
9,"[B, S, P, D, D, D, P, D, M]","[1, 1, 5, 1, 2, 2, 2, 1, 1]",Mixed Residential and Non-residential


In [111]:
# Check how many of the routes go outside of the SANDAG TAZs.
a = len(df_routes)
b = len(df_routes.loc[pd.isnull(df_routes['tot_distance'])])
print(f'{b} routes out of {a:,} routes have trips that either start or end in an external TAZ.')
print("'tot_distance' field of these routes has been marked as 'NA'.")

62 routes out of 822 routes have trips that either start or end in an external TAZ.
'tot_distance' field of these routes has been marked as 'NA'.


In [112]:
# Determine what is the point of return of the route. It's either base or home.
for i, row in df_routes.iterrows():
    counts_act = collections.Counter(row['act_seg_seq'])
#     counts_plc = collections.Counter(row['plc_seg_seq'])
    if counts_act['B'] >= counts_act['H'] and counts_act['B'] > 0:
        df_routes.at[i, 'headquarters'] = 'B'
    elif counts_act['H'] > counts_act['B']:
        df_routes.at[i, 'headquarters'] = 'H'
#     elif counts_plc[3] >= counts_plc[8] and counts_plc[3] > 0:
#         df_routes.at[i, 'headquarters'] = 'Warehouse'
#     elif counts_plc[8] > counts_act[3]:
#         df_routes.at[i, 'headquarters'] = 'Truck Terminal'
    else:
        df_routes.at[i, 'headquarters'] = 'Unknown'
df_routes.groupby('headquarters').size().reset_index(name='count')

Unnamed: 0,headquarters,count
0,B,438
1,H,214
2,Unknown,170


In [113]:
df_routes.loc[df_routes['headquarters']=='Unknown'].head()

Unnamed: 0,route_id,industry_group,veh_type,trip_count,g_stops,s_stops,m_stops,b_stops,h_stops,primary_purp,customer_type,start_tod,end_tod,route_dur_hr,cumlv_dur,durations_match,start_activity,end_activity,start_plc_seg,end_plc_seg,act_seg_seq,plc_seg_seq,headquarters,hq_taz,tot_distance,company_id,vehicle_id,driver_id,participation_type,expnsn_factor,estab_taz,company_name,trips
18,202211212207,TNC_Retail,LCV,20,18,0,2,0,0,Goods,Mixed Residential and Non-residential,2022-11-21 17:46:00,2022-11-21 22:59:00,,,,S,M,Retail and Restaurant,Retail and Restaurant,"[S, D, P, D, P, D, P, D, P, D, P, D, P, D, P, ...","[5, 1, 5, 1, 5, 1, 5, 1, 5, 1, 5, 1, 5, 1, 5, ...",Unknown,0,46.6,600057,2207,2105,Smartphone,1.0,4370,Spark,"[<__main__.Trip object at 0x00000199806C5988>,..."
39,202211292276,TNC_Restaurant,LCV,16,16,0,0,0,0,Goods,Mixed Residential and Non-residential,2022-11-29 10:40:00,2022-11-29 13:36:00,,,,M,D,Other,Residential,"[M, P, D, P, D, D, D, P, D, P, D, D, D, P, D, ...","[4, 5, 1, 5, 1, 2, 1, 5, 1, 5, 1, 1, 1, 5, 1, ...",Unknown,0,44.0,600092,2276,2805,Smartphone,1.0,2828,Grub Hub,"[<__main__.Trip object at 0x00000199806DCB08>,..."
41,202210021766,TNC_Retail,LCV,13,9,1,3,0,0,Goods,Mixed Residential and Non-residential,2022-10-02 11:17:00,2022-10-02 21:34:00,,,,M,M,Residential,Residential,"[M, M, S, M, P, D, P, D, P, D, P, D, D, M]","[1, 5, 1, 1, 5, 1, 5, 1, 5, 1, 5, 1, 1, 1]",Unknown,0,27.5,600099,1766,2219,Smartphone,1.0,582,Instacart,"[<__main__.Trip object at 0x00000199D29D6B08>,..."
48,202209301759,TNC_Restaurant,LCV,13,10,0,3,0,0,Goods,Mixed Residential and Non-residential,2022-09-30 14:27:08,2022-09-30 20:56:00,,,,M,D,Other,Residential,"[M, M, M, P, D, M, P, D, D, D, P, D, P, D]","[4, 5, 5, 5, 1, 5, 5, 1, 1, 1, 5, 1, 5, 1]",Unknown,0,36.5,600109,1759,2225,Smartphone,1.0,98,Door Dash,"[<__main__.Trip object at 0x0000019981CAA508>,..."
52,202210101849,TNC_Retail,LCV,21,11,2,8,0,0,Goods,Mixed Residential and Non-residential,2022-10-10 13:00:00,2022-10-10 21:21:00,,,,M,M,Office,Residential,"[M, P, P, D, S, P, P, D, M, M, M, M, P, M, D, ...","[2, 5, 5, 1, 1, 5, 5, 2, 2, 1, 5, 5, 5, 1, 1, ...",Unknown,0,,600115,1849,2231,Smartphone,1.0,-1,Go Puff,"[<__main__.Trip object at 0x0000019981CA4F48>,..."


In [114]:
def find_hq_taz(hq, lst_act, lst_trips):
    if hq in 'BH':
        pos = lst_act.index(hq)
        if pos == 0:
            return lst_trips[0].o_taz
        else:            
            return lst_trips[pos-1].d_taz
    else:
        return -1
    
df_routes['hq_taz'] = df_routes.apply(lambda x: find_hq_taz(x.headquarters, x.act_seg_seq, x.trips), axis=1)
df_routes.head()

Unnamed: 0,route_id,industry_group,veh_type,trip_count,g_stops,s_stops,m_stops,b_stops,h_stops,primary_purp,customer_type,start_tod,end_tod,route_dur_hr,cumlv_dur,durations_match,start_activity,end_activity,start_plc_seg,end_plc_seg,act_seg_seq,plc_seg_seq,headquarters,hq_taz,tot_distance,company_id,vehicle_id,driver_id,participation_type,expnsn_factor,estab_taz,company_name,trips
0,202208291371,TNC_Restaurant,LCV,14,13,0,0,0,1,Goods,Mixed Residential and Non-residential,2022-08-29 13:39:00,2022-08-29 20:25:00,,,,H,H,Residential,Residential,"[H, P, D, D, P, D, D, D, D, D, P, D, D, D, H]","[1, 5, 1, 1, 5, 2, 1, 1, 1, 2, 5, 1, 1, 2, 1]",H,2958,105.4,600026,1371,1744,Smartphone,1.0,2958,Uber Eats,"[<__main__.Trip object at 0x0000019980481288>,..."
1,202209011443,TNC_NonRestRetl,LCV,26,18,3,1,3,1,Goods,Mixed Residential and Non-residential,2022-09-01 09:51:00,2022-09-01 21:49:00,,,,B,B,Residential,Residential,"[B, S, B, P, D, S, H, P, D, D, D, D, D, D, D, ...","[1, 2, 1, 5, 6, 5, 1, 2, 1, 1, 1, 1, 1, 1, 1, ...",B,1787,101.5,600027,1443,1745,Smartphone,1.0,1787,Amazon Flex,"[<__main__.Trip object at 0x00000199804819C8>,..."
2,202208311395,TNC_NonRestRetl,LCV,4,2,0,1,1,0,Goods,Mixed Residential and Non-residential,2022-08-31 11:05:00,2022-08-31 12:17:00,,,,B,B,Residential,Residential,"[B, P, D, M, B]","[1, 2, 1, 5, 1]",B,3502,15.1,600028,1395,1746,Smartphone,1.0,3502,Senpex,"[<__main__.Trip object at 0x00000199804850C8>,..."
3,202208301393,TNC_Restaurant,LCV,9,5,0,2,1,1,Goods,Non-Residential Only,2022-08-30 16:00:00,2022-08-30 19:17:00,,,,B,H,Residential,Residential,"[B, B, D, P, D, P, M, M, P, H]","[1, 1, 3, 5, 2, 5, 6, 5, 5, 1]",B,2538,40.8,600029,1393,1747,Smartphone,1.0,2538,Grub Hub,"[<__main__.Trip object at 0x00000199874E9FC8>,..."
4,202209021451,TNC_Restaurant,LCV,30,18,1,8,2,1,Goods,Mixed Residential and Non-residential,2022-09-02 08:57:00,2022-09-02 20:11:57,,,,B,B,Residential,Residential,"[B, M, M, M, M, M, P, D, P, D, P, D, D, D, P, ...","[1, 4, 4, 1, 5, 1, 5, 4, 5, 2, 5, 1, 1, 1, 5, ...",B,1207,65.5,600031,1451,1857,Smartphone,1.0,1207,Uber Eats,"[<__main__.Trip object at 0x00000199874EAFC8>,..."


##### Now that headquarters of routes are known, go back to df_trips and update distance to headquarters for each trip.

In [115]:
# Create a dictionary that gets route_id and returns hq_taz.
dic_rid_hq_taz = dict(zip(df_routes.route_id, df_routes.hq_taz))

# Specify hq_taz of trips in the trips dataframe.
def omit_dashes(s):
    return s.replace('-', '')

df_trips['route_id'] = df_trips['travel_date'].astype(str).str[:10].apply(omit_dashes).astype('int64') * 10000
df_trips['route_id'] += df_trips.vehicle_id
df_trips.hq_taz = df_trips.route_id.map(dic_rid_hq_taz)

df_trips[['travel_date', 'vehicle_id', 'route_id', 'hq_taz']].head()

Unnamed: 0,travel_date,vehicle_id,route_id,hq_taz
0,2022-08-29,1371,202208291371,2958
1,2022-08-29,1371,202208291371,2958
2,2022-08-29,1371,202208291371,2958
3,2022-08-29,1371,202208291371,2958
4,2022-08-29,1371,202208291371,2958


In [116]:
# Add route primary purposes to trips
dic_rte_purp = dict(zip(df_routes.route_id, df_routes.primary_purp))
df_trips['route_purpose'] = df_trips.route_id.map(dic_rte_purp)

In [117]:
# Add route primary customer type to trips
dic_rte_cust = dict(zip(df_routes.route_id, df_routes.customer_type)) 
df_trips['route_customers'] = df_trips.route_id.map(dic_rte_cust)

In [118]:
# Update distance to headquarters.
df_trips['orgn_to_hq_dist'] = df_trips.apply(lambda x: find_dist(x['o_taz'], x['hq_taz'],
                                                           dic_v[x['veh_type']],
                                                           dic_tod[x['tod']]), axis=1)

In [119]:
df_trips.head()

Unnamed: 0,company_id,industry_group,vehicle_id,driver_id,trip_number,o_act_seg,o_act_seg_name,o_plc_seg,o_plc_seg_name,d_act_seg,d_act_seg_name,d_plc_seg,d_plc_seg_name,o_act,o_place_type,o_place_name,o_address,o_lon,o_lat,o_taz,d_act,d_place_type,d_place_name,d_address,d_lon,d_lat,d_taz,trip_dist,headquarters,hq_taz,orgn_to_hq_dist,travel_time,stop_duration,veh_type,cargo_pickup,cargo_delivery,travel_date,participation_type,expnsn_factor,estab_taz,company_name,o_dt,d_at,nt_dt,last_trip,tod,toll_in_cents_from_base,gen_tt_from_base,route_id,route_purpose,route_customers
0,600026,TNC_Restaurant,1371,1744,1,H,Home,1,Residential,P,Goods_Pickup,5,Retail and Restaurant,14.0,11.0,HOME,1052 Woodlawn Ave,-117.086283,32.610644,2958,6,6,WALMART,13487 Camino Canada,-116.901892,32.822468,4617,23.9,,2958,0.1,24,30,LCV,5.0,,2022-08-29,Smartphone,1.0,2958,Uber Eats,2022-08-29 13:39:00,2022-08-29 14:03:00,2022-08-29 14:33:00,False,MD,0.0,28.29,202208291371,Goods,Mixed Residential and Non-residential
1,600026,TNC_Restaurant,1371,1744,2,P,Goods_Pickup,5,Retail and Restaurant,D,Goods_Delivery,1,Residential,6.0,6.0,WALMART,13487 Camino Canada,-116.901892,32.822468,4617,5,11,HOUSE,12651 Julian Ave,-116.91758,32.855417,4564,2.7,,2958,24.7,6,3,LCV,,10.0,2022-08-29,Smartphone,1.0,2958,Uber Eats,2022-08-29 14:33:00,2022-08-29 14:39:00,2022-08-29 14:42:00,False,MD,0.0,31.61,202208291371,Goods,Mixed Residential and Non-residential
2,600026,TNC_Restaurant,1371,1744,3,D,Goods_Delivery,1,Residential,D,Goods_Delivery,1,Residential,5.0,11.0,HOUSE,12651 Julian Ave,-116.91758,32.855417,4564,5,11,DROP OFF CUSTOMER,12143 Rockcrest Rd,-116.92999,32.846187,4508,1.8,,2958,26.2,12,7,LCV,,10.0,2022-08-29,Smartphone,1.0,2958,Uber Eats,2022-08-29 14:42:00,2022-08-29 14:54:03,2022-08-29 15:01:21,False,MD,0.0,31.09,202208291371,Goods,Mixed Residential and Non-residential
3,600026,TNC_Restaurant,1371,1744,4,D,Goods_Delivery,1,Residential,P,Goods_Pickup,5,Retail and Restaurant,5.0,11.0,DROP OFF CUSTOMER,12143 Rockcrest Rd,-116.92999,32.846187,4508,6,5,RESTAURANT,12038 Woodside Ave,-116.932653,32.856083,4465,1.1,,2958,26.1,7,3,LCV,5.0,,2022-08-29,Smartphone,1.0,2958,Uber Eats,2022-08-29 15:01:21,2022-08-29 15:09:00,2022-08-29 15:12:00,False,MD,0.0,29.53,202208291371,Goods,Mixed Residential and Non-residential
4,600026,TNC_Restaurant,1371,1744,5,P,Goods_Pickup,5,Retail and Restaurant,D,Goods_Delivery,2,Office,6.0,5.0,RESTAURANT,12038 Woodside Ave,-116.932653,32.856083,4465,5,1,OFFICE,11980 Woodside Ave UNIT 5,-116.934696,32.855876,4465,0.1,,2958,25.2,3,4,LCV,,5.0,2022-08-29,Smartphone,1.0,2958,Uber Eats,2022-08-29 15:12:00,2022-08-29 15:15:00,2022-08-29 15:19:00,False,MD,0.0,29.53,202208291371,Goods,Mixed Residential and Non-residential


In [120]:
print(df_trips.groupby(['route_purpose']).size().to_string(), "\n")
print(df_trips.groupby(['route_customers']).size().to_string())

route_purpose
Goods                5217
Maintenance/Other      26
Service                10 

route_customers
Mixed Residential and Non-residential    4314
No Customer                                 8
Non-Residential Only                      780
Residential Only                          151


####  Run a Few Checks for Quality Control of the Route Dataframe

In [121]:
# Find the route start to end duration in minutes.
df_routes['route_dur_min'] = ((df_routes['end_tod'] - df_routes['start_tod'])
                             .dt.total_seconds()/60).round()

# Find the route duration in minutes based on its trips and stays durations.
for i, route in df_routes.iterrows():
    duration = 0
    trips = route['trips']
    for j, t in enumerate(trips):
        if j != len(trips)-1:
            duration += (t.travel_time + t.stop_duration)
        else:
            duration += t.travel_time
    duration = round(duration)
    df_routes.at[i, 'cumlv_dur'] = duration
    
    # Specify if route durations from the two methods are off by more than tolerance minutes.
    tol = 15
    if abs(duration - route['route_dur_min']) >= tol:
        df_routes.at[i, 'durations_match'] = False
    else:
        df_routes.at[i, 'durations_match'] = True

misses = len(df_routes.loc[df_routes['durations_match'] == False])
matches = len(df_routes.loc[df_routes['durations_match'] != False])
print(f"Number of routes with duration matches = {matches}; misses = {misses} (tolerance = {tol} minutes)")

Number of routes with duration matches = 802; misses = 20 (tolerance = 15 minutes)


In [124]:
df_routes.loc[df_routes['durations_match'] == False].head()

Unnamed: 0,route_id,industry_group,veh_type,trip_count,g_stops,s_stops,m_stops,b_stops,h_stops,primary_purp,customer_type,start_tod,end_tod,multiday_route,route_dur_hr,cumlv_dur,durations_match,start_activity,end_activity,start_plc_seg,end_plc_seg,act_seg_seq,plc_seg_seq,headquarters,hq_taz,tot_distance,company_id,vehicle_id,driver_id,participation_type,expnsn_factor,estab_taz,company_name,trips,route_dur_min
6,202212141506,TNC_Retail,LCV,74,72,0,2,0,0,Goods,Mixed Residential and Non-residential,2022-12-14 10:07:00,2022-12-14 18:02:40,False,,439,False,B,D,Warehouse,Residential,"[B, D, D, D, D, D, D, M, D, D, D, D, D, D, D, ...","[3, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1, ...",B,2377,64.8,600035,1506,2959,Smartphone,1.0,2377,Shipt,"[<__main__.Trip object at 0x00000199874F1F08>,...",476.0
61,202211252184,TNC_Restaurant,LCV,39,37,0,2,0,0,Goods,Residential Only,2022-11-25 12:03:19,2022-11-25 23:50:45,False,,687,False,H,D,Residential,Residential,"[H, M, M, D, D, D, D, D, D, D, D, D, D, D, D, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",H,2824,129.7,600132,2184,2674,Smartphone,1.0,3556,Uber Eats,"[<__main__.Trip object at 0x0000019981C91508>,...",707.0
78,202210111964,TNC_NonRestRetl,LCV,60,57,0,2,0,1,Goods,Mixed Residential and Non-residential,2022-10-11 11:35:00,2022-10-11 20:43:00,False,,515,False,B,H,Retail and Restaurant,Residential,"[B, M, D, D, D, D, D, P, D, D, D, D, D, D, D, ...","[5, 2, 3, 2, 2, 2, 2, 2, 1, 2, 2, 2, 1, 1, 2, ...",B,2494,84.5,600159,1964,2312,Smartphone,1.0,2494,Amazon,"[<__main__.Trip object at 0x00000199D2999B48>,...",548.0
103,202211252242,TNC_NonRestRetl,LCV,47,44,0,3,0,0,Goods,Mixed Residential and Non-residential,2022-11-25 09:51:00,2022-11-25 16:03:00,False,,349,False,H,M,Residential,Residential,"[H, P, D, D, D, D, D, D, D, D, D, D, P, D, D, ...","[1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, ...",H,1469,21.9,600192,2242,2768,Smartphone,1.0,1469,Amazon Flex,"[<__main__.Trip object at 0x00000199D297E588>,...",372.0
128,202211172164,TNC_Restaurant,LCV,29,17,0,8,1,3,Goods,Mixed Residential and Non-residential,2022-11-17 09:20:00,2022-11-17 00:23:02,False,,901,False,B,B,Residential,Residential,"[B, M, P, M, D, D, D, P, D, P, D, H, M, M, M, ...","[1, 2, 5, 6, 1, 1, 1, 5, 1, 5, 1, 1, 2, 2, 1, ...",H,2614,91.5,600242,2164,2683,Smartphone,1.0,2614,Uber Eats,"[<__main__.Trip object at 0x00000199D2AB92C8>,...",-537.0


In [125]:
# Determine if the route has extended beyond the first day. This does not require that the duration of a route is more
# than 24 hours, rather if the first departure is in one calendar day while the last arrival is in another day, the
# route will be flagged as Multi-Day.
try:
    df_routes.insert(13, 'multiday_route', False)
except:
    pass
df_routes['multiday_route'] = df_routes['start_tod'].dt.date < df_routes['end_tod'].dt.date    

In [126]:
df_routes.loc[df_routes['multiday_route']==True].head()

Unnamed: 0,route_id,industry_group,veh_type,trip_count,g_stops,s_stops,m_stops,b_stops,h_stops,primary_purp,customer_type,start_tod,end_tod,multiday_route,route_dur_hr,cumlv_dur,durations_match,start_activity,end_activity,start_plc_seg,end_plc_seg,act_seg_seq,plc_seg_seq,headquarters,hq_taz,tot_distance,company_id,vehicle_id,driver_id,participation_type,expnsn_factor,estab_taz,company_name,trips,route_dur_min
24,202209291727,TNC_Restaurant,LCV,7,2,0,4,0,1,Goods,Mixed Residential and Non-residential,2022-09-29 13:49:23,2022-09-30 00:00:00,True,,609,True,H,H,Residential,Residential,"[H, M, M, P, D, M, M, H]","[1, 5, 5, 5, 1, 1, 1, 1]",H,2480,22.0,600069,1727,2183,Smartphone,1.0,2620,Uber Eats,"[<__main__.Trip object at 0x00000199806D1348>,...",611.0
60,202211142158,TNC_Restaurant,LCV,25,12,0,11,0,2,Goods,Mixed Residential and Non-residential,2022-11-14 17:35:57,2022-11-15 00:17:00,True,,395,True,M,H,Residential,Residential,"[M, M, M, D, P, D, P, M, M, M, P, M, M, P, D, ...","[1, 5, 4, 1, 5, 1, 5, 4, 4, 4, 5, 5, 4, 5, 1, ...",H,1623,90.2,600128,2158,2250,Smartphone,1.0,1623,Door Dash,"[<__main__.Trip object at 0x0000019981C96708>,...",401.0
70,202210061820,TNC_Restaurant,LCV,14,8,0,4,2,0,Goods,Mixed Residential and Non-residential,2022-10-06 12:24:23,2022-10-07 00:00:00,True,,693,True,H,B,Residential,Residential,"[H, P, D, M, P, D, P, D, M, M, P, D, M, B, B]","[1, 5, 5, 1, 5, 2, 5, 1, 4, 4, 5, 1, 1, 1, 1]",B,2480,45.9,600147,1820,2282,Smartphone,1.0,2480,Door Dash,"[<__main__.Trip object at 0x00000199D2990788>,...",696.0
145,202211172173,TNC_Retail,LCV,6,2,0,4,0,0,Goods,Mixed Residential and Non-residential,2022-11-17 08:37:00,2022-11-18 00:00:00,True,,923,True,H,M,Residential,Residential,"[H, P, D, M, M, M, M]","[1, 5, 1, 1, 5, 1, 1]",H,2480,20.8,600282,2173,2696,Smartphone,1.0,927,Point Pick Up,"[<__main__.Trip object at 0x00000199D2855F48>,...",923.0
238,202211292275,TNC_Retail,LCV,9,4,0,5,0,0,Goods,Mixed Residential and Non-residential,2022-11-29 10:04:00,2022-11-30 00:27:00,True,,863,True,H,M,Other,Other,"[H, M, M, P, D, M, P, M, D, M]","[4, 4, 6, 2, 2, 4, 5, 5, 1, 4]",H,2444,216.8,600401,2275,2804,Smartphone,1.0,1627,Point Pick Up,"[<__main__.Trip object at 0x00000199D28DE148>,...",863.0


In [127]:
# Add a column that indicates if the route has any warning.
df_routes['has_warning'] = 0

In [128]:
# Identify routes that vehicle goes from Base to Base, or it goes from Home to Home.
for i, route in df_routes.iterrows():
    flag = False
    acts = route['act_seg_seq']
    act = acts[0]
    for next_act in acts[1:]:
        if (act=='H' and next_act=='H') or (act=='B' and next_act=='B'):
            flag = True
            break
        act = next_act
    df_routes.at[i, 'warn_BB_or_HH'] = flag*1
df_routes['warn_BB_or_HH'] = df_routes['warn_BB_or_HH'].astype('int8')

In [129]:
# Add warning columns.    
    
# Activity at the route origin is to deliver/pickup goods:
df_routes['warn_o_act_G'] = df_routes['act_seg_seq'].apply(lambda l: 1 if l[0]=='G' else 0)

# Activity at the route origin is to provide services:
df_routes['warn_o_act_S'] = df_routes['act_seg_seq'].apply(lambda l: 1 if l[0]=='S' else 0)

# Activity at the route destination is either to provide service or deliver/pickup goods:
df_routes['warn_d_act_SG'] = df_routes['act_seg_seq'].apply(lambda l: 1 if l[-1] in ['G', 'S'] else 0)

# Route trips extend beyond 12AM of the first day:
df_routes['warn_next_day'] = df_routes['multiday_route'].astype(int)

# Route duration time calculated from start and end of the tour doesn't match with the one that is calculated by
# summing trip travel times and stop durations:
df_routes['warn_duration'] = 1 - df_routes['durations_match'].astype(int)

# Neither Base nor Home appears in the route activities, including the route origin:
df_routes['warn_no_BH_stops'] = df_routes['act_seg_seq'].apply(lambda l: 0 if ('B' in l or 'H' in l) else 1)

# Neither Goods nor Services appears in the route activities, including the route origin:
df_routes['warn_no_GS_stops'] = df_routes['act_seg_seq'].apply(lambda l: 0 if ('G' in l or 'S' in l) else 1)

# None of Goods, Services, or Maintenance appears in the route activities, including the route origin:
df_routes['warn_no_GSM_stops'] = df_routes['act_seg_seq'].apply(lambda l: 0 if ('G' in l or 'S' in l or 'M' in l) else 1)
df_routes.head()

Unnamed: 0,route_id,industry_group,veh_type,trip_count,g_stops,s_stops,m_stops,b_stops,h_stops,primary_purp,customer_type,start_tod,end_tod,multiday_route,route_dur_hr,cumlv_dur,durations_match,start_activity,end_activity,start_plc_seg,end_plc_seg,act_seg_seq,plc_seg_seq,headquarters,hq_taz,tot_distance,company_id,vehicle_id,driver_id,participation_type,expnsn_factor,estab_taz,company_name,trips,route_dur_min,has_warning,warn_BB_or_HH,warn_o_act_G,warn_o_act_S,warn_d_act_SG,warn_next_day,warn_duration,warn_no_BH_stops,warn_no_GS_stops,warn_no_GSM_stops
0,202208291371,TNC_Restaurant,LCV,14,13,0,0,0,1,Goods,Mixed Residential and Non-residential,2022-08-29 13:39:00,2022-08-29 20:25:00,False,,404,True,H,H,Residential,Residential,"[H, P, D, D, P, D, D, D, D, D, P, D, D, D, H]","[1, 5, 1, 1, 5, 2, 1, 1, 1, 2, 5, 1, 1, 2, 1]",H,2958,105.4,600026,1371,1744,Smartphone,1.0,2958,Uber Eats,"[<__main__.Trip object at 0x0000019980481288>,...",406.0,0,0,0,0,0,0,0,0,1,1
1,202209011443,TNC_NonRestRetl,LCV,26,18,3,1,3,1,Goods,Mixed Residential and Non-residential,2022-09-01 09:51:00,2022-09-01 21:49:00,False,,718,True,B,B,Residential,Residential,"[B, S, B, P, D, S, H, P, D, D, D, D, D, D, D, ...","[1, 2, 1, 5, 6, 5, 1, 2, 1, 1, 1, 1, 1, 1, 1, ...",B,1787,101.5,600027,1443,1745,Smartphone,1.0,1787,Amazon Flex,"[<__main__.Trip object at 0x00000199804819C8>,...",718.0,0,0,0,0,0,0,0,0,0,0
2,202208311395,TNC_NonRestRetl,LCV,4,2,0,1,1,0,Goods,Mixed Residential and Non-residential,2022-08-31 11:05:00,2022-08-31 12:17:00,False,,72,True,B,B,Residential,Residential,"[B, P, D, M, B]","[1, 2, 1, 5, 1]",B,3502,15.1,600028,1395,1746,Smartphone,1.0,3502,Senpex,"[<__main__.Trip object at 0x00000199804850C8>,...",72.0,0,0,0,0,0,0,0,0,1,0
3,202208301393,TNC_Restaurant,LCV,9,5,0,2,1,1,Goods,Non-Residential Only,2022-08-30 16:00:00,2022-08-30 19:17:00,False,,196,True,B,H,Residential,Residential,"[B, B, D, P, D, P, M, M, P, H]","[1, 1, 3, 5, 2, 5, 6, 5, 5, 1]",B,2538,40.8,600029,1393,1747,Smartphone,1.0,2538,Grub Hub,"[<__main__.Trip object at 0x00000199874E9FC8>,...",197.0,0,1,0,0,0,0,0,0,1,0
4,202209021451,TNC_Restaurant,LCV,30,18,1,8,2,1,Goods,Mixed Residential and Non-residential,2022-09-02 08:57:00,2022-09-02 20:11:57,False,,669,True,B,B,Residential,Residential,"[B, M, M, M, M, M, P, D, P, D, P, D, D, D, P, ...","[1, 4, 4, 1, 5, 1, 5, 4, 5, 2, 5, 1, 1, 1, 5, ...",B,1207,65.5,600031,1451,1857,Smartphone,1.0,1207,Uber Eats,"[<__main__.Trip object at 0x00000199874EAFC8>,...",675.0,0,0,0,0,0,0,0,0,0,0


In [130]:
# Fill the has_warning column.
warn_cols = [c for c in df_routes.columns if 'warn' in c]
df_routes['has_warning'] = df_routes[warn_cols].max(axis=1)
# df_routes.head(20)

#### Explore the Route Dataframe

##### OPTIONAL: Create a cumulative distribution function for route durations.

##### OPTIONAL: Create a graph for starting/ending activity combinations.
Note: Starting activity is the activity at the origin of the first trip, not at its destination.

#### Report Out the Trip Dataframe

In [131]:
print(f'There are {len(df_trips):,} trips in the dataframe.')

There are 5,253 trips in the dataframe.


In [132]:
df_trips.head()

Unnamed: 0,company_id,industry_group,vehicle_id,driver_id,trip_number,o_act_seg,o_act_seg_name,o_plc_seg,o_plc_seg_name,d_act_seg,d_act_seg_name,d_plc_seg,d_plc_seg_name,o_act,o_place_type,o_place_name,o_address,o_lon,o_lat,o_taz,d_act,d_place_type,d_place_name,d_address,d_lon,d_lat,d_taz,trip_dist,headquarters,hq_taz,orgn_to_hq_dist,travel_time,stop_duration,veh_type,cargo_pickup,cargo_delivery,travel_date,participation_type,expnsn_factor,estab_taz,company_name,o_dt,d_at,nt_dt,last_trip,tod,toll_in_cents_from_base,gen_tt_from_base,route_id,route_purpose,route_customers
0,600026,TNC_Restaurant,1371,1744,1,H,Home,1,Residential,P,Goods_Pickup,5,Retail and Restaurant,14.0,11.0,HOME,1052 Woodlawn Ave,-117.086283,32.610644,2958,6,6,WALMART,13487 Camino Canada,-116.901892,32.822468,4617,23.9,,2958,0.1,24,30,LCV,5.0,,2022-08-29,Smartphone,1.0,2958,Uber Eats,2022-08-29 13:39:00,2022-08-29 14:03:00,2022-08-29 14:33:00,False,MD,0.0,28.29,202208291371,Goods,Mixed Residential and Non-residential
1,600026,TNC_Restaurant,1371,1744,2,P,Goods_Pickup,5,Retail and Restaurant,D,Goods_Delivery,1,Residential,6.0,6.0,WALMART,13487 Camino Canada,-116.901892,32.822468,4617,5,11,HOUSE,12651 Julian Ave,-116.91758,32.855417,4564,2.7,,2958,24.7,6,3,LCV,,10.0,2022-08-29,Smartphone,1.0,2958,Uber Eats,2022-08-29 14:33:00,2022-08-29 14:39:00,2022-08-29 14:42:00,False,MD,0.0,31.61,202208291371,Goods,Mixed Residential and Non-residential
2,600026,TNC_Restaurant,1371,1744,3,D,Goods_Delivery,1,Residential,D,Goods_Delivery,1,Residential,5.0,11.0,HOUSE,12651 Julian Ave,-116.91758,32.855417,4564,5,11,DROP OFF CUSTOMER,12143 Rockcrest Rd,-116.92999,32.846187,4508,1.8,,2958,26.2,12,7,LCV,,10.0,2022-08-29,Smartphone,1.0,2958,Uber Eats,2022-08-29 14:42:00,2022-08-29 14:54:03,2022-08-29 15:01:21,False,MD,0.0,31.09,202208291371,Goods,Mixed Residential and Non-residential
3,600026,TNC_Restaurant,1371,1744,4,D,Goods_Delivery,1,Residential,P,Goods_Pickup,5,Retail and Restaurant,5.0,11.0,DROP OFF CUSTOMER,12143 Rockcrest Rd,-116.92999,32.846187,4508,6,5,RESTAURANT,12038 Woodside Ave,-116.932653,32.856083,4465,1.1,,2958,26.1,7,3,LCV,5.0,,2022-08-29,Smartphone,1.0,2958,Uber Eats,2022-08-29 15:01:21,2022-08-29 15:09:00,2022-08-29 15:12:00,False,MD,0.0,29.53,202208291371,Goods,Mixed Residential and Non-residential
4,600026,TNC_Restaurant,1371,1744,5,P,Goods_Pickup,5,Retail and Restaurant,D,Goods_Delivery,2,Office,6.0,5.0,RESTAURANT,12038 Woodside Ave,-116.932653,32.856083,4465,5,1,OFFICE,11980 Woodside Ave UNIT 5,-116.934696,32.855876,4465,0.1,,2958,25.2,3,4,LCV,,5.0,2022-08-29,Smartphone,1.0,2958,Uber Eats,2022-08-29 15:12:00,2022-08-29 15:15:00,2022-08-29 15:19:00,False,MD,0.0,29.53,202208291371,Goods,Mixed Residential and Non-residential


In [133]:
# Rearrange to have time-related fields next to each other.
cols = ['industry_group', 'trip_number', 'last_trip',
        'travel_date', 'o_dt', 'tod', 'd_at', 'nt_dt', 'travel_time', 'stop_duration', 
        'o_act_seg', 'o_act_seg_name', 'o_plc_seg', 'o_plc_seg_name',
        'd_act_seg', 'd_act_seg_name', 'd_plc_seg', 'd_plc_seg_name',                
        'o_act', 'o_place_type', 'o_place_name', 'o_address', 'o_lon', 'o_lat', 'o_taz',
        'd_act', 'd_place_type', 'd_place_name', 'd_address', 'd_lon', 'd_lat', 'd_taz',
        'trip_dist',
        'veh_type', 
        'route_id', 'company_id', 'estab_taz', 'vehicle_id', 'driver_id',
        'expnsn_factor', 'participation_type', 'route_purpose', 'route_customers']

if dataset=='TNC':    
    cols.insert(-5, 'company_name')

df_trips = df_trips[cols]
df_trips.head(1)

Unnamed: 0,industry_group,trip_number,last_trip,travel_date,o_dt,tod,d_at,nt_dt,travel_time,stop_duration,o_act_seg,o_act_seg_name,o_plc_seg,o_plc_seg_name,d_act_seg,d_act_seg_name,d_plc_seg,d_plc_seg_name,o_act,o_place_type,o_place_name,o_address,o_lon,o_lat,o_taz,d_act,d_place_type,d_place_name,d_address,d_lon,d_lat,d_taz,trip_dist,veh_type,route_id,company_id,estab_taz,vehicle_id,company_name,driver_id,expnsn_factor,participation_type,route_purpose,route_customers
0,TNC_Restaurant,1,False,2022-08-29,2022-08-29 13:39:00,MD,2022-08-29 14:03:00,2022-08-29 14:33:00,24,30,H,Home,1,Residential,P,Goods_Pickup,5,Retail and Restaurant,14.0,11.0,HOME,1052 Woodlawn Ave,-117.086283,32.610644,2958,6,6,WALMART,13487 Camino Canada,-116.901892,32.822468,4617,23.9,LCV,202208291371,600026,2958,1371,Uber Eats,1744,1.0,Smartphone,Goods,Mixed Residential and Non-residential


In [134]:
out_file = dataset + f'_Trips_{current_date}.xlsx'
out_path = os.path.join(project_path, out_data_dir, out_file)
df_trips.to_excel(out_path, sheet_name='Trip Dataframe', index_label='index')

#### Report Out the Route Dataframe

In [135]:
drop_cols = ['multiday_route', 'cumlv_dur', 'durations_match', 'trips', 'headquarters', 'hq_taz']
df_routes.drop(columns=drop_cols, inplace=True)
df_routes.head(2)

Unnamed: 0,route_id,industry_group,veh_type,trip_count,g_stops,s_stops,m_stops,b_stops,h_stops,primary_purp,customer_type,start_tod,end_tod,route_dur_hr,start_activity,end_activity,start_plc_seg,end_plc_seg,act_seg_seq,plc_seg_seq,tot_distance,company_id,vehicle_id,driver_id,participation_type,expnsn_factor,estab_taz,company_name,route_dur_min,has_warning,warn_BB_or_HH,warn_o_act_G,warn_o_act_S,warn_d_act_SG,warn_next_day,warn_duration,warn_no_BH_stops,warn_no_GS_stops,warn_no_GSM_stops
0,202208291371,TNC_Restaurant,LCV,14,13,0,0,0,1,Goods,Mixed Residential and Non-residential,2022-08-29 13:39:00,2022-08-29 20:25:00,,H,H,Residential,Residential,"[H, P, D, D, P, D, D, D, D, D, P, D, D, D, H]","[1, 5, 1, 1, 5, 2, 1, 1, 1, 2, 5, 1, 1, 2, 1]",105.4,600026,1371,1744,Smartphone,1.0,2958,Uber Eats,406.0,1,0,0,0,0,0,0,0,1,1
1,202209011443,TNC_NonRestRetl,LCV,26,18,3,1,3,1,Goods,Mixed Residential and Non-residential,2022-09-01 09:51:00,2022-09-01 21:49:00,,B,B,Residential,Residential,"[B, S, B, P, D, S, H, P, D, D, D, D, D, D, D, ...","[1, 2, 1, 5, 6, 5, 1, 2, 1, 1, 1, 1, 1, 1, 1, ...",101.5,600027,1443,1745,Smartphone,1.0,1787,Amazon Flex,718.0,0,0,0,0,0,0,0,0,0,0


In [136]:
out_file = dataset + f'_Routes_{current_date}.xlsx'
out_path = os.path.join(project_path, out_data_dir, out_file)
df_routes.to_excel(out_path, sheet_name='Routes', index=False) #  index_label='index'