## SANDAG CV and TNC Survey Analysis
### CV: Commercial Vehicles
### TNC: Transportation Network Companies

#### In the cell below, specify which data set to process:
* 'CV' for Establishment surveys
* 'TNC' for TNC surveys

In [None]:
# Indicate what dataset is to be processed. It should be either 'CV' or 'TNC'.
dataset = 'CV'
# dataset = 'TNC'

# Fail fast on a typo. Later cells branch on this value with different fallbacks
# (some test `== 'TNC'`, others `== 'CV'`), so an unexpected value would silently mix
# CV and TNC inputs instead of raising.
if dataset not in ('CV', 'TNC'):
    raise ValueError(f"dataset must be 'CV' or 'TNC', got {dataset!r}")

# Specify value of time for different vehicle sizes, [Light, Medium, Heavy]. Unit is $ per hour.
vot = [67, 68, 89]

#### Import Libraries 

In [None]:
import os, glob
import pandas as pd
import numpy as np
import openpyxl
import datetime as dt
import collections
import matplotlib.pyplot as plt
import openmatrix as omx
import gc

# from matplotlib.ticker import PercentFormatter
# from scipy import stats  # to get inverse of '.quantile()'

In [None]:
from datetime import datetime

# Today's date as a compact YYYYMMDD string.
current_date = f'{datetime.now():%Y%m%d}'

In [None]:
# Show every column and every row when a dataframe is displayed (None = no limit).
for _display_opt in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_display_opt, None)

#### Define Trip Class

In [None]:
class Trip:
    """Plain record describing one trip between an origin ('o_') and a destination ('d_').

    Every instance appends itself to the class-level ``registry`` list on creation.
    All attributes start from placeholder values (0, '', -1, or 1982-01-01).
    """
    registry = []

    def __init__(self):
        Trip.registry.append(self)
        placeholder_moment = dt.datetime(1982, 1, 1, 0, 0, 0)

        self.index = -1
        # The origin and destination ends carry the same set of fields.
        for end in ('o', 'd'):
            setattr(self, f'{end}_act', 0)
            setattr(self, f'{end}_place_type', 0)
            setattr(self, f'{end}_place_name', '')
            setattr(self, f'{end}_address', '')
            setattr(self, f'{end}_lon', 0)
            setattr(self, f'{end}_lat', 0)
            setattr(self, f'{end}_taz', -1)
        self.trip_dist = 0
        # Origin departure, destination arrival, next-trip departure.
        self.o_dt = self.d_at = self.nt_dt = placeholder_moment
        self.travel_time = 0    # in seconds
        self.stop_duration = 0  # in seconds
        self.cargo_pickup = self.cargo_delivery = -1
        self.travel_date = dt.date(1982, 1, 1)
        self.last_trip = None

#### Set File Paths

In [None]:
# Project root = current working directory with the "\02 Scripts" component removed.
# NOTE(review): the pattern uses a Windows backslash, so on paths written with '/' nothing is
# removed and project_path stays equal to the working directory.
project_path = os.getcwd().replace("\\02 Scripts", "")

# To run the script from a different location, replace the 'project_path' assignment above with
# the project folder address on your computer.
# Input and output folder names, joined onto project_path below.
in_data_dir  = '01 Inputs'
out_data_dir = '03 Outputs'

# Excel workbook holding the lookup tables.
lu_file = 'Lookups_v8.xlsx'
lu_path = os.path.join(project_path, in_data_dir, lu_file)

# NOTE(review): absolute, user-specific folder; presumably the raw OMX skims location - update
# before running on another machine.
skims_dir = r'C:\Users\jgliebe\OneDrive - Cambridge Systematics\Documents - PROJ SANDAG Commercial Vehicle & Heavy Truck Model Update\_Shared_CSTeam\Task03_DataID_Review\ABM3\Skims'

# Pick the survey workbook; any value of 'dataset' other than 'TNC' selects the CV workbook.
if dataset == 'TNC':
    in_data_file = 'TNC Travel Survey_Data Submittal_1-19-23.xlsx'
else:
#     in_data_file = 'SANDAG 2022 CV DataBase & Dictionaires_03_03_2023.xlsx'
    in_data_file = 'SANDAG 2022 CV DataBase, Revised.xlsx'  # Cleaned up some data formatting
in_data_path = os.path.join(project_path, in_data_dir, in_data_file)

#### Read the Lookup Tables

In [None]:
lookups = pd.read_excel(lu_path, header=0, sheet_name=None)

In [None]:
lookups['Segment Codes']

In [None]:
# 'lookup_at' is the same dataframe object stored in 'lookups'; the column below is added in place.
lookup_at = lookups['Activity Type']

# asl = Activity Segment Letter, keyed by activity segment code.
asl = {1: 'S', 2: 'P', 3: 'D', 4: 'M', 5: 'B', 6: 'H'}
lookup_at['act_seg_let'] = lookup_at['act_seg_code'].map(asl)

# activity type code -> (segment letter, segment name); a later duplicate code overwrites an earlier one.
dic_at = {
    act_code: (seg_letter, seg_name)
    for act_code, seg_letter, seg_name in zip(lookup_at['activity_type_code'],
                                              lookup_at['act_seg_let'],
                                              lookup_at['act_seg_name'])
}

lookup_at

In [None]:
lookup_pt = lookups['Place Type']

# dic_pt:  place type code    -> (place segment code, place segment name)
# dic_pt2: place segment code -> place segment name
dic_pt = {}
dic_pt2 = {}
for pt_code, seg_code, seg_name in zip(lookup_pt['place_type_code'],
                                       lookup_pt['plc_seg_code'],
                                       lookup_pt['plc_seg_name']):
    dic_pt[pt_code] = (seg_code, seg_name)
    dic_pt2[seg_code] = seg_name

lookup_pt

In [None]:
lookup_ind = lookups['Industries']
lookup_ind #.head(2)

In [None]:
lookup_tnc = lookups['TNC Categories']

# dic_tnc_names:      company name as recorded -> alternate (standardized) company name
# dic_tnc_categories: alternate company name   -> TNC_IndCat3 category
dic_tnc_names = {}
dic_tnc_categories = {}
for raw_name, alt_name, category in zip(lookup_tnc['company_name'],
                                        lookup_tnc['company_name_alt'],
                                        lookup_tnc['TNC_IndCat3']):
    dic_tnc_names[raw_name] = alt_name
    dic_tnc_categories[alt_name] = category

lookup_tnc

In [None]:
lookup_cv_estab = lookups['CV Estab TAZ']
lookup_cv_estab.head()

In [None]:
lookup_tnc_estab = lookups['TNC Estab TAZ']
# Missing establishment TAZs become the sentinel -1 so the column can be stored as integers.
lookup_tnc_estab['estab_taz'] = lookup_tnc_estab['estab_taz'].fillna(-1).astype(int)
lookup_tnc_estab.head()

In [None]:
lookup_cv_estab_replace = lookups['CV Rt Rplcmnt Estab TAZ']

# route_id is split by position: its first 8 characters are parsed as a date and its last 4
# characters as an integer vehicle number. The two derived columns go right after the first column.
_route_text = lookup_cv_estab_replace['route_id'].astype(str)
lookup_cv_estab_replace.insert(1, 'Date', pd.to_datetime(_route_text.str[:8]))
lookup_cv_estab_replace.insert(2, 'Veh', _route_text.str[-4:].astype(int))

lookup_cv_estab_replace

In [None]:
# Get LogisticsNodes
lookup_logistics = lookups['LogisticsNodes']
lookup_logistics

### Read Skim Matrices -- two methods, choose one
1. Read OMX files, process, and save as pickle files -- do this only if the pickle files do not already exist or new raw data skims are wanted.
2. Read the pickle files directly, assuming they exist. This is 100 times faster.

In [None]:
# Note: There are 4,947 TAZs and 24,321 MGRAs in the shapefiles.
# The OMX files contain info about TAZs, not MGRAs.

# Vehicle size letters and time-of-day labels, both numbered from 1.
dic_veh_size = dict(enumerate(('L', 'M', 'H'), start=1))
dic_tod = dict(enumerate(('EA', 'AM', 'MD', 'PM', 'EV'), start=1))

# Time-of-day periods:
#   EA  Early    3AM    - 6AM
#   AM  AM Peak  6AM    - 9AM
#   MD  Midday   9AM    - 3:30PM
#   PM  PM Peak  3:30PM - 7PM
#   EV  Late     7PM    - 3AM

# fns = file names, one OMX skim file per time of day.
fns = ['traffic_skims_' + tod_label + '.omx' for tod_label in dic_tod.values()]

# Four empty grids that will hold skim dataframes: distance, travel time, toll and generalized
# travel time. In each grid a row is a time of day and a column is a vehicle size (0-based).
n_tod, n_veh_size = len(dic_tod), len(dic_veh_size)
dist_dfs, time_dfs, toll_dfs, g_tt_dfs = (
    [[None] * n_veh_size for _ in range(n_tod)] for _ in range(4)
)

#### Method 1: Read skims from OMX files, process, and save as pickle files. (Slow)
Change below cell to "code" before running

#### Method 2: Read already processed skims from pickle files. (Fast)
Change below cell to "code" before running

In [None]:
# Read skim matrices from pickle files (fast load)

# The labels are spelled out here rather than read from 'dic_tod' / 'dic_veh_size': later cells
# rebind those two names to different mappings, which would break this cell if it were re-run.
skim_tod_labels = ('EA', 'AM', 'MD', 'PM', 'EV')
skim_size_labels = ('L', 'M', 'H')
skim_md_row = skim_tod_labels.index('MD')


def _read_skim(folder, file_name):
    """Load one pickled skim dataframe from '<project>/<outputs>/<folder>/<file_name>'.

    Prints a warning when the folder itself is missing; the read that follows then raises,
    as it did before this helper was factored out.
    """
    folder_path = os.path.join(project_path, out_data_dir, folder)
    if not os.path.exists(folder_path):
        print(f"Input skims file path not found: \n{folder_path}")
    # NOTE: unpickling executes code from the file - only load pickles produced by this project.
    return pd.read_pickle(os.path.join(folder_path, file_name))


for i, tod_label in enumerate(skim_tod_labels):
    for j, size_label in enumerate(skim_size_labels):
        # File names start with a running number 01..15 (time of day major, vehicle size minor).
        stem = f'{3*i+j+1:02}_{tod_label}_{size_label}'
        dist_dfs[i][j] = _read_skim('Distance Skims', f'{stem}_10th_of_miles.pkl')

        # Travel time, toll and generalized travel time are only loaded for the midday period.
        if i == skim_md_row:
            time_dfs[i][j] = _read_skim('Travel Time Skims', f'{stem}_TT_Minutes.pkl')
            toll_dfs[i][j] = _read_skim('Toll Skims', f'{stem}_Toll_Cents.pkl')
            g_tt_dfs[i][j] = _read_skim('Generalized TT Skims', f'{stem}_GenTT_Minutes.pkl')

In [None]:
# Sanity-check printouts of a few skim values. Skim rows/columns are addressed as TAZ number - 1.
# Example 1 previously indexed [320, 344] while its text said TAZ 5 -> TAZ 19; it now reads
# [4, 18] so that the printed label matches the cell that is looked up.
print(f'Example 1: Distance from TAZ 5 to TAZ 19 in AM peak for medium trucks is: {dist_dfs[1][1].at[4, 18]/10:} miles\n')
print(f'Example 2: Distance from TAZ 321 to TAZ 345 in mid-day for light trucks is: {dist_dfs[2][0].iloc[320, 344]/10:} miles,',
      '\n', f'          Travel time is: {time_dfs[2][0].iloc[320, 344]} minutes, and', '\n',
      f'          toll is: {toll_dfs[2][0].iloc[320, 344]} cents.\n')
print(f'Example 3: Fastest route between TAZ 3901 and TAZ 4233 uses expressway 125, which is a FasTrak toll road.',
      f'\n           Distance from TAZ 3901 to TAZ 4233 in mid-day for heavy trucks is: {dist_dfs[2][2].iloc[3900, 4232]/10:} miles,',
      '\n', f'          Travel time is: {time_dfs[2][2].iloc[3900, 4232]} minutes,\n',
      f'          Toll is: {toll_dfs[2][2].iloc[3900, 4232]} cents, and\n'
      f'           Generalized travel time is: {g_tt_dfs[2][2].iloc[3900, 4232]} minutes.')

#### Read Stop, Establishment, and Vehicle Data

In [None]:
# Read the survey data.
df_original = pd.read_excel(in_data_path, header=0, sheet_name=None)
# df_original is a dictionary of Dfs. Keys are sheetnames, and values are the dataframes in those worksheets.

In [None]:
# # If all we were to do was to read the stop data:
# df_stops = pd.read_excel(in_data_path, header=0, sheet_name='Trip Data')
# df_stops.head(2)

In [None]:
df_stops = df_original['Trip Data'].copy()
df_estab = df_original['Establishment Data'].copy()
df_veh = df_original['Vehicle Data'].copy()
df_stops.head()

In [None]:
df_estab.head()

In [None]:
df_veh.head()

#### Explore the Vehicle Dataframe

In [None]:
# 'vehicle_classification' codes according to data dictionary:
# 1: Passenger Car or Motorcycle
# 2: Pick-up Truck (4 wheels)
# 3: Van (Cargo/Minivan) (4 wheels)
# 4: Buses
# 5: Single Unit 2-axle
# 6: Single Unit 3-axle
# 7: Single Unit 4-axle
# 8: Semi (all Tractor-Trailer combinations)
# 96:Other (please specify)

In [None]:
df_veh['vehicle_classification'].unique()

In [None]:
if dataset == 'CV':
    print(df_veh['Vehicle Type'].unique())

In [None]:
if dataset == 'CV':
    pvt_axle_vtype = pd.pivot_table(df_veh, values='id', index='vehicle_classification', columns='Vehicle Type',
                             aggfunc='count') #, sort=True) For sort, default is True.
    print(pvt_axle_vtype)

In [None]:
# Conversion from 'vehicle_classification' code to vehicle size class.
# The TNC dataset has no 'Vehicle Type' column, so the size class is derived from the
# classification code to keep this script usable for both the CV and the TNC datasets.
# Codes 1-3 -> 'LCV', codes 4-7 -> 'SUT', code 8 -> 'MUT'; other codes (e.g. 96) are not mapped.
d1 = dict.fromkeys(range(1, 4), 'LCV')
d2 = dict.fromkeys(range(4, 8), 'SUT')
d3 = {8: 'MUT'}

dic_veh_size = {}
for size_group in (d1, d2, d3):
    dic_veh_size.update(size_group)
dic_veh_size

In [None]:
df_veh['veh_size'] = df_veh['vehicle_classification'].map(dic_veh_size)
df_veh[['veh_size', 'vehicle_classification']]

#### Clean the Data

In [None]:
# Columns to keep, in the order they should appear.
# (The lower/upper estimate weight factor columns are deliberately left out.)
cols = [
    'company_id', 'vehicle_id', 'driver_id', 'trip_number',
    'activity_type', 'placetype',
    'location_placename', 'location_address', 'location_longitude', 'location_latitude', 'taz',
    'arrival_time', 'departure_time',
    'cargo_pickup', 'cargo_delivery',
    'travel_date', 'participation_type',
    'Most Likely Estimate Weight Factor',
]

# Keep only those columns and order the stops by company, vehicle, date and trip number.
stop_sort_keys = ['company_id', 'vehicle_id', 'travel_date', 'trip_number']
df_stops = df_stops[cols].sort_values(by=stop_sort_keys, ascending=True)
df_stops.head()

#### Attach Vehicle Type Info

In [None]:
# Look up each stop's vehicle size class by vehicle id and store it as 'veh_type'.
veh_type_lookup = df_veh[['id', 'veh_size']].rename(columns={'veh_size': 'veh_type'})
df_stops = (
    df_stops
    .merge(veh_type_lookup, how='left', left_on='vehicle_id', right_on='id')
    .drop(columns='id')
)
df_stops.head()

#### Attach Establishment TAZ Info

In [None]:
# Use the CV establishment table for 'CV'; any other value of 'dataset' uses the TNC table.
if dataset == 'CV':
    lookup_estab = lookup_cv_estab
else:
    lookup_estab = lookup_tnc_estab

estab_taz_by_company = lookup_estab[['company_id', 'estab_taz']]
df_stops = df_stops.merge(estab_taz_by_company, how='left', on='company_id')
df_stops.head()

#### Update establishment TAZ info for stops that have a replacement establishment TAZ

In [None]:
# For CV stops whose (travel date, vehicle) pair appears in the replacement table, overwrite
# 'estab_taz' with the replacement TAZ. A dictionary lookup replaces the former scan of the whole
# replacement table for every stop row (same result, one pass over each table).
if dataset == 'CV':
    # (date, vehicle) -> replacement TAZ. setdefault keeps the FIRST row for a duplicated pair,
    # which is the row the original scan-and-break logic would have used.
    replacement_taz = {}
    for rep_date, rep_veh, new_taz in zip(lookup_cv_estab_replace['Date'],
                                          lookup_cv_estab_replace['Veh'],
                                          lookup_cv_estab_replace['NewEstab_TAZ']):
        # Missing keys never compare equal, so they can never match a stop.
        if pd.isna(rep_date) or pd.isna(rep_veh):
            continue
        replacement_taz.setdefault((rep_date, rep_veh), new_taz)

    for i, stop_date, stop_veh in zip(df_stops.index,
                                      df_stops['travel_date'],
                                      df_stops['vehicle_id']):
        if pd.isna(stop_date) or pd.isna(stop_veh):
            continue
        stop_key = (stop_date, stop_veh)
        if stop_key in replacement_taz:
            df_stops.at[i, 'estab_taz'] = replacement_taz[stop_key]

#### Create the Trip Dataframe Using the Stop Dataframe

In [None]:
df_temp = df_stops.copy()
rename_dic = {
    'activity_type':      'd_act',
    'placetype':          'd_place_type',
    'location_placename': 'd_place_name',
    'location_address':   'd_address',
    'location_longitude': 'd_lon',
    'location_latitude':  'd_lat',
    'taz':                'd_taz',
    'arrival_time':       'd_arr_time',
    'departure_time':     'next_trip_dep_time',
    'Most Likely Estimate Weight Factor': 'expnsn_factor'
}
df_temp.rename(columns=rename_dic, inplace=True)
df_temp.head()

In [None]:
# For TNCs, set missing expansion factors to 1
df_temp['expnsn_factor'] = df_temp['expnsn_factor'].fillna(1)

In [None]:
# Build 'd', a mapping of new column name -> initial value, for the columns that are added next.

# Columns that start out as 0.
cols_to_add_1 = [
    'o_act', 'o_place_type', 'o_lon', 'o_lat', 'o_taz', 'o_dep_time',
    'travel_time', 'stop_duration',
    'o_act_seg', 'o_plc_seg',
    'd_act_seg', 'd_plc_seg',
    'trip_dist', 'orgn_to_hq_dist',
    'hq_taz',
]
d1 = {col_name: 0 for col_name in cols_to_add_1}

# Columns that start out as an empty string.
cols_to_add_2 = [
    'o_place_name', 'o_address',
    'o_act_seg_name', 'o_plc_seg_name',
    'd_act_seg_name', 'd_plc_seg_name',
    'headquarters',
]
d2 = {col_name: "" for col_name in cols_to_add_2}

d = dict(d1)
d.update(d2)
d

In [None]:
# Add the new columns with them being initialized by the values of the above dictionary.
df_temp = df_temp.assign(**d)
df_temp.head()

In [None]:
# Rearrange.
cols = ['company_id', 'vehicle_id', 'driver_id', 'trip_number',
        'o_act_seg', 'o_act_seg_name', 'o_plc_seg', 'o_plc_seg_name',
        'd_act_seg', 'd_act_seg_name', 'd_plc_seg', 'd_plc_seg_name',
        'o_act', 'o_place_type', 'o_place_name', 'o_address', 'o_lon', 'o_lat', 'o_taz',
        'd_act', 'd_place_type', 'd_place_name', 'd_address', 'd_lon', 'd_lat', 'd_taz',
        'trip_dist', 'headquarters', 'hq_taz', 'orgn_to_hq_dist',
        'o_dep_time', 'd_arr_time', 'next_trip_dep_time', 'travel_time', 'stop_duration',
        'veh_type', 'cargo_pickup', 'cargo_delivery', 'travel_date',
        'participation_type', 'expnsn_factor', 'estab_taz'
       ]
df_temp = df_temp[cols]
df_temp.head()

In [None]:
# Add origin information from the previous row.
# For every record whose trip_number is not 0, each origin field (dictionary key) is filled with
# the value that the source field (dictionary value) has in the row directly above it.
# NOTE(review): shift(1) runs over the whole dataframe, not per vehicle, so this relies on the
# sort order and presumably on every vehicle/day starting with a trip_number 0 record - confirm.
dic_read_last = {
    'o_act':        'd_act',
    'o_place_type': 'd_place_type',
    'o_place_name': 'd_place_name',
    'o_address':    'd_address',
    'o_lon':        'd_lon',
    'o_lat':        'd_lat',
    'o_taz':        'd_taz',
    'o_dep_time':   'next_trip_dep_time',
    'travel_date':  'travel_date'   # travel_date itself is replaced by the previous row's date
}

for k, v in dic_read_last.items():
    df_temp.loc[df_temp['trip_number']!=0, k] = df_temp[v].shift(1)

# NOTE(review): this cast raises if any o_taz is missing after the shift (e.g. a first row with
# a non-zero trip_number, or a missing d_taz in the previous row).
df_temp['o_taz'] = df_temp['o_taz'].astype('int64')

df_temp.head()

In [None]:
# Remove the first record of each vehicle as it doesn't represent a trip, rather the
# initial origin of the vehicle, whose critical info has already been stored in the next record.
df_temp = df_temp.loc[df_temp['trip_number']!=0]
df_temp.head()

In [None]:
# Translate activity codes and place type codes into segment code/letter and segment name,
# for the origin ('o') and the destination ('d') of every trip.
# A code that is missing from dic_at / dic_pt raises KeyError.
for side in ('o', 'd'):
    act_codes = df_temp[f'{side}_act']
    df_temp[f'{side}_act_seg'] = act_codes.map(lambda code: dic_at[code][0])
    df_temp[f'{side}_act_seg_name'] = act_codes.map(lambda code: dic_at[code][1])

for side in ('o', 'd'):
    place_codes = df_temp[f'{side}_place_type']
    df_temp[f'{side}_plc_seg'] = place_codes.map(lambda code: dic_pt[code][0])
    df_temp[f'{side}_plc_seg_name'] = place_codes.map(lambda code: dic_pt[code][1])

df_temp.head()

In [None]:
# df_temp.loc[(df_temp['d_act_seg']=='B')&(df_temp['d_taz']!=df_temp['estab_taz'])].head(1000)

In [None]:
df_temp.groupby(['d_act_seg', 'd_act_seg_name']).size()

##### Fix Activity Type 'Base' If Stop's TAZ Is Not Equal to Establishment TAZ

In [None]:
# A stop coded as 'B' (base) whose TAZ differs from the establishment TAZ is reclassified
# according to its activity code:
#   activity 12      -> 'P' Goods_Pickup
#   activity 13      -> 'D' Goods_Delivery
#   anything else    -> 'M' Maintenance/Other
# The same rules are applied to the origin side first and then to the destination side.
off_base_rules = (
    (lambda act: act.isin([12]), 'P', 'Goods_Pickup'),
    (lambda act: act.isin([13]), 'D', 'Goods_Delivery'),
    (lambda act: ~act.isin([12, 13]), 'M', 'Maintenance/Other'),
)

for side in ('o', 'd'):
    seg_col, seg_name_col = f'{side}_act_seg', f'{side}_act_seg_name'
    for matches_rule, new_seg, new_seg_name in off_base_rules:
        # Recomputed for every rule, because the previous rule may have changed the segment.
        away_from_base = (df_temp[seg_col] == 'B') & (df_temp[f'{side}_taz'] != df_temp['estab_taz'])
        rows = away_from_base & matches_rule(df_temp[f'{side}_act'])
        df_temp.loc[rows, [seg_col, seg_name_col]] = new_seg, new_seg_name

In [None]:
# df_temp.loc[df_temp['d_taz']!=df_temp['estab_taz']].head(1000)

In [None]:
df_temp.info(memory_usage="deep")

##### Include the industry of establishments

In [None]:
# CV only: attach each establishment's industry code, translate it to an industry group name,
# and move the group name to the second column.
if dataset == 'CV':
    estab_industry = df_estab[['company_id', 'base_location_Industry Group']].rename(
        columns={'base_location_Industry Group': 'industry_code'})
    df_temp = df_temp.merge(estab_industry, how='left', on='company_id')

    industry_names = lookup_ind[['industry_code', 'industry_group']]
    df_temp = df_temp.merge(industry_names, how='left', on='industry_code')

    temp = df_temp.pop('industry_group')
    df_temp.insert(1, 'industry_group', temp)

In [None]:
def omit_spaces(s):
    """Return ``s`` with every space character removed (other whitespace is kept)."""
    pieces = s.split(' ')
    return ''.join(pieces)

# TNC only: attach the company name, standardize it, and derive the 'TNC_<category>' industry group.
if dataset == 'TNC':
    df_temp = df_temp.merge(df_estab[['company_id', 'company_name']], how='left', on='company_id')

    def _normalize_company_name(name):
        # Some company names have leading/trailing spaces, mixed case, or inner spaces.
        return omit_spaces(name.strip().lower())

    # na_action='ignore' leaves missing names (companies absent from df_estab after the left merge)
    # as NaN instead of raising a TypeError inside str.strip.
    df_temp['company_name'] = df_temp['company_name'].map(_normalize_company_name, na_action='ignore')
    df_temp['company_name'] = df_temp['company_name'].map(dic_tnc_names)

    # Figure out the industry groups; names without a category stay NaN.
    df_temp.insert(1, 'industry_group', "")
    df_temp['industry_group'] = df_temp['company_name'].map(dic_tnc_categories)

    df_temp['industry_group'] = 'TNC_' + df_temp['industry_group']

In [None]:
df_temp.head()

In [None]:
df_trips = df_temp.copy()
del df_temp
# gc.collect()

##### Clean the time fields

Glossary:<br>
|Field          | Meaning|
|:---------------|:--------------------|
|str_td | Travel Date as String | 
|str_o_dt|      Departure Time from the Origin as String|
|str_d_at |     Arrival Time at the Destination as String|
|str_nt_dt |    Next Trip Departure Time as String|
|o_dt       |   Departure Time from the Origin as TimeStamp|
|d_at        |  Arrival Time at the Destination as TimeStamp|
|nt_dt        | Next Trip Departure Time as TimeStamp|

In [None]:
print(df_trips['travel_date'].dtypes)
print(df_trips['o_dep_time'].dtypes)
print(df_trips['d_arr_time'].dtypes)
print(df_trips['next_trip_dep_time'].dtypes)

In [None]:
df_trips['str_td'] = df_trips['travel_date'].dt.strftime('%Y-%m-%d')    # str_td = travel date as string
df_trips.head()

In [None]:
df_trips['o_dep_time'].describe()

In [None]:
# CV only: build string versions of the three time columns and report their min/max lengths.
# Only 'str_d_at' is modified here: it is cut down to its last 8 characters.
if dataset == 'CV':
    df_trips['str_o_dt'] =  df_trips['o_dep_time'].astype(str)
    print(f"Max length of column str_o_dt: {max(df_trips['str_o_dt'].str.len())}")
    print(f"Min length of column str_o_dt: {min(df_trips['str_o_dt'].str.len())}\n")

    df_trips['str_d_at'] =  df_trips['d_arr_time'].astype(str)
    print(f"Max length of column str_d_at: {max(df_trips['str_d_at'].str.len())}")
    print(f"Min length of column str_d_at: {min(df_trips['str_d_at'].str.len())}")
    # NOTE(review): the next line is a bare expression inside an 'if' block; its result is
    # neither stored nor displayed.
    df_trips.loc[df_trips['str_d_at'].str.len()>8]
    # Keep the last 8 characters of the arrival time string.
    df_trips['str_d_at'] =df_trips['str_d_at'].str[-8:]
    print(f"Max length of column str_d_at: {max(df_trips['str_d_at'].str.len())}, after cleaning")
    print(f"Min length of column str_d_at: {min(df_trips['str_d_at'].str.len())}, after cleaning\n")

    df_trips['str_nt_dt'] =  df_trips['next_trip_dep_time'].astype(str)
    print(f"Max length of column str_nt_dt: {max(df_trips['str_nt_dt'].str.len())}")
    print(f"Min length of column str_nt_dt: {min(df_trips['str_nt_dt'].str.len())}")
    # NOTE(review): bare expression as above - has no effect when the cell runs.
    df_trips.loc[df_trips['str_nt_dt'].str.len()<8].head(2)

In [None]:
def _drop_placeholder_date(raw_times):
    """Return ``raw_times`` as strings, without a leading '1900-01-01 ' placeholder date.

    Values whose first 10 characters are '1900-01-01' lose their first 11 characters;
    all other values are only converted to str.
    """
    as_text = raw_times.astype(str)
    return as_text.apply(lambda x: x[11:] if x[:10] == '1900-01-01' else x)


# TNC only: build clean 8-character time strings for the three time columns.
# The former copies of this logic also contained bare filter expressions with no effect;
# those were dropped and the shared steps were folded into one loop and one helper.
if dataset == 'TNC':
    # Origin departure and destination arrival: every value must end up 8 characters long.
    for str_col, raw_col, label, tail in (('str_o_dt', 'o_dep_time', 'o_dt', '\n\n'),
                                          ('str_d_at', 'd_arr_time', 'd_at', '')):
        df_trips[str_col] = _drop_placeholder_date(df_trips[raw_col])
        # Anything not 8 characters long loses its last 7 characters
        # (presumably a fractional-seconds suffix - confirm against the raw data).
        not_8_long = df_trips[str_col].str.len() != 8
        df_trips.loc[not_8_long, str_col] = df_trips[str_col].str[:-7]
        n_clean = len(df_trips.loc[df_trips[str_col].str.len() == 8])
        print(f"\n'{label}' data are now clean: " +
              f"{n_clean == len(df_trips)}{tail}")

    # Next-trip departure: values may also be missing (the 3-character string 'nan'),
    # so only 15-character values are trimmed.
    df_trips['str_nt_dt'] = _drop_placeholder_date(df_trips['next_trip_dep_time'])
    is_15_long = df_trips['str_nt_dt'].str.len() == 15
    df_trips.loc[is_15_long, 'str_nt_dt'] = df_trips['str_nt_dt'].str[:-7]
    actual_times = len(df_trips.loc[df_trips['str_nt_dt'].str.len()==8])
    nans = len(df_trips.loc[df_trips['str_nt_dt'].str.len()==3])
    print(f"\n'nt_dt' data are now clean: " +
          f"{actual_times+nans==len(df_trips)}\n")
    print(f'Number of recorded times: {actual_times}')
    print(f'Number of NANs: {nans}')
    print(f'Total trips in the dataframe: {len(df_trips)}')

In [None]:
# Origin departure timestamp = travel date string + ' ' + departure time string.
o_dt_text = df_trips['str_td'] + ' ' + df_trips['str_o_dt']
df_trips['o_dt'] = pd.to_datetime(o_dt_text)
print(type(df_trips['o_dt']))
print(type(df_trips['o_dt'][0]))
df_trips.head()

In [None]:
# Destination arrival timestamp = travel date string + ' ' + arrival time string.
d_at_text = df_trips['str_td'] + ' ' + df_trips['str_d_at']
df_trips['d_at'] = pd.to_datetime(d_at_text)
print(type(df_trips['d_at']))
print(type(df_trips['d_at'][0]))
print(f"Number of NAs in this column: {df_trips['d_at'].isna().sum()}")
df_trips.head()

In [None]:
# Next-trip departure timestamp; strings that cannot be parsed become NaT (errors='coerce').
nt_dt_text = df_trips['str_td'] + ' ' + df_trips['str_nt_dt']
df_trips['nt_dt'] = pd.to_datetime(nt_dt_text, errors='coerce')
print(type(df_trips['nt_dt']))
print(type(df_trips['nt_dt'][0]))
print(f"Number of NaTs in this column: {df_trips['nt_dt'].isna().sum()}")
df_trips.head()

In [None]:
# Some tests:
print(df_trips['d_at'][0] - df_trips['o_dt'][0])
print(df_trips['d_at'][0] > df_trips['o_dt'][0])
print(df_trips['nt_dt'][3] - df_trips['d_at'][3])
print(df_trips['nt_dt'][0] + pd.to_timedelta(1, unit='D'))

In [None]:
# Ensure events are in chronological order: departure <= arrival <= next departure.
# An event that falls before the one preceding it is moved forward by one day.
# The number of offending rows is printed before and after each correction.
one_day = pd.to_timedelta(1, unit='days')

arrives_before_departing = df_trips['d_at'] < df_trips['o_dt']
print(len(df_trips.loc[arrives_before_departing]))
df_trips.loc[arrives_before_departing, 'd_at'] = df_trips['d_at'] + one_day
print(len(df_trips.loc[df_trips['d_at'] < df_trips['o_dt']]), '\n')

leaves_before_arriving = df_trips['nt_dt'] < df_trips['d_at']
print(len(df_trips.loc[leaves_before_arriving]))
df_trips.loc[leaves_before_arriving, 'nt_dt'] = df_trips['nt_dt'] + one_day
print(len(df_trips.loc[df_trips['nt_dt'] < df_trips['d_at']]))

In [None]:
# Find travel time and stop duration in minutes.
# travel_time   = arrival - origin departure; the minute value is truncated by astype(int).
# stop_duration = next departure - arrival; the minute value is rounded.
# NOTE(review): '.dt.seconds' is only the seconds component of each timedelta (the days part is
# ignored), so a gap of a day or more, or a negative gap, is not reflected in the result.
# '.dt.total_seconds()' would be needed for that - confirm whether such rows can occur.
# df_trips['travel_time'] = ((df_trips['d_at'] - df_trips['o_dt']).dt.seconds).astype(int)
# df_trips['stop_duration'] = ((df_trips['nt_dt'] - df_trips['d_at']).dt.seconds).round().astype('Int32') #, errors='ignore'
df_trips['travel_time'] = (((df_trips['d_at'] - df_trips['o_dt']).dt.seconds) / 60).astype(int)
df_trips['stop_duration'] = (((df_trips['nt_dt'] - df_trips['d_at']).dt.seconds) / 60).round().astype('Int32') #, errors='ignore'
# Note the capitalized 'Int32' in the line above - as opposed to 'int32'.
# The nullable 'Int32' dtype is used so that the column can hold <NA> where nt_dt is missing.
# print(f"Now, the datatype for the 'stop_duration' column is kind of weird: {df_trips['stop_duration'].dtypes}, rather than simply int32.")
# print('We need to live with this because we want to allow <NA> values in this column.')
df_trips.head()

In [None]:
# Drop the raw and string time columns now that the timestamp columns exist.
rmv_fields = [
    'o_dep_time', 'd_arr_time', 'next_trip_dep_time',
    'str_td', 'str_o_dt', 'str_d_at', 'str_nt_dt',
]
all_fields = df_trips.columns.to_list()
keep_cols = list(filter(lambda field: field not in rmv_fields, all_fields))

df_trips = df_trips[keep_cols]
df_trips.head()

##### Mark the last trips

In [None]:
# A trip is the last of its sequence when the following row has trip_number 1;
# the final row of the dataframe is always marked as a last trip.
following_trip_number = df_trips['trip_number'].shift(-1)
df_trips['last_trip'] = following_trip_number == 1
df_trips.at[df_trips.index[-1], 'last_trip'] = True
df_trips.tail()
df_trips.head(50)

##### Find the trip distance

In [None]:
df_trips['tod'] = ""

In [None]:
# Indicate at what time of day the trip has started.
def find_tod(timestamp):
    """Return the time-of-day label for the clock time of ``timestamp``.

    Periods (start inclusive, end exclusive):
        'EA' 3AM-6AM, 'AM' 6AM-9AM, 'MD' 9AM-3:30PM, 'PM' 3:30PM-7PM,
        'EV' 7PM-3AM (both the late evening and the hours after midnight).
    """
    clock = timestamp.time()
    # (end of period, label), in chronological order starting at midnight.
    period_ends = (
        (dt.time(3), 'EV'),
        (dt.time(6), 'EA'),
        (dt.time(9), 'AM'),
        (dt.time(15, 30), 'MD'),
        (dt.time(19), 'PM'),
    )
    for period_end, label in period_ends:
        if clock < period_end:
            return label
    # 7PM or later.
    return 'EV'

df_trips['tod'] = df_trips['o_dt'].apply(find_tod)
# df_trips[['o_dt', 'tod']].loc[(df_trips['o_dt'].dt.time>=dt.time(0)) & (df_trips['o_dt'].dt.time<dt.time(3))].head(10)
# df_trips[['o_dt', 'tod']].loc[(df_trips['o_dt'].dt.time>=dt.time(3)) & (df_trips['o_dt'].dt.time<dt.time(6))].head(10)
# df_trips[['o_dt', 'tod']].loc[(df_trips['o_dt'].dt.time>=dt.time(6)) & (df_trips['o_dt'].dt.time<dt.time(9))].head(10)
# df_trips[['o_dt', 'tod']].loc[(df_trips['o_dt'].dt.time>=dt.time(9)) & (df_trips['o_dt'].dt.time<dt.time(15, 30))].head(10)
# df_trips[['o_dt', 'tod']].loc[(df_trips['o_dt'].dt.time>=dt.time(15,30)) & (df_trips['o_dt'].dt.time<dt.time(19))].head(10)
df_trips[['o_dt', 'tod']].loc[(df_trips['o_dt'].dt.time>=dt.time(19)) & (df_trips['o_dt'].dt.time<dt.time(23, 59, 59, 999999))].head(10)

In [None]:
print(f'Example: Distance from TAZ 5 to TAZ 19 in AM peak for medium trucks is: {dist_dfs[1][1].at[4,18]/10:} miles')

In [None]:
# Column index (vehicle size) and row index (time of day) into the skim grids such as dist_dfs.
# NOTE: 'dic_tod' is re-bound here; the cells that set up and load the skims use an earlier
# 'dic_tod' with number -> label entries, so those cells cannot be re-run after this one.
dic_v = {'LCV': 0, 'SUT': 1, 'MUT': 2}
dic_tod = {'EA': 0, 'AM': 1, 'MD':2, 'PM':3, 'EV':4}

def find_dist(o_taz, d_taz, vs, tod): # vs is vehicle size and tod is time of day.
    if o_taz==-1 or d_taz==-1: return None
    return dist_dfs[tod][vs].at[o_taz-1, d_taz-1] / 10

# Look up the skim distance of every trip from its origin/destination TAZ, its vehicle type
# (mapped through dic_v) and its departure period (mapped through dic_tod).
# find_dist returns None when either TAZ is -1, so those trips end up without a distance.
df_trips['trip_dist'] = df_trips.apply(lambda x: find_dist(x['o_taz'], x['d_taz'],
                                                           dic_v[x['veh_type']],
                                                           dic_tod[x['tod']]), axis=1)

df_trips.head()

In [None]:
df_trips[['o_taz','d_taz','veh_type','tod']].head()

##### Find the trip generalized travel time from the establishment base to the trip destination

In [None]:
print(f'Example: Generalized travel time from TAZ 3901 to TAZ 4233 in mid-day for heavy trucks is: {g_tt_dfs[2][2].iloc[3900, 4232]} minutes.')

In [None]:
# Attach the toll and the generalized travel time from the establishment TAZ to each trip destination.
dic_v = {'LCV': 0, 'SUT': 1, 'MUT': 2}
for i, row in df_trips.iterrows():
    base_taz, dest_taz = row['estab_taz'], row['d_taz']
    # Trips with a -1 TAZ on either end are skipped and keep missing values.
    if -1 in (base_taz, dest_taz):
        continue
    veh_idx = dic_v[row['veh_type']]
    # Period index 2 is used for every trip, whatever the trip's own 'tod'
    # (the example print in this notebook describes g_tt_dfs[2] as mid-day).
    df_trips.at[i, 'toll_in_cents_from_base'] = toll_dfs[2][veh_idx].at[base_taz - 1, dest_taz - 1]
    df_trips.at[i, 'gen_tt_from_base'] = g_tt_dfs[2][veh_idx].at[base_taz - 1, dest_taz - 1]

In [None]:
# Save the trip table, now carrying the toll and generalized travel time columns.
out_file = f'Generalized TT for {dataset}.xlsx'
out_dir = os.path.join(project_path, out_data_dir)
# to_excel does not create missing folders; make sure the output folder exists before writing.
os.makedirs(out_dir, exist_ok=True)
out_path = os.path.join(out_dir, out_file)
df_trips.to_excel(out_path, engine='openpyxl')

##### Calculate average generalized travel time by industry by destination activity

In [None]:
print(len(df_trips.loc[df_trips['gen_tt_from_base'].isnull()]))
df_trips.loc[df_trips['gen_tt_from_base'].isnull()].head(2)

In [None]:
# Keep trips that have a generalized travel time and end at a pickup, delivery or service stop.
cols = ['industry_group', 'd_act_seg', 'd_plc_seg_name', 'veh_type', 'expnsn_factor', 'gen_tt_from_base']
keep_rows = df_trips['gen_tt_from_base'].notnull() & df_trips['d_act_seg'].isin(['P', 'D', 'S'])
df_gtt = df_trips.loc[keep_rows, cols].rename(columns={'d_act_seg': 'purpose'})

# Collapse the activity code into a purpose: 'S' is Service, the remaining codes ('P', 'D') are Goods.
df_gtt['purpose'] = np.where(df_gtt['purpose'] == 'S', 'Service', 'Goods')
# df_gtt.insert(4, 'count', 1)

# The customer is a Resident when the destination place is Residential, otherwise a Business.
customer = np.where(df_gtt['d_plc_seg_name'] == 'Residential', 'Resident', 'Business')
df_gtt.insert(3, 'customer', customer)
df_gtt = df_gtt.drop(columns='d_plc_seg_name')
df_gtt.head(2)

###### Unweighted generalized travel time

In [None]:
# Number of sampled trips in each industry / purpose / customer / vehicle type cell.
df_gtt_unw1 = (
    df_gtt.groupby(['industry_group', 'purpose', 'customer', 'veh_type'])
    .size()
    .reset_index(name='sample_size')
)
df_gtt_unw1.head()

In [None]:
# Plain (unweighted) mean generalized travel time of the same cells.
gtt_by_cell = df_gtt.groupby(['industry_group', 'purpose', 'customer', 'veh_type'])['gen_tt_from_base']
df_gtt_unw2 = gtt_by_cell.mean().reset_index()
df_gtt_unw2.head()

In [None]:
# Join the cell sizes with the cell means; 'product' is sample_size * mean, i.e. the cell's total
# travel time, used afterwards to average across vehicle types.
df_gtt_unw = df_gtt_unw1.merge(
    df_gtt_unw2, on=['industry_group', 'purpose', 'customer', 'veh_type']
).dropna()
df_gtt_unw['product'] = df_gtt_unw['sample_size'].mul(df_gtt_unw['gen_tt_from_base'])
df_gtt_unw.head()

In [None]:
# Average over vehicle types: summed cell totals divided by summed sample sizes per segment.
by_segment = df_gtt_unw.groupby(['industry_group', 'purpose', 'customer'])
s = by_segment['sample_size'].sum()
df_avg_gtt_unw = (
    by_segment['product'].sum()
    .div(s)
    .rename('avg_gen_tt_unweighted')
    .reset_index()
)
# Put the segment sample size in front of its average.
df_avg_gtt_unw = s.reset_index().merge(df_avg_gtt_unw, on=['industry_group', 'purpose', 'customer'])
df_avg_gtt_unw.head()

In [None]:
# Write the unweighted segment averages to the output folder.
out_file = f'Generalized TT for {dataset} Segments, Unweighted.xlsx'
out_path = os.path.join(os.path.join(project_path, out_data_dir), out_file)
df_avg_gtt_unw.to_excel(excel_writer=out_path, engine='openpyxl')

###### Weighted generalized travel time

In [None]:
# Work on a copy that also carries each trip's travel time scaled by its expansion factor.
df_gtt2 = df_gtt.assign(expanded_tt=df_gtt['expnsn_factor'] * df_gtt['gen_tt_from_base'])
df_gtt2.head(1)

In [None]:
# Number of sampled trips per industry / purpose / customer / vehicle type cell.
df_gtt2_wei0 = (
    df_gtt2.groupby(['industry_group', 'purpose', 'customer', 'veh_type'])
    .size()
    .reset_index(name='sample_size')
)
df_gtt2_wei0.head()

In [None]:
# Sum of expansion factors per cell, stored as 'pop_size'.
df_gtt2_wei1 = (
    df_gtt2.groupby(['industry_group', 'purpose', 'customer', 'veh_type'])['expnsn_factor']
    .sum()
    .reset_index(name='pop_size')
)
df_gtt2_wei1.head()

In [None]:
# Sum of expansion-weighted travel times per cell.
expanded_by_cell = df_gtt2.groupby(['industry_group', 'purpose', 'customer', 'veh_type'])['expanded_tt']
df_gtt2_wei2 = expanded_by_cell.sum().reset_index()
df_gtt2_wei2.head()

In [None]:
# Bring sample size, population size and expanded travel time of each cell into one table.
df_gtt2_wei = (
    df_gtt2_wei0
    .merge(df_gtt2_wei1, on=['industry_group', 'purpose', 'customer', 'veh_type'])
    .merge(df_gtt2_wei2, on=['industry_group', 'purpose', 'customer', 'veh_type'])
    .dropna()
)
# df_gtt2_wei['product'] = df_gtt2_wei['pop_size'] * df_gtt2_wei['expanded_tt']
df_gtt2_wei.head()

In [None]:
# Weighted average per segment: summed expanded travel time divided by summed expansion factors.
seg_keys = ['industry_group', 'purpose', 'customer']
sz = df_gtt2_wei.groupby(seg_keys)['sample_size'].sum()
s = df_gtt2_wei.groupby(seg_keys)['pop_size'].sum()
df_avg_gtt2_wei = ((df_gtt2_wei.groupby(seg_keys)['expanded_tt'].sum() / s)
                   .rename('avg_gen_tt_weighted')).reset_index()

# Put sample size and population size in front of the average.
# Both sides are flattened before merging, so no extra reset_index() is applied to the merged
# table; doing so would copy its row numbers into a meaningless 'index' column of the output.
s = sz.reset_index().merge(s.reset_index(), on=seg_keys)
df_avg_gtt2_wei = s.merge(df_avg_gtt2_wei, on=seg_keys)
df_avg_gtt2_wei.head()

In [None]:
# Write the weighted segment averages to the output folder.
out_file = f'Generalized TT for {dataset} Segments, Weighted.xlsx'
out_path = os.path.join(os.path.join(project_path, out_data_dir), out_file)
df_avg_gtt2_wei.to_excel(excel_writer=out_path, engine='openpyxl')

#### Create Summaries from the Trip Dataframe

In [None]:
# Only stops with a positive duration enter the averages.
has_stop = df_trips['stop_duration'] > 0

# Helper column for the weighted mean: stop duration times expansion factor.
# It is removed from df_trips again further down in the notebook.
df_trips.loc[has_stop, 'dur_by_weight'] = df_trips['stop_duration'] * df_trips['expnsn_factor']

stops_by_seg = df_trips[has_stop].groupby(['d_act_seg_name', 'industry_group'])

# Unweighted:
AvgStop1 = stops_by_seg.size().rename('count')
AvgStop2 = stops_by_seg['stop_duration'].mean().rename('stop_duration_minutes').round()

# Weighted:
AvgStop1w = stops_by_seg['expnsn_factor'].sum().rename('weighted_count')
weighted_mean = stops_by_seg['dur_by_weight'].sum() / stops_by_seg['expnsn_factor'].sum()
AvgStop2w = weighted_mean.rename('stop_duration_minutes').round()

In [None]:
# df_trips[df_trips['stop_duration']>0][['stop_duration','expnsn_factor']].sort_values(['expnsn_factor'], ascending=True)

In [None]:
# Place the count next to the mean duration, for the unweighted and the weighted summary.
AvgStop = pd.concat([AvgStop1, AvgStop2], axis='columns')
AvgStop_w = pd.concat([AvgStop1w, AvgStop2w], axis='columns')
AvgStop
AvgStop_w

In [None]:
# Cross-tabulate industry group against destination activity:
# trip counts (unweighted) and summed expansion factors (weighted).
i = 'industry_group'
pvt_ind_act = df_trips.pivot_table(
    values='company_id', index=i, columns='d_act_seg_name', aggfunc='count'
).fillna(0)  # sort defaults to True
pvt_ind_act_w = df_trips.pivot_table(
    values='expnsn_factor', index=i, columns='d_act_seg_name', aggfunc='sum'
).fillna(0).round()  # sort defaults to True
pvt_ind_act
pvt_ind_act_w

In [None]:
# Cross-tabulate destination activity against destination place:
# trip counts (unweighted) and summed expansion factors (weighted).
pvt_act_plc = df_trips.pivot_table(
    values='company_id', index='d_act_seg_name', columns='d_plc_seg_name', aggfunc='count'
).fillna(0)  # sort defaults to True
pvt_act_plc_w = df_trips.pivot_table(
    values='expnsn_factor', index='d_act_seg_name', columns='d_plc_seg_name', aggfunc='sum'
).fillna(0).round()  # sort defaults to True
pvt_act_plc
# pvt_act_plc_w

In [None]:
# Open the workbook that will receive the stop-duration summaries.
out_file = dataset + f'_StopDuration_{current_date}.xlsx'
out_path = os.path.join(project_path, out_data_dir)
# exist_ok=True creates the folder when missing and is a no-op otherwise, without a separate
# existence check.
os.makedirs(out_path, exist_ok=True)
out_path = os.path.join(project_path, out_data_dir, out_file)
xl_writer = pd.ExcelWriter(out_path, engine='openpyxl')

In [None]:
# Sheet names of the stop-duration workbook (the '_w' variants hold the weighted tables).
sn1 = 'StopDur_byAct&Ind'
sn2 = 'StopDur_byAct&Ind_w'
sn3 = 'Ind-DesAct'
sn4 = 'Ind-DesAct_w'
sn5 = 'DesAct-DesPlc'
sn6 = 'DesAct-DesPlc_w'

# Leaving the `with` block closes the writer, so the file handle is released even when one
# of the writes raises.
with xl_writer:
    AvgStop.to_excel(xl_writer, sheet_name=sn1) # , index_label='index'
    AvgStop_w.to_excel(xl_writer, sheet_name=sn2) # , index_label='index'
    # The pivot-table sheets are currently disabled:
    # pvt_ind_act.to_excel(xl_writer, sheet_name=sn3) # , index_label='index'
    # pvt_ind_act_w.to_excel(xl_writer, sheet_name=sn4) # , index_label='index'
    # pvt_act_plc.to_excel(xl_writer, sheet_name=sn5) # , index_label='index'
    # pvt_act_plc_w.to_excel(xl_writer, sheet_name=sn6) # , index_label='index'

# Remove the helper column of the weighted averages. errors='ignore' makes this a no-op when
# the column is already gone (e.g. the cell is run twice) without hiding any other failure.
df_trips.drop(columns='dur_by_weight', inplace=True, errors='ignore')

#### Start Creating the Routes

In [None]:
# Columns of the route dataframe, listed group by group.
cols = (
    ['route_id', 'industry_group', 'veh_type']
    # Stop counts by super activity: g = goods, s = services, m = maintenance/other, b = base, h = home.
    + ['trip_count', 'g_stops', 's_stops', 'm_stops', 'b_stops', 'h_stops']
    # Primary purpose: Goods, Services, Maintenance/Other.
    # Customer type: Residential, Non-residential, Mixed.
    + ['primary_purp', 'customer_type']
    # start_tod = first departure of the route; end_tod = last arrival of the route.
    # route_dur_hr = duration of the route based on its start and end times.
    # cumlv_dur = duration of the route based on trip and stay durations.
    # durations_match = True if the durations from the two methods match; else False.
    + ['start_tod', 'end_tod', 'route_dur_hr', 'cumlv_dur', 'durations_match']
    # Activity at the origin of the first trip / at the destination of the last trip.
    + ['start_activity', 'end_activity']
    # Starting/ending place (Base, Home, Warehouse/DC (distribution center)/Transport Node, Other).
    + ['start_plc_seg', 'end_plc_seg']
    + ['act_seg_seq', 'plc_seg_seq']
    # headquarters = primary point of return, most of the time the base; hq_taz = its TAZ.
    + ['headquarters', 'hq_taz']
    + ['tot_distance']
    + ['company_id', 'vehicle_id', 'driver_id']
    + ['participation_type', 'expnsn_factor', 'estab_taz', 'trips']
)

# TNC surveys carry a company name as well; it goes right before the trailing 'trips' column.
if dataset == 'TNC':
    cols.insert(-1, 'company_name')

df_routes = pd.DataFrame(columns=cols)

In [None]:
# Build the route dataframe: one row per route, assembled from consecutive trip rows.
# A route starts at a row with trip_number == 1 and ends at the row flagged 'last_trip'.
# NOTE(review): the loop relies on the first row of df_trips having trip_number == 1;
# otherwise the per-route variables below are not initialized yet.

# Route columns that also exist in the trip dataframe. This is computed once, instead of
# rebuilding the trip column list for every column of every route.
trip_col_names = set(df_trips.columns)
shared_cols = [c for c in cols if c in trip_col_names]

# Completed single-row route frames. They are concatenated once after the loop, because
# growing df_routes with pd.concat on every route re-copies all earlier routes each time.
route_frames = []

# Iterate through the trip dataframe.
for i, row in df_trips.iterrows():

    # If this trip is the first trip of the route, initialize a route for it.
    if row['trip_number'] == 1:
        df_temp_rt = pd.DataFrame(columns=cols, index=[0])

        # Generate a route ID: travel date as yyyymmdd, times 10000, plus the vehicle ID.
        date = str(df_trips.at[i, 'travel_date'])[:10]
        date = date.replace('-', '')
        date = int(date) * 10000
        df_temp_rt['route_id'] = date + row['vehicle_id']

        # Copy common attributes from the trip row to the temporary route dataframe.
        for c in shared_cols:
            df_temp_rt[c] = row[c]

        # Initialize route variables. Both sequences start with the route origin.
        trips = []
        activities = [row['o_act_seg']]
        places = [row['o_plc_seg']]

        # Set route variables that are known.
        df_temp_rt['start_activity'] = row['o_act_seg']
        df_temp_rt['start_plc_seg'] = dic_pt2[row['o_plc_seg']]
        df_temp_rt['start_tod'] = row['o_dt']

    # Initialize a trip and store the current row values into its attributes.
    # Every attribute set in Trip.__init__ (except 'index') must be a column of df_trips.
    t = Trip()
    t.index = i
    for attr in list(vars(t).keys()):
        if attr == 'index': continue
        setattr(t, attr, row[attr])

    trips.append(t)
    # Add trip info to the route variables.
    activities.append(row['d_act_seg'])
    places.append(row['d_plc_seg'])

    # Finalize the route if this is the last trip of the route.
    if row['last_trip']:
        df_temp_rt.at[0, 'act_seg_seq'] = activities
        df_temp_rt.at[0, 'plc_seg_seq'] = places
        df_temp_rt.at[0, 'trips'] = trips

        # Set route variables that are known.
        df_temp_rt['end_activity'] = row['d_act_seg']
        df_temp_rt['end_plc_seg'] = dic_pt2[row['d_plc_seg']]
        df_temp_rt['end_tod'] = row['d_at']

        # Keep the completed route; it is attached to the route dataframe after the loop.
        route_frames.append(df_temp_rt)

# Attach all completed routes to the route dataframe in a single pass.
df_routes = pd.concat([df_routes] + route_frames, axis=0, ignore_index=True)
df_routes.tail()

In [None]:
for i, row in df_routes.iterrows():
    # Stops are the activities after the first entry, which is the activity at the route origin.
    counts = collections.Counter(row['act_seg_seq'][1:])
    goods_stops = counts['P'] + counts['D']

    # Number of trips and number of stops by type of stop.
    df_routes.at[i, 'trip_count'] = len(row['trips'])
    df_routes.at[i, 'g_stops'] = goods_stops
    df_routes.at[i, 's_stops'] = counts['S']
    df_routes.at[i, 'm_stops'] = counts['M']
    df_routes.at[i, 'b_stops'] = counts['B']
    df_routes.at[i, 'h_stops'] = counts['H']

    # Primary purpose: any pickup/delivery stop makes it a goods route, otherwise any service
    # stop makes it a service route, otherwise it is maintenance/other.
    if goods_stops > 0:
        purp = 'Goods'
    elif counts['S'] > 0:
        purp = 'Service'
    else:
        purp = 'Maintenance/Other'
    df_routes.at[i, 'primary_purp'] = purp

    # Total distance traveled on the route: the sum of its trip distances.
    df_routes.at[i, 'tot_distance'] = sum(t.trip_dist for t in row['trips'])

df_routes.head()

In [None]:
# Print every attribute of each trip of the first route, one (name, value) pair per line.
for i, t in enumerate(df_routes.at[0, 'trips']):
    print(f'\nTrip #{i+1}:')
    for e in vars(t).items():
        print(e)

In [None]:
# Determine customer type.

# Customer types at stops (only applies to routes with Goods and/or Service purposes):
# a. Residential Only (households, including multi-family buildings)
# b. Non-residential Only (commercial, public/government)
# c. Mixed Residential and Non-residential
dic_cstmr_typ = {
    'ro' : 'Residential Only',
    'nro': 'Non-Residential Only',
    'm'  : 'Mixed Residential and Non-residential',
    'nc' : 'No Customer'
}

# Place segment codes (plc_seg_code: plc_seg_name):
# 1: Residential
# 2: Office
# 3: Warehouse
# 4: Other   A closer look at the trip table indicated most of these places are non-residential.
# 5: Retail and Restaurant
# 6: Gas
# 7: Industrial, Agriculture, or Construction
# 8: Truck Terminal or Parking

lst_res = [1]
lst_non_res = [2, 3, 4, 5, 6, 7, 8]


def identify_customer(lrvp):  # lrvp is the list of relevant, visited places.
    """Return the customer-type key (a key of dic_cstmr_typ) for the visited places.

    'nc'  - the list is empty: no customer served, no goods picked up or delivered;
    'ro'  - every place is in lst_res;
    'm'   - otherwise, at least one place is not in lst_non_res;
    'nro' - otherwise, i.e. every place is in lst_non_res.
    """
    if not lrvp:
        return 'nc'
    if all(plc in lst_res for plc in lrvp):
        return 'ro'
    # A place outside the non-residential list (a residential code, or a code that is in
    # neither list) makes the route mixed.
    if any(plc not in lst_non_res for plc in lrvp):
        return 'm'
    return 'nro'



# Classify the customers of each route from the places where goods or services were handled.
for i, route in df_routes.iterrows():
    acts = route['act_seg_seq']
    plcs = route['plc_seg_seq']
    # Keep only the places whose activity is a service ('S'), pickup ('P') or delivery ('D').
    plcs_cleaned = [plcs[j] for j, act in enumerate(acts) if act in ['S', 'P', 'D']]
    df_routes.at[i, 'customer_type'] = dic_cstmr_typ[identify_customer(plcs_cleaned)]

In [None]:
df_routes[['act_seg_seq', 'plc_seg_seq', 'customer_type']]

In [None]:
# Check how many of the routes go outside of the SANDAG TAZs: those are the routes whose
# total distance is missing.
a = len(df_routes)
b = int(df_routes['tot_distance'].isnull().sum())
print(f'{b} routes out of {a:,} routes have trips that either start or end in an external TAZ.')
print("'tot_distance' field of these routes has been marked as 'NA'.")

In [None]:
# Determine the point of return of the route: base ('B'), home ('H'), or 'Unknown'.
for i, row in df_routes.iterrows():
    counts_act = collections.Counter(row['act_seg_seq'])
    n_base, n_home = counts_act['B'], counts_act['H']
    if n_home > n_base:
        hq = 'H'
    elif n_base > 0:
        # Base appears at least as often as home (ties go to base).
        hq = 'B'
    else:
        # Neither base nor home appears in the activity sequence.
        # (A warehouse / truck-terminal fallback based on plc_seg_seq is not in use.)
        hq = 'Unknown'
    df_routes.at[i, 'headquarters'] = hq
df_routes.groupby('headquarters').size().reset_index(name='count')

In [None]:
df_routes.loc[df_routes['headquarters']=='Unknown'].head()

In [None]:
def find_hq_taz(hq, lst_act, lst_trips):
    """Return the TAZ where the route first reaches its headquarters activity.

    hq        -- headquarters code of the route: 'B', 'H', or anything else for unknown.
    lst_act   -- activity sequence of the route; entry 0 is the activity at the route origin
                 and entry k (k >= 1) the activity at the destination of trip k-1.
    lst_trips -- trip objects of the route, exposing `o_taz` and `d_taz`.

    Returns -1 when `hq` is neither 'B' nor 'H'. Raises ValueError when `hq` is 'B' or 'H'
    but does not occur in `lst_act`.
    """
    # Exact membership test. The former `hq in 'BH'` was a substring test, which also let ''
    # and 'BH' through and raised TypeError for non-string values such as NaN.
    if hq not in ('B', 'H'):
        return -1
    pos = lst_act.index(hq)
    if pos == 0:
        # Headquarters is the route origin: use the origin of the first trip.
        return lst_trips[0].o_taz
    # Otherwise it is the destination of the trip that arrives there.
    return lst_trips[pos - 1].d_taz

# Resolve the headquarters TAZ of every route from its headquarters code, its activity
# sequence and its trip objects.
df_routes['hq_taz'] = df_routes.apply(lambda x: find_hq_taz(x.headquarters, x.act_seg_seq, x.trips), axis=1)
df_routes.head()

##### Now that headquarters of routes are known, go back to df_trips and update distance to headquarters for each trip.

In [None]:
# Create a dictionary that gets route_id and returns hq_taz.
dic_rid_hq_taz = dict(zip(df_routes.route_id, df_routes.hq_taz))

# Specify hq_taz of trips in the trips dataframe.
def omit_dashes(s):
    """Return `s` with every '-' removed (e.g. '2022-03-15' -> '20220315')."""
    return s.replace('-', '')

# Rebuild the route ID of each trip with the formula used when the routes were created:
# travel date as yyyymmdd, times 10000, plus the vehicle ID.
df_trips['route_id'] = df_trips['travel_date'].astype(str).str[:10].apply(omit_dashes).astype('int64') * 10000
df_trips['route_id'] += df_trips.vehicle_id

# Bracket assignment on purpose: `df_trips.hq_taz = ...` only updates a column that already
# exists; otherwise pandas sets a plain attribute and no 'hq_taz' column is created.
df_trips['hq_taz'] = df_trips.route_id.map(dic_rid_hq_taz)

df_trips[['travel_date', 'vehicle_id', 'route_id', 'hq_taz']].head()

In [None]:
# Give every trip the primary purpose of the route it belongs to (looked up by route_id).
dic_rte_purp = dict(zip(df_routes['route_id'], df_routes['primary_purp']))
df_trips['route_purpose'] = df_trips['route_id'].map(dic_rte_purp)

In [None]:
# Give every trip the customer type of the route it belongs to (looked up by route_id).
dic_rte_cust = dict(zip(df_routes['route_id'], df_routes['customer_type']))
df_trips['route_customers'] = df_trips['route_id'].map(dic_rte_cust)

In [None]:
# Update distance to headquarters.
# Skim distance from each trip's origin TAZ to the headquarters TAZ of its route, for the trip's
# vehicle type and departure period. This reads x['hq_taz'], so 'hq_taz' must be a real column
# of df_trips. find_dist returns None when either TAZ is -1.
df_trips['orgn_to_hq_dist'] = df_trips.apply(lambda x: find_dist(x['o_taz'], x['hq_taz'],
                                                           dic_v[x['veh_type']],
                                                           dic_tod[x['tod']]), axis=1)

In [None]:
df_trips.head()

In [None]:
print(df_trips.groupby(['route_purpose']).size().to_string(), "\n")
print(df_trips.groupby(['route_customers']).size().to_string())

####  Run a Few Checks for Quality Control of the Route Dataframe

In [None]:
# Tolerance, in minutes, between the two duration estimates of a route. It is defined before
# the loop so the summary print below also works when df_routes has no rows.
tol = 15

# Find the route start to end duration in minutes.
df_routes['route_dur_min'] = ((df_routes['end_tod'] - df_routes['start_tod'])
                             .dt.total_seconds()/60).round()

# Find the route duration based on its trips and stays durations.
# NOTE(review): the Trip class comments describe travel_time and stop_duration as seconds,
# while this sum is compared with route_dur_min (minutes) - confirm the units.
for i, route in df_routes.iterrows():
    duration = 0
    trips = route['trips']
    for j, t in enumerate(trips):
        if j != len(trips)-1:
            duration += (t.travel_time + t.stop_duration)
        else:
            # The stop after the last arrival is not part of the route.
            duration += t.travel_time
    duration = round(duration)
    df_routes.at[i, 'cumlv_dur'] = duration

    # Durations match unless the two methods are off by `tol` or more.
    off_by = abs(duration - route['route_dur_min'])
    df_routes.at[i, 'durations_match'] = not (off_by >= tol)

misses = len(df_routes.loc[df_routes['durations_match'] == False])
matches = len(df_routes.loc[df_routes['durations_match'] != False])
print(f"Number of routes with duration matches = {matches}; misses = {misses} (tolerance = {tol} minutes)")

In [None]:
df_routes.loc[df_routes['durations_match'] == False].head()

In [None]:
# Determine if the route has extended beyond the first day. This does not require that the duration of a route is more
# than 24 hours, rather if the first departure is in one calendar day while the last arrival is in another day, the
# route will be flagged as Multi-Day.

# DataFrame.insert raises when the column already exists (e.g. the cell is run twice), so the
# column is only inserted when it is missing; no exception has to be swallowed.
if 'multiday_route' not in df_routes.columns:
    df_routes.insert(min(13, len(df_routes.columns)), 'multiday_route', False)
df_routes['multiday_route'] = df_routes['start_tod'].dt.date < df_routes['end_tod'].dt.date

In [None]:
df_routes.loc[df_routes['multiday_route']==True].head()

In [None]:
# Add a column that indicates if the route has any warning.
df_routes['has_warning'] = 0

In [None]:
# Flag routes where two consecutive activities are both Base or both Home.
for i, route in df_routes.iterrows():
    acts = route['act_seg_seq']
    flag = any(prev == nxt and prev in ('B', 'H') for prev, nxt in zip(acts, acts[1:]))
    df_routes.at[i, 'warn_BB_or_HH'] = int(flag)
df_routes['warn_BB_or_HH'] = df_routes['warn_BB_or_HH'].astype('int8')

In [None]:
# Add warning columns (1 = warning, 0 = fine).

# Goods activities are coded in the activity sequences as pickups ('P') and deliveries ('D');
# the stop counts, the primary purpose and the customer type in this notebook all rely on
# these two codes. Testing for 'G' alone therefore never matched a goods stop. 'G' is still
# accepted in case a sequence carries it.
goods_acts = ('G', 'P', 'D')
goods_or_service_acts = goods_acts + ('S',)
goods_service_maint_acts = goods_or_service_acts + ('M',)

# Activity at the route origin is to deliver/pickup goods:
df_routes['warn_o_act_G'] = df_routes['act_seg_seq'].apply(lambda l: 1 if l[0] in goods_acts else 0)

# Activity at the route origin is to provide services:
df_routes['warn_o_act_S'] = df_routes['act_seg_seq'].apply(lambda l: 1 if l[0]=='S' else 0)

# Activity at the route destination is either to provide service or deliver/pickup goods:
df_routes['warn_d_act_SG'] = df_routes['act_seg_seq'].apply(lambda l: 1 if l[-1] in goods_or_service_acts else 0)

# Route trips extend beyond 12AM of the first day:
df_routes['warn_next_day'] = df_routes['multiday_route'].astype(int)

# Route duration time calculated from start and end of the tour doesn't match with the one that is calculated by
# summing trip travel times and stop durations:
df_routes['warn_duration'] = 1 - df_routes['durations_match'].astype(int)

# Neither Base nor Home appears in the route activities, including the route origin:
df_routes['warn_no_BH_stops'] = df_routes['act_seg_seq'].apply(lambda l: 0 if ('B' in l or 'H' in l) else 1)

# Neither Goods nor Services appears in the route activities, including the route origin:
df_routes['warn_no_GS_stops'] = df_routes['act_seg_seq'].apply(
    lambda l: 0 if any(a in goods_or_service_acts for a in l) else 1)

# None of Goods, Services, or Maintenance appears in the route activities, including the route origin:
df_routes['warn_no_GSM_stops'] = df_routes['act_seg_seq'].apply(
    lambda l: 0 if any(a in goods_service_maint_acts for a in l) else 1)
df_routes.head()

In [None]:
# Fill the has_warning column: 1 when any individual warning column is set.
# Only the 'warn_*' columns are used. A plain substring test for 'warn' would also pick up
# 'has_warning' itself, so a value of 1 from an earlier run could never be cleared.
warn_cols = [c for c in df_routes.columns if str(c).startswith('warn_')]
df_routes['has_warning'] = df_routes[warn_cols].max(axis=1)
# df_routes.head(20)

#### Explore the Route Dataframe

##### OPTIONAL: Create a cumulative distribution function for route durations.

##### OPTIONAL: Create a graph for starting/ending activity combinations.
Note: Starting activity is the activity at the origin of the first trip, not at its destination.

#### Report Out the Trip Dataframe

In [None]:
print(f'There are {len(df_trips):,} trips in the dataframe.')

In [None]:
df_trips.head()

In [None]:
# Rearrange the trip columns so related fields sit next to each other; columns that are not
# listed here are dropped from the reported trip dataframe.
trip_id_cols = ['industry_group', 'trip_number', 'last_trip']
time_cols = ['travel_date', 'o_dt', 'tod', 'd_at', 'nt_dt', 'travel_time', 'stop_duration']
origin_seg_cols = ['o_act_seg', 'o_act_seg_name', 'o_plc_seg', 'o_plc_seg_name']
dest_seg_cols = ['d_act_seg', 'd_act_seg_name', 'd_plc_seg', 'd_plc_seg_name']
origin_raw_cols = ['o_act', 'o_place_type', 'o_taz']
dest_raw_cols = ['d_act', 'd_place_type', 'd_taz']
vehicle_cols = ['trip_dist', 'veh_type']
owner_cols = ['route_id', 'company_id', 'estab_taz', 'vehicle_id', 'driver_id']
route_cols = ['expnsn_factor', 'participation_type', 'route_purpose', 'route_customers']

cols = (trip_id_cols + time_cols + origin_seg_cols + dest_seg_cols + origin_raw_cols
        + dest_raw_cols + vehicle_cols + owner_cols + route_cols)

# TNC surveys carry a company name as well; it goes right before 'driver_id'.
if dataset == 'TNC':
    cols.insert(-5, 'company_name')

df_trips = df_trips[cols]
df_trips.head(1)

In [None]:
# Write the trip dataframe to a dated workbook, keeping the row labels in an 'index' column.
out_file = dataset + f'_Trips_{current_date}.xlsx'
out_path = os.path.join(os.path.join(project_path, out_data_dir), out_file)
df_trips.to_excel(excel_writer=out_path, sheet_name='Trip Dataframe', index_label='index')

#### Report Out the Route Dataframe

In [None]:
# Columns that are not part of the reported route table (this includes the Trip objects).
drop_cols = ['multiday_route', 'cumlv_dur', 'durations_match', 'trips', 'headquarters', 'hq_taz']
df_routes = df_routes.drop(columns=drop_cols)
df_routes.head(2)

In [None]:
# Write the route dataframe to a dated workbook, without the row labels.
out_file = dataset + f'_Routes_{current_date}.xlsx'
out_path = os.path.join(os.path.join(project_path, out_data_dir), out_file)
df_routes.to_excel(excel_writer=out_path, sheet_name='Routes', index=False) #  index_label='index'