# SCOG Trip Distribution

## Join model time/distance skims to survey trips

michael.mccarthy@rsginc.com


In [1]:
# setup
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import geopandas as gpd
import openmatrix as omx


In [2]:
def freqPlot(df, var, query=None):
    """
    Draw a bar chart of how often each value of one column occurs.

    Parameters:
    df: dataframe holding the column to summarise
    var: name of the column whose values are counted
    query: optional expression handed to ``df.query`` to subset the rows
        before counting; when None every row of ``df`` is used

    Nothing is returned; the figure is shown with ``plt.show()``.
    """
    fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(5, 5))

    # None is a singleton, so test identity rather than equality
    if query is not None:
        data = df.query(query)
    else:
        data = df

    # one bar per distinct value, bar height = number of rows with that value
    counts = data[var].value_counts()
    axes.bar(counts.index, counts.values)
    axes.set_title(var)
    axes.set_xlabel(var)
    axes.set_ylabel('Frequency')

    plt.tight_layout()
    plt.show()

# from Bishoy
def GetTripRatesUsingOneVars(hh_df, trips_df, x, groups, cat, *args, **kwargs):
    """
    Weighted trips (and optionally trip rates) per class of one variable.

    Parameters:
    hh_df: household records; needs column ``x`` and 'hh_weight'
    trips_df: trip records; needs column ``x``, 'trip_weight' and, unless
        groups is "all", 'model_purpose'
    x: name of the classification column shared by both dataframes
    groups: value of 'model_purpose' to keep, or "all" to keep every trip
    cat: "trip" returns the summed weights only; any other value also adds
        a 'trips_rate' column (summed trip_weight / summed hh_weight)

    Returns a dataframe with one row per class of ``x`` that appears in both
    inputs, with columns x, 'hh_weight', 'trip_weight' and, when requested,
    'trips_rate'. Missing values are replaced by 0.
    Extra positional/keyword arguments are accepted and ignored.
    """
    # weighted number of households in each class
    hh_totals = hh_df.groupby([x])['hh_weight'].sum().reset_index()

    # restrict to the requested purpose (work on a copy either way)
    if groups == "all":
        purpose_trips = trips_df.copy()
    else:
        purpose_trips = trips_df[trips_df["model_purpose"] == groups].copy()

    # weighted number of trips in each class
    trip_totals = purpose_trips.groupby([x])['trip_weight'].sum().reset_index()

    # NOTE(review): the inner merge drops classes that have households but no
    # trips, so the fillna(0) below never produces a zero-rate row for them --
    # confirm this is intended.
    result_table = hh_totals.merge(trip_totals, left_on=x, right_on=x, how='inner')
    if cat != "trip":
        result_table['trips_rate'] = result_table['trip_weight'] / result_table['hh_weight']

    return result_table.fillna(0)

def GetTripRatesUsingTwoVars(hh_df, trips_df, x, y, groups, cat, *args, **kwargs):
    """
    Weighted trips or trip rates cross-classified by two variables.

    Parameters:
    hh_df: household records; needs columns ``x``, ``y`` and 'hh_weight'
    trips_df: trip records; needs columns ``x``, ``y``, 'trip_weight' and,
        unless groups is "all", 'model_purpose'
    x: classification column laid out across the table columns
    y: classification column laid out down the table rows
    groups: value of 'model_purpose' to keep, or "all" to keep every trip
    cat: "trip" returns summed trip weights; any other value returns rates
        (summed trip_weight / summed hh_weight per cell)

    Returns a pivot table indexed by ``y`` with one column per value of ``x``.
    Cells with no data are filled with 0.
    Extra positional/keyword arguments are accepted and ignored.
    """
    # Weighted households per (y, x) cell. The string "sum" is used instead of
    # np.sum: recent pandas versions warn when a NumPy callable is passed as aggfunc.
    HH_PivotTable = pd.pivot_table(hh_df, values="hh_weight", index=[y],
                                   columns=[x], aggfunc="sum")

    # Keep only the requested trip purpose, e.g. 'nhb', 'hbo', 'hbr', 'hbw', 'hbsc', 'hbc'
    if groups != "all":
        trips_df = trips_df[trips_df["model_purpose"] == groups].copy()
    else:
        trips_df = trips_df.copy()

    # Weighted trips per (y, x) cell
    Trip_PivotTable = pd.pivot_table(trips_df, values="trip_weight", index=[y],
                                     columns=[x], aggfunc="sum")

    if cat == "trip":
        result_table = Trip_PivotTable
    else:
        # rate = trips divided by the households in the same market segment
        result_table = Trip_PivotTable / HH_PivotTable

    return result_table.fillna(0)

def GetTripRatesUsingOneVars_Unweighted(hh_df, trips_df, x, groups, cat, *args, **kwargs):
    """
    Unweighted record counts (and optionally trip rates) per class of one variable.

    Parameters:
    hh_df: household records; needs column ``x`` and 'hh_id'
    trips_df: trip records; needs column ``x``, 'trip_id' and, unless groups
        is "all", 'trip_purpose'
    x: name of the classification column shared by both dataframes
    groups: value of 'trip_purpose' to keep, or "all" to keep every trip
    cat: "trip" returns the counts only; any other value also adds a
        'trips_rate' column (trip count / household count)

    Returns a dataframe with one row per class of ``x`` that appears in both
    inputs, with columns x, 'hh_id' (household count), 'trip_id' (trip count)
    and, when requested, 'trips_rate'. Missing values are replaced by 0.
    Extra positional/keyword arguments are accepted and ignored.
    """
    # number of household records in each class (non-null hh_id values)
    hh_counts = hh_df.groupby([x])['hh_id'].count().reset_index()

    # NOTE(review): this filters on 'trip_purpose' while the weighted variants
    # filter on 'model_purpose' -- confirm the difference is deliberate.
    if groups == "all":
        purpose_trips = trips_df.copy()
    else:
        purpose_trips = trips_df[trips_df["trip_purpose"] == groups].copy()

    # number of trip records in each class (non-null trip_id values)
    trip_counts = purpose_trips.groupby([x])['trip_id'].count().reset_index()

    # inner merge: only classes present in both tables survive
    result_table = hh_counts.merge(trip_counts, left_on=x, right_on=x, how='inner')
    if cat != "trip":
        result_table['trips_rate'] = result_table['trip_id'] / result_table['hh_id']

    return result_table.fillna(0)

def tripQA(trips_df, lookup_pairs, qa_queries):
    """
    Decode survey codes and flag records with QA expressions.

    Parameters:
    trips_df: trips dataframe; it is modified in place and also returned
    lookup_pairs: dict of {column: lookup}, such as
        {"o_purpose_category": purpose_lookup_dict}; each column is mapped
        through its lookup and stored as "<column>_decode"
    qa_queries: dict of {name: expression}; each expression is evaluated
        with ``trips_df.eval`` and stored as "<name>_QA". Column names and
        operators are passed in one string, such as
        "(mode_type == 1 & speed_mph > 4) | (mode_type == 2 & speed_mph > 20) | (speed_mph > 70)"
        (expected to yield a True/False column)

    Returns the same dataframe object with the new columns added.
    """
    # decoded companion column for every coded column
    for coded_col, code_map in lookup_pairs.items():
        decoded_values = trips_df[coded_col].map(code_map)
        trips_df[coded_col + "_decode"] = decoded_values

    # one flag column per QA expression
    for check_name, expression in qa_queries.items():
        flags = trips_df.eval(expression)
        trips_df[check_name + "_QA"] = flags

    return trips_df

def omxtoDataframe(thismatrix,indexmap,corestr):
    """
    Reshape a square matrix into a long origin/destination table.

    Parameters:
    thismatrix: 2-D matrix values (one row and one column per zone)
    indexmap: mapping whose keys label both the rows and the columns
        (assumes the key order matches the matrix order -- TODO confirm)
    corestr: name given to the value column of the result

    Returns a dataframe with columns 'origin' (row label), 'destination'
    (column label) and ``corestr`` (cell value), one row per matrix cell.
    """
    zone_ids = indexmap.keys()
    # wide form: rows and columns both labelled by zone id
    wide = pd.DataFrame(thismatrix, columns=zone_ids, index=zone_ids)
    # long form: one record per (row label, column label) pair
    long_form = wide.reset_index().melt(id_vars='index')
    df = long_form.rename(columns={'index': 'origin', 'variable': 'destination', 'value': corestr})
    return df

In [3]:
# load the household travel survey trip records

survey_trips = pd.read_csv(filepath_or_buffer='SCOG_HTS_trips.csv')


In [4]:
# Spatially join the survey trip ends (lat/lon) to the model zone shapefile.
# Zones are reprojected into Washington State Plane North (EPSG 2285).
taz = gpd.read_file('data/model/SCOG_20250113_zone.shp')
taz = taz.to_crs('EPSG:2285')
# keep only zones numbered below 1000 (higher numbers are externals)
taz = taz.loc[taz['NO'] < 1000]

# trip origins as points: declared as WGS 84 lon/lat, then moved to state plane
survey_o = gpd.GeoDataFrame(
    survey_trips,
    geometry=gpd.points_from_xy(survey_trips['o_lon'], survey_trips['o_lat']),
)
survey_o.set_crs('EPSG:4326', inplace=True)
survey_o.to_crs('EPSG:2285', inplace=True)
# optional export for inspection: survey_o.to_file('survey_trip_origins.shp')

# trip destinations, handled the same way
survey_d = gpd.GeoDataFrame(
    survey_trips,
    geometry=gpd.points_from_xy(survey_trips['d_lon'], survey_trips['d_lat']),
)
survey_d.set_crs('EPSG:4326', inplace=True)
survey_d.to_crs('EPSG:2285', inplace=True)

# Left-join each trip end to the zone it intersects; only the zone number and
# its polygon are carried over. Points outside every zone keep NaN in 'NO'.
taz_no = taz[['NO', 'geometry']]
survey_o_join = gpd.sjoin(survey_o, taz_no, how="left", predicate='intersects')
survey_d_join = gpd.sjoin(survey_d, taz_no, how="left", predicate='intersects')

In [5]:
# Trip ends that matched no zone polygon: some trips labelled I-I actually
# start or end at externals.
bad_o_join = survey_o_join.loc[survey_o_join['NO'].isna()]
bad_d_join = survey_d_join.loc[survey_d_join['NO'].isna()]


In [6]:
# attach the joined zone numbers to the trip table (assignment aligns on the row index)
survey_trips['o_taz'] = survey_o_join['NO']
survey_trips['d_taz'] = survey_d_join['NO']

# Drop trips with an origin or destination outside the TAZ polygons.
# .copy() makes the filtered result an independent frame, so the column
# assignments below do not write into a slice of the original
# (which would trigger SettingWithCopyWarning).
survey_trips = survey_trips[survey_trips['o_taz'].notna() & survey_trips['d_taz'].notna()].copy()

# convert float to int (safe now that rows without a zone number are gone)
survey_trips['o_taz'] = survey_trips['o_taz'].astype('int')
survey_trips['d_taz'] = survey_trips['d_taz'].astype('int')


In [7]:
# read omx skims: open the model skim file and list the matrix cores it holds
skim = omx.open_file('data/model/skims_newnet.omx')
skim.list_matrices()
# 102 = t0, 103 = tCur, 104 = distance
# New omx: 102 = t0, 103 = tCur, 110 = distance
# mappings = 'NO' (i.e. TAZID)
# NOTE(review): the recorded output of this cell lists only cores '102' and '104',
# and the next cell reads distance from '104' -- the "New omx" line above may be stale.


['102', '104']

In [8]:

# zone-number lookup stored in the skim file under the 'NO' mapping
tazs = skim.mapping('NO')

# core '102' (t0 per the core notes) as a long origin/destination table
fftime = skim['102']
fftime_df = omxtoDataframe(fftime, tazs, 'skim_fftime')

# core '104' (distance per the core notes) as a long origin/destination table
distance = skim['104']
dist_df = omxtoDataframe(distance, tazs, 'skim_distance')

In [9]:
# Fix some high/error skims: values above 999 are treated as missing.
# Series.mask keeps the columns numeric and writes NaN; np.where(..., None, ...)
# would turn them into object columns holding None, which breaks the
# multiplications and divisions done on these columns later.
fftime_df['skim_fftime'] = fftime_df['skim_fftime'].mask(fftime_df['skim_fftime'] > 999)
dist_df['skim_distance'] = dist_df['skim_distance'].mask(dist_df['skim_distance'] > 999)

In [10]:
# Attach the skim value for each trip's origin/destination zone pair.
# merge() defaults to an inner join, so trips whose zone pair is not in the
# skim table are dropped; the skim key columns are removed again afterwards.
survey_trips = (
    survey_trips
    .merge(fftime_df, left_on=['o_taz', 'd_taz'], right_on=['origin', 'destination'])
    .drop(columns=['origin', 'destination'])
)
survey_trips = (
    survey_trips
    .merge(dist_df, left_on=['o_taz', 'd_taz'], right_on=['origin', 'destination'])
    .drop(columns=['origin', 'destination'])
)

In [11]:
# Export the trips with zone numbers and skims attached.
# (for the old zone system the file name was 'SCOG_HTS_trips_toOldZones.csv')
# index=False: the row index is only a positional counter after the merges;
# writing it adds another 'Unnamed: 0' column each time the file is read back.
# This also matches the auto-trip export.
survey_trips.to_csv('SCOG_HTS_trips_toNewZones.csv', index=False)

In [11]:
# keep only the auto modes: mode_type codes 5, 6, 8, 9 (taxi, ridehail, vehicle, carshare)
survey_auto_trips = survey_trips.loc[survey_trips['mode_type'].isin([5, 6, 8, 9])]
# written without the row index
survey_auto_trips.to_csv('SCOG_HTS_trips_toNewZones_Autos.csv', index=False)

In [13]:
# export the long-format skim tables (the row index is written as the first column)
fftime_df.to_csv(path_or_buf='SCOG_FFSkim.csv')
dist_df.to_csv(path_or_buf='SCOG_DistSkim.csv')

## Reasonability Checks

### Survey Time vs Model Free-flow Time

In [14]:
# compare rMove app trip times to model free-flow skim
# participation_group 1 and 3 completed diaries online; 2 and 4 used rMove app
# .copy() gives an independent frame: ratio columns are added to it in later
# cells, which would otherwise raise SettingWithCopyWarning on a filtered slice
survey_trips = survey_trips[survey_trips['participation_group'].isin([2, 4])].copy()

In [15]:
# trip-weighted mean of the reported duration and of the skim time
# (both divided by the total trip weight)
survey_avg_trip_time = (
    survey_trips['duration_minutes'].mul(survey_trips['trip_weight']).sum()
    / survey_trips['trip_weight'].sum()
)
skim_avg_trip_time = (
    survey_trips['skim_fftime'].mul(survey_trips['trip_weight']).sum()
    / survey_trips['trip_weight'].sum()
)
# per-trip ratio of reported duration to skim time
survey_trips['time_vs_skim'] = survey_trips['duration_minutes'].div(survey_trips['skim_fftime'])

In [16]:
# report the weighted means and the spread of the per-trip ratio, two decimals each
print('Avg Survey Reported Trip Duration:', format(survey_avg_trip_time, '.2f'), 'minutes')
print('Avg Survey Skimmed Trip Duration:', format(skim_avg_trip_time, '.2f'), 'minutes')
print(
    'Ratio of Survey Reported Trip Duration to Time Skim:',
    'min:', format(survey_trips['time_vs_skim'].min(), '.2f'),
    'max:', format(survey_trips['time_vs_skim'].max(), '.2f'),
    'avg:', format(survey_trips['time_vs_skim'].mean(), '.2f'),
)

Avg Survey Reported Trip Duration: 14.05 minutes
Avg Survey Skimmed Trip Duration: 7.34 minutes
Ratio of Survey Reported Trip Duration to Time Skim: min: 0.00 max: 260.86 avg: 3.31


### Survey Distance vs Model Distance

In [17]:
# trip-weighted mean of the reported distance and of the skim distance
# (both divided by the total trip weight)
survey_avg_trip_dist = (
    survey_trips['distance_miles'].mul(survey_trips['trip_weight']).sum()
    / survey_trips['trip_weight'].sum()
)
skim_avg_trip_dist = (
    survey_trips['skim_distance'].mul(survey_trips['trip_weight']).sum()
    / survey_trips['trip_weight'].sum()
)
# per-trip ratio of reported distance to skim distance
survey_trips['dist_vs_skim'] = survey_trips['distance_miles'].div(survey_trips['skim_distance'])

In [18]:
# report the weighted means and the spread of the per-trip ratio, two decimals each
print('Avg Survey Reported Trip Distance:', format(survey_avg_trip_dist, '.2f'), 'miles')
print('Avg Survey Skimmed Trip Distance:', format(skim_avg_trip_dist, '.2f'), 'miles')
print(
    'Ratio of Survey Reported Trip Distance to Distance Skim:',
    'min:', format(survey_trips['dist_vs_skim'].min(), '.2f'),
    'max:', format(survey_trips['dist_vs_skim'].max(), '.2f'),
    'avg:', format(survey_trips['dist_vs_skim'].mean(), '.2f'),
)

Avg Survey Reported Trip Distance: 4.02 miles
Avg Survey Skimmed Trip Distance: 4.50 miles
Ratio of Survey Reported Trip Distance to Distance Skim: min: 0.00 max: 31.80 avg: 0.93


In [19]:
# number of trip records left in the comparison set
survey_trips.shape[0]

3754

In [20]:
# flag outliers: reported distance more than 2x the skim distance,
# or reported time more than 5x the free-flow skim time
qa_trips = survey_trips.loc[
    survey_trips['dist_vs_skim'].gt(2) | survey_trips['time_vs_skim'].gt(5)
]
qa_trips

Unnamed: 0.1,Unnamed: 0,arrive_date,arrive_dow,arrive_hour,arrive_minute,arrive_second,bike_park_location,d_bg,d_county,d_in_region,...,income_imputed_value,hh_size,w_size,hh_inc,o_taz,d_taz,skim_fftime,skim_distance,time_vs_skim,dist_vs_skim
1,1,2021-10-19,2,14,27,59,995,5.305795e+11,53057.0,1,...,74999.0,3,2,INC3,895,601,2.947121,1.672173,5.666548,0.418617
2,2,2021-10-19,2,14,43,24,995,5.305795e+11,53057.0,1,...,74999.0,3,2,INC3,601,895,3.030454,1.672173,5.081747,0.418617
12,12,2021-10-19,2,14,28,5,995,5.305795e+11,53057.0,1,...,74999.0,3,2,INC3,895,601,2.947121,1.672173,5.395097,0.418617
18,18,2021-10-20,3,9,56,49,995,5.305795e+11,53057.0,1,...,74999.0,3,2,INC3,608,606,1.538908,0.645423,7.732755,1.394435
19,19,2021-10-20,3,10,3,23,995,5.305795e+11,53057.0,1,...,74999.0,3,2,INC3,606,602,1.071559,0.490353,6.159251,3.466893
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5463,5546,2021-10-21,4,15,10,19,1,5.305795e+11,53057.0,1,...,99999.0,2,0,INC4,861,862,6.02015,3.92002,7.391843,2.933659
5466,5549,2021-10-19,2,12,57,6,995,5.305795e+11,53057.0,1,...,99999.0,2,0,INC4,874,510,2.081621,1.340329,7.926513,0.447651
5474,5557,2021-10-21,4,12,0,26,995,5.305795e+11,53057.0,1,...,99999.0,2,0,INC4,608,606,1.538908,0.645423,8.122641,1.859247
5484,5567,2021-10-26,2,15,58,8,995,5.305795e+11,53057.0,1,...,199999.0,4,2,INC4,709,730,4.355706,1.88807,7.048227,0.741498
