In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import plotly as pys
import ipywidgets as widgets
from ipywidgets import Layout

import _global_scripts as gs

## Purpose

**Investigation #6**: Average Drive Access Distance by Station (Model vs Observed) 

## Inputs

In [36]:
# Every TDM run to compare, declared once as (display label, rider-summary link CSV)
# so a label can never drift out of step with its file.
tdm_runs = [
    ("TDM v9.1", "_data/base_link.csv"),
    ("TDM v920-E2.14.4", "_data/E2.14/WFv920-E2.14.4_BY_2019_transit_rider_summary_link.csv"),
    ("TDM v920-E2.7.3", "_data/E2.7/WFv920-E2.7.3_BY_2019_transit_rider_summary_link.csv"),
    ("TDM v920-E2.13.4.3", "_data/E2.13/WFv920-E2.13.4.3_BY_2019_transit_rider_summary_link.csv"),
    ("TDM v920-E2.13.5.2", "_data/E2.13/WFv920-E2.13.5.2_BY_2019_transit_rider_summary_link.csv"),
]

# Parallel lists (same order) that the TDM summary loop zips together.
tdm_filenames = [run_path for _run_label, run_path in tdm_runs]
tdm_sourcenames = [run_label for run_label, _run_path in tdm_runs]


# Observed inputs: per-record access/egress distances and the ungrouped observed
# records; the two tables are joined on `id` further down.
df_obs_access_dist = pd.read_csv("_data/E2.6/OBS_Access_Egress_Distances_v2.csv")
df_obs_ungrouped = pd.read_csv("_data/base_observed_ungrouped_pa.csv")
# Disabled read of the processed survey file (absolute, machine-specific path):
#df_obs_data = pd.read_csv(r"D:/GitHub/OBS-TDM-Ridership-Compare/2023-OBS-TDM-Compare/data/2019-OBS/2019 Final Weighted UTA OD Data - 2022-04-05 - processed.csv")

In [8]:
def weighted_quantile(data, q, weights, factor):
    """Approximate a weighted quantile by repeating values in proportion to their weights.

    Each weight is multiplied by ``factor`` and rounded to the nearest integer;
    the matching value is repeated that many times and the ordinary
    ``np.quantile`` of the expanded sample is returned. A larger ``factor``
    preserves more of the fractional part of the weights at the cost of a
    larger temporary array.

    Parameters
    ----------
    data : array-like
        Values to take the quantile of.
    q : float or array-like of float
        Quantile(s) in [0, 1], passed straight to ``np.quantile``.
    weights : array-like
        Non-negative weight for each entry of ``data``.
    factor : int or float
        Multiplier applied to ``weights`` before rounding to repeat counts.

    Returns
    -------
    float or ndarray
        Quantile(s) of the expanded sample. NaN (shaped like ``q``) when every
        scaled weight rounds to zero, i.e. the expanded sample is empty.
    """
    values = np.asarray(data)
    # Coerce to an array first: multiplying a plain Python list by an integer
    # would repeat the list instead of scaling its elements.
    scaled_weights = np.asarray(weights, dtype=float) * factor
    repeat_counts = np.round(scaled_weights).astype(int)
    expanded = np.repeat(values, repeat_counts).astype(float)

    # An empty sample has no quantile; report NaN instead of failing inside numpy.
    if expanded.size == 0:
        return np.nan if np.ndim(q) == 0 else np.full(np.shape(q), np.nan)

    return np.quantile(expanded, q)

def weighted_average(data, weights):
    """Return the weighted mean of ``data``.

    Parameters
    ----------
    data : array-like
        Values to average.
    weights : array-like
        Weight for each entry of ``data``.

    Returns
    -------
    float
        ``np.average(data, weights=weights)``, or NaN when the weights sum to
        zero (``np.average`` would otherwise raise ``ZeroDivisionError``).
    """
    weight_array = np.asarray(weights, dtype=float)

    # A group with no weight has no defined mean; report NaN so one empty group
    # does not abort a whole groupby summary.
    if weight_array.sum() == 0:
        return np.nan

    return np.average(data, weights=weight_array)

## TDM

In [4]:
# Station lookup table: just the station name and the node number `N` that the
# TDM link endpoints (`A` / `B`) are matched against.
df_stations1 = gs.df_stations.loc[:, ['station', 'N']]

In [38]:
# Build one access-distance summary per TDM run and stack them into `df_tdm_brd`.
# Per-run summaries are collected in a list and concatenated once after the loop,
# rather than re-copying a growing DataFrame with pd.concat on every iteration.
tdm_brd_frames = []


def _summarize_tdm_access(grp):
    """Rider total and rider-weighted distance statistics for one (Mode, station, AccessMode) group."""
    distances = grp['Distance'].values
    riders = grp['Riders'].values
    return pd.Series({
        'Brd_PA': grp['Riders'].sum(),
        'Dist_15': weighted_quantile(distances, 0.15, riders, 10000),
        'Dist_Med': weighted_quantile(distances, 0.5, riders, 10000),
        'Dist_85': weighted_quantile(distances, 0.85, riders, 10000),
        'Dist_Avg': weighted_average(distances, riders)
    })


for tdm_filename, tdm_sourcename in zip(tdm_filenames, tdm_sourcenames):
    print(tdm_sourcename)
    df_tdm_links = pd.read_csv(tdm_filename, low_memory=False)

    # Attach station names to both link ends: node B -> boarding station,
    # node A -> alighting station. The duplicated lookup key columns are dropped.
    df_tdm_1 = (
        df_tdm_links
        .merge(df_stations1, left_on="B", right_on="N", how="left")
        .rename(columns={"station": "brd_station"})
        .merge(df_stations1, left_on="A", right_on="N", how="left")
        .rename(columns={"station": "alt_station"})
        .drop(columns=['N_x', 'N_y'])
        .copy()
    )

    # Keep links that connect a station to a node numbered below 10000, in either
    # direction (presumably zone centroids -- TODO confirm the numbering scheme).
    into_station = df_tdm_1["brd_station"].notna() & (df_tdm_1["A"] < 10000)
    out_of_station = df_tdm_1["alt_station"].notna() & (df_tdm_1["B"] < 10000)
    df_tdm_2 = df_tdm_1[into_station | out_of_station].copy()

    # Restrict to link modes 80 and 11 (the mode-code meanings are not defined in this notebook).
    df_tdm_2 = df_tdm_2[df_tdm_2["Mode"].isin([80, 11])].copy()
    # NOTE(review): FromSkim_CRT is coerced to numeric here but is not used by the summary below.
    df_tdm_2["FromSkim_CRT"] = pd.to_numeric(df_tdm_2["FromSkim_CRT"], errors="coerce")

    # Select the columns needed for the summary. No AccessMode filter is applied,
    # so every access mode present in the file is summarized.
    df_tdm_3 = df_tdm_2[['Mode', 'Period', 'AccessMode', 'brd_station', 'alt_station', 'Riders', 'Distance']].copy()

    # Aggregate at boarding-station level (rows without a boarding station are dropped).
    df_tdm_brd_source = (
        df_tdm_3
        .dropna(subset=["brd_station"])
        .groupby(['Mode', 'brd_station', "AccessMode"], as_index=False)
        .apply(_summarize_tdm_access)
    )

    # Add metadata columns
    df_tdm_brd_source['Source'] = tdm_sourcename
    df_tdm_brd_source['station'] = df_tdm_brd_source['brd_station']
    df_tdm_brd_source['Notes'] = 'Boardings represent only direct boardings to CRT.'

    # Reorder and select final columns
    df_tdm_brd_source = df_tdm_brd_source[
        ['Source', 'station', 'AccessMode', 'Brd_PA', 'Dist_15', 'Dist_Med', 'Dist_85', 'Dist_Avg', 'Notes']
    ]

    tdm_brd_frames.append(df_tdm_brd_source)

# pd.concat rejects an empty list, so fall back to an empty frame when no runs are configured.
df_tdm_brd = (
    pd.concat(tdm_brd_frames, ignore_index=True) if tdm_brd_frames else pd.DataFrame()
)

df_tdm_brd


TDM v9.1






TDM v920-E2.14.4






TDM v920-E2.7.3






TDM v920-E2.13.4.3






TDM v920-E2.13.5.2






Unnamed: 0,Source,station,AccessMode,Brd_PA,Dist_15,Dist_Med,Dist_85,Dist_Avg,Notes
0,TDM v9.1,01-PROVO CENTRAL STATION,walk,421.53,0.24,0.56,0.56,0.425441,Boardings represent only direct boardings to CRT.
1,TDM v9.1,02-OREM CENTRAL STATION,walk,463.64,0.08,0.08,0.58,0.256846,Boardings represent only direct boardings to CRT.
2,TDM v9.1,03-AMERICAN FORK STATION,walk,81.71,0.42,0.42,0.77,0.559250,Boardings represent only direct boardings to CRT.
3,TDM v9.1,04-LEHI STATION,walk,221.96,0.25,0.58,0.66,0.474284,Boardings represent only direct boardings to CRT.
4,TDM v9.1,05-DRAPER STATION,walk,151.75,0.40,0.41,0.47,0.437257,Boardings represent only direct boardings to CRT.
...,...,...,...,...,...,...,...,...,...
145,TDM v920-E2.13.5.2,11-FARMINGTON STATION,drive,444.90,1.43,2.30,4.16,2.679123,Boardings represent only direct boardings to CRT.
146,TDM v920-E2.13.5.2,12-LAYTON STATION,drive,1216.27,1.41,2.52,3.90,2.720216,Boardings represent only direct boardings to CRT.
147,TDM v920-E2.13.5.2,13-CLEARFIELD STATION,drive,1065.04,1.24,2.70,4.53,2.978371,Boardings represent only direct boardings to CRT.
148,TDM v920-E2.13.5.2,14-ROY STATION,drive,1027.89,2.01,3.37,5.82,3.883000,Boardings represent only direct boardings to CRT.


## Observed

In [40]:
# Attach the access/egress distances to each observed record. The default inner
# join on `id` keeps only records present in both tables.
df_obs_1 = pd.merge(df_obs_ungrouped, df_obs_access_dist, on='id')

In [41]:
def _summarize_obs_boardings(grp):
    """Weighted boardings and access-distance statistics for one (station, AccessMode) group."""
    access_dist = grp['AcDist'].values
    record_weight = grp['unlinked_weight_adj'].values
    return pd.Series({
        'Brd_PA': sum(grp['unlinked_weight_adj']),
        'Dist_15': weighted_quantile(access_dist, 0.15, record_weight, 10000),
        'Dist_Med': weighted_quantile(access_dist, 0.5, record_weight, 10000),
        'Dist_85': weighted_quantile(access_dist, 0.85, record_weight, 10000),
        'Dist_Avg': weighted_average(access_dist, record_weight)
    })


# Only the records flagged as boardings ('on') are summarized.
boardings = df_obs_1[df_obs_1['onoff'] == 'on']

# One row per station and access mode, labelled as the observed source and
# arranged in the same column order as the TDM summary.
boardings_total = (
    boardings
    .groupby(['station', 'AccessMode'])
    .apply(_summarize_obs_boardings)
    .reset_index()
    .assign(
        Source='OBS',
        Notes='Boardings represent all boardings, not just direct to CRT.',
    )
    [['Source', 'station', 'AccessMode', 'Brd_PA', 'Dist_15', 'Dist_Med', 'Dist_85', 'Dist_Avg', 'Notes']]
)
boardings_total





Unnamed: 0,Source,station,AccessMode,Brd_PA,Dist_15,Dist_Med,Dist_85,Dist_Avg,Notes
0,OBS,01-PROVO CENTRAL STATION,drive,2260.414,1.17,2.79,9.89,4.727752,"Boardings represent all boardings, not just di..."
1,OBS,01-PROVO CENTRAL STATION,walk,945.908,0.164252,0.371521,1.103047,0.545255,"Boardings represent all boardings, not just di..."
2,OBS,02-OREM CENTRAL STATION,drive,1460.04,0.95,2.85,4.83,3.319107,"Boardings represent all boardings, not just di..."
3,OBS,02-OREM CENTRAL STATION,walk,360.64,0.076044,0.310929,0.605386,0.4007,"Boardings represent all boardings, not just di..."
4,OBS,03-AMERICAN FORK STATION,drive,1410.593,2.09,3.98,9.58,5.402691,"Boardings represent all boardings, not just di..."
5,OBS,03-AMERICAN FORK STATION,walk,136.077,0.282029,0.824947,0.92142,0.839018,"Boardings represent all boardings, not just di..."
6,OBS,04-LEHI STATION,drive,1128.372,2.35,4.27,8.48,5.39814,"Boardings represent all boardings, not just di..."
7,OBS,04-LEHI STATION,walk,292.189,0.27289,0.351125,1.679277,0.896353,"Boardings represent all boardings, not just di..."
8,OBS,05-DRAPER STATION,drive,935.648,2.838472,4.54,8.32,5.075091,"Boardings represent all boardings, not just di..."
9,OBS,05-DRAPER STATION,walk,144.648,0.469682,1.03481,1.613032,1.072952,"Boardings represent all boardings, not just di..."


## Comparison of TDM and OBS

In [42]:
# Stack the observed summary on top of the TDM summaries into one long table
# with a fresh 0..n-1 index; this is the table the plot reads from.
sumStats = pd.concat(
    objs=[boardings_total, df_tdm_brd],
    axis=0,
    ignore_index=True,
)

In [43]:
def plotit(variable, access_mode):
    """Plot `variable` by station, grouped by source, for one access mode.

    Reads the module-level `sumStats` table, `output` widget and `firstTime`
    flag. While `firstTime` is falsy the call only sets it to True and draws
    nothing; every later call builds and shows the figure.

    Parameters
    ----------
    variable : str
        Column of `sumStats` to plot on the y axis (also used as the y-axis title).
    access_mode : str
        Value of `AccessMode` used to filter `sumStats`.
    """
    global firstTime

    output.clear_output()  # Clear previous output before displaying new content

    # Guard: the first call just arms the flag and skips drawing.
    if not firstTime:
        firstTime = True
        return

    rows_for_mode = sumStats.loc[sumStats['AccessMode'] == access_mode]

    # Grouped histogram: one bar per source at each station.
    chart = px.histogram(
        rows_for_mode,
        x="station",
        y=variable,
        color='Source',
        barmode='group',
        text_auto='.2s',
        height=400,
    )
    chart.update_layout(
        legend_title="Model Version",
        xaxis_title="Station Name",
        yaxis_title=str(variable),
    )

    chart.show()

In [44]:
# Choices offered by the two selector widgets.
lstValues = ['Dist_15', 'Dist_Med', 'Dist_85', 'Dist_Avg']
accessModeOptions = ['drive', 'walk']

selectValues = widgets.Select(
    options=lstValues,
    value='Dist_Med',
    description='Select Variable',
)
selectAccessMode = widgets.Dropdown(
    options=accessModeOptions,
    value='drive',
    description='Access Mode',
)

# Flag read by plotit(): while it is False, plotit() draws nothing and flips it to True.
firstTime = False

# Extra output area (cleared by plotit on every call) and the row holding both selectors.
output = widgets.Output()
hbox = widgets.HBox([selectValues, selectAccessMode])

# Re-run plotit whenever either selector changes.
interactive_output = widgets.interactive_output(
    plotit,
    {'variable': selectValues, 'access_mode': selectAccessMode},
)

display(hbox, interactive_output, output)

HBox(children=(Select(description='Select Variable', index=1, options=('Dist_15', 'Dist_Med', 'Dist_85', 'Dist…

Output()

Output()

## Conclusions

- The end of the line stations (Provo and Ogden) are showing lower drive access distances in the model than observed. 
    - In addition, the stations that immediately follow (Orem and Roy) are showing higher drive access distances in the model than observed.
    - This leads us to believe that the model is forecasting trips that possibly "bypass" the Provo and Ogden stations, instead preferring to go one station closer.