In [75]:
%reload_ext autoreload
%autoreload 2

from IPython.core.display import Markdown
from tqdm.auto import tqdm
import config
import docs
import pandas as pd
import requests
import db

pd.set_option('display.max_colwidth', None)

INDICATOR = 'border_wait_time'
CONFIG = config.get_config(INDICATOR, '../config.toml')

display(Markdown('## Raw data path'))
raw_dir_path = str(CONFIG['raw_dir']).replace('\\', '/')
display(Markdown(f"[{raw_dir_path}]({raw_dir_path})"))


## Raw data path

[C:/Users/tan/src/regional-pm-2023/data/raw/transportation/border/border_wait_time](C:/Users/tan/src/regional-pm-2023/data/raw/transportation/border/border_wait_time)

# Transportation Planning: Border

## Border Wait Time

In [76]:
docs.describe_indicator(
    indicator=INDICATOR,
    indicators_xlsx_path=CONFIG['indicators_xlsx_path'],
)

Average border wait time for different traffic types.

nan

In [77]:
docs.list_schema(
    indicator=INDICATOR,
    indicators_xlsx_path=CONFIG['indicators_xlsx_path'],
)

Unnamed: 0_level_0,name,description,type
column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
year,Year,Year of record.,datetime
traffic_type,Traffic Type,Type of border traffic.,string
average_wait_time,Average Wait Time,Average wait time in minutes in a given year for a given traffic type.,float


In [78]:
docs.list_sources(
    indicator=INDICATOR,
    indicators_xlsx_path=CONFIG['indicators_xlsx_path'],
)

Unnamed: 0_level_0,name,organization,active,notes
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
,Border Wait Time Data,Customs and Border Patrol,True,"SANDAG's database archive of this is missing several months in 2022 due to ETL errors, so average does not include some months of data."


In [79]:
steps = docs.list_update_steps(
    indicator=INDICATOR,
    indicators_xlsx_path=CONFIG['indicators_xlsx_path'],
)
steps

Unnamed: 0,step
0,Extract from legacy PM data.
1,Download new data from SANDAG's SQL Server and combine with legacy PM data.


In [80]:
docs.list_remarks(
    indicator=INDICATOR,
    indicators_xlsx_path=CONFIG['indicators_xlsx_path'],
)

Unnamed: 0,author,note
0,TAN,


### Step 0: Extract legacy data

In [81]:
display(steps.loc[0])

step    Extract from legacy PM data.
Name: 0, dtype: object

There are two tables on the sheet. Per their comments that the methodology has changed, it would be the best to use 2011-now data.

In [82]:
legacy_data = (
    pd.read_excel(
        CONFIG['legacy_xlsx_path'],
        CONFIG['legacy_sheet'],
        skiprows=42,
        nrows=11,
        usecols='A:I',
    )
    .rename(columns={'Unnamed: 0': 'year'})
    .rename(columns=lambda col: col.strip(' *'))
    .melt('year', var_name='traffic_type', value_name='average_wait_time')
    .set_index(['year', 'traffic_type'])
)
legacy_data

Unnamed: 0_level_0,Unnamed: 1_level_0,average_wait_time
year,traffic_type,Unnamed: 2_level_1
2011,SENTRI Passengers,5.0
2012,SENTRI Passengers,6.5
2013,SENTRI Passengers,7.0
2014,SENTRI Passengers,7.5
2015,SENTRI Passengers,7.5
...,...,...
2017,San Ysidro Pedestrian West,8.0
2018,San Ysidro Pedestrian West,10.0
2019,San Ysidro Pedestrian West,10.0
2020,San Ysidro Pedestrian West,7.0


### Step 1: Update with new data

In [83]:
display(steps.loc[1])

step    Download new data from SANDAG's SQL Server and combine with legacy PM data.
Name: 1, dtype: object

In [84]:
query = """--sql

SELECT 
	*
FROM (
SELECT
	[Year],
	[Column],
	CAST([Average Wait Time] AS FLOAT) AS 'Average Wait Time'
FROM (
	SELECT
		[Year],
		IIF(
			[Vehicle Type]='Passenger' AND [Lane Type]='SENTRI' AND [Port Name] IN ('San Ysidro','SanYsidro PedWest','Otay Mesa Passenger','Otay Mesa Commercial','Otay Mesa Border Express'), 
			'SENTRI Passengers',
		IIF(
			[Vehicle Type]='Passenger' AND [Lane Type]='READY' AND [Port Name] IN ('San Ysidro','SanYsidro PedWest','Otay Mesa Passenger','Otay Mesa Commercial','Otay Mesa Border Express'), 
			'READY  Passengers',
		IIF(
			[Vehicle Type]='Passenger' AND [Lane Type]='Standard' AND [Port Name] IN ('San Ysidro','SanYsidro PedWest','Otay Mesa Passenger','Otay Mesa Commercial','Otay Mesa Border Express','Tecate'), 
			'General Passengers',
		IIF(
			[Vehicle Type]='Commercial' AND [Lane Type]='Standard' AND [Port Name] IN ('Otay Mesa Passenger','Otay Mesa Commercial','Otay Mesa Border Express','Tecate'), 
			'Standard Commercial',
		IIF(
			[Vehicle Type]='Commercial' AND [Lane Type]='FAST' AND [Port Name] IN ('Otay Mesa Passenger','Otay Mesa Commercial','Otay Mesa Border Express'), 
			'FAST Commercial',
		IIF(
			[Vehicle Type]='Pedestrian' AND [Lane Type]='Standard' AND [Port Name] IN ('San Ysidro','Tecate','Otay Mesa Passenger','Otay Mesa Commercial','Otay Mesa Border Express'), 
			'General Pedestrians',
		IIF(
			[Vehicle Type]='Pedestrian' AND [Lane Type]='READY' AND [Port Name] IN ('San Ysidro','SanYsidro PedWest','Otay Mesa Passenger','Otay Mesa Commercial','Otay Mesa Border Express'), 
			'READY Pedestrians',
		IIF(
			[Vehicle Type]='Pedestrian' AND [Lane Type]='Standard' AND [Port Name] IN ('San Ysidro','San Ysidro PedWest'), 
			'San Ysidro Pedestrian West',
			'not included'
		)))))))) AS 'Column',
		[Average Wait Time],
		CONCAT([Vehicle Type],'_',[Lane Type],'_',[Port Name]) AS 'Inputs'
		FROM (
			SELECT 
				YEAR(date) as 'Year', 
				REPLACE(SUBSTRING(crossing_type.description, 1, CHARINDEX('-', crossing_type.description) - 1), 'Vehicle', '') as 'Vehicle Type',
				SUBSTRING(crossing_type.description, CHARINDEX('-', crossing_type.description) + 2, LEN(crossing_type.description)) as 'Lane Type',
				port_name as 'Port Name', 
				AVG(delay) as 'Average Wait Time'
			FROM borderwaittimes.dbo.border_wait_time 
			LEFT JOIN borderwaittimes.dbo.crossing_type 
				ON crossing_type.crossing_type_id = border_wait_time.crossing_type
			LEFT JOIN borderwaittimes.dbo.port_of_entry
				ON port_of_entry.port_id = border_wait_time.port_id
			WHERE date >= '2011-01-01'
			GROUP BY 
				YEAR(date), 
				description, 
				port_name
		) AS [wt_lookup]
		WHERE [Year]<2023
	) AS [wt_intermediate]
) AS wt
PIVOT
(
	AVG([Average Wait Time]) FOR [Column] IN (
		[SENTRI Passengers],
		[READY  Passengers],
		[General Passengers],
		[Standard Commercial],
		[FAST Commercial],
		[General Pedestrians],
		[READY Pedestrians],
		[San Ysidro Pedestrian West]
	)
) AS pivot_wt
"""

In [85]:
con = db.get_db_connection(
    server='SQL2014b8',
    db='borderwaittimes',
)

In [86]:
new_data = (
    pd.read_sql(query, con)
    .rename(columns={'Year': 'year'})
    .melt('year', var_name='traffic_type', value_name='average_wait_time')
    .query("`year` == 2022")
    .set_index(['year', 'traffic_type'])
)
new_data

Unnamed: 0_level_0,Unnamed: 1_level_0,average_wait_time
year,traffic_type,Unnamed: 2_level_1
2022,SENTRI Passengers,12.0
2022,READY Passengers,68.0
2022,General Passengers,63.666667
2022,Standard Commercial,30.5
2022,FAST Commercial,16.0
2022,General Pedestrians,19.666667
2022,READY Pedestrians,9.0
2022,San Ysidro Pedestrian West,0.0


In [87]:
borderwaittimes = pd.concat([legacy_data, new_data]).sort_index()
borderwaittimes

Unnamed: 0_level_0,Unnamed: 1_level_0,average_wait_time
year,traffic_type,Unnamed: 2_level_1
2011,FAST Commercial,22.00000
2011,General Passengers,46.66667
2011,General Pedestrians,21.00000
2011,READY Passengers,
2011,READY Pedestrians,
...,...,...
2022,READY Passengers,68.00000
2022,READY Pedestrians,9.00000
2022,SENTRI Passengers,12.00000
2022,San Ysidro Pedestrian West,0.00000


Note that several months were missing in the database due to the automatic download script going down.

In [88]:
# List not null records in 2022
(
    pd.read_sql(
        """--sql
            SELECT
                YEAR([date]) AS [year],
                MONTH([date]) AS [month],
                COUNT([date]) AS [records]
            FROM [borderwaittimes].[dbo].[border_wait_time]
            WHERE YEAR([date]) = 2022 AND [delay] is not NULL
            GROUP BY YEAR([date]), MONTH([date])
            ORDER BY MONTH([date])
        """,
        con
    )
)

Unnamed: 0,year,month,records
0,2022,1,9542
1,2022,2,10565
2,2022,3,11629
3,2022,11,8144
4,2022,12,11570


### Save Data

In [89]:
display(Markdown('#### Clean data path'))
clean_dir_path = str(CONFIG['clean_dir']).replace('\\', '/')
display(Markdown(f"[{clean_dir_path}]({clean_dir_path})"))

#### Clean data path

[C:/Users/tan/src/regional-pm-2023/data/clean/transportation/border/border_wait_time](C:/Users/tan/src/regional-pm-2023/data/clean/transportation/border/border_wait_time)

In [91]:
borderwaittimes.to_csv(
    CONFIG['clean_dir']
    / f'{INDICATOR}_odp.csv'
)