# HOV Account Matching

In [1]:
suppressWarnings(suppressMessages(library(tidyverse)))
suppressMessages(library(lubridate))
library(RSQLCipher)

In [2]:
# Copy the key held in HOT_KEY into SQL_KEY.
# NOTE(review): presumably SQL_KEY is the variable RSQLCipher reads to open
# the database -- confirm against the package documentation.
Sys.setenv(SQL_KEY = Sys.getenv("HOT_KEY"))

# Default dimensions for plots rendered in the notebook.
options(
    repr.plot.width = 10,
    repr.plot.height = 6
)

In [7]:
# Path to the database file, relative to the working directory.
db_path = "../../../data/hot.db"

# import tables
# NOTE(review): the named vectors look like per-column type overrides, with
# "c" presumably meaning character -- confirm against RSQLCipher::load_table.
trips = load_table(
    db_path, "trips_linked",
    c(tag_id = "c", acct = "c", plate = "c", id = "c", zip = "c")
)
rts = load_table(db_path, "rts", c(plate = "c"))
hov = load_table(db_path, "hov", c(ag_tag_id = "c", acct_id = "c"))

In [3]:
# Two-way table of cell proportions for two expressions evaluated on `df`.
#
# Args:
#   df:   data frame (or list) whose columns the expressions may refer to.
#   var1: unquoted expression defining the table rows, e.g. `!is.na(acct_id)`.
#   var2: unquoted expression defining the table columns, e.g. `fips != 0`.
#
# Returns:
#   A `table` holding each cell's share of all observations, rounded to three
#   decimals. NA gets its own level whenever it occurs (useNA = "ifany"), and
#   the two dimensions are labelled with the text of the expressions.
make_2way = function(df, var1, var2) {
    row_expr = substitute(var1)
    col_expr = substitute(var2)

    # Names that are not columns of `df` must be looked up where the caller
    # wrote the expression. Without an explicit `enclos`, eval() falls back to
    # this function's own frame, so a caller's local variables are not found
    # and names such as `df` or `var1` resolve to the arguments above.
    caller_env = parent.frame()

    # Turn an expression into a single-string label for the table dimension.
    expr_label = function(expr) paste(deparse(expr), collapse=" ")

    counts = table(eval(row_expr, df, caller_env),
                   eval(col_expr, df, caller_env),
                   dnn=c(expr_label(row_expr), expr_label(col_expr)),
                   useNA="ifany")
    round(prop.table(counts), 3)
}
# Fraction of the elements of `x` that are NA.
frac.na = function(x) {
    is_missing = is.na(x)
    mean(is_missing)
}

## Missing data
Most fields are essentially complete. The exceptions are the plate state (about 8% missing), the census block information (about 7% missing), and a handful of other variables with a small share of missing values.

In [11]:
# Share of missing values in each column of `hov`, computed on the first
# 10,000 rows only, then transposed so every column prints on its own row.
hov %>%
    head(10000) %>%
    execute() %>%
    summarize_all(frac.na) %>%
    t()

0,1
acct_id,0.0
is_original,0.0
trip_id,0.0
txn_id,0.0
ag_tag_id,0.0
plate_state_id,0.0827
city,0.0002
state,0.0002
zip_code,0.0077
match,0.0002


## Joining

In [142]:
# Attach HOV account fields to the HOV-paid rows of `rts`, matching the rts
# agency tag to the hov tag id, and materialize the first 10,000 joined rows.
# NOTE(review): the join key is the tag alone, so a tag that appears on
# several `hov` rows would yield several output rows per `rts` row -- confirm
# that this fan-out is intended.
rts_hov = rts %>%
    filter(pmt_type == "HOV") %>%
    select(trip_id, trip_def_id, txn_id, agency_tag) %>%
    left_join(
        hov %>% select(acct_id, is_original, ag_tag_id, plate_state_id, fips),
        by = c("agency_tag" = "ag_tag_id")
    ) %>%
    head(10000) %>%
    # Eight type codes are supplied here.
    # NOTE(review): presumably one per output column, in order -- confirm.
    execute(col_types = "iiicclcc")

In [145]:
# Cross-tabulate whether the join found an account against whether fips is
# non-zero. The expressions are written exactly as they should be displayed,
# because make_2way uses their text as the table's dimension labels.
make_2way(rts_hov, !is.na(acct_id), fips != 0)

               fips != 0
!is.na(acct_id) FALSE  TRUE  <NA>
          FALSE 0.000 0.000 0.004
          TRUE  0.098 0.898 0.000

### *Diagnostics*

In [149]:
# Diagnostic: fetch every `hov` row recorded for one example trip.
hov %>%
    filter(trip_id == 104255284) %>%
    execute()

acct_id,is_original,trip_id,txn_id,ag_tag_id,plate_state_id,city,state,zip_code,match,is_exact,fips,county,cty_subdivision,block
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<dbl>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
-4.667543e+18,1,104255284,383866747,-5.018025e+18,-9.152049e+18,WOODINVILLE,WA,98072,Match,1,530330323212,33,32321,2002
-4.667543e+18,1,104255284,383870235,-5.018025e+18,8.48719e+17,WOODINVILLE,WA,98072,Match,1,530330323212,33,32321,2002


In [150]:
# Diagnostic: fetch every `rts` row recorded for the same example trip.
rts %>%
    filter(trip_id == 104255284) %>%
    execute()

trip_id,entry_time,exit_time,trip_def_id,fare,pmt_type,plaza,txn_id,entry_exit,txn_time,agency_tag,plate
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<dbl>,<chr>,<dbl>,<dbl>,<dbl>
104255284,1514992382,1514992722,2690,0,HOV,NB01,383866747,E,1514992382,-5.018025e+18,-9.152049e+18
104255284,1514992382,1514992722,2690,0,HOV,NB03,383868474,,1514992550,-5.018025e+18,
104255284,1514992382,1514992722,2690,0,HOV,NB04,383870235,X,1514992722,-5.018025e+18,8.48719e+17


In [151]:
# Diagnostic: fetch the `trips` row(s) for the same example trip.
trips %>%
    filter(trip_id == 104255284) %>%
    execute()

trip_id,def_id,toll,entry_time,exit_time,entry_plaza,exit_plaza,is_hov,tag_id,acct,plate,id,plate_state,zip,plus4_code,fips
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>
104255284,2690,0,1514995982,1514996322,3,6,1,,,,,,,,


## Post-join check

We are able to get census tracts for around 38% of HOV trips.

In [17]:
# Count trips in each combination of "fips is missing" and "toll is zero".
trips_top = trips %>%
    group_by(is.na(fips), toll == 0) %>%
    summarize(n = n()) %>%
    execute()