In [6]:
import pandas as pd

# Google Cloud BigQuery libraries
from google.cloud import bigquery
from google.oauth2 import service_account

import os

In [7]:
# Input folder of the HTS-Trip-Lengths R-Shiny app (per the path)
dirShiny = "E:/GitHub/Resources/R-Shiny/HTS-Trip-Lengths/input"

# Grouping columns; the household merge keys are derived from this list
group_by_columns = [
    'groupSampleSegment',
    'groupNumWorkers',
    'groupNumVehicles',
    'groupTripType',
    'groupModeTypeBroad',
]
display(group_by_columns)

['groupSampleSegment',
 'groupNumWorkers',
 'groupNumVehicles',
 'groupTripType',
 'groupModeTypeBroad']

# Setup Data

In [8]:
# Link to BigQuery Client through API

# Service-account key file. Set the UTAH_HTS_BQ_KEY_PATH environment variable
# to use a key stored elsewhere; when it is unset the original local path is
# used, so existing behavior is unchanged.
key_path = os.environ.get(
    "UTAH_HTS_BQ_KEY_PATH",
    r"C:\Users\bhereth\confidential-2023-utah-hts-db5335615978.json",
)
#key_path = r"C:\Users\bhereth\tdm-scenarios-a85044dbbfd3.json"

# Build credentials from the key file with the cloud-platform scope
credentials = service_account.Credentials.from_service_account_file(
    key_path,
    scopes=["https://www.googleapis.com/auth/cloud-platform"],
)

# The client is bound to the project id recorded in the key file
client = bigquery.Client(credentials=credentials, project=credentials.project_id)
print ('Successfully Linked to BigQuery Client!')

Successfully Linked to BigQuery Client!


In [31]:
# SQL for the trip summary.
# Reads the hh and trip tables plus four grouping lookup tables, matched in the
# WHERE clause: hh to trip on hh_id; hh.sample_segment, hh.num_workers and
# hh.num_vehicles to their grouping tables; trip.mode_type_broad to its
# grouping table.
# Returns one row per (groupSampleSegment, groupNumWorkers, groupNumVehicles,
# groupModeTypeBroad, trip_type) with the number of joined records and the sum
# of trip_weight rounded to 2 decimals.
# NOTE(review): the result carries the raw trip_type rather than a
# 'groupTripType' column, although group_by_columns lists 'groupTripType' --
# confirm this is intended.
str_sql_trips = """
SELECT
  gSs.groupSampleSegment,
  gNw.groupNumWorkers,
  gNv.groupNumVehicles,
  gMtb.groupModeTypeBroad,
  trip.trip_type AS trip_type,
  COUNT(*) as numTripRecords,
  ROUND(SUM(trip.trip_weight),2) AS sumTripWeight
FROM
  `confidential-2023-utah-hts.20230313.hh` as hh,
  `confidential-2023-utah-hts.20230313.trip` as trip,
  `confidential-2023-utah-hts.20230313_groupings.groupSampleSegment` as gSs,
  `confidential-2023-utah-hts.20230313_groupings.groupNumWorkers` as gNw,
  `confidential-2023-utah-hts.20230313_groupings.groupNumVehicles` as gNv,
  `confidential-2023-utah-hts.20230313_groupings.groupModeTypeBroad` as gMtb
WHERE
  hh.sample_segment = gSs.sample_segment AND
  hh.hh_id = trip.hh_id AND
  hh.num_workers = gNw.num_workers AND
  hh.num_vehicles = gNv.num_vehicles AND
  trip.mode_type_broad = gMtb.mode_type_broad
GROUP BY
  gSs.groupSampleSegment,
  gNw.groupNumWorkers,
  gNv.groupNumVehicles,
  gMtb.groupModeTypeBroad,
  trip.trip_type
"""

In [37]:
# SQL for the household summary.
# Reads the hh table plus three grouping lookup tables, matched in the WHERE
# clause on sample_segment, num_workers and num_vehicles.
# Returns one row per (groupSampleSegment, groupNumWorkers, groupNumVehicles)
# with the number of joined records and the sum of hh_weight rounded to
# 2 decimals.
str_sql_hh = """
SELECT
  gSs.groupSampleSegment,
  gNw.groupNumWorkers,
  gNv.groupNumVehicles,
  COUNT(*) as numHhRecords,
  ROUND(SUM(hh.hh_weight),2) AS sumHhWeight
FROM
  `confidential-2023-utah-hts.20230313.hh` as hh,
  `confidential-2023-utah-hts.20230313_groupings.groupSampleSegment` as gSs,
  `confidential-2023-utah-hts.20230313_groupings.groupNumWorkers` as gNw,
  `confidential-2023-utah-hts.20230313_groupings.groupNumVehicles` as gNv
WHERE
  hh.sample_segment = gSs.sample_segment AND
  hh.num_workers = gNw.num_workers AND
  hh.num_vehicles = gNv.num_vehicles
GROUP BY
  gSs.groupSampleSegment,
  gNw.groupNumWorkers,
  gNv.groupNumVehicles
"""

In [32]:
# Run the trip summary query and load the full result into a DataFrame
df_trips_by_type = (
    client
    .query(str_sql_trips)
    .to_dataframe()
)

display(df_trips_by_type)

Unnamed: 0,groupSampleSegment,groupNumWorkers,groupNumVehicles,groupModeTypeBroad,trip_type,numTripRecords,sumTripWeight
0,Da,-1,-1,-1,6,2326,108307.51
1,Da,-1,-1,5,6,107,4950.30
2,Da,-1,3,-1,6,670,43351.36
3,Da,-1,3,5,6,82,3160.41
4,Da,3,-1,-1,6,384,32794.74
...,...,...,...,...,...,...,...
24676,Bo,3,6,995,995,2,16.76
24677,Un,4,1,995,995,3,14.51
24678,Sa,3,6,995,995,2,1263.43
24679,WF,3,6,995,995,2,1263.43


In [38]:
# Run the household summary query and load the full result into a DataFrame
df_hh = (
    client
    .query(str_sql_hh)
    .to_dataframe()
)

display(df_hh)

Unnamed: 0,groupSampleSegment,groupNumWorkers,groupNumVehicles,numHhRecords,sumHhWeight
0,TT,-1,2,4401,441010.53
1,TT,-1,-1,11183,1137207.98
2,TT,1,2,1438,177837.28
3,TT,1,-1,4043,467915.47
4,TT,-1,0,731,59359.12
...,...,...,...,...,...
763,Sa,3,6,1,631.71
764,Sa,6,3,1,737.64
765,Sa,5,7,1,727.58
766,Ir,7,5,1,5.56


In [49]:
# Attach the household totals to every trip row. The merge keys are the
# grouping columns minus the two that the household summary does not have.
df_trips_by_type_hh = pd.merge(
    df_trips_by_type,
    df_hh,
    on=[col for col in group_by_columns if col not in ['groupTripType', 'groupModeTypeBroad']],
)

# Weighted trips per weighted household, with NaN ratios replaced by 0.
# The filled Series is assigned back to the column: calling
# fillna(..., inplace=True) on a column selection is chained assignment, which
# warns on pandas 2.x and leaves the frame unchanged under Copy-on-Write.
df_trips_by_type_hh['tripsPerHh'] = (
    df_trips_by_type_hh['sumTripWeight'] / df_trips_by_type_hh['sumHhWeight']
).fillna(0)
df_trips_by_type_hh

Unnamed: 0,groupSampleSegment,groupNumWorkers,groupNumVehicles,groupModeTypeBroad,trip_type,numTripRecords,sumTripWeight,numHhRecords,sumHhWeight,tripsPerHh
0,Da,-1,-1,-1,6,2326,108307.51,862,114449.46,0.946335
1,Da,-1,-1,5,6,107,4950.30,862,114449.46,0.043253
2,Da,-1,-1,-1,7,9183,289262.22,862,114449.46,2.527423
3,Da,-1,-1,1,7,687,34117.37,862,114449.46,0.298100
4,Da,-1,-1,1001,7,778,40751.70,862,114449.46,0.356067
...,...,...,...,...,...,...,...,...,...,...
24676,Al,3,1,-1,2,1,59.59,1,36.31,1.641146
24677,Al,3,1,3,2,1,59.59,1,36.31,1.641146
24678,Al,3,1,-1,7,1,36.31,1,36.31,1.000000
24679,Al,3,1,3,7,1,36.31,1,36.31,1.000000


In [50]:
# Look up the labels for trip.trip_type, keep only the value/label pair under
# the names used downstream, and cast the value to int.
df_trip_types = (
    client
    .query("SELECT * FROM confidential-2023-utah-hts.20230313.value_labels WHERE table='trip' AND variable='trip_type'")
    .to_dataframe()
    .drop(columns=['table', 'variable'])
    .rename(columns={'value': 'trip_type', 'label': 'tripTypeLabel'})
    .astype({'trip_type': int})
)
df_trip_types

Unnamed: 0,trip_type,tripTypeLabel
0,4,Home-based personal business
1,2,Home-based school
2,6,Non-home-based work
3,7,Non-home-based non-work
4,1,Home-based work
5,3,Home-based shopping
6,5,Home-based other
7,995,Missing response


In [51]:
# Add the trip type label to each row (default inner join on trip_type)
df_trips_by_type_hh_with_labels = df_trips_by_type_hh.merge(
    df_trip_types,
    on='trip_type',
)
df_trips_by_type_hh_with_labels

Unnamed: 0,groupSampleSegment,groupNumWorkers,groupNumVehicles,groupModeTypeBroad,trip_type,numTripRecords,sumTripWeight,numHhRecords,sumHhWeight,tripsPerHh,tripTypeLabel
0,Da,-1,-1,-1,6,2326,108307.51,862,114449.46,0.946335,Non-home-based work
1,Da,-1,-1,5,6,107,4950.30,862,114449.46,0.043253,Non-home-based work
2,Da,-1,-1,1,6,219,9956.28,862,114449.46,0.086993,Non-home-based work
3,Da,-1,-1,1001,6,271,12146.25,862,114449.46,0.106128,Non-home-based work
4,Da,-1,-1,3,6,1839,89050.14,862,114449.46,0.778074,Non-home-based work
...,...,...,...,...,...,...,...,...,...,...,...
24676,Su,5,2,5,2,1,30.80,1,18.77,1.640916,Home-based school
24677,Un,0,3,-1,2,1,19.25,1,11.73,1.641091,Home-based school
24678,Un,0,3,3,2,1,19.25,1,11.73,1.641091,Home-based school
24679,Al,3,1,-1,2,1,59.59,1,36.31,1.641146,Home-based school


In [52]:
# Write the labeled table, without the row index, to a relative path (current
# working directory).
# NOTE(review): dirShiny is not used here -- confirm the intended destination.
df_trips_by_type_hh_with_labels.to_csv(
    'trips_by_type.csv',
    index=False,
)