In [21]:
import pandas as pd

#google cloud big query libaries
from google.cloud import bigquery
from google.oauth2 import service_account

import os

In [22]:
dirShiny = "E:/GitHub/Resources/R-Shiny/HTS-Trip-Lengths/input"

group_by_columns = ['groupSampleSegment','groupNumWorkers','groupNumVehicles','groupTripType','groupModeTypeBroad','binSize']
display(group_by_columns)

index_columns = group_by_columns + ['binStart']
display(index_columns)

['groupSampleSegment',
 'groupNumWorkers',
 'groupNumVehicles',
 'groupTripType',
 'groupModeTypeBroad',
 'binSize']

['groupSampleSegment',
 'groupNumWorkers',
 'groupNumVehicles',
 'groupTripType',
 'groupModeTypeBroad',
 'binSize',
 'binStart']

# Setup Data

In [23]:
# Link to BigQuery Client through API

key_path = r"C:\Users\bhereth\confidential-2023-utah-hts-db5335615978.json"
#key_path = r"C:\Users\bhereth\tdm-scenarios-a85044dbbfd3.json"

credentials = service_account.Credentials.from_service_account_file(
    key_path, scopes=["https://www.googleapis.com/auth/cloud-platform"],
)

client = bigquery.Client(credentials=credentials, project=credentials.project_id,)
print ('Successfully Linked to BigQuery Client!')

Successfully Linked to BigQuery Client!


In [24]:
str_sql = """
SELECT
  bins.binSize,
  bins.binStart,
  gSs.groupSampleSegment,
  gNw.groupNumWorkers,
  gNv.groupNumVehicles,
  gTt.groupTripType,
  gMtb.groupModeTypeBroad,
  COUNT(*) as numTripRecords,
  SUM(trip.trip_weight) AS sumTripWeight
FROM
  `confidential-2023-utah-hts.bins.binsDist` as bins,
  `confidential-2023-utah-hts.20230313.hh` as hh,
  `confidential-2023-utah-hts.20230313.trip` as trip,
  `confidential-2023-utah-hts.20230313_groupings.groupSampleSegment` as gSs,
  `confidential-2023-utah-hts.20230313_groupings.groupNumWorkers` as gNw,
  `confidential-2023-utah-hts.20230313_groupings.groupNumVehicles` as gNv,
  `confidential-2023-utah-hts.20230313_groupings.groupModeTypeBroad` as gMtb,
  `confidential-2023-utah-hts.20230313_groupings.groupTripType` as gTt
WHERE
  trip.distance_miles >= bins.binStart AND
  trip.distance_miles < bins.binStart + bins.binSize AND
  hh.sample_segment = gSs.sample_segment AND
  hh.hh_id = trip.hh_id AND
  hh.num_workers = gNw.num_workers AND
  hh.num_vehicles = gNv.num_vehicles AND
  trip.trip_type = gTt.trip_type AND
  trip.mode_type_broad = gMtb.mode_type_broad
GROUP BY
  bins.binSize,
  bins.binStart,
  gSs.groupSampleSegment,
  gNw.groupNumWorkers,
  gNv.groupNumVehicles,
  gTt.groupTripType,
  gMtb.groupModeTypeBroad
"""

In [25]:
# test connection
df_trips_by_distance_bins = client.query(str_sql).to_dataframe()

display(df_trips_by_distance_bins)

Unnamed: 0,binSize,binStart,groupSampleSegment,groupNumWorkers,groupNumVehicles,groupTripType,groupModeTypeBroad,numTripRecords,sumTripWeight
0,1.0,0.0,Al,-1,-1,1,-1,179,20340.132947
1,1.0,0.0,Al,2,-1,-1,3,718,51811.237666
2,1.0,0.0,Al,-1,2,1002,1,171,7699.678613
3,1.0,0.0,Al,0,2,995,3,7,58.955739
4,1.0,0.0,Al,-1,4,1002,3,56,3364.649145
...,...,...,...,...,...,...,...,...,...
1398015,1.0,15.0,We,2,1,4,3,1,0.000000
1398016,0.5,29.5,We,4,5,1,-1,1,12.721921
1398017,1.0,7.0,We,1,3,1002,1,3,0.000000
1398018,1.0,58.0,We,2,-1,6,3,1,7.046994


In [26]:
_df = df_trips_by_distance_bins.copy()

_df = _df[(_df['groupNumWorkers']==-1) &
     (_df['groupNumVehicles']==-1) &
     (_df['groupTripType']==-1) &
     (_df['binSize']==1.0) &
     (_df['binStart']==0.0)]

display(_df)
display(_df.groupby(['groupModeTypeBroad'], as_index=False).agg(tripcount=('numTripRecords','sum')))

Unnamed: 0,binSize,binStart,groupSampleSegment,groupNumWorkers,groupNumVehicles,groupTripType,groupModeTypeBroad,numTripRecords,sumTripWeight
9424,1.0,0.0,Al,-1,-1,-1,1,410,23386.475581
14343,1.0,0.0,Al,-1,-1,-1,5,16,385.481346
15468,1.0,0.0,Al,-1,-1,-1,995,382,14774.390918
26630,1.0,0.0,Al,-1,-1,-1,-1,2786,181265.370049
37562,1.0,0.0,Al,-1,-1,-1,3,1870,134279.710455
...,...,...,...,...,...,...,...,...,...
1358728,1.0,0.0,We,-1,-1,-1,4,21,250.468333
1361185,1.0,0.0,Wa,-1,-1,-1,2,126,8318.904829
1376534,1.0,0.0,We,-1,-1,-1,3,2718,110599.923370
1387269,1.0,0.0,We,-1,-1,-1,1,1037,39646.421793


Unnamed: 0,groupModeTypeBroad,tripcount
0,-1,230041
1,1,63243
2,2,3645
3,3,127198
4,4,1132
5,5,2240
6,995,32583


# Calculate Distributions

In [27]:
# Group by 'trip_type' and 'distance_miles_2mibin' and sum 'trip_weight_sum'
df_grouped = df_trips_by_distance_bins.groupby(index_columns).agg({'numTripRecords': 'sum', 'sumTripWeight': 'sum'}).reset_index()

# Calculate the total weight for each trip type
total_weights = df_grouped.groupby(group_by_columns)['sumTripWeight'].transform('sum')

# Calculate percentage distribution
df_grouped['pctTripWeight'] = (df_grouped['sumTripWeight'] / total_weights) * 100

# Add cumulative distribution
df_grouped['cumPctTripWeight'] = df_grouped.groupby(group_by_columns)['pctTripWeight'].cumsum()

df_grouped = df_grouped.fillna(0)

display(df_grouped)

Unnamed: 0,groupSampleSegment,groupNumWorkers,groupNumVehicles,groupTripType,groupModeTypeBroad,binSize,binStart,numTripRecords,sumTripWeight,pctTripWeight,cumPctTripWeight
0,Al,-1,-1,-1,-1,0.5,0.0,1141,65337.101515,8.895491,8.895491
1,Al,-1,-1,-1,-1,0.5,0.5,1645,115928.268535,15.783359,24.678850
2,Al,-1,-1,-1,-1,0.5,1.0,1160,106695.780465,14.526377,39.205228
3,Al,-1,-1,-1,-1,0.5,1.5,771,67833.182464,9.235327,48.440555
4,Al,-1,-1,-1,-1,0.5,2.0,490,37199.059576,5.064564,53.505118
...,...,...,...,...,...,...,...,...,...,...,...
1398015,We,8,7,1002,3,1.0,2.0,1,90.183729,26.174584,26.174584
1398016,We,8,7,1002,3,1.0,3.0,1,90.183729,26.174584,52.349169
1398017,We,8,7,1002,3,1.0,10.0,2,164.179478,47.650831,100.000000
1398018,We,8,7,1002,3,5.0,0.0,2,180.367458,52.349169,52.349169


In [28]:
df_grouped.to_csv('../input/hts-trip-lengths.csv', index=False)

In [None]:
df_value_labels= client.query("SELECT * FROM `confidential-2023-utah-hts.20230313.value_labels`").to_dataframe()

display(df_value_labels)

Unnamed: 0,table,variable,value,label
0,person,age,8,55-64
1,person,age,1,Under 5
2,person,age,2,5-15
3,person,age,11,85 or older
4,person,age,3,16-17
...,...,...,...,...
2037,person,second_home_county,995,Missing response
2038,trip,trace_quality_flag,995,Missing response
2039,person,commute_subsidy_998,995,Missing response
2040,trip,trip_survey_complete,995,Missing response


In [None]:
def process_labels(variable, new_column_name):
    """
    Processes and formats a specific variable's labels from a DataFrame.

    Parameters:
    variable (str): The variable to filter and process.
    new_column_name (str): The new column name for the processed variable.

    Returns:
    DataFrame: A processed DataFrame with formatted labels for the specified variable.
    """
    # Filter the DataFrame for the specified variable
    df_labels = df_value_labels[df_value_labels['variable'] == variable].copy()
    
    # Assuming df_labels is your DataFrame and 'value' is initially not in integer format
    df_labels['value'] = df_labels['value'].astype(int)  # Convert 'value' to integer

    # Now sort the DataFrame by the 'value' column
    df_labels.sort_values(by='value', inplace=True)

    # Rename 'value' column to new_column_name and convert it to int64
    #df_labels.rename(columns={'value': variable}, inplace=True)

    #df_labels['newLabel'] = df_labels['value'].astype(str) + ': ' + df_labels['label']
    
    df_labels['newLabel'] = df_labels['label']

    # Drop unnecessary columns
    df_labels.drop(columns=['table', 'variable', 'label'], inplace=True)

    df_labels.rename(columns={'newLabel': 'label'}, inplace=True)

    
    df_labels = pd.concat([pd.DataFrame([[-1,'All']], columns=['value', 'label']),df_labels])

    display(df_labels)

    return df_labels

# Example usage:
# Assuming df_value_labels is a DataFrame you have that meets the requirements
df_trip_type_labels = process_labels('trip_type', 'Trip Type')
df_mode_type_broad_labels = process_labels('mode_type_broad', 'Broad Mode Type')
df_num_workers_labels = process_labels('num_workers', 'Number Worker')
df_num_vehicles_labels = process_labels('num_vehicles', 'Number Vehicle')


Unnamed: 0,value,label
0,-1,All
772,1,Home-based work
769,2,Home-based school
773,3,Home-based shopping
768,4,Home-based personal business
774,5,Home-based other
770,6,Non-home-based work
771,7,Non-home-based non-work
2003,995,Missing response


Unnamed: 0,value,label
0,-1,All
1244,1,Walk
1245,2,Bike
1247,3,Car
1246,4,Transit
1546,5,Other
2030,995,Missing response


Unnamed: 0,value,label
0,-1,All
908,0,0 (No workers)
911,1,1 worker
909,2,2 workers
917,3,3 workers
907,4,4 workers
918,5,5 workers
915,6,6 workers
912,7,7 workers
910,8,8 workers


Unnamed: 0,value,label
0,-1,All
1050,0,0 (no vehicles in household)
1049,1,1 vehicle
1052,2,2 vehicles
1047,3,3 vehicles
1051,4,4 vehicles
1053,5,5 vehicles
1048,6,6 vehicles
1045,7,7 vehicles
1046,8,8 or more vehicles


In [None]:
df_sample_segment_groups = client.query("SELECT * FROM `confidential-2023-utah-hts.20230313._vSampleSegmentGroups`").to_dataframe()

df_sample_segment_groups = pd.concat([pd.DataFrame([['All']], columns=['sampleSegmentGroup']),df_sample_segment_groups])
df_sample_segment_groups['value'] = df_sample_segment_groups[['sampleSegmentGroup']]
df_sample_segment_groups['label'] = df_sample_segment_groups[['sampleSegmentGroup']]

df_sample_segment_groups = df_sample_segment_groups[['value','label']]
display(df_sample_segment_groups)


Unnamed: 0,value,label
0,All,All
0,Box Elder - Tooele - Juab,Box Elder - Tooele - Juab
1,Cache,Cache
2,Davis,Davis
3,Iron,Iron
4,Morgan - Summit - Wasatch,Morgan - Summit - Wasatch
5,Salt Lake,Salt Lake
6,Utah,Utah
7,Washington,Washington
8,Weber,Weber


In [None]:
df_bin_sizes = df_trips_by_sample_segment_and_distance_bins[['binSize']].drop_duplicates()
df_bin_sizes.columns = ['value']
df_bin_sizes.loc[df_bin_sizes['value']==0.5, 'label'] = "1/2-mile"
df_bin_sizes.loc[df_bin_sizes['value']>=1, 'label'] = df_bin_sizes['value'].astype(int).astype(str) + "-mile"
df_bin_sizes = df_bin_sizes.sort_values('value')

display(df_bin_sizes)


NameError: name 'df_trips_by_sample_segment_and_distance_bins' is not defined

In [None]:
df_grouped[(df_grouped['sampleSegmentGroup']=='All') & (df_grouped['num_workers']==1) & (df_grouped['mode_type_broad']==-1)]

Unnamed: 0,sampleSegmentGroup,num_workers,num_vehicles,trip_type,mode_type_broad,binSize,binStart,numTrips,sumTripWeight,percentage,cumulative_percentage


# Export App Data

In [None]:
#df_trip_type_labels.to_csv(os.path.join(dirShiny,'trip_type_labels.csv'), index=False)
#df_mode_type_broad_labels.to_csv(os.path.join(dirShiny, 'mode_type_broad_labels.csv'), index=False)
#df_num_workers_labels.to_csv(os.path.join(dirShiny, 'num_workers_labels.csv'), index=False)
#df_num_vehicles_labels.to_csv(os.path.join(dirShiny, 'num_vehicles_labels.csv'), index=False)
#df_bin_sizes.to_csv(os.path.join(dirShiny, 'bin_sizes_labels.csv'), index=False)
#df_sample_segment_groups.to_csv(os.path.join(dirShiny, 'sample_segment_groups.csv'), index=False)
