## Process Data
Remove Coordinates from Survey Responses. Put coordinates into Series15 geographies, drop coordinates in responses

## Set-up

In [137]:
# import geopandas as gpd
import pandas as pd
import numpy as np

In [None]:
data_path = '../../data'

# input path
selected_geography = 'TAZ'
survey_data_matched_geographies = f'{data_path}/processed/survey_data_matched_geographies_{selected_geography}.csv'
fhv_fare_path = f'{data_path}/interim/fhv_fares.csv'

# output path
processed_survey_data_low_pii_path = f'{data_path}/data_model_output_{selected_geography.lower()}.csv'
processed_survey_data_low_pii_simplified_path = f'{data_path}/data_model_output_simplified_{selected_geography.lower()}.csv'
processed_survey_data_full_pii_simplified_path = f'{data_path}/data_model_output_full_pii_{selected_geography.lower()}.csv'

In [139]:
# read in data
survey_output = pd.read_csv(survey_data_matched_geographies).drop(columns=['Unnamed: 0']).query('weight_departing_only_model_respondents > 0')
fhv_fare_data = pd.read_csv(fhv_fare_path, index_col ='Unnamed: 0')

# merge taxi/tnc imputes fares to main data
# survey_output = survey_output.merge(fhv_fare_data, on = 'unique_id')

  survey_output = pd.read_csv(survey_data_matched_geographies).drop(columns=['Unnamed: 0']).query('weight_departing_only_model_respondents > 0')


## Data Transformations

### Market Contruction

In [140]:
# WSP's current method from notebook 17 - Apr 2, 2025
# Define the classification logic using np.where
survey_output['market_segment'] = np.where(
    survey_output['is_sdia_home_airport'] & survey_output['flight_purpose_label'].isin(['BUSINESS_WORK', 'COMBINATION_BUSINESS_LEISURE']),
    'RESIDENT_BUSINESS',
    np.where(
        survey_output['marketsegment_label']=='EMPLOYEE', 'EMPLOYEE',
        np.where(
            survey_output['is_sdia_home_airport'], 'RESIDENT_NON_BUSINESS',
            np.where(
                survey_output['flight_purpose_label'].isin(['BUSINESS_WORK', 'COMBINATION_BUSINESS_LEISURE']),
                'VISITOR_BUSINESS',
                'VISITOR_NON_BUSINESS'
            )
        )
    )
)
survey_output['market_segment'] = survey_output['market_segment'].str.lower()
survey_output = survey_output.drop(columns = ['marketsegment','marketsegment_label'])

### [WIP] Other Access Mode
recode access mode "OTHER"

In [141]:
# transit boarding TAZs
    # 1044 - # 3830 -> inland trolley stop -> walk to transit -> use WALK skims
    # 2821 -> recode to rental # 1385 -> Old Town
    # 3102 -> recode to rental # 1891 -> Gaslamp -> other public transit -> use PNR skims
    # 4187 -> recode to rental # 1520 -> rental car center
    # 4206 -> recode to rental # rental car from shelter island to western rental car parking, took flyer very short ride ... drop?
    # 5229 -> recode to rental # 1520 -> rental car center ... drove from Encinitas

(
    survey_output
        .query('access_mode_label == "OTHER"')
        .query('weight_departing_only>0')
        [['unique_id','inbound_or_outbound_label','main_mode_label','transit_boarding_taz','origin_taz','access_mode','access_mode_label','access_mode_other']]
)

Unnamed: 0,unique_id,inbound_or_outbound_label,main_mode_label,transit_boarding_taz,origin_taz,access_mode,access_mode_label,access_mode_other
705,1044,INBOUND_TO_AIRPORT,AIRPORT_FLYER_SHUTTLE,3830.0,3829,98.0,OTHER,Trolley
2384,2821,INBOUND_TO_AIRPORT,AIRPORT_FLYER_SHUTTLE,1385.0,1166,98.0,OTHER,Rental
2654,3102,INBOUND_TO_AIRPORT,AIRPORT_FLYER_SHUTTLE,1891.0,1908,98.0,OTHER,Rental
3675,4187,INBOUND_TO_AIRPORT,AIRPORT_FLYER_SHUTTLE,1520.0,389,98.0,OTHER,Rental Car
3694,4206,INBOUND_TO_AIRPORT,AIRPORT_FLYER_SHUTTLE,1338.0,1024,98.0,OTHER,Rental
4658,5229,INBOUND_TO_AIRPORT,AIRPORT_FLYER_SHUTTLE,1520.0,848,98.0,OTHER,Rental


In [142]:
# recode unique_id 1044
    # 3830 -> inland trolley stop -> walk to transit -> use WALK skims by changing access_mode to walk?

### Hotel Courtesy Boolean
only some visitors have access to a hotel courtesy shuttle van. Use the "origin_activity_type" column to create a T/F column that can be used to determine if respondent has access to this mode for unchosen alternatives

In [143]:
survey_output['hotel_shuttle_bool'] = (survey_output['origin_activity_type'] == 3) #origin_activity_type 3 is hotel

### POIs - La Jolla & Downtown
the way the model classifies trips from these POIs: https://github.com/SANDAG/ABM/blob/b299c46c11153a791640e2656842851950ff1e79/src/asim/scripts/airport/CreatePOIomx.py#L43

In [144]:
survey_output['origin_poi'] = 0
survey_output.loc[survey_output['origin_zip'].astype(float) == 92037.0, 'origin_poi'] = 1
survey_output.loc[survey_output['origin_zip'].astype(float) == 92101.0, 'origin_poi'] = 2
# survey_output.groupby('origin_poi')['weight_departing_only_model_respondents'].sum()

### Normalize Parking Costs
compute total parking cost using the parking_cost and parking_cost_frequency columns

In [145]:
# create total parking cost column
survey_output['parking_cost_total'] = None

In [146]:
# resolve parking location issues
survey_output.loc[
    (survey_output['unique_id'].isin([1755, 2032])) &
        (survey_output['respondentid'].isin([8010, 9377])) &
        (survey_output['parking_location_other'].isin(['Aladdin','Wally']))
    , ['parking_location','parking_location_label']
] = np.array([5.0, 'OFF_AIRPORT_PARKING'])

survey_output.loc[
    (survey_output['unique_id']== 2111) &
        (survey_output['respondentid']== 9718) &
        (survey_output['parking_location_other']== 'San Diego airport parking') &
        (survey_output['airport_terminal']== 2.0)
    , ['parking_location','parking_location_label']
] = np.array([2.0, 'TERM2_PARKING_PLAZA'])

  survey_output.loc[


#### REFUSED Frequency Responses
few responses

In [147]:
# replace null employee parking costs w/ 0
survey_output.loc[
    (survey_output['market_segment'] == "EMPLOYEE") &
        (survey_output['parking_cost_frequency_label'] == "REFUSED") &
        (survey_output['main_mode_label'].isin(['DROVE_ALONE_AND_PARKED','DROVE_WITH_OTHERS_AND_PARKED'])) &
        (survey_output['parking_cost_numeric'].isna()),
    'parking_cost_total'
    ] = 0

In [148]:
# update rental car rider
secretive_rental_car_parker_id = (
    survey_output
    .query('main_mode_label == "RENTAL_CAR_PARKED"')
    .query('parking_cost_frequency_label == "REFUSED"')
    ['unique_id']
    .values
)

survey_output.loc[
    survey_output['unique_id'].isin(secretive_rental_car_parker_id),
    'parking_cost_frequency_label'
    ] = None
survey_output.loc[
    survey_output['unique_id'].isin(secretive_rental_car_parker_id),
    'parking_cost_total'
    ] = 0

#### "OTHER" Frequency Responses
mostly employees

In [149]:
# get list of alternate responses to recode
parking_cost_frequency_other = set(
    survey_output
        .query("parking_cost_numeric > 0")
        ['parking_cost_frequency_other']
        .dropna()
        .unique()
        .tolist()
)
# parking_cost_frequency_other

In [150]:
# recode parking numeric costs for respondents that had >0
#   value for parking_numeric but stated employer paid
responses = ['Company pays', '3 month pass employer paid', 'Employer paid']
survey_output.loc[
        survey_output['parking_cost_frequency_other'].isin(responses),
        'parking_cost_total'
    ] = 0

In [151]:
# recode parking numeric costs for respondent that said "drop off"
    # visitor non business ... may have been given ride and parked at structures and walked in, but survey does not give us a way of knowing that
survey_output.loc[
        survey_output['parking_cost_frequency_other'].isin(['Drop off']),
        'parking_cost_total'
    ] = 0

In [152]:
# recode 'weekly' and '4 days' response
#   only non-employees
respondent_index_weekly_4_days = (
    survey_output
    .query("parking_cost_frequency_other.isin(['Weekly','4 days'])")
    .query("market_segment == 'RESIDENT_NON_BUSINESS'")
    .query('nights_away > 0')
    .index
    )
survey_output.loc[
    respondent_index_weekly_4_days,
    'parking_cost_total'
    ] = survey_output.loc[respondent_index_weekly_4_days,'parking_cost_numeric']

In [153]:
# recode long-term paying employees
    # regular employee pass users included
        # appears to be a quarterly parking fee of ~$135
        # calculate daily rate for *employees* paying to park
# TOTO : REVISIT
number_weeks_in_months = 4.3333
number_months_based_on_responses_dict = {
    '3 months': 3.0,
    'Per three months': 3.0,
    '3 month pass': 3.0,
    'Quarterly': 3.0,
    'Quaterly': 3.0,
    '3 month': 3.0,
    'Quarer': 3.0,
    '2 months':2.0,
    '300':10.0,
    'Year':12.0
}
survey_output['number_months_paid'] = (
    survey_output
    .apply(lambda x: number_months_based_on_responses_dict.get(x['parking_cost_frequency_other'])
           ,axis=1)
)
response_index = (
    survey_output
    .query(f'parking_cost_frequency_other.isin({list(number_months_based_on_responses_dict.keys())})')
    .query('market_segment == "EMPLOYEE"')
    .index
)
survey_output.loc[response_index, 'parking_cost_total'] = (
    survey_output
    .loc[response_index]
    .eval(f"parking_cost_numeric / number_commute_days / number_months_paid / {number_weeks_in_months}")
)
survey_output.drop(columns = ['number_months_paid'], inplace=True)


In [154]:
# recode "2 week" responses
num_weeks = 2
response_index = (
    survey_output
    .query('parking_cost_frequency_other == "2 weeks"')
    .query('market_segment == "EMPLOYEE"')
    .index
)
survey_output.loc[response_index,'parking_cost_total'] = (
    survey_output
    .loc[response_index]
    .eval(f"parking_cost_numeric / number_commute_days / {num_weeks}")
)

#### Typical Transforms Using parking_cost_frequency_label
how to determine who paid for parking?

Treat Visitor and Resident the same for now


In [155]:
# hourly payers
    # found only two residential, and that they were miscoded as hourly when they gave total responses
res_miscoded_hourly_payers_unique_ids = [1239,2525]
res_miscoded_hourly_payers_index= (
    survey_output
        .query(f'unique_id.isin({res_miscoded_hourly_payers_unique_ids})')
        .index
    )
survey_output.loc[res_miscoded_hourly_payers_index,'parking_cost_frequency'] = 1
survey_output.loc[res_miscoded_hourly_payers_index,'parking_cost_frequency_label'] = "TOTAL"

    # visitor that miscoded response, should be daily
vis_miscoded_hourly_payers_unique_ids = [5303]
vis_miscoded_hourly_payers_index= (
    survey_output
        .query(f'unique_id.isin({vis_miscoded_hourly_payers_unique_ids})')
        .index
    )
survey_output.loc[vis_miscoded_hourly_payers_index,'parking_cost_frequency'] = 3
survey_output.loc[vis_miscoded_hourly_payers_index,'parking_cost_frequency_label'] = "DAILY"


In [156]:
# adjust nights away bins to give avg number of nights
# survey_output.groupby(['nights_away','nights_away_label'])['unique_id'].count()

nights_bin_avg_map = {
    8: 9,
    9: 12,
    10: 17
}
survey_output['days_parked'] = survey_output['nights_away'].combine_first(survey_output['number_of_nights']).replace(nights_bin_avg_map) + 1

In [157]:
# coding parking rates
    # keys are parking_cost_frequency values
cost_col = 'parking_cost_numeric'

# monthly payers
months_in_yr = 12
sdia_flight_frequency_bin_5_lower_bound = 21
estimate_num_trips_typical_superflyers = 40 # slightly less than once per week at high end # TODO - model typical number of trips??? # TODO: add to list of assumptions
num_monthly_trips = ((sdia_flight_frequency_bin_5_lower_bound + estimate_num_trips_typical_superflyers) / 2) / months_in_yr

pax_functions = {
    1: lambda x: x[cost_col], # TOTAL
    2: lambda x: x[cost_col] / num_monthly_trips, # MONTHLY
    3: lambda x: x[cost_col] * x['days_parked'], # DAILY
    # no true hourly rate paying respondents
    }
survey_output['parking_cost_frequency_typical_function'] = (
    survey_output
    .apply(lambda x: pax_functions.get(x['parking_cost_frequency']),
           axis=1)
    .fillna(lambda x: x[cost_col]) # handle nulls
)

survey_output['parking_cost_total'] = (
    survey_output
        .apply(lambda x: x['parking_cost_frequency_typical_function'](x),
            axis=1)
        .combine_first(survey_output['parking_cost_total']) # coalesce
    )

survey_output.drop(columns=['parking_cost_frequency_typical_function'], inplace=True)

#### Parking Costs for Respondents that Made Alternate Choice
- on-site and off-site columns
- take avgs of both per night and then multiply by number of nights
- only for residents

In [158]:
# assign actual parking costs to respective parking location columns

# define parking location groups
onsite_parking_locs = [1,2,3,4]
employee_parking_locs = [6,7]
offsite_parking_locs = [5]

# separate observed parking costs into parking location groups
survey_output.loc[
    survey_output['parking_location'].isin(onsite_parking_locs),
    'onsite_parking_cost'
    ] = survey_output.loc[survey_output['parking_location'].isin(onsite_parking_locs), 'parking_cost_total']
survey_output.loc[
    survey_output['parking_location'].isin(employee_parking_locs),
    'employee_parking_cost'
    ] = survey_output.loc[survey_output['parking_location'].isin(employee_parking_locs), 'parking_cost_total']
survey_output.loc[
    survey_output['parking_location'].isin(offsite_parking_locs),
    'offsite_parking_cost'
    ] = survey_output.loc[survey_output['parking_location'].isin(offsite_parking_locs), 'parking_cost_total']

In [159]:
onsite_parking_rate = 38.0 #per day -> from site, close to observed data
offsite_parking_rate = 21.6 # per day -> observed data, v close to various online rates

# to get avg observed parking rates by parking location:
(
    survey_output
    .query('parking_cost_total> 0')
    .groupby(['parking_location','parking_location_label'])
    [['parking_location','parking_location_label','parking_cost_total','days_parked']]
    .apply(lambda x: np.mean(x['parking_cost_total']/x['days_parked']))
    .reset_index()
    .astype({'parking_location': float})
    .sort_values(by='parking_location')
)

Unnamed: 0,parking_location,parking_location_label,0
0,1.0,TERM1_PARKING_PLAZA,31.930556
1,2.0,TERM2_PARKING_PLAZA,40.763792
8,2.0,TERM2_PARKING_PLAZA,15.0
2,3.0,TERM1_CURBSIDE_VALET,40.0
3,4.0,TERM2_CURBSIDE_VALET,50.0
4,5.0,OFF_AIRPORT_PARKING,21.700707
9,5.0,OFF_AIRPORT_PARKING,15.625
5,6.0,EMPLOYEE_LOT_3665_ADMIRAL_BOLAND_WAY,
6,7.0,ADMIN_BUILDING_LOT_2417_MCCAIN_ROAD,
7,98.0,OTHER,


In [160]:
# coalesce modeled rates onto observed parking rates
    # only use estimated values if they are not in data
for site,rate in zip(['onsite_parking_cost','offsite_parking_cost'],[onsite_parking_rate,offsite_parking_rate]):
    survey_output.loc[:,site] = (
        survey_output
            .loc[:,site]
            .combine_first(
                survey_output
                .apply(lambda x: x['days_parked'] * rate,
                    axis=1)
            )
        )

### Rental Car Cost + Travel Time
currently estimating rental car time by introducing  adjustment to auto skims
data from ABM3 model: https://github.com/SANDAG/ABM/blob/50b56ca950b08b5a7d902e97f2f45536d750b150/src/asim/configs/airport.SAN/trip_mode_choice.yaml#L129

In [161]:
rentalCostPerDay= 60.99
rentalCarInVehicleTime= 7.50
rentalCarWalkTime= 3.00
rentalCarWaitTime= 7.50

In [162]:
# TODO: revisit using skims?
survey_output['rental_additional_time'] = rentalCarWalkTime + rentalCarWaitTime # + survey_output['auto_time'] #+ rentalCarInVehicleTime

# some pax stated that destination_taz was rental car center
#   - add rentalCarInVehicleTime for these respondents, as auto skims do not account for additional auto travel to airport
rental_center_bound_pax = (
    survey_output
        .query('main_mode_grouped_label == "RENTAL_CAR"')
        .query('inbound_or_outbound_label == "INBOUND_TO_AIRPORT"')
        .query('market_segment != "EMPLOYEE"')
        .query('weight_departing_only>0')
        .query('destination_taz==1520')
        .index
)
survey_output.loc[rental_center_bound_pax,'rental_additional_time'] += rentalCarInVehicleTime

In [163]:
survey_output['rental_cost'] = (survey_output['nights_visited'].replace(nights_bin_avg_map) + 1) * rentalCostPerDay

### Shuttles

In [164]:
hotelCourtesyWalkTime = 5.00
# terminalWalkTime = 5.00
# rentalCarWaitTime= 7.50 # use rental car shuttle headway

In [165]:
survey_output['shuttle_additional_time'] = hotelCourtesyWalkTime + rentalCarWaitTime

### Parking Travel Time Adjustments 
(PAX On- and Off-Site)

use model assumptions


In [166]:
# TODO: employee parking time adjustments (not in airport model)

In [167]:
# https://github.com/SANDAG/ABM/blob/80fb954649c8cb5c10abbfa92c43372c8bceaa8d/src/asim/configs/airport.SAN/trip_mode_choice.yaml#L115C1-L117C30
parkLocation4InVehicleTime = 6.00
parkLocation4WalkTime = 2.00
parkLocation4WaitTime = 3.00 # TODO revisit

# https://github.com/SANDAG/ABM/blob/80fb954649c8cb5c10abbfa92c43372c8bceaa8d/src/asim/configs/airport.SAN/trip_mode_choice.yaml#L133
terminalWalkTime = 5.00

In [168]:
survey_output['offsite_parking_additional_time'] = parkLocation4InVehicleTime + parkLocation4WalkTime + parkLocation4WaitTime
survey_output['onsite_parking_additional_time'] = terminalWalkTime

### Party Size
Prefer to use ground access party column instead of party size flight column when available

431 Null values - equivalent to employee count

In [169]:
# TODO go back and fix error made by consultants

In [170]:
# coalesce columns, prioritize ground access column
survey_output['party_size_access_transformed'] = (
    survey_output
        ['party_size_ground_access']
        # .combine_first(survey_output['number_of_travel_companions'])
        .combine_first(survey_output['party_size_flight'] - 1)
)

# convert 'number of companions' into total party size
party_size_access_dict = {
    0: 'ONE',
    1: 'TWO',
    2: 'THREE',
    3: 'FOUR',
    4: 'FIVE',
    5: 'SIX',
    6: 'SEVEN',
    7: 'EIGHT_OR_MORE',
    98: 'OTHER',
    99: 'REFUSED'
    }
survey_output['party_size_access_transformed_label'] = (
    survey_output
    ['party_size_access_transformed']
    .replace(party_size_access_dict)
)

survey_output = (
    survey_output
    .drop(columns= ['number_of_travel_companions', 'number_of_travel_companions_label'])
)

### Taxi/TNC Fare

In [171]:
survey_output['taxi_fare'] = (
    fhv_fare_data.loc['taxi_model', 'base_fare'] +
    fhv_fare_data.loc['taxi_model', 'fare_cost_per_mile'] * survey_output['auto_dist']
)
survey_output['tnc_fare'] = (
    fhv_fare_data.loc['tnc_model', 'base_fare'] +
    fhv_fare_data.loc['tnc_model', 'fare_cost_per_mile'] * survey_output['auto_dist']
)

### Taxi/TNC Wait
The relatively low wait times are likely a result of respondents ordering rides before they are fully prepared to leave

In [172]:
# use overall mean of taxi wait
survey_output.loc[
    survey_output['main_mode_label'] == "TAXI",
    'taxi_wait'
    ] = survey_output.loc[survey_output['main_mode_label'] == "TAXI", 'taxi_fhv_wait_numeric']

taxi_mean_wait = survey_output.query('main_mode_label == "TAXI"')['taxi_fhv_wait_numeric'].mean()
survey_output['taxi_wait'] = survey_output['taxi_wait'].fillna(taxi_mean_wait)

In [173]:
# median ridehail wait time across PMSAs is approximately 5 minutes (after removing 3 implausible values over 1000)
    # waits are heavily skewed, if you look at violin or box plot this is clear

# sns.violinplot(
#     data = survey_output.query('main_mode_label == "UBER_LYFT"').query('taxi_fhv_wait_numeric < 60'),
#     x = 'origin_pmsa',
#     y = 'taxi_fhv_wait_numeric'
# )
# plt.ylim((0,60))
(
    survey_output
    .query('main_mode_label == "UBER_LYFT"')
    .query('taxi_fhv_wait_numeric < 60')
    .groupby(['origin_pmsa', 'origin_pmsa_label'])
    ['taxi_fhv_wait_numeric']
    .agg(['median','mean','std','count'])
    .round(3)
    .reset_index()
)

Unnamed: 0,origin_pmsa,origin_pmsa_label,median,mean,std,count
0,1.0,DOWNTOWN,5.0,5.7,5.611,343
1,2.0,CENTRAL,5.0,6.148,5.526,338
2,3.0,NORTH_CITY,5.0,6.184,6.439,381
3,4.0,SOUTH_SUBURBAN,5.0,5.863,6.235,51
4,5.0,EAST_SUBURBAN,5.0,7.68,8.472,50
5,6.0,NORTH_COUNTY_WEST,4.0,6.347,8.181,95
6,7.0,NORTH_COUNTY_EAST,1.0,3.524,5.706,21
7,99.0,EXTERNAL,5.0,5.915,7.077,47


In [174]:
tnc_index = (survey_output['main_mode_label'] == "UBER_LYFT") & (survey_output['taxi_fhv_wait_numeric'].notna())
survey_output.loc[tnc_index, 'tnc_wait'] = survey_output.loc[tnc_index, 'taxi_fhv_wait_numeric']
survey_output.loc[~tnc_index, 'tnc_wait'] = 5.0

### Mode Groupings

In [175]:
mode_model_grouping_dict = {
    'parked_on_site': {
        'main_mode': {13, 14}, #DROVE_ALONE_AND_PARKED, DROVE_WITH_OTHERS_AND_PARKED
        'parking_location': {1,2,3,4} # TERM1_PARKING_PLAZA, TERM2_PARKING_PLAZA, TERM1_CURBSIDE_VALET, TERM2_CURBSIDE_VALET
    },
    'parked_off_site': {
        'main_mode': {13, 14}, #DROVE_ALONE_AND_PARKED, DROVE_WITH_OTHERS_AND_PARKED
        'parking_location': {5} # OFF_AIRPORT_PARKING
    },
    'parked_employee': {
        'main_mode': {13, 14}, #DROVE_ALONE_AND_PARKED, DROVE_WITH_OTHERS_AND_PARKED
        'parking_location': {6,7} # EMPLOYEE_LOT_3665_ADMIRAL_BOLAND_WAY, ADMIN_BUILDING_LOT_2417_MCCAIN_ROAD
    },
    'parked_unknown': {
        'main_mode': {13, 14}, #DROVE_ALONE_AND_PARKED, DROVE_WITH_OTHERS_AND_PARKED
        'parking_location': {98,99} # OTHER, REFUSED
    },
    'drop_off': {
        'main_mode': {12, 23} #DROPPED_OFF_BY_FAMILY_FRIEND, PICKED_UP_BY_FAMILY_FRIEND-> including bc one respondent mislabeled
    },
    'public_transit': {
        'main_mode': {15,16,30} # MTS_ROUTE_992, AIRPORT_FLYER_SHUTTLE, OTHER_PUBLIC_TRANSIT
    },
    'active_transportation': {
        'main_mode': {1,2,3,4,5,6,7,8} #
    },
    'rental_car': {
        'main_mode': {20, 19, 27} # RENTAL_CAR_PARKED, RENTAL_CAR_DROPPED_OFF, RENTAL_CAR_PICKED_UP-> including bc one respondent mislabeled
    },
    'taxi': {
        'main_mode': {9} # TAXI
    },
    'tnc': {
        'main_mode': {10} # UBER_LYFT
    },
    'shuttle': {
        'main_mode': {17,21,22} # CHARTERED_TOUR_BUS, OTHER_SHARED_VAN, HOTEL_SHUTTLE_VAN
    },
    'employee_shuttle': {
        'main_mode': {18} # EMPLOYEE_SHUTTLE
    }
}

In [176]:
mode_model_grouping_df = (
    pd.DataFrame
    .from_dict(mode_model_grouping_dict, orient='index')
    .reset_index()
    .explode('main_mode')
    .explode('parking_location')
    .astype({'main_mode': float, 'parking_location': float})
    .fillna(999.0)
    .rename(columns= {'index': 'mode_group', 'parking_location': 'parking_location_fillna'})
)
# mode_model_grouping_df

In [177]:
survey_output = survey_output.drop(columns= ['mode_group'], errors= 'ignore')
survey_output['parking_location_fillna'] = survey_output['parking_location'].astype(float).fillna(999.0)
survey_output = (
    survey_output
    .merge(
        mode_model_grouping_df,
        how = 'left',
        on= ['main_mode','parking_location_fillna'],
        validate = 'many_to_one',
    )
    .drop(columns= ['parking_location_fillna'])
)

In [178]:

#
survey_output.loc[
    (survey_output['mode_group'].isna()) &
    (survey_output['market_segment'] == 'employee') &
    (survey_output['main_mode'].isin([24,25])) # GET_IN_PARKED_VEHICLE_AND_DRIVE_ALONE, GET_IN_PARKED_VEHICLE_AND_DRIVE_WITH_OTHERS
    , 'mode_group'
] = 'parked_employee'

## Write Out Data

### Low Information

In [180]:
keep_cols = [
   'unique_id',
   # 'weight_departing_and_arriving',
   # 'weight_departing_only',
   'weight_departing_only_model_respondents',
   # 'weight_non_sas_departing_only',
   # 'weight_departing_only_with_time_of_day',

   # survey respondent demos
   'age',
   'age_label',
   'gender',
   'gender_label',
   # 'occupation',
   # 'occupation_label',
   'household_income',
   'household_income_label',
   'number_persons_in_household',
   'number_persons_in_household_label',
   # 'market_transformed', # constructed trip characteristic
   'market_segment', # WSP's constructed trip characteristic, w/ employee logic included
   'number_vehicles',
   'number_vehicles_label',
   'car_available',
   'car_available_label',

   'nights_away',
   'nights_visited',
   'number_of_nights',
   'number_of_nights_label',

   'shift_start_time',
   'shift_start_time_label',
   'shift_end_time',
   'shift_end_time_label',
   'number_commute_days',
   'number_commute_days_label',

   # survey trip characteristics
   'main_mode',
   'main_mode_label',
   'mode_group',
   'from_airport_transit_route_1',
   'from_airport_transit_route_1_other',
   'from_airport_transit_route_2',
   'from_airport_transit_route_2_other',
   'from_airport_transit_route_3',
   'from_airport_transit_route_3_other',
   'from_airport_transit_route_4',
   'from_airport_transit_route_4_other',
#  'access_mode_label',
#  'egress_mode_label',
   'trip_arrival_time',
   'trip_arrival_time_label',
   'airline',
   'airline_label',
   'airport_terminal',
   # 'convention_center',
   # 'convention_center_label',
   # 'convention_center_activity',
   # 'convention_center_activity_label',
   'party_size_access_transformed', # total size of party, including respondent. THIS IS A CATEGORICAL COLUMN, NOT NUMERIC
   'party_size_access_transformed_label', # total size of party, including respondent

   # 'parking_cost_total', # transformed parking cost
   'days_parked', # fxn of nights_away, makes assumptions about bins
   'onsite_parking_cost', # use parking_location to determine which parking location was used
   'employee_parking_cost',
   'offsite_parking_cost',
   'parking_location',
   'parking_location_label',
   'parking_location_other',
   'reimbursement',
   'reimbursement_label',
   'onsite_parking_additional_time',
   'offsite_parking_additional_time',

   # respondent TAZs
   'origin_taz',
   'origin_poi', #LJ or downtown zips
   'destination_taz',
   'home_location_taz',
   'transit_alighting_taz',
   'transit_boarding_taz',

   # trip data from 2022 base scenario skims
   'auto_dist', 'auto_time', 'auto_tollcost',
   'transit_type', 'transit_time',
       'transit_fare', 'transit_acc', 'transit_firstwait', 'transit_totalivtt',
       'transit_xferwait', 'transit_egr', 'transit_xfers',

   # TNC/taxi columns - mix of survey and constructed data
   'taxi_fhv_fare_numeric', # fare
   # 'taxi_fhv_fare',
   'taxi_fare',
   'tnc_fare',
   'taxi_fhv_wait_numeric', # wait
   # 'taxi_fhv_wait',
   'taxi_wait',
   'tnc_wait',

   'rental_additional_time', # additional time renting car adds to trip -- to get total travel time, add to auto_time
   'rental_cost',
   'shuttle_additional_time',
   ]

In [181]:
(
    survey_output
    [keep_cols]
    .rename(columns={'weight_departing_only_model_respondents': 'weight'})
    .to_csv(processed_survey_data_low_pii_simplified_path)#, index_label = 'unique_id')
)

### High Information

In [182]:
# drop high PII columns
drop_columns = set()
col_filters = ['latitude','longitude','geometry']#,'employer']
for col_filter in col_filters:
    drop_columns.update({col for col in survey_output.columns if col_filter in col})
drop_columns

{'destination_latitude',
 'destination_longitude',
 'geometry',
 'home_location_latitude',
 'home_location_longitude',
 'origin_latitude',
 'origin_longitude',
 'transit_alighting_latitude',
 'transit_alighting_longitude',
 'transit_boarding_latitude',
 'transit_boarding_longitude'}

In [183]:
# keep only label columns
for i in range(len(survey_output.columns)-1):
    col1 = survey_output.columns[i]
    col2 = survey_output.columns[i+1].replace('_label','')
    if col1 == col2 and col1 != 'trip_arrival_time':
        drop_columns.add(col1)
        # print(f'{col1}: {survey_output.columns[i+1]}')

In [184]:
drop_columns.update({
    'respondentid',
    'is_completed',
    'date_completed',
    'is_pilot',
    'is_self_administered',

    'gender_other', # empty
    'interview_location_label',
    'interview_location_other',
    'is_qualified_age',

    # 'main_mode_other', #mapped some options to modes
})

In [185]:
(
    survey_output
    .drop(columns=list(drop_columns))
    .to_csv(processed_survey_data_low_pii_path)#, index_label = 'unique_id')
)

In [186]:
(
    survey_output
    .to_csv(processed_survey_data_full_pii_simplified_path)#, index_label = 'unique_id')
)