In [1]:
import sys
import os
import importlib
sys.path.insert(0, os.path.abspath("../data_model/"))

In [2]:
import pandas as pd
import numpy as np
from pydantic import ValidationError
import data_model
import enums as e
from utils import extract_base_type, add_enum_label_columns, add_list_objects, add_synthetic_records, map_zones
import datetime

In [3]:
importlib.reload(data_model)
importlib.reload(e)
from data_model import Respondent, Employee, AirPassenger, Trip, DepartingPassengerResident, DepartingPassengerVisitor, ArrivingPassengerResident, ArrivingPassengerVisitor, DepartingAirPassenger, ArrivingAirPassenger, Resident, Visitor

In [4]:
external_dir = "../data/external"
interim_dir = "../data/interim"
processed_dir = "../data/processed"

# input_file1 = os.path.join(external_dir, "etc/od_20250226_sandag_airport_draftfinal.xlsx") # latest
# input_file2 = os.path.join(external_dir, "etc/od_20250226_sandag_airport_pilotdata.xlsx") #older version but records needed
# input_file3 = os.path.join(external_dir, "etc/od_20250131_sandag_airport_sas_draftinal.xlsx")
# input_file4 = os.path.join(external_dir, "etc/SP_rating_questions_data_022125.csv")

input_file1 = os.path.join(external_dir, "etc/od_20250314_sandag_airport_draftfinal.xlsx") # latest
input_file2 = os.path.join(external_dir, "etc/od_20250314_sandag_airport_pilotdata.xlsx") #older version but records needed
input_file3 = os.path.join(external_dir, "etc/od_20253014_sandag_airport_sas_draftinal.xlsx")
input_file4 = os.path.join(external_dir, "etc/ATC_airport_travel_survey_SP_data_03072025.xlsx")

variable_map_file = os.path.join(processed_dir, "revised_names.csv")
clean_survey_file = os.path.join(interim_dir, "survey_data_clean.csv")
output_csv_filename = os.path.join(processed_dir, "data_model_output.csv")
#summary_csv_filename = os.path.join(processed_dir, "data_model_output_summary.csv")

### Clean Data , Rename fields

In [5]:
in_df_complete1 = pd.read_excel(input_file1, sheet_name = 0)
in_df_incomplete1 = pd.read_excel(input_file1, sheet_name = 1)

in_df_complete2 = pd.read_excel(input_file2, sheet_name = 0)
in_df_incomplete2 = pd.read_excel(input_file2, sheet_name = 1)

in_df_complete3 = pd.read_excel(input_file3, sheet_name = 0)
in_df_incomplete3 = pd.read_excel(input_file3, sheet_name = 1)

#in_df_sp = pd.read_csv(input_file4, encoding = 'latin1')
in_df_sp = pd.read_excel(input_file4, sheet_name = 1)

in_df_complete2['is_self_administered'], in_df_incomplete2['is_self_administered'] = False, False
in_df_complete1['is_self_administered'], in_df_incomplete1['is_self_administered'] = False, False
in_df_complete3['is_self_administered'], in_df_incomplete3['is_self_administered'] = True, True


header_df = pd.read_csv(variable_map_file)[['ETC_name','WSP_name']]
header_dict = pd.Series(header_df.WSP_name.values,index=header_df.ETC_name).to_dict()

in_df_complete1 = in_df_complete1.rename(columns=header_dict).copy().drop(columns=["delete"])
in_df_complete2 = in_df_complete2.rename(columns=header_dict).copy().drop(columns=["delete"])
in_df_complete3 = in_df_complete3.rename(columns=header_dict).copy().drop(columns=["delete"])


in_df_incomplete1 = in_df_incomplete1.rename(columns=header_dict).copy().drop(columns=["delete"])
in_df_incomplete2 = in_df_incomplete2.rename(columns=header_dict).copy().drop(columns=["delete"])
in_df_incomplete3 = in_df_incomplete3.rename(columns=header_dict).copy().drop(columns=["delete"])
in_df_sp = in_df_sp.rename(columns=header_dict).copy()


in_df_complete = pd.concat([in_df_complete1, in_df_complete2, in_df_complete3], ignore_index = True)
in_df_incomplete = pd.concat([in_df_incomplete1, in_df_incomplete2, in_df_incomplete3], ignore_index = True)

in_df_complete['is_completed'] = 1
in_df_incomplete['is_completed'] = 0

in_df_complete['weight'] = 1
in_df_incomplete['weight'] = 0

#Concat incomplete and complete dataframes
clean_df = pd.concat([in_df_complete, in_df_incomplete], ignore_index = True)


  in_df_complete['is_completed'] = 1
  in_df_complete['weight'] = 1


In [6]:
clean_df['is_self_administered'].value_counts()

is_self_administered
False    8997
True      955
Name: count, dtype: int64

In [7]:
print("Complete Records: ", in_df_complete.shape)
print("Incomplete Records: ", in_df_incomplete.shape)

Complete Records:  (5378, 329)
Incomplete Records:  (4574, 15)


In [8]:
clean_df.shape

(9952, 330)

In [9]:
len(clean_df['respondentid'].unique())

5934

In [10]:
#Remove the duplicate respondentids
clean_df.drop_duplicates('respondentid', keep = 'first', inplace = True)
clean_df.shape

(5934, 330)

In [11]:
clean_df[clean_df['is_completed']==1].shape

(5378, 330)

In [12]:
#Remove duplicates from SP
in_df_sp = in_df_sp.drop_duplicates(subset=['respondentid'], keep='last')

In [13]:
#Merge SP
clean_df = clean_df.merge(in_df_sp, on="respondentid", how="left")

In [14]:
clean_df.shape

(5934, 396)

#### Add Zones Mapping

In [15]:
#PMSA
pmsa_zones_shapefile = "../data/external/geometry/pmsa_geoms/pmsa_geoms.shp"
clean_df['origin_pmsa'] = map_zones(clean_df, 'origin_latitude', 'origin_longitude', pmsa_zones_shapefile, 'pseudomsa', 99)
clean_df['destination_pmsa'] = map_zones(clean_df, 'destination_latitude', 'destination_longitude', pmsa_zones_shapefile, 'pseudomsa', 99)
clean_df['origin_pmsa'].value_counts(), clean_df['destination_pmsa'].value_counts()

(origin_pmsa
 2     1586
 3     1316
 1      723
 6      460
 99     449
 4      330
 5      311
 7      196
 8        7
 Name: count, dtype: int64,
 destination_pmsa
 2     5208
 3       50
 1       42
 99      40
 6       17
 5       13
 4        6
 7        2
 Name: count, dtype: int64)

In [16]:
#Municipal Zones
municipal_zones_shapefile = "../data/external/geometry/Municipal_Boundaries/Municipal_Boundaries.shp"
clean_df['origin_municipal_zone'] = map_zones(clean_df, 'origin_latitude', 'origin_longitude', municipal_zones_shapefile, 'name', 'EXTERNAL')
clean_df['destination_municipal_zone'] = map_zones(clean_df, 'destination_latitude', 'destination_longitude', municipal_zones_shapefile, 'name', 'EXTERNAL')
clean_df['origin_municipal_zone'].value_counts(), clean_df['destination_municipal_zone'].value_counts()

(origin_municipal_zone
 SAN DIEGO         3431
 EXTERNAL           449
 S.D. COUNTY        260
 CHULA VISTA        233
 CARLSBAD           168
 OCEANSIDE          141
 CORONADO           112
 ESCONDIDO           77
 ENCINITAS           74
 LA MESA             72
 EL CAJON            59
 NATIONAL CITY       57
 POWAY               47
 SAN MARCOS          42
 VISTA               38
 DEL MAR             30
 LEMON GROVE         27
 IMPERIAL BEACH      26
 SANTEE              21
 SOLANA BEACH        14
 Name: count, dtype: int64,
 destination_municipal_zone
 SAN DIEGO        5293
 EXTERNAL           40
 EL CAJON            7
 S.D. COUNTY         6
 CARLSBAD            6
 CHULA VISTA         5
 OCEANSIDE           5
 CORONADO            4
 NATIONAL CITY       3
 LA MESA             3
 ENCINITAS           3
 POWAY               2
 SOLANA BEACH        1
 Name: count, dtype: int64)

### Commonly occuring invalid values

### Making all modes consistent

In [17]:
clean_df['egress_mode_label'].value_counts()

egress_mode_label
Walk                                 30
Picked up by car by family/friend    15
Drive alone and park                  2
Uber/Lyft                             2
Taxi                                  2
Other shared van (please specify)     1
Name: count, dtype: int64

In [18]:
clean_df['other_airport_accessmode_label'].value_counts()

other_airport_accessmode_label
DROPPED OFF BY CAR BY FRIEND FAMILY               242
UBER LYFT                                         107
DROVE ALONE AND PARKED                             42
DROVE WITH OTHERS AND PARKED                       28
OTHER PUBLIC TRANSIT                               24
RENTAL CAR AND DROPPED IT OFF AT RENTAL AGENCY     22
TAXI                                               14
RENTAL CAR AND PARKED IT                           11
WHEELCHAIR OR OTHER MOBILITY DEVICE                11
WALK                                                9
Public transit                                      5
Uber / Lyft                                         5
CAR SERVICE BLACK CAR LIMO EXECUTIVE CAR            4
Rental car and dropped it off at rental agency      4
HOTEL SHUTTLE VAN                                   3
Drove alone and parked                              2
RODE WITH OTHER TRAVELER AND PARKED                 2
NON ELECTRIC BIKESHARE                             

In [19]:
other_airport_accessmode_label_map = {
    'Walk': 'Walk',
    'Wheelchair or other mobility device': 'Wheelchair or other mobility device',
    'ELECTRIC BIKESHARE': 'Bicycle: electric bikeshare',
    'NON ELECTRIC BIKESHARE': 'Bicycle: non-electric bikeshare',
    'E SCOOTER SHARE': 'E-scooter: shared',
    'PERSONAL ELECTRIC BICYCLE': 'Bicycle: personal electric bicycle',
    'PERSONAL NON ELECTRIC BICYCLE': 'Bicycle: personal non-electric bicycle',
    'PERSONAL E SCOOTER': 'E-scooter: personal',
    'Taxi': 'Taxi',
    'UBER LYFT': 'Uber/Lyft',
    'CAR SERVICE BLACK CAR LIMO EXECUTIVE CAR': 'Car service/black car/limo/executive car',
    'DROPPED OFF BY CAR BY FRIEND FAMILY': 'Dropped off by car by family/friend',
    'Drove alone and parked': 'Drove alone and parked',
    'Drove with others and parked': 'Drove with others and parked',
    'RODE WITH OTHER TRAVELER AND PARKED': 'Rode with other traveler(s) and parked',
    'Other public transit': 'Other public transit',
    'Chartered tour bus': 'Chartered tour bus',
    'Employee shuttle': 'Employee shuttle',
    'RENTAL CAR AND DROPPED IT OFF AT RENTAL AGENCY': 'Rental car: Dropped off at rental agency',
    'RENTAL CAR AND PARKED IT': 'Rental car: parked rental car',
    'Hotel shuttle van': 'Hotel shuttle van',
    'OTHER SHARED RIDE VAN SERVICE': 'Other shared van (please specify)',
    'Other': 'Other',
    'Refused/No Answer': 'Refused/No Answer'
}
clean_df['other_airport_accessmode_label'] = clean_df['other_airport_accessmode_label'].map(other_airport_accessmode_label_map)

In [20]:
clean_df['other_airport_accessmode_label'].value_counts()

other_airport_accessmode_label
Dropped off by car by family/friend         242
Uber/Lyft                                   107
Rental car: Dropped off at rental agency     22
Rental car: parked rental car                11
Car service/black car/limo/executive car      4
Rode with other traveler(s) and parked        2
Drove alone and parked                        2
Bicycle: non-electric bikeshare               2
Bicycle: personal non-electric bicycle        1
Other shared van (please specify)             1
Bicycle: electric bikeshare                   1
Taxi                                          1
Walk                                          1
Other                                         1
Name: count, dtype: int64

In [21]:
travel_mode_dict = {
    'Walk': 1,
    'Wheelchair or other mobility device': 2,
    'Bicycle: electric bikeshare': 3,
    'Bicycle: non-electric bikeshare': 4,
    'E-scooter: shared': 5,
    'Bicycle: personal electric bicycle': 6,
    'Bicycle: personal non-electric bicycle': 7,
    'E-scooter: personal': 8,
    'Taxi': 9,
    'Uber/Lyft': 10,
    'Car service/black car/limo/executive car': 11,
    'Dropped off by car by family/friend': 12,
    'Drove alone and parked': 13,
    'Drove with others and parked': 14,
    'MTS Route 992': 15,
    'Airport flyer shuttle': 16,
    'Chartered tour bus': 17,
    'Employee shuttle': 18,
    'Rental car: Dropped off at rental agency': 19,
    'Rental car: parked rental car': 20,
    'Hotel shuttle van': 21,
    'Other shared van (please specify)': 22,
    'Picked up by car by family/friend': 23,
    'Get in a parked vehicle and drive alone': 24,
    'Get in a parked vehicle and drive with others': 25,
    'Get in a parked vehicle and ride with other traveler(s)': 26,
    'Rental car: Picked up at rental agency': 27,
    'Rental car: get in a parked rental car': 28,
    'Rode with other traveler(s) and parked': 29,
    'Other public transit': 30,
    'Public Transit': 30,
    'Other': 98,
    'Refused/No Answer': 99,
    'None of the above': 98
}

### Modes to fix

In [22]:
mode_code_columns = ['main_transit_mode', 'main_mode', 'access_mode', 'egress_mode', 'reverse_mode', 'reverse_mode_predicted', 'other_airport_accessmode', 'reverse_commute_mode']
mode_label_columns = ['main_transit_mode_label', 'main_mode_label', 'access_mode_label', 'egress_mode_label', 'reverse_mode_label', 'reverse_mode_predicted_label', 'other_airport_accessmode_label', 'reverse_commute_mode_label']

In [23]:
#Remapping codes using label strings
travel_mode_dict = {k.lower(): v for k, v in travel_mode_dict.items()}
for mode_code_col, mode_label_col in zip(mode_code_columns, mode_label_columns):
    # Apply the mapping for each pair of columns
    clean_df[mode_code_col] = clean_df[mode_label_col].str.lower().map(travel_mode_dict)

In [24]:
clean_df['other_airport_accessmode_label'].value_counts()

other_airport_accessmode_label
Dropped off by car by family/friend         242
Uber/Lyft                                   107
Rental car: Dropped off at rental agency     22
Rental car: parked rental car                11
Car service/black car/limo/executive car      4
Rode with other traveler(s) and parked        2
Drove alone and parked                        2
Bicycle: non-electric bikeshare               2
Bicycle: personal non-electric bicycle        1
Other shared van (please specify)             1
Bicycle: electric bikeshare                   1
Taxi                                          1
Walk                                          1
Other                                         1
Name: count, dtype: int64

In [25]:
clean_df['other_airport_accessmode'].value_counts()

other_airport_accessmode
12.0    242
10.0    107
19.0     22
20.0     11
11.0      4
29.0      2
13.0      2
4.0       2
7.0       1
22.0      1
3.0       1
9.0       1
1.0       1
98.0      1
Name: count, dtype: int64

In [26]:
clean_df['main_transit_mode'].value_counts()

main_transit_mode
98.0    4919
16.0     249
15.0     209
99.0       1
Name: count, dtype: int64

### Pre-processing of some fields

In [27]:
clean_df['date_completed'] = pd.to_datetime(clean_df['date_completed'])
clean_df['is_pilot'] = np.where(clean_df['date_completed'].dt.date<=datetime.date(2024, 10, 3), 1, 0)
clean_df['record_type_synthetic'] = 0
clean_df.replace('-oth-', 98, inplace=True)
clean_df.replace('-', None, inplace = True )
clean_df['is_income_below_poverty'] = np.where(clean_df['is_income_below_poverty'] == 0, 2, clean_df['is_income_below_poverty'])
clean_df['household_income'] = np.where(clean_df['household_income']=='13B', 17, clean_df['household_income'] )

clean_df['stay_informed'] = np.where(clean_df['stay_informed'] == 0, 2, clean_df['stay_informed'])
#Maps
interview_location_map = {'Term1' : 1, 'Term2': 2, 'MTS_1_992': 3, 'SDA_1_FLYER': 4, 'ConracShuttle': 5, 'ParkingShuttle': 6, 'EmplParking': 7, '-oth-':98} 
inbound_outbound_map = {'IN':1, 'OUT':2}

#route_fields:
route_fields = ['to_airport_transit_route_1', 'to_airport_transit_route_2', 'to_airport_transit_route_3', 'to_airport_transit_route_4',
                'from_airport_transit_route_1', 'from_airport_transit_route_2', 'from_airport_transit_route_3', 'from_airport_transit_route_4']

#Replacement
clean_df['interview_location'] = clean_df['interview_location'].map(interview_location_map)
clean_df['inbound_or_outbound'] = clean_df['inbound_or_outbound'].map(inbound_outbound_map)
clean_df['main_mode'] = np.where(clean_df['main_transit_mode'].isin([15,16]), clean_df['main_transit_mode'], clean_df['main_mode'])

clean_df[route_fields] = clean_df[route_fields].replace(98, 'OTHER')
clean_df['nights_visited'] = clean_df['nights_visited'] - 1

clean_df['same_commute_mode'] = np.where(clean_df['same_commute_mode'] == 0, 2, clean_df['same_commute_mode'])
clean_df['resident_visitor_followup'] = np.where(clean_df['resident_visitor_followup'] == 0, 2, clean_df['resident_visitor_followup'])

#activity_type
clean_df['origin_activity_type'] = np.where(clean_df['inbound_or_outbound'] == e.InboundOutbound.OUTBOUND_FROM_AIRPORT, e.ActivityType.SAN_DIEGO_AIRPORT, clean_df['origin_activity_type'])
clean_df['destination_activity_type'] = np.where(clean_df['inbound_or_outbound'] == e.InboundOutbound.INBOUND_TO_AIRPORT, e.ActivityType.SAN_DIEGO_AIRPORT, clean_df['destination_activity_type'])

#For incomplete records:
clean_df['marketsegment'] = clean_df['marketsegment'].fillna(99)



  clean_df.replace('-oth-', 98, inplace=True)


### Fix main_mode to not take EMPLOYEE_SHUTTLE

In [28]:
clean_df['main_mode'] = np.where(
    clean_df['main_mode'] == e.TravelMode.EMPLOYEE_SHUTTLE,
    np.where(
        clean_df['marketsegment'] == e.Type.PASSENGER,
        e.TravelMode.OTHER,
        np.where(
            clean_df['marketsegment'] == e.Type.EMPLOYEE,
            clean_df['reverse_commute_mode'],
            clean_df['main_mode']  # fallback if neither condition is met
        )
    ),
    clean_df['main_mode']
)

### Re-assign some main_mode_others to main_mode categories

In [29]:
clean_df[['marketsegment_label', 'main_mode_other']].value_counts()

marketsegment_label                             main_mode_other                                           
Air passenger                                   Hospital shuttle                                              2
                                                Medical shuttle                                               2
                                                Refugee shuttle                                               2
Employee working at the airport                 Motorcycle                                                    2
Air passenger                                   Bus                                                           1
                                                Connecting flights                                            1
                                                Airplane                                                      1
                                                Medical                                                      

In [30]:
# Modes which are invalid, should make main_mode blank, and hence throw a critical validation error
# Some of the modes can stay as they are (i.e., OTHER) - Like, Refugee Shuttle, Hospital Shuttle, Medical Shuttle. 
# Others can be classified - for example 
# Mapping for reclassification
mode_mapping = {
    "Hospital shuttle": e.TravelMode.OTHER,
    "Medical shuttle": e.TravelMode.OTHER,
    "Refugee shuttle": e.TravelMode.OTHER,
    "Motorcycle": e.TravelMode.OTHER,
    "Bus": e.TravelMode.OTHER_PUBLIC_TRANSIT,
    "Connecting flights": None,
    "Airplane": None,
    "Flew in": None,
    "Medical": e.TravelMode.OTHER,
    "Personal car": e.TravelMode.DROVE_ALONE_AND_PARKED,
    "Paratransit": e.TravelMode.OTHER,
    "Shelter": None,
    "Stayed with family near airport and they drove me": e.TravelMode.DROPPED_OFF_BY_FAMILY_FRIEND,
    "Team bus": e.TravelMode.CHARTERED_TOUR_BUS,
    "Personal shuttle": e.TravelMode.OTHER_SHARED_VAN,
    "Turo": e.TravelMode.RENTAL_CAR_PICKED_UP,
    "Work": None,
    "Flight": None,
    "Flew": None,
    "Mts blue line": e.TravelMode.OTHER_PUBLIC_TRANSIT,
    "Route 10 and then Employee Shuttle": e.TravelMode.OTHER_PUBLIC_TRANSIT,
    "Telecommute Day but on a working day I use the hours below": None,
    "Work from home today": None
}

# Create a mapped column without modifying main_mode yet
# Create a mapped column
mapped_modes = clean_df["main_mode_other"].map(mode_mapping)

# Update main_mode where main_mode_other exists in mode_mapping (including None values)
clean_df.loc[clean_df["main_mode_other"].isin(mode_mapping.keys()), "main_mode"] = mapped_modes


In [31]:
mapped_modes

0      NaN
1      NaN
2      NaN
3      NaN
4      NaN
        ..
5929   NaN
5930   NaN
5931   NaN
5932   NaN
5933   NaN
Name: main_mode_other, Length: 5934, dtype: float64

### Create Grouped Modes
This section creates grouped modes for better readability and analysis. Particularly, it makes modes direction-agnostic.

In [32]:
travel_mode_to_grouped = {
    e.TravelMode.WALK: e.TravelModeGrouped.WALK,
    e.TravelMode.WHEELCHAIR_OR_MOBILITY_DEVICE: e.TravelModeGrouped.WHEELCHAIR_OR_OTHER_MOBILITY_DEVICE,
    e.TravelMode.BICYCLE_ELECTRIC_BIKESHARE: e.TravelModeGrouped.MICROMOBILITY_SHARED,
    e.TravelMode.BICYCLE_NON_ELECTRIC_BIKESHARE: e.TravelModeGrouped.MICROMOBILITY_SHARED,
    e.TravelMode.BICYCLE_PERSONAL_ELECTRIC: e.TravelModeGrouped.MICROMOBILITY_PERSONAL,
    e.TravelMode.BICYCLE_PERSONAL_NON_ELECTRIC: e.TravelModeGrouped.MICROMOBILITY_PERSONAL,
    e.TravelMode.E_SCOOTER_SHARED: e.TravelModeGrouped.MICROMOBILITY_SHARED,
    e.TravelMode.E_SCOOTER_PERSONAL: e.TravelModeGrouped.MICROMOBILITY_PERSONAL,
    e.TravelMode.TAXI: e.TravelModeGrouped.RIDEHAIL_TAXI,
    e.TravelMode.UBER_LYFT: e.TravelModeGrouped.RIDEHAIL_TAXI,
    e.TravelMode.CAR_SERVICE_BLACK_LIMO: e.TravelModeGrouped.RIDEHAIL_TAXI,
    e.TravelMode.MTS_ROUTE_992: e.TravelModeGrouped.BUS_992,
    e.TravelMode.AIRPORT_FLYER_SHUTTLE: e.TravelModeGrouped.AIRPORT_FLYER_SHUTTLE,
    e.TravelMode.OTHER_PUBLIC_TRANSIT: e.TravelModeGrouped.PUBLIC_TRANSPORTATION,
    e.TravelMode.DROPPED_OFF_BY_FAMILY_FRIEND: e.TravelModeGrouped.PERSONAL_CAR_DROPPED_OFF_PICKED_UP,
    e.TravelMode.PICKED_UP_BY_FAMILY_FRIEND: e.TravelModeGrouped.PERSONAL_CAR_DROPPED_OFF_PICKED_UP,
    e.TravelMode.DROVE_ALONE_AND_PARKED: e.TravelModeGrouped.PERSONAL_CAR_PARKED,
    e.TravelMode.DROVE_WITH_OTHERS_AND_PARKED: e.TravelModeGrouped.PERSONAL_CAR_PARKED,
    e.TravelMode.RODE_WITH_OTHER_TRAVELERS_AND_PARKED: e.TravelModeGrouped.PERSONAL_CAR_PARKED,
    e.TravelMode.GET_IN_PARKED_VEHICLE_AND_DRIVE_ALONE: e.TravelModeGrouped.PERSONAL_CAR_PARKED,
    e.TravelMode.GET_IN_PARKED_VEHICLE_AND_DRIVE_WITH_OTHERS: e.TravelModeGrouped.PERSONAL_CAR_PARKED,
    e.TravelMode.GET_IN_PARKED_VEHICLE_AND_RIDE_WITH_OTHER_TRAVELERS: e.TravelModeGrouped.PERSONAL_CAR_PARKED,
    e.TravelMode.RENTAL_CAR_DROPPED_OFF: e.TravelModeGrouped.RENTAL_CAR,
    e.TravelMode.RENTAL_CAR_PARKED: e.TravelModeGrouped.RENTAL_CAR,
    e.TravelMode.RENTAL_CAR_PICKED_UP: e.TravelModeGrouped.RENTAL_CAR,
    e.TravelMode.RENTAL_CAR_GET_IN_PARKED: e.TravelModeGrouped.RENTAL_CAR,
    e.TravelMode.HOTEL_SHUTTLE_VAN: e.TravelModeGrouped.SHARED_SHUTTLE_VAN,
    e.TravelMode.EMPLOYEE_SHUTTLE: e.TravelModeGrouped.SHARED_SHUTTLE_VAN,
    e.TravelMode.OTHER_SHARED_VAN: e.TravelModeGrouped.SHARED_SHUTTLE_VAN,
    e.TravelMode.CHARTERED_TOUR_BUS: e.TravelModeGrouped.OTHER,
    e.TravelMode.OTHER: e.TravelModeGrouped.OTHER,
    e.TravelMode.REFUSED_NO_ANSWER: e.TravelModeGrouped.REFUSED_NO_ANSWER,
}

In [33]:
mode_columns_to_remap = ['main_mode', 'access_mode', 'egress_mode', 'reverse_mode', 'reverse_mode_predicted', 'other_airport_accessmode', 'reverse_commute_mode']
for col in mode_columns_to_remap:
    clean_df[f'{col}_grouped'] = clean_df[col].map(travel_mode_to_grouped)
    print(f"Remapping Done for {col}")

Remapping Done for main_mode
Remapping Done for access_mode
Remapping Done for egress_mode
Remapping Done for reverse_mode
Remapping Done for reverse_mode_predicted
Remapping Done for other_airport_accessmode
Remapping Done for reverse_commute_mode


  clean_df[f'{col}_grouped'] = clean_df[col].map(travel_mode_to_grouped)
  clean_df[f'{col}_grouped'] = clean_df[col].map(travel_mode_to_grouped)
  clean_df[f'{col}_grouped'] = clean_df[col].map(travel_mode_to_grouped)
  clean_df[f'{col}_grouped'] = clean_df[col].map(travel_mode_to_grouped)
  clean_df[f'{col}_grouped'] = clean_df[col].map(travel_mode_to_grouped)
  clean_df[f'{col}_grouped'] = clean_df[col].map(travel_mode_to_grouped)
  clean_df[f'{col}_grouped'] = clean_df[col].map(travel_mode_to_grouped)


In [34]:
#### Consolidating multiple columns into one string column:
general_modes_used_visitor_mode_columns = [col for col in clean_df.columns if col.startswith("general_modes_used_visitor")]
alt_commute_mode_columns = [col for col in clean_df.columns if col.startswith("alt_commute_mode")]
sdia_accessmode_split_columns = [col for col in clean_df.columns if col.startswith("sdia_accessmode_split_")]
race_columns = [col for col in clean_df.columns if col.startswith("race_")]
reasons_no_transit_columns = [col for col in clean_df.columns if col.startswith("reasons_no_transit_")]
party_composition_columns = [col for col in clean_df.columns if col.startswith("party_includes_")]


# Create a new column with a comma-separated list of active modes
clean_df["general_modes_used_visitor_list"] = clean_df[general_modes_used_visitor_mode_columns].apply(lambda row: 
    ", ".join([col.replace("general_modes_used_visitor_", "").replace("_", " ") for col in general_modes_used_visitor_mode_columns if row[col]=='Yes']), 
    axis=1
)

clean_df["alt_commute_mode_list"] = clean_df[alt_commute_mode_columns].apply(lambda row: 
    ", ".join([col.replace("alt_commute_mode_", "").replace("_", " ") for col in alt_commute_mode_columns if row[col]=='Yes']), 
    axis=1
)

clean_df["sdia_accessmode_split_list"] = clean_df[sdia_accessmode_split_columns].apply(lambda row:
    ", ".join([col.replace("sdia_accessmode_split_", "").replace("_", " ") for col in sdia_accessmode_split_columns if row[col]=='Yes']), 
    axis=1
)

clean_df["race_list"] = clean_df[race_columns].apply(lambda row:
    ", ".join([col.replace("race_", "").replace("_", " ") for col in race_columns if row[col]=='Yes']), 
    axis=1
)

clean_df["reasons_no_transit_list"] = clean_df[reasons_no_transit_columns].apply(lambda row:
    ", ".join([col.replace("reasons_no_transit_", "").replace("_", " ") for col in reasons_no_transit_columns if row[col]=='Yes']), 
    axis=1
)

clean_df['party_composition_list'] = clean_df[party_composition_columns].apply(lambda row:
    ", ".join([col.replace("party_includes_", "").replace("_", " ") for col in party_composition_columns if row[col]=='Yes']), 
    axis=1
)

  clean_df["general_modes_used_visitor_list"] = clean_df[general_modes_used_visitor_mode_columns].apply(lambda row:
  clean_df["alt_commute_mode_list"] = clean_df[alt_commute_mode_columns].apply(lambda row:
  clean_df["sdia_accessmode_split_list"] = clean_df[sdia_accessmode_split_columns].apply(lambda row:
  clean_df["race_list"] = clean_df[race_columns].apply(lambda row:
  clean_df["reasons_no_transit_list"] = clean_df[reasons_no_transit_columns].apply(lambda row:
  clean_df['party_composition_list'] = clean_df[party_composition_columns].apply(lambda row:


In [35]:
ordered_transit_columns = [
    "from_airport_transit_route_1", "from_airport_transit_route_1_other",
    "from_airport_transit_route_2", "from_airport_transit_route_2_other",
    "from_airport_transit_route_3", "from_airport_transit_route_3_other",
    "from_airport_transit_route_4", "from_airport_transit_route_4_other",
    "to_airport_transit_route_1", "to_airport_transit_route_1_other",
    "to_airport_transit_route_2", "to_airport_transit_route_2_other",
    "to_airport_transit_route_3", "to_airport_transit_route_3_other",
    "to_airport_transit_route_4", "to_airport_transit_route_4_other"
]

# Ensure only valid columns (those that exist in the DataFrame)
valid_transit_columns = [col for col in ordered_transit_columns if col in clean_df.columns]

# Concatenate only non-null values while maintaining the correct order
clean_df["transit_routes_list"] = clean_df[valid_transit_columns].apply(
    lambda row: ", ".join(row.dropna().astype(str)), axis=1
)

# Compute the number of transfers (number of routes - 1), ensuring no negative values
clean_df["num_transit_transfers"] = clean_df["transit_routes_list"].apply(lambda x: max(len(x.split(", ")) - 1, 0) if x else 0)

  clean_df["transit_routes_list"] = clean_df[valid_transit_columns].apply(
  clean_df["num_transit_transfers"] = clean_df["transit_routes_list"].apply(lambda x: max(len(x.split(", ")) - 1, 0) if x else 0)


In [36]:
##Merge SP Survey fields:
sp_other_airport_columns = [col for col in clean_df.columns if col.startswith("sp_other_airport_")]


# Create a new column with a comma-separated list of active modes
clean_df["sp_other_airport_list"] = clean_df[sp_other_airport_columns].apply(lambda row: 
    ", ".join([col.replace("sp_other_airport_", "").replace("_", " ") for col in sp_other_airport_columns if row[col]==1]), 
    axis=1
)

  clean_df["sp_other_airport_list"] = clean_df[sp_other_airport_columns].apply(lambda row:


### Add Passenger Segment

In [37]:
# Add the `passenger_segment` column based on the updated logic
clean_df["passenger_segment"] = np.where(
    # Resident Arriving
    (clean_df["passenger_type"] == e.PassengerType.ARRIVING) & 
    ((clean_df["resident_visitor_general"] == e.ResidentVisitorGeneral.COMING_HOME) |
     (clean_df["resident_visitor_followup"] == e.ResidentVisitorFollowup.LIVE_OUTSIDE_REGION_TRAVELED_TO_AIRPORT)),
    e.PassengerSegment.RESIDENT_ARRIVING,  # Resident Arriving
    np.where(
        (clean_df["passenger_type"] == e.PassengerType.ARRIVING),
        e.PassengerSegment.VISITOR_ARRIVING,  # Visitor Arriving
        np.where(
            (clean_df["passenger_type"] == e.PassengerType.DEPARTING) & 
            ((clean_df["resident_visitor_general"] == e.ResidentVisitorGeneral.LEAVING_HOME) |
             (clean_df["resident_visitor_followup"] == e.ResidentVisitorFollowup.LIVE_OUTSIDE_REGION_TRAVELED_TO_AIRPORT)),
            e.PassengerSegment.RESIDENT_DEPARTING,  # Resident Departing
            np.where(
                # Visitor Departing
                (clean_df["passenger_type"] == e.PassengerType.DEPARTING),
                e.PassengerSegment.VISITOR_DEPARTING,  # Visitor Departing
                None  # Default case (if no conditions match)
            )
        )
    )
)

  clean_df["passenger_segment"] = np.where(


In [38]:
clean_df["resident_visitor_purpose"] = np.where(
    # Resident Business
    (clean_df["passenger_segment"].isin([e.PassengerSegment.RESIDENT_ARRIVING, e.PassengerSegment.RESIDENT_DEPARTING])) & 
    (clean_df["flight_purpose"].isin([e.FlightPurpose.BUSINESS_WORK, e.FlightPurpose.COMBINATION_BUSINESS_LEISURE])),
    e.ResidentVisitorPurpose.RESIDENT_BUSINESS,
    
    np.where(
        # Resident Non-Business
        clean_df["passenger_segment"].isin([e.PassengerSegment.RESIDENT_ARRIVING, e.PassengerSegment.RESIDENT_DEPARTING]),
        e.ResidentVisitorPurpose.RESIDENT_NON_BUSINESS,
        
        np.where(
            # Visitor Business
            (clean_df["flight_purpose"].isin([e.FlightPurpose.BUSINESS_WORK, e.FlightPurpose.COMBINATION_BUSINESS_LEISURE])),
            e.ResidentVisitorPurpose.VISITOR_BUSINESS,
            
            # Visitor Non-Business (default case)
            e.ResidentVisitorPurpose.VISITOR_NON_BUSINESS
        )
    )
)

  clean_df["resident_visitor_purpose"] = np.where(


In [39]:
## Explicit Visitor Check
clean_df["qualified_visitor"] = np.where(
    # Arriving and visiting or neither, and does not live in the same region traveled
    (clean_df["passenger_type"] == e.PassengerType.ARRIVING) & 
    ((clean_df["resident_visitor_general"] == e.ResidentVisitorGeneral.VISITING) | 
     (clean_df["resident_visitor_general"] == e.ResidentVisitorGeneral.NEITHER)) &
    (clean_df["resident_visitor_followup"] != e.ResidentVisitorFollowup.LIVE_OUTSIDE_REGION_TRAVELED_TO_AIRPORT),
    1,  # Qualified visitor
    np.where(
        # Departing and going home or neither, and does not live in the same region traveled
        (clean_df["passenger_type"] == e.PassengerType.DEPARTING) &
        ((clean_df["resident_visitor_general"] == e.ResidentVisitorGeneral.GOING_HOME) | 
         (clean_df["resident_visitor_general"] == e.ResidentVisitorGeneral.NEITHER)) &
        (clean_df["resident_visitor_followup"] != e.ResidentVisitorFollowup.LIVE_OUTSIDE_REGION_TRAVELED_TO_AIRPORT),
        1,  # Qualified visitor
        0  # Not a qualified visitor
    )
)

  clean_df["qualified_visitor"] = np.where(


### Add some new consolidated variables, Combine some variables to exclude directionality

In [40]:
## New changes:
clean_df['number_of_nights'] = clean_df['nights_away'].fillna(clean_df['nights_visited'])

# Set is_sdia_home_airport to 1 for resident arriving or departing passengers
clean_df["is_sdia_home_airport"] = np.where(
    clean_df["passenger_segment"].isin([e.PassengerSegment.RESIDENT_ARRIVING, e.PassengerSegment.RESIDENT_DEPARTING]), 
    1, 
    0
)
### Combining reverse_mode, as reverse_mode_combined - 
clean_df['reverse_mode_combined'] = clean_df['reverse_mode_grouped'].combine_first(clean_df['reverse_mode_predicted_grouped'])
clean_df['reverse_mode_combined_other'] = clean_df['reverse_mode_predicted_other']

## party size
clean_df["party_size_ground_access"] = np.where(
    clean_df["party_size_ground_access_same"] == "Yes", 
    clean_df["party_size_flight"], 
    clean_df["party_size_ground_access"]
)

  clean_df['number_of_nights'] = clean_df['nights_away'].fillna(clean_df['nights_visited'])
  clean_df["is_sdia_home_airport"] = np.where(
  clean_df['reverse_mode_combined'] = clean_df['reverse_mode_grouped'].combine_first(clean_df['reverse_mode_predicted_grouped'])
  clean_df['reverse_mode_combined_other'] = clean_df['reverse_mode_predicted_other']


Remove origin, destination coordinates, when the location does not make sense

In [41]:
# For origin columns: if origin_state not in ['CA', 'BC'] OR origin_city is not 'Yuma'
mask_origin = ((clean_df['origin_state'].isin(['CA', 'BC'])) | (clean_df['origin_city'] == 'Yuma'))
clean_df.loc[~mask_origin, ['origin_latitude', 'origin_longitude']] = np.nan

# For destination columns: if destination_state not in ['CA', 'BC'] OR destination_city is not 'Yuma'
mask_destination = ((clean_df['destination_state'].isin(['CA', 'BC'])) | (clean_df['destination_city'] == 'Yuma'))
clean_df.loc[~mask_destination, ['destination_latitude', 'destination_longitude']] = np.nan

### Populate Home Location fields when it is not explicitly asked

In [42]:
# Create masks for home activity types
mask_origin_home = clean_df['origin_activity_type'] == e.ActivityType.HOME
mask_destination_home = clean_df['destination_activity_type'] == e.ActivityType.HOME

# For rows where origin is home, copy the origin fields to home_location_ fields
clean_df.loc[mask_origin_home, 'home_location_city'] = clean_df.loc[mask_origin_home, 'origin_city']
clean_df.loc[mask_origin_home, 'home_location_state'] = clean_df.loc[mask_origin_home, 'origin_state']
clean_df.loc[mask_origin_home, 'home_location_zip'] = clean_df.loc[mask_origin_home, 'origin_zip']
clean_df.loc[mask_origin_home, 'home_location_latitude'] = clean_df.loc[mask_origin_home, 'origin_latitude']
clean_df.loc[mask_origin_home, 'home_location_longitude'] = clean_df.loc[mask_origin_home, 'origin_longitude']
clean_df.loc[mask_origin_home, 'home_location_municipal_zone'] = clean_df.loc[mask_origin_home, 'origin_municipal_zone']
clean_df.loc[mask_origin_home, 'home_location_pmsa'] = clean_df.loc[mask_origin_home, 'origin_pmsa']

# For rows where destination is home, copy the destination fields to home_location_ fields
clean_df.loc[mask_destination_home, 'home_location_city'] = clean_df.loc[mask_destination_home, 'destination_city']
clean_df.loc[mask_destination_home, 'home_location_state'] = clean_df.loc[mask_destination_home, 'destination_state']
clean_df.loc[mask_destination_home, 'home_location_zip'] = clean_df.loc[mask_destination_home, 'destination_zip']
clean_df.loc[mask_destination_home, 'home_location_latitude'] = clean_df.loc[mask_destination_home, 'destination_latitude']
clean_df.loc[mask_destination_home, 'home_location_longitude'] = clean_df.loc[mask_destination_home, 'destination_longitude']
clean_df.loc[mask_origin_home, 'home_location_municipal_zone'] = clean_df.loc[mask_origin_home, 'destination_municipal_zone']
clean_df.loc[mask_origin_home, 'home_location_pmsa'] = clean_df.loc[mask_origin_home, 'destination_pmsa']


  clean_df.loc[mask_origin_home, 'home_location_municipal_zone'] = clean_df.loc[mask_origin_home, 'origin_municipal_zone']
  clean_df.loc[mask_origin_home, 'home_location_pmsa'] = clean_df.loc[mask_origin_home, 'origin_pmsa']


#### Fix transit_boarding and alighting coordinates

In [43]:
clean_df['transit_boarding_latitude'] = (
    clean_df['transit_board_1_lat']
    .combine_first(clean_df['stop_on_latitude'])
)

clean_df['transit_boarding_longitude'] = (
    clean_df['transit_board_1_long']
    .combine_first(clean_df['stop_on_longitude'])
)

clean_df['transit_alighting_latitude'] = (
    clean_df['transit_alight_4_lat']
    .combine_first(clean_df['transit_alight_3_lat'])
    .combine_first(clean_df['transit_alight_2_lat'])
    .combine_first(clean_df['transit_alight_1_lat'])
    .combine_first(clean_df['stop_off_latitude'])
)

clean_df['transit_alighting_longitude'] = (
    clean_df['transit_alight_4_long']
    .combine_first(clean_df['transit_alight_3_long'])
    .combine_first(clean_df['transit_alight_2_long'])
    .combine_first(clean_df['transit_alight_1_long'])
    .combine_first(clean_df['stop_off_longitude'])
)

  clean_df['transit_boarding_latitude'] = (
  clean_df['transit_boarding_longitude'] = (
  clean_df['transit_alighting_latitude'] = (
  clean_df['transit_alighting_longitude'] = (


In [44]:
clean_df.to_csv(clean_survey_file, index = False)

### Select Variables to verify for the survey

In [45]:
respondent_variables = [field_name for field_name, field_info in Respondent.__fields__.items()]


trip_variables = [field_name for field_name, field_info in Trip.__fields__.items()]
trip_variables.append('respondentid')
trip_variables.remove('valid_record')
trip_variables.remove('validation_error')
trip_variables.remove('validation_severity')
trip_variables.remove('validation_num_errors')

employee_variables = [field_name for field_name, field_info in Employee.__fields__.items()]
employee_variables.remove('trip')

#air_passenger_variables = [field_name for field_name, field_info in AirPassenger.__fields__.items()]
#air_passenger_variables.remove('trip')

air_passenger_departing_resident_variables = [field_name for field_name, field_info in DepartingPassengerResident.__fields__.items()] 
air_passenger_departing_visitor_variables = [field_name for field_name, field_info in DepartingPassengerVisitor.__fields__.items()]
air_passenger_arriving_resident_variables = [field_name for field_name, field_info in ArrivingPassengerResident.__fields__.items()]
air_passenger_arriving_visitor_variables = [field_name for field_name, field_info in ArrivingPassengerVisitor.__fields__.items()]
air_passenger_variables = list(set(air_passenger_departing_resident_variables + air_passenger_departing_visitor_variables + air_passenger_arriving_resident_variables +air_passenger_arriving_visitor_variables))

variables_to_verify = list(set(air_passenger_variables + respondent_variables + trip_variables + employee_variables))
variables_to_verify.remove('trip')
variables_to_verify.remove('valid_record')
variables_to_verify.remove('validation_error')
variables_to_verify.remove('validation_severity')
variables_to_verify.remove('validation_num_errors')

working_df = clean_df.copy()
working_df = working_df[variables_to_verify].copy()
working_df = working_df.loc[working_df['marketsegment'].notna()].copy()

In [46]:
working_df.shape

(5934, 319)

### Serialize the data

In [47]:
trips_df = working_df[trip_variables].copy()
persons_df = working_df[list[set(employee_variables + respondent_variables + air_passenger_variables)]].copy()

In [48]:
# combined
respondent_list = add_list_objects(
        trips_df.to_dict(orient="records"),  #child list
        "respondentid", # child key
        persons_df.to_dict(orient="records"), # parent list
        "respondentid", # parent key
        "trip", # parent var
    )

In [49]:
len(respondent_list)

5934

In [50]:
# employee_list = []
# air_passenger_list = []
# other_list = []
# failed_records = []

# for respondent in respondent_list:
#     market_segment = respondent["marketsegment"]
#     try:
#         if market_segment == e.Type.EMPLOYEE:
#             ev = Employee(** respondent)
#             employee_list.append(ev)
#         elif market_segment == e.Type.PASSENGER:
#              av = AirPassenger(** respondent)
#              air_passenger_list.append(av)
#         else:
#             rv = Respondent(** respondent)
#             other_list.append(rv)
#     except ValidationError as err:
#             respondent['error_flag'] = 'failed'
#             respondent['error_message'] = str(err)
#             failed_records.append(respondent) 


# failed_df = pd.DataFrame(failed_records)
# failed_df.head()

In [51]:
employee_list = []
arriving_air_passenger_resident_list = []
arriving_air_passenger_visitor_list = []
departing_air_passenger_resident_list = []
departing_air_passenger_visitor_list = []
other_list = []
failed_records = []

for respondent in respondent_list:
     market_segment = respondent["marketsegment"]
     try:
        if market_segment == e.Type.EMPLOYEE:
            ev = Employee(** respondent)
            employee_list.append(ev)
        elif market_segment == e.Type.PASSENGER:
             passenger_segment= respondent["passenger_segment"]
             if passenger_segment == e.PassengerSegment.RESIDENT_ARRIVING:
                    apr = ArrivingPassengerResident(** respondent)
                    arriving_air_passenger_resident_list.append(apr)
             elif passenger_segment == e.PassengerSegment.VISITOR_ARRIVING:
                    apv = ArrivingPassengerVisitor(** respondent)
                    arriving_air_passenger_visitor_list.append(apv)
             elif passenger_segment == e.PassengerSegment.RESIDENT_DEPARTING:
                    dpr = DepartingPassengerResident(** respondent)
                    departing_air_passenger_resident_list.append(dpr)
             elif passenger_segment == e.PassengerSegment.VISITOR_DEPARTING:
                    dpv = DepartingPassengerVisitor(** respondent)
                    departing_air_passenger_visitor_list.append(dpv)
             else:
                    rv = Respondent(** respondent)
                    other_list.append(rv)

        else:
            rv = Respondent(** respondent)
            other_list.append(rv)
            
     except ValidationError as err:
            respondent['error_flag'] = 'failed'
            respondent['error_message'] = str(err)
            failed_records.append(respondent) 


failed_df = pd.DataFrame(failed_records)
failed_df.head()

  Expected `float` but got `str` - serialized value may not be as expected
  return self.__pydantic_serializer__.to_python(
  Expected `str` but got `list` - serialized value may not be as expected
  Expected `float` but got `str` - serialized value may not be as expected
  Expected `enum` but got `Terminal` - serialized value may not be as expected
  return self.__pydantic_serializer__.to_python(
  Expected `str` but got `list` - serialized value may not be as expected
  Expected `enum` but got `Terminal` - serialized value may not be as expected
  return self.__pydantic_serializer__.to_python(
  Expected `str` but got `list` - serialized value may not be as expected
  return self.__pydantic_serializer__.to_python(


Critical check failed for variable: reverse_mode_predicted
Critical check failed for variable: reverse_mode_predicted
Critical check failed for variable: home_location_latitude
Critical check failed for variable: home_location_longitude
Critical check failed for variable: reverse_mode_predicted
Critical check failed for variable: general_use_transit_visitor_home
Critical check failed for variable: general_use_transit_visitor_home
Critical check failed for variable: reverse_mode_predicted
Critical check failed for variable: home_location_latitude
Critical check failed for variable: home_location_longitude
Critical check failed for variable: reverse_mode_predicted
Critical check failed for variable: home_location_latitude
Critical check failed for variable: home_location_longitude
Critical check failed for variable: reverse_mode_predicted
Critical check failed for variable: home_location_latitude
Critical check failed for variable: home_location_longitude
Critical check failed for variab

  Expected `float` but got `str` - serialized value may not be as expected
  Expected `float` but got `str` - serialized value may not be as expected
  return self.__pydantic_serializer__.to_python(
  Expected `str` but got `list` - serialized value may not be as expected
  Expected `float` but got `str` - serialized value may not be as expected
  Expected `float` but got `str` - serialized value may not be as expected
  Expected `enum` but got `Terminal` - serialized value may not be as expected
  return self.__pydantic_serializer__.to_python(


Critical check failed for variable: reverse_mode
Critical check failed for variable: reverse_mode
Critical check failed for variable: reverse_mode
Critical check failed for variable: reverse_mode
Critical check failed for variable: general_use_transit_visitor_home
Critical check failed for variable: general_use_transit_visitor_home
Critical check failed for variable: nights_visited
Critical check failed for variable: reverse_mode
Critical check failed for variable: reverse_mode
Critical check failed for variable: reverse_mode
Critical check failed for variable: general_use_transit_visitor_home
Critical check failed for variable: general_use_transit_visitor_home
Critical check failed for variable: nights_visited
Critical check failed for variable: reverse_mode
Critical check failed for variable: reverse_mode
Critical check failed for variable: reverse_mode
Critical check failed for variable: general_use_transit_visitor_home
Critical check failed for variable: general_use_transit_visitor

Unnamed: 0,from_airport_transit_route_1,non_sdia_flight_frequency,nights_away,transit_boarding_latitude,sdia_accessmode_split_drove_alone_and_parked,commute_mode_decision_other,sp_connection_to_old_town_center,alt_commute_mode_airport_flyer_shuttle,origin_municipal_zone,race_aian,...,from_airport_transit_route_3_other,race_unknown,origin_latitude,sp_other_airport_enough_room_to_stand,destination_state,egress_mode,sp_other_airport_did_not_like,trip,error_flag,error_message
0,,2.0,5.0,,No,,1.0,,SAN DIEGO,No,...,,No,32.733201,,,,,"{'inbound_or_outbound': 1.0, 'origin_activity_...",failed,1 validation error for DepartingPassengerResid...
1,,4.0,5.0,,No,,,,SAN DIEGO,No,...,,No,32.715116,,,,,"{'inbound_or_outbound': 1.0, 'origin_activity_...",failed,1 validation error for DepartingPassengerResid...
2,,0.0,,,No,,,,SAN DIEGO,Yes,...,,No,,,CA,,,"{'inbound_or_outbound': 2.0, 'origin_activity_...",failed,2 validation errors for ArrivingPassengerVisit...
3,,2.0,,,No,,,,SAN DIEGO,No,...,,No,,,CA,,,"{'inbound_or_outbound': 2.0, 'origin_activity_...",failed,1 validation error for ArrivingPassengerVisito...


In [52]:
failed_df.shape

(4, 322)

In [53]:
#failed_df['error_message'].unique()

In [54]:
#failed_df['is_completed'].value_counts()

In [55]:
#failed_df['error_message'][0]

In [56]:
failed_df.to_csv('../data/processed/failed_records.csv', index = False)

In [57]:
len(failed_df)

4

In [58]:
print("Arriving Air Passengers Residents:", len(arriving_air_passenger_resident_list))
print("Arriving Air Passengers Visitors:", len(arriving_air_passenger_visitor_list))
print("Departing Air Passengers Residents:", len(departing_air_passenger_resident_list))
print("Departing Air Passengers Visitors:", len(departing_air_passenger_visitor_list))
print("Total Air Passengers:", len(arriving_air_passenger_resident_list) + len(arriving_air_passenger_visitor_list) + len(departing_air_passenger_resident_list) + len(departing_air_passenger_visitor_list))

Arriving Air Passengers Residents: 73
Arriving Air Passengers Visitors: 189
Departing Air Passengers Residents: 1793
Departing Air Passengers Visitors: 2804
Total Air Passengers: 4859


### Make Data

In [59]:
employee_df = pd.DataFrame([Employee.model_dump() for Employee in employee_list])       

  Expected `str` but got `list` - serialized value may not be as expected
  Expected `str` but got `list` - serialized value may not be as expected
  return self.__pydantic_serializer__.to_python(


In [60]:
#passenger_df = pd.DataFrame([AirPassenger.model_dump() for AirPassenger in air_passenger_list])

In [61]:
arriving_passenger_resident_df = pd.DataFrame([ArrivingPassengerResident.model_dump() for ArrivingPassengerResident in arriving_air_passenger_resident_list])
arriving_passenger_visitor_df = pd.DataFrame([ArrivingPassengerVisitor.model_dump() for ArrivingPassengerVisitor in arriving_air_passenger_visitor_list])
departing_passenger_resident_df = pd.DataFrame([DepartingPassengerResident.model_dump() for DepartingPassengerResident in departing_air_passenger_resident_list])
departing_passenger_visitor_df = pd.DataFrame([DepartingPassengerVisitor.model_dump() for DepartingPassengerVisitor in departing_air_passenger_visitor_list])

  Expected `str` but got `list` - serialized value may not be as expected
  Expected `str` but got `list` - serialized value may not be as expected
  Expected `enum` but got `Terminal` - serialized value may not be as expected
  return self.__pydantic_serializer__.to_python(
  Expected `str` but got `list` - serialized value may not be as expected
  Expected `str` but got `list` - serialized value may not be as expected
  Expected `float` but got `str` - serialized value may not be as expected
  Expected `enum` but got `Terminal` - serialized value may not be as expected
  return self.__pydantic_serializer__.to_python(
  Expected `str` but got `list` - serialized value may not be as expected
  Expected `str` but got `list` - serialized value may not be as expected
  Expected `float` but got `str` - serialized value may not be as expected
  Expected `float` but got `str` - serialized value may not be as expected
  Expected `enum` but got `Terminal` - serialized value may not be as expec

In [62]:
other_df = pd.DataFrame([Respondent.model_dump() for Respondent in other_list])
# other_df = add_enum_label_columns(other_df, Respondent)

  Expected `str` but got `list` - serialized value may not be as expected
  return self.__pydantic_serializer__.to_python(


In [63]:
len(other_list)

264

In [64]:
trip_list = []
id_list = []
for record in employee_list + arriving_air_passenger_resident_list + arriving_air_passenger_visitor_list + departing_air_passenger_resident_list + departing_air_passenger_visitor_list  + other_list:
    trip_list.append(record.trip)
    id_list.append(record.respondentid)

trip_df = pd.DataFrame([Trip.model_dump() for Trip in trip_list])
id_df = pd.DataFrame(id_list, columns=["respondentid"])

trip_df = pd.concat([id_df, trip_df], axis=1)
trip_df = add_enum_label_columns(trip_df,Trip)


  Expected `str` but got `list` - serialized value may not be as expected
  Expected `float` but got `str` - serialized value may not be as expected
  return self.__pydantic_serializer__.to_python(
  Expected `str` but got `list` - serialized value may not be as expected
  Expected `float` but got `str` - serialized value may not be as expected
  Expected `float` but got `str` - serialized value may not be as expected
  return self.__pydantic_serializer__.to_python(


In [65]:
output_df = pd.concat([employee_df, arriving_passenger_resident_df, arriving_passenger_visitor_df, departing_passenger_resident_df, departing_passenger_visitor_df , other_df], axis=0).reset_index(drop=True).drop(columns=["trip"])
output_df = pd.merge(output_df, trip_df, on="respondentid", how="left", suffixes = ('_person', '_trip'))

  output_df = pd.concat([employee_df, arriving_passenger_resident_df, arriving_passenger_visitor_df, departing_passenger_resident_df, departing_passenger_visitor_df , other_df], axis=0).reset_index(drop=True).drop(columns=["trip"])


In [66]:
output_df.shape

(5930, 353)

In [67]:
output_df = add_synthetic_records(output_df)

  combined_df = pd.concat([df, synthetic_df], ignore_index=True)


In [68]:
output_df.head()

Unnamed: 0,valid_record_person,validation_error_person,validation_severity_person,validation_num_errors_person,is_completed,is_self_administered,respondentid,is_pilot,record_type_synthetic,date_completed,...,main_mode_grouped_label,number_transit_vehicles_to_airport_label,access_mode_label,access_mode_grouped_label,parking_location_label,parking_cost_frequency_label,reimbursement_label,number_transit_vehicles_from_airport_label,egress_mode_label,egress_mode_grouped_label
0,True,[],,0,True,False,5473,False,0,2024-10-04,...,PERSONAL_CAR_PARKED,,,,EMPLOYEE_LOT_3665_ADMIRAL_BOLAND_WAY,MONTHLY,NOT_REIMBURSED,,,
1,True,[],,0,True,False,5476,False,0,2024-10-04,...,BUS_992,ONE,WALK,WALK,,,NOT_REIMBURSED,,,
2,True,[],,0,True,False,5489,False,0,2024-10-04,...,PERSONAL_CAR_PARKED,,,,EMPLOYEE_LOT_3665_ADMIRAL_BOLAND_WAY,OTHER_SPECIFY,NOT_REIMBURSED,,,
3,True,[],,0,True,False,5558,False,0,2024-10-04,...,BUS_992,NONE,WALK,WALK,,,NOT_REIMBURSED,,,
4,True,[],,0,True,False,5593,False,0,2024-10-04,...,BUS_992,ONE,WALK,WALK,,,REIMBURSED_EMPLOYER_CLIENT,,,


In [69]:
output_df =  add_enum_label_columns(output_df, Respondent)

output_df = add_enum_label_columns(output_df, AirPassenger)
output_df = add_enum_label_columns(output_df, DepartingAirPassenger)
output_df =  add_enum_label_columns(output_df, ArrivingAirPassenger)
output_df =  add_enum_label_columns(output_df, Resident)
output_df =  add_enum_label_columns(output_df, Visitor)

output_df = add_enum_label_columns(output_df, DepartingPassengerResident)
output_df =  add_enum_label_columns(output_df, ArrivingPassengerResident)

output_df = add_enum_label_columns(output_df, DepartingPassengerVisitor)
output_df =  add_enum_label_columns(output_df, ArrivingPassengerVisitor)

output_df =  add_enum_label_columns(output_df, Trip)
output_df =  add_enum_label_columns(output_df, Employee)

  df[enum_name_col] = df[field].map(enum_names).astype(str)
  df[enum_name_col] = df[field].map(enum_names).astype(str)
  df[enum_name_col] = df[field].map(enum_names).astype(str)
  df[enum_name_col] = df[field].map(enum_names).astype(str)
  df[enum_name_col] = df[field].map(enum_names).astype(str)
  df[enum_name_col] = df[field].map(enum_names).astype(str)
  df[enum_name_col] = df[field].map(enum_names).astype(str)
  df[enum_name_col] = df[field].map(enum_names).astype(str)
  df[enum_name_col] = df[field].map(enum_names).astype(str)
  df[enum_name_col] = df[field].map(enum_names).astype(str)
  df[enum_name_col] = df[field].map(enum_names).astype(str)
  df[enum_name_col] = df[field].map(enum_names).astype(str)
  df[enum_name_col] = df[field].map(enum_names).astype(str)
  df[enum_name_col] = df[field].map(enum_names).astype(str)
  df[enum_name_col] = df[field].map(enum_names).astype(str)
  df[enum_name_col] = df[field].map(enum_names).astype(str)
  df[enum_name_col] = df[field].map(enum

In [70]:
# Define the list of important columns
important_columns = ['respondentid', 'is_completed', 'date_completed','time_completed', 'is_pilot', 'is_self_administered', 'record_type_synthetic']

# Separate important columns and the rest of the columns
remaining_columns = [col for col in output_df.columns if col not in important_columns]

# Create the new column order
new_column_order = important_columns + sorted(remaining_columns)

# Reorder the DataFrame
output_df = output_df[new_column_order]

In [71]:
output_df.index = output_df.index + 1
output_df.to_csv(output_csv_filename, index_label = 'unique_id')