In [1]:
import sys
import os
import importlib
sys.path.insert(0, os.path.abspath("../data_model/"))

In [2]:
import pandas as pd
import numpy as np
from pydantic import ValidationError
import data_model
import enums as e
from utils import extract_base_type, add_enum_label_columns, add_list_objects  

In [3]:
importlib.reload(data_model)
importlib.reload(e)
from data_model import Respondent, Employee, AirPassenger, Trip

In [4]:
external_dir = "../data/external"
interim_dir = "../data/interim"
processed_dir = "../data/processed"

input_file = os.path.join(external_dir, "etc/od_20241004_sandag_airport_pilot_2.xlsx") #pilot survey 2, latest
variable_map_file = os.path.join(processed_dir, "revised_names.csv")
clean_survey_file = os.path.join(interim_dir, "survey_data_1004.csv")
output_csv_filename = os.path.join(processed_dir, "data_model_output.csv")
#summary_csv_filename = os.path.join(processed_dir, "data_model_output_summary.csv")

### Clean Data , Rename fields

In [5]:
in_df = pd.read_excel(input_file)
header_df = pd.read_csv(variable_map_file)[['ETC_name','WSP_name']]
header_dict = pd.Series(header_df.WSP_name.values,index=header_df.ETC_name).to_dict()
clean_df = in_df.rename(columns=header_dict).copy().drop(columns=["delete"])

In [6]:
clean_df.to_csv(clean_survey_file, index = False)

### Select Variables to verify for the survey

In [10]:
respondent_variables = [field_name for field_name, field_info in Respondent.__fields__.items()]
respondent_variables.remove('trip')

trip_variables = [field_name for field_name, field_info in Trip.__fields__.items()]

employee_variables = [field_name for field_name, field_info in Employee.__fields__.items()]
employee_variables.remove('trip')

air_passenger_variables = [field_name for field_name, field_info in AirPassenger.__fields__.items()]
air_passenger_variables.remove('trip')

variables_to_verify = list(set(air_passenger_variables + respondent_variables + trip_variables + employee_variables))

working_df = clean_df.copy()
working_df = working_df[variables_to_verify].copy()
working_df = working_df.loc[working_df['marketsegment'].notna()].copy()
working_df.head()

Unnamed: 0,home_location_longitude,origin_state,occupation,alt_commute_mode_e_scooter_personal,race_middle_eastern,general_modes_used_visitor_coaster,general_modes_used_visitor_e_scooter_shared,sdia_accessmode_split_rode_with_other_travelers_and_parked,reverse_mode_predicted_other,transit_alighting_latitude,...,gender,parking_location,number_vehicles,shift_start_airport_building,sdia_accessmode_split_car_black,general_modes_used_visitor_chartered_tour_bus,transit_alighting_stop_name,reasons_no_transit_dislike_transit,origin_city,party_includes_coworker
0,,,,,No,,,,,-,...,2,,3,,,,-,No,,
1,,,,,No,,,,,-,...,2,,3,,,,-,No,,
2,,Baja California,,,No,No,No,No,,32.732,...,1,,1,,No,No,Terminal 1 Rental Car Shuttle,,Mexicali,
3,,Baja California,,,No,No,No,No,,32.732,...,1,,1,,No,No,Terminal 1 Rental Car Shuttle,,Mexicali,
4,,CA,,,No,No,No,,,-,...,1,,2,,,No,-,No,Encinitas,


### Serialize the data

In [11]:
trips_df = working_df[trip_variables].copy()
persons_df = working_df[employee_variables + respondent_variables + air_passenger_variables].copy()