In [None]:
import pandas as pd
import numpy as np
import os

In [None]:
#Import Data
# Root of the shared project folder. The default below is one user's local
# OneDrive sync location; set the HHTS_ROOT_DIR environment variable to point
# at your own copy instead of editing this cell.
# input_dir = os.path.join(r"C:\Users\kdannemiller\Cambridge Systematics\PROJ SANDAG Commercial Vehicle & Heavy Truck Model Update - _Shared_CSTeam\Task03_DataID_Review")
default_root_dir = r"C:\Users\jgliebe\OneDrive - Cambridge Systematics\Documents - PROJ SANDAG Commercial Vehicle & Heavy Truck Model Update\_Shared_CSTeam\Task03_DataID_Review"
root_dir = os.environ.get("HHTS_ROOT_DIR", default_root_dir)

# Sub-folder that holds the survey tables and receives the output file.
hhts_dir = os.path.join(root_dir, "HHTS")

# Land-use table; aggregated by 'taz' in the Land Use section.
land_use = pd.read_csv(os.path.join(root_dir, "Land_Use", "mgra15_based_input2022.csv"))

# Survey tables: day, household, person and tour records.
day = pd.read_csv(os.path.join(hhts_dir, "export_day_weights.csv"))
hh = pd.read_csv(os.path.join(hhts_dir, "Model Estimation", "export_hh_weights_TAZ.csv"))
person = pd.read_csv(os.path.join(hhts_dir, "export_person_weights.csv"))
tour = pd.read_csv(os.path.join(hhts_dir, "2022HHTS_tours.csv"))

#Export
outfile = "HHAttr_dataset_v5.csv"
# NOTE: despite its name, export_dir is the full path of the output CSV file,
# not a directory.
export_dir = os.path.join(hhts_dir, outfile)


# Household

In [None]:
#Filter Out Missing Responses
# Code used in each household field to mark a missing / unanswered response.
hh_missing_codes = {
    'num_people': 995,
    'num_adults': 995,
    'num_kids': 995,
    'num_students': 995,
    'num_workers': 995,
    'num_vehicles': 995,
    'income_detailed': 999,
    'res_type': 995,  #HH type
}

# Keep only households whose value differs from the missing code in every
# field listed above.
hh_answered = pd.Series(True, index=hh.index)
for field, missing_code in hh_missing_codes.items():
    hh_answered = hh_answered & (hh[field] != missing_code)

# .copy() makes the filtered table independent of the raw one, so the column
# assignments below do not write into a slice (SettingWithCopyWarning).
hh = hh[hh_answered].copy()

#Create new variables
#num_people_nl: natural log of (household size + 1)
hh['num_people_nl'] = np.log(hh['num_people'] + 1)

#num_adults_worker: copied directly from num_workers
hh['num_adults_worker'] = hh['num_workers']

#num_adults_nonworker: adults minus workers, floored at 0
hh['num_adults_nonworker'] = (hh['num_adults'] - hh['num_adults_worker']).clip(lower=0)

#hh_size_1 / hh_size_2: 1 when the household has exactly 1 / 2 people
hh['hh_size_1'] = hh['num_people'].eq(1).astype(int)
hh['hh_size_2'] = hh['num_people'].eq(2).astype(int)

#num_veh_per_worker
# NOTE(review): a household with num_workers == 0 gets inf here (or NaN when
# num_vehicles is also 0); nothing in this cell replaces those values.
# Confirm how they should be treated before the column is used.
hh['num_veh_per_worker'] = hh['num_vehicles'] / hh['num_workers']

#hh_0_veh: 1 when the household has no vehicles
hh['hh_0_veh'] = hh['num_vehicles'].eq(0).astype(int)

#car_insufficiency
# NOTE(review): same divide-by-zero caveat as above when num_adults == 0.
hh['num_veh_per_adult'] = hh['num_vehicles'] / hh['num_adults']
# Fewer vehicles than adults, but at least one vehicle.
hh['car_insufficiency'] = (hh['num_veh_per_adult'].gt(0) & hh['num_veh_per_adult'].lt(1)).astype(int)
# Fewer vehicles than adults, zero-vehicle households included.
hh['car_insufficiency_0_veh'] = hh['num_veh_per_adult'].lt(1).astype(int)

#car_sufficiency: at least one vehicle per adult
hh['car_sufficiency'] = hh['num_veh_per_adult'].ge(1).astype(int)

# Income indicators built from the income_detailed category codes.
#income: $15,000 to $49,999 (computed here but not carried into hh_cond)
hh['income_15_49'] = hh['income_detailed'].isin([2, 3, 4]).astype(int)

#income: $50,000 to $99,999
hh['income_50_99'] = hh['income_detailed'].isin([5, 6]).astype(int)

#income: $100,000 to $199,999
hh['income_100_199'] = hh['income_detailed'].isin([7, 8]).astype(int)

#income: $200,000 plus
hh['income_200_plus'] = hh['income_detailed'].isin([9, 10]).astype(int)

#(TEMPORARY?) income: $100,000 plus
hh['income_100_plus'] = hh['income_detailed'].isin([7, 8, 9, 10]).astype(int)

#Condense Household Data
# Columns carried forward to the merge step; .copy() keeps hh_cond independent
# of hh.
hh_cond = hh[[
    'hh_id',
    'home_taz',
    'num_people',
    'num_people_nl',
    'hh_size_1',
    'hh_size_2',
    'num_adults',
    'num_kids',
    'num_vehicles',
    'num_veh_per_worker',
    'hh_0_veh',
    'car_insufficiency',
    'car_insufficiency_0_veh',
    'car_sufficiency',
    'income_50_99',
    'income_100_199',
    'income_200_plus',
    'income_100_plus'
]].copy()

hh_cond

# Day

In [None]:
#Filter Out Missing Responses
# Delivery fields that feed the indicators below.
delivery_fields = ['delivery_2', 'delivery_3', 'delivery_5', 'delivery_8']

#survey_complete: keep rows where survey_complete equals 1
#delivery: drop rows where any delivery field holds the 995 code
day_valid = day['survey_complete'].eq(1) & day[delivery_fields].ne(995).all(axis=1)

# .copy() makes the filtered table independent of the raw one, so the column
# assignments below do not write into a slice (SettingWithCopyWarning).
day = day[day_valid].copy()

#Create Dummy Variables
#weekday: travel_dow codes 1-4
day['weekday'] = day['travel_dow'].isin([1, 2, 3, 4]).astype(int)

#friday: travel_dow code 5
day['friday'] = day['travel_dow'].eq(5).astype(int)

#delivery: bitwise OR of the four delivery fields
# NOTE(review): this yields a 0/1 flag only if the fields hold 0/1 integers
# once the 995 rows are removed -- confirm against the data dictionary.
day['delivery'] = (day['delivery_2'] | day['delivery_3'] | day['delivery_5'] | day['delivery_8']).astype(int)

#food
day['food'] = (day['delivery_2']).astype(int)

#service
day['service'] = (day['delivery_3']).astype(int)

#package
day['package'] = (day['delivery_5'] | day['delivery_8']).astype(int)

#Condense Day Data
# One row per (hh_id, day_id) pair, keeping the first value of each column.
day_cond = day.groupby(['hh_id', 'day_id']).agg({
    'person_id': 'first',
    #'num_trips': 'first',
    'weekday': 'first',
    'friday': 'first',
    'delivery': 'first',
    'food': 'first',
    'service': 'first',
    'package': 'first'
    }).reset_index()

day_cond

# Person

In [None]:
#Filter Out Missing Responses
# Filters that are currently disabled are kept below for reference.
#employment
#person = person[person['employment'] != 995]

#num_jobs
#person = person[person['num_jobs'] != 995]

#telework_freq
#person = person[person['telework_freq'] != 995]

#commute_freq
#person = person[person['commute_freq'] != 995]

# Active filters: drop the 995 / 999 codes for education and the 995 code for
# student and can_drive.
person_answered = (
    ~person['education'].isin([995, 999])
    & person['student'].ne(995)
    & person['can_drive'].ne(995)
)

# .copy() makes the filtered table independent of the raw one, so the column
# assignments below do not write into a slice (SettingWithCopyWarning).
person = person[person_answered].copy()

#Create Dummy Variables
# Age-group indicators built from the 'age' category codes.
#age: 18-34
person['age_18_34'] = person['age'].isin([4, 5]).astype(int)

#age: 35-54
person['age_35_54'] = person['age'].isin([6, 7]).astype(int)

#age: 55-74
person['age_55_74'] = person['age'].isin([8, 9]).astype(int)

#age: 75+
person['age_75_plus'] = person['age'].isin([10, 11]).astype(int)

#employed
#is_employed = lambda x: 1 if x in [1, 2, 3, 7, 8] else 0
#person['employed'] = person['employment'].apply(is_employed)

#education_higher: education codes 3-7
person['education_higher'] = person['education'].isin([3, 4, 5, 6, 7]).astype(int)

#student_status: student codes 0, 1, 3, 4
person['student_status'] = person['student'].isin([0, 1, 3, 4]).astype(int)

#driver: can_drive code 1
person['driver'] = person['can_drive'].eq(1).astype(int)

#commute
#is_commute = lambda x: 1 if x in [1, 2, 3, 4] else 0
#person['commute'] = person['commute_freq'].apply(is_commute)

#telework
#is_telework = lambda x: 1 if x in [1, 2, 3, 4] else 0
#person['telework'] = person['telework_freq'].apply(is_telework)

#Condense person data
# Columns carried forward to the merge step; .copy() keeps person_cond
# independent of person.
person_cond = person[[
    'person_id',
    'age_18_34',
    'age_35_54',
    'age_55_74',
    'age_75_plus',
    'education_higher',
    'student_status',
    'driver'
]].copy()

#Group person dataset by household ID, maintaining the maximum age group category
# NOTE(review): the grouping described above is not implemented in this cell;
# person_cond keeps the same rows as the filtered person table.

person_cond

# Tour

In [None]:
#Rename tour columns
# Map the tour file's upper-case headers onto the lower-case names used by the
# other survey tables.
column_names_tour = dict(HH_ID='hh_id', PER_ID='person_id', TOURPURP='tour_purp')
tour = tour.rename(columns=column_names_tour)

#Create Dummy Variables
# One 0/1 indicator column per tour purpose code:
#   shop_tours -> 5 (shop), dine_tours -> 7 (dine), work_tours -> 1 (work-based)
purpose_codes = {'shop_tours': 5, 'dine_tours': 7, 'work_tours': 1}
for flag_column, purpose_code in purpose_codes.items():
    tour[flag_column] = (tour['tour_purp'] == purpose_code).astype('int64')

#tours: constant 1 on every row, so it can be summed into a tour count
tour['tours_total'] = 1

#Convert day column
def extract_day_number(day_string):
    """Return the integer left after removing every 'day' substring from day_string.

    For example 'day3' gives 3. Raises ValueError when the remaining text is
    not a valid integer literal.
    """
    remaining_text = day_string.replace('day', '')
    day_number = int(remaining_text)
    return day_number
#Create day_id and person_id keys
# The composite string keys are built on a derived frame instead of overwriting
# the columns of `tour`. Overwriting tour['person_id'] with the composite key
# meant that re-running this cell prepended hh_id a second time and produced
# wrong day_id values; this version gives the same result on every run.
day_number = tour['day'].apply(extract_day_number).astype(str)
hh_key = tour['hh_id'].astype(str)
# person key = hh_id followed by the person number zero-padded to 2 characters
person_key = hh_key + tour['person_id'].astype(str).str.zfill(2)
# day key = person key followed by the day number zero-padded to 2 characters
day_key = person_key + day_number.str.zfill(2)

tour_keyed = tour.assign(hh_id=hh_key, person_id=person_key, day_id=day_key)

#Group by day_id
# Sum the per-tour indicator columns into counts for each day_id.
tour_grouped = tour_keyed.groupby(['day_id']).agg({
    'hh_id': 'first',
    'person_id': 'first',
    'tours_total': 'sum',
    'shop_tours': 'sum',
    'dine_tours': 'sum',
    'work_tours': 'sum'}).reset_index()

#tours_home_based: all tours minus the tours counted in work_tours
tour_grouped['home_based_tours'] = tour_grouped['tours_total'] - tour_grouped['work_tours']

#shop_dine: shop tours plus dine tours
tour_grouped['shop_dine_tours'] = tour_grouped['shop_tours'] + tour_grouped['dine_tours']

#Condense tour dataframe
# .copy() keeps tour_cond independent of tour_grouped, so later column
# assignments on tour_cond do not write into a slice.
tour_cond = tour_grouped[[
#     'hh_id',
#     'person_id',
    'day_id',
    'home_based_tours',
    'shop_tours',
    'dine_tours',
    'shop_dine_tours'

]].copy()

tour_cond

# Land Use

In [None]:
#Group by TAZ
# Sum 'pop' over all land-use rows that share a 'taz', then rename the columns
# so the table can be joined on 'home_taz'.
column_names_land_use = dict(taz='home_taz', pop='home_taz_pop')
taz_pop_totals = land_use.groupby(['taz'])['pop'].sum().reset_index()
land_use_grouped = taz_pop_totals.rename(columns=column_names_land_use)

# Same total expressed in hundreds.
land_use_grouped['home_taz_pop_100'] = land_use_grouped['home_taz_pop'] / 100

land_use_grouped

# Merge

In [None]:
#Change column type
# Make the join keys numeric on every table. The frames are rebound with
# .assign(...) rather than modified in place: person_cond and tour_cond are
# built as column subsets of other frames, and assigning into them directly can
# raise SettingWithCopyWarning.
day_cond = day_cond.assign(
    person_id=pd.to_numeric(day_cond['person_id']),
    day_id=pd.to_numeric(day_cond['day_id']),
)
person_cond = person_cond.assign(person_id=pd.to_numeric(person_cond['person_id']))
tour_cond = tour_cond.assign(day_id=pd.to_numeric(tour_cond['day_id']))

#Merge
# Start from the day records and attach tour counts, household attributes,
# home-TAZ population and person attributes, in that order.
df = pd.merge(day_cond, tour_cond, on=['day_id'], how='left')
df = pd.merge(df, hh_cond, on=['hh_id'], how='left')
df = pd.merge(df, land_use_grouped, on=['home_taz'], how='left')
df = pd.merge(df, person_cond, on=['person_id'], how='left')

#Drop rows with any missing value
# NOTE(review): the joins above are left joins, so a day record with no match
# in tour_cond, hh_cond, land_use_grouped or person_cond has NaN in the joined
# columns and is removed here. In particular, days with no rows in the tour
# table are dropped rather than kept with zero tours, and households whose
# num_veh_per_worker is NaN are dropped as well. Confirm this is intended.
df = df.dropna()

# Final column selection and order for the exported dataset.
output_columns = [
    'hh_id',
    'person_id',
    'day_id',
    'weekday',
    'friday',
    'delivery',
    'food',
    'service',
    'package',
    'home_taz',
    'home_taz_pop',
    'home_taz_pop_100',
    'home_based_tours',
    'shop_tours',
    'dine_tours',
    'shop_dine_tours',
    'num_people',
    'num_people_nl',
    'hh_size_1',
    'hh_size_2',
    'num_adults',
    'num_kids',
    'num_vehicles',
    'num_veh_per_worker',
    'hh_0_veh',
    'car_insufficiency',
    'car_insufficiency_0_veh',
    'car_sufficiency',
    'age_18_34',
    'age_35_54',
    'age_55_74',
    'age_75_plus',
    'income_50_99',
    'income_100_199',
    'income_200_plus',
    'income_100_plus'
]
df = df[output_columns]

df.head()

In [None]:
#Checks
# Optional spot checks, left disabled. The crosstab refers to delivery_package,
# which is only created by a cell that is currently commented out.
#df['delivery'].value_counts()
#pd.crosstab(delivery_package['package'], delivery_package['income_200_plus'])
# Show the column names of the merged dataset as a plain list.
list(df.columns)

# Split

In [None]:
#delivery_food
# delivery_food = df[df['food'] == 1]
# delivery_food

In [None]:
#delivery_package
# delivery_package = df[(df['package'] == 1)]
# delivery_package

In [None]:
#delivery_service
# delivery_service = df[df['service'] == 1]
# delivery_service

# Export

In [None]:
# Write the merged dataset to the output CSV path, leaving out the row index.
df.to_csv(path_or_buf=export_dir, index=False)

In [None]:
# Display the output file path used by the export cell.
export_dir