In [4]:
import pandas as pd
import numpy as np

### Remote I/O

In [5]:
data_model_output_file = '../../data/processed/data_model_output.csv'
airport_routes_shapes_file = '../../data/external/gtfs-snippets/airport-routes.csv'
output_data_file = '../../data/interim/origins-close-to-transit.csv'

### Data Reads

In [6]:
input_data_df = pd.read_csv(data_model_output_file)
airport_routes_df = pd.read_csv(airport_routes_shapes_file)

  input_data_df = pd.read_csv(data_model_output_file)


In [7]:
airport_routes_df.head()

Unnamed: 0,shape_id,lat,lon,shape_pt_sequence,shape_dist_traveled
0,S2_992_9_60,32.71587,-117.15287,10001,0.0
1,S2_992_9_60,32.71574,-117.15287,10002,0.009
2,S2_992_9_60,32.715733,-117.153786,10003,0.063
3,S2_992_9_60,32.715733,-117.15406,10004,0.079
4,S2_992_9_60,32.715733,-117.15406,20001,0.079


In [8]:
input_data_df.filter(like='inbound', axis=1)

Unnamed: 0,inbound_or_outbound,inbound_or_outbound_label
0,1.0,INBOUND_TO_AIRPORT
1,1.0,INBOUND_TO_AIRPORT
2,1.0,INBOUND_TO_AIRPORT
3,1.0,INBOUND_TO_AIRPORT
4,1.0,INBOUND_TO_AIRPORT
...,...,...
9887,2.0,OUTBOUND_FROM_AIRPORT
9888,2.0,OUTBOUND_FROM_AIRPORT
9889,2.0,OUTBOUND_FROM_AIRPORT
9890,2.0,OUTBOUND_FROM_AIRPORT


### Reductions

In [9]:
data_df = input_data_df[[
    "unique_id",
    "origin_latitude",
    "origin_longitude",
    "main_mode_label",
    "marketsegment_label",
    "inbound_or_outbound_label",
    "weight_departing_and_arriving"
]]
data_df = data_df[data_df["origin_latitude"].notna()]

In [10]:
def haversine(lat1, lon1, lat2, lon2):
    radius = 6371
    dlat = np.radians(lat2 - lat1)
    dlon = np.radians(lon2 - lon1)
    a = np.sin(dlat / 2) ** 2 + np.cos(np.radians(lat1)) * np.cos(np.radians(lat2)) * np.sin(dlon / 2) ** 2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    distance_in_kilometers = radius * c
    return distance_in_kilometers


In [11]:
min_distances = []
for index, row in data_df.iterrows():
    lat1, lon1 = row['origin_latitude'], row['origin_longitude']
    distances = [haversine(lat1, lon1, row2['lat'], row2['lon']) for index2, row2 in airport_routes_df.iterrows()]
    min_distance = min(distances)
    min_distances.append(min_distance)

# Create a new DataFrame with the minimum distances
distances_df = pd.DataFrame({'unique_id': data_df['unique_id'], 'distance_in_kilometers': min_distances})


In [12]:
distances_df.head()

Unnamed: 0,unique_id,distance_in_kilometers
0,1,5.45092
1,2,1.211928
2,3,3.447726
3,4,0.421076
4,5,1.345806


In [13]:
data_df = pd.merge(data_df, distances_df, on="unique_id")
data_df.head()

Unnamed: 0,unique_id,origin_latitude,origin_longitude,main_mode_label,marketsegment_label,inbound_or_outbound_label,weight_departing_and_arriving,distance_in_kilometers
0,1,32.816714,-117.176898,DROVE_ALONE_AND_PARKED,EMPLOYEE,INBOUND_TO_AIRPORT,10.840259,5.45092
1,2,32.639943,-117.085774,MTS_ROUTE_992,EMPLOYEE,INBOUND_TO_AIRPORT,6.487856,1.211928
2,3,32.743009,-117.131699,DROVE_ALONE_AND_PARKED,EMPLOYEE,INBOUND_TO_AIRPORT,10.840259,3.447726
3,4,32.706752,-117.14881,MTS_ROUTE_992,EMPLOYEE,INBOUND_TO_AIRPORT,6.487856,0.421076
4,5,32.678108,-117.099196,MTS_ROUTE_992,EMPLOYEE,INBOUND_TO_AIRPORT,6.487856,1.345806


In [14]:
data_df.head()

Unnamed: 0,unique_id,origin_latitude,origin_longitude,main_mode_label,marketsegment_label,inbound_or_outbound_label,weight_departing_and_arriving,distance_in_kilometers
0,1,32.816714,-117.176898,DROVE_ALONE_AND_PARKED,EMPLOYEE,INBOUND_TO_AIRPORT,10.840259,5.45092
1,2,32.639943,-117.085774,MTS_ROUTE_992,EMPLOYEE,INBOUND_TO_AIRPORT,6.487856,1.211928
2,3,32.743009,-117.131699,DROVE_ALONE_AND_PARKED,EMPLOYEE,INBOUND_TO_AIRPORT,10.840259,3.447726
3,4,32.706752,-117.14881,MTS_ROUTE_992,EMPLOYEE,INBOUND_TO_AIRPORT,6.487856,0.421076
4,5,32.678108,-117.099196,MTS_ROUTE_992,EMPLOYEE,INBOUND_TO_AIRPORT,6.487856,1.345806


In [15]:
data_df.to_csv(output_data_file, index=False)