#### This notebooks map taxi zone location to its nearest node in graphml.

#### It process following tasks
- Add latitude and longitude to taxi zone data
- Map each location to the nearest node in road_data
- Save it as parquet file for future analysis.

In [1]:
# Import needed libraries.
import networkx as nx
import geopandas as gpd
import pandas as pd
from scipy.spatial import KDTree

In [21]:
# Load taxi_zones data
taxi_zones = gpd.read_file(r"C:\Users\FuQuan\Desktop\Ride-Hailing\Ride-Hailing-VRP-Problem\data\raw\taxi_zones\taxi_zones.shp")

In [22]:
print(taxi_zones.crs)

# If the CRS is not WGS84, reproject it
# Example: if CRS is EPSG:2263 (New York State Plane)
if taxi_zones.crs != "EPSG:4326":
    taxi_zones = taxi_zones.to_crs("EPSG:4326")
    print("New EPSG == 4326")

EPSG:2263
New EPSG == 4326


In [23]:
# Add latitude and longitude data.
taxi_zones['centroid_lat'] = taxi_zones.geometry.centroid.y
taxi_zones['centroid_lon'] = taxi_zones.centroid.x

print(taxi_zones.head())

   OBJECTID  Shape_Leng  Shape_Area                     zone  LocationID  \
0         1    0.116357    0.000782           Newark Airport           1   
1         2    0.433470    0.004866              Jamaica Bay           2   
2         3    0.084341    0.000314  Allerton/Pelham Gardens           3   
3         4    0.043567    0.000112            Alphabet City           4   
4         5    0.092146    0.000498            Arden Heights           5   

         borough                                           geometry  \
0            EWR  POLYGON ((-74.18445 40.695, -74.18449 40.6951,...   
1         Queens  MULTIPOLYGON (((-73.82338 40.63899, -73.82277 ...   
2          Bronx  POLYGON ((-73.84793 40.87134, -73.84725 40.870...   
3      Manhattan  POLYGON ((-73.97177 40.72582, -73.97179 40.725...   
4  Staten Island  POLYGON ((-74.17422 40.56257, -74.17349 40.562...   

   centroid_lat  centroid_lon  
0     40.691831    -74.174000  
1     40.616745    -73.831299  
2     40.864474    -


  taxi_zones['centroid_lat'] = taxi_zones.geometry.centroid.y

  taxi_zones['centroid_lon'] = taxi_zones.centroid.x


In [24]:
# Load road data.
G = nx.read_graphml(r"C:\Users\FuQuan\Desktop\Ride-Hailing\Ride-Hailing-VRP-Problem\data\processed\road_graph\new_york_processed_network.graphml")

In [25]:
nodes = []
for node, data in G.nodes(data=True):
    # Directly convert values since they’re guaranteed to be present
    y = float(data.get('y'))
    x = float(data.get('x'))
    highway = data.get('highway', None)
    street_count = int(data.get('street_count'))
    
    nodes.append({
         "node_id": node,
         "y": y,
         "x": x,
         "highway": highway,
         "street_count": street_count
    })
    
road_nodes_df = pd.DataFrame(nodes)
print("\nRoad Nodes Sample:")
print(road_nodes_df.head())



Road Nodes Sample:
    node_id          y          x            highway  street_count
0  39076461  40.786345 -73.794748  motorway_junction             3
1  39076490  40.762429 -73.757091  motorway_junction             3
2  39076504  40.753467 -73.744164  motorway_junction             3
3  42421728  40.798048 -73.960044    traffic_signals             3
4  42421731  40.798654 -73.961474    traffic_signals             4


In [26]:
road_coords = road_nodes_df[['y', 'x']].values
# Create KDTree for efficient searching nearest node.
kd_tree = KDTree(road_coords)

In [27]:
# Find nearest road node for each taxi zone centroid
def find_nearest_node(row):
    query_point = [row['centroid_lat'], row['centroid_lon']]
    _, idx = kd_tree.query(query_point)  # Find nearest road node
    return road_nodes_df.iloc[idx]['node_id']

In [28]:
taxi_zones['nearest_node_id'] = taxi_zones.apply(find_nearest_node, axis=1)

# Save updated taxi zone data to Parquet
taxi_zones.to_parquet(r"C:\Users\FuQuan\Desktop\Ride-Hailing\Ride-Hailing-VRP-Problem\data\processed\taxi_zones.parquet", index=False)
print("\nUpdated taxi zone data saved as 'taxizone_with_node_ids.parquet'")


Updated taxi zone data saved as 'taxizone_with_node_ids.parquet'


In [30]:
print(taxi_zones.head(100))

    OBJECTID  Shape_Leng  Shape_Area                       zone  LocationID  \
0          1    0.116357    0.000782             Newark Airport           1   
1          2    0.433470    0.004866                Jamaica Bay           2   
2          3    0.084341    0.000314    Allerton/Pelham Gardens           3   
3          4    0.043567    0.000112              Alphabet City           4   
4          5    0.092146    0.000498              Arden Heights           5   
..       ...         ...         ...                        ...         ...   
95        96    0.185180    0.000548  Forest Park/Highland Park          96   
96        97    0.062476    0.000163                Fort Greene          97   
97        98    0.121661    0.000486              Fresh Meadows          98   
98        99    0.183371    0.001210            Freshkills Park          99   
99       100    0.024813    0.000037           Garment District         100   

          borough                                  