In [1]:
import geopandas as gpd
import pandas as pd
import sys
sys.path.append('../..')

from configs.data_configs import RAW_DATA_DIR

# Load the taxi zones shapefile
taxi_zones = gpd.read_file(RAW_DATA_DIR / 'taxi_zones/taxi_zones.shp')
taxi_zones

Unnamed: 0,OBJECTID,Shape_Leng,Shape_Area,zone,LocationID,borough,geometry
0,1,0.116357,0.000782,Newark Airport,1,EWR,"POLYGON ((933100.918 192536.086, 933091.011 19..."
1,2,0.433470,0.004866,Jamaica Bay,2,Queens,"MULTIPOLYGON (((1033269.244 172126.008, 103343..."
2,3,0.084341,0.000314,Allerton/Pelham Gardens,3,Bronx,"POLYGON ((1026308.77 256767.698, 1026495.593 2..."
3,4,0.043567,0.000112,Alphabet City,4,Manhattan,"POLYGON ((992073.467 203714.076, 992068.667 20..."
4,5,0.092146,0.000498,Arden Heights,5,Staten Island,"POLYGON ((935843.31 144283.336, 936046.565 144..."
...,...,...,...,...,...,...,...
258,259,0.126750,0.000395,Woodlawn/Wakefield,259,Bronx,"POLYGON ((1025414.782 270986.139, 1025138.624 ..."
259,260,0.133514,0.000422,Woodside,260,Queens,"POLYGON ((1011466.966 216463.005, 1011545.889 ..."
260,261,0.027120,0.000034,World Trade Center,261,Manhattan,"POLYGON ((980555.204 196138.486, 980570.792 19..."
261,262,0.049064,0.000122,Yorkville East,262,Manhattan,"MULTIPOLYGON (((999804.795 224498.527, 999824...."


In [3]:
# Reproject to a projected CRS (e.g., UTM) before calculating centroids
# taxi_zones = taxi_zones.to_crs(epsg=2263)  # UTM New York State Plane (EPSG:2263)

# Calculate centroids in the projected CRS
taxi_zones['centroid'] = taxi_zones.geometry.centroid

# # Reproject centroids back to WGS84 (EPSG:4326) for correct latitude/longitude
# taxi_zones['centroid_wgs84'] = taxi_zones['centroid'].to_crs(epsg=4326)

# # Extract the latitudes and longitudes from the reprojected centroids
# taxi_zones['lat'] = taxi_zones.centroid_wgs84.y
# taxi_zones['long'] = taxi_zones.centroid_wgs84.x

# # Handle potential missing geometries
# taxi_zones = taxi_zones.dropna(subset=['centroid'])

# # Drop the additional geometry columns to keep only one geometry
# taxi_zones = taxi_zones.drop(columns=['centroid_wgs84', 'centroid'])

# # Save the updated taxi zones with centroids
# taxi_zones.to_file(RAW_DATA_DIR / 'taxi_zones_with_centroids.shp')

# Print the first few rows to verify
taxi_zones.head()

Unnamed: 0,OBJECTID,Shape_Leng,Shape_Area,zone,LocationID,borough,geometry,centroid
0,1,0.116357,0.000782,Newark Airport,1,EWR,"POLYGON ((933100.918 192536.086, 933091.011 19...",POINT (935996.821 191376.75)
1,2,0.43347,0.004866,Jamaica Bay,2,Queens,"MULTIPOLYGON (((1033269.244 172126.008, 103343...",POINT (1031085.719 164018.754)
2,3,0.084341,0.000314,Allerton/Pelham Gardens,3,Bronx,"POLYGON ((1026308.77 256767.698, 1026495.593 2...",POINT (1026452.617 254265.479)
3,4,0.043567,0.000112,Alphabet City,4,Manhattan,"POLYGON ((992073.467 203714.076, 992068.667 20...",POINT (990633.981 202959.782)
4,5,0.092146,0.000498,Arden Heights,5,Staten Island,"POLYGON ((935843.31 144283.336, 936046.565 144...",POINT (931871.37 140681.351)


In [23]:
import osmnx as ox
import networkx as nx
import time
import folium
from geopy.distance import geodesic  # Import geodesic from geopy

# Define the bounding box for New York City
place = "New York City, New York, USA"

# Load the road network graph for driving
G = ox.graph_from_place(place, network_type='drive')
print("Graph loaded successfully.")

# List of ride start and end coordinates
rides = [
    ((40.7580, -73.9855), (40.7644, -73.9745)),  # Ride 1: Times Square → Central Park
    ((40.7306, -73.9352), (40.7488, -73.9680)),  # Ride 2: Williamsburg → United Nations HQ
    ((40.7295, -73.9965), (40.8075, -73.9626)),  # Ride 3: NYU → Columbia University
    ((40.7527, -73.9772), (40.7831, -73.9712)),  # Ride 4: Grand Central → Upper East Side
    ((40.6900, -73.9900), (40.7291, -73.9965))   # Ride 5: Brooklyn Bridge → NYU
]

# Create a Folium map centered at NYC
route_map = folium.Map(location=(40.7580, -73.9855), zoom_start=13)

# Different colors for paths
path_colors = ['blue', 'red', 'green', 'purple', 'orange']

# Iterate over the rides
for i, (start_coords, end_coords) in enumerate(rides, start=1):
    try:
        # Find nearest nodes
        start_node = ox.distance.nearest_nodes(G, X=start_coords[1], Y=start_coords[0])
        end_node = ox.distance.nearest_nodes(G, X=end_coords[1], Y=end_coords[0])
        
        # Check if a path exists
        if nx.has_path(G, start_node, end_node):
            start_time = time.time()
            path = nx.astar_path(G, start_node, end_node, weight='length')
            end_time = time.time()
            
            # Get node data for plotting
            nodes = {node: (data['y'], data['x']) for node, data in G.nodes(data=True)}
            path_coords = [nodes[node] for node in path]
            
            # Calculate the length of the path (sum of edge lengths using geopy)
            path_length = sum(
                geodesic((nodes[path[i-1]][0], nodes[path[i-1]][1]), (nodes[path[i]][0], nodes[path[i]][1])).meters
                for i in range(1, len(path))
            )

            # Plot the path with unique color
            folium.PolyLine(
                locations=path_coords,
                color=path_colors[i % len(path_colors)],
                weight=5,
                opacity=0.8,
                popup=f'Ride {i}: A* Shortest Path\nLength: {path_length:.2f} meters'
            ).add_to(route_map)
            
            # Add start and end markers
            folium.Marker(
                location=start_coords,
                popup=f"Ride {i} Start",
                icon=folium.Icon(color='green', icon='play')
            ).add_to(route_map)
            
            folium.Marker(
                location=end_coords,
                popup=f"Ride {i} End",
                icon=folium.Icon(color='red', icon='stop')
            ).add_to(route_map)
            
            print(f"✅ Ride {i}: Path found in {end_time - start_time:.4f} seconds, Length: {path_length:.2f} meters")
        else:
            print(f"❌ Ride {i}: No path exists between the given points.")
    
    except Exception as e:
        print(f"🚨 Ride {i}: Error - {e}")

# Save the map
route_map.save('nyc_multiple_rides_with_length.html')
print("Map saved as 'nyc_multiple_rides_with_length.html'")

✅ Ride 1: Path found in 0.0036 seconds, Length: 1730.47 meters
✅ Ride 2: Path found in 0.0644 seconds, Length: 7243.80 meters
✅ Ride 3: Path found in 0.0667 seconds, Length: 9937.15 meters
✅ Ride 4: Path found in 0.0144 seconds, Length: 4257.55 meters
✅ Ride 5: Path found in 0.0278 seconds, Length: 5489.75 meters
Map saved as 'nyc_multiple_rides_with_length.html'


In [2]:
# Load trip data
from configs.data_configs import RAW_DATA_DIR
import polars as pl
from utils.data_utils import Preprocess

# Example Usage Without Creating an Object
trips_df = pl.read_parquet(f"{RAW_DATA_DIR}/fhvhv_tripdata_2021-01.parquet").pipe(Preprocess.reduce_memory_usage).to_pandas()

trips_df.head()

Memory usage before: 1700.79 MB
Memory usage after: 1155.66 MB


Unnamed: 0,hvfhs_license_num,dispatching_base_num,originating_base_num,request_datetime,on_scene_datetime,pickup_datetime,dropoff_datetime,PULocationID,DOLocationID,trip_miles,...,sales_tax,congestion_surcharge,airport_fee,tips,driver_pay,shared_request_flag,shared_match_flag,access_a_ride_flag,wav_request_flag,wav_match_flag
0,HV0003,B02682,B02682,2021-01-01 00:28:09,2021-01-01 00:31:42,2021-01-01 00:33:44,2021-01-01 00:49:07,230,166,5.26,...,1.98,2.75,,0.0,14.99,N,N,,N,N
1,HV0003,B02682,B02682,2021-01-01 00:45:56,2021-01-01 00:55:19,2021-01-01 00:55:19,2021-01-01 01:18:21,152,167,3.65,...,1.63,0.0,,0.0,17.059999,N,N,,N,N
2,HV0003,B02764,B02764,2021-01-01 00:21:15,2021-01-01 00:22:41,2021-01-01 00:23:56,2021-01-01 00:38:05,233,142,3.51,...,1.25,2.75,,0.94,12.98,N,N,,N,N
3,HV0003,B02764,B02764,2021-01-01 00:39:12,2021-01-01 00:42:37,2021-01-01 00:42:51,2021-01-01 00:45:50,142,143,0.74,...,0.7,2.75,,0.0,7.41,N,N,,N,N
4,HV0003,B02764,B02764,2021-01-01 00:46:11,2021-01-01 00:47:17,2021-01-01 00:48:14,2021-01-01 01:08:42,143,78,9.2,...,2.41,2.75,,0.0,22.440001,N,N,,N,N


In [3]:
zone_coords = taxi_zones[['LocationID', 'lat', 'long']]
zone_coords

Unnamed: 0,LocationID,lat,long
0,1,40.691830,-74.174001
1,2,40.616746,-73.831300
2,3,40.864473,-73.847422
3,4,40.723752,-73.976968
4,5,40.552659,-74.188485
...,...,...,...
258,259,40.897932,-73.852215
259,260,40.744233,-73.906307
260,261,40.709138,-74.013023
261,262,40.775932,-73.946510


In [4]:
# Map Pickup and Drop-off coordinates
trips_df = trips_df.merge(zone_coords, left_on='PULocationID', right_on='LocationID', suffixes=('_PU', ''))
trips_df = trips_df.merge(zone_coords, left_on='DOLocationID', right_on='LocationID', suffixes=('_PU', '_DO'))

In [5]:
trips_df.head()

Unnamed: 0,hvfhs_license_num,dispatching_base_num,originating_base_num,request_datetime,on_scene_datetime,pickup_datetime,dropoff_datetime,PULocationID,DOLocationID,trip_miles,...,shared_match_flag,access_a_ride_flag,wav_request_flag,wav_match_flag,LocationID_PU,lat_PU,long_PU,LocationID_DO,lat_DO,long_DO
0,HV0003,B02682,B02682,2021-01-01 00:28:09,2021-01-01 00:31:42,2021-01-01 00:33:44,2021-01-01 00:49:07,230,166,5.26,...,N,,N,N,230,40.759817,-73.984196,166,40.809456,-73.961763
1,HV0003,B02682,B02682,2021-01-01 00:45:56,2021-01-01 00:55:19,2021-01-01 00:55:19,2021-01-01 01:18:21,152,167,3.65,...,N,,N,N,152,40.817975,-73.953782,167,40.827512,-73.902352
2,HV0003,B02764,B02764,2021-01-01 00:21:15,2021-01-01 00:22:41,2021-01-01 00:23:56,2021-01-01 00:38:05,233,142,3.51,...,N,,N,N,233,40.749913,-73.970442,142,40.773633,-73.981532
3,HV0003,B02764,B02764,2021-01-01 00:39:12,2021-01-01 00:42:37,2021-01-01 00:42:51,2021-01-01 00:45:50,142,143,0.74,...,N,,N,N,142,40.773633,-73.981532,143,40.775965,-73.987645
4,HV0003,B02764,B02764,2021-01-01 00:46:11,2021-01-01 00:47:17,2021-01-01 00:48:14,2021-01-01 01:08:42,143,78,9.2,...,N,,N,N,143,40.775965,-73.987645,78,40.84496,-73.885521


In [6]:
from geopy.distance import geodesic
from tqdm import tqdm

# Enable tqdm for Pandas
tqdm.pandas()

def calculate_distance(row):
    start = (row['lat_PU'], row['long_PU'])
    end = (row['lat_DO'], row['long_DO'])
    return geodesic(start, end).miles

# Use progress_apply for progress bar in Pandas
trips_df['distance_miles'] = trips_df.progress_apply(calculate_distance, axis=1)

 13%|█▎        | 1547095/11613286 [04:32<29:32, 5677.63it/s]


KeyboardInterrupt: 

In [None]:
trips_df

Unnamed: 0,hvfhs_license_num,dispatching_base_num,originating_base_num,request_datetime,on_scene_datetime,pickup_datetime,dropoff_datetime,PULocationID,DOLocationID,trip_miles,...,access_a_ride_flag,wav_request_flag,wav_match_flag,LocationID_PU,lat_PU,long_PU,LocationID_DO,lat_DO,long_DO,distance_miles
0,HV0003,B02869,B02869,2021-01-23 08:15:03,2021-01-23 08:16:33,2021-01-23 08:18:36,2021-01-23 08:30:50,142,74,3.540,...,,N,N,142,40.773633,-73.981532,74,40.801169,-73.937346,2.996803
1,HV0003,B02872,B02872,2021-01-07 14:25:09,2021-01-07 14:34:09,2021-01-07 14:36:09,2021-01-07 14:50:59,206,206,2.870,...,,N,N,206,40.638973,-74.102314,206,40.638973,-74.102314,0.000000
2,HV0003,B02875,B02875,2021-01-17 12:44:54,2021-01-17 12:48:58,2021-01-17 12:50:58,2021-01-17 13:06:17,201,155,7.060,...,,N,N,201,40.577983,-73.843454,155,40.614591,-73.915277,4.544383
3,HV0003,B02764,B02764,2021-01-07 19:52:35,2021-01-07 19:56:56,2021-01-07 19:56:57,2021-01-07 20:15:04,170,226,5.100,...,,N,N,170,40.747746,-73.978492,226,40.737698,-73.924673,2.908365
4,HV0003,B02882,B02882,2021-01-22 11:52:43,2021-01-22 11:57:01,2021-01-22 11:57:01,2021-01-22 12:08:05,196,82,1.710,...,,N,N,196,40.726155,-73.863338,82,40.739495,-73.877118,1.170680
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
973,HV0005,B02510,,2021-01-02 13:34:09,NaT,2021-01-02 13:36:11,2021-01-02 13:42:00,13,87,1.471,...,N,N,N,13,40.712038,-74.016079,87,40.706808,-74.007496,0.577352
974,HV0003,B02836,B02836,2021-01-24 16:22:26,2021-01-24 16:23:53,2021-01-24 16:24:18,2021-01-24 16:35:19,94,220,1.860,...,,N,N,94,40.858155,-73.899536,220,40.882403,-73.910665,1.771853
975,HV0003,B02889,B02889,2021-01-24 06:38:44,2021-01-24 06:42:44,2021-01-24 06:42:50,2021-01-24 06:50:03,238,75,1.710,...,,N,N,238,40.791705,-73.973049,75,40.790011,-73.945750,1.436415
976,HV0005,B02510,,2021-01-10 04:14:09,NaT,2021-01-10 04:15:50,2021-01-10 04:23:10,228,22,2.328,...,N,N,N,228,40.652354,-74.011273,22,40.612218,-73.995259,2.894592


In [31]:
# Save enriched dataset
trips_df.to_parquet('fhv_trips_with_coordinates.parquet', index=False)