In [1]:
import polars as pl
import sys
sys.path.append('../..')

from configs.data_configs import PROCESSED_DATA_DIR

taxi_zones = pl.read_parquet(PROCESSED_DATA_DIR / "taxi_zones/taxi_zones_with_centroids.parquet")
taxi_zones

OBJECTID,Shape_Leng,Shape_Area,zone,LocationID,borough,lat,long
i32,f64,f64,str,i32,str,f64,f64
1,0.116357,0.000782,"""Newark Airport""",1,"""EWR""",40.69183,-74.174001
2,0.43347,0.004866,"""Jamaica Bay""",2,"""Queens""",40.616746,-73.8313
3,0.084341,0.000314,"""Allerton/Pelham Gardens""",3,"""Bronx""",40.864473,-73.847422
4,0.043567,0.000112,"""Alphabet City""",4,"""Manhattan""",40.723752,-73.976968
5,0.092146,0.000498,"""Arden Heights""",5,"""Staten Island""",40.552659,-74.188485
…,…,…,…,…,…,…,…
259,0.12675,0.000395,"""Woodlawn/Wakefield""",259,"""Bronx""",40.897932,-73.852215
260,0.133514,0.000422,"""Woodside""",260,"""Queens""",40.744233,-73.906307
261,0.02712,0.000034,"""World Trade Center""",261,"""Manhattan""",40.709138,-74.013023
262,0.049064,0.000122,"""Yorkville East""",262,"""Manhattan""",40.775932,-73.94651


# plot ride and get ride length

In [2]:
import osmnx as ox
import networkx as nx
import time
import folium
from geopy.distance import geodesic


def plot_route_lat_lon(start_lat, start_lon, end_lat, end_lon, G, plot=True):
    """
    Plots a route for a single ride based on latitude and longitude coordinates.

    Args:
    - start_lat, start_lon: Latitude and longitude of the ride start point.
    - end_lat, end_lon: Latitude and longitude of the ride end point.
    - G: The road network graph (should already be loaded using osmnx).
    - plot (bool): Whether to plot the route on a map (default: True).
    """
    try:
        # Find nearest nodes using latitude and longitude
        start_node = ox.distance.nearest_nodes(G, X=start_lon, Y=start_lat)
        end_node = ox.distance.nearest_nodes(G, X=end_lon, Y=end_lat)
        
        # Check if a path exists
        if nx.has_path(G, start_node, end_node):
            start_time = time.time()
            path = nx.astar_path(G, start_node, end_node, weight='length')
            end_time = time.time()
            
            # Get node data for plotting
            nodes = {node: (data['y'], data['x']) for node, data in G.nodes(data=True)}  
            path_coords = [nodes[node] for node in path]
            
            # Calculate path length
            path_length = sum(
                geodesic((nodes[path[i-1]][0], nodes[path[i-1]][1]), (nodes[path[i]][0], nodes[path[i]][1])).meters
                for i in range(1, len(path))
            )

            if plot:
                # Create a Folium map centered at the start location
                route_map = folium.Map(location=(start_lat, start_lon), zoom_start=13)

                # Plot the path with popup
                folium.PolyLine(
                    locations=path_coords,
                    color='blue',
                    weight=5,
                    opacity=0.8,
                    popup=f'A* Shortest Path\nLength: {path_length:.2f} meters'
                ).add_to(route_map)
                
                # Add Start Marker with Icon
                folium.Marker(
                    location=(start_lat, start_lon),
                    popup=f"Start Point\nLat: {start_lat:.4f}, Lon: {start_lon:.4f}",
                    icon=folium.Icon(color='green', icon='play')
                ).add_to(route_map)
                
                # Add End Marker with Icon
                folium.Marker(
                    location=(end_lat, end_lon),
                    popup=f"End Point\nLat: {end_lat:.4f}, Lon: {end_lon:.4f}",
                    icon=folium.Icon(color='red', icon='stop')
                ).add_to(route_map)
                
                # Add Midpoint Marker with Path Length
                mid_index = len(path_coords) // 2
                midpoint = path_coords[mid_index]
                folium.Marker(
                    location=midpoint,
                    popup=f"Path Length: {path_length:.2f} meters",
                    icon=folium.Icon(color='blue', icon='info-sign')
                ).add_to(route_map)
                
                # Save the map
                route_map.save('ride_route.html')
                print("🗺️ Map saved as 'ride_route.html'")

            return path_length
        else:
            print(f"❌ No path exists between the given points.")
    
    except Exception as e:
        print(f"🚨 Error - {e}")


# Load the road network graph once
place = "New York City, New York, USA"
G = ox.graph_from_place(place, network_type='drive')
print("🌍 Graph loaded successfully.")

start_lat, start_long = taxi_zones.filter(taxi_zones['LocationID']==3)[['lat', 'long']].to_pandas().values[0]
end_lat, end_long = taxi_zones.filter(taxi_zones['LocationID']==4)[['lat', 'long']].to_pandas().values[0]

# Plot a single route
plot_route_lat_lon(start_lat, start_long, end_lat, end_long, G, plot=True)

🌍 Graph loaded successfully.
🗺️ Map saved as 'ride_route.html'


21779.147320845335

In [72]:
# Load trip data
from configs.data_configs import RAW_DATA_DIR
import polars as pl
from utils.data_utils import Preprocess

# Example Usage Without Creating an Object
trips_df = pl.read_parquet(f"{RAW_DATA_DIR}/fhvhv_tripdata_2021-01.parquet").pipe(Preprocess.reduce_memory_usage)

trips_df.head()

Memory usage before: 1700.79 MB
Memory usage after: 1155.66 MB


hvfhs_license_num,dispatching_base_num,originating_base_num,request_datetime,on_scene_datetime,pickup_datetime,dropoff_datetime,PULocationID,DOLocationID,trip_miles,trip_time,base_passenger_fare,tolls,bcf,sales_tax,congestion_surcharge,airport_fee,tips,driver_pay,shared_request_flag,shared_match_flag,access_a_ride_flag,wav_request_flag,wav_match_flag
str,str,str,datetime[ns],datetime[ns],datetime[ns],datetime[ns],i32,i32,f32,i32,f32,f32,f32,f32,f32,f32,f32,f32,str,str,str,str,str
"""HV0003""","""B02682""","""B02682""",2021-01-01 00:28:09,2021-01-01 00:31:42,2021-01-01 00:33:44,2021-01-01 00:49:07,230,166,5.26,923,22.280001,0.0,0.67,1.98,2.75,,0.0,14.99,"""N""","""N""",""" ""","""N""","""N"""
"""HV0003""","""B02682""","""B02682""",2021-01-01 00:45:56,2021-01-01 00:55:19,2021-01-01 00:55:19,2021-01-01 01:18:21,152,167,3.65,1382,18.360001,0.0,0.55,1.63,0.0,,0.0,17.059999,"""N""","""N""",""" ""","""N""","""N"""
"""HV0003""","""B02764""","""B02764""",2021-01-01 00:21:15,2021-01-01 00:22:41,2021-01-01 00:23:56,2021-01-01 00:38:05,233,142,3.51,849,14.05,0.0,0.48,1.25,2.75,,0.94,12.98,"""N""","""N""",""" ""","""N""","""N"""
"""HV0003""","""B02764""","""B02764""",2021-01-01 00:39:12,2021-01-01 00:42:37,2021-01-01 00:42:51,2021-01-01 00:45:50,142,143,0.74,179,7.91,0.0,0.24,0.7,2.75,,0.0,7.41,"""N""","""N""",""" ""","""N""","""N"""
"""HV0003""","""B02764""","""B02764""",2021-01-01 00:46:11,2021-01-01 00:47:17,2021-01-01 00:48:14,2021-01-01 01:08:42,143,78,9.2,1228,27.110001,0.0,0.81,2.41,2.75,,0.0,22.440001,"""N""","""N""",""" ""","""N""","""N"""


In [74]:
# trips_df = trips_df.sample(50, seed=42)
trips_df.filter(trips_df['PULocationID']==trips_df['DOLocationID'])

hvfhs_license_num,dispatching_base_num,originating_base_num,request_datetime,on_scene_datetime,pickup_datetime,dropoff_datetime,PULocationID,DOLocationID,trip_miles,trip_time,base_passenger_fare,tolls,bcf,sales_tax,congestion_surcharge,airport_fee,tips,driver_pay,shared_request_flag,shared_match_flag,access_a_ride_flag,wav_request_flag,wav_match_flag
str,str,str,datetime[ns],datetime[ns],datetime[ns],datetime[ns],i32,i32,f32,i32,f32,f32,f32,f32,f32,f32,f32,f32,str,str,str,str,str
"""HV0003""","""B02836""","""B02836""",2021-01-01 00:40:44,2021-01-01 00:53:34,2021-01-01 00:53:48,2021-01-01 01:11:40,22,22,3.52,1072,28.67,0.0,0.86,2.54,0.0,,0.0,17.639999,"""N""","""N""",""" ""","""N""","""N"""
"""HV0003""","""B02682""","""B02682""",2021-01-01 00:43:01,2021-01-01 00:45:40,2021-01-01 00:46:07,2021-01-01 00:53:32,89,89,1.04,445,8.7,0.0,0.26,0.77,0.0,,0.0,8.98,"""N""","""N""",""" ""","""N""","""N"""
"""HV0003""","""B02682""","""B02682""",2021-01-01 00:47:51,2021-01-01 00:58:40,2021-01-01 00:58:40,2021-01-01 01:04:40,165,165,1.66,360,9.44,0.0,0.28,0.84,0.0,,0.0,10.38,"""N""","""N""",""" ""","""N""","""N"""
"""HV0003""","""B02889""","""B02889""",2021-01-01 00:32:34,2021-01-01 00:33:09,2021-01-01 00:33:57,2021-01-01 00:37:31,80,80,0.7,214,7.91,0.0,0.24,0.7,0.0,,0.0,6.81,"""N""","""N""",""" ""","""N""","""N"""
"""HV0003""","""B02764""","""B02764""",2021-01-01 00:37:39,2021-01-01 00:41:52,2021-01-01 00:42:10,2021-01-01 00:46:56,7,7,1.38,286,7.91,0.0,0.24,0.7,0.0,,0.0,8.84,"""N""","""N""",""" ""","""N""","""N"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""HV0003""","""B02764""","""B02764""",2021-01-31 23:12:39,2021-01-31 23:20:40,2021-01-31 23:21:43,2021-01-31 23:23:59,217,217,0.34,136,11.38,0.0,0.34,1.01,0.0,,1.0,6.65,"""N""","""N""",""" ""","""N""","""N"""
"""HV0003""","""B02764""","""B02764""",2021-01-31 23:21:44,2021-01-31 23:27:24,2021-01-31 23:27:47,2021-01-31 23:34:52,256,256,0.84,425,12.65,0.0,0.38,1.12,0.0,,0.0,7.65,"""N""","""N""",""" ""","""N""","""N"""
"""HV0003""","""B02512""","""B02512""",2021-01-31 23:38:40,2021-01-31 23:47:37,2021-01-31 23:47:38,2021-01-31 23:51:07,202,202,0.62,209,11.08,0.0,0.33,0.98,0.0,,0.0,6.89,"""N""","""N""",""" ""","""N""","""N"""
"""HV0003""","""B02875""","""B02875""",2021-01-31 23:23:58,2021-01-31 23:30:03,2021-01-31 23:32:05,2021-01-31 23:38:22,208,208,0.95,377,9.18,0.0,0.28,0.81,0.0,,0.0,7.02,"""N""","""N""",""" ""","""N""","""N"""


In [60]:
trips_df.columns

['hvfhs_license_num',
 'dispatching_base_num',
 'originating_base_num',
 'request_datetime',
 'on_scene_datetime',
 'pickup_datetime',
 'dropoff_datetime',
 'PULocationID',
 'DOLocationID',
 'trip_miles',
 'trip_time',
 'base_passenger_fare',
 'tolls',
 'bcf',
 'sales_tax',
 'congestion_surcharge',
 'airport_fee',
 'tips',
 'driver_pay',
 'shared_request_flag',
 'shared_match_flag',
 'access_a_ride_flag',
 'wav_request_flag',
 'wav_match_flag']

In [61]:
unique_columns = [col for col in trips_df.columns if trips_df[col].is_unique]
print(unique_columns)

['hvfhs_license_num', 'dispatching_base_num', 'originating_base_num', 'request_datetime', 'on_scene_datetime', 'pickup_datetime', 'dropoff_datetime', 'PULocationID', 'DOLocationID', 'trip_miles', 'trip_time', 'base_passenger_fare', 'tolls', 'bcf', 'sales_tax', 'congestion_surcharge', 'airport_fee', 'tips', 'driver_pay', 'shared_request_flag', 'shared_match_flag', 'access_a_ride_flag', 'wav_request_flag', 'wav_match_flag']


In [62]:
trips_df

hvfhs_license_num,dispatching_base_num,originating_base_num,request_datetime,on_scene_datetime,pickup_datetime,dropoff_datetime,PULocationID,DOLocationID,trip_miles,trip_time,base_passenger_fare,tolls,bcf,sales_tax,congestion_surcharge,airport_fee,tips,driver_pay,shared_request_flag,shared_match_flag,access_a_ride_flag,wav_request_flag,wav_match_flag
str,str,str,datetime[ns],datetime[ns],datetime[ns],datetime[ns],i32,i32,f32,i32,f32,f32,f32,f32,f32,f32,f32,f32,str,str,str,str,str
"""HV0005""","""B02510""",,2021-01-10 15:10:08,,2021-01-10 15:12:12,2021-01-10 15:27:46,33,189,2.341,934,15.18,0.0,0.46,1.35,0.0,,0.0,10.42,"""N""","""N""","""N""","""N""","""N"""
"""HV0003""","""B02878""","""B02878""",2021-01-30 13:20:55,2021-01-30 13:26:20,2021-01-30 13:26:29,2021-01-30 13:51:45,193,41,6.13,1516,21.549999,0.0,0.65,1.91,2.75,,0.0,19.450001,"""N""","""N""",""" ""","""N""","""N"""
"""HV0003""","""B02764""","""B02764""",2021-01-15 05:54:59,2021-01-15 06:02:08,2021-01-15 06:04:08,2021-01-15 06:25:56,36,34,4.31,1308,15.9,0.0,0.48,1.41,0.0,,0.0,21.959999,"""N""","""N""",""" ""","""N""","""N"""
"""HV0003""","""B02872""","""B02872""",2021-01-10 18:17:09,2021-01-10 18:20:51,2021-01-10 18:21:27,2021-01-10 18:40:56,256,36,2.92,1169,18.09,0.0,0.54,1.61,0.0,,0.0,13.0,"""N""","""N""",""" ""","""N""","""N"""
"""HV0003""","""B02872""","""B02872""",2021-01-16 16:06:35,2021-01-16 16:13:43,2021-01-16 16:14:23,2021-01-16 16:30:38,127,241,1.62,975,10.26,0.0,0.31,0.91,0.0,,0.0,9.95,"""N""","""N""",""" ""","""N""","""N"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""HV0003""","""B02765""","""B02765""",2021-01-07 12:34:28,2021-01-07 12:36:31,2021-01-07 12:37:08,2021-01-07 12:57:41,151,68,5.55,1233,18.41,0.0,0.55,1.63,2.75,,0.0,16.440001,"""N""","""N""",""" ""","""N""","""N"""
"""HV0003""","""B02882""","""B02882""",2021-01-21 09:34:30,2021-01-21 09:37:27,2021-01-21 09:39:29,2021-01-21 09:45:49,118,214,1.11,380,7.64,0.0,0.23,0.68,0.0,,0.0,6.37,"""N""","""N""",""" ""","""N""","""N"""
"""HV0003""","""B02872""","""B02872""",2021-01-01 07:44:21,2021-01-01 07:49:19,2021-01-01 07:49:48,2021-01-01 08:00:39,258,216,2.35,651,14.23,0.0,0.43,1.26,0.0,,0.0,12.45,"""N""","""N""",""" ""","""N""","""Y"""
"""HV0003""","""B02869""","""B02869""",2021-01-24 15:39:09,2021-01-24 15:41:11,2021-01-24 15:43:12,2021-01-24 15:54:52,211,246,2.57,700,10.97,0.0,0.35,1.02,2.75,,0.0,8.7,"""N""","""N""",""" ""","""N""","""N"""


In [63]:
def get_lat_long(location_id):
    filtered = taxi_zones.filter(taxi_zones["LocationID"] == location_id)
    if filtered.shape[0] == 0:
        return (None, None)
    lat = filtered["lat"].to_pandas().values[0]
    long = filtered["long"].to_pandas().values[0]
    return (lat, long)

trips_df = trips_df.with_columns(
    trips_df["PULocationID"].map_elements(get_lat_long, return_dtype=pl.Categorical).alias("PU_lat_long"),
    trips_df["DOLocationID"].map_elements(get_lat_long, return_dtype=pl.Categorical).alias("DO_lat_long")
)
trips_df.head()

hvfhs_license_num,dispatching_base_num,originating_base_num,request_datetime,on_scene_datetime,pickup_datetime,dropoff_datetime,PULocationID,DOLocationID,trip_miles,trip_time,base_passenger_fare,tolls,bcf,sales_tax,congestion_surcharge,airport_fee,tips,driver_pay,shared_request_flag,shared_match_flag,access_a_ride_flag,wav_request_flag,wav_match_flag,PU_lat_long,DO_lat_long
str,str,str,datetime[ns],datetime[ns],datetime[ns],datetime[ns],i32,i32,f32,i32,f32,f32,f32,f32,f32,f32,f32,f32,str,str,str,str,str,list[f64],list[f64]
"""HV0005""","""B02510""",,2021-01-10 15:10:08,,2021-01-10 15:12:12,2021-01-10 15:27:46,33,189,2.341,934,15.18,0.0,0.46,1.35,0.0,,0.0,10.42,"""N""","""N""","""N""","""N""","""N""","[40.695798, -73.99525]","[40.677635, -73.967587]"
"""HV0003""","""B02878""","""B02878""",2021-01-30 13:20:55,2021-01-30 13:26:20,2021-01-30 13:26:29,2021-01-30 13:51:45,193,41,6.13,1516,21.549999,0.0,0.65,1.91,2.75,,0.0,19.450001,"""N""","""N""",""" ""","""N""","""N""","[40.760313, -73.941997]","[40.804333, -73.951292]"
"""HV0003""","""B02764""","""B02764""",2021-01-15 05:54:59,2021-01-15 06:02:08,2021-01-15 06:04:08,2021-01-15 06:25:56,36,34,4.31,1308,15.9,0.0,0.48,1.41,0.0,,0.0,21.959999,"""N""","""N""",""" ""","""N""","""N""","[40.700521, -73.91771]","[40.700855, -73.971188]"
"""HV0003""","""B02872""","""B02872""",2021-01-10 18:17:09,2021-01-10 18:20:51,2021-01-10 18:21:27,2021-01-10 18:40:56,256,36,2.92,1169,18.09,0.0,0.54,1.61,0.0,,0.0,13.0,"""N""","""N""",""" ""","""N""","""N""","[40.710879, -73.959904]","[40.700521, -73.91771]"
"""HV0003""","""B02872""","""B02872""",2021-01-16 16:06:35,2021-01-16 16:13:43,2021-01-16 16:14:23,2021-01-16 16:30:38,127,241,1.62,975,10.26,0.0,0.31,0.91,0.0,,0.0,9.95,"""N""","""N""",""" ""","""N""","""N""","[40.866074, -73.919308]","[40.876512, -73.89562]"


In [67]:
from tqdm.auto import tqdm

# Initialize the progress bar
pbar = tqdm(total=len(trips_df))

def calculate_route_length(row):
    pbar.update(1)
    start_lat, start_long = row[0]
    end_lat, end_long = row[1]
    return plot_route_lat_lon(start_lat, start_long, end_lat, end_long, G, plot=False)

# Apply the function to the dataframe
trips_df = trips_df.with_columns(
    trips_df[['PU_lat_long', 'DO_lat_long']].map_rows(calculate_route_length)#.alias("route_length")
)#.rename({"map_rows": "route_length"})

# Close the progress bar
pbar.close()

  0%|          | 0/50 [00:00<?, ?it/s]

🚨 Error - ufunc 'isnan' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''
🚨 Error - ufunc 'isnan' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''


In [71]:
trips_df.filter(trips_df['map'].is_null())

hvfhs_license_num,dispatching_base_num,originating_base_num,request_datetime,on_scene_datetime,pickup_datetime,dropoff_datetime,PULocationID,DOLocationID,trip_miles,trip_time,base_passenger_fare,tolls,bcf,sales_tax,congestion_surcharge,airport_fee,tips,driver_pay,shared_request_flag,shared_match_flag,access_a_ride_flag,wav_request_flag,wav_match_flag,PU_lat_long,DO_lat_long,map
str,str,str,datetime[ns],datetime[ns],datetime[ns],datetime[ns],i32,i32,f32,i32,f32,f32,f32,f32,f32,f32,f32,f32,str,str,str,str,str,list[f64],list[f64],f64
"""HV0005""","""B02510""",,2021-01-18 22:03:39,,2021-01-18 22:06:09,2021-01-18 22:25:45,239,265,5.958,1176,24.18,20.0,1.33,0.0,0.0,,0.0,17.700001,"""N""","""N""","""N""","""N""","""N""","[40.783961, -73.978632]","[null, null]",
"""HV0005""","""B02510""",,2021-01-15 16:34:42,,2021-01-15 16:39:53,2021-01-15 17:27:37,61,265,18.311001,2864,67.540001,0.62,1.87,5.52,0.0,,0.0,46.080002,"""N""","""N""","""N""","""N""","""N""","[40.674469, -73.939287]","[null, null]",


## display html.
<iframe src="nyc_multiple_rides_with_length.html" width="700" height="600"></iframe>

In [3]:
# display html
from IPython.display import IFrame
IFrame(src='nyc_multiple_rides_with_length.html', width=700, height=600)
# not displaying the map