### This notebook focuses on adding features to the data for demand prediction analysis.

In [14]:
# Import needed libraries
import os
import duckdb
import pandas as pd
import networkx as nx
import numpy as np
from sklearn.preprocessing import MinMaxScaler

In [3]:
# Work from the project root so the relative "data/..." paths used in the cells
# below resolve. The root can be overridden with the RIDE_HAILING_ROOT
# environment variable; when it is unset, the original hard-coded local
# checkout path is used, so existing behaviour is unchanged.
project_root = os.environ.get(
    "RIDE_HAILING_ROOT",
    r"C:/Users/FuQuan/Desktop/Ride-Hailing/Ride-Hailing-VRP-Problem/",
)
os.chdir(project_root)

In [4]:
# Step 1: Load and preprocess RIDE DATA

# DuckDB reads the parquet file directly from SQL; .df() hands the result back
# as a pandas DataFrame. The path is relative to the current working directory.
ride_parquet_path = 'data/processed/ride_data/enriched_ride_data.parquet'
ride_data = duckdb.query(f"SELECT * FROM '{ride_parquet_path}'").df()
print(ride_data.head())

     request_datetime     pickup_datetime    dropoff_datetime  PULocationID  \
0 2024-01-01 04:26:07 2024-01-01 04:30:21 2024-01-01 04:43:53           242   
1 2024-01-01 03:23:13 2024-01-01 04:27:50 2024-01-01 04:55:58            37   
2 2024-01-01 04:04:46 2024-01-01 04:08:30 2024-01-01 04:33:39           208   
3 2024-01-01 04:41:16 2024-01-01 04:44:51 2024-01-01 04:54:18           216   
4 2024-01-01 04:45:53 2024-01-01 04:50:15 2024-01-01 05:13:42            39   

   DOLocationID  trip_miles  trip_time  base_passenger_fare  \
0            20       3.400        812                13.37   
1            26       9.570       1688                34.66   
2           180      15.519       1509                39.23   
3           134       2.271        567                11.44   
4           225       5.153       1407                29.98   

   congestion_surcharge  driver_pay  pickup_lat  pickup_lon  dropoff_lat  \
0                   0.0       12.28   40.846783  -73.850671    40.8577

In [5]:
# Ensure the three trip timestamp columns hold proper pandas datetimes
for ts_col in ('pickup_datetime', 'request_datetime', 'dropoff_datetime'):
    ride_data[ts_col] = pd.to_datetime(ride_data[ts_col])

In [10]:
# Derive calendar features from the pickup timestamp
pickup_ts = ride_data['pickup_datetime']
ride_data['hour_of_day'] = pickup_ts.dt.hour
ride_data['day_of_week'] = pickup_ts.dt.weekday  # 0 = Monday, 1 = Tuesday, ...

time_feature_cols = ['pickup_datetime', 'hour_of_day', 'day_of_week']
print("\nRide Data with Time Features:")
print(ride_data[time_feature_cols].head())


Ride Data with Time Features:
      pickup_datetime  hour_of_day  day_of_week
0 2024-01-01 04:30:21            4            0
1 2024-01-01 04:27:50            4            0
2 2024-01-01 04:08:30            4            0
3 2024-01-01 04:44:51            4            0
4 2024-01-01 04:50:15            4            0


In [25]:
# Preview the first rows of the ride data
ride_preview = ride_data.head()
print(ride_preview)

     request_datetime     pickup_datetime    dropoff_datetime  PULocationID  \
0 2024-01-01 04:26:07 2024-01-01 04:30:21 2024-01-01 04:43:53           242   
1 2024-01-01 03:23:13 2024-01-01 04:27:50 2024-01-01 04:55:58            37   
2 2024-01-01 04:04:46 2024-01-01 04:08:30 2024-01-01 04:33:39           208   
3 2024-01-01 04:41:16 2024-01-01 04:44:51 2024-01-01 04:54:18           216   
4 2024-01-01 04:45:53 2024-01-01 04:50:15 2024-01-01 05:13:42            39   

   DOLocationID  trip_miles  trip_time  base_passenger_fare  \
0            20       3.400        812                13.37   
1            26       9.570       1688                34.66   
2           180      15.519       1509                39.23   
3           134       2.271        567                11.44   
4           225       5.153       1407                29.98   

   congestion_surcharge  driver_pay  ...  pickup_lon  dropoff_lat  \
0                   0.0       12.28  ...  -73.850671    40.857780   
1       

In [11]:
# Count rides per pickup location ID
pickups_per_location = ride_data.groupby('PULocationID').size()
ride_demand = pickups_per_location.reset_index(name='ride_count')
print(ride_demand.head())

   PULocationID  ride_count
0             1           1
1             2          17
2             3       39987
3             4       62558
4             5        5512


In [13]:
# Min-max scale the per-location ride counts into a new column
scaler = MinMaxScaler()
scaled_counts = scaler.fit_transform(ride_demand[['ride_count']])
ride_demand["norm_ride_count"] = scaled_counts

print("Ride Demand Attributes by Location ID")
print(ride_demand.head())

Ride Demand Attributes by Location ID
   PULocationID  ride_count  norm_ride_count
0             1           1         0.000000
1             2          17         0.000052
2             3       39987         0.130390
3             4       62558         0.203992
4             5        5512         0.017971


In [29]:
# Flag weekend pickups (day_of_week: 0 = Monday ... 5 = Saturday, 6 = Sunday).
# A vectorised comparison replaces the per-row Python lambda; the explicit
# int64 cast keeps the same 0/1 integer dtype the lambda version produced.
ride_data['is_weekend'] = (ride_data['day_of_week'] >= 5).astype('int64')

# Aggregate ride statistics per (pickup node, hour of day, day of week) slot.
# is_weekend is fully determined by day_of_week, so including it in the key
# does not split groups further; it only carries the flag into the result.
group_keys = ['pickup_nearest_node', 'hour_of_day', 'day_of_week', 'is_weekend']
aggregated_df = ride_data.groupby(group_keys).agg(
    ride_count=('pickup_datetime', 'count'),
    avg_trip_duration=('trip_time', 'mean'),
    avg_wait_time=('wait_time', 'mean'),
    avg_trip_miles=('trip_miles', 'mean'),
    avg_fare=('base_passenger_fare', 'mean'),
    avg_driver_pay=('driver_pay', 'mean')
).reset_index()

# Average speed as mean miles over mean duration converted by /3600
# (presumably trip_time is in seconds, giving miles per hour - confirm against
# the source data). A slot whose mean duration is exactly 0 would otherwise
# produce inf, so zero durations are masked to NaN before dividing.
avg_trip_hours = aggregated_df['avg_trip_duration'].replace(0, np.nan) / 3600
aggregated_df['avg_speed'] = aggregated_df['avg_trip_miles'] / avg_trip_hours
aggregated_df.to_parquet("data/processed/ride_data/aggregated_ride_data.parquet")

In [30]:
# Show the aggregated table; pandas abbreviates long frames when printing,
# so only the first and last rows appear in the output.
print(aggregated_df)

      pickup_nearest_node  hour_of_day  day_of_week  is_weekend  ride_count  \
0               100662028            0            0           0          20   
1               100662028            0            1           0          14   
2               100662028            0            2           0          10   
3               100662028            0            3           0           8   
4               100662028            0            4           0           1   
...                   ...          ...          ...         ...         ...   
42833          9995746880           23            2           0          12   
42834          9995746880           23            3           0           7   
42835          9995746880           23            4           0          19   
42836          9995746880           23            5           1          30   
42837          9995746880           23            6           1          14   

       avg_trip_duration  avg_wait_time  avg_trip_m

In [15]:
# Step 2: Load Road Data
# Read the processed road network from GraphML and report its size.
road_graph_path = "data/processed/road_graph/new_york_processed_network.graphml"
G = nx.read_graphml(road_graph_path)
node_total = G.number_of_nodes()
print("GraphML Loaded. Number of nodes: ", node_total)

GraphML Loaded. Number of nodes:  55242


In [37]:
# Flatten every edge into one record: its endpoints (u, v) followed by the
# edge's attribute dictionary merged into the same dict.
edge_list = [
    {'u': u, 'v': v, **data}
    for u, v, data in G.edges(data=True)
]

# One row per edge; attributes missing on an edge become NaN
edges_df = pd.DataFrame(edge_list)

# Show it
print("\nEdges DataFrame:")
print(edges_df.head())


Edges DataFrame:
          u          v                            osmid        highway lanes  \
0  39076461  274283981                         25161349       motorway     2   
1  39076461   42854803                         25161578  motorway_link   NaN   
2  39076490  277672046                          5699971  motorway_link   NaN   
3  39076490  277672005                       1014007069       motorway     3   
4  39076504  462124701  [618709517, 618709515, 5700693]  motorway_link     1   

  maxspeed                  name oneway  ref reversed              length  \
0   50 mph  Cross Island Parkway   True   CI    False   819.5016661477803   
1      NaN                   NaN   True  NaN    False   268.1440952459794   
2      NaN                   NaN   True  NaN    False   259.9234870362934   
3   50 mph  Cross Island Parkway   True   CI    False  291.83869462800163   
4      NaN                   NaN   True  NaN    False  433.14985047751736   

                                      

In [38]:
# Edges without a maxspeed value fall back to the string '25 mph'
maxspeed_filled = edges_df['maxspeed'].fillna('25 mph')
edges_df['maxspeed'] = maxspeed_filled

print(edges_df)
edges_df.to_parquet("data/processed/road_graph/enriched_edge_data.parquet")


                  u           v                            osmid  \
0          39076461   274283981                         25161349   
1          39076461    42854803                         25161578   
2          39076490   277672046                          5699971   
3          39076490   277672005                       1014007069   
4          39076504   462124701  [618709517, 618709515, 5700693]   
...             ...         ...                              ...   
139292  12748634203  5328181022                       1376850953   
139293  12750132761    42848823                          5703658   
139294  12750132761    42848835                          5703658   
139295  12750132761    42866215                          5704783   
139296  12750132761    42860387                          5709384   

              highway lanes maxspeed                  name oneway  ref  \
0            motorway     2   50 mph  Cross Island Parkway   True   CI   
1       motorway_link   NaN   25 mp

In [None]:
# Build one record per graph node. Missing coordinates and street counts
# default to 0; a missing highway tag becomes None.
nodes = [
    {
        "node_id": node_id,
        "y": float(attrs.get('y', 0)),  # Latitude
        "x": float(attrs.get('x', 0)),  # Longitude
        "highway": attrs.get('highway'),
        "street_count": int(attrs.get('street_count', 0)),
    }
    for node_id, attrs in G.nodes(data=True)
]

road_nodes_df = pd.DataFrame(nodes)
print("\nRoad Nodes Sample:")
print(road_nodes_df.head())


Road Nodes Sample:
    node_id          y          x            highway  street_count
0  39076461  40.786345 -73.794748  motorway_junction             3
1  39076490  40.762429 -73.757091  motorway_junction             3
2  39076504  40.753467 -73.744164  motorway_junction             3
3  42421728  40.798048 -73.960044    traffic_signals             3
4  42421731  40.798654 -73.961474    traffic_signals             4


In [19]:
# STEP 3: Aggregate ride demand by node using each pickup's nearest-node ID

rides_per_node = ride_data.groupby('pickup_nearest_node').size()
node_demand = rides_per_node.reset_index(name='ride_count')

# Fit a fresh scaler on the per-node counts
scaler = MinMaxScaler()
node_counts_scaled = scaler.fit_transform(node_demand[['ride_count']])
node_demand['norm_ride_count'] = node_counts_scaled

print("\nAggregated Ride Demand by Nearest Node:")
print(node_demand.head())


Aggregated Ride Demand by Nearest Node:
  pickup_nearest_node  ride_count  norm_ride_count
0           100662028        2136         0.006962
1           102994022        8321         0.027131
2           103012237        5291         0.017250
3         10571899595       18482         0.060265
4         10593324590        1549         0.005048


In [21]:
# Attach per-node ride demand to the road nodes table.
# Any demand columns left over from a previous run of this cell are dropped
# first. Without this, re-running the cell merges onto already-merged data,
# pandas suffixes the clashing columns (ride_count_x / ride_count_y) and the
# fillna lines below fail with a KeyError. On a first run the drop is a no-op.
demand_cols = ['pickup_nearest_node', 'ride_count', 'norm_ride_count']
road_nodes_df = (
    road_nodes_df
    .drop(columns=demand_cols, errors='ignore')
    .merge(node_demand, left_on='node_id', right_on='pickup_nearest_node', how='left')
)

# Left join: nodes with no matching pickup_nearest_node get NaN demand, which
# is treated as zero. Note that the node with the smallest ride_count was
# already scaled to norm_ride_count == 0, so a normalised 0 alone does not
# distinguish "lowest demand" from "no rides".
road_nodes_df['ride_count'] = road_nodes_df['ride_count'].fillna(0)
road_nodes_df['norm_ride_count'] = road_nodes_df['norm_ride_count'].fillna(0)

print("\nRoad Nodes Enhanced with Ride Demand:")
print(road_nodes_df[['node_id', 'y', 'x', 'street_count', 'highway', 'ride_count', 'norm_ride_count']].head())


Road Nodes Enhanced with Ride Demand:
    node_id          y          x  street_count            highway  \
0  39076461  40.786345 -73.794748             3  motorway_junction   
1  39076490  40.762429 -73.757091             3  motorway_junction   
2  39076504  40.753467 -73.744164             3  motorway_junction   
3  42421728  40.798048 -73.960044             3    traffic_signals   
4  42421731  40.798654 -73.961474             4    traffic_signals   

   ride_count  norm_ride_count  
0         0.0              0.0  
1         0.0              0.0  
2         0.0              0.0  
3         0.0              0.0  
4         0.0              0.0  


In [23]:
# Print out node with more ride_count
high_demand_nodes = road_nodes_df[road_nodes_df['ride_count'] > 1]
print("\nRoad Nodes with Ride Count > 1:")
print(high_demand_nodes[['node_id', 'y', 'x', 'street_count', 'highway', 'ride_count', 'norm_ride_count']].head())


Road Nodes with Ride Count > 1:
      node_id          y          x  street_count          highway  \
11   42421772  40.783828 -73.977825             4  traffic_signals   
166  42427136  40.842154 -73.942285             3  traffic_signals   
207  42427374  40.742234 -73.997002             4  traffic_signals   
254  42427870  40.865797 -73.919996             4  traffic_signals   
277  42428003  40.765545 -73.954711             4  traffic_signals   

     ride_count  norm_ride_count  
11     131226.0         0.427911  
166    143548.0         0.468092  
207    134127.0         0.437371  
254     64072.0         0.208929  
277    124415.0         0.405701  


In [24]:
# Write the road nodes table (including the ride-demand columns) to parquet
enriched_nodes_path = "data/processed/road_graph/enriched_road_data.parquet"
road_nodes_df.to_parquet(enriched_nodes_path)
