Connected to Python 3.13.7

# Libraries

In [168]:
import pandas as pd
import numpy as np
import json

WAREHOUSE_COORDS = (-73.985428, 40.748817)  # Example: Empire State Building coordinates

# Read the Bici Data

In [169]:
final_data = pd.DataFrame()

for file in ["../in_data/202511-citibike-tripdata_1.csv",
             "../in_data/202511-citibike-tripdata_2.csv",
             "../in_data/202511-citibike-tripdata_3.csv",
             "../in_data/202511-citibike-tripdata_4.csv"]:
    df = pd.read_csv(file)
    final_data = pd.concat([final_data, df], ignore_index=True)

final_data.head()

  df = pd.read_csv(file)
  df = pd.read_csv(file)
  df = pd.read_csv(file)
  df = pd.read_csv(file)


Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,CEE2FCDE8E0BC6F6,electric_bike,2025-11-11 10:08:01.447,2025-11-11 10:22:35.426,West Thames St,5114.06,Thompson St & Bleecker St,5721.07,40.708347,-74.017134,40.728401,-73.999688,member
1,1C6390981F71FC79,electric_bike,2025-11-09 05:08:42.937,2025-11-09 06:13:39.013,8 Ave & W 38 St,6526.05,8 Ave & W 33 St,6450.12,40.75461,-73.99177,40.751568,-73.993769,casual
2,5D13C1A39F5DF17C,electric_bike,2025-11-14 13:12:43.707,2025-11-14 13:20:29.147,W 54 St & 9 Ave,6920.05,8 Ave & W 33 St,6450.12,40.76604,-73.98737,40.751568,-73.993769,member
3,71CCA5EF65E27F1B,classic_bike,2025-11-12 17:27:45.060,2025-11-12 17:46:34.303,West End Ave & W 60 St,7059.08,W 82 St & Central Park W,7304.08,40.77237,-73.99005,40.78275,-73.97137,member
4,974E05C0A1E4A2DA,electric_bike,2025-11-11 11:45:05.368,2025-11-11 11:54:55.138,Morton St & Greenwich St,5772.05,8 Ave & W 33 St,6450.12,40.73115,-74.00887,40.751568,-73.993769,member


# Filter the data by time #

In [170]:
biciData = final_data.copy()
biciData['started_at'] = pd.to_datetime(biciData['started_at'])
biciData['ended_at'] = pd.to_datetime(biciData['ended_at'])

# Filter only the data for November 2025 and during weekdays and 7AM - 9AM
biciData = biciData[(biciData['started_at'].dt.month == 11) &
                    (biciData['started_at'].dt.year == 2025) &
                    (biciData['started_at'].dt.dayofweek < 5) &  # Monday to Friday
                    (biciData['started_at'].dt.hour >= 7)
                    & (biciData['started_at'].dt.hour < 9)]

biciData.reset_index(drop=True, inplace=True)
print(biciData.shape)

(325724, 13)


In [171]:
biciData.head()


Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,B9ACE7CC15728711,electric_bike,2025-11-05 07:05:08.387,2025-11-05 07:10:16.112,E 33 St & 5 Ave,6322.01,8 Ave & W 33 St,6450.12,40.747659,-73.984907,40.751568,-73.993769,member
1,58EDC7AF1E1AE185,electric_bike,2025-11-12 07:03:11.421,2025-11-12 07:08:07.863,9 Ave & W 45 St,6717.06,8 Ave & W 33 St,6450.12,40.760193,-73.991255,40.751568,-73.993769,member
2,F749CA8F4232897D,electric_bike,2025-11-11 08:51:36.729,2025-11-11 09:15:16.397,Evergreen Ave & Noll St,4873.08,Thompson St & Bleecker St,5721.07,40.70106,-73.93318,40.728401,-73.999688,member
3,A81A5F8B1FD8F11A,classic_bike,2025-11-05 07:37:02.232,2025-11-05 07:47:57.711,Washington Pl & 6 Ave,5838.09,8 Ave & W 33 St,6450.12,40.732241,-74.000264,40.751568,-73.993769,casual
4,8D5706EFABFCDF51,electric_bike,2025-11-05 07:22:02.710,2025-11-05 07:27:03.100,Madison Ave & E 82 St,7188.13,W 82 St & Central Park W,7304.08,40.778131,-73.960694,40.78275,-73.97137,member


# Collect all the unique stations

In [172]:
stations = pd.DataFrame(columns=['station_name', 'station_id', 'latitude', 'longitude'])
stations['station_id'] = pd.concat([biciData['start_station_id'], biciData['end_station_id']]).unique()
stations['station_name'] = stations['station_id'].map(biciData.drop_duplicates('start_station_id').set_index('start_station_id')['start_station_name']).fillna(
    stations['station_id'].map(biciData.drop_duplicates('end_station_id').set_index('end_station_id')['end_station_name'])
)

# Ensure unique index for mapping by dropping duplicates and setting index
start_lat_map = biciData.drop_duplicates('start_station_id').set_index('start_station_id')['start_lat']
end_lat_map = biciData.drop_duplicates('end_station_id').set_index('end_station_id')['end_lat']

stations['latitude'] = stations['station_id'].map(start_lat_map).fillna(
    stations['station_id'].map(end_lat_map)
)
stations['longitude'] = stations['station_id'].map(biciData.drop_duplicates('start_station_id').set_index('start_station_id')['start_lng']).fillna(
    stations['station_id'].map(biciData.drop_duplicates('end_station_id').set_index('end_station_id')['end_lng'])
)

print(f"The number of unique stations is: {stations.shape[0]}")

The number of unique stations is: 3137


Given the API has a limit of 100 coordinate calls

In [173]:
N_STATIONS = 99
some_stations = stations.sample(n=N_STATIONS, random_state=42).reset_index(drop=True)
some_stations.head()
some_stations = pd.concat([
    some_stations,
    pd.DataFrame([{
        'station_name': 'Warehouse', 'station_id': '1.00',
        'latitude': WAREHOUSE_COORDS[1], 'longitude': WAREHOUSE_COORDS[0]
    }])
], ignore_index=True)

In [174]:
print(some_stations.shape)
print(stations.shape)

(100, 4)
(3137, 4)


In [175]:
some_stations.head(100)

Unnamed: 0,station_name,station_id,latitude,longitude
0,68 St & 5 Ave,2698.07,40.634160,-74.020580
1,West St & Liberty St,5184.08,40.711444,-74.014847
2,Montague St & Hicks St,4718.11,40.695224,-73.995989
3,20 Ave & Shore Blvd,7391.02,40.785994,-73.915098
4,Frederick Douglass Blvd & Harlem River Dr,8100.01,40.830702,-73.936371
...,...,...,...,...
95,24 Ave & 26 St,7152.10,40.774591,-73.918544
96,47 Ave & 48 St,6092.01,40.740860,-73.917080
97,46 St & 7 Ave,3117.05,40.644743,-74.003754
98,51 Ave & 61 St,5923.02,40.735680,-73.903870


# Compute the Net-Flow Vector

In [176]:
some_stations_data = biciData[
    biciData['start_station_id'].isin(some_stations['station_id']) &
    biciData['end_station_id'].isin(some_stations['station_id'])
]
some_stations_data.shape

(502, 13)

In [177]:
some_stations_data.head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
421,5153190F9C9D532D,electric_bike,2025-11-11 08:51:48.907,2025-11-11 09:01:11.225,Bank St & Washington St,5964.01,West St & Liberty St,5184.08,40.736197,-74.008592,40.711444,-74.014847,member
444,9B70184E00E7A0C1,electric_bike,2025-11-05 08:50:21.736,2025-11-05 09:08:33.118,E 2 St & Ave C,5476.03,West St & Liberty St,5184.08,40.720874,-73.980858,40.711444,-74.014847,member
642,181F8BC40880C177,classic_bike,2025-11-11 08:03:47.963,2025-11-11 08:16:44.619,W 22 St & 10 Ave,6306.06,W 22 St & 10 Ave,6306.06,40.74692,-74.004519,40.74692,-74.004519,member
768,2C37198F9A59EDD9,electric_bike,2025-11-05 08:31:46.798,2025-11-05 09:22:28.721,Brooklyn Ave & Dean St,4131.03,Brooklyn Ave & Dean St,4131.03,40.67669,-73.94437,40.67669,-73.94437,member
948,989E98C95C66760A,electric_bike,2025-11-13 08:24:17.839,2025-11-13 08:27:09.553,Greenwich Ave & 8 Ave,6072.06,Bank St & Washington St,5964.01,40.739017,-74.002638,40.736197,-74.008592,casual


In [184]:
"Now we want to compute how many trips occur to a node and how many occur away from a node"

# inflow to the end station
inflow_counts = some_stations_data['end_station_id'].value_counts().to_dict()

# Compute outflow from the start station
outflow_counts = some_stations_data['start_station_id'].value_counts().to_dict()

station_netFlow = {}
for station in some_stations['station_id']:
    inflow = inflow_counts.get(station, 0)
    outflow = outflow_counts.get(station, 0)
    net_flow = inflow - outflow
    station_netFlow[station] = net_flow

# convert into a dataframe
some_stations['net_flow'] = some_stations['station_id'].map(station_netFlow)
print(sum(some_stations['net_flow']))
# Positive Values indicate Source Nodes, Negative Values indicate Sink Nodes



0


In [185]:
some_stations.head()

Unnamed: 0,station_name,station_id,latitude,longitude,net_flow
0,68 St & 5 Ave,2698.07,40.63416,-74.02058,0
1,West St & Liberty St,5184.08,40.711444,-74.014847,6
2,Montague St & Hicks St,4718.11,40.695224,-73.995989,-6
3,20 Ave & Shore Blvd,7391.02,40.785994,-73.915098,0
4,Frederick Douglass Blvd & Harlem River Dr,8100.01,40.830702,-73.936371,0


# Build Distance Matrix between the stations

To do this we can use the project-osrm API to find the distances and durations between points

In [186]:
URL = "https://router.project-osrm.org/table/v1/driving/{coords}?annotations=duration,distance"

""" By inputting a list of coordinates of the type (long, lat), 
    it returns the distance and duration matrices between those coordinates. 
    We just need to split the long lat by a comma and each coordinate by a semicolon."""

coords = ";".join(f"{lon},{lat}" for lon, lat in zip(some_stations['longitude'], some_stations['latitude']))

# given the 
# get the response 
response = pd.read_json(URL.format(coords=coords))
# save the response in a json file
response.to_json("../processed_data/distance_duration_matrix.json", indent=4)


In [187]:
# Get the distance and duration dictionaries from the json file
with open("../processed_data/distance_duration_matrix.json", "r") as f:
    data = json.load(f)
distance_dict = data['distances']
duration_dict = data['durations']

distance_matrix = np.empty((N_STATIONS+1, N_STATIONS+1))
duration_matrix = np.empty((N_STATIONS+1, N_STATIONS+1))

for r in range(N_STATIONS+1):
    distance_matrix[r, :] = distance_dict[str(r)]
    duration_matrix[r, :] = duration_dict[str(r)]

station_name_to_row = {station_name: idx for idx, station_name in enumerate(stations['station_name'])}

distance_matrix /= 1000  # convert to kilometers
distance_matrix = distance_matrix.astype(np.float32)
print("Distance matrix (in kilometers):")
print(distance_matrix[95:100,95:100])

duration_matrix /= 60  # convert to seconds
duration_matrix = duration_matrix.astype(np.float32)
print("Duration matrix (in seconds):")
print(duration_matrix[95:100,95:100])

Distance matrix (in kilometers):
[[ 0.      6.3389 23.6978  6.7305  8.3997]
 [ 6.1711  0.     17.5642  2.0685  6.8368]
 [22.9726 17.3643  0.     18.102  14.8192]
 [ 5.8554  2.2598 19.2512  0.      8.8207]
 [ 8.5529  7.2133 17.713   8.9202  0.    ]]
Duration matrix (in seconds):
[[ 0.         8.92      26.898333   9.266666  12.776667 ]
 [ 9.123333   0.        21.226667   4.5916667 10.878333 ]
 [27.436666  22.43       0.        23.418333  24.205    ]
 [ 8.315      4.8216667 22.47       0.        11.92     ]
 [13.075     11.246667  23.95      12.183333   0.       ]]


# Final Review of Data

In [188]:
pd.DataFrame(distance_matrix).to_csv("../processed_data/distance_matrix.csv", index=False, header=False)
pd.DataFrame(duration_matrix).to_csv("../processed_data/duration_matrix.csv", index=False, header=False)

pd.DataFrame(some_stations).to_csv("../processed_data/some_stations.csv", index=False)