In [1]:
import pandas as pd
import numpy as np
from keplergl import KeplerGl

In [2]:
# Read the raw, trip-level Citi Bike export (one row per ride).
trips_csv_path = "data/2022-citibike-tripdata/202201-citibike-tripdata_1.csv"

# low_memory=False: parse the file in one piece instead of internal chunks,
# so pandas does not infer mixed dtypes within a column.
trips_df = pd.read_csv(trips_csv_path, low_memory=False)

print("Shape:", trips_df.shape)
trips_df.head()

Shape: (1000000, 13)


Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,BFD29218AB271154,electric_bike,2022-01-21 13:13:43.392,2022-01-21 13:22:31.463,West End Ave & W 107 St,7650.05,Mt Morris Park W & W 120 St,7685.14,40.802117,-73.968181,40.804038,-73.945925,member
1,7C953F2FD7BE1302,classic_bike,2022-01-10 11:30:54.162,2022-01-10 11:41:43.422,4 Ave & 3 St,4028.04,Boerum Pl\t& Pacific St,4488.09,40.673746,-73.985649,40.688489,-73.99116,member
2,95893ABD40CED4B8,electric_bike,2022-01-26 10:52:43.096,2022-01-26 11:06:35.227,1 Ave & E 62 St,6753.08,5 Ave & E 29 St,6248.06,40.761227,-73.96094,40.745168,-73.986831,member
3,F853B50772137378,classic_bike,2022-01-03 08:35:48.247,2022-01-03 09:10:50.475,2 Ave & E 96 St,7338.02,5 Ave & E 29 St,6248.06,40.783964,-73.947167,40.745168,-73.986831,member
4,7590ADF834797B4B,classic_bike,2022-01-22 14:14:23.043,2022-01-22 14:34:57.474,6 Ave & W 34 St,6364.1,5 Ave & E 29 St,6248.06,40.74964,-73.98805,40.745168,-73.986831,member


In [3]:
# Tag every trip with a constant 1; summing this column inside a groupby
# then yields the number of trips per group.
trips_df["trip_count"] = 1

# Preview only the columns relevant to the route count.
preview_columns = ["start_station_name", "end_station_name", "trip_count"]
trips_df[preview_columns].head()

Unnamed: 0,start_station_name,end_station_name,trip_count
0,West End Ave & W 107 St,Mt Morris Park W & W 120 St,1
1,4 Ave & 3 St,Boerum Pl\t& Pacific St,1
2,1 Ave & E 62 St,5 Ave & E 29 St,1
3,2 Ave & E 96 St,5 Ave & E 29 St,1
4,6 Ave & W 34 St,5 Ave & E 29 St,1


In [4]:
# Collapse individual trips into one row per route. A route is identified by
# its start/end station names together with the recorded coordinates.
route_keys = [
    "start_station_name",
    "end_station_name",
    "start_lat",
    "start_lng",
    "end_lat",
    "end_lng",
]

# dropna=False keeps groups whose key contains a missing value instead of
# silently discarding those trips from the counts.
trips_by_route = trips_df.groupby(route_keys, dropna=False)
aggregated_df = trips_by_route["trip_count"].sum().reset_index()

print("Aggregated shape:", aggregated_df.shape)
aggregated_df.sort_values(by="trip_count", ascending=False).head(10)

Aggregated shape: (211089, 7)


Unnamed: 0,start_station_name,end_station_name,start_lat,start_lng,end_lat,end_lng,trip_count
183801,W 21 St & 6 Ave,9 Ave & W 22 St,40.74174,-73.994156,40.745497,-74.001971,428
2046,1 Ave & E 62 St,1 Ave & E 68 St,40.761227,-73.96094,40.765005,-73.958185,312
123099,Henry St & Grand St,Norfolk St & Broome St,40.714211,-73.981095,40.717227,-73.988021,269
149416,Norfolk St & Broome St,Henry St & Grand St,40.717227,-73.988021,40.714211,-73.981095,268
184063,W 21 St & 6 Ave,W 22 St & 10 Ave,40.74174,-73.994156,40.74692,-74.004519,244
86435,E 32 St & Park Ave,E 33 St & 1 Ave,40.745712,-73.981948,40.743227,-73.974498,228
184463,W 22 St & 10 Ave,W 21 St & 6 Ave,40.74692,-74.004519,40.74174,-73.994156,223
20059,46 Ave & 5 St,Vernon Blvd & 50 Ave,40.74731,-73.95451,40.742327,-73.954117,219
207005,West St & Chambers St,Pier 40 - Hudson River Park,40.717548,-74.013221,40.727714,-74.011296,209
57985,Central Park S & 6 Ave,Central Park S & 6 Ave,40.765909,-73.976342,40.765909,-73.976342,208


In [5]:
# Keep only the busiest routes (top TOP_N_ROUTES by trip_count) to avoid lag
# when the map is rendered.
TOP_N_ROUTES = 2000

# Discard incomplete routes *before* ranking. The aggregation keeps groups
# whose keys contain missing values (groupby(..., dropna=False)); if such rows
# were ranked first and removed afterwards they would use up top-N slots and
# the final selection could contain fewer than TOP_N_ROUTES routes.
complete_routes_df = (
    aggregated_df
    .replace([np.inf, -np.inf], np.nan)
    .dropna()
)

top_routes_df = (
    complete_routes_df
    .sort_values("trip_count", ascending=False)
    .head(TOP_N_ROUTES)
)

print("Top routes shape:", top_routes_df.shape)
top_routes_df.head()

Top routes shape: (2000, 7)


Unnamed: 0,start_station_name,end_station_name,start_lat,start_lng,end_lat,end_lng,trip_count
183801,W 21 St & 6 Ave,9 Ave & W 22 St,40.74174,-73.994156,40.745497,-74.001971,428
2046,1 Ave & E 62 St,1 Ave & E 68 St,40.761227,-73.96094,40.765005,-73.958185,312
123099,Henry St & Grand St,Norfolk St & Broome St,40.714211,-73.981095,40.717227,-73.988021,269
149416,Norfolk St & Broome St,Henry St & Grand St,40.717227,-73.988021,40.714211,-73.981095,268
184063,W 21 St & 6 Ave,W 22 St & 10 Ave,40.74174,-73.994156,40.74692,-74.004519,244


In [6]:
# Clean data to avoid JSON serialization warning:
# first turn +/-inf into NaN, then drop every row that still contains a NaN.
non_finite_values = [np.inf, -np.inf]
top_routes_df = top_routes_df.replace(non_finite_values, np.nan)
top_routes_df = top_routes_df.dropna()

In [7]:
# Build the Kepler.gl widget and load the busiest routes into it.
map_height = 600
dataset_label = "Top NYC Trips 2022"

map_1 = KeplerGl(height=map_height)
map_1.add_data(data=top_routes_df, name=dataset_label)

# Last expression of the cell: renders the interactive map inline.
map_1

User Guide: https://docs.kepler.gl/docs/keplergl-jupyter


KeplerGl(data={'Top NYC Trips 2022': {'index': [183801, 2046, 123099, 149416, 184063, 86435, 184463, 20059, 20…

After applying a filter to isolate one of the busiest starting stations, the visualization shows that most trips remain within Midtown Manhattan. The routes are relatively short and primarily follow the north–south grid pattern of the city. Only a small number of trips extend toward outer boroughs. This suggests that the station functions as a local mobility hub, likely serving commuters and short urban trips. The pattern aligns with Midtown Manhattan’s high business density, strong public transportation network, and the common use of bike-sharing for last-mile transportation.

### Route Coverage

The top 2000 most frequent routes account for approximately **12.6% of all rides in the dataset**. 

This indicates that while bike usage is spatially concentrated in certain high-traffic corridors, most trips (roughly 87%) are spread across many less frequent station pairs, reflecting the network’s broad coverage across the city.

In [9]:
# Write the configured map out to an HTML file.
html_output_path = "kepler_map.html"
map_1.save_to_html(file_name=html_output_path)

Map saved to kepler_map.html!
