# Citibike Trip Data Analysis and Visualization
Exploring and visualizing Citibike trip data with network flow maps


In [None]:
import pandas as pd
import pydeck as pdk
import folium
import geopandas as gpd
from shapely.geometry import LineString, Point
import matplotlib.pyplot as plt
import contextily as ctx
import numpy as np


## Load and Explore Data


In [None]:
# Load one monthly Citi Bike trip export into `df`.
# Station IDs are identifiers, not quantities, so they are read as strings.
# Leaving them to type inference can produce a mix of floats and strings in
# the same column, which would split one station across several group-by /
# lookup keys in the later cells.
# NOTE(review): assumes both ID columns exist in the export, as the later
# aggregation cells already require.
df = pd.read_csv(
    "../202408-citibike-tripdata/202408-citibike-tripdata_1.csv",
    dtype={"start_station_id": str, "end_station_id": str},
)
print(f"Total trips: {len(df)}")
df.head()


In [None]:
# Data overview: column names, inferred dtypes and summary statistics,
# each printed under its own heading.
overview_sections = (
    ("Column names:", df.columns.tolist()),
    ("\nData types:", df.dtypes),
    ("\nBasic statistics:", df.describe()),
)
for heading, content in overview_sections:
    print(heading)
    print(content)


In [None]:
# Count the null entries in every column
null_counts = df.isna().sum()
print("Missing values per column:")
print(null_counts)


## Basic Trip Analysis


In [None]:
# Convert both timestamp columns to datetimes (in place on df)
for ts_col in ('started_at', 'ended_at'):
    df[ts_col] = pd.to_datetime(df[ts_col])

# Trip duration = end - start, expressed in minutes
elapsed = df['ended_at'] - df['started_at']
df['trip_duration'] = elapsed.dt.total_seconds() / 60

duration_minutes = df['trip_duration']
print(f"Average trip duration: {duration_minutes.mean():.2f} minutes")
print(f"Median trip duration: {duration_minutes.median():.2f} minutes")


In [None]:
# Frequency tables for bike type and for rider type
breakdowns = (
    ("Bike type distribution:", 'rideable_type'),
    ("\nMember vs Casual:", 'member_casual'),
)
for heading, column in breakdowns:
    print(heading)
    print(df[column].value_counts())


In [None]:
# Ten most frequent station names, for trip starts and then for trip ends
station_rankings = (
    ("Top 10 Start Stations:", 'start_station_name'),
    ("\nTop 10 End Stations:", 'end_station_name'),
)
for heading, column in station_rankings:
    print(heading)
    print(df[column].value_counts().head(10))


## Aggregate Trips by Station Pairs


In [None]:
# Count trips for every distinct (start station, end station) combination.
# Names, IDs and coordinates are all part of the grouping key, so they are
# carried through to the result next to the count.
start_cols = ['start_station_name', 'start_station_id', 'start_lat', 'start_lng']
end_cols = ['end_station_name', 'end_station_id', 'end_lat', 'end_lng']

pair_sizes = df.groupby(start_cols + end_cols, as_index=False).size()
df_agg = pair_sizes.rename(columns={'size': 'trip_count'})

print(f"Unique station pairs: {len(df_agg)}")
print("\nTop 10 most popular routes:")

# Busiest routes first
df_agg_sorted = df_agg.sort_values('trip_count', ascending=False)
route_summary = df_agg_sorted[['start_station_name', 'end_station_name', 'trip_count']]
print(route_summary.head(10))


## Create Geographic Visualizations


In [None]:
# Create LineString geometries for each trip route
def make_line(row):
    """Return a two-point LineString from a route's start to its end.

    ``row`` is indexed with 'start_lng', 'start_lat', 'end_lng' and
    'end_lat'; each point is passed as a (lng, lat) pair.
    """
    origin = (row['start_lng'], row['start_lat'])
    destination = (row['end_lng'], row['end_lat'])
    return LineString([origin, destination])

# One line geometry per aggregated route, built row by row
route_lines = df_agg.apply(make_line, axis=1)

# Attach the geometries to the route table; coordinates are tagged as EPSG:4326
gdf = gpd.GeoDataFrame(df_agg, geometry=route_lines, crs="EPSG:4326")

route_total = len(gdf)
print(f"GeoDataFrame created with {route_total} routes")


### Flow Map with Basemap


In [None]:
# Reproject the routes to Web Mercator (EPSG:3857) for use with the basemap
gdf_merc = gdf.to_crs(epsg=3857)

# Line widths are scaled relative to the busiest route, which gets width 5
busiest_route = gdf_merc['trip_count'].max()
flow_widths = gdf_merc['trip_count'] / busiest_route * 5

# Draw the flow map
fig, ax = plt.subplots(figsize=(15, 15))
flow_style = dict(linewidth=flow_widths, color='red', alpha=0.6)
gdf_merc.plot(ax=ax, **flow_style)

# Add the CartoDB Positron basemap, then title the figure and hide the axes
ctx.add_basemap(ax, source=ctx.providers.CartoDB.Positron)
title_style = dict(fontsize=16, fontweight='bold')
ax.set_title("Citibike Trip Flows", **title_style)
ax.set_axis_off()
plt.tight_layout()
plt.show()


### Focus on Popular Routes


In [None]:
# Visualize only the most popular routes, selected by rank.
# Trip counts are integers with many ties, so filtering on
# `trip_count >= quantile(0.5)` can keep far more than half of the routes
# (every route, if the median count equals the minimum). Taking the
# highest-ranked rows guarantees the share that the title states.
popular_fraction = 0.5
n_popular = max(1, int(len(gdf_merc) * popular_fraction))

# Stable ascending sort + tail keeps the n_popular largest counts and puts the
# busiest routes last in plotting order.
gdf_popular = gdf_merc.sort_values('trip_count', kind='stable').tail(n_popular)

# Smallest trip count that made the cut (name kept for any later use).
threshold = gdf_popular['trip_count'].min()

fig, ax = plt.subplots(figsize=(15, 15))

# Widths are relative to the busiest selected route, which gets width 8
gdf_popular.plot(
    ax=ax,
    linewidth=gdf_popular['trip_count'] / gdf_popular['trip_count'].max() * 8,
    color='blue',
    alpha=0.7
)

ctx.add_basemap(ax, source=ctx.providers.CartoDB.Positron)
ax.set_title(
    f"Popular Citibike Routes (Top {popular_fraction:.0%}, n={len(gdf_popular)})",
    fontsize=16,
    fontweight='bold'
)
ax.set_axis_off()
plt.tight_layout()
plt.show()


### Station Point Map


In [None]:
# Build one row per station from both trip endpoints.
station_cols = ['station_name', 'station_id', 'lat', 'lng']

stations_start = df[['start_station_name', 'start_station_id', 'start_lat', 'start_lng']].drop_duplicates()
stations_start.columns = station_cols

stations_end = df[['end_station_name', 'end_station_id', 'end_lat', 'end_lng']].drop_duplicates()
stations_end.columns = station_cols

# Trips with missing station fields would otherwise contribute a NaN-ID row
# that survives de-duplication, inflates the station count and cannot be
# placed on the map. Drop incomplete rows before keeping the first row seen
# for each station ID.
stations = (
    pd.concat([stations_start, stations_end])
    .dropna(subset=['station_id', 'lat', 'lng'])
    .drop_duplicates(subset=['station_id'])
)

# Count trips per station (both starts and ends); fill_value=0 covers stations
# that appear only as a start or only as an end.
start_counts = df.groupby('start_station_id').size()
end_counts = df.groupby('end_station_id').size()
total_counts = start_counts.add(end_counts, fill_value=0)

# Trip totals are whole numbers; store them as integers rather than the floats
# produced by the fill_value addition.
stations['total_trips'] = stations['station_id'].map(total_counts).fillna(0).astype(int)

print(f"Unique stations: {len(stations)}")
stations.head()


In [None]:
# Turn the station table into point geometries (x = lng, y = lat), tagged as
# EPSG:4326, then reproject to Web Mercator for the basemap
station_points = gpd.points_from_xy(stations['lng'], stations['lat'])
stations_gdf = gpd.GeoDataFrame(stations, geometry=station_points, crs="EPSG:4326")
stations_gdf_merc = stations_gdf.to_crs(epsg=3857)

# Marker sizes are scaled relative to the busiest station, which gets size 200
peak_trips = stations_gdf_merc['total_trips'].max()
marker_sizes = stations_gdf_merc['total_trips'] / peak_trips * 200

# Plot the stations
fig, ax = plt.subplots(figsize=(15, 15))
marker_style = dict(
    markersize=marker_sizes,
    color='green',
    alpha=0.6,
    edgecolor='black',
    linewidth=0.5,
)
stations_gdf_merc.plot(ax=ax, **marker_style)

ctx.add_basemap(ax, source=ctx.providers.CartoDB.Positron)
title_style = dict(fontsize=16, fontweight='bold')
ax.set_title("Citibike Stations (size = trip volume)", **title_style)
ax.set_axis_off()
plt.tight_layout()
plt.show()


### Interactive Map with PyDeck


In [None]:
# The arc layer works on its own copy of the aggregated routes
arc_data = df_agg.copy()

# ArcLayer settings: endpoints come from the start/end coordinate columns,
# arc width from the trip count, with fixed source/target colours
arc_style = {
    "get_source_position": ["start_lng", "start_lat"],
    "get_target_position": ["end_lng", "end_lat"],
    "get_source_color": [0, 128, 255, 160],
    "get_target_color": [255, 0, 128, 160],
    "get_width": "trip_count",
    "width_scale": 0.1,
    "width_min_pixels": 1,
    "pickable": True,
    "auto_highlight": True,
}
layer = pdk.Layer("ArcLayer", data=arc_data, **arc_style)

# Centre the initial view on the mean start coordinates of all trips
center_lat = df['start_lat'].mean()
center_lng = df['start_lng'].mean()
view_state = pdk.ViewState(
    latitude=center_lat,
    longitude=center_lng,
    zoom=11,
    pitch=40,
)

# Tooltip template filled from the columns of the hovered route
route_tooltip = {
    "text": "{start_station_name} → {end_station_name}\nTrips: {trip_count}"
}

# Assemble the deck and display it inline
r = pdk.Deck(
    layers=[layer],
    initial_view_state=view_state,
    tooltip=route_tooltip,
)
r.show()


In [None]:
# Save to HTML file
# The target is one directory above the working directory, so report the path
# that was actually written rather than only the bare file name.
output_html = "../citibike_flows.html"
r.to_html(output_html)
print(f"Interactive map saved to {output_html}")


## Additional Analysis


In [None]:
# Trip duration distribution: overall histogram next to a per-bike-type box plot
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
hist_ax, box_ax = axes

# Left panel: histogram of all trip durations
hist_ax.hist(df['trip_duration'], bins=50, edgecolor='black', alpha=0.7)
hist_ax.set(
    xlabel='Trip Duration (minutes)',
    ylabel='Frequency',
    title='Trip Duration Distribution',
)
hist_ax.grid(True, alpha=0.3)

# Right panel: one box per bike type
df.boxplot(column='trip_duration', by='rideable_type', ax=box_ax)
box_ax.set(
    xlabel='Bike Type',
    ylabel='Trip Duration (minutes)',
    title='Trip Duration by Bike Type',
)
# Clear the figure-level title (presumably the automatic one added by the
# grouped boxplot)
plt.suptitle('')

plt.tight_layout()
plt.show()


In [None]:
# Usage by hour of day
# Hour of each trip's start timestamp, stored on df as 'hour'
df['hour'] = df['started_at'].dt.hour

# Number of trips that started in each hour
hourly_trips = df.groupby('hour').size()

fig, ax = plt.subplots(figsize=(12, 6))
bar_style = dict(kind='bar', color='steelblue', edgecolor='black')
hourly_trips.plot(ax=ax, **bar_style)
ax.set(
    xlabel='Hour of Day',
    ylabel='Number of Trips',
    title='Citibike Usage by Hour of Day',
)
ax.grid(True, alpha=0.3, axis='y')
# Keep the hour labels horizontal
plt.xticks(rotation=0)
plt.tight_layout()
plt.show()


In [None]:
# Member vs Casual comparison
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# Fix one category order and reuse it in both panels. value_counts() orders by
# frequency while groupby() orders by label, so positional colours could
# otherwise refer to different user types in the two panels.
trips_by_type = df['member_casual'].value_counts()
user_type_order = trips_by_type.index
avg_duration_by_type = (
    df.groupby('member_casual')['trip_duration'].mean().reindex(user_type_order)
)
bar_colors = ['#2ecc71', '#e74c3c']

# Trip count
trips_by_type.plot(kind='bar', ax=axes[0], color=bar_colors, edgecolor='black')
axes[0].set_xlabel('User Type')
axes[0].set_ylabel('Number of Trips')
axes[0].set_title('Trips by User Type')
axes[0].grid(True, alpha=0.3, axis='y')
# Keep the category labels horizontal without switching the current axes
axes[0].tick_params(axis='x', labelrotation=0)

# Average trip duration (same category order and colours as the left panel)
avg_duration_by_type.plot(kind='bar', ax=axes[1], color=bar_colors, edgecolor='black')
axes[1].set_xlabel('User Type')
axes[1].set_ylabel('Average Trip Duration (minutes)')
axes[1].set_title('Average Trip Duration by User Type')
axes[1].grid(True, alpha=0.3, axis='y')
axes[1].tick_params(axis='x', labelrotation=0)

plt.tight_layout()
plt.show()
