In [17]:
import pandas as pd
import numpy as np
import os
from shapely.geometry import Point
from shapely.ops import nearest_points
import osmnx as ox
import geopandas as gpd
from tqdm import tqdm


In [None]:
import glob
import pandas as pd

files = sorted(glob.glob(f"2024*-bluebikes-tripdata.csv"))
print("Found CSV files:", len(files))
print(files)

# 读全部 csv
df_list = []
for f in files:
    print("Loading", f)
    tmp = pd.read_csv(f)
    df_list.append(tmp)

df = pd.concat(df_list, ignore_index=True)

df['started_at'] = pd.to_datetime(df['started_at'], errors='coerce')

start_stations = df[['start_station_id', 'start_station_name', 'start_lat', 'start_lng']].rename(
    columns={'start_station_id':'station_id','start_station_name':'station_name','start_lat':'lat','start_lng':'lng'}
)
end_stations = df[['end_station_id', 'end_station_name', 'end_lat', 'end_lng']].rename(
    columns={'end_station_id':'station_id','end_station_name':'station_name','end_lat':'lat','end_lng':'lng'}
)
stations = pd.concat([start_stations, end_stations]).dropna().drop_duplicates('station_id')
stations = stations.groupby('station_id').first().reset_index()


Found CSV files: 12
['202401-bluebikes-tripdata.csv', '202402-bluebikes-tripdata.csv', '202403-bluebikes-tripdata.csv', '202404-bluebikes-tripdata.csv', '202405-bluebikes-tripdata.csv', '202406-bluebikes-tripdata.csv', '202407-bluebikes-tripdata.csv', '202408-bluebikes-tripdata.csv', '202409-bluebikes-tripdata.csv', '202410-bluebikes-tripdata.csv', '202411-bluebikes-tripdata.csv', '202412-bluebikes-tripdata.csv']
Loading 202401-bluebikes-tripdata.csv
Loading 202402-bluebikes-tripdata.csv
Loading 202403-bluebikes-tripdata.csv
Loading 202404-bluebikes-tripdata.csv
Loading 202405-bluebikes-tripdata.csv
Loading 202406-bluebikes-tripdata.csv
Loading 202407-bluebikes-tripdata.csv
Loading 202408-bluebikes-tripdata.csv
Loading 202409-bluebikes-tripdata.csv
Loading 202410-bluebikes-tripdata.csv
Loading 202411-bluebikes-tripdata.csv
Loading 202412-bluebikes-tripdata.csv


In [52]:
usage_start = df.groupby('start_station_id').size().rename('start_count')
usage_end   = df.groupby('end_station_id').size().rename('end_count')
usage_total = usage_start.add(usage_end, fill_value=0).rename('usage_total')


In [None]:
import numpy as np

usage_monthly = None

for f in files:
    tmp = pd.read_csv(f)
    tmp['started_at'] = pd.to_datetime(tmp['started_at'], errors='coerce')
    tmp['month'] = tmp['started_at'].dt.month

    start = tmp.groupby(['start_station_id', 'month']).size().unstack(fill_value=0)
    end   = tmp.groupby(['end_station_id',   'month']).size().unstack(fill_value=0)
    u = start.add(end, fill_value=0) 

    if usage_monthly is None:
        usage_monthly = u
    else:
        usage_monthly = usage_monthly.add(u, fill_value=0)

usage_monthly = usage_monthly.fillna(0)

usage_monthly.columns = [f"usage_month_{int(m)}" for m in usage_monthly.columns]

usage_monthly.head()


Unnamed: 0,usage_month_1,usage_month_2,usage_month_3,usage_month_4,usage_month_5,usage_month_6,usage_month_7,usage_month_8,usage_month_9,usage_month_10,usage_month_11,usage_month_12
A32000,625.0,778.0,1087.0,1286.0,2140.0,2678.0,2938.0,2728.0,2208.0,1969.0,1325.0,538.0
A32001,1131.0,1630.0,2073.0,2288.0,2943.0,3072.0,3482.0,3343.0,3544.0,3451.0,2656.0,1467.0
A32002,2732.0,4586.0,4695.0,6230.0,6052.0,6619.0,7663.0,6845.0,10987.0,10183.0,6664.0,2935.0
A32003,1003.0,1801.0,2038.0,3003.0,2747.0,2593.0,2988.0,2713.0,4878.0,4790.0,3191.0,1310.0
A32004,1443.0,1870.0,2138.0,3801.0,5348.0,6325.0,6926.0,6255.0,7581.0,7850.0,5019.0,1826.0


In [54]:
stations = stations.merge(usage_total, how='left', left_on='station_id', right_index=True)
stations = stations.merge(usage_monthly, how='left', left_on='station_id', right_index=True)
stations = stations.fillna(0)
stations.head()



Unnamed: 0,station_id,station_name,lat,lng,usage_total,usage_month_1,usage_month_2,usage_month_3,usage_month_4,usage_month_5,usage_month_6,usage_month_7,usage_month_8,usage_month_9,usage_month_10,usage_month_11,usage_month_12
0,A32000,Fan Pier,42.353391,-71.044571,20300,625.0,778.0,1087.0,1286.0,2140.0,2678.0,2938.0,2728.0,2208.0,1969.0,1325.0,538.0
1,A32001,Union Square - Brighton Ave at Cambridge St,42.353334,-71.137313,31080,1131.0,1630.0,2073.0,2288.0,2943.0,3072.0,3482.0,3343.0,3544.0,3451.0,2656.0,1467.0
2,A32002,Commonwealth Ave at Agganis Way,42.351692,-71.119035,76191,2732.0,4586.0,4695.0,6230.0,6052.0,6619.0,7663.0,6845.0,10987.0,10183.0,6664.0,2935.0
3,A32003,B.U. Central - 725 Comm. Ave.,42.350385,-71.108131,33055,1003.0,1801.0,2038.0,3003.0,2747.0,2593.0,2988.0,2713.0,4878.0,4790.0,3191.0,1310.0
4,A32004,Longwood Ave at Binney St,42.338466,-71.106984,56382,1443.0,1870.0,2138.0,3801.0,5348.0,6325.0,6926.0,6255.0,7581.0,7850.0,5019.0,1826.0


In [55]:
import osmnx as ox
print("OSMnx version:", ox.__version__)
print("Module path:", ox.__file__)

OSMnx version: 2.0.6
Module path: /Users/wuling/Desktop/25Fall/CS506/bluebikes-demand-forecast/venv/lib/python3.12/site-packages/osmnx/__init__.py


In [None]:
import osmnx as ox

place = "Boston, Massachusetts, USA"

attractions = ox.features_from_place(place, tags={"tourism": True})

parks = ox.features_from_place(place, tags={"leisure": "park"})

bike_lanes = ox.features_from_place(place, tags={"highway": "cycleway"})

In [None]:
attractions = attractions[attractions.geometry.type == "Point"]
parks = parks[parks.geometry.type == "Polygon"].copy()
parks['geometry'] = parks.geometry.centroid
bike_lanes = bike_lanes[bike_lanes.geometry.type == "LineString"]



  parks['geometry'] = parks.geometry.centroid  # 用中心点


In [58]:
def geodistance_m(g1, g2):
    """shapely distance returns degrees; convert to meters."""
    return g1.distance(g2) * 111_000


In [59]:
def count_nearby(gdf_points, station_point, radius_m=500):
    return gdf_points.distance(station_point).lt(radius_m / 111_000).sum()

station_geom = [Point(lng, lat) for lat, lng in zip(stations.lat, stations.lng)]
stations['geom'] = station_geom

stations['num_attractions_r500'] = stations['geom'].apply(
    lambda g: count_nearby(attractions.geometry, g, radius_m=500)
)

stations.head()



  return gdf_points.distance(station_point).lt(radius_m / 111_000).sum()


Unnamed: 0,station_id,station_name,lat,lng,usage_total,usage_month_1,usage_month_2,usage_month_3,usage_month_4,usage_month_5,usage_month_6,usage_month_7,usage_month_8,usage_month_9,usage_month_10,usage_month_11,usage_month_12,geom,num_attractions_r500
0,A32000,Fan Pier,42.353391,-71.044571,20300,625.0,778.0,1087.0,1286.0,2140.0,2678.0,2938.0,2728.0,2208.0,1969.0,1325.0,538.0,POINT (-71.04457139968872 42.3533905070523),2
1,A32001,Union Square - Brighton Ave at Cambridge St,42.353334,-71.137313,31080,1131.0,1630.0,2073.0,2288.0,2943.0,3072.0,3482.0,3343.0,3544.0,3451.0,2656.0,1467.0,POINT (-71.137313 42.353334),10
2,A32002,Commonwealth Ave at Agganis Way,42.351692,-71.119035,76191,2732.0,4586.0,4695.0,6230.0,6052.0,6619.0,7663.0,6845.0,10987.0,10183.0,6664.0,2935.0,POINT (-71.11903488636017 42.35169201885973),1
3,A32003,B.U. Central - 725 Comm. Ave.,42.350385,-71.108131,33055,1003.0,1801.0,2038.0,3003.0,2747.0,2593.0,2988.0,2713.0,4878.0,4790.0,3191.0,1310.0,POINT (-71.10813117 42.350385308),9
4,A32004,Longwood Ave at Binney St,42.338466,-71.106984,56382,1443.0,1870.0,2138.0,3801.0,5348.0,6325.0,6926.0,6255.0,7581.0,7850.0,5019.0,1826.0,POINT (-71.1069839 42.338466287),2


In [60]:
def min_distance_to(gdf, point):
    return gdf.geometry.distance(point).min() * 111_000

stations['dist_to_park'] = stations['geom'].apply(lambda g: min_distance_to(parks, g))
stations.head()



  return gdf.geometry.distance(point).min() * 111_000


Unnamed: 0,station_id,station_name,lat,lng,usage_total,usage_month_1,usage_month_2,usage_month_3,usage_month_4,usage_month_5,usage_month_6,usage_month_7,usage_month_8,usage_month_9,usage_month_10,usage_month_11,usage_month_12,geom,num_attractions_r500,dist_to_park
0,A32000,Fan Pier,42.353391,-71.044571,20300,625.0,778.0,1087.0,1286.0,2140.0,2678.0,2938.0,2728.0,2208.0,1969.0,1325.0,538.0,POINT (-71.04457139968872 42.3533905070523),2,49.826427
1,A32001,Union Square - Brighton Ave at Cambridge St,42.353334,-71.137313,31080,1131.0,1630.0,2073.0,2288.0,2943.0,3072.0,3482.0,3343.0,3544.0,3451.0,2656.0,1467.0,POINT (-71.137313 42.353334),10,89.363332
2,A32002,Commonwealth Ave at Agganis Way,42.351692,-71.119035,76191,2732.0,4586.0,4695.0,6230.0,6052.0,6619.0,7663.0,6845.0,10987.0,10183.0,6664.0,2935.0,POINT (-71.11903488636017 42.35169201885973),1,704.683388
3,A32003,B.U. Central - 725 Comm. Ave.,42.350385,-71.108131,33055,1003.0,1801.0,2038.0,3003.0,2747.0,2593.0,2988.0,2713.0,4878.0,4790.0,3191.0,1310.0,POINT (-71.10813117 42.350385308),9,932.422063
4,A32004,Longwood Ave at Binney St,42.338466,-71.106984,56382,1443.0,1870.0,2138.0,3801.0,5348.0,6325.0,6926.0,6255.0,7581.0,7850.0,5019.0,1826.0,POINT (-71.1069839 42.338466287),2,191.975588


In [61]:
stations['dist_to_bikelane'] = stations['geom'].apply(
    lambda g: bike_lanes.geometry.distance(g).min() * 111_000
)

stations.head()



  lambda g: bike_lanes.geometry.distance(g).min() * 111_000


Unnamed: 0,station_id,station_name,lat,lng,usage_total,usage_month_1,usage_month_2,usage_month_3,usage_month_4,usage_month_5,...,usage_month_7,usage_month_8,usage_month_9,usage_month_10,usage_month_11,usage_month_12,geom,num_attractions_r500,dist_to_park,dist_to_bikelane
0,A32000,Fan Pier,42.353391,-71.044571,20300,625.0,778.0,1087.0,1286.0,2140.0,...,2938.0,2728.0,2208.0,1969.0,1325.0,538.0,POINT (-71.04457139968872 42.3533905070523),2,49.826427,192.773167
1,A32001,Union Square - Brighton Ave at Cambridge St,42.353334,-71.137313,31080,1131.0,1630.0,2073.0,2288.0,2943.0,...,3482.0,3343.0,3544.0,3451.0,2656.0,1467.0,POINT (-71.137313 42.353334),10,89.363332,301.85936
2,A32002,Commonwealth Ave at Agganis Way,42.351692,-71.119035,76191,2732.0,4586.0,4695.0,6230.0,6052.0,...,7663.0,6845.0,10987.0,10183.0,6664.0,2935.0,POINT (-71.11903488636017 42.35169201885973),1,704.683388,2.874625
3,A32003,B.U. Central - 725 Comm. Ave.,42.350385,-71.108131,33055,1003.0,1801.0,2038.0,3003.0,2747.0,...,2988.0,2713.0,4878.0,4790.0,3191.0,1310.0,POINT (-71.10813117 42.350385308),9,932.422063,154.677117
4,A32004,Longwood Ave at Binney St,42.338466,-71.106984,56382,1443.0,1870.0,2138.0,3801.0,5348.0,...,6926.0,6255.0,7581.0,7850.0,5019.0,1826.0,POINT (-71.1069839 42.338466287),2,191.975588,385.855073


In [62]:
feature_cols = [
    'station_id','station_name','lat','lng','usage_total'
] + list(usage_monthly.columns) + [
    'num_attractions_r500',
    'dist_to_park',
    'dist_to_bikelane'
]

station_features = stations[feature_cols]

station_features.to_csv("station_features_2024.csv", index=False)

station_features.head()


Unnamed: 0,station_id,station_name,lat,lng,usage_total,usage_month_1,usage_month_2,usage_month_3,usage_month_4,usage_month_5,usage_month_6,usage_month_7,usage_month_8,usage_month_9,usage_month_10,usage_month_11,usage_month_12,num_attractions_r500,dist_to_park,dist_to_bikelane
0,A32000,Fan Pier,42.353391,-71.044571,20300,625.0,778.0,1087.0,1286.0,2140.0,2678.0,2938.0,2728.0,2208.0,1969.0,1325.0,538.0,2,49.826427,192.773167
1,A32001,Union Square - Brighton Ave at Cambridge St,42.353334,-71.137313,31080,1131.0,1630.0,2073.0,2288.0,2943.0,3072.0,3482.0,3343.0,3544.0,3451.0,2656.0,1467.0,10,89.363332,301.85936
2,A32002,Commonwealth Ave at Agganis Way,42.351692,-71.119035,76191,2732.0,4586.0,4695.0,6230.0,6052.0,6619.0,7663.0,6845.0,10987.0,10183.0,6664.0,2935.0,1,704.683388,2.874625
3,A32003,B.U. Central - 725 Comm. Ave.,42.350385,-71.108131,33055,1003.0,1801.0,2038.0,3003.0,2747.0,2593.0,2988.0,2713.0,4878.0,4790.0,3191.0,1310.0,9,932.422063,154.677117
4,A32004,Longwood Ave at Binney St,42.338466,-71.106984,56382,1443.0,1870.0,2138.0,3801.0,5348.0,6325.0,6926.0,6255.0,7581.0,7850.0,5019.0,1826.0,2,191.975588,385.855073
