In [None]:
import glob

import pandas as pd
import polars as pl

# Load the 2024 trip CSVs with polars (fast, column-pruned parsing) and hand the
# result to pandas, because every later cell uses the pandas API on `df`
# (column assignment, groupby().size().reset_index(), merge, .dt accessors).
files = sorted(glob.glob("data/2024_data/2024*-bluebikes-tripdata.csv"))
if not files:
    # Fail early with a clear message instead of an obscure reader/concat error.
    raise FileNotFoundError(
        "No files match data/2024_data/2024*-bluebikes-tripdata.csv"
    )

# Read only the columns we need (greatly reduces memory use).
usecols = ["started_at", "ended_at", "start_station_id", "end_station_id"]

# pl.read_csv is documented for a single path / file-like source, so each file
# is read on its own and the pieces are stacked afterwards.
# NOTE: ignore_errors=True turns values that cannot be parsed into nulls
# instead of raising, so malformed cells are dropped silently.
monthly_frames = [
    pl.read_csv(
        path,
        columns=usecols,
        try_parse_dates=True,
        ignore_errors=True,
    ).select(usecols)  # enforce one column order across files
    for path in files
]

# "vertical_relaxed" coerces columns to a common supertype when the inferred
# dtypes differ between files.
trips_pl = pl.concat(monthly_frames, how="vertical_relaxed")

# Convert once to pandas for the downstream cells.
# (polars documents that to_pandas() needs pandas and pyarrow installed.)
df = trips_pl.to_pandas()

print(df.shape)
df.head()


In [None]:
# Make sure both trip timestamps are pandas datetimes (pd.to_datetime leaves
# columns that are already datetimes unchanged), then bucket every trip into
# the clock hour in which it started / ended.
df["started_at"] = pd.to_datetime(df["started_at"])
df["ended_at"] = pd.to_datetime(df["ended_at"])

# Use the lowercase "h" alias: uppercase "H" is deprecated since pandas 2.2.
df["start_hour"] = df["started_at"].dt.floor("h")
df["end_hour"] = df["ended_at"].dt.floor("h")


In [None]:
# Number of rows (trips) per start station and start hour.
start_keys = ["start_station_id", "start_hour"]
hourly_start = df.groupby(start_keys).size().reset_index(name="usage_start")

# Number of rows (trips) per end station and end hour.
end_keys = ["end_station_id", "end_hour"]
hourly_end = df.groupby(end_keys).size().reset_index(name="usage_end")

In [None]:
# Put the start-side and end-side counts on a shared (station_id, hour) key.
departures = hourly_start.rename(
    columns={"start_station_id": "station_id", "start_hour": "hour"}
)
arrivals = hourly_end.rename(
    columns={"end_station_id": "station_id", "end_hour": "hour"}
)

# Outer join keeps station-hours that appear on only one side.
hourly = departures.merge(arrivals, how="outer", on=["station_id", "hour"])

# A station-hour missing from one side has no count there; treat it as zero.
for count_col in ("usage_start", "usage_end"):
    hourly[count_col] = hourly[count_col].fillna(0)

# Final usage = average of the start and end counts (realistic proxy).
hourly["usage"] = hourly["usage_start"].add(hourly["usage_end"]).div(2)


In [None]:
# Derive calendar features from the hourly bucket timestamp.
hour_ts = hourly["hour"].dt
weekday = hour_ts.dayofweek  # Monday = 0 ... Sunday = 6

hourly["hour_of_day"] = hour_ts.hour
hourly["day_of_week"] = weekday
# 1 for Saturday/Sunday, 0 otherwise.
hourly["is_weekend"] = weekday.isin([5, 6]).astype(int)
hourly["month"] = hour_ts.month
hourly["day"] = hour_ts.day
# ISO week number taken from the ISO calendar of the timestamp.
hourly["week_of_year"] = hour_ts.isocalendar().week


In [None]:
# Load the per-station feature table and attach it to every hourly row.
station_features_path = "data/2024_data/station_features_2024.csv"
stations = pd.read_csv(station_features_path)

# Left join: every hourly row is kept; stations without a match in the
# feature file get NaN in the added columns.
# NOTE(review): assumes station_id is unique in the feature file; duplicates
# there would multiply hourly rows. Confirm against the data.
hourly = hourly.merge(stations, how="left", on="station_id")


In [None]:
# Quick sanity check of the final table: its size, then the first rows.
print(hourly.shape)
# Leave the frame as the cell's last expression so the notebook renders it as
# a rich table (same pattern as the df.head() preview in the loading cell).
hourly.head()