In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor
import matplotlib.pyplot as plt


In [None]:
# Load the station-level feature table; the path is resolved relative to the
# notebook's working directory.
df = pd.read_csv(
    "station_features_2024.csv",
)

# Report (rows, columns), then show the first rows as the cell output.
print("Data shape:", df.shape)
df.head()


In [None]:
# --- Derive log / inverse / clipped / bucketed features ---

# Bike-lane distance transforms, all computed from the same raw column.
bikelane_dist = df["dist_to_bikelane"]
df["log_dist_bikelane"] = np.log1p(bikelane_dist)
df["inv_dist_bikelane"] = 1 / (bikelane_dist + 1)
df["dist_bikelane_clipped"] = bikelane_dist.clip(0, 3000)

# Bucket the raw distance into five labelled ranges. The lowest edge is -1 so
# that a distance of exactly 0 lands in the first bucket (pd.cut intervals are
# open on the left by default).
bikelane_labels = ["very_close", "close", "medium", "far", "very_far"]
df["dist_bikelane_bucket"] = pd.cut(
    bikelane_dist,
    bins=[-1, 200, 800, 2000, 5000, np.inf],
    labels=bikelane_labels,
)

# Ordinal encoding of the bucket: the closest bucket gets the highest score (4),
# the farthest gets 0.
bucket_map = {
    label: len(bikelane_labels) - 1 - position
    for position, label in enumerate(bikelane_labels)
}
df["dist_bikelane_bucket_num"] = (
    df["dist_bikelane_bucket"].map(bucket_map).astype(int)
)

# Attraction-count transforms
attraction_count = df["num_attractions_r500"]
df["log_attractions"] = np.log1p(attraction_count)
df["sqrt_attractions"] = np.sqrt(attraction_count)
df["attractions_bucket"] = pd.cut(
    attraction_count,
    bins=[-1, 0, 3, 10, np.inf],
    labels=["none", "few", "medium", "many"],
)

print("Feature engineering done")


In [None]:
# Columns removed before modelling. The reasons are the original author's notes.
to_drop = [
    "dist_to_park",           # very high collinearity -> drop
    "dist_bikelane_norm",     # not needed by tree models -> drop
]

# errors="ignore" skips any listed column that is not present in the frame,
# so only the columns that actually exist are dropped.
df = df.drop(columns=to_drop, errors="ignore")


In [None]:
# One-hot encode the attractions bucket. drop_first=True omits the indicator
# for the first category, leaving one column per remaining category, each
# prefixed with "attractions_bucket_".
#
# get_dummies replaces the source column with the indicator columns, so a second
# run of this cell would raise KeyError. The guard makes the cell safe to re-run.
if "attractions_bucket" in df.columns:
    df = pd.get_dummies(df, columns=["attractions_bucket"], drop_first=True)


In [None]:
# Model inputs: coordinates, attraction-count features, bike-lane distance features.
features = (
    ["lat", "lng"]
    + ["num_attractions_r500", "log_attractions", "sqrt_attractions"]
    + [
        "dist_to_bikelane",
        "log_dist_bikelane",
        "inv_dist_bikelane",
        "dist_bikelane_clipped",
        "dist_bikelane_bucket_num",
    ]
)

# Append every one-hot column currently present in the frame.
features.extend(
    column for column in df.columns if column.startswith("attractions_bucket_")
)

# Feature matrix and the total-usage target.
X, y = df[features], df["usage_total"]

print("Final feature count:", len(features))
X.head()


In [None]:
# No subsampling is applied: df_sample is simply another name for the full frame.
df_sample = df

X_small, y_small = df_sample[features], df_sample["usage_total"]

# Hold out 25 % of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_small,
    y_small,
    test_size=0.25,
    random_state=42,
)


In [None]:
# Gradient-boosted regressor for total usage; hyperparameters are collected in
# one mapping and unpacked into the constructor.
xgb_params = {
    "n_estimators": 300,
    "learning_rate": 0.05,
    "max_depth": 5,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "objective": "reg:squarederror",
}
model = XGBRegressor(**xgb_params)
model.fit(X_train, y_train)

# Score on the held-out rows only.
preds = model.predict(X_test)
mse = mean_squared_error(y_test, preds)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, preds)

print("RMSE:", rmse)
print("R²:", r2)


In [None]:
# Importances reported by the most recently fitted `model`, ordered ascending
# so the largest bar is drawn at the top of the horizontal bar chart.
importance = model.feature_importances_
sorted_idx = np.argsort(importance)

fig, ax = plt.subplots(figsize=(8, 6))
ax.barh(np.array(features)[sorted_idx], importance[sorted_idx])
ax.set_title("Feature Importances")
plt.show()


In [None]:
# Explicit, fixed list of the columns fed to the monthly models: the numeric
# features followed by the three one-hot attraction-bucket indicators.
feature_cols = [
    "lat",
    "lng",
    "num_attractions_r500",
    "log_attractions",
    "sqrt_attractions",
    "dist_to_bikelane",
    "log_dist_bikelane",
    "inv_dist_bikelane",
    "dist_bikelane_clipped",
    "dist_bikelane_bucket_num",
] + [f"attractions_bucket_{level}" for level in ("few", "medium", "many")]


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import pandas as pd

# Feature matrix shared by all twelve monthly models.
X = df[feature_cols]

monthly_results = []   # one (month, RMSE, R2) tuple per month, scored on the held-out split
models = {}            # month number -> fitted RandomForestRegressor

# Give the prediction frame the same index as df. Later cells combine pred_df
# and df with label-aligned operations (e.g. pred_df[...] - df[...]); an
# index-less frame would get a fresh 0..n-1 index and could mis-pair rows if
# df's index is ever anything else.
pred_df = pd.DataFrame(index=df.index)

for m in range(1, 13):
    target = f"usage_month_{m}"
    y = df[target]

    # Same seed and test fraction every iteration, so every month is evaluated
    # on the same held-out rows.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    model = RandomForestRegressor(
        n_estimators=600,
        max_depth=None,
        random_state=42
    )
    model.fit(X_train, y_train)

    # Held-out performance for this month.
    preds = model.predict(X_test)

    rmse = np.sqrt(mean_squared_error(y_test, preds))
    r2 = r2_score(y_test, preds)

    monthly_results.append((m, rmse, r2))
    models[m] = model

    # Store predictions for every row for later aggregation.
    # NOTE: this includes the rows the model was trained on, so anything
    # computed from pred_df over all rows is partly an in-sample measure.
    pred_df[f"pred_month_{m}"] = model.predict(X)

monthly_results


In [None]:
# Tabulate the per-month held-out scores: one row per (month, RMSE, R2) tuple.
metric_names = ["month", "RMSE", "R2"]
results_df = pd.DataFrame.from_records(monthly_results, columns=metric_names)
results_df


In [None]:
# Sum only the per-month prediction columns. Summing the whole frame would also
# add an existing "pred_total" column on a re-run, inflating the totals each
# time the cell is executed.
month_pred_cols = [c for c in pred_df.columns if c.startswith("pred_month_")]
pred_df["pred_total"] = pred_df[month_pred_cols].sum(axis=1)

# Metrics over ALL rows. The monthly models were fitted on 80 % of these rows,
# so these figures are largely in-sample and optimistic.
r2_total = r2_score(df["usage_total"], pred_df["pred_total"])
rmse_total = np.sqrt(mean_squared_error(df["usage_total"], pred_df["pred_total"]))

print("Monthly-sum RMSE:", rmse_total)
print("Monthly-sum R²:", r2_total)

# Metrics restricted to the rows held out from every monthly model.
# The split is reproduced positionally and must mirror the test_size and
# random_state used when the monthly models were fitted (0.2 and 42).
_, holdout_pos = train_test_split(
    np.arange(len(df)), test_size=0.2, random_state=42
)
actual_holdout = df["usage_total"].to_numpy()[holdout_pos]
pred_holdout = pred_df["pred_total"].to_numpy()[holdout_pos]

r2_total_holdout = r2_score(actual_holdout, pred_holdout)
rmse_total_holdout = np.sqrt(mean_squared_error(actual_holdout, pred_holdout))

print("Monthly-sum RMSE (held-out rows only):", rmse_total_holdout)
print("Monthly-sum R² (held-out rows only):", r2_total_holdout)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Scatter of actual total usage against the summed monthly predictions.
fig, ax = plt.subplots(figsize=(7, 7))
sns.scatterplot(x=df["usage_total"], y=pred_df["pred_total"], ax=ax)

# Add the y = x reference line (perfect prediction), from the origin to the
# larger of the two maxima.
max_val = max(df["usage_total"].max(), pred_df["pred_total"].max())
ax.plot([0, max_val], [0, max_val], 'r--')

ax.set_xlabel("Actual usage_total")
ax.set_ylabel("Predicted usage_total (sum of monthly)")
ax.set_title("Actual vs Predicted Total Usage")
ax.grid(True)
plt.show()


In [None]:
# Signed error per station: positive means the model over-predicted.
errors = pred_df["pred_total"].sub(df["usage_total"])

fig, ax = plt.subplots(figsize=(7, 5))
sns.scatterplot(x=df["usage_total"], y=errors, ax=ax)
# Zero-error reference line.
ax.axhline(0, color="red", linestyle="--")

ax.set_xlabel("Actual usage_total")
ax.set_ylabel("Prediction Error (Pred - Actual)")
ax.set_title("Residual Plot")
ax.grid(True)
plt.show()


In [None]:
# Histogram (30 bins) of the signed errors with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(7, 4))
sns.histplot(errors, kde=True, bins=30, ax=ax)

ax.set_xlabel("Prediction Error")
ax.set_title("Distribution of Total Usage Prediction Error")
plt.show()


In [None]:
# Pair actual totals with the summed monthly predictions (aligned on index).
df_eval = pd.DataFrame({
    "actual": df["usage_total"],
    "pred": pred_df["pred_total"],
})

# Split stations into four equal-sized groups by actual usage.
df_eval["usage_group"] = pd.qcut(df_eval["actual"], q=4, labels=["Low", "Mid-Low", "Mid-High", "High"])

df_eval["abs_error"] = (df_eval["pred"] - df_eval["actual"]).abs()

# Relative error is undefined when actual usage is 0. Dividing directly would
# produce inf, which would turn the whole group's mean into inf. Masking zeros
# to NaN lets .mean() skip those rows instead.
nonzero_actual = df_eval["actual"].where(df_eval["actual"] != 0)
df_eval["pct_error"] = df_eval["abs_error"] / nonzero_actual

# observed=False keeps every usage group in the result and makes the
# categorical-groupby behaviour explicit, avoiding the pandas FutureWarning
# about the changing default.
group_stats = (
    df_eval
    .groupby("usage_group", observed=False)[["abs_error", "pct_error"]]
    .mean()
)
group_stats


In [None]:
# Filled 2-D density of (actual, predicted) total usage.
fig, ax = plt.subplots(figsize=(7, 6))
sns.kdeplot(
    x=df["usage_total"],
    y=pred_df["pred_total"],
    fill=True,
    cmap="Blues",
    ax=ax,
)

ax.set_xlabel("Actual usage_total")
ax.set_ylabel("Predicted usage_total")
ax.set_title("KDE: Actual vs Predicted Total Usage")
plt.show()
