In [18]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd
import zipfile
import os
from pathlib import Path

# Load data: unpack the cached archive and read the merged CSV it contains.
# Paths are built with pathlib and kept as forward-slash strings so that any
# code treating them as plain strings keeps working.
zip_file_path = (Path.cwd() / "Cache" / "merged_output.zip").as_posix()
output_dir = (Path.cwd() / "ML_Proj" / "output_csv").as_posix()
Path(output_dir).mkdir(parents=True, exist_ok=True)

with zipfile.ZipFile(zip_file_path, "r") as zip_ref:
    zip_ref.extractall(output_dir)  # extract every member into the target directory
print(f"Files extracted to: {output_dir}")
data_path = (Path(output_dir) / "merged_output.csv").as_posix()

# Verify the expected CSV was actually produced by the extraction. Fail fast
# with a clear message instead of printing and then letting read_csv crash.
if not Path(data_path).is_file():
    raise FileNotFoundError(f"File does not exist: {data_path}")
print(f"File exists: {data_path}")
data = pd.read_csv(data_path)

# Parse the timestamp column. Values that do not match the expected format
# become NaT (errors="coerce"); they are dropped below rather than being cast
# to int64, which would turn each NaT into a huge negative "timestamp".
parsed_time = pd.to_datetime(data["datetime_x"], format="%Y/%m/%d %H:%M", errors="coerce")
n_bad_time = int(parsed_time.isna().sum())
if n_bad_time:
    print(f"Dropping {n_bad_time} rows with missing or unparseable datetime_x")

# Lag features: the delays observed at the previous stop of the same trip.
# Sort once, shift within each trip; the result keeps the original index, so
# it aligns back onto `data` regardless of the sorted row order.
lagged = (
    data.sort_values(["trip_id", "stop_sequence"])
    .groupby("trip_id")[["arrival_delay", "departure_delay"]]
    .shift(1)
)

data = data.assign(
    datetime_x=parsed_time,
    # Binary label: 1 when the arrival is late, 0 otherwise.
    delayed=(data["arrival_delay"] > 0).astype(int),
    lag_arrival_delay=lagged["arrival_delay"],
    lag_departure_delay=lagged["departure_delay"],
)

# Drop rows that cannot be used:
#   - no valid timestamp,
#   - no arrival_delay (the label would silently default to 0, since NaN > 0 is False),
#   - no previous stop within the trip (lag is NaN for the first stop).
data = data.dropna(
    subset=["datetime_x", "arrival_delay", "lag_arrival_delay", "lag_departure_delay"]
)

# Convert to whole seconds since the Unix epoch. The subtraction/floor-division
# form does not depend on the datetime column's internal resolution.
data = data.assign(
    datetime_x=(data["datetime_x"] - pd.Timestamp("1970-01-01")) // pd.Timedelta(seconds=1)
)

# Model inputs: stop identifier and timestamp, weather readings, and the
# delays recorded at the previous stop of the same trip.
input_columns = (
    ["stop_id", "datetime_x"]
    + ["temperature", "precipitation", "snowfall", "snow_depth", "wind_speed", "cloud_cover"]
    + ["lag_arrival_delay", "lag_departure_delay"]
)
X = data.loc[:, input_columns]
y = data.loc[:, "delayed"]  # binary target

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

Files extracted to: C:/Users/13490/ID2223_Project/output_csv
File exists: C:/Users/13490/ID2223_Project/output_csv/merged_output.csv


In [19]:
from xgboost import XGBClassifier
import joblib  # 用于保存模型

# Gradient-boosted binary classifier for the `delayed` label.
# `use_label_encoder` is intentionally not passed: it is deprecated in recent
# XGBoost releases, where supplying it only triggers a warning, and the labels
# are already 0/1 integers so no label encoding is needed.
xgb_model = XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    random_state=42,
    eval_metric="logloss",  # evaluate with log loss
)
xgb_model.fit(X_train, y_train)

# Predictions on the held-out rows: hard class labels and P(delayed = 1).
y_pred = xgb_model.predict(X_test)
y_pred_proba = xgb_model.predict_proba(X_test)[:, 1]

# Persist the fitted model as a pickle file.
# NOTE(review): a joblib/pickle dump is tied to the installed XGBoost version;
# consider xgb_model.save_model(...) if the model must be loaded elsewhere.
output_model_dir = (Path.cwd() / "ML_Proj" / "model").as_posix()
Path(output_model_dir).mkdir(parents=True, exist_ok=True)
output_model = (Path(output_model_dir) / "xgb_model.pkl").as_posix()
joblib.dump(xgb_model, output_model)  # save as a .pkl file

['C:\\Users\\13490\\ID2223_Project\\model\\xgb_model.pkl']