In [1]:
import pykoda
import pandas as pd
import numpy as np
import requests

Config file path: C:\Users\13490\AppData\Local\pykoda\pykoda


In [113]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd

# 加载数据
data_path = "./merged_output.csv"  # 替换为实际文件路径
data = pd.read_csv(data_path)

# 确保数据类型正确
data["datetime_x"] = pd.to_datetime(data['datetime_x'], format='%Y/%m/%d %H:%M', errors='coerce')
data["datetime_x"] = data['datetime_x'].astype('int64') // 10**9  # 转换为秒级时间戳

# 创建二分类标签：延误为 1，否则为 0
data["delayed"] = (data["arrival_delay"] > 0).astype(int)

# 添加滞后特征
data["lag_arrival_delay"] = data.sort_values(["trip_id", "stop_sequence"]).groupby("trip_id")["arrival_delay"].shift(1)
data["lag_departure_delay"] = data.sort_values(["trip_id", "stop_sequence"]).groupby("trip_id")["departure_delay"].shift(1)

data = data.dropna(subset=["lag_arrival_delay", "lag_departure_delay"])

# 定义输入特征和目标变量
input_columns = [
    "stop_id", "datetime_x", "temperature", "precipitation", "snowfall",
    "snow_depth", "wind_speed", "cloud_cover", "lag_arrival_delay", "lag_departure_delay"
]
X = data[input_columns]
print(X.columns)
y = data["delayed"]

# 拆分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
#X.to_csv("./merged_output_new.csv", index=False)

Index(['stop_id', 'datetime_x', 'temperature', 'precipitation', 'snowfall',
       'snow_depth', 'wind_speed', 'cloud_cover', 'lag_arrival_delay',
       'lag_departure_delay'],
      dtype='object')


In [114]:
from xgboost import XGBClassifier
import joblib  # 用于保存模型

xgb_model = XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    random_state=42,
    use_label_encoder=False,
    eval_metric='logloss'  # 使用 logloss 作为评估指标
)
xgb_model.fit(X_train, y_train)

# 预测
y_pred = xgb_model.predict(X_test)
y_pred_proba = xgb_model.predict_proba(X_test)[:, 1]

joblib.dump(xgb_model, "xgb_model.pkl")  # 保存为 .pkl 文件

['xgb_model.pkl']

In [162]:
import joblib

def predict_delay(date, time, stop_id):
    dataset = pd.read_csv("./merged_output_new.csv")
    model = joblib.load("xgb_model.pkl")
    dataset['datetime_x'] = pd.to_datetime(dataset['datetime_x'], errors='coerce')
    stop_id = float(stop_id)
    try:
        # 解析输入的日期和时间
        print(date)
        print(time)
        datetime_input = pd.to_datetime(f"{date} {time}", format="%Y-%m-%d %H:%M")
        # 将数据集中时间列截断到分钟级别
        dataset['datetime_minute'] = dataset['datetime_x'].dt.floor('T')  # 假设 datetime_x 是时间列
        
        matched_row = dataset.loc[
             (dataset['datetime_minute'] == datetime_input)&(dataset['stop_id'] == stop_id)]# 
        
        if matched_row.empty:
            return "No data found for the given date and time."
        
        # 提取指定的特征列
        feature_columns = [
            "stop_id", "datetime_x", "temperature", "precipitation",
            "snowfall", "snow_depth", "wind_speed", "cloud_cover",
            "lag_arrival_delay", "lag_departure_delay"
        ]
        input_features = matched_row[feature_columns].iloc[0].values.reshape(1, -1)
        print(input_features.dtype)
        df = pd.DataFrame(input_features, columns=feature_columns)
        #df['datetime_x'] = pd.to_datetime(df['datetime_x'], errors='coerce')
        df["datetime_x"] = df['datetime_x'].astype('int64') // 10**9
        df[feature_columns] = df[feature_columns].astype(float)
        
        # 使用模型预测延误概率
        delay_probability = model.predict_proba(df)[:, 1]
        
        # 返回预测结果
        return f"Delay Probability is {delay_probability[0]:.2%}"
    
    except ValueError:
        return "Invalid date and time format."
    except KeyError as e:
        return f"Missing feature: {str(e)}"

In [163]:
predict_delay('2022-12-15','20:29','9022001075368001')

2022-12-15
20:29


'No data found for the given date and time.'

In [165]:
predict_delay('2022-12-15','23:29','9022001075368001')

2022-12-15
23:29
object
stop_id                9.022001e+15
datetime_x             1.671147e+09
temperature           -1.450000e+01
precipitation          0.000000e+00
snowfall               0.000000e+00
snow_depth             8.000000e-02
wind_speed             6.600000e+00
cloud_cover            1.000000e+00
lag_arrival_delay      0.000000e+00
lag_departure_delay    0.000000e+00
Name: 0, dtype: float64
Index(['stop_id', 'datetime_x', 'temperature', 'precipitation', 'snowfall',
       'snow_depth', 'wind_speed', 'cloud_cover', 'lag_arrival_delay',
       'lag_departure_delay'],
      dtype='object')


'Delay Probability is 7.27%'

In [166]:
# import joblib
import gradio as gr


# 定义接口
interface = gr.Interface(
    fn=predict_delay,  # 预测函数
    inputs=[
        gr.Textbox(label="Date (YYYY-MM-DD)"),  # 输入日期
        gr.Textbox(label="Time (HH:MM)"),       # 输入时间
        gr.Textbox(label="stop_id")                # 输入站点
    ],
    outputs=gr.Label(label="Probability of Delay"),         # 输出延误概率
    title="Delay Probability Prediction System",
    description="Predict the probability of delay by entering the date, time and station"
)


# 启动应用
interface.launch(share=True)

* Running on local URL:  http://127.0.0.1:7868
* Running on public URL: https://5db16d37b0788dbf2e.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


