In [46]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import log_loss, brier_score_loss, roc_auc_score

# =========================
# 1. 读数据
# =========================
df = pd.read_csv("Wimbledon_featured_matches.csv")

# =========================
# 2. 目标变量
# =========================
# 假设 point_victor: 1 = P1 赢分, 2 = P2 赢分
df = df[df["point_victor"].isin([1, 2])]
df["y"] = (df["point_victor"] == 1).astype(int)

# =========================
# 3. 构造"无记忆"特征
# =========================
# 构造缺失的特征
df["is_break_point"] = ((df["p1_break_pt"] > 0) | (df["p2_break_pt"] > 0)).astype(int)
df["is_tiebreak"] = ((df["p1_score"] == "15") & (df["p2_score"] == "15")).astype(int)  # 简化版
df["is_deuce"] = ((df["p1_score"] == "D") | (df["p2_score"] == "D")).astype(int)

# 计算上一分的持续时间
df["elapsed_seconds"] = pd.to_timedelta(df["elapsed_time"]).dt.total_seconds()
df["point_duration"] = df.groupby("match_id")["elapsed_seconds"].diff()
df["prev_point_duration"] = df.groupby("match_id")["point_duration"].shift(1)

# 编码分类变量
le_serve_width = LabelEncoder()
le_serve_depth = LabelEncoder()
le_return_depth = LabelEncoder()

df["serve_width_encoded"] = le_serve_width.fit_transform(df["serve_width"].astype(str))
df["serve_depth_encoded"] = le_serve_depth.fit_transform(df["serve_depth"].astype(str))
df["return_depth_encoded"] = le_return_depth.fit_transform(df["return_depth"].astype(str))

# 获取上一分的特征值（t-1）
df["prev_p1_distance_run"] = df.groupby("match_id")["p1_distance_run"].shift(1)
df["prev_p2_distance_run"] = df.groupby("match_id")["p2_distance_run"].shift(1)
df["prev_rally_count"] = df.groupby("match_id")["rally_count"].shift(1)
df["prev_speed_mph"] = df.groupby("match_id")["speed_mph"].shift(1)
df["prev_serve_width_encoded"] = df.groupby("match_id")["serve_width_encoded"].shift(1)
df["prev_serve_depth_encoded"] = df.groupby("match_id")["serve_depth_encoded"].shift(1)
df["prev_return_depth_encoded"] = df.groupby("match_id")["return_depth_encoded"].shift(1)

feature_cols = [
    # 发球
    "server",
    "serve_no",

    # 比分 / 阶段
    "set_no",
    "game_no",
    "point_no",
    "p1_games",
    "p2_games",
    "p1_sets",
    "p2_sets",

    # 关键分
    "is_break_point",
    "is_tiebreak",
    "is_deuce",
    
    # 上一分的持续时间
    "prev_point_duration",
    
    # 上一分的特征（t-1）
    "prev_p1_distance_run",
    "prev_p2_distance_run",
    "prev_rally_count",
    "prev_speed_mph",
    "prev_serve_width_encoded",
    "prev_serve_depth_encoded",
    "prev_return_depth_encoded"
]

X = df[feature_cols].copy()
y = df["y"].values

# 缺失值简单处理（baseline）
X = X.fillna(0)

# =========================
# 4. 训练 / 测试切分
#    （注意：这里是"非时序 baseline"，
#     所以允许随机切分）
# =========================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)


X_train.shape

(5098, 20)

In [47]:
# =========================
# 5. LASSO Logistic 回归
# =========================
pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", LogisticRegression(
        penalty="l1",
        solver="liblinear",   # 或 saga
        C=0.05,                # 正则强度，可交叉验证
        max_iter=2000
    ))
])

pipe.fit(X_train, y_train)

# =========================
# 6. 评估
# =========================
proba_test = pipe.predict_proba(X_test)[:, 1]

print("Log loss :", log_loss(y_test, proba_test))
print("Brier    :", brier_score_loss(y_test, proba_test))
print("ROC AUC  :", roc_auc_score(y_test, proba_test))

# =========================
# 7. LASSO 选出来的特征
# =========================
coef = pipe.named_steps["clf"].coef_.flatten()
coef_df = pd.DataFrame({
    "feature": feature_cols,
    "coef": coef
}).sort_values("coef", key=np.abs, ascending=False)

print(coef_df)

Log loss : 0.6431822155056456
Brier    : 0.2210573995545362
ROC AUC  : 0.6821953358704385
                      feature      coef
0                      server -0.728680
6                    p2_games -0.080702
7                     p1_sets  0.068752
5                    p1_games  0.062406
12        prev_point_duration -0.047073
16             prev_speed_mph  0.019502
19  prev_return_depth_encoded -0.018933
13       prev_p1_distance_run  0.006415
10                is_tiebreak  0.003445
4                    point_no  0.000000
8                     p2_sets  0.000000
9              is_break_point  0.000000
1                    serve_no  0.000000
11                   is_deuce  0.000000
3                     game_no  0.000000
14       prev_p2_distance_run  0.000000
15           prev_rally_count  0.000000
2                      set_no  0.000000
17   prev_serve_width_encoded  0.000000
18   prev_serve_depth_encoded  0.000000


In [48]:
from sklearn.ensemble import RandomForestClassifier
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)
proba_test = clf.predict_proba(X_test)[:, 1]
# output roc_auc_score and feature importance
roc_auc = roc_auc_score(y_test, proba_test)
feature_importances = clf.feature_importances_
importance_df = pd.DataFrame({
    "feature": feature_cols,
    "importance": feature_importances
}).sort_values("importance", ascending=False)
print("ROC AUC  :", roc_auc)
print(importance_df)

ROC AUC  : 0.7065027907876781
                      feature  importance
0                      server    0.114743
14       prev_p2_distance_run    0.107708
13       prev_p1_distance_run    0.107345
4                    point_no    0.101403
12        prev_point_duration    0.099525
16             prev_speed_mph    0.091447
3                     game_no    0.048781
15           prev_rally_count    0.045939
1                    serve_no    0.042774
17   prev_serve_width_encoded    0.041049
5                    p1_games    0.039604
6                    p2_games    0.038330
19  prev_return_depth_encoded    0.023965
2                      set_no    0.023551
8                     p2_sets    0.019264
7                     p1_sets    0.018767
18   prev_serve_depth_encoded    0.017721
10                is_tiebreak    0.009264
9              is_break_point    0.008821
11                   is_deuce    0.000000


In [49]:
# =========================
# 8. LASSO - 使用 H:AM 列范围 + 之前的特征（所有特征）
# =========================
# H:AM 对应的列（从p1_sets到p2_break_pt_missed）
feature_cols_h_am = [
    "server", "serve_no",  # N, O
    "p1_break_pt", "p2_break_pt",  # AH, AI
    # 之前的特征
    "is_break_point", "is_tiebreak", "is_deuce",
    "set_no", "game_no", "point_no",
    # 上一分的持续时间
    "prev_point_duration",
    # 上一分的特征（t-1）
    "prev_p1_distance_run",
    "prev_p2_distance_run",
    "prev_rally_count",
    "prev_speed_mph",
    "prev_serve_width_encoded",
    "prev_serve_depth_encoded",
    "prev_return_depth_encoded"
]

# 准备数据
X_combined = df[feature_cols_h_am].copy()

# 填充缺失值
X_combined = X_combined.fillna(0)

# 分割数据
X_train_combined, X_test_combined, y_train_combined, y_test_combined = train_test_split(
    X_combined, y, test_size=0.3, random_state=42
)

# 构建LASSO模型
pipe_combined = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", LogisticRegression(
        penalty="l1",
        solver="liblinear",
        C=0.1,
        max_iter=2000
    ))
])

pipe_combined.fit(X_train_combined, y_train_combined)

# 评估
proba_test_combined = pipe_combined.predict_proba(X_test_combined)[:, 1]

print("="*50)
print("完整特征集 LASSO 模型结果")
print("="*50)
print("Log loss :", log_loss(y_test_combined, proba_test_combined))
print("Brier    :", brier_score_loss(y_test_combined, proba_test_combined))
print("ROC AUC  :", roc_auc_score(y_test_combined, proba_test_combined))

# 特征重要性
coef_combined = pipe_combined.named_steps["clf"].coef_.flatten()
coef_df_combined = pd.DataFrame({
    "feature": feature_cols_h_am,
    "coef": coef_combined
}).sort_values("coef", key=np.abs, ascending=False)

print("\n特征系数（按绝对值排序）：")
print(coef_df_combined)


完整特征集 LASSO 模型结果
Log loss : 0.6478947072782365
Brier    : 0.22114148548492443
ROC AUC  : 0.6791025584105201

特征系数（按绝对值排序）：
                      feature      coef
0                      server -0.722753
10        prev_point_duration -0.067834
7                      set_no  0.061000
14             prev_speed_mph  0.029487
17  prev_return_depth_encoded -0.026062
5                 is_tiebreak  0.012772
11       prev_p1_distance_run  0.010390
3                 p2_break_pt  0.008497
8                     game_no -0.006921
9                    point_no  0.002134
1                    serve_no  0.002023
15   prev_serve_width_encoded -0.001794
6                    is_deuce  0.000000
4              is_break_point  0.000000
12       prev_p2_distance_run  0.000000
13           prev_rally_count  0.000000
2                 p1_break_pt  0.000000
16   prev_serve_depth_encoded  0.000000


In [50]:
# =========================
# 9. 动量-残差 ARX（惯性 + 外生输入）
# 使用与逻辑回归相同的数据，拟合残差的ARX结构
# =========================
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# 1) 逻辑回归在全样本的残差 r_t = y_t - p_hat_t
proba_all = pipe.predict_proba(X)[:, 1]
df["resid"] = y - proba_all

# 2) 构造 ARX 输入：r_{t-1} 与 u_{t-1}
#    其中 u_{t-1} 选用我们已经构造好的上一分特征 prev_*
u_prev_cols = [
    "prev_point_duration",
    "prev_p1_distance_run",
    "prev_p2_distance_run",
    "prev_rally_count",
    "prev_speed_mph",
    "prev_serve_width_encoded",
    "prev_serve_depth_encoded",
    "prev_return_depth_encoded",
]

df["resid_prev"] = df.groupby("match_id")["resid"].shift(1)

use_cols = ["resid", "resid_prev"] + u_prev_cols
arx_df = df[use_cols].copy().dropna()

Z = arx_df[["resid_prev"] + u_prev_cols].values
r = arx_df["resid"].values

# 3) 拟合线性 ARX：r_t = φ r_{t-1} + γ^T u_{t-1} + e_t
arx = LinearRegression()
arx.fit(Z, r)

r_hat = arx.predict(Z)
rmse = mean_squared_error(r, r_hat, squared=False)
r2 = r2_score(r, r_hat)

print("="*50)
print("动量-残差 ARX 拟合结果")
print("="*50)
print("RMSE(resid):", rmse)
print("R^2(resid) :", r2)

coef_names = ["φ(resid_prev)"] + [f"γ({c})" for c in u_prev_cols]
coef_vals = np.concatenate(([arx.coef_[0]], arx.coef_[1:]))
for name, val in zip(coef_names, coef_vals):
    print(f"{name:>24} : {val:+.6f}")
print(f"截距(intercept)     : {arx.intercept_:+.6f}")

# 4) 用 r_hat 调整逻辑概率：p_adj = clip(p_hat + r_hat)
#    注意：r_hat 只在 arx_df 的索引处有值，我们在这些样本上比较指标
idx = arx_df.index
p_base_sub = proba_all[idx]
p_adj_sub = np.clip(p_base_sub + r_hat, 1e-6, 1-1e-6)
y_sub = y[idx]

print("\n" + "-"*50)
print("在ARX可用样本上的评估（基线 vs 动量修正）")
print("-"*50)
from sklearn.metrics import log_loss, brier_score_loss, roc_auc_score
print("Baseline  LogLoss:", log_loss(y_sub, p_base_sub))
print("Momentum  LogLoss:", log_loss(y_sub, p_adj_sub))
print("Baseline  Brier  :", brier_score_loss(y_sub, p_base_sub))
print("Momentum  Brier  :", brier_score_loss(y_sub, p_adj_sub))
print("Baseline  ROC AUC:", roc_auc_score(y_sub, p_base_sub))
print("Momentum  ROC AUC:", roc_auc_score(y_sub, p_adj_sub))

# 可选：根据拟合系数递推估计 M_t（设 α=1），用于直观展示
# M_t = φ M_{t-1} + γ^T u_{t-1}
Mt = np.zeros(len(df))
phi = arx.coef_[0]
gamma = arx.coef_[1:]

# 按比赛内顺序递推
for mid, g in df.groupby("match_id").groups.items():
    rows = df.loc[g].index
    for i, idx_i in enumerate(rows):
        if i == 0:
            Mt[df.index.get_loc(idx_i)] = 0.0
        else:
            # u_{t-1}取的是当前行的 prev_* 特征
            u_prev = df.loc[idx_i, u_prev_cols].values.astype(float)
            Mt[df.index.get_loc(idx_i)] = phi * Mt[df.index.get_loc(rows[i-1])] + gamma.dot(u_prev)

# 简要展示动量的分布
print("\n动量 M_t 概要（全体样本）:")
print(pd.Series(Mt).describe())


动量-残差 ARX 拟合结果
RMSE(resid): 0.46649670316711367
R^2(resid) : 0.0019627505495771747
           φ(resid_prev) : +0.020668
  γ(prev_point_duration) : +0.000011
 γ(prev_p1_distance_run) : +0.000310
 γ(prev_p2_distance_run) : -0.002010
     γ(prev_rally_count) : +0.007406
       γ(prev_speed_mph) : +0.000392
γ(prev_serve_width_encoded) : -0.006855
γ(prev_serve_depth_encoded) : -0.011537
γ(prev_return_depth_encoded) : -0.003632
截距(intercept)     : -0.010567

--------------------------------------------------
在ARX可用样本上的评估（基线 vs 动量修正）
--------------------------------------------------
Baseline  LogLoss: 0.6308681982374479
Momentum  LogLoss: 0.6266688274642426
Baseline  Brier  : 0.21811676172770358
Momentum  Brier  : 0.21761917406578618
Baseline  ROC AUC: 0.691431919054567
Momentum  ROC AUC: 0.6955928244578047





动量 M_t 概要（全体样本）:
count    31.0
mean      0.0
std       0.0
min       0.0
25%       0.0
50%       0.0
75%       0.0
max       0.0
dtype: float64


In [55]:
# =========================
# 10. 贝叶斯残差模型（Pyro）
# 用贝叶斯方法拟合 r_t ~ Normal(φ r_{t-1} + γ^T u_{t-1}, σ_M)
# 并在 logit 空间用 α·M_t 修正逻辑概率
# =========================
import torch
import pyro
import pyro.distributions as dist
from pyro.infer import SVI, Trace_ELBO
from pyro.optim import ClippedAdam

# 准备数据（与前面一致）
proba_all = pipe.predict_proba(X)[:, 1]
df["resid"] = y - proba_all

a = df.copy()
a["resid_prev"] = a.groupby("match_id")["resid"].shift(1)

u_prev_cols = [
    "prev_point_duration",
    "prev_p1_distance_run",
    "prev_p2_distance_run",
    "prev_rally_count",
    "prev_speed_mph",
    "prev_serve_width_encoded",
    "prev_serve_depth_encoded",
    "prev_return_depth_encoded",
]

cols = ["resid", "resid_prev"] + u_prev_cols
b = a[cols].dropna()

r_t = torch.tensor(b["resid"].values, dtype=torch.float32)
r_prev = torch.tensor(b["resid_prev"].values, dtype=torch.float32)
U_prev = torch.tensor(b[u_prev_cols].values, dtype=torch.float32)

# 模型：r_t ~ Normal(φ r_{t-1} + γ^T u_{t-1}, σ)
def model(r_prev, U_prev, r_t):
    phi = pyro.sample("phi", dist.Normal(0.0, 1.0))
    # γ 是长度为 k 的向量参数，需标记为事件维度
    gamma = pyro.sample(
        "gamma",
        dist.Normal(torch.zeros(U_prev.shape[1]), torch.ones(U_prev.shape[1])).to_event(1)
    )
    sigma = pyro.sample("sigma", dist.HalfNormal(1.0))
    mu = phi * r_prev + (U_prev @ gamma)
    with pyro.plate("data", len(r_t)):
        pyro.sample("obs", dist.Normal(mu, sigma), obs=r_t)

# 引导（均值场）
def guide(r_prev, U_prev, r_t):
    phi_loc = pyro.param("phi_loc", torch.tensor(0.0))
    phi_scale = pyro.param("phi_scale", torch.tensor(0.1), constraint=dist.constraints.positive)
    gamma_loc = pyro.param("gamma_loc", torch.zeros(U_prev.shape[1]))
    gamma_scale = pyro.param("gamma_scale", torch.ones(U_prev.shape[1]) * 0.1, constraint=dist.constraints.positive)
    sigma_scale = pyro.param("sigma_scale", torch.tensor(0.5), constraint=dist.constraints.positive)
    pyro.sample("phi", dist.Normal(phi_loc, phi_scale))
    pyro.sample("gamma", dist.Normal(gamma_loc, gamma_scale).to_event(1))
    pyro.sample("sigma", dist.HalfNormal(sigma_scale))

pyro.clear_param_store()
optimizer = ClippedAdam({"lr": 0.02})
svi = SVI(model, guide, optimizer, loss=Trace_ELBO())

for step in range(2000):
    loss = svi.step(r_prev, U_prev, r_t)
    if step % 200 == 0:
        print(f"step {step}, ELBO: {loss:.4f}")

phi_est = pyro.param("phi_loc").item()
gamma_est = pyro.param("gamma_loc").detach().numpy()
sigma_est = pyro.param("sigma_scale").item()
print("\n贝叶斯残差参数（均值场近似）：")
print("phi :", phi_est)
print("gamma:", gamma_est)
print("sigma:", sigma_est)

# 仅在可用索引子集上递推动量 M_t，并在 logit 空间修正概率
alpha = 1.0
idx = b.index
p_base_sub = proba_all[idx]

# 在子集上按比赛内顺序递推 M_t：M_t = φ M_{t-1} + γ^T u_{t-1}
Mt_sub = np.zeros(len(idx), dtype=float)
phi = float(phi_est)
gamma = gamma_est.astype(float)

# 为子集构造 match_id 序列
match_ids_sub = a.loc[idx, "match_id"].values
u_prev_sub = a.loc[idx, u_prev_cols].values.astype(float)

# 按子集内的顺序递推（每个比赛独立）
import itertools
for mid, group_positions in itertools.groupby(range(len(idx)), key=lambda k: match_ids_sub[k]):
    group_positions = list(group_positions)
    for i, pos in enumerate(group_positions):
        if i == 0:
            Mt_sub[pos] = 0.0
        else:
            u_prev_vec = u_prev_sub[pos]
            Mt_sub[pos] = phi * Mt_sub[group_positions[i-1]] + gamma.dot(u_prev_vec)

# logit(p) = log(p / (1 - p))
logit_base_sub = np.log(p_base_sub / (1.0 - p_base_sub))
logit_adj_sub = logit_base_sub + alpha * Mt_sub
# sigmoid(x) = 1 / (1 + exp(-x))
p_adj_sub = 1.0 / (1.0 + np.exp(-logit_adj_sub))
y_sub = y[idx]

from sklearn.metrics import log_loss, brier_score_loss, roc_auc_score
print("\n评估（基线 vs 贝叶斯动量 logit 修正）")
print("Baseline  LogLoss:", log_loss(y_sub, p_base_sub))
print("Momentum  LogLoss:", log_loss(y_sub, p_adj_sub))
print("Baseline  Brier  :", brier_score_loss(y_sub, p_base_sub))
print("Momentum  Brier  :", brier_score_loss(y_sub, p_adj_sub))
print("Baseline  ROC AUC:", roc_auc_score(y_sub, p_base_sub))
print("Momentum  ROC AUC:", roc_auc_score(y_sub, p_adj_sub))

step 0, ELBO: 849964497.0368
step 200, ELBO: 27031.6255
step 400, ELBO: 31036.4011
step 600, ELBO: 14813.5879
step 800, ELBO: 83885.4183
step 1000, ELBO: 17051.3672
step 1200, ELBO: 12838.0179
step 1400, ELBO: 17225.5356
step 1600, ELBO: 17102.1694
step 1800, ELBO: 17269.2749

贝叶斯残差参数（均值场近似）：
phi : 0.06434185802936554
gamma: [-1.7576369e-03 -2.0473713e-02 -8.5522393e-03 -2.6840135e-05
 -3.1604758e-03 -4.2659398e-03 -6.4472128e-03  5.9372962e-02]
sigma: 3.4853947162628174

评估（基线 vs 贝叶斯动量 logit 修正）
Baseline  LogLoss: 0.6308681982374479
Momentum  LogLoss: 0.7364615787389565
Baseline  Brier  : 0.21811676172770358
Momentum  Brier  : 0.25854201294571383
Baseline  ROC AUC: 0.691431919054567
Momentum  ROC AUC: 0.6760072343643766
