In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import log_loss, brier_score_loss, roc_auc_score

# =========================
# 1. 读数据
# =========================
try:
    df = pd.read_csv("2024_Wimbledon_featured_matches.csv")
except:
    df = pd.read_csv("Wimbledon_featured_matches.csv")

# =========================
# 2. 目标变量
# =========================
# 假设 point_victor: 1 = P1 赢分, 2 = P2 赢分
df = df[df["point_victor"].isin([1, 2])]
df["y"] = (df["point_victor"] == 1).astype(int)

# =========================
# 3. 构造"无记忆"特征
# =========================
# 构造缺失的特征
df["is_break_point"] = ((df["p1_break_pt"] > 0) | (df["p2_break_pt"] > 0)).astype(int)
df["is_tiebreak"] = ((df["p1_score"] == "15") & (df["p2_score"] == "15")).astype(int)  # 简化版
df["is_deuce"] = ((df["p1_score"] == "D") | (df["p2_score"] == "D")).astype(int)

# 计算上一分的持续时间
df["elapsed_seconds"] = pd.to_timedelta(df["elapsed_time"]).dt.total_seconds()
df["point_duration"] = df.groupby("match_id")["elapsed_seconds"].diff()
df["prev_point_duration"] = df.groupby("match_id")["point_duration"].shift(1)

# 编码分类变量
le_serve_width = LabelEncoder()
le_serve_depth = LabelEncoder()
le_return_depth = LabelEncoder()

df["serve_width_encoded"] = le_serve_width.fit_transform(df["serve_width"].astype(str))
df["serve_depth_encoded"] = le_serve_depth.fit_transform(df["serve_depth"].astype(str))
df["return_depth_encoded"] = le_return_depth.fit_transform(df["return_depth"].astype(str))

# 获取上一分的特征值（t-1）
df["prev_p1_distance_run"] = df.groupby("match_id")["p1_distance_run"].shift(1)
df["prev_p2_distance_run"] = df.groupby("match_id")["p2_distance_run"].shift(1)
df["prev_rally_count"] = df.groupby("match_id")["rally_count"].shift(1)
df["prev_speed_mph"] = df.groupby("match_id")["speed_mph"].shift(1)
df["prev_serve_width_encoded"] = df.groupby("match_id")["serve_width_encoded"].shift(1)
df["prev_serve_depth_encoded"] = df.groupby("match_id")["serve_depth_encoded"].shift(1)
df["prev_return_depth_encoded"] = df.groupby("match_id")["return_depth_encoded"].shift(1)

feature_cols = [
    # 发球
    "server",
    "serve_no",

    # 比分 / 阶段
    "set_no",
    "game_no",
    "point_no",
    "p1_games",
    "p2_games",
    "p1_sets",
    "p2_sets",

    # 关键分
    "is_break_point",
    "is_tiebreak",
    "is_deuce",
    
    # 上一分的持续时间
    "prev_point_duration",
    
    # 上一分的特征（t-1）
    "prev_p1_distance_run",
    "prev_p2_distance_run",
    "prev_rally_count",
    "prev_speed_mph",
    "prev_serve_width_encoded",
    "prev_serve_depth_encoded",
    "prev_return_depth_encoded"
]

X = df[feature_cols].copy()
y = df["y"].values

# 缺失值简单处理（baseline）
X = X.fillna(0)

# =========================
# 4. 训练 / 测试切分
#    （注意：这里是"非时序 baseline"，
#     所以允许随机切分）
# =========================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)


X_train.shape

(5098, 20)

In [2]:
# =========================
# 5. LASSO Logistic 回归
# =========================
pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", LogisticRegression(
        penalty="l1",
        solver="liblinear",   # 或 saga
        C=0.05,                # 正则强度，可交叉验证
        max_iter=2000
    ))
])

pipe.fit(X_train, y_train)

# =========================
# 6. 评估
# =========================
proba_test = pipe.predict_proba(X_test)[:, 1]

print("Log loss :", log_loss(y_test, proba_test))
print("Brier    :", brier_score_loss(y_test, proba_test))
print("ROC AUC  :", roc_auc_score(y_test, proba_test))

# =========================
# 7. LASSO 选出来的特征
# =========================
coef = pipe.named_steps["clf"].coef_.flatten()
coef_df = pd.DataFrame({
    "feature": feature_cols,
    "coef": coef
}).sort_values("coef", key=np.abs, ascending=False)

print(coef_df)

Log loss : 0.6431820916190735
Brier    : 0.22105732443499407
ROC AUC  : 0.6821886342941439
                      feature      coef
0                      server -0.728673
6                    p2_games -0.080694
7                     p1_sets  0.068754
5                    p1_games  0.062375
12        prev_point_duration -0.047074
16             prev_speed_mph  0.019503
19  prev_return_depth_encoded -0.018933
13       prev_p1_distance_run  0.006415
10                is_tiebreak  0.003445
3                     game_no  0.000000
1                    serve_no  0.000000
2                      set_no  0.000000
11                   is_deuce  0.000000
9              is_break_point  0.000000
8                     p2_sets  0.000000
4                    point_no  0.000000
15           prev_rally_count  0.000000
14       prev_p2_distance_run  0.000000
17   prev_serve_width_encoded  0.000000
18   prev_serve_depth_encoded  0.000000




In [3]:
from sklearn.ensemble import RandomForestClassifier
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)
proba_test = clf.predict_proba(X_test)[:, 1]
# output roc_auc_score and feature importance
roc_auc = roc_auc_score(y_test, proba_test)
feature_importances = clf.feature_importances_
importance_df = pd.DataFrame({
    "feature": feature_cols,
    "importance": feature_importances
}).sort_values("importance", ascending=False)
print("ROC AUC  :", roc_auc)
print(importance_df)

ROC AUC  : 0.7065027907876781
                      feature  importance
0                      server    0.114743
14       prev_p2_distance_run    0.107708
13       prev_p1_distance_run    0.107345
4                    point_no    0.101403
12        prev_point_duration    0.099525
16             prev_speed_mph    0.091447
3                     game_no    0.048781
15           prev_rally_count    0.045939
1                    serve_no    0.042774
17   prev_serve_width_encoded    0.041049
5                    p1_games    0.039604
6                    p2_games    0.038330
19  prev_return_depth_encoded    0.023965
2                      set_no    0.023551
8                     p2_sets    0.019264
7                     p1_sets    0.018767
18   prev_serve_depth_encoded    0.017721
10                is_tiebreak    0.009264
9              is_break_point    0.008821
11                   is_deuce    0.000000


In [4]:
# =========================
# 8. LASSO - 使用 H:AM 列范围 + 之前的特征（所有特征）
# =========================
# H:AM 对应的列（从p1_sets到p2_break_pt_missed）
feature_cols_h_am = [
    "server", "serve_no",  # N, O
    "p1_break_pt", "p2_break_pt",  # AH, AI
    # 之前的特征
    "is_break_point", "is_tiebreak", "is_deuce",
    "set_no", "game_no", "point_no",
    # 上一分的持续时间
    "prev_point_duration",
    # 上一分的特征（t-1）
    "prev_p1_distance_run",
    "prev_p2_distance_run",
    "prev_rally_count",
    "prev_speed_mph",
    "prev_serve_width_encoded",
    "prev_serve_depth_encoded",
    "prev_return_depth_encoded"
]

# 准备数据
X_combined = df[feature_cols_h_am].copy()

# 填充缺失值
X_combined = X_combined.fillna(0)

# 分割数据
X_train_combined, X_test_combined, y_train_combined, y_test_combined = train_test_split(
    X_combined, y, test_size=0.3, random_state=42
)

# 构建LASSO模型
pipe_combined = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", LogisticRegression(
        penalty="l1",
        solver="liblinear",
        C=0.1,
        max_iter=2000
    ))
])

pipe_combined.fit(X_train_combined, y_train_combined)

# 评估
proba_test_combined = pipe_combined.predict_proba(X_test_combined)[:, 1]

print("="*50)
print("完整特征集 LASSO 模型结果")
print("="*50)
print("Log loss :", log_loss(y_test_combined, proba_test_combined))
print("Brier    :", brier_score_loss(y_test_combined, proba_test_combined))
print("ROC AUC  :", roc_auc_score(y_test_combined, proba_test_combined))

# 特征重要性
coef_combined = pipe_combined.named_steps["clf"].coef_.flatten()
coef_df_combined = pd.DataFrame({
    "feature": feature_cols_h_am,
    "coef": coef_combined
}).sort_values("coef", key=np.abs, ascending=False)

print("\n特征系数（按绝对值排序）：")
print(coef_df_combined)


完整特征集 LASSO 模型结果
Log loss : 0.6478881416954687
Brier    : 0.22114038424447113
ROC AUC  : 0.6791511448386554

特征系数（按绝对值排序）：
                      feature      coef
0                      server -0.722756
10        prev_point_duration -0.067815
7                      set_no  0.058861
14             prev_speed_mph  0.029560
17  prev_return_depth_encoded -0.026061
5                 is_tiebreak  0.012779
11       prev_p1_distance_run  0.010381
3                 p2_break_pt  0.008491
8                     game_no -0.007493
9                    point_no  0.004410
1                    serve_no  0.002019
15   prev_serve_width_encoded -0.001787
2                 p1_break_pt  0.000000
4              is_break_point  0.000000
6                    is_deuce  0.000000
12       prev_p2_distance_run  0.000000
13           prev_rally_count  0.000000
16   prev_serve_depth_encoded  0.000000




In [5]:
# =========================
# 9. 动量-残差 ARX（惯性 + 外生输入）
# 修正：对 u_prev 特征先填充缺失，确保 M_t 不被 NaN 传播；按比赛内顺序递推
# =========================
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# 1) 逻辑回归在全样本的残差 r_t = y_t - p_hat_t
proba_all = pipe.predict_proba(X)[:, 1]
df["resid"] = y - proba_all

# 2) 构造 ARX 输入：r_{t-1} 与 u_{t-1}
u_prev_cols = [
    "prev_point_duration",
    "prev_p1_distance_run",
    "prev_p2_distance_run",
    "prev_rally_count",
    "prev_speed_mph",
    "prev_serve_width_encoded",
    "prev_serve_depth_encoded",
    "prev_return_depth_encoded",
 ]

# 关键修正：先处理缺失（首分的 prev_* 等结构性缺失全部置 0）
df[u_prev_cols] = df[u_prev_cols].fillna(0)

# 按比赛内顺序排序，避免乱序导致递推不一致
df_sorted = df.sort_values(["match_id", "set_no", "game_no", "point_no", "elapsed_seconds"], kind="mergesort")

df_sorted["resid_prev"] = df_sorted.groupby("match_id")["resid"].shift(1)

use_cols = ["resid", "resid_prev"] + u_prev_cols
arx_df = df_sorted[use_cols].copy().dropna()

Z = arx_df[["resid_prev"] + u_prev_cols].values
r = arx_df["resid"].values

# 3) 拟合线性 ARX：r_t = φ r_{t-1} + γ^T u_{t-1} + e_t
arx = LinearRegression()
arx.fit(Z, r)

r_hat = arx.predict(Z)
rmse = mean_squared_error(r, r_hat)
r2 = r2_score(r, r_hat)

print("="*50)
print("动量-残差 ARX 拟合结果（填补 NaN 后）")
print("="*50)
print("RMSE(resid):", rmse)
print("R^2(resid) :", r2)

coef_names = ["phi(resid_prev)"] + [f"gamma({c})" for c in u_prev_cols]
coef_vals = np.concatenate(([arx.coef_[0]], arx.coef_[1:]))
for name, val in zip(coef_names, coef_vals):
    print(f"{name:>24} : {val:+.6f}")
print(f"截距(intercept)     : {arx.intercept_:+.6f}")

# 4) 用 r_hat 调整逻辑概率：p_adj = clip(p_hat + r_hat)
idx = arx_df.index
p_base_series = pd.Series(proba_all, index=df.index)
y_series = pd.Series(y, index=df.index)
p_base_sub = p_base_series.loc[idx].values
p_adj_sub = np.clip(p_base_sub + r_hat, 1e-6, 1 - 1e-6)
y_sub = y_series.loc[idx].values

print("\n" + "-"*50)
print("在ARX可用样本上的评估（基线 vs 动量修正）")
print("-"*50)
from sklearn.metrics import log_loss, brier_score_loss, roc_auc_score
print("Baseline  LogLoss:", log_loss(y_sub, p_base_sub))
print("Momentum  LogLoss:", log_loss(y_sub, p_adj_sub))
print("Baseline  Brier  :", brier_score_loss(y_sub, p_base_sub))
print("Momentum  Brier  :", brier_score_loss(y_sub, p_adj_sub))
print("Baseline  ROC AUC:", roc_auc_score(y_sub, p_base_sub))
print("Momentum  ROC AUC:", roc_auc_score(y_sub, p_adj_sub))

# 5) 按比赛顺序递推动量 M_t（避免 NaN 传染）
Mt = pd.Series(np.nan, index=df_sorted.index, dtype=float)
phi = arx.coef_[0]
gamma = arx.coef_[1:]

for mid, g in df_sorted.groupby("match_id"):
    prev_idx = None
    for idx_i in g.index:
        if prev_idx is None:
            Mt.loc[idx_i] = 0.0
        else:
            u_prev = df_sorted.loc[idx_i, u_prev_cols].values.astype(float)
            Mt.loc[idx_i] = phi * Mt.loc[prev_idx] + gamma.dot(u_prev)
        prev_idx = idx_i

print("\n动量 M_t 概要（NaN 已填补）：")
print(Mt.describe())

动量-残差 ARX 拟合结果（填补 NaN 后）
RMSE(resid): 0.21876931458400842
R^2(resid) : 0.002051225171642135
         phi(resid_prev) : +0.024058
gamma(prev_point_duration) : +0.000011
gamma(prev_p1_distance_run) : +0.000963
gamma(prev_p2_distance_run) : -0.002008
 gamma(prev_rally_count) : +0.004980
   gamma(prev_speed_mph) : +0.000016
gamma(prev_serve_width_encoded) : -0.005134
gamma(prev_serve_depth_encoded) : -0.007494
gamma(prev_return_depth_encoded) : -0.000251
截距(intercept)     : +0.021570

--------------------------------------------------
在ARX可用样本上的评估（基线 vs 动量修正）
--------------------------------------------------
Baseline  LogLoss: 0.6329427103520731
Momentum  LogLoss: 0.6290513662359044
Baseline  Brier  : 0.21927299678501752
Momentum  Brier  : 0.21876931458400842
Baseline  ROC AUC: 0.6890739611498874
Momentum  ROC AUC: 0.6934655033176382



动量 M_t 概要（NaN 已填补）：
count    7284.000000
mean       -0.014677
std         0.017967
min        -0.202319
25%        -0.020658
50%        -0.014946
75%        -0.007561
max         0.937337
dtype: float64


In [6]:
# =========================
# 10. 贝叶斯残差模型（Pyro）
# 修正：排序+分组递推，u_prev 标准化，小尺度先验，稳定学习率
# =========================
import torch
import pyro
import pyro.distributions as dist
from pyro.infer import SVI, Trace_ELBO
from pyro.optim import ClippedAdam
from sklearn.preprocessing import StandardScaler

# 基础残差
proba_all = pipe.predict_proba(X)[:, 1]
df["resid"] = y - proba_all

u_prev_cols = [
    "prev_point_duration",
    "prev_p1_distance_run",
    "prev_p2_distance_run",
    "prev_rally_count",
    "prev_speed_mph",
    "prev_serve_width_encoded",
    "prev_serve_depth_encoded",
    "prev_return_depth_encoded",
]

# 1) 排序 + shift，且先填补 prev_* 缺失
a = df.sort_values(["match_id", "set_no", "game_no", "point_no", "elapsed_seconds"], kind="mergesort").copy()
a[u_prev_cols] = a[u_prev_cols].fillna(0)
a["resid_prev"] = a.groupby("match_id")["resid"].shift(1)

# 2) 构造用于 Pyro 的表，保留 match_id 以便分组递推
b = a[["match_id", "resid", "resid_prev"] + u_prev_cols].dropna(subset=["resid_prev"])

# 3) 标准化 u_prev（避免尺度失配导致 sigma 爆炸）；resid 本身在 [-1,1]，无需缩放
sc_U = StandardScaler()
U_prev_std = sc_U.fit_transform(b[u_prev_cols].values.astype(float))

r_t = torch.tensor(b["resid"].values, dtype=torch.float32)
r_prev = torch.tensor(b["resid_prev"].values, dtype=torch.float32)
U_prev = torch.tensor(U_prev_std, dtype=torch.float32)

# 模型：r_t ~ Normal(phi * r_{t-1} + gamma^T u_{t-1}, sigma)
def model(r_prev, U_prev, r_t):
    phi = pyro.sample("phi", dist.Normal(0.0, 1.0))
    gamma = pyro.sample("gamma", dist.Normal(torch.zeros(U_prev.shape[1]), torch.ones(U_prev.shape[1])).to_event(1))
    sigma = pyro.sample("sigma", dist.HalfNormal(0.5))  # 更小的先验尺度
    mu = phi * r_prev + (U_prev @ gamma)
    with pyro.plate("data", len(r_t)):
        pyro.sample("obs", dist.Normal(mu, sigma), obs=r_t)

# 引导：对 sigma 用 LogNormal 形式更稳定
def guide(r_prev, U_prev, r_t):
    phi_loc = pyro.param("phi_loc", torch.tensor(0.0))
    phi_scale = pyro.param("phi_scale", torch.tensor(0.1), constraint=dist.constraints.positive)
    gamma_loc = pyro.param("gamma_loc", torch.zeros(U_prev.shape[1]))
    gamma_scale = pyro.param("gamma_scale", torch.ones(U_prev.shape[1]) * 0.1, constraint=dist.constraints.positive)
    sigma_loc = pyro.param("sigma_loc", torch.tensor(-1.0))
    sigma_scale = pyro.param("sigma_scale", torch.tensor(0.2), constraint=dist.constraints.positive)
    pyro.sample("phi", dist.Normal(phi_loc, phi_scale))
    pyro.sample("gamma", dist.Normal(gamma_loc, gamma_scale).to_event(1))
    pyro.sample("sigma", dist.LogNormal(sigma_loc, sigma_scale))

pyro.clear_param_store()
optimizer = ClippedAdam({"lr": 0.005})
svi = SVI(model, guide, optimizer, loss=Trace_ELBO())

for step in range(4000):
    loss = svi.step(r_prev, U_prev, r_t)
    if step % 500 == 0:
        print(f"step {step}, ELBO: {loss:.4f}")

phi_est = pyro.param("phi_loc").item()
gamma_est = pyro.param("gamma_loc").detach().numpy()
sigma_med = float(torch.exp(pyro.param("sigma_loc")))  # LogNormal 的中位数
print("\n贝叶斯残差参数（均值场近似）：")
print("phi        :", phi_est)
print("gamma mean :", gamma_est)
print("sigma med  :", sigma_med)

# 4) 按比赛分组递推动量 M_t（使用标准化后的 u_prev）
phi = float(phi_est)
gamma = gamma_est.astype(float)
U_std_df = pd.DataFrame(U_prev_std, index=b.index, columns=u_prev_cols)
Mt_sub = np.zeros(len(b), dtype=float)
pos_map = {idx: i for i, idx in enumerate(b.index)}

for mid, g in b.groupby("match_id", sort=False):
    idxs = g.index.to_list()
    for j, idx_row in enumerate(idxs):
        pos = pos_map[idx_row]
        if j == 0:
            Mt_sub[pos] = 0.0
        else:
            u_vec = U_std_df.loc[idx_row].values.astype(float)
            prev_pos = pos_map[idxs[j - 1]]
            Mt_sub[pos] = phi * Mt_sub[prev_pos] + gamma.dot(u_vec)

alpha = 1.0
p_base_series = pd.Series(proba_all, index=df.index)
y_series = pd.Series(y, index=df.index)
p_base_sub = p_base_series.loc[b.index].values
y_sub = y_series.loc[b.index].values

logit_base_sub = np.log(p_base_sub / (1.0 - p_base_sub))
logit_adj_sub = logit_base_sub + alpha * Mt_sub
p_adj_sub = 1.0 / (1.0 + np.exp(-logit_adj_sub))

from sklearn.metrics import log_loss, brier_score_loss, roc_auc_score
print("\n评估（基线 vs 贝叶斯动量 logit 修正）")
print("Baseline  LogLoss:", log_loss(y_sub, p_base_sub))
print("Momentum  LogLoss:", log_loss(y_sub, p_adj_sub))
print("Baseline  Brier  :", brier_score_loss(y_sub, p_base_sub))
print("Momentum  Brier  :", brier_score_loss(y_sub, p_adj_sub))
print("Baseline  ROC AUC:", roc_auc_score(y_sub, p_base_sub))
print("Momentum  ROC AUC:", roc_auc_score(y_sub, p_adj_sub))

  from .autonotebook import tqdm as notebook_tqdm


step 0, ELBO: 8479.6789
step 500, ELBO: 5264.4393
step 1000, ELBO: 4901.6937
step 1500, ELBO: 4831.1238
step 2000, ELBO: 4861.7931
step 2500, ELBO: 4832.6324
step 3000, ELBO: 4832.0581
step 3500, ELBO: 4833.6275

贝叶斯残差参数（均值场近似）：
phi        : 0.030059870332479477
gamma mean : [ 0.01308978  0.01735308 -0.02545187  0.01340388 -0.00032502 -0.01020398
 -0.00536656  0.00620307]
sigma med  : 0.4666706919670105

评估（基线 vs 贝叶斯动量 logit 修正）
Baseline  LogLoss: 0.6329427103520731
Momentum  LogLoss: 0.6326188854156151
Baseline  Brier  : 0.21927299678501752
Momentum  Brier  : 0.21919596049166043
Baseline  ROC AUC: 0.6890739611498874
Momentum  ROC AUC: 0.6903048770468607


Consider using tensor.detach() first. (Triggered internally at /pytorch/torch/csrc/autograd/generated/python_variable_methods.cpp:836.)
  sigma_med = float(torch.exp(pyro.param("sigma_loc")))  # LogNormal 的中位数


In [7]:
# =========================
# 11. 实验A：控制基线（无 prev_*） + 残差动量 sanity check
# 目的：把短期记忆从 baseline 拿掉，看残差是否更可预测
# =========================
from sklearn.model_selection import GroupShuffleSplit

control_cols = [
    "server", "serve_no",
    "set_no", "game_no", "point_no",
    "p1_games", "p2_games", "p1_sets", "p2_sets",
    "is_break_point", "is_tiebreak", "is_deuce",
 ]

X_ctrl = df[control_cols].copy().fillna(0)
groups = df["match_id"].values
gss = GroupShuffleSplit(n_splits=1, test_size=0.3, random_state=42)
train_idx, test_idx = next(gss.split(X_ctrl, y, groups))

Xc_train, Xc_test = X_ctrl.iloc[train_idx], X_ctrl.iloc[test_idx]
yc_train, yc_test = y[train_idx], y[test_idx]

pipe_ctrl = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", LogisticRegression(penalty="l2", solver="lbfgs", max_iter=2000))
])
pipe_ctrl.fit(Xc_train, yc_train)

proba_ctrl_test = pipe_ctrl.predict_proba(Xc_test)[:, 1]
print("\n[控制基线] Group 留出评估")
print("LogLoss:", log_loss(yc_test, proba_ctrl_test))
print("Brier  :", brier_score_loss(yc_test, proba_ctrl_test))
print("ROC AUC:", roc_auc_score(yc_test, proba_ctrl_test))

# 全量 residual 用于动量检验
proba_ctrl_all = pipe_ctrl.predict_proba(X_ctrl)[:, 1]
resid_ctrl = y - proba_ctrl_all

# 用和之前相同的 u_prev_cols 测试残差是否更可预测
df_ctrl = df.copy()
df_ctrl[u_prev_cols] = df_ctrl[u_prev_cols].fillna(0)
df_ctrl["resid_ctrl"] = resid_ctrl
df_ctrl["resid_ctrl_prev"] = df_ctrl.groupby("match_id")["resid_ctrl"].shift(1)

df_ctrl_sorted = df_ctrl.sort_values(["match_id", "set_no", "game_no", "point_no", "elapsed_seconds"], kind="mergesort")
arx_ctrl_df = df_ctrl_sorted[["resid_ctrl", "resid_ctrl_prev"] + u_prev_cols].dropna()

Zc = arx_ctrl_df[["resid_ctrl_prev"] + u_prev_cols].values
rc = arx_ctrl_df["resid_ctrl"].values
arx_ctrl = LinearRegression()
arx_ctrl.fit(Zc, rc)
rh = arx_ctrl.predict(Zc)

print("\n[控制基线] 残差 ARX 结果")
print("RMSE(resid):", mean_squared_error(rc, rh))
print("R^2(resid) :", r2_score(rc, rh))
print("phi(resid_prev):", arx_ctrl.coef_[0])
print("截距:", arx_ctrl.intercept_)




[控制基线] Group 留出评估
LogLoss: 0.6385095985692272
Brier  : 0.22294783417803687
ROC AUC: 0.6684337863801324

[控制基线] 残差 ARX 结果
RMSE(resid): 0.21881256824102302
R^2(resid) : 0.002039250008656457
phi(resid_prev): 0.02552900895127678
截距: 5.844290503978066e-05


In [8]:
# =========================
# 12. 实验B：事件驱动的 u_prev（ace/DF/winner/UE/break miss），检验是否带来更强动量
# =========================
candidate_events = [
    "p1_ace", "p2_ace",
    "p1_double_fault", "p2_double_fault",
    "p1_winner", "p2_winner",
    "p1_unf_err", "p2_unf_err",
    "p1_break_pt_missed", "p2_break_pt_missed",
    "p1_break_pt_won", "p2_break_pt_won",
]

event_cols = [c for c in candidate_events if c in df.columns]
if not event_cols:
    print("未找到事件类列，跳过实验B")
else:
    df_ev = df.copy()
    # 为事件列构造上一分特征
    for c in event_cols:
        df_ev[f"prev_{c}"] = df_ev.groupby("match_id")[c].shift(1)
    u_prev_events = [f"prev_{c}" for c in event_cols]

    df_ev[u_prev_events] = df_ev[u_prev_events].fillna(0)
    df_ev["resid_base"] = y - pipe.predict_proba(X)[:, 1]
    df_ev["resid_base_prev"] = df_ev.groupby("match_id")["resid_base"].shift(1)

    df_ev_sorted = df_ev.sort_values(["match_id", "set_no", "game_no", "point_no", "elapsed_seconds"], kind="mergesort")
    ev_df = df_ev_sorted[["match_id", "resid_base", "resid_base_prev"] + u_prev_events].dropna(subset=["resid_base_prev"])

    # 标准化事件驱动特征
    sc_ev = StandardScaler()
    Ue = sc_ev.fit_transform(ev_df[u_prev_events].values.astype(float))
    re = ev_df["resid_base"].values
    re_prev = ev_df["resid_base_prev"].values
    Z_evt = np.column_stack([re_prev, Ue])

    arx_ev = LinearRegression()
    arx_ev.fit(Z_evt, re)
    r_hat_ev = arx_ev.predict(Z_evt)

    print("\n[事件驱动动量] 残差 ARX")
    print("RMSE(resid):", mean_squared_error(re, r_hat_ev))
    print("R^2(resid) :", r2_score(re, r_hat_ev))
    print("phi(resid_prev):", arx_ev.coef_[0])
    # 输出最重要的事件驱动系数
    gamma_ev = arx_ev.coef_[1:]
    topk = min(8, len(u_prev_events))
    idx_sorted = np.argsort(-np.abs(gamma_ev))[:topk]
    print("Top事件系数：")
    for i in idx_sorted:
        print(f"{u_prev_events[i]:>24} : {gamma_ev[i]:+0.4f}")


[事件驱动动量] 残差 ARX
RMSE(resid): 0.21867273903130371
R^2(resid) : 0.002491768922868398
phi(resid_prev): 0.03482359089686354
Top事件系数：
         prev_p2_unf_err : -0.0146
    prev_p1_double_fault : +0.0099
    prev_p2_double_fault : +0.0077
             prev_p2_ace : -0.0075
    prev_p2_break_pt_won : +0.0071
         prev_p1_unf_err : -0.0066
          prev_p2_winner : +0.0051
          prev_p1_winner : +0.0043


In [9]:
# =========================
# 13. DBN：控制基线 + 标量动量 M_t（logit 空间联合训练）
# 说明：
# - baseline 只含控制变量（server/score/importance），不含任何 prev_*
# - 短期/事件信号放入动量驱动 u_{t-1}，M_t 在 log-odds 上直接作用
# - 使用 Pyro + AutoNormal 变分；如需严格分组留出，可先过滤 match_id 再训练
# =========================
import torch
import pyro
import pyro.distributions as dist
from pyro.infer import SVI, Trace_ELBO, Predictive
from pyro.optim import ClippedAdam

# 控制变量（弱化 baseline）
ctrl_cols = [
    "server", "serve_no",
    "set_no", "game_no", "point_no",
    "p1_games", "p2_games", "p1_sets", "p2_sets",
    "is_break_point", "is_tiebreak", "is_deuce",
]

# 事件驱动 + 体能/发球状态驱动（上一分/窗口）
event_cols = [c for c in [
    "p1_ace", "p2_ace",
    "p1_double_fault", "p2_double_fault",
    "p1_winner", "p2_winner",
    "p1_unf_err", "p2_unf_err",
    "p1_break_pt_missed", "p2_break_pt_missed",
    "p1_break_pt_won", "p2_break_pt_won",
] if c in df.columns]

df_m = df.sort_values(["match_id", "set_no", "game_no", "point_no", "elapsed_seconds"], kind="mergesort").copy()

# 构造上一分事件特征
for c in event_cols:
    df_m[f"prev_{c}"] = df_m.groupby("match_id")[c].shift(1)

# 构造简单 EWMA（疲劳/发球状态），并 shift(1) 防泄漏
def ewm_prev(series, span=5):
    return series.shift(1).ewm(span=span, adjust=False).mean()

df_m["rally_ewm"] = df_m.groupby("match_id")["rally_count"].transform(lambda s: ewm_prev(s, span=6))
df_m["dist_ewm"] = df_m.groupby("match_id")["p1_distance_run"].transform(lambda s: ewm_prev(s, span=6))
df_m["serve_speed_ewm"] = df_m.groupby("match_id")["speed_mph"].transform(lambda s: ewm_prev(s, span=6))

u_cols = []
u_cols += [f"prev_{c}" for c in event_cols]
u_cols += ["rally_ewm", "dist_ewm", "serve_speed_ewm"]

df_m[u_cols] = df_m[u_cols].fillna(0)

# 设计矩阵
X_ctrl = df_m[ctrl_cols].fillna(0).values
U_drv = df_m[u_cols].values
y_arr = df_m["y"].values.astype(float)

# match_id 编码为整数
match_codes, match_uniques = pd.factorize(df_m["match_id"], sort=False)

# 标准化驱动项
sc_u = StandardScaler()
U_std = sc_u.fit_transform(U_drv)

ctrl_tensor = torch.tensor(X_ctrl, dtype=torch.float32)
u_tensor = torch.tensor(U_std, dtype=torch.float32)
y_tensor = torch.tensor(y_arr, dtype=torch.float32)
match_tensor = torch.tensor(match_codes, dtype=torch.long)

pyro.clear_param_store()

def dbn_model(ctrl, u, match_ids, y):
    T, p = ctrl.shape
    q = u.shape[1]
    beta = pyro.sample("beta", dist.Normal(0.0, 1.0).expand([p]).to_event(1))
    rho = pyro.sample("rho", dist.Normal(0.0, 0.3))
    eta = pyro.sample("eta", dist.Normal(0.0, 1.0).expand([q]).to_event(1))
    sigma_M = pyro.sample("sigma_M", dist.HalfNormal(0.5))
    logits = []
    M_prev = torch.tensor(0.0)
    last_mid = match_ids[0] if T > 0 else -1
    for t in range(T):
        if match_ids[t] != last_mid:
            M_prev = torch.tensor(0.0)
            last_mid = match_ids[t]
        mean_M = rho * M_prev + (u[t] @ eta)
        M_t = pyro.sample(f"M_{t}", dist.Normal(mean_M, sigma_M))
        logit_t = (ctrl[t] @ beta) + M_t
        pyro.sample(f"y_{t}", dist.Bernoulli(logits=logit_t), obs=y[t])
        logits.append(logit_t)
        M_prev = M_t
    return torch.stack(logits) if logits else torch.tensor([])

guide = pyro.infer.autoguide.AutoNormal(dbn_model)
optimizer = ClippedAdam({"lr": 0.003})
svi = SVI(dbn_model, guide, optimizer, loss=Trace_ELBO())

n_steps = 2
for step in range(n_steps):
    loss = svi.step(ctrl_tensor, u_tensor, match_tensor, y_tensor)
    print(f"\rstep {step}, ELBO: {loss:.2f}", end='')

step 1, ELBO: 55266.72

In [10]:
# 取 posterior 平均的 logits 进行评估（仅取 _RETURN 以避免内存爆）
from pyro.distributions import constraints
from pyro.distributions.transforms import biject_to

predictive = Predictive(dbn_model, guide=guide, num_samples=20, return_sites=["_RETURN"])
samples = predictive(ctrl_tensor, u_tensor, match_tensor, y_tensor)
logits_mc = samples["_RETURN"]  # [S, T]
p_mc = torch.sigmoid(logits_mc)
p_mean = p_mc.mean(0).detach().numpy()
y_np = y_tensor.numpy()

print("\n[DBN 动量] 全量评估 (提醒：当前未做 match 留出)")
print("LogLoss:", log_loss(y_np, p_mean))
print("Brier  :", brier_score_loss(y_np, p_mean))
print("ROC AUC:", roc_auc_score(y_np, p_mean))

# 用后验均值参数递推一个确定性的 M_t 均值，避免逐点采样导致内存溢出
rho = biject_to(constraints.real)(pyro.param("AutoNormal.locs.rho")).item()
eta = biject_to(constraints.real)(pyro.param("AutoNormal.locs.eta")).detach().numpy()
Mt_det = np.zeros(len(df_m), dtype=float)
last_mid = match_codes[0] if len(match_codes) > 0 else -1

for i in range(len(df_m)):
    if match_codes[i] != last_mid:
        Mt_det[i] = 0.0
        last_mid = match_codes[i]
    else:
        Mt_det[i] = rho * Mt_det[i-1] + eta.dot(U_std[i])

df_m["Mt_mean"] = Mt_det
print("\n动量 M_t 概要（确定性均值递推）：")
print(df_m["Mt_mean"].describe())
print("提示：如需严格评估，请先按 match_id 过滤训练/验证，再各自跑一遍 SVI 与预测。")


[DBN 动量] 全量评估 (提醒：当前未做 match 留出)
LogLoss: 0.7172249285786145
Brier  : 0.26158642768859863
ROC AUC: 0.5095039457285456

动量 M_t 概要（确定性均值递推）：
count    7284.000000
mean       -0.000014
std         0.019484
min        -0.063131
25%        -0.009820
50%        -0.000186
75%         0.010022
max         0.132671
Name: Mt_mean, dtype: float64
提示：如需严格评估，请先按 match_id 过滤训练/验证，再各自跑一遍 SVI 与预测。
