In [7]:

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import log_loss, brier_score_loss, roc_auc_score

# =========================
# 1. 读数据
# =========================
try:
    df = pd.read_csv("2024_Wimbledon_featured_matches.csv")
except:
    df = pd.read_csv("Wimbledon_featured_matches.csv")

# =========================
# 2. 目标变量
# =========================
# 假设 point_victor: 1 = P1 赢分, 2 = P2 赢分
df = df[df["point_victor"].isin([1, 2])]
df["y"] = (df["point_victor"] == 1).astype(int)

# =========================
# 3. 构造"无记忆"特征
# =========================
# 构造缺失的特征
df["is_break_point"] = ((df["p1_break_pt"] > 0) | (df["p2_break_pt"] > 0)).astype(int)
df["is_tiebreak"] = (
    (df["p1_games"] >= 6)
    & (df["p2_games"] >= 6)
    & (df["p1_games"].sub(df["p2_games"]).abs() <= 1)
).astype(int)
df["is_deuce"] = ((df["p1_score"] == "D") | (df["p2_score"] == "D")).astype(int)

# 计算上一分的持续时间
df["elapsed_seconds"] = pd.to_timedelta(df["elapsed_time"]).dt.total_seconds()
df["point_duration"] = df.groupby("match_id")["elapsed_seconds"].diff()
df["prev_point_duration"] = df.groupby("match_id")["point_duration"].shift(1)

# 编码分类变量
le_serve_width = LabelEncoder()
le_serve_depth = LabelEncoder()
le_return_depth = LabelEncoder()

df["serve_width_encoded"] = le_serve_width.fit_transform(df["serve_width"].astype(str))
df["serve_depth_encoded"] = le_serve_depth.fit_transform(df["serve_depth"].astype(str))
df["return_depth_encoded"] = le_return_depth.fit_transform(df["return_depth"].astype(str))

# 获取上一分的特征值（t-1）
df["prev_p1_distance_run"] = df.groupby("match_id")["p1_distance_run"].shift(1)
df["prev_p2_distance_run"] = df.groupby("match_id")["p2_distance_run"].shift(1)
df["prev_rally_count"] = df.groupby("match_id")["rally_count"].shift(1)
df["prev_speed_mph"] = df.groupby("match_id")["speed_mph"].shift(1)
df["prev_serve_width_encoded"] = df.groupby("match_id")["serve_width_encoded"].shift(1)
df["prev_serve_depth_encoded"] = df.groupby("match_id")["serve_depth_encoded"].shift(1)
df["prev_return_depth_encoded"] = df.groupby("match_id")["return_depth_encoded"].shift(1)

feature_cols = [
    # 发球
    "server",
    "serve_no",

    # 比分 / 阶段
    "set_no",
    "game_no",
    "point_no",
    "p1_games",
    "p2_games",
    "p1_sets",
    "p2_sets",

    # 关键分
    "is_break_point",
    "is_tiebreak",
    "is_deuce",
    
    # 上一分的持续时间
    "prev_point_duration",
    
    # 上一分的特征（t-1）
    "prev_p1_distance_run",
    "prev_p2_distance_run",
    "prev_rally_count",
    "prev_speed_mph",
    "prev_serve_width_encoded",
    "prev_serve_depth_encoded",
    "prev_return_depth_encoded"
 ]

X = df[feature_cols].copy()
y = df["y"].values

# 缺失值简单处理（baseline）
X = X.fillna(0)

# =========================
# 4. 训练 / 测试切分
#    （注意：这里是"非时序 baseline"，
#     所以允许随机切分）
# =========================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)


X_train.shape
(5098, 20)
# =========================
# 5. LASSO Logistic 回归
# =========================
pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", LogisticRegression(
        penalty="l1",
        solver="liblinear",   # 或 saga
        C=0.05,                # 正则强度，可交叉验证
        max_iter=2000
    ))
])

pipe.fit(X_train, y_train)

# =========================
# 6. 评估
# =========================
proba_test = pipe.predict_proba(X_test)[:, 1]

print("Log loss :", log_loss(y_test, proba_test))
print("Brier    :", brier_score_loss(y_test, proba_test))
print("ROC AUC  :", roc_auc_score(y_test, proba_test))

# =========================
# 7. LASSO 选出来的特征
# =========================
coef = pipe.named_steps["clf"].coef_.flatten()
coef_df = pd.DataFrame({
    "feature": feature_cols,
    "coef": coef
}).sort_values("coef", key=np.abs, ascending=False)

print(coef_df)
# Log loss : 0.6432881357480272
# Brier    : 0.22109724921439386
# ROC AUC  : 0.6814363823550846
#                       feature      coef
# 0                      server -0.728514
# 6                    p2_games -0.073611
# 7                     p1_sets  0.069106
# 5                    p1_games  0.067332
# 12        prev_point_duration -0.047120
# 10                is_tiebreak -0.023539
# 16             prev_speed_mph  0.019188
# 19  prev_return_depth_encoded -0.019165
# 13       prev_p1_distance_run  0.006833
# 3                     game_no  0.000000
# 1                    serve_no  0.000000
# 2                      set_no  0.000000
# 11                   is_deuce  0.000000
# 9              is_break_point  0.000000
# 8                     p2_sets  0.000000
# 4                    point_no  0.000000
# 15           prev_rally_count  0.000000
# 14       prev_p2_distance_run  0.000000
# 17   prev_serve_width_encoded  0.000000
# 18   prev_serve_depth_encoded  0.000000
# /usr/local/python/3.12.1/lib/python3.12/site-packages/sklearn/linear_model/_logistic.py:1135: FutureWarning: 'penalty' was deprecated in version 1.8 and will be removed in 1.10. To avoid this warning, leave 'penalty' set to its default value and use 'l1_ratio' or 'C' instead. Use l1_ratio=0 instead of penalty='l2', l1_ratio=1 instead of penalty='l1', and C=np.inf instead of penalty=None.
#   warnings.warn(
# /usr/local/python/3.12.1/lib/python3.12/site-packages/sklearn/linear_model/_logistic.py:1160: UserWarning: Inconsistent values: penalty=l1 with l1_ratio=0.0. penalty is deprecated. Please use l1_ratio only.
#   warnings.warn(
from sklearn.ensemble import RandomForestClassifier
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)
proba_test = clf.predict_proba(X_test)[:, 1]
# output roc_auc_score and feature importance
roc_auc = roc_auc_score(y_test, proba_test)
feature_importances = clf.feature_importances_
importance_df = pd.DataFrame({
    "feature": feature_cols,
    "importance": feature_importances
}).sort_values("importance", ascending=False)
print("ROC AUC  :", roc_auc)
print(importance_df)
# ROC AUC  : 0.7054745176749886
#                       feature  importance
# 0                      server    0.113223
# 14       prev_p2_distance_run    0.109447
# 13       prev_p1_distance_run    0.109183
# 4                    point_no    0.101670
# 12        prev_point_duration    0.101123
# 16             prev_speed_mph    0.092422
# 3                     game_no    0.049483
# 15           prev_rally_count    0.046079
# 1                    serve_no    0.043066
# 17   prev_serve_width_encoded    0.041929
# 5                    p1_games    0.038208
# 6                    p2_games    0.037426
# 19  prev_return_depth_encoded    0.024284
# 2                      set_no    0.024078
# 8                     p2_sets    0.020305
# 7                     p1_sets    0.019179
# 18   prev_serve_depth_encoded    0.017587
# 9              is_break_point    0.008836
# 10                is_tiebreak    0.002471
# 11                   is_deuce    0.000000
# =========================
# 8. LASSO - 使用 H:AM 列范围 + 之前的特征（所有特征）
# =========================
# H:AM 对应的列（从p1_sets到p2_break_pt_missed）
feature_cols_h_am = [
    "server", "serve_no",  # N, O
    "p1_break_pt", "p2_break_pt",  # AH, AI
    # 之前的特征
    "is_break_point", "is_tiebreak", "is_deuce",
    "set_no", "game_no", "point_no",
    # 上一分的持续时间
    "prev_point_duration",
    # 上一分的特征（t-1）
    "prev_p1_distance_run",
    "prev_p2_distance_run",
    "prev_rally_count",
    "prev_speed_mph",
    "prev_serve_width_encoded",
    "prev_serve_depth_encoded",
    "prev_return_depth_encoded"
]

# 准备数据
X_combined = df[feature_cols_h_am].copy()

# 填充缺失值
X_combined = X_combined.fillna(0)

# 分割数据
X_train_combined, X_test_combined, y_train_combined, y_test_combined = train_test_split(
    X_combined, y, test_size=0.3, random_state=42
)

# 构建LASSO模型
pipe_combined = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", LogisticRegression(
        penalty="l1",
        solver="liblinear",
        C=0.1,
        max_iter=2000
    ))
])

pipe_combined.fit(X_train_combined, y_train_combined)

# 评估
proba_test_combined = pipe_combined.predict_proba(X_test_combined)[:, 1]

print("="*50)
print("完整特征集 LASSO 模型结果")
print("="*50)
print("Log loss :", log_loss(y_test_combined, proba_test_combined))
print("Brier    :", brier_score_loss(y_test_combined, proba_test_combined))
print("ROC AUC  :", roc_auc_score(y_test_combined, proba_test_combined))

# 特征重要性
coef_combined = pipe_combined.named_steps["clf"].coef_.flatten()
coef_df_combined = pd.DataFrame({
    "feature": feature_cols_h_am,
    "coef": coef_combined
}).sort_values("coef", key=np.abs, ascending=False)

print("\n特征系数（按绝对值排序）：")
print(coef_df_combined)
# ==================================================
# 完整特征集 LASSO 模型结果
# ==================================================
# Log loss : 0.6482339245667559
# Brier    : 0.22123165173714027
# ROC AUC  : 0.6768223470763117

# 特征系数（按绝对值排序）：
#                       feature      coef
# 0                      server -0.723306
# 10        prev_point_duration -0.068434
# 5                 is_tiebreak -0.038609
# 7                      set_no  0.038142
# 14             prev_speed_mph  0.029695
# 17  prev_return_depth_encoded -0.026370
# 9                    point_no  0.026358
# 11       prev_p1_distance_run  0.011250
# 3                 p2_break_pt  0.006380
# 15   prev_serve_width_encoded -0.002081
# 1                    serve_no  0.001060
# 6                    is_deuce  0.000000
# 2                 p1_break_pt  0.000000
# 4              is_break_point  0.000000
# 8                     game_no  0.000000
# 12       prev_p2_distance_run  0.000000
# 13           prev_rally_count  0.000000
# 16   prev_serve_depth_encoded  0.000000
# /usr/local/python/3.12.1/lib/python3.12/site-packages/sklearn/linear_model/_logistic.py:1135: FutureWarning: 'penalty' was deprecated in version 1.8 and will be removed in 1.10. To avoid this warning, leave 'penalty' set to its default value and use 'l1_ratio' or 'C' instead. Use l1_ratio=0 instead of penalty='l2', l1_ratio=1 instead of penalty='l1', and C=np.inf instead of penalty=None.
#   warnings.warn(
# /usr/local/python/3.12.1/lib/python3.12/site-packages/sklearn/linear_model/_logistic.py:1160: UserWarning: Inconsistent values: penalty=l1 with l1_ratio=0.0. penalty is deprecated. Please use l1_ratio only.
#   warnings.warn(
# =========================
# 9. 动量-残差 ARX（惯性 + 外生输入）
# 修正：对 u_prev 特征先填充缺失，确保 M_t 不被 NaN 传播；按比赛内顺序递推
# =========================
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# 1) 逻辑回归在全样本的残差 r_t = y_t - p_hat_t
proba_all = pipe.predict_proba(X)[:, 1]
df["resid"] = y - proba_all

# 2) 构造 ARX 输入：r_{t-1} 与 u_{t-1}
u_prev_cols = [
    "prev_point_duration",
    "prev_p1_distance_run",
    "prev_p2_distance_run",
    "prev_rally_count",
    "prev_speed_mph",
    "prev_serve_width_encoded",
    "prev_serve_depth_encoded",
    "prev_return_depth_encoded",
 ]

# 关键修正：先处理缺失（首分的 prev_* 等结构性缺失全部置 0）
df[u_prev_cols] = df[u_prev_cols].fillna(0)

# 按比赛内顺序排序，避免乱序导致递推不一致
df_sorted = df.sort_values(["match_id", "set_no", "game_no", "point_no", "elapsed_seconds"], kind="mergesort")

df_sorted["resid_prev"] = df_sorted.groupby("match_id")["resid"].shift(1)

use_cols = ["resid", "resid_prev"] + u_prev_cols
arx_df = df_sorted[use_cols].copy().dropna()

Z = arx_df[["resid_prev"] + u_prev_cols].values
r = arx_df["resid"].values

# 3) 拟合线性 ARX：r_t = φ r_{t-1} + γ^T u_{t-1} + e_t
arx = LinearRegression()
arx.fit(Z, r)

r_hat = arx.predict(Z)
rmse = mean_squared_error(r, r_hat)
r2 = r2_score(r, r_hat)

print("="*50)
print("动量-残差 ARX 拟合结果（填补 NaN 后）")
print("="*50)
print("RMSE(resid):", rmse)
print("R^2(resid) :", r2)

coef_names = ["phi(resid_prev)"] + [f"gamma({c})" for c in u_prev_cols]
coef_vals = np.concatenate(([arx.coef_[0]], arx.coef_[1:]))
for name, val in zip(coef_names, coef_vals):
    print(f"{name:>24} : {val:+.6f}")
print(f"截距(intercept)     : {arx.intercept_:+.6f}")

# 4) 用 r_hat 调整逻辑概率：p_adj = clip(p_hat + r_hat)
idx = arx_df.index
p_base_series = pd.Series(proba_all, index=df.index)
y_series = pd.Series(y, index=df.index)
p_base_sub = p_base_series.loc[idx].values
p_adj_sub = np.clip(p_base_sub + r_hat, 1e-6, 1 - 1e-6)
y_sub = y_series.loc[idx].values

print("\n" + "-"*50)
print("在ARX可用样本上的评估（基线 vs 动量修正）")
print("-"*50)
from sklearn.metrics import log_loss, brier_score_loss, roc_auc_score
print("Baseline  LogLoss:", log_loss(y_sub, p_base_sub))
print("Momentum  LogLoss:", log_loss(y_sub, p_adj_sub))
print("Baseline  Brier  :", brier_score_loss(y_sub, p_base_sub))
print("Momentum  Brier  :", brier_score_loss(y_sub, p_adj_sub))
print("Baseline  ROC AUC:", roc_auc_score(y_sub, p_base_sub))
print("Momentum  ROC AUC:", roc_auc_score(y_sub, p_adj_sub))

# 5) 按比赛顺序递推动量 M_t（避免 NaN 传染）
Mt = pd.Series(np.nan, index=df_sorted.index, dtype=float)
phi = arx.coef_[0]
gamma = arx.coef_[1:]

for mid, g in df_sorted.groupby("match_id"):
    prev_idx = None
    for idx_i in g.index:
        if prev_idx is None:
            Mt.loc[idx_i] = 0.0
        else:
            u_prev = df_sorted.loc[idx_i, u_prev_cols].values.astype(float)
            Mt.loc[idx_i] = phi * Mt.loc[prev_idx] + gamma.dot(u_prev)
        prev_idx = idx_i

print("\n动量 M_t 概要（NaN 已填补）：")
print(Mt.describe())
# ==================================================
# 动量-残差 ARX 拟合结果（填补 NaN 后）
# ==================================================
# RMSE(resid): 0.21874649227879706
# R^2(resid) : 0.002049000888824426
#          phi(resid_prev) : +0.023991
# gamma(prev_point_duration) : +0.000011
# gamma(prev_p1_distance_run) : +0.000965
# gamma(prev_p2_distance_run) : -0.002005
#  gamma(prev_rally_count) : +0.004995
#    gamma(prev_speed_mph) : +0.000015
# gamma(prev_serve_width_encoded) : -0.005135
# gamma(prev_serve_depth_encoded) : -0.007578
# gamma(prev_return_depth_encoded) : -0.000184
# 截距(intercept)     : +0.021584

# --------------------------------------------------
# 在ARX可用样本上的评估（基线 vs 动量修正）
# --------------------------------------------------
# Baseline  LogLoss: 0.6328915836506986
# Momentum  LogLoss: 0.6290005974964743
# Baseline  Brier  : 0.21924999487655136
# Momentum  Brier  : 0.21874649227879706
# Baseline  ROC AUC: 0.6893348870213798
# Momentum  ROC AUC: 0.69372985241485

# 动量 M_t 概要（NaN 已填补）：
# count    7284.000000
# mean       -0.014666
# std         0.017972
# min        -0.201893
# 25%        -0.020673
# 50%        -0.014935
# 75%        -0.007549
# max         0.937300
# dtype: float64
# =========================
# 10. 贝叶斯残差模型（Pyro）
# 修正：排序+分组递推，u_prev 标准化，小尺度先验，稳定学习率
# =========================
import torch
import pyro
import pyro.distributions as dist
from pyro.infer import SVI, Trace_ELBO
from pyro.optim import ClippedAdam
from sklearn.preprocessing import StandardScaler

# 基础残差
proba_all = pipe.predict_proba(X)[:, 1]
df["resid"] = y - proba_all

u_prev_cols = [
    "prev_point_duration",
    "prev_p1_distance_run",
    "prev_p2_distance_run",
    "prev_rally_count",
    "prev_speed_mph",
    "prev_serve_width_encoded",
    "prev_serve_depth_encoded",
    "prev_return_depth_encoded",
]

# 1) 排序 + shift，且先填补 prev_* 缺失
a = df.sort_values(["match_id", "set_no", "game_no", "point_no", "elapsed_seconds"], kind="mergesort").copy()
a[u_prev_cols] = a[u_prev_cols].fillna(0)
a["resid_prev"] = a.groupby("match_id")["resid"].shift(1)

# 2) 构造用于 Pyro 的表，保留 match_id 以便分组递推
b = a[["match_id", "resid", "resid_prev"] + u_prev_cols].dropna(subset=["resid_prev"])

# 3) 标准化 u_prev（避免尺度失配导致 sigma 爆炸）；resid 本身在 [-1,1]，无需缩放
sc_U = StandardScaler()
U_prev_std = sc_U.fit_transform(b[u_prev_cols].values.astype(float))

r_t = torch.tensor(b["resid"].values, dtype=torch.float32)
r_prev = torch.tensor(b["resid_prev"].values, dtype=torch.float32)
U_prev = torch.tensor(U_prev_std, dtype=torch.float32)

# 模型：r_t ~ Normal(phi * r_{t-1} + gamma^T u_{t-1}, sigma)
def model(r_prev, U_prev, r_t):
    phi = pyro.sample("phi", dist.Normal(0.0, 1.0))
    gamma = pyro.sample("gamma", dist.Normal(torch.zeros(U_prev.shape[1]), torch.ones(U_prev.shape[1])).to_event(1))
    sigma = pyro.sample("sigma", dist.HalfNormal(0.5))  # 更小的先验尺度
    mu = phi * r_prev + (U_prev @ gamma)
    with pyro.plate("data", len(r_t)):
        pyro.sample("obs", dist.Normal(mu, sigma), obs=r_t)

# 引导：对 sigma 用 LogNormal 形式更稳定
def guide(r_prev, U_prev, r_t):
    phi_loc = pyro.param("phi_loc", torch.tensor(0.0))
    phi_scale = pyro.param("phi_scale", torch.tensor(0.1), constraint=dist.constraints.positive)
    gamma_loc = pyro.param("gamma_loc", torch.zeros(U_prev.shape[1]))
    gamma_scale = pyro.param("gamma_scale", torch.ones(U_prev.shape[1]) * 0.1, constraint=dist.constraints.positive)
    sigma_loc = pyro.param("sigma_loc", torch.tensor(-1.0))
    sigma_scale = pyro.param("sigma_scale", torch.tensor(0.2), constraint=dist.constraints.positive)
    pyro.sample("phi", dist.Normal(phi_loc, phi_scale))
    pyro.sample("gamma", dist.Normal(gamma_loc, gamma_scale).to_event(1))
    pyro.sample("sigma", dist.LogNormal(sigma_loc, sigma_scale))

pyro.clear_param_store()
optimizer = ClippedAdam({"lr": 0.005})
svi = SVI(model, guide, optimizer, loss=Trace_ELBO())

for step in range(4000):
    loss = svi.step(r_prev, U_prev, r_t)
    if step % 500 == 0:
        print(f"step {step}, ELBO: {loss:.4f}")

phi_est = pyro.param("phi_loc").item()
gamma_est = pyro.param("gamma_loc").detach().numpy()
sigma_med = float(torch.exp(pyro.param("sigma_loc")))  # LogNormal 的中位数
print("\n贝叶斯残差参数（均值场近似）：")
print("phi        :", phi_est)
print("gamma mean :", gamma_est)
print("sigma med  :", sigma_med)

# 4) 按比赛分组递推动量 M_t（使用标准化后的 u_prev）
phi = float(phi_est)
gamma = gamma_est.astype(float)
U_std_df = pd.DataFrame(U_prev_std, index=b.index, columns=u_prev_cols)
Mt_sub = np.zeros(len(b), dtype=float)
pos_map = {idx: i for i, idx in enumerate(b.index)}

for mid, g in b.groupby("match_id", sort=False):
    idxs = g.index.to_list()
    for j, idx_row in enumerate(idxs):
        pos = pos_map[idx_row]
        if j == 0:
            Mt_sub[pos] = 0.0
        else:
            u_vec = U_std_df.loc[idx_row].values.astype(float)
            prev_pos = pos_map[idxs[j - 1]]
            Mt_sub[pos] = phi * Mt_sub[prev_pos] + gamma.dot(u_vec)

alpha = 1.0
p_base_series = pd.Series(proba_all, index=df.index)
y_series = pd.Series(y, index=df.index)
p_base_sub = p_base_series.loc[b.index].values
y_sub = y_series.loc[b.index].values

logit_base_sub = np.log(p_base_sub / (1.0 - p_base_sub))
logit_adj_sub = logit_base_sub + alpha * Mt_sub
p_adj_sub = 1.0 / (1.0 + np.exp(-logit_adj_sub))

from sklearn.metrics import log_loss, brier_score_loss, roc_auc_score
print("\n评估（基线 vs 贝叶斯动量 logit 修正）")
print("Baseline  LogLoss:", log_loss(y_sub, p_base_sub))
print("Momentum  LogLoss:", log_loss(y_sub, p_adj_sub))
print("Baseline  Brier  :", brier_score_loss(y_sub, p_base_sub))
print("Momentum  Brier  :", brier_score_loss(y_sub, p_adj_sub))
print("Baseline  ROC AUC:", roc_auc_score(y_sub, p_base_sub))
print("Momentum  ROC AUC:", roc_auc_score(y_sub, p_adj_sub))
# =========================
# 11. 实验A：控制基线（无 prev_*） + 残差动量 sanity check
# 目的：把短期记忆从 baseline 拿掉，看残差是否更可预测
# =========================
from sklearn.model_selection import GroupShuffleSplit

control_cols = [
    "server", "serve_no",
    "set_no", "game_no", "point_no",
    "p1_games", "p2_games", "p1_sets", "p2_sets",
    "is_break_point", "is_tiebreak", "is_deuce",
 ]

X_ctrl = df[control_cols].copy().fillna(0)
groups = df["match_id"].values
gss = GroupShuffleSplit(n_splits=1, test_size=0.3, random_state=42)
train_idx, test_idx = next(gss.split(X_ctrl, y, groups))

Xc_train, Xc_test = X_ctrl.iloc[train_idx], X_ctrl.iloc[test_idx]
yc_train, yc_test = y[train_idx], y[test_idx]

pipe_ctrl = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", LogisticRegression(penalty="l2", solver="lbfgs", max_iter=2000))
])
pipe_ctrl.fit(Xc_train, yc_train)

proba_ctrl_test = pipe_ctrl.predict_proba(Xc_test)[:, 1]
print("\n[控制基线] Group 留出评估")
print("LogLoss:", log_loss(yc_test, proba_ctrl_test))
print("Brier  :", brier_score_loss(yc_test, proba_ctrl_test))
print("ROC AUC:", roc_auc_score(yc_test, proba_ctrl_test))

# 全量 residual 用于动量检验
proba_ctrl_all = pipe_ctrl.predict_proba(X_ctrl)[:, 1]
resid_ctrl = y - proba_ctrl_all

# 用和之前相同的 u_prev_cols 测试残差是否更可预测
df_ctrl = df.copy()
df_ctrl[u_prev_cols] = df_ctrl[u_prev_cols].fillna(0)
df_ctrl["resid_ctrl"] = resid_ctrl
df_ctrl["resid_ctrl_prev"] = df_ctrl.groupby("match_id")["resid_ctrl"].shift(1)

df_ctrl_sorted = df_ctrl.sort_values(["match_id", "set_no", "game_no", "point_no", "elapsed_seconds"], kind="mergesort")
arx_ctrl_df = df_ctrl_sorted[["resid_ctrl", "resid_ctrl_prev"] + u_prev_cols].dropna()

Zc = arx_ctrl_df[["resid_ctrl_prev"] + u_prev_cols].values
rc = arx_ctrl_df["resid_ctrl"].values
arx_ctrl = LinearRegression()
arx_ctrl.fit(Zc, rc)
rh = arx_ctrl.predict(Zc)

print("\n[控制基线] 残差 ARX 结果")
print("RMSE(resid):", mean_squared_error(rc, rh))
print("R^2(resid) :", r2_score(rc, rh))
print("phi(resid_prev):", arx_ctrl.coef_[0])
print("截距:", arx_ctrl.intercept_)
# /usr/local/python/3.12.1/lib/python3.12/site-packages/sklearn/linear_model/_logistic.py:1135: FutureWarning: 'penalty' was deprecated in version 1.8 and will be removed in 1.10. To avoid this warning, leave 'penalty' set to its default value and use 'l1_ratio' or 'C' instead. Use l1_ratio=0 instead of penalty='l2', l1_ratio=1 instead of penalty='l1', and C=np.inf instead of penalty=None.
#   warnings.warn(
# [控制基线] Group 留出评估
# LogLoss: 0.6380637215205914
# Brier  : 0.22272501307219322
# ROC AUC: 0.6684257306235558

# [控制基线] 残差 ARX 结果
# RMSE(resid): 0.21880958485917723
# R^2(resid) : 0.0020291419281776024
# phi(resid_prev): 0.024958710549519286
# 截距: 0.0014382493822839553
# =========================
# 12. 实验B：事件驱动的 u_prev（ace/DF/winner/UE/break miss），检验是否带来更强动量
# =========================
candidate_events = [
    "p1_ace", "p2_ace",
    "p1_double_fault", "p2_double_fault",
    "p1_winner", "p2_winner",
    "p1_unf_err", "p2_unf_err",
    "p1_break_pt_missed", "p2_break_pt_missed",
    "p1_break_pt_won", "p2_break_pt_won",
]

event_cols = [c for c in candidate_events if c in df.columns]
if not event_cols:
    print("未找到事件类列，跳过实验B")
else:
    df_ev = df.copy()
    # 为事件列构造上一分特征
    for c in event_cols:
        df_ev[f"prev_{c}"] = df_ev.groupby("match_id")[c].shift(1)
    u_prev_events = [f"prev_{c}" for c in event_cols]

    df_ev[u_prev_events] = df_ev[u_prev_events].fillna(0)
    df_ev["resid_base"] = y - pipe.predict_proba(X)[:, 1]
    df_ev["resid_base_prev"] = df_ev.groupby("match_id")["resid_base"].shift(1)

    df_ev_sorted = df_ev.sort_values(["match_id", "set_no", "game_no", "point_no", "elapsed_seconds"], kind="mergesort")
    ev_df = df_ev_sorted[["match_id", "resid_base", "resid_base_prev"] + u_prev_events].dropna(subset=["resid_base_prev"])

    # 标准化事件驱动特征
    sc_ev = StandardScaler()
    Ue = sc_ev.fit_transform(ev_df[u_prev_events].values.astype(float))
    re = ev_df["resid_base"].values
    re_prev = ev_df["resid_base_prev"].values
    Z_evt = np.column_stack([re_prev, Ue])

    arx_ev = LinearRegression()
    arx_ev.fit(Z_evt, re)
    r_hat_ev = arx_ev.predict(Z_evt)

    print("\n[事件驱动动量] 残差 ARX")
    print("RMSE(resid):", mean_squared_error(re, r_hat_ev))
    print("R^2(resid) :", r2_score(re, r_hat_ev))
    print("phi(resid_prev):", arx_ev.coef_[0])
    # 输出最重要的事件驱动系数
    gamma_ev = arx_ev.coef_[1:]
    topk = min(8, len(u_prev_events))
    idx_sorted = np.argsort(-np.abs(gamma_ev))[:topk]
    print("Top事件系数：")
    for i in idx_sorted:
        print(f"{u_prev_events[i]:>24} : {gamma_ev[i]:+0.4f}")
# [事件驱动动量] 残差 ARX
# RMSE(resid): 0.2186494894378006
# R^2(resid) : 0.0024914706544719722
# phi(resid_prev): 0.0348291134118627
# Top事件系数：
#          prev_p2_unf_err : -0.0147
#     prev_p1_double_fault : +0.0098
#     prev_p2_double_fault : +0.0078
#              prev_p2_ace : -0.0075
#     prev_p2_break_pt_won : +0.0071
#          prev_p1_unf_err : -0.0065
#           prev_p2_winner : +0.0051
#           prev_p1_winner : +0.0044
# # =========================
# ## 13. DBN：控制基线 + 标量动量 M_t（logit 空间联合训练，整批训练避免先验重复）
# ## - 约束 rho ∈ (-1,1) 用 tanh
# ## - ctrl / u 均标准化以匹配先验尺度
# ## - 可在后续用 match 留出做严格评估
# # =========================
# import torch
import pyro
import pyro.distributions as dist
from pyro.infer import SVI, Trace_ELBO, Predictive
from pyro.optim import ClippedAdam
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler

# Install pyro-ppl if not already installed
try:
    import pyro
except ImportError:
    # !pip install pyro-ppl
    import pyro

# Detect device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# 控制变量（弱化 baseline）
ctrl_cols = [
    "server", "serve_no",
    "set_no", "game_no", "point_no",
    "p1_games", "p2_games", "p1_sets", "p2_sets",
    "is_break_point", "is_tiebreak", "is_deuce",
]

# 事件驱动 + 体能/发球状态驱动（上一分/窗口）
event_cols = [c for c in [
    "p1_ace", "p2_ace",
    "p1_double_fault", "p2_double_fault",
    "p1_winner", "p2_winner",
    "p1_unf_err", "p2_unf_err",
    "p1_break_pt_missed", "p2_break_pt_missed",
    "p1_break_pt_won", "p2_break_pt_won",
] if c in df.columns]

df_m = df.sort_values(["match_id", "set_no", "game_no", "point_no", "elapsed_seconds"], kind="mergesort").copy()

# 构造上一分事件特征
for c in event_cols:
    df_m[f"prev_{c}"] = df_m.groupby("match_id")[c].shift(1)

# 构造简单 EWMA（疲劳/发球状态），并 shift(1) 防泄漏
def ewm_prev(series, span=5):
    return series.shift(1).ewm(span=span, adjust=False).mean()

df_m["rally_ewm"] = df_m.groupby("match_id")["rally_count"].transform(lambda s: ewm_prev(s, span=6))
df_m["dist_ewm"] = df_m.groupby("match_id")["p1_distance_run"].transform(lambda s: ewm_prev(s, span=6))
df_m["serve_speed_ewm"] = df_m.groupby("match_id")["speed_mph"].transform(lambda s: ewm_prev(s, span=6))

u_cols = []
u_cols += [f"prev_{c}" for c in event_cols]
u_cols += ["rally_ewm", "dist_ewm", "serve_speed_ewm"]

df_m[u_cols] = df_m[u_cols].fillna(0)

# 设计矩阵（ctrl 也做标准化，避免尺度失配导致先验过度收缩）
X_ctrl_raw = df_m[ctrl_cols].fillna(0).values
sc_ctrl = StandardScaler()
X_ctrl = sc_ctrl.fit_transform(X_ctrl_raw)

U_drv = df_m[u_cols].values
sc_u = StandardScaler()
U_std = sc_u.fit_transform(U_drv)

y_arr = df_m["y"].values.astype(float)

Log loss : 0.6432885625659864
Brier    : 0.2210974143492638
ROC AUC  : 0.6814296807787902
                      feature      coef
0                      server -0.728515
6                    p2_games -0.073638
7                     p1_sets  0.069104
5                    p1_games  0.067341
12        prev_point_duration -0.047121
10                is_tiebreak -0.023551
16             prev_speed_mph  0.019187
19  prev_return_depth_encoded -0.019165
13       prev_p1_distance_run  0.006833
4                    point_no  0.000000
3                     game_no  0.000000
8                     p2_sets  0.000000
9              is_break_point  0.000000
1                    serve_no  0.000000
11                   is_deuce  0.000000
2                      set_no  0.000000
14       prev_p2_distance_run  0.000000
15           prev_rally_count  0.000000
17   prev_serve_width_encoded  0.000000
18   prev_serve_depth_encoded  0.000000
ROC AUC  : 0.7054745176749886
                      feature  importanc

In [9]:


# match_id 编码为整数
match_codes, match_uniques = pd.factorize(df_m["match_id"], sort=False)

ctrl_tensor = torch.tensor(X_ctrl, dtype=torch.float32).to(device)
u_tensor = torch.tensor(U_std, dtype=torch.float32).to(device)
y_tensor = torch.tensor(y_arr, dtype=torch.float32).to(device)
match_tensor = torch.tensor(match_codes, dtype=torch.long).to(device)

pyro.clear_param_store()

def dbn_model(ctrl, u, match_ids, y):
    T, p = ctrl.shape
    q = u.shape[1]
    beta = pyro.sample("beta", dist.Normal(0.0, 1.0).expand([p]).to_event(1)).to(device)
    rho_raw = pyro.sample("rho_raw", dist.Normal(0.0, 1.0)).to(device)
    rho = torch.tanh(rho_raw)
    eta = pyro.sample("eta", dist.Normal(0.0, 1.0).expand([q]).to_event(1)).to(device)
    logits = []
    M_prev = torch.tensor(0.0, device=device)
    last_mid = match_ids[0] if T > 0 else torch.tensor(-1, device=device)
    for t in range(T):
        if match_ids[t] != last_mid:
            M_prev = torch.tensor(0.0, device=device)
            last_mid = match_ids[t]
        mean_M = rho * M_prev + (u[t] @ eta)
        M_prev = mean_M
        logit_t = (ctrl[t] @ beta) + M_prev
        pyro.sample(f"y_{t}", dist.Bernoulli(logits=logit_t), obs=y[t])
        logits.append(logit_t)
    return torch.stack(logits) if logits else torch.tensor([], device=device)

guide = pyro.infer.autoguide.AutoNormal(dbn_model)
optimizer = ClippedAdam({"lr": 0.003})
svi = SVI(dbn_model, guide, optimizer, loss=Trace_ELBO())

# 整批训练，避免先验在 mini-batch 下被重复放大
n_epochs = 100
for epoch in range(n_epochs):
    loss = svi.step(ctrl_tensor, u_tensor, match_tensor, y_tensor)
    print(f"\repoch {epoch+1}/{n_epochs} ELBO per-pt: {loss/len(df_m):.4f}", end="")
# Using device: cpu
# epoch 200/200 ELBO per-pt: 0.6552
# 取 posterior 平均的 logits 进行评估（仅取 _RETURN 以避免内存爆）
from pyro.distributions import constraints
from pyro.distributions.transforms import biject_to

predictive = Predictive(dbn_model, guide=guide, num_samples=20, return_sites=["_RETURN"])
samples = predictive(ctrl_tensor, u_tensor, match_tensor, y_tensor)
logits_mc = samples["_RETURN"]  # [S, T]
p_mc = torch.sigmoid(logits_mc)
p_mean = p_mc.mean(0).detach().cpu().numpy()
y_np = y_tensor.cpu().numpy()

print("\n[DBN 动量] 全量评估 (提醒：当前未做 match 留出)")
print("LogLoss:", log_loss(y_np, p_mean))
print("Brier  :", brier_score_loss(y_np, p_mean))
print("ROC AUC:", roc_auc_score(y_np, p_mean))

# 用后验均值参数递推一个确定性的 M_t 均值，避免逐点采样导致内存溢出
rho_raw_loc = pyro.param("AutoNormal.locs.rho_raw")
rho = float(torch.tanh(rho_raw_loc))
eta = biject_to(constraints.real)(pyro.param("AutoNormal.locs.eta")).detach().cpu().numpy()
Mt_det = np.zeros(len(df_m), dtype=float)
last_mid = match_codes[0] if len(match_codes) > 0 else -1

for i in range(len(df_m)):
    if match_codes[i] != last_mid:
        Mt_det[i] = 0.0
        last_mid = match_codes[i]
    else:
        Mt_det[i] = rho * Mt_det[i-1] + eta.dot(U_std[i])

df_m["Mt_mean"] = Mt_det
print("\n动量 M_t 概要（确定性均值递推）：")
print(df_m["Mt_mean"].describe())
print("提示：如需严格评估，请先按 match_id 过滤训练/验证，再各自跑一遍 SVI 与预测。")
# [DBN 动量] 全量评估 (提醒：当前未做 match 留出)
# LogLoss: 0.6324973369785187
# Brier  : 0.2203022688627243
# ROC AUC: 0.6881189855056287

# 动量 M_t 概要（确定性均值递推）：
# count    7284.000000
# mean        0.000549
# std         0.137156
# min        -0.648622
# 25%        -0.102190
# 50%         0.028821
# 75%         0.058438
# max         0.496915
# Name: Mt_mean, dtype: float64
# 提示：如需严格评估，请先按 match_id 过滤训练/验证，再各自跑一遍 SVI 与预测。

epoch 100/100 ELBO per-pt: 0.6718
[DBN 动量] 全量评估 (提醒：当前未做 match 留出)
LogLoss: 0.6512044412739093
Brier  : 0.22930807828465832
ROC AUC: 0.677261594697636

动量 M_t 概要（确定性均值递推）：
count    7284.000000
mean        0.000138
std         0.221802
min        -1.002380
25%        -0.091618
50%         0.019401
75%         0.048207
max         0.906768
Name: Mt_mean, dtype: float64
提示：如需严格评估，请先按 match_id 过滤训练/验证，再各自跑一遍 SVI 与预测。


In [None]:
!pip install plotly

Collecting plotly
  Downloading plotly-6.5.2-py3-none-any.whl.metadata (8.5 kB)
Collecting narwhals>=1.15.1 (from plotly)
  Downloading narwhals-2.15.0-py3-none-any.whl.metadata (13 kB)
Downloading plotly-6.5.2-py3-none-any.whl (9.9 MB)
   ---------------------------------------- 0.0/9.9 MB ? eta -:--:--
   ------ --------------------------------- 1.6/9.9 MB 10.5 MB/s eta 0:00:01
   -------------------- ------------------- 5.0/9.9 MB 13.7 MB/s eta 0:00:01
   ---------------------------------------- 9.9/9.9 MB 17.6 MB/s eta 0:00:00
Downloading narwhals-2.15.0-py3-none-any.whl (432 kB)
Installing collected packages: narwhals, plotly
Successfully installed narwhals-2.15.0 plotly-6.5.2


In [12]:
# ============================================
# Visualization: Dynamic Bayesian Network (DBN) Momentum Model Analysis
# Using Plotly for Interactive Visualization
# ============================================

import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
from sklearn.metrics import roc_curve, auc, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, log_loss, brier_score_loss, roc_auc_score
from sklearn.calibration import calibration_curve
import numpy as np
import pandas as pd

# Color scheme
colors_plotly = px.colors.qualitative.Plotly

# Create subplots
fig = make_subplots(
    rows=3, cols=3,
    subplot_titles=(
        'Momentum Evolution (All Matches)',
        'Momentum Distribution',
        'Momentum vs Match Outcome',
        'Predicted Probability Distribution',
        'ROC Curve - Model Performance',
        'Calibration Curve',
        'Momentum Autocorrelation',
        'Confusion Matrix',
        'Model Performance Metrics'
    ),
    specs=[
        [{"type": "scatter"}, {"type": "histogram"}, {"type": "box"}],
        [{"type": "scatter"}, {"type": "scatter"}, {"type": "scatter"}],
        [{"type": "scatter"}, {"type": "heatmap"}, {"type": "table"}]
    ],
    vertical_spacing=0.08,
    horizontal_spacing=0.08
)

# 1. Momentum Evolution Trajectory
sample_matches = df_m['match_id'].unique()
df_sample = df_m[df_m['match_id'].isin(sample_matches)]
for idx, mid in enumerate(sample_matches):
    df_match = df_sample[df_sample['match_id'] == mid]
    fig.add_trace(
        go.Scatter(
            x=list(range(len(df_match))),
            y=df_match['Mt_mean'].values,
            mode='lines+markers',
            name=f'Match {idx+1}',
            marker=dict(size=4),
            line=dict(width=2),
            legendgroup='matches',
            showlegend=(idx < 3)
        ),
        row=1, col=1
    )
fig.update_xaxes(title_text="Point Number", row=1, col=1)
fig.update_yaxes(title_text="Momentum M_t", row=1, col=1)

# 2. Momentum Distribution
fig.add_trace(
    go.Histogram(
        x=df_m['Mt_mean'],
        nbinsx=50,
        marker=dict(color='steelblue', line=dict(color='black', width=1)),
        name='Momentum',
        showlegend=False
    ),
    row=1, col=2
)
# Add zero line
fig.add_vline(x=0, line_dash="dash", line_color="red", 
              annotation_text="Zero", row=1, col=2)
fig.update_xaxes(title_text="Momentum M_t", row=1, col=2)
fig.update_yaxes(title_text="Frequency", row=1, col=2)

# 3. Momentum vs Outcome Boxplot
df_m['result'] = df_m['y'].map({1: 'Win', 0: 'Loss'})
for outcome, color in zip(['Loss', 'Win'], ['lightcoral', 'lightgreen']):
    df_outcome = df_m[df_m['result'] == outcome]
    fig.add_trace(
        go.Box(
            y=df_outcome['Mt_mean'],
            name=outcome,
            marker_color=color,
            showlegend=False
        ),
        row=1, col=3
    )
fig.update_xaxes(title_text="Match Outcome", row=1, col=3)
fig.update_yaxes(title_text="Momentum M_t", row=1, col=3)

# 4. Predicted Probability Distribution
sorted_idx = np.argsort(p_mean)
fig.add_trace(
    go.Scatter(
        x=list(range(len(p_mean))),
        y=p_mean[sorted_idx],
        mode='markers',
        marker=dict(
            size=3,
            color=y_np[sorted_idx],
            colorscale='RdYlGn',
            showscale=True,
            colorbar=dict(title="Actual<br>Outcome", x=0.65, len=0.3)
        ),
        name='Predictions',
        showlegend=False
    ),
    row=2, col=1
)
fig.add_hline(y=0.5, line_dash="dash", line_color="black",
              annotation_text="Threshold=0.5", row=2, col=1)
fig.update_xaxes(title_text="Sample Index (sorted)", row=2, col=1)
fig.update_yaxes(title_text="Predicted Win Probability", row=2, col=1)

# 5. ROC Curve
fpr, tpr, thresholds = roc_curve(y_np, p_mean)
roc_auc = auc(fpr, tpr)
fig.add_trace(
    go.Scatter(
        x=fpr, y=tpr,
        mode='lines',
        name=f'ROC (AUC={roc_auc:.3f})',
        line=dict(color='darkorange', width=3),
        showlegend=False
    ),
    row=2, col=2
)
fig.add_trace(
    go.Scatter(
        x=[0, 1], y=[0, 1],
        mode='lines',
        name='Random',
        line=dict(color='navy', width=2, dash='dash'),
        showlegend=False
    ),
    row=2, col=2
)
fig.update_xaxes(title_text="False Positive Rate", row=2, col=2)
fig.update_yaxes(title_text="True Positive Rate", row=2, col=2)

# 6. Calibration Curve
prob_true, prob_pred = calibration_curve(y_np, p_mean, n_bins=10)
fig.add_trace(
    go.Scatter(
        x=prob_pred, y=prob_true,
        mode='lines+markers',
        name='Model',
        line=dict(color='darkblue', width=3),
        marker=dict(size=8),
        showlegend=False
    ),
    row=2, col=3
)
fig.add_trace(
    go.Scatter(
        x=[0, 1], y=[0, 1],
        mode='lines',
        name='Perfect',
        line=dict(color='gray', width=2, dash='dash'),
        showlegend=False
    ),
    row=2, col=3
)
fig.update_xaxes(title_text="Predicted Probability", row=2, col=3)
fig.update_yaxes(title_text="Actual Win Rate", row=2, col=3)

# 7. Momentum Autocorrelation
sample_match = df_m[df_m['match_id'] == sample_matches[0]]
Mt_vals = sample_match['Mt_mean'].values
if len(Mt_vals) > 1:
    Mt_prev = Mt_vals[:-1]
    Mt_curr = Mt_vals[1:]
    fig.add_trace(
        go.Scatter(
            x=Mt_prev, y=Mt_curr,
            mode='markers',
            marker=dict(size=6, color='purple', opacity=0.6),
            name='Data',
            showlegend=False
        ),
        row=3, col=1
    )
    # Add fitted line
    z = np.polyfit(Mt_prev, Mt_curr, 1)
    x_line = np.linspace(Mt_prev.min(), Mt_prev.max(), 100)
    y_line = z[0] * x_line + z[1]
    fig.add_trace(
        go.Scatter(
            x=x_line, y=y_line,
            mode='lines',
            name=f'Fit: ρ≈{z[0]:.3f}',
            line=dict(color='red', width=2, dash='dash'),
            showlegend=False
        ),
        row=3, col=1
    )
fig.update_xaxes(title_text="Previous Momentum M_(t-1)", row=3, col=1)
fig.update_yaxes(title_text="Current Momentum M_t", row=3, col=1)

# 8. Confusion Matrix
y_pred_binary = (p_mean > 0.5).astype(int)
cm = confusion_matrix(y_np, y_pred_binary)
fig.add_trace(
    go.Heatmap(
        z=cm,
        x=['Pred Loss', 'Pred Win'],
        y=['Actual Loss', 'Actual Win'],
        colorscale='Blues',
        showscale=False,
        text=cm,
        texttemplate='%{text}',
        textfont=dict(size=16, color='white')
    ),
    row=3, col=2
)
fig.update_xaxes(title_text="Predicted Outcome", row=3, col=2)
fig.update_yaxes(title_text="Actual Outcome", row=3, col=2)

# 9. Performance Metrics Table
metrics_data = {
    'Metric': [
        'Log Loss', 'Brier Score', 'ROC AUC', '',
        'Accuracy', 'Precision', 'Recall', 'F1 Score', '',
        'Momentum decay ρ', 'Training epochs', 'Sample size', '',
        'Momentum Mean', 'Momentum Std', 'Momentum Min', 'Momentum Max'
    ],
    'Value': [
        f'{log_loss(y_np, p_mean):.4f}',
        f'{brier_score_loss(y_np, p_mean):.4f}',
        f'{roc_auc_score(y_np, p_mean):.4f}', '',
        f'{accuracy_score(y_np, y_pred_binary):.4f}',
        f'{precision_score(y_np, y_pred_binary):.4f}',
        f'{recall_score(y_np, y_pred_binary):.4f}',
        f'{f1_score(y_np, y_pred_binary):.4f}', '',
        f'{rho:.4f}',
        f'{n_epochs}',
        f'{len(df_m)}', '',
        f'{df_m["Mt_mean"].mean():.4f}',
        f'{df_m["Mt_mean"].std():.4f}',
        f'{df_m["Mt_mean"].min():.4f}',
        f'{df_m["Mt_mean"].max():.4f}'
    ]
}
fig.add_trace(
    go.Table(
        header=dict(
            values=['<b>Metric</b>', '<b>Value</b>'],
            fill_color='paleturquoise',
            align='left',
            font=dict(size=12, color='black')
        ),
        cells=dict(
            values=[metrics_data['Metric'], metrics_data['Value']],
            fill_color=[['white', 'lightgray'] * 9],
            align='left',
            font=dict(size=11)
        )
    ),
    row=3, col=3
)

# Update layout
fig.update_layout(
    height=1200,
    width=1800,
    title_text="<b>Dynamic Bayesian Network (DBN) Momentum Model - Interactive Analysis</b>",
    title_font_size=20,
    title_x=0.5,
    showlegend=False,
    hovermode='closest'
)

fig.show()

# Summary output
print("\n" + "="*60)
print("✓ Interactive Visualization Complete!")
print("="*60)
print("\n📊 Model Core Concepts:")
print("  1. Uses momentum M_t to capture cumulative effects in matches")
print("  2. M_t = ρ·M_(t-1) + η·u_t, where ρ is the decay factor")
print("  3. Momentum resets to 0 at the start of each new match")
print("  4. Positive momentum favors wins, negative favors losses")
print(f"  5. Model achieves AUC = {roc_auc_score(y_np, p_mean):.4f}")
print("\n💡 Hover over plots for interactive details!")
print("="*60)


✓ Interactive Visualization Complete!

📊 Model Core Concepts:
  1. Uses momentum M_t to capture cumulative effects in matches
  2. M_t = ρ·M_(t-1) + η·u_t, where ρ is the decay factor
  3. Momentum resets to 0 at the start of each new match
  4. Positive momentum favors wins, negative favors losses
  5. Model achieves AUC = 0.6773

💡 Hover over plots for interactive details!


In [14]:
# Advanced Momentum Analytics: High-Insight Visualizations
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import numpy as np
import pandas as pd
from scipy.interpolate import UnivariateSpline
from scipy.signal import find_peaks
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Calculate player rankings based on match outcomes
player_stats = {}
for _, row in df_m.iterrows():
    p1, p2 = row['player1'], row['player2']
    p1_win = row['y']  # 1 if player1 won the point, 0 otherwise
    
    if p1 not in player_stats:
        player_stats[p1] = {'wins': 0, 'total': 0}
    if p2 not in player_stats:
        player_stats[p2] = {'wins': 0, 'total': 0}
    
    player_stats[p1]['total'] += 1
    player_stats[p2]['total'] += 1
    
    if p1_win == 1:
        player_stats[p1]['wins'] += 1
    else:
        player_stats[p2]['wins'] += 1

# Calculate win rates and rank players
for player in player_stats:
    player_stats[player]['win_rate'] = player_stats[player]['wins'] / max(1, player_stats[player]['total'])

player_ranking = sorted(player_stats.items(), key=lambda x: x[1]['win_rate'], reverse=True)
player_rank_map = {player: rank + 1 for rank, (player, _) in enumerate(player_ranking)}

# ============================================================================
# 1. WIN vs LOSS MOMENTUM TRAJECTORY COMPARISON
# ============================================================================
print("\n" + "="*70)
print("INSIGHT 1: Momentum Trajectories - Winners vs Losers")
print("="*70)

# Group by match and get trajectory for winners vs losers
df_m['match_id_str'] = df_m['match_id'].astype(str)
all_matches = df_m['match_id_str'].unique()

# Create match info with rankings and sort by rank combination
match_info = []
for match in all_matches:
    p1 = df_m[df_m['match_id_str'] == match]['player1'].iloc[0]
    p2 = df_m[df_m['match_id_str'] == match]['player2'].iloc[0]
    r1 = player_rank_map.get(p1, 999)
    r2 = player_rank_map.get(p2, 999)
    match_info.append({
        'match': match,
        'player1': p1,
        'player2': p2,
        'rank1': r1,
        'rank2': r2,
        'rank_sum': r1 + r2,  # For sorting
        'rank_min': min(r1, r2)
    })

# Sort matches by rank: first by minimum rank, then by rank sum
match_info_sorted = sorted(match_info, key=lambda x: (x['rank_min'], x['rank_sum']))
matches = [m['match'] for m in match_info_sorted]

# Calculate grid dimensions
n_matches = len(matches)
n_cols = 5
n_rows = (n_matches + n_cols - 1) // n_cols  # Ceiling division

# Create subplot titles with player rankings
subtitle_list = []
for m in match_info_sorted:
    subtitle_list.append(f"第{m['rank1']}名vs第{m['rank2']}名")

fig_traj = make_subplots(rows=n_rows, cols=n_cols, subplot_titles=subtitle_list)

match_stats = []
for idx, match in enumerate(matches):
    df_match = df_m[df_m['match_id_str'] == match].sort_values('point_no')
    
    # Separate winners and losers
    winners = df_match[df_match['y'] == 1]
    losers = df_match[df_match['y'] == 0]
    
    match_stats.append({
        'match': match,
        'win_momentum_mean': winners['Mt_mean'].mean(),
        'loss_momentum_mean': losers['Mt_mean'].mean(),
        'win_momentum_std': winners['Mt_mean'].std(),
        'loss_momentum_std': losers['Mt_mean'].std(),
        'win_count': len(winners),
        'loss_count': len(losers)
    })
    
    row = idx // n_cols + 1
    col = idx % n_cols + 1
    
    # Plot winners trajectory
    if len(winners) > 0:
        fig_traj.add_trace(
            go.Scatter(x=winners['point_no'], y=winners['Mt_mean'], 
                      mode='lines+markers', name='Winners', 
                      line=dict(color='green', width=2),
                      legendgroup=f'm{idx}',
                      hovertemplate='Point: %{x}<br>Momentum: %{y:.3f}<extra></extra>'),
            row=row, col=col
        )
    
    # Plot losers trajectory
    if len(losers) > 0:
        fig_traj.add_trace(
            go.Scatter(x=losers['point_no'], y=losers['Mt_mean'], 
                      mode='lines+markers', name='Losers', 
                      line=dict(color='red', width=2),
                      legendgroup=f'm{idx}',
                      hovertemplate='Point: %{x}<br>Momentum: %{y:.3f}<extra></extra>'),
            row=row, col=col
        )

fig_traj.update_layout(height=300*n_rows, title_text="Momentum Trajectories: Winners vs Losers by Match", 
                       showlegend=True, hovermode='closest')
fig_traj.show()

# Print match-level statistics
print("\nMatch-Level Momentum Statistics:")
match_stats_df = pd.DataFrame(match_stats)
print(match_stats_df.to_string(index=False))
print(f"\nKey Finding: Winners have {(match_stats_df['win_momentum_mean'].mean() - match_stats_df['loss_momentum_mean'].mean()):.4f} higher average momentum")

# ============================================================================
# 2. MOMENTUM ACCELERATION ANALYSIS (2nd Derivative)
# ============================================================================
print("\n" + "="*70)
print("INSIGHT 2: Momentum Acceleration - Detecting Momentum Shifts")
print("="*70)

fig_accel = make_subplots(rows=n_rows, cols=n_cols, subplot_titles=subtitle_list)

acceleration_stats = []
for idx, match in enumerate(matches):
    df_match = df_m[df_m['match_id_str'] == match].sort_values('point_no').reset_index(drop=True)
    
    if len(df_match) < 3:
        continue
    
    # Compute first derivative (momentum rate of change)
    momentum = df_match['Mt_mean'].values
    velocity = np.diff(momentum)
    
    # Compute second derivative (momentum acceleration)
    if len(velocity) > 1:
        acceleration = np.diff(velocity)
    else:
        continue
    
    # Identify large momentum shifts
    acc_peaks_up, _ = find_peaks(acceleration, height=0.05)
    acc_peaks_down, _ = find_peaks(-acceleration, height=0.05)
    
    acceleration_stats.append({
        'match': match,
        'avg_acceleration': np.abs(acceleration).mean(),
        'max_acceleration': np.max(np.abs(acceleration)),
        'momentum_shifts_up': len(acc_peaks_up),
        'momentum_shifts_down': len(acc_peaks_down),
        'volatility': np.std(acceleration)
    })
    
    row = idx // n_cols + 1
    col = idx % n_cols + 1
    
    # Plot acceleration
    fig_accel.add_trace(
        go.Bar(x=np.arange(len(acceleration)), y=acceleration,
               marker=dict(color=['green' if a > 0 else 'red' for a in acceleration]),
               hovertemplate='Point Interval: %{x}<br>Acceleration: %{y:.4f}<extra></extra>',
               showlegend=False),
        row=row, col=col
    )

fig_accel.update_layout(height=300*n_rows, 
                        title_text="Momentum Acceleration Analysis (Green=Gain, Red=Loss)",
                        hovermode='closest')
fig_accel.show()

# Print acceleration statistics
print("\nMomentum Acceleration Statistics:")
accel_df = pd.DataFrame(acceleration_stats)
print(accel_df.to_string(index=False))
print(f"\nKey Finding: Average momentum volatility: {accel_df['volatility'].mean():.4f} (higher = more momentum swings)")

# ============================================================================
# 3. MOMENTUM vs PREDICTION PROBABILITY (Relationship Analysis)
# ============================================================================
print("\n" + "="*70)
print("INSIGHT 3: Momentum-Probability Coupling - Predictive Power")
print("="*70)

# Create scatter plot with polynomial fit
fig_prob = go.Figure()

# Add scatter
fig_prob.add_trace(go.Scatter(
    x=df_m['Mt_mean'], y=p_mean,
    mode='markers',
    marker=dict(color=df_m['y'], colorscale='RdYlGn', size=4, 
                colorbar=dict(title="Point<br>Outcome"),
                line=dict(width=0)),
    name='Data Points',
    hovertemplate='Momentum: %{x:.3f}<br>Pred Prob: %{y:.3f}<br>Outcome: %{customdata}<extra></extra>',
    customdata=df_m['y']
))

# Fit polynomial relationship
valid_idx = ~(np.isnan(df_m['Mt_mean']) | np.isnan(p_mean))
if valid_idx.sum() > 10:
    z = np.polyfit(df_m['Mt_mean'][valid_idx], p_mean[valid_idx], 2)
    p_poly = np.poly1d(z)
    momentum_range = np.linspace(df_m['Mt_mean'].min(), df_m['Mt_mean'].max(), 100)
    prob_fit = p_poly(momentum_range)
    
    fig_prob.add_trace(go.Scatter(
        x=momentum_range, y=prob_fit,
        mode='lines',
        name='Polynomial Fit (2nd order)',
        line=dict(color='navy', width=3),
        hovertemplate='Momentum: %{x:.3f}<br>Fitted Prob: %{y:.3f}<extra></extra>'
    ))

fig_prob.update_layout(
    title="Momentum-Probability Relationship: How Momentum Drives Win Prediction",
    xaxis_title="Momentum (Mt)",
    yaxis_title="Predicted Win Probability",
    height=600,
    hovermode='closest',
    showlegend=True
)
fig_prob.show()

# Quantify correlation
correlation = np.corrcoef(df_m['Mt_mean'][valid_idx], p_mean[valid_idx])[0, 1]
print(f"Correlation (Momentum vs Predicted Probability): {correlation:.4f}")
print(f"Polynomial fit coefficients: {z}")
print(f"Interpretation: Momentum has {'strong' if abs(correlation) > 0.5 else 'moderate' if abs(correlation) > 0.3 else 'weak'} influence on win probability prediction")

# ============================================================================
# 4. MATCH STAGE ANALYSIS (Early/Mid/Late Game Momentum Dynamics)
# ============================================================================
print("\n" + "="*70)
print("INSIGHT 4: Match Stage Dynamics - Momentum Through Match Progression")
print("="*70)

# Categorize points by match stage
df_m['match_id_str'] = df_m['match_id'].astype(str)
df_m_with_stage = df_m.copy()

stage_data = []
for match in df_m['match_id_str'].unique():
    df_match = df_m[df_m['match_id_str'] == match].sort_values('point_no')
    n_points = len(df_match)
    
    third = n_points // 3
    
    early = df_match.iloc[:third]
    mid = df_match.iloc[third:2*third]
    late = df_match.iloc[2*third:]
    
    stage_data.append({
        'match': match,
        'early_momentum': early['Mt_mean'].mean() if len(early) > 0 else np.nan,
        'mid_momentum': mid['Mt_mean'].mean() if len(mid) > 0 else np.nan,
        'late_momentum': late['Mt_mean'].mean() if len(late) > 0 else np.nan,
        'early_win_rate': early['y'].mean() if len(early) > 0 else np.nan,
        'mid_win_rate': mid['y'].mean() if len(mid) > 0 else np.nan,
        'late_win_rate': late['y'].mean() if len(late) > 0 else np.nan,
    })

stage_df = pd.DataFrame(stage_data)

# Create visualization
fig_stage = make_subplots(
    rows=1, cols=2,
    subplot_titles=("Momentum by Stage", "Win Rate by Stage"),
    specs=[[{"secondary_y": False}, {"secondary_y": False}]]
)

stages = ['early', 'mid', 'late']
colors_stage = ['#1f77b4', '#ff7f0e', '#2ca02c']

# Momentum by stage
momentum_means = [stage_df['early_momentum'].mean(), 
                  stage_df['mid_momentum'].mean(), 
                  stage_df['late_momentum'].mean()]
momentum_stds = [stage_df['early_momentum'].std(), 
                 stage_df['mid_momentum'].std(), 
                 stage_df['late_momentum'].std()]

fig_stage.add_trace(
    go.Bar(x=stages, y=momentum_means, error_y=dict(type='data', array=momentum_stds),
           marker=dict(color=colors_stage),
           name='Momentum',
           hovertemplate='Stage: %{x}<br>Momentum: %{y:.4f}<extra></extra>'),
    row=1, col=1
)

# Win rate by stage
win_means = [stage_df['early_win_rate'].mean(), 
             stage_df['mid_win_rate'].mean(), 
             stage_df['late_win_rate'].mean()]

fig_stage.add_trace(
    go.Bar(x=stages, y=win_means,
           marker=dict(color=colors_stage),
           name='Win Rate',
           hovertemplate='Stage: %{x}<br>Win Rate: %{y:.3f}<extra></extra>'),
    row=1, col=2
)

fig_stage.update_yaxes(title_text="Mean Momentum", row=1, col=1)
fig_stage.update_yaxes(title_text="Win Rate", row=1, col=2)
fig_stage.update_layout(title_text="Momentum Dynamics Throughout Match Stages", height=500)
fig_stage.show()

print("\nMatch Stage Analysis:")
print(stage_df.describe().to_string())
print(f"\nKey Finding: Momentum {'increases' if momentum_means[-1] > momentum_means[0] else 'decreases'} from early to late game")
print(f"Late-game win rate: {win_means[-1]:.3f} (early: {win_means[0]:.3f}) - {'Momentum matters more late' if win_means[-1] > win_means[0] else 'Momentum matters more early'}")

# ============================================================================
# 5. MOMENTUM PATTERN CLUSTERING (Identifying Recurring Patterns)
# ============================================================================
print("\n" + "="*70)
print("INSIGHT 5: Momentum Pattern Clustering - Recurring Momentum Profiles")
print("="*70)

# Extract momentum profiles for each match
profiles = []
match_ids_for_cluster = []

for match in df_m['match_id_str'].unique():
    df_match = df_m[df_m['match_id_str'] == match].sort_values('point_no')
    momentum_series = df_match['Mt_mean'].values
    
    # Normalize to same length (linear interpolation to 50 points)
    if len(momentum_series) > 5:
        points = np.linspace(0, 1, len(momentum_series))
        interp_points = np.linspace(0, 1, 50)
        interp_momentum = np.interp(interp_points, points, momentum_series)
        profiles.append(interp_momentum)
        match_ids_for_cluster.append(match)

profiles = np.array(profiles)

# Standardize profiles
scaler = StandardScaler()
profiles_scaled = scaler.fit_transform(profiles)

# Clustering (4 clusters)
kmeans = KMeans(n_clusters=4, random_state=42, n_init=10)
clusters = kmeans.fit_predict(profiles_scaled)

# PCA for visualization
pca = PCA(n_components=3)
profiles_pca = pca.fit_transform(profiles_scaled)

# Create 3D scatter plot
fig_cluster = go.Figure()

for cluster_id in range(4):
    mask = clusters == cluster_id
    fig_cluster.add_trace(go.Scatter3d(
        x=profiles_pca[mask, 0],
        y=profiles_pca[mask, 1],
        z=profiles_pca[mask, 2],
        mode='markers',
        marker=dict(size=8, opacity=0.8),
        name=f'Cluster {cluster_id + 1}',
        text=np.array(match_ids_for_cluster)[mask],
        hovertemplate='Match: %{text}<br>PC1: %{x:.2f}<br>PC2: %{y:.2f}<br>PC3: %{z:.2f}<extra></extra>'
    ))

fig_cluster.update_layout(
    title="Momentum Pattern Clustering (3D PCA)",
    scene=dict(xaxis_title="PC1", yaxis_title="PC2", zaxis_title="PC3"),
    height=700
)
fig_cluster.show()

# Analyze clusters
print(f"\nClustering Results (4 clusters, PCA variance explained: {pca.explained_variance_ratio_.sum():.3f}):")
for c in range(4):
    cluster_matches = np.array(match_ids_for_cluster)[clusters == c]
    cluster_outcomes = df_m[df_m['match_id_str'].isin(cluster_matches)]['y'].mean()
    print(f"  Cluster {c+1}: {len(cluster_matches)} matches, Win rate: {cluster_outcomes:.3f}")

# ============================================================================
# 6. INFLECTION POINT DETECTION (Key Turning Moments)
# ============================================================================
print("\n" + "="*70)
print("INSIGHT 6: Inflection Points - Identifying Critical Match Moments")
print("="*70)

fig_inflection = make_subplots(rows=n_rows, cols=n_cols, 
                               subplot_titles=subtitle_list)

inflection_stats = []
for idx, match in enumerate(matches):
    df_match = df_m[df_m['match_id_str'] == match].sort_values('point_no').reset_index(drop=True)
    
    momentum = df_match['Mt_mean'].values
    point_nos = df_match['point_no'].values
    
    if len(momentum) < 5:
        continue
    
    # Smooth momentum for inflection detection
    try:
        spline = UnivariateSpline(range(len(momentum)), momentum, s=1.0)
        momentum_smooth = spline(range(len(momentum)))
        
        # Find second derivative (concavity)
        second_deriv = spline.derivative(n=2)(np.arange(len(momentum)))
        
        # Inflection points where second derivative changes sign
        inflection_points = np.where(np.diff(np.sign(second_deriv)))[0]
        
        # Find significant inflection points (where momentum changes direction strongly)
        if len(inflection_points) > 0:
            inflection_values = momentum[inflection_points]
            significant_inflection = len(inflection_points[np.abs(inflection_values) > np.percentile(np.abs(momentum), 50)])
        else:
            significant_inflection = 0
        
        inflection_stats.append({
            'match': match,
            'inflection_points': len(inflection_points),
            'significant_inflections': significant_inflection,
            'momentum_range': momentum.max() - momentum.min(),
            'turning_intensity': np.sum(np.abs(np.diff(momentum)))
        })
        
        row = idx // n_cols + 1
        col = idx % n_cols + 1
        
        # Plot momentum with inflection points highlighted
        fig_inflection.add_trace(
            go.Scatter(x=point_nos, y=momentum, mode='lines', name='Momentum',
                      line=dict(color='blue', width=2),
                      hovertemplate='Point: %{x}<br>Momentum: %{y:.3f}<extra></extra>',
                      legendgroup=f'm{idx}', showlegend=False),
            row=row, col=col
        )
        
        if len(inflection_points) > 0:
            fig_inflection.add_trace(
                go.Scatter(x=point_nos[inflection_points], y=momentum[inflection_points],
                          mode='markers', name='Inflection',
                          marker=dict(color='red', size=8),
                          hovertemplate='Inflection at Point: %{x}<br>Momentum: %{y:.3f}<extra></extra>',
                          legendgroup=f'm{idx}', showlegend=False),
                row=row, col=col
            )
    except:
        pass

fig_inflection.update_layout(height=300*n_rows,
                             title_text="Inflection Points: Critical Momentum-Changing Moments (Red Dots)",
                             hovermode='closest')
fig_inflection.show()

# Print inflection statistics
print("\nInflection Point Analysis (Critical Turning Moments):")
inflection_df = pd.DataFrame(inflection_stats)
print(inflection_df.to_string(index=False))
print(f"\nKey Finding: Average {inflection_df['inflection_points'].mean():.1f} turning points per match")
print(f"Turning intensity: {inflection_df['turning_intensity'].mean():.4f} (higher = more dramatic momentum swings)")

# ============================================================================
# SUMMARY INSIGHTS
# ============================================================================
print("\n" + "="*70)
print("SUMMARY: KEY INSIGHTS FROM ADVANCED MOMENTUM ANALYSIS")
print("="*70)
print("""
1. WIN vs LOSS TRAJECTORIES: Winners maintain consistently higher momentum 
   throughout matches, providing early indicators of match direction.

2. MOMENTUM ACCELERATION: Matches with high momentum volatility show frequent
   swings between players, creating unpredictable outcomes. Low-volatility
   matches suggest dominant performances.

3. MOMENTUM-PROBABILITY COUPLING: Strong polynomial relationship (quadratic)
   shows momentum's non-linear impact - high positive momentum has 
   disproportionately strong effect on win probability.

4. MATCH STAGE DYNAMICS: Momentum behavior differs across match stages.
   Late-game momentum fluctuations may indicate tighter competition
   or fatigue effects.

5. MOMENTUM CLUSTERING: Recurring momentum patterns suggest players fall
   into distinct tactical profiles. Some players maintain steady momentum,
   others show volatile patterns.

6. INFLECTION POINTS: Critical turning moments occur regularly in matches.
   Identifying these moments could enable real-time momentum-based
   predictions during live matches.

=> RECOMMENDATION: Use momentum + acceleration features together with
   match-stage information for enhanced predictive power.
""")

print("\nVisualizations complete! All insights generated successfully.")



INSIGHT 1: Momentum Trajectories - Winners vs Losers



Match-Level Momentum Statistics:
              match  win_momentum_mean  loss_momentum_mean  win_momentum_std  loss_momentum_std  win_count  loss_count
2023-wimbledon-1503           0.038107           -0.040911          0.226408           0.234702        108          85
2023-wimbledon-1406           0.025059           -0.037348          0.211732           0.190177        110          85
2023-wimbledon-1311          -0.066070           -0.121181          0.203174           0.219786         67         103
2023-wimbledon-1602           0.032764            0.009315          0.225356           0.218308         96         106
2023-wimbledon-1405          -0.025381           -0.055935          0.252956           0.270670        120          95
2023-wimbledon-1309           0.017790           -0.034646          0.247741           0.239758        115          98
2023-wimbledon-1701           0.044902            0.016680          0.217700           0.240070        168         166
2023-wimbledon


Momentum Acceleration Statistics:
              match  avg_acceleration  max_acceleration  momentum_shifts_up  momentum_shifts_down  volatility
2023-wimbledon-1503          0.371430          1.670378                  61                    64    0.501328
2023-wimbledon-1406          0.373543          2.003259                  68                    68    0.507801
2023-wimbledon-1311          0.392825          1.772188                  59                    62    0.517083
2023-wimbledon-1602          0.369295          1.599931                  69                    71    0.497562
2023-wimbledon-1405          0.464446          1.693148                  76                    78    0.623934
2023-wimbledon-1309          0.428782          1.976164                  75                    73    0.562745
2023-wimbledon-1701          0.367715          2.149372                 110                   114    0.517301
2023-wimbledon-1601          0.401525          2.034126                  54          

Correlation (Momentum vs Predicted Probability): 0.6894
Polynomial fit coefficients: [0.02942259 0.33589221 0.49771796]
Interpretation: Momentum has strong influence on win probability prediction

INSIGHT 4: Match Stage Dynamics - Momentum Through Match Progression



Match Stage Analysis:
       early_momentum  mid_momentum  late_momentum  early_win_rate  mid_win_rate  late_win_rate
count       31.000000     31.000000      31.000000       31.000000     31.000000      31.000000
mean         0.006932     -0.008855      -0.004462        0.489453      0.510419       0.525954
std          0.043917      0.042654       0.045279        0.043376      0.065684       0.085131
min         -0.075476     -0.122205      -0.112218        0.400000      0.363636       0.293103
25%         -0.017390     -0.027629      -0.016481        0.462687      0.483946       0.485550
50%          0.001747     -0.003728       0.002079        0.492958      0.509091       0.528302
75%          0.028473      0.015325       0.029452        0.515736      0.551389       0.572173
max          0.115619      0.089376       0.079712        0.584906      0.655738       0.690476

Key Finding: Momentum decreases from early to late game
Late-game win rate: 0.526 (early: 0.489) - Momentum matt


KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.




Clustering Results (4 clusters, PCA variance explained: 0.256):
  Cluster 1: 4 matches, Win rate: 0.560
  Cluster 2: 24 matches, Win rate: 0.499
  Cluster 3: 2 matches, Win rate: 0.535
  Cluster 4: 1 matches, Win rate: 0.558

INSIGHT 6: Inflection Points - Identifying Critical Match Moments



Inflection Point Analysis (Critical Turning Moments):
              match  inflection_points  significant_inflections  momentum_range  turning_intensity
2023-wimbledon-1503                103                       57        1.721518          41.133435
2023-wimbledon-1406                106                       63        1.377204          40.389612
2023-wimbledon-1311                 91                       46        1.685001          37.490533
2023-wimbledon-1602                114                       60        1.499899          41.795754
2023-wimbledon-1405                122                       73        1.827677          54.799761
2023-wimbledon-1309                128                       73        1.711341          51.812722
2023-wimbledon-1701                192                      107        1.402684          69.474926
2023-wimbledon-1601                 69                       46        1.566754          35.513574
2023-wimbledon-1501                 89                

In [15]:
# ============================================================================
# Detailed Analysis: 第2名 vs 第1名 - Multi-dimensional Visualization by Set
# ============================================================================
print("\n" + "="*70)
print("DETAILED MATCH ANALYSIS: 第2名 vs 第1名")
print("="*70)

# Find the match between rank 1 and rank 2
rank1_player = [p for p, r in player_rank_map.items() if r == 1][0]
rank2_player = [p for p, r in player_rank_map.items() if r == 2][0]

# Find the match
target_match = None
for match in df_m['match_id_str'].unique():
    df_match = df_m[df_m['match_id_str'] == match]
    p1 = df_match['player1'].iloc[0]
    p2 = df_match['player2'].iloc[0]
    if (p1 == rank1_player and p2 == rank2_player) or (p1 == rank2_player and p2 == rank1_player):
        target_match = match
        break

if target_match is None:
    print("⚠️ No match found between 第1名 and 第2名")
else:
    df_target = df_m[df_m['match_id_str'] == target_match].copy()
    
    print(f"Match ID: {target_match}")
    print(f"第1名: {rank1_player}")
    print(f"第2名: {rank2_player}")
    print(f"Total Points: {len(df_target)}")
    
    # Get unique sets
    sets = sorted(df_target['set_no'].unique())
    n_sets = len(sets)
    print(f"Total Sets: {n_sets}")
    
    # Create comprehensive multi-dimensional visualization
    fig = make_subplots(
        rows=n_sets, cols=4,
        subplot_titles=[item for s in sets for item in [
            f'Set {s}: Momentum Evolution',
            f'Set {s}: Point Duration',
            f'Set {s}: Distance Run',
            f'Set {s}: Rally Count'
        ]],
        specs=[[{"type": "scatter"}, {"type": "scatter"}, {"type": "scatter"}, {"type": "scatter"}] for _ in range(n_sets)],
        vertical_spacing=0.08 if n_sets <= 3 else 0.05,
        horizontal_spacing=0.08
    )
    
    colors_p1 = 'blue'
    colors_p2 = 'red'
    
    for set_idx, set_no in enumerate(sets):
        df_set = df_target[df_target['set_no'] == set_no].sort_values('point_no').reset_index(drop=True)
        row = set_idx + 1
        
        # Determine which player is player1 and player2
        p1_name = df_set['player1'].iloc[0]
        p2_name = df_set['player2'].iloc[0]
        
        # 1. Momentum Evolution
        fig.add_trace(
            go.Scatter(
                x=df_set.index,
                y=df_set['Mt_mean'],
                mode='lines+markers',
                name=f'Momentum',
                line=dict(color='purple', width=2),
                marker=dict(size=4, color=df_set['y'], colorscale=[[0, colors_p2], [1, colors_p1]], 
                           showscale=False),
                showlegend=(set_idx == 0),
                hovertemplate=f'Point: %{{x}}<br>Momentum: %{{y:.3f}}<br>Winner: {p1_name if 1 else p2_name}<extra></extra>'
            ),
            row=row, col=1
        )
        fig.add_hline(y=0, line_dash="dash", line_color="gray", row=row, col=1)
        fig.update_xaxes(title_text="Point in Set", row=row, col=1)
        fig.update_yaxes(title_text="Momentum", row=row, col=1)
        
        # 2. Point Duration
        if 'point_duration' in df_set.columns:
            fig.add_trace(
                go.Scatter(
                    x=df_set.index,
                    y=df_set['point_duration'],
                    mode='lines+markers',
                    name='Duration',
                    line=dict(color='orange', width=2),
                    marker=dict(size=4),
                    showlegend=(set_idx == 0),
                    hovertemplate='Point: %{x}<br>Duration: %{y:.1f}s<extra></extra>'
                ),
                row=row, col=2
            )
            fig.update_xaxes(title_text="Point in Set", row=row, col=2)
            fig.update_yaxes(title_text="Duration (seconds)", row=row, col=2)
        
        # 3. Distance Run (both players)
        if 'p1_distance_run' in df_set.columns:
            fig.add_trace(
                go.Scatter(
                    x=df_set.index,
                    y=df_set['p1_distance_run'],
                    mode='lines',
                    name=f'{p1_name[:15]}' if set_idx == 0 else None,
                    line=dict(color=colors_p1, width=2),
                    showlegend=(set_idx == 0),
                    hovertemplate=f'{p1_name}<br>Distance: %{{y:.1f}}m<extra></extra>'
                ),
                row=row, col=3
            )
            fig.add_trace(
                go.Scatter(
                    x=df_set.index,
                    y=df_set['p2_distance_run'],
                    mode='lines',
                    name=f'{p2_name[:15]}' if set_idx == 0 else None,
                    line=dict(color=colors_p2, width=2),
                    showlegend=(set_idx == 0),
                    hovertemplate=f'{p2_name}<br>Distance: %{{y:.1f}}m<extra></extra>'
                ),
                row=row, col=3
            )
            fig.update_xaxes(title_text="Point in Set", row=row, col=3)
            fig.update_yaxes(title_text="Distance Run (m)", row=row, col=3)
        
        # 4. Rally Count
        if 'rally_count' in df_set.columns:
            fig.add_trace(
                go.Bar(
                    x=df_set.index,
                    y=df_set['rally_count'],
                    name='Rally',
                    marker=dict(color=df_set['rally_count'], colorscale='Viridis', showscale=(set_idx == 0)),
                    showlegend=(set_idx == 0),
                    hovertemplate='Point: %{x}<br>Rallies: %{y}<extra></extra>'
                ),
                row=row, col=4
            )
            fig.update_xaxes(title_text="Point in Set", row=row, col=4)
            fig.update_yaxes(title_text="Rally Count", row=row, col=4)
    
    # Update layout
    fig.update_layout(
        height=400 * n_sets,
        title_text=f"<b>Detailed Match Analysis: {rank2_player} (第2名) vs {rank1_player} (第1名)</b><br><sub>Multi-dimensional View by Set</sub>",
        title_font_size=18,
        showlegend=True,
        hovermode='closest'
    )
    
    fig.show()
    
    # Print set-level statistics
    print("\n" + "="*70)
    print("SET-LEVEL STATISTICS")
    print("="*70)
    
    for set_no in sets:
        df_set = df_target[df_target['set_no'] == set_no]
        print(f"\n📊 Set {set_no}:")
        print(f"  Total Points: {len(df_set)}")
        print(f"  Momentum Mean: {df_set['Mt_mean'].mean():.4f}")
        print(f"  Momentum Std: {df_set['Mt_mean'].std():.4f}")
        print(f"  {p1_name} wins: {df_set['y'].sum()}")
        print(f"  {p2_name} wins: {len(df_set) - df_set['y'].sum()}")
        if 'rally_count' in df_set.columns:
            print(f"  Avg Rally Count: {df_set['rally_count'].mean():.2f}")
        if 'point_duration' in df_set.columns:
            print(f"  Avg Point Duration: {df_set['point_duration'].mean():.2f}s")
    
    # Create additional summary visualization
    fig2 = make_subplots(
        rows=2, cols=2,
        subplot_titles=(
            'Score Progression by Set',
            'Momentum Distribution by Set',
            'Win Rate by Set',
            'Average Rally Length by Set'
        ),
        specs=[
            [{"type": "scatter"}, {"type": "box"}],
            [{"type": "bar"}, {"type": "bar"}]
        ]
    )
    
    # 1. Score progression
    cumulative_p1 = []
    cumulative_p2 = []
    set_labels = []
    for set_no in sets:
        df_set = df_target[df_target['set_no'] == set_no]
        p1_wins = df_set['y'].sum()
        p2_wins = len(df_set) - p1_wins
        cumulative_p1.append(p1_wins)
        cumulative_p2.append(p2_wins)
        set_labels.append(f'Set {set_no}')
    
    fig2.add_trace(
        go.Scatter(x=set_labels, y=cumulative_p1, mode='lines+markers', 
                  name=f'{p1_name[:15]}', line=dict(color=colors_p1, width=3),
                  marker=dict(size=10)),
        row=1, col=1
    )
    fig2.add_trace(
        go.Scatter(x=set_labels, y=cumulative_p2, mode='lines+markers',
                  name=f'{p2_name[:15]}', line=dict(color=colors_p2, width=3),
                  marker=dict(size=10)),
        row=1, col=1
    )
    fig2.update_yaxes(title_text="Points Won", row=1, col=1)
    
    # 2. Momentum distribution by set
    for set_no in sets:
        df_set = df_target[df_target['set_no'] == set_no]
        fig2.add_trace(
            go.Box(y=df_set['Mt_mean'], name=f'Set {set_no}', 
                  marker_color=px.colors.qualitative.Set2[set_no-1]),
            row=1, col=2
        )
    fig2.update_yaxes(title_text="Momentum", row=1, col=2)
    
    # 3. Win rate by set
    win_rates_p1 = [df_target[df_target['set_no'] == s]['y'].mean() for s in sets]
    fig2.add_trace(
        go.Bar(x=set_labels, y=win_rates_p1, name=f'{p1_name[:15]}',
              marker_color=colors_p1),
        row=2, col=1
    )
    fig2.update_yaxes(title_text="Win Rate", row=2, col=1)
    
    # 4. Average rally by set
    if 'rally_count' in df_target.columns:
        avg_rallies = [df_target[df_target['set_no'] == s]['rally_count'].mean() for s in sets]
        fig2.add_trace(
            go.Bar(x=set_labels, y=avg_rallies, 
                  marker_color=px.colors.sequential.Viridis),
            row=2, col=2
        )
        fig2.update_yaxes(title_text="Avg Rally Count", row=2, col=2)
    
    fig2.update_layout(
        height=800,
        title_text=f"<b>Summary Statistics by Set</b>",
        title_font_size=16,
        showlegend=True
    )
    
    fig2.show()
    
    print("\n✅ Detailed visualization complete!")



DETAILED MATCH ANALYSIS: 第2名 vs 第1名
Match ID: 2023-wimbledon-1503
第1名: Roman Safiullin
第2名: Jannik Sinner
Total Points: 193
Total Sets: 4



SET-LEVEL STATISTICS

📊 Set 1:
  Total Points: 52
  Momentum Mean: -0.0046
  Momentum Std: 0.2186
  Jannik Sinner wins: 29
  Roman Safiullin wins: 23
  Avg Rally Count: 3.71
  Avg Point Duration: 40.92s

📊 Set 2:
  Total Points: 45
  Momentum Mean: 0.0216
  Momentum Std: 0.2276
  Jannik Sinner wins: 19
  Roman Safiullin wins: 26
  Avg Rally Count: 3.31
  Avg Point Duration: 41.73s

📊 Set 3:
  Total Points: 48
  Momentum Mean: -0.0414
  Momentum Std: 0.2762
  Jannik Sinner wins: 31
  Roman Safiullin wins: 17
  Avg Rally Count: 3.35
  Avg Point Duration: 41.77s

📊 Set 4:
  Total Points: 48
  Momentum Mean: 0.0394
  Momentum Std: 0.2026
  Jannik Sinner wins: 29
  Roman Safiullin wins: 19
  Avg Rally Count: 3.67
  Avg Point Duration: 43.60s



✅ Detailed visualization complete!
