# TabPFN增强版：性早熟预测模型

**使用TabPFN 6.0.6 + Extensions 0.2.2 进行全方位优化**

本笔记本展示了TabPFN的多种增强技术：
- 基础TabPFN模型（**支持50000样本**、**KV Cache加速**、**概率校准**）
- 超参数优化（HPO）
- Post-hoc集成（AutoTabPFN）
- SHAP可解释性分析
- **PDP部分依赖图分析**（新功能）
- **ShapIQ特征交互分析**（新功能）
- 特征选择优化
- 改进的采样策略

## 1. 导入必要的库

In [None]:
# Set environment variables before the scientific stack is imported.
# NOTE(review): SCIPY_ARRAY_API presumably has to be set before SciPy /
# scikit-learn are first imported to take effect — confirm.
import os

os.environ["SCIPY_ARRAY_API"] = "1"

import pandas as pd
import numpy as np
import warnings
import joblib

# Silence every warning category to keep the notebook output compact.
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.metrics import (
    roc_auc_score,
    roc_curve,
    f1_score,
    accuracy_score,
    precision_score,
    recall_score,
    confusion_matrix,
)
# The experimental opt-in import must come before IterativeImputer is imported.
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor

# TabPFN and its extension packages
from tabpfn import TabPFNClassifier
from tabpfn_extensions.hpo import TunedTabPFNClassifier
from tabpfn_extensions.post_hoc_ensembles.sklearn_interface import AutoTabPFNClassifier
from tabpfn_extensions import interpretability
from tabpfn_extensions.embedding import TabPFNEmbedding

import matplotlib.pyplot as plt
import seaborn as sns

# Use a CJK-capable font (SimHei) for the Chinese figure labels, and disable
# the Unicode minus sign so negative tick labels still render with that font.
plt.rcParams["font.sans-serif"] = ["SimHei"]
plt.rcParams["axes.unicode_minus"] = False

print("所有库导入完成")

所有库导入完成


## 2. 设置路径和参数

In [2]:
# Create every output directory the notebook writes to. The training cells
# dump their checkpoints into ./output/enhanced/models, so that directory has
# to exist before the first joblib.dump call; previously it was never created.
for output_dir in (
    "./output",
    "./output/models",
    "./output/enhanced",
    "./output/enhanced/models",
):
    os.makedirs(output_dir, exist_ok=True)

# Single seed reused by the train/validation split, the imputer and the models.
RANDOM_SEED = 825
np.random.seed(RANDOM_SEED)
# -1 asks joblib-backed estimators to use every available core.
N_JOBS = -1

print(f"随机种子设置: {RANDOM_SEED}")
print(f"输出目录: ./output/enhanced/")

随机种子设置: 825
输出目录: ./output/enhanced/


## 3. 读取数据

In [3]:
# Load the two cohorts and tag every row with its group label:
# "N" for the normal group, "Y" for the precocious-puberty group.
normal_data = pd.read_csv("./input/性早熟数据激发试验正常组_new.csv").assign(
    group="N"
)
disease_data = pd.read_csv("./input/激发试验确诊性早熟组数据_new.csv").assign(
    group="Y"
)

print(f"正常组: {normal_data.shape[0]} 行, 早熟组: {disease_data.shape[0]} 行")

正常组: 8970 行, 早熟组: 10654 行


## 4. 数据类型处理和合并

In [4]:
# Stack both cohorts row-wise under a fresh 0..n-1 index, then store the
# label column as a categorical.
data = pd.concat((normal_data, disease_data), axis=0, ignore_index=True)
data = data.astype({"group": "category"})
print(f"合并后数据: {data.shape[0]} 行 x {data.shape[1]} 列")
print(f"分组统计:\n{data['group'].value_counts()}")

合并后数据: 19624 行 x 40 列
分组统计:
group
Y    10654
N     8970
Name: count, dtype: int64


## 5. 划分训练集和验证集

In [5]:
# Hold out 30% of the rows for validation; stratifying on the label keeps the
# class ratio the same in both splits.
split_options = {
    "test_size": 0.3,
    "stratify": data["group"],
    "random_state": RANDOM_SEED,
}
train_data, validation_data = train_test_split(data, **split_options)

print(f"训练集: {train_data.shape[0]} 行, 验证集: {validation_data.shape[0]} 行")

训练集: 13736 行, 验证集: 5888 行


## 6. 特征工程

In [6]:
# Columns that are never model inputs: the label, the patient id and the
# leftover CSV index column.
exclude_cols = ["group", "患者编号", "Unnamed: 0"]
feature_cols = [
    column for column in train_data.columns if column not in exclude_cols
]

# Copy so later in-place edits do not touch train_data / validation_data.
X_train, y_train = train_data[feature_cols].copy(), train_data["group"].copy()
X_validation, y_validation = (
    validation_data[feature_cols].copy(),
    validation_data["group"].copy(),
)

# Binary target: 1 for group "Y", 0 otherwise.
y_train_binary = y_train.eq("Y").astype(int)
y_validation_binary = y_validation.eq("Y").astype(int)

print(f"使用 {len(feature_cols)} 个特征")
print(f"训练集正负样本: {y_train_binary.value_counts().to_dict()}")
print(f"验证集正负样本: {y_validation_binary.value_counts().to_dict()}")

使用 38 个特征
训练集正负样本: {1: 7457, 0: 6279}
验证集正负样本: {1: 3197, 0: 2691}


## 7. 数据类型转换

In [7]:
print("数据类型转换中...")

# Coerce every feature column to a numeric dtype; values that cannot be parsed
# become NaN and are filled in by the imputation step.
X_train = X_train.apply(pd.to_numeric, errors="coerce")
X_validation = X_validation.apply(pd.to_numeric, errors="coerce")

print(f"转换完成 - 训练集: {X_train.dtypes.value_counts().to_dict()}")

数据类型转换中...
转换完成 - 训练集: {dtype('float64'): 38}


## 8. 数据预处理（缺失值填充）

In [None]:
print("数据预处理开始...")

# MissForest-style imputation: IterativeImputer models each feature from the
# others with a small random forest, so non-linear relationships between
# features can inform the filled-in values.
imputer = IterativeImputer(
    estimator=RandomForestRegressor(
        n_estimators=10,
        max_depth=10,
        n_jobs=N_JOBS,
        random_state=RANDOM_SEED,
    ),
    max_iter=10,
    random_state=RANDOM_SEED,
    verbose=0,
)

print("使用MissForest方法（IterativeImputer + RandomForest）...")
# Fit on the training split only; the validation split is transformed with the
# already-fitted imputer.
X_train_processed = imputer.fit_transform(X_train)
X_validation_processed = imputer.transform(X_validation)

# Features without a single observed training value cannot be imputed.
# The mask must come from the RAW frame: the imputed array contains no NaN, so
# a mask computed on it is always all-True, and with its default settings the
# imputer has already dropped such columns, which would shift every feature
# name after the first dropped column.
valid_features = X_train.notna().any(axis=0).to_numpy()
if X_train_processed.shape[1] == len(feature_cols):
    # The imputer kept every input column, so drop the empty ones here.
    X_train_processed = X_train_processed[:, valid_features]
    X_validation_processed = X_validation_processed[:, valid_features]

# Feature names aligned with the columns of the processed arrays.
feature_cols_processed = [
    col for col, valid in zip(feature_cols, valid_features) if valid
]

# Fail loudly rather than attach wrong names to columns in SHAP / PDP later.
if X_train_processed.shape[1] != len(feature_cols_processed):
    raise ValueError(
        f"特征名与处理后的列数不一致: "
        f"{len(feature_cols_processed)} 个特征名 vs {X_train_processed.shape[1]} 列"
    )

print(f"预处理完成！")
print(f"  原始特征数: {X_train.shape[1]}")
print(f"  处理后特征数: {X_train_processed.shape[1]}")
print(f"  训练集样本: {X_train_processed.shape[0]}")
print(f"  验证集样本: {X_validation_processed.shape[0]}")
print(f"  缺失值: {np.isnan(X_train_processed).sum()} (应为0)")

数据预处理开始...
预处理完成！
  原始特征数: 38
  处理后特征数: 38
  训练集样本: 13736
  验证集样本: 5888
  缺失值: 0 (应为0)


---
# 模型训练与优化

## 9. 准备训练数据（分层采样）

In [9]:
# Per the original author's note, TabPFN v2.5 handles up to 50,000 training
# rows (raised from 10,000), so the full training split normally fits.
max_samples = 50000

if len(X_train_processed) <= max_samples:
    # Small enough: train on everything.
    X_train_sampled = X_train_processed
    y_train_sampled = y_train_binary.values
    print(f"使用{len(X_train_sampled)}个训练样本")
else:
    print(f"训练集样本数({len(X_train_processed)})超过{max_samples}，进行分层采样")

    # Draw one stratified subsample so the class balance is preserved.
    sss = StratifiedShuffleSplit(
        n_splits=1, train_size=max_samples, random_state=RANDOM_SEED
    )
    sample_idx, _ = next(sss.split(X_train_processed, y_train_binary))
    X_train_sampled = X_train_processed[sample_idx]
    y_train_sampled = y_train_binary.iloc[sample_idx].values

    print(f"采样后类别分布: {np.bincount(y_train_sampled)}")

print(f"训练样本: {len(X_train_sampled)}, 特征数: {X_train_sampled.shape[1]}")

使用13736个训练样本
训练样本: 13736, 特征数: 38


## 10. 模型1：基础TabPFN

In [None]:
print("=" * 70)
print("训练基础TabPFN模型")
print("=" * 70)
from tabpfn.inference_tuning import ClassifierTuningConfig

# Inference-time tuning: calibrate the softmax temperature and optimise the
# decision threshold.
basic_tuning_config = ClassifierTuningConfig(
    calibrate_temperature=True,
    tune_decision_thresholds=True,
)

# fit_mode="fit_with_cache" is intentionally left at its default here.
tabpfn_basic = TabPFNClassifier(
    n_estimators=32,
    device="cuda",
    random_state=RANDOM_SEED,
    eval_metric="f1",
    tuning_config=basic_tuning_config,
)

print(f"  - 训练样本数: {len(X_train_sampled)}")
print("\n开始训练...")

tabpfn_basic.fit(X_train_sampled, y_train_sampled)

# Hard labels feed F1 / accuracy; the positive-class probability feeds AUC.
y_pred_basic = tabpfn_basic.predict(X_validation_processed)
y_pred_proba_basic = tabpfn_basic.predict_proba(X_validation_processed)[:, 1]

auc_basic = roc_auc_score(y_validation_binary, y_pred_proba_basic)
f1_basic = f1_score(y_validation_binary, y_pred_basic)
acc_basic = accuracy_score(y_validation_binary, y_pred_basic)

print(f"\n基础TabPFN性能 (带概率校准):")
for metric_line in (
    f"  AUC: {auc_basic:.4f}",
    f"  F1:  {f1_basic:.4f}",
    f"  ACC: {acc_basic:.4f}",
):
    print(metric_line)

# Persist the model together with the fitted imputer so the pair can be
# reloaded without retraining.
basic_checkpoint = {"model": tabpfn_basic, "imputer": imputer}
joblib.dump(basic_checkpoint, "./output/enhanced/models/tabpfn_basic.pkl")
print(f"\n模型已保存: ./output/enhanced/models/tabpfn_basic.pkl")
# Results recorded from a previous run (with probability calibration):
#   AUC: 0.9645
#   F1:  0.9102
#   ACC: 0.8981

训练基础TabPFN模型
  - 训练样本数: 13736

开始训练...

基础TabPFN性能 (带概率校准):
  AUC: 0.9645
  F1:  0.9102
  ACC: 0.8981

模型已保存: ./output/enhanced/models/tabpfn_basic.pkl


## 11. 模型2：超参数优化（HPO）

使用TunedTabPFNClassifier自动搜索最优超参数

In [None]:
print("=" * 70)
print("训练HPO优化TabPFN模型")
print("=" * 70)

# Hyper-parameter search settings. The objective is F1; switch "metric" to
# "roc_auc" to optimise AUC instead.
hpo_settings = {
    "n_trials": 700,
    "metric": "f1",
    "device": "cuda",
    "random_state": RANDOM_SEED,
    "verbose": True,
}
tabpfn_hpo = TunedTabPFNClassifier(**hpo_settings)

print("开始超参数搜索...")
tabpfn_hpo.fit(X_train_sampled, y_train_sampled)

# Hard labels feed F1 / accuracy; the positive-class probability feeds AUC.
y_pred_hpo = tabpfn_hpo.predict(X_validation_processed)
y_pred_proba_hpo = tabpfn_hpo.predict_proba(X_validation_processed)[:, 1]

auc_hpo = roc_auc_score(y_validation_binary, y_pred_proba_hpo)
f1_hpo = f1_score(y_validation_binary, y_pred_hpo)
acc_hpo = accuracy_score(y_validation_binary, y_pred_hpo)

print(f"\nHPO优化TabPFN性能:")
for metric_line in (
    f"  AUC: {auc_hpo:.4f}",
    f"  F1:  {f1_hpo:.4f}",
    f"  ACC: {acc_hpo:.4f}",
):
    print(metric_line)

# Persist the tuned model together with the fitted imputer.
hpo_checkpoint = {"model": tabpfn_hpo, "imputer": imputer}
joblib.dump(hpo_checkpoint, "./output/enhanced/models/tabpfn_hpo.pkl")
print(f"\n模型已保存: ./output/enhanced/models/tabpfn_hpo.pkl")
# Results recorded from a previous run:
#   AUC: 0.9654
#   F1:  0.9108
#   ACC: 0.8989

 95%|█████████▌| 666/700 [2:58:07<07:49, 13.80s/trial, best loss: -0.8944669603215551]



 97%|█████████▋| 676/700 [3:01:27<11:42, 29.27s/trial, best loss: -0.8944669603215551]



 98%|█████████▊| 684/700 [3:03:38<05:01, 18.86s/trial, best loss: -0.8944669603215551]



 98%|█████████▊| 687/700 [3:04:06<02:57, 13.62s/trial, best loss: -0.8944669603215551]



 98%|█████████▊| 688/700 [3:04:06<01:55,  9.61s/trial, best loss: -0.8944669603215551]



 99%|█████████▊| 690/700 [3:04:40<02:27, 14.78s/trial, best loss: -0.8944669603215551]



 99%|█████████▉| 692/700 [3:04:59<01:43, 12.97s/trial, best loss: -0.8944669603215551]



100%|██████████| 700/700 [3:07:43<00:00, 16.09s/trial, best loss: -0.8944669603215551]

HPO优化TabPFN性能:
  AUC: 0.9654 (vs 基础: +0.0010)
  F1:  0.9108 (vs 基础: +0.0006)
  ACC: 0.8989 (vs 基础: +0.0008)

模型已保存: ./output/enhanced/models/tabpfn_hpo.pkl


## 12. 模型3：Post-hoc集成（AutoTabPFN）

自动训练和集成多个TabPFN配置

In [11]:
print("=" * 70)
print("训练Post-hoc集成TabPFN模型")
print("=" * 70)

# Check the GPU up front: this cell is configured for cuda:0 and is stopped
# immediately when CUDA is unavailable.
import torch

cuda_available = torch.cuda.is_available()
print(f"PyTorch CUDA 可用: {cuda_available}")
if not cuda_available:
    raise RuntimeError("CUDA 不可用，无法继续")
print(f"GPU 设备: {torch.cuda.get_device_name(0)}")
print(f"CUDA 设备数量: {torch.cuda.device_count()}")

# Report the installed Ray version.
import ray

print(f"Ray 版本: {ray.__version__}")


# Post-hoc ensemble configuration.
auto_settings = {
    "max_time": 18000,  # 5 h budget (12600 = 3.5 h was the alternative)
    "presets": "best_quality",
    "device": "cuda:0",
    # Allow training sets beyond the 10,000-sample pre-training limit.
    "ignore_pretraining_limits": True,
}
tabpfn_auto = AutoTabPFNClassifier(**auto_settings)

print("模型初始化成功")
print("使用设备: cuda:0")
print(f"训练样本数: {len(X_train_sampled)} (已启用ignore_pretraining_limits)")
print("开始自动集成训练...")
print("(这可能需要较长时间，正在训练多个模型配置...)")

tabpfn_auto.fit(X_train_sampled, y_train_sampled)

# Hard labels feed F1 / accuracy; the positive-class probability feeds AUC.
y_pred_auto = tabpfn_auto.predict(X_validation_processed)
y_pred_proba_auto = tabpfn_auto.predict_proba(X_validation_processed)[:, 1]

auc_auto = roc_auc_score(y_validation_binary, y_pred_proba_auto)
f1_auto = f1_score(y_validation_binary, y_pred_auto)
acc_auto = accuracy_score(y_validation_binary, y_pred_auto)

print(f"\nPost-hoc集成TabPFN性能:")
for metric_line in (
    f"  AUC: {auc_auto:.4f}",
    f"  F1:  {f1_auto:.4f}",
    f"  ACC: {acc_auto:.4f}",
):
    print(metric_line)

# Persist the ensemble together with the fitted imputer.
auto_checkpoint = {"model": tabpfn_auto, "imputer": imputer}
joblib.dump(auto_checkpoint, "./output/enhanced/models/tabpfn_auto.pkl")
print(f"\n模型已保存: ./output/enhanced/models/tabpfn_auto.pkl")

训练Post-hoc集成TabPFN模型
PyTorch CUDA 可用: True
GPU 设备: NVIDIA GeForce RTX 3080 Laptop GPU
CUDA 设备数量: 1
Ray 版本: 2.6.0
模型初始化成功
使用设备: cuda:0
训练样本数: 13736 (已启用ignore_pretraining_limits)
开始自动集成训练...
(这可能需要较长时间，正在训练多个模型配置...)


KeyboardInterrupt: 

## 13. TabPFN嵌入提取与分析（Embedding）

提取TabPFN的内部嵌入表示，在嵌入上训练逻辑回归分类器，并用PCA对嵌入空间进行可视化

In [None]:
print("=" * 70)
print("TabPFN嵌入提取分析")
print("=" * 70)

print("提取TabPFN内部嵌入表示...\n")

# Reuse the already-fitted basic TabPFN model as the embedding extractor.
# NOTE(review): n_fold=0 presumably disables K-fold extraction, meaning the
# training embeddings come from a model that has seen those rows' labels —
# confirm against the tabpfn-extensions documentation.
embedding_extractor = TabPFNEmbedding(tabpfn_clf=tabpfn_basic, n_fold=0)

# Embeddings of the training rows
print("提取训练集嵌入...")
train_embeddings = embedding_extractor.get_embeddings(
    X_train_sampled, y_train_sampled, X_train_sampled, data_source="train"
)

# Embeddings of the validation rows
print("提取验证集嵌入...")
val_embeddings = embedding_extractor.get_embeddings(
    X_train_sampled, y_train_sampled, X_validation_processed, data_source="test"
)

# Only element 0 of each result is used below.
# NOTE(review): presumably the first axis enumerates ensemble members — confirm.
print(f"嵌入维度: {train_embeddings[0].shape}")

# Train a logistic regression on top of the embeddings
from sklearn.linear_model import LogisticRegression

embedding_model = LogisticRegression(max_iter=1000, random_state=RANDOM_SEED)
embedding_model.fit(train_embeddings[0], y_train_sampled)

y_pred_embedding = embedding_model.predict(val_embeddings[0])
y_pred_proba_embedding = embedding_model.predict_proba(val_embeddings[0])[:, 1]

auc_embedding = roc_auc_score(y_validation_binary, y_pred_proba_embedding)
f1_embedding = f1_score(y_validation_binary, y_pred_embedding)
acc_embedding = accuracy_score(y_validation_binary, y_pred_embedding)

print(f"\n基于TabPFN嵌入的逻辑回归性能:")
print(f"  AUC: {auc_embedding:.4f}")
print(f"  F1:  {f1_embedding:.4f}")
print(f"  ACC: {acc_embedding:.4f}")

# Visualise the embedding space in 2-D with PCA
from sklearn.decomposition import PCA

# Seed PCA like every other estimator in this notebook: when the randomized
# SVD solver is selected, an unseeded PCA yields a different projection (and
# figure) on every run.
pca = PCA(n_components=2, random_state=RANDOM_SEED)
val_embeddings_2d = pca.fit_transform(val_embeddings[0])

plt.figure(figsize=(10, 8))
scatter = plt.scatter(
    val_embeddings_2d[:, 0],
    val_embeddings_2d[:, 1],
    c=y_validation_binary,
    cmap="coolwarm",
    alpha=0.6,
    edgecolors="k",
    linewidth=0.5,
)
plt.colorbar(scatter, label="标签 (0=正常, 1=早熟)")
plt.xlabel(f"PC1 ({pca.explained_variance_ratio_[0]:.1%} 方差)", fontsize=12)
plt.ylabel(f"PC2 ({pca.explained_variance_ratio_[1]:.1%} 方差)", fontsize=12)
plt.title("TabPFN嵌入空间可视化 (PCA降维)", fontsize=14, fontweight="bold")
plt.grid(alpha=0.3)
plt.tight_layout()
plt.savefig("./output/enhanced/TabPFN嵌入可视化.png", dpi=300, bbox_inches="tight")
plt.savefig("./output/enhanced/TabPFN嵌入可视化.pdf", dpi=300, bbox_inches="tight")
plt.show()

print(f"\n嵌入空间可视化已保存")

# Save the embeddings for reuse outside the notebook
np.save("./output/enhanced/train_embeddings.npy", train_embeddings[0])
np.save("./output/enhanced/val_embeddings.npy", val_embeddings[0])
print("嵌入数据已保存")

### *导入模型文件

In [None]:
print("=" * 70)
print("导入已训练的模型并计算性能")
print("=" * 70)

import os
import joblib

# Checkpoint written by each training cell, keyed by display name.
model_files = {
    "基础TabPFN": "./output/enhanced/models/tabpfn_basic.pkl",
    "HPO优化TabPFN": "./output/enhanced/models/tabpfn_hpo.pkl",
    "Post-hoc集成TabPFN": "./output/enhanced/models/tabpfn_auto.pkl",
}

# SECURITY NOTE: joblib.load unpickles arbitrary Python objects. Only load
# checkpoint files that this notebook produced itself.
loaded_models = {}
for model_name, file_path in model_files.items():
    if os.path.exists(file_path):
        loaded_models[model_name] = joblib.load(file_path)

print(f"成功导入 {len(loaded_models)} 个模型")

# Report checkpoints that are absent (e.g. a training run that was interrupted
# before saving) instead of failing later with a KeyError.
missing_models = [name for name in model_files if name not in loaded_models]
if missing_models:
    print(f"以下模型文件不存在，已跳过: {', '.join(missing_models)}")


def _evaluate_on_validation(model):
    """Score one fitted classifier on the validation split.

    Returns a tuple ``(y_pred, y_pred_proba, auc, f1, acc)`` where
    ``y_pred_proba`` is the predicted probability of the positive class.
    """
    y_pred = model.predict(X_validation_processed)
    y_pred_proba = model.predict_proba(X_validation_processed)[:, 1]
    return (
        y_pred,
        y_pred_proba,
        roc_auc_score(y_validation_binary, y_pred_proba),
        f1_score(y_validation_binary, y_pred),
        accuracy_score(y_validation_binary, y_pred),
    )


# Evaluate whichever models were loaded. Variables for a missing model are
# left untouched, so results from an earlier training cell are not overwritten.
if loaded_models:
    has_basic = "基础TabPFN" in loaded_models
    has_hpo = "HPO优化TabPFN" in loaded_models
    has_auto = "Post-hoc集成TabPFN" in loaded_models

    # Every checkpoint stores the same fitted imputer next to its model;
    # prefer the copy saved with the basic model when it is available.
    imputer_source = (
        loaded_models["基础TabPFN"]
        if has_basic
        else next(iter(loaded_models.values()))
    )
    imputer = imputer_source["imputer"]

    # Basic TabPFN
    if has_basic:
        tabpfn_basic = loaded_models["基础TabPFN"]["model"]
        (
            y_pred_basic,
            y_pred_proba_basic,
            auc_basic,
            f1_basic,
            acc_basic,
        ) = _evaluate_on_validation(tabpfn_basic)

    # HPO-tuned TabPFN
    if has_hpo:
        tabpfn_hpo = loaded_models["HPO优化TabPFN"]["model"]
        (
            y_pred_hpo,
            y_pred_proba_hpo,
            auc_hpo,
            f1_hpo,
            acc_hpo,
        ) = _evaluate_on_validation(tabpfn_hpo)

    # Post-hoc ensemble TabPFN
    if has_auto:
        tabpfn_auto = loaded_models["Post-hoc集成TabPFN"]["model"]
        (
            y_pred_auto,
            y_pred_proba_auto,
            auc_auto,
            f1_auto,
            acc_auto,
        ) = _evaluate_on_validation(tabpfn_auto)

    # Side-by-side comparison; the F1 delta against the basic model is shown
    # only when the basic model itself was loaded.
    print("\n模型性能对比：")
    if has_basic:
        print(
            f"  基础TabPFN:      F1={f1_basic:.4f}  AUC={auc_basic:.4f}  ACC={acc_basic:.4f}"
        )
    if has_hpo:
        hpo_delta = f"  ({f1_hpo-f1_basic:+.4f})" if has_basic else ""
        print(
            f"  HPO优化:         F1={f1_hpo:.4f}  AUC={auc_hpo:.4f}  ACC={acc_hpo:.4f}{hpo_delta}"
        )
    if has_auto:
        auto_delta = f"  ({f1_auto-f1_basic:+.4f})" if has_basic else ""
        print(
            f"  Post-hoc集成:    F1={f1_auto:.4f}  AUC={auc_auto:.4f}  ACC={acc_auto:.4f}{auto_delta}"
        )

    print("\n性能计算完成，变量已就绪")
else:
    print("\n未导入任何模型")

## 14. 性能对比

In [None]:
# Build the comparison table: one (name, AUC, F1, accuracy) row per model.
models_list = [
    ("基础TabPFN", auc_basic, f1_basic, acc_basic),
    ("HPO优化TabPFN", auc_hpo, f1_hpo, acc_hpo),
    ("Post-hoc集成TabPFN", auc_auto, f1_auto, acc_auto),
]

# Rank by F1, best first.
performance_summary = pd.DataFrame(
    models_list, columns=["模型", "AUC", "F1", "准确率"]
).sort_values("F1", ascending=False)

separator = "=" * 70
print("\n" + separator)
print("TabPFN模型性能对比（以F1分数为准）")
print(separator)
print(performance_summary.to_string(index=False))
print(separator)

# The top row after sorting is the best model by F1.
top_row = performance_summary.iloc[0]
best_model_name = top_row["模型"]
best_f1 = top_row["F1"]
best_auc = top_row["AUC"]
# Absolute F1 gain over the basic model, in percentage points.
f1_improvement = (best_f1 - f1_basic) * 100

print(f"\n最佳模型: {best_model_name}")
print(f"   F1:  {best_f1:.4f} (相比基础模型: {f1_improvement:+.2f}%)")
print(f"   AUC: {best_auc:.4f}")
print(f"   ACC: {performance_summary.iloc[0]['准确率']:.4f}")

# Save the table; utf-8-sig writes a BOM in front of the Chinese headers.
performance_summary.to_csv(
    "./output/enhanced/性能对比.csv", index=False, encoding="utf-8-sig"
)
print(f"\n性能对比已保存: ./output/enhanced/性能对比.csv")

## 15. ROC曲线

In [None]:
plt.figure(figsize=(12, 9))

# One ROC curve per model, all scored against the same validation labels.
fpr_basic, tpr_basic, _ = roc_curve(y_validation_binary, y_pred_proba_basic)
fpr_hpo, tpr_hpo, _ = roc_curve(y_validation_binary, y_pred_proba_hpo)
fpr_auto, tpr_auto, _ = roc_curve(y_validation_binary, y_pred_proba_auto)

# (fpr, tpr, legend label, colour, line style) per curve, in drawing order.
roc_series = [
    (fpr_basic, tpr_basic, f"基础TabPFN (AUC = {auc_basic:.4f})", "#2E86AB", "-"),
    (fpr_hpo, tpr_hpo, f"HPO优化TabPFN (AUC = {auc_hpo:.4f})", "#A23B72", "--"),
    (
        fpr_auto,
        tpr_auto,
        f"Post-hoc集成TabPFN (AUC = {auc_auto:.4f})",
        "#F18F01",
        "-.",
    ),
]
for fpr, tpr, legend_label, line_color, line_style in roc_series:
    plt.plot(
        fpr,
        tpr,
        label=legend_label,
        linewidth=2,
        color=line_color,
        linestyle=line_style,
    )

# Diagonal = random-guess baseline
plt.plot([0, 1], [0, 1], "k--", linewidth=1, label="随机猜测", alpha=0.5)

plt.xlabel("假阳性率 (1-特异度)", fontsize=13)
plt.ylabel("真阳性率 (灵敏度)", fontsize=13)
plt.title("TabPFN模型ROC曲线对比", fontsize=15, fontweight="bold")
plt.legend(loc="lower right", fontsize=10)
plt.grid(alpha=0.3)
plt.tight_layout()

for roc_path in (
    "./output/enhanced/ROC曲线对比.pdf",
    "./output/enhanced/ROC曲线对比.png",
):
    plt.savefig(roc_path, dpi=300, bbox_inches="tight")
plt.show()

print("ROC曲线对比已保存")

## 16. 详细性能指标对比

In [None]:
# Hard predictions and positive-class probabilities for every model.
models_results = {
    "基础TabPFN": (y_pred_basic, y_pred_proba_basic),
    "HPO优化TabPFN": (y_pred_hpo, y_pred_proba_hpo),
    "Post-hoc集成TabPFN": (y_pred_auto, y_pred_proba_auto),
}

detailed_metrics = []
rule = "=" * 70

for model_name, (y_pred, y_pred_proba) in models_results.items():
    # Binary confusion matrix laid out as [[TN, FP], [FN, TP]].
    cm = confusion_matrix(y_validation_binary, y_pred)
    tn, fp, fn, tp = cm.ravel()

    accuracy = accuracy_score(y_validation_binary, y_pred)
    precision = precision_score(y_validation_binary, y_pred)
    recall = recall_score(y_validation_binary, y_pred)
    f1 = f1_score(y_validation_binary, y_pred)
    auc = roc_auc_score(y_validation_binary, y_pred_proba)
    # Specificity = true-negative rate.
    specificity = tn / (tn + fp)

    detailed_metrics.append(
        {
            "模型": model_name,
            "AUC": auc,
            "准确率": accuracy,
            "精确率": precision,
            "召回率": recall,
            "特异度": specificity,
            "F1分数": f1,
        }
    )

    print(f"\n{rule}")
    print(f"{model_name} 详细指标")
    print(f"{rule}")
    print(f"混淆矩阵:")
    print(cm)
    print(f"TN={tn}, FP={fp}, FN={fn}, TP={tp}")
    print(f"\n准确率: {accuracy:.4f}")
    print(f"精确率: {precision:.4f}")
    print(f"召回率: {recall:.4f}")
    print(f"特异度: {specificity:.4f}")
    print(f"F1分数: {f1:.4f}")
    print(f"AUC:    {auc:.4f}")

# Persist the per-model metrics, best F1 first.
detailed_df = pd.DataFrame(detailed_metrics).sort_values("F1分数", ascending=False)
detailed_df.to_csv(
    "./output/enhanced/详细性能指标.csv", index=False, encoding="utf-8-sig"
)

print(f"\n\n{rule}")
print("所有模型详细指标汇总")
print(f"{rule}")
print(detailed_df.to_string(index=False))
print(f"\n详细性能指标已保存: ./output/enhanced/详细性能指标.csv")

## 17. 预测概率分布对比

In [None]:
# One row per model: histogram on the left, box plot on the right.
fig, axes = plt.subplots(3, 2, figsize=(14, 12))

models_plot = [
    ("基础TabPFN", y_pred_proba_basic),
    ("HPO优化TabPFN", y_pred_proba_hpo),
    ("Post-hoc集成TabPFN", y_pred_proba_auto),
]

class_names = {0: "正常(N)", 1: "早熟(Y)"}
# True-label names for the box plots; identical for every model, so built once.
true_label_names = [
    "正常(N)" if y == 0 else "早熟(Y)" for y in y_validation_binary
]

for (ax1, ax2), (model_name, y_pred_proba) in zip(axes, models_plot):
    # Histogram of predicted probabilities, split by true class
    for label, name in class_names.items():
        mask = y_validation_binary == label
        ax1.hist(y_pred_proba[mask], bins=30, alpha=0.6, label=name)
    ax1.set_xlabel("预测概率", fontsize=11)
    ax1.set_ylabel("样本数量", fontsize=11)
    ax1.set_title(f"{model_name} - 预测概率分布", fontsize=12, fontweight="bold")
    ax1.legend()
    ax1.grid(alpha=0.3)

    # Box plot of predicted probabilities per true class
    data_plot = pd.DataFrame({"概率": y_pred_proba, "真实标签": true_label_names})
    sns.boxplot(data=data_plot, x="真实标签", y="概率", ax=ax2)
    ax2.set_title(f"{model_name} - 预测概率箱线图", fontsize=12, fontweight="bold")
    ax2.grid(alpha=0.3, axis="y")

plt.tight_layout()
for distribution_path in (
    "./output/enhanced/预测概率分布对比.pdf",
    "./output/enhanced/预测概率分布对比.png",
):
    plt.savefig(distribution_path, dpi=300, bbox_inches="tight")
plt.show()

print("预测概率分布对比图已保存")

In [None]:
# Collect the validation-set predictions of every model, one column each.
predictions_dict = {
    "真实标签": y_validation_binary,
    "基础TabPFN_预测": y_pred_basic,
    "基础TabPFN_概率": y_pred_proba_basic,
    "HPO优化TabPFN_预测": y_pred_hpo,
    "HPO优化TabPFN_概率": y_pred_proba_hpo,
    "Post-hoc集成TabPFN_预测": y_pred_auto,
    "Post-hoc集成TabPFN_概率": y_pred_proba_auto,
}

# Add the embedding-model results only when the embedding cell has been run.
# Looking the name up in globals() avoids the NameError that a bare
# `auc_embedding is not None` raises when that optional cell was skipped.
if globals().get("auc_embedding") is not None:
    predictions_dict["TabPFN嵌入_预测"] = y_pred_embedding
    predictions_dict["TabPFN嵌入_概率"] = y_pred_proba_embedding

predictions_df = pd.DataFrame(predictions_dict)

predictions_df.to_csv(
    "./output/enhanced/验证集预测结果_完整版.csv", index=False, encoding="utf-8-sig"
)
print(f"预测结果已保存: ./output/enhanced/验证集预测结果_完整版.csv")
print(f"   验证集共 {len(predictions_df)} 样本")

In [None]:
# Save the predictions of the three TabPFN models (no embedding columns).
# Note: this rebinds predictions_df.
prediction_columns = {
    "真实标签": y_validation_binary,
    "基础TabPFN_预测": y_pred_basic,
    "基础TabPFN_概率": y_pred_proba_basic,
    "HPO优化TabPFN_预测": y_pred_hpo,
    "HPO优化TabPFN_概率": y_pred_proba_hpo,
    "Post-hoc集成TabPFN_预测": y_pred_auto,
    "Post-hoc集成TabPFN_概率": y_pred_proba_auto,
}
predictions_df = pd.DataFrame(prediction_columns)

predictions_df.to_csv(
    "./output/enhanced/验证集预测结果.csv", index=False, encoding="utf-8-sig"
)
print(f"预测结果已保存: ./output/enhanced/验证集预测结果.csv")
print(f"   验证集共 {len(predictions_df)} 样本")

## 18. 性能提升总结

In [None]:
print("\n" + "=" * 70)
print("性能提升总结")
print("=" * 70)

# Absolute gain of each enhanced model over the basic model, expressed in
# percentage points (metric difference x 100).
improvement_rows = [
    (
        "HPO优化TabPFN",
        (auc_hpo - auc_basic) * 100,
        (f1_hpo - f1_basic) * 100,
        (acc_hpo - acc_basic) * 100,
    ),
    (
        "Post-hoc集成TabPFN",
        (auc_auto - auc_basic) * 100,
        (f1_auto - f1_basic) * 100,
        (acc_auto - acc_basic) * 100,
    ),
]
improvements = pd.DataFrame(
    improvement_rows,
    columns=["模型", "AUC提升(%)", "F1提升(%)", "准确率提升(%)"],
)

print(improvements.to_string(index=False))

print(f"\n基础模型:")
print(f"  AUC: {auc_basic:.4f}")
print(f"  F1:  {f1_basic:.4f}")
print(f"  ACC: {acc_basic:.4f}")

# best_model_name / best_auc / performance_summary come from the performance
# comparison cell.
print(f"\n最佳模型: {best_model_name}")
print(f"  AUC: {best_auc:.4f} ({(best_auc - auc_basic)*100:+.2f}%)")
print(
    f"  F1:  {performance_summary.iloc[0]['F1']:.4f} ({(performance_summary.iloc[0]['F1'] - f1_basic)*100:+.2f}%)"
)
print(
    f"  ACC: {performance_summary.iloc[0]['准确率']:.4f} ({(performance_summary.iloc[0]['准确率'] - acc_basic)*100:+.2f}%)"
)

print("\n" + "=" * 70)

## 19. SHAP可解释性分析

In [None]:
import re

import shap

print("=" * 70)
print("SHAP可解释性分析")
print("=" * 70)

# The HPO-tuned model is the one explained here.
best_model = tabpfn_hpo
model_name = "HPO优化TabPFN"
best_f1_score = f1_hpo

print(f"使用模型: {model_name}")
print(f"F1分数: {best_f1_score:.4f}")
print(f"计算SHAP值...\n")

# Explain only the first few validation rows (at most 10).
n_samples_shap = min(10, len(X_validation_processed))
X_shap = X_validation_processed[:n_samples_shap]

# Compute SHAP values with the permutation algorithm
shap_values = interpretability.shap.get_shap_values(
    estimator=best_model,
    test_x=X_shap,
    attribute_names=feature_cols_processed,
    algorithm="permutation",
)

# Keep the positive-class slice: (n_samples, n_features)
shap_array = shap_values.values[:, :, 1]
print(f"SHAP值形状: {shap_array.shape}\n")

# Global importance = mean absolute SHAP value per feature
mean_shap = np.abs(shap_array).mean(axis=0)
feature_importance = pd.DataFrame(
    {"特征": feature_cols_processed, "重要性": mean_shap}
).sort_values("重要性", ascending=False)

print("=" * 70)
print("生成SHAP可视化图表")
print("=" * 70)

# 1. Beeswarm summary plot: distribution of SHAP values over all samples
print("\n1. 生成 Beeswarm Summary Plot...")

plt.figure(figsize=(10, 8))
shap.summary_plot(
    shap_array, X_shap, feature_names=feature_cols_processed, show=False, max_display=20
)
plt.tight_layout()
plt.savefig("./output/enhanced/SHAP_summary_beeswarm.png", dpi=300, bbox_inches="tight")
plt.savefig("./output/enhanced/SHAP_summary_beeswarm.pdf", dpi=300, bbox_inches="tight")
plt.show()
print(" 已保存: SHAP_summary_beeswarm.png")

# 2. Bar plot: aggregate feature importance
print("\n2. 生成 Aggregate Feature Importance...")
plt.figure(figsize=(10, 8))
shap.summary_plot(
    shap_array,
    X_shap,
    feature_names=feature_cols_processed,
    plot_type="bar",
    show=False,
    max_display=20,
)
plt.tight_layout()
plt.savefig("./output/enhanced/SHAP_bar_importance.png", dpi=300, bbox_inches="tight")
plt.savefig("./output/enhanced/SHAP_bar_importance.pdf", dpi=300, bbox_inches="tight")
plt.show()
print(" 已保存: SHAP_bar_importance.png")

# 3. Dependence plot for the most important feature
print("\n3. 生成 Dependence Plot（最重要特征）...")
# sort_values keeps the original row labels, so the first index label is the
# feature's column position in X_shap / feature_cols_processed.
top_feature_idx = int(feature_importance.index[0])
top_feature_name = feature_importance.iloc[0]["特征"]
# Feature names can contain characters that are illegal in file names (for
# example "/" in a ratio feature); replace them before building the paths.
safe_feature_name = re.sub(r'[\\/:*?"<>|\s]+', "_", str(top_feature_name))
dependence_stem = f"./output/enhanced/SHAP_dependence_{safe_feature_name}"

plt.figure(figsize=(10, 6))
shap.dependence_plot(
    top_feature_idx,
    shap_array,
    X_shap,
    feature_names=feature_cols_processed,
    show=False,
)
plt.title(f"SHAP Dependence Plot - {top_feature_name}", fontsize=14, fontweight="bold")
plt.tight_layout()
plt.savefig(f"{dependence_stem}.png", dpi=300, bbox_inches="tight")
plt.savefig(f"{dependence_stem}.pdf", dpi=300, bbox_inches="tight")
plt.show()
print(f"已保存: SHAP_dependence_{safe_feature_name}.png")

# Print the ten most important features
print("\n" + "=" * 70)
print("前10个最重要特征")
print("=" * 70)
print(feature_importance.head(10).to_string(index=False))

print(f"\n" + "=" * 70)
print("SHAP分析完成")
print("=" * 70)
print("生成的图表：")
print("   1. SHAP_summary_beeswarm.png - 特征影响分布图")
print("   2. SHAP_bar_importance.png - 特征重要性条形图")
print(f"   3. SHAP_dependence_{safe_feature_name}.png - 最重要特征依赖图")

## 20. PDP部分依赖图分析（新功能）

使用tabpfn-extensions 0.2.2新增的partial_dependence_plots功能

In [None]:
print("=" * 70)
print("PDP部分依赖图分析 (tabpfn-extensions 0.2.2新功能)")
print("=" * 70)

from tabpfn_extensions.interpretability import partial_dependence_plots

# Take the six features ranked highest by the SHAP importance table and map
# each name back to its column position.
top_features = feature_importance["特征"].head(6).tolist()
top_feature_indices = list(map(feature_cols_processed.index, top_features))

print(f"分析前6个最重要特征: {top_features}")

# Evaluate the PDPs on the first validation rows only (at most 500).
n_pdp_samples = min(500, len(X_validation_processed))
X_pdp = X_validation_processed[:n_pdp_samples]

# 2 x 3 grid, one panel per feature
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.flatten()

for ax, feature_idx, feature_name in zip(axes, top_feature_indices, top_features):
    print(f"  生成 {feature_name} 的PDP图...")

    try:
        disp = partial_dependence_plots(
            estimator=tabpfn_hpo,
            X=X_pdp,
            features=[feature_idx],
            kind="average",
            target_class=1,  # positive class (precocious puberty)
            ax=ax,
            grid_resolution=30,
        )
        ax.set_title(f"PDP: {feature_name}", fontsize=11, fontweight="bold")
        ax.set_xlabel(feature_name, fontsize=10)
        ax.set_ylabel("部分依赖", fontsize=10)
        ax.grid(alpha=0.3)
    except Exception as e:
        # Best effort: show the (truncated) error in the panel and carry on
        # with the remaining features.
        ax.text(0.5, 0.5, f"Error: {str(e)[:30]}...", ha="center", va="center")
        ax.set_title(f"PDP: {feature_name} (错误)", fontsize=11)

plt.suptitle(
    "部分依赖图 - 特征对预测概率的边际影响", fontsize=14, fontweight="bold", y=1.02
)
plt.tight_layout()
for pdp_path in (
    "./output/enhanced/PDP_top_features.png",
    "./output/enhanced/PDP_top_features.pdf",
):
    plt.savefig(pdp_path, dpi=300, bbox_inches="tight")
plt.show()

print("\nPDP分析完成！")
print("已保存: ./output/enhanced/PDP_top_features.png")

## 21. ShapIQ特征交互分析

使用tabpfn-extensions的ShapIQ功能分析特征间的交互效应

In [None]:
print("=" * 70)
print("ShapIQ特征交互分析 (tabpfn-extensions 0.2.2新功能)")
print("=" * 70)

from tabpfn_extensions.interpretability import get_tabpfn_explainer

# ShapIQ is expensive, so only small slices are used: up to 100 training rows
# as background data and up to 5 validation rows as explanation candidates.
n_shapiq_samples = min(100, len(X_train_sampled))
n_explain_samples = min(5, len(X_validation_processed))

X_background, y_background = (
    X_train_sampled[:n_shapiq_samples],
    y_train_sampled[:n_shapiq_samples],
)
X_explain = X_validation_processed[:n_explain_samples]

print(f"背景数据: {n_shapiq_samples} 样本")
print(f"解释样本: {n_explain_samples} 个")

try:
    # Build the TabPFN explainer
    print("\n创建ShapIQ解释器...")
    explainer_options = {
        "index": "k-SII",  # k-Shapley interaction index
        "max_order": 2,  # include second-order interactions
        "class_index": 1,  # explain the positive class (precocious puberty)
    }
    explainer = get_tabpfn_explainer(
        model=tabpfn_hpo,
        data=X_background,
        labels=y_background,
        **explainer_options,
    )

    print("计算Shapley交互值...")
    # Only the first explanation candidate is actually explained.
    interaction_values = explainer.explain(X_explain[:1])

    print(f"\n交互值形状: {interaction_values.values.shape}")

    # Draw the interaction network with shapiq's own plotting helper
    print("\n生成交互效应图...")
    fig, ax = plt.subplots(figsize=(12, 10))

    interaction_values.plot_network(
        feature_names=feature_cols_processed,
        ax=ax,
    )

    plt.title("ShapIQ特征交互网络图", fontsize=14, fontweight="bold")
    plt.tight_layout()
    for network_path in (
        "./output/enhanced/ShapIQ_interaction_network.png",
        "./output/enhanced/ShapIQ_interaction_network.pdf",
    ):
        plt.savefig(network_path, dpi=300, bbox_inches="tight")
    plt.show()

    print("\nShapIQ分析完成！")
    print("已保存: ./output/enhanced/ShapIQ_interaction_network.png")

except ImportError as e:
    # shapiq is an optional dependency: tell the user how to install it.
    print(f"\n注意: ShapIQ需要安装shapiq库")
    print(f"请运行: pip install shapiq")
    print(f"错误详情: {e}")

except Exception as e:
    # Deliberate best effort: this optional analysis must not abort the
    # notebook, so the error is reported and the cell ends normally.
    print(f"\nShapIQ分析遇到问题: {e}")
    print("这可能是由于shapiq库版本兼容性问题，可以跳过此分析")