# 性早熟预测模型

**基线数据 + 动态特征结合**

## 1. 导入必要的库

In [None]:
# Set the environment variable (this must happen before sklearn is imported).
import os

os.environ["SCIPY_ARRAY_API"] = "1"

import pandas as pd
import numpy as np
import warnings

# Silence every warning for the rest of the session.
# NOTE(review): this also hides any convergence/deprecation warnings emitted
# by the models trained later in the notebook.
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import (
    roc_auc_score,
    roc_curve,
    f1_score,
)
from sklearn.ensemble import HistGradientBoostingClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import roc_curve
import xgboost as xgb
from tabpfn import TabPFNClassifier
import tabm
import torch
import torch.nn as nn

import matplotlib.pyplot as plt
import graphviz


# Use the SimHei font as the sans-serif family; the plots in this notebook use
# Chinese axis labels and titles.
plt.rcParams["font.sans-serif"] = ["SimHei"]
# Render the minus sign as an ASCII hyphen rather than the Unicode minus glyph
# (presumably because the chosen font lacks that glyph — TODO confirm).
plt.rcParams["axes.unicode_minus"] = False

import re
import joblib


# Factory for the MissForest-style imputer
def create_missforest_imputer(random_state=825):
    """Build an unfitted IterativeImputer driven by a random forest regressor.

    Args:
        random_state: Seed handed to both the forest estimator and the imputer.

    Returns:
        An ``IterativeImputer`` (10 iterations, silent) whose per-feature
        estimator is a 10-tree ``RandomForestRegressor`` with depth limit 10.
    """
    forest = RandomForestRegressor(
        n_estimators=10,
        max_depth=10,
        n_jobs=-1,
        random_state=random_state,
    )
    imputer_options = {"max_iter": 10, "random_state": random_state, "verbose": 0}
    return IterativeImputer(estimator=forest, **imputer_options)


# When CUDA is usable, enable the cuDNN autotuner and TF32 kernels, then report
# the device name and its total memory in GiB.
if torch.cuda.is_available():
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True
    torch.backends.cudnn.benchmark = True
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    _total_gib = torch.cuda.get_device_properties(0).total_memory / 1024**3
    print(f"显存: {_total_gib:.1f} GB")

print("使用MissForest方法进行缺失值填补")

## 2. 设置路径和参数

In [None]:
# Make sure the output folders exist before anything is written.
# NOTE(review): "./output" is created here, but every artifact path visible in
# this notebook points under "../output" — confirm which one is intended.
for _out_dir in ("./output", "../output/models"):
    os.makedirs(_out_dir, exist_ok=True)

# Global seed and parallelism setting reused by the cells that follow.
RANDOM_SEED = 825
N_JOBS = -1
np.random.seed(RANDOM_SEED)

## 3. 读取数据

In [None]:
# Load the two cohorts and tag each row with its group label
# ("N" for the normal file, "Y" for the confirmed precocious-puberty file).
normal_data = pd.read_csv("../input/性早熟数据激发试验正常组_new.csv").assign(
    group="N"
)
disease_data = pd.read_csv("../input/激发试验确诊性早熟组数据_new.csv").assign(
    group="Y"
)

print(f"正常组: {len(normal_data)} 行, 早熟组: {len(disease_data)} 行")

## 4. 数据类型处理和合并

In [None]:
# Stack both cohorts row-wise with a fresh index and store the label column as
# a categorical.
data = pd.concat((normal_data, disease_data), ignore_index=True).astype(
    {"group": "category"}
)
print(f"合并后数据: {data.shape[0]} 行 x {data.shape[1]} 列")

## 5. 查看数据基本信息

In [None]:
# Print shape, class balance, dtype counts and a missing-value summary.
print("数据基本信息:")
print(f"数据维度: {data.shape}")
print("分组统计:")
print(data["group"].value_counts())
print("数据类型:")
print(data.dtypes.value_counts())
print("缺失值统计:")
_nulls_per_col = data.isna().sum()
missing_count = _nulls_per_col.sum()
print(f"总缺失值数量: {missing_count}")
if missing_count > 0:
    # Keep only the columns that actually have gaps, worst first.
    missing_by_col = _nulls_per_col[_nulls_per_col > 0].sort_values(ascending=False)
    print("各列缺失值:")
    print(missing_by_col.head(10))

## 6. 划分训练集和验证集

In [None]:
# 70/30 split, stratified on the group label so both parts keep the class ratio.
train_data, validation_data = train_test_split(
    data,
    stratify=data["group"],
    test_size=0.3,
    random_state=RANDOM_SEED,
)

print(f"训练集: {len(train_data)} 行, 验证集: {len(validation_data)} 行")

## 7. 特征工程

In [None]:
# Every column except the label and the two identifier-like columns is a feature.
exclude_cols = ["group", "患者编号", "Unnamed: 0"]
feature_cols = train_data.columns[~train_data.columns.isin(exclude_cols)].tolist()

# Independent copies of the feature frames and label series.
X_train = train_data.loc[:, feature_cols].copy()
X_validation = validation_data.loc[:, feature_cols].copy()
y_train = train_data["group"].copy()
y_validation = validation_data["group"].copy()

# Binary targets: 1 for group "Y", 0 otherwise.
y_train_binary = y_train.eq("Y").astype(int)
y_validation_binary = y_validation.eq("Y").astype(int)

print(f"使用 {len(feature_cols)} 个特征")

## 8. 特征概览和缺失值分析/补缺（MissForest方法）

In [None]:
# ===== 8. Feature overview =====
print(f"特征列表 ({len(feature_cols)}个特征):")
for position, feature_name in enumerate(feature_cols, start=1):
    print(f"  {position:2d}. {feature_name}")

print(f"\n训练集: {X_train.shape[0]}样本, 验证集: {X_validation.shape[0]}样本")
print(f"训练集正负样本: {y_train_binary.value_counts().to_dict()}")
print(f"验证集正负样本: {y_validation_binary.value_counts().to_dict()}")

# ===== Missing-value analysis (training set only) =====
missing_train = X_train.isna().sum()
missing_pct = missing_train.div(len(X_train)).mul(100).round(2)
missing_info = pd.DataFrame({"缺失数量": missing_train, "缺失率(%)": missing_pct})
# Keep only features that have gaps, ordered by missing rate (highest first).
missing_info = missing_info[missing_info["缺失数量"] > 0].sort_values(
    "缺失率(%)", ascending=False
)

if len(missing_info) > 0:
    print("\n缺失值分析 (Top 10):")
    print(missing_info.head(10).to_string())

# ===== Declare which features are categorical and which are numerical =====
print("\n" + "=" * 60)
print("缺失值填补（分类特征+数值特征分别处理）")
print("=" * 60)

# Categorical features mapped to their (min, max) valid codes.
categorical_info = {
    "Tanner分期": (1, 5),  # 1-5
    "乳晕色素沉着": (0, 2),  # 0, 1, 2
    "乳核": (0, 1),  # 0, 1
    "有无阴毛": (0, 1),  # 0, 1
    "有无腋毛": (0, 1),  # 0, 1
}

# Only categorical features that are actually present are used; everything
# else in feature_cols is treated as numerical.
categorical_cols = [name for name in categorical_info if name in feature_cols]
numerical_cols = [name for name in feature_cols if name not in categorical_cols]

print(f"分类特征 ({len(categorical_cols)}个): {categorical_cols}")
print(f"数值特征 ({len(numerical_cols)}个)")

# ===== Categorical features: IterativeImputer + RandomForestClassifier =====
from sklearn.ensemble import RandomForestClassifier

# NOTE(review): initial_strategy is left at the IterativeImputer default —
# confirm that is appropriate for integer category codes.
cat_imputer = IterativeImputer(
    estimator=RandomForestClassifier(
        random_state=RANDOM_SEED,
        n_estimators=10,
        max_depth=10,
        n_jobs=-1,
    ),
    random_state=RANDOM_SEED,
    max_iter=10,
    verbose=0,
)

if categorical_cols:
    print("使用 IterativeImputer + RandomForestClassifier 填补分类特征...")
    # Fit on the training split only, then apply the fitted imputer to validation.
    X_train_cat = cat_imputer.fit_transform(X_train[categorical_cols])
    X_validation_cat = cat_imputer.transform(X_validation[categorical_cols])

    # Clamp every imputed column into its declared valid range, in place.
    for col_idx, col_name in enumerate(categorical_cols):
        low, high = categorical_info[col_name]
        for imputed in (X_train_cat, X_validation_cat):
            np.clip(imputed[:, col_idx], low, high, out=imputed[:, col_idx])

    print("分类特征填补完成")

# ===== Numerical features: MissForest-style imputation =====
imputer = create_missforest_imputer(RANDOM_SEED)
print("使用MissForest方法填补数值特征...")

# Fit on the training split only, then apply the fitted imputer to validation.
X_train_num = imputer.fit_transform(X_train[numerical_cols])
X_validation_num = imputer.transform(X_validation[numerical_cols])

# ===== Merge: categorical columns first, numerical columns after =====
if not categorical_cols:
    feature_cols_processed = numerical_cols
    X_train_arr, X_validation_arr = X_train_num, X_validation_num
else:
    feature_cols_processed = categorical_cols + numerical_cols
    X_train_arr = np.concatenate([X_train_cat, X_train_num], axis=1)
    X_validation_arr = np.concatenate([X_validation_cat, X_validation_num], axis=1)

# Wrap the arrays in DataFrames so column names and the original row index survive.
X_train_imputed = pd.DataFrame(
    data=X_train_arr,
    index=X_train.index,
    columns=feature_cols_processed,
)
X_validation_imputed = pd.DataFrame(
    data=X_validation_arr,
    index=X_validation.index,
    columns=feature_cols_processed,
)

# Standardize (used by the KNN / SVM / NNET models); statistics come from the
# training split only.
scaler = StandardScaler().fit(X_train_imputed)
X_train_scaled = scaler.transform(X_train_imputed)
X_validation_scaled = scaler.transform(X_validation_imputed)

print("\n填补完成！")
print(f"  处理后特征数: {X_train_imputed.shape[1]}")
print(f"  缺失值检查: {X_train_imputed.isnull().sum().sum()} (应为0)")

---
# 模型训练

## 10. 设置交叉验证策略

In [None]:
# Repeated stratified k-fold: 5 folds, repeated 3 times, seeded for reproducibility.
cv_strategy = RepeatedStratifiedKFold(
    n_repeats=3,
    n_splits=5,
    random_state=RANDOM_SEED,
)

print(f"验证策略: 5折交叉验证 x 3次重复 = {cv_strategy.get_n_splits()}轮")
print(f"训练集: {len(X_train)}样本, 正负比={y_train_binary.value_counts().to_dict()}")
print(
    f"验证集: {len(X_validation)}样本, 正负比={y_validation_binary.value_counts().to_dict()}"
)

## 11. 训练模型1: GBM (梯度提升机)

In [None]:
# GBM: histogram-based gradient boosting, fitted on the un-imputed feature frame.
gbm_model = HistGradientBoostingClassifier(
    learning_rate=0.01,
    max_iter=100,
    max_depth=3,
    min_samples_leaf=10,
    l2_regularization=0.1,
    random_state=RANDOM_SEED,
).fit(X_train, y_train_binary)

# Positive-class probabilities drive the AUC; hard labels drive the F1.
y_pred_proba_gbm = gbm_model.predict_proba(X_validation)[:, 1]
y_pred_gbm = gbm_model.predict(X_validation)
auc_gbm = roc_auc_score(y_true=y_validation_binary, y_score=y_pred_proba_gbm)
f1_gbm = f1_score(y_true=y_validation_binary, y_pred=y_pred_gbm)

joblib.dump(gbm_model, "../output/models/gbm_model.pkl")
print(f"GBM AUC: {auc_gbm:.4f}, F1: {f1_gbm:.4f}")

## 12. 训练模型2: KNN (K近邻)

In [None]:
# KNN is trained on the standardized features.
knn_model = KNeighborsClassifier(n_neighbors=5, n_jobs=N_JOBS)
knn_model.fit(X_train_scaled, y_train_binary)

# Hard labels for F1, positive-class probabilities for AUC.
y_pred_knn = knn_model.predict(X_validation_scaled)
y_pred_proba_knn = knn_model.predict_proba(X_validation_scaled)[:, 1]
auc_knn = roc_auc_score(y_validation_binary, y_pred_proba_knn)
f1_knn = f1_score(y_validation_binary, y_pred_knn)

# Persist the model together with everything needed to rebuild its inputs.
# The training matrix came from two imputers (`cat_imputer` for the categorical
# columns, `imputer` for the numerical ones) laid out in `feature_cols_processed`
# order, so the numerical imputer and scaler alone cannot reproduce it.
# The original keys are kept unchanged; the extra keys are additive.
joblib.dump(
    {
        "model": knn_model,
        "imputer": imputer,
        "scaler": scaler,
        # cat_imputer is only fitted when categorical columns exist.
        "cat_imputer": cat_imputer if categorical_cols else None,
        "categorical_cols": categorical_cols,
        "numerical_cols": numerical_cols,
        "feature_cols": feature_cols_processed,
    },
    "../output/models/knn_model.pkl",
)
print(f"KNN AUC: {auc_knn:.4f}, F1: {f1_knn:.4f}")

## 13. 训练模型3: Naive Bayes (朴素贝叶斯)

In [None]:
# Naive Bayes is trained on the imputed (not standardized) features.
nb_model = GaussianNB()
nb_model.fit(X_train_imputed, y_train_binary)

# Hard labels for F1, positive-class probabilities for AUC.
y_pred_nb = nb_model.predict(X_validation_imputed)
y_pred_proba_nb = nb_model.predict_proba(X_validation_imputed)[:, 1]
auc_nb = roc_auc_score(y_validation_binary, y_pred_proba_nb)
f1_nb = f1_score(y_validation_binary, y_pred_nb)

# Persist the model together with everything needed to rebuild its inputs.
# The training matrix came from two imputers (`cat_imputer` for the categorical
# columns, `imputer` for the numerical ones) laid out in `feature_cols_processed`
# order, so the numerical imputer alone cannot reproduce it.
# The original keys are kept unchanged; the extra keys are additive.
joblib.dump(
    {
        "model": nb_model,
        "imputer": imputer,
        # cat_imputer is only fitted when categorical columns exist.
        "cat_imputer": cat_imputer if categorical_cols else None,
        "categorical_cols": categorical_cols,
        "numerical_cols": numerical_cols,
        "feature_cols": feature_cols_processed,
    },
    "../output/models/nb_model.pkl",
)
print(f"NB AUC: {auc_nb:.4f}, F1: {f1_nb:.4f}")

## 14. 训练模型4: XGBoost (极限梯度提升)

In [None]:
# XGBoost with the histogram tree method, fitted on the un-imputed feature frame.
xgb_model = xgb.XGBClassifier(
    tree_method="hist",
    n_estimators=100,
    learning_rate=0.013,
    max_depth=6,
    n_jobs=N_JOBS,
    random_state=RANDOM_SEED,
).fit(X_train, y_train_binary)

# Positive-class probabilities drive the AUC; hard labels drive the F1.
y_pred_proba_xgb = xgb_model.predict_proba(X_validation)[:, 1]
y_pred_xgb = xgb_model.predict(X_validation)
auc_xgb = roc_auc_score(y_true=y_validation_binary, y_score=y_pred_proba_xgb)
f1_xgb = f1_score(y_true=y_validation_binary, y_pred=y_pred_xgb)

joblib.dump(xgb_model, "../output/models/xgb_model.pkl")
print(f"XGB AUC: {auc_xgb:.4f}, F1: {f1_xgb:.4f}")

## 15. 训练模型5: Random Forest (随机森林)

In [None]:
# Random forest with 500 trees.
# NOTE(review): this is fitted on X_train, the un-imputed frame — confirm the
# installed scikit-learn version accepts missing values here.
rf_model = RandomForestClassifier(
    n_estimators=500,
    n_jobs=N_JOBS,
    random_state=RANDOM_SEED,
).fit(X_train, y_train_binary)

# Positive-class probabilities drive the AUC; hard labels drive the F1.
y_pred_proba_rf = rf_model.predict_proba(X_validation)[:, 1]
y_pred_rf = rf_model.predict(X_validation)
auc_rf = roc_auc_score(y_true=y_validation_binary, y_score=y_pred_proba_rf)
f1_rf = f1_score(y_true=y_validation_binary, y_pred=y_pred_rf)

joblib.dump(rf_model, "../output/models/rf_model.pkl")
print(f"RF AUC: {auc_rf:.4f}, F1: {f1_rf:.4f}")

## 16. 训练模型6: RPART (决策树)

In [None]:
# RPART (single decision tree) is trained on the imputed features.
rpart_model = DecisionTreeClassifier(
    max_depth=5, min_samples_split=50, min_samples_leaf=20, random_state=RANDOM_SEED
)
rpart_model.fit(X_train_imputed, y_train_binary)

# Hard labels for F1, positive-class probabilities for AUC.
y_pred_rpart = rpart_model.predict(X_validation_imputed)
y_pred_proba_rpart = rpart_model.predict_proba(X_validation_imputed)[:, 1]
auc_rpart = roc_auc_score(y_validation_binary, y_pred_proba_rpart)
f1_rpart = f1_score(y_validation_binary, y_pred_rpart)

# Persist the model together with everything needed to rebuild its inputs.
# The training matrix came from two imputers (`cat_imputer` for the categorical
# columns, `imputer` for the numerical ones) laid out in `feature_cols_processed`
# order, so the numerical imputer alone cannot reproduce it.
# The original keys are kept unchanged; the extra keys are additive.
joblib.dump(
    {
        "model": rpart_model,
        "imputer": imputer,
        # cat_imputer is only fitted when categorical columns exist.
        "cat_imputer": cat_imputer if categorical_cols else None,
        "categorical_cols": categorical_cols,
        "numerical_cols": numerical_cols,
        "feature_cols": feature_cols_processed,
    },
    "../output/models/rpart_model.pkl",
)
print(f"RPART AUC: {auc_rpart:.4f}, F1: {f1_rpart:.4f}")

## 17. 训练模型7: GLM (逻辑回归)

In [None]:
# GLM (logistic regression) is trained on the imputed features.
glm_model = LogisticRegression(max_iter=1000, random_state=RANDOM_SEED, n_jobs=N_JOBS)
glm_model.fit(X_train_imputed, y_train_binary)

# Hard labels for F1, positive-class probabilities for AUC.
y_pred_glm = glm_model.predict(X_validation_imputed)
y_pred_proba_glm = glm_model.predict_proba(X_validation_imputed)[:, 1]
auc_glm = roc_auc_score(y_validation_binary, y_pred_proba_glm)
f1_glm = f1_score(y_validation_binary, y_pred_glm)

# Persist the model together with everything needed to rebuild its inputs.
# The training matrix came from two imputers (`cat_imputer` for the categorical
# columns, `imputer` for the numerical ones) laid out in `feature_cols_processed`
# order, so the numerical imputer alone cannot reproduce it.
# The original keys are kept unchanged; the extra keys are additive.
joblib.dump(
    {
        "model": glm_model,
        "imputer": imputer,
        # cat_imputer is only fitted when categorical columns exist.
        "cat_imputer": cat_imputer if categorical_cols else None,
        "categorical_cols": categorical_cols,
        "numerical_cols": numerical_cols,
        "feature_cols": feature_cols_processed,
    },
    "../output/models/glm_model.pkl",
)
print(f"GLM AUC: {auc_glm:.4f}, F1: {f1_glm:.4f}")

## 18. 训练模型8: SVM (支持向量机)

In [None]:
# SVM (RBF kernel, probability estimates enabled) is trained on the
# standardized features.
svm_model = SVC(kernel="rbf", probability=True, random_state=RANDOM_SEED)
svm_model.fit(X_train_scaled, y_train_binary)

# Hard labels for F1, positive-class probabilities for AUC.
y_pred_svm = svm_model.predict(X_validation_scaled)
y_pred_proba_svm = svm_model.predict_proba(X_validation_scaled)[:, 1]
auc_svm = roc_auc_score(y_validation_binary, y_pred_proba_svm)
f1_svm = f1_score(y_validation_binary, y_pred_svm)

# Persist the model together with everything needed to rebuild its inputs.
# The training matrix came from two imputers (`cat_imputer` for the categorical
# columns, `imputer` for the numerical ones) laid out in `feature_cols_processed`
# order, so the numerical imputer and scaler alone cannot reproduce it.
# The original keys are kept unchanged; the extra keys are additive.
joblib.dump(
    {
        "model": svm_model,
        "imputer": imputer,
        "scaler": scaler,
        # cat_imputer is only fitted when categorical columns exist.
        "cat_imputer": cat_imputer if categorical_cols else None,
        "categorical_cols": categorical_cols,
        "numerical_cols": numerical_cols,
        "feature_cols": feature_cols_processed,
    },
    "../output/models/svm_model.pkl",
)
print(f"SVM AUC: {auc_svm:.4f}, F1: {f1_svm:.4f}")

## 19. 训练模型9: NNET (神经网络)

In [None]:
# NNET (two hidden layers: 100 and 50 units, early stopping enabled) is trained
# on the standardized features.
nnet_model = MLPClassifier(
    hidden_layer_sizes=(100, 50),
    max_iter=500,
    random_state=RANDOM_SEED,
    early_stopping=True,
)
nnet_model.fit(X_train_scaled, y_train_binary)

# Hard labels for F1, positive-class probabilities for AUC.
y_pred_nnet = nnet_model.predict(X_validation_scaled)
y_pred_proba_nnet = nnet_model.predict_proba(X_validation_scaled)[:, 1]
auc_nnet = roc_auc_score(y_validation_binary, y_pred_proba_nnet)
f1_nnet = f1_score(y_validation_binary, y_pred_nnet)

# Persist the model together with everything needed to rebuild its inputs.
# The training matrix came from two imputers (`cat_imputer` for the categorical
# columns, `imputer` for the numerical ones) laid out in `feature_cols_processed`
# order, so the numerical imputer and scaler alone cannot reproduce it.
# The original keys are kept unchanged; the extra keys are additive.
joblib.dump(
    {
        "model": nnet_model,
        "imputer": imputer,
        "scaler": scaler,
        # cat_imputer is only fitted when categorical columns exist.
        "cat_imputer": cat_imputer if categorical_cols else None,
        "categorical_cols": categorical_cols,
        "numerical_cols": numerical_cols,
        "feature_cols": feature_cols_processed,
    },
    "../output/models/nnet_model.pkl",
)
print(f"NNET AUC: {auc_nnet:.4f}, F1: {f1_nnet:.4f}")

## 20. 导入模型10: TabM

In [None]:
# Load the best saved TabM model and evaluate it on the validation set.
print("=" * 70)
print("导入最佳TabM模型")
print("=" * 70)

import rtdl_num_embeddings

# Run on the GPU when one is available and fall back to the CPU otherwise, so
# the cell does not crash on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the saved preprocessor/config bundle.
tabm_data = joblib.load("../output/models/tabm_preprocessors.pkl")
best_model_name = tabm_data.get("best_model_name", "basic")
saved_metrics = tabm_data.get("metrics", {})

print(f"最佳模型: TabM-{best_model_name}")
if saved_metrics:
    # A missing metric is shown as "N/A". Format only real numbers: applying
    # ".4f" to the placeholder string would raise ValueError.
    saved_f1 = saved_metrics.get("f1")
    saved_auc = saved_metrics.get("auc")
    saved_f1_text = "N/A" if saved_f1 is None else f"{saved_f1:.4f}"
    saved_auc_text = "N/A" if saved_auc is None else f"{saved_auc:.4f}"
    print(f"保存时性能: F1={saved_f1_text}, AUC={saved_auc_text}")

# Move the imputed feature matrices and training labels onto the chosen device.
X_train_processed = X_train_imputed.values
X_train_tensor = torch.tensor(X_train_processed, dtype=torch.float32).to(device)
y_train_tensor = torch.tensor(y_train_binary.values, dtype=torch.long).to(device)
X_val_tensor = torch.tensor(X_validation_imputed.values, dtype=torch.float32).to(
    device
)

print(f"训练集: {X_train_tensor.shape}, 验证集: {X_val_tensor.shape}")

# Build the architecture that matches the saved best-model variant.
model_kwargs = {
    "n_num_features": X_train_tensor.shape[1],
    "cat_cardinalities": [],
    "d_out": 2,
}

# Default embedding parameters.
n_bins = 48
d_embedding = 16
use_periodic = False

# For the HPO variant, try to load the tuned hyper-parameters.
if best_model_name == "hpo":
    hpo_config_path = "../output/tabm_enhanced/models/tabm_hpo_config.pkl"
    if os.path.exists(hpo_config_path):
        hpo_data = joblib.load(hpo_config_path)
        best_params = hpo_data["best_params"]
        n_bins = best_params["n_bins"]
        d_embedding = best_params["d_embedding"]
        model_kwargs["n_blocks"] = best_params["n_blocks"]
        model_kwargs["d_block"] = best_params["d_block"]
        model_kwargs["dropout"] = best_params["dropout"]
        print(
            f"HPO参数: n_blocks={best_params['n_blocks']}, d_block={best_params['d_block']}, dropout={best_params['dropout']}"
        )
    else:
        # Without the config the default architecture is built, which may not
        # match the saved weights; say so instead of failing silently later.
        print(f"警告: 未找到HPO配置文件 {hpo_config_path}，将使用默认结构")
elif best_model_name == "mini":
    model_kwargs["arch_type"] = "tabm-mini"
elif best_model_name == "periodic":
    use_periodic = True

# Create the numerical-feature embedding layer.
if use_periodic:
    num_embeddings = rtdl_num_embeddings.PeriodicEmbeddings(
        n_features=X_train_tensor.shape[1],
        d_embedding=d_embedding,
        lite=False,
    )
else:
    # Bin edges are recomputed from the training tensor.
    num_embeddings = rtdl_num_embeddings.PiecewiseLinearEmbeddings(
        rtdl_num_embeddings.compute_bins(X_train_tensor, n_bins=n_bins),
        d_embedding=d_embedding,
        activation=False,
        version="B",
    )

model_kwargs["num_embeddings"] = num_embeddings

# Create the model on the selected device.
tabm_model = tabm.TabM.make(**model_kwargs).to(device)

# Load the saved weights, mapping them onto the selected device.
tabm_model.load_state_dict(
    torch.load("../output/models/tabm_best.pt", map_location=device, weights_only=True)
)
tabm_model.eval()

print(f"模型参数量: {sum(p.numel() for p in tabm_model.parameters()):,}")

# Validation-set evaluation: softmax over the last axis, mean over axis 1,
# then take the positive-class column.
with torch.no_grad():
    val_logits = tabm_model(X_val_tensor, None)
    y_pred_proba_tabm = (
        torch.softmax(val_logits, dim=-1).mean(dim=1)[:, 1].cpu().numpy()
    )
    y_pred_tabm = (y_pred_proba_tabm >= 0.5).astype(int)

auc_tabm = roc_auc_score(y_validation_binary, y_pred_proba_tabm)
f1_tabm = f1_score(y_validation_binary, y_pred_tabm)

print(f"\nTabM验证集性能: F1={f1_tabm:.4f}, AUC={auc_tabm:.4f}")
print(f"模型来源: ../output/models/tabm_best.pt")
print("=" * 70)

## 21. 导入模型11: TabPFN

In [None]:
# Load the previously saved TabPFN model and score it on the validation set.
print("=" * 70)
print("加载Post-hoc集成TabPFN模型")
print("=" * 70)

# Location of the saved bundle.
auto_model_path = "../output/tabpfn_enhanced/models/tabpfn_auto.pkl"

# The bundle is a mapping; the estimator is stored under "model".
auto_data = joblib.load(auto_model_path)
tabpfn_model = auto_data["model"]

print("已加载Post-hoc集成TabPFN模型")
print(f"模型路径: {auto_model_path}")
print(f"模型类型: {type(tabpfn_model).__name__}")

# Reuse the matrices imputed earlier in this notebook, as plain arrays.
X_train_tabpfn = X_train_imputed.to_numpy()
X_validation_tabpfn = X_validation_imputed.to_numpy()

print(f"训练集: {X_train_tabpfn.shape}, 验证集: {X_validation_tabpfn.shape}")

# Predict on the validation set.
print("\n在验证集上进行预测...")
y_pred_tabpfn = tabpfn_model.predict(X_validation_tabpfn)
y_pred_proba_tabpfn = tabpfn_model.predict_proba(X_validation_tabpfn)[:, 1]

# Performance metrics.
auc_tabpfn = roc_auc_score(y_true=y_validation_binary, y_score=y_pred_proba_tabpfn)
f1_tabpfn = f1_score(y_true=y_validation_binary, y_pred=y_pred_tabpfn)

print("\nPost-hoc集成TabPFN性能指标:")
print(f"  AUC: {auc_tabpfn:.4f}")
print(f"  F1:  {f1_tabpfn:.4f}")

# Store a copy next to the other models.
joblib.dump(dict(model=tabpfn_model), "../output/models/tabpfn_model.pkl")
print("\n模型已复制到: ../output/models/tabpfn_model.pkl")
print("=" * 70)

In [None]:
# Model performance summary: one (name, AUC, F1) record per model, best AUC first.
models_summary = pd.DataFrame(
    [
        ("GBM", auc_gbm, f1_gbm),
        ("KNN", auc_knn, f1_knn),
        ("NB", auc_nb, f1_nb),
        ("XGB", auc_xgb, f1_xgb),
        ("RF", auc_rf, f1_rf),
        ("RPART", auc_rpart, f1_rpart),
        ("GLM", auc_glm, f1_glm),
        ("SVM", auc_svm, f1_svm),
        ("NNET", auc_nnet, f1_nnet),
        ("TabPFN-HPO", auc_tabpfn, f1_tabpfn),
        ("TabM", auc_tabm, f1_tabm),
    ],
    columns=["模型", "AUC", "F1"],
).sort_values("AUC", ascending=False)

# After sorting, the first row is the model with the highest AUC.
best_model_name, best_model_auc, best_model_f1 = models_summary.iloc[0][
    ["模型", "AUC", "F1"]
]

print(f"模型性能 ({len(models_summary)}个模型):")
print(models_summary.to_string(index=False))
print(
    f"\n最佳模型: {best_model_name} (AUC = {best_model_auc:.4f}, F1 = {best_model_f1:.4f})"
)

In [None]:
fig = plt.figure(figsize=(12, 10))
ax = fig.gca()

# ROC curve comparison: (label, positive-class probabilities, AUC) per model.
models_info = [
    ("GBM", y_pred_proba_gbm, auc_gbm),
    ("KNN", y_pred_proba_knn, auc_knn),
    ("NB", y_pred_proba_nb, auc_nb),
    ("XGB", y_pred_proba_xgb, auc_xgb),
    ("RF", y_pred_proba_rf, auc_rf),
    ("RPART", y_pred_proba_rpart, auc_rpart),
    ("GLM", y_pred_proba_glm, auc_glm),
    ("SVM", y_pred_proba_svm, auc_svm),
    ("NNET", y_pred_proba_nnet, auc_nnet),
    ("TabPFN-HPO", y_pred_proba_tabpfn, auc_tabpfn),
    ("TabM", y_pred_proba_tabm, auc_tabm),
]

for model_name, y_proba, auc_score in models_info:
    fpr, tpr, _ = roc_curve(y_validation_binary, y_proba)
    # The two tabular deep-learning models are drawn with a thicker line.
    if model_name in ("TabPFN-HPO", "TabM"):
        linewidth = 3
    else:
        linewidth = 2
    ax.plot(
        fpr, tpr, linewidth=linewidth, label=f"{model_name} (AUC = {auc_score:.4f})"
    )

# Diagonal reference line for a random classifier.
ax.plot([0, 1], [0, 1], "k--", linewidth=1, label="随机猜测")
ax.set_xlabel("假阳性率 (1-特异度)", fontsize=12)
ax.set_ylabel("真阳性率 (灵敏度)", fontsize=12)
ax.set_title(
    "11个模型的ROC曲线对比（含TabPFN-HPO和TabM）", fontsize=14, fontweight="bold"
)
ax.legend(loc="lower right", fontsize=10)
ax.grid(alpha=0.3)
plt.tight_layout()

# Save both a vector (PDF) and a raster (PNG) copy before showing the figure.
plt.savefig("../output/ROC曲线对比_11模型.pdf", dpi=300, bbox_inches="tight")
plt.savefig("../output/ROC曲线对比_11模型.png", dpi=300, bbox_inches="tight")
plt.show()

print("ROC曲线已保存")

In [None]:
# Save the per-sample predictions and the performance table.
_model_outputs = [
    ("GBM", y_pred_gbm, y_pred_proba_gbm),
    ("KNN", y_pred_knn, y_pred_proba_knn),
    ("NB", y_pred_nb, y_pred_proba_nb),
    ("XGB", y_pred_xgb, y_pred_proba_xgb),
    ("RF", y_pred_rf, y_pred_proba_rf),
    ("RPART", y_pred_rpart, y_pred_proba_rpart),
    ("GLM", y_pred_glm, y_pred_proba_glm),
    ("SVM", y_pred_svm, y_pred_proba_svm),
    ("NNET", y_pred_nnet, y_pred_proba_nnet),
    ("TabPFN-HPO", y_pred_tabpfn, y_pred_proba_tabpfn),
    ("TabM", y_pred_tabm, y_pred_proba_tabm),
]

# The true label comes first, then a (prediction, probability) column pair per model.
_prediction_columns = {"真实标签": y_validation_binary}
for _label, _pred, _proba in _model_outputs:
    _prediction_columns[f"{_label}_预测"] = _pred
    _prediction_columns[f"{_label}_概率"] = _proba
predictions_df = pd.DataFrame(_prediction_columns)

predictions_df.to_csv(
    "../output/验证集预测结果_11模型.csv", index=False, encoding="utf-8-sig"
)
print(f"预测结果已保存，验证集共 {len(predictions_df)} 样本，包含11个模型的预测")

# Save the model performance summary.
models_summary.to_csv(
    "../output/模型性能汇总_AUC_F1.csv", index=False, encoding="utf-8-sig"
)
print("模型性能汇总已保存 (包含AUC和F1指标)")