In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [3]:
!pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7


In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import numpy as np
# 引入 KFold, StratifiedKFold 是更适合不均衡数据集的版本
from sklearn.model_selection import StratifiedKFold
from ucimlrepo import fetch_ucirepo
from sklearn.metrics import roc_auc_score, accuracy_score, average_precision_score, recall_score, confusion_matrix, balanced_accuracy_score
from sklearn.preprocessing import StandardScaler
from torch.optim.lr_scheduler import ReduceLROnPlateau
import os
import matplotlib.pyplot as plt
import seaborn as sns
import json
import math

# --- 引入 Scikit-learn 模型 ---
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.multioutput import MultiOutputClassifier

# --- 定义设备 ---
device = torch.device('mps' if torch.backends.mps.is_available() else ('cuda' if torch.cuda.is_available() else 'cpu'))
print(f"Using device: {device}")

# --- Focal Loss (用于 PyTorch MLP 模型) ---
class FocalLoss(nn.Module):
    def __init__(self, alpha=0.25, gamma=2.0, reduction='mean', pos_weight=None):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction
        self.pos_weight = pos_weight

    def forward(self, inputs, targets):
        current_pos_weight = self.pos_weight
        if current_pos_weight is not None and current_pos_weight.device != inputs.device:
             current_pos_weight = current_pos_weight.to(inputs.device)

        BCE_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction='none', pos_weight=current_pos_weight)
        probs = torch.sigmoid(inputs)
        pt = torch.where(targets == 1, probs, 1 - probs)
        
        alpha_t = torch.where(targets == 1, self.alpha, 1 - self.alpha).to(inputs.device)
        focal_weight = alpha_t * torch.pow(1 - pt, self.gamma)
        loss = focal_weight * BCE_loss
        
        if self.reduction == 'mean':
            return loss.mean()
        elif self.reduction == 'sum':
            return loss.sum()
        else:
            return loss

# --- MLP 模型 (PyTorch) ---
class MLPNet(nn.Module):
    def __init__(self, num_features, hidden_channels, num_classes, dropout_rate=0.4):
        super(MLPNet, self).__init__()
        self.fc1 = nn.Linear(num_features, hidden_channels)
        self.bn1 = nn.BatchNorm1d(hidden_channels)
        self.fc2 = nn.Linear(hidden_channels, hidden_channels // 2)
        self.bn2 = nn.BatchNorm1d(hidden_channels // 2)
        self.fc3 = nn.Linear(hidden_channels // 2, num_classes)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        x = F.relu(self.bn1(self.fc1(x)))
        x = self.dropout(x)
        x = F.relu(self.bn2(self.fc2(x)))
        x = self.dropout(x)
        x = self.fc3(x)
        return x

# --- 数据准备 (与之前相同) ---
myocardial_infarction_complications = fetch_ucirepo(id=579)
X_df_orig = myocardial_infarction_complications.data.features.copy()
y_df_orig = myocardial_infarction_complications.data.targets.copy()
target_names = list(y_df_orig.columns)

# ... (数据清洗和特征选择部分与原代码完全相同，此处省略以保持简洁) ...
feature_collection_time = {
    'AGE': 'admission', 'SEX': 'admission', 'INF_ANAM': 'admission', 'STENOK_AN': 'admission',
    'FK_STENOK': 'admission', 'IBS_POST': 'admission', 'IBS_NASL': 'admission', 'GB': 'admission',
    'SIM_GIPERT': 'admission', 'DLIT_AG': 'admission', 'ZSN_A': 'admission', 'nr_11': 'admission',
    'nr_01': 'admission', 'nr_02': 'admission', 'nr_03': 'admission', 'nr_04': 'admission',
    'nr_07': 'admission', 'nr_08': 'admission', 'np_01': 'admission', 'np_04': 'admission',
    'np_05': 'admission', 'np_07': 'admission', 'np_08': 'admission', 'np_09': 'admission',
    'np_10': 'admission', 'endocr_01': 'admission', 'endocr_02': 'admission', 'endocr_03': 'admission',
    'zab_leg_01': 'admission', 'zab_leg_02': 'admission', 'zab_leg_03': 'admission',
    'zab_leg_04': 'admission', 'zab_leg_06': 'admission', 'S_AD_KBRIG': 'admission',
    'D_AD_KBRIG': 'admission', 'S_AD_ORIT': 'admission', 'D_AD_ORIT': 'admission',
    'O_L_POST': 'admission', 'K_SH_POST': 'admission', 'MP_TP_POST': 'admission',
    'SVT_POST': 'admission', 'GT_POST': 'admission', 'FIB_G_POST': 'admission',
    'ant_im': 'admission', 'lat_im': 'admission', 'inf_im': 'admission', 'post_im': 'admission',
    'IM_PG_P': 'admission', 'ritm_ecg_p_01': 'admission', 'ritm_ecg_p_02': 'admission',
    'ritm_ecg_p_04': 'admission', 'ritm_ecg_p_06': 'admission', 'ritm_ecg_p_07': 'admission',
    'ritm_ecg_p_08': 'admission', 'n_r_ecg_p_01': 'admission', 'n_r_ecg_p_02': 'admission',
    'n_r_ecg_p_03': 'admission', 'n_r_ecg_p_04': 'admission', 'n_r_ecg_p_05': 'admission',
    'n_r_ecg_p_06': 'admission', 'n_r_ecg_p_08': 'admission', 'n_r_ecg_p_09': 'admission',
    'n_r_ecg_p_10': 'admission', 'n_p_ecg_p_01': 'admission', 'n_p_ecg_p_03': 'admission',
    'n_p_ecg_p_04': 'admission', 'n_p_ecg_p_05': 'admission', 'n_p_ecg_p_06': 'admission',
    'n_p_ecg_p_07': 'admission', 'n_p_ecg_p_08': 'admission', 'n_p_ecg_p_09': 'admission',
    'n_p_ecg_p_10': 'admission', 'n_p_ecg_p_11': 'admission', 'n_p_ecg_p_12': 'admission',
    'fibr_ter_01': 'admission', 'fibr_ter_02': 'admission', 'fibr_ter_03': 'admission',
    'fibr_ter_05': 'admission', 'fibr_ter_06': 'admission', 'fibr_ter_07': 'admission',
    'fibr_ter_08': 'admission', 'GIPO_K': 'admission', 'K_BLOOD': 'admission',
    'GIPER_NA': 'admission', 'NA_BLOOD': 'admission', 'ALT_BLOOD': 'admission',
    'AST_BLOOD': 'admission', 'KFK_BLOOD': 'admission', 'L_BLOOD': 'admission',
    'ROE': 'admission', 'TIME_B_S': 'admission', 'NA_KB': 'admission', 'NOT_NA_KB': 'admission',
    'LID_KB': 'admission', 'NITR_S': 'admission', 'LID_S_n': 'admission',
    'B_BLOK_S_n': 'admission', 'ANT_CA_S_n': 'admission', 'GEPAR_S_n': 'admission',
    'ASP_S_n': 'admission', 'TIKL_S_n': 'admission', 'TRENT_S_n': 'admission',
}
admission_features = [f for f, t in feature_collection_time.items() if t == 'admission' and f in X_df_orig.columns]
X_processed = X_df_orig[admission_features].copy()
for col in admission_features:
    if X_processed[col].isnull().any():
        X_processed[col].fillna(X_processed[col].median(), inplace=True)

y_processed = y_df_orig.copy()
for col in y_processed.columns:
    if y_processed[col].isnull().any():
        y_processed[col].fillna(y_processed[col].median(), inplace=True)
    y_processed[col] = y_processed[col].astype(int)

label_11_col_name = y_processed.columns[10]
y_processed[label_11_col_name] = (y_processed[label_11_col_name] > 0).astype(int)

# --- 数据标准化和准备 ---
# 特征缩放器现在将在交叉验证循环之外定义，以对整个数据集进行拟合
# 这样可以确保每一折都使用相同的缩放标准
scaler = StandardScaler()
X_scaled_np = scaler.fit_transform(X_processed)
y_np = y_processed.values

# --- 评估和绘图函数 (与之前相同) ---
def calculate_metrics(all_labels_np, all_preds_prob_np):
    # ... (此函数与之前完全相同) ...
    threshold = 0.5
    all_preds_binary_np = (all_preds_prob_np > threshold).astype(int)

    num_labels_total = all_labels_np.shape[1]
    metrics = {
        'per_label_acc': [], 'per_label_recall': [], 'per_label_specificity': [],
        'per_label_ap': [], 'per_label_roc_auc': [], 'per_label_balanced_acc': [],
        'confusion_matrices': []
    }

    for i in range(num_labels_total):
        true_1d = all_labels_np[:, i]
        prob_1d = all_preds_prob_np[:, i]
        pred_1d = all_preds_binary_np[:, i]
        unique_true = np.unique(true_1d)

        acc = accuracy_score(true_1d, pred_1d)
        metrics['per_label_acc'].append(acc)

        if len(unique_true) < 2:
            metrics['per_label_recall'].append(np.nan)
            metrics['per_label_specificity'].append(np.nan)
            metrics['per_label_ap'].append(np.nan)
            metrics['per_label_roc_auc'].append(np.nan)
            metrics['per_label_balanced_acc'].append(np.nan if np.isnan(acc) else acc)
            if unique_true[0] == 0: tn, fp, fn, tp = np.sum(pred_1d == 0), np.sum(pred_1d == 1), 0, 0
            else: tn, fp, fn, tp = 0, 0, np.sum(pred_1d == 0), np.sum(pred_1d == 1)
            metrics['confusion_matrices'].append((tn, fp, fn, tp))
            continue

        cm = confusion_matrix(true_1d, pred_1d, labels=[0, 1])
        tn, fp, fn, tp = cm.ravel()
        metrics['confusion_matrices'].append((tn, fp, fn, tp))
        
        recall = tp / (tp + fn) if (tp + fn) > 0 else np.nan
        specificity = tn / (tn + fp) if (tn + fp) > 0 else np.nan
        balanced_acc = (recall + specificity) / 2 if not (np.isnan(recall) or np.isnan(specificity)) else np.nan
        
        metrics['per_label_recall'].append(recall)
        metrics['per_label_specificity'].append(specificity)
        metrics['per_label_balanced_acc'].append(balanced_acc)

        try:
            ap = average_precision_score(true_1d, prob_1d)
            roc_auc = roc_auc_score(true_1d, prob_1d)
        except ValueError:
            ap, roc_auc = np.nan, np.nan
        metrics['per_label_ap'].append(ap)
        metrics['per_label_roc_auc'].append(roc_auc)
        
    final_metrics = {
        'mean_acc': np.nanmean(metrics['per_label_acc']),
        'mean_recall': np.nanmean(metrics['per_label_recall']),
        'mean_specificity': np.nanmean(metrics['per_label_specificity']),
        'mean_ap': np.nanmean(metrics['per_label_ap']),
        'mean_roc_auc': np.nanmean(metrics['per_label_roc_auc']),
        'mean_balanced_acc': np.nanmean(metrics['per_label_balanced_acc']),
    }
    return final_metrics

def evaluate_sklearn_model(model, X_data_np, y_data_np):
    preds_prob_list = model.predict_proba(X_data_np)
    preds_prob_np = np.hstack([p[:, 1].reshape(-1, 1) for p in preds_prob_list])
    return calculate_metrics(y_data_np, preds_prob_np)

def evaluate_pytorch_model(model, X_data, y_data):
    model.eval()
    X_data, y_data = X_data.to(device), y_data.to(device)
    with torch.no_grad():
        out = model(X_data)
        preds_prob = torch.sigmoid(out)
    return calculate_metrics(y_data.cpu().numpy(), preds_prob.cpu().numpy())

def plot_performance_bar_chartjs(aggregated_results):
    os.makedirs('results', exist_ok=True)
    models = list(aggregated_results.keys())
    
    # 提取平均值用于绘图
    datasets_dict = {
        "Mean ROC-AUC": {"data": [aggregated_results[b]['mean_roc_auc']['mean'] for b in models], "backgroundColor": "rgba(255, 99, 132, 0.7)"},
        "Mean Balanced Acc": {"data": [aggregated_results[b]['mean_balanced_acc']['mean'] for b in models], "backgroundColor": "rgba(255, 159, 64, 0.7)"},
        "Mean Recall": {"data": [aggregated_results[b]['mean_recall']['mean'] for b in models], "backgroundColor": "rgba(54, 162, 235, 0.7)"},
        "Mean AP": {"data": [aggregated_results[b]['mean_ap']['mean'] for b in models], "backgroundColor": "rgba(255, 206, 86, 0.7)"}
    }
    chart_config = {"type": "bar", "data": {"labels": models, "datasets": [{"label": k, **v} for k,v in datasets_dict.items()]},
                    "options": {"responsive": True, "maintainAspectRatio": False, 
                                "plugins": {"title": {"display": True, "text": "Model Performance Comparison (5-Fold CV)"},
                                            "legend": {"position": "top"}},
                                "scales": {"y": {"beginAtZero": True, "title": {"display": True, "text": "Score"}}}}}
    with open('results/overall_performance_summary_chartjs.json', 'w') as f:
        json.dump(chart_config, f, indent=4)


# --- 交叉验证主循环 ---
N_SPLITS = 5
RANDOM_STATE = 42

# 用于分层的标签：1表示至少有一个并发症，0表示没有
stratify_labels = (y_np.sum(axis=1) > 0).astype(int)
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)

model_configs = {
    'LogisticRegression': {
        'model': LogisticRegression, 'params': {'solver': 'liblinear', 'random_state': RANDOM_STATE, 'max_iter': 200}, 'type': 'sklearn'
    },
    'XGBoost': {
        'model': XGBClassifier, 'params': {'use_label_encoder': False, 'eval_metric': 'logloss', 'random_state': RANDOM_STATE, 'n_estimators': 150}, 'type': 'sklearn'
    },
    'CatBoost': {
        'model': CatBoostClassifier, 'params': {'random_state': RANDOM_STATE, 'verbose': 0, 'iterations': 200}, 'type': 'sklearn'
    },
    'MLP': {
        'model': MLPNet, 'params': {'hidden_channels': 128, 'dropout_rate': 0.4}, 'type': 'pytorch'
    }
}

# 存储每个模型在每一折上的结果
all_model_fold_results = {model_name: [] for model_name in model_configs.keys()}

# --- 开始5折交叉验证 ---
for fold, (train_indices, test_indices) in enumerate(skf.split(X_scaled_np, stratify_labels)):
    print(f"\n===== 开始处理第 {fold+1}/{N_SPLITS} 折 =====")

    # 1. 根据当前折的索引准备数据
    X_train, y_train = X_scaled_np[train_indices], y_np[train_indices]
    X_test, y_test = X_scaled_np[test_indices], y_np[test_indices]
    
    # 2. 为PyTorch模型准备Tensors
    X_train_tensor, y_train_tensor = torch.tensor(X_train, dtype=torch.float), torch.tensor(y_train, dtype=torch.float)
    X_test_tensor, y_test_tensor = torch.tensor(X_test, dtype=torch.float), torch.tensor(y_test, dtype=torch.float)

    # 3. 遍历并训练/评估每个模型
    for model_name, config in model_configs.items():
        print(f"--- 正在处理模型: {model_name} (第 {fold+1} 折) ---")

        if config['type'] == 'sklearn':
            # --- Scikit-learn 模型训练和评估 ---
            base_model = config['model'](**config['params'])
            model = MultiOutputClassifier(base_model)
            model.fit(X_train, y_train)
            test_result_dict = evaluate_sklearn_model(model, X_test, y_test)
        
        elif config['type'] == 'pytorch':
            # --- PyTorch MLP 训练和评估 ---
            # !! 关键：每一折都必须重新初始化模型、优化器等
            num_features = X_train.shape[1]
            num_classes = y_train.shape[1]
            model = config['model'](num_features=num_features, num_classes=num_classes, **config['params']).to(device)
            optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-5)
            
            # 动态计算当前训练集的 pos_weight
            num_positives_train = y_train_tensor.sum(axis=0)
            num_negatives_train = len(y_train_tensor) - num_positives_train
            pos_weight = num_negatives_train / (num_positives_train + 1e-8)
            
            criterion = FocalLoss(alpha=0.25, gamma=2.0, reduction='mean', pos_weight=pos_weight)

            epochs = 150 # 减少epoch数量，因为我们现在有5次训练
            
            for epoch in range(1, epochs + 1):
                model.train()
                optimizer.zero_grad()
                out = model(X_train_tensor.to(device))
                loss = criterion(out, y_train_tensor.to(device))
                loss.backward()
                optimizer.step()
                if epoch % 50 == 0:
                     print(f'    Epoch {epoch}/{epochs}, Loss: {loss.item():.4f}')
            
            # 在当前折的测试集上评估
            test_result_dict = evaluate_pytorch_model(model, X_test_tensor, y_test_tensor)

        print(f"    {model_name} (第 {fold+1} 折) 测试结果: ROC-AUC={test_result_dict['mean_roc_auc']:.4f}, Bal.Acc={test_result_dict['mean_balanced_acc']:.4f}")
        all_model_fold_results[model_name].append(test_result_dict)

# --- 聚合和报告最终结果 ---
print("\n===== 5折交叉验证完成，正在聚合结果... =====")

aggregated_results = {}
for model_name, fold_results in all_model_fold_results.items():
    # 使用DataFrame方便地计算均值和标准差
    df = pd.DataFrame(fold_results)
    aggregated_results[model_name] = {
        metric: {'mean': df[metric].mean(), 'std': df[metric].std()}
        for metric in df.columns
    }

# --- 打印最终的性能总结表格 ---
summary_list = []
for model_name, metrics in aggregated_results.items():
    entry = {'Model': model_name}
    for metric_name, values in metrics.items():
        entry[f"{metric_name} (Mean)"] = values['mean']
        entry[f"{metric_name} (Std)"] = values['std']
    summary_list.append(entry)

performance_summary_df = pd.DataFrame(summary_list)
# 格式化输出，使其更易读
formatted_df = performance_summary_df[['Model']].copy()
for metric in aggregated_results[list(model_configs.keys())[0]].keys():
    mean_col = f"{metric} (Mean)"
    std_col = f"{metric} (Std)"
    formatted_df[metric] = performance_summary_df.apply(
        lambda row: f"{row[mean_col]:.4f} ± {row[std_col]:.4f}", axis=1
    )

print("\n--- 最终性能总结 (Mean ± Std over 5 Folds) ---")
print(formatted_df.to_string())

# 保存详细和格式化的结果到CSV
performance_summary_df.to_csv('results/overall_performance_summary_detailed.csv', index=False)
formatted_df.to_csv('results/overall_performance_summary_formatted.csv', index=False)
print("\n详细和格式化的性能总结已保存到 'results/' 文件夹中。")

# 绘制最终的性能对比图
if aggregated_results:
    plot_performance_bar_chartjs(aggregated_results)
    print("最终性能对比图 (Chart.js config) 已保存到 results/overall_performance_summary_chartjs.json")

print("\n--- 脚本运行完毕 ---")

Using device: cpu


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_processed[col].fillna(X_processed[col].median(), inplace=True)



===== 开始处理第 1/5 折 =====
--- 正在处理模型: LogisticRegression (第 1 折) ---
    LogisticRegression (第 1 折) 测试结果: ROC-AUC=0.6348, Bal.Acc=0.5265
--- 正在处理模型: XGBoost (第 1 折) ---
    XGBoost (第 1 折) 测试结果: ROC-AUC=0.6699, Bal.Acc=0.5371
--- 正在处理模型: CatBoost (第 1 折) ---
    CatBoost (第 1 折) 测试结果: ROC-AUC=0.6869, Bal.Acc=0.5255
--- 正在处理模型: MLP (第 1 折) ---
    Epoch 50/150, Loss: 0.1027
    Epoch 100/150, Loss: 0.0704
    Epoch 150/150, Loss: -0.0627
    MLP (第 1 折) 测试结果: ROC-AUC=0.6389, Bal.Acc=0.5906

===== 开始处理第 2/5 折 =====
--- 正在处理模型: LogisticRegression (第 2 折) ---
    LogisticRegression (第 2 折) 测试结果: ROC-AUC=0.6781, Bal.Acc=0.5397
--- 正在处理模型: XGBoost (第 2 折) ---
    XGBoost (第 2 折) 测试结果: ROC-AUC=0.6322, Bal.Acc=0.5362
--- 正在处理模型: CatBoost (第 2 折) ---
    CatBoost (第 2 折) 测试结果: ROC-AUC=0.6624, Bal.Acc=0.5376
--- 正在处理模型: MLP (第 2 折) ---
    Epoch 50/150, Loss: 0.1049
    Epoch 100/150, Loss: 0.0338
    Epoch 150/150, Loss: -0.1588
    MLP (第 2 折) 测试结果: ROC-AUC=0.6766, Bal.Acc=0.5566

===== 开始处理第 3