# In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    roc_auc_score, roc_curve, confusion_matrix, accuracy_score, recall_score, 
    precision_score, f1_score, matthews_corrcoef
)
from sklearn.utils import resample

# Locations of the imputed training / validation tables (hard-coded local paths).
train_path = r"D:\临床数据\NHANES数据清洗\train_imputed.csv"
val_path = r"D:\临床数据\NHANES数据清洗\val_imputed.csv"

train_df, val_df = (
    pd.read_csv(csv_path, low_memory=False) for csv_path in (train_path, val_path)
)

# Column layout used below: position 0 is "SEQN", positions 1-13 are the
# model predictors, and the last position is the binary outcome.
selected_columns = [
    "SEQN",'Age', 'Ratio of family income to poverty', 'Alanine Aminotransferase (ALT) (U/L)', 
    'Race_2','Race_3','Race_4','Race_5', 'Chloride (mmol/L)',
    'Minutes sedentary activity', 'Walk or bicycle_2', 'Glucose_serum (mmol/L)', 
    'Direct HDL-Cholesterol (mmol/L)', 'Mean cell volume (fL)', 
    "3 year heart death"
]

# Keep only the columns of interest, in the order listed above.
train_df, val_df = train_df[selected_columns], val_df[selected_columns]

# Predictors are taken positionally (columns 1..13), which skips "SEQN" and
# the outcome column; the outcome is taken by name.
x_train, x_val = (frame.iloc[:, 1:14] for frame in (train_df, val_df))
y_train, y_val = (frame["3 year heart death"] for frame in (train_df, val_df))

# Fit a logistic regression with scikit-learn's default settings.
estimator = LogisticRegression()
estimator.fit(x_train, y_train)

# Column 1 of predict_proba is the probability of the second entry in
# estimator.classes_ (the positive class when the outcome is coded 0/1).
y_probs = estimator.predict_proba(x_val)[:, 1]
auc = roc_auc_score(y_val, y_probs)

# Percentile bootstrap (95%) for the validation AUC.
n_iterations = 1000
auc_scores = []
for i in range(n_iterations):
    # The fitted model is fixed, so resampling the already-computed
    # probabilities together with the labels draws the same rows as
    # resampling x_val and calling predict_proba again, without repeating
    # inference on every iteration.
    y_resampled, y_probs_resampled = resample(y_val, y_probs, random_state=i)
    if np.unique(y_resampled).size < 2:
        # AUC is undefined when a resample contains a single class
        # (roc_auc_score raises ValueError); skip that draw instead of
        # aborting the whole bootstrap.
        continue
    auc_scores.append(roc_auc_score(y_resampled, y_probs_resampled))

# Bounds of the central 95% of the bootstrap distribution.
auc_lower, auc_upper = np.percentile(auc_scores, [2.5, 97.5])

# Choose the operating threshold that maximises Youden's J = TPR - FPR.
# NOTE(review): the threshold is tuned on the same validation set it is then
# evaluated on, so the thresholded metrics below are likely optimistic.
fpr, tpr, thresholds = roc_curve(y_val, y_probs)
youden_index = tpr - fpr
best_threshold = thresholds[np.argmax(youden_index)]
print(f"最佳阈值（Youden Index）：{best_threshold:.3f}")

# Point estimates at the chosen threshold.
y_pred_best = (y_probs >= best_threshold).astype(int)
# labels=[0, 1] pins the matrix to 2x2 in [[TN, FP], [FN, TP]] order even if a
# class is absent. Assumes the outcome is coded 0/1, as the int predictions
# and the default pos_label=1 of the metric functions already require.
cm = confusion_matrix(y_val, y_pred_best, labels=[0, 1])
tn, fp, fn, _ = cm.ravel()
accuracy = accuracy_score(y_val, y_pred_best)
sensitivity = recall_score(y_val, y_pred_best)
specificity = tn / (tn + fp)
ppv = precision_score(y_val, y_pred_best)
# NPV = TN / (TN + FN). The previous cm[1, 1] / (cm[1, 1] + cm[1, 0]) was
# TP / (TP + FN), i.e. sensitivity reported under the NPV label.
npv = tn / (tn + fn)
mcc = matthews_corrcoef(y_val, y_pred_best)
f1 = f1_score(y_val, y_pred_best)

# Bootstrap distributions of the thresholded metrics (threshold held fixed).
acc_scores, sn_scores, sp_scores, ppv_scores, npv_scores, mcc_scores, f1_scores = [], [], [], [], [], [], []
for i in range(n_iterations):
    # Resample labels and their precomputed probabilities together; this
    # selects the same rows as resampling x_val and re-running predict_proba.
    y_resampled, y_probs_resampled = resample(y_val, y_probs, random_state=i)
    y_pred_resampled = (y_probs_resampled >= best_threshold).astype(int)
    cm_resampled = confusion_matrix(y_resampled, y_pred_resampled, labels=[0, 1])
    tn_r, fp_r, fn_r, _ = cm_resampled.ravel()
    acc_scores.append(accuracy_score(y_resampled, y_pred_resampled))
    sn_scores.append(recall_score(y_resampled, y_pred_resampled))
    sp_scores.append(tn_r / (tn_r + fp_r))
    ppv_scores.append(precision_score(y_resampled, y_pred_resampled))
    npv_scores.append(tn_r / (tn_r + fn_r))
    mcc_scores.append(matthews_corrcoef(y_resampled, y_pred_resampled))
    f1_scores.append(f1_score(y_resampled, y_pred_resampled))

def calculate_ci(scores):
    """Return the percentile-based 95% interval of *scores*.

    Args:
        scores: Sequence of numeric values (e.g. bootstrap replicates).

    Returns:
        A ``(lower, upper)`` tuple holding the 2.5th and 97.5th percentiles.
    """
    lower_bound, upper_bound = np.percentile(scores, [2.5, 97.5])
    return lower_bound, upper_bound

# 95% bootstrap intervals, one per thresholded metric, computed in the same
# order as the score lists are given.
(
    accuracy_ci,
    sensitivity_ci,
    specificity_ci,
    ppv_ci,
    npv_ci,
    mcc_ci,
    f1_ci,
) = (
    calculate_ci(replicates)
    for replicates in (
        acc_scores,
        sn_scores,
        sp_scores,
        ppv_scores,
        npv_scores,
        mcc_scores,
        f1_scores,
    )
)

# Report every metric as "point estimate (lower - upper)".
print("\n===== 验证集性能指标 =====")
print(f"AUC: {auc:.3f} ({auc_lower:.3f} - {auc_upper:.3f})")
report_rows = [
    ("Accuracy", accuracy, accuracy_ci),
    ("Sensitivity", sensitivity, sensitivity_ci),
    ("Specificity", specificity, specificity_ci),
    ("PPV", ppv, ppv_ci),
    ("NPV", npv, npv_ci),
    ("MCC", mcc, mcc_ci),
    ("F1 Score", f1, f1_ci),
]
for metric_name, point_estimate, (ci_low, ci_high) in report_rows:
    print(f"{metric_name}: {point_estimate:.3f} ({ci_low:.3f} - {ci_high:.3f})")