In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import xgboost as xgb
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.utils import resample
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import joblib

# Locations of the training and validation tables (file names indicate they are
# already imputed).
train_path = r"D:\临床数据\NHANES数据清洗\train_imputed.csv"
val_path = r"D:\临床数据\NHANES数据清洗\val_imputed.csv"

# low_memory=False turns off pandas' chunked parsing, so each column's dtype is
# inferred from the whole file rather than chunk by chunk.
train_df, val_df = (
    pd.read_csv(csv_path, low_memory=False) for csv_path in (train_path, val_path)
)

# Columns kept for modelling. ORDER MATTERS: the code below assigns roles by
# position -- index 0 ("SEQN") is carried along but never used as a feature,
# indices 1-14 are treated as categorical, index 15 up to the second-to-last are
# treated as numeric, and the last column ("3 year heart death") is the outcome.
selected_columns = [
    "SEQN", "Doctor told you have diabetes", "Education Level - Adults 20+", "Ever told you had a stroke",
    "Ever told you had coronary heart disease", "Ever told you had high blood pressure",
    "Had at least 12 alcohol drinks/1 yr?", "Marital Status", "Moderate recreational activities",
    "Moderate work activity", "Race", "Smoked at least 100 cigarettes in life",
    "Vigorous recreational activities", "Vigorous work activity", "Walk or bicycle", "Age",
    "Minutes sedentary activity", "Ratio of family income to poverty", "Waist Circumference (cm)",
    "Alanine Aminotransferase (ALT) (U/L)", "Albumin (g/L)", "Albumin_urine (mg/L)",
    "Alkaline Phosphatase (ALP) (IU/L)", "Bicarbonate (mmol/L)", "Blood urea nitrogen (mmol/L)",
    "Chloride (mmol/L)", "Cholesterol (mmol/L)", "Creatinine (umol/L)", "Creatinine_urine (umol/L)",
    "Direct HDL-Cholesterol (mmol/L)", "Gamma Glutamyl Transferase (GGT) (U/L)", "Globulin (g/L)",
    "Glucose_serum (mmol/L)", "Glycohemoglobin (%)", "Hematocrit (%)", "Hemoglobin (g/dL)",
    "Iron_refigerated (umol/L)", "Lactate Dehydrogenase (LDH) (U/L)", "Lymphocyte number (1000 cells/uL)",
    "Lymphocyte percent (%)", "Mean cell hemoglobin (pg)", "Mean cell hemoglobin concentration (g/dL)",
    "Mean cell volume (fL)", "Monocyte number (1000 cells/uL)", "Monocyte percent (%)",
    "Osmolality (mmol/Kg)", "Phosphorus (mmol/L)", "Platelet count (1000 cells/uL)",
    "Potassium (mmol/L)", "Red blood cell count (million cells/uL)", "Red cell distribution width (%)",
    "Segmented neutrophils num (1000 cell/uL)", "Segmented neutrophils percent (%)",
    "Total protein (g/L)", "Uric acid (umol/L)", "3 year heart death"
]
# Take explicit copies: the subsets are modified in place further down (columns
# are re-assigned as categoricals), and assigning into a frame that pandas
# tracks as a slice of another frame triggers SettingWithCopyWarning.
train_df = train_df[selected_columns].copy()
val_df = val_df[selected_columns].copy()

# Column roles are positional: columns 1-14 are one-hot encoded as categoricals,
# columns 15 up to (but excluding) the last one are standardized as numerics.
# Column 0 and the last column are not used as features.
categorical_cols = train_df.columns[1:15]
numeric_cols = train_df.columns[15:-1]

for col in categorical_cols:
    train_df[col] = train_df[col].astype("category")
    # Reuse the TRAINING categories for the validation column. If each split
    # inferred its own categories, drop_first=True could drop a different
    # baseline level in validation (e.g. when the lowest training level is
    # absent there), and the reindex below would then zero-fill the column of a
    # level that is really present. Validation values never seen in training
    # become NaN and encode as all-zero dummies.
    val_df[col] = val_df[col].astype(train_df[col].dtype)

train_encoded = pd.get_dummies(train_df[categorical_cols], drop_first=True)
val_encoded = pd.get_dummies(val_df[categorical_cols], drop_first=True)

# Safety net: force the validation dummies into exactly the training column
# set and order, filling any missing column with 0.
val_encoded = val_encoded.reindex(columns=train_encoded.columns, fill_value=0)

# Standardize the numeric predictors. The scaler is fitted on the training
# split only and then applied unchanged to the validation split.
scaler = StandardScaler()
# fit_transform/transform return bare arrays; rebuild DataFrames that carry the
# source frame's index. pd.concat(axis=1) below aligns rows by index label, so
# without index= the rows would only line up while the source index happens to
# be the default 0..n-1 range.
train_standardized = pd.DataFrame(
    scaler.fit_transform(train_df[numeric_cols]),
    columns=numeric_cols,
    index=train_df.index,
)
val_standardized = pd.DataFrame(
    scaler.transform(val_df[numeric_cols]),
    columns=numeric_cols,
    index=val_df.index,
)

# Merge the one-hot encoded categorical block with the standardized numeric
# block into the final design matrices; the outcome is the last selected column.
x_train = pd.concat([train_encoded, train_standardized], axis=1)
y_train = train_df["3 year heart death"]
x_val = pd.concat([val_encoded, val_standardized], axis=1)
y_val = val_df["3 year heart death"]

def bootstrap_auc(estimator, x_val, y_val, n_iterations=1000, alpha=0.95, random_state=None):
    """Estimate the ROC AUC of a fitted classifier with a percentile bootstrap.

    The validation rows are resampled with replacement ``n_iterations`` times
    and the AUC is computed on every resample.

    Args:
        estimator: Fitted classifier exposing ``predict_proba``; column 1 of
            its output is used as the score.
        x_val: Validation feature matrix accepted by ``estimator``.
        y_val: Validation labels, one per row of ``x_val``.
        n_iterations: Number of bootstrap resamples to draw.
        alpha: Coverage of the interval (0.95 gives the 2.5th and 97.5th
            percentiles).
        random_state: Optional seed (anything ``np.random.default_rng``
            accepts) to make the resampling reproducible.

    Returns:
        Tuple ``(mean_auc, lower, upper)``: the mean of the bootstrap AUCs and
        the lower/upper percentile bounds.

    Raises:
        ValueError: If ``y_val`` is empty or no resample contained both classes.
    """
    y_true = np.asarray(y_val)
    n_samples = len(y_true)
    if n_samples == 0:
        raise ValueError("bootstrap_auc requires at least one validation sample")

    # A row's predicted score does not depend on which other rows were drawn,
    # so score the validation set once and resample (label, score) pairs
    # instead of calling predict_proba inside the loop.
    y_score = np.asarray(estimator.predict_proba(x_val))[:, 1]

    rng = np.random.default_rng(random_state)
    auc_scores = []
    for _ in range(n_iterations):
        idx = rng.integers(0, n_samples, size=n_samples)
        y_res = y_true[idx]
        # AUC is undefined when a resample contains a single class (likely
        # with a rare outcome); skip such draws instead of aborting the run.
        if np.unique(y_res).size < 2:
            continue
        auc_scores.append(roc_auc_score(y_res, y_score[idx]))

    if not auc_scores:
        raise ValueError("no bootstrap resample contained both classes; AUC is undefined")

    lower = np.percentile(auc_scores, (1 - alpha) / 2 * 100)
    upper = np.percentile(auc_scores, (1 + alpha) / 2 * 100)
    return np.mean(auc_scores), lower, upper

# Logistic regression: 5-fold grid search selected on ROC AUC.
# The grid is split in two because not every solver supports every penalty:
# 'liblinear' and 'saga' handle both 'l1' and 'l2', while 'lbfgs' does not
# support 'l1'. A single cross-product grid would include l1+lbfgs candidates
# whose fits all fail, wasting time and flooding the log with warnings.
param_grid_lr = [
    {
        'C': [0.01, 0.1, 1, 10],
        'penalty': ['l1', 'l2'],
        'solver': ['liblinear', 'saga'],
        'max_iter': [100, 200, 300],
    },
    {
        'C': [0.01, 0.1, 1, 10],
        'penalty': ['l2'],
        'solver': ['lbfgs'],
        'max_iter': [100, 200, 300],
    },
]
grid_lr = GridSearchCV(LogisticRegression(), param_grid_lr, scoring='roc_auc', cv=5, n_jobs=-1)
grid_lr.fit(x_train, y_train)
# Best candidate from the search; persisted to disk, then evaluated on the
# validation split with a bootstrap AUC and its percentile interval.
lr_best = grid_lr.best_estimator_
joblib.dump(lr_best, 'LR-生活方式+检验学指标-3年心因死亡.joblib')
auc_LR, lower_LR, upper_LR = bootstrap_auc(lr_best, x_val, y_val)

# Random forest: 5-fold grid search over 24 hyper-parameter combinations,
# selected on ROC AUC and run on all available cores.
param_grid_rf = dict(
    n_estimators=[100, 200],
    max_depth=[10, 20, None],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
    bootstrap=[True],
)
grid_rf = GridSearchCV(
    estimator=RandomForestClassifier(),
    param_grid=param_grid_rf,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
)
grid_rf.fit(x_train, y_train)

# Keep the winning forest, save it to disk, and bootstrap its validation AUC.
rf_best = grid_rf.best_estimator_
joblib.dump(rf_best, 'RF-生活方式+检验学指标-3年心因死亡.joblib')
auc_RF, lower_RF, upper_RF = bootstrap_auc(rf_best, x_val, y_val)

# Support vector machine with default hyper-parameters (no grid search).
# probability=True is required: bootstrap_auc and the ROC plot both call
# predict_proba, which SVC only provides when probability estimates are enabled
# at construction time (the default is False).
svm_model = SVC(probability=True)
svm_model.fit(x_train, y_train)
joblib.dump(svm_model, 'SVM-生活方式+检验学指标-3年心因死亡.joblib')
auc_SVM, lower_SVM, upper_SVM = bootstrap_auc(svm_model, x_val, y_val)

# XGBoost: 5-fold grid search over 48 hyper-parameter combinations, selected on
# ROC AUC and run on all available cores.
param_grid_xgb = dict(
    n_estimators=[100, 200],
    max_depth=[3, 6],
    learning_rate=[0.01, 0.1, 0.3],
    subsample=[0.8, 1.0],
    colsample_bytree=[0.8, 1.0],
)
grid_xgb = GridSearchCV(
    estimator=xgb.XGBClassifier(eval_metric='auc'),
    param_grid=param_grid_xgb,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
)
grid_xgb.fit(x_train, y_train)

# Keep the winning booster, save it to disk, and bootstrap its validation AUC.
xgb_best = grid_xgb.best_estimator_
joblib.dump(xgb_best, 'XGBoost-生活方式+检验学指标-3年心因死亡.joblib')
auc_XGB, lower_XGB, upper_XGB = bootstrap_auc(xgb_best, x_val, y_val)

# Render all figure text in Arial.
plt.rcParams['font.family'] = 'Arial'

# Display name -> (fitted model, bootstrap mean AUC, lower bound, upper bound).
models = dict(
    LR=(lr_best, auc_LR, lower_LR, upper_LR),
    RF=(rf_best, auc_RF, lower_RF, upper_RF),
    SVM=(svm_model, auc_SVM, lower_SVM, upper_SVM),
    XGBoost=(xgb_best, auc_XGB, lower_XGB, upper_XGB),
)

fig, ax = plt.subplots(figsize=(8, 6))
# Dashed black diagonal as a visual reference line.
ax.plot([0, 1], [0, 1], 'k--')

# One ROC curve per model, computed on the full validation split; the legend
# entry reports the bootstrap mean AUC and its interval.
for name, (model, auc_mean, lower, upper) in models.items():
    probs = model.predict_proba(x_val)[:, 1]
    fpr, tpr, _ = roc_curve(y_val, probs)
    ax.plot(
        fpr,
        tpr,
        label=f'{name} (AUC={auc_mean:.3f}, 95%CI: {lower:.3f}-{upper:.3f})',
    )

ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curves of Different Models')
ax.legend(loc="lower right")

# Write the figure to a single-page PDF, then release it.
with PdfPages('roc_curve-生活方式+检验学指标-3年心因死亡.pdf') as pdf:
    pdf.savefig(fig)
    plt.close(fig)