In [31]:
## 数据读入
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import *
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.utils import resample
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier
import joblib
import os

# 2rsrna feature

* feature 分别为：
rsrna_GACCAAGGAGTCTAACACGTGCGCG
rsrna_CAAGGAGTCTAACACGTGCGCG



## 7:3划分

## 1. 区分MDD vs HC+other

In [144]:

##数据读入
mirna=pd.read_csv('rsrna1.csv',encoding='GBK')
mirna = mirna[mirna['group'].isin([0, 1])]

# 划分特征和标签
miRNA_list = ['rsrna1']
X = mirna[miRNA_list]
y = mirna['group']

# 划分训练集和验证集 (7:3)
X_train, X_val, y_train, y_val = train_test_split(
    X, y, 
    test_size=0.3,
    stratify=y,
    random_state=42
)

from sklearn.preprocessing import StandardScaler
# 2. 只在训练集上计算标准化参数
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)  # 训练集：fit + transform

# 3. 用训练集的参数标准化验证集
X_val = scaler.transform(X_val)

In [145]:
X_train

array([[ 6.89611374e-01],
       [-4.06378686e-01],
       [ 3.50578160e-01],
       [ 3.76657638e-01],
       [-5.42982992e-01],
       [-5.28626239e-01],
       [-5.33516141e-01],
       [ 1.15862859e-01],
       [-4.83313146e-01],
       [-5.12782956e-01],
       [-5.36776076e-01],
       [-3.48351848e-01],
       [-5.31820975e-01],
       [-5.03655139e-01],
       [-3.69867417e-01],
       [-1.52103777e-01],
       [-4.65057512e-01],
       [-5.14217328e-01],
       [-4.68317446e-01],
       [-5.02220768e-01],
       [-4.52017773e-01],
       [-4.98504442e-01],
       [-4.94722918e-01],
       [ 1.55023415e+00],
       [-5.05350305e-01],
       [-3.27488266e-01],
       [-5.10435803e-01],
       [-2.08174654e-01],
       [-5.29539021e-01],
       [ 1.79146932e+00],
       [-4.82661159e-01],
       [-2.47945858e-01],
       [ 8.93699968e-03],
       [-3.78343248e-01],
       [-1.97090876e-01],
       [-2.04914719e-01],
       [-1.33196155e-01],
       [-2.90976996e-01],
       [-5.4

In [147]:
param_grids = {
    'Logistic Regression': {
        'C': [0.01, 0.1, 1, 10, 100],
        'solver': ['saga'],
        'penalty': ['l2'],
        'max_iter': [10000]
    },
    'SVM': {
        'C': [0.1, 1, 10, 100],
        'kernel': ['rbf'],
        'gamma': ['scale', 'auto']
    },
    'Random Forest': {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    },
    'GNB': {
        'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5]
    },
    'MLP': {
        'hidden_layer_sizes': [(10,), (50,), (10, 50), (50, 50)],
        'activation': ['relu', 'tanh'],
        'solver': ['adam'],
        'alpha': [0.0001, 0.001, 0.01],
        'learning_rate': ['constant', 'adaptive']
    },
    'GBDT': {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7],
        'subsample': [0.7, 0.8, 1.0]
    },
    'XGBoost': {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7],
        'subsample': [0.7, 0.8, 1.0]
    },
    'LightGBM': {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.2],
        'num_leaves': [31, 50, 100],
        'boosting_type': ['gbdt'],
        'subsample': [0.7, 0.8, 1.0]
    },
    'CatBoost': {
        'iterations': [100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.2],
        'depth': [3, 5, 7]
    },
    'AdaBoost': {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 1.0]
    }
}

# 定义模型集合
models = {
    'Logistic Regression': LogisticRegression(),
    'SVM': SVC(probability=True, random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'GNB': GaussianNB(),
    'MLP': MLPClassifier(max_iter=500, random_state=42),
    'GBDT': GradientBoostingClassifier(random_state=42),
    'XGBoost': xgb.XGBClassifier(random_state=42),
    'LightGBM': lgb.LGBMClassifier(random_state=42),
    'CatBoost': CatBoostClassifier(verbose=0, random_state=42),
    'AdaBoost': AdaBoostClassifier(random_state=42)
}

In [148]:
# 创建结果目录
result_path = './rsrna1-scaled/'
os.makedirs(result_path, exist_ok=True)
os.makedirs(os.path.join(result_path, 'model'), exist_ok=True)
os.makedirs(os.path.join(result_path, 'figure'), exist_ok=True)

best_models = {}

# 超参数优化（使用训练集的5折交叉验证）
for model_name, model in models.items():
    print(f"\n=== Training {model_name} ===")
    
    if model_name in param_grids:
        # 网格搜索
        grid_search = GridSearchCV(
            estimator=model,
            param_grid=param_grids[model_name],
            cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
            scoring='roc_auc',
            n_jobs=-1
        )
        grid_search.fit(X_train, y_train)
        
        # 保存最佳模型
        best_models[model_name] = grid_search.best_estimator_
        print(f"Best parameters: {grid_search.best_params_}")
    else:
        # 无参数优化的模型
        model.fit(X_train, y_train)
        best_models[model_name] = model



=== Training Logistic Regression ===
Best parameters: {'C': 0.01, 'max_iter': 10000, 'penalty': 'l2', 'solver': 'saga'}

=== Training SVM ===
Best parameters: {'C': 100, 'gamma': 'auto', 'kernel': 'rbf'}

=== Training Random Forest ===
Best parameters: {'max_depth': None, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 200}

=== Training GNB ===
Best parameters: {'var_smoothing': 1e-09}

=== Training MLP ===
Best parameters: {'activation': 'relu', 'alpha': 0.001, 'hidden_layer_sizes': (50, 50), 'learning_rate': 'constant', 'solver': 'adam'}

=== Training GBDT ===
Best parameters: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 300, 'subsample': 1.0}

=== Training XGBoost ===
Best parameters: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 200, 'subsample': 0.8}

=== Training LightGBM ===
[LightGBM] [Info] Number of positive: 57, number of negative: 101
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000238 seconds

In [149]:
# 自定义评分函数
def sensitivity_score(y_true, y_pred):
    cm = confusion_matrix(y_true, y_pred)
    return cm[1, 1] / (cm[1, 1] + cm[1, 0])

def specificity_score(y_true, y_pred):
    cm = confusion_matrix(y_true, y_pred)
    return cm[0, 0] / (cm[0, 0] + cm[0, 1])

def youden_index_score(y_true, y_pred):
    sensitivity = sensitivity_score(y_true, y_pred)
    specificity = specificity_score(y_true, y_pred)
    return sensitivity + specificity - 1


# 验证集评估
# =============================================================================
# 评估指标收集
metrics_results = []
roc_data = {}
confusion_matrices = {}

# 置信区间参数
bootstrap_iterations = 1000
confidence_level = 95

for model_name, model in best_models.items():
    print(f"\n=== Evaluating {model_name} ===")
    
    # 预测验证集
    y_pred = model.predict(X_val)
    y_proba = model.predict_proba(X_val)[:, 1] if hasattr(model, "predict_proba") else model.decision_function(X_val)
    
    # 计算基础指标
    metrics = {
        'Model': model_name,
        'AUC': roc_auc_score(y_val, y_proba),
        'Accuracy': accuracy_score(y_val, y_pred),
        'Sensitivity': sensitivity_score(y_val, y_pred),
        'Specificity': specificity_score(y_val, y_pred),
        'Youden Index': youden_index_score(y_val, y_pred),
        'PPV': precision_score(y_val, y_pred),
        'NPV': precision_score(y_val, y_pred, pos_label=0),
        'F1 Score': f1_score(y_val, y_pred),
        'MCC': matthews_corrcoef(y_val, y_pred)
    }
    
    # 存储ROC数据
    fpr, tpr, _ = roc_curve(y_val, y_proba)
    roc_data[model_name] = (fpr, tpr, metrics['AUC'])
    
    # 存储混淆矩阵
    cm = confusion_matrix(y_val, y_pred)
    confusion_matrices[model_name] = cm
    
    # Bootstrap计算置信区间
    bootstrap_metrics = {key: [] for key in metrics if key != 'Model'}
    
    for _ in range(bootstrap_iterations):
        # Bootstrap重采样
        indices = resample(np.arange(len(y_val)), replace=True)
        y_val_boot = y_val.iloc[indices]
        y_pred_boot = y_pred[indices]
        y_proba_boot = y_proba[indices]
        
        # 计算指标
        try:
            bootstrap_metrics['AUC'].append(roc_auc_score(y_val_boot, y_proba_boot))
        except:
            bootstrap_metrics['AUC'].append(np.nan)
        bootstrap_metrics['Accuracy'].append(accuracy_score(y_val_boot, y_pred_boot))
        bootstrap_metrics['Sensitivity'].append(sensitivity_score(y_val_boot, y_pred_boot))
        bootstrap_metrics['Specificity'].append(specificity_score(y_val_boot, y_pred_boot))
        bootstrap_metrics['Youden Index'].append(youden_index_score(y_val_boot, y_pred_boot))
        bootstrap_metrics['PPV'].append(precision_score(y_val_boot, y_pred_boot))
        bootstrap_metrics['NPV'].append(precision_score(y_val_boot, y_pred_boot, pos_label=0))
        bootstrap_metrics['F1 Score'].append(f1_score(y_val_boot, y_pred_boot))
        bootstrap_metrics['MCC'].append(matthews_corrcoef(y_val_boot, y_pred_boot))
    
    # 计算置信区间
    for metric in bootstrap_metrics:
        lower = np.nanpercentile(bootstrap_metrics[metric], (100 - confidence_level)/2)
        upper = np.nanpercentile(bootstrap_metrics[metric], 100 - (100 - confidence_level)/2)
        metrics[f'{metric}_lower'] = lower
        metrics[f'{metric}_upper'] = upper
    
    metrics_results.append(metrics)

# 保存评估结果
metrics_df = pd.DataFrame(metrics_results)
metrics_df.to_csv(os.path.join(result_path, 'validation_metrics.csv'), index=False)


=== Evaluating Logistic Regression ===


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr


=== Evaluating SVM ===


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr


=== Evaluating Random Forest ===

=== Evaluating GNB ===


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr


=== Evaluating MLP ===


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr


=== Evaluating GBDT ===

=== Evaluating XGBoost ===

=== Evaluating LightGBM ===

=== Evaluating CatBoost ===

=== Evaluating AdaBoost ===


In [152]:
# 绘制ROC曲线
plt.rcParams['font.family'] = 'Arial'
plt.figure(figsize=(10, 8))
for name, (fpr, tpr, auc) in roc_data.items():
    if name == 'GNB':
        continue  # 跳过GNB
    plt.plot(fpr, tpr, 
             label=f'{name} (AUC = {auc:.3f} [{metrics_df[metrics_df.Model==name]["AUC_lower"].values[0]:.3f}-{metrics_df[metrics_df.Model==name]["AUC_upper"].values[0]:.3f}])')

plt.plot([0, 1], [0, 1], 'k--', lw=2)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.01])
plt.xlabel('False Positive Rate', fontsize=12)
plt.ylabel('True Positive Rate', fontsize=12)
plt.title('ROC Curves on Validation Set', fontsize=15)
plt.legend(loc='lower right', fontsize=10)
# 移除右边和上边边界
plt.gca().spines['right'].set_visible(False)
plt.gca().spines['top'].set_visible(False)

plt.grid(False)

plt.savefig(os.path.join(result_path, 'figure', 'ROC_curves.svg'), bbox_inches='tight')
plt.close()

In [153]:
# 遍历每个模型，绘制验证集混淆矩阵
for name, model in best_models.items():
    # 使用最佳模型进行验证集预测
    y_val_pred = model.predict(X_val)
    
    # 计算混淆矩阵
    cm = confusion_matrix(y_val, y_val_pred)
    
    # 调整矩阵显示顺序（如果需要保持原逻辑）
    cm = np.array([[cm[1, 1], cm[1, 0]],  # 修改后格式：[[TP, FN], [FP, TN]]
                   [cm[0, 1], cm[0, 0]]])
    
    # 将数值转换为百分比
    cm_percentage = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] * 100
    
    # 创建图形
    plt.figure(figsize=(8, 6))
    
    # 绘制热力图
    sns.heatmap(cm_percentage, annot=False, fmt=".2f", cmap='Blues', 
                cbar=True, square=True,
                xticklabels=['Positive', 'Negative'], 
                yticklabels=['Positive', 'Negative'])
    
    # 添加数值标注
    for i in range(cm_percentage.shape[0]):
        for j in range(cm_percentage.shape[1]):
            color = 'black' if cm_percentage[i, j] < 50 else 'white'
            plt.text(j + 0.5, i + 0.5, f"{cm_percentage[i, j]:.2f}%", 
                     ha='center', va='center', color=color, fontsize=12)
    
    # 添加标签和标题
    plt.title(f'{name} Validation Confusion Matrix (Percentage)', fontsize=14)
    plt.xlabel('Predicted Label', fontsize=12)
    plt.ylabel('True Label', fontsize=12)
    
    # 保存和显示
    plt.tight_layout()
    plt.savefig(f'{result_path}{name}_validation_confusion_matrix.svg', format='svg')
    plt.close()  # 关闭图形避免内存泄漏
    
    
# =============================================================================
# 模型保存
# =============================================================================

import joblib

# 保存每个最佳模型
for name, model in best_models.items():
    joblib.dump(model, f'{result_path}/model/{name}_model.pkl')
    print(f"{name} model saved successfully.")

print("\n=== 所有模型训练和评估完成 ===")
print(f"结果保存在目录: {os.path.abspath(result_path)}")

Logistic Regression model saved successfully.
SVM model saved successfully.
Random Forest model saved successfully.
GNB model saved successfully.
MLP model saved successfully.
GBDT model saved successfully.
XGBoost model saved successfully.
LightGBM model saved successfully.
CatBoost model saved successfully.
AdaBoost model saved successfully.

=== 所有模型训练和评估完成 ===
结果保存在目录: d:\adult_dep\实验结果\2025.4.8model\rsrna1-scaled


In [154]:
best_models

{'Logistic Regression': LogisticRegression(C=0.01, max_iter=10000, solver='saga'),
 'SVM': SVC(C=100, gamma='auto', probability=True, random_state=42),
 'Random Forest': RandomForestClassifier(min_samples_leaf=4, min_samples_split=10,
                        n_estimators=200, random_state=42),
 'GNB': GaussianNB(),
 'MLP': MLPClassifier(alpha=0.001, hidden_layer_sizes=(50, 50), max_iter=500,
               random_state=42),
 'GBDT': GradientBoostingClassifier(learning_rate=0.01, n_estimators=300,
                            random_state=42),
 'XGBoost': XGBClassifier(base_score=None, booster=None, callbacks=None,
               colsample_bylevel=None, colsample_bynode=None,
               colsample_bytree=None, device=None, early_stopping_rounds=None,
               enable_categorical=False, eval_metric=None, feature_types=None,
               gamma=None, grow_policy=None, importance_type=None,
               interaction_constraints=None, learning_rate=0.01, max_bin=None,
             

In [49]:
# =======================================================================

# 寻找AUC最高的模型，考虑置信区间
max_auc = metrics_df['AUC'].max()
candidates = metrics_df[metrics_df['AUC'] == max_auc]

# 处理并列情况
if len(candidates) > 1:
    print("\n发现多个模型具有相同AUC值，正在进行置信区间比较...")
    
    # 优先选择置信区间上限更高的模型
    candidates['AUC_upper'] = candidates['AUC_upper'].astype(float)
    best_model_name = candidates.loc[candidates['AUC_upper'].idxmax(), 'Model']
    
    # 获取比较结果
    compare_df = candidates[['Model', 'AUC', 'AUC_lower', 'AUC_upper']]
    print(f"候选模型比较：\n{compare_df.to_string(index=False)}")
    print(f"最终选择：{best_model_name} (最高置信区间上限)")
else:
    best_model_name = candidates.iloc[0]['Model']

# 获取最优模型
best_model = best_models[best_model_name]

# 获取所有验证集样本预测为1的概率
y_proba = best_model.predict_proba(X_val)[:, 1]

# 创建包含详细信息的DataFrame
prob_df = pd.DataFrame({
    'Index': X_val.index,
    'True_label': y_val.values,
    'Predicted_probability': y_proba
}).sort_values('Index')

# 保存结果时添加模型信息
filename = f"sample_probabilities_{best_model_name.replace(' ', '_')}.csv"
prob_df.to_csv(os.path.join(result_path, filename), index=False)

print(f"\n=== 最终选择模型 {best_model_name} ===")
print(f"* AUC值: {max_auc:.4f}")
print(f"* 95%置信区间: [{candidates[candidates['Model']==best_model_name]['AUC_lower'].values[0]:.4f}, "
      f"{candidates[candidates['Model']==best_model_name]['AUC_upper'].values[0]:.4f}]")
print(f"* 预测概率文件已保存至：{os.path.join(result_path, filename)}")

# # 可选：保存所有候选模型的预测概率
# if len(candidates) > 1:
#     print("\n=== 保存所有候选模型的预测概率 ===")
#     for _, row in candidates.iterrows():
#         model_name = row['Model']
#         model = best_models[model_name]
        
#         y_proba = model.predict_proba(X_val)[:, 1]
#         prob_df = pd.DataFrame({
#             'Index': X_val.index,
#             'True_label': y_val.values,
#             f'Prob_{model_name}': y_proba
#         }).sort_values('Index')
        
#         filename = f"candidate_probabilities_{model_name.replace(' ', '_')}.csv"
#         prob_df.to_csv(os.path.join(result_path, filename), index=False)
#         print(f"已保存 {model_name} 的预测概率")


=== 最终选择模型 LightGBM ===
* AUC值: 0.8040
* 95%置信区间: [0.6771, 0.9011]
* 预测概率文件已保存至：./rsrna1-thoredhold/sample_probabilities_LightGBM.csv


In [155]:
# =======================================================================

# 寻找AUC最高的模型，考虑置信区间
max_auc = metrics_df['AUC'].max()
candidates = metrics_df[metrics_df['AUC'] == max_auc]

# 处理并列情况
if len(candidates) > 1:
    print("\n发现多个模型具有相同AUC值，正在进行置信区间比较...")
    
    # 优先选择置信区间上限更高的模型
    candidates['AUC_upper'] = candidates['AUC_upper'].astype(float)
    best_model_name = candidates.loc[candidates['AUC_upper'].idxmax(), 'Model']
    
    # 获取比较结果
    compare_df = candidates[['Model', 'AUC', 'AUC_lower', 'AUC_upper']]
    print(f"候选模型比较：\n{compare_df.to_string(index=False)}")
    print(f"最终选择：{best_model_name} (最高置信区间上限)")
else:
    best_model_name = candidates.iloc[0]['Model']

# 获取最优模型
best_model = best_models[best_model_name]

# 获取所有验证集样本预测为1的概率
y_proba = best_model.predict_proba(X_val)[:, 1]

# 创建包含详细信息的DataFrame（无需索引）
prob_df = pd.DataFrame({
    'True_label': y_val.values,          # 使用验证集的真实标签
    'Predicted_probability': y_proba     # 预测概率直接对应验证集顺序
})

# 保存结果时添加模型信息
filename = f"sample_probabilities_{best_model_name.replace(' ', '_')}.csv"
prob_df.to_csv(os.path.join(result_path, filename), index=False)  # 禁用索引保存

print(f"\n=== 最终选择模型 {best_model_name} ===")
print(f"* AUC值: {max_auc:.4f}")
print(f"* 95%置信区间: [{candidates[candidates['Model']==best_model_name]['AUC_lower'].values[0]:.4f}, "
      f"{candidates[candidates['Model']==best_model_name]['AUC_upper'].values[0]:.4f}]")
print(f"* 预测概率文件已保存至：{os.path.join(result_path, filename)}")


=== 最终选择模型 Random Forest ===
* AUC值: 0.8177
* 95%置信区间: [0.7092, 0.9125]
* 预测概率文件已保存至：./rsrna1-scaled/sample_probabilities_Random_Forest.csv


# 调取并计算概率

In [156]:
# 寻找AUC最高的最优模型
best_model_name = metrics_df.loc[metrics_df['AUC'].idxmax(), 'Model']
best_model = best_models[best_model_name]

# 获取所有验证集样本预测为1的概率
y_proba = best_model.predict_proba(X_val)[:, 1]  # 取正类的概率

# 创建包含原始索引的DataFrame
prob_df = pd.DataFrame({
    # 'Index': X_val.index,          # 保留原始索引
    'True_label': y_val,           # 真实标签
    'Predicted_probability': y_proba  # 预测概率
})

# 按原始索引排序（如果需要保持原始顺序）
# prob_df = prob_df.sort_values('Index')

# 保存结果
prob_df.to_csv(os.path.join(result_path, 'sample_probabilities.csv'), index=False)

print(f"\n=== 最优模型 {best_model_name} 的预测概率已保存 ===")
print(f"文件路径：{os.path.join(result_path, 'sample_probabilities.csv')}")


=== 最优模型 Random Forest 的预测概率已保存 ===
文件路径：./rsrna1-scaled/sample_probabilities.csv


In [157]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
from matplotlib.ticker import MaxNLocator

# 读取数据
all = pd.read_csv('./rsrna1-scaled/sample_probabilities.csv', encoding='GBK')

# 配置参数
selected_mirnas = ['Predicted_probability']
groups = ['MDD', 'Control']
target_group = "MDD"
palette = {
    'MDD': '#FFA500',   # 更现代的珊瑚色
    'Control': '#2E8B57'
}

# 循环处理每个小RNA
for mirna in selected_mirnas:
    # 数据准备（不过滤异常值）
    value_df = all.dropna(subset=[mirna, 'True_label']).copy()
    value_df = value_df[value_df['True_label'].isin(groups)]
    
    # --- 创建过滤后的数据（仅用于绘图） ---
    filtered_data = pd.DataFrame()
    for group in groups:
        group_data = value_df[value_df['True_label'] == group]
        Q1 = group_data[mirna].quantile(0.25)
        Q3 = group_data[mirna].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        filtered_group = group_data[(group_data[mirna] >= lower_bound) & (group_data[mirna] <= upper_bound)]
        filtered_data = pd.concat([filtered_data, filtered_group])
    
    # 创建画布
    fig, ax = plt.subplots(figsize=(4.5, 6))
    
    # 绘制箱线图和散点图
    sns.boxplot(
        x='True_label', y=mirna, data=filtered_data, order=groups,
        palette=palette, ax=ax, linewidth=1.5, width=0.35, showfliers=False
    )
    # sns.stripplot(
    #     x='Group', y=mirna, data=filtered_data, order=groups,
    #     color='black', alpha=0.65, jitter=True, ax=ax
    # )
    
    
        # ----- 散点图优化 -----
    sns.stripplot(
        x='True_label',
        y=mirna,
        data=filtered_data,
        order=groups,
        palette=palette,       # 与箱线图颜色一致
        edgecolor='w',         # 白色描边
        linewidth=0.4,         # 描边粗细
        size=5,                # 点大小
        alpha=0.75,            # 透明度
        jitter=0.15,           # 抖动幅度
        ax=ax
    )
    # ----------------------------
    # 关键修改点：确保刻度线设置的优先级
    # ----------------------------
    
    # 1. 显式设置刻度线样式
    ax.tick_params(
        axis='both',
        which='both',
        direction='out',
        length=6,
        width=1.5,
        colors='black',
        bottom=True,   # 强制显示x轴刻度线
        left=True      # 强制显示y轴刻度线
    )
    
    # 2. 直接操作轴的底层对象
    for tick in ax.xaxis.get_major_ticks():
        tick.tick1line.set_visible(True)  # 显示x轴主刻度线
    for tick in ax.yaxis.get_major_ticks():
        tick.tick1line.set_visible(True)  # 显示y轴主刻度线
    
    # 3. 确保其他样式设置不会覆盖刻度线
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)
    ax.yaxis.set_major_locator(MaxNLocator(nbins=5))
    
    # 其他样式设置
    ax.set_title('Predicted Probability Distribution', fontsize=16)
    ax.set_ylabel('Predicted probability', fontsize=12)
    ax.set_xlabel('')
    for spine in ax.spines.values():
        spine.set_edgecolor('black')
        spine.set_linewidth(1)
    ax.grid(True, linestyle='--', alpha=0.7)
    
    # 生成需要比较的组合
    compare_groups = [g for g in groups if g != target_group]
    combinations = [(target_group, other) for other in compare_groups]
    
    # 统计标注参数
    y_min, y_max = ax.get_ylim()
    vertical_step = (y_max - y_min) * 0.08  # 调整标注间距

    # 进行两两比较（使用原始未过滤数据）
    for i, (group1, group2) in enumerate(combinations):
        # 获取数据（使用未过滤的value_df）
        data1 = value_df[value_df['True_label'] == group1][mirna]
        data2 = value_df[value_df['True_label'] == group2][mirna]
        
        # # 检查数据量
        # if len(data1) < 2 or len(data2) < 2:
        #     print(f"跳过{group1} vs {group2}（数据不足）")
        #     continue
        
        # 方差齐性检验
        levene_stat, levene_p = stats.levene(data1, data2)
        equal_var = levene_p >= 0.05
        
        # t检验
        t_stat, p_val = stats.ttest_ind(data1, data2, equal_var=equal_var)
        
        # 标注位置计算
        x1 = groups.index(group1)
        x2 = groups.index(group2)
        y_pos = y_max + (i+1)*vertical_step
        
        # 绘制横线
        ax.plot([x1, x2], [y_pos, y_pos], lw=1.5, color='black')
        
        # 生成标注文本
        if p_val < 0.001:
            p_text = '***'
        elif p_val < 0.01:
            p_text = '**'
        elif p_val < 0.05:
            p_text = '*'
        else:
            p_text = f'p={p_val:.3f}'
        
        # 添加文本
        ax.text(
            (x1+x2)/2, 
            y_pos + vertical_step/4, 
            p_text,
            ha='center',
            va='bottom',
            fontsize=12,
            backgroundcolor=(1, 1, 1, 0.5)  # 添加半透明背景
        )
    
    # 调整y轴范围
    ax.set_ylim(y_min, y_max + len(combinations)*vertical_step + vertical_step)

    # 保存图片
    plt.tight_layout()
    plt.savefig(f'1final-scaled_Boxplot_mdd_{mirna}.svg', dpi=600, bbox_inches='tight')
    plt.close()
    
    # plt.show()

  sns.stripplot(


## rsrna2:区分MDD vs HC+other

In [215]:
##数据读入
mirna=pd.read_csv('rsrna2.csv',encoding='GBK')
mirna = mirna[mirna['group'].isin([0, 1])]

# 划分特征和标签
miRNA_list = ['rsrna2']
X = mirna[miRNA_list]
y = mirna['group']

# 划分训练集和验证集 (7:3)
X_train, X_val, y_train, y_val = train_test_split(
    X, y, 
    test_size=0.3,
    stratify=y,
    random_state=42
)


from sklearn.preprocessing import StandardScaler
# 2. 只在训练集上计算标准化参数
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)  # 训练集：fit + transform

# 3. 用训练集的参数标准化验证集
X_val = scaler.transform(X_val)  

In [216]:
X_train

array([[ 5.25716727e-03],
       [ 2.75649154e-01],
       [ 3.97590472e+00],
       [-1.32877870e-01],
       [-4.92969459e-01],
       [-4.25312682e-01],
       [-4.53821402e-01],
       [-2.60138446e-01],
       [-5.05846562e-02],
       [-3.66825719e-01],
       [-3.55951259e-01],
       [ 6.07761052e-01],
       [-4.76158131e-01],
       [-2.90704497e-01],
       [-4.49412837e-01],
       [-3.59772015e-01],
       [ 1.99523840e-02],
       [-3.87105118e-01],
       [-6.23408295e-02],
       [-2.64253107e-01],
       [ 2.58014894e-01],
       [-1.71085433e-01],
       [-2.61607968e-01],
       [ 3.43247151e-01],
       [-2.67192150e-01],
       [ 6.04822008e-01],
       [-2.51321316e-01],
       [-3.77994084e-01],
       [-3.38316999e-01],
       [ 3.25612891e-01],
       [-4.17675261e-02],
       [-4.04445474e-01],
       [-3.45076799e-01],
       [-4.52939689e-01],
       [-3.33320625e-01],
       [ 1.15736216e+00],
       [-3.34790147e-01],
       [-4.14144317e-01],
       [-5.1

In [217]:
# 定义模型的参数网格
param_grids = {
    'Logistic Regression': {
        'C': [0.01, 0.1, 1, 10, 100],
        'solver': ['saga'],
        'penalty': ['l2'],
        'max_iter': [10000]
    },
    'SVM': {
        'C': [0.1, 1, 10, 100],
        'kernel': ['rbf'],
        'gamma': ['scale', 'auto']
    },
    'Random Forest': {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    },
    'GNB': {
        'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5]
    },
    'MLP': {
        'hidden_layer_sizes': [(10,), (50,), (10, 50), (50, 50)],
        'activation': ['relu', 'tanh'],
        'solver': ['adam'],
        'alpha': [0.0001, 0.001, 0.01],
        'learning_rate': ['constant', 'adaptive']
    },
    'GBDT': {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7],
        'subsample': [0.7, 0.8, 1.0]
    },
    'XGBoost': {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7],
        'subsample': [0.7, 0.8, 1.0]
    },
    'LightGBM': {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.2],
        'num_leaves': [31, 50, 100],
        'boosting_type': ['gbdt'],
        'subsample': [0.7, 0.8, 1.0]
    },
    'CatBoost': {
        'iterations': [100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.2],
        'depth': [3, 5, 7]
    },
    'AdaBoost': {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 1.0]
    }
}

# 定义模型集合
models = {
    'Logistic Regression': LogisticRegression(),
    'SVM': SVC(probability=True, random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'GNB': GaussianNB(),
    'MLP': MLPClassifier(max_iter=500, random_state=42),
    'GBDT': GradientBoostingClassifier(random_state=42),
    'XGBoost': xgb.XGBClassifier(random_state=42),
    'LightGBM': lgb.LGBMClassifier(random_state=42),
    'CatBoost': CatBoostClassifier(verbose=0, random_state=42),
    'AdaBoost': AdaBoostClassifier(random_state=42)
}


In [218]:
# 创建结果目录
result_path = './rsrna2-scaled/'
os.makedirs(result_path, exist_ok=True)
os.makedirs(os.path.join(result_path, 'model'), exist_ok=True)
os.makedirs(os.path.join(result_path, 'figure'), exist_ok=True)

best_models = {}

# 超参数优化（使用训练集的5折交叉验证）
for model_name, model in models.items():
    print(f"\n=== Training {model_name} ===")
    
    if model_name in param_grids:
        # 网格搜索
        grid_search = GridSearchCV(
            estimator=model,
            param_grid=param_grids[model_name],
            cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
            scoring='roc_auc',
            n_jobs=-1
        )
        grid_search.fit(X_train, y_train)
        
        # 保存最佳模型
        best_models[model_name] = grid_search.best_estimator_
        print(f"Best parameters: {grid_search.best_params_}")
    else:
        # 无参数优化的模型
        model.fit(X_train, y_train)
        best_models[model_name] = model



=== Training Logistic Regression ===
Best parameters: {'C': 0.01, 'max_iter': 10000, 'penalty': 'l2', 'solver': 'saga'}

=== Training SVM ===
Best parameters: {'C': 100, 'gamma': 'scale', 'kernel': 'rbf'}

=== Training Random Forest ===
Best parameters: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 200}

=== Training GNB ===
Best parameters: {'var_smoothing': 1e-09}

=== Training MLP ===
Best parameters: {'activation': 'relu', 'alpha': 0.0001, 'hidden_layer_sizes': (10, 50), 'learning_rate': 'constant', 'solver': 'adam'}

=== Training GBDT ===
Best parameters: {'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 300, 'subsample': 0.7}

=== Training XGBoost ===
Best parameters: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100, 'subsample': 1.0}

=== Training LightGBM ===
[LightGBM] [Info] Number of positive: 57, number of negative: 101
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000216 seconds.


In [219]:
# 自定义评分函数
def sensitivity_score(y_true, y_pred):
    cm = confusion_matrix(y_true, y_pred)
    return cm[1, 1] / (cm[1, 1] + cm[1, 0])

def specificity_score(y_true, y_pred):
    cm = confusion_matrix(y_true, y_pred)
    return cm[0, 0] / (cm[0, 0] + cm[0, 1])

def youden_index_score(y_true, y_pred):
    sensitivity = sensitivity_score(y_true, y_pred)
    specificity = specificity_score(y_true, y_pred)
    return sensitivity + specificity - 1


# 验证集评估
# =============================================================================
# 评估指标收集
metrics_results = []
roc_data = {}
confusion_matrices = {}

# 置信区间参数
bootstrap_iterations = 1000
confidence_level = 95

for model_name, model in best_models.items():
    print(f"\n=== Evaluating {model_name} ===")
    
    # 预测验证集
    y_pred = model.predict(X_val)
    y_proba = model.predict_proba(X_val)[:, 1] if hasattr(model, "predict_proba") else model.decision_function(X_val)
    
    # 计算基础指标
    metrics = {
        'Model': model_name,
        'AUC': roc_auc_score(y_val, y_proba),
        'Accuracy': accuracy_score(y_val, y_pred),
        'Sensitivity': sensitivity_score(y_val, y_pred),
        'Specificity': specificity_score(y_val, y_pred),
        'Youden Index': youden_index_score(y_val, y_pred),
        'PPV': precision_score(y_val, y_pred),
        'NPV': precision_score(y_val, y_pred, pos_label=0),
        'F1 Score': f1_score(y_val, y_pred),
        'MCC': matthews_corrcoef(y_val, y_pred)
    }
    
    # 存储ROC数据
    fpr, tpr, _ = roc_curve(y_val, y_proba)
    roc_data[model_name] = (fpr, tpr, metrics['AUC'])
    
    # 存储混淆矩阵
    cm = confusion_matrix(y_val, y_pred)
    confusion_matrices[model_name] = cm
    
    # Bootstrap计算置信区间
    bootstrap_metrics = {key: [] for key in metrics if key != 'Model'}
    
    for _ in range(bootstrap_iterations):
        # Bootstrap重采样
        indices = resample(np.arange(len(y_val)), replace=True)
        y_val_boot = y_val.iloc[indices]
        y_pred_boot = y_pred[indices]
        y_proba_boot = y_proba[indices]
        
        # 计算指标
        try:
            bootstrap_metrics['AUC'].append(roc_auc_score(y_val_boot, y_proba_boot))
        except:
            bootstrap_metrics['AUC'].append(np.nan)
        bootstrap_metrics['Accuracy'].append(accuracy_score(y_val_boot, y_pred_boot))
        bootstrap_metrics['Sensitivity'].append(sensitivity_score(y_val_boot, y_pred_boot))
        bootstrap_metrics['Specificity'].append(specificity_score(y_val_boot, y_pred_boot))
        bootstrap_metrics['Youden Index'].append(youden_index_score(y_val_boot, y_pred_boot))
        bootstrap_metrics['PPV'].append(precision_score(y_val_boot, y_pred_boot))
        bootstrap_metrics['NPV'].append(precision_score(y_val_boot, y_pred_boot, pos_label=0))
        bootstrap_metrics['F1 Score'].append(f1_score(y_val_boot, y_pred_boot))
        bootstrap_metrics['MCC'].append(matthews_corrcoef(y_val_boot, y_pred_boot))
    
    # 计算置信区间
    for metric in bootstrap_metrics:
        lower = np.nanpercentile(bootstrap_metrics[metric], (100 - confidence_level)/2)
        upper = np.nanpercentile(bootstrap_metrics[metric], 100 - (100 - confidence_level)/2)
        metrics[f'{metric}_lower'] = lower
        metrics[f'{metric}_upper'] = upper
    
    metrics_results.append(metrics)

# 保存评估结果
metrics_df = pd.DataFrame(metrics_results)
metrics_df.to_csv(os.path.join(result_path, 'validation_metrics.csv'), index=False)


=== Evaluating Logistic Regression ===


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr


=== Evaluating SVM ===


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr


=== Evaluating Random Forest ===

=== Evaluating GNB ===

=== Evaluating MLP ===


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr


=== Evaluating GBDT ===

=== Evaluating XGBoost ===

=== Evaluating LightGBM ===

=== Evaluating CatBoost ===

=== Evaluating AdaBoost ===


In [220]:
# 绘制ROC曲线
plt.rcParams['font.family'] = 'Arial'
plt.figure(figsize=(10, 8))
for name, (fpr, tpr, auc) in roc_data.items():
    if name == 'GNB':
        continue  # 跳过GNB
    plt.plot(fpr, tpr, 
             label=f'{name} (AUC = {auc:.3f} [{metrics_df[metrics_df.Model==name]["AUC_lower"].values[0]:.3f}-{metrics_df[metrics_df.Model==name]["AUC_upper"].values[0]:.3f}])')

plt.plot([0, 1], [0, 1], 'k--', lw=2)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.01])
plt.xlabel('False Positive Rate', fontsize=12)
plt.ylabel('True Positive Rate', fontsize=12)
plt.title('ROC Curves on Validation Set', fontsize=15)
plt.legend(loc='lower right', fontsize=10)
# 移除右边和上边边界
plt.gca().spines['right'].set_visible(False)
plt.gca().spines['top'].set_visible(False)

plt.grid(False)

plt.savefig(os.path.join(result_path, 'figure', 'ROC_curves.svg'), bbox_inches='tight')
plt.close()

In [222]:
# 遍历每个模型，绘制验证集混淆矩阵
for name, model in best_models.items():
    # 使用最佳模型进行验证集预测
    y_val_pred = model.predict(X_val)
    
    # 计算混淆矩阵
    cm = confusion_matrix(y_val, y_val_pred)
    
    # 调整矩阵显示顺序（如果需要保持原逻辑）
    cm = np.array([[cm[1, 1], cm[1, 0]],  # 修改后格式：[[TP, FN], [FP, TN]]
                   [cm[0, 1], cm[0, 0]]])
    
    # 将数值转换为百分比
    cm_percentage = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] * 100
    
    # 创建图形
    plt.figure(figsize=(8, 6))
    
    # 绘制热力图
    sns.heatmap(cm_percentage, annot=False, fmt=".2f", cmap='Blues', 
                cbar=True, square=True,
                xticklabels=['Positive', 'Negative'], 
                yticklabels=['Positive', 'Negative'])
    
    # 添加数值标注
    for i in range(cm_percentage.shape[0]):
        for j in range(cm_percentage.shape[1]):
            color = 'black' if cm_percentage[i, j] < 50 else 'white'
            plt.text(j + 0.5, i + 0.5, f"{cm_percentage[i, j]:.2f}%", 
                     ha='center', va='center', color=color, fontsize=12)
    
    # 添加标签和标题
    plt.title(f'{name} Validation Confusion Matrix (Percentage)', fontsize=14)
    plt.xlabel('Predicted Label', fontsize=12)
    plt.ylabel('True Label', fontsize=12)
    
    # 保存和显示
    plt.tight_layout()
    plt.savefig(f'{result_path}{name}_validation_confusion_matrix.svg', format='svg')
    plt.close()  # 关闭图形避免内存泄漏
    
    
# =============================================================================
# 模型保存
# =============================================================================

import joblib

# 保存每个最佳模型
for name, model in best_models.items():
    joblib.dump(model, f'{result_path}/model/{name}_model.pkl')
    print(f"{name} model saved successfully.")

print("\n=== 所有模型训练和评估完成 ===")
print(f"结果保存在目录: {os.path.abspath(result_path)}")

Logistic Regression model saved successfully.
SVM model saved successfully.
Random Forest model saved successfully.
GNB model saved successfully.
MLP model saved successfully.
GBDT model saved successfully.
XGBoost model saved successfully.
LightGBM model saved successfully.
CatBoost model saved successfully.
AdaBoost model saved successfully.

=== 所有模型训练和评估完成 ===
结果保存在目录: d:\adult_dep\实验结果\2025.4.8model\rsrna2-scaled


In [None]:
# =======================================================================

# 寻找AUC最高的模型，考虑置信区间
max_auc = metrics_df['AUC'].max()
candidates = metrics_df[metrics_df['AUC'] == max_auc]

# 处理并列情况
if len(candidates) > 1:
    print("\n发现多个模型具有相同AUC值，正在进行置信区间比较...")
    
    # 优先选择置信区间上限更高的模型
    candidates['AUC_upper'] = candidates['AUC_upper'].astype(float)
    best_model_name = candidates.loc[candidates['AUC_upper'].idxmax(), 'Model']
    
    # 获取比较结果
    compare_df = candidates[['Model', 'AUC', 'AUC_lower', 'AUC_upper']]
    print(f"候选模型比较：\n{compare_df.to_string(index=False)}")
    print(f"最终选择：{best_model_name} (最高置信区间上限)")
else:
    best_model_name = candidates.iloc[0]['Model']

# 获取最优模型
best_model = best_models[best_model_name]

# 获取所有验证集样本预测为1的概率
y_proba = best_model.predict_proba(X_val)[:, 1]

# 创建包含详细信息的DataFrame
prob_df = pd.DataFrame({
    'Index': X_val.index,
    'True_label': y_val.values,
    'Predicted_probability': y_proba
}).sort_values('Index')

# 保存结果时添加模型信息
filename = f"sample_probabilities_{best_model_name.replace(' ', '_')}.csv"
prob_df.to_csv(os.path.join(result_path, filename), index=False)

print(f"\n=== 最终选择模型 {best_model_name} ===")
print(f"* AUC值: {max_auc:.4f}")
print(f"* 95%置信区间: [{candidates[candidates['Model']==best_model_name]['AUC_lower'].values[0]:.4f}, "
      f"{candidates[candidates['Model']==best_model_name]['AUC_upper'].values[0]:.4f}]")
print(f"* 预测概率文件已保存至：{os.path.join(result_path, filename)}")

# # 可选：保存所有候选模型的预测概率
# if len(candidates) > 1:
#     print("\n=== 保存所有候选模型的预测概率 ===")
#     for _, row in candidates.iterrows():
#         model_name = row['Model']
#         model = best_models[model_name]
        
#         y_proba = model.predict_proba(X_val)[:, 1]
#         prob_df = pd.DataFrame({
#             'Index': X_val.index,
#             'True_label': y_val.values,
#             f'Prob_{model_name}': y_proba
#         }).sort_values('Index')
        
#         filename = f"candidate_probabilities_{model_name.replace(' ', '_')}.csv"
#         prob_df.to_csv(os.path.join(result_path, filename), index=False)
#         print(f"已保存 {model_name} 的预测概率")

In [223]:
# =======================================================================

# 寻找AUC最高的模型，考虑置信区间
max_auc = metrics_df['AUC'].max()
candidates = metrics_df[metrics_df['AUC'] == max_auc]

# 处理并列情况
if len(candidates) > 1:
    print("\n发现多个模型具有相同AUC值，正在进行置信区间比较...")
    
    # 优先选择置信区间上限更高的模型
    candidates['AUC_upper'] = candidates['AUC_upper'].astype(float)
    best_model_name = candidates.loc[candidates['AUC_upper'].idxmax(), 'Model']
    
    # 获取比较结果
    compare_df = candidates[['Model', 'AUC', 'AUC_lower', 'AUC_upper']]
    print(f"候选模型比较：\n{compare_df.to_string(index=False)}")
    print(f"最终选择：{best_model_name} (最高置信区间上限)")
else:
    best_model_name = candidates.iloc[0]['Model']

# 获取最优模型
best_model = best_models[best_model_name]

# 获取所有验证集样本预测为1的概率
y_proba = best_model.predict_proba(X_val)[:, 1]

# 创建包含详细信息的DataFrame（无需索引）
prob_df = pd.DataFrame({
    'True_label': y_val.values,          # 使用验证集的真实标签
    'Predicted_probability': y_proba     # 预测概率直接对应验证集顺序
})

# 保存结果时添加模型信息
filename = f"sample_probabilities_{best_model_name.replace(' ', '_')}.csv"
prob_df.to_csv(os.path.join(result_path, filename), index=False)  # 禁用索引保存

print(f"\n=== 最终选择模型 {best_model_name} ===")
print(f"* AUC值: {max_auc:.4f}")
print(f"* 95%置信区间: [{candidates[candidates['Model']==best_model_name]['AUC_lower'].values[0]:.4f}, "
      f"{candidates[candidates['Model']==best_model_name]['AUC_upper'].values[0]:.4f}]")
print(f"* 预测概率文件已保存至：{os.path.join(result_path, filename)}")


=== 最终选择模型 CatBoost ===
* AUC值: 0.7088
* 95%置信区间: [0.5698, 0.8386]
* 预测概率文件已保存至：./rsrna2-scaled/sample_probabilities_CatBoost.csv


# 调取并计算概率

In [224]:
# 寻找AUC最高的最优模型
best_model_name = metrics_df.loc[metrics_df['AUC'].idxmax(), 'Model']
best_model = best_models[best_model_name]

# 获取所有验证集样本预测为1的概率
y_proba = best_model.predict_proba(X_val)[:, 1]  # 取正类的概率

# 创建包含原始索引的DataFrame
prob_df = pd.DataFrame({
    # 'Index': X_val.index,          # 保留原始索引
    'True_label': y_val,           # 真实标签
    'Predicted_probability': y_proba  # 预测概率
})

# 按原始索引排序（如果需要保持原始顺序）
# prob_df = prob_df.sort_values('Index')

# 保存结果
prob_df.to_csv(os.path.join(result_path, 'sample_probabilities.csv'), index=False)

print(f"\n=== 最优模型 {best_model_name} 的预测概率已保存 ===")
print(f"文件路径：{os.path.join(result_path, 'sample_probabilities.csv')}")


=== 最优模型 CatBoost 的预测概率已保存 ===
文件路径：./rsrna2-scaled/sample_probabilities.csv


In [226]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
from matplotlib.ticker import MaxNLocator

# 读取数据
all = pd.read_csv('./rsrna2-scaled/sample_probabilities_CatBoost.csv', encoding='GBK')

# 配置参数
selected_mirnas = ['Predicted_probability']
groups = ['MDD', 'Control']
target_group = "MDD"
palette = {
    'MDD': '#FFA500',   # 更现代的珊瑚色
    'Control': '#2E8B57'
}

# 循环处理每个小RNA
for mirna in selected_mirnas:
    # 数据准备（不过滤异常值）
    value_df = all.dropna(subset=[mirna, 'True_label']).copy()
    value_df = value_df[value_df['True_label'].isin(groups)]
    
    # --- 创建过滤后的数据（仅用于绘图） ---
    filtered_data = pd.DataFrame()
    for group in groups:
        group_data = value_df[value_df['True_label'] == group]
        Q1 = group_data[mirna].quantile(0.25)
        Q3 = group_data[mirna].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        filtered_group = group_data[(group_data[mirna] >= lower_bound) & (group_data[mirna] <= upper_bound)]
        filtered_data = pd.concat([filtered_data, filtered_group])
    
    # 创建画布
    fig, ax = plt.subplots(figsize=(4.5, 6))
    
    # 绘制箱线图和散点图
    sns.boxplot(
        x='True_label', y=mirna, data=filtered_data, order=groups,
        palette=palette, ax=ax, linewidth=1.5, width=0.35, showfliers=False
    )
    # sns.stripplot(
    #     x='Group', y=mirna, data=filtered_data, order=groups,
    #     color='black', alpha=0.65, jitter=True, ax=ax
    # )
    
    
        # ----- 散点图优化 -----
    sns.stripplot(
        x='True_label',
        y=mirna,
        data=filtered_data,
        order=groups,
        palette=palette,       # 与箱线图颜色一致
        edgecolor='w',         # 白色描边
        linewidth=0.4,         # 描边粗细
        size=5,                # 点大小
        alpha=0.75,            # 透明度
        jitter=0.15,           # 抖动幅度
        ax=ax
    )
    # ----------------------------
    # 关键修改点：确保刻度线设置的优先级
    # ----------------------------
    
    # 1. 显式设置刻度线样式
    ax.tick_params(
        axis='both',
        which='both',
        direction='out',
        length=6,
        width=1.5,
        colors='black',
        bottom=True,   # 强制显示x轴刻度线
        left=True      # 强制显示y轴刻度线
    )
    
    # 2. 直接操作轴的底层对象
    for tick in ax.xaxis.get_major_ticks():
        tick.tick1line.set_visible(True)  # 显示x轴主刻度线
    for tick in ax.yaxis.get_major_ticks():
        tick.tick1line.set_visible(True)  # 显示y轴主刻度线
    
    # 3. 确保其他样式设置不会覆盖刻度线
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)
    ax.yaxis.set_major_locator(MaxNLocator(nbins=5))
    
    # 其他样式设置
    ax.set_title('Predicted Probability Distribution', fontsize=16)
    ax.set_ylabel('Predicted probability', fontsize=12)
    ax.set_xlabel('')
    for spine in ax.spines.values():
        spine.set_edgecolor('black')
        spine.set_linewidth(1)
    ax.grid(True, linestyle='--', alpha=0.7)
    
    # 生成需要比较的组合
    compare_groups = [g for g in groups if g != target_group]
    combinations = [(target_group, other) for other in compare_groups]
    
    # 统计标注参数
    y_min, y_max = ax.get_ylim()
    vertical_step = (y_max - y_min) * 0.08  # 调整标注间距

    # 进行两两比较（使用原始未过滤数据）
    for i, (group1, group2) in enumerate(combinations):
        # 获取数据（使用未过滤的value_df）
        data1 = value_df[value_df['True_label'] == group1][mirna]
        data2 = value_df[value_df['True_label'] == group2][mirna]
        
        # # 检查数据量
        # if len(data1) < 2 or len(data2) < 2:
        #     print(f"跳过{group1} vs {group2}（数据不足）")
        #     continue
        
        # 方差齐性检验
        levene_stat, levene_p = stats.levene(data1, data2)
        equal_var = levene_p >= 0.05
        
        # t检验
        t_stat, p_val = stats.ttest_ind(data1, data2, equal_var=equal_var)
        
        # 标注位置计算
        x1 = groups.index(group1)
        x2 = groups.index(group2)
        y_pos = y_max + (i+1)*vertical_step
        
        # 绘制横线
        ax.plot([x1, x2], [y_pos, y_pos], lw=1.5, color='black')
        
        # 生成标注文本
        if p_val < 0.001:
            p_text = '***'
        elif p_val < 0.01:
            p_text = '**'
        elif p_val < 0.05:
            p_text = '*'
        else:
            p_text = f'p={p_val:.3f}'
        
        # 添加文本
        ax.text(
            (x1+x2)/2, 
            y_pos + vertical_step/4, 
            p_text,
            ha='center',
            va='bottom',
            fontsize=12,
            backgroundcolor=(1, 1, 1, 0.5)  # 添加半透明背景
        )
    
    # 调整y轴范围
    ax.set_ylim(y_min, y_max + len(combinations)*vertical_step + vertical_step)

    # 保存图片
    plt.tight_layout()
    plt.savefig(f'2final-scaled-catboost_Boxplot_mdd_{mirna}.svg', dpi=600, bbox_inches='tight')
    plt.close()
    
    # plt.show()

  sns.stripplot(


## rsrna1+rsrna2

In [175]:
##数据读入
mirna=pd.read_csv('rsrna1+2.csv',encoding='GBK')
mirna = mirna[mirna['group'].isin([0, 1])]

# 划分特征和标签
miRNA_list = ['rsrna1','rsrna2']
X = mirna[miRNA_list]
y = mirna['group']

# 划分训练集和验证集 (7:3)
X_train, X_val, y_train, y_val = train_test_split(
    X, y, 
    test_size=0.3,
    stratify=y,
    random_state=42
)

from sklearn.preprocessing import StandardScaler
# 2. 只在训练集上计算标准化参数
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)  # 训练集：fit + transform

# 3. 用训练集的参数标准化验证集
X_val = scaler.transform(X_val)

In [176]:
X_train

array([[ 6.89611374e-01,  5.25716727e-03],
       [-4.06378686e-01,  2.75649154e-01],
       [ 3.50578160e-01,  3.97590472e+00],
       [ 3.76657638e-01, -1.32877870e-01],
       [-5.42982992e-01, -4.92969459e-01],
       [-5.28626239e-01, -4.25312682e-01],
       [-5.33516141e-01, -4.53821402e-01],
       [ 1.15862859e-01, -2.60138446e-01],
       [-4.83313146e-01, -5.05846562e-02],
       [-5.12782956e-01, -3.66825719e-01],
       [-5.36776076e-01, -3.55951259e-01],
       [-3.48351848e-01,  6.07761052e-01],
       [-5.31820975e-01, -4.76158131e-01],
       [-5.03655139e-01, -2.90704497e-01],
       [-3.69867417e-01, -4.49412837e-01],
       [-1.52103777e-01, -3.59772015e-01],
       [-4.65057512e-01,  1.99523840e-02],
       [-5.14217328e-01, -3.87105118e-01],
       [-4.68317446e-01, -6.23408295e-02],
       [-5.02220768e-01, -2.64253107e-01],
       [-4.52017773e-01,  2.58014894e-01],
       [-4.98504442e-01, -1.71085433e-01],
       [-4.94722918e-01, -2.61607968e-01],
       [ 1.

In [177]:
param_grids = {
    'Logistic Regression': {
        'C': [0.01, 0.1, 1, 10, 100],
        'solver': ['saga'],
        'penalty': ['l2'],
        'max_iter': [10000]
    },
    'SVM': {
        'C': [0.1, 1, 10, 100],
        'kernel': ['rbf'],
        'gamma': ['scale', 'auto']
    },
    'Random Forest': {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    },
    'GNB': {
        'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5]
    },
    'MLP': {
        'hidden_layer_sizes': [(10,), (50,), (10, 50), (50, 50)],
        'activation': ['relu', 'tanh'],
        'solver': ['adam'],
        'alpha': [0.0001, 0.001, 0.01],
        'learning_rate': ['constant', 'adaptive']
    },
    'GBDT': {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7],
        'subsample': [0.7, 0.8, 1.0]
    },
    'XGBoost': {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7],
        'subsample': [0.7, 0.8, 1.0]
    },
    'LightGBM': {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.2],
        'num_leaves': [31, 50, 100],
        'boosting_type': ['gbdt'],
        'subsample': [0.7, 0.8, 1.0]
    },
    'CatBoost': {
        'iterations': [100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.2],
        'depth': [3, 5, 7]
    },
    'AdaBoost': {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 1.0]
    }
}

# 定义模型集合
models = {
    'Logistic Regression': LogisticRegression(),
    'SVM': SVC(probability=True, random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'GNB': GaussianNB(),
    'MLP': MLPClassifier(max_iter=500, random_state=42),
    'GBDT': GradientBoostingClassifier(random_state=42),
    'XGBoost': xgb.XGBClassifier(random_state=42),
    'LightGBM': lgb.LGBMClassifier(random_state=42),
    'CatBoost': CatBoostClassifier(verbose=0, random_state=42),
    'AdaBoost': AdaBoostClassifier(random_state=42)
}

In [178]:
# 创建结果目录
result_path = './rsrna1+2-scaled/'
os.makedirs(result_path, exist_ok=True)
os.makedirs(os.path.join(result_path, 'model'), exist_ok=True)
os.makedirs(os.path.join(result_path, 'figure'), exist_ok=True)

best_models = {}

# 超参数优化（使用训练集的5折交叉验证）
for model_name, model in models.items():
    print(f"\n=== Training {model_name} ===")
    
    if model_name in param_grids:
        # 网格搜索
        grid_search = GridSearchCV(
            estimator=model,
            param_grid=param_grids[model_name],
            cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
            scoring='roc_auc',
            n_jobs=-1
        )
        grid_search.fit(X_train, y_train)
        
        # 保存最佳模型
        best_models[model_name] = grid_search.best_estimator_
        print(f"Best parameters: {grid_search.best_params_}")
    else:
        # 无参数优化的模型
        model.fit(X_train, y_train)
        best_models[model_name] = model



=== Training Logistic Regression ===
Best parameters: {'C': 0.1, 'max_iter': 10000, 'penalty': 'l2', 'solver': 'saga'}

=== Training SVM ===
Best parameters: {'C': 100, 'gamma': 'auto', 'kernel': 'rbf'}

=== Training Random Forest ===
Best parameters: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 300}

=== Training GNB ===
Best parameters: {'var_smoothing': 1e-09}

=== Training MLP ===
Best parameters: {'activation': 'tanh', 'alpha': 0.0001, 'hidden_layer_sizes': (10,), 'learning_rate': 'constant', 'solver': 'adam'}

=== Training GBDT ===
Best parameters: {'learning_rate': 0.01, 'max_depth': 7, 'n_estimators': 200, 'subsample': 1.0}

=== Training XGBoost ===
Best parameters: {'learning_rate': 0.01, 'max_depth': 7, 'n_estimators': 100, 'subsample': 0.8}

=== Training LightGBM ===
[LightGBM] [Info] Number of positive: 57, number of negative: 101
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000119 seconds.
You 

In [179]:
# 自定义评分函数
def sensitivity_score(y_true, y_pred):
    cm = confusion_matrix(y_true, y_pred)
    return cm[1, 1] / (cm[1, 1] + cm[1, 0])

def specificity_score(y_true, y_pred):
    cm = confusion_matrix(y_true, y_pred)
    return cm[0, 0] / (cm[0, 0] + cm[0, 1])

def youden_index_score(y_true, y_pred):
    sensitivity = sensitivity_score(y_true, y_pred)
    specificity = specificity_score(y_true, y_pred)
    return sensitivity + specificity - 1


# 验证集评估
# =============================================================================
# 评估指标收集
metrics_results = []
roc_data = {}
confusion_matrices = {}

# 置信区间参数
bootstrap_iterations = 1000
confidence_level = 95

for model_name, model in best_models.items():
    print(f"\n=== Evaluating {model_name} ===")
    
    # 预测验证集
    y_pred = model.predict(X_val)
    y_proba = model.predict_proba(X_val)[:, 1] if hasattr(model, "predict_proba") else model.decision_function(X_val)
    
    # 计算基础指标
    metrics = {
        'Model': model_name,
        'AUC': roc_auc_score(y_val, y_proba),
        'Accuracy': accuracy_score(y_val, y_pred),
        'Sensitivity': sensitivity_score(y_val, y_pred),
        'Specificity': specificity_score(y_val, y_pred),
        'Youden Index': youden_index_score(y_val, y_pred),
        'PPV': precision_score(y_val, y_pred),
        'NPV': precision_score(y_val, y_pred, pos_label=0),
        'F1 Score': f1_score(y_val, y_pred),
        'MCC': matthews_corrcoef(y_val, y_pred)
    }
    
    # 存储ROC数据
    fpr, tpr, _ = roc_curve(y_val, y_proba)
    roc_data[model_name] = (fpr, tpr, metrics['AUC'])
    
    # 存储混淆矩阵
    cm = confusion_matrix(y_val, y_pred)
    confusion_matrices[model_name] = cm
    
    # Bootstrap计算置信区间
    bootstrap_metrics = {key: [] for key in metrics if key != 'Model'}
    
    for _ in range(bootstrap_iterations):
        # Bootstrap重采样
        indices = resample(np.arange(len(y_val)), replace=True)
        y_val_boot = y_val.iloc[indices]
        y_pred_boot = y_pred[indices]
        y_proba_boot = y_proba[indices]
        
        # 计算指标
        try:
            bootstrap_metrics['AUC'].append(roc_auc_score(y_val_boot, y_proba_boot))
        except:
            bootstrap_metrics['AUC'].append(np.nan)
        bootstrap_metrics['Accuracy'].append(accuracy_score(y_val_boot, y_pred_boot))
        bootstrap_metrics['Sensitivity'].append(sensitivity_score(y_val_boot, y_pred_boot))
        bootstrap_metrics['Specificity'].append(specificity_score(y_val_boot, y_pred_boot))
        bootstrap_metrics['Youden Index'].append(youden_index_score(y_val_boot, y_pred_boot))
        bootstrap_metrics['PPV'].append(precision_score(y_val_boot, y_pred_boot))
        bootstrap_metrics['NPV'].append(precision_score(y_val_boot, y_pred_boot, pos_label=0))
        bootstrap_metrics['F1 Score'].append(f1_score(y_val_boot, y_pred_boot))
        bootstrap_metrics['MCC'].append(matthews_corrcoef(y_val_boot, y_pred_boot))
    
    # 计算置信区间
    for metric in bootstrap_metrics:
        lower = np.nanpercentile(bootstrap_metrics[metric], (100 - confidence_level)/2)
        upper = np.nanpercentile(bootstrap_metrics[metric], 100 - (100 - confidence_level)/2)
        metrics[f'{metric}_lower'] = lower
        metrics[f'{metric}_upper'] = upper
    
    metrics_results.append(metrics)

# 保存评估结果
metrics_df = pd.DataFrame(metrics_results)
metrics_df.to_csv(os.path.join(result_path, 'validation_metrics.csv'), index=False)


=== Evaluating Logistic Regression ===


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr


=== Evaluating SVM ===


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr


=== Evaluating Random Forest ===

=== Evaluating GNB ===

=== Evaluating MLP ===

=== Evaluating GBDT ===

=== Evaluating XGBoost ===

=== Evaluating LightGBM ===

=== Evaluating CatBoost ===

=== Evaluating AdaBoost ===


In [181]:
# 绘制ROC曲线
plt.rcParams['font.family'] = 'Arial'
plt.figure(figsize=(10, 8))
for name, (fpr, tpr, auc) in roc_data.items():
    if name == 'GNB':
        continue  # 跳过GNB
    plt.plot(fpr, tpr, 
             label=f'{name} (AUC = {auc:.3f} [{metrics_df[metrics_df.Model==name]["AUC_lower"].values[0]:.3f}-{metrics_df[metrics_df.Model==name]["AUC_upper"].values[0]:.3f}])')

plt.plot([0, 1], [0, 1], 'k--', lw=2)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.01])
plt.xlabel('False Positive Rate', fontsize=12)
plt.ylabel('True Positive Rate', fontsize=12)
plt.title('ROC Curves on Validation Set', fontsize=15)
plt.legend(loc='lower right', fontsize=10)
# 移除右边和上边边界
plt.gca().spines['right'].set_visible(False)
plt.gca().spines['top'].set_visible(False)

plt.grid(False)

plt.savefig(os.path.join(result_path, 'figure', 'ROC_curves.svg'), bbox_inches='tight')
plt.close()

In [182]:
# 遍历每个模型，绘制验证集混淆矩阵
for name, model in best_models.items():
    # 使用最佳模型进行验证集预测
    y_val_pred = model.predict(X_val)
    
    # 计算混淆矩阵
    cm = confusion_matrix(y_val, y_val_pred)
    
    # 调整矩阵显示顺序（如果需要保持原逻辑）
    cm = np.array([[cm[1, 1], cm[1, 0]],  # 修改后格式：[[TP, FN], [FP, TN]]
                   [cm[0, 1], cm[0, 0]]])
    
    # 将数值转换为百分比
    cm_percentage = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] * 100
    
    # 创建图形
    plt.figure(figsize=(8, 6))
    
    # 绘制热力图
    sns.heatmap(cm_percentage, annot=False, fmt=".2f", cmap='Blues', 
                cbar=True, square=True,
                xticklabels=['Positive', 'Negative'], 
                yticklabels=['Positive', 'Negative'])
    
    # 添加数值标注
    for i in range(cm_percentage.shape[0]):
        for j in range(cm_percentage.shape[1]):
            color = 'black' if cm_percentage[i, j] < 50 else 'white'
            plt.text(j + 0.5, i + 0.5, f"{cm_percentage[i, j]:.2f}%", 
                     ha='center', va='center', color=color, fontsize=12)
    
    # 添加标签和标题
    plt.title(f'{name} Validation Confusion Matrix (Percentage)', fontsize=14)
    plt.xlabel('Predicted Label', fontsize=12)
    plt.ylabel('True Label', fontsize=12)
    
    # 保存和显示
    plt.tight_layout()
    plt.savefig(f'{result_path}{name}_validation_confusion_matrix.svg', format='svg')
    plt.close()  # 关闭图形避免内存泄漏
    
    
# =============================================================================
# 模型保存
# =============================================================================

import joblib

# 保存每个最佳模型
for name, model in best_models.items():
    joblib.dump(model, f'{result_path}/model/{name}_model.pkl')
    print(f"{name} model saved successfully.")

print("\n=== 所有模型训练和评估完成 ===")
print(f"结果保存在目录: {os.path.abspath(result_path)}")

Logistic Regression model saved successfully.
SVM model saved successfully.
Random Forest model saved successfully.
GNB model saved successfully.
MLP model saved successfully.
GBDT model saved successfully.
XGBoost model saved successfully.
LightGBM model saved successfully.
CatBoost model saved successfully.
AdaBoost model saved successfully.

=== 所有模型训练和评估完成 ===
结果保存在目录: d:\adult_dep\实验结果\2025.4.8model\rsrna1+2-scaled


In [183]:
# =======================================================================

# 寻找AUC最高的模型，考虑置信区间
max_auc = metrics_df['AUC'].max()
candidates = metrics_df[metrics_df['AUC'] == max_auc]

# 处理并列情况
if len(candidates) > 1:
    print("\n发现多个模型具有相同AUC值，正在进行置信区间比较...")
    
    # 优先选择置信区间上限更高的模型
    candidates['AUC_upper'] = candidates['AUC_upper'].astype(float)
    best_model_name = candidates.loc[candidates['AUC_upper'].idxmax(), 'Model']
    
    # 获取比较结果
    compare_df = candidates[['Model', 'AUC', 'AUC_lower', 'AUC_upper']]
    print(f"候选模型比较：\n{compare_df.to_string(index=False)}")
    print(f"最终选择：{best_model_name} (最高置信区间上限)")
else:
    best_model_name = candidates.iloc[0]['Model']

# 获取最优模型
best_model = best_models[best_model_name]

# 获取所有验证集样本预测为1的概率
y_proba = best_model.predict_proba(X_val)[:, 1]

# 创建包含详细信息的DataFrame（无需索引）
prob_df = pd.DataFrame({
    'True_label': y_val.values,          # 使用验证集的真实标签
    'Predicted_probability': y_proba     # 预测概率直接对应验证集顺序
})

# 保存结果时添加模型信息
filename = f"sample_probabilities_{best_model_name.replace(' ', '_')}.csv"
prob_df.to_csv(os.path.join(result_path, filename), index=False)  # 禁用索引保存

print(f"\n=== 最终选择模型 {best_model_name} ===")
print(f"* AUC值: {max_auc:.4f}")
print(f"* 95%置信区间: [{candidates[candidates['Model']==best_model_name]['AUC_lower'].values[0]:.4f}, "
      f"{candidates[candidates['Model']==best_model_name]['AUC_upper'].values[0]:.4f}]")
print(f"* 预测概率文件已保存至：{os.path.join(result_path, filename)}")


=== 最终选择模型 SVM ===
* AUC值: 0.7898
* 95%置信区间: [0.6809, 0.8904]
* 预测概率文件已保存至：./rsrna1+2-scaled/sample_probabilities_SVM.csv


In [227]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
from matplotlib.ticker import MaxNLocator

# 读取数据
all = pd.read_csv('./rsrna1+2-scaled/sample_probabilities_SVM.csv', encoding='GBK')

# 配置参数
selected_mirnas = ['Predicted_probability']
groups = ['MDD', 'Control']
target_group = "MDD"
palette = {
    'MDD': '#FFA500',   # 更现代的珊瑚色
    'Control': '#2E8B57'
}

# 循环处理每个小RNA
for mirna in selected_mirnas:
    # 数据准备（不过滤异常值）
    value_df = all.dropna(subset=[mirna, 'True_label']).copy()
    value_df = value_df[value_df['True_label'].isin(groups)]
    
    # --- 创建过滤后的数据（仅用于绘图） ---
    filtered_data = pd.DataFrame()
    for group in groups:
        group_data = value_df[value_df['True_label'] == group]
        Q1 = group_data[mirna].quantile(0.25)
        Q3 = group_data[mirna].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        filtered_group = group_data[(group_data[mirna] >= lower_bound) & (group_data[mirna] <= upper_bound)]
        filtered_data = pd.concat([filtered_data, filtered_group])
    
    # 创建画布
    fig, ax = plt.subplots(figsize=(4.5, 6))
    
    # 绘制箱线图和散点图
    sns.boxplot(
        x='True_label', y=mirna, data=filtered_data, order=groups,
        palette=palette, ax=ax, linewidth=1.5, width=0.35, showfliers=False
    )
    # sns.stripplot(
    #     x='Group', y=mirna, data=filtered_data, order=groups,
    #     color='black', alpha=0.65, jitter=True, ax=ax
    # )
    
    
        # ----- 散点图优化 -----
    sns.stripplot(
        x='True_label',
        y=mirna,
        data=filtered_data,
        order=groups,
        palette=palette,       # 与箱线图颜色一致
        edgecolor='w',         # 白色描边
        linewidth=0.4,         # 描边粗细
        size=5,                # 点大小
        alpha=0.75,            # 透明度
        jitter=0.15,           # 抖动幅度
        ax=ax
    )
    # ----------------------------
    # 关键修改点：确保刻度线设置的优先级
    # ----------------------------
    
    # 1. 显式设置刻度线样式
    ax.tick_params(
        axis='both',
        which='both',
        direction='out',
        length=6,
        width=1.5,
        colors='black',
        bottom=True,   # 强制显示x轴刻度线
        left=True      # 强制显示y轴刻度线
    )
    
    # 2. 直接操作轴的底层对象
    for tick in ax.xaxis.get_major_ticks():
        tick.tick1line.set_visible(True)  # 显示x轴主刻度线
    for tick in ax.yaxis.get_major_ticks():
        tick.tick1line.set_visible(True)  # 显示y轴主刻度线
    
    # 3. 确保其他样式设置不会覆盖刻度线
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)
    ax.yaxis.set_major_locator(MaxNLocator(nbins=5))
    
    # 其他样式设置
    ax.set_title('Predicted Probability Distribution', fontsize=16)
    ax.set_ylabel('Predicted probability', fontsize=12)
    ax.set_xlabel('')
    for spine in ax.spines.values():
        spine.set_edgecolor('black')
        spine.set_linewidth(1)
    ax.grid(True, linestyle='--', alpha=0.7)
    
    # 生成需要比较的组合
    compare_groups = [g for g in groups if g != target_group]
    combinations = [(target_group, other) for other in compare_groups]
    
    # 统计标注参数
    y_min, y_max = ax.get_ylim()
    vertical_step = (y_max - y_min) * 0.08  # 调整标注间距

    # 进行两两比较（使用原始未过滤数据）
    for i, (group1, group2) in enumerate(combinations):
        # 获取数据（使用未过滤的value_df）
        data1 = value_df[value_df['True_label'] == group1][mirna]
        data2 = value_df[value_df['True_label'] == group2][mirna]
        
        # # 检查数据量
        # if len(data1) < 2 or len(data2) < 2:
        #     print(f"跳过{group1} vs {group2}（数据不足）")
        #     continue
        
        # 方差齐性检验
        levene_stat, levene_p = stats.levene(data1, data2)
        equal_var = levene_p >= 0.05
        
        # t检验
        t_stat, p_val = stats.ttest_ind(data1, data2, equal_var=equal_var)
        
        # 标注位置计算
        x1 = groups.index(group1)
        x2 = groups.index(group2)
        y_pos = y_max + (i+1)*vertical_step
        
        # 绘制横线
        ax.plot([x1, x2], [y_pos, y_pos], lw=1.5, color='black')
        
        # 生成标注文本
        if p_val < 0.001:
            p_text = '***'
        elif p_val < 0.01:
            p_text = '**'
        elif p_val < 0.05:
            p_text = '*'
        else:
            p_text = f'p={p_val:.3f}'
        
        # 添加文本
        ax.text(
            (x1+x2)/2, 
            y_pos + vertical_step/4, 
            p_text,
            ha='center',
            va='bottom',
            fontsize=12,
            backgroundcolor=(1, 1, 1, 0.5)  # 添加半透明背景
        )
    
    # 调整y轴范围
    ax.set_ylim(y_min, y_max + len(combinations)*vertical_step + vertical_step)

    # 保存图片
    plt.tight_layout()
    plt.savefig(f'1+2final-scaled_Boxplot_mdd_{mirna}.svg', dpi=600, bbox_inches='tight')
    plt.close()
    
    # plt.show()

  sns.stripplot(


## rsrna3

In [75]:
##数据读入
mirna=pd.read_csv('rsrna3.csv',encoding='GBK')
mirna = mirna[mirna['group'].isin([0, 1])]

# 划分特征和标签
miRNA_list = ['rsrna3']
X = mirna[miRNA_list]
y = mirna['group']

# 划分训练集和验证集 (7:3)
X_train, X_val, y_train, y_val = train_test_split(
    X, y, 
    test_size=0.3,
    stratify=y,
    random_state=42
)

In [76]:
X_train

Unnamed: 0,rsrna3
151,2.870000e-06
173,7.080000e-07
80,1.830000e-06
66,2.220000e-05
44,3.830000e-06
...,...
216,5.140000e-07
22,2.240000e-07
159,3.620000e-05
123,8.250000e-07


In [77]:
# 定义模型的参数网格
param_grids = {
    'Logistic Regression': {
        'C': [0.01, 0.1, 1, 10, 100],
        'solver': ['saga'],
        'penalty': ['l2'],
        'max_iter': [10000]
    },
    'SVM': {
        'C': [0.1, 1, 10, 100],
        'kernel': ['rbf'],
        'gamma': ['scale', 'auto']
    },
    'Random Forest': {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    },
    'GNB': {
        'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5]
    },
    'MLP': {
        'hidden_layer_sizes': [(10,), (50,), (10, 50), (50, 50)],
        'activation': ['relu', 'tanh'],
        'solver': ['adam'],
        'alpha': [0.0001, 0.001, 0.01],
        'learning_rate': ['constant', 'adaptive']
    },
    'GBDT': {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7],
        'subsample': [0.7, 0.8, 1.0]
    },
    'XGBoost': {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7],
        'subsample': [0.7, 0.8, 1.0]
    },
    'LightGBM': {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.2],
        'num_leaves': [31, 50, 100],
        'boosting_type': ['gbdt'],
        'subsample': [0.7, 0.8, 1.0]
    },
    'CatBoost': {
        'iterations': [100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.2],
        'depth': [3, 5, 7]
    },
    'AdaBoost': {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 1.0]
    }
}

# 定义模型集合
models = {
    'Logistic Regression': LogisticRegression(),
    'SVM': SVC(probability=True, random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'GNB': GaussianNB(),
    'MLP': MLPClassifier(max_iter=500, random_state=42),
    'GBDT': GradientBoostingClassifier(random_state=42),
    'XGBoost': xgb.XGBClassifier(random_state=42),
    'LightGBM': lgb.LGBMClassifier(random_state=42),
    'CatBoost': CatBoostClassifier(verbose=0, random_state=42),
    'AdaBoost': AdaBoostClassifier(random_state=42)
}


In [78]:
# 创建结果目录
result_path = './rsrna3/'
os.makedirs(result_path, exist_ok=True)
os.makedirs(os.path.join(result_path, 'model'), exist_ok=True)
os.makedirs(os.path.join(result_path, 'figure'), exist_ok=True)

best_models = {}

# 超参数优化（使用训练集的5折交叉验证）
for model_name, model in models.items():
    print(f"\n=== Training {model_name} ===")
    
    if model_name in param_grids:
        # 网格搜索
        grid_search = GridSearchCV(
            estimator=model,
            param_grid=param_grids[model_name],
            cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
            scoring='roc_auc',
            n_jobs=-1
        )
        grid_search.fit(X_train, y_train)
        
        # 保存最佳模型
        best_models[model_name] = grid_search.best_estimator_
        print(f"Best parameters: {grid_search.best_params_}")
    else:
        # 无参数优化的模型
        model.fit(X_train, y_train)
        best_models[model_name] = model



=== Training Logistic Regression ===
Best parameters: {'C': 0.01, 'max_iter': 10000, 'penalty': 'l2', 'solver': 'saga'}

=== Training SVM ===
Best parameters: {'C': 100, 'gamma': 'scale', 'kernel': 'rbf'}

=== Training Random Forest ===
Best parameters: {'max_depth': None, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 200}

=== Training GNB ===
Best parameters: {'var_smoothing': 1e-09}

=== Training MLP ===
Best parameters: {'activation': 'relu', 'alpha': 0.0001, 'hidden_layer_sizes': (50, 50), 'learning_rate': 'constant', 'solver': 'adam'}

=== Training GBDT ===
Best parameters: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.8}

=== Training XGBoost ===
Best parameters: {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 300, 'subsample': 0.7}

=== Training LightGBM ===
[LightGBM] [Info] Number of positive: 55, number of negative: 98
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000185 second

In [79]:
# 自定义评分函数
def sensitivity_score(y_true, y_pred):
    cm = confusion_matrix(y_true, y_pred)
    return cm[1, 1] / (cm[1, 1] + cm[1, 0])

def specificity_score(y_true, y_pred):
    cm = confusion_matrix(y_true, y_pred)
    return cm[0, 0] / (cm[0, 0] + cm[0, 1])

def youden_index_score(y_true, y_pred):
    sensitivity = sensitivity_score(y_true, y_pred)
    specificity = specificity_score(y_true, y_pred)
    return sensitivity + specificity - 1


# 验证集评估
# =============================================================================
# 评估指标收集
metrics_results = []
roc_data = {}
confusion_matrices = {}

# 置信区间参数
bootstrap_iterations = 1000
confidence_level = 95

for model_name, model in best_models.items():
    print(f"\n=== Evaluating {model_name} ===")
    
    # 预测验证集
    y_pred = model.predict(X_val)
    y_proba = model.predict_proba(X_val)[:, 1] if hasattr(model, "predict_proba") else model.decision_function(X_val)
    
    # 计算基础指标
    metrics = {
        'Model': model_name,
        'AUC': roc_auc_score(y_val, y_proba),
        'Accuracy': accuracy_score(y_val, y_pred),
        'Sensitivity': sensitivity_score(y_val, y_pred),
        'Specificity': specificity_score(y_val, y_pred),
        'Youden Index': youden_index_score(y_val, y_pred),
        'PPV': precision_score(y_val, y_pred),
        'NPV': precision_score(y_val, y_pred, pos_label=0),
        'F1 Score': f1_score(y_val, y_pred),
        'MCC': matthews_corrcoef(y_val, y_pred)
    }
    
    # 存储ROC数据
    fpr, tpr, _ = roc_curve(y_val, y_proba)
    roc_data[model_name] = (fpr, tpr, metrics['AUC'])
    
    # 存储混淆矩阵
    cm = confusion_matrix(y_val, y_pred)
    confusion_matrices[model_name] = cm
    
    # Bootstrap计算置信区间
    bootstrap_metrics = {key: [] for key in metrics if key != 'Model'}
    
    for _ in range(bootstrap_iterations):
        # Bootstrap重采样
        indices = resample(np.arange(len(y_val)), replace=True)
        y_val_boot = y_val.iloc[indices]
        y_pred_boot = y_pred[indices]
        y_proba_boot = y_proba[indices]
        
        # 计算指标
        try:
            bootstrap_metrics['AUC'].append(roc_auc_score(y_val_boot, y_proba_boot))
        except:
            bootstrap_metrics['AUC'].append(np.nan)
        bootstrap_metrics['Accuracy'].append(accuracy_score(y_val_boot, y_pred_boot))
        bootstrap_metrics['Sensitivity'].append(sensitivity_score(y_val_boot, y_pred_boot))
        bootstrap_metrics['Specificity'].append(specificity_score(y_val_boot, y_pred_boot))
        bootstrap_metrics['Youden Index'].append(youden_index_score(y_val_boot, y_pred_boot))
        bootstrap_metrics['PPV'].append(precision_score(y_val_boot, y_pred_boot))
        bootstrap_metrics['NPV'].append(precision_score(y_val_boot, y_pred_boot, pos_label=0))
        bootstrap_metrics['F1 Score'].append(f1_score(y_val_boot, y_pred_boot))
        bootstrap_metrics['MCC'].append(matthews_corrcoef(y_val_boot, y_pred_boot))
    
    # 计算置信区间
    for metric in bootstrap_metrics:
        lower = np.nanpercentile(bootstrap_metrics[metric], (100 - confidence_level)/2)
        upper = np.nanpercentile(bootstrap_metrics[metric], 100 - (100 - confidence_level)/2)
        metrics[f'{metric}_lower'] = lower
        metrics[f'{metric}_upper'] = upper
    
    metrics_results.append(metrics)

# 保存评估结果
metrics_df = pd.DataFrame(metrics_results)
metrics_df.to_csv(os.path.join(result_path, 'validation_metrics.csv'), index=False)


=== Evaluating Logistic Regression ===


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr


=== Evaluating SVM ===


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr


=== Evaluating Random Forest ===


  _warn_prf(average, modifier, msg_start, len(result))



=== Evaluating GNB ===


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr


=== Evaluating MLP ===


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr


=== Evaluating GBDT ===


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr


=== Evaluating XGBoost ===

=== Evaluating LightGBM ===

=== Evaluating CatBoost ===

=== Evaluating AdaBoost ===


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

In [80]:
# 绘制ROC曲线
plt.rcParams['font.family'] = 'Arial'
plt.figure(figsize=(10, 8))
for name, (fpr, tpr, auc) in roc_data.items():
    if name == 'GNB':
        continue  # 跳过GNB
    plt.plot(fpr, tpr, 
             label=f'{name} (AUC = {auc:.3f} [{metrics_df[metrics_df.Model==name]["AUC_lower"].values[0]:.3f}-{metrics_df[metrics_df.Model==name]["AUC_upper"].values[0]:.3f}])')

plt.plot([0, 1], [0, 1], 'k--', lw=2)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.01])
plt.xlabel('False Positive Rate', fontsize=12)
plt.ylabel('True Positive Rate', fontsize=12)
plt.title('ROC Curves on Validation Set', fontsize=15)
plt.legend(loc='lower right', fontsize=10)
# 移除右边和上边边界
plt.gca().spines['right'].set_visible(False)
plt.gca().spines['top'].set_visible(False)

plt.grid(False)

plt.savefig(os.path.join(result_path, 'figure', 'ROC_curves.svg'), bbox_inches='tight')
plt.close()



# 遍历每个模型，绘制验证集混淆矩阵
for name, model in best_models.items():
    # 使用最佳模型进行验证集预测
    y_val_pred = model.predict(X_val)
    
    # 计算混淆矩阵
    cm = confusion_matrix(y_val, y_val_pred)
    
    # 调整矩阵显示顺序（如果需要保持原逻辑）
    cm = np.array([[cm[1, 1], cm[1, 0]],  # 修改后格式：[[TP, FN], [FP, TN]]
                   [cm[0, 1], cm[0, 0]]])
    
    # 将数值转换为百分比
    cm_percentage = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] * 100
    
    # 创建图形
    plt.figure(figsize=(8, 6))
    
    # 绘制热力图
    sns.heatmap(cm_percentage, annot=False, fmt=".2f", cmap='Blues', 
                cbar=True, square=True,
                xticklabels=['Positive', 'Negative'], 
                yticklabels=['Positive', 'Negative'])
    
    # 添加数值标注
    for i in range(cm_percentage.shape[0]):
        for j in range(cm_percentage.shape[1]):
            color = 'black' if cm_percentage[i, j] < 50 else 'white'
            plt.text(j + 0.5, i + 0.5, f"{cm_percentage[i, j]:.2f}%", 
                     ha='center', va='center', color=color, fontsize=12)
    
    # 添加标签和标题
    plt.title(f'{name} Validation Confusion Matrix (Percentage)', fontsize=14)
    plt.xlabel('Predicted Label', fontsize=12)
    plt.ylabel('True Label', fontsize=12)
    
    # 保存和显示
    plt.tight_layout()
    plt.savefig(f'{result_path}{name}_validation_confusion_matrix.svg', format='svg')
    plt.close()  # 关闭图形避免内存泄漏
    
    
# =============================================================================
# 模型保存
# =============================================================================

import joblib

# 保存每个最佳模型
for name, model in best_models.items():
    joblib.dump(model, f'{result_path}/model/{name}_model.pkl')
    print(f"{name} model saved successfully.")

print("\n=== 所有模型训练和评估完成 ===")
print(f"结果保存在目录: {os.path.abspath(result_path)}")

Logistic Regression model saved successfully.
SVM model saved successfully.
Random Forest model saved successfully.
GNB model saved successfully.
MLP model saved successfully.
GBDT model saved successfully.
XGBoost model saved successfully.
LightGBM model saved successfully.
CatBoost model saved successfully.
AdaBoost model saved successfully.

=== 所有模型训练和评估完成 ===
结果保存在目录: d:\adult_dep\实验结果\2025.4.8model\rsrna3


In [81]:
# =======================================================================

# 寻找AUC最高的模型，考虑置信区间
max_auc = metrics_df['AUC'].max()
candidates = metrics_df[metrics_df['AUC'] == max_auc]

# 处理并列情况
if len(candidates) > 1:
    print("\n发现多个模型具有相同AUC值，正在进行置信区间比较...")
    
    # 优先选择置信区间上限更高的模型
    candidates['AUC_upper'] = candidates['AUC_upper'].astype(float)
    best_model_name = candidates.loc[candidates['AUC_upper'].idxmax(), 'Model']
    
    # 获取比较结果
    compare_df = candidates[['Model', 'AUC', 'AUC_lower', 'AUC_upper']]
    print(f"候选模型比较：\n{compare_df.to_string(index=False)}")
    print(f"最终选择：{best_model_name} (最高置信区间上限)")
else:
    best_model_name = candidates.iloc[0]['Model']

# 获取最优模型
best_model = best_models[best_model_name]

# 获取所有验证集样本预测为1的概率
y_proba = best_model.predict_proba(X_val)[:, 1]

# 创建包含详细信息的DataFrame
prob_df = pd.DataFrame({
    'Index': X_val.index,
    'True_label': y_val.values,
    'Predicted_probability': y_proba
}).sort_values('Index')

# 保存结果时添加模型信息
filename = f"sample_probabilities_{best_model_name.replace(' ', '_')}.csv"
prob_df.to_csv(os.path.join(result_path, filename), index=False)

print(f"\n=== 最终选择模型 {best_model_name} ===")
print(f"* AUC值: {max_auc:.4f}")
print(f"* 95%置信区间: [{candidates[candidates['Model']==best_model_name]['AUC_lower'].values[0]:.4f}, "
      f"{candidates[candidates['Model']==best_model_name]['AUC_upper'].values[0]:.4f}]")
print(f"* 预测概率文件已保存至：{os.path.join(result_path, filename)}")

# # 可选：保存所有候选模型的预测概率
# if len(candidates) > 1:
#     print("\n=== 保存所有候选模型的预测概率 ===")
#     for _, row in candidates.iterrows():
#         model_name = row['Model']
#         model = best_models[model_name]
        
#         y_proba = model.predict_proba(X_val)[:, 1]
#         prob_df = pd.DataFrame({
#             'Index': X_val.index,
#             'True_label': y_val.values,
#             f'Prob_{model_name}': y_proba
#         }).sort_values('Index')
        
#         filename = f"candidate_probabilities_{model_name.replace(' ', '_')}.csv"
#         prob_df.to_csv(os.path.join(result_path, filename), index=False)
#         print(f"已保存 {model_name} 的预测概率")


=== 最终选择模型 GNB ===
* AUC值: 0.6047
* 95%置信区间: [0.4605, 0.7489]
* 预测概率文件已保存至：./rsrna3/sample_probabilities_GNB.csv


# 2mirna feature

## 1. 区分MDD vs BD

In [228]:

##数据读入
mirna=pd.read_csv('mirna1.csv',encoding='GBK')
mirna = mirna[mirna['group'].isin([2, 1])]
mirna['group'] = mirna['group'].replace(2, 0)

# 划分特征和标签
miRNA_list = ['mirna1']
X = mirna[miRNA_list]
y = mirna['group']

# 划分训练集和验证集 (7:3)
X_train, X_val, y_train, y_val = train_test_split(
    X, y, 
    test_size=0.3,
    stratify=y,
    random_state=16
)


from sklearn.preprocessing import StandardScaler
# 2. 只在训练集上计算标准化参数
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)  # 训练集：fit + transform

# 3. 用训练集的参数标准化验证集
X_val = scaler.transform(X_val) 

In [229]:
X_train

array([[-0.50709452],
       [ 2.79549351],
       [-0.81684387],
       [-0.53911587],
       [ 0.48856471],
       [ 0.94620372],
       [ 1.68685632],
       [-0.42306022],
       [-0.95377695],
       [-0.42101289],
       [-0.40871016],
       [-0.83829135],
       [-0.27684322],
       [ 1.61794285],
       [ 0.42701093],
       [-0.27416697],
       [ 3.06825706],
       [-0.2447282 ],
       [-0.99140237],
       [ 1.99261806],
       [-0.00369696],
       [-0.46953333],
       [ 0.66252106],
       [-0.16711691],
       [ 3.51808141],
       [-0.95928735],
       [-0.18317442],
       [ 0.3997988 ],
       [-0.4989721 ],
       [-0.89505732],
       [-0.1680081 ],
       [-0.45289507],
       [-0.67292844],
       [-0.57306947],
       [ 0.30410407],
       [-1.03020802],
       [-0.52369798],
       [ 1.41722398],
       [ 0.60364352],
       [-0.34642576],
       [-0.56587035],
       [-0.95928735],
       [-0.55249713],
       [-1.06205541],
       [-0.95393485],
       [-0

In [230]:
param_grids = {
    'Logistic Regression': {
        'C': [0.01, 0.1, 1, 10, 100],
        'solver': ['saga'],
        'penalty': ['l2'],
        'max_iter': [10000]
    },
    'SVM': {
        'C': [0.1, 1, 10, 100],
        'kernel': ['rbf'],
        'gamma': ['scale', 'auto']
    },
    'Random Forest': {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    },
    'GNB': {
        'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5]
    },
    'MLP': {
        'hidden_layer_sizes': [(10,), (50,), (10, 50), (50, 50)],
        'activation': ['relu', 'tanh'],
        'solver': ['adam'],
        'alpha': [0.0001, 0.001, 0.01],
        'learning_rate': ['constant', 'adaptive']
    },
    'GBDT': {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7],
        'subsample': [0.7, 0.8, 1.0]
    },
    'XGBoost': {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7],
        'subsample': [0.7, 0.8, 1.0]
    },
    'LightGBM': {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.2],
        'num_leaves': [31, 50, 100],
        'boosting_type': ['gbdt'],
        'subsample': [0.7, 0.8, 1.0]
    },
    'CatBoost': {
        'iterations': [100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.2],
        'depth': [3, 5, 7]
    },
    'AdaBoost': {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 1.0]
    }
}

# 定义模型集合
models = {
    'Logistic Regression': LogisticRegression(),
    'SVM': SVC(probability=True, random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'GNB': GaussianNB(),
    'MLP': MLPClassifier(max_iter=500, random_state=42),
    'GBDT': GradientBoostingClassifier(random_state=42),
    'XGBoost': xgb.XGBClassifier(random_state=42),
    'LightGBM': lgb.LGBMClassifier(random_state=42),
    'CatBoost': CatBoostClassifier(verbose=0, random_state=42),
    'AdaBoost': AdaBoostClassifier(random_state=42)
}

In [231]:
# 创建结果目录
result_path = './mirna1-scaled-seed16/'
os.makedirs(result_path, exist_ok=True)
os.makedirs(os.path.join(result_path, 'model'), exist_ok=True)
os.makedirs(os.path.join(result_path, 'figure'), exist_ok=True)

best_models = {}

# 超参数优化（使用训练集的5折交叉验证）
for model_name, model in models.items():
    print(f"\n=== Training {model_name} ===")
    
    if model_name in param_grids:
        # 网格搜索
        grid_search = GridSearchCV(
            estimator=model,
            param_grid=param_grids[model_name],
            cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
            scoring='roc_auc',
            n_jobs=-1
        )
        grid_search.fit(X_train, y_train)
        
        # 保存最佳模型
        best_models[model_name] = grid_search.best_estimator_
        print(f"Best parameters: {grid_search.best_params_}")
    else:
        # 无参数优化的模型
        model.fit(X_train, y_train)
        best_models[model_name] = model


=== Training Logistic Regression ===
Best parameters: {'C': 0.01, 'max_iter': 10000, 'penalty': 'l2', 'solver': 'saga'}

=== Training SVM ===
Best parameters: {'C': 0.1, 'gamma': 'scale', 'kernel': 'rbf'}

=== Training Random Forest ===
Best parameters: {'max_depth': None, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 200}

=== Training GNB ===
Best parameters: {'var_smoothing': 1e-09}

=== Training MLP ===
Best parameters: {'activation': 'tanh', 'alpha': 0.001, 'hidden_layer_sizes': (50, 50), 'learning_rate': 'constant', 'solver': 'adam'}

=== Training GBDT ===
Best parameters: {'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 300, 'subsample': 0.8}

=== Training XGBoost ===
Best parameters: {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 100, 'subsample': 0.7}

=== Training LightGBM ===
[LightGBM] [Info] Number of positive: 54, number of negative: 47
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000188 seconds.

In [232]:
# 自定义评分函数
def sensitivity_score(y_true, y_pred):
    cm = confusion_matrix(y_true, y_pred)
    return cm[1, 1] / (cm[1, 1] + cm[1, 0])

def specificity_score(y_true, y_pred):
    cm = confusion_matrix(y_true, y_pred)
    return cm[0, 0] / (cm[0, 0] + cm[0, 1])

def youden_index_score(y_true, y_pred):
    sensitivity = sensitivity_score(y_true, y_pred)
    specificity = specificity_score(y_true, y_pred)
    return sensitivity + specificity - 1


# 验证集评估
# =============================================================================
# 评估指标收集
metrics_results = []
roc_data = {}
confusion_matrices = {}

# 置信区间参数
bootstrap_iterations = 1000
confidence_level = 95

for model_name, model in best_models.items():
    print(f"\n=== Evaluating {model_name} ===")
    
    # 预测验证集
    y_pred = model.predict(X_val)
    y_proba = model.predict_proba(X_val)[:, 1] if hasattr(model, "predict_proba") else model.decision_function(X_val)
    
    # 计算基础指标
    metrics = {
        'Model': model_name,
        'AUC': roc_auc_score(y_val, y_proba),
        'Accuracy': accuracy_score(y_val, y_pred),
        'Sensitivity': sensitivity_score(y_val, y_pred),
        'Specificity': specificity_score(y_val, y_pred),
        'Youden Index': youden_index_score(y_val, y_pred),
        'PPV': precision_score(y_val, y_pred),
        'NPV': precision_score(y_val, y_pred, pos_label=0),
        'F1 Score': f1_score(y_val, y_pred),
        'MCC': matthews_corrcoef(y_val, y_pred)
    }
    
    # 存储ROC数据
    fpr, tpr, _ = roc_curve(y_val, y_proba)
    roc_data[model_name] = (fpr, tpr, metrics['AUC'])
    
    # 存储混淆矩阵
    cm = confusion_matrix(y_val, y_pred)
    confusion_matrices[model_name] = cm
    
    # Bootstrap计算置信区间
    bootstrap_metrics = {key: [] for key in metrics if key != 'Model'}
    
    for _ in range(bootstrap_iterations):
        # Bootstrap重采样
        indices = resample(np.arange(len(y_val)), replace=True)
        y_val_boot = y_val.iloc[indices]
        y_pred_boot = y_pred[indices]
        y_proba_boot = y_proba[indices]
        
        # 计算指标
        try:
            bootstrap_metrics['AUC'].append(roc_auc_score(y_val_boot, y_proba_boot))
        except:
            bootstrap_metrics['AUC'].append(np.nan)
        bootstrap_metrics['Accuracy'].append(accuracy_score(y_val_boot, y_pred_boot))
        bootstrap_metrics['Sensitivity'].append(sensitivity_score(y_val_boot, y_pred_boot))
        bootstrap_metrics['Specificity'].append(specificity_score(y_val_boot, y_pred_boot))
        bootstrap_metrics['Youden Index'].append(youden_index_score(y_val_boot, y_pred_boot))
        bootstrap_metrics['PPV'].append(precision_score(y_val_boot, y_pred_boot))
        bootstrap_metrics['NPV'].append(precision_score(y_val_boot, y_pred_boot, pos_label=0))
        bootstrap_metrics['F1 Score'].append(f1_score(y_val_boot, y_pred_boot))
        bootstrap_metrics['MCC'].append(matthews_corrcoef(y_val_boot, y_pred_boot))
    
    # 计算置信区间
    for metric in bootstrap_metrics:
        lower = np.nanpercentile(bootstrap_metrics[metric], (100 - confidence_level)/2)
        upper = np.nanpercentile(bootstrap_metrics[metric], 100 - (100 - confidence_level)/2)
        metrics[f'{metric}_lower'] = lower
        metrics[f'{metric}_upper'] = upper
    
    metrics_results.append(metrics)

# 保存评估结果
metrics_df = pd.DataFrame(metrics_results)
metrics_df.to_csv(os.path.join(result_path, 'validation_metrics.csv'), index=False)


=== Evaluating Logistic Regression ===


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr


=== Evaluating SVM ===


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr


=== Evaluating Random Forest ===

=== Evaluating GNB ===


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr


=== Evaluating MLP ===

=== Evaluating GBDT ===

=== Evaluating XGBoost ===

=== Evaluating LightGBM ===

=== Evaluating CatBoost ===

=== Evaluating AdaBoost ===


In [233]:
# 绘制ROC曲线
plt.rcParams['font.family'] = 'Arial'
plt.figure(figsize=(10, 8))
for name, (fpr, tpr, auc) in roc_data.items():
    if name == 'GNB':
        continue  # 跳过GNB
    plt.plot(fpr, tpr, 
             label=f'{name} (AUC = {auc:.3f} [{metrics_df[metrics_df.Model==name]["AUC_lower"].values[0]:.3f}-{metrics_df[metrics_df.Model==name]["AUC_upper"].values[0]:.3f}])')

plt.plot([0, 1], [0, 1], 'k--', lw=2)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.01])
plt.xlabel('False Positive Rate', fontsize=12)
plt.ylabel('True Positive Rate', fontsize=12)
plt.title('ROC Curves on Validation Set', fontsize=15)
plt.legend(loc='lower right', fontsize=10)
# 移除右边和上边边界
plt.gca().spines['right'].set_visible(False)
plt.gca().spines['top'].set_visible(False)

plt.grid(False)

plt.savefig(os.path.join(result_path, 'figure', 'ROC_curves.svg'), bbox_inches='tight')
plt.close()

In [234]:
# 遍历每个模型，绘制验证集混淆矩阵
for name, model in best_models.items():
    # 使用最佳模型进行验证集预测
    y_val_pred = model.predict(X_val)
    
    # 计算混淆矩阵
    cm = confusion_matrix(y_val, y_val_pred)
    
    # 调整矩阵显示顺序（如果需要保持原逻辑）
    cm = np.array([[cm[1, 1], cm[1, 0]],  # 修改后格式：[[TP, FN], [FP, TN]]
                   [cm[0, 1], cm[0, 0]]])
    
    # 将数值转换为百分比
    cm_percentage = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] * 100
    
    # 创建图形
    plt.figure(figsize=(8, 6))
    
    # 绘制热力图
    sns.heatmap(cm_percentage, annot=False, fmt=".2f", cmap='Blues', 
                cbar=True, square=True,
                xticklabels=['Positive', 'Negative'], 
                yticklabels=['Positive', 'Negative'])
    
    # 添加数值标注
    for i in range(cm_percentage.shape[0]):
        for j in range(cm_percentage.shape[1]):
            color = 'black' if cm_percentage[i, j] < 50 else 'white'
            plt.text(j + 0.5, i + 0.5, f"{cm_percentage[i, j]:.2f}%", 
                     ha='center', va='center', color=color, fontsize=12)
    
    # 添加标签和标题
    plt.title(f'{name} Validation Confusion Matrix (Percentage)', fontsize=14)
    plt.xlabel('Predicted Label', fontsize=12)
    plt.ylabel('True Label', fontsize=12)
    
    # 保存和显示
    plt.tight_layout()
    plt.savefig(f'{result_path}{name}_validation_confusion_matrix.svg', format='svg')
    plt.close()  # 关闭图形避免内存泄漏
    
    
# =============================================================================
# 模型保存
# =============================================================================

import joblib

# 保存每个最佳模型
for name, model in best_models.items():
    joblib.dump(model, f'{result_path}/model/{name}_model.pkl')
    print(f"{name} model saved successfully.")

print("\n=== 所有模型训练和评估完成 ===")
print(f"结果保存在目录: {os.path.abspath(result_path)}")

Logistic Regression model saved successfully.
SVM model saved successfully.
Random Forest model saved successfully.
GNB model saved successfully.
MLP model saved successfully.
GBDT model saved successfully.
XGBoost model saved successfully.
LightGBM model saved successfully.
CatBoost model saved successfully.
AdaBoost model saved successfully.

=== 所有模型训练和评估完成 ===
结果保存在目录: d:\adult_dep\实验结果\2025.4.8model\mirna1-scaled-seed16


In [137]:
# =======================================================================

# 寻找AUC最高的模型，考虑置信区间
max_auc = metrics_df['AUC'].max()
candidates = metrics_df[metrics_df['AUC'] == max_auc]

# 处理并列情况
if len(candidates) > 1:
    print("\n发现多个模型具有相同AUC值，正在进行置信区间比较...")
    
    # 优先选择置信区间上限更高的模型
    candidates['AUC_upper'] = candidates['AUC_upper'].astype(float)
    best_model_name = candidates.loc[candidates['AUC_upper'].idxmax(), 'Model']
    
    # 获取比较结果
    compare_df = candidates[['Model', 'AUC', 'AUC_lower', 'AUC_upper']]
    print(f"候选模型比较：\n{compare_df.to_string(index=False)}")
    print(f"最终选择：{best_model_name} (最高置信区间上限)")
else:
    best_model_name = candidates.iloc[0]['Model']

# 获取最优模型
best_model = best_models[best_model_name]

# 获取所有验证集样本预测为1的概率
y_proba = best_model.predict_proba(X_val)[:, 1]

# 创建包含详细信息的DataFrame
prob_df = pd.DataFrame({
    'Index': X_val.index,
    'True_label': y_val.values,
    'Predicted_probability': y_proba
}).sort_values('Index')

# 保存结果时添加模型信息
filename = f"sample_probabilities_{best_model_name.replace(' ', '_')}.csv"
prob_df.to_csv(os.path.join(result_path, filename), index=False)

print(f"\n=== 最终选择模型 {best_model_name} ===")
print(f"* AUC值: {max_auc:.4f}")
print(f"* 95%置信区间: [{candidates[candidates['Model']==best_model_name]['AUC_lower'].values[0]:.4f}, "
      f"{candidates[candidates['Model']==best_model_name]['AUC_upper'].values[0]:.4f}]")
print(f"* 预测概率文件已保存至：{os.path.join(result_path, filename)}")

# # 可选：保存所有候选模型的预测概率
# if len(candidates) > 1:
#     print("\n=== 保存所有候选模型的预测概率 ===")
#     for _, row in candidates.iterrows():
#         model_name = row['Model']
#         model = best_models[model_name]
        
#         y_proba = model.predict_proba(X_val)[:, 1]
#         prob_df = pd.DataFrame({
#             'Index': X_val.index,
#             'True_label': y_val.values,
#             f'Prob_{model_name}': y_proba
#         }).sort_values('Index')
        
#         filename = f"candidate_probabilities_{model_name.replace(' ', '_')}.csv"
#         prob_df.to_csv(os.path.join(result_path, filename), index=False)
#         print(f"已保存 {model_name} 的预测概率")

AttributeError: 'numpy.ndarray' object has no attribute 'index'

In [235]:
# =======================================================================

# 寻找AUC最高的模型，考虑置信区间
max_auc = metrics_df['AUC'].max()
candidates = metrics_df[metrics_df['AUC'] == max_auc]

# 处理并列情况
if len(candidates) > 1:
    print("\n发现多个模型具有相同AUC值，正在进行置信区间比较...")
    
    # 优先选择置信区间上限更高的模型
    candidates['AUC_upper'] = candidates['AUC_upper'].astype(float)
    best_model_name = candidates.loc[candidates['AUC_upper'].idxmax(), 'Model']
    
    # 获取比较结果
    compare_df = candidates[['Model', 'AUC', 'AUC_lower', 'AUC_upper']]
    print(f"候选模型比较：\n{compare_df.to_string(index=False)}")
    print(f"最终选择：{best_model_name} (最高置信区间上限)")
else:
    best_model_name = candidates.iloc[0]['Model']

# 获取最优模型
best_model = best_models[best_model_name]

# 获取所有验证集样本预测为1的概率
y_proba = best_model.predict_proba(X_val)[:, 1]

# 创建包含详细信息的DataFrame（无需索引）
prob_df = pd.DataFrame({
    'True_label': y_val.values,          # 使用验证集的真实标签
    'Predicted_probability': y_proba     # 预测概率直接对应验证集顺序
})

# 保存结果时添加模型信息
filename = f"sample_probabilities_{best_model_name.replace(' ', '_')}.csv"
prob_df.to_csv(os.path.join(result_path, filename), index=False)  # 禁用索引保存

print(f"\n=== 最终选择模型 {best_model_name} ===")
print(f"* AUC值: {max_auc:.4f}")
print(f"* 95%置信区间: [{candidates[candidates['Model']==best_model_name]['AUC_lower'].values[0]:.4f}, "
      f"{candidates[candidates['Model']==best_model_name]['AUC_upper'].values[0]:.4f}]")
print(f"* 预测概率文件已保存至：{os.path.join(result_path, filename)}")


=== 最终选择模型 CatBoost ===
* AUC值: 0.7302
* 95%置信区间: [0.5714, 0.8677]
* 预测概率文件已保存至：./mirna1-scaled-seed16/sample_probabilities_CatBoost.csv


# 调取并计算概率

In [236]:
# 寻找AUC最高的最优模型
best_model_name = metrics_df.loc[metrics_df['AUC'].idxmax(), 'Model']
best_model = best_models[best_model_name]

# 获取所有验证集样本预测为1的概率
y_proba = best_model.predict_proba(X_val)[:, 1]  # 取正类的概率

# 创建包含原始索引的DataFrame
prob_df = pd.DataFrame({
    # 'Index': X_val.index,          # 保留原始索引
    'True_label': y_val,           # 真实标签
    'Predicted_probability': y_proba  # 预测概率
})

# 按原始索引排序（如果需要保持原始顺序）
# prob_df = prob_df.sort_values('Index')

# 保存结果
prob_df.to_csv(os.path.join(result_path, 'sample_probabilities.csv'), index=False)

print(f"\n=== 最优模型 {best_model_name} 的预测概率已保存 ===")
print(f"文件路径：{os.path.join(result_path, 'sample_probabilities.csv')}")


=== 最优模型 CatBoost 的预测概率已保存 ===
文件路径：./mirna1-scaled-seed16/sample_probabilities.csv


In [237]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
from matplotlib.ticker import MaxNLocator

# 读取数据
all = pd.read_csv('./mirna1-scaled-seed16/sample_probabilities.csv', encoding='GBK')

# 配置参数
selected_mirnas = ['Predicted_probability']
groups = ['MDD', 'BD']
target_group = "MDD"
palette = {
    'MDD': '#FFA500',   # 更现代的珊瑚色
    'BD': '#6495ED'
}

# 循环处理每个小RNA
for mirna in selected_mirnas:
    # 数据准备（不过滤异常值）
    value_df = all.dropna(subset=[mirna, 'True_label']).copy()
    value_df = value_df[value_df['True_label'].isin(groups)]
    
    # --- 创建过滤后的数据（仅用于绘图） ---
    filtered_data = pd.DataFrame()
    for group in groups:
        group_data = value_df[value_df['True_label'] == group]
        Q1 = group_data[mirna].quantile(0.25)
        Q3 = group_data[mirna].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        filtered_group = group_data[(group_data[mirna] >= lower_bound) & (group_data[mirna] <= upper_bound)]
        filtered_data = pd.concat([filtered_data, filtered_group])
    
    # 创建画布
    fig, ax = plt.subplots(figsize=(4.5, 6))
    
    # 绘制箱线图和散点图
    sns.boxplot(
        x='True_label', y=mirna, data=filtered_data, order=groups,
        palette=palette, ax=ax, linewidth=1.5, width=0.35, showfliers=False
    )
    # sns.stripplot(
    #     x='Group', y=mirna, data=filtered_data, order=groups,
    #     color='black', alpha=0.65, jitter=True, ax=ax
    # )
    
    
        # ----- 散点图优化 -----
    sns.stripplot(
        x='True_label',
        y=mirna,
        data=filtered_data,
        order=groups,
        palette=palette,       # 与箱线图颜色一致
        edgecolor='w',         # 白色描边
        linewidth=0.4,         # 描边粗细
        size=5,                # 点大小
        alpha=0.75,            # 透明度
        jitter=0.15,           # 抖动幅度
        ax=ax
    )
    # ----------------------------
    # 关键修改点：确保刻度线设置的优先级
    # ----------------------------
    
    # 1. 显式设置刻度线样式
    ax.tick_params(
        axis='both',
        which='both',
        direction='out',
        length=6,
        width=1.5,
        colors='black',
        bottom=True,   # 强制显示x轴刻度线
        left=True      # 强制显示y轴刻度线
    )
    
    # 2. 直接操作轴的底层对象
    for tick in ax.xaxis.get_major_ticks():
        tick.tick1line.set_visible(True)  # 显示x轴主刻度线
    for tick in ax.yaxis.get_major_ticks():
        tick.tick1line.set_visible(True)  # 显示y轴主刻度线
    
    # 3. 确保其他样式设置不会覆盖刻度线
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)
    ax.yaxis.set_major_locator(MaxNLocator(nbins=5))
    
    # 其他样式设置
    ax.set_title('Predicted Probability Distribution', fontsize=16)
    ax.set_ylabel('Predicted probability', fontsize=12)
    ax.set_xlabel('')
    for spine in ax.spines.values():
        spine.set_edgecolor('black')
        spine.set_linewidth(1)
    ax.grid(True, linestyle='--', alpha=0.7)
    
    # 生成需要比较的组合
    compare_groups = [g for g in groups if g != target_group]
    combinations = [(target_group, other) for other in compare_groups]
    
    # 统计标注参数
    y_min, y_max = ax.get_ylim()
    vertical_step = (y_max - y_min) * 0.08  # 调整标注间距

    # 进行两两比较（使用原始未过滤数据）
    for i, (group1, group2) in enumerate(combinations):
        # 获取数据（使用未过滤的value_df）
        data1 = value_df[value_df['True_label'] == group1][mirna]
        data2 = value_df[value_df['True_label'] == group2][mirna]
        
        # # 检查数据量
        # if len(data1) < 2 or len(data2) < 2:
        #     print(f"跳过{group1} vs {group2}（数据不足）")
        #     continue
        
        # 方差齐性检验
        levene_stat, levene_p = stats.levene(data1, data2)
        equal_var = levene_p >= 0.05
        
        # t检验
        t_stat, p_val = stats.ttest_ind(data1, data2, equal_var=equal_var)
        
        # 标注位置计算
        x1 = groups.index(group1)
        x2 = groups.index(group2)
        y_pos = y_max + (i+1)*vertical_step
        
        # 绘制横线
        ax.plot([x1, x2], [y_pos, y_pos], lw=1.5, color='black')
        
        # 生成标注文本
        if p_val < 0.001:
            p_text = '***'
        elif p_val < 0.01:
            p_text = '**'
        elif p_val < 0.05:
            p_text = '*'
        else:
            p_text = f'p={p_val:.3f}'
        
        # 添加文本
        ax.text(
            (x1+x2)/2, 
            y_pos + vertical_step/4, 
            p_text,
            ha='center',
            va='bottom',
            fontsize=12,
            backgroundcolor=(1, 1, 1, 0.5)  # 添加半透明背景
        )
    
    # 调整y轴范围
    ax.set_ylim(y_min, y_max + len(combinations)*vertical_step + vertical_step)

    # 保存图片
    plt.tight_layout()
    plt.savefig(f'4final-scaled-seed16_Boxplot_bd_{mirna}.svg', dpi=600, bbox_inches='tight')
    plt.close()
    
    # plt.show()

  sns.stripplot(


In [143]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
from matplotlib.ticker import MaxNLocator

# 读取数据
all = pd.read_csv('./mirna1-scaled/sample_probabilities.csv', encoding='GBK')

# 配置参数
selected_mirnas = ['Predicted_probability']
groups = ['MDD', 'BD']
target_group = "MDD"
palette = {
    'MDD': '#FFA500',   # 橙色
    'BD': '#6495ED'     # 宝蓝色
}

# 循环处理每个小RNA
for mirna in selected_mirnas:
    # 数据准备（不过滤异常值）
    value_df = all.dropna(subset=[mirna, 'True_label']).copy()
    value_df = value_df[value_df['True_label'].isin(groups)]
    
    # --- 创建过滤后的数据（仅用于绘图） ---
    filtered_data = pd.DataFrame()
    for group in groups:
        group_data = value_df[value_df['True_label'] == group]
        Q1 = group_data[mirna].quantile(0.25)
        Q3 = group_data[mirna].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        filtered_group = group_data[(group_data[mirna] >= lower_bound) & (group_data[mirna] <= upper_bound)]
        filtered_data = pd.concat([filtered_data, filtered_group])
    
    # 创建画布
    fig, ax = plt.subplots(figsize=(4.5, 6))
    
    # 绘制箱线图和散点图
    sns.boxplot(
        x='True_label', y=mirna, data=filtered_data, order=groups,
        palette=palette, ax=ax, linewidth=1.5, width=0.35, showfliers=False
    )
    
    sns.stripplot(
        x='True_label',
        y=mirna,
        data=filtered_data,
        order=groups,
        palette=palette,
        edgecolor='w',
        linewidth=0.4,
        size=5,
        alpha=0.75,
        jitter=0.15,
        ax=ax
    )

    # 坐标轴设置
    ax.tick_params(
        axis='both',
        which='both',
        direction='out',
        length=6,
        width=1.5,
        colors='black',
        bottom=True,
        left=True
    )
    for tick in ax.xaxis.get_major_ticks():
        tick.tick1line.set_visible(True)
    for tick in ax.yaxis.get_major_ticks():
        tick.tick1line.set_visible(True)
    
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)
    ax.yaxis.set_major_locator(MaxNLocator(nbins=5))
    
    # 样式设置
    ax.set_title('Predicted Probability Distribution', fontsize=16)
    ax.set_ylabel('Predicted Probability', fontsize=12)
    ax.set_xlabel('')
    for spine in ax.spines.values():
        spine.set_edgecolor('black')
        spine.set_linewidth(1)
    ax.grid(True, linestyle='--', alpha=0.7)
    
    # 统计检验（Mann-Whitney U检验）
    combinations = [(groups[0], groups[1])]  # 直接比较两组
    
    y_min, y_max = ax.get_ylim()
    vertical_step = (y_max - y_min) * 0.08

    for i, (group1, group2) in enumerate(combinations):
        # 使用原始未过滤数据
        data1 = value_df[value_df['True_label'] == group1][mirna]
        data2 = value_df[value_df['True_label'] == group2][mirna]
        
        # Mann-Whitney U检验
        stat, p_val = stats.mannwhitneyu(data1, data2, alternative='two-sided')
        
        # 标注位置
        x1 = groups.index(group1)
        x2 = groups.index(group2)
        y_pos = y_max + (i+1)*vertical_step
        
        # 绘制横线
        ax.plot([x1, x2], [y_pos, y_pos], lw=1.5, color='black')
        
        # 生成标注
        if p_val < 0.001:
            p_text = '***'
        elif p_val < 0.01:
            p_text = '**'
        elif p_val < 0.05:
            p_text = '*'
        else:
            p_text = f'p={p_val:.3f}'
        
        ax.text(
            (x1+x2)/2, 
            y_pos + vertical_step/4, 
            p_text,
            ha='center',
            va='bottom',
            fontsize=12,
            backgroundcolor=(1, 1, 1, 0.5)
        )
    
    # 调整坐标轴范围
    ax.set_ylim(y_min, y_max + len(combinations)*vertical_step + vertical_step)

    # 保存输出
    plt.tight_layout()
    plt.savefig(f'4-1final-scaled_Boxplot_bd_{mirna}.svg', dpi=600, bbox_inches='tight')
    plt.close()

  sns.stripplot(


# mirna2

In [108]:

##数据读入
mirna=pd.read_csv('mirna2.csv',encoding='GBK')
mirna = mirna[mirna['group'].isin([2, 1])]
mirna['group'] = mirna['group'].replace(2, 0)

# 划分特征和标签
miRNA_list = ['mirna2']
X = mirna[miRNA_list]
y = mirna['group']


# 划分训练集和验证集 (7:3)
X_train, X_val, y_train, y_val = train_test_split(
    X, y, 
    test_size=0.3,
    stratify=y,
    random_state=18
)

from sklearn.preprocessing import StandardScaler
# 2. 只在训练集上计算标准化参数
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)  # 训练集：fit + transform

# 3. 用训练集的参数标准化验证集
X_val = scaler.transform(X_val)  

In [117]:
X_val

array([[-1.04973348],
       [-1.19185576],
       [-1.15237735],
       [-0.51390089],
       [-0.38649616],
       [ 0.43465482],
       [ 0.27674117],
       [ 0.77781697],
       [ 0.59698215],
       [ 0.32583652],
       [ 1.93483448],
       [ 1.08210078],
       [ 0.39057322],
       [-0.7260105 ],
       [-0.10225159],
       [ 0.63994256],
       [ 0.22835643],
       [ 2.10853949],
       [ 0.59256846],
       [-0.22068683],
       [ 0.37921923],
       [ 0.1036915 ],
       [ 0.14251457],
       [-0.25927303],
       [ 0.11405853],
       [ 0.74258643],
       [-1.25502122],
       [ 0.01618365],
       [-0.63126231],
       [-0.28385229],
       [-0.9076112 ],
       [-0.46545298],
       [-0.16541705],
       [-0.63126231],
       [-0.40228752],
       [ 0.01401233],
       [ 0.85299965],
       [ 0.95576986],
       [ 6.56960007],
       [-0.4736566 ],
       [ 0.23726276],
       [-0.43387025],
       [-0.41123333],
       [-0.37070479]])

In [110]:
param_grids = {
    'Logistic Regression': {
        'C': [0.01, 0.1, 1, 10, 100],
        'solver': ['saga'],
        'penalty': ['l2'],
        'max_iter': [10000]
    },
    'SVM': {
        'C': [0.1, 1, 10, 100],
        'kernel': ['rbf'],
        'gamma': ['scale', 'auto']
    },
    'Random Forest': {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    },
    'GNB': {
        'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5]
    },
    'MLP': {
        'hidden_layer_sizes': [(10,), (50,), (10, 50), (50, 50)],
        'activation': ['relu', 'tanh'],
        'solver': ['adam'],
        'alpha': [0.0001, 0.001, 0.01],
        'learning_rate': ['constant', 'adaptive']
    },
    'GBDT': {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7],
        'subsample': [0.7, 0.8, 1.0]
    },
    'XGBoost': {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7],
        'subsample': [0.7, 0.8, 1.0]
    },
    'LightGBM': {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.2],
        'num_leaves': [31, 50, 100],
        'boosting_type': ['gbdt'],
        'subsample': [0.7, 0.8, 1.0]
    },
    'CatBoost': {
        'iterations': [100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.2],
        'depth': [3, 5, 7]
    },
    'AdaBoost': {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 1.0]
    }
}

# 定义模型集合
models = {
    'Logistic Regression': LogisticRegression(),
    'SVM': SVC(probability=True, random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'GNB': GaussianNB(),
    'MLP': MLPClassifier(max_iter=500, random_state=42),
    'GBDT': GradientBoostingClassifier(random_state=42),
    'XGBoost': xgb.XGBClassifier(random_state=42),
    'LightGBM': lgb.LGBMClassifier(random_state=42),
    'CatBoost': CatBoostClassifier(verbose=0, random_state=42),
    'AdaBoost': AdaBoostClassifier(random_state=42)
}

In [111]:
# 创建结果目录
result_path = './mirna2-scaled/'
os.makedirs(result_path, exist_ok=True)
os.makedirs(os.path.join(result_path, 'model'), exist_ok=True)
os.makedirs(os.path.join(result_path, 'figure'), exist_ok=True)

best_models = {}

# 超参数优化（使用训练集的5折交叉验证）
for model_name, model in models.items():
    print(f"\n=== Training {model_name} ===")
    
    if model_name in param_grids:
        # 网格搜索
        grid_search = GridSearchCV(
            estimator=model,
            param_grid=param_grids[model_name],
            cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
            scoring='roc_auc',
            n_jobs=-1
        )
        grid_search.fit(X_train, y_train)
        
        # 保存最佳模型
        best_models[model_name] = grid_search.best_estimator_
        print(f"Best parameters: {grid_search.best_params_}")
    else:
        # 无参数优化的模型
        model.fit(X_train, y_train)
        best_models[model_name] = model


=== Training Logistic Regression ===
Best parameters: {'C': 0.01, 'max_iter': 10000, 'penalty': 'l2', 'solver': 'saga'}

=== Training SVM ===
Best parameters: {'C': 10, 'gamma': 'auto', 'kernel': 'rbf'}

=== Training Random Forest ===
Best parameters: {'max_depth': None, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 100}

=== Training GNB ===
Best parameters: {'var_smoothing': 1e-09}

=== Training MLP ===




Best parameters: {'activation': 'relu', 'alpha': 0.0001, 'hidden_layer_sizes': (50, 50), 'learning_rate': 'constant', 'solver': 'adam'}

=== Training GBDT ===
Best parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.7}

=== Training XGBoost ===
Best parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.7}

=== Training LightGBM ===
[LightGBM] [Info] Number of positive: 54, number of negative: 47
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000027 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 34
[LightGBM] [Info] Number of data points in the train set: 101, number of used features: 1
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.534653 -> initscore=0.138836
[LightGBM] [Info] Start training from score 0.138836
Best parameters: {'boosting_type': 'gbdt', 'learning_rate': 0.1, 'n_estimators': 100, 'num_leaves': 31, 'subsample': 

In [112]:
# 自定义评分函数
def sensitivity_score(y_true, y_pred):
    cm = confusion_matrix(y_true, y_pred)
    return cm[1, 1] / (cm[1, 1] + cm[1, 0])

def specificity_score(y_true, y_pred):
    cm = confusion_matrix(y_true, y_pred)
    return cm[0, 0] / (cm[0, 0] + cm[0, 1])

def youden_index_score(y_true, y_pred):
    sensitivity = sensitivity_score(y_true, y_pred)
    specificity = specificity_score(y_true, y_pred)
    return sensitivity + specificity - 1


# 验证集评估
# =============================================================================
# 评估指标收集
metrics_results = []
roc_data = {}
confusion_matrices = {}

# 置信区间参数
bootstrap_iterations = 1000
confidence_level = 95

for model_name, model in best_models.items():
    print(f"\n=== Evaluating {model_name} ===")
    
    # 预测验证集
    y_pred = model.predict(X_val)
    y_proba = model.predict_proba(X_val)[:, 1] if hasattr(model, "predict_proba") else model.decision_function(X_val)
    
    # 计算基础指标
    metrics = {
        'Model': model_name,
        'AUC': roc_auc_score(y_val, y_proba),
        'Accuracy': accuracy_score(y_val, y_pred),
        'Sensitivity': sensitivity_score(y_val, y_pred),
        'Specificity': specificity_score(y_val, y_pred),
        'Youden Index': youden_index_score(y_val, y_pred),
        'PPV': precision_score(y_val, y_pred),
        'NPV': precision_score(y_val, y_pred, pos_label=0),
        'F1 Score': f1_score(y_val, y_pred),
        'MCC': matthews_corrcoef(y_val, y_pred)
    }
    
    # 存储ROC数据
    fpr, tpr, _ = roc_curve(y_val, y_proba)
    roc_data[model_name] = (fpr, tpr, metrics['AUC'])
    
    # 存储混淆矩阵
    cm = confusion_matrix(y_val, y_pred)
    confusion_matrices[model_name] = cm
    
    # Bootstrap计算置信区间
    bootstrap_metrics = {key: [] for key in metrics if key != 'Model'}
    
    for _ in range(bootstrap_iterations):
        # Bootstrap重采样
        indices = resample(np.arange(len(y_val)), replace=True)
        y_val_boot = y_val.iloc[indices]
        y_pred_boot = y_pred[indices]
        y_proba_boot = y_proba[indices]
        
        # 计算指标
        try:
            bootstrap_metrics['AUC'].append(roc_auc_score(y_val_boot, y_proba_boot))
        except:
            bootstrap_metrics['AUC'].append(np.nan)
        bootstrap_metrics['Accuracy'].append(accuracy_score(y_val_boot, y_pred_boot))
        bootstrap_metrics['Sensitivity'].append(sensitivity_score(y_val_boot, y_pred_boot))
        bootstrap_metrics['Specificity'].append(specificity_score(y_val_boot, y_pred_boot))
        bootstrap_metrics['Youden Index'].append(youden_index_score(y_val_boot, y_pred_boot))
        bootstrap_metrics['PPV'].append(precision_score(y_val_boot, y_pred_boot))
        bootstrap_metrics['NPV'].append(precision_score(y_val_boot, y_pred_boot, pos_label=0))
        bootstrap_metrics['F1 Score'].append(f1_score(y_val_boot, y_pred_boot))
        bootstrap_metrics['MCC'].append(matthews_corrcoef(y_val_boot, y_pred_boot))
    
    # 计算置信区间
    for metric in bootstrap_metrics:
        lower = np.nanpercentile(bootstrap_metrics[metric], (100 - confidence_level)/2)
        upper = np.nanpercentile(bootstrap_metrics[metric], 100 - (100 - confidence_level)/2)
        metrics[f'{metric}_lower'] = lower
        metrics[f'{metric}_upper'] = upper
    
    metrics_results.append(metrics)

# 保存评估结果
metrics_df = pd.DataFrame(metrics_results)
metrics_df.to_csv(os.path.join(result_path, 'validation_metrics.csv'), index=False)


=== Evaluating Logistic Regression ===


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr


=== Evaluating SVM ===

=== Evaluating Random Forest ===

=== Evaluating GNB ===

=== Evaluating MLP ===

=== Evaluating GBDT ===

=== Evaluating XGBoost ===

=== Evaluating LightGBM ===

=== Evaluating CatBoost ===

=== Evaluating AdaBoost ===


In [122]:
# 绘制ROC曲线
plt.rcParams['font.family'] = 'Arial'
plt.figure(figsize=(10, 8))
for name, (fpr, tpr, auc) in roc_data.items():
    if name == 'GNB':
        continue  # 跳过GNB
    plt.plot(fpr, tpr, 
             label=f'{name} (AUC = {auc:.3f} [{metrics_df[metrics_df.Model==name]["AUC_lower"].values[0]:.3f}-{metrics_df[metrics_df.Model==name]["AUC_upper"].values[0]:.3f}])')

plt.plot([0, 1], [0, 1], 'k--', lw=2)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.01])
plt.xlabel('False Positive Rate', fontsize=12)
plt.ylabel('True Positive Rate', fontsize=12)
plt.title('ROC Curves on Validation Set', fontsize=15)
plt.legend(loc='lower right', fontsize=10)
# 移除右边和上边边界
plt.gca().spines['right'].set_visible(False)
plt.gca().spines['top'].set_visible(False)

plt.grid(False)

plt.savefig(os.path.join(result_path, 'figure', 'ROC_curves.svg'), bbox_inches='tight')
plt.close()

In [123]:
# 遍历每个模型，绘制验证集混淆矩阵
for name, model in best_models.items():
    # 使用最佳模型进行验证集预测
    y_val_pred = model.predict(X_val)
    
    # 计算混淆矩阵
    cm = confusion_matrix(y_val, y_val_pred)
    
    # 调整矩阵显示顺序（如果需要保持原逻辑）
    cm = np.array([[cm[1, 1], cm[1, 0]],  # 修改后格式：[[TP, FN], [FP, TN]]
                   [cm[0, 1], cm[0, 0]]])
    
    # 将数值转换为百分比
    cm_percentage = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] * 100
    
    # 创建图形
    plt.figure(figsize=(8, 6))
    
    # 绘制热力图
    sns.heatmap(cm_percentage, annot=False, fmt=".2f", cmap='Blues', 
                cbar=True, square=True,
                xticklabels=['Positive', 'Negative'], 
                yticklabels=['Positive', 'Negative'])
    
    # 添加数值标注
    for i in range(cm_percentage.shape[0]):
        for j in range(cm_percentage.shape[1]):
            color = 'black' if cm_percentage[i, j] < 50 else 'white'
            plt.text(j + 0.5, i + 0.5, f"{cm_percentage[i, j]:.2f}%", 
                     ha='center', va='center', color=color, fontsize=12)
    
    # 添加标签和标题
    plt.title(f'{name} Validation Confusion Matrix (Percentage)', fontsize=14)
    plt.xlabel('Predicted Label', fontsize=12)
    plt.ylabel('True Label', fontsize=12)
    
    # 保存和显示
    plt.tight_layout()
    plt.savefig(f'{result_path}{name}_validation_confusion_matrix.svg', format='svg')
    plt.close()  # 关闭图形避免内存泄漏
    
    
# =============================================================================
# 模型保存
# =============================================================================

import joblib

# 保存每个最佳模型
for name, model in best_models.items():
    joblib.dump(model, f'{result_path}/model/{name}_model.pkl')
    print(f"{name} model saved successfully.")

print("\n=== 所有模型训练和评估完成 ===")
print(f"结果保存在目录: {os.path.abspath(result_path)}")

Logistic Regression model saved successfully.
SVM model saved successfully.
Random Forest model saved successfully.
GNB model saved successfully.
MLP model saved successfully.
GBDT model saved successfully.
XGBoost model saved successfully.
LightGBM model saved successfully.
CatBoost model saved successfully.
AdaBoost model saved successfully.

=== 所有模型训练和评估完成 ===
结果保存在目录: d:\adult_dep\实验结果\2025.4.8model\mirna2-scaled


In [None]:
# 原始情况运行
# 
# =======================================================================

# 寻找AUC最高的模型，考虑置信区间
max_auc = metrics_df['AUC'].max()
candidates = metrics_df[metrics_df['AUC'] == max_auc]

# 处理并列情况
if len(candidates) > 1:
    print("\n发现多个模型具有相同AUC值，正在进行置信区间比较...")
    
    # 优先选择置信区间上限更高的模型
    candidates['AUC_upper'] = candidates['AUC_upper'].astype(float)
    best_model_name = candidates.loc[candidates['AUC_upper'].idxmax(), 'Model']
    
    # 获取比较结果
    compare_df = candidates[['Model', 'AUC', 'AUC_lower', 'AUC_upper']]
    print(f"候选模型比较：\n{compare_df.to_string(index=False)}")
    print(f"最终选择：{best_model_name} (最高置信区间上限)")
else:
    best_model_name = candidates.iloc[0]['Model']

# 获取最优模型
best_model = best_models[best_model_name]

# 获取所有验证集样本预测为1的概率
y_proba = best_model.predict_proba(X_val)[:, 1]

# 创建包含详细信息的DataFrame
prob_df = pd.DataFrame({
    'Index': X_val.index,
    'True_label': y_val.values,
    'Predicted_probability': y_proba
}).sort_values('Index')

# 保存结果时添加模型信息
filename = f"sample_probabilities_{best_model_name.replace(' ', '_')}.csv"
prob_df.to_csv(os.path.join(result_path, filename), index=False)

print(f"\n=== 最终选择模型 {best_model_name} ===")
print(f"* AUC值: {max_auc:.4f}")
print(f"* 95%置信区间: [{candidates[candidates['Model']==best_model_name]['AUC_lower'].values[0]:.4f}, "
      f"{candidates[candidates['Model']==best_model_name]['AUC_upper'].values[0]:.4f}]")
print(f"* 预测概率文件已保存至：{os.path.join(result_path, filename)}")

# # 可选：保存所有候选模型的预测概率
# if len(candidates) > 1:
#     print("\n=== 保存所有候选模型的预测概率 ===")
#     for _, row in candidates.iterrows():
#         model_name = row['Model']
#         model = best_models[model_name]
        
#         y_proba = model.predict_proba(X_val)[:, 1]
#         prob_df = pd.DataFrame({
#             'Index': X_val.index,
#             'True_label': y_val.values,
#             f'Prob_{model_name}': y_proba
#         }).sort_values('Index')
        
#         filename = f"candidate_probabilities_{model_name.replace(' ', '_')}.csv"
#         prob_df.to_csv(os.path.join(result_path, filename), index=False)
#         print(f"已保存 {model_name} 的预测概率")

In [124]:
# =======================================================================

# 寻找AUC最高的模型，考虑置信区间
max_auc = metrics_df['AUC'].max()
candidates = metrics_df[metrics_df['AUC'] == max_auc]

# 处理并列情况
if len(candidates) > 1:
    print("\n发现多个模型具有相同AUC值，正在进行置信区间比较...")
    
    # 优先选择置信区间上限更高的模型
    candidates['AUC_upper'] = candidates['AUC_upper'].astype(float)
    best_model_name = candidates.loc[candidates['AUC_upper'].idxmax(), 'Model']
    
    # 获取比较结果
    compare_df = candidates[['Model', 'AUC', 'AUC_lower', 'AUC_upper']]
    print(f"候选模型比较：\n{compare_df.to_string(index=False)}")
    print(f"最终选择：{best_model_name} (最高置信区间上限)")
else:
    best_model_name = candidates.iloc[0]['Model']

# 获取最优模型
best_model = best_models[best_model_name]

# 获取所有验证集样本预测为1的概率
y_proba = best_model.predict_proba(X_val)[:, 1]

# 创建包含详细信息的DataFrame（无需索引）
prob_df = pd.DataFrame({
    'True_label': y_val.values,          # 使用验证集的真实标签
    'Predicted_probability': y_proba     # 预测概率直接对应验证集顺序
})

# 保存结果时添加模型信息
filename = f"sample_probabilities_{best_model_name.replace(' ', '_')}.csv"
prob_df.to_csv(os.path.join(result_path, filename), index=False)  # 禁用索引保存

print(f"\n=== 最终选择模型 {best_model_name} ===")
print(f"* AUC值: {max_auc:.4f}")
print(f"* 95%置信区间: [{candidates[candidates['Model']==best_model_name]['AUC_lower'].values[0]:.4f}, "
      f"{candidates[candidates['Model']==best_model_name]['AUC_upper'].values[0]:.4f}]")
print(f"* 预测概率文件已保存至：{os.path.join(result_path, filename)}")


发现多个模型具有相同AUC值，正在进行置信区间比较...
候选模型比较：
              Model      AUC  AUC_lower  AUC_upper
Logistic Regression 0.785417   0.634426   0.904792
                GNB 0.785417   0.641990   0.909185
最终选择：GNB (最高置信区间上限)

=== 最终选择模型 GNB ===
* AUC值: 0.7854
* 95%置信区间: [0.6420, 0.9092]
* 预测概率文件已保存至：./mirna2-scaled/sample_probabilities_GNB.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  candidates['AUC_upper'] = candidates['AUC_upper'].astype(float)


In [126]:
# 寻找AUC最高的最优模型
best_model_name = metrics_df.loc[metrics_df['AUC'].idxmax(), 'Model']
best_model = best_models[best_model_name]

# 获取所有验证集样本预测为1的概率
y_proba = best_model.predict_proba(X_val)[:, 1]  # 取正类的概率

# 创建包含原始索引的DataFrame
prob_df = pd.DataFrame({
    # 'Index': X_val.index,          # 保留原始索引
    'True_label': y_val,           # 真实标签
    'Predicted_probability': y_proba  # 预测概率
})

# 按原始索引排序（如果需要保持原始顺序）
# prob_df = prob_df.sort_values('Index')

# 保存结果
prob_df.to_csv(os.path.join(result_path, 'sample_probabilities.csv'), index=False)

print(f"\n=== 最优模型 {best_model_name} 的预测概率已保存 ===")
print(f"文件路径：{os.path.join(result_path, 'sample_probabilities.csv')}")


=== 最优模型 Logistic Regression 的预测概率已保存 ===
文件路径：./mirna2-scaled/sample_probabilities.csv


In [127]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
from matplotlib.ticker import MaxNLocator

# 读取数据
all = pd.read_csv('./mirna2-scaled/sample_probabilities.csv', encoding='GBK')

# 配置参数
selected_mirnas = ['Predicted_probability']
groups = ['MDD', 'BD']
target_group = "MDD"
palette = {
    'MDD': '#FFA500',   # 更现代的珊瑚色
    'BD': '#6495ED'
}

# 循环处理每个小RNA
for mirna in selected_mirnas:
    # 数据准备（不过滤异常值）
    value_df = all.dropna(subset=[mirna, 'True_label']).copy()
    value_df = value_df[value_df['True_label'].isin(groups)]
    
    # --- 创建过滤后的数据（仅用于绘图） ---
    filtered_data = pd.DataFrame()
    for group in groups:
        group_data = value_df[value_df['True_label'] == group]
        Q1 = group_data[mirna].quantile(0.25)
        Q3 = group_data[mirna].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        filtered_group = group_data[(group_data[mirna] >= lower_bound) & (group_data[mirna] <= upper_bound)]
        filtered_data = pd.concat([filtered_data, filtered_group])
    
    # 创建画布
    fig, ax = plt.subplots(figsize=(4.5, 6))
    
    # 绘制箱线图和散点图
    sns.boxplot(
        x='True_label', y=mirna, data=filtered_data, order=groups,
        palette=palette, ax=ax, linewidth=1.5, width=0.35, showfliers=False
    )
    # sns.stripplot(
    #     x='Group', y=mirna, data=filtered_data, order=groups,
    #     color='black', alpha=0.65, jitter=True, ax=ax
    # )
    
    
        # ----- 散点图优化 -----
    sns.stripplot(
        x='True_label',
        y=mirna,
        data=filtered_data,
        order=groups,
        palette=palette,       # 与箱线图颜色一致
        edgecolor='w',         # 白色描边
        linewidth=0.4,         # 描边粗细
        size=5,                # 点大小
        alpha=0.75,            # 透明度
        jitter=0.15,           # 抖动幅度
        ax=ax
    )
    # ----------------------------
    # 关键修改点：确保刻度线设置的优先级
    # ----------------------------
    
    # 1. 显式设置刻度线样式
    ax.tick_params(
        axis='both',
        which='both',
        direction='out',
        length=6,
        width=1.5,
        colors='black',
        bottom=True,   # 强制显示x轴刻度线
        left=True      # 强制显示y轴刻度线
    )
    
    # 2. 直接操作轴的底层对象
    for tick in ax.xaxis.get_major_ticks():
        tick.tick1line.set_visible(True)  # 显示x轴主刻度线
    for tick in ax.yaxis.get_major_ticks():
        tick.tick1line.set_visible(True)  # 显示y轴主刻度线
    
    # 3. 确保其他样式设置不会覆盖刻度线
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)
    ax.yaxis.set_major_locator(MaxNLocator(nbins=5))
    
    # 其他样式设置
    ax.set_title('Predicted Probability Distribution', fontsize=16)
    ax.set_ylabel('Predicted probability', fontsize=12)
    ax.set_xlabel('')
    for spine in ax.spines.values():
        spine.set_edgecolor('black')
        spine.set_linewidth(1)
    ax.grid(True, linestyle='--', alpha=0.7)
    
    # 生成需要比较的组合
    compare_groups = [g for g in groups if g != target_group]
    combinations = [(target_group, other) for other in compare_groups]
    
    # 统计标注参数
    y_min, y_max = ax.get_ylim()
    vertical_step = (y_max - y_min) * 0.08  # 调整标注间距

    # 进行两两比较（使用原始未过滤数据）
    for i, (group1, group2) in enumerate(combinations):
        # 获取数据（使用未过滤的value_df）
        data1 = value_df[value_df['True_label'] == group1][mirna]
        data2 = value_df[value_df['True_label'] == group2][mirna]
        
        # # 检查数据量
        # if len(data1) < 2 or len(data2) < 2:
        #     print(f"跳过{group1} vs {group2}（数据不足）")
        #     continue
        
        # 方差齐性检验
        levene_stat, levene_p = stats.levene(data1, data2)
        equal_var = levene_p >= 0.05
        
        # t检验
        t_stat, p_val = stats.ttest_ind(data1, data2, equal_var=equal_var)
        
        # 标注位置计算
        x1 = groups.index(group1)
        x2 = groups.index(group2)
        y_pos = y_max + (i+1)*vertical_step
        
        # 绘制横线
        ax.plot([x1, x2], [y_pos, y_pos], lw=1.5, color='black')
        
        # 生成标注文本
        if p_val < 0.001:
            p_text = '***'
        elif p_val < 0.01:
            p_text = '**'
        elif p_val < 0.05:
            p_text = '*'
        else:
            p_text = f'p={p_val:.3f}'
        
        # 添加文本
        ax.text(
            (x1+x2)/2, 
            y_pos + vertical_step/4, 
            p_text,
            ha='center',
            va='bottom',
            fontsize=12,
            backgroundcolor=(1, 1, 1, 0.5)  # 添加半透明背景
        )
    
    # 调整y轴范围
    ax.set_ylim(y_min, y_max + len(combinations)*vertical_step + vertical_step)

    # 保存图片
    plt.tight_layout()
    plt.savefig(f'5final-scaled_Boxplot_bd_{mirna}.svg', dpi=600, bbox_inches='tight')
    plt.close()
    
    # plt.show()

  sns.stripplot(


In [142]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
from matplotlib.ticker import MaxNLocator

# 读取数据
all = pd.read_csv('./mirna2-scaled/sample_probabilities.csv', encoding='GBK')

# 配置参数
selected_mirnas = ['Predicted_probability']
groups = ['MDD', 'BD']
target_group = "MDD"
palette = {
    'MDD': '#FFA500',   # 橙色
    'BD': '#6495ED'     # 宝蓝色
}

# 循环处理每个小RNA
for mirna in selected_mirnas:
    # 数据准备（不过滤异常值）
    value_df = all.dropna(subset=[mirna, 'True_label']).copy()
    value_df = value_df[value_df['True_label'].isin(groups)]
    
    # --- 创建过滤后的数据（仅用于绘图） ---
    filtered_data = pd.DataFrame()
    for group in groups:
        group_data = value_df[value_df['True_label'] == group]
        Q1 = group_data[mirna].quantile(0.25)
        Q3 = group_data[mirna].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        filtered_group = group_data[(group_data[mirna] >= lower_bound) & (group_data[mirna] <= upper_bound)]
        filtered_data = pd.concat([filtered_data, filtered_group])
    
    # 创建画布
    fig, ax = plt.subplots(figsize=(4.5, 6))
    
    # 绘制箱线图和散点图
    sns.boxplot(
        x='True_label', y=mirna, data=filtered_data, order=groups,
        palette=palette, ax=ax, linewidth=1.5, width=0.35, showfliers=False
    )
    
    sns.stripplot(
        x='True_label',
        y=mirna,
        data=filtered_data,
        order=groups,
        palette=palette,
        edgecolor='w',
        linewidth=0.4,
        size=5,
        alpha=0.75,
        jitter=0.15,
        ax=ax
    )

    # 坐标轴设置
    ax.tick_params(
        axis='both',
        which='both',
        direction='out',
        length=6,
        width=1.5,
        colors='black',
        bottom=True,
        left=True
    )
    for tick in ax.xaxis.get_major_ticks():
        tick.tick1line.set_visible(True)
    for tick in ax.yaxis.get_major_ticks():
        tick.tick1line.set_visible(True)
    
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)
    ax.yaxis.set_major_locator(MaxNLocator(nbins=5))
    
    # 样式设置
    ax.set_title('Predicted Probability Distribution', fontsize=16)
    ax.set_ylabel('Predicted Probability', fontsize=12)
    ax.set_xlabel('')
    for spine in ax.spines.values():
        spine.set_edgecolor('black')
        spine.set_linewidth(1)
    ax.grid(True, linestyle='--', alpha=0.7)
    
    # 统计检验（Mann-Whitney U检验）
    combinations = [(groups[0], groups[1])]  # 直接比较两组
    
    y_min, y_max = ax.get_ylim()
    vertical_step = (y_max - y_min) * 0.08

    for i, (group1, group2) in enumerate(combinations):
        # 使用原始未过滤数据
        data1 = value_df[value_df['True_label'] == group1][mirna]
        data2 = value_df[value_df['True_label'] == group2][mirna]
        
        # Mann-Whitney U检验
        stat, p_val = stats.mannwhitneyu(data1, data2, alternative='two-sided')
        
        # 标注位置
        x1 = groups.index(group1)
        x2 = groups.index(group2)
        y_pos = y_max + (i+1)*vertical_step
        
        # 绘制横线
        ax.plot([x1, x2], [y_pos, y_pos], lw=1.5, color='black')
        
        # 生成标注
        if p_val < 0.001:
            p_text = '***'
        elif p_val < 0.01:
            p_text = '**'
        elif p_val < 0.05:
            p_text = '*'
        else:
            p_text = f'p={p_val:.3f}'
        
        ax.text(
            (x1+x2)/2, 
            y_pos + vertical_step/4, 
            p_text,
            ha='center',
            va='bottom',
            fontsize=12,
            backgroundcolor=(1, 1, 1, 0.5)
        )
    
    # 调整坐标轴范围
    ax.set_ylim(y_min, y_max + len(combinations)*vertical_step + vertical_step)

    # 保存输出
    plt.tight_layout()
    plt.savefig(f'5-1final-scaled_Boxplot_bd_{mirna}.svg', dpi=600, bbox_inches='tight')
    plt.close()

  sns.stripplot(


# mirna1+2

In [189]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer

# 数据读入与预处理
mirna = pd.read_csv('mirna1+2.csv', encoding='GBK')
mirna = mirna[mirna['group'].isin([2, 1])]
mirna['group'] = mirna['group'].replace(2, 0)

# 检查缺失情况
print("原始数据缺失统计：")
print(mirna[['mirna1', 'mirna2']].isnull().sum())

# 按group分组插值
imputed_dfs = []
for group in [0, 1]:
    # 提取当前组数据
    group_mask = mirna['group'] == group
    group_data = mirna.loc[group_mask, ['mirna1', 'mirna2']].copy()
    
    # 仅处理有缺失的组
    if group_data.isnull().sum().sum() > 0:
        # 创建KNN插值器（调整参数示例）
        imputer = KNNImputer(
            n_neighbors=min(4, len(group_data)-1),  # 自适应邻居数
            weights='distance'                     # 距离加权
        )
        # 执行插值
        imputed_values = imputer.fit_transform(group_data)
        # 更新数据
        mirna.loc[group_mask, ['mirna1', 'mirna2']] = imputed_values

# 验证结果
print("\n插值后缺失统计：")
print(mirna[['mirna1', 'mirna2']].isnull().sum())

# 划分特征与标签
miRNA_list = ['mirna1', 'mirna2']
X = mirna[miRNA_list]
y = mirna['group']

# 后续处理（示例）
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# 划分数据集
X_train, X_val, y_train, y_val = train_test_split(
    X, y, 
    test_size=0.3,
    stratify=y,
    random_state=18
)

# 标准化处理
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

原始数据缺失统计：
mirna1    0
mirna2    0
dtype: int64

插值后缺失统计：
mirna1    0
mirna2    0
dtype: int64
