In [None]:
# =============================================================================
# 模块 2: 嵌套CV的单变量逻辑回归特征选择（含FDR多重检验校正）
# =============================================================================
# ✅ 正确做法：特征选择严格嵌套在交叉验证内部
# 流程：每一折 CV 内：仅在训练折上做特征选择 → 训练模型 → 在验证折上评估
# 
# 作用：对每个特征独立进行逻辑回归，评估其与目标变量的关系，评估每个特征的边际显著性
# 方法：在每一折CV内独立进行单变量分析，计算每个特征的 p 值、OR 值和置信区间，并进行FDR多重检验校正
# 输入：X_train_df, y_train, feature_names (来自模块 1)
# 输出：
#   - univariate_results_cv: 每折的特征选择结果
#   - feature_stability: 特征在各折中被选中的稳定性统计
#   - univariate_results_full: 全数据集上的参考结果（仅用于可视化对比，不用于模型训练）
# =============================================================================

from scipy import stats
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from statsmodels.stats.multitest import multipletests  # 用于FDR校正

# Configuration parameters
UNIVARIATE_P_THRESHOLD = 0.1      # lenient raw p-value cutoff, used for initial screening
UNIVARIATE_P_STRICT = 0.05        # strict raw p-value cutoff
FDR_ALPHA = 0.05                  # significance level passed to the FDR correction
N_SPLITS = 5                      # number of cross-validation folds

# Console banner announcing this module and the per-fold selection strategy
print("=" * 80)
print("模块 2: 嵌套CV的单变量逻辑回归特征选择（含FDR校正）")
print("=" * 80)
print("\n✅ 采用正确的嵌套CV方法 - 避免数据泄露")
print("   每一折内独立进行特征选择，确保验证集不影响特征筛选过程\n")

# Column order shared by every row produced by univariate_logistic_regression.
_UNIVARIATE_COLUMNS = [
    'Feature', 'Coefficient', 'Std_Error', 'Z_Score', 'P_Value',
    'OR', 'OR_CI_Lower', 'OR_CI_Upper', 'Significant_0.05', 'Significant_0.1',
]


def _non_informative_row(feature):
    """Return the placeholder row used when a feature cannot be tested."""
    return {
        'Feature': feature,
        'Coefficient': np.nan,
        'Std_Error': np.nan,
        'Z_Score': np.nan,
        'P_Value': 1.0,
        'OR': np.nan,
        'OR_CI_Lower': np.nan,
        'OR_CI_Upper': np.nan,
        'Significant_0.05': False,
        'Significant_0.1': False,
    }


def _feature_column(X_df, feature, position):
    """Return one feature of X_df as an (n_samples, 1) array.

    The column is looked up by label so the statistics are attached to the
    right feature name even when feature_names is ordered differently from
    X_df.columns. When the label is missing or duplicated, the positional
    lookup of the original implementation is used instead.
    """
    try:
        location = X_df.columns.get_loc(feature)
    except (KeyError, TypeError):
        location = position
    if not isinstance(location, (int, np.integer)):
        # Duplicated label: get_loc returns a slice or mask, not a position.
        location = position
    return X_df.iloc[:, location].values.reshape(-1, 1)


def univariate_logistic_regression(X_df, y, feature_names):
    """
    Fit one unpenalised logistic regression per feature and report Wald statistics.

    Parameters:
        X_df: feature DataFrame; each feature is used as the single predictor
            of its own model
        y: target variable, flattened to 1-D before fitting
        feature_names: names of the features to test

    Returns:
        DataFrame with one row per entry of feature_names and the columns
        Feature, Coefficient, Std_Error, Z_Score, P_Value, OR, OR_CI_Lower,
        OR_CI_Upper, Significant_0.05 and Significant_0.1. The columns are
        present even when feature_names is empty. A feature with (near-)zero
        variance, or whose fit raises, gets NaN statistics and P_Value = 1.0;
        a failed fit additionally emits a RuntimeWarning.
    """
    import warnings  # local import so the function does not rely on a file-level one

    y_array = np.array(y).ravel()
    results = []

    for i, col in enumerate(feature_names):
        try:
            X_single = _feature_column(X_df, col, i)

            # A constant feature carries no information and cannot be fitted
            if np.std(X_single) < 1e-10:
                results.append(_non_informative_row(col))
                continue

            # Unregularised fit (penalty=None)
            model = LogisticRegression(
                penalty=None,
                solver='lbfgs',
                max_iter=1000,
                random_state=42
            )
            model.fit(X_single, y_array)
            coef = model.coef_[0][0]

            # Wald test: clip the fitted probabilities so the weights never hit 0
            pred_proba = np.clip(model.predict_proba(X_single)[:, 1], 1e-10, 1 - 1e-10)
            W = pred_proba * (1 - pred_proba)

            # Design matrix with an intercept column in position 0
            X_with_intercept = np.hstack([np.ones((X_single.shape[0], 1)), X_single])

            # Fisher information X' diag(W) X, computed by row-scaling so the
            # n x n diagonal matrix is never materialised
            fisher_info = X_with_intercept.T @ (W[:, None] * X_with_intercept)

            try:
                variance = np.linalg.inv(fisher_info)[1, 1]
                std_error = np.sqrt(variance) if variance >= 0 else np.nan
            except np.linalg.LinAlgError:
                std_error = np.nan

            # Wald z statistic and two-sided p-value; the survival function
            # keeps precision for large |z| where 1 - cdf rounds to 0
            if std_error > 0:
                z_score = coef / std_error
                p_value = 2 * stats.norm.sf(abs(z_score))
            else:
                z_score = np.nan
                p_value = 1.0

            # Odds ratio and its 95% confidence interval
            OR = np.exp(coef)
            if not np.isnan(std_error):
                OR_CI_lower = np.exp(coef - 1.96 * std_error)
                OR_CI_upper = np.exp(coef + 1.96 * std_error)
            else:
                OR_CI_lower = np.nan
                OR_CI_upper = np.nan

            results.append({
                'Feature': col,
                'Coefficient': coef,
                'Std_Error': std_error,
                'Z_Score': z_score,
                'P_Value': p_value,
                'OR': OR,
                'OR_CI_Lower': OR_CI_lower,
                'OR_CI_Upper': OR_CI_upper,
                'Significant_0.05': p_value < 0.05,
                'Significant_0.1': p_value < 0.1
            })

        except Exception as exc:
            # Best effort: one failing feature must not abort the whole scan,
            # but the failure is reported instead of being silently dropped
            warnings.warn(
                f"univariate logistic regression failed for feature {col!r}: {exc}",
                RuntimeWarning,
                stacklevel=2,
            )
            results.append(_non_informative_row(col))

    return pd.DataFrame(results, columns=_UNIVARIATE_COLUMNS)

def apply_fdr_correction(univariate_df, alpha=0.05):
    """
    Append Benjamini-Hochberg FDR results to a univariate result table.

    Parameters:
        univariate_df: DataFrame with a 'P_Value' column
        alpha: FDR level passed to the correction

    Returns:
        A copy of univariate_df with two extra columns: 'P_Value_FDR'
        (adjusted p-values) and 'FDR_Significant' (rejection flags).
    """
    # Missing p-values are treated as 1.0 before the correction
    raw_p = univariate_df['P_Value'].fillna(1.0).values

    # multipletests returns (reject, corrected p-values, Sidak alpha, Bonferroni alpha)
    is_rejected, adjusted_p, _sidak_alpha, _bonf_alpha = multipletests(
        raw_p, alpha=alpha, method='fdr_bh'
    )

    corrected_df = univariate_df.copy()
    corrected_df['P_Value_FDR'] = adjusted_p
    corrected_df['FDR_Significant'] = is_rejected
    return corrected_df

# =============================================================================
# Cross-validation loop: feature selection is repeated independently per fold
# =============================================================================
print("\n" + "=" * 80)
print("步骤 2.1: 在交叉验证的每一折内独立进行特征选择")
print("=" * 80)

# Shuffled, stratified splitter with a fixed seed
cv_splitter = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

# Per-fold outputs, keyed 'fold_1' ... 'fold_<N_SPLITS>'
univariate_results_cv = {}
selected_features_cv = {}

print(f"\n开始 {N_SPLITS} 折交叉验证特征选择...\n")

for fold_idx, (train_idx, val_idx) in enumerate(cv_splitter.split(X_train_df, y_train), 1):
    print('=' * 60)
    print(f"  折 {fold_idx}/{N_SPLITS}")
    print('=' * 60)

    # 1. Slice the fold; only the training part feeds the selection below
    X_fold_train, y_fold_train = X_train_df.iloc[train_idx], y_train.iloc[train_idx]
    X_fold_val, y_fold_val = X_train_df.iloc[val_idx], y_train.iloc[val_idx]

    print(f"  训练集样本数: {len(train_idx)}")
    print(f"  验证集样本数: {len(val_idx)}")
    print(f"  训练集正样本比例: {y_fold_train.mean():.2%}")
    print(f"  验证集正样本比例: {y_fold_val.mean():.2%}")

    # 2. Univariate tests on the training part only (validation rows untouched)
    print("\n  正在训练折上进行单变量逻辑回归...")
    univariate_fold = univariate_logistic_regression(X_fold_train, y_fold_train, feature_names)

    # 3. FDR correction, then rank by raw p-value
    print("  正在应用FDR校正...")
    univariate_fold = (
        apply_fdr_correction(univariate_fold, alpha=FDR_ALPHA)
        .sort_values('P_Value')
        .reset_index(drop=True)
    )

    # 4. Feature lists under the four selection rules
    sig_features_fdr_fold = univariate_fold.loc[
        univariate_fold['FDR_Significant'], 'Feature'
    ].tolist()
    sig_features_fdr_01_fold = univariate_fold.loc[
        univariate_fold['P_Value_FDR'] < 0.1, 'Feature'
    ].tolist()
    sig_features_01_fold = univariate_fold.loc[
        univariate_fold['P_Value'] < UNIVARIATE_P_THRESHOLD, 'Feature'
    ].tolist()
    sig_features_005_fold = univariate_fold.loc[
        univariate_fold['P_Value'] < UNIVARIATE_P_STRICT, 'Feature'
    ].tolist()

    # Keep the full table and the four lists for this fold
    fold_key = f'fold_{fold_idx}'
    univariate_results_cv[fold_key] = univariate_fold
    selected_features_cv[fold_key] = {
        'fdr_q005': sig_features_fdr_fold,
        'fdr_q01': sig_features_fdr_01_fold,
        'p01': sig_features_01_fold,
        'p005': sig_features_005_fold
    }

    # Per-fold report
    print(f"\n  折 {fold_idx} 特征选择结果:")
    print(f"    - FDR q < 0.05: {len(sig_features_fdr_fold)} 个特征")
    print(f"    - FDR q < 0.10: {len(sig_features_fdr_01_fold)} 个特征")
    print(f"    - 原始 p < 0.05: {len(sig_features_005_fold)} 个特征")
    print(f"    - 原始 p < 0.10: {len(sig_features_01_fold)} 个特征")

    print("  Top 5 特征:")
    display_cols = ['Feature', 'P_Value', 'P_Value_FDR', 'OR']
    print(univariate_fold[display_cols].head(5).to_string(index=False))
    print()

# =============================================================================
# Step 2.2: feature stability (how often each feature is selected across folds)
# =============================================================================
print("\n" + "=" * 80)
print("步骤 2.2: 分析特征选择稳定性")
print("=" * 80)

# Selection tallies: one {feature: number of folds} mapping per selection rule
feature_selection_counts = {'fdr_q005': {}, 'fdr_q01': {}, 'p01': {}, 'p005': {}}

for fold_name, features_dict in selected_features_cv.items():
    for method, features in features_dict.items():
        tally = feature_selection_counts[method]
        for feat in features:
            tally[feat] = tally.get(feat, 0) + 1

# One row per feature: raw counts plus the fraction of folds selecting it
stability_data = []
for feat in feature_names:
    n_fdr_q005 = feature_selection_counts['fdr_q005'].get(feat, 0)
    n_fdr_q01 = feature_selection_counts['fdr_q01'].get(feat, 0)
    n_p01 = feature_selection_counts['p01'].get(feat, 0)
    n_p005 = feature_selection_counts['p005'].get(feat, 0)
    stability_data.append({
        'Feature': feat,
        'FDR_q005_Count': n_fdr_q005,
        'FDR_q01_Count': n_fdr_q01,
        'P01_Count': n_p01,
        'P005_Count': n_p005,
        'FDR_q005_Stability': n_fdr_q005 / N_SPLITS,
        'FDR_q01_Stability': n_fdr_q01 / N_SPLITS,
        'P01_Stability': n_p01 / N_SPLITS,
        'P005_Stability': n_p005 / N_SPLITS
    })

feature_stability = (
    pd.DataFrame(stability_data)
    .sort_values('FDR_q005_Stability', ascending=False)
    .reset_index(drop=True)
)

# Headline counts for the FDR q<0.05 rule
fdr_q005_stability = feature_stability['FDR_q005_Stability']
n_in_all_folds = (fdr_q005_stability == 1.0).sum()
n_in_all_but_one = (fdr_q005_stability >= (N_SPLITS-1)/N_SPLITS).sum()
n_in_majority = (fdr_q005_stability > 0.5).sum()

print("\n特征稳定性统计 (按 FDR q<0.05 稳定性排序):")
print(f"  - 总特征数: {len(feature_names)}")
print(f"  - 在所有 {N_SPLITS} 折中都被选中的特征 (FDR q<0.05): {n_in_all_folds} 个")
print(f"  - 在至少 {N_SPLITS-1} 折中被选中的特征 (FDR q<0.05): {n_in_all_but_one} 个")
print(f"  - 在至少 {N_SPLITS//2+1} 折中被选中的特征 (FDR q<0.05): {n_in_majority} 个")

print("\nTop 20 最稳定特征:")
display_cols = ['Feature', 'FDR_q005_Count', 'FDR_q005_Stability', 'FDR_q01_Count', 'P005_Count']
print(feature_stability[display_cols].head(20).to_string(index=False))

# Persist the stability table
stability_path = f"{output_dir}/univariate_feature_stability_cv.csv"
feature_stability.to_csv(stability_path, index=False)
print(f"\n✅ 特征稳定性结果已保存至: {stability_path}")

# =============================================================================
# Step 2.3: recommended feature lists derived from the stability table
# =============================================================================
print("\n" + "=" * 80)
print("步骤 2.3: 基于稳定性生成推荐特征列表")
print("=" * 80)

# A feature is kept when it is selected in at least 60% of the folds
STABILITY_THRESHOLD = 0.6

# Evaluate the threshold once for all four stability columns
stability_columns = ['FDR_q005_Stability', 'FDR_q01_Stability', 'P01_Stability', 'P005_Stability']
meets_threshold = feature_stability[stability_columns] >= STABILITY_THRESHOLD
stable_feature_names = feature_stability['Feature']

sig_features_fdr_stable = stable_feature_names[meets_threshold['FDR_q005_Stability']].tolist()
sig_features_fdr_01_stable = stable_feature_names[meets_threshold['FDR_q01_Stability']].tolist()
sig_features_01_stable = stable_feature_names[meets_threshold['P01_Stability']].tolist()
sig_features_005_stable = stable_feature_names[meets_threshold['P005_Stability']].tolist()

print(f"\n✅ 推荐特征列表（稳定性阈值 >= {STABILITY_THRESHOLD:.0%}）:")
print(f"  - sig_features_fdr (FDR q<0.05, 稳定): {len(sig_features_fdr_stable)} 个 ⭐推荐")
print(f"  - sig_features_fdr_01 (FDR q<0.1, 稳定): {len(sig_features_fdr_01_stable)} 个")
print(f"  - sig_features_005 (p<0.05, 稳定): {len(sig_features_005_stable)} 个")
print(f"  - sig_features_01 (p<0.1, 稳定): {len(sig_features_01_stable)} 个")

# Publish the stable lists under the short names reported in the summary
sig_features_fdr, sig_features_fdr_01 = sig_features_fdr_stable, sig_features_fdr_01_stable
sig_features_005, sig_features_01 = sig_features_005_stable, sig_features_01_stable

# =============================================================================
# Step 2.4: reference results on the whole training set
# (for visual comparison only, not meant to drive model training)
# =============================================================================
print("\n" + "=" * 80)
print("步骤 2.4: 计算全数据集参考结果（仅用于可视化对比）")
print("=" * 80)
print("⚠️  注意: 此结果仅用于可视化和理解，不应用于后续模型训练")

# Same pipeline as inside each fold, applied once to all rows
univariate_results_full = (
    apply_fdr_correction(
        univariate_logistic_regression(X_train_df, y_train, feature_names),
        alpha=FDR_ALPHA,
    )
    .sort_values('P_Value')
    .reset_index(drop=True)
)

# Persist the reference table
reference_path = f"{output_dir}/univariate_logistic_results_reference.csv"
univariate_results_full.to_csv(reference_path, index=False)

full_raw_p = univariate_results_full['P_Value']
full_adj_p = univariate_results_full['P_Value_FDR']

print("\n全数据集参考结果 (不用于训练):")
print(f"  - FDR q < 0.05: {univariate_results_full['FDR_Significant'].sum()} 个")
print(f"  - FDR q < 0.10: {(full_adj_p < 0.1).sum()} 个")
print(f"  - 原始 p < 0.05: {(full_raw_p < 0.05).sum()} 个")
print(f"  - 原始 p < 0.10: {(full_raw_p < 0.1).sum()} 个")

print("\nTop 10 特征 (全数据集参考):")
display_cols = ['Feature', 'P_Value', 'P_Value_FDR', 'OR']
print(univariate_results_full[display_cols].head(10).to_string(index=False))

print(f"\n✅ 参考结果已保存至: {reference_path}")

# =============================================================================
# Summary output
# =============================================================================
# Prints the names of the variables this cell leaves behind, followed by usage
# notes. Nothing is computed here beyond the lengths of the stable lists.
# NOTE(review): the stable lists are built from folds of X_train_df whose
# training parts together cover every row of X_train_df. If these lists are
# later evaluated with a CV over the same rows, the "no leakage" statements
# printed below may not hold - confirm against the downstream modelling code.
print("\n" + "=" * 80)
print("✅ 嵌套CV单变量逻辑回归特征选择完成")
print("=" * 80)

# Inventory of available variables
print(f"\n可用变量:")
print(f"  - univariate_results_cv: 字典，包含每折的特征选择详细结果")
print(f"  - selected_features_cv: 字典，包含每折选出的特征列表")
print(f"  - feature_stability: DataFrame，特征在各折中的稳定性统计")
print(f"  - sig_features_fdr: 稳定的 FDR q<0.05 特征 ({len(sig_features_fdr)} 个) ⭐推荐使用")
print(f"  - sig_features_fdr_01: 稳定的 FDR q<0.1 特征 ({len(sig_features_fdr_01)} 个)")
print(f"  - sig_features_005: 稳定的 p<0.05 特征 ({len(sig_features_005)} 个)")
print(f"  - sig_features_01: 稳定的 p<0.1 特征 ({len(sig_features_01)} 个)")
print(f"  - univariate_results_full: 全数据集参考结果 (仅用于可视化)")

# Usage notes for the following modules
print(f"\n⚠️  重要说明:")
print(f"  1. 所有推荐特征列表基于嵌套CV结果，不存在数据泄露")
print(f"  2. 推荐使用 sig_features_fdr (FDR q<0.05 且稳定性>={STABILITY_THRESHOLD:.0%})")
print(f"  3. univariate_results_full 仅供参考，不应用于后续模型训练")
print(f"  4. 后续建模时应使用 sig_features_fdr 等稳定特征列表")

print("\n" + "=" * 80)

In [None]:
# =============================================================================
# 可视化：嵌套CV特征选择结果分析
# =============================================================================
import matplotlib.pyplot as plt
import seaborn as sns

# Console banner for the visualisation cell
print("=" * 80)
print("嵌套CV特征选择结果可视化")
print("=" * 80)

# 2 x 3 grid of panels; each section below fills one axis
fig, axes = plt.subplots(2, 3, figsize=(20, 12))

# ---------------------- Panel 1: feature stability bar chart ----------------------
ax1 = axes[0, 0]

# First 20 rows of feature_stability (the table is sorted by FDR_q005_Stability,
# descending, where it is built)
top_stable = feature_stability.head(20).copy()
# Green when the feature reaches the stability threshold, red otherwise
colors_stable = ['#2ecc71' if s >= STABILITY_THRESHOLD else '#e74c3c' 
                 for s in top_stable['FDR_q005_Stability']]

bars = ax1.barh(range(len(top_stable)), top_stable['FDR_q005_Stability'], 
                color=colors_stable, edgecolor='black', linewidth=0.5)
ax1.set_yticks(range(len(top_stable)))
ax1.set_yticklabels(top_stable['Feature'], fontsize=9)
# Put the first (most stable) row at the top of the chart
ax1.invert_yaxis()
# Vertical marker at the stability threshold
ax1.axvline(x=STABILITY_THRESHOLD, color='red', linestyle='--', linewidth=2, 
            label=f'稳定性阈值 ({STABILITY_THRESHOLD:.0%})')
ax1.set_xlabel('稳定性 (被选中折数/总折数)', fontsize=11, fontweight='bold')
ax1.set_title('Top 20 特征稳定性 (FDR q<0.05)', fontsize=13, fontweight='bold')
ax1.legend(fontsize=9)
ax1.grid(axis='x', alpha=0.3)

# ---------------------- Panel 2: significant-feature counts per fold ----------------------
ax2 = axes[0, 1]

# (plot column, key in selected_features_cv[fold]) for each selection rule
rule_columns = (
    ('FDR q<0.05', 'fdr_q005'),
    ('FDR q<0.1', 'fdr_q01'),
    ('p<0.05', 'p005'),
    ('p<0.1', 'p01'),
)

# One row per fold with the size of each selected list
fold_stats = []
for fold_idx in range(1, N_SPLITS + 1):
    fold_features = selected_features_cv[f'fold_{fold_idx}']
    fold_row = {'Fold': f'折{fold_idx}'}
    for rule_column, rule_key in rule_columns:
        fold_row[rule_column] = len(fold_features[rule_key])
    fold_stats.append(fold_row)

fold_stats_df = pd.DataFrame(fold_stats)
x_pos = np.arange(len(fold_stats_df))
width = 0.2

# Four bars per fold, shifted left/right of the tick by multiples of the bar width
fold_bar_specs = (
    ('FDR q<0.05', -1.5, '#2ecc71'),
    ('FDR q<0.1', -0.5, '#58d68d'),
    ('p<0.05', 0.5, '#3498db'),
    ('p<0.1', 1.5, '#5dade2'),
)
bars1, bars2, bars3, bars4 = [
    ax2.bar(x_pos + shift * width, fold_stats_df[series_name], width,
            label=series_name, color=series_color, alpha=0.8)
    for series_name, shift, series_color in fold_bar_specs
]

ax2.set_xticks(x_pos)
ax2.set_xticklabels(fold_stats_df['Fold'], fontsize=10)
ax2.set_ylabel('显著特征数量', fontsize=11, fontweight='bold')
ax2.set_title('各折显著特征数量对比', fontsize=13, fontweight='bold')
ax2.legend(fontsize=9)
ax2.grid(axis='y', alpha=0.3)

# ---------------------- Panel 3: histogram of stability scores ----------------------
ax3 = axes[0, 2]

stability_values = feature_stability['FDR_q005_Stability'].values

# Stability only takes the values k / N_SPLITS (k = 0..N_SPLITS). Centre one
# bin on each of them: with edges that sit exactly on those values, floating
# point rounding of the edges decides which bin a score lands in, and the two
# highest scores share the last (closed) bin.
stability_levels = np.arange(N_SPLITS + 1) / N_SPLITS
stability_bins = (np.arange(N_SPLITS + 2) - 0.5) / N_SPLITS

ax3.hist(stability_values, bins=stability_bins,
         color='steelblue', edgecolor='black', alpha=0.7)
ax3.set_xticks(stability_levels)
ax3.axvline(x=STABILITY_THRESHOLD, color='red', linestyle='--', linewidth=2,
            label=f'阈值 ({STABILITY_THRESHOLD:.0%})')
ax3.set_xlabel('稳定性分数', fontsize=11, fontweight='bold')
ax3.set_ylabel('特征数量', fontsize=11, fontweight='bold')
ax3.set_title('特征稳定性分布 (FDR q<0.05)', fontsize=13, fontweight='bold')
ax3.legend(fontsize=9)
ax3.grid(axis='y', alpha=0.3)

# ---------------------- Panel 4: full data set vs. per-fold stable features ----------------------
ax4 = axes[1, 0]

# Number of significant features on the full data set, per selection rule
full_fdr_005 = univariate_results_full['FDR_Significant'].sum()
full_fdr_01 = univariate_results_full['P_Value_FDR'].lt(0.1).sum()
full_p_005 = univariate_results_full['P_Value'].lt(0.05).sum()
full_p_01 = univariate_results_full['P_Value'].lt(0.1).sum()

# Row 0: full data set, row 1: stable lists from the CV folds
comparison_data = pd.DataFrame({
    '方法': ['全数据集\n(有泄露风险)', '嵌套CV\n(稳定特征)'],
    'FDR q<0.05': [full_fdr_005, len(sig_features_fdr)],
    'FDR q<0.1': [full_fdr_01, len(sig_features_fdr_01)],
    'p<0.05': [full_p_005, len(sig_features_005)],
    'p<0.1': [full_p_01, len(sig_features_01)]
})

x_pos_comp = np.arange(len(comparison_data))
width_comp = 0.2

# Four bars per method, shifted around the tick by multiples of the bar width
comparison_bar_specs = (
    ('FDR q<0.05', -1.5, '#2ecc71'),
    ('FDR q<0.1', -0.5, '#58d68d'),
    ('p<0.05', 0.5, '#3498db'),
    ('p<0.1', 1.5, '#5dade2'),
)
bars1_comp, bars2_comp, bars3_comp, bars4_comp = [
    ax4.bar(x_pos_comp + shift * width_comp, comparison_data[series_name], width_comp,
            label=series_name, color=series_color, alpha=0.8)
    for series_name, shift, series_color in comparison_bar_specs
]

ax4.set_xticks(x_pos_comp)
ax4.set_xticklabels(comparison_data['方法'], fontsize=10)
ax4.set_ylabel('特征数量', fontsize=11, fontweight='bold')
ax4.set_title('全数据集 vs 嵌套CV特征数量对比', fontsize=13, fontweight='bold')
ax4.legend(fontsize=9)
ax4.grid(axis='y', alpha=0.3)

# Write each bar's value just above it
for bars in (bars1_comp, bars2_comp, bars3_comp, bars4_comp):
    for bar in bars:
        height = bar.get_height()
        label_x = bar.get_x() + bar.get_width()/2.
        ax4.text(label_x, height, str(int(height)),
                 ha='center', va='bottom', fontsize=9)

# ---------------------- Panel 5: p-value agreement, full data set vs. folds ----------------------
ax5 = axes[1, 1]

# Collect the raw p-values of every fold, indexed by feature name
fold_pvalues = []
for fold_idx in range(1, N_SPLITS + 1):
    fold_result = univariate_results_cv[f'fold_{fold_idx}']
    fold_pvalues.append(fold_result.set_index('Feature')['P_Value'])

# Feature x fold matrix of p-values
pvalue_matrix = pd.concat(fold_pvalues, axis=1)
fold_columns = [f'折{i}' for i in range(1, N_SPLITS+1)]
pvalue_matrix.columns = fold_columns

# Mean and standard deviation across folds. Both are computed from the fold
# columns only; taking the std after 'Mean' has been added would treat the
# mean as one more observation.
fold_only_pvalues = pvalue_matrix[fold_columns]
pvalue_matrix['Mean'] = fold_only_pvalues.mean(axis=1)
pvalue_matrix['Std'] = fold_only_pvalues.std(axis=1)

# Scatter: full-data p-value (x) against mean per-fold p-value (y)
full_pvalues = univariate_results_full.set_index('Feature')['P_Value']
common_features = pvalue_matrix.index.intersection(full_pvalues.index)

ax5.scatter(full_pvalues[common_features], 
           pvalue_matrix.loc[common_features, 'Mean'],
           alpha=0.5, s=30, c='steelblue', edgecolors='black', linewidths=0.5)

# y = x reference line up to the largest plotted p-value
max_p = max(full_pvalues[common_features].max(), pvalue_matrix.loc[common_features, 'Mean'].max())
ax5.plot([0, max_p], [0, max_p], 'r--', linewidth=2, label='y=x', alpha=0.7)

# Significance threshold (0.05) on both axes
ax5.axhline(y=0.05, color='green', linestyle='--', linewidth=1.5, alpha=0.5)
ax5.axvline(x=0.05, color='green', linestyle='--', linewidth=1.5, alpha=0.5)

ax5.set_xlabel('全数据集 P-Value', fontsize=11, fontweight='bold')
ax5.set_ylabel('嵌套CV 平均 P-Value', fontsize=11, fontweight='bold')
ax5.set_title('P值一致性: 全数据集 vs 嵌套CV', fontsize=13, fontweight='bold')
ax5.legend(fontsize=9)
ax5.grid(alpha=0.3)

# ---------------------- Panel 6: split of features by selection tier (pie chart) ----------------------
ax6 = axes[1, 2]

# Three slices: stable at FDR q<0.05, stable only at the looser q<0.1 level,
# and everything else in feature_names
method_counts = [
    len(sig_features_fdr),
    len(sig_features_fdr_01) - len(sig_features_fdr),
    len(feature_names) - len(sig_features_fdr_01)
]

colors_pie = ['#2ecc71', '#58d68d', '#95a5a6']
# Slice labels carry the absolute counts; percentages come from autopct below
labels_pie = [
    f'FDR q<0.05\n({len(sig_features_fdr)} 个)',
    f'FDR 0.05<q<0.1\n({method_counts[1]} 个)',
    f'未选中\n({method_counts[2]} 个)'
]

wedges, texts, autotexts = ax6.pie(method_counts, labels=labels_pie, colors=colors_pie,
                                     autopct='%1.1f%%', startangle=90,
                                     textprops={'fontsize': 10, 'fontweight': 'bold'})

ax6.set_title('嵌套CV稳定特征分布', fontsize=13, fontweight='bold')

plt.tight_layout()

# Save the figure to output_dir, then display it
cv_plot_path = f"{output_dir}/univariate_nested_cv_analysis.png"
plt.savefig(cv_plot_path, dpi=300, bbox_inches='tight')
print(f"\n✅ 嵌套CV分析可视化已保存至: {cv_plot_path}")
plt.show()

# =============================================================================
# Detailed statistics
# =============================================================================
# Text report of the configuration, the per-fold counts and the comparison
# between the full-data reference and the stable per-fold lists.
# NOTE(review): the messages below state that the stable lists carry no leakage
# risk; the lists are derived from folds that together cover all of
# X_train_df, so confirm how they are evaluated downstream.
print("\n" + "=" * 80)
print("嵌套CV特征选择详细统计")
print("=" * 80)

# 1. Cross-validation configuration
print(f"\n1. 交叉验证配置:")
print(f"   - 折数: {N_SPLITS}")
print(f"   - FDR控制水平: α = {FDR_ALPHA}")
print(f"   - 稳定性阈值: {STABILITY_THRESHOLD:.0%}")

# 2. Number of FDR-significant features in each fold
print(f"\n2. 各折显著特征数 (FDR q<0.05):")
for fold_idx in range(1, N_SPLITS + 1):
    n_features = len(selected_features_cv[f'fold_{fold_idx}']['fdr_q005'])
    print(f"   - 折 {fold_idx}: {n_features} 个")

# 3. How many features are selected in all folds, all but one, or above the threshold
print(f"\n3. 稳定特征统计 (FDR q<0.05):")
print(f"   - 在所有折中都被选中: {(feature_stability['FDR_q005_Stability'] == 1.0).sum()} 个")
print(f"   - 在至少 {N_SPLITS-1} 折中被选中: {(feature_stability['FDR_q005_Stability'] >= (N_SPLITS-1)/N_SPLITS).sum()} 个")
print(f"   - 稳定性 >= {STABILITY_THRESHOLD:.0%}: {len(sig_features_fdr)} 个 ⭐推荐使用")

# 4. Full-data counts (full_fdr_005 / full_fdr_01 are set in the panel-4 code) vs. stable lists
print(f"\n4. 全数据集 vs 嵌套CV对比:")
print(f"   全数据集 (有泄露风险):")
print(f"     - FDR q<0.05: {full_fdr_005} 个")
print(f"     - FDR q<0.1: {full_fdr_01} 个")
print(f"   嵌套CV (稳定特征, 无泄露):")
print(f"     - FDR q<0.05稳定: {len(sig_features_fdr)} 个")
print(f"     - FDR q<0.1稳定: {len(sig_features_fdr_01)} 个")

# Warn only when the full-data run flags more features than the stable list holds
if full_fdr_005 > len(sig_features_fdr):
    diff = full_fdr_005 - len(sig_features_fdr)
    print(f"\n   ⚠️  全数据集方法可能包含 {diff} 个不稳定/过拟合特征")
    print(f"   建议使用嵌套CV稳定特征，避免数据泄露")

# 5. Recommended list
print(f"\n5. 推荐特征列表:")
print(f"   ✅ 主要推荐: sig_features_fdr ({len(sig_features_fdr)} 个)")
print(f"      - 基于FDR q<0.05")
print(f"      - 稳定性 >= {STABILITY_THRESHOLD:.0%}")
print(f"      - 无数据泄露风险")

print("\n" + "=" * 80)
print("✅ 嵌套CV特征选择可视化完成")
print("=" * 80)