In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_predict, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, cohen_kappa_score, precision_score, recall_score
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# Load the merged table from Google Drive (path is specific to the Colab mount).
df = pd.read_csv('/content/drive/MyDrive/merged_data_by_year_生态区级 (2).csv')

# Name of the binary label column; its mean is reported below as the PFCL share,
# so 1 presumably marks a PFCL sample -- confirm against the data dictionary.
target_col = 'b1'

# Quick sanity report: table size, column names and label balance.
print("数据形状:", df.shape)
print("列名:", df.columns.tolist())
print("\n标签分布:")
print(df[target_col].value_counts())
print(f"PFCL比例: {df[target_col].mean():.4%}")

# Everything except coordinates, year, the ecoregion ID and the label itself
# is used as a model feature (original column order is kept).
exclude_cols = ['lat', 'lon', 'year', 'ID', target_col]
feature_cols = df.columns[~df.columns.isin(exclude_cols)].tolist()

print(f"\n特征数量: {len(feature_cols)}")
print(f"目标变量: {target_col}")

# Distinct ecoregion identifiers, in order of first appearance.
ecoregions = df['ID'].unique()
print(f"\n生态区数量: {len(ecoregions)}")
print(f"生态区: {ecoregions}")

# Accumulator for result records (one dict per ecoregion / base-rate pair).
base_rate_results = []

# Base-rate grid in percent: 0.5 first, then every whole percent from 1 to 20.
# Stored as a float array so the half-percent step and the integers share a dtype.
base_rates_percent = np.array([0.5, *range(1, 21)], dtype=float)

print(f"\n将测试的基础率范围: {base_rates_percent}%")

def _prevalence_adjusted_metrics(tn, fp, fn, tp, base_rate):
    """Re-express a binary confusion matrix at a hypothetical class prevalence.

    The per-class rates (true-positive rate and false-positive rate) are taken
    from the observed counts and then recombined as if the positive class made
    up ``base_rate`` of the population. Using the observed rates means the
    result does not depend on the evaluated sample being exactly 1:1; for a
    perfectly balanced sample it equals scaling each row by ``rate / 0.5``.

    Args:
        tn, fp, fn, tp: Observed confusion-matrix counts.
        base_rate: Target share of the positive class, as a fraction in [0, 1].

    Returns:
        Dict with keys ``'OA'``, ``'F1'``, ``'Kappa'``, ``'PA'`` and ``'UA'``.
        A metric whose denominator is zero is reported as 0, and a rate is
        taken as 0 when its class has no samples.
    """
    n_pos = tp + fn
    n_neg = tn + fp
    tpr = tp / n_pos if n_pos > 0 else 0.0
    fpr = fp / n_neg if n_neg > 0 else 0.0

    # Cell proportions of the re-weighted matrix; they sum to 1 by construction.
    tp_w = base_rate * tpr
    fn_w = base_rate * (1.0 - tpr)
    fp_w = (1.0 - base_rate) * fpr
    tn_w = (1.0 - base_rate) * (1.0 - fpr)

    oa = tp_w + tn_w
    pa = tp_w / (tp_w + fn_w) if tp_w + fn_w > 0 else 0  # producer's accuracy = recall
    ua = tp_w / (tp_w + fp_w) if tp_w + fp_w > 0 else 0  # user's accuracy = precision
    f1 = 2 * (pa * ua) / (pa + ua) if pa + ua > 0 else 0

    # Cohen's kappa: chance agreement is the product of the predicted and
    # actual marginals, summed over both classes.
    pe = (tp_w + fp_w) * (tp_w + fn_w) + (tn_w + fn_w) * (tn_w + fp_w)
    kappa = (oa - pe) / (1 - pe) if 1 - pe > 0 else 0

    return {'OA': oa, 'F1': f1, 'Kappa': kappa, 'PA': pa, 'UA': ua}


# Per-ecoregion analysis: cross-validated random forest, then a sweep over the
# assumed PFCL base rates. One record per (ecoregion, base rate) is appended to
# base_rate_results.
for ecoregion in tqdm(ecoregions, desc="处理生态区"):
    print(f"\n处理生态区: {ecoregion}")

    # Rows belonging to the current ecoregion.
    eco_data = df[df['ID'] == ecoregion].copy()

    if len(eco_data) < 20:
        print(f"  生态区 {ecoregion} 数据不足 ({len(eco_data)} 行)，跳过")
        continue

    X = eco_data[feature_cols]
    y = eco_data[target_col]

    # Report the class balance and skip regions with too few positives to
    # populate five stratified folds.
    class_counts = y.value_counts()
    print(f"  类别分布: 0={class_counts.get(0, 0)}, 1={class_counts.get(1, 0)}")

    if class_counts.get(1, 0) < 5:
        print("  PFCL样本过少，跳过")
        continue

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # class_weight='balanced' re-weights the classes inversely to their
    # frequency, so the forest's scores behave approximately as if both
    # classes were equally likely a priori.
    clf = RandomForestClassifier(
        n_estimators=100,
        class_weight='balanced',
        random_state=42,
        n_jobs=-1
    )

    # One cross-validation pass is enough: the forest's hard prediction is the
    # argmax of its class probabilities, so the labels are derived from the
    # same out-of-fold probabilities instead of refitting every fold.
    try:
        proba_all = cross_val_predict(clf, X, y, cv=skf, method='predict_proba')
        y_pred_proba = proba_all[:, 1]
    except Exception as e:
        # Best-effort per region: report the failure and move on.
        print(f"  交叉验证错误: {e}")
        continue

    # Probability columns follow the sorted class labels; ties resolve to the
    # first (lowest) class, as in the forest's own predict().
    y_pred = np.unique(y)[np.argmax(proba_all, axis=1)]

    # Baseline metrics at the default decision rule.
    cm_original = confusion_matrix(y, y_pred)
    tn_original, fp_original, fn_original, tp_original = cm_original.ravel()

    oa_original = accuracy_score(y, y_pred)
    f1_original = f1_score(y, y_pred)
    kappa_original = cohen_kappa_score(y, y_pred)
    pa_original = recall_score(y, y_pred)  # producer's accuracy = recall
    ua_original = precision_score(y, y_pred)  # user's accuracy = precision

    print(f"  原始混淆矩阵: TN={tn_original}, FP={fp_original}, FN={fn_original}, TP={tp_original}")
    print(f"  原始指标: OA={oa_original:.3f}, F1={f1_original:.3f}, Kappa={kappa_original:.3f}, PA={pa_original:.3f}, UA={ua_original:.3f}")

    # Base-rate sensitivity sweep.
    for base_rate_percent in base_rates_percent:
        base_rate = base_rate_percent / 100  # percent -> fraction

        # Method 1: prior-shift correction of the decision threshold.
        # A score p learned under a 50/50 prior corresponds to the posterior
        #     p * pi / (p * pi + (1 - p) * (1 - pi))
        # at prevalence pi. With equal misclassification costs that posterior
        # reaches 0.5 exactly when p = 1 - pi, so a rarer positive class needs
        # a HIGHER cut-off on the raw score (0.5 is recovered at pi = 50%).
        new_threshold = 1.0 - base_rate

        y_pred_adjusted = (y_pred_proba >= new_threshold).astype(int)

        cm_adjusted = confusion_matrix(y, y_pred_adjusted)
        tn_adjusted, fp_adjusted, fn_adjusted, tp_adjusted = cm_adjusted.ravel()

        # zero_division=0: a strict threshold can leave no positive predictions.
        oa_adjusted = accuracy_score(y, y_pred_adjusted)
        f1_adjusted = f1_score(y, y_pred_adjusted, zero_division=0)
        kappa_adjusted = cohen_kappa_score(y, y_pred_adjusted)
        pa_adjusted = recall_score(y, y_pred_adjusted, zero_division=0)
        ua_adjusted = precision_score(y, y_pred_adjusted, zero_division=0)

        # Method 2 (cross-check): keep the default predictions and re-weight
        # the baseline confusion matrix to the assumed prevalence.
        weighted = _prevalence_adjusted_metrics(
            tn_original, fp_original, fn_original, tp_original, base_rate
        )

        base_rate_results.append({
            'ecoregion': ecoregion,
            'pfcl_base_rate_percent': base_rate_percent,

            # Baseline metrics (default decision rule); constant per ecoregion.
            'OA_original': oa_original,
            'F1_original': f1_original,
            'Kappa_original': kappa_original,
            'PA_original': pa_original,
            'UA_original': ua_original,

            # Metrics after the threshold adjustment.
            'OA_adj': oa_adjusted,
            'F1_adj': f1_adjusted,
            'Kappa_adj': kappa_adjusted,
            'PA_adj': pa_adjusted,
            'UA_adj': ua_adjusted,

            # Metrics from the re-weighted confusion matrix (reference).
            'OA_weighted': weighted['OA'],
            'F1_weighted': weighted['F1'],
            'Kappa_weighted': weighted['Kappa'],
            'PA_weighted': weighted['PA'],
            'UA_weighted': weighted['UA'],

            # Bookkeeping.
            'samples': len(eco_data),
            'pfcl_count': class_counts.get(1, 0),
            'new_threshold': new_threshold,

            # Baseline confusion matrix.
            'tn_original': tn_original,
            'fp_original': fp_original,
            'fn_original': fn_original,
            'tp_original': tp_original,

            # Confusion matrix after the threshold adjustment.
            'tn_adjusted': tn_adjusted,
            'fp_adjusted': fp_adjusted,
            'fn_adjusted': fn_adjusted,
            'tp_adjusted': tp_adjusted
        })

# Turn the accumulated records into a table (one row per record).
results_df = pd.DataFrame(base_rate_results)

# Persist the full table to Google Drive, without the positional index.
output_path = '/content/drive/MyDrive/results_base_rate_by_ecoregion_complete.csv'
results_df.to_csv(path_or_buf=output_path, index=False)

# Short console report: destination, table size and the first ten rows.
print(f"\n结果已保存到: {output_path}")
print(f"结果形状: {results_df.shape}")
print("\n结果预览:")
print(results_df.iloc[:10])

# Sanity check: at a 50% base rate the adjusted threshold is 0.5, so the
# adjusted metrics should be close to the baseline ones. The check needs a
# 50% row in the results; when the base-rate grid does not contain one, say so
# explicitly instead of printing nothing.
print("\n验证: 基础率=50%时的对比")

if results_df.empty:
    # No records at all (every ecoregion was skipped): nothing to compare.
    reference_rows = results_df
else:
    # Tolerant match, because the grid is stored as floating-point percentages.
    reference_rows = results_df[np.isclose(results_df['pfcl_base_rate_percent'], 50)]

if reference_rows.empty:
    print("  基础率网格中不包含50%，跳过该项验证")
else:
    # Only the first three ecoregions are shown to keep the output short.
    for ecoregion in results_df['ecoregion'].unique()[:3]:
        subset = reference_rows[reference_rows['ecoregion'] == ecoregion]
        if len(subset) > 0:
            row = subset.iloc[0]
            print(f"\n生态区 {ecoregion}:")
            print(f"  OA: 原始={row['OA_original']:.3f}, 调整={row['OA_adj']:.3f}, 差异={abs(row['OA_original']-row['OA_adj']):.4f}")
            print(f"  PA: 原始={row['PA_original']:.3f}, 调整={row['PA_adj']:.3f}, 差异={abs(row['PA_original']-row['PA_adj']):.4f}")
            print(f"  UA: 原始={row['UA_original']:.3f}, 调整={row['UA_adj']:.3f}, 差异={abs(row['UA_original']-row['UA_adj']):.4f}")

# 2 x 3 grid of line charts: one adjusted metric per panel, plotted against the
# assumed PFCL base rate, with one line per ecoregion.
fig, axes = plt.subplots(2, 3, figsize=(18, 10))

# (axis, results column, marker, y-axis label, title) for each panel, in
# left-to-right, top-to-bottom order.
panel_specs = [
    (axes[0, 0], 'PA_adj', 'o', 'Adjusted Producer Accuracy (PA)', 'PA vs PFCL Base Rate'),
    (axes[0, 1], 'UA_adj', 's', 'Adjusted User Accuracy (UA)', 'UA vs PFCL Base Rate'),
    (axes[0, 2], 'F1_adj', '^', 'Adjusted F1 Score', 'F1 vs PFCL Base Rate'),
    (axes[1, 0], 'OA_adj', 'd', 'Adjusted Overall Accuracy (OA)', 'OA vs PFCL Base Rate'),
    (axes[1, 1], 'Kappa_adj', 'v', 'Adjusted Kappa Coefficient', 'Kappa vs PFCL Base Rate'),
    (axes[1, 2], 'new_threshold', '*', 'Adjusted Threshold', 'Decision Threshold vs PFCL Base Rate'),
]

# Only the first five ecoregions are drawn so the legends stay readable.
plotted_regions = results_df['ecoregion'].unique()[:5]

for ax, metric_col, marker_style, y_label, panel_title in panel_specs:
    for region in plotted_regions:
        region_rows = results_df[results_df['ecoregion'] == region]
        ax.plot(
            region_rows['pfcl_base_rate_percent'],
            region_rows[metric_col],
            label=region,
            marker=marker_style,
            markersize=3,
        )
    ax.set_xlabel('PFCL Base Rate (%)')
    ax.set_ylabel(y_label)
    ax.set_title(panel_title)
    ax.legend(fontsize=8)
    ax.grid(True, alpha=0.3)

# Write the figure to Google Drive, then display it.
plt.tight_layout()
plt.savefig('/content/drive/MyDrive/base_rate_sensitivity_complete_analysis.png', dpi=300, bbox_inches='tight')
plt.show()

# Summary table: mean of each threshold-adjusted metric across ecoregions,
# one row per base rate, rounded to three decimals.
print("\n汇总统计 - 不同基础率下的平均表现:")
adjusted_metric_cols = ['OA_adj', 'F1_adj', 'Kappa_adj', 'PA_adj', 'UA_adj']
per_rate_groups = results_df.groupby('pfcl_base_rate_percent')
summary_stats = per_rate_groups[adjusted_metric_cols].mean().round(3)

print(summary_stats)

# Optimism bias: how much the baseline metrics overstate performance compared
# with the metrics adjusted to a typical base rate (here 1%).
ref_rate = 1.0
optimism_bias = []

for ecoregion in results_df['ecoregion'].unique():
    eco_data = results_df[results_df['ecoregion'] == ecoregion]

    # The *_original columns hold the ecoregion's baseline metrics on every
    # row, so the row at the reference rate carries both sides of the
    # comparison; no separate 50% row is required. The match is tolerant
    # because the rates are stored as floating-point percentages.
    ref_rows = eco_data[np.isclose(eco_data['pfcl_base_rate_percent'], ref_rate)]
    if ref_rows.empty:
        continue
    row = ref_rows.iloc[0]

    oa_bias = row['OA_original'] - row['OA_adj']
    ua_bias = row['UA_original'] - row['UA_adj']

    optimism_bias.append({
        'ecoregion': ecoregion,
        'OA_bias': oa_bias,
        'F1_bias': row['F1_original'] - row['F1_adj'],
        'Kappa_bias': row['Kappa_original'] - row['Kappa_adj'],
        'PA_bias': row['PA_original'] - row['PA_adj'],
        'UA_bias': ua_bias,
        # Relative bias in percent of the baseline value; 0 when the baseline
        # is 0 to avoid dividing by zero.
        'OA_relative_bias%': oa_bias / row['OA_original'] * 100 if row['OA_original'] > 0 else 0,
        'UA_relative_bias%': ua_bias / row['UA_original'] * 100 if row['UA_original'] > 0 else 0
    })

if optimism_bias:
    bias_df = pd.DataFrame(optimism_bias)
    print(f"\n乐观偏差分析 (基于{ref_rate}%基础率):")
    print(bias_df.round(3))

    # Persist the per-ecoregion bias table.
    bias_df.to_csv('/content/drive/MyDrive/optimism_bias_analysis_complete.csv', index=False)

    # Mean absolute-difference bias across ecoregions.
    print(f"\n平均乐观偏差 (基于{ref_rate}%基础率):")
    avg_bias = bias_df[['OA_bias', 'F1_bias', 'Kappa_bias', 'PA_bias', 'UA_bias']].mean()
    print(avg_bias.round(3))

print("\n分析完成!")