# In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_predict, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, cohen_kappa_score
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# Load the merged, ecoregion-level table from Google Drive.
df = pd.read_csv('/content/drive/MyDrive/merged_data_by_year_生态区级 (2).csv')

# Quick look at the table: shape, column names and the label distribution.
label_values = df['b1']
print("数据形状:", df.shape)
print("列名:", list(df.columns))
print("\n标签分布:")
print(label_values.value_counts())
print(f"PFCL比例: {label_values.mean():.4%}")

# Target column, and the columns that must not be used as predictors
# (coordinates, year, ecoregion ID and the target itself).
target_col = 'b1'
exclude_cols = ['lat', 'lon', 'year', 'ID', target_col]
feature_cols = df.columns[~df.columns.isin(exclude_cols)].tolist()

print(f"\n特征数量: {len(feature_cols)}")
print(f"目标变量: {target_col}")

# Distinct ecoregion IDs, in order of first appearance.
ecoregions = df['ID'].unique()
print(f"\n生态区数量: {len(ecoregions)}")
print(f"生态区: {ecoregions}")

# One record per (ecoregion, base rate) is appended here by the analysis loop.
base_rate_results = []

# PFCL base rates to test, in percent: 0.5, then 1 through 20 in steps of 1.
base_rates_percent = np.array([0.5, *range(1, 21)])

print(f"\n将测试的基础率范围: {base_rates_percent}%")

def _base_rate_adjusted_pa_ua(tn, fp, fn, tp, base_rate):
    """Re-weight a binary confusion matrix to a target PFCL base rate.

    Each reference-class row (non-PFCL: ``tn, fp``; PFCL: ``fn, tp``) is
    scaled so that its share of the total equals the requested prevalence.
    The scaling uses the row shares actually observed in the confusion
    matrix rather than assuming a 50/50 evaluation sample; when the two rows
    are equally large this reduces to dividing by 0.5.

    Args:
        tn, fp, fn, tp: Cells of the 2x2 confusion matrix (counts).
        base_rate: Target PFCL prevalence as a fraction in [0, 1].

    Returns:
        Tuple ``(pa_adj, ua_adj)``: producer's accuracy (recall) and user's
        accuracy (precision) of the PFCL class under ``base_rate``. A value
        of 0 is returned where the denominator is zero.

    Note:
        Row scaling multiplies ``tp`` and ``fn`` by the same factor, so
        ``pa_adj`` equals the unadjusted recall; only ``ua_adj`` varies with
        the base rate.
    """
    n_non_pfcl = tn + fp
    n_pfcl = fn + tp
    total = n_non_pfcl + n_pfcl

    # weight = target class share / observed class share
    weight_non_pfcl = (1 - base_rate) * total / n_non_pfcl if n_non_pfcl > 0 else 0.0
    weight_pfcl = base_rate * total / n_pfcl if n_pfcl > 0 else 0.0

    adjusted_cm = np.array([
        [tn * weight_non_pfcl, fp * weight_non_pfcl],
        [fn * weight_pfcl, tp * weight_pfcl],
    ])

    adjusted_tp = adjusted_cm[1, 1]
    adjusted_fn = adjusted_cm[1, 0]
    adjusted_fp = adjusted_cm[0, 1]

    pa_adj = adjusted_tp / (adjusted_tp + adjusted_fn) if adjusted_tp + adjusted_fn > 0 else 0
    ua_adj = adjusted_tp / (adjusted_tp + adjusted_fp) if adjusted_tp + adjusted_fp > 0 else 0
    return pa_adj, ua_adj


# Analyse every ecoregion independently.
for ecoregion in tqdm(ecoregions, desc="处理生态区"):
    print(f"\n处理生态区: {ecoregion}")

    # Rows belonging to the current ecoregion.
    eco_data = df[df['ID'] == ecoregion].copy()

    if len(eco_data) < 20:
        print(f"  生态区 {ecoregion} 数据不足 ({len(eco_data)} 行)，跳过")
        continue

    X = eco_data[feature_cols]
    y = eco_data[target_col]

    # Class balance check.
    class_counts = y.value_counts()
    print(f"  类别分布: 0={class_counts.get(0, 0)}, 1={class_counts.get(1, 0)}")

    if class_counts.get(1, 0) < 5:
        print(f"  PFCL样本过少，跳过")
        continue

    # 5-fold stratified CV needs at least 5 samples of the negative class as
    # well; otherwise some folds contain no negatives.
    if class_counts.get(0, 0) < 5:
        print("  非PFCL样本过少，跳过")
        continue

    # Out-of-fold predictions via stratified 5-fold cross-validation.
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # Random forest with class weights inversely proportional to class
    # frequency. This re-weights the training loss; it does not resample the
    # evaluation data, so the confusion matrix keeps the sample's prevalence.
    clf = RandomForestClassifier(
        n_estimators=100,
        class_weight='balanced',
        random_state=42,
        n_jobs=-1
    )

    # One cross-validation pass: get class probabilities and derive the hard
    # labels from them (the forest's predict() is the argmax of
    # predict_proba()), instead of fitting every fold twice.
    try:
        y_proba_all = cross_val_predict(clf, X, y, cv=skf, method='predict_proba')
    except Exception as e:
        print(f"  交叉验证错误: {e}")
        continue

    # Probability columns follow the sorted class labels.
    class_labels = np.unique(y)
    y_pred = class_labels[np.argmax(y_proba_all, axis=1)]
    # Probability of the second (larger) class label; not used further below.
    y_pred_proba = y_proba_all[:, 1]

    # Confusion matrix; labels are pinned so the result is always 2x2 and the
    # four-way unpacking below cannot fail.
    cm = confusion_matrix(y, y_pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()

    # Standard metrics on the out-of-fold predictions.
    oa = accuracy_score(y, y_pred)
    f1 = f1_score(y, y_pred)
    kappa = cohen_kappa_score(y, y_pred)

    # Producer's accuracy (recall) and user's accuracy (precision) of PFCL.
    if tp + fn > 0:
        pa = tp / (tp + fn)
    else:
        pa = 0

    if tp + fp > 0:
        ua = tp / (tp + fp)
    else:
        ua = 0

    print(f"  混淆矩阵: TN={tn}, FP={fp}, FN={fn}, TP={tp}")
    print(f"  OA={oa:.3f}, F1={f1:.3f}, Kappa={kappa:.3f}")
    print(f"  PA={pa:.3f}, UA={ua:.3f}")

    # Base-rate sensitivity analysis: re-weight the confusion matrix to each
    # hypothetical PFCL prevalence and recompute PA / UA.
    for base_rate_percent in base_rates_percent:
        base_rate = base_rate_percent / 100  # percent -> fraction

        pa_adj, ua_adj = _base_rate_adjusted_pa_ua(tn, fp, fn, tp, base_rate)

        base_rate_results.append({
            'ecoregion': ecoregion,
            'pfcl_base_rate_percent': base_rate_percent,
            'PA_adj': pa_adj,
            'UA_adj': ua_adj,
            'OA': oa,
            'F1': f1,
            'Kappa': kappa,
            'PA_original': pa,
            'UA_original': ua
        })

# Collect the per-(ecoregion, base rate) records into a DataFrame.
# The column list is given explicitly so that the frame (and the CSV header)
# keeps its schema even when every ecoregion was skipped and the record list
# is empty; without it, later lookups such as results_df['ecoregion'] raise
# KeyError on a column-less frame. Keys not listed here are dropped.
result_columns = [
    'ecoregion',
    'pfcl_base_rate_percent',
    'PA_adj',
    'UA_adj',
    'OA',
    'F1',
    'Kappa',
    'PA_original',
    'UA_original',
]
results_df = pd.DataFrame(base_rate_results, columns=result_columns)

# Write the results to CSV.
output_path = '/content/drive/MyDrive/results_base_rate_by_ecoregion.csv'
results_df.to_csv(output_path, index=False)

print(f"\n结果已保存到: {output_path}")
print(f"结果形状: {results_df.shape}")
print("\n结果预览:")
print(results_df.head(10))

# Two side-by-side panels: adjusted PA (left) and adjusted UA (right)
# as a function of the PFCL base rate.
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# Only the first five ecoregions are drawn to keep the legends readable.
shown_ecoregions = results_df['ecoregion'].unique()[:5]

# (axis, metric column, marker, y-axis label, title) for each panel.
panel_specs = (
    (axes[0], 'PA_adj', 'o',
     'Adjusted Producer Accuracy (PA)', 'PA vs PFCL Base Rate by Ecoregion'),
    (axes[1], 'UA_adj', 's',
     'Adjusted User Accuracy (UA)', 'UA vs PFCL Base Rate by Ecoregion'),
)

for ax, metric_col, marker_style, y_label, panel_title in panel_specs:
    # One line per ecoregion.
    for ecoregion in shown_ecoregions:
        eco_data = results_df[results_df['ecoregion'] == ecoregion]
        ax.plot(eco_data['pfcl_base_rate_percent'], eco_data[metric_col],
                label=ecoregion, marker=marker_style, markersize=4)

    ax.set_xlabel('PFCL Base Rate (%)')
    ax.set_ylabel(y_label)
    ax.set_title(panel_title)
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('/content/drive/MyDrive/base_rate_sensitivity_analysis.png', dpi=300, bbox_inches='tight')
plt.show()

# Per-ecoregion summary: range and mean of the adjusted PA / UA across all
# tested base rates, plus the base-rate-independent OA / F1 / Kappa (these are
# constant within an ecoregion, so the first value is taken).
per_ecoregion_aggs = {
    'PA_adj': ['min', 'max', 'mean'],
    'UA_adj': ['min', 'max', 'mean'],
    'OA': 'first',
    'F1': 'first',
    'Kappa': 'first'
}
by_ecoregion = results_df.groupby('ecoregion')
summary_stats = by_ecoregion.agg(per_ecoregion_aggs).round(3)

print("\n汇总统计:")
print(summary_stats)

# Mean adjusted PA / UA over ecoregions at each tested base rate.
by_base_rate = results_df.groupby('pfcl_base_rate_percent')
base_rate_summary = by_base_rate[['PA_adj', 'UA_adj']].mean().round(3)

print("\n不同基础率下的平均表现:")
print(base_rate_summary)

# Optimism bias: difference between the cross-validated PA / UA and their
# values re-weighted to a reference PFCL base rate of 1%.
ref_rate = 1.0
ref_results = results_df[results_df['pfcl_base_rate_percent'] == ref_rate]

if len(ref_results) > 0:
    # Computed column-wise instead of with iterrows(): iterrows() builds one
    # Series per row, which upcasts mixed int/float rows to a single dtype and
    # would turn integer ecoregion IDs into floats in the output table.
    ref = ref_results.reset_index(drop=True)
    pa_original = ref['PA_original']
    ua_original = ref['UA_original']
    bias_pa = pa_original - ref['PA_adj']
    bias_ua = ua_original - ref['UA_adj']

    bias_df = pd.DataFrame({
        'ecoregion': ref['ecoregion'],
        'PA_bias': bias_pa,
        'UA_bias': bias_ua,
        # Relative bias is defined as 0 where the original metric is not
        # positive, so no division-by-zero result reaches the table.
        'relative_PA_bias': (bias_pa / pa_original).where(pa_original > 0, 0.0),
        'relative_UA_bias': (bias_ua / ua_original).where(ua_original > 0, 0.0),
    })
    print("\n乐观偏差分析 (基于1%基础率):")
    print(bias_df.round(3))

    # Persist the optimism-bias table.
    bias_df.to_csv('/content/drive/MyDrive/optimism_bias_analysis.csv', index=False)

print("\n分析完成!")