# In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, confusion_matrix, cohen_kappa_score
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import os
import time
import joblib
import pickle
from matplotlib.lines import Line2D
from scipy import stats
warnings.filterwarnings('ignore')

# Load the merged sample table and discard rows that carry no ecoregion ID.
df = pd.read_csv('/content/drive/MyDrive/merged_data_by_year_生态区级 (2).csv').dropna(subset=['ID'])

# Store IDs as whitespace-stripped strings.
df['ID'] = df['ID'].astype(str).str.strip()

# Nationwide base rate: mean of the label column 'b1' over all remaining rows.
national_base_rate = df['b1'].mean()
print(f"全国PFCL基础率: {national_base_rate:.4%}")

print("数据形状:", df.shape)
print(f"生态区数量: {df['ID'].nunique()}")
print("\n标签分布:")
print(df['b1'].value_counts())
print(f"PFCL比例: {national_base_rate:.4%}")

# Target column, and every column that is neither coordinate/bookkeeping
# information nor the target itself (column order is preserved).
target_col = 'b1'
exclude_cols = ['lat', 'lon', 'year', 'ID', 'b1']
feature_cols = df.columns[~df.columns.isin(exclude_cols)].tolist()

print(f"\n特征数量: {len(feature_cols)}")
print(f"目标变量: {target_col}")

# Sorted list of the distinct ecoregion IDs.
all_ecoregions = sorted(set(df['ID']))
print(f"\n总生态区数量: {len(all_ecoregions)}")

# Area-adjustment helper: re-threshold probabilities by base rate and score them.
def calculate_area_adjusted_metrics(y_true, y_pred_proba, base_rate, original_threshold=0.5):
    """Compute accuracy metrics after re-thresholding by the class base rate.

    The positive-class probabilities are cut at a threshold derived from
    ``base_rate`` and ``original_threshold``; the resulting binary predictions
    are compared with ``y_true`` (entries equal to 1 are the positive class,
    everything else is negative).

    Args:
        y_true: Ground-truth labels (array-like, 0/1).
        y_pred_proba: Predicted probability of the positive class (array-like,
            same length as ``y_true``).
        base_rate: Proportion of the positive class used for the adjustment.
        original_threshold: Decision threshold the probabilities were
            originally meant for (default 0.5).

    Returns:
        dict with keys ``OA_adj``, ``F1_adj``, ``Kappa_adj``, ``PA_adj``
        (producer's accuracy / recall), ``UA_adj`` (user's accuracy /
        precision), ``adjusted_threshold`` and the confusion counts ``tn``,
        ``fp``, ``fn``, ``tp``. Ratios whose denominator is zero are
        reported as 0. The same keys are returned on every path.

    Raises:
        ValueError: If ``y_true`` and ``y_pred_proba`` differ in length.
    """
    labels = np.asarray(y_true).ravel()
    probabilities = np.asarray(y_pred_proba, dtype=float).ravel()
    if labels.shape != probabilities.shape:
        raise ValueError(
            f"y_true and y_pred_proba must have the same length "
            f"(got {labels.shape[0]} and {probabilities.shape[0]})"
        )

    # threshold = base_rate / (base_rate + (1 - base_rate) * (1 / original_threshold - 1))
    # NOTE(review): with original_threshold=0.5 this reduces to
    # adjusted_threshold == base_rate, i.e. a rarer class gets a LOWER cut-off.
    # Confirm this is the intended direction of the adjustment.
    adjusted_threshold = base_rate / (base_rate + (1 - base_rate) * (1 / original_threshold - 1))

    actual_positive = labels == 1
    predicted_positive = probabilities >= adjusted_threshold

    # Count the four confusion-matrix cells directly. This always yields a
    # full 2x2 table, including when only one class is present, so no
    # shape-dependent special cases are needed. Python ints avoid fixed-width
    # overflow in the products of the kappa computation below.
    tp = int(np.count_nonzero(actual_positive & predicted_positive))
    fp = int(np.count_nonzero(~actual_positive & predicted_positive))
    fn = int(np.count_nonzero(actual_positive & ~predicted_positive))
    tn = int(np.count_nonzero(~actual_positive & ~predicted_positive))
    total = tn + fp + fn + tp

    def _ratio(numerator, denominator):
        # Undefined ratios (zero denominator) are reported as 0.
        return numerator / denominator if denominator > 0 else 0

    oa_adj = _ratio(tn + tp, total)
    pa_adj = _ratio(tp, tp + fn)  # producer's accuracy (recall)
    ua_adj = _ratio(tp, tp + fp)  # user's accuracy (precision)
    f1_adj = _ratio(2 * (pa_adj * ua_adj), pa_adj + ua_adj)

    # Cohen's kappa: (observed agreement - chance agreement) / (1 - chance agreement).
    p_observed = oa_adj
    p_expected = _ratio((tn + fp) * (tn + fn) + (fn + tp) * (fp + tp), total * total)
    kappa_adj = _ratio(p_observed - p_expected, 1 - p_expected)

    return {
        'OA_adj': oa_adj,
        'F1_adj': f1_adj,
        'Kappa_adj': kappa_adj,
        'PA_adj': pa_adj,
        'UA_adj': ua_adj,
        'adjusted_threshold': adjusted_threshold,
        'tn': tn,
        'fp': fp,
        'fn': fn,
        'tp': tp
    }

# Phase 1: train and evaluate one independent model per ecoregion.
print(f"\n{'='*80}")
print("第一阶段：每个生态区独立建模")
print(f"{'='*80}")

# Directories for the fitted models and the held-out test sets.
model_dir = '/content/drive/MyDrive/ecoregion_models/'
testset_dir = '/content/drive/MyDrive/ecoregion_testsets/'
os.makedirs(model_dir, exist_ok=True)
os.makedirs(testset_dir, exist_ok=True)

# One result record per successfully modelled ecoregion.
phase1_results = []

# Deliberately small random-forest configuration.
rf_params = {
    'n_estimators': 50,
    'class_weight': 'balanced',
    'random_state': 42,
    'n_jobs': 2,
    'max_depth': 10,
    'min_samples_split': 20
}

start_time_phase1 = time.time()

for i, ecoregion in enumerate(all_ecoregions):
    print(f"\n处理生态区 {i+1}/{len(all_ecoregions)}: {ecoregion}")

    # Rows belonging to this ecoregion.
    eco_data = df[df['ID'] == ecoregion].copy()

    if len(eco_data) < 50:  # too few samples to model
        print(f"  跳过: 数据量不足 ({len(eco_data)} 个样本)")
        continue

    # Base rate of the positive class within this ecoregion (all rows).
    eco_base_rate = eco_data['b1'].mean()

    # 80% / 20% train / test split.
    X = eco_data[feature_cols].values
    y = eco_data[target_col].values

    # Prefer a stratified split so both classes are represented on each side.
    try:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )
    except ValueError:
        # train_test_split raises ValueError when stratification is not
        # possible (e.g. a class with a single member); fall back to a plain
        # random split. Any other exception is a real error and propagates.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

    # A training set with a single class yields a model whose predict_proba
    # has only one column, so the positive-class column read below would not
    # exist. Such an ecoregion cannot be modelled as a binary problem.
    if np.unique(y_train).size < 2:
        print(f"  跳过: 训练集只包含一个类别，无法训练二分类模型")
        continue

    print(f"  训练集: {len(X_train)} 样本, PFCL比例: {y_train.mean():.2%}")
    print(f"  测试集: {len(X_test)} 样本, PFCL比例: {y_test.mean():.2%}")
    print(f"  生态区基础率: {eco_base_rate:.4%}")

    # Fit the model.
    model = RandomForestClassifier(**rf_params)
    model.fit(X_train, y_train)

    # Evaluate on the local held-out test set.
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]

    # Unadjusted metrics from the model's own predictions.
    oa_local = accuracy_score(y_test, y_pred)
    f1_local = f1_score(y_test, y_pred, zero_division=0)
    pa_local = recall_score(y_test, y_pred, zero_division=0)
    ua_local = precision_score(y_test, y_pred, zero_division=0)
    kappa_local = cohen_kappa_score(y_test, y_pred)

    # Area-adjusted metrics using this ecoregion's base rate.
    adj_metrics = calculate_area_adjusted_metrics(y_test, y_pred_proba, eco_base_rate, original_threshold=0.5)

    # Persist the model.
    model_path = os.path.join(model_dir, f"model_{ecoregion}.pkl")
    joblib.dump(model, model_path)

    # Persist the test set together with the feature list and base rate.
    testset_path = os.path.join(testset_dir, f"testset_{ecoregion}.pkl")
    testset_data = {
        'X_test': X_test,
        'y_test': y_test,
        'feature_cols': feature_cols,
        'base_rate': eco_base_rate
    }
    with open(testset_path, 'wb') as f:
        pickle.dump(testset_data, f)

    # Record the results.
    result = {
        'ecoregion': ecoregion,
        'n_train': len(X_train),
        'n_test': len(X_test),
        'base_rate': eco_base_rate,

        # Unadjusted metrics
        'OA_local': oa_local,
        'F1_local': f1_local,
        'PA_local': pa_local,
        'UA_local': ua_local,
        'Kappa_local': kappa_local,

        # Area-adjusted metrics
        'OA_local_adj': adj_metrics['OA_adj'],
        'F1_local_adj': adj_metrics['F1_adj'],
        'PA_local_adj': adj_metrics['PA_adj'],
        'UA_local_adj': adj_metrics['UA_adj'],
        'Kappa_local_adj': adj_metrics['Kappa_adj'],

        'model_path': model_path,
        'testset_path': testset_path
    }
    phase1_results.append(result)

    print(f"  本地原始性能: OA={oa_local:.3f}, F1={f1_local:.3f}, PA={pa_local:.3f}, UA={ua_local:.3f}")
    print(f"  本地面积调整后: OA_adj={adj_metrics['OA_adj']:.3f}, F1_adj={adj_metrics['F1_adj']:.3f}")

# Save the phase-1 result table.
phase1_df = pd.DataFrame(phase1_results)
phase1_output = '/content/drive/MyDrive/phase1_local_models_results.csv'
phase1_df.to_csv(phase1_output, index=False)
print(f"\n第一阶段完成! 结果保存到: {phase1_output}")
print(f"成功建模的生态区数量: {len(phase1_df)}")

# Phase 2: apply every ecoregion's model to every other ecoregion's test set.
print(f"\n{'='*80}")
print("第二阶段：跨生态区迁移验证")
print(f"{'='*80}")

phase2_results = []

start_time_phase2 = time.time()

# Ecoregions that were modelled in phase 1. An empty result table has no
# columns at all, so guard the column access.
valid_ecoregions = phase1_df['ecoregion'].tolist() if not phase1_df.empty else []

# Load every source model once, instead of re-reading it from disk for every
# (target, source) pair.
source_models = {}
if valid_ecoregions:
    for eco_id, saved_model_path in zip(phase1_df['ecoregion'], phase1_df['model_path']):
        source_models[eco_id] = joblib.load(saved_model_path)

# Pooled transfer predictions across all (target, source) pairs, consumed by
# the national-scale evaluation.
all_transfer_predictions = []
all_transfer_labels = []
all_transfer_probas = []

for i, target_ecoregion in enumerate(valid_ecoregions):
    print(f"\n处理目标生态区 {i+1}/{len(valid_ecoregions)}: {target_ecoregion}")

    # Load the target ecoregion's held-out test set.
    # NOTE: pickle is only safe for trusted files; these are the files written
    # by phase 1 of this script.
    target_testset_path = phase1_df.loc[phase1_df['ecoregion'] == target_ecoregion, 'testset_path'].values[0]
    with open(target_testset_path, 'rb') as f:
        target_testset = pickle.load(f)

    X_target_test = target_testset['X_test']
    y_target_test = target_testset['y_test']
    target_base_rate = target_testset['base_rate']

    # Local (own-model) performance of the target ecoregion.
    local_perf = phase1_df.loc[phase1_df['ecoregion'] == target_ecoregion,
                              ['OA_local', 'F1_local', 'PA_local', 'UA_local', 'Kappa_local',
                               'OA_local_adj', 'F1_local_adj', 'PA_local_adj', 'UA_local_adj', 'Kappa_local_adj']].iloc[0]

    # Per-source-model performance on this target.
    cross_performances = []
    cross_performances_adj = []
    source_models_used = []

    # Evaluate every other ecoregion's model on this target's test set.
    for source_ecoregion in valid_ecoregions:
        if source_ecoregion == target_ecoregion:
            continue  # skip the target's own model

        source_model = source_models[source_ecoregion]

        # Predict the target test set with the source model.
        y_pred_cross = source_model.predict(X_target_test)
        y_pred_proba_cross = source_model.predict_proba(X_target_test)[:, 1]

        # Unadjusted metrics.
        oa_cross = accuracy_score(y_target_test, y_pred_cross)
        f1_cross = f1_score(y_target_test, y_pred_cross, zero_division=0)
        pa_cross = recall_score(y_target_test, y_pred_cross, zero_division=0)
        ua_cross = precision_score(y_target_test, y_pred_cross, zero_division=0)
        kappa_cross = cohen_kappa_score(y_target_test, y_pred_cross)

        # Area-adjusted metrics using the target's base rate.
        cross_adj_metrics = calculate_area_adjusted_metrics(y_target_test, y_pred_proba_cross, target_base_rate, original_threshold=0.5)

        cross_perf = {
            'source_ecoregion': source_ecoregion,
            'OA_cross': oa_cross,
            'F1_cross': f1_cross,
            'PA_cross': pa_cross,
            'UA_cross': ua_cross,
            'Kappa_cross': kappa_cross,
            'OA_cross_adj': cross_adj_metrics['OA_adj'],
            'F1_cross_adj': cross_adj_metrics['F1_adj'],
            'PA_cross_adj': cross_adj_metrics['PA_adj'],
            'UA_cross_adj': cross_adj_metrics['UA_adj'],
            'Kappa_cross_adj': cross_adj_metrics['Kappa_adj']
        }
        cross_performances.append(cross_perf)
        source_models_used.append(source_ecoregion)

        # Pool this pair's predictions (vectorised; the lists stay flat).
        all_transfer_predictions.extend(y_pred_proba_cross >= 0.5)
        all_transfer_probas.extend(y_pred_proba_cross)
        all_transfer_labels.extend(y_target_test)

    # With a single valid ecoregion there is no source model to compare
    # against; the averages below would be undefined.
    if not cross_performances:
        print(f"  跳过: 没有可用的其他生态区模型")
        continue

    # Average cross-ecoregion performance for this target.
    cross_perf_df = pd.DataFrame(cross_performances)

    # Means of the unadjusted metrics.
    oa_cross_mean = cross_perf_df['OA_cross'].mean()
    f1_cross_mean = cross_perf_df['F1_cross'].mean()
    pa_cross_mean = cross_perf_df['PA_cross'].mean()
    ua_cross_mean = cross_perf_df['UA_cross'].mean()
    kappa_cross_mean = cross_perf_df['Kappa_cross'].mean()

    # Means of the area-adjusted metrics.
    oa_cross_mean_adj = cross_perf_df['OA_cross_adj'].mean()
    f1_cross_mean_adj = cross_perf_df['F1_cross_adj'].mean()
    pa_cross_mean_adj = cross_perf_df['PA_cross_adj'].mean()
    ua_cross_mean_adj = cross_perf_df['UA_cross_adj'].mean()
    kappa_cross_mean_adj = cross_perf_df['Kappa_cross_adj'].mean()

    # Performance drop: local minus cross-ecoregion mean (unadjusted).
    oa_decline = local_perf['OA_local'] - oa_cross_mean
    f1_decline = local_perf['F1_local'] - f1_cross_mean
    pa_decline = local_perf['PA_local'] - pa_cross_mean
    ua_decline = local_perf['UA_local'] - ua_cross_mean
    kappa_decline = local_perf['Kappa_local'] - kappa_cross_mean

    # Performance drop: local minus cross-ecoregion mean (area-adjusted).
    oa_decline_adj = local_perf['OA_local_adj'] - oa_cross_mean_adj
    f1_decline_adj = local_perf['F1_local_adj'] - f1_cross_mean_adj
    pa_decline_adj = local_perf['PA_local_adj'] - pa_cross_mean_adj
    ua_decline_adj = local_perf['UA_local_adj'] - ua_cross_mean_adj
    kappa_decline_adj = local_perf['Kappa_local_adj'] - kappa_cross_mean_adj

    # Number / share of source models that beat the local model on F1.
    better_models_f1 = (cross_perf_df['F1_cross'] > local_perf['F1_local']).sum()
    better_models_ratio_f1 = better_models_f1 / len(cross_perf_df)

    # Same comparison on the area-adjusted F1.
    better_models_f1_adj = (cross_perf_df['F1_cross_adj'] > local_perf['F1_local_adj']).sum()
    better_models_ratio_f1_adj = better_models_f1_adj / len(cross_perf_df)

    # Record the results.
    result = {
        'target_ecoregion': target_ecoregion,
        'n_test_samples': len(X_target_test),
        'target_base_rate': target_base_rate,

        # Local unadjusted performance
        'OA_local': local_perf['OA_local'],
        'F1_local': local_perf['F1_local'],
        'PA_local': local_perf['PA_local'],
        'UA_local': local_perf['UA_local'],
        'Kappa_local': local_perf['Kappa_local'],

        # Local area-adjusted performance
        'OA_local_adj': local_perf['OA_local_adj'],
        'F1_local_adj': local_perf['F1_local_adj'],
        'PA_local_adj': local_perf['PA_local_adj'],
        'UA_local_adj': local_perf['UA_local_adj'],
        'Kappa_local_adj': local_perf['Kappa_local_adj'],

        # Cross-ecoregion mean, unadjusted
        'OA_cross_mean': oa_cross_mean,
        'F1_cross_mean': f1_cross_mean,
        'PA_cross_mean': pa_cross_mean,
        'UA_cross_mean': ua_cross_mean,
        'Kappa_cross_mean': kappa_cross_mean,

        # Cross-ecoregion mean, area-adjusted
        'OA_cross_mean_adj': oa_cross_mean_adj,
        'F1_cross_mean_adj': f1_cross_mean_adj,
        'PA_cross_mean_adj': pa_cross_mean_adj,
        'UA_cross_mean_adj': ua_cross_mean_adj,
        'Kappa_cross_mean_adj': kappa_cross_mean_adj,

        # Drop, unadjusted
        'OA_decline': oa_decline,
        'F1_decline': f1_decline,
        'PA_decline': pa_decline,
        'UA_decline': ua_decline,
        'Kappa_decline': kappa_decline,

        # Drop, area-adjusted
        'OA_decline_adj': oa_decline_adj,
        'F1_decline_adj': f1_decline_adj,
        'PA_decline_adj': pa_decline_adj,
        'UA_decline_adj': ua_decline_adj,
        'Kappa_decline_adj': kappa_decline_adj,

        # Bookkeeping
        'n_source_models': len(cross_perf_df),
        'n_better_models_f1': better_models_f1,
        'better_models_ratio_f1': better_models_ratio_f1,
        'n_better_models_f1_adj': better_models_f1_adj,
        'better_models_ratio_f1_adj': better_models_ratio_f1_adj,
        'source_models': ','.join(source_models_used)
    }
    phase2_results.append(result)

    print(f"  本地F1: {local_perf['F1_local']:.3f}, 跨区平均F1: {f1_cross_mean:.3f}")
    print(f"  本地F1_adj: {local_perf['F1_local_adj']:.3f}, 跨区平均F1_adj: {f1_cross_mean_adj:.3f}")
    print(f"  F1下降: {f1_decline:.3f} (原始), {f1_decline_adj:.3f} (面积调整后)")
    print(f"  有 {better_models_f1} 个源模型性能优于本地模型 ({better_models_ratio_f1:.1%})")

# Save the phase-2 result table.
phase2_df = pd.DataFrame(phase2_results)
phase2_output = '/content/drive/MyDrive/phase2_cross_ecoregion_results.csv'
phase2_df.to_csv(phase2_output, index=False)

print(f"\n第二阶段完成! 结果保存到: {phase2_output}")

# Phase 3: analysis and visualisation.
print(f"\n{'='*80}")
print("第三阶段：结果分析与可视化")
print(f"{'='*80}")

# National-scale metrics over the pooled transfer predictions.
print(f"\n计算全国尺度的迁移模型面积调整指标...")

# Defaults so that later report code can always reference these names, even
# when no transfer predictions were collected.
oa_transfer_original = np.nan
f1_transfer_original = np.nan
pa_transfer_original = np.nan
ua_transfer_original = np.nan
kappa_transfer_original = np.nan
transfer_summary_path = None

if len(all_transfer_labels) > 0:
    all_transfer_labels = np.array(all_transfer_labels)
    all_transfer_probas = np.array(all_transfer_probas)
    # The pooled predictions are booleans (probability >= 0.5); cast them to
    # integers so they have the same kind of values as the labels.
    all_transfer_predictions = np.array(all_transfer_predictions).astype(int)

    # Unadjusted metrics of the pooled transfer predictions.
    oa_transfer_original = accuracy_score(all_transfer_labels, all_transfer_predictions)
    f1_transfer_original = f1_score(all_transfer_labels, all_transfer_predictions, zero_division=0)
    pa_transfer_original = recall_score(all_transfer_labels, all_transfer_predictions, zero_division=0)
    ua_transfer_original = precision_score(all_transfer_labels, all_transfer_predictions, zero_division=0)
    kappa_transfer_original = cohen_kappa_score(all_transfer_labels, all_transfer_predictions)

    # Area-adjusted metrics using the nationwide base rate.
    transfer_adj_metrics = calculate_area_adjusted_metrics(
        all_transfer_labels, all_transfer_probas, national_base_rate, original_threshold=0.5
    )

    print(f"\n全国尺度迁移模型性能:")
    print(f"  原始指标:")
    print(f"    OA_transfer_original: {oa_transfer_original:.4f}")
    print(f"    F1_transfer_original: {f1_transfer_original:.4f}")
    print(f"    Kappa_transfer_original: {kappa_transfer_original:.4f}")
    print(f"    PA_transfer_original: {pa_transfer_original:.4f}")
    print(f"    UA_transfer_original: {ua_transfer_original:.4f}")

    print(f"\n  面积调整后指标:")
    print(f"    OA_transfer_adj: {transfer_adj_metrics['OA_adj']:.4f}")
    print(f"    F1_transfer_adj: {transfer_adj_metrics['F1_adj']:.4f}")
    print(f"    Kappa_transfer_adj: {transfer_adj_metrics['Kappa_adj']:.4f}")
    print(f"    PA_transfer_adj: {transfer_adj_metrics['PA_adj']:.4f}")
    print(f"    UA_transfer_adj: {transfer_adj_metrics['UA_adj']:.4f}")
    print(f"    调整阈值: {transfer_adj_metrics['adjusted_threshold']:.6f}")

    # Save the national-scale summary as a one-row CSV.
    transfer_summary = {
        'OA_transfer_original': oa_transfer_original,
        'F1_transfer_original': f1_transfer_original,
        'Kappa_transfer_original': kappa_transfer_original,
        'PA_transfer_original': pa_transfer_original,
        'UA_transfer_original': ua_transfer_original,
        'OA_transfer_adj': transfer_adj_metrics['OA_adj'],
        'F1_transfer_adj': transfer_adj_metrics['F1_adj'],
        'Kappa_transfer_adj': transfer_adj_metrics['Kappa_adj'],
        'PA_transfer_adj': transfer_adj_metrics['PA_adj'],
        'UA_transfer_adj': transfer_adj_metrics['UA_adj'],
        'national_base_rate': national_base_rate,
        'adjusted_threshold': transfer_adj_metrics['adjusted_threshold'],
        'n_test_samples': len(all_transfer_labels)
    }

    transfer_summary_df = pd.DataFrame([transfer_summary])
    transfer_summary_path = '/content/drive/MyDrive/transfer_model_area_adjusted_summary.csv'
    transfer_summary_df.to_csv(transfer_summary_path, index=False)
    print(f"\n迁移模型全国尺度结果已保存到: {transfer_summary_path}")
else:
    print(f"  警告: 没有足够的迁移预测数据")
    # No pooled predictions: every adjusted metric is reported as NaN.
    transfer_adj_metrics = {
        'OA_adj': np.nan,
        'F1_adj': np.nan,
        'Kappa_adj': np.nan,
        'PA_adj': np.nan,
        'UA_adj': np.nan,
        'adjusted_threshold': np.nan
    }

# Overall transfer-performance summary across all target ecoregions.
print("\n整体迁移性能分析:")
print(f"评估的生态区数量: {len(phase2_df)}")

# Mean local-minus-cross drop per metric, unadjusted and area-adjusted.
mean_f1_decline, mean_oa_decline, mean_pa_decline, mean_ua_decline = (
    phase2_df[f'{metric}_decline'].mean() for metric in ('F1', 'OA', 'PA', 'UA')
)
mean_f1_decline_adj, mean_oa_decline_adj, mean_pa_decline_adj, mean_ua_decline_adj = (
    phase2_df[f'{metric}_decline_adj'].mean() for metric in ('F1', 'OA', 'PA', 'UA')
)

for heading, mean_drops in (
    ("\n平均性能下降 (原始指标):",
     (mean_f1_decline, mean_oa_decline, mean_pa_decline, mean_ua_decline)),
    ("\n平均性能下降 (面积调整后指标):",
     (mean_f1_decline_adj, mean_oa_decline_adj, mean_pa_decline_adj, mean_ua_decline_adj)),
):
    print(heading)
    for metric, mean_drop in zip(('F1', 'OA', 'PA', 'UA'), mean_drops):
        print(f"  {metric}平均下降: {mean_drop:.3f}")

# Count and share of ecoregions whose F1 drop exceeds 0.1.
n_targets = len(phase2_df)
significant_decline_f1 = phase2_df['F1_decline'].gt(0.1).sum()
significant_decline_ratio_f1 = significant_decline_f1 / n_targets

significant_decline_f1_adj = phase2_df['F1_decline_adj'].gt(0.1).sum()
significant_decline_ratio_f1_adj = significant_decline_f1_adj / n_targets

print(f"\n迁移退化统计:")
print(f"  原始指标 - F1下降 > 0.1 的生态区: {significant_decline_f1} ({significant_decline_ratio_f1:.1%})")
print(f"  面积调整后 - F1下降 > 0.1 的生态区: {significant_decline_f1_adj} ({significant_decline_ratio_f1_adj:.1%})")

# Five largest drops (worst transfer) and five smallest or negative drops
# (best transfer), for both metric variants.
top_worst_original = phase2_df.nlargest(5, 'F1_decline')
top_worst_adj = phase2_df.nlargest(5, 'F1_decline_adj')
top_best_original = phase2_df.nsmallest(5, 'F1_decline')
top_best_adj = phase2_df.nsmallest(5, 'F1_decline_adj')

for heading, ranked, decline_col in (
    ("\nTop-5 迁移退化最严重的生态区 (原始指标):", top_worst_original, 'F1_decline'),
    ("\nTop-5 迁移退化最严重的生态区 (面积调整后):", top_worst_adj, 'F1_decline_adj'),
    ("\nTop-5 迁移性能最好的生态区 (原始指标):", top_best_original, 'F1_decline'),
    ("\nTop-5 迁移性能最好的生态区 (面积调整后):", top_best_adj, 'F1_decline_adj'),
):
    print(heading)
    for eco_name, f1_drop in zip(ranked['target_ecoregion'], ranked[decline_col]):
        print(f"  {eco_name}: F1下降 = {f1_drop:.3f}")

# Build the figures.

# Figure 1: local F1 vs. cross-ecoregion mean F1 (unadjusted and area-adjusted).
fig, axes = plt.subplots(1, 2, figsize=(18, 8))

# Left panel: unadjusted metrics.
ax1 = axes[0]
scatter_original = ax1.scatter(phase2_df['F1_local'], phase2_df['F1_cross_mean'],
                               c=phase2_df['F1_decline'], cmap='RdYlBu_r', s=80, alpha=0.8, edgecolor='black')
ax1.plot([0, 1], [0, 1], 'k--', alpha=0.5, label='y = x (无退化)')
ax1.set_xlabel('F1 (本地模型)', fontsize=12, fontweight='bold')
ax1.set_ylabel('F1 (跨区平均)', fontsize=12, fontweight='bold')
ax1.set_title('原始指标: 本地模型 vs 跨区迁移平均性能', fontsize=14, fontweight='bold')
# Axes objects have no colorbar() method; the colorbar is created on the
# figure and attached to this panel via ax=.
cbar_original = fig.colorbar(scatter_original, ax=ax1)
cbar_original.set_label('F1下降程度')
ax1.grid(True, alpha=0.3)
ax1.legend()

# Right panel: area-adjusted metrics.
ax2 = axes[1]
scatter = ax2.scatter(phase2_df['F1_local_adj'], phase2_df['F1_cross_mean_adj'],
                      c=phase2_df['F1_decline_adj'], cmap='RdYlBu_r', s=80, alpha=0.8, edgecolor='black')
ax2.plot([0, 1], [0, 1], 'k--', alpha=0.5, label='y = x (无退化)')
ax2.set_xlabel('F1_adj (本地模型)', fontsize=12, fontweight='bold')
ax2.set_ylabel('F1_adj (跨区平均)', fontsize=12, fontweight='bold')
ax2.set_title('面积调整后: 本地模型 vs 跨区迁移平均性能', fontsize=14, fontweight='bold')
cbar = fig.colorbar(scatter, ax=ax2)
cbar.set_label('F1下降程度 (面积调整后)', fontsize=10)
ax2.grid(True, alpha=0.3)
ax2.legend()

plt.tight_layout()
plt.savefig('/content/drive/MyDrive/fig_local_vs_cross_f1_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

# Figure 2: histograms of the F1 drop (unadjusted on top, area-adjusted below).
plt.figure(figsize=(14, 10))

# (column, mean drop, bar colour, x-axis label, panel title) per panel.
histogram_panels = (
    ('F1_decline', mean_f1_decline, 'steelblue',
     'F1下降 (本地 - 跨区平均)', '原始指标: 跨生态区迁移性能下降分布'),
    ('F1_decline_adj', mean_f1_decline_adj, 'forestgreen',
     'F1下降 (面积调整后)', '面积调整后: 跨生态区迁移性能下降分布'),
)

for panel_no, (decline_col, mean_drop, bar_color, x_label, panel_title) in enumerate(histogram_panels, start=1):
    plt.subplot(2, 1, panel_no)
    plt.hist(phase2_df[decline_col], bins=20, color=bar_color, edgecolor='black', alpha=0.7)
    # Dashed red line marks the mean drop; solid black line marks zero.
    plt.axvline(x=mean_drop, color='red', linestyle='--', linewidth=2,
                label=f'平均值 = {mean_drop:.3f}')
    plt.axvline(x=0, color='black', linestyle='-', linewidth=1, alpha=0.7)
    plt.xlabel(x_label, fontsize=12, fontweight='bold')
    plt.ylabel('生态区数量', fontsize=12, fontweight='bold')
    plt.title(panel_title, fontsize=14, fontweight='bold')
    plt.legend()
    plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('/content/drive/MyDrive/fig_f1_decline_distribution_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

# Figure 3: local UA vs. cross-ecoregion mean UA (unadjusted left, area-adjusted right).
plt.figure(figsize=(14, 6))

# (x column, y column, marker colour, legend label, x label, y label, title) per panel.
ua_panels = (
    ('UA_local', 'UA_cross_mean', 'blue', '原始UA',
     '本地模型 UA', '跨区平均 UA', '原始 UA 对比'),
    ('UA_local_adj', 'UA_cross_mean_adj', 'green', '面积调整后UA',
     '本地模型 UA_adj', '跨区平均 UA_adj', '面积调整后 UA 对比'),
)

for panel_no, (x_col, y_col, marker_color, legend_label, x_label, y_label, panel_title) in enumerate(ua_panels, start=1):
    plt.subplot(1, 2, panel_no)
    plt.scatter(phase2_df[x_col], phase2_df[y_col],
                c=marker_color, s=60, alpha=0.6, edgecolor='black', label=legend_label)
    # Diagonal reference line y = x.
    plt.plot([0, 1], [0, 1], 'k--', alpha=0.5, label='y = x')
    plt.xlabel(x_label, fontsize=12, fontweight='bold')
    plt.ylabel(y_label, fontsize=12, fontweight='bold')
    plt.title(panel_title, fontsize=14, fontweight='bold')
    plt.legend()
    plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('/content/drive/MyDrive/fig_UA_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

# Print the final analysis report to the console.
print("\n" + "=" * 80)
print("最终分析报告")
print("=" * 80)

# Section 1: national-scale performance of the pooled transfer predictions.
n_transfer_samples = len(all_transfer_labels)
print(f"\n1. 全国尺度迁移模型性能汇总:")
print(f"   全国PFCL基础率: {national_base_rate:.4%}")
print(f"   测试样本总数: {n_transfer_samples}")

if n_transfer_samples > 0:
    print(f"\n   迁移模型原始指标:")
    for metric_name, metric_value in (
        ('OA', oa_transfer_original),
        ('F1', f1_transfer_original),
        ('Kappa', kappa_transfer_original),
        ('PA', pa_transfer_original),
        ('UA', ua_transfer_original),
    ):
        print(f"     {metric_name}_transfer_original: {metric_value:.4f}")

    print(f"\n   迁移模型面积调整后指标:")
    for metric_name in ('OA', 'F1', 'Kappa', 'PA', 'UA'):
        print(f"     {metric_name}_transfer_adj: {transfer_adj_metrics[metric_name + '_adj']:.4f}")
    print(f"     调整阈值: {transfer_adj_metrics['adjusted_threshold']:.6f}")

# Section 2: verdict based on the mean area-adjusted F1 drop (cut-off 0.05).
print(f"\n2. 生态分区必要性验证:")
if mean_f1_decline_adj > 0.05:
    verdict_lines = (
        f"   ✓ 验证通过: 平均F1下降 {mean_f1_decline_adj:.3f} > 0.05",
        f"   ✓ 说明: 跨生态区模型迁移普遍存在性能下降",
        f"   ✓ 结论: 生态分区建模具有必要性",
    )
else:
    verdict_lines = (
        f"   ✗ 验证未通过: 平均F1下降 {mean_f1_decline_adj:.3f} ≤ 0.05",
        f"   ✗ 说明: 跨生态区模型迁移性能下降不明显",
        f"   ✗ 结论: 可能需要重新考虑生态分区的必要性",
    )
for verdict_line in verdict_lines:
    print(verdict_line)

# Section 3: compare the mean UA drop before and after area adjustment.
print(f"\n3. 面积调整的影响分析:")
print(f"   原始UA平均下降: {mean_ua_decline:.3f}")
print(f"   面积调整后UA平均下降: {mean_ua_decline_adj:.3f}")
print(f"   原始UA与面积调整后UA差异: {abs(mean_ua_decline - mean_ua_decline_adj):.3f}")

print(
    f"   ✓ 面积调整后UA下降更大，表明迁移模型在真实基础率下表现更差"
    if mean_ua_decline_adj > mean_ua_decline
    else f"   ✓ 面积调整后UA下降较小，表明迁移模型对基础率变化不敏感"
)

# Section 4: key findings from the area-adjusted rankings.
print(f"\n4. 关键发现:")
print(f"   a. {significant_decline_ratio_f1_adj:.1%} 的生态区在面积调整后表现出显著迁移退化 (F1下降 > 0.1)")
easiest_target = top_best_adj.iloc[0]
print(f"   b. 最易迁移的生态区: {easiest_target['target_ecoregion']} (F1下降 = {easiest_target['F1_decline_adj']:.3f})")
hardest_target = top_worst_adj.iloc[0]
print(f"   c. 最不易迁移的生态区: {hardest_target['target_ecoregion']} (F1下降 = {hardest_target['F1_decline_adj']:.3f})")

# Section 5: recommendation tiers keyed on the mean area-adjusted F1 drop.
print(f"\n5. 管理建议:")
if mean_f1_decline_adj > 0.1:
    advice_lines = (
        "   ✓ 强烈建议采用生态分区特异性建模",
        "   ✓ 为每个生态区开发定制化模型",
        "   ✓ 限制模型跨区应用",
    )
elif mean_f1_decline_adj > 0.05:
    advice_lines = (
        "   ✓ 建议采用生态分区特异性建模",
        "   ✓ 可考虑区域化建模 (相邻生态区合并)",
        "   ✓ 跨区应用时需谨慎验证",
    )
else:
    advice_lines = (
        "   ✓ 可考虑通用模型开发",
        "   ✓ 生态分区对模型性能影响有限",
        "   ✓ 可探索更大范围的模型迁移",
    )
for advice_line in advice_lines:
    print(advice_line)

print("\n" + "=" * 80)
print("处理完成!")
print("=" * 80)

# Write the full summary report to a text file and list all output files.
report_path = '/content/drive/MyDrive/cross_ecoregion_analysis_report_with_area_adjustment.txt'
n_transfer_samples = len(all_transfer_labels)


def _format_f1_ranking(ranked_df, limit=3):
    """Return up to ``limit`` numbered 'ecoregion: F1 drop' lines from ``ranked_df``.

    Uses however many rows are available, so fewer than ``limit`` evaluated
    ecoregions do not raise an IndexError.
    """
    top_rows = ranked_df.head(limit)
    return "\n".join(
        f"{rank}. {eco_name}: F1下降 = {f1_drop:.3f}"
        for rank, (eco_name, f1_drop) in enumerate(
            zip(top_rows['target_ecoregion'], top_rows['F1_decline_adj']), start=1
        )
    )


# The national-scale metrics only exist when transfer predictions were
# collected; otherwise say so instead of referencing undefined names.
if n_transfer_samples > 0:
    national_section = "\n".join([
        f"1. OA_transfer_original: {oa_transfer_original:.4f} -> OA_transfer_adj: {transfer_adj_metrics['OA_adj']:.4f}",
        f"2. F1_transfer_original: {f1_transfer_original:.4f} -> F1_transfer_adj: {transfer_adj_metrics['F1_adj']:.4f}",
        f"3. Kappa_transfer_original: {kappa_transfer_original:.4f} -> Kappa_transfer_adj: {transfer_adj_metrics['Kappa_adj']:.4f}",
        f"4. PA_transfer_original: {pa_transfer_original:.4f} -> PA_transfer_adj: {transfer_adj_metrics['PA_adj']:.4f}",
        f"5. UA_transfer_original: {ua_transfer_original:.4f} -> UA_transfer_adj: {transfer_adj_metrics['UA_adj']:.4f}",
        f"6. 调整阈值: {transfer_adj_metrics.get('adjusted_threshold', np.nan):.6f}",
    ])
else:
    national_section = "无可用的迁移预测数据，未计算全国尺度指标。"

summary_report = f"""
跨生态区迁移验证分析报告（包含面积调整）
========================================

分析时间: {time.strftime('%Y-%m-%d %H:%M:%S')}
分析的生态区数量: {len(phase2_df)}
全国PFCL基础率: {national_base_rate:.4%}
总测试样本量: {n_transfer_samples}

关键指标汇总:
1. 平均F1下降 (原始): {mean_f1_decline:.3f}
2. 平均F1下降 (面积调整后): {mean_f1_decline_adj:.3f}
3. 平均UA下降 (原始): {mean_ua_decline:.3f}
4. 平均UA下降 (面积调整后): {mean_ua_decline_adj:.3f}

全国尺度迁移模型性能:
{national_section}

迁移退化统计:
1. 原始指标 - F1下降 > 0.1 的生态区: {significant_decline_f1} ({significant_decline_ratio_f1:.1%})
2. 面积调整后 - F1下降 > 0.1 的生态区: {significant_decline_f1_adj} ({significant_decline_ratio_f1_adj:.1%})

生态分区必要性结论:
平均F1下降 (面积调整后) {mean_f1_decline_adj:.3f} {'>' if mean_f1_decline_adj > 0.05 else '≤'} 0.05
因此，生态分区建模 {'具有' if mean_f1_decline_adj > 0.05 else '缺乏明显的'} 必要性。

面积调整的影响:
- UA (用户精度) 在面积调整后通常变化最大
- 原始UA与面积调整后UA差异: {abs(mean_ua_decline - mean_ua_decline_adj):.3f}
- {'面积调整加剧了' if mean_ua_decline_adj > mean_ua_decline else '面积调整缓解了'} 迁移性能下降

Top-3 最不易迁移的生态区 (面积调整后):
{_format_f1_ranking(top_worst_adj)}

Top-3 最易迁移的生态区 (面积调整后):
{_format_f1_ranking(top_best_adj)}
"""

# The report contains non-ASCII text; write it with an explicit encoding
# rather than relying on the platform default.
with open(report_path, 'w', encoding='utf-8') as f:
    f.write(summary_report)

print(f"\n分析报告已保存到: {report_path}")
print(f"所有结果文件:")
print(f"1. {phase1_output} - 第一阶段结果")
print(f"2. {phase2_output} - 第二阶段结果")
if n_transfer_samples > 0:
    print(f"3. {transfer_summary_path} - 全国尺度迁移模型结果")
else:
    # The national-scale CSV is only written when transfer predictions exist.
    print(f"3. (未生成) - 全国尺度迁移模型结果")
print(f"4. /content/drive/MyDrive/ecoregion_models/ - 所有生态区模型")
print(f"5. /content/drive/MyDrive/ecoregion_testsets/ - 所有生态区测试集")
print(f"6. 3张可视化图表")