In [None]:
# ==================== Part 3: Leave-One-Ecoregion Cross-Validation ====================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')

# Install necessary libraries
!pip install -q scikit-learn xgboost

print("Leave-One-Ecoregion Cross-Validation Analysis")
print("="*60)

# Load real data from Google Drive (Colab only); on ANY failure fall back to
# randomly generated sample data so the rest of the cell can still run.
print("Loading real data...")
try:
    from google.colab import drive
    drive.mount('/content/drive')
    # NOTE(review): the file name contains a ')' right before '.csv' —
    # confirm this matches the actual file name on Drive.
    data_path = '/content/drive/MyDrive/merged_data_by_year).csv'
    df = pd.read_csv(data_path)
    print(f"Data loaded successfully: {df.shape[0]} rows, {df.shape[1]} columns")

    # Handle missing values: DELETE rows with missing values.
    # dropna() with no arguments removes a row if ANY column is missing,
    # not only the columns later used as features.
    print("\nHandling missing values...")
    original_shape = df.shape
    df = df.dropna()
    rows_removed = original_shape[0] - df.shape[0]
    print(f"  Rows with missing values removed: {rows_removed}")
    print(f"  New data shape: {df.shape[0]} rows, {df.shape[1]} columns")

# Broad catch: an import error (not running in Colab), a missing file, or a
# failure while dropping rows all end up here. Everything downstream then runs
# on the synthetic data below; the only indication is the printed message.
except Exception as e:
    print(f"Error loading real data: {e}")
    # Use sample data for demonstration
    print("Using sample data for demonstration...")
    np.random.seed(42)  # fixed seed so the sample data is reproducible
    n_samples = 5000

    # Create sample data for 6 ecoregions
    ecoregions = ['I01', 'I04', 'I07', 'I21', 'I25', 'I28']
    data = []

    for i, region in enumerate(ecoregions):
        # Integer division: 5000 // 6 = 833 rows per region (4998 rows total)
        n_region = n_samples // len(ecoregions)

        # Feature ranges are shifted by the region index i so each region has
        # a different distribution. The label 'b1' is drawn independently of
        # every feature (30% positives), so it carries no learnable signal.
        region_data = {
            'ID': [region] * n_region,
            'NBR': np.random.uniform(500 + i*50, 800 + i*50, n_region),
            'NDVI': np.random.uniform(600 + i*30, 900 + i*30, n_region),
            'aspect': np.random.uniform(0, 360, n_region),
            'elevation': np.random.randint(0 + i*200, 2000 + i*200, n_region),
            'slope': np.random.uniform(0 + i*5, 30 + i*5, n_region),
            'annual_precip': np.random.uniform(10 + i*2, 20 + i*2, n_region),
            'annual_temp': np.random.uniform(-5 + i, 5 + i, n_region),
            'b1': np.random.choice([0, 1], n_region, p=[0.7, 0.3])
        }

        # Add some unique feature patterns for each ecoregion:
        # NBR is scaled up by 1.2 for even region indices, NDVI is scaled
        # down by 0.8 for indices divisible by 3.
        if i % 2 == 0:
            region_data['NBR'] = region_data['NBR'] * 1.2
        if i % 3 == 0:
            region_data['NDVI'] = region_data['NDVI'] * 0.8

        data.append(pd.DataFrame(region_data))

    # Stack the per-region frames into one dataframe with a fresh 0..n-1 index
    df = pd.concat(data, ignore_index=True)
    print(f"Created sample data: {df.shape[0]} rows, {df.shape[1]} columns")

# Quick overview of how many rows each ecoregion (column 'ID') contributes
print("\nData Information:")
print(f"Number of ecoregions: {df['ID'].nunique()}")
print(f"Ecoregion distribution:")
print(df['ID'].value_counts())

# Data preprocessing function
def preprocess_data(df, features, scaler=None):
    """Extract features and the 'b1' label from ``df`` and standardize the features.

    Args:
        df: DataFrame containing the columns listed in ``features`` plus the
            label column ``'b1'``.
        features: List of column names to use as model inputs.
        scaler: Optional already-fitted scaler exposing ``transform``. When
            given, it is applied as-is (no re-fitting), which is what
            held-out/test data needs so that it is scaled with the training
            statistics. When ``None`` (default) a new ``StandardScaler`` is
            fitted on ``df`` — the original behavior.

    Returns:
        Tuple ``(X_scaled, y, scaler)``: the scaled feature matrix, a copy of
        the ``'b1'`` column, and the scaler that was used (newly fitted or the
        one passed in).
    """
    X = df[features].copy()
    y = df['b1'].copy()

    # Standardize
    if scaler is None:
        # Fit a new scaler on this data (training-set usage)
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X)
    else:
        # Reuse the supplied scaler without re-fitting (test-set usage)
        X_scaled = scaler.transform(X)

    return X_scaled, y, scaler

# Candidate predictors; NBR_temporal and NDVI_temporal are deliberately left out
feature_cols = [
    'NBR',
    'NDVI',
    'aspect',
    'elevation',
    'slope',
    'annual_precip',
    'annual_temp',
]

# Keep only the candidates that are actually present as dataframe columns,
# preserving the order in which they are listed above
available_features = [col for col in feature_cols if col in df.columns]
print(
    f"\nUsing {len(available_features)} available features: {available_features}"
)

class LeaveOneEcoregionOutValidator:
    """Leave-one-ecoregion-out cross-validator built around an XGBoost classifier.

    For every selected ecoregion (column ``'ID'``) a model is trained on all
    other ecoregions and evaluated on the held-out one. The binary label is
    read from column ``'b1'``.
    """

    # Column order of the summary table produced by ``analyze_results``.
    SUMMARY_COLUMNS = ['Ecoregion', 'TestSamples', 'Accuracy', 'F1Score',
                       'AUC_ROC', 'Precision', 'Recall']

    def __init__(self, model_params=None):
        """Store the keyword arguments later passed to ``xgb.XGBClassifier``.

        Args:
            model_params: Optional dict of classifier keyword arguments. When
                ``None`` a default configuration is used.
        """
        if model_params is None:
            self.model_params = {
                'n_estimators': 100,
                'max_depth': 6,
                'learning_rate': 0.1,
                'random_state': 42,
                'n_jobs': -1,
                # NOTE(review): 'use_label_encoder' is deprecated in recent
                # XGBoost releases — confirm it is still wanted for the
                # installed version.
                'use_label_encoder': False,
                'eval_metric': 'logloss'
            }
        else:
            # Copy so later changes to the caller's dict do not leak in
            self.model_params = dict(model_params)

    def validate(self, df, feature_cols, selected_regions=None,
                 min_samples=500, max_samples=5000, max_regions=5):
        """Perform leave-one-ecoregion validation.

        Args:
            df: DataFrame with an ``'ID'`` (ecoregion) column, the label
                column ``'b1'`` and all columns in ``feature_cols``.
            feature_cols: List of feature column names.
            selected_regions: Ecoregion IDs to hold out, one at a time. When
                ``None``, the first ``max_regions`` ecoregions (in
                ``value_counts`` order, i.e. largest first) whose sample count
                lies in ``[min_samples, max_samples]`` are used.
            min_samples: Lower bound (inclusive) for automatic selection.
            max_samples: Upper bound (inclusive) for automatic selection.
            max_regions: Maximum number of automatically selected ecoregions.

        Returns:
            Dict mapping each validated ecoregion to a dict with keys
            ``'metrics'``, ``'train_samples'``, ``'test_samples'``,
            ``'train_regions'``, ``'model'`` and ``'scaler'``. Empty when no
            ecoregion could be validated.
        """

        # Select ecoregions for validation
        if selected_regions is None:
            # Select ecoregions with moderate sample size
            region_counts = df['ID'].value_counts()
            in_range = region_counts.between(min_samples, max_samples)
            selected_regions = region_counts[in_range].index.tolist()[:max_regions]

        print(f"Selected {len(selected_regions)} ecoregions for validation: {selected_regions}")

        results = {}

        if not selected_regions:
            print("Warning: no ecoregion matched the selection criteria; nothing to validate.")
            return results

        for i, target_region in enumerate(selected_regions, 1):
            print(f"\n{'='*60}")
            print(f"Validation {i}/{len(selected_regions)}: Target Ecoregion {target_region}")
            print('='*60)

            # Split data: the target ecoregion is held out entirely
            train_df = df[df['ID'] != target_region].copy()
            test_df = df[df['ID'] == target_region].copy()

            print(f"Training set: {len(train_df)} samples (from {train_df['ID'].nunique()} ecoregions)")
            print(f"Test set: {len(test_df)} samples (from ecoregion {target_region})")

            # An empty split cannot be scaled or scored; skip instead of
            # failing inside the scaler/model with an obscure error.
            if train_df.empty or test_df.empty:
                print(f"Warning: skipping ecoregion {target_region} (empty training or test set)")
                continue

            # Preprocess: fit the scaler on the training ecoregions only ...
            X_train, y_train, scaler = preprocess_data(train_df, feature_cols)
            # ... and apply that same fitted scaler to the held-out ecoregion.
            # Re-fitting on the test set would standardize it with its own
            # statistics, so the model would see inputs on a different scale
            # than it was trained on.
            X_test = scaler.transform(test_df[feature_cols])
            y_test = test_df['b1']

            # Create and train model
            model = xgb.XGBClassifier(**self.model_params)
            model.fit(X_train, y_train)

            # Predict hard labels and positive-class probabilities
            y_pred = model.predict(X_test)
            y_pred_proba = model.predict_proba(X_test)[:, 1]

            # Calculate metrics
            metrics = self.calculate_metrics(y_test, y_pred, y_pred_proba)

            # Record results
            results[target_region] = {
                'metrics': metrics,
                'train_samples': len(train_df),
                'test_samples': len(test_df),
                'train_regions': train_df['ID'].nunique(),
                'model': model,
                'scaler': scaler
            }

            print(f"Test set performance:")
            print(f"  Accuracy: {metrics['accuracy']:.4f}")
            print(f"  F1 Score: {metrics['f1_score']:.4f}")
            print(f"  AUC-ROC: {metrics['auc_roc']:.4f}")
            print(f"  Precision: {metrics['precision']:.4f}")
            print(f"  Recall: {metrics['recall']:.4f}")

        return results

    def calculate_metrics(self, y_true, y_pred, y_pred_proba):
        """Calculate evaluation metrics for one held-out ecoregion.

        Args:
            y_true: True binary labels.
            y_pred: Predicted labels.
            y_pred_proba: Predicted probability of the positive class.

        Returns:
            Dict with keys ``'accuracy'``, ``'precision'``, ``'recall'``,
            ``'f1_score'`` and ``'auc_roc'``.
        """
        metrics = {
            'accuracy': accuracy_score(y_true, y_pred),
            'precision': precision_score(y_true, y_pred, zero_division=0),
            'recall': recall_score(y_true, y_pred, zero_division=0),
            'f1_score': f1_score(y_true, y_pred, zero_division=0)
        }

        # AUC is only computed when both classes occur in y_true; otherwise
        # 0.5 is stored as a placeholder (not a measured value).
        if len(np.unique(y_true)) > 1:
            metrics['auc_roc'] = roc_auc_score(y_true, y_pred_proba)
        else:
            metrics['auc_roc'] = 0.5

        return metrics

    def analyze_results(self, results):
        """Print a summary of the validation results and plot them.

        Args:
            results: Dict as returned by ``validate``.

        Returns:
            DataFrame with one row per ecoregion and the columns listed in
            ``SUMMARY_COLUMNS``. When ``results`` is empty, an empty DataFrame
            with those columns is returned and nothing is plotted.
        """
        print("\n" + "="*60)
        print("Leave-One-Ecoregion Validation Results Analysis")
        print("="*60)

        # Nothing to summarize: return a well-formed empty table instead of
        # failing with a KeyError on the missing 'Accuracy' column.
        if not results:
            print("\nNo validation results to analyze.")
            return pd.DataFrame(columns=self.SUMMARY_COLUMNS)

        # Create summary table
        summary_data = []
        for region, result in results.items():
            metrics = result['metrics']
            summary_data.append({
                'Ecoregion': region,
                'TestSamples': result['test_samples'],
                'Accuracy': metrics['accuracy'],
                'F1Score': metrics['f1_score'],
                'AUC_ROC': metrics['auc_roc'],
                'Precision': metrics['precision'],
                'Recall': metrics['recall']
            })

        summary_df = pd.DataFrame(summary_data, columns=self.SUMMARY_COLUMNS)

        print("\nLeave-One-Out Validation Performance by Ecoregion:")
        print(summary_df.to_string(index=False))

        # Statistical analysis (std is the sample standard deviation across
        # ecoregions; it is NaN when only one ecoregion was validated)
        print(f"\nStatistical Analysis:")
        print(f"Mean Accuracy: {summary_df['Accuracy'].mean():.4f} (±{summary_df['Accuracy'].std():.4f})")
        print(f"Mean F1 Score: {summary_df['F1Score'].mean():.4f} (±{summary_df['F1Score'].std():.4f})")
        print(f"Mean AUC-ROC: {summary_df['AUC_ROC'].mean():.4f} (±{summary_df['AUC_ROC'].std():.4f})")

        # Identify worst and best performing ecoregions
        worst_region = summary_df.loc[summary_df['Accuracy'].idxmin()]
        best_region = summary_df.loc[summary_df['Accuracy'].idxmax()]

        print(f"\nCross-Ecoregion Transferability Analysis:")
        print(f"Best performing ecoregion: {best_region['Ecoregion']} (Accuracy: {best_region['Accuracy']:.4f})")
        print(f"Worst performing ecoregion: {worst_region['Ecoregion']} (Accuracy: {worst_region['Accuracy']:.4f})")
        print(f"Performance difference: {best_region['Accuracy'] - worst_region['Accuracy']:.4f}")

        # Comparison with within-ecoregion validation (estimated).
        # Only a note is printed here; no within-ecoregion numbers are computed.
        print(f"\nComparison with Within-Ecoregion Validation (estimated):")
        print("(Note: Within-ecoregion validation typically yields higher performance)")

        # Visualization analysis
        self.plot_analysis(summary_df, results)

        return summary_df

    @staticmethod
    def _plot_metric_bars(ax, summary_df, column, label):
        """Draw one bar per ecoregion for ``column`` plus a dashed mean line on ``ax``."""
        positions = range(len(summary_df))
        mean_value = summary_df[column].mean()

        ax.bar(positions, summary_df[column])
        ax.axhline(y=mean_value, color='r', linestyle='--',
                   label=f'Mean: {mean_value:.3f}')
        ax.set_xlabel('Ecoregion')
        ax.set_ylabel(label)
        ax.set_title(f'Leave-One-Out Validation {label} by Ecoregion')
        ax.set_xticks(positions)
        ax.set_xticklabels(summary_df['Ecoregion'], rotation=45, ha='right')
        ax.legend()
        ax.grid(True, alpha=0.3)

    def plot_analysis(self, summary_df, results):
        """Plot a 2x3 overview figure of the validation results.

        Args:
            summary_df: Summary table as built by ``analyze_results``.
            results: Raw results dict (accepted for interface compatibility;
                not used by the plots).
        """
        fig, axes = plt.subplots(2, 3, figsize=(18, 12))

        # 1-3. Per-ecoregion bar charts for accuracy, F1 and AUC-ROC
        self._plot_metric_bars(axes[0, 0], summary_df, 'Accuracy', 'Accuracy')
        self._plot_metric_bars(axes[0, 1], summary_df, 'F1Score', 'F1 Score')
        self._plot_metric_bars(axes[0, 2], summary_df, 'AUC_ROC', 'AUC-ROC')

        # 4. Sample size vs performance relationship
        axes[1, 0].scatter(summary_df['TestSamples'], summary_df['Accuracy'], s=100, alpha=0.7)
        for _, row in summary_df.iterrows():
            axes[1, 0].text(row['TestSamples'], row['Accuracy'] + 0.01, row['Ecoregion'],
                           ha='center', fontsize=9)

        # Add regression line. A linear fit needs at least two distinct x
        # values; with identical test-set sizes the fit is rank-deficient and
        # the line would be meaningless, so it is skipped.
        if summary_df['TestSamples'].nunique() > 1:
            z = np.polyfit(summary_df['TestSamples'], summary_df['Accuracy'], 1)
            p = np.poly1d(z)
            x_range = np.linspace(summary_df['TestSamples'].min(), summary_df['TestSamples'].max(), 100)
            axes[1, 0].plot(x_range, p(x_range), 'r--', alpha=0.7)

        axes[1, 0].set_xlabel('Test Samples')
        axes[1, 0].set_ylabel('Accuracy')
        axes[1, 0].set_title('Sample Size vs Performance Relationship')
        axes[1, 0].grid(True, alpha=0.3)

        # 5. Performance metrics correlation heatmap (correlation is taken
        # across ecoregions, one observation per ecoregion)
        corr_matrix = summary_df[['Accuracy', 'F1Score', 'AUC_ROC', 'Precision', 'Recall']].corr()
        n_metrics = len(corr_matrix.columns)
        im = axes[1, 1].imshow(corr_matrix, cmap='coolwarm', vmin=-1, vmax=1)
        axes[1, 1].set_xticks(range(n_metrics))
        axes[1, 1].set_yticks(range(n_metrics))
        axes[1, 1].set_xticklabels(corr_matrix.columns, rotation=45, ha='right')
        axes[1, 1].set_yticklabels(corr_matrix.columns)
        axes[1, 1].set_title('Performance Metrics Correlation')

        # Add values; white text on strongly colored cells for readability
        for i in range(n_metrics):
            for j in range(n_metrics):
                value = corr_matrix.iloc[i, j]
                axes[1, 1].text(j, i, f'{value:.2f}',
                               ha='center', va='center',
                               color='white' if abs(value) > 0.5 else 'black')

        plt.colorbar(im, ax=axes[1, 1], fraction=0.046, pad=0.04)

        # 6. Cross-ecoregion transferability summary
        axes[1, 2].axis('off')
        summary_text = f"""
Cross-Ecoregion Transferability Analysis Summary:

Ecoregions analyzed: {len(summary_df)}
Mean Accuracy: {summary_df['Accuracy'].mean():.3f}
Accuracy Standard Deviation: {summary_df['Accuracy'].std():.3f}
Performance Range: {(summary_df['Accuracy'].max() - summary_df['Accuracy'].min()):.3f}

Assessment Conclusions:
"""

        # Accuracy std above 0.1 across ecoregions is treated as limited
        # transferability
        if summary_df['Accuracy'].std() > 0.1:
            summary_text += """
• Limited cross-ecoregion transferability
• Some ecoregions have unique characteristics that are difficult to transfer
• Recommendation: Use ecoregion-stratified modeling strategy
"""
        else:
            summary_text += """
• Good cross-ecoregion transferability
• Small performance differences across ecoregions
• Ecoregion-stratified modeling still valuable but not essential
"""

        axes[1, 2].text(0.1, 0.5, summary_text, transform=axes[1, 2].transAxes,
                       fontsize=11, verticalalignment='center',
                       bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.8))

        plt.suptitle('Leave-One-Ecoregion Cross-Validation Comprehensive Analysis', fontsize=16, fontweight='bold')
        plt.tight_layout()
        plt.show()

# Run the leave-one-ecoregion validation on the loaded dataframe
print("\nStarting leave-one-ecoregion cross-validation...")
validator = LeaveOneEcoregionOutValidator()
results = validator.validate(df, available_features)

# Summarize (and plot) the per-ecoregion results
summary_df = validator.analyze_results(results)

# Final report: headline finding based on the spread of accuracy across ecoregions
print("\n" + "=" * 60)
print("Leave-One-Ecoregion Validation Final Report")
print("=" * 60)

accuracy_std = summary_df['Accuracy'].std()
limited_transferability = accuracy_std > 0.1

print("\nKey Findings:")
if not limited_transferability:
    print("✓ Good cross-ecoregion transferability:")
    print(f"  Accuracy standard deviation across ecoregions: {accuracy_std:.3f}")
    print("  Model shows consistent performance across different ecoregions")
else:
    accuracy_range = summary_df['Accuracy'].max() - summary_df['Accuracy'].min()
    print("⚠ Limited cross-ecoregion transferability:")
    print(f"  Accuracy standard deviation across ecoregions: {accuracy_std:.3f}")
    print(f"  Maximum performance difference: {accuracy_range:.3f}")
    print("  This supports the need for ecoregion-stratified modeling")

# Fixed list of follow-up suggestions for the manuscript
print("\nManuscript Revision Suggestions:")
for suggestion in (
    "1. Add experimental design of leave-one-ecoregion validation in Methods section",
    "2. Report cross-ecoregion validation performance metrics in Results section",
    "3. Analyze limitations of cross-ecoregion transferability in Discussion section",
    "4. Justify the necessity of ecoregion-stratified modeling",
):
    print(suggestion)

# Persist the results: per-ecoregion details as JSON, summary table as CSV
import json
import os

os.makedirs('leave_one_out_results', exist_ok=True)

# The fitted model and scaler are not JSON-serializable, so only the plain
# fields of each result are written out
save_results = {}
for region, result in results.items():
    save_results[region] = {
        key: result[key]
        for key in ('metrics', 'train_samples', 'test_samples', 'train_regions')
    }

with open('leave_one_out_results/leave_one_out_results.json', 'w') as f:
    json.dump(save_results, f, indent=2)

summary_df.to_csv('leave_one_out_results/leave_one_out_summary.csv', index=False)

print("\n✓ Leave-one-out validation results saved:")
for saved_path in (
    "leave_one_out_results/leave_one_out_results.json",
    "leave_one_out_results/leave_one_out_summary.csv",
):
    print(f"  - {saved_path}")

# Closing banner
print("\n" + "=" * 60)
print("Leave-One-Ecoregion Cross-Validation Completed!")
print("=" * 60)