In [None]:
# ==================== Part 2: Spatial Cross-Validation ====================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')

# Install necessary libraries
!pip install -q scikit-learn xgboost

# ---------------------------------------------------------------------------
# Data loading and preprocessing.
#
# Binds the module-level names used further down the file: `df`, `X`, `y`,
# `coords`, `scaler`, and `X_scaled`.
# ---------------------------------------------------------------------------
print("Spatial Cross-Validation Analysis")
print("="*60)

# Load real data
print("Loading real data...")
try:
    # Only importable inside Google Colab; anywhere else this raises and the
    # except-branch below builds synthetic data instead.
    from google.colab import drive
    drive.mount('/content/drive')
    # NOTE(review): the file name contains a ")" before ".csv" — confirm it
    # matches the actual file on Drive; a mismatch silently triggers the
    # synthetic-data fallback.
    data_path = '/content/drive/MyDrive/merged_data_by_year).csv'
    df = pd.read_csv(data_path)
    print(f"Data loaded successfully: {df.shape[0]} rows, {df.shape[1]} columns")

    # Handle missing values: DELETE rows with missing values
    # dropna() is called without a subset, so a NaN in ANY column (including
    # columns that are not used as features) removes the row.
    print("\nHandling missing values...")
    original_shape = df.shape
    df = df.dropna()
    rows_removed = original_shape[0] - df.shape[0]
    print(f"  Rows with missing values removed: {rows_removed}")
    print(f"  New data shape: {df.shape[0]} rows, {df.shape[1]} columns")

except Exception as e:
    # Any failure above (import, mount, read, dropna) lands here. Everything
    # computed afterwards is then based on random demo data, not real data.
    print(f"Error loading real data: {e}")
    # Use sample data for demonstration
    print("Using sample data for demonstration...")
    np.random.seed(42)
    n_samples = 1000

    # The draws below consume the seeded global RNG in dictionary order, so
    # reordering the entries changes the generated values.
    data = {
        'lat': np.random.uniform(30, 50, n_samples),
        'lon': np.random.uniform(100, 130, n_samples),
        'NBR': np.random.uniform(500, 800, n_samples),
        'NDVI': np.random.uniform(600, 900, n_samples),
        'aspect': np.random.uniform(0, 360, n_samples),
        'elevation': np.random.randint(0, 2000, n_samples),
        'slope': np.random.uniform(0, 30, n_samples),
        'annual_precip': np.random.uniform(10, 20, n_samples),
        'annual_temp': np.random.uniform(-5, 5, n_samples),
        # Binary target, roughly 70% zeros / 30% ones.
        'b1': np.random.choice([0, 1], n_samples, p=[0.7, 0.3])
    }
    df = pd.DataFrame(data)

# Data preprocessing
print("\nData preprocessing...")

# Select features (excluding NBR_temporal and NDVI_temporal)
feature_cols = ['NBR', 'NDVI', 'aspect', 'elevation', 'slope', 'annual_precip', 'annual_temp']

# Ensure features exist in dataframe
# Requested features that are missing from `df` are skipped without an error;
# the print below shows which ones are actually used.
available_features = [f for f in feature_cols if f in df.columns]
print(f"Using {len(available_features)} available features: {available_features}")

X = df[available_features].values
# 'b1' is the target column; the class-distribution print below labels
# 0 as "Stable" and 1 as "Disturbance". A missing 'b1' raises KeyError here.
y = df['b1'].values

# Check for coordinates
if 'lat' in df.columns and 'lon' in df.columns:
    # Column order is (lat, lon): column 0 is latitude, column 1 is longitude.
    coords = df[['lat', 'lon']].values
    print(f"Using spatial coordinates for cross-validation")
else:
    # Without real coordinates the "spatial" folds are built from random
    # points drawn from a normal distribution, so they carry no real
    # geographic meaning.
    print("Warning: No spatial coordinates found, creating random coordinates for demonstration")
    np.random.seed(42)
    coords = np.random.randn(len(df), 2)
    coords[:, 0] = coords[:, 0] * 10 + 45  # Simulate latitude
    coords[:, 1] = coords[:, 1] * 10 + 115  # Simulate longitude

# Standardize features
# NOTE(review): the scaler is fit on the full dataset here, i.e. before any
# train/test fold is formed, so held-out samples contribute to the scaling
# statistics. Confirm this is acceptable for the model being evaluated.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

print(f"\nFeature shape: {X_scaled.shape}")
print(f"Label shape: {y.shape}")
print(f"Class distribution: Stable={sum(y==0)}, Disturbance={sum(y==1)}")

class SpatialCrossValidator:
    """Cross-validator that holds out spatially grouped samples.

    Folds are built from sample coordinates (column 0 = latitude,
    column 1 = longitude) so that each test fold is made of spatially
    grouped points rather than a random subset.

    Parameters
    ----------
    n_splits : int
        Number of folds. Also the number of bins per axis of the grid.
    spatial_strategy : str
        Stored on the instance for callers; fold creation does not
        currently branch on it.
    """

    # Marker for samples the grid pass did not place in any fold. It must
    # not collide with a real fold ID (0 .. n_splits - 1).
    _UNASSIGNED = -1

    def __init__(self, n_splits=5, spatial_strategy='blocks'):
        self.n_splits = n_splits
        self.spatial_strategy = spatial_strategy

    def _grid_fold_assignments(self, coords):
        """Assign fold IDs from the regular grid; leave the rest unassigned.

        The bounding box of ``coords`` is cut into ``n_splits`` bins per
        axis. Only blocks whose flat index ``lat_bin * n_splits + lon_bin``
        is below ``n_splits`` (i.e. the lowest latitude band) map directly
        to a fold; every other sample is marked ``_UNASSIGNED``.

        Returns an int array of length ``len(coords)``.
        """
        n = self.n_splits
        folds = np.full(len(coords), self._UNASSIGNED, dtype=int)
        if len(coords) == 0:
            return folds

        lat = coords[:, 0]
        lon = coords[:, 1]
        lat_edges = np.linspace(lat.min(), lat.max(), n + 1)
        lon_edges = np.linspace(lon.min(), lon.max(), n + 1)

        # Searching only the interior edges yields bin indices 0 .. n-1 and
        # puts values equal to the maximum into the last bin, instead of
        # dropping them as a strict "< upper edge" test would.
        lat_bin = np.searchsorted(lat_edges[1:-1], lat, side='right')
        lon_bin = np.searchsorted(lon_edges[1:-1], lon, side='right')

        block_idx = lat_bin * n + lon_bin
        maps_to_fold = block_idx < n
        folds[maps_to_fold] = block_idx[maps_to_fold]
        return folds

    def create_spatial_folds(self, coords):
        """Create spatial blocks for cross-validation.

        Samples are first assigned through the grid (see
        ``_grid_fold_assignments``). Samples the grid leaves unassigned are
        clustered on their coordinates with K-means, and the cluster label
        is used as the fold ID.

        Parameters
        ----------
        coords : array-like of shape (n_samples, 2)
            Latitude in column 0, longitude in column 1.

        Returns
        -------
        numpy.ndarray
            Integer fold ID (0 .. n_splits - 1) for every sample.
        """
        coords = np.asarray(coords)
        fold_assignments = self._grid_fold_assignments(coords)

        # For unassigned samples, use K-means clustering. A dedicated
        # sentinel is used so that samples legitimately placed in fold 0 by
        # the grid are not mistaken for unassigned ones.
        unassigned = fold_assignments == self._UNASSIGNED
        n_unassigned = int(unassigned.sum())
        if n_unassigned > 0:
            # Imported lazily: only needed when the grid leaves samples over.
            from sklearn.cluster import KMeans

            # K-means cannot form more clusters than there are samples.
            n_clusters = min(self.n_splits, n_unassigned)
            kmeans = KMeans(n_clusters=n_clusters, random_state=42)
            fold_assignments[unassigned] = kmeans.fit_predict(coords[unassigned])

        return fold_assignments

    def split(self, X, y, coords):
        """Generate spatial cross-validation splits.

        Yields ``(train_idx, test_idx)`` index arrays, one pair per fold.
        A fold whose train or test side would be empty is skipped, so
        fewer than ``n_splits`` pairs may be produced. ``X`` and ``y`` are
        accepted for signature symmetry and are not read.
        """
        fold_indices = self.create_spatial_folds(coords)

        for fold in range(self.n_splits):
            train_idx = np.where(fold_indices != fold)[0]
            test_idx = np.where(fold_indices == fold)[0]

            # Ensure both train and test sets have samples
            if len(train_idx) > 0 and len(test_idx) > 0:
                yield train_idx, test_idx

    def evaluate(self, model, X, y, coords):
        """Evaluate model performance with spatial cross-validation.

        The same ``model`` object is re-fit on every fold.

        Returns
        -------
        (spatial_scores, fold_details)
            ``spatial_scores`` is a list of per-fold dicts with keys
            ``fold``, ``accuracy``, ``f1_score``, ``auc_roc``,
            ``train_samples`` and ``test_samples``. ``fold_details`` is a
            list of per-fold dicts holding the index arrays, the true
            labels, the predictions and the positive-class probabilities.
            ``fold`` numbers count the folds actually yielded by ``split``.
        """
        spatial_scores = []
        fold_details = []

        print(f"\nStarting {self.n_splits}-fold spatial cross-validation...")

        for fold, (train_idx, test_idx) in enumerate(self.split(X, y, coords)):
            print(f"\nFold {fold+1}/{self.n_splits}:")
            print(f"  Training samples: {len(train_idx)}, Test samples: {len(test_idx)}")

            # Split data
            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]

            # Train model
            model.fit(X_train, y_train)

            # Predict
            y_pred = model.predict(X_test)
            y_pred_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, 'predict_proba') else None

            # Calculate metrics
            accuracy = accuracy_score(y_test, y_pred)
            f1 = f1_score(y_test, y_pred, zero_division=0)

            # AUC is undefined when the test fold holds a single class or the
            # model exposes no probabilities; 0.5 is recorded as a placeholder
            # in that case, not as a measured value.
            if y_pred_proba is not None and len(np.unique(y_test)) > 1:
                auc = roc_auc_score(y_test, y_pred_proba)
            else:
                auc = 0.5

            spatial_scores.append({
                'fold': fold + 1,
                'accuracy': accuracy,
                'f1_score': f1,
                'auc_roc': auc,
                'train_samples': len(train_idx),
                'test_samples': len(test_idx)
            })

            fold_details.append({
                'train_idx': train_idx,
                'test_idx': test_idx,
                'y_test': y_test,
                'y_pred': y_pred,
                'y_pred_proba': y_pred_proba
            })

            print(f"  Accuracy: {accuracy:.4f}, F1 Score: {f1:.4f}, AUC-ROC: {auc:.4f}")

        return spatial_scores, fold_details

    def compare_with_random_cv(self, model, X, y, coords):
        """Compare spatial CV with random CV.

        Runs ``evaluate`` (spatial folds), then a shuffled stratified
        K-fold with the same ``n_splits``, prints both summaries, draws the
        comparison figure and returns the per-fold scores together with
        the mean accuracy and mean F1 of each scheme.
        """
        from sklearn.model_selection import StratifiedKFold

        print("\n" + "="*60)
        print("Spatial CV vs Random CV Comparison")
        print("="*60)

        # Spatial CV
        print("\n1. Spatial Cross-Validation:")
        spatial_scores, _ = self.evaluate(model, X, y, coords)

        # Random CV
        print("\n2. Random Cross-Validation:")
        random_scores = []
        random_cv = StratifiedKFold(n_splits=self.n_splits, shuffle=True, random_state=42)

        for fold, (train_idx, test_idx) in enumerate(random_cv.split(X, y)):
            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]

            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)

            accuracy = accuracy_score(y_test, y_pred)
            f1 = f1_score(y_test, y_pred, zero_division=0)

            random_scores.append({
                'fold': fold + 1,
                'accuracy': accuracy,
                'f1_score': f1,
                'train_samples': len(train_idx),
                'test_samples': len(test_idx)
            })

            print(f"  Fold {fold+1}: Accuracy: {accuracy:.4f}, F1 Score: {f1:.4f}")

        # Compare results
        print("\n" + "="*60)
        print("Performance Comparison")
        print("="*60)

        spatial_acc = [s['accuracy'] for s in spatial_scores]
        spatial_f1 = [s['f1_score'] for s in spatial_scores]
        random_acc = [s['accuracy'] for s in random_scores]
        random_f1 = [s['f1_score'] for s in random_scores]

        spatial_acc_mean, spatial_acc_std = np.mean(spatial_acc), np.std(spatial_acc)
        spatial_f1_mean, spatial_f1_std = np.mean(spatial_f1), np.std(spatial_f1)
        random_acc_mean, random_acc_std = np.mean(random_acc), np.std(random_acc)
        random_f1_mean, random_f1_std = np.mean(random_f1), np.std(random_f1)

        print(f"\nSpatial CV:")
        print(f"  Mean Accuracy: {spatial_acc_mean:.4f} (±{spatial_acc_std:.4f})")
        print(f"  Mean F1 Score: {spatial_f1_mean:.4f} (±{spatial_f1_std:.4f})")

        print(f"\nRandom CV:")
        print(f"  Mean Accuracy: {random_acc_mean:.4f} (±{random_acc_std:.4f})")
        print(f"  Mean F1 Score: {random_f1_mean:.4f} (±{random_f1_std:.4f})")

        print(f"\nPerformance Difference:")
        print(f"  Accuracy reduction: {(random_acc_mean - spatial_acc_mean):.4f}")
        print(f"  F1 Score reduction: {(random_f1_mean - spatial_f1_mean):.4f}")

        # Visualize comparison
        self.plot_comparison(spatial_scores, random_scores)

        return {
            'spatial_scores': spatial_scores,
            'random_scores': random_scores,
            'spatial_mean_acc': spatial_acc_mean,
            'spatial_mean_f1': spatial_f1_mean,
            'random_mean_acc': random_acc_mean,
            'random_mean_f1': random_f1_mean
        }

    @staticmethod
    def _plot_fold_bars(ax, spatial_values, random_values, ylabel, title, width=0.35):
        """Draw side-by-side per-fold bars for one metric on ``ax``."""
        # Each series gets its own x positions: spatial CV can yield fewer
        # folds than random CV when an empty fold is skipped.
        spatial_x = np.arange(len(spatial_values))
        random_x = np.arange(len(random_values))
        ticks = np.arange(max(len(spatial_values), len(random_values)))

        ax.bar(spatial_x - width/2, spatial_values, width, label='Spatial CV', alpha=0.8)
        ax.bar(random_x + width/2, random_values, width, label='Random CV', alpha=0.8)
        ax.set_xlabel('Fold')
        ax.set_ylabel(ylabel)
        ax.set_title(title)
        ax.set_xticks(ticks)
        ax.set_xticklabels([f'Fold {i+1}' for i in ticks])
        ax.legend()
        ax.grid(True, alpha=0.3)

    def plot_comparison(self, spatial_scores, random_scores):
        """Plot comparison visualization.

        Draws a 2x2 figure: per-fold accuracy, per-fold F1, mean of both
        metrics, and the difference (random CV minus spatial CV) per metric.
        """
        fig, axes = plt.subplots(2, 2, figsize=(14, 10))
        width = 0.35

        spatial_acc = [s['accuracy'] for s in spatial_scores]
        random_acc = [s['accuracy'] for s in random_scores]
        spatial_f1 = [s['f1_score'] for s in spatial_scores]
        random_f1 = [s['f1_score'] for s in random_scores]

        # 1. Accuracy comparison
        self._plot_fold_bars(axes[0, 0], spatial_acc, random_acc,
                             'Accuracy', 'Accuracy Comparison by Fold', width)

        # 2. F1 Score comparison
        self._plot_fold_bars(axes[0, 1], spatial_f1, random_f1,
                             'F1 Score', 'F1 Score Comparison by Fold', width)

        # 3. Mean comparison
        metrics = ['Accuracy', 'F1 Score']
        spatial_means = [np.mean(spatial_acc), np.mean(spatial_f1)]
        random_means = [np.mean(random_acc), np.mean(random_f1)]

        x2 = np.arange(len(metrics))
        axes[1, 0].bar(x2 - width/2, spatial_means, width, label='Spatial CV', alpha=0.8)
        axes[1, 0].bar(x2 + width/2, random_means, width, label='Random CV', alpha=0.8)
        axes[1, 0].set_xlabel('Metric')
        axes[1, 0].set_ylabel('Mean Value')
        axes[1, 0].set_title('Mean Performance Comparison')
        axes[1, 0].set_xticks(x2)
        axes[1, 0].set_xticklabels(metrics)
        axes[1, 0].legend()
        axes[1, 0].grid(True, alpha=0.3)

        # 4. Performance reduction analysis
        drops = [r - s for r, s in zip(random_means, spatial_means)]
        # Red marks a metric where random CV scored higher than spatial CV.
        colors = ['red' if d > 0 else 'green' for d in drops]

        axes[1, 1].bar(metrics, drops, color=colors, alpha=0.8)
        axes[1, 1].axhline(y=0, color='black', linestyle='-', linewidth=0.5)
        axes[1, 1].set_xlabel('Metric')
        axes[1, 1].set_ylabel('Performance Difference (Random CV - Spatial CV)')
        axes[1, 1].set_title('Performance Overestimation due to Spatial Autocorrelation')
        axes[1, 1].grid(True, alpha=0.3)

        # Add value labels
        for i, v in enumerate(drops):
            axes[1, 1].text(i, v + (0.01 if v >= 0 else -0.02), f'{v:.4f}',
                           ha='center', va='bottom' if v >= 0 else 'top', fontweight='bold')

        plt.suptitle('Spatial Cross-Validation vs Random Cross-Validation', fontsize=16, fontweight='bold')
        plt.tight_layout()
        plt.show()

# ---------------------------------------------------------------------------
# Model creation, spatial-vs-random CV comparison, distance summary, report
# and result export. Relies on `X_scaled`, `y`, `coords` and
# `SpatialCrossValidator` defined earlier in the file.
# ---------------------------------------------------------------------------

# Create model
print("\nCreating XGBoost model...")
# NOTE(review): `use_label_encoder` is reported as deprecated by recent
# XGBoost releases — confirm against the installed version before removing.
model = xgb.XGBClassifier(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    random_state=42,
    n_jobs=-1,
    use_label_encoder=False,
    eval_metric='logloss'
)

# Execute spatial cross-validation
spatial_cv = SpatialCrossValidator(n_splits=5)

# Compare spatial CV and random CV
results = spatial_cv.compare_with_random_cv(model, X_scaled, y, coords)

# Analyze spatial autocorrelation
print("\n" + "="*60)
print("Spatial Autocorrelation Analysis")
print("="*60)

# Calculate spatial distance matrix (simplified version for demonstration).
# Only pairwise-distance statistics are reported here; no autocorrelation
# statistic is computed.
print("Calculating spatial distances between samples...")

# Select at most 100 samples to calculate (to avoid excessive computation).
# A dedicated seeded generator keeps the subsample, and therefore the printed
# statistics, reproducible regardless of the global RNG state.
n_samples = min(100, len(coords))
rng = np.random.default_rng(42)
sample_idx = rng.choice(len(coords), n_samples, replace=False)
sample_coords = coords[sample_idx]

# Calculate distance matrix
from scipy.spatial.distance import pdist, squareform
# Condensed vector: one entry per unique pair (Euclidean on the raw
# coordinate values). Statistics are taken from this vector so the zero
# diagonal and the mirrored half of the square matrix do not bias the mean.
pairwise_distances = pdist(sample_coords)
distances = squareform(pairwise_distances)

# Summarize distances
print(f"Calculated distance matrix for {n_samples} samples")
positive_distances = pairwise_distances[pairwise_distances > 0]
if positive_distances.size > 0:
    print(f"Minimum distance: {positive_distances.min():.4f}")
    print(f"Maximum distance: {pairwise_distances.max():.4f}")
    print(f"Mean distance: {pairwise_distances.mean():.4f}")
else:
    # Fewer than two samples, or all sampled points share one location.
    print("Not enough distinct sample locations to summarize distances")

# Generate report
print("\n" + "="*60)
print("Spatial Cross-Validation Report")
print("="*60)

# Gap between random-CV and spatial-CV mean accuracy; a gap above the
# threshold is reported as significant.
ACC_GAP_THRESHOLD = 0.05
acc_gap = results['random_mean_acc'] - results['spatial_mean_acc']

print(f"\nKey Findings:")
if acc_gap > ACC_GAP_THRESHOLD:
    print("⚠ Significant spatial autocorrelation detected:")
    print(f"  Random CV accuracy is {acc_gap:.4f} higher than spatial CV")
    print("  Recommendation: Report spatial CV results in the manuscript to avoid performance overestimation")
else:
    print("✓ Spatial autocorrelation not significant:")
    print(f"  Difference between random CV and spatial CV accuracy is only {acc_gap:.4f}")
    print("  Model shows good spatial generalization capability")

print(f"\nManuscript Revision Suggestions:")
print("1. Add details of spatial cross-validation implementation in Methods section")
print("2. Report comparison results between spatial CV and random CV in Results section")
print("3. Analyze the impact of spatial autocorrelation on model performance in Discussion section")

# Save results
import json
import os
os.makedirs('spatial_cv_results', exist_ok=True)

with open('spatial_cv_results/spatial_cv_results.json', 'w', encoding='utf-8') as f:
    json.dump(results, f, indent=2)
print("\n✓ Spatial cross-validation results saved as: spatial_cv_results/spatial_cv_results.json")

print("\n" + "="*60)
print("Spatial Cross-Validation Completed!")
print("="*60)