In [None]:
# -*- coding: utf-8 -*-
"""
Title: Statistical Analysis of Data Splitting Strategies in Magnitude Estimation

This notebook conducts a comprehensive analysis of data splitting strategies by:
1. Running 50 experiments with random splits (each split is trained with
   5 model seeds and the median-MAE run is reported)
2. Comparing with chronological splitting
3. Analyzing statistical significance of results
4. Investigating evidence of data leakage

Dependencies:
- torch, torchinfo, numpy, scikit-learn, json
- matplotlib, seaborn (for visualization)
- tqdm (for progress tracking)
"""

# Part 1: Setup and Imports

In [None]:
#------------------------------------------------------------------------------
# Part 1: Setup and Imports
#------------------------------------------------------------------------------

# Import required libraries
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import matplotlib.pyplot as plt
import json
import os
from torchinfo import summary
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import time
import random
import seaborn as sns

# Configure environment: run on the GPU when CUDA is available, otherwise on the CPU.
# `device` is a notebook-level global that the training and evaluation functions
# read directly when moving the model and batches.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

# Record start time. The main-execution cell subtracts this value to report the
# total execution time, so the figure covers everything from this cell onwards.
start_time = time.time()

# Part 2: Dataset and Model Classes

In [None]:
#------------------------------------------------------------------------------
# Part 2: Dataset and Model Classes
#------------------------------------------------------------------------------

class EarthquakeDataset(Dataset):
    """Map-style dataset that pairs each input sample with its label.

    Args:
        data: Indexable collection of input samples.
        labels: Indexable collection of targets, aligned with ``data`` by
            position. Its length defines the length of the dataset.
    """

    def __init__(self, data, labels):
        self.data, self.labels = data, labels

    def __len__(self):
        # The label collection is the reference for the dataset size.
        n_samples = len(self.labels)
        return n_samples

    def __getitem__(self, idx):
        sample = self.data[idx]
        target = self.labels[idx]
        return sample, target

class EarthquakeModel(nn.Module):
    """Original MagNet architecture for comparison purposes.

    Two convolution -> dropout -> max-pool stages feed a bidirectional LSTM;
    the LSTM output at the last time step is mapped by a linear layer to two
    values per sample.
    """

    def __init__(self):
        super().__init__()
        # Keep the creation order stable: the initial weights of each layer are
        # drawn from the global RNG in the order the layers are constructed.
        self.conv1 = nn.Conv1d(3, 64, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(64, 32, kernel_size=3, padding=1)
        self.maxpool = nn.MaxPool1d(4, padding=1)
        self.dropout = nn.Dropout(0.2)
        self.lstm = nn.LSTM(32, 100, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(200, 2)  # 200 = 2 directions x 100 hidden units

    def forward(self, x):
        """Map an input of shape (batch, steps, 3) to an output of shape (batch, 2)."""
        # Conv1d expects channels on dim 1, so swap the last two dims on the way in.
        features = x.transpose(1, 2)
        for conv in (self.conv1, self.conv2):
            features = self.maxpool(self.dropout(conv(features)))
        # Swap back so the batch_first LSTM sees (batch, steps, channels).
        sequence, _ = self.lstm(features.transpose(1, 2))
        last_step = sequence[:, -1, :]
        return self.fc(last_step)

# Part 3: Training Components

In [None]:
#------------------------------------------------------------------------------
# Part 3: Training Components
#------------------------------------------------------------------------------

class EarlyStopping:
    """Early stopping to prevent overfitting.

    Call the instance once per epoch with the validation loss and the model.
    Whenever the loss improves, the model's ``state_dict`` is written to
    ``best_model_Run_{run_id}_test_data_seed_{test_seed}_model_seed_{model_seed}.pth``
    in the current working directory. After ``patience`` consecutive calls
    without improvement, ``early_stop`` is set to True.

    Args:
        patience: Number of consecutive non-improving calls tolerated.
        verbose: If True, print a message each time a checkpoint is saved.
        delta: Minimum decrease of the validation loss that counts as an
            improvement.
        run_id, test_seed, model_seed: Identifiers embedded in the checkpoint
            file name.
    """
    def __init__(self, patience=7, verbose=False, delta=0, run_id=None, 
                 test_seed=None, model_seed=None):
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.val_loss_min = float('inf')
        self.delta = delta
        self.run_id = run_id
        self.test_seed = test_seed
        self.model_seed = model_seed

    def __call__(self, val_loss, model):
        """Record one epoch's validation loss and checkpoint on improvement."""
        score = -val_loss
        # A NaN/inf loss must never count as an improvement. Every comparison
        # against NaN is False, so without this guard a diverged epoch would
        # fall through to the "improved" branch, overwrite the best checkpoint
        # with diverged weights and reset the patience counter.
        loss_is_finite = bool(np.isfinite(float(val_loss)))
        if self.best_score is None:
            # First call: always checkpoint so that a file exists to load later.
            self.best_score = score
            self.save_checkpoint(val_loss, model)
        elif not loss_is_finite or score < self.best_score + self.delta:
            self.counter += 1
            print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
            self.counter = 0

    def save_checkpoint(self, val_loss, model):
        """Write the model's state_dict to disk and remember its validation loss."""
        if self.verbose:
            print(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f})')
        model_filename = f'best_model_Run_{self.run_id}_test_data_seed_{self.test_seed}_model_seed_{self.model_seed}.pth'
        torch.save(model.state_dict(), model_filename)
        self.val_loss_min = val_loss

def custom_loss(y_pred, y_true):
    """Loss that weighs the squared prediction error by a predicted uncertainty term.

    Column 0 of ``y_pred`` is the point estimate ``y_hat`` and column 1 is the
    uncertainty term ``s``. The per-sample loss is
    ``0.5 * exp(-s) * (y_true - y_hat)**2 + 0.5 * s``; the mean over all
    samples is returned.
    """
    estimate, s = y_pred[:, 0], y_pred[:, 1]
    squared_error = (y_true - estimate) ** 2
    # The first term shrinks the error where s is large; the second penalises large s.
    per_sample = 0.5 * torch.exp(-s) * squared_error + 0.5 * s
    return per_sample.mean()

# Part 4: Training and Evaluation Functions

In [None]:
#------------------------------------------------------------------------------
# Part 4: Training and Evaluation Functions
#------------------------------------------------------------------------------

def train_model(model, train_loader, val_loader, num_epochs=300, patience=5, 
                run_id=None, test_seed=None, model_seed=None):
    """Train the model with early stopping and learning rate scheduling.

    Uses Adam (lr=0.001), gradient-norm clipping at 1.0, a ReduceLROnPlateau
    scheduler driven by the validation loss, and ``EarlyStopping``, which
    writes the best weights to disk. The model itself is left with the weights
    of the last epoch that ran.

    Args:
        model: Network to train; expected to already be on ``device``.
        train_loader: Iterable of ``(data, target)`` training batches.
        val_loader: Iterable of ``(data, target)`` validation batches.
        num_epochs: Maximum number of epochs.
        patience: Early-stopping patience, in epochs.
        run_id, test_seed, model_seed: Forwarded to ``EarlyStopping`` to name
            the checkpoint file.

    Returns:
        Tuple ``(train_losses, val_losses)`` with one mean batch loss per
        completed epoch.
    """
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    # `verbose` is deliberately not passed: it is deprecated since PyTorch 2.2
    # and may be rejected by newer releases. LR reductions are logged below.
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='min', factor=np.sqrt(0.1),
        cooldown=0, patience=4, min_lr=0.5e-6
    )

    early_stopping = EarlyStopping(
        patience=patience, verbose=True,
        run_id=run_id, test_seed=test_seed, model_seed=model_seed
    )
    
    criterion = custom_loss
    train_losses = []
    val_losses = []

    for epoch in range(num_epochs):
        # Training phase
        model.train()
        running_loss = 0.0
        for data, target in train_loader:
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            outputs = model(data)
            loss = criterion(outputs, target)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            running_loss += loss.item()

        # Validation phase
        val_loss = 0.0
        model.eval()
        with torch.no_grad():
            for data, target in val_loader:
                data, target = data.to(device), target.to(device)
                outputs = model(data)
                loss = criterion(outputs, target)
                val_loss += loss.item()

        # Calculate average losses (mean over batches, not over samples)
        val_loss /= len(val_loader)
        running_loss /= len(train_loader)

        # Learning rate scheduling and early stopping
        lr_before = optimizer.param_groups[0]["lr"]
        scheduler.step(val_loss)
        lr_after = optimizer.param_groups[0]["lr"]
        if lr_after != lr_before:
            # Replaces the message the scheduler used to print with verbose=True.
            print(f'Epoch {epoch+1}: reducing learning rate '
                  f'from {lr_before:.4e} to {lr_after:.4e}')
        early_stopping(val_loss, model)

        print(f'Epoch {epoch+1}, Loss: {running_loss:.4f}, '
              f'Validation Loss: {val_loss:.4f}, '
              f'LR: {optimizer.param_groups[0]["lr"]:.6f}')

        train_losses.append(running_loss)
        val_losses.append(val_loss)

        if early_stopping.early_stop:
            print(f'Early stopping triggered at epoch {epoch+1}')
            break

    return train_losses, val_losses

def estimate_uncertainty(model, data_loader, num_samples=50):
    """Estimate model uncertainty using Monte Carlo dropout.

    The model is put in eval mode with only its ``nn.Dropout`` layers active,
    and ``data_loader`` is passed through it ``num_samples`` times. Column 0 of
    the model output is taken as the prediction and column 1 as the natural
    log of the predicted variance, matching the ``exp(-s)`` weighting used by
    the training loss.

    Args:
        model: Network returning a ``(batch, 2)`` output.
        data_loader: Iterable of ``(data, target)`` batches; must yield the
            samples in the same order on every pass (no shuffling).
        num_samples: Number of stochastic forward passes.

    Returns:
        Tuple of per-sample arrays:
        ``mean_prediction`` (mean over passes),
        ``epistemic_uncertainty`` (standard deviation of predictions over passes),
        ``aleatoric_uncertainty`` (mean predicted variance over passes),
        ``combined_uncertainty`` (variance of predictions over passes plus
        the aleatoric term).
    """
    model.eval()

    # Enable dropout during inference
    dropout_layers = [m for m in model.modules() if isinstance(m, nn.Dropout)]
    for layer in dropout_layers:
        layer.train()

    predictions = []
    log_variances = []

    try:
        with torch.no_grad():
            for _ in range(num_samples):
                batch_predictions = []
                batch_log_variances = []
                for data, _ in data_loader:
                    data = data.to(device)
                    output = model(data)
                    batch_predictions.append(output[:, 0].cpu().numpy())
                    batch_log_variances.append(output[:, 1].cpu().numpy())
                predictions.append(np.concatenate(batch_predictions))
                log_variances.append(np.concatenate(batch_log_variances))
    finally:
        # Do not leak the MC-dropout state to the caller: leave the model
        # fully in eval mode, as set by model.eval() above.
        for layer in dropout_layers:
            layer.eval()

    predictions = np.array(predictions)
    log_variances = np.array(log_variances)

    mean_prediction = np.mean(predictions, axis=0)
    yhat_squared_mean = np.mean(np.square(predictions), axis=0)

    # The loss weights the error by exp(-s), so s is a natural-log variance and
    # the variance is exp(s), not 10**s.
    sigma_squared = np.exp(log_variances)
    aleatoric_uncertainty = np.mean(sigma_squared, axis=0)

    epistemic_uncertainty = np.std(predictions, axis=0)
    # E[y^2] - E[y]^2 is the variance of the MC predictions.
    combined_uncertainty = yhat_squared_mean - np.square(mean_prediction) + aleatoric_uncertainty

    return mean_prediction, epistemic_uncertainty, aleatoric_uncertainty, combined_uncertainty

def evaluate_model(model, test_loader, run_id, test_seed, model_seed):
    """Evaluate model performance and uncertainties.

    Loads the checkpoint named
    ``best_model_Run_{run_id}_test_data_seed_{test_seed}_model_seed_{model_seed}.pth``
    into ``model``, runs ``estimate_uncertainty`` on ``test_loader`` and
    computes the mean absolute error of the mean prediction.

    Args:
        model: Network whose weights are replaced by the checkpoint.
        test_loader: Iterable of ``(data, target)`` batches; must yield samples
            in the same order on every pass so that predictions and targets
            line up.
        run_id, test_seed, model_seed: Identify the checkpoint file.

    Returns:
        Tuple ``(mae, mean_pred, true_values, epistemic_unc, aleatoric_unc,
        combined_unc)``.
    """
    model_filename = f'best_model_Run_{run_id}_test_data_seed_{test_seed}_model_seed_{model_seed}.pth'
    # map_location lets a checkpoint written on a GPU load in a CPU-only
    # session (and vice versa) instead of failing during deserialisation.
    model.load_state_dict(torch.load(model_filename, map_location=device))
    
    mean_pred, epistemic_unc, aleatoric_unc, combined_unc = estimate_uncertainty(model, test_loader)

    # Collect the targets in loader order to align them with the predictions.
    true_values = np.concatenate([target.numpy() for _, target in test_loader])

    mae = np.mean(np.abs(mean_pred - true_values))

    return mae, mean_pred, true_values, epistemic_unc, aleatoric_unc, combined_unc

# Part 5: Experimental Functions

In [None]:
#------------------------------------------------------------------------------
# Part 5: Experimental Functions
#------------------------------------------------------------------------------

def set_seed(seed):
    """Seed the Python, NumPy and PyTorch RNGs and force deterministic cuDNN.

    Args:
        seed: Integer seed applied to every random number generator.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    if torch.cuda.is_available():
        for cuda_seed_fn in (torch.cuda.manual_seed, torch.cuda.manual_seed_all):
            cuda_seed_fn(seed)

    # Trade cuDNN autotuning for run-to-run reproducibility.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

def run_experiment(data, labels, test_seed, model_seed, run_id):
    """Train and evaluate one model on one random 70/10/20 split.

    Args:
        data: Input samples.
        labels: Targets aligned with ``data``.
        test_seed: Seed applied before the data is split.
        model_seed: Seed applied before the model is created and trained.
        run_id: Identifier used in the checkpoint file name.

    Returns:
        Dict with the two seeds, the test MAE and the mean aleatoric,
        epistemic and combined uncertainties over the test set.
    """
    # Seed first: neither split passes random_state, so both draw from the
    # global RNG state.
    set_seed(test_seed)

    # Hold out 20% for testing, then 12.5% of the remaining 80% (= 10% overall)
    # for validation.
    trainval_x, test_x, trainval_y, test_y = train_test_split(
        data, labels, test_size=0.2, shuffle=True)
    train_x, val_x, train_y, val_y = train_test_split(
        trainval_x, trainval_y, test_size=0.125, shuffle=True)

    def make_loader(samples, targets, shuffle):
        # Only the training loader shuffles; evaluation order must stay fixed.
        return DataLoader(EarthquakeDataset(samples, targets),
                          batch_size=256, shuffle=shuffle, num_workers=2)

    train_loader = make_loader(train_x, train_y, True)
    val_loader = make_loader(val_x, val_y, False)
    test_loader = make_loader(test_x, test_y, False)

    # Re-seed so weight initialisation and training depend on model_seed only.
    set_seed(model_seed)
    model = EarthquakeModel().to(device)

    # The per-epoch loss histories returned by train_model are not used here.
    train_model(
        model, train_loader, val_loader,
        run_id=run_id, test_seed=test_seed, model_seed=model_seed
    )

    mae, _, _, epistemic_unc, aleatoric_unc, combined_unc = evaluate_model(
        model, test_loader, run_id, test_seed, model_seed)

    outcome = {"test_seed": test_seed, "model_seed": model_seed}
    outcome["mae"] = float(mae)
    outcome["mean_aleatoric_uncertainty"] = float(np.mean(aleatoric_unc))
    outcome["mean_epistemic_uncertainty"] = float(np.mean(epistemic_unc))
    outcome["mean_combined_uncertainty"] = float(np.mean(combined_unc))
    return outcome

# Part 6: Visualization Functions

In [None]:
#------------------------------------------------------------------------------
# Part 6: Visualization Functions
#------------------------------------------------------------------------------

def _plot_uncertainty_histogram(ax, values, kind):
    """Draw one uncertainty histogram on ``ax`` with its mean marked in red."""
    mean_value = np.mean(values)
    sns.histplot(values, bins=10, ax=ax, alpha=0.7, edgecolor='black')
    ax.axvline(mean_value, color='red', linestyle='--',
               label=f'Mean = {mean_value:.4f}')
    ax.set_title(f'{kind} Uncertainty Distribution',
                 fontsize=18, fontweight='bold')
    ax.set_xlabel(f'Mean {kind} Uncertainty', fontsize=18, fontweight='bold')
    ax.set_ylabel('Frequency', fontsize=18, fontweight='bold')
    ax.tick_params(labelsize=18)
    ax.legend(fontsize=18)
    ax.grid(True)


def plot_experimental_results(results, chronological_mae=0.2441):
    """Plot comprehensive analysis of experimental results.

    Writes three figures to the current working directory:
    ``mae_distribution.png``, ``uncertainty_distributions.png`` and
    ``mae_boxplot.png``.

    Args:
        results: List of per-run dicts with the keys ``median_mae``,
            ``median_aleatoric_uncertainty``, ``median_epistemic_uncertainty``
            and ``median_combined_uncertainty``.
        chronological_mae: Test MAE obtained with chronological splitting,
            annotated on the MAE histogram for comparison. Pass ``None`` to
            omit the annotation.
    """
    n_runs = len(results)

    # Extract metrics
    mae_values = [result['median_mae'] for result in results]
    uncertainty_panels = [
        ('Aleatoric', [result['median_aleatoric_uncertainty'] for result in results]),
        ('Epistemic', [result['median_epistemic_uncertainty'] for result in results]),
        ('Combined', [result['median_combined_uncertainty'] for result in results]),
    ]

    # Calculate statistics
    mean_mae = np.mean(mae_values)
    std_mae = np.std(mae_values)

    # Plot 1: MAE Distribution
    plt.figure(figsize=(14, 10))
    n, _, _ = plt.hist(mae_values, bins=10, alpha=0.7, edgecolor='black')

    plt.axvline(mean_mae, color='red', linestyle='dashed', linewidth=2,
                label=f'Mean = {mean_mae:.4f}')
    # This line marks mean + std only, so label it "+" rather than "±".
    plt.axvline(mean_mae + std_mae, color='green', linestyle='dotted', linewidth=2,
                label=f'Mean + Std = {mean_mae + std_mae:.4f}')
    plt.axvline(mean_mae - std_mae, color='blue', linestyle='dotted', linewidth=2,
                label=f'Mean - Std = {mean_mae - std_mae:.4f}')

    if chronological_mae is not None:
        # The text sits slightly to the right of the arrow tip, at 60% of the
        # tallest bar, so it does not overlap the x axis.
        plt.annotate(f'{chronological_mae:.4f}: test MAE\n(chronological splitting)',
                     xy=(chronological_mae, 0),
                     xytext=(chronological_mae + 0.0089, max(n) * 0.6),
                     arrowprops=dict(facecolor='orange', shrink=0),
                     fontsize=18, ha='right', va='center')

    plt.xlabel('MAE', fontsize=18, fontweight='bold')
    plt.ylabel('Frequency', fontsize=18, fontweight='bold')
    plt.title(f'MAE Distribution over {n_runs} Random Splitting Runs',
              fontsize=18, fontweight='bold')
    plt.xticks(fontsize=18)
    plt.yticks(fontsize=18)
    plt.legend(fontsize=18)
    plt.grid(True)
    plt.tight_layout()
    plt.savefig('mae_distribution.png')
    plt.close()

    # Plot 2: Uncertainty Distributions (one panel per uncertainty type)
    fig, axes = plt.subplots(3, 1, figsize=(14, 24))
    for ax, (kind, values) in zip(axes, uncertainty_panels):
        _plot_uncertainty_histogram(ax, values, kind)

    plt.tight_layout()
    plt.savefig('uncertainty_distributions.png')
    plt.close()

    # Plot 3: Box Plot of MAEs
    plt.figure(figsize=(14, 10))
    plt.boxplot(mae_values)
    plt.ylabel('MAE', fontsize=18, fontweight='bold')
    plt.title(f'Box Plot of MAEs across {n_runs} Runs', fontsize=18, fontweight='bold')
    plt.grid(True)
    plt.tight_layout()
    plt.savefig('mae_boxplot.png')
    plt.close()

# Part 7: Main Execution

In [None]:
#------------------------------------------------------------------------------
# Part 7: Main Execution
#------------------------------------------------------------------------------

if __name__ == "__main__":
    # Load preprocessed data
    output_data_file = "pre_processed_data.npy"
    output_labels_file = "pre_processed_labels.npy"

    # Verify files exist. An explicit raise is used instead of `assert`,
    # which is stripped when Python runs with -O.
    for description, path in (("Data", output_data_file),
                              ("Labels", output_labels_file)):
        if not os.path.isfile(path):
            raise FileNotFoundError(f"{description} file not found at {path}")

    # Load data
    data = torch.tensor(np.load(output_data_file), dtype=torch.float32)
    labels = torch.tensor(np.load(output_labels_file), dtype=torch.float32)
    print(f"Data shape: {data.shape}, Labels shape: {labels.shape}")

    # Initialize results storage
    results = []
    num_runs = 50  # number of different random test sets
    model_seeds = [42, 123, 256, 789, 1024]  # 5 different model initializations
    results_file = "results_50_runs.json"

    # Run experiments; run_id doubles as the seed of the data split.
    for run_id in tqdm(range(1, num_runs + 1)):
        test_results = []
        for model_seed in model_seeds:
            result = run_experiment(data, labels, run_id, model_seed, run_id)
            test_results.append(result)

        # Find median performance: the middle element of the MAE-sorted runs,
        # derived from the list length so it follows any change to model_seeds.
        sorted_results = sorted(test_results, key=lambda x: x['mae'])
        median_result = sorted_results[len(sorted_results) // 2]

        results.append({
            "run_id": run_id,
            "median_mae": median_result['mae'],
            "median_aleatoric_uncertainty": median_result['mean_aleatoric_uncertainty'],
            "median_epistemic_uncertainty": median_result['mean_epistemic_uncertainty'],
            "median_combined_uncertainty": median_result['mean_combined_uncertainty'],
            "all_results": test_results
        })

        # Save results after each run so partial progress survives a crash
        with open(results_file, 'w') as f:
            json.dump(results, f, indent=4)

        print(f"\nCompleted run {run_id}/{num_runs}")
        print(f"Median MAE: {median_result['mae']:.4f}")
        print(f"Median Aleatoric Uncertainty: {median_result['mean_aleatoric_uncertainty']:.4f}")
        print(f"Median Epistemic Uncertainty: {median_result['mean_epistemic_uncertainty']:.4f}")
        print(f"Median Combined Uncertainty: {median_result['mean_combined_uncertainty']:.4f}")

    # Plot final results
    print("\nGenerating final plots...")
    plot_experimental_results(results)

    # Calculate overall statistics
    maes = [result["median_mae"] for result in results]
    mean_mae = np.mean(maes)
    std_mae = np.std(maes)

    print("\nFinal Statistics:")
    print(f"Mean MAE: {mean_mae:.4f}")
    print(f"Standard Deviation of MAE: {std_mae:.4f}")
    print(f"Minimum MAE: {min(maes):.4f}")
    print(f"Maximum MAE: {max(maes):.4f}")

    # End timing (start_time was recorded in the setup cell)
    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"\nTotal execution time: {elapsed_time/60:.2f} minutes")

    print(f"\nExperiment completed. Results saved in '{results_file}'")