# Tutorial: Linear Variational Autoencoder (VAE) for Climate Data

This tutorial demonstrates the training of a Linear Variational Autoencoder (VAE) to learn transformations from input climate data to corresponding forced responses. It includes data preprocessing, model training, and evaluation using Leave-One-Out (LOO) cross-validation.

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os, sys
import random
import warnings

from torch.utils.data import DataLoader

# Add utility paths
sys.path.append(os.path.join(os.getcwd(), 'utils'))

# Import utility functions
from utils.data_loading import *
from utils.data_processing import *
from utils.trend_vae_3 import *
from utils.animation import *
from utils.metrics import *
from utils.pipeline import *

# Enable autoreload
%reload_ext autoreload
%autoreload 2

# Suppress warnings
warnings.filterwarnings("ignore", category=RuntimeWarning)

# Define data path
current_dir = os.getcwd()
data_path = os.path.join(current_dir, 'data')
print(f"Data path: {data_path}")

Data path: /Users/lharriso/Documents/GitHub/gm4cs-l/data


In [2]:
# Use MPS / Cuda or CPU if none of the options are available
device = torch.device("mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
random.seed(42)

Using device: mps


In [3]:
# Load the data
filename = os.path.join(data_path, 'ssp585_time_series.pkl')
data, nan_mask = preprocess_data(data_path, filename)

# Randomly select and keep the data corresponding to n models
n_models = 34
model_keys = random.sample(data.keys(), n_models)
data = {key: value for key,value in data.items() if key in model_keys}

# Select one of the models randomly for testing and the rest for training according to the leave-one-out strategy
models = list(data.keys())
print(f"There are {len(models)} models in the dataset.")

Loading data from /Users/lharriso/Documents/GitHub/gm4cs-l/data/ssp585_time_series.pkl
Data loaded successfully.
Filtering data...
Data loaded successfully.
Filtering data...


100%|██████████| 72/72 [00:00<00:00, 62549.69it/s]

Data filtered. Kept 34 models
Creating NaN mask...



100%|██████████| 34/34 [00:02<00:00, 13.92it/s]

NaN mask created.
Masking out NaN values...
Masking out NaN values...



100%|██████████| 34/34 [00:01<00:00, 22.26it/s]

NaN values masked out.
Reshaping data...
Reshaping data...



100%|██████████| 34/34 [00:04<00:00,  7.22it/s]

Data reshaped.
Adding the forced response to the data...
Adding the forced response to the data...



100%|██████████| 34/34 [00:27<00:00,  1.24it/s]

Forced response added.
Removing NaN values from the grid...
Removing NaN values from the grid...



100%|██████████| 34/34 [00:03<00:00,  9.06it/s]

NaN values removed.



since Python 3.9 and will be removed in a subsequent version.
  model_keys = random.sample(data.keys(), n_models)


There are 34 models in the dataset.


## Next

In [None]:
# Leave-One-Out Cross-Validation
mse_scores = []
center = True  # Center the data
hidden_dim = 128 # Increased for better representation
feat_dim = 6523
latent_dim = 64 # Intermediate layer size
z_dim = 5 # Actual latent space dimension
trend_poly = 2
seq_len = 165 # Number of time steps in the sequence
batch_size = 32 # Batch size for training

for test_model in data.keys():
    # Split data into training and testing sets
    train_models = [model for model in data.keys() if model != test_model]
    train_data = {model: data[model] for model in train_models}
    test_data = {test_model: data[test_model]}

    # Normalize the data
    normalized_train_data, normalized_test_data, _, _ = normalize_data(train_data, test_data, center=center)

    # Create datasets and dataloaders
    train_dataset = ClimateDataset(normalized_train_data)
    test_dataset = ClimateDataset(normalized_test_data)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    # Initialize the model
    input_dim = train_dataset[0]['input'][1]
    vae_model = Trend_Vae(
        seq_len=seq_len,
        feat_dim=feat_dim,
        hidden_dim=hidden_dim,
        latent_dim=latent_dim,
        z_dim=z_dim,
        trend_poly=trend_poly,
        use_residual_conn=True,
        device=device,
        ).to(device)

    # Train the model (simplified for tutorial purposes)
    epochs = 500
    losses = []

    optimizer = torch.optim.Adam(vae_model.parameters(), lr=3e-4) # Weight decay is not used in this version (was weight_decay=1e-5)

    losses = train_vae(vae_model, train_loader, optimizer, epochs=epochs, device=device)

    # Evaluate the model
    results = evaluate_vae(vae_model, test_loader, device, testing_statistics=None)  # Replace None with actual testing_statistics if available
    mse = results['mse']
    normalized_mse = results['normalized_mse']
    mse_scores.append(mse)

    print(f"Test model: {test_model}, MSE: {mse}, Normalized MSE: {normalized_mse}")

# Save MSE scores to a file
mse_file = os.path.join(current_dir, 'mse_scores.pkl')
with open(mse_file, 'wb') as f:
    pkl.dump(mse_scores, f)

print(f"MSE scores saved to {mse_file}")


Normalizing data...


  0%|          | 0/33 [00:00<?, ?it/s]

100%|██████████| 33/33 [00:25<00:00,  1.30it/s]

100%|██████████| 1/1 [00:00<00:00,  1.90it/s]

Data normalization completed.





Creating datasets...


Processing models: 100%|██████████| 33/33 [00:00<00:00, 49030.12it/s]



Creating datasets...


Processing models: 100%|██████████| 1/1 [00:00<00:00, 1971.93it/s]

Input dimension: torch.Size([165, 6523])





MSE scores saved to /Users/lharriso/Documents/GitHub/gm4cs-l/mse_scores.pkl


In [None]:
# Plot MSE Distributions
# Load MSE scores
with open(mse_file, 'rb') as f:
    mse_scores = pkl.load(f)

# Create a quantile plot
plt.figure(figsize=(10, 6))
sns.ecdfplot(mse_scores, complementary=True)
plt.title('Quantile Plot of MSE Distributions')
plt.xlabel('MSE')
plt.ylabel('1 - CDF')
plt.grid(True)
plt.show()