## Variational Auto Encoder

In [1]:
# Import necessary libraries
import pandas as pd
import pickle as pkl
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os, sys
import random
import warnings


from torch.utils.data import DataLoader

# Add utility paths
sys.path.append(os.path.join(os.getcwd(), 'utils'))

# Import utility functions
from utils.data_loading import *
from utils.data_processing import *
from utils.vae import *
from utils.animation import *
from utils.metrics import *
from utils.pipeline import *

# Enable autoreload
%reload_ext autoreload
%autoreload 2

# Suppress warnings
warnings.filterwarnings("ignore", category=RuntimeWarning)

# Define data path
current_dir = os.getcwd()
data_path = os.path.join(current_dir, 'data')
print(f"Data path: {data_path}")

Data path: /Users/lharriso/Documents/GitHub/gm4cs-l/data


In [2]:
# Use MPS / Cuda or CPU if none of the options are available
device = torch.device("mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
random.seed(42)

Using device: mps


In [3]:
# Load the data
filename = os.path.join(data_path, 'ssp585_time_series.pkl')
data, nan_mask = preprocess_data(data_path, filename)

Loading data from /Users/lharriso/Documents/GitHub/gm4cs-l/data/ssp585_time_series.pkl
Data loaded successfully.
Filtering data...
Data loaded successfully.
Filtering data...


100%|██████████| 72/72 [00:00<00:00, 37673.39it/s]
100%|██████████| 72/72 [00:00<00:00, 37673.39it/s]


Data filtered. Kept 34 models
Creating NaN mask...


100%|██████████| 34/34 [00:02<00:00, 13.12it/s]
100%|██████████| 34/34 [00:02<00:00, 13.12it/s]


NaN mask created.
Masking out NaN values...


100%|██████████| 34/34 [00:01<00:00, 27.51it/s]
100%|██████████| 34/34 [00:01<00:00, 27.51it/s]


NaN values masked out.
Reshaping data...


100%|██████████| 34/34 [00:04<00:00,  7.96it/s]
100%|██████████| 34/34 [00:04<00:00,  7.96it/s]


Data reshaped.
Adding the forced response to the data...


100%|██████████| 34/34 [00:09<00:00,  3.64it/s]
100%|██████████| 34/34 [00:09<00:00,  3.64it/s]


Forced response added.
Removing NaN values from the grid...


100%|██████████| 34/34 [00:03<00:00, 10.52it/s]
100%|██████████| 34/34 [00:03<00:00, 10.52it/s]


NaN values removed.


In [4]:
# Randomly select and keep the data corresponding to n models
n = 5
model_keys = random.sample(data.keys(), n)
data = {key: value for key,value in data.items() if key in model_keys}

since Python 3.9 and will be removed in a subsequent version.
  model_keys = random.sample(data.keys(), n)


In [5]:
# Select one of the models randomly for testing and the rest for training according to the leave-one-out strategy
test_model = random.choice(list(data.keys()))
train_models = [model for model in data.keys() if model != test_model]

# Create the training and testing datasets
train_data = {model: data[model] for model in train_models}
test_data = {test_model: data[test_model]}

print(f"Training models: {train_models}")
print(f"Testing model: {test_model}")

Training models: ['EC-Earth3', 'E3SM-2-0', 'GISS-E2-1-G', 'ACCESS-ESM1-5']
Testing model: GISS-E2-2-G


In [6]:
train_data[list(train_data.keys())[0]].keys()

dict_keys(['r1i1p1f1', 'r22i1p1f1', 'r24i1p1f1', 'r21i1p1f1', 'r6i1p1f1', 'r9i1p1f1', 'r11i1p1f1', 'r13i1p1f1', 'r15i1p1f1', 'r4i1p1f1', 'r7i1p1f1', 'r19i1p1f1', 'r20i1p1f1', 'r12i1p1f1', 'r18i1p1f1', 'r16i1p1f1', 'r17i1p1f1', 'r14i1p1f1', 'r25i1p1f1', 'r10i1p1f1', 'r2i1p1f1', 'r23i1p1f1', 'r3i1p1f1', 'forced_response'])

In [7]:
# Create dataset
train_dataset = ClimateDataset(train_data)
test_dataset = ClimateDataset(test_data)

# Create dataloaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Print dataset sizes
print(f'Training dataset size: {len(train_dataset)}')
print(f'Testing dataset size: {len(test_dataset)}')

Creating datasets...


Processing models: 100%|██████████| 4/4 [00:00<00:00, 104857.60it/s]
Processing models: 100%|██████████| 4/4 [00:00<00:00, 104857.60it/s]
  self.inputs = torch.tensor(self.inputs, dtype=torch.float32)
  self.inputs = torch.tensor(self.inputs, dtype=torch.float32)


Creating datasets...


Processing models: 100%|██████████| 1/1 [00:00<00:00, 14873.42it/s]

Training dataset size: 124
Testing dataset size: 11





In [8]:
# Initialize the VAE model
input_dim = train_dataset.inputs.shape[1]  # Assuming inputs are flattened
hidden_dim = 400
latent_dim = 200
device = 'cuda' if torch.cuda.is_available() else 'cpu'
vae_model = VAE(input_dim=input_dim, hidden_dim=hidden_dim, latent_dim=latent_dim, device=device)

# Define optimizer
optimizer = torch.optim.Adam(vae_model.parameters(), lr=1e-3)

# Train the VAE
train_vae(vae_model, train_loader, optimizer, epochs=10, device=device)

Batch 1: Input shape: torch.Size([32, 1076295])
Error during forward pass: mat1 and mat2 shapes cannot be multiplied (32x1076295 and 165x400)
Model input shape: torch.Size([32, 1076295])


RuntimeError: mat1 and mat2 shapes cannot be multiplied (32x1076295 and 165x400)