## Variational Auto Encoder

In [1]:
# Import necessary libraries
import pandas as pd
import pickle as pkl
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os, sys
import random
import warnings
from torch.utils.data import DataLoader

# Add utility paths
sys.path.append(os.path.join(os.getcwd(), 'utils'))

# Import utility functions
from utils.data_loading import *
from utils.data_processing import *
from utils.vae import *
from utils.animation import *
from utils.metrics import *
from utils.pipeline import *

# Enable autoreload
%reload_ext autoreload
%autoreload 2

# Suppress warnings
warnings.filterwarnings("ignore", category=RuntimeWarning)

# Define data path
current_dir = os.getcwd()
data_path = os.path.join(current_dir, 'data')
print(f"Data path: {data_path}")

Data path: /Users/lharriso/Documents/GitHub/gm4cs-l/data


In [2]:
# Load the data
filename = os.path.join(data_path, 'ssp585_time_series.pkl')
data, nan_mask = preprocess_data(data_path, filename)

Loading data from /Users/lharriso/Documents/GitHub/gm4cs-l/data/ssp585_time_series.pkl
Data loaded successfully.
Filtering data...
Data loaded successfully.
Filtering data...


100%|██████████| 72/72 [00:00<00:00, 21864.31it/s]
100%|██████████| 72/72 [00:00<00:00, 21864.31it/s]


Data filtered. Kept 34 models
Creating NaN mask...


100%|██████████| 34/34 [00:02<00:00, 14.34it/s]
100%|██████████| 34/34 [00:02<00:00, 14.34it/s]


NaN mask created.
Masking out NaN values...


100%|██████████| 34/34 [00:01<00:00, 28.18it/s]
100%|██████████| 34/34 [00:01<00:00, 28.18it/s]


NaN values masked out.
Reshaping data...


100%|██████████| 34/34 [00:03<00:00,  8.63it/s]
100%|██████████| 34/34 [00:03<00:00,  8.63it/s]


Data reshaped.
Adding the forced response to the data...


100%|██████████| 34/34 [00:03<00:00,  9.51it/s]
100%|██████████| 34/34 [00:03<00:00,  9.51it/s]


Forced response added.
Removing NaN values from the grid...


100%|██████████| 34/34 [00:02<00:00, 11.64it/s]
100%|██████████| 34/34 [00:02<00:00, 11.64it/s]


NaN values removed.


In [3]:
# Randomly select and keep the data corresponding to n models
n = 5
model_keys = random.sample(data.keys(), n)
data = {key: value for key,value in data.items() if key in model_keys}

since Python 3.9 and will be removed in a subsequent version.
  model_keys = random.sample(data.keys(), n)


In [4]:
# Select one of the models randomly for testing and the rest for training according to the leave-one-out strategy
test_model = random.choice(list(data.keys()))
train_models = [model for model in data.keys() if model != test_model]

# Create the training and testing datasets
train_data = {model: data[model] for model in train_models}
test_data = {test_model: data[test_model]}

print(f"Training models: {train_models}")
print(f"Testing model: {test_model}")

Training models: ['ACCESS-CM2', 'CNRM-ESM2-1', 'IPSL-CM6A-LR', 'CanESM5-1']
Testing model: UKESM1-0-LL


In [5]:
train_data[list(train_data.keys())[0]].keys()

dict_keys(['r1i1p1f1', 'r2i1p1f1', 'r3i1p1f1', 'r4i1p1f1', 'r5i1p1f1', 'r10i1p1f1', 'r6i1p1f1', 'r7i1p1f1', 'r8i1p1f1', 'r9i1p1f1', 'forced_response'])

In [6]:
# Create dataset
train_dataset = ClimateDataset(train_data)
test_dataset = ClimateDataset(test_data)

# Create dataloaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Print dataset sizes
print(f'Training dataset size: {len(train_dataset)}')
print(f'Testing dataset size: {len(test_dataset)}')

Creating datasets...


Processing models: 100%|██████████| 4/4 [00:00<00:00, 97541.95it/s]
Processing models: 100%|██████████| 4/4 [00:00<00:00, 97541.95it/s]
  self.inputs = torch.tensor(self.inputs, dtype=torch.float32)
  self.inputs = torch.tensor(self.inputs, dtype=torch.float32)


Creating datasets...


Processing models: 100%|██████████| 1/1 [00:00<00:00, 26886.56it/s]
Processing models: 100%|██████████| 1/1 [00:00<00:00, 26886.56it/s]


Training dataset size: 126
Testing dataset size: 17


In [7]:
# Initialize the VAE model
input_channels = 1  # Assuming each grid spot is treated as a single channel
latent_dim = 128
vae_model = VAE(input_channels, latent_dim)

# Define optimizer
optimizer = torch.optim.Adam(vae_model.parameters(), lr=1e-3)

# Train the VAE
train_vae(vae_model, train_loader, optimizer, num_epochs=10)

Epoch 1/10:   0%|          | 0/4 [00:44<?, ?it/s]


ValueError: Using a target size (torch.Size([32, 1, 165, 6523])) that is different to the input size (torch.Size([32, 1, 168, 6524])) is deprecated. Please ensure they have the same size.