In [6]:
import numpy as np
# Read both arrays from disk. Judging by the file names, the first holds the
# normalised variables and the second the un-normalised ones (TODO confirm).
samples = np.load('data/global_pft_ex_norm.npy')
pft = np.load('data/global_pft_ex.npy')
# Replace row 4 of `samples` with row 4 of the second array. Row 4 is the one
# later treated as the categorical PFT feature and one-hot encoded.
samples[4] = pft[4]

In [9]:
# Draw a random subset of the sample columns and build the feature/target arrays.
# A dedicated, seeded Generator is used instead of the global np.random state so
# that a fresh "Restart & Run All" always selects the same columns.
SEED = 42  # same value as the random_state used for the train/validation split
n_subsample = 800000
rng = np.random.default_rng(SEED)
# Sample without replacement so that no column is selected twice.
# Raises ValueError if n_subsample exceeds the number of available columns.
subsample_indices = rng.choice(samples.shape[1], size=n_subsample, replace=False)
# Extract features and target
# Features: rows 0, 1 and 4 of `samples`, transposed to shape (n_subsample, 3).
X = samples[[0, 1, 4], :][:, subsample_indices].T
# Target: row 3 of `samples` at the same columns, shape (n_subsample,).
y = samples[3, subsample_indices]

In [10]:
# Sanity check: X should be (n_subsample, 3) and y should be (n_subsample,).
X.shape, y.shape

((800000, 3), (800000,))

In [11]:
# One-hot encode the PFT variable (last column in X)
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# This cell rebinds X to the encoded matrix at the end, so running it a second
# time would silently one-hot encode an already-encoded column. Fail loudly instead.
if X.shape[1] != 3:
    raise RuntimeError(
        f"Expected X with 3 columns [SSRD, VPD, PFT] but got shape {X.shape}; "
        "re-run the subsampling cell before one-hot encoding again."
    )

print(f"Original X shape: {X.shape}")
print(f"Unique PFT values: {np.unique(X[:, 2])}")

# Separate continuous features (SSRD, VPD) from categorical (PFT)
X_continuous = X[:, :2]  # First 2 columns: SSRD, VPD
X_pft = X[:, 2:3]        # Last column: PFT (keep 2D for OneHotEncoder)

# Initialize and fit one-hot encoder (dense float32 output, one column per category)
pft_encoder = OneHotEncoder(sparse_output=False, dtype=np.float32)
X_pft_encoded = pft_encoder.fit_transform(X_pft)

print(f"PFT categories: {pft_encoder.categories_[0]}")
print(f"One-hot encoded PFT shape: {X_pft_encoded.shape}")

# Combine continuous features with one-hot encoded PFT
X_encoded = np.concatenate([X_continuous, X_pft_encoded], axis=1)

# Build the column names from the categories the encoder actually found, so the
# printed feature order cannot drift from the real layout of X_encoded.
feature_names = ["SSRD", "VPD"] + [
    f"PFT_{category}" for category in pft_encoder.categories_[0]
]

print(f"Final X shape after one-hot encoding: {X_encoded.shape}")
print(f"Feature order: [{', '.join(feature_names)}]")

# Update X to use the encoded version
X = X_encoded

Original X shape: (800000, 3)
Unique PFT values: [1. 2. 3.]
PFT categories: [1. 2. 3.]
One-hot encoded PFT shape: (800000, 3)
Final X shape after one-hot encoding: (800000, 5)
Feature order: [SSRD, VPD, PFT_1.0, PFT_2.0, PFT_3.0]


In [15]:
# Hold out 20% of the subsampled rows for validation (fixed seed, so the split
# is repeatable), then wrap every array in a float32 PyTorch tensor.
import torch
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=42, test_size=0.2)

# Feature matrices keep their 2-D layout.
X_train_tensor, X_val_tensor = (
    torch.tensor(features, dtype=torch.float32) for features in (X_train, X_val)
)
# Targets gain a trailing singleton dimension, i.e. shape (n, 1).
y_train_tensor, y_val_tensor = (
    torch.tensor(targets, dtype=torch.float32).unsqueeze(1) for targets in (y_train, y_val)
)

# Report the size of each partition.
n_train = X_train_tensor.shape[0]
n_val = X_val_tensor.shape[0]
print(f"Training set: {n_train:,} samples")
print(f"Validation set: {n_val:,} samples")
print(f"Total subsampled: {n_train + n_val:,} samples")

Training set: 640,000 samples
Validation set: 160,000 samples
Total subsampled: 800,000 samples


In [16]:
from model import ClimateDataset
from torch.utils.data import DataLoader

# Mini-batch size shared by both loaders; tune it to the available memory / GPU capacity.
batch_size = 64

# Wrap each (features, target) tensor pair in the project's dataset class.
train_dataset = ClimateDataset(X_train_tensor, y_train_tensor)
val_dataset = ClimateDataset(X_val_tensor, y_val_tensor)

# Training batches are reshuffled every epoch; validation keeps a fixed order.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=batch_size)

In [17]:
from model import NeuralNet
from torch import nn, optim

# Seed PyTorch's global RNG so that anything drawing from it in this cell
# (e.g. parameter initialisation, DataLoader shuffling) is repeatable on a
# fresh run. Some CUDA kernels may still be non-deterministic.
torch.manual_seed(42)

# Initialize model, loss function, and optimizer
input_dim = X.shape[1]  # Number of features after one-hot encoding (SSRD, VPD, PFT_1, PFT_2, PFT_3)
print(f"Model input dimension: {input_dim}")

model = NeuralNet(input_dim)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

print(f"Training on device: {device}")
print(f"Model initialized with {input_dim} input features")

# Training loop
# Each epoch: one optimisation pass over train_loader, then one gradient-free
# evaluation pass over val_loader. Losses are accumulated weighted by batch
# size so the reported value is the true per-sample mean even when the last
# batch is smaller than batch_size.
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    total_train_loss = 0.0
    n_train_samples = 0
    for batch_X, batch_y in train_loader:
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)

        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

        # criterion returns the batch mean; scale back up to a batch sum.
        total_train_loss += loss.item() * batch_X.size(0)
        n_train_samples += batch_X.size(0)

    # Validation Step
    model.eval()
    total_val_loss = 0.0
    n_val_samples = 0
    with torch.no_grad():
        for batch_X, batch_y in val_loader:
            batch_X, batch_y = batch_X.to(device), batch_y.to(device)
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            total_val_loss += loss.item() * batch_X.size(0)
            n_val_samples += batch_X.size(0)

    avg_train_loss = total_train_loss / n_train_samples
    avg_val_loss = total_val_loss / n_val_samples

    print(f"Epoch [{epoch+1}/{num_epochs}], Train Loss: {avg_train_loss:.10f}, "
            f"Val Loss: {avg_val_loss:.10f}")

Model input dimension: 5
Training on device: cuda
Model initialized with 5 input features
Epoch [1/10], Train Loss: 0.0042839308, Val Loss: 0.0021990116
Epoch [2/10], Train Loss: 0.0021645566, Val Loss: 0.0019976896
Epoch [3/10], Train Loss: 0.0020951237, Val Loss: 0.0021352977
Epoch [4/10], Train Loss: 0.0020766684, Val Loss: 0.0023167580
Epoch [5/10], Train Loss: 0.0020568987, Val Loss: 0.0019707277
Epoch [6/10], Train Loss: 0.0020509308, Val Loss: 0.0019815835
Epoch [7/10], Train Loss: 0.0020355976, Val Loss: 0.0020240303
Epoch [8/10], Train Loss: 0.0020352916, Val Loss: 0.0020893544
Epoch [9/10], Train Loss: 0.0020289829, Val Loss: 0.0021640119
Epoch [10/10], Train Loss: 0.0020258463, Val Loss: 0.0019405775


In [18]:
import os

# Make sure the target directory exists first: torch.save does not create
# missing parent directories, and failing here would discard the trained weights.
os.makedirs("./outputs", exist_ok=True)
# Persist only the learned parameters (state_dict), not the module object itself.
torch.save(model.state_dict(), "./outputs/model_weights_pre-training_pft_ex.pth")