## Reduced Rank Regression

In [1]:
import os
import pickle as pkl
import numpy as np
import random
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler
from utils.helpers import *

# Enable IPython's autoreload extension in mode 2, which reloads imported
# modules (e.g. utils.helpers) before each cell is executed, so edits to the
# helper code are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [2]:
# Input location: the pickle file is expected in a 'data' folder under the
# current working directory.
filename = 'ssp585_time_series.pkl'
data_path = os.path.join(os.getcwd(), 'data')

# Run the preprocessing helper (presumably provided by the utils.helpers star
# import); it returns two values, kept here as the processed data and a NaN mask.
data, nan_mask = preprocess_data(data_path, filename)

Loading data from ssp585_time_series.pkl
Data loaded successfully.
Filtering data...


100%|██████████| 72/72 [00:00<00:00, 67847.65it/s]


Data filtered. Kept 34 models
Creating NaN mask...


100%|██████████| 34/34 [00:01<00:00, 20.25it/s]


NaN mask created.
Masking out NaN values...


100%|██████████| 34/34 [00:01<00:00, 21.73it/s]


NaN values masked out.
Reshaping data...


100%|██████████| 34/34 [00:03<00:00,  9.45it/s]


Data reshaped.
Adding the forced response to the data...


100%|██████████| 34/34 [00:05<00:00,  6.49it/s]


Forced response added.
Removing NaN values from the grid...


100%|██████████| 34/34 [00:01<00:00, 26.26it/s]


NaN values removed.


In [3]:
# Candidate lambda values, one decade apart, handed to the cross-validation
# (presumably regularisation strengths -- confirm against loo_cross_validation).
lambdas = [
    0.01,
    0.1,
    1,
    10,
]

In [8]:
# Only keep a few random models for the sake of time.
# The requested count is capped at the number of available models, because
# random.sample raises ValueError when asked for more items than the
# population contains.
n_models_to_keep = 7
random.seed(42)  # fixed seed on the module-level generator so the same subset is drawn on every run
available_models = list(data.keys())
models = random.sample(available_models, min(n_models_to_keep, len(available_models)))
subset_data = {model: data[model] for model in models}
print(f"Models kept to test the pipeline: {models}")

Models kept to test the pipeline: ['GISS-E2-2-G', 'EC-Earth3', 'ACCESS-ESM1-5', 'CESM2-FV2', 'NorCPM1', 'CESM2', 'CAS-ESM2-0']


In [None]:
# Leave-one-out cross-validation on the sampled models, evaluating every
# candidate lambda at a fixed rank of 15. Later code indexes the result as
# mse_distributions[lambda][model] and treats each entry as a collection of
# MSE values.
mse_distributions = loo_cross_validation(
    subset_data,
    lambdas,
    rank=15,
)

  0%|          | 0/7 [00:00<?, ?it/s]

Normalizing data...




100%|██████████| 6/6 [00:00<00:00, 13.36it/s]
100%|██████████| 1/1 [00:00<00:00, 23.29it/s]


Data normalization completed.
Pooling data...


100%|██████████| 6/6 [00:00<00:00, 167772.16it/s]
100%|██████████| 6/6 [00:00<00:00, 154391.56it/s]


Data pooled.
Performing leave-one-out cross-validation for model: GISS-E2-2-G
Fitting OLS...
RRR completed.
Fitting OLS...


In [None]:
# Plot the MSE distributions for each model:
# one figure per sampled model, with one semi-transparent histogram per
# lambda overlaid on the same axes so the distributions can be compared.
for model in models:
    fig, ax = plt.subplots(figsize=(10, 6))
    for lambda_ in lambdas:
        mse_by_model = mse_distributions[lambda_]
        if model not in mse_by_model:
            # No results recorded for this model at this lambda: nothing to draw
            continue
        ax.hist(mse_by_model[model], bins=20, alpha=0.5, label=f'Lambda: {lambda_}')
    ax.set_xlabel('MSE')
    ax.set_ylabel('Frequency')
    ax.set_title(f'MSE Distribution for Model: {model}')
    ax.legend()
    plt.show()

# Calculate the best lambda overall:
# for each lambda, pool every MSE value across all models and keep the lambda
# whose pooled mean is lowest. Note that models contributing more MSE values
# weigh more heavily in the pooled mean.
best_lambda = None
best_mse = float('inf')
for lambda_, model_mse in mse_distributions.items():
    all_mse_values = [mse for mse_list in model_mse.values() for mse in mse_list]
    if not all_mse_values:
        # np.mean of an empty list returns nan and emits a RuntimeWarning;
        # a lambda without any results cannot be ranked, so skip it.
        continue
    mean_mse = np.mean(all_mse_values)
    if mean_mse < best_mse:
        best_mse = mean_mse
        best_lambda = lambda_

if best_lambda is None:
    # Nothing was selected (no MSE values at all, or no mean below infinity):
    # say so instead of reporting "None" with an infinite MSE.
    print("Best lambda overall: could not be determined (no usable MSE values)")
else:
    print(f"Best lambda overall: {best_lambda}, MSE: {best_mse:.4f}")