## Reduced Rank Regression

In [None]:
import os, sys
import pickle as pkl
import numpy as np
import random
import warnings
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler

sys.path.append(os.path.join(os.getcwd(), 'utils'))

from utils.data_loading import *
from utils.data_processing import *
from utils.regression import *
from utils.animation import *
from utils.metrics import *
from utils.pipeline import *

# autoreload
%reload_ext autoreload
%autoreload 2

# Silence the warning categories that clutter the notebook output:
# runtime warnings first, then deprecation notices (same order as before,
# so the resulting filter list is unchanged).
for _category in (RuntimeWarning, DeprecationWarning):
    warnings.filterwarnings("ignore", category=_category)

In [None]:
# Name of the time-series file and the "data" directory (resolved against
# the current working directory) that holds it.
filename = 'ssp585_time_series.pkl'
data_path = os.path.join(os.getcwd(), 'data')

# Load and preprocess the file. The call yields the per-model data and a
# NaN mask; the mask is passed to the animation step at the end of the
# notebook.
# NOTE(review): the exact structure of both return values is defined in
# utils -- confirm there.
data, nan_mask = preprocess_data(data_path, filename)

In [None]:
# Hyperparameter grid for the cross-validation sweep: candidate ranks and
# lambda values.
#
# Larger grid, kept for reference (currently disabled):
#   lambdas = [0.01, 0.1, 1, 10, 50, 100, 200]
#   ranks = [1, 2, 5, 10, 50, 100]
#
# Reduced grid actually used below:
ranks = [2, 10]
lambdas = [1, 100]

In [None]:
# Keep only a small random subset of models for the sake of time.
# The subset size lives in one place so the comment and the code cannot
# drift apart (the previous comment said 8 while 3 models were sampled).
n_models = 3

# Fixed seed so the same models are drawn on every run of this cell.
random.seed(42)
models = random.sample(list(data), n_models)

# Restrict the dataset to the sampled models only.
subset_data = {model: data[model] for model in models}
print(f"Models kept to test the pipeline: {models}")

In [None]:
# Run leave-one-out cross-validation on the model subset for the lambda
# and rank candidates defined above. Two MSE collections come back: one is
# consumed by the per-model plot, the other (keyed by hyperparameter
# combination) by the combination plot and the hyperparameter selection.
# NOTE(review): the exact layout of both results is defined in utils --
# confirm there.
mse_distributions, mse_by_combination = loo_cross_validation(
    subset_data,
    lambdas,
    ranks,
)

In [None]:
# Plot the MSE distribution of every (rank, lambda) combination; the
# 'output' directory is handed to the plotting helper as its output_dir.
plot_mse_distributions(
    mse_by_combination,
    ranks,
    lambdas,
    output_dir='output',
)

In [None]:
# Plot and save the MSE distributions separately for each sampled model,
# again using 'output' as the output directory.
plot_mse_distributions_per_model(
    mse_distributions,
    models,
    ranks,
    lambdas,
    output_dir='output',
)

In [None]:
# Select the most robust (rank, lambda) combination from the
# cross-validation results, together with its MSE score.
# NOTE(review): presumably the 0.7 / 0.3 weights trade off the mean against
# the variance of the MSE -- confirm in select_robust_hyperparameters.
best_rank_lambda, best_mse = select_robust_hyperparameters(
    mse_by_combination,
    mean_weight=0.7,
    variance_weight=0.3,
    output_dir='output',
)

In [None]:
# The selected pair is ordered (rank, lambda): split it into its two parts
# and report the choice along with the associated MSE.
best_rank, best_lambda = best_rank_lambda
print(
    f"Selected best rank: {best_rank}, best lambda: {best_lambda}, "
    f"with mean MSE: {best_mse:.4f}"
)

In [None]:
# Re-run cross-validation on the model subset with the selected rank and
# lambda only; the resulting MSE losses feed the final distribution plot.
final_mse_losses = final_cross_validation(
    subset_data,
    best_rank,
    best_lambda,
)

In [None]:
# Plot the MSE losses obtained with the selected hyperparameters, passing
# 'output' as the output directory.
plot_final_mse_distribution(
    final_mse_losses,
    output_dir='output',
)

In [None]:
# Choose one of the sampled models at random to act as the test model.
# The draw uses the global `random` state, so it depends on the seed set
# when the models were sampled and on any draws made since.
test_model = random.choice(models)

# Generate and save animations for the chosen test model, using the
# selected hyperparameters and the NaN mask produced during preprocessing.
generate_and_save_animations(
    data=subset_data,
    test_model=test_model,
    best_rank=best_rank,
    best_lambda=best_lambda,
    nan_mask=nan_mask,
    num_runs=3,
    output_dir="output",
    color_limits=(-2, 2),
)