In [None]:
import sys
sys.path.append('/Users/jiangxiaoyu/Desktop/All Projects/GPLVM_project_code/')
import numpy as np
import random
import torch
from gpytorch.kernels import ScaleKernel, RBFKernel
from linear_operator.operators import KroneckerProductLinearOperator
from torch import Tensor
from torch.distributions import MultivariateNormal
from models_.lvmogp_svi import LVMOGP_SVI
from models_.gaussian_likelihood import GaussianLikelihood
from models_.variational_elbo import VariationalELBO
from tqdm import trange
from torch.optim.lr_scheduler import StepLR
from util_functions import *
import yaml
import time
# Explicit imports for names used further down the notebook. They are placed after the
# wildcard import (and the original order above is kept) so name resolution is unchanged
# and these modules cannot be shadowed by `util_functions`.
import json
import matplotlib.pyplot as plt
import pandas as pd

## Experiment random seed

In [None]:
# Master seed of the experiment: the data-loading cell passes it to both
# `np.random.seed` and `torch.manual_seed`, and the per-output split seeds are drawn from NumPy afterwards.
expri_random_seed =  12  # 13, 78, 912, 73, 269

## Load in data

In [None]:
# Double-check these two values against the data folder name: they are interpolated into
# `synth_data_path` below (`ninputs_{w_n_C_total}_nlatents_{w_n_outputs}`).
w_n_C_total = 50 # number of input points C available for every output
w_n_outputs = 5000 # 100, 300, 500, 1000, 2500(20), 1500, 2000

synth_data_path = f'/Users/jiangxiaoyu/Desktop/All Projects/GPLVM_project_code/data/synth_regression/smartly_generated/ninputs_{w_n_C_total}_nlatents_{w_n_outputs}'
w_C_total = Tensor(pd.read_csv(f'{synth_data_path}/inputs.csv').to_numpy()).reshape(-1)
w_X_true = Tensor(pd.read_csv(f'{synth_data_path}/latents.csv').to_numpy()).reshape(-1, 2)
w_sample_total_data = Tensor(pd.read_csv(f'{synth_data_path}/target_data.csv').to_numpy()).reshape(-1)

w_n_C_train = 25 # the number of training data points per output
w_n_C_test = w_n_C_total - w_n_C_train

np.random.seed(expri_random_seed)
torch.manual_seed(expri_random_seed)
# One seed per output, so every output gets its own reproducible train/test split.
list_expri_random_seeds = np.random.randn(w_n_outputs)

# Unlike the shared-input setting, C_train and C_test differ from output to output:
# entry i of each list holds the input indices (into `w_C_total`) used by output i.
w_ls_of_ls_train_C = []
w_ls_of_ls_test_C = []

# Per-output chunks of flat indices into `w_sample_total_data`; concatenated ONCE after the
# loop. (Growing an array with np.concatenate on every iteration is quadratic and, because
# it starts from an empty list, silently yields float64 "indices".)
train_index_chunks, test_index_chunks = [], []

for i in range(w_n_outputs):
    # iterate across different output functions
    random.seed(list_expri_random_seeds[i])
    train_index = random.sample(range(w_n_C_total), w_n_C_train)
    train_index_set = set(train_index)  # O(1) membership test for the complement below
    test_index = [index for index in range(w_n_C_total) if index not in train_index_set]
    w_ls_of_ls_train_C.append(train_index)
    w_ls_of_ls_test_C.append(test_index)

    # Output i occupies positions [w_n_C_total*i, w_n_C_total*(i+1)) of the flat target vector.
    flat_offset = w_n_C_total * i
    train_index_chunks.append(np.asarray(train_index, dtype=np.int64) + flat_offset)
    test_index_chunks.append(np.asarray(test_index, dtype=np.int64) + flat_offset)

# np.concatenate rejects an empty list, so fall back to an empty integer array when there are no outputs.
w_sample_train_index = np.concatenate(train_index_chunks) if train_index_chunks else np.empty(0, dtype=np.int64)
w_sample_test_index = np.concatenate(test_index_chunks) if test_index_chunks else np.empty(0, dtype=np.int64)

w_sample_train_data = w_sample_total_data[w_sample_train_index]
w_sample_test_data = w_sample_total_data[w_sample_test_index]

assert w_sample_train_data.shape[0] == w_n_C_train * w_n_outputs
assert w_sample_test_data.shape[0] == w_n_C_test * w_n_outputs

## Define model and Training

In [None]:
def only_train_variational_params_fix_others(true_hyperparams, my_model, my_likelihood):
    """Overwrite the kernel/likelihood hyper-parameters with given values and freeze them.

    After this call only the remaining (e.g. variational) parameters still receive gradients.

    Args:
        true_hyperparams: mapping with the keys 'X_raw_outputscale', 'C_raw_outputscale',
            'X_raw_lengthscale', 'C_raw_lengthscale' and 'likelihood_noise'.
        my_model: object exposing `covar_module_latent` and `covar_module_input`, each with
            `raw_outputscale` and `base_kernel.raw_lengthscale`.
        my_likelihood: object exposing `noise` (settable) and `raw_noise`.

    Returns:
        None; `my_model` and `my_likelihood` are modified in place.
    """
    latent_kernel = my_model.covar_module_latent
    input_kernel = my_model.covar_module_input

    # (parameter, key in `true_hyperparams`, wrap the value in an extra list dimension?)
    kernel_targets = (
        (latent_kernel.raw_outputscale, 'X_raw_outputscale', False),
        (input_kernel.raw_outputscale, 'C_raw_outputscale', False),
        (latent_kernel.base_kernel.raw_lengthscale, 'X_raw_lengthscale', True),
        (input_kernel.base_kernel.raw_lengthscale, 'C_raw_lengthscale', True),
    )

    # Step 1: write the provided raw values straight into the parameters' storage.
    for parameter, key, add_leading_dim in kernel_targets:
        raw_value = true_hyperparams[key]
        parameter.data = torch.tensor([raw_value] if add_leading_dim else raw_value)
    # The noise is assigned through the `noise` attribute itself, NOT through `.data`.
    my_likelihood.noise = torch.tensor(true_hyperparams['likelihood_noise'])

    # Step 2: exclude all of these hyper-parameters from gradient updates.
    for parameter, _, _ in kernel_targets:
        parameter.requires_grad = False
    my_likelihood.raw_noise.requires_grad = False

### Implementation 1

In [None]:
# define hyper-parameters
w_n_X = w_X_true.shape[0]  # number of latent variables = number of rows of the true latents
w_n_C = len(w_ls_of_ls_train_C[0])  # training inputs per output, read off the first output's split
w_n_total = w_n_X * w_n_C  # passed to the ELBO below as `num_data`
w_index_dim = 1
w_latent_dim = 2
w_n_inducing_C = 30
w_n_inducing_X = 30
w_pca = False
learn_inducing_locations_X= True # True
learn_inducing_locations_C = True

Y_train = w_sample_train_data

# specify model (training targets are handed over reshaped to one row per latent/output)
w_my_model = LVMOGP_SVI(w_n_X, w_n_C, w_index_dim, w_latent_dim, w_n_inducing_C, w_n_inducing_X, Y_train.reshape(w_n_X, -1), pca=w_pca, learn_inducing_locations_latent=learn_inducing_locations_X, learn_inducing_locations_input=learn_inducing_locations_C)

# Likelihood & training objective
w_likelihood = GaussianLikelihood()
w_mll = VariationalELBO(w_likelihood, w_my_model, num_data=w_n_total)

# Hyper-parameters stored alongside the synthetic data; the noise level is added by hand.
import json
with open(f'{synth_data_path}/dictionary.json', 'r') as file:
    true_hyperparams = json.load(file)
true_hyperparams['likelihood_noise'] = 0.05

# Optional: fix all hyper-parameters at the values above and train the remaining parameters only.
# only_train_variational_params_fix_others(true_hyperparams=true_hyperparams, my_model=w_my_model, my_likelihood=w_likelihood)

# optimizer and scheduler
w_optimizer = torch.optim.Adam([
    {'params': w_my_model.parameters()},
    {'params': w_likelihood.parameters()}
], lr=0.1)

w_scheduler = StepLR(w_optimizer, step_size=20, gamma=0.95)  # every 20 scheduler steps, the learning rate is multiplied by 0.95

# Initialize inducing points in C space: evenly spaced on [-10, 10], one column
w_my_model.variational_strategy.inducing_points_input.data = Tensor(np.linspace(-10, 10, w_n_inducing_C).reshape(-1, 1))
# Another initialization: random initialization
# i.e. torch.rand(w_n_inducing_C).reshape(-1,1) * 20 - 10

# Initialize inducing points in latent space
# w_my_model.variational_strategy.inducing_points_X.data = 3 * torch.randn(w_n_inducing_X, w_latent_dim)

# Initialize likelihood noise as true value, 0.05
# NOTE(review): -2.973 is presumably the raw (unconstrained) value that maps to a noise of
# about 0.05 under the likelihood's constraint — TODO confirm against GaussianLikelihood.
w_likelihood.raw_noise.data = Tensor([-2.973])
# w_likelihood.raw_noise.requires_grad = False

# start training!
w_loss_list = []
n_iterations = 1000 # 5000 # 10000
iterator = trange(n_iterations, leave=True)
batch_size_X = 50 # mini-batch for latents
batch_size_C = 20 # mini-batch for inputs, one can set w_n_C_train
num_X_MC = 5 # the number of MC samples used to approximate E_{q(X)}
w_model_max_grad_norm = 1
w_likeli_max_grad_norm = 0.1

# Debug aid — uncomment to list the names of all trainable parameters:
# for name, params in w_my_model.named_parameters():
#     print(name)
# for name, params in w_likelihood.named_parameters():
#     print(name)

w_my_model.train()
w_likelihood.train()
start_time = time.time()
for i in iterator:
    batch_index_X, batch_index_C = sample_index_X_and_C_from_list(w_ls_of_ls_train_C, batch_size_X=batch_size_X, batch_size_C=batch_size_C)

    # Everything below depends only on the sampled mini-batch indices, not on the Monte-Carlo
    # draw of X, so it is computed once per iteration instead of once per MC sample.
    # NOTE(review): assumes `inhomogeneous_index_of_batch_Y` is a deterministic function of its arguments.
    sample_batch_C = w_C_total[batch_index_C]
    batch_index_Y = inhomogeneous_index_of_batch_Y(batch_index_X, batch_index_C, w_n_X, w_n_C_total)
    batch_target_Y = w_sample_total_data[batch_index_Y]

    # core code is here
    w_optimizer.zero_grad()

    loss_value = 0.0
    for _ in range(num_X_MC):
        sample_batch_X = w_my_model.sample_latent_variable(batch_index_X)
        output_batch = w_my_model(sample_batch_X, sample_batch_C) # q(f)
        loss = -w_mll(output_batch, batch_target_Y).sum()
        loss_value += loss.item()
        # NOTE(review): gradients are accumulated (summed) over the MC samples while the logged
        # loss below is their average; left unchanged to keep the training dynamics as they were.
        loss.backward()

    loss_value /= num_X_MC

    w_loss_list.append(loss_value)
    iterator.set_description(f'Loss: {float(np.round(loss_value, 3))}, iter no: {i}')

    # Clip gradients (model and likelihood parameters use separate thresholds)
    torch.nn.utils.clip_grad_norm_(w_my_model.parameters(), w_model_max_grad_norm)
    torch.nn.utils.clip_grad_norm_(w_likelihood.parameters(), w_likeli_max_grad_norm)

    w_optimizer.step()
    w_scheduler.step()

end_time = time.time()
print('Total Training Time:',  end_time - start_time)

In [None]:
# Drop abnormal loss values (they show up when a Cholesky factorisation hits a non-PSD matrix):
# only entries strictly below 3 are kept.
loss_array = np.array(w_loss_list)
keep_mask = loss_array < 3
w_loss_list = list(loss_array[keep_mask])

In [None]:
# Plot the recorded training loss and save the figure with the other experiment results.
# (`plt` is imported in the notebook's import cell.)
fig, ax = plt.subplots()
ax.plot(w_loss_list)
# The list may already have had outliers removed, so the x position is the index in the
# retained list rather than necessarily the raw iteration number.
ax.set_xlabel('Recorded iteration')
ax.set_ylabel('Loss (negative ELBO)')
ax.set_title(f'Training loss ({w_n_outputs} outputs)')
train_loss_path = f'/Users/jiangxiaoyu/Desktop/All Projects/GPLVM_project_code/experi_results/syn_data_training_loss_numofoutput_{w_n_outputs}.png'
fig.savefig(train_loss_path)

## Testing

In [None]:
# prediction output for grid (total) inputs.
w_my_model.eval()
w_likelihood.eval()

# Flat (output, input) index pairs in output-major order: position k belongs to output
# all_index_X[k] evaluated at input all_index_C[k] — the same layout as `w_sample_total_data`.
all_index_X = np.repeat(np.arange(w_n_outputs), w_n_C_total).tolist()
all_index_C = list(range(w_n_C_total)) * w_n_outputs
len_X = len(all_index_X)
assert len_X == len(all_index_C)
# Variational means of the latents are used as point estimates at prediction time.
all_mean_X = w_my_model.X.q_mu

test_mini_batch_size = 1000

all_pred_mean = torch.zeros(len_X)
all_pred_var = torch.zeros(len_X)

# Pure inference: no autograd graph is needed, which saves memory and time on every batch.
with torch.no_grad():
    for test_start_idx in range(0, len_X, test_mini_batch_size):
        test_end_idx = min(test_start_idx + test_mini_batch_size, len_X)  # last batch may be shorter
        batch_X = all_mean_X[all_index_X[test_start_idx:test_end_idx]]
        batch_C = w_C_total[all_index_C[test_start_idx:test_end_idx]]
        batch_output = w_likelihood(w_my_model(batch_X, batch_C))
        all_pred_mean[test_start_idx:test_end_idx] = batch_output.loc
        all_pred_var[test_start_idx:test_end_idx] = batch_output.variance

In [None]:
# finer grid for better visualization ... nothing to do with RMSE computation ...

n_data4visual = 500
w_C_total4visual = Tensor(np.linspace(-10, 10, n_data4visual))
# Same output-major layout as the evaluation grid, but with n_data4visual inputs per output.
all_index_X4visual = np.repeat(np.arange(w_n_outputs), n_data4visual).tolist()
all_index_C4visual = list(range(n_data4visual)) * w_n_outputs

len_X4visual = len(all_index_X4visual)
assert len_X4visual == len(all_index_C4visual)

test_mini_batch_size = 1000

all_pred_mean4visual = torch.zeros(len_X4visual)
all_pred_var4visual = torch.zeros(len_X4visual)

# Pure inference: no autograd graph is needed, which saves memory and time on every batch.
with torch.no_grad():
    for test_start_idx in range(0, len_X4visual, test_mini_batch_size):
        test_end_idx = min(test_start_idx + test_mini_batch_size, len_X4visual)  # last batch may be shorter
        batch_X = all_mean_X[all_index_X4visual[test_start_idx:test_end_idx]]
        batch_C = w_C_total4visual[all_index_C4visual[test_start_idx:test_end_idx]]
        batch_output = w_likelihood(w_my_model(batch_X, batch_C))
        all_pred_mean4visual[test_start_idx:test_end_idx] = batch_output.loc
        all_pred_var4visual[test_start_idx:test_end_idx] = batch_output.variance

## Train/Test data RMSE

In [None]:
# Root-mean-squared error pooled over all outputs, for the training and the held-out points.
w_train_data_predict = all_pred_mean[w_sample_train_index]
train_residual = w_train_data_predict - w_sample_train_data
train_rmse = train_residual.pow(2).mean().sqrt()
print('Global Train RMSE', train_rmse)

w_test_data_predict = all_pred_mean[w_sample_test_index]
test_residual = w_test_data_predict - w_sample_test_data
test_rmse = test_residual.pow(2).mean().sqrt()
print('Global Test RMSE', test_rmse)

## Train/Test data NLL

In [None]:
# Negative log-likelihood of the targets under the predictive mean/variance, pooled over all outputs.
train_pred_mean = all_pred_mean[w_sample_train_index]
train_pred_var = all_pred_var[w_sample_train_index]
train_nll = neg_log_likelihood(Target=w_sample_train_data, GaussianMean=train_pred_mean, GaussianVar=train_pred_var)

test_pred_mean = all_pred_mean[w_sample_test_index]
test_pred_var = all_pred_var[w_sample_test_index]
test_nll = neg_log_likelihood(Target=w_sample_test_data, GaussianMean=test_pred_mean, GaussianVar=test_pred_var)

print('Global Train negative log likelihood:', train_nll)
print('Global Test negative log likelihood', test_nll)

## Visual inspection

In [None]:
def evaluate_on_simgle_output(w_function_index):
    """Gather data, predictions and error metrics for a single output function.

    Reads the notebook-level split lists, targets and prediction tensors; nothing is modified.

    Args:
        w_function_index: index of the output function to evaluate.

    Returns:
        A 7-tuple ``(w_train_input, w_train_target, w_test_input, w_test_target,
        w_gp_pred_mean, w_gp_pred_std, performance_dirct)`` where the first four entries are
        this output's training/test inputs and targets, the next two are the predictive mean
        and standard deviation on the fine visualisation grid, and ``performance_dirct`` maps
        'train_rmse', 'train_nll', 'test_rmse' and 'test_nll' to the respective metric.
    """
    performance_dirct = {}

    # ---- training points of this output ------------------------------------------------
    w_train_input = w_C_total[w_ls_of_ls_train_C[w_function_index]]
    # Position inside the concatenated training vectors: sum the lengths of all earlier
    # outputs instead of assuming every output has the same number of inputs.
    w_train_start = sum(len(indices) for indices in w_ls_of_ls_train_C[:w_function_index])
    w_train_end = w_train_start + len(w_ls_of_ls_train_C[w_function_index])
    w_train_target = w_sample_train_data[w_train_start:w_train_end]
    w_train_predict = w_train_data_predict[w_train_start:w_train_end]
    train_rmse_ = (w_train_target - w_train_predict).square().mean().sqrt()
    # Slice the flat index first so only this output's entries are gathered, rather than
    # re-gathering the full prediction vectors on every call.
    train_flat_index = w_sample_train_index[w_train_start:w_train_end]
    train_nll_ = neg_log_likelihood(w_train_target, all_pred_mean[train_flat_index], all_pred_var[train_flat_index])
    performance_dirct['train_rmse'] = train_rmse_
    performance_dirct['train_nll'] = train_nll_

    # ---- held-out points of this output ------------------------------------------------
    w_test_input = w_C_total[w_ls_of_ls_test_C[w_function_index]]
    # Fixed: the offset previously added the length of ONE stale output (leftover index of the
    # training loop) repeatedly, which is wrong as soon as test-set sizes differ across outputs.
    w_test_start = sum(len(indices) for indices in w_ls_of_ls_test_C[:w_function_index])
    w_test_end = w_test_start + len(w_ls_of_ls_test_C[w_function_index])
    w_test_target = w_sample_test_data[w_test_start:w_test_end]
    w_test_predict = w_test_data_predict[w_test_start:w_test_end]
    test_rmse_ = (w_test_predict - w_test_target).square().mean().sqrt()
    test_flat_index = w_sample_test_index[w_test_start:w_test_end]
    test_nll_ = neg_log_likelihood(w_test_target, all_pred_mean[test_flat_index], all_pred_var[test_flat_index])
    performance_dirct['test_rmse'] = test_rmse_
    performance_dirct['test_nll'] = test_nll_

    # ---- predictive curve on the fine visualisation grid ------------------------------
    # (the coarse data grid gives a poor-looking plot, so the 4visual predictions are used)
    w_gp4visual_start = n_data4visual * w_function_index
    w_gp4visual_end = n_data4visual * (w_function_index + 1)
    w_gp_pred_mean = all_pred_mean4visual[w_gp4visual_start:w_gp4visual_end]
    # Variance -> standard deviation, computed on this output's slice only.
    w_gp_pred_std = all_pred_var4visual[w_gp4visual_start:w_gp4visual_end].sqrt()

    return w_train_input, w_train_target, w_test_input, w_test_target, w_gp_pred_mean, w_gp_pred_std, performance_dirct
    

In [None]:
# function_index = 22
# w_train_input, w_train_target, w_test_input, w_test_target, w_gp_pred_mean, w_gp_pred_std, performance_dirct = evaluate_on_simgle_output(function_index)

In [None]:
# print(performance_dirct)

In [None]:
# picture_save_path = f'/Users/jiangxiaoyu/Desktop/All Projects/GPLVM_project_code/experi_results/func_id_{function_index}_numofoutput_{w_n_outputs}.png'
# plot_traindata_testdata_fittedgp(train_X=w_train_input, train_Y=w_train_target, test_X=w_test_input, test_Y=w_test_target, gp_X=w_C_total4visual, gp_pred_mean=w_gp_pred_mean, gp_pred_std=w_gp_pred_std, inducing_points_X=w_my_model.variational_strategy.inducing_points_C.data, n_inducing_C=w_n_inducing_C, picture_save_path=picture_save_path) # NOTE: input is C not X

## Loop over all function indices

In [None]:
# Evaluate every output once and keep only its metrics dictionary (last element of the returned tuple).
per_output_performance = [evaluate_on_simgle_output(output_index)[-1] for output_index in range(w_n_outputs)]

# One list per metric, ordered by output index; entries are whatever the evaluation returned (tensors).
train_rmse_list = [performance['train_rmse'] for performance in per_output_performance]
test_rmse_list = [performance['test_rmse'] for performance in per_output_performance]
train_nll_list = [performance['train_nll'] for performance in per_output_performance]
test_nll_list = [performance['test_nll'] for performance in per_output_performance]

In [None]:
def find_median_index(lst):
    """Return the position in `lst` of its median element.

    For an odd number of elements this is the middle value of the sorted list; for an even
    number it is the lower of the two middle values. When the value occurs several times,
    the index of its first occurrence is returned. An empty list raises IndexError.
    """
    ranked = sorted(lst)
    # (n - 1) // 2 is the middle slot for odd n and the lower-middle slot for even n.
    median_value = ranked[(len(lst) - 1) // 2]
    return lst.index(median_value)

In [None]:
# Report which outputs have the worst (largest), median and best (smallest) held-out error.
# Wording fix: the largest-error output is the WORST one (the label previously read "WORSE").
print('The output index with WORST test rmse performance: ', test_rmse_list.index(max(test_rmse_list)))
print('The output index with WORST test nll performance: ', test_nll_list.index(max(test_nll_list)))
print('------' * 10)
print('The output index with MIDDLE test rmse performance:', find_median_index(test_rmse_list))
print('The output index with MIDDLE test nll performance:', find_median_index(test_nll_list))
print('------' * 10)
print('The output index with BEST test rmse performance: ', test_rmse_list.index(min(test_rmse_list)))
print('The output index with BEST test nll performance: ', test_nll_list.index(min(test_nll_list)))

In [None]:
# Output function to inspect visually; any index in range(w_n_outputs) can be used here
# (e.g. one of the worst/median/best indices printed above).
function_index = 375
# Fetch that output's train/test data, its predictive curve on the fine grid and its metrics.
w_train_input, w_train_target, w_test_input, w_test_target, w_gp_pred_mean, w_gp_pred_std, performance_dirct = evaluate_on_simgle_output(function_index)

In [None]:
# Draw the selected output: training points, held-out points and the fitted GP band,
# and save the picture under a name that encodes the function index and the number of outputs.
picture_save_path = f'/Users/jiangxiaoyu/Desktop/All Projects/GPLVM_project_code/experi_results/func_id_{function_index}_numofoutput_{w_n_outputs}.png'
plot_traindata_testdata_fittedgp(
    train_X=w_train_input,
    train_Y=w_train_target,
    test_X=w_test_input,
    test_Y=w_test_target,
    gp_X=w_C_total4visual,
    gp_pred_mean=w_gp_pred_mean,
    gp_pred_std=w_gp_pred_std,
    # NOTE: despite the keyword name, these are the inducing points of the input space C, not of X
    inducing_points_X=w_my_model.variational_strategy.inducing_points_input.data,
    n_inducing_C=w_n_inducing_C,
    picture_save_path=picture_save_path,
)

## Check hyper-parameters after training

In [None]:
import json

# Re-read the hyper-parameter dictionary stored with the synthetic data, for comparison
# with the fitted values printed in the next cell.
true_kernel_path = f'{synth_data_path}/dictionary.json'
with open(true_kernel_path, 'r') as file:
    true_kernel_data = json.load(file)
print(true_kernel_data)

In [None]:
# Fitted (constrained) hyper-parameters after training, printed in this order:
# latent outputscale, latent lengthscale, input outputscale, input lengthscale, likelihood noise.
fitted_hyperparameters = (
    w_my_model.covar_module_latent.outputscale,
    w_my_model.covar_module_latent.base_kernel.lengthscale,
    w_my_model.covar_module_input.outputscale,
    w_my_model.covar_module_input.base_kernel.lengthscale,
    # w_likelihood.raw_noise,
    w_likelihood.noise,
)
for fitted_value in fitted_hyperparameters:
    print(fitted_value.detach())

## True v.s. Fitted Covariance Matrix

In [None]:
# Sanity check: the fitted latent means must have the same shape as the true latents
# before the two can be compared.
assert w_my_model.X.q_mu.detach().shape == w_X_true.shape

In [None]:
# true_cov_matrix = w_my_model.covar_module_X(w_my_model.X.q_mu.detach()).to_dense()

In [None]:
# fitted_covar_module_X = ScaleKernel(RBFKernel(ard_num_dims=w_X_true.shape[1]))
# fitted_covar_module_X.raw_outputscale.data = Tensor([true_kernel_data['X_raw_outputscale']])
# fitted_covar_module_X.base_kernel.raw_lengthscale.data = Tensor([true_kernel_data['X_raw_lengthscale']])
# fitted_cov_matrix = fitted_covar_module_X(w_X_true).to_dense()

## True v.s. Fitted latent variables

In [None]:
# plot_true_and_fitted_latent(w_X_true, w_my_model.X.q_mu.detach(), torch.nn.functional.softplus(w_my_model.X.q_log_sigma.detach()))

## Save Model

In [None]:
# torch.save(w_my_model.state_dict(), '/Users/jiangxiaoyu/Desktop/All Projects/GPLVM_project_code/experi_results/model_weight.pth')
# torch.save(w_likelihood.state_dict(), '/Users/jiangxiaoyu/Desktop/All Projects/GPLVM_project_code/experi_results/likelihood_weight.pth')  