In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from torch.distributions import Dirichlet, Bernoulli, Uniform
import pandas as pd
from tqdm import tqdm as tm

from src import Simulation as sim
from src import Dir_Reg
from src import Align
from src import visualize_latent_space as vls
from src import ABC_Reg

# Choose the compute device. Apple MPS wins when present, then CUDA, otherwise CPU.
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

print(device)

cpu


Generate data sets that illustrate how the parameters of the model influence the behavior of the model. 
<br>
Settings:
<br>
Length of Time: 20 or 200
<br>
Embedding Dimension: 2
<br>
Number of Nodes: 1200
<br>
Parameters:  (1, 1, 5, 5), (1, 1, 2, 5), (1, 1, -2, 5), (1, 1, -5, 5)
<br>
Initial Distribution: Dir(1, 1, 1)


In [None]:
# Seed the global torch RNG once, before any model is built.
torch.manual_seed(4)

T = 20
n = 300
alpha_0 = [[1, 1, 1], [1, 1, 1], [1, 1, 1]]


def _simulate(time, beta):
    """Build a sim.ABC model that shares the node count `n` and `alpha_0` defined above."""
    return sim.ABC(time=time, nodes=n, beta=beta, alpha_0=alpha_0)


# Construction order (pos_2, pos_1, neg_2, neg_4) is kept as-is: if sim.ABC draws from
# the global RNG (presumably it does -- verify), reordering would change the samples.
model_pos_2 = _simulate(T, [1, 1, 2, 5])
model_pos_1 = _simulate(T * 10, [1, 1, 1, 5])
model_neg_2 = _simulate(T * 10, [1, 1, -2, 5])
model_neg_4 = _simulate(T, [1, 1, -4, 5])

In [None]:
# Destination CSV -> simulated model whose latent positions are written there.
lat_pos_exports = {
    r"simulated_data/time_vs_lat_pos/pos_2_sample.csv": model_pos_2,
    r"simulated_data/time_vs_lat_pos/pos_1_sample.csv": model_pos_1,
    r"simulated_data/time_vs_lat_pos/neg_2_sample.csv": model_neg_2,
    r"simulated_data/time_vs_lat_pos/neg_4_sample.csv": model_neg_4,
}

for csv_path, simulated in lat_pos_exports.items():
    lat_pos_table = sim.ABC_Monte_Carlo.lat_pos(simulated.synth_data["lat_pos"], 3)
    lat_pos_table.to_csv(csv_path)

Generate data sets that illustrate:
1. what the latent positions recovered from ASE alone look like, and what they look like after oracle alignment and after RGD alignment.

In [None]:
# Settings for the latent-position recovery example.
p = 2
T = 2
beta = [1, 1, -4, 5]
alpha_0 = [[10, 1, 1], [1, 10, 1], [1, 1, 10]]

model = sim.ABC(time=T, nodes=3, beta=beta, alpha_0=alpha_0)

# The seed is deliberately set after the model is built and right before the
# latent-position check, exactly as in the original cell.
torch.manual_seed(0)
lat_pos = sim.ABC_Monte_Carlo.check_lat_pos(model, 4500)

In [None]:
# Attribute of `lat_pos` -> CSV file that receives it, written in this order.
estimate_exports = [
    ("truth", r"simulated_data/est_lat_pos/example_1/estimate_lat_pos/tru_lat_pos.csv"),
    ("ASE", r"simulated_data/est_lat_pos/example_1/estimate_lat_pos/ASE_lat_pos.csv"),
    ("ASE_aligned", r"simulated_data/est_lat_pos/example_1/estimate_lat_pos/ASE_aligned_lat_pos.csv"),
    ("RGD_aligned", r"simulated_data/est_lat_pos/example_1/estimate_lat_pos/RGD_aligned_lat_pos.csv"),
]

for attr_name, csv_path in estimate_exports:
    positions = getattr(lat_pos, attr_name)
    sim.ABC_Monte_Carlo.lat_pos(positions, 3).to_csv(csv_path)

2. how the latent positions recovered with an oracle-given orthogonal transformation converge to the true latent positions in the $\|\cdot\|_{2 \to \infty}$ norm.
3. how the latent positions recovered from RGD perform.

In [None]:
def norm_2_infty(X):
    """Two-to-infinity norm: the largest Euclidean row norm of each matrix in X.

    Parameters
    ----------
    X : torch.Tensor
        Either a single matrix of shape (rows, cols) or a stack of matrices of
        shape (batch, rows, cols).

    Returns
    -------
    torch.Tensor
        The maximum over rows of the row-wise 2-norm. A 0-d tensor for a single
        matrix, shape (batch,) for a stack.
    """
    # Reduce over the trailing (column) axis first; the row axis then becomes the
    # last axis, so both reductions use dim=-1. For 3-D input this is the same as
    # summing over dim=2 and taking the max over dim=1.
    row_norms = torch.sqrt((X ** 2).sum(dim=-1))
    return row_norms.max(dim=-1).values

def mat_sqrt(X, dim):
    """Return U @ diag(sqrt(S)) from a rank-`dim` truncated SVD of X.

    The factorisation comes from torch.svd_lowrank, which is randomized, so the
    result depends on the current torch RNG state.

    Parameters
    ----------
    X : torch.Tensor
        Matrix to factor.
    dim : int
        Number of singular triplets requested from torch.svd_lowrank.

    Returns
    -------
    torch.Tensor
        Left singular vectors with each column scaled by the square root of its
        singular value.
    """
    left_vecs, sing_vals, _ = torch.svd_lowrank(X, dim)
    # diag(sqrt(s)) is the same matrix as sqrt(diag(s)): off-diagonal zeros stay zero.
    return left_vecs @ torch.diag(torch.sqrt(sing_vals))

def performance(dist, n_set, N_iter):
    """Average per-replicate errors over Monte Carlo iterations, one row per node count.

    Parameters
    ----------
    dist : list of torch.Tensor
        One length-2 tensor per replicate (error at the two time points). Entries
        are ordered node-count-major: the first N_iter entries belong to n_set[0],
        the next N_iter to n_set[1], and so on.
    n_set : torch.Tensor or sequence of int
        Node counts that were simulated.
    N_iter : int
        Number of replicates per node count.

    Returns
    -------
    pandas.DataFrame
        Columns "iter", "nodes", "error_T0", "error_T1"; one row per node count.
    """
    n_set = torch.as_tensor(n_set)
    n_sizes = len(n_set)

    # (n_sizes, N_iter, 2) -> mean over the replicate axis.
    core = torch.cat(dist).reshape(n_sizes, N_iter, 2).mean(dim=1)

    # Align dtype/device explicitly instead of relying on torch.cat promoting the
    # integer node counts; the resulting dtype is the same as implicit promotion.
    nodes = n_set.reshape(-1, 1).to(dtype=core.dtype, device=core.device)
    # Named n_iter_col rather than `iter` so the builtin is not shadowed.
    n_iter_col = torch.full((n_sizes, 1), float(N_iter), dtype=core.dtype, device=core.device)

    full = torch.cat([n_iter_col, nodes, core], dim=1)
    # Hand pandas a plain ndarray so the frame holds numeric columns on every
    # pandas version rather than depending on how it coerces a torch.Tensor.
    return pd.DataFrame(full.detach().cpu().numpy(),
                        columns=["iter", "nodes", "error_T0", "error_T1"])

In [None]:
n_set = torch.arange(1500, 12001, 750)
N_iter = 100
# NOTE(review): ortho_trans is not used anywhere in this cell; kept in case a later cell reads it.
ortho_trans = torch.tensor([[0,1.], [1, 0]])

ASE_dist = []
RGD_dist = []
for n in n_set:
    for i in tm(range(N_iter), desc = str(n)):
        # The replicate index doubles as the seed, so every node count reuses seeds 0..N_iter-1.
        torch.manual_seed(i)
        lat_pos = sim.ABC_Monte_Carlo.check_lat_pos(model, n)

        # Oracle-aligned ASE: compare directly with the truth in the 2->infinity norm.
        ASE_two_infty = norm_2_infty(lat_pos.truth - lat_pos.ASE_aligned)
        ASE_dist.append(ASE_two_infty)

        # RGD-aligned estimate: compare the Gram matrices X X^T at each of the two
        # time points, then take the 2->infinity norm of a rank-4 factor of the gap.
        # The time index is `t` (not `i`) so the seed index of the enclosing loop
        # is not overwritten.
        gram_gap_roots = []
        for t in range(2):
            truth_t = lat_pos.truth[t,]
            rgd_t = lat_pos.RGD_aligned[t,]

            gram_gap = truth_t @ truth_t.T - rgd_t @ rgd_t.T
            gram_gap_roots.append(mat_sqrt(gram_gap, 4))

        # stack adds the leading time axis that norm_2_infty reduces per matrix.
        RGD_two_infty = norm_2_infty(torch.stack(gram_gap_roots, dim=0))
        RGD_dist.append(RGD_two_infty)

Oracle_performace = performance(ASE_dist, n_set, N_iter)
RGD_performance = performance(RGD_dist, n_set, N_iter)


In [None]:
# Write each convergence summary to its own CSV.
convergence_exports = (
    (Oracle_performace, r"simulated_data/est_lat_pos/example_1/convergence/oracle_performance.csv"),
    (RGD_performance, r"simulated_data/est_lat_pos/example_1/convergence/RGD_performance.csv"),
)

for summary, csv_path in convergence_exports:
    summary.to_csv(csv_path)

Below we generate the synthetic data set that shows how the latent position in ABCDPRGM evolves through time under different settings.

Let $\widehat{B} \in \mathbb{R}^{q \times p}$ be the MLE that corresponds to the design matrix $X \otimes I_p$, and $\tilde{\beta} = (C^T C)^{-1} C^T \widehat{B}$. Let $\widehat{\beta}$ be the MLE that corresponds to the design matrix $(X \otimes I_p)C$.

We first do Monte Carlo simulations to verify the asymptotic behavior of $\widehat{B}$ and $\tilde{\beta}$. 

In [6]:
""" no_oracle option is for the RGD stuff, which is still somewhat problematic. It shouldn't be turned on. """
N = 1
n_set = torch.arange(1500, 3001, 750)
beta = [1, 1, -4, 5]
alpha_0 = [[10, 1, 1], [1, 10, 1], [1, 1, 10]]

# NOTE(review): the note at the top of this cell says no_oracle should stay off,
# yet NO is True here -- confirm which is intended.
OL = OA = NO = True
ntypes = OL + OA + NO  # number of enabled estimation methods (booleans sum as ints)

# Everything the two runs have in common; they differ only in `oracle_guess`.
shared_settings = dict(
    number_of_iterations=N,
    nodes_set=n_set,
    beta=beta,
    alpha_0=alpha_0,
    seeded=True,
    constrained=False,
    oracle_lat_pos=OL,
    oracle_align=OA,
    no_oracle=NO,
)

oracle_guess_results = sim.ABC_Monte_Carlo.consistency_T2(oracle_guess=True, **shared_settings)
shiwen_guess_results = sim.ABC_Monte_Carlo.consistency_T2(oracle_guess=False, **shared_settings)

tensor(1500):   0%|          | 0/1 [00:00<?, ?it/s]

tensor(1500): 100%|██████████| 1/1 [00:01<00:00,  1.56s/it]
tensor(2250): 100%|██████████| 1/1 [00:03<00:00,  3.76s/it]
tensor(3000): 100%|██████████| 1/1 [00:04<00:00,  4.41s/it]
tensor(1500): 100%|██████████| 1/1 [00:03<00:00,  3.48s/it]
tensor(2250): 100%|██████████| 1/1 [00:09<00:00,  9.97s/it]
tensor(3000): 100%|██████████| 1/1 [00:17<00:00, 17.93s/it]


In [12]:
# Difference between the B_est columns of the oracle_guess=True and oracle_guess=False runs.
# NOTE(review): the recorded output of this cell is a single scalar, which an elementwise
# column subtraction would not display -- the cell was presumably edited after it was last
# run; re-run to confirm.
oracle_guess_results.MC_result.est["B_est"] - shiwen_guess_results.MC_result.est["B_est"]

0.00035075375

In [5]:
# NOTE(review): `temp` is not assigned anywhere in the visible notebook, so this cell raises
# NameError on a fresh kernel. It presumably held a consistency_T2 result -- confirm which one.
temp.MC_result.est

Unnamed: 0,seed,nodes,constrained,method_OL,method_OA,method_NO,component,B_est,B_real,info_lost,number_of_iterations,max_iterations,time_elapsed
0,0,1500,0,1,0,0,0,0.564183,1.0,0.000000,2,200,0.324608
1,0,1500,0,1,0,0,1,-0.347619,0.0,0.000000,2,200,0.324608
2,0,1500,0,1,0,0,2,-1.478894,-1.0,0.000000,2,200,0.324608
3,0,1500,0,1,0,0,3,-0.176844,0.0,0.000000,2,200,0.324608
4,0,1500,0,1,0,0,4,0.870015,1.0,0.000000,2,200,0.324608
...,...,...,...,...,...,...,...,...,...,...,...,...,...
121,0,2250,0,0,0,1,16,-3.984977,-4.0,0.050222,6,200,1.235495
122,0,2250,0,0,0,1,17,4.317436,4.0,0.050222,6,200,1.235495
123,0,2250,0,0,0,1,18,4.220619,5.0,0.050222,6,200,1.235495
124,0,2250,0,0,0,1,19,4.358335,5.0,0.050222,6,200,1.235495


In [3]:
# NOTE(review): same expression as an earlier cell but with a different recorded output, so
# `temp` was rebound between runs; `temp` is not assigned in the visible notebook -- TODO define it.
temp.MC_result.est

Unnamed: 0,seed,nodes,constrained,method_OL,method_OA,method_NO,component,B_est,B_real,info_lost,number_of_iterations,max_iterations,time_elapsed
0,0,1500,0,1,0,0,0,0.558653,1.0,0.000000,19,200,1.285633
1,0,1500,0,1,0,0,1,-0.352977,0.0,0.000000,19,200,1.285633
2,0,1500,0,1,0,0,2,-1.484647,-1.0,0.000000,19,200,1.285633
3,0,1500,0,1,0,0,3,-0.180571,0.0,0.000000,19,200,1.285633
4,0,1500,0,1,0,0,4,0.866135,1.0,0.000000,19,200,1.285633
...,...,...,...,...,...,...,...,...,...,...,...,...,...
121,0,2250,0,0,0,1,16,-3.985342,-4.0,0.050222,18,200,3.182860
122,0,2250,0,0,0,1,17,4.316707,4.0,0.050222,18,200,3.182860
123,0,2250,0,0,0,1,18,4.220976,5.0,0.050222,18,200,3.182860
124,0,2250,0,0,0,1,19,4.358725,5.0,0.050222,18,200,3.182860


In [5]:
# Distinct values of the time_elapsed column in the estimate table.
# NOTE(review): `temp` is not assigned in the visible notebook -- TODO define it before this cell.
temp.MC_result.est["time_elapsed"].unique()

array([1.298176 , 1.056035 , 1.015018 , 3.481331 , 3.135339 , 3.0992348],
      dtype=float32)

In [None]:
# NOTE(review): `temp` is not assigned in the visible notebook; this display cell fails with
# NameError under Restart & Run All until it is defined.
temp.MC_result.est

In [None]:
# Persist the Monte Carlo estimates (`est`) and the `fish` table to CSV.
# NOTE(review): `temp` is not assigned in the visible notebook, so which run ends up in
# B_oracle.csv / B_fish.csv cannot be told from here -- confirm the source before re-exporting.
temp.MC_result.est.to_csv(r"simulated_data/emp_var_vs_obs_var/B_oracle.csv")
temp.MC_result.fish.to_csv(r"simulated_data/emp_var_vs_obs_var/B_fish.csv")

In [None]:
# Build a small model only to read the matrix C from its settings.
model = sim.ABC(time = 2,
            nodes = 3,
            beta = [1,1,-4, 5],
            alpha_0 = [[10, 1, 1], [1, 10, 1], [1, 1, 10]])
C = model.settings.C

# NOTE(review): `temp` is not assigned anywhere in the visible notebook -- it must be bound
# to a consistency_T2 result before this cell can run; confirm which run is intended.
# Reshape to one column per fit, with 21 estimated components per fit.
res = torch.tensor(temp.MC_result.est["B_est"]).reshape(N*ntypes* len(n_set), 21).T

# Solve (C^T C) x = C^T B_hat for every fit, i.e. the projection (C^T C)^{-1} C^T B_hat
# described in the text above. Convert to a plain ndarray so pandas builds numeric columns.
beta_tilde = torch.linalg.solve(C.T @ C, C.T @ res).T
df = pd.DataFrame(beta_tilde.detach().cpu().numpy())
df.columns = ["dim_" + str(i) for i in range(4)]

# Each seaborn call gets its own Axes. Without `ax=`, both draw on the current
# Axes and the scatter plot ends up overlaid on the histogram.
fig, (ax_hist, ax_info) = plt.subplots(1, 2, figsize=(12, 4))
sns.histplot(df["dim_3"], ax=ax_hist)
ax_hist.set_title("dim_3 estimates")
sns.scatterplot(data = temp.MC_result.est, x = "nodes", y = "info_lost", ax=ax_info)
ax_info.set_title("info_lost vs nodes")
fig.tight_layout()