In [1]:
import torch
import pandas as pd
from tqdm import tqdm as tm
from src import Simulation as sim
from src import Align
from src import ABC_Reg
from src import Dir_Reg as DR

# Pick the compute device. Apple MPS takes precedence when present,
# otherwise CUDA is used if available, and CPU is the final fallback.
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

print(device)

cuda


Generate data sets that illustrate how the parameters of the model influence the behavior of the model. 
<br>
Settings:
<br>
Length of Time: 20 or 200
<br>
Embedding Dimension: 2
<br>
Number of Nodes: 1200
<br>
Parameters:  (1, 1, 2, 5), (1, 1, 1, 5), (1, 1, -2, 5), (1, 1, -4, 5)
<br>
Initial Distribution: Dir(1, 1, 1)


In [11]:
# Seed the global torch RNG once, before any of the four models is built.
torch.manual_seed(4)

# Settings shared by every simulation in this cell.
T = 20                                         # base number of time points
n = 1200                                       # number of nodes
alpha_0 = [[1, 1, 1], [1, 1, 1], [1, 1, 1]]    # the Dir(1, 1, 1) initial distribution from the text above

# One model per parameter vector. The two middle models use the long
# horizon (T * 10 = 200), the outer two use the short one (T = 20).
# The construction order is kept as-is so that any random draws made
# while building the models happen in the same sequence.
model_pos_2 = sim.ABC(time=T, nodes=n, beta=[1, 1, 2, 5], alpha_0=alpha_0)
model_pos_1 = sim.ABC(time=T * 10, nodes=n, beta=[1, 1, 1, 5], alpha_0=alpha_0)
model_neg_2 = sim.ABC(time=T * 10, nodes=n, beta=[1, 1, -2, 5], alpha_0=alpha_0)
model_neg_4 = sim.ABC(time=T, nodes=n, beta=[1, 1, -4, 5], alpha_0=alpha_0)

In [None]:
# Write the latent positions of each simulated model to its own CSV file.
# Each entry pairs a model with the destination path of its sample.
time_vs_lat_pos_outputs = (
    (model_pos_2, r"simulated_data/time_vs_lat_pos/pos_2_sample.csv"),
    (model_pos_1, r"simulated_data/time_vs_lat_pos/pos_1_sample.csv"),
    (model_neg_2, r"simulated_data/time_vs_lat_pos/neg_2_sample.csv"),
    (model_neg_4, r"simulated_data/time_vs_lat_pos/neg_4_sample.csv"),
)
for simulated_model, csv_path in time_vs_lat_pos_outputs:
    sim.ABC_Monte_Carlo.lat_pos(simulated_model.synth_data["lat_pos"], 3).to_csv(csv_path)

Generate data sets that illustrate 
1. what the latent position recovered from ASE alone looks like, and what it looks like after oracle alignment and after RGD alignment

In [12]:
# Settings for the latent-position example.
p = 2
T = 2
beta = [1, 1, -4, 5]
alpha_0 = [[10, 1, 1], [1, 10, 1], [1, 1, 10]]

# The model is built with a placeholder of 3 nodes; the node count that
# matters (4500) is passed to check_lat_pos below.
model = sim.ABC(time=T, nodes=3, beta=beta, alpha_0=alpha_0)

# The seed is set after the model is constructed and immediately before
# the latent-position check.
torch.manual_seed(0)
lat_pos = sim.ABC_Monte_Carlo.check_lat_pos(model, 4500)

In [None]:
# Export the true latent positions and the three estimates
# (raw ASE, ASE after alignment, RGD after alignment) to CSV.
lat_pos_to_frame = sim.ABC_Monte_Carlo.lat_pos

lat_pos_to_frame(lat_pos.truth, 3).to_csv(
    r"simulated_data/est_lat_pos/example_1/estimate_lat_pos/tru_lat_pos.csv"
)
lat_pos_to_frame(lat_pos.ASE, 3).to_csv(
    r"simulated_data/est_lat_pos/example_1/estimate_lat_pos/ASE_lat_pos.csv"
)
lat_pos_to_frame(lat_pos.ASE_aligned, 3).to_csv(
    r"simulated_data/est_lat_pos/example_1/estimate_lat_pos/ASE_aligned_lat_pos.csv"
)
lat_pos_to_frame(lat_pos.RGD_aligned, 3).to_csv(
    r"simulated_data/est_lat_pos/example_1/estimate_lat_pos/RGD_aligned_lat_pos.csv"
)

2. how the latent position recovered from an oracle-given orthogonal transformation converges to the true latent position in the $\|\cdot\|_{2 \to \infty}$ norm.
3. how the latent position recovered from RGD performs.

In [13]:
def norm_2_infty(X):
    """Return the two-to-infinity norm of every matrix in a 3-D batch.

    For each slice ``X[t]`` the Euclidean norm of every row is computed
    (reduction over dim 2) and the largest of those row norms is kept
    (reduction over dim 1).

    Parameters
    ----------
    X : torch.Tensor
        Tensor with at least three dimensions; dim 1 indexes rows and
        dim 2 indexes the coordinates of each row.

    Returns
    -------
    torch.Tensor
        The maximum row norm of each slice along dim 0.
    """
    row_norms = X.pow(2).sum(dim=2).sqrt()
    return row_norms.max(dim=1).values

def mat_sqrt(X, dim):
    """Rank-``dim`` factor ``U @ sqrt(diag(S))`` from a low-rank SVD of ``X``.

    ``torch.svd_lowrank`` is a randomized algorithm, so the output depends
    on the state of the global torch RNG.

    Parameters
    ----------
    X : torch.Tensor
        Matrix to factorize.
    dim : int
        Rank requested from ``torch.svd_lowrank`` (its ``q`` argument).

    Returns
    -------
    torch.Tensor
        Left singular vectors scaled column-wise by the square roots of
        the corresponding singular values.
    """
    left_vecs, sing_vals, _ = torch.svd_lowrank(X, dim)
    return left_vecs @ torch.sqrt(torch.diag(sing_vals))

def performance(dist, n_set, N_iter):
    """Summarize per-replicate errors into one row per network size.

    Parameters
    ----------
    dist : list of torch.Tensor
        Error vectors of length 2 (one entry per time point), ordered so
        that the ``N_iter`` replicates of each network size are contiguous.
    n_set : torch.Tensor
        1-D tensor of network sizes, in the same order as ``dist``.
    N_iter : int
        Number of replicates per network size.

    Returns
    -------
    pandas.DataFrame
        Columns ``iter``, ``nodes``, ``error_T0``, ``error_T1``; the error
        columns hold the mean over the replicates of each network size.
    """
    n_sizes = len(n_set)
    # (sizes, replicates, 2 time points) -> average over the replicates.
    mean_errors = torch.cat(dist).reshape(n_sizes, N_iter, 2).mean(dim = 1)
    node_col = n_set.unsqueeze(dim = 1)
    iter_col = torch.ones(n_sizes, 1) * N_iter
    table = torch.cat([iter_col, node_col, mean_errors], dim = 1)
    return pd.DataFrame(table, columns = ["iter", "nodes", "error_T0", "error_T1"])

In [14]:
# Same configuration as the latent-position example earlier in the
# notebook, rebuilt here so `model` is defined for the convergence loop.
p, T = 2, 2
beta = [1, 1, -4, 5]
alpha_0 = [[10, 1, 1], [1, 10, 1], [1, 1, 10]]

model = sim.ABC(
    time=T,
    nodes=3,
    beta=beta,
    alpha_0=alpha_0,
)

# Seed only after the model exists, then run the check with 4500 nodes.
torch.manual_seed(0)
lat_pos = sim.ABC_Monte_Carlo.check_lat_pos(model, 4500)

In [15]:
# Convergence experiment: for each network size, measure the two-to-infinity
# distance between the true latent positions and (a) the aligned ASE
# estimate, (b) the RGD-aligned estimate.

# Network sizes 1500, 2250, ..., 12000.
n_set = torch.arange(1500, 12001, 750)
# Number of replicates per network size.
N_iter = 1
# 2x2 permutation matrix: right-multiplying by it swaps the two columns.
ortho_trans = torch.tensor([[0,1.], [1, 0]])

ASE_dist = []
RGD_dist = []
for n in n_set:
    # n is a 0-d tensor, so the progress-bar label reads e.g. "tensor(1500)".
    for i in tm(range(N_iter), desc = str(n)):
        # The replicate index is the seed, so every network size reuses
        # the same seed sequence 0, ..., N_iter - 1.
        torch.manual_seed(i)
        lat_pos = sim.ABC_Monte_Carlo.check_lat_pos(model, n)
        ASE_two_infty = norm_2_infty(lat_pos.truth - lat_pos.ASE_aligned)
        ASE_dist.append(ASE_two_infty)

        # If the trace of the matrix returned by ortho_proc for the first
        # time slice is below 0.1, swap the columns of both time slices of
        # the RGD estimate in place.
        # NOTE(review): presumably ortho_proc returns the orthogonal
        # Procrustes solution between its two arguments, and a small trace
        # signals that RGD settled on the column-swapped solution - confirm
        # against src/Align.
        if torch.diag(Align.Oracle.ortho_proc(lat_pos.ASE_aligned[0,], lat_pos.RGD_aligned[0,])).sum() < 0.1:
            lat_pos.RGD_aligned[0,] = lat_pos.RGD_aligned[0,] @ ortho_trans
            lat_pos.RGD_aligned[1,] = lat_pos.RGD_aligned[1,] @ ortho_trans
        RGD_two_infty = norm_2_infty(lat_pos.truth - lat_pos.RGD_aligned)
        RGD_dist.append(RGD_two_infty)

# Average the errors over the replicates, giving one row per network size.
Oracle_performace = performance(ASE_dist, n_set, N_iter)
RGD_performance = performance(RGD_dist, n_set, N_iter)


tensor(1500): 100%|██████████| 1/1 [00:00<00:00,  3.57it/s]
tensor(2250): 100%|██████████| 1/1 [00:00<00:00,  3.62it/s]
tensor(3000): 100%|██████████| 1/1 [00:00<00:00,  1.85it/s]
tensor(3750): 100%|██████████| 1/1 [00:00<00:00,  1.15it/s]
tensor(4500): 100%|██████████| 1/1 [00:01<00:00,  1.24s/it]
tensor(5250): 100%|██████████| 1/1 [00:01<00:00,  1.83s/it]
tensor(6000): 100%|██████████| 1/1 [00:02<00:00,  2.53s/it]
tensor(6750): 100%|██████████| 1/1 [00:06<00:00,  6.42s/it]
tensor(7500): 100%|██████████| 1/1 [00:10<00:00, 10.81s/it]
tensor(8250): 100%|██████████| 1/1 [00:12<00:00, 12.59s/it]
tensor(9000): 100%|██████████| 1/1 [00:14<00:00, 14.25s/it]
tensor(9750): 100%|██████████| 1/1 [00:11<00:00, 11.34s/it]
tensor(10500): 100%|██████████| 1/1 [00:17<00:00, 17.60s/it]
tensor(11250): 100%|██████████| 1/1 [00:14<00:00, 14.83s/it]
tensor(12000): 100%|██████████| 1/1 [00:21<00:00, 21.71s/it]


In [21]:
# Save the two convergence tables next to each other.
convergence_outputs = (
    (Oracle_performace, r"simulated_data/est_lat_pos/example_1/convergence/oracle_performance.csv"),
    (RGD_performance, r"simulated_data/est_lat_pos/example_1/convergence/RGD_performance.csv"),
)
for perf_table, csv_path in convergence_outputs:
    perf_table.to_csv(csv_path)

**Below we generate the synthetic data set that shows how the latent position in ABCDPRGM evolves through time under different settings (different levels of knowledge about the latent positions).**

**Let $\widehat{B} \in \mathbb{R}^{q \times p}$ be the MLE that corresponds to the design matrix $X \otimes I_p$, and $\tilde{\beta} = (C^T C)^{-1} C^T \widehat{B}$. Let $\widehat{\beta}$ be the MLE that corresponds to the design matrix $(X \otimes I_p)C$.**

**We do Monte Carlo simulations to verify the asymptotic behavior of $\widehat{B}$ and $\tilde{\beta}$.**

We first verify that, with the initial latent positions that we are using, there isn't a significant difference between initializing at the true parameter and initializing at a parameter estimated by linear regression.

In [17]:
# Small Monte Carlo run comparing two ways of initializing the estimation.
N = 10
n_set = torch.arange(1500, 3001, 750)
beta = [1, 1, -4, 5]
alpha_0 = [[10, 1, 1], [1, 10, 1], [1, 1, 10]]

# Flags forwarded as oracle_lat_pos / oracle_align / no_oracle; ntypes
# counts how many of them are switched on.
OL = OA = NO = True
ntypes = OL + OA + NO

# Arguments that are identical for both runs.
shared_mc_kwargs = dict(
    number_of_iterations=N,
    nodes_set=n_set,
    beta=beta,
    alpha_0=alpha_0,
    seeded=True,
    constrained=False,
    oracle_lat_pos=OL,
    oracle_align=OA,
    no_oracle=NO,
)

# The two runs differ only in the oracle_guess flag: True for the
# true-parameter initialization, False for the linear-regression one
# described in the text above.
oracle_guess_results = sim.ABC_Monte_Carlo.consistency_T2(oracle_guess=True, **shared_mc_kwargs)
shiwen_guess_results = sim.ABC_Monte_Carlo.consistency_T2(oracle_guess=False, **shared_mc_kwargs)

tensor(1500): 100%|██████████| 1/1 [00:00<00:00,  2.32it/s]
tensor(2250): 100%|██████████| 1/1 [00:00<00:00,  1.50it/s]
tensor(3000): 100%|██████████| 1/1 [00:01<00:00,  1.05s/it]
tensor(1500): 100%|██████████| 1/1 [00:00<00:00,  1.19it/s]
tensor(2250): 100%|██████████| 1/1 [00:01<00:00,  1.50s/it]
tensor(3000): 100%|██████████| 1/1 [00:02<00:00,  2.40s/it]


In [None]:
# Save the estimates and the Fisher-information tables of both
# initialization strategies.
# The paths use forward slashes like every other export in this notebook:
# a backslash is a directory separator only on Windows, so on Linux/macOS
# the previous raw strings named a single file containing literal
# backslashes in the working directory instead of a file inside
# simulated_data/theo_var_vs_emp_var/justify_oracle_guess/.
# Forward slashes work on Windows as well.
oracle_guess_results.MC_result.est.to_csv(r"simulated_data/theo_var_vs_emp_var/justify_oracle_guess/oracle_init_est.csv")
shiwen_guess_results.MC_result.est.to_csv(r"simulated_data/theo_var_vs_emp_var/justify_oracle_guess/lin_reg_init_est.csv")

oracle_guess_results.MC_result.fish.to_csv(r"simulated_data/theo_var_vs_emp_var/justify_oracle_guess/oracle_init_fish.csv")
shiwen_guess_results.MC_result.fish.to_csv(r"simulated_data/theo_var_vs_emp_var/justify_oracle_guess/lin_reg_init_fish.csv")

Since there isn't any significant difference between initializing at the true parameter and initializing with linear regression, we will initialize at the true parameter in the following Monte Carlo experiment to save time.

In [None]:
# Full-size Monte Carlo run, initialized at the true parameter
# (oracle_guess=True).
N = 1000
n_set = torch.arange(1500, 12001, 750)
beta = [1, 1, -4, 5]
alpha_0 = [[10, 1, 1], [1, 10, 1], [1, 1, 10]]

# Flags forwarded as oracle_lat_pos / oracle_align / no_oracle; ntypes
# counts how many of them are switched on.
OL = OA = NO = True
ntypes = OL + OA + NO

consistency_kwargs = {
    "number_of_iterations": N,
    "nodes_set": n_set,
    "beta": beta,
    "alpha_0": alpha_0,
    "oracle_guess": True,
    "seeded": True,
    "constrained": False,
    "oracle_lat_pos": OL,
    "oracle_align": OA,
    "no_oracle": NO,
}
oracle_guess_results = sim.ABC_Monte_Carlo.consistency_T2(**consistency_kwargs)

In [11]:
# Save the estimates and the Fisher-information table of the full run.
consistency_mc = oracle_guess_results.MC_result
consistency_mc.est.to_csv(
    r"simulated_data/theo_var_vs_emp_var/consistency/est_all.csv"
)
consistency_mc.fish.to_csv(
    r"simulated_data/theo_var_vs_emp_var/consistency/fish_all.csv"
)

We want to investigate what happens to our estimation when we use the correct model but misspecify the dimension.

We first embed the graph in a lower dimension, run the regression, and then compare the result to the truth.

In [None]:
# Dimension-misspecification experiment: fit the model with embedding
# dimensions p0 = 2, ..., 6 and record the projected estimate for each.
p = 6
K = 6
T = 2

# Alternative initial distributions that were tried:
# alpha_0 = torch.ones(K, p)*2
# alpha_0 = torch.kron(torch.eye(int(p/2))*9 + 1, torch.ones(2))
alpha_0 = torch.eye(p) * 9 + 1

model = sim.ABC(time=T, nodes=30, beta=[1, 1, -4, 5], alpha_0=alpha_0)

seed_list = list(range(50))
n_list = list(range(1500, 6001, 1500))
p0_list = list(range(2, 7))

# One record per (seed, n, p0) combination.
results = []
for seed in seed_list:
    # The RNG is reseeded once per seed, not once per (n, p0) pair.
    torch.manual_seed(seed)
    for n in n_list:
        for p0 in p0_list:
            # The settings are re-applied for every (n, p0) pair, exactly
            # as before, so the sequence of calls on the model is unchanged.
            model.update_settings(nodes=n)

            estimate = ABC_Reg.est(
                two_lat_pos=model.synth_data['lat_pos'],
                two_adj_mat=model.synth_data['obs_adj'],
                groups=K,
            )
            # Fit in mode 'NO' with the (possibly misspecified) dimension p0.
            estimate.specify_mode('NO', fit=True, embed_dim=p0)

            # Project the fitted estimate using the constraint generated
            # for p0 + 1, and pick up the reported info_lost value.
            constraint = DR.fit.gen_constraint(p0 + 1, True)
            beta_est = DR.fit.proj_beta(estimate.fitted.est_result["estimate"], constraint).tolist()
            info_lost = estimate.fitted.est_result["info_lost"]

            results.append({
                'seed': seed,
                'n': n,
                'p0': p0,
                'beta1': beta_est[0],
                'beta2': beta_est[1],
                'beta3': beta_est[2],
                'beta4': beta_est[3],
                'info_lost': info_lost,
            })

# Build the DataFrame once from the collected records.
df = pd.DataFrame(results)

In [None]:
# Forward slashes keep this path portable: with the previous backslash
# separators, Linux/macOS would have written a file literally named
# "simulated_data\dimension_robustness\robustness.csv" into the working
# directory instead of into simulated_data/dimension_robustness/.
df.to_csv(r"simulated_data/dimension_robustness/robustness.csv")