#### This notebook performs parameter estimation under the Latent Space Model (LSM) setting.

In [1]:
# Importing utils functions needed for simulation studies
import utils
from importlib import reload
reload(utils)

<module 'utils' from '/mnt/code/simulation/scripts/utils.py'>

In [2]:
# Importing necessary packages
import pandas as pd
import sys,codecs
import numpy as np
import re
import os
import matplotlib.pyplot as plt
import scipy.stats as st
import collections
import statsmodels.api as sm
from time import time
import random

In [3]:
# Importing Ray, a distributed computing framework for enabling parallel and scalable execution
import ray

# address='auto' attaches this driver to an already-running Ray cluster
# (the log below reports "Connecting to existing Ray cluster").
ray.init(address='auto')  
# Print the aggregate resources the cluster reports (CPU count, memory, nodes).
print(ray.cluster_resources())

2025-07-30 15:31:32,059	INFO worker.py:1747 -- Connecting to existing Ray cluster at address: 192.168.1.7:6379...
2025-07-30 15:31:32,073	INFO worker.py:1927 -- Connected to Ray cluster.


{'CPU': 124.0, 'object_store_memory': 79025689803.0, 'memory': 184393276213.0, 'node:192.168.1.6': 1.0, 'node:192.168.1.5': 1.0, 'node:192.168.1.8': 1.0, 'node:__internal_head__': 1.0, 'node:192.168.1.7': 1.0}
[36m(map_fun_fmle pid=254686, ip=192.168.1.5)[0m cmle is over
[36m(map_fun_fmle pid=254686, ip=192.168.1.5)[0m Z_err_max: 6.336479794518037 Z_err_max1: 5.413772350175103
[36m(map_fun_fmle pid=30476)[0m cmle is over
[36m(map_fun_fmle pid=30476)[0m Z_err_max: 7.102516896088593 Z_err_max1: 5.195037562581565
[36m(map_fun_fmle pid=254686, ip=192.168.1.5)[0m SARX is over
[36m(map_fun_fmle pid=260029, ip=192.168.1.8)[0m cmle is over
[36m(map_fun_fmle pid=260029, ip=192.168.1.8)[0m Z_err_max: 7.335965106929267 Z_err_max1: 4.64902434340909
[36m(map_fun_fmle pid=260032, ip=192.168.1.8)[0m cmle is over[32m [repeated 8x across cluster] (Ray deduplicates logs by default. Set RAY_DEDUP_LOGS=0 to disable log deduplication, or see https://docs.ray.io/en/master/ray-observability/

In [4]:
# `os` is needed below to set the thread-count environment variables.
import os

# Total CPU count reported by the connected Ray cluster (a float, e.g. 124.0).
NUM_CPU = ray.cluster_resources()['CPU']

print(f'CPU total: {NUM_CPU}')

CPU total: 124.0


In [5]:
# Thread budget per process for the numerical back-ends.
NUM_THREADS = 4

# NOTE(review): these variables are set in the driver process only, after numpy
# was imported; whether already-loaded libraries or remote workers pick them up
# is not visible from this notebook -- confirm.
os.environ.update({
    "MKL_NUM_THREADS": str(NUM_THREADS),
    "NUMEXPR_NUM_THREADS": str(NUM_THREADS),
    "OMP_NUM_THREADS": str(NUM_THREADS),
})

# Floor division of the (float) CPU count, so the result is a float as well.
NUM_PROCESS = NUM_CPU // NUM_THREADS
print(f'max process: {NUM_PROCESS}')

max process: 31.0


### True parameters and network setting 

In [6]:
# Dimension setting
#   n: number of network nodes (the adjacency matrix is n x n)
#   p: length of rho / tau and number of rows of beta and bc
#   q: number of columns of beta
#   d: number of columns of bc
n, p, q, d = 500, 50, 20, 3

In [7]:
# True parameters, all drawn from one generator seeded with `seed`.
# The order of the rng calls fixes the drawn values -- do not reorder them.
seed = 666
rng = np.random.default_rng(seed)

tau = np.round(rng.uniform(0.15, 0.2, p), 4)
rho = np.round(rng.uniform(0.2, 0.9, p), 4)

# beta: the first 10 columns are non-zero, the remaining q - 10 columns are zero.
beta0 = np.round(rng.uniform(0.5, 1, (p, 10)), 4)
beta1 = np.zeros((p, q - 10))
beta = np.concatenate((beta0, beta1), axis=1)

# p x d matrix of standard-normal draws.
bc = rng.normal(0, 1, (p, d))

# Omega mean and variance
mean = np.zeros(p)
ta = 0.15
# Off-diagonal entries are ta^2 * ta^|i-j|; the diagonal is overwritten with ta.
index = np.arange(p)
lag = np.abs(index[:, None] - index[None, :])
cov = ta**2 * ta**lag
np.fill_diagonal(cov, ta)

# sigma2[j] = squared norm of row j of bc, plus ta.
sigma2 = np.array([bc_row @ bc_row + ta for bc_row in bc])

In [8]:
# Adjacency matrix-LSM
# A dedicated, seeded generator makes the simulated network reproducible.
# A separate generator is used so that the state of `rng` (which draws the true
# parameters and, later, `jj`) is not advanced by this cell.
rng_net = np.random.default_rng(seed + 1)

# One latent position per node; D[i, j] = -(n/4) * |dd[i] - dd[j]| and the edge
# probability is the logistic transform of D.
dd = rng_net.uniform(0, 1, n)
D = -(n/4) * np.abs(dd[:, None] - dd[None, :])
P = np.exp(D) / (1 + np.exp(D))
np.fill_diagonal(P, 0)  # no self-loops
A = rng_net.binomial(1, P)

# Rows without any edge would make the row normalisation below divide by zero:
# give each such node 6 random neighbours, never the node itself.
ind = np.flatnonzero(A.sum(axis=1) == 0)
for i in ind:
    candidates = np.delete(np.arange(n), i)
    A[i, rng_net.choice(candidates, size=6, replace=False)] = 1
np.fill_diagonal(A, 0)

# Row-normalised weight matrix: every row of W sums to 1.
W = A / A.sum(axis=1, keepdims=True)

In [9]:
# Sanity check: total number of edges, then the largest row sum of A.
print(A.sum())
A.sum(axis=1).max()

2756


np.int64(14)

### Save path setting and save parameters

In [10]:
# Root folder for everything this notebook writes (relative to the working
# directory); the trailing slash matters because paths are built by concatenation.
Save_path_dir = '../results/Results_LSM_02/'

In [11]:
# Save true values
# `DataFrame.to_csv` does not create missing folders, so make sure the output
# directory (and its parents, including Save_path_dir) exists first.
true_values_dir = Save_path_dir + f"n{n}_p{p}_B500/"
os.makedirs(true_values_dir, exist_ok=True)

pd.DataFrame(rho).to_csv(true_values_dir + f"rho_true_n{n}_p{p}.csv", index=False)
pd.DataFrame(beta).to_csv(true_values_dir + f"beta_true_n{n}_p{p}.csv", index=False)
pd.DataFrame(bc).to_csv(true_values_dir + f"B_true_n{n}_p{p}.csv", index=False)
# The adjacency matrix is stored directly under Save_path_dir.
pd.DataFrame(A).to_csv(Save_path_dir + f"A_smallp_n{n}.csv", index=False)

### Parallel framework (Ray) and full process

In [12]:
# parallel computation framework
# Ray was already initialised earlier in the notebook, so this is a repeated
# call; ignore_reinit_error=True turns it into a no-op that returns the existing
# context (the log below reports "Calling ray.init() again ...").
# NOTE(review): NUM_CPU is a float (124.0) and `num_cpus` appears to be ignored
# on a re-init; if this were the first call against an existing cluster, Ray may
# reject `num_cpus` -- confirm against the ray.init documentation.
import ray

ray.init(num_cpus=NUM_CPU, ignore_reinit_error=True)

2025-07-30 15:36:50,206	INFO worker.py:1747 -- Connecting to existing Ray cluster at address: 192.168.1.7:6379...
2025-07-30 15:36:50,207	INFO worker.py:1765 -- Calling ray.init() again after it has already been called.


0,1
Python version:,3.12.4
Ray version:,2.48.0


In [16]:
# Randomly pick one response dimension (index in [0, p)) for which the
# confidence interval is computed in every replication.
jj = rng.integers(low=0, high=p)

# Absolute paths of the two progress logs that the remote tasks use.
Save_path_process = os.path.abspath(f'{Save_path_dir}process.txt')
Save_path_process_ci = os.path.abspath(f'{Save_path_dir}process_ci.txt')
print(jj)

27


In [17]:
# Whole process
# `map_fun_fmle` runs one simulation replication as a Ray task reserving 4 CPUs.
@ray.remote(num_cpus=4)
def map_fun_fmle(bb):
    """Run replication ``bb``: simulate data, fit the estimators, build one CI.

    The first ``int(0.1 * n)`` observations form a small auxiliary block and the
    rest form the main block; each block uses the matching diagonal sub-block
    of the global weight matrix ``W``.

    Reads the notebook globals ``n, p, q, d, W, mean, cov, rho, beta, bc, jj``,
    ``Save_path_process`` and ``Save_path_process_ci`` and the ``utils`` module.
    Appends one line ``bb, jj, seconds, lower, upper`` to ``Save_path_process_ci``.

    Parameters
    ----------
    bb : int
        Replication index. ``bb + 88`` is the seed handed to
        ``utils.data_generator``; ``bb`` itself is forwarded to the optimisers
        and written to the CI log.

    Returns
    -------
    tuple
        ``(estimates, ite_hi, ite_hfZ, ite_hf)``. ``estimates`` stacks eight
        rows in this order: rho (initial fit), rho (fit with true Z), rho (fit
        with estimated Z), sigma2 (initial fit), tau2 (true Z), tau2
        (estimated Z), lower CI bounds, upper CI bounds. Only entry ``jj`` of
        the two CI rows is filled; the others stay 0. The remaining three
        items are the ``ite_*`` values returned by the respective optimisers.
        If the estimated Z is further than 10 (max-abs) from the true Z under
        both signs, all-zero placeholders of shapes ``(8, p)``, ``(p,)``,
        ``(p,)``, ``(p,)`` are returned instead.
    """
    # ---- data generation and split into auxiliary / main block ----
    Y, X, Z = utils.data_generator(n, p, q, d, W, mean, cov, rho, beta, bc, seed = bb + 88)
    n_aux = int(0.1*n)
    n_main = n - n_aux
    W_aux, W_main = W[:n_aux, :n_aux], W[n_aux:, n_aux:]
    Y_aux, X_aux = Y[:n_aux], X[:n_aux, :]
    Y_main, X_main, Z_main = Y[n_aux:], X[n_aux:, :], Z[n_aux:, :]

    # ---- initial estimators: main block first, then auxiliary block ----
    rho_hi, beta_hi, sigma2_hi, ite_hi = utils.optimize_initial(
        n_main, p, q, W_main, Y_main, X_main, bb, Save_path_process)
    rho_hb, beta_hb, _, _ = utils.optimize_initial(
        n_aux, p, q, W_aux, Y_aux, X_aux, bb, Save_path_process)
    print('cmle is over')

    # ---- estimate Z for the main block, together with M ----
    Z_h, M = utils.Z_est(n_aux, p, q, d, W_aux, W_main, Y_main, X_main,
                         rho_hi, beta_hi, Y_aux, X_aux, rho_hb, beta_hb)

    # Z_h is compared with the true Z under both signs; keep the closer one.
    err_as_is = np.max(np.abs(Z_h - Z_main))
    err_flipped = np.max(np.abs(-Z_h - Z_main))
    print('Z_err_max:', err_as_is, 'Z_err_max1:', err_flipped)
    if err_as_is > err_flipped:
        Z_h = -Z_h
    # Both signs are far off: report the replication as failed with zeros.
    if err_as_is > 10 and err_flipped > 10:
        return np.zeros((8,p)), np.zeros(p), np.zeros(p), np.zeros(p)

    # ---- FMLE with true Z ----
    rho_hfZ, _, _, tau2_hfZ, ite_hfZ = utils.optimize(
        n_main, p, q, d, W_main, Z_main, Y_main, X_main, bb, Save_path_process)
    print('SARX is over')

    # ---- FMLE with estimated Z ----
    rho_hf, beta_hf, b_hf, tau2_hf, ite_hf = utils.optimize(
        n_main, p, q, d, W_main, Z_h, Y_main, X_main, bb, Save_path_process)
    print('fmle is over')

    # sigma2_hat[k] = squared norm of row k of b_hf.T @ (M.T @ M / p), plus tau2_hf[k].
    H_hat = M.T@M/p
    bc_hat = b_hf.T@H_hat
    sigma2_hat = np.zeros(p)
    for k in range(p):
        loading_k = bc_hat[k, :]
        sigma2_hat[k] = loading_k @ loading_k + tau2_hf[k]

    # ---- CI computation for the single dimension jj ----
    lower_est, upper_est = np.zeros(p), np.zeros(p)
    q0 = 10  # presumably the number of non-zero beta columns -- confirm
    ci_start = time()
    # 1.96 is presumably the normal critical value for a 95% interval -- confirm.
    lower_est[jj], upper_est[jj] = utils.CI_est_test(
        jj, n_main, p, q0, d, W_main, rho_hf, beta_hf, b_hf, tau2_hf,
        M, sigma2_hat, Y_main, X_main, Z_h, 1.96)
    ci_seconds = time() - ci_start
    log_fields = (bb, jj, ci_seconds, lower_est[jj], upper_est[jj])
    with open(Save_path_process_ci, 'a') as f1:
        f1.write(', '.join(str(field) for field in log_fields) + '\n')

    estimates = np.array([rho_hi, rho_hfZ, rho_hf, sigma2_hi, tau2_hfZ, tau2_hf, lower_est, upper_est])
    return estimates, ite_hi, ite_hfZ, ite_hf

In [18]:
from time import time
from numpy.random import default_rng

### Start parallel experiments

In [19]:
# Whole experiment: submit BB replications to Ray, wait for all of them, and
# print the elapsed wall-clock time in seconds.
BB = 500
tic1 = time()
tasks = list(map(map_fun_fmle.remote, range(BB)))
results500_p50whole = ray.get(tasks)
toc1 = time()
print(toc1 - tic1)

4032.156128883362


In [130]:
# Disconnect this driver from Ray. The task results were already fetched with
# `ray.get` in the previous cell, so the analysis below does not need Ray.
# NOTE(review): the execution count of this cell (130) is out of sequence.
import ray
ray.shutdown()

### Save results and analysis

In [20]:
# Results: unpack the 4-tuples returned by the tasks into dense arrays.
BB = 500
theta_hat_LSM_n500_p50_B500 = np.zeros((BB, 8, p))
ite_ini, ite_fZin, ite_fin = (np.zeros((BB, p)) for _ in range(3))

for bt in range(BB):
    est_bt, ite_hi_bt, ite_hfZ_bt, ite_hf_bt = results500_p50whole[bt]
    theta_hat_LSM_n500_p50_B500[bt, :, :] = est_bt
    ite_ini[bt, :] = ite_hi_bt
    ite_fZin[bt, :] = ite_hfZ_bt
    ite_fin[bt, :] = ite_hf_bt
#toc1 = time()

In [21]:
# ECP computation
# Replications with a zero anywhere in their `ite_fin` row are dropped; this
# includes the all-zero placeholders returned by failed replications.
failed_runs = set(np.where(ite_fin == 0)[0])
cc = list(set(range(BB)).difference(failed_runs))

# Coverage indicator per retained replication: 1.0 when rho[jj] lies inside
# [lower, upper] (rows 6 and 7 of the stacked estimates), else 0.0.
ci_lower = theta_hat_LSM_n500_p50_B500[cc, 6, jj]
ci_upper = theta_hat_LSM_n500_p50_B500[cc, 7, jj]
ECP = ((ci_lower <= rho[jj]) & (rho[jj] <= ci_upper)).astype(float)

In [22]:
# Share of retained replications whose interval for rho[jj] covers the true value.
np.mean(ECP) 

np.float64(0.948)

In [23]:
# cmle: mean absolute error of rho from the initial fit (row 0), averaged over
# retained replications and all p dimensions; rho broadcasts across the rows.
np.nanmean(np.abs(theta_hat_LSM_n500_p50_B500[cc, 0, :] - rho))

np.float64(0.0391545852570014)

In [24]:
# SARX: mean absolute error of rho from the fit with the true Z (row 1), averaged
# over retained replications and all p dimensions; rho broadcasts across the rows.
np.nanmean(np.abs(theta_hat_LSM_n500_p50_B500[cc, 1, :] - rho))

np.float64(0.021158700812673105)

In [25]:
# fmle: mean absolute error of rho from the fit with the estimated Z (row 2),
# averaged over retained replications and all p dimensions; rho broadcasts.
np.nanmean(np.abs(theta_hat_LSM_n500_p50_B500[cc, 2, :] - rho))

np.float64(0.02972762179183594)

In [27]:
# Save results: one CSV per replication holding its 8 x p block of estimates
# (failed replications are written too, as all-zero blocks).
for b in range(BB):
    out_file = f"../results/Results_LSM_02/n500_p50_B500/theta_hat_LSM_n500_p50_{b}_.csv"
    pd.DataFrame(theta_hat_LSM_n500_p50_B500[b]).to_csv(out_file, index=False)