#### This notebook is used to conduct model selection under the Latent Space Model (LSM) setting.

In [1]:
# Importing necessary packages
import pandas as pd
import sys,codecs
import numpy as np
import re
import os
import matplotlib.pyplot as plt
import scipy.stats as st
import collections
import statsmodels.api as sm
from time import time
import random

In [2]:
# Importing utils functions needed for simulation studies
import utils_BIC
from importlib import reload
reload(utils_BIC)

<module 'utils_BIC' from '/mnt/code/simulation/scripts/utils_BIC.py'>

In [3]:
# Importing Ray, a distributed computing framework for enabling parallel and scalable execution
import ray

ray.init(address='auto')  
print(ray.cluster_resources())

2025-07-31 14:18:25,130	INFO worker.py:1747 -- Connecting to existing Ray cluster at address: 192.168.1.9:6379...
2025-07-31 14:18:25,174	INFO worker.py:1927 -- Connected to Ray cluster.


{'CPU': 124.0, 'object_store_memory': 78946438347.0, 'memory': 184208356149.0, 'node:192.168.1.10': 1.0, 'node:192.168.1.11': 1.0, 'node:192.168.1.9': 1.0, 'node:__internal_head__': 1.0, 'node:192.168.1.12': 1.0}
[36m(map_fun_BIC pid=14978)[0m 2 0 98 3.387892723083496
[36m(map_fun_BIC pid=14979)[0m 3 0 98 3.700544834136963
[36m(map_fun_BIC pid=15122)[0m 18 0 34 3.4180922508239746
[36m(map_fun_BIC pid=14976)[0m 0 0 80 3.931436061859131
[36m(map_fun_BIC pid=14977)[0m 1 0 21 4.117990493774414
[36m(map_fun_BIC pid=15471, ip=192.168.1.11)[0m 8 0 44 3.228391408920288
[36m(map_fun_BIC pid=15175)[0m 20 0 23 3.776562452316284
[36m(map_fun_BIC pid=15147)[0m 19 0 11 4.118931531906128
[36m(map_fun_BIC pid=15472, ip=192.168.1.11)[0m 9 0 30 3.7498691082000732
[36m(map_fun_BIC pid=15473, ip=192.168.1.11)[0m 10 0 22 3.8978095054626465
[36m(map_fun_BIC pid=15219, ip=192.168.1.12)[0m 4 0 15 4.101056337356567
[36m(map_fun_BIC pid=15224, ip=192.168.1.12)[0m 13 0 19 3.83844375610351

In [4]:
# Importing OS module for interacting with the operating system (e.g., file paths, directories)
import os

NUM_CPU = (ray.cluster_resources())['CPU'] #os.cpu_count() 

print(f'CPU total: {NUM_CPU}')

CPU total: 124.0


In [5]:
NUM_THREADS = 4

os.environ["MKL_NUM_THREADS"]     = str(NUM_THREADS)
os.environ["NUMEXPR_NUM_THREADS"] = str(NUM_THREADS)
os.environ["OMP_NUM_THREADS"]     = str(NUM_THREADS)

NUM_PROCESS = NUM_CPU // NUM_THREADS
print(f'max process: {NUM_PROCESS}')

max process: 31.0


### True parameters and network setting 

In [6]:
# dimension
n = 500; p = 50; q = 20; d = 3

In [7]:
# true parameter
seed = 666
rng = np.random.default_rng(seed) 
tau = np.round(rng.uniform(0.15,0.2,p),4)
rho = np.round(rng.uniform(0.2,0.9,p),4)
beta0 = np.round(rng.uniform(0.5,1,(p,10)),4)
beta1 = np.zeros((p,q-10))
beta = np.hstack((beta0, beta1))
bc = np.array(np.hstack([rng.normal(0,1,(p,d))]))
# Omega mean and variance
mean = np.zeros(p)
ta = 0.15
i, j = np.mgrid[:p, :p]
cov = ta**2*ta**abs(i-j)
cov[list(range(p)),list(range(p))] = ta
sigma2 = np.zeros(p)
for j in range(p):
    sigma2[j] = bc[j,:]@bc[j,:] + ta

In [8]:
# adjacency matrix- LSM
dd = np.random.uniform(0,1,n)
D = -(n/3)*abs(dd.reshape(n,1)@(np.ones(n).reshape(1,n)) - (np.ones(n).reshape(n,1))@dd.reshape(1,n))
P = np.exp(D)/(1+np.exp(D))
P[list(range(n)),list(range(n))] = 0
A = (np.random.binomial(1,P)).reshape((n,n))
ind = (np.where(np.sum(A,1)==0))[0]
for i in ind:
    A[i,random.sample(list(range(n)),6)] = 1
A[list(range(n)),list(range(n))] = 0
W = A/np.sum(A,1).reshape(n,1) 

In [9]:
# check
print(np.sum(A))
np.max(np.sum(A,1))

2066


np.int64(10)

In [256]:
#Save true parameters
# Save_path_dir = '../results/Results_LSM_BIC/'
# pd.DataFrame(rho).to_csv(Save_path_dir+f"n{n}_p{p}_B100/rho_true_n{n}_p{p}.csv",index=False)
# pd.DataFrame(beta).to_csv(Save_path_dir+f"n{n}_p{p}_B100/beta_true_n{n}_p{p}.csv",index=False)
# pd.DataFrame(bc).to_csv(Save_path_dir+f"n{n}_p{p}_B100/B_true_n{n}_p{p}.csv",index=False)
# pd.DataFrame(A).to_csv(Save_path_dir+f"A_smallp_n{n}.csv",index=False)

In [10]:
# Some hyperparameter settings
thre = 1e-3
BICn = 100
par = np.zeros(q+2)
par[-1] = 1
sudu = (np.log(q*p)/n)**0.5 #(np.log(q*p)/n)**0.5
lam_set = np.linspace(sudu**9,2*sudu**0.5, BICn) #2*sudu**0.5
bic_sh = np.log(n)*(np.log(p*q))/n

### Parallel framework-Ray and Full process

In [11]:
# import ray framework
import ray

ray.init(num_cpus=NUM_CPU, ignore_reinit_error=True)

2025-07-31 14:18:55,608	INFO worker.py:1747 -- Connecting to existing Ray cluster at address: 192.168.1.9:6379...
2025-07-31 14:18:55,609	INFO worker.py:1765 -- Calling ray.init() again after it has already been called.


0,1
Python version:,3.12.4
Ray version:,2.48.0


In [12]:
Save_path_process = os.path.abspath('../results/process.txt')

In [13]:
# Whole process
# Defining `map_fun_BIC` function, implementing a specific computational task
@ray.remote(num_cpus=4) 
def map_fun_BIC(bb):

    # data generator
    Y, X, Z = utils_BIC.data_generator(n, p, q, d, W, mean, cov, rho, beta, bc, seed = bb + 166)
    BIC_set = np.zeros((BICn,p))
    Ln_j_set = np.zeros((BICn,p))
    theta_ini = np.zeros((p, 2+q))
    # Iterate over all dimensions and alternative sets to compute the BIC
    for j in range(p):
        ticn1 = time()
        theta_ini[j,:] = utils_BIC.newton_sea_initial(n, p, q, W, par, Y[:,j], X)[0]
        rho0_h = theta_ini[j,:][0]
        beta0_h = theta_ini[j,:][1:(q+1)]
        sigma20_h = theta_ini[j,:][-1]
        for b in range(BICn):
            lambda_ = lam_set[b]
            beta_est = utils_BIC.newton_sea_SCAD(n, p, q, W, theta_ini[j,:], Y[:,j], X, lambda_)
            beta_scad = np.where(beta_est[0]<thre, 0, beta_est[0])
            Ln_j_set[b,j] = utils_BIC.log_likelihood_sar(rho0_h, beta_scad, sigma20_h, Y[:,j], X, W)
            BIC_set[b,j] = Ln_j_set[b,j] + len(np.where(beta_scad!=0)[0])*bic_sh
        tocn1 = time()
        argBIC = np.argmin(BIC_set[:,j])
        print(bb, j, argBIC, tocn1 - ticn1) 
        with open(Save_path_process, 'a') as f1:
            f1.write(str(bb) + ', '+ str(j) + ', '+ str(argBIC) +'\n')
    # Choose the smallest of these and make a model selection
    min_index = np.argmin(BIC_set, axis=0)
    beta_estt = np.zeros((p,q))
    for j in range(p):
        lambda_ = lam_set[int(min_index[j])]
        beta_estt[j,:] = utils_BIC.newton_sea_SCAD(n, p, q, W, theta_ini[j,:], Y[:,j], X, lambda_)[0]        
    return BIC_set.T, min_index, beta_estt

### Start parallel experiments

In [14]:
# Whole experiments
BB = 100
tic1 = time()
tasks = [map_fun_BIC.remote(bb) for bb in range(BB)]
resultsn500_p50BIC = ray.get(tasks)  
toc1 = time()
print(toc1 - tic1) # Total time

814.5708768367767


In [204]:
ray.shutdown()

In [15]:
# Save results
BIC_set_n500_p50_B100 = np.zeros((BB,BICn,p))
min_index_n500_p50_B100 = np.zeros((BB,p))
beta_est_n500_p50_B100 = np.zeros((BB,p,q))
for bt in range(BB):
    BIC_set_n500_p50_B100[bt,:,:], min_index_n500_p50_B100[bt,:], beta_est_n500_p50_B100[bt,:,:] = resultsn500_p50BIC[bt][0].T, resultsn500_p50BIC[bt][1], resultsn500_p50BIC[bt][2]

In [16]:
# averge selection consistency
1 - np.mean(abs((beta_est_n500_p50_B100[:,:,:] > thre).astype(int) - (beta > thre).astype(int)))

np.float64(0.98896)

In [17]:
# uniform selection consistency
1 - len(set(np.where((beta_est_n500_p50_B100[:,:,:] > thre).astype(int) - (beta > thre).astype(int) != 0)[0]))/100

0.010000000000000009

In [18]:
# Save results
# pd.DataFrame(min_index_n500_p50_B100).to_csv(Save_path_dir + f"n{n}_p{p}_B100/min_index_dep_n{n}_p{p}.csv",index=False)
# for b in range(BB):
#     pd.DataFrame(BIC_set_n500_p50_B100[b,:,:]).to_csv(Save_path_dir + f"n{n}_p{p}_B100/BIC_set_dep_n{n}_p{p}_"+str(b)+'_.csv',index=False)
#     pd.DataFrame(beta_est_n500_p50_B100[b,:,:]).to_csv(Save_path_dir + f"n{n}_p{p}_B100/beta_est_dep_n{n}_p{p}_"+str(b)+'_.csv',index=False)