#### This notebook performs parameter estimation under the Dyad Independence Model (DIM) setting.

In [1]:
# Importing utils functions needed for simulation studies
import utils
from importlib import reload
# Re-import the module so that edits made to utils.py after the kernel
# started are picked up without restarting the kernel.
reload(utils)

<module 'utils' from '/mnt/code/simulation/scripts/utils.py'>

In [2]:
# Importing necessary packages
import pandas as pd
import sys,codecs
import numpy as np
import re
import os
import matplotlib.pyplot as plt
import scipy.stats as st
import collections
import statsmodels.api as sm
from time import time
import random

In [3]:
# Importing Ray, a distributed computing framework for enabling parallel and scalable execution
import ray

# Attach to the already-running Ray cluster (address='auto') rather than
# starting a local instance. ignore_reinit_error=True keeps this cell safe to
# re-run in a live kernel: without it a second ray.init() raises RuntimeError.
ray.init(address='auto', ignore_reinit_error=True)

2025-07-30 16:58:29,555	INFO worker.py:1747 -- Connecting to existing Ray cluster at address: 192.168.1.7:6379...
2025-07-30 16:58:29,575	INFO worker.py:1927 -- Connected to Ray cluster.


0,1
Python version:,3.12.4
Ray version:,2.48.0


In [4]:
# Show the resources (CPU count, memory, nodes) Ray reports for the attached cluster.
print(ray.cluster_resources())

{'CPU': 124.0, 'object_store_memory': 79025689803.0, 'memory': 184393276213.0, 'node:192.168.1.6': 1.0, 'node:192.168.1.5': 1.0, 'node:192.168.1.8': 1.0, 'node:__internal_head__': 1.0, 'node:192.168.1.7': 1.0}


In [5]:
# Importing OS module for interacting with the operating system (e.g., file paths, directories)
import os

# Total number of CPUs across the whole Ray cluster (os.cpu_count() would only
# count the local machine). Ray reports the value as a float (e.g. 124.0), so
# it is cast to int because it is used as a count further down.
NUM_CPU = int(ray.cluster_resources()['CPU'])

print(f'CPU total: {NUM_CPU}')

CPU total: 124.0


In [6]:
# Number of threads each process may use inside MKL / NumExpr / OpenMP.
NUM_THREADS = 4

# NOTE(review): these variables are set in the driver process only, and after
# numpy has already been imported above - confirm that they take effect where
# intended (already-loaded libraries and remote Ray workers may not see them).
os.environ["MKL_NUM_THREADS"]     = str(NUM_THREADS)
os.environ["NUMEXPR_NUM_THREADS"] = str(NUM_THREADS)
os.environ["OMP_NUM_THREADS"]     = str(NUM_THREADS)

# How many NUM_THREADS-wide processes fit on the cluster. Cast to int because
# this is a count and NUM_CPU may be a float as reported by Ray.
NUM_PROCESS = int(NUM_CPU // NUM_THREADS)
print(f'max process: {NUM_PROCESS}')

max process: 31.0


### True parameters and network setting 

In [7]:
# Dimensions
n = 500  # number of network nodes (A and W are n x n)
p = 50   # number of response dimensions (length of rho)
q = 20   # number of columns of beta (covariates per response)
d = 3    # number of columns of the loading matrix bc

In [8]:
# true parameter
# Every random quantity below is drawn from the single seeded generator `rng`,
# so the order of the rng calls must stay exactly as written.
seed = 666
rng = np.random.default_rng(seed)
tau = np.round(rng.uniform(0.15, 0.2, p), 4)
rho = np.round(rng.uniform(0.2, 0.9, p), 4)
# beta: the first 10 columns are drawn from U(0.5, 1); the remaining q-10 are zero.
beta0 = np.round(rng.uniform(0.5, 1, (p, 10)), 4)
beta1 = np.zeros((p, q - 10))
beta = np.hstack((beta0, beta1))
bc = rng.normal(0, 1, (p, d))
# Omega mean and variance
mean = np.zeros(p)
ta = 0.15
# |row - column| for every entry of the p x p covariance matrix
lag = np.abs(np.subtract.outer(np.arange(p), np.arange(p)))
cov = ta**2 * ta**lag
np.fill_diagonal(cov, ta)
# sigma2[k] = squared norm of the k-th row of bc, plus ta
sigma2 = np.array([loading @ loading + ta for loading in bc])

In [9]:
# Adjacency matrix-DIM
# Each unordered pair {u, v} independently falls into one of four dyad states:
#   0: mutual tie, 1: u -> v only, 2: v -> u only, 3: no tie.
p_mutual = 2/n
p_asym = 0.5*(n**(-0.8))
pv = [p_mutual, p_asym, p_asym, 1 - p_mutual - p_asym - p_asym]
# (A[u, v], A[v, u]) implied by each dyad state
dyad_ties = ((1, 1), (1, 0), (0, 1), (0, 0))
A = np.zeros((n, n))
for u in range(n):
    for v in range(u + 1, n):
        state = np.argmax(rng.multinomial(1, pv, 1))
        A[u, v], A[v, u] = dyad_ties[state]
# Nodes without any outgoing tie are given ties to 4 randomly drawn nodes.
# NOTE(review): rng.choice samples with replacement here and may pick the node
# itself (that entry is cleared when the diagonal is zeroed below), so such a
# node can end up with fewer than 4 out-neighbours.
ind = np.flatnonzero(A.sum(axis=1) == 0)
for node in ind:
    A[node, rng.choice(list(range(n)), 4)] = 1
np.fill_diagonal(A, 0)
# Row-normalise A so that every row of W sums to one.
W = A / A.sum(axis=1, keepdims=True)

In [10]:
# Check: total number of directed ties, then the largest row sum (out-degree)
print(A.sum())
A.sum(axis=1).max()

1892.0


np.float64(10.0)

### Output path and saving the true parameters

In [11]:
# Root directory (relative to the notebook) under which all outputs are written.
# The trailing slash matters: later cells build paths by string concatenation.
Save_path_dir = '../results/Results_dep_02/'

In [12]:
#save true values
# The true parameters go into the n{n}_p{p}_B500 sub-directory. It is created
# first so that to_csv does not fail when the results tree does not exist yet.
true_dir = os.path.join(Save_path_dir, f"n{n}_p{p}_B500")
os.makedirs(true_dir, exist_ok=True)
pd.DataFrame(rho).to_csv(os.path.join(true_dir, f"rho_true_n{n}_p{p}.csv"), index=False)
pd.DataFrame(beta).to_csv(os.path.join(true_dir, f"beta_true_n{n}_p{p}.csv"), index=False)
pd.DataFrame(bc).to_csv(os.path.join(true_dir, f"B_true_n{n}_p{p}.csv"), index=False)
# The adjacency matrix is stored directly under the root results directory.
pd.DataFrame(A).to_csv(os.path.join(Save_path_dir, f"A_smallp_n{n}.csv"), index=False)

### Parallel framework (Ray) and the full estimation procedure

In [13]:
# parallel computation framework
import ray

# Ray is normally already connected by the ray.init(address='auto') cell near
# the top of the notebook, so only connect here when it is not. The previous
# call passed num_cpus while attaching to an existing cluster and relied on
# ignore_reinit_error to turn it into a no-op.
if not ray.is_initialized():
    ray.init(address='auto')

2025-07-30 17:00:01,798	INFO worker.py:1747 -- Connecting to existing Ray cluster at address: 192.168.1.7:6379...
2025-07-30 17:00:01,800	INFO worker.py:1765 -- Calling ray.init() again after it has already been called.


0,1
Python version:,3.12.4
Ray version:,2.48.0


In [16]:
# Randomly selected one dimension response for the CP
# NOTE: jj comes from the shared generator `rng`, so its value depends on how
# many draws were taken from `rng` before this cell ran.
jj = rng.integers(0, p)
# Absolute paths of the two progress logs handed to the remote tasks.
Save_path_process, Save_path_process_ci = (
    os.path.abspath(Save_path_dir + log_name)
    for log_name in ('process.txt', 'process_ci.txt')
)
jj

np.int64(8)

In [17]:
# Whole process
# Defining `map_fun_fmle` function, implementing a specific computational task
@ray.remote(num_cpus=4)
def map_fun_fmle(bb):
    """Run replication ``bb`` of the simulation as a Ray task.

    Data are generated with seed ``bb + 88`` and the nodes are split into the
    first ``int(0.1*n)`` nodes (block 0) and the remaining nodes (block 1).
    Initial estimates are fitted on both blocks, Z for block 1 is estimated
    from them (``utils.Z_est``), and the full model is then fitted on block 1
    twice: once with the true Z and once with the estimated Z. A confidence
    interval is computed for response dimension ``jj`` only and appended,
    together with its computation time, to ``Save_path_process_ci``.

    Reads the notebook globals n, p, q, d, W, mean, cov, rho, beta, bc, jj,
    Save_path_process and Save_path_process_ci.

    Returns:
        A 4-tuple ``(estimates, ite_hi, ite_hfZ, ite_hf)``. The rows of
        ``estimates`` are, in order: rho_hi, rho_hfZ, rho_hf, sigma2_hi,
        tau2_hfZ, tau2_hf, lower_est, upper_est. The ``ite_*`` values are the
        last outputs of the utils optimisers (presumably iteration counts -
        confirm in utils). If the estimated Z is more than 10 away from the
        true Z in maximum absolute entry under both signs, all-zero arrays
        of the same layout are returned instead.
    """
    Y, X, Z = utils.data_generator(n, p, q, d, W, mean, cov, rho, beta, bc, seed = bb + 88)

    # Node split: the leading n0 nodes form block 0, the rest form block 1.
    n0 = int(0.1*n)
    n1 = n - n0
    head, tail = slice(None, n0), slice(n0, None)
    WW0 = W[head, head]
    Y0, X0, Z0 = Y[head], X[head, :], Z[head, :]
    WW1 = W[tail, tail]
    Y1, X1, Z1 = Y[tail], X[tail, :], Z[tail, :]

    # Initial estimators, fitted separately on block 1 and on block 0.
    rho_hi, beta_hi, sigma2_hi, ite_hi = utils.optimize_initial(n1, p, q, WW1, Y1, X1, bb, Save_path_process)
    rho_hb, beta_hb, sigma2_hb, ite_hb = utils.optimize_initial(n0, p, q, WW0, Y0, X0, bb, Save_path_process)
    print('cmle is over')

    # Estimate Zi and M
    Z_h, M = utils.Z_est(n0, p, q, d, WW0, WW1, Y1, X1, rho_hi, beta_hi, Y0, X0, rho_hb, beta_hb)

    # Compare the estimate with the true Z under both signs.
    err_pos = np.max(np.abs(Z_h - Z1))
    err_neg = np.max(np.abs(-Z_h - Z1))
    print('Z_err_max:', err_pos, 'Z_err_max1:', err_neg)
    # Neither sign is close to the truth: report the replication as failed.
    if (err_pos>10) and (err_neg>10):
        return np.zeros((8,p)),np.zeros(p),np.zeros(p),np.zeros(p)
    # Otherwise keep whichever sign is closer to the true Z.
    if err_pos>err_neg:
        Z_h = - Z_h

    # FMLE with true Z
    rho_hfZ, beta_hfZ, b_hfZ, tau2_hfZ, ite_hfZ = utils.optimize(n1, p, q, d, WW1, Z1, Y1, X1, bb, Save_path_process)
    print('SARX is over')

    # FMLE with estimated Z
    rho_hf, beta_hf, b_hf, tau2_hf, ite_hf = utils.optimize(n1, p, q, d, WW1, Z_h, Y1, X1, bb, Save_path_process)
    print('fmle is over')

    H_hat = M.T@M/p
    bc_hat = b_hf.T@H_hat
    # sigma2_hat[k] = squared norm of row k of bc_hat, plus tau2_hf[k]
    sigma2_hat = np.array([bc_hat[k,:]@bc_hat[k,:] + tau2_hf[k] for k in range(p)])

    # CI computation: only dimension jj gets an interval; every other entry
    # of lower_est / upper_est stays 0.
    lower_est = np.zeros(p)
    upper_est = np.zeros(p)
    q0 = 10
    ci_start = time()
    lower_est[jj], upper_est[jj] = utils.CI_est_test(jj, n1, p, q0, d, WW1, rho_hf, beta_hf, b_hf, tau2_hf, M, sigma2_hat, Y1, X1, Z_h, 1.96)
    ci_seconds = time() - ci_start
    # Log line: replication, dimension, seconds spent, lower bound, upper bound
    log_fields = (bb, jj, ci_seconds, lower_est[jj], upper_est[jj])
    with open(Save_path_process_ci, 'a') as f1:
        f1.write(', '.join(str(field) for field in log_fields)+'\n')

    estimates = np.array([rho_hi, rho_hfZ, rho_hf, sigma2_hi, tau2_hfZ, tau2_hf, lower_est, upper_est])
    return estimates, ite_hi, ite_hfZ, ite_hf

In [18]:
from time import time
from numpy.random import default_rng

### Start parallel experiments

In [19]:
# whole experiments
# Submit one Ray task per replication and time the whole batch.
BB = 500
tic1 = time()
tasks = [map_fun_fmle.remote(bb) for bb in range(BB)]
results500_p50whole = ray.get(tasks)  # wait for all tasks to finish
toc1 = time()
print(toc1 - tic1) # total computation time (seconds)

4065.9902787208557


In [20]:
# Results
# Gather the 4-tuples returned by map_fun_fmle into dense arrays, one row
# (or one 8 x p slab) per replication.
BB = 500
theta_hat_dep_n500_p50_B500 = np.zeros((BB, 8, p))
ite_ini, ite_fZin, ite_fin = (np.zeros((BB, p)) for _ in range(3))
for bt in range(BB):
    theta_bt, ite_ini_bt, ite_fZin_bt, ite_fin_bt = results500_p50whole[bt]
    theta_hat_dep_n500_p50_B500[bt, :, :] = theta_bt
    ite_ini[bt, :] = ite_ini_bt
    ite_fZin[bt, :] = ite_fZin_bt
    ite_fin[bt, :] = ite_fin_bt

In [21]:
# ECP computation
# Drop every replication that has a 0 anywhere in its ite_fin row; this
# includes the replications for which map_fun_fmle returned all zeros.
failed = set(np.where(ite_fin == 0)[0])
cc = list(set(range(BB)) - failed)
# Rows 6 and 7 of each replication hold the lower and upper interval bounds;
# ECP[k] is 1.0 when the interval of replication cc[k] covers the true rho[jj].
ECP = np.array(
    [
        (theta_hat_dep_n500_p50_B500[rep, 6, jj] <= rho[jj])
        & (rho[jj] <= theta_hat_dep_n500_p50_B500[rep, 7, jj])
        for rep in cc
    ],
    dtype=float,
)

In [22]:
# Share of retained replications whose interval covers rho[jj]
ECP.mean()

np.float64(0.962)

In [23]:
#cmle
# Mean absolute error of row 0 (rho_hi) against the true rho over the retained
# replications; rho broadcasts across the replication axis.
np.nanmean(np.abs(theta_hat_dep_n500_p50_B500[cc, 0, :] - rho))

np.float64(0.03038408106492361)

In [24]:
#SARX with true Z
# Mean absolute error of row 1 (rho_hfZ) against the true rho over the retained
# replications; rho broadcasts across the replication axis.
np.nanmean(np.abs(theta_hat_dep_n500_p50_B500[cc, 1, :] - rho))

np.float64(0.009686677254523931)

In [25]:
#fmle
# Mean absolute error of row 2 (rho_hf) against the true rho over the retained
# replications; rho broadcasts across the replication axis.
np.nanmean(np.abs(theta_hat_dep_n500_p50_B500[cc, 2, :] - rho))

np.float64(0.02046833353863398)

In [26]:
# save results
# One CSV (8 x p) per replication, written to the n{n}_p{p}_B500 directory
# under Save_path_dir. The path is built from Save_path_dir, n and p instead of
# being hard-coded, so it stays in sync with the settings above. Replications
# that returned all zeros are written as well.
results_dir = os.path.join(Save_path_dir, f"n{n}_p{p}_B500")
os.makedirs(results_dir, exist_ok=True)
for b in range(BB):
    pd.DataFrame(theta_hat_dep_n500_p50_B500[b,:,:]).to_csv(
        os.path.join(results_dir, f"theta_hat_dep_n{n}_p{p}_{b}_.csv"), index=False
    )