#### This notebook is used to conduct model selection under the Stochastic Block Model (SBM) setting.

In [1]:
# Importing necessary packages
import pandas as pd
import sys,codecs
import numpy as np
import re
import os
import matplotlib.pyplot as plt
import scipy.stats as st
from sklearn.cluster import KMeans
import collections
import statsmodels.api as sm
from time import time
import random
from statsmodels.sandbox.distributions.extras import mvstdnormcdf

In [2]:
# Importing utils functions needed for simulation studies
import utils_BIC
from importlib import reload
reload(utils_BIC)

<module 'utils_BIC' from '/mnt/code/simulation/scripts/utils_BIC.py'>

In [3]:
# Importing Ray, a distributed computing framework for enabling parallel and scalable execution
import ray

ray.init(address='auto')  
print(ray.cluster_resources())

2025-07-31 15:05:58,853	INFO worker.py:1747 -- Connecting to existing Ray cluster at address: 192.168.1.9:6379...
2025-07-31 15:05:58,897	INFO worker.py:1927 -- Connected to Ray cluster.


{'CPU': 124.0, 'object_store_memory': 78946438347.0, 'memory': 184208356149.0, 'node:192.168.1.10': 1.0, 'node:192.168.1.11': 1.0, 'node:192.168.1.9': 1.0, 'node:__internal_head__': 1.0, 'node:192.168.1.12': 1.0}
[36m(map_fun_BIC pid=16324)[0m 2 0 95 3.5865883827209473
[36m(map_fun_BIC pid=16325)[0m 3 0 21 3.6798593997955322
[36m(map_fun_BIC pid=16322)[0m 0 0 80 3.937746047973633
[36m(map_fun_BIC pid=16323)[0m 1 0 13 3.845827579498291
[36m(map_fun_BIC pid=16468)[0m 18 0 34 3.40285062789917
[36m(map_fun_BIC pid=16521)[0m 20 0 29 3.6828441619873047
[36m(map_fun_BIC pid=23867, ip=192.168.1.11)[0m 6 0 40 3.4732484817504883
[36m(map_fun_BIC pid=23868, ip=192.168.1.11)[0m 7 0 36 3.3687782287597656
[36m(map_fun_BIC pid=21446, ip=192.168.1.12)[0m 8 0 44 3.5230023860931396
[36m(map_fun_BIC pid=16493)[0m 19 0 17 4.265380144119263
[36m(map_fun_BIC pid=23869, ip=192.168.1.11)[0m 9 0 30 3.1673660278320312
[36m(map_fun_BIC pid=23865, ip=192.168.1.11)[0m 4 0 16 3.6170570850372

In [4]:
# Importing OS module for interacting with the operating system (e.g., file paths, directories)
import os

NUM_CPU = (ray.cluster_resources())['CPU'] #os.cpu_count() 

print(f'CPU total: {NUM_CPU}')

CPU total: 124.0


In [5]:
NUM_THREADS = 4

os.environ["MKL_NUM_THREADS"]     = str(NUM_THREADS)
os.environ["NUMEXPR_NUM_THREADS"] = str(NUM_THREADS)
os.environ["OMP_NUM_THREADS"]     = str(NUM_THREADS)

NUM_PROCESS = NUM_CPU // NUM_THREADS
print(f'max process: {NUM_PROCESS}')

max process: 31.0


### True parameters and network setting 

In [6]:
# dimension
n = 500; p = 50; q = 20; d = 3

In [7]:
# true parameter
seed = 666
rng = np.random.default_rng(seed) 
tau = np.round(rng.uniform(0.15,0.2,p),4)
rho = np.round(rng.uniform(0.2,0.9,p),4)
beta0 = np.round(rng.uniform(0.5,1,(p,10)),4)
beta1 = np.zeros((p,q-10))
beta = np.hstack((beta0, beta1))
bc = np.array(np.hstack([rng.normal(0,1,(p,d))]))
# Omega mean and variance
mean = np.zeros(p)
ta = 0.15
i, j = np.mgrid[:p, :p]
cov = ta**2*ta**abs(i-j)
cov[list(range(p)),list(range(p))] = ta
sigma2 = np.zeros(p)
for j in range(p):
    sigma2[j] = bc[j,:]@bc[j,:] + ta

In [8]:
# Adjacency matrix-SBM
A = np.zeros((n,n))
K = 5
nk = int(n/K)
p1 = 9/n
p2 = 3/n
P = (np.kron(np.eye(K),(p1-p2)*np.ones((nk,nk)))+p2*np.ones((n,n))).flatten()
A = (rng.binomial(1,P)).reshape((n,n))

ind = (np.where(np.sum(A,1)==0))[0]
for i in ind:
    A[i,rng.choice(list(range(n)),6)] = 1
A[list(range(n)),list(range(n))] = 0
W = A/np.sum(A,1).reshape(n,1)  

In [9]:
# check
print(np.sum(A))
np.max(np.sum(A,1))

2114


np.int64(13)

In [10]:
#Save true parameters
# Save_path_dir = '../results/Results_SBM_BIC/'
# pd.DataFrame(rho).to_csv(Save_path_dir+f"n{n}_p{p}_B100/rho_true_n{n}_p{p}.csv",index=False)
# pd.DataFrame(beta).to_csv(Save_path_dir+f"n{n}_p{p}_B100/beta_true_n{n}_p{p}.csv",index=False)
# pd.DataFrame(bc).to_csv(Save_path_dir+f"n{n}_p{p}_B100/B_true_n{n}_p{p}.csv",index=False)
# pd.DataFrame(A).to_csv(Save_path_dir+f"A_smallp_n{n}.csv",index=False)

In [11]:
# Some hyperparameter settings
thre = 1e-3
BICn = 100
par = np.zeros(q+2)
par[-1] = 1
sudu = (np.log(p*q)/n)**0.5
lam_set = np.linspace(sudu**9,2*sudu**0.5, BICn)
bic_sh = np.log(n)*(np.log(p*q))/n

### Parallel framework-Ray and Full process

In [12]:
import ray

ray.init(num_cpus=NUM_CPU, ignore_reinit_error=True)

2025-07-31 15:06:50,993	INFO worker.py:1747 -- Connecting to existing Ray cluster at address: 192.168.1.9:6379...
2025-07-31 15:06:50,994	INFO worker.py:1765 -- Calling ray.init() again after it has already been called.


0,1
Python version:,3.12.4
Ray version:,2.48.0


In [13]:
Save_path_process = os.path.abspath('../results/process.txt')

In [14]:
# Whole process
# Defining `map_fun_BIC` function, implementing a specific computational task
@ray.remote(num_cpus=4) 
def map_fun_BIC(bb):

    # data generator
    Y, X, Z = utils_BIC.data_generator(n, p, q, d, W, mean, cov, rho, beta, bc, seed = bb + 166)
    BIC_set = np.zeros((BICn,p))
    Ln_j_set = np.zeros((BICn,p))
    theta_ini = np.zeros((p, 2+q))
    # Iterate over all dimensions and alternative sets to compute the BIC
    for j in range(p):
        ticn1 = time()
        theta_ini[j,:] = utils_BIC.newton_sea_initial(n, p, q, W, par, Y[:,j], X)[0]
        rho0_h = theta_ini[j,:][0]
        beta0_h = theta_ini[j,:][1:(q+1)]
        sigma20_h = theta_ini[j,:][-1]
        for b in range(BICn):
            lambda_ = lam_set[b]
            beta_est = utils_BIC.newton_sea_SCAD(n, p, q, W, theta_ini[j,:], Y[:,j], X, lambda_)
            beta_scad = np.where(beta_est[0]<thre, 0, beta_est[0])
            Ln_j_set[b,j] = utils_BIC.log_likelihood_sar(rho0_h, beta_scad, sigma20_h, Y[:,j], X, W)
            BIC_set[b,j] = Ln_j_set[b,j] + len(np.where(beta_scad!=0)[0])*bic_sh
        tocn1 = time()
        argBIC = np.argmin(BIC_set[:,j])
        print(bb, j, argBIC, tocn1 - ticn1) 
        with open(Save_path_process, 'a') as f1:
            f1.write(str(bb) + ', '+ str(j) + ', '+ str(argBIC) +'\n')
    # Choose the smallest of these and make a model selection
    min_index = np.argmin(BIC_set, axis=0)
    beta_estt = np.zeros((p,q))
    for j in range(p):
        lambda_ = lam_set[int(min_index[j])]
        beta_estt[j,:] = utils_BIC.newton_sea_SCAD(n, p, q, W, theta_ini[j,:], Y[:,j], X, lambda_)[0]        
    return BIC_set.T, min_index, beta_estt

### Start parallel experiments

In [15]:
# Whole experiments
BB = 100
tic1 = time()
tasks = [map_fun_BIC.remote(bb) for bb in range(BB)]
resultsn500_p501BIC = ray.get(tasks)  
toc1 = time()
print(toc1 - tic1) # Total time

759.7385034561157


In [73]:
ray.shutdown()

In [16]:
# Save results
BIC_set_n500_p50_B100 = np.zeros((BB,BICn,p))
min_index_n500_p50_B100 = np.zeros((BB,p))
beta_est_n500_p50_B100 = np.zeros((BB,p,q))
for bt in range(BB):
    BIC_set_n500_p50_B100[bt,:,:], min_index_n500_p50_B100[bt,:], beta_est_n500_p50_B100[bt,:,:] = resultsn500_p501BIC[bt][0].T, resultsn500_p501BIC[bt][1], resultsn500_p501BIC[bt][2]

In [17]:
# averge selection consistency
1 - np.mean(abs((beta_est_n500_p50_B100[:,:,:] > thre).astype(int) - (beta > thre).astype(int)))

np.float64(0.98884)

In [18]:
# uniform selection consistency
1 - len(set(np.where((beta_est_n500_p50_B100[:,:,:] > thre).astype(int) - (beta > thre).astype(int) != 0)[0]))/100

0.020000000000000018

In [109]:
#Save results
# pd.DataFrame(min_index_n500_p50_B100).to_csv(Save_path_dir + f"n{n}_p{p}_B100/min_index_n{n}_p{p}.csv",index=False)
# for b in range(BB):
#     pd.DataFrame(BIC_set_n500_p50_B100[b,:,:]).to_csv(Save_path_dir + f"n{n}_p{p}_B100/BIC_set_n{n}_p{p}_"+str(b)+'_.csv',index=False)
#     pd.DataFrame(beta_est_n500_p50_B100[b,:,:]).to_csv(Save_path_dir + f"n{n}_p{p}_B100/beta_est_n{n}_p{p}_"+str(b)+'_.csv',index=False)