### View the current logical CPU count of the server

In [1]:
import os

# Number of logical CPUs this process is allowed to run on.
# os.sched_getaffinity is not provided on every platform, so fall back to
# os.cpu_count() (which may return None, hence the final "or 1").
if hasattr(os, "sched_getaffinity"):
    NUM_CPU = len(os.sched_getaffinity(0))
else:
    NUM_CPU = os.cpu_count() or 1

print(f'CPU total: {NUM_CPU}')

CPU total: 128


### Limit the number of threads that a single process may use

In [2]:
# Threads each worker process may use inside the numerical back-ends.
NUM_THREADS = 4

# Thread caps for MKL, NumExpr and OpenMP. These variables generally have to
# be set before the corresponding libraries are loaded to take effect.
os.environ["MKL_NUM_THREADS"]     = str(NUM_THREADS)
os.environ["NUMEXPR_NUM_THREADS"] = str(NUM_THREADS)
os.environ["OMP_NUM_THREADS"]     = str(NUM_THREADS)

# Integer division yields 0 when fewer than NUM_THREADS CPUs are available;
# a process pool needs at least one worker, so clamp to 1.
NUM_PROCESS = max(1, NUM_CPU // NUM_THREADS)
print(f'Maximum number of parallel processes: {NUM_PROCESS}')

Maximum number of parallel processes: 32


### Import numpy, multiprocessing and other packages

In [34]:
import numpy as np
from numpy.random import default_rng
from time import time
import multiprocessing as mp
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as st
import random
from sklearn.cluster import KMeans
import statsmodels.api as sm
import math

### Global invariant parameters

In [15]:
# Number of mixture components (K) and number of covariates (q)
K = 5
q = 8

# True Parameters
sigma = 1
theta = [3, 1.5, 0, 0, 2, 0, 0, 0]
gamma = [3 * k - 4 for k in range(K)]      # -4, -1, 2, ... (K equally spaced values)
pi = [0.15, 0.2, 0.3, 0.25, 0.1]

# Mean vector and covariance matrix used to draw the covariates:
# zero mean, cov[i, j] = rho ** |i - j|
mean = np.zeros(q)
rho = 0.5
i, j = np.indices((q, q))
cov = rho ** np.abs(i - j)

### Some functions needed for global calculations

In [16]:
def dup_rows(a, indx, num_dups=1):
    """Return a new array in which row `indx` of `a` is followed by
    `num_dups` additional copies of itself."""
    insert_positions = [indx + 1] * num_dups
    row_to_copy = a[indx]
    return np.insert(a, insert_positions, row_to_copy, axis=0)

def dup_cols(a, indx, num_dups=1):
    """Return a new array in which column `indx` of `a` is followed by
    `num_dups` additional copies of itself."""
    insert_positions = [indx + 1] * num_dups
    col_to_copy = a[:, [indx]]          # keep it 2-D so it broadcasts across the copies
    return np.insert(a, insert_positions, col_to_copy, axis=1)
def function_exp(x):
    """Return the exponential of `x` (thin wrapper around np.exp)."""
    return np.exp(x)


def function_vexp(x):
    """Element-wise exponential of an array-like `x`.

    np.exp is already a vectorised ufunc, so it is called directly instead of
    being wrapped in np.vectorize: that wrapper loops in Python and raises on
    size-0 input.
    """
    return np.exp(np.asarray(x))
def function_bin(p,x):
    """Return p**x * (1-p)**(1-x); for x in {0, 1} this is the Bernoulli
    probability of outcome `x` with success probability `p`."""
    success_factor = p**x
    failure_factor = (1-p)**(1-x)
    return success_factor*failure_factor

# Element-wise version that accepts array-likes for p and x
function_vbin = np.vectorize(function_bin)

### Initial-EM

In [25]:
#Section 2.2-The initial estimator-EM
def em_single_initial(n,priors,X,Y):
    '''
    One EM update of the initial estimator.

    The component weights are proportional to
    pi_k * exp(-(Y_i - X_i.theta - gamma_k)**2 / (2*sigma**2)).

    Arguments:
    n: number of observations; used as the divisor in the sigma update
       (expected to equal len(Y))
    priors: flat sequence [pi_t (K values), gamma_t (K values),
            theta_t (q values), sigma_t]
    X: [n X q matrix]
    Y: [n X 1 list]

    The dimensions are read from the inputs rather than from module globals:
    q = X.shape[1] and K = (len(priors) - 1 - q) // 2, so any number of
    components is supported.

    Returns:
    flat list [new_pi (K), new_gamma (K), new_theta (q), new_sigma]
    '''
    X = np.asarray(X, dtype=float)
    Y = np.asarray(Y, dtype=float)

    num_cov = X.shape[1]
    num_comp = (len(priors) - 1 - num_cov) // 2
    pi_t = np.asarray(priors[0:num_comp], dtype=float)
    gamma_t = np.asarray(priors[num_comp:2*num_comp], dtype=float)
    theta_t = np.asarray(priors[2*num_comp:2*num_comp+num_cov], dtype=float)
    sigma_t = priors[-1]

    #E step -w_ik
    resid = Y - np.dot(X, theta_t)                       # length n
    dev = resid[:, None] - gamma_t[None, :]              # n x K deviations from each gamma_k
    log_kernel = -dev**2 / (2*sigma_t**2)
    # Subtracting the row maximum cancels in the normalisation below but keeps
    # at least one entry per row at exp(0)=1, avoiding 0/0 when all terms underflow.
    log_kernel -= log_kernel.max(axis=1, keepdims=True)
    weighted = np.exp(log_kernel) * pi_t
    w_t1 = weighted / weighted.sum(axis=1, keepdims=True)

    #M step update
    new_pi = w_t1.mean(axis=0)
    # Weighted mean of the residuals (current theta) within each component
    new_gamma = np.dot(w_t1.T, resid) / w_t1.sum(axis=0)
    # Least squares of Y minus the expected intercept (current gamma) on X;
    # solving the normal equations avoids forming an explicit inverse.
    V = np.dot(w_t1, gamma_t)
    new_theta = np.linalg.solve(np.dot(X.T, X), np.dot(X.T, Y - V))
    # sum_i sum_k w_ik (resid_i - gamma_k)^2: algebraically the same as the
    # expanded form sum(resid^2) - 2 sum(resid * w.gamma) + sum(w.gamma^2)
    # (rows of w sum to 1), but it cannot become negative through cancellation.
    new_sigma = (np.sum(w_t1 * dev**2) / n)**0.5
    return list(new_pi)+list(new_gamma)+list(new_theta)+[new_sigma]

### Initial-EM: Iteration

In [26]:
def initial_em(n,prior,X,Y,tol = 1e-3,iterations=10000):
    '''
    Iterate the EM update until the parameter vector stops moving.

    param n: number of observations (passed through to em_single_initial)
    param Y, X: data
    param prior: initial parameter vector
    param tol: stop when the Euclidean distance between successive
               parameter vectors falls below this threshold
    param iterations: maximum number of iterations
    return: [parameter vector, number of completed iterations];
            if iterations <= 0 the starting vector is returned unchanged
    '''
    # Defined up front so the function still returns when the loop never runs.
    new_prior = prior
    iteration = 0
    while iteration < iterations:
        new_prior = em_single_initial(n,prior,X,Y)
        step = np.asarray(new_prior, dtype=float) - np.asarray(prior, dtype=float)
        if np.sqrt(np.sum(step**2)) < tol:
            break
        prior = new_prior
        iteration += 1
    return [new_prior,iteration]

### 1. Simulation Data Generator--X,Y

In [27]:
def data_generator(n,seed):
    '''
    Simulate one data set of size n.

    Uses the module-level true parameters (mean, cov, pi, gamma, theta,
    sigma, K). Observations are laid out in blocks by component: the first
    block gets intercept gamma[0], the next gamma[1], and so on, with block
    sizes drawn from a multinomial with probabilities pi.

    param n: number of observations
    param seed: seed for the random number generator
    return: [X, Y] with X of shape (n, q) and Y of length n
    '''
    rng = default_rng(seed)                                 #Setting the seed for the random number generator

    # The draw order (X, class counts, noise) is kept fixed so that a given
    # seed always reproduces the same data set.
    X = rng.multivariate_normal(mean, cov, (n,), 'raise')   #X nxq
    mk_class = rng.multinomial(n, pvals=pi)

    # Per-observation intercept: gamma[k] repeated mk_class[k] times
    mK_gamma = np.repeat(np.asarray(gamma, dtype=float)[:K], mk_class[:K])

    #Generate Y; the noise standard deviation follows the true parameter sigma
    epsilon = rng.normal(scale=sigma, size=n)
    Y = mK_gamma + np.dot(X, theta) + epsilon

    return [X, Y]

### 2. Function to compute the initial value of the initial-estimator

In [28]:
def initial_pri_est(n,X,Y,random_state=None):
    '''
    Compute starting values for the EM algorithm.

    Steps: fit OLS of Y on X with an intercept, cluster the residuals into
    K groups with K-means, then derive the starting values from that single
    clustering so that they all refer to the same groups.

    param n: number of observations (divisor for the mixing proportions)
    param X, Y: data
    param random_state: optional seed forwarded to KMeans; the default None
                        keeps the clustering unseeded
    return: flat list [pi_pri (K), gamma_pri (K), theta_pri, sigma_pri],
            with components ordered by increasing residual mean
    '''
    X_s = sm.add_constant(X)
    model_fit = sm.OLS(Y, X_s).fit()
    resid = np.asarray(model_fit.resid)
    params = np.asarray(model_fit.params)

    # Fit K-means once and reuse its labels. Re-fitting to obtain labels could
    # produce a different clustering than the one the centres came from.
    km = KMeans(n_clusters=K, random_state=random_state).fit(resid.reshape(-1,1))
    labels = km.labels_

    # Intercepts: sorted cluster centres shifted by the OLS intercept
    gamma_pri = [center+params[0] for center in sorted(km.cluster_centers_[:,0])]
    theta_pri = params[1:]

    # Order the clusters by their mean residual so that proportions line up
    # with the sorted intercepts above.
    cluster_means = [np.mean(resid[labels == k]) for k in range(K)]
    order = np.argsort(cluster_means)

    pi_pri = [np.sum(labels == k)/n for k in order]
    # Average of the within-cluster (population) standard deviations
    sigma_pri = np.mean([np.std(resid[labels == k]) for k in order])

    param_pri = pi_pri+gamma_pri+list(theta_pri)+[sigma_pri]

    return param_pri

### 3. Define a mapping: random number seed $\mapsto$ initial estimator and iteration count

In [29]:
def map_fun(b):
    '''
    Run one simulation replication.

    param b: seed for the random number generator of this replication
    return: [estimate, number of EM iterations]
    '''
    # Simulated data for this seed (sample size n is a module-level constant)
    design, response = data_generator(n,seed = b)
    # Starting values for EM
    start_values = initial_pri_est(n, design, response)
    # EM estimate and the number of iterations it took
    estimate, n_iter = initial_em(n,start_values, design, response)
    return [estimate, n_iter]

### 4. Constants setting for simulation

In [54]:
# Dimension
n = 5000
p = 1000    # NOTE(review): not referenced in the code visible here
# Replication
B = 500

### 5. Calling multiple processes for simulation experiments

In [55]:
# Multiple Processes
tic1 = time()                                   # wall-clock start

# One task per seed 0..B-1, distributed over NUM_PROCESS worker processes
with mp.Pool(processes=NUM_PROCESS) as pool:
    Results1 = pool.map(func=map_fun, iterable=range(B))

toc1 = time()                                   # wall-clock end
print(toc1 - tic1)                              # Total computing time

47.0075147151947


### 6. Obtain Results

In [56]:
# One row per replication: 2*K+q+1 estimated parameters, plus the iteration count
initial_est_n5000 = np.zeros(shape=(B, 2*K+q+1))
initial_iter_n5000 = np.ones(shape=B)
for b, row in enumerate(initial_est_n5000):
    row[:] = Results1[b][0]                 # row is a view, so this fills the matrix in place
    initial_iter_n5000[b] = Results1[b][1]

In [57]:
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs("R500_initial", exist_ok=True)
pd.DataFrame(initial_est_n5000).to_csv("R500_initial/initial_est_n5000.csv",index=False)
pd.DataFrame(initial_iter_n5000).to_csv("R500_initial/initial_iter_n5000.csv",index=False)