In [None]:
import random
import math
import statsmodels.formula.api as smf
import sys
import warnings
warnings.filterwarnings('ignore')
import statsmodels.api as sm
import os
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.ensemble import AdaBoostClassifier,AdaBoostRegressor
from sklearn.linear_model import LogisticRegression,LinearRegression
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor
from mlxtend.classifier import StackingCVClassifier,StackingClassifier
from mlxtend.regressor import StackingCVRegressor,StackingRegressor
from sklearn.neural_network import MLPClassifier,MLPRegressor
from pygam import GAM,LogisticGAM
import itertools
from scipy import stats
from scipy.special import expit
xgb.set_config(verbosity=0)

In [None]:
# Simulation 3 uses 2-D slices of MRI images; the libraries below are needed to handle them.
from nilearn import image
from nilearn.masking import apply_mask
import nibabel as nib

In [None]:
from sympy import Symbol, exp, log
from numpy import linalg as la
import matplotlib.pyplot as plt
import time
import scipy.stats
import numpy.random as nrd
import multiprocessing
# In Simulation 3 the images are no longer created by the "HDFPCA" function; the image generator below is used instead.
def gene_image(SampleNumber):
    """Cut `SampleNumber` random 50x50 slices out of the masked volumes.

    Each slice is divided by 1000, written to
    './triple-crossfitting/simu_img/<k>.npy' (k = 0, 1, ...) and also
    collected in memory.

    Returns:
        np.ndarray stacking all generated slices in generation order.
    """
    w_masked_path = './triple-crossfitting/waiting_mask/'
    filename_lst = [w_masked_path + fname for fname in os.listdir(w_masked_path)]
    # Loaded exactly as before; the object is not used further down.
    intersect = nib.load("./triple-crossfitting/cc_masks0/intersect.nii.gz")
    collected = []
    for slot in range(SampleNumber):
        # The draws below are made in a fixed order so the random stream is
        # consumed the same way for every sample.
        file_num = random.randrange(400, 700)
        row_num = random.randrange(5, 60)
        index_num = random.randrange(5, 60)
        high_num = random.randrange(5, 84)
        start_num1 = random.randrange(1, 15)
        start_num2 = random.randrange(1, 30)
        volume = nib.load(filename_lst[file_num]).get_fdata()
        axis_pick = random.choice([0, 1, 2])  # which axis is held fixed
        window1 = slice(start_num1, start_num1 + 50)
        window2 = slice(start_num2, start_num2 + 50)
        if axis_pick == 0:
            patch = volume[row_num, window1, window2]
        elif axis_pick == 1:
            patch = volume[window1, index_num, window2]
        else:
            # NOTE(review): both in-plane windows start at start_num1 here,
            # unlike the other two branches -- confirm this is intended.
            patch = volume[window1, window1, high_num]
        patch = patch / 1000
        np.save('./triple-crossfitting/simu_img/' + str(slot) + '.npy', patch)
        collected.append(patch)
    return np.array(collected)

# L = 10
# pg = 250
def get_X_bar(X,N): # X = observed images
    """Flatten each of the N observations, scale by 1/50 and centre every column.

    Returns an (N, n_features) array whose column means are zero.
    """
    flattened = X.reshape((N, -1))/50   # one row per observation
    column_means = np.mean(flattened, 0)
    return flattened - column_means[np.newaxis]

def get_est_U(X,N,return_eigenscore=False,n_blocks=10,block_size=250):
    """Estimate eigen-images (and optionally eigenscores) block by block.

    The N observations in `X` are flattened, scaled by 1/50 and centred.
    The N x N Gram matrix is accumulated over `n_blocks` consecutive column
    blocks of width `block_size`, decomposed with an SVD, and each block of
    the eigen-images is recovered as U(m) = X(m)^T V S^{-1/2}.

    Args:
        X: array that reshapes to (N, n_features).
        N: number of observations.
        return_eigenscore: when False (default) return the eigen-images,
            otherwise return the eigenscores.
        n_blocks: number of column blocks (previously fixed at 10).
        block_size: number of features per block (previously fixed at 250).
            Only the first n_blocks*block_size features are used, so the
            defaults correspond to 50x50 images.

    Returns:
        return_eigenscore == False: array of shape (n_blocks*block_size, N),
            one column per component.
        otherwise: array of shape (N, N), row k holding the score of every
            observation on component k.

    Note:
        After centring, the data matrix has rank at most N-1, so the smallest
        eigenvalue is numerically ~0 and the last component is dominated by
        rounding error; callers should not rely on it.
    """
    # SVD step
    X_bar = X.reshape((N, -1))/50   # flatten each observation (N x features)
    X_bar = X_bar - np.mean(X_bar, 0)[np.newaxis]   # centre every feature
    X_bar_square = np.zeros(shape=[N, N])
    for l in range(n_blocks):
        X_bar_part = X_bar[:, l*block_size:(l+1)*block_size]
        X_bar_square += np.matmul(X_bar_part, X_bar_part.transpose())

    v, s_2, vt = la.svd(X_bar_square)  # v: N x N, s_2: N eigenvalues (descending)
    est_eimg = np.zeros([N, n_blocks, block_size])
    for l in range(n_blocks):
        X_bar_part = X_bar[:, l*block_size:(l+1)*block_size]
        est_eimg[:, l] = np.matmul(X_bar_part.T, np.matmul(v, np.sqrt(np.diag(1/s_2)))).T # U(m)=X(m)VS_-1
    U = est_eimg.reshape((N, -1))
    if return_eigenscore==False:
        return U.T
    else:
        return np.matmul(U,X_bar.T)

In [None]:
# model setting
# Candidate learners. The *_psmodel objects are classifiers and the
# *_rspmodel objects are regressors; the lists below are handed to the
# DR model as `binary_models` and `continue_models` respectively.
LG_psmodel = LogisticRegression(penalty='l2', solver='lbfgs', max_iter=1000)
Linear_rspmodel = LinearRegression()

RF_psmodel = RandomForestClassifier()
RF_rspmodel = RandomForestRegressor()

ADA_rspmodel = AdaBoostRegressor(learning_rate=0.05)
ADA_psmodel = AdaBoostClassifier()

XGB_rspmodel = xgb.XGBRegressor(n_estimators=500,learning_rate=0.005)
XGB_psmodel = xgb.XGBClassifier(n_estimators=500,learning_rate=0.005)

GAM_rspmodel = GAM(link='identity',n_splines=4,lam=0.6,max_iter=10000)
GAM_psmodel =  LogisticGAM(n_splines=4,lam=0.6,max_iter=10000)

BPNN_rspmodel = MLPRegressor(hidden_layer_sizes=(128,),activation='relu',max_iter=100,solver='lbfgs')
BPNN_psmodel = MLPClassifier(hidden_layer_sizes=(128,),activation='relu',max_iter=100,solver='lbfgs')

# Note: GAM_psmodel, XGB_rspmodel and BPNN_rspmodel are defined above but are
# not part of the lists that are actually used.
ps_mds = [LG_psmodel,RF_psmodel,ADA_psmodel,XGB_psmodel,BPNN_psmodel]
rsp_mds = [Linear_rspmodel,RF_rspmodel,ADA_rspmodel,GAM_rspmodel]

import DRModel
# `import DRModel` binds the *module*; calling a module object raises
# "TypeError: 'module' object is not callable". Use the class of the same name
# when the module exposes one, and fall back to the original call otherwise.
# NOTE(review): assumes the class inside DRModel.py is named `DRModel` -- confirm.
dr = getattr(DRModel, 'DRModel', DRModel)()
dr.ml_models(binary_models=ps_mds,continue_models=rsp_mds)

In [None]:
# other functions
def flatten(lis):
    """Return a flat list with the items of `lis`, expanding nested lists recursively.

    Only `list` instances are expanded; tuples and other containers are kept
    as single items. `lis` itself may be any iterable.
    """
    flat = []
    for element in lis:
        if isinstance(element, list):
            flat += flatten(element)
        else:
            flat.append(element)
    return flat
def coverage(p,lower,upper,true_value):
    """Fraction of positions k < len(p) with lower[k] <= true_value[k] <= upper[k].

    `p` is only used for its length. Raises ZeroDivisionError when it is empty.
    """
    n_intervals = len(p)
    hits = sum(
        1.0
        for k in range(n_intervals)
        if true_value[k] >= lower[k] and true_value[k] <= upper[k]
    )
    return hits/n_intervals

def I(val_lst,min_thred=-100000,max_thred=100000):
    """Indicator vector: 1 where min_thred <= value <= max_thred, otherwise 0."""
    flags = [
        1 if (val >= min_thred and val <= max_thred) else 0
        for val in val_lst
    ]
    return np.array(flags)

def sample_splitting(dataframe,png,ids,resample=False,sample_num=800):
    """Split a data frame and its image array into four consecutive quarters.

    Args:
        dataframe: frame with at least 'Treatment' and 'Y' columns; its index
            is used to pick the matching rows of `png`.
        png: image array indexed along its first axis.
        ids: not used by the current implementation.
        resample: when True the rows are shuffled (with their images) before
            splitting.
        sample_num: not used by the current implementation.

    Returns:
        (naive_ate, frame1, images1, frame2, images2, frame3, images3,
        frame4, images4), where naive_ate is the difference between the mean
        of 'Y' among treated and among untreated rows.
    """
    base_frame = dataframe
    base_images = png[base_frame.index]
    if resample==True:
        simu_frame_s = base_frame.sample(frac=1)
        observers_s = png[simu_frame_s.index]
    else:
        simu_frame_s, observers_s = base_frame, base_images
    treated_y = simu_frame_s[simu_frame_s['Treatment']==1]['Y']
    control_y = simu_frame_s[simu_frame_s['Treatment']==0]['Y']
    true_ate1 = (treated_y.mean() - control_y.mean())
    n_rows = len(simu_frame_s.index)
    # Quarter boundaries: 0, 25%, 50%, 75%, end.
    cuts = [0] + [int(n_rows*q) for q in (0.25, 0.5, 0.75)] + [n_rows]
    pieces = []
    for lo, hi in zip(cuts[:-1], cuts[1:]):
        pieces.append(simu_frame_s.iloc[lo:hi,:])
        pieces.append(observers_s[lo:hi,:])  # matching image block
    return (true_ate1, *pieces)
     
def dc_sample_splitting(simu_frame_s):
    """Cut a frame into three consecutive row blocks.

    The cut points are int(n*0.333333) and int(n*0.666666), so the blocks can
    differ in length by a row or two.
    """
    n_rows = len(simu_frame_s.index)
    first_cut = int(n_rows*0.333333)
    second_cut = int(n_rows*0.666666)
    bounds = ((None, first_cut), (first_cut, second_cut), (second_cut, None))
    return tuple(simu_frame_s.iloc[lo:hi,:] for lo, hi in bounds)
    
def frame_reindex(frame):
    """Relabel the entries of `frame` positionally as 'es0', 'es1', ...

    Returns a Series named 'egs' whose index is named 'idx'.
    """
    labels = ['es'+str(pos) for pos in range(len(frame))]
    relabelled = (
        pd.DataFrame(frame,columns = ['egs'])
        .assign(idx=labels)
        .set_index('idx')
    )
    return pd.Series(relabelled['egs'])

def getate(some_ate,tate,par):
    """Pair every partition's point estimate with its reference value.

    Args:
        some_ate: nested mapping estimator -> model key ('True',
            'Main Effect', 'ML') -> partition id -> list of per-split
            arrays/Series of individual effect contributions.
        tate: sequence of reference values, read positionally in the order
            in which `partition` is iterated.
        par: number of per-split entries to read for each partition.

    Returns:
        dict estimator -> model key -> list of (estimate, reference) tuples,
        one per partition. The estimate is the mean over splits of the
        per-split means.

    Relies on the notebook-level global `partition`.

    The previous version also computed per-split variances, a standard
    deviation and confidence limits here, but none of them reached the return
    value; that dead work has been removed.
    """
    sec_keys = ['True','Main Effect','ML']
    b_ate_hat = {}
    for indi in some_ate:
        b_ate_hat[indi] = {}
        for sec_key in sec_keys:
            pairs = []
            for pos, i in enumerate(partition):
                split_means = [np.mean(some_ate[indi][sec_key][i][sps]) for sps in range(par)]
                pairs.append((np.mean(split_means), tate[pos]))
            b_ate_hat[indi][sec_key] = pairs
    return b_ate_hat

def evaluate(some_ate,tate,par):
    """Summarise bias of the point estimates across partitions.

    Args:
        some_ate: nested mapping estimator -> model key ('True',
            'Main Effect', 'ML') -> partition id -> list of per-split
            arrays/Series of individual effect contributions.
        tate: array of reference values, one per partition, in the order in
            which `partition` is iterated.
        par: number of per-split entries to read for each partition.

    Returns:
        DataFrame indexed by ('stats', 'sec_stats') = (estimator, model key)
        with columns
            BIAS - mean of (estimate - reference) over partitions,
            ESE  - sample standard deviation (ddof=1) of those differences,
            RMSE - root mean squared difference.

    Relies on the notebook-level global `partition`.

    The previous version additionally computed per-partition standard errors,
    confidence limits, their coverage and a summary frame, none of which was
    returned. That unused work is dropped here; the returned frame is built
    from one record per (estimator, model key) instead of repeated
    `pd.concat` calls, and empty input now yields an empty frame.
    """
    sec_keys = ['True','Main Effect','ML']
    records = []
    for indi in some_ate:
        for sec_key in sec_keys:
            # Point estimate per partition: mean over splits of the split means.
            ate_p = [
                np.mean([np.mean(some_ate[indi][sec_key][i][sps]) for sps in range(par)])
                for i in partition
            ]
            BIAS_lst = np.array(ate_p) - tate
            records.append({
                'BIAS': BIAS_lst.mean(),
                'ESE': np.std(BIAS_lst, ddof=1),
                'RMSE': np.sqrt(np.mean(BIAS_lst**2)),
                'stats': indi,
                'sec_stats': sec_key,
            })
    b_evaluate_df = pd.DataFrame(records,columns=['BIAS','ESE','RMSE','stats','sec_stats'])
    # Every group holds exactly one row, so sum() only moves the keys into the index.
    b_p_evaluate_df = b_evaluate_df.groupby(['stats','sec_stats']).sum()
    return b_p_evaluate_df

In [None]:
# generate eigenscores from image samples
# For every partition: load its `sample_num` saved slices, compute their
# eigenscores and project them on two freshly drawn random coefficient
# vectors. The concatenated projections (Za_image, Zb_image) enter the
# treatment and outcome models of the data-generating cell.
part = 100          # number of partitions (simulated data sets)
sample_num = 1500   # images per partition
Za_image = []
Zb_image = []
img_arr = []
eg_score = []
# Explicit loop names are used instead of `_` / `__`, which IPython reserves
# for its output history.
for part_idx in range(part):
    img_arr = []
    # One coefficient per eigenscore row; the eigenscore matrix is N x N with
    # N == sample_num, so the vector length must follow sample_num.
    image_beta_a = np.random.normal(0, 0.5, sample_num)
    image_beta_b = np.random.uniform(-1, 1, sample_num)
    for img_idx in range(sample_num*part_idx,sample_num*(part_idx+1)):
        single_img = np.load('./triple-crossfitting/simu_img/'+str(img_idx)+'.npy')
        img_arr.append(single_img*50)   # get_est_U divides by 50 again
    img_arr = np.array(img_arr)
    eigenscores = get_est_U(X=img_arr,N=len(img_arr),return_eigenscore=True)
    Za_image.append(np.dot(image_beta_a,eigenscores))
    Zb_image.append(np.dot(image_beta_b,eigenscores))
Za_image = np.array(Za_image).reshape(part*sample_num,)
Zb_image = np.array(Zb_image).reshape(part*sample_num,)

In [None]:
# Data-generating process for all partitions at once.
# One row per simulated image, so the sample size follows the image
# projections built above (1500*100 with the settings used there).
SampleNumber = len(Za_image)
sigma1 = np.random.normal(0, 1, SampleNumber)
Z1 = np.random.uniform(-1, 2, SampleNumber)
Z2 = np.random.binomial(1, 0.7, SampleNumber)
Z3 = np.random.binomial(1, 0.5, SampleNumber)
Z4 = np.random.binomial(1, 0.3, SampleNumber)
# Z5: mixture of three discrete components (80% / 15% / remainder), shuffled.
# 16 is listed twice on purpose so that it is drawn with double weight.
a = np.random.choice([12,14,16,16,18,20], size=int(0.8*SampleNumber))
b = np.random.choice([13,15,17,19], size=int(0.15*SampleNumber))
c = np.random.randint(4,12,SampleNumber-int(0.95*SampleNumber))
Z5 = np.concatenate([a,b,c])
np.random.shuffle(Z5)

# Z6: 50/50 mixture of two normal components, shuffled.
d = np.random.normal(72.5, 2, int(0.5*SampleNumber))
e = np.random.normal(77.5, 3, SampleNumber-int(0.5*SampleNumber))
Z6 = np.concatenate([d,e])
np.random.shuffle(Z6)

# Treatment assignment and outcome; the coefficient of X (4) is the true ATE.
X = np.random.binomial(1,expit(-2+2*np.sin(Za_image)-(Z1)**3/5-Z2*(Z3+2)**2/5+Z5**2/125+((Z4+1)*Z6)/80))
log_term = np.log(Z6-Z5-30)
Y = -1+4*X+2*np.cos(Zb_image)**2+(Z1**3)/2+((Z2)**2)+3*Z4*(Z3-3)**2+Z5**2/100+log_term + sigma1

simu_frame = pd.DataFrame({
    'Y': Y,
    'Treatment': X,
    'Z1': Z1,
    'Z2': Z2,
    'Z3': Z3,
    'Z4': Z4,
    'Z5': Z5,
    'Z6': Z6,
    'Za_image': np.sin(Za_image),
    'Zb_image': np.cos(Zb_image)**2,
    # Transformed covariates of the true propensity model.
    'X_01': (Z1)**3,
    # Fixed: the treatment model above uses Z2*(Z3+2)**2; the previous
    # Z2*(Z3+2) is not proportional to it, so the 'True' propensity formula
    # was misspecified.
    'X_02': (Z2)*(Z3+2)**2,
    'X_03': Z5**2/125,
    'X_04': ((Z4+1)*Z6)/80,
    # Transformed covariates of the true outcome model.
    'Y_01': (Z1)**3,
    'Y_02': (Z2)**2,
    'Y_03': Z4*(Z3-3)**2,
    'Y_04': Z5**2/100,
    'Y_05': log_term,
})

all_simuframe = simu_frame

In [None]:
sample_size = 1500  # rows per simulated data set; used below to slice all_simuframe
partition = range(100)  # ids of the simulated data sets the estimation cells iterate over

In [None]:
'''
Please see "Simulation1-2.ipynb" Notes for more details 
'''
# Baseline estimators (G-computation, IPW, doubly robust) with two-fold
# cross-fitting: the models are fitted on one half of every partition and
# evaluated on the other half, once in each direction.
#
# The images are read from the directory that the eigenscore-generation cell
# used to build Za_image / Zb_image; the previous absolute path
# ('D:/.../ADNIGO2/simu_img/') only existed on one machine and was not tied
# to the images behind the simulated confounders.
SIMU_IMG_DIR = './triple-crossfitting/simu_img/'
ev_keys = ['True','Main Effect','ML']
baseline_methods = ['G_Computation','IPW','DR']
baseline_ate = {method: {k: {} for k in ev_keys} for method in baseline_methods}
baseline_time_lst = {method: {} for method in baseline_methods}
b_true_ate = []
b_true_ate1 = []
part1_time = 0                      # time spent on eigenscores and data preparation
ps_tm = {k: 0 for k in ev_keys}     # accumulated propensity-model time per model key
rsp_tm = {k: 0 for k in ev_keys}    # accumulated response-model time per model key
for __ in partition:
    sys.stdout.write('\r'+str(__+1)+'/'+str(len(partition)))
    for k in ev_keys:
        for method in baseline_methods:
            baseline_ate[method][k][__] = []
    # load data
    simu_frame = all_simuframe.iloc[sample_size*__:sample_size*(__+1),:]
    observers = np.array([np.load(SIMU_IMG_DIR+str(_img_)+'.npy')
                          for _img_ in range(sample_size*__,sample_size*(__+1))])
    start = time.perf_counter()
    eigenscores = get_est_U(X=observers,N=len(observers),return_eigenscore=True)
    eigenframe = pd.DataFrame(eigenscores.T,index=simu_frame.index)
    # Cumulative proportion of variance; keep the components before the one
    # that first pushes it above 95%.
    CPV_frame = (eigenframe.var()/eigenframe.var().sum()).cumsum()
    num_of_f = CPV_frame[CPV_frame>0.95].index[0]
    eigenframe.columns = ['es'+str(i) for i in eigenframe.columns]
    eigen_cols = ['es'+str(i) for i in range(num_of_f)]
    simu_frame_a = simu_frame.merge(eigenframe[eigen_cols],left_index=True,right_index=True)
    eigen_formu = '+'.join(eigen_cols)
    t_simu_frame = simu_frame_a
    # Fold label: 0 for the first half of the rows, 1 for the rest (also
    # well defined when the number of rows is odd).
    t_simu_frame['par1'] = (np.arange(len(t_simu_frame)) >= len(t_simu_frame)//2).astype(int)
    part1_time+= time.perf_counter()-start
    for cv in range(2):
        p2_time = time.perf_counter()
        # Independent frames with a fresh 0..n-1 index named 'idx', so that
        # the columns added below never touch a slice of t_simu_frame.
        train_simu_frame = t_simu_frame[t_simu_frame['par1']==cv].reset_index(drop=True).rename_axis('idx')
        test_simu_frame = t_simu_frame[t_simu_frame['par1']!=cv].reset_index(drop=True).rename_axis('idx')

        dr.data_loadin(train_simu_frame,test_simu_frame)
        part1_time += time.perf_counter()-p2_time
        for sec_keys in ev_keys:
            if sec_keys=='True':
                ps_formula = 'Treatment ~Za_image+ X_01 + X_02 + X_03+ X_04'
                rsp_formula = 'Y~Zb_image+Treatment+ Y_01+Y_02+Y_03+Y_04+Y_05'
            else:
                ps_formula = 'Treatment ~ Z1 + Z2 + Z3 + Z4 + Z5 + Z6'
                rsp_formula = 'Y ~ Treatment + Z1 + Z2 + Z3 + Z4 + Z5 + Z6'
                if eigen_formu:     # no dangling '+' when no component is kept
                    ps_formula = ps_formula+'+'+eigen_formu
                    rsp_formula = rsp_formula+'+'+eigen_formu
            ml_flag = (sec_keys=='ML')
            test_simu_frame['propensity_score'],ps_time = dr.ps_Model(ps_formula,ml_method=ml_flag)
            ps_tm[sec_keys]+=ps_time
            test_simu_frame['u0_X'],test_simu_frame['u1_X'],rsp_t = dr.rsp_Model(rsp_formula,ml_method=ml_flag,model_type='continue')
            rsp_tm[sec_keys]+=rsp_t

            baseline_data = test_simu_frame[['Treatment','propensity_score','u1_X','u0_X','Y']].copy()
            baseline_data['Y_F'] = baseline_data['Y']
            trt = baseline_data['Treatment']
            ps = baseline_data['propensity_score']
            mu1 = baseline_data['u1_X']
            mu0 = baseline_data['u0_X']
            y_obs = baseline_data['Y_F']
            # Augmented IPW contributions for the treated / untreated potential outcome.
            baseline_ate_pos = (trt*(y_obs-mu1))/ps + mu1
            baseline_ate_neg = ((1.0-trt)*(y_obs-mu0))/(1.0-ps) + mu0

            DR = baseline_ate_pos-baseline_ate_neg
            G_Computation = mu1 - mu0
            IPW = ((trt*y_obs)/ps)-(((1.0-trt)*y_obs)/(1.0-ps))

            baseline_ate['G_Computation'][sec_keys][__].append(G_Computation)
            baseline_ate['IPW'][sec_keys][__].append(IPW)
            baseline_ate['DR'][sec_keys][__].append(DR)
    sys.stdout.flush()


In [None]:
# Total run time per estimator / model key: shared preparation time plus the
# time of the model(s) that the estimator needs.
timing_rows = [
    ('IPW-ML:', part1_time+ps_tm['ML']),
    ('IPW-MAIN:', part1_time+ps_tm['Main Effect']),
    ('IPW-TRUE:', part1_time+ps_tm['True']),
    ('G-ML:', part1_time+rsp_tm['ML']),
    ('G-MAIN:', part1_time+rsp_tm['Main Effect']),
    ('G-TRUE:', part1_time+rsp_tm['True']),
    ('DR-ML:', part1_time+rsp_tm['ML']+ps_tm['ML']),
    ('DR-MAIN:', part1_time+rsp_tm['Main Effect']+ps_tm['Main Effect']),
    ('DR-TRUE:', part1_time+rsp_tm['True']+ps_tm['True']),
]
for timing_label, timing_total in timing_rows:
    print (timing_label+str(timing_total))

In [None]:
# Reference value 4 for every partition (the coefficient of the treatment in
# the simulated outcome); 2 estimates per partition, one per fold.
e_df = evaluate(some_ate=baseline_ate, tate=np.full(len(partition), 4), par=2)
e_df

In [None]:
# Double cross-fitting (DC_DR): every partition is cut into three blocks; for
# each ordering (test, propensity-train, response-train) of the blocks the
# two nuisance models are fitted on separate blocks and the doubly-robust
# contributions are evaluated on the test block.
#
# Images are read from the directory that the eigenscore-generation cell used
# to build Za_image / Zb_image (the previous absolute 'D:/...' path only
# existed on one machine).
SIMU_IMG_DIR = './triple-crossfitting/simu_img/'
dc_ate = {'DC_DR': {k: {} for k in ev_keys}}
dc_time_lst = {'DC_DR': {k: [] for k in ev_keys}}
part1_time = 0
# Timers are created once so that they accumulate over all partitions, like
# part1_time; previously they were reset inside the partition loop and only
# reflected the last partition.
ps_tm = {k: 0 for k in ev_keys}
rsp_tm = {k: 0 for k in ev_keys}
splits = [1,2,3]
simu_splits = []
# ev_keys = ['True','Main Effect']
for __ in partition:
    simu_frame = all_simuframe.iloc[sample_size*__:sample_size*(__+1),:]
    observers = np.array([np.load(SIMU_IMG_DIR+str(_img_)+'.npy')
                          for _img_ in range(sample_size*__,sample_size*(__+1))])
    start = time.perf_counter()
    eigenscores = get_est_U(X=observers,N=len(observers),return_eigenscore=True)
    eigenframe = pd.DataFrame(eigenscores.T,index=simu_frame.index)
    # Keep the components before the one that first pushes the cumulative
    # proportion of variance above 95%.
    CPV_frame = (eigenframe.var()/eigenframe.var().sum()).cumsum()
    num_of_f = CPV_frame[CPV_frame>0.95].index[0]
    eigenframe.columns = ['es'+str(i) for i in eigenframe.columns]
    eigen_cols = ['es'+str(i) for i in range(num_of_f)]
    simu_frame_a = simu_frame.merge(eigenframe[eigen_cols],left_index=True,right_index=True)
    eigen_formu = '+'.join(eigen_cols)
    t_simu_frame = simu_frame_a

    sys.stdout.write('\r'+str(__+1)+'/'+str(len(partition)))
    # cross-modeling: three blocks, each with a fresh 0..n-1 index named 'idx'
    simu_frame1,simu_frame2,simu_frame3 = [
        block.reset_index(drop=True).rename_axis('idx')
        for block in dc_sample_splitting(t_simu_frame)
    ]
    block_frames = dict(zip(splits,[simu_frame1,simu_frame2,simu_frame3]))

    for kys in ev_keys:
        dc_ate['DC_DR'][kys][__] = []
    combi_lst = (list(itertools.permutations(splits, 3)))
    part1_time+=time.perf_counter()-start
    for combi in combi_lst:
        # Fresh copies so that predictions from one ordering never leak into the next.
        frames = {o: block_frames[o].copy() for o in splits}
        test_frame = frames[combi[0]]
        ps_train_frame = frames[combi[1]]
        rsp_train_frame = frames[combi[2]]
        for sec_keys in ev_keys:
            if sec_keys=='True':
                ps_formula = 'Treatment ~Za_image+ X_01 + X_02 + X_03+ X_04'
                rsp_formula = 'Y~Zb_image+Treatment+ Y_01+Y_02+Y_03+Y_04+Y_05'
            else:
                ps_formula = 'Treatment ~ Z1 + Z2 + Z3 + Z4 + Z5 + Z6'
                rsp_formula = 'Y ~ Treatment + Z1 + Z2 + Z3 + Z4 + Z5 + Z6'
                if eigen_formu:     # no dangling '+' when no component is kept
                    ps_formula = ps_formula+'+'+eigen_formu
                    rsp_formula = rsp_formula+'+'+eigen_formu
            ml_flag = (sec_keys=='ML')
            # propensity score: fitted on one block, predicted on the test block
            dr.data_loadin(ps_train_frame,test_frame)
            test_frame['propensity_score'],ps_time = dr.ps_Model(ps_formula,ml_method=ml_flag)
            ps_tm[sec_keys]+=ps_time
            # response model: fitted on the remaining block
            dr.data_loadin(rsp_train_frame,test_frame)
            test_frame['u0_X'],test_frame['u1_X'],rsp_t = dr.rsp_Model(rsp_formula,ml_method=ml_flag,model_type='continue')
            rsp_tm[sec_keys]+=rsp_t

            data = test_frame[['Treatment','propensity_score','u1_X','u0_X','Y']].copy()
            data['Y_F'] = data['Y']
            ate_pos = (data['Treatment']*(data['Y_F']-data['u1_X']))/data['propensity_score'] + data['u1_X']
            ate_neg = ((1.0-data['Treatment'])*(data['Y_F']-data['u0_X']))/(1.0-data['propensity_score']) + data['u0_X']
            DC_DR = ate_pos-ate_neg

            dc_ate['DC_DR'][sec_keys][__].append(DC_DR)
    sys.stdout.flush()

In [None]:
# Total time per model key: propensity model + response model + preparation.
for timing_key in ('ML', 'Main Effect', 'True'):
    print ((ps_tm[timing_key]+rsp_tm[timing_key]+part1_time))

In [None]:
# Reference value 4 for every partition; 6 estimates per partition, one per
# ordering of the three blocks.
e_df = evaluate(some_ate=dc_ate, tate=np.full(len(partition), 4), par=6)
e_df


In [None]:
import warnings
warnings.filterwarnings('ignore')
# main simulation section
# Triple cross-fitting (TC_DR): every partition is cut into four blocks. One
# block estimates the eigen-image basis, and the other three take the roles
# (test, propensity-train, response-train) in every possible order.
#
# Changes compared with the previous version of this cell:
#   * each partition loads its own images (previously images 0..1499 were
#     loaded for every partition, so they did not match the data rows);
#   * eigenscore columns are rebuilt from scratch for every basis block, so
#     columns left over from an earlier basis block can no longer enter the
#     model formulas;
#   * eigenscore terms are added to the 'Main Effect' and 'ML' formulas only,
#     as in the baseline and double cross-fitting cells;
#   * the model timers accumulate over all partitions, like part1_time.
SIMU_IMG_DIR = './triple-crossfitting/simu_img/'
splits = [1,2,3,4]
part1_time = 0
ps_tm = {k: 0 for k in ev_keys}
rsp_tm = {k: 0 for k in ev_keys}
partition_ate = {'TC_DR': {k: {} for k in ev_keys}}
partition_time_lst = {'TC_DR': {k: [] for k in ev_keys}}
for __ in partition:
    sys.stdout.write('\r'+str(__+1)+'/'+str(len(partition)))
    # cross-modeling
    simu_frame = (all_simuframe.iloc[sample_size*__:sample_size*(__+1),:]
                  .reset_index(drop=True).rename_axis('id'))
    observers = np.array([np.load(SIMU_IMG_DIR+str(_img_)+'.npy')
                          for _img_ in range(sample_size*__,sample_size*(__+1))])

    (true,simu_frame1,image_observer1,simu_frame2,image_observer2,
     simu_frame3,image_observer3,simu_frame4,image_observer4) = sample_splitting(
        simu_frame,observers,0,resample=False,sample_num=sample_size)
    # Blocks keyed by split id, each with a fresh 0..n-1 index named 'idx'.
    base_frames = {
        o: block.reset_index(drop=True).rename_axis('idx')
        for o, block in zip(splits,[simu_frame1,simu_frame2,simu_frame3,simu_frame4])
    }
    image_blocks = dict(zip(splits,[image_observer1,image_observer2,image_observer3,image_observer4]))
    for kys in ev_keys:
        partition_ate['TC_DR'][kys][__] = []
    for basis_split in splits:
        start = time.perf_counter()
        # Eigen-image basis estimated on the basis block only.
        est_u = get_est_U(X=image_blocks[basis_split],N=len(image_blocks[basis_split]))
        est_u_inv = est_u.T     # transpose of the basis (components x features)

        # Project every block on that basis; columns es0, es1, ... follow the
        # component order returned by get_est_U.
        eigenframes = {}
        for o in splits:
            centered = get_X_bar(X=image_blocks[o],N=len(image_blocks[o]))
            scores = np.matmul(est_u_inv,centered.T).T
            eigenframes[o] = pd.DataFrame(scores,columns=['es'+str(j) for j in range(scores.shape[1])])

        # Number of components: entries of the sorted cumulative proportion of
        # variance (basis block) that stay below 95%.
        own_var = eigenframes[basis_split].var()
        CPV_frame = (own_var/own_var.sum()).sort_values(ascending = False).cumsum()
        n_keep = int((CPV_frame<0.95).sum())
        choose_index = ['es'+str(j) for j in range(n_keep)]
        eigen_formu = '+'.join(choose_index)
        # Covariates plus the selected eigenscores, rebuilt for this basis block.
        work_frames = {o: base_frames[o].join(eigenframes[o][choose_index]) for o in splits}
        part1_time+=time.perf_counter()-start

        sec_splits = [o for o in splits if o != basis_split]
        combi_lst = (list(itertools.permutations(sec_splits, 3)))
        for combi in combi_lst:
            # Fresh copies so that predictions from one ordering never leak into the next.
            frames = {o: work_frames[o].copy() for o in splits}
            test_frame = frames[combi[0]]
            ps_train_frame = frames[combi[1]]
            rsp_train_frame = frames[combi[2]]
            for sec_keys in ev_keys:
                if sec_keys=='True':
                    ps_formula = 'Treatment ~Za_image+ X_01 + X_02 + X_03+ X_04'
                    rsp_formula = 'Y~Zb_image+Treatment+ Y_01+Y_02+Y_03+Y_04+Y_05'
                else:
                    ps_formula = 'Treatment ~ Z1 + Z2 + Z3 + Z4 + Z5 + Z6'
                    rsp_formula = 'Y ~ Treatment + Z1 + Z2 + Z3 + Z4 + Z5 + Z6'
                    if eigen_formu:     # no dangling '+' when no component is kept
                        ps_formula += '+'+eigen_formu
                        rsp_formula += '+'+eigen_formu
                ml_flag = (sec_keys=='ML')
                # propensity score: fitted on one block, predicted on the test block
                dr.data_loadin(ps_train_frame,test_frame)
                test_frame['propensity_score'],ps_time = dr.ps_Model(ps_formula,ml_method=ml_flag)
                ps_tm[sec_keys]+=ps_time
                # response model: fitted on the remaining block
                dr.data_loadin(rsp_train_frame,test_frame)
                test_frame['u0_X'],test_frame['u1_X'],rsp_t = dr.rsp_Model(rsp_formula,ml_method=ml_flag,model_type='continue')
                rsp_tm[sec_keys]+=rsp_t

                data = test_frame[['Treatment','propensity_score','u1_X','u0_X','Y']].copy()
                data['Y_F'] = data['Y']
                ate_pos = (data['Treatment']*(data['Y_F']-data['u1_X']))/data['propensity_score'] + data['u1_X']
                ate_neg = ((1.0-data['Treatment'])*(data['Y_F']-data['u0_X']))/(1.0-data['propensity_score']) + data['u0_X']

                TC_DR = ate_pos-ate_neg
                partition_ate['TC_DR'][sec_keys][__].append(TC_DR)
    sys.stdout.flush()


In [None]:
# Total time per model key: propensity model + response model + preparation.
for timing_key in ('ML', 'Main Effect', 'True'):
    print ((ps_tm[timing_key]+rsp_tm[timing_key]+part1_time))

In [None]:
# Reference value 4 for every partition; 24 estimates per partition
# (4 basis blocks x 6 orderings of the remaining three blocks).
e_df = evaluate(some_ate=partition_ate, tate=np.full(len(partition), 4), par=24)
e_df