In [1]:
import pandas as pd
import numpy as np

def safe_divide(numerator, denominator):
    """Divide element-wise, yielding 0 wherever the denominator is 0.

    Parameters
    ----------
    numerator, denominator : scalar or array-like
        Operands; they are broadcast against each other.

    Returns
    -------
    numpy.ndarray
        Float array of the broadcast shape (0-d when both operands are
        scalars) holding ``numerator / denominator`` where
        ``denominator != 0`` and ``0`` everywhere else.
    """
    numerator = np.asarray(numerator, dtype=float)
    denominator = np.asarray(denominator, dtype=float)
    # Pre-fill with the fallback value and divide only where the quotient is
    # defined. Selecting with np.where after a plain `numerator / denominator`
    # would still evaluate every quotient first, which warns for NumPy
    # operands and raises ZeroDivisionError for plain Python numbers.
    quotient = np.zeros(np.broadcast(numerator, denominator).shape, dtype=float)
    np.divide(numerator, denominator, out=quotient, where=denominator != 0)
    return quotient

def _trapezoid(y, x):
    """Trapezoidal-rule integral of ``y`` over ``x``.

    NumPy 2.0 introduced ``np.trapezoid`` and deprecated ``np.trapz``; use
    whichever one the installed NumPy provides.
    """
    integrate = getattr(np, "trapezoid", None)
    if integrate is None:
        integrate = np.trapz
    return integrate(y, x)


def _from_origin(curve):
    """Prepend a 0 so the curve starts at the origin."""
    return np.concatenate([[0], curve])


def metrics_fast(df, perc=10):
    """Compute seven curve-area metrics for a ranked treatment/control sample.

    Rows are ranked by the ``ITE`` column, highest first. Running counts of
    the four ``Obs``/``Out`` cells along that ranking are combined into
    several curves, and each curve is integrated with the trapezoidal rule.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``'ITE'`` (ranking score), ``'Obs'`` and
        ``'Out'``. ``Obs`` and ``Out`` must be 0/1 values of integer or bool
        dtype because they are combined with bitwise ``&``. Rows with
        ``Obs == 1`` feed the ``*_T`` tallies and rows with ``Obs == 0`` the
        ``*_C`` tallies; ``Out == 1`` / ``Out == 0`` feed the ``N1`` / ``N0``
        tallies.
    perc : int or float, default 10
        Percentage of the top-ranked rows used for ``QS10``.

    Returns
    -------
    tuple
        ``(QS10, QS, TOCS, ROCiniS, pROCiniS, CROCS, pTOCS)``. ``QS10`` is
        the Qini-style area restricted to the top ``perc`` percent of rows
        and is ``0.0`` when that slice contains no rows; the remaining six
        values are areas computed over the full ranking.
    """
    df = df.sort_values(by='ITE', ascending=False).reset_index(drop=True)

    # Work on raw NumPy arrays for speed.
    obs = df['Obs'].values
    out = df['Out'].values
    not_obs = 1 - obs
    not_out = 1 - out

    # Running counts of each (group, outcome) cell along the ranking.
    cum_N1_T = np.cumsum(obs & out)
    cum_N0_T = np.cumsum(obs & not_out)
    cum_N1_C = np.cumsum(not_obs & out)
    cum_N0_C = np.cumsum(not_obs & not_out)

    # Totals over the whole sample are the last running counts.
    N_1T = cum_N1_T[-1]
    N_0T = cum_N0_T[-1]
    N_1C = cum_N1_C[-1]
    N_0C = cum_N0_C[-1]

    N_T = N_1T + N_0T
    N_C = N_1C + N_0C
    L = len(df)

    # Fraction of the ranking covered, including the origin.
    Xphi = np.arange(L + 1) / L
    N1_T_over_N_T = N_1T / N_T
    N1_C_over_N_C = N_1C / N_C

    # Qini-style curve: cumulative rate difference minus the straight line
    # that reaches the overall rate difference at the end of the ranking.
    Qinilist = safe_divide(cum_N1_T, N_T) - safe_divide(cum_N1_C, N_C) - \
               (Xphi[1:] * (N1_T_over_N_T - N1_C_over_N_C))
    QS = _trapezoid(_from_origin(Qinilist), Xphi)

    TOCY = safe_divide(cum_N1_T, cum_N1_T + cum_N0_T) - \
           safe_divide(cum_N1_C, cum_N1_C + cum_N0_C) + \
           N1_C_over_N_C - N1_T_over_N_T
    TOCS = _trapezoid(_from_origin(TOCY), Xphi)

    ROCiniY = safe_divide(cum_N1_T, N_1T) - \
              safe_divide(cum_N0_T, N_0T) + \
              safe_divide(cum_N0_C, N_0C) - \
              safe_divide(cum_N1_C, N_1C)
    ROCiniS = _trapezoid(_from_origin(ROCiniY), Xphi)

    pROCiniX = _from_origin((safe_divide(cum_N0_T, N_0T) + safe_divide(cum_N1_C, N_1C)) / 2)
    pROCiniY = _from_origin((safe_divide(cum_N1_T, N_1T) + safe_divide(cum_N0_C, N_0C)) / 2)
    pROCiniS = _trapezoid(pROCiniY, pROCiniX)

    pTOCX = _from_origin(safe_divide(cum_N1_C, cum_N1_C + cum_N0_C) / N1_C_over_N_C)
    pTOCY = _from_origin(safe_divide(cum_N1_T, cum_N1_T + cum_N0_T) / N1_T_over_N_T)
    pTOCS = _trapezoid(pTOCY, pTOCX)

    CROCX = _from_origin(safe_divide(cum_N0_T + cum_N1_C, N_0T + N_1C))
    CROCY = _from_origin(safe_divide(cum_N1_T + cum_N0_C, N_1T + N_0C))
    CROCS = _trapezoid(CROCY, CROCX)

    # Top-`perc` slice. The rows are already ranked, so the cumulative sums of
    # the slice are simply prefixes of the full cumulative sums.
    cutoff_index = int(L * perc / 100)
    cum_N1_T_10 = cum_N1_T[:cutoff_index]
    cum_N1_C_10 = cum_N1_C[:cutoff_index]
    L_10 = len(cum_N1_T_10)

    if L_10 == 0:
        # No rows in the slice (e.g. L * perc / 100 < 1): nothing to
        # integrate, and indexing [-1] below would raise IndexError.
        QS10 = 0.0
    else:
        last = L_10 - 1
        N_1T_10 = cum_N1_T_10[last]
        N_1C_10 = cum_N1_C_10[last]
        N_T_10 = N_1T_10 + cum_N0_T[last]
        N_C_10 = N_1C_10 + cum_N0_C[last]

        Xphi_10 = np.arange(L_10 + 1) / L_10
        N1_T_over_N_T_10 = safe_divide(N_1T_10, N_T_10)
        N1_C_over_N_C_10 = safe_divide(N_1C_10, N_C_10)
        # The baseline is normalised by the slice length L_10 (not the full
        # length L), so it spans the same x-range as Xphi_10 and the curve
        # ends at 0, mirroring the full-sample Qini curve above.
        Qinilist_10 = safe_divide(cum_N1_T_10, N_T_10) - \
                      safe_divide(cum_N1_C_10, N_C_10) - \
                      (Xphi_10[1:] * (N1_T_over_N_T_10 - N1_C_over_N_C_10))
        QS10 = _trapezoid(_from_origin(Qinilist_10), Xphi_10)

    return QS10, QS, TOCS, ROCiniS, pROCiniS, CROCS, pTOCS





In [6]:
import numpy as np
import pandas as pd
import joblib
from tqdm import tqdm
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap


def draw_ITE(PC, mean, std):
    """Draw one normal value per entry of ``PC`` so that ``PC + value`` is in [0, 1].

    Every entry is first sampled from ``Normal(mean, std)`` via the global
    ``np.random`` state. Entries whose sum with ``PC`` falls below 0 or above
    1 are redrawn, repeatedly, until none remain.

    Parameters
    ----------
    PC : array-like
        Values the draws are added to; its length sets the number of draws.
    mean, std : float
        Location and scale passed to ``np.random.normal``.

    Returns
    -------
    numpy.ndarray
        The accepted draws, one per entry of ``PC``.
    """
    n_draws = len(PC)
    effects = np.random.normal(mean, std, n_draws)
    rejected = (PC + effects < 0) | (PC + effects > 1)
    # Rejection sampling: only the offending positions are resampled.
    while np.any(rejected):
        effects[rejected] = np.random.normal(mean, std, np.sum(rejected))
        rejected = (PC + effects < 0) | (PC + effects > 1)
    return effects

def draw_ITEn(PC, ITE, mean, std):
    """Add normal noise to ``ITE`` while keeping ``PC + result`` inside [0, 1].

    Each entry starts as ``ITE + Normal(mean, std)`` (global ``np.random``
    state). Entries whose sum with ``PC`` leaves [0, 1] get fresh noise added
    to the original ``ITE`` value, repeatedly, until none remain.

    Parameters
    ----------
    PC : array-like
        Values the noisy result is added to for the range check; its length
        sets the number of draws.
    ITE : numpy.ndarray
        Base values the noise is added to (indexed with a boolean mask).
    mean, std : float
        Location and scale passed to ``np.random.normal``.

    Returns
    -------
    numpy.ndarray
        The noisy values, one per entry of ``PC``.
    """
    n_draws = len(PC)
    noisy = ITE + np.random.normal(mean, std, n_draws)
    rejected = (noisy + PC < 0) | (noisy + PC > 1)
    # Rejection sampling: re-noise only the offending positions.
    while np.any(rejected):
        fresh_noise = np.random.normal(mean, std, np.sum(rejected))
        noisy[rejected] = ITE[rejected] + fresh_noise
        rejected = (noisy + PC < 0) | (noisy + PC > 1)
    return noisy


# Function to generate the filenames based on your parameters
def generate_filenames(Nrange, Mrange, beta_params):
    """List the result filenames for every (N, (alpha, beta), M) combination.

    Parameters
    ----------
    Nrange : iterable
        Sample sizes; inserted verbatim after ``N`` in the filename.
    Mrange : iterable of float
        M values; each is encoded as ``M * 100`` rounded to the nearest
        integer. They are emitted in ascending order.
    beta_params : iterable of (alpha, beta) pairs
        Inserted verbatim after ``alpha`` / ``beta`` in the filename.

    Returns
    -------
    list of str
        Filenames ordered by N, then by (alpha, beta) pair, then by M.
    """
    ordered_M = sorted(Mrange)
    # Round before converting: plain int() truncates floating-point error
    # (0.29 * 100 == 28.999999999999996 would become 28), which can give two
    # different M values the same filename.
    return [
        f"results_N{N}_M{int(round(M * 100))}_alpha{alpha}_beta{beta}.feather"
        for N in Nrange
        for alpha, beta in beta_params
        for M in ordered_M
    ]


In [7]:
def QiniSimBeta(r, Nrange, RNrange, beta_params, Mrange):
    """Run the simulation grid and write one feather file per (N, alpha, beta, M).

    For every combination of ``N``, ``(alpha, beta)`` and ``M`` the function
    repeats ``r`` times:

    1. draw ``PC`` from ``Beta(alpha, beta)`` (``N`` values),
    2. draw ``ITE`` with ``draw_ITE(PC, 0, M)`` and set ``PT = PC + ITE``,
    3. draw ``Obs`` as Bernoulli(0.5) and the outcomes ``OutC`` / ``OutT`` as
       Bernoulli(``PC``) / Bernoulli(``PT``); ``Out`` takes ``OutT`` where
       ``Obs`` is 1 and ``OutC`` otherwise,
    4. for each ``RN`` in ``RNrange``, perturb ``ITE`` with
       ``draw_ITEn(PC, ITE, 0, RN)``, rank by it and evaluate
       ``metrics_fast``.

    All random draws use the global ``np.random`` state.

    Parameters
    ----------
    r : int
        Number of repetitions per parameter combination.
    Nrange : iterable of int
        Sample sizes.
    RNrange : iterable of float
        Standard deviations of the noise added to ``ITE``.
    beta_params : iterable of (alpha, beta) pairs
        Beta-distribution parameters for ``PC``.
    Mrange : iterable of float
        Standard deviations passed to ``draw_ITE``.

    Returns
    -------
    str
        The literal ``"saved all"`` once every file has been written.
    """
    for N in Nrange:
        for alpha_val, beta_val in beta_params:
            for M in Mrange:
                # Round before converting: int() alone truncates float error
                # (0.29 * 100 == 28.999999999999996 -> 28), so two different
                # M values could overwrite each other's results file.
                M_key = int(round(M * 100))
                all_results = []

                for i in tqdm(range(r), desc=f"Processing {N} samples, M={M}, alpha={alpha_val}, beta={beta_val}"):
                    PC = np.random.beta(alpha_val, beta_val, N)
                    ITE = draw_ITE(PC, 0, M)
                    PT = PC + ITE

                    Obs = np.random.binomial(1, 0.5, N)
                    OutC = np.random.binomial(1, PC, N)
                    OutT = np.random.binomial(1, PT, N)
                    Out = np.where(Obs, OutT, OutC)

                    # The same simulated sample is re-ranked under each
                    # noise level.
                    for RN in RNrange:
                        ITEn = draw_ITEn(PC, ITE, 0, RN)

                        df = pd.DataFrame({
                            'Obs': Obs,
                            'Out': Out,
                            'ITE': ITEn
                        })

                        QS10, QS, TOCS, ROCiniS, pROCiniS, CROCS, pTOCS = metrics_fast(df)

                        all_results.append({
                            'N': N,
                            'alpha': alpha_val,
                            'beta': beta_val,
                            'M': M,
                            'RN': RN,
                            'iteration': i,
                            'QS10': QS10,
                            'QS': QS,
                            'TOCS': TOCS,
                            'ROCiniS': ROCiniS,
                            'pROCiniS': pROCiniS,
                            'CROCS': CROCS,
                            'pTOCS': pTOCS
                        })

                # One row per (iteration, RN) for this parameter combination.
                result_df = pd.DataFrame(all_results)
                filename = f"results_N{N}_M{M_key}_alpha{alpha_val}_beta{beta_val}.feather"
                result_df.to_feather(filename)
                print(f"Saving file: {filename}")

    return "saved all"

In [None]:
# Main script to run the simulation and save results

# Fixed seed so a Restart & Run All reproduces the same files. Change it when
# launching independent replicate runs, otherwise they will be identical.
SEED = 42
np.random.seed(SEED)

r = 1000000  # Number of repetitions
Nrange = [1000]
RNrange = [0, 0.025, 0.05, 0.075, 0.1]
beta_params = [(15, 15), (25, 25), (25, 5), (5, 25), (0.5, 0.5), (50, 10), (10, 50)]

# Alternative settings (commented out):
#RNrange = [0, 0.05,0.1,0.15, 0.2, 0.25,0.3,0.35,0.4]
#beta_params = [(2, 2)]
#beta_params = [(15, 15), (25, 25)]
#Mrange = [0.2]

# M grid 0.10, 0.15, ..., 0.40 built from integers and then scaled. The
# float-step np.arange(0.1, 0.4, 0.05) produced these same seven points only
# because of floating-point rounding at the endpoint; the integer grid makes
# the included 0.40 explicit. Use np.arange(10, 40, 5) / 100 to stop at 0.35.
Mrange = np.arange(10, 45, 5) / 100


# Generate the filenames
filenames = generate_filenames(Nrange, Mrange, beta_params)


QiniSimBeta(r, Nrange, RNrange, beta_params, Mrange)