In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Generate Data

In [2]:
# Ensure the parent directory is in the Python path
import sys
import os
sys.path.append(os.path.abspath("../.."))

from data import SyntheticDataGenerator

In [3]:
### Generate every synthetic benchmark dataset.
### One HDF5 file is written per causal configuration ("<config>.h5"). Inside it, each
### survival scenario is stored under "Scenario_<X>/data", with the generator's metadata
### attached to that table's storer attributes.

# Settings shared by every configuration.
SCENARIOS = ('A', 'B', 'C', 'D', 'E')
N_SAMPLES = 50000
RANDOM_STATE = 2025

### informative censoring
info_censor_baseline=0.1
info_censor_alpha=0.05
INFO_CENSORING_KWARGS = dict(informative_censoring=True,
                             info_censor_baseline=info_censor_baseline,
                             info_censor_alpha=info_censor_alpha)

# Generator keyword arguments that differ between causal configurations.
# Only the arguments the configuration explicitly sets are listed; everything
# else is left to the generator's defaults. Insertion order is the generation order.
GENERATION_CONFIGS = {
    ### RCT with treatment rate 0.5
    "RCT-50": dict(RCT=True, treatment_proportion=0.5, unobserved=False, overlap=True),
    ### RCT with treatment rate 0.05
    "RCT-5": dict(RCT=True, treatment_proportion=0.05, unobserved=False, overlap=True),
    ### non-RCT (observational study / OBS) with ignorability and overlap holding - propensity score e(X)
    "OBS-CPS": dict(RCT=False, unobserved=False, overlap=True),
    ### non-RCT with ignorability violated and overlap holding - propensity score e(X, U)
    "OBS-UConf": dict(RCT=False, unobserved=True, overlap=True),
    ### non-RCT with ignorability holding but overlap violated - propensity score e(X)_no_overlap
    "OBS-NoPos": dict(RCT=False, unobserved=False, overlap=False),
    ### informative censoring and non-RCT with ignorability and overlap holding - propensity score e(X)
    "OBS-CPS-IC": dict(RCT=False, unobserved=False, overlap=True, **INFO_CENSORING_KWARGS),
    ### informative censoring and non-RCT with ignorability violated and overlap holding - propensity score e(X, U)
    "OBS-UConf-IC": dict(RCT=False, unobserved=True, overlap=True, **INFO_CENSORING_KWARGS),
    ### informative censoring and non-RCT with ignorability holding but overlap violated - propensity score e(X)
    "OBS-NoPos-IC": dict(RCT=False, unobserved=False, overlap=False, **INFO_CENSORING_KWARGS),
}


def generate_config_store(config_name, generator_kwargs, scenarios=SCENARIOS,
                          n_samples=N_SAMPLES, random_state=RANDOM_STATE):
    """Generate all scenarios of one causal configuration into "<config_name>.h5".

    Parameters
    ----------
    config_name : str
        Configuration label; used for the HDF5 file name and as the prefix of
        each dataset name ("<config_name>_Scenario_<X>").
    generator_kwargs : dict
        Extra keyword arguments forwarded unchanged to SyntheticDataGenerator.
    scenarios : iterable of str
        Scenario identifiers to generate.
    n_samples : int
        Number of samples requested from the generator for every scenario.
    random_state : int
        Seed passed to the generator for every scenario.
    """
    with pd.HDFStore(f"{config_name}.h5") as store:
        for scenario in scenarios:
            gen = SyntheticDataGenerator(scenario=scenario,
                                         dataset_name=f"{config_name}_Scenario_{scenario}",
                                         n_samples=n_samples, random_state=random_state,
                                         **generator_kwargs)
            dsets = gen.generate_datasets()
            key = f"Scenario_{scenario}/data"
            store[key] = dsets['data']
            # The metadata dict is stored next to the table via the storer attributes.
            store.get_storer(key).attrs.metadata = dsets['metadata']


for config_name, generator_kwargs in GENERATION_CONFIGS.items():
    generate_config_store(config_name, generator_kwargs)

# Load data

In [4]:
from tqdm import tqdm
import sys
import os
import numpy as np
import pandas as pd
sys.path.append(os.path.abspath("../.."))
from data import load_data, prepare_data_split

# Experiment configuration for the synthetic benchmark.
dataset_name = 'synthetic'
data_dir = './data'
num_repeats = 10
train_size, val_size, test_size = 5000, 2500, 2500

experiment_setups, experiment_repeat_setups = load_data(dataset_name=dataset_name, data_dir=data_dir)

# Keyword arguments shared by every prepare_data_split call below.
split_kwargs = dict(
    num_repeats=num_repeats,
    dataset_name=dataset_name,
    train_size=train_size,
    val_size=val_size,
    test_size=test_size,
)

results_dict = {}
for causal_config_name, setup_dict in tqdm(experiment_setups.items(), desc="Experiment Setups"):
    scenario_results = results_dict[causal_config_name] = {}
    for scenario_key in tqdm(setup_dict, desc=f"{causal_config_name} Scenarios"):
        dataset_df = setup_dict[scenario_key]["dataset"]
        split_dict = prepare_data_split(dataset_df, experiment_repeat_setups, **split_kwargs)
        scenario_results[scenario_key] = {}

        # Unpack the train/val/test tuples of every repeat. Nothing is stored here,
        # so after the loops these names hold the values of the final iteration.
        for rand_idx in range(num_repeats):
            repeat_splits = split_dict[rand_idx]
            X_train, W_train, Y_train, cate_true_train = repeat_splits['train']
            X_val, W_val, Y_val, cate_true_val = repeat_splits['val']
            X_test, W_test, Y_test, cate_true_test = repeat_splits['test']

RCT-50 Scenarios: 100%|██████████| 5/5 [00:00<00:00, 19.42it/s]
RCT-5 Scenarios: 100%|██████████| 5/5 [00:00<00:00, 19.79it/s]s]
OBS-CPS Scenarios: 100%|██████████| 5/5 [00:00<00:00, 19.81it/s]
OBS-UConf Scenarios: 100%|██████████| 5/5 [00:00<00:00, 19.99it/s]
OBS-NoPos Scenarios: 100%|██████████| 5/5 [00:00<00:00, 19.80it/s]
OBS-CPS-IC Scenarios: 100%|██████████| 5/5 [00:00<00:00, 19.56it/s]
OBS-UConf-IC Scenarios: 100%|██████████| 5/5 [00:00<00:00, 20.07it/s]
OBS-NoPos-IC Scenarios: 100%|██████████| 5/5 [00:00<00:00, 20.21it/s]
Experiment Setups: 100%|██████████| 8/8 [00:02<00:00,  3.94it/s]


In [5]:
# Show the 'summary' entry stored for the OBS-CPS configuration, Scenario_A.
experiment_setups['OBS-CPS']['Scenario_A']['summary']

{'censoring_rate': 0.20140000000000002,
 'treatment_rate': 0.50288,
 'event_time_min': 1.572616577566301e-10,
 'event_time_25pct': 0.02785232905009707,
 'event_time_median': 0.171087765901413,
 'event_time_75pct': 0.7376963806187335,
 'event_time_max': 114.26970362205151,
 'event_time_mean': 0.9577137715212123,
 'event_time_std': 2.827344054158918,
 'censoring_time_min': 7.479461987258684e-05,
 'censoring_time_median': 1.4973416658085228,
 'censoring_time_max': 2.9999323915443328,
 'censoring_time_mean': 1.4983175482276327,
 'censoring_time_std': 0.8639638174514367,
 'ate': 0.1634411578102904,
 'cate_min': -80.5517567270787,
 'cate_median': -8.77738203460376e-05,
 'cate_max': 116.22756081705855}

In [6]:
# `split_dict` is the value left over from the last iteration of the loading loop;
# this lists the split names available for repeat index 0.
split_dict[0].keys()

dict_keys(['train', 'val', 'test'])

In [7]:
# List the entries stored for a single scenario (here RCT-50, Scenario_A).
experiment_setups['RCT-50']['Scenario_A'].keys()

dict_keys(['dataset', 'summary', 'metadata'])

In [8]:
# Left over from the final iteration of the loading loop, i.e. the last
# configuration, last scenario and last repeat that were unpacked.
cate_true_train

array([  5,   8, -11, ...,  -2,   1,   2])

In [9]:
# Tabulate the true ATE of every (causal configuration, survival scenario) pair.
causal_configs = list(experiment_setups.keys())
survival_scenarios = list(experiment_setups['RCT-50'].keys())
# Size the buffer from the loaded setups instead of hard-coding (8, 5), so the
# cell keeps working when configurations or scenarios are added or removed.
a = np.zeros((len(causal_configs), len(survival_scenarios)))

print(causal_configs)
print(survival_scenarios)
for c, causal_config in enumerate(causal_configs):
    for s, survival_scenario in enumerate(survival_scenarios):
        ate_true = experiment_setups[causal_config][survival_scenario]['summary']['ate']
        a[c, s] = ate_true

# Rows are causal configurations, columns are survival scenarios.
ate_df = pd.DataFrame(a, index=causal_configs, columns=survival_scenarios)
ate_df

['RCT-50', 'RCT-5', 'OBS-CPS', 'OBS-UConf', 'OBS-NoPos', 'OBS-CPS-IC', 'OBS-UConf-IC', 'OBS-NoPos-IC']
['Scenario_A', 'Scenario_B', 'Scenario_C', 'Scenario_D', 'Scenario_E']


Unnamed: 0,Scenario_A,Scenario_B,Scenario_C,Scenario_D,Scenario_E
RCT-50,0.163441,0.124969,0.74996,0.723925,0.7537
RCT-5,0.163441,0.124969,0.74996,0.723925,0.7537
OBS-CPS,0.163441,0.124969,0.74996,0.723925,0.7537
OBS-UConf,0.003744,0.131728,0.74036,0.830668,0.74032
OBS-NoPos,0.163441,0.124969,0.74996,0.723925,0.7537
OBS-CPS-IC,0.163441,0.124969,0.74996,0.723925,0.7537
OBS-UConf-IC,0.003744,0.131728,0.74036,0.830668,0.74032
OBS-NoPos-IC,0.163441,0.124969,0.74996,0.723925,0.7537


In [10]:
# Tabulate the treatment rate of every (causal configuration, survival scenario) pair.
causal_configs = list(experiment_setups.keys())
survival_scenarios = list(experiment_setups['RCT-50'].keys())
# Size the buffer from the loaded setups instead of hard-coding (8, 5), so the
# cell keeps working when configurations or scenarios are added or removed.
a = np.zeros((len(causal_configs), len(survival_scenarios)))

for c, causal_config in enumerate(causal_configs):
    for s, survival_scenario in enumerate(survival_scenarios):
        # Written straight into the table: the value is a treatment rate, so it
        # must not be stored in (and overwrite) the `ate_true` variable.
        a[c, s] = experiment_setups[causal_config][survival_scenario]['summary']['treatment_rate']

# Rows are causal configurations, columns are survival scenarios.
treatment_rate_df = pd.DataFrame(a, index=causal_configs, columns=survival_scenarios)
treatment_rate_df

Unnamed: 0,Scenario_A,Scenario_B,Scenario_C,Scenario_D,Scenario_E
RCT-50,0.5022,0.5022,0.5022,0.5022,0.5022
RCT-5,0.04924,0.04924,0.04924,0.04924,0.04924
OBS-CPS,0.50288,0.50288,0.50288,0.50288,0.50288
OBS-UConf,0.5389,0.5389,0.5389,0.5389,0.5389
OBS-NoPos,0.50004,0.50004,0.50004,0.50004,0.50004
OBS-CPS-IC,0.50288,0.50288,0.50288,0.50288,0.50288
OBS-UConf-IC,0.5389,0.5389,0.5389,0.5389,0.5389
OBS-NoPos-IC,0.50004,0.50004,0.50004,0.50004,0.50004


In [15]:
# Tabulate the censoring rate of every (causal configuration, survival scenario) pair.
causal_configs = list(experiment_setups.keys())
survival_scenarios = list(experiment_setups['RCT-50'].keys())
# Size the buffer from the loaded setups instead of hard-coding (8, 5), so the
# cell keeps working when configurations or scenarios are added or removed.
a = np.zeros((len(causal_configs), len(survival_scenarios)))

for c, causal_config in enumerate(causal_configs):
    for s, survival_scenario in enumerate(survival_scenarios):
        # Written straight into the table: the value is a censoring rate, so it
        # must not be stored in (and overwrite) the `ate_true` variable.
        a[c, s] = experiment_setups[causal_config][survival_scenario]['summary']['censoring_rate']

# Rows are causal configurations, columns are survival scenarios.
censoring_rate = pd.DataFrame(a, index=causal_configs, columns=survival_scenarios)
censoring_rate

Unnamed: 0,Scenario_A,Scenario_B,Scenario_C,Scenario_D,Scenario_E
RCT-50,0.20318,0.07288,0.39204,0.9126,0.79418
RCT-5,0.19976,0.03638,0.39036,0.88072,0.76988
OBS-CPS,0.2014,0.0663,0.39266,0.91392,0.7893
OBS-UConf,0.20064,0.07322,0.39226,0.9184,0.7953
OBS-NoPos,0.20318,0.08156,0.39254,0.91192,0.8034
OBS-CPS-IC,0.1158,0.05156,0.8846,0.36574,0.92562
OBS-UConf-IC,0.11588,0.05432,0.88806,0.38074,0.92948
OBS-NoPos-IC,0.1163,0.05828,0.89098,0.40292,0.93154


In [12]:
# `causal_config` and `survival_scenario` are loop variables left over from the
# table-building cells, so this displays the dataset of the last pair they visited.
experiment_setups[causal_config][survival_scenario]['dataset']

Unnamed: 0,id,observed_time,event,W,X1,X2,X3,X4,X5,U1,U2,T0,T1,T,C
0,0,6.368170,0,0,0.135488,0.887852,0.932606,0.445568,0.388236,0.151609,0.205535,10,6,10,6.368170
1,1,0.832344,0,1,0.257596,0.657368,0.492617,0.964238,0.800984,0.597208,0.255785,5,10,10,0.832344
2,2,1.014371,0,1,0.455205,0.801058,0.041718,0.769458,0.003171,0.370382,0.223214,10,9,9,1.014371
3,3,1.894672,0,1,0.292809,0.610914,0.913027,0.300115,0.248599,0.038464,0.409829,14,10,10,1.894672
4,4,1.643196,0,0,0.666392,0.987533,0.468270,0.123287,0.916031,0.342961,0.791330,6,14,6,1.643196
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,49995,0.051575,0,0,0.484593,0.998236,0.668208,0.070638,0.960140,0.497815,0.206792,5,11,5,0.051575
49996,49996,2.783642,0,0,0.036391,0.268106,0.043117,0.426886,0.342038,0.812595,0.437775,8,9,8,2.783642
49997,49997,1.786153,0,0,0.061915,0.411210,0.426204,0.414266,0.601355,0.116056,0.416950,14,9,14,1.786153
49998,49998,3.009848,0,0,0.178390,0.656522,0.817355,0.347013,0.060741,0.201218,0.935754,9,9,9,3.009848


In [13]:
# `causal_config` and `survival_scenario` are loop variables left over from the
# table-building cells, so this displays the metadata of the last pair they visited.
experiment_setups[causal_config][survival_scenario]['metadata']

{'dataset_name': 'OBS-NoPos-IC_Scenario_E',
 'scenario': 'E',
 'n_samples': 50000,
 'n_features': 5,
 'RCT': False,
 'unobserved': False,
 'overlap': False,
 'informative_censoring': True,
 'random_state': 2025,
 'propensity_type': 'e(X)_no_overlap',
 'propensity_params': {'beta_a': 2, 'beta_b': 4},
 'T_distribution': 'Poisson',
 'info_censor_baseline': 0.1,
 'info_censor_alpha': 0.05,
 'treatment_proportion': 0.50004,
 'censoring_rate': 0.93154}

In [14]:
# `causal_config` and `survival_scenario` are loop variables left over from the
# table-building cells, so this displays the summary of the last pair they visited.
experiment_setups[causal_config][survival_scenario]['summary']

{'censoring_rate': 0.93154,
 'treatment_rate': 0.50004,
 'event_time_min': 0,
 'event_time_25pct': 6.0,
 'event_time_median': 8.0,
 'event_time_75pct': 10.0,
 'event_time_max': 23,
 'event_time_mean': 8.33718,
 'event_time_std': 2.96637584188428,
 'censoring_time_min': 9.265397924920696e-06,
 'censoring_time_median': 1.3778503502873223,
 'censoring_time_max': 41.816945165478124,
 'censoring_time_mean': 2.1121648349422366,
 'censoring_time_std': 2.3405764188099,
 'ate': 0.7537,
 'cate_min': -18,
 'cate_median': 1.0,
 'cate_max': 20}

In [16]:
from tqdm import tqdm
import sys
import os
sys.path.append(os.path.abspath("../.."))
from data import load_data, prepare_data_split

# Experiment configuration for the actg_syn dataset; split sizes are given as fractions.
dataset_name = 'actg_syn'
data_dir = './data'
cate_true_col = None
num_repeats = 10
train_size, val_size, test_size = 0.5, 0.25, 0.25

experiment_setups, experiment_repeat_setups = load_data(dataset_name=dataset_name, data_dir=data_dir)

# Keyword arguments shared by every prepare_data_split call below.
split_kwargs = dict(
    num_repeats=num_repeats,
    dataset_name=dataset_name,
    train_size=train_size,
    val_size=val_size,
    test_size=test_size,
)

results_dict = {}
for config_name, setup_dict in tqdm(experiment_setups.items(), desc="Experiment Setups"):
    scenario_results = results_dict[config_name] = {}
    for scenario_key in tqdm(setup_dict, desc=f"{config_name} Scenarios"):
        dataset_df = setup_dict[scenario_key]["dataset"]
        split_dict = prepare_data_split(dataset_df, experiment_repeat_setups, **split_kwargs)
        scenario_results[scenario_key] = {}

        # Unpack the train/val/test tuples of every repeat. Nothing is stored here,
        # so after the loops these names hold the values of the final iteration.
        for rand_idx in range(num_repeats):
            repeat_splits = split_dict[rand_idx]
            X_train, W_train, Y_train, cate_true_train = repeat_splits['train']
            X_val, W_val, Y_val, cate_true_val = repeat_splits['val']
            X_test, W_test, Y_test, cate_true_test = repeat_splits['test']

Experiment Setups:   0%|          | 0/1 [00:00<?, ?it/s]

actg_syn Scenarios: 100%|██████████| 1/1 [00:00<00:00, 30.12it/s]
Experiment Setups: 100%|██████████| 1/1 [00:00<00:00, 27.68it/s]


In [17]:
# Show the 'summary' entry of the actg_syn setup, Scenario_A
# (`experiment_setups` was reloaded for actg_syn in the previous cell).
experiment_setups['actg_syn']['Scenario_A']['summary']

{'censoring_rate': 0.511921458625526,
 'treatment_rate': 0.5614773258532024,
 'event_time_min': 0.6536759401597113,
 'event_time_25pct': 25.493062254005324,
 'event_time_median': 33.84489207370351,
 'event_time_75pct': 43.290868915683404,
 'event_time_max': 95.25508215241828,
 'event_time_mean': 34.60805023757897,
 'event_time_std': 14.15818138235396,
 'censoring_time_min': 23.04080751896593,
 'censoring_time_median': 33.333493284505394,
 'censoring_time_max': 44.86151373159986,
 'censoring_time_mean': 33.3661146047237,
 'censoring_time_std': 3.3184348303759723,
 'ate': 5.044362088358642,
 'cate_min': -41.1870648720257,
 'cate_median': 4.957602854450798,
 'cate_max': 43.93721264162399}