In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Generate Data

In [1]:
# Ensure the parent directory is in the Python path
import sys
import os
sys.path.append(os.path.abspath("../.."))

from data import SyntheticDataGenerator

In [None]:
# All eight synthetic benchmark files are produced by the same loop; they differ
# only in the causal / censoring flags handed to SyntheticDataGenerator, so the
# flags live in one table instead of eight copy-pasted blocks.
N_SAMPLES = 50000
RANDOM_STATE = 2025
SCENARIO_LETTERS = ['A', 'B', 'C', 'D', 'E']

### informative censoring
info_censor_baseline = 0.1
info_censor_alpha = 0.05
INFORMATIVE_CENSORING_KWARGS = dict(informative_censoring=True,
                                    info_censor_baseline=info_censor_baseline,
                                    info_censor_alpha=info_censor_alpha)

# Key:   stem of the output file "<key>.h5" and prefix of every dataset_name.
# Value: keyword arguments passed to SyntheticDataGenerator in addition to the
#        shared scenario / dataset_name / n_samples / random_state arguments.
GENERATION_CONFIGS = {
    ### RCT with treatment rate 0.5
    "RCT-50": dict(RCT=True, treatment_proportion=0.5, unobserved=False, overlap=True),
    ### RCT with treatment rate 0.05
    "RCT-5": dict(RCT=True, treatment_proportion=0.05, unobserved=False, overlap=True),
    ### non-RCT (observational study / OBS) with ignorability and overlap held - propensity score e(X)
    "OBS-CPS": dict(RCT=False, unobserved=False, overlap=True),
    ### non-RCT with ignorability violated and overlap held - propensity score e(X, U)
    "OBS-UConf": dict(RCT=False, unobserved=True, overlap=True),
    ### non-RCT with ignorability held but overlap violated - propensity score e(X)_no_overlap
    "OBS-NoPos": dict(RCT=False, unobserved=False, overlap=False),
    ### informative_censoring and non-RCT with ignorability and overlap held - propensity score e(X)
    "OBS-CPS-IC": dict(RCT=False, unobserved=False, overlap=True,
                       **INFORMATIVE_CENSORING_KWARGS),
    ### informative_censoring and non-RCT with ignorability violated and overlap held - propensity score e(X, U)
    "OBS-UConf-IC": dict(RCT=False, unobserved=True, overlap=True,
                         **INFORMATIVE_CENSORING_KWARGS),
    ### informative_censoring and non-RCT with ignorability held but overlap violated - propensity score e(X)
    "OBS-NoPos-IC": dict(RCT=False, unobserved=False, overlap=False,
                         **INFORMATIVE_CENSORING_KWARGS),
}

for config_name, generator_kwargs in GENERATION_CONFIGS.items():
    # One HDF5 file per configuration, written relative to the current working directory.
    with pd.HDFStore(f"{config_name}.h5") as store:
        for i in SCENARIO_LETTERS:
            dataset_name = f'{config_name}_Scenario_{i}'
            gen = SyntheticDataGenerator(scenario=i, dataset_name=dataset_name,
                                         n_samples=N_SAMPLES, random_state=RANDOM_STATE,
                                         **generator_kwargs)
            dsets = gen.generate_datasets()
            store_key = f"Scenario_{i}/data"
            store[store_key] = dsets['data']
            # The metadata is attached to the stored table as an attribute of its
            # storer, so it can be read back next to the data.
            store.get_storer(store_key).attrs.metadata = dsets['metadata']

# Load data

def _summarize_survival_frame(df):
    """Compute the descriptive statistics reported for one dataset.

    Args:
        df: DataFrame providing the columns 'event', 'W', 'T', 'C', 'T0' and 'T1'.

    Returns:
        dict mapping statistic name to a scalar: censoring / treatment rates,
        distribution statistics of the 'T' and 'C' columns, and statistics of
        the per-row difference ``T1 - T0`` (reported as 'ate' / 'cate_*').
    """
    event_time = df['T']
    censoring_time = df['C']
    # Per-row difference, computed once and reused for all four effect statistics.
    # NOTE(review): T0 / T1 are presumably the potential outcomes under control /
    # treatment - confirm against the generator.
    effect = df['T1'] - df['T0']
    return {
        # rates
        'censoring_rate': 1 - df['event'].mean(),
        'treatment_rate': df['W'].mean(),

        # event times
        'event_time_min': event_time.min(),
        'event_time_25pct': event_time.quantile(0.25),
        'event_time_median': event_time.median(),
        'event_time_75pct': event_time.quantile(0.75),
        'event_time_max': event_time.max(),
        'event_time_mean': event_time.mean(),
        'event_time_std': event_time.std(),

        # censoring times
        'censoring_time_min': censoring_time.min(),
        'censoring_time_median': censoring_time.median(),
        'censoring_time_max': censoring_time.max(),
        'censoring_time_mean': censoring_time.mean(),
        'censoring_time_std': censoring_time.std(),

        # treatment effects
        'ate': effect.mean(),
        'cate_min': effect.min(),
        'cate_median': effect.median(),
        'cate_max': effect.max(),
    }


def load_data(dataset_type='synthetic', data_dir='./data/synthetic/'):
    """Load every synthetic benchmark dataset together with the split index table.

    Args:
        dataset_type: Only 'synthetic' is implemented.
        data_dir: Directory containing 'idx_split.csv' and one '<config>.h5'
            file per causal configuration.

    Returns:
        Tuple ``(experiment_setups, experiment_repeat_setups)``:
        ``experiment_setups[causal_config]["Scenario_<letter>"]`` is a dict with
        the keys "dataset" (DataFrame), "summary" (dict of statistics) and
        "metadata" (the object stored in the table's ``metadata`` attribute);
        ``experiment_repeat_setups`` is 'idx_split.csv' indexed by its "idx" column.

    Raises:
        NotImplementedError: If ``dataset_type`` is not 'synthetic'.
    """
    # Reject unsupported types up front so the failure names the offending value.
    if dataset_type != 'synthetic':
        raise NotImplementedError(
            f"dataset_type={dataset_type!r} is not supported; "
            "only 'synthetic' is implemented"
        )

    causal_configs = ["RCT-50",
                      "RCT-5",
                      "OBS-CPS",
                      "OBS-UConf",
                      "OBS-NoPos",
                      "OBS-CPS-IC",
                      "OBS-UConf-IC",
                      "OBS-NoPos-IC"]
    survival_scenarios = ['A', 'B', 'C', 'D', 'E']

    idx_split_file_path = os.path.join(data_dir, 'idx_split.csv')
    experiment_repeat_setups = pd.read_csv(idx_split_file_path).set_index("idx")

    experiment_setups = {}
    for causal_config in causal_configs:
        data_path = os.path.join(data_dir, f'{causal_config}.h5')
        scenario_dict = {}
        # Open each file once and read all of its scenarios, instead of
        # reopening the same store for every scenario.
        with pd.HDFStore(data_path, mode='r') as store:
            for survival_scenario in survival_scenarios:
                dataset_key = f'Scenario_{survival_scenario}/data'
                df = store[dataset_key]
                metadata = store.get_storer(dataset_key).attrs.metadata
                scenario_dict[f"Scenario_{survival_scenario}"] = {
                    "dataset": df,
                    "summary": _summarize_survival_frame(df),
                    "metadata": metadata,
                }
        experiment_setups[causal_config] = scenario_dict

    return experiment_setups, experiment_repeat_setups


In [2]:
from tqdm import tqdm
import sys
import os
sys.path.append(os.path.abspath("../.."))
from data import load_data, prepare_data_split

# Experiment configuration. These values are forwarded unchanged to
# load_data / prepare_data_split below.
num_repeats = 10
dataset_type = 'synthetic'
cate_true_col = None
train_size = 5000
val_size = 2500
test_size = 2500
# The benchmark data lives outside the repository. Set SURVHTE_DATA_DIR to run
# the notebook on another machine; the original location remains the fallback.
data_dir = os.environ.get('SURVHTE_DATA_DIR',
                          '/heinz-georgenas/users/xiaobins/SurvHTE-Benchmark/data')

experiment_setups, experiment_repeat_setups = load_data(dataset_type=dataset_type, data_dir=data_dir)

# Walk every (causal configuration, scenario) pair and unpack each repeat's
# train / val / test split. Nothing is written into the per-scenario dicts of
# results_dict in this cell; they are created empty.
results_dict = {}
for causal_config_name, setup_dict in tqdm(experiment_setups.items(), desc="Experiment Setups"):
    results_dict[causal_config_name] = {}
    for scenario_key in tqdm(setup_dict, desc=f"{causal_config_name} Scenarios"):
        dataset_df = setup_dict[scenario_key]["dataset"]
        split_dict = prepare_data_split(
            dataset_df, experiment_repeat_setups,
            num_repeats=num_repeats,
            dataset_type=dataset_type,
            cate_true_col=cate_true_col,
            train_size=train_size,
            val_size=val_size,
            test_size=test_size
        )
        results_dict[causal_config_name][scenario_key] = {}

        for rand_idx in range(num_repeats):
            # Each split entry unpacks into (X, W, Y, cate_true). The names stay
            # at module level, so later cells see the values of the last
            # iteration only.
            X_train, W_train, Y_train, cate_true_train = split_dict[rand_idx]['train']
            X_val, W_val, Y_val, cate_true_val = split_dict[rand_idx]['val']
            X_test, W_test, Y_test, cate_true_test = split_dict[rand_idx]['test']

RCT-50 Scenarios: 100%|██████████| 5/5 [00:00<00:00, 18.91it/s]
RCT-5 Scenarios: 100%|██████████| 5/5 [00:00<00:00, 18.95it/s]s]
OBS-CPS Scenarios: 100%|██████████| 5/5 [00:00<00:00, 19.30it/s]
OBS-UConf Scenarios: 100%|██████████| 5/5 [00:00<00:00, 19.31it/s]
OBS-NoPos Scenarios: 100%|██████████| 5/5 [00:00<00:00, 19.40it/s]
OBS-CPS-IC Scenarios: 100%|██████████| 5/5 [00:00<00:00, 19.30it/s]
OBS-UConf-IC Scenarios: 100%|██████████| 5/5 [00:00<00:00, 19.30it/s]
OBS-NoPos-IC Scenarios: 100%|██████████| 5/5 [00:00<00:00, 19.30it/s]
Experiment Setups: 100%|██████████| 8/8 [00:02<00:00,  3.82it/s]


In [4]:
from tqdm import tqdm
import sys
import os
sys.path.append(os.path.abspath("../.."))
from data import load_data, prepare_data_split

# Experiment configuration for the 'twin' dataset. Here the split sizes are
# given as fractions (0.5 / 0.25 / 0.25) rather than as row counts.
num_repeats = 10
dataset_name = 'twin'
cate_true_col = None  # defined for parity with the synthetic cell; not passed to prepare_data_split here
train_size = 0.5
val_size = 0.25
test_size = 0.25
# The benchmark data lives outside the repository. Set SURVHTE_DATA_DIR to run
# the notebook on another machine; the original location remains the fallback.
data_dir = os.environ.get('SURVHTE_DATA_DIR',
                          '/heinz-georgenas/users/xiaobins/SurvHTE-Benchmark/data')

# load_data is the function imported from the `data` module in this cell; it is
# called with a `dataset_name` keyword, unlike the load_data defined earlier in
# this notebook, which takes `dataset_type`.
experiment_setups, experiment_repeat_setups = load_data(dataset_name=dataset_name, data_dir=data_dir)

# Walk every (configuration, scenario) pair and unpack each repeat's
# train / val / test split. Nothing is written into the per-scenario dicts of
# results_dict in this cell; they are created empty.
results_dict = {}
for config_name, setup_dict in tqdm(experiment_setups.items(), desc="Experiment Setups"):
    results_dict[config_name] = {}
    for scenario_key in tqdm(setup_dict, desc=f"{config_name} Scenarios"):
        dataset_df = setup_dict[scenario_key]["dataset"]
        split_dict = prepare_data_split(
            dataset_df, experiment_repeat_setups,
            num_repeats=num_repeats,
            dataset_name=dataset_name,
            train_size=train_size,
            val_size=val_size,
            test_size=test_size
        )
        results_dict[config_name][scenario_key] = {}

        for rand_idx in range(num_repeats):
            # Each split entry unpacks into (X, W, Y, cate_true). The names stay
            # at module level, so later cells see the values of the last
            # iteration only.
            X_train, W_train, Y_train, cate_true_train = split_dict[rand_idx]['train']
            X_val, W_val, Y_val, cate_true_val = split_dict[rand_idx]['val']
            X_test, W_test, Y_test, cate_true_test = split_dict[rand_idx]['test']

twin30 Scenarios: 100%|██████████| 1/1 [00:00<00:00,  9.59it/s]
twin180 Scenarios: 100%|██████████| 1/1 [00:00<00:00,  9.25it/s]
Experiment Setups: 100%|██████████| 2/2 [00:00<00:00,  9.13it/s]


In [4]:
# Display the summary statistics stored for Scenario_A of the 'actg_syn' setup.
# NOTE(review): the load cell directly above reports only the setups 'twin30'
# and 'twin180' in its progress output, so this lookup presumably relies on an
# experiment_setups produced by a different load_data call - confirm that the
# notebook still works with Restart & Run All.
experiment_setups['actg_syn']['Scenario_A']['summary']

{'censoring_rate': 0.511921458625526,
 'treatment_rate': 0.5614773258532024,
 'event_time_min': 0.6536759401597113,
 'event_time_25pct': 25.493062254005324,
 'event_time_median': 33.84489207370351,
 'event_time_75pct': 43.290868915683404,
 'event_time_max': 95.25508215241828,
 'event_time_mean': 34.60805023757897,
 'event_time_std': 14.15818138235396,
 'censoring_time_min': 23.04080751896593,
 'censoring_time_median': 33.333493284505394,
 'censoring_time_max': 44.86151373159986,
 'censoring_time_mean': 33.3661146047237,
 'censoring_time_std': 3.3184348303759723,
 'ate': 5.044362088358642,
 'cate_min': -41.1870648720257,
 'cate_median': 4.957602854450798,
 'cate_max': 43.93721264162399}

In [13]:
# Inspect which partitions the first repeat (index 0) of a split contains.
# split_dict is the module-level variable left behind by the split loop, i.e.
# the split of the last (configuration, scenario) pair that was processed.
split_dict[0].keys()

dict_keys(['train', 'val', 'test'])

In [11]:
# List the entries stored for one scenario of the 'RCT-50' configuration
# (requires experiment_setups to hold the synthetic setups).
experiment_setups['RCT-50']['Scenario_A'].keys()

dict_keys(['dataset', 'summary', 'metadata'])

In [8]:
# Display the training-split ground-truth value unpacked by the split loop.
# The name is overwritten on every iteration, so this is the value from the
# last repeat of the last (configuration, scenario) pair processed.
cate_true_train

array([ 4, -2,  5, ..., -1,  0,  3])

In [3]:
# Row / column labels of the table: one row per causal configuration, one
# column per survival scenario (scenario keys are taken from the 'RCT-50' setup).
causal_configs = list(experiment_setups.keys())
survival_scenarios = list(experiment_setups['RCT-50'].keys())

print(causal_configs)
print(survival_scenarios)

# Size the buffer from the labels rather than a hard-coded (8, 5), so the cell
# keeps working when configurations or scenarios are added or removed.
ate_matrix = np.zeros((len(causal_configs), len(survival_scenarios)))
# Plain loops (not a comprehension) on purpose: `causal_config` and
# `survival_scenario` stay defined afterwards and are read by later cells.
for c, causal_config in enumerate(causal_configs):
    for s, survival_scenario in enumerate(survival_scenarios):
        ate_matrix[c, s] = experiment_setups[causal_config][survival_scenario]['summary']['ate']

ate_df = pd.DataFrame(ate_matrix, index=causal_configs, columns=survival_scenarios)
ate_df

['RCT-50', 'RCT-5', 'OBS-CPS', 'OBS-UConf', 'OBS-NoPos', 'OBS-CPS-IC', 'OBS-UConf-IC', 'OBS-NoPos-IC']
['Scenario_A', 'Scenario_B', 'Scenario_C', 'Scenario_D', 'Scenario_E']


Unnamed: 0,Scenario_A,Scenario_B,Scenario_C,Scenario_D,Scenario_E
RCT-50,0.163441,0.124969,0.74996,0.723925,0.74996
RCT-5,0.163441,0.124969,0.74996,0.723925,0.74996
OBS-CPS,0.163441,0.124969,0.74996,0.723925,0.74996
OBS-UConf,0.003744,0.131728,0.74036,0.830668,0.74036
OBS-NoPos,0.163441,0.124969,0.74996,0.723925,0.74996
OBS-CPS-IC,0.163441,0.124969,0.74996,0.723925,0.74996
OBS-UConf-IC,0.003744,0.131728,0.74036,0.830668,0.74036
OBS-NoPos-IC,0.163441,0.124969,0.74996,0.723925,0.74996


In [5]:
# Row / column labels of the table: one row per causal configuration, one
# column per survival scenario (scenario keys are taken from the 'RCT-50' setup).
causal_configs = list(experiment_setups.keys())
survival_scenarios = list(experiment_setups['RCT-50'].keys())

# Size the buffer from the labels rather than a hard-coded (8, 5), so the cell
# keeps working when configurations or scenarios are added or removed.
treatment_rate_matrix = np.zeros((len(causal_configs), len(survival_scenarios)))
# Plain loops (not a comprehension) on purpose: `causal_config` and
# `survival_scenario` stay defined afterwards and are read by the cells below.
for c, causal_config in enumerate(causal_configs):
    for s, survival_scenario in enumerate(survival_scenarios):
        treatment_rate = experiment_setups[causal_config][survival_scenario]['summary']['treatment_rate']
        treatment_rate_matrix[c, s] = treatment_rate

treatment_rate_df = pd.DataFrame(treatment_rate_matrix,
                                 index=causal_configs, columns=survival_scenarios)
treatment_rate_df

Unnamed: 0,Scenario_A,Scenario_B,Scenario_C,Scenario_D,Scenario_E
RCT-50,0.5022,0.5022,0.5022,0.5022,0.5022
RCT-5,0.04924,0.04924,0.04924,0.04924,0.04924
OBS-CPS,0.50288,0.50288,0.50288,0.50288,0.50288
OBS-UConf,0.5389,0.5389,0.5389,0.5389,0.5389
OBS-NoPos,0.50004,0.50004,0.50004,0.50004,0.50004
OBS-CPS-IC,0.50288,0.50288,0.50288,0.50288,0.50288
OBS-UConf-IC,0.5389,0.5389,0.5389,0.5389,0.5389
OBS-NoPos-IC,0.50004,0.50004,0.50004,0.50004,0.50004


In [6]:
# Display the dataset of the last (configuration, scenario) pair: causal_config
# and survival_scenario are the loop variables left over from the table-building
# loop above, so they point at its final iteration.
# NOTE(review): this renders the whole frame; `.head()` would keep the output small.
experiment_setups[causal_config][survival_scenario]['dataset']

Unnamed: 0,id,observed_time,event,W,X1,X2,X3,X4,X5,U1,U2,T0,T1,T,C
0,0,0.097355,0,0,0.135488,0.887852,0.932606,0.445568,0.388236,0.151609,0.205535,9,9,9,0.097355
1,1,1.441993,0,1,0.257596,0.657368,0.492617,0.964238,0.800984,0.597208,0.255785,5,9,9,1.441993
2,2,0.010262,0,1,0.455205,0.801058,0.041718,0.769458,0.003171,0.370382,0.223214,6,7,7,0.010262
3,3,5.000000,1,1,0.292809,0.610914,0.913027,0.300115,0.248599,0.038464,0.409829,11,5,5,10.901263
4,4,1.021807,0,0,0.666392,0.987533,0.468270,0.123287,0.916031,0.342961,0.791330,10,8,10,1.021807
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,49995,0.989154,0,0,0.484593,0.998236,0.668208,0.070638,0.960140,0.497815,0.206792,7,11,7,0.989154
49996,49996,3.776265,0,0,0.036391,0.268106,0.043117,0.426886,0.342038,0.812595,0.437775,6,6,6,3.776265
49997,49997,3.000000,1,0,0.061915,0.411210,0.426204,0.414266,0.601355,0.116056,0.416950,3,6,3,8.417143
49998,49998,0.453547,0,0,0.178390,0.656522,0.817355,0.347013,0.060741,0.201218,0.935754,6,7,6,0.453547


In [8]:
# Display the stored metadata of the last (configuration, scenario) pair;
# causal_config / survival_scenario are leftover loop variables from the
# table-building loop above.
experiment_setups[causal_config][survival_scenario]['metadata']

{'dataset_name': 'OBS-NoPos-IC_Scenario_E',
 'scenario': 'E',
 'n_samples': 50000,
 'n_features': 5,
 'RCT': False,
 'unobserved': False,
 'overlap': False,
 'informative_censoring': True,
 'random_state': 2025,
 'propensity_type': 'e(X)_no_overlap',
 'propensity_params': {'beta_a': 2, 'beta_b': 4},
 'T_distribution': 'Poisson',
 'info_censor_baseline': 0.1,
 'info_censor_alpha': 0.05,
 'treatment_proportion': 0.50004,
 'censoring_rate': 0.89098}

In [7]:
# Display the summary statistics of the last (configuration, scenario) pair;
# causal_config / survival_scenario are leftover loop variables from the
# table-building loop above.
experiment_setups[causal_config][survival_scenario]['summary']

{'censoring_rate': 0.89098,
 'treatment_rate': 0.50004,
 'event_time_min': 0,
 'event_time_25pct': 5.0,
 'event_time_median': 7.0,
 'event_time_75pct': 9.0,
 'event_time_max': 21,
 'event_time_mean': 7.32416,
 'event_time_std': 2.8018346310844477,
 'censoring_time_min': 1.1151581202372974e-05,
 'censoring_time_median': 1.5367217188340097,
 'censoring_time_max': 41.061032788143805,
 'censoring_time_mean': 2.378661940522167,
 'censoring_time_std': 2.6699140555043095,
 'ate': 0.74996,
 'cate_min': -16,
 'cate_median': 1.0,
 'cate_max': 20}

In [9]:
# Display the results container. The split loops only create an empty dict per
# (configuration, scenario) pair and never store anything in it, so this shows
# a nested dict of empty dicts.
results_dict

{'RCT-50': {'Scenario_A': {},
  'Scenario_B': {},
  'Scenario_C': {},
  'Scenario_D': {},
  'Scenario_E': {}},
 'RCT-5': {'Scenario_A': {},
  'Scenario_B': {},
  'Scenario_C': {},
  'Scenario_D': {},
  'Scenario_E': {}},
 'OBS-CPS': {'Scenario_A': {},
  'Scenario_B': {},
  'Scenario_C': {},
  'Scenario_D': {},
  'Scenario_E': {}},
 'OBS-UConf': {'Scenario_A': {},
  'Scenario_B': {},
  'Scenario_C': {},
  'Scenario_D': {},
  'Scenario_E': {}},
 'OBS-NoPos': {'Scenario_A': {},
  'Scenario_B': {},
  'Scenario_C': {},
  'Scenario_D': {},
  'Scenario_E': {}},
 'OBS-CPS-IC': {'Scenario_A': {},
  'Scenario_B': {},
  'Scenario_C': {},
  'Scenario_D': {},
  'Scenario_E': {}},
 'OBS-UConf-IC': {'Scenario_A': {},
  'Scenario_B': {},
  'Scenario_C': {},
  'Scenario_D': {},
  'Scenario_E': {}},
 'OBS-NoPos-IC': {'Scenario_A': {},
  'Scenario_B': {},
  'Scenario_C': {},
  'Scenario_D': {},
  'Scenario_E': {}}}