In [55]:
import numpy as np
import pandas as pd

# Add parent directory to Python path
import sys
from pathlib import Path
parent_dir = Path.cwd().parent
if str(parent_dir) not in sys.path:
    sys.path.insert(0, str(parent_dir))

from nc_csf.models import NCCausalForestDML, NCCausalForestDMLOracle, BaselineCausalForestDML
from sklearn.model_selection import train_test_split

In [56]:
# Load the RHC extract from the notebook's working directory.
df = pd.read_csv("rhc.csv")

# Quick look at the size and schema before any preprocessing.
print(f"Shape of the dataset: {df.shape}")
print(f"Columns: {list(df.columns)}")
df.head(n=5)

Shape of the dataset: (5735, 63)
Columns: ['Unnamed: 0', 'cat1', 'cat2', 'ca', 'sadmdte', 'dschdte', 'dthdte', 'lstctdte', 'death', 'cardiohx', 'chfhx', 'dementhx', 'psychhx', 'chrpulhx', 'renalhx', 'liverhx', 'gibledhx', 'malighx', 'immunhx', 'transhx', 'amihx', 'age', 'sex', 'edu', 'surv2md1', 'das2d3pc', 't3d30', 'dth30', 'aps1', 'scoma1', 'meanbp1', 'wblc1', 'hrt1', 'resp1', 'temp1', 'pafi1', 'alb1', 'hema1', 'bili1', 'crea1', 'sod1', 'pot1', 'paco21', 'ph1', 'swang1', 'wtkilo1', 'dnr1', 'ninsclas', 'resp', 'card', 'neuro', 'gastr', 'renal', 'meta', 'hema', 'seps', 'trauma', 'ortho', 'adld3p', 'urin1', 'race', 'income', 'ptid']


Unnamed: 0.1,Unnamed: 0,cat1,cat2,ca,sadmdte,dschdte,dthdte,lstctdte,death,cardiohx,...,meta,hema,seps,trauma,ortho,adld3p,urin1,race,income,ptid
0,1,COPD,,Yes,11142,11151.0,,11382,No,0,...,No,No,No,No,No,0.0,,white,Under $11k,5
1,2,MOSF w/Sepsis,,No,11799,11844.0,11844.0,11844,Yes,1,...,No,No,Yes,No,No,,1437.0,white,Under $11k,7
2,3,MOSF w/Malignancy,MOSF w/Sepsis,Yes,12083,12143.0,,12400,No,0,...,No,No,No,No,No,,599.0,white,$25-$50k,9
3,4,ARF,,No,11146,11183.0,11183.0,11182,Yes,0,...,No,No,No,No,No,,,white,$11-$25k,10
4,5,MOSF w/Sepsis,,No,12035,12037.0,12037.0,12036,Yes,0,...,No,No,No,No,No,,64.0,white,Under $11k,11


I found two papers online that use the RHC dataset. Their choices of covariates X differ slightly, so I created two dataframes to run the test. Note that I didn't run the oracle model, since we don't actually know the ground truth.

### Tchetgen Tchetgen, E. J., Ying, A., Cui, Y., Shi, X., and Miao, W. An introduction to proximal causal learning. arXiv preprint arXiv:2009.10982, 2020.

In [57]:
# Build the analysis frame for the Tchetgen Tchetgen et al. (2020) specification.
# Encodings branch on "is the column numeric?" instead of `dtype == "O"`, so they
# keep working when pandas stores text columns with a dedicated string dtype
# (where the object-dtype check is False and the text would reach astype(float)).

# Treatment A: 1 where swang1 equals "RHC" (or is positive if already numeric)
A_raw = df["swang1"]
if pd.api.types.is_numeric_dtype(A_raw):
    A = (A_raw.astype(float) > 0).astype(int)
else:
    A = (A_raw == "RHC").astype(int)

# Outcome Y
Y = df["t3d30"]

# Covariates X
X = pd.DataFrame({
    "age": df["age"],
    "sex": df["sex"],
    "race": df["race"]
})

# sex -> 1 for "Female", 0 otherwise
if not pd.api.types.is_numeric_dtype(X["sex"]):
    X["sex"] = (X["sex"] == "Female").astype(int)

# race -> single indicator for "black"; the raw text column is dropped
if not pd.api.types.is_numeric_dtype(X["race"]):
    X["race_black"] = (X["race"] == "black").astype(int)
    X = X.drop(columns=["race"])

# Proxies Z & W
Z = df[["pafi1", "paco21"]].copy()
W = df[["ph1", "hema1"]].copy()

# Z and W already carry their final column names, so no renaming is needed.
analysis_cols = pd.concat(
    [
        Y.rename("Y"),
        A.rename("A"),
        X,
        Z,
        W,
    ],
    axis=1,
)

# Complete-case analysis: drop any row with a missing value in the selected columns.
analysis_df = analysis_cols.dropna().copy()

# Overwrite with cleaned arrays
Y = analysis_df["Y"].values
A = analysis_df["A"].values.astype(int)
X_colnames = [col for col in analysis_df.columns if col not in ["Y", "A", "pafi1", "paco21", "ph1", "hema1"]]
X = analysis_df[X_colnames]
Z = analysis_df[["pafi1", "paco21"]]
W = analysis_df[["ph1", "hema1"]]

print(f"\nFinal X shape: {X.shape}")
analysis_df.head()


Final X shape: (5735, 3)


Unnamed: 0,Y,A,age,sex,race_black,pafi1,paco21,ph1,hema1
0,30,0,70.25098,0,0,68.0,40.0,7.359375,58.0
1,30,1,78.17896,1,0,218.3125,34.0,7.329102,32.5
2,30,1,46.09198,1,0,275.5,16.0,7.359375,21.097656
3,30,0,75.33197,1,0,156.65625,30.0,7.459961,26.296875
4,2,1,67.90997,0,0,478.0,17.0,7.229492,24.0


In [58]:
# Hold out 30% of the rows; a single call keeps X, A, Y, Z and W row-aligned.
(
    X_train, X_test,
    A_train, A_test,
    Y_train, Y_test,
    Z_train, Z_test,
    W_train, W_test,
) = train_test_split(
    X.values, A, Y, Z.values, W.values,
    test_size=0.3,
    random_state=42,
)

print(f"Training samples: {len(X_train)}")
print(f"Test samples: {len(X_test)}")

Training samples: 4014
Test samples: 1721


In [59]:
# Baseline forest: fitted on X, A and Y only (the proxies Z and W are not passed).
baseline = BaselineCausalForestDML(
    n_estimators=200,
    min_samples_leaf=20,
    random_state=42,
)
baseline.fit_baseline(X_train, A_train, Y_train, verbose=False)

# Flatten the held-out effect estimates to 1-D before summarising them.
pred_baseline = baseline.effect(X_test).ravel()

print(f"\nBaseline predictions - Mean: {pred_baseline.mean():.4f}, Std: {pred_baseline.std():.4f}")
print(f"Baseline predictions - Min: {pred_baseline.min():.4f}, Max: {pred_baseline.max():.4f}")


Baseline predictions - Mean: -1.8889, Std: 1.8212
Baseline predictions - Min: -7.6396, Max: 1.5299


In [60]:
# NC-CSF: same forest settings as the baseline, but also given the proxies Z and W.
nccsf = NCCausalForestDML(
    n_estimators=200,
    min_samples_leaf=20,
    cv=5,
    random_state=42,
)
nccsf.fit(
    Y=Y_train,
    T=A_train,
    X=X_train,
    Z=Z_train,
    W=W_train,
)

# Flatten the held-out effect estimates to 1-D before summarising them.
pred_nccsf = nccsf.effect(X_test).ravel()

print(f"\nNC-CSF predictions - Mean: {pred_nccsf.mean():.4f}, Std: {pred_nccsf.std():.4f}")
print(f"NC-CSF predictions - Min: {pred_nccsf.min():.4f}, Max: {pred_nccsf.max():.4f}")


NC-CSF predictions - Mean: -0.9961, Std: 1.8993
NC-CSF predictions - Min: -6.7210, Max: 3.4750


### Sverdrup, E., Cui, Y. Proximal Causal Learning of Conditional Average Treatment Effects

In [61]:
# Build the analysis frame for the Sverdrup & Cui specification.
# Encodings branch on "is the column numeric?" instead of `dtype == "O"`, so they
# keep working when pandas stores text columns with a dedicated string dtype
# (where the object-dtype check is False and the text would reach astype(float)).

# Treatment A: 1 where swang1 equals "RHC" (or is positive if already numeric)
A_raw = df["swang1"]
if pd.api.types.is_numeric_dtype(A_raw):
    A = (A_raw.astype(float) > 0).astype(int)
else:
    A = (A_raw == "RHC").astype(int)

# Outcome Y
Y = df["t3d30"]

# Covariates X
# Note: cat1_coma and cat2_coma are defined here because they do not exist in the original dataset
# reference: https://search.r-project.org/CRAN/refmans/ATbounds/html/RHC.html
# The vectorised comparison yields 0 for missing categories, matching the
# previous row-wise `x in ["Coma"]` check.
X = pd.DataFrame({
    "age": df["age"],
    "sex": df["sex"],
    "cat1_coma": (df["cat1"] == "Coma").astype(int),
    "cat2_coma": (df["cat2"] == "Coma").astype(int),
    "dnr1": df["dnr1"],
    "surv2md1": df["surv2md1"],
    "aps1": df["aps1"],
})

# sex -> 1 for "Female", 0 otherwise
if not pd.api.types.is_numeric_dtype(X["sex"]):
    X["sex"] = (X["sex"] == "Female").astype(int)

# dnr1 -> 1 for "Yes"; "No", missing and any other label all become 0
# (same result as the earlier map({"Yes": 1, "No": 0}).fillna(0)).
if not pd.api.types.is_numeric_dtype(X["dnr1"]):
    X["dnr1"] = (X["dnr1"] == "Yes").astype(int)

# Proxies Z & W
Z = df[["pafi1", "paco21"]].copy()
W = df[["ph1", "hema1"]].copy()

# Z and W already carry their final column names, so no renaming is needed.
analysis_cols = pd.concat(
    [
        Y.rename("Y"),
        A.rename("A"),
        X,
        Z,
        W,
    ],
    axis=1,
)

# Complete-case analysis: drop any row with a missing value in the selected columns.
analysis_df = analysis_cols.dropna().copy()

# Overwrite with cleaned arrays
Y = analysis_df["Y"].values
A = analysis_df["A"].values.astype(int)
X_colnames = [col for col in analysis_df.columns if col not in ["Y", "A", "pafi1", "paco21", "ph1", "hema1"]]
X = analysis_df[X_colnames]
Z = analysis_df[["pafi1", "paco21"]]
W = analysis_df[["ph1", "hema1"]]

print(f"\nFinal X shape: {X.shape}")
analysis_df.head()


Final X shape: (5735, 7)


Unnamed: 0,Y,A,age,sex,cat1_coma,cat2_coma,dnr1,surv2md1,aps1,pafi1,paco21,ph1,hema1
0,30,0,70.25098,0,0,0,0,0.640991,46,68.0,40.0,7.359375,58.0
1,30,1,78.17896,1,0,0,0,0.755,50,218.3125,34.0,7.329102,32.5
2,30,1,46.09198,1,0,0,0,0.317,82,275.5,16.0,7.359375,21.097656
3,30,0,75.33197,1,0,0,0,0.440979,48,156.65625,30.0,7.459961,26.296875
4,2,1,67.90997,0,0,0,1,0.437,72,478.0,17.0,7.229492,24.0


In [62]:
# Hold out 30% of the rows; a single call keeps X, A, Y, Z and W row-aligned.
(
    X_train, X_test,
    A_train, A_test,
    Y_train, Y_test,
    Z_train, Z_test,
    W_train, W_test,
) = train_test_split(
    X.values, A, Y, Z.values, W.values,
    test_size=0.3,
    random_state=42,
)

print(f"Training samples: {len(X_train)}")
print(f"Test samples: {len(X_test)}")

Training samples: 4014
Test samples: 1721


In [63]:
# Baseline forest on the richer covariate set (proxies Z and W are not passed).
baseline = BaselineCausalForestDML(
    n_estimators=200,
    min_samples_leaf=20,
    random_state=42,
)
baseline.fit_baseline(X_train, A_train, Y_train, verbose=True)

# Flatten the held-out effect estimates to 1-D before summarising them.
pred_baseline = baseline.effect(X_test).ravel()

print(f"\nBaseline predictions - Mean: {pred_baseline.mean():.4f}, Std: {pred_baseline.std():.4f}")
print(f"Baseline predictions - Min: {pred_baseline.min():.4f}, Max: {pred_baseline.max():.4f}")


Baseline predictions - Mean: -1.2659, Std: 1.3492
Baseline predictions - Min: -5.8837, Max: 3.0109


In [64]:
# NC-CSF on the richer covariate set, additionally given the proxies Z and W.
nccsf = NCCausalForestDML(
    n_estimators=200,
    min_samples_leaf=20,
    cv=5,
    random_state=42,
)
nccsf.fit(
    Y=Y_train,
    T=A_train,
    X=X_train,
    Z=Z_train,
    W=W_train,
)

# Flatten the held-out effect estimates to 1-D before summarising them.
pred_nccsf = nccsf.effect(X_test).ravel()

print(f"\nNC-CSF predictions - Mean: {pred_nccsf.mean():.4f}, Std: {pred_nccsf.std():.4f}")
print(f"NC-CSF predictions - Min: {pred_nccsf.min():.4f}, Max: {pred_nccsf.max():.4f}")


NC-CSF predictions - Mean: -1.0821, Std: 1.3467
NC-CSF predictions - Min: -5.5421, Max: 3.3506


### Generate semi-synthetic data (use real X, A, Z, W)

**Severity Score Construction for Unmeasured Confounder $U$**

We construct $U$ as a latent severity score from physiological lab values following a Gaussian factor model:

1. **Standardize lab values**: For each lab measurement $L_j$ (where $j \in \{\text{sod1, pot1, crea1, bili1, alb1, pafi1, paco21, ph1, hema1}\}$):
   $$\tilde{L}_j = \frac{L_j - \bar{L}_j}{s_j}$$

2. **Construct weighted score**: Using PCA to obtain weights $w_j$ from the first principal component:
   $$U_i = \sum_{j} w_j \tilde{L}_{ij} + \epsilon_i, \quad \epsilon_i \sim \mathcal{N}(0, \sigma_U^2)$$

3. **Standardize**: Final $U$ is standardized to have mean 0 and variance 1.

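This creates a realistic unmeasured confounder that represents underlying patient severity. In the simulation, $U$ enters the synthetic outcome (and censoring) models directly; treatment $A$ is the real observed assignment, so $U$ is related to treatment only through the real-data association between these lab values and RHC use.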

In [65]:
import math
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

from nc_csf.data_generation import weibull_ph_time_paper, sigmoid

def generate_semi_synthetic_data(
    analysis_df,
    original_df,
    severity_columns=None,
    use_pca=True,
    sigma_u=0.3,
    linear_outcome=True,
    target_censor_rate=0,
    k_t=1.5,
    lam_t=100.0,        # Baseline scale - controls typical survival time
    tau_log_hr=-0.6,    # Log hazard ratio for treatment effect
    beta_u_in_t=0.3,    # Effect of unmeasured confounder U on outcome (reduced from 0.8)
    k_c=1.2,
    lam_c=None,
    beta_u_in_c=0.2,    # Effect of U on censoring (reduced from 0.3)
    censor_lam_lo=1e-8,
    censor_lam_hi=1e6,
    max_censor_calib_iter=60,
    admin_censor_time=None,
    seed=123
):
    """
    Generate semi-synthetic survival data using real X, A, Z, W and synthetic Y.
    U is constructed as a severity score from physiological variables (unmeasured confounder).

    Treatment A is taken as observed from ``analysis_df``; only the event and
    censoring times are simulated. Summary statistics are printed as a side effect.

    Parameters:
    -----------
    analysis_df : pd.DataFrame
        DataFrame containing real X, A, Z, W columns (after dropna). Must contain
        column "A"; every column other than "Y", "A", "pafi1", "paco21", "ph1"
        and "hema1" is treated as a covariate.
    original_df : pd.DataFrame
        Original DataFrame before dropna (for accessing severity columns). It is
        indexed with ``analysis_df.index``, so the two frames must share index labels.
    severity_columns : list
        List of column names to use for severity score U
        Default: ['sod1', 'pot1', 'crea1', 'bili1', 'alb1', 'pafi1', 'paco21', 'ph1', 'hema1']
    use_pca : bool
        If True, use the first principal component of the standardized severity
        columns as the score; if False, use their equal-weight mean.
    sigma_u : float
        Standard deviation of noise added to severity score (default: 0.3)
    linear_outcome : bool
        If True, the event-time linear predictor is linear in X and U. If False,
        squared terms of the first (up to) two covariates, an interaction of the
        first covariate with U, and 0.5 * sigmoid(U) are added.
    target_censor_rate : float
        Desired fraction of censored observations, in [0, 1]. Only used when
        ``lam_c`` is None, in which case the censoring scale is calibrated to it.
    k_t, lam_t : float
        ``k`` and ``lam`` arguments passed to ``weibull_ph_time_paper`` for the
        potential event times (presumably Weibull shape and scale).
    tau_log_hr : float
        Constant added to the linear predictor of the treated potential outcome.
    beta_u_in_t : float
        Coefficient of U in the event-time linear predictor.
    k_c, lam_c : float
        ``k`` and ``lam`` arguments for the censoring times. ``lam_c=None``
        triggers bisection calibration against ``target_censor_rate``.
    beta_u_in_c : float
        Coefficient of U in the censoring linear predictor.
    censor_lam_lo, censor_lam_hi : float
        Bracket for the bisection search over the censoring ``lam``.
    max_censor_calib_iter : int
        Number of bisection iterations.
    admin_censor_time : float or None
        If given, observed times above this value are truncated to it and
        marked as censored (event = 0).
    seed : int
        Seed for the NumPy random generator and for PCA's ``random_state``.

    Returns:
    --------
    observed_df : pd.DataFrame
        DataFrame with semi-synthetic observed data (real X,A,Z,W and synthetic time, event)
    truth_df : pd.DataFrame
        DataFrame with additional ground truth columns (U, T0, T1, C0, C1, CATE, etc.)

    Raises:
    -------
    ValueError
        If ``analysis_df`` is empty, or if calibration is requested
        (``lam_c`` is None) with ``target_censor_rate`` outside [0, 1].
    """

    if severity_columns is None:
        severity_columns = ['sod1', 'pot1', 'crea1', 'bili1', 'alb1', 'pafi1', 'paco21', 'ph1', 'hema1']

    n = len(analysis_df)
    if n == 0:
        raise ValueError("analysis_df is empty; there are no rows to simulate outcomes for.")
    # An out-of-range target would silently push the bisection to one end of
    # its bracket, so reject it before doing any work.
    if lam_c is None and not 0.0 <= target_censor_rate <= 1.0:
        raise ValueError(
            f"target_censor_rate must be in [0, 1], got {target_censor_rate!r}"
        )

    rng = np.random.default_rng(seed)

    # Extract real data from analysis_df. The proxies Z and W are not needed for
    # simulation; they are carried through untouched via analysis_df.copy() below.
    A = analysis_df["A"].values.astype(int)
    X_colnames = [col for col in analysis_df.columns if col not in ["Y", "A", "pafi1", "paco21", "ph1", "hema1"]]
    X_raw = analysis_df[X_colnames].values

    # CRITICAL: Standardize covariates to prevent extreme survival times
    # Without standardization, large values (age≈70, aps1≈50) create very negative η
    X = StandardScaler().fit_transform(X_raw)
    p = X.shape[1]

    # Pull the severity columns for exactly the rows kept in analysis_df
    # (label-based alignment through the shared index).
    severity_data = original_df.loc[analysis_df.index, severity_columns].copy()

    # Handle missing values in severity columns (if any remain)
    severity_data = severity_data.fillna(severity_data.mean())

    # Standardize the lab values: L_tilde = (L - mean(L)) / std(L)
    L_tilde = StandardScaler().fit_transform(severity_data)

    # Create severity score U (unmeasured confounder)
    if use_pca:
        # Use first principal component direction as weights
        pca = PCA(n_components=1, random_state=seed)
        U_score = pca.fit_transform(L_tilde).ravel()
    else:
        # Use equal weights
        U_score = L_tilde.mean(axis=1)

    # NOTE: the order of the rng draws below determines the simulated data for
    # a given seed; do not reorder them.

    # Add Gaussian noise: U = weighted_sum + epsilon, epsilon ~ N(0, sigma_u^2)
    epsilon = rng.normal(scale=sigma_u, size=n)
    U = U_score + epsilon

    # Standardize U to have mean 0 and std 1 for consistency
    U = (U - U.mean()) / U.std()

    # 4) Potential event times T0,T1 (shared uniform u_t)
    # Use smaller scale for beta since X is now standardized
    beta_t = rng.normal(scale=0.1, size=p)
    u_t = rng.random(n)

    # Linear predictor common to both arms; the treated arm adds tau_log_hr.
    eta_base = X @ beta_t + beta_u_in_t * U

    if not linear_outcome:
        # Non-linear: add polynomial terms, an X-by-U interaction and a sigmoid of U
        X_squared = X[:, :min(2, p)] ** 2  # squared terms for first few features
        X_interact = X[:, 0:1] * U.reshape(-1, 1)  # interaction with U

        beta_squared = rng.normal(scale=0.2, size=X_squared.shape[1])
        beta_interact = rng.normal(scale=0.2, size=X_interact.shape[1])

        nonlinear_part = (X_squared @ beta_squared +
                         X_interact @ beta_interact +
                         0.5 * sigmoid(U))
        eta_base = eta_base + nonlinear_part

    eta_t0 = eta_base
    eta_t1 = eta_base + tau_log_hr

    T0 = weibull_ph_time_paper(u_t, k=k_t, lam=lam_t, eta=eta_t0)
    T1 = weibull_ph_time_paper(u_t, k=k_t, lam=lam_t, eta=eta_t1)

    # Realized event time under the observed treatment; also the reference
    # against which the censoring rate is calibrated.
    T = np.where(A == 1, T1, T0)

    # 5) Censoring times (use smaller beta scale for standardized X)
    beta_c = rng.normal(scale=0.1, size=p)
    u_c = rng.random(n)
    eta_c = X @ beta_c + beta_u_in_c * U

    lam_c_used = lam_c
    if lam_c_used is None:
        # Bisection on lam. The update rule assumes the censoring rate falls as
        # lam grows: too little censoring -> shrink the upper bound.
        lo, hi = float(censor_lam_lo), float(censor_lam_hi)
        for _ in range(max_censor_calib_iter):
            mid = 0.5 * (lo + hi)
            C_mid = weibull_ph_time_paper(u_c, k=k_c, lam=mid, eta=eta_c)
            censor_rate_mid = (C_mid < T).mean()
            if censor_rate_mid < target_censor_rate:
                hi = mid
            else:
                lo = mid
        lam_c_used = 0.5 * (lo + hi)

    # The censoring model does not depend on treatment, so C0 and C1 coincide.
    C0 = weibull_ph_time_paper(u_c, k=k_c, lam=lam_c_used, eta=eta_c)
    C1 = weibull_ph_time_paper(u_c, k=k_c, lam=lam_c_used, eta=eta_c)

    # 6) Realized C and observed (time,event)
    C = np.where(A == 1, C1, C0)

    time = np.minimum(T, C)
    event = (T <= C).astype(int)

    if admin_censor_time is not None:
        admin = float(admin_censor_time)
        cens_by_admin = admin < time
        time = np.where(cens_by_admin, admin, time)
        event = np.where(cens_by_admin, 0, event).astype(int)

    # Create observed DataFrame (U is NOT included as it's unmeasured)
    observed_df = analysis_df.copy()
    observed_df["time"] = time
    observed_df["event"] = event

    # Drop original Y if it exists, replace with new outcome
    if "Y" in observed_df.columns:
        observed_df = observed_df.drop(columns=["Y"])

    # Create truth DataFrame with additional ground truth information (including U)
    truth_df = observed_df.copy()
    truth_df.insert(0, "U", U)
    truth_df["T0"] = T0
    truth_df["T1"] = T1
    truth_df["C0"] = C0
    truth_df["C1"] = C1
    truth_df["T"] = T
    truth_df["C"] = C
    truth_df["eta_t0"] = eta_t0
    truth_df["eta_t1"] = eta_t1

    # Add ground truth CATE
    # For Weibull PH model: E[T | η] = λ * Γ(1 + 1/k) * exp(-η/k)
    # CATE = E[T(1) - T(0) | X,U] = λ * Γ(1 + 1/k) * (exp(-η₁/k) - exp(-η₀/k))
    # NOTE(review): this closed form assumes weibull_ph_time_paper uses that
    # parameterisation - confirm against nc_csf.data_generation.
    G = math.gamma(1.0 + 1.0 / k_t)
    cate_xu = lam_t * G * (np.exp(-eta_t1 / k_t) - np.exp(-eta_t0 / k_t))
    truth_df["CATE_XU"] = cate_xu
    truth_df["ITE"] = T1 - T0

    actual_censor_rate = (event == 0).mean()
    print(f"Sample size: {n}")
    print(f"Severity score (U) - Mean: {U.mean():.4f}, Std: {U.std():.4f}")
    print(f"Target censoring rate: {target_censor_rate:.2%}")
    print(f"Actual censoring rate: {actual_censor_rate:.2%}")
    print(f"Treatment proportion: {A.mean():.2%}")
    print(f"Mean time: {time.mean():.2f} days")
    print(f"Mean CATE: {cate_xu.mean():.4f}")
    print(f"tau_log_hr (imposed): {tau_log_hr:.4f}, Hazard Ratio: {np.exp(tau_log_hr):.4f}")

    return observed_df, truth_df

# Simulate outcomes on top of the most recently built analysis_df, using the
# raw frame df to look up the lab values that make up the severity score U.
observed_df, truth_df = generate_semi_synthetic_data(
    analysis_df,
    df,
    severity_columns=[
        'sod1', 'pot1', 'crea1', 'bili1', 'alb1',
        'pafi1', 'paco21', 'ph1', 'hema1',
    ],
    use_pca=True,
    sigma_u=0.3,
    target_censor_rate=0,
    seed=42,
)

# Report the layout of both returned frames, then preview the observed one.
print("\nObserved data shape:", observed_df.shape)
print("Columns:", list(observed_df.columns))
print("\nTruth data shape:", truth_df.shape)
print("Columns:", list(truth_df.columns))
print("\nFirst few rows of observed data:")
observed_df.head(n=5)

Sample size: 5735
Severity score (U) - Mean: 0.0000, Std: 1.0000
Target censoring rate: 0.00%
Actual censoring rate: 0.00%
Treatment proportion: 38.08%
Mean time: 111.95 days
Mean CATE: 46.1049
tau_log_hr (imposed): -0.6000, Hazard Ratio: 0.5488

Observed data shape: (5735, 14)
Columns: ['A', 'age', 'sex', 'cat1_coma', 'cat2_coma', 'dnr1', 'surv2md1', 'aps1', 'pafi1', 'paco21', 'ph1', 'hema1', 'time', 'event']

Truth data shape: (5735, 25)
Columns: ['U', 'A', 'age', 'sex', 'cat1_coma', 'cat2_coma', 'dnr1', 'surv2md1', 'aps1', 'pafi1', 'paco21', 'ph1', 'hema1', 'time', 'event', 'T0', 'T1', 'C0', 'C1', 'T', 'C', 'eta_t0', 'eta_t1', 'CATE_XU', 'ITE']

First few rows of observed data:


Unnamed: 0,A,age,sex,cat1_coma,cat2_coma,dnr1,surv2md1,aps1,pafi1,paco21,ph1,hema1,time,event
0,0,70.25098,0,0,0,0,0.640991,46,68.0,40.0,7.359375,58.0,95.064068,1
1,1,78.17896,1,0,0,0,0.755,50,218.3125,34.0,7.329102,32.5,187.705238,1
2,1,46.09198,1,0,0,0,0.317,82,275.5,16.0,7.359375,21.097656,200.769915,1
3,0,75.33197,1,0,0,0,0.440979,48,156.65625,30.0,7.459961,26.296875,48.647519,1
4,1,67.90997,0,0,0,1,0.437,72,478.0,17.0,7.229492,24.0,19.718504,1


In [66]:
# Split semi-synthetic data into train/test
from sklearn.model_selection import train_test_split

# Covariates are every observed column that is not an outcome, the treatment or a proxy.
X_cols = [
    col
    for col in observed_df.columns
    if col not in {"time", "event", "A", "pafi1", "paco21", "ph1", "hema1"}
]

# Observed quantities
X_semi = observed_df[X_cols].values
A_semi = observed_df["A"].values
Y_semi = observed_df["time"].values
event_semi = observed_df["event"].values
Z_semi = observed_df[["pafi1", "paco21"]].values
W_semi = observed_df[["ph1", "hema1"]].values

# Ground-truth quantities (available only because the outcome is simulated)
U_semi = truth_df["U"].values  # Ground truth unmeasured confounder
CATE_true = truth_df["CATE_XU"].values  # Ground truth CATE

# Train/test split: one call keeps all eight arrays row-aligned.
(
    X_train_semi, X_test_semi,
    A_train_semi, A_test_semi,
    Y_train_semi, Y_test_semi,
    event_train_semi, event_test_semi,
    Z_train_semi, Z_test_semi,
    W_train_semi, W_test_semi,
    U_train_semi, U_test_semi,
    CATE_train_true, CATE_test_true,
) = train_test_split(
    X_semi, A_semi, Y_semi, event_semi, Z_semi, W_semi, U_semi, CATE_true,
    test_size=0.3,
    random_state=42,
)

print(f"Training samples: {len(X_train_semi)}")
print(f"Test samples: {len(X_test_semi)}")
print(f"Event rate (train): {event_train_semi.mean():.2%}")
print(f"Event rate (test): {event_test_semi.mean():.2%}")

Training samples: 4014
Test samples: 1721
Event rate (train): 100.00%
Event rate (test): 100.00%


In [67]:
# 1. Baseline Causal Forest (ignores unmeasured confounding)
print("="*60)
print("BASELINE CAUSAL FOREST")
print("="*60)
baseline_semi = BaselineCausalForestDML(n_estimators=200, min_samples_leaf=20, random_state=42)
baseline_semi.fit_baseline(X_train_semi, A_train_semi, Y_train_semi, verbose=False)
pred_baseline_semi = baseline_semi.effect(X_test_semi).ravel()

print(f"\nBaseline predictions - Mean: {pred_baseline_semi.mean():.4f}, Std: {pred_baseline_semi.std():.4f}")
print(f"Ground truth CATE   - Mean: {CATE_test_true.mean():.4f}, Std: {CATE_test_true.std():.4f}")
print(f"Prediction error (RMSE): {np.sqrt(((pred_baseline_semi - CATE_test_true)**2).mean()):.4f}")

BASELINE CAUSAL FOREST

Baseline predictions - Mean: 43.1096, Std: 7.2258
Ground truth CATE   - Mean: 46.0786, Std: 13.8817
Prediction error (RMSE): 15.0995


In [68]:
# 2. NC-CSF (uses negative control proxies Z, W)
print("\n" + "="*60)
print("NC-CSF (Negative Control Causal Forest)")
print("="*60)

nccsf_semi = NCCausalForestDML(
    n_estimators=200,
    min_samples_leaf=20,
    cv=5,
    random_state=42,
)
nccsf_semi.fit(
    Y=Y_train_semi,
    T=A_train_semi,
    X=X_train_semi,
    Z=Z_train_semi,
    W=W_train_semi,
)

# Held-out effect estimates, flattened for comparison with the true CATE.
pred_nccsf_semi = nccsf_semi.effect(X_test_semi).ravel()

print(f"\nNC-CSF predictions  - Mean: {pred_nccsf_semi.mean():.4f}, Std: {pred_nccsf_semi.std():.4f}")
print(f"Ground truth CATE   - Mean: {CATE_test_true.mean():.4f}, Std: {CATE_test_true.std():.4f}")
print(f"Prediction error (RMSE): {np.sqrt(((pred_nccsf_semi - CATE_test_true)**2).mean()):.4f}")


NC-CSF (Negative Control Causal Forest)

NC-CSF predictions  - Mean: 43.3527, Std: 7.2830
Ground truth CATE   - Mean: 46.0786, Std: 13.8817
Prediction error (RMSE): 15.0935


In [69]:
# 3. Oracle (uses true unmeasured confounder U - only possible with semi-synthetic data!)
print("\n" + "="*60)
print("ORACLE (uses true U)")
print("="*60)

oracle_semi = NCCausalForestDMLOracle(
    n_estimators=200,
    min_samples_leaf=20,
    cv=5,
    random_state=42,
)
# The simulated U is supplied through the W argument in place of the observed proxies.
oracle_semi.fit(
    Y=Y_train_semi,
    T=A_train_semi,
    X=X_train_semi,
    W=U_train_semi,
)

# Held-out effect estimates, flattened for comparison with the true CATE.
pred_oracle_semi = oracle_semi.effect(X_test_semi).ravel()

print(f"\nOracle predictions  - Mean: {pred_oracle_semi.mean():.4f}, Std: {pred_oracle_semi.std():.4f}")
print(f"Ground truth CATE   - Mean: {CATE_test_true.mean():.4f}, Std: {CATE_test_true.std():.4f}")
print(f"Prediction error (RMSE): {np.sqrt(((pred_oracle_semi - CATE_test_true)**2).mean()):.4f}")


ORACLE (uses true U)

Oracle predictions  - Mean: 42.1043, Std: 6.8259
Ground truth CATE   - Mean: 46.0786, Std: 13.8817
Prediction error (RMSE): 14.9316
