In [None]:
import numpy as np
import pandas as pd

# Put the parent of the current working directory at the front of sys.path
# (only if it is not already listed) before the project imports that follow.
import sys
from pathlib import Path
parent_dir = Path.cwd().parent
parent_dir_str = str(parent_dir)
if parent_dir_str not in sys.path:
    sys.path.insert(0, parent_dir_str)

from nc_csf.models import NCCausalForestDML, NCCausalForestDMLOracle, BaselineCausalForestDML
from sklearn.model_selection import train_test_split

In [56]:
# Read the RHC table; the path is relative to the current working directory.
df = pd.read_csv('rhc.csv')

# Print the dimensions and column names, then preview the first rows.
print(
    f"Shape of the dataset: {df.shape}",
    f"Columns: {list(df.columns)}",
    sep="\n",
)
df.head()

Shape of the dataset: (5735, 63)
Columns: ['Unnamed: 0', 'cat1', 'cat2', 'ca', 'sadmdte', 'dschdte', 'dthdte', 'lstctdte', 'death', 'cardiohx', 'chfhx', 'dementhx', 'psychhx', 'chrpulhx', 'renalhx', 'liverhx', 'gibledhx', 'malighx', 'immunhx', 'transhx', 'amihx', 'age', 'sex', 'edu', 'surv2md1', 'das2d3pc', 't3d30', 'dth30', 'aps1', 'scoma1', 'meanbp1', 'wblc1', 'hrt1', 'resp1', 'temp1', 'pafi1', 'alb1', 'hema1', 'bili1', 'crea1', 'sod1', 'pot1', 'paco21', 'ph1', 'swang1', 'wtkilo1', 'dnr1', 'ninsclas', 'resp', 'card', 'neuro', 'gastr', 'renal', 'meta', 'hema', 'seps', 'trauma', 'ortho', 'adld3p', 'urin1', 'race', 'income', 'ptid']


Unnamed: 0.1,Unnamed: 0,cat1,cat2,ca,sadmdte,dschdte,dthdte,lstctdte,death,cardiohx,...,meta,hema,seps,trauma,ortho,adld3p,urin1,race,income,ptid
0,1,COPD,,Yes,11142,11151.0,,11382,No,0,...,No,No,No,No,No,0.0,,white,Under $11k,5
1,2,MOSF w/Sepsis,,No,11799,11844.0,11844.0,11844,Yes,1,...,No,No,Yes,No,No,,1437.0,white,Under $11k,7
2,3,MOSF w/Malignancy,MOSF w/Sepsis,Yes,12083,12143.0,,12400,No,0,...,No,No,No,No,No,,599.0,white,$25-$50k,9
3,4,ARF,,No,11146,11183.0,11183.0,11182,Yes,0,...,No,No,No,No,No,,,white,$11-$25k,10
4,5,MOSF w/Sepsis,,No,12035,12037.0,12037.0,12036,Yes,0,...,No,No,No,No,No,,64.0,white,Under $11k,11


I found two papers online that use the RHC dataset. Their choices of covariates X differ slightly, so I built one dataframe per paper and ran the test on each. Note that I didn't run the oracle model, since we don't actually know the ground truth for this dataset.

### Tchetgen Tchetgen, E. J., Ying, A., Cui, Y., Shi, X., and Miao, W. An introduction to proximal causal learning. arXiv preprint arXiv:2009.10982, 2020.

In [57]:
# Build treatment A, outcome Y, covariates X and proxies Z / W from `df`,
# then keep only the rows that are complete across all of them.
Z_COLS = ["pafi1", "paco21"]
W_COLS = ["ph1", "hema1"]

# Treatment A: 1 where `swang1` equals "RHC", 0 otherwise.
# The branch tests "is the column numeric?" instead of `dtype == "O"`: text
# columns are not guaranteed to be object dtype (pandas also has dedicated
# string dtypes), and with the old check such a column fell through to
# astype(float) and failed on the text labels.
A_raw = df["swang1"]
if pd.api.types.is_numeric_dtype(A_raw):
    A = (A_raw.astype(float) > 0).astype(int)
else:
    A = (A_raw == "RHC").astype(int)

# Outcome Y
Y = df["t3d30"]

# Covariates X
X = df[["age", "sex", "race"]].copy()

# Encode text-valued covariates as 0/1 indicators; numeric columns are kept as-is.
if not pd.api.types.is_numeric_dtype(X["sex"]):
    X["sex"] = (X["sex"] == "Female").astype(int)

if not pd.api.types.is_numeric_dtype(X["race"]):
    X["race_black"] = (X["race"] == "black").astype(int)
    X = X.drop(columns=["race"])

# Proxies Z & W
Z = df[Z_COLS].copy()
W = df[W_COLS].copy()

# Assemble everything in one frame so a single dropna() removes incomplete
# rows consistently across Y, A, X, Z and W.
X_colnames = list(X.columns)
analysis_cols = pd.concat([Y.rename("Y"), A.rename("A"), X, Z, W], axis=1)
analysis_df = analysis_cols.dropna().copy()

# Overwrite with cleaned arrays
Y = analysis_df["Y"].values
A = analysis_df["A"].values.astype(int)
X = analysis_df[X_colnames]
Z = analysis_df[Z_COLS]
W = analysis_df[W_COLS]

print(f"\nFinal X shape: {X.shape}")
analysis_df.head()


Final X shape: (5735, 3)


Unnamed: 0,Y,A,age,sex,race_black,pafi1,paco21,ph1,hema1
0,30,0,70.25098,0,0,68.0,40.0,7.359375,58.0
1,30,1,78.17896,1,0,218.3125,34.0,7.329102,32.5
2,30,1,46.09198,1,0,275.5,16.0,7.359375,21.097656
3,30,0,75.33197,1,0,156.65625,30.0,7.459961,26.296875
4,2,1,67.90997,0,0,478.0,17.0,7.229492,24.0


In [58]:
# Hold out 30% of the rows with a fixed seed. All five arrays go through one
# train_test_split call so they are split together.
split_parts = train_test_split(
    X.values,
    A,
    Y,
    Z.values,
    W.values,
    test_size=0.3,
    random_state=42,
)
X_train, X_test, A_train, A_test, Y_train, Y_test, Z_train, Z_test, W_train, W_test = split_parts

print(f"Training samples: {len(X_train)}", f"Test samples: {len(X_test)}", sep="\n")

Training samples: 4014
Test samples: 1721


In [59]:
# Baseline forest: fitted on covariates, treatment and outcome only
# (the proxies Z and W are not passed to it).
baseline = BaselineCausalForestDML(
    n_estimators=200,
    min_samples_leaf=20,
    random_state=42,
)
baseline.fit_baseline(X_train, A_train, Y_train, verbose=False)

# Flatten the test-set effect estimates before summarising them.
pred_baseline = baseline.effect(X_test).ravel()

print(
    f"\nBaseline predictions - Mean: {pred_baseline.mean():.4f}, Std: {pred_baseline.std():.4f}",
    f"Baseline predictions - Min: {pred_baseline.min():.4f}, Max: {pred_baseline.max():.4f}",
    sep="\n",
)


Baseline predictions - Mean: -1.8889, Std: 1.8212
Baseline predictions - Min: -7.6396, Max: 1.5299


In [60]:
# NC-CSF forest: fitted with the proxies Z and W in addition to X, A and Y.
nccsf = NCCausalForestDML(
    n_estimators=200,
    min_samples_leaf=20,
    cv=5,
    random_state=42,
)
nccsf.fit(
    Y=Y_train,
    T=A_train,
    X=X_train,
    Z=Z_train,
    W=W_train,
)

# Flatten the test-set effect estimates before summarising them.
pred_nccsf = nccsf.effect(X_test).ravel()

print(
    f"\nNC-CSF predictions - Mean: {pred_nccsf.mean():.4f}, Std: {pred_nccsf.std():.4f}",
    f"NC-CSF predictions - Min: {pred_nccsf.min():.4f}, Max: {pred_nccsf.max():.4f}",
    sep="\n",
)


NC-CSF predictions - Mean: -0.9961, Std: 1.8993
NC-CSF predictions - Min: -6.7210, Max: 3.4750


### Sverdrup, E., Cui, Y. Proximal Causal Learning of Conditional Average Treatment Effects

In [61]:
# Treatment A
A_raw = df["swang1"]
if A_raw.dtype == "O":
    A = (A_raw == "RHC").astype(int)
else:
    A = (A_raw.astype(float) > 0).astype(int)

# Outcome Y
Y = df["t3d30"]

# Covariates X
# Note we define cat1_coma and cat2_coma by ourselves since it doesn't exist in the original dataset
# reference: https://search.r-project.org/CRAN/refmans/ATbounds/html/RHC.html
X = pd.DataFrame({
    "age": df["age"],
    "sex": df["sex"],
    "cat1_coma": df["cat1"].apply(lambda x: 1 if x in ["Coma"] else 0),
    "cat2_coma": df["cat2"].apply(lambda x: 1 if x in ["Coma"] else 0),
    "dnr1": df["dnr1"],
    "surv2md1": df["surv2md1"],
    "aps1": df["aps1"],
})

if X["sex"].dtype == "O":
    X["sex"] = (X["sex"] == "Female").astype(int)

if X["dnr1"].dtype == "O":
    X["dnr1"] = X["dnr1"].map({"Yes": 1, "No": 0}).fillna(0).astype(int)

# Proxies Z & W
Z = df[["pafi1", "paco21"]].copy() 
W = df[["ph1", "hema1"]].copy()  

analysis_cols = pd.concat(
    [
        Y.rename("Y"),
        A.rename("A"),
        X,
        Z.rename(columns={"pafi1": "pafi1", "paco21": "paco21"}),
        W.rename(columns={"ph1": "ph1", "hema1": "hema1"}),
    ],
    axis=1,
)

analysis_df = analysis_cols.dropna().copy()

# Overwrite with cleaned arrays
Y = analysis_df["Y"].values
A = analysis_df["A"].values.astype(int)
X_colnames = [col for col in analysis_df.columns if col not in ["Y", "A", "pafi1", "paco21", "ph1", "hema1"]]
X = analysis_df[X_colnames]
Z = analysis_df[["pafi1", "paco21"]]
W = analysis_df[["ph1", "hema1"]]

print(f"\nFinal X shape: {X.shape}")
analysis_df.head()


Final X shape: (5735, 7)


Unnamed: 0,Y,A,age,sex,cat1_coma,cat2_coma,dnr1,surv2md1,aps1,pafi1,paco21,ph1,hema1
0,30,0,70.25098,0,0,0,0,0.640991,46,68.0,40.0,7.359375,58.0
1,30,1,78.17896,1,0,0,0,0.755,50,218.3125,34.0,7.329102,32.5
2,30,1,46.09198,1,0,0,0,0.317,82,275.5,16.0,7.359375,21.097656
3,30,0,75.33197,1,0,0,0,0.440979,48,156.65625,30.0,7.459961,26.296875
4,2,1,67.90997,0,0,0,1,0.437,72,478.0,17.0,7.229492,24.0


In [62]:
# Hold out 30% of the rows with a fixed seed. All five arrays go through one
# train_test_split call so they are split together.
split_parts = train_test_split(
    X.values,
    A,
    Y,
    Z.values,
    W.values,
    test_size=0.3,
    random_state=42,
)
X_train, X_test, A_train, A_test, Y_train, Y_test, Z_train, Z_test, W_train, W_test = split_parts

print(f"Training samples: {len(X_train)}", f"Test samples: {len(X_test)}", sep="\n")

Training samples: 4014
Test samples: 1721


In [63]:
# Baseline forest: fitted on covariates, treatment and outcome only
# (the proxies Z and W are not passed to it).
baseline = BaselineCausalForestDML(
    n_estimators=200,
    min_samples_leaf=20,
    random_state=42,
)
baseline.fit_baseline(X_train, A_train, Y_train, verbose=True)

# Flatten the test-set effect estimates before summarising them.
pred_baseline = baseline.effect(X_test).ravel()

print(
    f"\nBaseline predictions - Mean: {pred_baseline.mean():.4f}, Std: {pred_baseline.std():.4f}",
    f"Baseline predictions - Min: {pred_baseline.min():.4f}, Max: {pred_baseline.max():.4f}",
    sep="\n",
)


Baseline predictions - Mean: -1.2659, Std: 1.3492
Baseline predictions - Min: -5.8837, Max: 3.0109


In [64]:
# NC-CSF forest: fitted with the proxies Z and W in addition to X, A and Y.
nccsf = NCCausalForestDML(
    n_estimators=200,
    min_samples_leaf=20,
    cv=5,
    random_state=42,
)
nccsf.fit(
    Y=Y_train,
    T=A_train,
    X=X_train,
    Z=Z_train,
    W=W_train,
)

# Flatten the test-set effect estimates before summarising them.
pred_nccsf = nccsf.effect(X_test).ravel()

print(
    f"\nNC-CSF predictions - Mean: {pred_nccsf.mean():.4f}, Std: {pred_nccsf.std():.4f}",
    f"NC-CSF predictions - Min: {pred_nccsf.min():.4f}, Max: {pred_nccsf.max():.4f}",
    sep="\n",
)


NC-CSF predictions - Mean: -1.0821, Std: 1.3467
NC-CSF predictions - Min: -5.5421, Max: 3.3506
