In [8]:
import pandas as pd
from econml.metalearners import SLearner
from econml.metalearners import TLearner
from econml.dr import DRLearner
from econml.dr import LinearDRLearner
from econml.dr import SparseLinearDRLearner
from econml.dr import ForestDRLearner
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
import numpy as np

# explicitly require this experimental feature
from sklearn.experimental import enable_iterative_imputer  # noqa
# now you can import normally from sklearn.impute
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer

import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, Normalizer, MinMaxScaler
from sklearn.svm import SVR, SVC

from nb21 import cumulative_gain, elast
from catenets.models.jax import SNet, TNet, DRNet

from econml.cate_interpreter import SingleTreeCateInterpreter
import shap
from econml.dml import LinearDML

Data preprocessing

In [10]:
# pd.read_csv('../data/bsc_project_set.csv')

In [11]:
# Load dataset
data = pd.read_csv('../data/bsc_project_set.csv')

# Drop the row identifier and the unnamed index column saved with the CSV;
# neither is used as a feature.
data = data.drop(['id', 'Unnamed: 0'], axis=1)

# Convert categorical data into numeric.
# 'sex' and 'peep_regime' are one-hot encoded, then one indicator per variable
# is dropped so that each is represented by a single column
# ('sex_M' and 'peep_regime_high').
categorical_columns = ['sex', 'peep_regime']
data = pd.get_dummies(data, columns=categorical_columns, drop_first=False)
data = data.drop(['sex_F', 'peep_regime_low'], axis=1)

# Everything except the outcome is scaled and imputed.
numeric_columns = data.columns.difference(['mort_28'])
# NOTE(review): impute_columns is defined but not used in this cell; the
# imputer below runs over numeric_columns (which includes the two indicators).
impute_columns = data.columns.difference(['mort_28', 'sex_M', 'peep_regime_high'])

# Normalize data (StandardScaler and Normalizer were tried as alternatives).
# NOTE(review): the scaler and the imputer are both fit on the full dataset
# before the train/test split, so test rows influence the preprocessing.
scaler = MinMaxScaler()
data[numeric_columns] = scaler.fit_transform(data[numeric_columns])

# Impute missing data with the iterative imputer
# (SimpleImputer(strategy='mean') and KNNImputer were tried as alternatives).
imputer = IterativeImputer(max_iter=10, random_state=768)
data[numeric_columns] = imputer.fit_transform(data[numeric_columns])

# Define treatment and outcome columns
treatment_column = 'peep_regime_high'
outcome_column = 'mort_28'

# Define features (excluding treatment and outcome).
# Keep the DataFrame's own column order: building the list from a set of
# strings gives an order that can change between kernel restarts, which makes
# the seeded models below non-reproducible.
features = [
    column for column in data.columns
    if column not in (treatment_column, outcome_column)
]

# Confounding variables
confounders = ['age', 'sex_M', 'weight', 'height', 'pf_ratio', 'po2', 'fio2', 'heart_rate']

X = data[features]
Y = data[outcome_column]
T = data[treatment_column]

# Train-test split (80/20, fixed seed)
X_train, X_test, y_train, y_test, t_train, t_test = train_test_split(
    X, Y, T, test_size=0.2, random_state=768)

In [12]:
# NumPy versions of each split; the catenets networks are fitted on these
# arrays rather than on the pandas objects.
X_train_nn, X_test_nn = X_train.values, X_test.values
y_train_nn, y_test_nn = y_train.values, y_test.values
t_train_nn, t_test_nn = t_train.values, t_test.values

S-Learner

In [13]:

# Fit four S-Learners on the training split. The econml learners receive the
# pandas objects; the catenets SNet receives the NumPy arrays. The outcome is
# cast to int for every fit.

# S-Learner with Random Forest
s_learner_rf = SLearner(overall_model=RandomForestRegressor(n_estimators=100, random_state=768))
print("S-Learner Random Forest training")
s_learner_rf.fit(Y=y_train.astype(int), T=t_train, X=X_train)

# S-Learner with Gradient Boosting
s_learner_gb = SLearner(overall_model=GradientBoostingRegressor(n_estimators=100, random_state=768))
print("S-Learner Gradient Boosting training")
s_learner_gb.fit(Y=y_train.astype(int), T=t_train, X=X_train)

# S-Learner with Linear Regression
s_learner_lr = SLearner(overall_model=LinearRegression())
print("S-Learner Linear Regression training")
s_learner_lr.fit(Y=y_train.astype(int), T=t_train, X=X_train)

# S-Learner with Neural Networks
s_learner_nn = SNet(binary_y=True)
print("S-Learner Neural Network training")
s_learner_nn.fit(y=y_train_nn.astype(int), w=t_train_nn, X=X_train_nn)

# Estimate CATE on the test split for each learner
cate_s_learner_rf = s_learner_rf.effect(X_test)
cate_s_learner_gb = s_learner_gb.effect(X_test)
cate_s_learner_lr = s_learner_lr.effect(X_test)
# Predict on the NumPy test matrix so SNet gets the same input type it was
# fitted on (X_train_nn), instead of the pandas DataFrame.
cate_s_learner_nn = s_learner_nn.predict(X_test_nn)

# Print the mean and standard deviation for CATE estimates from each learner
print("S-Learner - Random Forest - Mean CATE:", np.mean(cate_s_learner_rf), "Std Dev:", np.std(cate_s_learner_rf))
print("S-Learner - Gradient Boosting - Mean CATE:", np.mean(cate_s_learner_gb), "Std Dev:", np.std(cate_s_learner_gb))
print("S-Learner - Linear Regression - Mean CATE:", np.mean(cate_s_learner_lr), "Std Dev:", np.std(cate_s_learner_lr))
print("S-Learner - Neural Networks - Mean CATE:", np.mean(cate_s_learner_nn), "Std Dev:", np.std(cate_s_learner_nn))

In [14]:
# CATE estimates on the training split, so train and test gain curves can be
# compared for each S-Learner.
cate_s_learner_rf_train = s_learner_rf.effect(X_train)
cate_s_learner_gb_train = s_learner_gb.effect(X_train)
cate_s_learner_lr_train = s_learner_lr.effect(X_train)
# SNet was fitted on X_train_nn, so predict on that same NumPy matrix rather
# than on the pandas DataFrame.
cate_s_learner_nn_train = s_learner_nn.predict(X_train_nn)

# Features, outcome and treatment side by side; cumulative_gain is called with
# a single frame plus the outcome/treatment column names.
train_data = pd.concat([X_train, y_train, t_train], axis=1)
test_data = pd.concat([X_test, y_test, t_test], axis=1)

gain_curve_s_rf_train = cumulative_gain(train_data.assign(cate=cate_s_learner_rf_train), "cate", y=outcome_column, t=treatment_column)
gain_curve_s_rf_test = cumulative_gain(test_data.assign(cate=cate_s_learner_rf), "cate", y=outcome_column, t=treatment_column)

gain_curve_s_gb_train = cumulative_gain(train_data.assign(cate=cate_s_learner_gb_train), "cate", y=outcome_column, t=treatment_column)
gain_curve_s_gb_test = cumulative_gain(test_data.assign(cate=cate_s_learner_gb), "cate", y=outcome_column, t=treatment_column)

gain_curve_s_lr_train = cumulative_gain(train_data.assign(cate=cate_s_learner_lr_train), "cate", y=outcome_column, t=treatment_column)
gain_curve_s_lr_test = cumulative_gain(test_data.assign(cate=cate_s_learner_lr), "cate", y=outcome_column, t=treatment_column)

gain_curve_s_nn_train = cumulative_gain(train_data.assign(cate=cate_s_learner_nn_train), "cate", y=outcome_column, t=treatment_column)
gain_curve_s_nn_test = cumulative_gain(test_data.assign(cate=cate_s_learner_nn), "cate", y=outcome_column, t=treatment_column)

# One line per (model, split); the order fixes the colours and legend order.
s_gain_curves = [
    (gain_curve_s_rf_test, "C0", "S-Learner Test - Random Forest"),
    (gain_curve_s_rf_train, "C1", "S-Learner Train - Random Forest"),
    (gain_curve_s_gb_test, "C2", "S-Learner Test - Gradient Boosting"),
    (gain_curve_s_gb_train, "C3", "S-Learner Train - Gradient Boosting"),
    (gain_curve_s_lr_test, "C4", "S-Learner Test - Linear Regression"),
    (gain_curve_s_lr_train, "C5", "S-Learner Train - Linear Regression"),
    (gain_curve_s_nn_test, "C6", "S-Learner Test - Neural Network"),
    (gain_curve_s_nn_train, "C7", "S-Learner Train - Neural Network"),
]
for curve, color, label in s_gain_curves:
    plt.plot(curve, color=color, label=label)
# Straight reference line from 0 to elast(...) evaluated on the full dataset.
plt.plot([0, 100], [0, elast(data, outcome_column, treatment_column)], linestyle="--", color="black", label="Baseline")
plt.legend()
plt.title("Cumulative gain")
plt.savefig("plots/mimic/S-Cumulative_gain.png")
plt.savefig("plots/mimic/S-Cumulative_gain.svg")
plt.show()

In [15]:
# Test-set CATE estimates of the four S-Learners, with the marker used for each.
s_learner_cates = [
    ("Random Forest", 'o', cate_s_learner_rf),
    ("Gradient Boosting", 'x', cate_s_learner_gb),
    ("Linear Regression", '+', cate_s_learner_lr),
    ("Neural Network", '*', cate_s_learner_nn),
]

# Print the mean and standard deviation for CATE estimates from each learner.
# Each line names its base model so the four rows can be told apart.
for model_name, _marker, cate in s_learner_cates:
    print(f"S-Learner - {model_name} - Mean CATE:", np.mean(cate), "Std Dev:", np.std(cate))

# Create a graph to compare CATE estimates
plt.figure(figsize=(12, 6))
for model_name, marker, cate in s_learner_cates:
    plt.plot(cate, marker, label=f"S-Learner - {model_name}")
plt.xlabel('Observation')
plt.ylabel('CATE')
# The title lists every model that is actually plotted, including the network.
plt.title('CATE Estimates: S-Learner using: Random Forest, Gradient Boosting, Linear Regression, Neural Network')
plt.legend()
plt.savefig("plots/mimic/S-CATEs.png")
plt.savefig("plots/mimic/S-CATEs.svg")
plt.show()

T-Learner

In [16]:
# Fit four T-Learners on the training split. The econml learners receive the
# pandas objects; the catenets TNet receives the NumPy arrays. The outcome is
# cast to int for every fit.

# T-Learner with Random Forest
t_learner_rf = TLearner(models=RandomForestRegressor(n_estimators=100, random_state=768))
print("T-Learner Random Forest training")
t_learner_rf.fit(y_train.astype(int), X=X_train, T=t_train)

# T-Learner with Gradient Boosting
t_learner_gb = TLearner(models=GradientBoostingRegressor(n_estimators=100, random_state=768))
print("T-Learner Gradient Boosting training")
t_learner_gb.fit(y_train.astype(int), X=X_train, T=t_train)

# T-Learner with Linear Regression
t_learner_lr = TLearner(models=LinearRegression())
print("T-Learner Linear Regression training")
t_learner_lr.fit(y_train.astype(int), X=X_train, T=t_train)

# T-Learner with Neural Networks
t_learner_nn = TNet(binary_y=True)
print("T-Learner Neural Network training")
t_learner_nn.fit(y=y_train_nn.astype(int), w=t_train_nn, X=X_train_nn)

# Estimate CATE on the test split for each learner
cate_t_learner_rf = t_learner_rf.effect(X_test)
cate_t_learner_gb = t_learner_gb.effect(X_test)
cate_t_learner_lr = t_learner_lr.effect(X_test)
# Predict on the NumPy test matrix so TNet gets the same input type it was
# fitted on (X_train_nn), instead of the pandas DataFrame.
cate_t_learner_nn = t_learner_nn.predict(X_test_nn)

print("T-Learner - Random Forest - Mean CATE:", np.mean(cate_t_learner_rf), "Std Dev:", np.std(cate_t_learner_rf))
# Report the Gradient Boosting estimates here (this line previously re-printed
# the Random Forest statistics under the Gradient Boosting label).
print("T-Learner - Gradient Boosting - Mean CATE:", np.mean(cate_t_learner_gb), "Std Dev:", np.std(cate_t_learner_gb))
print("T-Learner - Linear Regression - Mean CATE:", np.mean(cate_t_learner_lr), "Std Dev:", np.std(cate_t_learner_lr))
print("T-Learner - Neural Networks - Mean CATE:", np.mean(cate_t_learner_nn), "Std Dev:", np.std(cate_t_learner_nn))

In [17]:
# print(cate_t_learner_nn)

In [18]:
# CATE estimates on the training split, so train and test gain curves can be
# compared for each T-Learner.
cate_t_learner_rf_train = t_learner_rf.effect(X_train)
cate_t_learner_gb_train = t_learner_gb.effect(X_train)
cate_t_learner_lr_train = t_learner_lr.effect(X_train)
# TNet was fitted on X_train_nn, so predict on that same NumPy matrix rather
# than on the pandas DataFrame.
cate_t_learner_nn_train = t_learner_nn.predict(X_train_nn)

# Features, outcome and treatment side by side; cumulative_gain is called with
# a single frame plus the outcome/treatment column names.
train_data = pd.concat([X_train, y_train, t_train], axis=1)
test_data = pd.concat([X_test, y_test, t_test], axis=1)

gain_curve_t_rf_train = cumulative_gain(train_data.assign(cate=cate_t_learner_rf_train), "cate", y=outcome_column, t=treatment_column)
gain_curve_t_rf_test = cumulative_gain(test_data.assign(cate=cate_t_learner_rf), "cate", y=outcome_column, t=treatment_column)

gain_curve_t_gb_train = cumulative_gain(train_data.assign(cate=cate_t_learner_gb_train), "cate", y=outcome_column, t=treatment_column)
gain_curve_t_gb_test = cumulative_gain(test_data.assign(cate=cate_t_learner_gb), "cate", y=outcome_column, t=treatment_column)

gain_curve_t_lr_train = cumulative_gain(train_data.assign(cate=cate_t_learner_lr_train), "cate", y=outcome_column, t=treatment_column)
gain_curve_t_lr_test = cumulative_gain(test_data.assign(cate=cate_t_learner_lr), "cate", y=outcome_column, t=treatment_column)

gain_curve_t_nn_train = cumulative_gain(train_data.assign(cate=cate_t_learner_nn_train), "cate", y=outcome_column, t=treatment_column)
gain_curve_t_nn_test = cumulative_gain(test_data.assign(cate=cate_t_learner_nn), "cate", y=outcome_column, t=treatment_column)

# One line per (model, split); the order fixes the colours and legend order.
t_gain_curves = [
    (gain_curve_t_rf_test, "C0", "T-Learner Test - Random Forest"),
    (gain_curve_t_rf_train, "C1", "T-Learner Train - Random Forest"),
    (gain_curve_t_gb_test, "C2", "T-Learner Test - Gradient Boosting"),
    (gain_curve_t_gb_train, "C3", "T-Learner Train - Gradient Boosting"),
    (gain_curve_t_lr_test, "C4", "T-Learner Test - Linear Regression"),
    (gain_curve_t_lr_train, "C5", "T-Learner Train - Linear Regression"),
    (gain_curve_t_nn_test, "C6", "T-Learner Test - Neural Network"),
    (gain_curve_t_nn_train, "C7", "T-Learner Train - Neural Network"),
]
for curve, color, label in t_gain_curves:
    plt.plot(curve, color=color, label=label)
# Straight reference line from 0 to elast(...) evaluated on the full dataset.
plt.plot([0, 100], [0, elast(data, outcome_column, treatment_column)], linestyle="--", color="black", label="Baseline")
plt.legend()
plt.title("Cumulative gain")
plt.savefig("plots/mimic/T-Cumulative_gain.png")
plt.savefig("plots/mimic/T-Cumulative_gain.svg")
plt.show()

In [19]:
# Test-set CATE estimates of the four T-Learners, with the marker used for each.
t_learner_cates = [
    ("Random Forest", 'o', cate_t_learner_rf),
    ("Gradient Boosting", 'x', cate_t_learner_gb),
    ("Linear Regression", '+', cate_t_learner_lr),
    ("Neural Network", '*', cate_t_learner_nn),
]

# Print the mean and standard deviation for CATE estimates from each learner.
# Each line names its base model so the four rows can be told apart.
for model_name, _marker, cate in t_learner_cates:
    print(f"T-Learner - {model_name} - Mean CATE:", np.mean(cate), "Std Dev:", np.std(cate))

# Create a graph to compare CATE estimates
plt.figure(figsize=(12, 6))
for model_name, marker, cate in t_learner_cates:
    plt.plot(cate, marker, label=f"T-Learner - {model_name}")
plt.xlabel('Observation')
plt.ylabel('CATE')
# The title lists every model that is actually plotted, including the network.
plt.title('CATE Estimates: T-Learner using: Random Forest, Gradient Boosting, Linear Regression, Neural Network')
plt.legend()
plt.savefig("plots/mimic/T-CATEs.png")
plt.savefig("plots/mimic/T-CATEs.svg")
plt.show()

DR-Learner

In [20]:
# Effect modifiers (X) for the econml DR-Learners: every feature that is not
# in the confounder list; the confounders are passed separately as W.
# Sorted so the column order is the same on every run: iterating a set of
# strings gives an order that can change between kernel restarts.
features_without_confounders = sorted(set(features) - set(confounders))

# DR-Learner with Random Forest for regression and for propensity
dr_learner_rf = DRLearner(
    model_propensity=RandomForestClassifier(random_state=768),
    model_regression=RandomForestClassifier(random_state=768),
    model_final=LinearRegression(),
    discrete_outcome=True
)
print("DR-Learner Random Forest training")
dr_learner_rf.fit(y_train.astype(int), t_train, X=X_train[features_without_confounders], W=X_train[confounders])

# DR-Learner with Gradient Boosting for regression and for propensity
dr_learner_gb = DRLearner(
    model_propensity=GradientBoostingClassifier(random_state=768),
    model_regression=GradientBoostingClassifier(random_state=768),
    model_final=LinearRegression(),
    discrete_outcome=True
)
print("DR-Learner Gradient Boosting training")
dr_learner_gb.fit(y_train.astype(int), t_train, X=X_train[features_without_confounders], W=X_train[confounders])

# DR-Learner with Logistic Regression for regression and propensity
dr_learner_lr = DRLearner(
    model_propensity=LogisticRegression(random_state=768),
    model_regression=LogisticRegression(random_state=768),
    model_final=LinearRegression(),
    discrete_outcome=True
)
print("DR-Learner Logistic Regression training")
dr_learner_lr.fit(y_train.astype(int), t_train, X=X_train[features_without_confounders], W=X_train[confounders])

# DR-Learner with Neural Networks.
# Unlike the econml learners above, DRNet is fitted on the full feature matrix
# (no X / W separation).
dr_learner_nn = DRNet(binary_y=True)
print("DR-Learner Neural Network training")
dr_learner_nn.fit(y=y_train_nn.astype(int), w=t_train_nn, X=X_train_nn)

# Estimate CATE on the test split for each learner
cate_dr_learner_rf = dr_learner_rf.effect(X_test[features_without_confounders])
cate_dr_learner_gb = dr_learner_gb.effect(X_test[features_without_confounders])
cate_dr_learner_lr = dr_learner_lr.effect(X_test[features_without_confounders])
# Predict on the NumPy test matrix so DRNet gets the same input type it was
# fitted on (X_train_nn), instead of the pandas DataFrame.
cate_dr_learner_nn = dr_learner_nn.predict(X_test_nn)

print("DR-Learner - Random Forest - Mean CATE:", np.mean(cate_dr_learner_rf), "Std Dev:", np.std(cate_dr_learner_rf))
print("DR-Learner - Gradient Boosting - Mean CATE:", np.mean(cate_dr_learner_gb), "Std Dev:", np.std(cate_dr_learner_gb))
print("DR-Learner - Logistic Regression - Mean CATE:", np.mean(cate_dr_learner_lr), "Std Dev:", np.std(cate_dr_learner_lr))
print("DR-Learner - Neural Networks - Mean CATE:", np.mean(cate_dr_learner_nn), "Std Dev:", np.std(cate_dr_learner_nn))



In [62]:
# CATE estimates on the training split, so train and test gain curves can be
# compared for each DR-Learner.
cate_dr_learner_rf_train = dr_learner_rf.effect(X_train[features_without_confounders])
cate_dr_learner_gb_train = dr_learner_gb.effect(X_train[features_without_confounders])
cate_dr_learner_lr_train = dr_learner_lr.effect(X_train[features_without_confounders])
# DRNet was fitted on X_train_nn, so predict on that same NumPy matrix rather
# than on the pandas DataFrame.
cate_dr_learner_nn_train = dr_learner_nn.predict(X_train_nn)

# Features, outcome and treatment side by side; cumulative_gain is called with
# a single frame plus the outcome/treatment column names.
train_data = pd.concat([X_train, y_train, t_train], axis=1)
test_data = pd.concat([X_test, y_test, t_test], axis=1)

gain_curve_dr_rf_train = cumulative_gain(train_data.assign(cate=cate_dr_learner_rf_train), "cate", y=outcome_column, t=treatment_column)
gain_curve_dr_rf_test = cumulative_gain(test_data.assign(cate=cate_dr_learner_rf), "cate", y=outcome_column, t=treatment_column)

gain_curve_dr_gb_train = cumulative_gain(train_data.assign(cate=cate_dr_learner_gb_train), "cate", y=outcome_column, t=treatment_column)
gain_curve_dr_gb_test = cumulative_gain(test_data.assign(cate=cate_dr_learner_gb), "cate", y=outcome_column, t=treatment_column)

gain_curve_dr_lr_train = cumulative_gain(train_data.assign(cate=cate_dr_learner_lr_train), "cate", y=outcome_column, t=treatment_column)
gain_curve_dr_lr_test = cumulative_gain(test_data.assign(cate=cate_dr_learner_lr), "cate", y=outcome_column, t=treatment_column)

gain_curve_dr_nn_train = cumulative_gain(train_data.assign(cate=cate_dr_learner_nn_train), "cate", y=outcome_column, t=treatment_column)
gain_curve_dr_nn_test = cumulative_gain(test_data.assign(cate=cate_dr_learner_nn), "cate", y=outcome_column, t=treatment_column)

# One line per (model, split). The Random Forest curves are computed above but
# are not drawn on this figure; colours C0/C1 stay reserved for them.
dr_gain_curves = [
    # (gain_curve_dr_rf_test, "C0", "DR-Learner Test - Random Forest"),
    # (gain_curve_dr_rf_train, "C1", "DR-Learner Train - Random Forest"),
    (gain_curve_dr_gb_test, "C2", "DR-Learner Test - Gradient Boosting"),
    (gain_curve_dr_gb_train, "C3", "DR-Learner Train - Gradient Boosting"),
    (gain_curve_dr_lr_test, "C4", "DR-Learner Test - Logistic Regression"),
    (gain_curve_dr_lr_train, "C5", "DR-Learner Train - Logistic Regression"),
    (gain_curve_dr_nn_test, "C6", "DR-Learner Test - Neural Network"),
    (gain_curve_dr_nn_train, "C7", "DR-Learner Train - Neural Network"),
]
for curve, color, label in dr_gain_curves:
    plt.plot(curve, color=color, label=label)
# Straight reference line from 0 to elast(...) evaluated on the full dataset.
plt.plot([0, 100], [0, elast(data, outcome_column, treatment_column)], linestyle="--", color="black", label="Baseline")
plt.legend()
plt.title("Cumulative gain")
plt.savefig("plots/mimic/DR-Cumulative_gain.png")
plt.savefig("plots/mimic/DR-Cumulative_gain.svg")
plt.show()

In [22]:
# Print the mean and standard deviation for CATE estimates from each learner.
# Each line names its base model so the four rows can be told apart.
print("DR-Learner - Random Forest - Mean CATE:", np.mean(cate_dr_learner_rf), "Std Dev:", np.std(cate_dr_learner_rf))
print("DR-Learner - Gradient Boosting - Mean CATE:", np.mean(cate_dr_learner_gb), "Std Dev:", np.std(cate_dr_learner_gb))
print("DR-Learner - Logistic Regression - Mean CATE:", np.mean(cate_dr_learner_lr), "Std Dev:", np.std(cate_dr_learner_lr))
print("DR-Learner - Neural Network - Mean CATE:", np.mean(cate_dr_learner_nn), "Std Dev:", np.std(cate_dr_learner_nn))

# Create a graph to compare CATE estimates.
# The Random Forest estimates are printed above but not drawn on this figure.
dr_plotted_cates = [
    # (cate_dr_learner_rf, 'o', 'DR-Learner - Random Forest'),
    (cate_dr_learner_gb, 'x', 'DR-Learner - Gradient Boosting'),
    (cate_dr_learner_lr, '+', 'DR-Learner - Logistic Regression'),
    (cate_dr_learner_nn, '*', 'DR-Learner - Neural Network'),
]
plt.figure(figsize=(12, 6))
for cate, marker, label in dr_plotted_cates:
    plt.plot(cate, marker, label=label)
plt.xlabel('Observation')
plt.ylabel('CATE')
# The title names exactly the models that are drawn.
plt.title('CATE Estimates: DR-Learner using: Gradient Boosting, Logistic Regression, Neural Network')
plt.legend()
plt.savefig("plots/mimic/DR-CATEs.png")
plt.savefig("plots/mimic/DR-CATEs.svg")
plt.show()

Linear CATEs comparison

In [23]:
# Test-set CATEs of the linear/logistic variant of each meta-learner:
# (summary header, estimates, plot marker, legend entry).
linear_cates = [
    ("S-Learner - Linear Regression - Mean CATE:", cate_s_learner_lr, 'o', 'S-Learner'),
    ("T-Learner - Linear Regression - Mean CATE:", cate_t_learner_lr, 'x', 'T-Learner'),
    ("DR-Learner - Logistic Regression - Mean CATE:", cate_dr_learner_lr, '+', 'DR-Learner'),
]

# Summary statistics, one line per learner
for header, cate, _marker, _legend in linear_cates:
    print(header, np.mean(cate), "Std Dev:", np.std(cate))

# Per-observation scatter of the three sets of estimates
plt.figure(figsize=(12, 6))
for _header, cate, marker, legend in linear_cates:
    plt.plot(cate, marker, label=legend)
plt.xlabel('Observation')
plt.ylabel('CATE')
plt.title('CATE Estimates: Linear and Logistic Regression: S-Learner, T-Learner, DR-Learner')
plt.legend()
plt.savefig(f"plots/mimic/LR-CATEs.png")
plt.savefig(f"plots/mimic/LR-CATEs.svg")
plt.show()

In [24]:
# Train-set CATEs for the linear/logistic variant of each meta-learner
cate_s_learner_lr_train = s_learner_lr.effect(X_train)
cate_t_learner_lr_train = t_learner_lr.effect(X_train)
cate_dr_learner_lr_train = dr_learner_lr.effect(X_train[features_without_confounders])

# Features, outcome and treatment joined column-wise for each split
train_data = pd.concat([X_train, y_train, t_train], axis=1)
test_data = pd.concat([X_test, y_test, t_test], axis=1)

# Column names shared by every cumulative_gain call below
gain_kwargs = dict(y=outcome_column, t=treatment_column)

gain_curve_s_lr_train = cumulative_gain(
    train_data.assign(cate=cate_s_learner_lr_train), "cate", **gain_kwargs)
gain_curve_s_lr_test = cumulative_gain(
    test_data.assign(cate=cate_s_learner_lr), "cate", **gain_kwargs)

gain_curve_t_lr_train = cumulative_gain(
    train_data.assign(cate=cate_t_learner_lr_train), "cate", **gain_kwargs)
gain_curve_t_lr_test = cumulative_gain(
    test_data.assign(cate=cate_t_learner_lr), "cate", **gain_kwargs)

gain_curve_dr_lr_train = cumulative_gain(
    train_data.assign(cate=cate_dr_learner_lr_train), "cate", **gain_kwargs)
gain_curve_dr_lr_test = cumulative_gain(
    test_data.assign(cate=cate_dr_learner_lr), "cate", **gain_kwargs)

# Draw test/train curves per learner, then the dashed baseline
lr_gain_curves = [
    (gain_curve_s_lr_test, "C0", "S-Learner Test"),
    (gain_curve_s_lr_train, "C1", "S-Learner Train"),
    (gain_curve_t_lr_test, "C2", "T-Learner Test"),
    (gain_curve_t_lr_train, "C3", "T-Learner Train"),
    (gain_curve_dr_lr_test, "C4", "DR-Learner Test"),
    (gain_curve_dr_lr_train, "C5", "DR-Learner Train"),
]
for curve, color, label in lr_gain_curves:
    plt.plot(curve, color=color, label=label)
plt.plot([0, 100], [0, elast(data, outcome_column, treatment_column)], linestyle="--", color="black", label="Baseline")
plt.legend()
plt.title("Cumulative gain - Linear and Logistic Regression comparison")
plt.savefig(f"plots/mimic/LR-Cumulative_gain.png")
plt.savefig(f"plots/mimic/LR-Cumulative_gain.svg")
plt.show()

Random Forest CATEs comparison

In [25]:
# Test-set CATEs of the Random Forest variant of each meta-learner:
# (summary header, estimates, plot marker, legend entry).
rf_cates = [
    ("S-Learner - Random Forest - Mean CATE:", cate_s_learner_rf, 'o', 'S-Learner'),
    ("T-Learner - Random Forest - Mean CATE:", cate_t_learner_rf, 'x', 'T-Learner'),
    ("DR-Learner - Random Forest - Mean CATE:", cate_dr_learner_rf, '+', 'DR-Learner'),
]

# Summary statistics, one line per learner
for header, cate, _marker, _legend in rf_cates:
    print(header, np.mean(cate), "Std Dev:", np.std(cate))

# Per-observation scatter of the three sets of estimates
plt.figure(figsize=(12, 6))
for _header, cate, marker, legend in rf_cates:
    plt.plot(cate, marker, label=legend)
plt.xlabel('Observation')
plt.ylabel('CATE')
plt.title('CATE Estimates: Random Forest: S-Learner, T-Learner, DR-Learner')
plt.legend()
plt.savefig(f"plots/mimic/RF-CATEs.png")
plt.savefig(f"plots/mimic/RF-CATEs.svg")
plt.show()

In [26]:
# Train-set CATEs for the Random Forest variant of each meta-learner
cate_s_learner_rf_train = s_learner_rf.effect(X_train)
cate_t_learner_rf_train = t_learner_rf.effect(X_train)
cate_dr_learner_rf_train = dr_learner_rf.effect(X_train[features_without_confounders])

# Features, outcome and treatment joined column-wise for each split
train_data = pd.concat([X_train, y_train, t_train], axis=1)
test_data = pd.concat([X_test, y_test, t_test], axis=1)

# Column names shared by every cumulative_gain call below
gain_kwargs = dict(y=outcome_column, t=treatment_column)

gain_curve_s_rf_train = cumulative_gain(
    train_data.assign(cate=cate_s_learner_rf_train), "cate", **gain_kwargs)
gain_curve_s_rf_test = cumulative_gain(
    test_data.assign(cate=cate_s_learner_rf), "cate", **gain_kwargs)

gain_curve_t_rf_train = cumulative_gain(
    train_data.assign(cate=cate_t_learner_rf_train), "cate", **gain_kwargs)
gain_curve_t_rf_test = cumulative_gain(
    test_data.assign(cate=cate_t_learner_rf), "cate", **gain_kwargs)

gain_curve_dr_rf_train = cumulative_gain(
    train_data.assign(cate=cate_dr_learner_rf_train), "cate", **gain_kwargs)
gain_curve_dr_rf_test = cumulative_gain(
    test_data.assign(cate=cate_dr_learner_rf), "cate", **gain_kwargs)

# Draw test/train curves per learner, then the dashed baseline
rf_gain_curves = [
    (gain_curve_s_rf_test, "C0", "S-Learner Test"),
    (gain_curve_s_rf_train, "C1", "S-Learner Train"),
    (gain_curve_t_rf_test, "C2", "T-Learner Test"),
    (gain_curve_t_rf_train, "C3", "T-Learner Train"),
    (gain_curve_dr_rf_test, "C4", "DR-Learner Test"),
    (gain_curve_dr_rf_train, "C5", "DR-Learner Train"),
]
for curve, color, label in rf_gain_curves:
    plt.plot(curve, color=color, label=label)
plt.plot([0, 100], [0, elast(data, outcome_column, treatment_column)], linestyle="--", color="black", label="Baseline")
plt.legend()
plt.title("Cumulative gain - Random Forest comparison")
plt.savefig(f"plots/mimic/RF-Cumulative_gain.png")
plt.savefig(f"plots/mimic/RF-Cumulative_gain.svg")
plt.show()

Gradient Boosting CATEs comparison

In [27]:
# Test-set CATEs of the Gradient Boosting variant of each meta-learner:
# (summary header, estimates, plot marker, legend entry).
gb_cates = [
    ("S-Learner - Gradient Boosting - Mean CATE:", cate_s_learner_gb, 'o', 'S-Learner'),
    ("T-Learner - Gradient Boosting - Mean CATE:", cate_t_learner_gb, 'x', 'T-Learner'),
    ("DR-Learner - Gradient Boosting - Mean CATE:", cate_dr_learner_gb, '+', 'DR-Learner'),
]

# Summary statistics, one line per learner
for header, cate, _marker, _legend in gb_cates:
    print(header, np.mean(cate), "Std Dev:", np.std(cate))

# Per-observation scatter of the three sets of estimates
plt.figure(figsize=(12, 6))
for _header, cate, marker, legend in gb_cates:
    plt.plot(cate, marker, label=legend)
plt.xlabel('Observation')
plt.ylabel('CATE')
plt.title('CATE Estimates: Gradient Boosting: S-Learner, T-Learner, DR-Learner')
plt.legend()
plt.savefig(f"plots/mimic/GB-CATEs.png")
plt.savefig(f"plots/mimic/GB-CATEs.svg")
plt.show()

In [28]:
# Train-set CATEs for the Gradient Boosting variant of each meta-learner
cate_s_learner_gb_train = s_learner_gb.effect(X_train)
cate_t_learner_gb_train = t_learner_gb.effect(X_train)
cate_dr_learner_gb_train = dr_learner_gb.effect(X_train[features_without_confounders])

# Features, outcome and treatment joined column-wise for each split
train_data = pd.concat([X_train, y_train, t_train], axis=1)
test_data = pd.concat([X_test, y_test, t_test], axis=1)

# Column names shared by every cumulative_gain call below
gain_kwargs = dict(y=outcome_column, t=treatment_column)

gain_curve_s_gb_train = cumulative_gain(
    train_data.assign(cate=cate_s_learner_gb_train), "cate", **gain_kwargs)
gain_curve_s_gb_test = cumulative_gain(
    test_data.assign(cate=cate_s_learner_gb), "cate", **gain_kwargs)

gain_curve_t_gb_train = cumulative_gain(
    train_data.assign(cate=cate_t_learner_gb_train), "cate", **gain_kwargs)
gain_curve_t_gb_test = cumulative_gain(
    test_data.assign(cate=cate_t_learner_gb), "cate", **gain_kwargs)

gain_curve_dr_gb_train = cumulative_gain(
    train_data.assign(cate=cate_dr_learner_gb_train), "cate", **gain_kwargs)
gain_curve_dr_gb_test = cumulative_gain(
    test_data.assign(cate=cate_dr_learner_gb), "cate", **gain_kwargs)

# Draw test/train curves per learner, then the dashed baseline
gb_gain_curves = [
    (gain_curve_s_gb_test, "C0", "S-Learner Test"),
    (gain_curve_s_gb_train, "C1", "S-Learner Train"),
    (gain_curve_t_gb_test, "C2", "T-Learner Test"),
    (gain_curve_t_gb_train, "C3", "T-Learner Train"),
    (gain_curve_dr_gb_test, "C4", "DR-Learner Test"),
    (gain_curve_dr_gb_train, "C5", "DR-Learner Train"),
]
for curve, color, label in gb_gain_curves:
    plt.plot(curve, color=color, label=label)
plt.plot([0, 100], [0, elast(data, outcome_column, treatment_column)], linestyle="--", color="black", label="Baseline")
plt.legend()
plt.title("Cumulative gain - Gradient Boosting comparison")
plt.savefig(f"plots/mimic/GB-Cumulative_gain.png")
plt.savefig(f"plots/mimic/GB-Cumulative_gain.svg")
plt.show()

Neural Network CATEs comparison

In [29]:
# Test-set CATEs of the neural-network variant of each meta-learner:
# (summary header, estimates, plot marker, legend entry).
nn_cates = [
    ("S-Learner - Neural Network - Mean CATE:", cate_s_learner_nn, 'o', 'S-Learner'),
    ("T-Learner - Neural Network - Mean CATE:", cate_t_learner_nn, 'x', 'T-Learner'),
    ("DR-Learner - Neural Network - Mean CATE:", cate_dr_learner_nn, '+', 'DR-Learner'),
]

# Summary statistics, one line per learner
for header, cate, _marker, _legend in nn_cates:
    print(header, np.mean(cate), "Std Dev:", np.std(cate))

# Per-observation scatter of the three sets of estimates
plt.figure(figsize=(12, 6))
for _header, cate, marker, legend in nn_cates:
    plt.plot(cate, marker, label=legend)
plt.xlabel('Observation')
plt.ylabel('CATE')
plt.title('CATE Estimates: Neural Network: S-Learner, T-Learner, DR-Learner')
plt.legend()
plt.savefig(f"plots/mimic/NN-CATEs.png")
plt.savefig(f"plots/mimic/NN-CATEs.svg")
plt.show()

In [30]:
# The train-set CATEs of the three networks (cate_*_learner_nn_train) are not
# recomputed here; this cell reuses the values from the per-learner cells.

# Features, outcome and treatment joined column-wise for each split
train_data = pd.concat([X_train, y_train, t_train], axis=1)
test_data = pd.concat([X_test, y_test, t_test], axis=1)

# Column names shared by every cumulative_gain call below
gain_kwargs = dict(y=outcome_column, t=treatment_column)

gain_curve_s_nn_train = cumulative_gain(
    train_data.assign(cate=cate_s_learner_nn_train), "cate", **gain_kwargs)
gain_curve_s_nn_test = cumulative_gain(
    test_data.assign(cate=cate_s_learner_nn), "cate", **gain_kwargs)

gain_curve_t_nn_train = cumulative_gain(
    train_data.assign(cate=cate_t_learner_nn_train), "cate", **gain_kwargs)
gain_curve_t_nn_test = cumulative_gain(
    test_data.assign(cate=cate_t_learner_nn), "cate", **gain_kwargs)

gain_curve_dr_nn_train = cumulative_gain(
    train_data.assign(cate=cate_dr_learner_nn_train), "cate", **gain_kwargs)
gain_curve_dr_nn_test = cumulative_gain(
    test_data.assign(cate=cate_dr_learner_nn), "cate", **gain_kwargs)

# Draw test/train curves per learner, then the dashed baseline
nn_gain_curves = [
    (gain_curve_s_nn_test, "C0", "S-Learner Test"),
    (gain_curve_s_nn_train, "C1", "S-Learner Train"),
    (gain_curve_t_nn_test, "C2", "T-Learner Test"),
    (gain_curve_t_nn_train, "C3", "T-Learner Train"),
    (gain_curve_dr_nn_test, "C4", "DR-Learner Test"),
    (gain_curve_dr_nn_train, "C5", "DR-Learner Train"),
]
for curve, color, label in nn_gain_curves:
    plt.plot(curve, color=color, label=label)
plt.plot([0, 100], [0, elast(data, outcome_column, treatment_column)], linestyle="--", color="black", label="Baseline")
plt.legend()
plt.title("Cumulative gain - Neural Network comparison")
plt.savefig(f"plots/mimic/NN-Cumulative_gain.png")
plt.savefig(f"plots/mimic/NN-Cumulative_gain.svg")
plt.show()

In [40]:

# Depth-2 single-tree CATE interpreter with at least 10 samples per leaf.
# It is only constructed here; the interpret/plot calls below stay disabled.
intrp = SingleTreeCateInterpreter(
    include_model_uncertainty=True,
    max_depth=2,
    min_samples_leaf=10,
)
# intrp.interpret(dr_learner_lr, X[features_without_confounders])
# intrp.plot(feature_names=[features_without_confounders], fontsize=12)

In [56]:
# SHAP values of the Gradient Boosting S-Learner over the full feature frame
shap_values = s_learner_gb.shap_values(X)
# Entry for outcome 'mort_28' and treatment level 'peep_regime_high_1.0'
treatment_shap = shap_values["mort_28"]["peep_regime_high_1.0"]
# local view: explain heterogeneity for a given observation
ind = 0
shap.plots.force(treatment_shap[ind], matplotlib=True)
# global view: explain heterogeneity for a sample of dataset
shap.summary_plot(treatment_shap)

In [57]:
# SHAP values of the Linear Regression T-Learner over the training features
shap_values = t_learner_lr.shap_values(X_train)
# Entry for outcome 'mort_28' and treatment level 'peep_regime_high_1.0'
treatment_shap = shap_values["mort_28"]["peep_regime_high_1.0"]
# local view: explain heterogeneity for a given observation
ind = 0
shap.plots.force(treatment_shap[ind], matplotlib=True)
# global view: explain heterogeneity for a sample of dataset
shap.summary_plot(treatment_shap)

In [59]:

# SHAP values of the Logistic Regression DR-Learner over its effect modifiers
shap_values = dr_learner_lr.shap_values(X[features_without_confounders])
# Entry for outcome 'mort_28' and treatment level 'peep_regime_high_1.0'
treatment_shap = shap_values["mort_28"]["peep_regime_high_1.0"]
# local view: explain heterogeneity for a given observation
ind = 0
shap.plots.force(treatment_shap[ind], matplotlib=True)
# global view: explain heterogeneity for a sample of dataset
shap.summary_plot(treatment_shap)

In [58]:
# SHAP values of the Gradient Boosting DR-Learner over its effect modifiers
shap_values = dr_learner_gb.shap_values(X[features_without_confounders])
# Entry for outcome 'mort_28' and treatment level 'peep_regime_high_1.0'
treatment_shap = shap_values["mort_28"]["peep_regime_high_1.0"]
# local view: explain heterogeneity for a given observation
ind = 0
shap.plots.force(treatment_shap[ind], matplotlib=True)
# global view: explain heterogeneity for a sample of dataset
shap.summary_plot(treatment_shap)

In [66]:
# from nb21 import cumulative_gain, elast, cumulative_gain_inv

def cumulative_gain_inv(dataset, prediction, y, t, min_periods=30, steps=100):
    """Cumulative gain curve with rows ranked from lowest to highest prediction.

    Rows of ``dataset`` are sorted by the ``prediction`` column in ascending
    order. For a growing prefix of that ranking, ``elast(prefix, y, t)`` is
    multiplied by the fraction of all rows that the prefix covers.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame containing the ``prediction`` column plus whatever columns
        ``elast`` reads for ``y`` and ``t``.
    prediction : label
        Column used to rank the rows.
    y, t : label
        Forwarded unchanged to ``elast``.
    min_periods : int, default 30
        Number of rows in the first prefix that is evaluated.
    steps : int, default 100
        Controls how fast the prefix grows: consecutive prefixes differ by
        ``size // steps`` rows, but never by fewer than one row.

    Returns
    -------
    numpy.ndarray
        One value per evaluated prefix; the last entry always covers the
        whole dataset.
    """
    size = dataset.shape[0]
    ordered_df = dataset.sort_values(prediction, ascending=True).reset_index(drop=True)
    # size // steps is 0 when the dataset has fewer rows than `steps`, and
    # range() rejects a zero step, so advance by at least one row at a time.
    stride = max(size // steps, 1)
    n_rows = list(range(min_periods, size, stride)) + [size]
    return np.array([elast(ordered_df.head(rows), y, t) * (rows / size) for rows in n_rows])

# Effects predicted on the training split. Only the linear-regression T-learner
# is active; the other learners are kept disabled for reference.
# cate_t_learner_rf_train = t_learner_rf.effect(X_train)
# cate_t_learner_gb_train = t_learner_gb.effect(X_train)
cate_t_learner_lr_train = t_learner_lr.effect(X_train)
# cate_t_learner_svm_train = t_learner_svm.effect(X_train)
# cate_t_learner_nn_train = t_learner_nn.predict(X_train)

# One frame per split holding features, outcome and treatment side by side.
train_data = pd.concat([X_train, y_train, t_train], axis=1)
test_data = pd.concat([X_test, y_test, t_test], axis=1)

# Column arguments shared by every active gain-curve call below.
gain_columns = {"y": outcome_column, "t": treatment_column}

# Disabled: random forest and gradient boosting curves.
# gain_curve_t_rf_train = cumulative_gain(train_data.assign(cate=cate_t_learner_rf_train), "cate", y=outcome_column, t=treatment_column)
# gain_curve_t_rf_test = cumulative_gain(test_data.assign(cate=cate_t_learner_rf), "cate", y=outcome_column, t=treatment_column)
# gain_curve_t_gb_train = cumulative_gain(train_data.assign(cate=cate_t_learner_gb_train), "cate", y=outcome_column, t=treatment_column)
# gain_curve_t_gb_test = cumulative_gain(test_data.assign(cate=cate_t_learner_gb), "cate", y=outcome_column, t=treatment_column)

# Linear regression: gain curves from cumulative_gain for both splits.
gain_curve_t_lr_train = cumulative_gain(
    train_data.assign(cate=cate_t_learner_lr_train), "cate", **gain_columns
)
gain_curve_t_lr_test = cumulative_gain(
    test_data.assign(cate=cate_t_learner_lr), "cate", **gain_columns
)

# Linear regression again, this time through cumulative_gain_inv, which ranks
# rows by ascending prediction.
gain_curve_t_lr_train_inv = cumulative_gain_inv(
    train_data.assign(cate=cate_t_learner_lr_train), "cate", **gain_columns
)
gain_curve_t_lr_test_inv = cumulative_gain_inv(
    test_data.assign(cate=cate_t_learner_lr), "cate", **gain_columns
)

# Disabled: SVM and neural-network curves.
# gain_curve_t_svm_train = cumulative_gain(train_data.assign(cate=cate_t_learner_svm_train), "cate", y=outcome_column, t=treatment_column)
# gain_curve_t_svm_test = cumulative_gain(test_data.assign(cate=cate_t_learner_svm), "cate", y=outcome_column, t=treatment_column)
# gain_curve_t_nn_train = cumulative_gain(train_data.assign(cate=cate_t_learner_nn_train), "cate", y=outcome_column, t=treatment_column)
# gain_curve_t_nn_test = cumulative_gain(test_data.assign(cate=cate_t_learner_nn), "cate", y=outcome_column, t=treatment_column)

# Draw the active curves on one set of axes; disabled curves stay in their
# original position so the legend order is unchanged when they are re-enabled.
gain_fig = plt.figure(dpi=200)
gain_ax = gain_fig.gca()
# gain_ax.plot(gain_curve_t_rf_test, color="C0", label="T-Learner Test - Random Forest")
# gain_ax.plot(gain_curve_t_rf_train, color="C1", label="T-Learner Train - Random Forest")
# gain_ax.plot(gain_curve_t_gb_test, color="C2", label="T-Learner Test - Gradient Boosting")
# gain_ax.plot(gain_curve_t_gb_train, color="C3", label="T-Learner Train - Gradient Boosting")
# gain_ax.plot(gain_curve_t_lr_test, color="C4", label="T-Learner Test - Linear Regression")
gain_ax.plot(gain_curve_t_lr_train, color="C5", label="T-Learner Train - Linear Regression")
# gain_ax.plot(gain_curve_t_lr_test_inv, color="C8", label="T-Learner Test - Linear Regression - inv")
gain_ax.plot(gain_curve_t_lr_train_inv, color="C9", label="T-Learner Train - Linear Regression - inv")
# gain_ax.plot(gain_curve_t_svm_test, color="C6", label="T-Learner Test - SVM")
# gain_ax.plot(gain_curve_t_svm_train, color="C7", label="T-Learner Train - SVM")
# gain_ax.plot(gain_curve_t_nn_test, color="C6", label="T-Learner Test - Neural Network")
# gain_ax.plot(gain_curve_t_nn_train, color="C7", label="T-Learner Train - Neural Network")

# Dashed reference line from the origin up to the value `elast` returns for `data`.
baseline_gain = elast(data, outcome_column, treatment_column)
gain_ax.plot([0, 100], [0, baseline_gain], linestyle="--", color="black", label="Baseline")
gain_ax.legend()
gain_ax.set_title("Cumulative gain")
# Saving is currently disabled for this figure.
# plt.savefig(f"plots/mimic/T-Cumulative_gain.png")
# plt.savefig(f"plots/mimic/T-Cumulative_gain.svg")
plt.show()