In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
import numpy as np
from sklearn.model_selection import train_test_split

In [3]:
df = pd.read_excel("/Users/lijiazheng/Desktop/ihdp_data.xlsx")
df = df.drop(['y_cfactual','mu0','mu1'], axis = 1)
df.head()

Unnamed: 0,treatment,y_factual,x1,x2,x3,x4,x5,x6,x7,x8,...,x16,x17,x18,x19,x20,x21,x22,x23,x24,x25
0,True,5.599916,-0.528603,-0.343455,1.128554,0.161703,-0.316603,1.295216,1,0,...,1,1,1,1,0,0,0,0,0,0
1,False,6.875856,-1.736945,-1.802002,0.383828,2.24432,-0.629189,1.295216,0,0,...,1,1,1,1,0,0,0,0,0,0
2,False,2.996273,-0.807451,-0.202946,-0.360898,-0.879606,0.808706,-0.526556,0,0,...,1,0,1,1,0,0,0,0,0,0
3,False,1.366206,0.390083,0.596582,-1.85035,-0.879606,-0.004017,-0.857787,0,0,...,1,0,1,1,0,0,0,0,0,0
4,False,1.963538,-1.045229,-0.60271,0.011465,0.161703,0.683672,-0.36094,1,0,...,1,1,1,1,0,0,0,0,0,0


In [3]:
df['treatment'].value_counts()

treatment
False    608
True     139
Name: count, dtype: int64

In [4]:
train_data, test_data = train_test_split(df, test_size=0.3, random_state=1)

Propensity Score Matching

In [9]:
# Estimate propensity scores
model = LogisticRegression()
model.fit(train_data.drop(['treatment', 'y_factual'], axis=1), train_data['treatment'])
train_data['propensity_score'] = model.predict_proba(train_data.drop(['treatment', 'y_factual'], axis=1))[:,1]

In [12]:
from sklearn.metrics import pairwise_distances_argmin_min

treated = train_data[train_data['treatment'] == 1]
untreated = train_data[train_data['treatment'] == 0]
indices, _ = pairwise_distances_argmin_min(treated[['propensity_score']], untreated[['propensity_score']])
matched = treated.copy()
matched['matched_outcome'] = untreated.iloc[indices]['y_factual'].values

In [14]:
att_psm = (matched['y_factual'] - matched['matched_outcome']).mean()
att_psm

3.972023119649341

Inverse Probability of Treatment Weighting Using the Propensity Score

In [6]:
X_train = train_data.drop(['treatment', 'y_factual'], axis=1)
T_train = train_data['treatment']
Y_train = train_data['y_factual']

X_test = test_data.drop(['treatment', 'y_factual'], axis=1)
T_test = test_data['treatment']
Y_test = test_data['y_factual']

In [7]:
# Estimate propensity scores
ps_model = LogisticRegression()
ps_model.fit(X_train, T_train)
propensity_scores_train = ps_model.predict_proba(X_train)[:, 1]
propensity_scores_test = ps_model.predict_proba(X_test)[:, 1]

In [8]:
# Calculate IPTW weights
weights_train = T_train / propensity_scores_train + (1 - T_train) / (1 - propensity_scores_train)
weights_test = T_test / propensity_scores_test + (1 - T_test) / (1 - propensity_scores_test)

In [9]:
def weighted_least_squares(X, T, Y, weights):
    X_with_intercept = np.c_[np.ones(X.shape[0]), X]
    WLS_model = np.linalg.lstsq(X_with_intercept * weights[:, np.newaxis], Y * weights, rcond=None)[0]
    return WLS_model[1]

# Calculate the weighted mean outcome for treated and untreated subjects
treated_weights_train = weights_train[T_train == 1]
untreated_weights_train = weights_train[T_train == 0]

treated_outcomes_train = Y_train[T_train == 1]
untreated_outcomes_train = Y_train[T_train == 0]

weighted_mean_treated = np.sum(treated_weights_train * treated_outcomes_train) / np.sum(treated_weights_train)
weighted_mean_untreated = np.sum(untreated_weights_train * untreated_outcomes_train) / np.sum(untreated_weights_train)

# Estimate ATE
ate_iptw = weighted_mean_treated - weighted_mean_untreated

print(f'Estimated ATE using IPTW: {ate_iptw}')

Estimated ATE using IPTW: 3.9455353117075003


Double Machine Learning

In [16]:
from doubleml import DoubleMLData, DoubleMLPLR

train_data, test_data = train_test_split(df, test_size=0.3, random_state=1)

# Define covariates, treatment, and outcome
X = train_data.drop(['treatment', 'y_factual'], axis=1)
T = train_data['treatment']
Y = train_data['y_factual']

# Create DoubleMLData object
dml_data = DoubleMLData.from_arrays(X.values, Y.values, T.values)

# Define the DML model using RandomForest for both nuisance models
ml_g = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=1)
ml_m = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=1)

dml_plr = DoubleMLPLR(dml_data, ml_g, ml_m, n_folds=5)

# Fit the model
dml_plr.fit()

# Estimate the treatment effect
att_dml_1 = dml_plr.coef
print(f'Estimated ATT using DML: {att_dml_1}')

Estimated ATT using DML: [3.90723326]




In [18]:
from sklearn.utils import resample


n_bootstraps = 100
bootstrap_estimates_dml = []
bootstrap_estimates_psm = []

for _ in range(n_bootstraps):
    # Resample the data with replacement
    X_resampled, Y_resampled, T_resampled = resample(X.values, Y.values, T.values, random_state=_)

    ps_model = LogisticRegression()
    ps_model.fit(X_resampled, T_resampled)
    propensity_scores = ps_model.predict_proba(X_resampled)[:, 1]
    
    # Perform matching
    treated = np.where(T_resampled == 1)[0]
    untreated = np.where(T_resampled == 0)[0]
    indices, _ = pairwise_distances_argmin_min(propensity_scores[treated].reshape(-1, 1), propensity_scores[untreated].reshape(-1, 1))
    
    matched_outcomes = Y_resampled[untreated][indices]
    att_psm = (Y_resampled[treated] - matched_outcomes).mean()
    bootstrap_estimates_psm.append(att_psm)

# Convert bootstrap estimates to numpy arrays
bootstrap_estimates_psm = np.array(bootstrap_estimates_psm)

mean_estimate_psm = np.mean(bootstrap_estimates_psm)
variance_estimate_psm = np.var(bootstrap_estimates_psm)
ci_lower_psm = np.percentile(bootstrap_estimates_psm, 2.5)
ci_upper_psm = np.percentile(bootstrap_estimates_psm, 97.5)

print(f'PSM Mean Estimate: {mean_estimate_psm}')
print(f'PSM Variance Estimate: {variance_estimate_psm}')
print(f'PSM 95% Confidence Interval: [{ci_lower_psm}, {ci_upper_psm}]')

PSM Mean Estimate: 3.8519239567903627
PSM Variance Estimate: 0.0815964630959717
PSM 95% Confidence Interval: [3.189316577515217, 4.342124608303891]


In [21]:
import numpy as np
from doubleml import DoubleMLData, DoubleMLPLR
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from xgboost import XGBRegressor, XGBClassifier
from sklearn.neural_network import MLPRegressor, MLPClassifier

# Create DoubleMLData object
dml_data = DoubleMLData.from_arrays(X.values, Y.values, T.values)

# Define the DML model using different ML algorithms
# Random Forest
ml_g_rf = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=1)
ml_m_rf = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=1)

# Gradient Boosting (XGBoost)
ml_g_xgb = XGBRegressor(n_estimators=100, max_depth=10, random_state=1)
ml_m_xgb = XGBClassifier(n_estimators=100, max_depth=10, random_state=1)

# Neural Network
ml_g_nn = MLPRegressor(hidden_layer_sizes=(100,), max_iter=500, random_state=1)
ml_m_nn = MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, random_state=1)

# Initialize DML models
dml_plr_rf = DoubleMLPLR(dml_data, ml_g_rf, ml_m_rf, n_folds=5)
dml_plr_xgb = DoubleMLPLR(dml_data, ml_g_xgb, ml_m_xgb, n_folds=5)
dml_plr_nn = DoubleMLPLR(dml_data, ml_g_nn, ml_m_nn, n_folds=5)

# Fit the models and estimate treatment effects
dml_plr_rf.fit()
treatment_effect_rf = dml_plr_rf.coef
print(f'Estimated ATT using DML with Random Forest: {treatment_effect_rf}')

dml_plr_xgb.fit()
treatment_effect_xgb = dml_plr_xgb.coef
print(f'Estimated ATT using DML with XGBoost: {treatment_effect_xgb}')

dml_plr_nn.fit()
treatment_effect_nn = dml_plr_nn.coef
print(f'Estimated ATT using DML with Neural Network: {treatment_effect_nn}')

Estimated ATT using DML with Random Forest: [3.84972142]
Estimated ATT using DML with XGBoost: [3.50029822]




Estimated ATT using DML with Neural Network: [3.82749805]




In [1]:
from sklearn.utils import resample

# Number of bootstrap samples
n_bootstraps = 100
bootstrap_estimates_rf = []
bootstrap_estimates_xgb = []
bootstrap_estimates_nn = []
bootstrap_estimates_psm = []

for _ in range(n_bootstraps):
    # Resample the data with replacement
    X_resampled, Y_resampled, T_resampled = resample(X.values, Y.values, T.values, random_state=_)
    
    # For DML with Random Forest
    dml_data_resampled = DoubleMLData.from_arrays(X_resampled, Y_resampled, T_resampled)
    dml_plr_resampled_rf = DoubleMLPLR(dml_data_resampled, ml_g_rf, ml_m_rf, n_folds=5)
    dml_plr_resampled_rf.fit()
    bootstrap_estimates_rf.append(dml_plr_resampled_rf.coef[0])
    
    # For DML with XGBoost
    dml_plr_resampled_xgb = DoubleMLPLR(dml_data_resampled, ml_g_xgb, ml_m_xgb, n_folds=5)
    dml_plr_resampled_xgb.fit()
    bootstrap_estimates_xgb.append(dml_plr_resampled_xgb.coef[0])
    
    # For DML with Neural Network
    dml_plr_resampled_nn = DoubleMLPLR(dml_data_resampled, ml_g_nn, ml_m_nn, n_folds=5)
    dml_plr_resampled_nn.fit()
    bootstrap_estimates_nn.append(dml_plr_resampled_nn.coef[0])
    

# Convert bootstrap estimates to numpy arrays
bootstrap_estimates_rf = np.array(bootstrap_estimates_rf)
bootstrap_estimates_xgb = np.array(bootstrap_estimates_xgb)
bootstrap_estimates_nn = np.array(bootstrap_estimates_nn)

# Calculate mean, variance, and confidence intervals for the bootstrap estimates
mean_estimate_rf = np.mean(bootstrap_estimates_rf)
variance_estimate_rf = np.var(bootstrap_estimates_rf)
ci_lower_rf = np.percentile(bootstrap_estimates_rf, 2.5)
ci_upper_rf = np.percentile(bootstrap_estimates_rf, 97.5)

mean_estimate_xgb = np.mean(bootstrap_estimates_xgb)
variance_estimate_xgb = np.var(bootstrap_estimates_xgb)
ci_lower_xgb = np.percentile(bootstrap_estimates_xgb, 2.5)
ci_upper_xgb = np.percentile(bootstrap_estimates_xgb, 97.5)

mean_estimate_nn = np.mean(bootstrap_estimates_nn)
variance_estimate_nn = np.var(bootstrap_estimates_nn)
ci_lower_nn = np.percentile(bootstrap_estimates_nn, 2.5)
ci_upper_nn = np.percentile(bootstrap_estimates_nn, 97.5)

print(f'DML with Random Forest Mean Estimate: {mean_estimate_rf}')
print(f'DML with Random Forest Variance Estimate: {variance_estimate_rf}')
print(f'DML with Random Forest 95% Confidence Interval: [{ci_lower_rf}, {ci_upper_rf}]')

print(f'DML with XGBoost Mean Estimate: {mean_estimate_xgb}')
print(f'DML with XGBoost Variance Estimate: {variance_estimate_xgb}')
print(f'DML with XGBoost 95% Confidence Interval: [{ci_lower_xgb}, {ci_upper_xgb}]')

print(f'DML with Neural Network Mean Estimate: {mean_estimate_nn}')
print(f'DML with Neural Network Variance Estimate: {variance_estimate_nn}')
print(f'DML with Neural Network 95% Confidence Interval: [{ci_lower_nn}, {ci_upper_nn}]')

In [26]:
results = pd.DataFrame({
    'Method': ['PSM', 'DML with Random Forest', 'DML with XGBoost', 'DML with Neural Network'],
    'Mean Estimate': [mean_estimate_psm, mean_estimate_rf, mean_estimate_xgb, mean_estimate_nn],
    'ATT': [att_psm, treatment_effect_rf, treatment_effect_xgb, treatment_effect_nn],
    'Variance Estimate': [variance_estimate_psm, variance_estimate_rf, variance_estimate_xgb, variance_estimate_nn],
    '95% Confidence Interval': [
        f'[{ci_lower_psm}, {ci_upper_psm}]',
        f'[{ci_lower_rf}, {ci_upper_rf}]',
        f'[{ci_lower_xgb}, {ci_upper_xgb}]',
        f'[{ci_lower_nn}, {ci_upper_nn}]'
    ]
})

from tabulate import tabulate

print(tabulate(results, headers='keys', tablefmt='pretty'))

+---+-------------------------+--------------------+-------------------+----------------------+------------------------------------------+
|   |         Method          |   Mean Estimate    |        ATT        |  Variance Estimate   |         95% Confidence Interval          |
+---+-------------------------+--------------------+-------------------+----------------------+------------------------------------------+
| 0 |           PSM           | 3.8519239567903627 | 3.794118484040032 |  0.0815964630959717  |  [3.189316577515217, 4.342124608303891]  |
| 1 | DML with Random Forest  | 3.8198245438317913 |   [3.84972142]    | 0.027804319027535915 | [3.4807645073912354, 4.112833844403021]  |
| 2 |    DML with XGBoost     | 3.3949875019273335 |   [3.50029822]    | 0.07055016623846297  | [2.9395560822871447, 3.9222282668418345] |
| 3 | DML with Neural Network | 3.8278507106009094 |   [3.82749805]    | 0.060833183529511524 |  [3.360617363630418, 4.320482391145038]  |
+---+----------------------