In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
import math

Dataset Description:

Y is the outcome (continuous) 
A is a binary treatment indicator
V1 through Vp are covariates (the number of covariates, p, varies across datasets; each covariate may be binary or continuous).

Data source: Atlantic Causal Inference Conference (ACIC) Data Challenge

The true ATE will be released next week.

Quick reminder: The report should include a side-by-side comparison of the methods' performance and computational efficiency. The report should also be a reproducible R notebook or a similar format (see the coursework for more details).

In [2]:
# Read both challenge datasets from the shared data folder, in the order
# (low-dimensional, high-dimensional).
lowDim_dataset, highDim_dataset = [
    pd.read_csv(csv_path)
    for csv_path in ('../data/lowDim_dataset.csv', '../data/highDim_dataset.csv')
]

In [3]:
# Preview the first rows (head() defaults to five) to check how the columns were parsed.
lowDim_dataset.head()

Unnamed: 0,Y,A,V1,V2,V3,V4,V5,V6,V7,V8,...,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22
0,19.678858,0,1.59,0.0,0.0,0.0,0.24,1.35,0.73,2.58,...,0.12,0.0,4.55,0.0,1.72,0.0,0.49,0.98,0.0,1.309683
1,17.842989,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.62,...,0.27,0.0,4.87,0.0,0.81,0.27,0.27,0.0,0.0,1.719547
2,22.108788,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.12,0.0,0.0,0.0,2.12,0.99621
3,15.355899,0,0.0,0.0,0.0,0.56,0.0,0.0,0.0,0.0,...,0.0,0.0,1.12,0.0,0.0,0.0,0.0,0.0,0.0,1.504077
4,16.787813,1,1.81,0.0,0.0,0.0,0.0,0.0,0.0,1.81,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.327864


In [4]:
# Report the (rows, columns) dimensions of the high-dimensional dataset.
highDim_dataset.shape

(2000, 187)

In [5]:
# Split the low-dimensional frame into NumPy arrays for modelling:
#   Y - outcome column, A - treatment-indicator column,
#   X - every column from the third onward (positional slice; this relies on
#       Y and A being the first two columns of the frame).
Y = lowDim_dataset.loc[:, 'Y'].values
A = lowDim_dataset.loc[:, 'A'].values
X = lowDim_dataset.iloc[:, 2:].values

### 2. Propensity Score

In [6]:
# Fit a gradient-boosting classifier of the treatment A on the covariates X;
# its predicted class probabilities are used as the propensity-score estimates.
# random_state is pinned so that Restart & Run All reproduces the same fit:
# without it, the random feature permutation used when searching for splits
# is drawn from the global NumPy RNG and can differ between runs.
gbm = GradientBoostingClassifier(n_estimators=100, random_state=0).fit(X, A)

In [7]:
# In-sample class probabilities from the fitted model, one row per observation
# and one column per treatment class (columns follow gbm.classes_; presumably
# 0 then 1 for a binary 0/1 treatment - confirm).
class_probabilities = gbm.predict_proba(X)
propensity = pd.DataFrame(class_probabilities)

# Inverse Propensity Weighting

In [8]:
# Work on an independent (deep) copy so that adding the weight column later
# does not modify the originally loaded frame.
lowDim_dataset_lPTW = lowDim_dataset.copy()

In [9]:
# Inverse-propensity weight: for each row, take the estimated probability of
# the treatment arm that row actually received and invert it.
# DataFrame.lookup was deprecated in pandas 1.2 and removed in pandas 2.0, so
# the per-row selection is done with NumPy integer-array indexing instead:
# probabilities[i, A_i] picks column A_i of row i. This matches the old
# label-based lookup because `propensity` has default 0..n-1 row labels and
# 0/1 column labels.
row_positions = np.arange(propensity.shape[0])
received_treatment = lowDim_dataset_lPTW["A"].to_numpy().astype(int)
lowDim_dataset_lPTW['iptw'] = 1. / propensity.to_numpy()[row_positions, received_treatment]

In [10]:
# Display the frame to inspect the newly added 'iptw' column.
# NOTE(review): a bare frame is rendered in truncated form by pandas; consider
# .head() if only a preview is needed.
lowDim_dataset_lPTW

Unnamed: 0,Y,A,V1,V2,V3,V4,V5,V6,V7,V8,...,V14,V15,V16,V17,V18,V19,V20,V21,V22,iptw
0,19.678858,0,1.59,0.00,0.00,0.00,0.24,1.35,0.73,2.58,...,0.00,4.55,0.00,1.72,0.00,0.49,0.98,0.00,1.309683,2.019747
1,17.842989,0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.62,...,0.00,4.87,0.00,0.81,0.27,0.27,0.00,0.00,1.719547,1.130074
2,22.108788,1,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,2.12,0.00,0.00,0.00,2.12,0.996210,2.539580
3,15.355899,0,0.00,0.00,0.00,0.56,0.00,0.00,0.00,0.00,...,0.00,1.12,0.00,0.00,0.00,0.00,0.00,0.00,1.504077,1.066396
4,16.787813,1,1.81,0.00,0.00,0.00,0.00,0.00,0.00,1.81,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.327864,2.131521
5,11.378754,0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.405465,1.076853
6,14.689351,0,0.43,0.86,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,2.17,0.00,0.00,0.00,0.00,0.00,0.00,1.072610,1.226303
7,19.082760,1,0.19,0.19,0.00,0.00,0.38,0.19,0.00,2.32,...,0.00,2.51,0.00,1.16,0.00,0.00,0.00,0.19,0.732368,1.419332
8,11.474976,0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.000000,1.076853
9,24.658394,1,0.05,0.13,0.05,0.26,0.44,0.76,0.26,0.97,...,0.00,3.24,0.00,1.50,0.00,1.02,0.34,0.00,1.799231,1.713654
