If the cell below fails to run, install the missing dependencies from the command line: `pip install rpy2` (or `conda install -c r rpy2`) and `conda install tzlocal`.

In [1]:
import rpy2
import rpy2.robjects as robjects
import rpy2.robjects.packages as rpackages
from rpy2.robjects.packages import importr
from rpy2.robjects import pandas2ri
from rpy2.robjects.conversion import localconverter
from rpy2.robjects.vectors import StrVector
from rpy2.robjects import FloatVector, Formula
#print(rpy2.__version__)

In [2]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import mahalanobis
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
import math
from sklearn.metrics import pairwise_distances
from sklearn.model_selection import GridSearchCV
import time

In [3]:
# Load the low- and high-dimensional datasets from the shared data directory.
lowDim_dataset, highDim_dataset = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/lowDim_dataset.csv', '../data/highDim_dataset.csv')
)

## Grid Search

In [4]:
#low dim grid search (commented out since it takes a few minutes to run)

#X=lowDim_dataset.iloc[:,2:].values
#A=lowDim_dataset['A'].values
#Y=lowDim_dataset['Y'].values

#params = {'learning_rate':[0.01,0.05,0.1,0.5], 'max_depth': [1,2,3,4], 'n_estimators':[50,100,150],
#          'min_samples_leaf':[1,3,5],'min_samples_split':[2,4,6]}
#gscv = GridSearchCV(GradientBoostingClassifier(),params,cv=5).fit(X,A)
#gscv.best_params_

#output: {'learning_rate': 0.01,
# 'max_depth': 2,
# 'min_samples_leaf': 1,
# 'min_samples_split': 2,
# 'n_estimators': 150}

In [5]:
#high dim grid search (commented out since it takes a few minutes to run)

#X=highDim_dataset.iloc[:,2:].values
#A=highDim_dataset['A'].values
#Y=highDim_dataset['Y'].values

#params = {'learning_rate':[0.01,0.05,0.1,0.5], 'max_depth': [1,2,3,4], 'n_estimators':[50,100,150],
#          'min_samples_leaf':[1,3,5],'min_samples_split':[2,4,6]}
#gscv = GridSearchCV(GradientBoostingClassifier(),params,cv=5).fit(X,A)
#gscv.best_params_


#output: {'learning_rate': 0.05,
# 'max_depth': 1,
# 'min_samples_leaf': 5,
# 'min_samples_split': 2,
# 'n_estimators': 100}

## Calculating Propensity and Linear Propensity Scores

In [6]:
def logit(x, eps=1e-12):
    """Return the log-odds ``log(x / (1 - x))`` of a probability ``x``.

    Parameters
    ----------
    x : float
        Probability in the closed interval [0, 1].
    eps : float, optional
        ``x`` is clipped to ``[eps, 1 - eps]`` before the transform so that
        saturated probabilities (exactly 0 or 1) give a large finite value
        instead of a ``ZeroDivisionError`` / "math domain error". Inputs that
        already lie in ``[eps, 1 - eps]`` are not modified.

    Returns
    -------
    float
        The (clipped) log-odds of ``x``.

    Raises
    ------
    ValueError
        If ``x`` is outside [0, 1].
    """
    if x < 0 or x > 1:
        raise ValueError(f"logit is only defined for probabilities in [0, 1], got {x!r}")
    # min/max return x itself when it is already inside the clip range.
    clipped = min(max(x, eps), 1 - eps)
    return math.log(clipped / (1 - clipped))

### Low Dimensional Dataset

In [7]:
# Columns 0 and 1 are the outcome Y and the treatment indicator A; everything after is a covariate.
X = lowDim_dataset.iloc[:, 2:].values
A = lowDim_dataset['A'].values
Y = lowDim_dataset['Y'].values

# Hyperparameters are the best values reported by the low-dim grid search above.
# random_state is fixed so the propensity scores (and every match/ATE derived from
# them) are reproducible across kernel restarts.
gbm = GradientBoostingClassifier(learning_rate=0.01, max_depth=2, min_samples_leaf=1,
                                 min_samples_split=2, n_estimators=150,
                                 random_state=0).fit(X, A)

# Column 1 of predict_proba is the probability of the second class (A == 1 for a 0/1 treatment).
low_dim_propensity_scores = list(gbm.predict_proba(X)[:, 1])
# The linear propensity score is the logit (log-odds) of the propensity score.
low_dim_linear_propensity_scores = [logit(score) for score in low_dim_propensity_scores]

In [8]:
# Independent copy of the low-dim data with the fitted propensity score appended as a column.
lowDim_dataset_propensity = lowDim_dataset.assign(propensity_score=low_dim_propensity_scores)

In [9]:
# Independent copy of the low-dim data with the linear (logit) propensity score appended as a column.
lowDim_dataset_linear_propensity = lowDim_dataset.assign(
    linear_propensity_score=low_dim_linear_propensity_scores
)

In [10]:
# Persist both low-dim score vectors as single-column CSV files (row index included).
low_dim_propensity_frame = pd.DataFrame(low_dim_propensity_scores, columns=['propensity_scores'])
low_dim_propensity_frame.to_csv('../output/low_dim_propensity_scores.csv')

low_dim_linear_propensity_frame = pd.DataFrame(low_dim_linear_propensity_scores,
                                               columns=['linear_propensity_scores'])
low_dim_linear_propensity_frame.to_csv('../output/low_dim_linear_propensity_scores.csv')

### High Dimensional Dataset

In [11]:
# Columns 0 and 1 are the outcome Y and the treatment indicator A; everything after is a covariate.
X = highDim_dataset.iloc[:, 2:].values
A = highDim_dataset['A'].values
Y = highDim_dataset['Y'].values

# Hyperparameters are the best values reported by the high-dim grid search above.
# random_state is fixed so the propensity scores (and every match/ATE derived from
# them) are reproducible across kernel restarts.
gbm = GradientBoostingClassifier(learning_rate=0.05, max_depth=1, min_samples_leaf=5,
                                 min_samples_split=2, n_estimators=100,
                                 random_state=0).fit(X, A)

# Column 1 of predict_proba is the probability of the second class (A == 1 for a 0/1 treatment).
high_dim_propensity_scores = list(gbm.predict_proba(X)[:, 1])
# The linear propensity score is the logit (log-odds) of the propensity score.
high_dim_linear_propensity_scores = [logit(score) for score in high_dim_propensity_scores]

In [12]:
# Independent copy of the high-dim data with the fitted propensity score appended as a column.
highDim_dataset_propensity = highDim_dataset.assign(propensity_score=high_dim_propensity_scores)

In [13]:
# Independent copy of the high-dim data with the linear (logit) propensity score appended as a column.
highDim_dataset_linear_propensity = highDim_dataset.assign(
    linear_propensity_score=high_dim_linear_propensity_scores
)

In [14]:
# Persist both high-dim score vectors as single-column CSV files (row index included).
high_dim_propensity_frame = pd.DataFrame(high_dim_propensity_scores, columns=['propensity_scores'])
high_dim_propensity_frame.to_csv('../output/high_dim_propensity_scores.csv')

high_dim_linear_propensity_frame = pd.DataFrame(high_dim_linear_propensity_scores,
                                                columns=['linear_propensity_scores'])
high_dim_linear_propensity_frame.to_csv('../output/high_dim_linear_propensity_scores.csv')

## Full Matching

### True ATE: 2.5 for low dim and -3 for high dim

### Set Up rpy2 (Python Interface to R)

In [15]:
%%capture 
# utils = rpackages.importr('utils')
utils = importr('utils')
utils.chooseCRANmirror(ind=1)
packnames = ('optmatch')
utils.install_packages('DirichletReg')


In [16]:
# `packnames` may be a bare string (e.g. ('optmatch') without a trailing comma is a str);
# iterating a str would check/install one-letter "packages", so wrap it in a list first.
required_packages = [packnames] if isinstance(packnames, str) else list(packnames)

# Install only the R packages that are not already available.
names_to_install = [name for name in required_packages if not rpackages.isinstalled(name)]
if names_to_install:
    utils.install_packages(StrVector(names_to_install))

In [17]:
%%capture 
utils.chooseCRANmirror(ind=1)
robjects.r(f'install.packages("{"optmatch"}")')

In [18]:
# Expose the R package `optmatch` as a Python object (provides match_on and fullmatch).
optmatch = importr('optmatch')

### Low Dimensional Dataset

In [19]:
# Convert the three low-dimensional pandas frames to R data.frames, timing each conversion
# separately so it can later be added to the runtime of the matching method that uses it.
with localconverter(robjects.default_converter + pandas2ri.converter):
    # Newer rpy2 releases expose the converter as `robjects.conversion.py2rpy`; older ones
    # only provide `pandas2ri.py2ri`. Picking one explicitly (instead of a bare `except:`
    # around the whole block) lets genuine conversion errors surface unmasked.
    _py2r = getattr(robjects.conversion, 'py2rpy', None) or pandas2ri.py2ri

    lowDim_R_runtime = time.time()
    lowDim_dataset_R = _py2r(lowDim_dataset)
    lowDim_R_runtime = time.time() - lowDim_R_runtime

    lowDim_propensity_R_runtime = time.time()
    lowDim_dataset_propensity_R = _py2r(lowDim_dataset_propensity)
    lowDim_propensity_R_runtime = time.time() - lowDim_propensity_R_runtime

    lowDim_linear_propensity_R_runtime = time.time()
    lowDim_dataset_linear_propensity_R = _py2r(lowDim_dataset_linear_propensity)
    lowDim_linear_propensity_R_runtime = time.time() - lowDim_linear_propensity_R_runtime

In [20]:
# Preview the first rows of the converted R data.frame using R's own `head`.
robjects.r['head'](lowDim_dataset_propensity_R)

Y,A,V1,...,V21,V22,propensity_score
19.678858,0,1.59,...,0.0,1.309683,0.405085
17.842989,0,0.0,,0.0,1.719547,0.322207
22.108788,1,0.0,,2.12,0.99621,0.298967
15.355899,0,0.0,,0.0,1.504077,0.175873
16.787813,1,1.81,,0.0,0.327864,0.159559
11.378754,0,0.0,,0.0,0.405465,0.146625


### Method 1: Mahalanobis

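The Mahalanobis distance between units $i$ and $j$ is defined as
$$D_{ij} = (X_i-X_j)^T\Sigma^{-1}(X_i-X_j)$$
where $X_i$ is the covariate vector of unit $i$ and $\Sigma$ is the covariance matrix of $X$, estimated from the pooled treatment and control groups (the appropriate choice when the estimand is the ATE).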

In [21]:
# Start the timer for this method; it is stopped in the ATE cell below.
start = time.time()

# Treatment-vs-control Mahalanobis distances on every column except the outcome Y ...
lowDim_mahalanobis_distances = optmatch.match_on(Formula('A~.-Y'),
                                                 data=lowDim_dataset_R,
                                                 method='mahalanobis')
# ... fed to optimal full matching, which returns one matched-group label per row.
full_match_Mahalanobis_factor = optmatch.fullmatch(lowDim_mahalanobis_distances,
                                                   data=lowDim_dataset_R)

In [22]:
# Attach each row's matched-group code to the pandas frame.
lowDim_mahalanobis_group_codes = list(full_match_Mahalanobis_factor)
lowDim_dataset['assign'] = lowDim_mahalanobis_group_codes

In [23]:
# Sanity check: print the id of every matched group that is missing either a treated or a
# control unit. No output means every group contains at least one of each, which is good.
n_groups = max(full_match_Mahalanobis_factor)
for group_id in range(1, n_groups + 1):
    in_group = lowDim_dataset['assign'] == group_id
    treatment_flags = lowDim_dataset.loc[in_group, 'A'].values
    n_treated = sum(treatment_flags)

    if n_treated == 0 or n_treated == len(treatment_flags):
        print(group_id)

In [24]:
# Estimate the ATE as the group-size-weighted average of the per-group difference between
# the mean treated outcome and the mean control outcome.
ATE_vec, weights = [], []

n_groups = max(full_match_Mahalanobis_factor)
for group_id in range(1, n_groups + 1):
    members = lowDim_dataset[lowDim_dataset['assign'] == group_id]
    treated_outcomes = members.loc[members['A'] == 1, 'Y'].values
    control_outcomes = members.loc[members['A'] == 0, 'Y'].values

    ATE_vec.append(np.mean(treated_outcomes) - np.mean(control_outcomes))
    weights.append(len(treated_outcomes) + len(control_outcomes))

Mahalanobis_lowDim_est_ATE = np.average(ATE_vec, weights=weights)

# Stop the timer that was started in the matching cell.
end = time.time()
lowDim_mahalanobis_match_runtime = end - start

In [25]:
# Total runtime (seconds) = time to convert the pandas frame to an R data.frame
# + time elapsed from the start of matching to the end of the ATE computation.
lowDim_mahalanobis_runtime = lowDim_R_runtime+lowDim_mahalanobis_match_runtime
lowDim_mahalanobis_runtime

0.5643661022186279

In [26]:
# Estimated ATE from Mahalanobis full matching on the low-dim data (true ATE: 2.5).
Mahalanobis_lowDim_est_ATE

2.905992967731523

### Method 2: Propensity Score

In [27]:
# Start the timer for this method; it is stopped in the ATE cell below.
start = time.time()

# Treatment-vs-control Euclidean distances on the propensity score only ...
lowDim_propensity_distances = optmatch.match_on(Formula('A~propensity_score'),
                                                data=lowDim_dataset_propensity_R,
                                                method='euclidean')
# ... fed to optimal full matching, which returns one matched-group label per row.
full_match_propensity_factor = optmatch.fullmatch(lowDim_propensity_distances,
                                                  data=lowDim_dataset_propensity_R)

After the matching step above, the remaining steps of `implement_full_match.R` (assigning units to matched groups and estimating the ATE) are carried out with Python functions.

In [28]:
# Attach each row's matched-group code to the pandas frame.
lowDim_propensity_group_codes = list(full_match_propensity_factor)
lowDim_dataset_propensity['assign'] = lowDim_propensity_group_codes

In [29]:
# Example: members of matched group 67 with their outcome, treatment and propensity score.
example_columns = ['Y','A','assign','propensity_score']
lowDim_dataset_propensity.loc[lowDim_dataset_propensity['assign'] == 67, example_columns]

Unnamed: 0,Y,A,assign,propensity_score
185,18.392843,0,67,0.292125
407,23.001812,1,67,0.293804


In [30]:
# Sanity check: print the id of every matched group that is missing either a treated or a
# control unit. No output means every group contains at least one of each, which is good.
n_groups = max(full_match_propensity_factor)
for group_id in range(1, n_groups + 1):
    in_group = lowDim_dataset_propensity['assign'] == group_id
    treatment_flags = lowDim_dataset_propensity.loc[in_group, 'A'].values
    n_treated = sum(treatment_flags)

    if n_treated == 0 or n_treated == len(treatment_flags):
        print(group_id)

In [31]:
# Estimate the ATE as the group-size-weighted average of the per-group difference between
# the mean treated outcome and the mean control outcome.
ATE_vec, weights = [], []

n_groups = max(full_match_propensity_factor)
for group_id in range(1, n_groups + 1):
    members = lowDim_dataset_propensity[lowDim_dataset_propensity['assign'] == group_id]
    treated_outcomes = members.loc[members['A'] == 1, 'Y'].values
    control_outcomes = members.loc[members['A'] == 0, 'Y'].values

    ATE_vec.append(np.mean(treated_outcomes) - np.mean(control_outcomes))
    weights.append(len(treated_outcomes) + len(control_outcomes))

lowDim_propensity_est_ATE = np.average(ATE_vec, weights=weights)

# Stop the timer that was started in the matching cell.
end = time.time()
lowDim_propensity_match_runtime = end - start
    

In [32]:
# Total runtime (seconds) = time to convert the pandas frame to an R data.frame
# + time elapsed from the start of matching to the end of the ATE computation.
lowDim_propensity_runtime = lowDim_propensity_R_runtime+lowDim_propensity_match_runtime
lowDim_propensity_runtime

0.4408571720123291

In [33]:
# Estimated ATE from propensity-score full matching on the low-dim data (true ATE: 2.5).
lowDim_propensity_est_ATE

3.387793812531444

### Method 3: Linear Propensity Score

In [34]:
# Start the timer for this method; it is stopped in the ATE cell below.
start = time.time()

# Treatment-vs-control Euclidean distances on the linear propensity score only ...
lowDim_linear_propensity_distances = optmatch.match_on(Formula('A~linear_propensity_score'),
                                                       data=lowDim_dataset_linear_propensity_R,
                                                       method='euclidean')
# ... fed to optimal full matching, which returns one matched-group label per row.
full_match_linear_propensity_factor = optmatch.fullmatch(lowDim_linear_propensity_distances,
                                                         data=lowDim_dataset_linear_propensity_R)

In [35]:
# Attach each row's matched-group code to the pandas frame.
lowDim_linear_propensity_group_codes = list(full_match_linear_propensity_factor)
lowDim_dataset_linear_propensity['assign'] = lowDim_linear_propensity_group_codes

In [36]:
# Example: members of matched group 67 with their outcome, treatment and linear propensity score.
example_columns = ['Y','A','assign','linear_propensity_score']
lowDim_dataset_linear_propensity.loc[lowDim_dataset_linear_propensity['assign'] == 67, example_columns]

Unnamed: 0,Y,A,assign,linear_propensity_score
43,20.359007,1,67,-1.019142
58,19.588519,0,67,-1.029623
106,16.301866,0,67,-1.011693
126,15.106868,0,67,-1.036838
160,17.005677,0,67,-1.025284
217,26.525997,0,67,-1.000699
308,16.718287,0,67,-1.031649
371,21.382709,0,67,-1.037664


In [37]:
# Sanity check: print the id of every matched group that is missing either a treated or a
# control unit. No output means every group contains at least one of each, which is good.
n_groups = max(full_match_linear_propensity_factor)
for group_id in range(1, n_groups + 1):
    in_group = lowDim_dataset_linear_propensity['assign'] == group_id
    treatment_flags = lowDim_dataset_linear_propensity.loc[in_group, 'A'].values
    n_treated = sum(treatment_flags)

    if n_treated == 0 or n_treated == len(treatment_flags):
        print(group_id)

In [38]:
# Estimate the ATE as the group-size-weighted average of the per-group difference between
# the mean treated outcome and the mean control outcome.
ATE_vec, weights = [], []

n_groups = max(full_match_linear_propensity_factor)
for group_id in range(1, n_groups + 1):
    members = lowDim_dataset_linear_propensity[lowDim_dataset_linear_propensity['assign'] == group_id]
    treated_outcomes = members.loc[members['A'] == 1, 'Y'].values
    control_outcomes = members.loc[members['A'] == 0, 'Y'].values

    ATE_vec.append(np.mean(treated_outcomes) - np.mean(control_outcomes))
    weights.append(len(treated_outcomes) + len(control_outcomes))

lowDim_linear_propensity_est_ATE = np.average(ATE_vec, weights=weights)

# Stop the timer that was started in the matching cell.
end = time.time()
lowDim_linear_propensity_match_runtime = end - start

In [39]:
# Total runtime (seconds) = time to convert the pandas frame to an R data.frame
# + time elapsed from the start of matching to the end of the ATE computation.
lowDim_linear_propensity_runtime = lowDim_linear_propensity_R_runtime+lowDim_linear_propensity_match_runtime
lowDim_linear_propensity_runtime

0.42873597145080566

In [40]:
# Estimated ATE from linear-propensity-score full matching on the low-dim data (true ATE: 2.5).
lowDim_linear_propensity_est_ATE

3.476470407279265

### High Dimensional Dataset

In [41]:
# Convert the three high-dimensional pandas frames to R data.frames, timing each conversion
# separately so it can later be added to the runtime of the matching method that uses it.
with localconverter(robjects.default_converter + pandas2ri.converter):
    # Newer rpy2 releases expose the converter as `robjects.conversion.py2rpy`; older ones
    # only provide `pandas2ri.py2ri`. Picking one explicitly (instead of a bare `except:`
    # around the whole block) lets genuine conversion errors surface unmasked.
    _py2r = getattr(robjects.conversion, 'py2rpy', None) or pandas2ri.py2ri

    highDim_R_runtime = time.time()
    highDim_dataset_R = _py2r(highDim_dataset)
    highDim_R_runtime = time.time() - highDim_R_runtime

    highDim_propensity_R_runtime = time.time()
    highDim_dataset_propensity_R = _py2r(highDim_dataset_propensity)
    highDim_propensity_R_runtime = time.time() - highDim_propensity_R_runtime

    highDim_linear_propensity_R_runtime = time.time()
    highDim_dataset_linear_propensity_R = _py2r(highDim_dataset_linear_propensity)
    highDim_linear_propensity_R_runtime = time.time() - highDim_linear_propensity_R_runtime

In [42]:
# Preview the first rows of the converted R data.frame using R's own `head`.
robjects.r['head'](highDim_dataset_propensity_R)

Y,A,V1,...,V184,V185,propensity_score
-11.682472,1,0,...,-1,-1,0.503957
-13.176546,0,1,,-1,-1,0.43171
-2.195401,1,0,,-1,-1,0.574668
-0.005454,1,1,,-10,-10,0.465927
-1.987538,1,1,,-10,-10,0.549626
-17.81082,1,1,,8,5,0.465929


### Method 1: Mahalanobis

In [43]:
# Start the timer for this method; it is stopped in the ATE cell below.
start = time.time()

# Treatment-vs-control Mahalanobis distances on every column except the outcome Y ...
highDim_mahalanobis_distances = optmatch.match_on(Formula('A~.-Y'),
                                                  data=highDim_dataset_R,
                                                  method='mahalanobis')
# ... fed to optimal full matching, which returns one matched-group label per row.
full_match_Mahalanobis_factor = optmatch.fullmatch(highDim_mahalanobis_distances,
                                                   data=highDim_dataset_R)

In [44]:
# Attach each row's matched-group code to the pandas frame.
highDim_mahalanobis_group_codes = list(full_match_Mahalanobis_factor)
highDim_dataset['assign'] = highDim_mahalanobis_group_codes

In [45]:
# Sanity check: print the id of every matched group that is missing either a treated or a
# control unit. No output means every group contains at least one of each, which is good.
n_groups = max(full_match_Mahalanobis_factor)
for group_id in range(1, n_groups + 1):
    in_group = highDim_dataset['assign'] == group_id
    treatment_flags = highDim_dataset.loc[in_group, 'A'].values
    n_treated = sum(treatment_flags)

    if n_treated == 0 or n_treated == len(treatment_flags):
        print(group_id)

In [46]:
# Estimate the ATE as the group-size-weighted average of the per-group difference between
# the mean treated outcome and the mean control outcome.
ATE_vec, weights = [], []

n_groups = max(full_match_Mahalanobis_factor)
for group_id in range(1, n_groups + 1):
    members = highDim_dataset[highDim_dataset['assign'] == group_id]
    treated_outcomes = members.loc[members['A'] == 1, 'Y'].values
    control_outcomes = members.loc[members['A'] == 0, 'Y'].values

    ATE_vec.append(np.mean(treated_outcomes) - np.mean(control_outcomes))
    weights.append(len(treated_outcomes) + len(control_outcomes))

highDim_Mahalanobis_est_ATE = np.average(ATE_vec, weights=weights)

# Stop the timer that was started in the matching cell.
end = time.time()
highDim_mahalanobis_match_runtime = end - start

In [47]:
# Total runtime (seconds) = time to convert the pandas frame to an R data.frame
# + time elapsed from the start of matching to the end of the ATE computation.
highDim_mahalanobis_runtime = highDim_R_runtime+highDim_mahalanobis_match_runtime
highDim_mahalanobis_runtime

49.16640853881836

In [48]:
# Estimated ATE from Mahalanobis full matching on the high-dim data (true ATE: -3).
highDim_Mahalanobis_est_ATE

-1.5534687043745337

### Method 2: Propensity Score

In [49]:
# Start the timer for this method; it is stopped in the ATE cell below.
start = time.time()

# Treatment-vs-control Euclidean distances on the propensity score only ...
highDim_propensity_distances = optmatch.match_on(Formula('A~propensity_score'),
                                                 data=highDim_dataset_propensity_R,
                                                 method='euclidean')
# ... fed to optimal full matching, which returns one matched-group label per row.
full_match_propensity_factor = optmatch.fullmatch(highDim_propensity_distances,
                                                  data=highDim_dataset_propensity_R)

In [50]:
# Attach each row's matched-group code to the pandas frame.
highDim_propensity_group_codes = list(full_match_propensity_factor)
highDim_dataset_propensity['assign'] = highDim_propensity_group_codes

In [51]:
# Sanity check: print the id of every matched group that is missing either a treated or a
# control unit. No output means every group contains at least one of each, which is good.
n_groups = max(full_match_propensity_factor)
for group_id in range(1, n_groups + 1):
    in_group = highDim_dataset_propensity['assign'] == group_id
    treatment_flags = highDim_dataset_propensity.loc[in_group, 'A'].values
    n_treated = sum(treatment_flags)

    if n_treated == 0 or n_treated == len(treatment_flags):
        print(group_id)

In [52]:
# Estimate the ATE as the group-size-weighted average of the per-group difference between
# the mean treated outcome and the mean control outcome.
ATE_vec, weights = [], []

n_groups = max(full_match_propensity_factor)
for group_id in range(1, n_groups + 1):
    members = highDim_dataset_propensity[highDim_dataset_propensity['assign'] == group_id]
    treated_outcomes = members.loc[members['A'] == 1, 'Y'].values
    control_outcomes = members.loc[members['A'] == 0, 'Y'].values

    ATE_vec.append(np.mean(treated_outcomes) - np.mean(control_outcomes))
    weights.append(len(treated_outcomes) + len(control_outcomes))

highDim_propensity_est_ATE = np.average(ATE_vec, weights=weights)

# Stop the timer that was started in the matching cell.
end = time.time()
highDim_propensity_match_runtime = end - start

In [53]:
# Total runtime (seconds) = time to convert the pandas frame to an R data.frame
# + time elapsed from the start of matching to the end of the ATE computation.
highDim_propensity_runtime = highDim_propensity_R_runtime+highDim_propensity_match_runtime
highDim_propensity_runtime

6.412713527679443

In [54]:
# Estimated ATE from propensity-score full matching on the high-dim data (true ATE: -3).
highDim_propensity_est_ATE

-3.292183319977513

### Method 3: Linear Propensity Score

In [55]:
# Start the timer for this method; it is stopped in the ATE cell below.
start = time.time()

# Treatment-vs-control Euclidean distances on the linear propensity score only ...
highDim_linear_propensity_distances = optmatch.match_on(Formula('A~linear_propensity_score'),
                                                        data=highDim_dataset_linear_propensity_R,
                                                        method='euclidean')
# ... fed to optimal full matching, which returns one matched-group label per row.
full_match_linear_propensity_factor = optmatch.fullmatch(highDim_linear_propensity_distances,
                                                         data=highDim_dataset_linear_propensity_R)

In [56]:
# Attach each row's matched-group code to the pandas frame.
highDim_linear_propensity_group_codes = list(full_match_linear_propensity_factor)
highDim_dataset_linear_propensity['assign'] = highDim_linear_propensity_group_codes

In [57]:
# Sanity check: print the id of every matched group that is missing either a treated or a
# control unit. No output means every group contains at least one of each, which is good.
n_groups = max(full_match_linear_propensity_factor)
for group_id in range(1, n_groups + 1):
    in_group = highDim_dataset_linear_propensity['assign'] == group_id
    treatment_flags = highDim_dataset_linear_propensity.loc[in_group, 'A'].values
    n_treated = sum(treatment_flags)

    if n_treated == 0 or n_treated == len(treatment_flags):
        print(group_id)

In [58]:
# Estimate the ATE as the group-size-weighted average of the per-group difference between
# the mean treated outcome and the mean control outcome.
ATE_vec, weights = [], []

n_groups = max(full_match_linear_propensity_factor)
for group_id in range(1, n_groups + 1):
    members = highDim_dataset_linear_propensity[highDim_dataset_linear_propensity['assign'] == group_id]
    treated_outcomes = members.loc[members['A'] == 1, 'Y'].values
    control_outcomes = members.loc[members['A'] == 0, 'Y'].values

    ATE_vec.append(np.mean(treated_outcomes) - np.mean(control_outcomes))
    weights.append(len(treated_outcomes) + len(control_outcomes))

highDim_linear_propensity_est_ATE = np.average(ATE_vec, weights=weights)

# Stop the timer that was started in the matching cell.
end = time.time()
highDim_linear_propensity_match_runtime = end - start

In [59]:
# Total runtime (seconds) = time to convert the pandas frame to an R data.frame
# + time elapsed from the start of matching to the end of the ATE computation.
highDim_linear_propensity_runtime = highDim_linear_propensity_R_runtime+highDim_linear_propensity_match_runtime
highDim_linear_propensity_runtime

6.017297267913818

In [60]:
# Estimated ATE from linear-propensity-score full matching on the high-dim data (true ATE: -3).
highDim_linear_propensity_est_ATE

-3.2319717857151256

## Reference Papers

https://projecteuclid.org/download/pdfview_1/euclid.ss/1280841730
https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/36552.pdf
https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5784842/
https://www.researchgate.net/publication/8132035_Propensity_Score_Estimation_With_Boosted_Regression_for_Evaluating_Causal_Effects_in_Observational_Studies