In [1]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import mahalanobis
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
import math
from sklearn.metrics import pairwise_distances
from sklearn.model_selection import GridSearchCV

#if this doesn't run then do 'pip install causalinference' in command line
from causalinference import CausalModel

In [2]:
# Load the low- and high-dimensional tables from the relative ../data directory.
lowDim_dataset, highDim_dataset = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/lowDim_dataset.csv', '../data/highDim_dataset.csv')
)

## A1) Propensity Score Full Matching

### 1. Mahalanobis Metric (Does not need propensity score)

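The Mahalanobis distance between units $i$ and $j$ is
$$D_{ij} = (X_i-X_j)^T\Sigma^{-1}(X_i-X_j)$$
where $\Sigma$ is the covariance matrix of $X$ in the pooled treatment and full control groups. As written this is the squared form of the distance; `scipy.spatial.distance.mahalanobis` returns its square root.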

In [3]:
# Split the low-dimensional table into plain arrays:
#   A - treatment column, Y - outcome column, X - every column from position 2 onward.
# assumes the first two columns are Y and A so that X holds only covariates - TODO confirm
A, Y = (lowDim_dataset[column].values for column in ('A', 'Y'))
X = lowDim_dataset.iloc[:, 2:].values

In [4]:
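#no longer needed: pairwise Mahalanobis distance matrix computed directly in Python
#dist_matrix_mahalanobis = pairwise_distances(X,metric='mahalanobis')

#full matching (R syntax, kept for reference only):
#fullmatch(match_on(A~X,data=lowDim_dataset,method='mahalanobis'),data=lowDim_dataset)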

### 2. Propensity Score

In [5]:
# Fit a gradient-boosted classifier of the treatment indicator A on the covariates X;
# its predicted probability for the second class is used as the propensity score.
# random_state is pinned so that refitting after a kernel restart gives the same model.
# NOTE(review): the low-dim grid search later in this notebook reports
# n_estimators=150 as best; 100 is kept here so existing results do not shift - confirm.
gbm = GradientBoostingClassifier(
    n_estimators=100, learning_rate=0.01, max_depth=2, random_state=0
).fit(X, A)

# Column 1 of predict_proba corresponds to gbm.classes_[1]
# (presumably A == 1 - assumes A is coded 0/1, verify against the data).
# Kept as a list of numpy floats, exactly as before, for downstream cells.
propensity_scores = list(gbm.predict_proba(X)[:, 1])

In [6]:
# Build a separate table that carries the propensity score as an extra column;
# assign() works on a copy, so lowDim_dataset itself is left unmodified.
lowDim_dataset_propensity = lowDim_dataset.assign(propensity_score=propensity_scores)

# Full matching on the propensity score (reference calls in R syntax, not executed here):
#matchit(A~propensity_scores,data=lowDim_dataset_propensity,method='full')
#OR
#fullmatch(match_on(A~propensity_scores,data=lowDim_dataset_propensity,method='euclidean'),data=lowDim_dataset_propensity,method='full')

### 3. Linear Propensity Score

In [7]:
def logit(x, eps=1e-12):
    """Return the log-odds ``log(x / (1 - x))`` of a probability.

    Parameters
    ----------
    x : float or array-like of float
        Probability (or probabilities) in the closed interval [0, 1].
    eps : float, optional
        Inputs are clipped to ``[eps, 1 - eps]`` before the transform, so an
        exact 0 or 1 yields a large finite value instead of raising.

    Returns
    -------
    float or numpy.ndarray
        A Python float for scalar input, otherwise an array shaped like ``x``.

    Raises
    ------
    ValueError
        If any value lies outside [0, 1].
    """
    p = np.asarray(x, dtype=float)
    # NaN compares False on both sides and therefore propagates as NaN.
    if np.any((p < 0) | (p > 1)):
        raise ValueError("logit is only defined for probabilities in [0, 1]")
    # Clipping keeps the ratio finite at the boundaries 0 and 1.
    p = np.clip(p, eps, 1 - eps)
    log_odds = np.log(p / (1 - p))
    # Preserve the scalar-in / float-out contract of the original helper.
    return float(log_odds) if log_odds.ndim == 0 else log_odds

In [8]:
# Move every propensity score onto the log-odds scale.
# Negative values are expected: log(x/(1-x)) is below zero whenever x < 0.5.
linear_propensity_scores = list(map(logit, propensity_scores))

In [9]:
# Build a separate table that carries the linear (log-odds) propensity score as an
# extra column; assign() works on a copy, so lowDim_dataset itself is left unmodified.
lowDim_dataset_linear_propensity = lowDim_dataset.assign(
    linear_propensity_score=linear_propensity_scores
)

# Full matching on the linear propensity score (reference calls in R syntax, not executed here):
#matchit(A~linear_propensity_scores,data=lowDim_dataset_linear_propensity,method='full')
#OR
##fullmatch(match_on(A~linear_propensity_scores,data=lowDim_dataset_linear_propensity,method='euclidean'),data=lowDim_dataset_linear_propensity,method='full')

## Matching

Now we need to carry out the full matching itself, using the distance measures defined above.
Reference: https://projecteuclid.org/download/pdfview_1/euclid.ss/1280841730

In [10]:
# Preview only the first rows instead of rendering the whole table.
lowDim_dataset.head(10)

Unnamed: 0,Y,A,V1,V2,V3,V4,V5,V6,V7,V8,...,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22
0,19.678858,0,1.59,0.00,0.00,0.00,0.24,1.35,0.73,2.58,...,0.12,0.00,4.55,0.00,1.72,0.00,0.49,0.98,0.00,1.309683
1,17.842989,0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.62,...,0.27,0.00,4.87,0.00,0.81,0.27,0.27,0.00,0.00,1.719547
2,22.108788,1,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,2.12,0.00,0.00,0.00,2.12,0.996210
3,15.355899,0,0.00,0.00,0.00,0.56,0.00,0.00,0.00,0.00,...,0.00,0.00,1.12,0.00,0.00,0.00,0.00,0.00,0.00,1.504077
4,16.787813,1,1.81,0.00,0.00,0.00,0.00,0.00,0.00,1.81,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.327864
5,11.378754,0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.405465
6,14.689351,0,0.43,0.86,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,2.17,0.00,0.00,0.00,0.00,0.00,0.00,1.072610
7,19.082760,1,0.19,0.19,0.00,0.00,0.38,0.19,0.00,2.32,...,0.00,0.00,2.51,0.00,1.16,0.00,0.00,0.00,0.19,0.732368
8,11.474976,0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.000000
9,24.658394,1,0.05,0.13,0.05,0.26,0.44,0.76,0.26,0.97,...,0.44,0.00,3.24,0.00,1.50,0.00,1.02,0.34,0.00,1.799231


## Grid Search 

In [12]:
# Low-dim grid search over the GBM hyper-parameters.
# 432 parameter combinations x 5 folds, so this takes a few minutes to run.
# X and A must still hold the low-dimensional arrays defined earlier when this cell runs.

params = {'learning_rate':[0.01,0.05,0.1,0.5], 'max_depth': [1,2,3,4], 'n_estimators':[50,100,150],
          'min_samples_leaf':[1,3,5],'min_samples_split':[2,4,6]}
# random_state pins the estimator so the selected parameters are reproducible;
# n_jobs=-1 spreads the independent fits over all available cores.
gscv = GridSearchCV(GradientBoostingClassifier(random_state=0), params, cv=5, n_jobs=-1).fit(X, A)
gscv.best_params_

# previously recorded output (unseeded run):
# {'learning_rate': 0.01, 'max_depth': 2, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 150}

{'learning_rate': 0.01,
 'max_depth': 2,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 150}

In [13]:
# High-dim grid search over the GBM hyper-parameters.
# 432 parameter combinations x 5 folds, so this takes a few minutes to run.

# NOTE: this rebinds X, A and Y to the high-dimensional data. Re-running an earlier
# low-dim cell after this one will silently pick up these arrays instead.
X=highDim_dataset.iloc[:,2:].values
A=highDim_dataset['A'].values
Y=highDim_dataset['Y'].values

params = {'learning_rate':[0.01,0.05,0.1,0.5], 'max_depth': [1,2,3,4], 'n_estimators':[50,100,150],
          'min_samples_leaf':[1,3,5],'min_samples_split':[2,4,6]}
# random_state pins the estimator so the selected parameters are reproducible;
# n_jobs=-1 spreads the independent fits over all available cores.
gscv = GridSearchCV(GradientBoostingClassifier(random_state=0), params, cv=5, n_jobs=-1).fit(X, A)
gscv.best_params_


# previously recorded output (unseeded run):
# {'learning_rate': 0.05, 'max_depth': 1, 'min_samples_leaf': 5, 'min_samples_split': 2, 'n_estimators': 100}

{'learning_rate': 0.05,
 'max_depth': 1,
 'min_samples_leaf': 5,
 'min_samples_split': 2,
 'n_estimators': 100}