In [1]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import mahalanobis
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
import math
from sklearn.metrics import pairwise_distances

#if this doesn't run then do 'pip install causalinference' in command line
from causalinference import CausalModel

In [2]:
# Load the two study datasets (low- and high-dimensional covariate sets)
# from the shared data directory, relative to this notebook's location.
lowDim_dataset = pd.read_csv(filepath_or_buffer='../data/lowDim_dataset.csv')
highDim_dataset = pd.read_csv(filepath_or_buffer='../data/highDim_dataset.csv')

## A1) Propensity Score Full Matching

### 1. Mahalanobis Metric (Does not need propensity score)

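The Mahalanobis distance between units $i$ and $j$ is
$$D_{ij} = (X_i-X_j)^T\Sigma^{-1}(X_i-X_j)$$
where $\Sigma$ is the covariance matrix of $X$ in the pooled treatment and full control groups. Note that this is the squared (quadratic) form; `scipy.spatial.distance.mahalanobis` returns its square root, which preserves the ordering of pairwise distances.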

In [3]:
# Pull the modelling inputs out of the low-dimensional data as NumPy arrays.
# X: covariate matrix, taken positionally as every column from index 2 onward.
# NOTE(review): presumably the first two columns are Y and A -- confirm against the CSV.
X = lowDim_dataset.iloc[:, 2:].values
# A: the column named 'A' (used below as the target of the propensity model).
A = lowDim_dataset.loc[:, 'A'].values
# Y: the column named 'Y'.
Y = lowDim_dataset.loc[:, 'Y'].values

In [4]:
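#don't need this anymore (kept for reference: pairwise Mahalanobis distances between the rows of X)
#dist_matrix_mahalanobis = pairwise_distances(X,metric='mahalanobis')

#full matching (sketch only, not executed in this notebook):
#fullmatch(match_on(A~X,data=lowDim_dataset,method='mahalanobis'),data=lowDim_dataset)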

### 2. Propensity Score

In [5]:
# Fit a gradient-boosted classifier of A on the covariates X; its predicted
# class probability is used as the propensity score.
# random_state is pinned so the fitted trees -- and every score derived from
# them -- are identical on each Restart & Run All.
gbm = GradientBoostingClassifier(n_estimators=100, random_state=0).fit(X, A)

# predict_proba orders its columns like gbm.classes_, so column 1 is the
# probability of the second class.
# NOTE(review): assumes A is coded 0/1 so that this is P(A = 1 | X) -- confirm.
# Kept as a plain list of NumPy floats, matching what later cells iterate over.
propensity_scores = list(gbm.predict_proba(X)[:, 1])

In [6]:
# New frame: the low-dimensional data plus a 'propensity_score' column.
# DataFrame.assign returns a copy, so lowDim_dataset itself is left untouched.
lowDim_dataset_propensity = lowDim_dataset.assign(propensity_score=propensity_scores)

# Full matching on the propensity score (sketch only, not executed here):
#matchit(A~propensity_scores,data=lowDim_dataset_propensity,method='full')
# or, equivalently:
#fullmatch(match_on(A~propensity_scores,data=lowDim_dataset_propensity,method='euclidean'),data=lowDim_dataset_propensity,method='full')

### 3. Linear Propensity Score

In [7]:
def logit(x, eps=1e-12):
    """Return the log-odds log(x / (1 - x)) of a probability.

    Probabilities at or very near the boundaries are clamped to
    [eps, 1 - eps] before the transform, so an estimated score of exactly
    0 or 1 yields a large finite value instead of a math domain error,
    a division by zero, or an infinity.

    Parameters
    ----------
    x : float
        Probability in the closed interval [0, 1].
    eps : float, optional
        Clamp width applied at both ends of the interval. Values of ``x``
        inside (eps, 1 - eps) are transformed unchanged.

    Returns
    -------
    float
        The log-odds of ``x``; negative for x < 0.5, zero at 0.5,
        positive for x > 0.5.

    Raises
    ------
    ValueError
        If ``x`` lies outside [0, 1].
    """
    # Written as two one-sided comparisons so NaN still propagates as NaN
    # (every comparison with NaN is False) rather than raising.
    if x < 0 or x > 1:
        raise ValueError("logit is only defined for probabilities in [0, 1], got %r" % (x,))

    # Clamp saturated probabilities away from the poles at 0 and 1.
    if x < eps:
        x = eps
    elif x > 1 - eps:
        x = 1 - eps

    return math.log(x / (1 - x))

In [8]:
# Negative values are expected here: log(x/(1-x)) is below zero whenever
# the propensity score x is below 0.5.
linear_propensity_scores = list(map(logit, propensity_scores))

In [9]:
# New frame: the low-dimensional data plus a 'linear_propensity_score' column.
# DataFrame.assign returns a copy, so lowDim_dataset itself is left untouched.
lowDim_dataset_linear_propensity = lowDim_dataset.assign(
    linear_propensity_score=linear_propensity_scores
)

# Full matching on the linear propensity score (sketch only, not executed here):
#matchit(A~linear_propensity_scores,data=lowDim_dataset_linear_propensity,method='full')
# or, equivalently:
#fullmatch(match_on(A~linear_propensity_scores,data=lowDim_dataset_linear_propensity,method='euclidean'),data=lowDim_dataset_linear_propensity,method='full')

## Matching

Now we actually need to perform full matching using the distance measures defined above (Mahalanobis, propensity score, and linear propensity score).

Reference: https://projecteuclid.org/download/pdfview_1/euclid.ss/1280841730