In [1]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import mahalanobis
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
import math
from sklearn.metrics import pairwise_distances

#if this doesn't run then do 'pip install causalinference' in command line
from causalinference import CausalModel

In [2]:
lowDim_dataset = pd.read_csv('../data/lowDim_dataset.csv')
highDim_dataset = pd.read_csv('../data/highDim_dataset.csv')

## A1) Propensity Score Full Matching

### 1. Mahalanobis Metric (Does not need propensity score)

Mahalanobis distance is 
$$D_{ij} = (X_i-X_j)^T\Sigma^{-1}(X_i-X_j)$$
where $\Sigma$ is the covariance matrix of $X$ in the pooled treatment and full control groups.

In [3]:
#testing CausalModel package matching function
#working with the low dimensional data first
cm = CausalModel(
    Y=lowDim_dataset['Y'].values, #response
    D=lowDim_dataset['A'].values, #treatment/control column
    X=lowDim_dataset.iloc[:,2:].values #features
)

In [4]:
cm.est_via_matching()
print(cm.estimates)
#not sure how to interpret this output


Treatment Effect Estimates: Matching

                     Est.       S.e.          z      P>|z|      [95% Conf. int.]
--------------------------------------------------------------------------------
           ATE      2.886      0.412      7.004      0.000      2.079      3.694
           ATC      2.738      0.467      5.862      0.000      1.822      3.654
           ATT      3.366      0.406      8.296      0.000      2.571      4.161



In [5]:
#estimate propensity score with logistic regression
#for some reason this doesn't output anything
#print(cm.est_propensity_s()['fitted'])

In [6]:
#trying to implement the mahalanobis metric
X=lowDim_dataset.iloc[:,2:].values
A=lowDim_dataset['A'].values
Y=lowDim_dataset['Y'].values

In [7]:
#creating the covariance matrix 
Sigma = np.cov(X)
#not invertible???
#Sigma_inv = np.linalg.inv(Sigma)
np.linalg.det(Sigma)

0.0

The above code isn't needed since it looks like pairwise_distances can find the Mahalanobis matrix for us.

In [8]:
dist_matrix_mahalanobis = pairwise_distances(X,metric='mahalanobis')

### 2. Propensity Score

In [9]:
#creating gbm model for calculating propensity score

gbm = GradientBoostingClassifier(n_estimators = 100).fit(X,A)
propensity_scores = [x[1] for x in gbm.predict_proba(X)]

#they're all around 0.5 so we need to tune the parameters
propensity_scores

[0.540793409015478,
 0.17089269398632936,
 0.41985342963971295,
 0.07422376382983786,
 0.378889542312114,
 0.06792209894384736,
 0.16947682454917307,
 0.6778160520303985,
 0.06792209894384736,
 0.6389571975765068,
 0.2378486335164228,
 0.15616952306410659,
 0.1443462465961906,
 0.17265633997093643,
 0.8477260974217318,
 0.24139645230527454,
 0.9271027704625037,
 0.6891962551709103,
 0.18753506240009332,
 0.13476971556655068,
 0.6383477464458966,
 0.12356084084030348,
 0.11765830328951386,
 0.16420805980402753,
 0.8088758824019727,
 0.2503252079431286,
 0.10477245499098467,
 0.16303994956206927,
 0.6016740953258133,
 0.10245855454129908,
 0.022095939338215326,
 0.2081773475417851,
 0.14405206127541179,
 0.06830297002273422,
 0.36249091132546124,
 0.0767418369572157,
 0.046620343823462,
 0.036025919111614076,
 0.5960701505585924,
 0.10684243776115881,
 0.015116716455536009,
 0.22467731137610122,
 0.13476971556655068,
 0.5795372676324342,
 0.16987636006912915,
 0.7950723857539634,
 0.1185

In [10]:
dist_matrix_propensity = pairwise_distances(np.array(propensity_scores).reshape(-1,1), metric='l1')

### 3. Linear Propensity Score

In [11]:
def logit(x):
    return math.log(x/(1-x))

In [12]:
linear_propensity_scores = [logit(x) for x in propensity_scores]

In [13]:
#negative values make sense since if you look at a graph of log(x/(1-x))
linear_propensity_scores

[0.16353713909146636,
 -1.5793137480831656,
 -0.323375105810584,
 -2.523548193693384,
 -0.49426421927200737,
 -2.6190549513132093,
 -1.589339644934546,
 0.7437532920197052,
 -2.6190549513132093,
 0.5708409511111768,
 -1.1645107019255785,
 -1.6870095141687707,
 -1.7796508969587532,
 -1.5669170142384061,
 1.7168766954378223,
 -1.1450386936554007,
 2.5430137882272144,
 0.7963644036301154,
 -1.4661069321541431,
 -1.8594281850101881,
 0.5682000673921362,
 -1.959133615412934,
 -2.0147947033393616,
 -1.6272454256463311,
 1.4427224369226528,
 -1.0968785976171258,
 -2.1452870229102925,
 -1.635781079770425,
 0.41244540265446245,
 -2.170200927210239,
 -3.790017716450974,
 -1.3359470942466938,
 -1.7820347841730169,
 -2.6130544361147217,
 -0.5645691319005369,
 -2.4874618733849654,
 -3.017976195759233,
 -3.286825752483013,
 0.38911713953103255,
 -2.123407801874593,
 -4.176721958759286,
 -1.2386141139782199,
 -1.8594281850101881,
 0.32087412215482497,
 -1.5865037747358959,
 1.3557763462276236,
 -2.00

In [14]:
dist_matrix_linear = pairwise_distances(np.array(linear_propensity_scores).reshape(-1,1), metric='l1')

## Matching

Now we actually need to do full matching using these distance matrices.
https://projecteuclid.org/download/pdfview_1/euclid.ss/1280841730