Propensity Score Matching(Full matching) - Linear Propensity Score

1. Data Processing

In [1]:
import numpy as np
import pandas as pd
import time
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
low_dim = pd.read_csv('..\data\lowDim_dataset.csv')
high_dim = pd.read_csv('..\data\highDim_dataset.csv')

In [3]:
X_high = high_dim.drop(['A','Y'], axis = 1)
X_low = low_dim.drop(['A','Y'], axis = 1)

y_high = high_dim[['A']]
y_low = low_dim[['A']]

2. Choosing the best parameter to calculate propensity score

In [4]:
def best_para(data, C):
    X=data.drop(['A','Y'], axis = 1)
    y=data[['A']]
    diff=[]
    for c in C:
        clf = LogisticRegression(penalty='l1', C = c, solver = 'liblinear')
        clf.fit(X, y.values.ravel())
        ps_logit=clf.predict_log_proba(X)[:, 1]
        data['log_ps']=ps_logit
        treated=data[data['A']==1]
        control=data[data['A']==0]
        di=max(treated['log_ps'])-min(treated['log_ps'])
        dj=max(control['log_ps'])-min(control['log_ps'])
        diff.append(abs(di-dj))
    best_ind=diff.index(min(diff))
    best_c=C[best_ind]
    best_diff=diff[best_ind]
    return best_c, best_diff

3. Propensity score matching model

In [5]:
def PSM(treated_df, control_df):
    
    treated_df.loc[:,'group']=None
    treated_df.loc[:,'control_Y']=None
    
    for i in range(len(treated_df)):
        diff_i=[]
        for j in range(len(control_df)):
            diff_i.append(abs(control_df.loc[j,'log_ps']-treated_df.loc[i,'log_ps']))

        ind_j=diff_i.index(min(diff_i))
        r=(max(diff_i)-min(diff_i))/5
        treated_df.loc[i,'control_Y']=control_df.loc[ind_j,'Y']
        if min(diff_i)<r:
            treated_df.loc[i,'group']=1
        elif min(diff_i)>=r and min(diff_i)<r*2:
            treated_df.loc[i,'group']=2
        elif min(diff_i)>=r*2 and min(diff_i)<r*3:
            treated_df.loc[i,'group']=3
        elif min(diff_i)>=r*3 and min(diff_i)<r*4:
            treated_df.loc[i,'group']=4
        else:
            treated_df.loc[i,'group']=5
    
    ATE=0
    for k in range(treated_df.loc[:,'group'].max()):
        group=treated_df[treated_df.loc[:,'group']==k+1]
        ATE=ATE+(group.loc[:,'Y']-group.loc[:,'control_Y']).mean()*len(group)/len(treated_df)
    
    return ATE

4. Performance on Low Dimension Data

In [6]:
start_low = time.time()
clf_low = LogisticRegression(penalty='l1', C = 0.2, solver = 'liblinear')
clf_low.fit(X_low, y_low.values.ravel())
ps_logit_low = clf_low.predict_log_proba(X_low)[:, 1]
low_dim['log_ps']=ps_logit_low

In [7]:
treated_low = low_dim[low_dim['A']==1]
treated_low = treated_low.reset_index(drop = True)
control_low = low_dim[low_dim['A']==0]
control_low = control_low.reset_index(drop = True)

PSM(treated_low, control_low)

2.5058765993180043

In [8]:
time.time()-start_low

0.8238942623138428

5. Performance on High Dimension Data

In [9]:
start_high = time.time()
clf_high = LogisticRegression(penalty='l1', C = 0.5, solver = 'liblinear')
clf_high.fit(X_high, y_high.values.ravel())
ps_logit_high = clf_high.predict_log_proba(X_high)[:, 1]
high_dim['log_ps']=ps_logit_high

In [10]:
treated_high = high_dim[high_dim['A']==1]
treated_high = treated_high.reset_index(drop = True)
control_high = high_dim[high_dim['A']==0]
control_high = control_high.reset_index(drop = True)

PSM(treated_high, control_high)

-2.932560148763459

In [11]:
time.time()-start_high

24.316996812820435