This Notebook contains the code for all the methods we used for this project.

The models are ordered in the following way:
1. Regression Estimate
2. Doubly Robust Estimation
3. Propensity Matching with Linear Propensity Score

Each model contains two analysis for both High and Low Dimension data.
At the end of this Notebook the reader will find a comparison table for the methods.

In [1]:
# importing packages used in this Notebook

import pandas as pd
import numpy as np
import time
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm

In [2]:
# real ATE are given:

real_low = 2.5
real_high = -3

# Regression Estimate

In [3]:
# loading data

low_dim = pd.read_csv('..\data\lowDim_dataset.csv')
high_dim = pd.read_csv('..\data\highDim_dataset.csv')

# inspecting data

low_dim.isna().sum().sum(),high_dim.isna().sum().sum(),low_dim.shape, high_dim.shape

(0, 0, (475, 24), (2000, 187))

## Low Dimension

In [4]:
# starting to measure run time for low dimension

start_time_low = time.time()

# deviding the data into treated and control groups

low_dim_treated = low_dim[low_dim['A'] == 1]
low_dim_treated = low_dim_treated.reset_index(drop = True)

low_dim_control = low_dim[low_dim['A'] == 0]
low_dim_control = low_dim_control.reset_index(drop = True)

# inspecting sizes:

len(low_dim_control), len(low_dim_treated)

(363, 112)

In [5]:
# running a regression for the treated group:

lr = LinearRegression()
X, y = low_dim_treated.iloc[:,2:], low_dim_treated.iloc[:,0]
lr.fit(X, y)
LinearRegression(copy_X=True, fit_intercept=True, normalize=False)
coef_treated_low = lr.coef_
intercept_treated_low =lr.intercept_

In [6]:
# running a regression for the control group:

lr = LinearRegression()
X, y = low_dim_control.iloc[:,2:], low_dim_control.iloc[:,0]
lr.fit(X, y)
LinearRegression(copy_X=True, fit_intercept=True, normalize=False)
coef_control_low = lr.coef_
intercept_control_low =lr.intercept_

In [7]:
# calculating fitted y's for both treated and control groups

fitted_y_treated_low = low_dim.iloc[:,2:].transpose().multiply(coef_treated_low, axis =0).sum() + intercept_treated_low
fitted_y_control_low = low_dim.iloc[:,2:].transpose().multiply(coef_control_low, axis =0).sum() + intercept_control_low

In [8]:
# calculating the difference b.w the treatment and control group

ate_low = (fitted_y_treated_low - fitted_y_control_low).mean()
# measuring accuracy:

accuracy_low = real_low - ate_low
# stopping the clock:

run_time_low = time.time() - start_time_low

## High Dimension

In [9]:
# starting to measure run time for low dimension

start_time_high = time.time()

# deviding the data into treated and control groups

high_dim_treated = high_dim[high_dim['A'] == 1]
high_dim_treated = high_dim_treated.reset_index(drop = True)

high_dim_control = high_dim[high_dim['A'] == 0]
high_dim_control = high_dim_control.reset_index(drop = True)

# inspecting sizes:

len(high_dim_control), len(high_dim_treated)

(1103, 897)

In [10]:
# running a regression for the treated group:

lr = LinearRegression()
X, y = high_dim_treated.iloc[:,2:], high_dim_treated.iloc[:,0]
lr.fit(X, y)
LinearRegression(copy_X=True, fit_intercept=True, normalize=False)
coef_treated_high = lr.coef_
intercept_treated_high =lr.intercept_

In [11]:
# running a regression for the control group:

lr = LinearRegression()
X, y = high_dim_control.iloc[:,2:], high_dim_control.iloc[:,0]
lr.fit(X, y)
LinearRegression(copy_X=True, fit_intercept=True, normalize=False)
coef_control_high = lr.coef_
intercept_control_high =lr.intercept_

In [12]:
# calculating fitted y's for both treated and control groups

fitted_y_treated_high = high_dim.iloc[:,2:].transpose().multiply(coef_treated_high, axis =0).sum() + intercept_treated_high
fitted_y_control_high = high_dim.iloc[:,2:].transpose().multiply(coef_control_high, axis =0).sum() + intercept_control_high

In [13]:
# calculating the difference b.w the treatment and control group

ate_high = (fitted_y_treated_high - fitted_y_control_high).mean()
# measuring accuracy

accuracy_high = real_high - ate_high
# stopping clock

run_time_high = time.time() - start_time_high

## Results

In [14]:
# building a df for the results:

regression_estimate_low = pd.Series(data = [run_time_low, ate_low, accuracy_low], 
                                   index = ['run_time','ate','accuracy']).rename('low')
regression_estimate_high = pd.Series(data = [run_time_high, ate_high, accuracy_high], 
                                     index = ['run_time','ate', 'accuracy']).rename('high')
results_regression_estimate = pd.DataFrame([regression_estimate_low, regression_estimate_high]).round(3)
results_regression_estimate = pd.concat({'regression_estimate': results_regression_estimate}, names=['method'])
results_regression_estimate

Unnamed: 0_level_0,Unnamed: 1_level_0,run_time,ate,accuracy
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
regression_estimate,low,0.078,2.527,-0.027
regression_estimate,high,0.159,-2.96,-0.04


# Doubly Robust Estimation

## Low Dimension

## High Dimension

# PSM

In [15]:
X_high = high_dim.drop(['A','Y'], axis = 1)
X_low = low_dim.drop(['A','Y'], axis = 1)

y_high = high_dim[['A']]
y_low = low_dim[['A']]

In [16]:
def best_para(data, C):
    X=data.drop(['A','Y'], axis = 1)
    y=data[['A']]
    diff=[]
    for c in C:
        clf = LogisticRegression(penalty='l1', C = c, solver = 'liblinear')
        clf.fit(X, y.values.ravel())
        ps_logit=clf.predict_log_proba(X)[:, 1]
        data['log_ps']=ps_logit
        treated=data[data['A']==1]
        control=data[data['A']==0]
        di=max(treated['log_ps'])-min(treated['log_ps'])
        dj=max(control['log_ps'])-min(control['log_ps'])
        diff.append(abs(di-dj))
    best_ind=diff.index(min(diff))
    best_c=C[best_ind]
    best_diff=diff[best_ind]
    return best_c, best_diff

In [17]:
def PSM(treated_df, control_df):
    
    treated_df.loc[:,'group']=None
    treated_df.loc[:,'control_Y']=None
    
    for i in range(len(treated_df)):
        diff_i=[]
        for j in range(len(control_df)):
            diff_i.append(abs(control_df.loc[j,'log_ps']-treated_df.loc[i,'log_ps']))

        ind_j=diff_i.index(min(diff_i))
        r=(max(diff_i)-min(diff_i))/5
        treated_df.loc[i,'control_Y']=control_df.loc[ind_j,'Y']
        if min(diff_i)<r:
            treated_df.loc[i,'group']=1
        elif min(diff_i)>=r and min(diff_i)<r*2:
            treated_df.loc[i,'group']=2
        elif min(diff_i)>=r*2 and min(diff_i)<r*3:
            treated_df.loc[i,'group']=3
        elif min(diff_i)>=r*3 and min(diff_i)<r*4:
            treated_df.loc[i,'group']=4
        else:
            treated_df.loc[i,'group']=5
    
    ATE=0
    for k in range(treated_df.loc[:,'group'].max()):
        group=treated_df[treated_df.loc[:,'group']==k+1]
        ATE=ATE+(group.loc[:,'Y']-group.loc[:,'control_Y']).mean()*len(group)/len(treated_df)
    
    return ATE

## Low Dimension

In [18]:
start_low = time.time()
clf_low = LogisticRegression(penalty='l1', C = 0.2, solver = 'liblinear')
clf_low.fit(X_low, y_low.values.ravel())
ps_logit_low = clf_low.predict_log_proba(X_low)[:, 1]
low_dim['log_ps']=ps_logit_low

In [19]:
treated_low = low_dim[low_dim['A']==1]
treated_low = treated_low.reset_index(drop = True)
control_low = low_dim[low_dim['A']==0]
control_low = control_low.reset_index(drop = True)

ate_psm_low = PSM(treated_low, control_low)
run_time_psm_low = time.time()-start_low
accuracy_psm_low = real_low - ate_psm_low


## High Dimension

In [20]:
start_high = time.time()
clf_high = LogisticRegression(penalty='l1', C = 0.5, solver = 'liblinear')
clf_high.fit(X_high, y_high.values.ravel())
ps_logit_high = clf_high.predict_log_proba(X_high)[:, 1]
high_dim['log_ps']=ps_logit_high

In [21]:
treated_high = high_dim[high_dim['A']==1]
treated_high = treated_high.reset_index(drop = True)
control_high = high_dim[high_dim['A']==0]
control_high = control_high.reset_index(drop = True)

ate_psm_high = PSM(treated_high, control_high)
accuracy_psm_high = real_high - ate_psm_high
run_time_psm_high = time.time()-start_high


## Results

In [22]:
# building a df for psm results:

psm_low = pd.Series(data = [run_time_psm_low, ate_psm_low, accuracy_psm_low], 
                                   index = ['run_time','ate','accuracy']).rename('low')

psm_high = pd.Series(data = [run_time_psm_high, ate_psm_high, accuracy_psm_high], 
                                     index = ['run_time','ate', 'accuracy']).rename('high')
results_psm = pd.DataFrame([psm_low, psm_high]).round(3)
results_psm = pd.concat({'psm': results_psm}, names=['method'])

results_psm

Unnamed: 0_level_0,Unnamed: 1_level_0,run_time,ate,accuracy
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
psm,low,0.954,2.506,-0.006
psm,high,31.696,-3.27,0.27


# Methods Comparison

In [30]:
results_final = pd.concat([results_regression_estimate, results_psm]).round(3)
results_final['accuracy'] = results_final['accuracy'].abs()

In [34]:
results_final

Unnamed: 0_level_0,Unnamed: 1_level_0,run_time,ate,accuracy
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
regression_estimate,low,0.078,2.527,0.027
regression_estimate,high,0.159,-2.96,0.04
psm,low,0.954,2.506,0.006
psm,high,31.696,-3.27,0.27
