This Notebook contains the code for all the methods we used for this project.

The models are ordered in the following way:
1. Regression Estimate
2. Doubly Robust Estimation
3. Propensity Matching with Linear Propensity Score

Each model contains two analyses: one for the High Dimension data and one for the Low Dimension data.
At the end of this Notebook the reader will find a comparison table for the methods.

In [1]:
# importing packages used in this Notebook

import pandas as pd
import numpy as np
import time
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm

In [2]:
# Ground-truth average treatment effects supplied with the two datasets;
# every estimate below is compared against these values.

real_low, real_high = 2.5, -3

# Regression Estimate

In [3]:
# loading data
# Forward slashes are used in the relative paths: they resolve on Windows, Linux
# and macOS alike, and avoid backslash sequences such as "\d" or "\h" that Python
# treats as (invalid) string escapes.

low_dim = pd.read_csv('../data/lowDim_dataset.csv')
high_dim = pd.read_csv('../data/highDim_dataset.csv')

# inspecting data: total number of missing values and (rows, columns) of each frame

low_dim.isna().sum().sum(), high_dim.isna().sum().sum(), low_dim.shape, high_dim.shape

(0, 0, (475, 24), (2000, 187))

## Low Dimension

In [4]:
# start the wall-clock timer for the low-dimension analysis

start_time_low = time.time()

# dividing the data by the treatment indicator 'A' (1 -> treated, 0 -> control);
# each subset is re-indexed from zero

low_dim_treated = low_dim.loc[low_dim['A'] == 1].reset_index(drop=True)
low_dim_control = low_dim.loc[low_dim['A'] == 0].reset_index(drop=True)

# group sizes, displayed as (control, treated)

len(low_dim_control), len(low_dim_treated)

(363, 112)

In [5]:
# running a regression for the treated group:
# ordinary least squares of the outcome on the covariates, fitted on treated rows only.
# NOTE(review): assumes column 0 holds the outcome and column 1 the treatment
# indicator 'A', so iloc[:, 2:] selects covariates only -- confirm against the CSV.

lr = LinearRegression()
X, y = low_dim_treated.iloc[:, 2:], low_dim_treated.iloc[:, 0]
lr.fit(X, y)

# keep the fitted parameters; `lr`, `X` and `y` are reused by the next regression
coef_treated_low = lr.coef_
intercept_treated_low = lr.intercept_

In [6]:
# running a regression for the control group:
# ordinary least squares of the outcome on the covariates, fitted on control rows only.
# NOTE(review): assumes column 0 holds the outcome and column 1 the treatment
# indicator 'A', so iloc[:, 2:] selects covariates only -- confirm against the CSV.

lr = LinearRegression()
X, y = low_dim_control.iloc[:, 2:], low_dim_control.iloc[:, 0]
lr.fit(X, y)

# keep the fitted parameters of the control-group model
coef_control_low = lr.coef_
intercept_control_low = lr.intercept_

In [7]:
# Predicted outcome of every row in the full low-dimension sample under each of
# the two fitted models: covariates (columns 2 onward) are weighted by the model's
# coefficients, summed per row, and the model's intercept is added.

fitted_y_treated_low = (
    low_dim.iloc[:, 2:].T
    .mul(coef_treated_low, axis='index')
    .sum()
    + intercept_treated_low
)
fitted_y_control_low = (
    low_dim.iloc[:, 2:].T
    .mul(coef_control_low, axis='index')
    .sum()
    + intercept_control_low
)

In [8]:
# regression-estimate ATE: mean, over all rows, of the gap between the outcome
# predicted by the treated-group model and by the control-group model

ate_low = fitted_y_treated_low.sub(fitted_y_control_low).mean()

# "accuracy" is the signed error: given real ATE minus the estimate (0 = exact)

accuracy_low = real_low - ate_low

# stop the timer started at the top of the low-dimension section

run_time_low = time.time() - start_time_low

## High Dimension

In [9]:
# start the wall-clock timer for the high-dimension analysis

start_time_high = time.time()

# dividing the data by the treatment indicator 'A' (1 -> treated, 0 -> control);
# each subset is re-indexed from zero

high_dim_treated = high_dim.loc[high_dim['A'] == 1].reset_index(drop=True)
high_dim_control = high_dim.loc[high_dim['A'] == 0].reset_index(drop=True)

# group sizes, displayed as (control, treated)

len(high_dim_control), len(high_dim_treated)

(1103, 897)

In [10]:
# running a regression for the treated group:
# ordinary least squares of the outcome on the covariates, fitted on treated rows only.
# NOTE(review): assumes column 0 holds the outcome and column 1 the treatment
# indicator 'A', so iloc[:, 2:] selects covariates only -- confirm against the CSV.

lr = LinearRegression()
X, y = high_dim_treated.iloc[:, 2:], high_dim_treated.iloc[:, 0]
lr.fit(X, y)

# keep the fitted parameters; `lr`, `X` and `y` are reused by the next regression
coef_treated_high = lr.coef_
intercept_treated_high = lr.intercept_

In [11]:
# running a regression for the control group:
# ordinary least squares of the outcome on the covariates, fitted on control rows only.
# NOTE(review): assumes column 0 holds the outcome and column 1 the treatment
# indicator 'A', so iloc[:, 2:] selects covariates only -- confirm against the CSV.

lr = LinearRegression()
X, y = high_dim_control.iloc[:, 2:], high_dim_control.iloc[:, 0]
lr.fit(X, y)

# keep the fitted parameters of the control-group model
coef_control_high = lr.coef_
intercept_control_high = lr.intercept_

In [12]:
# Predicted outcome of every row in the full high-dimension sample under each of
# the two fitted models: covariates (columns 2 onward) are weighted by the model's
# coefficients, summed per row, and the model's intercept is added.

fitted_y_treated_high = (
    high_dim.iloc[:, 2:].T
    .mul(coef_treated_high, axis='index')
    .sum()
    + intercept_treated_high
)
fitted_y_control_high = (
    high_dim.iloc[:, 2:].T
    .mul(coef_control_high, axis='index')
    .sum()
    + intercept_control_high
)

In [13]:
# regression-estimate ATE: mean, over all rows, of the gap between the outcome
# predicted by the treated-group model and by the control-group model

ate_high = fitted_y_treated_high.sub(fitted_y_control_high).mean()

# "accuracy" is the signed error: given real ATE minus the estimate (0 = exact)

accuracy_high = real_high - ate_high

# stop the timer started at the top of the high-dimension section

run_time_high = time.time() - start_time_high

In [16]:
# Summary table for the regression estimate: one row per dataset ('low', 'high'),
# with the run time, the estimated ATE and the signed error, rounded to 3 decimals.

regression_estimate_low = pd.Series(
    {'run_time': run_time_low, 'ate': ate_low, 'accuracy': accuracy_low},
    name='low',
)
regression_estimate_high = pd.Series(
    {'run_time': run_time_high, 'ate': ate_high, 'accuracy': accuracy_high},
    name='high',
)
results_regression_estimate = pd.DataFrame(
    [regression_estimate_low, regression_estimate_high]
).round(3)

results_regression_estimate

Unnamed: 0,run_time,ate,accuracy
low,0.16,2.527,-0.027
high,0.223,-2.96,-0.04


In [None]:
#