# GR5241 Project4 
## Doubly Robust Estimation + L1 penalized logistic regression

### L1 penalized logistic regression to calculate propensity score

In [1]:
# Import required libraries
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


#### Data Preprocessing

In [2]:
%pwd 
%cd /Users/duansiyu/Downloads

/Users/duansiyu/Downloads


In [5]:
# Load data

highDim = pd.read_csv(r'C:\Users\zlj01\Documents\Columbia University\GR5243\project 4\Data\highDim_dataset.csv')
lowDim = pd.read_csv(r'C:\Users\zlj01\Documents\Columbia University\GR5243\project 4\Data\lowDim_dataset.csv')

# View high dimensional data
highDim.head()

Unnamed: 0,Y,A,V1,V2,V3,V4,V5,V6,V7,V8,...,V176,V177,V178,V179,V180,V181,V182,V183,V184,V185
0,-11.682472,1,0,1,2,16,3,-1,13,-0.13,...,5,7,8,6,8,-1,-1,-1,-1,-1
1,-13.176546,0,1,1,12,14,14,14,13,0.24,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
2,-2.195401,1,0,1,21,22,10,10,14,0.27,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
3,-0.005454,1,1,1,9,20,11,2,10,0.09,...,-10,-10,-10,-10,-10,-10,-10,-10,-10,-10
4,-1.987538,1,1,1,7,16,16,11,6,0.15,...,70,70,80,70,80,-10,-10,-10,-10,-10


In [6]:
# View low dimensional data
lowDim.head()

Unnamed: 0,Y,A,V1,V2,V3,V4,V5,V6,V7,V8,...,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22
0,19.678858,0,1.59,0.0,0.0,0.0,0.24,1.35,0.73,2.58,...,0.12,0.0,4.55,0.0,1.72,0.0,0.49,0.98,0.0,1.309683
1,17.842989,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.62,...,0.27,0.0,4.87,0.0,0.81,0.27,0.27,0.0,0.0,1.719547
2,22.108788,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.12,0.0,0.0,0.0,2.12,0.99621
3,15.355899,0,0.0,0.0,0.0,0.56,0.0,0.0,0.0,0.0,...,0.0,0.0,1.12,0.0,0.0,0.0,0.0,0.0,0.0,1.504077
4,16.787813,1,1.81,0.0,0.0,0.0,0.0,0.0,0.0,1.81,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.327864


In [7]:
# Create X from the features
X_high = highDim.drop(['A','Y'], axis = 1)
X_low = lowDim.drop(['A','Y'], axis = 1)

# Create y from output
y_high = highDim[['A']]
y_low = lowDim[['A']]

#### Split The Data Into Training And Test Sets

In [8]:
# Split The Data Into Training And Test Sets
X_train_high, X_test_high, y_train_high, y_test_high = train_test_split(X_high, y_high, test_size=0.25, random_state=0)
X_train_low, X_test_low, y_train_low, y_test_low = train_test_split(X_low, y_low, test_size=0.25, random_state=0)

#### Standardize Features

Because the regularization penalty is comprised of the sum of the absolute value of the coefficients, we need to scale the data so the coefficients are all based on the same scale.

In [9]:
# Create a scaler object
sc = StandardScaler()

# High Dimensional data
# Fit the scaler to the training data and transform
X_train_high_std = sc.fit_transform(X_train_high)

# Apply the scaler to the test data
X_test_high_std = sc.transform(X_test_high)

# Low Dimensional data
# Fit the scaler to the training data and transform
X_train_low_std = sc.fit_transform(X_train_low)

# Apply the scaler to the test data
X_test_low_std = sc.transform(X_test_low)

#### Fit logistic regression with a L1 penalty

In [None]:
C = [.06, .05, .04, .03, .02, .01, 0.008, 0.005, 0.001]

# High Dimensional data
for c in C:
    clf = LogisticRegression(penalty='l1', C = c, solver = 'liblinear')
    clf.fit(X_train_high, y_train_high)
    print('C:', c)
    print('Coefficient of each feature:', clf.coef_)
    print('Training accuracy:', clf.score(X_train_high_std, y_train_high))
    print('Test accuracy:', clf.score(X_test_high_std, y_test_high))
    print('')



#### Best model for high dimensional data

In [10]:
# Best: C = 0.06
clf_high = LogisticRegression(penalty='l1', C = 0.06, solver = 'liblinear')

In [None]:
C = [1, 0.95, 0.9, 0.85, 0.8, 0.75, 0.7, 0.65, 0.5, 0.3, 0.2, 0.1]

# Low Dimensional data
for c in C:
    clf = LogisticRegression(penalty='l1', C = c, solver = 'liblinear')
    clf.fit(X_train_low, y_train_low)
    print('C:', c)
    print('Coefficient of each feature:', clf.coef_)
    print('Training accuracy:', clf.score(X_train_low_std, y_train_low))
    print('Test accuracy:', clf.score(X_test_low_std, y_test_low))
    print('')
    

#### Best model for low dimensional data

In [11]:
# Best: C = 1
clf_low = LogisticRegression(penalty='l1', C = 1, solver = 'liblinear')

In [12]:
# High dimensional propensity score
clf_high.fit(X_high, y_high)
ps_high=clf_high.predict_proba(X_high)[:, 1]
len(ps_high)

  return f(**kwargs)


2000

In [13]:
# Low dimensional propensity score
clf_low.fit(X_low, y_low)
ps_low=clf_low.predict_proba(X_low)[:, 1]
type(ps_low)

  return f(**kwargs)


numpy.ndarray

### Doubly Robust Estimation  Algorithm for ATE

#### Add propensity scores to the data frame

In [14]:
full_high_dim= highDim.copy()
full_low_dim = lowDim.copy()

In [15]:
full_high_dim['PS']=pd.Series(ps_high, index=full_high_dim.index)
full_high_dim.head()
full_low_dim['PS']=pd.Series(ps_low, index=full_low_dim.index)
full_low_dim.head()

Unnamed: 0,Y,A,V1,V2,V3,V4,V5,V6,V7,V8,...,V14,V15,V16,V17,V18,V19,V20,V21,V22,PS
0,19.678858,0,1.59,0.0,0.0,0.0,0.24,1.35,0.73,2.58,...,0.0,4.55,0.0,1.72,0.0,0.49,0.98,0.0,1.309683,0.620715
1,17.842989,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.62,...,0.0,4.87,0.0,0.81,0.27,0.27,0.0,0.0,1.719547,0.417021
2,22.108788,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,2.12,0.0,0.0,0.0,2.12,0.99621,0.172024
3,15.355899,0,0.0,0.0,0.0,0.56,0.0,0.0,0.0,0.0,...,0.0,1.12,0.0,0.0,0.0,0.0,0.0,0.0,1.504077,0.14491
4,16.787813,1,1.81,0.0,0.0,0.0,0.0,0.0,0.0,1.81,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.327864,0.166764


#### Calculate ATE 

##### Low dimensional Data

In [24]:
# deviding the data into treated and control groups

lowDim_treated = lowDim[lowDim['A'] == 1]
lowDim_treated = lowDim_treated.reset_index(drop = True)

lowDim_control = lowDim[lowDim['A'] == 0]
lowDim_control = lowDim_control.reset_index(drop = True)

#Fit a glm model to get the estimation of y given T and X 
#Treated group
X1_low_treated = lowDim_treated.drop(['Y'], axis = 1)
y_low_treated = lowDim_treated['Y']

X1_low_treated = sm.add_constant(X1_low_treated)
lr_low_treated = sm.OLS(y_low_treated, X1_low_treated).fit()
est_low_treated = lr_low_treated.predict(X1_low_treated)

# Control group
# Fit a regression model to get the estimation of y given T and X 
X1_low_control = lowDim_control.drop(['Y'], axis = 1)
y_low_control = lowDim_control['Y']

X1_low_control = sm.add_constant(X1_low_control)
lr_low_control = sm.OLS(y_low_control, X1_low_control).fit()
est_low_control = lr_low_control.predict(X1_low_control)

##### High dimensional Data

In [25]:
# deviding the data into treated and control groups
highDim_treated = highDim[highDim['A'] == 1]
highDim_treated = highDim_treated.reset_index(drop = True)

highDim_control = highDim[highDim['A'] == 0]
highDim_control = highDim_control.reset_index(drop = True)

# Treated group
# Fit a regression model to get the estimation of y given T and X 
X1_high_treated = highDim_treated.drop(['Y'], axis = 1)
y_high_treated = highDim_treated['Y']

X1_high_treated = sm.add_constant(X1_high_treated)
lr_high_treated = sm.OLS(y_high_treated, X1_high_treated).fit()
est_high_treated = lr_high_treated.predict(X1_high_treated)

# Control group
# Fit a regression model to get the estimation of y given T and X 
X1_high_control = highDim_control.drop(['Y'], axis = 1)
y_high_control = highDim_control['Y']

X1_high_control = sm.add_constant(X1_high_control)
lr_high_control = sm.OLS(y_high_control, X1_high_control, family = sm.families.Binomial()).fit()
est_high_control = lr_high_control.predict(X1_high_control)

In [30]:
highDim['A'][3]

1

In [31]:
# Doubly Robust Estimation function

def DRE(df_treated, df_control, est_treated, est_control):
    
    n1 = len(df_treated.index) + 1
    n2 = len(df_control.index) + 1
    result1 = 0
    result2 = 0
    
    for i in range(n1):
        result1 = result1 + (df_treated['A'][i] * df_treated['Y'][i] 
                             - (df_treated['A'][i] - df_treated['PS'][i])*est_treated[i])/df_treated['PS'][i]
        
    for i in range(n2):
        result2 = result2 + (df_control['A'][i] * df_control['Y'][i] 
                             - (df_control['A'][i] - df_control['PS'][i])*est_control[i])/df_control['PS'][i]
        
    result = 1/n1*result1-1/n2*result2
    
    return result

In [33]:
print(DRE(highDim_treated, highDim_control, est_high_treated, est_high_control))

KeyError: 'PS'