# GR5241 Project4 
##  L1 penalized logistic regression + Doubly Robust Estimation 

In [60]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import time
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

In [8]:
#load data
lowDim = pd.read_csv('../data/lowDim_dataset.csv')
highDim = pd.read_csv('../data/highDim_dataset.csv')

In [11]:
# Create X and Y variables from origianal datasets for L1 Logistic Regression 
def logit_dataset (df):
    #all the covariates as X
    X = df.drop(['A','Y'], axis = 1)  
    y = df[['A']]
    return X, y

In [15]:
#Scaler training features for regression model
def std_feature (x_train, x_test):
    sc = StandardScaler()
    x_train_scaled = sc.fit_transform(x_train)
    x_test_scaled = sc.transform(x_test)
    return x_train_scaled,  x_test_scaled

###  Low Dimensional data
#### Data processing 


In [24]:
X_low = logit_dataset(lowDim)[0]
y_low = logit_dataset(lowDim)[1]

# Split data into training and testing sets
X_train_low, X_test_low, y_train_low, y_test_low = train_test_split(X_low, y_low, test_size=0.25, random_state=0)

In [27]:
# Standardize features
X_train_low_std =std_feature(X_train_low,X_test_low)[0]
X_test_low_std = std_feature(X_train_low,X_test_low)[1]

#### L1 penalized logistic regression for propencity scores


In [98]:
param_grid = {'C': [1, 0.95, 0.9, 0.85, 0.8, 0.75, 0.7, 0.5, 0.4, 0.3, 0.2, 0.25,0.1]}
clf = GridSearchCV(LogisticRegression(penalty='l1'), param_grid)
clf=LogisticRegression(penalty='l1',solver = 'liblinear')
clf_cv=GridSearchCV(clf,param_grid,cv=5)
clf_cv.fit(X_train_low_std, y_train_low.values.ravel())

print("tuned hpyerparameters :(best parameters) ",clf_cv.best_params_)
print("acuracy from cross_validation:",clf_cv.best_score_)
print("Testing accuracy",clf_cv.score(X_test_low_std, y_test_low))

tuned hpyerparameters :(best parameters)  {'C': 0.1}
acuracy from cross_validation: 0.7809076682316118
Testing accuracy 0.7394957983193278


In [99]:
start_time_l1_low = time.time()
# Best: C = 0.1
clf_low = LogisticRegression(penalty='l1', C = 0.1, solver = 'liblinear')
#Calculate propensity scores
clf_low.fit(X_low, y_low.values.ravel())
ps_low=clf_low.predict_proba(X_low)[:, 1]

#### Doubly Robust Estimation for ATE

In [100]:
full_low_dim = lowDim.copy()
full_low_dim['PS']=pd.Series(ps_low, index=full_low_dim.index)

In [101]:
# deviding the low dimensional data into treated and control groups

lowDim_treated = lowDim[lowDim['A'] == 1]
lowDim_treated = lowDim_treated.reset_index(drop = True)

lowDim_control = lowDim[lowDim['A'] == 0]
lowDim_control = lowDim_control.reset_index(drop = True)

#Fit a regression model to get the estimation of y given T=1 and X 
X1_low_treated = lowDim_treated.drop(['Y'], axis = 1)
y_low_treated = lowDim_treated['Y']
lr_low_treated = LinearRegression().fit(X1_low_treated, y_low_treated)

# Fit a regression model to get the estimation of y given T=0and X 
X1_low_control = lowDim_control.drop(['Y'], axis = 1)
y_low_control = lowDim_control['Y']
lr_low_control = LinearRegression().fit(X1_low_control, y_low_control)

In [102]:
# Select all covariates and 'A' columns from full dataset
X_low_new = full_low_dim.drop(['Y','PS'], axis = 1)
m1_low= lr_low_treated.predict(X_low_new)
m0_low= lr_low_control.predict(X_low_new)
# join m1 and m0 to full_low_dim
full_low_dim['m1'] = pd.Series(m1_low, index = full_low_dim.index)
full_low_dim['m0'] = pd.Series(m0_low, index = full_low_dim.index)

In [103]:
def DRE(full_data):
    
    n = len(full_data.index)
    result1 = 0
    result2 = 0
    
    for i in range(n):
        result1 = result1 + (full_data['A'][i] * full_data['Y'][i] - (full_data['A'][i] - full_data['PS'][i])*full_data['m1'][i])/full_data['PS'][i]
        result2 = result2 + ((1-full_data['A'][i])* full_data['Y'][i] - (full_data['A'][i] - full_data['PS'][i])*full_data['m0'][i])/(1-full_data['PS'][i])
        
    ETA = 1/n*(result1-result2)
    
    return ETA

In [85]:
start_time_dre_low = time.time()
DRE(full_low_dim)
low_acc = (DRE(full_low_dim) - 2.5)/2.5 * 100
l1_time_low = time.time() - start_time_l1_low 
dre_time_low = time.time() - start_time_dre_low



###  High Dimensional data
#### Data processing 



In [86]:
X_high = logit_dataset(highDim)[0]
y_high = logit_dataset(highDim)[1]

X_train_high, X_test_high, y_train_high, y_test_high = train_test_split(X_high, y_high, test_size=0.25, random_state=0)

# Standardize features
X_train_high_std =std_feature(X_train_high,X_test_high)[0]
X_test_high_std = std_feature(X_train_high,X_test_high)[1]

#### L1 penalized logistic regression for propencity scores


In [88]:
param_grid = {'C': [.06, .05, .04, .03, .02, .01, 0.008, 0.005, 0.001]}
clf = GridSearchCV(LogisticRegression(penalty='l1'), param_grid)
clf=LogisticRegression(penalty='l1',solver = 'liblinear')
clf_cv=GridSearchCV(clf,param_grid,cv=5)
clf_cv.fit(X_train_high_std, y_train_high.values.ravel())

print("tuned hpyerparameters :(best parameters) ",clf_cv.best_params_)
print("acuracy from cross_validation:",clf_cv.best_score_)
print("Testing accuracy",clf_cv.score(X_test_high_std, y_test_high))

tuned hpyerparameters :(best parameters)  {'C': 0.04}
acuracy from cross_validation: 0.6040000000000001
Testing accuracy 0.594


In [105]:
# Best: C = 0.04
start_time_l1_high = time.time()
clf_high = LogisticRegression(penalty='l1', C = 0.04, solver = 'liblinear')
#Calculate propensity scores
clf_high.fit(X_high, y_high.values.ravel())
ps_high=clf_high.predict_proba(X_high)[:, 1]

#### Doubly Robust Estimation Algorithm to calculate ATE

In [106]:
full_high_dim= highDim.copy()
full_high_dim['PS']=pd.Series(ps_high, index=full_high_dim.index)

In [107]:
# deviding the high dimensional data into treated and control groups
highDim_treated = highDim[highDim['A'] == 1]
highDim_treated = highDim_treated.reset_index(drop = True)

highDim_control = highDim[highDim['A'] == 0]
highDim_control = highDim_control.reset_index(drop = True)


# Fit a regression model to get the estimation of y given T=1 and X 
X1_high_treated = highDim_treated.drop(['Y'], axis = 1)
y_high_treated = highDim_treated['Y']
lr_high_treated = LinearRegression().fit(X1_high_treated, y_high_treated)


# Fit a regression model to get the estimation of y given T=0 and X 
X1_high_control = highDim_control.drop(['Y'], axis = 1)
y_high_control = highDim_control['Y']
lr_high_control = LinearRegression().fit(X1_high_control, y_high_control)

#Add m1 and m0 to dataset
X_high_new = full_high_dim.drop(['Y','PS'], axis = 1)
m1_high= lr_high_treated.predict(X_high_new)
m0_high= lr_high_control.predict(X_high_new)
full_high_dim['m1'] = pd.Series(m1_high, index = full_high_dim.index)
full_high_dim['m0'] = pd.Series(m0_high, index = full_high_dim.index)

In [92]:
start_time_dre_high = time.time()
DRE(full_high_dim)
high_acc = (DRE(full_high_dim) - (-3))/(-3) * 100
l1_time_high = time.time() - start_time_l1_high 
dre_time_high = time.time() - start_time_dre_high

### Summary Table 

In [108]:
summary_table = pd.DataFrame(np.array([[l1_time_low, dre_time_low,low_acc], [l1_time_high, dre_time_high,high_acc]]),
                   columns=['l1 running time', 'DRE running time', 'Accuracy of ATE'],
                            index=['low dim','high dim'])


In [109]:
summary_table

Unnamed: 0,l1 running time,DRE running time,Accuracy of ATE
low dim,15.548346,5.928025,5.814338
high dim,65.148969,0.327322,2.74712
