# GR5241 Project4 
##  L1 penalized logistic regression + Doubly Robust Estimation 

In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import time
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression


###  Low Dimensional data
#### Data processing 
#### 1.1 Create X and Y variables for L1 Logistic Regression 

In [2]:
lowDim = pd.read_csv(r'../data/lowDim_dataset.csv')
X_low = lowDim.drop(['A','Y'], axis = 1)  
y_low = lowDim[['A']]

#### 1.2 Split data into training and testing sets

In [3]:
X_train_low, X_test_low, y_train_low, y_test_low = train_test_split(X_low, y_low, test_size=0.25, random_state=0)

#### 1.3 Standardize features
- Because the regularization penalty is comprised of the sum of the absolute value of the coefficients, we need to scale the data so the coefficients are all based on the same scale.

In [4]:
# Create a scaler object
sc = StandardScaler()
X_train_low_std = sc.fit_transform(X_train_low)
X_test_low_std = sc.transform(X_test_low)

#### 2. L1 penalized logistic regression 
#### 2.1 Parameter tunning 

In [5]:
C = [1, 0.95, 0.9, 0.85, 0.8, 0.75, 0.7, 0.65, 0.5, 0.3, 0.2, 0.1]
for c in C:
    clf = LogisticRegression(penalty='l1', C = c, solver = 'liblinear')
    clf.fit(X_train_low, y_train_low)
    print('C:', c)
    print('Training accuracy:', clf.score(X_train_low_std, y_train_low))
    print('Test accuracy:', clf.score(X_test_low_std, y_test_low))
    print('')   

C: 1
Training accuracy: 0.7837078651685393
Test accuracy: 0.7226890756302521

C: 0.95
Training accuracy: 0.7837078651685393
Test accuracy: 0.7142857142857143

C: 0.9
Training accuracy: 0.7837078651685393
Test accuracy: 0.7226890756302521

C: 0.85
Training accuracy: 0.7837078651685393
Test accuracy: 0.7142857142857143

C: 0.8
Training accuracy: 0.7837078651685393
Test accuracy: 0.7142857142857143

C: 0.75
Training accuracy: 0.7837078651685393
Test accuracy: 0.7142857142857143

C: 0.7
Training accuracy: 0.7808988764044944
Test accuracy: 0.7142857142857143

C: 0.65
Training accuracy: 0.7808988764044944
Test accuracy: 0.7226890756302521

C: 0.5
Training accuracy: 0.7837078651685393
Test accuracy: 0.7226890756302521

C: 0.3
Training accuracy: 0.7837078651685393
Test accuracy: 0.7394957983193278

C: 0.2
Training accuracy: 0.7837078651685393
Test accuracy: 0.7394957983193278

C: 0.1
Training accuracy: 0.7696629213483146
Test accuracy: 0.7478991596638656



  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)


In [6]:
start_time_l1_low = time.time()
# Best: C = 1
clf_low = LogisticRegression(penalty='l1', C = 1, solver = 'liblinear')

#### 2.2 Calculate propensity scores

In [7]:
clf_low.fit(X_low, y_low.values.ravel())
ps_low=clf_low.predict_proba(X_low)[:, 1]

#### 3. Doubly Robust Estimation for ATE

In [8]:
full_low_dim = lowDim.copy()
full_low_dim['PS']=pd.Series(ps_low, index=full_low_dim.index)

In [9]:
# deviding the low dimensional data into treated and control groups

lowDim_treated = lowDim[lowDim['A'] == 1]
lowDim_treated = lowDim_treated.reset_index(drop = True)

lowDim_control = lowDim[lowDim['A'] == 0]
lowDim_control = lowDim_control.reset_index(drop = True)

#Fit a regression model to get the estimation of y given T=1 and X 
X1_low_treated = lowDim_treated.drop(['Y'], axis = 1)
y_low_treated = lowDim_treated['Y']
lr_low_treated = LinearRegression().fit(X1_low_treated, y_low_treated)

# Fit a regression model to get the estimation of y given T=0and X 
X1_low_control = lowDim_control.drop(['Y'], axis = 1)
y_low_control = lowDim_control['Y']
lr_low_control = LinearRegression().fit(X1_low_control, y_low_control)

In [10]:
# Select all covariates and 'A' columns from full dataset
X_low_new = full_low_dim.drop(['Y','PS'], axis = 1)
m1_low= lr_low_treated.predict(X_low_new)
m0_low= lr_low_control.predict(X_low_new)
# join m1 and m0 to full_low_dim
full_low_dim['m1'] = pd.Series(m1_low, index = full_low_dim.index)
full_low_dim['m0'] = pd.Series(m0_low, index = full_low_dim.index)

In [11]:
def DRE(full_data):
    
    n = len(full_data.index)
    result1 = 0
    result2 = 0
    
    for i in range(n):
        result1 = result1 + (full_data['A'][i] * full_data['Y'][i] - (full_data['A'][i] - full_data['PS'][i])*full_data['m1'][i])/full_data['PS'][i]
        result2 = result2 + ((1-full_data['A'][i])* full_data['Y'][i] - (full_data['A'][i] - full_data['PS'][i])*full_data['m0'][i])/(1-full_data['PS'][i])
        
    ETA = 1/n*(result1-result2)
    
    return ETA

In [12]:
start_time_dre_low = time.time()
DRE(full_low_dim)

2.4041556552378998

In [13]:
print(f'The estimated ATE for low dimensional data is: {DRE(full_low_dim)}')
low_acc = abs(DRE(full_low_dim) - 2.5)/2.5 * 100
print(f'Accuracy of ATE estimation of low dimensional data is: {low_acc}')
l1_time_low = time.time() - start_time_l1_low 
print(f'Running time of L1 Logist Regression model on low dimensional data is: {l1_time_low} s')
dre_time_low = time.time() - start_time_dre_low
print(f'Running time of DRE model on low dimensional data is: {dre_time_low} s')


The estimated ATE for low dimensional data is: 2.4041556552378998
Accuracy of ATE estimation of low dimensional data is: 3.8337737904840097
Running time of L1 Logist Regression model on low dimensional data is: 0.43085384368896484 s
Running time of DRE model on low dimensional data is: 0.21741843223571777 s


###  High Dimensional data
#### 1. Data processing 
#### 1.1 Create X and Y variables for L1 Logistic Regression 


In [14]:
highDim = pd.read_csv(r'../data/highDim_dataset.csv')
X_high = highDim.drop(['A','Y'], axis = 1)
y_high = highDim[['A']]


#### 1.2 Split data into training and testing sets

In [15]:
X_train_high, X_test_high, y_train_high, y_test_high = train_test_split(X_high, y_high, test_size=0.25, random_state=0)

#### 1.3 Standardize features

In [16]:
X_train_high_std = sc.fit_transform(X_train_high)
X_test_high_std = sc.transform(X_test_high)

#### 2. L1 penalized logistic regression 
#### 2.1 Parameter tunning 

In [17]:
C = [.06, .05, .04, .03, .02, .01, 0.008, 0.005, 0.001]

for c in C:
    clf = LogisticRegression(penalty='l1', C = c, solver = 'liblinear')
    clf.fit(X_train_high, y_train_high)
    print('C:', c)
    print('Training accuracy:', clf.score(X_train_high_std, y_train_high))
    print('Test accuracy:', clf.score(X_test_high_std, y_test_high))
    print('')


  return f(**kwargs)


C: 0.06
Training accuracy: 0.618
Test accuracy: 0.606



  return f(**kwargs)


C: 0.05
Training accuracy: 0.614
Test accuracy: 0.596



  return f(**kwargs)


C: 0.04
Training accuracy: 0.6053333333333333
Test accuracy: 0.59



  return f(**kwargs)


C: 0.03
Training accuracy: 0.6086666666666667
Test accuracy: 0.602

C:

  return f(**kwargs)
  return f(**kwargs)


 0.02
Training accuracy: 0.604
Test accuracy: 0.598

C: 0.01
Training accuracy: 0.5886666666666667
Test accuracy: 0.596

C: 0.008
Training accuracy: 0.584
Test accuracy: 0.604



  return f(**kwargs)
  return f(**kwargs)


C: 0.005
Training accuracy: 0.5826666666666667
Test accuracy: 0.612

C: 0.001
Training accuracy: 0.5846666666666667
Test accuracy: 0.496



  return f(**kwargs)


#### 2.2 Best model 

In [18]:
# Best: C = 0.06
start_time_l1_high = time.time()
clf_high = LogisticRegression(penalty='l1', C = 0.06, solver = 'liblinear')

#### 2.3 Calculate propensity scores

In [19]:
clf_high.fit(X_high, y_high.values.ravel())
ps_high=clf_high.predict_proba(X_high)[:, 1]

#### 3. Doubly Robust Estimation Algorithm to calculate ATE

In [20]:
full_high_dim= highDim.copy()
full_high_dim['PS']=pd.Series(ps_high, index=full_high_dim.index)

In [21]:
# deviding the high dimensional data into treated and control groups
highDim_treated = highDim[highDim['A'] == 1]
highDim_treated = highDim_treated.reset_index(drop = True)

highDim_control = highDim[highDim['A'] == 0]
highDim_control = highDim_control.reset_index(drop = True)


# Fit a regression model to get the estimation of y given T=1 and X 
X1_high_treated = highDim_treated.drop(['Y'], axis = 1)
y_high_treated = highDim_treated['Y']
lr_high_treated = LinearRegression().fit(X1_high_treated, y_high_treated)


# Fit a regression model to get the estimation of y given T=0 and X 
X1_high_control = highDim_control.drop(['Y'], axis = 1)
y_high_control = highDim_control['Y']
lr_high_control = LinearRegression().fit(X1_high_control, y_high_control)

In [22]:
X_high_new = full_high_dim.drop(['Y','PS'], axis = 1)
m1_high= lr_high_treated.predict(X_high_new)
m0_high= lr_high_control.predict(X_high_new)
full_high_dim['m1'] = pd.Series(m1_high, index = full_high_dim.index)
full_high_dim['m0'] = pd.Series(m0_high, index = full_high_dim.index)

In [23]:
start_time_dre_high = time.time()
DRE(full_high_dim)

-3.097056214874587

In [24]:
print(f'The estimated ATE for low dimensional data is: {DRE(full_high_dim)}')
high_acc = (DRE(full_high_dim) - (-3))/(-3) * 100
print(f'Accuracy of ATE estimation of low dimensional data is: {high_acc}')
l1_time_high = time.time() - start_time_l1_high 
print(f'Running time of L1 Logist Regression model on low dimensional data is: {l1_time_high} s')
dre_time_high = time.time() - start_time_dre_high
print(f'Running time of DRE model on low dimensional data is: {dre_time_high} s')


The estimated ATE for low dimensional data is: -3.097056214874587
Accuracy of ATE estimation of low dimensional data is: 3.2352071624862275
Running time of L1 Logist Regression model on low dimensional data is: 1.397301435470581 s
Running time of DRE model on low dimensional data is: 0.6203784942626953 s


### Summary Table 

In [25]:
summary_table = pd.DataFrame(np.array([[l1_time_low, dre_time_low,low_acc], [l1_time_high, dre_time_high,high_acc]]),
                   columns=['l1 running time', 'DRE running time', 'Accuracy of ATE'],
                            index=['low dim','high dim'])


In [26]:
summary_table

Unnamed: 0,l1 running time,DRE running time,Accuracy of ATE
low dim,0.430854,0.217418,3.833774
high dim,1.397301,0.620378,3.235207
