# GR5241 Project4 
##  L1 penalized logistic regression + Doubly Robust Estimation 

In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import time
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression


###  Low Dimensional data
#### Data processing 
#### 1.1 Create X and Y variables for L1 Logistic Regression 

In [2]:
lowDim = pd.read_csv(r'../data/lowDim_dataset.csv')
X_low = lowDim.drop(['A','Y'], axis = 1)  
y_low = lowDim[['A']]

#### 1.2 Split data into training and testing sets

In [3]:
X_train_low, X_test_low, y_train_low, y_test_low = train_test_split(X_low, y_low, test_size=0.25, random_state=0)

#### 1.3 Standardize features
- Because the regularization penalty is comprised of the sum of the absolute value of the coefficients, we need to scale the data so the coefficients are all based on the same scale.

In [4]:
# Create a scaler object
sc = StandardScaler()
X_train_low_std = sc.fit_transform(X_train_low)
X_test_low_std = sc.transform(X_test_low)

#### 2. L1 penalized logistic regression 
#### 2.1 Parameter tunning 

In [5]:
C = [1, 0.95, 0.9, 0.85, 0.8, 0.75, 0.7, 0.65, 0.5, 0.3, 0.2, 0.1]
for c in C:
    clf = LogisticRegression(penalty='l1', C = c, solver = 'liblinear')
    clf.fit(X_train_low, y_train_low)
    print('C:', c)
    print('Coefficient of each feature:', clf.coef_)
    print('Training accuracy:', clf.score(X_train_low_std, y_train_low))
    print('Test accuracy:', clf.score(X_test_low_std, y_test_low))
    print('')   

C: 1
Coefficient of each feature: [[ 0.34419367  0.43238251  0.73102716 -0.15445403  0.          0.13306032
   0.          0.26229326  0.          0.33354899 -0.0579125   0.
   0.04015877 -0.50944388  0.14782478  0.          0.24494353  0.10429014
   0.          0.          0.00275068  0.41842302]]
Training accuracy: 0.7837078651685393
Test accuracy: 0.7226890756302521

C: 0.95
Coefficient of each feature: [[ 3.41394331e-01  4.19164596e-01  7.21203160e-01 -1.47436611e-01
   0.00000000e+00  1.30737853e-01  0.00000000e+00  2.60321521e-01
   0.00000000e+00  3.21937086e-01 -4.60120791e-02  0.00000000e+00
   3.64427152e-02 -5.04450999e-01  1.46549264e-01  0.00000000e+00
   2.44899467e-01  1.03654487e-01  0.00000000e+00  0.00000000e+00
   2.75226924e-04  4.16449548e-01]]
Training accuracy: 0.7837078651685393
Test accuracy: 0.7142857142857143

C: 0.9
Coefficient of each feature: [[ 0.33881124  0.40536357  0.710973   -0.13966154  0.          0.12831076
   0.          0.25802403  0.          0.

  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)


In [37]:
start_time_l1_low = time.time()
# Best: C = 1
clf_low = LogisticRegression(penalty='l1', C = 1, solver = 'liblinear')

#### 2.2 Calculate propensity scores

In [7]:
clf_low.fit(X_low, y_low.values.ravel())
ps_low=clf_low.predict_proba(X_low)[:, 1]

#### 3. Doubly Robust Estimation for ATE

In [8]:
full_low_dim = lowDim.copy()
full_low_dim['PS']=pd.Series(ps_low, index=full_low_dim.index)

In [9]:
# deviding the low dimensional data into treated and control groups

lowDim_treated = lowDim[lowDim['A'] == 1]
lowDim_treated = lowDim_treated.reset_index(drop = True)

lowDim_control = lowDim[lowDim['A'] == 0]
lowDim_control = lowDim_control.reset_index(drop = True)

#Fit a regression model to get the estimation of y given T=1 and X 
X1_low_treated = lowDim_treated.drop(['Y'], axis = 1)
y_low_treated = lowDim_treated['Y']
lr_low_treated = LinearRegression().fit(X1_low_treated, y_low_treated)

# Fit a regression model to get the estimation of y given T=0and X 
X1_low_control = lowDim_control.drop(['Y'], axis = 1)
y_low_control = lowDim_control['Y']
lr_low_control = LinearRegression().fit(X1_low_control, y_low_control)

In [10]:
# Select all covariates and 'A' columns from full dataset
X_low_new = full_low_dim.drop(['Y','PS'], axis = 1)
m1_low= lr_low_treated.predict(X_low_new)
m0_low= lr_low_control.predict(X_low_new)
# join m1 and m0 to full_low_dim
full_low_dim['m1'] = pd.Series(m1_low, index = full_low_dim.index)
full_low_dim['m0'] = pd.Series(m0_low, index = full_low_dim.index)

In [29]:
def DRE(full_data):
    
    n = len(full_data.index)
    result1 = 0
    result2 = 0
    
    for i in range(n):
        result1 = result1 + (full_data['A'][i] * full_data['Y'][i] - (full_data['A'][i] - full_data['PS'][i])*full_data['m1'][i])/full_data['PS'][i]
        result2 = result2 + ((1-full_data['A'][i])* full_data['Y'][i] - (full_data['A'][i] - full_data['PS'][i])*full_data['m0'][i])/(1-full_data['PS'][i])
        
    ETA = 1/n*(result1-result2)
    
    return ETA

In [34]:
start_time_dre_low = time.time()
DRE(full_low_dim)

2.404204395264653

In [46]:
print(f'The estimated ATE for low dimensional data is: {DRE(full_low_dim)}')
low_acc = abs(DRE(full_low_dim) - 2.5)/2.5 * 100
print(f'Accuracy of ATE estimation of low dimensional data is: {low_acc}')
l1_time_low = time.time() - start_time_l1_low 
print(f'Running time of L1 Logist Regression model on low dimensional data is: {l1_time_low} s')
dre_time_low = time.time() - start_time_dre_low
print(f'Running time of DRE model on low dimensional data is: {dre_time_low} s')


The estimated ATE for low dimensional data is: 2.404204395264653
Accuracy of ATE estimation of low dimensional data is: 3.8318241894138794
Running time of L1 Logist Regression model on low dimensional data is: 385.05313205718994 s
Running time of DRE model on low dimensional data is: 408.0132300853729 s


###  High Dimensional data
#### 1. Data processing 
#### 1.1 Create X and Y variables for L1 Logistic Regression 


In [16]:
highDim = pd.read_csv(r'../data/highDim_dataset.csv')
X_high = highDim.drop(['A','Y'], axis = 1)
y_high = highDim[['A']]


#### 1.2 Split data into training and testing sets

In [17]:
X_train_high, X_test_high, y_train_high, y_test_high = train_test_split(X_high, y_high, test_size=0.25, random_state=0)

#### 1.3 Standardize features

In [18]:
X_train_high_std = sc.fit_transform(X_train_high)
X_test_high_std = sc.transform(X_test_high)

#### 2. L1 penalized logistic regression 
#### 2.1 Parameter tunning 

In [54]:
C = [.06, .05, .04, .03, .02, .01, 0.008, 0.005, 0.001]

for c in C:
    clf = LogisticRegression(penalty='l1', C = c, solver = 'liblinear')
    clf.fit(X_train_high, y_train_high)
    print('C:', c)
    print('Coefficient of each feature:', clf.coef_)
    print('Training accuracy:', clf.score(X_train_high_std, y_train_high))
    print('Test accuracy:', clf.score(X_test_high_std, y_test_high))
    print('')


  return f(**kwargs)


C: 0.06
Coefficient of each feature: [[ 0.00000000e+00  0.00000000e+00  1.59166918e-02  7.23985853e-03
   1.62634659e-02 -2.35102332e-02  1.71913233e-03  0.00000000e+00
   0.00000000e+00 -2.97422041e-02 -2.15419306e-02  2.76391478e-03
   6.23530049e-03  6.25793290e-03  7.34008963e-03 -2.82868384e-03
  -9.57286623e-05  0.00000000e+00  5.51828762e-03  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00 -3.42339806e-02  0.00000000e+00  1.16418160e-02
   8.36849268e-04  0.00000000e+00  3.48584623e-04 -5.16455860e-03
   1.51980949e-05  0.00000000e+00  1.18314250e-02  9.94941463e-03
  -6.77703719e-04 -1.10475319e-03  0.00000000e+00  3.55811321e-05
  -8.63962786e-03  2.31466691e-02 -3.54103493e-02  5.11208779e-02
   5.34635885e-02  0.00000000e+00  0.00000000e+00 -1.74477818e-02
   0.00000000e+00  1.61338939e-02 -5.36136279e-03  0.00000000e+00
   0.00000000e+00  0.00000000e+00 -2.42570001e-03  0.00000000e+00
  -1.40454097e-03 -6.12501746e-02  0.00

  return f(**kwargs)


C: 0.05
Coefficient of each feature: [[ 0.00000000e+00  0.00000000e+00  1.20242407e-02  7.61166063e-03
   1.51865678e-02 -2.16432651e-02  1.48789639e-03  0.00000000e+00
   0.00000000e+00 -2.84772615e-02 -1.31137802e-02  2.38391888e-03
   5.74696865e-03  5.84251497e-03  6.82904857e-03 -2.79783188e-03
  -2.49889267e-04  0.00000000e+00  2.10972316e-03  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00 -3.10587952e-02  0.00000000e+00  9.14818310e-03
   8.30522329e-04  0.00000000e+00  3.69921859e-04 -4.81080066e-03
   1.43521289e-05  0.00000000e+00  1.10980059e-02  8.70358898e-03
  -6.48353792e-04 -1.10673242e-03  0.00000000e+00  1.91236763e-05
  -7.29026166e-03  1.93262145e-02 -3.27726229e-02  4.98154815e-02
   4.76586895e-02  0.00000000e+00  0.00000000e+00 -1.41951941e-02
   0.00000000e+00  1.50006022e-02  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00 -5.98523015e-02  0.00

  return f(**kwargs)


C: 0.04
Coefficient of each feature: [[ 0.00000000e+00  0.00000000e+00  5.93751970e-03  8.34516579e-03
   1.38854915e-02 -1.99088394e-02  1.12644149e-03  0.00000000e+00
   0.00000000e+00 -2.68922990e-02 -1.22286663e-03  2.06277431e-03
   5.07841559e-03  5.49914768e-03  6.20177692e-03 -2.61152800e-03
  -1.35189011e-04  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00 -2.68983906e-02  0.00000000e+00  6.52549145e-03
   8.55944356e-04  0.00000000e+00  3.84375958e-04 -4.44215537e-03
   1.34190812e-05  0.00000000e+00  9.31700965e-03  7.05720032e-03
  -5.94008323e-04 -1.07476683e-03  0.00000000e+00  1.06849520e-05
  -4.73653095e-03  1.26846555e-02 -2.72247779e-02  4.82029082e-02
   4.39549753e-02  0.00000000e+00  0.00000000e+00 -7.78328275e-03
   0.00000000e+00  1.24301919e-02  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00 -5.75624273e-02  0.00

  return f(**kwargs)


C: 0.03
Coefficient of each feature: [[ 0.00000000e+00  0.00000000e+00  1.69021716e-03  8.79042581e-03
   1.09211184e-02 -1.63977910e-02  7.79434841e-04  0.00000000e+00
   0.00000000e+00 -2.40264994e-02  0.00000000e+00  1.79452075e-03
   4.85292461e-03  5.21642851e-03  5.49772449e-03 -2.00042174e-03
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00 -2.17070274e-02  0.00000000e+00  3.68336607e-03
   8.82835597e-04  0.00000000e+00  4.81130882e-04 -3.91960300e-03
   1.24785015e-05  0.00000000e+00  7.66088681e-03  3.59424684e-03
  -5.26926477e-04 -1.04585876e-03  0.00000000e+00  1.97935597e-05
  -1.64751007e-03  3.05333074e-03 -1.64987540e-02  4.56170373e-02
   4.07561879e-02  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  8.84200273e-03  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00 -5.21740952e-02  0.00

  return f(**kwargs)
  return f(**kwargs)


C: 0.01
Coefficient of each feature: [[ 0.00000000e+00  0.00000000e+00  0.00000000e+00  4.95520224e-03
   0.00000000e+00  0.00000000e+00  3.41221863e-04  0.00000000e+00
   0.00000000e+00 -7.14512168e-03  0.00000000e+00  0.00000000e+00
   3.77295196e-04  2.05592777e-03  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   9.22828722e-04  0.00000000e+00  2.59281530e-04 -1.70519273e-03
   6.36681372e-06  0.00000000e+00  0.00000000e+00  0.00000000e+00
  -2.32475984e-04 -7.84573267e-04  0.00000000e+00  1.82092110e-05
   0.00000000e+00  0.00000000e+00  0.00000000e+00  5.95953072e-03
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00 -1.10138860e-02  0.00

  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)


#### 2.2 Best model 

In [39]:
# Best: C = 0.06
start_time_l1_high = time.time()
clf_high = LogisticRegression(penalty='l1', C = 0.06, solver = 'liblinear')

#### 2.3 Calculate propensity scores

In [40]:
clf_high.fit(X_high, y_high.values.ravel())
ps_high=clf_high.predict_proba(X_high)[:, 1]

#### 3. Doubly Robust Estimation Algorithm to calculate ATE

In [41]:
full_high_dim= highDim.copy()
full_high_dim['PS']=pd.Series(ps_high, index=full_high_dim.index)

In [42]:
# deviding the high dimensional data into treated and control groups
highDim_treated = highDim[highDim['A'] == 1]
highDim_treated = highDim_treated.reset_index(drop = True)

highDim_control = highDim[highDim['A'] == 0]
highDim_control = highDim_control.reset_index(drop = True)


# Fit a regression model to get the estimation of y given T=1 and X 
X1_high_treated = highDim_treated.drop(['Y'], axis = 1)
y_high_treated = highDim_treated['Y']
lr_high_treated = LinearRegression().fit(X1_high_treated, y_high_treated)


# Fit a regression model to get the estimation of y given T=0 and X 
X1_high_control = highDim_control.drop(['Y'], axis = 1)
y_high_control = highDim_control['Y']
lr_high_control = LinearRegression().fit(X1_high_control, y_high_control)

In [43]:
X_high_new = full_high_dim.drop(['Y','PS'], axis = 1)
m1_high= lr_high_treated.predict(X_high_new)
m0_high= lr_high_control.predict(X_high_new)
full_high_dim['m1'] = pd.Series(m1_high, index = full_high_dim.index)
full_high_dim['m0'] = pd.Series(m0_high, index = full_high_dim.index)

In [44]:
start_time_dre_high = time.time()
DRE(full_high_dim)

-3.0962049953946535

In [47]:
print(f'The estimated ATE for low dimensional data is: {DRE(full_high_dim)}')
high_acc = (DRE(full_high_dim) - (-3))/(-3) * 100
print(f'Accuracy of ATE estimation of low dimensional data is: {high_acc}')
l1_time_high = time.time() - start_time_l1_high 
print(f'Running time of L1 Logist Regression model on low dimensional data is: {l1_time_high} s')
dre_time_high = time.time() - start_time_dre_high
print(f'Running time of DRE model on low dimensional data is: {dre_time_high} s')


The estimated ATE for low dimensional data is: -3.0962049953946535
Accuracy of ATE estimation of low dimensional data is: 3.206833179821785
Running time of L1 Logist Regression model on low dimensional data is: 208.83839631080627 s
Running time of DRE model on low dimensional data is: 201.6145739555359 s


### Summary Table 

In [50]:
summary_table = pd.DataFrame(np.array([[l1_time_low, dre_time_low,low_acc], [l1_time_high, dre_time_high,high_acc]]),
                   columns=['l1 running time', 'DRE running time', 'Accuracy of ATE'],
                            index=['low dim','high dim'])


In [51]:
summary_table

Unnamed: 0,l1 running time,DRE running time,Accuracy of ATE
low dim,385.053132,408.01323,3.831824
high dim,208.838396,201.614574,3.206833
