# GR5241 Project4 
## Doubly Robust Estimation + L1 penalized logistic regression

### L1 penalized logistic regression to calculate propensity score

In [1]:
# Import required libraries
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

#### Data Preprocessing

In [3]:
# Load data
highDim = pd.read_csv(r'C:\Users\zlj01\Documents\Columbia University\GR5243\project 4\Data\highDim_dataset.csv')
lowDim = pd.read_csv(r'C:\Users\zlj01\Documents\Columbia University\GR5243\project 4\Data\lowDim_dataset.csv')

# View high dimensional data
highDim.head()


Unnamed: 0,Y,A,V1,V2,V3,V4,V5,V6,V7,V8,...,V176,V177,V178,V179,V180,V181,V182,V183,V184,V185
0,-11.682472,1,0,1,2,16,3,-1,13,-0.13,...,5,7,8,6,8,-1,-1,-1,-1,-1
1,-13.176546,0,1,1,12,14,14,14,13,0.24,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
2,-2.195401,1,0,1,21,22,10,10,14,0.27,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
3,-0.005454,1,1,1,9,20,11,2,10,0.09,...,-10,-10,-10,-10,-10,-10,-10,-10,-10,-10
4,-1.987538,1,1,1,7,16,16,11,6,0.15,...,70,70,80,70,80,-10,-10,-10,-10,-10


In [4]:
# View low dimensional data
lowDim.head()

Unnamed: 0,Y,A,V1,V2,V3,V4,V5,V6,V7,V8,...,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22
0,19.678858,0,1.59,0.0,0.0,0.0,0.24,1.35,0.73,2.58,...,0.12,0.0,4.55,0.0,1.72,0.0,0.49,0.98,0.0,1.309683
1,17.842989,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.62,...,0.27,0.0,4.87,0.0,0.81,0.27,0.27,0.0,0.0,1.719547
2,22.108788,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.12,0.0,0.0,0.0,2.12,0.99621
3,15.355899,0,0.0,0.0,0.0,0.56,0.0,0.0,0.0,0.0,...,0.0,0.0,1.12,0.0,0.0,0.0,0.0,0.0,0.0,1.504077
4,16.787813,1,1.81,0.0,0.0,0.0,0.0,0.0,0.0,1.81,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.327864


In [5]:
# Create X from the features
X_high = highDim.drop(['A','Y'], axis = 1)
X_low = lowDim.drop(['A','Y'], axis = 1)

# Create y from output
y_high = highDim[['A']]
y_low = lowDim[['A']]

#### Split The Data Into Training And Test Sets

In [6]:
# Split The Data Into Training And Test Sets
X_train_high, X_test_high, y_train_high, y_test_high = train_test_split(X_high, y_high, test_size=0.25, random_state=0)
X_train_low, X_test_low, y_train_low, y_test_low = train_test_split(X_low, y_low, test_size=0.25, random_state=0)

#### Standardize Features

Because the regularization penalty is comprised of the sum of the absolute value of the coefficients, we need to scale the data so the coefficients are all based on the same scale.

In [8]:
# Create a scaler object
sc = StandardScaler()

# High Dimensional data
# Fit the scaler to the training data and transform
X_train_high_std = sc.fit_transform(X_train_high)

# Apply the scaler to the test data
X_test_high_std = sc.transform(X_test_high)

# Low Dimensional data
# Fit the scaler to the training data and transform
X_train_low_std = sc.fit_transform(X_train_low)

# Apply the scaler to the test data
X_test_low_std = sc.transform(X_test_low)

#### Fit logistic regression with a L1 penalty

In [14]:
C = [.06, .05, .04, .03, .02, .01, 0.008, 0.005, 0.001]

# High Dimensional data
for c in C:
    clf = LogisticRegression(penalty='l1', C = c, solver = 'liblinear')
    clf.fit(X_train_high, y_train_high)
    print('C:', c)
    print('Coefficient of each feature:', clf.coef_)
    print('Training accuracy:', clf.score(X_train_high_std, y_train_high))
    print('Test accuracy:', clf.score(X_test_high_std, y_test_high))
    print('')

# Best: C = 0.06
clf_high = LogisticRegression(penalty='l1', C = 0.06, solver = 'liblinear')

  return f(**kwargs)


C: 0.06
Coefficient of each feature: [[ 0.00000000e+00  0.00000000e+00  1.58979160e-02  7.23276502e-03
   1.62670449e-02 -2.35239693e-02  1.71720719e-03  0.00000000e+00
   0.00000000e+00 -2.97362984e-02 -2.15658592e-02  2.76705634e-03
   6.24389308e-03  6.26603513e-03  7.34374050e-03 -2.82990481e-03
  -9.45905799e-05  0.00000000e+00  5.53142877e-03  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00 -3.42371252e-02  0.00000000e+00  1.16284555e-02
   8.36503124e-04  0.00000000e+00  3.48205151e-04 -5.16416214e-03
   1.51965544e-05  0.00000000e+00  1.18313289e-02  9.97146165e-03
  -6.77867251e-04 -1.10482970e-03  0.00000000e+00  3.57607912e-05
  -8.64353277e-03  2.31270759e-02 -3.53992812e-02  5.11322645e-02
   5.34814164e-02  0.00000000e+00  0.00000000e+00 -1.74566508e-02
   0.00000000e+00  1.61654524e-02 -5.32785880e-03  0.00000000e+00
   0.00000000e+00  0.00000000e+00 -2.40652731e-03  0.00000000e+00
  -1.40984931e-03 -6.12624971e-02  0.00

  return f(**kwargs)


C: 0.05
Coefficient of each feature: [[ 0.00000000e+00  0.00000000e+00  1.20101237e-02  7.61216592e-03
   1.51873448e-02 -2.16570793e-02  1.48903560e-03  0.00000000e+00
   0.00000000e+00 -2.84796378e-02 -1.31300781e-02  2.38505662e-03
   5.74682060e-03  5.84937233e-03  6.82820588e-03 -2.79755957e-03
  -2.46463766e-04  0.00000000e+00  2.11224088e-03  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00 -3.10436741e-02  0.00000000e+00  9.14771827e-03
   8.30229395e-04  0.00000000e+00  3.70060466e-04 -4.80963044e-03
   1.43490749e-05  0.00000000e+00  1.10908579e-02  8.73405285e-03
  -6.48609527e-04 -1.10614112e-03  0.00000000e+00  1.89119830e-05
  -7.27478030e-03  1.93235646e-02 -3.27876996e-02  4.98095383e-02
   4.76465718e-02  0.00000000e+00  0.00000000e+00 -1.41826295e-02
   0.00000000e+00  1.50042025e-02  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00 -5.98730678e-02  0.00

  return f(**kwargs)


C: 0.04
Coefficient of each feature: [[ 0.00000000e+00  0.00000000e+00  5.71170934e-03  8.38690353e-03
   1.39217137e-02 -1.99707957e-02  1.14553013e-03  0.00000000e+00
   0.00000000e+00 -2.69304721e-02 -1.21077612e-03  2.08924093e-03
   5.14426798e-03  5.58084696e-03  6.23241742e-03 -2.58453150e-03
  -8.57775717e-05  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00 -2.68736701e-02  0.00000000e+00  6.53781622e-03
   8.59724031e-04  0.00000000e+00  3.81428602e-04 -4.42533171e-03
   1.33405011e-05  0.00000000e+00  9.23063425e-03  7.24222652e-03
  -5.91125217e-04 -1.07540966e-03  0.00000000e+00  9.78988933e-06
  -4.78087077e-03  1.26280138e-02 -2.71249017e-02  4.82029254e-02
   4.41839147e-02  0.00000000e+00  0.00000000e+00 -7.70957390e-03
   0.00000000e+00  1.26010030e-02  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00 -5.77493353e-02  0.00

  return f(**kwargs)


 0.03
Coefficient of each feature: [[ 0.00000000e+00  0.00000000e+00  1.52858230e-03  8.84789584e-03
   1.09522539e-02 -1.64465267e-02  7.99223153e-04  0.00000000e+00
   0.00000000e+00 -2.40407166e-02  0.00000000e+00  1.79846698e-03
   4.89612725e-03  5.27266345e-03  5.50211277e-03 -1.98542449e-03
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00 -2.16743249e-02  0.00000000e+00  3.74686717e-03
   8.86172005e-04  0.00000000e+00  4.79610284e-04 -3.89609240e-03
   1.24098926e-05  0.00000000e+00  7.59794513e-03  3.72410949e-03
  -5.26500546e-04 -1.04808918e-03  0.00000000e+00  1.98170614e-05
  -1.68280182e-03  3.17008865e-03 -1.65395657e-02  4.56289486e-02
   4.09546461e-02  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  8.95093212e-03  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00 -5.22901078e-02  0.0000

  return f(**kwargs)
  return f(**kwargs)


 0.01
Coefficient of each feature: [[ 0.00000000e+00  0.00000000e+00  0.00000000e+00  4.92511386e-03
   0.00000000e+00  0.00000000e+00  3.41505908e-04  0.00000000e+00
   0.00000000e+00 -7.14983950e-03  0.00000000e+00  0.00000000e+00
   3.71671384e-04  2.04405753e-03  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   9.23076573e-04  0.00000000e+00  2.61514790e-04 -1.70458154e-03
   6.38489604e-06  0.00000000e+00  0.00000000e+00  0.00000000e+00
  -2.33804220e-04 -7.85694345e-04  0.00000000e+00  1.81166715e-05
   0.00000000e+00  0.00000000e+00  0.00000000e+00  5.98253614e-03
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00 -1.09977763e-02  0.0000

  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)


In [15]:
C = [1, 0.95, 0.9, 0.85, 0.8, 0.75, 0.7, 0.65, 0.5, 0.3, 0.2, 0.1]

# Low Dimensional data
for c in C:
    clf = LogisticRegression(penalty='l1', C = c, solver = 'liblinear')
    clf.fit(X_train_low, y_train_low)
    print('C:', c)
    print('Coefficient of each feature:', clf.coef_)
    print('Training accuracy:', clf.score(X_train_low_std, y_train_low))
    print('Test accuracy:', clf.score(X_test_low_std, y_test_low))
    print('')
    
# Best: C = 1
clf_low = LogisticRegression(penalty='l1', C = 1, solver = 'liblinear')

C: 1
Coefficient of each feature: [[ 0.34417019  0.43238324  0.73102525 -0.15445137  0.          0.13306508
   0.          0.26229936  0.          0.33353252 -0.05789636  0.
   0.04014376 -0.50946271  0.14782053  0.          0.24494551  0.10429296
   0.          0.          0.00274624  0.41842793]]
Training accuracy: 0.7837078651685393
Test accuracy: 0.7226890756302521

C: 0.95
Coefficient of each feature: [[ 3.41398945e-01  4.19127909e-01  7.21208155e-01 -1.47451229e-01
   0.00000000e+00  1.30765948e-01  0.00000000e+00  2.60289957e-01
   0.00000000e+00  3.21900610e-01 -4.60236115e-02  0.00000000e+00
   3.64326446e-02 -5.04471427e-01  1.46534933e-01  0.00000000e+00
   2.44938544e-01  1.03662850e-01  0.00000000e+00  0.00000000e+00
   2.81170568e-04  4.16450228e-01]]
Training accuracy: 0.7837078651685393
Test accuracy: 0.7142857142857143

C: 0.9
Coefficient of each feature: [[ 0.33881442  0.40531854  0.71103734 -0.13969525  0.          0.12834441
   0.          0.2579888   0.          0.

  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)


In [22]:
# High dimensional propensity score
clf_high.fit(X_high, y_high)
clf_high.predict_proba(X_high)[:, 1]

  return f(**kwargs)


array([0.56218302, 0.34083366, 0.67252103, ..., 0.3389656 , 0.285809  ,
       0.54666708])

In [23]:
# Low dimensional propensity score
clf_low.fit(X_low, y_low)
clf_low.predict_proba(X_low)[:, 1]

  return f(**kwargs)


array([0.62068185, 0.4169973 , 0.17205195, 0.14492186, 0.16678062,
       0.09433371, 0.2467299 , 0.29488666, 0.08246259, 0.40883861,
       0.19978719, 0.26623674, 0.14665718, 0.16206573, 0.599206  ,
       0.32875618, 0.44831722, 0.4347246 , 0.25703983, 0.47546671,
       0.16082692, 0.22712675, 0.18469108, 0.15766654, 0.29260334,
       0.1506285 , 0.10719758, 0.13163903, 0.23334568, 0.37815961,
       0.16375794, 0.66316997, 0.28577995, 0.14875151, 0.23686009,
       0.15292498, 0.18039204, 0.10881857, 0.18618067, 0.14967025,
       0.13090014, 0.39996692, 0.47546671, 0.23323966, 0.66948432,
       0.66944362, 0.12396091, 0.1718912 , 0.172     , 0.43697684,
       0.15016925, 0.17868689, 0.10285733, 0.15915187, 0.11975   ,
       0.13312288, 0.1442913 , 0.62068185, 0.26468956, 0.27442248,
       0.09923539, 0.42024931, 0.25881791, 0.23929178, 0.11178088,
       0.34253391, 0.51255609, 0.36678315, 0.14278901, 0.13460043,
       0.25071981, 0.3504797 , 0.15815588, 0.16299304, 0.06755