# GR5241 Project4 
## Doubly Robust Estimation + L1 penalized logistic regression

### L1 penalized logistic regression to calculate propensity score

In [29]:
# Import required libraries
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


#### Data Preprocessing

In [2]:
%pwd 
%cd /Users/duansiyu/Downloads

/Users/duansiyu/Downloads


In [20]:
# Load data

highDim = pd.read_csv('highDim_dataset.csv')
lowDim = pd.read_csv('lowDim_dataset.csv')

# View high dimensional data
highDim.head()

2000

In [19]:
# View low dimensional data
lowDim.head()

475

In [12]:
# Create X from the features
X_high = highDim.drop(['A','Y'], axis = 1)
X_low = lowDim.drop(['A','Y'], axis = 1)

# Create y from output
y_high = highDim[['A']]
y_low = lowDim[['A']]

#### Split The Data Into Training And Test Sets

In [13]:
# Split The Data Into Training And Test Sets
X_train_high, X_test_high, y_train_high, y_test_high = train_test_split(X_high, y_high, test_size=0.25, random_state=0)
X_train_low, X_test_low, y_train_low, y_test_low = train_test_split(X_low, y_low, test_size=0.25, random_state=0)

#### Standardize Features

Because the regularization penalty is comprised of the sum of the absolute value of the coefficients, we need to scale the data so the coefficients are all based on the same scale.

In [None]:
# Create a scaler object
sc = StandardScaler()

# High Dimensional data
# Fit the scaler to the training data and transform
X_train_high_std = sc.fit_transform(X_train_high)

# Apply the scaler to the test data
X_test_high_std = sc.transform(X_test_high)

# Low Dimensional data
# Fit the scaler to the training data and transform
X_train_low_std = sc.fit_transform(X_train_low)

# Apply the scaler to the test data
X_test_low_std = sc.transform(X_test_low)

#### Fit logistic regression with a L1 penalty

In [None]:
C = [.06, .05, .04, .03, .02, .01, 0.008, 0.005, 0.001]

# High Dimensional data
for c in C:
    clf = LogisticRegression(penalty='l1', C = c, solver = 'liblinear')
    clf.fit(X_train_high, y_train_high)
    print('C:', c)
    print('Coefficient of each feature:', clf.coef_)
    print('Training accuracy:', clf.score(X_train_high_std, y_train_high))
    print('Test accuracy:', clf.score(X_test_high_std, y_test_high))
    print('')



#### Best model for high dimensional data

In [9]:
# Best: C = 0.06
clf_high = LogisticRegression(penalty='l1', C = 0.06, solver = 'liblinear')

In [None]:
C = [1, 0.95, 0.9, 0.85, 0.8, 0.75, 0.7, 0.65, 0.5, 0.3, 0.2, 0.1]

# Low Dimensional data
for c in C:
    clf = LogisticRegression(penalty='l1', C = c, solver = 'liblinear')
    clf.fit(X_train_low, y_train_low)
    print('C:', c)
    print('Coefficient of each feature:', clf.coef_)
    print('Training accuracy:', clf.score(X_train_low_std, y_train_low))
    print('Test accuracy:', clf.score(X_test_low_std, y_test_low))
    print('')
    

#### Best model for low dimensional data

In [10]:
# Best: C = 1
clf_low = LogisticRegression(penalty='l1', C = 1, solver = 'liblinear')

In [17]:
# High dimensional propensity score
clf_high.fit(X_high, y_high)
ps_high=clf_high.predict_proba(X_high)[:, 1]
len(ps_high)

  return f(**kwargs)


2000

In [24]:
# Low dimensional propensity score
clf_low.fit(X_low, y_low)
ps_low=clf_low.predict_proba(X_low)[:, 1]
type(ps_low)

  return f(**kwargs)


numpy.ndarray

### Doubly Robust Estimation  Algorithm for ATE

#### Add propensity scores to the data frame

In [22]:
full_high_dim= highDim.copy()
full_low_dim = lowDim.copy()

In [27]:
full_high_dim['PS']=pd.Series(ps_high, index=full_high_dim.index)
full_high_dim.head()
full_low_dim['PS']=pd.Series(ps_low, index=full_low_dim.index)
full_low_dim.head()

Unnamed: 0,Y,A,V1,V2,V3,V4,V5,V6,V7,V8,...,V14,V15,V16,V17,V18,V19,V20,V21,V22,PS
0,19.678858,0,1.59,0.0,0.0,0.0,0.24,1.35,0.73,2.58,...,0.0,4.55,0.0,1.72,0.0,0.49,0.98,0.0,1.309683,0.620726
1,17.842989,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.62,...,0.0,4.87,0.0,0.81,0.27,0.27,0.0,0.0,1.719547,0.417034
2,22.108788,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,2.12,0.0,0.0,0.0,2.12,0.99621,0.172026
3,15.355899,0,0.0,0.0,0.0,0.56,0.0,0.0,0.0,0.0,...,0.0,1.12,0.0,0.0,0.0,0.0,0.0,0.0,1.504077,0.144911
4,16.787813,1,1.81,0.0,0.0,0.0,0.0,0.0,0.0,1.81,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.327864,0.166767


#### Calculate ATE 

In [43]:
#Fit a regression model to get the estimation of y given T and X 
X1= lowDim.drop(['Y'], axis = 1)
X1 = sm.add_constant(X1)
glm_low = sm.GLM(y_low, X1, family = sm.families.Binomial()).fit()
est_low = glm_low.predict_proba(X1)


PerfectSeparationError: Perfect separation detected, results not available

In [44]:
glm_low

<statsmodels.genmod.generalized_linear_model.GLM at 0x128fa2670>