# ML for Causal Inference: High-Dimensional Controls

In [1]:
# Load Packages
library(simstudy)
library(glmnet)
library(rdd)

# # # # # # # # #
# Simulate Data #
# # # # # # # # #

# Fix the RNG seed so the simulated sample, and everything estimated from it,
# is reproducible. The order of the random draws below must not be changed.
set.seed(1)
# Number of Observations
N <- 1e3

# Total number of simulated columns: the running variable W, the
# unobservable C, and the candidate controls in between.
total.covar <- 50 + 1e3
# Number of covariates (excluding the running variable W and unobservable C)
p <- total.covar - 2

# Simulate Data
# Mean-zero normal columns with AR(1) correlation (rho = 0.5); each column
# gets its own randomly drawn positive value in `variance.vector`.
mu.vector <- rep(0, total.covar)
variance.vector <- abs(rnorm(total.covar, mean = 1, sd = .5))
#variance.vector <- rep(1, total.covar)

# `wide` is a logical flag: pass TRUE rather than the string 'True', which
# only selected the wide layout by accident of string comparison.
# The result is converted through the as.data.frame() generic instead of
# calling the matrix method directly, and column 1 (presumably the id column
# genCorGen adds) is dropped so only the simulated variables remain.
simulated.data <- as.data.frame(
  genCorGen(
    n = N, nvars = total.covar,
    params1 = mu.vector, params2 = variance.vector,
    dist = "normal", rho = .5, corstr = "ar1", wide = TRUE
  )
)[2:(total.covar + 1)]

# First column becomes the running variable W, last column the unobservable C.
colnames(simulated.data)[1] <- "W"
colnames(simulated.data)[total.covar] <- "C"

# Candidate controls: every column strictly between W and C (p columns).
X <- simulated.data[, 2:(total.covar - 1)]

covariate.names <- colnames(X)

# Idiosyncratic noise for the outcome equation.
error <- rnorm(n = N)

# Make W a function of the X's and unobservable
simulated.data$W <- simulated.data$W + .5 * simulated.data$C +
  3 * simulated.data$V80 - 6 * simulated.data$V81
# Assign treatment: units with W above the cutoff 0 are treated (coded 1/0).
treated <- as.numeric(simulated.data$W > 0)


# Generate Y as a function of treatment, W, X's, and unobservable C
# Only the first 29 controls get a non-zero coefficient; the rest are set to 0.
beta.true.linear <- rnorm(p, mean = 5, sd = 5)
beta.true.linear[30:p] <- 0
# 1.2 is the coefficient on the treatment indicator in the outcome equation.
Y <- 1.2 * treated - 4 * simulated.data$W +
  data.matrix(X) %*% beta.true.linear + .6 * simulated.data$C + error

# Analysis data set: outcome first, then W, the controls, and C.
df <- cbind(Y, simulated.data)
colnames(df)[1] <- "Y"

Loading required package: data.table
Loading required package: Matrix
Loading required package: foreach
Loaded glmnet 2.0-10

"package 'rdd' was built under R version 3.4.2"
Loading required package: sandwich
Loading required package: lmtest
Loading required package: zoo

Attaching package: 'zoo'

The following objects are masked from 'package:base':

    as.Date, as.Date.numeric

Loading required package: AER
Loading required package: car
Loading required package: survival
Loading required package: Formula


In [2]:

# Model formulas: every candidate control enters additively on the right-hand
# side, once with W and once with Y as the response.
sumx <- paste(covariate.names, collapse = " + ")
eq.propensity <- as.formula(paste0("W ~ ", sumx))
eq.outcome <- as.formula(paste0("Y ~ ", sumx))


# Outcome-side LASSO (alpha = 1): cross-validated fit of Y on the candidate
# controls in X.
lasso.fit.outcome <- cv.glmnet(x = data.matrix(X), y = df$Y, alpha = 1)

# Positions of the controls with non-zero coefficients. No `s` is passed, so
# predict() uses its default penalty for a cv.glmnet fit.
# NOTE: `coef` and `colnames` mask the base functions of the same name; both
# names are kept because the propensity cell below indexes `colnames`.
coef <- predict(lasso.fit.outcome, type = "nonzero")
colnames <- names(X)
H <- colnames[unlist(coef)]
# Controls retained by the outcome LASSO:
H

In [3]:
# LASSO for propensity variables
# Cross-validated LASSO (alpha = 1) of the running variable W on the same
# candidate controls used for the outcome equation.
lasso.fit.propensity <- cv.glmnet(data.matrix(X), df$W, alpha = 1)

# Positions of the controls with non-zero coefficients. No `s` is passed, so
# predict() uses its default penalty for a cv.glmnet fit.
coef <- predict(lasso.fit.propensity, type = "nonzero")
# Index the column names of X directly rather than through a global character
# vector named `colnames`, so this cell does not depend on an earlier cell
# having masked the base function.
K <- colnames(X)[unlist(coef)]
# Vars selected by LASSO:
K

In [4]:

# Union of selected variables:
# Double selection keeps every control chosen by either the outcome LASSO (H)
# or the propensity LASSO (K); union() removes duplicates.
doubleselection.names <- union(H, K)
doubleselection.names
sum.doubleselection <- paste(doubleselection.names, collapse = " + ")
# RDestimate formula layout: outcome ~ running variable | covariates.
# If neither LASSO retained a control, omit the covariate part altogether,
# because "Y ~ W | " with nothing after the bar is not a valid formula.
eq.doubleselection <- if (length(doubleselection.names) > 0) {
  paste("Y ~ W | ", sum.doubleselection)
} else {
  "Y ~ W"
}


In [5]:
# Regression discontinuity of Y on the running variable W, controlling for the
# covariates in the double-selection union. No cutpoint is passed, so the
# package default applies; treatment was assigned at W > 0 when the data were
# simulated.
fit <- RDestimate(formula = eq.doubleselection, data = df)
# Print the estimates and F-statistics for the fitted bandwidths.
summary(fit)


Call:
RDestimate(formula = eq.doubleselection, data = df)

Type:
sharp 

Estimates:
           Bandwidth  Observations  Estimate  Std. Error  z value  Pr(>|z|) 
LATE       2.081      345           1.422     0.2914      4.881    1.054e-06
Half-BW    1.041      180           1.636     0.4205      3.891    9.963e-05
Double-BW  4.163      628           1.246     0.2196      5.673    1.406e-08
              
LATE       ***
Half-BW    ***
Double-BW  ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

F-statistics:
           F      Num. DoF  Denom. DoF  p
LATE       18314  35        309         0
Half-BW     9944  35        144         0
Double-BW  35587  35        592         0
