In [17]:
%matplotlib inline

import statsmodels.formula.api as smf
import pandas as pd
import numpy as np

from auxiliary import *

np.random.seed(123)

# Instrumental variable estimators of causal effects

**Overview**

* Causal effect estimation with a binary IV

* Traditional IV estimators

* Instrumental variable estimators in the presence of individual-level heterogeneity

* Conclusions

In [25]:
def get_sample_iv_demonstration():
    """Simulate the sample used to demonstrate IV estimation with a binary instrument.

    The sample consists of 10,000 agents split into four strata defined by the
    treatment indicator ``D`` and the instrument ``Z``. Within each stratum the
    outcome ``Y`` is drawn from a normal distribution with a stratum-specific
    mean and a standard deviation of one (the ``np.random.normal`` default).
    Draws come from NumPy's global random state, so seed it beforehand for
    reproducible results.

    Returns
    -------
    pandas.DataFrame
        Columns ``Y`` (float), ``D`` (int) and ``Z`` (int), with the rows in
        random order and a fresh ``0..n-1`` index.
    """
    # One entry per stratum: (mean of Y, treatment D, instrument Z, number of agents).
    strata = [
        (50, 0, 0, 8000),
        (60, 1, 0, 1000),
        (50, 0, 1, 800),
        (58, 1, 1, 200),
    ]
    means, treatment, instrument, sizes = zip(*strata)

    # Build all columns at once instead of filling the frame row by row.
    df = pd.DataFrame({
        'Y': np.random.normal(loc=np.repeat(means, sizes).astype(float)),
        'D': np.repeat(treatment, sizes),
        'Z': np.repeat(instrument, sizes),
    })

    # Shuffle the rows so that the strata are not stored in contiguous blocks.
    df = df.sample(frac=1).reset_index(drop=True)

    # Builtin float / int replace the np.float / np.int aliases, which no
    # longer exist in current NumPy releases.
    return df.astype({'Y': float, 'D': int, 'Z': int})


df = get_sample_iv_demonstration()

In [26]:
# Average outcome Y within each combination of treatment D and instrument Z.
df.groupby(['D', 'Z'])['Y'].mean()

D  Z
0  0    50.005587
   1    49.929482
1  0    59.961788
   1    57.995732
Name: Y, dtype: float64

In [23]:
# Regress the outcome Y on the treatment D by OLS, without using the instrument Z.
rslt = smf.ols(formula='Y ~ D', data=df).fit()
rslt.summary()

0,1,2,3
Dep. Variable:,Y,R-squared:,0.901
Model:,OLS,Adj. R-squared:,0.901
Method:,Least Squares,F-statistic:,90510.0
Date:,"Fri, 03 May 2019",Prob (F-statistic):,0.0
Time:,16:18:23,Log-Likelihood:,-14613.0
No. Observations:,10000,AIC:,29230.0
Df Residuals:,9998,BIC:,29240.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,49.9983,0.011,4495.353,0.000,49.977,50.020
D,9.6591,0.032,300.841,0.000,9.596,9.722

0,1,2,3
Omnibus:,20.736,Durbin-Watson:,1.986
Prob(Omnibus):,0.0,Jarque-Bera (JB):,22.007
Skew:,-0.083,Prob(JB):,1.66e-05
Kurtosis:,3.16,Cond. No.,3.13


In [57]:
def get_wald_estimate(df):
    """Compute the Wald estimate from a sample with a binary instrument.

    The estimate is the difference in the mean of ``Y`` between the ``Z == 1``
    and ``Z == 0`` groups divided by the corresponding difference in the mean
    of ``D``.

    Parameters
    ----------
    df : pandas.DataFrame
        Sample with columns ``Y``, ``D`` and ``Z``, where ``Z`` takes the
        values 0 and 1.

    Returns
    -------
    float
        Ratio of the two differences in group means.
    """
    def difference_by_instrument(column):
        # Mean of ``column`` for Z == 1 minus its mean for Z == 0.
        group_means = df.groupby('Z')[column].mean().to_dict()
        return group_means[1] - group_means[0]

    return difference_by_instrument('Y') / difference_by_instrument('D')
    

# With a binary instrument Z, the Wald estimate and the covariance-ratio IV
# estimate cov(Y, Z) / cov(D, Z) are algebraically identical. Check that the
# two agree before displaying the estimate.
rslt = get_wald_estimate(df)
iv_estimate = np.cov(df['Y'], df['Z'])[0, 1] / np.cov(df['D'], df['Z'])[0, 1]
np.testing.assert_allclose(rslt, iv_estimate)
iv_estimate


4.847632666866422