In [1]:
import pandas as pd
import numpy as np
import scipy.stats as sps
import statsmodels.formula.api as smf
from linearmodels.iv import IV2SLS

In [2]:
# Read the OHIE Stata file into a DataFrame and preview its first rows.
ohie = pd.read_stata('OHIE.dta')
ohie.head()

Unnamed: 0,weight_total_inp,tab1_gender_inp,tab2dia_dx_post_lottery,tab2hbp_dx_post_lottery,tab2chl_dx_post_lottery,tab2dep_dx_post_lottery,tab3_pcs8_score,tab3_mcs8_score,tab5_usual_clinic_inp,tab5_needmet_med_inp,...,tab5_med_qual_bin_inp,tab5_smk_curr_bin_inp,tab3_poshappiness_bin_inp,tab5_mam50_chk_inp,tab5_doc_num_mod_inp,tab5_ed_num_mod_inp,tab5_surg_num_mod_inp,tab5_hosp_num_mod_inp_2,tab4_any_oop_inp,tab4_tr_tot_spend_inp
0,1.150416,Female,No,No,No,No,55.331001,45.38245,,Yes,...,,not at all,very/pretty happy,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.89746,Male,No,Yes,Yes,No,20.077925,53.045685,No,No,...,fair/poor,everyday/some days,very/pretty happy,,6.0,2.0,0.0,0.0,1.0,170.0
2,0.0,,,,,,,,,,...,,,,,,,,,,
3,1.0,Female,No,No,No,,50.224506,50.813576,Yes,Yes,...,fair/poor,not at all,very/pretty happy,,12.0,1.0,0.0,0.0,0.0,0.0
4,1.212644,Male,No,No,No,No,44.189751,47.706932,No,No,...,good/vgood/excellent,everyday/some days,not too happy,,0.0,1.0,0.0,0.0,1.0,456.0


In [3]:
# Show the full list of column names available in the loaded frame.
ohie.columns

Index(['weight_total_inp', 'tab1_gender_inp', 'tab2dia_dx_post_lottery',
       'tab2hbp_dx_post_lottery', 'tab2chl_dx_post_lottery',
       'tab2dep_dx_post_lottery', 'tab3_pcs8_score', 'tab3_mcs8_score',
       'tab5_usual_clinic_inp', 'tab5_needmet_med_inp', 'tab5_chl_chk_inp',
       'tab5_pap_chk_inp', 'tab5_fobt_chk_inp', 'tab5_col_chk_inp',
       'tab5_psa_chk_inp', 'tab5_did_flu_inp', 'tab2cvd_risk_point',
       'tab4_catastrophic_exp_inp', 'tab4_owe_inp', 'tab4_borrow_inp',
       'tab1_hispanic_inp', 'tab1_race_white_inp', 'tab1_race_black_inp',
       'tab1_race_nwother_inp', 'tab2a1c_inp', 'tab2hdl_inp', 'tab2chl_inp',
       'tab2bp_sar_inp', 'tab2bp_dar_inp', 'tab5_rx_num_mod_inp',
       'tab2hbp_diure_med_inp', 'tab2antihyperlip_med_inp',
       'tab2diabetes_med_inp', 'tab2antidep_med_inp', 'household_id',
       'treatment', 'ohp_all_ever_admin', 'tab1_age_19_34_inp',
       'tab1_age_35_49_inp', 'tab1_age_50_64_inp', 'tab1_itvw_english_inp',
       'tab3_pain_low_i

In [4]:
# Outcome columns analysed below: elevated blood pressure, positive depression
# screen, catastrophic medical expenditures, and whether medical needs were met.
outcomes = [
    'tab2bp_hyper',
    'tab2phqtot_high',
    'tab4_catastrophic_exp_inp',
    'tab5_needmet_med_inp',
]
# Keep only the lottery indicator, the enrollment indicator and the outcomes.
restr_cols = ['treatment', 'ohp_all_ever_admin', *outcomes]
sub_ohie = ohie.loc[:, restr_cols]
sub_ohie.head()

Unnamed: 0,treatment,ohp_all_ever_admin,tab2bp_hyper,tab2phqtot_high,tab4_catastrophic_exp_inp,tab5_needmet_med_inp
0,Selected,NOT enrolled,1.0,0.0,No,Yes
1,Not selected,NOT enrolled,0.0,0.0,No,No
2,Not selected,NOT enrolled,,,,
3,Not selected,Enrolled,1.0,0.0,No,Yes
4,Not selected,NOT enrolled,1.0,1.0,No,No


In [5]:
# Rebuild the analysis sample from the raw frame so this cell is idempotent.
# Re-running the original version fed the already-recoded 0/1 columns back
# through the string dictionaries below, which silently turned them into NaN.
sub_ohie_raw = ohie[restr_cols]
print(sub_ohie_raw.shape)
sub_ohie = sub_ohie_raw.dropna()  # complete cases only
print(sub_ohie.shape)

# Recode the Yes/No outcomes to 0/1 and add numeric copies of the instrument
# (lottery selection) and of the treatment (enrollment).
sub_ohie = sub_ohie.assign(
    tab4_catastrophic_exp_inp=sub_ohie.tab4_catastrophic_exp_inp.map({'Yes': 1, 'No': 0}).astype(float),
    tab5_needmet_med_inp=sub_ohie.tab5_needmet_med_inp.map({'Yes': 1, 'No': 0}).astype(float),
    treatment_bin=sub_ohie.treatment.map({'Selected': 1, 'Not selected': 0}).astype(float),
    ohp_all_ever_admin_bin=sub_ohie.ohp_all_ever_admin.map({'Enrolled': 1, 'NOT enrolled': 0}).astype(float),
)

# The two lottery arms and the normal critical value do not depend on the
# outcome, so compute them once instead of re-filtering inside the loop.
selected_rows = sub_ohie[sub_ohie.treatment == 'Selected']
not_selected_rows = sub_ohie[sub_ohie.treatment == 'Not selected']
z_975 = sps.norm.ppf(0.975)

res_list = list()
for y in outcomes:
    # Difference in means between lottery winners and losers (the ITT).
    ate = selected_rows[y].mean() - not_selected_rows[y].mean()
    # Variance of the difference: sum of the per-arm sample variances over arm sizes.
    var_ate = selected_rows[y].var() / len(selected_rows) + \
        not_selected_rows[y].var() / len(not_selected_rows)
    # Normal-approximation 95% confidence interval.
    ci = [ate - z_975 * np.sqrt(var_ate),
          ate + z_975 * np.sqrt(var_ate)]
    res_list.append([y, ate, var_ate] + ci)
res_df = pd.DataFrame(res_list, columns=['outcome', 'ATE', 'var ATE', 'ci_95_low', 'ci_95_high'])
res_df

(20745, 6)
(11698, 6)


Unnamed: 0,outcome,ATE,var ATE,ci_95_low,ci_95_high
0,tab2bp_hyper,0.001153,4.6e-05,-0.012115,0.01442
1,tab2phqtot_high,-0.036184,7e-05,-0.052575,-0.019793
2,tab4_catastrophic_exp_inp,-0.015821,1.5e-05,-0.023427,-0.008216
3,tab5_needmet_med_inp,0.032184,8e-05,0.014649,0.04972


The ITT on elevated blood pressure is $0.0012$ with a 95\% confidence interval of $[-0.012, 0.014]$. We fail to reject the null at $\alpha = .05$ that winning the Medicaid lottery has no effect on high blood pressure.

The ITT on screening for depression is $-0.036$ with a 95\% confidence interval of $[-0.053, -0.020]$. We reject the null of no effect at $\alpha = .05$ and conclude that winning the Medicaid lottery reduced the incidence of depression.

The ITT on the incidence of catastrophic medical expenditures is $-0.016$ with a 95\% confidence interval of $[-0.023, -0.0082]$. We reject the null of no effect at $\alpha = .05$ and conclude that winning the Medicaid lottery reduced the probability that individuals would experience catastrophic medical expenditures.

The ITT on whether participants felt their medical needs were met is $0.032$ with a 95\% confidence interval of $[0.015, 0.050]$. We reject the null of no effect at $\alpha = .05$ and conclude that winning the Medicaid lottery increased the probability that individuals perceived that their medical needs were met.

Overall, it seems that winning the lottery to subscribe to Medicaid has a positive effect on an individual's health, which is expected.

### Question 3

In [6]:
# Naive comparison: difference in mean outcomes between respondents who were
# enrolled and those who were not, with a normal-approximation 95% interval.
enrolled_rows = sub_ohie[sub_ohie.ohp_all_ever_admin == 'Enrolled']
unenrolled_rows = sub_ohie[sub_ohie.ohp_all_ever_admin == 'NOT enrolled']
z_975 = sps.norm.ppf(0.975)

res_list_med = []
for y in outcomes:
    ate = enrolled_rows[y].mean() - unenrolled_rows[y].mean()
    # Sum of the per-group sample variances, each divided by its group size.
    var_ate = (
        enrolled_rows[y].var() / len(enrolled_rows)
        + unenrolled_rows[y].var() / len(unenrolled_rows)
    )
    ci = [
        ate - z_975 * np.sqrt(var_ate),
        ate + z_975 * np.sqrt(var_ate),
    ]
    res_list_med.append([y, ate, var_ate] + ci)

res_df_med = pd.DataFrame(
    res_list_med,
    columns=['outcome', 'ATE', 'var ATE', 'ci_95_low', 'ci_95_high'],
)
res_df_med

Unnamed: 0,outcome,ATE,var ATE,ci_95_low,ci_95_high
0,tab2bp_hyper,-0.017036,5.4e-05,-0.031428,-0.002644
1,tab2phqtot_high,0.053713,8.9e-05,0.035215,0.072211
2,tab4_catastrophic_exp_inp,-0.010467,1.6e-05,-0.018419,-0.002515
3,tab5_needmet_med_inp,0.056327,9.5e-05,0.037249,0.075405


The naive estimate of Medicaid enrollment on elevated blood pressure is $-0.017$ with a 95\% confidence interval of $[-0.031, -0.0026]$. We would reject the null of no difference in elevated blood pressure between enrolled and unenrolled respondents at $\alpha = .05$.

The naive estimate on screening for depression is $0.054$ with a 95\% confidence interval of $[0.035, 0.072]$. This suggests that participants enrolled in Medicaid were *more* likely to screen positive for depression compared to those who were unenrolled, which is surprising. We would reject the null of no difference in depression between enrolled and unenrolled respondents at $\alpha = .05$.

The naive estimate on incidence of catastrophic expenditures is $-0.010$ with a 95\% confidence interval of $[-0.018, -0.0025]$. This suggests that participants enrolled in Medicaid were less likely to incur catastrophic medical expenditures compared to those not enrolled. We would reject the null of no difference between enrolled and unenrolled respondents at $\alpha = .05$.

The naive estimate on whether participants felt their medical needs were met is $0.056$ with a 95\% confidence interval of $[0.037, 0.075]$. We reject the null of no difference at $\alpha = .05$ and find that Medicaid enrollees were more likely to perceive their medical needs were met than those not enrolled.

These difference-in-means estimates are likely biased estimates of the average treatment effect of Medicaid enrollment. This is because there are likely **unobserved confounders** of whether participants were able to enroll in Medicaid and these outcomes. These participants are likely to have different medical outcomes, especially if their eligibility was due to socio-economic characteristics that would make them eligible for OHP Plus. Enrollment is not *directly* randomized.


In [7]:
# Proportion of always-takers: share enrolled among those not selected by the lottery.
not_selected_mask = sub_ohie.treatment_bin == 0
enrolled_mask = sub_ohie.ohp_all_ever_admin == 'Enrolled'
n_not_selected = int(not_selected_mask.sum())
n_not_selected_enrolled = int((not_selected_mask & enrolled_mask).sum())
always_takers_prop = float(n_not_selected_enrolled) / n_not_selected
print('proportion of always-takers', always_takers_prop)

proportion of always-takers 0.15456005711226128


In [8]:
# Share enrolled among lottery winners, minus `always_takers_prop`
# (the share enrolled among those not selected).
selected_mask = sub_ohie.treatment_bin == 1
enrolled_mask = sub_ohie.ohp_all_ever_admin == 'Enrolled'
n_selected = int(selected_mask.sum())
n_selected_enrolled = int((selected_mask & enrolled_mask).sum())
complier_prop = float(n_selected_enrolled) / n_selected - always_takers_prop
print('proportion of compliers', complier_prop)

proportion of compliers 0.2516745614275254


In [9]:
# Correlation between the lottery indicator and the enrollment indicator
# (off-diagonal entry of the 2x2 correlation matrix).
corr_matrix = np.corrcoef(sub_ohie.treatment_bin, sub_ohie.ohp_all_ever_admin_bin)
first_stage_corr = corr_matrix[0, 1]
print(first_stage_corr)

0.27831335601878815


On average, 15.5\% of participants who did not win the lottery nevertheless were able to enroll in the Oregon Health Plan. This jumps to about 40.6\% of participants who did win the lottery. Winning the OHP lottery raised participants' probability of enrollment by 25.2 percentage points. Under monotonicity, this suggests that $25.2\%$ of participants are "compliers" -- in other words, they would be induced by winning the lottery to enroll in Medicaid and would not enroll if they did not win.

The correlation between winning the lottery and Medicaid enrollment is 0.278, which is also relatively high. Overall, the lottery seems to be a rather strong instrument.

To identify the LATE in the subpopulations of compliers, we need the following assumptions
- monotonicity, which we have already mentioned: no unit is *less* likely to subscribe to Medicaid if they win the lottery (there are no "defiers"). That seems plausible
- relevance, meaning that the instrument has a significant influence on the treatment assignment, which is supported by the estimated proportion of compliers and the correlation we have established previously
- exclusion restriction: winning the lottery has no effect on health other than through subscribing to Medicaid. This seems rather plausible, unless winning the lottery also reminds people to take care of themselves and visit the doctor, in addition to subscribing to Medicaid. However, given the financial burden, seeking medical help is probably not independent from getting Medicaid.
- instrument unconfoundedness, likely here because the instrument (lottery) is randomized.

So we can identify the LATE.


In [10]:
# Add a column of ones; it is passed as the only exogenous regressor and so
# acts as the intercept of each model.
sub_ohie = sub_ohie.assign(const=1)

# One 2SLS fit per outcome: enrollment is the endogenous treatment and lottery
# selection is the instrument.
for y in outcomes:
    iv_fit = IV2SLS(
        dependent=sub_ohie[y],
        exog=sub_ohie.const,
        endog=sub_ohie.ohp_all_ever_admin_bin,
        instruments=sub_ohie.treatment_bin,
    ).fit(cov_type="unadjusted")
    print(y, iv_fit.summary)

tab2bp_hyper                           IV-2SLS Estimation Summary                          
Dep. Variable:           tab2bp_hyper   R-squared:                     -0.0003
Estimator:                    IV-2SLS   Adj. R-squared:                -0.0004
No. Observations:               11698   F-statistic:                    0.0290
Date:                Sun, Dec 11 2022   P-value (F-stat)                0.8648
Time:                        23:23:49   Distribution:                  chi2(1)
Cov. Estimator:            unadjusted                                         
                                                                              
                                   Parameter Estimates                                    
                        Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------------------
const                      0.1578     0.0084     18.791     0.0000      0.1413    

The IV estimate of Medicaid enrollment on elevated blood pressure is $0.0046$ with a 95\% confidence interval of $[-0.0481, 0.0573]$. We would fail to reject the null at $\alpha = .05$ that there is no effect of Medicaid enrollment on blood pressure. This differs from the difference-in-means estimate (which suggested that enrollment reduced the incidence of elevated blood pressure), suggesting that the naive association was primarily driven by unobserved confounding. We would conclude that enrollment does not affect the incidence of high blood pressure.


The IV estimate of Medicaid enrollment on positive depression screening is $-0.1438$ with a 95\% confidence interval of $[-0.2100, -0.0775]$. We would reject the null of no treatment effect at the $\alpha = .05$ level and conclude that Medicaid enrollment reduced the incidence of screening positive for depression by about 14.4 percentage points.

This differs substantially from the difference-in-means estimate, which suggested that Medicaid enrollment *raised* the incidence of depression. Using the IV strategy, we conclude the exact opposite. 


The IV estimate of Medicaid enrollment on catastrophic expenditures is $-0.0629$ with a 95\% confidence interval of $[-0.0931, -0.0326]$. We would reject the null of no treatment effect at the $\alpha = .05$ level and conclude that Medicaid enrollment reduced the incidence of catastrophic expenditures by about 6 percentage points.

This is actually considerably larger in magnitude than our naive difference-in-means estimate ($-0.063$ versus $-0.010$), though both estimates are in the same direction and both have confidence intervals that do not include zero. However, our CIs are much larger in the 2SLS case (which intuitively makes sense since we're leveraging only the variation induced by the instrument).

The IV estimate of Medicaid enrollment on whether participants felt their medical needs were met is $0.1279$ with a 95\% confidence interval of $[0.0582, 0.1976]$. We would reject the null of no treatment effect at the $\alpha = .05$ level and conclude that Medicaid enrollment improved respondents' self-assessment of whether their medical needs were met.

Again, this is larger than our naive difference-in-means estimate, though in the same direction (positive).


IV estimates are local average treatment effects on the sub-population of "compliers." In order to generalize to the entire population (including those who would always/never receive coverage irrespective of winning the lottery), we would have to assume that the average treatment effect is the same across always-takers, never-takers and compliers (in other words, there is no effect heterogeneity driven by the principal strata). A "constant" effects assumption would be a stronger version of this assumption (though in this case, constant effects is likely implausible).

In this case, such a no-heterogeneity assumption may not be plausible.