# Dependencies

In [1]:
import pandas as pd
import numpy as np
from statsmodels.stats.proportion import proportions_ztest
import statsmodels.formula.api as smf
from statsmodels.formula.api import ols
from pathlib import Path
from collections import Counter

# scikit-learn: iterative imputer setup
from sklearn.experimental import enable_iterative_imputer  
from sklearn.impute import IterativeImputer
from sklearn.ensemble import ExtraTreesRegressor

# Global pandas display settings for easier inspection
pd.set_option("display.max_columns", 100)
pd.set_option("display.width", 150)
pd.set_option("display.precision", 3)

# Loading Data

### Part I: Outcome Data

In [2]:
data_dir = Path("../Data_Used")

# For 12m survey data
# I will only pull columns for outcomes I need, no covariates as this is post-treatment
col_keep_y = [
    # --- A. Key & Filter Variables ---
    'person_id',      # The unique ID to merge all files
    'returned_12m',   # Later will restrict analysis sample to returned_12m == 1
    'weight_12m',     # Survey weight adjusting for 12-month survey nonresponse
    'weight_intensive_12m',
    'weight_newlottery_12m',

    # --- B. Primary Financial Outcomes (Y Variables) ---
    # These variables directly measure financial hardship.
    
    # 1. Medical Debt
    'cost_any_owe_12m',   # [1/0] Do they have *any* medical debt?
    'cost_tot_owe_12m',   # [Num] How much medical debt do they have?

    # 2. Financial Distress
    'cost_borrow_12m',    # [1/0] Did they have to skip bills or borrow?
    'cost_refused_12m',   # [1/0] Were they refused care for non-payment? 

    # These variables let us build our "catastrophic" outcome and
    # conduct more detailed "for what" analysis.
    
    'cost_tot_oop_12m',   # [Num] Total Out-of-Pocket spending
    'cost_any_oop_12m',   # [1/0] Any Out-of-Pocket spending
    'hhinc_cat_12m',      # [Cat] Household income. 
                          
    # 3. Detailed Spending (for secondary analysis)
    'cost_doc_oop_12m',   # [Num] OOP spending on doctors
    'cost_er_oop_12m',    # [Num] OOP spending on ER visits
    'cost_rx_oop_12m',    # [Num] OOP spending on prescriptions (a key theoretical channel)
    'cost_oth_oop_12m'    # [Num] OOP spending on other care
]

df_y_path = data_dir / "oregonhie_survey12m_vars.dta"
df_y = pd.read_stata(
    df_y_path,
    columns=col_keep_y,
    convert_categoricals=False,
    preserve_dtypes=True
)
if df_y["person_id"].isna().any():
    raise ValueError("df_y has missing person_id values.")

df_y["person_id"] = df_y["person_id"].astype("int64")

print("df_y shape:", df_y.shape)
df_y.head()

df_y shape: (74922, 16)


Unnamed: 0,person_id,returned_12m,weight_12m,weight_intensive_12m,weight_newlottery_12m,cost_any_owe_12m,cost_tot_owe_12m,cost_borrow_12m,cost_refused_12m,cost_tot_oop_12m,cost_any_oop_12m,hhinc_cat_12m,cost_doc_oop_12m,cost_er_oop_12m,cost_rx_oop_12m,cost_oth_oop_12m
0,1,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,2,1.0,1.0,1.0,1.0,0.0,0.0,0.0,,0.0,0.0,5.0,0.0,0.0,0.0,0.0
2,3,0.0,0.0,0.0,1.047,,,,,,,,,,,
3,4,,,,,,,,,,,,,,,
4,5,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,11.0,0.0,0.0,0.0,0.0


### Part 2: Covariate Data

In [3]:
# For 0m survey data
# To pull covariates (X)

col_keep_X = [
    # --- A. Key & Filter Variables ---
    'person_id',         # The unique ID to merge all files
    'returned_0m',       # I will use this to confirm the row has baseline data
    'surv_lang_0m',      # Language of survey,
    
    # --- B. Baseline Health Need & Utilization ---

    'needmet_med_0m',    # [1/0] Did they get *all* the care they needed? (Captures unmet need)
    'needmet_rx_0m',     # [1/0] Did they get *all* the medication they needed? (Captures unmet need)

    'rx_num_mod_0m',     # [Num] How many *different* prescription drugs? (Measures chronic need)
    'doc_num_mod_0m',    # [Num] Number of doctor visits in last 6 mos (Measures utilization)
    'er_num_mod_0m',     # [Num] Number of ER visits in last 6 mos (Measures high-cost shocks)
    'hosp_num_mod_0m',   # [Num] Number of hospitalizations in last 6 mos (Measures high-cost shocks)
    'need_rx_0m',        # [1/0] Did they have all medication needed
    'need_med_0m',       # [1/0] Did they have all treatment needed
    
    'ins_months_0m',     # [Num] How many of the last 6 mos were they insured? (A key eligibility/need var)
    
    # --- C. Baseline Health Status ---
    'health_gen_0m',     # [1-5] Overall health (Excellent, Good, Fair, Poor)
    'baddays_phys_0m',   # [0-30] Num days physical health was "not good"
    'baddays_ment_0m',   # [0-30] Num days mental health was "not good"
    'health_chg_0m',     # [1-3] Health trajectory (Better, Same, Worse)

    # --- D. Baseline Diagnosed Conditions ---
    'dia_dx_0m',         # [1/0] Diagnosed with Diabetes
    'ast_dx_0m',         # [1/0] Diagnosed with Asthma
    'hbp_dx_0m',         # [1/0] Diagnosed with High Blood Pressure
    'emp_dx_0m',         # [1/0] Diagnosed with Emphysema/COPD
    'chf_dx_0m',         # [1/0] Diagnosed with Congestive Heart Failure
    'dep_dx_0m',         # [1/0] Diagnosed with Depression/Anxiety

    # --- E. Baseline Demographics ---
    'female_0m',         # [1/0] Gender variable
    'birthyear_0m',      # [Year] I will use this to calculate baseline age
    'edu_0m',            # [Cat] Highest level of education
    
    # --- F. Baseline Race & Ethnicity ---
    'race_hisp_0m',      # [1/0] Hispanic
    'race_white_0m',     # [1/0] White
    'race_black_0m',     # [1/0] Black
    'race_amerindian_0m',# [1/0] American Indian/Alaska Native
    'race_asian_0m',     # [1/0] Asian
    'race_pacific_0m',   # [1/0] Native Hawaiian/Pacific Islander
    'race_other_qn_0m',  # [1/0] Other race
    
    # --- G. Baseline Employment & Household ---
    'employ_0m',         # [0/1] Employed 
    'employ_hrs_0m',     # [Cat] Hours worked per week (Note: Corrected from 'emply_hrs_qn_0m')
    'hhinc_cat_0m',      # [Cat] Household income category (CRITICAL covariate)
    'hhsize_0m',         # [Num] Household size
    'num19_0m',          # [Num] Number of children under 19 in household
    
    # --- H. Baseline Financial Status ---  
    'cost_any_oop_0m',        # [1/0] Any out of pocket costs for medical care
    'cost_borrow_0m',         # [1/0] Borrowed money/skipped bills to pay health care bills
    'cost_any_owe_0m',        # [1/0] Currently owe money for medical expenses
    'cost_tot_owe_0m',        # [Num] Total amount currently owed for medical expenses
    'cost_refused_0m',        # [1/0] Have you been refused care because you owed money for a past treatment?
    'cost_tot_oop_correct_0m' # [Num] Total *corrected* out-of-pocket spending
]

df_X_path = data_dir / "oregonhie_survey0m_vars.dta"
df_X = pd.read_stata(
    df_X_path,
    columns=col_keep_X,
    convert_categoricals=False,
    preserve_dtypes=True
)
if df_X["person_id"].isna().any():
    raise ValueError("df_X has missing person_id values.")

df_X["person_id"] = df_X["person_id"].astype("int64")

print("df_X shape:", df_X.shape)
df_X.head()

df_X shape: (74922, 43)


Unnamed: 0,person_id,returned_0m,surv_lang_0m,needmet_med_0m,needmet_rx_0m,rx_num_mod_0m,doc_num_mod_0m,er_num_mod_0m,hosp_num_mod_0m,need_rx_0m,need_med_0m,ins_months_0m,health_gen_0m,baddays_phys_0m,baddays_ment_0m,health_chg_0m,dia_dx_0m,ast_dx_0m,hbp_dx_0m,emp_dx_0m,chf_dx_0m,dep_dx_0m,female_0m,birthyear_0m,edu_0m,race_hisp_0m,race_white_0m,race_black_0m,race_amerindian_0m,race_asian_0m,race_pacific_0m,race_other_qn_0m,employ_0m,employ_hrs_0m,hhinc_cat_0m,hhsize_0m,num19_0m,cost_any_oop_0m,cost_borrow_0m,cost_any_owe_0m,cost_tot_owe_0m,cost_refused_0m,cost_tot_oop_correct_0m
0,1,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,2,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,3,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,4,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,5,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


### Part 3: Instrument Data

In [4]:
# For descriptive data
# To pull instrument mainly
col_keep_IV = [
    # --- A. Key & Instrument Variables ---
    'person_id',         # The unique ID to merge all files
    'household_id',      # The unique household ID
    'treatment',         # [1/0] This is the Instrument (Z)
    
    # --- B. Critical Control Variable ---
    'numhh_list',        # [Num] Number of people in the household on the lottery list.
                         
    # --- C. Baseline Demographics (from lottery list) --- collected before the lottery
    'zip_msa_list',      # [1/0] Is the zip code in a Metropolitan Statistical Area (urban vs. rural)
    'female_list',       # [1/0] Gender from lottery sign-up card (backup)
    'birthyear_list',    # [Year] Birth year from lottery sign-up card (backup)               
]

df_IV_path = data_dir / "oregonhie_descriptive_vars.dta"
df_IV = pd.read_stata(
    df_IV_path,
    columns=col_keep_IV,
    convert_categoricals=False,
    preserve_dtypes=True
)
if df_IV["person_id"].isna().any():
    raise ValueError("df_IV has missing person_id values.")
if df_IV["household_id"].isna().any():
    raise ValueError("df_IV has missing household_id values.")

df_IV["person_id"] = df_IV["person_id"].astype("int64")
df_IV["household_id"] = df_IV["household_id"].astype("int64")

print("df_IV shape:", df_IV.shape)
df_IV.head()

df_IV shape: (74922, 7)


Unnamed: 0,person_id,household_id,treatment,numhh_list,zip_msa_list,female_list,birthyear_list
0,1,100001,1,1,1.0,0.0,1978
1,2,100002,1,1,1.0,1.0,1984
2,3,100003,0,1,1.0,1.0,1971
3,4,100004,0,1,1.0,1.0,1955
4,5,100005,1,1,1.0,1.0,1969


### Part 4: Treatment Data

In [5]:
# For state program data
# To pull treatment
col_keep_W = [
    # --- A. Key & Treatment Variables ---
    'person_id',                       # The unique ID to merge all files
    
    'ohp_all_ever_firstn_30sep2009',    # [1/0] This is Treatment (W).
                                       # The codebook confirms this is the correct
                                       # "ever enrolled in Medicaid" variable to use
    'ohp_all_mo_firstn_30sep2009'      # continuous treatment intensity

]

df_W_path = data_dir / "oregonhie_stateprograms_vars.dta"
df_W = pd.read_stata(
    df_W_path,
    columns=col_keep_W,
    convert_categoricals=False,
    preserve_dtypes=True
)
if df_W["person_id"].isna().any():
    raise ValueError("df_W has missing person_id values.")

df_W["person_id"] = df_W["person_id"].astype("int64")

print("df_W shape:", df_W.shape)
df_W.head()

df_W shape: (74922, 3)


Unnamed: 0,person_id,ohp_all_ever_firstn_30sep2009,ohp_all_mo_firstn_30sep2009
0,1,0,0
1,2,1,12
2,3,0,0
3,4,1,18
4,5,0,0


### Sanity Checks

In [6]:
def validate_df(df, name):
    print(f"\n{name}: {df.shape[0]:,} rows × {df.shape[1]} cols")
    
    missing = df['person_id'].isna().sum()
    if missing > 0:
        raise ValueError(f"{name} has {missing} missing person_id values")
    
    dups = df['person_id'].duplicated().sum()
    if dups > 0:
        raise ValueError(f"{name} has {dups} duplicate person_id values")
    
    print(f"  - ID type: {df['person_id'].dtype}")
    print(f"  - Unique IDs: {df['person_id'].nunique():,}")

# Run validation on all datasets
validate_df(df_y, "Outcomes (df_y)")
validate_df(df_X, "Covariates (df_X)")
validate_df(df_IV, "IV/Lottery (df_IV)")
validate_df(df_W, "Treatment (df_W)")

pid_sets = {
    'outcomes': set(df_y['person_id']),
    'covariates': set(df_X['person_id']),
    'IV': set(df_IV['person_id']),
    'treatment': set(df_W['person_id'])
}

common_ids_all_files = set.intersection(*pid_sets.values())
print(f"\nCommon IDs across all four files: {len(common_ids_all_files):,} individuals")
print("  (Intersection of all person_id sets)")

for name, pid_set in pid_sets.items():
    loss = len(pid_set - common_ids_all_files)
    print(f"  - Present in {name}: {len(pid_set):,} (lost {loss:,} when intersecting all files)")


Outcomes (df_y): 74,922 rows × 16 cols
  - ID type: int64
  - Unique IDs: 74,922

Covariates (df_X): 74,922 rows × 43 cols
  - ID type: int64
  - Unique IDs: 74,922

IV/Lottery (df_IV): 74,922 rows × 7 cols
  - ID type: int64
  - Unique IDs: 74,922

Treatment (df_W): 74,922 rows × 3 cols
  - ID type: int64
  - Unique IDs: 74,922

Common IDs across all four files: 74,922 individuals
  (Intersection of all person_id sets)
  - Present in outcomes: 74,922 (lost 0 when intersecting all files)
  - Present in covariates: 74,922 (lost 0 when intersecting all files)
  - Present in IV: 74,922 (lost 0 when intersecting all files)
  - Present in treatment: 74,922 (lost 0 when intersecting all files)


# Merging Datasets

In [7]:
# start with universe (IV), safe for attrition analysis and errors out if person_id is not 1:1
df_merged = (df_IV
             .merge(df_X, on="person_id", how="left", validate="1:1")
             .merge(df_y, on="person_id", how="left", validate="1:1")
             .merge(df_W, on="person_id", how="left", validate="1:1")
            )

print(f"After merge: {df_merged.shape[0]:,} rows × {df_merged.shape[1]} columns")

After merge: 74,922 rows × 66 columns


### Rename columns

In [8]:
# To alighn with Econometric Standard (Z, W)
rename_map = {
    'treatment': 'Z_lottery',                     # The Instrument
    'ohp_all_ever_firstn_30sep2009': 'W_medicaid' # The Treatment
}

df_merged.rename(columns=rename_map, inplace=True)

# Attrition Analysis

In [9]:
# If some rows have missing Z_lottery, drop them for these diagnostics
mask_Z = df_merged['Z_lottery'].isin([0, 1])
df_attr = df_merged.loc[mask_Z].copy()

# Define responded indicator: 1 if 12m survey returned, 0 otherwise
df_attr['responded'] = (df_attr['returned_12m'] == 1).astype(int)

df_attr['in_12m_sample'] = df_attr['returned_12m'].notna().astype(int)
df_attr_12m = df_attr[df_attr['in_12m_sample'] == 1].copy()

# Overall attrition by lottery status with cluster-robust test
attrition_summary = (
    df_attr_12m
    .groupby('Z_lottery')['responded']
    .agg(N_Total='count',
         N_Responded='sum',
         Response_Rate=lambda x: 100 * x.mean())
    .reset_index())

# Ensure clean formatting
attrition_summary['N_Total'] = attrition_summary['N_Total'].astype(int)
attrition_summary['N_Responded'] = attrition_summary['N_Responded'].astype(int)
attrition_summary['Response_Rate'] = attrition_summary['Response_Rate'].round(3)

print("ATTRITION SUMMARY BY LOTTERY STATUS (within 12m survey frame)")
print("="*70)
print(attrition_summary.to_string(index=False))

# Cluster-robust LPM: responded ~ Z_lottery
attrition_model = smf.ols('responded ~ Z_lottery', data=df_attr_12m).fit(
    cov_type='cluster',
    cov_kwds={'groups': df_attr_12m['household_id']})

diff_pp = (attrition_summary.loc[attrition_summary['Z_lottery'] == 1, 'Response_Rate'].iloc[0]
         - attrition_summary.loc[attrition_summary['Z_lottery'] == 0, 'Response_Rate'].iloc[0])

print(f"\nDifference in response rate (Z=1 minus Z=0): {diff_pp:+.2f} percentage points")
print(f"Cluster-robust p-value for Z_lottery: {attrition_model.pvalues['Z_lottery']:.4f}")

alpha = 0.05
p_val = attrition_model.pvalues['Z_lottery']
if p_val < alpha:
    print(f"p-value = {p_val:.4g} < {alpha}")
    print("⇒ Overall 12m response differs significantly by lottery status.")
else:
    print(f"p-value = {p_val:.4g} ≥ {alpha}")
    print("⇒ No statistically significant difference in overall 12m response by lottery status.")

ATTRITION SUMMARY BY LOTTERY STATUS (within 12m survey frame)
 Z_lottery  N_Total  N_Responded  Response_Rate
         0    28816        11966         41.526
         1    29589        11811         39.917

Difference in response rate (Z=1 minus Z=0): -1.61 percentage points
Cluster-robust p-value for Z_lottery: 0.0003
p-value = 0.0002814 < 0.05
⇒ Overall 12m response differs significantly by lottery status.


In [10]:
# Define 12m sample indicator and attriters among that sample
attriters = df_attr_12m[df_attr_12m['responded'] == 0]

attriters_Z0 = attriters[attriters['Z_lottery'] == 0]
attriters_Z1 = attriters[attriters['Z_lottery'] == 1]

# testing for a subset of covariates: demographic, medical, financial
balance_vars = ['birthyear_0m', 'female_0m', 'health_gen_0m', 'hhinc_cat_0m', 'cost_any_owe_0m']
means_Z0 = attriters_Z0[balance_vars].mean()
means_Z1 = attriters_Z1[balance_vars].mean()
vars_Z0 = attriters_Z0[balance_vars].var()
vars_Z1 = attriters_Z1[balance_vars].var()
sd_pooled = np.sqrt(0.5 * vars_Z0 + 0.5 * vars_Z1)
std_diff = (means_Z1 - means_Z0) / sd_pooled
attriter_balance = pd.DataFrame({
    'Mean_Z0_attriters': means_Z0,
    'Mean_Z1_attriters': means_Z1,
    'Raw_diff_Z1_minus_Z0': means_Z1 - means_Z0,
    'Std_diff_Z1_minus_Z0': std_diff}).round(3)
col_Z0 = f"Mean_Z0_attriters (N={len(attriters_Z0):,})"
col_Z1 = f"Mean_Z1_attriters (N={len(attriters_Z1):,})"
attriter_balance = attriter_balance.rename(columns={
    'Mean_Z0_attriters': col_Z0,
    'Mean_Z1_attriters': col_Z1})
display(attriter_balance)
print("\nRule of thumb: |Std_diff| > 0.1 indicates non-trivial imbalance.")


Unnamed: 0,"Mean_Z0_attriters (N=16,850)","Mean_Z1_attriters (N=17,778)",Raw_diff_Z1_minus_Z0,Std_diff_Z1_minus_Z0
birthyear_0m,1967.894,1968.447,0.553,0.046
female_0m,0.534,0.529,-0.005,-0.009
health_gen_0m,2.805,2.813,0.008,0.007
hhinc_cat_0m,6.058,6.315,0.258,0.057
cost_any_owe_0m,0.676,0.654,-0.021,-0.045



Rule of thumb: |Std_diff| > 0.1 indicates non-trivial imbalance.


In [11]:
# Restrict to units with observed W_medicaid
mask_W = df_attr['W_medicaid'].notna()
df_fs_full = df_attr.loc[mask_W].copy()
df_fs_resp = df_attr.loc[mask_W & (df_attr['returned_12m'] == 1)].copy()
fs_rates_full = df_fs_full.groupby('Z_lottery')['W_medicaid'].mean()
fs_rates_resp = df_fs_resp.groupby('Z_lottery')['W_medicaid'].mean()
fs_table = pd.DataFrame({
    'Sample': ['Full randomized sample', '12m responders only'],
    'Z=0 enrollment rate': [fs_rates_full.get(0, np.nan), fs_rates_resp.get(0, np.nan)],
    'Z=1 enrollment rate': [fs_rates_full.get(1, np.nan), fs_rates_resp.get(1, np.nan)]}).round(3)
print(fs_table.to_string(index=False))

# First-stage regressions with household clustering
fs_model_full = smf.ols('W_medicaid ~ Z_lottery', data=df_fs_full).fit(
    cov_type='cluster',
    cov_kwds={'groups': df_fs_full['household_id']})
fs_model_resp = smf.ols('W_medicaid ~ Z_lottery', data=df_fs_resp).fit(
    cov_type='cluster',
    cov_kwds={'groups': df_fs_resp['household_id']})

print("\nFirst-stage F-statistics (cluster-robust):")
print(f"  Full sample:     F = {fs_model_full.fvalue:.2f}, p = {fs_model_full.f_pvalue:.4g}")
print(f"  12m responders:  F = {fs_model_resp.fvalue:.2f}, p = {fs_model_resp.f_pvalue:.4g}")

                Sample  Z=0 enrollment rate  Z=1 enrollment rate
Full randomized sample                0.146                0.395
   12m responders only                0.134                0.428

First-stage F-statistics (cluster-robust):
  Full sample:     F = 5006.46, p = 0
  12m responders:  F = 2492.75, p = 0


In [12]:
# Only units in the 12m survey file have weight_12m
mask_weight = df_attr['weight_12m'].notna()
df_weight = df_attr.loc[mask_weight].copy()

unweighted_rate = (df_weight['returned_12m'] == 1).mean()
weighted_rate = np.average(
    (df_weight['returned_12m'] == 1).astype(int),
    weights=df_weight['weight_12m'])

print(f"Unweighted 12m response rate: {unweighted_rate:.3%}")
print(f"Weighted 12m response rate:   {weighted_rate:.3%}")
print(f"Difference (weighted - unweighted): {weighted_rate - unweighted_rate:+.3%}")

Unweighted 12m response rate: 40.711%
Weighted 12m response rate:   50.039%
Difference (weighted - unweighted): +9.328%


In [13]:
if abs(diff_pp) > 2.0:
    print(f"Differential attrition is larger than 2pp (|{diff_pp:.1f}| > 2).")
else:
    print(f"Differential attrition is modest (≈ {diff_pp:.1f} pp).")

max_std_diff = attriter_balance['Std_diff_Z1_minus_Z0'].abs().max()
print(f"Max standardized difference among attriters: {max_std_diff:.3f}")
if max_std_diff > 0.1:
    print("Some baseline variables among attriters show |StdDiff| > 0.1.")
else:
    print("Attriter baseline balance looks acceptable (all |StdDiff| ≤ 0.1).")

Differential attrition is modest (≈ -1.6 pp).
Max standardized difference among attriters: 0.057
Attriter baseline balance looks acceptable (all |StdDiff| ≤ 0.1).


# Analysis Sample

In [14]:
# Keep only individuals who answered BOTH baseline (0m) and 12m surveys
df_analysis_sample = df_merged.loc[
    (df_merged["returned_0m"] == 1) &
    (df_merged["returned_12m"] == 1)
].copy()

print(f"Analysis sample (responded to 0m and 12m): "
      f"{df_analysis_sample.shape[0]:,} rows × {df_analysis_sample.shape[1]} columns")

Analysis sample (responded to 0m and 12m): 16,579 rows × 66 columns


In [15]:
# Distribution of instrument (Z) and treatment (W) in the analysis sample
print(df_analysis_sample["Z_lottery"].value_counts(dropna=False), "\n")

print(df_analysis_sample["W_medicaid"].value_counts(dropna=False), "\n")

# Explicitly confirm no missing values in Z or W
print("Missing values:")
print("Z_lottery: ", df_analysis_sample["Z_lottery"].isna().sum())
print("W_medicaid:", df_analysis_sample["W_medicaid"].isna().sum())

Z_lottery
0    8432
1    8147
Name: count, dtype: int64 

W_medicaid
0    11818
1     4761
Name: count, dtype: int64 

Missing values:
Z_lottery:  0
W_medicaid: 0


# Balance Checks

In [16]:
# Choose a set of key pre-treatment covariates:
#   - demographics
#   - baseline health
#   - baseline financial situation
#   - pre-lottery list info
balance_vars = [
    'birthyear_0m',    # age proxy (you can later switch to age_0m once you construct it)
    'female_0m',
    'health_gen_0m',
    'baddays_phys_0m',
    'baddays_ment_0m',
    'dia_dx_0m', 'ast_dx_0m', 'hbp_dx_0m', 'emp_dx_0m', 'chf_dx_0m', 'dep_dx_0m',
    'hhinc_cat_0m',
    'hhsize_0m',
    'num19_0m',
    'cost_any_owe_0m',
    'cost_tot_owe_0m',
    'cost_any_oop_0m',
    'ins_months_0m',
    'zip_msa_list',    # pre-lottery MSA indicator
    'numhh_list'       # household size on lottery list (pre-randomization)
]

pd.options.display.float_format = '{:,.2f}'.format
# Some vars might be missing in rare cases; keep only those present in the DataFrame
balance_vars = [v for v in balance_vars if v in df_analysis_sample.columns]

means_Z0 = df_analysis_sample.loc[df_analysis_sample['Z_lottery'] == 0, balance_vars].mean()
means_Z1 = df_analysis_sample.loc[df_analysis_sample['Z_lottery'] == 1, balance_vars].mean()

vars_Z0 = df_analysis_sample.loc[df_analysis_sample['Z_lottery'] == 0, balance_vars].var()
vars_Z1 = df_analysis_sample.loc[df_analysis_sample['Z_lottery'] == 1, balance_vars].var()

sd_pooled = np.sqrt(0.5 * vars_Z0 + 0.5 * vars_Z1)
std_diff = (means_Z1 - means_Z0) / sd_pooled

balance_table = pd.DataFrame({
    'Mean_Z0': means_Z0,
    'Mean_Z1': means_Z1,
    'Raw_diff_Z1_minus_Z0': means_Z1 - means_Z0,
    'Std_diff_Z1_minus_Z0': std_diff
}).round(3)

display(balance_table)
print("\nRule of thumb: |Std_diff| > 0.1 indicates non-trivial imbalance.")

max_std_diff = balance_table['Std_diff_Z1_minus_Z0'].abs().max()
print(f"\nMax standardized difference in analysis sample: {max_std_diff:.3f}")
# here cost_tot_owe_0m will later be taken in log

Unnamed: 0,Mean_Z0,Mean_Z1,Raw_diff_Z1_minus_Z0,Std_diff_Z1_minus_Z0
birthyear_0m,1964.69,1965.0,0.31,0.03
female_0m,0.6,0.57,-0.03,-0.05
health_gen_0m,2.75,2.83,0.07,0.07
baddays_phys_0m,9.67,9.08,-0.59,-0.05
baddays_ment_0m,11.24,10.41,-0.83,-0.07
dia_dx_0m,0.12,0.11,-0.01,-0.04
ast_dx_0m,0.16,0.15,-0.01,-0.04
hbp_dx_0m,0.29,0.27,-0.02,-0.05
emp_dx_0m,0.08,0.07,-0.01,-0.03
chf_dx_0m,0.03,0.03,-0.0,-0.01



Rule of thumb: |Std_diff| > 0.1 indicates non-trivial imbalance.

Max standardized difference in analysis sample: 0.209


# Check Instrument Strength

In [17]:
# Compliance patterns (raw counts)
compliance_tab = pd.crosstab(df_analysis_sample["Z_lottery"],
                             df_analysis_sample["W_medicaid"], 
                             margins=True)
print("\nJoint distribution of (Z, W) in analysis sample:")
print(compliance_tab)

always_takers = compliance_tab.loc[0, 1] # Z=0, W=1
never_takers = compliance_tab.loc[1, 0] # Z=1, W=0

print("\nObserved compliance patterns (counts):")
print(f"  Always-takers (Z=0, W=1): {always_takers:,}")
print(f"  Never-takers  (Z=1, W=0): {never_takers:,}")
print("  Interpretation as always-/never-takers relies on the monotonicity assumption (no defiers).")

# ITT on W (first-stage mean difference)
enrollment_means = df_analysis_sample.groupby("Z_lottery")["W_medicaid"].mean()
mean_lost = enrollment_means.loc[0]
mean_won = enrollment_means.loc[1]
compliance_rate = mean_won - mean_lost

print("\nEnrollment rates by lottery status in analysis sample:")
print(f"  Enrollment rate (Z=1, won):  {mean_won:.1%} (n={compliance_tab.loc[1, 'All']:,})")
print(f"  Enrollment rate (Z=0, lost): {mean_lost:.1%} (n={compliance_tab.loc[0, 'All']:,})")
print(f"  ITT effect Z → W (compliance rate): {compliance_rate:+.1%} points")

# RAW first stage (for weak instrument diagnostic)
fs_raw = smf.ols('W_medicaid ~ Z_lottery', data=df_analysis_sample).fit(cov_type='cluster',
                                                cov_kwds={'groups': df_analysis_sample['household_id']})
fs_raw_test = fs_raw.f_test("Z_lottery = 0")
F_raw = float(fs_raw_test.fvalue)
p_raw = float(fs_raw_test.pvalue)

print("\nRaw first stage (diagnostic, no controls):")
print(f"  Coefficient on Z_lottery: {fs_raw.params['Z_lottery']:.4f}")
print(f"  Robust SE:                {fs_raw.bse['Z_lottery']:.4f}")
print(f"  Robust F-statistic:       {F_raw:.2f}")
print(f"  Robust p-value:           {p_raw:.4g}")

F_THRESHOLD = 10
if F_raw < F_THRESHOLD:
    print(f"\nWeak first stage (robust F = {F_raw:.1f} < {F_THRESHOLD}).")
else:
    print(f"\nStrong first stage (robust F = {F_raw:.1f} ≥ {F_THRESHOLD}).")

# Conditional first stage WITH numhh_list (for estimation)
fs_data = df_analysis_sample.dropna(subset=['W_medicaid', 'Z_lottery', 'numhh_list', 'household_id'])

fs_adj = smf.ols('W_medicaid ~ Z_lottery + numhh_list', data=fs_data).fit(cov_type='cluster',
                                                    cov_kwds={'groups': fs_data['household_id']})

print("\nAdjusted first stage (with numhh_list control):")
print(fs_adj.summary().tables[1].as_text())

fs_test = fs_adj.f_test("Z_lottery = 0")
f_statistic = float(fs_test.fvalue)
p_f = float(fs_test.pvalue)
t_statistic = fs_adj.tvalues["Z_lottery"]

print(f"\nCluster-robust F-stat for Z_lottery (conditional): {f_statistic:.2f} (p = {p_f:.4g})")
print(f"Cluster-robust t-stat for Z_lottery:                 {t_statistic:.2f}")


Joint distribution of (Z, W) in analysis sample:
W_medicaid      0     1    All
Z_lottery                     
0            7328  1104   8432
1            4490  3657   8147
All         11818  4761  16579

Observed compliance patterns (counts):
  Always-takers (Z=0, W=1): 1,104
  Never-takers  (Z=1, W=0): 4,490
  Interpretation as always-/never-takers relies on the monotonicity assumption (no defiers).

Enrollment rates by lottery status in analysis sample:
  Enrollment rate (Z=1, won):  44.9% (n=8,147)
  Enrollment rate (Z=0, lost): 13.1% (n=8,432)
  ITT effect Z → W (compliance rate): +31.8% points

Raw first stage (diagnostic, no controls):
  Coefficient on Z_lottery: 0.3179
  Robust SE:                0.0071
  Robust F-statistic:       2021.84
  Robust p-value:           0

Strong first stage (robust F = 2021.8 ≥ 10).

Adjusted first stage (with numhh_list control):
                 coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------

# LaTeX Table

In [18]:
# def get_balance_row(df, var, label):
#     grp = df.groupby('Z_lottery')[var]
#     mean_0 = grp.mean()[0]
#     mean_1 = grp.mean()[1]
    
#     var_0 = grp.var()[0]
#     var_1 = grp.var()[1]
#     sd_pooled = np.sqrt(0.5 * var_0 + 0.5 * var_1)
    
#     diff = mean_1 - mean_0
#     std_diff = diff / sd_pooled
    
#     # Formatting: special cases if you ever want them
#     if 'birthyear' in var:
#         m0_str = f"{mean_0:.0f}"
#         m1_str = f"{mean_1:.0f}"
#         diff_str = f"{diff:.1f}"
#     else:
#         # log debt and all other continuous vars: 3 decimals
#         m0_str = f"{mean_0:.3f}"
#         m1_str = f"{mean_1:.3f}"
#         diff_str = f"{diff:.3f}"
        
#     return f"{label} & {m0_str} & {m1_str} & {diff_str} & {std_diff:.3f} \\\\"


# Model 1: Raw
m1 = smf.ols('W_medicaid ~ Z_lottery', data=df_analysis_sample).fit(
    cov_type='cluster', cov_kwds={'groups': df_analysis_sample['household_id']}
)

# Model 2: Adjusted (Design-adjusted: add numhh_list)
df_fs = df_analysis_sample.dropna(subset=['numhh_list'])
m2 = smf.ols('W_medicaid ~ Z_lottery + numhh_list', data=df_fs).fit(
    cov_type='cluster', cov_kwds={'groups': df_fs['household_id']}
)

print("\n" + "="*30)
print("LATEX CODE FOR TABLE 1 (FIRST STAGE)")
print("="*30 + "\n")

# Cluster-robust first-stage F for Z_lottery only
F1 = float(m1.f_test("Z_lottery = 0").fvalue)
F2 = float(m2.f_test("Z_lottery = 0").fvalue)

control_mean = df_analysis_sample.loc[df_analysis_sample['Z_lottery'] == 0, 'W_medicaid'].mean()

latex_t2 = [
    r"\begin{table}[htbp]",
    r"\centering",
    r"\caption{First Stage Effect of Lottery on Medicaid Enrollment}",
    r"\label{tab:first_stage}",
    r"\begin{tabular}{lcc}",
    r"\hline \hline",
    r" & (1) & (2) \\",
    r"Dependent Variable: & \multicolumn{2}{c}{Enrolled in Medicaid} \\",
    r"Specification: & Unadjusted & \textbf{Design-Adjusted} \\",
    r"\hline",
    r" & & \\",
    f"Lottery Win ($Z$) & {m1.params['Z_lottery']:.3f}*** & {m2.params['Z_lottery']:.3f}*** \\\\",
    f" & ({m1.bse['Z_lottery']:.3f}) & ({m2.bse['Z_lottery']:.3f}) \\\\",
    r" & & \\",
    r"Household Size Control & No & Yes \\\\",
    r"\hline",
    f"Control Mean ($Z=0$) & {control_mean:.3f} & {control_mean:.3f} \\\\",
    f"F-Statistic (Cluster) & {F1:.1f} & {F2:.1f} \\\\",
    f"Observations & {int(m1.nobs):,} & {int(m2.nobs):,} \\\\",
    r"\hline \hline",
    r"\multicolumn{3}{p{0.6\textwidth}}{\footnotesize \textit{Notes:} Standard errors clustered at the household level in parentheses. Column (2) controls for \texttt{numhh\_list} (household size on the lottery list), which is mechanically related to the probability that at least one household member receives an offer. *** p$<0.01$.}",
    r"\end{tabular}",
    r"\end{table}"
]

print("\n".join(latex_t2))



LATEX CODE FOR TABLE 1 (FIRST STAGE)

\begin{table}[htbp]
\centering
\caption{First Stage Effect of Lottery on Medicaid Enrollment}
\label{tab:first_stage}
\begin{tabular}{lcc}
\hline \hline
 & (1) & (2) \\
Dependent Variable: & \multicolumn{2}{c}{Enrolled in Medicaid} \\
Specification: & Unadjusted & \textbf{Design-Adjusted} \\
\hline
 & & \\
Lottery Win ($Z$) & 0.318*** & 0.325*** \\
 & (0.007) & (0.007) \\
 & & \\
Household Size Control & No & Yes \\\\
\hline
Control Mean ($Z=0$) & 0.131 & 0.131 \\
F-Statistic (Cluster) & 2021.8 & 2140.9 \\
Observations & 16,579 & 16,579 \\
\hline \hline
\multicolumn{3}{p{0.6\textwidth}}{\footnotesize \textit{Notes:} Standard errors clustered at the household level in parentheses. Column (2) controls for \texttt{numhh\_list} (household size on the lottery list), which is mechanically related to the probability that at least one household member receives an offer. *** p$<0.01$.}
\end{tabular}
\end{table}


# dTypes

In [19]:
# Fix dtype of surv_lang_0m
print(df_analysis_sample['surv_lang_0m'].value_counts())

df_analysis_sample['surv_lang_0m'] = np.where(
    df_analysis_sample['surv_lang_0m'] == 'English', 1, 0)

print(df_analysis_sample['surv_lang_0m'].value_counts())

surv_lang_0m
English    15739
Spanish      840
Name: count, dtype: int64
surv_lang_0m
1    15739
0      840
Name: count, dtype: int64


# Missing Values: Part 1

### Current Missing Value Stats

In [20]:
# info on fully merged respondants
df_analysis_sample.info()

<class 'pandas.core.frame.DataFrame'>
Index: 16579 entries, 5 to 74920
Data columns (total 66 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   person_id                    16579 non-null  int64  
 1   household_id                 16579 non-null  int64  
 2   Z_lottery                    16579 non-null  int8   
 3   numhh_list                   16579 non-null  int8   
 4   zip_msa_list                 16579 non-null  float64
 5   female_list                  16578 non-null  float64
 6   birthyear_list               16579 non-null  int16  
 7   returned_0m                  16579 non-null  float64
 8   surv_lang_0m                 16579 non-null  int64  
 9   needmet_med_0m               15814 non-null  float64
 10  needmet_rx_0m                16013 non-null  float64
 11  rx_num_mod_0m                15328 non-null  float64
 12  doc_num_mod_0m               16434 non-null  float64
 13  er_num_mod_0m        

In [21]:
# missing values and percentages
def missing_report(df_analysis_sample):
    missing_report = df_analysis_sample.isna().sum().to_frame(name="Missing_Count")
    missing_report["Percent_Missing"] = (missing_report["Missing_Count"] / len(df_analysis_sample) * 100).round(2)
    missing_report = (missing_report[missing_report["Missing_Count"] > 0]
                .sort_values("Percent_Missing", ascending=False))

    print(f"Total analysis sample size: {len(df_analysis_sample):,}")
    print("-" * 50)
    return missing_report  
original_missing = missing_report(df_analysis_sample)
original_missing

Total analysis sample size: 16,579
--------------------------------------------------


Unnamed: 0,Missing_Count,Percent_Missing
cost_tot_owe_12m,2801,16.89
cost_oth_oop_12m,2403,14.49
cost_tot_owe_0m,2344,14.14
cost_er_oop_12m,2047,12.35
cost_rx_oop_12m,1656,9.99
cost_doc_oop_12m,1638,9.88
hhsize_0m,1631,9.84
baddays_phys_0m,1473,8.88
baddays_ment_0m,1468,8.85
cost_any_oop_0m,1315,7.93


### Outcome Variables

#### Medical Debt

In [22]:
def medical_debt_fillin(any_tot_col):
    print("Before cleaning: Debt indicator vs total debt")
    print(df_analysis_sample[any_tot_col].value_counts(dropna=False))
    print('-'*50)
    # Case A: "Any debt" == 0, but total debt missing → set total debt = 0
    missing_tot_debt = ((df_analysis_sample[any_tot_col[0]] == 0) &
                        (df_analysis_sample[any_tot_col[1]].isna()))
    df_analysis_sample.loc[missing_tot_debt, any_tot_col[1]] = 0
    
    # Case B: total debt == 0, but "any debt" missing → set indicator = 0
    missing_any_debt = ((df_analysis_sample[any_tot_col[1]] == 0) &
                        (df_analysis_sample[any_tot_col[0]].isna()))
    df_analysis_sample.loc[missing_any_debt, any_tot_col[0]] = 0
    
    print("\nAfter cleaning: Debt indicator vs total debt")
    print(df_analysis_sample[[any_tot_col[0], any_tot_col[1]]]
          .value_counts(dropna=False))

medical_debt_fillin(["cost_any_owe_12m", "cost_tot_owe_12m"])

Before cleaning: Debt indicator vs total debt
cost_any_owe_12m  cost_tot_owe_12m
0.00              0.00                6840
1.00              NaN                 2023
0.00              NaN                  576
1.00              2,000.00             378
                  1,000.00             328
                                      ... 
                  337.21                 1
                  3,490,110.00           1
                  3,200,000.00           1
                  800,000.00             1
                  400,000.00             1
Name: count, Length: 783, dtype: int64
--------------------------------------------------

After cleaning: Debt indicator vs total debt
cost_any_owe_12m  cost_tot_owe_12m
0.00              0.00                7416
1.00              NaN                 2023
                  2,000.00             378
                  1,000.00             328
                  3,000.00             312
                                      ... 
                 

#### OOP (Total/Any) Costs

In [23]:
print("\nBefore cleaning: Any OOP vs total OOP")
print(df_analysis_sample[["cost_any_oop_12m", "cost_tot_oop_12m"]]
      .value_counts(dropna=False))
print('-'*50)

# Conflict : total OOP == 0 but "any OOP" == 1
oop_conflict = ((df_analysis_sample["cost_tot_oop_12m"] == 0) &
                (df_analysis_sample["cost_any_oop_12m"] == 1))
print(f"\nConflict 1 (total=0, any=1): {oop_conflict.sum()} rows")

# Trust the specific dollar amount (intensive margin) more than the yes/no flag.
# So if total == 0, force "any OOP" to 0.
df_analysis_sample.loc[oop_conflict, "cost_any_oop_12m"] = 0

# Re-check conflict
conflict_after = ((df_analysis_sample["cost_tot_oop_12m"] == 0) &
                 (df_analysis_sample["cost_any_oop_12m"] == 1))
print(f"\nConflict after correction: {conflict_after.sum()} rows")
print('-'*50)
# no other conflict from below stats

# Now fix NA/incomplete combinations:
# Case A: total OOP == 0, "any OOP" missing → set any OOP = 0
mask = ((df_analysis_sample["cost_tot_oop_12m"] == 0) &
        (df_analysis_sample["cost_any_oop_12m"].isna()))
df_analysis_sample.loc[mask, "cost_any_oop_12m"] = 0

# Case B: "any OOP" == 0, total OOP missing → set total OOP = 0
mask = ((df_analysis_sample["cost_any_oop_12m"] == 0) &
        (df_analysis_sample["cost_tot_oop_12m"].isna()))
df_analysis_sample.loc[mask, "cost_tot_oop_12m"] = 0

print("\nAfter cleaning: Any OOP vs total OOP")
print(df_analysis_sample[["cost_any_oop_12m", "cost_tot_oop_12m"]]
      .value_counts(dropna=False))


Before cleaning: Any OOP vs total OOP
cost_any_oop_12m  cost_tot_oop_12m
0.00              0.00                7671
1.00              0.00                 549
                  NaN                  534
                  100.00               297
                  200.00               226
                                      ... 
                  24,825.00              1
                  25,200.00              1
                  19,300.00              1
                  20,120.00              1
                  20,140.00              1
Name: count, Length: 1105, dtype: int64
--------------------------------------------------

Conflict 1 (total=0, any=1): 549 rows

Conflict after correction: 0 rows
--------------------------------------------------

After cleaning: Any OOP vs total OOP
cost_any_oop_12m  cost_tot_oop_12m
0.00              0.00                8255
1.00              NaN                  534
                  100.00               297
                  200.00           

#### OOP Total/Component Costs

In [24]:
oop_cols = [
    "cost_doc_oop_12m",
    "cost_er_oop_12m",
    "cost_rx_oop_12m",
    "cost_oth_oop_12m",
]

# 1) If total is missing but all components are observed → fill total as their sum
tot_na_full_components = (df_analysis_sample["cost_tot_oop_12m"].isna() &
                          df_analysis_sample[oop_cols].notna().all(axis=1))
df_analysis_sample.loc[tot_na_full_components, "cost_tot_oop_12m"] = (
    df_analysis_sample.loc[tot_na_full_components, oop_cols].sum(axis=1))

# 2) If total == 0 and all components are missing → set components to 0
tot_zero_components_na = ((df_analysis_sample["cost_tot_oop_12m"] == 0) &
                           df_analysis_sample[oop_cols].isna().all(axis=1))
df_analysis_sample.loc[tot_zero_components_na, oop_cols] = 0

# 3) If exactly one component is missing and total is observed → back it out as total - sum(other components)
one_missing = (df_analysis_sample["cost_tot_oop_12m"].notna() &
              (df_analysis_sample[oop_cols].isna().sum(axis=1) == 1))

for col in oop_cols:
    fill_this = one_missing & df_analysis_sample[col].isna()
    if fill_this.any():
        other_cols = [c for c in oop_cols if c != col]
        df_analysis_sample.loc[fill_this, col] = (
            df_analysis_sample.loc[fill_this, "cost_tot_oop_12m"] -
            df_analysis_sample.loc[fill_this, other_cols].sum(axis=1))

#### Borrow/Skip

In [25]:
def borrow_fill(cost_any_tot_owe_oop, borrow_col):
    no_financial_hardship = ((df_analysis_sample[cost_any_tot_owe_oop[0]] == 0) &   
                             (df_analysis_sample[cost_any_tot_owe_oop[1]] == 0) &   
                             (df_analysis_sample[cost_any_tot_owe_oop[2]] == 0) &          
                             (df_analysis_sample[cost_any_tot_owe_oop[3]] == 0))   

    print(f"'No financial hardship' pattern holds for "
          f"{no_financial_hardship.sum():,} respondents.")
    
    before_borrow_na = df_analysis_sample[borrow_col].isna().sum()
    print(f"\nMissing cost_borrow before: {before_borrow_na:,}")
    
    borrow_fill0 = (df_analysis_sample[borrow_col].isna() & no_financial_hardship)
    
    df_analysis_sample.loc[borrow_fill0, borrow_col] = 0
    
    after_borrow_na = df_analysis_sample[borrow_col].isna().sum()
    print(f"Filled {borrow_fill0.sum():,} cost_borrow = 0 "
          f"under 'no hardship' pattern.")
    print(f"Missing cost_borrow after:  {after_borrow_na:,}")
    
    print("\ncost_borrow value counts (post-cleaning):")
    print(df_analysis_sample[borrow_col].value_counts(dropna=False))

borrow_fill(["cost_any_owe_12m", "cost_tot_owe_12m", "cost_any_oop_12m", "cost_tot_oop_12m"], "cost_borrow_12m")

'No financial hardship' pattern holds for 4,659 respondents.

Missing cost_borrow before: 222
Filled 27 cost_borrow = 0 under 'no hardship' pattern.
Missing cost_borrow after:  195

cost_borrow value counts (post-cleaning):
cost_borrow_12m
0.00    10852
1.00     5532
NaN       195
Name: count, dtype: int64


### Covariates

#### Race

In [26]:
race_cols = [c for c in df_analysis_sample.columns 
             if c.startswith("race_") and c.endswith("_0m")]
print(f"Race columns found: {race_cols}")

race_sums = df_analysis_sample[race_cols].sum(axis=1)
multiracial_count = (race_sums > 1).sum()
print(f"{multiracial_count} respondents selected more than one race/ethnicity.")

answered_race = (df_analysis_sample[race_cols].sum(axis=1) >= 1)
print(f"{answered_race.sum()} respondents selected at least one race.")

df_analysis_sample.loc[answered_race, race_cols] = (
    df_analysis_sample.loc[answered_race, race_cols].fillna(0))

Race columns found: ['race_hisp_0m', 'race_white_0m', 'race_black_0m', 'race_amerindian_0m', 'race_asian_0m', 'race_pacific_0m', 'race_other_qn_0m']
2235 respondents selected more than one race/ethnicity.
16462 respondents selected at least one race.


#### Employment

In [27]:
# if employ_0m = 0 --> emply_hrs_0m = 1
# if emply_hrs_0m = 2,3,4 --> employ_0m =1
# if employ_hrs = 1 -x-> employ_0m = 0 maybe sick leave

print(df_analysis_sample[['employ_0m', 'employ_hrs_0m']].value_counts(dropna=False))

not_employed = (df_analysis_sample['employ_0m'] == 0) & (df_analysis_sample['employ_hrs_0m'].isna())
df_analysis_sample.loc[not_employed, 'employ_hrs_0m'] = 1.0

high_hours = df_analysis_sample['employ_hrs_0m'].isin([2, 3, 4]) 
df_analysis_sample.loc[high_hours & df_analysis_sample['employ_0m'].isna(), 'employ_0m'] = 1.0

print(df_analysis_sample[['employ_0m', 'employ_hrs_0m']].value_counts(dropna=False))

employ_0m  employ_hrs_0m
0.00       1.00             8326
1.00       4.00             4352
           3.00             1753
           2.00             1474
NaN        1.00              166
1.00       NaN               128
NaN        4.00               98
1.00       1.00               88
NaN        2.00               78
           NaN                73
           3.00               43
Name: count, dtype: int64
employ_0m  employ_hrs_0m
0.00       1.00             8326
1.00       4.00             4450
           3.00             1796
           2.00             1552
NaN        1.00              166
1.00       NaN               128
           1.00               88
NaN        NaN                73
Name: count, dtype: int64


#### Gender

In [28]:
# prioritizing 0m survey over signup card
no_female = (df_analysis_sample['female_0m'].isna()) & (df_analysis_sample['female_list'].notna())
df_analysis_sample.loc[no_female, 'female_0m'] = df_analysis_sample.loc[no_female, 'female_list']

#### Birthday

In [29]:
# prioritizing 0m survey over signup card
no_bd = (df_analysis_sample['birthyear_0m'].isna()) & (df_analysis_sample['birthyear_list'].notna())
df_analysis_sample.loc[no_bd, 'birthyear_0m'] = df_analysis_sample.loc[no_bd, 'birthyear_list']

#### RX Medication - Covariate

In [30]:
# 1) If num_mod > 0, then need must be 1 (trust realized use)
num_pos = df_analysis_sample["rx_num_mod_0m"] > 0
need_fix = num_pos & (
                   df_analysis_sample["need_rx_0m"].isna() | (df_analysis_sample["need_rx_0m"] == 0))
df_analysis_sample.loc[need_fix, "need_rx_0m"] = 1

# 2) If need == 0 and num_mod is missing, set num_mod = 0
need0_num_na = ((df_analysis_sample["need_rx_0m"] == 0) &
                  df_analysis_sample["rx_num_mod_0m"].isna())
df_analysis_sample.loc[need0_num_na, "rx_num_mod_0m"] = 0

# 3) If num_mod == 0 and need is missing, set need = 0
num0_need_na = ((df_analysis_sample["rx_num_mod_0m"] == 0) &
                 df_analysis_sample["need_rx_0m"].isna())
df_analysis_sample.loc[num0_need_na, "need_rx_0m"] = 0

#### Medical Debt - Covariate

In [31]:
medical_debt_fillin(["cost_any_owe_0m", "cost_tot_owe_0m"])

Before cleaning: Debt indicator vs total debt
cost_any_owe_0m  cost_tot_owe_0m
0.00             0.00               6539
1.00             NaN                2197
                 2,000.00            411
                 1,000.00            370
                 300.00              341
                                    ... 
                 2,700,000.00          1
                 1,000,000.00          1
                 710,000.00            1
                 140,000.00            1
                 135,000.00            1
Name: count, Length: 861, dtype: int64
--------------------------------------------------

After cleaning: Debt indicator vs total debt
cost_any_owe_0m  cost_tot_owe_0m
0.00             0.00               6551
1.00             NaN                2197
                 2,000.00            411
                 1,000.00            370
                 300.00              341
                                    ... 
                 205.00                1
              

#### OOP (any/Total) Costs - Covariates

In [32]:
print(df_analysis_sample[["cost_any_oop_0m", "cost_tot_oop_correct_0m"]]
      .value_counts(dropna=False))

# no conflicts. nothing to correct for

cost_any_oop_0m  cost_tot_oop_correct_0m
0.00             0.00                       4827
NaN              NaN                        1315
1.00             300.00                      298
                 100.00                      260
                 200.00                      231
                                            ... 
                 462.00                        1
                 8.80                          1
                 7.77                          1
                 461.00                        1
                 5.05                          1
Name: count, Length: 1659, dtype: int64


#### Borrow/Skip - Covariate

In [33]:
borrow_fill(["cost_any_owe_0m", "cost_tot_owe_0m", "cost_any_oop_0m", "cost_tot_oop_correct_0m"], "cost_borrow_0m")

'No financial hardship' pattern holds for 2,755 respondents.

Missing cost_borrow before: 232
Filled 13 cost_borrow = 0 under 'no hardship' pattern.
Missing cost_borrow after:  219

cost_borrow value counts (post-cleaning):
cost_borrow_0m
0.00    9172
1.00    7188
NaN      219
Name: count, dtype: int64


### New Missing Values Stats

In [34]:
comparison = original_missing.join(missing_report(df_analysis_sample), lsuffix='_Pre', rsuffix='_Post', how='outer')
comparison.fillna(0, inplace=True)
comparison['Fixed_Count'] = (comparison['Missing_Count_Pre'] - comparison['Missing_Count_Post'])
comparison['Fixed_Pct'] = np.where(comparison['Missing_Count_Pre'] > 0,
                                  (comparison['Fixed_Count'] / comparison['Missing_Count_Pre'] * 100).round(1),0)
display(comparison[['Missing_Count_Pre', 'Missing_Count_Post', 'Fixed_Count', 'Fixed_Pct']]
                                  .sort_values('Fixed_Count', ascending=False).astype(int).head())
print("Note: Showing only first 10 cols, numbers (percentages) are rounded")

Total analysis sample size: 16,579
--------------------------------------------------


Unnamed: 0,Missing_Count_Pre,Missing_Count_Post,Fixed_Count,Fixed_Pct
cost_tot_owe_12m,2801,2225,576,20
cost_oth_oop_12m,2403,1862,541,22
birthyear_0m,519,0,519,100
cost_er_oop_12m,2047,1792,255,12
cost_doc_oop_12m,1638,1398,240,14


Note: Showing only first 10 cols, numbers (percentages) are rounded


# Organize and Drop columns Before Imputer

### Organizing Cols

In [35]:
drop_cols = ['returned_12m', 'returned_0m', 'female_list', 'birthyear_list']
             
identification_cols = ['person_id', 'household_id', 'numhh_list', 'Z_lottery', 'W_medicaid',
                       'weight_intensive_12m', 'weight_newlottery_12m','weight_12m', 'ohp_all_mo_firstn_30sep2009']
   

Y_cols = ['cost_any_owe_12m', 'cost_tot_owe_12m', 'cost_borrow_12m', 
          'cost_refused_12m', 'cost_tot_oop_12m', 'cost_any_oop_12m', 
          'hhinc_cat_12m', 'cost_doc_oop_12m', 'cost_er_oop_12m', 
          'cost_rx_oop_12m', 'cost_oth_oop_12m']

X_cols = ['surv_lang_0m', 'needmet_med_0m', 'needmet_rx_0m', 'need_rx_0m', 'need_med_0m',
          'rx_num_mod_0m', 'doc_num_mod_0m', 'er_num_mod_0m', 'hosp_num_mod_0m', 
          'ins_months_0m', 'health_gen_0m', 'baddays_phys_0m', 'baddays_ment_0m', 'health_chg_0m',
          'dia_dx_0m', 'ast_dx_0m', 'hbp_dx_0m', 'emp_dx_0m', 'chf_dx_0m', 'dep_dx_0m', 
          'female_0m', 'birthyear_0m', 'edu_0m', 
          'race_hisp_0m', 'race_white_0m', 'race_black_0m', 'race_amerindian_0m', 'race_asian_0m', 'race_pacific_0m', 'race_other_qn_0m', 
          'employ_0m', 'employ_hrs_0m', 'hhinc_cat_0m', 
          'hhsize_0m', 'num19_0m', 'cost_tot_oop_correct_0m', 'cost_borrow_0m', 'cost_any_owe_0m', 'cost_tot_owe_0m', 'cost_refused_0m', 'cost_any_oop_0m',
          'zip_msa_list']

### Col Sanity Check 

In [36]:
all_listed = set(drop_cols + identification_cols + Y_cols + X_cols)
df_actual = set(df_analysis_sample.columns)
print("--- Column Integrity Report ---")

# Check A: Are any columns in the DF missing from lists?
missing_from_lists = df_actual - all_listed
if missing_from_lists:
    print(f"\n{len(missing_from_lists)} columns in original df but not in lists:")
    print(sorted(list(missing_from_lists)))
else:
    print("\nAll DataFrame columns are accounted for.")

# Check B: Are any columns in lists missing from the DF? 
missing_from_df = all_listed - df_actual
if missing_from_df:
    print(f"\n{len(missing_from_df)} columns in lists are not in original df:")
    print(sorted(list(missing_from_df)))
else:
    print("\nAll list columns exist in the DataFrame.")

# Check C: Overlaps 
all_cols_flat = drop_cols + identification_cols + Y_cols + X_cols
counts = Counter(all_cols_flat)
duplicates = [col for col, count in counts.items() if count > 1]

if duplicates:
    print(f"Found duplicated columns across lists:")
    print(duplicates)
else:
    print("\nNo overlaps between lists.")

--- Column Integrity Report ---

All DataFrame columns are accounted for.

All list columns exist in the DataFrame.

No overlaps between lists.


### Drop Cols

In [37]:
df_analysis_sample = df_analysis_sample.drop(drop_cols, axis = 1)

### Col Renaming

In [38]:
rename_map = {}
# Prefix X variables
for col in X_cols:
    if col in df_analysis_sample.columns:
        rename_map[col] = f"X_{col}"
# Prefix Y variables
for col in Y_cols:
    if col in df_analysis_sample.columns:
        rename_map[col] = f"Y_{col}"

# 2. Execute Renaming
df_analysis_sample.rename(columns=rename_map, inplace=True)

X_cols = [f"X_{c}" for c in X_cols if f"X_{c}" in df_analysis_sample.columns]
Y_cols = [f"Y_{c}" for c in Y_cols if f"Y_{c}" in df_analysis_sample.columns]

# Missing Values: Part 2 (Imputer)

In [39]:
col_schema = {
    'binary' :     ['X_need_med_0m', 'X_needmet_med_0m', 'X_need_rx_0m', 'X_needmet_rx_0m',
                    'X_dia_dx_0m', 'X_ast_dx_0m', 'X_hbp_dx_0m', 'X_emp_dx_0m', 'X_chf_dx_0m',
                    'X_dep_dx_0m', 'X_female_0m', 'X_employ_0m', 'X_zip_msa_list', 
                    'X_race_hisp_0m', 'X_race_white_0m', 'X_race_black_0m', 'X_race_amerindian_0m',
                    'X_race_asian_0m', 'X_race_pacific_0m', 'X_race_other_qn_0m', 
                    'X_cost_borrow_0m', 'X_cost_any_owe_0m', 'X_cost_refused_0m', 'X_cost_any_oop_0m'],
    'count' :      ['X_rx_num_mod_0m', 'X_doc_num_mod_0m', 'X_er_num_mod_0m','X_hosp_num_mod_0m',
                    'X_hhsize_0m', 'X_num19_0m'],
    'ordinal' :    ['X_surv_lang_0m', 'X_health_gen_0m', 'X_health_chg_0m', 'X_birthyear_0m',
                    'X_hhinc_cat_0m', 'X_edu_0m', 'X_employ_hrs_0m',
                    'X_baddays_phys_0m', 'X_baddays_ment_0m', 'X_ins_months_0m'],
    'continuous' : ['X_cost_tot_owe_0m', 'X_cost_tot_oop_correct_0m']
}

all_x_cols = [col for group in col_schema.values() for col in group]

# Identifying cols with missingness
def get_cols_with_missing(df, cols):
    return [c for c in cols if df[c].isna().sum() > 0]

cols_to_impute = get_cols_with_missing(df_analysis_sample, all_x_cols)
print(f"Columns to impute: {len(cols_to_impute)} / {len(all_x_cols)}")

# Creating missingness flags
missing_flag_cols = []
for col in cols_to_impute:
    flag_col = f"{col}_missing"
    df_analysis_sample[flag_col] = df_analysis_sample[col].isna().astype(np.int8)
    missing_flag_cols.append(flag_col)
    print(f"Created flag for {col}: {df_analysis_sample[col].isna().mean():.2%} missing")

print(f"Total missing-flag columns: {len(missing_flag_cols)}")

# Original ranges for ordinal variables
orig_dtypes = df_analysis_sample[all_x_cols].dtypes.to_dict()

orig_minmax = {col: (df_analysis_sample[col].min(skipna=True), df_analysis_sample[col].max(skipna=True))
                   for col in col_schema['ordinal'] if col in cols_to_impute}

# Configuring imputer
ets = ExtraTreesRegressor(
    n_estimators=200,
    max_depth=None,
    min_samples_leaf=4,
    max_features='sqrt',
    random_state=42,
    n_jobs=-1)

imputer = IterativeImputer(
    estimator=ets,
    max_iter=10,
    tol=1e-3,
    initial_strategy='median',
    imputation_order='ascending',
    add_indicator=False,  # Created own flags
    random_state=42,
    verbose=1)

# Impute using full safe sample
clean_x_cols = [c for c in all_x_cols if c not in cols_to_impute]
valid_aux = ['numhh_list']
cols_for_model = clean_x_cols + valid_aux + cols_to_impute
X_full_context = df_analysis_sample[cols_for_model].copy()
imputer.fit(X_full_context)
X_imputed_array = imputer.transform(X_full_context)
X_imputed = pd.DataFrame(
    X_imputed_array,
    columns=cols_for_model,
    index=df_analysis_sample.index)

df_analysis_sample[cols_to_impute] = X_imputed[cols_to_impute]

# Clip continuous variables
for col in col_schema['continuous']:
    if col in cols_to_impute:
        low_cap = df_analysis_sample[col].quantile(0.01)
        high_cap = df_analysis_sample[col].quantile(0.99)
        df_analysis_sample[col] = df_analysis_sample[col].clip(lower=low_cap, upper=high_cap)
        print(f"Clipped {col} to [{low_cap:.2f}, {high_cap:.2f}]")

# Enforce data types
print("\nPost-processing data types...")

# Binary
for col in (c for c in col_schema['binary'] if c in cols_to_impute):
    df_analysis_sample[col] = df_analysis_sample[col].round().clip(0, 1).astype(np.int8)

# Counts
for col in (c for c in col_schema['count'] if c in cols_to_impute):
    df_analysis_sample[col] = df_analysis_sample[col].round().clip(lower=0).astype(np.int16)

# Ordinal: round, clip to original range, cast back to original dtype
for col, (low, high) in orig_minmax.items():
    dtype = orig_dtypes[col]   
    df_analysis_sample[col] = (
        df_analysis_sample[col]
        .round()
        .clip(low, high)
        .astype(dtype))

# Final validation
print("\n=== VALIDATION ===")

# Check remaining missingness
remaining_nans = df_analysis_sample[all_x_cols].isna().sum().sum()
print(f"Remaining missing values: {remaining_nans}")
assert remaining_nans == 0, "Imputation failed!"

# Verify flags are binary
for col in missing_flag_cols:
    if not set(df_analysis_sample[col].unique()).issubset({0, 1}):
        raise ValueError(f"Flag {col} has non-binary values")

# Verify dtypes are correct
print(f"Imputation successful: {df_analysis_sample.shape[0]} rows, "
      f"{len(all_x_cols)} covariates + {len(missing_flag_cols)} flags")

# Final feature set ready for causal estimation
final_features = all_x_cols + missing_flag_cols

Columns to impute: 32 / 42
Created flag for X_need_med_0m: 0.83% missing
Created flag for X_needmet_med_0m: 4.61% missing
Created flag for X_need_rx_0m: 0.51% missing
Created flag for X_needmet_rx_0m: 3.41% missing
Created flag for X_employ_0m: 1.44% missing
Created flag for X_race_hisp_0m: 0.46% missing
Created flag for X_race_white_0m: 0.02% missing
Created flag for X_race_black_0m: 0.02% missing
Created flag for X_race_amerindian_0m: 0.02% missing
Created flag for X_race_asian_0m: 0.02% missing
Created flag for X_race_pacific_0m: 0.02% missing
Created flag for X_race_other_qn_0m: 0.02% missing
Created flag for X_cost_borrow_0m: 1.32% missing
Created flag for X_cost_any_owe_0m: 0.81% missing
Created flag for X_cost_refused_0m: 4.74% missing
Created flag for X_cost_any_oop_0m: 7.93% missing
Created flag for X_rx_num_mod_0m: 7.55% missing
Created flag for X_doc_num_mod_0m: 0.87% missing
Created flag for X_er_num_mod_0m: 0.93% missing
Created flag for X_hosp_num_mod_0m: 0.98% missing
Cr



Clipped X_cost_tot_owe_0m to [0.00, 14079641.95]
Clipped X_cost_tot_oop_correct_0m to [0.00, 82600.21]

Post-processing data types...

=== VALIDATION ===
Remaining missing values: 0
Imputation successful: 16579 rows, 42 covariates + 32 flags


# New Fields

### Creating Catostrophic Income Output

In [40]:
# MAPPING: OHIE Income Categories to Numeric Midpoints (2008 Dollars)

income_map = {
    1: 0,       # "$0"
    2: 1250,    # "$1 to $2,500"
    3: 3750,    # "$2,501 to $5,000"
    4: 6250,    # "$5,001 to $7,500"
    5: 8750,    # "$7,501 to $10,000"
    6: 11250,   # "$10,001 to $12,500"
    7: 13750,   # "$12,501 to $15,000"
    8: 16250,   # "$15,001 to $17,500"
    9: 18750,   # "$17,501 to $20,000"
    10: 21250,  # "$20,001 to $22,500"
    11: 23750,  # "$22,501 to $25,000"
    12: 26250,  # "$25,001 to $27,500"
    13: 28750,  # "$27,501 to $30,000"
    14: 31250,  # "$30,001 to $32,500"
    15: 33750,  # "$32,501 to $35,000"
    16: 36250,  # "$35,001 to $37,500"
    17: 38750,  # "$37,501 to $40,000"
    18: 41250,  # "$40,001 to $42,500"
    19: 43750,  # "$42,501 to $45,000"
    20: 46250,  # "$45,001 to $47,500"
    21: 48750,  # "$47,501 to $50,000"
    22: 60000   # "$50,001 or more" does not matter
}
df_analysis_sample['Y_income_num_12m'] = df_analysis_sample['Y_hhinc_cat_12m'].map(income_map)
print(df_analysis_sample[['Y_hhinc_cat_12m', 'Y_income_num_12m']].value_counts(dropna=False))
Y_cols.append('Y_income_num_12m')

Y_hhinc_cat_12m  Y_income_num_12m
1.00             0.00                2146
2.00             1,250.00            1669
6.00             11,250.00           1438
5.00             8,750.00            1375
7.00             13,750.00           1335
4.00             6,250.00            1074
3.00             3,750.00            1067
8.00             16,250.00            926
9.00             18,750.00            920
NaN              NaN                  776
11.00            23,750.00            741
10.00            21,250.00            725
12.00            26,250.00            539
13.00            28,750.00            431
14.00            31,250.00            327
15.00            33,750.00            229
22.00            60,000.00            213
16.00            36,250.00            175
17.00            38,750.00            148
18.00            41,250.00            113
19.00            43,750.00             75
21.00            48,750.00             73
20.00            46,250.00             64


In [41]:
# DEFINITION: Catastrophic Expenditure = OOP > 30% of Income 
# Handle division by zero: If income is 0, any OOP > 0 is catastrophic.

def calc_catastrophic(row):
    income = row['Y_income_num_12m']
    oop = row['Y_cost_tot_oop_12m']
    
    # Missing data check
    if pd.isna(income) or pd.isna(oop):
        return np.nan
        
    if income == 0:
        return 1 if oop > 0 else 0
    
    return 1 if (oop / income) > 0.30 else 0

df_analysis_sample['Y_catastrophic_exp_12m'] = df_analysis_sample.apply(calc_catastrophic, axis=1)
Y_cols.append('Y_catastrophic_exp_12m')
print(f"Catastrophic Rate: {df_analysis_sample['Y_catastrophic_exp_12m'].mean():.4f}")

Catastrophic Rate: 0.0685


### Creating Age Col - Covariate

In [42]:
df_analysis_sample['X_age_0m'] = 2008 - df_analysis_sample['X_birthyear_0m']
print(f"Age Range: {df_analysis_sample['X_age_0m'].min()} to {df_analysis_sample['X_age_0m'].max()}")
print(df_analysis_sample[['X_age_0m', 'X_birthyear_0m']].value_counts(dropna=False))
df_analysis_sample = df_analysis_sample.drop('X_birthyear_0m', axis = 1)

Age Range: 19.0 to 64.0
X_age_0m  X_birthyear_0m
48.00     1,960.00          565
46.00     1,962.00          550
53.00     1,955.00          529
47.00     1,961.00          507
52.00     1,956.00          500
55.00     1,953.00          495
49.00     1,959.00          495
50.00     1,958.00          492
51.00     1,957.00          477
54.00     1,954.00          460
56.00     1,952.00          443
57.00     1,951.00          433
45.00     1,963.00          424
58.00     1,950.00          412
44.00     1,964.00          397
59.00     1,949.00          386
43.00     1,965.00          382
61.00     1,947.00          366
42.00     1,966.00          360
60.00     1,948.00          356
27.00     1,981.00          355
39.00     1,969.00          352
26.00     1,982.00          348
62.00     1,946.00          341
28.00     1,980.00          340
31.00     1,977.00          340
30.00     1,978.00          339
29.00     1,979.00          338
41.00     1,967.00          336
25.00     1,983.00     

# Save data file

In [43]:
# Save the final, clean dataset to a file
file_name = "../Data_Used/ohie_full_intermediate_dataset.feather"
df_analysis_sample.to_feather(file_name)