# Verifying Assumption 1 on ACSPublicCoverage Task

In [15]:
# Load all necessary packages
import sys
sys.path.append('..')
import numpy as np
import pandas as pd
import seaborn as sns
import folktables
from folktables import ACSDataSource
import matplotlib.pyplot as plt
from balancers import BinaryBalancer
from tqdm import tqdm
import sklearn.metrics as metrics
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import os
sns.set_theme()

In [16]:
# Necessary functions 
def public_coverage_filter(data):
    """
    Row filter for the public health insurance prediction task.

    Restricts the population to low-income individuals who are not yet
    eligible for Medicare: only rows with ``AGEP`` below 65 and ``PINCP``
    of at most 30000 are kept. The input frame is not modified; the
    surviving rows are returned with their original index labels and order.
    """
    under_65 = data['AGEP'] < 65
    low_income = data['PINCP'] <= 30000
    # Both conditions must hold for a row to be kept.
    return data[under_65 & low_income]

In [17]:
# Set relevant variables
# Directory (relative to the notebook) that the result CSVs are written to.
results_dir = 'results/'

# Two-letter codes of the 50 states followed by Puerto Rico (51 entries; DC is not listed).
state_list = (
    'AL AK AZ AR CA CO CT DE FL GA HI '
    'ID IL IN IA KS KY LA ME MD MA MI '
    'MN MS MO MT NE NV NH NJ NM NY NC '
    'ND OH OK OR PA RI SC SD TN TX UT '
    'VT VA WA WV WI WY PR'
).split()

# Prediction task definition: predict PUBCOV == 1 from the listed features,
# with SEX as the group attribute, on the rows kept by public_coverage_filter.
ACStask = folktables.BasicProblem(
    features=[
        'AGEP',
        'SCHL',
        'MAR',
        'DIS',
        'ESP',
        'CIT',
        'MIG',
        'MIL',
        'ANC',
        'NATIVITY',
        'DEAR',
        'DEYE',
        'DREM',
        'PINCP',
        'ESR',
        'FER',
        'RAC1P',
    ],
    target='PUBCOV',
    # Binary label: True where PUBCOV equals 1.
    target_transform=lambda x: x == 1,
    group='SEX',
    preprocess=public_coverage_filter,
    # Replace missing feature values with the sentinel -1.
    # The previous call, np.nan_to_num(x, -1), passed -1 positionally to the
    # `copy` parameter, so NaNs were silently replaced by 0.0 instead of -1.
    postprocess=lambda x: np.nan_to_num(x, nan=-1),
)

In [18]:
# Checking which states satisfy Assumption 1 (or its TPR/FPR relaxations)
assump = []
tpr_assump = []
fpr_assump = []

# Make sure the output directory exists up front, so the run does not fail
# at the final to_csv after all states have been processed.
os.makedirs(results_dir, exist_ok=True)

# The data source does not depend on the state, so it is built once.
data_source = ACSDataSource(survey_year='2018', horizon='1-Year', survey='person')

for state in tqdm(state_list):
    # Load state data
    acs_data = data_source.get_data(states=[state], download=True)
    x, y, a = ACStask.df_to_numpy(acs_data)
    a = (a == 1)*1  # binarize the group attribute: 1 where the group code is 1, else 0

    # Split data
    x_train, x_test, y_train, y_test, a_train, a_test = train_test_split(
        x, y, a, test_size=0.25, random_state=0)

    # RF classifier for Y (seeded so the resulting table is reproducible)
    y_model = RandomForestClassifier(max_depth=10, random_state=0)
    y_model.fit(x_train, y_train)
    y_hat = y_model.predict(x_test)

    # RF classifier for A (seeded so the resulting table is reproducible)
    a_model = RandomForestClassifier(max_depth=10, random_state=0)
    a_model.fit(x_train, a_train)
    a_hat = a_model.predict(x_test)

    # Check assumption
    try:
        # Create balancer
        true_balancer = BinaryBalancer(y=y_test, y_=y_hat, a=a_test, a_hat=a_hat, adjusted=False)
        # Read all three attributes before appending anything: if one of them
        # raises, the three result lists must not end up with different lengths.
        state_result = (true_balancer.assumption,
                        true_balancer.tpr_assumption,
                        true_balancer.fpr_assumption)
    except Exception as err:
        # Best-effort: keep going with 'N/A' for this state, but report why.
        # (Exception, not a bare except, so Ctrl-C still interrupts the loop.)
        tqdm.write(f'{state}: assumption check failed ({err!r})')
        state_result = ('N/A', 'N/A', 'N/A')

    assump.append(state_result[0])
    tpr_assump.append(state_result[1])
    fpr_assump.append(state_result[2])

# One row per state. Column names are kept exactly as before (including the
# underscore in 'FPR_Assumption') so existing readers of the CSV keep working.
df = pd.DataFrame(data=zip(state_list, assump, tpr_assump, fpr_assump),
                  columns=['State', 'Assumption', 'TPR Assumption', 'FPR_Assumption'])
df.to_csv(os.path.join(results_dir, 'State_A1Verification.csv'), index=0)

100%|██████████| 51/51 [02:01<00:00,  2.39s/it]


In [19]:
# Load particular state data
state = "CA"
data_source = ACSDataSource(survey_year='2018', horizon='1-Year', survey='person')
acs_data = data_source.get_data(states=[state], download=True)
x, y, a = ACStask.df_to_numpy(acs_data)
a = (a == 1)*1  # binarize the group attribute: 1 where the group code is 1, else 0

# Split data (fixed seed: the same 75/25 split on every run)
x_train, x_test, y_train, y_test, a_train, a_test = train_test_split(
    x, y, a, test_size=0.25, random_state=0)

# RF classifier for Y
# random_state is fixed so that y_hat, and every statistic derived from it
# in the following cells, is reproducible across kernel restarts.
y_model = RandomForestClassifier(max_depth=10, random_state=0)
y_model.fit(x_train, y_train)
y_hat = y_model.predict(x_test)

# RF classifier for A (seeded for the same reason)
a_model = RandomForestClassifier(max_depth=10, random_state=0)
a_model.fit(x_train, a_train)
a_hat = a_model.predict(x_test)

In [20]:
# Setting necessary parameters/variables
# Number of bootstrap resamples. This was previously stored in a variable
# named `iter`, which shadowed the builtin iter() for the rest of the session.
n_bootstrap = 1000
num_rows = len(a_test)
# Dedicated, seeded generator so the bootstrap draws are reproducible.
rng = np.random.default_rng(0)

# One entry is appended to each list per bootstrap resample.
alpha_h_11 = []
alpha_h_11_lb = []
alpha_h_01 = []
alpha_h_01_lb = []
alpha_h_10 = []
alpha_h_10_lb = []
alpha_h_00 = []
alpha_h_00_lb = []


for k in range(n_bootstrap):
    # Resample the test set with replacement; the same row indices are used
    # for labels, groups and both sets of predictions so rows stay aligned.
    random_indices = rng.choice(num_rows, size=num_rows, replace=True)
    a_s = a_test[random_indices]
    y_s = y_test[random_indices]
    a_hat_s = a_hat[random_indices]
    y_hat_s = y_hat[random_indices]
    true_balancer = BinaryBalancer(y=y_s, y_=y_hat_s, a=a_s, a_hat=a_hat_s, adjusted=False)

    # Calculating bounds of assumption 1
    # NOTE(review): a_gr_list[1] / a_gr_list[0] are presumably the per-group
    # statistics for group 1 / group 0 -- confirm against BinaryBalancer.
    alpha_h_11.append(true_balancer.a_gr_list[1].tpr)
    alpha_h_11_lb.append(true_balancer.U1/true_balancer.est_base_rates['rh_11'])

    alpha_h_01.append(true_balancer.a_gr_list[0].tpr)
    alpha_h_01_lb.append(true_balancer.U0/true_balancer.est_base_rates['rh_01'])

    alpha_h_10.append(true_balancer.a_gr_list[1].fpr)
    alpha_h_10_lb.append(true_balancer.U1/true_balancer.est_base_rates['rh_10'])

    alpha_h_00.append(true_balancer.a_gr_list[0].fpr)
    alpha_h_00_lb.append(true_balancer.U0/true_balancer.est_base_rates['rh_00'])

    # Lightweight progress report every 200 resamples.
    if k % 200 == 0:
        print(k)

0
200
400
600
800


In [None]:
# Statistics to show assumption holds
# Bootstrap samples keyed by the suffix used in the CSV row labels:
# suffix -> (alpha_h samples, lower-bound samples).
bootstrap_samples = {
    '11': (alpha_h_11, alpha_h_11_lb),
    '01': (alpha_h_01, alpha_h_01_lb),
    '10': (alpha_h_10, alpha_h_10_lb),
    '00': (alpha_h_00, alpha_h_00_lb),
}

rows = []
values = []
for suffix, (alpha_samples, lb_samples) in bootstrap_samples.items():
    lb_arr = np.array(lb_samples)
    # Three rows per suffix: mean alpha_h, mean upper bound (1 - lb), mean lower bound.
    rows += [f'alpha_h_{suffix}', f'alpha_h_{suffix}_ub', f'alpha_h_{suffix}_lb']
    values += [np.mean(np.array(alpha_samples)), np.mean(1 - lb_arr), np.mean(lb_arr)]

# Column header spelling is kept exactly as in the existing output file.
df = pd.DataFrame(data=zip(rows, values), columns=['Paramater', 'Value'])
df.to_csv(os.path.join(results_dir, state + '_assumption_results.csv'), index=0)