# Verifying Assumption 1 on the CheXpert Task

In [1]:
# Load all necessary packages
import sys
sys.path.append('..')
import numpy as np
import pandas as pd
from balancers import BinaryBalancer
from utils import ROC
import os

In [2]:
# Load data
# Reads the per-sample results table and extracts the four columns used by
# the experiment: 'Sex' and 'a_hat' (passed to BinaryBalancer as `a` and
# `a_hat`), 'grouped_condition' (the label `y`) and 'y_prob' (the score that
# is thresholded into `y_hat`).
results_dir = 'results/'
data = pd.read_csv('data/results_grouped_condition.csv', index_col=[0])
a_total = np.array(data['Sex'])
y_total = np.array(data['grouped_condition'])
a_hat_total = np.array(data['a_hat'])
y_prob_total = np.array(data['y_prob'])

# TEST_SIZE rows are kept for the test set; all remaining rows form the
# validation set that is used only to pick the decision threshold.
TEST_SIZE = 100000
num_total = len(a_total)
val_num = num_total - TEST_SIZE
if val_num <= 0:
    raise ValueError(
        'Need more than %d rows to build a validation split, got %d'
        % (TEST_SIZE, num_total)
    )

# Sample the validation rows from the WHOLE table. Drawing val_num indices
# without replacement from range(val_num) would merely permute the first
# val_num rows, so the split would not be random at all and the test set
# would always be the last TEST_SIZE rows of the file.
# NOTE(review): no seed is set in the visible cells, so the split differs
# between runs; seed NumPy beforehand if reproducibility is required.
val_indices = np.random.choice(num_total, size=val_num, replace=False)
y_val = y_total[val_indices]
y_prob_val = y_prob_total[val_indices]

# Determine optimal threshold to get y_hat: the threshold whose
# (tprs - fprs) gap on the validation set is largest.
thresholds, fprs, tprs = ROC(y_val, y_prob_val)
opt_t = thresholds[np.argmax(np.array(tprs) - np.array(fprs))]

# Create test set: every row that was not drawn for validation.
a = np.delete(a_total, val_indices)
y = np.delete(y_total, val_indices)
a_hat = np.delete(a_hat_total, val_indices)
y_prob = np.delete(y_prob_total, val_indices)
y_hat = y_prob >= opt_t

In [3]:
# Run experiment
# Draws K bootstrap resamples of the test set (same size, with replacement),
# fits a BinaryBalancer on each one and records, per resample, the group-wise
# TPR/FPR together with the corresponding U / estimated-base-rate ratio.
K = 1000
num_rows = len(a)

# Naming: alpha_h_<g><y> holds a_gr_list[g].tpr (y = 1) or .fpr (y = 0);
# the matching *_lb list holds U<g> / est_base_rates['rh_<g><y>'].
alpha_h_11 = []
alpha_h_11_lb = []
alpha_h_01 = []
alpha_h_01_lb = []
alpha_h_10 = []
alpha_h_10_lb = []
alpha_h_00 = []
alpha_h_00_lb = []


for k in range(K):
    random_indices = np.random.choice(num_rows, size=num_rows, replace=True)
    a_s = a[random_indices]
    y_s = y[random_indices]
    a_hat_s = a_hat[random_indices]
    y_hat_s = y_hat[random_indices]
    true_balancer = BinaryBalancer(y=y_s, y_=y_hat_s, a=a_s, a_hat=a_hat_s, adjusted=False)

    # The balancer is deliberately left unadjusted. An earlier variant
    # re-fitted it on `true_balancer.y_adj` (via
    # `adjust(con='tpr/fpr', obj='none')`) whenever
    # `true_balancer.assumption == 0`; that step is disabled so the recorded
    # rates describe the raw thresholded predictions.

    # Show assumption holds
    group_1 = true_balancer.a_gr_list[1]
    group_0 = true_balancer.a_gr_list[0]
    base_rates = true_balancer.est_base_rates

    alpha_h_11.append(group_1.tpr)
    alpha_h_11_lb.append(true_balancer.U1 / base_rates['rh_11'])

    alpha_h_01.append(group_0.tpr)
    alpha_h_01_lb.append(true_balancer.U0 / base_rates['rh_01'])

    alpha_h_10.append(group_1.fpr)
    alpha_h_10_lb.append(true_balancer.U1 / base_rates['rh_10'])

    alpha_h_00.append(group_0.fpr)
    alpha_h_00_lb.append(true_balancer.U0 / base_rates['rh_00'])

    # Progress indicator every 200 resamples.
    if k % 200 == 0:
        print(k)

0
200
400
600
800


In [None]:
# Statistics to show assumption holds
# For every (group, outcome) pair, report the bootstrap mean of the observed
# rate, of the upper bound (1 - lower bound) and of the lower bound, in that
# order, then save the table as CSV.
series = [
    ('alpha_h_11', alpha_h_11, alpha_h_11_lb),
    ('alpha_h_01', alpha_h_01, alpha_h_01_lb),
    ('alpha_h_10', alpha_h_10, alpha_h_10_lb),
    ('alpha_h_00', alpha_h_00, alpha_h_00_lb),
]

rows = []
values = []
for name, observed, lower_bound in series:
    lower_bound = np.array(lower_bound)
    rows.extend([name, name + '_ub', name + '_lb'])
    values.extend([
        np.mean(np.array(observed)),
        np.mean(1 - lower_bound),
        np.mean(lower_bound),
    ])

# NOTE: the 'Paramater' spelling is kept as-is because it is the header of the
# emitted CSV and existing readers of that file may depend on it.
df = pd.DataFrame(data=list(zip(rows, values)), columns=['Paramater', 'Value'])

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(results_dir, exist_ok=True)
df.to_csv(os.path.join(results_dir, 'assumption_results.csv'), index=False)