In [None]:
import sys
sys.path.append('..')
from datetime import datetime
from tqdm import tqdm

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from data.linear.binary import BinaryLinearData
from data.linear.continuous import GaussianLinearData

from algorithm.general import GConfounderTest, PearsonConfounderTest
from algorithm.baseline import GEnvironmentTest, PearsonEnvironmentTest

from experiment.utils import run
from experiment.plot import set_mpl_default_settings, marker_dict, name_dict

# Call the plotting-settings helper imported from experiment.plot before any
# figure is created (presumably sets global matplotlib defaults -- confirm
# in experiment/plot.py).
set_mpl_default_settings()

# Experiment: Comparing our procedure to the $Y \perp E \mid T$ baseline test

In [None]:
# When False the experiments are executed from scratch; when True the
# results are read back from previously saved CSV files instead.
load = False

# --- Data generator and the tests that are compared ---
simulate_class = BinaryLinearData
test_method_list = [GConfounderTest(), GEnvironmentTest()]

# --- Experiment parameters ---
nbr_env = [500]
nbr_samples = [100]
repetitions = 50
sign_level = 0.05

# --- Fixed dataset ---
conf_strength = [0, 10]

# One parameter set per value of the 'b' entry of Y, swept over 10 evenly
# spaced values in [0, 0.25]; the X and T entries are identical in every set.
dist_param_list = []
for k in np.linspace(0, 0.25, 10):
    dist_param_list.append({
        'X': {'a': 0.0, 'b': 1.0},
        'Y': {'a': 0.0, 'b': k},
        'T': {'a': 0.0, 'b': 1.0},
    })

# Timestamp (month, day, hour, minute) identifying this experiment's output files
now = datetime.now()
timestamp = now.strftime("%m%d%H%M")
print('Timestamp:', timestamp)

In [None]:
nbr_proc = 4


def experiment_vary_dist(test_method, path):
    """
    Run one test method once per entry of ``dist_param_list``.

    For every distribution setting the experiment arguments are assembled
    from the notebook-level configuration and handed to ``run``; ``path`` is
    forwarded unchanged as its ``save_during_run`` argument. Whatever ``run``
    returns is concatenated with ``pd.concat``, and the per-setting frames
    are concatenated once more into the final result.

    Parameters
    ----------
    test_method
        Test object placed in the argument list given to ``run``.
    path
        Value passed to ``run`` as ``save_during_run``.

    Returns
    -------
    pandas.DataFrame
        Concatenation of the results over all distribution settings.
    """

    def _args_for(dist_param):
        # Argument order is the one expected by `run`; only the first entry
        # changes between settings.
        return [
            dist_param, nbr_env, nbr_samples, conf_strength,
            simulate_class, test_method, repetitions, sign_level,
        ]

    per_setting_frames = [
        pd.concat(run(_args_for(dist_param), save_during_run=path))
        for dist_param in dist_param_list
    ]
    return pd.concat(per_setting_frames)


In [None]:
if not load:

    # Run every configured test method over all distribution settings.
    # Results are keyed by the test's class name, which is also used in the
    # CSV path handed to the experiment runner.
    experiment_results = {}
    for alg in tqdm(test_method_list):
        alg_name = type(alg).__name__
        res = experiment_vary_dist(alg, f'results/comparison_{alg_name}_{timestamp}.csv')
        experiment_results[alg_name] = res

else:

    # Load data from an earlier run identified by its timestamp
    timestamp_str = "11151428"
    test_method_list = ['PearsonConfounderTest', 'PearsonEnvironmentTest']
    # Keep the timestamp as a string: the "%m%d%H%M" format starts with "0"
    # for January-September, and int() would silently drop that digit so the
    # figure file names would no longer match the loaded run.
    timestamp = timestamp_str

    experiment_results = {}
    for alg in test_method_list:
        path = f'results/comparison_{alg}_{timestamp_str}.csv'
        df = pd.read_csv(path)
        experiment_results[alg] = df

# Plot results

In [None]:
def plot_experiment(confounder_strength : float):
    '''
    Plot the rejection rate of every algorithm against ``Y_b`` for one
    confounder strength and save the figure as a PDF.

    For each entry of the notebook-level ``experiment_results`` dict, the rows
    whose ``confounder_strength`` column equals the argument (within 1e-3) are
    drawn as one curve, together with a band of plus/minus one binomial
    standard error, ``sqrt(p * (1 - p) / repetitions)``. Drawing happens on
    the current matplotlib figure.

    When ``confounder_strength`` is 0 the y-axis is labelled as a false
    detection rate, a dashed line marks the significance level ``sign_level``
    and the legend is shown; otherwise the y-axis is labelled as a detection
    rate and no legend is drawn.

    Parameters
    ----------
    confounder_strength : float
        Value of the ``confounder_strength`` column to select.
    '''
    import os  # local import: only needed to create the output folder

    no_confounding = confounder_strength == 0

    for alg in experiment_results:
        tmp_df = experiment_results[alg]
        # Tolerance-based match so float round-off (e.g. after a CSV round
        # trip) does not drop rows.
        tmp_df = tmp_df[(np.abs(tmp_df.confounder_strength - confounder_strength) < 1e-3)]

        p = tmp_df.reject_rate
        plt.plot(tmp_df.Y_b, p, label=name_dict[alg], marker=marker_dict[alg])
        # Standard error of a proportion estimated from `repetitions` runs
        std = np.sqrt(p*(1-p)/repetitions)
        plt.fill_between(tmp_df.Y_b, p-std, p+std, alpha=0.5)

    if no_confounding:
        plt.ylabel('False detection rate')
    else:
        plt.ylabel('Detection rate')
    plt.xlabel('$\\sigma_{\\theta_Y}^2$')
    plt.ylim([-.1,1.1])

    if no_confounding:
        # Reference line at the configured significance level rather than a
        # hard-coded 0.05, so the plot follows the experiment configuration.
        plt.axhline(y=sign_level, color='black', linestyle='--')
        plt.legend()

    # savefig does not create missing folders, so make sure the target exists.
    figure_dir = 'results/figures'
    os.makedirs(figure_dir, exist_ok=True)
    path = f'{figure_dir}/comparison_cs{confounder_strength}_{timestamp}.pdf'
    plt.savefig(path, format='pdf', bbox_inches='tight')

In [None]:
# Draw (and save) one separate figure per configured confounder strength.
for strength in conf_strength:
    plt.figure()
    plot_experiment(strength)