In [1]:
# standard python packages
import os, sys
from math import exp, log
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

# from tqdm.notebook import tqdm
from copy import deepcopy
from glob import glob
from tqdm import tqdm
from joblib import Parallel, delayed   
import itertools

from pprint import pprint

# import functions from Toybrains utils
# Absolute path of the parent directory; it is put on `sys.path` (once) so that
# the project-local modules imported right after this can be resolved.
TOYBRAINS_DIR = os.path.abspath('../')
if TOYBRAINS_DIR not in sys.path:
    sys.path.append(TOYBRAINS_DIR)
from create_toybrains import ToyBrainsData
from utils.vizutils import show_contrib_table
from utils.genutils import gen_toybrains_dataset
from utils.genutils_all import *

# Toybrains dataset generation

In [2]:
# ---------------------------------------------------------------------------
# Run configuration: toggles and constants read by the cells further down.
# ---------------------------------------------------------------------------

# -- dataset generation --
GEN_DATASETS = True
N_SAMPLES = 5000            # also appears in the dataset folder name ("toybrains_n{N_SAMPLES}_...")
N_SAMPLES_TEST = 500
TEST_ON_OOD = True
GEN_IMAGES = False
DATASET_SUFFIX = ''
OVERWRITE_EXISTING = True
VERBOSE = 0

# -- visualizations --
VIZ_DAG = False
VIZ_DISTS_BEFORE = False
VIZ_DISTS_AFTER = False
# image samples can only be shown when images are generated
VIZ_IMG_SAMPLES = False and GEN_IMAGES

# -- baselines --
GEN_BASELINES = True
# baseline plots can only be shown when baselines are generated
VIZ_BASELINE = True and GEN_BASELINES

# (model name, model params) pairs to fit as baselines.
# Other settings that can be enabled by adding them to the list:
#   ('LR', {'C':0.0001}), ('LR', {'C':0.0002}), ('LR', {'C':0.0005}),
#   ('LR', {'C':0.001}), ('LR', {'penalty': None}),
#   ('SVM', {}),
BASELINE_MODELS = [
    ('LR', {}),
]

# Metrics computed for every baseline.
# Further candidates: 'roc_auc', 'adjusted_mutual_info_score'
BASELINE_METRICS = [
    'balanced_accuracy',
    'd2',
    'logodds_r2',
    'logodds_mae',
    'logodds_mse',
]

## Debug any pre-generated Toybrains dataset

In [None]:
# Select the (already generated) dataset family to inspect. It may be a
# different configuration than the one generated earlier in the notebook.
basefilename = 'con1_cov-90-cat'
TWEAK_STEPS = 36   # number of config files expected for this family

# Section switches for the debugging cells that follow
TEST_CON_ASSOCIATIONS = False
VIZ_DISTS_AFTER = False
GEN_BASELINES = True
VIZ_BASELINE = True
verbose = 0

# Covariate name = second '_'-separated token of the base filename, with
# dashes turned into underscores ('cov-90-cat' -> 'cov_90_cat').
cov_name = basefilename.split('_')[1].replace('-','_')

# Every config file whose name contains the base filename, in sorted order
config_fnames = glob(f'configs/*{basefilename}*.py')
config_fnames.sort()
assert len(config_fnames)==TWEAK_STEPS, f"Incorrect config files found with the name {basefilename}:\n {config_fnames}"

In [None]:
if TEST_CON_ASSOCIATIONS:
    # For every config of the selected family, check how well the confounder
    # covariate alone predicts (1) the label `lbl_y` and (2) the image
    # attribute that the covariate is configured to influence.

    from sklearn.linear_model import LogisticRegression, LinearRegression
    from sklearn.model_selection import train_test_split

    def _holdout_score(estimator, X, y):
        """Fit `estimator` on a fixed 90% split and return its `.score()` on the other 10%.

        `X` must already be 2-D (n_samples, n_features). The split uses
        `random_state=42`, so repeated calls see the same partition.
        """
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
        estimator.fit(X_train, y_train)
        return estimator.score(X_test, y_test)

    for config_fname in config_fnames:
        dataset_path = f"dataset/toybrains_n{N_SAMPLES}_{os.path.basename(config_fname).replace('.py','')}"
        assert os.path.exists(dataset_path+'/train/'), f"Dataset not found at {dataset_path}"
        # integer parsed from the last '_'-separated token of the file name,
        # after stripping 'cy' and the '.py' extension
        cy = int(os.path.basename(config_fname).split('_')[-1].replace('cy','').replace('.py',''))
        df = pd.read_csv(f'{dataset_path}/train/{os.path.basename(dataset_path)}.csv')
        print(f"Testing the association between {cov_name} and lbl_y for cy={cy}")

        # read out which attribute is influenced by the confounder covariate
        toy_temp = ToyBrainsData(config_fname)
        display(toy_temp.show_current_config(cov_name))

        # Build a 2-D feature matrix from the covariate column.
        if 'cat' in cov_name:
            # categorical covariate -> one-hot encoding
            X = pd.get_dummies(df[cov_name]).values
        else:
            # A pandas Series has no `.reshape`, so convert to numpy before
            # turning the single column into an (n_samples, 1) matrix.
            X = df[cov_name].to_numpy().reshape(-1, 1)

        ## (1) compute the prediction accuracy from the covariate to the label
        print("{}  ---> lbl_y".format(cov_name))

        y = (df['lbl_y'] == 's1').to_numpy()
        acc = _holdout_score(LogisticRegression(), X, y)
        # NOTE: the printed value is the accuracy in percentage points ABOVE 0.5
        # (i.e. above chance only if the two classes are balanced), not the raw accuracy.
        print(f"\t Acc: {(acc-0.5)*100:.0f}%")

        ## (2) compute the prediction accuracy from the covariate to the attribute
        # Keys of the first rule entry of this covariate, minus the label itself;
        # the first remaining key is taken as the influenced attribute column.
        first_rule = next(iter(toy_temp.rules[cov_name].values()))
        attr_candidates = [key for key in first_rule.keys() if key != 'lbl_y']
        if not attr_candidates:
            raise ValueError(
                f"No attribute other than 'lbl_y' found in the rules of {cov_name} in {config_fname}")
        attr_col = attr_candidates[0]
        print("{}  ---> {}".format(cov_name, attr_col))

        y = df[attr_col].values
        # if attr is colors then extract a numerical order from the strings
        # (the integer before the first '-'), scaled by its maximum
        if isinstance(y[0], str):
            get_ints = np.vectorize(lambda x: int(x.split('-')[0]))
            y = get_ints(y)
            y = y/y.max()

        r2 = _holdout_score(LinearRegression(), X, y)
        print(f"\t R2 : {r2*100:.0f}%")



In [None]:
# Pick a single config file to debug: the 4th entry (index 3) of the sorted
# `config_fnames` list.
config_fname = config_fnames[3]
print(f"Rerunning the baseline generation the config file: {config_fname}")

# Locate the matching pre-generated dataset; its folder name is built from the
# sample count and the config file name without the '.py' part.
N_SAMPLES=5000
config_stem = os.path.basename(config_fname).replace('.py','')
dataset_path = f"dataset/toybrains_n{N_SAMPLES}_{config_stem}"
train_dir = dataset_path+'/train/'
assert os.path.exists(train_dir), f"Dataset not found at {dataset_path}"

: 

In [None]:
if VIZ_DISTS_AFTER:
    from utils.vizutils import plot_col_dists, plot_col_counts, show_images

    # Load the training table of the selected dataset; the csv carries the
    # same name as the dataset folder.
    dataset_name = os.path.basename(dataset_path)
    df = pd.read_csv(f'{dataset_path}/train/{dataset_name}.csv')

    # Covariates: the label plus every column whose name starts with 'cov_'
    cov_cols = ['lbl_y'] + [col for col in df.columns if col.startswith('cov_')]

    # Attributes: the label plus every remaining column, except those whose
    # name contains 'ID'
    attr_cols = ['lbl_y']
    for col in df.columns:
        if col in cov_cols or 'ID' in col:
            continue
        attr_cols.append(col)

    # show the image attributes distributions
    plot_col_dists(
        df,
        attr_cols=attr_cols,
        cov_cols=cov_cols,
        title=f"Distributions of the image attributes after sampling",
    )

: 

In [None]:
# Build the ToyBrainsData object for the selected config and display the part
# of its configuration that concerns the label and the chosen covariate.
toy = ToyBrainsData(config=config_fname, verbose=verbose)
config_subset = ['lbl_y', cov_name]
display(toy.show_current_config(subset=config_subset))

# Attach the already generated dataset, then display the DAG drawn by `toy`.
toy.load_generated_dataset(dataset_path)
dag = toy.draw_dag()
display(dag)

: 

In [None]:
## rerun baseline generation
if GEN_BASELINES:
    # Fits every model listed in BASELINE_MODELS on the loaded `toy` dataset,
    # scores it on all holdout (test_*) datasets found on disk, and writes the
    # combined results table to `baseline_results.csv`.
    import re

    trials = 5
    n_jobs = trials   # one parallel job per trial
    baseline_models =  BASELINE_MODELS
    baseline_metrics = BASELINE_METRICS
    if not baseline_models:
        # fail early with a clear message instead of an IndexError after the loop
        raise ValueError("BASELINE_MODELS is empty: there is no baseline model to fit.")

    baselines_file = f"{toy.DATASET_DIR}/../baseline_results.csv"
    if os.path.exists(baselines_file):
        print(f"Baseline results file '{baselines_file}' already exists. Overwriting it.")

    # Discover the holdout datasets once; this does not depend on the model.
    # The '/train/' part of the dataset dir is swapped for a '/test_*' wildcard and
    # any '_n<digits>' in the path is relaxed to '_n*', so test sets are matched
    # regardless of the sample count in their path.
    test_data_glob = toy.DATASET_DIR.replace('/train/', '/test_*')
    test_data_glob = re.sub(r"_n(\d+)", "_n*", test_data_glob)
    # {last path component: full path}; sorted so the order is deterministic
    test_datasets = {data_dir.rstrip('/').split('/')[-1]: data_dir
                     for data_dir in sorted(glob(test_data_glob))}
    if verbose>0: print(f"holdout datasets used for baselining: {list(test_datasets.keys())}")

    results_per_model = []
    for model, model_params in baseline_models:
        if verbose>0: print(f"Estimating ground truth associations using {model}({model_params}) model...")

        contrib_estimator_args =  dict(
                holdout_data=dict(test_datasets),   # fresh copy for every call
                output_labels_prefix=['lbl'],
                model_name=model, model_params=model_params,
                outer_CV=trials, n_jobs=n_jobs,
                metrics=baseline_metrics,
                verbose=verbose)

        # fit using all image attributes together as the input features
        df_results = toy.fit_contrib_estimators(
            input_feature_sets=["attr_all"],
            **contrib_estimator_args)

        results_per_model.append(df_results)

    # one table with the results of all models, saved next to the dataset dir
    df_results_all = pd.concat(results_per_model)
    df_results_all.to_csv(baselines_file, index=False)
    # display(df_results_all)

: 

In [None]:
# df_results_all[['score_holdout_test_all_r2','score_holdout_test_lbl_y_r2', 'score_holdout_test_cov_7_cat2_r2']]

: 

In [None]:
# Plot the decomposition of the baseline scores for one metric.
# NOTE(review): 'r2' is not one of the names in BASELINE_METRICS ('logodds_r2' is);
# confirm that show_scores_decomp resolves this metric name as intended.
metric = 'r2'
show_scores_decomp(df_results_all, center_metric=False, metric=metric)
                

: 