# Experiment Results

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

from experiment_result import (
    ExperimentResult,
    load_json_file,
    cumsum_to_differences,
    find_arg,
    TEST_TO_VARIABLE_NAME
)

In [2]:
# Show all rows and columns: None lifts pandas' truncation limit, so frames
# are displayed in full instead of being elided with "...".
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [3]:
# Root folder for experiment outputs; a relative path, so it is resolved
# against the current working directory when used.
result_dir = Path("results")

In [4]:
# Example of an ExperimentResult object: load one result file and print its
# string form.
example_result_path = result_dir / "main_results_json" / "comve_falcon-7b-chat_100_partition.json"
print(ExperimentResult(load_json_file(example_result_path)))

Model: tiiuae/falcon-7b-instruct (torch.float16)
Tests: ['atanasova_counterfactual', 'atanasova_input_from_expl', 'cc_shap-posthoc', 'turpin', 'cc_shap-cot']
Explainer: {'type': 'shap.explainers.Partition()', 'max_evaluations': 500})
Examples: 100
Args: Namespace(c_task='comve', model_name='falcon-7b-chat', number_of_samples=100, explainer_type='partition', max_evaluations=500, classify_pred=False)
Time elapsed: 4:03:02.435313
LLO sim threshold: None


In [5]:
# Gather every experiment result JSON in the results folder and reduce each
# test's per-sample outcomes to one summary row (mean / std / min / max).
experiments = (result_dir / "main_results_json").glob("*.json")

rows = []
for experiment_path in experiments:
    experiment = ExperimentResult(load_json_file(experiment_path))
    args = experiment.args
    short_model_name = find_arg(args, "model_name")
    dataset = find_arg(args, "c_task")
    n_samples = find_arg(args, "number_of_samples")

    # Sanity check: the sample count recorded in the args must equal the
    # number of example names the loaded result exposes.
    n_examples = len(experiment.examples_names())
    assert n_samples == n_examples, "Number of samples from args is different than actual number of samples"

    for test in experiment.tests:
        test_results = experiment.get_variable(TEST_TO_VARIABLE_NAME[test])

        # "atanasova_input_from_expl" keeps a running counter instead of a
        # 0 or 1 per sample, so it goes through cumsum_to_differences first.
        if test == "atanasova_input_from_expl":
            test_results = cumsum_to_differences(test_results)

        rows.append(
            {
                "Model": short_model_name,
                "dataset": dataset,
                "n_samples": n_samples,
                "test": test,
                "mean": np.mean(test_results),
                "std": np.std(test_results),
                "min": np.min(test_results),
                "max": np.max(test_results),
            }
        )

# Build the frame once from the accumulated records.
experimentsresults_dataframe = pd.DataFrame(rows)

In [6]:
# Row styling: tint every cell of a table row with a colour keyed on the
# row's model name.
COLORS = True  # set to False to disable row colouring entirely

MODEL_COLORS = {
    "falcon-7b-chat": "#332288",
    "falcon3-7B-chat": "#A18A6E",
    "llama2-7b-chat": "#44AA99",
    "mistral-7b-chat": "#882255",
    "phi3-medium-chat": "#994F00",
    "phi4": "#0C7BDC"
} if COLORS else {}


def highlight_row(row):
    """Return one CSS string per cell of ``row`` for use with ``Styler.apply``.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series)
        Must support ``row['Model']`` and ``len(row)``.

    Returns
    -------
    list of str
        ``len(row)`` copies of ``"background-color: <hex>"`` when the row's
        model has an entry in ``MODEL_COLORS``; otherwise ``len(row)`` empty
        strings, so no style is applied.
    """
    color = MODEL_COLORS.get(row["Model"])
    # An empty string means "no style"; emitting "background-color: " with no
    # value would be an invalid CSS declaration.
    style = f"background-color: {color}" if color else ""
    return [style] * len(row)

In [7]:
# Order the summary by dataset, then test, then model, and style it row by
# row (axis=1) with highlight_row before displaying.
sorted_results = experimentsresults_dataframe.sort_values(by=["dataset", "test", "Model"])
display(sorted_results.style.apply(highlight_row, axis=1))

Unnamed: 0,Model,dataset,n_samples,test,mean,std,min,max
60,falcon-7b-chat,comve,100,atanasova_counterfactual,0.23,0.420833,0.0,1.0
25,falcon3-7B-chat,comve,100,atanasova_counterfactual,0.85,0.357071,0.0,1.0
40,llama2-7b-chat,comve,100,atanasova_counterfactual,0.86,0.346987,0.0,1.0
55,mistral-7b-chat,comve,100,atanasova_counterfactual,0.75,0.433013,0.0,1.0
65,phi3-medium-chat,comve,100,atanasova_counterfactual,0.95,0.217945,0.0,1.0
20,phi4,comve,100,atanasova_counterfactual,0.94,0.237487,0.0,1.0
61,falcon-7b-chat,comve,100,atanasova_input_from_expl,0.0,0.0,0.0,0.0
26,falcon3-7B-chat,comve,100,atanasova_input_from_expl,0.49,0.4999,0.0,1.0
41,llama2-7b-chat,comve,100,atanasova_input_from_expl,0.23,0.420833,0.0,1.0
56,mistral-7b-chat,comve,100,atanasova_input_from_expl,0.48,0.4996,0.0,1.0
