In [77]:
import datetime
import json
from pathlib import Path
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [62]:
def load_json_file(file_path):
    """
    Load a JSON file and return its parsed contents.

    The file is always decoded as UTF-8 instead of the platform's locale
    encoding, so a result file containing non-ASCII text loads the same way
    on every operating system.

    :param file_path: Path to the JSON file (anything ``open`` accepts).
    :return: Parsed JSON content; a dictionary when the top-level JSON value
        is an object.
    :raises FileNotFoundError: If ``file_path`` does not exist.
    :raises json.JSONDecodeError: If the file content is not valid JSON.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            return json.load(file)
    except FileNotFoundError:
        # Report the problem for the notebook reader, then let the caller decide.
        print(f"Error: The file '{file_path}' does not exist.")
        raise
    except json.JSONDecodeError as e:
        print(f"Error: Failed to decode JSON from file '{file_path}'.")
        print(f"Details: {e}")
        raise


In [63]:
class ExperimentResult:
    """Read-only view over the parsed JSON of a single experiment run.

    The wrapped mapping must provide the keys read by the properties below:
    ``"args"``, ``"model"``, ``"samples"``, ``"explainer"``, ``"tests"`` and
    ``"time_elapsed"``.
    """

    # Layout "[D day[s], ]H:MM:SS[.ffffff]" -- the form str(datetime.timedelta)
    # produces. NOTE(review): presumably that is how the value was written;
    # confirm against the code that generates the result files.
    _ELAPSED_PATTERN = re.compile(
        r"^\s*(?:(?P<days>-?\d+) days?, )?"
        r"(?P<hours>\d+):(?P<minutes>\d{1,2}):(?P<seconds>\d{1,2})"
        r"(?:\.(?P<fraction>\d{1,6}))?\s*$"
    )

    def __init__(self, results_json):
        """Wrap ``results_json`` (the parsed result file) without copying it."""
        self.results_json = results_json

    @property
    def args(self):
        """Value stored under ``"args"``."""
        return self.results_json["args"]

    @property
    def model(self) -> dict:
        """Model description; ``__repr__`` reads ``full_model_name`` and ``dtype`` from it."""
        return self.results_json["model"]

    @property
    def examples(self) -> dict:
        """Per-example results, stored under the ``"samples"`` key."""
        return self.results_json["samples"]

    @property
    def explainer(self) -> str:
        """Value stored under ``"explainer"``."""
        # NOTE(review): the captured notebook output prints a mapping-like
        # value here; confirm whether the ``str`` annotation is accurate.
        return self.results_json["explainer"]

    @property
    def tests(self) -> list[str]:
        """Names of the tests listed under ``"tests"``."""
        return self.results_json["tests"]

    @property
    def time_elapsed(self) -> datetime.timedelta:
        """Run duration parsed from the ``"time_elapsed"`` string.

        Accepts ``H:MM:SS`` with an optional fractional part and an optional
        leading ``"D day(s), "`` prefix, so durations of 24 hours or more and
        durations with zero microseconds parse as well.

        :raises ValueError: If the stored string does not match that layout.
        """
        raw = self.results_json["time_elapsed"]
        match = self._ELAPSED_PATTERN.match(raw)
        if match is None:
            raise ValueError(f"Unrecognised time_elapsed value: {raw!r}")

        # Right-pad the fraction so that ".5" means 500000 microseconds.
        fraction = match["fraction"]
        microseconds = int(fraction.ljust(6, "0")) if fraction else 0
        return datetime.timedelta(
            days=int(match["days"] or 0),
            hours=int(match["hours"]),
            minutes=int(match["minutes"]),
            seconds=int(match["seconds"]),
            microseconds=microseconds,
        )

    def __repr__(self):
        """Multi-line, human-readable summary of the run."""
        # Single quotes inside the replacement fields keep this valid on
        # Python versions older than 3.12.
        lines = (
            f"Model: {self.model['full_model_name']} ({self.model['dtype']})",
            f"Tests: {self.tests}",
            f"Explainer: {self.explainer}",
            f"Examples: {len(self.examples)}",
            f"Args: {self.args}",
            f"Time elapsed: {self.time_elapsed}",
        )
        return "\n".join(lines)

    def examples_names(self) -> list[str]:
        """Return the example names (the keys of ``examples``) as a list."""
        return list(self.examples.keys())

    def get_example(self, example_name: str) -> dict:
        """Return the stored result for the example called ``example_name``."""
        return self.examples[example_name]

    def get_variable(self, variable):
        """Collect ``variable`` from every example as a float array.

        :param variable: Key looked up in each example's result.
        :return: ``numpy.ndarray`` of floats, one entry per example, in the
            order given by ``examples_names``.
        """
        return np.array([
            float(self.get_example(example_name)[variable])
            for example_name in self.examples_names()
        ])

    def describe(self, variable):
        """Print the mean, minimum, maximum and standard deviation of ``variable``."""
        variable_values = self.get_variable(variable)

        print("Mean: ", variable_values.mean())
        print("Min: ", variable_values.min())
        print("Max: ", variable_values.max())
        print("Std dev: ", variable_values.std())

    def mean(self, variable):
        """Return the mean of ``variable`` over all examples."""
        return self.get_variable(variable).mean()

    def boxplot(self, variable, xlim=(-1.0, 1.0)):
        """Show a horizontal box plot of ``variable`` over all examples.

        :param variable: Key looked up in each example's result.
        :param xlim: ``(low, high)`` limits of the value axis. Pass ``None``
            to let matplotlib autoscale, e.g. for variables whose values fall
            outside ``[-1, 1]``.
        """
        values = self.get_variable(variable)

        plt.boxplot(values, orientation="horizontal")
        if xlim is not None:
            plt.xlim(xlim)
        plt.xlabel(variable)
        plt.show()

In [64]:
class ExperimentResults:
    """Collection of experiment runs, each backed by a JSON result file."""

    def __init__(self, result_files: list[Path]):
        """Remember the result files; nothing is read until it is needed."""
        self.result_files = result_files

    def load(self):
        """Lazily yield one ``ExperimentResult`` per file, in file order."""
        yield from (
            ExperimentResult(load_json_file(path))
            for path in self.result_files
        )

    def compare(self, variable, metric):
        """Apply ``metric`` to ``variable`` for every result file.

        :param variable: Key looked up in each example of each run.
        :param metric: Callable that receives the array returned by
            ``ExperimentResult.get_variable`` and reduces it to one value.
        :return: List with one metric value per result file, in file order.
        """
        return [
            metric(ExperimentResult(load_json_file(path)).get_variable(variable))
            for path in self.result_files
        ]

In [65]:
# Directory that holds one JSON result file per run.
result_dir = Path("results_json")

# Dataset identifiers as they appear in the result file names.
COMVE = "comve"
ESNLI = "esnli"
DQA = "disambiguation_qa"

# Model identifiers as they appear in the result file names.
LLAMA2 = "llama2-7b-chat"
FALCON = "falcon-7b-chat"
FALCON3 = "falcon3-7B-chat"

EXPLAINER = "partition"

# Numeric component of every result file name.
# NOTE(review): presumably the number of samples per run -- confirm.
N_SAMPLES = 100


def result_path(dataset: str, model: str, n_samples: int = N_SAMPLES, explainer: str = EXPLAINER) -> Path:
    """Return the result file path for one (dataset, model) run.

    File names follow ``<dataset>_<model>_<n_samples>_<explainer>.json``
    inside ``result_dir``.
    """
    return result_dir / f"{dataset}_{model}_{n_samples}_{explainer}.json"


comve_llama2 = result_path(COMVE, LLAMA2)
esnli_llama2 = result_path(ESNLI, LLAMA2)
dqa_llama2 = result_path(DQA, LLAMA2)

comve_falcon = result_path(COMVE, FALCON)
esnli_falcon = result_path(ESNLI, FALCON)
dqa_falcon = result_path(DQA, FALCON)

comve_falcon3 = result_path(COMVE, FALCON3)

In [149]:
def cumsum_to_differences(cumsum_array):
    """Transform a cumulative array into an array of successive differences.

    The first element is kept unchanged; every later element becomes its
    difference from the element before it (taken along the first axis).

    :param cumsum_array: Array-like of running totals.
    :return: New ``numpy.ndarray`` with the same shape and dtype as the
        input converted to an array; the input itself is not modified.
    """
    totals = np.asarray(cumsum_array)
    differences = np.array(totals)  # independent copy; keeps element 0 as is
    # One vectorised subtraction replaces the per-element Python loop.
    differences[1:] = totals[1:] - totals[:-1]
    return differences

# Sanity check on one run: the stored "atanasova_input_from_expl" values form
# a running total, so print them next to their per-example differences.
comve_llama2_result = ExperimentResult(load_json_file(comve_llama2))
in_expl_cumsum = comve_llama2_result.get_variable("atanasova_input_from_expl")

print("Cumulative:", in_expl_cumsum, "\n")
print("Differences:", cumsum_to_differences(in_expl_cumsum))


Cumulative: [ 1.  2.  2.  3.  3.  3.  3.  3.  3.  3.  3.  3.  3.  3.  3.  3.  3.  4.
  5.  5.  5.  6.  6.  6.  6.  6.  7.  7.  7.  7.  7.  7.  8.  8.  9.  9.
  9.  9. 10. 11. 11. 11. 11. 11. 12. 12. 12. 12. 13. 14. 14. 14. 14. 14.
 14. 15. 15. 15. 15. 16. 16. 16. 16. 17. 17. 17. 18. 18. 18. 18. 18. 18.
 19. 19. 19. 19. 19. 19. 19. 19. 19. 20. 21. 21. 21. 21. 21. 21. 22. 22.
 22. 22. 22. 22. 23. 23. 23. 23. 23. 23.] 

Differences: [1. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 1. 0. 0.
 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 1. 1. 0. 0. 0. 0. 1. 0. 0. 0.
 1. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0.
 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0.
 0. 0. 0. 0.]


In [66]:
# Print the textual summary of the Falcon3 run on the ComVE dataset.
comve_falcon3_result = ExperimentResult(load_json_file(comve_falcon3))
print(comve_falcon3_result)

Model: tiiuae/Falcon3-7B-Instruct (torch.float16)
Tests: ['atanasova_counterfactual', 'atanasova_input_from_expl', 'cc_shap-posthoc', 'turpin', 'cc_shap-cot']
Explainer: {'type': 'shap.explainers.Partition()', 'max_evaluations': 500})
Examples: 100
Args: Namespace(c_task='comve', model_name='falcon3-7B-chat', number_of_samples=100, explainer_type='partition', max_evaluations=500, classify_pred=False)
Time elapsed: 4:59:40.848789


In [152]:
def find_arg(input_str, arg_name) -> str | None:
    pattern = rf"{arg_name}=(?:'([^']*)'|(\d+))"
    match = re.search(pattern, input_str)
    if match:
        return match.group(1) or int(match.group(2))
    return None

# Result files to summarise, in the order their rows appear in the table.
experiments = [
    comve_llama2,
    esnli_llama2,
    dqa_llama2,
    comve_falcon,
    esnli_falcon,
    dqa_falcon,
    comve_falcon3,
]

# Maps each test name (as listed under "tests") to the key that holds its
# value inside every example.
TEST_TO_VARIABLE_NAME = {
    "atanasova_counterfactual": "atanasova_counterfact",
    "atanasova_input_from_expl": "atanasova_input_from_expl",
    "cc_shap-posthoc": "cc_shap-posthoc",
    "turpin": "turpin",
    "cc_shap-cot": "cc_shap-cot",
}

# One row of summary statistics per (result file, test) pair.
rows = []
for result_path_ in experiments:
    result = ExperimentResult(load_json_file(result_path_))

    # The run arguments are stored as text, so individual values are
    # recovered with find_arg.
    run_args = result.args
    short_model_name = find_arg(run_args, "model_name")
    dataset = find_arg(run_args, "c_task")
    n_samples = find_arg(run_args, "number_of_samples")

    assert n_samples == len(result.examples_names()), "Number of samples from args is different than actual number of samples"

    for test in result.tests:
        scores = result.get_variable(TEST_TO_VARIABLE_NAME[test])

        # "atanasova_input_from_expl" is stored as a running counter rather
        # than a 0/1 value per sample, so convert it back first.
        if test == "atanasova_input_from_expl":
            scores = cumsum_to_differences(scores)

        rows.append({
            "Model": short_model_name,
            "dataset": dataset,
            "n_samples": n_samples,
            "test": test,
            "mean": np.mean(scores),
            "std": np.std(scores),
            "min": np.min(scores),
            "max": np.max(scores),
        })

experimentsresults_dataframe = pd.DataFrame(rows)

In [154]:
# Render the summary table as rich notebook output. `display` is not imported
# in this notebook; it relies on the name IPython provides in notebook sessions.
display(experimentsresults_dataframe)

Unnamed: 0,Model,dataset,n_samples,test,mean,std,min,max
0,llama2-7b-chat,comve,100,atanasova_counterfactual,0.86,0.346987,0.0,1.0
1,llama2-7b-chat,comve,100,atanasova_input_from_expl,0.23,0.420833,0.0,1.0
2,llama2-7b-chat,comve,100,cc_shap-posthoc,-0.0243,0.105245,-0.24,0.37
3,llama2-7b-chat,comve,100,turpin,0.6,0.489898,0.0,1.0
4,llama2-7b-chat,comve,100,cc_shap-cot,-0.1027,0.10691,-0.35,0.33
5,llama2-7b-chat,esnli,100,atanasova_counterfactual,0.52,0.4996,0.0,1.0
6,llama2-7b-chat,esnli,100,atanasova_input_from_expl,0.0,0.0,0.0,0.0
7,llama2-7b-chat,esnli,100,cc_shap-posthoc,0.1241,0.126175,-0.17,0.39
8,llama2-7b-chat,esnli,100,turpin,0.31,0.462493,0.0,1.0
9,llama2-7b-chat,esnli,100,cc_shap-cot,0.0812,0.121106,-0.24,0.34
