In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import pairwise_distances
import matplotlib.pyplot as plt
from kneed import KneeLocator
import os

def plot_kth_neighbor_distances_elbow(file_path, k, sample=0):
    df = pd.read_csv(file_path)
    
    df = df.iloc[:, :-1]

    if sample != 0:
        sampled_df = df.sample(n=sample, random_state=42)
        dist_matrix = pairwise_distances(sampled_df.values, metric='euclidean')
    else:
        dist_matrix = pairwise_distances(df.values, metric='euclidean')
    
    np.fill_diagonal(dist_matrix, np.inf)
    
    kth_distances = np.partition(dist_matrix, kth=k, axis=1)[:, k]
    
    sorted_distances = np.sort(kth_distances)

    base_filename = os.path.splitext(os.path.basename(file_path))[0]
    
    x = range(len(sorted_distances))
    y = sorted_distances
    knee = KneeLocator(x, y, curve="convex", direction="increasing")
    
    plt.figure(figsize=(8, 5))
    plt.plot(x, y, marker='.', label='k-th neighbor distances')
    
    if knee.knee is not None:
        plt.axvline(knee.knee, color='r', linestyle='--', label=f'Elbow at distance {y[knee.knee]:.4f}')
        plt.scatter(knee.knee, y[knee.knee], color='red', zorder=5)
        print(f"Elbow detected at sorted index: {knee.knee}, distance: {y[knee.knee]:.4f}")
    else:
        print("No clear elbow detected.")

    plt.title(f"Sorted L2 Distance to {k}-th Nearest neighbor — {base_filename}")
    plt.xlabel("Sorted Observation Index")
    plt.ylabel("Distance")
    plt.legend()
    plt.grid(True)
    plt.show()


In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import pairwise_distances
import plotly.graph_objects as go

def plot_kth_neighbor_distances_interactive(file_path, k, sample=0):
    df = pd.read_csv(file_path)
    
    df = df.iloc[:, :-1]
    
    if sample != 0:
        sampled_df = df.sample(n=sample, random_state=42)
        dist_matrix = pairwise_distances(sampled_df.values, metric='euclidean')
    else:
        dist_matrix = pairwise_distances(df.values, metric='euclidean')
    
    np.fill_diagonal(dist_matrix, np.inf)
    
    kth_distances = np.partition(dist_matrix, kth=k, axis=1)[:, k]
    
    sorted_distances = np.sort(kth_distances)
    
    fig = go.Figure(data=go.Scatter(
        x=list(range(len(sorted_distances))),
        y=sorted_distances,
        mode='lines+markers',
        marker=dict(size=4),
        hovertemplate='Index: %{x}<br>Distance: %{y:.4f}<extra></extra>',
        name=f'Distance to {k}-th neighbor'
    ))

    fig.update_layout(
        title=f'L2 Distance to {k}-th Nearest Neighbor (Sorted)',
        xaxis_title='Sorted Observation Index',
        yaxis_title='Distance',
        hovermode='closest',
        template='plotly_white'
    )

    fig.show()


In [3]:
import pandas as pd
import os
import glob
from datetime import datetime

def get_files_after(reference_folder_name, batch_size=None):
    base_dir = r"C:\Users\wikto\PycharmProjects\ce-robustness\framework\experiments\just_base\results"

    ref_time = datetime.strptime(
        reference_folder_name.split("_")[0] + "_" + reference_folder_name.split("_")[1],
        "%m-%d_%H-%M-%S"
    )

    subdirs = [name for name in os.listdir(base_dir)
               if os.path.isdir(os.path.join(base_dir, name))]

    filtered_dirs = []
    for dir_name in subdirs:
        try:
            dir_time = datetime.strptime(
                dir_name.split("_")[0] + "_" + dir_name.split("_")[1],
                "%m-%d_%H-%M-%S"
            )
            if dir_time >= ref_time:
                filtered_dirs.append(dir_name)
        except Exception as e:
            print(f"Skipping {dir_name}: {e}")
    
    filtered_dirs.sort(
        key=lambda name: datetime.strptime(name.split("_")[0] + "_" + name.split("_")[1], "%m-%d_%H-%M-%S")
    )

    full_paths = [os.path.join(base_dir, d) for d in filtered_dirs]

    print("Selected directories:")
    for d in full_paths:
        print(d)

    csv_files = []
    for d in full_paths:
        csv_files.extend(glob.glob(os.path.join(d, "results", "*.csv")))

    print(f"\nFound {len(csv_files)} CSV files.")

    df_list = []
    if batch_size:
        for i in range(0, len(csv_files), batch_size):
            batch_files = csv_files[i:i + batch_size]
            batch_dfs = [pd.read_csv(f) for f in batch_files]
            df_list.append(pd.concat(batch_dfs, ignore_index=True))
            print(f"Processed batch {i//batch_size + 1} with {len(batch_files)} files.")
    else:
        df_list = [pd.read_csv(f) for f in csv_files]
        print("Read all files without batching.")

    combined_df = pd.concat(df_list, ignore_index=True)

    pd.set_option('display.max_rows', None)
    pd.set_option('display.max_columns', None)
    pd.set_option('display.width', None)
    pd.set_option('display.colheader_justify', 'left')
    pd.set_option('display.max_colwidth', None)

    return combined_df

# Example usage:
# df = get_files_after("06-23_09-52-01_85267", batch_size=500)
# output_file = r"C:\Users\wikto\PycharmProjects\ce-robustness\framework\experiments\just_base\combined_results_rice.csv"
# df.to_csv(output_file, index=False)
# print(f"\nCombined dataFrame saved to {output_file}")

In [6]:
k = 2

filename = "heloc_0.csv"
file_path = os.path.join(r"C:\Users\wikto\PycharmProjects\ce-robustness\framework", filename)

plot_kth_neighbor_distances_elbow(file_path, k)
plot_kth_neighbor_distances_interactive(file_path, k)

# wine quality k = 5
# 0.3725, 0.2901, 0.2594

# wine quality k = 3
# 0.3252, 0.2746, 0.2433

# heloc k = 2
# 0.51 0.43 0.46  

ValueError: could not convert string to float: 'heloc'

In [7]:
df = get_files_after("08-12_07-07-59_73009", 100)

# output_file = r"C:\Users\wikto\PycharmProjects\ce-robustness\framework\experiments\just_base\rice_lof13.csv"
# df.to_csv(output_file, index=False)
# print(f"\nCombined DataFrame saved to {output_file}")

Selected directories:
C:\Users\wikto\PycharmProjects\ce-robustness\framework\experiments\just_base\results\08-12_08-03-41_20878
C:\Users\wikto\PycharmProjects\ce-robustness\framework\experiments\just_base\results\08-12_08-03-41_90396
C:\Users\wikto\PycharmProjects\ce-robustness\framework\experiments\just_base\results\08-12_08-03-43_48625
C:\Users\wikto\PycharmProjects\ce-robustness\framework\experiments\just_base\results\08-12_08-15-10_23374
C:\Users\wikto\PycharmProjects\ce-robustness\framework\experiments\just_base\results\08-12_08-20-36_91692
C:\Users\wikto\PycharmProjects\ce-robustness\framework\experiments\just_base\results\08-12_08-24-55_39292
C:\Users\wikto\PycharmProjects\ce-robustness\framework\experiments\just_base\results\08-12_08-33-52_82509
C:\Users\wikto\PycharmProjects\ce-robustness\framework\experiments\just_base\results\08-12_08-40-05_51042
C:\Users\wikto\PycharmProjects\ce-robustness\framework\experiments\just_base\results\08-12_08-44-37_60139
C:\Users\wikto\PycharmPr

In [8]:
df.head()

Unnamed: 0.1,Unnamed: 0,dataset_name,fold_i,model_type_to_use,experiment_type,base_cf_method,x_test_sample,y_test_sample,model1_pred_proba,model1_pred_crisp,model2_pred_proba,model2_pred_crisp,The same decision for both models,base_counterfactual,base_counterfactual_model1_pred_proba,base_counterfactual_model1_pred_crisp,base_counterfactual_model2_pred_proba,base_counterfactual_model2_pred_crisp,base_counterfactual_validity,base_counterfactual_validity_model2,base_counterfactual_proximityL1,base_counterfactual_proximityL2,base_counterfactual_plausibility,base_counterfactual_discriminative_power,base_counterfactual_lof,base_counterfactual_num_of_neigh,base_counterfactual_agreement,base_counterfactual_acc_change,base_counterfactual_time,is_base,is_e2e,delta_max,lr,norm,robust_counterfactual,robust_counterfactual_model1_pred_proba,robust_counterfactual_model1_pred_crisp,robust_counterfactual_model2_pred_proba,robust_counterfactual_model2_pred_crisp,robust_counterfactual_validity,robust_counterfactual_validity_model2,robust_counterfactual_proximityL1,robust_counterfactual_proximityL2,robust_counterfactual_plausibility,robust_counterfactual_discriminative_power,robust_counterfactual_lof,robust_counterfactual_num_of_neigh,robust_counterfactual_agreement,robust_counterfactual_acc_change,robust_counterfactual_time,posthoc_explainer,k,beta,delta,robust_counterfactual_L1_distance_from_base,robust_counterfactual_L2_distance_from_base,robust_cf_method,start_sample_passes_test,counterfactual_does_not_pass_test,counterfactual_does_not_have_target_class,counterfactual_is_nan,highest_delta,lower_bound_beta,upper_bound_beta,perturb_radius,delta_plus,sigma,tau,variance
0,0,rice,2,neural_network,Weights,roar,"[array([0.40917092, 0.46535968, 0.47517868, 0.46480086, 0.75752197,\n 0.40822785, 0.19745748])]",1,0.542929,1,0.799944,1,True,"[array([ 0.15960665, 0.21579435, 0.22567655, 0.21519284, 0.50793403,\n 0.15864325, -0.02993326], dtype=float32)]",0.994514,1,0.992617,1,0,False,1.724803,0.652238,0.340144,1.0,[-1],13,1.0,0.055906,0.098304,False,True,0.1,0.05,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,1,rice,2,neural_network,Weights,roar,"[array([0.40917092, 0.46535968, 0.47517868, 0.46480086, 0.75752197,\n 0.40822785, 0.19745748])]",1,0.542929,1,0.532849,1,True,"[array([ 0.15960665, 0.21579435, 0.22567655, 0.21519284, 0.50793403,\n 0.15864325, -0.02993326], dtype=float32)]",0.994514,1,0.978867,1,0,False,1.724803,0.652238,0.340144,1.0,[-1],13,1.0,0.011811,0.100751,False,True,0.1,0.05,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,2,rice,2,neural_network,Weights,roar,"[array([0.40917092, 0.46535968, 0.47517868, 0.46480086, 0.75752197,\n 0.40822785, 0.19745748])]",1,0.542929,1,0.683663,1,True,"[array([ 0.15960665, 0.21579435, 0.22567655, 0.21519284, 0.50793403,\n 0.15864325, -0.02993326], dtype=float32)]",0.994514,1,0.983673,1,0,False,1.724803,0.652238,0.340144,1.0,[-1],13,1.0,0.029134,0.094024,False,True,0.1,0.05,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,3,rice,2,neural_network,Weights,roar,"[array([0.40917092, 0.46535968, 0.47517868, 0.46480086, 0.75752197,\n 0.40822785, 0.19745748])]",1,0.542929,1,0.797068,1,True,"[array([ 0.15960665, 0.21579435, 0.22567655, 0.21519284, 0.50793403,\n 0.15864325, -0.02993326], dtype=float32)]",0.994514,1,0.992957,1,0,False,1.724803,0.652238,0.340144,1.0,[-1],13,1.0,0.029921,0.11434,False,True,0.1,0.05,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,4,rice,2,neural_network,Weights,roar,"[array([0.40917092, 0.46535968, 0.47517868, 0.46480086, 0.75752197,\n 0.40822785, 0.19745748])]",1,0.542929,1,0.545967,1,True,"[array([ 0.15960665, 0.21579435, 0.22567655, 0.21519284, 0.50793403,\n 0.15864325, -0.02993326], dtype=float32)]",0.994514,1,0.99046,1,0,False,1.724803,0.652238,0.340144,1.0,[-1],13,1.0,0.017323,0.114067,False,True,0.1,0.05,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [10]:
import os
import pandas as pd
import numpy as np
from collections import defaultdict

base_methods = {"face", "dice"} #, "gs"}
robust_methods = {"betarob", "robx"}
e2e_methods = {"roar", "rbr"}
experiment_types = ["Architecture", "Bootstrap", "Weights"]

first_table = True

path = r"C:\Users\wikto\PycharmProjects\ce-robustness\framework"

files = [
    "diabetes_lof13_cleaned.csv",
    "rice_lof13_cleaned.csv",
    "heloc_lof13_cleaned.csv",
    "wine_quality_lof13_cleaned.csv",
    "car_eval_lof13_cleaned.csv"
]

results = defaultdict(lambda: defaultdict(lambda: defaultdict(dict)))

def clean_lof(val):
    if pd.isna(val):
        return None
    val = str(val).strip('[]')  
    if val == '1':
        return 1
    elif val == '-1':
        return 0
    else:
        return None

def analyse_rob(df, dataset_name, method, experiment_type):
    total = len(df)
    res = {}
    res['ValidityM1'] = np.round((df["robust_counterfactual_validity"] == 1).sum() / total, 3)
    res['ValidityM2'] = np.round((df["robust_counterfactual_validity_model2"] == 1).sum() / total, 3)
    res['ProximityL2'] = np.round(df["robust_counterfactual_proximityL2"].mean(), 3)
    cleaned_lof = df["robust_counterfactual_lof"].apply(clean_lof)
    res['LOF'] = np.round(cleaned_lof.mean(), 3)
    #res['NN'] = np.round(df["robust_counterfactual_num_of_neigh"].mean(), 3)
    #res['0N'] = np.round((df["robust_counterfactual_num_of_neigh"] == 0).sum() / total, 3)
    res['Agreement'] = np.round(df["robust_counterfactual_agreement"].mean(), 3)
    res['Stability'] = np.round((df["robust_counterfactual_agreement"] > 0.5).sum() / total, 3)
    return res

def analyse(df, dataset_name, method, experiment_type):
    total = len(df)
    res = {}
    res['ValidityM1'] = np.round((df["base_counterfactual_validity"] == 1).sum() / total, 3)
    res['ValidityM2'] = np.round((df["base_counterfactual_validity_model2"] == 1).sum() / total, 3)
    res['ProximityL2'] = np.round(df["base_counterfactual_proximityL2"].mean(), 3)
    cleaned_lof = df["base_counterfactual_lof"].apply(clean_lof)
    res['LOF'] = np.round(cleaned_lof.mean(), 3)
    #res['NN'] = np.round(df["base_counterfactual_num_of_neigh"].mean(), 3)
    #res['0N'] = np.round((df["robust_counterfactual_num_of_neigh"] == 0).sum() / total, 3)
    res['Agreement'] = np.round(df["base_counterfactual_agreement"].mean(), 3)
    res['Stability'] = np.round((df["base_counterfactual_agreement"] > 0.5).sum() / total, 3)
    return res

def get_group(method):
    if method in base_methods:
        return "base"
    elif method in robust_methods:
        return "posthoc"
    elif method in e2e_methods:
        return "e2e"
    else:
        return "other"

for file in files:
    full_path = os.path.join(path, file)
    df = pd.read_csv(full_path)
    if file == "combined_results_diabetes.csv":
        df = df[df['dataset_name'] != 'wine_quality']
        df = df[df['dataset_name'] != 'heloc']
    dataset_name = df['dataset_name'].iloc[0]
    print(dataset_name)

    for exp_type in experiment_types:
        exp_df = df[df['experiment_type'] == exp_type]

        for method in base_methods:
            subset = exp_df[exp_df['base_cf_method'] == method]
            if not subset.empty:
                res = analyse(subset, dataset_name, method, exp_type)
                results[exp_type][dataset_name][get_group(method)][method] = res

        for method in robust_methods:
           subset = exp_df[
               (exp_df['base_cf_method'] == "face") &
               (exp_df['robust_cf_method'] == method)
           ]
           if not subset.empty:
               res = analyse_rob(subset, dataset_name, method, exp_type)
               results[exp_type][dataset_name][get_group(method)][method] = res

        for method in e2e_methods:
            subset = exp_df[
                (exp_df['base_cf_method'] == method) &
                (exp_df['is_e2e'] == 1)
            ]
            if not subset.empty:
                res = analyse(subset, dataset_name, method, exp_type)
                results[exp_type][dataset_name][get_group(method)][method] = res

metrics = ["ValidityM1", "ValidityM2", "ProximityL2", "LOF", "Agreement", "Stability"]
method_order = ["face", "dice", "roar", "rbr", "betarob", "robx"]

def safe_str(val):
    if isinstance(val, float):
        return f"{val:.3f}"
    return str(val)

def generate_latex_table(exp_type, data):
    lines = []
    lines.append("\\begin{table}[ht]")
    lines.append("\\centering")
    lines.append(f"\\caption{{Results for {exp_type} experiment}}")
    col_format = "l l l " + " ".join(["c"] * len(metrics))
    lines.append("\\resizebox{\\textwidth}{!}{")
    lines.append(f"\\begin{{tabular}}{{{col_format}}}")
    lines.append("\\toprule")
    header = ["Dataset", "Group", "Method"] + metrics
    lines.append(" & ".join(header) + " \\\\")
    lines.append("\\midrule")

    dataset_row_counts = {}
    group_row_counts = {}
    for dataset in data:
        total_rows_dataset = 0
        for group in data[dataset]:
            n_methods = len(data[dataset][group])
            group_row_counts[(dataset, group)] = n_methods
            total_rows_dataset += n_methods
        dataset_row_counts[dataset] = total_rows_dataset

    sorted_datasets = sorted(data.keys())
    num_metric_cols = len(metrics)
    last_col_index = 3 + num_metric_cols  

    for i_d, dataset in enumerate(sorted_datasets):
        first_dataset_row = True
        sorted_groups = sorted(data[dataset].keys())
        for i_g, group in enumerate(sorted_groups):
            first_group_row = True
            sorted_methods = sorted(
                data[dataset][group].keys(),
                key=lambda m: method_order.index(m) if m in method_order else len(method_order)
            )
            for method in sorted_methods:
                row = data[dataset][group][method]

                escaped_dataset = dataset.replace("_", r"\_")
                escaped_group = group.replace("_", r"\_")
                
                if first_dataset_row:
                    dataset_cell = f"\\multirow{{{dataset_row_counts[dataset]}}}{{*}}{{{escaped_dataset}}}"
                else:
                    dataset_cell = ""

                if first_group_row:
                    group_cell = f"\\multirow{{{group_row_counts[(dataset, group)]}}}{{*}}{{{escaped_group}}}"
                else:
                    group_cell = ""

                metric_values = [safe_str(row.get(m, "-")) for m in metrics]

                cells = [
                    dataset_cell,
                    group_cell,
                    method, 
                ] + metric_values

                cells = [c if c else "" for c in cells]
                line = " & ".join(cells) + " \\\\"
                lines.append(line)

                first_dataset_row = False
                first_group_row = False

            if i_g < len(sorted_groups) - 1:
                lines.append(f"\\cmidrule(lr){{2-{last_col_index}}}")

        if i_d < len(sorted_datasets) - 1:
            lines.append("\\midrule")

    lines.append("\\bottomrule")
    lines.append("\\end{tabular}")
    lines.append("}")
    lines.append("\\end{table}")

    return "\n".join(lines)

for exp_type in experiment_types:
    if exp_type in results:
        latex_table = generate_latex_table(exp_type, results[exp_type])
        if first_table:
            print("\n\n")
            print(r"\clearpage")
            print(r"\vspace*{\fill}")
            print("\n")
            print(latex_table)
            print("\n")
            print(r"\vspace*{\fill}")
            first_table = False
        else:
            print(latex_table)
            print("\n\n") 


# add \clearpage
#\vspace*{\fill}
# before and 

# \vspace*{\fill}
# after the 1st table to center it

diabetes
rice
heloc
wine_quality
car_eval



\clearpage
\vspace*{\fill}


\begin{table}[ht]
\centering
\caption{Results for Architecture experiment}
\resizebox{\textwidth}{!}{
\begin{tabular}{l l l c c c c c c}
\toprule
Dataset & Group & Method & ValidityM1 & ValidityM2 & ProximityL2 & LOF & Agreement & Stability \\
\midrule
\multirow{6}{*}{car\_eval} & \multirow{2}{*}{base} & face & 1.000 & 0.866 & 0.587 & 1.000 & 0.708 & 0.823 \\
 &  & dice & 1.000 & 0.909 & 0.834 & 1.000 & 0.737 & 0.872 \\
\cmidrule(lr){2-9}
 & \multirow{2}{*}{e2e} & roar & 0.244 & 0.262 & 0.431 & 1.000 & 0.868 & 0.939 \\
 &  & rbr & 0.467 & 0.409 & 0.396 & 0.989 & 0.746 & 0.839 \\
\cmidrule(lr){2-9}
 & \multirow{2}{*}{posthoc} & betarob & 1.000 & 0.961 & 0.607 & 1.000 & 0.742 & 0.873 \\
 &  & robx & 1.000 & 0.931 & 0.607 & 1.000 & 0.749 & 0.893 \\
\midrule
\multirow{6}{*}{diabetes} & \multirow{2}{*}{base} & face & 1.000 & 0.694 & 0.352 & 1.000 & 0.481 & 0.491 \\
 &  & dice & 1.000 & 0.753 & 0.683 & 0.617 & 0.586 & 

In [15]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from seaborn import color_palette

colors = color_palette("Set2", 5)

dataset_palette = {
    "diabetes": "#db5e57",  
    "wine": "#b8db57",      
    "car_eval": "#57db94",   
    "rice": "#5783db",      
    "heloc": "#c857db"     
}

dataset_marker = 'o'

experiment_types = ["architecture", "bootstrap", "weights"]

base_methods = {"dice", "face"}
robust_methods = {"betarob", "robx"}
e2e_methods = {"roar", "rbr"}
all_methods = ["face", "dice", "roar", "rbr", "betarob", "robx"]

base_path = r"C:\Users\wikto\PycharmProjects\ce-robustness\framework\experiments\thesis\cleaned_results"

files = [
    "diabetes_lof13_cleaned.csv",
    "rice_lof13_cleaned.csv",
    "heloc_lof13_cleaned.csv",
    "wine_quality_lof13_cleaned.csv",
    "car_eval_lof13_cleaned.csv"
]

files = [os.path.join(base_path, file) for file in files]

def clean_lof(val):
    if pd.isna(val):
        return None
    val = str(val).strip('[]')  
    if val == '1':
        return 1
    elif val == '-1':
        return 0
    else:
        return None

def analyse_rob(df):
    total = len(df)
    if total == 0:
        return {}
    cleaned_lof = df["robust_counterfactual_lof"].apply(clean_lof)
    return {
        'validityM2': np.round((df["robust_counterfactual_validity_model2"] == 1).sum() / total, 3),
        'LOF': np.round(cleaned_lof.mean(), 3),
        'Agreement': np.round(df["robust_counterfactual_agreement"].mean(), 3),
        'Stability': np.round((df["robust_counterfactual_agreement"] > 0.5).sum() / total, 3)
    }

def analyse(df):
    total = len(df)
    if total == 0:
        return {}
    cleaned_lof = df["base_counterfactual_lof"].apply(clean_lof)
    return {
        'validityM2': np.round((df["base_counterfactual_validity_model2"] == 1).sum() / total, 3),
        'LOF': np.round(cleaned_lof.mean(), 3),
        'Agreement': np.round(df["base_counterfactual_agreement"].mean(), 3),
        'Stability': np.round((df["base_counterfactual_agreement"] > 0.5).sum() / total, 3)
    }

def simplify_dataset_name(name):
    name = name.lower()
    for key in dataset_palette:
        if key in name:
            return key
    return name

data_cache = {}
for file in files:
    df = pd.read_csv(file, low_memory=False)
    dataset_name = simplify_dataset_name(df['dataset_name'].iloc[0])
    for exp_type in df['experiment_type'].str.lower().unique():
        exp_df = df[df['experiment_type'].str.lower() == exp_type]
        for method in all_methods:
            if method in base_methods:
                subset = exp_df[exp_df['base_cf_method'] == method]
                mtype = 'base'
            elif method in robust_methods:
                subset = exp_df[(exp_df['base_cf_method'] == "face") & (exp_df['robust_cf_method'] == method)]
                mtype = 'robust'
            else:
                subset = exp_df[(exp_df['base_cf_method'] == method) & (exp_df['is_e2e'] == 1)]
                mtype = 'e2e'
            data_cache[(dataset_name, exp_type, method, mtype)] = subset

output_dirs = {
    "plots_cleaned": "plots_cleaned"
}

for folder in output_dirs.values():
    os.makedirs(folder, exist_ok=True)

def plot_metric(metric, y_label, folder):
    for exp_type in experiment_types:
        fig, axs = plt.subplots(3, 2, figsize=(12, 18))  
        plotted_labels_per_ax = [set() for _ in range(6)]

        for i, method in enumerate(all_methods):
            ax = axs[i // 2, i % 2]
            ax.set_xlim(-0.02, 1.02)
            ax.set_ylim(-0.02, 1.02)
            ax.set_title(method.upper(), fontsize=14)
            ax.set_xlabel("ValidityM2", fontsize=12)
            ax.set_ylabel(y_label, fontsize=12)
            ax.grid(True)

            plotted_labels = set()
            for dataset_name in dataset_palette:
                if method in base_methods:
                    mtype = 'base'
                elif method in robust_methods:
                    mtype = 'robust'
                else:
                    mtype = 'e2e'

                subset = data_cache.get((dataset_name, exp_type, method, mtype), pd.DataFrame())
                if subset.empty:
                    continue

                if mtype == 'robust':
                    res = analyse_rob(subset)
                else:
                    res = analyse(subset)

                if metric not in res or len(res) == 0:
                    continue

                x = res['validityM2']
                y = res[metric]

                label_to_use = dataset_name if dataset_name not in plotted_labels else None
                if label_to_use:
                    ax.scatter(x, y, color=dataset_palette[dataset_name], marker=dataset_marker, s=100, label=dataset_name)
                    plotted_labels.add(dataset_name)
                else:
                    ax.scatter(x, y, color=dataset_palette[dataset_name], marker=dataset_marker, s=100)

            handles, labels = ax.get_legend_handles_labels()
            by_label = dict(zip(labels, handles))
            ax.legend(by_label.values(), by_label.keys(), fontsize=9)

        plt.tight_layout(rect=[0, 0, 1, 0.96])
        plt.subplots_adjust(hspace=0.3)
        plt.subplots_adjust(wspace=0.3)
        save_path = os.path.join(folder, f"{metric.lower()}_{exp_type}.png")
        plt.savefig(save_path)
        plt.close()

for metric, ylabel, folder in [
    ("LOF", "LOF", output_dirs["plots_cleaned"]),
    ("Agreement", "Agreement", output_dirs["plots_cleaned"]),
    ("Stability", "Stability (Agreement > 0.5)", output_dirs["plots_cleaned"])
]:
    plot_metric(metric, ylabel, folder)

print("All plots saved in", os.getcwd())

All plots saved in C:\Users\wikto\PycharmProjects\ce-robustness\framework\experiments\thesis\notebooks
