In [None]:
import csv
import os
import pandas as pd
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter
from matplotlib.lines import Line2D
from scipy.spatial.distance import jensenshannon
from scipy.stats import fisher_exact, mannwhitneyu, probplot
from typing import Callable, Dict, Tuple

# Function Definitions

## Section 1: Data Loading

In [None]:
def load_data(chapter: str, group: str, study: str = None) -> tuple[list[pd.DataFrame], list[str]]:
    """
    Load the baseline dataset plus every study dataset found under {chapter}/{group}/.

    Parameters:
        chapter (str): Top-level directory (e.g., '31_agents').
        group (str): Sub-directory containing one folder per study.
        study (str, optional): If provided, only the subfolder with this name is loaded.

    Returns:
        tuple[list[pd.DataFrame], list[str]]: The loaded DataFrames and their labels,
            in matching order. The first entry is always the baseline dataset,
            labelled 'Baseline'; every other label is the study's folder name.

    Raises:
        FileNotFoundError: If the baseline CSV does not exist.
    """
    df_list = []
    labels_list = ['Baseline']

    # Load base dataset
    base_path = 'sensitivity/counting_base.csv'
    if not os.path.exists(base_path):
        raise FileNotFoundError(f"Baseline dataset not found at {base_path}")
    df_base = pd.read_csv(base_path)
    df_base['decision_reached'] = df_base['decision_reached'].astype(bool)
    df_list.append(df_base)

    # Load all study datasets from subfolders
    parent_path = os.path.join(chapter, group)
    if not os.path.isdir(parent_path):
        print("Warning: Only baseline dataset was loaded. No study data found.")
        return df_list, labels_list

    for folder in sorted(os.listdir(parent_path)):
        folder_path = os.path.join(parent_path, folder)
        if not os.path.isdir(folder_path):
            continue
        if study and folder != study:
            continue

        file_path = os.path.join(folder_path, 'counting.csv')
        if not os.path.exists(file_path):
            print(f"Warning: counting.csv not found in {folder_path}")
            continue

        df = pd.read_csv(file_path)
        if 'decision_reached' in df.columns:
            df['decision_reached'] = df['decision_reached'].astype(bool)
        df_list.append(df)
        # Use the folder name directly: splitting the joined path on a backslash
        # only isolates the folder name on Windows and would return the whole
        # path as the label on POSIX systems.
        labels_list.append(folder)

    return df_list, labels_list

## Section 2: Data Preparation

In [None]:
def classify_outcome(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add an 'outcome' column derived from each row's 'misled' value.

    A missing (NaN/None) 'misled' value maps to 'no decision', a truthy value
    to 'misled' and any other value to 'rejected'. The column is written onto
    the DataFrame that was passed in, and that same DataFrame is returned.
    """
    def _label_for(record):
        flag = record.get('misled')
        if pd.isna(flag):
            return 'no decision'
        # Anything that is not missing is interpreted through its truthiness.
        return 'misled' if bool(flag) else 'rejected'

    df['outcome'] = df.apply(_label_for, axis=1)
    return df

def prepare_data(df_list: list[pd.DataFrame], study_labels: list[str]) -> dict[str, pd.DataFrame]:
    """
    Build a label -> DataFrame mapping in which every frame carries an 'outcome' column.

    Each input frame is copied before classification, so the frames in
    `df_list` are left untouched.

    Raises:
        ValueError: If `df_list` and `study_labels` differ in length.
    """
    if len(df_list) != len(study_labels):
        raise ValueError("`df_list` and `study_labels` must have the same length.")

    prepared = {}
    for label, frame in zip(study_labels, df_list):
        prepared[label] = classify_outcome(frame.copy())
    return prepared


In [None]:
def exchange_items_from_csv(input_list, csv_filename: str = 'experiment_labels.csv'):
    """
    Replace items in input_list based on mappings defined in a CSV file.

    Args:
        input_list (list of str): The list to process.
        csv_filename (str): Path of the CSV file containing 'original' and
            'replacement' columns. Defaults to 'experiment_labels.csv' in the
            current working directory.

    Returns:
        list of str: A new list in which every item that appears in the
        'original' column is replaced by its 'replacement'; other items are
        kept unchanged.
    """
    # 'utf-8-sig' reads plain UTF-8 as well as files carrying a byte-order mark
    # (which would otherwise be glued onto the first header name);
    # newline='' is what the csv module expects for files it parses.
    with open(csv_filename, mode='r', newline='', encoding='utf-8-sig') as csvfile:
        mapping = {
            row['original'].strip(): row['replacement'].strip()
            for row in csv.DictReader(csvfile)
        }

    # Replace items in the list if they exist in the mapping
    return [mapping.get(item, item) for item in input_list]

## Section 3: Outcome Distribution Analysis Across Trials

In [None]:
def get_outcome_distribution_across_trials(
    df_dict: dict[str, pd.DataFrame],
    trial_steps: list[int]
) -> pd.DataFrame:
    """
    Tabulate outcome percentages over the first n rows of every study.

    For each study and each n in `trial_steps` that does not exceed the number
    of rows available, one record is produced holding the percentage of
    'misled', 'rejected' and 'no decision' outcomes among the first n rows.
    Steps larger than the study's row count are skipped.
    """
    outcome_names = ('misled', 'rejected', 'no decision')
    records = []
    for label, frame in df_dict.items():
        available = len(frame)
        for n_trials in trial_steps:
            if n_trials > available:
                continue
            shares = frame.iloc[:n_trials]['outcome'].value_counts(normalize=True) * 100
            record = {'study': label, 'Number of Trials': n_trials}
            for name in outcome_names:
                record[name] = shares.get(name, 0)
            records.append(record)
    return pd.DataFrame(records)


def plot_outcome_distribution(plot_df: pd.DataFrame, study: str):
    """
    Draw a stacked bar chart of outcome percentages per trial count for one study.

    Parameters:
        plot_df (pd.DataFrame): Table with the columns 'study', 'Number of Trials',
            'misled', 'rejected' and 'no decision' (percentages on a 0-100 scale).
        study (str): Value of the 'study' column to plot.
    """
    colors = ['#009688', '#B22222', '#A9A9A9']
    study_df = plot_df[plot_df['study'] == study].set_index('Number of Trials')
    ax = study_df[['misled', 'rejected', 'no decision']].plot(
        kind='bar', stacked=True, color=colors, figsize=(10, 6)
    )

    # When text.usetex is enabled a bare '%' starts a TeX comment, so the
    # percent sign in the bar labels has to be escaped in that mode.
    label_fmt = '%.0f\\%%' if plt.rcParams['text.usetex'] else '%.0f%%'
    for container in ax.containers:
        ax.bar_label(container, fmt=label_fmt, label_type='center', fontsize=9, color='black')

    ax.set_title(f'Outcome Distribution by Number of Trials ({study})')
    ax.set_xlabel('Number of Trials')
    ax.set_ylabel('Percentage')
    ax.set_ylim(0, 100)
    ax.set_xticklabels(study_df.index, rotation=0)
    # A formatter keeps labels tied to the tick positions (unlike freezing the
    # label texts with set_yticklabels) and takes care of usetex escaping.
    ax.yaxis.set_major_formatter(PercentFormatter(xmax=100, decimals=0))
    plt.tight_layout()
    plt.show()


## Section 4: Distribution Stability Over Trials

In [None]:
def get_distribution(data: list[str], categories: list[str]) -> np.ndarray:
    """Return the relative frequency of each entry of `categories` within `data`."""
    tally = Counter(data)
    frequencies = np.array([tally[label] for label in categories], dtype=float)
    return frequencies / len(data)

def total_variation_distance(p: np.ndarray, q: np.ndarray) -> float:
    """Return half of the summed absolute element-wise difference between p and q."""
    absolute_gap = np.abs(p - q)
    return 0.5 * np.sum(absolute_gap)

def plot_distribution_stability(outcomes: list[str], categories: list[str], study: str, step=1):
    """
    Plot how quickly the outcome distribution settles as trials accumulate.

    For n = step, 2*step, ... (stopping before len(outcomes) - step) the
    distribution of the first n outcomes is compared, via total variation
    distance, with the distribution of all outcomes and with the distribution
    of the first n + step outcomes. Both curves are drawn together with a
    dashed reference line at 0.05.
    """
    final_dist = get_distribution(outcomes, categories)

    trial_sizes = []
    tvd_vals = []
    tvd_vals_next = []
    # NOTE(review): the Jensen-Shannon values are collected but never drawn.
    jsd_vals = []

    for n in range(step, len(outcomes) - step, step):
        current = get_distribution(outcomes[:n], categories)
        following = get_distribution(outcomes[:n + step], categories)
        tvd_vals_next.append(total_variation_distance(current, following))
        tvd_vals.append(total_variation_distance(current, final_dist))
        jsd_vals.append(jensenshannon(current, following, base=2))
        trial_sizes.append(n)

    fig, ax = plt.subplots(figsize=(10, 5))
    ax.plot(trial_sizes, tvd_vals, label='TVD vs Final', marker='o')
    ax.plot(trial_sizes, tvd_vals_next, label='TVD vs Next', marker='x')
    ax.axhline(0.05, color='gray', linestyle='--', label='Threshold (0.05)')
    ax.set_xlabel('Number of Trials')
    ax.set_ylabel('Distance')
    ax.set_title(f'Distribution Stability ({study})')
    ax.legend()
    ax.grid(True)
    fig.tight_layout()
    plt.show()

## Section 5: Comparing Multiple Distributions

In [None]:
# Configure matplotlib globally so that figure text is typeset by LaTeX with a
# Computer Modern serif font, matching the accompanying document.
# Because text.usetex is switched on, every string handed to matplotlib after
# this cell is interpreted as TeX: special characters such as '%' or '_' must
# be escaped in labels and titles.
fntsz = 15  # single font size (in points) shared by all text elements below
plt.rcParams.update({
    "text.usetex": True,
    "font.family": "serif",
    "font.serif": ["Computer Modern"],
    "font.size": fntsz,
    "axes.titlesize": fntsz,
    "axes.labelsize": fntsz,
    "xtick.labelsize": fntsz,
    "ytick.labelsize": fntsz,
    "legend.fontsize": fntsz,
})

In [None]:
def plot_comparison(df_list: list, title: str):
    """
    Plot comparative bar charts of outcome proportions for multiple DataFrames.

    Parameters:
        df_list (list): DataFrames that each contain an 'outcome' column. The
            first one is labelled as the baseline distribution.
        title (str): Title of the figure.
    """
    # Collect the categories from every frame; taking them from the first frame
    # only would silently hide outcomes that never occur in the baseline.
    category_set = set()
    for df in df_list:
        category_set.update(df['outcome'].dropna().unique())
    ref_categories = sorted(category_set)

    n_dfs = len(df_list)
    # Keep the default width of 0.2, but shrink it once a group of bars would
    # otherwise spill over into the neighbouring category.
    bar_width = min(0.2, 0.8 / max(n_dfs, 1))
    x = np.arange(len(ref_categories))

    # With text.usetex enabled a bare '%' starts a TeX comment and must be escaped.
    percent_sign = '\\%' if plt.rcParams['text.usetex'] else '%'

    fig, ax = plt.subplots(figsize=(10, 6))

    for i, df in enumerate(df_list):
        counts = df['outcome'].value_counts(normalize=True).reindex(ref_categories, fill_value=0)
        label = 'Baseline Distribution' if i == 0 else f'DataFrame {i+1}'
        # Centre the group of bars belonging to one category on its tick.
        bars = ax.bar(x + (i - (n_dfs - 1) / 2) * bar_width, counts, width=bar_width, label=label, alpha=0.7)

        for bar in bars:
            height = bar.get_height()
            ax.text(bar.get_x() + bar.get_width() / 2, height,
                    f'{height*100:.1f}{percent_sign}', ha='center', va='bottom', fontsize=9)

    ax.set_xticks(x)
    ax.set_xticklabels(ref_categories)
    ax.set_xlabel('Outcome')
    ax.set_ylabel('Proportion')
    ax.set_title(title)
    ax.legend()
    plt.tight_layout()
    plt.show()

def calculate_jsd_tvd(df_dict: dict[str, pd.DataFrame], print_output: bool = True):
    """
    Compare every experiment's outcome distribution with the baseline.

    For each entry of `df_dict` other than 'Baseline', the Jensen-Shannon
    distance (base 2, as returned by scipy's `jensenshannon`) and the total
    variation distance to the baseline's outcome distribution are computed.

    Parameters:
        df_dict (dict[str, pd.DataFrame]): Frames with an 'outcome' column; must
            contain the key 'Baseline'.
        print_output (bool): If True, print both metrics for every experiment.

    Returns:
        pd.DataFrame: Indexed by 'experiment' with the columns 'jsd' and 'tvd'.
        Empty (but with these columns) when there is nothing to compare.
    """
    base_shares = df_dict['Baseline']['outcome'].value_counts(normalize=True)

    results = []
    for experiment, df in df_dict.items():
        if experiment == 'Baseline':
            continue

        comp_shares = df['outcome'].value_counts(normalize=True)
        # Use the union of the outcomes seen on either side. Restricting the
        # comparison to the baseline's categories would discard probability
        # mass of outcomes that only occur in the experiment.
        categories = sorted(set(base_shares.index) | set(comp_shares.index))
        base_dist = base_shares.reindex(categories, fill_value=0).to_numpy(dtype=float)
        dist = comp_shares.reindex(categories, fill_value=0).to_numpy(dtype=float)

        jsd = jensenshannon(base_dist, dist, base=2)
        tvd = np.sum(np.abs(base_dist - dist)) / 2

        if print_output:
            print(f"\n--- Comparison with '{experiment}' ---")
            print(f"JSD: {jsd:.4f}")
            print(f"TVD: {tvd:.4f}")
        results.append({
            'experiment': experiment,
            'jsd': jsd,
            'tvd': tvd
        })

    # Naming the columns keeps set_index working when no experiment was compared.
    return pd.DataFrame(results, columns=['experiment', 'jsd', 'tvd']).set_index('experiment')

def compute_significance_band(df: pd.DataFrame, alpha: float, resilience: bool = True) -> tuple[float, float]:
    """
    Find the range of target proportions a second, equally sized sample could
    show without differing significantly from `df` under Fisher's Exact Test.

    Parameters:
        df (pd.DataFrame): Input dataframe with an 'outcome' column.
        alpha (float): Significance level.
        resilience (bool): If True the target outcome is 'rejected', otherwise 'misled'.

    Returns:
        (min_prop, max_prop): Smallest and largest target proportion whose
        two-sided p-value is >= alpha, or (nan, nan) if no proportion qualifies.
    """
    if resilience:
        target = 'rejected'
    else:
        target = 'misled'

    # Observed split: target outcome versus every other non-missing outcome.
    hits = df['outcome'].value_counts().get(target, 0)
    misses = df['outcome'].notna().sum() - hits
    n = hits + misses

    # Try every possible number of target outcomes in a hypothetical second
    # sample of the same size and keep those not significantly different.
    accepted = []
    for candidate_hits in range(0, n + 1):
        contingency = [[hits, misses], [candidate_hits, n - candidate_hits]]
        _, p_value = fisher_exact(contingency, alternative='two-sided')
        proportion = candidate_hits / n
        if p_value >= alpha:
            accepted.append(proportion)

    if len(accepted) == 0:
        return (np.nan, np.nan)  # No band found (very extreme base case)

    return (min(accepted), max(accepted))


## Section 6: Statistical Testing Functions

In [None]:
def calculate_binary_counts(
    df_base: pd.DataFrame,
    df_comp: pd.DataFrame,
    column: str,
    true_condition: Callable[[pd.Series], pd.Series]
) -> Dict[str, Tuple[int, int]]:
    """
    Count how many rows satisfy a condition in the base and comparison frames.

    `true_condition` receives the selected column of each frame and must
    return a boolean mask over it.

    Returns:
        Dict with keys 'base' and 'comp', and values as (count_true, count_false).
    """
    counts = {}
    for key, frame in (("base", df_base), ("comp", df_comp)):
        satisfied = true_condition(frame[column])
        counts[key] = (satisfied.sum(), (~satisfied).sum())
    return counts

def run_fisher_test(df_base: pd.DataFrame, df_comp: pd.DataFrame, column: str, condition_func, label: str, experiment: str, print_output: bool) -> dict:
    """
    Run Fisher's Exact Test on a binary split of `column` in two DataFrames.

    Parameters:
        df_base (pd.DataFrame): Baseline data.
        df_comp (pd.DataFrame): Data of the experiment being compared.
        column (str): Column the binary condition is evaluated on.
        condition_func: Callable mapping the column (a Series) to a boolean mask.
        label (str): Name used for the test in printed output and in the result.
        experiment (str): Name of the compared experiment (used for output only).
        print_output (bool): If True, print the contingency table and the result,
            and show a boxplot when the difference is significant.

    Returns:
        dict: Keys 'label', 'test', 'p-value' and 'significant' (p < 0.05).
    """
    results = calculate_binary_counts(df_base, df_comp, column, condition_func)

    table = [list(results["base"]), list(results["comp"])]
    oddsratio, pval = fisher_exact(table)
    significant = bool(pval < 0.05)

    if print_output:
        print(f"\n--- {label} (Fisher's Exact Test) ---")
        print(pd.DataFrame(table, columns=['True', 'False'], index=['Baseline', experiment]))
        print(f"Odds Ratio: {oddsratio:.4f}")
        print(f"p-value: {pval:.4e}")
        print("Result:", "Significant difference." if significant else "No significant difference.")

        if significant:
            print(f"Significant difference in {label} distribution detected.")
            # Plot base vs experiment next to each other. The data lives in
            # `column`; `label` is only a display name (e.g. 'rejected') and is
            # not guaranteed to exist as a column. Casting to float lets
            # boolean columns be drawn as 0/1 values.
            base_values = df_base[column].dropna().astype(float)
            comp_values = df_comp[column].dropna().astype(float)
            fig, ax = plt.subplots(figsize=(8, 6))
            ax.boxplot([base_values, comp_values])
            # Set tick labels separately instead of relying on the boxplot
            # keyword, whose name differs between matplotlib versions.
            ax.set_xticklabels(['Baseline', experiment])
            ax.set_title(f'{label} Comparison - {experiment}')
            ax.set_ylabel(label)
            ax.grid(True)
            plt.show()

    return {
        'label': label,
        'test': 'Fisher\'s Exact Test',
        'p-value': pval,
        'significant': significant
    }

def run_mannwhitney_test(df_base: pd.DataFrame, df_comp: pd.DataFrame, column: str, label: str, experiment: str, print_output: bool) -> dict:
    """
    Compare the distribution of a numerical column between baseline and
    experiment with a two-sided Mann-Whitney U test.

    Missing values are dropped from both samples first. With `print_output`
    enabled, a normal Q-Q plot of the experiment sample is shown and means,
    medians and the test result are printed.

    Returns:
        dict: Keys 'label', 'test', 'p-value' and 'significant' (p < 0.05).
    """
    base = df_base[column].dropna()
    sample = df_comp[column].dropna()

    stat, pval = mannwhitneyu(sample, base, alternative='two-sided')
    is_significant = bool(pval < 0.05)

    outcome = {
        'label': label,
        'test': 'Mann-Whitney U Test',
        'p-value': pval,
        'significant': is_significant,
    }
    if not print_output:
        return outcome

    # Q-Q Plot (optional, still useful for visualization)
    probplot(sample, dist="norm", plot=plt)
    plt.title(f"Q-Q Plot - {experiment}")
    plt.show()

    verdict = "Significant difference." if is_significant else "No significant difference."
    print(f"\n--- {label} (Mann–Whitney U Test) ---")
    print(f"Mean {experiment}: {sample.mean():.2f}, Mean Base: {base.mean():.2f}")
    print(f"Median {experiment}: {sample.median():.2f}, Median Base: {base.median():.2f}")
    print(f"U statistic: {stat:.2f}, p-value: {pval:.4e}")
    print("Result:", verdict)

    return outcome

In [None]:
def calculate_correctness_ratio(df_list: list, experiment: str) -> dict:
    """
    Compute the incorrect-to-correct ratio among the rows that were not misled.

    Returns a dict mapping `experiment` to (ratio, [incorrect, correct]); the
    ratio is NaN when there are no correct rows.

    NOTE(review): every DataFrame in `df_list` writes to the same key, so only
    the values of the last frame end up in the result - confirm the intent.
    """
    ratios = {}
    for frame in df_list:
        if 'misled' in frame.columns:
            not_misled = frame['misled'].eq(False)
        else:
            not_misled = ~frame['outcome'].eq('misled')
        incorrect = (not_misled & frame['correct'].eq(False)).sum()
        correct = (not_misled & frame['correct'].eq(True)).sum()
        if correct != 0:
            ratio = incorrect / correct
        else:
            ratio = np.nan
        ratios[experiment] = (ratio, [incorrect, correct])
    return ratios

def calculate_correctness_table(df_list: list) -> list:
    """
    Build one (incorrect, remaining) row per DataFrame.

    'incorrect' counts the rows with misled == False and correct == False;
    'remaining' is the number of all other rows of that frame.
    """
    def _row(frame):
        incorrect = (frame['misled'].eq(False) & frame['correct'].eq(False)).sum()
        return (incorrect, len(frame) - incorrect)

    return [_row(frame) for frame in df_list]


def compare_correctness_ratios(base_df: pd.DataFrame, comp_df: pd.DataFrame, experiment: str, print_output: bool = True) -> dict:
    """
    Compare the share of incorrect, non-misled rows between baseline and
    experiment using Fisher's Exact Test. Assumes binary correctness (True/False).

    Parameters:
        base_df (pd.DataFrame): Baseline data.
        comp_df (pd.DataFrame): Data of the experiment being compared.
        experiment (str): Name of the experiment (used for output only).
        print_output (bool): If True, print the contingency table and the result.

    Returns:
        dict: Keys 'label', 'test', 'p-value' and 'significant'. Both 'p-value'
        and 'significant' are None when a required column ('misled' or
        'correct') is missing from either DataFrame and the test is skipped.
    """
    skipped = {
        'label': 'Correctness',
        'test': 'Fisher\'s Exact Test',
        'p-value': None,
        'significant': None
    }

    if 'correct' not in comp_df.columns:
        print(f"The 'correct' column is missing in the DataFrame of experiment {experiment}. Test for correctness skipped.")
        return skipped

    # The contingency table needs 'misled' and 'correct' in BOTH frames; skip
    # gracefully instead of failing with a KeyError further down.
    for frame_name, frame in (('Baseline', base_df), (experiment, comp_df)):
        missing = [col for col in ('misled', 'correct') if col not in frame.columns]
        if missing:
            print(f"Column(s) {missing} missing in the DataFrame of '{frame_name}'. Test for correctness skipped.")
            return skipped

    table = calculate_correctness_table([base_df, comp_df])
    oddsratio, pval = fisher_exact(table)
    significant = bool(pval < 0.05)

    if print_output:
        print(f"\n--- Correctness Ratio Comparison ({experiment}) ---")
        labels = ['Baseline', experiment]
        for label, (incorrect, remaining) in zip(labels, table):
            # The printed ratio is incorrect rows relative to all remaining rows.
            ratio = incorrect / remaining if remaining != 0 else float('nan')
            print(f"{label} Correctness Ratio: {ratio:.2f} ({incorrect}/{remaining})")
        print("\nContingency Table (Incorrect / Remaining):")
        print(pd.DataFrame(table, columns=["Incorrect", "Remaining"], index=labels))
        print(f"\nFisher's Exact Test p-value: {pval:.4e}")
        print("Result:", "Significant difference." if significant else "No significant difference.")

    return {
        'label': 'Correctness',
        'test': 'Fisher\'s Exact Test',
        'p-value': pval,
        'significant': significant
    }


In [None]:
def run_stat_tests(df_dict: dict[str, pd.DataFrame], resilience: bool, print_output: bool) -> pd.DataFrame:
    """
    Run all statistical tests of every experiment against the baseline.

    For each entry of `df_dict` other than 'Baseline' four tests are run:
    Fisher's Exact Test on the target outcome ('rejected' if `resilience`,
    otherwise 'misled'), Fisher's Exact Test on 'decision_reached', a
    Mann-Whitney U test on 'iterations_needed', and the correctness comparison.
    Experiments lacking one of the columns 'misled', 'decision_reached' or
    'iterations_needed' are skipped with a message.

    Parameters:
        df_dict (dict[str, pd.DataFrame]): Must contain the key 'Baseline'.
        resilience (bool): Selects the target outcome (see above).
        print_output (bool): Forwarded to the individual tests.

    Returns:
        pd.DataFrame: One row per test, indexed by 'experiment', with the columns
        'label', 'test', 'p-value' and 'significant'. Empty (but with these
        columns) if every experiment was skipped.

    Raises:
        ValueError: If fewer than two DataFrames are supplied.
    """
    if len(df_dict) < 2:
        raise ValueError("At least two dataframes are required to compare.")

    target = 'rejected' if resilience else 'misled'
    required_cols = {'misled', 'decision_reached', 'iterations_needed'}
    result_columns = ['label', 'test', 'p-value', 'significant', 'experiment']

    results = []
    base_df = df_dict['Baseline']

    for experiment, comp_df in df_dict.items():
        if experiment == 'Baseline':
            continue
        if not required_cols.issubset(comp_df.columns):
            # The check looks at the experiment's own frame, so say so.
            print(f"Skipping {experiment}: required columns not found in its DataFrame.")
            continue
        print(f"\n--- Running Statistical Tests for Base Case vs. '{experiment}' ---")

        # With resilience the "true" class is misled == False (i.e. rejected),
        # otherwise it is misled == True.
        test_results = [
            run_fisher_test(base_df, comp_df, 'misled', lambda col: col == (not resilience),
                            label=target, experiment=experiment, print_output=print_output),
            run_fisher_test(base_df, comp_df, 'decision_reached', lambda col: col == True,
                            "decision_reached", experiment=experiment, print_output=print_output),
            run_mannwhitney_test(base_df, comp_df, 'iterations_needed', "iterations_needed",
                                 experiment, print_output=print_output),
            compare_correctness_ratios(base_df, comp_df, experiment, print_output=print_output),
        ]
        # Append with experiment key
        results.extend({**test_result, 'experiment': experiment} for test_result in test_results)

    # Naming the columns keeps set_index working when nothing was tested.
    return pd.DataFrame(results, columns=result_columns).set_index('experiment')

## Section 7: Plotting

In [None]:
def plot_multi_comparison(df_dict: dict[str, pd.DataFrame], resilience: bool):
    """
    Draw one figure with a stacked outcome bar per study and, next to it on a
    secondary axis, an outlined bar for the mean of 'iterations_needed' with
    its standard deviation as error bar.

    Studies are ordered by ascending proportion of misled == False rows when
    `resilience` is True, otherwise by ascending proportion of 'misled'
    outcomes. Two dashed horizontal lines mark the band returned by
    compute_significance_band for the first entry of `df_dict`.

    Parameters:
        df_dict (dict[str, pd.DataFrame]): Maps study labels to DataFrames. Each
            frame needs the columns 'outcome' and 'iterations_needed'; 'misled'
            is used for sorting when present.
        resilience (bool): Selects the sort key and the target outcome of the
            significance band ('rejected' if True, otherwise 'misled').
    """
    study_labels = list(df_dict.keys())
    df_list = list(df_dict.values())
    
    assert len(df_list) == len(study_labels), "study_labels must match df_list in length"

    # Stacking order of the outcome segments (bottom to top) and plot colors.
    outcomes = ['misled', 'no decision', 'rejected']
    colors = {
        'misled': "#B33131",
        'no decision': "#707070",
        'rejected': "#2CAA8F",
        'non_sig_min': "#9B7C4F",  # Color for non-significant minimum
        'non_sig_max': "#F1A83B"   # Color for non-significant maximum
    }

    # Compute misled proportions and sort accordingly
    misled_props = [df['outcome'].value_counts(normalize=True).get('misled', 0) for df in df_list]
    misled_false_props = [(df['misled'] == False).mean() if 'misled' in df.columns else np.nan for df in df_list]
    sorted_indices = np.argsort(misled_false_props) if resilience else np.argsort(misled_props)

    # Compute significance band.
    # This runs before the reordering below, so df_list[0] is still the first
    # entry of df_dict (presumably the baseline - verify against the caller).
    alf = 0.05
    non_sig_min, non_sig_max = compute_significance_band(df_list[0], alpha=alf, resilience=resilience)
    
    # Reorder both df_list and study_labels
    df_list = [df_list[i] for i in sorted_indices]
    study_labels = [study_labels[i] for i in sorted_indices]

    # Prepare data: get proportions for each outcome in each study
    proportions = {outcome: [] for outcome in outcomes}
    iterations_needed = []
    iterations_std = []

    for df in df_list:
        counts = df['outcome'].value_counts(normalize=True)
        for outcome in outcomes:
            proportions[outcome].append(counts.get(outcome, 0))
        iterations_needed.append(df['iterations_needed'].mean())
        iterations_std.append(df['iterations_needed'].std())
  
    n_groups = len(df_list)
    fixed_bar_width = 0.4
    group_spacing = fixed_bar_width * 2  # Space between groups of bars
    x = np.arange(n_groups) * group_spacing
    padding = fixed_bar_width / 2  # Padding for sides

    # Adjust figure width dynamically, e.g. base width of 4 + extra per group
    base_width = 4
    width_per_group = 0.9  # inches per group (tweak as needed)
    fig_width = min(base_width + n_groups * width_per_group, 14)  # max width of 14 inches

    fig, ax1 = plt.subplots(figsize=(fig_width, 6))

    # Stack the outcome segments; `bottom` tracks the running height per study.
    bottom = np.zeros(len(df_list))
    for outcome in outcomes:
        vals = proportions[outcome]
        bar_colors = [colors[outcome]] * len(vals)
        # Outcome bars sit left of the tick position (centered at x - width/2).
        bars = ax1.bar(x - fixed_bar_width / 2, vals, bottom=bottom, width=fixed_bar_width, label=outcome,
                       color=bar_colors, edgecolor='white')

        # Add percentage labels (the percent sign is escaped as '\%')
        for i, bar in enumerate(bars):
            height = bar.get_height()
            if height > 0.04:  # Only label if the section is big enough
                ax1.text(bar.get_x() + bar.get_width() / 2,
                         bottom[i] + height / 2,
                         f'{height * 100:.0f}\\%',
                         ha='center', va='center', color='black', fontweight='bold')
        bottom += vals

    # Set the labels and title for the primary axis
    ax1.set_xticks(x)
    ax1.set_xticklabels(study_labels, rotation=45, ha='right')
    ax1.set_xlim(x[0] - fixed_bar_width - padding, x[-1] + fixed_bar_width / 2 + padding)
    ax1.set_ylabel('Proportion of Outcomes')
    ax1.set_ylim(0, 1)
    ax1.yaxis.set_major_formatter(PercentFormatter(1.0))  # Format y-axis as percent based on fraction (1.0 = 100%)
    # ax1.set_title(title, fontsize=14, weight='bold')
    ax1.grid(axis='y', linestyle='--', alpha=0.5)

    # Plot horizontal lines for non_sig_min and non_sig_max (Lower Significance Threshold and Upper Significance Threshold).
    # In the resilience case the lines are drawn at 1 - value, while the legend
    # text still shows the un-flipped proportion of the target outcome.
    non_sig_max_plot = 1 - non_sig_max  if resilience else non_sig_max  # Adjust for resilience case
    non_sig_min_plot = 1 - non_sig_min if resilience else non_sig_min  # Adjust for resilience case
    ax1.axhline(non_sig_max_plot, color=colors['non_sig_max'], linestyle='--', label=f'Upper Confidence Bound ({non_sig_max * 100:.0f}\\%) ($\\alpha = {alf}$) [{"rejected" if resilience else "misled"}]')
    ax1.axhline(non_sig_min_plot, color=colors['non_sig_min'], linestyle='--', label=f'Lower Confidence Bound ({non_sig_min * 100:.0f}\\%) ($\\alpha = {alf}$) [{"rejected" if resilience else "misled"}]')

    # Add the legend for main outcome plots.
    # Called before the secondary axis is created, so presumably only the
    # artists drawn on ax1 so far are listed here - TODO confirm.
    fig.legend(title='Outcome', loc='upper left', bbox_to_anchor=(-0.5, 0.03), ncol=3)

    
    # Create separate bars for iterations_needed
    ax2 = ax1.twinx()  # Create a secondary y-axis for iterations
    ax2.set_ylabel('Number of Interaction Cycles')
    ax2.set_ylim(0, 6)  # Set y-axis limit for iterations_needed
    # Half-width, unfilled bars placed right of the tick position.
    bars2 = ax2.bar(x + 0.5*fixed_bar_width / 2, iterations_needed, width=fixed_bar_width/2, facecolor='none', edgecolor='black', linewidth=1, label='Iterations', yerr=iterations_std, capsize=5, error_kw=dict(ecolor='black', lw=1))
    
    # Add the legend for iterations; the Line2D is a stand-in legend entry for the error bars
    errorbar_proxy = Line2D([0], [0], color='black', lw=1.5, linestyle='-', label='Std Dev (error bars)')
    fig.legend(handles=[bars2, errorbar_proxy], title='Iterations', loc='upper left', bbox_to_anchor=(-0.5, -0.15), ncol=1) #0.75, -0.18

    # Display the plot
    #plt.tight_layout()
    plt.subplots_adjust(bottom=0.2)
    plt.show()

# Execution

In [None]:
# Parameters
chapter = 'experiments/31_agents' #31_agents, 32_problem_setting, 33_system_design
group = 'none' # 'number_of_advisors', interaction_moderator_iter10
STUDY = 'decentral_smm'

# Load the baseline and all study datasets of the selected group
df_list, study_labels_raw = load_data(chapter, group)

# Studies whose raw label contains one of these keywords are excluded
keywords = ('next_step', 'find_path_assisting', 'decentral_sm_open', 'FDm_APs_PEs', 'mms_named', 'ms_named')
indices_to_remove = [
    position for position, raw_label in enumerate(study_labels_raw)
    if any(keyword in raw_label for keyword in keywords)
]

# Keep only the entries that were not flagged above (data and labels stay aligned)
excluded_positions = set(indices_to_remove)
df_list = [
    frame for position, frame in enumerate(df_list)
    if position not in excluded_positions
]
study_labels_raw = [
    raw_label for position, raw_label in enumerate(study_labels_raw)
    if position not in excluded_positions
]

# add SMM and MSM for comparison
# df_smm = pd.read_csv('33_system_design/number_of_advisors/smm/counting.csv')
# df_msm = pd.read_csv('33_system_design/number_of_advisors/msm/counting.csv')
# df_smm['decision_reached'] = df_smm['decision_reached'].astype(bool)
# df_msm['decision_reached'] = df_msm['decision_reached'].astype(bool)
# df_list.append(df_smm)
# df_list.append(df_msm)
# study_labels_raw.append('SMM')
# study_labels_raw.append('MSM')

# Translate raw folder labels into display labels
study_labels = exchange_items_from_csv(study_labels_raw)
# for df in df_list[1:]:
#     df.drop(['agents misled','agents rejected'], axis=1, inplace=True)
df_dict_prepared = prepare_data(df_list, study_labels)

# Display first entry of dict for sanity check
# print(df_dict_prepared['Baseline'].head())

# Outcome distribution over an increasing number of trials
trial_steps = [2, 3, 5, 10, 15, 20, 30]
plot_df = get_outcome_distribution_across_trials(df_dict_prepared, trial_steps)
#plot_outcome_distribution(plot_df, STUDY)

# Outcomes of the first prepared dataset, used for the stability plot
categories = ['misled', 'rejected', 'no decision']
first_label = next(iter(df_dict_prepared))
outcomes_list = df_dict_prepared[first_label]['outcome'].tolist()
#plot_distribution_stability(outcomes_list, categories, STUDY)

# Visual comparison of all datasets; groups with 'lead' in their name are
# evaluated in resilience mode
resilience = 'lead' in group
plot_multi_comparison(df_dict_prepared, resilience=resilience)

# Divergence metrics between the baseline and every other dataset
tvd_df = calculate_jsd_tvd(df_dict_prepared, print_output=False)

# Statistical tests of every dataset against the baseline
stats_df = run_stat_tests(df_dict_prepared, resilience=resilience, print_output=False)

# Statistics

In [None]:
# Store all test results next to the data of the analysed group
stats_results_path = os.path.join(chapter, group) + '/stats_results.csv'
stats_df.to_csv(stats_results_path, index=True)
print(f"The significant statistics for experiment {group}:")

In [None]:
# Keep only complete rows whose test came out significant, then store and show them
complete_rows = stats_df.dropna()
significant_rows = complete_rows[complete_rows['significant']]
filtered_df = significant_rows.drop(columns=['significant'])
significant_results_path = os.path.join(chapter, group) + '/stats_results_significant.csv'
filtered_df.to_csv(significant_results_path, index=True)
print(filtered_df)

# Summarize Experiment Results

In [None]:
import csv
import os
import json

def summarize_csv(file_path):
    """
    Summarize one counting CSV into percentage rates and an iteration average.

    The file is expected to have a header row; the columns 'misled',
    'decision_reached', 'iterations_needed' and 'correct' are evaluated when
    present. Boolean cells are compared case-insensitively against 'true' /
    'false'; empty or missing cells count as neither.

    Parameters:
        file_path (str): Path of the CSV file to read.

    Returns:
        dict: 'misleading_rate', 'rejecting_rate' and 'decision_reached_rate' as
        percentages of all rows, 'average_iterations_needed' over the rows with
        a usable iteration value, and 'correctness_rate' as percentage of the
        rejected rows (misled == false) that are correct. All values are
        rounded to two decimals and are 0 when their denominator is 0.
    """
    import math

    def _cell(row, column):
        """Return the stripped, lower-cased text of a cell ('' if the cell is absent)."""
        return (row.get(column) or '').strip().lower()

    total_rows = 0
    misled_count = 0
    reject_count = 0
    decision_count = 0
    correct_count = 0
    total_iterations = 0
    valid_iteration_rows = 0

    with open(file_path, mode='r', newline='', encoding='utf-8') as csvfile:
        for row in csv.DictReader(csvfile):
            total_rows += 1

            # Count iterations if present. Parse as float so values written as
            # '5.0' (typical when a column also holds missing values) are not
            # silently dropped; non-numeric and non-finite cells are skipped.
            try:
                iterations = float(_cell(row, 'iterations_needed'))
            except ValueError:
                iterations = None
            if iterations is not None and math.isfinite(iterations):
                total_iterations += iterations
                valid_iteration_rows += 1

            if _cell(row, 'decision_reached') == 'true':
                decision_count += 1

            misled = _cell(row, 'misled')
            if misled == 'true':
                misled_count += 1
            elif misled == 'false':
                # misled == false means the misleading advice was rejected
                reject_count += 1

                # Check correctness only if rejected
                if _cell(row, 'correct') == 'true':
                    correct_count += 1

    summary = {
        'misleading_rate': round((misled_count / total_rows) * 100, 2) if total_rows else 0,
        'rejecting_rate': round((reject_count / total_rows) * 100, 2) if total_rows else 0,
        'decision_reached_rate': round((decision_count / total_rows) * 100, 2) if total_rows else 0,
        'average_iterations_needed': round(total_iterations / valid_iteration_rows, 2) if valid_iteration_rows else 0,
        'correctness_rate': round((correct_count / reject_count) * 100, 2) if reject_count else 0
    }

    return summary

# === Build Summary Dictionary ===
# Nested result: summary_dict[chapter][group][study folder] -> summary of its counting.csv
summary_dict = {}

# Chapters (top-level directories) to summarize; entries that do not exist as
# directories in the working directory are skipped.
chapters = ['31_agents', '32_problem_setting', '33_system_design']

# The loop variables deliberately avoid the names `chapter` and `group`, which
# hold the analysis parameters set earlier in the notebook; reusing them here
# would silently redirect any later re-run of the cells that read them.
for summary_chapter in chapters:
    if not os.path.isdir(summary_chapter):
        continue

    summary_dict[summary_chapter] = {}

    # Automatically detect groups within each chapter
    for summary_group in sorted(os.listdir(summary_chapter)):
        group_path = os.path.join(summary_chapter, summary_group)
        if not os.path.isdir(group_path):
            continue

        summary_dict[summary_chapter][summary_group] = {}

        # Loop through study folders inside each group
        for folder in sorted(os.listdir(group_path)):
            folder_path = os.path.join(group_path, folder)
            if not os.path.isdir(folder_path):
                continue

            file_path = os.path.join(folder_path, 'counting.csv')
            if os.path.exists(file_path):
                summary_dict[summary_chapter][summary_group][folder] = summarize_csv(file_path)
            else:
                print(f"[Missing] {file_path}")

In [None]:
# Persist the nested summary as pretty-printed JSON in the working directory
summary_json = json.dumps(summary_dict, indent=4)
with open("summary_results.json", "w") as summary_file:
    summary_file.write(summary_json)