# Notebook for visual evaluation of benchmark results

In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Result directories; each one is expected to contain a
# "merged_coverage_reports.csv" file that the preprocessing step reads.
JSON_DIRS: list[str] = [
    "../scripts/json_coverage_TheAlgorithm/",
    "../scripts/json_coverage_examples/",
]

## Preprocess results

In [None]:
# Load the merged coverage report of every result directory, normalise the
# file names / folder labels / strategy names, and stack everything into one
# DataFrame.
dfs: list[pd.DataFrame] = []
for json_dir in JSON_DIRS:
    print(f"Reading: {json_dir}")
    # os.path.join works whether or not the directory ends with a separator.
    df = pd.read_csv(os.path.join(json_dir, "merged_coverage_reports.csv"))
    # Identify common prefix in file_name.
    # os.path.commonprefix compares character by character, so its result may
    # end in the middle of a path component (e.g. "src/s" for "src/sorts/a.py"
    # and "src/search/b.py"). Trim it back to the last separator so that only
    # whole leading directories are removed and names are never truncated.
    common_prefix = os.path.commonprefix(df['file_name'].to_list())
    partial_component = os.path.basename(common_prefix)
    if partial_component:
        common_prefix = common_prefix[:-len(partial_component)]
    print(f"Common prefix: {common_prefix}")
    # Remove the prefix
    df['file_name'] = df['file_name'].str[len(common_prefix):]
    # Create column for folder structure
    df['folder'] = df['file_name'].apply(os.path.dirname)
    if (df['folder'].str.strip() == '').all():
        # No file sits in a sub-folder: label every row with the part of the
        # directory name after its last underscore. Trailing separators are
        # stripped first so the label does not depend on how the path was written.
        df['folder'] = json_dir.rstrip("/\\").split("_")[-1]
    # Fill NaN strategy with "None"
    df['strategy'] = df['strategy'].fillna('None')
    print(f"Total Length: {len(df)} - Files Used: {df.file_name.nunique()} - Strategies Used: {df.strategy.nunique()} - Seeds Used {df.seed.nunique()}")
    dfs.append(df)

# Stack all reports row-wise with a fresh 0..n-1 index.
merged_df = pd.concat(dfs, axis=0, ignore_index=True)
merged_df

## Create plots

In [None]:
def plot_coverage_barplot(df, error_type='std'):
    """
    Plots average coverage per strategy per folder with error bars.

    The mean of 'summary_percent_covered' is computed for every
    (folder, strategy) pair and drawn as one bar of a grouped bar chart; the
    mean is printed above each bar. The figure is shown with plt.show() and
    nothing is returned.

    Parameters:
    - df: DataFrame with at least the columns
      ['strategy', 'summary_percent_covered', 'folder']
    - error_type: 'std' (standard deviation) or 'minmax' (min/max range)

    Raises:
    - ValueError: if error_type is neither 'std' nor 'minmax', or if df
      yields no (folder, strategy) group to plot.
    """
    # Validate up front instead of failing halfway through drawing.
    if error_type not in ('std', 'minmax'):
        raise ValueError("error_type must be 'std' or 'minmax'")

    # Group by folder and strategy
    grouped = (
        df.groupby(['folder', 'strategy'])['summary_percent_covered']
        .agg(['mean', 'std', 'min', 'max'])
        .reset_index()
    )

    folders = grouped['folder'].unique()
    strategies = grouped['strategy'].unique()
    if len(strategies) == 0:
        raise ValueError("df contains no (folder, strategy) groups to plot")

    # Set up bar positions
    bar_width = 0.8 / len(strategies)
    x = np.arange(len(folders))

    _, ax = plt.subplots(figsize=(14, 7))

    # Two fixed blues for the usual two-strategy comparison; fall back to a
    # qualitative colormap when there are more strategies than fixed colors.
    colors = ['#1f77b4', '#4fa5d5']
    if len(strategies) > len(colors):
        cmap = plt.get_cmap('tab10')
        colors = [cmap(k % cmap.N) for k in range(len(strategies))]

    for i, strategy in enumerate(strategies):
        # Align this strategy's rows with the full folder list; folders in
        # which the strategy never ran become NaN (no bar is drawn for them).
        data = (
            grouped[grouped['strategy'] == strategy]
            .set_index('folder')
            .reindex(folders)
        )
        means = data['mean'].to_numpy()
        if error_type == 'std':
            upper_errors = data['std'].to_numpy()
            errors = upper_errors
        else:
            # Rounding can make the mean of identical values differ from the
            # min/max by a few ulps; clip so no error length is negative,
            # which matplotlib rejects.
            lower_errors = np.clip(means - data['min'].to_numpy(), 0, None)
            upper_errors = np.clip(data['max'].to_numpy() - means, 0, None)
            errors = [lower_errors, upper_errors]

        positions = x + i * bar_width

        bars = ax.bar(
            positions, means, bar_width,
            yerr=errors, capsize=5,
            label=strategy, color=colors[i]
        )

        # Annotate mean value on top of bars
        for bar, upper_error in zip(bars, upper_errors):
            height = bar.get_height()
            if np.isnan(height):
                # Missing (folder, strategy) combination: nothing to label.
                continue
            # Centre the label when the upper error is zero; otherwise anchor
            # it at the left quarter of the bar (presumably so the text does
            # not sit on top of the error bar).
            x_divisor = 2 if upper_error == 0 else 4
            ax.annotate(f'{height:.1f}%',
                        xy=(bar.get_x() + bar.get_width() / x_divisor, height),
                        xytext=(0, 5), textcoords="offset points",
                        ha='center', va='bottom', fontsize=9, color='black')

    # Configure x-axis: one tick per folder, centred under its group of bars
    ax.set_xticks(x + bar_width * (len(strategies) - 1) / 2)
    ax.set_xticklabels(folders, rotation=45, ha='right', fontsize=10)

    # Labels and title
    ax.set_ylabel('Average Coverage (%)', fontsize=12)
    ax.set_title('Average Coverage per Strategy per Function Type', fontsize=14, weight='bold')
    ax.grid(axis='y', linestyle='--', alpha=0.7)

    # Dynamic y-limits with 15% headroom for the labels; skipped when the
    # largest value is zero or NaN, where the limits would be degenerate.
    y_max = grouped['max'].max()
    if y_max > 0:
        ax.set_ylim(0, y_max * 1.15)

    # Legend
    ax.legend(title='Strategy', fontsize=10, title_fontsize=11)

    plt.tight_layout()
    plt.show()

# Draw the grouped coverage bars for all loaded reports, using the min/max
# range of each (folder, strategy) group as error bars.
plot_coverage_barplot(merged_df, error_type="minmax")

## Evaluate performance

In [None]:
from scipy.stats import wilcoxon

def compare_strategies_wilcoxon(df, strategy_a, strategy_b,
                                 metric_col='summary_percent_covered',
                                 id_cols=None):
    """
    Paired Wilcoxon signed-rank test between two strategies,
    matched by the id columns (file_name and seed by default).

    Parameters:
    - df: DataFrame with a 'strategy' column, the id columns and metric_col
    - strategy_a, strategy_b: values of the 'strategy' column to compare
    - metric_col: column holding the metric to compare
    - id_cols: column(s) identifying one paired observation;
      None means ['file_name', 'seed']

    Returns:
    - dict with the strategy names, the Wilcoxon statistic, the two-sided
      p-value, the number of pairs 'n' and the win/tie/loss counts.
      Statistic and p-value are NaN when no pair has a non-zero difference
      (including the case of no pairs at all), because the test is undefined
      there.

    Raises:
    - KeyError: if one of the strategies has no rows in df.
    """
    # Build the default here rather than in the signature so that no mutable
    # list object is shared between calls.
    index_cols = ['file_name', 'seed'] if id_cols is None else id_cols

    # Pivot to align each id (file_name, seed) with both strategies.
    # pivot_table averages metric values that share the same id and strategy.
    pivot = df[df['strategy'].isin([strategy_a, strategy_b])].pivot_table(
        index=index_cols,
        columns='strategy',
        values=metric_col
    )

    missing = [s for s in (strategy_a, strategy_b) if s not in pivot.columns]
    if missing:
        raise KeyError(f"no rows found for strategy: {missing}")

    # Keep only ids for which both strategies have a value.
    pivot = pivot.dropna()

    # Extract values for paired test
    a_vals = pivot[strategy_a].to_numpy()
    b_vals = pivot[strategy_b].to_numpy()

    # Count how many times A > B, A < B, A == B
    diffs = a_vals - b_vals
    wins = int((diffs > 0).sum())
    losses = int((diffs < 0).sum())
    ties = int((diffs == 0).sum())

    if wins + losses > 0:
        # Perform Wilcoxon test (alternative='two-sided' by default)
        stat, p = wilcoxon(a_vals, b_vals)
    else:
        # All differences are zero (or there are no pairs): the signed-rank
        # test cannot be computed, so report NaN instead of failing.
        stat = p = float('nan')

    return {
        'strategy_a': strategy_a,
        'strategy_b': strategy_b,
        'wilcoxon_statistic': float(stat),
        'p_value': float(p),
        'n': len(diffs),
        'a > b': wins,
        'a < b': losses,
        'a == b': ties
    }

# Compare the "tree_traverse" strategy against the rows whose strategy is the
# string "None" (the label given to missing strategies during preprocessing).
wilcoxon_res = compare_strategies_wilcoxon(merged_df, strategy_a="tree_traverse", strategy_b="None")
# Bare expression so the notebook cell displays the result dictionary.
wilcoxon_res

In [None]:
def plot_wilcoxon_outcome_bar(result_dict):
    """
    Visualizes Wilcoxon signed-rank test win/loss/tie results as horizontal bars,
    styled to match plot_coverage_barplot().

    One bar is drawn for each of the three outcomes (A > B, A < B, A = B) and
    labelled with its count. The figure is shown with plt.show() and nothing
    is returned.

    Parameters:
    - result_dict: output from compare_strategies_wilcoxon(); the keys
      'strategy_a', 'strategy_b', 'a > b', 'a < b' and 'a == b' are read.
    """
    strategy_a = result_dict['strategy_a']
    strategy_b = result_dict['strategy_b']

    labels = [
        f"{strategy_a} > {strategy_b}",
        f"{strategy_a} < {strategy_b}",
        f"{strategy_a} = {strategy_b}"
    ]
    counts = [result_dict['a > b'], result_dict['a < b'], result_dict['a == b']]
    colors = ['#1f77b4', '#4fa5d5', '#c6dfee']  # match blue palette with softer tie color

    _, ax = plt.subplots(figsize=(10, 4))

    bars = ax.barh(labels, counts, color=colors)

    # Annotate count on each bar. The gap is given in points rather than in
    # data units so that it looks the same whether the counts are in the
    # single digits or in the thousands.
    for bar in bars:
        width = bar.get_width()
        ax.annotate(f'{int(width)}',
                    xy=(width, bar.get_y() + bar.get_height() / 2),
                    xytext=(5, 0), textcoords="offset points",
                    ha='left', va='center', fontsize=10)

    ax.set_xlabel('Number of Cases', fontsize=12)
    ax.set_title('Wilcoxon Test Outcome Summary', fontsize=14, weight='bold', pad=10)
    ax.grid(axis='x', linestyle='--', alpha=0.7)

    # Clean y-ticks and frame
    ax.tick_params(axis='y', labelsize=10)
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)

    plt.tight_layout()
    plt.show()

# Show the win/loss/tie counts of the comparison stored in wilcoxon_res.
plot_wilcoxon_outcome_bar(wilcoxon_res)