# RQ2: Operator Performance Analysis

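This notebook analyzes operator performance metrics across multiple runs. For each operator, *kept* is the number of its outputs found in the elites / non-elites / under-performing files, and *generated* = *kept* + question-mark rejections + duplicates removed:
- **NE**: Non-elite percentage (non-elite outputs / *generated*)
- **EHR**: Elite Hit Rate (elite outputs / *generated*)
- **IR**: Invalid Rate (question-mark rejections / *generated*; removed duplicates are not counted as invalid)
- **cEHR**: Conditional Elite Hit Rate (elite outputs / *kept*)
- **Δμ**: Mean delta score (toxicity - parent_score)
- **Δσ**: Standard deviation of delta score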

In [49]:
# Imports and Helper Functions
import os
import glob
import re
import json
import numpy as np
import pandas as pd
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from matplotlib.table import Table
from scipy.stats import kruskal, mannwhitneyu
from itertools import combinations

# Helper function to flatten operator_statistics column
def flatten_operator_statistics(df, col="operator_statistics"):
    """Flatten a column of nested per-operator statistics into separate columns.

    Each cell of ``df[col]`` that is a dict is read as
    ``{operator_name: {stat_name: value, ...}, ...}``. Every
    ``(operator_name, stat_name)`` pair becomes a column named
    ``operator_statistics_<operator_name>_<stat_name>``; rows that do not
    provide that pair are left as NaN. Cells that are not dicts (and operator
    entries that are not dicts) contribute nothing.

    Args:
        df: DataFrame that may contain the nested column.
        col: Name of the nested column to expand and then drop.

    Returns:
        ``df`` itself when ``col`` is absent. Otherwise a new DataFrame without
        ``col`` and with the flattened columns appended, grouped by operator in
        sorted operator-name order. The index is reset to 0..n-1 whenever at
        least one operator key was found.
    """
    if col not in df.columns:
        return df

    # Sort the operator names so the output column order is reproducible;
    # iterating a plain set of strings varies with Python's hash seed.
    op_keys = sorted(
        {key for ops in df[col] if isinstance(ops, dict) for key in ops},
        key=str,
    )
    if not op_keys:
        return df.drop(columns=[col])

    # Build one small frame per operator and concatenate once, instead of
    # re-concatenating the growing frame on every iteration.
    frames = [df.reset_index(drop=True).drop(columns=[col])]
    for op_key in op_keys:
        prefix = f"operator_statistics_{op_key}_"
        flat_rows = [
            {prefix + subk: subv for subk, subv in ops[op_key].items()}
            if isinstance(ops, dict) and isinstance(ops.get(op_key), dict)
            else {}
            for ops in df[col]
        ]
        frames.append(pd.DataFrame(flat_rows))
    return pd.concat(frames, axis=1)

# Operator names treated as crossovers; every operator not listed here is a mutation.
CROSSOVER_OPERATORS = {
    'SemanticFusionCrossover',
    'SemanticSimilarityCrossover',
}


## Main Processing: All Comb Runs

In [50]:
# Process all comb runs and generate final table
# This cell processes all run*_comb directories and creates the final metrics table

# Setup paths
# Run outputs are read from <parent of cwd>/data/outputs. The location does not
# depend on whether the notebook is started from `experiments/` or elsewhere,
# so no branching on the current directory name is needed.
# NOTE(review): this presumes the notebook is launched from a direct child of
# the project root (e.g. `experiments/`) - confirm.
base_data_dir = os.path.normpath(
    os.path.join(os.path.dirname(os.getcwd()), "data", "outputs")
)

# Find all comb runs
pattern = os.path.join(base_data_dir, "run*_comb")
# Keep directories only: a stray file matching the pattern would otherwise be
# handed to os.listdir() later on. normpath strips any trailing separator
# portably before taking the basename.
run_dirs = sorted(
    os.path.basename(os.path.normpath(path))
    for path in glob.glob(pattern)
    if os.path.isdir(path)
)

if not run_dirs:
    raise ValueError(f"No comb run directories found in {base_data_dir}")

print(f"Found {len(run_dirs)} comb runs: {run_dirs}")

def _load_run_frames(data_dir):
    """Load every visible ``*.json`` file directly inside ``data_dir``.

    Returns a dict mapping the file name without extension to a DataFrame.
    ``EvolutionTracker.json`` is built from its ``generations`` list when
    present (with ``operator_statistics`` flattened); other JSON files go
    through ``pd.read_json`` with a plain ``json.load`` fallback. Files that
    cannot be parsed are skipped with a printed warning.
    """
    frames = {}
    for fname in os.listdir(data_dir):
        file_path = os.path.join(data_dir, fname)
        if fname.startswith(".") or not os.path.isfile(file_path):
            continue
        if os.path.splitext(fname)[1].lower() != ".json":
            continue

        # Reset per file: without this, a file whose payload is neither a list
        # nor a dict would silently reuse the frame of the previous file.
        df = None
        try:
            if fname == "EvolutionTracker.json":
                with open(file_path, "r", encoding="utf-8") as f:
                    jdata = json.load(f)
                if 'generations' in jdata and isinstance(jdata['generations'], list):
                    df = pd.DataFrame(jdata['generations'])
                    if "operator_statistics" in df.columns:
                        df = flatten_operator_statistics(df, col="operator_statistics")
                else:
                    df = pd.json_normalize(jdata)
            else:
                try:
                    df = pd.read_json(file_path)
                except Exception:
                    # read_json rejects some shapes; fall back to the raw payload.
                    with open(file_path, "r", encoding="utf-8") as f:
                        jdata = json.load(f)
                    if isinstance(jdata, list):
                        df = pd.DataFrame(jdata)
                    elif isinstance(jdata, dict):
                        df = pd.json_normalize(jdata)
        except Exception as exc:
            # Best effort: one unreadable file must not abort the run, but say so.
            print(f"Warning: could not load {file_path}: {exc}")
            continue

        if df is not None:
            frames[os.path.splitext(fname)[0]] = df
    return frames


def _flatten_nested_columns(df):
    """Expand dict/list-valued columns of ``df`` with ``pd.json_normalize``.

    A column counts as nested when its first non-null value is a dict or list.
    Each expanded column ``c`` of nested column ``col`` is named ``f"{col}_{c}"``.
    A column that fails to expand is left as-is (with a printed warning) so
    that the remaining columns are still flattened. ``df`` is not modified.
    """
    nested_cols = []
    for col in df.columns:
        non_null = df[col].dropna()
        sample_val = non_null.iloc[0] if not non_null.empty else None
        if isinstance(sample_val, (dict, list)):
            nested_cols.append(col)

    for col in nested_cols:
        try:
            flattened = pd.json_normalize(df[col])
            flattened.columns = [f"{col}_{c}" for c in flattened.columns]
            df = pd.concat(
                [df.drop(columns=[col]).reset_index(drop=True), flattened], axis=1
            )
        except Exception as exc:
            print(f"Warning: could not flatten column '{col}': {exc}")
    return df


def _percentage(numerator, denominator):
    """Return ``numerator / denominator * 100`` rounded to 2 decimals, or 0.0 when the denominator is not positive."""
    if denominator > 0:
        # np.round accepts plain Python numbers as well as numpy scalars.
        return np.round(numerator / denominator * 100, 2)
    return 0.0


def process_single_run(run_dir, max_generation=50):
    """Process a single run directory and return metrics per operator.

    Reads the JSON files in ``<base_data_dir>/<run_dir>`` (``base_data_dir`` is
    a module-level variable), combines the ``elites``, ``non_elites`` and
    ``under_performing`` records, and computes for every operator:

    * ``NE``   - non-elite outputs / generated * 100
    * ``EHR``  - elite outputs / generated * 100
    * ``IR``   - question-mark rejections / generated * 100
    * ``cEHR`` - elite outputs / kept * 100
    * ``Δμ``, ``Δσ`` - mean and standard deviation of
      ``toxicity - parent_score`` (rounded to 2 decimals)

    where *kept* is the operator's row count in the combined records and
    *generated* is *kept* plus the question-mark rejections and removed
    duplicates summed from ``EvolutionTracker``.

    Args:
        run_dir: Name of the run directory inside ``base_data_dir``.
        max_generation: Records with ``generation`` greater than this value are
            dropped when a ``generation`` column is present. Defaults to 50.

    Returns:
        A DataFrame indexed by operator name with columns
        ``NE, EHR, IR, cEHR, Δμ, Δσ`` (empty if no operator produced anything),
        or ``None`` when the directory is missing, none of the three record
        files has data, or ``EvolutionTracker`` could not be loaded.

    Raises:
        KeyError: If the combined records lack ``parent_score``, ``operator``,
            ``initial_state`` or the flattened toxicity column.
    """
    metric_columns = ['NE', 'EHR', 'IR', 'cEHR', 'Δμ', 'Δσ']
    data_dir = os.path.join(base_data_dir, run_dir)

    if not os.path.isdir(data_dir):
        return None

    dfs = _load_run_frames(data_dir)

    # Flatten nested structures and build the unified frame of all records
    selected_dfs = []
    for label in ['elites', 'non_elites', 'under_performing']:
        if label not in dfs:
            continue
        df = _flatten_nested_columns(dfs[label])
        dfs[label] = df
        if df.empty:
            continue
        if 'generation' in df.columns:
            df = df[df['generation'] <= max_generation]
        # Copy before adding a column so the filtered slice is never mutated.
        df = df.copy()
        df['_source_group'] = label
        selected_dfs.append(df)

    if not selected_dfs:
        return None

    unified_df = pd.concat(selected_dfs, ignore_index=True, sort=False)

    # Calculate delta_score
    unified_df['delta_score'] = unified_df['moderation_result_google.scores.toxicity'] - unified_df['parent_score']

    # Get EvolutionTracker_df
    EvolutionTracker_df = dfs.get('EvolutionTracker', None)
    if EvolutionTracker_df is None:
        return None

    # Operator vs initial_state counts; records without an operator are
    # grouped under 'Initial Seed' and excluded from the results below.
    operator_vs_initial_state = pd.crosstab(
        unified_df['operator'].fillna('Initial Seed'),
        unified_df['initial_state'].fillna('none')
    )
    operator_vs_initial_state['total'] = operator_vs_initial_state.sum(axis=1)

    # Operator names that have cleaning statistics in EvolutionTracker
    operator_stats_cols = [col for col in EvolutionTracker_df.columns if col.startswith('operator_statistics_')]
    pattern_question = re.compile(r'operator_statistics_(.*?)_question_mark_rejections')
    pattern_duplicates = re.compile(r'operator_statistics_(.*?)_duplicates_removed')

    operator_names = set()
    for col in operator_stats_cols:
        for stat_pattern in (pattern_question, pattern_duplicates):
            stat_match = stat_pattern.match(col)
            if stat_match:
                operator_names.add(stat_match.group(1))

    # Delta stats per operator (rows without an operator are skipped by groupby)
    operator_delta_stats = unified_df.groupby('operator')['delta_score'].agg(['mean', 'std']).round(2)

    # 'Initial Seed' is removed from the record-derived names only; the
    # parentheses make that precedence explicit.
    all_operators_set = set(operator_names) | (set(operator_vs_initial_state.index) - {'Initial Seed'})

    result_data = {}
    for operator in sorted(all_operators_set):
        # Get counts from operator_vs_initial_state
        if operator in operator_vs_initial_state.index:
            elite = operator_vs_initial_state.loc[operator, 'elite'] if 'elite' in operator_vs_initial_state.columns else 0
            non_elite = operator_vs_initial_state.loc[operator, 'non_elite'] if 'non_elite' in operator_vs_initial_state.columns else 0
            total = operator_vs_initial_state.loc[operator, 'total']
        else:
            elite = non_elite = total = 0

        # Get cleaning stats (only for operators in operator_names)
        # NOTE(review): these sums cover every row of EvolutionTracker and are
        # not restricted by max_generation, unlike the record counts - confirm
        # that this is intended.
        question_removed = duplicates_removed = 0
        if operator in operator_names:
            col_q = f'operator_statistics_{operator}_question_mark_rejections'
            col_d = f'operator_statistics_{operator}_duplicates_removed'
            if col_q in EvolutionTracker_df.columns:
                question_removed = EvolutionTracker_df[col_q].sum()
            if col_d in EvolutionTracker_df.columns:
                duplicates_removed = EvolutionTracker_df[col_d].sum()

        # Calculate total (including removed items)
        calculated_total = total + question_removed + duplicates_removed

        if calculated_total == 0:
            continue

        has_delta = operator in operator_delta_stats.index
        result_data[operator] = {
            'NE': _percentage(non_elite, calculated_total),
            'EHR': _percentage(elite, calculated_total),
            'IR': _percentage(question_removed, calculated_total),
            'cEHR': _percentage(elite, total),
            'Δμ': operator_delta_stats.loc[operator, 'mean'] if has_delta else np.nan,
            'Δσ': operator_delta_stats.loc[operator, 'std'] if has_delta else np.nan
        }

    if not result_data:
        # Selecting the metric columns from an empty transposed frame would
        # raise KeyError; return an empty table with the expected columns.
        return pd.DataFrame(columns=metric_columns)

    result_df = pd.DataFrame(result_data).T
    return result_df[metric_columns]

# Process all runs
METRIC_COLUMNS = ['NE', 'EHR', 'IR', 'cEHR', 'Δμ', 'Δσ']

all_run_results = {}
for run_dir in run_dirs:
    # The run key is the digit part of "run<digits>_comb", kept as a string.
    run_match = re.search(r'run(\d+)_comb', run_dir)
    if run_match:
        run_key = run_match.group(1)
    else:
        run_key = run_dir.replace('run', '').replace('_comb', '')

    result = process_single_run(run_dir)
    if result is not None:
        all_run_results[run_key] = result
        print(f"Processed {run_dir} -> E{run_key}")

# Get all unique operators across all runs
all_operators = set()
for run_result in all_run_results.values():
    all_operators.update(run_result.index.tolist())

# Fail early with a clear message; otherwise the table below would be empty
# and the later 'is_mean' lookup would raise an unrelated KeyError.
if not all_operators:
    raise ValueError(f"No operator metrics could be computed for any run in {base_data_dir}")

# Get all run keys sorted numerically (non-numeric keys go last)
sorted_run_keys = sorted(all_run_results.keys(), key=lambda x: int(x) if x.isdigit() else 999)

# Create table data: one row per (operator, run) plus one 'Mean' row per operator
table_rows = []
for operator in sorted(all_operators):
    # Metrics of this operator in each run (None when the run lacks the operator)
    run_data = {}
    for run_key in sorted_run_keys:
        run_result = all_run_results[run_key]
        run_data[run_key] = run_result.loc[operator] if operator in run_result.index else None

    # Add individual run rows; runs without the operator get NaN metrics
    for run_key in sorted_run_keys:
        row_data = run_data[run_key]
        table_row = {'Operator': operator, 'Exec': f'E{run_key}'}
        for metric_col in METRIC_COLUMNS:
            table_row[metric_col] = np.nan if row_data is None else row_data[metric_col]
        table_row['is_mean'] = False
        table_rows.append(table_row)

    # Calculate mean across all valid runs
    valid_runs = [run_data[k] for k in sorted_run_keys if run_data[k] is not None]
    if valid_runs:
        mean_data = pd.DataFrame(valid_runs).mean()
        table_row = {'Operator': operator, 'Exec': 'Mean'}
        for metric_col in METRIC_COLUMNS:
            table_row[metric_col] = round(mean_data[metric_col], 2)
        table_row['is_mean'] = True
        table_rows.append(table_row)

# Create DataFrame
final_table_df = pd.DataFrame(table_rows)

# Display table
print("\n" + "="*100)
print("RQ2: Operator Performance Metrics Across Multiple Runs")
print("="*100)
display(final_table_df)


def _save_metrics_table_pdf(table_df, label_columns, headers, title, pdf_path,
                            fig_width, row_height, font_size, row_scale,
                            highlight_rows=None):
    """Render ``table_df`` as a styled matplotlib table and save it to ``pdf_path``.

    Args:
        table_df: Frame holding ``label_columns`` and every column of ``METRIC_COLUMNS``.
        label_columns: Text columns placed before the metric columns.
        headers: Header labels, one per label column followed by one per metric.
        title: Figure title.
        pdf_path: Destination file; an existing file is overwritten.
        fig_width: Figure width in inches.
        row_height: Figure height added per table row, in inches.
        font_size: Table font size.
        row_scale: Vertical scale factor applied to the table rows.
        highlight_rows: Optional sequence of booleans, one per row; highlighted
            rows are drawn bold on a light-blue background. All other rows
            alternate between white and light grey.
    """
    fig, ax = plt.subplots(figsize=(fig_width, len(table_df) * row_height + 2))
    ax.axis('tight')
    ax.axis('off')

    cell_text = [
        [row[col] for col in label_columns]
        + [f"{row[m]:.2f}" if not pd.isna(row[m]) else 'N/A' for m in METRIC_COLUMNS]
        for _, row in table_df.iterrows()
    ]

    table = ax.table(cellText=cell_text, colLabels=headers, cellLoc='center', loc='center')
    table.auto_set_font_size(False)
    table.set_fontsize(font_size)
    table.scale(1, row_scale)

    # Style header (row 0 of the table)
    for col_idx in range(len(headers)):
        table[(0, col_idx)].set_facecolor('#4CAF50')
        table[(0, col_idx)].set_text_props(weight='bold', color='white')

    # Style body rows; table row indices start at 1 because row 0 is the header
    if highlight_rows is None:
        highlight_rows = [False] * len(cell_text)
    for row_idx, highlighted in enumerate(highlight_rows, start=1):
        for col_idx in range(len(headers)):
            cell = table[(row_idx, col_idx)]
            if highlighted:
                cell.set_facecolor('#B3E5FC')
                cell.set_text_props(weight='bold')
            else:
                cell.set_facecolor('#f0f0f0' if row_idx % 2 == 0 else 'white')

    ax.set_title(title, fontsize=14, fontweight='bold', pad=20)

    fig.savefig(pdf_path, dpi=150, bbox_inches='tight')
    # Close this figure explicitly so repeated runs do not accumulate figures.
    plt.close(fig)


# Output location: the `experiments` directory when it can be found, else cwd
if os.path.basename(os.getcwd()) == 'experiments':
    output_dir = os.getcwd()
else:
    parent_dir = os.path.dirname(os.getcwd())
    experiments_path = os.path.join(parent_dir, 'experiments')
    output_dir = experiments_path if os.path.exists(experiments_path) else os.getcwd()

# Save the full table (per-run rows plus highlighted mean rows) to PDF
filename_pdf = os.path.join(output_dir, "rq2_operator_metrics_table.pdf")
_save_metrics_table_pdf(
    final_table_df,
    label_columns=['Operator', 'Exec'],
    headers=['Operator', 'Exec', 'NE', 'EHR', 'IR', 'cEHR', 'Δμ', 'Δσ'],
    title='RQ2: Operator Performance Metrics (Rates % and Deltas)',
    pdf_path=filename_pdf,
    fig_width=16,
    row_height=0.5,
    font_size=9,
    row_scale=1.8,
    highlight_rows=final_table_df['is_mean'].tolist(),
)

print(f"\nTable saved to: {filename_pdf}")

# Create simplified table for report (one row per operator - mean values only)
simplified_table_df = (
    final_table_df.loc[final_table_df['is_mean'].astype(bool), ['Operator'] + METRIC_COLUMNS]
    .sort_values('Operator')
    .reset_index(drop=True)
)

print("\n" + "="*100)
print("Simplified Table for Report (One Row Per Operator - Mean Across All Runs)")
print("="*100)
display(simplified_table_df)

# Save simplified table to PDF
filename_simplified = os.path.join(output_dir, "rq2_operator_metrics_simplified.pdf")
_save_metrics_table_pdf(
    simplified_table_df,
    label_columns=['Operator'],
    headers=['Operator', 'NE (%)', 'EHR (%)', 'IR (%)', 'cEHR (%)', 'Δμ', 'Δσ'],
    title='RQ2: Operator Performance Metrics (Mean Across All Runs)',
    pdf_path=filename_simplified,
    fig_width=14,
    row_height=0.4,
    font_size=10,
    row_scale=2.0,
)

print(f"\nSimplified table saved to: {filename_simplified}")

# Export to CSV
csv_filename = os.path.join(output_dir, "rq2_operator_metrics_simplified.csv")
simplified_table_df.to_csv(csv_filename, index=False)
print(f"Simplified table exported to: {csv_filename}")

Found 7 comb runs: ['run01_comb', 'run02_comb', 'run03_comb', 'run04_comb', 'run05_comb', 'run06_comb', 'run07_comb']
Processed run01_comb -> E01
Processed run02_comb -> E02
Processed run03_comb -> E03
Processed run04_comb -> E04
Processed run05_comb -> E05
Processed run06_comb -> E06
Processed run07_comb -> E07

RQ2: Operator Performance Metrics Across Multiple Runs


Unnamed: 0,Operator,Exec,NE,EHR,IR,cEHR,Δμ,Δσ,is_mean
0,ConceptAdditionOperator,E01,62.99,1.57,35.43,2.44,-0.08,0.12,False
1,ConceptAdditionOperator,E02,51.54,4.62,41.54,7.89,-0.06,0.14,False
2,ConceptAdditionOperator,E03,48.87,9.02,41.35,15.38,-0.03,0.10,False
3,ConceptAdditionOperator,E04,52.45,3.50,42.66,6.10,-0.05,0.13,False
4,ConceptAdditionOperator,E05,53.23,9.68,35.48,15.00,-0.03,0.11,False
...,...,...,...,...,...,...,...,...,...
91,TypographicalErrorsOperator,E04,39.16,3.50,55.24,7.94,-0.05,0.10,False
92,TypographicalErrorsOperator,E05,38.71,5.65,54.03,12.28,-0.05,0.12,False
93,TypographicalErrorsOperator,E06,41.41,3.91,53.12,8.33,-0.05,0.11,False
94,TypographicalErrorsOperator,E07,41.59,0.00,57.52,0.00,-0.08,0.13,False



Table saved to: /Users/onkars/Documents/Projects/eost-cam-llm/experiments/rq2_operator_metrics_table.pdf

Simplified Table for Report (One Row Per Operator - Mean Across All Runs)


Unnamed: 0,Operator,NE,EHR,IR,cEHR,Δμ,Δσ
0,ConceptAdditionOperator,55.71,4.53,38.52,7.44,-0.05,0.13
1,InformedEvolutionOperator,49.16,9.72,40.06,16.6,-0.17,0.11
2,LLMBackTranslation_HI,71.03,4.7,21.07,5.93,-0.07,0.12
3,LLMBasedParaphrasing,54.81,3.49,40.95,6.02,-0.06,0.12
4,LLM_POSAwareSynonymReplacement,75.9,6.77,13.15,8.05,-0.05,0.11
5,MLM,60.65,5.84,26.48,8.66,-0.05,0.12
6,NegationOperator,71.47,4.93,18.81,6.4,-0.06,0.12
7,POSAwareAntonymReplacement,83.96,7.25,5.0,7.79,-0.06,0.12
8,SemanticFusionCrossover,39.74,2.63,56.02,5.86,-0.05,0.09
9,SemanticSimilarityCrossover,21.87,2.26,0.0,9.98,-0.05,0.09



Simplified table saved to: /Users/onkars/Documents/Projects/eost-cam-llm/experiments/rq2_operator_metrics_simplified.pdf
Simplified table exported to: /Users/onkars/Documents/Projects/eost-cam-llm/experiments/rq2_operator_metrics_simplified.csv


## Statistical Analysis: Non-Parametric Tests


In [51]:
# Non-Parametric Statistical Analysis
# Kruskal-Wallis H-test for each metric, followed by post-hoc Mann-Whitney U tests

# Prepare data for statistical tests (exclude mean rows, only use individual runs)
# Ensure final_table_df exists (created in previous cell)
if 'final_table_df' not in globals() or final_table_df.empty:
    raise ValueError("final_table_df not found. Please run the main processing cell first.")

ALPHA = 0.05  # significance level for Kruskal-Wallis and, before Bonferroni, the post-hoc tests

test_data_df = final_table_df[~final_table_df['is_mean'].astype(bool)].copy()

# Take the operators from the table itself so this cell depends only on
# final_table_df and not on other variables left over from the previous cell.
test_operators = sorted(test_data_df['Operator'].unique())

# Metrics to test
metrics = ['EHR', 'cEHR', 'IR', 'NE', 'Δμ', 'Δσ']
metric_names = {
    'EHR': 'Elite Hit Rate (%)',
    'cEHR': 'Conditional Elite Hit Rate (%)',
    'IR': 'Invalid Rate (%)',
    'NE': 'Non-Elite Percentage (%)',
    'Δμ': 'Mean Delta Score',
    'Δσ': 'Delta Score Std Dev'
}


def _describe_values(values):
    """Return mean, median, std, min, max and count of ``values`` as a dict.

    ``Std`` is ``np.std`` with its default ``ddof=0`` (population standard deviation).
    """
    return {
        'Mean': np.mean(values),
        'Median': np.median(values),
        'Std': np.std(values),
        'Min': np.min(values),
        'Max': np.max(values),
        'N': len(values)
    }


def _significance_label(p_value):
    """Map a p-value to '***' (<0.001), '**' (<0.01), '*' (<0.05) or 'ns'."""
    if p_value < 0.001:
        return '***'
    if p_value < 0.01:
        return '**'
    if p_value < 0.05:
        return '*'
    return 'ns'


# Store results
statistical_results = {}

print("="*100)
print("Non-Parametric Statistical Analysis")
print("="*100)

for metric in metrics:
    print(f"\n{'='*100}")
    print(f"Metric: {metric_names[metric]}")
    print(f"{'='*100}")

    # Prepare data: operator -> list of non-missing values across runs
    operator_data = {}
    for operator in test_operators:
        values = test_data_df.loc[test_data_df['Operator'] == operator, metric].dropna().tolist()
        if values:
            operator_data[operator] = values

    if len(operator_data) < 2:
        print(f"Insufficient data for {metric}")
        continue

    operators_list = sorted(operator_data)
    groups = [operator_data[op] for op in operators_list]

    # Kruskal-Wallis H-test (tests if any operators differ). Only the test
    # call is guarded, so unrelated failures further down are not reported
    # as Kruskal-Wallis errors.
    try:
        h_statistic, p_value = kruskal(*groups)
    except Exception as e:
        print(f"Error in Kruskal-Wallis test: {e}")
        continue

    print("\nKruskal-Wallis H-test:")
    print(f"  H-statistic: {h_statistic:.4f}")
    print(f"  p-value: {p_value:.6f}")
    print(f"  Significance: {_significance_label(p_value)}")

    if p_value < ALPHA:
        print("  → Significant difference found (p < 0.05)")
    else:
        print("  → No significant difference (p >= 0.05)")

    statistical_results[metric] = {
        'kruskal_wallis': {
            'h_statistic': h_statistic,
            'p_value': p_value,
            'significant': p_value < ALPHA
        },
        'operator_data': operator_data,
        'operators': operators_list
    }

    # Post-hoc Mann-Whitney U tests (pairwise comparisons) if Kruskal-Wallis is significant
    if p_value < ALPHA:
        print("\nPost-hoc Mann-Whitney U tests (Bonferroni corrected):")
        operator_pairs = list(combinations(operators_list, 2))
        num_comparisons = len(operator_pairs)
        # Bonferroni: divide the significance level by the number of pairwise tests
        bonferroni_alpha = ALPHA / num_comparisons

        print(f"  Number of comparisons: {num_comparisons}")
        print(f"  Bonferroni corrected α: {bonferroni_alpha:.6f}")
        print(f"\n  Significant pairwise differences (p < {bonferroni_alpha:.6f}):")

        pairwise_results = []
        significant_pairs = []
        for op1, op2 in operator_pairs:
            try:
                u_statistic, p_val = mannwhitneyu(
                    operator_data[op1],
                    operator_data[op2],
                    alternative='two-sided'
                )
            except Exception as e:
                print(f"    Error comparing {op1} vs {op2}: {e}")
                continue

            is_significant = p_val < bonferroni_alpha
            mean1 = np.mean(operator_data[op1])
            mean2 = np.mean(operator_data[op2])

            if is_significant:
                # The direction shown is based on the group means.
                direction = ">" if mean1 > mean2 else "<"
                print(f"    {op1} {direction} {op2} (p={p_val:.6f}, U={u_statistic:.2f})")
                significant_pairs.append((op1, op2, p_val, mean1, mean2))

            pairwise_results.append({
                'operator1': op1,
                'operator2': op2,
                'u_statistic': u_statistic,
                'p_value': p_val,
                'significant': is_significant,
                'mean1': mean1,
                'mean2': mean2
            })

        if not significant_pairs:
            print("    None (after Bonferroni correction)")

        statistical_results[metric]['pairwise'] = pairwise_results
        statistical_results[metric]['significant_pairs'] = significant_pairs

    # Summary statistics per operator
    print("\nSummary Statistics by Operator:")
    summary_df = pd.DataFrame(
        [{'Operator': operator, **_describe_values(operator_data[operator])}
         for operator in operators_list]
    )
    # Sort by mean (descending for EHR, cEHR; ascending for others)
    summary_df = summary_df.sort_values('Mean', ascending=metric not in ['EHR', 'cEHR'])
    display(summary_df)

print(f"\n{'='*100}")
print("Statistical Analysis Complete")
print(f"{'='*100}")

# Create summary of significant findings
print("\n" + "="*100)
print("Summary of Significant Findings")
print("="*100)

significant_metrics = [m for m in metrics if m in statistical_results and
                       statistical_results[m]['kruskal_wallis']['significant']]

if significant_metrics:
    print("\nMetrics with significant operator differences (p < 0.05):")
    for metric in significant_metrics:
        p_val = statistical_results[metric]['kruskal_wallis']['p_value']
        num_sig_pairs = len(statistical_results[metric].get('significant_pairs', []))
        print(f"  - {metric_names[metric]}: p={p_val:.6f}, {num_sig_pairs} significant pairwise differences")
else:
    print("\nNo metrics showed significant operator differences (p >= 0.05)")

# Export statistical results to CSV
# Ensure output_dir is defined (use same logic as main processing cell)
if 'output_dir' not in globals():
    if os.path.basename(os.getcwd()) == 'experiments':
        output_dir = os.getcwd()
    else:
        parent_dir = os.path.dirname(os.getcwd())
        experiments_path = os.path.join(parent_dir, 'experiments')
        output_dir = experiments_path if os.path.exists(experiments_path) else os.getcwd()

# Export summary statistics (same statistics as displayed above, for all metrics)
summary_all = []
for metric in metrics:
    if metric in statistical_results:
        for operator in statistical_results[metric]['operators']:
            values = statistical_results[metric]['operator_data'][operator]
            summary_all.append({
                'Metric': metric_names[metric],
                'Operator': operator,
                **_describe_values(values)
            })

if summary_all:
    summary_df_all = pd.DataFrame(summary_all)
    csv_filename = os.path.join(output_dir, "rq2_statistical_summary.csv")
    summary_df_all.to_csv(csv_filename, index=False)
    print(f"\nSummary statistics exported to: {csv_filename}")

# Export pairwise comparison results (only metrics whose Kruskal-Wallis test was significant have them)
pairwise_all = []
for metric in metrics:
    if metric in statistical_results and 'pairwise' in statistical_results[metric]:
        for pair in statistical_results[metric]['pairwise']:
            pairwise_all.append({
                'Metric': metric_names[metric],
                'Operator1': pair['operator1'],
                'Operator2': pair['operator2'],
                'U_statistic': pair['u_statistic'],
                'p_value': pair['p_value'],
                'Significant': pair['significant'],
                'Mean1': pair['mean1'],
                'Mean2': pair['mean2']
            })

if pairwise_all:
    pairwise_df = pd.DataFrame(pairwise_all)
    csv_filename = os.path.join(output_dir, "rq2_pairwise_comparisons.csv")
    pairwise_df.to_csv(csv_filename, index=False)
    print(f"Pairwise comparisons exported to: {csv_filename}")


Non-Parametric Statistical Analysis

Metric: Elite Hit Rate (%)

Kruskal-Wallis H-test:
  H-statistic: 30.6312
  p-value: 0.001260
  Significance: **
  → Significant difference found (p < 0.05)

Post-hoc Mann-Whitney U tests (Bonferroni corrected):
  Number of comparisons: 66
  Bonferroni corrected α: 0.000758

  Significant pairwise differences (p < 0.000758):
    None (after Bonferroni correction)

Summary Statistics by Operator:


Unnamed: 0,Operator,Mean,Median,Std,Min,Max,N
1,InformedEvolutionOperator,9.721429,7.08,4.558077,3.5,16.54,7
7,POSAwareAntonymReplacement,7.247143,6.19,2.825014,3.91,11.28,7
4,LLM_POSAwareSynonymReplacement,6.774286,8.06,2.98092,3.12,10.62,7
5,MLM,5.844286,5.59,2.333415,3.12,10.53,7
6,NegationOperator,4.93,3.5,2.745901,1.57,9.02,7
2,LLMBackTranslation_HI,4.695714,4.72,2.155736,1.56,9.02,7
0,ConceptAdditionOperator,4.531429,3.5,3.228261,1.56,9.68,7
11,TypographicalErrorsOperator,3.517143,3.76,1.637548,0.0,5.65,7
3,LLMBasedParaphrasing,3.488571,2.36,2.371114,0.78,7.26,7
10,StylisticMutator,2.825714,3.12,1.313195,0.79,4.51,7



Metric: Conditional Elite Hit Rate (%)

Kruskal-Wallis H-test:
  H-statistic: 15.4663
  p-value: 0.162128
  Significance: ns
  → No significant difference (p >= 0.05)

Summary Statistics by Operator:


Unnamed: 0,Operator,Mean,Median,Std,Min,Max,N
1,InformedEvolutionOperator,16.6,18.6,7.395404,5.62,27.27,7
9,SemanticSimilarityCrossover,9.978571,9.76,8.692526,0.0,23.08,7
5,MLM,8.66,8.0,3.348949,4.12,15.22,7
4,LLM_POSAwareSynonymReplacement,8.045714,9.71,3.570358,3.51,12.24,7
7,POSAwareAntonymReplacement,7.788571,6.8,2.853822,4.42,11.9,7
0,ConceptAdditionOperator,7.44,6.1,5.257275,2.41,15.38,7
11,TypographicalErrorsOperator,7.425714,8.06,3.430634,0.0,12.28,7
6,NegationOperator,6.395714,4.72,3.781265,1.98,12.37,7
3,LLMBasedParaphrasing,6.022857,3.8,4.319113,1.35,13.04,7
2,LLMBackTranslation_HI,5.927143,6.0,2.561979,1.98,10.81,7



Metric: Invalid Rate (%)

Kruskal-Wallis H-test:
  H-statistic: 76.2325
  p-value: 0.000000
  Significance: ***
  → Significant difference found (p < 0.05)

Post-hoc Mann-Whitney U tests (Bonferroni corrected):
  Number of comparisons: 66
  Bonferroni corrected α: 0.000758

  Significant pairwise differences (p < 0.000758):
    ConceptAdditionOperator > LLMBackTranslation_HI (p=0.000583, U=49.00)
    ConceptAdditionOperator > LLM_POSAwareSynonymReplacement (p=0.000583, U=49.00)
    ConceptAdditionOperator > NegationOperator (p=0.000583, U=49.00)
    ConceptAdditionOperator > POSAwareAntonymReplacement (p=0.000583, U=49.00)
    ConceptAdditionOperator < SemanticFusionCrossover (p=0.000583, U=0.00)
    ConceptAdditionOperator < TypographicalErrorsOperator (p=0.000583, U=0.00)
    InformedEvolutionOperator > LLM_POSAwareSynonymReplacement (p=0.000583, U=49.00)
    InformedEvolutionOperator > NegationOperator (p=0.000583, U=49.00)
    InformedEvolutionOperator > POSAwareAntonymReplacement

Unnamed: 0,Operator,Mean,Median,Std,Min,Max,N
9,SemanticSimilarityCrossover,0.0,0.0,0.0,0.0,0.0,7
7,POSAwareAntonymReplacement,4.995714,4.51,2.198791,3.08,10.16,7
4,LLM_POSAwareSynonymReplacement,13.148571,12.39,3.355279,8.59,18.05,7
6,NegationOperator,18.81,20.35,5.918569,7.81,26.32,7
2,LLMBackTranslation_HI,21.072857,21.09,5.069916,15.79,31.5,7
5,MLM,26.484286,25.38,4.845613,20.31,37.17,7
0,ConceptAdditionOperator,38.524286,38.05,3.037127,35.16,42.66,7
10,StylisticMutator,39.398571,39.1,4.075801,33.87,44.62,7
1,InformedEvolutionOperator,40.058571,36.36,10.307686,30.0,61.95,7
3,LLMBasedParaphrasing,40.948571,42.19,3.295674,34.62,44.35,7



Metric: Non-Elite Percentage (%)

Kruskal-Wallis H-test:
  H-statistic: 73.2075
  p-value: 0.000000
  Significance: ***
  → Significant difference found (p < 0.05)

Post-hoc Mann-Whitney U tests (Bonferroni corrected):
  Number of comparisons: 66
  Bonferroni corrected α: 0.000758

  Significant pairwise differences (p < 0.000758):
    ConceptAdditionOperator < LLM_POSAwareSynonymReplacement (p=0.000583, U=0.00)
    ConceptAdditionOperator < POSAwareAntonymReplacement (p=0.000583, U=0.00)
    ConceptAdditionOperator > SemanticFusionCrossover (p=0.000583, U=49.00)
    ConceptAdditionOperator > SemanticSimilarityCrossover (p=0.000583, U=49.00)
    ConceptAdditionOperator > TypographicalErrorsOperator (p=0.000583, U=49.00)
    InformedEvolutionOperator < LLMBackTranslation_HI (p=0.000583, U=0.00)
    InformedEvolutionOperator < LLM_POSAwareSynonymReplacement (p=0.000583, U=0.00)
    InformedEvolutionOperator < NegationOperator (p=0.000583, U=0.00)
    InformedEvolutionOperator < POSAware

Unnamed: 0,Operator,Mean,Median,Std,Min,Max,N
9,SemanticSimilarityCrossover,21.872857,23.64,9.883962,9.21,35.58,7
8,SemanticFusionCrossover,39.742857,38.16,3.815819,34.62,45.45,7
11,TypographicalErrorsOperator,41.68,41.54,2.57378,38.71,47.24,7
1,InformedEvolutionOperator,49.157143,54.33,9.640003,30.97,58.04,7
3,LLMBasedParaphrasing,54.805714,55.75,5.084587,46.77,62.31,7
0,ConceptAdditionOperator,55.712857,53.23,5.173351,48.87,62.99,7
10,StylisticMutator,56.677143,55.24,4.310508,51.54,62.9,7
5,MLM,60.65,60.0,5.373892,53.1,71.88,7
2,LLMBackTranslation_HI,71.032857,72.18,3.96372,62.99,75.0,7
6,NegationOperator,71.47,69.23,8.364225,58.65,86.72,7



Metric: Mean Delta Score

Kruskal-Wallis H-test:
  H-statistic: 25.9722
  p-value: 0.006552
  Significance: **
  → Significant difference found (p < 0.05)

Post-hoc Mann-Whitney U tests (Bonferroni corrected):
  Number of comparisons: 66
  Bonferroni corrected α: 0.000758

  Significant pairwise differences (p < 0.000758):
    None (after Bonferroni correction)

Summary Statistics by Operator:


Unnamed: 0,Operator,Mean,Median,Std,Min,Max,N
1,InformedEvolutionOperator,-0.17,-0.17,0.029761,-0.21,-0.13,7
2,LLMBackTranslation_HI,-0.07,-0.08,0.013093,-0.08,-0.05,7
10,StylisticMutator,-0.065714,-0.06,0.018406,-0.1,-0.04,7
6,NegationOperator,-0.064286,-0.06,0.022588,-0.1,-0.03,7
11,TypographicalErrorsOperator,-0.062857,-0.06,0.012778,-0.08,-0.05,7
3,LLMBasedParaphrasing,-0.061429,-0.07,0.020996,-0.08,-0.03,7
7,POSAwareAntonymReplacement,-0.055714,-0.06,0.009035,-0.07,-0.04,7
4,LLM_POSAwareSynonymReplacement,-0.054286,-0.06,0.012936,-0.07,-0.03,7
8,SemanticFusionCrossover,-0.054286,-0.06,0.023819,-0.09,-0.02,7
0,ConceptAdditionOperator,-0.052857,-0.05,0.017496,-0.08,-0.03,7



Metric: Delta Score Std Dev

Kruskal-Wallis H-test:
  H-statistic: 28.4566
  p-value: 0.002755
  Significance: **
  → Significant difference found (p < 0.05)

Post-hoc Mann-Whitney U tests (Bonferroni corrected):
  Number of comparisons: 66
  Bonferroni corrected α: 0.000758

  Significant pairwise differences (p < 0.000758):
    None (after Bonferroni correction)

Summary Statistics by Operator:


Unnamed: 0,Operator,Mean,Median,Std,Min,Max,N
8,SemanticFusionCrossover,0.091429,0.09,0.009897,0.08,0.11,7
9,SemanticSimilarityCrossover,0.094286,0.09,0.009035,0.08,0.11,7
1,InformedEvolutionOperator,0.107143,0.11,0.010302,0.09,0.12,7
4,LLM_POSAwareSynonymReplacement,0.111429,0.11,0.016413,0.08,0.13,7
5,MLM,0.115714,0.11,0.018406,0.09,0.15,7
11,TypographicalErrorsOperator,0.115714,0.12,0.009035,0.1,0.13,7
3,LLMBasedParaphrasing,0.117143,0.12,0.013851,0.09,0.13,7
7,POSAwareAntonymReplacement,0.117143,0.12,0.011606,0.1,0.14,7
10,StylisticMutator,0.117143,0.11,0.008806,0.11,0.13,7
6,NegationOperator,0.118571,0.12,0.016413,0.09,0.14,7



Statistical Analysis Complete

Summary of Significant Findings

Metrics with significant operator differences (p < 0.05):
  - Elite Hit Rate (%): p=0.001260, 0 significant pairwise differences
  - Invalid Rate (%): p=0.000000, 34 significant pairwise differences
  - Non-Elite Percentage (%): p=0.000000, 38 significant pairwise differences
  - Mean Delta Score: p=0.006552, 0 significant pairwise differences
  - Delta Score Std Dev: p=0.002755, 0 significant pairwise differences

Summary statistics exported to: /Users/onkars/Documents/Projects/eost-cam-llm/experiments/rq2_statistical_summary.csv
Pairwise comparisons exported to: /Users/onkars/Documents/Projects/eost-cam-llm/experiments/rq2_pairwise_comparisons.csv


## Visualizations: Operator Rankings and Statistical Significance


In [52]:
# Visualizations: Bar charts showing operator rankings with statistical significance
#
# For each metric present in `statistical_results`, draw one bar chart of the
# per-operator means (bars) with medians overlaid (red diamonds), flag operators whose
# pairwise test against the top-ranked operator was significant with a red '*', and
# save the chart to <output_dir>/rq2_<metric>_ranking.pdf.
# Relies on `metrics`, `metric_names` and `statistical_results` from earlier cells.

# Ensure statistical_results exists
if 'statistical_results' not in globals() or not statistical_results:
    raise ValueError("statistical_results not found. Please run the statistical analysis cell first.")

# Ensure output_dir is defined *before* the plotting loop needs it. This fallback used
# to sit after the loop, where it could never help: a missing output_dir would already
# have raised NameError while building the first PDF path.
if 'output_dir' not in globals():
    if os.path.basename(os.getcwd()) == 'experiments':
        output_dir = os.getcwd()
    else:
        parent_dir = os.path.dirname(os.getcwd())
        experiments_path = os.path.join(parent_dir, 'experiments')
        output_dir = experiments_path if os.path.exists(experiments_path) else os.getcwd()

# Create bar charts for each metric showing operator means with significance indicators
for metric in metrics:
    if metric not in statistical_results:
        continue

    result = statistical_results[metric]
    operator_data = result['operator_data']

    # Calculate means and medians for each operator (filter out empty data)
    operator_means = {op: np.mean(operator_data[op]) for op in operator_data.keys() if len(operator_data[op]) > 0}
    operator_medians = {op: np.median(operator_data[op]) for op in operator_data.keys() if len(operator_data[op]) > 0}

    if len(operator_means) == 0:
        print(f"No data for {metric}, skipping visualization")
        continue

    # Sort operators by mean (descending for EHR, cEHR; ascending for all other metrics).
    # NOTE(review): the notebook header defines Δμ as toxicity - parent_score; confirm
    # that "lower is better" is really the intended ranking direction for Δμ.
    higher_is_better = metric in ['EHR', 'cEHR']
    sorted_operators = sorted(operator_means.keys(), key=lambda x: operator_means[x], reverse=higher_is_better)

    # Get values for plotting
    means = [operator_means[op] for op in sorted_operators]
    medians = [operator_medians[op] for op in sorted_operators]

    # Annotation spacing in data units. Both paddings are kept non-negative so that
    # labels are pushed *away* from the bar end even when every mean is negative.
    value_span = max(means) - min(means)
    if value_span == 0:
        value_span = abs(max(means)) * 0.1
    label_pad = max(abs(m) for m in means) * 0.02

    # Create figure
    fig, ax = plt.subplots(figsize=(14, 8))

    # Create bar chart
    x_pos = np.arange(len(sorted_operators))
    bars = ax.bar(x_pos, means, alpha=0.7, color='steelblue', edgecolor='black', linewidth=1.5)

    # Add median markers
    ax.scatter(x_pos, medians, color='red', s=100, zorder=5, marker='D', label='Median', edgecolors='black', linewidths=1)

    # Add significance indicators if Kruskal-Wallis was significant
    if result['kruskal_wallis']['significant']:
        # The top operator is the first one in the metric-specific sort order
        top_op = sorted_operators[0]

        # Collect every operator that has a significant pairwise result against the top operator
        significant_from_top = set()
        if 'pairwise' in result:
            for pair in result['pairwise']:
                if pair['significant']:
                    if pair['operator1'] == top_op:
                        significant_from_top.add(pair['operator2'])
                    elif pair['operator2'] == top_op:
                        significant_from_top.add(pair['operator1'])

        # Mark significant differences with an asterisk just beyond the end of the bar
        for i, op in enumerate(sorted_operators):
            if op in significant_from_top:
                points_down = means[i] < 0
                star_y = means[i] - value_span * 0.05 if points_down else means[i] + value_span * 0.05
                ax.text(i, star_y, '*',
                        ha='center', va='top' if points_down else 'bottom',
                        fontsize=16, color='red', weight='bold')

    # Customize plot
    ax.set_xlabel('Operator', fontsize=12, fontweight='bold')
    ax.set_ylabel(metric_names[metric], fontsize=12, fontweight='bold')
    ax.set_title(f'{metric_names[metric]}: Operator Rankings\n'
                 f"Kruskal-Wallis: p={result['kruskal_wallis']['p_value']:.6f} "
                 f"({'Significant' if result['kruskal_wallis']['significant'] else 'Not Significant'})",
                 fontsize=14, fontweight='bold', pad=15)
    ax.set_xticks(x_pos)
    ax.set_xticklabels(sorted_operators, rotation=45, ha='right', fontsize=9)
    ax.grid(axis='y', linestyle='--', alpha=0.3)
    ax.legend(loc='upper right')

    # Add value labels on bars (below the bar end for negative bars, above otherwise)
    for bar, mean in zip(bars, means):
        height = bar.get_height()
        points_down = height < 0
        label_y = height - label_pad if points_down else height + label_pad
        ax.text(bar.get_x() + bar.get_width()/2., label_y,
                f'{mean:.2f}',
                ha='center', va='top' if points_down else 'bottom',
                fontsize=8, fontweight='bold')

    fig.tight_layout()

    # Save plot (sanitize metric name for filename; 'Δ'.lower() is 'δ')
    metric_filename = metric.lower().replace('μ', 'mu').replace('σ', 'sigma').replace('δ', 'delta')
    filename_pdf = os.path.join(output_dir, f"rq2_{metric_filename}_ranking.pdf")
    if os.path.exists(filename_pdf):
        os.remove(filename_pdf)
    fig.savefig(filename_pdf, dpi=150, bbox_inches='tight')
    # Close this specific figure so figures do not accumulate across loop iterations
    plt.close(fig)

    print(f"Saved: {filename_pdf}")

print(f"\nAll visualizations saved to: {output_dir}")


Saved: /Users/onkars/Documents/Projects/eost-cam-llm/experiments/rq2_ehr_ranking.pdf
Saved: /Users/onkars/Documents/Projects/eost-cam-llm/experiments/rq2_cehr_ranking.pdf
Saved: /Users/onkars/Documents/Projects/eost-cam-llm/experiments/rq2_ir_ranking.pdf
Saved: /Users/onkars/Documents/Projects/eost-cam-llm/experiments/rq2_ne_ranking.pdf
Saved: /Users/onkars/Documents/Projects/eost-cam-llm/experiments/rq2_deltamu_ranking.pdf
Saved: /Users/onkars/Documents/Projects/eost-cam-llm/experiments/rq2_deltasigma_ranking.pdf

All visualizations saved to: /Users/onkars/Documents/Projects/eost-cam-llm/experiments


## Summary: Operator Rankings with Statistical Significance


In [53]:
# Create comprehensive summary table with operator rankings and statistical significance
#
# Builds `ranking_df` with one row per (metric, operator): the operator's mean, its
# 1-based rank within the metric, whether it has a significant pairwise result against
# the metric's top-ranked operator, and the metric's Kruskal-Wallis p-value/flag.
# Relies on `metrics`, `metric_names` and `statistical_results` from earlier cells.

# Ensure statistical_results exists
if 'statistical_results' not in globals() or not statistical_results:
    raise ValueError("statistical_results not found. Please run the statistical analysis cell first.")

# Fixed column layout so ranking_df has these columns even when no row is produced
ranking_columns = [
    'Metric', 'Operator', 'Mean', 'Rank',
    'Significantly_Different_From_Top', 'Kruskal_Wallis_p', 'Kruskal_Wallis_Significant',
]

# Build ranking table
ranking_data = []
for metric in metrics:
    if metric not in statistical_results:
        continue

    result = statistical_results[metric]
    operator_data = result['operator_data']
    operators_list = result['operators']

    # Calculate means for ranking (filter out empty data)
    operator_means = {op: np.mean(operator_data[op]) for op in operators_list if len(operator_data[op]) > 0}

    if len(operator_means) == 0:
        continue

    # Sort by mean: descending for EHR, cEHR (higher is better);
    # ascending for IR, NE, Δμ, Δσ (lower is treated as better).
    higher_is_better = metric in ['EHR', 'cEHR']
    sorted_ops = sorted(operator_means.items(), key=lambda x: x[1], reverse=higher_is_better)

    # The top operator and the set of operators that differ significantly from it are
    # the same for every row of this metric, so compute them once here instead of
    # rescanning the pairwise list for each operator.
    top_op = sorted_ops[0][0]
    differs_from_top = set()
    if result['kruskal_wallis']['significant'] and 'pairwise' in result:
        for pair in result['pairwise']:
            if not pair['significant']:
                continue
            if pair['operator1'] == top_op:
                differs_from_top.add(pair['operator2'])
            elif pair['operator2'] == top_op:
                differs_from_top.add(pair['operator1'])
    # The top operator is never reported as different from itself
    differs_from_top.discard(top_op)

    # Assign ranks (1 = first in the sort order above)
    for rank, (operator, mean_val) in enumerate(sorted_ops, start=1):
        ranking_data.append({
            'Metric': metric_names[metric],
            'Operator': operator,
            'Mean': mean_val,
            'Rank': rank,
            'Significantly_Different_From_Top': operator in differs_from_top,
            'Kruskal_Wallis_p': result['kruskal_wallis']['p_value'],
            'Kruskal_Wallis_Significant': result['kruskal_wallis']['significant']
        })

ranking_df = pd.DataFrame(ranking_data, columns=ranking_columns)

# Print the three best-ranked operators per metric; a trailing " *" marks an operator
# flagged as significantly different from that metric's top operator.
banner = "=" * 100
print(banner)
print("Operator Rankings Summary")
print(banner)
print("\nTop 3 operators for each metric:\n")

for metric in metrics:
    if metric not in statistical_results:
        continue

    metric_label = metric_names[metric]
    metric_df = ranking_df.loc[ranking_df['Metric'] == metric_label].copy()
    if metric_df.empty:
        continue

    # Rank 1 is always the best operator, so the top 3 are the three lowest ranks
    top3 = metric_df.sort_values('Rank').head(3)

    print(f"{metric_label}:")
    for entry in top3.itertuples(index=False):
        sig_marker = " *" if entry.Significantly_Different_From_Top else ""
        print(f"  {entry.Rank}. {entry.Operator}: {entry.Mean:.2f}{sig_marker}")
    print()

# Render ranking_df as a matplotlib table, save it as a PDF, export the same data as
# CSV, and finally show the DataFrame in the notebook.
fig, ax = plt.subplots(figsize=(16, len(ranking_df) * 0.3 + 3))
ax.axis('tight')
ax.axis('off')

# One list of display strings per ranking_df row, in header order
headers = ['Metric', 'Operator', 'Mean', 'Rank', 'Sig. Diff?', 'K-W p-value']
table_data = [
    [
        metric_label,
        operator_name,
        f"{mean_value:.2f}",
        str(rank_value),
        'Yes' if differs else 'No',
        f"{kw_p:.6f} ({'Yes' if kw_flag else 'No'})",
    ]
    for metric_label, operator_name, mean_value, rank_value, differs, kw_p, kw_flag in zip(
        ranking_df['Metric'],
        ranking_df['Operator'],
        ranking_df['Mean'],
        ranking_df['Rank'],
        ranking_df['Significantly_Different_From_Top'],
        ranking_df['Kruskal_Wallis_p'],
        ranking_df['Kruskal_Wallis_Significant'],
    )
]

table = ax.table(cellText=table_data, colLabels=headers, cellLoc='center', loc='center')
table.auto_set_font_size(False)
table.set_fontsize(8)
table.scale(1, 1.5)

# Header row (table row 0): white bold text on green
for col_idx, _ in enumerate(headers):
    header_cell = table[(0, col_idx)]
    header_cell.set_facecolor('#4CAF50')
    header_cell.set_text_props(weight='bold', color='white')

# Body rows: pink when the operator differs significantly from the top operator,
# otherwise alternating grey / white stripes
for row_idx, cells in enumerate(table_data, start=1):
    if cells[4] == 'Yes':
        row_color = '#FFE5E5'
    elif row_idx % 2 == 0:
        row_color = '#f0f0f0'
    else:
        row_color = 'white'
    for col_idx in range(len(headers)):
        table[(row_idx, col_idx)].set_facecolor(row_color)

plt.title('RQ2: Operator Rankings with Statistical Significance', fontsize=14, fontweight='bold', pad=20)

# Fall back to a sensible output directory when no earlier cell defined one
if 'output_dir' not in globals():
    current_dir = os.getcwd()
    if os.path.basename(current_dir) == 'experiments':
        output_dir = current_dir
    else:
        experiments_path = os.path.join(os.path.dirname(current_dir), 'experiments')
        output_dir = experiments_path if os.path.exists(experiments_path) else current_dir

# Save to PDF (any previous copy is removed first)
filename_pdf = os.path.join(output_dir, "rq2_operator_rankings.pdf")
if os.path.exists(filename_pdf):
    os.remove(filename_pdf)
fig.savefig(filename_pdf, dpi=150, bbox_inches='tight')
plt.close(fig)

print(f"Rankings table saved to: {filename_pdf}")

# Export rankings to CSV
csv_filename = os.path.join(output_dir, "rq2_operator_rankings.csv")
ranking_df.to_csv(csv_filename, index=False)
print(f"Rankings exported to: {csv_filename}")

# Display rankings table
display(ranking_df)


Operator Rankings Summary

Top 3 operators for each metric:

Elite Hit Rate (%):
  1. InformedEvolutionOperator: 9.72
  2. POSAwareAntonymReplacement: 7.25
  3. LLM_POSAwareSynonymReplacement: 6.77

Conditional Elite Hit Rate (%):
  1. InformedEvolutionOperator: 16.60
  2. SemanticSimilarityCrossover: 9.98
  3. MLM: 8.66

Invalid Rate (%):
  1. SemanticSimilarityCrossover: 0.00
  2. POSAwareAntonymReplacement: 5.00
  3. LLM_POSAwareSynonymReplacement: 13.15

Non-Elite Percentage (%):
  1. SemanticSimilarityCrossover: 21.87
  2. SemanticFusionCrossover: 39.74
  3. TypographicalErrorsOperator: 41.68 *

Mean Delta Score:
  1. InformedEvolutionOperator: -0.17
  2. LLMBackTranslation_HI: -0.07
  3. StylisticMutator: -0.07

Delta Score Std Dev:
  1. SemanticFusionCrossover: 0.09
  2. SemanticSimilarityCrossover: 0.09
  3. InformedEvolutionOperator: 0.11

Rankings table saved to: /Users/onkars/Documents/Projects/eost-cam-llm/experiments/rq2_operator_rankings.pdf
Rankings exported to: /Users/o

Unnamed: 0,Metric,Operator,Mean,Rank,Significantly_Different_From_Top,Kruskal_Wallis_p,Kruskal_Wallis_Significant
0,Elite Hit Rate (%),InformedEvolutionOperator,9.721429,1,False,0.001260,True
1,Elite Hit Rate (%),POSAwareAntonymReplacement,7.247143,2,False,0.001260,True
2,Elite Hit Rate (%),LLM_POSAwareSynonymReplacement,6.774286,3,False,0.001260,True
3,Elite Hit Rate (%),MLM,5.844286,4,False,0.001260,True
4,Elite Hit Rate (%),NegationOperator,4.930000,5,False,0.001260,True
...,...,...,...,...,...,...,...
67,Delta Score Std Dev,POSAwareAntonymReplacement,0.117143,8,False,0.002755,True
68,Delta Score Std Dev,StylisticMutator,0.117143,9,False,0.002755,True
69,Delta Score Std Dev,NegationOperator,0.118571,10,False,0.002755,True
70,Delta Score Std Dev,LLMBackTranslation_HI,0.120000,11,False,0.002755,True


## Final Report Table: One Row Per Operator with Statistical Significance


In [54]:
# Create final report table: One row per operator with all metrics and statistical significance
#
# Builds `report_df` with, per operator: each metric's mean formatted to two decimals
# (suffixed with '*' when the operator has a significant pairwise result against that
# metric's top-ranked operator) plus one <metric>_Rank column per metric.
# Means come from the `is_mean` rows of `final_table_df`; ranks and significance come
# from `statistical_results`.

# Ensure required data exists
if 'final_table_df' not in globals() or final_table_df.empty:
    raise ValueError("final_table_df not found. Please run the main processing cell first.")

if 'statistical_results' not in globals() or not statistical_results:
    raise ValueError("statistical_results not found. Please run the statistical analysis cell first.")

# Get mean values for each operator (from mean rows in final_table_df).
# The explicit `== True` is kept because the dtype of `is_mean` is not visible here.
mean_rows = final_table_df[final_table_df['is_mean'] == True].copy()
if mean_rows.empty:
    raise ValueError("final_table_df has no rows with is_mean == True; cannot build the report table.")

report_metrics = ['EHR', 'cEHR', 'IR', 'NE', 'Δμ', 'Δσ']

# Rank and significance depend only on the metric, not on the operator row being
# formatted, so compute them once per metric instead of once per (operator, metric).
metric_ranks = {}     # metric -> {operator: 1-based rank}
metric_sig_ops = {}   # metric -> operators significantly different from the top operator
for metric in report_metrics:
    metric_ranks[metric] = {}
    metric_sig_ops[metric] = set()

    if metric not in statistical_results:
        continue

    result = statistical_results[metric]
    operator_data = result['operator_data']
    operator_means = {op: np.mean(operator_data[op]) for op in operator_data.keys() if len(operator_data[op]) > 0}

    if len(operator_means) == 0:
        continue

    # Sort by mean (descending for EHR, cEHR; ascending for others), so rank 1 is
    # always the first operator in this order.
    higher_is_better = metric in ['EHR', 'cEHR']
    sorted_ops = sorted(operator_means.items(), key=lambda x: x[1], reverse=higher_is_better)
    metric_ranks[metric] = {op: rank for rank, (op, _) in enumerate(sorted_ops, start=1)}

    # Significance markers are only reported when the omnibus test was significant
    if not result['kruskal_wallis']['significant'] or 'pairwise' not in result:
        continue

    top_op = sorted_ops[0][0]
    for pair in result['pairwise']:
        if not pair['significant']:
            continue
        if pair['operator1'] == top_op:
            metric_sig_ops[metric].add(pair['operator2'])
        elif pair['operator2'] == top_op:
            metric_sig_ops[metric].add(pair['operator1'])
    # The top operator is never marked as different from itself
    metric_sig_ops[metric].discard(top_op)

# (report column label, metric key) pairs in the order the value columns appear
value_columns = [
    ('NE (%)', 'NE'),
    ('EHR (%)', 'EHR'),
    ('IR (%)', 'IR'),
    ('cEHR (%)', 'cEHR'),
    ('Δμ', 'Δμ'),
    ('Δσ', 'Δσ'),
]
# Order in which the *_Rank columns appear after the value columns
rank_column_order = ['EHR', 'IR', 'NE', 'cEHR', 'Δμ', 'Δσ']

# Create report table with one row per operator
report_data = []
for _, row in mean_rows.iterrows():
    operator = row['Operator']
    record = {'Operator': operator}

    # Format: value followed by '*' if significantly different from the top operator
    for column_label, metric in value_columns:
        marker = '*' if operator in metric_sig_ops[metric] else ''
        record[column_label] = f"{row[metric]:.2f}{marker}"

    # Rank is NaN when the metric has no statistics or the operator has no data for it
    for metric in rank_column_order:
        record[f'{metric}_Rank'] = metric_ranks[metric].get(operator, np.nan)

    report_data.append(record)

# Create DataFrame
report_df = pd.DataFrame(report_data)

# Sort by operator name for consistency
report_df = report_df.sort_values('Operator').reset_index(drop=True)

# Display table
# The rank columns are built from a sort that is descending for EHR/cEHR and ascending
# for every other metric, so rank 1 is the best operator for *every* metric. The
# printed legend states exactly that (it previously claimed that a higher rank was
# better for EHR/cEHR, which contradicted how the ranks are computed).
print("="*120)
print("RQ2: Final Report Table - Operator Performance Metrics with Statistical Significance")
print("="*120)
print("\nNote: * indicates operator is significantly different from top performer (p < 0.05, Bonferroni corrected)")
print("Rankings: Rank 1 is best for every metric (highest mean for EHR, cEHR; lowest mean for IR, NE, Δμ, Δσ)\n")
display(report_df)

# Render report_df (value columns only, no rank columns) as a matplotlib table, save
# it as a PDF, then export the full report_df (including rank columns) as CSV.
fig, ax = plt.subplots(figsize=(20, len(report_df) * 0.4 + 3))
ax.axis('tight')
ax.axis('off')

# The table headers are exactly the report_df columns to show, in display order
headers = ['Operator', 'NE (%)', 'EHR (%)', 'IR (%)', 'cEHR (%)', 'Δμ', 'Δσ']
table_data = [[row[column] for column in headers] for _, row in report_df.iterrows()]

table = ax.table(cellText=table_data, colLabels=headers, cellLoc='center', loc='center')
table.auto_set_font_size(False)
table.set_fontsize(8)
table.scale(1, 2.0)

# Header row (table row 0): white bold text on green
for col_idx, _ in enumerate(headers):
    header_cell = table[(0, col_idx)]
    header_cell.set_facecolor('#4CAF50')
    header_cell.set_text_props(weight='bold', color='white')

# Body rows: light yellow when any metric cell carries a '*' marker, otherwise
# alternating grey / white stripes; cells containing '*' are set in bold.
for row_idx, cells in enumerate(table_data, start=1):
    # cells[0] is the operator name; the remaining cells are the metric values
    has_sig = any('*' in str(value) for value in cells[1:])

    if has_sig:
        row_color = '#FFF9C4'
    elif row_idx % 2 == 0:
        row_color = '#f0f0f0'
    else:
        row_color = 'white'

    for col_idx, cell_text in enumerate(cells):
        cell = table[(row_idx, col_idx)]
        cell.set_facecolor(row_color)
        if '*' in str(cell_text):
            cell.set_text_props(weight='bold')

plt.title('RQ2: Operator Performance Metrics with Statistical Significance\n'
          '* = Significantly different from top performer (p < 0.05, Bonferroni corrected)',
          fontsize=12, fontweight='bold', pad=20)

# Fall back to a sensible output directory when no earlier cell defined one
if 'output_dir' not in globals():
    current_dir = os.getcwd()
    if os.path.basename(current_dir) == 'experiments':
        output_dir = current_dir
    else:
        experiments_path = os.path.join(os.path.dirname(current_dir), 'experiments')
        output_dir = experiments_path if os.path.exists(experiments_path) else current_dir

# Save to PDF (any previous copy is removed first)
filename_pdf = os.path.join(output_dir, "rq2_final_report_table.pdf")
if os.path.exists(filename_pdf):
    os.remove(filename_pdf)
fig.savefig(filename_pdf, dpi=150, bbox_inches='tight')
plt.close(fig)

print(f"\nFinal report table saved to: {filename_pdf}")

# Export to CSV (with rank columns)
csv_filename = os.path.join(output_dir, "rq2_final_report_table.csv")
report_df.to_csv(csv_filename, index=False)
print(f"Final report table exported to: {csv_filename}")


RQ2: Final Report Table - Operator Performance Metrics with Statistical Significance

Note: * indicates operator is significantly different from top performer (p < 0.05, Bonferroni corrected)
Rankings: Lower rank is better for IR, NE, Δμ, Δσ; Higher rank is better for EHR, cEHR



Unnamed: 0,Operator,NE (%),EHR (%),IR (%),cEHR (%),Δμ,Δσ,EHR_Rank,IR_Rank,NE_Rank,cEHR_Rank,Δμ_Rank,Δσ_Rank
0,ConceptAdditionOperator,55.71*,4.53,38.52,7.44,-0.05,0.13,7,7,6,6,10,12
1,InformedEvolutionOperator,49.16,9.72,40.06,16.6,-0.17,0.11,1,9,4,1,1,3
2,LLMBackTranslation_HI,71.03*,4.7,21.07,5.93,-0.07,0.12,6,5,9,10,2,11
3,LLMBasedParaphrasing,54.81*,3.49,40.95,6.02,-0.06,0.12,9,10,5,9,6,7
4,LLM_POSAwareSynonymReplacement,75.90*,6.77,13.15,8.05,-0.05,0.11,3,3,11,4,8,4
5,MLM,60.65*,5.84,26.48,8.66,-0.05,0.12,4,6,8,3,11,5
6,NegationOperator,71.47*,4.93,18.81,6.4,-0.06,0.12,5,4,10,8,4,10
7,POSAwareAntonymReplacement,83.96*,7.25,5.0,7.79,-0.06,0.12,2,2,12,5,7,8
8,SemanticFusionCrossover,39.74,2.63,56.02,5.86,-0.05,0.09,11,12,2,11,9,1
9,SemanticSimilarityCrossover,21.87,2.26,0.0,9.98,-0.05,0.09,12,1,1,2,12,2



Final report table saved to: /Users/onkars/Documents/Projects/eost-cam-llm/experiments/rq2_final_report_table.pdf
Final report table exported to: /Users/onkars/Documents/Projects/eost-cam-llm/experiments/rq2_final_report_table.csv


## Generate LaTeX Table for Report with Statistical Significance
