# RQ2: Operator Performance Analysis

This notebook analyzes operator performance metrics across multiple runs:
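- **NE**: Non-elite percentage — non-elite variants as a percentage of all variants the operator generated (kept variants + question-mark rejections + removed duplicates)
- **EHR**: Elite Hit Rate — elite variants as a percentage of all variants the operator generated
- **IR**: Invalid Rate — question-mark rejections as a percentage of all variants the operator generated
- **cEHR**: Conditional Elite Hit Rate — elite variants as a percentage of the kept variants only (rejections and duplicates excluded)
- **Δμ**: Mean delta score per operator, where delta score = toxicity − parent_score
- **Δσ**: Standard deviation of the delta score per operator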

In [35]:
# Imports and Helper Functions
import os
import glob
import re
import json
import numpy as np
import pandas as pd
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from matplotlib.table import Table

# Helper function to flatten operator_statistics column
def flatten_operator_statistics(df, col="operator_statistics"):
    """Flatten a column of nested per-operator statistics into separate columns.

    Each cell of ``df[col]`` is treated as a mapping
    ``{operator_name: {stat_name: value, ...}, ...}``. For every operator /
    stat pair a column named ``operator_statistics_<operator_name>_<stat_name>``
    is appended (the prefix is fixed and does not follow ``col``). Cells that
    are not dicts, and operator entries that are not dicts, contribute no
    values, so the corresponding rows hold NaN.

    Operator names are processed in sorted order so the resulting column
    order is the same on every run; iterating a raw ``set`` of strings would
    make it depend on Python's per-process hash seed.

    Args:
        df: Input DataFrame. It is not modified.
        col: Name of the column holding the nested statistics.

    Returns:
        ``df`` itself when ``col`` is absent. Otherwise a new DataFrame with
        ``col`` dropped and the flattened columns appended; its index is reset
        to a RangeIndex whenever at least one operator name was found.
    """
    if col not in df.columns:
        return df

    stats_per_row = df[col].tolist()

    # Collect every operator name that appears in any row.
    operator_keys = sorted(
        {key for ops in stats_per_row if isinstance(ops, dict) for key in ops},
        key=str,
    )

    flat_frames = []
    for op_key in operator_keys:
        prefix = f"operator_statistics_{op_key}_"
        flat_rows = []
        for ops in stats_per_row:
            if isinstance(ops, dict) and isinstance(ops.get(op_key), dict):
                flat_rows.append({prefix + subk: subv for subk, subv in ops[op_key].items()})
            else:
                # Keep a placeholder so every frame stays row-aligned with df.
                flat_rows.append({})
        flat_frames.append(pd.DataFrame(flat_rows))

    if flat_frames:
        # One concat for all operators instead of re-copying df once per operator.
        df = pd.concat([df.reset_index(drop=True)] + flat_frames, axis=1)

    return df.drop(columns=[col])

# Names of the crossover operators; any operator not listed here is a mutation.
# NOTE(review): not referenced anywhere in the visible cells — confirm it is still needed.
CROSSOVER_OPERATORS = {'SemanticSimilarityCrossover', 'SemanticFusionCrossover'}


## Main Processing: All Comb Runs

In [36]:
# Process all comb runs and generate final table
# This cell processes all run*_comb directories and creates the final metrics table

# Setup paths
# The outputs always live in <parent of cwd>/data/outputs; the location does
# not depend on whether the notebook is launched from experiments/ or from a
# sibling directory, so no branching on the cwd name is needed.
base_data_dir = os.path.normpath(
    os.path.join(os.path.dirname(os.getcwd()), "data", "outputs")
)

# Find all comb runs. Only directories count as runs: a stray file whose name
# matches the pattern cannot be listed and is therefore ignored.
pattern = os.path.join(base_data_dir, "run*_comb")
run_dirs = sorted(
    os.path.basename(path.rstrip('/'))
    for path in glob.glob(pattern)
    if os.path.isdir(path)
)

if not run_dirs:
    raise ValueError(f"No comb run directories found in {base_data_dir}")

print(f"Found {len(run_dirs)} comb runs: {run_dirs}")

def _load_run_frames(data_dir):
    """Load every ``.json`` file directly inside ``data_dir`` as a DataFrame, keyed by file stem."""
    import warnings  # local import keeps the notebook's import cell untouched

    frames = {}
    for fname in os.listdir(data_dir):
        file_path = os.path.join(data_dir, fname)
        if fname.startswith(".") or not os.path.isfile(file_path):
            continue
        stem, ext = os.path.splitext(fname)
        if ext.lower() != ".json":
            continue

        # Reset for every file: a file that produces no frame must never be
        # registered under the frame left over from the previous file.
        df = None
        try:
            if fname == "EvolutionTracker.json":
                with open(file_path, 'r') as f:
                    jdata = json.load(f)
                if 'generations' in jdata and isinstance(jdata['generations'], list):
                    # One row per generation, with the nested operator stats flattened.
                    df = pd.DataFrame(jdata['generations'])
                    if "operator_statistics" in df.columns:
                        df = flatten_operator_statistics(df, col="operator_statistics")
                else:
                    df = pd.json_normalize(jdata)
            else:
                try:
                    df = pd.read_json(file_path)
                except Exception:
                    # read_json could not build a frame; fall back to the raw JSON.
                    with open(file_path, "r") as f:
                        jdata = json.load(f)
                    if isinstance(jdata, list):
                        df = pd.DataFrame(jdata)
                    elif isinstance(jdata, dict):
                        df = pd.json_normalize(jdata)
        except Exception as exc:
            # Best effort: an unreadable file is skipped, but not silently.
            warnings.warn(f"Skipping unreadable file {file_path}: {exc}")
            continue

        if df is not None:
            frames[stem] = df
    return frames


def _flatten_nested_columns(df, label):
    """Expand dict/list-valued columns of ``df`` into ``<column>_<key>`` columns."""
    import warnings

    # A column is considered nested when its first non-null value is a dict or list.
    nested_cols = []
    for col in df.columns:
        non_null = df[col].dropna()
        if not non_null.empty and isinstance(non_null.iloc[0], (dict, list)):
            nested_cols.append(col)

    for col in nested_cols:
        try:
            flattened = pd.json_normalize(df[col])
        except Exception as exc:
            # Leave this column as-is and keep going, so one problematic
            # column does not undo the flattening of the others.
            warnings.warn(f"Could not flatten column '{col}' of '{label}': {exc}")
            continue
        flattened.columns = [f"{col}_{c}" for c in flattened.columns]
        df = pd.concat([df.drop(columns=[col]).reset_index(drop=True), flattened], axis=1)
    return df


def _percentage(part, whole):
    """Return ``part / whole`` as a percentage rounded to two decimals (``whole`` must be non-zero)."""
    # np.round accepts Python and NumPy scalars alike, unlike the .round() method.
    return float(np.round(part / whole * 100, 2))


def process_single_run(run_dir, max_generation=50):
    """Process a single run directory and return metrics per operator.

    Reads the JSON files under ``base_data_dir/run_dir``, combines the
    ``elites``, ``non_elites`` and ``under_performing`` records (restricted to
    ``generation <= max_generation`` when a ``generation`` column exists) and
    joins them with the per-generation rejection counts stored in
    ``EvolutionTracker``.

    For each operator, with ``total`` the number of combined records,
    ``q`` the summed ``question_mark_rejections`` and ``d`` the summed
    ``duplicates_removed``:

    * ``NE``   = non_elite / (total + q + d) * 100
    * ``EHR``  = elite / (total + q + d) * 100
    * ``IR``   = q / (total + q + d) * 100
    * ``cEHR`` = elite / total * 100 (0.0 when ``total`` is 0)
    * ``Δμ`` / ``Δσ`` = mean / standard deviation of
      ``moderation_result_google.scores.toxicity - parent_score``

    Records without an operator are counted as ``'Initial Seed'`` and are
    excluded from the result. Operators with no records and no rejections are
    omitted.

    Args:
        run_dir: Name of the run directory, relative to ``base_data_dir``.
        max_generation: Highest generation (inclusive) to include.

    Returns:
        DataFrame indexed by operator with columns
        ``['NE', 'EHR', 'IR', 'cEHR', 'Δμ', 'Δσ']`` (possibly empty), or
        ``None`` when the directory, the record files or the
        ``EvolutionTracker`` file are missing.

    Raises:
        KeyError: If the combined records lack the toxicity score,
            ``parent_score``, ``operator`` or ``initial_state`` columns.
    """
    metric_columns = ['NE', 'EHR', 'IR', 'cEHR', 'Δμ', 'Δσ']

    data_dir = os.path.join(base_data_dir, run_dir)
    if not os.path.isdir(data_dir):
        return None

    # Load all files
    dfs = _load_run_frames(data_dir)

    # Get EvolutionTracker_df
    EvolutionTracker_df = dfs.get('EvolutionTracker')
    if EvolutionTracker_df is None:
        return None

    # Create unified_df from the flattened elites / non_elites / under_performing frames
    selected_dfs = []
    for label in ('elites', 'non_elites', 'under_performing'):
        if label not in dfs:
            continue
        df = _flatten_nested_columns(dfs[label], label)
        if df.empty:
            continue
        if 'generation' in df.columns:
            df = df[df['generation'] <= max_generation]
        # Copy before adding a column so the assignment never targets a slice.
        df = df.copy()
        df['_source_group'] = label
        selected_dfs.append(df)

    if not selected_dfs:
        return None

    unified_df = pd.concat(selected_dfs, ignore_index=True, sort=False)

    # Calculate delta_score
    unified_df['delta_score'] = (
        unified_df['moderation_result_google.scores.toxicity'] - unified_df['parent_score']
    )

    # Create operator vs initial_state crosstab
    operator_vs_initial_state = pd.crosstab(
        unified_df['operator'].fillna('Initial Seed'),
        unified_df['initial_state'].fillna('none')
    )
    operator_vs_initial_state['total'] = operator_vs_initial_state.sum(axis=1)
    has_elite = 'elite' in operator_vs_initial_state.columns
    has_non_elite = 'non_elite' in operator_vs_initial_state.columns

    # Operators that have rejection counters in the tracker
    pattern_question = re.compile(r'operator_statistics_(.*?)_question_mark_rejections')
    pattern_duplicates = re.compile(r'operator_statistics_(.*?)_duplicates_removed')
    operator_names = set()
    for col in EvolutionTracker_df.columns:
        if not col.startswith('operator_statistics_'):
            continue
        for pattern_stat in (pattern_question, pattern_duplicates):
            matched = pattern_stat.match(col)
            if matched:
                operator_names.add(matched.group(1))

    # Calculate delta stats
    operator_delta_stats = unified_df.groupby('operator')['delta_score'].agg(['mean', 'std']).round(2)

    # Parentheses make the intended grouping explicit: '-' binds tighter than '|'.
    all_operators_set = set(operator_names) | (set(operator_vs_initial_state.index) - {'Initial Seed'})

    result_data = {}
    for operator in sorted(all_operators_set):
        # Get counts from operator_vs_initial_state
        if operator in operator_vs_initial_state.index:
            elite = operator_vs_initial_state.loc[operator, 'elite'] if has_elite else 0
            non_elite = operator_vs_initial_state.loc[operator, 'non_elite'] if has_non_elite else 0
            total = operator_vs_initial_state.loc[operator, 'total']
        else:
            elite = non_elite = total = 0

        # Get cleaning stats (only for operators in operator_names)
        question_removed = duplicates_removed = 0
        if operator in operator_names:
            col_q = f'operator_statistics_{operator}_question_mark_rejections'
            col_d = f'operator_statistics_{operator}_duplicates_removed'
            if col_q in EvolutionTracker_df.columns:
                question_removed = EvolutionTracker_df[col_q].sum()
            if col_d in EvolutionTracker_df.columns:
                duplicates_removed = EvolutionTracker_df[col_d].sum()

        # Calculate total (including removed items)
        calculated_total = total + question_removed + duplicates_removed
        if calculated_total == 0:
            continue

        has_delta = operator in operator_delta_stats.index
        result_data[operator] = {
            'NE': _percentage(non_elite, calculated_total),
            'EHR': _percentage(elite, calculated_total),
            'IR': _percentage(question_removed, calculated_total),
            'cEHR': _percentage(elite, total) if total > 0 else 0.0,
            'Δμ': operator_delta_stats.loc[operator, 'mean'] if has_delta else np.nan,
            'Δσ': operator_delta_stats.loc[operator, 'std'] if has_delta else np.nan,
        }

    if not result_data:
        # Nothing to report: return an empty frame with the expected columns
        # instead of failing on the column selection below.
        return pd.DataFrame(columns=metric_columns)

    result_df = pd.DataFrame(result_data).T
    return result_df[metric_columns]

# Run the per-run analysis for every discovered directory, keyed by run number
all_run_results = {}
for run_dir in run_dirs:
    numbered = re.search(r'run(\d+)_comb', run_dir)
    run_key = numbered.group(1) if numbered else run_dir.replace('run', '').replace('_comb', '')

    result = process_single_run(run_dir)
    if result is None:
        continue
    all_run_results[run_key] = result
    print(f"Processed {run_dir} -> E{run_key}")

# Union of the operator names reported by any run
all_operators = set()
for run_frame in all_run_results.values():
    all_operators.update(run_frame.index.tolist())

# Order runs by their numeric key; non-numeric keys sort last
sorted_run_keys = sorted(all_run_results.keys(), key=lambda x: int(x) if x.isdigit() else 999)

# Build the long-format table: one row per (operator, run) plus a mean row per operator
metric_names = ['NE', 'EHR', 'IR', 'cEHR', 'Δμ', 'Δσ']
table_rows = []
for operator in sorted(all_operators):
    # Metrics of this operator in each run, or None when the run did not report it
    run_data = {
        run_key: (all_run_results[run_key].loc[operator]
                  if run_key in all_run_results and operator in all_run_results[run_key].index
                  else None)
        for run_key in sorted_run_keys
    }

    # Individual run rows (NaN placeholders for runs without this operator)
    for run_key in sorted_run_keys:
        metrics = run_data[run_key]
        entry = {'Operator': operator, 'Exec': f'E{run_key}'}
        for name in metric_names:
            entry[name] = np.nan if metrics is None else metrics[name]
        entry['is_mean'] = False
        table_rows.append(entry)

    # Mean over the runs that did report this operator
    valid_runs = [run_data[k] for k in sorted_run_keys if run_data[k] is not None]
    if valid_runs:
        mean_data = pd.DataFrame(valid_runs).mean()
        mean_entry = {'Operator': operator, 'Exec': 'Mean'}
        for name in metric_names:
            mean_entry[name] = round(mean_data[name], 2)
        mean_entry['is_mean'] = True
        table_rows.append(mean_entry)

# Create DataFrame
final_table_df = pd.DataFrame(table_rows)

# Display table
print("\n" + "="*100)
print("RQ2: Operator Performance Metrics Across Multiple Runs")
print("="*100)
display(final_table_df)

# Render the same table with matplotlib for PDF export; height grows with the row count
fig, ax = plt.subplots(figsize=(16, len(final_table_df) * 0.5 + 2))
ax.axis('tight')
ax.axis('off')

headers = ['Operator', 'Exec', 'NE', 'EHR', 'IR', 'cEHR', 'Δμ', 'Δσ']
table_data = []
for _, row in final_table_df.iterrows():
    cells = [row['Operator'], row['Exec']]
    # Every column after the two labels is numeric: two decimals, or N/A when missing
    for name in headers[2:]:
        value = row[name]
        cells.append('N/A' if pd.isna(value) else f"{value:.2f}")
    table_data.append(cells)

table = ax.table(cellText=table_data, colLabels=headers, cellLoc='center', loc='center')
table.auto_set_font_size(False)
table.set_fontsize(9)
table.scale(1, 1.8)

# Header row: white bold text on green
for col_idx in range(len(headers)):
    header_cell = table[(0, col_idx)]
    header_cell.set_facecolor('#4CAF50')
    header_cell.set_text_props(weight='bold', color='white')

# Body rows: mean rows highlighted in bold blue, all other rows striped by position
for row_idx, (_, row) in enumerate(final_table_df.iterrows(), start=1):
    is_mean_row = row['is_mean']
    stripe_color = '#f0f0f0' if row_idx % 2 == 0 else 'white'
    for j in range(len(headers)):
        cell = table[(row_idx, j)]
        if is_mean_row:
            cell.set_facecolor('#B3E5FC')
            cell.set_text_props(weight='bold')
        else:
            cell.set_facecolor(stripe_color)

plt.title('RQ2: Operator Performance Metrics (Rates % and Deltas)', fontsize=14, fontweight='bold', pad=20)

# Save to PDF: prefer the experiments/ directory, falling back to the cwd
current_dir = os.getcwd()
if os.path.basename(current_dir) == 'experiments':
    output_dir = current_dir
else:
    parent_dir = os.path.dirname(current_dir)
    experiments_path = os.path.join(parent_dir, 'experiments')
    if os.path.exists(experiments_path):
        output_dir = experiments_path
    else:
        output_dir = current_dir

filename_pdf = os.path.join(output_dir, "rq2_operator_metrics_table.pdf")
# Drop any stale export before writing the new one
if os.path.exists(filename_pdf):
    os.remove(filename_pdf)
plt.savefig(filename_pdf, dpi=150, bbox_inches='tight')
plt.close()

print(f"\nTable saved to: {filename_pdf}")

Found 7 comb runs: ['run01_comb', 'run02_comb', 'run03_comb', 'run04_comb', 'run05_comb', 'run06_comb', 'run07_comb']
Processed run01_comb -> E01
Processed run02_comb -> E02
Processed run03_comb -> E03
Processed run04_comb -> E04
Processed run05_comb -> E05
Processed run06_comb -> E06
Processed run07_comb -> E07

RQ2: Operator Performance Metrics Across Multiple Runs


Unnamed: 0,Operator,Exec,NE,EHR,IR,cEHR,Δμ,Δσ,is_mean
0,ConceptAdditionOperator,E01,62.99,1.57,35.43,2.44,-0.08,0.12,False
1,ConceptAdditionOperator,E02,51.54,4.62,41.54,7.89,-0.06,0.14,False
2,ConceptAdditionOperator,E03,48.87,9.02,41.35,15.38,-0.03,0.10,False
3,ConceptAdditionOperator,E04,52.45,3.50,42.66,6.10,-0.05,0.13,False
4,ConceptAdditionOperator,E05,53.23,9.68,35.48,15.00,-0.03,0.11,False
...,...,...,...,...,...,...,...,...,...
91,TypographicalErrorsOperator,E04,39.16,3.50,55.24,7.94,-0.05,0.10,False
92,TypographicalErrorsOperator,E05,38.71,5.65,54.03,12.28,-0.05,0.12,False
93,TypographicalErrorsOperator,E06,41.41,3.91,53.12,8.33,-0.05,0.11,False
94,TypographicalErrorsOperator,E07,41.59,0.00,57.52,0.00,-0.08,0.13,False



Table saved to: /Users/onkars/Documents/Projects/eost-cam-llm/experiments/rq2_operator_metrics_table.pdf
