In [2]:
import pandas as pd
import os
from pathlib import Path

# Root directory handed to read_all_final_scores() below; its immediate
# subfolders are the ones checked for a final_score.csv file.
csv_dir = '/home/eeyifanshen/e2e_audio_LLM/multi_modal_vap/output'

def read_all_final_scores(output_dir):
    """Collect the 'Average' row of every final_score.csv under *output_dir*.

    Each immediate entry of *output_dir* is checked for a file named
    ``final_score.csv``. From every such file the rows whose ``model``
    column equals ``'Average'`` are kept and tagged with the name of the
    containing folder in a new ``experiment`` column. A status line is
    printed for every CSV that is examined.

    Args:
        output_dir: Directory whose subfolders are scanned.

    Returns:
        A DataFrame with one row per 'Average' row found, ``experiment`` as
        the first column and rows ordered by folder name, or ``None`` when
        no 'Average' row was found anywhere.

    Raises:
        OSError: If *output_dir* cannot be listed (for example it does not
            exist or is not a directory).
    """
    all_averages = []

    # os.listdir() yields entries in arbitrary order; sorting makes the row
    # order (and any first-match tie-breaking done on the result) reproducible.
    for folder in sorted(os.listdir(output_dir)):
        csv_path = os.path.join(output_dir, folder, 'final_score.csv')
        # isfile() is False for plain files in output_dir, for folders without
        # the CSV, and for a directory that merely carries the CSV's name.
        if not os.path.isfile(csv_path):
            continue

        # Only the read itself is guarded. pandas' ParserError/EmptyDataError
        # and UnicodeDecodeError are ValueError subclasses; I/O failures are
        # OSError. One bad file must not abort the whole scan.
        try:
            df = pd.read_csv(csv_path)
        except (OSError, ValueError) as e:
            print(f"✗ Error reading {csv_path}: {e}")
            continue

        # Report a missing column explicitly instead of surfacing a bare
        # KeyError message.
        if 'model' not in df.columns:
            print(f"✗ Error reading {csv_path}: missing 'model' column")
            continue

        # .copy() so adding the 'experiment' column does not write into a
        # slice of df.
        avg_row = df[df['model'] == 'Average'].copy()
        if avg_row.empty:
            print(f"⚠ No 'Average' row found in: {folder}")
            continue

        avg_row['experiment'] = folder
        all_averages.append(avg_row)
        print(f"✓ Found average scores for: {folder}")

    if not all_averages:
        print("No valid final_score.csv files found!")
        return None

    combined_df = pd.concat(all_averages, ignore_index=True)

    # Put the experiment name first; all other columns keep their order.
    cols = ['experiment'] + [col for col in combined_df.columns if col != 'experiment']
    return combined_df[cols]

# Gather the 'Average' row of every experiment found under csv_dir.
print("Scanning for final_score.csv files...")
print("=" * 50)
results_df = read_all_final_scores(csv_dir)

# read_all_final_scores() returns None when nothing usable was found; the
# whole report below is skipped in that case.
if results_df is not None:
    banner = "=" * 50
    print("\n" + banner)
    print("COMPARISON OF AVERAGE SCORES ACROSS EXPERIMENTS")
    print(banner)

    # Headline table, printed only if every listed column is present.
    # Previously also listed here (currently disabled): 'shift_f1',
    # 'shift_precision', 'shift_recall', 'hold_f1', 'hold_precision',
    # 'hold_recall', 'accuracy', 'top_k_accuracy'.
    key_metrics = ['experiment', 'test_loss']
    missing_key_metrics = [name for name in key_metrics
                           if name not in results_df.columns]
    if not missing_key_metrics:
        print("\nKEY PERFORMANCE METRICS:")
        print("-" * 30)
        display_df = results_df[key_metrics].round(4)
        print(display_df.to_string(index=False, max_colwidth=25))

    # Secondary table: shows whichever of these columns exist
    # ('ov_pred' used to be in this list and is currently disabled).
    additional_metrics = ['shift_hold', 'short_long', 'shift_pred', 'bc_pred']
    available_additional = [
        name for name in additional_metrics if name in results_df.columns
    ]
    if available_additional:
        print("\n\nADDITIONAL METRICS:")
        print("-" * 20)
        additional_columns = ['experiment'] + available_additional
        additional_df = results_df[additional_columns].round(4)
        print(additional_df.to_string(index=False, max_colwidth=25))

    # Winner per metric: label -> (column, 'min' or 'max' = which end wins).
    print("\n\nBEST PERFORMING EXPERIMENTS:")
    print("-" * 35)

    metrics_to_check = {
        'Lowest Test Loss': ('test_loss', 'min'),
        'Highest Shift F1': ('shift_f1', 'max'),
        'Highest Accuracy': ('accuracy', 'max'),
        'Highest Hold F1': ('hold_f1', 'max'),
    }

    for label, (col_name, direction) in metrics_to_check.items():
        # Metrics whose column is absent are silently left out of the list.
        if col_name not in results_df.columns:
            continue

        scores = results_df[col_name]
        winner_idx = scores.idxmin() if direction == 'min' else scores.idxmax()

        winner_name = results_df.loc[winner_idx, 'experiment']
        winner_score = results_df.loc[winner_idx, col_name]
        print(f"{label:20}: {winner_name} ({winner_score:.4f})")

    # Closing summary of the combined frame.
    print(f"\n\nFull DataFrame shape: {results_df.shape}")
    print("All columns:", list(results_df.columns))

Scanning for final_score.csv files...
✓ Found average scores for: audio_wav_vis_null_2025_06_11_143013
✓ Found average scores for: audio_wav_va_vis_null_2025_06_11_143157
✓ Found average scores for: audio_full_vis_null_2025_06_11_143327
✓ Found average scores for: audio_null_vis_gaze_2025_06_11_143518
✓ Found average scores for: audio_null_vis_au_2025_06_11_143655
✓ Found average scores for: audio_null_vis_head_2025_06_11_143810
✓ Found average scores for: audio_null_vis_pose_2025_06_11_143955
✓ Found average scores for: audio_null_vis_full_2025_06_11_144146
✓ Found average scores for: audio_full_vis_gaze_2025_06_11_144446
✓ Found average scores for: audio_full_vis_au_2025_06_11_144658
✓ Found average scores for: audio_full_vis_head_2025_06_11_144859
✓ Found average scores for: audio_full_vis_pose_2025_06_11_145135
✓ Found average scores for: audio_full_vis_full_2025_06_11_145413

COMPARISON OF AVERAGE SCORES ACROSS EXPERIMENTS

KEY PERFORMANCE METRICS:
------------------------------
 