In [11]:

import json
import math
from pathlib import Path
from collections import defaultdict
import statistics
from typing import Tuple, Dict, Any

def calculate_scores(
    directory_path: str,
    accuracy_metric: str = 'answer_correctness',
) -> Tuple[Dict[str, Any], float]:
    """
    Average every numeric metric found in a directory of JSON result files.

    Each ``*.json`` file directly inside ``directory_path`` is expected to hold
    a JSON object of the form ``{category: {metric: value}}``. Values are
    pooled across all files per (category, metric) pair and averaged. Every
    value stored under ``accuracy_metric`` is additionally pooled across all
    categories to produce a single overall accuracy figure.

    Value handling:
        - NaN is counted as 0.0.
        - Booleans and all other non-numeric values are skipped with a warning
          (``bool`` is a subclass of ``int`` in Python, so it has to be
          rejected explicitly or ``true``/``false`` would be averaged as 1/0).

    Malformed input is reported on stdout and skipped, never raised:
        - a file that cannot be read or parsed is skipped entirely;
        - a file whose top-level JSON value is not an object is skipped;
        - a category whose value is not an object is skipped, while the
          remaining categories of that file are still counted.

    Args:
        directory_path: The path to the directory containing the JSON files.
        accuracy_metric: Name of the metric pooled into the overall average.
            Defaults to ``'answer_correctness'``.

    Returns:
        A tuple containing:
        - A dictionary mapping each category to a ``{metric: average}``
          dictionary. Empty when the directory is missing or holds no JSON
          files.
        - A float with the mean of all ``accuracy_metric`` values, or 0.0 when
          none were found.
    """
    all_scores = defaultdict(lambda: defaultdict(list))
    # Every accuracy value, regardless of category, for the overall average.
    all_accuracy_scores = []

    data_dir = Path(directory_path)

    if not data_dir.is_dir():
        print(f"Error: Directory not found at '{directory_path}'")
        return {}, 0.0

    # Sorted so files are processed, and warnings printed, in a stable order
    # instead of whatever order the filesystem happens to return.
    json_files = sorted(data_dir.glob('*.json'))
    if not json_files:
        print(f"No JSON files found in '{directory_path}'")
        return {}, 0.0

    print(f"Found {len(json_files)} JSON files to process...")

    for file_path in json_files:
        # Only reading/parsing is guarded; the structure is validated
        # explicitly below so a bad entry can never leave a file half-counted
        # behind a "Skipping file" message.
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except json.JSONDecodeError:
            print(f"⚠️  Warning: Could not decode JSON from {file_path.name}. Skipping file.")
            continue
        except Exception as e:
            # Deliberately broad: one unreadable file must not abort the run.
            print(f"An error occurred with {file_path.name}: {e}. Skipping file.")
            continue

        if not isinstance(data, dict):
            print(f"⚠️  Warning: Expected a JSON object at the top level of {file_path.name}. Skipping file.")
            continue

        for category, metrics in data.items():
            if not isinstance(metrics, dict):
                print(f"⚠️  Warning: Skipping category '{category}' in {file_path.name}: expected an object of metrics.")
                continue

            for metric, value in metrics.items():
                # Reject bool before the numeric check: isinstance(True, int)
                # is True, which would otherwise count flags as scores.
                if isinstance(value, bool) or not isinstance(value, (int, float)):
                    print(f"⚠️  Warning: Skipping non-numeric value '{value}' for '{metric}' in {file_path.name}.")
                    continue

                # json.load turns the bare literal NaN into float('nan');
                # count it as a zero score rather than poisoning the mean.
                if isinstance(value, float) and math.isnan(value):
                    value = 0.0

                all_scores[category][metric].append(value)
                if metric == accuracy_metric:
                    all_accuracy_scores.append(value)

    # --- Calculate per-category averages ---
    # Lists in all_scores are only created by an append, so none is empty.
    average_scores = defaultdict(dict)
    for category, metrics in all_scores.items():
        for metric, values in metrics.items():
            average_scores[category][metric] = statistics.mean(values)

    # --- Calculate the single overall average accuracy ---
    overall_avg_accuracy = statistics.mean(all_accuracy_scores) if all_accuracy_scores else 0.0

    return average_scores, overall_avg_accuracy



  

In [15]:
# ❗️ IMPORTANT: Change this path to the directory that contains your JSON result files.
DIRECTORY_PATH = "/home/nick/projects/GraphRAG-Benchmark/results/embed"

# Aggregate every JSON result file found directly under DIRECTORY_PATH.
category_averages, overall_accuracy = calculate_scores(DIRECTORY_PATH)

# An empty result means nothing was aggregated, so there is no report to show.
if category_averages:
    # Build the whole report first, then emit it in one go.
    report_lines = ["\n--- Average Scores by Category ---"]

    # Categories, and the metrics inside each, are listed alphabetically.
    for category in sorted(category_averages):
        report_lines.append(f"\n## {category}")
        report_lines.extend(
            f"  - Average {metric}: {avg_value:.4f}"
            for metric, avg_value in sorted(category_averages[category].items())
        )

    # Footer: the single accuracy figure pooled across all categories.
    report_lines += [
        "\n----------------------------------",
        "--- Overall Performance ---",
        f"📈 Overall Average Accuracy: {overall_accuracy:.4f}",
        "---------------------------",
    ]

    print("\n".join(report_lines))

Found 20 JSON files to process...

--- Average Scores by Category ---

## Complex Reasoning
  - Average answer_correctness: 0.5478
  - Average rouge_score: 0.2591

## Contextual Summarize
  - Average answer_correctness: 0.6038
  - Average coverage_score: 0.6991

## Creative Generation
  - Average answer_correctness: 0.6316
  - Average coverage_score: 0.2897
  - Average faithfulness: 0.7666

## Fact Retrieval
  - Average answer_correctness: 0.4996
  - Average rouge_score: 0.3839

----------------------------------
--- Overall Performance ---
📈 Overall Average Accuracy: 0.5683
---------------------------
