In [1]:
import json
import numpy as np
import pandas as pd
import re

# Input locations. To score the other run set, point RESULTS_PATH_TEMPLATE at
# "results/new/agent_results_with_eval{run}.json" instead.
GROUND_TRUTH_PATH = "qa/nvda_ground_truth3.json"
RESULTS_PATH_TEMPLATE = "results/baseline_results/agent_results{run}.json"
N_RUNS = 10  # result files are numbered 1..N_RUNS

# Load ground truth. The encoding is pinned so the JSON is decoded the same way
# on every platform instead of depending on the locale default.
with open(GROUND_TRUTH_PATH, encoding="utf-8") as f:
    ground_truth = json.load(f)

# Load one agent result file per run, in run order.
agent_results_list = []
for i in range(1, N_RUNS + 1):
    with open(RESULTS_PATH_TEMPLATE.format(run=i), "r", encoding="utf-8") as f:
        agent_results_list.append(json.load(f))

print(f"Loaded {len(agent_results_list)} result files")
print(f"Ground truth has {len(ground_truth)} questions")

Loaded 10 result files
Ground truth has 3 questions


In [2]:
def parse_year_str(y):
    """Normalize a fiscal-year label to a four-digit year string.

    Accepts values such as ``'25'``, ``'FY25'``, ``'fy2025'`` or ``2025`` and
    returns ``'2025'``.  Two-digit years are assumed to be in the 2000s.

    Args:
        y: Year label (string or number).  An "FY" marker in any letter case
            and surrounding whitespace are ignored.

    Returns:
        The four-digit year as a string, or ``None`` when ``y`` does not
        reduce to exactly two or four ASCII digits (for example ``None``,
        ``'abcd'`` or ``'202'``).
    """
    # Strip the marker before trimming so "FY 25" and " FY25 " both reduce to "25".
    text = re.sub(r"fy", "", str(y), flags=re.IGNORECASE).strip()
    # Checking the length alone would let str(None) == "None" or "abcd" through.
    if not re.fullmatch(r"[0-9]{2}|[0-9]{4}", text):
        return None
    return f"20{text}" if len(text) == 2 else text

# Quarter token with an optional year on either side: "Q2 FY26", "FY26Q2",
# "Q2FY26", "FY26 Q2", "FY2026-Q2".  Groups: (year_before, quarter, year_after).
_PERIOD_RE = re.compile(
    r"(?:FY)?(\d{2,4})?[-\s]*(Q\d)[-\s]*(?:FY)?(\d{2,4})?",
    re.IGNORECASE,
)


def _agent_year(raw):
    """Return a validated four-digit year string for ``raw``, or None."""
    if raw is None:
        return None
    year = parse_year_str(raw)
    # Re-check the digits here so a non-numeric result never ends up in a key.
    return year if year and year.isdigit() else None


def _agent_period_key(item):
    """Return 'FY<yyyy>-Q<n>' or 'FY<yyyy>' for one agent record, or None without a valid year."""
    quarter = None
    year_text = None
    # Period strings may carry the quarter alone ("Q2") or quarter plus year ("Q2 FY26").
    for field in ("fiscal_period", "quarter", "fiscal_quarter"):
        text = item.get(field)
        if not isinstance(text, str):
            continue
        for year_before, q, year_after in _PERIOD_RE.findall(text):
            quarter = quarter or q.upper()
            year_text = year_text or year_before or year_after

    year = _agent_year(year_text)
    if year is None:
        # Fall back to a dedicated year field; "fiscal_period" is tried last so
        # an annual label such as "FY2025" still yields a key.
        for field in ("year", "fiscal_year", "fiscal_period"):
            year = _agent_year(item.get(field))
            if year is not None:
                break
    if year is None:
        return None
    return f"FY{year}-{quarter}" if quarter else f"FY{year}"


def _agent_number(value, blanks=()):
    """Convert ``value`` to float; None/blank markers become None, unparseable values are kept as-is."""
    if value is None or (isinstance(value, str) and value.strip() in blanks):
        return None
    try:
        return float(value)
    except (TypeError, ValueError):
        # Keep the raw value so the comparison step can report it as a
        # mismatch instead of the whole evaluation aborting here.
        return value


def normalize_agent_data_generic(agent_list):
    """Index agent records by fiscal period and canonicalize their metric fields.

    The period key is ``'FY<yyyy>-Q<n>'`` for quarterly records and
    ``'FY<yyyy>'`` for annual ones.  It is built from a combined period string
    (``fiscal_period`` / ``quarter`` / ``fiscal_quarter``, e.g. ``'Q2 FY26'``
    or ``'FY26Q2'``) and, when that string carries no year, from a separate
    ``year`` / ``fiscal_year`` field.  Records without a valid year are skipped.

    Field names are lower-cased and trimmed; known aliases are mapped to
    ``opex``, ``gross_margin``, ``efficiency_ratio`` and ``yoy_change (%)`` and
    their values converted to float.  Strings ending in ``%`` have the sign
    removed before conversion.  Values that cannot be converted are stored
    unchanged rather than raising.

    Args:
        agent_list: Iterable of dict records; ``None`` and non-dict items are
            ignored.

    Returns:
        Dict mapping period key -> normalized record.  When several records
        share a key, the last one wins.
    """
    normalized = {}
    for item in agent_list or []:
        if not isinstance(item, dict):
            continue

        key = _agent_period_key(item)
        if key is None:
            continue

        entry = {}
        for k, v in item.items():
            k_norm = k.lower().strip()
            if isinstance(v, str) and v.endswith("%"):
                try:
                    v = float(v.replace("%", "").strip())
                except ValueError:
                    pass

            # Normalize field names and values
            if k_norm in ("operating_expenses", "opex"):
                entry["opex"] = _agent_number(v)
            elif "gross" in k_norm and "margin" in k_norm:
                entry["gross_margin"] = _agent_number(v)
            elif k_norm in ("computed_value", "efficiency_ratio", "operating_efficiency_ratio"):
                entry["efficiency_ratio"] = _agent_number(v)
            elif k_norm in ("yoy_change", "yoy_change (%)", "yoy_change(%)"):
                entry["yoy_change (%)"] = _agent_number(v, blanks=("None", ""))
            else:
                entry[k_norm] = v

        normalized[key] = entry
    return normalized

def normalize_gt_data_generic(gt_list):
    """Index ground-truth records by fiscal period with canonical metric names.

    Each record needs a truthy ``fiscal_year`` (or ``year``); records without
    one are skipped.  The key is ``'FY<year>-<fiscal_quarter>'`` when the record
    has a ``fiscal_quarter`` field and ``'FY<year>'`` otherwise.  Field names
    are lower-cased and trimmed, known aliases are mapped to ``opex``,
    ``gross_margin``, ``efficiency_ratio`` and ``yoy_change (%)`` with float
    values, and every other field is copied through unchanged.

    Returns:
        Dict mapping period key -> normalized record (last record wins on
        duplicate keys).
    """
    opex_names = ("operating_expenses", "opex")
    ratio_names = ("computed_value", "efficiency_ratio", "operating_efficiency_ratio")
    yoy_names = ("yoy_change", "yoy_change (%)", "yoy_change(%)")

    by_period = {}
    for record in gt_list:
        raw_year = record.get("fiscal_year") or record.get("year")
        if not raw_year:
            continue

        year = parse_year_str(raw_year)
        if "fiscal_quarter" in record:
            quarter = record["fiscal_quarter"].strip()
            period = f"FY{year}-{quarter}"
        else:
            period = f"FY{year}"

        fields = {}
        for raw_name, value in record.items():
            name = raw_name.lower().strip()
            # Percent strings such as "75.1%" become plain floats.
            if isinstance(value, str) and value.endswith("%"):
                value = float(value.replace("%", "").strip())

            is_margin = "gross" in name and "margin" in name
            if name in opex_names:
                target = "opex"
            elif is_margin:
                target = "gross_margin"
            elif name in ratio_names:
                target = "efficiency_ratio"
            else:
                target = None

            if target is not None:
                fields[target] = None if value is None else float(value)
            elif name in yoy_names:
                # The first year has no prior period, so blank markers map to None.
                fields["yoy_change (%)"] = None if value in [None, "None", ""] else float(value)
            else:
                fields[name] = value

        by_period[period] = fields
    return by_period

def compare_financials(agent_list, gt_list, value_fields=None, tolerance=0.01):
    """Score agent output against ground truth, field by field.

    Both inputs are normalized into period-keyed dicts.  For every ground-truth
    period, each name in ``value_fields`` that the ground truth contains is
    checked against the agent's record for the same period.  Numeric values
    count as correct when their absolute difference is at most ``tolerance``
    (intended to absorb rounding differences only); two ``None`` values also
    count as correct.

    Returns:
        Percentage (0-100) of checked fields that were correct, or 0 when no
        field could be checked.  When there are between one and five
        mismatches they are printed for debugging.
    """
    agent_by_period = normalize_agent_data_generic(agent_list)
    truth_by_period = normalize_gt_data_generic(gt_list)

    if value_fields is None:
        value_fields = ["opex", "gross_margin", "efficiency_ratio", "yoy_change (%)"]

    checked = 0
    matched = 0
    problems = []

    for period, expected in truth_by_period.items():
        # Only fields that the ground truth actually provides are scored.
        scored = [name for name in value_fields if name in expected]
        checked += len(scored)

        if period not in agent_by_period:
            # One entry per scored field, mirroring the per-field bookkeeping below.
            problems.extend(
                {"key": period, "reason": "Missing period in agent"} for _ in scored
            )
            continue

        actual = agent_by_period[period]
        for name in scored:
            if name not in actual:
                problems.append({"key": period, "field": name, "reason": "Missing in agent"})
                continue

            try:
                want = expected[name]
                got = actual[name]

                if want is None and got is None:
                    matched += 1
                elif want is None or got is None:
                    problems.append({
                        "key": period,
                        "field": name,
                        "gt": want,
                        "agent": got,
                        "reason": "None mismatch",
                    })
                else:
                    want = float(want)
                    got = float(got)
                    gap = abs(want - got)
                    if gap <= tolerance:
                        matched += 1
                    else:
                        problems.append({
                            "key": period,
                            "field": name,
                            "gt": want,
                            "agent": got,
                            "diff": gap,
                        })
            except Exception as exc:
                # Values that cannot be compared numerically are recorded, not raised.
                problems.append({"key": period, "field": name, "error": str(exc)})

    accuracy = (matched / checked * 100) if checked > 0 else 0

    # Short mismatch lists are echoed for debugging; longer ones are suppressed.
    if problems and len(problems) <= 5:
        print("  Mismatches:")
        for problem in problems:
            print(f"    {problem}")

    return accuracy

In [3]:
# Calculate accuracy for each run and each question.
# accuracy_data[run][question] holds the accuracy (0-100) of that answer.
accuracy_data = []

for run_idx, agent_data in enumerate(agent_results_list, start=1):
    run_accuracies = []

    for q_idx in range(len(ground_truth)):
        gt_computed = ground_truth[q_idx]['expected_computed_values']

        # A run may hold fewer answers than there are questions; treat the
        # missing answer like an answer without values instead of raising.
        result = agent_data[q_idx] if q_idx < len(agent_data) else None

        # Prefer computed_values and fall back to data_values; anything that is
        # not shaped like {"data": {...}} counts as "no output".
        agent_computed = None
        data = result.get('data') if isinstance(result, dict) else None
        if isinstance(data, dict):
            agent_computed = data.get('computed_values')
            if not agent_computed:
                agent_computed = data.get('data_values')
                if agent_computed:
                    print(f"ℹ️  Run {run_idx}, Q{q_idx+1}: Using data_values (computed_values not found)")

        if not agent_computed:
            print(f"⚠️  Run {run_idx}, Q{q_idx+1}: No computed_values or data_values - Setting accuracy = 0")
            acc = 0.0
        else:
            # Print what we're comparing
            print(f"\n{'='*60}")
            print(f"Run {run_idx}, Q{q_idx+1} - Comparing:")
            print(f"{'='*60}")
            print(f"Ground Truth:")
            for item in gt_computed:
                print(f"  {item}")

            print(f"\nAgent Output:")
            for item in agent_computed:
                print(f"  {item}")

            try:
                acc = compare_financials(agent_computed, gt_computed)
            except (TypeError, ValueError, AttributeError) as exc:
                # One malformed answer must not abort the scoring of every other
                # run; it is scored like a missing answer and the cause is shown.
                print(f"⚠️  Run {run_idx}, Q{q_idx+1}: Could not score agent output ({exc}) - Setting accuracy = 0")
                acc = 0.0
            print(f"\nCalculated Accuracy: {acc:.2f}%")
            print(f"{'='*60}\n")

        run_accuracies.append(acc)

    accuracy_data.append(run_accuracies)

print(f"\n✓ Accuracy calculated for all {len(accuracy_data)} runs across {len(ground_truth)} questions")


Run 1, Q1 - Comparing:
Ground Truth:
  {'fiscal_year': '2025', 'fiscal_quarter': 'Q2', 'gross_margin': 75.1}
  {'fiscal_year': '2025', 'fiscal_quarter': 'Q3', 'gross_margin': 74.6}
  {'fiscal_year': '2025', 'fiscal_quarter': 'Q4', 'gross_margin': 73.0}
  {'fiscal_year': '2026', 'fiscal_quarter': 'Q1', 'gross_margin': 60.5}
  {'fiscal_year': '2026', 'fiscal_quarter': 'Q2', 'gross_margin': 72.4}

Agent Output:
  {'quarter': 'Q2 FY26', 'gross_margin': '72.4%'}
  {'quarter': 'Q1 FY26', 'gross_margin': '60.5%'}
  {'quarter': 'Q4 FY25', 'gross_margin': '73.0%'}
  {'quarter': 'Q3 FY25', 'gross_margin': '74.6%'}
  {'quarter': 'Q2 FY25', 'gross_margin': '75.1%'}

Calculated Accuracy: 100.00%


Run 1, Q2 - Comparing:
Ground Truth:
  {'fiscal_year': '2023', 'opex': 11132.0, 'yoy_change (%)': None, 'units': 'millions USD'}
  {'fiscal_year': '2024', 'opex': 11329.0, 'yoy_change (%)': 1.77, 'units': 'millions USD'}
  {'fiscal_year': '2025', 'opex': 16405.0, 'yoy_change (%)': 44.81, 'units': 'millio

In [4]:
# Create accuracy table: rows = questions, columns = runs
accuracy_array = np.array(accuracy_data).T  # Transpose to get questions as rows

# Column labels follow the number of runs actually scored, so the table still
# builds when that number is not 10.
n_runs = len(accuracy_data)

df_accuracy = pd.DataFrame(
    accuracy_array,
    columns=[f"Run {i}" for i in range(1, n_runs + 1)],
    index=[f"Q{i+1}_Accuracy" for i in range(len(ground_truth))]
)

print("="*80)
print(f"ACCURACY TABLE (%) - Each Question Across {n_runs} Runs")
print("="*80)
print(df_accuracy.round(2))
print("="*80)

ACCURACY TABLE (%) - Each Question Across 10 Runs
              Run 1  Run 2  Run 3  Run 4   Run 5  Run 6  Run 7  Run 8  Run 9  \
Q1_Accuracy  100.00   60.0  40.00  80.00  100.00  60.00  80.00  80.00   0.00   
Q2_Accuracy   16.67   50.0  16.67  50.00   50.00  50.00  50.00  16.67  50.00   
Q3_Accuracy   33.33    0.0  33.33  33.33   33.33  33.33  33.33  33.33  33.33   

             Run 10  
Q1_Accuracy   40.00  
Q2_Accuracy   50.00  
Q3_Accuracy   33.33  


In [5]:
# Per-question accuracy series: one row of accuracy_array per question
question_ids = range(len(ground_truth))
per_question = [accuracy_array[q] for q in question_ids]

# Median (P50), 95th percentile, mean and standard deviation across runs.
# np.std is called with its defaults here.
p50_values = [np.percentile(scores, 50) for scores in per_question]
p95_values = [np.percentile(scores, 95) for scores in per_question]
mean_values = [np.mean(scores) for scores in per_question]
std_values = [np.std(scores) for scores in per_question]

# Assemble the summary table, one row per question
df_summary = pd.DataFrame({
    'Question': [f"Q{q+1}" for q in question_ids],
    'P50 (%)': p50_values,
    'P95 (%)': p95_values,
    'Mean (%)': mean_values,
    'Std Dev (%)': std_values,
    'Min (%)': [np.min(scores) for scores in per_question],
    'Max (%)': [np.max(scores) for scores in per_question]
})

banner = "="*80
print("\n" + banner)
print("ACCURACY STATISTICS: P50 and P95 Analysis")
print(banner)
print(df_summary.round(2).to_string(index=False))
print(banner)


ACCURACY STATISTICS: P50 and P95 Analysis
Question  P50 (%)  P95 (%)  Mean (%)  Std Dev (%)  Min (%)  Max (%)
      Q1    70.00   100.00      64.0        29.39     0.00   100.00
      Q2    50.00    50.00      40.0        15.28    16.67    50.00
      Q3    33.33    33.33      30.0        10.00     0.00    33.33
