In [12]:
import json
import numpy as np
import pandas as pd
import re

# Load the ground-truth questions.
GROUND_TRUTH_PATH = "qa/nvda_ground_truth3.json"
# Agent outputs are stored one file per run under results/baseline_results/,
# numbered agent_results1.json .. agent_results<NUM_RUNS>.json.
RESULTS_PATH_TEMPLATE = "results/baseline_results/agent_results{run}.json"
NUM_RUNS = 10

# JSON text is UTF-8; pass the encoding explicitly so loading does not depend
# on the platform's locale default.
with open(GROUND_TRUTH_PATH, "r", encoding="utf-8") as f:
    ground_truth = json.load(f)

# Load every run's results, in run order.
agent_results_list = []
for i in range(1, NUM_RUNS + 1):
    with open(RESULTS_PATH_TEMPLATE.format(run=i), "r", encoding="utf-8") as f:
        agent_results_list.append(json.load(f))

print(f"Loaded {len(agent_results_list)} result files")
print(f"Ground truth has {len(ground_truth)} questions")

Loaded 10 result files
Ground truth has 3 questions


In [13]:
def parse_year_str(y):
    """Normalize a fiscal-year label to a four-digit year string.

    Accepts labels such as '25', 'FY25', '2025', 'FY2025', 'fy 2025' or the
    integer 2025 and returns '2025'. The "FY" marker is matched
    case-insensitively and whitespace around the digits is ignored.

    Args:
        y: Year label; converted with ``str()`` before parsing.

    Returns:
        The four-digit year as a string, or ``None`` when what remains after
        removing "FY" is not exactly two or four decimal digits.
    """
    # Upper-casing leaves digits untouched and lets one replace() cover FY/fy/Fy.
    # Strip AFTER removing the marker so "FY 2025" does not keep a leading space.
    text = str(y).upper().replace("FY", "").strip()

    # Reject anything that is not purely decimal digits; otherwise labels such
    # as "abcd" or "ab" would be returned as if they were years.
    if not text or any(ch not in "0123456789" for ch in text):
        return None

    if len(text) == 2:
        # Two-digit years are expanded by prefixing "20".
        return f"20{text}"
    if len(text) == 4:
        return text
    return None

def normalize_agent_data_generic(agent_list):
    """Index agent-reported rows by a canonical fiscal-period key.

    Keys have the form 'FY<yyyy>-Q<n>' for quarterly rows and 'FY<yyyy>' for
    annual rows. The period is read from the first of these keys present in a
    row: 'quarter' (free text such as 'Q1 FY2025' or 'FY2025-Q1'), 'year',
    'fiscal_year'. When the period comes from 'year'/'fiscal_year' and the row
    also carries 'fiscal_quarter', the quarter is appended to the key. Rows
    whose period cannot be determined are skipped.

    Metric names are canonicalised: 'operating_expenses'/'opex' -> 'opex',
    any name containing both 'gross' and 'margin' -> 'gross_margin',
    'computed_value'/'efficiency_ratio' -> 'efficiency_ratio', any name
    containing 'yoy' -> 'yoy_change (%)'. Those metrics are stored as floats,
    or as ``None`` when the value is missing or not numeric. Every other field
    is copied under its lower-cased, stripped name.

    Args:
        agent_list: Iterable of dict rows produced by the agent.

    Returns:
        dict mapping period key -> normalized row dict. If several rows map to
        the same key, the last one wins.
    """
    # Quarter-first labels ("Q1 FY2025", "q1-25") and, as a fallback,
    # year-first labels ("FY2025-Q1", "2025 Q1").
    quarter_first = re.compile(r"(Q\d)[\s\-_/]*(?:FY)?[\s\-_/']*(\d{2,4})", re.IGNORECASE)
    year_first = re.compile(r"(?:FY)?[\s\-_/']*(?<!\d)(\d{2,4})[\s\-_/]*(Q\d)", re.IGNORECASE)

    def _as_float(value):
        """Best-effort float conversion; None for missing or non-numeric values."""
        if value is None:
            return None
        try:
            return float(value)
        except (TypeError, ValueError, OverflowError):
            # Covers the "None" / "" sentinels and any other non-numeric text.
            return None

    def _quarter_label(raw):
        """Return 'Q<n>' for labels like 'Q1', 'q1' or 1; otherwise the stripped text (None if empty)."""
        if raw is None:
            return None
        text = str(raw).strip()
        match = re.fullmatch(r"Q?\s*(\d)", text, flags=re.IGNORECASE)
        if match:
            return f"Q{match.group(1)}"
        return text or None

    normalized = {}
    for item in agent_list:
        key = None
        if "quarter" in item:
            # str() keeps a non-string label (e.g. None) from raising on .strip().
            label = str(item["quarter"]).strip()
            quarter = year_token = None
            match = quarter_first.search(label)
            if match:
                quarter, year_token = match.groups()
            else:
                match = year_first.search(label)
                if match:
                    year_token, quarter = match.groups()
            if match:
                year = parse_year_str(year_token)
                # An unparseable year (e.g. three digits) must not become "FYNone-Qn".
                if year:
                    key = f"FY{year}-{quarter.upper()}"
        elif "year" in item or "fiscal_year" in item:
            # 'year' takes precedence over 'fiscal_year' when both are present.
            raw_year = item["year"] if "year" in item else item["fiscal_year"]
            year = parse_year_str(raw_year)
            if year:
                key = f"FY{year}"
                quarter = _quarter_label(item.get("fiscal_quarter"))
                if quarter:
                    key = f"{key}-{quarter}"

        if not key:
            continue

        entry = {}
        for k, v in item.items():
            k_norm = k.lower().strip()
            # "12.5%" -> 12.5; leave the text untouched if it is not a number.
            if isinstance(v, str) and v.strip().endswith("%"):
                try:
                    v = float(v.replace("%", "").strip())
                except ValueError:
                    pass
            if k_norm in ["operating_expenses", "opex"]:
                entry["opex"] = _as_float(v)
            elif "gross" in k_norm and "margin" in k_norm:
                entry["gross_margin"] = _as_float(v)
            elif k_norm in ["computed_value", "efficiency_ratio"]:
                entry["efficiency_ratio"] = _as_float(v)
            elif "yoy" in k_norm:
                entry["yoy_change (%)"] = _as_float(v)
            else:
                entry[k_norm] = v

        normalized[key] = entry
    return normalized

def normalize_gt_data_generic(gt_list):
    """Index ground-truth rows by a canonical fiscal-period key.

    The year is read from 'fiscal_year' (or 'year' when that is absent or
    falsy). Rows that carry 'fiscal_quarter' are keyed 'FY<yyyy>-Q<n>', other
    rows 'FY<yyyy>'. Rows without a year, or whose year cannot be parsed by
    ``parse_year_str``, are skipped.

    Metric names are canonicalised the same way as for agent rows:
    'operating_expenses'/'opex' -> 'opex', names containing 'gross' and
    'margin' -> 'gross_margin', 'computed_value'/'efficiency_ratio' ->
    'efficiency_ratio', names containing 'yoy' -> 'yoy_change (%)'. For these
    metrics the sentinels ``None``, 'None' and '' are stored as ``None``;
    every other value is converted with ``float()``.

    Args:
        gt_list: Iterable of dict rows with the expected values.

    Returns:
        dict mapping period key -> normalized row dict.

    Raises:
        ValueError: If a metric value (or a '%'-suffixed string) is not numeric.
    """
    def _as_float(value):
        """float(value), with the None / 'None' / '' sentinels mapped to None."""
        if value is None or value in ("None", ""):
            return None
        return float(value)

    def _quarter_label(raw):
        """Return 'Q<n>' for labels like 'Q1', 'q1' or 1; otherwise the stripped text (None if empty)."""
        if raw is None:
            return None
        text = str(raw).strip()
        match = re.fullmatch(r"Q?\s*(\d)", text, flags=re.IGNORECASE)
        if match:
            return f"Q{match.group(1)}"
        return text or None

    normalized = {}
    for item in gt_list:
        fy = item.get("fiscal_year") or item.get("year")
        if not fy:
            continue

        year = parse_year_str(fy)
        if year is None:
            # Unparseable year: skip the row rather than filing it under "FYNone".
            continue

        key = f"FY{year}"
        quarter = _quarter_label(item.get("fiscal_quarter"))
        if quarter:
            key = f"{key}-{quarter}"

        entry = {}
        for k, v in item.items():
            k_norm = k.lower().strip()
            # "12.5%" -> 12.5
            if isinstance(v, str) and v.endswith("%"):
                v = float(v.replace("%", "").strip())

            if k_norm in ["operating_expenses", "opex"]:
                entry["opex"] = _as_float(v)
            elif "gross" in k_norm and "margin" in k_norm:
                entry["gross_margin"] = _as_float(v)
            elif k_norm in ["computed_value", "efficiency_ratio"]:
                entry["efficiency_ratio"] = _as_float(v)
            elif "yoy" in k_norm:
                entry["yoy_change (%)"] = _as_float(v)
            else:
                entry[k_norm] = v
        normalized[key] = entry
    return normalized

def compare_financials(agent_list, gt_list, value_fields=None, tolerance=0):
    """Score the agent's computed values against the ground truth.

    Both inputs are normalized to period-keyed dicts. Every field listed in
    ``value_fields`` that is present in a ground-truth row counts as one
    scored field. A scored field is correct when the agent has the same
    period and field and either

    * both values are numeric and ``abs(expected - actual) <= tolerance``, or
    * both values are ``None`` (the expected value is "no value" and the
      agent agrees).

    Periods or fields missing from the agent output, and values that cannot
    be compared numerically, count as incorrect.

    Args:
        agent_list: Rows reported by the agent.
        gt_list: Expected rows.
        value_fields: Canonical field names to score. Defaults to
            'opex', 'gross_margin', 'efficiency_ratio' and 'yoy_change (%)'.
        tolerance: Maximum absolute difference still accepted as equal.
            The default 0 requires exact equality.

    Returns:
        Percentage (0-100) of scored fields that are correct; 0 when the
        ground truth contains no scored field.
    """
    def _matches(expected, actual):
        """True when the two values agree under the rules described above."""
        if expected is None or actual is None:
            return expected is None and actual is None
        try:
            return abs(float(expected) - float(actual)) <= tolerance
        except (TypeError, ValueError, OverflowError):
            # Non-numeric value on either side: not comparable, so not correct.
            return False

    agent_norm = normalize_agent_data_generic(agent_list)
    gt_norm = normalize_gt_data_generic(gt_list)

    if value_fields is None:
        value_fields = ["opex", "gross_margin", "efficiency_ratio", "yoy_change (%)"]

    total_fields = 0
    correct = 0

    for key, gt_entry in gt_norm.items():
        # A period missing from the agent output still contributes its fields
        # to the total; an empty dict makes every one of them a miss.
        agent_entry = agent_norm.get(key, {})
        for field in value_fields:
            if field not in gt_entry:
                continue
            total_fields += 1
            if field in agent_entry and _matches(gt_entry[field], agent_entry[field]):
                correct += 1

    accuracy = (correct / total_fields * 100) if total_fields > 0 else 0
    return accuracy

In [14]:
# Calculate accuracy for each run and each question.
# accuracy_data[run][question] holds a percentage in [0, 100].
accuracy_data = []

for run_idx, agent_data in enumerate(agent_results_list, start=1):
    run_accuracies = []

    for q_idx in range(len(ground_truth)):
        gt_computed = ground_truth[q_idx]['expected_computed_values']

        # A run may hold fewer answers than there are questions; treat the gap
        # as a missing answer instead of raising IndexError.
        # NOTE(review): assumes each results file is a list ordered like ground_truth - confirm.
        result = agent_data[q_idx] if q_idx < len(agent_data) else None

        # Walk result -> 'data' -> 'computed_values', tolerating a missing or
        # non-dict level at any step.
        data = result.get('data') if isinstance(result, dict) else None
        agent_computed = data.get('computed_values') if isinstance(data, dict) else None

        if not agent_computed:
            print(f"Run {run_idx}, Q{q_idx+1}: computed_values is None or empty - Setting accuracy = 0")
            acc = 0.0
        else:
            acc = compare_financials(agent_computed, gt_computed)

        run_accuracies.append(acc)

    accuracy_data.append(run_accuracies)

print(f"Accuracy calculated for all {len(accuracy_data)} runs across {len(ground_truth)} questions")

Run 2, Q3: computed_values is None or empty - Setting accuracy = 0
Accuracy calculated for all 10 runs across 3 questions


In [15]:
# Create accuracy table: rows = questions, columns = runs.
accuracy_array = np.array(accuracy_data).T  # Transpose to get questions as rows

# Derive the column labels from the data so the table still builds when the
# number of runs is not 10.
num_runs = len(accuracy_data)

df_accuracy = pd.DataFrame(
    accuracy_array,
    columns=[f"Run {i}" for i in range(1, num_runs + 1)],
    index=[f"Q{i+1}_Accuracy" for i in range(len(ground_truth))]
)

print("="*80)
print(f"ACCURACY TABLE (%) - Each Question Across {num_runs} Runs")
print("="*80)
print(df_accuracy.round(2))
print("="*80)

ACCURACY TABLE (%) - Each Question Across 10 Runs
             Run 1  Run 2  Run 3  Run 4  Run 5  Run 6  Run 7  Run 8  Run 9  \
Q1_Accuracy  100.0   60.0   40.0   80.0  100.0   60.0   80.0   80.0    0.0   
Q2_Accuracy    0.0   50.0    0.0   50.0   50.0   50.0   50.0    0.0   50.0   
Q3_Accuracy    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   

             Run 10  
Q1_Accuracy    40.0  
Q2_Accuracy    50.0  
Q3_Accuracy     0.0  


In [16]:
# Per-question accuracy statistics over the runs: median (P50), 95th
# percentile (P95), mean and standard deviation (np.std default, ddof=0).
question_ids = range(len(ground_truth))
question_rows = [accuracy_array[q] for q in question_ids]  # one row of run accuracies per question

p50_values = [np.percentile(row, 50) for row in question_rows]
p95_values = [np.percentile(row, 95) for row in question_rows]
mean_values = [np.mean(row) for row in question_rows]
std_values = [np.std(row) for row in question_rows]

# Summary table, one line per question.
df_summary = pd.DataFrame({
    'Question': [f"Q{q+1}" for q in question_ids],
    'P50 (%)': p50_values,
    'P95 (%)': p95_values,
    'Mean (%)': mean_values,
    'Std Dev (%)': std_values,
    'Min (%)': [np.min(row) for row in question_rows],
    'Max (%)': [np.max(row) for row in question_rows]
})

banner = "="*80
print("\n" + banner)
print("ACCURACY STATISTICS: P50 and P95 Analysis")
print(banner)
print(df_summary.round(2).to_string(index=False))
print(banner)


ACCURACY STATISTICS: P50 and P95 Analysis
Question  P50 (%)  P95 (%)  Mean (%)  Std Dev (%)  Min (%)  Max (%)
      Q1     70.0    100.0      64.0        29.39      0.0    100.0
      Q2     50.0     50.0      35.0        22.91      0.0     50.0
      Q3      0.0      0.0       0.0         0.00      0.0      0.0
