In [None]:
import json
import os
import re
import warnings
from typing import List, Dict, Any

import numpy as np
import pandas as pd
from tqdm.std import tqdm

In [None]:
# Path to the results file to score. It is opened in text mode and parsed one
# JSON document per line, so it must be set to a real file before running on.
result_path = ""

In [None]:
# Load the results file: one JSON record per line, collected in file order.
data = []
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default locale encoding.
with open(result_path, "r", encoding="utf-8") as f:
    for line in tqdm(f):
        # Blank lines (e.g. a trailing newline at EOF) are not valid JSON
        # documents and would make json.loads raise; skip them.
        if not line.strip():
            continue
        data.append(json.loads(line))

In [None]:
def parse_result(json_data):
    """Split result records into aligned lists of targets and predictions.

    Intended for the per-sample records produced by the
    lm-evaluation-harness framework.

    Args:
        json_data: Iterable of dict-like records. Each record is read through
            its "target" and "filtered_resps" keys.

    Returns:
        A ``(true_answer, pred_answer)`` tuple of two lists with one entry per
        record, in input order. ``true_answer[i]`` is the record's "target"
        (``None`` when absent). ``pred_answer[i]`` is the first element of
        "filtered_resps", or ``None`` when that key is missing or empty.
    """
    true_answer = []
    pred_answer = []

    for json_line in tqdm(json_data):
        true_answer.append(json_line.get("target"))

        # A missing/empty "filtered_resps" used to crash on the [0] lookup.
        # Record None instead so both lists stay the same length and index i
        # still refers to the same sample in each of them.
        filtered = json_line.get("filtered_resps")
        pred_answer.append(filtered[0] if filtered else None)

    return true_answer, pred_answer

In [None]:
# Split the loaded records into two parallel lists: reference targets and
# model predictions.
true_answer, pred_answer = parse_result(data)

### LLM-as-a-judge

In [None]:
from openai import OpenAI

In [None]:
# Read the key from the OPENAI_API_KEY environment variable so the secret is
# never saved inside the notebook. Falls back to an empty string when unset.
api_key = os.environ.get("OPENAI_API_KEY", "")

In [None]:
# Shared OpenAI client; get_response() sends every judge request through it.
client = OpenAI(api_key=api_key)

In [None]:
def get_prompt(true_answer, pred_answer):
    """Build the judge prompt for one (reference, prediction) pair.

    The prompt asks the judge to reply with exactly one label:
    S (structure invalid), E (extracted value wrong), C (calculated value
    wrong) or A (all checks pass), and includes four worked examples.

    Args:
        true_answer: Reference answer; interpolated into the prompt via its
            string form.
        pred_answer: Model prediction; interpolated into the prompt via its
            string form.

    Returns:
        The complete prompt text, ending with an open "Output:" line.
    """
    # Doubled braces ({{ }}) are f-string escapes that render as literal
    # JSON braces; only {true_answer} and {pred_answer} are substituted.
    return f"""Instruction: You are an evaluator. Your task is to judge whether the model's output pred_answer is correct compared to the given true_answer. 
Follow the rules strictly:

Step 1 (Structure Check):
    Verify whether pred_answer has the same structure as true_answer. The required structure is a JSON object with exactly two keys:
        {{"extracted_value": <value>, "calculated_value": <value>}}
    Minor formatting differences (e.g., line breaks, indentation, whitespace) are acceptable.
    If the structure is invalid, output the label: S
    If valid, continue to Step 2

Step 2 (Extracted Value Check):
    Compare true_answer["extracted_value"] and pred_answer["extracted_value"] by their mathematical meaning, not their string form. For example, "-1,284" and "-1284" are considered equal.
    If they are not equal in numeric meaning, output the label: E
    If equal, continue to Step 3

Step 3 (Calculated Value Check):
    Compare true_answer["calculated_value"] and pred_answer["calculated_value"] strictly in numeric meaning. They must be exactly equal (zero tolerance).
    If they are not equal, output the label: C
    If equal, then everything is correct

Final Decision:
    If all three checks pass, output the label: A
Output only one label: S, E, C, or A. Do not explain your reasoning.

Example 1:
    true_answer = {{"extracted_value": "-1,286", "calculated_value": "1,286"}}
    pred_answer = {{"extracted_value": "-1286", "calculated_value": "0"}}

    Output: C

Example 2:
    true_answer = {{"extracted_value": "5,000", "calculated_value": "5,000"}}
    pred_answer = {{"extracted_value": "5000", "calculated_value": "5000"}}

    Output: A

Example 3:
    true_answer = {{"extracted_value": "123", "calculated_value": "456"}}
    pred_answer = {{"extracted_value": "124", "calculated_value": "456"}}

    Output: E

Example 4:
    true_answer = {{"extracted_value": "100", "calculated_value": "200"}}
    pred_answer = {{"wrong_key": "100", "calculate_value": "200"}}

    Output: S

Input:
    true_answer = {true_answer}
    pred_answer = {pred_answer}
Output:
"""

In [None]:
def get_response(user_input, model="gpt-5-mini"):
    """Send one prompt to the judge model and return its text reply.

    Uses the module-level ``client`` and requests minimal reasoning effort and
    low verbosity.

    Args:
        user_input: Prompt passed as the request's ``input``.
        model: Name of the model to query. Defaults to "gpt-5-mini", the
            model this notebook originally hard-coded.

    Returns:
        The ``output_text`` attribute of the API response.
    """
    response = client.responses.create(
        model=model,
        input=user_input,
        reasoning={"effort": "minimal"},
        text={"verbosity": "low"},
    )
    return response.output_text

In [None]:
VALID_LABELS = {"A", "S", "E", "C"}

def evaluate_performance(true_answer: List[str], pred_answer: List[str]) -> Dict[str, Any]:
    """Score predictions with the LLM judge and summarise the label rates.

    Each (reference, prediction) pair is turned into a prompt with
    ``get_prompt`` and sent through ``get_response``. The reply is stripped
    and upper-cased; it counts only if it is exactly one of ``VALID_LABELS``
    (A = correct, S = structural error, E = extraction error,
    C = calculation error). Any other reply is excluded from the label rates.

    Args:
        true_answer: Reference answers.
        pred_answer: Model predictions, aligned by index with ``true_answer``.
            If the lengths differ, only the first ``min(len)`` pairs are
            evaluated and a warning is issued.

    Returns:
        A dict of percentages rounded to two decimals:
        - "Parsing success rate(%)": replies that were a valid label, out of
          all evaluated pairs.
        - "ACC(%)", "Structural error rate(%)", "Extraction error rate(%)",
          "Calculating error rate(%)": share of A / S / E / C among the valid
          replies.
        Every rate is 0.0 when its denominator is zero.

    Warns:
        UserWarning: when the input lengths differ, and when some judge
        replies were not a valid label.
    """
    n = min(len(true_answer), len(pred_answer))
    if len(true_answer) != len(pred_answer):
        # Truncating to the shorter list is kept for compatibility, but it is
        # reported so a misaligned input does not go unnoticed.
        warnings.warn(
            f"true_answer has {len(true_answer)} items but pred_answer has "
            f"{len(pred_answer)}; only the first {n} pairs are evaluated.",
            stacklevel=2,
        )

    label_counts = {label: 0 for label in VALID_LABELS}
    output_errors = []

    pairs = zip(true_answer[:n], pred_answer[:n])
    for t_a, p_a in tqdm(pairs, total=n, desc="Evaluating"):
        res_raw = get_response(get_prompt(t_a, p_a))
        # Only an exact single-letter label is accepted; anything else
        # (explanations, punctuation, empty output) is treated as unparsed.
        res = str(res_raw).strip().upper()
        if res in VALID_LABELS:
            label_counts[res] += 1
        else:
            output_errors.append(res_raw)

    if output_errors:
        # Surface what was dropped; these samples are missing from the
        # denominators of the four label rates below.
        warnings.warn(
            f"{len(output_errors)} of {n} judge responses were not one of "
            f"{sorted(VALID_LABELS)} and were excluded; first: {output_errors[0]!r}",
            stacklevel=2,
        )

    total = n
    evaluated = total - len(output_errors)

    def pct(x, d):
        # Guard against division by zero for empty input / no valid replies.
        return round(100.0 * x / d, 2) if d > 0 else 0.0

    return {
        # Share of samples whose judge reply was a valid label.
        "Parsing success rate(%)": pct(evaluated, total),

        # The four rates below use the number of validly judged samples as
        # their denominator.
        "ACC(%)": pct(label_counts["A"], evaluated),
        "Structural error rate(%)": pct(label_counts["S"], evaluated),
        "Extraction error rate(%)": pct(label_counts["E"], evaluated),
        "Calculating error rate(%)": pct(label_counts["C"], evaluated),
    }


In [None]:
# Judge every aligned (target, prediction) pair and display the summary rates.
# Note: this sends one API request per pair.
evaluate_performance(true_answer, pred_answer)