In [None]:
import json
import pandas as pd
import numpy as np
from tqdm.std import tqdm
import re
from typing import List
import codecs
import ast

In [None]:
# Path of the results file opened by the loading cell; empty by default, so set it before running.
result_path = ""

In [None]:
# Load the results file into `data`: one decoded JSON record per non-blank line.
data = []
# Explicit UTF-8 so decoding does not depend on the platform's default locale encoding.
with open(result_path, "r", encoding="utf-8") as f:
    for line in tqdm(f):
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing empty line) instead of failing in json.loads.
            continue
        data.append(json.loads(line))

In [None]:
def parse_code_block(text: str):
    """
    Strip surrounding triple-backtick fences and parse the payload into a Python object.

    Strategies are tried in order and the first success wins:
      1. plain JSON (a decoded string that itself looks like a JSON array/object
         is decoded once more);
      2. backslash-escaped JSON (quotes preceded by a backslash);
      3. a Python literal such as a single-quoted list or dict;
      4. JSON after a plain replacement of backslash-quote with a quote.

    Args:
        text: raw text to parse. Non-string values are returned unchanged.

    Returns:
        The parsed object, or the stripped (and de-fenced) text when every
        strategy fails. This function is best-effort and does not raise on
        unparseable input.
    """
    if not isinstance(text, str):
        return text

    cleaned = text.strip()

    # strip ```lang ... ``` fences (only when both the opening and closing fence are present)
    if cleaned.startswith("```") and cleaned.endswith("```"):
        cleaned = re.sub(r"^```[a-zA-Z]*\s*", "", cleaned, flags=re.DOTALL)
        cleaned = re.sub(r"\s*```$", "", cleaned, flags=re.DOTALL).strip()

    # 1) plain JSON
    try:
        out = json.loads(cleaned)
        # if the decoded result is itself a JSON-looking string, decode again
        if isinstance(out, str) and out.strip().startswith(("[", "{")):
            try:
                return json.loads(out)
            except json.JSONDecodeError:
                pass
        return out
    except json.JSONDecodeError:
        pass

    # 2) escaped JSON like: [\"a\", \"b\"]
    try:
        # Decoding a str directly with "unicode_escape" reinterprets its UTF-8 bytes as
        # Latin-1 and turns every non-ASCII character into mojibake. Encoding with
        # "backslashreplace" first keeps Latin-1 characters as single bytes and rewrites
        # everything else as \uXXXX escapes, which the decoder then restores faithfully.
        unescaped = cleaned.encode("latin-1", "backslashreplace").decode("unicode_escape")
        return json.loads(unescaped)
    except Exception:
        # Best-effort: any failure here just means this strategy does not apply.
        pass

    # 3) Python-literal-style list/dict like: ['a', 'b']
    try:
        return ast.literal_eval(cleaned)
    except Exception:
        pass

    # 4) last resort: simple de-escape of \" then try JSON again
    try:
        return json.loads(cleaned.replace('\\"', '"'))
    except Exception:
        return cleaned


In [None]:
##### for the results generated from lm-evaluation-harness framework
def parse_result(json_data):
    """
    Build parallel lists of gold targets and parsed predictions from result records.

    Args:
        json_data: iterable of dict-like records. Each record is read through
            ``record.get("target")`` (a JSON-encoded string whose decoded value is
            evaluated as a Python expression) and ``record.get("filtered_resps")[0]``
            (the first filtered response, parsed with ``parse_code_block``).

    Returns:
        ``(true_answer, pred_answer)``: two lists of equal length. Every prediction
        is normalised to a list so that the metric functions can slice it with
        ``p[:k]``:
          * list / tuple  -> list of its items (order kept);
          * non-empty str, int or float -> a single-item list (one bare prediction);
          * anything else (dict, None, bool, empty string, ...) -> empty list.
    """
    true_answer = []
    pred_answer = []

    for json_line in tqdm(json_data):
        # SECURITY NOTE(review): eval() executes whatever expression the results file
        # contains. Only run this on trusted files; consider ast.literal_eval if the
        # targets are always plain literals (not verifiable from this notebook alone).
        target = eval(json.loads(json_line.get("target")))
        parsed = parse_code_block(json_line.get("filtered_resps")[0])

        # A model response is free text: it may parse to a dict, a number, or stay an
        # unparsed string. Left as-is, those would raise on slicing or be split into
        # single characters by the metric functions.
        if isinstance(parsed, (list, tuple)):
            prediction = list(parsed)
        elif isinstance(parsed, str):
            prediction = [parsed] if parsed else []
        elif isinstance(parsed, (int, float)) and not isinstance(parsed, bool):
            prediction = [parsed]
        else:
            prediction = []

        true_answer.append(target)
        pred_answer.append(prediction)

    return true_answer, pred_answer

In [None]:
# Turn the loaded records into two parallel lists: gold targets and parsed predictions.
true_answer, pred_answer = parse_result(data)

In [None]:
def hit_rate_at_k(true_answer: List[List[str]], pred_answer: List[List[str]], k: int) -> float:
    """
    Compute HR@k (Hit Rate at k).

    A query counts as a hit when at least one of its top-k predictions is in its
    ground-truth list.

    Args:
        true_answer: true_answer[i] is the list of ground truth elements for query i.
        pred_answer: pred_answer[i] is the ranked list of predictions for query i.
        k: number of leading predictions to consider per query.

    Returns:
        Fraction of queries with at least one hit, in [0, 1]. Returns 0.0 when
        there are no queries (instead of raising ZeroDivisionError).

    Raises:
        AssertionError: if the two inputs have different lengths.
    """
    assert len(true_answer) == len(pred_answer), "true_answer and pred_answer must have the same length"
    N = len(true_answer)
    if N == 0:
        # No queries: define the rate as 0.0 rather than dividing by zero.
        return 0.0
    hits = sum(
        1
        for g, p in zip(true_answer, pred_answer)
        if not set(g).isdisjoint(p[:k])  # some top-k prediction is in the gold set
    )
    return hits / N

In [None]:
def recall_at_k(true_answer: List[List[str]],
                pred_answer: List[List[str]],
                k: int,
                ignore_empty_gold: bool = False) -> float:
    """
    Compute R@k (Recall at k), averaged over queries.

    Per-query recall is the number of distinct gold elements found among the top-k
    predictions divided by the number of distinct gold elements. Duplicated gold
    entries are counted once, so a query whose gold items are all retrieved scores
    1.0.

    Args:
        true_answer: true_answer[i] is the list of ground truth elements for query i.
        pred_answer: pred_answer[i] is the ranked list of predictions for query i.
        k: number of leading predictions to consider per query.
        ignore_empty_gold: if False (default), queries with an empty gold list
            contribute a recall of 0 and still count in the average. If True, they
            are excluded from the average altogether.

    Returns:
        Mean recall in [0, 1]; 0.0 when no query is averaged (e.g. empty input).

    Raises:
        AssertionError: if the two inputs have different lengths.
    """
    assert len(true_answer) == len(pred_answer), "true_answer and pred_answer must have the same length"

    recall_sum = 0.0
    n_scored = 0
    for g, p in zip(true_answer, pred_answer):
        gold = set(g)
        if not gold:
            # Recall is undefined without gold items: either score it as 0 or drop it.
            if not ignore_empty_gold:
                n_scored += 1
            continue
        topk_preds = set(p[:k])
        # Divide by the number of *distinct* gold items so duplicates cannot cap recall below 1.
        recall_sum += len(topk_preds & gold) / len(gold)
        n_scored += 1

    return recall_sum / n_scored if n_scored > 0 else 0.0

In [None]:
def macro_f1_at_k(true_answer: List[List[str]],
                  pred_answer: List[List[str]],
                  k: int,
                  zero_division: float = 0.0,
                  ignore_empty_gold: bool = True) -> float:
    """
    Macro-averaged F1@k: the mean of the per-query F1 scores.

    For each query the gold list and the top-k predictions are compared as sets.

    Args:
        true_answer: true_answer[i] holds the ground-truth elements of query i.
        pred_answer: pred_answer[i] holds the ranked predictions of query i.
        k: how many leading predictions to keep per query.
        zero_division: substitute for precision (no predictions kept) or recall
            (no gold elements) when its denominator is zero.
        ignore_empty_gold: when True, queries whose gold set is empty are left out
            of the average.

    Returns:
        The mean per-query F1, or 0.0 if no query was scored.
    """
    assert len(true_answer) == len(pred_answer), "Lengths must match."

    total_f1 = 0.0
    scored_queries = 0

    for gold_items, ranked_preds in zip(true_answer, pred_answer):
        gold_set = set(gold_items)
        if ignore_empty_gold and not gold_set:
            continue

        kept_preds = set(ranked_preds[:k])
        matched = len(gold_set & kept_preds)
        # tp + fp is the number of kept predictions; tp + fn is the gold-set size.
        n_predicted = len(kept_preds)
        n_gold = len(gold_set)

        if n_predicted > 0:
            precision = matched / n_predicted
        else:
            precision = zero_division

        if n_gold > 0:
            recall = matched / n_gold
        else:
            recall = zero_division

        if (precision + recall) > 0:
            query_f1 = 2 * precision * recall / (precision + recall)
        else:
            query_f1 = 0.0

        total_f1 += query_f1
        scored_queries += 1

    if scored_queries > 0:
        return total_f1 / scored_queries
    return 0.0

In [None]:
def evaluate_metrics(true_answer, pred_answer, ks=(1, 5, 10, 20)):
    """
    Compute HR@k, R@k and Macro-F1@k for every cutoff in ``ks``, plus their averages.

    Args:
        true_answer: per-query ground-truth lists, passed through to the metric functions.
        pred_answer: per-query ranked prediction lists, passed through to the metric functions.
        ks: iterable of cutoffs. The default is an immutable tuple so it cannot be
            mutated between calls.

    Returns:
        Dict mapping ``"HR@{k}"``, ``"R@{k}"`` and ``"Macro-F1@{k}"`` for each cutoff,
        and ``"HR@avg"``, ``"R@avg"``, ``"Macro-F1@avg"`` (mean over the cutoffs), to
        percentage strings with two decimals. An empty ``ks`` yields an empty dict
        instead of raising ZeroDivisionError while averaging.
    """
    cutoffs = list(ks)  # materialise once so generators work and emptiness can be checked
    results = {}
    if not cutoffs:
        return results

    hr_list, r_list, f1_list = [], [], []

    for k in cutoffs:
        # Metric functions return fractions; scale to percentages for reporting.
        hr = hit_rate_at_k(true_answer, pred_answer, k) * 100
        r = recall_at_k(true_answer, pred_answer, k) * 100
        macro_f1 = macro_f1_at_k(true_answer, pred_answer, k) * 100

        results[f"HR@{k}"] = f"{hr:.2f}%"
        results[f"R@{k}"] = f"{r:.2f}%"
        results[f"Macro-F1@{k}"] = f"{macro_f1:.2f}%"

        hr_list.append(hr)
        r_list.append(r)
        f1_list.append(macro_f1)

    results["HR@avg"] = f"{sum(hr_list) / len(hr_list):.2f}%"
    results["R@avg"] = f"{sum(r_list) / len(r_list):.2f}%"
    results["Macro-F1@avg"] = f"{sum(f1_list) / len(f1_list):.2f}%"

    return results

In [None]:
# Compute all metrics with the default cutoffs; as the cell's last expression, the result dict is displayed.
evaluate_metrics(true_answer, pred_answer)