In [1]:
# Copyright (c) 2025 Microsoft Corporation.
import os
from pathlib import Path
from typing import cast

import pandas as pd
from pydantic import SecretStr
from rich import print as rich_print

from benchmark_qed.autoe.pairwise_scores import analyze_criteria, get_pairwise_scores
from benchmark_qed.autoe.reference_scores import (
    get_reference_scores,
    summarize_reference_scores,
)
from benchmark_qed.cli.utils import print_df
from benchmark_qed.config.llm_config import (
    LLMConfig,
    LLMProvider,
)
from benchmark_qed.config.model.score import (
    pairwise_scores_criteria,
    reference_scores_criteria,
)
from benchmark_qed.llm.factory import ModelFactory

In [2]:
# nest_asyncio patches asyncio so that an already-running event loop can be
# re-entered. Jupyter keeps its own loop running, so this is presumably what
# lets the scoring helpers below drive async LLM calls from regular cells.
import nest_asyncio

nest_asyncio.apply()

In [None]:
# Debug aid: show the kernel's working directory, i.e. the folder that every
# relative path used in this notebook is resolved against.
# `os` is already imported in the notebook's first (imports) cell.
print(os.getcwd())

e:\NLP\GraphRag_Eval\autoe


In [3]:
# Load (or reload, so re-running this cell is harmless) the IPython extension
# shipped with python-dotenv.
%reload_ext dotenv
# Read variables from a .env file into the process environment; the judge
# configuration below takes OPENAI_API_KEY from os.environ.
%dotenv

In [19]:
# Judge model setup: the config and the client built here are handed to every
# scoring call further down the notebook.
judge_api_key = SecretStr(os.environ["OPENAI_API_KEY"])  # KeyError if the variable is not set
judge_call_args = {"temperature": 1.0, "seed": 42}

llm_config = LLMConfig(
    model="gpt-5-nano",
    api_key=judge_api_key,
    llm_provider=LLMProvider.OpenAIChat,
    concurrent_requests=32,
    call_args=judge_call_args,
)

# Chat client created from the configuration above.
llm_client = ModelFactory.create_chat_model(llm_config)

In [None]:
# Config conditions for comparison: every entry in `others` is compared
# against `base`, once per question set.
base = "vector_rag"
others = ["lazygraphrag", "graphrag_global"]
question_sets = ["activity_global", "activity_local"]
trials = 2  # number of trials to run for each combination of [query, base, other]. Trials must be an even number to support counterbalancing.
alpha = 0.05  # significance level used for statistical tests

# Fail fast on a trial count that cannot be counterbalanced instead of
# discovering it after the (slow, paid) LLM calls have started.
if trials <= 0 or trials % 2 != 0:
    raise ValueError(f"trials must be a positive even number, got {trials}")

# Answer files are read from <input_dir>/<condition>/<question_set>.json.
# NOTE(review): this is a relative path, so it is resolved against the kernel's
# working directory - confirm it points at the folder holding the answers.
input_dir = "GraphRag_Eval/autoe"
output_dir = Path("./output/win_rates")
# exist_ok=True makes the cell safely re-runnable without a separate exists() check.
output_dir.mkdir(parents=True, exist_ok=True)

# load default criteria. You can also define your own criteria as a list of Criteria objects
criteria = pairwise_scores_criteria()

In [21]:
# Judge each competitor against the base condition, one question set at a time,
# collecting the per-pair score tables as we go.
all_results = []
for question_set in question_sets:
    for other in others:
        rich_print(f"Processing {base} vs {other} for question set: {question_set}")

        # Answers live at <input_dir>/<condition>/<question_set>.json.
        base_answers = pd.read_json(f"{input_dir}/{base}/{question_set}.json")
        other_answers = pd.read_json(f"{input_dir}/{other}/{question_set}.json")

        result = get_pairwise_scores(
            llm_client=llm_client,
            llm_config=llm_config,
            base_name=base,
            other_name=other,
            base_answers=base_answers,
            other_answers=other_answers,
            criteria=criteria,
            trials=trials,
            include_score_id_in_prompt=True,
            question_id_key="question_id",
        )
        # Tag the rows so they stay attributable once everything is concatenated.
        result["question_set"] = question_set
        all_results.append(result)

        # One CSV per (question set, base, other) pairing.
        pair_csv = output_dir / f"{question_set}_{base}--{other}.csv"
        result.to_csv(pair_csv, index=False)

# Combined table with every pairing, written as a single file.
all_results_df = pd.concat(all_results, ignore_index=True)
all_results_df.to_csv(output_dir / "win_rates.csv", index=False)

# Significance testing over the combined results at the configured alpha.
significance_test_results = analyze_criteria(all_results_df, alpha=alpha)
significance_test_results.to_csv(output_dir / "winrates_sig_tests.csv", index=False)

# Columns shown in the on-screen summary; the CSV above keeps the full table.
summary_columns = [
    "question_set",
    "criteria",
    "base_name",
    "other_name",
    "base_mean",
    "other_mean",
    "formatted_corrected_p_value",
]
win_rate_summary = cast(pd.DataFrame, significance_test_results[summary_columns])
print_df(win_rate_summary, "Win Rates Summary")

  shapiro_base = shapiro(base_scores)
  shapiro_other = shapiro(other_scores)


In [22]:
# Print whatever usage information the judge client reports via get_usage(),
# covering the calls made through llm_client so far in this kernel session.
rich_print("Model usage statistics:")
rich_print(llm_client.get_usage())

Assertion-Based Scoring

In [23]:
# Config for assertion-based scoring.
# NOTE: `trials`, `input_dir` and `output_dir` rebind names that the pairwise
# config cell also sets - re-run that cell before repeating the pairwise
# comparison, otherwise it will pick up the values below.
assertions_file = "activity_global_assertions.json"  # presumably the JSON file holding the assertions per question
generated_rag = "vector_rag"  # presumably the condition whose answers are checked
pass_threshold = 0.5  # presumably the mean-score cutoff for counting an assertion as passed
trials = 4  # number of trials

input_dir = "./example_answers"
output_dir = Path("./output/assertion_scores")
# exist_ok=True makes the cell safely re-runnable without a separate exists() check.
output_dir.mkdir(parents=True, exist_ok=True)

In [26]:
import numpy as np

from benchmark_qed.autoe.assertion_scores import get_assertion_scores

# NOTE(review): machine-specific absolute location of the answer and assertion
# files. Point this at your own data folder (ideally a relative path) before
# sharing the notebook.
data_dir = "E:/NLP/GraphRag_Eval/autoe"

# Answers of the condition under test; the condition name comes from the
# config cell (`generated_rag`) instead of being repeated in the path.
answers = pd.read_json(f"{data_dir}/{generated_rag}/activity_global.json")

# The file named by `assertions_file` carries a list-valued "assertions"
# column; explode it so each row holds exactly one assertion.
assertions = (
    pd.read_json(f"{data_dir}/{assertions_file}")
    .explode("assertions")
    .rename(columns={"assertions": "assertion"})
    .reset_index(drop=True)
)

# Score every assertion against the answers, using the trial count from the
# config cell rather than a second hard-coded copy of it.
assertion_score = get_assertion_scores(
    llm_client=llm_client,
    llm_config=llm_config,
    answers=answers,
    assertions=assertions,
    trials=trials,
    question_id_key="question_id",
    question_text_key="question_text",
    answer_text_key="answer",
)

scores_csv = output_dir / "assertion_scores.csv"
assertion_score.to_csv(scores_csv, index=False)

# Collapse the rows of each (question, assertion) pair: the assertion passes
# (score = 1) when its mean score is strictly above `pass_threshold`. The raw
# scores are kept as a list so mean/std can be derived below.
summary_by_assertion = (
    assertion_score.groupby(["question", "assertion"])
    .agg(
        score=("score", lambda x: int(x.mean() > pass_threshold)),
        scores=("score", list),
    )
    .reset_index()
)

# Per question: how many assertions passed and how many failed.
summary_by_question = (
    summary_by_assertion.groupby(["question"])
    .agg(
        success=("score", lambda x: (x == 1).sum()),
        fail=("score", lambda x: (x == 0).sum()),
    )
    .reset_index()
)

# Mean and (population) standard deviation of the raw scores per assertion.
summary_by_assertion["score_mean"] = summary_by_assertion["scores"].apply(
    lambda x: np.mean(x) if len(x) > 0 else 0.0
)
summary_by_assertion["score_std"] = summary_by_assertion["scores"].apply(
    lambda x: np.std(x) if len(x) > 0 else 0.0
)
summary_by_assertion = summary_by_assertion.drop(columns=["scores"])

print_df(
    summary_by_question,
    "Assertion Scores Summary by Question",
)

# Assertions that did not clear the threshold; the 0/1 flag is dropped because
# it is 0 for every remaining row.
failed_assertions: pd.DataFrame = cast(
    pd.DataFrame, summary_by_assertion[summary_by_assertion["score"] == 0]
)
failed_assertions = failed_assertions.drop(columns=["score"])

if len(failed_assertions) > 0:
    print_df(
        failed_assertions,
        f"[bold red]{failed_assertions.shape[0]} Failed Assertions[/bold red]",
    )
    rich_print(
        f"[bold red]{failed_assertions.shape[0]} assertions failed. See {scores_csv} for details.[/bold red]"
    )
else:
    rich_print("[bold green]All assertions passed.[/bold green]")