Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
ba4220d
Add llamacpp dependency and update gitignore with generated directories
ErlisLushtaku Feb 14, 2026
d2a5a42
Add documentation for llamacpp in Readme
ErlisLushtaku Feb 14, 2026
a828adb
Document direnv usage for environment variables management
ErlisLushtaku Feb 15, 2026
0dcebf9
narrow down transformers dependency to fix version mismatch
ErlisLushtaku Feb 15, 2026
d60073b
Add max_model_len param for VLLM in order to prevent OOM errors
ErlisLushtaku Feb 15, 2026
38f63ee
Fix completion loading and EuroLLM-9B example
ErlisLushtaku Feb 15, 2026
6f5e0fc
Remove `direnv` documentation
ErlisLushtaku Feb 17, 2026
42ff2ae
Revert stylistic (formatting) changes and add more documentation for …
ErlisLushtaku Feb 17, 2026
8fcb032
Rename OPENJURY_EVAL_DATA to OPENJURY_DATA
ErlisLushtaku Feb 17, 2026
df958af
Merge main
ErlisLushtaku Feb 21, 2026
35856f2
Revert changes in gitignore
ErlisLushtaku Feb 21, 2026
6a11182
Handle models with max_position_embeddings when we pass max_model_len
ErlisLushtaku Feb 21, 2026
fecd3ed
Revert EuroLLM-9B-Instruct to EuroLLM-9B since there is a default cha…
ErlisLushtaku Feb 21, 2026
0b4eaec
fix tests
ErlisLushtaku Feb 22, 2026
29340b0
Change test github workflow to use uv instead of pip for a more robus…
ErlisLushtaku Feb 22, 2026
2c294f1
Move dev dependencies to dependency-group
ErlisLushtaku Feb 22, 2026
4be61bf
Revert comment removal
ErlisLushtaku Feb 22, 2026
51d2597
Add pre-commit hook
ErlisLushtaku Feb 22, 2026
8dee7b2
add project scripts and move slurmpilot to dev group
ErlisLushtaku Feb 23, 2026
fdc9410
fix LlamaCpp bug with ChatTemplate
ErlisLushtaku Mar 2, 2026
48c5373
Add MT-Bench multi-turn evaluation support
ErlisLushtaku Mar 2, 2026
648a9be
Merge branch 'main' into erlislushtaku/feat/add-mt-bench-support
ErlisLushtaku Mar 2, 2026
14f747e
fix result formatting
ErlisLushtaku Mar 2, 2026
e67ea79
remove double environment variable
ErlisLushtaku Mar 2, 2026
4089be8
remove accidental duplications
ErlisLushtaku Mar 2, 2026
03f5cce
Refactor
ErlisLushtaku Mar 4, 2026
8ffe3a6
Remove duplication between prompt templates
ErlisLushtaku Mar 4, 2026
b877f11
add temperature argument
ErlisLushtaku Mar 9, 2026
c2056b5
add option for making mt-bench consistent with the original one from …
ErlisLushtaku Mar 9, 2026
41cd15d
Merge branch 'main' into erlislushtaku/feat/add-mt-bench-support
ErlisLushtaku Mar 9, 2026
0ca66c5
remove redundant print statement
ErlisLushtaku Mar 10, 2026
a295305
move mt-bench logic from the entrypoint
ErlisLushtaku Mar 17, 2026
0fb9700
Remove stale unused entries for fastchat mode
ErlisLushtaku Mar 17, 2026
e5670ea
Merge origin/main into erlislushtaku/feat/add-mt-bench-support
ErlisLushtaku Mar 17, 2026
6dd78fd
Refactor mt-bench eval helpers into shared runtime module
ErlisLushtaku Mar 17, 2026
0094eea
move cli args and parsing to separate util to remove dependencies on …
ErlisLushtaku Mar 18, 2026
f522e5b
refactor to address comments on PR
ErlisLushtaku Mar 24, 2026
6a851c3
remove openjury mode for mt-bench keeping only the original version
ErlisLushtaku Mar 31, 2026
caaa079
Merge remote-tracking branch 'origin/main' into erlislushtaku/feat/ad…
ErlisLushtaku Mar 31, 2026
2e8e04e
Restore code and fix after merge/refactor
ErlisLushtaku Mar 31, 2026
5a314a7
format
ErlisLushtaku Mar 31, 2026
8c91606
fix ci
ErlisLushtaku Mar 31, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
181 changes: 181 additions & 0 deletions judgearena/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,181 @@
"""CLI argument configuration for generation and evaluation entrypoints."""

import argparse
import json
from dataclasses import dataclass, field


@dataclass
class CliArgs:
dataset: str
model_A: str
model_B: str
judge_model: str

n_instructions: int | None = None
provide_explanation: bool = False
swap_mode: str = "fixed"
ignore_cache: bool = False
use_tqdm: bool = False
truncate_all_input_chars: int = 8192
max_out_tokens_models: int = 32768
max_out_tokens_judge: int = 32768
max_model_len: int | None = None
chat_template: str | None = None
result_folder: str = "results"
engine_kwargs: dict = field(default_factory=dict)

def __post_init__(self):
supported_modes = ["fixed", "both"]
assert self.swap_mode in supported_modes, (
f"Only {supported_modes} modes are supported but got {self.swap_mode}."
)

@classmethod
def parse_args(cls):
parser = argparse.ArgumentParser(
prog="Generate completion and evaluate with a judge",
)
parser.add_argument(
"--dataset",
help="The dataset to use. For instance `alpaca-eval`, `arena-hard`, `m-arena-hard-EU` for instruction "
"tuning cases or `french-contexts`, `spanish-contexts` for base models.",
)
parser.add_argument(
"--model_A",
required=True,
help="Name of the LLM to use for a generation, must be a valid choice for `generation_provider`",
)
parser.add_argument(
"--model_B",
required=True,
help="Name of the LLM to use for a generation, must be a valid choice for `generation_provider`",
)
parser.add_argument(
"--judge_model",
required=True,
help="Name of the LLM to use, for instance `Together/meta-llama/Meta-Llama-3-70B-Instruct-Turbo`, "
"`VLLM/meta-llama/Meta-Llama-3-70B-Instruct-Turbo`, `LangChain/LocalPath` etc",
)
parser.add_argument(
"--n_instructions",
type=int,
required=False,
)
parser.add_argument(
"--provide_explanation",
action="store_true",
help="If specified, judge will provide explanation before making a judgement. Does not necessarily improve"
"the accuracy of the judge but enables some result interpretation.",
)
parser.add_argument(
"--swap_mode",
type=str,
choices=["fixed", "both"],
default="fixed",
help="Model comparison order mode. 'fixed': always use model order A-B. 'both': correct for model order "
"bias by evaluating each instruction twice, once as A-B and once as B-A, and average. This helps account "
"for judge position bias. Default is 'fixed'.",
)
parser.add_argument(
"--ignore_cache",
action="store_true",
help="If specified, ignore cache of previous completions.",
)
parser.add_argument(
"--use_tqdm",
action="store_true",
help="If specified, use tqdm, does not work with all model providers, vLLM in particular.",
)
parser.add_argument(
"--result_folder",
type=str,
required=False,
default="results",
help="The folder to save the results. Defaults to `results`. Evaluation results will be saved in"
" `[result_folder]/[evaluation_name]`.",
)
parser.add_argument(
"--truncate_all_input_chars",
type=int,
required=False,
default=8192,
help="Character-level truncation applied before tokenization: truncates each instruction "
"before model A/B generation and truncates each completion before judge evaluation.",
)
parser.add_argument(
"--max_out_tokens_models",
type=int,
required=False,
default=32768,
help=(
"Generation token budget for each model A/B response. For VLLM, keep this <= "
"--max_model_len (if provided)."
),
)
parser.add_argument(
"--max_out_tokens_judge",
type=int,
required=False,
default=32768,
help=(
"Generation token budget for the judge response (reasoning + scores). For "
"VLLM, keep this <= --max_model_len (if provided)."
),
)
parser.add_argument(
"--max_model_len",
type=int,
required=False,
default=None,
help=(
"Optional total context window for VLLM models (prompt + generation). This is "
"independent from --max_out_tokens_models/--max_out_tokens_judge, which only cap "
"generated tokens. This is useful on smaller GPUs to avoid OOM."
),
)
parser.add_argument(
"--chat_template",
type=str,
required=False,
default=None,
help="Jinja2 chat template string to use instead of the model's tokenizer template. "
"If not provided, ChatML is used as fallback for models without a chat template.",
)
parser.add_argument(
"--engine_kwargs",
type=str,
required=False,
default="{}",
help=(
"JSON dict of engine-specific kwargs forwarded to the underlying engine. "
'Example for vLLM: \'{"tensor_parallel_size": 2, "gpu_memory_utilization": 0.9}\'.'
),
)
args = parser.parse_args()

try:
engine_kwargs = json.loads(args.engine_kwargs) if args.engine_kwargs else {}
if not isinstance(engine_kwargs, dict):
raise ValueError("engine_kwargs must be a JSON object")
except Exception as e:
raise SystemExit(f"Failed to parse --engine_kwargs: {e}") from e

return cls(
dataset=args.dataset,
model_A=args.model_A,
model_B=args.model_B,
judge_model=args.judge_model,
n_instructions=args.n_instructions,
provide_explanation=args.provide_explanation,
swap_mode=args.swap_mode,
ignore_cache=args.ignore_cache,
use_tqdm=args.use_tqdm,
truncate_all_input_chars=args.truncate_all_input_chars,
max_out_tokens_models=args.max_out_tokens_models,
max_out_tokens_judge=args.max_out_tokens_judge,
max_model_len=args.max_model_len,
chat_template=args.chat_template,
result_folder=args.result_folder,
engine_kwargs=engine_kwargs,
)
155 changes: 155 additions & 0 deletions judgearena/eval_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
"""Shared evaluation runtime helpers used by entrypoints and benchmark pipelines."""
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The name of the file is a bit misleading, probably eval_utils.py is better.


from __future__ import annotations

from dataclasses import dataclass

import pandas as pd

from judgearena.evaluate import PairScore, annotate_battles
from judgearena.utils import compute_pref_summary


def print_results(results):
    """Render a battle-results dict as a human-readable stdout report.

    Expects the summary keys produced by the evaluation pipeline
    (``dataset``, ``model_A``, ``model_B``, ``judge_model``, ``num_battles``,
    ``winrate``, ``num_wins``, ``num_losses``, ``num_ties``) plus the
    optional ``num_missing``, ``per_category`` and ``per_turn`` breakdowns.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("🏆 MODEL BATTLE RESULTS 🏆".center(60))
    print(f"📊 Dataset: {results['dataset']}")
    print(
        f"🤖 Competitors: Model A: {results['model_A']} vs Model B: {results['model_B']}"
    )
    print(f"⚖️ Judge: {results['judge_model']}")
    print("📈 Results Summary:")
    print(f" Total Battles: {results['num_battles']}")
    print(f" Win Rate (A): {results['winrate']:.1%}")
    print(f" ✅ Wins: {results['num_wins']}")
    print(f" ❌ Losses: {results['num_losses']}")
    print(f" 🤝 Ties: {results['num_ties']}")
    # Missing judgements are only reported when at least one occurred.
    if results.get("num_missing", 0) > 0:
        print(f" ❓ Missing: {results['num_missing']}")

    category_stats = results.get("per_category")
    if category_stats:
        print("\nPer-Category Breakdown:")
        print(
            f" {'Category':<14} | {'Win Rate(A)':>11} | {'Wins':>4} | {'Losses':>6} | {'Ties':>4}"
        )
        print(f" {'-' * 14}-+-{'-' * 11}-+-{'-' * 4}-+-{'-' * 6}-+-{'-' * 4}")
        for label in sorted(category_stats):
            row = category_stats[label]
            print(
                f" {label:<14} | {row['winrate']:>11.1%} | "
                f"{row['num_wins']:>4} | {row['num_losses']:>6} | {row['num_ties']:>4}"
            )

    turn_stats = results.get("per_turn")
    if turn_stats:
        print("\nPer-Turn Breakdown:")
        for turn in sorted(turn_stats):
            row = turn_stats[turn]
            print(
                f" Turn {turn} Win Rate(A): {row['winrate']:.1%} "
                f"(W:{row['num_wins']} L:{row['num_losses']} T:{row['num_ties']})"
            )
    print(banner + "\n")


def _compute_grouped_stats(
    preferences: pd.Series,
    metadata: list[dict[str, object]],
    group_by: str,
) -> dict[object, dict[str, float | int]]:
    """Bucket preference values by the metadata field ``group_by`` and summarize each bucket.

    Entries whose metadata does not provide ``group_by`` (or maps it to None)
    are skipped. ``preferences`` and ``metadata`` must be the same length.
    """
    buckets: dict[object, list[float]] = {}
    for pref, meta in zip(metadata, preferences, strict=True):
        group_key = pref.get(group_by) if False else meta.get(group_by)  # noqa: placeholder
        if group_key is not None:
            buckets.setdefault(group_key, []).append(meta if False else pref)  # noqa: placeholder
    return {
        group_key: compute_pref_summary(pd.Series(values))
        for group_key, values in buckets.items()
    }


def _parse_preferences_from_annotations(
annotations: list,
score_parser: PairScore,
) -> pd.Series:
return pd.Series(
[
score_parser.parse_model_raw(annotation.judge_completion)
for annotation in annotations
]
)


@dataclass
class JudgeAnnotationResult:
    """Container for the outputs of a judge annotation run (see `_make_judge_annotation`)."""

    # Raw judge annotations for the original A-vs-B ordering.
    annotations: list
    # Raw judge annotations for the reversed B-vs-A pass (empty unless swap_mode == "both").
    annotations_reversed: list
    # Per-instruction metadata aligned with `annotations`.
    metadata_for_annotations: list[dict[str, object]]
    # Per-instruction metadata aligned with `annotations_reversed` (empty when no reversed pass ran).
    metadata_for_reversed_annotations: list[dict[str, object]]
    # Parsed numeric preferences for all passes; reversed-pass scores are stored flipped (1 - p).
    preferences: pd.Series
    # Metadata rows aligned 1:1 with `preferences` (original metadata, repeated for the reversed pass).
    combined_metadata: list[dict[str, object]]


def _make_judge_annotation(
    *,
    judge_chat_model,
    instructions: list[str],
    completions_A: list[str],
    completions_B: list[str],
    metadata: list[dict[str, object]],
    score_parser: PairScore,
    provide_explanation: bool,
    swap_mode: str,
    truncate_input_chars: int | None,
    use_tqdm: bool,
    system_prompt: str | None = None,
    user_prompt_template: str | None = None,
) -> JudgeAnnotationResult:
    """Run the judge over paired completions and collect parsed preferences.

    When ``swap_mode == "both"`` a second pass is run with the completions
    swapped (B shown in the A slot) and its parsed scores are flipped
    (``1 - p``) before being appended, to correct for judge position bias.

    Raises:
        ValueError: if ``instructions`` is empty.
    """
    if not instructions:
        raise ValueError("instructions must be non-empty")

    def _run_judge(shown_as_A: list[str], shown_as_B: list[str]):
        # One judging pass; everything but the A/B slot assignment is shared.
        return annotate_battles(
            judge_chat_model=judge_chat_model,
            instructions=instructions,
            completions_A=shown_as_A,
            completions_B=shown_as_B,
            provide_explanation=provide_explanation,
            system_prompt=system_prompt,
            user_prompt_template=user_prompt_template,
            truncate_input_chars=truncate_input_chars,
            use_tqdm=use_tqdm,
        )

    forward_annotations = _run_judge(completions_A, completions_B)
    preference_parts = [
        _parse_preferences_from_annotations(forward_annotations, score_parser)
    ]

    reversed_annotations: list = []
    reversed_metadata: list[dict[str, object]] = []
    combined_metadata = list(metadata)

    if swap_mode == "both":
        print("Correction for judge bias towards a certain model position is set.")
        print("Evaluating completions with models reversed.")
        reversed_annotations = _run_judge(completions_B, completions_A)
        flipped_prefs = 1 - _parse_preferences_from_annotations(
            reversed_annotations, score_parser
        )
        preference_parts.append(flipped_prefs)
        reversed_metadata = list(metadata)
        combined_metadata.extend(metadata)

    return JudgeAnnotationResult(
        annotations=forward_annotations,
        annotations_reversed=reversed_annotations,
        metadata_for_annotations=list(metadata),
        metadata_for_reversed_annotations=reversed_metadata,
        preferences=pd.concat(preference_parts).reset_index(drop=True),
        combined_metadata=combined_metadata,
    )
28 changes: 22 additions & 6 deletions judgearena/evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,30 +51,46 @@ def get_regexp_match(self, s: str, regex: str, group_index: int = 1):
return float(m.group(group_index).strip(" "))


# Placeholder substitutions for the judge user-prompt template.
_COMPLETION_LABEL_SINGLE = "Answer"
_COMPLETION_LABEL_MULTI_TURN = "Conversation with User"
_EXPLANATION_SUFFIX = ", first starts with an explanation of your judgement"
_SCORE_FENCE = "\n```"


def load_judge_system_and_user_prompt(
    provide_explanation: bool = True,
    multi_turn: bool = False,
) -> tuple[str, str]:
    """Load the judge system prompt and user-prompt template from the prompts directory.

    The span in the scraped diff contained both the pre- and post-change lines
    interleaved (old `with open(...)` reads next to the new `read_text()` reads);
    this is the reconstructed post-change version.

    Args:
        provide_explanation: selects the template that asks the judge to explain
            its judgement before scoring.
        multi_turn: selects the completion label used in the template
            ("Conversation with User" instead of "Answer").

    Returns:
        A ``(system_prompt, user_prompt_template)`` pair of raw strings.
    """
    prompts_dir = Path(__file__).parent / "prompts"
    system_prompt = (prompts_dir / "system-prompt.txt").read_text()

    prompt_filename = (
        "prompt-with-explanation.txt" if provide_explanation else "prompt.txt"
    )
    user_prompt_template = (prompts_dir / prompt_filename).read_text()
    # Fill template placeholders according to the evaluation mode.
    user_prompt_template = user_prompt_template.replace(
        "{completion_label}",
        _COMPLETION_LABEL_MULTI_TURN if multi_turn else _COMPLETION_LABEL_SINGLE,
    )
    user_prompt_template = user_prompt_template.replace(
        "{explanation_suffix}",
        _EXPLANATION_SUFFIX if provide_explanation else _SCORE_FENCE,
    )

    return system_prompt, user_prompt_template


def resolve_judge_prompts(
*,
provide_explanation: bool,
multi_turn: bool = False,
system_prompt: str | None = None,
user_prompt_template: str | None = None,
) -> tuple[str, str]:
default_system_prompt, default_user_prompt_template = (
load_judge_system_and_user_prompt(provide_explanation=provide_explanation)
load_judge_system_and_user_prompt(
provide_explanation=provide_explanation, multi_turn=multi_turn
)
)
return (
system_prompt if system_prompt is not None else default_system_prompt,
Expand Down
Loading
Loading