diff --git a/README.md b/README.md index 24087a0..fd4e51a 100644 --- a/README.md +++ b/README.md @@ -22,7 +22,7 @@ Compared to other libraries, here is a breakdown of features: | **Arena-Hard-Auto** | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | | **Lighteval** | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | | **Evalchemy** | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | -| **OpenJury** | 🔜 | ✅ | ✅ | ✅ | ✅ | ✅ | +| **OpenJury** | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | The table has been done on Oct 2025, in case some libraries implemented missing features, please open an issue or send a PR, we will be happy to update the information. @@ -191,10 +191,29 @@ python openjury/generate_and_evaluate.py \ This override applies to all vLLM models in the run. For remote providers (OpenAI, Together, OpenRouter), the flag is ignored since they handle templates server-side. +### MT-Bench (Multi-Turn Evaluation) + +MT-Bench evaluates multi-turn conversation ability using 80 two-turn questions across 8 categories +(writing, roleplay, reasoning, math, coding, extraction, STEM, humanities). +It uses category-dependent judge prompts and reference answers for math/reasoning/coding. +Questions are automatically downloaded from the [LMSYS MT-Bench HuggingFace space](https://huggingface.co/spaces/lmsys/mt-bench). + +```bash +uv run python openjury/generate_and_evaluate.py \ + --dataset mt-bench \ + --model_A VLLM/Qwen/Qwen2.5-7B-Instruct \ + --model_B OpenRouter/openai/gpt-4o \ + --judge_model OpenRouter/deepseek/deepseek-chat-v3.1 \ + --n_instructions 10 +``` + +Results include per-category and per-turn win rate breakdowns. Use `--swap_mode both` to correct for judge position bias. 
+ ## 📊 Supported Datasets | Dataset | Description | |-----------------------|------------------------------------------------------------------------------------------------| +| `mt-bench` | 80 multi-turn (2-turn) questions across 8 categories ([LMSYS MT-Bench](https://arxiv.org/abs/2306.05685)) | | `alpaca-eval` | General instruction-following benchmark | | `arena-hard` | More challenging evaluation suite | | `m-arena-hard` | Translated version of Arena-Hard in 23 languages | diff --git a/openjury/config.py b/openjury/config.py new file mode 100644 index 0000000..80802eb --- /dev/null +++ b/openjury/config.py @@ -0,0 +1,212 @@ +"""CLI argument configuration for generation and evaluation entrypoints.""" + +import argparse +import json +from dataclasses import dataclass, field + + +@dataclass +class CliArgs: + dataset: str + model_A: str + model_B: str + judge_model: str + + n_instructions: int | None = None + provide_explanation: bool = False + swap_mode: str = "fixed" + ignore_cache: bool = False + use_tqdm: bool = False + truncate_all_input_chars: int = 8192 + max_out_tokens_models: int = 32768 + max_out_tokens_judge: int = 32768 + max_model_len: int | None = None + chat_template: str | None = None + mt_bench_turns: str = "both" + mt_bench_compatibility: str = "openjury" + result_folder: str = "results" + engine_kwargs: dict = field(default_factory=dict) + + def __post_init__(self): + supported_modes = ["fixed", "both"] + assert ( + self.swap_mode in supported_modes + ), f"Only {supported_modes} modes are supported but got {self.swap_mode}." + supported_mt_bench_modes = ["openjury", "fastchat"] + assert ( + self.mt_bench_compatibility in supported_mt_bench_modes + ), f"Only {supported_mt_bench_modes} are supported but got {self.mt_bench_compatibility}." + + @classmethod + def parse_args(cls): + parser = argparse.ArgumentParser( + prog="Generate completion and evaluate with a judge", + ) + parser.add_argument( + "--dataset", + help="The dataset to use. 
For instance `alpaca-eval`, `arena-hard`, `m-arena-hard-EU` for instruction " + "tuning cases or `french-contexts`, `spanish-contexts` for base models.", + ) + parser.add_argument( + "--model_A", + required=True, + help="Name of the LLM to use for a generation, must be a valid choice for `generation_provider`", + ) + parser.add_argument( + "--model_B", + required=True, + help="Name of the LLM to use for a generation, must be a valid choice for `generation_provider`", + ) + parser.add_argument( + "--judge_model", + required=True, + help="Name of the LLM to use, for instance `Together/meta-llama/Meta-Llama-3-70B-Instruct-Turbo`, " + "`VLLM/meta-llama/Meta-Llama-3-70B-Instruct-Turbo`, `LangChain/LocalPath` etc", + ) + parser.add_argument( + "--n_instructions", + type=int, + required=False, + ) + parser.add_argument( + "--provide_explanation", + action="store_true", + help="If specified, judge will provide explanation before making a judgement. Does not necessarily improve" + "the accuracy of the judge but enables some result interpretation.", + ) + parser.add_argument( + "--swap_mode", + type=str, + choices=["fixed", "both"], + default="fixed", + help="Model comparison order mode. 'fixed': always use model order A-B. 'both': correct for model order " + "bias by evaluating each instruction twice, once as A-B and once as B-A, and average. This helps account " + "for judge position bias. Default is 'fixed'.", + ) + parser.add_argument( + "--ignore_cache", + action="store_true", + help="If specified, ignore cache of previous completions.", + ) + parser.add_argument( + "--use_tqdm", + action="store_true", + help="If specified, use tqdm, does not work with all model providers, vLLM in particular.", + ) + parser.add_argument( + "--result_folder", + type=str, + required=False, + default="results", + help="The folder to save the results. Defaults to `results`. 
Evaluation results will be saved in" + " `[result_folder]/[evaluation_name]`.", + ) + parser.add_argument( + "--truncate_all_input_chars", + type=int, + required=False, + default=8192, + help="Character-level truncation applied before tokenization: truncates each instruction " + "before model A/B generation and truncates each completion before judge evaluation.", + ) + parser.add_argument( + "--max_out_tokens_models", + type=int, + required=False, + default=32768, + help=( + "Generation token budget for each model A/B response. For VLLM, keep this <= " + "--max_model_len (if provided)." + ), + ) + parser.add_argument( + "--max_out_tokens_judge", + type=int, + required=False, + default=32768, + help=( + "Generation token budget for the judge response (reasoning + scores). For " + "VLLM, keep this <= --max_model_len (if provided)." + ), + ) + parser.add_argument( + "--max_model_len", + type=int, + required=False, + default=None, + help=( + "Optional total context window for VLLM models (prompt + generation). This is " + "independent from --max_out_tokens_models/--max_out_tokens_judge, which only cap " + "generated tokens. This is useful on smaller GPUs to avoid OOM." + ), + ) + parser.add_argument( + "--chat_template", + type=str, + required=False, + default=None, + help="Jinja2 chat template string to use instead of the model's tokenizer template. " + "If not provided, ChatML is used as fallback for models without a chat template.", + ) + parser.add_argument( + "--mt_bench_turns", + type=str, + choices=["both", "single", "multi"], + default="both", + help="Which MT-Bench turns to evaluate. 'single': only turn 1, " + "'multi': only turn 2 (with full conversation context), " + "'both' (default): evaluate both turns.", + ) + parser.add_argument( + "--mt_bench_compatibility", + type=str, + choices=["openjury", "fastchat"], + default="openjury", + help=( + "MT-Bench evaluation/generation mode. 
" + "'openjury' (default): OpenJury score_A/score_B prompt + softmax preference. " + "'fastchat': use FastChat/MT-Bench pairwise prompts with [[A]]/[[B]]/[[C]] verdict parsing, " + "conservative position-bias handling, judge temperature=0, and MT-Bench category temperatures." + ), + ) + parser.add_argument( + "--engine_kwargs", + type=str, + required=False, + default="{}", + help=( + "JSON dict of engine-specific kwargs forwarded to the underlying engine. " + "Example for vLLM: '{\"tensor_parallel_size\": 2, \"gpu_memory_utilization\": 0.9}'." + ), + ) + args = parser.parse_args() + + try: + engine_kwargs = ( + json.loads(args.engine_kwargs) if args.engine_kwargs else {} + ) + if not isinstance(engine_kwargs, dict): + raise ValueError("engine_kwargs must be a JSON object") + except Exception as e: + raise SystemExit(f"Failed to parse --engine_kwargs: {e}") + + return cls( + dataset=args.dataset, + model_A=args.model_A, + model_B=args.model_B, + judge_model=args.judge_model, + n_instructions=args.n_instructions, + provide_explanation=args.provide_explanation, + swap_mode=args.swap_mode, + ignore_cache=args.ignore_cache, + use_tqdm=args.use_tqdm, + truncate_all_input_chars=args.truncate_all_input_chars, + max_out_tokens_models=args.max_out_tokens_models, + max_out_tokens_judge=args.max_out_tokens_judge, + max_model_len=args.max_model_len, + chat_template=args.chat_template, + mt_bench_turns=args.mt_bench_turns, + mt_bench_compatibility=args.mt_bench_compatibility, + result_folder=args.result_folder, + engine_kwargs=engine_kwargs, + ) diff --git a/openjury/eval_runtime.py b/openjury/eval_runtime.py new file mode 100644 index 0000000..dd367e5 --- /dev/null +++ b/openjury/eval_runtime.py @@ -0,0 +1,171 @@ +"""Shared evaluation runtime helpers used by entrypoints and benchmark pipelines.""" + +from __future__ import annotations + +import pandas as pd + +from openjury.evaluate import annotate_battles, PairScore + + +def print_results(results): + """Print battle results in 
a readable format.""" + print("\n" + "=" * 60) + print("🏆 MODEL BATTLE RESULTS 🏆".center(60)) + print(f"📊 Dataset: {results['dataset']}") + print( + f"🤖 Competitors: Model A: {results['model_A']} vs Model B: {results['model_B']}" + ) + print(f"⚖️ Judge: {results['judge_model']}") + print("📈 Results Summary:") + print(f" Total Battles: {results['num_battles']}") + print(f" Win Rate (A): {results['winrate']:.1%}") + print(f" ✅ Wins: {results['num_wins']}") + print(f" ❌ Losses: {results['num_losses']}") + print(f" 🤝 Ties: {results['num_ties']}") + if results.get("num_missing", 0) > 0: + print(f" ❓ Missing: {results['num_missing']}") + + per_category = results.get("per_category") + if per_category: + print("\nPer-Category Breakdown:") + print( + f" {'Category':<14} | {'Win Rate(A)':>11} | {'Wins':>4} | {'Losses':>6} | {'Ties':>4}" + ) + print(f" {'-' * 14}-+-{'-' * 11}-+-{'-' * 4}-+-{'-' * 6}-+-{'-' * 4}") + for cat, stats in sorted(per_category.items()): + print( + f" {cat:<14} | {stats['winrate']:>11.1%} | " + f"{stats['num_wins']:>4} | {stats['num_losses']:>6} | {stats['num_ties']:>4}" + ) + + per_turn = results.get("per_turn") + if per_turn: + print("\nPer-Turn Breakdown:") + for turn, stats in sorted(per_turn.items()): + print( + f" Turn {turn} Win Rate(A): {stats['winrate']:.1%} " + f"(W:{stats['num_wins']} L:{stats['num_losses']} T:{stats['num_ties']})" + ) + print("=" * 60 + "\n") + + +def compute_preference_stats(prefs: pd.Series) -> dict: + """Derive win/loss/tie counts and winrate from a Series of preferences.""" + num_battles = len(prefs) + num_wins = int(sum(prefs < 0.5)) + num_losses = int(sum(prefs > 0.5)) + num_ties = int(sum(prefs == 0.5)) + num_missing = num_battles - (num_wins + num_losses + num_ties) + denom = num_wins + num_losses + num_ties + winrate = float((num_wins + 0.5 * num_ties) / denom) if denom else 0.0 + return { + "num_battles": num_battles, + "num_wins": num_wins, + "num_losses": num_losses, + "num_ties": num_ties, + "num_missing": 
num_missing, + "winrate": winrate, + } + + +def _compute_grouped_stats( + preferences: pd.Series, + metadata: list[dict[str, object]], + group_by: str, +) -> dict[object, dict[str, float | int]]: + grouped: dict[object, list[float]] = {} + for meta, pref in zip(metadata, preferences): + key = meta.get(group_by) + if key is None: + continue + grouped.setdefault(key, []).append(pref) + return { + key: compute_preference_stats(pd.Series(vals)) + for key, vals in grouped.items() + } + + +def _parse_preferences_from_annotations( + annotations: list, + score_parser: PairScore, +) -> pd.Series: + return pd.Series( + [ + score_parser.parse_model_raw(annotation.judge_completion) + for annotation in annotations + ] + ) + + +def _judge_turn( + *, + judge_chat_model, + instructions: list[str], + completions_A: list[str], + completions_B: list[str], + metadata: list[dict[str, object]], + score_parser: PairScore, + provide_explanation: bool, + swap_mode: str, + truncate_input_chars: int | None, + use_tqdm: bool, + system_prompt: str | None = None, + user_prompt_template: str | None = None, +) -> tuple[ + list, + list, + list[dict[str, object]], + list[dict[str, object]], + pd.Series, + list[dict[str, object]], +]: + if not instructions: + return [], [], [], [], pd.Series(dtype=float), [] + + annotations = annotate_battles( + judge_chat_model=judge_chat_model, + instructions=instructions, + completions_A=completions_A, + completions_B=completions_B, + provide_explanation=provide_explanation, + system_prompt=system_prompt, + user_prompt_template=user_prompt_template, + truncate_input_chars=truncate_input_chars, + use_tqdm=use_tqdm, + ) + preference_parts = [_parse_preferences_from_annotations(annotations, score_parser)] + + annotations_reversed: list = [] + metadata_for_reversed_annotations: list[dict[str, object]] = [] + combined_metadata = list(metadata) + + if swap_mode == "both": + print("Correction for judge bias towards a certain model position is set.") + print("Evaluating 
completions with models reversed.") + annotations_reversed = annotate_battles( + judge_chat_model=judge_chat_model, + instructions=instructions, + completions_A=completions_B, + completions_B=completions_A, + provide_explanation=provide_explanation, + system_prompt=system_prompt, + user_prompt_template=user_prompt_template, + truncate_input_chars=truncate_input_chars, + use_tqdm=use_tqdm, + ) + prefs_reversed = _parse_preferences_from_annotations( + annotations_reversed, score_parser + ) + preference_parts.append(1 - prefs_reversed) + metadata_for_reversed_annotations = list(metadata) + combined_metadata.extend(metadata) + + preferences = pd.concat(preference_parts).reset_index(drop=True) + return ( + annotations, + annotations_reversed, + list(metadata), + metadata_for_reversed_annotations, + preferences, + combined_metadata, + ) diff --git a/openjury/evaluate.py b/openjury/evaluate.py index 342da57..b5845ba 100644 --- a/openjury/evaluate.py +++ b/openjury/evaluate.py @@ -17,6 +17,7 @@ data_root, download_hf, do_inference, + truncate, ) @@ -51,18 +52,29 @@ def get_regexp_match(self, s: str, regex: str, group_index: int = 1): return float(m.group(group_index).strip(" ")) +_COMPLETION_LABEL_SINGLE = "Answer" +_COMPLETION_LABEL_MULTI_TURN = "Conversation with User" +_EXPLANATION_SUFFIX = ", first starts with an explanation of your judgement" +_SCORE_FENCE = "\n```" + + def load_judge_system_and_user_prompt( provide_explanation: bool = True, + multi_turn: bool = False, ) -> tuple[str, str]: - # Prepare judge - with open(Path(__file__).parent / "prompts" / "system-prompt.txt", "r") as f: - system_prompt = str(f.read()) + prompts_dir = Path(__file__).parent / "prompts" + + system_prompt = (prompts_dir / "system-prompt.txt").read_text() - prompt_filename = ( - "prompt-with-explanation.txt" if provide_explanation else "prompt.txt" + user_prompt_template = (prompts_dir / "prompt.txt").read_text() + user_prompt_template = user_prompt_template.replace( + 
"{completion_label}", + _COMPLETION_LABEL_MULTI_TURN if multi_turn else _COMPLETION_LABEL_SINGLE, + ) + user_prompt_template = user_prompt_template.replace( + "{explanation_suffix}", + _EXPLANATION_SUFFIX if provide_explanation else _SCORE_FENCE, ) - with open(Path(__file__).parent / "prompts" / prompt_filename, "r") as f: - user_prompt_template = str(f.read()) return system_prompt, user_prompt_template @@ -287,14 +299,6 @@ def annotate_battles( [("system", system_prompt), ("user", user_prompt_template)] ) - def truncate(s: str, max_len: int | None = None): - if not isinstance(s, str): - return "" - if max_len is not None: - return s[:max_len] - else: - return s - inputs = prompt_template.batch( [ { diff --git a/openjury/generate.py b/openjury/generate.py index 11c6508..64eb789 100644 --- a/openjury/generate.py +++ b/openjury/generate.py @@ -1,17 +1,59 @@ import pandas as pd from langchain.prompts import ChatPromptTemplate +from typing import Any from openjury.utils import ( do_inference, make_model, + truncate, ) -def truncate(s: str, max_len: int | None = None): - if max_len is not None: - return s[:max_len] - else: - return s +def _set_temperature_on_model(chat_model, temperature: float) -> None: + if hasattr(chat_model, "set_temperature"): + chat_model.set_temperature(temperature) + return + if hasattr(chat_model, "temperature"): + setattr(chat_model, "temperature", temperature) + + +def _infer_grouped_by_temperature( + *, + model_spec: str, + provider: str, + max_tokens: int | None, + model_kwargs: dict[str, Any], + base_model, + inputs: list, + temperatures: list[float], + use_tqdm: bool, +) -> list[str]: + outputs: list[str] = [""] * len(inputs) + groups: dict[float, list[int]] = {} + for idx, temp in enumerate(temperatures): + groups.setdefault(float(temp), []).append(idx) + + for temp in sorted(groups.keys()): + idxs = groups[temp] + group_inputs = [inputs[i] for i in idxs] + + if provider in {"VLLM", "LlamaCpp"}: + _set_temperature_on_model(base_model, 
temp) + group_model = base_model + else: + group_model = make_model( + model_spec, max_tokens=max_tokens, temperature=temp, **model_kwargs + ) + + group_outs = do_inference( + chat_model=group_model, + inputs=group_inputs, + use_tqdm=use_tqdm, + ) + for i, out in zip(idxs, group_outs): + outputs[i] = out + + return outputs def generate_instructions( @@ -57,6 +99,136 @@ def generate_instructions( return df_outputs +def generate_multiturn( + questions: pd.DataFrame, + model: str, + truncate_input_chars: int | None = 8192, + max_tokens: int | None = 8192, + use_tqdm: bool = True, + temperature_config: dict[str, float] | None = None, + **model_kwargs, +) -> pd.DataFrame: + """Generate two-turn completions for MT-Bench style questions. + + Generates turn 1 answers first, then uses them as conversation context + to generate turn 2 answers. + + Args: + questions: DataFrame with columns turn_1, turn_2, and index instruction_index. + model: Model specification string (e.g. "VLLM/model-name"). + temperature_config: Optional category -> temperature mapping. When set, + inputs are inferred in temperature-homogeneous groups to match + MT-Bench/FastChat category defaults. + **model_kwargs: Provider-specific options forwarded to make_model + (e.g. max_model_len, chat_template for VLLM). + Returns: + DataFrame with columns: instruction_index, completion_turn_1, completion_turn_2 + """ + provider = model.split("/")[0] + use_category_temperatures = temperature_config is not None + local_provider = provider in {"VLLM", "LlamaCpp"} + + chat_model = None + if use_category_temperatures and local_provider: + chat_model = make_model(model, max_tokens=max_tokens, temperature=0.0, **model_kwargs) + else: + chat_model = make_model(model, max_tokens=max_tokens, **model_kwargs) + + system_prompt = "You are a helpful assistant." 
+ idxs = questions.index.tolist() + temperatures: list[float] = [] + if use_category_temperatures: + temperatures = [ + temperature_config.get(str(questions.loc[idx].get("category") or ""), 0.7) + for idx in idxs + ] + + turn1_template = ChatPromptTemplate.from_messages( + [("system", system_prompt), ("user", "{user_prompt}")] + ) + + turn1_inputs = turn1_template.batch( + [ + {"user_prompt": truncate(row["turn_1"], max_len=truncate_input_chars)} + for _, row in questions.iterrows() + ] + ) + + print(f"Generating turn 1 completions ({len(turn1_inputs)} questions).") + if use_category_temperatures: + completions_turn_1 = _infer_grouped_by_temperature( + model_spec=model, + provider=provider, + max_tokens=max_tokens, + model_kwargs=model_kwargs, + base_model=chat_model, + inputs=turn1_inputs, + temperatures=temperatures, + use_tqdm=use_tqdm, + ) + else: + completions_turn_1 = do_inference( + chat_model=chat_model, + inputs=turn1_inputs, + use_tqdm=use_tqdm, + ) + + turn2_inputs = [] + for (_, row), t1_answer in zip(questions.iterrows(), completions_turn_1): + if row["turn_2"] is None: + turn2_inputs.append( + turn1_template.invoke( + {"user_prompt": "No follow-up question."} + ) + ) + else: + multi_turn_template = ChatPromptTemplate.from_messages( + [ + ("system", system_prompt), + ("user", "{turn_1}"), + ("assistant", "{turn_1_answer}"), + ("user", "{turn_2}"), + ] + ) + turn2_inputs.append( + multi_turn_template.invoke( + { + "turn_1": truncate(row["turn_1"], max_len=truncate_input_chars), + "turn_1_answer": truncate(str(t1_answer), max_len=truncate_input_chars), + "turn_2": truncate(row["turn_2"], max_len=truncate_input_chars), + } + ) + ) + + print(f"Generating turn 2 completions ({len(turn2_inputs)} questions).") + if use_category_temperatures: + completions_turn_2 = _infer_grouped_by_temperature( + model_spec=model, + provider=provider, + max_tokens=max_tokens, + model_kwargs=model_kwargs, + base_model=chat_model, + inputs=turn2_inputs, + 
temperatures=temperatures, + use_tqdm=use_tqdm, + ) + else: + completions_turn_2 = do_inference( + chat_model=chat_model, + inputs=turn2_inputs, + use_tqdm=use_tqdm, + ) + + df_outputs = pd.DataFrame( + data={ + "instruction_index": idxs, + "completion_turn_1": completions_turn_1, + "completion_turn_2": completions_turn_2, + }, + ) + return df_outputs + + def generate_base( instructions: pd.Series, model: str, diff --git a/openjury/generate_and_evaluate.py b/openjury/generate_and_evaluate.py index 83cec69..66d15af 100644 --- a/openjury/generate_and_evaluate.py +++ b/openjury/generate_and_evaluate.py @@ -3,26 +3,33 @@ and then evaluates them using a judge model. """ -import argparse import json -from dataclasses import asdict, dataclass, field -from datetime import datetime, timezone +import os +from dataclasses import asdict +from datetime import datetime from functools import partial from pathlib import Path -import numpy as np import pandas as pd +from openjury.config import CliArgs from openjury.evaluate import ( - annotate_battles, PairScore, - resolve_judge_prompts, ) +from openjury.eval_runtime import _judge_turn, compute_preference_stats, print_results from openjury.generate import generate_instructions, generate_base from openjury.instruction_dataset import load_instructions -from openjury.repro import write_run_metadata, _to_jsonable -from openjury.utils import data_root, read_df, download_hf -from openjury.utils import make_model, cache_function_dataframe, compute_pref_summary +from openjury.mt_bench.pipeline import ( + format_mt_bench_for_evaluation, + run_mt_bench, +) +from openjury.utils import ( + cache_function_dataframe, + data_root, + download_hf, + make_model, + read_df, +) def try_load_dataset_completions( @@ -62,208 +69,11 @@ def try_load_dataset_completions( ) -@dataclass -class CliArgs: - dataset: str - model_A: str - model_B: str - judge_model: str - - n_instructions: int | None = None - provide_explanation: bool = False - swap_mode: str = 
"fixed" - ignore_cache: bool = False - use_tqdm: bool = False - truncate_all_input_chars: int = 8192 - max_out_tokens_models: int = 32768 - max_out_tokens_judge: int = 32768 - max_model_len: int | None = None - chat_template: str | None = None - result_folder: str = "results" - engine_kwargs: dict = field(default_factory=dict) - - def __post_init__(self): - supported_modes = ["fixed", "both"] - assert ( - self.swap_mode in supported_modes - ), f"Only {supported_modes} modes are supported but got {self.swap_mode}." - - @classmethod - def parse_args(cls): - parser = argparse.ArgumentParser( - prog="Generate completion and evaluate with a judge", - ) - parser.add_argument( - "--dataset", - help="The dataset to use. For instance `alpaca-eval`, `arena-hard`, `m-arena-hard-EU` for instruction " - "tuning cases or `french-contexts`, `spanish-contexts` for base models.", - ) - parser.add_argument( - "--model_A", - required=True, - help="Name of the LLM to use for a generation, must be a valid choice for `generation_provider`", - ) - parser.add_argument( - "--model_B", - required=True, - help="Name of the LLM to use for a generation, must be a valid choice for `generation_provider`", - ) - parser.add_argument( - "--judge_model", - required=True, - help="Name of the LLM to use, for instance `Together/meta-llama/Meta-Llama-3-70B-Instruct-Turbo`, " - "`VLLM/meta-llama/Meta-Llama-3-70B-Instruct-Turbo`, `LangChain/LocalPath` etc", - ) - parser.add_argument( - "--n_instructions", - type=int, - required=False, - ) - parser.add_argument( - "--provide_explanation", - action="store_true", - help="If specified, judge will provide explanation before making a judgement. Does not necessarily improve" - "the accuracy of the judge but enables some result interpretation.", - ) - parser.add_argument( - "--swap_mode", - type=str, - choices=["fixed", "both"], - default="fixed", - help="Model comparison order mode. 'fixed': always use model order A-B. 
'both': correct for model order " - "bias by evaluating each instruction twice, once as A-B and once as B-A, and average. This helps account " - "for judge position bias. Default is 'fixed'.", - ) - parser.add_argument( - "--ignore_cache", - action="store_true", - help="If specified, ignore cache of previous completions.", - ) - parser.add_argument( - "--use_tqdm", - action="store_true", - help="If specified, use tqdm, does not work with all model providers, vLLM in particular.", - ) - parser.add_argument( - "--result_folder", - type=str, - required=False, - default="results", - help="The folder to save the results. Defaults to `results`. Evaluation results will be saved in" - " `[result_folder]/[evaluation_name]`.", - ) - parser.add_argument( - "--truncate_all_input_chars", - type=int, - required=False, - default=8192, - help="Character-level truncation applied before tokenization: truncates each instruction " - "before model A/B generation and truncates each completion before judge evaluation.", - ) - parser.add_argument( - "--max_out_tokens_models", - type=int, - required=False, - default=32768, - help=( - "Generation token budget for each model A/B response. For VLLM, keep this <= " - "--max_model_len (if provided)." - ), - ) - parser.add_argument( - "--max_out_tokens_judge", - type=int, - required=False, - default=32768, - help=( - "Generation token budget for the judge response (reasoning + scores). For " - "VLLM, keep this <= --max_model_len (if provided)." - ), - ) - parser.add_argument( - "--max_model_len", - type=int, - required=False, - default=None, - help=( - "Optional total context window for VLLM models (prompt + generation). This is " - "independent from --max_out_tokens_models/--max_out_tokens_judge, which only cap " - "generated tokens. This is useful on smaller GPUs to avoid OOM." 
- ), - ) - parser.add_argument( - "--chat_template", - type=str, - required=False, - default=None, - help="Jinja2 chat template string to use instead of the model's tokenizer template. " - "If not provided, ChatML is used as fallback for models without a chat template.", - ) - parser.add_argument( - "--engine_kwargs", - type=str, - required=False, - default="{}", - help=( - "JSON dict of engine-specific kwargs forwarded to the underlying engine. " - "Example for vLLM: '{\"tensor_parallel_size\": 2, \"gpu_memory_utilization\": 0.9}'." - ), - ) - args = parser.parse_args() - - try: - engine_kwargs = ( - json.loads(args.engine_kwargs) if args.engine_kwargs else {} - ) - if not isinstance(engine_kwargs, dict): - raise ValueError("engine_kwargs must be a JSON object") - except Exception as e: - raise SystemExit(f"Failed to parse --engine_kwargs: {e}") - - return cls( - dataset=args.dataset, - model_A=args.model_A, - model_B=args.model_B, - judge_model=args.judge_model, - n_instructions=args.n_instructions, - provide_explanation=args.provide_explanation, - swap_mode=args.swap_mode, - ignore_cache=args.ignore_cache, - use_tqdm=args.use_tqdm, - truncate_all_input_chars=args.truncate_all_input_chars, - max_out_tokens_models=args.max_out_tokens_models, - max_out_tokens_judge=args.max_out_tokens_judge, - max_model_len=args.max_model_len, - chat_template=args.chat_template, - result_folder=args.result_folder, - engine_kwargs=engine_kwargs, - ) - - def load_contexts(dataset: str) -> pd.Series: path = data_root / "contexts" / dataset return pd.read_csv(path).loc[:, "instruction"] -def print_results(results): - """Print battle results in a nice formatted way""" - - print("\n" + "=" * 60) - print("🏆 MODEL BATTLE RESULTS 🏆".center(60)) - print(f"📊 Dataset: {results['dataset']}") - print( - f"🤖 Competitors: Model A: {results['model_A']} vs Model B: {results['model_B']}" - ) - print(f"⚖️ Judge: {results['judge_model']}") - print(f"📈 Results Summary:") - print(f" Total Battles: 
{results['num_battles']}") - print(f" Win Rate (A): {results['winrate']:.1%}") - print(f" ✅ Wins: {results['num_wins']}") - print(f" ❌ Losses: {results['num_losses']}") - print(f" 🤝 Ties: {results['num_ties']}") - print("=" * 60 + "\n") - - def main(args: CliArgs): """ 1) take as input: @@ -276,7 +86,6 @@ def main(args: CliArgs): 3) create annotations """ - run_started_at = datetime.now(timezone.utc) print( f"Using dataset {args.dataset} and evaluating models {args.model_A} and {args.model_B}." ) @@ -286,6 +95,10 @@ def main(args: CliArgs): # set_langchain_cache() ignore_cache = args.ignore_cache + # MT-Bench has its own pipeline: multi-turn generation + category-aware judging + if args.dataset == "mt-bench": + return run_mt_bench(args, ignore_cache) + # Currrently, we run context evaluation is_fluency_task = "fluency" in args.dataset if is_fluency_task: @@ -377,19 +190,6 @@ def main(args: CliArgs): chat_template=args.chat_template, **args.engine_kwargs, ) - - name = f"{args.dataset}-{args.model_A}-{args.model_B}-{args.judge_model}" - name += f"-{args.swap_mode}" - name = name.replace("/", "_") - - res_folder = Path(args.result_folder) / name - res_folder.mkdir(parents=True, exist_ok=True) - - # save argument for results analysis - with open(res_folder / f"args-{name}.json", "w") as f: - json.dump(asdict(args), f, indent=2) - - print(f"Saving results to {res_folder}") if is_fluency_task: system_prompt = """You are a highly efficient assistant, who evaluates and selects the best large language \ model based on the quality of completion of a sentence. You will see a sentence to be completed and two \ @@ -398,54 +198,59 @@ def main(args: CliArgs): the ordering or on the length of the answers.""" else: # the default system prompt of annotate is to compare instruction tuned models. 
+ system_prompt = None + + instruction_subset = instructions.head(n_instructions) + instruction_indices = instruction_subset.index.tolist() + metadata = [{"instruction_index": idx} for idx in instruction_indices] + score_parser = PairScore() ( - effective_judge_system_prompt, - judge_user_prompt_template, - ) = resolve_judge_prompts( - provide_explanation=args.provide_explanation, - system_prompt=system_prompt, - ) - annotations = annotate_battles( + annotations, + annotations_reversed, + metadata_for_annotations, + metadata_for_reversed_annotations, + prefs, + _combined_metadata, + ) = _judge_turn( judge_chat_model=judge_chat_model, - instructions=instructions.head(n_instructions).tolist(), + instructions=instruction_subset.tolist(), completions_A=completions_A.head(n_instructions).tolist(), completions_B=completions_B.head(n_instructions).tolist(), + metadata=metadata, + score_parser=score_parser, provide_explanation=args.provide_explanation, - system_prompt=effective_judge_system_prompt, - user_prompt_template=judge_user_prompt_template, + swap_mode=args.swap_mode, truncate_input_chars=args.truncate_all_input_chars, use_tqdm=args.use_tqdm, + system_prompt=system_prompt, ) - if args.swap_mode == "both": - print("Correction for judge bias towards a certain model position is set.") - print( - f"Evaluating completions with models reversed with judge {args.judge_model}." 
- ) - annotations_reversed = annotate_battles( - judge_chat_model=judge_chat_model, - instructions=instructions.head(n_instructions).tolist(), - completions_A=completions_B.head(n_instructions).tolist(), - completions_B=completions_A.head(n_instructions).tolist(), - provide_explanation=args.provide_explanation, - system_prompt=effective_judge_system_prompt, - user_prompt_template=judge_user_prompt_template, - truncate_input_chars=args.truncate_all_input_chars, - use_tqdm=args.use_tqdm, - ) + name = f"{args.dataset}-{args.model_A}-{args.model_B}-{args.judge_model}" + name += f"-{args.swap_mode}" + name = name.replace("/", "_") + + res_folder = Path(args.result_folder) / name + res_folder.mkdir(parents=True, exist_ok=True) + # save argument for results analysis + with open(res_folder / f"args-{name}.json", "w") as f: + json.dump(asdict(args), f, indent=2) + + print(f"Saving results to {res_folder}") df = pd.DataFrame(annotations) - df["instruction_index"] = instructions.head(n_instructions).index.tolist() + df["instruction_index"] = [ + meta["instruction_index"] for meta in metadata_for_annotations + ] df["model_A"] = args.model_A df["model_B"] = args.model_B df["judge"] = args.judge_model if args.swap_mode == "both": df_reversed = pd.DataFrame(annotations_reversed) - df_reversed["instruction_index"] = instructions.head( - n_instructions - ).index.tolist() + df_reversed["instruction_index"] = [ + meta["instruction_index"] for meta in metadata_for_reversed_annotations + ] df_reversed["model_A"] = args.model_B df_reversed["model_B"] = args.model_A df_reversed["judge"] = args.judge_model @@ -453,64 +258,22 @@ def main(args: CliArgs): df.to_csv(res_folder / f"{name}-annotations.csv", index=False) - # compute preferences between A and B - score_parser = PairScore() - prefs = pd.Series( - [ - score_parser.parse_model_raw(annotation.judge_completion) - for annotation in annotations - ] - ) - - if args.swap_mode == "both": - prefs_reversed = pd.Series( - [ - 
score_parser.parse_model_raw(annotation.judge_completion) - for annotation in annotations_reversed - ] - ) - prefs = pd.concat([prefs, (1 - prefs_reversed)]).reset_index(drop=True) - - # compute and report statistics - summary = compute_pref_summary(prefs) - + stats = compute_preference_stats(prefs) results = { "dataset": args.dataset, "model_A": args.model_A, "model_B": args.model_B, "judge_model": args.judge_model, - **summary, + **stats, "preferences": prefs.tolist(), + "date": str(datetime.now().isoformat()), + "user": os.getenv("USER", ""), } print(f"{args.model_A} vs {args.model_B} judged by {args.judge_model}") print_results(results) with open(res_folder / f"results-{name}.json", "w") as f: - json.dump(_to_jsonable(results), f, indent=2, allow_nan=False) - - eval_instruction_index = instructions.head(n_instructions).index.tolist() - eval_instructions = instructions.head(n_instructions).tolist() - eval_completions_A = completions_A.head(n_instructions).tolist() - eval_completions_B = completions_B.head(n_instructions).tolist() - - try: - write_run_metadata( - output_dir=res_folder, - entrypoint="openjury.generate_and_evaluate.main", - run=asdict(args), - results=results, - input_payloads={ - "instruction_index": eval_instruction_index, - "instructions": eval_instructions, - "completions_A": eval_completions_A, - "completions_B": eval_completions_B, - }, - judge_system_prompt=effective_judge_system_prompt, - judge_user_prompt_template=judge_user_prompt_template, - started_at_utc=run_started_at, - ) - except OSError as e: - print(f"Warning: failed to write run metadata: {e}") + json.dump(results, f, indent=2) return prefs diff --git a/openjury/instruction_dataset/__init__.py b/openjury/instruction_dataset/__init__.py index ac211e2..fc75155 100644 --- a/openjury/instruction_dataset/__init__.py +++ b/openjury/instruction_dataset/__init__.py @@ -4,7 +4,12 @@ def load_instructions(dataset: str, n_instructions: int | None = None) -> pd.DataFrame: - if "m-arena-hard" 
in dataset: + if dataset == "mt-bench": + from openjury.instruction_dataset.mt_bench import load_mt_bench + + df_instructions = load_mt_bench() + + elif "m-arena-hard" in dataset: if dataset == "m-arena-hard": language = None else: diff --git a/openjury/instruction_dataset/mt_bench.py b/openjury/instruction_dataset/mt_bench.py new file mode 100644 index 0000000..910a045 --- /dev/null +++ b/openjury/instruction_dataset/mt_bench.py @@ -0,0 +1,166 @@ +from pathlib import Path +from urllib.request import urlretrieve +import warnings + +import pandas as pd +from huggingface_hub import snapshot_download + +from openjury.utils import data_root + +FASTCHAT_GPT4_REFERENCE_URL = ( + "https://raw.githubusercontent.com/lm-sys/FastChat/main/" + "fastchat/llm_judge/data/mt_bench/reference_answer/gpt-4.jsonl" +) + +def _download_gpt4_references(local_dir: Path) -> Path | None: + reference_dir = local_dir / "reference_answer" + reference_dir.mkdir(parents=True, exist_ok=True) + gpt4_reference_path = reference_dir / "gpt-4.jsonl" + if gpt4_reference_path.exists(): + return gpt4_reference_path + try: + urlretrieve(FASTCHAT_GPT4_REFERENCE_URL, gpt4_reference_path) + except Exception as e: + warnings.warn( + "Could not download MT-Bench GPT-4 reference answers from FastChat. " + f"Falling back to inline references from question.jsonl: {e}", + RuntimeWarning, + ) + return None + return gpt4_reference_path + + +def download_mt_bench(local_dir: Path | None = None) -> tuple[Path, Path | None]: + """Download MT-Bench questions and GPT-4 references if missing.""" + if local_dir is None: + local_dir = data_root / "mt-bench" + try: + local_dir.mkdir(parents=True, exist_ok=True) + except PermissionError as e: + raise PermissionError( + f"Cannot create MT-Bench cache directory at {local_dir}. " + "Set environment variable OPENJURY_DATA to a writable location." 
+ ) from e + + question_path = local_dir / "data" / "mt_bench" / "question.jsonl" + if not question_path.exists(): + try: + snapshot_download( + repo_id="lmsys/mt-bench", + repo_type="space", + allow_patterns=[ + "data/mt_bench/question.jsonl", + ], + local_dir=local_dir, + force_download=False, + ) + except Exception as e: + raise RuntimeError( + "Failed to download MT-Bench questions from HuggingFace space " + "'lmsys/mt-bench'. If you're in an offline / restricted-network " + "environment, pre-download the space snapshot and place the " + f"questions file at {question_path}, or set OPENJURY_DATA to " + "point to that directory." + ) from e + if not question_path.exists(): + raise FileNotFoundError( + "Could not locate MT-Bench questions after download. " + f"Expected file at {question_path}." + ) + + gpt4_reference_path = _download_gpt4_references(local_dir) + return question_path, gpt4_reference_path + + +def load_mt_bench() -> pd.DataFrame: + """Load MT-Bench questions and reference answers. + + Downloads MT-Bench questions from the HuggingFace LMSYS space and tries to + load GPT-4 references from FastChat GitHub. If GPT-4 references cannot be + downloaded or parsed, falls back to inline references from question.jsonl. 
+ """ + question_path, ref_path = download_mt_bench() + + questions = pd.read_json(question_path, lines=True).to_dict(orient="records") + + ref_by_id: dict[int | str, list[str]] = {} + use_inline_reference_fallback = ref_path is None + if ref_path is not None: + try: + reference_records = pd.read_json(ref_path, lines=True).to_dict( + orient="records" + ) + for rec in reference_records: + qid = rec.get("question_id", rec.get("id")) + if qid is None: + continue + choices = rec.get("choices") + if not (isinstance(choices, list) and choices): + continue + first_choice = choices[0] + if not isinstance(first_choice, dict): + continue + turns = first_choice.get("turns") + if not isinstance(turns, list): + continue + ref_by_id[qid] = turns + try: + ref_by_id[int(qid)] = turns + except Exception: + pass + except Exception as e: + warnings.warn( + "Failed to parse GPT-4 reference answers from FastChat. " + f"Falling back to inline references from question.jsonl: {e}", + RuntimeWarning, + ) + use_inline_reference_fallback = True + + rows = [] + for rec in questions: + qid_raw = rec.get("question_id", rec.get("id")) + if qid_raw is None: + raise ValueError( + f"MT-Bench question record missing question_id/id: keys={list(rec.keys())}" + ) + try: + qid = int(qid_raw) + except Exception: + qid = qid_raw + + category = rec.get("category") + turns = rec.get("turns") + if isinstance(turns, list): + turn_1 = turns[0] if len(turns) > 0 else None + turn_2 = turns[1] if len(turns) > 1 else None + else: + turn_1 = rec.get("turn_1", rec.get("instruction")) + turn_2 = rec.get("turn_2") + + ref_turns = ref_by_id.get(qid_raw) or ref_by_id.get(qid) + if ref_turns is None and use_inline_reference_fallback: + inline_ref = rec.get("reference") + if isinstance(inline_ref, list): + ref_turns = inline_ref + + ref_turn_1 = ( + ref_turns[0] if isinstance(ref_turns, list) and len(ref_turns) > 0 else None + ) + ref_turn_2 = ( + ref_turns[1] if isinstance(ref_turns, list) and len(ref_turns) > 1 else 
None + ) + + rows.append( + { + "instruction_index": qid, + "category": category, + "turn_1": turn_1, + "turn_2": turn_2, + "reference_turn_1": ref_turn_1, + "reference_turn_2": ref_turn_2, + "instruction": turn_1, + } + ) + + return pd.DataFrame(rows) + diff --git a/openjury/mt_bench/__init__.py b/openjury/mt_bench/__init__.py new file mode 100644 index 0000000..c5cdd59 --- /dev/null +++ b/openjury/mt_bench/__init__.py @@ -0,0 +1,5 @@ +"""MT-Bench-specific helpers. + +This package intentionally contains MT-Bench specific logic. +""" + diff --git a/openjury/mt_bench/common.py b/openjury/mt_bench/common.py new file mode 100644 index 0000000..8a0028e --- /dev/null +++ b/openjury/mt_bench/common.py @@ -0,0 +1,67 @@ +from __future__ import annotations + +from dataclasses import dataclass +from typing import Iterator + +import pandas as pd + +from openjury.utils import safe_text + + +@dataclass(frozen=True) +class MTBenchPairwiseRow: + question_id: object + category: str | None + turn_1_question: str + turn_2_question: str + answer_a_1: str + answer_a_2: str + answer_b_1: str + answer_b_2: str + ref_1: str + ref_2: str + + +def iter_mt_bench_pairwise_rows( + *, + questions: pd.DataFrame, + completions_a: pd.DataFrame, + completions_b: pd.DataFrame, + truncate_input_chars: int | None, +) -> Iterator[MTBenchPairwiseRow]: + for question_id in questions.index.tolist(): + row = questions.loc[question_id] + comp_a_row = ( + completions_a.loc[question_id] + if question_id in completions_a.index + else completions_a.iloc[0] + ) + comp_b_row = ( + completions_b.loc[question_id] + if question_id in completions_b.index + else completions_b.iloc[0] + ) + yield MTBenchPairwiseRow( + question_id=question_id, + category=row.get("category"), + turn_1_question=safe_text(row.get("turn_1"), truncate_input_chars), + turn_2_question=safe_text(row.get("turn_2"), truncate_input_chars), + answer_a_1=safe_text( + comp_a_row.get("completion_turn_1", ""), + truncate_input_chars, + ), + 
answer_a_2=safe_text( + comp_a_row.get("completion_turn_2", ""), + truncate_input_chars, + ), + answer_b_1=safe_text( + comp_b_row.get("completion_turn_1", ""), + truncate_input_chars, + ), + answer_b_2=safe_text( + comp_b_row.get("completion_turn_2", ""), + truncate_input_chars, + ), + ref_1=safe_text(row.get("reference_turn_1"), truncate_input_chars), + ref_2=safe_text(row.get("reference_turn_2"), truncate_input_chars), + ) diff --git a/openjury/mt_bench/fastchat_compat.py b/openjury/mt_bench/fastchat_compat.py new file mode 100644 index 0000000..728b0f2 --- /dev/null +++ b/openjury/mt_bench/fastchat_compat.py @@ -0,0 +1,477 @@ +from __future__ import annotations + +import math +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Literal + +import pandas as pd +from langchain.prompts import ChatPromptTemplate + +from openjury.mt_bench.common import iter_mt_bench_pairwise_rows +from openjury.utils import do_inference + + +FASTCHAT_TEMPERATURE_CONFIG: dict[str, float] = { + "writing": 0.7, + "roleplay": 0.7, + "extraction": 0.0, + "math": 0.0, + "coding": 0.0, + "reasoning": 0.0, + "stem": 0.1, + "humanities": 0.1, +} + +FASTCHAT_NEED_REF_CATS: set[str] = {"math", "reasoning", "coding"} + +FastChatVerdict = Literal["A", "B", "tie", "error"] +PairwiseWinner = Literal["model_A", "model_B", "tie", "error"] + + +@dataclass(frozen=True) +class FastChatPairwisePrompt: + name: str + system_prompt: str + user_prompt_template: str + multi_turn: bool + ref_based: bool + + +_PROMPTS_DIR = Path(__file__).resolve().parent.parent / "prompts" / "mt_bench" +_SYSTEM_BASE_FILE = "system-base.txt" +_USER_SINGLE_BASE_FILE = "user-single-base.txt" +_USER_MULTI_BASE_FILE = "user-multi-base.txt" +_USER_SINGLE_REF_BLOCK_FILE = "user-single-reference-block.txt" +_USER_MULTI_REF_BLOCK_FILE = "user-multi-reference-block.txt" + + +def _load_prompt_text(filename: str) -> str: + path = _PROMPTS_DIR / filename + return path.read_text(encoding="utf-8") + + +def 
_render_prompt_text(filename: str, **kwargs: str) -> str: + return _load_prompt_text(filename).format(**kwargs) + + +def _build_system_prompt( + *, + user_subject: str, + task_description: str, + begin_instruction: str, + focus_line: str = "", +) -> str: + focus_segment = f"{focus_line} " if focus_line else "" + return _render_prompt_text( + _SYSTEM_BASE_FILE, + user_subject=user_subject, + task_description=task_description, + focus_line=focus_segment, + begin_instruction=begin_instruction, + ) + + +def _build_user_prompt_template(*, multi_turn: bool, ref_based: bool) -> str: + base_filename = _USER_MULTI_BASE_FILE if multi_turn else _USER_SINGLE_BASE_FILE + reference_block = "" + if ref_based: + ref_block_filename = ( + _USER_MULTI_REF_BLOCK_FILE if multi_turn else _USER_SINGLE_REF_BLOCK_FILE + ) + reference_block = _load_prompt_text(ref_block_filename) + return _render_prompt_text(base_filename, reference_block=reference_block) + + +def _load_pairwise_prompt( + *, + name: str, + multi_turn: bool, + ref_based: bool, + system_user_subject: str, + system_task_description: str, + system_begin_instruction: str, + system_focus_line: str = "", +) -> FastChatPairwisePrompt: + return FastChatPairwisePrompt( + name=name, + multi_turn=multi_turn, + ref_based=ref_based, + system_prompt=_build_system_prompt( + user_subject=system_user_subject, + task_description=system_task_description, + begin_instruction=system_begin_instruction, + focus_line=system_focus_line, + ), + user_prompt_template=_build_user_prompt_template( + multi_turn=multi_turn, + ref_based=ref_based, + ), + ) + + +_PAIR_V2 = _load_pairwise_prompt( + name="pair-v2", + multi_turn=False, + ref_based=False, + system_user_subject="question displayed below", + system_task_description=( + "You should choose the assistant that follows the user's instructions and answers " + "the user's question better. 
Your evaluation should consider factors such as the " + "helpfulness, relevance, accuracy, depth, creativity, and level of detail of their " + "responses." + ), + system_begin_instruction="comparing the two responses and provide a short explanation", +) + +_PAIR_V2_MULTI = _load_pairwise_prompt( + name="pair-v2-multi-turn", + multi_turn=True, + ref_based=False, + system_user_subject="questions", + system_task_description=( + "You should choose the assistant that follows the user's instructions and answers " + "the user's questions better. Your evaluation should consider factors such as the " + "helpfulness, relevance, accuracy, depth, creativity, and level of detail of their " + "responses." + ), + system_focus_line="You should focus on who provides a better answer to the second user question.", + system_begin_instruction=( + "comparing the responses of the two assistants and provide a short explanation" + ), +) + +_PAIR_MATH_V1 = _load_pairwise_prompt( + name="pair-math-v1", + multi_turn=False, + ref_based=True, + system_user_subject="question displayed below", + system_task_description=( + "Your evaluation should consider correctness and helpfulness. You will be given a " + "reference answer, assistant A's answer, and assistant B's answer. Your job is to " + "evaluate which assistant's answer is better." + ), + system_begin_instruction=( + "comparing both assistants' answers with the reference answer. Identify and correct any mistakes" + ), +) + +_PAIR_MATH_V1_MULTI = _load_pairwise_prompt( + name="pair-math-v1-multi-turn", + multi_turn=True, + ref_based=True, + system_user_subject="questions", + system_task_description=( + "Your evaluation should consider correctness and helpfulness. You will be given " + "reference answers, the assistant A's answers, the assistant B's answers. Your job is " + "to determine which assistant provides correct and helpful answers to the second user question." 
+ ), + system_begin_instruction=( + "comparing both assistants' answers with the reference answers. Identify and correct any mistakes" + ), +) + + +def _parse_fastchat_verdict(judgment: str) -> FastChatVerdict: + if "[[A]]" in judgment: + return "A" + if "[[B]]" in judgment: + return "B" + if "[[C]]" in judgment: + return "tie" + return "error" + + +def _map_verdict_to_winner(verdict: FastChatVerdict, swapped: bool) -> PairwiseWinner: + if verdict == "tie": + return "tie" + if verdict == "error": + return "error" + if verdict == "A": + return "model_B" if swapped else "model_A" + if verdict == "B": + return "model_A" if swapped else "model_B" + return "error" + + +def _conservative_winner(g1: PairwiseWinner, g2: PairwiseWinner) -> tuple[PairwiseWinner, bool]: + """Conservative position-bias handling (FastChat/MT-Bench paper). + + Declare a winner only if the two orderings agree; otherwise treat as tie. + """ + if g1 == "error" or g2 == "error": + return "error", False + if g1 == g2: + return g1, False + return "tie", True + + +def _winner_to_preference(winner: PairwiseWinner) -> float: + if winner == "model_A": + return 0.0 + if winner == "model_B": + return 1.0 + if winner == "tie": + return 0.5 + return math.nan + + +def _select_prompt(category: str | None, multi_turn: bool) -> FastChatPairwisePrompt: + needs_ref = (category or "") in FASTCHAT_NEED_REF_CATS + if needs_ref and multi_turn: + return _PAIR_MATH_V1_MULTI + if needs_ref: + return _PAIR_MATH_V1 + if multi_turn: + return _PAIR_V2_MULTI + return _PAIR_V2 + + +def _group_indices_by_prompt( + items: list[dict[str, Any]], +) -> dict[str, list[int]]: + grouped: dict[str, list[int]] = {} + for idx, item in enumerate(items): + grouped.setdefault(item["prompt_name"], []).append(idx) + return grouped + + +def _swap_prompt_kwargs(kwargs: dict[str, str], *, multi_turn: bool) -> dict[str, str]: + swapped = dict(kwargs) + if multi_turn: + swapped["answer_a_1"], swapped["answer_b_1"] = swapped["answer_b_1"], 
swapped["answer_a_1"] + swapped["answer_a_2"], swapped["answer_b_2"] = swapped["answer_b_2"], swapped["answer_a_2"] + return swapped + swapped["answer_a"], swapped["answer_b"] = swapped["answer_b"], swapped["answer_a"] + return swapped + + +def _infer_by_prompt_groups( + *, + judge_chat_model, + items: list[dict[str, Any]], + use_tqdm: bool, + swap_answers: bool, +) -> list[str]: + """Run judge inference, grouping by prompt variant for batching.""" + grouped_indices = _group_indices_by_prompt(items) + + judgments: list[str] = [""] * len(items) + for prompt_name, idxs in grouped_indices.items(): + prompt: FastChatPairwisePrompt = items[idxs[0]]["prompt"] + prompt_template = ChatPromptTemplate.from_messages( + [("system", prompt.system_prompt), ("user", prompt.user_prompt_template)] + ) + + batch_kwargs = [] + for i in idxs: + kwargs = items[i]["prompt_kwargs"] + if swap_answers: + kwargs = _swap_prompt_kwargs(kwargs, multi_turn=prompt.multi_turn) + batch_kwargs.append(kwargs) + + prompt_inputs = prompt_template.batch(batch_kwargs) + outs = do_inference( + chat_model=judge_chat_model, + inputs=prompt_inputs, + use_tqdm=use_tqdm, + ) + for i, out in zip(idxs, outs): + judgments[i] = str(out) + return judgments + + +def _build_fastchat_judge_items( + *, + questions: pd.DataFrame, + completions_a: pd.DataFrame, + completions_b: pd.DataFrame, + eval_single: bool, + eval_multi: bool, + truncate_input_chars: int | None, +) -> list[dict[str, Any]]: + items: list[dict[str, Any]] = [] + for pair_row in iter_mt_bench_pairwise_rows( + questions=questions, + completions_a=completions_a, + completions_b=completions_b, + truncate_input_chars=truncate_input_chars, + ): + category = pair_row.category + if eval_single: + prompt = _select_prompt(category, multi_turn=False) + kwargs: dict[str, str] = { + "question": pair_row.turn_1_question, + "answer_a": pair_row.answer_a_1, + "answer_b": pair_row.answer_b_1, + } + if prompt.ref_based: + kwargs["ref_answer_1"] = pair_row.ref_1 + 
items.append( + { + "question_id": pair_row.question_id, + "category": category, + "turn": 1, + "prompt": prompt, + "prompt_name": prompt.name, + "prompt_kwargs": kwargs, + } + ) + + if eval_multi and pair_row.turn_2_question: + prompt = _select_prompt(category, multi_turn=True) + kwargs = { + "question_1": pair_row.turn_1_question, + "question_2": pair_row.turn_2_question, + "answer_a_1": pair_row.answer_a_1, + "answer_a_2": pair_row.answer_a_2, + "answer_b_1": pair_row.answer_b_1, + "answer_b_2": pair_row.answer_b_2, + } + if prompt.ref_based: + kwargs["ref_answer_1"] = pair_row.ref_1 + kwargs["ref_answer_2"] = pair_row.ref_2 + items.append( + { + "question_id": pair_row.question_id, + "category": category, + "turn": 2, + "prompt": prompt, + "prompt_name": prompt.name, + "prompt_kwargs": kwargs, + } + ) + return items + + +def _resolve_fastchat_item_result( + *, + item: dict[str, Any], + g1_raw: str, + g2_raw: str | None, + judge_model: str, + model_a: str, + model_b: str, +) -> tuple[dict[str, Any], dict[str, object], float, bool]: + prompt: FastChatPairwisePrompt = item["prompt"] + kwargs = item["prompt_kwargs"] + g1_user_prompt = prompt.user_prompt_template.format(**kwargs) + g1_verdict = _parse_fastchat_verdict(g1_raw) + g1_winner = _map_verdict_to_winner(g1_verdict, swapped=False) + + final_winner = g1_winner + inconsistent = False + annotation_row: dict[str, Any] = { + "question_id": item["question_id"], + "category": item["category"], + "turn": item["turn"], + "model_A": model_a, + "model_B": model_b, + "judge": judge_model, + "prompt_name": prompt.name, + "system_prompt": prompt.system_prompt, + "g1_user_prompt": g1_user_prompt, + "g1_judgment": g1_raw, + "g1_verdict": g1_verdict, + "g1_winner": g1_winner, + } + + if g2_raw is not None: + g2_verdict = _parse_fastchat_verdict(g2_raw) + g2_winner = _map_verdict_to_winner(g2_verdict, swapped=True) + final_winner, inconsistent = _conservative_winner(g1_winner, g2_winner) + annotation_row.update( + { + 
"g2_user_prompt": prompt.user_prompt_template.format( + **_swap_prompt_kwargs(kwargs, multi_turn=prompt.multi_turn) + ), + "g2_judgment": g2_raw, + "g2_verdict": g2_verdict, + "g2_winner": g2_winner, + "final_winner": final_winner, + "inconsistent": inconsistent, + } + ) + else: + annotation_row["final_winner"] = final_winner + annotation_row["inconsistent"] = False + + preference = _winner_to_preference(final_winner) + annotation_row["preference"] = preference + metadata = { + "question_id": item["question_id"], + "category": item["category"], + "turn": item["turn"], + } + return annotation_row, metadata, preference, inconsistent + + +def judge_mt_bench_pairwise_fastchat( + *, + judge_chat_model, + judge_model: str, + questions: pd.DataFrame, + completions_a: pd.DataFrame, + completions_b: pd.DataFrame, + model_a: str, + model_b: str, + turns_mode: str, + swap_mode: str, + truncate_input_chars: int | None, + use_tqdm: bool, +) -> tuple[pd.Series, list[dict[str, Any]], list[dict[str, object]], int]: + """Pairwise MT-Bench judging compatible with FastChat's `[[A]]/[[B]]/[[C]]` format.""" + assert turns_mode in ("both", "single", "multi") + assert swap_mode in ("fixed", "both") + + eval_single = turns_mode in ("both", "single") + eval_multi = turns_mode in ("both", "multi") + + items = _build_fastchat_judge_items( + questions=questions, + completions_a=completions_a, + completions_b=completions_b, + eval_single=eval_single, + eval_multi=eval_multi, + truncate_input_chars=truncate_input_chars, + ) + + g1_judgments = _infer_by_prompt_groups( + judge_chat_model=judge_chat_model, + items=items, + use_tqdm=use_tqdm, + swap_answers=False, + ) + + g2_judgments: list[str] | None = None + if swap_mode == "both": + g2_judgments = _infer_by_prompt_groups( + judge_chat_model=judge_chat_model, + items=items, + use_tqdm=use_tqdm, + swap_answers=True, + ) + + annotations: list[dict[str, Any]] = [] + metadata: list[dict[str, object]] = [] + prefs: list[float] = [] + num_inconsistent 
= 0 + + for idx, item in enumerate(items): + g2_raw = g2_judgments[idx] if g2_judgments is not None else None + annotation_row, item_metadata, preference, inconsistent = _resolve_fastchat_item_result( + item=item, + g1_raw=g1_judgments[idx], + g2_raw=g2_raw, + judge_model=judge_model, + model_a=model_a, + model_b=model_b, + ) + if inconsistent: + num_inconsistent += 1 + annotations.append(annotation_row) + metadata.append(item_metadata) + prefs.append(preference) + + return pd.Series(prefs, dtype=float), annotations, metadata, num_inconsistent + diff --git a/openjury/mt_bench/pipeline.py b/openjury/mt_bench/pipeline.py new file mode 100644 index 0000000..39f9eb4 --- /dev/null +++ b/openjury/mt_bench/pipeline.py @@ -0,0 +1,477 @@ +"""MT-Bench evaluation pipeline. + +Orchestrates multi-turn generation, per-turn judging (OpenJury or +FastChat-compatible), and result saving for the MT-Bench benchmark. +""" + +from __future__ import annotations + +import json +import os +from dataclasses import asdict +from datetime import datetime +from pathlib import Path +from typing import TYPE_CHECKING + +import pandas as pd + +from openjury.evaluate import PairScore, load_judge_system_and_user_prompt +from openjury.eval_runtime import ( + _compute_grouped_stats, + _judge_turn, + compute_preference_stats, + print_results, +) +from openjury.generate import generate_multiturn +from openjury.instruction_dataset import load_instructions +from openjury.mt_bench.common import iter_mt_bench_pairwise_rows +from openjury.mt_bench.fastchat_compat import ( + FASTCHAT_TEMPERATURE_CONFIG, + judge_mt_bench_pairwise_fastchat, +) +from openjury.utils import cache_function_dataframe, make_model + +if TYPE_CHECKING: + from openjury.config import CliArgs + +NEED_REF_CATS = {"math", "reasoning", "coding"} + + +def format_mt_bench_for_evaluation( + questions: pd.DataFrame, + completions_A: pd.DataFrame, + completions_B: pd.DataFrame, + turns_mode: str, + truncate_input_chars: int | None, +) -> tuple[ + 
tuple[list[str], list[str], list[str], list[dict[str, object]]], + tuple[list[str], list[str], list[str], list[dict[str, object]]], +]: + """Flatten MT-Bench into per-turn instruction/completion battle inputs.""" + assert turns_mode in ("both", "single", "multi") + eval_single = turns_mode in ("both", "single") + eval_multi = turns_mode in ("both", "multi") + + instructions_turn_1: list[str] = [] + completions_a_turn_1: list[str] = [] + completions_b_turn_1: list[str] = [] + metadata_turn_1: list[dict[str, object]] = [] + + instructions_turn_2: list[str] = [] + completions_a_turn_2: list[str] = [] + completions_b_turn_2: list[str] = [] + metadata_turn_2: list[dict[str, object]] = [] + + for row in iter_mt_bench_pairwise_rows( + questions=questions, + completions_a=completions_A, + completions_b=completions_B, + truncate_input_chars=truncate_input_chars, + ): + needs_ref = row.category in NEED_REF_CATS + if eval_single: + if needs_ref and row.ref_1: + instruction = ( + "[MT-Bench | Turn 1]\n" + "Use the reference answer for correctness checks.\n\n" + f"[Question]\n{row.turn_1_question}\n\n" + f"[Reference Answer]\n{row.ref_1}" + ) + else: + instruction = row.turn_1_question + + instructions_turn_1.append(instruction) + completions_a_turn_1.append(row.answer_a_1) + completions_b_turn_1.append(row.answer_b_1) + metadata_turn_1.append( + { + "question_id": row.question_id, + "category": row.category, + "turn": 1, + } + ) + + if eval_multi and row.turn_2_question: + instruction_parts = [ + "Please focus on which assistant provides a better answer to the second user question." 
+ ] + if needs_ref and (row.ref_1 or row.ref_2): + instruction_parts.extend( + [ + "<|The Start of Reference Answer|>", + "### User:", + row.turn_1_question, + "### Reference answer:", + row.ref_1, + "### User:", + row.turn_2_question, + "### Reference answer:", + row.ref_2, + "<|The End of Reference Answer|>", + ] + ) + + conversation_a = _format_mt_bench_multiturn_conversation( + turn_1_question=row.turn_1_question, + turn_1_answer=row.answer_a_1, + turn_2_question=row.turn_2_question, + turn_2_answer=row.answer_a_2, + ) + conversation_b = _format_mt_bench_multiturn_conversation( + turn_1_question=row.turn_1_question, + turn_1_answer=row.answer_b_1, + turn_2_question=row.turn_2_question, + turn_2_answer=row.answer_b_2, + ) + + instructions_turn_2.append("\n\n".join(instruction_parts)) + completions_a_turn_2.append(conversation_a) + completions_b_turn_2.append(conversation_b) + metadata_turn_2.append( + { + "question_id": row.question_id, + "category": row.category, + "turn": 2, + } + ) + + return ( + ( + instructions_turn_1, + completions_a_turn_1, + completions_b_turn_1, + metadata_turn_1, + ), + ( + instructions_turn_2, + completions_a_turn_2, + completions_b_turn_2, + metadata_turn_2, + ), + ) + + +def _format_mt_bench_multiturn_conversation( + *, + turn_1_question: str, + turn_1_answer: str, + turn_2_question: str, + turn_2_answer: str, +) -> str: + return ( + "### User:\n" + f"{turn_1_question}\n\n" + "### Assistant:\n" + f"{turn_1_answer}\n\n" + "### User:\n" + f"{turn_2_question}\n\n" + "### Assistant:\n" + f"{turn_2_answer}" + ) + + +def _generate_mt_bench_completions( + args: CliArgs, + questions_df: pd.DataFrame, + ignore_cache: bool, +) -> tuple[pd.DataFrame, pd.DataFrame]: + cache_prefix = ( + "mt-bench_fastchatgen" if args.mt_bench_compatibility == "fastchat" else "mt-bench" + ) + + def _run_generation(model_name: str) -> pd.DataFrame: + if args.mt_bench_compatibility == "fastchat": + return generate_multiturn( + questions=questions_df, + 
model=model_name, + truncate_input_chars=args.truncate_all_input_chars, + max_tokens=args.max_out_tokens_models, + use_tqdm=args.use_tqdm, + max_model_len=args.max_model_len, + chat_template=args.chat_template, + temperature_config=FASTCHAT_TEMPERATURE_CONFIG, + ) + return generate_multiturn( + questions=questions_df, + model=model_name, + truncate_input_chars=args.truncate_all_input_chars, + max_tokens=args.max_out_tokens_models, + use_tqdm=args.use_tqdm, + max_model_len=args.max_model_len, + chat_template=args.chat_template, + ) + + completions_a = cache_function_dataframe( + lambda: _run_generation(args.model_A), + ignore_cache=ignore_cache, + cache_name=f"{cache_prefix}_{args.model_A}_{args.n_instructions}", + ).set_index("instruction_index") + + completions_b = cache_function_dataframe( + lambda: _run_generation(args.model_B), + ignore_cache=ignore_cache, + cache_name=f"{cache_prefix}_{args.model_B}_{args.n_instructions}", + ).set_index("instruction_index") + return completions_a, completions_b + + +def _build_mt_bench_result_name(args: CliArgs, suffix: str | None = None) -> str: + name = f"{args.dataset}-{args.model_A}-{args.model_B}-{args.judge_model}" + name += f"-{args.swap_mode}" + if suffix: + name += f"-{suffix}" + return name.replace("/", "_") + + +def _save_mt_bench_results( + *, + args: CliArgs, + results: dict[str, object], + annotations_df: pd.DataFrame, + name_suffix: str | None = None, +) -> None: + name = _build_mt_bench_result_name(args, suffix=name_suffix) + res_folder = Path(args.result_folder) / name + res_folder.mkdir(parents=True, exist_ok=True) + + with open(res_folder / f"args-{name}.json", "w") as f: + json.dump(asdict(args), f, indent=2) + + annotations_df.to_csv(res_folder / f"{name}-annotations.csv", index=False) + + with open(res_folder / f"results-{name}.json", "w") as f: + json.dump(results, f, indent=2) + + +def _run_mt_bench_fastchat( + *, + args: CliArgs, + questions_df: pd.DataFrame, + completions_a: pd.DataFrame, + 
completions_b: pd.DataFrame, + judge_chat_model, +) -> pd.Series: + prefs, annotations, combined_metadata, num_inconsistent = ( + judge_mt_bench_pairwise_fastchat( + judge_chat_model=judge_chat_model, + judge_model=args.judge_model, + questions=questions_df, + completions_a=completions_a, + completions_b=completions_b, + model_a=args.model_A, + model_b=args.model_B, + turns_mode=args.mt_bench_turns, + swap_mode=args.swap_mode, + truncate_input_chars=args.truncate_all_input_chars, + use_tqdm=args.use_tqdm, + ) + ) + + stats = compute_preference_stats(prefs) + results = { + "dataset": args.dataset, + "model_A": args.model_A, + "model_B": args.model_B, + "judge_model": args.judge_model, + "mt_bench_compatibility": args.mt_bench_compatibility, + "num_inconsistent": num_inconsistent, + **stats, + "per_category": _compute_grouped_stats(prefs, combined_metadata, "category"), + "per_turn": _compute_grouped_stats(prefs, combined_metadata, "turn"), + "preferences": prefs.tolist(), + "date": str(datetime.now().isoformat()), + "user": os.getenv("USER", ""), + } + print_results(results) + _save_mt_bench_results( + args=args, + results=results, + annotations_df=pd.DataFrame(annotations), + name_suffix=f"mtbench_{args.mt_bench_compatibility}", + ) + return prefs + + +def _run_mt_bench_openjury( + *, + args: CliArgs, + questions_df: pd.DataFrame, + completions_a: pd.DataFrame, + completions_b: pd.DataFrame, + judge_chat_model, +) -> pd.Series: + turn_1_inputs, turn_2_inputs = format_mt_bench_for_evaluation( + questions=questions_df, + completions_A=completions_a, + completions_B=completions_b, + turns_mode=args.mt_bench_turns, + truncate_input_chars=args.truncate_all_input_chars, + ) + ( + instructions_turn_1, + completions_a_turn_1, + completions_b_turn_1, + metadata_turn_1, + ) = turn_1_inputs + ( + instructions_turn_2, + completions_a_turn_2, + completions_b_turn_2, + metadata_turn_2, + ) = turn_2_inputs + + score_parser = PairScore() + annotations = [] + 
metadata_for_annotations: list[dict[str, object]] = [] + annotations_reversed = [] + metadata_for_reversed_annotations: list[dict[str, object]] = [] + preference_parts: list[pd.Series] = [] + combined_metadata: list[dict[str, object]] = [] + + if instructions_turn_1: + ( + annotations_turn_1, + annotations_turn_1_reversed, + metadata_turn_1_for_annotations, + metadata_turn_1_for_reversed_annotations, + prefs_turn_1, + combined_metadata_turn_1, + ) = _judge_turn( + judge_chat_model=judge_chat_model, + instructions=instructions_turn_1, + completions_A=completions_a_turn_1, + completions_B=completions_b_turn_1, + metadata=metadata_turn_1, + score_parser=score_parser, + provide_explanation=args.provide_explanation, + swap_mode=args.swap_mode, + truncate_input_chars=args.truncate_all_input_chars, + use_tqdm=args.use_tqdm, + ) + annotations.extend(annotations_turn_1) + annotations_reversed.extend(annotations_turn_1_reversed) + metadata_for_annotations.extend(metadata_turn_1_for_annotations) + metadata_for_reversed_annotations.extend( + metadata_turn_1_for_reversed_annotations + ) + preference_parts.append(prefs_turn_1) + combined_metadata.extend(combined_metadata_turn_1) + + if instructions_turn_2: + mt_system_prompt, mt_user_prompt_template = load_judge_system_and_user_prompt( + provide_explanation=args.provide_explanation, + multi_turn=True, + ) + ( + annotations_turn_2, + annotations_turn_2_reversed, + metadata_turn_2_for_annotations, + metadata_turn_2_for_reversed_annotations, + prefs_turn_2, + combined_metadata_turn_2, + ) = _judge_turn( + judge_chat_model=judge_chat_model, + instructions=instructions_turn_2, + completions_A=completions_a_turn_2, + completions_B=completions_b_turn_2, + metadata=metadata_turn_2, + score_parser=score_parser, + provide_explanation=args.provide_explanation, + swap_mode=args.swap_mode, + truncate_input_chars=args.truncate_all_input_chars, + use_tqdm=args.use_tqdm, + system_prompt=mt_system_prompt, + 
user_prompt_template=mt_user_prompt_template, + ) + annotations.extend(annotations_turn_2) + annotations_reversed.extend(annotations_turn_2_reversed) + metadata_for_annotations.extend(metadata_turn_2_for_annotations) + metadata_for_reversed_annotations.extend( + metadata_turn_2_for_reversed_annotations + ) + preference_parts.append(prefs_turn_2) + combined_metadata.extend(combined_metadata_turn_2) + + prefs = ( + pd.concat(preference_parts).reset_index(drop=True) + if preference_parts + else pd.Series(dtype=float) + ) + stats = compute_preference_stats(prefs) + results = { + "dataset": args.dataset, + "model_A": args.model_A, + "model_B": args.model_B, + "judge_model": args.judge_model, + **stats, + "per_category": _compute_grouped_stats(prefs, combined_metadata, "category"), + "per_turn": _compute_grouped_stats(prefs, combined_metadata, "turn"), + "preferences": prefs.tolist(), + "date": str(datetime.now().isoformat()), + "user": os.getenv("USER", ""), + } + print_results(results) + + df = pd.DataFrame(annotations) + df["instruction_index"] = [meta["question_id"] for meta in metadata_for_annotations] + df["category"] = [meta["category"] for meta in metadata_for_annotations] + df["turn"] = [meta["turn"] for meta in metadata_for_annotations] + df["model_A"] = args.model_A + df["model_B"] = args.model_B + df["judge"] = args.judge_model + + if args.swap_mode == "both": + df_reversed = pd.DataFrame(annotations_reversed) + df_reversed["instruction_index"] = [ + meta["question_id"] for meta in metadata_for_reversed_annotations + ] + df_reversed["category"] = [ + meta["category"] for meta in metadata_for_reversed_annotations + ] + df_reversed["turn"] = [meta["turn"] for meta in metadata_for_reversed_annotations] + df_reversed["model_A"] = args.model_B + df_reversed["model_B"] = args.model_A + df_reversed["judge"] = args.judge_model + df = pd.concat([df, df_reversed], ignore_index=True) + + _save_mt_bench_results( + args=args, + results=results, + annotations_df=df, + ) + 
def run_mt_bench(args: CliArgs, ignore_cache: bool):
    """Run the MT-Bench pipeline: load questions, generate answers, judge them.

    Args:
        args: Run configuration. ``args.mt_bench_compatibility`` selects the
            judging path: ``"fastchat"`` uses the FastChat-compatible runner
            and a judge temperature of ``0.0``; any other value uses the
            OpenJury runner and passes ``temperature=None`` to ``make_model``.
        ignore_cache: Forwarded to completion generation.

    Returns:
        Whatever the selected judging runner returns (both are annotated as
        returning a ``pd.Series`` of preferences).
    """
    fastchat_compatible = args.mt_bench_compatibility == "fastchat"

    questions_df = load_instructions("mt-bench", n_instructions=args.n_instructions)
    print(
        f"Generating multi-turn completions for MT-Bench with {args.model_A} and {args.model_B}."
    )
    completions_a, completions_b = _generate_mt_bench_completions(
        args=args,
        questions_df=questions_df,
        ignore_cache=ignore_cache,
    )

    judge_temperature = 0.0 if fastchat_compatible else None
    judge_chat_model = make_model(
        model=args.judge_model,
        max_tokens=args.max_out_tokens_judge,
        temperature=judge_temperature,
        max_model_len=args.max_model_len,
        chat_template=args.chat_template,
    )

    # Both runners take the same keyword-only arguments.
    judge_runner = (
        _run_mt_bench_fastchat if fastchat_compatible else _run_mt_bench_openjury
    )
    return judge_runner(
        args=args,
        questions_df=questions_df,
        completions_a=completions_a,
        completions_b=completions_b,
        judge_chat_model=judge_chat_model,
    )
diff --git a/openjury/prompts/mt_bench/user-multi-base.txt b/openjury/prompts/mt_bench/user-multi-base.txt new file mode 100644 index 0000000..33abb79 --- /dev/null +++ b/openjury/prompts/mt_bench/user-multi-base.txt @@ -0,0 +1,32 @@ +{reference_block}<|The Start of Assistant A's Conversation with User|> + +### User: +{{question_1}} + +### Assistant A: +{{answer_a_1}} + +### User: +{{question_2}} + +### Assistant A: +{{answer_a_2}} + +<|The End of Assistant A's Conversation with User|> + + +<|The Start of Assistant B's Conversation with User|> + +### User: +{{question_1}} + +### Assistant B: +{{answer_b_1}} + +### User: +{{question_2}} + +### Assistant B: +{{answer_b_2}} + +<|The End of Assistant B's Conversation with User|> diff --git a/openjury/prompts/mt_bench/user-multi-reference-block.txt b/openjury/prompts/mt_bench/user-multi-reference-block.txt new file mode 100644 index 0000000..703554d --- /dev/null +++ b/openjury/prompts/mt_bench/user-multi-reference-block.txt @@ -0,0 +1,16 @@ +<|The Start of Reference Answer|> + +### User: +{question_1} + +### Reference answer: +{ref_answer_1} + +### User: +{question_2} + +### Reference answer: +{ref_answer_2} + +<|The End of Reference Answer|> + diff --git a/openjury/prompts/mt_bench/user-single-base.txt b/openjury/prompts/mt_bench/user-single-base.txt new file mode 100644 index 0000000..ee7701c --- /dev/null +++ b/openjury/prompts/mt_bench/user-single-base.txt @@ -0,0 +1,10 @@ +[User Question] +{{question}} + +{reference_block}[The Start of Assistant A's Answer] +{{answer_a}} +[The End of Assistant A's Answer] + +[The Start of Assistant B's Answer] +{{answer_b}} +[The End of Assistant B's Answer] diff --git a/openjury/prompts/mt_bench/user-single-reference-block.txt b/openjury/prompts/mt_bench/user-single-reference-block.txt new file mode 100644 index 0000000..1b687d2 --- /dev/null +++ b/openjury/prompts/mt_bench/user-single-reference-block.txt @@ -0,0 +1,4 @@ +[The Start of Reference Answer] +{ref_answer_1} +[The End 
of Reference Answer] + diff --git a/openjury/prompts/prompt-with-explanation.txt b/openjury/prompts/prompt-with-explanation.txt deleted file mode 100644 index 6600f51..0000000 --- a/openjury/prompts/prompt-with-explanation.txt +++ /dev/null @@ -1,21 +0,0 @@ -<|User Prompt|> -{user_prompt} - -<|The Start of Assistant A's Answer|> -{completion_A} -<|The End of Assistant A's Answer|> - -<|The Start of Assistant B's Answer|> -{completion_B} -<|The End of Assistant B's Answer|> - -# Your output - -## Format description -Your output should follow this format: -``` -score_A: -score_B: -``` - -## Your output, do not repeat the input above, first starts with an explanation of your judgement diff --git a/openjury/prompts/prompt.txt b/openjury/prompts/prompt.txt index 21d2e48..38021e6 100644 --- a/openjury/prompts/prompt.txt +++ b/openjury/prompts/prompt.txt @@ -1,13 +1,13 @@ <|User Prompt|> {user_prompt} -<|The Start of Assistant A's Answer|> +<|The Start of Assistant A's {completion_label}|> {completion_A} -<|The End of Assistant A's Answer|> +<|The End of Assistant A's {completion_label}|> -<|The Start of Assistant B's Answer|> +<|The Start of Assistant B's {completion_label}|> {completion_B} -<|The End of Assistant B's Answer|> +<|The End of Assistant B's {completion_label}|> # Your output @@ -18,5 +18,4 @@ score_A: ``` -## Your output, do not repeat the input above -``` +## Your output, do not repeat the input above{explanation_suffix} diff --git a/openjury/utils.py b/openjury/utils.py index 27db11d..72ee2cd 100644 --- a/openjury/utils.py +++ b/openjury/utils.py @@ -8,7 +8,6 @@ from huggingface_hub import snapshot_download import pandas as pd from tqdm.asyncio import tqdm -from langchain_community.llms import LlamaCpp from langchain_openai import ChatOpenAI from langchain_community.cache import SQLiteCache from langchain_core.globals import set_llm_cache @@ -43,6 +42,23 @@ def read_df(filename: Path, **pandas_kwargs) -> pd.DataFrame: return pd.read_parquet(filename, 
class BaseLocalModel:
    """Shared prompt conversion and invoke helpers for local model wrappers.

    Subclasses implement :meth:`batch`; :meth:`invoke` and :meth:`ainvoke`
    are single-item conveniences built on top of it.
    """

    def _to_messages(self, input_item) -> list[dict]:
        """Convert LangChain prompt input to OpenAI-style messages.

        Accepted inputs:
            * an object exposing ``to_messages()`` (e.g. a LangChain prompt
              value) whose messages have ``type`` and ``content``;
            * a non-empty list of ``(role, content)`` tuples;
            * a non-empty list of message dicts (returned unchanged);
            * a plain string (wrapped as a single user message).

        LangChain role names ``human``/``ai`` are mapped to ``user``/
        ``assistant`` for both message objects and tuples; unknown roles are
        passed through unchanged.

        Raises:
            ValueError: If ``input_item`` has none of the shapes above.
        """
        role_map = {"human": "user", "ai": "assistant", "system": "system"}

        if hasattr(input_item, "to_messages"):
            return [
                {"role": role_map.get(msg.type, msg.type), "content": msg.content}
                for msg in input_item.to_messages()
            ]
        if isinstance(input_item, str):
            return [{"role": "user", "content": input_item}]
        if isinstance(input_item, list) and input_item:
            first = input_item[0]
            if isinstance(first, tuple):
                # Use the same role mapping as for message objects so that
                # ("ai", ...) becomes "assistant" rather than leaking through.
                return [
                    {"role": role_map.get(role, role), "content": content}
                    for role, content in input_item
                ]
            if isinstance(first, dict):
                return input_item
        raise ValueError(f"Unsupported input type: {type(input_item)}")

    def _to_raw_text(self, input_item) -> str:
        """Extract raw text from an input item for text-completion mode.

        Strings are returned as-is, objects exposing ``to_string()`` are
        rendered through it, and non-empty lists of message dicts or
        ``(role, content)`` tuples are flattened by joining their contents
        with newlines (roles are dropped).

        Raises:
            ValueError: If no text can be extracted from ``input_item``.
        """
        if isinstance(input_item, str):
            return input_item
        if hasattr(input_item, "to_string"):
            return input_item.to_string()
        if isinstance(input_item, list) and input_item:
            first = input_item[0]
            if isinstance(first, dict):
                return "\n".join(msg["content"] for msg in input_item)
            if isinstance(first, tuple):
                # Mirror _to_messages, which also accepts (role, content) tuples.
                return "\n".join(content for _role, content in input_item)
        raise ValueError(f"Cannot extract raw text from: {type(input_item)}")

    def batch(self, inputs: list, **invoke_kwargs) -> list[str]:
        """Generate one completion per input; must be provided by subclasses."""
        raise NotImplementedError(
            f"{type(self).__name__} must implement batch()"
        )

    def invoke(self, input_item, **invoke_kwargs) -> str:
        """Generate a completion for a single input via :meth:`batch`."""
        return self.batch([input_item], **invoke_kwargs)[0]

    async def ainvoke(self, input_item, **invoke_kwargs):
        """Async variant of :meth:`invoke`.

        Runs the synchronous :meth:`invoke` in the running loop's default
        executor so the event loop is not blocked.
        """
        # Imported here so the helper does not depend on a module-level import.
        import asyncio

        # Inside a coroutine a loop is always running; get_running_loop()
        # avoids the deprecated implicit-loop path of get_event_loop().
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(
            None, lambda: self.invoke(input_item, **invoke_kwargs)
        )
" + f"Falling back to text-completion mode (no chat formatting). " + f"Override with --chat_template if this model needs one.", + ) + + def batch(self, inputs: list, **kwargs) -> list[str]: + """Process a batch of inputs, resetting KV cache between conversations.""" + results = [] + for inp in inputs: + self.llama.reset() + if self._use_generate: + text = self._to_raw_text(inp) + create_kwargs = {"prompt": text, "max_tokens": self.max_tokens} + if self.temperature is not None: + create_kwargs["temperature"] = self.temperature + response = self.llama.create_completion(**create_kwargs) + results.append(response["choices"][0]["text"]) + else: + messages = self._to_messages(inp) + create_kwargs = {"messages": messages, "max_tokens": self.max_tokens} + if self.temperature is not None: + create_kwargs["temperature"] = self.temperature + response = self.llama.create_chat_completion(**create_kwargs) + results.append(response["choices"][0]["message"]["content"]) + return results + + def set_temperature(self, temperature: float | None) -> None: + self.temperature = None if temperature is None else float(temperature) + + +class ChatVLLM(BaseLocalModel): """VLLM wrapper that auto-detects whether to use chat() or generate(). Chat template handling: @@ -169,9 +319,21 @@ class ChatVLLM: falls back to ``llm.generate()`` and emits a warning. This avoids the ``ValueError`` raised by ``transformers >= v4.44`` which removed the default chat template. + + Sampling defaults: + - Uses ``temperature=0.6`` and ``top_p=0.95`` unless explicitly + overridden. 
""" - def __init__(self, model: str, max_tokens: int = 8192, chat_template: str | None = None, **vllm_kwargs): + def __init__( + self, + model: str, + max_tokens: int = 8192, + temperature: float = 0.6, + top_p: float = 0.95, + chat_template: str | None = None, + **vllm_kwargs, + ): from vllm import LLM, SamplingParams self.model_path = model @@ -183,6 +345,7 @@ def __init__(self, model: str, max_tokens: int = 8192, chat_template: str | None if max_model_len is not None: try: from transformers import AutoConfig + config = AutoConfig.from_pretrained(model, trust_remote_code=True) model_max_pos = getattr(config, "max_position_embeddings", None) if model_max_pos is not None and max_model_len > model_max_pos: @@ -200,10 +363,13 @@ def __init__(self, model: str, max_tokens: int = 8192, chat_template: str | None ) self.llm = LLM(model=model, trust_remote_code=True, **vllm_kwargs) - self.sampling_params = SamplingParams( + self._SamplingParams = SamplingParams + self._temperature = temperature + self._top_p = top_p + self.sampling_params = self._SamplingParams( max_tokens=max_tokens, - temperature=0.6, - top_p=0.95, + temperature=self._temperature, + top_p=self._top_p, ) # Resolve chat template: @@ -229,52 +395,13 @@ def __init__(self, model: str, max_tokens: int = 8192, chat_template: str | None self._use_generate = False print(f"ChatVLLM: using tokenizer's chat template for '{model}'") - def _to_messages(self, input_item) -> list[dict]: - """Convert LangChain prompt input to OpenAI-style messages.""" - # Map LangChain message types to OpenAI roles - role_map = {"human": "user", "ai": "assistant", "system": "system"} - - # Handle ChatPromptValue from LangChain - if hasattr(input_item, "to_messages"): - lc_messages = input_item.to_messages() - return [ - {"role": role_map.get(msg.type, msg.type), "content": msg.content} - for msg in lc_messages - ] - # Handle list of tuples like [("system", "..."), ("user", "...")] - elif ( - isinstance(input_item, list) - and input_item 
- and isinstance(input_item[0], tuple) - ): - return [ - {"role": role if role != "human" else "user", "content": content} - for role, content in input_item - ] - # Handle already formatted messages - elif ( - isinstance(input_item, list) - and input_item - and isinstance(input_item[0], dict) - ): - return input_item - # Handle plain string (wrap as user message) - elif isinstance(input_item, str): - return [{"role": "user", "content": input_item}] - else: - raise ValueError(f"Unsupported input type: {type(input_item)}") - - def _to_raw_text(self, input_item) -> str: - """Extract raw text from an input item for use with llm.generate().""" - if isinstance(input_item, str): - return input_item - # ChatPromptValue from LangChain - if hasattr(input_item, "to_string"): - return input_item.to_string() - # List of dicts (messages) - concatenate contents - if isinstance(input_item, list) and input_item and isinstance(input_item[0], dict): - return "\n".join(msg["content"] for msg in input_item) - raise ValueError(f"Cannot extract raw text from: {type(input_item)}") + def set_temperature(self, temperature: float) -> None: + self._temperature = float(temperature) + self.sampling_params = self._SamplingParams( + max_tokens=self.max_tokens, + temperature=self._temperature, + top_p=self._top_p, + ) def batch(self, inputs: list, **invoke_kwargs) -> list[str]: """Process a batch of inputs using vllm.LLM.chat() or llm.generate(). 
@@ -295,28 +422,21 @@ def batch(self, inputs: list, **invoke_kwargs) -> list[str]: ) return [out.outputs[0].text for out in outputs] - def invoke(self, input_item, **invoke_kwargs) -> str: - """Process a single input.""" - results = self.batch([input_item], **invoke_kwargs) - return results[0] - - async def ainvoke(self, input_item, **invoke_kwargs): - """Async version - runs sync version in executor for compatibility.""" - import asyncio - loop = asyncio.get_event_loop() - return await loop.run_in_executor( - None, lambda: self.invoke(input_item, **invoke_kwargs) - ) - - -def make_model(model: str, max_tokens: int | None = 8192, **engine_kwargs): +def make_model( + model: str, + max_tokens: int | None = 8192, + temperature: float | None = None, + **engine_kwargs, +): """Instantiate a model wrapper from a provider/model-name string. Args: model: Format ``{Provider}/{model_path}``, e.g. ``VLLM/meta-llama/Llama-3.3-70B-Instruct``. max_tokens: Maximum tokens the model may generate. + temperature: Optional generation temperature override. ``None`` keeps + each provider wrapper's default temperature behavior. **engine_kwargs: Engine-specific options forwarded to the model wrapper. """ # Avoid mutating the original engine_kwargs dictionary @@ -326,6 +446,8 @@ def make_model(model: str, max_tokens: int | None = 8192, **engine_kwargs): # Dedicated arguments like max_tokens always win over engine_kwargs. 
engine_kwargs["max_tokens"] = max_tokens or 8192 + if temperature is not None: + engine_kwargs["temperature"] = temperature model_provider = model.split("/")[0] @@ -344,7 +466,6 @@ def make_model(model: str, max_tokens: int | None = 8192, **engine_kwargs): model=model_name, **engine_kwargs, ) - if model_provider == "OpenRouter": # Special case we need to override API url and key return ChatOpenAI( @@ -353,15 +474,15 @@ def make_model(model: str, max_tokens: int | None = 8192, **engine_kwargs): model=model_name, **engine_kwargs, ) + elif model_provider == "LlamaCpp": + engine_kwargs["model_path"] = model_name + engine_kwargs.setdefault("n_ctx", 0) + return ChatLlamaCppModel(**engine_kwargs) else: model_classes = [ - LlamaCpp, ChatOpenAI, ] - if model_provider == "LlamaCpp": - engine_kwargs["model_path"] = model_name - else: - engine_kwargs["model"] = model_name + engine_kwargs["model"] = model_name try: from langchain_together.llms import Together @@ -383,6 +504,8 @@ def make_model(model: str, max_tokens: int | None = 8192, **engine_kwargs): def download_all(): + from openjury.instruction_dataset.mt_bench import download_mt_bench + print(f"Downloading all dataset in {data_root}") for dataset in ["alpaca-eval", "arena-hard", "m-arena-hard"]: local_path_tables = data_root / "tables" @@ -396,6 +519,8 @@ def download_all(): force_download=False, ) + download_mt_bench() + class Timeblock: """Timer context manager""" diff --git a/tests/test_generate_and_evaluate.py b/tests/test_generate_and_evaluate.py index 7fa07f2..b9e4ef6 100644 --- a/tests/test_generate_and_evaluate.py +++ b/tests/test_generate_and_evaluate.py @@ -2,6 +2,7 @@ import pytest import openjury.generate_and_evaluate as generate_and_evaluate +import openjury.mt_bench.pipeline as mt_bench_pipeline from openjury.generate_and_evaluate import ( main as main_generate_and_eval, CliArgs, @@ -10,26 +11,56 @@ @pytest.fixture(autouse=True) def mock_external_data_and_cache(monkeypatch): - instructions = pd.DataFrame( + 
single_turn_instructions = pd.DataFrame( { "instruction": [f"Synthetic instruction {i}" for i in range(20)], }, index=pd.Index(range(20), name="instruction_index"), ) + # Mix of general and NEED_REF_CATS categories to exercise both code paths. + categories = ["writing", "math", "reasoning", "coding", "roleplay", + "writing", "math", "reasoning", "coding", "roleplay", + "writing", "math", "reasoning", "coding", "roleplay", + "writing", "math", "reasoning", "coding", "roleplay"] + ref_turn_1 = [ + f"Reference answer turn 1 for q{i}" if cat in ("math", "reasoning", "coding") else None + for i, cat in enumerate(categories) + ] + ref_turn_2 = [ + f"Reference answer turn 2 for q{i}" if cat in ("math", "reasoning", "coding") else None + for i, cat in enumerate(categories) + ] + mt_bench_questions = pd.DataFrame( + { + "category": categories, + "turn_1": [f"Synthetic MT-Bench turn 1 question {i}" for i in range(20)], + "turn_2": [f"Synthetic MT-Bench turn 2 follow-up {i}" for i in range(20)], + "reference_turn_1": ref_turn_1, + "reference_turn_2": ref_turn_2, + }, + index=pd.Index(range(20), name="instruction_index"), + ) + mt_bench_questions["instruction"] = mt_bench_questions["turn_1"] + + def _load_instructions(dataset: str, n_instructions: int | None = None) -> pd.DataFrame: + df = mt_bench_questions if dataset == "mt-bench" else single_turn_instructions + return df.head(n_instructions) if n_instructions is not None else df + monkeypatch.setattr( generate_and_evaluate, "load_instructions", - lambda dataset, n_instructions=None: ( - instructions.head(n_instructions) - if n_instructions is not None - else instructions - ), + _load_instructions, + ) + monkeypatch.setattr( + mt_bench_pipeline, + "load_instructions", + _load_instructions, ) monkeypatch.setattr( generate_and_evaluate, "load_contexts", - lambda dataset: instructions.loc[:, "instruction"], + lambda dataset: single_turn_instructions.loc[:, "instruction"], ) monkeypatch.setattr( @@ -44,6 +75,9 @@ def 
def test_main_non_mt_bench_reuses_judge_turn(monkeypatch, tmp_path):
    """Non-MT-Bench runs must go through the shared ``_judge_turn`` helper."""
    # Keyword arguments of every call made to the stubbed judge helper.
    recorded_calls = []

    def _judge_turn_stub(**kwargs):
        recorded_calls.append(kwargs)
        annotations = [{"judge_completion": "score A: 0 score B: 10"}]
        annotations_reversed = []
        annotation_metadata = [{"instruction_index": 0}]
        reversed_annotation_metadata = []
        preferences = pd.Series([1.0])
        combined_metadata = [{"instruction_index": 0}]
        return (
            annotations,
            annotations_reversed,
            annotation_metadata,
            reversed_annotation_metadata,
            preferences,
            combined_metadata,
        )

    monkeypatch.setattr(generate_and_evaluate, "_judge_turn", _judge_turn_stub)

    cli_args = CliArgs(
        dataset="alpaca-eval",
        model_A="Dummy/no answer",
        model_B="Dummy/open is better than close isnt'it",
        judge_model="Dummy/score A: 0 score B: 10",
        n_instructions=1,
        result_folder=str(tmp_path),
    )
    prefs = main_generate_and_eval(cli_args)

    assert len(recorded_calls) == 1
    judge_kwargs = recorded_calls[-1]
    assert judge_kwargs["swap_mode"] == "fixed"
    assert judge_kwargs["metadata"] == [{"instruction_index": 0}]
    assert prefs.tolist() == [1.0]
pd.DataFrame( + { + "completion_turn_1": ["B1 math", "B1 writing"], + "completion_turn_2": ["B2 math", "B2 writing"], + }, + index=pd.Index([0, 1], name="instruction_index"), + ) + + turn_1_inputs, turn_2_inputs = generate_and_evaluate.format_mt_bench_for_evaluation( + questions=questions, + completions_A=completions_a, + completions_B=completions_b, + turns_mode="both", + truncate_input_chars=8192, + ) + ( + instructions_turn_1, + _completions_a_turn_1, + _completions_b_turn_1, + _metadata_turn_1, + ) = turn_1_inputs + ( + instructions_turn_2, + completions_a_turn_2, + completions_b_turn_2, + _metadata_turn_2, + ) = turn_2_inputs + + assert "Please focus on which assistant provides a better answer to the second user question." in instructions_turn_2[0] + assert "<|The Start of Reference Answer|>" in instructions_turn_2[0] + assert "Math reference turn 1" in instructions_turn_2[0] + assert "Math reference turn 2" in instructions_turn_2[0] + assert "<|The Start of Reference Answer|>" not in instructions_turn_2[1] + + assert "### User:\nMath question turn 1" in completions_a_turn_2[0] + assert "### Assistant:\nA1 math" in completions_a_turn_2[0] + assert "### User:\nMath question turn 2" in completions_a_turn_2[0] + assert "### Assistant:\nA2 math" in completions_a_turn_2[0] + + assert "### User:\nMath question turn 1" in completions_b_turn_2[0] + assert "### Assistant:\nB1 math" in completions_b_turn_2[0] + assert "### User:\nMath question turn 2" in completions_b_turn_2[0] + assert "### Assistant:\nB2 math" in completions_b_turn_2[0] + + assert instructions_turn_1[1] == "Writing question turn 1" + assert "[MT-Bench | Turn 1]" in instructions_turn_1[0] + + +def test_mt_bench_pairwise(tmp_path): + """Test MT-Bench pipeline through score-based parsing.""" + prefs = main_generate_and_eval( + CliArgs( + dataset="mt-bench", + model_A="Dummy/answer for turn 1 and turn 2", + model_B="Dummy/another answer", + judge_model="Dummy/score A: 10 score B: 0", + n_instructions=5, + 
def test_mt_bench_swap_mode(tmp_path):
    """With swap_mode="both", each match is judged in both orders and bias cancels."""
    n_questions = 3
    n_turns = 2
    n_orders = 2  # original order plus the swapped order

    cli_args = CliArgs(
        dataset="mt-bench",
        model_A="Dummy/answer A",
        model_B="Dummy/answer B",
        judge_model="Dummy/score A: 10 score B: 0",
        n_instructions=n_questions,
        swap_mode="both",
        result_folder=str(tmp_path),
    )
    prefs = main_generate_and_eval(cli_args)

    assert len(prefs) == n_questions * n_turns * n_orders
    # The dummy judge always favours position A, so swapping averages out to a tie.
    mean_preference = float(sum(prefs) / len(prefs))
    assert mean_preference == pytest.approx(0.5)
assert len(prefs) == 10 # two turns per question + assert all(p < 0.5 for p in prefs) + + +def test_mt_bench_fastchat_conservative_swap_mode(tmp_path): + """FastChat-compatible swap_mode='both' is conservative (tie if inconsistent).""" + prefs = main_generate_and_eval( + CliArgs( + dataset="mt-bench", + model_A="Dummy/answer A", + model_B="Dummy/answer B", + judge_model="Dummy/[[A]]", # position-A biased judge + n_instructions=3, + swap_mode="both", + mt_bench_compatibility="fastchat", + result_folder=str(tmp_path), + ) + ) + + # Conservative swap runs both orders, but returns one resolved verdict per match. + assert len(prefs) == 6 # 3 questions * 2 turns + assert all(p == pytest.approx(0.5) for p in prefs) \ No newline at end of file diff --git a/tests/test_mt_bench_downloads.py b/tests/test_mt_bench_downloads.py new file mode 100644 index 0000000..9058a3b --- /dev/null +++ b/tests/test_mt_bench_downloads.py @@ -0,0 +1,66 @@ +from pathlib import Path + +import openjury.instruction_dataset.mt_bench as mt_bench +import openjury.utils as utils + + +def test_download_mt_bench_skips_question_download_if_cached(tmp_path, monkeypatch): + question_path = tmp_path / "data" / "mt_bench" / "question.jsonl" + question_path.parent.mkdir(parents=True, exist_ok=True) + question_path.write_text('{"question_id": 1, "turns": ["Q1"]}\n') + + reference_path = tmp_path / "reference_answer" / "gpt-4.jsonl" + reference_path.parent.mkdir(parents=True, exist_ok=True) + reference_path.write_text('{"question_id": 1, "choices": [{"turns": ["A1"]}]}\n') + + calls = {"snapshot_download": 0} + + def _snapshot_download_stub(**_kwargs): + calls["snapshot_download"] += 1 + + monkeypatch.setattr(mt_bench, "snapshot_download", _snapshot_download_stub) + monkeypatch.setattr( + mt_bench, + "_download_gpt4_references", + lambda _local_dir: reference_path, + ) + + downloaded_question_path, downloaded_reference_path = mt_bench.download_mt_bench( + local_dir=tmp_path + ) + + assert 
downloaded_question_path == question_path + assert downloaded_reference_path == reference_path + assert calls["snapshot_download"] == 0 + + +def test_download_all_includes_mt_bench(tmp_path, monkeypatch): + hf_datasets = [] + calls = {"contexts": 0, "mt_bench": 0} + + monkeypatch.setattr(utils, "data_root", tmp_path) + monkeypatch.setattr( + utils, + "download_hf", + lambda name, local_path: hf_datasets.append((name, local_path)), + ) + + def _contexts_snapshot_stub(**_kwargs): + calls["contexts"] += 1 + + monkeypatch.setattr(utils, "snapshot_download", _contexts_snapshot_stub) + monkeypatch.setattr( + mt_bench, + "download_mt_bench", + lambda: calls.__setitem__("mt_bench", calls["mt_bench"] + 1), + ) + + utils.download_all() + + assert [name for name, _ in hf_datasets] == [ + "alpaca-eval", + "arena-hard", + "m-arena-hard", + ] + assert calls["contexts"] == 1 + assert calls["mt_bench"] == 1