Commits (36)
ba4220d
Add llamacpp dependency and update gitignore with generated directories
ErlisLushtaku Feb 14, 2026
d2a5a42
Add documentation for llamacpp in Readme
ErlisLushtaku Feb 14, 2026
a828adb
Document direnv usage for environment variables management
ErlisLushtaku Feb 15, 2026
0dcebf9
narrow down transformers dependency to fix version mismatch
ErlisLushtaku Feb 15, 2026
d60073b
Add max_model_len param for VLLM in order to prevent OOM errors
ErlisLushtaku Feb 15, 2026
38f63ee
Fix completion loading and EuroLLM-9B example
ErlisLushtaku Feb 15, 2026
6f5e0fc
Remove `direnv` documentation
ErlisLushtaku Feb 17, 2026
42ff2ae
Revert stylistic (formatting) changes and add more documentation for …
ErlisLushtaku Feb 17, 2026
8fcb032
Rename OPENJURY_EVAL_DATA to OPENJURY_DATA
ErlisLushtaku Feb 17, 2026
df958af
Merge main
ErlisLushtaku Feb 21, 2026
35856f2
Revert changes in gitignore
ErlisLushtaku Feb 21, 2026
6a11182
Handle models with max_position_embeddings when we pass max_model_len
ErlisLushtaku Feb 21, 2026
fecd3ed
Revert EuroLLM-9B-Instruct to EuroLLM-9B since there is a default cha…
ErlisLushtaku Feb 21, 2026
0b4eaec
fix tests
ErlisLushtaku Feb 22, 2026
29340b0
Change test github workflow to use uv instead of pip for a more robus…
ErlisLushtaku Feb 22, 2026
2c294f1
Move dev dependencies to dependency-group
ErlisLushtaku Feb 22, 2026
4be61bf
Revert comment removal
ErlisLushtaku Feb 22, 2026
51d2597
Add pre-commit hook
ErlisLushtaku Feb 22, 2026
8dee7b2
add project scripts and move slurmpilot to dev group
ErlisLushtaku Feb 23, 2026
fdc9410
fix LlamaCpp bug with ChatTemplate
ErlisLushtaku Mar 2, 2026
48c5373
Add MT-Bench multi-turn evaluation support
ErlisLushtaku Mar 2, 2026
648a9be
Merge branch 'main' into erlislushtaku/feat/add-mt-bench-support
ErlisLushtaku Mar 2, 2026
14f747e
fix result formatting
ErlisLushtaku Mar 2, 2026
e67ea79
remove double environment variable
ErlisLushtaku Mar 2, 2026
4089be8
remove accidental duplications
ErlisLushtaku Mar 2, 2026
03f5cce
Refactor
ErlisLushtaku Mar 4, 2026
8ffe3a6
Remove duplication between prompt templates
ErlisLushtaku Mar 4, 2026
b877f11
add temperature argument
ErlisLushtaku Mar 9, 2026
c2056b5
add option for making mt-bench consistent with the original one from …
ErlisLushtaku Mar 9, 2026
41cd15d
Merge branch 'main' into erlislushtaku/feat/add-mt-bench-support
ErlisLushtaku Mar 9, 2026
0ca66c5
remove redundant print statement
ErlisLushtaku Mar 10, 2026
a295305
move mt-bench logic from the entrypoint
ErlisLushtaku Mar 17, 2026
0fb9700
Remove stale unused entries for fastchat mode
ErlisLushtaku Mar 17, 2026
e5670ea
Merge origin/main into erlislushtaku/feat/add-mt-bench-support
ErlisLushtaku Mar 17, 2026
6dd78fd
Refactor mt-bench eval helpers into shared runtime module
ErlisLushtaku Mar 17, 2026
0094eea
move cli args and parsing to separate util to remove dependencies on …
ErlisLushtaku Mar 18, 2026
21 changes: 20 additions & 1 deletion README.md
@@ -22,7 +22,7 @@ Compared to other libraries, here is a breakdown of features:
| **Arena-Hard-Auto** | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
| **Lighteval** | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ |
| **Evalchemy** | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ |
| **OpenJury** | 🔜 | ✅ | ✅ | ✅ | ✅ | ✅ |
| **OpenJury** | | ✅ | ✅ | ✅ | ✅ | ✅ |
Collaborator: 💪
The table was compiled in Oct 2025; if some libraries have since implemented missing features, please open an issue
or send a PR and we will be happy to update it.
@@ -191,10 +191,29 @@ python openjury/generate_and_evaluate.py \

This override applies to all vLLM models in the run. For remote providers (OpenAI, Together, OpenRouter), the flag is ignored since they handle templates server-side.
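The ChatML fallback mentioned above can be illustrated with a plain-Python sketch. This is a hypothetical helper for illustration only, not OpenJury's actual implementation; it mirrors the message layout a ChatML chat template produces:

```python
def chatml_format(messages):
    """Render a list of {role, content} messages in ChatML style.

    Illustrative only: shows the structure a ChatML template yields,
    ending with an open assistant turn ready for generation.
    """
    parts = [
        f"<|im_start|>{m['role']}\n{m['content']}<|im_end|>\n" for m in messages
    ]
    parts.append("<|im_start|>assistant\n")
    return "".join(parts)


prompt = chatml_format(
    [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Hello!"},
    ]
)
```

A custom `--chat_template` replaces this fallback with the Jinja2 template string you pass in.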

### MT-Bench (Multi-Turn Evaluation)

MT-Bench evaluates multi-turn conversation ability using 80 two-turn questions across 8 categories
(writing, roleplay, reasoning, math, coding, extraction, STEM, humanities).
It uses category-dependent judge prompts and reference answers for math/reasoning/coding.
Questions are automatically downloaded from the [LMSYS MT-Bench HuggingFace space](https://huggingface.co/spaces/lmsys/mt-bench).

```bash
uv run python openjury/generate_and_evaluate.py \
--dataset mt-bench \
--model_A VLLM/Qwen/Qwen2.5-7B-Instruct \
--model_B OpenRouter/openai/gpt-4o \
--judge_model OpenRouter/deepseek/deepseek-chat-v3.1 \
--n_instructions 10
```

Results include per-category and per-turn win rate breakdowns. Use `--swap_mode both` to correct for judge position bias.
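The per-category and per-turn breakdowns amount to grouping judge verdicts by a key and counting wins. A minimal sketch, assuming a hypothetical record shape (OpenJury's actual result schema may differ):

```python
from collections import defaultdict

# Hypothetical judge records for illustration; the real schema may differ.
records = [
    {"category": "math", "turn": 1, "winner": "A"},
    {"category": "math", "turn": 2, "winner": "B"},
    {"category": "writing", "turn": 1, "winner": "A"},
    {"category": "writing", "turn": 2, "winner": "A"},
]


def win_rate(records, key, model="A"):
    """Fraction of comparisons won by `model`, grouped by `key`."""
    wins, totals = defaultdict(int), defaultdict(int)
    for r in records:
        totals[r[key]] += 1
        if r["winner"] == model:
            wins[r[key]] += 1
    return {k: wins[k] / totals[k] for k in totals}


per_category = win_rate(records, "category")  # {'math': 0.5, 'writing': 1.0}
per_turn = win_rate(records, "turn")          # {1: 1.0, 2: 0.5}
```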

## 📊 Supported Datasets

| Dataset | Description |
|-----------------------|------------------------------------------------------------------------------------------------|
| `mt-bench` | 80 multi-turn (2-turn) questions across 8 categories ([LMSYS MT-Bench](https://arxiv.org/abs/2306.05685)) |
| `alpaca-eval` | General instruction-following benchmark |
| `arena-hard` | More challenging evaluation suite |
| `m-arena-hard` | Translated version of Arena-Hard in 23 languages |
212 changes: 212 additions & 0 deletions openjury/config.py
@@ -0,0 +1,212 @@
"""CLI argument configuration for generation and evaluation entrypoints."""

import argparse
import json
from dataclasses import dataclass, field


@dataclass
class CliArgs:
    dataset: str
    model_A: str
    model_B: str
    judge_model: str

    n_instructions: int | None = None
    provide_explanation: bool = False
    swap_mode: str = "fixed"
    ignore_cache: bool = False
    use_tqdm: bool = False
    truncate_all_input_chars: int = 8192
    max_out_tokens_models: int = 32768
    max_out_tokens_judge: int = 32768
    max_model_len: int | None = None
    chat_template: str | None = None
    mt_bench_turns: str = "both"
    mt_bench_compatibility: str = "openjury"
    result_folder: str = "results"
    engine_kwargs: dict = field(default_factory=dict)

    def __post_init__(self):
        supported_modes = ["fixed", "both"]
        assert (
            self.swap_mode in supported_modes
        ), f"Only {supported_modes} modes are supported but got {self.swap_mode}."
        supported_mt_bench_modes = ["openjury", "fastchat"]
        assert (
            self.mt_bench_compatibility in supported_mt_bench_modes
        ), f"Only {supported_mt_bench_modes} are supported but got {self.mt_bench_compatibility}."

    @classmethod
    def parse_args(cls):
        parser = argparse.ArgumentParser(
            description="Generate completions and evaluate them with a judge",
        )
        parser.add_argument(
            "--dataset",
            required=True,
            help="The dataset to use. For instance `alpaca-eval`, `arena-hard`, `m-arena-hard-EU`, `mt-bench` for "
            "instruction tuning cases or `french-contexts`, `spanish-contexts` for base models.",
        )
        parser.add_argument(
            "--model_A",
            required=True,
            help="Name of the first LLM to compare, must be a valid choice for `generation_provider`",
        )
        parser.add_argument(
            "--model_B",
            required=True,
            help="Name of the second LLM to compare, must be a valid choice for `generation_provider`",
        )
        parser.add_argument(
            "--judge_model",
            required=True,
            help="Name of the judge LLM to use, for instance `Together/meta-llama/Meta-Llama-3-70B-Instruct-Turbo`, "
            "`VLLM/meta-llama/Meta-Llama-3-70B-Instruct-Turbo`, `LangChain/LocalPath` etc.",
        )
        parser.add_argument(
            "--n_instructions",
            type=int,
            required=False,
            help="If specified, evaluate only the first `n_instructions` instructions of the dataset.",
        )
        parser.add_argument(
            "--provide_explanation",
            action="store_true",
            help="If specified, the judge will provide an explanation before making a judgement. Does not necessarily "
            "improve the accuracy of the judge but enables some result interpretation.",
        )
        parser.add_argument(
            "--swap_mode",
            type=str,
            choices=["fixed", "both"],
            default="fixed",
            help="Model comparison order mode. 'fixed': always use model order A-B. 'both': correct for model order "
            "bias by evaluating each instruction twice, once as A-B and once as B-A, and average. This helps account "
            "for judge position bias. Default is 'fixed'.",
        )
        parser.add_argument(
            "--ignore_cache",
            action="store_true",
            help="If specified, ignore cache of previous completions.",
        )
        parser.add_argument(
            "--use_tqdm",
            action="store_true",
            help="If specified, use tqdm; does not work with all model providers, vLLM in particular.",
        )
        parser.add_argument(
            "--result_folder",
            type=str,
            required=False,
            default="results",
            help="The folder to save the results. Defaults to `results`. Evaluation results will be saved in"
            " `[result_folder]/[evaluation_name]`.",
        )
        parser.add_argument(
            "--truncate_all_input_chars",
            type=int,
            required=False,
            default=8192,
            help="Character-level truncation applied before tokenization: truncates each instruction "
            "before model A/B generation and truncates each completion before judge evaluation.",
        )
        parser.add_argument(
            "--max_out_tokens_models",
            type=int,
            required=False,
            default=32768,
            help=(
                "Generation token budget for each model A/B response. For VLLM, keep this <= "
                "--max_model_len (if provided)."
            ),
        )
        parser.add_argument(
            "--max_out_tokens_judge",
            type=int,
            required=False,
            default=32768,
            help=(
                "Generation token budget for the judge response (reasoning + scores). For "
                "VLLM, keep this <= --max_model_len (if provided)."
            ),
        )
        parser.add_argument(
            "--max_model_len",
            type=int,
            required=False,
            default=None,
            help=(
                "Optional total context window for VLLM models (prompt + generation). This is "
                "independent from --max_out_tokens_models/--max_out_tokens_judge, which only cap "
                "generated tokens. This is useful on smaller GPUs to avoid OOM."
            ),
        )
        parser.add_argument(
            "--chat_template",
            type=str,
            required=False,
            default=None,
            help="Jinja2 chat template string to use instead of the model's tokenizer template. "
            "If not provided, ChatML is used as fallback for models without a chat template.",
        )
        parser.add_argument(
            "--mt_bench_turns",
            type=str,
            choices=["both", "single", "multi"],
            default="both",
            help="Which MT-Bench turns to evaluate. 'single': only turn 1, "
            "'multi': only turn 2 (with full conversation context), "
            "'both' (default): evaluate both turns.",
        )
        parser.add_argument(
            "--mt_bench_compatibility",
            type=str,
            choices=["openjury", "fastchat"],
            default="openjury",
            help=(
                "MT-Bench evaluation/generation mode. "
                "'openjury' (default): OpenJury score_A/score_B prompt + softmax preference. "
                "'fastchat': use FastChat/MT-Bench pairwise prompts with [[A]]/[[B]]/[[C]] verdict parsing, "
                "conservative position-bias handling, judge temperature=0, and MT-Bench category temperatures."
            ),
        )
        parser.add_argument(
            "--engine_kwargs",
            type=str,
            required=False,
            default="{}",
            help=(
                "JSON dict of engine-specific kwargs forwarded to the underlying engine. "
                "Example for vLLM: '{\"tensor_parallel_size\": 2, \"gpu_memory_utilization\": 0.9}'."
            ),
        )
        args = parser.parse_args()

        try:
            engine_kwargs = (
                json.loads(args.engine_kwargs) if args.engine_kwargs else {}
            )
            if not isinstance(engine_kwargs, dict):
                raise ValueError("engine_kwargs must be a JSON object")
        except Exception as e:
            raise SystemExit(f"Failed to parse --engine_kwargs: {e}") from e

        return cls(
            dataset=args.dataset,
            model_A=args.model_A,
            model_B=args.model_B,
            judge_model=args.judge_model,
            n_instructions=args.n_instructions,
            provide_explanation=args.provide_explanation,
            swap_mode=args.swap_mode,
            ignore_cache=args.ignore_cache,
            use_tqdm=args.use_tqdm,
            truncate_all_input_chars=args.truncate_all_input_chars,
            max_out_tokens_models=args.max_out_tokens_models,
            max_out_tokens_judge=args.max_out_tokens_judge,
            max_model_len=args.max_model_len,
            chat_template=args.chat_template,
            mt_bench_turns=args.mt_bench_turns,
            mt_bench_compatibility=args.mt_bench_compatibility,
            result_folder=args.result_folder,
            engine_kwargs=engine_kwargs,
        )
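The `--engine_kwargs` parse-and-validate step in `CliArgs.parse_args` can be exercised in isolation. This sketch reproduces that logic as a standalone function (the function name is introduced here for illustration; it is not part of OpenJury's API):

```python
import json


def parse_engine_kwargs(raw: str) -> dict:
    """Parse a JSON dict from a CLI string, rejecting non-object payloads.

    Mirrors the validation in CliArgs.parse_args: empty string -> {},
    valid JSON object -> dict, anything else -> SystemExit.
    """
    try:
        kwargs = json.loads(raw) if raw else {}
        if not isinstance(kwargs, dict):
            raise ValueError("engine_kwargs must be a JSON object")
    except Exception as e:
        raise SystemExit(f"Failed to parse --engine_kwargs: {e}") from e
    return kwargs


ok = parse_engine_kwargs('{"tensor_parallel_size": 2}')  # {'tensor_parallel_size': 2}
```

Note that a JSON array such as `[1, 2]` parses successfully but is rejected by the `isinstance` check, so malformed input fails fast at startup rather than deep inside the engine.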