From a475b78f67c4e524b819b06b5b2f90b297e6388d Mon Sep 17 00:00:00 2001 From: Trevor Morris Date: Thu, 14 May 2026 17:04:16 -0700 Subject: [PATCH] bench: fix v5 tokenizer fix when --model is an HF Hub repo id _fix_tokenizer_for_sglang resolved tokenizer.json via Path(model_path), which only works for local model directories. For HF Hub repo ids (e.g. "nvidia/DeepSeek-R1-0528-FP4-V2") the path lookup silently no-ops and a bare `except Exception: pass` hid the failure. Combined with the transformers 5.6.0 AutoTokenizer dispatch change that now returns a broken LlamaTokenizer for ByteLevel-BPE models, this produced a ~5x client/server tokenizer mismatch and a false throughput regression (DeepSeek-R1 FP4 on B200: 1970 -> 425 tok/s). Fall back to huggingface_hub.hf_hub_download for repo ids, surface resolution failures as warnings, and add an info log when the fix actually rewires pre_tokenizer/decoder so client/server alignment is visible. Co-Authored-By: Claude Opus 4.7 (1M context) --- utils/bench_serving/backend_request_func.py | 92 ++++++++++++--------- 1 file changed, 55 insertions(+), 37 deletions(-) diff --git a/utils/bench_serving/backend_request_func.py b/utils/bench_serving/backend_request_func.py index 7f4a93284..4c8820f8d 100644 --- a/utils/bench_serving/backend_request_func.py +++ b/utils/bench_serving/backend_request_func.py @@ -466,46 +466,64 @@ def _fix_tokenizer_for_sglang(tokenizer, model_path): import json from pathlib import Path + def _resolve(filename): + """Return a filesystem path for `filename`, whether `model_path` is a + local directory or an HF Hub repo id. Returns None and logs a warning + on failure so we don't silently fail to apply the v5 fix.""" + local = Path(model_path) / filename + if local.is_file(): + return str(local) + try: + from huggingface_hub import hf_hub_download + return hf_hub_download(repo_id=model_path, filename=filename) + except Exception as e: + print( + f"v5 tokenizer fix: cannot resolve {filename} for {model_path!r} " + f"({type(e).__name__}: {e}); fix will not apply.", + flush=True, + ) + return None + backend = getattr(tokenizer, "_tokenizer", None) if backend is not None: - try: + tok_file = _resolve("tokenizer.json") + if tok_file is not None: from tokenizers import Tokenizer as RawTokenizer - tok_file = Path(model_path) / "tokenizer.json" - if tok_file.is_file(): - raw = RawTokenizer.from_file(str(tok_file)) - raw_pre = type(raw.pre_tokenizer).__name__ if raw.pre_tokenizer else None - loaded_pre = type(backend.pre_tokenizer).__name__ if backend.pre_tokenizer else None - if raw_pre and loaded_pre and raw_pre != loaded_pre: - backend.pre_tokenizer = raw.pre_tokenizer - backend.decoder = raw.decoder - except Exception: - pass - - try: - config_file = Path(model_path) / "tokenizer_config.json" - if config_file.is_file(): - with open(config_file) as f: - config = json.load(f) - tok_class = config.get("tokenizer_class", "") - bos_eos_classes = { - "LlamaTokenizer", "LlamaTokenizerFast", - "CodeLlamaTokenizer", "CodeLlamaTokenizerFast", - "GemmaTokenizer", "GemmaTokenizerFast", "CohereTokenizerFast", - } - if tok_class in bos_eos_classes: - defaults = {"add_bos_token": True, "add_eos_token": False} - changed = False - for attr in ("add_bos_token", "add_eos_token"): - val = config.get(attr) - if val is None: - val = defaults.get(attr, False) - if getattr(tokenizer, attr, None) != val: - setattr(tokenizer, f"_{attr}", val) - changed = True - if changed and hasattr(tokenizer, "update_post_processor"): - tokenizer.update_post_processor() - except Exception: - pass + raw = RawTokenizer.from_file(tok_file) + raw_pre = type(raw.pre_tokenizer).__name__ if raw.pre_tokenizer else None + loaded_pre = type(backend.pre_tokenizer).__name__ if backend.pre_tokenizer else None + if raw_pre and loaded_pre and raw_pre != loaded_pre: + print( + f"v5 tokenizer fix: {model_path} pre_tokenizer {loaded_pre} -> {raw_pre}, " + f"decoder {type(backend.decoder).__name__ if backend.decoder else None} -> " + f"{type(raw.decoder).__name__ if raw.decoder else None}", + flush=True, + ) + backend.pre_tokenizer = raw.pre_tokenizer + backend.decoder = raw.decoder + + config_file = _resolve("tokenizer_config.json") + if config_file is not None: + with open(config_file) as f: + config = json.load(f) + tok_class = config.get("tokenizer_class", "") + bos_eos_classes = { + "LlamaTokenizer", "LlamaTokenizerFast", + "CodeLlamaTokenizer", "CodeLlamaTokenizerFast", + "GemmaTokenizer", "GemmaTokenizerFast", "CohereTokenizerFast", + } + if tok_class in bos_eos_classes: + defaults = {"add_bos_token": True, "add_eos_token": False} + changed = False + for attr in ("add_bos_token", "add_eos_token"): + val = config.get(attr) + if val is None: + val = defaults.get(attr, False) + if getattr(tokenizer, attr, None) != val: + setattr(tokenizer, f"_{attr}", val) + changed = True + if changed and hasattr(tokenizer, "update_post_processor"): + tokenizer.update_post_processor() return tokenizer