From 9211d21d9bba830702ff89c0b664032605ef31d1 Mon Sep 17 00:00:00 2001
From: Game_Time <108236317+RayBytes@users.noreply.github.com>
Date: Sat, 22 Nov 2025 12:15:02 +0500
Subject: [PATCH] Adjust gpt-5.1 reasoning support

---
 README.md                 |  8 ++++++--
 chatmock/cli.py           |  6 +++---
 chatmock/reasoning.py     | 29 +++++++++++++++++++++++++----
 chatmock/routes_ollama.py | 37 +++++++++++++++++++++++++++++++------
 chatmock/routes_openai.py | 26 +++++++++++++++++++++-----
 chatmock/upstream.py      |  3 ++-
 6 files changed, 88 insertions(+), 21 deletions(-)

diff --git a/README.md b/README.md
index 4595e63..960e850 100644
--- a/README.md
+++ b/README.md
@@ -114,15 +114,19 @@ curl http://127.0.0.1:8000/v1/chat/completions \
 
 # Supported models
 
 - `gpt-5`
+- `gpt-5.1`
 - `gpt-5-codex`
+- `gpt-5.1-codex`
+- `gpt-5.1-codex-max`
+- `gpt-5.1-codex-mini`
 - `codex-mini`
 
 # Customisation / Configuration
 
 ### Thinking effort
 
-- `--reasoning-effort` (choice of minimal,low,medium,high)
-GPT-5 has a configurable amount of "effort" it can put into thinking, which may cause it to take more time for a response to return, but may overall give a smarter answer. Applying this parameter after `serve` forces the server to use this reasoning effort by default, unless overrided by the API request with a different effort set. The default reasoning effort without setting this parameter is `medium`.
+- `--reasoning-effort` (choice of minimal,low,medium,high,xhigh)
+GPT-5 has a configurable amount of "effort" it can put into thinking, which may make a response take longer to return but can give a smarter answer overall. Applying this parameter after `serve` forces the server to use this reasoning effort by default, unless overridden by an API request that sets a different effort. The default reasoning effort without setting this parameter is `medium`. The `gpt-5.1` family (including codex) supports `low`, `medium`, and `high`, while `gpt-5.1-codex-max` adds `xhigh`; none of these models offers a `minimal` variant.
 
 ### Thinking summaries
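
A per-request override under the scheme above might look like this (a sketch, not part of the patch: it assumes the server is running on the README's default 127.0.0.1:8000 and that the requests package is installed; the "reasoning" dict is the payload override routes_openai.py already reads):

import requests

# Ask for xhigh on a model that supports it; this overrides the
# serve-time --reasoning-effort default for this request only.
resp = requests.post(
    "http://127.0.0.1:8000/v1/chat/completions",
    json={
        "model": "gpt-5.1-codex-max",
        "reasoning": {"effort": "xhigh"},
        "messages": [{"role": "user", "content": "hello"}],
    },
)
print(resp.json())
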
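
The gating helper above can be exercised directly (a sketch, not part of the patch; it only assumes the chatmock package from this repo is importable):

from chatmock.reasoning import allowed_efforts_for_model

# codex-max is the only gpt-5.1 model that accepts xhigh.
assert allowed_efforts_for_model("gpt-5.1-codex-max") == {"low", "medium", "high", "xhigh"}
# The rest of the gpt-5.1 family tops out at high and drops minimal.
assert allowed_efforts_for_model("gpt-5.1") == {"low", "medium", "high"}
# Any :tag is stripped before matching, so tagged names gate the same way.
assert "xhigh" in allowed_efforts_for_model("gpt-5.1-codex-max:xhigh")
# Models outside the gpt-5.1 family fall back to the permissive default set.
assert allowed_efforts_for_model("gpt-5") == {"minimal", "low", "medium", "high", "xhigh"}
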
diff --git a/chatmock/routes_ollama.py b/chatmock/routes_ollama.py
index 0be4f1c..1fd8699 100644
--- a/chatmock/routes_ollama.py
+++ b/chatmock/routes_ollama.py
@@ -10,7 +10,11 @@
 from .config import BASE_INSTRUCTIONS, GPT5_CODEX_INSTRUCTIONS
 from .limits import record_rate_limits_from_response
 from .http import build_cors_headers
-from .reasoning import build_reasoning_param, extract_reasoning_from_model_name
+from .reasoning import (
+    allowed_efforts_for_model,
+    build_reasoning_param,
+    extract_reasoning_from_model_name,
+)
 from .transform import convert_ollama_messages, normalize_ollama_tools
 from .upstream import normalize_model_name, start_upstream_request
 from .utils import convert_chat_messages_to_responses_input, convert_tools_chat_to_responses
@@ -67,7 +71,7 @@ def ollama_version() -> Response:
 
 def _instructions_for_model(model: str) -> str:
     base = current_app.config.get("BASE_INSTRUCTIONS", BASE_INSTRUCTIONS)
-    if model == "gpt-5-codex" or model == "gpt-5.1-codex":
+    if model.startswith("gpt-5-codex") or model.startswith("gpt-5.1-codex"):
         codex = current_app.config.get("GPT5_CODEX_INSTRUCTIONS") or GPT5_CODEX_INSTRUCTIONS
         if isinstance(codex, str) and codex.strip():
             return codex
@@ -89,7 +93,15 @@ def ollama_tags() -> Response:
     if bool(current_app.config.get("VERBOSE")):
         print("IN GET /api/tags")
     expose_variants = bool(current_app.config.get("EXPOSE_REASONING_MODELS"))
-    model_ids = ["gpt-5", "gpt-5.1", "gpt-5-codex", "gpt-5.1-codex", "gpt-5.1-codex-mini", "codex-mini"]
+    model_ids = [
+        "gpt-5",
+        "gpt-5.1",
+        "gpt-5-codex",
+        "gpt-5.1-codex",
+        "gpt-5.1-codex-max",
+        "gpt-5.1-codex-mini",
+        "codex-mini",
+    ]
     if expose_variants:
         model_ids.extend(
             [
@@ -100,13 +112,16 @@ def ollama_tags() -> Response:
                 "gpt-5.1-high",
                 "gpt-5.1-medium",
                 "gpt-5.1-low",
-                "gpt-5.1-minimal",
                 "gpt-5-codex-high",
                 "gpt-5-codex-medium",
                 "gpt-5-codex-low",
                 "gpt-5.1-codex-high",
                 "gpt-5.1-codex-medium",
                 "gpt-5.1-codex-low",
+                "gpt-5.1-codex-max-xhigh",
+                "gpt-5.1-codex-max-high",
+                "gpt-5.1-codex-max-medium",
+                "gpt-5.1-codex-max-low",
             ]
         )
     models = []
@@ -275,7 +290,12 @@ def ollama_chat() -> Response:
         tools=tools_responses,
         tool_choice=tool_choice,
         parallel_tool_calls=parallel_tool_calls,
-        reasoning_param=build_reasoning_param(reasoning_effort, reasoning_summary, model_reasoning),
+        reasoning_param=build_reasoning_param(
+            reasoning_effort,
+            reasoning_summary,
+            model_reasoning,
+            allowed_efforts=allowed_efforts_for_model(model),
+        ),
     )
     if error_resp is not None:
         if verbose:
@@ -310,7 +330,12 @@ def ollama_chat() -> Response:
         tools=base_tools_only,
         tool_choice=safe_choice,
         parallel_tool_calls=parallel_tool_calls,
-        reasoning_param=build_reasoning_param(reasoning_effort, reasoning_summary, model_reasoning),
+        reasoning_param=build_reasoning_param(
+            reasoning_effort,
+            reasoning_summary,
+            model_reasoning,
+            allowed_efforts=allowed_efforts_for_model(model),
+        ),
     )
     record_rate_limits_from_response(upstream2)
     if err2 is None and upstream2 is not None and upstream2.status_code < 400:
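
With --expose-reasoning-models set, the tags endpoint now advertises the codex-max effort variants alongside the base ids (a sketch, not part of the patch; it assumes the response follows the usual Ollama {"models": [{"name": ...}]} shape):

import requests

# Should list entries such as gpt-5.1-codex-max-xhigh when variants are exposed.
tags = requests.get("http://127.0.0.1:8000/api/tags").json()
print([m.get("name") for m in tags.get("models", [])])
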
diff --git a/chatmock/routes_openai.py b/chatmock/routes_openai.py
index 049b595..5d97bed 100644
--- a/chatmock/routes_openai.py
+++ b/chatmock/routes_openai.py
@@ -9,7 +9,12 @@
 from .config import BASE_INSTRUCTIONS, GPT5_CODEX_INSTRUCTIONS
 from .limits import record_rate_limits_from_response
 from .http import build_cors_headers
-from .reasoning import apply_reasoning_to_message, build_reasoning_param, extract_reasoning_from_model_name
+from .reasoning import (
+    allowed_efforts_for_model,
+    apply_reasoning_to_message,
+    build_reasoning_param,
+    extract_reasoning_from_model_name,
+)
 from .upstream import normalize_model_name, start_upstream_request
 from .utils import (
     convert_chat_messages_to_responses_input,
@@ -54,7 +59,7 @@ def _gen():
 
 def _instructions_for_model(model: str) -> str:
     base = current_app.config.get("BASE_INSTRUCTIONS", BASE_INSTRUCTIONS)
-    if model == "gpt-5-codex" or model == "gpt-5.1-codex":
+    if model.startswith("gpt-5-codex") or model.startswith("gpt-5.1-codex"):
         codex = current_app.config.get("GPT5_CODEX_INSTRUCTIONS") or GPT5_CODEX_INSTRUCTIONS
         if isinstance(codex, str) and codex.strip():
             return codex
@@ -166,7 +171,12 @@ def chat_completions() -> Response:
     model_reasoning = extract_reasoning_from_model_name(requested_model)
     reasoning_overrides = payload.get("reasoning") if isinstance(payload.get("reasoning"), dict) else model_reasoning
-    reasoning_param = build_reasoning_param(reasoning_effort, reasoning_summary, reasoning_overrides)
+    reasoning_param = build_reasoning_param(
+        reasoning_effort,
+        reasoning_summary,
+        reasoning_overrides,
+        allowed_efforts=allowed_efforts_for_model(model),
+    )
 
     upstream, error_resp = start_upstream_request(
         model,
@@ -396,7 +406,12 @@ def completions() -> Response:
     model_reasoning = extract_reasoning_from_model_name(requested_model)
     reasoning_overrides = payload.get("reasoning") if isinstance(payload.get("reasoning"), dict) else model_reasoning
-    reasoning_param = build_reasoning_param(reasoning_effort, reasoning_summary, reasoning_overrides)
+    reasoning_param = build_reasoning_param(
+        reasoning_effort,
+        reasoning_summary,
+        reasoning_overrides,
+        allowed_efforts=allowed_efforts_for_model(model),
+    )
     upstream, error_resp = start_upstream_request(
         model,
         input_items,
@@ -518,9 +533,10 @@ def list_models() -> Response:
     expose_variants = bool(current_app.config.get("EXPOSE_REASONING_MODELS"))
     model_groups = [
         ("gpt-5", ["high", "medium", "low", "minimal"]),
-        ("gpt-5.1", ["high", "medium", "low", "minimal"]),
+        ("gpt-5.1", ["high", "medium", "low"]),
         ("gpt-5-codex", ["high", "medium", "low"]),
         ("gpt-5.1-codex", ["high", "medium", "low"]),
+        ("gpt-5.1-codex-max", ["xhigh", "high", "medium", "low"]),
         ("gpt-5.1-codex-mini", []),
         ("codex-mini", []),
     ]
diff --git a/chatmock/upstream.py b/chatmock/upstream.py
index 704cb8c..8f377a0 100644
--- a/chatmock/upstream.py
+++ b/chatmock/upstream.py
@@ -32,7 +32,7 @@ def normalize_model_name(name: str | None, debug_model: str | None = None) -> st
     base = name.split(":", 1)[0].strip()
     for sep in ("-", "_"):
         lowered = base.lower()
-        for effort in ("minimal", "low", "medium", "high"):
+        for effort in ("minimal", "low", "medium", "high", "xhigh"):
             suffix = f"{sep}{effort}"
             if lowered.endswith(suffix):
                 base = base[: -len(suffix)]
@@ -46,6 +46,7 @@ def normalize_model_name(name: str | None, debug_model: str | None = None) -> st
         "gpt-5-codex": "gpt-5-codex",
         "gpt-5-codex-latest": "gpt-5-codex",
         "gpt-5.1-codex": "gpt-5.1-codex",
+        "gpt-5.1-codex-max": "gpt-5.1-codex-max",
         "codex": "codex-mini-latest",
         "codex-mini": "codex-mini-latest",
         "codex-mini-latest": "codex-mini-latest",
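
Taken together, effort-suffixed model ids round-trip cleanly through the two helpers touched above (a sketch, not part of the patch; it only assumes the chatmock package is importable):

from chatmock.reasoning import extract_reasoning_from_model_name
from chatmock.upstream import normalize_model_name

# The -xhigh suffix is stripped before the upstream mapping is applied...
assert normalize_model_name("gpt-5.1-codex-max-xhigh") == "gpt-5.1-codex-max"
# ...and the same suffix is recovered as the per-request reasoning override.
assert extract_reasoning_from_model_name("gpt-5.1-codex-max-xhigh") == {"effort": "xhigh"}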