Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -114,15 +114,19 @@ curl http://127.0.0.1:8000/v1/chat/completions \

# Supported models
- `gpt-5`
- `gpt-5.1`
- `gpt-5-codex`
- `gpt-5.1-codex`
- `gpt-5.1-codex-max`
- `gpt-5.1-codex-mini`
- `codex-mini`

# Customisation / Configuration

### Thinking effort

- `--reasoning-effort` (choice of minimal,low,medium,high)<br>
GPT-5 has a configurable amount of "effort" it can put into thinking, which may cause it to take more time for a response to return, but may overall give a smarter answer. Applying this parameter after `serve` forces the server to use this reasoning effort by default, unless overridden by the API request with a different effort set. The default reasoning effort without setting this parameter is `medium`.
- `--reasoning-effort` (choice of minimal,low,medium,high,xhigh)<br>
GPT-5 has a configurable amount of "effort" it can put into thinking, which may cause it to take more time for a response to return, but may overall give a smarter answer. Applying this parameter after `serve` forces the server to use this reasoning effort by default, unless overridden by the API request with a different effort set. The default reasoning effort without setting this parameter is `medium`. The `gpt-5.1` family (including codex) supports `low`, `medium`, and `high` while `gpt-5.1-codex-max` adds `xhigh`; neither offers a `minimal` variant.

### Thinking summaries

Expand Down
6 changes: 3 additions & 3 deletions chatmock/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -311,7 +311,7 @@ def main() -> None:
)
p_serve.add_argument(
"--reasoning-effort",
choices=["minimal", "low", "medium", "high"],
choices=["minimal", "low", "medium", "high", "xhigh"],
default=os.getenv("CHATGPT_LOCAL_REASONING_EFFORT", "medium").lower(),
help="Reasoning effort level for Responses API (default: medium)",
)
Expand All @@ -335,8 +335,8 @@ def main() -> None:
action="store_true",
default=(os.getenv("CHATGPT_LOCAL_EXPOSE_REASONING_MODELS") or "").strip().lower() in ("1", "true", "yes", "on"),
help=(
"Expose gpt-5 reasoning effort variants (minimal|low|medium|high) as separate models from /v1/models. "
"This allows choosing effort via model selection in compatible UIs."
"Expose gpt-5 reasoning effort variants (minimal|low|medium|high|xhigh where supported) "
"as separate models from /v1/models. This allows choosing effort via model selection in compatible UIs."
),
)
p_serve.add_argument(
Expand Down
29 changes: 25 additions & 4 deletions chatmock/reasoning.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,34 @@
from __future__ import annotations

from typing import Any, Dict
from typing import Any, Dict, Set


# Every effort level any model may accept; used as the permissive fallback.
DEFAULT_REASONING_EFFORTS: Set[str] = {"minimal", "low", "medium", "high", "xhigh"}


def allowed_efforts_for_model(model: str | None) -> Set[str]:
    """Return the set of reasoning-effort levels supported by ``model``.

    The name is lowercased, surrounding whitespace is dropped, and anything
    after a ``:`` (an effort tag such as ``gpt-5.1:high``) is ignored before
    matching. Unknown or empty names fall back to the full default set.
    """
    name = (model or "").strip().lower()
    if not name:
        return DEFAULT_REASONING_EFFORTS
    # Keep only the model family, discarding any ":<effort>" suffix.
    family = name.partition(":")[0]
    if family.startswith("gpt-5.1-codex-max"):
        # codex-max is the only family that accepts "xhigh" but not "minimal".
        return {"low", "medium", "high", "xhigh"}
    if family.startswith("gpt-5.1"):
        # The rest of the gpt-5.1 family drops "minimal" and lacks "xhigh".
        return {"low", "medium", "high"}
    return DEFAULT_REASONING_EFFORTS


def build_reasoning_param(
base_effort: str = "medium", base_summary: str = "auto", overrides: Dict[str, Any] | None = None
base_effort: str = "medium",
base_summary: str = "auto",
overrides: Dict[str, Any] | None = None,
*,
allowed_efforts: Set[str] | None = None,
) -> Dict[str, Any]:
effort = (base_effort or "").strip().lower()
summary = (base_summary or "").strip().lower()

valid_efforts = {"minimal", "low", "medium", "high"}
valid_efforts = allowed_efforts or DEFAULT_REASONING_EFFORTS
valid_summaries = {"auto", "concise", "detailed", "none"}

if isinstance(overrides, dict):
Expand Down Expand Up @@ -80,7 +99,7 @@ def extract_reasoning_from_model_name(model: str | None) -> Dict[str, Any] | Non
s = model.strip().lower()
if not s:
return None
efforts = {"minimal", "low", "medium", "high"}
efforts = {"minimal", "low", "medium", "high", "xhigh"}

if ":" in s:
maybe = s.rsplit(":", 1)[-1].strip()
Expand All @@ -96,5 +115,7 @@ def extract_reasoning_from_model_name(model: str | None) -> Dict[str, Any] | Non
return {"effort": "medium"}
if s.endswith(sep + "high"):
return {"effort": "high"}
if s.endswith(sep + "xhigh"):
return {"effort": "xhigh"}

return None
37 changes: 31 additions & 6 deletions chatmock/routes_ollama.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,11 @@
from .config import BASE_INSTRUCTIONS, GPT5_CODEX_INSTRUCTIONS
from .limits import record_rate_limits_from_response
from .http import build_cors_headers
from .reasoning import build_reasoning_param, extract_reasoning_from_model_name
from .reasoning import (
allowed_efforts_for_model,
build_reasoning_param,
extract_reasoning_from_model_name,
)
from .transform import convert_ollama_messages, normalize_ollama_tools
from .upstream import normalize_model_name, start_upstream_request
from .utils import convert_chat_messages_to_responses_input, convert_tools_chat_to_responses
Expand Down Expand Up @@ -67,7 +71,7 @@ def ollama_version() -> Response:

def _instructions_for_model(model: str) -> str:
base = current_app.config.get("BASE_INSTRUCTIONS", BASE_INSTRUCTIONS)
if model == "gpt-5-codex" or model == "gpt-5.1-codex":
if model.startswith("gpt-5-codex") or model.startswith("gpt-5.1-codex"):
codex = current_app.config.get("GPT5_CODEX_INSTRUCTIONS") or GPT5_CODEX_INSTRUCTIONS
if isinstance(codex, str) and codex.strip():
return codex
Expand All @@ -89,7 +93,15 @@ def ollama_tags() -> Response:
if bool(current_app.config.get("VERBOSE")):
print("IN GET /api/tags")
expose_variants = bool(current_app.config.get("EXPOSE_REASONING_MODELS"))
model_ids = ["gpt-5", "gpt-5.1", "gpt-5-codex", "gpt-5.1-codex", "gpt-5.1-codex-mini", "codex-mini"]
model_ids = [
"gpt-5",
"gpt-5.1",
"gpt-5-codex",
"gpt-5.1-codex",
"gpt-5.1-codex-max",
"gpt-5.1-codex-mini",
"codex-mini",
]
if expose_variants:
model_ids.extend(
[
Expand All @@ -100,13 +112,16 @@ def ollama_tags() -> Response:
"gpt-5.1-high",
"gpt-5.1-medium",
"gpt-5.1-low",
"gpt-5.1-minimal",
"gpt-5-codex-high",
"gpt-5-codex-medium",
"gpt-5-codex-low",
"gpt-5.1-codex-high",
"gpt-5.1-codex-medium",
"gpt-5.1-codex-low",
"gpt-5.1-codex-max-xhigh",
"gpt-5.1-codex-max-high",
"gpt-5.1-codex-max-medium",
"gpt-5.1-codex-max-low",
]
)
models = []
Expand Down Expand Up @@ -275,7 +290,12 @@ def ollama_chat() -> Response:
tools=tools_responses,
tool_choice=tool_choice,
parallel_tool_calls=parallel_tool_calls,
reasoning_param=build_reasoning_param(reasoning_effort, reasoning_summary, model_reasoning),
reasoning_param=build_reasoning_param(
reasoning_effort,
reasoning_summary,
model_reasoning,
allowed_efforts=allowed_efforts_for_model(model),
),
)
if error_resp is not None:
if verbose:
Expand Down Expand Up @@ -310,7 +330,12 @@ def ollama_chat() -> Response:
tools=base_tools_only,
tool_choice=safe_choice,
parallel_tool_calls=parallel_tool_calls,
reasoning_param=build_reasoning_param(reasoning_effort, reasoning_summary, model_reasoning),
reasoning_param=build_reasoning_param(
reasoning_effort,
reasoning_summary,
model_reasoning,
allowed_efforts=allowed_efforts_for_model(model),
),
)
record_rate_limits_from_response(upstream2)
if err2 is None and upstream2 is not None and upstream2.status_code < 400:
Expand Down
26 changes: 21 additions & 5 deletions chatmock/routes_openai.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,12 @@
from .config import BASE_INSTRUCTIONS, GPT5_CODEX_INSTRUCTIONS
from .limits import record_rate_limits_from_response
from .http import build_cors_headers
from .reasoning import apply_reasoning_to_message, build_reasoning_param, extract_reasoning_from_model_name
from .reasoning import (
allowed_efforts_for_model,
apply_reasoning_to_message,
build_reasoning_param,
extract_reasoning_from_model_name,
)
from .upstream import normalize_model_name, start_upstream_request
from .utils import (
convert_chat_messages_to_responses_input,
Expand Down Expand Up @@ -54,7 +59,7 @@ def _gen():

def _instructions_for_model(model: str) -> str:
base = current_app.config.get("BASE_INSTRUCTIONS", BASE_INSTRUCTIONS)
if model == "gpt-5-codex" or model == "gpt-5.1-codex":
if model.startswith("gpt-5-codex") or model.startswith("gpt-5.1-codex"):
codex = current_app.config.get("GPT5_CODEX_INSTRUCTIONS") or GPT5_CODEX_INSTRUCTIONS
if isinstance(codex, str) and codex.strip():
return codex
Expand Down Expand Up @@ -166,7 +171,12 @@ def chat_completions() -> Response:

model_reasoning = extract_reasoning_from_model_name(requested_model)
reasoning_overrides = payload.get("reasoning") if isinstance(payload.get("reasoning"), dict) else model_reasoning
reasoning_param = build_reasoning_param(reasoning_effort, reasoning_summary, reasoning_overrides)
reasoning_param = build_reasoning_param(
reasoning_effort,
reasoning_summary,
reasoning_overrides,
allowed_efforts=allowed_efforts_for_model(model),
)

upstream, error_resp = start_upstream_request(
model,
Expand Down Expand Up @@ -396,7 +406,12 @@ def completions() -> Response:

model_reasoning = extract_reasoning_from_model_name(requested_model)
reasoning_overrides = payload.get("reasoning") if isinstance(payload.get("reasoning"), dict) else model_reasoning
reasoning_param = build_reasoning_param(reasoning_effort, reasoning_summary, reasoning_overrides)
reasoning_param = build_reasoning_param(
reasoning_effort,
reasoning_summary,
reasoning_overrides,
allowed_efforts=allowed_efforts_for_model(model),
)
upstream, error_resp = start_upstream_request(
model,
input_items,
Expand Down Expand Up @@ -518,9 +533,10 @@ def list_models() -> Response:
expose_variants = bool(current_app.config.get("EXPOSE_REASONING_MODELS"))
model_groups = [
("gpt-5", ["high", "medium", "low", "minimal"]),
("gpt-5.1", ["high", "medium", "low", "minimal"]),
("gpt-5.1", ["high", "medium", "low"]),
("gpt-5-codex", ["high", "medium", "low"]),
("gpt-5.1-codex", ["high", "medium", "low"]),
("gpt-5.1-codex-max", ["xhigh", "high", "medium", "low"]),
("gpt-5.1-codex-mini", []),
("codex-mini", []),
]
Expand Down
3 changes: 2 additions & 1 deletion chatmock/upstream.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ def normalize_model_name(name: str | None, debug_model: str | None = None) -> st
base = name.split(":", 1)[0].strip()
for sep in ("-", "_"):
lowered = base.lower()
for effort in ("minimal", "low", "medium", "high"):
for effort in ("minimal", "low", "medium", "high", "xhigh"):
suffix = f"{sep}{effort}"
if lowered.endswith(suffix):
base = base[: -len(suffix)]
Expand All @@ -46,6 +46,7 @@ def normalize_model_name(name: str | None, debug_model: str | None = None) -> st
"gpt-5-codex": "gpt-5-codex",
"gpt-5-codex-latest": "gpt-5-codex",
"gpt-5.1-codex": "gpt-5.1-codex",
"gpt-5.1-codex-max": "gpt-5.1-codex-max",
"codex": "codex-mini-latest",
"codex-mini": "codex-mini-latest",
"codex-mini-latest": "codex-mini-latest",
Expand Down