diff --git a/js/owrap.ai.js b/js/owrap.ai.js index 804e0f4b2..e922f509c 100644 --- a/js/owrap.ai.js +++ b/js/owrap.ai.js @@ -207,6 +207,13 @@ OpenWrap.ai.prototype.__gpttypes = { if (isDef(aResponse.usage.prompt_tokens)) tokens.prompt = aResponse.usage.prompt_tokens if (isDef(aResponse.usage.completion_tokens)) tokens.completion = aResponse.usage.completion_tokens if (isDef(aResponse.usage.total_tokens)) tokens.total = aResponse.usage.total_tokens + if (isMap(aResponse.usage.prompt_tokens_details)) { + if (isDef(aResponse.usage.prompt_tokens_details.cached_tokens)) tokens.cached = aResponse.usage.prompt_tokens_details.cached_tokens + if (isDef(aResponse.usage.prompt_tokens_details.audio_tokens)) tokens.audio = aResponse.usage.prompt_tokens_details.audio_tokens + } + if (isMap(aResponse.usage.completion_tokens_details)) { + if (isDef(aResponse.usage.completion_tokens_details.reasoning_tokens)) tokens.reasoning = aResponse.usage.completion_tokens_details.reasoning_tokens + } if (Object.keys(tokens).length > 0) stats.tokens = tokens stats.usage = aResponse.usage } @@ -2258,6 +2265,7 @@ OpenWrap.ai.prototype.__gpttypes = { aOptions.temperature = _$(aOptions.temperature, "aOptions.temperature").isNumber().default(0.7) aOptions.url = _$(aOptions.url, "aOptions.url").isString().default("https://api.anthropic.com/") aOptions.headers = _$(aOptions.headers, "aOptions.headers").isMap().default({}) + aOptions.promptCaching = _$(aOptions.promptCaching, "aOptions.promptCaching").isBoolean().default(false) // If noSystem=true it will not output the system messages aOptions.noSystem = _$(aOptions.noSystem, "aOptions.noSystem").isBoolean().default(true) @@ -2266,6 +2274,7 @@ OpenWrap.ai.prototype.__gpttypes = { var _model = aOptions.model var _temperature = aOptions.temperature var _noSystem = aOptions.noSystem + var _promptCaching = aOptions.promptCaching var _lastStats = __ var _debugCh = __ var _resetStats = () => { _lastStats = __ } @@ -2288,6 +2297,8 @@ 
// Tells whether aBlock is a message content block that may receive the
// prompt-cache marker. Non-map values and "tool_result" blocks are never
// marked. Blocks whose text is the empty string are skipped as well:
// Anthropic's prompt-caching documentation lists empty text blocks as not
// cacheable, so the marker has to land on an earlier block instead.
var _isCacheableBlock = aBlock => {
   if (!isMap(aBlock)) return false
   if (aBlock.type === "tool_result") return false
   if (isString(aBlock.text) && aBlock.text.length === 0) return false
   return true
}

// Adds a single { cache_control: { type: "ephemeral" } } marker to the most
// recent user message of aMessages that has cacheable content, and returns
// aMessages.
//
// - Does nothing (returns aMessages untouched) when _promptCaching is off,
//   when aMessages is not a non-empty array, or when no user message has
//   cacheable content.
// - String content is rewritten as a one-element array holding a text block.
// - Map content is wrapped in a one-element array with the marker merged in.
// - Array content gets the marker merged into its last cacheable block.
//
// NOTE: the selected message is modified in place. The callers visible in this
// file clone the message list before calling when prompt caching is enabled,
// so the stored conversation is not affected.
var _applyPromptCacheControlToMessages = aMessages => {
   if (!_promptCaching || !isArray(aMessages) || aMessages.length === 0) return aMessages

   // Walk backwards so the first match is the most recent user message
   for (var ii = aMessages.length - 1; ii >= 0; ii--) {
      var _msg = aMessages[ii]
      if (!isMap(_msg) || _msg.role !== "user") continue

      var _content = _msg.content
      if (isString(_content)) {
         // An empty prompt cannot carry a marker; keep looking further back
         if (_content.length === 0) continue
         _msg.content = [{ type: "text", text: _content, cache_control: { type: "ephemeral" } }]
         return aMessages
      }
      if (isMap(_content)) {
         if (!_isCacheableBlock(_content)) continue
         _msg.content = [ merge(_content, { cache_control: { type: "ephemeral" } }) ]
         return aMessages
      }
      if (isArray(_content)) {
         // Last cacheable block of this message, if any
         for (var jj = _content.length - 1; jj >= 0; jj--) {
            if (_isCacheableBlock(_content[jj])) {
               _content[jj] = merge(_content[jj], { cache_control: { type: "ephemeral" } })
               return aMessages
            }
         }
      }
      // Any other content shape (or no cacheable block): try an earlier message
   }

   return aMessages
}

// Builds the value for the request body "system" field.
//
// _systemText   : the already joined system prompt text (may be empty)
// _isJsonPrompt : when true the "output json" steering instruction is appended
//
// Returns undefined (__) when there is no text to send, a one-element array
// with a text block carrying the cache marker when _promptCaching is on, or
// the plain string otherwise.
var _buildSystemField = (_systemText, _isJsonPrompt) => {
   var _txt = _systemText
   if (_isJsonPrompt) {
      // Keep compatibility with the existing Anthropic JSON steering instruction.
      _txt = (isString(_txt) && _txt.length > 0 ? _txt + "\n\n" : "") + "output json"
   }
   if (!isString(_txt) || _txt.length === 0) return __
   if (_promptCaching) {
      return [{ type: "text", text: _txt, cache_control: { type: "ephemeral" } }]
   }
   return _txt
}
msgs.filter(m => m.role != "system") : msgs.slice()); + if (_promptCaching) bodyMessages = clone(bodyMessages) + bodyMessages = _applyPromptCacheControlToMessages(bodyMessages) _r.conversation = msgs; @@ -2707,10 +2781,9 @@ OpenWrap.ai.prototype.__gpttypes = { }) .filter(s => isString(s) && s.length > 0) .join("\n") - if (_systemText.length > 0) body.system = _systemText - } - if (aJsonFlag) { - body.system = (isString(body.system) && body.system.length > 0 ? body.system + "\n\n" : "") + "output json" + body.system = _buildSystemField(_systemText, aJsonFlag) + } else if (aJsonFlag) { + body.system = _buildSystemField("", true) } body = merge(body, aOptions.params) @@ -2925,14 +2998,16 @@ OpenWrap.ai.prototype.__gpttypes = { aVerb = _$(aVerb, "aVerb").isString().default("POST") var _h = new ow.obj.http(__, __, __, __, __, __, __, { timeout: _timeout }) + var _reqHeaders = merge(aOptions.headers, { + "x-api-key" : Packages.openaf.AFCmdBase.afc.dIP(_key), + "anthropic-version": "2023-06-01", + Accept : "*/*" + }) + if (_promptCaching) _reqHeaders["anthropic-beta"] = "prompt-caching-2024-07-31" var __m = { conTimeout : 60000, httpClient : _h, - requestHeaders: merge(aOptions.headers, { - "x-api-key" : Packages.openaf.AFCmdBase.afc.dIP(_key), - "anthropic-version": "2023-06-01", - Accept : "*/*" - }) + requestHeaders: _reqHeaders } _h.close() @@ -2961,14 +3036,16 @@ OpenWrap.ai.prototype.__gpttypes = { aVerb = _$(aVerb, "aVerb").isString().default("POST") var _h = new ow.obj.http(__, __, __, __, __, __, __, { timeout: _timeout }) + var _reqHeaders = merge(aOptions.headers, { + "x-api-key" : Packages.openaf.AFCmdBase.afc.dIP(_key), + "anthropic-version": "2023-06-01", + Accept : "text/event-stream" + }) + if (_promptCaching) _reqHeaders["anthropic-beta"] = "prompt-caching-2024-07-31" var __m = { conTimeout : 60000, httpClient : _h, - requestHeaders: merge(aOptions.headers, { - "x-api-key" : Packages.openaf.AFCmdBase.afc.dIP(_key), - "anthropic-version": "2023-06-01", - 
Accept : "text/event-stream" - }) + requestHeaders: _reqHeaders } _h.close() @@ -3121,6 +3198,7 @@ OpenWrap.ai.prototype.agent = function(aOptions) { * - params: extra request body parameters merged into prompt, image and embedding calls.\ * - noSystem: when true, system messages are converted to developer messages where supported (defaults to true).\ * - noResponseFormat: when true, disables OpenAI-compatible JSON response_format injection.\ + * - promptCaching: when true enables Anthropic prompt caching headers and cache_control markers (defaults to false).\ * \ * OpenAI-compatible transport options:\ * - apiVersion: API version/path segment for OpenAI-compatible routes (defaults to "v1"). In Azure legacy mode this becomes the api-version query parameter. In Foundry mode, "v1" uses the /openai/v1 path; dated versions use the /models route with api-version.\ @@ -3134,6 +3212,8 @@ OpenWrap.ai.prototype.agent = function(aOptions) { * - Azure OpenAI legacy: new ow.ai.gpt("openai", { key: "...", url: "https://RESOURCE.openai.azure.com", mode: "azure-openai-legacy", deployment: "DEPLOYMENT", apiVersion: "2024-10-21" })\ * - Azure AI Foundry v1: new ow.ai.gpt("openai", { key: "...", url: "https://RESOURCE.services.ai.azure.com", mode: "foundry", model: "DEPLOYMENT" })\ * - Azure AI Foundry dated API: new ow.ai.gpt("openai", { key: "...", url: "https://RESOURCE.services.ai.azure.com/models", mode: "foundry", apiVersion: "2024-05-01-preview", model: "DEPLOYMENT" })\ + * \ + * Usage stats note: `getLastStats()` now includes OpenAI cached prompt tokens (`tokens.cached`) when reported by compatible models.\ * */ OpenWrap.ai.prototype.gpt = function(aType, aOptions) { @@ -3229,7 +3309,8 @@ OpenWrap.ai.prototype.gpt.prototype.getEmbeddings = function(aInput, aDimensions /** * * ow.ai.gpt.getLastStats() : Map - * Returns the latest usage statistics reported by the underlying GPT model for the most recent prompt request. 
+ * Returns the latest usage statistics reported by the underlying GPT model for the most recent prompt request + * (including provider-specific fields such as OpenAI cached prompt tokens and Anthropic cache read/creation tokens when available). * */ OpenWrap.ai.prototype.gpt.prototype.getLastStats = function() { @@ -3555,7 +3636,8 @@ OpenWrap.ai.prototype.gpt.prototype.addUserPrompt = function(aPrompt) { /** * * ow.ai.gpt.addSystemPrompt(aPrompt) : ow.ai.gpt - * Adds aPrompt (a string or an array of strings) with aRole (defaults to "user") to the current conversation. + * Adds aPrompt (a string or an array of strings) as a system prompt to the current conversation. + * For Gemini, large system instructions may be implicitly cached by the provider. For Anthropic, explicit cache markers are sent when `promptCaching` is enabled. * */ OpenWrap.ai.prototype.gpt.prototype.addSystemPrompt = function(aPrompt) { @@ -3793,6 +3875,7 @@ OpenWrap.ai.prototype.gpt.prototype.codePrompt = function(aPrompt, aModel, aTemp * - instructions: a string or an array of strings with the instructions for the model (e.g. "json", "boolean", "sql", "js", "path")\ * - headers: a map with the headers to use in the requests (e.g. { "Content-Type": "application/json" })\ * - params: a map with the parameters to use in the requests (e.g. { "max_tokens": 1000, "top_p": 1, "frequency_penalty": 0, "presence_penalty": 0 })\ + * - promptCaching: when true enables Anthropic prompt caching headers and cache_control markers (defaults to false)\ * \ * For type "openai", options can also include:\ * - mode: transport mode ("openai", "azure-openai-v1", "azure-openai-legacy" or "foundry").\ @@ -3810,7 +3893,7 @@ OpenWrap.ai.prototype.gpt.prototype.codePrompt = function(aPrompt, aModel, aTemp * If aModel is not provided, it will try to get the model from the environment variable "OAF_MODEL" with the map in JSON or SLON format. 
* \ * The returned object also exposes helper methods to inspect vendor usage information: `getLastStats`/`lastStats` (map with the latest statistics), `promptWithStats`, - * `promptJSONWithStats` and `rawPromptWithStats` (returning `{ response, stats }`). + * `promptJSONWithStats` and `rawPromptWithStats` (returning `{ response, stats }`). OpenAI compatible models can expose cached prompt tokens via `stats.tokens.cached`. * */ global.$gpt = function(aModel) { diff --git a/tests/autoTestAll.AI.js b/tests/autoTestAll.AI.js index 0773e1ad5..5e593b20b 100644 --- a/tests/autoTestAll.AI.js +++ b/tests/autoTestAll.AI.js @@ -458,6 +458,111 @@ ow.test.assert("" + foundryPreviewTransport.headers["api-key"], "test-key", "Problem setting Foundry preview api-key header."); }; + exports.testAIOpenAIStatsCaptureIncludesCachedAndReasoning = function() { + ow.loadAI(); + + var g = new ow.ai.gpt("openai", { key: "test-key", model: "gpt-4o" }); + g.model._request = function(url, body) { + return { + model: body.model, + choices: [ + { + finish_reason: "stop", + message: { role: "assistant", content: "ok" } + } + ], + usage: { + prompt_tokens: 120, + completion_tokens: 30, + total_tokens: 150, + prompt_tokens_details: { + cached_tokens: 90, + audio_tokens: 5 + }, + completion_tokens_details: { + reasoning_tokens: 11 + } + } + }; + }; + + g.rawPrompt("hello", "gpt-4o", 0.1, false, []); + var stats = g.getLastStats(); + ow.test.assert(stats.tokens.cached, 90, "Problem capturing OpenAI cached prompt tokens."); + ow.test.assert(stats.tokens.audio, 5, "Problem capturing OpenAI audio prompt tokens."); + ow.test.assert(stats.tokens.reasoning, 11, "Problem capturing OpenAI reasoning completion tokens."); + }; + + exports.testAIAnthropicPromptCachingHeaders = function() { + ow.loadAI(); + + var _origRest = $rest; + var captured = []; + $rest = function(cfg) { + captured.push(__cloneForTest(cfg)); + return { + get2Stream: function() { return {}; }, + post2Stream: function() { return {}; } + }; 
+ }; + + try { + var gCache = new ow.ai.gpt("anthropic", { key: "test-key", promptCaching: true }); + gCache.model._request("v1/messages", {}); + ow.test.assert(captured[0].requestHeaders["anthropic-beta"], "prompt-caching-2024-07-31", "Problem enabling Anthropic prompt caching beta header on request."); + + captured = []; + gCache.model._requestStream("v1/messages", {}); + ow.test.assert(captured[0].requestHeaders["anthropic-beta"], "prompt-caching-2024-07-31", "Problem enabling Anthropic prompt caching beta header on stream request."); + + captured = []; + var gNoCache = new ow.ai.gpt("anthropic", { key: "test-key", promptCaching: false }); + gNoCache.model._request("v1/messages", {}); + ow.test.assert(isUnDef(captured[0].requestHeaders["anthropic-beta"]), true, "Problem keeping Anthropic prompt caching beta header disabled by default."); + } finally { + $rest = _origRest; + } + }; + + exports.testAIAnthropicPromptCachingBodyAndStats = function() { + ow.loadAI(); + + var g = new ow.ai.gpt("anthropic", { key: "test-key", model: "claude-test", promptCaching: true }); + var requests = []; + g.model._request = function(url, body) { + requests.push(__cloneForTest(body)); + return { + id: "msg-1", + model: "claude-test", + type: "message", + stop_reason: "end_turn", + content: [{ type: "text", text: "ok" }], + usage: { + input_tokens: 20, + output_tokens: 10, + cache_creation_input_tokens: 300, + cache_read_input_tokens: 220 + } + }; + }; + + g.addSystemPrompt("You are concise."); + g.rawPrompt("Hello world", "claude-test", 0.2, false, []); + + var body = requests[0]; + ow.test.assert(isArray(body.system), true, "Problem converting Anthropic system prompt into content blocks when prompt caching is enabled."); + ow.test.assert(body.system[0].cache_control.type, "ephemeral", "Problem setting Anthropic system prompt cache_control marker."); + + var lastMessage = body.messages[body.messages.length - 1]; + var lastContentBlock = 
lastMessage.content[lastMessage.content.length - 1]; + ow.test.assert(lastMessage.role, "user", "Problem preserving Anthropic last user message role when applying cache_control."); + ow.test.assert(lastContentBlock.cache_control.type, "ephemeral", "Problem setting Anthropic cache_control marker on last user message."); + + var stats = g.getLastStats(); + ow.test.assert(stats.tokens.cacheCreation, 300, "Problem capturing Anthropic cache creation tokens."); + ow.test.assert(stats.tokens.cacheRead, 220, "Problem capturing Anthropic cache read tokens."); + }; + exports.testAIOpenAIToolRecursionNoDuplication = function() { ow.loadAI(); diff --git a/tests/autoTestAll.AI.yaml b/tests/autoTestAll.AI.yaml index 07a4c4cbf..f7e2ec2ba 100644 --- a/tests/autoTestAll.AI.yaml +++ b/tests/autoTestAll.AI.yaml @@ -85,6 +85,21 @@ jobs: to : oJob Test exec: args.func = args.tests.testAIOpenAITransportModes; + - name: AI::OpenAI stats capture includes cached and reasoning + from: AI::Init + to : oJob Test + exec: args.func = args.tests.testAIOpenAIStatsCaptureIncludesCachedAndReasoning; + + - name: AI::Anthropic prompt caching headers + from: AI::Init + to : oJob Test + exec: args.func = args.tests.testAIAnthropicPromptCachingHeaders; + + - name: AI::Anthropic prompt caching body and stats + from: AI::Init + to : oJob Test + exec: args.func = args.tests.testAIAnthropicPromptCachingBodyAndStats; + - name: AI::OpenAI tool recursion without duplication from: AI::Init to : oJob Test @@ -128,6 +143,9 @@ todo: - AI::Test KMeans - AI::GPT prompt argument routing - AI::OpenAI transport modes + - AI::OpenAI stats capture includes cached and reasoning + - AI::Anthropic prompt caching headers + - AI::Anthropic prompt caching body and stats - AI::OpenAI tool recursion without duplication - AI::OpenAI streaming tool recursion without duplication - AI::OpenAI export conversation falls back to tool name