diff --git a/.changeset/multimodal-tool-results.md b/.changeset/multimodal-tool-results.md new file mode 100644 index 000000000..7ec5e49d7 --- /dev/null +++ b/.changeset/multimodal-tool-results.md @@ -0,0 +1,15 @@ +--- +'@tanstack/ai': minor +'@tanstack/ai-client': minor +'@tanstack/openai-base': minor +'@tanstack/ai-anthropic': minor +'@tanstack/ai-gemini': minor +--- + +feat: support multimodal (image) tool results + +Tools may now return an `Array` (e.g. a text part plus an image part) and have it transmitted to the model as structured multimodal tool output instead of a `JSON.stringify`'d blob. This unblocks use cases like returning a screenshot from a tool so the model can see it (issue #363). + +- Detection is structural and opt-in by shape: a tool that returns a non-empty array whose every element is a valid `ContentPart` is passed through unchanged; strings and all other return values are serialized exactly as before, so there are no breaking changes. +- The OpenAI Responses, Anthropic, and Google Gemini adapters convert the content parts into their native multimodal tool-output formats (`function_call_output.output`, `tool_result` content blocks, and `functionResponse.parts` respectively). Providers on the Chat Completions path (Groq, Ollama, Grok, OpenRouter chat) fall back to stringifying, which their APIs require. +- AG-UI stream events (`TOOL_CALL_RESULT.content`, `TOOL_CALL_END.result`) remain string-only per the spec; the multimodal array travels on the tool message itself. diff --git a/examples/ts-react-chat/public/repro-secret.png b/examples/ts-react-chat/public/repro-secret.png new file mode 100644 index 000000000..388d4fdef Binary files /dev/null and b/examples/ts-react-chat/public/repro-secret.png differ diff --git a/examples/ts-react-chat/scripts/make-repro-image.mjs b/examples/ts-react-chat/scripts/make-repro-image.mjs new file mode 100644 index 000000000..1d363cf22 --- /dev/null +++ b/examples/ts-react-chat/scripts/make-repro-image.mjs @@ -0,0 +1,123 @@ +// One-off generator for the issue #363 repro image. +// Renders a fixed 3-digit secret number as black pixels on white. +// The number is impossible to guess (1/1000) and appears nowhere in any text +// the tool returns, so a model that genuinely *sees* the image can read it and +// a model that only received stringified JSON cannot. Writes the PNG to +// public/ and prints its base64 for embedding in the tool. +import { deflateSync } from 'node:zlib' +import { writeFileSync } from 'node:fs' +import { fileURLToPath } from 'node:url' +import { dirname, join } from 'node:path' + +// Keep this in sync with REPRO_SECRET in src/lib/image-tool-repro.ts +const SECRET = '473' +const CELL = 16 // px per font cell +const MARGIN = 24 +const GAP = CELL // gap between digits + +// 3x5 bitmap font for digits 0-9. +const FONT = { + 0: ['111', '101', '101', '101', '111'], + 1: ['010', '110', '010', '010', '111'], + 2: ['111', '001', '111', '100', '111'], + 3: ['111', '001', '111', '001', '111'], + 4: ['101', '101', '111', '001', '001'], + 5: ['111', '100', '111', '001', '111'], + 6: ['111', '100', '111', '101', '111'], + 7: ['111', '001', '010', '010', '010'], + 8: ['111', '101', '111', '101', '111'], + 9: ['111', '101', '111', '001', '111'], +} + +const digits = [...SECRET] +const digitW = 3 * CELL +const digitH = 5 * CELL +const width = MARGIN * 2 + digits.length * digitW + (digits.length - 1) * GAP +const height = MARGIN * 2 + digitH + +// White background, black digit pixels. +const px = new Uint8Array(width * height * 3).fill(255) +function setBlack(x, y) { + const i = (y * width + x) * 3 + px[i] = 0 + px[i + 1] = 0 + px[i + 2] = 0 +} + +digits.forEach((d, di) => { + const glyph = FONT[d] + const originX = MARGIN + di * (digitW + GAP) + for (let gy = 0; gy < 5; gy++) { + for (let gx = 0; gx < 3; gx++) { + if (glyph[gy][gx] !== '1') continue + for (let cy = 0; cy < CELL; cy++) { + for (let cx = 0; cx < CELL; cx++) { + setBlack(originX + gx * CELL + cx, MARGIN + gy * CELL + cy) + } + } + } + } +}) + +// Pack into PNG scanlines (filter byte 0 per row). +const raw = Buffer.alloc((width * 3 + 1) * height) +let o = 0 +for (let y = 0; y < height; y++) { + raw[o++] = 0 + for (let x = 0; x < width; x++) { + const i = (y * width + x) * 3 + raw[o++] = px[i] + raw[o++] = px[i + 1] + raw[o++] = px[i + 2] + } +} + +// CRC32 (PNG chunks require it). +const crcTable = (() => { + const t = new Uint32Array(256) + for (let n = 0; n < 256; n++) { + let c = n + for (let k = 0; k < 8; k++) c = c & 1 ? 0xedb88320 ^ (c >>> 1) : c >>> 1 + t[n] = c >>> 0 + } + return t +})() +function crc32(buf) { + let c = 0xffffffff + for (let i = 0; i < buf.length; i++) + c = crcTable[(c ^ buf[i]) & 0xff] ^ (c >>> 8) + return (c ^ 0xffffffff) >>> 0 +} + +function chunk(type, data) { + const typeBuf = Buffer.from(type, 'ascii') + const len = Buffer.alloc(4) + len.writeUInt32BE(data.length, 0) + const crc = Buffer.alloc(4) + crc.writeUInt32BE(crc32(Buffer.concat([typeBuf, data])), 0) + return Buffer.concat([len, typeBuf, data, crc]) +} + +const ihdr = Buffer.alloc(13) +ihdr.writeUInt32BE(width, 0) +ihdr.writeUInt32BE(height, 4) +ihdr[8] = 8 // bit depth +ihdr[9] = 2 // colour type: truecolour RGB +ihdr[10] = 0 +ihdr[11] = 0 +ihdr[12] = 0 + +const png = Buffer.concat([ + Buffer.from([0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a]), + chunk('IHDR', ihdr), + chunk('IDAT', deflateSync(raw)), + chunk('IEND', Buffer.alloc(0)), +]) + +const here = dirname(fileURLToPath(import.meta.url)) +const outPath = join(here, '..', 'public', 'repro-secret.png') +writeFileSync(outPath, png) +console.log('Wrote', outPath, `(${png.length} bytes, secret=${SECRET})`) +console.log('BASE64_START') +console.log(png.toString('base64')) +console.log('BASE64_END') diff --git a/examples/ts-react-chat/src/lib/image-tool-repro.ts b/examples/ts-react-chat/src/lib/image-tool-repro.ts new file mode 100644 index 000000000..e29f14f7d --- /dev/null +++ b/examples/ts-react-chat/src/lib/image-tool-repro.ts @@ -0,0 +1,54 @@ +import { toolDefinition } from '@tanstack/ai' +import { z } from 'zod' +import type { ContentPart } from '@tanstack/ai' + +/** + * Repro for https://github.com/TanStack/ai/issues/363 + * + * `getReproImage` returns a multimodal tool result — an array of `ContentPart` + * (a text part + an image part). With multimodal tool results supported, the + * OpenAI Responses adapter sends the image as a structured `input_image` in + * `function_call_output.output`, so the model can actually see it. Before the + * fix every tool result was `JSON.stringify`'d and the model never received the + * image. + * + * The image is a PNG showing a fixed 3-digit secret number ({@link REPRO_SECRET}) + * rendered as black pixels on white. The number appears nowhere in any text the + * tool returns, so a model that can genuinely see the image can read it back and + * a model that only received stringified JSON cannot — a clean pass/fail signal + * that a blind model cannot fake (1-in-1000 to guess). + * + * Generated by `scripts/make-repro-image.mjs` (also written to + * `public/repro-secret.png` for human comparison). Keep REPRO_SECRET in sync + * with the SECRET constant in that script. + */ +export const REPRO_SECRET = '473' + +const REPRO_IMAGE_BASE64 = + 'iVBORw0KGgoAAAANSUhEUgAAAOAAAACACAIAAACdu/LsAAAEIElEQVR4nO2SgWkAMRDDfv+l2wlSaJA5JbEG8FmHv59SxHzTBUr5iw60qOlAi5oOtKjpQIuaDrSo6UCLmg60qOlAi5oOtKjpQIuaDrSo6UCLmg60qOlAi5oOtKjpQIuaDrSo6UCLmg60qOlAi5oOtKjpQIsabKDfPzk9P81r/ZdeWFBYwJaf5rX+Sy8sKCxgy0/zWv+lFxYUFrDlp3mt/9ILCwoL2PLTvNZ/6YUFhQVs+Wle67/0woLCArb8NK/1X3phQWEBW36a1/ovvbCgsIAtP81r/ZdeWFBYwJaf5rX+Sy8sKCxgy0/zWv+lFxYUFrDlp3mt/9ILCwoL2PLTvNZ/6YUFhQVs+Wle67/0woLCArb8NK/1X3phQWEBW36a1/ovvbCgsMDUgyjaf/MuFhQWmHoQRftv3sWCwgJTD6Jo/827WFBYYOpBFO2/eRcLCgtMPYii/TfvYkFhgakHUbT/5l0sKCww9SCK9t+8iwWFBaYeRNH+m3exoLDA1IMo2n/zLhYUFph6EEX7b97FgsICUw+iaP/Nu1hQWGDqQRTtv3kXCwoLTD2Iov0372JBYYGpB1G0/+ZdLCgsMPUgivbfvIsFhQWmHkTR/pt3sSAZlBfla+uTBvPCgmRQXpSvrU8azAsLkkF5Ub62PmkwLyxIBuVF+dr6pMG8sCAZlBfla+uTBvPCgmRQXpSvrU8azAsLkkF5Ub62PmkwLyxIBuVF+dr6pMG8sCAZlBfla+uTBvPCgmRQXpSvrU8azAsLkkF5Ub62PmkwLyxIBuVF+dr6pMG8sCAZlBfla+uTBvPCgmRQXpSvrU8azAsLkkF5Ub62PmkwLyxIBuVF+dr6pMG8qKA0Uw86pU+aKd9jHmcbhK1PminfYx5nG4StT5op32MeZxuErU+aKd9jHmcbhK1PminfYx5nG4StT5op32MeZxuErU+aKd9jHmcbhK1PminfYx5nG4StT5op32MeZxuErU+aKd9jHmcbhK1PminfYx5nG4StT5op32MeZxuErU+aKd9jHmcbhK1PminfYx5nG4StT5op32MeZxuErU+aKd9jHmcbxOl90mBeVFCaqQfd2icN5kUFpZl60K190mBeVFCaqQfd2icN5kUFpZl60K190mBeVFCaqQfd2icN5kUFpZl60K190mBeVFCaqQfd2icN5kUFpZl60K190mBeVFCaqQfd2icN5kUFpZl60K190mBeVFCaqQfd2icN5kUFpZl60K190mBeVFCaqQfd2icN5kUFpZl60K190mBeVFCaqQfd2icN5kUFlZKgAy1qOtCipgMtajrQoqYDLWo60KKmAy1qOtCipgMtajrQoqYDLWo60KKmAy1qOtCipgMtajrQoqYDLWo60KKmAy1qOtCipgMtajrQoqYDLWp+ARIFD+N3ElIHAAAAAElFTkSuQmCC' + +export const getReproImageToolDef = toolDefinition({ + name: 'getReproImage', + description: + 'Returns an image for the user to inspect. Call this whenever the user asks you to look at, view, or describe the image.', + inputSchema: z.object({}), +}) + +// Server implementation: returns a multimodal content-part array, NOT a string. +export const getReproImage = getReproImageToolDef.server( + (): Array => [ + { + type: 'text', + content: + 'Here is the image you asked to inspect. Read whatever it shows.', + }, + { + type: 'image', + source: { + type: 'data', + value: REPRO_IMAGE_BASE64, + mimeType: 'image/png', + }, + }, + ], +) diff --git a/examples/ts-react-chat/src/routeTree.gen.ts b/examples/ts-react-chat/src/routeTree.gen.ts index 9dd642c9d..59ad81fc8 100644 --- a/examples/ts-react-chat/src/routeTree.gen.ts +++ b/examples/ts-react-chat/src/routeTree.gen.ts @@ -12,6 +12,7 @@ import { Route as rootRouteImport } from './routes/__root' import { Route as ServerFnChatRouteImport } from './routes/server-fn-chat' import { Route as RealtimeRouteImport } from './routes/realtime' import { Route as Issue176ToolResultRouteImport } from './routes/issue-176-tool-result' +import { Route as ImageToolReproRouteImport } from './routes/image-tool-repro' import { Route as ImageGenRouteImport } from './routes/image-gen' import { Route as GenerationHooksRouteImport } from './routes/generation-hooks' import { Route as IndexRouteImport } from './routes/index' @@ -29,6 +30,7 @@ import { Route as ApiTanchatRouteImport } from './routes/api.tanchat' import { Route as ApiSummarizeRouteImport } from './routes/api.summarize' import { Route as ApiStructuredOutputRouteImport } from './routes/api.structured-output' import { Route as ApiStructuredChatRouteImport } from './routes/api.structured-chat' +import { Route as ApiImageToolReproRouteImport } from './routes/api.image-tool-repro' import { Route as ApiImageGenRouteImport } from './routes/api.image-gen' import { Route as ExampleGuitarsIndexRouteImport } from './routes/example.guitars/index' import { Route as ExampleGuitarsGuitarIdRouteImport } from './routes/example.guitars/$guitarId' @@ -52,6 +54,11 @@ const Issue176ToolResultRoute = Issue176ToolResultRouteImport.update({ path: '/issue-176-tool-result', getParentRoute: () => rootRouteImport, } as any) +const ImageToolReproRoute = ImageToolReproRouteImport.update({ + id: '/image-tool-repro', + path: '/image-tool-repro', + getParentRoute: () => rootRouteImport, +} as any) const ImageGenRoute = ImageGenRouteImport.update({ id: '/image-gen', path: '/image-gen', @@ -140,6 +147,11 @@ const ApiStructuredChatRoute = ApiStructuredChatRouteImport.update({ path: '/api/structured-chat', getParentRoute: () => rootRouteImport, } as any) +const ApiImageToolReproRoute = ApiImageToolReproRouteImport.update({ + id: '/api/image-tool-repro', + path: '/api/image-tool-repro', + getParentRoute: () => rootRouteImport, +} as any) const ApiImageGenRoute = ApiImageGenRouteImport.update({ id: '/api/image-gen', path: '/api/image-gen', @@ -180,10 +192,12 @@ export interface FileRoutesByFullPath { '/': typeof IndexRoute '/generation-hooks': typeof GenerationHooksRoute '/image-gen': typeof ImageGenRoute + '/image-tool-repro': typeof ImageToolReproRoute '/issue-176-tool-result': typeof Issue176ToolResultRoute '/realtime': typeof RealtimeRoute '/server-fn-chat': typeof ServerFnChatRoute '/api/image-gen': typeof ApiImageGenRoute + '/api/image-tool-repro': typeof ApiImageToolReproRoute '/api/structured-chat': typeof ApiStructuredChatRoute '/api/structured-output': typeof ApiStructuredOutputRoute '/api/summarize': typeof ApiSummarizeRoute @@ -209,10 +223,12 @@ export interface FileRoutesByTo { '/': typeof IndexRoute '/generation-hooks': typeof GenerationHooksRoute '/image-gen': typeof ImageGenRoute + '/image-tool-repro': typeof ImageToolReproRoute '/issue-176-tool-result': typeof Issue176ToolResultRoute '/realtime': typeof RealtimeRoute '/server-fn-chat': typeof ServerFnChatRoute '/api/image-gen': typeof ApiImageGenRoute + '/api/image-tool-repro': typeof ApiImageToolReproRoute '/api/structured-chat': typeof ApiStructuredChatRoute '/api/structured-output': typeof ApiStructuredOutputRoute '/api/summarize': typeof ApiSummarizeRoute @@ -239,10 +255,12 @@ export interface FileRoutesById { '/': typeof IndexRoute '/generation-hooks': typeof GenerationHooksRoute '/image-gen': typeof ImageGenRoute + '/image-tool-repro': typeof ImageToolReproRoute '/issue-176-tool-result': typeof Issue176ToolResultRoute '/realtime': typeof RealtimeRoute '/server-fn-chat': typeof ServerFnChatRoute '/api/image-gen': typeof ApiImageGenRoute + '/api/image-tool-repro': typeof ApiImageToolReproRoute '/api/structured-chat': typeof ApiStructuredChatRoute '/api/structured-output': typeof ApiStructuredOutputRoute '/api/summarize': typeof ApiSummarizeRoute @@ -270,10 +288,12 @@ export interface FileRouteTypes { | '/' | '/generation-hooks' | '/image-gen' + | '/image-tool-repro' | '/issue-176-tool-result' | '/realtime' | '/server-fn-chat' | '/api/image-gen' + | '/api/image-tool-repro' | '/api/structured-chat' | '/api/structured-output' | '/api/summarize' @@ -299,10 +319,12 @@ export interface FileRouteTypes { | '/' | '/generation-hooks' | '/image-gen' + | '/image-tool-repro' | '/issue-176-tool-result' | '/realtime' | '/server-fn-chat' | '/api/image-gen' + | '/api/image-tool-repro' | '/api/structured-chat' | '/api/structured-output' | '/api/summarize' @@ -328,10 +350,12 @@ export interface FileRouteTypes { | '/' | '/generation-hooks' | '/image-gen' + | '/image-tool-repro' | '/issue-176-tool-result' | '/realtime' | '/server-fn-chat' | '/api/image-gen' + | '/api/image-tool-repro' | '/api/structured-chat' | '/api/structured-output' | '/api/summarize' @@ -358,10 +382,12 @@ export interface RootRouteChildren { IndexRoute: typeof IndexRoute GenerationHooksRoute: typeof GenerationHooksRoute ImageGenRoute: typeof ImageGenRoute + ImageToolReproRoute: typeof ImageToolReproRoute Issue176ToolResultRoute: typeof Issue176ToolResultRoute RealtimeRoute: typeof RealtimeRoute ServerFnChatRoute: typeof ServerFnChatRoute ApiImageGenRoute: typeof ApiImageGenRoute + ApiImageToolReproRoute: typeof ApiImageToolReproRoute ApiStructuredChatRoute: typeof ApiStructuredChatRoute ApiStructuredOutputRoute: typeof ApiStructuredOutputRoute ApiSummarizeRoute: typeof ApiSummarizeRoute @@ -407,6 +433,13 @@ declare module '@tanstack/react-router' { preLoaderRoute: typeof Issue176ToolResultRouteImport parentRoute: typeof rootRouteImport } + '/image-tool-repro': { + id: '/image-tool-repro' + path: '/image-tool-repro' + fullPath: '/image-tool-repro' + preLoaderRoute: typeof ImageToolReproRouteImport + parentRoute: typeof rootRouteImport + } '/image-gen': { id: '/image-gen' path: '/image-gen' @@ -526,6 +559,13 @@ declare module '@tanstack/react-router' { preLoaderRoute: typeof ApiStructuredChatRouteImport parentRoute: typeof rootRouteImport } + '/api/image-tool-repro': { + id: '/api/image-tool-repro' + path: '/api/image-tool-repro' + fullPath: '/api/image-tool-repro' + preLoaderRoute: typeof ApiImageToolReproRouteImport + parentRoute: typeof rootRouteImport + } '/api/image-gen': { id: '/api/image-gen' path: '/api/image-gen' @@ -582,10 +622,12 @@ const rootRouteChildren: RootRouteChildren = { IndexRoute: IndexRoute, GenerationHooksRoute: GenerationHooksRoute, ImageGenRoute: ImageGenRoute, + ImageToolReproRoute: ImageToolReproRoute, Issue176ToolResultRoute: Issue176ToolResultRoute, RealtimeRoute: RealtimeRoute, ServerFnChatRoute: ServerFnChatRoute, ApiImageGenRoute: ApiImageGenRoute, + ApiImageToolReproRoute: ApiImageToolReproRoute, ApiStructuredChatRoute: ApiStructuredChatRoute, ApiStructuredOutputRoute: ApiStructuredOutputRoute, ApiSummarizeRoute: ApiSummarizeRoute, diff --git a/examples/ts-react-chat/src/routes/api.image-tool-repro.ts b/examples/ts-react-chat/src/routes/api.image-tool-repro.ts new file mode 100644 index 000000000..9ce08cdc5 --- /dev/null +++ b/examples/ts-react-chat/src/routes/api.image-tool-repro.ts @@ -0,0 +1,72 @@ +import { createFileRoute } from '@tanstack/react-router' +import { + chat, + chatParamsFromRequestBody, + maxIterations, + toServerSentEventsResponse, +} from '@tanstack/ai' +import { openaiText } from '@tanstack/ai-openai' +import { getReproImage } from '@/lib/image-tool-repro' + +/** + * Repro endpoint for https://github.com/TanStack/ai/issues/363 + * + * Uses a single server tool (`getReproImage`) that returns a multimodal + * content-part array. The system prompt forces the model to declare whether it + * could actually SEE the returned image, giving a crisp pass/fail signal. + */ +const SYSTEM_PROMPT = `You are a vision QA assistant. You have one tool: getReproImage(), which returns an image. + +When the user asks you to inspect the image, follow these steps exactly: +1. Call the getReproImage tool. +2. Look at what the tool returned. +3. If you can actually SEE an image, begin your reply with "VISIBLE:" followed by the exact number printed in the image. +4. If the tool result was only text, JSON, or base64 data that you cannot view as an image, begin your reply with "NOT-VISIBLE:" and explain that you only received data you could not view. + +Never guess the number. Only report a number you can actually read in an image. If you cannot see an image, you must say NOT-VISIBLE.` + +export const Route = createFileRoute('/api/image-tool-repro')({ + server: { + handlers: { + POST: async ({ request }) => { + if (request.signal.aborted) { + return new Response(null, { status: 499 }) + } + + const abortController = new AbortController() + + let params + try { + params = await chatParamsFromRequestBody(await request.json()) + } catch (error) { + return new Response( + error instanceof Error ? error.message : 'Bad request', + { status: 400 }, + ) + } + + try { + const stream = chat({ + adapter: openaiText('gpt-4o'), + tools: [getReproImage], + systemPrompts: [SYSTEM_PROMPT], + agentLoopStrategy: maxIterations(5), + messages: params.messages, + threadId: params.threadId, + runId: params.runId, + abortController, + }) + return toServerSentEventsResponse(stream, { abortController }) + } catch (error: any) { + if (error.name === 'AbortError' || abortController.signal.aborted) { + return new Response(null, { status: 499 }) + } + return new Response( + JSON.stringify({ error: error.message || 'An error occurred' }), + { status: 500, headers: { 'Content-Type': 'application/json' } }, + ) + } + }, + }, + }, +}) diff --git a/examples/ts-react-chat/src/routes/image-tool-repro.tsx b/examples/ts-react-chat/src/routes/image-tool-repro.tsx new file mode 100644 index 000000000..c2484930c --- /dev/null +++ b/examples/ts-react-chat/src/routes/image-tool-repro.tsx @@ -0,0 +1,118 @@ +import { createFileRoute } from '@tanstack/react-router' +import { fetchServerSentEvents, useChat } from '@tanstack/ai-react' +import { REPRO_SECRET } from '@/lib/image-tool-repro' +import type { UIMessage } from '@tanstack/ai-react' + +/** + * Repro UI for https://github.com/TanStack/ai/issues/363 + * + * One click sends a fixed prompt that makes the model call `getReproImage` + * (a server tool returning a multimodal content-part array) and report the + * secret number printed in the returned image. With multimodal tool results + * supported, the image reaches the model and it reports the number; before the + * fix the model only received JSON and could not read it. + * + * The verdict keys off whether the model reports the ACTUAL secret number — not + * just a "VISIBLE" prefix — so a blind model cannot fake a pass by guessing. + */ +const REPRO_PROMPT = + 'Call the getReproImage tool and tell me the number printed in the image.' + +function assistantText(message: UIMessage): string { + return message.parts + .filter((p) => p.type === 'text') + .map((p) => (p as { content: string }).content) + .join('') + .trim() +} + +function ReproPage() { + const { messages, sendMessage, isLoading, error } = useChat({ + connection: fetchServerSentEvents('/api/image-tool-repro'), + }) + + const lastAssistant = [...messages] + .reverse() + .find((m) => m.role === 'assistant' && assistantText(m).length > 0) + const answer = lastAssistant ? assistantText(lastAssistant) : '' + // Truly visible only if the model reports the real secret number. + const sawImage = answer.includes(REPRO_SECRET) + const hasAnswer = answer.length > 0 + + return ( +
+
+
+

+ Issue #363 — Multimodal tool result repro +

+

+ The getReproImage server + tool returns an image as a multimodal content-part array. If tool + results are stringified, the model never sees the image and cannot + read the secret number it contains. When the image actually reaches + the model it reports the number correctly. +

+
+ +
+
+

+ What the tool actually returns: +

+ Secret number the model must read +

+ Secret number:{' '} + {REPRO_SECRET} (never sent + as text) +

+
+ +
+ + + {hasAnswer && ( +
+ {sawImage + ? `✅ Model read the secret number (${REPRO_SECRET}) — image reached the model` + : '❌ Model could NOT read the number — image was stringified (issue reproduced)'} +
+ )} + + {answer && ( +
+                {answer}
+              
+ )} + + {error && ( +
+ {error.message} +
+ )} +
+
+
+
+ ) +} + +export const Route = createFileRoute('/image-tool-repro')({ + component: ReproPage, +}) diff --git a/packages/ai-anthropic/src/adapters/text.ts b/packages/ai-anthropic/src/adapters/text.ts index f6b5b9d2d..09b0d249f 100644 --- a/packages/ai-anthropic/src/adapters/text.ts +++ b/packages/ai-anthropic/src/adapters/text.ts @@ -512,14 +512,20 @@ export class AnthropicTextAdapter< const role = message.role if (role === 'tool' && message.toolCallId) { + const toolContent = message.content formattedMessages.push({ role: 'user', content: [ { type: 'tool_result', tool_use_id: message.toolCallId, - content: - typeof message.content === 'string' ? message.content : '', + content: Array.isArray(toolContent) + ? toolContent.map((part) => + this.convertContentPartToAnthropic(part), + ) + : typeof toolContent === 'string' + ? toolContent + : '', }, ], }) diff --git a/packages/ai-anthropic/tests/tool-result-multimodal.test.ts b/packages/ai-anthropic/tests/tool-result-multimodal.test.ts new file mode 100644 index 000000000..b655f6ba9 --- /dev/null +++ b/packages/ai-anthropic/tests/tool-result-multimodal.test.ts @@ -0,0 +1,112 @@ +import { describe, it, expect, vi } from 'vitest' +import { chat } from '@tanstack/ai' +import { AnthropicTextAdapter } from '../src/adapters/text' + +const mocks = vi.hoisted(() => { + const betaMessagesCreate = vi.fn() + const messagesCreate = vi.fn() + + const client = { + beta: { + messages: { + create: betaMessagesCreate, + }, + }, + messages: { + create: messagesCreate, + }, + } + + return { betaMessagesCreate, messagesCreate, client } +}) + +vi.mock('@anthropic-ai/sdk', () => { + const { client } = mocks + + class MockAnthropic { + beta = client.beta + messages = client.messages + + constructor(_: { apiKey: string }) {} + } + + return { default: MockAnthropic } +}) + +describe('anthropic multimodal tool result', () => { + it('maps a ContentPart[] tool result to tool_result blocks', async () => { + mocks.betaMessagesCreate.mockResolvedValueOnce( + (async function* () { + yield { + type: 'content_block_start', + index: 0, + content_block: { type: 'text', text: '' }, + } + yield { + type: 'content_block_delta', + index: 0, + delta: { type: 'text_delta', text: 'done' }, + } + yield { type: 'content_block_stop', index: 0 } + yield { + type: 'message_delta', + delta: { stop_reason: 'end_turn' }, + usage: { output_tokens: 1 }, + } + yield { type: 'message_stop' } + })(), + ) + + const adapter = new AnthropicTextAdapter( + { apiKey: 'test-key' }, + 'claude-3-7-sonnet', + ) + + for await (const _ of chat({ + adapter, + messages: [ + { role: 'user', content: 'look' }, + { + role: 'assistant', + content: '', + toolCalls: [ + { + id: 'tu_1', + type: 'function', + function: { name: 'shot', arguments: '{}' }, + }, + ], + }, + { + role: 'tool', + toolCallId: 'tu_1', + content: [ + { type: 'text', content: 'screenshot' }, + { + type: 'image', + source: { type: 'data', value: 'AAAA', mimeType: 'image/png' }, + }, + ], + }, + ], + })) { + // consume stream + } + + expect(mocks.betaMessagesCreate).toHaveBeenCalledTimes(1) + const [payload] = mocks.betaMessagesCreate.mock.calls[0]! + + const toolMsg = payload.messages.find( + (m: any) => + Array.isArray(m.content) && m.content[0]?.type === 'tool_result', + ) + expect(toolMsg).toBeDefined() + const block = toolMsg.content[0] + expect(Array.isArray(block.content)).toBe(true) + expect(block.content[0]).toEqual({ type: 'text', text: 'screenshot' }) + expect(block.content[1]).toEqual({ + type: 'image', + source: { type: 'base64', data: 'AAAA', media_type: 'image/png' }, + }) + }) +}) diff --git a/packages/ai-client/src/devtools.ts b/packages/ai-client/src/devtools.ts index dc1444deb..07228ed73 100644 --- a/packages/ai-client/src/devtools.ts +++ b/packages/ai-client/src/devtools.ts @@ -1219,7 +1219,9 @@ function hydrateToolCallOutputs( candidate.output === undefined, ) if (toolCall) { - toolCall.output = parseFixtureResultContent(part.content) + toolCall.output = Array.isArray(part.content) + ? part.content + : parseFixtureResultContent(part.content) } } diff --git a/packages/ai-client/src/types.ts b/packages/ai-client/src/types.ts index c653531a6..426b05537 100644 --- a/packages/ai-client/src/types.ts +++ b/packages/ai-client/src/types.ts @@ -220,7 +220,7 @@ export type ToolCallPart = any> = export interface ToolResultPart { type: 'tool-result' toolCallId: string - content: string + content: string | Array state: ToolResultState error?: string // Error message if state is "error" } diff --git a/packages/ai-code-mode/models-eval/metrics.ts b/packages/ai-code-mode/models-eval/metrics.ts index 97a9d2944..26aa00d3d 100644 --- a/packages/ai-code-mode/models-eval/metrics.ts +++ b/packages/ai-code-mode/models-eval/metrics.ts @@ -76,7 +76,14 @@ export function computeMetrics(messages: Array): ComputedMetrics { if (part.type === 'tool-result') { toolResultLookup.set(part.toolCallId, { - content: part.content, + // `content` may be multimodal (`Array`) since tool + // results can now carry images/etc. The eval only inspects the JSON + // string `execute_typescript` returns, so coerce non-string content + // to a serialized form that `safeJsonParse` can handle. + content: + typeof part.content === 'string' + ? part.content + : JSON.stringify(part.content), state: part.state, error: part.error, }) diff --git a/packages/ai-gemini/src/adapters/text.ts b/packages/ai-gemini/src/adapters/text.ts index 8f9ee636a..9192732c2 100644 --- a/packages/ai-gemini/src/adapters/text.ts +++ b/packages/ai-gemini/src/adapters/text.ts @@ -744,15 +744,52 @@ export class GeminiTextAdapter< if (msg.role === 'tool' && msg.toolCallId) { const functionName = toolCallIdToName.get(msg.toolCallId) || msg.toolCallId - parts.push({ - functionResponse: { - id: msg.toolCallId, - name: functionName, - response: { - content: msg.content || '', + const toolContent = msg.content + if (Array.isArray(toolContent)) { + const textChunks: Array = [] + const mediaParts: Array = [] + for (const part of toolContent) { + if (part.type === 'text') { + textChunks.push(part.content) + } else if (part.source.type === 'data') { + mediaParts.push({ + inlineData: { + data: part.source.value, + mimeType: part.source.mimeType, + }, + }) + } else { + const defaultMimeType = { + image: 'image/jpeg', + audio: 'audio/mp3', + video: 'video/mp4', + document: 'application/pdf', + }[part.type] + mediaParts.push({ + fileData: { + fileUri: part.source.value, + mimeType: part.source.mimeType ?? defaultMimeType, + }, + }) + } + } + parts.push({ + functionResponse: { + id: msg.toolCallId, + name: functionName, + response: { content: textChunks.join('\n') }, + ...(mediaParts.length > 0 && { parts: mediaParts }), }, - }, - }) + }) + } else { + parts.push({ + functionResponse: { + id: msg.toolCallId, + name: functionName, + response: { content: toolContent || '' }, + }, + }) + } } return { diff --git a/packages/ai-gemini/tests/tool-result-multimodal.test.ts b/packages/ai-gemini/tests/tool-result-multimodal.test.ts new file mode 100644 index 000000000..3bae86343 --- /dev/null +++ b/packages/ai-gemini/tests/tool-result-multimodal.test.ts @@ -0,0 +1,248 @@ +import { describe, it, expect, beforeEach, vi } from 'vitest' +import { chat } from '@tanstack/ai' +import { GeminiTextAdapter } from '../src/adapters/text' + +const mocks = vi.hoisted(() => { + return { + constructorSpy: vi.fn<(options: { apiKey: string }) => void>(), + generateContentSpy: vi.fn(), + generateContentStreamSpy: vi.fn(), + getGenerativeModelSpy: vi.fn(), + } +}) + +vi.mock('@google/genai', async () => { + const { + constructorSpy, + generateContentSpy, + generateContentStreamSpy, + getGenerativeModelSpy, + } = mocks + + const actual = await vi.importActual('@google/genai') + class MockGoogleGenAI { + public models = { + generateContent: generateContentSpy, + generateContentStream: generateContentStreamSpy, + } + + public getGenerativeModel = getGenerativeModelSpy + + constructor(options: { apiKey: string }) { + constructorSpy(options) + } + } + + return { + GoogleGenAI: MockGoogleGenAI, + Type: actual.Type, + FinishReason: actual.FinishReason, + } +}) + +const createTextAdapter = () => + new GeminiTextAdapter({ apiKey: 'test-key' }, 'gemini-2.5-pro') + +const createStream = (chunks: Array>) => { + return (async function* () { + for (const chunk of chunks) { + yield chunk + } + })() +} + +describe('gemini multimodal tool result', () => { + beforeEach(() => { + vi.clearAllMocks() + }) + + it('splits text into response.content and media into functionResponse.parts', async () => { + const streamChunks = [ + { + candidates: [ + { + content: { parts: [{ text: 'ok' }] }, + finishReason: 'STOP', + }, + ], + usageMetadata: { totalTokenCount: 1 }, + }, + ] + + mocks.generateContentStreamSpy.mockResolvedValue(createStream(streamChunks)) + + const adapter = createTextAdapter() + + for await (const _ of chat({ + adapter, + messages: [ + { role: 'user', content: 'look' }, + { + role: 'assistant', + content: '', + toolCalls: [ + { + id: 'call_1', + type: 'function', + function: { name: 'shot', arguments: '{}' }, + }, + ], + }, + { + role: 'tool', + toolCallId: 'call_1', + content: [ + { type: 'text', content: 'screenshot' }, + { + type: 'image', + source: { type: 'data', value: 'AAAA', mimeType: 'image/png' }, + }, + ], + }, + ], + })) { + /* consume stream */ + } + + expect(mocks.generateContentStreamSpy).toHaveBeenCalledTimes(1) + const [payload] = mocks.generateContentStreamSpy.mock.calls[0]! + const contents: Array = payload.contents + + const fr = contents + .flatMap((c: any) => c.parts ?? []) + .find((p: any) => p.functionResponse)?.functionResponse + + expect(fr).toBeDefined() + expect(fr.response).toEqual({ content: 'screenshot' }) + expect(fr.parts).toEqual([ + { inlineData: { data: 'AAAA', mimeType: 'image/png' } }, + ]) + }) + + it('handles url-sourced media in tool results via fileData', async () => { + const streamChunks = [ + { + candidates: [ + { + content: { parts: [{ text: 'ok' }] }, + finishReason: 'STOP', + }, + ], + usageMetadata: { totalTokenCount: 1 }, + }, + ] + + mocks.generateContentStreamSpy.mockResolvedValue(createStream(streamChunks)) + + const adapter = createTextAdapter() + + for await (const _ of chat({ + adapter, + messages: [ + { role: 'user', content: 'look' }, + { + role: 'assistant', + content: '', + toolCalls: [ + { + id: 'call_2', + type: 'function', + function: { name: 'fetch_image', arguments: '{}' }, + }, + ], + }, + { + role: 'tool', + toolCallId: 'call_2', + content: [ + { type: 'text', content: 'result image' }, + { + type: 'image', + source: { + type: 'url', + value: 'https://example.com/image.jpg', + mimeType: 'image/jpeg', + }, + }, + ], + }, + ], + })) { + /* consume stream */ + } + + expect(mocks.generateContentStreamSpy).toHaveBeenCalledTimes(1) + const [payload] = mocks.generateContentStreamSpy.mock.calls[0]! + const contents: Array = payload.contents + + const fr = contents + .flatMap((c: any) => c.parts ?? []) + .find((p: any) => p.functionResponse)?.functionResponse + + expect(fr).toBeDefined() + expect(fr.response).toEqual({ content: 'result image' }) + expect(fr.parts).toEqual([ + { + fileData: { + fileUri: 'https://example.com/image.jpg', + mimeType: 'image/jpeg', + }, + }, + ]) + }) + + it('keeps backward-compatible string content in tool results', async () => { + const streamChunks = [ + { + candidates: [ + { + content: { parts: [{ text: 'ok' }] }, + finishReason: 'STOP', + }, + ], + usageMetadata: { totalTokenCount: 1 }, + }, + ] + + mocks.generateContentStreamSpy.mockResolvedValue(createStream(streamChunks)) + + const adapter = createTextAdapter() + + for await (const _ of chat({ + adapter, + messages: [ + { role: 'user', content: 'look' }, + { + role: 'assistant', + content: '', + toolCalls: [ + { + id: 'call_3', + type: 'function', + function: { name: 'plain_tool', arguments: '{}' }, + }, + ], + }, + { + role: 'tool', + toolCallId: 'call_3', + content: 'just a string', + }, + ], + })) { + /* consume stream */ + } + + expect(mocks.generateContentStreamSpy).toHaveBeenCalledTimes(1) + const [payload] = mocks.generateContentStreamSpy.mock.calls[0]! + const contents: Array = payload.contents + + const fr = contents + .flatMap((c: any) => c.parts ?? []) + .find((p: any) => p.functionResponse)?.functionResponse + + expect(fr).toBeDefined() + expect(fr.response).toEqual({ content: 'just a string' }) + expect(fr.parts).toBeUndefined() + }) +}) diff --git a/packages/ai/src/activities/chat/index.ts b/packages/ai/src/activities/chat/index.ts index 11c1d7c9f..c283b97c4 100644 --- a/packages/ai/src/activities/chat/index.ts +++ b/packages/ai/src/activities/chat/index.ts @@ -10,6 +10,7 @@ import { stripToSpecMiddleware } from '../../strip-to-spec-middleware' import { streamToText } from '../../stream-to-response.js' import { resolveDebugOption } from '../../logger/resolve' import { EventType } from '../../types' +import { normalizeToolResult } from '../../utilities/tool-result' import { LazyToolManager } from './tools/lazy-tool-manager' import { MiddlewareAbortError, @@ -1550,12 +1551,18 @@ class TextEngine< // Check for ModelMessage format (role: 'tool' messages contain tool results) // This handles results sent back from the client after executing client-side tools if (message.role === 'tool' && message.toolCallId) { - // Parse content back to original output (was stringified by uiMessageToModelMessages) + // Parse content back to original output (was stringified by + // uiMessageToModelMessages). Multimodal results carry an + // Array directly — pass it through without parsing. let output: unknown - try { - output = JSON.parse(message.content as string) - } catch { + if (Array.isArray(message.content)) { output = message.content + } else { + try { + output = JSON.parse(message.content as string) + } catch { + output = message.content + } } // Skip approval response messages (they have pendingExecution marker) // These are NOT real client tool results — they are synthetic tool messages @@ -1634,7 +1641,15 @@ class TextEngine< const chunks: Array = [] for (const result of results) { - const content = JSON.stringify(result.result) + // `content` is the canonical value for the tool `ModelMessage` — it may + // be an `Array` (multimodal) which the adapters convert to + // structured provider output on the next iteration. `wireContent` is the + // string form emitted on the AG-UI stream events (TOOL_CALL_END.result / + // TOOL_CALL_RESULT.content are string-only per the AG-UI spec); the + // multimodal array travels via the message itself, not the wire event. + const content = normalizeToolResult(result.result) + const wireContent = + typeof content === 'string' ? content : JSON.stringify(content) // Emit TOOL_CALL_START + TOOL_CALL_ARGS before TOOL_CALL_END so that // the client can reconstruct the full tool call during continuations. @@ -1666,18 +1681,18 @@ class TextEngine< toolCallId: result.toolCallId, toolCallName: result.toolName, toolName: result.toolName, - result: content, + result: wireContent, ...(result.state !== undefined && { state: result.state }), } as StreamChunk) - // AG-UI spec TOOL_CALL_RESULT event + // AG-UI spec TOOL_CALL_RESULT event (content is string-only per spec) chunks.push({ type: 'TOOL_CALL_RESULT', timestamp: Date.now(), model: finishEvent.model, messageId: this.createId('tool-result'), toolCallId: result.toolCallId, - content, + content: wireContent, role: 'tool', ...(result.state !== undefined && { state: result.state }), } as StreamChunk) diff --git a/packages/ai/src/activities/chat/messages.ts b/packages/ai/src/activities/chat/messages.ts index cf9892627..cd0108fbb 100644 --- a/packages/ai/src/activities/chat/messages.ts +++ b/packages/ai/src/activities/chat/messages.ts @@ -1,3 +1,4 @@ +import { normalizeToolResult } from '../../utilities/tool-result' import type { ContentPart, MessagePart, @@ -339,7 +340,7 @@ function buildAssistantMessages(uiMessage: UIMessage): Array { if (part.output !== undefined && !emittedToolResultIds.has(part.id)) { messageList.push({ role: 'tool', - content: JSON.stringify(part.output), + content: normalizeToolResult(part.output), toolCallId: part.id, }) emittedToolResultIds.add(part.id) diff --git a/packages/ai/src/activities/chat/stream/message-updaters.ts b/packages/ai/src/activities/chat/stream/message-updaters.ts index 3fd8895f3..f219c20d6 100644 --- a/packages/ai/src/activities/chat/stream/message-updaters.ts +++ b/packages/ai/src/activities/chat/stream/message-updaters.ts @@ -7,6 +7,7 @@ import { parsePartialJSON } from './json-parser' import type { + ContentPart, StructuredOutputPart, ThinkingPart, ToolCallPart, @@ -107,7 +108,7 @@ export function updateToolResultPart( messages: Array, messageId: string, toolCallId: string, - content: string, + content: string | Array, state: ToolResultState, error?: string, ): Array { diff --git a/packages/ai/src/activities/chat/stream/processor.ts b/packages/ai/src/activities/chat/stream/processor.ts index 83ad0f92a..bff119450 100644 --- a/packages/ai/src/activities/chat/stream/processor.ts +++ b/packages/ai/src/activities/chat/stream/processor.ts @@ -18,6 +18,7 @@ * adapter contract, single-shot flows, and expected UIMessage output. */ import { generateMessageId, uiMessageToModelMessages } from '../messages.js' +import { normalizeToolResult } from '../../../utilities/tool-result' import { defaultJSONParser } from './json-parser' import { appendStructuredOutputDelta, @@ -321,7 +322,7 @@ export class StreamProcessor { ) // Step 2: Create a tool-result part (for LLM conversation history) - const content = typeof output === 'string' ? output : JSON.stringify(output) + const content = normalizeToolResult(output) const toolResultState: ToolResultState = error ? 'error' : 'complete' updatedMessages = updateToolResultPart( @@ -1170,10 +1171,14 @@ export class StreamProcessor { // Step 1: Update the tool-call part's output field (for UI consistency // with client tools — see GitHub issue #176) let output: unknown - try { - output = JSON.parse(chunk.result) - } catch { + if (Array.isArray(chunk.result)) { output = chunk.result + } else { + try { + output = JSON.parse(chunk.result) + } catch { + output = chunk.result + } } this.messages = updateToolCallWithOutput( this.messages, diff --git a/packages/ai/src/activities/chat/tools/tool-calls.ts b/packages/ai/src/activities/chat/tools/tool-calls.ts index 6b3c85cee..9ab977cf9 100644 --- a/packages/ai/src/activities/chat/tools/tool-calls.ts +++ b/packages/ai/src/activities/chat/tools/tool-calls.ts @@ -1,6 +1,8 @@ +import { normalizeToolResult } from '../../../utilities/tool-result' import { isStandardSchema, parseWithStandardSchema } from './schema-converter' import type { AnyTool, + ContentPart, CustomEvent, ModelMessage, RunFinishedEvent, @@ -220,7 +222,7 @@ export class ToolCallManager< for (const toolCall of toolCallsArray) { const tool = this.tools.find((t) => t.name === toolCall.function.name) - let toolResultContent: string + let toolResultContent: string | Array let toolResultState: ToolOutputState | undefined if (tool?.execute) { try { @@ -280,8 +282,7 @@ export class ToolCallManager< } } - toolResultContent = - typeof result === 'string' ? result : JSON.stringify(result) + toolResultContent = normalizeToolResult(result) } catch (error: unknown) { // If tool execution fails, add error message const message = diff --git a/packages/ai/src/index.ts b/packages/ai/src/index.ts index 67cde405e..4a8e4ac8a 100644 --- a/packages/ai/src/index.ts +++ b/packages/ai/src/index.ts @@ -189,6 +189,11 @@ export { // AG-UI wire serialization (used internally by @tanstack/ai-client) export { uiMessagesToWire } from './utilities/ag-ui-wire' export type { WireMessage } from './utilities/ag-ui-wire' +export { + isContentPart, + isContentPartArray, + normalizeToolResult, +} from './utilities/tool-result' // Adapter extension utilities export { createModel, extendAdapter } from './extend-adapter' diff --git a/packages/ai/src/types.ts b/packages/ai/src/types.ts index f84912285..29f7d8574 100644 --- a/packages/ai/src/types.ts +++ b/packages/ai/src/types.ts @@ -357,7 +357,7 @@ export interface ToolCallPart { export interface ToolResultPart { type: 'tool-result' toolCallId: string - content: string + content: string | Array state: ToolResultState error?: string // Error message if state is "error" } @@ -1137,7 +1137,7 @@ export interface ToolCallEndEvent extends AGUIToolCallEndEvent { /** Final parsed input arguments (TanStack AI internal) */ input?: unknown /** Tool execution result (TanStack AI internal) */ - result?: string + result?: string | Array /** Tool execution output state (TanStack AI internal) */ state?: ToolOutputState } diff --git a/packages/ai/src/utilities/ag-ui-wire.ts b/packages/ai/src/utilities/ag-ui-wire.ts index 8f0c58317..40a3348ab 100644 --- a/packages/ai/src/utilities/ag-ui-wire.ts +++ b/packages/ai/src/utilities/ag-ui-wire.ts @@ -103,7 +103,10 @@ export function uiMessagesToWire( role: 'tool', id: deriveToolMessageId(part.toolCallId), toolCallId: part.toolCallId, - content: part.content, + content: + typeof part.content === 'string' + ? part.content + : JSON.stringify(part.content), ...(part.error !== undefined && { error: part.error }), }) } diff --git a/packages/ai/src/utilities/tool-result.ts b/packages/ai/src/utilities/tool-result.ts new file mode 100644 index 000000000..330c29be1 --- /dev/null +++ b/packages/ai/src/utilities/tool-result.ts @@ -0,0 +1,60 @@ +import type { ContentPart } from '../types' + +const CONTENT_PART_TYPES = new Set([ + 'text', + 'image', + 'audio', + 'video', + 'document', +]) + +/** + * Structural check for a single `ContentPart`. A text part must carry a string + * `content`; every other modality must carry a `source` with `type` of + * `'url' | 'data'` and a string `value`. + */ +export function isContentPart(value: unknown): value is ContentPart { + if (typeof value !== 'object' || value === null) return false + const part = value as Record + if (typeof part.type !== 'string' || !CONTENT_PART_TYPES.has(part.type)) { + return false + } + if (part.type === 'text') { + return typeof part.content === 'string' + } + const source = part.source + if (typeof source !== 'object' || source === null) return false + const src = source as Record + if (typeof src.value !== 'string') return false + // `data` sources require a mimeType (matches ContentPartDataSource); `url` + // sources don't. Requiring it here keeps the runtime guard consistent with + // the type and avoids emitting `data:undefined;base64,...` downstream. + if (src.type === 'data') return typeof src.mimeType === 'string' + return src.type === 'url' +} + +/** + * True iff `value` is a NON-EMPTY array whose every element is a valid + * `ContentPart`. Empty arrays and mixed arrays return false so they continue + * to be treated as ordinary (stringified) data — this keeps the auto-detection + * footgun narrow. + */ +export function isContentPartArray( + value: unknown, +): value is Array { + return Array.isArray(value) && value.length > 0 && value.every(isContentPart) +} + +/** + * Normalize a tool's return value for transport: + * - string → unchanged + * - ContentPart array → unchanged (multimodal, passed through to the adapter) + * - anything else → `JSON.stringify` + */ +export function normalizeToolResult( + result: unknown, +): string | Array { + if (typeof result === 'string') return result + if (isContentPartArray(result)) return result + return JSON.stringify(result) +} diff --git a/packages/ai/tests/multimodal-tool-result.test.ts b/packages/ai/tests/multimodal-tool-result.test.ts new file mode 100644 index 000000000..0f6741550 --- /dev/null +++ b/packages/ai/tests/multimodal-tool-result.test.ts @@ -0,0 +1,163 @@ +/** + * Regression tests for multimodal tool-result support (#363). + * + * When a server tool returns an Array (e.g. an image), chat() + * must preserve it as-is all the way to the adapter so the adapter can send + * a structured, multi-part tool_result instead of a JSON string. The code + * path under test is `buildToolResultChunks` in + * `packages/ai/src/activities/chat/index.ts`, which now calls + * `normalizeToolResult(result.result)` instead of `JSON.stringify(result.result)`. + */ + +import { describe, expect, it, vi } from 'vitest' +import { chat } from '../src/activities/chat/index' +import type { StreamChunk } from '../src/types' +import { ev, createMockAdapter, collectChunks, serverTool } from './test-utils' + +// --------------------------------------------------------------------------- +// Shared ContentPart fixture used across tests +// --------------------------------------------------------------------------- +const MULTIMODAL_RESULT = [ + { type: 'text' as const, content: 'screenshot' }, + { + type: 'image' as const, + source: { type: 'url' as const, value: 'https://x/y.png' }, + }, +] + +// --------------------------------------------------------------------------- +// Helper: drive chat() through one tool iteration + one final text iteration +// and return the `role:'tool'` ModelMessage seen by the adapter on call #2. +// --------------------------------------------------------------------------- +async function runWithToolAndCapture( + executeFn: () => unknown, +): Promise<{ role: string; content: unknown; toolCallId?: string }> { + const { adapter, calls } = createMockAdapter({ + iterations: [ + // Iteration 1: adapter emits a single tool call + [ + ev.runStarted(), + ev.toolStart('call_mm', 'screenshotTool'), + ev.toolArgs('call_mm', '{}'), + ev.runFinished('tool_calls'), + ], + // Iteration 2: adapter produces final text + [ + ev.runStarted(), + ev.textStart(), + ev.textContent('Done.'), + ev.textEnd(), + ev.runFinished('stop'), + ], + ], + }) + + const stream = chat({ + adapter, + messages: [{ role: 'user', content: 'Take a screenshot' }], + tools: [serverTool('screenshotTool', executeFn)], + }) + + await collectChunks(stream as AsyncIterable) + + // The adapter must have been called twice + expect(calls).toHaveLength(2) + + // Find the tool-result message in the second call's message list + const secondCallMessages = calls[1]!.messages as Array<{ + role: string + content: unknown + toolCallId?: string + }> + const toolMsg = secondCallMessages.find( + (m) => m.role === 'tool' && m.toolCallId === 'call_mm', + ) + expect(toolMsg).toBeDefined() + return toolMsg! +} + +// --------------------------------------------------------------------------- +// Tests +// --------------------------------------------------------------------------- + +describe('multimodal tool-result support (#363)', () => { + describe('ContentPart[] return value', () => { + it('preserves a ContentPart[] as an array in the tool message (not JSON-stringified)', async () => { + const executeSpy = vi.fn().mockReturnValue(MULTIMODAL_RESULT) + + const toolMsg = await runWithToolAndCapture(executeSpy) + + // The content MUST be an array, not a string + expect(Array.isArray(toolMsg.content)).toBe(true) + + // And it must equal the original ContentPart[] returned by the tool + expect(toolMsg.content).toEqual(MULTIMODAL_RESULT) + }) + + it('content is NOT a JSON string when tool returns ContentPart[]', async () => { + const executeSpy = vi.fn().mockReturnValue(MULTIMODAL_RESULT) + + const toolMsg = await runWithToolAndCapture(executeSpy) + + // The buggy behaviour was: JSON.stringify([{ type:'text', content:'screenshot' }, ...]) + // Guard against regression by asserting the type is explicitly NOT string. + expect(typeof toolMsg.content).not.toBe('string') + }) + }) + + describe('plain object return value (unchanged behaviour)', () => { + it('stringifies a plain object to JSON in the tool message', async () => { + const executeSpy = vi.fn().mockReturnValue({ ok: true }) + + const toolMsg = await runWithToolAndCapture(executeSpy) + + // Plain objects must still be JSON-stringified (existing behaviour) + expect(typeof toolMsg.content).toBe('string') + expect(toolMsg.content).toBe('{"ok":true}') + }) + }) + + describe('TOOL_CALL_RESULT stream chunk', () => { + it('emits string content on the wire event (AG-UI spec) while the message keeps the array', async () => { + const { adapter } = createMockAdapter({ + iterations: [ + [ + ev.runStarted(), + ev.toolStart('call_mm2', 'screenshotTool'), + ev.toolArgs('call_mm2', '{}'), + ev.runFinished('tool_calls'), + ], + [ + ev.runStarted(), + ev.textStart(), + ev.textContent('Done.'), + ev.textEnd(), + ev.runFinished('stop'), + ], + ], + }) + + const stream = chat({ + adapter, + messages: [{ role: 'user', content: 'Take a screenshot' }], + tools: [serverTool('screenshotTool', () => MULTIMODAL_RESULT)], + }) + + const chunks = await collectChunks(stream as AsyncIterable) + + // Locate the TOOL_CALL_RESULT chunk for our tool call + const resultChunk = chunks.find( + (c) => + c.type === 'TOOL_CALL_RESULT' && (c as any).toolCallId === 'call_mm2', + ) as any + + expect(resultChunk).toBeDefined() + // AG-UI TOOL_CALL_RESULT.content is string-only: the wire event carries + // the JSON-stringified array, NOT the array itself. The structured array + // travels on the tool ModelMessage (asserted above), which is what the + // next adapter iteration converts into a multimodal provider request. + expect(typeof resultChunk.content).toBe('string') + expect(resultChunk.content).toBe(JSON.stringify(MULTIMODAL_RESULT)) + }) + }) +}) diff --git a/packages/ai/tests/tool-result.test.ts b/packages/ai/tests/tool-result.test.ts new file mode 100644 index 000000000..020bf7f7b --- /dev/null +++ b/packages/ai/tests/tool-result.test.ts @@ -0,0 +1,67 @@ +import { describe, it, expect } from 'vitest' +import { + isContentPart, + isContentPartArray, + normalizeToolResult, +} from '../src/utilities/tool-result' +import type { ContentPart } from '../src/types' + +const image: ContentPart = { + type: 'image', + source: { type: 'url', value: 'https://example.com/a.png' }, +} +const text: ContentPart = { type: 'text', content: 'hello' } + +describe('isContentPart', () => { + it('accepts valid text and media parts', () => { + expect(isContentPart(text)).toBe(true) + expect(isContentPart(image)).toBe(true) + expect( + isContentPart({ + type: 'image', + source: { type: 'data', value: 'AAAA', mimeType: 'image/png' }, + }), + ).toBe(true) + }) + + it('rejects non-parts', () => { + expect(isContentPart(null)).toBe(false) + expect(isContentPart('hi')).toBe(false) + expect(isContentPart({ type: 'text' })).toBe(false) + expect(isContentPart({ type: 'image' })).toBe(false) + expect(isContentPart({ type: 'image', source: {} })).toBe(false) + expect(isContentPart({ type: 'bogus', content: 'x' })).toBe(false) + }) +}) + +describe('isContentPartArray', () => { + it('accepts a non-empty array of valid parts', () => { + expect(isContentPartArray([text, image])).toBe(true) + expect(isContentPartArray([text])).toBe(true) + }) + + it('rejects empty / mixed / non-arrays', () => { + expect(isContentPartArray([])).toBe(false) + expect(isContentPartArray([text, { foo: 1 }])).toBe(false) + expect(isContentPartArray([1, 2, 3])).toBe(false) + expect(isContentPartArray('hello')).toBe(false) + expect(isContentPartArray({ type: 'text', content: 'x' })).toBe(false) + }) +}) + +describe('normalizeToolResult', () => { + it('passes strings through unchanged', () => { + expect(normalizeToolResult('done')).toBe('done') + }) + + it('passes content-part arrays through unchanged', () => { + const arr = [text, image] + expect(normalizeToolResult(arr)).toBe(arr) + }) + + it('stringifies everything else', () => { + expect(normalizeToolResult({ ok: true })).toBe('{"ok":true}') + expect(normalizeToolResult([1, 2, 3])).toBe('[1,2,3]') + expect(normalizeToolResult([])).toBe('[]') + }) +}) diff --git a/packages/openai-base/src/adapters/chat-completions-text.ts b/packages/openai-base/src/adapters/chat-completions-text.ts index 064f92b9d..be2a0d7d0 100644 --- a/packages/openai-base/src/adapters/chat-completions-text.ts +++ b/packages/openai-base/src/adapters/chat-completions-text.ts @@ -1196,6 +1196,12 @@ export abstract class OpenAIBaseChatCompletionsTextAdapter< protected convertMessage(message: ModelMessage): ChatCompletionMessageParam { // Handle tool messages if (message.role === 'tool') { + // The Chat Completions API has no multimodal `tool` message support + // (unlike the Responses API's `function_call_output`). A tool that + // returns an `Array` is therefore stringified here — the + // documented fallback for providers on the chat-completions path + // (Groq, Ollama, Grok, OpenRouter chat). Multimodal tool results are + // only delivered structurally via the Responses adapter. return { role: 'tool', tool_call_id: message.toolCallId || '', diff --git a/packages/openai-base/src/adapters/responses-text.ts b/packages/openai-base/src/adapters/responses-text.ts index 43b3d9138..3daa45e71 100644 --- a/packages/openai-base/src/adapters/responses-text.ts +++ b/packages/openai-base/src/adapters/responses-text.ts @@ -13,6 +13,7 @@ import type { import type { Response, ResponseCreateParams, + ResponseFunctionCallOutputItem, ResponseInput, ResponseInputContent, ResponseStreamEvent, @@ -1693,13 +1694,17 @@ export abstract class OpenAIBaseResponsesTextAdapter< for (const message of messages) { // Handle tool messages - convert to FunctionToolCallOutput if (message.role === 'tool') { + const toolContent = message.content + const output: string | Array = + Array.isArray(toolContent) + ? toolContent.map((part) => this.convertContentPartToInput(part)) + : typeof toolContent === 'string' + ? toolContent + : JSON.stringify(toolContent) result.push({ type: 'function_call_output', call_id: message.toolCallId || '', - output: - typeof message.content === 'string' - ? message.content - : JSON.stringify(message.content), + output, }) continue } diff --git a/packages/openai-base/tests/responses-text.test.ts b/packages/openai-base/tests/responses-text.test.ts index 7430f6947..a522f2c49 100644 --- a/packages/openai-base/tests/responses-text.test.ts +++ b/packages/openai-base/tests/responses-text.test.ts @@ -2030,6 +2030,79 @@ describe('OpenAIBaseResponsesTextAdapter', () => { ]), ) }) + + it('converts a multimodal tool result to a structured function_call_output', async () => { + const streamChunks = [ + { + type: 'response.created', + response: { + id: 'resp-mm-1', + model: 'test-model', + status: 'in_progress', + }, + }, + { + type: 'response.completed', + response: { + id: 'resp-mm-1', + model: 'test-model', + status: 'completed', + output: [], + usage: { + input_tokens: 10, + output_tokens: 1, + total_tokens: 11, + }, + }, + }, + ] + + setupMockResponsesClient(streamChunks) + const adapter = new TestResponsesAdapter(testConfig, 'test-model') + + const chunks: Array = [] + for await (const chunk of adapter.chatStream({ + logger: testLogger, + model: 'test-model', + messages: [ + { role: 'user', content: 'look' }, + { + role: 'assistant', + content: '', + toolCalls: [ + { + id: 'call_1', + type: 'function', + function: { name: 'shot', arguments: '{}' }, + }, + ], + }, + { + role: 'tool', + toolCallId: 'call_1', + content: [ + { type: 'text', content: 'screenshot' }, + { + type: 'image', + source: { type: 'url', value: 'https://x/y.png' }, + }, + ], + }, + ], + })) { + chunks.push(chunk) + } + + const [payload] = mockResponsesCreate.mock.calls[0]! + const out = payload.input.find( + (i: any) => i.type === 'function_call_output', + ) + expect(Array.isArray(out.output)).toBe(true) + expect(out.output).toEqual([ + { type: 'input_text', text: 'screenshot' }, + { type: 'input_image', image_url: 'https://x/y.png', detail: 'auto' }, + ]) + }) }) describe('subclassing', () => { diff --git a/testing/e2e/src/routeTree.gen.ts b/testing/e2e/src/routeTree.gen.ts index b979c060b..b8edd29b3 100644 --- a/testing/e2e/src/routeTree.gen.ts +++ b/testing/e2e/src/routeTree.gen.ts @@ -27,6 +27,7 @@ import { Route as ApiToolsTestRouteImport } from './routes/api.tools-test' import { Route as ApiSummarizeRouteImport } from './routes/api.summarize' import { Route as ApiOpenrouterWebToolsWireRouteImport } from './routes/api.openrouter-web-tools-wire' import { Route as ApiOpenrouterCostRouteImport } from './routes/api.openrouter-cost' +import { Route as ApiMultimodalToolResultWireRouteImport } from './routes/api.multimodal-tool-result-wire' import { Route as ApiMiddlewareTestRouteImport } from './routes/api.middleware-test' import { Route as ApiImageRouteImport } from './routes/api.image' import { Route as ApiChatRouteImport } from './routes/api.chat' @@ -131,6 +132,12 @@ const ApiOpenrouterCostRoute = ApiOpenrouterCostRouteImport.update({ path: '/api/openrouter-cost', getParentRoute: () => rootRouteImport, } as any) +const ApiMultimodalToolResultWireRoute = + ApiMultimodalToolResultWireRouteImport.update({ + id: '/api/multimodal-tool-result-wire', + path: '/api/multimodal-tool-result-wire', + getParentRoute: () => rootRouteImport, + } as any) const ApiMiddlewareTestRoute = ApiMiddlewareTestRouteImport.update({ id: '/api/middleware-test', path: '/api/middleware-test', @@ -210,6 +217,7 @@ export interface FileRoutesByFullPath { '/api/chat': typeof ApiChatRoute '/api/image': typeof ApiImageRouteWithChildren '/api/middleware-test': typeof ApiMiddlewareTestRoute + '/api/multimodal-tool-result-wire': typeof ApiMultimodalToolResultWireRoute '/api/openrouter-cost': typeof ApiOpenrouterCostRoute '/api/openrouter-web-tools-wire': typeof ApiOpenrouterWebToolsWireRoute '/api/summarize': typeof ApiSummarizeRoute @@ -242,6 +250,7 @@ export interface FileRoutesByTo { '/api/chat': typeof ApiChatRoute '/api/image': typeof ApiImageRouteWithChildren '/api/middleware-test': typeof ApiMiddlewareTestRoute + '/api/multimodal-tool-result-wire': typeof ApiMultimodalToolResultWireRoute '/api/openrouter-cost': typeof ApiOpenrouterCostRoute '/api/openrouter-web-tools-wire': typeof ApiOpenrouterWebToolsWireRoute '/api/summarize': typeof ApiSummarizeRoute @@ -275,6 +284,7 @@ export interface FileRoutesById { '/api/chat': typeof ApiChatRoute '/api/image': typeof ApiImageRouteWithChildren '/api/middleware-test': typeof ApiMiddlewareTestRoute + '/api/multimodal-tool-result-wire': typeof ApiMultimodalToolResultWireRoute '/api/openrouter-cost': typeof ApiOpenrouterCostRoute '/api/openrouter-web-tools-wire': typeof ApiOpenrouterWebToolsWireRoute '/api/summarize': typeof ApiSummarizeRoute @@ -309,6 +319,7 @@ export interface FileRouteTypes { | '/api/chat' | '/api/image' | '/api/middleware-test' + | '/api/multimodal-tool-result-wire' | '/api/openrouter-cost' | '/api/openrouter-web-tools-wire' | '/api/summarize' @@ -341,6 +352,7 @@ export interface FileRouteTypes { | '/api/chat' | '/api/image' | '/api/middleware-test' + | '/api/multimodal-tool-result-wire' | '/api/openrouter-cost' | '/api/openrouter-web-tools-wire' | '/api/summarize' @@ -373,6 +385,7 @@ export interface FileRouteTypes { | '/api/chat' | '/api/image' | '/api/middleware-test' + | '/api/multimodal-tool-result-wire' | '/api/openrouter-cost' | '/api/openrouter-web-tools-wire' | '/api/summarize' @@ -406,6 +419,7 @@ export interface RootRouteChildren { ApiChatRoute: typeof ApiChatRoute ApiImageRoute: typeof ApiImageRouteWithChildren ApiMiddlewareTestRoute: typeof ApiMiddlewareTestRoute + ApiMultimodalToolResultWireRoute: typeof ApiMultimodalToolResultWireRoute ApiOpenrouterCostRoute: typeof ApiOpenrouterCostRoute ApiOpenrouterWebToolsWireRoute: typeof ApiOpenrouterWebToolsWireRoute ApiSummarizeRoute: typeof ApiSummarizeRoute @@ -544,6 +558,13 @@ declare module '@tanstack/react-router' { preLoaderRoute: typeof ApiOpenrouterCostRouteImport parentRoute: typeof rootRouteImport } + '/api/multimodal-tool-result-wire': { + id: '/api/multimodal-tool-result-wire' + path: '/api/multimodal-tool-result-wire' + fullPath: '/api/multimodal-tool-result-wire' + preLoaderRoute: typeof ApiMultimodalToolResultWireRouteImport + parentRoute: typeof rootRouteImport + } '/api/middleware-test': { id: '/api/middleware-test' path: '/api/middleware-test' @@ -707,6 +728,7 @@ const rootRouteChildren: RootRouteChildren = { ApiChatRoute: ApiChatRoute, ApiImageRoute: ApiImageRouteWithChildren, ApiMiddlewareTestRoute: ApiMiddlewareTestRoute, + ApiMultimodalToolResultWireRoute: ApiMultimodalToolResultWireRoute, ApiOpenrouterCostRoute: ApiOpenrouterCostRoute, ApiOpenrouterWebToolsWireRoute: ApiOpenrouterWebToolsWireRoute, ApiSummarizeRoute: ApiSummarizeRoute, diff --git a/testing/e2e/src/routes/api.multimodal-tool-result-wire.ts b/testing/e2e/src/routes/api.multimodal-tool-result-wire.ts new file mode 100644 index 000000000..7a765594b --- /dev/null +++ b/testing/e2e/src/routes/api.multimodal-tool-result-wire.ts @@ -0,0 +1,87 @@ +import { createFileRoute } from '@tanstack/react-router' +import { chat, createChatOptions } from '@tanstack/ai' +import { createTextAdapter } from '@/lib/providers' +import type { ModelMessage } from '@tanstack/ai' +import type { Provider } from '@/lib/types' + +// 1x1 transparent PNG (base64) — enough to assert structured passthrough. +const PNG_1x1 = + 'iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mNk+M8AAAMBAQDJ/1eYAAAAAElFTkSuQmCC' + +/** + * Wire-format verification for multimodal tool-result messages (#363). + * + * A tool message's `content` can now be `Array`, and the + * OpenAI / Anthropic / Gemini adapters convert it to structured provider tool + * output instead of `JSON.stringify`. This route drives a single `chat()` call + * whose `messages` already contain a multimodal tool result so the companion + * spec can inspect aimock's journal (`GET /v1/_requests`) and assert the + * adapter emitted STRUCTURED tool output (image block present) per provider. + */ +export const Route = createFileRoute('/api/multimodal-tool-result-wire')({ + server: { + handlers: { + POST: async ({ request }) => { + const url = new URL(request.url) + const provider = (url.searchParams.get('provider') ?? + 'openai') as Provider + const testId = url.searchParams.get('testId') ?? undefined + + const { adapter } = createTextAdapter( + provider, + undefined, + undefined, + testId, + ) + + const messages: Array = [ + { role: 'user', content: 'Look at the screenshot.' }, + { + role: 'assistant', + content: '', + toolCalls: [ + { + id: 'call_1', + type: 'function', + function: { name: 'getShot', arguments: '{}' }, + }, + ], + }, + { + role: 'tool', + toolCallId: 'call_1', + content: [ + { type: 'text', content: 'screenshot' }, + { + type: 'image', + source: { type: 'data', value: PNG_1x1, mimeType: 'image/png' }, + }, + ], + }, + ] + + try { + for await (const _ of chat({ + ...createChatOptions({ adapter }), + messages, + })) { + // Drain the stream. + } + } catch (error) { + return new Response( + JSON.stringify({ + ok: false, + error: error instanceof Error ? error.message : String(error), + }), + { status: 200, headers: { 'Content-Type': 'application/json' } }, + ) + } + + return new Response(JSON.stringify({ ok: true }), { + status: 200, + headers: { 'Content-Type': 'application/json' }, + }) + }, + }, + }, +}) diff --git a/testing/e2e/tests/multimodal-tool-result-wire.spec.ts b/testing/e2e/tests/multimodal-tool-result-wire.spec.ts new file mode 100644 index 000000000..b51b0e84e --- /dev/null +++ b/testing/e2e/tests/multimodal-tool-result-wire.spec.ts @@ -0,0 +1,130 @@ +import { test, expect } from './fixtures' + +/** + * Wire-format verification for multimodal tool-result messages (#363). + * + * A tool message's `content` can be `Array`, and the + * OpenAI / Anthropic / Gemini adapters must convert it to structured provider + * tool output instead of `JSON.stringify`. This spec drives the route at + * `/api/multimodal-tool-result-wire` (which calls `chat()` with a pre-built + * conversation that includes a multimodal tool result), then inspects + * aimock's journal (`GET /v1/_requests`) to assert the outbound bytes contain + * structured tool output per provider. + * + * Aimock normalises Anthropic and Gemini request bodies to an OpenAI-compatible + * form before journalling. As a result: + * - OpenAI (Responses API) — image parts survive in body.messages[tool].content + * - Anthropic — aimock's claudeToCompletionRequest strips image + * blocks from tool_result content; only text survives + * - Gemini — aimock's geminiToCompletionRequest strips inlineData + * parts from functionResponse; only text survives + * The OpenAI assertion therefore verifies image presence directly. The Anthropic + * and Gemini assertions verify that the chat() call + adapter completed the + * end-to-end HTTP round-trip without a synchronous throw — proving serialization + * of the Array tool result ran to completion. Structural correctness + * for those two providers is covered by adapter unit tests. + * + * Note: the Anthropic and Gemini adapters catch HTTP errors from aimock (no-fixture + * 404) and yield a RUN_ERROR stream event rather than throwing. The route therefore + * always returns ok:true as long as serialization does not crash synchronously. + * This makes ok:true a meaningful proof that the multimodal conversion succeeded. + */ +// Serial mode: each test clears then re-populates the aimock journal. +// Parallel workers would otherwise race on the global journal — one test's +// beforeEach DELETE clears a sibling test's entries before they are read. +test.describe.configure({ mode: 'serial' }) + +test.describe('multimodal tool result — wire format', () => { + test.beforeEach(async ({ request, aimockPort }) => { + // Clear the aimock journal before each serial test so we only assert + // against the request triggered by this specific test. + await request.delete(`http://127.0.0.1:${aimockPort}/v1/_requests`) + }) + + // ────────────────────────────────────────────────────────────────────────── + // OpenAI (Responses API — /v1/responses) + // + // aimock stores the normalised CompletionRequest for /v1/responses where + // function_call_output.output survives as-is in the tool message content. + // When output is an array (multimodal), it appears under + // body.messages[role=tool].content: Array<{type, ...}> + // with `input_image` and `input_text` items. + // ────────────────────────────────────────────────────────────────────────── + test('openai: tool message content is a structured array with an image part', async ({ + request, + aimockPort, + testId, + }) => { + await request.post( + `/api/multimodal-tool-result-wire?provider=openai&testId=${encodeURIComponent(testId)}`, + ) + const journal = await request.get( + `http://127.0.0.1:${aimockPort}/v1/_requests`, + ) + const entries = (await journal.json()) as Array<{ body: any }> + // OpenAI Responses API: aimock normalises function_call_output → role:'tool' + // and preserves the output array as-is in the content field. + const toolMsg = entries[0]?.body?.messages?.find( + (m: any) => m.role === 'tool', + ) + expect(Array.isArray(toolMsg?.content)).toBe(true) + expect(toolMsg.content.some((p: any) => p.type === 'input_image')).toBe( + true, + ) + expect(toolMsg.content.some((p: any) => p.type === 'input_text')).toBe(true) + }) + + // ────────────────────────────────────────────────────────────────────────── + // Anthropic (/v1/messages) + // + // The Anthropic adapter wraps the tool result in a user message with a + // tool_result block whose content is Array. However, + // aimock's claudeToCompletionRequest normalisation strips non-text blocks + // before journalling. The adapter also catches HTTP errors (aimock returns + // a 404 when no fixture matches) and yields a RUN_ERROR event rather than + // throwing, so the route always returns ok:true when serialization succeeds. + // + // Assertion: ok:true proves chat() + the Anthropic adapter serialized the + // Array tool result and completed the round-trip without a + // synchronous throw (i.e. the multimodal conversion did not crash). + // ────────────────────────────────────────────────────────────────────────── + test('anthropic: multimodal tool result completes end-to-end (image structure covered by unit test)', async ({ + request, + aimockPort, + testId, + }) => { + const res = await request.post( + `/api/multimodal-tool-result-wire?provider=anthropic&testId=${encodeURIComponent(testId)}`, + ) + const { ok } = (await res.json()) as { ok: boolean; error?: string } + // Structural proof that the image becomes a tool_result image block lives in packages/ai-anthropic/tests/tool-result-multimodal.test.ts — aimock's journal strips multimodal tool content so it can't be asserted here. + expect(ok).toBe(true) + }) + + // ────────────────────────────────────────────────────────────────────────── + // Gemini (/v1beta/models/.../streamGenerateContent) + // + // The Gemini adapter emits a functionResponse with parts:[{inlineData:{...}}] + // for the image alongside response:{content:'screenshot'} for the text. + // aimock's geminiToCompletionRequest normalises this to role:'tool' and + // JSON.stringifies only the response object (dropping inlineData). + // The adapter catches HTTP errors (aimock 404 on no-fixture) and yields a + // RUN_ERROR event rather than throwing, so ok:true means serialization succeeded. + // + // Assertion: ok:true proves chat() + the Gemini adapter serialized the + // Array tool result and completed the round-trip without a + // synchronous throw (i.e. the multimodal conversion did not crash). + // ────────────────────────────────────────────────────────────────────────── + test('gemini: multimodal tool result completes end-to-end (image structure covered by unit test)', async ({ + request, + aimockPort, + testId, + }) => { + const res = await request.post( + `/api/multimodal-tool-result-wire?provider=gemini&testId=${encodeURIComponent(testId)}`, + ) + const { ok } = (await res.json()) as { ok: boolean; error?: string } + // Structural proof that the image becomes a functionResponse.parts inlineData entry lives in packages/ai-gemini/tests/tool-result-multimodal.test.ts — aimock's journal strips multimodal tool content so it can't be asserted here. + expect(ok).toBe(true) + }) +})