From b5dc1636503a2310cd72e8a4941ea38d75cee75d Mon Sep 17 00:00:00 2001
From: daniel-lxs
Date: Wed, 19 Nov 2025 12:25:39 -0500
Subject: [PATCH 1/2] fix: preserve user images in native tool call results

When native tool calls are enabled, user images sent with responses were
being converted to the placeholder text '(image content)' instead of
being preserved and sent to the LLM.

This fix:
- Preserves image blocks in tool_result content as arrays when images
  are present
- Only converts to a string when no images exist (for a cleaner
  representation)
- Maintains compatibility with the Anthropic API's tool_result format

Also adds comprehensive test coverage for image handling in both the
native and XML protocols.
---
 .../presentAssistantMessage-images.spec.ts | 208 ++++++++++++++++++
 .../presentAssistantMessage.ts             |  33 +--
 2 files changed, 228 insertions(+), 13 deletions(-)
 create mode 100644 src/core/assistant-message/__tests__/presentAssistantMessage-images.spec.ts

diff --git a/src/core/assistant-message/__tests__/presentAssistantMessage-images.spec.ts b/src/core/assistant-message/__tests__/presentAssistantMessage-images.spec.ts
new file mode 100644
index 0000000000..ba91c35339
--- /dev/null
+++ b/src/core/assistant-message/__tests__/presentAssistantMessage-images.spec.ts
@@ -0,0 +1,208 @@
// npx vitest src/core/assistant-message/__tests__/presentAssistantMessage-images.spec.ts

import { describe, it, expect, beforeEach, vi } from "vitest"
import { Anthropic } from "@anthropic-ai/sdk"
import { presentAssistantMessage } from "../presentAssistantMessage"
import { Task } from "../../task/Task"
import { TOOL_PROTOCOL } from "@roo-code/types"

// Mock dependencies
vi.mock("../../task/Task")
vi.mock("../../tools/validateToolUse", () => ({
	validateToolUse: vi.fn(),
}))
vi.mock("@roo-code/telemetry", () => ({
	TelemetryService: {
		instance: {
			captureToolUsage: vi.fn(),
			captureConsecutiveMistakeError: vi.fn(),
		},
	},
}))

describe("presentAssistantMessage - Image Handling in Native Tool Calls", () => {
	let mockTask: any

	beforeEach(() => {
		// Create a mock Task with the minimal properties needed for testing
		mockTask = {
			taskId: "test-task-id",
			instanceId: "test-instance",
			abort: false,
			presentAssistantMessageLocked: false,
			presentAssistantMessageHasPendingUpdates: false,
			currentStreamingContentIndex: 0,
			assistantMessageContent: [],
			userMessageContent: [],
			didCompleteReadingStream: false,
			didRejectTool: false,
			didAlreadyUseTool: false,
			diffEnabled: false,
			consecutiveMistakeCount: 0,
			api: {
				getModel: () => ({ id: "test-model", info: {} }),
			},
			browserSession: {
				closeBrowser: vi.fn().mockResolvedValue(undefined),
			},
			recordToolUsage: vi.fn(),
			toolRepetitionDetector: {
				check: vi.fn().mockReturnValue({ allowExecution: true }),
			},
			providerRef: {
				deref: () => ({
					getState: vi.fn().mockResolvedValue({
						mode: "code",
						customModes: [],
					}),
				}),
			},
			say: vi.fn().mockResolvedValue(undefined),
			ask: vi.fn().mockResolvedValue({ response: "yesButtonClicked" }),
		}
	})

	it("should preserve images in tool_result for native protocol", async () => {
		// Set up a tool_use block with an ID (indicates native protocol)
		const toolCallId = "tool_call_123"
		mockTask.assistantMessageContent = [
			{
				type: "tool_use",
				id: toolCallId, // ID indicates native protocol
				name: "ask_followup_question",
				params: { question: "What do you see?" },
			},
		]

		// Mock the ask response to include an image (imageBlock documents the expected shape)
		const imageBlock: Anthropic.ImageBlockParam = {
			type: "image",
			source: {
				type: "base64",
				media_type: "image/png",
				data: "base64ImageData",
			},
		}

		mockTask.ask = vi.fn().mockResolvedValue({
			response: "yesButtonClicked",
			text: "I see a cat",
			images: ["data:image/png;base64,base64ImageData"],
		})

		// Execute presentAssistantMessage
		await presentAssistantMessage(mockTask)

		// Verify that userMessageContent was populated
		expect(mockTask.userMessageContent.length).toBeGreaterThan(0)

		// Find the tool_result block
		const toolResult = mockTask.userMessageContent.find(
			(item: any) => item.type === "tool_result" && item.tool_use_id === toolCallId,
		)

		expect(toolResult).toBeDefined()
		expect(toolResult.tool_use_id).toBe(toolCallId)

		// Check if content is an array (images should be preserved as array)
		// When images are present, content should be an array containing image blocks
		if (Array.isArray(toolResult.content)) {
			// Images were preserved!
			const hasImageBlock = toolResult.content.some((block: any) => block.type === "image")
			expect(hasImageBlock).toBe(true)
		} else {
			// If it's a string, images were NOT preserved (this is the bug we're fixing)
			// This test should PASS after the fix
			expect(Array.isArray(toolResult.content)).toBe(true)
		}
	})

	it("should convert to string when no images are present (native protocol)", async () => {
		// Set up a tool_use block with an ID (indicates native protocol)
		const toolCallId = "tool_call_456"
		mockTask.assistantMessageContent = [
			{
				type: "tool_use",
				id: toolCallId,
				name: "ask_followup_question",
				params: { question: "What is your name?" },
			},
		]

		// Response with text but NO images
		mockTask.ask = vi.fn().mockResolvedValue({
			response: "yesButtonClicked",
			text: "My name is Alice",
			images: undefined,
		})

		await presentAssistantMessage(mockTask)

		const toolResult = mockTask.userMessageContent.find(
			(item: any) => item.type === "tool_result" && item.tool_use_id === toolCallId,
		)

		expect(toolResult).toBeDefined()

		// When no images are present, content should be a string
		expect(typeof toolResult.content).toBe("string")
	})

	it("should preserve images in content array for XML protocol (existing behavior)", async () => {
		// Set up a tool_use block WITHOUT an ID (indicates XML protocol)
		mockTask.assistantMessageContent = [
			{
				type: "tool_use",
				// No ID = XML protocol
				name: "ask_followup_question",
				params: { question: "What do you see?" },
			},
		]

		mockTask.ask = vi.fn().mockResolvedValue({
			response: "yesButtonClicked",
			text: "I see a dog",
			images: ["data:image/png;base64,dogImageData"],
		})

		await presentAssistantMessage(mockTask)

		// For XML protocol, content is added as separate blocks
		// Check that both text and image blocks were added
		const hasTextBlock = mockTask.userMessageContent.some((item: any) => item.type === "text")
		const hasImageBlock = mockTask.userMessageContent.some((item: any) => item.type === "image")

		expect(hasTextBlock).toBe(true)
		// XML protocol preserves images as separate blocks in userMessageContent
		expect(hasImageBlock).toBe(true)
	})

	it("should handle empty tool result gracefully", async () => {
		const toolCallId = "tool_call_789"
		mockTask.assistantMessageContent = [
			{
				type: "tool_use",
				id: toolCallId,
				name: "attempt_completion",
				params: { result: "Task completed" },
			},
		]

		// Empty response
		mockTask.ask = vi.fn().mockResolvedValue({
			response: "yesButtonClicked",
			text: undefined,
			images: undefined,
		})

		await presentAssistantMessage(mockTask)

		const toolResult = mockTask.userMessageContent.find(
			(item: any) => item.type === "tool_result" && item.tool_use_id === toolCallId,
		)

		expect(toolResult).toBeDefined()
		// Should have fallback text
		expect(toolResult.content).toBeTruthy()
	})
})

diff --git a/src/core/assistant-message/presentAssistantMessage.ts b/src/core/assistant-message/presentAssistantMessage.ts
index e2cbe37812..102071eacd 100644
--- a/src/core/assistant-message/presentAssistantMessage.ts
+++ b/src/core/assistant-message/presentAssistantMessage.ts
@@ -297,22 +297,29 @@ export async function presentAssistantMessage(cline: Task) {
 				}
 
 				// For native protocol, add as tool_result block
-				let resultContent: string
+				// Preserve image blocks in the content array instead of converting to strings
+				let resultContent: string | Array<Anthropic.TextBlockParam | Anthropic.ImageBlockParam>
 				if (typeof content === "string") {
 					resultContent = content || "(tool did not return anything)"
 				} else {
-					// Convert array of content blocks to string for tool result
-					// Tool results in OpenAI format only support strings
-					resultContent = content
-						.map((item) => {
-							if (item.type === "text") {
-								return item.text
-							} else if (item.type === "image") {
-								return "(image content)"
-							}
-							return ""
-						})
-						.join("\n")
+					// Preserve both text and image blocks in the content array
+					// This allows images sent by the user to be included in tool results
+					const hasImages = content.some((item) => item.type === "image")
+					if (hasImages) {
+						// Keep as array to preserve image blocks
+						resultContent = content
+					} else {
+						// If no images, convert to string for simpler representation
+						resultContent =
+							content
+								.map((item) => {
+									if (item.type === "text") {
+										return item.text
+									}
+									return ""
+								})
+								.join("\n") || "(tool did not return anything)"
+					}
 				}
 
 				cline.userMessageContent.push({

From e1749bcab4c1b5105fd30c96716a9058807af986 Mon Sep 17 00:00:00 2001
From: daniel-lxs
Date: Wed, 19 Nov 2025 12:51:43 -0500
Subject: [PATCH 2/2] fix: properly handle images in native tool results

Images should be added as separate blocks after the tool_result, not
inside the tool_result content itself. Native tool results are
serialized with string content (tool results in OpenAI format only
support strings), so images travel as separate content blocks in the
user message.

Updated tests to verify images are correctly added as separate blocks
in the user message content.
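
For illustration, a native tool call that returns text plus an image now
produces a user message shaped roughly like this (a sketch; the id and
values reuse the fixtures from the tests above, not real runtime output):

	cline.userMessageContent = [
		// tool_result carries only the flattened text
		{ type: "tool_result", tool_use_id: "tool_call_123", content: "I see a cat" },
		// the image follows as its own content block
		{
			type: "image",
			source: { type: "base64", media_type: "image/png", data: "base64ImageData" },
		},
	]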
---
 .../presentAssistantMessage-images.spec.ts | 19 ++++-----
 .../presentAssistantMessage.ts             | 40 +++++++++----------
 2 files changed, 27 insertions(+), 32 deletions(-)

diff --git a/src/core/assistant-message/__tests__/presentAssistantMessage-images.spec.ts b/src/core/assistant-message/__tests__/presentAssistantMessage-images.spec.ts
index ba91c35339..b78f184251 100644
--- a/src/core/assistant-message/__tests__/presentAssistantMessage-images.spec.ts
+++ b/src/core/assistant-message/__tests__/presentAssistantMessage-images.spec.ts
@@ -104,17 +104,14 @@ describe("presentAssistantMessage - Image Handling in Native Tool Calls", () =>
 		expect(toolResult).toBeDefined()
 		expect(toolResult.tool_use_id).toBe(toolCallId)
 
-		// Check if content is an array (images should be preserved as array)
-		// When images are present, content should be an array containing image blocks
-		if (Array.isArray(toolResult.content)) {
-			// Images were preserved!
-			const hasImageBlock = toolResult.content.some((block: any) => block.type === "image")
-			expect(hasImageBlock).toBe(true)
-		} else {
-			// If it's a string, images were NOT preserved (this is the bug we're fixing)
-			// This test should PASS after the fix
-			expect(Array.isArray(toolResult.content)).toBe(true)
-		}
+		// For native protocol, tool_result content should be a string (text only)
+		expect(typeof toolResult.content).toBe("string")
+		expect(toolResult.content).toContain("I see a cat")
+
+		// Images should be added as separate blocks AFTER the tool_result
+		const imageBlocks = mockTask.userMessageContent.filter((item: any) => item.type === "image")
+		expect(imageBlocks.length).toBeGreaterThan(0)
+		expect(imageBlocks[0].source.data).toBe("base64ImageData")
 	})
 
 	it("should convert to string when no images are present (native protocol)", async () => {
diff --git a/src/core/assistant-message/presentAssistantMessage.ts b/src/core/assistant-message/presentAssistantMessage.ts
index 102071eacd..fcf7d25cc9 100644
--- a/src/core/assistant-message/presentAssistantMessage.ts
+++ b/src/core/assistant-message/presentAssistantMessage.ts
@@ -296,38 +296,36 @@ export async function presentAssistantMessage(cline: Task) {
 					return
 				}
 
-				// For native protocol, add as tool_result block
-				// Preserve image blocks in the content array instead of converting to strings
-				let resultContent: string | Array<Anthropic.TextBlockParam | Anthropic.ImageBlockParam>
+				// For native protocol, tool_result content must be a string
+				// Images are added as separate blocks in the user message
+				let resultContent: string
+				let imageBlocks: Anthropic.ImageBlockParam[] = []
+
 				if (typeof content === "string") {
 					resultContent = content || "(tool did not return anything)"
 				} else {
-					// Preserve both text and image blocks in the content array
-					// This allows images sent by the user to be included in tool results
-					const hasImages = content.some((item) => item.type === "image")
-					if (hasImages) {
-						// Keep as array to preserve image blocks
-						resultContent = content
-					} else {
-						// If no images, convert to string for simpler representation
-						resultContent =
-							content
-								.map((item) => {
-									if (item.type === "text") {
-										return item.text
-									}
-									return ""
-								})
-								.join("\n") || "(tool did not return anything)"
-					}
+					// Separate text and image blocks
+					const textBlocks = content.filter((item) => item.type === "text")
+					imageBlocks = content.filter((item) => item.type === "image") as Anthropic.ImageBlockParam[]
+
+					// Convert text blocks to string for tool_result
+					resultContent =
+						textBlocks.map((item) => (item as Anthropic.TextBlockParam).text).join("\n") ||
+						"(tool did not return anything)"
 				}
 
+				// Add tool_result with text content only
 				cline.userMessageContent.push({
 					type: "tool_result",
 					tool_use_id: toolCallId,
 					content: resultContent,
 				} as Anthropic.ToolResultBlockParam)
 
+				// Add image blocks separately after tool_result
+				if (imageBlocks.length > 0) {
+					cline.userMessageContent.push(...imageBlocks)
+				}
+
 				hasToolResult = true
 			} else {
 				// For XML protocol, add as text blocks (legacy behavior)
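
The string-flattening rule from PATCH 2/2 is easy to exercise in
isolation. Below is a minimal standalone sketch; flattenTextBlocks is a
hypothetical helper written for illustration (it is not part of the
patch) and assumes only the @anthropic-ai/sdk types:

	import { Anthropic } from "@anthropic-ai/sdk"

	// Hypothetical helper mirroring the patch's flattening logic:
	// text blocks join with newlines, and an empty result falls back
	// to the same placeholder string used in presentAssistantMessage.
	function flattenTextBlocks(content: Array<Anthropic.TextBlockParam | Anthropic.ImageBlockParam>): string {
		return (
			content
				.filter((item): item is Anthropic.TextBlockParam => item.type === "text")
				.map((item) => item.text)
				.join("\n") || "(tool did not return anything)"
		)
	}

	// flattenTextBlocks([{ type: "text", text: "My name is Alice" }]) === "My name is Alice"
	// flattenTextBlocks([]) === "(tool did not return anything)"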