205 changes: 205 additions & 0 deletions src/core/assistant-message/__tests__/presentAssistantMessage-images.spec.ts
@@ -0,0 +1,205 @@
// npx vitest src/core/assistant-message/__tests__/presentAssistantMessage-images.spec.ts

import { describe, it, expect, beforeEach, vi } from "vitest"
import { Anthropic } from "@anthropic-ai/sdk"
import { presentAssistantMessage } from "../presentAssistantMessage"
import { Task } from "../../task/Task"
import { TOOL_PROTOCOL } from "@roo-code/types"

// Mock dependencies
vi.mock("../../task/Task")
vi.mock("../../tools/validateToolUse", () => ({
validateToolUse: vi.fn(),
}))
vi.mock("@roo-code/telemetry", () => ({
TelemetryService: {
instance: {
captureToolUsage: vi.fn(),
captureConsecutiveMistakeError: vi.fn(),
},
},
}))

describe("presentAssistantMessage - Image Handling in Native Tool Calls", () => {
	let mockTask: any

	beforeEach(() => {
		// Create a mock Task with minimal properties needed for testing
		mockTask = {
			taskId: "test-task-id",
			instanceId: "test-instance",
			abort: false,
			presentAssistantMessageLocked: false,
			presentAssistantMessageHasPendingUpdates: false,
			currentStreamingContentIndex: 0,
			assistantMessageContent: [],
			userMessageContent: [],
			didCompleteReadingStream: false,
			didRejectTool: false,
			didAlreadyUseTool: false,
			diffEnabled: false,
			consecutiveMistakeCount: 0,
			api: {
				getModel: () => ({ id: "test-model", info: {} }),
			},
			browserSession: {
				closeBrowser: vi.fn().mockResolvedValue(undefined),
			},
			recordToolUsage: vi.fn(),
			toolRepetitionDetector: {
				check: vi.fn().mockReturnValue({ allowExecution: true }),
			},
			providerRef: {
				deref: () => ({
					getState: vi.fn().mockResolvedValue({
						mode: "code",
						customModes: [],
					}),
				}),
			},
			say: vi.fn().mockResolvedValue(undefined),
			ask: vi.fn().mockResolvedValue({ response: "yesButtonClicked" }),
		}
	})

it("should preserve images in tool_result for native protocol", async () => {
// Set up a tool_use block with an ID (indicates native protocol)
const toolCallId = "tool_call_123"
mockTask.assistantMessageContent = [
{
type: "tool_use",
id: toolCallId, // ID indicates native protocol
name: "ask_followup_question",
params: { question: "What do you see?" },
},
]

// Create a mock askApproval that includes images in the response
const imageBlock: Anthropic.ImageBlockParam = {
type: "image",
source: {
type: "base64",
media_type: "image/png",
data: "base64ImageData",
},
}

mockTask.ask = vi.fn().mockResolvedValue({
response: "yesButtonClicked",
text: "I see a cat",
images: ["data:image/png;base64,base64ImageData"],
})

// Execute presentAssistantMessage
await presentAssistantMessage(mockTask)

// Verify that userMessageContent was populated
expect(mockTask.userMessageContent.length).toBeGreaterThan(0)

// Find the tool_result block
const toolResult = mockTask.userMessageContent.find(
(item: any) => item.type === "tool_result" && item.tool_use_id === toolCallId,
)

expect(toolResult).toBeDefined()
expect(toolResult.tool_use_id).toBe(toolCallId)

// For native protocol, tool_result content should be a string (text only)
expect(typeof toolResult.content).toBe("string")
expect(toolResult.content).toContain("I see a cat")

// Images should be added as separate blocks AFTER the tool_result
const imageBlocks = mockTask.userMessageContent.filter((item: any) => item.type === "image")
expect(imageBlocks.length).toBeGreaterThan(0)
expect(imageBlocks[0].source.data).toBe("base64ImageData")
})

it("should convert to string when no images are present (native protocol)", async () => {
// Set up a tool_use block with an ID (indicates native protocol)
const toolCallId = "tool_call_456"
mockTask.assistantMessageContent = [
{
type: "tool_use",
id: toolCallId,
name: "ask_followup_question",
params: { question: "What is your name?" },
},
]

// Response with text but NO images
mockTask.ask = vi.fn().mockResolvedValue({
response: "yesButtonClicked",
text: "My name is Alice",
images: undefined,
})

await presentAssistantMessage(mockTask)

const toolResult = mockTask.userMessageContent.find(
(item: any) => item.type === "tool_result" && item.tool_use_id === toolCallId,
)

expect(toolResult).toBeDefined()

// When no images, content should be a string
expect(typeof toolResult.content).toBe("string")
})

it("should preserve images in content array for XML protocol (existing behavior)", async () => {
// Set up a tool_use block WITHOUT an ID (indicates XML protocol)
mockTask.assistantMessageContent = [
{
type: "tool_use",
// No ID = XML protocol
name: "ask_followup_question",
params: { question: "What do you see?" },
},
]

mockTask.ask = vi.fn().mockResolvedValue({
response: "yesButtonClicked",
text: "I see a dog",
images: ["data:image/png;base64,dogImageData"],
})

await presentAssistantMessage(mockTask)

// For XML protocol, content is added as separate blocks
// Check that both text and image blocks were added
const hasTextBlock = mockTask.userMessageContent.some((item: any) => item.type === "text")
const hasImageBlock = mockTask.userMessageContent.some((item: any) => item.type === "image")

expect(hasTextBlock).toBe(true)
// XML protocol preserves images as separate blocks in userMessageContent
expect(hasImageBlock).toBe(true)
})

it("should handle empty tool result gracefully", async () => {
const toolCallId = "tool_call_789"
mockTask.assistantMessageContent = [
{
type: "tool_use",
id: toolCallId,
name: "attempt_completion",
params: { result: "Task completed" },
},
]

// Empty response
mockTask.ask = vi.fn().mockResolvedValue({
response: "yesButtonClicked",
text: undefined,
images: undefined,
})

await presentAssistantMessage(mockTask)

const toolResult = mockTask.userMessageContent.find(
(item: any) => item.type === "tool_result" && item.tool_use_id === toolCallId,
)

expect(toolResult).toBeDefined()
// Should have fallback text
expect(toolResult.content).toBeTruthy()
})
})
31 changes: 18 additions & 13 deletions src/core/assistant-message/presentAssistantMessage.ts
@@ -296,31 +296,36 @@ export async function presentAssistantMessage(cline: Task) {
		return
	}

	// For native protocol, add as tool_result block
	// For native protocol, tool_result content must be a string
	// Images are added as separate blocks in the user message
	let resultContent: string
	let imageBlocks: Anthropic.ImageBlockParam[] = []

	if (typeof content === "string") {
		resultContent = content || "(tool did not return anything)"
	} else {
		// Convert array of content blocks to string for tool result
		// Tool results in OpenAI format only support strings
		resultContent = content
			.map((item) => {
				if (item.type === "text") {
					return item.text
				} else if (item.type === "image") {
					return "(image content)"
				}
				return ""
			})
			.join("\n")
		// Separate text and image blocks
		const textBlocks = content.filter((item) => item.type === "text")
		imageBlocks = content.filter((item) => item.type === "image") as Anthropic.ImageBlockParam[]

		// Convert text blocks to string for tool_result
		resultContent =
			textBlocks.map((item) => (item as Anthropic.TextBlockParam).text).join("\n") ||
			"(tool did not return anything)"
	}

	// Add tool_result with text content only
	cline.userMessageContent.push({
		type: "tool_result",
		tool_use_id: toolCallId,
		content: resultContent,
	} as Anthropic.ToolResultBlockParam)

	// Add image blocks separately after tool_result
	if (imageBlocks.length > 0) {
		cline.userMessageContent.push(...imageBlocks)
	}

	hasToolResult = true
} else {
	// For XML protocol, add as text blocks (legacy behavior)
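For reference, here is a minimal standalone sketch of the native-protocol behavior exercised by the new tests and implemented in the hunk above. It is illustrative only: the helper name buildNativeToolResultBlocks and the local ToolResponse alias are assumptions, not part of the PR, but the splitting logic mirrors the diff — text becomes a string-only tool_result, and images follow as separate blocks.

import { Anthropic } from "@anthropic-ai/sdk"

// Illustrative alias for the tool response shape used above.
type ToolResponse = string | Array<Anthropic.TextBlockParam | Anthropic.ImageBlockParam>

// Build the blocks the native-protocol path appends to userMessageContent:
// a tool_result whose content is plain text, followed by any image blocks.
function buildNativeToolResultBlocks(
	toolCallId: string,
	content: ToolResponse,
): Array<Anthropic.ToolResultBlockParam | Anthropic.ImageBlockParam> {
	let resultContent: string
	let imageBlocks: Anthropic.ImageBlockParam[] = []

	if (typeof content === "string") {
		resultContent = content || "(tool did not return anything)"
	} else {
		const textBlocks = content.filter((item): item is Anthropic.TextBlockParam => item.type === "text")
		imageBlocks = content.filter((item): item is Anthropic.ImageBlockParam => item.type === "image")
		resultContent = textBlocks.map((item) => item.text).join("\n") || "(tool did not return anything)"
	}

	return [{ type: "tool_result", tool_use_id: toolCallId, content: resultContent }, ...imageBlocks]
}

Given content = [{ type: "text", text: "I see a cat" }, someImageBlock], this yields a tool_result whose content is "I see a cat" followed by the image block — the shape the first test asserts.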