Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
118 changes: 118 additions & 0 deletions src/core/mentions/__tests__/fetchUrlContent.spec.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
// npx vitest core/mentions/__tests__/fetchUrlContent.spec.ts

import axios from "axios"

import { fetchUrlContent } from "../fetchUrlContent"

vi.mock("axios")

// Unit tests for fetchUrlContent. axios is mocked at module level, so every
// test queues exactly one fake response (or rejection) on axios.get before
// calling the function under test.
describe("fetchUrlContent", () => {
// Reset call history between tests so queued one-shot mocks never leak.
beforeEach(() => {
vi.clearAllMocks()
})

// HTML path: text inside <main> is kept, surrounding page chrome is dropped.
it("should fetch and extract text from HTML content", async () => {
vi.mocked(axios.get).mockResolvedValueOnce({
headers: { "content-type": "text/html; charset=utf-8" },
data: `
<html>
<head><title>Test Page</title></head>
<body>
<script>console.log("ignore me")</script>
<style>.ignore { display: none; }</style>
<nav>Navigation links</nav>
<main>
<h1>Hello World</h1>
<p>This is the main content of the page.</p>
</main>
<footer>Footer content</footer>
</body>
</html>
`,
})

const result = await fetchUrlContent("https://example.com")

// The requested URL is echoed back unchanged.
expect(result.url).toBe("https://example.com")
expect(result.content).toContain("Hello World")
expect(result.content).toContain("This is the main content of the page.")
// Script/style/nav/footer should be removed
// (only the script, nav and footer text are asserted on; the <style> rule is not checked here)
expect(result.content).not.toContain("ignore me")
expect(result.content).not.toContain("Navigation links")
expect(result.content).not.toContain("Footer content")
expect(result.truncated).toBe(false)
})

// Non-HTML path: the body is returned verbatim.
it("should return raw text for non-HTML content", async () => {
vi.mocked(axios.get).mockResolvedValueOnce({
headers: { "content-type": "text/plain" },
data: "Plain text content from the URL",
})

const result = await fetchUrlContent("https://example.com/file.txt")

expect(result.content).toBe("Plain text content from the URL")
expect(result.truncated).toBe(false)
})

// JSON is not parsed or reformatted; the string body comes back as-is.
it("should handle JSON content type as raw text", async () => {
vi.mocked(axios.get).mockResolvedValueOnce({
headers: { "content-type": "application/json" },
data: '{"key": "value"}',
})

const result = await fetchUrlContent("https://example.com/api/data")

expect(result.content).toBe('{"key": "value"}')
})

// A 60_000-character body must come back cut to 50_000 characters and flagged.
it("should truncate content that exceeds the max length", async () => {
const longContent = "x".repeat(60_000)
vi.mocked(axios.get).mockResolvedValueOnce({
headers: { "content-type": "text/plain" },
data: longContent,
})

const result = await fetchUrlContent("https://example.com/large")

expect(result.truncated).toBe(true)
expect(result.content.length).toBe(50_000)
})

// Rejections from axios.get are expected to surface to the caller unchanged.
it("should propagate axios errors", async () => {
vi.mocked(axios.get).mockRejectedValueOnce(new Error("Request failed with status code 404"))

await expect(fetchUrlContent("https://example.com/not-found")).rejects.toThrow(
"Request failed with status code 404",
)
})

// HTML without <main>/<article> (only a plain <div>): body text is used instead.
it("should use body as fallback when no main/article element exists", async () => {
vi.mocked(axios.get).mockResolvedValueOnce({
headers: { "content-type": "text/html" },
data: `
<html>
<body>
<div>Some body content without semantic elements</div>
</body>
</html>
`,
})

const result = await fetchUrlContent("https://example.com/simple")

expect(result.content).toContain("Some body content without semantic elements")
})

// An empty headers object must not throw; the body is treated as raw text.
it("should handle missing content-type header", async () => {
vi.mocked(axios.get).mockResolvedValueOnce({
headers: {},
data: "Some raw content",
})

const result = await fetchUrlContent("https://example.com/unknown")

// With no content-type, it falls through to the non-HTML path
expect(result.content).toBe("Some raw content")
})
})
48 changes: 45 additions & 3 deletions src/core/mentions/__tests__/index.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,15 +16,57 @@ vi.mock("../../../i18n", () => ({
t: vi.fn((key: string) => key),
}))

// Mock fetchUrlContent: replace the module's export with a vi.fn() stub that
// resolves to a fixed, non-truncated result by default, so the real
// implementation is not invoked by these tests. Individual tests override the
// default with mockResolvedValueOnce / mockRejectedValueOnce.
vi.mock("../fetchUrlContent", () => ({
fetchUrlContent: vi.fn().mockResolvedValue({
url: "https://example.com",
content: "Example page content here",
truncated: false,
}),
}))

describe("parseMentions - URL mention handling", () => {
beforeEach(() => {
vi.clearAllMocks()
})

it("should replace URL mentions with quoted URL reference", async () => {
it("should replace URL mentions with quoted URL reference indicating content", async () => {
const result = await parseMentions("Check @https://example.com", "/test")

expect(result.text).toContain("'https://example.com' (see below for fetched content)")
})

it("should produce a content block with fetched URL content", async () => {
const result = await parseMentions("Check @https://example.com", "/test")

expect(result.contentBlocks).toHaveLength(1)
expect(result.contentBlocks[0].type).toBe("url")
expect(result.contentBlocks[0].content).toContain("Example page content here")
expect(result.contentBlocks[0].content).toContain("[url_content for 'https://example.com']")
})

it("should handle URL fetch errors gracefully", async () => {
const { fetchUrlContent } = await import("../fetchUrlContent")
vi.mocked(fetchUrlContent).mockRejectedValueOnce(new Error("Network timeout"))

const result = await parseMentions("Check @https://example.com", "/test")

expect(result.contentBlocks).toHaveLength(1)
expect(result.contentBlocks[0].type).toBe("url")
expect(result.contentBlocks[0].content).toContain("Error fetching URL content: Network timeout")
})

it("should indicate truncation when content is truncated", async () => {
const { fetchUrlContent } = await import("../fetchUrlContent")
vi.mocked(fetchUrlContent).mockResolvedValueOnce({
url: "https://example.com",
content: "Truncated content...",
truncated: true,
})

const result = await parseMentions("Check @https://example.com", "/test")

// URL mentions are now replaced with a quoted reference (no fetching)
expect(result.text).toContain("'https://example.com'")
expect(result.contentBlocks).toHaveLength(1)
expect(result.contentBlocks[0].content).toContain("[Content truncated due to length]")
})
})
77 changes: 77 additions & 0 deletions src/core/mentions/fetchUrlContent.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
import axios from "axios"
import * as cheerio from "cheerio"

// Upper bound on the length of the returned text, compared against
// `String.prototype.length` (UTF-16 code units). Longer text is cut to this
// length and reported as truncated.
const MAX_CONTENT_LENGTH = 50_000
// Passed to axios as the request `timeout` option (milliseconds).
const REQUEST_TIMEOUT_MS = 15_000

/** Result returned by `fetchUrlContent`. */
export interface FetchUrlResult {
/** The URL that was passed to `fetchUrlContent`, returned unchanged. */
url: string
/** Extracted text (HTML responses) or raw body (everything else), capped at `MAX_CONTENT_LENGTH`. */
content: string
/** `true` when the text was longer than `MAX_CONTENT_LENGTH` and had to be cut. */
truncated: boolean
}

/**
 * Fetches a URL and returns its readable text content.
 *
 * HTML and XHTML responses are reduced to text with `extractTextFromHtml`;
 * every other content type (plain text, JSON, a missing header, ...) is
 * returned as the raw response body. The resulting text is capped at
 * `MAX_CONTENT_LENGTH` UTF-16 code units.
 *
 * @param url - URL to request.
 * @returns The requested URL, the (possibly truncated) text, and a flag that
 *   tells whether truncation happened.
 * @throws Rejects with whatever error `axios.get` rejects with; nothing is
 *   caught here.
 */
export async function fetchUrlContent(url: string): Promise<FetchUrlResult> {
	const response = await axios.get(url, {
		timeout: REQUEST_TIMEOUT_MS,
		maxRedirects: 5,
		responseType: "text",
		headers: {
			"User-Agent": "Roo-Code/1.0 (URL Context Fetcher)",
			Accept: "text/html, application/xhtml+xml, text/plain, */*",
		},
		// Limit response size to avoid downloading huge files
		maxContentLength: 5 * 1024 * 1024, // 5MB
	})

	// Media types are case-insensitive ("Text/HTML" is valid), and the header
	// may be absent, so coerce to a lower-cased string before matching.
	const contentType = String(response.headers["content-type"] ?? "").toLowerCase()

	// An empty response may surface as null/undefined; treat that as an empty
	// body instead of letting String() produce the literal text "undefined".
	const data: unknown = response.data
	let rawBody: string
	if (typeof data === "string") {
		rawBody = data
	} else if (data == null) {
		rawBody = ""
	} else {
		rawBody = String(data)
	}

	const isHtml = contentType.includes("text/html") || contentType.includes("application/xhtml")
	// For non-HTML content (plain text, JSON, etc.), use the raw body.
	let text = isHtml ? extractTextFromHtml(rawBody) : rawBody

	const truncated = text.length > MAX_CONTENT_LENGTH
	if (truncated) {
		let end = MAX_CONTENT_LENGTH
		// Do not cut between the two halves of a surrogate pair: if the last
		// kept code unit is a high surrogate, drop it so the output never ends
		// in a lone surrogate.
		const lastKept = text.charCodeAt(end - 1)
		if (lastKept >= 0xd800 && lastKept <= 0xdbff) {
			end -= 1
		}
		text = text.slice(0, end)
	}

	return { url, content: text, truncated }
}

/**
 * Extracts readable text from an HTML string using cheerio.
 *
 * Steps:
 * 1. Remove elements that carry no article content (scripts, styles,
 *    navigation, headers/footers, forms, hidden nodes, ...).
 * 2. Pick the main content containers (`main`, `article`, `[role='main']`,
 *    `.content`, `#content`, `.post`, `.article`); fall back to `<body>` when
 *    none match.
 * 3. Read their text and normalise whitespace.
 *
 * @param html - Raw HTML markup.
 * @returns The trimmed text with runs of spaces/tabs collapsed to one space
 *   and at most one blank line between paragraphs.
 */
function extractTextFromHtml(html: string): string {
	const $ = cheerio.load(html)

	// Remove non-content elements
	$(
		"script, style, nav, footer, header, noscript, svg, iframe, form, button, [role='navigation'], [role='banner'], [role='contentinfo'], [aria-hidden='true']",
	).remove()

	// Try to find main content area first
	const contentSelector = "main, article, [role='main'], .content, #content, .post, .article"
	const candidates = $(contentSelector)

	// `.text()` concatenates the text of every matched element *including its
	// descendants*, so a match nested inside another match (e.g. <article>
	// inside <main>) would be emitted twice. Keep only the outermost matches.
	const contentEl =
		candidates.length > 0
			? candidates.filter((_, el) => $(el).parents(contentSelector).length === 0)
			: $("body")

	// Extract text, preserving some structure
	return contentEl
		.text()
		.replace(/\r\n?/g, "\n") // Normalise CRLF / CR line endings
		.replace(/[ \t]+/g, " ") // Collapse horizontal whitespace
		.replace(/ ?\n ?/g, "\n") // Drop spaces around newlines so whitespace-only lines become blank
		.replace(/\n{3,}/g, "\n\n") // Collapse excessive newlines
		.trim()
}
18 changes: 17 additions & 1 deletion src/core/mentions/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import { RooIgnoreController } from "../ignore/RooIgnoreController"
import { getCommand, type Command } from "../../services/command/commands"
import { buildSkillResult, resolveSkillContentForMode, type SkillLookup } from "../../services/skills/skillInvocation"
import type { SkillContent } from "../../shared/skills"
import { fetchUrlContent } from "./fetchUrlContent"

export async function openMention(cwd: string, mention?: string): Promise<void> {
if (!mention) {
Expand Down Expand Up @@ -163,7 +164,7 @@ export async function parseMentions(
parsedText = parsedText.replace(mentionRegexGlobal, (match, mention) => {
mentions.add(mention)
if (mention.startsWith("http")) {
return `'${mention}'`
return `'${mention}' (see below for fetched content)`
} else if (mention.startsWith("/")) {
// Clean path reference - no "see below" since we format like tool results
const mentionPath = mention.slice(1)
Expand Down Expand Up @@ -221,6 +222,21 @@ export async function parseMentions(
} catch (error) {
parsedText += `\n\n<git_commit hash="${mention}">\nError fetching commit info: ${error.message}\n</git_commit>`
}
} else if (mention.startsWith("http")) {
try {
const result = await fetchUrlContent(mention)
const truncationNote = result.truncated ? "\n[Content truncated due to length]" : ""
contentBlocks.push({
type: "url",
content: `[url_content for '${mention}']\n${result.content}${truncationNote}`,
})
} catch (error) {
const errorMsg = error instanceof Error ? error.message : String(error)
contentBlocks.push({
type: "url",
content: `[url_content for '${mention}']\nError fetching URL content: ${errorMsg}`,
})
}
} else if (mention === "terminal") {
try {
const terminalOutput = await getLatestTerminalOutput()
Expand Down
Loading