RooCodeInc · roomote-v0 · Mar 4, 2026
@@ -0,0 +1,118 @@
+// npx vitest core/mentions/__tests__/fetchUrlContent.spec.ts
+
+import axios from "axios"
+
+import { fetchUrlContent } from "../fetchUrlContent"
+
+vi.mock("axios")
+
+// Unit tests for fetchUrlContent. axios is auto-mocked, so every case feeds a
+// canned response (or rejection) through axios.get and inspects the result.
+describe("fetchUrlContent", () => {
+	beforeEach(() => {
+		vi.clearAllMocks()
+	})
+
+	it("should fetch and extract text from HTML content", async () => {
+		vi.mocked(axios.get).mockResolvedValueOnce({
+			headers: { "content-type": "text/html; charset=utf-8" },
+			data: `
+				<html>
+					<head><title>Test Page</title></head>
+					<body>
+						<script>console.log("ignore me")</script>
+						<style>.ignore { display: none; }</style>
+						<nav>Navigation links</nav>
+						<main>
+							<h1>Hello World</h1>
+							<p>This is the main content of the page.</p>
+						</main>
+						<footer>Footer content</footer>
+					</body>
+				</html>
+			`,
+		})
+
+		const result = await fetchUrlContent("https://example.com")
+
+		expect(result.url).toBe("https://example.com")
+		expect(result.content).toContain("Hello World")
+		expect(result.content).toContain("This is the main content of the page.")
+		// Script/style/nav/footer should be removed
+		expect(result.content).not.toContain("ignore me")
+		expect(result.content).not.toContain("display: none")
+		expect(result.content).not.toContain("Navigation links")
+		expect(result.content).not.toContain("Footer content")
+		expect(result.truncated).toBe(false)
+	})
+
+	it("should return raw text for non-HTML content", async () => {
+		vi.mocked(axios.get).mockResolvedValueOnce({
+			headers: { "content-type": "text/plain" },
+			data: "Plain text content from the URL",
+		})
+
+		const result = await fetchUrlContent("https://example.com/file.txt")
+
+		expect(result.content).toBe("Plain text content from the URL")
+		expect(result.truncated).toBe(false)
+	})
+
+	it("should handle JSON content type as raw text", async () => {
+		vi.mocked(axios.get).mockResolvedValueOnce({
+			headers: { "content-type": "application/json" },
+			data: '{"key": "value"}',
+		})
+
+		const result = await fetchUrlContent("https://example.com/api/data")
+
+		expect(result.content).toBe('{"key": "value"}')
+	})
+
+	it("should truncate content that exceeds the max length", async () => {
+		// 60,000 characters is above the 50,000-character cap asserted below.
+		const longContent = "x".repeat(60_000)
+		vi.mocked(axios.get).mockResolvedValueOnce({
+			headers: { "content-type": "text/plain" },
+			data: longContent,
+		})
+
+		const result = await fetchUrlContent("https://example.com/large")
+
+		expect(result.truncated).toBe(true)
+		expect(result.content.length).toBe(50_000)
+	})
+
+	it("should propagate axios errors", async () => {
+		vi.mocked(axios.get).mockRejectedValueOnce(new Error("Request failed with status code 404"))
+
+		await expect(fetchUrlContent("https://example.com/not-found")).rejects.toThrow(
+			"Request failed with status code 404",
+		)
+	})
+
+	it("should use body as fallback when no main/article element exists", async () => {
+		vi.mocked(axios.get).mockResolvedValueOnce({
+			headers: { "content-type": "text/html" },
+			data: `
+				<html>
+					<body>
+						<div>Some body content without semantic elements</div>
+					</body>
+				</html>
+			`,
+		})
+
+		const result = await fetchUrlContent("https://example.com/simple")
+
+		expect(result.content).toContain("Some body content without semantic elements")
+	})
+
+	it("should handle missing content-type header", async () => {
+		vi.mocked(axios.get).mockResolvedValueOnce({
+			headers: {},
+			data: "Some raw content",
+		})
+
+		const result = await fetchUrlContent("https://example.com/unknown")
+
+		// With no content-type, it falls through to the non-HTML path
+		expect(result.content).toBe("Some raw content")
+	})
+})
@@ -16,15 +16,57 @@ vi.mock("../../../i18n", () => ({
 	t: vi.fn((key: string) => key),
 }))
 
+// Mock fetchUrlContent: replace the module with a stub that, by default,
+// resolves to a small, non-truncated result for https://example.com.
+vi.mock("../fetchUrlContent", () => ({
+	fetchUrlContent: vi.fn().mockResolvedValue({
+		url: "https://example.com",
+		content: "Example page content here",
+		truncated: false,
+	}),
+}))
+
+// Covers how parseMentions treats an @https://... mention: the inline text
+// replacement, the "url" content block built from the fetched content, and
+// the error and truncation variants of that block.
 describe("parseMentions - URL mention handling", () => {
 	beforeEach(() => {
 		vi.clearAllMocks()
 	})
 
-	it("should replace URL mentions with quoted URL reference", async () => {
+	it("should replace URL mentions with quoted URL reference indicating content", async () => {
+		const result = await parseMentions("Check @https://example.com", "/test")
+
+		expect(result.text).toContain("'https://example.com' (see below for fetched content)")
+	})
+
+	it("should produce a content block with fetched URL content", async () => {
+		const result = await parseMentions("Check @https://example.com", "/test")
+
+		expect(result.contentBlocks).toHaveLength(1)
+		expect(result.contentBlocks[0].type).toBe("url")
+		expect(result.contentBlocks[0].content).toContain("Example page content here")
+		expect(result.contentBlocks[0].content).toContain("[url_content for 'https://example.com']")
+	})
+
+	it("should handle URL fetch errors gracefully", async () => {
+		// Make the mocked fetcher reject for this one call only.
+		const { fetchUrlContent } = await import("../fetchUrlContent")
+		vi.mocked(fetchUrlContent).mockRejectedValueOnce(new Error("Network timeout"))
+
+		const result = await parseMentions("Check @https://example.com", "/test")
+
+		expect(result.contentBlocks).toHaveLength(1)
+		expect(result.contentBlocks[0].type).toBe("url")
+		expect(result.contentBlocks[0].content).toContain("Error fetching URL content: Network timeout")
+	})
+
+	it("should indicate truncation when content is truncated", async () => {
+		// Make the mocked fetcher report a truncated result for this one call only.
+		const { fetchUrlContent } = await import("../fetchUrlContent")
+		vi.mocked(fetchUrlContent).mockResolvedValueOnce({
+			url: "https://example.com",
+			content: "Truncated content...",
+			truncated: true,
+		})
+
 		const result = await parseMentions("Check @https://example.com", "/test")
 
-		// URL mentions are now replaced with a quoted reference (no fetching)
-		expect(result.text).toContain("'https://example.com'")
+		expect(result.contentBlocks).toHaveLength(1)
+		expect(result.contentBlocks[0].content).toContain("[Content truncated due to length]")
 	})
 })
@@ -0,0 +1,77 @@
+import axios from "axios"
+import * as cheerio from "cheerio"
+
+/** Maximum length (as measured by `string.length`) of the text returned by fetchUrlContent before it is cut. */
+const MAX_CONTENT_LENGTH = 50_000
+/** Value passed as the axios `timeout` option for the request, in milliseconds. */
+const REQUEST_TIMEOUT_MS = 15_000
+
+/** Result returned by `fetchUrlContent`. */
+export interface FetchUrlResult {
+	/** The URL that was requested, exactly as passed in by the caller. */
+	url: string
+	/** Extracted text for HTML responses, or the raw body otherwise; cut to the length cap when `truncated` is true. */
+	content: string
+	/** True when the text exceeded the length cap and was cut. */
+	truncated: boolean
+}
+
+/**
+ * Fetches a URL and returns its readable text content.
+ *
+ * HTML and XHTML responses are reduced to their main text with cheerio (see
+ * `extractTextFromHtml`). Every other response — plain text, JSON, or one
+ * with no Content-Type header — is returned as the raw body. The text is cut
+ * to `MAX_CONTENT_LENGTH` characters when it is longer than that.
+ *
+ * @param url - The URL to request.
+ * @returns The requested URL, the extracted text, and whether it was truncated.
+ * @throws Any rejection from `axios.get` is propagated unchanged; nothing is caught here.
+ */
+export async function fetchUrlContent(url: string): Promise<FetchUrlResult> {
+	const response = await axios.get(url, {
+		timeout: REQUEST_TIMEOUT_MS,
+		maxRedirects: 5,
+		responseType: "text",
+		headers: {
+			"User-Agent": "Roo-Code/1.0 (URL Context Fetcher)",
+			Accept: "text/html, application/xhtml+xml, text/plain, */*",
+		},
+		// Limit response size to avoid downloading huge files
+		maxContentLength: 5 * 1024 * 1024, // 5MB
+	})
+
+	// Media types are case-insensitive and the header may be missing or not a
+	// plain string, so normalise it before matching.
+	const contentType = String(response.headers["content-type"] ?? "").toLowerCase()
+	const rawBody = typeof response.data === "string" ? response.data : String(response.data)
+
+	const isHtml = contentType.includes("text/html") || contentType.includes("application/xhtml")
+	// For non-HTML content (plain text, JSON, etc.), use the raw body as-is.
+	let text = isHtml ? extractTextFromHtml(rawBody) : rawBody
+
+	const truncated = text.length > MAX_CONTENT_LENGTH
+	if (truncated) {
+		text = text.slice(0, MAX_CONTENT_LENGTH)
+		// slice() counts UTF-16 code units; drop a trailing high surrogate so the
+		// cut never leaves half of a surrogate pair at the end.
+		const lastUnit = text.charCodeAt(text.length - 1)
+		if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
+			text = text.slice(0, -1)
+		}
+	}
+
+	return { url, content: text, truncated }
+}
+
+/**
+ * Extracts meaningful text content from an HTML string using cheerio.
+ *
+ * Scripts, styles, navigation and other non-content elements are removed
+ * first. The text is then taken from the main content area (main, article,
+ * [role='main'] and a few common class/id names), or from <body> when no such
+ * element exists. Whitespace is normalised: runs of spaces/tabs become one
+ * space, spaces around line breaks are dropped, and three or more consecutive
+ * line breaks collapse to two.
+ *
+ * @param html - The HTML document source.
+ * @returns The extracted, whitespace-normalised text.
+ */
+function extractTextFromHtml(html: string): string {
+	const $ = cheerio.load(html)
+
+	// Remove non-content elements
+	$(
+		"script, style, nav, footer, header, noscript, svg, iframe, form, button, [role='navigation'], [role='banner'], [role='contentinfo'], [aria-hidden='true']",
+	).remove()
+
+	// Try to find main content area first
+	const mainSelector = "main, article, [role='main'], .content, #content, .post, .article"
+	// Keep only the outermost matches: .text() concatenates the full text of
+	// every matched element, so a match nested inside another match
+	// (e.g. <main><article>…</article></main>) would otherwise be emitted twice.
+	let contentEl = $(mainSelector).filter((_, el) => $(el).parents(mainSelector).length === 0)
+	if (contentEl.length === 0) {
+		contentEl = $("body")
+	}
+
+	// Extract text, preserving some structure
+	return contentEl
+		.text()
+		.replace(/[ \t]+/g, " ") // Collapse horizontal whitespace
+		.replace(/ ?\r?\n ?/g, "\n") // Drop spaces around line breaks so indented blank lines can collapse
+		.replace(/\n{3,}/g, "\n\n") // Collapse excessive newlines
+		.trim()
+}
@@ -19,6 +19,7 @@ import { RooIgnoreController } from "../ignore/RooIgnoreController"
 import { getCommand, type Command } from "../../services/command/commands"
 import { buildSkillResult, resolveSkillContentForMode, type SkillLookup } from "../../services/skills/skillInvocation"
 import type { SkillContent } from "../../shared/skills"
+import { fetchUrlContent } from "./fetchUrlContent"
 
 export async function openMention(cwd: string, mention?: string): Promise<void> {
 	if (!mention) {
@@ -163,7 +164,7 @@ export async function parseMentions(
 	parsedText = parsedText.replace(mentionRegexGlobal, (match, mention) => {
 		mentions.add(mention)
 		if (mention.startsWith("http")) {
-			return `'${mention}'`
+			return `'${mention}' (see below for fetched content)`
 		} else if (mention.startsWith("/")) {
 			// Clean path reference - no "see below" since we format like tool results
 			const mentionPath = mention.slice(1)
@@ -221,6 +222,21 @@ export async function parseMentions(
 			} catch (error) {
 				parsedText += `\n\n<git_commit hash="${mention}">\nError fetching commit info: ${error.message}\n</git_commit>`
 			}
+		} else if (mention.startsWith("http")) {
+			try {
+				const result = await fetchUrlContent(mention)
+				const truncationNote = result.truncated ? "\n[Content truncated due to length]" : ""
+				contentBlocks.push({
+					type: "url",
+					content: `[url_content for '${mention}']\n${result.content}${truncationNote}`,
+				})
+			} catch (error) {
+				const errorMsg = error instanceof Error ? error.message : String(error)
+				contentBlocks.push({
+					type: "url",
+					content: `[url_content for '${mention}']\nError fetching URL content: ${errorMsg}`,
+				})
+			}
 		} else if (mention === "terminal") {
 			try {
 				const terminalOutput = await getLatestTerminalOutput()