diff --git a/package-lock.json b/package-lock.json index d02edbc..2263f4d 100644 --- a/package-lock.json +++ b/package-lock.json @@ -6383,9 +6383,6 @@ "win32" ] }, - "node_modules/sqlite-vec/node_modules/sqlite-vec-linux-arm64": { - "optional": true - }, "node_modules/stackback": { "version": "0.0.2", "resolved": "https://registry.npmjs.org/stackback/-/stackback-0.0.2.tgz", diff --git a/src/connectors/docs.ts b/src/connectors/docs.ts new file mode 100644 index 0000000..eff388f --- /dev/null +++ b/src/connectors/docs.ts @@ -0,0 +1,804 @@ +/** + * Documentation site connector for Sphinx, VitePress, and Doxygen. + * + * Crawls documentation sites, auto-detects the generator, extracts main content, + * and indexes each page with URL-based deduplication. Supports incremental syncs + * via content-hash comparison built into indexDocument(). + */ +import type Database from "better-sqlite3"; +import { NodeHtmlMarkdown } from "node-html-markdown"; +import { ValidationError } from "../errors.js"; +import { getLogger } from "../logger.js"; +import { fetchRaw } from "../core/url-fetcher.js"; +import type { FetchOptions } from "../core/url-fetcher.js"; +import { indexDocument } from "../core/indexing.js"; +import { listDocuments, deleteDocument } from "../core/documents.js"; +import { startSync, completeSync, failSync } from "./sync-tracker.js"; +import type { EmbeddingProvider } from "../providers/embedding.js"; + +// Source type used to tag all docs-connector documents. +// "library" is the closest semantic match in the IndexDocumentInput union. +const SOURCE_TYPE = "library" as const; + +// Internal connector type identifier used in the sync tracker. +const CONNECTOR_TYPE = "docs"; + +const DEFAULT_MAX_PAGES = 500; +const DEFAULT_MAX_DEPTH = 10; +const DEFAULT_CONCURRENCY = 3; + +/** Non-content file extensions that should not be crawled. 
/** Non-content file extensions that should not be crawled. */
const SKIP_EXTENSIONS = new Set([
  "png",
  "jpg",
  "jpeg",
  "gif",
  "svg",
  "ico",
  "webp",
  "pdf",
  "zip",
  "tar",
  "gz",
  "bz2",
  "xz",
  "css",
  "js",
  "mjs",
  "json",
  "xml",
  "woff",
  "woff2",
  "ttf",
  "eot",
  "otf",
  "mp4",
  "mp3",
  "ogg",
  "wav",
  "map",
]);

/**
 * Maximum HTML size to process — input is truncated before regex work to
 * mitigate ReDoS. extractMainContent() compares this against `html.length`,
 * so the unit is UTF-16 code units rather than bytes.
 */
const MAX_HTML_SIZE = 5_000_000;

/** Supported documentation site generators. */
export type DocSiteType = "sphinx" | "vitepress" | "doxygen" | "generic";

/** Matcher for tag attributes — RegExp or predicate function. */
type AttrMatcher = RegExp | ((attrs: string) => boolean);

/** A CSS-like selector expressed as a tag name + attribute matcher. */
interface ContentSelector {
  tag: string;
  attr: AttrMatcher;
}

/** Per-framework detection and content selectors. */
interface FrameworkDef {
  type: DocSiteType;
  /** Returns true if the full HTML matches this framework. */
  detect: (html: string) => boolean;
  contentSelectors: ContentSelector[];
}

/**
 * Pulls the value out of a `class="…"` / `class='…'` attribute.
 *
 * The lookbehind keeps attributes that merely end in "class" (for example
 * `data-class`) from being mistaken for the real class attribute. The value
 * is length-bounded and has a single quantifier, so matching stays linear.
 * The regex is not global, which means exec() keeps no state between calls.
 */
const CLASS_ATTR_RE = /(?<![\w-])class=["']([^"']{0,2000})["']/i;

/**
 * Return a predicate that checks whether a tag's attribute string contains
 * a specific CSS class name.
 *
 * The class attribute is located with one bounded regex and its value is
 * split on whitespace, so the comparison is against whole class tokens
 * ("body" does not match "bodywrapper").
 *
 * @param className        Class token to look for.
 * @param caseInsensitive  Compare tokens case-insensitively (default: false).
 * @returns Predicate taking the raw attribute text of an opening tag.
 */
function classContains(className: string, caseInsensitive = false): (attrs: string) => boolean {
  // Lower-case the needle once instead of once per candidate token.
  const wanted = caseInsensitive ? className.toLowerCase() : className;

  return (attrs: string): boolean => {
    const m = CLASS_ATTR_RE.exec(attrs);
    if (!m?.[1]) return false;
    const classes = m[1].split(/\s+/);
    return caseInsensitive
      ? classes.some((c) => c.toLowerCase() === wanted)
      : classes.includes(wanted);
  };
}
/** Attribute matcher that accepts any opening tag of the selected element. */
const anyAttributes = (): boolean => true;

/** Matches an explicit `role="main"` landmark attribute. */
const roleMain = /role=["']main["']/i;

/**
 * Data-driven framework definitions.
 *
 * Each entry pairs a cheap detector (substring checks and short regexes
 * without backtracking-prone quantifiers, to avoid CodeQL polynomial-regex
 * alerts) with the selectors tried, in order, to find the page's main
 * content. Content selectors use classContains() predicates for the same
 * reason.
 *
 * Entry order is significant: detection stops at the first framework whose
 * detector matches.
 */
const FRAMEWORK_DEFS: readonly FrameworkDef[] = [
  {
    type: "sphinx",
    detect: (html) =>
      /content=["']Sphinx/i.test(html) ||
      html.includes("sphinxsidebar") ||
      html.includes("rst-content") ||
      /class=["']sphinx-[a-z]/i.test(html),
    contentSelectors: [
      { tag: "div", attr: roleMain },
      { tag: "div", attr: classContains("body") },
      { tag: "section", attr: roleMain },
      { tag: "article", attr: anyAttributes },
    ],
  },
  {
    type: "vitepress",
    detect: (html) =>
      /__VITEPRESS_/i.test(html) ||
      html.includes("VPDoc") ||
      html.includes("vp-doc") ||
      /content=["']VitePress/i.test(html),
    contentSelectors: [
      { tag: "div", attr: classContains("vp-doc", true) },
      { tag: "div", attr: classContains("VPDoc") },
      { tag: "main", attr: anyAttributes },
    ],
  },
  {
    type: "doxygen",
    detect: (html) =>
      /Generated by Doxygen/i.test(html) ||
      /content=["']Doxygen/i.test(html) ||
      html.includes("doc-content") ||
      html.includes("doxygen"),
    contentSelectors: [
      { tag: "div", attr: classContains("contents") },
      { tag: "div", attr: /id=["']doc-content["']/ },
      { tag: "div", attr: classContains("textblock") },
    ],
  },
];

/** Fallback selectors for sites that don't match any known framework. */
const GENERIC_CONTENT_SELECTORS: readonly ContentSelector[] = [
  { tag: "main", attr: anyAttributes },
  { tag: "article", attr: anyAttributes },
  { tag: "div", attr: /\bid=["']content["']/ },
  { tag: "div", attr: classContains("content") },
];

/** Configuration for a documentation site sync. */
export interface DocSiteConfig {
  /** Root URL of the documentation site. */
  url: string;

  /** Documentation generator type. Set to "auto" (or omit) for auto-detection. */
  type?: DocSiteType | "auto";

  /** Library name to associate with indexed pages (used for filtering and metadata). */
  library?: string | undefined;

  /** Library version to associate with indexed pages. */
  version?: string | undefined;

  /** Maximum number of pages to crawl (default: 500). */
  maxPages?: number | undefined;

  /** Maximum link depth from the root page (default: 10). */
  maxDepth?: number | undefined;

  /** Maximum number of pages to fetch concurrently (1–10, default: 3). */
  concurrency?: number | undefined;

  /** Allow fetching from private/internal IP addresses (default: false). */
  allowPrivateUrls?: boolean | undefined;

  /** Accept self-signed or untrusted TLS certificates (default: false). */
  allowSelfSignedCerts?: boolean | undefined;

  /** ISO 8601 timestamp of the last sync; reserved for future incremental sync use. */
  lastSync?: string | undefined;

  /**
   * Restrict crawling to URLs whose path starts with this prefix.
   * Defaults to the root URL's pathname (e.g. "/docs/").
   */
  pathPrefix?: string | undefined;
}

/** Result of a documentation site sync. */
export interface DocSiteSyncResult {
  /** Pages newly indexed in this sync. */
  pagesIndexed: number;

  /** Pages that existed before and were re-indexed due to content changes. */
  pagesUpdated: number;

  /** Pages skipped because they are empty or contain no meaningful content. */
  pagesSkipped: number;

  /** The detected (or configured) documentation site type. */
  detectedType: DocSiteType;

  /** Per-page errors encountered during the crawl. */
  errors: Array<{ url: string; error: string }>;
}

// ---------------------------------------------------------------------------
// URL utilities
// ---------------------------------------------------------------------------
/**
 * Canonicalise a URL so that equivalent page addresses compare equal.
 *
 * The fragment is dropped and a trailing slash is removed from every path
 * except the root; scheme, host, path and query are kept. Input that cannot
 * be parsed as an absolute URL is handed back untouched so callers can still
 * use it as a deduplication key.
 */
export function normalizeUrl(rawUrl: string): string {
  let target: URL;
  try {
    target = new URL(rawUrl);
  } catch {
    // Malformed URL — return as-is for deduplication fallback
    return rawUrl;
  }

  target.hash = "";

  const { pathname } = target;
  const hasRedundantSlash = pathname !== "/" && pathname.endsWith("/");
  if (hasRedundantSlash) {
    target.pathname = pathname.slice(0, -1);
  }

  return target.href;
}

// ---------------------------------------------------------------------------
// Site-type detection
// ---------------------------------------------------------------------------

/**
 * Identify which documentation generator produced a page.
 *
 * Runs the detectors in FRAMEWORK_DEFS in declaration order and reports the
 * first one that matches; pages no detector recognises are "generic".
 */
export function detectDocSiteType(html: string): DocSiteType {
  const matched = FRAMEWORK_DEFS.find((candidate) => candidate.detect(html));
  return matched === undefined ? "generic" : matched.type;
}

// ---------------------------------------------------------------------------
// HTML content extraction
// ---------------------------------------------------------------------------
/** Escape regex metacharacters so a tag name can be embedded in a RegExp literally. */
function escapeTagNameForRegExp(tagName: string): string {
  return tagName.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
}

/**
 * Walk HTML from `startPos` and find the end of a balanced `<tag>...</tag>`
 * block using depth counting.
 *
 * `startPos` must point just past an opening tag that has already been
 * consumed, so the depth starts at 1. A single scanner visits opening and
 * closing tags of `tagName` in document order, which makes the walk one
 * linear pass.
 *
 * @returns The index of the matching close tag, or -1 if none is found
 *          (malformed HTML).
 */
function findClosingTagIndex(html: string, tagName: string, startPos: number): number {
  const name = escapeTagNameForRegExp(tagName);
  // Group 1 is "/" for a closing tag and "" for an opening tag. Requiring
  // whitespace or ">" right after the name keeps <divider> from counting
  // as a <div>.
  const tagRe = new RegExp(String.raw`<(/?)${name}(?:\s[^>]{0,2000})?>`, "gi");
  tagRe.lastIndex = startPos;

  let depth = 1;
  let m: RegExpExecArray | null;
  while ((m = tagRe.exec(html)) !== null) {
    if (m[1] === "/") {
      depth--;
      if (depth === 0) return m.index;
    } else {
      depth++;
    }
  }

  return -1;
}

/**
 * Extract the balanced inner HTML of the first element whose opening tag
 * matches `tagName` and whose attribute string matches `attrPattern`.
 *
 * Uses a depth-counting approach so nested elements of the same tag name
 * are handled correctly.
 *
 * @param html         Markup to search.
 * @param tagName      Element name, matched case-insensitively and literally.
 * @param attrPattern  RegExp or predicate applied to the raw attribute text
 *                     of each candidate opening tag ("" when it has none).
 * @returns The inner HTML, or null when no element matches or the first
 *          matching element is never closed.
 */
export function extractElementByPattern(
  html: string,
  tagName: string,
  attrPattern: RegExp | ((attrs: string) => boolean),
): string | null {
  const name = escapeTagNameForRegExp(tagName);
  const scanner = new RegExp(String.raw`<${name}(\s[^>]{0,2000})?>`, "gi");

  let m: RegExpExecArray | null;
  while ((m = scanner.exec(html)) !== null) {
    const attrs = m[1] ?? "";

    let matchesAttr: boolean;
    if (typeof attrPattern === "function") {
      matchesAttr = attrPattern(attrs);
    } else {
      // A global/sticky RegExp remembers lastIndex between test() calls;
      // reset it so every candidate is matched from its start.
      attrPattern.lastIndex = 0;
      matchesAttr = attrPattern.test(attrs);
    }

    if (matchesAttr) {
      const contentStart = m.index + m[0].length;
      const closeIdx = findClosingTagIndex(html, tagName, contentStart);
      return closeIdx === -1 ? null : html.slice(contentStart, closeIdx);
    }
  }

  return null;
}
/**
 * Extract the main documentation content from a page's HTML.
 *
 * Attempts to isolate the primary content container for each site type so
 * that navigation, sidebars, and footers are excluded. Falls back to
 * full-page conversion when no known container is found.
 *
 * HTML is truncated to MAX_HTML_SIZE before regex processing to mitigate ReDoS.
 *
 * @param html      Raw page HTML.
 * @param siteType  Generator type; selects the framework-specific selectors,
 *                  or the generic ones when the type has no entry.
 * @returns The content converted by NodeHtmlMarkdown.translate().
 */
export function extractMainContent(html: string, siteType: DocSiteType): string {
  // Truncate oversized HTML before any regex processing to mitigate ReDoS
  const safeHtml = html.length > MAX_HTML_SIZE ? html.slice(0, MAX_HTML_SIZE) : html;

  const selectors =
    FRAMEWORK_DEFS.find((fw) => fw.type === siteType)?.contentSelectors ??
    GENERIC_CONTENT_SELECTORS;

  // First selector that yields non-empty inner HTML wins.
  let contentHtml: string | null = null;
  for (const sel of selectors) {
    contentHtml = extractElementByPattern(safeHtml, sel.tag, sel.attr);
    if (contentHtml) break;
  }

  return NodeHtmlMarkdown.translate(contentHtml ?? safeHtml, {
    ignore: ["script", "style", "nav"],
  });
}

/**
 * Extract the page title from HTML.
 *
 * Tries (in order): H1 tag, <title> tag, URL-derived fallback.
 *
 * Markup nested inside the H1 is stripped and runs of whitespace are
 * collapsed, so a heading wrapped over several source lines yields a
 * single-line title.
 *
 * @param html  Raw page HTML.
 * @param url   Page URL, used only when the HTML carries no usable title.
 * @returns A non-empty title whenever `url` is non-empty.
 */
export function extractDocTitle(html: string, url: string): string {
  const collapse = (text: string): string => text.replace(/\s+/g, " ").trim();

  // H1 is the most semantically accurate source for documentation pages.
  // Open and close tags are located separately (no single capturing regex
  // spanning the heading) to avoid polynomial backtracking.
  const h1Open = /<h1[^>]{0,2000}>/i.exec(html);
  if (h1Open) {
    const innerStart = h1Open.index + h1Open[0].length;
    // Search the original string case-insensitively. Lower-casing a copy
    // first can change its length (e.g. "İ" becomes two code units), which
    // would make the found index point at the wrong place in `html`.
    const h1CloseRe = /<\/h1\s*>/gi;
    h1CloseRe.lastIndex = innerStart;
    const h1Close = h1CloseRe.exec(html);
    if (h1Close) {
      const title = collapse(
        html.slice(innerStart, h1Close.index).replace(/<[^>]{1,2000}>/g, ""),
      );
      if (title) return title;
    }
  }

  // <title> tag as fallback
  const titleTagMatch = /<title[^>]{0,2000}>([^<]+)<\/title>/i.exec(html);
  if (titleTagMatch?.[1]) {
    const title = collapse(titleTagMatch[1]);
    if (title) return title;
  }

  // Last resort: derive from URL path
  try {
    const parsed = new URL(url);
    const path = parsed.pathname.replace(/\/$/, "");
    const segment = path.split("/").pop();
    if (segment) {
      // "getting-started.html" -> "getting started"
      return segment
        .replace(/-/g, " ")
        .replace(/_/g, " ")
        .replace(/\.\w+$/, "");
    }
    return parsed.hostname;
  } catch {
    // Malformed URL — return raw URL as title
    return url;
  }
}

// ---------------------------------------------------------------------------
// Link extraction
// ---------------------------------------------------------------------------

/**
 * Filtering rules applied when internal anchor links are extracted from a page:
 * - Same origin as the base URL
 * - Path starting with `pathPrefix`
 * - Not a binary/asset file extension
 * - Not fragment-only references
 *
 * Accepted links are returned as normalised absolute URLs.
 */
/** Href values that should not be treated as navigable links. */
const SKIP_SCHEMES = ["#", "mailto:", "javascript:", "data:", "vbscript:"];
/**
 * Resolve and validate a raw href against the base URL constraints.
 *
 * @param raw         Attribute value exactly as it appears in the HTML source.
 * @param baseUrl     URL the href is resolved against.
 * @param baseOrigin  Origin every accepted link must share.
 * @param pathPrefix  Required pathname prefix; an empty string disables the check.
 * @returns The normalised absolute URL, or null when the link is rejected
 *          (skipped scheme, other origin, non-HTTP(S), asset extension,
 *          outside the prefix, or unparseable).
 */
function resolveDocHref(
  raw: string,
  baseUrl: string,
  baseOrigin: string,
  pathPrefix: string,
): string | null {
  // Attribute values are HTML-escaped in the page source: "?a=1&amp;b=2"
  // means "?a=1&b=2". Decode the ampersand forms before resolving, otherwise
  // the literal "amp;" ends up inside the crawled URL.
  const href = raw.trim().replace(/&(?:amp|#38|#x26);/gi, "&");

  if (href === "" || SKIP_SCHEMES.some((s) => href.startsWith(s))) return null;

  try {
    const resolved = new URL(href, baseUrl);
    if (resolved.origin !== baseOrigin) return null;
    if (resolved.protocol !== "http:" && resolved.protocol !== "https:") return null;

    // Text after the last "." of the path; a path with no "." yields the
    // whole path, which is never in SKIP_EXTENSIONS.
    const ext = resolved.pathname.split(".").pop()?.toLowerCase() ?? "";
    if (SKIP_EXTENSIONS.has(ext)) return null;
    if (pathPrefix && !resolved.pathname.startsWith(pathPrefix)) return null;

    return normalizeUrl(resolved.href);
  } catch {
    // Skip unparseable href values
    return null;
  }
}

/**
 * Extract all internal HTML anchor links from a page.
 *
 * Only `<a>` tags with a quoted `href` are considered. Each href is resolved
 * against `baseUrl` and kept when it is on the same origin, under
 * `pathPrefix`, and not a binary/asset or fragment-only reference.
 *
 * @param html        Page markup to scan.
 * @param baseUrl     Absolute URL of the page.
 * @param pathPrefix  Pathname prefix links must start with ("" for no restriction).
 * @returns De-duplicated, normalised absolute URLs in order of first appearance.
 * @throws TypeError when `baseUrl` is not a valid absolute URL.
 */
export function extractDocLinks(html: string, baseUrl: string, pathPrefix: string): string[] {
  const base = new URL(baseUrl);
  const links = new Set<string>();

  // Bounded quantifiers keep the scan linear on hostile markup.
  const hrefRe = /<a\s[^>]{0,2000}\bhref=["']([^"']{1,4000})["'][^>]{0,2000}>/gi;
  let match: RegExpExecArray | null;

  while ((match = hrefRe.exec(html)) !== null) {
    const raw = match[1];
    if (!raw) continue;
    const resolved = resolveDocHref(raw, baseUrl, base.origin, pathPrefix);
    if (resolved) links.add(resolved);
  }

  return [...links];
}

// ---------------------------------------------------------------------------
// Sitemap parsing
// ---------------------------------------------------------------------------
/** The five predefined XML entities, which sitemap `<loc>` values may contain. */
const SITEMAP_XML_ENTITIES: Readonly<Record<string, string>> = {
  amp: "&",
  lt: "<",
  gt: ">",
  quot: '"',
  apos: "'",
};

/** Replace predefined XML entity references with the characters they stand for. */
function decodeSitemapEntities(text: string): string {
  return text.replace(
    /&(amp|lt|gt|quot|apos);/g,
    (whole: string, name: string) => SITEMAP_XML_ENTITIES[name] ?? whole,
  );
}

/**
 * Extract page URLs from a sitemap.xml document.
 *
 * Only returns URLs on the same origin as `baseUrl` and under `pathPrefix`.
 * Binary/asset paths are excluded. Because "xml" is one of the skipped
 * extensions, child sitemaps listed in a sitemap index that end in ".xml"
 * are filtered out here rather than followed.
 *
 * `<loc>` values are XML-escaped in the document, so entity references such
 * as `&amp;` are decoded before the URL is parsed.
 *
 * @param xml         Sitemap document text.
 * @param baseUrl     Absolute URL whose origin the entries must share.
 * @param pathPrefix  Pathname prefix entries must start with ("" for no restriction).
 * @returns De-duplicated, normalised URLs in document order.
 * @throws TypeError when `baseUrl` is not a valid absolute URL.
 */
export function extractSitemapUrls(xml: string, baseUrl: string, pathPrefix: string): string[] {
  const base = new URL(baseUrl);
  const urls: string[] = [];
  const seen = new Set<string>();

  const locRe = /<loc>([^<]+)<\/loc>/gi;
  let match: RegExpExecArray | null;

  while ((match = locRe.exec(xml)) !== null) {
    const raw = match[1]?.trim();
    if (!raw) continue;
    try {
      const parsed = new URL(decodeSitemapEntities(raw));
      if (parsed.origin !== base.origin) continue;
      if (pathPrefix && !parsed.pathname.startsWith(pathPrefix)) continue;

      const ext = parsed.pathname.split(".").pop()?.toLowerCase() ?? "";
      if (SKIP_EXTENSIONS.has(ext)) continue;

      const normalised = normalizeUrl(parsed.href);
      if (!seen.has(normalised)) {
        seen.add(normalised);
        urls.push(normalised);
      }
    } catch {
      // Skip invalid URLs in sitemap
    }
  }

  return urls;
}

// ---------------------------------------------------------------------------
// Internal page processing
// ---------------------------------------------------------------------------

/** Context passed to processPage to avoid a long parameter list. */
interface PageContext {
  siteType: DocSiteType;
  db: Database.Database;
  provider: EmbeddingProvider;
  config: DocSiteConfig;
  /** Map of normalised URL → existing document ID for update detection. */
  existingUrlMap: Map<string, string>;
  /** Counters for the running sync; processPage increments them in place. */
  result: DocSiteSyncResult;
}
/**
 * Index one documentation page and record the outcome on `ctx.result`.
 *
 * Title and main content are extracted from `html`; a page whose content is
 * blank is counted as skipped and never reaches the indexer. Otherwise the
 * page is passed to indexDocument(), which performs the URL-based
 * deduplication, and exactly one counter is bumped:
 *   - pagesIndexed  — the URL was not in `ctx.existingUrlMap`
 *   - pagesSkipped  — the URL was known and indexDocument reported 0 chunks
 *   - pagesUpdated  — the URL was known and chunks were written
 */
async function processPage(url: string, html: string, ctx: PageContext): Promise<void> {
  const log = getLogger();
  const { result, config } = ctx;

  const title = extractDocTitle(html, url);
  const content = extractMainContent(html, ctx.siteType);

  if (content.trim() === "") {
    result.pagesSkipped++;
    log.debug({ url }, "Skipping empty page");
    return;
  }

  const alreadyIndexed = ctx.existingUrlMap.has(normalizeUrl(url));

  const { chunkCount } = await indexDocument(ctx.db, ctx.provider, {
    title,
    content,
    sourceType: SOURCE_TYPE,
    url,
    library: config.library,
    version: config.version,
    submittedBy: "crawler",
  });

  if (!alreadyIndexed) {
    result.pagesIndexed++;
  } else if (chunkCount === 0) {
    // chunkCount === 0 means indexDocument determined the page was unchanged
    result.pagesSkipped++;
  } else {
    result.pagesUpdated++;
  }

  log.debug({ url, title, chunks: chunkCount }, "Processed documentation page");
}

// ---------------------------------------------------------------------------
// Public API
// ---------------------------------------------------------------------------

/**
 * Crawl and index a documentation site.
 *
 * 1. Fetches the root page to auto-detect the site type.
 * 2. Tries to discover all pages via sitemap.xml.
 * 3. Falls back to (or supplements) BFS link crawling.
 * 4. Processes pages concurrently in configurable batches.
 *
 * URL-based deduplication is handled by indexDocument(): unchanged pages
 * are skipped automatically; changed pages are re-indexed in-place.
 */
/**
 * Validate config and return a parsed base URL.
 *
 * @throws ValidationError when `config.url` is missing/blank, cannot be
 *         parsed, or does not use the http or https scheme.
 */
function validateDocSiteConfig(config: DocSiteConfig): URL {
  if (!config.url?.trim()) {
    throw new ValidationError("DocSiteConfig.url is required");
  }
  let baseUrl: URL;
  try {
    baseUrl = new URL(config.url);
  } catch {
    throw new ValidationError(`Invalid URL: ${config.url}`);
  }
  if (baseUrl.protocol !== "http:" && baseUrl.protocol !== "https:") {
    throw new ValidationError(`URL must use http or https scheme: ${config.url}`);
  }
  return baseUrl;
}

/**
 * Resolve a numeric crawl setting: use `fallback` when the value is absent
 * or NaN, drop any fractional part, then clamp to the range 1..max.
 */
function clampCrawlSetting(value: number | undefined, fallback: number, max: number): number {
  const chosen = value === undefined || Number.isNaN(value) ? fallback : Math.trunc(value);
  return Math.max(1, Math.min(chosen, max));
}

/**
 * Discover URLs via sitemap.xml and root page links, populating the BFS queue.
 *
 * Every URL not already in `visited` is appended to `queue` at depth 1 and
 * recorded in `visited`. A sitemap that cannot be fetched is not an error;
 * discovery then relies on the links found in `rootHtml`.
 */
async function discoverUrls(
  config: DocSiteConfig,
  baseUrl: URL,
  rootHtml: string,
  pathPrefix: string,
  fetchOptions: FetchOptions,
  visited: Set<string>,
  queue: Array<{ url: string; depth: number }>,
): Promise<void> {
  const log = getLogger();

  const enqueue = (url: string): void => {
    if (!visited.has(url)) {
      queue.push({ url, depth: 1 });
      visited.add(url);
    }
  };

  // The sitemap is looked up at the origin root, not under the docs path.
  const sitemapUrl = `${baseUrl.origin}/sitemap.xml`;
  try {
    const sitemapRaw = await fetchRaw(sitemapUrl, fetchOptions);
    if (sitemapRaw.contentType.includes("xml") || sitemapRaw.body.includes("<urlset")) {
      const sitemapUrls = extractSitemapUrls(sitemapRaw.body, config.url, pathPrefix);
      sitemapUrls.forEach(enqueue);
      log.info({ count: sitemapUrls.length }, "Discovered URLs from sitemap.xml");
    }
  } catch {
    log.debug({ url: sitemapUrl }, "sitemap.xml unavailable, falling back to link crawling");
  }

  extractDocLinks(rootHtml, config.url, pathPrefix).forEach(enqueue);
}

/**
 * Crawl a documentation site and index every page found.
 *
 * The root page always counts as the first crawled page. `maxPages` limits
 * the number of pages actually fetched (root included); URLs that were only
 * discovered do not consume the budget, so a large sitemap no longer stops
 * the crawl before it starts. Numeric settings that are NaN fall back to
 * their defaults.
 *
 * Failures on individual pages — fetching or indexing — are logged and
 * collected in `errors`; they do not abort the sync.
 *
 * @param db        Database connection.
 * @param provider  Embedding provider handed to the indexer.
 * @param config    Site configuration.
 * @returns Counters, the detected site type, and per-page errors.
 * @throws ValidationError for an invalid `config.url`.
 * @throws Error when the root page cannot be fetched or indexed; the sync is
 *         marked as failed before the error is re-thrown.
 */
export async function syncDocSite(
  db: Database.Database,
  provider: EmbeddingProvider,
  config: DocSiteConfig,
): Promise<DocSiteSyncResult> {
  const log = getLogger();

  const baseUrl = validateDocSiteConfig(config);

  const maxPages = clampCrawlSetting(config.maxPages, DEFAULT_MAX_PAGES, 10_000);
  const maxDepth = clampCrawlSetting(config.maxDepth, DEFAULT_MAX_DEPTH, 100);
  const concurrency = clampCrawlSetting(config.concurrency, DEFAULT_CONCURRENCY, 10);

  const pathPrefix = config.pathPrefix ?? baseUrl.pathname;

  const fetchOptions: FetchOptions = {
    allowPrivateUrls: config.allowPrivateUrls ?? false,
    allowSelfSignedCerts: config.allowSelfSignedCerts ?? false,
  };

  if (fetchOptions.allowPrivateUrls) {
    log.warn({ url: config.url }, "Doc sync with allowPrivateUrls — SSRF protections relaxed");
  }

  const result: DocSiteSyncResult = {
    pagesIndexed: 0,
    pagesUpdated: 0,
    pagesSkipped: 0,
    detectedType: "generic",
    errors: [],
  };

  const syncId = startSync(db, CONNECTOR_TYPE, config.url);

  try {
    // --- Fetch root page ---
    log.info({ url: config.url }, "Fetching documentation root page");

    let rootHtml: string;
    try {
      const raw = await fetchRaw(config.url, fetchOptions);
      rootHtml = raw.body;
    } catch (err) {
      const msg = err instanceof Error ? err.message : String(err);
      throw new Error(`Failed to fetch root page: ${msg}`);
    }

    // --- Detect site type ---
    result.detectedType =
      config.type !== undefined && config.type !== "auto"
        ? config.type
        : detectDocSiteType(rootHtml);

    log.info({ type: result.detectedType, url: config.url }, "Documentation site type");

    // --- URL discovery ---
    // `visited` holds every URL that has been queued (or is the root), so it
    // deduplicates discovery; it is not a count of pages crawled.
    const visited = new Set<string>();
    const queue: Array<{ url: string; depth: number }> = [];
    const rootNormalised = normalizeUrl(config.url);
    visited.add(rootNormalised);

    await discoverUrls(config, baseUrl, rootHtml, pathPrefix, fetchOptions, visited, queue);

    // --- Build existing-URL index for update tracking ---
    const existingDocs = listDocuments(db, { sourceType: SOURCE_TYPE, library: config.library });
    const existingUrlMap = new Map<string, string>(
      existingDocs
        .filter((d): d is typeof d & { url: string } => d.url !== null)
        .map((d) => [normalizeUrl(d.url), d.id]),
    );

    const ctx: PageContext = {
      siteType: result.detectedType,
      db,
      provider,
      config,
      existingUrlMap,
      result,
    };

    // --- Process the root page first ---
    await processPage(rootNormalised, rootHtml, ctx);

    // --- BFS crawl ---
    /** Fetch, index, and expand one queued page; never rejects. */
    const crawlPage = async ({ url, depth }: { url: string; depth: number }): Promise<void> => {
      let html: string;
      let contentType: string;
      try {
        const raw = await fetchRaw(url, fetchOptions);
        html = raw.body;
        contentType = raw.contentType;
      } catch (err) {
        const msg = err instanceof Error ? err.message : String(err);
        log.warn({ url, error: msg }, "Failed to fetch documentation page");
        result.errors.push({ url, error: msg });
        return;
      }

      // Only process HTML pages (skip binary/asset responses that slipped through)
      if (!contentType.includes("text/html") && !contentType.includes("text/plain")) {
        return;
      }

      try {
        await processPage(url, html, ctx);
      } catch (err) {
        // Record indexing failures instead of letting the batch swallow them.
        const msg = err instanceof Error ? err.message : String(err);
        log.warn({ url, error: msg }, "Failed to index documentation page");
        result.errors.push({ url, error: msg });
        return;
      }

      // Continue link discovery if within depth budget
      if (depth < maxDepth) {
        // NOTE(review): `url` has been through normalizeUrl(), which strips a
        // trailing slash, so relative hrefs on a directory-style page resolve
        // one level too high here — confirm whether fetchRaw exposes the
        // final URL so it can be used as the base instead.
        for (const link of extractDocLinks(html, url, pathPrefix)) {
          if (!visited.has(link)) {
            visited.add(link);
            queue.push({ url: link, depth: depth + 1 });
          }
        }
      }
    };

    // The root page has already been fetched and processed.
    let pagesFetched = 1;
    while (queue.length > 0 && pagesFetched < maxPages) {
      // Never take more than the remaining page budget.
      const batch = queue.splice(0, Math.min(concurrency, maxPages - pagesFetched));
      pagesFetched += batch.length;
      await Promise.allSettled(batch.map(crawlPage));
    }

    completeSync(db, syncId, {
      added: result.pagesIndexed,
      updated: result.pagesUpdated,
      deleted: 0,
      errored: result.errors.length,
    });

    log.info(
      {
        pagesIndexed: result.pagesIndexed,
        pagesUpdated: result.pagesUpdated,
        pagesSkipped: result.pagesSkipped,
        errors: result.errors.length,
      },
      "Documentation site sync complete",
    );

    return result;
  } catch (err) {
    const msg = err instanceof Error ? err.message : String(err);
    failSync(db, syncId, msg);
    throw err;
  }
}
/**
 * Remove all documents that were indexed from a given documentation site.
 *
 * Documents are selected by URL prefix (origin + pathname of `siteUrl`).
 * `%`, `_` and `\` in the prefix are escaped so they match literally rather
 * than acting as LIKE wildcards. When the site path ends in "/", the same
 * URL without that slash is matched as well, because the crawler indexes the
 * root page under its normalised (slash-less) URL.
 *
 * @param db      The database connection.
 * @param siteUrl Root URL of the documentation site (used as URL prefix filter).
 * @returns The number of documents deleted.
 * @throws ValidationError when `siteUrl` cannot be parsed as a URL.
 */
export function disconnectDocSite(db: Database.Database, siteUrl: string): number {
  const log = getLogger();

  let basePrefix: string;
  try {
    const parsed = new URL(siteUrl);
    // Use origin + pathname as prefix so query strings and fragments of the
    // supplied URL do not take part in the match.
    basePrefix = parsed.origin + parsed.pathname;
  } catch {
    throw new ValidationError(`Invalid site URL for disconnect: ${siteUrl}`);
  }

  // "https://host/docs/" -> "https://host/docs": the form normalizeUrl()
  // produces for the root page. Equal to basePrefix when there is no
  // trailing slash, which the LIKE below already covers.
  const rootWithoutSlash = basePrefix.replace(/\/+$/, "");

  // Escape LIKE metacharacters; "\" is declared as the escape character below.
  const likePattern = `${basePrefix.replace(/[\\%_]/g, "\\$&")}%`;

  const rows = db
    .prepare("SELECT id FROM documents WHERE url = ? OR url LIKE ? ESCAPE '\\'")
    .all(rootWithoutSlash, likePattern) as Array<{ id: string }>;

  let removed = 0;
  for (const row of rows) {
    try {
      deleteDocument(db, row.id);
      removed++;
    } catch (err) {
      // Best effort: the document may have already been deleted. Keep going,
      // but leave a trace instead of discarding the error.
      const msg = err instanceof Error ? err.message : String(err);
      log.debug({ id: row.id, error: msg }, "Skipping document that could not be deleted");
    }
  }

  log.info({ siteUrl, removed }, "Documentation site disconnected");
  return removed;
}
diff --git a/src/connectors/index.ts b/src/connectors/index.ts
index 2885f8a..fba92f8 100644
--- a/src/connectors/index.ts
+++ b/src/connectors/index.ts
@@ -111,11 +111,11 @@ export function deleteDbConnectorConfig(db: Database.Database, type: string): bo
 const CONNECTORS_DIR = join(homedir(), ".libscope", "connectors");
 
 function ensureConnectorsDir(): void {
-  if (!existsSync(CONNECTORS_DIR)) {
-    mkdirSync(CONNECTORS_DIR, { recursive: true, mode: 0o700 });
-  } else {
+  if (existsSync(CONNECTORS_DIR)) {
     // Remediate existing directories that may have permissive permissions
     restrictPermissions(CONNECTORS_DIR, 0o700);
+  } else {
+    mkdirSync(CONNECTORS_DIR, { recursive: true, mode: 0o700 });
   }
   try {
     chmodSync(CONNECTORS_DIR, 0o700);
@@ -229,3 +229,16 @@ export {
   getApiUrls,
 } from "./confluence.js";
 export type { ConfluenceConfig, ConfluenceSyncResult } from "./confluence.js";
+
+export {
+  syncDocSite,
+  disconnectDocSite,
+  detectDocSiteType,
+  extractDocLinks,
+  extractDocTitle,
+  extractMainContent,
+  extractElementByPattern,
+  extractSitemapUrls,
+  normalizeUrl,
+} from "./docs.js";
+export type {
DocSiteConfig, DocSiteSyncResult, DocSiteType } from "./docs.js"; diff --git a/src/core/index.ts b/src/core/index.ts index 918472e..1e95612 100644 --- a/src/core/index.ts +++ b/src/core/index.ts @@ -212,6 +212,19 @@ export { } from "../connectors/confluence.js"; export type { ConfluenceConfig, ConfluenceSyncResult } from "../connectors/confluence.js"; +export { + syncDocSite, + disconnectDocSite, + detectDocSiteType, + extractDocLinks, + extractDocTitle, + extractMainContent, + extractElementByPattern, + extractSitemapUrls, + normalizeUrl as normalizeDocUrl, +} from "../connectors/docs.js"; +export type { DocSiteConfig, DocSiteSyncResult, DocSiteType } from "../connectors/docs.js"; + export { resolveSelector, bulkDelete, bulkRetag, bulkMove } from "./bulk.js"; export type { BulkSelector, BulkResult } from "./bulk.js"; diff --git a/tests/unit/docs-connector.test.ts b/tests/unit/docs-connector.test.ts new file mode 100644 index 0000000..9a97417 --- /dev/null +++ b/tests/unit/docs-connector.test.ts @@ -0,0 +1,909 @@ +/** + * Unit tests for src/connectors/docs.ts + * + * Tests cover: + * - normalizeUrl + * - detectDocSiteType + * - extractElementByPattern + * - extractMainContent + * - extractDocTitle + * - extractDocLinks + * - extractSitemapUrls + * - syncDocSite (via mocked fetch + indexDocument) + * - disconnectDocSite + */ +import { describe, it, expect, vi, beforeEach, afterEach } from "vitest"; +import { ValidationError } from "../../src/errors.js"; +import { createTestDbWithVec } from "../fixtures/test-db.js"; +import { MockEmbeddingProvider } from "../fixtures/mock-provider.js"; +import { initLogger } from "../../src/logger.js"; +import type Database from "better-sqlite3"; + +// ------------------------------------------------------------------------- +// Mock global fetch so we never make real HTTP calls +// ------------------------------------------------------------------------- +const mockFetch = vi.fn(); +vi.stubGlobal("fetch", mockFetch); + +// Mock dns 
to avoid real DNS lookups from url-fetcher +vi.mock("node:dns", () => ({ + promises: { + resolve4: vi.fn().mockResolvedValue(["93.184.216.34"]), + resolve6: vi.fn().mockResolvedValue([]), + }, + lookup: (_host: string, cb: (err: null, addr: string) => void) => cb(null, "93.184.216.34"), +})); + +// Dynamic import after mocks +const { + normalizeUrl, + detectDocSiteType, + extractElementByPattern, + extractMainContent, + extractDocTitle, + extractDocLinks, + extractSitemapUrls, + syncDocSite, + disconnectDocSite, +} = await import("../../src/connectors/docs.js"); + +// ------------------------------------------------------------------------- +// Helpers +// ------------------------------------------------------------------------- + +function mockResponse(body: string, contentType: string, status = 200): Response { + return { + ok: status >= 200 && status < 300, + status, + headers: new Headers({ "content-type": contentType }), + body: { + getReader: () => { + let done = false; + return { + read: () => { + if (done) return Promise.resolve({ done: true as const, value: undefined }); + done = true; + return Promise.resolve({ done: false as const, value: new TextEncoder().encode(body) }); + }, + cancel: () => Promise.resolve(undefined), + }; + }, + }, + text: () => Promise.resolve(body), + url: "", + redirected: false, + } as unknown as Response; +} + +function htmlResponse(body: string, status = 200): Response { + return mockResponse(body, "text/html; charset=utf-8", status); +} + +function xmlResponse(body: string, status = 200): Response { + return mockResponse(body, "application/xml; charset=utf-8", status); +} + +function notFoundResponse(): Response { + return { + ok: false, + status: 404, + headers: new Headers({ "content-type": "text/html" }), + body: null, + text: () => Promise.resolve("Not Found"), + url: "", + redirected: false, + } as unknown as Response; +} + +// ------------------------------------------------------------------------- +// normalizeUrl +// 
------------------------------------------------------------------------- + +describe("normalizeUrl", () => { + it("strips fragments", () => { + expect(normalizeUrl("https://example.com/docs/page#section")).toBe( + "https://example.com/docs/page", + ); + }); + + it("removes trailing slash from non-root paths", () => { + expect(normalizeUrl("https://example.com/docs/page/")).toBe("https://example.com/docs/page"); + }); + + it("preserves root slash", () => { + expect(normalizeUrl("https://example.com/")).toBe("https://example.com/"); + }); + + it("preserves query strings", () => { + expect(normalizeUrl("https://example.com/docs?v=2")).toBe("https://example.com/docs?v=2"); + }); + + it("handles already normalised URLs unchanged", () => { + const url = "https://example.com/docs/api"; + expect(normalizeUrl(url)).toBe(url); + }); + + it("returns input unchanged when URL is malformed", () => { + expect(normalizeUrl("not-a-url")).toBe("not-a-url"); + }); +}); + +// ------------------------------------------------------------------------- +// detectDocSiteType +// ------------------------------------------------------------------------- + +describe("detectDocSiteType", () => { + it.each([ + [ + "Sphinx meta generator", + '<html><head><meta name="generator" content="Sphinx 5.0"></head></html>', + "sphinx", + ], + ["Sphinx sphinxsidebar class", '<div class="sphinxsidebar"><p>nav</p></div>', "sphinx"], + [ + "Sphinx rst-content class", + '<div class="rst-content"><div role="main">...</div></div>', + "sphinx", + ], + ["Sphinx sphinx- prefixed class", '<div class="sphinx-version">5.0</div>', "sphinx"], + ["VitePress __VITEPRESS_ global", "<script>window.__VITEPRESS_DATA__={}</script>", "vitepress"], + ["VitePress VPDoc class", '<div class="VPDoc"><main>...</main></div>', "vitepress"], + ["VitePress vp-doc class", '<div class="vp-doc"><h1>Title</h1></div>', "vitepress"], + ["VitePress meta content", '<meta name="generator" content="VitePress 1.0">', "vitepress"], + ["Doxygen HTML 
comment", "<!-- Generated by Doxygen 1.9 --><html></html>", "doxygen"], + ["Doxygen meta generator", '<meta name="generator" content="Doxygen 1.9.0">', "doxygen"], + [ + "Doxygen doc-content id", + '<div id="doc-content"><div class="contents">...</div></div>', + "doxygen", + ], + [ + "unknown HTML → generic", + "<html><body><main><p>Some docs</p></main></body></html>", + "generic", + ], + [ + "Sphinx precedence over VitePress", + '<meta name="generator" content="Sphinx 5.0"><div class="vp-doc">overlap</div>', + "sphinx", + ], + ] as const)("detects %s", (_label, html, expected) => { + expect(detectDocSiteType(html)).toBe(expected); + }); +}); + +// ------------------------------------------------------------------------- +// extractElementByPattern +// ------------------------------------------------------------------------- + +describe("extractElementByPattern", () => { + it("extracts content of a simple div by id pattern", () => { + const html = '<div id="content"><p>Hello world</p></div>'; + const result = extractElementByPattern(html, "div", /id=["']content["']/); + expect(result).toBe("<p>Hello world</p>"); + }); + + it("extracts content of a div by class pattern", () => { + const html = '<div class="vp-doc"><h1>Title</h1><p>Body</p></div>'; + const result = extractElementByPattern(html, "div", (attrs) => attrs.includes("vp-doc")); + expect(result).toBe("<h1>Title</h1><p>Body</p>"); + }); + + it("handles nested elements of the same tag name correctly", () => { + const html = + '<div class="main"><div class="inner"><p>inner</p></div><p>outer</p></div><div>other</div>'; + const result = extractElementByPattern(html, "div", /class=["']main["']/); + expect(result).toBe('<div class="inner"><p>inner</p></div><p>outer</p>'); + }); + + it("returns null when no matching element is found", () => { + const html = "<div><p>nothing here</p></div>"; + const result = extractElementByPattern(html, "div", /class=["']vp-doc["']/); + expect(result).toBeNull(); + }); + + 
it("extracts main element with empty attr pattern", () => { + const html = "<html><body><main><p>content</p></main></body></html>"; + const result = extractElementByPattern(html, "main", () => true); + expect(result).toBe("<p>content</p>"); + }); + + it("extracts article element with empty attr pattern", () => { + const html = "<body><article><h1>Doc</h1><p>text</p></article></body>"; + const result = extractElementByPattern(html, "article", () => true); + expect(result).toBe("<h1>Doc</h1><p>text</p>"); + }); + + it("returns null for malformed HTML with unclosed tags", () => { + const html = '<div class="main"><p>unclosed'; + const result = extractElementByPattern(html, "div", /class=["']main["']/); + // Should not throw; returns null or partial result + expect(result === null || typeof result === "string").toBe(true); + }); + + it("finds first match when multiple matching elements exist", () => { + const html = '<div class="body"><p>first</p></div><div class="body"><p>second</p></div>'; + const result = extractElementByPattern(html, "div", /class=["']body["']/); + expect(result).toBe("<p>first</p>"); + }); +}); + +// ------------------------------------------------------------------------- +// extractDocTitle +// ------------------------------------------------------------------------- + +describe("extractDocTitle", () => { + it.each([ + [ + "H1 tag", + "<html><body><h1>Getting Started</h1></body></html>", + "https://example.com/docs/start", + "Getting Started", + ], + [ + "H1 with inner tags stripped", + '<h1><a href="#">API Reference</a></h1>', + "https://example.com/docs/api", + "API Reference", + ], + [ + "<title> fallback", + "<html><head><title>My Library — Docs", + "https://example.com/docs", + "My Library — Docs", + ], + [ + "URL-derived fallback", + "

content

", + "https://example.com/docs/installation", + "installation", + ], + [ + "hyphens to spaces", + "", + "https://example.com/docs/getting-started", + "getting started", + ], + [ + "strip file extension", + "", + "https://example.com/docs/index.html", + "index", + ], + [ + "hostname for empty path", + "", + "https://example.com/", + "example.com", + ], + [ + "H1 precedence over title", + "Page Title

Real Title

", + "https://example.com/page", + "Real Title", + ], + ] as const)("extracts title from %s", (_label, html, url, expected) => { + expect(extractDocTitle(html, url)).toBe(expected); + }); +}); + +// ------------------------------------------------------------------------- +// extractDocLinks +// ------------------------------------------------------------------------- + +describe("extractDocLinks", () => { + const BASE = "https://docs.example.com/docs/"; + + it("extracts absolute same-origin links", () => { + const html = 'API'; + const links = extractDocLinks(html, BASE, "/docs/"); + expect(links).toContain("https://docs.example.com/docs/api"); + }); + + it("resolves relative links against base URL", () => { + const html = 'Getting Started'; + const links = extractDocLinks(html, BASE, "/docs/"); + expect(links).toContain("https://docs.example.com/docs/getting-started"); + }); + + it.each([ + ["different origins", 'External'], + ["fragment-only links", 'Jump'], + ["mailto links", 'Email'], + ["javascript links", 'Click'], + ["data URIs", 'Data'], + ["vbscript links", 'VBS'], + ])("skips %s", (_label, html) => { + expect(extractDocLinks(html, BASE, "/docs/")).toEqual([]); + }); + + it("skips binary asset extensions", () => { + const html = [ + 'PNG', + 'ZIP', + 'CSS', + 'JS', + ].join("\n"); + expect(extractDocLinks(html, BASE, "/docs/")).toEqual([]); + }); + + it("respects pathPrefix to exclude links outside the prefix", () => { + const html = 'In docsBlog'; + const links = extractDocLinks(html, BASE, "/docs/"); + expect(links).toContain("https://docs.example.com/docs/page"); + expect(links).not.toContain("https://docs.example.com/blog/post"); + }); + + it("deduplicates links (normalises URL, strips fragment)", () => { + const html = [ + 'One', + 'Two', + 'Three', + ].join("\n"); + const links = extractDocLinks(html, BASE, "/docs/"); + expect(links.filter((l) => l.includes("/docs/page")).length).toBe(1); + }); + + it("returns empty array when no anchors found", () 
=> { + expect(extractDocLinks("

No links here

", BASE, "/docs/")).toEqual([]); + }); +}); + +// ------------------------------------------------------------------------- +// extractSitemapUrls +// ------------------------------------------------------------------------- + +describe("extractSitemapUrls", () => { + const BASE = "https://docs.example.com/"; + + it("extracts URLs from a simple sitemap", () => { + const xml = ` + + https://docs.example.com/docs/intro + https://docs.example.com/docs/api +`; + const urls = extractSitemapUrls(xml, BASE, "/docs/"); + expect(urls).toContain("https://docs.example.com/docs/intro"); + expect(urls).toContain("https://docs.example.com/docs/api"); + }); + + it("filters out URLs on different origins", () => { + const xml = ` + https://other.com/docs/page + https://docs.example.com/docs/page +`; + const urls = extractSitemapUrls(xml, BASE, "/docs/"); + expect(urls).not.toContain("https://other.com/docs/page"); + expect(urls).toContain("https://docs.example.com/docs/page"); + }); + + it("filters by pathPrefix", () => { + const xml = ` + https://docs.example.com/docs/page + https://docs.example.com/blog/post +`; + const urls = extractSitemapUrls(xml, BASE, "/docs/"); + expect(urls).toContain("https://docs.example.com/docs/page"); + expect(urls).not.toContain("https://docs.example.com/blog/post"); + }); + + it("filters out binary asset URLs", () => { + const xml = ` + https://docs.example.com/docs/image.png + https://docs.example.com/docs/page +`; + const urls = extractSitemapUrls(xml, BASE, "/docs/"); + expect(urls).not.toContain("https://docs.example.com/docs/image.png"); + }); + + it("deduplicates URLs", () => { + const xml = ` + https://docs.example.com/docs/page + https://docs.example.com/docs/page +`; + const urls = extractSitemapUrls(xml, BASE, "/docs/"); + expect(urls.length).toBe(1); + }); + + it("returns empty array for empty sitemap", () => { + const xml = ``; + expect(extractSitemapUrls(xml, BASE, "/docs/")).toEqual([]); + }); +}); + +// 
------------------------------------------------------------------------- +// extractMainContent +// ------------------------------------------------------------------------- + +describe("extractMainContent", () => { + it.each([ + [ + "Sphinx role=main div", + "sphinx" as const, + '

Title

Content

', + ["Title", "Content"], + ], + [ + "VitePress vp-doc div", + "vitepress" as const, + '
nav

API

Details

', + ["API", "Details"], + ], + [ + "Doxygen contents div", + "doxygen" as const, + '

Function Reference

Details

', + ["Function Reference", "Details"], + ], + [ + "generic main element", + "generic" as const, + "
nav

Guide

Text

", + ["Guide", "Text"], + ], + [ + "full-page fallback", + "sphinx" as const, + "

Fallback content

", + ["Fallback content"], + ], + ])("extracts %s", (_label, siteType, html, expected) => { + const result = extractMainContent(html, siteType); + for (const text of expected) { + expect(result).toContain(text); + } + }); + + it("returns non-empty string for any non-empty HTML", () => { + const html = "

Something

"; + const result = extractMainContent(html, "generic"); + expect(result.trim().length).toBeGreaterThan(0); + }); +}); + +// ------------------------------------------------------------------------- +// syncDocSite — validation +// ------------------------------------------------------------------------- + +describe("syncDocSite — validation", () => { + let db: Database.Database; + let provider: MockEmbeddingProvider; + + beforeEach(() => { + initLogger("silent"); + db = createTestDbWithVec(); + provider = new MockEmbeddingProvider(); + }); + + afterEach(() => { + db.close(); + }); + + it("throws ValidationError when url is missing", async () => { + await expect(syncDocSite(db, provider, { url: "" })).rejects.toBeInstanceOf(ValidationError); + }); + + it("throws ValidationError for malformed URL", async () => { + await expect(syncDocSite(db, provider, { url: "not-a-url" })).rejects.toBeInstanceOf( + ValidationError, + ); + }); + + it("throws ValidationError for non-http/https scheme", async () => { + await expect( + syncDocSite(db, provider, { url: "ftp://example.com/docs" }), + ).rejects.toBeInstanceOf(ValidationError); + }); +}); + +// ------------------------------------------------------------------------- +// syncDocSite — integration with mocked fetch +// ------------------------------------------------------------------------- + +describe("syncDocSite — mocked fetch", () => { + let db: Database.Database; + let provider: MockEmbeddingProvider; + + // Implementation order: root page is fetched FIRST, then sitemap.xml, + // then BFS pages. All mock setups must follow this order. + + const SPHINX_ROOT = ` + + + + My Library Docs + + +
+ API + Guide +
+
+

Welcome

+

This is the documentation root.

+
+ + `; + + // Sphinx root page with only one outbound link (for simpler tests) + const SPHINX_ROOT_SIMPLE = ` + + Docs + +

Welcome

This is the documentation root page content.

+ + `; + + const SPHINX_API = ` + + API Reference + +
+

API Reference

+

Function definitions and usage.

+
+ + `; + + beforeEach(() => { + initLogger("silent"); + db = createTestDbWithVec(); + provider = new MockEmbeddingProvider(); + // mockReset clears both call history AND the mockResolvedValueOnce queue, + // preventing mock bleed between tests. + mockFetch.mockReset(); + }); + + afterEach(() => { + db.close(); + }); + + it("indexes the root page and detects Sphinx site type", async () => { + // Order: root, sitemap + mockFetch + .mockResolvedValueOnce(htmlResponse(SPHINX_ROOT_SIMPLE)) // root page + .mockResolvedValueOnce(notFoundResponse()); // sitemap.xml 404 + + const result = await syncDocSite(db, provider, { + url: "https://docs.example.com/docs/", + }); + + expect(result.detectedType).toBe("sphinx"); + expect(result.pagesIndexed).toBe(1); + expect(result.errors).toHaveLength(0); + }); + + it("uses configured type instead of auto-detecting", async () => { + mockFetch + .mockResolvedValueOnce(htmlResponse(SPHINX_ROOT_SIMPLE)) // root + .mockResolvedValueOnce(notFoundResponse()); // sitemap.xml + + const result = await syncDocSite(db, provider, { + url: "https://docs.example.com/docs/", + type: "vitepress", + }); + + expect(result.detectedType).toBe("vitepress"); + }); + + it("crawls pages discovered via link extraction", async () => { + // Order: root, sitemap, api, guide + mockFetch + .mockResolvedValueOnce(htmlResponse(SPHINX_ROOT)) // root (has links to api + guide) + .mockResolvedValueOnce(notFoundResponse()) // sitemap.xml + .mockResolvedValueOnce(htmlResponse(SPHINX_API)) // /docs/api + .mockResolvedValueOnce( + htmlResponse("

Guide

Guide content.

"), + ); // /docs/guide + + const result = await syncDocSite(db, provider, { + url: "https://docs.example.com/docs/", + }); + + // Root + api + guide = 3 pages + expect(result.pagesIndexed).toBe(3); + expect(result.errors).toHaveLength(0); + }); + + it("uses sitemap.xml for URL discovery when available", async () => { + const sitemap = ` + + https://docs.example.com/docs/api +`; + + // Order: root, sitemap (success), api + mockFetch + .mockResolvedValueOnce(htmlResponse(SPHINX_ROOT_SIMPLE)) // root page + .mockResolvedValueOnce(xmlResponse(sitemap)) // sitemap.xml success + .mockResolvedValueOnce(htmlResponse(SPHINX_API)); // /docs/api from sitemap + + const result = await syncDocSite(db, provider, { + url: "https://docs.example.com/docs/", + }); + + expect(result.pagesIndexed).toBeGreaterThanOrEqual(1); + expect(result.errors).toHaveLength(0); + }); + + it("records errors for pages that fail to fetch", async () => { + const rootWithFailingLink = ` + + + +

Root

Intro text content here.

+ Broken + + `; + + // Order: root, sitemap, broken page + mockFetch + .mockResolvedValueOnce(htmlResponse(rootWithFailingLink)) // root + .mockResolvedValueOnce(notFoundResponse()) // sitemap.xml + .mockResolvedValueOnce(notFoundResponse()); // broken page → error + + const result = await syncDocSite(db, provider, { + url: "https://docs.example.com/docs/", + }); + + expect(result.errors.length).toBeGreaterThanOrEqual(1); + expect(result.errors[0]?.url).toContain("/docs/broken"); + }); + + it("skips pages outside pathPrefix", async () => { + const rootWithOutsideLink = ` + + + +

Root

Intro text content here.

+ Blog + API + + `; + + // Order: root, sitemap, api (blog is skipped by pathPrefix) + mockFetch + .mockResolvedValueOnce(htmlResponse(rootWithOutsideLink)) // root + .mockResolvedValueOnce(notFoundResponse()) // sitemap.xml + .mockResolvedValueOnce(htmlResponse(SPHINX_API)); // /docs/api + + const result = await syncDocSite(db, provider, { + url: "https://docs.example.com/docs/", + pathPrefix: "/docs/", + }); + + // Should only have fetched root and /docs/api, not /blog/post + const fetchedUrls = mockFetch.mock.calls.map((c) => c[0] as string); + expect(fetchedUrls.some((u) => u.includes("/blog/"))).toBe(false); + expect(result.errors).toHaveLength(0); + }); + + it("respects maxPages limit", async () => { + const rootWithManyLinks = ` + + + +

Root

Intro content for root page.

+ P1 + P2 + P3 + P4 + P5 + + `; + const pageHtml = (n: number) => + `

Page ${n}

Content for page ${n} of the docs.

`; + + // Order: root, sitemap, then sub-pages (unlimited via mockResolvedValue) + mockFetch + .mockResolvedValueOnce(htmlResponse(rootWithManyLinks)) // root + .mockResolvedValueOnce(notFoundResponse()) // sitemap.xml + .mockResolvedValue(htmlResponse(pageHtml(1))); // all subsequent pages + + const result = await syncDocSite(db, provider, { + url: "https://docs.example.com/docs/", + maxPages: 2, + }); + + // root (1) + up to maxPages (2) = at most 3 total + expect(result.pagesIndexed + result.pagesUpdated + result.pagesSkipped).toBeLessThanOrEqual(3); + }); + + it("skips empty pages and counts them as skipped", async () => { + // A page with a role=main div that has no text content + const emptyPage = `
`; + + mockFetch + .mockResolvedValueOnce(htmlResponse(emptyPage)) // root + .mockResolvedValueOnce(notFoundResponse()); // sitemap.xml + + const result = await syncDocSite(db, provider, { + url: "https://docs.example.com/docs/", + }); + + expect(result.pagesSkipped).toBeGreaterThanOrEqual(1); + expect(result.pagesIndexed).toBe(0); + }); + + it("tags indexed documents with the configured library name", async () => { + mockFetch + .mockResolvedValueOnce(htmlResponse(SPHINX_ROOT_SIMPLE)) // root + .mockResolvedValueOnce(notFoundResponse()); // sitemap.xml + + await syncDocSite(db, provider, { + url: "https://docs.example.com/docs/", + library: "mylib", + version: "2.0", + }); + + const doc = db + .prepare("SELECT library, version FROM documents WHERE url IS NOT NULL LIMIT 1") + .get() as { library: string; version: string } | undefined; + + expect(doc?.library).toBe("mylib"); + expect(doc?.version).toBe("2.0"); + }); + + it("re-indexes changed pages and counts them as updated", async () => { + // First sync — index root + mockFetch + .mockResolvedValueOnce(htmlResponse(SPHINX_ROOT_SIMPLE)) // root + .mockResolvedValueOnce(notFoundResponse()); // sitemap.xml + + await syncDocSite(db, provider, { url: "https://docs.example.com/docs/" }); + + const beforeCount = (db.prepare("SELECT COUNT(*) as n FROM documents").get() as { n: number }) + .n; + expect(beforeCount).toBe(1); + + // Second sync — same URL but different content + const changedRoot = SPHINX_ROOT_SIMPLE.replace( + "documentation root page content.", + "updated documentation page content.", + ); + mockFetch + .mockResolvedValueOnce(htmlResponse(changedRoot)) // root (changed) + .mockResolvedValueOnce(notFoundResponse()); // sitemap.xml + + const result2 = await syncDocSite(db, provider, { + url: "https://docs.example.com/docs/", + }); + + // Should update in-place, not add a new doc + expect(result2.pagesUpdated).toBe(1); + expect(result2.pagesIndexed).toBe(0); + const afterCount = (db.prepare("SELECT COUNT(*) 
as n FROM documents").get() as { n: number }).n; + expect(afterCount).toBe(1); + }); + + it("skips unchanged pages (content-hash match) as skipped", async () => { + mockFetch + .mockResolvedValueOnce(htmlResponse(SPHINX_ROOT_SIMPLE)) // root + .mockResolvedValueOnce(notFoundResponse()); // sitemap.xml + + await syncDocSite(db, provider, { url: "https://docs.example.com/docs/" }); + + // Exact same content — should be skipped on second run + mockFetch + .mockResolvedValueOnce(htmlResponse(SPHINX_ROOT_SIMPLE)) // root (unchanged) + .mockResolvedValueOnce(notFoundResponse()); // sitemap.xml + + const result2 = await syncDocSite(db, provider, { + url: "https://docs.example.com/docs/", + }); + + expect(result2.pagesSkipped).toBe(1); + expect(result2.pagesIndexed).toBe(0); + expect(result2.pagesUpdated).toBe(0); + }); + + it("records sync history in the connector_syncs table", async () => { + mockFetch + .mockResolvedValueOnce(htmlResponse(SPHINX_ROOT_SIMPLE)) // root + .mockResolvedValueOnce(notFoundResponse()); // sitemap.xml + + await syncDocSite(db, provider, { url: "https://docs.example.com/docs/" }); + + const row = db + .prepare("SELECT status, connector_type FROM connector_syncs ORDER BY id DESC LIMIT 1") + .get() as { status: string; connector_type: string } | undefined; + + expect(row?.status).toBe("completed"); + expect(row?.connector_type).toBe("docs"); + }); + + it("throws when root page fetch fails", async () => { + mockFetch.mockResolvedValueOnce(notFoundResponse()); // root 404 + + await expect( + syncDocSite(db, provider, { url: "https://docs.example.com/docs/" }), + ).rejects.toThrow(); + }); + + it("limits concurrency to between 1 and 10", async () => { + mockFetch + .mockResolvedValueOnce(htmlResponse(SPHINX_ROOT_SIMPLE)) // root + .mockResolvedValueOnce(notFoundResponse()); // sitemap.xml + + await expect( + syncDocSite(db, provider, { + url: "https://docs.example.com/docs/", + concurrency: 100, + }), + ).resolves.not.toThrow(); + }); +}); + +// 
------------------------------------------------------------------------- +// disconnectDocSite +// ------------------------------------------------------------------------- + +describe("disconnectDocSite", () => { + let db: Database.Database; + + beforeEach(() => { + initLogger("silent"); + db = createTestDbWithVec(); + vi.clearAllMocks(); + }); + + afterEach(() => { + db.close(); + }); + + it("removes all documents from the given site URL prefix", () => { + // Seed some docs manually + db.prepare( + "INSERT INTO documents (id, source_type, title, content, url) VALUES (?, 'library', ?, ?, ?)", + ).run("doc-1", "Page 1", "Content 1", "https://docs.example.com/docs/page1"); + db.prepare( + "INSERT INTO documents (id, source_type, title, content, url) VALUES (?, 'library', ?, ?, ?)", + ).run("doc-2", "Page 2", "Content 2", "https://docs.example.com/docs/page2"); + db.prepare( + "INSERT INTO documents (id, source_type, title, content, url) VALUES (?, 'library', ?, ?, ?)", + ).run("doc-3", "Other", "Content 3", "https://other.example.com/docs/page"); + + const removed = disconnectDocSite(db, "https://docs.example.com/docs/"); + + expect(removed).toBe(2); + + const remaining = db.prepare("SELECT COUNT(*) as n FROM documents").get() as { n: number }; + expect(remaining.n).toBe(1); // doc-3 should remain + }); + + it("returns 0 when no matching documents exist", () => { + const removed = disconnectDocSite(db, "https://docs.example.com/docs/"); + expect(removed).toBe(0); + }); + + it("throws ValidationError for invalid site URL", () => { + expect(() => disconnectDocSite(db, "not-a-url")).toThrow(ValidationError); + }); + + it("does not remove documents from other sites", () => { + db.prepare( + "INSERT INTO documents (id, source_type, title, content, url) VALUES (?, 'library', ?, ?, ?)", + ).run("doc-1", "Page 1", "Content 1", "https://other.example.com/docs/page"); + + const removed = disconnectDocSite(db, "https://docs.example.com/docs/"); + expect(removed).toBe(0); + + 
const remaining = db.prepare("SELECT COUNT(*) as n FROM documents").get() as { n: number }; + expect(remaining.n).toBe(1); + }); + + it("removes associated chunks", () => { + db.prepare( + "INSERT INTO documents (id, source_type, title, content, url) VALUES (?, 'library', 'Title', 'Body', ?)", + ).run("doc-1", "https://docs.example.com/docs/page"); + db.prepare( + "INSERT INTO chunks (id, document_id, content, chunk_index) VALUES (?, ?, ?, ?)", + ).run("chunk-1", "doc-1", "Chunk content", 0); + + disconnectDocSite(db, "https://docs.example.com/docs/"); + + const chunks = db + .prepare("SELECT COUNT(*) as n FROM chunks WHERE document_id = 'doc-1'") + .get() as { n: number }; + expect(chunks.n).toBe(0); + }); +});