Doc
text
diff --git a/package-lock.json b/package-lock.json
index d02edbc..2263f4d 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -6383,9 +6383,6 @@
"win32"
]
},
- "node_modules/sqlite-vec/node_modules/sqlite-vec-linux-arm64": {
- "optional": true
- },
"node_modules/stackback": {
"version": "0.0.2",
"resolved": "https://registry.npmjs.org/stackback/-/stackback-0.0.2.tgz",
diff --git a/src/connectors/docs.ts b/src/connectors/docs.ts
new file mode 100644
index 0000000..eff388f
--- /dev/null
+++ b/src/connectors/docs.ts
@@ -0,0 +1,804 @@
+/**
+ * Documentation site connector for Sphinx, VitePress, and Doxygen.
+ *
+ * Crawls documentation sites, auto-detects the generator, extracts main content,
+ * and indexes each page with URL-based deduplication. Supports incremental syncs
+ * via content-hash comparison built into indexDocument().
+ */
+import type Database from "better-sqlite3";
+import { NodeHtmlMarkdown } from "node-html-markdown";
+import { ValidationError } from "../errors.js";
+import { getLogger } from "../logger.js";
+import { fetchRaw } from "../core/url-fetcher.js";
+import type { FetchOptions } from "../core/url-fetcher.js";
+import { indexDocument } from "../core/indexing.js";
+import { listDocuments, deleteDocument } from "../core/documents.js";
+import { startSync, completeSync, failSync } from "./sync-tracker.js";
+import type { EmbeddingProvider } from "../providers/embedding.js";
+
+// Source type used to tag all docs-connector documents.
+// "library" is the closest semantic match in the IndexDocumentInput union.
+const SOURCE_TYPE = "library" as const;
+
+// Internal connector type identifier used in the sync tracker.
+const CONNECTOR_TYPE = "docs";
+
+const DEFAULT_MAX_PAGES = 500; // matches the default documented on DocSiteConfig.maxPages; use site not visible here
+const DEFAULT_MAX_DEPTH = 10; // matches the default documented on DocSiteConfig.maxDepth; use site not visible here
+const DEFAULT_CONCURRENCY = 3; // matches the default documented on DocSiteConfig.concurrency; use site not visible here
+
+/**
+ * Non-content file extensions that should not be crawled.
+ *
+ * Entries are lowercase and carry no leading dot, so a lookup presumably has
+ * to lower-case the URL's extension and drop the dot first — the call site is
+ * not visible here; verify against it.
+ */
+const SKIP_EXTENSIONS = new Set([
+ "png",
+ "jpg",
+ "jpeg",
+ "gif",
+ "svg",
+ "ico",
+ "webp",
+ "pdf",
+ "zip",
+ "tar",
+ "gz",
+ "bz2",
+ "xz",
+ "css",
+ "js",
+ "mjs",
+ "json",
+ "xml",
+ "woff",
+ "woff2",
+ "ttf",
+ "eot",
+ "otf",
+ "mp4",
+ "mp3",
+ "ogg",
+ "wav",
+ "map",
+]);
+
+/**
+ * Maximum HTML size (in bytes) to process — truncate before regex to mitigate ReDoS.
+ *
+ * NOTE(review): if the limit is applied to a JavaScript string's length it
+ * counts UTF-16 code units rather than bytes — confirm at the truncation site.
+ */
+const MAX_HTML_SIZE = 5_000_000;
+
+/**
+ * Supported documentation site generators.
+ *
+ * "generic" is the value detectDocSiteType() returns when no known generator
+ * is recognised.
+ */
+export type DocSiteType = "sphinx" | "vitepress" | "doxygen" | "generic";
+
+/**
+ * Matcher for tag attributes — RegExp or predicate function.
+ *
+ * The predicate form receives a tag's attribute text as a single string and
+ * answers whether that tag qualifies.
+ */
+type AttrMatcher = RegExp | ((attrs: string) => boolean);
+
+/**
+ * A CSS-like selector expressed as a tag name + attribute matcher.
+ *
+ * For example `{ tag: "div", attr: classContains("body") }` plays the role of
+ * the CSS selector `div.body`.
+ */
+interface ContentSelector {
+ tag: string; // element name, e.g. "div", "main", "article"
+ attr: AttrMatcher; // test applied to that element's attributes
+}
+
+/**
+ * Per-framework detection and content selectors.
+ *
+ * One entry describes a single documentation generator: how to recognise it
+ * from a page's HTML and which elements may hold its main content.
+ */
+interface FrameworkDef {
+ type: DocSiteType; // generator this entry describes
+ /** Returns true if the full HTML matches this framework. */
+ detect: (html: string) => boolean;
+ contentSelectors: ContentSelector[]; // candidate content containers for this generator
+}
+
+/**
+ * Return a predicate that checks whether a tag's attribute string contains
+ * a specific CSS class name.
+ *
+ * The predicate pulls the first quoted `class=` value out of the attribute
+ * string with a single bounded regex (the value is capped at 2000 characters
+ * and the pattern has no nested or overlapping quantifiers), splits it on
+ * whitespace, and compares whole tokens — so "body" does not match
+ * "body-wrapper".
+ *
+ * NOTE(review): the pattern is not anchored to an attribute boundary, so an
+ * attribute such as `data-class="..."` that appears earlier in the string is
+ * picked up as well; unquoted class values are never matched.
+ *
+ * @param className - Exact class token to look for.
+ * @param caseInsensitive - When true, tokens are compared after lower-casing
+ *   both sides (default: false).
+ * @returns A predicate over a tag's attribute string; it returns false when
+ *   no non-empty quoted class value is found.
+ */
+function classContains(className: string, caseInsensitive = false): (attrs: string) => boolean {
+ return (attrs: string): boolean => {
+ // Extract the class attribute value with a bounded regex (no overlapping quantifiers)
+ const classRe = /class=["']([^"']{0,2000})["']/i;
+ const m = classRe.exec(attrs);
+ if (!m?.[1]) return false;
+ const classes = m[1].split(/\s+/);
+ return caseInsensitive
+ ? classes.some((c) => c.toLowerCase() === className.toLowerCase())
+ : classes.includes(className);
+ };
+}
+
+/**
+ * Data-driven framework definitions.
+ *
+ * Detection uses string-based checks (includes / simple regex without
+ * backtracking-prone quantifiers) to avoid CodeQL polynomial-regex alerts.
+ * Content selectors use classContains() predicates for the same reason.
+ *
+ * Array order is significant: detectDocSiteType() returns the type of the
+ * first entry whose detect() passes, so Sphinx wins over VitePress, which wins
+ * over Doxygen when a page carries markers for more than one generator.
+ *
+ * NOTE(review): several markers are bare, case-sensitive substring checks on
+ * the whole page ("sphinxsidebar", "rst-content", "VPDoc", "vp-doc",
+ * "doc-content", "doxygen"), so prose or code samples that merely mention
+ * them also trigger detection — confirm that is acceptable.
+ * NOTE(review): the selector order inside each entry looks like a priority
+ * list (most specific first); verify against the code that consumes
+ * contentSelectors.
+ */
+const FRAMEWORK_DEFS: readonly FrameworkDef[] = [
+ {
+ type: "sphinx",
+ detect: (html) =>
+ /content=["']Sphinx/i.test(html) ||
+ html.includes("sphinxsidebar") ||
+ html.includes("rst-content") ||
+ /class=["']sphinx-[a-z]/i.test(html),
+ contentSelectors: [
+ { tag: "div", attr: /role=["']main["']/i },
+ { tag: "div", attr: classContains("body") },
+ { tag: "section", attr: /role=["']main["']/i },
+ { tag: "article", attr: () => true },
+ ],
+ },
+ {
+ type: "vitepress",
+ detect: (html) =>
+ /__VITEPRESS_/i.test(html) ||
+ html.includes("VPDoc") ||
+ html.includes("vp-doc") ||
+ /content=["']VitePress/i.test(html),
+ contentSelectors: [
+ { tag: "div", attr: classContains("vp-doc", true) },
+ { tag: "div", attr: classContains("VPDoc") },
+ { tag: "main", attr: () => true },
+ ],
+ },
+ {
+ type: "doxygen",
+ detect: (html) =>
+ /Generated by Doxygen/i.test(html) ||
+ /content=["']Doxygen/i.test(html) ||
+ html.includes("doc-content") ||
+ html.includes("doxygen"),
+ contentSelectors: [
+ { tag: "div", attr: classContains("contents") },
+ { tag: "div", attr: /id=["']doc-content["']/ },
+ { tag: "div", attr: classContains("textblock") },
+ ],
+ },
+];
+
+/**
+ * Fallback selectors for sites that don't match any known framework.
+ *
+ * Covers any main or article element, a div whose id attribute value is
+ * exactly "content", and a div carrying the class token "content".
+ *
+ * NOTE(review): the word boundary in the id pattern also matches right after
+ * a hyphen, so an attribute like `data-id="content"` satisfies it too.
+ */
+const GENERIC_CONTENT_SELECTORS: readonly ContentSelector[] = [
+ { tag: "main", attr: () => true },
+ { tag: "article", attr: () => true },
+ { tag: "div", attr: /\bid=["']content["']/ },
+ { tag: "div", attr: classContains("content") },
+];
+
+/**
+ * Configuration for a documentation site sync.
+ *
+ * Only `url` is required. Every other field may be omitted or passed as
+ * `undefined`; the default noted on each field is the documented fallback.
+ */
+export interface DocSiteConfig {
+ /** Root URL of the documentation site. */
+ url: string;
+ /** Documentation generator type. Set to "auto" (or omit) for auto-detection. */
+ type?: DocSiteType | "auto";
+ /** Library name to associate with indexed pages (used for filtering and metadata). */
+ library?: string | undefined;
+ /** Library version to associate with indexed pages. */
+ version?: string | undefined;
+ /** Maximum number of pages to crawl (default: 500). */
+ maxPages?: number | undefined;
+ /** Maximum link depth from the root page (default: 10). */
+ maxDepth?: number | undefined;
+ /** Maximum number of pages to fetch concurrently (1–10, default: 3). */
+ concurrency?: number | undefined;
+ /** Allow fetching from private/internal IP addresses (default: false). */
+ allowPrivateUrls?: boolean | undefined;
+ /** Accept self-signed or untrusted TLS certificates (default: false). */
+ allowSelfSignedCerts?: boolean | undefined;
+ /** ISO 8601 timestamp of the last sync; reserved for future incremental sync use. */
+ lastSync?: string | undefined;
+ /**
+ * Restrict crawling to URLs whose path starts with this prefix.
+ * Defaults to the root URL's pathname (e.g. "/docs/").
+ */
+ pathPrefix?: string | undefined;
+}
+
+/**
+ * Result of a documentation site sync.
+ *
+ * Carries three per-page counters, the generator type that was detected or
+ * configured, and a list of per-page errors.
+ */
+export interface DocSiteSyncResult {
+ /** Pages newly indexed in this sync. */
+ pagesIndexed: number;
+ /** Pages that existed before and were re-indexed due to content changes. */
+ pagesUpdated: number;
+ /** Pages skipped because they are empty or contain no meaningful content. */
+ pagesSkipped: number;
+ /** The detected (or configured) documentation site type. */
+ detectedType: DocSiteType;
+ /** Per-page errors encountered during the crawl. */
+ errors: Array<{ url: string; error: string }>;
+}
+
+// ---------------------------------------------------------------------------
+// URL utilities
+// ---------------------------------------------------------------------------
+
+/**
+ * Normalise a URL for deduplication: strip the fragment, remove trailing
+ * slash from non-root paths, and keep scheme + host + path + query.
+ *
+ * Only a single trailing slash is removed, and the query string is left
+ * untouched (parameter order is not canonicalised).
+ *
+ * @param rawUrl - URL string to normalise; it must be absolute to be parsed.
+ * @returns The serialised `href` of the parsed URL after the adjustments
+ *   above, or `rawUrl` unchanged when `new URL()` rejects it.
+ */
+export function normalizeUrl(rawUrl: string): string {
+ try {
+ const parsed = new URL(rawUrl);
+ parsed.hash = "";
+ if (parsed.pathname !== "/" && parsed.pathname.endsWith("/")) {
+ parsed.pathname = parsed.pathname.slice(0, -1);
+ }
+ return parsed.href;
+ } catch {
+ // Malformed URL — return as-is for deduplication fallback
+ return rawUrl;
+ }
+}
+
+// ---------------------------------------------------------------------------
+// Site-type detection
+// ---------------------------------------------------------------------------
+
+/**
+ * Detect the documentation generator from the HTML of a page.
+ *
+ * Checks generator meta tags and framework-specific CSS class names
+ * defined in FRAMEWORK_DEFS. Returns "generic" when no known pattern is found.
+ *
+ * Entries are tried in FRAMEWORK_DEFS order and the first one whose detect()
+ * passes decides the result, so a page with markers for several generators
+ * is attributed to the earliest entry.
+ *
+ * @param html - HTML text of the page to inspect.
+ * @returns The matching generator type, or "generic" as the fallback.
+ */
+export function detectDocSiteType(html: string): DocSiteType {
+ for (const fw of FRAMEWORK_DEFS) {
+ if (fw.detect(html)) {
+ return fw.type;
+ }
+ }
+ return "generic";
+}
+
+// ---------------------------------------------------------------------------
+// HTML content extraction
+// ---------------------------------------------------------------------------
+
+/**
+ * Walk HTML from `startPos` and find the end of a balanced `
Some docs
Hello world
Hello world
"); + }); + + it("extracts content of a div by class pattern", () => { + const html = 'Body
Body
"); + }); + + it("handles nested elements of the same tag name correctly", () => { + const html = + 'inner
outer
inner
outer
'); + }); + + it("returns null when no matching element is found", () => { + const html = "nothing here
content
content
"); + }); + + it("extracts article element with empty attr pattern", () => { + const html = "text
text
"); + }); + + it("returns null for malformed HTML with unclosed tags", () => { + const html = 'unclosed'; + const result = extractElementByPattern(html, "div", /class=["']main["']/); + // Should not throw; returns null or partial result + expect(result === null || typeof result === "string").toBe(true); + }); + + it("finds first match when multiple matching elements exist", () => { + const html = '
first
second
first
"); + }); +}); + +// ------------------------------------------------------------------------- +// extractDocTitle +// ------------------------------------------------------------------------- + +describe("extractDocTitle", () => { + it.each([ + [ + "H1 tag", + "content
", + "https://example.com/docs/installation", + "installation", + ], + [ + "hyphens to spaces", + "", + "https://example.com/docs/getting-started", + "getting started", + ], + [ + "strip file extension", + "", + "https://example.com/docs/index.html", + "index", + ], + [ + "hostname for empty path", + "", + "https://example.com/", + "example.com", + ], + [ + "H1 precedence over title", + "No links here
", BASE, "/docs/")).toEqual([]); + }); +}); + +// ------------------------------------------------------------------------- +// extractSitemapUrls +// ------------------------------------------------------------------------- + +describe("extractSitemapUrls", () => { + const BASE = "https://docs.example.com/"; + + it("extracts URLs from a simple sitemap", () => { + const xml = ` +Content
Details
Details
Text
Fallback content
", + ["Fallback content"], + ], + ])("extracts %s", (_label, siteType, html, expected) => { + const result = extractMainContent(html, siteType); + for (const text of expected) { + expect(result).toContain(text); + } + }); + + it("returns non-empty string for any non-empty HTML", () => { + const html = "Something
This is the documentation root.
+This is the documentation root page content.
Function definitions and usage.
+Guide content.
Intro text content here.
Intro text content here.
Intro content for root page.
Content for page ${n} of the docs.