diff --git a/package-lock.json b/package-lock.json
index d02edbc..2263f4d 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -6383,9 +6383,6 @@
         "win32"
       ]
     },
-    "node_modules/sqlite-vec/node_modules/sqlite-vec-linux-arm64": {
-      "optional": true
-    },
     "node_modules/stackback": {
       "version": "0.0.2",
       "resolved": "https://registry.npmjs.org/stackback/-/stackback-0.0.2.tgz",
diff --git a/src/connectors/docs.ts b/src/connectors/docs.ts
new file mode 100644
index 0000000..eff388f
--- /dev/null
+++ b/src/connectors/docs.ts
@@ -0,0 +1,804 @@
+/**
+ * Documentation site connector for Sphinx, VitePress, and Doxygen.
+ *
+ * Crawls documentation sites, auto-detects the generator, extracts main content,
+ * and indexes each page with URL-based deduplication. Supports incremental syncs
+ * via content-hash comparison built into indexDocument().
+ */
+import type Database from "better-sqlite3";
+import { NodeHtmlMarkdown } from "node-html-markdown";
+import { ValidationError } from "../errors.js";
+import { getLogger } from "../logger.js";
+import { fetchRaw } from "../core/url-fetcher.js";
+import type { FetchOptions } from "../core/url-fetcher.js";
+import { indexDocument } from "../core/indexing.js";
+import { listDocuments, deleteDocument } from "../core/documents.js";
+import { startSync, completeSync, failSync } from "./sync-tracker.js";
+import type { EmbeddingProvider } from "../providers/embedding.js";
+
+// Source type used to tag all docs-connector documents.
+// "library" is the closest semantic match in the IndexDocumentInput union.
+const SOURCE_TYPE = "library" as const;
+
+// Internal connector type identifier used in the sync tracker.
+const CONNECTOR_TYPE = "docs";
+
+const DEFAULT_MAX_PAGES = 500;
+const DEFAULT_MAX_DEPTH = 10;
+const DEFAULT_CONCURRENCY = 3;
+
+/** Non-content file extensions that should not be crawled. */
+const SKIP_EXTENSIONS = new Set([
+  "png",
+  "jpg",
+  "jpeg",
+  "gif",
+  "svg",
+  "ico",
+  "webp",
+  "pdf",
+  "zip",
+  "tar",
+  "gz",
+  "bz2",
+  "xz",
+  "css",
+  "js",
+  "mjs",
+  "json",
+  "xml",
+  "woff",
+  "woff2",
+  "ttf",
+  "eot",
+  "otf",
+  "mp4",
+  "mp3",
+  "ogg",
+  "wav",
+  "map",
+]);
+
+/** Maximum HTML length (UTF-16 code units) to process — truncated before regex to limit ReDoS. */
+const MAX_HTML_SIZE = 5_000_000;
+
+/** Supported documentation site generators. */
+export type DocSiteType = "sphinx" | "vitepress" | "doxygen" | "generic";
+
+/** Matcher for tag attributes — RegExp or predicate function. */
+type AttrMatcher = RegExp | ((attrs: string) => boolean);
+
+/** A CSS-like selector expressed as a tag name + attribute matcher. */
+interface ContentSelector {
+  tag: string;
+  attr: AttrMatcher;
+}
+
+/** Per-framework detection and content selectors. */
+interface FrameworkDef {
+  type: DocSiteType;
+  /** Returns true if the full HTML matches this framework. */
+  detect: (html: string) => boolean;
+  contentSelectors: ContentSelector[];
+}
+
+/**
+ * Return a predicate that checks whether a tag's attribute string contains
+ * a specific CSS class name. The class value is captured with one bounded
+ * regex and compared token by token, so no pattern is built from page input.
+ */
+function classContains(className: string, caseInsensitive = false): (attrs: string) => boolean {
+  // Anchor on whitespace/start so `data-class="…"` is not mistaken for `class`.
+  // Compiled once per selector rather than on every tag inspected.
+  const classRe = /(?:^|\s)class\s*=\s*["']([^"']{0,2000})["']/i;
+  const wanted = caseInsensitive ? className.toLowerCase() : className;
+  return (attrs: string): boolean => {
+    const value = classRe.exec(attrs)?.[1];
+    if (!value) return false;
+    const classes = value.split(/\s+/);
+    return classes.some((c) => (caseInsensitive ? c.toLowerCase() : c) === wanted);
+  };
+}
+
+/**
+ * Data-driven framework definitions.
+ *
+ * Detection uses string-based checks (includes / simple regex without
+ * backtracking-prone quantifiers) to avoid CodeQL polynomial-regex alerts.
+ * Content selectors use classContains() predicates for the same reason.
+ */
+const FRAMEWORK_DEFS: readonly FrameworkDef[] = [
+  {
+    type: "sphinx",
+    detect: (html) =>
+      /content=["']Sphinx/i.test(html) ||
+      html.includes("sphinxsidebar") ||
+      html.includes("rst-content") ||
+      /class=["']sphinx-[a-z]/i.test(html),
+    contentSelectors: [
+      { tag: "div", attr: /role=["']main["']/i },
+      { tag: "div", attr: classContains("body") },
+      { tag: "section", attr: /role=["']main["']/i },
+      { tag: "article", attr: () => true },
+    ],
+  },
+  {
+    type: "vitepress",
+    detect: (html) =>
+      /__VITEPRESS_/i.test(html) ||
+      html.includes("VPDoc") ||
+      html.includes("vp-doc") ||
+      /content=["']VitePress/i.test(html),
+    contentSelectors: [
+      { tag: "div", attr: classContains("vp-doc", true) },
+      { tag: "div", attr: classContains("VPDoc") },
+      { tag: "main", attr: () => true },
+    ],
+  },
+  {
+    type: "doxygen",
+    detect: (html) =>
+      /Generated by Doxygen/i.test(html) ||
+      /content=["']Doxygen/i.test(html) ||
+      html.includes("doc-content") ||
+      html.includes("doxygen"),
+    contentSelectors: [
+      { tag: "div", attr: classContains("contents") },
+      { tag: "div", attr: /id=["']doc-content["']/ },
+      { tag: "div", attr: classContains("textblock") },
+    ],
+  },
+];
+
+/** Fallback selectors for sites that don't match any known framework. */
+const GENERIC_CONTENT_SELECTORS: readonly ContentSelector[] = [
+  { tag: "main", attr: () => true },
+  { tag: "article", attr: () => true },
+  { tag: "div", attr: /\bid=["']content["']/ },
+  { tag: "div", attr: classContains("content") },
+];
+
+/** Configuration for a documentation site sync. */
+export interface DocSiteConfig {
+  /** Root URL of the documentation site. */
+  url: string;
+  /** Documentation generator type. Set to "auto" (or omit) for auto-detection. */
+  type?: DocSiteType | "auto";
+  /** Library name to associate with indexed pages (used for filtering and metadata). */
+  library?: string | undefined;
+  /** Library version to associate with indexed pages. */
+  version?: string | undefined;
+  /** Maximum number of pages to crawl (default: 500). */
+  maxPages?: number | undefined;
+  /** Maximum link depth from the root page (default: 10). */
+  maxDepth?: number | undefined;
+  /** Maximum number of pages to fetch concurrently (1–10, default: 3). */
+  concurrency?: number | undefined;
+  /** Allow fetching from private/internal IP addresses (default: false). */
+  allowPrivateUrls?: boolean | undefined;
+  /** Accept self-signed or untrusted TLS certificates (default: false). */
+  allowSelfSignedCerts?: boolean | undefined;
+  /** ISO 8601 timestamp of the last sync; reserved for future incremental sync use. */
+  lastSync?: string | undefined;
+  /**
+   * Restrict crawling to URLs whose path starts with this prefix.
+   * Defaults to the root URL's pathname (e.g. "/docs/").
+   */
+  pathPrefix?: string | undefined;
+}
+
+/** Result of a documentation site sync. */
+export interface DocSiteSyncResult {
+  /** Pages newly indexed in this sync. */
+  pagesIndexed: number;
+  /** Pages that existed before and were re-indexed due to content changes. */
+  pagesUpdated: number;
+  /** Pages skipped because they are empty or contain no meaningful content. */
+  pagesSkipped: number;
+  /** The detected (or configured) documentation site type. */
+  detectedType: DocSiteType;
+  /** Per-page errors encountered during the crawl. */
+  errors: Array<{ url: string; error: string }>;
+}
+
+// ---------------------------------------------------------------------------
+// URL utilities
+// ---------------------------------------------------------------------------
+
+/**
+ * Canonicalise a URL for deduplication: drop the fragment and strip a trailing
+ * slash from every path except "/". Unparseable input is returned unchanged.
+ */
+export function normalizeUrl(rawUrl: string): string {
+  let target: URL;
+  try {
+    target = new URL(rawUrl);
+  } catch {
+    // Malformed URL — nothing to canonicalise, so echo the input back.
+    return rawUrl;
+  }
+  target.hash = "";
+  const path = target.pathname;
+  if (path !== "/" && path.endsWith("/")) target.pathname = path.slice(0, -1);
+  return target.href;
+}
+
+// ---------------------------------------------------------------------------
+// Site-type detection
+// ---------------------------------------------------------------------------
+
+/**
+ * Detect the documentation generator from the HTML of a page.
+ *
+ * Checks generator meta tags and framework-specific CSS class names
+ * defined in FRAMEWORK_DEFS. Returns "generic" when no known pattern is found.
+ */
+export function detectDocSiteType(html: string): DocSiteType {
+  // Detectors run in FRAMEWORK_DEFS order; the first positive match wins.
+  const matched = FRAMEWORK_DEFS.find((fw) => fw.detect(html));
+  if (matched === undefined) {
+    return "generic";
+  }
+  return matched.type;
+}
+
+// ---------------------------------------------------------------------------
+// HTML content extraction
+// ---------------------------------------------------------------------------
+
+/**
+ * Walk HTML from `startPos` and find the end of a balanced `<tagName>...</tagName>`
+ * block using depth counting. Returns the index of the matching close tag,
+ * or -1 if none is found (malformed HTML). Each regex hit is cached until the
+ * cursor passes it, which avoids quadratic rescans on deeply nested input.
+ */
+function findClosingTagIndex(html: string, tagName: string, startPos: number): number {
+  const openRe = new RegExp(String.raw`<${tagName}(?:\s[^>]{0,2000})?>`, "gi");
+  const closeRe = new RegExp(`</${tagName}>`, "gi");
+  const findFrom = (re: RegExp, from: number): RegExpExecArray | null => {
+    re.lastIndex = from;
+    return re.exec(html);
+  };
+
+  let depth = 1;
+  let pos = startPos;
+  let nextOpen = findFrom(openRe, pos);
+  let nextClose = findFrom(closeRe, pos);
+  while (nextClose !== null) {
+    if (nextOpen !== null && nextOpen.index < nextClose.index) {
+      depth++;
+      pos = nextOpen.index + nextOpen[0].length;
+    } else {
+      depth--;
+      if (depth === 0) return nextClose.index;
+      pos = nextClose.index + nextClose[0].length;
+    }
+    // Re-run a matcher only when its cached hit has fallen behind the cursor.
+    if (nextOpen !== null && nextOpen.index < pos) nextOpen = findFrom(openRe, pos);
+    if (nextClose.index < pos) nextClose = findFrom(closeRe, pos);
+  }
+  return -1;
+}
+
+/**
+ * Extract the balanced inner HTML of the first element whose opening tag
+ * matches `tagName` and whose attribute string matches `attrPattern`.
+ *
+ * Uses a depth-counting approach so nested elements of the same tag name
+ * are handled correctly.  Returns null when no matching element is found.
+ */
+export function extractElementByPattern(
+  html: string,
+  tagName: string,
+  attrPattern: AttrMatcher,
+): string | null {
+  // Fold both matcher flavours (RegExp or predicate) into a single callable.
+  const accepts = (attrs: string): boolean =>
+    typeof attrPattern === "function" ? attrPattern(attrs) : attrPattern.test(attrs);
+  const openTagRe = new RegExp(String.raw`<(${tagName})(\s[^>]{0,2000})?>`, "gi");
+
+  for (let hit = openTagRe.exec(html); hit !== null; hit = openTagRe.exec(html)) {
+    if (!accepts(hit[2] ?? "")) continue;
+    // The first opening tag that satisfies the matcher decides the outcome.
+    const innerStart = hit.index + hit[0].length;
+    const innerEnd = findClosingTagIndex(html, tagName, innerStart);
+    if (innerEnd === -1) return null;
+    return html.slice(innerStart, innerEnd);
+  }
+
+  return null;
+}
+
+/**
+ * Extract the main documentation content from a page's HTML.
+ *
+ * Attempts to isolate the primary content container for each site type so
+ * that navigation, sidebars, and footers are excluded.  Falls back to
+ * full-page conversion when no known container is found.
+ *
+ * HTML is truncated to MAX_HTML_SIZE before regex processing to mitigate ReDoS.
+ */
+export function extractMainContent(html: string, siteType: DocSiteType): string {
+  // Cap the input first so every regex below runs over a bounded string.
+  const bounded = html.length > MAX_HTML_SIZE ? html.slice(0, MAX_HTML_SIZE) : html;
+
+  const framework = FRAMEWORK_DEFS.find((fw) => fw.type === siteType);
+  const candidates = framework?.contentSelectors ?? GENERIC_CONTENT_SELECTORS;
+
+  // Selectors are ordered by preference; the first non-empty hit is used.
+  let fragment: string | null = null;
+  for (const { tag, attr } of candidates) {
+    fragment = extractElementByPattern(bounded, tag, attr);
+    if (fragment) break;
+  }
+
+  return NodeHtmlMarkdown.translate(fragment ?? bounded, {
+    ignore: ["script", "style", "nav"],
+  });
+}
+
+/**
+ * Extract the page title from HTML.
+ *
+ * Tries (in order): H1 tag, <title> tag, URL-derived fallback.
+ */
+export function extractDocTitle(html: string, url: string): string {
+  // H1 is the most semantically accurate source for documentation pages.
+  // Open/close tags are matched separately with bounded patterns (no backtracking).
+  const h1Open = /<h1[^>]{0,2000}>/i.exec(html);
+  if (h1Open) {
+    const innerStart = h1Open.index + h1Open[0].length;
+    // Match on the original string: a lowercased copy can differ in length ("İ").
+    const h1CloseRe = /<\/h1>/gi;
+    h1CloseRe.lastIndex = innerStart;
+    const h1Close = h1CloseRe.exec(html);
+    if (h1Close) {
+      const title = html
+        .slice(innerStart, h1Close.index)
+        .replaceAll(/<[^>]{1,2000}>/g, "")
+        .trim();
+      if (title) return title;
+    }
+  }
+
+  // <title> tag as fallback
+  const titleTagMatch = /<title[^>]{0,2000}>([^<]+)<\/title>/i.exec(html);
+  if (titleTagMatch?.[1]) {
+    const title = titleTagMatch[1].trim();
+    if (title) return title;
+  }
+
+  // Last resort: derive from URL path
+  try {
+    const parsed = new URL(url);
+    const path = parsed.pathname.replace(/\/$/, "");
+    const segment = path.split("/").pop();
+    if (segment) {
+      return segment.replaceAll("-", " ").replaceAll("_", " ").replace(/\.\w+$/, "");
+    }
+    return parsed.hostname;
+  } catch {
+    // Malformed URL — return raw URL as title
+    return url;
+  }
+}
+
+// ---------------------------------------------------------------------------
+// Link extraction
+// ---------------------------------------------------------------------------
+
+/** Href values that should not be treated as navigable links. */
+const SKIP_SCHEMES = ["#", "mailto:", "javascript:", "data:", "vbscript:"];
+
+/** Resolve a raw href against the base URL; null when it falls outside the crawl scope. */
+function resolveDocHref(
+  raw: string,
+  baseUrl: string,
+  baseOrigin: string,
+  pathPrefix: string,
+): string | null {
+  if (SKIP_SCHEMES.some((s) => raw.startsWith(s))) return null;
+
+  try {
+    const resolved = new URL(raw, baseUrl);
+    if (resolved.origin !== baseOrigin) return null;
+    if (resolved.protocol !== "http:" && resolved.protocol !== "https:") return null;
+
+    const ext = resolved.pathname.split(".").pop()?.toLowerCase() ?? "";
+    if (SKIP_EXTENSIONS.has(ext)) return null;
+    if (pathPrefix && !resolved.pathname.startsWith(pathPrefix)) return null;
+
+    return normalizeUrl(resolved.href);
+  } catch {
+    // Skip unparseable href values
+    return null;
+  }
+}
+
+/**
+ * Extract all internal HTML anchor links from a page.
+ *
+ * Filters links to:
+ * - Same origin as the base URL
+ * - Path starting with `pathPrefix`
+ * - Not a binary/asset file extension
+ * - Not fragment-only references
+ *
+ * Returns an array of normalised absolute URLs.
+ */
+export function extractDocLinks(html: string, baseUrl: string, pathPrefix: string): string[] {
+  const base = new URL(baseUrl);
+  const links = new Set<string>();
+
+  // \shref, not \bhref: attributes like data-href must not be read as the target.
+  const hrefRe = /<a(?=\s)[^>]{0,2000}?\shref\s*=\s*["']([^"']{1,4000})["'][^>]{0,2000}>/gi;
+
+  for (const match of html.matchAll(hrefRe)) {
+    const raw = match[1];
+    if (!raw) continue;
+    const resolved = resolveDocHref(raw, baseUrl, base.origin, pathPrefix);
+    if (resolved) links.add(resolved);
+  }
+
+  return [...links];
+}
+
+// ---------------------------------------------------------------------------
+// Sitemap parsing
+// ---------------------------------------------------------------------------
+
+/**
+ * Extract page URLs from a sitemap.xml (or sitemap index) document.
+ *
+ * Only returns URLs on the same origin as `baseUrl` and under `pathPrefix`.
+ * Binary/asset paths are excluded.
+ */
+export function extractSitemapUrls(xml: string, baseUrl: string, pathPrefix: string): string[] {
+  const { origin } = new URL(baseUrl);
+  // A Set keeps first-seen order, giving de-duplication and ordering in one.
+  const accepted = new Set<string>();
+
+  for (const entry of xml.matchAll(/<loc>([^<]+)<\/loc>/gi)) {
+    const candidate = entry[1]?.trim();
+    if (!candidate) continue;
+
+    let location: URL;
+    try {
+      location = new URL(candidate);
+    } catch {
+      // Skip invalid URLs in sitemap
+      continue;
+    }
+
+    // Scope filter: same origin and, when a prefix is set, under that path.
+    const { pathname } = location;
+    const outOfScope =
+      location.origin !== origin || (pathPrefix !== "" && !pathname.startsWith(pathPrefix));
+    if (outOfScope) continue;
+
+    const extension = pathname.split(".").pop()?.toLowerCase() ?? "";
+    if (SKIP_EXTENSIONS.has(extension)) continue;
+
+    accepted.add(normalizeUrl(location.href));
+  }
+
+  return [...accepted];
+}
+
+// ---------------------------------------------------------------------------
+// Internal page processing
+// ---------------------------------------------------------------------------
+
+/** Context passed to processPage to avoid a long parameter list. */
+interface PageContext {
+  siteType: DocSiteType;
+  db: Database.Database;
+  provider: EmbeddingProvider;
+  config: DocSiteConfig;
+  /** Map of normalised URL → existing document ID for update detection. */
+  existingUrlMap: Map<string, string>;
+  result: DocSiteSyncResult;
+}
+
+/**
+ * Process a single documentation page: extract title + content, then index.
+ *
+ * indexDocument() handles URL-based dedup automatically: if the URL already
+ * exists and the content hash is unchanged the call is a no-op; if the hash
+ * changed the old document is replaced.
+ */
+async function processPage(url: string, html: string, ctx: PageContext): Promise<void> {
+  const log = getLogger();
+  const { db, provider, config, result } = ctx;
+
+  const title = extractDocTitle(html, url);
+  const content = extractMainContent(html, ctx.siteType);
+
+  if (content.trim() === "") {
+    result.pagesSkipped++;
+    log.debug({ url }, "Skipping empty page");
+    return;
+  }
+
+  const seenBefore = ctx.existingUrlMap.has(normalizeUrl(url));
+
+  const { chunkCount } = await indexDocument(db, provider, {
+    title,
+    content,
+    sourceType: SOURCE_TYPE,
+    url,
+    library: config.library,
+    version: config.version,
+    submittedBy: "crawler",
+  });
+
+  // Known URL + zero chunks is read as "unchanged"; otherwise updated or new.
+  if (!seenBefore) {
+    result.pagesIndexed++;
+  } else if (chunkCount === 0) {
+    result.pagesSkipped++;
+  } else {
+    result.pagesUpdated++;
+  }
+
+  log.debug({ url, title, chunks: chunkCount }, "Processed documentation page");
+}
+
+// ---------------------------------------------------------------------------
+// Public API
+// ---------------------------------------------------------------------------
+
+/** Validate config and return a parsed base URL. */
+function validateDocSiteConfig(config: DocSiteConfig): URL {
+  if (!config.url?.trim()) {
+    throw new ValidationError("DocSiteConfig.url is required");
+  }
+  let baseUrl: URL;
+  try {
+    baseUrl = new URL(config.url);
+  } catch {
+    throw new ValidationError(`Invalid URL: ${config.url}`);
+  }
+  if (baseUrl.protocol !== "http:" && baseUrl.protocol !== "https:") {
+    throw new ValidationError(`URL must use http or https scheme: ${config.url}`);
+  }
+  return baseUrl;
+}
+
+/** Discover URLs via sitemap.xml and root page links, populating the BFS queue. */
+async function discoverUrls(
+  config: DocSiteConfig,
+  baseUrl: URL,
+  rootHtml: string,
+  pathPrefix: string,
+  fetchOptions: FetchOptions,
+  visited: Set<string>,
+  queue: Array<{ url: string; depth: number }>,
+): Promise<void> {
+  const log = getLogger();
+
+  const sitemapUrl = `${baseUrl.origin}/sitemap.xml`;
+  try {
+    const sitemapRaw = await fetchRaw(sitemapUrl, fetchOptions);
+    if (sitemapRaw.contentType.includes("xml") || sitemapRaw.body.includes("<urlset")) {
+      const sitemapUrls = extractSitemapUrls(sitemapRaw.body, config.url, pathPrefix);
+      for (const u of sitemapUrls) {
+        if (!visited.has(u)) {
+          queue.push({ url: u, depth: 1 });
+          visited.add(u);
+        }
+      }
+      log.info({ count: sitemapUrls.length }, "Discovered URLs from sitemap.xml");
+    }
+  } catch {
+    log.debug({ url: sitemapUrl }, "sitemap.xml unavailable, falling back to link crawling");
+  }
+
+  for (const link of extractDocLinks(rootHtml, config.url, pathPrefix)) {
+    if (!visited.has(link)) {
+      queue.push({ url: link, depth: 1 });
+      visited.add(link);
+    }
+  }
+}
+
+/**
+ * Crawl and index a documentation site.
+ *
+ * 1. Fetches the root page and detects the site type (unless configured).
+ * 2. Seeds the queue from sitemap.xml and from links on the root page.
+ * 3. Crawls breadth-first in batches of `concurrency`, fetching at most
+ *    `maxPages` pages (root included) and following links up to `maxDepth`.
+ * 4. Records per-page fetch and indexing failures in `errors` and carries on.
+ */
+export async function syncDocSite(
+  db: Database.Database,
+  provider: EmbeddingProvider,
+  config: DocSiteConfig,
+): Promise<DocSiteSyncResult> {
+  const log = getLogger();
+
+  const baseUrl = validateDocSiteConfig(config);
+
+  const maxPages = Math.max(1, Math.min(config.maxPages ?? DEFAULT_MAX_PAGES, 10_000));
+  const maxDepth = Math.max(1, Math.min(config.maxDepth ?? DEFAULT_MAX_DEPTH, 100));
+  const concurrency = Math.max(1, Math.min(config.concurrency ?? DEFAULT_CONCURRENCY, 10));
+
+  const pathPrefix = config.pathPrefix ?? baseUrl.pathname;
+
+  const fetchOptions: FetchOptions = {
+    allowPrivateUrls: config.allowPrivateUrls ?? false,
+    allowSelfSignedCerts: config.allowSelfSignedCerts ?? false,
+  };
+
+  if (fetchOptions.allowPrivateUrls) {
+    log.warn({ url: config.url }, "Doc sync with allowPrivateUrls — SSRF protections relaxed");
+  }
+
+  const result: DocSiteSyncResult = {
+    pagesIndexed: 0,
+    pagesUpdated: 0,
+    pagesSkipped: 0,
+    detectedType: "generic",
+    errors: [],
+  };
+
+  // Shared sink for per-page failures so none is silently dropped.
+  const recordError = (url: string, err: unknown, logMessage: string): void => {
+    const msg = err instanceof Error ? err.message : String(err);
+    log.warn({ url, error: msg }, logMessage);
+    result.errors.push({ url, error: msg });
+  };
+
+  const syncId = startSync(db, CONNECTOR_TYPE, config.url);
+
+  try {
+    // --- Fetch root page ---
+    log.info({ url: config.url }, "Fetching documentation root page");
+
+    let rootHtml: string;
+    try {
+      const raw = await fetchRaw(config.url, fetchOptions);
+      rootHtml = raw.body;
+    } catch (err) {
+      const msg = err instanceof Error ? err.message : String(err);
+      throw new Error(`Failed to fetch root page: ${msg}`);
+    }
+
+    // --- Detect site type ---
+    result.detectedType =
+      config.type && config.type !== "auto" ? config.type : detectDocSiteType(rootHtml);
+
+    log.info({ type: result.detectedType, url: config.url }, "Documentation site type");
+
+    // --- URL discovery ---
+    const visited = new Set<string>();
+    const queue: Array<{ url: string; depth: number }> = [];
+    const rootNormalised = normalizeUrl(config.url);
+    visited.add(rootNormalised);
+
+    await discoverUrls(config, baseUrl, rootHtml, pathPrefix, fetchOptions, visited, queue);
+
+    // --- Build existing-URL index for update tracking ---
+    const existingDocs = listDocuments(db, { sourceType: SOURCE_TYPE, library: config.library });
+    const existingUrlMap = new Map<string, string>(
+      existingDocs
+        .filter((d): d is typeof d & { url: string } => d.url !== null)
+        .map((d) => [normalizeUrl(d.url), d.id]),
+    );
+
+    const ctx: PageContext = {
+      siteType: result.detectedType,
+      db,
+      provider,
+      config,
+      existingUrlMap,
+      result,
+    };
+
+    // --- Process the root page first ---
+    await processPage(rootNormalised, rootHtml, ctx);
+
+    // --- BFS crawl ---
+    // Budget by pages actually fetched (root included). `visited` also holds
+    // URLs that are only queued, so its size must not gate the crawl.
+    let pagesFetched = 1;
+    while (queue.length > 0 && pagesFetched < maxPages) {
+      const batch = queue.splice(0, Math.min(concurrency, maxPages - pagesFetched));
+      pagesFetched += batch.length;
+      await Promise.allSettled(
+        batch.map(async ({ url, depth }) => {
+          let html: string;
+          let contentType: string;
+          try {
+            const raw = await fetchRaw(url, fetchOptions);
+            html = raw.body;
+            contentType = raw.contentType;
+          } catch (err) {
+            recordError(url, err, "Failed to fetch documentation page");
+            return;
+          }
+
+          // Only process HTML pages (skip binary/asset responses that slipped through)
+          if (!contentType.includes("text/html") && !contentType.includes("text/plain")) return;
+
+          try {
+            await processPage(url, html, ctx);
+          } catch (err) {
+            // allSettled would otherwise discard this rejection unreported.
+            recordError(url, err, "Failed to index documentation page");
+            return;
+          }
+
+          // Continue link discovery if within depth budget
+          if (depth < maxDepth) {
+            for (const link of extractDocLinks(html, url, pathPrefix)) {
+              if (!visited.has(link)) {
+                visited.add(link);
+                queue.push({ url: link, depth: depth + 1 });
+              }
+            }
+          }
+        }),
+      );
+    }
+
+    completeSync(db, syncId, {
+      added: result.pagesIndexed,
+      updated: result.pagesUpdated,
+      deleted: 0,
+      errored: result.errors.length,
+    });
+
+    const { pagesIndexed, pagesUpdated, pagesSkipped } = result;
+    const summary = { pagesIndexed, pagesUpdated, pagesSkipped, errors: result.errors.length };
+    log.info(summary, "Documentation site sync complete");
+
+    return result;
+  } catch (err) {
+    const msg = err instanceof Error ? err.message : String(err);
+    failSync(db, syncId, msg);
+    throw err;
+  }
+}
+
+/**
+ * Remove all documents that were indexed from a given documentation site.
+ * A document matches when its URL equals the site root (trailing slash
+ * ignored) or extends it with "/" or "?"; LIKE wildcards are escaped.
+ *
+ * @param db      The database connection.
+ * @param siteUrl Root URL of the documentation site (used as URL prefix filter).
+ * @returns       The number of documents deleted.
+ */
+export function disconnectDocSite(db: Database.Database, siteUrl: string): number {
+  const log = getLogger();
+  let root: string;
+  try {
+    const parsed = new URL(siteUrl);
+    // The crawler stores URLs without a trailing slash, so drop it to match the root page.
+    root = (parsed.origin + parsed.pathname).replace(/\/$/, "");
+  } catch {
+    throw new ValidationError(`Invalid site URL for disconnect: ${siteUrl}`);
+  }
+
+  // Escape LIKE metacharacters ("_" is common in URLs and matches any character).
+  const escaped = root.replace(/[\\%_]/g, "\\$&");
+  const like = "url LIKE ? ESCAPE '\\'";
+  const rows = db
+    .prepare(`SELECT id FROM documents WHERE url = ? OR ${like} OR ${like}`)
+    .all(root, `${escaped}/%`, `${escaped}?%`) as Array<{ id: string }>;
+
+  let removed = 0;
+  for (const row of rows) {
+    try {
+      deleteDocument(db, row.id);
+      removed++;
+    } catch {
+      // Document may have already been deleted
+    }
+  }
+
+  log.info({ siteUrl, removed }, "Documentation site disconnected");
+  return removed;
+}
diff --git a/src/connectors/index.ts b/src/connectors/index.ts
index 2885f8a..fba92f8 100644
--- a/src/connectors/index.ts
+++ b/src/connectors/index.ts
@@ -111,11 +111,11 @@ export function deleteDbConnectorConfig(db: Database.Database, type: string): bo
 const CONNECTORS_DIR = join(homedir(), ".libscope", "connectors");
 
 function ensureConnectorsDir(): void {
-  if (!existsSync(CONNECTORS_DIR)) {
-    mkdirSync(CONNECTORS_DIR, { recursive: true, mode: 0o700 });
-  } else {
+  if (existsSync(CONNECTORS_DIR)) {
     // Remediate existing directories that may have permissive permissions
     restrictPermissions(CONNECTORS_DIR, 0o700);
+  } else {
+    mkdirSync(CONNECTORS_DIR, { recursive: true, mode: 0o700 });
   }
   try {
     chmodSync(CONNECTORS_DIR, 0o700);
@@ -229,3 +229,16 @@ export {
   getApiUrls,
 } from "./confluence.js";
 export type { ConfluenceConfig, ConfluenceSyncResult } from "./confluence.js";
+
+export {
+  syncDocSite,
+  disconnectDocSite,
+  detectDocSiteType,
+  extractDocLinks,
+  extractDocTitle,
+  extractMainContent,
+  extractElementByPattern,
+  extractSitemapUrls,
+  normalizeUrl,
+} from "./docs.js";
+export type { DocSiteConfig, DocSiteSyncResult, DocSiteType } from "./docs.js";
diff --git a/src/core/index.ts b/src/core/index.ts
index 918472e..1e95612 100644
--- a/src/core/index.ts
+++ b/src/core/index.ts
@@ -212,6 +212,19 @@ export {
 } from "../connectors/confluence.js";
 export type { ConfluenceConfig, ConfluenceSyncResult } from "../connectors/confluence.js";
 
+export {
+  syncDocSite,
+  disconnectDocSite,
+  detectDocSiteType,
+  extractDocLinks,
+  extractDocTitle,
+  extractMainContent,
+  extractElementByPattern,
+  extractSitemapUrls,
+  normalizeUrl as normalizeDocUrl,
+} from "../connectors/docs.js";
+export type { DocSiteConfig, DocSiteSyncResult, DocSiteType } from "../connectors/docs.js";
+
 export { resolveSelector, bulkDelete, bulkRetag, bulkMove } from "./bulk.js";
 export type { BulkSelector, BulkResult } from "./bulk.js";
 
diff --git a/tests/unit/docs-connector.test.ts b/tests/unit/docs-connector.test.ts
new file mode 100644
index 0000000..9a97417
--- /dev/null
+++ b/tests/unit/docs-connector.test.ts
@@ -0,0 +1,909 @@
+/**
+ * Unit tests for src/connectors/docs.ts
+ *
+ * Tests cover:
+ *  - normalizeUrl
+ *  - detectDocSiteType
+ *  - extractElementByPattern
+ *  - extractMainContent
+ *  - extractDocTitle
+ *  - extractDocLinks
+ *  - extractSitemapUrls
+ *  - syncDocSite (via mocked fetch + indexDocument)
+ *  - disconnectDocSite
+ */
+import { describe, it, expect, vi, beforeEach, afterEach } from "vitest";
+import { ValidationError } from "../../src/errors.js";
+import { createTestDbWithVec } from "../fixtures/test-db.js";
+import { MockEmbeddingProvider } from "../fixtures/mock-provider.js";
+import { initLogger } from "../../src/logger.js";
+import type Database from "better-sqlite3";
+
+// -------------------------------------------------------------------------
+// Mock global fetch so we never make real HTTP calls
+// -------------------------------------------------------------------------
+const mockFetch = vi.fn();
+vi.stubGlobal("fetch", mockFetch);
+
+// Mock dns to avoid real DNS lookups from url-fetcher
+vi.mock("node:dns", () => ({
+  promises: {
+    resolve4: vi.fn().mockResolvedValue(["93.184.216.34"]),
+    resolve6: vi.fn().mockResolvedValue([]),
+  },
+  lookup: (_host: string, cb: (err: null, addr: string) => void) => cb(null, "93.184.216.34"),
+}));
+
+// Dynamic import after mocks
+const {
+  normalizeUrl,
+  detectDocSiteType,
+  extractElementByPattern,
+  extractMainContent,
+  extractDocTitle,
+  extractDocLinks,
+  extractSitemapUrls,
+  syncDocSite,
+  disconnectDocSite,
+} = await import("../../src/connectors/docs.js");
+
+// -------------------------------------------------------------------------
+// Helpers
+// -------------------------------------------------------------------------
+
+function mockResponse(body: string, contentType: string, status = 200): Response {
+  return {
+    ok: status >= 200 && status < 300,
+    status,
+    headers: new Headers({ "content-type": contentType }),
+    body: {
+      getReader: () => {
+        let done = false;
+        return {
+          read: () => {
+            if (done) return Promise.resolve({ done: true as const, value: undefined });
+            done = true;
+            return Promise.resolve({ done: false as const, value: new TextEncoder().encode(body) });
+          },
+          cancel: () => Promise.resolve(undefined),
+        };
+      },
+    },
+    text: () => Promise.resolve(body),
+    url: "",
+    redirected: false,
+  } as unknown as Response;
+}
+
+function htmlResponse(body: string, status = 200): Response {
+  return mockResponse(body, "text/html; charset=utf-8", status);
+}
+
+function xmlResponse(body: string, status = 200): Response {
+  return mockResponse(body, "application/xml; charset=utf-8", status);
+}
+
+function notFoundResponse(): Response {
+  return {
+    ok: false,
+    status: 404,
+    headers: new Headers({ "content-type": "text/html" }),
+    body: null,
+    text: () => Promise.resolve("Not Found"),
+    url: "",
+    redirected: false,
+  } as unknown as Response;
+}
+
+// -------------------------------------------------------------------------
+// normalizeUrl
+// -------------------------------------------------------------------------
+
+describe("normalizeUrl", () => {
+  it("strips fragments", () => {
+    expect(normalizeUrl("https://example.com/docs/page#section")).toBe(
+      "https://example.com/docs/page",
+    );
+  });
+
+  it("removes trailing slash from non-root paths", () => {
+    expect(normalizeUrl("https://example.com/docs/page/")).toBe("https://example.com/docs/page");
+  });
+
+  it("preserves root slash", () => {
+    expect(normalizeUrl("https://example.com/")).toBe("https://example.com/");
+  });
+
+  it("preserves query strings", () => {
+    expect(normalizeUrl("https://example.com/docs?v=2")).toBe("https://example.com/docs?v=2");
+  });
+
+  it("handles already normalised URLs unchanged", () => {
+    const url = "https://example.com/docs/api";
+    expect(normalizeUrl(url)).toBe(url);
+  });
+
+  it("returns input unchanged when URL is malformed", () => {
+    expect(normalizeUrl("not-a-url")).toBe("not-a-url");
+  });
+});
+
+// -------------------------------------------------------------------------
+// detectDocSiteType
+// -------------------------------------------------------------------------
+
+describe("detectDocSiteType", () => {
+  it.each([
+    [
+      "Sphinx meta generator",
+      '<html><head><meta name="generator" content="Sphinx 5.0"></head></html>',
+      "sphinx",
+    ],
+    ["Sphinx sphinxsidebar class", '<div class="sphinxsidebar"><p>nav</p></div>', "sphinx"],
+    [
+      "Sphinx rst-content class",
+      '<div class="rst-content"><div role="main">...</div></div>',
+      "sphinx",
+    ],
+    ["Sphinx sphinx- prefixed class", '<div class="sphinx-version">5.0</div>', "sphinx"],
+    ["VitePress __VITEPRESS_ global", "<script>window.__VITEPRESS_DATA__={}</script>", "vitepress"],
+    ["VitePress VPDoc class", '<div class="VPDoc"><main>...</main></div>', "vitepress"],
+    ["VitePress vp-doc class", '<div class="vp-doc"><h1>Title</h1></div>', "vitepress"],
+    ["VitePress meta content", '<meta name="generator" content="VitePress 1.0">', "vitepress"],
+    ["Doxygen HTML comment", "<!-- Generated by Doxygen 1.9 --><html></html>", "doxygen"],
+    ["Doxygen meta generator", '<meta name="generator" content="Doxygen 1.9.0">', "doxygen"],
+    [
+      "Doxygen doc-content id",
+      '<div id="doc-content"><div class="contents">...</div></div>',
+      "doxygen",
+    ],
+    [
+      "unknown HTML → generic",
+      "<html><body><main><p>Some docs</p></main></body></html>",
+      "generic",
+    ],
+    [
+      "Sphinx precedence over VitePress",
+      '<meta name="generator" content="Sphinx 5.0"><div class="vp-doc">overlap</div>',
+      "sphinx",
+    ],
+  ] as const)("detects %s", (_label, html, expected) => {
+    expect(detectDocSiteType(html)).toBe(expected);
+  });
+});
+
+// -------------------------------------------------------------------------
+// extractElementByPattern
+// -------------------------------------------------------------------------
+
+describe("extractElementByPattern", () => {
+  it("extracts content of a simple div by id pattern", () => {
+    const html = '<div id="content"><p>Hello world</p></div>';
+    const result = extractElementByPattern(html, "div", /id=["']content["']/);
+    expect(result).toBe("<p>Hello world</p>");
+  });
+
+  it("extracts content of a div by class pattern", () => {
+    const html = '<div class="vp-doc"><h1>Title</h1><p>Body</p></div>';
+    const result = extractElementByPattern(html, "div", (attrs) => attrs.includes("vp-doc"));
+    expect(result).toBe("<h1>Title</h1><p>Body</p>");
+  });
+
+  it("handles nested elements of the same tag name correctly", () => {
+    const html =
+      '<div class="main"><div class="inner"><p>inner</p></div><p>outer</p></div><div>other</div>';
+    const result = extractElementByPattern(html, "div", /class=["']main["']/);
+    expect(result).toBe('<div class="inner"><p>inner</p></div><p>outer</p>');
+  });
+
+  it("returns null when no matching element is found", () => {
+    const html = "<div><p>nothing here</p></div>";
+    const result = extractElementByPattern(html, "div", /class=["']vp-doc["']/);
+    expect(result).toBeNull();
+  });
+
+  it("extracts main element with empty attr pattern", () => {
+    const html = "<html><body><main><p>content</p></main></body></html>";
+    const result = extractElementByPattern(html, "main", () => true);
+    expect(result).toBe("<p>content</p>");
+  });
+
+  it("extracts article element with empty attr pattern", () => {
+    const html = "<body><article><h1>Doc</h1><p>text</p></article></body>";
+    const result = extractElementByPattern(html, "article", () => true);
+    expect(result).toBe("<h1>Doc</h1><p>text</p>");
+  });
+
+  it("returns null for malformed HTML with unclosed tags", () => {
+    const html = '<div class="main"><p>unclosed';
+    const result = extractElementByPattern(html, "div", /class=["']main["']/);
+    // Should not throw; returns null or partial result
+    expect(result === null || typeof result === "string").toBe(true);
+  });
+
+  it("finds first match when multiple matching elements exist", () => {
+    const html = '<div class="body"><p>first</p></div><div class="body"><p>second</p></div>';
+    const result = extractElementByPattern(html, "div", /class=["']body["']/);
+    expect(result).toBe("<p>first</p>");
+  });
+});
+
+// -------------------------------------------------------------------------
+// extractDocTitle
+// -------------------------------------------------------------------------
+
+describe("extractDocTitle", () => {
+  it.each([
+    [
+      "H1 tag",
+      "<html><body><h1>Getting Started</h1></body></html>",
+      "https://example.com/docs/start",
+      "Getting Started",
+    ],
+    [
+      "H1 with inner tags stripped",
+      '<h1><a href="#">API Reference</a></h1>',
+      "https://example.com/docs/api",
+      "API Reference",
+    ],
+    [
+      "<title> fallback",
+      "<html><head><title>My Library — Docs</title></head><body></body></html>",
+      "https://example.com/docs",
+      "My Library — Docs",
+    ],
+    [
+      "URL-derived fallback",
+      "<html><body><p>content</p></body></html>",
+      "https://example.com/docs/installation",
+      "installation",
+    ],
+    [
+      "hyphens to spaces",
+      "<html><body></body></html>",
+      "https://example.com/docs/getting-started",
+      "getting started",
+    ],
+    [
+      "strip file extension",
+      "<html><body></body></html>",
+      "https://example.com/docs/index.html",
+      "index",
+    ],
+    [
+      "hostname for empty path",
+      "<html><body></body></html>",
+      "https://example.com/",
+      "example.com",
+    ],
+    [
+      "H1 precedence over title",
+      "<html><head><title>Page Title</title></head><body><h1>Real Title</h1></body></html>",
+      "https://example.com/page",
+      "Real Title",
+    ],
+  ] as const)("extracts title from %s", (_label, html, url, expected) => {
+    expect(extractDocTitle(html, url)).toBe(expected);
+  });
+});
+
+// -------------------------------------------------------------------------
+// extractDocLinks
+// -------------------------------------------------------------------------
+
+describe("extractDocLinks", () => {
+  const BASE = "https://docs.example.com/docs/";
+
+  it("extracts absolute same-origin links", () => {
+    const html = '<a href="https://docs.example.com/docs/api">API</a>';
+    const links = extractDocLinks(html, BASE, "/docs/");
+    expect(links).toContain("https://docs.example.com/docs/api");
+  });
+
+  it("resolves relative links against base URL", () => {
+    const html = '<a href="getting-started">Getting Started</a>';
+    const links = extractDocLinks(html, BASE, "/docs/");
+    expect(links).toContain("https://docs.example.com/docs/getting-started");
+  });
+
+  it.each([
+    ["different origins", '<a href="https://other.com/page">External</a>'],
+    ["fragment-only links", '<a href="#section">Jump</a>'],
+    ["mailto links", '<a href="mailto:user@example.com">Email</a>'],
+    ["javascript links", '<a href="javascript:void(0)">Click</a>'],
+    ["data URIs", '<a href="data:text/html,<h1>hi</h1>">Data</a>'],
+    ["vbscript links", '<a href="vbscript:MsgBox">VBS</a>'],
+  ])("skips %s", (_label, html) => {
+    expect(extractDocLinks(html, BASE, "/docs/")).toEqual([]);
+  });
+
+  it("skips binary asset extensions", () => {
+    const html = [
+      '<a href="/docs/logo.png">PNG</a>',
+      '<a href="/docs/download.zip">ZIP</a>',
+      '<a href="/docs/styles.css">CSS</a>',
+      '<a href="/docs/bundle.js">JS</a>',
+    ].join("\n");
+    expect(extractDocLinks(html, BASE, "/docs/")).toEqual([]);
+  });
+
+  it("respects pathPrefix to exclude links outside the prefix", () => {
+    const html = '<a href="/docs/page">In docs</a><a href="/blog/post">Blog</a>';
+    const links = extractDocLinks(html, BASE, "/docs/");
+    expect(links).toContain("https://docs.example.com/docs/page");
+    expect(links).not.toContain("https://docs.example.com/blog/post");
+  });
+
+  it("deduplicates links (normalises URL, strips fragment)", () => {
+    const html = [
+      '<a href="/docs/page">One</a>',
+      '<a href="/docs/page/">Two</a>',
+      '<a href="/docs/page#section">Three</a>',
+    ].join("\n");
+    const links = extractDocLinks(html, BASE, "/docs/");
+    expect(links.filter((l) => l.includes("/docs/page")).length).toBe(1);
+  });
+
+  it("returns empty array when no anchors found", () => {
+    expect(extractDocLinks("<p>No links here</p>", BASE, "/docs/")).toEqual([]);
+  });
+});
+
+// -------------------------------------------------------------------------
+// extractSitemapUrls
+// -------------------------------------------------------------------------
+
+describe("extractSitemapUrls", () => {
+  const BASE = "https://docs.example.com/";
+
+  it("extracts URLs from a simple sitemap", () => {
+    const xml = `<?xml version="1.0"?>
+<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+  <url><loc>https://docs.example.com/docs/intro</loc></url>
+  <url><loc>https://docs.example.com/docs/api</loc></url>
+</urlset>`;
+    const urls = extractSitemapUrls(xml, BASE, "/docs/");
+    expect(urls).toContain("https://docs.example.com/docs/intro");
+    expect(urls).toContain("https://docs.example.com/docs/api");
+  });
+
+  it("filters out URLs on different origins", () => {
+    const xml = `<urlset>
+  <url><loc>https://other.com/docs/page</loc></url>
+  <url><loc>https://docs.example.com/docs/page</loc></url>
+</urlset>`;
+    const urls = extractSitemapUrls(xml, BASE, "/docs/");
+    expect(urls).not.toContain("https://other.com/docs/page");
+    expect(urls).toContain("https://docs.example.com/docs/page");
+  });
+
+  it("filters by pathPrefix", () => {
+    const xml = `<urlset>
+  <url><loc>https://docs.example.com/docs/page</loc></url>
+  <url><loc>https://docs.example.com/blog/post</loc></url>
+</urlset>`;
+    const urls = extractSitemapUrls(xml, BASE, "/docs/");
+    expect(urls).toContain("https://docs.example.com/docs/page");
+    expect(urls).not.toContain("https://docs.example.com/blog/post");
+  });
+
+  it("filters out binary asset URLs", () => {
+    const xml = `<urlset>
+  <url><loc>https://docs.example.com/docs/image.png</loc></url>
+  <url><loc>https://docs.example.com/docs/page</loc></url>
+</urlset>`;
+    const urls = extractSitemapUrls(xml, BASE, "/docs/");
+    expect(urls).not.toContain("https://docs.example.com/docs/image.png");
+  });
+
+  it("deduplicates URLs", () => {
+    const xml = `<urlset>
+  <url><loc>https://docs.example.com/docs/page</loc></url>
+  <url><loc>https://docs.example.com/docs/page</loc></url>
+</urlset>`;
+    const urls = extractSitemapUrls(xml, BASE, "/docs/");
+    expect(urls.length).toBe(1);
+  });
+
+  it("returns empty array for empty sitemap", () => {
+    const xml = `<urlset></urlset>`;
+    expect(extractSitemapUrls(xml, BASE, "/docs/")).toEqual([]);
+  });
+});
+
+// -------------------------------------------------------------------------
+// extractMainContent
+// -------------------------------------------------------------------------
+
+describe("extractMainContent", () => {
+  it.each([
+    [
+      "Sphinx role=main div",
+      "sphinx" as const,
+      '<nav>navigation</nav><div role="main"><h1>Title</h1><p>Content</p></div><footer>footer</footer>',
+      ["Title", "Content"],
+    ],
+    [
+      "VitePress vp-doc div",
+      "vitepress" as const,
+      '<header>nav</header><div class="vp-doc"><h1>API</h1><p>Details</p></div><aside>sidebar</aside>',
+      ["API", "Details"],
+    ],
+    [
+      "Doxygen contents div",
+      "doxygen" as const,
+      '<div id="nav">navigation</div><div class="contents"><h2>Function Reference</h2><p>Details</p></div>',
+      ["Function Reference", "Details"],
+    ],
+    [
+      "generic main element",
+      "generic" as const,
+      "<body><header>nav</header><main><h1>Guide</h1><p>Text</p></main><footer></footer>",
+      ["Guide", "Text"],
+    ],
+    [
+      "full-page fallback",
+      "sphinx" as const,
+      "<html><body><p>Fallback content</p></body></html>",
+      ["Fallback content"],
+    ],
+  ])("extracts %s", (_label, siteType, html, expected) => {
+    const result = extractMainContent(html, siteType);
+    for (const text of expected) {
+      expect(result).toContain(text);
+    }
+  });
+
+  it("returns non-empty string for any non-empty HTML", () => {
+    const html = "<div><p>Something</p></div>";
+    const result = extractMainContent(html, "generic");
+    expect(result.trim().length).toBeGreaterThan(0);
+  });
+});
+
+// -------------------------------------------------------------------------
+// syncDocSite — validation
+// -------------------------------------------------------------------------
+
+describe("syncDocSite — validation", () => {
+  let db: Database.Database;
+  let provider: MockEmbeddingProvider;
+
+  beforeEach(() => {
+    initLogger("silent");
+    db = createTestDbWithVec();
+    provider = new MockEmbeddingProvider();
+  });
+
+  afterEach(() => {
+    db.close();
+  });
+
+  it("throws ValidationError when url is missing", async () => {
+    await expect(syncDocSite(db, provider, { url: "" })).rejects.toBeInstanceOf(ValidationError);
+  });
+
+  it("throws ValidationError for malformed URL", async () => {
+    await expect(syncDocSite(db, provider, { url: "not-a-url" })).rejects.toBeInstanceOf(
+      ValidationError,
+    );
+  });
+
+  it("throws ValidationError for non-http/https scheme", async () => {
+    await expect(
+      syncDocSite(db, provider, { url: "ftp://example.com/docs" }),
+    ).rejects.toBeInstanceOf(ValidationError);
+  });
+});
+
+// -------------------------------------------------------------------------
+// syncDocSite — integration with mocked fetch
+// -------------------------------------------------------------------------
+
+describe("syncDocSite — mocked fetch", () => {
+  let db: Database.Database;
+  let provider: MockEmbeddingProvider;
+
+  // Implementation order: root page is fetched FIRST, then sitemap.xml,
+  // then BFS pages. All mock setups must follow this order.
+
+  const SPHINX_ROOT = `
+    <html>
+      <head>
+        <meta name="generator" content="Sphinx 5.0">
+        <title>My Library Docs</title>
+      </head>
+      <body>
+        <div class="sphinxsidebar">
+          <a href="https://docs.example.com/docs/api">API</a>
+          <a href="https://docs.example.com/docs/guide">Guide</a>
+        </div>
+        <div role="main">
+          <h1>Welcome</h1>
+          <p>This is the documentation root.</p>
+        </div>
+      </body>
+    </html>`;
+
+  // Sphinx root page with no outbound links (for simpler single-page tests)
+  const SPHINX_ROOT_SIMPLE = `
+    <html>
+      <head><meta name="generator" content="Sphinx 5.0"><title>Docs</title></head>
+      <body>
+        <div role="main"><h1>Welcome</h1><p>This is the documentation root page content.</p></div>
+      </body>
+    </html>`;
+
+  const SPHINX_API = `
+    <html>
+      <head><title>API Reference</title></head>
+      <body>
+        <div role="main">
+          <h1>API Reference</h1>
+          <p>Function definitions and usage.</p>
+        </div>
+      </body>
+    </html>`;
+
+  beforeEach(() => {
+    initLogger("silent");
+    db = createTestDbWithVec();
+    provider = new MockEmbeddingProvider();
+    // mockReset clears both call history AND the mockResolvedValueOnce queue,
+    // preventing mock bleed between tests.
+    mockFetch.mockReset();
+  });
+
+  afterEach(() => {
+    db.close();
+  });
+
+  it("indexes the root page and detects Sphinx site type", async () => {
+    // Order: root, sitemap
+    mockFetch
+      .mockResolvedValueOnce(htmlResponse(SPHINX_ROOT_SIMPLE)) // root page
+      .mockResolvedValueOnce(notFoundResponse()); // sitemap.xml 404
+
+    const result = await syncDocSite(db, provider, {
+      url: "https://docs.example.com/docs/",
+    });
+
+    expect(result.detectedType).toBe("sphinx");
+    expect(result.pagesIndexed).toBe(1);
+    expect(result.errors).toHaveLength(0);
+  });
+
+  it("uses configured type instead of auto-detecting", async () => {
+    mockFetch
+      .mockResolvedValueOnce(htmlResponse(SPHINX_ROOT_SIMPLE)) // root
+      .mockResolvedValueOnce(notFoundResponse()); // sitemap.xml
+
+    const result = await syncDocSite(db, provider, {
+      url: "https://docs.example.com/docs/",
+      type: "vitepress",
+    });
+
+    expect(result.detectedType).toBe("vitepress");
+  });
+
+  it("crawls pages discovered via link extraction", async () => {
+    // Order: root, sitemap, api, guide
+    mockFetch
+      .mockResolvedValueOnce(htmlResponse(SPHINX_ROOT)) // root (has links to api + guide)
+      .mockResolvedValueOnce(notFoundResponse()) // sitemap.xml
+      .mockResolvedValueOnce(htmlResponse(SPHINX_API)) // /docs/api
+      .mockResolvedValueOnce(
+        htmlResponse("<html><body><main><h1>Guide</h1><p>Guide content.</p></main></body></html>"),
+      ); // /docs/guide
+
+    const result = await syncDocSite(db, provider, {
+      url: "https://docs.example.com/docs/",
+    });
+
+    // Root + api + guide = 3 pages
+    expect(result.pagesIndexed).toBe(3);
+    expect(result.errors).toHaveLength(0);
+  });
+
+  it("uses sitemap.xml for URL discovery when available", async () => {
+    const sitemap = `<?xml version="1.0"?>
+<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+  <url><loc>https://docs.example.com/docs/api</loc></url>
+</urlset>`;
+
+    // Order: root, sitemap (success), api
+    mockFetch
+      .mockResolvedValueOnce(htmlResponse(SPHINX_ROOT_SIMPLE)) // root page
+      .mockResolvedValueOnce(xmlResponse(sitemap)) // sitemap.xml success
+      .mockResolvedValueOnce(htmlResponse(SPHINX_API)); // /docs/api from sitemap
+
+    const result = await syncDocSite(db, provider, {
+      url: "https://docs.example.com/docs/",
+    });
+
+    expect(result.pagesIndexed).toBeGreaterThanOrEqual(1);
+    expect(result.errors).toHaveLength(0);
+  });
+
+  it("records errors for pages that fail to fetch", async () => {
+    const rootWithFailingLink = `
+      <html>
+        <head><meta name="generator" content="Sphinx 5.0"></head>
+        <body>
+          <div role="main"><h1>Root</h1><p>Intro text content here.</p></div>
+          <a href="https://docs.example.com/docs/broken">Broken</a>
+        </body>
+      </html>`;
+
+    // Order: root, sitemap, broken page
+    mockFetch
+      .mockResolvedValueOnce(htmlResponse(rootWithFailingLink)) // root
+      .mockResolvedValueOnce(notFoundResponse()) // sitemap.xml
+      .mockResolvedValueOnce(notFoundResponse()); // broken page → error
+
+    const result = await syncDocSite(db, provider, {
+      url: "https://docs.example.com/docs/",
+    });
+
+    expect(result.errors.length).toBeGreaterThanOrEqual(1);
+    expect(result.errors[0]?.url).toContain("/docs/broken");
+  });
+
+  it("skips pages outside pathPrefix", async () => {
+    const rootWithOutsideLink = `
+      <html>
+        <head><meta name="generator" content="Sphinx 5.0"></head>
+        <body>
+          <div role="main"><h1>Root</h1><p>Intro text content here.</p></div>
+          <a href="https://docs.example.com/blog/post">Blog</a>
+          <a href="https://docs.example.com/docs/api">API</a>
+        </body>
+      </html>`;
+
+    // Order: root, sitemap, api (blog is skipped by pathPrefix)
+    mockFetch
+      .mockResolvedValueOnce(htmlResponse(rootWithOutsideLink)) // root
+      .mockResolvedValueOnce(notFoundResponse()) // sitemap.xml
+      .mockResolvedValueOnce(htmlResponse(SPHINX_API)); // /docs/api
+
+    const result = await syncDocSite(db, provider, {
+      url: "https://docs.example.com/docs/",
+      pathPrefix: "/docs/",
+    });
+
+    // Should only have fetched root, sitemap.xml and /docs/api — never /blog/post
+    const fetchedUrls = mockFetch.mock.calls.map((c) => c[0] as string);
+    expect(fetchedUrls.some((u) => u.includes("/blog/"))).toBe(false);
+    expect(result.errors).toHaveLength(0);
+  });
+
+  it("respects maxPages limit", async () => {
+    const rootWithManyLinks = `
+      <html>
+        <head><meta name="generator" content="Sphinx 5.0"></head>
+        <body>
+          <div role="main"><h1>Root</h1><p>Intro content for root page.</p></div>
+          <a href="https://docs.example.com/docs/p1">P1</a>
+          <a href="https://docs.example.com/docs/p2">P2</a>
+          <a href="https://docs.example.com/docs/p3">P3</a>
+          <a href="https://docs.example.com/docs/p4">P4</a>
+          <a href="https://docs.example.com/docs/p5">P5</a>
+        </body>
+      </html>`;
+    const pageHtml = (n: number) =>
+      `<html><body><main><h1>Page ${n}</h1><p>Content for page ${n} of the docs.</p></main></body></html>`;
+
+    // Order: root, sitemap, then sub-pages (unlimited via mockResolvedValue)
+    mockFetch
+      .mockResolvedValueOnce(htmlResponse(rootWithManyLinks)) // root
+      .mockResolvedValueOnce(notFoundResponse()) // sitemap.xml
+      .mockResolvedValue(htmlResponse(pageHtml(1))); // all subsequent pages
+
+    const result = await syncDocSite(db, provider, {
+      url: "https://docs.example.com/docs/",
+      maxPages: 2,
+    });
+
+    // root (1) + up to maxPages (2) = at most 3 total
+    expect(result.pagesIndexed + result.pagesUpdated + result.pagesSkipped).toBeLessThanOrEqual(3);
+  });
+
+  it("skips empty pages and counts them as skipped", async () => {
+    // A page with a role=main div that has no text content
+    const emptyPage = `<html><head><meta name="generator" content="Sphinx 5.0"></head><body><div role="main"> </div></body></html>`;
+
+    mockFetch
+      .mockResolvedValueOnce(htmlResponse(emptyPage)) // root
+      .mockResolvedValueOnce(notFoundResponse()); // sitemap.xml
+
+    const result = await syncDocSite(db, provider, {
+      url: "https://docs.example.com/docs/",
+    });
+
+    expect(result.pagesSkipped).toBeGreaterThanOrEqual(1);
+    expect(result.pagesIndexed).toBe(0);
+  });
+
+  it("tags indexed documents with the configured library name", async () => {
+    mockFetch
+      .mockResolvedValueOnce(htmlResponse(SPHINX_ROOT_SIMPLE)) // root
+      .mockResolvedValueOnce(notFoundResponse()); // sitemap.xml
+
+    await syncDocSite(db, provider, {
+      url: "https://docs.example.com/docs/",
+      library: "mylib",
+      version: "2.0",
+    });
+
+    const doc = db
+      .prepare("SELECT library, version FROM documents WHERE url IS NOT NULL LIMIT 1")
+      .get() as { library: string; version: string } | undefined;
+
+    expect(doc?.library).toBe("mylib");
+    expect(doc?.version).toBe("2.0");
+  });
+
+  it("re-indexes changed pages and counts them as updated", async () => {
+    // First sync — index root
+    mockFetch
+      .mockResolvedValueOnce(htmlResponse(SPHINX_ROOT_SIMPLE)) // root
+      .mockResolvedValueOnce(notFoundResponse()); // sitemap.xml
+
+    await syncDocSite(db, provider, { url: "https://docs.example.com/docs/" });
+
+    const beforeCount = (db.prepare("SELECT COUNT(*) as n FROM documents").get() as { n: number })
+      .n;
+    expect(beforeCount).toBe(1);
+
+    // Second sync — same URL but different content
+    const changedRoot = SPHINX_ROOT_SIMPLE.replace(
+      "documentation root page content.",
+      "updated documentation page content.",
+    );
+    mockFetch
+      .mockResolvedValueOnce(htmlResponse(changedRoot)) // root (changed)
+      .mockResolvedValueOnce(notFoundResponse()); // sitemap.xml
+
+    const result2 = await syncDocSite(db, provider, {
+      url: "https://docs.example.com/docs/",
+    });
+
+    // Should update in-place, not add a new doc
+    expect(result2.pagesUpdated).toBe(1);
+    expect(result2.pagesIndexed).toBe(0);
+    const afterCount = (db.prepare("SELECT COUNT(*) as n FROM documents").get() as { n: number }).n;
+    expect(afterCount).toBe(1);
+  });
+
+  it("skips unchanged pages (content-hash match) as skipped", async () => {
+    mockFetch
+      .mockResolvedValueOnce(htmlResponse(SPHINX_ROOT_SIMPLE)) // root
+      .mockResolvedValueOnce(notFoundResponse()); // sitemap.xml
+
+    await syncDocSite(db, provider, { url: "https://docs.example.com/docs/" });
+
+    // Exact same content — should be skipped on second run
+    mockFetch
+      .mockResolvedValueOnce(htmlResponse(SPHINX_ROOT_SIMPLE)) // root (unchanged)
+      .mockResolvedValueOnce(notFoundResponse()); // sitemap.xml
+
+    const result2 = await syncDocSite(db, provider, {
+      url: "https://docs.example.com/docs/",
+    });
+
+    expect(result2.pagesSkipped).toBe(1);
+    expect(result2.pagesIndexed).toBe(0);
+    expect(result2.pagesUpdated).toBe(0);
+  });
+
+  it("records sync history in the connector_syncs table", async () => {
+    mockFetch
+      .mockResolvedValueOnce(htmlResponse(SPHINX_ROOT_SIMPLE)) // root
+      .mockResolvedValueOnce(notFoundResponse()); // sitemap.xml
+
+    await syncDocSite(db, provider, { url: "https://docs.example.com/docs/" });
+
+    const row = db
+      .prepare("SELECT status, connector_type FROM connector_syncs ORDER BY id DESC LIMIT 1")
+      .get() as { status: string; connector_type: string } | undefined;
+
+    expect(row?.status).toBe("completed");
+    expect(row?.connector_type).toBe("docs");
+  });
+
+  it("throws when root page fetch fails", async () => {
+    mockFetch.mockResolvedValueOnce(notFoundResponse()); // root 404
+
+    await expect(
+      syncDocSite(db, provider, { url: "https://docs.example.com/docs/" }),
+    ).rejects.toThrow();
+  });
+
+  it("limits concurrency to between 1 and 10", async () => {
+    mockFetch
+      .mockResolvedValueOnce(htmlResponse(SPHINX_ROOT_SIMPLE)) // root
+      .mockResolvedValueOnce(notFoundResponse()); // sitemap.xml
+
+    await expect(
+      syncDocSite(db, provider, {
+        url: "https://docs.example.com/docs/",
+        concurrency: 100,
+      }),
+    ).resolves.not.toThrow();
+  });
+});
+
+// -------------------------------------------------------------------------
+// disconnectDocSite
+// -------------------------------------------------------------------------
+
+describe("disconnectDocSite", () => {
+  let db: Database.Database;
+
+  beforeEach(() => {
+    initLogger("silent");
+    db = createTestDbWithVec();
+    vi.clearAllMocks();
+  });
+
+  afterEach(() => {
+    db.close();
+  });
+
+  it("removes all documents from the given site URL prefix", () => {
+    // Seed some docs manually
+    db.prepare(
+      "INSERT INTO documents (id, source_type, title, content, url) VALUES (?, 'library', ?, ?, ?)",
+    ).run("doc-1", "Page 1", "Content 1", "https://docs.example.com/docs/page1");
+    db.prepare(
+      "INSERT INTO documents (id, source_type, title, content, url) VALUES (?, 'library', ?, ?, ?)",
+    ).run("doc-2", "Page 2", "Content 2", "https://docs.example.com/docs/page2");
+    db.prepare(
+      "INSERT INTO documents (id, source_type, title, content, url) VALUES (?, 'library', ?, ?, ?)",
+    ).run("doc-3", "Other", "Content 3", "https://other.example.com/docs/page");
+
+    const removed = disconnectDocSite(db, "https://docs.example.com/docs/");
+
+    expect(removed).toBe(2);
+
+    const remaining = db.prepare("SELECT COUNT(*) as n FROM documents").get() as { n: number };
+    expect(remaining.n).toBe(1); // doc-3 should remain
+  });
+
+  it("returns 0 when no matching documents exist", () => {
+    const removed = disconnectDocSite(db, "https://docs.example.com/docs/");
+    expect(removed).toBe(0);
+  });
+
+  it("throws ValidationError for invalid site URL", () => {
+    expect(() => disconnectDocSite(db, "not-a-url")).toThrow(ValidationError);
+  });
+
+  it("does not remove documents from other sites", () => {
+    db.prepare(
+      "INSERT INTO documents (id, source_type, title, content, url) VALUES (?, 'library', ?, ?, ?)",
+    ).run("doc-1", "Page 1", "Content 1", "https://other.example.com/docs/page");
+
+    const removed = disconnectDocSite(db, "https://docs.example.com/docs/");
+    expect(removed).toBe(0);
+
+    const remaining = db.prepare("SELECT COUNT(*) as n FROM documents").get() as { n: number };
+    expect(remaining.n).toBe(1);
+  });
+
+  it("removes associated chunks", () => {
+    db.prepare(
+      "INSERT INTO documents (id, source_type, title, content, url) VALUES (?, 'library', 'Title', 'Body', ?)",
+    ).run("doc-1", "https://docs.example.com/docs/page");
+    db.prepare(
+      "INSERT INTO chunks (id, document_id, content, chunk_index) VALUES (?, ?, ?, ?)",
+    ).run("chunk-1", "doc-1", "Chunk content", 0);
+
+    disconnectDocSite(db, "https://docs.example.com/docs/");
+
+    const chunks = db
+      .prepare("SELECT COUNT(*) as n FROM chunks WHERE document_id = 'doc-1'")
+      .get() as { n: number };
+    expect(chunks.n).toBe(0);
+  });
+});