,
): void {
const log = getLogger();
- const rows = db.prepare("SELECT * FROM webhooks WHERE active = 1").all() as WebhookRow[];
+ const rows = db
+ .prepare(
+ "SELECT id, url, events, secret, active, created_at, last_triggered_at, failure_count FROM webhooks WHERE active = 1",
+ )
+ .all() as WebhookRow[];
const body = buildPayload(event, data);
diff --git a/src/mcp/errors.ts b/src/mcp/errors.ts
new file mode 100644
index 0000000..38f07b8
--- /dev/null
+++ b/src/mcp/errors.ts
@@ -0,0 +1,40 @@
+import { LibScopeError } from "../errors.js";
+import { getLogger } from "../logger.js";
+
+/** Convert a thrown error into an MCP error response object. */
+export function errorResponse(err: unknown): {
+ content: Array<{ type: "text"; text: string }>;
+ isError: true;
+} {
+ let message: string;
+ if (err instanceof LibScopeError) {
+ message = err.message;
+ } else if (err instanceof Error) {
+ message = `${err.name}: ${err.message}`;
+ } else {
+ message = `An unexpected error occurred: ${String(err)}`;
+ }
+
+ const log = getLogger();
+ log.error({ err }, "MCP tool error");
+
+ return {
+ content: [{ type: "text" as const, text: `Error: ${message}` }],
+ isError: true,
+ };
+}
+
+export type ToolResult = { content: Array<{ type: "text"; text: string }>; isError?: boolean };
+
+/** Wraps a tool handler so that thrown errors are converted to MCP error responses. */
+export function withErrorHandling<P>(
+  handler: (params: P) => ToolResult | Promise<ToolResult>,
+): (params: P) => Promise<ToolResult> {
+ return async (params: P) => {
+ try {
+ return await handler(params);
+ } catch (err) {
+ return errorResponse(err);
+ }
+ };
+}
diff --git a/src/mcp/server.ts b/src/mcp/server.ts
index 8d62806..cd44e5c 100644
--- a/src/mcp/server.ts
+++ b/src/mcp/server.ts
@@ -6,7 +6,13 @@ import { getDatabase, runMigrations, createVectorTable } from "../db/index.js";
import { getActiveWorkspace, getWorkspacePath } from "../core/workspace.js";
import { createEmbeddingProvider } from "../providers/index.js";
import { searchDocuments } from "../core/search.js";
-import { askQuestion, createLlmProvider, type LlmProvider } from "../core/rag.js";
+import {
+ askQuestion,
+ createLlmProvider,
+ getContextForQuestion,
+ isPassthroughMode,
+ type LlmProvider,
+} from "../core/rag.js";
import { getDocument, listDocuments, deleteDocument, updateDocument } from "../core/documents.js";
import { rateDocument, getDocumentRatings } from "../core/ratings.js";
import { indexDocument } from "../core/indexing.js";
@@ -23,45 +29,12 @@ import { createWebhook, listWebhooks, deleteWebhook, redactWebhook } from "../co
import type { WebhookEvent } from "../core/webhooks.js";
import { suggestTags } from "../core/tags.js";
import { fetchAndConvert } from "../core/url-fetcher.js";
+import { spiderUrl } from "../core/spider.js";
+import type { SpiderOptions } from "../core/spider.js";
import { initLogger, getLogger } from "../logger.js";
-import { ConfigError, LibScopeError, ValidationError } from "../errors.js";
-
-function errorResponse(err: unknown): {
- content: Array<{ type: "text"; text: string }>;
- isError: true;
-} {
- let message: string;
- if (err instanceof LibScopeError) {
- message = err.message;
- } else if (err instanceof Error) {
- message = `${err.name}: ${err.message}`;
- } else {
- message = `An unexpected error occurred: ${String(err)}`;
- }
-
- const log = getLogger();
- log.error({ err }, "MCP tool error");
-
- return {
- content: [{ type: "text" as const, text: `Error: ${message}` }],
- isError: true,
- };
-}
-
-type ToolResult = { content: Array<{ type: "text"; text: string }>; isError?: boolean };
-
-/** Wraps a tool handler so that thrown errors are converted to MCP error responses. */
-function withErrorHandling<P>(
-  handler: (params: P) => ToolResult | Promise<ToolResult>,
-): (params: P) => Promise<ToolResult> {
- return async (params: P) => {
- try {
- return await handler(params);
- } catch (err) {
- return errorResponse(err);
- }
- };
-}
+import { ConfigError, ValidationError } from "../errors.js";
+import { errorResponse, withErrorHandling } from "./errors.js";
+export { errorResponse, withErrorHandling, type ToolResult } from "./errors.js";
// Start the server
async function main(): Promise<void> {
@@ -330,7 +303,7 @@ async function main(): Promise {
// Tool: submit-document
server.tool(
"submit-document",
- "Submit a new document for indexing into the knowledge base. You can provide content directly, or provide a URL to fetch and index automatically.",
+ "Submit a new document for indexing into the knowledge base. You can provide content directly, or provide a URL to fetch and index automatically. Set spider=true to crawl linked pages from the URL.",
{
title: z
.string()
@@ -353,17 +326,104 @@ async function main(): Promise {
topic: z.string().optional().describe("Topic ID to categorize under"),
library: z.string().optional().describe("Library name (for library docs)"),
version: z.string().optional().describe("Library version"),
+ spider: z
+ .boolean()
+ .optional()
+ .describe("When true, crawl pages linked from the URL. Requires 'url'. Default: false."),
+ maxPages: z
+ .number()
+ .int()
+ .positive()
+ .optional()
+ .describe("Maximum pages to index during a spider run (default: 25, hard cap: 200)."),
+ maxDepth: z
+ .number()
+ .int()
+ .min(0)
+ .optional()
+ .describe(
+ "Maximum link-hop depth from the seed URL (default: 2, hard cap: 5). 0 = seed only.",
+ ),
+ sameDomain: z
+ .boolean()
+ .optional()
+ .describe("Only follow links on the same domain as the seed URL (default: true)."),
+ pathPrefix: z
+ .string()
+ .optional()
+ .describe("Only follow links whose path starts with this prefix (e.g. '/docs/')."),
+ excludePatterns: z
+ .array(z.string())
+ .optional()
+ .describe("Glob patterns for URLs to skip (e.g. ['*/changelog*', '*/api/v1/*'])."),
},
withErrorHandling(async (params) => {
let { title, content } = params;
const { url, library, version, topic } = params;
+ const fetchOptions = {
+ allowPrivateUrls: config.indexing.allowPrivateUrls,
+ allowSelfSignedCerts: config.indexing.allowSelfSignedCerts,
+ };
+
+ // Spider mode — crawl linked pages from the URL
+ if (params.spider && !url) {
+ throw new ValidationError("Field 'url' is required when spider is true");
+ }
+ if (params.spider && url) {
+ const spiderOptions: SpiderOptions = { fetchOptions };
+ if (params.maxPages !== undefined) spiderOptions.maxPages = params.maxPages;
+ if (params.maxDepth !== undefined) spiderOptions.maxDepth = params.maxDepth;
+ if (params.sameDomain !== undefined) spiderOptions.sameDomain = params.sameDomain;
+ if (params.pathPrefix !== undefined) spiderOptions.pathPrefix = params.pathPrefix;
+ if (params.excludePatterns !== undefined)
+ spiderOptions.excludePatterns = params.excludePatterns;
+
+ const indexed: Array<{ id: string; title: string }> = [];
+ const errors: Array<{ url: string; error: string }> = [];
+ const sourceType = params.sourceType ?? (library ? "library" : "manual");
+
+ const gen = spiderUrl(url, spiderOptions);
+ let result = await gen.next();
+ while (!result.done) {
+ const page = result.value;
+ try {
+ const doc = await indexDocument(db, provider, {
+ title: page.title,
+ content: page.content,
+ sourceType,
+ library,
+ version,
+ topicId: topic,
+ url: page.url,
+ submittedBy: "model",
+ });
+ indexed.push({ id: doc.id, title: page.title });
+ } catch (err) {
+ errors.push({ url: page.url, error: err instanceof Error ? err.message : String(err) });
+ }
+ result = await gen.next();
+ }
+ const stats = result.value;
+
+ const summary = [
+ `Spider complete.`,
+ `Pages indexed: ${indexed.length}`,
+ `Pages crawled: ${stats?.pagesCrawled ?? indexed.length}`,
+ `Pages skipped: ${stats?.pagesSkipped ?? 0}`,
+ errors.length > 0 ? `Errors: ${errors.length}` : null,
+ stats?.abortReason ? `Stopped early: ${stats.abortReason}` : null,
+ ]
+ .filter(Boolean)
+ .join("\n");
+
+ return {
+ content: [{ type: "text" as const, text: summary }],
+ };
+ }
// If URL is provided and no content, fetch it
if (url && !content) {
- const fetched = await fetchAndConvert(url, {
- allowPrivateUrls: config.indexing.allowPrivateUrls,
- allowSelfSignedCerts: config.indexing.allowSelfSignedCerts,
- });
+ const fetched = await fetchAndConvert(url, fetchOptions);
content = fetched.content;
title ??= fetched.title;
}
@@ -543,9 +603,30 @@ async function main(): Promise {
library: z.string().optional().describe("Filter by library name"),
},
withErrorHandling(async (params) => {
+ if (isPassthroughMode(config)) {
+ const { contextPrompt, sources } = await getContextForQuestion(db, provider, {
+ question: params.question,
+ topK: params.topK,
+ topic: params.topic,
+ library: params.library,
+ });
+
+ const sourcesText =
+ sources.length > 0
+ ? "\n\n**Sources:**\n" +
+ sources
+ .map((s) => `- ${s.title} (score: ${s.score.toFixed(2)}) [${s.documentId}]`)
+ .join("\n")
+ : "";
+
+ return {
+ content: [{ type: "text" as const, text: contextPrompt + sourcesText }],
+ };
+ }
+
if (!llmProvider) {
throw new ConfigError(
- "No LLM provider configured. Set llm.provider to 'openai' or 'ollama' in your config.",
+ "No LLM provider configured. Set llm.provider to 'openai', 'ollama', or 'passthrough' in your config.",
);
}
diff --git a/src/providers/local.ts b/src/providers/local.ts
index 10e4a7b..37391b3 100644
--- a/src/providers/local.ts
+++ b/src/providers/local.ts
@@ -2,6 +2,17 @@ import { EmbeddingError } from "../errors.js";
import type { EmbeddingProvider } from "./embedding.js";
import { getLogger } from "../logger.js";
+/** Minimal typed interface for the @xenova/transformers feature-extraction pipeline output. */
+interface TransformersOutput {
+ data: Float32Array;
+}
+
+/** Minimal typed interface for the @xenova/transformers feature-extraction pipeline function. */
+type FeatureExtractionPipeline = (
+  input: string,
+  options: { pooling: string; normalize: boolean },
+) => Promise<TransformersOutput>;
+
/**
* Local embedding provider using @xenova/transformers (all-MiniLM-L6-v2).
* Downloads the model on first use (~80MB). Runs entirely in-process.
@@ -10,7 +21,7 @@ export class LocalEmbeddingProvider implements EmbeddingProvider {
readonly name = "local";
readonly dimensions = 384;
- private pipeline: unknown = null;
+ private pipeline: FeatureExtractionPipeline | null = null;
  private initPromise: Promise<void> | null = null;
  private async ensureInitialized(): Promise<void> {
@@ -24,7 +35,11 @@ export class LocalEmbeddingProvider implements EmbeddingProvider {
try {
// Dynamic import to avoid loading transformers until needed
const { pipeline } = await import("@xenova/transformers");
- this.pipeline = await pipeline("feature-extraction", "Xenova/all-MiniLM-L6-v2");
+ // Cast to the typed interface; @xenova/transformers lacks precise TS generics for pipeline output
+ this.pipeline = (await pipeline(
+ "feature-extraction",
+ "Xenova/all-MiniLM-L6-v2",
+ )) as unknown as FeatureExtractionPipeline;
log.info("Local embedding model loaded successfully");
} catch (err) {
this.initPromise = null;
@@ -38,10 +53,8 @@ export class LocalEmbeddingProvider implements EmbeddingProvider {
}
await this.ensureInitialized();
try {
- // eslint-disable-next-line @typescript-eslint/no-unsafe-call, @typescript-eslint/no-explicit-any, @typescript-eslint/no-unsafe-assignment
- const output = await (this.pipeline as any)(text, { pooling: "mean", normalize: true });
- // eslint-disable-next-line @typescript-eslint/no-unsafe-member-access
- const embedding = Array.from(output.data as Float32Array);
+ const output = await this.pipeline!(text, { pooling: "mean", normalize: true });
+ const embedding = Array.from(output.data);
if (embedding.length !== this.dimensions) {
throw new EmbeddingError(
`Expected embedding dimension ${this.dimensions}, got ${embedding.length}`,
diff --git a/tests/unit/bulk.test.ts b/tests/unit/bulk.test.ts
index e9239fc..37094b8 100644
--- a/tests/unit/bulk.test.ts
+++ b/tests/unit/bulk.test.ts
@@ -76,6 +76,25 @@ describe("bulk operations", () => {
expect(ids).toContain("doc-b");
});
+ it("filters by dateFrom", () => {
+ insertDoc(db, "doc-old", "Old Doc", {
+ library: "react",
+ createdAt: "2020-01-01T00:00:00.000Z",
+ });
+ const ids = resolveSelector(db, { library: "react", dateFrom: "2024-01-01T00:00:00.000Z" });
+ expect(ids).not.toContain("doc-old");
+ expect(ids).toContain("doc-a");
+ });
+
+ it("filters by dateTo", () => {
+ insertDoc(db, "doc-future", "Future Doc", {
+ library: "react",
+ createdAt: "2099-01-01T00:00:00.000Z",
+ });
+ const ids = resolveSelector(db, { library: "react", dateTo: "2025-01-01T00:00:00.000Z" });
+ expect(ids).not.toContain("doc-future");
+ });
+
it("throws on empty selector", () => {
expect(() => resolveSelector(db, {})).toThrow(ValidationError);
});
@@ -90,9 +109,64 @@ describe("bulk operations", () => {
expect(ids.length).toBeLessThanOrEqual(10);
});
- it("returns empty array for negative limit", () => {
- const ids = resolveSelector(db, { library: "react" }, -5);
- expect(ids).toHaveLength(0);
+ it("throws ValidationError for negative limit", () => {
+ expect(() => resolveSelector(db, { library: "react" }, -5)).toThrow(ValidationError);
+ expect(() => resolveSelector(db, { library: "react" }, -1)).toThrow(
+ "limit must be a non-negative integer",
+ );
+ });
+
+ it("applies dateFrom filter at SQL level before LIMIT", () => {
+ // Insert enough docs to exceed a small limit, with varying dates
+ for (let i = 0; i < 20; i++) {
+ insertDoc(db, `old-${i}`, `Old Doc ${i}`, {
+ library: "test-lib",
+ createdAt: "2020-01-01T00:00:00.000Z",
+ });
+ }
+ for (let i = 0; i < 5; i++) {
+ insertDoc(db, `new-${i}`, `New Doc ${i}`, {
+ library: "test-lib",
+ createdAt: "2025-06-01T00:00:00.000Z",
+ });
+ }
+
+ // With a limit of 10, date filter must happen in SQL before LIMIT,
+ // otherwise old docs could fill the limit and exclude new ones
+ const ids = resolveSelector(
+ db,
+ { library: "test-lib", dateFrom: "2025-01-01T00:00:00.000Z" },
+ 10,
+ );
+ expect(ids).toHaveLength(5);
+ for (const id of ids) {
+ expect(id).toMatch(/^new-/);
+ }
+ });
+
+ it("applies dateTo filter at SQL level before LIMIT", () => {
+ for (let i = 0; i < 20; i++) {
+ insertDoc(db, `future-${i}`, `Future Doc ${i}`, {
+ library: "test-lib",
+ createdAt: "2099-01-01T00:00:00.000Z",
+ });
+ }
+ for (let i = 0; i < 5; i++) {
+ insertDoc(db, `past-${i}`, `Past Doc ${i}`, {
+ library: "test-lib",
+ createdAt: "2020-06-01T00:00:00.000Z",
+ });
+ }
+
+ const ids = resolveSelector(
+ db,
+ { library: "test-lib", dateTo: "2025-01-01T00:00:00.000Z" },
+ 10,
+ );
+ expect(ids).toHaveLength(5);
+ for (const id of ids) {
+ expect(id).toMatch(/^past-/);
+ }
});
});
diff --git a/tests/unit/config.test.ts b/tests/unit/config.test.ts
index 9d380c8..a84d0be 100644
--- a/tests/unit/config.test.ts
+++ b/tests/unit/config.test.ts
@@ -1,10 +1,11 @@
import { describe, it, expect, vi, beforeEach, afterEach } from "vitest";
-import { loadConfig, validateConfig } from "../../src/config.js";
+import { loadConfig, validateConfig, invalidateConfigCache } from "../../src/config.js";
import type { LibScopeConfig } from "../../src/config.js";
import * as loggerModule from "../../src/logger.js";
describe("config", () => {
it("should return default config when no files exist", () => {
+ invalidateConfigCache();
const config = loadConfig();
expect(config.embedding.provider).toBe("local");
@@ -12,10 +13,18 @@ describe("config", () => {
expect(config.database.path).toContain("libscope.db");
});
+ it("should return cached config on repeated calls", () => {
+ invalidateConfigCache();
+ const first = loadConfig();
+ const second = loadConfig(); // cache hit
+ expect(second).toBe(first); // same object reference
+ });
+
it("should respect LIBSCOPE_EMBEDDING_PROVIDER env var", () => {
const original = process.env["LIBSCOPE_EMBEDDING_PROVIDER"];
try {
process.env["LIBSCOPE_EMBEDDING_PROVIDER"] = "ollama";
+ invalidateConfigCache();
const config = loadConfig();
expect(config.embedding.provider).toBe("ollama");
} finally {
@@ -31,6 +40,7 @@ describe("config", () => {
const original = process.env["LIBSCOPE_EMBEDDING_PROVIDER"];
try {
process.env["LIBSCOPE_EMBEDDING_PROVIDER"] = "invalid";
+ invalidateConfigCache();
const config = loadConfig();
// Should fall through to default since "invalid" doesn't match the switch
expect(config.embedding.provider).toBe("local");
@@ -47,6 +57,7 @@ describe("config", () => {
const original = process.env["LIBSCOPE_OPENAI_API_KEY"];
try {
process.env["LIBSCOPE_OPENAI_API_KEY"] = "sk-test123";
+ invalidateConfigCache();
const config = loadConfig();
expect(config.embedding.openaiApiKey).toBe("sk-test123");
} finally {
@@ -62,6 +73,7 @@ describe("config", () => {
const original = process.env["LIBSCOPE_OLLAMA_URL"];
try {
process.env["LIBSCOPE_OLLAMA_URL"] = "http://custom:11434";
+ invalidateConfigCache();
const config = loadConfig();
expect(config.embedding.ollamaUrl).toBe("http://custom:11434");
} finally {
@@ -77,6 +89,7 @@ describe("config", () => {
const original = process.env["LIBSCOPE_ALLOW_PRIVATE_URLS"];
try {
process.env["LIBSCOPE_ALLOW_PRIVATE_URLS"] = "true";
+ invalidateConfigCache();
const config = loadConfig();
expect(config.indexing.allowPrivateUrls).toBe(true);
} finally {
@@ -92,6 +105,7 @@ describe("config", () => {
const original = process.env["LIBSCOPE_ALLOW_SELF_SIGNED_CERTS"];
try {
process.env["LIBSCOPE_ALLOW_SELF_SIGNED_CERTS"] = "1";
+ invalidateConfigCache();
const config = loadConfig();
expect(config.indexing.allowSelfSignedCerts).toBe(true);
} finally {
@@ -109,6 +123,7 @@ describe("config", () => {
try {
process.env["LIBSCOPE_LLM_PROVIDER"] = "ollama";
process.env["LIBSCOPE_LLM_MODEL"] = "llama3";
+ invalidateConfigCache();
const config = loadConfig();
expect(config.llm?.provider).toBe("ollama");
expect(config.llm?.model).toBe("llama3");
diff --git a/tests/unit/connectors-config.test.ts b/tests/unit/connectors-config.test.ts
index 28d15dc..3bfa6ac 100644
--- a/tests/unit/connectors-config.test.ts
+++ b/tests/unit/connectors-config.test.ts
@@ -144,6 +144,17 @@ describe("connectors config", () => {
expect(result).toBe(true);
expect(loadDbConnectorConfig(db, "notion")).toBeUndefined();
});
+
+ it("loadDbConnectorConfig throws ConfigError when config_json is corrupted", () => {
+ // Directly insert corrupted JSON into the database
+ db.prepare(
+ "INSERT INTO connector_configs (type, config_json, updated_at) VALUES (?, ?, datetime('now'))",
+ ).run("corrupted", "not valid json{{{");
+
+ expect(() => loadDbConnectorConfig(db, "corrupted")).toThrow(
+ /Corrupted connector config for type "corrupted"/,
+ );
+ });
});
describe("sync tracker", () => {
diff --git a/tests/unit/http-utils.test.ts b/tests/unit/http-utils.test.ts
index fa8d745..7658343 100644
--- a/tests/unit/http-utils.test.ts
+++ b/tests/unit/http-utils.test.ts
@@ -108,7 +108,7 @@ describe("fetchWithRetry", () => {
baseDelay: 10,
}),
).rejects.toThrow(FetchError);
- expect(mockFetch).toHaveBeenCalledTimes(2);
+ expect(mockFetch).toHaveBeenCalledTimes(3); // 1 initial + 2 retries
vi.useFakeTimers();
});
diff --git a/tests/unit/link-extractor.test.ts b/tests/unit/link-extractor.test.ts
new file mode 100644
index 0000000..f623eba
--- /dev/null
+++ b/tests/unit/link-extractor.test.ts
@@ -0,0 +1,155 @@
+import { describe, it, expect } from "vitest";
+import { extractLinks } from "../../src/core/link-extractor.js";
+
+const BASE = "https://example.com/docs/intro";
+
+describe("extractLinks", () => {
+ it("extracts absolute http links", () => {
+    const html = `<a href="https://example.com/page">link</a>`;
+ expect(extractLinks(html, BASE)).toEqual(["https://example.com/page"]);
+ });
+
+ it("resolves relative links against base URL", () => {
+    const html = `<a href="../guide">guide</a>`;
+ const links = extractLinks(html, BASE);
+ expect(links).toEqual(["https://example.com/guide"]);
+ });
+
+ it("resolves root-relative links", () => {
+    const html = `<a href="/about">about</a>`;
+ expect(extractLinks(html, BASE)).toEqual(["https://example.com/about"]);
+ });
+
+ it("strips fragment-only links", () => {
+    const html = `<a href="#section">jump</a>`;
+ expect(extractLinks(html, BASE)).toEqual([]);
+ });
+
+ it("strips fragments from full URLs", () => {
+    const html = `<a href="https://example.com/page#top">link</a>`;
+ expect(extractLinks(html, BASE)).toEqual(["https://example.com/page"]);
+ });
+
+ it("deduplicates links", () => {
+ const html = `
+      <a href="https://example.com/page">first</a>
+      <a href="https://example.com/page">second</a>
+ `;
+ expect(extractLinks(html, BASE)).toEqual(["https://example.com/page"]);
+ });
+
+ it("deduplicates after fragment stripping", () => {
+ const html = `
+ a
+ b
+ `;
+ expect(extractLinks(html, BASE)).toEqual(["https://example.com/page"]);
+ });
+
+ it("filters out mailto: links", () => {
+    const html = `<a href="mailto:user@example.com">email</a>`;
+ expect(extractLinks(html, BASE)).toEqual([]);
+ });
+
+ it("filters out javascript: links", () => {
+    const html = `<a href="javascript:void(0)">noop</a>`;
+ expect(extractLinks(html, BASE)).toEqual([]);
+ });
+
+ it("filters out tel: links", () => {
+    const html = `<a href="tel:+15551234567">call</a>`;
+ expect(extractLinks(html, BASE)).toEqual([]);
+ });
+
+ it("filters out ftp: links", () => {
+    const html = `<a href="ftp://example.com/file">ftp</a>`;
+ expect(extractLinks(html, BASE)).toEqual([]);
+ });
+
+ it("filters out data: links", () => {
+    const html = `<a href="data:text/plain,hello">data</a>`;
+ expect(extractLinks(html, BASE)).toEqual([]);
+ });
+
+ it("handles single-quoted href attributes", () => {
+    const html = `<a href='https://example.com/single'>link</a>`;
+ expect(extractLinks(html, BASE)).toEqual(["https://example.com/single"]);
+ });
+
+ it("handles unquoted href attributes", () => {
+    const html = `<a href=https://example.com/noquote>link</a>`;
+ expect(extractLinks(html, BASE)).toEqual(["https://example.com/noquote"]);
+ });
+
+  it("ignores tags that aren't <a>", () => {
+    const html = `
+      <link href="https://example.com/styles.css" rel="stylesheet">
+      <img src="https://example.com/image.png">
+      <a href="https://example.com/real">real</a>
+    `;
+ expect(extractLinks(html, BASE)).toEqual(["https://example.com/real"]);
+ });
+
+ it("handles tags with extra attributes", () => {
+    const html = `<a class="nav" id="x" href="https://example.com/page" target="_blank">link</a>`;
+ expect(extractLinks(html, BASE)).toEqual(["https://example.com/page"]);
+ });
+
+ it("handles href before other attributes", () => {
+    const html = `<a href="https://example.com/page" class="nav" target="_blank">link</a>`;
+ expect(extractLinks(html, BASE)).toEqual(["https://example.com/page"]);
+ });
+
+ it("strips trailing slash from non-root paths", () => {
+    const html = `<a href="https://example.com/docs/">docs</a>`;
+ expect(extractLinks(html, BASE)).toEqual(["https://example.com/docs"]);
+ });
+
+ it("preserves trailing slash on root path", () => {
+    const html = `<a href="https://example.com/">home</a>`;
+ expect(extractLinks(html, BASE)).toEqual(["https://example.com/"]);
+ });
+
+ it("returns empty array for HTML with no links", () => {
+ const html = `No links here at all.
`;
+ expect(extractLinks(html, BASE)).toEqual([]);
+ });
+
+ it("returns empty array for empty string", () => {
+ expect(extractLinks("", BASE)).toEqual([]);
+ });
+
+ it("handles multiple links preserving discovery order", () => {
+ const html = `
+      <a href="https://example.com/a">a</a>
+      <a href="https://example.com/b">b</a>
+      <a href="https://example.com/c">c</a>
+ `;
+ expect(extractLinks(html, BASE)).toEqual([
+ "https://example.com/a",
+ "https://example.com/b",
+ "https://example.com/c",
+ ]);
+ });
+
+ it("handles malformed href gracefully", () => {
+    const html = `<a href="http://">bad</a>`;
+ // Should not throw; just skip
+ expect(() => extractLinks(html, BASE)).not.toThrow();
+ });
+
+  it("skips <area> and <base> tags (not <a>)", () => {
+ const html = `X`;
+ expect(extractLinks(html, BASE)).toEqual([]);
+ });
+
+ it("handles https links alongside http", () => {
+ const html = `
+      <a href="http://example.com/http">http</a>
+      <a href="https://example.com/https">https</a>
+ `;
+ const links = extractLinks(html, BASE);
+ expect(links).toContain("http://example.com/http");
+ expect(links).toContain("https://example.com/https");
+ });
+});
diff --git a/tests/unit/mcp-server.test.ts b/tests/unit/mcp-server.test.ts
new file mode 100644
index 0000000..1e0f65a
--- /dev/null
+++ b/tests/unit/mcp-server.test.ts
@@ -0,0 +1,209 @@
+import { describe, it, expect, beforeEach, afterEach, vi } from "vitest";
+import { createTestDbWithVec } from "../fixtures/test-db.js";
+import { MockEmbeddingProvider } from "../fixtures/mock-provider.js";
+import { initLogger } from "../../src/logger.js";
+import { errorResponse, withErrorHandling, type ToolResult } from "../../src/mcp/errors.js";
+import { LibScopeError, ValidationError, DocumentNotFoundError } from "../../src/errors.js";
+import type Database from "better-sqlite3";
+
+describe("MCP server helpers", () => {
+ beforeEach(() => {
+ initLogger("silent");
+ });
+
+ describe("errorResponse", () => {
+ it("returns isError: true with text content", () => {
+ const result = errorResponse(new Error("something went wrong"));
+ expect(result.isError).toBe(true);
+ expect(result.content).toHaveLength(1);
+ expect(result.content[0]!.type).toBe("text");
+ });
+
+ it("formats LibScopeError using just the message", () => {
+ const result = errorResponse(new ValidationError("invalid input"));
+ expect(result.content[0]!.text).toBe("Error: invalid input");
+ });
+
+ it("formats a generic Error using name: message", () => {
+ const err = new TypeError("bad type");
+ const result = errorResponse(err);
+ expect(result.content[0]!.text).toBe("Error: TypeError: bad type");
+ });
+
+ it("formats non-Error values using String()", () => {
+ const result = errorResponse("raw string error");
+ expect(result.content[0]!.text).toContain("raw string error");
+ });
+
+ it("formats null/undefined without throwing", () => {
+ expect(() => errorResponse(null)).not.toThrow();
+ expect(() => errorResponse(undefined)).not.toThrow();
+ });
+ });
+
+ describe("withErrorHandling", () => {
+ it("returns the handler result when no error is thrown", async () => {
+ const expected: ToolResult = { content: [{ type: "text", text: "ok" }] };
+ const wrapped = withErrorHandling(() => expected);
+ const result = await wrapped({});
+ expect(result).toEqual(expected);
+ });
+
+ it("catches synchronous throws and returns an error response", async () => {
+ const wrapped = withErrorHandling(() => {
+ throw new ValidationError("bad input");
+ });
+ const result = await wrapped({});
+ expect(result.isError).toBe(true);
+ expect(result.content[0]!.text).toContain("bad input");
+ });
+
+ it("catches rejected promises and returns an error response", async () => {
+ const wrapped = withErrorHandling(() => {
+ return Promise.reject(new DocumentNotFoundError("doc-123"));
+ });
+ const result = await wrapped({});
+ expect(result.isError).toBe(true);
+ });
+
+ it("passes params to the inner handler", async () => {
+ const handler = vi.fn().mockReturnValue({ content: [{ type: "text", text: "done" }] });
+ const wrapped = withErrorHandling(handler);
+ const params = { docId: "abc", query: "test" };
+ await wrapped(params);
+ expect(handler).toHaveBeenCalledWith(params);
+ });
+
+ it("returns isError: true for LibScopeError subclasses", async () => {
+ const wrapped = withErrorHandling(() => {
+ throw new LibScopeError("base lib error");
+ });
+ const result = await wrapped({});
+ expect(result.isError).toBe(true);
+ expect(result.content[0]!.text).toBe("Error: base lib error");
+ });
+ });
+});
+
+// Integration-style tests for MCP tool behaviors using the underlying core functions
+// These verify the business logic that MCP tools delegate to.
+describe("MCP tool business logic", () => {
+ let db: Database.Database;
+ let provider: MockEmbeddingProvider;
+
+ beforeEach(() => {
+ initLogger("silent");
+ db = createTestDbWithVec();
+ provider = new MockEmbeddingProvider();
+ });
+
+ afterEach(() => {
+ db.close();
+ });
+
+ it("search returns empty response when no documents are indexed", async () => {
+ const { searchDocuments } = await import("../../src/core/search.js");
+ const { results, totalCount } = await searchDocuments(db, provider, { query: "anything" });
+ expect(results).toHaveLength(0);
+ expect(totalCount).toBe(0);
+ });
+
+ it("indexDocument then getDocument returns indexed document", async () => {
+ const { indexDocument } = await import("../../src/core/indexing.js");
+ const { getDocument } = await import("../../src/core/documents.js");
+
+ const indexed = await indexDocument(db, provider, {
+ title: "Test Doc",
+ content: "Some content for testing.",
+ sourceType: "manual",
+ });
+
+ expect(indexed.id).toBeTruthy();
+
+ const fetched = getDocument(db, indexed.id);
+ expect(fetched.title).toBe("Test Doc");
+ expect(fetched.content).toBe("Some content for testing.");
+ });
+
+ it("deleteDocument removes a document", async () => {
+ const { indexDocument } = await import("../../src/core/indexing.js");
+ const { deleteDocument, getDocument } = await import("../../src/core/documents.js");
+
+ const indexed = await indexDocument(db, provider, {
+ title: "Delete Me",
+ content: "This will be deleted.",
+ sourceType: "manual",
+ });
+
+ deleteDocument(db, indexed.id);
+
+ expect(() => getDocument(db, indexed.id)).toThrow(DocumentNotFoundError);
+ });
+
+ it("listDocuments returns paginated documents", async () => {
+ const { indexDocument } = await import("../../src/core/indexing.js");
+ const { listDocuments } = await import("../../src/core/documents.js");
+
+ await indexDocument(db, provider, {
+ title: "Doc A",
+ content: "Content A",
+ sourceType: "library",
+ library: "react",
+ });
+ await indexDocument(db, provider, {
+ title: "Doc B",
+ content: "Content B",
+ sourceType: "library",
+ library: "vue",
+ });
+
+ const all = listDocuments(db, {});
+ expect(all.length).toBeGreaterThanOrEqual(2);
+
+ const limited = listDocuments(db, { limit: 1 });
+ expect(limited).toHaveLength(1);
+ });
+
+ it("getDocumentRatings returns zero ratings for new document", async () => {
+ const { indexDocument } = await import("../../src/core/indexing.js");
+ const { getDocumentRatings } = await import("../../src/core/ratings.js");
+
+ const indexed = await indexDocument(db, provider, {
+ title: "Rate Me",
+ content: "Rateable content.",
+ sourceType: "manual",
+ });
+
+ const ratings = getDocumentRatings(db, indexed.id);
+ expect(ratings.totalRatings).toBe(0);
+ expect(ratings.averageRating).toBe(0);
+ });
+
+ it("rateDocument stores a rating and updates average", async () => {
+ const { indexDocument } = await import("../../src/core/indexing.js");
+ const { rateDocument, getDocumentRatings } = await import("../../src/core/ratings.js");
+
+ const indexed = await indexDocument(db, provider, {
+ title: "Rate Me",
+ content: "Rateable content.",
+ sourceType: "manual",
+ });
+
+ rateDocument(db, { documentId: indexed.id, rating: 4, feedback: "good doc" });
+ const ratings = getDocumentRatings(db, indexed.id);
+ expect(ratings.totalRatings).toBe(1);
+ expect(ratings.averageRating).toBe(4);
+ });
+
+ it("listTopics returns empty array when no topics exist", async () => {
+ const { listTopics } = await import("../../src/core/topics.js");
+ const topics = listTopics(db);
+ expect(topics).toEqual([]);
+ });
+
+ it("errorResponse for DocumentNotFoundError returns proper message", () => {
+ const result = errorResponse(new DocumentNotFoundError("missing-id"));
+ expect(result.isError).toBe(true);
+ expect(result.content[0]!.text).toContain("missing-id");
+ });
+});
diff --git a/tests/unit/packs.test.ts b/tests/unit/packs.test.ts
index 90edf79..45bf0f5 100644
--- a/tests/unit/packs.test.ts
+++ b/tests/unit/packs.test.ts
@@ -1,7 +1,8 @@
-import { describe, it, expect, beforeEach, afterEach } from "vitest";
-import { writeFileSync, existsSync, mkdtempSync } from "node:fs";
+import { describe, it, expect, vi, beforeEach, afterEach } from "vitest";
+import { writeFileSync, existsSync, mkdtempSync, readFileSync } from "node:fs";
import { join } from "node:path";
import { tmpdir } from "node:os";
+import { gzipSync, gunzipSync } from "node:zlib";
import type Database from "better-sqlite3";
import { createTestDbWithVec } from "../fixtures/test-db.js";
import { MockEmbeddingProvider } from "../fixtures/mock-provider.js";
@@ -11,6 +12,7 @@ import {
listInstalledPacks,
createPack,
listAvailablePacks,
+ createPackFromSource,
} from "../../src/core/packs.js";
import type { KnowledgePack } from "../../src/core/packs.js";
import { indexDocument } from "../../src/core/indexing.js";
@@ -443,4 +445,626 @@ describe("knowledge packs", () => {
);
});
});
+
+ describe("createPackFromSource", () => {
+ let sourceDir: string;
+
+ beforeEach(() => {
+ sourceDir = mkdtempSync(join(tmpdir(), "libscope-pack-source-"));
+ });
+
+ it("should create a pack from a folder of markdown files", async () => {
+ writeFileSync(join(sourceDir, "guide.md"), "# Guide\n\nThis is a guide.");
+ writeFileSync(join(sourceDir, "api.md"), "# API\n\nEndpoint reference.");
+
+ const pack = await createPackFromSource({
+ name: "test-from-folder",
+ from: [sourceDir],
+ });
+
+ expect(pack.name).toBe("test-from-folder");
+ expect(pack.documents).toHaveLength(2);
+ expect(pack.documents.map((d) => d.title).sort()).toEqual(["api", "guide"]);
+ expect(pack.documents[0]!.content).toBeTruthy();
+ expect(pack.documents[0]!.source).toMatch(/^file:\/\//);
+ expect(pack.version).toBe("1.0.0");
+ expect(pack.metadata.author).toBe("libscope");
+ });
+
+ it("should write pack to outputPath", async () => {
+ writeFileSync(join(sourceDir, "doc.md"), "# Doc\n\nContent here.");
+ const outputPath = join(tempDir, "output-pack.json");
+
+ const pack = await createPackFromSource({
+ name: "output-test",
+ from: [sourceDir],
+ outputPath,
+ });
+
+ expect(existsSync(outputPath)).toBe(true);
+ const written = JSON.parse(readFileSync(outputPath, "utf-8")) as KnowledgePack;
+ expect(written.name).toBe("output-test");
+ expect(written.documents).toHaveLength(1);
+ expect(pack.documents).toHaveLength(1);
+ });
+
+ it("should filter by extensions", async () => {
+ writeFileSync(join(sourceDir, "readme.md"), "# Readme");
+    writeFileSync(join(sourceDir, "page.html"), "<h1>Page</h1><p>Content</p>");
+ writeFileSync(join(sourceDir, "data.json"), '{"key": "value"}');
+
+ const pack = await createPackFromSource({
+ name: "ext-filter",
+ from: [sourceDir],
+ extensions: [".md"],
+ });
+
+ expect(pack.documents).toHaveLength(1);
+ expect(pack.documents[0]!.title).toBe("readme");
+ });
+
+ it("should handle extensions without leading dot", async () => {
+ writeFileSync(join(sourceDir, "readme.md"), "# Readme\n\nContent");
+
+ const pack = await createPackFromSource({
+ name: "ext-no-dot",
+ from: [sourceDir],
+ extensions: ["md"],
+ });
+
+ expect(pack.documents).toHaveLength(1);
+ });
+
+ it("should exclude files matching patterns", async () => {
+ writeFileSync(join(sourceDir, "guide.md"), "# Guide\n\nContent");
+ writeFileSync(join(sourceDir, "draft.md"), "# Draft\n\nNot ready");
+
+ const pack = await createPackFromSource({
+ name: "exclude-test",
+ from: [sourceDir],
+ exclude: ["draft.md"],
+ });
+
+ expect(pack.documents).toHaveLength(1);
+ expect(pack.documents[0]!.title).toBe("guide");
+ });
+
+ it("should recurse into subdirectories by default", async () => {
+ const { mkdirSync } = await import("node:fs");
+ const subDir = join(sourceDir, "sub");
+ mkdirSync(subDir);
+ writeFileSync(join(sourceDir, "root.md"), "# Root");
+ writeFileSync(join(subDir, "nested.md"), "# Nested\n\nDeep content");
+
+ const pack = await createPackFromSource({
+ name: "recursive-test",
+ from: [sourceDir],
+ });
+
+ expect(pack.documents).toHaveLength(2);
+ expect(pack.documents.map((d) => d.title).sort()).toEqual(["nested", "root"]);
+ });
+
+ it("should not recurse when recursive is false", async () => {
+ const { mkdirSync } = await import("node:fs");
+ const subDir = join(sourceDir, "sub");
+ mkdirSync(subDir);
+ writeFileSync(join(sourceDir, "root.md"), "# Root");
+ writeFileSync(join(subDir, "nested.md"), "# Nested");
+
+ const pack = await createPackFromSource({
+ name: "no-recurse",
+ from: [sourceDir],
+ recursive: false,
+ });
+
+ expect(pack.documents).toHaveLength(1);
+ expect(pack.documents[0]!.title).toBe("root");
+ });
+
+ it("should throw for empty pack name", async () => {
+ await expect(createPackFromSource({ name: " ", from: [sourceDir] })).rejects.toThrow(
+ /Pack name is required/,
+ );
+ });
+
+ it("should throw for empty from array", async () => {
+ await expect(createPackFromSource({ name: "test", from: [] })).rejects.toThrow(
+ /At least one --from source is required/,
+ );
+ });
+
+ it("should throw for non-existent source path", async () => {
+ await expect(
+ createPackFromSource({ name: "test", from: ["/nonexistent/path/xyz"] }),
+ ).rejects.toThrow(/does not exist/);
+ });
+
+ it("should throw when no documents could be created", async () => {
+ // Empty directory — no parseable files
+ await expect(createPackFromSource({ name: "empty", from: [sourceDir] })).rejects.toThrow(
+ /No documents could be created/,
+ );
+ });
+
+ it("should skip files without a parser", async () => {
+ writeFileSync(join(sourceDir, "data.bin"), "binary stuff");
+ writeFileSync(join(sourceDir, "readme.md"), "# Readme\n\nContent");
+
+ const pack = await createPackFromSource({
+ name: "skip-unsupported",
+ from: [sourceDir],
+ });
+
+ expect(pack.documents).toHaveLength(1);
+ expect(pack.documents[0]!.title).toBe("readme");
+ });
+
+ it("should skip files with empty content after parsing", async () => {
+ writeFileSync(join(sourceDir, "empty.md"), " ");
+ writeFileSync(join(sourceDir, "real.md"), "# Real\n\nActual content");
+
+ const pack = await createPackFromSource({
+ name: "skip-empty",
+ from: [sourceDir],
+ });
+
+ expect(pack.documents).toHaveLength(1);
+ expect(pack.documents[0]!.title).toBe("real");
+ });
+
+ it("should accept a single file as source", async () => {
+ const filePath = join(sourceDir, "single.md");
+ writeFileSync(filePath, "# Single File\n\nJust one file.");
+
+ const pack = await createPackFromSource({
+ name: "single-file",
+ from: [filePath],
+ });
+
+ expect(pack.documents).toHaveLength(1);
+ expect(pack.documents[0]!.title).toBe("single");
+ });
+
+ it("should accept multiple sources", async () => {
+ const dir2 = mkdtempSync(join(tmpdir(), "libscope-pack-source2-"));
+ writeFileSync(join(sourceDir, "a.md"), "# A\n\nFrom dir 1");
+ writeFileSync(join(dir2, "b.md"), "# B\n\nFrom dir 2");
+
+ const pack = await createPackFromSource({
+ name: "multi-source",
+ from: [sourceDir, dir2],
+ });
+
+ expect(pack.documents).toHaveLength(2);
+ });
+
+ it("should call onProgress callback", async () => {
+ writeFileSync(join(sourceDir, "a.md"), "# A");
+ writeFileSync(join(sourceDir, "b.md"), "# B");
+
+ const progress: Array<{ file: string; index: number; total: number }> = [];
+
+ await createPackFromSource({
+ name: "progress-test",
+ from: [sourceDir],
+ onProgress: (info) => progress.push(info),
+ });
+
+ expect(progress).toHaveLength(2);
+ expect(progress[0]!.index).toBe(0);
+ expect(progress[0]!.total).toBe(2);
+ expect(progress[1]!.index).toBe(1);
+ });
+
+ it("should set custom version, description, author", async () => {
+ writeFileSync(join(sourceDir, "doc.md"), "# Doc\n\nContent");
+
+ const pack = await createPackFromSource({
+ name: "custom-meta",
+ from: [sourceDir],
+ version: "2.0.0",
+ description: "Custom desc",
+ author: "Test Author",
+ });
+
+ expect(pack.version).toBe("2.0.0");
+ expect(pack.description).toBe("Custom desc");
+ expect(pack.metadata.author).toBe("Test Author");
+ });
+
+ it("should produce a valid pack that passes validatePack", async () => {
+ writeFileSync(join(sourceDir, "doc.md"), "# Doc\n\nSome content here");
+ const outputPath = join(tempDir, "validate-test.json");
+
+ await createPackFromSource({
+ name: "validate-test",
+ from: [sourceDir],
+ outputPath,
+ });
+
+ // Read and re-validate through installPack (which calls validatePack internally)
+ const result = await installPack(db, provider, outputPath);
+ expect(result.packName).toBe("validate-test");
+ expect(result.documentsInstalled).toBe(1);
+ });
+
+ it("should handle HTML files", async () => {
+ writeFileSync(
+ join(sourceDir, "page.html"),
+      "<html><head><title>Test</title></head><body><h1>Hello</h1><p>World</p></body></html>",
+ );
+
+ const pack = await createPackFromSource({
+ name: "html-test",
+ from: [sourceDir],
+ });
+
+ expect(pack.documents).toHaveLength(1);
+ expect(pack.documents[0]!.title).toBe("page");
+ expect(pack.documents[0]!.content).toContain("Hello");
+ expect(pack.documents[0]!.content).toContain("World");
+ });
+
+ it("should exclude with wildcard patterns", async () => {
+ const { mkdirSync } = await import("node:fs");
+ const assetsDir = join(sourceDir, "assets");
+ mkdirSync(assetsDir);
+ writeFileSync(join(sourceDir, "readme.md"), "# Readme\n\nContent");
+ writeFileSync(join(assetsDir, "data.md"), "# Asset data");
+
+ const pack = await createPackFromSource({
+ name: "wildcard-exclude",
+ from: [sourceDir],
+ exclude: ["assets/**"],
+ });
+
+ expect(pack.documents).toHaveLength(1);
+ expect(pack.documents[0]!.title).toBe("readme");
+ });
+
+ it("should write gzipped pack when output ends in .gz", async () => {
+ writeFileSync(join(sourceDir, "doc.md"), "# Doc\n\nContent here.");
+ const outputPath = join(tempDir, "test.json.gz");
+
+ await createPackFromSource({
+ name: "gzip-test",
+ from: [sourceDir],
+ outputPath,
+ });
+
+ expect(existsSync(outputPath)).toBe(true);
+ const raw = readFileSync(outputPath);
+ // Verify gzip magic bytes
+ expect(raw[0]).toBe(0x1f);
+ expect(raw[1]).toBe(0x8b);
+ // Decompress and verify JSON
+ const json = gunzipSync(raw).toString("utf-8");
+ const parsed = JSON.parse(json) as KnowledgePack;
+ expect(parsed.name).toBe("gzip-test");
+ expect(parsed.documents).toHaveLength(1);
+ });
+
+ it("should write plain JSON when output ends in .json", async () => {
+ writeFileSync(join(sourceDir, "doc.md"), "# Doc\n\nContent here.");
+ const outputPath = join(tempDir, "test.json");
+
+ await createPackFromSource({
+ name: "json-test",
+ from: [sourceDir],
+ outputPath,
+ });
+
+ const raw = readFileSync(outputPath, "utf-8");
+ const parsed = JSON.parse(raw) as KnowledgePack;
+ expect(parsed.name).toBe("json-test");
+ });
+ });
+
+ describe("gzip pack install", () => {
+ it("should install a gzipped pack file", async () => {
+ const pack = makeSamplePack({ name: "gz-pack" });
+ const packPath = join(tempDir, "gz-pack.json.gz");
+ writeFileSync(packPath, gzipSync(Buffer.from(JSON.stringify(pack), "utf-8")));
+
+ const result = await installPack(db, provider, packPath);
+
+ expect(result.packName).toBe("gz-pack");
+ expect(result.documentsInstalled).toBe(2);
+ expect(result.alreadyInstalled).toBe(false);
+ });
+
+ it("should auto-detect gzip by magic bytes even with .json extension", async () => {
+ const pack = makeSamplePack({ name: "magic-detect" });
+ const packPath = join(tempDir, "magic-detect.json");
+ // Write gzipped content but with .json extension
+ writeFileSync(packPath, gzipSync(Buffer.from(JSON.stringify(pack), "utf-8")));
+
+ const result = await installPack(db, provider, packPath);
+
+ expect(result.packName).toBe("magic-detect");
+ expect(result.documentsInstalled).toBe(2);
+ });
+
+ it("should round-trip: create gzipped pack from source then install it", async () => {
+ const rtDir = mkdtempSync(join(tmpdir(), "libscope-pack-rt-"));
+ writeFileSync(join(rtDir, "guide.md"), "# Guide\n\nThis is a guide.");
+ const packPath = join(tempDir, "roundtrip.json.gz");
+
+ await createPackFromSource({
+ name: "roundtrip-pack",
+ from: [rtDir],
+ outputPath: packPath,
+ });
+
+ const result = await installPack(db, provider, packPath);
+ expect(result.packName).toBe("roundtrip-pack");
+ expect(result.documentsInstalled).toBe(1);
+ });
+ });
+
+ describe("installPack — batch & progress options", () => {
+ it("should report progress via onProgress callback", async () => {
+ const pack = makeSamplePack();
+ const packPath = join(tempDir, "progress-pack.json");
+ writeFileSync(packPath, JSON.stringify(pack), "utf-8");
+
+ const calls: Array<{ current: number; total: number; label: string }> = [];
+ await installPack(db, provider, packPath, {
+ onProgress: (current, total, label) => {
+ calls.push({ current, total, label });
+ },
+ });
+
+ // Should have called onProgress at least once (one batch covering both docs)
+ expect(calls.length).toBeGreaterThan(0);
+ // Last call should report all docs processed
+ const last = calls[calls.length - 1]!;
+ expect(last.current).toBe(2);
+ expect(last.total).toBe(2);
+ });
+
+ it("should process in smaller batches when batchSize=1", async () => {
+ const pack = makeSamplePack();
+ const packPath = join(tempDir, "batch1-pack.json");
+ writeFileSync(packPath, JSON.stringify(pack), "utf-8");
+
+ const calls: number[] = [];
+ await installPack(db, provider, packPath, {
+ batchSize: 1,
+ onProgress: (current) => calls.push(current),
+ });
+
+ // With batchSize=1 and 2 docs, should get 2 progress calls
+ expect(calls).toEqual([1, 2]);
+ });
+
+ it("should skip documents when resumeFrom is set", async () => {
+ const pack = makeSamplePack({
+ name: "resume-pack",
+ documents: [
+ { title: "Doc 1", content: "Content one", source: "" },
+ { title: "Doc 2", content: "Content two", source: "" },
+ { title: "Doc 3", content: "Content three", source: "" },
+ ],
+ });
+ const packPath = join(tempDir, "resume-pack.json");
+ writeFileSync(packPath, JSON.stringify(pack), "utf-8");
+
+ const result = await installPack(db, provider, packPath, { resumeFrom: 2 });
+
+ // Should only install doc 3 (skipped first 2)
+ expect(result.documentsInstalled).toBe(1);
+ expect(result.packName).toBe("resume-pack");
+ });
+
+ it("should count errors when embedBatch fails", async () => {
+ const pack = makeSamplePack({ name: "err-pack" });
+ const packPath = join(tempDir, "err-pack.json");
+ writeFileSync(packPath, JSON.stringify(pack), "utf-8");
+
+ const failProvider = new MockEmbeddingProvider();
+ failProvider.embedBatch = vi.fn().mockRejectedValue(new Error("embed failed"));
+
+ const result = await installPack(db, failProvider, packPath);
+
+ // embedBatch failure means documents in that batch are skipped
+ expect(result.errors).toBeGreaterThan(0);
+ expect(result.documentsInstalled).toBe(0);
+ });
+
+ it("should include errors=0 on successful install", async () => {
+ const pack = makeSamplePack({ name: "ok-pack" });
+ const packPath = join(tempDir, "ok-pack.json");
+ writeFileSync(packPath, JSON.stringify(pack), "utf-8");
+
+ const result = await installPack(db, provider, packPath);
+
+ expect(result.errors).toBe(0);
+ expect(result.documentsInstalled).toBe(2);
+ });
+
+ it("should use a single embedBatch call per batch for efficiency", async () => {
+ const pack = makeSamplePack({ name: "batch-efficiency" });
+ const packPath = join(tempDir, "batch-eff.json");
+ writeFileSync(packPath, JSON.stringify(pack), "utf-8");
+
+ await installPack(db, provider, packPath, { batchSize: 10 });
+
+ // 2 docs in one batch → 1 embedBatch call
+ expect(provider.embedBatchCallCount).toBe(1);
+ });
+
+ it("should return errors=0 for already-installed pack", async () => {
+ const pack = makeSamplePack({ name: "already-pack" });
+ const packPath = join(tempDir, "already-pack.json");
+ writeFileSync(packPath, JSON.stringify(pack), "utf-8");
+
+ await installPack(db, provider, packPath);
+ const result = await installPack(db, provider, packPath);
+
+ expect(result.alreadyInstalled).toBe(true);
+ expect(result.errors).toBe(0);
+ });
+ });
+
+ describe("installPack — concurrency option", () => {
+ it("should install all docs correctly with concurrency=1 (sequential)", async () => {
+ const pack = makeSamplePack({
+ name: "concurrent-1",
+ documents: [
+ { title: "Doc A", content: "# Doc A\n\nContent A.", source: "" },
+ { title: "Doc B", content: "# Doc B\n\nContent B.", source: "" },
+ { title: "Doc C", content: "# Doc C\n\nContent C.", source: "" },
+ ],
+ });
+ const packPath = join(tempDir, "concurrent-1.json");
+ writeFileSync(packPath, JSON.stringify(pack), "utf-8");
+
+ const result = await installPack(db, provider, packPath, { batchSize: 1, concurrency: 1 });
+
+ expect(result.documentsInstalled).toBe(3);
+ expect(result.errors).toBe(0);
+ });
+
+ it("should install all docs correctly with concurrency=4 (parallel)", async () => {
+ const pack = makeSamplePack({
+ name: "concurrent-4",
+ documents: [
+ { title: "Doc A", content: "# Doc A\n\nContent A.", source: "" },
+ { title: "Doc B", content: "# Doc B\n\nContent B.", source: "" },
+ { title: "Doc C", content: "# Doc C\n\nContent C.", source: "" },
+ { title: "Doc D", content: "# Doc D\n\nContent D.", source: "" },
+ ],
+ });
+ const packPath = join(tempDir, "concurrent-4.json");
+ writeFileSync(packPath, JSON.stringify(pack), "utf-8");
+
+ const result = await installPack(db, provider, packPath, { batchSize: 1, concurrency: 4 });
+
+ expect(result.documentsInstalled).toBe(4);
+ expect(result.errors).toBe(0);
+
+ // Verify all 4 docs are in the DB
+ const docs = db
+ .prepare("SELECT id FROM documents WHERE pack_name = ?")
+ .all("concurrent-4") as Array<{ id: string }>;
+ expect(docs.length).toBe(4);
+ });
+
+ it("should make multiple embedBatch calls with small batchSize and high concurrency", async () => {
+ const pack = makeSamplePack({
+ name: "multi-batch",
+ documents: [
+ { title: "Doc 1", content: "Content 1", source: "" },
+ { title: "Doc 2", content: "Content 2", source: "" },
+ { title: "Doc 3", content: "Content 3", source: "" },
+ { title: "Doc 4", content: "Content 4", source: "" },
+ ],
+ });
+ const packPath = join(tempDir, "multi-batch.json");
+ writeFileSync(packPath, JSON.stringify(pack), "utf-8");
+
+ await installPack(db, provider, packPath, { batchSize: 2, concurrency: 2 });
+
+ // 4 docs with batchSize=2 → 2 batches → 2 embedBatch calls
+ expect(provider.embedBatchCallCount).toBe(2);
+ });
+
+ it("should not exceed concurrency limit for embed calls", async () => {
+ // Track the maximum number of concurrent embedBatch calls in flight
+ let maxConcurrent = 0;
+ let activeCalls = 0;
+ let totalCalls = 0;
+
+ const trackingProvider = new MockEmbeddingProvider();
+ trackingProvider.embedBatch = vi.fn().mockImplementation((texts: string[]) => {
+ totalCalls++;
+ activeCalls++;
+ maxConcurrent = Math.max(maxConcurrent, activeCalls);
+ // Simulate slight async delay so concurrent calls can overlap
+ return Promise.resolve().then(() => {
+ activeCalls--;
+ return texts.map(() => [0.1, 0.2, 0.3, 0.4]);
+ });
+ });
+
+ const pack = makeSamplePack({
+ name: "concurrency-limit",
+ documents: Array.from({ length: 8 }, (_, i) => ({
+ title: `Doc ${i}`,
+ content: `Content ${i}`,
+ source: "",
+ })),
+ });
+ const packPath = join(tempDir, "concurrency-limit.json");
+ writeFileSync(packPath, JSON.stringify(pack), "utf-8");
+
+ await installPack(db, trackingProvider, packPath, { batchSize: 1, concurrency: 3 });
+
+ // Should never exceed the concurrency limit of 3
+ expect(maxConcurrent).toBeLessThanOrEqual(3);
+ // Should have made 8 embedBatch calls (8 docs, batchSize=1)
+ expect(totalCalls).toBe(8);
+ });
+
+ it("should report progress after each batch when embedding concurrently", async () => {
+ const pack = makeSamplePack({
+ name: "concurrent-progress",
+ documents: [
+ { title: "Doc A", content: "Content A", source: "" },
+ { title: "Doc B", content: "Content B", source: "" },
+ { title: "Doc C", content: "Content C", source: "" },
+ ],
+ });
+ const packPath = join(tempDir, "concurrent-progress.json");
+ writeFileSync(packPath, JSON.stringify(pack), "utf-8");
+
+ const calls: Array<{ current: number; total: number }> = [];
+ await installPack(db, provider, packPath, {
+ batchSize: 1,
+ concurrency: 2,
+ onProgress: (current, total) => calls.push({ current, total }),
+ });
+
+ // Should have 3 progress calls (one per batch/doc with batchSize=1)
+ expect(calls).toHaveLength(3);
+ // Final call should report all docs processed
+ expect(calls[calls.length - 1]!.current).toBe(3);
+ expect(calls[calls.length - 1]!.total).toBe(3);
+ });
+
+ it("should count errors correctly when some batches fail during concurrent embedding", async () => {
+ let callCount = 0;
+ const partialFailProvider = new MockEmbeddingProvider();
+ partialFailProvider.embedBatch = vi.fn().mockImplementation(() => {
+ callCount++;
+ if (callCount % 2 === 0) {
+ return Promise.reject(new Error("embed failed"));
+ }
+ return Promise.resolve([[0.1, 0.2, 0.3, 0.4]]);
+ });
+
+ const pack = makeSamplePack({
+ name: "partial-fail",
+ documents: [
+ { title: "Doc 1", content: "Content 1", source: "" },
+ { title: "Doc 2", content: "Content 2", source: "" },
+ { title: "Doc 3", content: "Content 3", source: "" },
+ { title: "Doc 4", content: "Content 4", source: "" },
+ ],
+ });
+ const packPath = join(tempDir, "partial-fail.json");
+ writeFileSync(packPath, JSON.stringify(pack), "utf-8");
+
+ const result = await installPack(db, partialFailProvider, packPath, {
+ batchSize: 1,
+ concurrency: 4,
+ });
+
+ // 4 docs, batchSize=1 → 4 batches; even-numbered calls fail → 2 errors, 2 installed
+ expect(result.errors).toBe(2);
+ expect(result.documentsInstalled).toBe(2);
+ });
+ });
});
diff --git a/tests/unit/parsers.test.ts b/tests/unit/parsers.test.ts
index c30a614..5fc5ab3 100644
--- a/tests/unit/parsers.test.ts
+++ b/tests/unit/parsers.test.ts
@@ -5,6 +5,7 @@ import { PlainTextParser } from "../../src/core/parsers/text.js";
import { JsonParser } from "../../src/core/parsers/json-parser.js";
import { YamlParser } from "../../src/core/parsers/yaml.js";
import { CsvParser } from "../../src/core/parsers/csv.js";
+import { HtmlParser } from "../../src/core/parsers/html.js";
import { ValidationError } from "../../src/errors.js";
describe("getParserForFile", () => {
@@ -44,6 +45,14 @@ describe("getParserForFile", () => {
expect(getParserForFile("document.docx")).not.toBeNull();
});
+ it("returns parser for .html files", () => {
+ expect(getParserForFile("page.html")).not.toBeNull();
+ });
+
+ it("returns parser for .htm files", () => {
+ expect(getParserForFile("page.htm")).not.toBeNull();
+ });
+
it("returns null for unsupported extensions", () => {
expect(getParserForFile("image.png")).toBeNull();
expect(getParserForFile("archive.zip")).toBeNull();
@@ -66,6 +75,8 @@ describe("getSupportedExtensions", () => {
expect(exts).toContain(".pdf");
expect(exts).toContain(".docx");
expect(exts).toContain(".txt");
+ expect(exts).toContain(".html");
+ expect(exts).toContain(".htm");
// Should be sorted
const sorted = [...exts].sort();
expect(exts).toEqual(sorted);
@@ -215,3 +226,78 @@ describe("WordParser", () => {
await expect(parser.parse(Buffer.from("not a docx"))).rejects.toThrow(ValidationError);
});
});
+
+describe("HtmlParser", () => {
+ const parser = new HtmlParser();
+
+ it("has .html and .htm extensions", () => {
+ expect(parser.extensions).toEqual([".html", ".htm"]);
+ });
+
+ it("converts basic HTML to markdown", async () => {
+    const html = "<h1>Hello</h1><p>This is a <strong>test</strong>.</p>";
+ const result = await parser.parse(Buffer.from(html));
+ expect(result).toContain("Hello");
+ expect(result).toContain("**test**");
+ });
+
+ it("strips script tags", async () => {
+    const html = '<p>Content</p><script>alert("hi")</script><p>More</p>';
+ const result = await parser.parse(Buffer.from(html));
+ expect(result).toContain("Content");
+ expect(result).toContain("More");
+ expect(result).not.toContain("alert");
+ expect(result).not.toContain("script");
+ });
+
+ it("strips style tags", async () => {
+    const html = "<style>body { color: red; }</style><p>Visible</p>";
+ const result = await parser.parse(Buffer.from(html));
+ expect(result).toContain("Visible");
+ expect(result).not.toContain("color");
+ });
+
+ it("strips nav tags", async () => {
+ const html =
+      '<nav><a href="/home">Home</a></nav><article><p>Article</p></article>';
+ const result = await parser.parse(Buffer.from(html));
+ expect(result).toContain("Article");
+ expect(result).not.toContain("Home");
+ });
+
+ it("handles full HTML documents with doctype and head", async () => {
+    const html = `<!DOCTYPE html>
+<html><head><title>Test Page</title><style>h1 { color: blue; }</style></head>
+<body><h1>Main Title</h1><p>Body text here.</p></body></html>`;
+ const result = await parser.parse(Buffer.from(html));
+ expect(result).toContain("Main Title");
+ expect(result).toContain("Body text here");
+ expect(result).not.toContain("color: blue");
+ });
+
+ it("converts links to markdown format", async () => {
+    const html = '<a href="https://example.com">Click here</a>';
+ const result = await parser.parse(Buffer.from(html));
+ expect(result).toContain("[Click here]");
+ expect(result).toContain("https://example.com");
+ });
+
+ it("converts lists to markdown", async () => {
+    const html = "<ul><li>One</li><li>Two</li><li>Three</li></ul>";
+ const result = await parser.parse(Buffer.from(html));
+ expect(result).toContain("One");
+ expect(result).toContain("Two");
+ expect(result).toContain("Three");
+ });
+
+ it("handles empty HTML gracefully", async () => {
+ const result = await parser.parse(Buffer.from(""));
+ expect(result).toBe("");
+ });
+
+ it("collapses excessive blank lines", async () => {
+    const html = "<p>First</p>\n\n\n\n<p>Second</p>";
+ const result = await parser.parse(Buffer.from(html));
+ expect(result).not.toMatch(/\n{3,}/);
+ });
+});
diff --git a/tests/unit/reporter.test.ts b/tests/unit/reporter.test.ts
new file mode 100644
index 0000000..b63d8d7
--- /dev/null
+++ b/tests/unit/reporter.test.ts
@@ -0,0 +1,172 @@
+import { describe, it, expect, vi, afterEach } from "vitest";
+import { isVerbose, createReporter } from "../../src/cli/reporter.js";
+
+describe("reporter", () => {
+ afterEach(() => {
+ delete process.env["LIBSCOPE_VERBOSE"];
+ vi.restoreAllMocks();
+ });
+
+ describe("isVerbose", () => {
+ it("returns true when verbose flag is set", () => {
+ expect(isVerbose(true)).toBe(true);
+ });
+
+ it("returns false when verbose flag is false", () => {
+ expect(isVerbose(false)).toBe(false);
+ });
+
+ it("returns false when verbose flag is undefined", () => {
+ expect(isVerbose(undefined)).toBe(false);
+ });
+
+ it("returns true when LIBSCOPE_VERBOSE=1 env var is set", () => {
+ process.env["LIBSCOPE_VERBOSE"] = "1";
+ expect(isVerbose(false)).toBe(true);
+ });
+
+ it("returns false when LIBSCOPE_VERBOSE=0", () => {
+ process.env["LIBSCOPE_VERBOSE"] = "0";
+ expect(isVerbose(false)).toBe(false);
+ });
+ });
+
+ describe("createReporter", () => {
+ it("returns a SilentReporter (no-op) in verbose mode", () => {
+ const reporter = createReporter(true);
+ const stdout = vi.spyOn(process.stdout, "write").mockImplementation(() => true);
+ const stderr = vi.spyOn(process.stderr, "write").mockImplementation(() => true);
+
+ reporter.log("hello");
+ reporter.success("done");
+ reporter.warn("careful");
+ reporter.error("bad");
+ reporter.progress(1, 10, "task");
+ reporter.clearProgress();
+
+ expect(stdout).not.toHaveBeenCalled();
+ expect(stderr).not.toHaveBeenCalled();
+ });
+
+ it("returns a SilentReporter when LIBSCOPE_VERBOSE=1", () => {
+ process.env["LIBSCOPE_VERBOSE"] = "1";
+ const reporter = createReporter();
+ const stdout = vi.spyOn(process.stdout, "write").mockImplementation(() => true);
+
+ reporter.log("hello");
+ expect(stdout).not.toHaveBeenCalled();
+ });
+
+ it("PrettyReporter.log writes to stdout", () => {
+ const reporter = createReporter(false);
+ const stdout = vi.spyOn(process.stdout, "write").mockImplementation(() => true);
+
+ reporter.log("test message");
+
+ expect(stdout).toHaveBeenCalledOnce();
+ expect(String(stdout.mock.calls[0]![0])).toContain("test message");
+ });
+
+ it("PrettyReporter.success writes green checkmark to stdout", () => {
+ const reporter = createReporter(false);
+ const stdout = vi.spyOn(process.stdout, "write").mockImplementation(() => true);
+
+ reporter.success("all done");
+
+ const output = String(stdout.mock.calls[0]![0]);
+ expect(output).toContain("all done");
+ // Green ANSI code
+ expect(output).toContain("\x1b[32m");
+ });
+
+ it("PrettyReporter.warn writes to stderr", () => {
+ const reporter = createReporter(false);
+ const stderr = vi.spyOn(process.stderr, "write").mockImplementation(() => true);
+
+ reporter.warn("watch out");
+
+ expect(stderr).toHaveBeenCalledOnce();
+ expect(String(stderr.mock.calls[0]![0])).toContain("watch out");
+ });
+
+ it("PrettyReporter.error writes to stderr", () => {
+ const reporter = createReporter(false);
+ const stderr = vi.spyOn(process.stderr, "write").mockImplementation(() => true);
+
+ reporter.error("something failed");
+
+ expect(stderr).toHaveBeenCalledOnce();
+ expect(String(stderr.mock.calls[0]![0])).toContain("something failed");
+ });
+
+ it("PrettyReporter.progress writes \\r-prefixed line to stdout", () => {
+ const reporter = createReporter(false);
+ const stdout = vi.spyOn(process.stdout, "write").mockImplementation(() => true);
+
+ reporter.progress(3, 10, "indexing doc");
+
+ const output = String(stdout.mock.calls[0]![0]);
+ expect(output).toMatch(/^\r/);
+ expect(output).toContain("3/10");
+ expect(output).toContain("30%");
+ });
+
+ it("PrettyReporter.clearProgress clears the progress line", () => {
+ const reporter = createReporter(false);
+ const stdout = vi.spyOn(process.stdout, "write").mockImplementation(() => true);
+
+ reporter.progress(1, 5, "working");
+ stdout.mockClear();
+
+ reporter.clearProgress();
+
+ // Should write spaces to clear the line
+ const output = String(stdout.mock.calls[0]![0]);
+ expect(output).toMatch(/^\r\s+\r$/);
+ });
+
+ it("PrettyReporter.clearProgress is a no-op when no progress shown", () => {
+ const reporter = createReporter(false);
+ const stdout = vi.spyOn(process.stdout, "write").mockImplementation(() => true);
+
+ reporter.clearProgress();
+
+ expect(stdout).not.toHaveBeenCalled();
+ });
+
+ it("PrettyReporter.log clears progress before writing", () => {
+ const reporter = createReporter(false);
+ const stdout = vi.spyOn(process.stdout, "write").mockImplementation(() => true);
+
+ reporter.progress(1, 5, "working");
+ stdout.mockClear();
+
+ reporter.log("a message");
+
+ // First call should be the clear, second the message
+ expect(stdout.mock.calls.length).toBeGreaterThanOrEqual(2);
+ const clearCall = String(stdout.mock.calls[0]![0]);
+ expect(clearCall).toMatch(/^\r\s+\r$/);
+ });
+
+ it("PrettyReporter.progress truncates long labels", () => {
+ const reporter = createReporter(false);
+ const stdout = vi.spyOn(process.stdout, "write").mockImplementation(() => true);
+
+ reporter.progress(1, 1, "a".repeat(50));
+
+ const output = String(stdout.mock.calls[0]![0]);
+ expect(output).toContain("...");
+ });
+
+ it("PrettyReporter.progress handles zero total gracefully", () => {
+ const reporter = createReporter(false);
+ const stdout = vi.spyOn(process.stdout, "write").mockImplementation(() => true);
+
+ reporter.progress(0, 0, "starting");
+
+ const output = String(stdout.mock.calls[0]![0]);
+ expect(output).toContain("0%");
+ });
+ });
+});
diff --git a/tests/unit/saved-searches.test.ts b/tests/unit/saved-searches.test.ts
index 8e20a1b..b2eb001 100644
--- a/tests/unit/saved-searches.test.ts
+++ b/tests/unit/saved-searches.test.ts
@@ -11,6 +11,9 @@ import {
import { indexDocument } from "../../src/core/indexing.js";
import { ValidationError, DocumentNotFoundError } from "../../src/errors.js";
import type Database from "better-sqlite3";
+import { initLogger } from "../../src/logger.js";
+
+initLogger("silent");
describe("saved-searches", () => {
let db: Database.Database;
@@ -191,5 +194,18 @@ describe("saved-searches", () => {
const fetched = getSavedSearch(db, created.id);
expect(fetched.filters).toBeNull();
});
+
+ it("should default to null when filters JSON is corrupted", () => {
+ // Directly insert a row with invalid JSON in the filters column
+ db.prepare("INSERT INTO saved_searches (id, name, query, filters) VALUES (?, ?, ?, ?)").run(
+ "corrupt-ss",
+ "Corrupt Search",
+ "test query",
+ "{not valid json",
+ );
+
+ const fetched = getSavedSearch(db, "corrupt-ss");
+ expect(fetched.filters).toBeNull();
+ });
});
});
diff --git a/tests/unit/spider.test.ts b/tests/unit/spider.test.ts
new file mode 100644
index 0000000..24d9348
--- /dev/null
+++ b/tests/unit/spider.test.ts
@@ -0,0 +1,497 @@
+import { describe, it, expect, vi, beforeEach, afterEach } from "vitest";
+
+// ── Mock fetchRaw so we don't make real network requests ─────────────────────
+const mockFetchRaw = vi.fn();
+vi.mock("../../src/core/url-fetcher.js", () => ({
+ fetchRaw: (...args: unknown[]): unknown => mockFetchRaw(...args),
+ DEFAULT_FETCH_OPTIONS: {
+ timeout: 30_000,
+ maxRedirects: 5,
+ maxBodySize: 10 * 1024 * 1024,
+ allowPrivateUrls: false,
+ allowSelfSignedCerts: false,
+ },
+}));
+
+// ── Import spider after mock is set up ───────────────────────────────────────
+const { spiderUrl } = await import("../../src/core/spider.js");
+
+// ── Helpers ──────────────────────────────────────────────────────────────────
+
+function htmlPage(title: string, links: string[] = [], body = ""): string {
+  const anchors = links.map((href) => `<a href="${href}">link</a>`).join("\n");
+  return `<html><head><title>${title}</title></head><body>${anchors}${body}</body></html>`;
+}
+
+function pageResponse(html: string, url = "https://example.com/") {
+ return {
+ body: html,
+ contentType: "text/html; charset=utf-8",
+ finalUrl: url,
+ };
+}
+
+/** Collect all yielded values from an async generator. */
+async function collectPages(gen: ReturnType<typeof spiderUrl>): Promise<{
+  pages: Array<{ url: string; title: string; depth: number }>;
+  stats: Awaited<ReturnType<ReturnType<typeof spiderUrl>["next"]>> extends { value: infer V } ? V : unknown;
+}> {
+ const pages = [];
+ let result = await gen.next();
+ while (!result.done) {
+ const v = result.value as { url: string; title: string; depth: number };
+ pages.push({ url: v.url, title: v.title, depth: v.depth });
+ result = await gen.next();
+ }
+ return { pages, stats: result.value };
+}
+
+// ── Tests ────────────────────────────────────────────────────────────────────
+
+describe("spiderUrl", () => {
+ beforeEach(() => {
+ mockFetchRaw.mockReset();
+ // Default: robots.txt not found
+ mockFetchRaw.mockImplementation((url: string) => {
+ if (url.endsWith("/robots.txt")) {
+ return Promise.reject(new Error("404"));
+ }
+ return Promise.resolve(pageResponse(htmlPage("Page", []), url));
+ });
+ // Speed up tests by removing inter-request delay
+ vi.useFakeTimers();
+ });
+
+ afterEach(() => {
+ vi.useRealTimers();
+ });
+
+ it("yields the seed page with depth 0", async () => {
+ mockFetchRaw.mockImplementation((url: string) => {
+ if (url.endsWith("/robots.txt")) return Promise.reject(new Error("404"));
+ return Promise.resolve(pageResponse(htmlPage("Seed Page"), url));
+ });
+
+ const gen = spiderUrl("https://example.com/", { maxPages: 1, requestDelay: 0 });
+ const result = await gen.next();
+ expect(result.done).toBe(false);
+ const page = result.value as { url: string; title: string; depth: number };
+ expect(page.url).toBe("https://example.com/");
+ expect(page.title).toBe("Seed Page");
+ expect(page.depth).toBe(0);
+ });
+
+ it("follows links up to maxDepth", async () => {
+ mockFetchRaw.mockImplementation((url: string) => {
+ if (url.endsWith("/robots.txt")) return Promise.reject(new Error("404"));
+ if (url === "https://example.com/") {
+ return Promise.resolve(pageResponse(htmlPage("Root", ["https://example.com/child"]), url));
+ }
+ if (url === "https://example.com/child") {
+ return Promise.resolve(
+ pageResponse(htmlPage("Child", ["https://example.com/grandchild"]), url),
+ );
+ }
+ if (url === "https://example.com/grandchild") {
+ return Promise.resolve(pageResponse(htmlPage("Grandchild", []), url));
+ }
+ return Promise.reject(new Error("unexpected"));
+ });
+
+ const gen = spiderUrl("https://example.com/", { maxDepth: 2, maxPages: 10, requestDelay: 0 });
+ const { pages } = await collectPages(gen);
+
+ expect(pages.map((p) => p.url)).toContain("https://example.com/");
+ expect(pages.map((p) => p.url)).toContain("https://example.com/child");
+ expect(pages.map((p) => p.url)).toContain("https://example.com/grandchild");
+ // depth 3 should not appear
+ expect(pages.every((p) => p.depth <= 2)).toBe(true);
+ });
+
+ it("does not follow links beyond maxDepth", async () => {
+ mockFetchRaw.mockImplementation((url: string) => {
+ if (url.endsWith("/robots.txt")) return Promise.reject(new Error("404"));
+ if (url === "https://example.com/") {
+ return Promise.resolve(pageResponse(htmlPage("Root", ["https://example.com/child"]), url));
+ }
+ if (url === "https://example.com/child") {
+ return Promise.resolve(
+ pageResponse(htmlPage("Child", ["https://example.com/grandchild"]), url),
+ );
+ }
+ // grandchild should NOT be fetched at maxDepth=1
+ return Promise.reject(new Error("should not fetch this"));
+ });
+
+ const gen = spiderUrl("https://example.com/", { maxDepth: 1, maxPages: 10, requestDelay: 0 });
+ const { pages } = await collectPages(gen);
+
+ const urls = pages.map((p) => p.url);
+ expect(urls).toContain("https://example.com/");
+ expect(urls).toContain("https://example.com/child");
+ expect(urls).not.toContain("https://example.com/grandchild");
+ });
+
+ it("enforces maxPages hard cap", async () => {
+ // Return the same page with 5 links each time
+ mockFetchRaw.mockImplementation((url: string) => {
+ if (url.endsWith("/robots.txt")) return Promise.reject(new Error("404"));
+ const links = [1, 2, 3, 4, 5].map((i) => `https://example.com/page${i}`);
+ return Promise.resolve(pageResponse(htmlPage("Page", links), url));
+ });
+
+ const gen = spiderUrl("https://example.com/", { maxPages: 3, maxDepth: 5, requestDelay: 0 });
+ const { pages, stats } = await collectPages(gen);
+
+ expect(pages.length).toBeLessThanOrEqual(3);
+ expect((stats as { pagesFetched: number }).pagesFetched).toBeLessThanOrEqual(3);
+ });
+
+ it("does not visit the same URL twice (cycle detection)", async () => {
+ // Page A links to B, B links back to A
+ mockFetchRaw.mockImplementation((url: string) => {
+ if (url.endsWith("/robots.txt")) return Promise.reject(new Error("404"));
+ if (url === "https://example.com/a") {
+ return Promise.resolve(pageResponse(htmlPage("A", ["https://example.com/b"]), url));
+ }
+ if (url === "https://example.com/b") {
+ return Promise.resolve(pageResponse(htmlPage("B", ["https://example.com/a"]), url));
+ }
+ return Promise.reject(new Error("unexpected"));
+ });
+
+ const gen = spiderUrl("https://example.com/a", { maxPages: 20, maxDepth: 5, requestDelay: 0 });
+ const { pages } = await collectPages(gen);
+
+ // Should only visit a and b once each
+ const urls = pages.map((p) => p.url);
+ expect(urls.filter((u) => u === "https://example.com/a").length).toBe(1);
+ expect(urls.filter((u) => u === "https://example.com/b").length).toBe(1);
+ });
+
+ it("filters cross-domain links when sameDomain=true (default)", async () => {
+ mockFetchRaw.mockImplementation((url: string) => {
+ if (url.endsWith("/robots.txt")) return Promise.reject(new Error("404"));
+ return Promise.resolve(
+ pageResponse(
+ htmlPage("Root", ["https://other.com/page", "https://example.com/local"]),
+ url,
+ ),
+ );
+ });
+
+ const gen = spiderUrl("https://example.com/", {
+ sameDomain: true,
+ maxPages: 10,
+ maxDepth: 1,
+ requestDelay: 0,
+ });
+ const { pages } = await collectPages(gen);
+
+ const urls = pages.map((p) => p.url);
+ expect(urls).not.toContain("https://other.com/page");
+ expect(urls).toContain("https://example.com/local");
+ });
+
+ it("allows cross-domain links when sameDomain=false", async () => {
+ mockFetchRaw.mockImplementation((url: string) => {
+ if (url.endsWith("/robots.txt")) return Promise.reject(new Error("404"));
+ if (url === "https://example.com/") {
+ return Promise.resolve(pageResponse(htmlPage("Root", ["https://other.com/page"]), url));
+ }
+ return Promise.resolve(pageResponse(htmlPage("Other", []), url));
+ });
+
+ const gen = spiderUrl("https://example.com/", {
+ sameDomain: false,
+ maxPages: 10,
+ maxDepth: 1,
+ requestDelay: 0,
+ });
+ const { pages } = await collectPages(gen);
+ expect(pages.map((p) => p.url)).toContain("https://other.com/page");
+ });
+
+ it("allows subdomain links when sameDomain=true", async () => {
+ mockFetchRaw.mockImplementation((url: string) => {
+ if (url.endsWith("/robots.txt")) return Promise.reject(new Error("404"));
+ if (url === "https://example.com/") {
+ return Promise.resolve(
+ pageResponse(htmlPage("Root", ["https://docs.example.com/guide"]), url),
+ );
+ }
+ return Promise.resolve(pageResponse(htmlPage("Subdomain page", []), url));
+ });
+
+ const gen = spiderUrl("https://example.com/", {
+ sameDomain: true,
+ maxPages: 10,
+ maxDepth: 1,
+ requestDelay: 0,
+ });
+ const { pages } = await collectPages(gen);
+ expect(pages.map((p) => p.url)).toContain("https://docs.example.com/guide");
+ });
+
+ it("filters links outside pathPrefix", async () => {
+ mockFetchRaw.mockImplementation((url: string) => {
+ if (url.endsWith("/robots.txt")) return Promise.reject(new Error("404"));
+ return Promise.resolve(
+ pageResponse(
+ htmlPage("Docs", ["https://example.com/docs/guide", "https://example.com/blog/post"]),
+ url,
+ ),
+ );
+ });
+
+ const gen = spiderUrl("https://example.com/docs/", {
+ pathPrefix: "/docs",
+ maxPages: 10,
+ maxDepth: 1,
+ requestDelay: 0,
+ });
+ const { pages } = await collectPages(gen);
+ const urls = pages.map((p) => p.url);
+ expect(urls).toContain("https://example.com/docs/guide");
+ expect(urls).not.toContain("https://example.com/blog/post");
+ });
+
+ it("skips URLs matching excludePatterns", async () => {
+ mockFetchRaw.mockImplementation((url: string) => {
+ if (url.endsWith("/robots.txt")) return Promise.reject(new Error("404"));
+ return Promise.resolve(
+ pageResponse(
+ htmlPage("Page", [
+ "https://example.com/docs/guide",
+ "https://example.com/changelog/v2",
+ "https://example.com/api/v1/ref",
+ ]),
+ url,
+ ),
+ );
+ });
+
+ const gen = spiderUrl("https://example.com/", {
+ excludePatterns: ["*/changelog*", "*/api/v1/*"],
+ maxPages: 10,
+ maxDepth: 1,
+ requestDelay: 0,
+ });
+ const { pages } = await collectPages(gen);
+ const urls = pages.map((p) => p.url);
+ expect(urls).toContain("https://example.com/docs/guide");
+ expect(urls).not.toContain("https://example.com/changelog/v2");
+ expect(urls).not.toContain("https://example.com/api/v1/ref");
+ });
+
+ it("skips URLs disallowed by robots.txt", async () => {
+ mockFetchRaw.mockImplementation((url: string) => {
+ if (url === "https://example.com/robots.txt") {
+ return Promise.resolve({
+ body: "User-agent: *\nDisallow: /private/",
+ contentType: "text/plain",
+ finalUrl: url,
+ });
+ }
+ return Promise.resolve(
+ pageResponse(
+ htmlPage("Root", [
+ "https://example.com/public/page",
+ "https://example.com/private/secret",
+ ]),
+ url,
+ ),
+ );
+ });
+
+ const gen = spiderUrl("https://example.com/", {
+ maxPages: 10,
+ maxDepth: 1,
+ requestDelay: 0,
+ });
+ const { pages } = await collectPages(gen);
+ const urls = pages.map((p) => p.url);
+ expect(urls).toContain("https://example.com/public/page");
+ expect(urls).not.toContain("https://example.com/private/secret");
+ });
+
+ it("respects LibScope-specific robots.txt rules", async () => {
+ mockFetchRaw.mockImplementation((url: string) => {
+ if (url === "https://example.com/robots.txt") {
+ return Promise.resolve({
+ body: "User-agent: libscope\nDisallow: /restricted/\nUser-agent: *\nDisallow:",
+ contentType: "text/plain",
+ finalUrl: url,
+ });
+ }
+ return Promise.resolve(
+ pageResponse(htmlPage("Root", ["https://example.com/restricted/data"]), url),
+ );
+ });
+
+ const gen = spiderUrl("https://example.com/", { maxPages: 10, maxDepth: 1, requestDelay: 0 });
+ const { pages } = await collectPages(gen);
+ expect(pages.map((p) => p.url)).not.toContain("https://example.com/restricted/data");
+ });
+
+ it("continues crawling when a single page fetch fails", async () => {
+ let callCount = 0;
+ mockFetchRaw.mockImplementation((url: string) => {
+ if (url.endsWith("/robots.txt")) return Promise.reject(new Error("404"));
+ if (url === "https://example.com/") {
+ return Promise.resolve(
+ pageResponse(
+ htmlPage("Root", ["https://example.com/good", "https://example.com/bad"]),
+ url,
+ ),
+ );
+ }
+ if (url === "https://example.com/bad") {
+ callCount++;
+ return Promise.reject(new Error("connection refused"));
+ }
+ return Promise.resolve(pageResponse(htmlPage("Good", []), url));
+ });
+
+ const gen = spiderUrl("https://example.com/", { maxPages: 10, maxDepth: 1, requestDelay: 0 });
+ const { pages, stats } = await collectPages(gen);
+
+ const urls = pages.map((p) => p.url);
+ expect(urls).toContain("https://example.com/");
+ expect(urls).toContain("https://example.com/good");
+ expect(urls).not.toContain("https://example.com/bad");
+ expect((stats as { errors: Array<{ url: string }> }).errors.length).toBeGreaterThan(0);
+ expect(callCount).toBe(1); // fetched once, failed
+ });
+
+ it("returns SpiderStats from the generator return value", async () => {
+ mockFetchRaw.mockImplementation((url: string) => {
+ if (url.endsWith("/robots.txt")) return Promise.reject(new Error("404"));
+ return Promise.resolve(pageResponse(htmlPage("Page", ["https://example.com/child"]), url));
+ });
+
+ const gen = spiderUrl("https://example.com/", { maxPages: 5, maxDepth: 1, requestDelay: 0 });
+ const { stats } = await collectPages(gen);
+ const s = stats as {
+ pagesFetched: number;
+ pagesCrawled: number;
+ pagesSkipped: number;
+ errors: unknown[];
+ };
+
+ expect(typeof s.pagesFetched).toBe("number");
+ expect(typeof s.pagesCrawled).toBe("number");
+ expect(typeof s.pagesSkipped).toBe("number");
+ expect(Array.isArray(s.errors)).toBe(true);
+ expect(s.pagesFetched).toBeGreaterThan(0);
+ });
+
+ it("caps maxPages to the hard limit of 200", async () => {
+ // We just confirm that requesting 999 is capped — we test via stats.pagesFetched ≤ 200
+ // In practice, our mock only has one page so pagesFetched will be 1.
+ // The important thing is that the option is accepted without error.
+ mockFetchRaw.mockImplementation((url: string) => {
+ if (url.endsWith("/robots.txt")) return Promise.reject(new Error("404"));
+ return Promise.resolve(pageResponse(htmlPage("Only Page", []), url));
+ });
+
+ const gen = spiderUrl("https://example.com/", { maxPages: 999, maxDepth: 0, requestDelay: 0 });
+ const { pages } = await collectPages(gen);
+ expect(pages.length).toBeLessThanOrEqual(200);
+ });
+
+ it("caps maxDepth to the hard limit of 5", async () => {
+ // Should not throw even when maxDepth: 100 is passed
+ mockFetchRaw.mockImplementation((url: string) => {
+ if (url.endsWith("/robots.txt")) return Promise.reject(new Error("404"));
+ return Promise.resolve(pageResponse(htmlPage("Page", []), url));
+ });
+
+ // Should not throw — maxDepth is capped to hard limit internally
+ const gen = spiderUrl("https://example.com/", { maxDepth: 100, requestDelay: 0 });
+ const { pages } = await collectPages(gen);
+ expect(pages.length).toBeGreaterThanOrEqual(1);
+ });
+
+ it("maxDepth=0 only fetches the seed page", async () => {
+ mockFetchRaw.mockImplementation((url: string) => {
+ if (url.endsWith("/robots.txt")) return Promise.reject(new Error("404"));
+ if (url === "https://example.com/") {
+ return Promise.resolve(pageResponse(htmlPage("Seed", ["https://example.com/child"]), url));
+ }
+ return Promise.reject(new Error("should not fetch children at depth 0"));
+ });
+
+ const gen = spiderUrl("https://example.com/", { maxDepth: 0, maxPages: 10, requestDelay: 0 });
+ const { pages } = await collectPages(gen);
+
+ expect(pages.length).toBe(1);
+ expect(pages[0]!.url).toBe("https://example.com/");
+ });
+
+ it("BFS: fetches pages breadth-first (children before grandchildren)", async () => {
+ const fetchOrder: string[] = [];
+ mockFetchRaw.mockImplementation((url: string) => {
+ if (url.endsWith("/robots.txt")) return Promise.reject(new Error("404"));
+ fetchOrder.push(url);
+ if (url === "https://example.com/") {
+ return Promise.resolve(
+ pageResponse(htmlPage("Root", ["https://example.com/a", "https://example.com/b"]), url),
+ );
+ }
+ if (url === "https://example.com/a") {
+ return Promise.resolve(pageResponse(htmlPage("A", ["https://example.com/a1"]), url));
+ }
+ if (url === "https://example.com/b") {
+ return Promise.resolve(pageResponse(htmlPage("B", []), url));
+ }
+ return Promise.resolve(pageResponse(htmlPage("Leaf", []), url));
+ });
+
+ const gen = spiderUrl("https://example.com/", { maxPages: 10, maxDepth: 2, requestDelay: 0 });
+ await collectPages(gen);
+
+ // root → a → b → a1 (BFS order: process all depth-1 before depth-2)
+ const idxRoot = fetchOrder.indexOf("https://example.com/");
+ const idxA = fetchOrder.indexOf("https://example.com/a");
+ const idxB = fetchOrder.indexOf("https://example.com/b");
+ const idxA1 = fetchOrder.indexOf("https://example.com/a1");
+
+ expect(idxRoot).toBeLessThan(idxA);
+ expect(idxRoot).toBeLessThan(idxB);
+ // Both a and b (depth 1) should appear before a1 (depth 2)
+ expect(idxA).toBeLessThan(idxA1);
+ expect(idxB).toBeLessThan(idxA1);
+ });
+
+ it("handles plain text responses without crashing", async () => {
+ mockFetchRaw.mockImplementation((url: string) => {
+ if (url.endsWith("/robots.txt")) return Promise.reject(new Error("404"));
+ return Promise.resolve({
+ body: "# Plain Text\n\nNo HTML here.",
+ contentType: "text/plain",
+ finalUrl: url,
+ });
+ });
+
+ const gen = spiderUrl("https://example.com/notes.txt", { maxDepth: 0, requestDelay: 0 });
+ const { pages } = await collectPages(gen);
+ expect(pages.length).toBe(1);
+ expect(pages[0]!.title).toBe("Plain Text");
+ });
+
+ it("marks abortReason as maxPages when capped mid-crawl", async () => {
+ // Seed always returns a new unique link
+ let counter = 0;
+ mockFetchRaw.mockImplementation((url: string) => {
+ if (url.endsWith("/robots.txt")) return Promise.reject(new Error("404"));
+ counter++;
+ const links = [`https://example.com/page${counter + 100}`];
+ return Promise.resolve(pageResponse(htmlPage(`Page ${counter}`, links), url));
+ });
+
+ const gen = spiderUrl("https://example.com/", { maxPages: 2, maxDepth: 5, requestDelay: 0 });
+ const { stats } = await collectPages(gen);
+ expect((stats as { abortReason?: string }).abortReason).toBe("maxPages");
+ });
+});
diff --git a/tests/unit/update-document.test.ts b/tests/unit/update-document.test.ts
index a754ef9..61f9e7e 100644
--- a/tests/unit/update-document.test.ts
+++ b/tests/unit/update-document.test.ts
@@ -1,4 +1,4 @@
-import { describe, it, expect, beforeEach } from "vitest";
+import { describe, it, expect, beforeEach, vi } from "vitest";
import { createTestDbWithVec } from "../fixtures/test-db.js";
import {
getDocument,
@@ -87,17 +87,21 @@ describe("updateDocument", () => {
});
it("should update updated_at timestamp", async () => {
- const before: Document = getDocument(db, docId);
- // SQLite datetime('now') has 1-second resolution; wait just enough for it to tick
- await new Promise((r) => setTimeout(r, 1100));
- const input: UpdateDocumentInput = { title: "Updated" };
- await updateDocument(db, provider, docId, input);
- const after: Document = getDocument(db, docId);
-
- expect(new Date(after.updatedAt).getTime()).toBeGreaterThanOrEqual(
- new Date(before.updatedAt).getTime(),
- );
- expect(after.updatedAt).not.toBe(before.updatedAt);
+ vi.useFakeTimers();
+ try {
+ const before: Document = getDocument(db, docId);
+ // Advance fake clock by 2 seconds so the JS timestamp differs
+ vi.advanceTimersByTime(2000);
+ const input: UpdateDocumentInput = { title: "Updated" };
+ await updateDocument(db, provider, docId, input);
+ const after: Document = getDocument(db, docId);
+
+ expect(new Date(after.updatedAt).getTime()).toBeGreaterThan(
+ new Date(before.updatedAt).getTime(),
+ );
+ } finally {
+ vi.useRealTimers();
+ }
});
it("should throw for nonexistent document", async () => {
diff --git a/tests/unit/webhooks.test.ts b/tests/unit/webhooks.test.ts
index 9272901..d759133 100644
--- a/tests/unit/webhooks.test.ts
+++ b/tests/unit/webhooks.test.ts
@@ -278,8 +278,8 @@ describe("webhooks", () => {
await createWebhook(db, "https://example.com/hook", ["document.updated"]);
fireWebhooks(db, "document.created", { docId: "123" });
- // Give time for any async calls
- await new Promise((r) => setTimeout(r, 50));
+ // Flush all pending microtasks/promises; mockFetch should remain uncalled
+ await Promise.resolve();
expect(mockFetch).not.toHaveBeenCalled();
});
@@ -367,8 +367,26 @@ describe("webhooks", () => {
fireWebhooks(db, "document.created", { docId: "123" });
- await new Promise((r) => setTimeout(r, 50));
+ // Flush all pending microtasks/promises; mockFetch should remain uncalled
+ await Promise.resolve();
expect(mockFetch).not.toHaveBeenCalled();
});
});
+
+ describe("rowToWebhook corrupted JSON", () => {
+ it("should default to empty events array when events JSON is corrupted", () => {
+ // Directly insert a row with invalid JSON in the events column
+ db.prepare("INSERT INTO webhooks (id, url, events, secret) VALUES (?, ?, ?, ?)").run(
+ "corrupt-1",
+ "https://example.com/hook",
+ "not valid json{{{",
+ null,
+ );
+
+ const hooks = listWebhooks(db);
+ const corrupt = hooks.find((h) => h.id === "corrupt-1");
+ expect(corrupt).toBeDefined();
+ expect(corrupt!.events).toEqual([]);
+ });
+ });
});