diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md index a78fcd7..24f178c 100644 --- a/.github/copilot-instructions.md +++ b/.github/copilot-instructions.md @@ -14,4 +14,5 @@ Refer to [agents.md](../agents.md) at the repository root for full architecture, - Tests use `MockEmbeddingProvider` and in-memory SQLite (no sqlite-vec in tests). - Run `npm run typecheck && npm run test:coverage && npm run lint` before considering work complete. Use `test:coverage` (not `test`) — CI enforces coverage thresholds (statements ≥ 75%, branches ≥ 74%, functions ≥ 75%, lines ≥ 75%) and will reject PRs that drop below them. - Before creating a PR, use a `code-review` sub-agent to self-review your diff. Fix any issues it finds before opening the PR. +- **Branch workflow:** All feature branches and PRs target `development`. Only `development` can be merged into `main`. When creating branches, branch from `development`. When creating PRs, set the base to `development`. - **PR lifecycle is mandatory.** After pushing a PR, always: (1) wait for CI/CD to complete, (2) check if it passed, (3) fix failures and re-push if needed, (4) read and address all review comments, (5) verify CI is green again. A PR is not done until all checks pass and all review comments are resolved. See the "Pull Request Lifecycle" section in `agents.md` for the full workflow. 
diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 7f94626..4c13cec 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -2,6 +2,7 @@ version: 2 updates: - package-ecosystem: "npm" directory: "/" + target-branch: "development" schedule: interval: "weekly" groups: @@ -9,14 +10,17 @@ updates: update-types: ["minor", "patch"] - package-ecosystem: "pip" directory: "/sdk/python" + target-branch: "development" schedule: interval: "weekly" - package-ecosystem: "gomod" directory: "/sdk/go" + target-branch: "development" schedule: interval: "weekly" - package-ecosystem: "github-actions" directory: "/" + target-branch: "development" schedule: interval: "weekly" groups: diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 2058500..b9b3589 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -2,9 +2,9 @@ name: CI on: push: - branches: [main] + branches: [main, development] pull_request: - branches: [main] + branches: [main, development] workflow_call: concurrency: @@ -15,8 +15,8 @@ jobs: lint-and-typecheck: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 - - uses: actions/setup-node@v4 + - uses: actions/checkout@v6 + - uses: actions/setup-node@v6 with: node-version: 22 cache: npm @@ -31,8 +31,8 @@ jobs: matrix: node-version: [20, 22] steps: - - uses: actions/checkout@v4 - - uses: actions/setup-node@v4 + - uses: actions/checkout@v6 + - uses: actions/setup-node@v6 with: node-version: ${{ matrix.node-version }} cache: npm @@ -40,7 +40,7 @@ jobs: - run: npm run test:coverage - name: Upload coverage if: matrix.node-version == 22 - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v7 with: name: coverage path: coverage/ @@ -51,7 +51,7 @@ jobs: permissions: contents: read steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 - uses: actions/dependency-review-action@v4 with: fail-on-severity: high @@ -60,8 +60,8 @@ jobs: runs-on: ubuntu-latest needs: [lint-and-typecheck, test] 
steps: - - uses: actions/checkout@v4 - - uses: actions/setup-node@v4 + - uses: actions/checkout@v6 + - uses: actions/setup-node@v6 with: node-version: 22 cache: npm diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index f1359a0..5c57320 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -2,9 +2,9 @@ name: "CodeQL" on: push: - branches: [main] + branches: [main, development] pull_request: - branches: [main] + branches: [main, development] schedule: - cron: "0 6 * * 1" @@ -21,7 +21,7 @@ jobs: matrix: language: [javascript-typescript] steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 - name: Initialize CodeQL uses: github/codeql-action/init@v4 with: diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 02b50c4..56ad55d 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -1,7 +1,7 @@ name: Docker on: push: - branches: [main] + branches: [main, development] tags: ["v*"] pull_request: paths: ["Dockerfile", "docker-compose.yml", ".dockerignore"] @@ -21,7 +21,7 @@ jobs: contents: read packages: write steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 - uses: docker/setup-buildx-action@v3 - uses: docker/login-action@v3 if: github.event_name != 'pull_request' diff --git a/.github/workflows/merge-gate.yml b/.github/workflows/merge-gate.yml new file mode 100644 index 0000000..ace7c31 --- /dev/null +++ b/.github/workflows/merge-gate.yml @@ -0,0 +1,19 @@ +name: Merge Gate + +on: + pull_request: + branches: [main] + +jobs: + enforce-source-branch: + runs-on: ubuntu-latest + steps: + - name: Verify PR source is development or release-please branch + run: | + HEAD="${{ github.head_ref }}" + if [ "$HEAD" = "development" ] || [[ "$HEAD" == release-please--* ]]; then + echo "✅ Source branch '$HEAD' — merge allowed." + else + echo "::error::Only the 'development' branch (or release-please branches) can be merged into main. This PR is from '$HEAD'." 
+ exit 1 + fi diff --git a/.github/workflows/release-please.yml b/.github/workflows/release-please.yml index 922f668..10dba58 100644 --- a/.github/workflows/release-please.yml +++ b/.github/workflows/release-please.yml @@ -19,6 +19,9 @@ jobs: id: release with: release-type: node + # Use a PAT so the created PR triggers CI workflows. + # GITHUB_TOKEN cannot trigger other workflows (GitHub security restriction). + token: ${{ secrets.GH_TOKEN }} publish: runs-on: ubuntu-latest @@ -28,8 +31,8 @@ jobs: contents: read id-token: write steps: - - uses: actions/checkout@v4 - - uses: actions/setup-node@v4 + - uses: actions/checkout@v6 + - uses: actions/setup-node@v6 with: node-version: 22 cache: npm diff --git a/.github/workflows/release-python.yml b/.github/workflows/release-python.yml index 88817fe..8aedd35 100644 --- a/.github/workflows/release-python.yml +++ b/.github/workflows/release-python.yml @@ -14,9 +14,9 @@ jobs: runs-on: ubuntu-latest environment: pypi steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 - - uses: actions/setup-python@v5 + - uses: actions/setup-python@v6 with: python-version: "3.12" diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index d2d133b..3efe2fd 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -16,7 +16,7 @@ jobs: runs-on: ubuntu-latest needs: ci steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 - name: Create GitHub Release uses: softprops/action-gh-release@v2 with: diff --git a/.github/workflows/sdk-go.yml b/.github/workflows/sdk-go.yml index f091577..22b136c 100644 --- a/.github/workflows/sdk-go.yml +++ b/.github/workflows/sdk-go.yml @@ -11,8 +11,8 @@ jobs: matrix: go-version: ["1.21", "1.22"] steps: - - uses: actions/checkout@v4 - - uses: actions/setup-go@v5 + - uses: actions/checkout@v6 + - uses: actions/setup-go@v6 with: go-version: ${{ matrix.go-version }} - run: cd sdk/go && go test ./... 
-v diff --git a/.github/workflows/sdk-python.yml b/.github/workflows/sdk-python.yml index cfd2e3b..eb3d89f 100644 --- a/.github/workflows/sdk-python.yml +++ b/.github/workflows/sdk-python.yml @@ -11,8 +11,8 @@ jobs: matrix: python-version: ["3.10", "3.11", "3.12"] steps: - - uses: actions/checkout@v4 - - uses: actions/setup-python@v5 + - uses: actions/checkout@v6 + - uses: actions/setup-python@v6 with: python-version: ${{ matrix.python-version }} - run: cd sdk/python && pip install -e ".[dev]" && pytest -v diff --git a/README.md b/README.md index fa5577d..c2cbd84 100644 --- a/README.md +++ b/README.md @@ -72,25 +72,34 @@ Once connected, your assistant can search docs, submit new documents, rate conte
Full list of MCP tools -| Tool | What it does | -| --------------------- | --------------------------------------------------------- | -| `search-docs` | Semantic search with topic/library/version/rating filters | -| `get-document` | Retrieve a document by ID | -| `delete-document` | Remove a document | -| `submit-document` | Index new content (raw text or a URL to fetch) | -| `rate-document` | Rate a doc 1–5 with optional feedback and corrections | -| `list-documents` | List docs with filters | -| `list-topics` | Browse the topic hierarchy | -| `ask-question` | RAG question-answering with source citations | -| `reindex-documents` | Re-embed chunks (useful after switching providers) | -| `health-check` | DB status, doc/chunk counts | -| `sync-obsidian-vault` | Sync an Obsidian vault | -| `sync-onenote` | Sync OneNote notebooks via Microsoft Graph | -| `sync-notion` | Sync Notion pages and databases | -| `sync-confluence` | Sync Confluence spaces | -| `sync-slack` | Sync Slack channels and threads | -| `install-pack` | Install a knowledge pack | -| `list-packs` | List installed or registry packs | +| Tool | What it does | +| ---------------------- | --------------------------------------------------------- | +| `search-docs` | Semantic search with topic/library/version/rating filters | +| `ask-question` | RAG question-answering with source citations | +| `get-document` | Retrieve a document by ID | +| `list-documents` | List docs with filters | +| `list-topics` | Browse the topic hierarchy | +| `submit-document` | Index new content (raw text or a URL to fetch) | +| `update-document` | Update a document's title, content, or metadata | +| `delete-document` | Remove a document | +| `rate-document` | Rate a doc 1–5 with optional feedback and corrections | +| `suggest-tags` | Auto-suggest tags based on content analysis | +| `save-search` | Save a named search query with filters | +| `list-saved-searches` | List all saved searches | +| `run-saved-search` | Execute a saved 
search by name or ID | +| `delete-saved-search` | Delete a saved search | +| `link-documents` | Create a cross-reference between two documents | +| `get-document-links` | List all incoming and outgoing links for a document | +| `delete-link` | Remove a cross-reference link | +| `reindex-documents` | Re-embed chunks (useful after switching providers) | +| `health-check` | DB status, doc/chunk counts | +| `sync-obsidian-vault` | Sync an Obsidian vault | +| `sync-onenote` | Sync OneNote notebooks via Microsoft Graph | +| `sync-notion` | Sync Notion pages and databases | +| `sync-confluence` | Sync Confluence spaces | +| `sync-slack` | Sync Slack channels and threads | +| `install-pack` | Install a knowledge pack | +| `list-packs` | List installed or registry packs |
@@ -194,17 +203,32 @@ libscope serve --api --port 3378 OpenAPI 3.0 spec at `GET /openapi.json`. Key endpoints: -| Method | Endpoint | Description | -| ------------ | ----------------------- | ------------------------ | -| `GET` | `/api/v1/search?q=...` | Semantic search | -| `GET/POST` | `/api/v1/documents` | List or create documents | -| `GET/DELETE` | `/api/v1/documents/:id` | Get or remove a document | -| `POST` | `/api/v1/documents/url` | Index from a URL | -| `POST` | `/api/v1/ask` | RAG question-answering | -| `GET/POST` | `/api/v1/topics` | List or create topics | -| `GET` | `/api/v1/tags` | List tags | -| `GET` | `/api/v1/stats` | Usage statistics | -| `GET` | `/api/v1/health` | Health check | +| Method | Endpoint | Description | +| --------------- | --------------------------------- | ---------------------------------- | +| `GET` | `/api/v1/search?q=...` | Semantic search | +| `POST` | `/api/v1/ask` | RAG question-answering | +| `GET/POST` | `/api/v1/documents` | List or create documents | +| `GET/PATCH/DELETE` | `/api/v1/documents/:id` | Get, update, or delete a document | +| `POST` | `/api/v1/documents/url` | Index from a URL | +| `POST` | `/api/v1/documents/:id/tags` | Add tags | +| `GET` | `/api/v1/documents/:id/suggest-tags` | Auto-suggest tags | +| `GET/POST` | `/api/v1/documents/:id/links` | List or create cross-references | +| `DELETE` | `/api/v1/links/:id` | Delete a cross-reference | +| `GET/POST` | `/api/v1/topics` | List or create topics | +| `GET` | `/api/v1/tags` | List tags | +| `GET/POST` | `/api/v1/searches` | List or create saved searches | +| `POST` | `/api/v1/searches/:id/run` | Run a saved search | +| `DELETE` | `/api/v1/searches/:id` | Delete a saved search | +| `POST` | `/api/v1/bulk/delete` | Bulk delete documents | +| `POST` | `/api/v1/bulk/retag` | Bulk add/remove tags | +| `POST` | `/api/v1/bulk/move` | Bulk move to a topic | +| `GET/POST` | `/api/v1/webhooks` | List or create webhooks | +| `DELETE` | `/api/v1/webhooks/:id` | 
Delete a webhook | +| `POST` | `/api/v1/webhooks/:id/test` | Send a test ping to a webhook | +| `GET` | `/api/v1/analytics/searches` | Search analytics and knowledge gaps| +| `GET` | `/api/v1/connectors/status` | Connector sync status and history | +| `GET` | `/api/v1/stats` | Usage statistics | +| `GET` | `/api/v1/health` | Health check | ## Configuration @@ -304,6 +328,28 @@ export LIBSCOPE_ALLOW_PRIVATE_URLS=true export LIBSCOPE_ALLOW_SELF_SIGNED_CERTS=true ``` +## Webhooks + +LibScope can push events to any HTTP endpoint. Useful for triggering CI pipelines, Slack notifications, or custom workflows whenever documents are created or updated. + +```bash +libscope serve --api # webhooks require the REST API +``` + +```bash +# Create a webhook +curl -X POST http://localhost:3378/api/v1/webhooks \ + -H "Content-Type: application/json" \ + -d '{"url": "https://hooks.example.com/libscope", "events": ["document.created", "document.updated"], "secret": "my-hmac-secret"}' + +# Send a test ping (replace <id> with the webhook ID returned on creation) +curl -X POST http://localhost:3378/api/v1/webhooks/<id>/test +``` + +Webhook payloads are signed with HMAC-SHA256 when a secret is set. The signature is in the `X-LibScope-Signature` header. + +Supported events: `document.created`, `document.updated`, `document.deleted`. 
+ ## Other Tools LibScope ships with a few more utilities beyond the core index-and-search loop: @@ -352,27 +398,54 @@ There's also a web dashboard at `http://localhost:3377` when you run `libscope s **Documents** -| Command | Description | -| ----------------------------------- | ----------------- | -| `libscope docs list` | List documents | -| `libscope docs show ` | Show a document | -| `libscope docs delete ` | Delete a document | -| `libscope docs history ` | Version history | -| `libscope docs rollback ` | Roll back | +| Command | Description | +| --------------------------------------- | ---------------------------- | +| `libscope docs list` | List documents | +| `libscope docs show ` | Show a document | +| `libscope docs update ` | Update title/content/metadata| +| `libscope docs delete ` | Delete a document | +| `libscope docs history ` | Version history | +| `libscope docs rollback ` | Roll back to a prior version | **Organization** -| Command | Description | -| ---------------------------------- | ---------------- | -| `libscope topics list` | List topics | -| `libscope topics create ` | Create a topic | -| `libscope tag add ` | Add tags | -| `libscope tag remove ` | Remove a tag | -| `libscope tag list` | List tags | -| `libscope workspace create ` | Create workspace | -| `libscope workspace list` | List workspaces | -| `libscope workspace use ` | Switch workspace | -| `libscope workspace delete ` | Delete workspace | +| Command | Description | +| ------------------------------------ | -------------------------------- | +| `libscope topics list` | List topics | +| `libscope topics create ` | Create a topic | +| `libscope tag add ` | Add tags | +| `libscope tag remove ` | Remove a tag | +| `libscope tag list` | List tags | +| `libscope workspace create ` | Create workspace | +| `libscope workspace list` | List workspaces | +| `libscope workspace use ` | Switch workspace | +| `libscope workspace delete ` | Delete workspace | + +**Saved Searches** + +| 
Command | Description | +| ------------------------------------ | ----------------------------- | +| `libscope searches list` | List all saved searches | +| `libscope searches run ` | Re-run a saved search | +| `libscope searches delete ` | Delete a saved search | +| `libscope search --save ` | Save a search while running it| + +**Document Links** + +| Command | Description | +| ------------------------------------- | -------------------------------- | +| `libscope link ` | Create a cross-reference | +| `libscope links ` | Show all links for a document | +| `libscope unlink ` | Remove a link | +| `libscope prereqs ` | Show prerequisite reading chain | + +**Bulk Operations** + +| Command | Description | +| ---------------------------- | ----------------------------------- | +| `libscope bulk delete` | Delete all matching documents | +| `libscope bulk retag` | Add/remove tags on matching docs | +| `libscope bulk move` | Move matching docs to a topic | **Connectors** @@ -432,4 +505,4 @@ npm run lint # lint ## License -MIT — see [LICENSE](LICENSE). +[Business Source License 1.1](LICENSE) — see [LICENSE](LICENSE) for full terms. diff --git a/agents.md b/agents.md index 3148a5b..1b21baa 100644 --- a/agents.md +++ b/agents.md @@ -35,14 +35,32 @@ LibScope is an **AI-powered knowledge base with MCP (Model Context Protocol) int src/ ├── cli/index.ts # CLI entry point (commander). All commands in one file. 
├── mcp/server.ts # MCP server (stdio transport, @modelcontextprotocol/sdk) +├── api/ +│ ├── server.ts # HTTP server bootstrap (createServer, listen) +│ ├── routes.ts # All REST route handlers in one function (~700 lines) +│ ├── middleware.ts # Auth (checkApiKey), rate limiting, body parsing, sendError/sendJson +│ └── openapi.ts # OpenAPI spec generation ├── core/ # Business logic — framework-agnostic, no side effects │ ├── indexing.ts # Document parsing, chunking by heading, embedding + storage │ ├── search.ts # Semantic (vector) + FTS5 + LIKE fallback search │ ├── ratings.ts # Rating storage, aggregation, correction suggestions │ ├── documents.ts # Document CRUD │ ├── topics.ts # Topic hierarchy management -│ ├── url-fetcher.ts # Fetch URL → convert HTML to markdown-like text +│ ├── bulk.ts # Bulk delete/move/retag operations with selector resolution +│ ├── tags.ts # Tag CRUD and document–tag associations +│ ├── dedup.ts # Content deduplication helpers +│ ├── rag.ts # Retrieval-augmented generation (ask a question over the index) +│ ├── url-fetcher.ts # Fetch URL → convert HTML to markdown-like text (SSRF-protected) │ └── index.ts # Public re-exports (barrel file) +├── connectors/ # External-service sync connectors +│ ├── obsidian.ts # Obsidian vault sync (reads .md files + YAML frontmatter) +│ ├── confluence.ts # Confluence Cloud sync (REST API) +│ ├── notion.ts # Notion sync (official API) +│ ├── onenote.ts # OneNote sync (Microsoft Graph API) +│ ├── slack.ts # Slack channel sync (Web API) +│ ├── sync-tracker.ts # Tracks last-synced state per connector in SQLite +│ ├── http-utils.ts # Authenticated fetch helper shared by connectors (respects allowSelfSignedCerts) +│ └── index.ts # Re-exports ├── db/ │ ├── connection.ts # SQLite connection + sqlite-vec extension loading │ ├── schema.ts # Migrations (versioned) + vector table creation @@ -146,6 +164,64 @@ Environment variables > project `.libscope.json` > user `~/.libscope/config.json Env vars: 
`LIBSCOPE_EMBEDDING_PROVIDER`, `LIBSCOPE_OPENAI_API_KEY`, `LIBSCOPE_OLLAMA_URL`, `LIBSCOPE_ALLOW_PRIVATE_URLS`, `LIBSCOPE_ALLOW_SELF_SIGNED_CERTS`. +## Security Patterns + +### Authentication — use constant-time comparison + +The API key check in `src/api/middleware.ts` must use `crypto.timingSafeEqual` to prevent timing attacks. Direct string equality (`===` / `!==`) short-circuits on the first differing byte, leaking information about the key length and value. + +```typescript +import { timingSafeEqual } from "node:crypto"; + +// ✅ Correct +const tokenBuf = Buffer.from(token); +const keyBuf = Buffer.from(apiKey); +if (tokenBuf.length !== keyBuf.length || !timingSafeEqual(tokenBuf, keyBuf)) { + return sendError(res, 401, "UNAUTHORIZED", "Invalid API key"); +} + +// ❌ Wrong — timing-attack vulnerable +if (token !== apiKey) { ... } +``` + +### TLS — use per-request undici Agent, not process-wide env var + +When self-signed certificates must be accepted (controlled by `config.indexing.allowSelfSignedCerts`), configure TLS per-request via an `undici.Agent` passed as `dispatcher`. Do **not** mutate `process.env["NODE_TLS_REJECT_UNAUTHORIZED"]` — it is process-global and creates a race condition with concurrent requests. + +```typescript +import { Agent } from "undici"; + +// ✅ Correct — scoped to this request chain only +let _insecureAgent: Agent | undefined; +const getInsecureAgent = (): Agent => + (_insecureAgent ??= new Agent({ connect: { rejectUnauthorized: false } })); + +const response = await fetch(url, { + // @ts-expect-error — Node.js undici-based fetch accepts dispatcher for per-request TLS config + dispatcher: allowSelfSigned ? getInsecureAgent() : undefined, +}); + +// ❌ Wrong — affects all concurrent requests until restored +process.env["NODE_TLS_REJECT_UNAUTHORIZED"] = "0"; +``` + +### SSE streaming — check backpressure + +`res.write()` returns `false` when the socket buffer is full or the client has disconnected. 
Ignoring the return value wastes compute and holds open connections indefinitely. + +```typescript +// ✅ Correct +for await (const event of stream) { + const ok = res.write(`data: ${JSON.stringify(event)}\n\n`); + if (!ok) break; +} + +// ❌ Wrong — no backpressure handling +for await (const event of stream) { + res.write(`data: ${JSON.stringify(event)}\n\n`); +} +``` + ## Testing ### Framework diff --git a/docs/guide/configuration.md b/docs/guide/configuration.md index 50c11fa..fd01d0d 100644 --- a/docs/guide/configuration.md +++ b/docs/guide/configuration.md @@ -76,7 +76,9 @@ export LIBSCOPE_LLM_PROVIDER=openai export LIBSCOPE_LLM_MODEL=gpt-4o-mini ``` -Supported providers: `openai`, `ollama`. +Supported providers: `openai`, `ollama`, `passthrough`. + +The `passthrough` provider is for advanced integrations where you supply your own LLM responses externally. When set, the `ask` command emits an event stream that your application handles rather than calling an LLM directly. ## Environment Variables diff --git a/docs/guide/getting-started.md b/docs/guide/getting-started.md index 916fe77..28d3562 100644 --- a/docs/guide/getting-started.md +++ b/docs/guide/getting-started.md @@ -10,7 +10,7 @@ It also runs as an [MCP server](/guide/mcp-setup), so AI assistants like Claude npm install -g libscope ``` -Requires Node.js 18 or later. +Requires Node.js 20 or later. ## Initialize @@ -67,9 +67,55 @@ libscope serve This starts a stdio-based MCP server that any compatible AI assistant can connect to. See [MCP Setup](/guide/mcp-setup) for integration instructions. +## Web Dashboard + +Run the local web dashboard to browse, search, and manage your knowledge base in a browser: + +```bash +libscope serve --dashboard +# opens at http://localhost:3377 +``` + +The dashboard includes full-text search, document browsing, topic navigation, and a knowledge graph visualization at `/graph`. 
+ +## Organize and Annotate + +Once you have content indexed you can enrich it: + +```bash +# Tag documents +libscope tag add typescript,api,v2 + +# Group into topics +libscope topics create "backend" +libscope topics create "auth" --parent backend + +# Save frequent searches +libscope search "auth best practices" --save "Auth Docs" +libscope searches run "Auth Docs" + +# Cross-reference documents +libscope link --type prerequisite + +# Bulk operations +libscope bulk retag --library react --add-tags deprecated --dry-run +libscope bulk move --library react --to new-topic-id +``` + +## REST API + +For programmatic access, start the REST API instead of the MCP server: + +```bash +libscope serve --api --port 3378 +``` + +The OpenAPI 3.0 spec is served at `GET /openapi.json`. See [REST API Reference](/reference/rest-api) for full documentation. + ## What's Next - [Configuration](/guide/configuration) — embedding providers, LLM setup, environment variables - [MCP Setup](/guide/mcp-setup) — connect LibScope to Claude, Cursor, or VS Code - [Connectors](/guide/connectors) — sync from Obsidian, Notion, Confluence, Slack, and more - [CLI Reference](/reference/cli) — full list of commands and options +- [REST API Reference](/reference/rest-api) — full API endpoint documentation diff --git a/docs/guide/mcp-setup.md b/docs/guide/mcp-setup.md index 63594e6..5db4c40 100644 --- a/docs/guide/mcp-setup.md +++ b/docs/guide/mcp-setup.md @@ -80,13 +80,38 @@ If you're using [workspaces](/guide/configuration#workspaces), pass the workspac ## Available Tools -Once connected, your AI assistant gets access to all of LibScope's MCP tools. See the [MCP Tools Reference](/reference/mcp-tools) for the full list. 
- -The most commonly used ones: - -- **`search-docs`** — semantic search across your knowledge base -- **`ask-question`** — RAG Q&A with synthesized answers -- **`submit-document`** — index new content (by text or URL) -- **`list-topics`** — browse what's in the knowledge base +Once connected, your AI assistant gets access to all 26 of LibScope's MCP tools. See the [MCP Tools Reference](/reference/mcp-tools) for full parameter details. + +**Search & Q&A** +- **`search-docs`** — semantic search with topic/library/version/rating filters +- **`ask-question`** — RAG Q&A with synthesized answers and source citations + +**Document Management** +- **`submit-document`** — index new content by text or URL +- **`update-document`** — update title, content, or metadata +- **`get-document`** — retrieve a document by ID +- **`list-documents`** — list docs with filters +- **`delete-document`** — remove a document +- **`rate-document`** — rate 1–5 with feedback +- **`suggest-tags`** — auto-suggest tags based on content + +**Organization** +- **`list-topics`** — browse the topic hierarchy +- **`link-documents`** — create cross-references between docs +- **`get-document-links`** — list a document's incoming and outgoing links +- **`delete-link`** — remove a cross-reference + +**Saved Searches** +- **`save-search`** — save a named query with filters +- **`list-saved-searches`** — list saved searches +- **`run-saved-search`**, **`delete-saved-search`** — execute or delete a saved search + +**Connectors** — trigger syncs directly from your AI assistant: +- **`sync-obsidian-vault`**, **`sync-notion`**, **`sync-confluence`**, **`sync-slack`**, **`sync-onenote`** + +**Packs & Maintenance** +- **`install-pack`**, **`list-packs`** — manage knowledge packs +- **`reindex-documents`** — re-embed after switching providers +- **`health-check`** — DB status and doc/chunk counts Your AI assistant will call these tools automatically when it needs information from your docs. 
diff --git a/docs/reference/cli.md b/docs/reference/cli.md index 9b0a644..e7f4821 100644 --- a/docs/reference/cli.md +++ b/docs/reference/cli.md @@ -283,7 +283,7 @@ libscope link --type see_also --label "Background context" | `libscope config set ` | Set a configuration value | | `libscope config show` | Show current configuration | -Supported config keys for `set`: `embedding.provider`, `indexing.allowPrivateUrls`, `indexing.allowSelfSignedCerts`. +Supported config keys for `set`: `embedding.provider`, `embedding.ollamaUrl`, `embedding.ollamaModel`, `embedding.openaiModel`, `llm.provider`, `llm.model`, `database.path`, `logging.level`, `indexing.allowPrivateUrls`, `indexing.allowSelfSignedCerts`. ## Global Options diff --git a/docs/reference/configuration.md b/docs/reference/configuration.md index 13d6589..e44b13e 100644 --- a/docs/reference/configuration.md +++ b/docs/reference/configuration.md @@ -24,10 +24,11 @@ Complete reference for all configuration options. ### LLM (for RAG) -| Key | Type | Default | Description | -| -------------- | ------ | ------- | -------------------- | -| `llm.provider` | string | — | `openai` or `ollama` | -| `llm.model` | string | — | Model name override | +| Key | Type | Default | Description | +| ----------------- | ------ | ------- | ---------------------------------------------- | +| `llm.provider` | string | — | `openai`, `ollama`, or `passthrough` | +| `llm.model` | string | — | Model name override | +| `llm.ollamaUrl` | string | — | Ollama server URL (overrides embedding URL) | ### Database @@ -69,9 +70,24 @@ Complete reference for all configuration options. 
## Setting Values ```bash -# Via CLI -libscope config set embedding.provider ollama -libscope config set llm.provider openai +# Embedding +libscope config set embedding.provider ollama # local | ollama | openai +libscope config set embedding.ollamaUrl http://localhost:11434 +libscope config set embedding.ollamaModel nomic-embed-text +libscope config set embedding.openaiModel text-embedding-3-small + +# LLM (for RAG) +libscope config set llm.provider openai # openai | ollama | passthrough +libscope config set llm.model gpt-4o-mini + +# Database +libscope config set database.path /custom/path/libscope.db + +# Logging +libscope config set logging.level debug # debug | info | warn | error | silent + +# Network +libscope config set indexing.allowPrivateUrls true libscope config set indexing.allowSelfSignedCerts true # View current config diff --git a/docs/reference/mcp-tools.md b/docs/reference/mcp-tools.md index c0fe6e4..5b5888b 100644 --- a/docs/reference/mcp-tools.md +++ b/docs/reference/mcp-tools.md @@ -46,6 +46,20 @@ Index a new document. You can provide content directly, or a URL to fetch automa | `topic` | string | | Topic to categorize under | | `sourceType` | string | | `library`, `topic`, `manual`, or `model-generated` | +## update-document + +Update an existing document's title, content, or metadata. Changing content triggers re-chunking and re-embedding. + +| Parameter | Type | Required | Description | +| ------------ | ------ | -------- | --------------------------------------- | +| `documentId` | string | ✅ | The document ID to update | +| `title` | string | | New title | +| `content` | string | | New content (triggers re-chunking) | +| `library` | string | | New library name (pass `null` to clear) | +| `version` | string | | New version (pass `null` to clear) | +| `url` | string | | New source URL (pass `null` to clear) | +| `topicId` | string | | New topic ID (pass `null` to clear) | + ## rate-document Rate a document and optionally suggest corrections. 
@@ -171,6 +185,42 @@ List installed or available knowledge packs. | `available` | boolean | | If true, list from registry instead of installed | | `registryUrl` | string | | Custom registry URL | +## suggest-tags + +Suggest tags for a document based on content analysis (compares against existing tags in the knowledge base). + +| Parameter | Type | Required | Description | +| ---------------- | ------ | -------- | ------------------------------------ | +| `documentId` | string | ✅ | The document ID | +| `maxSuggestions` | number | | Maximum suggestions to return (1–20, default: 5) | + +## link-documents + +Create a typed cross-reference relationship between two documents. + +| Parameter | Type | Required | Description | +| ---------- | ------ | -------- | -------------------------------------------------------------------- | +| `sourceId` | string | ✅ | The source document ID | +| `targetId` | string | ✅ | The target document ID | +| `linkType` | string | ✅ | Relationship type: `see_also`, `prerequisite`, `supersedes`, `related` | +| `label` | string | | Optional human-readable description of the relationship | + +## get-document-links + +Get all cross-reference links for a document, both outgoing and incoming. + +| Parameter | Type | Required | Description | +| ------------ | ------ | -------- | --------------- | +| `documentId` | string | ✅ | The document ID | + +## delete-link + +Remove a cross-reference link between documents. + +| Parameter | Type | Required | Description | +| --------- | ------ | -------- | ------------------------ | +| `linkId` | string | ✅ | The link ID to delete | + ## save-search Save a search query with optional filters for later re-use. diff --git a/docs/reference/rest-api.md b/docs/reference/rest-api.md index 3aa3d81..3950d1e 100644 --- a/docs/reference/rest-api.md +++ b/docs/reference/rest-api.md @@ -10,29 +10,78 @@ The OpenAPI 3.0 spec is available at `GET /openapi.json`. 
## Endpoints -| Method | Endpoint | Description | -| -------- | ---------------------------- | -------------------------------- | -| `GET` | `/api/v1/health` | Health check with document count | -| `GET` | `/api/v1/search?q=...` | Semantic search | -| `GET` | `/api/v1/documents` | List documents (with filters) | -| `POST` | `/api/v1/documents` | Index a new document | -| `GET` | `/api/v1/documents/:id` | Get a single document | -| `DELETE` | `/api/v1/documents/:id` | Delete a document | -| `POST` | `/api/v1/documents/url` | Index a document from a URL | -| `POST` | `/api/v1/documents/:id/tags` | Add tags to a document | -| `POST` | `/api/v1/ask` | RAG question answering | -| `GET` | `/api/v1/topics` | List all topics | -| `POST` | `/api/v1/topics` | Create a topic | -| `GET` | `/api/v1/tags` | List all tags | -| `GET` | `/api/v1/stats` | Usage statistics | -| `GET` | `/api/v1/searches` | List saved searches | -| `POST` | `/api/v1/searches` | Create a saved search | -| `POST` | `/api/v1/searches/:id/run` | Run a saved search | -| `DELETE` | `/api/v1/searches/:id` | Delete a saved search | -| `GET` | `/openapi.json` | OpenAPI 3.0 specification | -| `POST` | `/api/v1/bulk/delete` | Bulk delete documents | -| `POST` | `/api/v1/bulk/retag` | Bulk add/remove tags | -| `POST` | `/api/v1/bulk/move` | Bulk move documents to a topic | +### Search & Q&A + +| Method | Endpoint | Description | +| ------ | ------------------------- | ------------------------------------- | +| `GET` | `/api/v1/search?q=...` | Semantic search | +| `POST` | `/api/v1/ask` | RAG question-answering | + +### Documents + +| Method | Endpoint | Description | +| -------- | ------------------------------------- | ------------------------------------- | +| `GET` | `/api/v1/documents` | List documents (with filters) | +| `POST` | `/api/v1/documents` | Index a new document | +| `GET` | `/api/v1/documents/:id` | Get a single document | +| `PATCH` | `/api/v1/documents/:id` | Update a document | +| `DELETE` | 
`/api/v1/documents/:id` | Delete a document | +| `POST` | `/api/v1/documents/url` | Index from a URL | +| `POST` | `/api/v1/documents/:id/tags` | Add tags to a document | +| `GET` | `/api/v1/documents/:id/suggest-tags` | Auto-suggest tags based on content | +| `GET` | `/api/v1/documents/:id/links` | List cross-reference links | +| `POST` | `/api/v1/documents/:id/links` | Create a cross-reference link | + +### Document Links + +| Method | Endpoint | Description | +| -------- | --------------------- | ---------------------- | +| `DELETE` | `/api/v1/links/:id` | Delete a link | + +### Topics & Tags + +| Method | Endpoint | Description | +| ------ | ----------------- | -------------------- | +| `GET` | `/api/v1/topics` | List all topics | +| `POST` | `/api/v1/topics` | Create a topic | +| `GET` | `/api/v1/tags` | List all tags | + +### Saved Searches + +| Method | Endpoint | Description | +| -------- | --------------------------- | ---------------------------- | +| `GET` | `/api/v1/searches` | List saved searches | +| `POST` | `/api/v1/searches` | Create a saved search | +| `POST` | `/api/v1/searches/:id/run` | Run a saved search | +| `DELETE` | `/api/v1/searches/:id` | Delete a saved search | + +### Bulk Operations + +| Method | Endpoint | Description | +| ------ | ----------------------- | ------------------------------ | +| `POST` | `/api/v1/bulk/delete` | Bulk delete documents | +| `POST` | `/api/v1/bulk/retag` | Bulk add/remove tags | +| `POST` | `/api/v1/bulk/move` | Bulk move documents to a topic | + +### Webhooks + +| Method | Endpoint | Description | +| -------- | ------------------------------- | ----------------------------- | +| `GET` | `/api/v1/webhooks` | List webhooks | +| `POST` | `/api/v1/webhooks` | Create a webhook | +| `DELETE` | `/api/v1/webhooks/:id` | Delete a webhook | +| `POST` | `/api/v1/webhooks/:id/test` | Send a test ping | + +### System + +| Method | Endpoint | Description | +| ------ | -------------------------------- | 
---------------------------------- | +| `GET` | `/api/v1/health` | Health check with document count | +| `GET` | `/api/v1/stats` | Usage statistics | +| `GET` | `/api/v1/analytics/searches` | Search analytics and knowledge gaps| +| `GET` | `/api/v1/connectors/status` | Connector sync status and history | +| `GET` | `/api/v1/connectors/schedules` | Scheduled connector entries | +| `GET` | `/openapi.json` | OpenAPI 3.0 specification | ## Examples @@ -44,7 +93,7 @@ curl -X POST http://localhost:3378/api/v1/documents \ -d '{ "title": "Auth Guide", "content": "# Authentication\n\nUse OAuth2...", - "tags": ["auth"] + "tags": ["auth", "security"] }' ``` @@ -54,6 +103,12 @@ curl -X POST http://localhost:3378/api/v1/documents \ curl "http://localhost:3378/api/v1/search?q=authentication&limit=5" ``` +### Search with filters + +```bash +curl "http://localhost:3378/api/v1/search?q=deploy&library=my-lib&topic=backend&limit=10" +``` + ### Ask a question ```bash @@ -75,3 +130,73 @@ curl -X POST http://localhost:3378/api/v1/documents/url \ "library": "my-lib" }' ``` + +### Update a document + +```bash +curl -X PATCH http://localhost:3378/api/v1/documents/ \ + -H "Content-Type: application/json" \ + -d '{ + "title": "Updated Title", + "library": "my-lib", + "version": "2.0.0" + }' +``` + +### Bulk retag + +```bash +curl -X POST http://localhost:3378/api/v1/bulk/retag \ + -H "Content-Type: application/json" \ + -d '{ + "selector": {"library": "react"}, + "addTags": ["v18"], + "removeTags": ["v17"], + "dryRun": false + }' +``` + +### Create a webhook + +```bash +curl -X POST http://localhost:3378/api/v1/webhooks \ + -H "Content-Type: application/json" \ + -d '{ + "url": "https://hooks.example.com/libscope", + "events": ["document.created", "document.updated"], + "secret": "my-hmac-secret" + }' +``` + +Webhook payloads are signed with HMAC-SHA256 when a secret is provided. The signature is sent in the `X-LibScope-Signature` header. 
+ +Supported events: `document.created`, `document.updated`, `document.deleted`. + +### Create a cross-reference link + +```bash +curl -X POST http://localhost:3378/api/v1/documents//links \ + -H "Content-Type: application/json" \ + -d '{ + "targetId": "", + "linkType": "prerequisite", + "label": "Read this first" + }' +``` + +Valid `linkType` values: `see_also`, `prerequisite`, `supersedes`, `related`. + +### Create a saved search + +```bash +curl -X POST http://localhost:3378/api/v1/searches \ + -H "Content-Type: application/json" \ + -d '{ + "name": "Auth Docs", + "query": "authentication best practices", + "filters": {"library": "my-lib"} + }' + +# Run it later +curl -X POST http://localhost:3378/api/v1/searches//run +``` diff --git a/package-lock.json b/package-lock.json index 8a6f4c7..ea4307b 100644 --- a/package-lock.json +++ b/package-lock.json @@ -11,7 +11,7 @@ "dependencies": { "@modelcontextprotocol/sdk": "^1.0.0", "@xenova/transformers": "^2.17.2", - "better-sqlite3": "^11.0.0", + "better-sqlite3": "^12.6.2", "commander": "^14.0.3", "csv-parse": "^6.1.0", "js-yaml": "^4.1.1", @@ -20,6 +20,7 @@ "openai": "^6.25.0", "pino": "^10.3.1", "sqlite-vec": "^0.1.0", + "undici": "^7.22.0", "zod": "^4.3.6" }, "bin": { @@ -28,14 +29,14 @@ "devDependencies": { "@types/better-sqlite3": "^7.6.0", "@types/js-yaml": "^4.0.9", - "@types/node": "^22.0.0", + "@types/node": "^25.3.3", "@types/node-cron": "^3.0.11", "@types/pdf-parse": "^1.1.5", "@typescript-eslint/eslint-plugin": "^8.0.0", "@typescript-eslint/parser": "^8.0.0", "@vitest/coverage-v8": "^4.0.18", - "eslint": "^9.0.0", - "eslint-config-prettier": "^9.0.0", + "eslint": "^10.0.2", + "eslint-config-prettier": "^10.1.8", "husky": "^9.0.0", "lint-staged": "^16.3.1", "prettier": "^3.0.0", @@ -450,207 +451,74 @@ } }, "node_modules/@eslint/config-array": { - "version": "0.21.1", - "resolved": "https://registry.npmjs.org/@eslint/config-array/-/config-array-0.21.1.tgz", - "integrity": 
"sha512-aw1gNayWpdI/jSYVgzN5pL0cfzU02GT3NBpeT/DXbx1/1x7ZKxFPd9bwrzygx/qiwIQiJ1sw/zD8qY/kRvlGHA==", + "version": "0.23.2", + "resolved": "https://registry.npmjs.org/@eslint/config-array/-/config-array-0.23.2.tgz", + "integrity": "sha512-YF+fE6LV4v5MGWRGj7G404/OZzGNepVF8fxk7jqmqo3lrza7a0uUcDnROGRBG1WFC1omYUS/Wp1f42i0M+3Q3A==", "dev": true, "license": "Apache-2.0", "dependencies": { - "@eslint/object-schema": "^2.1.7", + "@eslint/object-schema": "^3.0.2", "debug": "^4.3.1", - "minimatch": "^3.1.2" + "minimatch": "^10.2.1" }, "engines": { - "node": "^18.18.0 || ^20.9.0 || >=21.1.0" - } - }, - "node_modules/@eslint/config-array/node_modules/balanced-match": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/balanced-match/-/balanced-match-1.0.2.tgz", - "integrity": "sha512-3oSeUO0TMV67hN1AmbXsK4yaqU7tjiHlbxRDZOpH0KW9+CeX4bRAaX0Anxt0tx2MrpRpWwQaPwIlISEJhYU5Pw==", - "dev": true, - "license": "MIT" - }, - "node_modules/@eslint/config-array/node_modules/brace-expansion": { - "version": "1.1.12", - "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-1.1.12.tgz", - "integrity": "sha512-9T9UjW3r0UW5c1Q7GTwllptXwhvYmEzFhzMfZ9H7FQWt+uZePjZPjBP/W1ZEyZ1twGWom5/56TF4lPcqjnDHcg==", - "dev": true, - "license": "MIT", - "dependencies": { - "balanced-match": "^1.0.0", - "concat-map": "0.0.1" - } - }, - "node_modules/@eslint/config-array/node_modules/minimatch": { - "version": "3.1.5", - "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-3.1.5.tgz", - "integrity": "sha512-VgjWUsnnT6n+NUk6eZq77zeFdpW2LWDzP6zFGrCbHXiYNul5Dzqk2HHQ5uFH2DNW5Xbp8+jVzaeNt94ssEEl4w==", - "dev": true, - "license": "ISC", - "dependencies": { - "brace-expansion": "^1.1.7" - }, - "engines": { - "node": "*" + "node": "^20.19.0 || ^22.13.0 || >=24" } }, "node_modules/@eslint/config-helpers": { - "version": "0.4.2", - "resolved": "https://registry.npmjs.org/@eslint/config-helpers/-/config-helpers-0.4.2.tgz", - "integrity": 
"sha512-gBrxN88gOIf3R7ja5K9slwNayVcZgK6SOUORm2uBzTeIEfeVaIhOpCtTox3P6R7o2jLFwLFTLnC7kU/RGcYEgw==", + "version": "0.5.2", + "resolved": "https://registry.npmjs.org/@eslint/config-helpers/-/config-helpers-0.5.2.tgz", + "integrity": "sha512-a5MxrdDXEvqnIq+LisyCX6tQMPF/dSJpCfBgBauY+pNZ28yCtSsTvyTYrMhaI+LK26bVyCJfJkT0u8KIj2i1dQ==", "dev": true, "license": "Apache-2.0", "dependencies": { - "@eslint/core": "^0.17.0" + "@eslint/core": "^1.1.0" }, "engines": { - "node": "^18.18.0 || ^20.9.0 || >=21.1.0" + "node": "^20.19.0 || ^22.13.0 || >=24" } }, "node_modules/@eslint/core": { - "version": "0.17.0", - "resolved": "https://registry.npmjs.org/@eslint/core/-/core-0.17.0.tgz", - "integrity": "sha512-yL/sLrpmtDaFEiUj1osRP4TI2MDz1AddJL+jZ7KSqvBuliN4xqYY54IfdN8qD8Toa6g1iloph1fxQNkjOxrrpQ==", + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/@eslint/core/-/core-1.1.0.tgz", + "integrity": "sha512-/nr9K9wkr3P1EzFTdFdMoLuo1PmIxjmwvPozwoSodjNBdefGujXQUF93u1DDZpEaTuDvMsIQddsd35BwtrW9Xw==", "dev": true, "license": "Apache-2.0", "dependencies": { "@types/json-schema": "^7.0.15" }, "engines": { - "node": "^18.18.0 || ^20.9.0 || >=21.1.0" - } - }, - "node_modules/@eslint/eslintrc": { - "version": "3.3.4", - "resolved": "https://registry.npmjs.org/@eslint/eslintrc/-/eslintrc-3.3.4.tgz", - "integrity": "sha512-4h4MVF8pmBsncB60r0wSJiIeUKTSD4m7FmTFThG8RHlsg9ajqckLm9OraguFGZE4vVdpiI1Q4+hFnisopmG6gQ==", - "dev": true, - "license": "MIT", - "dependencies": { - "ajv": "^6.14.0", - "debug": "^4.3.2", - "espree": "^10.0.1", - "globals": "^14.0.0", - "ignore": "^5.2.0", - "import-fresh": "^3.2.1", - "js-yaml": "^4.1.1", - "minimatch": "^3.1.3", - "strip-json-comments": "^3.1.1" - }, - "engines": { - "node": "^18.18.0 || ^20.9.0 || >=21.1.0" - }, - "funding": { - "url": "https://opencollective.com/eslint" - } - }, - "node_modules/@eslint/eslintrc/node_modules/ajv": { - "version": "6.14.0", - "resolved": "https://registry.npmjs.org/ajv/-/ajv-6.14.0.tgz", - "integrity": 
"sha512-IWrosm/yrn43eiKqkfkHis7QioDleaXQHdDVPKg0FSwwd/DuvyX79TZnFOnYpB7dcsFAMmtFztZuXPDvSePkFw==", - "dev": true, - "license": "MIT", - "dependencies": { - "fast-deep-equal": "^3.1.1", - "fast-json-stable-stringify": "^2.0.0", - "json-schema-traverse": "^0.4.1", - "uri-js": "^4.2.2" - }, - "funding": { - "type": "github", - "url": "https://github.com/sponsors/epoberezkin" - } - }, - "node_modules/@eslint/eslintrc/node_modules/balanced-match": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/balanced-match/-/balanced-match-1.0.2.tgz", - "integrity": "sha512-3oSeUO0TMV67hN1AmbXsK4yaqU7tjiHlbxRDZOpH0KW9+CeX4bRAaX0Anxt0tx2MrpRpWwQaPwIlISEJhYU5Pw==", - "dev": true, - "license": "MIT" - }, - "node_modules/@eslint/eslintrc/node_modules/brace-expansion": { - "version": "1.1.12", - "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-1.1.12.tgz", - "integrity": "sha512-9T9UjW3r0UW5c1Q7GTwllptXwhvYmEzFhzMfZ9H7FQWt+uZePjZPjBP/W1ZEyZ1twGWom5/56TF4lPcqjnDHcg==", - "dev": true, - "license": "MIT", - "dependencies": { - "balanced-match": "^1.0.0", - "concat-map": "0.0.1" - } - }, - "node_modules/@eslint/eslintrc/node_modules/ignore": { - "version": "5.3.2", - "resolved": "https://registry.npmjs.org/ignore/-/ignore-5.3.2.tgz", - "integrity": "sha512-hsBTNUqQTDwkWtcdYI2i06Y/nUBEsNEDJKjWdigLvegy8kDuJAS8uRlpkkcQpyEXL0Z/pjDy5HBmMjRCJ2gq+g==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">= 4" - } - }, - "node_modules/@eslint/eslintrc/node_modules/json-schema-traverse": { - "version": "0.4.1", - "resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-0.4.1.tgz", - "integrity": "sha512-xbbCH5dCYU5T8LcEhhuh7HJ88HXuW3qsI3Y0zOZFKfZEHcpWiHU/Jxzk629Brsab/mMiHQti9wMP+845RPe3Vg==", - "dev": true, - "license": "MIT" - }, - "node_modules/@eslint/eslintrc/node_modules/minimatch": { - "version": "3.1.5", - "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-3.1.5.tgz", - "integrity": 
"sha512-VgjWUsnnT6n+NUk6eZq77zeFdpW2LWDzP6zFGrCbHXiYNul5Dzqk2HHQ5uFH2DNW5Xbp8+jVzaeNt94ssEEl4w==", - "dev": true, - "license": "ISC", - "dependencies": { - "brace-expansion": "^1.1.7" - }, - "engines": { - "node": "*" - } - }, - "node_modules/@eslint/js": { - "version": "9.39.3", - "resolved": "https://registry.npmjs.org/@eslint/js/-/js-9.39.3.tgz", - "integrity": "sha512-1B1VkCq6FuUNlQvlBYb+1jDu/gV297TIs/OeiaSR9l1H27SVW55ONE1e1Vp16NqP683+xEGzxYtv4XCiDPaQiw==", - "dev": true, - "license": "MIT", - "engines": { - "node": "^18.18.0 || ^20.9.0 || >=21.1.0" - }, - "funding": { - "url": "https://eslint.org/donate" + "node": "^20.19.0 || ^22.13.0 || >=24" } }, "node_modules/@eslint/object-schema": { - "version": "2.1.7", - "resolved": "https://registry.npmjs.org/@eslint/object-schema/-/object-schema-2.1.7.tgz", - "integrity": "sha512-VtAOaymWVfZcmZbp6E2mympDIHvyjXs/12LqWYjVw6qjrfF+VK+fyG33kChz3nnK+SU5/NeHOqrTEHS8sXO3OA==", + "version": "3.0.2", + "resolved": "https://registry.npmjs.org/@eslint/object-schema/-/object-schema-3.0.2.tgz", + "integrity": "sha512-HOy56KJt48Bx8KmJ+XGQNSUMT/6dZee/M54XyUyuvTvPXJmsERRvBchsUVx1UMe1WwIH49XLAczNC7V2INsuUw==", "dev": true, "license": "Apache-2.0", "engines": { - "node": "^18.18.0 || ^20.9.0 || >=21.1.0" + "node": "^20.19.0 || ^22.13.0 || >=24" } }, "node_modules/@eslint/plugin-kit": { - "version": "0.4.1", - "resolved": "https://registry.npmjs.org/@eslint/plugin-kit/-/plugin-kit-0.4.1.tgz", - "integrity": "sha512-43/qtrDUokr7LJqoF2c3+RInu/t4zfrpYdoSDfYyhg52rwLV6TnOvdG4fXm7IkSB3wErkcmJS9iEhjVtOSEjjA==", + "version": "0.6.0", + "resolved": "https://registry.npmjs.org/@eslint/plugin-kit/-/plugin-kit-0.6.0.tgz", + "integrity": "sha512-bIZEUzOI1jkhviX2cp5vNyXQc6olzb2ohewQubuYlMXZ2Q/XjBO0x0XhGPvc9fjSIiUN0vw+0hq53BJ4eQSJKQ==", "dev": true, "license": "Apache-2.0", "dependencies": { - "@eslint/core": "^0.17.0", + "@eslint/core": "^1.1.0", "levn": "^0.4.1" }, "engines": { - "node": "^18.18.0 || ^20.9.0 || >=21.1.0" + "node": "^20.19.0 || 
^22.13.0 || >=24" } }, "node_modules/@hono/node-server": { - "version": "1.19.9", - "resolved": "https://registry.npmjs.org/@hono/node-server/-/node-server-1.19.9.tgz", - "integrity": "sha512-vHL6w3ecZsky+8P5MD+eFfaGTyCeOHUIFYMGpQGbrBTSmNNoxv0if69rEZ5giu36weC5saFuznL411gRX7bJDw==", + "version": "1.19.10", + "resolved": "https://registry.npmjs.org/@hono/node-server/-/node-server-1.19.10.tgz", + "integrity": "sha512-hZ7nOssGqRgyV3FVVQdfi+U4q02uB23bpnYpdvNXkYTRRyWx84b7yf1ans+dnJ/7h41sGL3CeQTfO+ZGxuO+Iw==", "license": "MIT", "engines": { "node": ">=18.14.1" @@ -1532,6 +1400,13 @@ "dev": true, "license": "MIT" }, + "node_modules/@types/esrecurse": { + "version": "4.3.1", + "resolved": "https://registry.npmjs.org/@types/esrecurse/-/esrecurse-4.3.1.tgz", + "integrity": "sha512-xJBAbDifo5hpffDBuHl0Y8ywswbiAp/Wi7Y/GtAgSlZyIABppyurxVueOPE8LUQOxdlgi6Zqce7uoEpqNTeiUw==", + "dev": true, + "license": "MIT" + }, "node_modules/@types/estree": { "version": "1.0.8", "resolved": "https://registry.npmjs.org/@types/estree/-/estree-1.0.8.tgz", @@ -1605,12 +1480,12 @@ "license": "MIT" }, "node_modules/@types/node": { - "version": "22.19.13", - "resolved": "https://registry.npmjs.org/@types/node/-/node-22.19.13.tgz", - "integrity": "sha512-akNQMv0wW5uyRpD2v2IEyRSZiR+BeGuoB6L310EgGObO44HSMNT8z1xzio28V8qOrgYaopIDNA18YgdXd+qTiw==", + "version": "25.3.3", + "resolved": "https://registry.npmjs.org/@types/node/-/node-25.3.3.tgz", + "integrity": "sha512-DpzbrH7wIcBaJibpKo9nnSQL0MTRdnWttGyE5haGwK86xgMOkFLp7vEyfQPGLOJh5wNYiJ3V9PmUMDhV9u8kkQ==", "license": "MIT", "dependencies": { - "undici-types": "~6.21.0" + "undici-types": "~7.18.0" } }, "node_modules/@types/node-cron": { @@ -2452,22 +2327,6 @@ "url": "https://github.com/chalk/ansi-regex?sponsor=1" } }, - "node_modules/ansi-styles": { - "version": "4.3.0", - "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz", - "integrity": 
"sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==", - "dev": true, - "license": "MIT", - "dependencies": { - "color-convert": "^2.0.1" - }, - "engines": { - "node": ">=8" - }, - "funding": { - "url": "https://github.com/chalk/ansi-styles?sponsor=1" - } - }, "node_modules/argparse": { "version": "2.0.1", "resolved": "https://registry.npmjs.org/argparse/-/argparse-2.0.1.tgz", @@ -2637,14 +2496,17 @@ "license": "MIT" }, "node_modules/better-sqlite3": { - "version": "11.10.0", - "resolved": "https://registry.npmjs.org/better-sqlite3/-/better-sqlite3-11.10.0.tgz", - "integrity": "sha512-EwhOpyXiOEL/lKzHz9AW1msWFNzGc/z+LzeB3/jnFJpxu+th2yqvzsSWas1v9jgs9+xiXJcD5A8CJxAG2TaghQ==", + "version": "12.6.2", + "resolved": "https://registry.npmjs.org/better-sqlite3/-/better-sqlite3-12.6.2.tgz", + "integrity": "sha512-8VYKM3MjCa9WcaSAI3hzwhmyHVlH8tiGFwf0RlTsZPWJ1I5MkzjiudCo4KC4DxOaL/53A5B1sI/IbldNFDbsKA==", "hasInstallScript": true, "license": "MIT", "dependencies": { "bindings": "^1.5.0", "prebuild-install": "^7.1.1" + }, + "engines": { + "node": "20.x || 22.x || 23.x || 24.x || 25.x" } }, "node_modules/bindings": { @@ -2802,16 +2664,6 @@ "url": "https://github.com/sponsors/ljharb" } }, - "node_modules/callsites": { - "version": "3.1.0", - "resolved": "https://registry.npmjs.org/callsites/-/callsites-3.1.0.tgz", - "integrity": "sha512-P8BjAsXvZS+VIDUI11hHCQEv74YT67YUi5JJFNWIqL235sBmjX4+qx9Muvls5ivyNENctx46xQLQ3aTuE7ssaQ==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=6" - } - }, "node_modules/ccount": { "version": "2.0.1", "resolved": "https://registry.npmjs.org/ccount/-/ccount-2.0.1.tgz", @@ -2833,23 +2685,6 @@ "node": ">=18" } }, - "node_modules/chalk": { - "version": "4.1.2", - "resolved": "https://registry.npmjs.org/chalk/-/chalk-4.1.2.tgz", - "integrity": "sha512-oKnbhFyRIXpUuez8iBMmyEa4nbj4IOQyuhc/wy9kY7/WVPcwIO9VA668Pu8RkO7+0G76SLROeyw9CpQ061i4mA==", - "dev": true, - "license": "MIT", - "dependencies": { - 
"ansi-styles": "^4.1.0", - "supports-color": "^7.1.0" - }, - "engines": { - "node": ">=10" - }, - "funding": { - "url": "https://github.com/chalk/chalk?sponsor=1" - } - }, "node_modules/character-entities-html4": { "version": "2.1.0", "resolved": "https://registry.npmjs.org/character-entities-html4/-/character-entities-html4-2.1.0.tgz", @@ -2979,13 +2814,6 @@ "node": ">=20" } }, - "node_modules/concat-map": { - "version": "0.0.1", - "resolved": "https://registry.npmjs.org/concat-map/-/concat-map-0.0.1.tgz", - "integrity": "sha512-/Srv4dswyQNBfohGpz9o6Yb3Gz3SrUDqBH5rTuhGR7ahtlbYKnVxw2bCFMRljaA7EXHaXZ8wsHdodFvbkhKmqg==", - "dev": true, - "license": "MIT" - }, "node_modules/content-disposition": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/content-disposition/-/content-disposition-1.0.1.tgz", @@ -3417,33 +3245,30 @@ } }, "node_modules/eslint": { - "version": "9.39.3", - "resolved": "https://registry.npmjs.org/eslint/-/eslint-9.39.3.tgz", - "integrity": "sha512-VmQ+sifHUbI/IcSopBCF/HO3YiHQx/AVd3UVyYL6weuwW+HvON9VYn5l6Zl1WZzPWXPNZrSQpxwkkZ/VuvJZzg==", + "version": "10.0.2", + "resolved": "https://registry.npmjs.org/eslint/-/eslint-10.0.2.tgz", + "integrity": "sha512-uYixubwmqJZH+KLVYIVKY1JQt7tysXhtj21WSvjcSmU5SVNzMus1bgLe+pAt816yQ8opKfheVVoPLqvVMGejYw==", "dev": true, "license": "MIT", "dependencies": { "@eslint-community/eslint-utils": "^4.8.0", - "@eslint-community/regexpp": "^4.12.1", - "@eslint/config-array": "^0.21.1", - "@eslint/config-helpers": "^0.4.2", - "@eslint/core": "^0.17.0", - "@eslint/eslintrc": "^3.3.1", - "@eslint/js": "9.39.3", - "@eslint/plugin-kit": "^0.4.1", + "@eslint-community/regexpp": "^4.12.2", + "@eslint/config-array": "^0.23.2", + "@eslint/config-helpers": "^0.5.2", + "@eslint/core": "^1.1.0", + "@eslint/plugin-kit": "^0.6.0", "@humanfs/node": "^0.16.6", "@humanwhocodes/module-importer": "^1.0.1", "@humanwhocodes/retry": "^0.4.2", "@types/estree": "^1.0.6", - "ajv": "^6.12.4", - "chalk": "^4.0.0", + "ajv": "^6.14.0", 
"cross-spawn": "^7.0.6", "debug": "^4.3.2", "escape-string-regexp": "^4.0.0", - "eslint-scope": "^8.4.0", - "eslint-visitor-keys": "^4.2.1", - "espree": "^10.4.0", - "esquery": "^1.5.0", + "eslint-scope": "^9.1.1", + "eslint-visitor-keys": "^5.0.1", + "espree": "^11.1.1", + "esquery": "^1.7.0", "esutils": "^2.0.2", "fast-deep-equal": "^3.1.3", "file-entry-cache": "^8.0.0", @@ -3453,8 +3278,7 @@ "imurmurhash": "^0.1.4", "is-glob": "^4.0.0", "json-stable-stringify-without-jsonify": "^1.0.1", - "lodash.merge": "^4.6.2", - "minimatch": "^3.1.2", + "minimatch": "^10.2.1", "natural-compare": "^1.4.0", "optionator": "^0.9.3" }, @@ -3462,7 +3286,7 @@ "eslint": "bin/eslint.js" }, "engines": { - "node": "^18.18.0 || ^20.9.0 || >=21.1.0" + "node": "^20.19.0 || ^22.13.0 || >=24" }, "funding": { "url": "https://eslint.org/donate" @@ -3477,30 +3301,35 @@ } }, "node_modules/eslint-config-prettier": { - "version": "9.1.2", - "resolved": "https://registry.npmjs.org/eslint-config-prettier/-/eslint-config-prettier-9.1.2.tgz", - "integrity": "sha512-iI1f+D2ViGn+uvv5HuHVUamg8ll4tN+JRHGc6IJi4TP9Kl976C57fzPXgseXNs8v0iA8aSJpHsTWjDb9QJamGQ==", + "version": "10.1.8", + "resolved": "https://registry.npmjs.org/eslint-config-prettier/-/eslint-config-prettier-10.1.8.tgz", + "integrity": "sha512-82GZUjRS0p/jganf6q1rEO25VSoHH0hKPCTrgillPjdI/3bgBhAE1QzHrHTizjpRvy6pGAvKjDJtk2pF9NDq8w==", "dev": true, "license": "MIT", "bin": { "eslint-config-prettier": "bin/cli.js" }, + "funding": { + "url": "https://opencollective.com/eslint-config-prettier" + }, "peerDependencies": { "eslint": ">=7.0.0" } }, "node_modules/eslint-scope": { - "version": "8.4.0", - "resolved": "https://registry.npmjs.org/eslint-scope/-/eslint-scope-8.4.0.tgz", - "integrity": "sha512-sNXOfKCn74rt8RICKMvJS7XKV/Xk9kA7DyJr8mJik3S7Cwgy3qlkkmyS2uQB3jiJg6VNdZd/pDBJu0nvG2NlTg==", + "version": "9.1.1", + "resolved": "https://registry.npmjs.org/eslint-scope/-/eslint-scope-9.1.1.tgz", + "integrity": 
"sha512-GaUN0sWim5qc8KVErfPBWmc31LEsOkrUJbvJZV+xuL3u2phMUK4HIvXlWAakfC8W4nzlK+chPEAkYOYb5ZScIw==", "dev": true, "license": "BSD-2-Clause", "dependencies": { + "@types/esrecurse": "^4.3.1", + "@types/estree": "^1.0.8", "esrecurse": "^4.3.0", "estraverse": "^5.2.0" }, "engines": { - "node": "^18.18.0 || ^20.9.0 || >=21.1.0" + "node": "^20.19.0 || ^22.13.0 || >=24" }, "funding": { "url": "https://opencollective.com/eslint" @@ -3536,32 +3365,14 @@ "url": "https://github.com/sponsors/epoberezkin" } }, - "node_modules/eslint/node_modules/balanced-match": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/balanced-match/-/balanced-match-1.0.2.tgz", - "integrity": "sha512-3oSeUO0TMV67hN1AmbXsK4yaqU7tjiHlbxRDZOpH0KW9+CeX4bRAaX0Anxt0tx2MrpRpWwQaPwIlISEJhYU5Pw==", - "dev": true, - "license": "MIT" - }, - "node_modules/eslint/node_modules/brace-expansion": { - "version": "1.1.12", - "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-1.1.12.tgz", - "integrity": "sha512-9T9UjW3r0UW5c1Q7GTwllptXwhvYmEzFhzMfZ9H7FQWt+uZePjZPjBP/W1ZEyZ1twGWom5/56TF4lPcqjnDHcg==", - "dev": true, - "license": "MIT", - "dependencies": { - "balanced-match": "^1.0.0", - "concat-map": "0.0.1" - } - }, "node_modules/eslint/node_modules/eslint-visitor-keys": { - "version": "4.2.1", - "resolved": "https://registry.npmjs.org/eslint-visitor-keys/-/eslint-visitor-keys-4.2.1.tgz", - "integrity": "sha512-Uhdk5sfqcee/9H/rCOJikYz67o0a2Tw2hGRPOG2Y1R2dg7brRe1uG0yaNQDHu+TO/uQPF/5eCapvYSmHUjt7JQ==", + "version": "5.0.1", + "resolved": "https://registry.npmjs.org/eslint-visitor-keys/-/eslint-visitor-keys-5.0.1.tgz", + "integrity": "sha512-tD40eHxA35h0PEIZNeIjkHoDR4YjjJp34biM0mDvplBe//mB+IHCqHDGV7pxF+7MklTvighcCPPZC7ynWyjdTA==", "dev": true, "license": "Apache-2.0", "engines": { - "node": "^18.18.0 || ^20.9.0 || >=21.1.0" + "node": "^20.19.0 || ^22.13.0 || >=24" }, "funding": { "url": "https://opencollective.com/eslint" @@ -3584,45 +3395,32 @@ "dev": true, "license": "MIT" }, - 
"node_modules/eslint/node_modules/minimatch": { - "version": "3.1.5", - "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-3.1.5.tgz", - "integrity": "sha512-VgjWUsnnT6n+NUk6eZq77zeFdpW2LWDzP6zFGrCbHXiYNul5Dzqk2HHQ5uFH2DNW5Xbp8+jVzaeNt94ssEEl4w==", - "dev": true, - "license": "ISC", - "dependencies": { - "brace-expansion": "^1.1.7" - }, - "engines": { - "node": "*" - } - }, "node_modules/espree": { - "version": "10.4.0", - "resolved": "https://registry.npmjs.org/espree/-/espree-10.4.0.tgz", - "integrity": "sha512-j6PAQ2uUr79PZhBjP5C5fhl8e39FmRnOjsD5lGnWrFU8i2G776tBK7+nP8KuQUTTyAZUwfQqXAgrVH5MbH9CYQ==", + "version": "11.1.1", + "resolved": "https://registry.npmjs.org/espree/-/espree-11.1.1.tgz", + "integrity": "sha512-AVHPqQoZYc+RUM4/3Ly5udlZY/U4LS8pIG05jEjWM2lQMU/oaZ7qshzAl2YP1tfNmXfftH3ohurfwNAug+MnsQ==", "dev": true, "license": "BSD-2-Clause", "dependencies": { - "acorn": "^8.15.0", + "acorn": "^8.16.0", "acorn-jsx": "^5.3.2", - "eslint-visitor-keys": "^4.2.1" + "eslint-visitor-keys": "^5.0.1" }, "engines": { - "node": "^18.18.0 || ^20.9.0 || >=21.1.0" + "node": "^20.19.0 || ^22.13.0 || >=24" }, "funding": { "url": "https://opencollective.com/eslint" } }, "node_modules/espree/node_modules/eslint-visitor-keys": { - "version": "4.2.1", - "resolved": "https://registry.npmjs.org/eslint-visitor-keys/-/eslint-visitor-keys-4.2.1.tgz", - "integrity": "sha512-Uhdk5sfqcee/9H/rCOJikYz67o0a2Tw2hGRPOG2Y1R2dg7brRe1uG0yaNQDHu+TO/uQPF/5eCapvYSmHUjt7JQ==", + "version": "5.0.1", + "resolved": "https://registry.npmjs.org/eslint-visitor-keys/-/eslint-visitor-keys-5.0.1.tgz", + "integrity": "sha512-tD40eHxA35h0PEIZNeIjkHoDR4YjjJp34biM0mDvplBe//mB+IHCqHDGV7pxF+7MklTvighcCPPZC7ynWyjdTA==", "dev": true, "license": "Apache-2.0", "engines": { - "node": "^18.18.0 || ^20.9.0 || >=21.1.0" + "node": "^20.19.0 || ^22.13.0 || >=24" }, "funding": { "url": "https://opencollective.com/eslint" @@ -4076,19 +3874,6 @@ "node": ">=10.13.0" } }, - "node_modules/globals": { - "version": 
"14.0.0", - "resolved": "https://registry.npmjs.org/globals/-/globals-14.0.0.tgz", - "integrity": "sha512-oahGvuMGQlPw/ivIYBjVSrWAfWLBeku5tpPE2fOPLi+WHffIWbuh2tCjhyQhTBPMf5E9jDEH4FOmTYgYwbKwtQ==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=18" - }, - "funding": { - "url": "https://github.com/sponsors/sindresorhus" - } - }, "node_modules/gopd": { "version": "1.2.0", "resolved": "https://registry.npmjs.org/gopd/-/gopd-1.2.0.tgz", @@ -4189,9 +3974,9 @@ } }, "node_modules/hono": { - "version": "4.12.3", - "resolved": "https://registry.npmjs.org/hono/-/hono-4.12.3.tgz", - "integrity": "sha512-SFsVSjp8sj5UumXOOFlkZOG6XS9SJDKw0TbwFeV+AJ8xlST8kxK5Z/5EYa111UY8732lK2S/xB653ceuaoGwpg==", + "version": "4.12.5", + "resolved": "https://registry.npmjs.org/hono/-/hono-4.12.5.tgz", + "integrity": "sha512-3qq+FUBtlTHhtYxbxheZgY8NIFnkkC/MR8u5TTsr7YZ3wixryQ3cCwn3iZbg8p8B88iDBBAYSfZDS75t8MN7Vg==", "license": "MIT", "engines": { "node": ">=16.9.0" @@ -4311,23 +4096,6 @@ "license": "MIT", "optional": true }, - "node_modules/import-fresh": { - "version": "3.3.1", - "resolved": "https://registry.npmjs.org/import-fresh/-/import-fresh-3.3.1.tgz", - "integrity": "sha512-TR3KfrTZTYLPB6jUjfx6MF9WcWrHL9su5TObK4ZkYgBdWKPOFoSoQIdEuTuR82pmtxH2spWG9h6etwfr1pLBqQ==", - "dev": true, - "license": "MIT", - "dependencies": { - "parent-module": "^1.0.0", - "resolve-from": "^4.0.0" - }, - "engines": { - "node": ">=6" - }, - "funding": { - "url": "https://github.com/sponsors/sindresorhus" - } - }, "node_modules/imurmurhash": { "version": "0.1.4", "resolved": "https://registry.npmjs.org/imurmurhash/-/imurmurhash-0.1.4.tgz", @@ -4629,9 +4397,9 @@ } }, "node_modules/lint-staged": { - "version": "16.3.1", - "resolved": "https://registry.npmjs.org/lint-staged/-/lint-staged-16.3.1.tgz", - "integrity": "sha512-bqvvquXzFBAlSbluugR4KXAe4XnO/QZcKVszpkBtqLWa2KEiVy8n6Xp38OeUbv/gOJOX4Vo9u5pFt/ADvbm42Q==", + "version": "16.3.2", + "resolved": 
"https://registry.npmjs.org/lint-staged/-/lint-staged-16.3.2.tgz", + "integrity": "sha512-xKqhC2AeXLwiAHXguxBjuChoTTWFC6Pees0SHPwOpwlvI3BH7ZADFPddAdN3pgo3aiKgPUx/bxE78JfUnxQnlg==", "dev": true, "license": "MIT", "dependencies": { @@ -4686,13 +4454,6 @@ "url": "https://github.com/sponsors/sindresorhus" } }, - "node_modules/lodash.merge": { - "version": "4.6.2", - "resolved": "https://registry.npmjs.org/lodash.merge/-/lodash.merge-4.6.2.tgz", - "integrity": "sha512-0KpjqXRVvrYyCsX1swR/XTK0va6VQkQM6MNo7PqW77ByjAhoARA8EfrP1N4+KlKj8YS0ZUCtRT/YUuhyYDujIQ==", - "dev": true, - "license": "MIT" - }, "node_modules/log-update": { "version": "6.1.0", "resolved": "https://registry.npmjs.org/log-update/-/log-update-6.1.0.tgz", @@ -5423,19 +5184,6 @@ "license": "(MIT AND Zlib)", "optional": true }, - "node_modules/parent-module": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/parent-module/-/parent-module-1.0.1.tgz", - "integrity": "sha512-GQ2EWRpQV8/o+Aw8YqtfZZPfNRWZYkbidE9k5rpl/hC3vtHHBfGm2Ifi6qWV+coDGkrUKZAxE3Lot5kcsRlh+g==", - "dev": true, - "license": "MIT", - "dependencies": { - "callsites": "^3.0.0" - }, - "engines": { - "node": ">=6" - } - }, "node_modules/parseurl": { "version": "1.3.3", "resolved": "https://registry.npmjs.org/parseurl/-/parseurl-1.3.3.tgz", @@ -5918,16 +5666,6 @@ "node": ">=0.10.0" } }, - "node_modules/resolve-from": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/resolve-from/-/resolve-from-4.0.0.tgz", - "integrity": "sha512-pb/MYmXstAkysRFx8piNI1tGFNQIFA3vkE3Gq4EuA1dF6gHp/+vgZqsCGJapvy8N3Q+4o7FwvquPJcnZ7RYy4g==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=4" - } - }, "node_modules/restore-cursor": { "version": "5.1.0", "resolved": "https://registry.npmjs.org/restore-cursor/-/restore-cursor-5.1.0.tgz", @@ -6611,19 +6349,6 @@ "url": "https://github.com/chalk/strip-ansi?sponsor=1" } }, - "node_modules/strip-json-comments": { - "version": "3.1.1", - "resolved": 
"https://registry.npmjs.org/strip-json-comments/-/strip-json-comments-3.1.1.tgz", - "integrity": "sha512-6fPc+R4ihwqP6N/aIv2f1gMH8lOVtWQHoqC4yK6oSDVVocumAsfCqjkXnqiYMhmMwS/mEHLp7Vehlt3ql6lEig==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=8" - }, - "funding": { - "url": "https://github.com/sponsors/sindresorhus" - } - }, "node_modules/superjson": { "version": "2.2.6", "resolved": "https://registry.npmjs.org/superjson/-/superjson-2.2.6.tgz", @@ -6896,10 +6621,19 @@ "license": "MIT", "optional": true }, + "node_modules/undici": { + "version": "7.22.0", + "resolved": "https://registry.npmjs.org/undici/-/undici-7.22.0.tgz", + "integrity": "sha512-RqslV2Us5BrllB+JeiZnK4peryVTndy9Dnqq62S3yYRRTj0tFQCwEniUy2167skdGOy3vqRzEvl1Dm4sV2ReDg==", + "license": "MIT", + "engines": { + "node": ">=20.18.1" + } + }, "node_modules/undici-types": { - "version": "6.21.0", - "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-6.21.0.tgz", - "integrity": "sha512-iwDZqg0QAGrg9Rav5H4n0M64c3mkR59cJ6wQp+7C4nI0gsmExaedaYLNO44eT4AtBBwjbTiGPMlt2Md0T9H9JQ==", + "version": "7.18.2", + "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-7.18.2.tgz", + "integrity": "sha512-AsuCzffGHJybSaRrmr5eHr81mwJU3kjw6M+uprWvCXiNeN9SOGwQ3Jn8jb8m3Z6izVgknn1R0FTCEAP2QrLY/w==", "license": "MIT" }, "node_modules/unist-util-is": { diff --git a/package.json b/package.json index 0af45d9..4f690d5 100644 --- a/package.json +++ b/package.json @@ -53,7 +53,7 @@ "dependencies": { "@modelcontextprotocol/sdk": "^1.0.0", "@xenova/transformers": "^2.17.2", - "better-sqlite3": "^11.0.0", + "better-sqlite3": "^12.6.2", "commander": "^14.0.3", "csv-parse": "^6.1.0", "js-yaml": "^4.1.1", @@ -62,6 +62,7 @@ "openai": "^6.25.0", "pino": "^10.3.1", "sqlite-vec": "^0.1.0", + "undici": "^7.22.0", "zod": "^4.3.6" }, "optionalDependencies": { @@ -71,14 +72,14 @@ "devDependencies": { "@types/better-sqlite3": "^7.6.0", "@types/js-yaml": "^4.0.9", - "@types/node": "^22.0.0", + "@types/node": 
"^25.3.3", "@types/node-cron": "^3.0.11", "@types/pdf-parse": "^1.1.5", "@typescript-eslint/eslint-plugin": "^8.0.0", "@typescript-eslint/parser": "^8.0.0", "@vitest/coverage-v8": "^4.0.18", - "eslint": "^9.0.0", - "eslint-config-prettier": "^9.0.0", + "eslint": "^10.0.2", + "eslint-config-prettier": "^10.1.8", "husky": "^9.0.0", "lint-staged": "^16.3.1", "prettier": "^3.0.0", diff --git a/src/api/openapi.ts b/src/api/openapi.ts index 0264cbe..e61b4f2 100644 --- a/src/api/openapi.ts +++ b/src/api/openapi.ts @@ -127,22 +127,44 @@ export const OPENAPI_SPEC = { }, "/api/v1/documents/url": { post: { - summary: "Index document from URL", + summary: "Index document from URL (with optional spidering)", operationId: "indexFromUrl", requestBody: { required: true, content: { "application/json": { schema: { $ref: "#/components/schemas/IndexFromUrlRequest" }, - example: { url: "https://example.com/docs", topic: "guides" }, + examples: { + single: { + summary: "Single URL", + value: { url: "https://example.com/page", topic: "guides" }, + }, + spider: { + summary: "Spider mode", + value: { + url: "https://docs.example.com", + spider: true, + maxPages: 50, + maxDepth: 2, + }, + }, + }, }, }, }, responses: { "201": { - description: "Document indexed from URL", + description: + "Document(s) indexed. 
Returns DocumentResponse for single-URL mode, SpiderResponse for spider mode.", content: { - "application/json": { schema: { $ref: "#/components/schemas/DocumentResponse" } }, + "application/json": { + schema: { + oneOf: [ + { $ref: "#/components/schemas/DocumentResponse" }, + { $ref: "#/components/schemas/SpiderResponse" }, + ], + }, + }, }, }, }, @@ -343,6 +365,70 @@ export const OPENAPI_SPEC = { properties: { url: { type: "string", format: "uri" }, topic: { type: "string" }, + spider: { + type: "boolean", + description: "When true, crawl linked pages starting from the URL (BFS spider mode).", + }, + maxPages: { + type: "integer", + minimum: 1, + description: + "Maximum total pages to fetch in spider mode (default: 25, hard cap: 200).", + }, + maxDepth: { + type: "integer", + minimum: 0, + description: + "Maximum hop depth from the seed URL in spider mode (default: 2, hard cap: 5).", + }, + sameDomain: { + type: "boolean", + description: "Only follow links sharing the seed hostname (default: true).", + }, + pathPrefix: { + type: "string", + description: "Only follow links whose path starts with this prefix (e.g. '/docs/').", + }, + excludePatterns: { + type: "array", + items: { type: "string" }, + description: "Glob patterns for URLs to skip during spidering (e.g. ['*/changelog*']).", + }, + }, + }, + SpiderResponse: { + type: "object", + properties: { + documents: { + type: "array", + items: { + type: "object", + properties: { + id: { type: "string" }, + title: { type: "string" }, + url: { type: "string" }, + }, + }, + }, + pagesFetched: { type: "integer", description: "Pages successfully fetched and indexed." }, + pagesCrawled: { type: "integer", description: "Total pages attempted." }, + pagesSkipped: { type: "integer", description: "Pages skipped by filters or robots.txt." 
}, + errors: { + type: "array", + items: { + type: "object", + properties: { + url: { type: "string" }, + error: { type: "string" }, + }, + }, + }, + abortReason: { + type: "string", + nullable: true, + enum: ["maxPages", "timeout", null], + description: "Set if the crawl was aborted early.", + }, }, }, AskRequest: { diff --git a/src/api/routes.ts b/src/api/routes.ts index 497d3e5..a3f9856 100644 --- a/src/api/routes.ts +++ b/src/api/routes.ts @@ -35,7 +35,7 @@ import { } from "../core/index.js"; import type { LinkType, BulkSelector } from "../core/index.js"; import { loadConfig } from "../config.js"; -import { DocumentNotFoundError, LibScopeError } from "../errors.js"; +import { DocumentNotFoundError, FetchError, LibScopeError } from "../errors.js"; import { getLogger } from "../logger.js"; import { parseJsonBody, sendJson, sendError } from "./middleware.js"; import { OPENAPI_SPEC } from "./openapi.js"; @@ -52,6 +52,8 @@ import { } from "../core/webhooks.js"; import type { WebhookEvent } from "../core/webhooks.js"; import { loadScheduleEntries } from "../core/scheduler.js"; +import { spiderUrl } from "../core/spider.js"; +import type { SpiderOptions, SpiderStats } from "../core/spider.js"; function parseUrl(req: IncomingMessage): URL { return new URL(req.url ?? "/", `http://${req.headers["host"] ?? 
"localhost"}`); @@ -332,7 +334,7 @@ export async function handleRequest( return; } - // Index from URL + // Index from URL (with optional spidering) if (pathname === "/api/v1/documents/url" && method === "POST") { const body = await parseJsonBody(req); if (!body || typeof body !== "object") { @@ -344,16 +346,100 @@ export async function handleRequest( sendError(res, 400, "VALIDATION_ERROR", "Field 'url' is required"); return; } - const fetched = await fetchAndConvert(b["url"], { - allowPrivateUrls: loadConfig().indexing.allowPrivateUrls, - allowSelfSignedCerts: loadConfig().indexing.allowSelfSignedCerts, - }); + const url = b["url"]; const topicId = typeof b["topic"] === "string" ? b["topic"] : undefined; + const config = loadConfig(); + const fetchOptions = { + allowPrivateUrls: config.indexing.allowPrivateUrls, + allowSelfSignedCerts: config.indexing.allowSelfSignedCerts, + }; + + // Spider mode — crawl linked pages + if (b["spider"] === true) { + // Validate optional numeric fields — must be finite positive integers + if (b["maxPages"] !== undefined) { + const v = b["maxPages"]; + if (typeof v !== "number" || !Number.isFinite(v) || !Number.isInteger(v) || v < 1) { + sendError(res, 400, "VALIDATION_ERROR", "maxPages must be a positive integer"); + return; + } + } + if (b["maxDepth"] !== undefined) { + const v = b["maxDepth"]; + if (typeof v !== "number" || !Number.isFinite(v) || !Number.isInteger(v) || v < 0) { + sendError(res, 400, "VALIDATION_ERROR", "maxDepth must be a non-negative integer"); + return; + } + } + + const spiderOptions: SpiderOptions = { + fetchOptions, + ...(typeof b["maxPages"] === "number" ? { maxPages: b["maxPages"] } : {}), + ...(typeof b["maxDepth"] === "number" ? { maxDepth: b["maxDepth"] } : {}), + ...(typeof b["sameDomain"] === "boolean" ? { sameDomain: b["sameDomain"] } : {}), + ...(typeof b["pathPrefix"] === "string" ? { pathPrefix: b["pathPrefix"] } : {}), + ...(Array.isArray(b["excludePatterns"]) + ? 
{ + excludePatterns: (b["excludePatterns"] as unknown[]).filter( + (p): p is string => typeof p === "string", + ), + } + : {}), + }; + + const indexedDocs: Array<{ id: string; title: string; url: string }> = []; + const errors: Array<{ url: string; error: string }> = []; + let stats: SpiderStats = { pagesFetched: 0, pagesCrawled: 0, pagesSkipped: 0, errors }; + + const gen = spiderUrl(url, spiderOptions); + let result = await gen.next(); + while (!result.done) { + const page = result.value; + try { + const doc = await indexDocument(db, provider, { + content: page.content, + title: page.title, + sourceType: "manual", + url: page.url, + topicId, + }); + indexedDocs.push({ id: doc.id, title: page.title, url: page.url }); + } catch (indexErr) { + const msg = indexErr instanceof Error ? indexErr.message : String(indexErr); + errors.push({ url: page.url, error: msg }); + } + result = await gen.next(); + } + // result.value is SpiderStats when done (generator is exhausted) + if (result.done && result.value) { + stats = result.value; + stats.errors = errors; + } + + const took = Math.round(performance.now() - start); + sendJson( + res, + 201, + { + documents: indexedDocs, + pagesFetched: indexedDocs.length, + pagesCrawled: stats.pagesCrawled, + pagesSkipped: stats.pagesSkipped, + errors, + abortReason: stats.abortReason ?? 
null, + }, + took, + ); + return; + } + + // Single-URL mode (default) + const fetched = await fetchAndConvert(url, fetchOptions); const doc = await indexDocument(db, provider, { content: fetched.content, title: fetched.title, sourceType: "manual", - url: b["url"], + url, topicId, }); const took = Math.round(performance.now() - start); @@ -850,6 +936,10 @@ export async function handleRequest( sendError(res, 404, "NOT_FOUND", err.message); return; } + if (err instanceof FetchError) { + sendError(res, 502, "FETCH_ERROR", err.message); + return; + } if (err instanceof LibScopeError && err.code === "VALIDATION_ERROR") { sendError(res, 400, "VALIDATION_ERROR", err.message); return; diff --git a/src/cli/index.ts b/src/cli/index.ts index e775e2b..b92bc7f 100644 --- a/src/cli/index.ts +++ b/src/cli/index.ts @@ -32,6 +32,7 @@ import { } from "../core/analytics.js"; import { startRepl } from "./repl.js"; import { confirmAction } from "./confirm.js"; +import { createReporter, isVerbose } from "./reporter.js"; import { addTagsToDocument, removeTagFromDocument, @@ -55,6 +56,7 @@ import { listInstalledPacks, listAvailablePacks, createPack, + createPackFromSource, } from "../core/packs.js"; import { execSync } from "node:child_process"; @@ -119,7 +121,7 @@ program .name("libscope") .description("AI-powered knowledge base with MCP integration") .version(_pkg.version) - .option("--verbose", "Enable verbose logging") + .option("-v, --verbose", "Enable verbose logging") .option("--log-level ", "Set log level (debug, info, warn, error, silent)") .option("--workspace ", "Use a specific workspace"); @@ -1085,10 +1087,15 @@ interface ProgramOpts { } function setupLogging(opts: ProgramOpts): void { - const level: LogLevel = opts.verbose - ? "debug" - : ((opts.logLevel as LogLevel | undefined) ?? 
loadConfig().logging.level); - initLogger(level); + if (isVerbose(opts.verbose)) { + initLogger("debug"); + } else if (opts.logLevel) { + initLogger(opts.logLevel as LogLevel); + } else { + // Default to silent in CLI mode — pretty reporter handles user-facing output. + // Set LIBSCOPE_VERBOSE=1 or pass --verbose to see structured JSON logs. + initLogger("silent"); + } } /** Shared CLI initialization: loadConfig → setupLogging → getDatabase → runMigrations. */ @@ -1607,21 +1614,61 @@ const packCmd = program.command("pack").description("Manage knowledge packs"); packCmd .command("install ") - .description("Install a knowledge pack from registry or local .json file") + .description("Install a knowledge pack from registry or local .json/.json.gz file") .option("--registry ", "Custom registry URL") - .action(async (nameOrPath: string, opts: { registry?: string }) => { - const { db, provider } = initializeAppWithEmbedding(); - const result = await installPack(db, provider, nameOrPath, { - registryUrl: opts.registry, - }); - if (result.alreadyInstalled) { - console.log(`Pack "${result.packName}" is already installed.`); - } else { - console.log( - `✓ Pack "${result.packName}" installed (${result.documentsInstalled} documents).`, - ); - } - }); + .option("--batch-size ", "Number of documents to embed per batch (default: 10)") + .option("--resume-from ", "Skip the first N documents (resume a partial install)") + .option("--concurrency ", "Number of batches to embed in parallel (default: 4)") + .action( + async ( + nameOrPath: string, + opts: { registry?: string; batchSize?: string; resumeFrom?: string; concurrency?: string }, + ) => { + const { db, provider } = initializeAppWithEmbedding(); + const globalOpts = program.opts(); + const reporter = createReporter(globalOpts.verbose); + + const batchSize = opts.batchSize ? parseIntOption(opts.batchSize, "--batch-size") : undefined; + const resumeFrom = opts.resumeFrom + ? 
parseIntOption(opts.resumeFrom, "--resume-from") + : undefined; + const concurrency = opts.concurrency + ? parseIntOption(opts.concurrency, "--concurrency") + : undefined; + + if (concurrency !== undefined && concurrency < 1) { + reporter.log('Error: "--concurrency" must be an integer greater than or equal to 1.'); + closeDatabase(); + process.exit(1); + return; + } + + try { + const result = await installPack(db, provider, nameOrPath, { + registryUrl: opts.registry, + batchSize, + resumeFrom, + concurrency, + onProgress: (current, total, docTitle) => { + reporter.progress(current, total, docTitle); + }, + }); + + reporter.clearProgress(); + + if (result.alreadyInstalled) { + reporter.log(`Pack "${result.packName}" is already installed.`); + } else { + const errMsg = result.errors > 0 ? ` (${result.errors} errors)` : ""; + reporter.success( + `Pack "${result.packName}" installed: ${result.documentsInstalled} documents${errMsg}.`, + ); + } + } finally { + closeDatabase(); + } + }, + ); packCmd .command("remove ") @@ -1682,38 +1729,81 @@ packCmd packCmd .command("create") - .description("Export current documents as a pack file") + .description("Create a pack from the database, a local folder, or a URL") .requiredOption("--name ", "Pack name") - .option("--topic ", "Filter documents by topic ID") + .option("--from ", "Source folder(s), file(s), or URL(s) to build pack from") + .option("--topic ", "Filter documents by topic ID (database mode only)") .option("--version ", "Pack version (default: 1.0.0)") .option("--description ", "Pack description") .option("--author ", "Pack author") .option("--output ", "Output file path") + .option( + "--extensions ", + "Comma-separated file extensions to include (e.g. .md,.html). Default: all supported", + ) + .option("--exclude ", "Glob patterns to exclude (e.g. 
*.min.js, assets/**)") + .option("--no-recursive", "Do not recurse into subdirectories") .action( - (opts: { + async (opts: { name: string; + from?: string[]; topic?: string; version?: string; description?: string; author?: string; output?: string; + extensions?: string; + exclude?: string[]; + recursive: boolean; }) => { - const { db } = initializeApp(); - try { - const outputPath = opts.output ?? `${opts.name}.json`; - const pack = createPack(db, { + if (opts.from && opts.from.length > 0) { + // Source mode: build pack directly from files/URLs (no database needed) + // Default to .json.gz for source packs (they can be large) + const outputPath = opts.output ?? `${opts.name}.json.gz`; + const extensionList = opts.extensions + ? opts.extensions.split(",").map((e) => e.trim()) + : undefined; + + const pack = await createPackFromSource({ name: opts.name, + from: opts.from, version: opts.version, description: opts.description, author: opts.author, - topic: opts.topic, outputPath, + extensions: extensionList, + exclude: opts.exclude, + recursive: opts.recursive, + onProgress: ({ file, index, total }) => { + const pct = Math.round(((index + 1) / total) * 100); + const short = file.length > 60 ? `...${file.slice(-57)}` : file; + process.stdout.write(`\r [${pct}%] ${index + 1}/${total} ${short}`.padEnd(80)); + }, }); + // Clear progress line + process.stdout.write("\r" + " ".repeat(80) + "\r"); console.log( `✓ Pack "${pack.name}" created with ${pack.documents.length} documents → ${outputPath}`, ); - } finally { - closeDatabase(); + } else { + // Database mode: export existing documents + const outputPath = opts.output ?? 
`${opts.name}.json`; + const { db } = initializeApp(); + try { + const pack = createPack(db, { + name: opts.name, + version: opts.version, + description: opts.description, + author: opts.author, + topic: opts.topic, + outputPath, + }); + console.log( + `✓ Pack "${pack.name}" created with ${pack.documents.length} documents → ${outputPath}`, + ); + } finally { + closeDatabase(); + } } }, ); @@ -1729,9 +1819,7 @@ connectCmd .option("--notebook ", "Sync a specific notebook") .action(async (opts: { token?: string; sync?: boolean; notebook?: string }) => { const config = loadConfig(); - const logLevel = - (program.opts().logLevel as LogLevel) ?? (program.opts().verbose ? "debug" : "info"); - initLogger(logLevel); + setupLogging(program.opts()); const workspace = program.opts().workspace as string | undefined; if (workspace) { @@ -1845,9 +1933,7 @@ disconnectCmd return; } const config = loadConfig(); - const logLevel = - (program.opts().logLevel as LogLevel) ?? (program.opts().verbose ? "debug" : "info"); - initLogger(logLevel); + setupLogging(program.opts()); const workspace2 = program.opts().workspace as string | undefined; if (workspace2) { diff --git a/src/cli/reporter.ts b/src/cli/reporter.ts new file mode 100644 index 0000000..22533cb --- /dev/null +++ b/src/cli/reporter.ts @@ -0,0 +1,87 @@ +/** + * CLI output reporter — pretty human-readable output for interactive terminals. + * In verbose/JSON mode, a SilentReporter is used so pino JSON logs handle output. 
+ */ + +const RESET = "\x1b[0m"; +const GREEN = "\x1b[32m"; +const YELLOW = "\x1b[33m"; +const RED = "\x1b[31m"; +const CYAN = "\x1b[36m"; +const DIM = "\x1b[2m"; + +export interface CliReporter { + log(msg: string): void; + success(msg: string): void; + warn(msg: string): void; + error(msg: string): void; + progress(current: number, total: number, label: string): void; + clearProgress(): void; +} + +function buildBar(pct: number, width = 20): string { + const filled = Math.round((pct / 100) * width); + return "\u2588".repeat(filled) + "\u2591".repeat(width - filled); +} + +/** Pretty human-readable reporter. Uses ANSI colors and \r progress lines. */ +class PrettyReporter implements CliReporter { + private hasProgress = false; + + log(msg: string): void { + this.clearProgress(); + process.stdout.write(`${msg}\n`); + } + + success(msg: string): void { + this.clearProgress(); + process.stdout.write(`${GREEN}\u2713${RESET} ${msg}\n`); + } + + warn(msg: string): void { + this.clearProgress(); + process.stderr.write(`${YELLOW}\u26a0${RESET} ${msg}\n`); + } + + error(msg: string): void { + this.clearProgress(); + process.stderr.write(`${RED}\u2717${RESET} ${msg}\n`); + } + + progress(current: number, total: number, label: string): void { + const pct = total > 0 ? Math.round((current / total) * 100) : 0; + const bar = buildBar(pct); + const truncatedLabel = label.length > 40 ? `${label.slice(0, 37)}...` : label; + const line = `${CYAN}[${bar}]${RESET} ${pct}% (${current}/${total}) ${DIM}${truncatedLabel}${RESET}`; + process.stdout.write(`\r${line}`); + this.hasProgress = true; + } + + clearProgress(): void { + if (this.hasProgress) { + const width = process.stdout.columns ?? 80; + process.stdout.write(`\r${" ".repeat(width - 1)}\r`); + this.hasProgress = false; + } + } +} + +/** No-op reporter: used in verbose/JSON mode where pino logs handle output. 
*/ +class SilentReporter implements CliReporter { + log(_msg: string): void {} + success(_msg: string): void {} + warn(_msg: string): void {} + error(_msg: string): void {} + progress(_current: number, _total: number, _label: string): void {} + clearProgress(): void {} +} + +/** Returns true if verbose mode is active (flag or env var). */ +export function isVerbose(verbose?: boolean): boolean { + return verbose === true || process.env["LIBSCOPE_VERBOSE"] === "1"; +} + +/** Create a reporter appropriate for the current mode. */ +export function createReporter(verbose?: boolean): CliReporter { + return isVerbose(verbose) ? new SilentReporter() : new PrettyReporter(); +} diff --git a/src/config.ts b/src/config.ts index dab9d4c..21fe082 100644 --- a/src/config.ts +++ b/src/config.ts @@ -13,7 +13,7 @@ export interface LibScopeConfig { openaiModel?: string; }; llm?: { - provider?: "openai" | "ollama"; + provider?: "openai" | "ollama" | "passthrough"; model?: string; ollamaUrl?: string; openaiApiKey?: string; @@ -112,9 +112,16 @@ function getEnvOverrides(): Partial { }; } - if (llmProvider === "openai" || llmProvider === "ollama" || llmModel) { + if ( + llmProvider === "openai" || + llmProvider === "ollama" || + llmProvider === "passthrough" || + llmModel + ) { overrides.llm = { - ...(llmProvider === "openai" || llmProvider === "ollama" ? { provider: llmProvider } : {}), + ...(llmProvider === "openai" || llmProvider === "ollama" || llmProvider === "passthrough" + ? { provider: llmProvider } + : {}), ...(llmModel ? { model: llmModel } : {}), }; } @@ -122,8 +129,22 @@ function getEnvOverrides(): Partial { return overrides; } -/** Load config with precedence: env > project > user > defaults */ +let _configCache: LibScopeConfig | null = null; +let _configCacheAt = 0; +const CONFIG_CACHE_TTL_MS = 30_000; + +/** Invalidate the config cache (e.g. after saving new values). 
*/ +export function invalidateConfigCache(): void { + _configCache = null; + _configCacheAt = 0; +} + +/** Load config with precedence: env > project > user > defaults. Result is cached for 30 s. */ export function loadConfig(): LibScopeConfig { + const now = Date.now(); + if (_configCache && now - _configCacheAt < CONFIG_CACHE_TTL_MS) { + return _configCache; + } const userConfig = loadJsonFile(getUserConfigPath()); const projectConfig = loadJsonFile(getProjectConfigPath()); const envOverrides = getEnvOverrides(); @@ -160,6 +181,8 @@ export function loadConfig(): LibScopeConfig { validateConfig(config); + _configCache = config; + _configCacheAt = now; return config; } @@ -257,4 +280,5 @@ export function saveUserConfig(config: Partial): void { }, }; writeFileSync(getUserConfigPath(), JSON.stringify(merged, null, 2), "utf-8"); + invalidateConfigCache(); } diff --git a/src/connectors/confluence.ts b/src/connectors/confluence.ts index 9068182..641f4da 100644 --- a/src/connectors/confluence.ts +++ b/src/connectors/confluence.ts @@ -115,19 +115,10 @@ async function confluenceFetch(url: string, auth: string): Promise { async function fetchAllPages(initialUrl: string, baseUrl: string, auth: string): Promise { const all: T[] = []; let url: string | undefined = initialUrl; - const MAX_PAGES = 10_000; while (url) { const resp: PaginatedResponse = await confluenceFetch>(url, auth); all.push(...resp.results); - if (all.length >= MAX_PAGES) { - const log = getLogger(); - log.warn( - { count: all.length, max: MAX_PAGES }, - "Reached max page limit, stopping pagination", - ); - break; - } const next: string | undefined = resp._links?.next; url = next ? `${baseUrl}${next}` : undefined; } @@ -221,6 +212,38 @@ function extractTagContent(html: string, tagName: string): string { return html.slice(contentStart, end); } +/** + * Remove self-closing `` tags whose attribute + * string matches `nameTest`. Uses indexOf so there is no regex backtracking. 
+ */ +function removeSelfClosingMacros(html: string, nameTest: RegExp): string { + const OPEN = "", start + OPEN.length); + if (tagEnd === -1) { + result += html.slice(pos); + break; + } + const isSelfClosing = html[tagEnd - 1] === "/"; + const attrs = html.slice(start, tagEnd + 1); + if (isSelfClosing && nameTest.test(attrs)) { + result += html.slice(pos, start); // drop the self-closing tag + } else { + result += html.slice(pos, tagEnd + 1); // keep it + } + pos = tagEnd + 1; + } + return result; +} + export function convertConfluenceStorage(html: string): string { let processed = html; @@ -263,13 +286,12 @@ export function convertConfluenceStorage(html: string): string { return `

${title}

${body.trim()}`; }); - // TOC → strip + // TOC → strip (paired tags handled by replaceStructuredMacros, self-closing by indexOf helper) processed = replaceStructuredMacros(processed, (_inner, attrs) => { if (!/ac:name="toc"/i.test(attrs)) return undefined; return ""; }); - // Self-closing TOC - processed = processed.replace(/]*ac:name="toc"[^>]*\/>/gi, ""); + processed = removeSelfClosingMacros(processed, /ac:name="toc"/i); // JIRA macro → [JIRA: KEY-123] as a span to avoid escaping processed = replaceStructuredMacros(processed, (inner, attrs) => { diff --git a/src/connectors/http-utils.ts b/src/connectors/http-utils.ts index b788367..aeb97bb 100644 --- a/src/connectors/http-utils.ts +++ b/src/connectors/http-utils.ts @@ -1,22 +1,13 @@ +import { Agent } from "undici"; import { getLogger } from "../logger.js"; import { FetchError } from "../errors.js"; import { loadConfig } from "../config.js"; -let tlsWarningLogged = false; - -/** - * Log a one-time warning when `allowSelfSignedCerts` is enabled but the - * user has not set `NODE_TLS_REJECT_UNAUTHORIZED=0` in their environment. - */ -function warnIfTlsBypassMissing(): void { - if (tlsWarningLogged) return; - if (process.env["NODE_TLS_REJECT_UNAUTHORIZED"] === "0") return; - tlsWarningLogged = true; - const log = getLogger(); - log.warn( - "allowSelfSignedCerts is enabled but NODE_TLS_REJECT_UNAUTHORIZED is not set. " + - "Set NODE_TLS_REJECT_UNAUTHORIZED=0 in your environment to allow self-signed certificates.", - ); +/** Lazy singleton undici Agent that skips TLS certificate verification. 
*/ +let _insecureAgent: Agent | undefined; +function getInsecureAgent(): Agent { + _insecureAgent ??= new Agent({ connect: { rejectUnauthorized: false } }); + return _insecureAgent; } export interface RetryConfig { @@ -39,52 +30,45 @@ export async function fetchWithRetry( const log = getLogger(); const config = loadConfig(); - if (config.indexing.allowSelfSignedCerts) { - warnIfTlsBypassMissing(); - } + // Use a per-request undici Agent when self-signed certs are allowed. + // This is scoped to this fetch chain and does not affect concurrent requests. + const dispatcher = config.indexing.allowSelfSignedCerts ? getInsecureAgent() : undefined; - try { - for (let attempt = 0; attempt < maxRetries; attempt++) { - const timeoutSignal = AbortSignal.timeout(30_000); - const combinedSignal = - options?.signal != null ? AbortSignal.any([options.signal, timeoutSignal]) : timeoutSignal; + for (let attempt = 0; attempt <= maxRetries; attempt++) { + const fetchOptions = { + ...(options ?? {}), + ...(dispatcher ? 
{ dispatcher: dispatcher as unknown } : {}), + } as RequestInit; + const response = await fetch(url, fetchOptions); - const response = await fetch(url, { - ...options, - signal: combinedSignal, - }); - - if (response.status === 429 || (response.status >= 500 && response.status < 600)) { - if (attempt >= maxRetries - 1) { - const body = await response.text().catch(() => ""); - throw new FetchError(`HTTP ${response.status} after ${maxRetries} attempts: ${body}`); - } + if (response.status === 429 || (response.status >= 500 && response.status < 600)) { + if (attempt >= maxRetries) { + const body = await response.text().catch(() => ""); + throw new FetchError(`HTTP ${response.status} after ${maxRetries + 1} attempts: ${body}`); + } - let delayMs = baseDelay * 2 ** attempt; - if (response.status === 429) { - const retryAfter = response.headers.get("Retry-After"); - if (retryAfter) { - const parsed = Number(retryAfter); - if (!Number.isNaN(parsed)) { - delayMs = parsed * 1000; - } + let delayMs = baseDelay * 2 ** attempt; + if (response.status === 429) { + const retryAfter = response.headers.get("Retry-After"); + if (retryAfter) { + const parsed = Number(retryAfter); + if (!Number.isNaN(parsed)) { + delayMs = parsed * 1000; } } - - log.warn( - { status: response.status, attempt: attempt + 1, delayMs }, - "Retrying after transient error", - ); - await new Promise((resolve) => setTimeout(resolve, delayMs)); - continue; } - return response; + log.warn( + { status: response.status, attempt: attempt + 1, delayMs }, + "Retrying after transient error", + ); + await new Promise((resolve) => setTimeout(resolve, delayMs)); + continue; } - // Unreachable, but satisfies TypeScript - throw new FetchError("fetchWithRetry: unexpected code path"); - } finally { - // no-op: TLS state is managed by the user's environment, not this function + return response; } + + // Unreachable, but satisfies TypeScript + throw new FetchError("fetchWithRetry: unexpected code path"); } diff --git 
a/src/connectors/index.ts b/src/connectors/index.ts index 4dcded4..09926f5 100644 --- a/src/connectors/index.ts +++ b/src/connectors/index.ts @@ -60,7 +60,11 @@ export function loadDbConnectorConfig( | { config_json: string } | undefined; if (!row) return undefined; - return JSON.parse(row.config_json) as ConnectorConfig; + try { + return JSON.parse(row.config_json) as ConnectorConfig; + } catch (err) { + throw new ConfigError(`Corrupted connector config for type "${type}"`, err); + } } /** Delete connector config from the database. */ @@ -102,7 +106,11 @@ export function loadNamedConnectorConfig(name: string): T { ); } const raw = readFileSync(filePath, "utf-8"); - return JSON.parse(raw) as T; + try { + return JSON.parse(raw) as T; + } catch (err) { + throw new ConfigError(`Corrupted connector config file for "${name}"`, err); + } } /** Check if a named connector config exists */ @@ -132,13 +140,19 @@ export function deleteConnectorDocuments(db: Database.Database, sourceType: stri for (const row of rows) { try { deleteChunksFts.run(row.id); - } catch { - // FTS table may not exist + } catch (err) { + getLogger().debug( + { err, documentId: row.id }, + "FTS table cleanup skipped (table may not exist)", + ); } try { deleteEmbeddings.run(row.id); - } catch { - // chunk_embeddings table may not exist + } catch (err) { + getLogger().debug( + { err, documentId: row.id }, + "chunk_embeddings cleanup skipped (table may not exist)", + ); } deleteChunks.run(row.id); deleteDoc.run(row.id); diff --git a/src/connectors/obsidian.ts b/src/connectors/obsidian.ts index 480470b..9739c5e 100644 --- a/src/connectors/obsidian.ts +++ b/src/connectors/obsidian.ts @@ -1,4 +1,5 @@ import { readdirSync, readFileSync, statSync } from "node:fs"; +import { load as yamlLoad } from "js-yaml"; import { join, relative, dirname, basename, extname, resolve } from "node:path"; import type Database from "better-sqlite3"; import type { EmbeddingProvider } from "../providers/embedding.js"; @@ -110,7 
+111,20 @@ export function parseObsidianMarkdown( if (fmMatch) { const fmBlock = fmMatch[1] ?? ""; body = content.slice((fmMatch[0] ?? "").length).trimStart(); - frontmatter = parseSimpleYaml(fmBlock); + try { + const parsed = yamlLoad(fmBlock); + if (parsed !== null && typeof parsed === "object" && !Array.isArray(parsed)) { + // js-yaml parses bare YAML date literals (e.g. 2024-01-15) as Date objects per YAML 1.1. + // Normalise them to ISO-8601 strings so downstream code always sees strings. + const normalised: Record = {}; + for (const [k, v] of Object.entries(parsed as Record)) { + normalised[k] = v instanceof Date ? v.toISOString().slice(0, 10) : v; + } + frontmatter = normalised; + } + } catch { + // Malformed frontmatter — leave frontmatter as empty object and continue + } } // Build vault file map for wikilink resolution @@ -184,61 +198,6 @@ export function parseObsidianMarkdown( return { frontmatter, body: body.trim(), tags, wikilinks }; } -function parseSimpleYaml(yaml: string): Record { - const result: Record = {}; - const lines = yaml.split("\n"); - let currentKey: string | undefined; - let listValues: string[] | undefined; - - for (const line of lines) { - // List item continuation - if (listValues !== undefined && /^\s+-\s+(.*)/.test(line)) { - const itemMatch = /^\s+-\s+(.*)/.exec(line); - if (itemMatch?.[1] !== undefined) { - listValues.push(itemMatch[1].trim()); - } - continue; - } - - // Flush any pending list - if (currentKey !== undefined && listValues !== undefined) { - result[currentKey] = listValues; - listValues = undefined; - currentKey = undefined; - } - - const colonIdx = line.indexOf(":"); - if (colonIdx < 1) continue; - const key = line.slice(0, colonIdx); - if (!/^[a-zA-Z_][a-zA-Z0-9_-]*$/.test(key)) continue; - const value = line.slice(colonIdx + 1).trim(); - - if (value === "" || value === "[]") { - // Could be start of a list - currentKey = key; - listValues = value === "[]" ? 
[] : []; - continue; - } - - // Inline list: [a, b, c] - if (value.startsWith("[") && value.endsWith("]")) { - const inner = value.slice(1, -1); - result[key] = inner.split(",").map((s) => s.trim().replace(/^['"]|['"]$/g, "")); - continue; - } - - // Simple scalar - result[key] = value.replace(/^['"]|['"]$/g, ""); - } - - // Flush trailing list - if (currentKey !== undefined && listValues !== undefined) { - result[currentKey] = listValues; - } - - return result; -} - function resolveEmbeds( body: string, vaultPath: string, diff --git a/src/connectors/slack.ts b/src/connectors/slack.ts index 043205b..6035283 100644 --- a/src/connectors/slack.ts +++ b/src/connectors/slack.ts @@ -499,13 +499,19 @@ export function disconnectSlack(db: Database.Database): number { for (const row of rows) { try { deleteChunksFts.run(row.id); - } catch { - // FTS table may not exist + } catch (err) { + getLogger().debug( + { err, documentId: row.id }, + "FTS table cleanup skipped (table may not exist)", + ); } try { deleteEmbeddings.run(row.id); - } catch { - // chunk_embeddings table may not exist + } catch (err) { + getLogger().debug( + { err, documentId: row.id }, + "chunk_embeddings cleanup skipped (table may not exist)", + ); } deleteChunks.run(row.id); deleteDoc.run(row.id); diff --git a/src/core/analytics.ts b/src/core/analytics.ts index a44ef94..ca97cb9 100644 --- a/src/core/analytics.ts +++ b/src/core/analytics.ts @@ -172,8 +172,8 @@ export function recordSearchQuery(db: Database.Database, entry: RecordSearchQuer db.prepare( "INSERT INTO search_queries (query, result_count, top_score, search_type) VALUES (?, ?, ?, ?)", ).run(entry.query, entry.resultCount, entry.topScore, entry.searchType); - } catch { - // silently ignore if table doesn't exist yet + } catch (err) { + getLogger().debug({ err }, "search_queries insert skipped (table may not exist yet)"); } } diff --git a/src/core/bulk.ts b/src/core/bulk.ts index 5bba9d8..1df1438 100644 --- a/src/core/bulk.ts +++ b/src/core/bulk.ts 
@@ -2,7 +2,12 @@ import type Database from "better-sqlite3"; import { getLogger } from "../logger.js"; import { ValidationError } from "../errors.js"; import { deleteDocument, listDocuments } from "./documents.js"; -import { addTagsToDocument, removeTagFromDocument, getDocumentTags } from "./tags.js"; +import { + addTagsToDocument, + removeTagFromDocument, + getDocumentTags, + getDocumentTagsBatch, +} from "./tags.js"; export interface BulkSelector { topicId?: string; @@ -42,71 +47,35 @@ export function resolveSelector( throw new ValidationError("Bulk selector must specify at least one filter criterion"); } + if (limit !== undefined && limit < 0) { + throw new ValidationError("limit must be a non-negative integer"); + } + const effectiveLimit = Math.max(0, Math.min(limit ?? MAX_BATCH_SIZE, MAX_BATCH_SIZE)); if (effectiveLimit === 0) { return []; } - // Use listDocuments for basic filters + // Push date filters into the SQL query so they apply before LIMIT const docs = listDocuments(db, { library: selector.library, topicId: selector.topicId, sourceType: selector.sourceType, + dateFrom: selector.dateFrom, + dateTo: selector.dateTo, limit: effectiveLimit, }); let ids = docs.map((d) => d.id); - // Build a Map for O(1) lookup instead of O(n) .find() per id - const docMap = new Map(docs.map((d) => [d.id, d])); - - // Apply date filters - if (selector.dateFrom) { - const from = selector.dateFrom; - ids = ids.filter((id) => { - const doc = docMap.get(id); - return doc != null && doc.createdAt >= from; - }); - } - if (selector.dateTo) { - const to = selector.dateTo; - ids = ids.filter((id) => { - const doc = docMap.get(id); - return doc != null && doc.createdAt <= to; - }); - } - - // Apply tag filter (AND logic — document must have ALL specified tags) + // Apply tag filter (AND logic — document must have ALL specified tags). + // Fetch all tags in a single query instead of one query per document. 
if (selector.tags && selector.tags.length > 0) { const requiredTags = selector.tags.map((t) => t.trim().toLowerCase()); - // Batch query: fetch tags for all candidate documents, chunked to respect SQLite parameter limits - const SQLITE_MAX_PARAMS = 999; - const tagRows: Array<{ document_id: string; name: string }> = []; - for (let i = 0; i < ids.length; i += SQLITE_MAX_PARAMS) { - const chunk = ids.slice(i, i + SQLITE_MAX_PARAMS); - const placeholders = chunk.map(() => "?").join(", "); - const rows = db - .prepare( - `SELECT dt.document_id, t.name - FROM tags t - JOIN document_tags dt ON dt.tag_id = t.id - WHERE dt.document_id IN (${placeholders})`, - ) - .all(...chunk) as Array<{ document_id: string; name: string }>; - tagRows.push(...rows); - } - const tagsByDoc = new Map(); - for (const row of tagRows) { - const existing = tagsByDoc.get(row.document_id); - if (existing) { - existing.push(row.name); - } else { - tagsByDoc.set(row.document_id, [row.name]); - } - } + const tagsByDoc = getDocumentTagsBatch(db, ids); ids = ids.filter((id) => { - const docTags = tagsByDoc.get(id) ?? []; + const docTags = (tagsByDoc.get(id) ?? 
[]).map((t) => t.name); return requiredTags.every((rt) => docTags.includes(rt)); }); } diff --git a/src/core/documents.ts b/src/core/documents.ts index 4670890..f6280c9 100644 --- a/src/core/documents.ts +++ b/src/core/documents.ts @@ -96,6 +96,8 @@ export function listDocuments( library?: string | undefined; topicId?: string | undefined; sourceType?: string | undefined; + dateFrom?: string | undefined; + dateTo?: string | undefined; limit?: number | undefined; }, ): Document[] { @@ -117,6 +119,14 @@ export function listDocuments( sql += " AND source_type = ?"; params.push(options.sourceType); } + if (options?.dateFrom) { + sql += " AND created_at >= ?"; + params.push(options.dateFrom); + } + if (options?.dateTo) { + sql += " AND created_at <= ?"; + params.push(options.dateTo); + } sql += " ORDER BY updated_at DESC LIMIT ?"; params.push(options?.limit ?? 50); @@ -199,6 +209,10 @@ export async function updateDocument( ? createHash("sha256").update(newContent).digest("hex") : existing.contentHash; + // Use JS Date so test fake-timers (vi.setSystemTime) can control the timestamp. + // SQLite's datetime('now') uses the OS clock and cannot be mocked in unit tests. + const updatedAt = new Date().toISOString().replace("T", " ").slice(0, 19); + if (contentChanged) { log.info({ docId: documentId }, "Content changed, re-chunking and re-indexing embeddings"); @@ -221,7 +235,7 @@ export async function updateDocument( db.prepare("DELETE FROM chunks WHERE document_id = ?").run(documentId); db.prepare( - `UPDATE documents SET title = ?, content = ?, library = ?, version = ?, url = ?, topic_id = ?, content_hash = ?, updated_at = datetime('now') WHERE id = ?`, + `UPDATE documents SET title = ?, content = ?, library = ?, version = ?, url = ?, topic_id = ?, content_hash = ?, updated_at = ? 
WHERE id = ?`, ).run( newTitle, newContent, @@ -230,6 +244,7 @@ export async function updateDocument( newUrl, newTopicId, contentHash, + updatedAt, documentId, ); @@ -260,8 +275,8 @@ export async function updateDocument( saveVersion(db, documentId); db.prepare( - `UPDATE documents SET title = ?, library = ?, version = ?, url = ?, topic_id = ?, updated_at = datetime('now') WHERE id = ?`, - ).run(newTitle, newLibrary, newVersion, newUrl, newTopicId, documentId); + `UPDATE documents SET title = ?, library = ?, version = ?, url = ?, topic_id = ?, updated_at = ? WHERE id = ?`, + ).run(newTitle, newLibrary, newVersion, newUrl, newTopicId, updatedAt, documentId); }); transaction(); diff --git a/src/core/export.ts b/src/core/export.ts index 02b7e35..24c94a5 100644 --- a/src/core/export.ts +++ b/src/core/export.ts @@ -39,8 +39,8 @@ export function exportKnowledgeBase(db: Database.Database, outputPath: string): webhooks = (db.prepare("SELECT * FROM webhooks").all() as Record[]).map( (w) => ({ ...w, secret: w.secret != null ? "[REDACTED]" : null }), ); - } catch { - // webhooks table may not exist + } catch (err) { + log.debug({ err }, "Webhooks table not present in export (table may not exist)"); } const data: ExportData = { diff --git a/src/core/indexing.ts b/src/core/indexing.ts index 9cfca2d..4adb514 100644 --- a/src/core/indexing.ts +++ b/src/core/indexing.ts @@ -253,9 +253,24 @@ export async function indexDocument( .prepare("SELECT id FROM documents WHERE title = ? AND LENGTH(content) = ?") .get(input.title, contentLength) as { id: string } | undefined; if (existingByContent) { - throw new ValidationError( - `Document with same title and content length already exists (id: ${existingByContent.id}). 
Delete it first or modify the content.`, - ); + if (input.dedup === "skip") { + log.info( + { existingDocId: existingByContent.id, title: input.title }, + "Duplicate by title+length detected, skipping", + ); + return { id: existingByContent.id, chunkCount: 0 }; + } + if (input.dedup === "warn") { + log.warn( + { existingDocId: existingByContent.id, title: input.title }, + "Duplicate by title+length detected, indexing anyway", + ); + // Continue indexing with a new ID + } else { + throw new ValidationError( + `Document with same title and content length already exists (id: ${existingByContent.id}). Delete it first or modify the content.`, + ); + } } const docId = randomUUID(); @@ -333,9 +348,11 @@ export async function indexDocument( } catch (err) { const message = err instanceof Error ? err.message : String(err); if (message.includes("no such table")) { - log.debug({ chunkId, err }, "Skipped vector insertion (sqlite-vec may not be loaded)"); + log.debug({ chunkId }, "Skipped vector insertion (sqlite-vec not loaded)"); } else { - log.warn({ chunkId, err }, "Failed to insert vector embedding"); + // Re-throw so the transaction rolls back — don't silently commit + // chunks that have no embedding (they would be invisible to semantic search). + throw err; } } } diff --git a/src/core/link-extractor.ts b/src/core/link-extractor.ts new file mode 100644 index 0000000..5c7c889 --- /dev/null +++ b/src/core/link-extractor.ts @@ -0,0 +1,159 @@ +/** + * Link extraction from HTML. + * Parses tags using fast indexOf-based parsing (no regex catastrophic backtracking). + * Resolves relative URLs, strips fragments, deduplicates, and filters to http/https only. + */ + +/** + * Extract all unique, normalized http/https links from an HTML string. + * + * @param html Raw HTML to parse. + * @param baseUrl The URL the HTML was fetched from — used to resolve relative hrefs. 
+ * @returns Deduplicated array of absolute http/https URLs (no fragments, trailing slashes + * on path roots normalized away). + */ +export function extractLinks(html: string, baseUrl: string): string[] { + const seen = new Set(); + const links: string[] = []; + + let pos = 0; + const lower = html.toLowerCase(); + + while (pos < html.length) { + // Find the next tag (next char must be space, >, or /) + const charAfterA = lower[tagStart + 2]; + if ( + charAfterA !== " " && + charAfterA !== "\t" && + charAfterA !== "\n" && + charAfterA !== "\r" && + charAfterA !== ">" && + charAfterA !== "/" + ) { + pos = tagStart + 2; + continue; + } + + // Find end of opening tag + const tagEnd = html.indexOf(">", tagStart); + if (tagEnd === -1) break; + + const tag = html.slice(tagStart, tagEnd + 1); + const href = extractHref(tag); + + if (href !== null) { + const resolved = resolveUrl(href, baseUrl); + if (resolved !== null && !seen.has(resolved)) { + seen.add(resolved); + links.push(resolved); + } + } + + pos = tagEnd + 1; + } + + return links; +} + +/** + * Extract the href attribute value from an tag string. + * Returns null if no href found or href is empty. + */ +function extractHref(tag: string): string | null { + const lowerTag = tag.toLowerCase(); + let searchPos = 0; + + while (searchPos < lowerTag.length) { + const hrefIdx = lowerTag.indexOf("href", searchPos); + if (hrefIdx === -1) return null; + + // Require an attribute boundary before "href" to avoid matching data-href, aria-href, etc. + // The character immediately preceding "href" must be whitespace (or it's at position 0, + // which can't happen in a valid tag and so we skip it). + const charBefore = hrefIdx > 0 ? 
lowerTag[hrefIdx - 1] : ""; + if (charBefore !== " " && charBefore !== "\t" && charBefore !== "\n" && charBefore !== "\r") { + searchPos = hrefIdx + 4; + continue; + } + + // Skip whitespace before = + let eqIdx = hrefIdx + 4; + while (eqIdx < tag.length && (tag[eqIdx] === " " || tag[eqIdx] === "\t")) eqIdx++; + + if (tag[eqIdx] !== "=") { + searchPos = hrefIdx + 4; + continue; + } + + // Skip whitespace after = + let valStart = eqIdx + 1; + while (valStart < tag.length && (tag[valStart] === " " || tag[valStart] === "\t")) valStart++; + + if (valStart >= tag.length) return null; + + let href: string; + const quote = tag[valStart]; + if (quote === '"' || quote === "'") { + const closeQuote = tag.indexOf(quote, valStart + 1); + if (closeQuote === -1) return null; + href = tag.slice(valStart + 1, closeQuote); + } else { + // Unquoted attribute value — ends at whitespace or > + let end = valStart; + while ( + end < tag.length && + tag[end] !== " " && + tag[end] !== "\t" && + tag[end] !== ">" && + tag[end] !== "\n" + ) { + end++; + } + href = tag.slice(valStart, end); + } + + href = href.trim(); + return href.length > 0 ? href : null; + } + + return null; +} + +/** + * Resolve a potentially-relative href against a base URL. + * Returns null if the result is not an http/https URL (e.g. mailto:, javascript:, data:, #fragment-only). + */ +function resolveUrl(href: string, baseUrl: string): string | null { + // Skip fragment-only links immediately — they point to the same page + if (href.startsWith("#")) return null; + + let resolved: URL; + try { + resolved = new URL(href, baseUrl); + } catch { + return null; + } + + // Allowlist: only permit http and https. + // This rejects javascript:, vbscript:, data:, mailto:, ftp:, file:, and + // any other non-http scheme without needing an enumerated blocklist. 
+ if (resolved.protocol !== "http:" && resolved.protocol !== "https:") { + return null; + } + + // Strip fragment + resolved.hash = ""; + + // Normalize: remove trailing slash from non-root paths + // e.g. https://example.com/docs/ → https://example.com/docs + // but https://example.com/ stays as https://example.com/ + if (resolved.pathname.length > 1 && resolved.pathname.endsWith("/")) { + resolved.pathname = resolved.pathname.slice(0, -1); + } + + return resolved.href; +} diff --git a/src/core/packs.ts b/src/core/packs.ts index 26642e9..45aeca1 100644 --- a/src/core/packs.ts +++ b/src/core/packs.ts @@ -1,10 +1,23 @@ import type Database from "better-sqlite3"; -import { readFileSync, writeFileSync } from "node:fs"; -import { resolve as pathResolve, isAbsolute as pathIsAbsolute } from "node:path"; +import { randomUUID, createHash } from "node:crypto"; +import { readFileSync, writeFileSync, readdirSync, statSync } from "node:fs"; +import { pathToFileURL } from "node:url"; +import { + resolve as pathResolve, + isAbsolute as pathIsAbsolute, + basename, + relative, + join as pathJoin, + extname, +} from "node:path"; +import { gzipSync, gunzipSync } from "node:zlib"; import type { EmbeddingProvider } from "../providers/embedding.js"; import { ValidationError, FetchError } from "../errors.js"; import { getLogger } from "../logger.js"; -import { indexDocument } from "./indexing.js"; +import { chunkContent, chunkContentStreaming, STREAMING_THRESHOLD } from "./indexing.js"; +import { getParserForFile, getSupportedExtensions } from "./parsers/index.js"; +import { suggestTagsFromText } from "./tags.js"; +import { fetchAndConvert } from "./url-fetcher.js"; export interface PackDocument { title: string; @@ -45,6 +58,19 @@ export interface InstallResult { packName: string; documentsInstalled: number; alreadyInstalled: boolean; + errors: number; +} + +export interface InstallOptions { + registryUrl?: string | undefined; + /** Number of documents to embed and insert per batch. 
Default: 10. */ + batchSize?: number | undefined; + /** Skip the first N documents (for resuming a partial install). Default: 0. */ + resumeFrom?: number | undefined; + /** Maximum number of batches to embed concurrently. Default: 4. */ + concurrency?: number | undefined; + /** Called after each batch of documents is processed. */ + onProgress?: ((current: number, total: number, docTitle: string) => void) | undefined; } export interface CreatePackOptions { @@ -57,8 +83,55 @@ export interface CreatePackOptions { outputPath?: string | undefined; } +export interface CreatePackFromSourceOptions { + /** Pack name (required). */ + name: string; + /** One or more source paths (directories or files) or URLs. */ + from: string[]; + version?: string | undefined; + description?: string | undefined; + author?: string | undefined; + license?: string | undefined; + outputPath?: string | undefined; + /** Only include files with these extensions (e.g. [".md", ".html"]). Defaults to all supported. */ + extensions?: string[] | undefined; + /** Glob-style patterns to exclude (matched against the relative path from the source root). */ + exclude?: string[] | undefined; + /** Walk directories recursively (default: true). */ + recursive?: boolean | undefined; + /** Called for each file processed, for progress reporting. */ + onProgress?: ((info: { file: string; index: number; total: number }) => void) | undefined; +} + const DEFAULT_REGISTRY_URL = "https://raw.githubusercontent.com/libscope/packs/main/registry.json"; +/** Gzip magic number: first two bytes of a gzip stream. */ +const GZIP_MAGIC = Buffer.from([0x1f, 0x8b]); + +/** Check if a filename indicates gzip compression (.gz or .json.gz). */ +function isGzipPath(filePath: string): boolean { + return filePath.endsWith(".gz"); +} + +/** Write a pack to disk, gzip-compressing if the path ends in .gz. 
*/ +function writePackFile(filePath: string, pack: KnowledgePack): void { + const json = JSON.stringify(pack, null, 2); + if (isGzipPath(filePath)) { + writeFileSync(filePath, gzipSync(Buffer.from(json, "utf-8"))); + } else { + writeFileSync(filePath, json, "utf-8"); + } +} + +/** Read a pack file, auto-detecting gzip by magic bytes or extension. */ +function readPackFile(filePath: string): string { + const raw = readFileSync(filePath); + if (raw.length >= 2 && raw[0] === GZIP_MAGIC[0] && raw[1] === GZIP_MAGIC[1]) { + return gunzipSync(raw).toString("utf-8"); + } + return raw.toString("utf-8"); +} + /** Validate that a registry URL uses https and is not a private IP. */ function validateRegistryUrl(url: string): void { let parsed: URL; @@ -190,20 +263,20 @@ export async function installPack( db: Database.Database, provider: EmbeddingProvider, packNameOrPath: string, - options?: { registryUrl?: string | undefined }, + options?: InstallOptions, ): Promise { const log = getLogger(); let pack: KnowledgePack; - // Try loading as a local file first - if (packNameOrPath.endsWith(".json")) { + // Try loading as a local file first (supports .json and .json.gz) + if (packNameOrPath.endsWith(".json") || packNameOrPath.endsWith(".json.gz")) { const resolved = pathResolve(packNameOrPath); // Prevent path traversal: if a relative path is given, ensure it resolves within CWD if (!pathIsAbsolute(packNameOrPath) && !resolved.startsWith(process.cwd())) { throw new ValidationError("Pack file path must be within the current working directory"); } try { - const raw = readFileSync(resolved, "utf-8"); + const raw = readPackFile(resolved); const parsed: unknown = JSON.parse(raw); pack = validatePack(parsed); } catch (err) { @@ -215,6 +288,7 @@ export async function installPack( } else { // Fetch from registry const registryUrl = options?.registryUrl ?? 
DEFAULT_REGISTRY_URL; + validateRegistryUrl(registryUrl); const baseUrl = registryUrl.replace(/\/[^/]+$/, ""); const packUrl = `${baseUrl}/${packNameOrPath}.json`; @@ -241,10 +315,36 @@ export async function installPack( if (existing) { log.info({ pack: pack.name }, "Pack already installed"); - return { packName: pack.name, documentsInstalled: 0, alreadyInstalled: true }; + return { packName: pack.name, documentsInstalled: 0, alreadyInstalled: true, errors: 0 }; } - log.info({ pack: pack.name, docCount: pack.documents.length }, "Installing pack"); + const batchSize = options?.batchSize ?? 10; + const concurrency = options?.concurrency ?? 4; + const resumeFrom = options?.resumeFrom ?? 0; + const onProgress = options?.onProgress; + const total = pack.documents.length; + + if (!Number.isInteger(batchSize) || batchSize <= 0) { + throw new ValidationError("batchSize must be a positive integer"); + } + if (!Number.isInteger(concurrency) || concurrency <= 0) { + throw new ValidationError("concurrency must be a positive integer"); + } + if (!Number.isInteger(resumeFrom) || resumeFrom < 0) { + throw new ValidationError("resumeFrom must be a non-negative integer"); + } + if (resumeFrom > total) { + throw new ValidationError( + "resumeFrom cannot be greater than the total number of documents in the pack", + ); + } + + const docs = resumeFrom > 0 ? 
pack.documents.slice(resumeFrom) : pack.documents; + + log.info( + { pack: pack.name, docCount: total, batchSize, concurrency, resumeFrom }, + "Installing pack", + ); // Insert the pack record first (documents.pack_name has FK to packs.name) db.prepare("INSERT INTO packs (name, version, description, doc_count) VALUES (?, ?, ?, 0)").run( @@ -253,34 +353,198 @@ export async function installPack( pack.description, ); - let installed = 0; - for (const doc of pack.documents) { - try { - const result = await indexDocument(db, provider, { - title: doc.title, - content: doc.content, - sourceType: "library", - url: doc.source || undefined, - submittedBy: "manual", - dedup: "warn", + // Prepare statements once (reused across all batches) + const insertDoc = db.prepare(` + INSERT INTO documents (id, source_type, title, content, url, submitted_by, content_hash, pack_name) + VALUES (?, ?, ?, ?, ?, ?, ?, ?) + `); + const insertChunk = db.prepare(` + INSERT INTO chunks (id, document_id, content, chunk_index) + VALUES (?, ?, ?, ?) + `); + const insertEmbedding = db.prepare(` + INSERT INTO chunk_embeddings (chunk_id, embedding) + VALUES (?, ?) + `); + + // Slice docs into batches by index only — chunks are computed lazily when each + // batch is scheduled, so we never hold all chunks in memory simultaneously. + type DocChunkInfo = { + doc: PackDocument; + docId: string; + contentHash: string; + chunks: string[]; + chunkOffset: number; // offset into allChunks for this batch + }; + type BatchData = { + batchDocs: PackDocument[]; + }; + type ResolvedBatch = { + docInfos: DocChunkInfo[]; + allChunks: string[]; + }; + + const batches: BatchData[] = []; + for (let batchStart = 0; batchStart < docs.length; batchStart += batchSize) { + batches.push({ batchDocs: docs.slice(batchStart, batchStart + batchSize) }); + } + + /** Chunk a batch's documents on demand, right before embedding. 
*/ + function resolveBatch(batch: BatchData): ResolvedBatch { + const docInfos: DocChunkInfo[] = []; + const allChunks: string[] = []; + for (const doc of batch.batchDocs) { + const contentHash = createHash("sha256").update(doc.content).digest("hex"); + const useStreaming = doc.content.length > STREAMING_THRESHOLD; + const chunks = useStreaming ? chunkContentStreaming(doc.content) : chunkContent(doc.content); + docInfos.push({ + doc, + docId: randomUUID(), + contentHash, + chunks, + chunkOffset: allChunks.length, }); + allChunks.push(...chunks); + } + return { docInfos, allChunks }; + } - // Tag the document with the pack name - db.prepare("UPDATE documents SET pack_name = ? WHERE id = ?").run(pack.name, result.id); - installed++; - } catch (err) { - log.warn( - { err, title: doc.title, pack: pack.name }, - "Failed to index pack document, skipping", + // Phase 2 & 3: Embed batches concurrently (up to `concurrency` at a time), + // inserting each batch into the DB in order as embeddings complete. + // This maximises provider throughput while keeping inserts serialised (SQLite requirement). + let installed = 0; + let errors = 0; + let processedCount = resumeFrom; + + type EmbedResult = { resolved: ResolvedBatch; embeddings: number[][]; success: boolean }; + const embedResults: Array = Array.from({ + length: batches.length, + }); + let nextInsertIdx = 0; + + /** Insert completed batches in index order, advancing nextInsertIdx. 
*/ + function flushInserts(): void { + while (nextInsertIdx < batches.length && embedResults[nextInsertIdx] !== undefined) { + const i = nextInsertIdx++; + const { resolved: batch, embeddings, success } = embedResults[i]!; + const result = { embeddings, success }; + + if (!result.success) { + errors += batch.docInfos.length; + } else { + let batchInstalled = 0; + const doInsert = db.transaction(() => { + batchInstalled = 0; + for (const info of batch.docInfos) { + insertDoc.run( + info.docId, + "library", + info.doc.title, + info.doc.content, + info.doc.source || null, + "manual", + info.contentHash, + pack.name, + ); + for (let j = 0; j < info.chunks.length; j++) { + const chunkId = randomUUID(); + const chunkText = info.chunks[j] ?? ""; + const embedding = result.embeddings[info.chunkOffset + j] ?? []; + insertChunk.run(chunkId, info.docId, chunkText, j); + try { + const vecBuffer = Buffer.from(new Float32Array(embedding).buffer); + insertEmbedding.run(chunkId, vecBuffer); + } catch (err) { + const message = err instanceof Error ? err.message : String(err); + if (!message.includes("no such table")) { + log.warn({ chunkId, err }, "Failed to insert vector embedding"); + } + } + } + batchInstalled++; + } + }); + try { + doInsert(); + installed += batchInstalled; + } catch (err) { + log.warn( + { err, pack: pack.name, batchIndex: i }, + "Transaction failed for batch, skipping these documents", + ); + errors += batch.docInfos.length; + } + } + + processedCount += batch.docInfos.length; + onProgress?.( + processedCount, + total, + batch.docInfos[batch.docInfos.length - 1]?.doc.title ?? "", ); } } + // Semaphore-based concurrent embedding: up to `concurrency` embedBatch calls in flight at once. 
+ await new Promise((resolve) => { + if (batches.length === 0) { + resolve(); + return; + } + + let activeCount = 0; + let scheduleIdx = 0; + + function scheduleNext(): void { + while (activeCount < concurrency && scheduleIdx < batches.length) { + const i = scheduleIdx++; + const resolved = resolveBatch(batches[i]!); + activeCount++; + + // Wrap in try/catch so synchronous throws from embedBatch don't leave + // the surrounding Promise permanently pending. + let embedPromise: Promise; + if (resolved.allChunks.length > 0) { + try { + embedPromise = provider.embedBatch(resolved.allChunks); + } catch (err) { + embedPromise = Promise.reject(err instanceof Error ? err : new Error(String(err))); + } + } else { + embedPromise = Promise.resolve([] as number[][]); + } + + embedPromise + .then((embeddings) => { + embedResults[i] = { resolved, embeddings, success: true }; + }) + .catch((err) => { + log.warn( + { err, pack: pack.name, batchIndex: i }, + "Failed to embed batch, skipping these documents", + ); + embedResults[i] = { resolved, embeddings: [], success: false }; + }) + .finally(() => { + activeCount--; + flushInserts(); + if (scheduleIdx < batches.length) { + scheduleNext(); + } else if (activeCount === 0) { + resolve(); + } + }); + } + } + + scheduleNext(); + }); + // Update doc count db.prepare("UPDATE packs SET doc_count = ? WHERE name = ?").run(installed, pack.name); - log.info({ pack: pack.name, installed }, "Pack installed"); - return { packName: pack.name, documentsInstalled: installed, alreadyInstalled: false }; + log.info({ pack: pack.name, installed, errors }, "Pack installed"); + return { packName: pack.name, documentsInstalled: installed, alreadyInstalled: false, errors }; } /** Remove a pack and all its associated documents. 
*/ @@ -305,8 +569,11 @@ export function removePack(db: Database.Database, packName: string): void { db.prepare( "DELETE FROM chunk_embeddings WHERE chunk_id IN (SELECT id FROM chunks WHERE document_id = ?)", ).run(id); - } catch { - // chunk_embeddings table may not exist + } catch (err) { + log.debug( + { err, documentId: id }, + "chunk_embeddings cleanup skipped (table may not exist)", + ); } db.prepare("DELETE FROM documents WHERE id = ?").run(id); } @@ -387,9 +654,235 @@ export function createPack(db: Database.Database, options: CreatePackOptions): K }; if (options.outputPath) { - writeFileSync(options.outputPath, JSON.stringify(pack, null, 2), "utf-8"); + writePackFile(options.outputPath, pack); log.info({ outputPath: options.outputPath, docCount: documents.length }, "Pack file created"); } return pack; } + +// --------------------------------------------------------------------------- +// Create pack from filesystem / URL sources (no database required) +// --------------------------------------------------------------------------- + +/** Simple glob-style pattern matching (supports * and ** wildcards). */ +function matchesExcludePattern(relativePath: string, pattern: string): boolean { + // Escape regex special chars except * and ** + const escaped = pattern + .replace(/[.+^${}()|[\]\\]/g, "\\$&") + .replace(/\*\*/g, "\0") + .replace(/\*/g, "[^/]*") + .replace(/\0/g, ".*"); + return new RegExp(`^${escaped}$`).test(relativePath); +} + +/** Recursively collect files from a directory. */ +function collectFiles( + dir: string, + rootDir: string, + recursive: boolean, + extensions: Set, + excludePatterns: string[], +): string[] { + const results: string[] = []; + let entries: string[]; + try { + entries = readdirSync(dir); + } catch (err) { + throw new ValidationError( + `Cannot read directory "${dir}": ${err instanceof Error ? 
err.message : String(err)}`, + ); + } + + for (const entry of entries) { + const fullPath = pathJoin(dir, entry); + const rel = relative(rootDir, fullPath); + + // Check exclude patterns + if (excludePatterns.some((p) => matchesExcludePattern(rel, p))) { + continue; + } + + let stat; + try { + stat = statSync(fullPath); + } catch { + continue; // Skip unreadable entries + } + + if (stat.isDirectory()) { + if (recursive) { + results.push(...collectFiles(fullPath, rootDir, recursive, extensions, excludePatterns)); + } + } else if (stat.isFile()) { + const ext = extname(fullPath).toLowerCase(); + if (extensions.has(ext)) { + results.push(fullPath); + } + } + } + + return results; +} + +function isUrl(value: string): boolean { + return value.startsWith("http://") || value.startsWith("https://"); +} + +/** Create a pack directly from filesystem paths and/or URLs (no database needed). */ +export async function createPackFromSource( + options: CreatePackFromSourceOptions, +): Promise { + const log = getLogger(); + + if (!options.name.trim()) { + throw new ValidationError("Pack name is required"); + } + if (options.from.length === 0) { + throw new ValidationError("At least one --from source is required"); + } + + const allSupported = getSupportedExtensions(); + const extensions = new Set( + options.extensions?.map((e) => (e.startsWith(".") ? e.toLowerCase() : `.${e.toLowerCase()}`)) ?? + allSupported, + ); + const excludePatterns = options.exclude ?? []; + const recursive = options.recursive ?? 
true; + + const documents: PackDocument[] = []; + const errors: Array<{ source: string; error: string }> = []; + + // Separate URLs from file paths + const urls: string[] = []; + const fileSources: string[] = []; + for (const src of options.from) { + if (isUrl(src)) { + urls.push(src); + } else { + fileSources.push(src); + } + } + + // Collect all files from filesystem sources + const allFiles: string[] = []; + for (const src of fileSources) { + const resolved = pathResolve(src); + let stat; + try { + stat = statSync(resolved); + } catch (err) { + throw new ValidationError( + `Source path "${src}" does not exist or is not accessible: ${err instanceof Error ? err.message : String(err)}`, + ); + } + + if (stat.isDirectory()) { + allFiles.push(...collectFiles(resolved, resolved, recursive, extensions, excludePatterns)); + } else if (stat.isFile()) { + allFiles.push(resolved); + } else { + throw new ValidationError(`Source path "${src}" is not a file or directory`); + } + } + + // Parse filesystem files + const totalCount = allFiles.length + urls.length; + for (let i = 0; i < allFiles.length; i++) { + const filePath = allFiles[i]!; + options.onProgress?.({ file: filePath, index: i, total: totalCount }); + + const parser = getParserForFile(filePath); + if (!parser) { + log.debug({ file: filePath }, "No parser for file, skipping"); + continue; + } + + try { + const buffer = readFileSync(filePath); + const content = await parser.parse(buffer); + const trimmed = content.trimEnd(); + if (trimmed.length === 0) { + log.debug({ file: filePath }, "Empty content after parsing, skipping"); + continue; + } + + const title = basename(filePath).replace(/\.[^.]+$/, ""); + const tags = suggestTagsFromText(title, trimmed); + documents.push({ + title, + content: trimmed, + source: pathToFileURL(filePath).href, + tags, + }); + } catch (err) { + const msg = err instanceof Error ? 
err.message : String(err); + log.warn({ file: filePath, err: msg }, "Failed to parse file, skipping"); + errors.push({ source: filePath, error: msg }); + } + } + + // Fetch URLs + for (let i = 0; i < urls.length; i++) { + const url = urls[i]!; + options.onProgress?.({ file: url, index: allFiles.length + i, total: totalCount }); + + try { + const fetched = await fetchAndConvert(url); + if (!fetched.content.trim()) { + log.debug({ url }, "Empty content from URL, skipping"); + continue; + } + + const tags = suggestTagsFromText(fetched.title, fetched.content.trimEnd()); + documents.push({ + title: fetched.title, + content: fetched.content.trimEnd(), + source: url, + tags, + }); + } catch (err) { + const msg = err instanceof Error ? err.message : String(err); + log.warn({ url, err: msg }, "Failed to fetch URL, skipping"); + errors.push({ source: url, error: msg }); + } + } + + if (documents.length === 0) { + const detail = + errors.length > 0 + ? ` (${errors.length} source(s) failed: ${errors.map((e) => e.source).join(", ")})` + : ""; + throw new ValidationError(`No documents could be created from the provided sources${detail}`); + } + + if (errors.length > 0) { + log.warn({ errorCount: errors.length, errors }, "Some sources failed during pack creation"); + } + + const pack: KnowledgePack = { + name: options.name, + version: options.version ?? "1.0.0", + description: options.description ?? `Knowledge pack: ${options.name}`, + documents, + metadata: { + author: options.author ?? "libscope", + license: options.license ?? 
"MIT", + createdAt: new Date().toISOString(), + }, + }; + + if (options.outputPath) { + writePackFile(options.outputPath, pack); + log.info( + { outputPath: options.outputPath, docCount: documents.length }, + "Pack file created from source", + ); + } + + log.info( + { name: pack.name, docCount: documents.length, errorCount: errors.length }, + "Pack created from source", + ); + return pack; +} diff --git a/src/core/parsers/html.ts b/src/core/parsers/html.ts new file mode 100644 index 0000000..1e9d4a1 --- /dev/null +++ b/src/core/parsers/html.ts @@ -0,0 +1,23 @@ +import { NodeHtmlMarkdown } from "node-html-markdown"; +import { ValidationError } from "../../errors.js"; +import type { DocumentParser } from "./index.js"; + +const nhm = new NodeHtmlMarkdown({ ignore: ["script", "style", "nav"] }); + +/** Parser for HTML files — converts to Markdown via node-html-markdown. */ +export class HtmlParser implements DocumentParser { + readonly extensions = [".html", ".htm"]; + + parse(content: Buffer): Promise<string> { + try { + const html = content.toString("utf-8"); + const markdown = nhm.translate(html); + + // Collapse excessive blank lines left by ignored elements + return Promise.resolve(markdown.replace(/\n{3,}/g, "\n\n").trimEnd()); + } catch (err: unknown) { + const message = err instanceof Error ? err.message : "Unknown HTML parsing error"; + throw new ValidationError(`Failed to parse HTML: ${message}`); + } + } +} diff --git a/src/core/parsers/index.ts b/src/core/parsers/index.ts index 35e43ff..1151e25 100644 --- a/src/core/parsers/index.ts +++ b/src/core/parsers/index.ts @@ -6,6 +6,7 @@ import { YamlParser } from "./yaml.js"; import { CsvParser } from "./csv.js"; import { PdfParser } from "./pdf.js"; import { WordParser } from "./word.js"; +import { HtmlParser } from "./html.js"; /** Interface for document format parsers. 
*/ export interface DocumentParser { @@ -23,6 +24,7 @@ const parsers: DocumentParser[] = [ new CsvParser(), new PdfParser(), new WordParser(), + new HtmlParser(), ]; const extensionMap = new Map(); diff --git a/src/core/rag.ts b/src/core/rag.ts index ecc7ed7..3545d87 100644 --- a/src/core/rag.ts +++ b/src/core/rag.ts @@ -67,12 +67,17 @@ export function extractSources(results: SearchResult[]): RagSource[] { } interface LlmConfig { - provider?: "openai" | "ollama"; + provider?: "openai" | "ollama" | "passthrough"; model?: string; ollamaUrl?: string; openaiApiKey?: string; } +/** Returns true if the config is set to passthrough mode (delegate synthesis to the calling LLM). */ +export function isPassthroughMode(config: LibScopeConfig): boolean { + return config.llm?.provider === "passthrough"; +} + /** Create an LLM provider from config. */ export function createLlmProvider(config: LibScopeConfig): LlmProvider { const llmConfig: LlmConfig | undefined = config.llm; @@ -86,11 +91,40 @@ export function createLlmProvider(config: LibScopeConfig): LlmProvider { } throw new ConfigError( - "No LLM provider configured. Set llm.provider to 'openai' or 'ollama' in your config, " + + "No LLM provider configured. Set llm.provider to 'openai', 'ollama', or 'passthrough' in your config, " + "or set LIBSCOPE_LLM_PROVIDER environment variable.", ); } +export interface PassthroughResult { + contextPrompt: string; + sources: RagSource[]; +} + +/** + * Retrieve relevant chunks and return the formatted context prompt without calling an LLM. + * Used in passthrough mode so the calling LLM can synthesize the answer itself. + */ +export async function getContextForQuestion( + db: Database.Database, + embeddingProvider: EmbeddingProvider, + options: RagOptions, +): Promise<PassthroughResult> { + const topK = options.topK ?? 
5; + + const { results } = await searchDocuments(db, embeddingProvider, { + query: options.question, + topic: options.topic, + library: options.library, + limit: topK, + }); + + return { + contextPrompt: buildContextPrompt(options.question, results), + sources: extractSources(results), + }; +} + function createOpenAiProvider( embedding: LibScopeConfig["embedding"], llmConfig: LlmConfig | undefined, diff --git a/src/core/reindex.ts b/src/core/reindex.ts index 5ffc55a..08a9ee7 100644 --- a/src/core/reindex.ts +++ b/src/core/reindex.ts @@ -2,6 +2,7 @@ import type Database from "better-sqlite3"; import type { EmbeddingProvider } from "../providers/embedding.js"; import { DatabaseError } from "../errors.js"; import { getLogger } from "../logger.js"; +import { createVectorTable } from "../db/schema.js"; export interface ReindexOptions { /** Only reindex chunks belonging to these document IDs. */ @@ -59,16 +60,12 @@ export async function reindex( log.info({ total }, "Chunks to reindex"); - // Ensure the vector table exists for the current provider dimensions + // Ensure the vector table exists with the correct dimensions for this provider. + // Delegates to schema.createVectorTable() — single source of truth for the DDL. 
try { - db.exec(` - CREATE VIRTUAL TABLE IF NOT EXISTS chunk_embeddings USING vec0( - chunk_id TEXT PRIMARY KEY, - embedding float[${provider.dimensions}] - ); - `); - } catch (err: unknown) { - log.warn({ err }, "Could not ensure vector table — continuing anyway"); + createVectorTable(db, provider.dimensions); + } catch { + log.warn("Could not ensure vector table — continuing anyway"); } const deleteStmt = db.prepare("DELETE FROM chunk_embeddings WHERE chunk_id = ?"); diff --git a/src/core/saved-searches.ts b/src/core/saved-searches.ts index 8a10dd3..42b8ddf 100644 --- a/src/core/saved-searches.ts +++ b/src/core/saved-searches.ts @@ -4,6 +4,7 @@ import type { EmbeddingProvider } from "../providers/embedding.js"; import { searchDocuments } from "./search.js"; import type { SearchOptions, SearchResult } from "./search.js"; import { ValidationError, DocumentNotFoundError } from "../errors.js"; +import { getLogger } from "../logger.js"; export interface SavedSearch { id: string; @@ -28,7 +29,11 @@ interface SavedSearchRow { function rowToSavedSearch(row: SavedSearchRow): SavedSearch { let filters: Omit | null = null; if (row.filters) { - filters = JSON.parse(row.filters) as Omit; + try { + filters = JSON.parse(row.filters) as Omit; + } catch { + getLogger().warn({ id: row.id }, "Failed to parse saved search filters JSON; using null"); + } } return { id: row.id, diff --git a/src/core/search.ts b/src/core/search.ts index 4acb086..39540ed 100644 --- a/src/core/search.ts +++ b/src/core/search.ts @@ -431,7 +431,7 @@ function keywordSearch( params.push(options.version); } if (options.minRating) { - sql += " AND (SELECT AVG(r.rating) FROM ratings r WHERE r.document_id = d.id) >= ?"; + sql += " AND avg_r.avg_rating >= ?"; params.push(options.minRating); } if (options.dateFrom) { @@ -553,7 +553,7 @@ function fts5Search( params.push(options.version); } if (options.minRating) { - sql += " AND (SELECT AVG(r.rating) FROM ratings r WHERE r.document_id = d.id) >= ?"; + sql += " AND 
avg_r.avg_rating >= ?"; params.push(options.minRating); } if (options.dateFrom) { diff --git a/src/core/spider.ts b/src/core/spider.ts new file mode 100644 index 0000000..44dd6a1 --- /dev/null +++ b/src/core/spider.ts @@ -0,0 +1,478 @@ +/** + * URL spider engine — BFS crawl from a seed URL with configurable depth, page, domain, and path limits. + * + * Safety guarantees: + * - All URLs are SSRF-validated via fetchRaw() before fetching + * - Hard caps on pages (200) and depth (5) that cannot be overridden by callers + * - Total wall-clock timeout of 10 minutes aborts the crawl + * - robots.txt is fetched once per origin and its Disallow rules are honoured + * - Private/internal IPs are blocked by the underlying url-fetcher + */ + +import { getLogger } from "../logger.js"; +import { FetchError } from "../errors.js"; +import { fetchRaw, type FetchOptions } from "./url-fetcher.js"; +import { extractLinks } from "./link-extractor.js"; +import { NodeHtmlMarkdown } from "node-html-markdown"; + +// ── Hard limits that callers cannot override ──────────────────────────────── + +const HARD_MAX_PAGES = 200; +const HARD_MAX_DEPTH = 5; +/** Total spider wall-clock timeout in ms (10 minutes). */ +const HARD_TOTAL_TIMEOUT_MS = 10 * 60 * 1000; +/** Default delay between requests in ms (1 second). */ +const DEFAULT_REQUEST_DELAY_MS = 1_000; + +// ── Public types ───────────────────────────────────────────────────────────── + +export interface SpiderOptions { + /** Maximum total pages to index (default: 25, hard cap: 200). */ + maxPages?: number; + /** Maximum hop depth from the seed URL (default: 2, hard cap: 5). 0 = seed only. */ + maxDepth?: number; + /** Only follow links that share the same hostname as the seed (default: true). */ + sameDomain?: boolean; + /** Only follow links whose path starts with this prefix (e.g. "/docs/"). */ + pathPrefix?: string; + /** Glob-style patterns for URLs to skip (matched against full URL string). 
*/ + excludePatterns?: string[]; + /** Milliseconds to wait between requests (default: 1000). */ + requestDelay?: number; + /** Passed through to fetchRaw for each page request. */ + fetchOptions?: Pick< + FetchOptions, + "allowPrivateUrls" | "allowSelfSignedCerts" | "timeout" | "maxBodySize" + >; +} + +export interface SpiderResult { + url: string; + title: string; + content: string; + depth: number; +} + +export interface SpiderStats { + /** Pages successfully fetched and yielded to the caller (caller decides whether to index). */ + pagesFetched: number; + pagesCrawled: number; + pagesSkipped: number; + errors: Array<{ url: string; error: string }>; + abortReason?: "maxPages" | "timeout"; +} + +// ── robots.txt parsing ─────────────────────────────────────────────────────── + +/** Fetch robots.txt for an origin, capping the timeout at 10 s regardless of caller options. */ +async function fetchRobotsTxt( + origin: string, + fetchOptions?: SpiderOptions["fetchOptions"], +): Promise<Set<string>> { + const robotsUrl = origin + "/robots.txt"; + // Cap robots.txt timeout: use caller's timeout only if shorter than our hard cap. + const effectiveTimeout = + fetchOptions?.timeout !== undefined && Number.isFinite(fetchOptions.timeout) + ? Math.min(fetchOptions.timeout, 10_000) + : 10_000; + try { + const raw = await fetchRaw(robotsUrl, { ...fetchOptions, timeout: effectiveTimeout }); + return parseRobotsTxt(raw.body); + } catch { + // robots.txt missing or inaccessible — no restrictions + return new Set(); + } +} + +/** + * Parse robots.txt and return Disallow path prefixes that apply to our agent. + * + * Implements proper UA precedence: if any group explicitly names "libscope", + * only those groups apply (ignoring wildcard). Otherwise wildcard groups apply. + * This matches the robots.txt spec — a specific UA rule overrides the wildcard. 
+ */ +function parseRobotsTxt(text: string): Set<string> { + type RobotsGroup = { agents: string[]; disallows: string[] }; + const groups: RobotsGroup[] = []; + let current: RobotsGroup | null = null; + + for (const raw of text.split(/\r?\n/)) { + const line = raw.trim(); + if (line.startsWith("#") || line.length === 0) continue; + + const lower = line.toLowerCase(); + if (lower.startsWith("user-agent:")) { + const agent = line.slice("user-agent:".length).trim(); + // Start a new group only if current has already collected Disallow lines + if (current === null || current.disallows.length > 0) { + current = { agents: [], disallows: [] }; + groups.push(current); + } + current.agents.push(agent.toLowerCase()); + } else if (lower.startsWith("disallow:") && current !== null) { + const path = line.slice("disallow:".length).trim(); + if (path.length > 0) current.disallows.push(path); + } + } + + // Prefer explicit "libscope" group over the wildcard group + const libscopeGroups = groups.filter((g) => g.agents.includes("libscope")); + const selected = + libscopeGroups.length > 0 ? libscopeGroups : groups.filter((g) => g.agents.includes("*")); + + const disallowed = new Set<string>(); + for (const group of selected) { + for (const path of group.disallows) disallowed.add(path); + } + return disallowed; +} + +function isDisallowedByRobots(url: string, disallowed: Set<string>): boolean { + if (disallowed.size === 0) return false; + let pathname: string; + try { + pathname = new URL(url).pathname; + } catch { + return false; + } + for (const prefix of disallowed) { + if (pathname.startsWith(prefix)) return true; + } + return false; +} + +// ── Wildcard/glob pattern matching ────────────────────────────────────────── + +const REGEX_SPECIAL = new Set([".", "+", "^", "$", "{", "}", "(", ")", "|", "[", "]", "\\"]); + +// Match a URL against a simple glob pattern. +// Both * and ** match any sequence of characters including path separators. 
+// Matching is case-insensitive and applied to the full URL string. +function matchesGlob(url: string, pattern: string): boolean { + let regexStr = "^"; + let i = 0; + while (i < pattern.length) { + if (pattern[i] === "*" && pattern[i + 1] === "*") { + regexStr += ".*"; + i += 2; + if (pattern[i] === "/") i++; // skip optional trailing slash after ** + } else if (pattern[i] === "*") { + regexStr += ".*"; // * also matches / in URL context + i++; + } else { + const ch = pattern[i]!; + // Escape chars that are special in regex + if (REGEX_SPECIAL.has(ch)) { + regexStr += "\\" + ch; + } else { + regexStr += ch; + } + i++; + } + } + regexStr += "$"; + try { + return new RegExp(regexStr, "i").test(url); + } catch { + return false; + } +} + +function isExcluded(url: string, patterns: string[]): boolean { + return patterns.some((p) => matchesGlob(url, p)); +} + +// ── Domain / path filtering ────────────────────────────────────────────────── + +function isSameDomain(url: string, seedHostname: string): boolean { + try { + const parsed = new URL(url); + const host = parsed.hostname.toLowerCase(); + const seed = seedHostname.toLowerCase(); + // Allow exact match or subdomain match (e.g. docs.example.com vs example.com) + return host === seed || host.endsWith("." + seed); + } catch { + return false; + } +} + +function hasPathPrefix(url: string, prefix: string): boolean { + if (!prefix) return true; + try { + return new URL(url).pathname.startsWith(prefix); + } catch { + return false; + } +} + +// ── HTML → markdown (reuse url-fetcher's approach) ────────────────────────── + +function htmlToMarkdown(html: string): string { + return NodeHtmlMarkdown.translate(html); +} + +/** + * Remove all HTML tags from a string using indexOf-based scanning. + * Handles tags that span multiple lines and tags with > inside attribute values. + * This avoids regex-based tag stripping which can be bypassed by newlines in tags. 
+ */ +function stripTags(input: string): string { + let result = ""; + let pos = 0; + while (pos < input.length) { + const open = input.indexOf("<", pos); + if (open === -1) { + result += input.slice(pos); + break; + } + result += input.slice(pos, open); + // Scan for the closing > of this tag, respecting quoted attribute values + let i = open + 1; + while (i < input.length) { + const ch = input[i]; + if (ch === ">") { + i++; + break; + } + // Skip quoted attribute values so > inside them doesn't end the tag early + if (ch === '"' || ch === "'") { + const close = input.indexOf(ch, i + 1); + i = close === -1 ? input.length : close + 1; + } else { + i++; + } + } + pos = i; + } + // Collapse whitespace left behind by removed tags + return result.replace(/\s+/g, " "); +} + +function extractTitle(html: string, url: string): string { + // Try tag + const match = /<title[^>]*>([^<]+)<\/title>/i.exec(html); + if (match?.[1]) return match[1].trim(); + // Try first <h1> + const h1 = /<h1[^>]*>([\s\S]*?)<\/h1>/i.exec(html); + if (h1?.[1]) { + return stripTags(h1[1]).trim(); + } + // Fall back to URL path + try { + const parsed = new URL(url); + const path = parsed.pathname.replace(/\/$/, ""); + const last = path.split("/").pop(); + if (last) return last.replace(/[-_]/g, " ").replace(/\.\w+$/, ""); + return parsed.hostname; + } catch { + return url; + } +} + +// ── Spider engine ──────────────────────────────────────────────────────────── + +/** + * Spider a seed URL, yielding each successfully fetched page as a SpiderResult. + * Performs BFS up to maxDepth hops and maxPages total. 
+ * + * @example + * for await (const page of spiderUrl("https://docs.example.com", { maxPages: 50, maxDepth: 2 })) { + * await indexDocument(db, provider, { title: page.title, content: page.content, url: page.url }); + * } + */ +export async function* spiderUrl( + seedUrl: string, + options: SpiderOptions = {}, +): AsyncGenerator<SpiderResult, SpiderStats, unknown> { + const log = getLogger(); + + // Resolve effective limits + const maxPages = Math.min(options.maxPages ?? 25, HARD_MAX_PAGES); + const maxDepth = Math.min(options.maxDepth ?? 2, HARD_MAX_DEPTH); + const sameDomain = options.sameDomain ?? true; + const pathPrefix = options.pathPrefix ?? ""; + const excludePatterns = options.excludePatterns ?? []; + const requestDelay = options.requestDelay ?? DEFAULT_REQUEST_DELAY_MS; + const fetchOptions = options.fetchOptions; + + // Parse seed URL for domain filtering + let seedHostname: string; + let seedOrigin: string; + try { + const parsed = new URL(seedUrl); + seedHostname = parsed.hostname; + seedOrigin = parsed.origin; + } catch { + throw new FetchError("Invalid seed URL: " + seedUrl); + } + + const stats: SpiderStats = { + pagesFetched: 0, + pagesCrawled: 0, + pagesSkipped: 0, + errors: [], + }; + + // Per-origin robots.txt cache — fetched lazily as new origins are encountered. + // Pre-populate with the seed origin so we don't re-fetch it on the first page. 
+ const robotsCache = new Map<string, Set<string>>(); + const seedRobots = await fetchRobotsTxt(seedOrigin, fetchOptions); + robotsCache.set(seedOrigin, seedRobots); + log.debug({ origin: seedOrigin, rules: seedRobots.size }, "Loaded robots.txt rules"); + + const visited = new Set<string>(); + // BFS queue entries + type QueueEntry = { url: string; depth: number }; + const queue: QueueEntry[] = [{ url: seedUrl, depth: 0 }]; + + const deadline = Date.now() + HARD_TOTAL_TIMEOUT_MS; + + while (queue.length > 0 && stats.pagesFetched < maxPages) { + // Check total timeout + if (Date.now() > deadline) { + log.warn({ pagesFetched: stats.pagesFetched }, "Spider total timeout reached"); + stats.abortReason = "timeout"; + break; + } + + const entry = queue.shift()!; + const { url, depth } = entry; + + // Skip already-visited + if (visited.has(url)) continue; + visited.add(url); + + // Apply filters (except for seed URL at depth 0 — always fetch it) + if (depth > 0) { + if (sameDomain && !isSameDomain(url, seedHostname)) { + log.debug({ url }, "Spider: skipping cross-domain link"); + stats.pagesSkipped++; + continue; + } + if (pathPrefix && !hasPathPrefix(url, pathPrefix)) { + log.debug({ url, pathPrefix }, "Spider: skipping link outside path prefix"); + stats.pagesSkipped++; + continue; + } + if (excludePatterns.length > 0 && isExcluded(url, excludePatterns)) { + log.debug({ url }, "Spider: skipping excluded URL"); + stats.pagesSkipped++; + continue; + } + // Fetch robots.txt for new origins (cross-domain crawl when sameDomain is false) + let urlOrigin: string; + try { + urlOrigin = new URL(url).origin; + } catch { + urlOrigin = seedOrigin; + } + if (!robotsCache.has(urlOrigin)) { + const rules = await fetchRobotsTxt(urlOrigin, fetchOptions); + robotsCache.set(urlOrigin, rules); + log.debug( + { origin: urlOrigin, rules: rules.size }, + "Loaded robots.txt rules for new origin", + ); + } + if (isDisallowedByRobots(url, robotsCache.get(urlOrigin)!)) { + log.debug({ url }, 
"Spider: skipping URL disallowed by robots.txt"); + stats.pagesSkipped++; + continue; + } + } + + // Check maxPages before fetching + if (stats.pagesFetched >= maxPages) { + stats.abortReason = "maxPages"; + break; + } + + // Delay between requests (skip delay before first request) + if (stats.pagesCrawled > 0 && requestDelay > 0) { + await sleep(requestDelay); + } + + // Fetch the page + log.info({ url, depth }, "Spider: fetching page"); + stats.pagesCrawled++; + + let raw: Awaited<ReturnType<typeof fetchRaw>>; + try { + raw = await fetchRaw(url, fetchOptions); + } catch (err) { + const msg = err instanceof Error ? err.message : String(err); + log.warn({ url, err: msg }, "Spider: fetch failed, skipping"); + stats.errors.push({ url, error: msg }); + continue; + } + + // Normalize to the final URL after any redirects. + // This ensures the visited set, yielded URL, and link-extraction base are all consistent. + const canonicalUrl = raw.finalUrl || url; + if (canonicalUrl !== url) { + visited.add(canonicalUrl); + } + + // Convert to markdown + const isHtml = raw.contentType.includes("text/html"); + const content = isHtml ? htmlToMarkdown(raw.body) : raw.body; + const title = isHtml + ? extractTitle(raw.body, canonicalUrl) + : extractTextTitle(raw.body, canonicalUrl); + + stats.pagesFetched++; + yield { url: canonicalUrl, title, content, depth }; + + // Extract and enqueue child links if we haven't hit maxDepth + if (depth < maxDepth) { + if (isHtml) { + const links = extractLinks(raw.body, canonicalUrl); + for (const link of links) { + if (!visited.has(link)) { + queue.push({ url: link, depth: depth + 1 }); + } + } + log.debug({ url, linksFound: links.length }, "Spider: extracted links"); + } + } + } + + // If the loop exited via the outer while condition hitting maxPages (not via + // an explicit break with abortReason already set), record the reason now. 
+ if (!stats.abortReason && queue.length > 0 && stats.pagesFetched >= maxPages) { + stats.abortReason = "maxPages"; + } + + log.info( + { + pagesFetched: stats.pagesFetched, + pagesCrawled: stats.pagesCrawled, + pagesSkipped: stats.pagesSkipped, + errors: stats.errors.length, + abortReason: stats.abortReason, + }, + "Spider: crawl complete", + ); + + return stats; +} + +function extractTextTitle(text: string, url: string): string { + // For plain text/markdown, try first # heading + const match = /^#\s+(.+)$/m.exec(text); + if (match?.[1]) return match[1].trim(); + // Fall back to URL + try { + const parsed = new URL(url); + const path = parsed.pathname.replace(/\/$/, ""); + const last = path.split("/").pop(); + if (last) return last.replace(/[-_]/g, " ").replace(/\.\w+$/, ""); + return parsed.hostname; + } catch { + return url; + } +} + +function sleep(ms: number): Promise<void> { + return new Promise((resolve) => setTimeout(resolve, ms)); +} diff --git a/src/core/tags.ts b/src/core/tags.ts index 1fe4bfe..f37180d 100644 --- a/src/core/tags.ts +++ b/src/core/tags.ts @@ -222,6 +222,37 @@ export function removeTagFromDocument( log.info({ documentId, tagId }, "Tag removed from document"); } +/** Get all tags for multiple documents in a single query. Returns a Map of documentId → tags. */ +export function getDocumentTagsBatch( + db: Database.Database, + documentIds: string[], +): Map<string, Tag[]> { + if (documentIds.length === 0) return new Map(); + const placeholders = documentIds.map(() => "?").join(", "); + const rows = db + .prepare( + `SELECT dt.document_id, t.id, t.name, t.created_at + FROM tags t + JOIN document_tags dt ON dt.tag_id = t.id + WHERE dt.document_id IN (${placeholders}) + ORDER BY t.name`, + ) + .all(...documentIds) as Array<{ + document_id: string; + id: string; + name: string; + created_at: string; + }>; + + const result = new Map<string, Tag[]>(); + for (const row of rows) { + const entry = result.get(row.document_id) ?? 
[]; + entry.push({ id: row.id, name: row.name, createdAt: row.created_at }); + result.set(row.document_id, entry); + } + return result; +} + /** Get all tags for a specific document. */ export function getDocumentTags(db: Database.Database, documentId: string): Tag[] { const rows = db @@ -313,14 +344,39 @@ export function getDocumentsByTag( } /** Tokenize text into lowercase words, filtering stopwords and short words. */ -function tokenize(text: string): string[] { +export function tokenize(text: string): string[] { return text .toLowerCase() .split(/[^a-z0-9]+/) .filter((w) => w.length >= 3 && !STOPWORDS.has(w)); } -/** Suggest tags for a document based on content analysis (TF-IDF-like keyword extraction). */ +/** Suggest tags from raw text without requiring a database (for pack creation). */ +export function suggestTagsFromText( + title: string, + content: string, + maxSuggestions?: number, +): string[] { + const limit = maxSuggestions ?? 5; + const fullText = `${title} ${content}`; + const tokens = tokenize(fullText); + if (tokens.length === 0) return []; + + const tf = new Map<string, number>(); + for (const token of tokens) { + tf.set(token, (tf.get(token) ?? 
0) + 1); + } + + const maxTf = Math.max(...tf.values()); + const scored: Array<{ term: string; score: number }> = []; + + for (const [term, count] of tf) { + scored.push({ term, score: count / maxTf }); + } + + scored.sort((a, b) => b.score - a.score); + return scored.slice(0, limit).map((s) => s.term); +} export function suggestTags( db: Database.Database, documentId: string, diff --git a/src/core/url-fetcher.ts b/src/core/url-fetcher.ts index 6953bac..8d5441c 100644 --- a/src/core/url-fetcher.ts +++ b/src/core/url-fetcher.ts @@ -1,30 +1,19 @@ import { promises as dns, lookup as dnsLookup } from "node:dns"; import { promisify } from "node:util"; +import { Agent } from "undici"; import { NodeHtmlMarkdown } from "node-html-markdown"; import { FetchError } from "../errors.js"; import { getLogger } from "../logger.js"; -const lookupAsync = promisify(dnsLookup); - -let tlsWarningLogged = false; - -/** - * Log a one-time warning when `allowSelfSignedCerts` is enabled but the - * user has not set `NODE_TLS_REJECT_UNAUTHORIZED=0` in their environment. - * Setting the env var programmatically is a security anti-pattern flagged - * by static analysis tools — the user must opt in at the process level. - */ -function warnIfTlsBypassMissing(): void { - if (tlsWarningLogged) return; - if (process.env["NODE_TLS_REJECT_UNAUTHORIZED"] === "0") return; - tlsWarningLogged = true; - const log = getLogger(); - log.warn( - "allowSelfSignedCerts is enabled but NODE_TLS_REJECT_UNAUTHORIZED is not set. " + - "Set NODE_TLS_REJECT_UNAUTHORIZED=0 in your environment to allow self-signed certificates.", - ); +/** Lazy singleton undici Agent that skips TLS certificate verification. 
*/ +let _insecureAgent: Agent | undefined; +function getInsecureAgent(): Agent { + _insecureAgent ??= new Agent({ connect: { rejectUnauthorized: false } }); + return _insecureAgent; } +const lookupAsync = promisify(dnsLookup); + export interface FetchedDocument { title: string; content: string; @@ -178,10 +167,11 @@ async function fetchWithRedirects( allowPrivateUrls: boolean, allowSelfSignedCerts: boolean, ): Promise<Response> { - if (allowSelfSignedCerts) { - warnIfTlsBypassMissing(); - } - return _fetchWithRedirects(url, timeout, maxRedirects, allowPrivateUrls); + // Pass a per-request undici Agent when self-signed certs are allowed. + // This is scoped to this specific request chain and does not affect other + // concurrent requests (unlike mutating process.env["NODE_TLS_REJECT_UNAUTHORIZED"]). + const dispatcher = allowSelfSignedCerts ? getInsecureAgent() : undefined; + return _fetchWithRedirects(url, timeout, maxRedirects, allowPrivateUrls, dispatcher); } async function _fetchWithRedirects( @@ -189,6 +179,7 @@ async function _fetchWithRedirects( timeout: number, maxRedirects: number, allowPrivateUrls: boolean, + dispatcher: Agent | undefined, ): Promise<Response> { let current = url; for (let i = 0; i < maxRedirects; i++) { @@ -205,7 +196,8 @@ async function _fetchWithRedirects( }, signal: AbortSignal.timeout(timeout), redirect: "manual", - }); + ...(dispatcher ? { dispatcher: dispatcher as unknown } : {}), + } as RequestInit); // Re-validate the connected IP hasn't changed (DNS rebinding defense) // Re-resolve and confirm it still matches the pinned set @@ -246,6 +238,57 @@ async function _fetchWithRedirects( throw new FetchError(`Too many redirects (max ${maxRedirects})`); } +/** Result of a raw fetch — HTML/text body before any conversion, plus resolved final URL. */ +export interface FetchedRaw { + body: string; + contentType: string; + finalUrl: string; +} + +/** + * Fetch a URL and return the raw body without converting to markdown. 
+ * Useful for callers (e.g. the spider) that need access to raw HTML for link extraction. + * Applies all the same SSRF protection, redirect following, and body-size limits as fetchAndConvert. + */ +export async function fetchRaw(url: string, options?: FetchOptions): Promise<FetchedRaw> { + const log = getLogger(); + log.debug({ url }, "Fetching raw URL"); + + const { timeout, maxRedirects, maxBodySize, allowPrivateUrls, allowSelfSignedCerts } = { + ...DEFAULT_FETCH_OPTIONS, + ...options, + }; + + try { + await validateUrl(url, allowPrivateUrls); + + const response = await fetchWithRedirects( + url, + timeout, + maxRedirects, + allowPrivateUrls, + allowSelfSignedCerts, + ); + + if (!response.ok) { + throw new FetchError(`HTTP ${response.status}: ${response.statusText}`); + } + + const contentType = response.headers.get("content-type") ?? ""; + const body = await readBodyWithLimit(response, maxBodySize); + // Derive final URL from redirect chain (fetchWithRedirects resolves relative locations) + const finalUrl = response.url ?? url; + + return { body, contentType, finalUrl }; + } catch (err) { + if (err instanceof FetchError) throw err; + throw new FetchError( + `Failed to fetch URL: ${url} — ${err instanceof Error ? err.message : String(err)}`, + err, + ); + } +} + /** * Fetch a URL and convert its HTML content to clean markdown-like text. * Strips tags, preserves code blocks and headings. 
diff --git a/src/core/versioning.ts b/src/core/versioning.ts index 0bcee13..756fbf5 100644 --- a/src/core/versioning.ts +++ b/src/core/versioning.ts @@ -3,6 +3,7 @@ import { randomUUID } from "node:crypto"; import type { EmbeddingProvider } from "../providers/embedding.js"; import { DocumentNotFoundError } from "../errors.js"; import { getDocument, updateDocument } from "./documents.js"; +import { getLogger } from "../logger.js"; export const MAX_VERSIONS_DEFAULT = 10; @@ -174,7 +175,11 @@ function mapRow(row: { if (row.metadata) { try { metadata = JSON.parse(row.metadata) as Record<string, unknown>; - } catch { + } catch (err) { + getLogger().warn( + { err, versionId: row.id }, + "Failed to parse version metadata JSON; using null", + ); metadata = null; } } diff --git a/src/core/webhooks.ts b/src/core/webhooks.ts index 84c5b2b..ac98176 100644 --- a/src/core/webhooks.ts +++ b/src/core/webhooks.ts @@ -79,10 +79,19 @@ function recordFailure( } function rowToWebhook(row: WebhookRow): Webhook { + let events: WebhookEvent[] = []; + try { + events = JSON.parse(row.events) as WebhookEvent[]; + } catch { + getLogger().warn( + { webhookId: row.id }, + "Failed to parse webhook events JSON; defaulting to []", + ); + } return { id: row.id, url: row.url, - events: JSON.parse(row.events) as WebhookEvent[], + events, secret: row.secret, active: row.active === 1, createdAt: row.created_at, @@ -251,7 +260,11 @@ export function fireWebhooks( data: Record<string, unknown>, ): void { const log = getLogger(); - const rows = db.prepare("SELECT * FROM webhooks WHERE active = 1").all() as WebhookRow[]; + const rows = db + .prepare( + "SELECT id, url, events, secret, active, created_at, last_triggered_at, failure_count FROM webhooks WHERE active = 1", + ) + .all() as WebhookRow[]; const body = buildPayload(event, data); diff --git a/src/mcp/errors.ts b/src/mcp/errors.ts new file mode 100644 index 0000000..38f07b8 --- /dev/null +++ b/src/mcp/errors.ts @@ -0,0 +1,40 @@ +import { 
LibScopeError } from "../errors.js"; +import { getLogger } from "../logger.js"; + +/** Convert a thrown error into an MCP error response object. */ +export function errorResponse(err: unknown): { + content: Array<{ type: "text"; text: string }>; + isError: true; +} { + let message: string; + if (err instanceof LibScopeError) { + message = err.message; + } else if (err instanceof Error) { + message = `${err.name}: ${err.message}`; + } else { + message = `An unexpected error occurred: ${String(err)}`; + } + + const log = getLogger(); + log.error({ err }, "MCP tool error"); + + return { + content: [{ type: "text" as const, text: `Error: ${message}` }], + isError: true, + }; +} + +export type ToolResult = { content: Array<{ type: "text"; text: string }>; isError?: boolean }; + +/** Wraps a tool handler so that thrown errors are converted to MCP error responses. */ +export function withErrorHandling<P>( + handler: (params: P) => ToolResult | Promise<ToolResult>, +): (params: P) => Promise<ToolResult> { + return async (params: P) => { + try { + return await handler(params); + } catch (err) { + return errorResponse(err); + } + }; +} diff --git a/src/mcp/server.ts b/src/mcp/server.ts index 8d62806..cd44e5c 100644 --- a/src/mcp/server.ts +++ b/src/mcp/server.ts @@ -6,7 +6,13 @@ import { getDatabase, runMigrations, createVectorTable } from "../db/index.js"; import { getActiveWorkspace, getWorkspacePath } from "../core/workspace.js"; import { createEmbeddingProvider } from "../providers/index.js"; import { searchDocuments } from "../core/search.js"; -import { askQuestion, createLlmProvider, type LlmProvider } from "../core/rag.js"; +import { + askQuestion, + createLlmProvider, + getContextForQuestion, + isPassthroughMode, + type LlmProvider, +} from "../core/rag.js"; import { getDocument, listDocuments, deleteDocument, updateDocument } from "../core/documents.js"; import { rateDocument, getDocumentRatings } from "../core/ratings.js"; import { indexDocument } from 
"../core/indexing.js"; @@ -23,45 +29,12 @@ import { createWebhook, listWebhooks, deleteWebhook, redactWebhook } from "../co import type { WebhookEvent } from "../core/webhooks.js"; import { suggestTags } from "../core/tags.js"; import { fetchAndConvert } from "../core/url-fetcher.js"; +import { spiderUrl } from "../core/spider.js"; +import type { SpiderOptions } from "../core/spider.js"; import { initLogger, getLogger } from "../logger.js"; -import { ConfigError, LibScopeError, ValidationError } from "../errors.js"; - -function errorResponse(err: unknown): { - content: Array<{ type: "text"; text: string }>; - isError: true; -} { - let message: string; - if (err instanceof LibScopeError) { - message = err.message; - } else if (err instanceof Error) { - message = `${err.name}: ${err.message}`; - } else { - message = `An unexpected error occurred: ${String(err)}`; - } - - const log = getLogger(); - log.error({ err }, "MCP tool error"); - - return { - content: [{ type: "text" as const, text: `Error: ${message}` }], - isError: true, - }; -} - -type ToolResult = { content: Array<{ type: "text"; text: string }>; isError?: boolean }; - -/** Wraps a tool handler so that thrown errors are converted to MCP error responses. */ -function withErrorHandling<P>( - handler: (params: P) => ToolResult | Promise<ToolResult>, -): (params: P) => Promise<ToolResult> { - return async (params: P) => { - try { - return await handler(params); - } catch (err) { - return errorResponse(err); - } - }; -} +import { ConfigError, ValidationError } from "../errors.js"; +import { errorResponse, withErrorHandling } from "./errors.js"; +export { errorResponse, withErrorHandling, type ToolResult } from "./errors.js"; // Start the server async function main(): Promise<void> { @@ -330,7 +303,7 @@ async function main(): Promise<void> { // Tool: submit-document server.tool( "submit-document", - "Submit a new document for indexing into the knowledge base. 
You can provide content directly, or provide a URL to fetch and index automatically.", + "Submit a new document for indexing into the knowledge base. You can provide content directly, or provide a URL to fetch and index automatically. Set spider=true to crawl linked pages from the URL.", { title: z .string() @@ -353,17 +326,104 @@ async function main(): Promise<void> { topic: z.string().optional().describe("Topic ID to categorize under"), library: z.string().optional().describe("Library name (for library docs)"), version: z.string().optional().describe("Library version"), + spider: z + .boolean() + .optional() + .describe("When true, crawl pages linked from the URL. Requires 'url'. Default: false."), + maxPages: z + .number() + .int() + .positive() + .optional() + .describe("Maximum pages to index during a spider run (default: 25, hard cap: 200)."), + maxDepth: z + .number() + .int() + .min(0) + .optional() + .describe( + "Maximum link-hop depth from the seed URL (default: 2, hard cap: 5). 0 = seed only.", + ), + sameDomain: z + .boolean() + .optional() + .describe("Only follow links on the same domain as the seed URL (default: true)."), + pathPrefix: z + .string() + .optional() + .describe("Only follow links whose path starts with this prefix (e.g. '/docs/')."), + excludePatterns: z + .array(z.string()) + .optional() + .describe("Glob patterns for URLs to skip (e.g. 
['*/changelog*', '*/api/v1/*'])."), }, withErrorHandling(async (params) => { let { title, content } = params; const { url, library, version, topic } = params; + const fetchOptions = { + allowPrivateUrls: config.indexing.allowPrivateUrls, + allowSelfSignedCerts: config.indexing.allowSelfSignedCerts, + }; + + // Spider mode — crawl linked pages from the URL + if (params.spider && !url) { + throw new ValidationError("Field 'url' is required when spider is true"); + } + if (params.spider && url) { + const spiderOptions: SpiderOptions = { fetchOptions }; + if (params.maxPages !== undefined) spiderOptions.maxPages = params.maxPages; + if (params.maxDepth !== undefined) spiderOptions.maxDepth = params.maxDepth; + if (params.sameDomain !== undefined) spiderOptions.sameDomain = params.sameDomain; + if (params.pathPrefix !== undefined) spiderOptions.pathPrefix = params.pathPrefix; + if (params.excludePatterns !== undefined) + spiderOptions.excludePatterns = params.excludePatterns; + + const indexed: Array<{ id: string; title: string }> = []; + const errors: Array<{ url: string; error: string }> = []; + const sourceType = params.sourceType ?? (library ? "library" : "manual"); + + const gen = spiderUrl(url, spiderOptions); + let result = await gen.next(); + while (!result.done) { + const page = result.value; + try { + const doc = await indexDocument(db, provider, { + title: page.title, + content: page.content, + sourceType, + library, + version, + topicId: topic, + url: page.url, + submittedBy: "model", + }); + indexed.push({ id: doc.id, title: page.title }); + } catch (err) { + errors.push({ url: page.url, error: err instanceof Error ? err.message : String(err) }); + } + result = await gen.next(); + } + const stats = result.value; + + const summary = [ + `Spider complete.`, + `Pages indexed: ${indexed.length}`, + `Pages crawled: ${stats?.pagesCrawled ?? indexed.length}`, + `Pages skipped: ${stats?.pagesSkipped ?? 0}`, + errors.length > 0 ? 
`Errors: ${errors.length}` : null, + stats?.abortReason ? `Stopped early: ${stats.abortReason}` : null, + ] + .filter(Boolean) + .join("\n"); + + return { + content: [{ type: "text" as const, text: summary }], + }; + } // If URL is provided and no content, fetch it if (url && !content) { - const fetched = await fetchAndConvert(url, { - allowPrivateUrls: config.indexing.allowPrivateUrls, - allowSelfSignedCerts: config.indexing.allowSelfSignedCerts, - }); + const fetched = await fetchAndConvert(url, fetchOptions); content = fetched.content; title ??= fetched.title; } @@ -543,9 +603,30 @@ async function main(): Promise<void> { library: z.string().optional().describe("Filter by library name"), }, withErrorHandling(async (params) => { + if (isPassthroughMode(config)) { + const { contextPrompt, sources } = await getContextForQuestion(db, provider, { + question: params.question, + topK: params.topK, + topic: params.topic, + library: params.library, + }); + + const sourcesText = + sources.length > 0 + ? "\n\n**Sources:**\n" + + sources + .map((s) => `- ${s.title} (score: ${s.score.toFixed(2)}) [${s.documentId}]`) + .join("\n") + : ""; + + return { + content: [{ type: "text" as const, text: contextPrompt + sourcesText }], + }; + } + if (!llmProvider) { throw new ConfigError( - "No LLM provider configured. Set llm.provider to 'openai' or 'ollama' in your config.", + "No LLM provider configured. Set llm.provider to 'openai', 'ollama', or 'passthrough' in your config.", ); } diff --git a/src/providers/local.ts b/src/providers/local.ts index 10e4a7b..37391b3 100644 --- a/src/providers/local.ts +++ b/src/providers/local.ts @@ -2,6 +2,17 @@ import { EmbeddingError } from "../errors.js"; import type { EmbeddingProvider } from "./embedding.js"; import { getLogger } from "../logger.js"; +/** Minimal typed interface for the @xenova/transformers feature-extraction pipeline output. 
*/ +interface TransformersOutput { + data: Float32Array; +} + +/** Minimal typed interface for the @xenova/transformers feature-extraction pipeline function. */ +type FeatureExtractionPipeline = ( + input: string, + options: { pooling: string; normalize: boolean }, +) => Promise<TransformersOutput>; + /** * Local embedding provider using @xenova/transformers (all-MiniLM-L6-v2). * Downloads the model on first use (~80MB). Runs entirely in-process. @@ -10,7 +21,7 @@ export class LocalEmbeddingProvider implements EmbeddingProvider { readonly name = "local"; readonly dimensions = 384; - private pipeline: unknown = null; + private pipeline: FeatureExtractionPipeline | null = null; private initPromise: Promise<void> | null = null; private async ensureInitialized(): Promise<void> { @@ -24,7 +35,11 @@ export class LocalEmbeddingProvider implements EmbeddingProvider { try { // Dynamic import to avoid loading transformers until needed const { pipeline } = await import("@xenova/transformers"); - this.pipeline = await pipeline("feature-extraction", "Xenova/all-MiniLM-L6-v2"); + // Cast to the typed interface; @xenova/transformers lacks precise TS generics for pipeline output + this.pipeline = (await pipeline( + "feature-extraction", + "Xenova/all-MiniLM-L6-v2", + )) as unknown as FeatureExtractionPipeline; log.info("Local embedding model loaded successfully"); } catch (err) { this.initPromise = null; @@ -38,10 +53,8 @@ export class LocalEmbeddingProvider implements EmbeddingProvider { } await this.ensureInitialized(); try { - // eslint-disable-next-line @typescript-eslint/no-unsafe-call, @typescript-eslint/no-explicit-any, @typescript-eslint/no-unsafe-assignment - const output = await (this.pipeline as any)(text, { pooling: "mean", normalize: true }); - // eslint-disable-next-line @typescript-eslint/no-unsafe-member-access - const embedding = Array.from(output.data as Float32Array); + const output = await this.pipeline!(text, { pooling: "mean", normalize: true }); + const 
embedding = Array.from(output.data); if (embedding.length !== this.dimensions) { throw new EmbeddingError( `Expected embedding dimension ${this.dimensions}, got ${embedding.length}`, diff --git a/tests/unit/bulk.test.ts b/tests/unit/bulk.test.ts index e9239fc..37094b8 100644 --- a/tests/unit/bulk.test.ts +++ b/tests/unit/bulk.test.ts @@ -76,6 +76,25 @@ describe("bulk operations", () => { expect(ids).toContain("doc-b"); }); + it("filters by dateFrom", () => { + insertDoc(db, "doc-old", "Old Doc", { + library: "react", + createdAt: "2020-01-01T00:00:00.000Z", + }); + const ids = resolveSelector(db, { library: "react", dateFrom: "2024-01-01T00:00:00.000Z" }); + expect(ids).not.toContain("doc-old"); + expect(ids).toContain("doc-a"); + }); + + it("filters by dateTo", () => { + insertDoc(db, "doc-future", "Future Doc", { + library: "react", + createdAt: "2099-01-01T00:00:00.000Z", + }); + const ids = resolveSelector(db, { library: "react", dateTo: "2025-01-01T00:00:00.000Z" }); + expect(ids).not.toContain("doc-future"); + }); + it("throws on empty selector", () => { expect(() => resolveSelector(db, {})).toThrow(ValidationError); }); @@ -90,9 +109,64 @@ describe("bulk operations", () => { expect(ids.length).toBeLessThanOrEqual(10); }); - it("returns empty array for negative limit", () => { - const ids = resolveSelector(db, { library: "react" }, -5); - expect(ids).toHaveLength(0); + it("throws ValidationError for negative limit", () => { + expect(() => resolveSelector(db, { library: "react" }, -5)).toThrow(ValidationError); + expect(() => resolveSelector(db, { library: "react" }, -1)).toThrow( + "limit must be a non-negative integer", + ); + }); + + it("applies dateFrom filter at SQL level before LIMIT", () => { + // Insert enough docs to exceed a small limit, with varying dates + for (let i = 0; i < 20; i++) { + insertDoc(db, `old-${i}`, `Old Doc ${i}`, { + library: "test-lib", + createdAt: "2020-01-01T00:00:00.000Z", + }); + } + for (let i = 0; i < 5; i++) { + 
insertDoc(db, `new-${i}`, `New Doc ${i}`, { + library: "test-lib", + createdAt: "2025-06-01T00:00:00.000Z", + }); + } + + // With a limit of 10, date filter must happen in SQL before LIMIT, + // otherwise old docs could fill the limit and exclude new ones + const ids = resolveSelector( + db, + { library: "test-lib", dateFrom: "2025-01-01T00:00:00.000Z" }, + 10, + ); + expect(ids).toHaveLength(5); + for (const id of ids) { + expect(id).toMatch(/^new-/); + } + }); + + it("applies dateTo filter at SQL level before LIMIT", () => { + for (let i = 0; i < 20; i++) { + insertDoc(db, `future-${i}`, `Future Doc ${i}`, { + library: "test-lib", + createdAt: "2099-01-01T00:00:00.000Z", + }); + } + for (let i = 0; i < 5; i++) { + insertDoc(db, `past-${i}`, `Past Doc ${i}`, { + library: "test-lib", + createdAt: "2020-06-01T00:00:00.000Z", + }); + } + + const ids = resolveSelector( + db, + { library: "test-lib", dateTo: "2025-01-01T00:00:00.000Z" }, + 10, + ); + expect(ids).toHaveLength(5); + for (const id of ids) { + expect(id).toMatch(/^past-/); + } }); }); diff --git a/tests/unit/config.test.ts b/tests/unit/config.test.ts index 9d380c8..a84d0be 100644 --- a/tests/unit/config.test.ts +++ b/tests/unit/config.test.ts @@ -1,10 +1,11 @@ import { describe, it, expect, vi, beforeEach, afterEach } from "vitest"; -import { loadConfig, validateConfig } from "../../src/config.js"; +import { loadConfig, validateConfig, invalidateConfigCache } from "../../src/config.js"; import type { LibScopeConfig } from "../../src/config.js"; import * as loggerModule from "../../src/logger.js"; describe("config", () => { it("should return default config when no files exist", () => { + invalidateConfigCache(); const config = loadConfig(); expect(config.embedding.provider).toBe("local"); @@ -12,10 +13,18 @@ describe("config", () => { expect(config.database.path).toContain("libscope.db"); }); + it("should return cached config on repeated calls", () => { + invalidateConfigCache(); + const first = 
loadConfig(); + const second = loadConfig(); // cache hit + expect(second).toBe(first); // same object reference + }); + it("should respect LIBSCOPE_EMBEDDING_PROVIDER env var", () => { const original = process.env["LIBSCOPE_EMBEDDING_PROVIDER"]; try { process.env["LIBSCOPE_EMBEDDING_PROVIDER"] = "ollama"; + invalidateConfigCache(); const config = loadConfig(); expect(config.embedding.provider).toBe("ollama"); } finally { @@ -31,6 +40,7 @@ describe("config", () => { const original = process.env["LIBSCOPE_EMBEDDING_PROVIDER"]; try { process.env["LIBSCOPE_EMBEDDING_PROVIDER"] = "invalid"; + invalidateConfigCache(); const config = loadConfig(); // Should fall through to default since "invalid" doesn't match the switch expect(config.embedding.provider).toBe("local"); @@ -47,6 +57,7 @@ describe("config", () => { const original = process.env["LIBSCOPE_OPENAI_API_KEY"]; try { process.env["LIBSCOPE_OPENAI_API_KEY"] = "sk-test123"; + invalidateConfigCache(); const config = loadConfig(); expect(config.embedding.openaiApiKey).toBe("sk-test123"); } finally { @@ -62,6 +73,7 @@ describe("config", () => { const original = process.env["LIBSCOPE_OLLAMA_URL"]; try { process.env["LIBSCOPE_OLLAMA_URL"] = "http://custom:11434"; + invalidateConfigCache(); const config = loadConfig(); expect(config.embedding.ollamaUrl).toBe("http://custom:11434"); } finally { @@ -77,6 +89,7 @@ describe("config", () => { const original = process.env["LIBSCOPE_ALLOW_PRIVATE_URLS"]; try { process.env["LIBSCOPE_ALLOW_PRIVATE_URLS"] = "true"; + invalidateConfigCache(); const config = loadConfig(); expect(config.indexing.allowPrivateUrls).toBe(true); } finally { @@ -92,6 +105,7 @@ describe("config", () => { const original = process.env["LIBSCOPE_ALLOW_SELF_SIGNED_CERTS"]; try { process.env["LIBSCOPE_ALLOW_SELF_SIGNED_CERTS"] = "1"; + invalidateConfigCache(); const config = loadConfig(); expect(config.indexing.allowSelfSignedCerts).toBe(true); } finally { @@ -109,6 +123,7 @@ describe("config", () => { try { 
process.env["LIBSCOPE_LLM_PROVIDER"] = "ollama"; process.env["LIBSCOPE_LLM_MODEL"] = "llama3"; + invalidateConfigCache(); const config = loadConfig(); expect(config.llm?.provider).toBe("ollama"); expect(config.llm?.model).toBe("llama3"); diff --git a/tests/unit/connectors-config.test.ts b/tests/unit/connectors-config.test.ts index 28d15dc..3bfa6ac 100644 --- a/tests/unit/connectors-config.test.ts +++ b/tests/unit/connectors-config.test.ts @@ -144,6 +144,17 @@ describe("connectors config", () => { expect(result).toBe(true); expect(loadDbConnectorConfig(db, "notion")).toBeUndefined(); }); + + it("loadDbConnectorConfig throws ConfigError when config_json is corrupted", () => { + // Directly insert corrupted JSON into the database + db.prepare( + "INSERT INTO connector_configs (type, config_json, updated_at) VALUES (?, ?, datetime('now'))", + ).run("corrupted", "not valid json{{{"); + + expect(() => loadDbConnectorConfig(db, "corrupted")).toThrow( + /Corrupted connector config for type "corrupted"/, + ); + }); }); describe("sync tracker", () => { diff --git a/tests/unit/http-utils.test.ts b/tests/unit/http-utils.test.ts index fa8d745..7658343 100644 --- a/tests/unit/http-utils.test.ts +++ b/tests/unit/http-utils.test.ts @@ -108,7 +108,7 @@ describe("fetchWithRetry", () => { baseDelay: 10, }), ).rejects.toThrow(FetchError); - expect(mockFetch).toHaveBeenCalledTimes(2); + expect(mockFetch).toHaveBeenCalledTimes(3); // 1 initial + 2 retries vi.useFakeTimers(); }); diff --git a/tests/unit/link-extractor.test.ts b/tests/unit/link-extractor.test.ts new file mode 100644 index 0000000..f623eba --- /dev/null +++ b/tests/unit/link-extractor.test.ts @@ -0,0 +1,155 @@ +import { describe, it, expect } from "vitest"; +import { extractLinks } from "../../src/core/link-extractor.js"; + +const BASE = "https://example.com/docs/intro"; + +describe("extractLinks", () => { + it("extracts absolute http links", () => { + const html = `<a href="https://example.com/page">link</a>`; + 
expect(extractLinks(html, BASE)).toEqual(["https://example.com/page"]); + }); + + it("resolves relative links against base URL", () => { + const html = `<a href="../guide">guide</a>`; + const links = extractLinks(html, BASE); + expect(links).toEqual(["https://example.com/guide"]); + }); + + it("resolves root-relative links", () => { + const html = `<a href="/about">about</a>`; + expect(extractLinks(html, BASE)).toEqual(["https://example.com/about"]); + }); + + it("strips fragment-only links", () => { + const html = `<a href="#section">jump</a>`; + expect(extractLinks(html, BASE)).toEqual([]); + }); + + it("strips fragments from full URLs", () => { + const html = `<a href="https://example.com/page#section">link</a>`; + expect(extractLinks(html, BASE)).toEqual(["https://example.com/page"]); + }); + + it("deduplicates links", () => { + const html = ` + <a href="https://example.com/page">first</a> + <a href="https://example.com/page">second</a> + `; + expect(extractLinks(html, BASE)).toEqual(["https://example.com/page"]); + }); + + it("deduplicates after fragment stripping", () => { + const html = ` + <a href="https://example.com/page#a">a</a> + <a href="https://example.com/page#b">b</a> + `; + expect(extractLinks(html, BASE)).toEqual(["https://example.com/page"]); + }); + + it("filters out mailto: links", () => { + const html = `<a href="mailto:user@example.com">email</a>`; + expect(extractLinks(html, BASE)).toEqual([]); + }); + + it("filters out javascript: links", () => { + const html = `<a href="javascript:void(0)">noop</a>`; + expect(extractLinks(html, BASE)).toEqual([]); + }); + + it("filters out tel: links", () => { + const html = `<a href="tel:+15555555555">call</a>`; + expect(extractLinks(html, BASE)).toEqual([]); + }); + + it("filters out ftp: links", () => { + const html = `<a href="ftp://files.example.com/data">ftp</a>`; + expect(extractLinks(html, BASE)).toEqual([]); + }); + + it("filters out data: links", () => { + const html = `<a 
href="data:text/plain;base64,abc">data</a>`; + expect(extractLinks(html, BASE)).toEqual([]); + }); + + it("handles single-quoted href attributes", () => { + const html = `<a href='https://example.com/single'>link</a>`; + expect(extractLinks(html, BASE)).toEqual(["https://example.com/single"]); + }); + + it("handles unquoted href attributes", () => { + const html = `<a href=https://example.com/noquote>link</a>`; + expect(extractLinks(html, BASE)).toEqual(["https://example.com/noquote"]); + }); + + it("ignores tags that aren't <a>", () => { + const html = ` + <img src="https://example.com/img.png"> + <link href="https://example.com/style.css"> + <a href="https://example.com/real">real</a> + `; + expect(extractLinks(html, BASE)).toEqual(["https://example.com/real"]); + }); + + it("handles <a> tags with extra attributes", () => { + const html = `<a class="nav" id="main" href="https://example.com/page" target="_blank">link</a>`; + expect(extractLinks(html, BASE)).toEqual(["https://example.com/page"]); + }); + + it("handles href before other attributes", () => { + const html = `<a href="https://example.com/page" class="nav">link</a>`; + expect(extractLinks(html, BASE)).toEqual(["https://example.com/page"]); + }); + + it("strips trailing slash from non-root paths", () => { + const html = `<a href="https://example.com/docs/">docs</a>`; + expect(extractLinks(html, BASE)).toEqual(["https://example.com/docs"]); + }); + + it("preserves trailing slash on root path", () => { + const html = `<a href="https://example.com/">home</a>`; + expect(extractLinks(html, BASE)).toEqual(["https://example.com/"]); + }); + + it("returns empty array for HTML with no links", () => { + const html = `<p>No links here at all.</p>`; + expect(extractLinks(html, BASE)).toEqual([]); + }); + + it("returns empty array for empty string", () => { + expect(extractLinks("", BASE)).toEqual([]); + }); + + it("handles multiple links preserving discovery order", () => { + const html = ` + <a 
href="https://example.com/a">a</a> + <a href="https://example.com/b">b</a> + <a href="https://example.com/c">c</a> + `; + expect(extractLinks(html, BASE)).toEqual([ + "https://example.com/a", + "https://example.com/b", + "https://example.com/c", + ]); + }); + + it("handles malformed href gracefully", () => { + const html = `<a href="not a valid [url]">bad</a>`; + // Should not throw; just skip + expect(() => extractLinks(html, BASE)).not.toThrow(); + }); + + it("skips <abbr> and <article> tags (not <a>)", () => { + const html = `<abbr href="https://example.com/x">X</abbr>`; + expect(extractLinks(html, BASE)).toEqual([]); + }); + + it("handles https links alongside http", () => { + const html = ` + <a href="http://example.com/http">http</a> + <a href="https://example.com/https">https</a> + `; + const links = extractLinks(html, BASE); + expect(links).toContain("http://example.com/http"); + expect(links).toContain("https://example.com/https"); + }); +}); diff --git a/tests/unit/mcp-server.test.ts b/tests/unit/mcp-server.test.ts new file mode 100644 index 0000000..1e0f65a --- /dev/null +++ b/tests/unit/mcp-server.test.ts @@ -0,0 +1,209 @@ +import { describe, it, expect, beforeEach, afterEach, vi } from "vitest"; +import { createTestDbWithVec } from "../fixtures/test-db.js"; +import { MockEmbeddingProvider } from "../fixtures/mock-provider.js"; +import { initLogger } from "../../src/logger.js"; +import { errorResponse, withErrorHandling, type ToolResult } from "../../src/mcp/errors.js"; +import { LibScopeError, ValidationError, DocumentNotFoundError } from "../../src/errors.js"; +import type Database from "better-sqlite3"; + +describe("MCP server helpers", () => { + beforeEach(() => { + initLogger("silent"); + }); + + describe("errorResponse", () => { + it("returns isError: true with text content", () => { + const result = errorResponse(new Error("something went wrong")); + expect(result.isError).toBe(true); + expect(result.content).toHaveLength(1); + 
expect(result.content[0]!.type).toBe("text"); + }); + + it("formats LibScopeError using just the message", () => { + const result = errorResponse(new ValidationError("invalid input")); + expect(result.content[0]!.text).toBe("Error: invalid input"); + }); + + it("formats a generic Error using name: message", () => { + const err = new TypeError("bad type"); + const result = errorResponse(err); + expect(result.content[0]!.text).toBe("Error: TypeError: bad type"); + }); + + it("formats non-Error values using String()", () => { + const result = errorResponse("raw string error"); + expect(result.content[0]!.text).toContain("raw string error"); + }); + + it("formats null/undefined without throwing", () => { + expect(() => errorResponse(null)).not.toThrow(); + expect(() => errorResponse(undefined)).not.toThrow(); + }); + }); + + describe("withErrorHandling", () => { + it("returns the handler result when no error is thrown", async () => { + const expected: ToolResult = { content: [{ type: "text", text: "ok" }] }; + const wrapped = withErrorHandling(() => expected); + const result = await wrapped({}); + expect(result).toEqual(expected); + }); + + it("catches synchronous throws and returns an error response", async () => { + const wrapped = withErrorHandling(() => { + throw new ValidationError("bad input"); + }); + const result = await wrapped({}); + expect(result.isError).toBe(true); + expect(result.content[0]!.text).toContain("bad input"); + }); + + it("catches rejected promises and returns an error response", async () => { + const wrapped = withErrorHandling(() => { + return Promise.reject(new DocumentNotFoundError("doc-123")); + }); + const result = await wrapped({}); + expect(result.isError).toBe(true); + }); + + it("passes params to the inner handler", async () => { + const handler = vi.fn().mockReturnValue({ content: [{ type: "text", text: "done" }] }); + const wrapped = withErrorHandling(handler); + const params = { docId: "abc", query: "test" }; + await 
wrapped(params); + expect(handler).toHaveBeenCalledWith(params); + }); + + it("returns isError: true for LibScopeError subclasses", async () => { + const wrapped = withErrorHandling(() => { + throw new LibScopeError("base lib error"); + }); + const result = await wrapped({}); + expect(result.isError).toBe(true); + expect(result.content[0]!.text).toBe("Error: base lib error"); + }); + }); +}); + +// Integration-style tests for MCP tool behaviors using the underlying core functions +// These verify the business logic that MCP tools delegate to. +describe("MCP tool business logic", () => { + let db: Database.Database; + let provider: MockEmbeddingProvider; + + beforeEach(() => { + initLogger("silent"); + db = createTestDbWithVec(); + provider = new MockEmbeddingProvider(); + }); + + afterEach(() => { + db.close(); + }); + + it("search returns empty response when no documents are indexed", async () => { + const { searchDocuments } = await import("../../src/core/search.js"); + const { results, totalCount } = await searchDocuments(db, provider, { query: "anything" }); + expect(results).toHaveLength(0); + expect(totalCount).toBe(0); + }); + + it("indexDocument then getDocument returns indexed document", async () => { + const { indexDocument } = await import("../../src/core/indexing.js"); + const { getDocument } = await import("../../src/core/documents.js"); + + const indexed = await indexDocument(db, provider, { + title: "Test Doc", + content: "Some content for testing.", + sourceType: "manual", + }); + + expect(indexed.id).toBeTruthy(); + + const fetched = getDocument(db, indexed.id); + expect(fetched.title).toBe("Test Doc"); + expect(fetched.content).toBe("Some content for testing."); + }); + + it("deleteDocument removes a document", async () => { + const { indexDocument } = await import("../../src/core/indexing.js"); + const { deleteDocument, getDocument } = await import("../../src/core/documents.js"); + + const indexed = await indexDocument(db, provider, { + title: 
"Delete Me", + content: "This will be deleted.", + sourceType: "manual", + }); + + deleteDocument(db, indexed.id); + + expect(() => getDocument(db, indexed.id)).toThrow(DocumentNotFoundError); + }); + + it("listDocuments returns paginated documents", async () => { + const { indexDocument } = await import("../../src/core/indexing.js"); + const { listDocuments } = await import("../../src/core/documents.js"); + + await indexDocument(db, provider, { + title: "Doc A", + content: "Content A", + sourceType: "library", + library: "react", + }); + await indexDocument(db, provider, { + title: "Doc B", + content: "Content B", + sourceType: "library", + library: "vue", + }); + + const all = listDocuments(db, {}); + expect(all.length).toBeGreaterThanOrEqual(2); + + const limited = listDocuments(db, { limit: 1 }); + expect(limited).toHaveLength(1); + }); + + it("getDocumentRatings returns zero ratings for new document", async () => { + const { indexDocument } = await import("../../src/core/indexing.js"); + const { getDocumentRatings } = await import("../../src/core/ratings.js"); + + const indexed = await indexDocument(db, provider, { + title: "Rate Me", + content: "Rateable content.", + sourceType: "manual", + }); + + const ratings = getDocumentRatings(db, indexed.id); + expect(ratings.totalRatings).toBe(0); + expect(ratings.averageRating).toBe(0); + }); + + it("rateDocument stores a rating and updates average", async () => { + const { indexDocument } = await import("../../src/core/indexing.js"); + const { rateDocument, getDocumentRatings } = await import("../../src/core/ratings.js"); + + const indexed = await indexDocument(db, provider, { + title: "Rate Me", + content: "Rateable content.", + sourceType: "manual", + }); + + rateDocument(db, { documentId: indexed.id, rating: 4, feedback: "good doc" }); + const ratings = getDocumentRatings(db, indexed.id); + expect(ratings.totalRatings).toBe(1); + expect(ratings.averageRating).toBe(4); + }); + + it("listTopics returns empty array 
when no topics exist", async () => { + const { listTopics } = await import("../../src/core/topics.js"); + const topics = listTopics(db); + expect(topics).toEqual([]); + }); + + it("errorResponse for DocumentNotFoundError returns proper message", () => { + const result = errorResponse(new DocumentNotFoundError("missing-id")); + expect(result.isError).toBe(true); + expect(result.content[0]!.text).toContain("missing-id"); + }); +}); diff --git a/tests/unit/packs.test.ts b/tests/unit/packs.test.ts index 90edf79..45bf0f5 100644 --- a/tests/unit/packs.test.ts +++ b/tests/unit/packs.test.ts @@ -1,7 +1,8 @@ -import { describe, it, expect, beforeEach, afterEach } from "vitest"; -import { writeFileSync, existsSync, mkdtempSync } from "node:fs"; +import { describe, it, expect, vi, beforeEach, afterEach } from "vitest"; +import { writeFileSync, existsSync, mkdtempSync, readFileSync } from "node:fs"; import { join } from "node:path"; import { tmpdir } from "node:os"; +import { gzipSync, gunzipSync } from "node:zlib"; import type Database from "better-sqlite3"; import { createTestDbWithVec } from "../fixtures/test-db.js"; import { MockEmbeddingProvider } from "../fixtures/mock-provider.js"; @@ -11,6 +12,7 @@ import { listInstalledPacks, createPack, listAvailablePacks, + createPackFromSource, } from "../../src/core/packs.js"; import type { KnowledgePack } from "../../src/core/packs.js"; import { indexDocument } from "../../src/core/indexing.js"; @@ -443,4 +445,626 @@ describe("knowledge packs", () => { ); }); }); + + describe("createPackFromSource", () => { + let sourceDir: string; + + beforeEach(() => { + sourceDir = mkdtempSync(join(tmpdir(), "libscope-pack-source-")); + }); + + it("should create a pack from a folder of markdown files", async () => { + writeFileSync(join(sourceDir, "guide.md"), "# Guide\n\nThis is a guide."); + writeFileSync(join(sourceDir, "api.md"), "# API\n\nEndpoint reference."); + + const pack = await createPackFromSource({ + name: "test-from-folder", + 
from: [sourceDir], + }); + + expect(pack.name).toBe("test-from-folder"); + expect(pack.documents).toHaveLength(2); + expect(pack.documents.map((d) => d.title).sort()).toEqual(["api", "guide"]); + expect(pack.documents[0]!.content).toBeTruthy(); + expect(pack.documents[0]!.source).toMatch(/^file:\/\//); + expect(pack.version).toBe("1.0.0"); + expect(pack.metadata.author).toBe("libscope"); + }); + + it("should write pack to outputPath", async () => { + writeFileSync(join(sourceDir, "doc.md"), "# Doc\n\nContent here."); + const outputPath = join(tempDir, "output-pack.json"); + + const pack = await createPackFromSource({ + name: "output-test", + from: [sourceDir], + outputPath, + }); + + expect(existsSync(outputPath)).toBe(true); + const written = JSON.parse(readFileSync(outputPath, "utf-8")) as KnowledgePack; + expect(written.name).toBe("output-test"); + expect(written.documents).toHaveLength(1); + expect(pack.documents).toHaveLength(1); + }); + + it("should filter by extensions", async () => { + writeFileSync(join(sourceDir, "readme.md"), "# Readme"); + writeFileSync(join(sourceDir, "page.html"), "<h1>Page</h1><p>Content</p>"); + writeFileSync(join(sourceDir, "data.json"), '{"key": "value"}'); + + const pack = await createPackFromSource({ + name: "ext-filter", + from: [sourceDir], + extensions: [".md"], + }); + + expect(pack.documents).toHaveLength(1); + expect(pack.documents[0]!.title).toBe("readme"); + }); + + it("should handle extensions without leading dot", async () => { + writeFileSync(join(sourceDir, "readme.md"), "# Readme\n\nContent"); + + const pack = await createPackFromSource({ + name: "ext-no-dot", + from: [sourceDir], + extensions: ["md"], + }); + + expect(pack.documents).toHaveLength(1); + }); + + it("should exclude files matching patterns", async () => { + writeFileSync(join(sourceDir, "guide.md"), "# Guide\n\nContent"); + writeFileSync(join(sourceDir, "draft.md"), "# Draft\n\nNot ready"); + + const pack = await createPackFromSource({ + name: 
"exclude-test", + from: [sourceDir], + exclude: ["draft.md"], + }); + + expect(pack.documents).toHaveLength(1); + expect(pack.documents[0]!.title).toBe("guide"); + }); + + it("should recurse into subdirectories by default", async () => { + const { mkdirSync } = await import("node:fs"); + const subDir = join(sourceDir, "sub"); + mkdirSync(subDir); + writeFileSync(join(sourceDir, "root.md"), "# Root"); + writeFileSync(join(subDir, "nested.md"), "# Nested\n\nDeep content"); + + const pack = await createPackFromSource({ + name: "recursive-test", + from: [sourceDir], + }); + + expect(pack.documents).toHaveLength(2); + expect(pack.documents.map((d) => d.title).sort()).toEqual(["nested", "root"]); + }); + + it("should not recurse when recursive is false", async () => { + const { mkdirSync } = await import("node:fs"); + const subDir = join(sourceDir, "sub"); + mkdirSync(subDir); + writeFileSync(join(sourceDir, "root.md"), "# Root"); + writeFileSync(join(subDir, "nested.md"), "# Nested"); + + const pack = await createPackFromSource({ + name: "no-recurse", + from: [sourceDir], + recursive: false, + }); + + expect(pack.documents).toHaveLength(1); + expect(pack.documents[0]!.title).toBe("root"); + }); + + it("should throw for empty pack name", async () => { + await expect(createPackFromSource({ name: " ", from: [sourceDir] })).rejects.toThrow( + /Pack name is required/, + ); + }); + + it("should throw for empty from array", async () => { + await expect(createPackFromSource({ name: "test", from: [] })).rejects.toThrow( + /At least one --from source is required/, + ); + }); + + it("should throw for non-existent source path", async () => { + await expect( + createPackFromSource({ name: "test", from: ["/nonexistent/path/xyz"] }), + ).rejects.toThrow(/does not exist/); + }); + + it("should throw when no documents could be created", async () => { + // Empty directory — no parseable files + await expect(createPackFromSource({ name: "empty", from: [sourceDir] })).rejects.toThrow( + 
/No documents could be created/, + ); + }); + + it("should skip files without a parser", async () => { + writeFileSync(join(sourceDir, "data.bin"), "binary stuff"); + writeFileSync(join(sourceDir, "readme.md"), "# Readme\n\nContent"); + + const pack = await createPackFromSource({ + name: "skip-unsupported", + from: [sourceDir], + }); + + expect(pack.documents).toHaveLength(1); + expect(pack.documents[0]!.title).toBe("readme"); + }); + + it("should skip files with empty content after parsing", async () => { + writeFileSync(join(sourceDir, "empty.md"), " "); + writeFileSync(join(sourceDir, "real.md"), "# Real\n\nActual content"); + + const pack = await createPackFromSource({ + name: "skip-empty", + from: [sourceDir], + }); + + expect(pack.documents).toHaveLength(1); + expect(pack.documents[0]!.title).toBe("real"); + }); + + it("should accept a single file as source", async () => { + const filePath = join(sourceDir, "single.md"); + writeFileSync(filePath, "# Single File\n\nJust one file."); + + const pack = await createPackFromSource({ + name: "single-file", + from: [filePath], + }); + + expect(pack.documents).toHaveLength(1); + expect(pack.documents[0]!.title).toBe("single"); + }); + + it("should accept multiple sources", async () => { + const dir2 = mkdtempSync(join(tmpdir(), "libscope-pack-source2-")); + writeFileSync(join(sourceDir, "a.md"), "# A\n\nFrom dir 1"); + writeFileSync(join(dir2, "b.md"), "# B\n\nFrom dir 2"); + + const pack = await createPackFromSource({ + name: "multi-source", + from: [sourceDir, dir2], + }); + + expect(pack.documents).toHaveLength(2); + }); + + it("should call onProgress callback", async () => { + writeFileSync(join(sourceDir, "a.md"), "# A"); + writeFileSync(join(sourceDir, "b.md"), "# B"); + + const progress: Array<{ file: string; index: number; total: number }> = []; + + await createPackFromSource({ + name: "progress-test", + from: [sourceDir], + onProgress: (info) => progress.push(info), + }); + + expect(progress).toHaveLength(2); 
+ expect(progress[0]!.index).toBe(0); + expect(progress[0]!.total).toBe(2); + expect(progress[1]!.index).toBe(1); + }); + + it("should set custom version, description, author", async () => { + writeFileSync(join(sourceDir, "doc.md"), "# Doc\n\nContent"); + + const pack = await createPackFromSource({ + name: "custom-meta", + from: [sourceDir], + version: "2.0.0", + description: "Custom desc", + author: "Test Author", + }); + + expect(pack.version).toBe("2.0.0"); + expect(pack.description).toBe("Custom desc"); + expect(pack.metadata.author).toBe("Test Author"); + }); + + it("should produce a valid pack that passes validatePack", async () => { + writeFileSync(join(sourceDir, "doc.md"), "# Doc\n\nSome content here"); + const outputPath = join(tempDir, "validate-test.json"); + + await createPackFromSource({ + name: "validate-test", + from: [sourceDir], + outputPath, + }); + + // Read and re-validate through installPack (which calls validatePack internally) + const result = await installPack(db, provider, outputPath); + expect(result.packName).toBe("validate-test"); + expect(result.documentsInstalled).toBe(1); + }); + + it("should handle HTML files", async () => { + writeFileSync( + join(sourceDir, "page.html"), + "<html><head><title>Test

Hello

World

", + ); + + const pack = await createPackFromSource({ + name: "html-test", + from: [sourceDir], + }); + + expect(pack.documents).toHaveLength(1); + expect(pack.documents[0]!.title).toBe("page"); + expect(pack.documents[0]!.content).toContain("Hello"); + expect(pack.documents[0]!.content).toContain("World"); + }); + + it("should exclude with wildcard patterns", async () => { + const { mkdirSync } = await import("node:fs"); + const assetsDir = join(sourceDir, "assets"); + mkdirSync(assetsDir); + writeFileSync(join(sourceDir, "readme.md"), "# Readme\n\nContent"); + writeFileSync(join(assetsDir, "data.md"), "# Asset data"); + + const pack = await createPackFromSource({ + name: "wildcard-exclude", + from: [sourceDir], + exclude: ["assets/**"], + }); + + expect(pack.documents).toHaveLength(1); + expect(pack.documents[0]!.title).toBe("readme"); + }); + + it("should write gzipped pack when output ends in .gz", async () => { + writeFileSync(join(sourceDir, "doc.md"), "# Doc\n\nContent here."); + const outputPath = join(tempDir, "test.json.gz"); + + await createPackFromSource({ + name: "gzip-test", + from: [sourceDir], + outputPath, + }); + + expect(existsSync(outputPath)).toBe(true); + const raw = readFileSync(outputPath); + // Verify gzip magic bytes + expect(raw[0]).toBe(0x1f); + expect(raw[1]).toBe(0x8b); + // Decompress and verify JSON + const json = gunzipSync(raw).toString("utf-8"); + const parsed = JSON.parse(json) as KnowledgePack; + expect(parsed.name).toBe("gzip-test"); + expect(parsed.documents).toHaveLength(1); + }); + + it("should write plain JSON when output ends in .json", async () => { + writeFileSync(join(sourceDir, "doc.md"), "# Doc\n\nContent here."); + const outputPath = join(tempDir, "test.json"); + + await createPackFromSource({ + name: "json-test", + from: [sourceDir], + outputPath, + }); + + const raw = readFileSync(outputPath, "utf-8"); + const parsed = JSON.parse(raw) as KnowledgePack; + expect(parsed.name).toBe("json-test"); + }); + }); + + 
describe("gzip pack install", () => { + it("should install a gzipped pack file", async () => { + const pack = makeSamplePack({ name: "gz-pack" }); + const packPath = join(tempDir, "gz-pack.json.gz"); + writeFileSync(packPath, gzipSync(Buffer.from(JSON.stringify(pack), "utf-8"))); + + const result = await installPack(db, provider, packPath); + + expect(result.packName).toBe("gz-pack"); + expect(result.documentsInstalled).toBe(2); + expect(result.alreadyInstalled).toBe(false); + }); + + it("should auto-detect gzip by magic bytes even with .json extension", async () => { + const pack = makeSamplePack({ name: "magic-detect" }); + const packPath = join(tempDir, "magic-detect.json"); + // Write gzipped content but with .json extension + writeFileSync(packPath, gzipSync(Buffer.from(JSON.stringify(pack), "utf-8"))); + + const result = await installPack(db, provider, packPath); + + expect(result.packName).toBe("magic-detect"); + expect(result.documentsInstalled).toBe(2); + }); + + it("should round-trip: create gzipped pack from source then install it", async () => { + const rtDir = mkdtempSync(join(tmpdir(), "libscope-pack-rt-")); + writeFileSync(join(rtDir, "guide.md"), "# Guide\n\nThis is a guide."); + const packPath = join(tempDir, "roundtrip.json.gz"); + + await createPackFromSource({ + name: "roundtrip-pack", + from: [rtDir], + outputPath: packPath, + }); + + const result = await installPack(db, provider, packPath); + expect(result.packName).toBe("roundtrip-pack"); + expect(result.documentsInstalled).toBe(1); + }); + }); + + describe("installPack — batch & progress options", () => { + it("should report progress via onProgress callback", async () => { + const pack = makeSamplePack(); + const packPath = join(tempDir, "progress-pack.json"); + writeFileSync(packPath, JSON.stringify(pack), "utf-8"); + + const calls: Array<{ current: number; total: number; label: string }> = []; + await installPack(db, provider, packPath, { + onProgress: (current, total, label) => { + 
calls.push({ current, total, label }); + }, + }); + + // Should have called onProgress at least once (one batch covering both docs) + expect(calls.length).toBeGreaterThan(0); + // Last call should report all docs processed + const last = calls[calls.length - 1]!; + expect(last.current).toBe(2); + expect(last.total).toBe(2); + }); + + it("should process in smaller batches when batchSize=1", async () => { + const pack = makeSamplePack(); + const packPath = join(tempDir, "batch1-pack.json"); + writeFileSync(packPath, JSON.stringify(pack), "utf-8"); + + const calls: number[] = []; + await installPack(db, provider, packPath, { + batchSize: 1, + onProgress: (current) => calls.push(current), + }); + + // With batchSize=1 and 2 docs, should get 2 progress calls + expect(calls).toEqual([1, 2]); + }); + + it("should skip documents when resumeFrom is set", async () => { + const pack = makeSamplePack({ + name: "resume-pack", + documents: [ + { title: "Doc 1", content: "Content one", source: "" }, + { title: "Doc 2", content: "Content two", source: "" }, + { title: "Doc 3", content: "Content three", source: "" }, + ], + }); + const packPath = join(tempDir, "resume-pack.json"); + writeFileSync(packPath, JSON.stringify(pack), "utf-8"); + + const result = await installPack(db, provider, packPath, { resumeFrom: 2 }); + + // Should only install doc 3 (skipped first 2) + expect(result.documentsInstalled).toBe(1); + expect(result.packName).toBe("resume-pack"); + }); + + it("should count errors when embedBatch fails", async () => { + const pack = makeSamplePack({ name: "err-pack" }); + const packPath = join(tempDir, "err-pack.json"); + writeFileSync(packPath, JSON.stringify(pack), "utf-8"); + + const failProvider = new MockEmbeddingProvider(); + failProvider.embedBatch = vi.fn().mockRejectedValue(new Error("embed failed")); + + const result = await installPack(db, failProvider, packPath); + + // embedBatch failure means documents in that batch are skipped + 
expect(result.errors).toBeGreaterThan(0); + expect(result.documentsInstalled).toBe(0); + }); + + it("should include errors=0 on successful install", async () => { + const pack = makeSamplePack({ name: "ok-pack" }); + const packPath = join(tempDir, "ok-pack.json"); + writeFileSync(packPath, JSON.stringify(pack), "utf-8"); + + const result = await installPack(db, provider, packPath); + + expect(result.errors).toBe(0); + expect(result.documentsInstalled).toBe(2); + }); + + it("should use a single embedBatch call per batch for efficiency", async () => { + const pack = makeSamplePack({ name: "batch-efficiency" }); + const packPath = join(tempDir, "batch-eff.json"); + writeFileSync(packPath, JSON.stringify(pack), "utf-8"); + + await installPack(db, provider, packPath, { batchSize: 10 }); + + // 2 docs in one batch → 1 embedBatch call + expect(provider.embedBatchCallCount).toBe(1); + }); + + it("should return errors=0 for already-installed pack", async () => { + const pack = makeSamplePack({ name: "already-pack" }); + const packPath = join(tempDir, "already-pack.json"); + writeFileSync(packPath, JSON.stringify(pack), "utf-8"); + + await installPack(db, provider, packPath); + const result = await installPack(db, provider, packPath); + + expect(result.alreadyInstalled).toBe(true); + expect(result.errors).toBe(0); + }); + }); + + describe("installPack — concurrency option", () => { + it("should install all docs correctly with concurrency=1 (sequential)", async () => { + const pack = makeSamplePack({ + name: "concurrent-1", + documents: [ + { title: "Doc A", content: "# Doc A\n\nContent A.", source: "" }, + { title: "Doc B", content: "# Doc B\n\nContent B.", source: "" }, + { title: "Doc C", content: "# Doc C\n\nContent C.", source: "" }, + ], + }); + const packPath = join(tempDir, "concurrent-1.json"); + writeFileSync(packPath, JSON.stringify(pack), "utf-8"); + + const result = await installPack(db, provider, packPath, { batchSize: 1, concurrency: 1 }); + + 
expect(result.documentsInstalled).toBe(3); + expect(result.errors).toBe(0); + }); + + it("should install all docs correctly with concurrency=4 (parallel)", async () => { + const pack = makeSamplePack({ + name: "concurrent-4", + documents: [ + { title: "Doc A", content: "# Doc A\n\nContent A.", source: "" }, + { title: "Doc B", content: "# Doc B\n\nContent B.", source: "" }, + { title: "Doc C", content: "# Doc C\n\nContent C.", source: "" }, + { title: "Doc D", content: "# Doc D\n\nContent D.", source: "" }, + ], + }); + const packPath = join(tempDir, "concurrent-4.json"); + writeFileSync(packPath, JSON.stringify(pack), "utf-8"); + + const result = await installPack(db, provider, packPath, { batchSize: 1, concurrency: 4 }); + + expect(result.documentsInstalled).toBe(4); + expect(result.errors).toBe(0); + + // Verify all 4 docs are in the DB + const docs = db + .prepare("SELECT id FROM documents WHERE pack_name = ?") + .all("concurrent-4") as Array<{ id: string }>; + expect(docs.length).toBe(4); + }); + + it("should make multiple embedBatch calls with small batchSize and high concurrency", async () => { + const pack = makeSamplePack({ + name: "multi-batch", + documents: [ + { title: "Doc 1", content: "Content 1", source: "" }, + { title: "Doc 2", content: "Content 2", source: "" }, + { title: "Doc 3", content: "Content 3", source: "" }, + { title: "Doc 4", content: "Content 4", source: "" }, + ], + }); + const packPath = join(tempDir, "multi-batch.json"); + writeFileSync(packPath, JSON.stringify(pack), "utf-8"); + + await installPack(db, provider, packPath, { batchSize: 2, concurrency: 2 }); + + // 4 docs with batchSize=2 → 2 batches → 2 embedBatch calls + expect(provider.embedBatchCallCount).toBe(2); + }); + + it("should not exceed concurrency limit for embed calls", async () => { + // Track the maximum number of concurrent embedBatch calls in flight + let maxConcurrent = 0; + let activeCalls = 0; + let totalCalls = 0; + + const trackingProvider = new 
MockEmbeddingProvider(); + trackingProvider.embedBatch = vi.fn().mockImplementation((texts: string[]) => { + totalCalls++; + activeCalls++; + maxConcurrent = Math.max(maxConcurrent, activeCalls); + // Simulate slight async delay so concurrent calls can overlap + return Promise.resolve().then(() => { + activeCalls--; + return texts.map(() => [0.1, 0.2, 0.3, 0.4]); + }); + }); + + const pack = makeSamplePack({ + name: "concurrency-limit", + documents: Array.from({ length: 8 }, (_, i) => ({ + title: `Doc ${i}`, + content: `Content ${i}`, + source: "", + })), + }); + const packPath = join(tempDir, "concurrency-limit.json"); + writeFileSync(packPath, JSON.stringify(pack), "utf-8"); + + await installPack(db, trackingProvider, packPath, { batchSize: 1, concurrency: 3 }); + + // Should never exceed the concurrency limit of 3 + expect(maxConcurrent).toBeLessThanOrEqual(3); + // Should have made 8 embedBatch calls (8 docs, batchSize=1) + expect(totalCalls).toBe(8); + }); + + it("should report progress after each batch when embedding concurrently", async () => { + const pack = makeSamplePack({ + name: "concurrent-progress", + documents: [ + { title: "Doc A", content: "Content A", source: "" }, + { title: "Doc B", content: "Content B", source: "" }, + { title: "Doc C", content: "Content C", source: "" }, + ], + }); + const packPath = join(tempDir, "concurrent-progress.json"); + writeFileSync(packPath, JSON.stringify(pack), "utf-8"); + + const calls: Array<{ current: number; total: number }> = []; + await installPack(db, provider, packPath, { + batchSize: 1, + concurrency: 2, + onProgress: (current, total) => calls.push({ current, total }), + }); + + // Should have 3 progress calls (one per batch/doc with batchSize=1) + expect(calls).toHaveLength(3); + // Final call should report all docs processed + expect(calls[calls.length - 1]!.current).toBe(3); + expect(calls[calls.length - 1]!.total).toBe(3); + }); + + it("should count errors correctly when some batches fail during 
concurrent embedding", async () => { + let callCount = 0; + const partialFailProvider = new MockEmbeddingProvider(); + partialFailProvider.embedBatch = vi.fn().mockImplementation(() => { + callCount++; + if (callCount % 2 === 0) { + return Promise.reject(new Error("embed failed")); + } + return Promise.resolve([[0.1, 0.2, 0.3, 0.4]]); + }); + + const pack = makeSamplePack({ + name: "partial-fail", + documents: [ + { title: "Doc 1", content: "Content 1", source: "" }, + { title: "Doc 2", content: "Content 2", source: "" }, + { title: "Doc 3", content: "Content 3", source: "" }, + { title: "Doc 4", content: "Content 4", source: "" }, + ], + }); + const packPath = join(tempDir, "partial-fail.json"); + writeFileSync(packPath, JSON.stringify(pack), "utf-8"); + + const result = await installPack(db, partialFailProvider, packPath, { + batchSize: 1, + concurrency: 4, + }); + + // 4 docs, batchSize=1 → 4 batches; even-numbered calls fail → 2 errors, 2 installed + expect(result.errors).toBe(2); + expect(result.documentsInstalled).toBe(2); + }); + }); }); diff --git a/tests/unit/parsers.test.ts b/tests/unit/parsers.test.ts index c30a614..5fc5ab3 100644 --- a/tests/unit/parsers.test.ts +++ b/tests/unit/parsers.test.ts @@ -5,6 +5,7 @@ import { PlainTextParser } from "../../src/core/parsers/text.js"; import { JsonParser } from "../../src/core/parsers/json-parser.js"; import { YamlParser } from "../../src/core/parsers/yaml.js"; import { CsvParser } from "../../src/core/parsers/csv.js"; +import { HtmlParser } from "../../src/core/parsers/html.js"; import { ValidationError } from "../../src/errors.js"; describe("getParserForFile", () => { @@ -44,6 +45,14 @@ describe("getParserForFile", () => { expect(getParserForFile("document.docx")).not.toBeNull(); }); + it("returns parser for .html files", () => { + expect(getParserForFile("page.html")).not.toBeNull(); + }); + + it("returns parser for .htm files", () => { + expect(getParserForFile("page.htm")).not.toBeNull(); + }); + it("returns 
null for unsupported extensions", () => { expect(getParserForFile("image.png")).toBeNull(); expect(getParserForFile("archive.zip")).toBeNull(); @@ -66,6 +75,8 @@ describe("getSupportedExtensions", () => { expect(exts).toContain(".pdf"); expect(exts).toContain(".docx"); expect(exts).toContain(".txt"); + expect(exts).toContain(".html"); + expect(exts).toContain(".htm"); // Should be sorted const sorted = [...exts].sort(); expect(exts).toEqual(sorted); @@ -215,3 +226,78 @@ describe("WordParser", () => { await expect(parser.parse(Buffer.from("not a docx"))).rejects.toThrow(ValidationError); }); }); + +describe("HtmlParser", () => { + const parser = new HtmlParser(); + + it("has .html and .htm extensions", () => { + expect(parser.extensions).toEqual([".html", ".htm"]); + }); + + it("converts basic HTML to markdown", async () => { + const html = "

Hello

This is a test.

"; + const result = await parser.parse(Buffer.from(html)); + expect(result).toContain("Hello"); + expect(result).toContain("**test**"); + }); + + it("strips script tags", async () => { + const html = '

Content

More

'; + const result = await parser.parse(Buffer.from(html)); + expect(result).toContain("Content"); + expect(result).toContain("More"); + expect(result).not.toContain("alert"); + expect(result).not.toContain("script"); + }); + + it("strips style tags", async () => { + const html = "

Visible

"; + const result = await parser.parse(Buffer.from(html)); + expect(result).toContain("Visible"); + expect(result).not.toContain("color"); + }); + + it("strips nav tags", async () => { + const html = + "

Article

"; + const result = await parser.parse(Buffer.from(html)); + expect(result).toContain("Article"); + expect(result).not.toContain("Home"); + }); + + it("handles full HTML documents with doctype and head", async () => { + const html = ` +Test Page +

Main Title

Body text here.

`; + const result = await parser.parse(Buffer.from(html)); + expect(result).toContain("Main Title"); + expect(result).toContain("Body text here"); + expect(result).not.toContain("color: blue"); + }); + + it("converts links to markdown format", async () => { + const html = 'Click here'; + const result = await parser.parse(Buffer.from(html)); + expect(result).toContain("[Click here]"); + expect(result).toContain("https://example.com"); + }); + + it("converts lists to markdown", async () => { + const html = "
  • One
  • Two
  • Three
"; + const result = await parser.parse(Buffer.from(html)); + expect(result).toContain("One"); + expect(result).toContain("Two"); + expect(result).toContain("Three"); + }); + + it("handles empty HTML gracefully", async () => { + const result = await parser.parse(Buffer.from("")); + expect(result).toBe(""); + }); + + it("collapses excessive blank lines", async () => { + const html = "

First

Second

"; + const result = await parser.parse(Buffer.from(html)); + expect(result).not.toMatch(/\n{3,}/); + }); +}); diff --git a/tests/unit/reporter.test.ts b/tests/unit/reporter.test.ts new file mode 100644 index 0000000..b63d8d7 --- /dev/null +++ b/tests/unit/reporter.test.ts @@ -0,0 +1,172 @@ +import { describe, it, expect, vi, afterEach } from "vitest"; +import { isVerbose, createReporter } from "../../src/cli/reporter.js"; + +describe("reporter", () => { + afterEach(() => { + delete process.env["LIBSCOPE_VERBOSE"]; + vi.restoreAllMocks(); + }); + + describe("isVerbose", () => { + it("returns true when verbose flag is set", () => { + expect(isVerbose(true)).toBe(true); + }); + + it("returns false when verbose flag is false", () => { + expect(isVerbose(false)).toBe(false); + }); + + it("returns false when verbose flag is undefined", () => { + expect(isVerbose(undefined)).toBe(false); + }); + + it("returns true when LIBSCOPE_VERBOSE=1 env var is set", () => { + process.env["LIBSCOPE_VERBOSE"] = "1"; + expect(isVerbose(false)).toBe(true); + }); + + it("returns false when LIBSCOPE_VERBOSE=0", () => { + process.env["LIBSCOPE_VERBOSE"] = "0"; + expect(isVerbose(false)).toBe(false); + }); + }); + + describe("createReporter", () => { + it("returns a SilentReporter (no-op) in verbose mode", () => { + const reporter = createReporter(true); + const stdout = vi.spyOn(process.stdout, "write").mockImplementation(() => true); + const stderr = vi.spyOn(process.stderr, "write").mockImplementation(() => true); + + reporter.log("hello"); + reporter.success("done"); + reporter.warn("careful"); + reporter.error("bad"); + reporter.progress(1, 10, "task"); + reporter.clearProgress(); + + expect(stdout).not.toHaveBeenCalled(); + expect(stderr).not.toHaveBeenCalled(); + }); + + it("returns a SilentReporter when LIBSCOPE_VERBOSE=1", () => { + process.env["LIBSCOPE_VERBOSE"] = "1"; + const reporter = createReporter(); + const stdout = vi.spyOn(process.stdout, "write").mockImplementation(() => 
true); + + reporter.log("hello"); + expect(stdout).not.toHaveBeenCalled(); + }); + + it("PrettyReporter.log writes to stdout", () => { + const reporter = createReporter(false); + const stdout = vi.spyOn(process.stdout, "write").mockImplementation(() => true); + + reporter.log("test message"); + + expect(stdout).toHaveBeenCalledOnce(); + expect(String(stdout.mock.calls[0]![0])).toContain("test message"); + }); + + it("PrettyReporter.success writes green checkmark to stdout", () => { + const reporter = createReporter(false); + const stdout = vi.spyOn(process.stdout, "write").mockImplementation(() => true); + + reporter.success("all done"); + + const output = String(stdout.mock.calls[0]![0]); + expect(output).toContain("all done"); + // Green ANSI code + expect(output).toContain("\x1b[32m"); + }); + + it("PrettyReporter.warn writes to stderr", () => { + const reporter = createReporter(false); + const stderr = vi.spyOn(process.stderr, "write").mockImplementation(() => true); + + reporter.warn("watch out"); + + expect(stderr).toHaveBeenCalledOnce(); + expect(String(stderr.mock.calls[0]![0])).toContain("watch out"); + }); + + it("PrettyReporter.error writes to stderr", () => { + const reporter = createReporter(false); + const stderr = vi.spyOn(process.stderr, "write").mockImplementation(() => true); + + reporter.error("something failed"); + + expect(stderr).toHaveBeenCalledOnce(); + expect(String(stderr.mock.calls[0]![0])).toContain("something failed"); + }); + + it("PrettyReporter.progress writes \\r-prefixed line to stdout", () => { + const reporter = createReporter(false); + const stdout = vi.spyOn(process.stdout, "write").mockImplementation(() => true); + + reporter.progress(3, 10, "indexing doc"); + + const output = String(stdout.mock.calls[0]![0]); + expect(output).toMatch(/^\r/); + expect(output).toContain("3/10"); + expect(output).toContain("30%"); + }); + + it("PrettyReporter.clearProgress clears the progress line", () => { + const reporter = 
createReporter(false); + const stdout = vi.spyOn(process.stdout, "write").mockImplementation(() => true); + + reporter.progress(1, 5, "working"); + stdout.mockClear(); + + reporter.clearProgress(); + + // Should write spaces to clear the line + const output = String(stdout.mock.calls[0]![0]); + expect(output).toMatch(/^\r\s+\r$/); + }); + + it("PrettyReporter.clearProgress is a no-op when no progress shown", () => { + const reporter = createReporter(false); + const stdout = vi.spyOn(process.stdout, "write").mockImplementation(() => true); + + reporter.clearProgress(); + + expect(stdout).not.toHaveBeenCalled(); + }); + + it("PrettyReporter.log clears progress before writing", () => { + const reporter = createReporter(false); + const stdout = vi.spyOn(process.stdout, "write").mockImplementation(() => true); + + reporter.progress(1, 5, "working"); + stdout.mockClear(); + + reporter.log("a message"); + + // First call should be the clear, second the message + expect(stdout.mock.calls.length).toBeGreaterThanOrEqual(2); + const clearCall = String(stdout.mock.calls[0]![0]); + expect(clearCall).toMatch(/^\r\s+\r$/); + }); + + it("PrettyReporter.progress truncates long labels", () => { + const reporter = createReporter(false); + const stdout = vi.spyOn(process.stdout, "write").mockImplementation(() => true); + + reporter.progress(1, 1, "a".repeat(50)); + + const output = String(stdout.mock.calls[0]![0]); + expect(output).toContain("..."); + }); + + it("PrettyReporter.progress handles zero total gracefully", () => { + const reporter = createReporter(false); + const stdout = vi.spyOn(process.stdout, "write").mockImplementation(() => true); + + reporter.progress(0, 0, "starting"); + + const output = String(stdout.mock.calls[0]![0]); + expect(output).toContain("0%"); + }); + }); +}); diff --git a/tests/unit/saved-searches.test.ts b/tests/unit/saved-searches.test.ts index 8e20a1b..b2eb001 100644 --- a/tests/unit/saved-searches.test.ts +++ b/tests/unit/saved-searches.test.ts @@ 
-11,6 +11,9 @@ import { import { indexDocument } from "../../src/core/indexing.js"; import { ValidationError, DocumentNotFoundError } from "../../src/errors.js"; import type Database from "better-sqlite3"; +import { initLogger } from "../../src/logger.js"; + +initLogger("silent"); describe("saved-searches", () => { let db: Database.Database; @@ -191,5 +194,18 @@ describe("saved-searches", () => { const fetched = getSavedSearch(db, created.id); expect(fetched.filters).toBeNull(); }); + + it("should default to null when filters JSON is corrupted", () => { + // Directly insert a row with invalid JSON in the filters column + db.prepare("INSERT INTO saved_searches (id, name, query, filters) VALUES (?, ?, ?, ?)").run( + "corrupt-ss", + "Corrupt Search", + "test query", + "{not valid json", + ); + + const fetched = getSavedSearch(db, "corrupt-ss"); + expect(fetched.filters).toBeNull(); + }); }); }); diff --git a/tests/unit/spider.test.ts b/tests/unit/spider.test.ts new file mode 100644 index 0000000..24d9348 --- /dev/null +++ b/tests/unit/spider.test.ts @@ -0,0 +1,497 @@ +import { describe, it, expect, vi, beforeEach, afterEach } from "vitest"; + +// ── Mock fetchRaw so we don't make real network requests ───────────────────── +const mockFetchRaw = vi.fn(); +vi.mock("../../src/core/url-fetcher.js", () => ({ + fetchRaw: (...args: unknown[]): unknown => mockFetchRaw(...args), + DEFAULT_FETCH_OPTIONS: { + timeout: 30_000, + maxRedirects: 5, + maxBodySize: 10 * 1024 * 1024, + allowPrivateUrls: false, + allowSelfSignedCerts: false, + }, +})); + +// ── Import spider after mock is set up ─────────────────────────────────────── +const { spiderUrl } = await import("../../src/core/spider.js"); + +// ── Helpers ────────────────────────────────────────────────────────────────── + +function htmlPage(title: string, links: string[] = [], body = ""): string { + const anchors = links.map((href) => `link`).join("\n"); + return `${title}${anchors}${body}`; +} + +function pageResponse(html: 
string, url = "https://example.com/") { + return { + body: html, + contentType: "text/html; charset=utf-8", + finalUrl: url, + }; +} + +/** Collect all yielded values from an async generator. */ +async function collectPages(gen: ReturnType): Promise<{ + pages: Array<{ url: string; title: string; depth: number }>; + stats: Awaited> extends { value: infer V } ? V : unknown; +}> { + const pages = []; + let result = await gen.next(); + while (!result.done) { + const v = result.value as { url: string; title: string; depth: number }; + pages.push({ url: v.url, title: v.title, depth: v.depth }); + result = await gen.next(); + } + return { pages, stats: result.value }; +} + +// ── Tests ──────────────────────────────────────────────────────────────────── + +describe("spiderUrl", () => { + beforeEach(() => { + mockFetchRaw.mockReset(); + // Default: robots.txt not found + mockFetchRaw.mockImplementation((url: string) => { + if (url.endsWith("/robots.txt")) { + return Promise.reject(new Error("404")); + } + return Promise.resolve(pageResponse(htmlPage("Page", []), url)); + }); + // Speed up tests by removing inter-request delay + vi.useFakeTimers(); + }); + + afterEach(() => { + vi.useRealTimers(); + }); + + it("yields the seed page with depth 0", async () => { + mockFetchRaw.mockImplementation((url: string) => { + if (url.endsWith("/robots.txt")) return Promise.reject(new Error("404")); + return Promise.resolve(pageResponse(htmlPage("Seed Page"), url)); + }); + + const gen = spiderUrl("https://example.com/", { maxPages: 1, requestDelay: 0 }); + const result = await gen.next(); + expect(result.done).toBe(false); + const page = result.value as { url: string; title: string; depth: number }; + expect(page.url).toBe("https://example.com/"); + expect(page.title).toBe("Seed Page"); + expect(page.depth).toBe(0); + }); + + it("follows links up to maxDepth", async () => { + mockFetchRaw.mockImplementation((url: string) => { + if (url.endsWith("/robots.txt")) return Promise.reject(new 
Error("404")); + if (url === "https://example.com/") { + return Promise.resolve(pageResponse(htmlPage("Root", ["https://example.com/child"]), url)); + } + if (url === "https://example.com/child") { + return Promise.resolve( + pageResponse(htmlPage("Child", ["https://example.com/grandchild"]), url), + ); + } + if (url === "https://example.com/grandchild") { + return Promise.resolve(pageResponse(htmlPage("Grandchild", []), url)); + } + return Promise.reject(new Error("unexpected")); + }); + + const gen = spiderUrl("https://example.com/", { maxDepth: 2, maxPages: 10, requestDelay: 0 }); + const { pages } = await collectPages(gen); + + expect(pages.map((p) => p.url)).toContain("https://example.com/"); + expect(pages.map((p) => p.url)).toContain("https://example.com/child"); + expect(pages.map((p) => p.url)).toContain("https://example.com/grandchild"); + // depth 3 should not appear + expect(pages.every((p) => p.depth <= 2)).toBe(true); + }); + + it("does not follow links beyond maxDepth", async () => { + mockFetchRaw.mockImplementation((url: string) => { + if (url.endsWith("/robots.txt")) return Promise.reject(new Error("404")); + if (url === "https://example.com/") { + return Promise.resolve(pageResponse(htmlPage("Root", ["https://example.com/child"]), url)); + } + if (url === "https://example.com/child") { + return Promise.resolve( + pageResponse(htmlPage("Child", ["https://example.com/grandchild"]), url), + ); + } + // grandchild should NOT be fetched at maxDepth=1 + return Promise.reject(new Error("should not fetch this")); + }); + + const gen = spiderUrl("https://example.com/", { maxDepth: 1, maxPages: 10, requestDelay: 0 }); + const { pages } = await collectPages(gen); + + const urls = pages.map((p) => p.url); + expect(urls).toContain("https://example.com/"); + expect(urls).toContain("https://example.com/child"); + expect(urls).not.toContain("https://example.com/grandchild"); + }); + + it("enforces maxPages hard cap", async () => { + // Return the same page with 
5 links each time + mockFetchRaw.mockImplementation((url: string) => { + if (url.endsWith("/robots.txt")) return Promise.reject(new Error("404")); + const links = [1, 2, 3, 4, 5].map((i) => `https://example.com/page${i}`); + return Promise.resolve(pageResponse(htmlPage("Page", links), url)); + }); + + const gen = spiderUrl("https://example.com/", { maxPages: 3, maxDepth: 5, requestDelay: 0 }); + const { pages, stats } = await collectPages(gen); + + expect(pages.length).toBeLessThanOrEqual(3); + expect((stats as { pagesFetched: number }).pagesFetched).toBeLessThanOrEqual(3); + }); + + it("does not visit the same URL twice (cycle detection)", async () => { + // Page A links to B, B links back to A + mockFetchRaw.mockImplementation((url: string) => { + if (url.endsWith("/robots.txt")) return Promise.reject(new Error("404")); + if (url === "https://example.com/a") { + return Promise.resolve(pageResponse(htmlPage("A", ["https://example.com/b"]), url)); + } + if (url === "https://example.com/b") { + return Promise.resolve(pageResponse(htmlPage("B", ["https://example.com/a"]), url)); + } + return Promise.reject(new Error("unexpected")); + }); + + const gen = spiderUrl("https://example.com/a", { maxPages: 20, maxDepth: 5, requestDelay: 0 }); + const { pages } = await collectPages(gen); + + // Should only visit a and b once each + const urls = pages.map((p) => p.url); + expect(urls.filter((u) => u === "https://example.com/a").length).toBe(1); + expect(urls.filter((u) => u === "https://example.com/b").length).toBe(1); + }); + + it("filters cross-domain links when sameDomain=true (default)", async () => { + mockFetchRaw.mockImplementation((url: string) => { + if (url.endsWith("/robots.txt")) return Promise.reject(new Error("404")); + return Promise.resolve( + pageResponse( + htmlPage("Root", ["https://other.com/page", "https://example.com/local"]), + url, + ), + ); + }); + + const gen = spiderUrl("https://example.com/", { + sameDomain: true, + maxPages: 10, + maxDepth: 1, + 
requestDelay: 0, + }); + const { pages } = await collectPages(gen); + + const urls = pages.map((p) => p.url); + expect(urls).not.toContain("https://other.com/page"); + expect(urls).toContain("https://example.com/local"); + }); + + it("allows cross-domain links when sameDomain=false", async () => { + mockFetchRaw.mockImplementation((url: string) => { + if (url.endsWith("/robots.txt")) return Promise.reject(new Error("404")); + if (url === "https://example.com/") { + return Promise.resolve(pageResponse(htmlPage("Root", ["https://other.com/page"]), url)); + } + return Promise.resolve(pageResponse(htmlPage("Other", []), url)); + }); + + const gen = spiderUrl("https://example.com/", { + sameDomain: false, + maxPages: 10, + maxDepth: 1, + requestDelay: 0, + }); + const { pages } = await collectPages(gen); + expect(pages.map((p) => p.url)).toContain("https://other.com/page"); + }); + + it("allows subdomain links when sameDomain=true", async () => { + mockFetchRaw.mockImplementation((url: string) => { + if (url.endsWith("/robots.txt")) return Promise.reject(new Error("404")); + if (url === "https://example.com/") { + return Promise.resolve( + pageResponse(htmlPage("Root", ["https://docs.example.com/guide"]), url), + ); + } + return Promise.resolve(pageResponse(htmlPage("Subdomain page", []), url)); + }); + + const gen = spiderUrl("https://example.com/", { + sameDomain: true, + maxPages: 10, + maxDepth: 1, + requestDelay: 0, + }); + const { pages } = await collectPages(gen); + expect(pages.map((p) => p.url)).toContain("https://docs.example.com/guide"); + }); + + it("filters links outside pathPrefix", async () => { + mockFetchRaw.mockImplementation((url: string) => { + if (url.endsWith("/robots.txt")) return Promise.reject(new Error("404")); + return Promise.resolve( + pageResponse( + htmlPage("Docs", ["https://example.com/docs/guide", "https://example.com/blog/post"]), + url, + ), + ); + }); + + const gen = spiderUrl("https://example.com/docs/", { + pathPrefix: "/docs", + 
maxPages: 10, + maxDepth: 1, + requestDelay: 0, + }); + const { pages } = await collectPages(gen); + const urls = pages.map((p) => p.url); + expect(urls).toContain("https://example.com/docs/guide"); + expect(urls).not.toContain("https://example.com/blog/post"); + }); + + it("skips URLs matching excludePatterns", async () => { + mockFetchRaw.mockImplementation((url: string) => { + if (url.endsWith("/robots.txt")) return Promise.reject(new Error("404")); + return Promise.resolve( + pageResponse( + htmlPage("Page", [ + "https://example.com/docs/guide", + "https://example.com/changelog/v2", + "https://example.com/api/v1/ref", + ]), + url, + ), + ); + }); + + const gen = spiderUrl("https://example.com/", { + excludePatterns: ["*/changelog*", "*/api/v1/*"], + maxPages: 10, + maxDepth: 1, + requestDelay: 0, + }); + const { pages } = await collectPages(gen); + const urls = pages.map((p) => p.url); + expect(urls).toContain("https://example.com/docs/guide"); + expect(urls).not.toContain("https://example.com/changelog/v2"); + expect(urls).not.toContain("https://example.com/api/v1/ref"); + }); + + it("skips URLs disallowed by robots.txt", async () => { + mockFetchRaw.mockImplementation((url: string) => { + if (url === "https://example.com/robots.txt") { + return Promise.resolve({ + body: "User-agent: *\nDisallow: /private/", + contentType: "text/plain", + finalUrl: url, + }); + } + return Promise.resolve( + pageResponse( + htmlPage("Root", [ + "https://example.com/public/page", + "https://example.com/private/secret", + ]), + url, + ), + ); + }); + + const gen = spiderUrl("https://example.com/", { + maxPages: 10, + maxDepth: 1, + requestDelay: 0, + }); + const { pages } = await collectPages(gen); + const urls = pages.map((p) => p.url); + expect(urls).toContain("https://example.com/public/page"); + expect(urls).not.toContain("https://example.com/private/secret"); + }); + + it("respects LibScope-specific robots.txt rules", async () => { + mockFetchRaw.mockImplementation((url: 
string) => { + if (url === "https://example.com/robots.txt") { + return Promise.resolve({ + body: "User-agent: libscope\nDisallow: /restricted/\nUser-agent: *\nDisallow:", + contentType: "text/plain", + finalUrl: url, + }); + } + return Promise.resolve( + pageResponse(htmlPage("Root", ["https://example.com/restricted/data"]), url), + ); + }); + + const gen = spiderUrl("https://example.com/", { maxPages: 10, maxDepth: 1, requestDelay: 0 }); + const { pages } = await collectPages(gen); + expect(pages.map((p) => p.url)).not.toContain("https://example.com/restricted/data"); + }); + + it("continues crawling when a single page fetch fails", async () => { + let callCount = 0; + mockFetchRaw.mockImplementation((url: string) => { + if (url.endsWith("/robots.txt")) return Promise.reject(new Error("404")); + if (url === "https://example.com/") { + return Promise.resolve( + pageResponse( + htmlPage("Root", ["https://example.com/good", "https://example.com/bad"]), + url, + ), + ); + } + if (url === "https://example.com/bad") { + callCount++; + return Promise.reject(new Error("connection refused")); + } + return Promise.resolve(pageResponse(htmlPage("Good", []), url)); + }); + + const gen = spiderUrl("https://example.com/", { maxPages: 10, maxDepth: 1, requestDelay: 0 }); + const { pages, stats } = await collectPages(gen); + + const urls = pages.map((p) => p.url); + expect(urls).toContain("https://example.com/"); + expect(urls).toContain("https://example.com/good"); + expect(urls).not.toContain("https://example.com/bad"); + expect((stats as { errors: Array<{ url: string }> }).errors.length).toBeGreaterThan(0); + expect(callCount).toBe(1); // fetched once, failed + }); + + it("returns SpiderStats from the generator return value", async () => { + mockFetchRaw.mockImplementation((url: string) => { + if (url.endsWith("/robots.txt")) return Promise.reject(new Error("404")); + return Promise.resolve(pageResponse(htmlPage("Page", ["https://example.com/child"]), url)); + }); + + const 
gen = spiderUrl("https://example.com/", { maxPages: 5, maxDepth: 1, requestDelay: 0 }); + const { stats } = await collectPages(gen); + const s = stats as { + pagesFetched: number; + pagesCrawled: number; + pagesSkipped: number; + errors: unknown[]; + }; + + expect(typeof s.pagesFetched).toBe("number"); + expect(typeof s.pagesCrawled).toBe("number"); + expect(typeof s.pagesSkipped).toBe("number"); + expect(Array.isArray(s.errors)).toBe(true); + expect(s.pagesFetched).toBeGreaterThan(0); + }); + + it("caps maxPages to the hard limit of 200", async () => { + // We just confirm that requesting 999 is capped — we test via stats.pagesFetched ≤ 200 + // In practice, our mock only has one page so pagesFetched will be 1. + // The important thing is that the option is accepted without error. + mockFetchRaw.mockImplementation((url: string) => { + if (url.endsWith("/robots.txt")) return Promise.reject(new Error("404")); + return Promise.resolve(pageResponse(htmlPage("Only Page", []), url)); + }); + + const gen = spiderUrl("https://example.com/", { maxPages: 999, maxDepth: 0, requestDelay: 0 }); + const { pages } = await collectPages(gen); + expect(pages.length).toBeLessThanOrEqual(200); + }); + + it("caps maxDepth to the hard limit of 5", async () => { + // Should not throw even when maxDepth: 100 is passed + mockFetchRaw.mockImplementation((url: string) => { + if (url.endsWith("/robots.txt")) return Promise.reject(new Error("404")); + return Promise.resolve(pageResponse(htmlPage("Page", []), url)); + }); + + // Should not throw — maxDepth is capped to hard limit internally + const gen = spiderUrl("https://example.com/", { maxDepth: 100, requestDelay: 0 }); + const { pages } = await collectPages(gen); + expect(pages.length).toBeGreaterThanOrEqual(1); + }); + + it("maxDepth=0 only fetches the seed page", async () => { + mockFetchRaw.mockImplementation((url: string) => { + if (url.endsWith("/robots.txt")) return Promise.reject(new Error("404")); + if (url === 
"https://example.com/") { + return Promise.resolve(pageResponse(htmlPage("Seed", ["https://example.com/child"]), url)); + } + return Promise.reject(new Error("should not fetch children at depth 0")); + }); + + const gen = spiderUrl("https://example.com/", { maxDepth: 0, maxPages: 10, requestDelay: 0 }); + const { pages } = await collectPages(gen); + + expect(pages.length).toBe(1); + expect(pages[0]!.url).toBe("https://example.com/"); + }); + + it("BFS: fetches pages breadth-first (children before grandchildren)", async () => { + const fetchOrder: string[] = []; + mockFetchRaw.mockImplementation((url: string) => { + if (url.endsWith("/robots.txt")) return Promise.reject(new Error("404")); + fetchOrder.push(url); + if (url === "https://example.com/") { + return Promise.resolve( + pageResponse(htmlPage("Root", ["https://example.com/a", "https://example.com/b"]), url), + ); + } + if (url === "https://example.com/a") { + return Promise.resolve(pageResponse(htmlPage("A", ["https://example.com/a1"]), url)); + } + if (url === "https://example.com/b") { + return Promise.resolve(pageResponse(htmlPage("B", []), url)); + } + return Promise.resolve(pageResponse(htmlPage("Leaf", []), url)); + }); + + const gen = spiderUrl("https://example.com/", { maxPages: 10, maxDepth: 2, requestDelay: 0 }); + await collectPages(gen); + + // root → a → b → a1 (BFS order: process all depth-1 before depth-2) + const idxRoot = fetchOrder.indexOf("https://example.com/"); + const idxA = fetchOrder.indexOf("https://example.com/a"); + const idxB = fetchOrder.indexOf("https://example.com/b"); + const idxA1 = fetchOrder.indexOf("https://example.com/a1"); + + expect(idxRoot).toBeLessThan(idxA); + expect(idxRoot).toBeLessThan(idxB); + // Both a and b (depth 1) should appear before a1 (depth 2) + expect(idxA).toBeLessThan(idxA1); + expect(idxB).toBeLessThan(idxA1); + }); + + it("handles plain text responses without crashing", async () => { + mockFetchRaw.mockImplementation((url: string) => { + if 
(url.endsWith("/robots.txt")) return Promise.reject(new Error("404")); + return Promise.resolve({ + body: "# Plain Text\n\nNo HTML here.", + contentType: "text/plain", + finalUrl: url, + }); + }); + + const gen = spiderUrl("https://example.com/notes.txt", { maxDepth: 0, requestDelay: 0 }); + const { pages } = await collectPages(gen); + expect(pages.length).toBe(1); + expect(pages[0]!.title).toBe("Plain Text"); + }); + + it("marks abortReason as maxPages when capped mid-crawl", async () => { + // Seed always returns a new unique link + let counter = 0; + mockFetchRaw.mockImplementation((url: string) => { + if (url.endsWith("/robots.txt")) return Promise.reject(new Error("404")); + counter++; + const links = [`https://example.com/page${counter + 100}`]; + return Promise.resolve(pageResponse(htmlPage(`Page ${counter}`, links), url)); + }); + + const gen = spiderUrl("https://example.com/", { maxPages: 2, maxDepth: 5, requestDelay: 0 }); + const { stats } = await collectPages(gen); + expect((stats as { abortReason?: string }).abortReason).toBe("maxPages"); + }); +}); diff --git a/tests/unit/update-document.test.ts b/tests/unit/update-document.test.ts index a754ef9..61f9e7e 100644 --- a/tests/unit/update-document.test.ts +++ b/tests/unit/update-document.test.ts @@ -1,4 +1,4 @@ -import { describe, it, expect, beforeEach } from "vitest"; +import { describe, it, expect, beforeEach, vi } from "vitest"; import { createTestDbWithVec } from "../fixtures/test-db.js"; import { getDocument, @@ -87,17 +87,21 @@ describe("updateDocument", () => { }); it("should update updated_at timestamp", async () => { - const before: Document = getDocument(db, docId); - // SQLite datetime('now') has 1-second resolution; wait just enough for it to tick - await new Promise((r) => setTimeout(r, 1100)); - const input: UpdateDocumentInput = { title: "Updated" }; - await updateDocument(db, provider, docId, input); - const after: Document = getDocument(db, docId); - - expect(new 
Date(after.updatedAt).getTime()).toBeGreaterThanOrEqual( - new Date(before.updatedAt).getTime(), - ); - expect(after.updatedAt).not.toBe(before.updatedAt); + vi.useFakeTimers(); + try { + const before: Document = getDocument(db, docId); + // Advance fake clock by 2 seconds so the JS timestamp differs + vi.advanceTimersByTime(2000); + const input: UpdateDocumentInput = { title: "Updated" }; + await updateDocument(db, provider, docId, input); + const after: Document = getDocument(db, docId); + + expect(new Date(after.updatedAt).getTime()).toBeGreaterThan( + new Date(before.updatedAt).getTime(), + ); + } finally { + vi.useRealTimers(); + } }); it("should throw for nonexistent document", async () => { diff --git a/tests/unit/webhooks.test.ts b/tests/unit/webhooks.test.ts index 9272901..d759133 100644 --- a/tests/unit/webhooks.test.ts +++ b/tests/unit/webhooks.test.ts @@ -278,8 +278,8 @@ describe("webhooks", () => { await createWebhook(db, "https://example.com/hook", ["document.updated"]); fireWebhooks(db, "document.created", { docId: "123" }); - // Give time for any async calls - await new Promise((r) => setTimeout(r, 50)); + // Flush all pending microtasks/promises; mockFetch should remain uncalled + await Promise.resolve(); expect(mockFetch).not.toHaveBeenCalled(); }); @@ -367,8 +367,26 @@ describe("webhooks", () => { fireWebhooks(db, "document.created", { docId: "123" }); - await new Promise((r) => setTimeout(r, 50)); + // Flush all pending microtasks/promises; mockFetch should remain uncalled + await Promise.resolve(); expect(mockFetch).not.toHaveBeenCalled(); }); }); + + describe("rowToWebhook corrupted JSON", () => { + it("should default to empty events array when events JSON is corrupted", () => { + // Directly insert a row with invalid JSON in the events column + db.prepare("INSERT INTO webhooks (id, url, events, secret) VALUES (?, ?, ?, ?)").run( + "corrupt-1", + "https://example.com/hook", + "not valid json{{{", + null, + ); + + const hooks = 
listWebhooks(db); + const corrupt = hooks.find((h) => h.id === "corrupt-1"); + expect(corrupt).toBeDefined(); + expect(corrupt!.events).toEqual([]); + }); + }); });