From 5190ae7e75607e82c6333fcd19d205e5172c0988 Mon Sep 17 00:00:00 2001 From: Robert DeRienzo Date: Thu, 19 Mar 2026 21:00:33 +0000 Subject: [PATCH 1/3] fix: export TreeSitterChunker and CodeChunk from libscope/lite TreeSitterChunker was compiled but not re-exported from the ./lite entry point, making it inaccessible to consumers using the package exports map. Co-Authored-By: Claude Sonnet 4.6 (1M context) --- src/lite/index.ts | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/lite/index.ts b/src/lite/index.ts index fd87f96..9a35346 100644 --- a/src/lite/index.ts +++ b/src/lite/index.ts @@ -1,4 +1,6 @@ export { LibScopeLite } from "./core.js"; +export { TreeSitterChunker } from "./chunker-treesitter.js"; +export type { CodeChunk } from "./chunker-treesitter.js"; export type { LiteOptions, LiteDoc, From e14cfe6c2defd5d3f7e8810b1bcd5dd924e2d9f7 Mon Sep 17 00:00:00 2001 From: Robert DeRienzo Date: Thu, 19 Mar 2026 17:23:19 -0400 Subject: [PATCH 2/3] feat: wire TreeSitterChunker into LibScopeLite.index() via preChunked (#461) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add `preChunked?: string[]` to `IndexDocumentInput` — when provided and non-empty, `indexDocument` skips the markdown chunker and uses the caller's chunks directly; an empty or omitted `preChunked` falls back to the normal chunker. `LibScopeLite.index()` now checks `doc.language`: if set and supported, it pre-chunks the content with `TreeSitterChunker` and passes the result as `preChunked`. Falls back silently to the text chunker on any error (tree-sitter not installed, parse failure, etc.). Consumers set `language: "cpp"` (or any supported alias) on their `LiteDoc` and get function/class-boundary chunks automatically. Docs updated to note this as the preferred approach over using `TreeSitterChunker` directly. 
Co-authored-by: Claude Sonnet 4.6 (1M context) --- docs/guide/lite.md | 35 ++++++++++++++++++- docs/reference/lite-api.md | 32 ++++++++++++++++- src/core/indexing.ts | 13 +++++-- src/lite/core.ts | 18 ++++++++++ tests/unit/indexing.test.ts | 63 ++++++++++++++++++++++++++++++++- tests/unit/lite.test.ts | 69 +++++++++++++++++++++++++++++++++++++ 6 files changed, 224 insertions(+), 6 deletions(-) diff --git a/docs/guide/lite.md b/docs/guide/lite.md index 03452a3..1fc7547 100644 --- a/docs/guide/lite.md +++ b/docs/guide/lite.md @@ -139,6 +139,7 @@ await lite.index([ | `version` | `string?` | Library version | | `sourceType` | `string?` | `"manual"` (default), `"library"`, `"topic"`, or `"model-generated"` | | `topicId` | `string?` | Topic ID to associate the document with | +| `language` | `string?` | Language alias for code-aware tree-sitter chunking (e.g. `"typescript"`, `"cpp"`, `"go"`). When set and tree-sitter is available, chunks at function/class boundaries instead of text boundaries. | ### `indexRaw(input)` @@ -269,7 +270,39 @@ The LLM provider must support streaming. Providers that don't expose a `complete ## Code Indexing -For source code files, use the tree-sitter chunker to split at function and class boundaries: +LibScope Lite can split source code files at function and class boundaries using tree-sitter rather than plain text chunking. The preferred way to enable this is to set the `language` field on a `LiteDoc` — no extra imports or chunking steps required. + +### Preferred: set `language` on `LiteDoc` + +```ts +// Preferred: just set language on LiteDoc — chunking is automatic +await lite.index([ + { + title: "src/auth.cpp", + content: fileContent, + library: "my-repo", + language: "cpp", // enables tree-sitter chunking at function boundaries + }, +]); +``` + +Setting `language` on a `LiteDoc` automatically triggers code-aware tree-sitter chunking. This is the preferred approach over using `TreeSitterChunker` directly. 
If tree-sitter is not installed or parsing fails, indexing falls back silently to the standard text chunker. + +Supported languages and their extension aliases: + +| Language | Aliases | +|---|---| +| `typescript` | `ts`, `tsx` | +| `javascript` | `js`, `jsx`, `mjs`, `cjs` | +| `python` | `py` | +| `csharp` | `cs` | +| `cpp` | `cc`, `cxx`, `hpp`, `h` | +| `c` | — | +| `go` | — | + +### Advanced: using `TreeSitterChunker` directly + +Direct use of `TreeSitterChunker` is rarely needed when using `LibScopeLite` — setting `language` on `LiteDoc` covers most cases. Use `TreeSitterChunker` directly only when you need access to the raw `CodeChunk` objects (e.g., to extract line numbers for display, filter by node type, or build custom chunk titles): ```ts import { LibScopeLite } from "libscope/lite"; diff --git a/docs/reference/lite-api.md b/docs/reference/lite-api.md index 577dd3a..2f683e5 100644 --- a/docs/reference/lite-api.md +++ b/docs/reference/lite-api.md @@ -77,7 +77,7 @@ interface LiteOptions { async index(docs: LiteDoc[]): Promise ``` -Index an array of pre-parsed documents. Each document is chunked using the markdown-aware chunker, embedded, and stored. +Index an array of pre-parsed documents. Each document is chunked using the markdown-aware chunker (or code-aware tree-sitter chunker when `language` is set), embedded, and stored. **`LiteDoc`** @@ -107,9 +107,39 @@ interface LiteDoc { /** Topic ID to associate the document with for topic-scoped search. */ topicId?: string; + + /** + * Language alias for code-aware tree-sitter chunking. + * When set and tree-sitter is available, chunks at function/class boundaries + * instead of text boundaries. Falls back silently to the standard text chunker + * if tree-sitter is not installed or parsing fails. 
+ * + * Supported languages and aliases: + * - `"typescript"` (aliases: `"ts"`, `"tsx"`) + * - `"javascript"` (aliases: `"js"`, `"jsx"`, `"mjs"`, `"cjs"`) + * - `"python"` (alias: `"py"`) + * - `"csharp"` (alias: `"cs"`) + * - `"cpp"` (aliases: `"cc"`, `"cxx"`, `"hpp"`, `"h"`) + * - `"c"` + * - `"go"` + */ + language?: string; } ``` +**`LiteDoc` properties:** + +| Property | Type | Required | Description | +|---|---|---|---| +| `title` | `string` | Yes | Document title. Used in search result display and title boosting. | +| `content` | `string` | Yes | Full document text. Will be chunked before embedding. | +| `url` | `string` | No | Source URL for deduplication — replaced if content hash changed, skipped if unchanged. | +| `sourceType` | `string` | No | `"manual"` (default), `"library"`, `"topic"`, or `"model-generated"`. | +| `library` | `string` | No | Library namespace for scoped search. | +| `version` | `string` | No | Library version. Used with `library` for version-scoped search. | +| `topicId` | `string` | No | Topic ID to associate the document with for topic-scoped search. | +| `language` | `string` | No | Language alias for code-aware tree-sitter chunking (e.g. `"typescript"`, `"cpp"`, `"go"`). When set and tree-sitter is available, chunks at function/class boundaries instead of text boundaries. | + **Example:** ```ts diff --git a/src/core/indexing.ts b/src/core/indexing.ts index 3b0d372..2c96a61 100644 --- a/src/core/indexing.ts +++ b/src/core/indexing.ts @@ -26,6 +26,8 @@ export interface IndexDocumentInput { dedupOptions?: DedupOptions | undefined; /** ISO 8601 expiry timestamp. Document will be pruned by pruneExpiredDocuments() after this time. */ expiresAt?: string | undefined; + /** If set and non-empty, skip chunkContent() and use these directly as document chunks; an empty array falls back to the content chunker. 
*/ + preChunked?: string[] | undefined; } export interface IndexedDocument { @@ -430,11 +432,16 @@ export async function indexDocument( if (titleResult) return titleResult; const docId = randomUUID(); - const useStreaming = input.content.length > STREAMING_THRESHOLD; - const chunks = useStreaming ? chunkContentStreaming(input.content) : chunkContent(input.content); + let chunks: string[]; + if (input.preChunked && input.preChunked.length > 0) { + chunks = input.preChunked; + } else { + const useStreaming = input.content.length > STREAMING_THRESHOLD; + chunks = useStreaming ? chunkContentStreaming(input.content) : chunkContent(input.content); + } log.info( - { docId, title: input.title, chunkCount: chunks.length, streaming: useStreaming }, + { docId, title: input.title, chunkCount: chunks.length }, "Indexing document", ); diff --git a/src/lite/core.ts b/src/lite/core.ts index b83adcb..f371112 100644 --- a/src/lite/core.ts +++ b/src/lite/core.ts @@ -11,6 +11,7 @@ import { bulkDelete } from "../core/bulk.js"; import { rateDocument } from "../core/ratings.js"; import { askQuestion, getContextForQuestion, type LlmProvider } from "../core/rag.js"; import { normalizeRawInput } from "./normalize.js"; +import { TreeSitterChunker } from "./chunker-treesitter.js"; import type { LiteOptions, LiteDoc, @@ -25,6 +26,11 @@ export class LibScopeLite { private readonly db: Database.Database; private readonly provider: EmbeddingProvider; private readonly llmProvider: LlmProvider | null; + private _chunker: TreeSitterChunker | undefined; + private get chunker(): TreeSitterChunker { + this._chunker ??= new TreeSitterChunker(); + return this._chunker; + } constructor(opts: LiteOptions = {}) { this.provider = opts.provider ?? 
new LocalEmbeddingProvider(); @@ -49,6 +55,17 @@ export class LibScopeLite { async index(docs: LiteDoc[]): Promise { for (const doc of docs) { + let preChunked: string[] | undefined; + + if (doc.language && this.chunker.supports(doc.language)) { + try { + const codeChunks = await this.chunker.chunk(doc.content, doc.language); + preChunked = codeChunks.map((c) => c.content); + } catch { + // tree-sitter not installed or parse failed — fall back to text chunker + } + } + await indexDocument(this.db, this.provider, { title: doc.title, content: doc.content, @@ -57,6 +74,7 @@ export class LibScopeLite { version: doc.version, topicId: doc.topicId, url: doc.url, + preChunked, }); } } diff --git a/tests/unit/indexing.test.ts b/tests/unit/indexing.test.ts index 883d3a7..3594a2e 100644 --- a/tests/unit/indexing.test.ts +++ b/tests/unit/indexing.test.ts @@ -1,9 +1,14 @@ -import { describe, it, expect } from "vitest"; +import { describe, it, expect, beforeEach, afterEach } from "vitest"; import { chunkContent, chunkContentStreaming, STREAMING_THRESHOLD, + indexDocument, } from "../../src/core/indexing.js"; +import Database from "better-sqlite3"; +import { runMigrations, createVectorTable } from "../../src/db/schema.js"; +import { createDatabase } from "../../src/db/connection.js"; +import { MockEmbeddingProvider } from "../fixtures/mock-provider.js"; describe("chunkContent", () => { it("should split content by markdown headings", () => { @@ -321,3 +326,59 @@ describe("STREAMING_THRESHOLD", () => { expect(STREAMING_THRESHOLD).toBe(1024 * 1024); }); }); + +describe("indexDocument preChunked", () => { + let db: Database.Database; + let provider: MockEmbeddingProvider; + + beforeEach(() => { + db = createDatabase(":memory:"); + runMigrations(db); + try { createVectorTable(db, 4); } catch { /* sqlite-vec not available */ } + provider = new MockEmbeddingProvider(); + }); + + afterEach(() => { + db.close(); + }); + + it("uses preChunked when provided, bypassing chunkContent", async 
() => { + const preChunked = ["chunk one", "chunk two", "chunk three"]; + const result = await indexDocument(db, provider, { + title: "Test File", + content: "some content", + sourceType: "manual", + preChunked, + }); + + expect(result.chunkCount).toBe(3); + + const chunks = db.prepare("SELECT content FROM chunks WHERE document_id = ? ORDER BY chunk_index").all(result.id) as Array<{ content: string }>; + expect(chunks).toHaveLength(3); + expect(chunks[0]?.content).toBe("chunk one"); + expect(chunks[1]?.content).toBe("chunk two"); + expect(chunks[2]?.content).toBe("chunk three"); + }); + + it("falls back to text chunking when preChunked is empty", async () => { + const result = await indexDocument(db, provider, { + title: "Test File", + content: "Some content that will be chunked normally.", + sourceType: "manual", + preChunked: [], + }); + + // Should have chunked via chunkContent (at least 1 chunk) + expect(result.chunkCount).toBeGreaterThanOrEqual(1); + }); + + it("falls back to text chunking when preChunked is undefined", async () => { + const result = await indexDocument(db, provider, { + title: "Test File", + content: "Some content without preChunked.", + sourceType: "manual", + }); + + expect(result.chunkCount).toBeGreaterThanOrEqual(1); + }); +}); diff --git a/tests/unit/lite.test.ts b/tests/unit/lite.test.ts index a427ea2..3648285 100644 --- a/tests/unit/lite.test.ts +++ b/tests/unit/lite.test.ts @@ -1,6 +1,7 @@ import { describe, it, expect, beforeEach, afterEach, vi } from "vitest"; import { LibScopeLite } from "../../src/lite/index.js"; import { MockEmbeddingProvider } from "../fixtures/mock-provider.js"; +import { TreeSitterChunker } from "../../src/lite/chunker-treesitter.js"; import type { LlmProvider } from "../../src/core/rag.js"; function* fakeStream(): Generator { @@ -269,4 +270,72 @@ describe("LibScopeLite", () => { expect(() => instance.close()).not.toThrow(); }); }); + + describe("index() with language/tree-sitter chunking", () => { + 
afterEach(() => { + vi.restoreAllMocks(); + }); + + it("calls TreeSitterChunker.chunk() when language is set and supported", async () => { + vi.spyOn(TreeSitterChunker.prototype, "supports").mockReturnValue(true); + const chunkSpy = vi.spyOn(TreeSitterChunker.prototype, "chunk").mockResolvedValue([ + { content: "function foo() {}", startLine: 1, endLine: 3, nodeType: "function_declaration" }, + { content: "function bar() {}", startLine: 5, endLine: 7, nodeType: "function_declaration" }, + ]); + + await lite.index([ + { + title: "src/main.ts", + content: "function foo() {}\nfunction bar() {}", + language: "typescript", + }, + ]); + + expect(chunkSpy).toHaveBeenCalledWith( + "function foo() {}\nfunction bar() {}", + "typescript", + ); + }); + + it("does not call chunk() when language is not set", async () => { + const chunkSpy = vi.spyOn(TreeSitterChunker.prototype, "chunk"); + + await lite.index([{ title: "Doc", content: "Some content here." }]); + + expect(chunkSpy).not.toHaveBeenCalled(); + }); + + it("falls back silently when tree-sitter throws", async () => { + vi.spyOn(TreeSitterChunker.prototype, "supports").mockReturnValue(true); + vi.spyOn(TreeSitterChunker.prototype, "chunk").mockRejectedValue( + new Error("tree-sitter not installed"), + ); + + // Should not throw — fallback to text chunker + await expect( + lite.index([ + { + title: "src/main.go", + content: "package main\nfunc main() {}", + language: "go", + }, + ]), + ).resolves.toBeUndefined(); + }); + + it("does not call chunk() when language is set but not supported", async () => { + vi.spyOn(TreeSitterChunker.prototype, "supports").mockReturnValue(false); + const chunkSpy = vi.spyOn(TreeSitterChunker.prototype, "chunk"); + + await lite.index([ + { + title: "src/main.rb", + content: "def hello; end", + language: "ruby", + }, + ]); + + expect(chunkSpy).not.toHaveBeenCalled(); + }); + }); }); From 9f371f8b9080629a99d16b9f57e2df8abc544a57 Mon Sep 17 00:00:00 2001 From: Robert DeRienzo Date: Thu, 19 Mar 
2026 21:25:49 +0000 Subject: [PATCH 3/3] style: fix prettier formatting Co-Authored-By: Claude Sonnet 4.6 (1M context) --- src/core/indexing.ts | 5 +---- tests/unit/indexing.test.ts | 10 ++++++++-- tests/unit/lite.test.ts | 19 +++++++++++++------ 3 files changed, 22 insertions(+), 12 deletions(-) diff --git a/src/core/indexing.ts b/src/core/indexing.ts index 2c96a61..03641e4 100644 --- a/src/core/indexing.ts +++ b/src/core/indexing.ts @@ -440,10 +440,7 @@ export async function indexDocument( chunks = useStreaming ? chunkContentStreaming(input.content) : chunkContent(input.content); } - log.info( - { docId, title: input.title, chunkCount: chunks.length }, - "Indexing document", - ); + log.info({ docId, title: input.title, chunkCount: chunks.length }, "Indexing document"); const metaPrefix = buildMetaPrefix(input); const textsForEmbedding = chunks.map((c) => metaPrefix + c); diff --git a/tests/unit/indexing.test.ts b/tests/unit/indexing.test.ts index 3594a2e..c711999 100644 --- a/tests/unit/indexing.test.ts +++ b/tests/unit/indexing.test.ts @@ -334,7 +334,11 @@ describe("indexDocument preChunked", () => { beforeEach(() => { db = createDatabase(":memory:"); runMigrations(db); - try { createVectorTable(db, 4); } catch { /* sqlite-vec not available */ } + try { + createVectorTable(db, 4); + } catch { + /* sqlite-vec not available */ + } provider = new MockEmbeddingProvider(); }); @@ -353,7 +357,9 @@ describe("indexDocument preChunked", () => { expect(result.chunkCount).toBe(3); - const chunks = db.prepare("SELECT content FROM chunks WHERE document_id = ? ORDER BY chunk_index").all(result.id) as Array<{ content: string }>; + const chunks = db + .prepare("SELECT content FROM chunks WHERE document_id = ? 
ORDER BY chunk_index") + .all(result.id) as Array<{ content: string }>; expect(chunks).toHaveLength(3); expect(chunks[0]?.content).toBe("chunk one"); expect(chunks[1]?.content).toBe("chunk two"); diff --git a/tests/unit/lite.test.ts b/tests/unit/lite.test.ts index 3648285..7e64b11 100644 --- a/tests/unit/lite.test.ts +++ b/tests/unit/lite.test.ts @@ -279,8 +279,18 @@ describe("LibScopeLite", () => { it("calls TreeSitterChunker.chunk() when language is set and supported", async () => { vi.spyOn(TreeSitterChunker.prototype, "supports").mockReturnValue(true); const chunkSpy = vi.spyOn(TreeSitterChunker.prototype, "chunk").mockResolvedValue([ - { content: "function foo() {}", startLine: 1, endLine: 3, nodeType: "function_declaration" }, - { content: "function bar() {}", startLine: 5, endLine: 7, nodeType: "function_declaration" }, + { + content: "function foo() {}", + startLine: 1, + endLine: 3, + nodeType: "function_declaration", + }, + { + content: "function bar() {}", + startLine: 5, + endLine: 7, + nodeType: "function_declaration", + }, ]); await lite.index([ @@ -291,10 +301,7 @@ describe("LibScopeLite", () => { }, ]); - expect(chunkSpy).toHaveBeenCalledWith( - "function foo() {}\nfunction bar() {}", - "typescript", - ); + expect(chunkSpy).toHaveBeenCalledWith("function foo() {}\nfunction bar() {}", "typescript"); }); it("does not call chunk() when language is not set", async () => {