Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 66 additions & 1 deletion src/cli/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ import { getDatabase, runMigrations, createVectorTable, closeDatabase } from "..
import { createEmbeddingProvider, type EmbeddingProvider } from "../providers/index.js";
import { indexDocument, indexFile } from "../core/indexing.js";
import { getSupportedExtensions } from "../core/parsers/index.js";
import { searchDocuments } from "../core/search.js";
import { searchDocuments, getRelatedChunks } from "../core/search.js";
import { askQuestion, createLlmProvider } from "../core/rag.js";
import { getDocumentRatings, listRatings } from "../core/ratings.js";
import { createTopic, listTopics } from "../core/topics.js";
Expand Down Expand Up @@ -452,6 +452,71 @@ program
},
);

// related
// CLI command: print the chunks most similar to <chunkId>, seeded from that
// chunk's stored embedding (no text query is involved).
program
  .command("related <chunkId>")
  .description("Find chunks related to a given chunk by vector similarity")
  .option("--limit <n>", "Number of results", "10")
  .option("--topic <topic>", "Filter by topic")
  .option("--library <lib>", "Filter by library")
  .option("--min-score <n>", "Minimum similarity score (0-1)")
  .option("--tags <tags>", "Comma-separated tags to filter by")
  .action(
    (
      chunkId: string,
      opts: {
        limit: string;
        topic?: string;
        library?: string;
        minScore?: string;
        tags?: string;
      },
    ) => {
      const { db } = initializeApp();
      try {
        const limit = parseIntOption(opts.limit, "--limit");

        // Reject an unparsable or out-of-range threshold instead of letting a
        // NaN flow through, which would silently disable the filter.
        let minScore: number | undefined;
        if (opts.minScore !== undefined) {
          minScore = parseFloat(opts.minScore);
          if (!(minScore >= 0 && minScore <= 1)) {
            console.error(
              `Error: --min-score must be a number between 0 and 1, got "${opts.minScore}"`,
            );
            // Set the exit code and return (rather than process.exit) so the
            // `finally` below still closes the database.
            process.exitCode = 1;
            return;
          }
        }

        // Drop empty entries produced by input such as "a,,b" or a trailing
        // comma; if nothing is left, apply no tag filter at all.
        const parsedTags = opts.tags
          ?.split(",")
          .map((t) => t.trim())
          .filter((t) => t.length > 0);
        const tags = parsedTags !== undefined && parsedTags.length > 0 ? parsedTags : undefined;

        let result: ReturnType<typeof getRelatedChunks>;
        try {
          result = getRelatedChunks(db, {
            chunkId,
            ...(limit !== undefined && { limit }),
            ...(opts.topic !== undefined && { topic: opts.topic }),
            ...(opts.library !== undefined && { library: opts.library }),
            ...(tags !== undefined && { tags }),
            ...(minScore !== undefined && { minScore }),
          });
        } catch (err) {
          // Lookup failures (unknown chunk, missing embedding) are reported as
          // a one-line message with a non-zero exit status.
          const message = err instanceof Error ? err.message : String(err);
          console.error(`Error: ${message}`);
          process.exitCode = 1;
          return;
        }

        const { sourceChunk, chunks } = result;
        console.log(
          `\nRelated to chunk: ${sourceChunk.id} from document ${sourceChunk.documentId}`,
        );

        if (chunks.length === 0) {
          console.log("No related chunks found.");
        } else {
          console.log(`\nShowing ${chunks.length} related chunks:\n`);
          for (const r of chunks) {
            console.log(`\n── ${r.title} (score: ${r.score.toFixed(2)}) ──`);
            console.log(` Chunk ID: ${r.chunkId}`);
            if (r.library) console.log(` Library: ${r.library}`);
            if (r.url) console.log(` Source: ${r.url}`);
            // Preview only the first 200 characters of the chunk text.
            console.log(` ${r.content.slice(0, 200)}${r.content.length > 200 ? "..." : ""}`);
          }
        }
      } finally {
        closeDatabase();
      }
    },
  );

// saved searches
const searchesCmd = program.command("searches").description("Manage saved searches");

Expand Down
240 changes: 240 additions & 0 deletions src/core/search.ts
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,27 @@ export interface SearchResult {
contextAfter?: ContextChunk[] | undefined;
}

/** Options accepted by `getRelatedChunks`. */
export interface RelatedChunksOptions {
  /** ID of the source chunk whose stored embedding seeds the similarity search. */
  chunkId: string;
  /** Maximum number of chunks to return (default 10, clamped to the range 1–1000). */
  limit?: number;
  /** Document whose chunks are excluded from the results (default: the source chunk's document). */
  excludeDocumentId?: string;
  /** Restrict results to documents whose `topic_id` equals this value. */
  topic?: string;
  /** Restrict results to documents whose `library` equals this value. */
  library?: string;
  /** Restrict results by document tags; the filter clause is built by `buildTagFilter`. */
  tags?: string[];
  /** Minimum similarity score for vector results (default 0.0, i.e. no threshold). */
  minScore?: number;
  /** When true, also blend in documents explicitly linked via `document_links` (default false). */
  includeLinkedDocuments?: boolean;
}

/** Result of `getRelatedChunks`: the related chunks plus the chunk the search was seeded from. */
export interface RelatedChunksResult {
  /** Related chunks, ordered by descending score and trimmed to the requested limit. */
  chunks: SearchResult[];
  /** The chunk that was looked up by `chunkId`, as read from the `chunks` table. */
  sourceChunk: {
    /** Chunk ID. */
    id: string;
    /** ID of the document the chunk belongs to. */
    documentId: string;
    /** Text content of the chunk. */
    content: string;
    /** Value of the chunk's `chunk_index` column. */
    chunkIndex: number;
  };
}

// ---------------------------------------------------------------------------
// Title boost multiplier: chunks whose document title contains any query word
// receive this multiplicative boost to their final score.
Expand Down Expand Up @@ -820,6 +841,225 @@ function attachRatings(db: Database.Database, results: SearchResult[]): SearchRe
return results.map((r) => ({ ...r, avgRating: ratingMap.get(r.documentId) ?? null }));
}

/**
 * Find chunks related to a given chunk by vector similarity ("more like this").
 *
 * Looks up the source chunk and its stored embedding, runs a nearest-neighbour
 * query against `chunk_embeddings` with that embedding, and returns the closest
 * chunks outside the excluded document (by default the source chunk's own
 * document). Optional library/topic/tag filters are applied to the candidates.
 * Runs synchronously.
 *
 * Scores are computed as `1 - distance`. When `includeLinkedDocuments` is set,
 * the first chunk of every document linked to the source document through
 * `document_links` is appended with a fixed score if that document is not
 * already represented; these entries bypass `minScore` and the metadata filters.
 *
 * @param db - Open database handle.
 * @param options - Source chunk ID plus optional limit, filters and threshold.
 * @returns The related chunks (highest score first, at most `limit`) and the source chunk.
 * @throws Error if the chunk does not exist or has no stored embedding.
 */
export function getRelatedChunks(
  db: Database.Database,
  options: RelatedChunksOptions,
): RelatedChunksResult {
  // Fixed score given to explicitly linked documents that the vector search
  // did not surface on its own.
  const linkedDocumentScore = 0.6;

  const { chunkId } = options;

  // The limit is multiplied into SQL LIMIT bindings below, so it must be a
  // whole number: truncate fractions, fall back to the default on NaN, and
  // clamp to 1..1000.
  const requestedLimit = options.limit ?? 10;
  const limit = Number.isNaN(requestedLimit)
    ? 10
    : Math.max(1, Math.min(Math.trunc(requestedLimit), 1000));
  const minScore = options.minScore ?? 0.0;

  // Look up the source chunk
  const SourceChunkSchema = z.object({
    id: z.string(),
    document_id: z.string(),
    content: z.string(),
    chunk_index: z.number(),
  });
  const sourceChunkRow = validateRow(
    SourceChunkSchema.optional(),
    db
      .prepare(`SELECT id, document_id, content, chunk_index FROM chunks WHERE id = ?`)
      .get(chunkId),
    "getRelatedChunks.sourceChunk",
  );
  if (!sourceChunkRow) {
    throw new Error(`Chunk not found: ${chunkId}`);
  }

  const sourceChunk = {
    id: sourceChunkRow.id,
    documentId: sourceChunkRow.document_id,
    content: sourceChunkRow.content,
    chunkIndex: sourceChunkRow.chunk_index,
  };

  // Unless the caller overrides it, exclude the document the chunk came from.
  const excludeDocumentId = options.excludeDocumentId ?? sourceChunkRow.document_id;

  // Fetch the embedding for the source chunk
  const EmbeddingRowSchema = z.object({ embedding: z.instanceof(Buffer) });
  const embeddingRow = validateRow(
    EmbeddingRowSchema.optional(),
    db.prepare(`SELECT embedding FROM chunk_embeddings WHERE chunk_id = ?`).get(chunkId),
    "getRelatedChunks.embedding",
  );
  if (!embeddingRow) {
    throw new Error(`No embedding found for chunk: ${chunkId}`);
  }

  const vecBuffer = embeddingRow.embedding;

  // Build SQL: vector ANN search excluding the source document
  const tagFilter = buildTagFilter(options.tags, "d");

  let sql = `
    SELECT
      candidates.chunk_id,
      candidates.distance,
      c.document_id,
      c.content AS chunk_content,
      d.title,
      d.source_type,
      d.library,
      d.version,
      d.topic_id,
      d.url
    FROM (
      SELECT chunk_id, distance
      FROM chunk_embeddings
      WHERE embedding MATCH ?
      ORDER BY distance
      LIMIT ?
    ) candidates
    JOIN chunks c ON c.id = candidates.chunk_id
    JOIN documents d ON d.id = c.document_id
    WHERE c.document_id != ?
  `;

  // The inner candidate query fetches 10x the limit because the document
  // exclusion and metadata filters are only applied after candidates are picked.
  const params: unknown[] = [vecBuffer, limit * 10, excludeDocumentId];

  if (options.library) {
    sql += ` AND d.library = ?`;
    params.push(options.library);
  }
  if (options.topic) {
    sql += ` AND d.topic_id = ?`;
    params.push(options.topic);
  }
  sql += tagFilter.clause;
  params.push(...tagFilter.params);

  sql += ` ORDER BY candidates.distance LIMIT ?`;
  params.push(limit * 2); // over-fetch to allow minScore filtering

  const RelatedRowSchema = z.object({
    chunk_id: z.string(),
    distance: z.number(),
    document_id: z.string(),
    chunk_content: z.string(),
    title: z.string(),
    source_type: z.string(),
    library: z.string().nullable(),
    version: z.string().nullable(),
    topic_id: z.string().nullable(),
    url: z.string().nullable(),
  });

  const rows = validateRows(
    RelatedRowSchema,
    db.prepare(sql).all(...params),
    "getRelatedChunks.rows",
  );

  let results: SearchResult[] = rows.map((row) => {
    // Smaller distance means more similar; expose it as a "higher is better" score.
    const similarity = 1 - row.distance;
    return {
      documentId: row.document_id,
      chunkId: row.chunk_id,
      title: row.title,
      content: row.chunk_content,
      sourceType: row.source_type,
      library: row.library,
      version: row.version,
      topicId: row.topic_id,
      url: row.url,
      score: similarity,
      avgRating: null,
      scoreExplanation: {
        method: "vector" as SearchMethod,
        rawScore: row.distance,
        boostFactors: [],
        details: `Vector similarity: distance=${row.distance.toFixed(4)}, similarity=${similarity.toFixed(4)}`,
      },
    };
  });

  // Apply minScore filter
  if (minScore > 0) {
    results = results.filter((r) => r.score >= minScore);
  }

  // Optional: blend in explicitly linked documents
  if (options.includeLinkedDocuments) {
    // Links are treated as undirected: take whichever end is not the source document.
    const LinkedDocRowSchema = z.object({ linked_doc_id: z.string() });
    const linkedDocs = validateRows(
      LinkedDocRowSchema,
      db
        .prepare(
          `SELECT DISTINCT
             CASE WHEN source_id = ? THEN target_id ELSE source_id END AS linked_doc_id
           FROM document_links
           WHERE source_id = ? OR target_id = ?`,
        )
        .all(sourceChunk.documentId, sourceChunk.documentId, sourceChunk.documentId),
      "getRelatedChunks.linkedDocs",
    );

    const LinkedChunkSchema = z.object({
      id: z.string(),
      document_id: z.string(),
      content: z.string(),
      chunk_index: z.number(),
      title: z.string(),
      source_type: z.string(),
      library: z.string().nullable(),
      version: z.string().nullable(),
      topic_id: z.string().nullable(),
      url: z.string().nullable(),
    });

    // Prepared once and reused for every linked document.
    const firstChunkStmt = db.prepare(
      `SELECT c.id, c.document_id, c.content, c.chunk_index,
              d.title, d.source_type, d.library, d.version, d.topic_id, d.url
       FROM chunks c
       JOIN documents d ON d.id = c.document_id
       WHERE c.document_id = ?
       ORDER BY c.chunk_index ASC
       LIMIT 1`,
    );

    const presentDocIds = new Set(results.map((r) => r.documentId));
    for (const { linked_doc_id } of linkedDocs) {
      // Never re-introduce the excluded document (e.g. through a self-link),
      // and skip documents the vector search already returned.
      if (linked_doc_id === excludeDocumentId || presentDocIds.has(linked_doc_id)) {
        continue;
      }

      // Represent the linked document by its first chunk.
      const linkedChunk = validateRow(
        LinkedChunkSchema.optional(),
        firstChunkStmt.get(linked_doc_id),
        "getRelatedChunks.linkedChunk",
      );
      if (!linkedChunk) {
        continue;
      }

      results.push({
        documentId: linkedChunk.document_id,
        chunkId: linkedChunk.id,
        title: linkedChunk.title,
        content: linkedChunk.content,
        sourceType: linkedChunk.source_type,
        library: linkedChunk.library,
        version: linkedChunk.version,
        topicId: linkedChunk.topic_id,
        url: linkedChunk.url,
        score: linkedDocumentScore,
        avgRating: null,
        scoreExplanation: {
          method: "vector" as SearchMethod,
          rawScore: linkedDocumentScore,
          boostFactors: ["linked_document"],
          details: "Explicitly linked document",
        },
      });
    }

    // Re-rank so linked entries slot in among the vector results by score.
    results.sort((a, b) => b.score - a.score);
  }

  // Trim to requested limit
  results = results.slice(0, limit);

  return { chunks: results, sourceChunk };
}

/** FTS5-based full-text search with BM25 ranking. Uses AND logic by default. */
function fts5Search(
db: Database.Database,
Expand Down
51 changes: 50 additions & 1 deletion src/mcp/server.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ import { loadConfig } from "../config.js";
import { getDatabase, runMigrations, createVectorTable } from "../db/index.js";
import { getActiveWorkspace, getWorkspacePath } from "../core/workspace.js";
import { createEmbeddingProvider } from "../providers/index.js";
import { searchDocuments } from "../core/search.js";
import { searchDocuments, getRelatedChunks } from "../core/search.js";
import {
askQuestion,
createLlmProvider,
Expand Down Expand Up @@ -177,6 +177,55 @@ async function main(): Promise<void> {
}),
);

// Tool: get-related
// Exposes getRelatedChunks over MCP: given a chunk ID, returns similar chunks
// as pretty-printed JSON. Only the options the caller actually supplied are
// forwarded, so getRelatedChunks applies its own defaults for the rest.
server.tool(
  "get-related",
  "Find chunks semantically similar to a given chunk (more-like-this). Returns related content seeded from an existing chunk's stored embedding without requiring a text query.",
  {
    chunkId: z.string().describe("ID of the source chunk to find related content for"),
    // Must be a whole number: the value is multiplied into SQL LIMIT bindings
    // downstream, and a fractional limit can yield a non-integer LIMIT.
    limit: z
      .number()
      .int()
      .min(1)
      .max(50)
      .optional()
      .describe("Number of results to return (default 10)"),
    topic: z.string().optional().describe("Filter results to a specific topic"),
    library: z.string().optional().describe("Filter results to a specific library"),
    tags: z.array(z.string()).optional().describe("Filter results to documents with these tags"),
    minScore: z
      .number()
      .min(0)
      .max(1)
      .optional()
      .describe("Minimum similarity score threshold (0-1)"),
    includeLinkedDocuments: z
      .boolean()
      .optional()
      .describe("Also include explicitly linked documents even if below similarity threshold"),
  },
  withErrorHandling(
    ({ chunkId, limit, topic, library, tags, minScore, includeLinkedDocuments }) => {
      // Conditional spreads omit absent keys entirely instead of passing
      // explicit `undefined` values.
      const result = getRelatedChunks(db, {
        chunkId,
        ...(limit !== undefined && { limit }),
        ...(topic !== undefined && { topic }),
        ...(library !== undefined && { library }),
        ...(tags !== undefined && { tags }),
        ...(minScore !== undefined && { minScore }),
        ...(includeLinkedDocuments !== undefined && { includeLinkedDocuments }),
      });
      return {
        content: [
          {
            type: "text" as const,
            text: JSON.stringify(result, null, 2),
          },
        ],
      };
    },
  ),
);

// Tool: get-document
server.tool(
"get-document",
Expand Down
Loading
Loading