-
Notifications
You must be signed in to change notification settings - Fork 41
Description
- Merkle trees 增量索引
- 支持 MCP
- 支持 CLI 模式和 WebIDE 模式(本地代码的场景:如无代码仓库、代码未推送等)
高级混合代码搜索是怎么实现的?
根据代码库上下文,这指的是 Claude Context 系统中的混合搜索功能,它结合了密集向量搜索(语义相似性)和 BM25 稀疏向量搜索(文本匹配)。
混合搜索架构
混合搜索通过 `HYBRID_MODE` 环境变量控制,默认启用 [1]。系统根据此设置决定创建混合集合还是常规集合 [2]。
混合集合创建
混合集合包含特殊的 BM25 函数和稀疏向量字段: 3
集合模式包含:
- `vector` 字段:密集向量(语义嵌入)
- `sparse_vector` 字段:稀疏向量(BM25)
- `content_bm25_emb` 函数:自动从内容生成 BM25 嵌入 [4]
数据插入过程
在混合模式下,文档插入时只需提供内容和密集向量,BM25 稀疏向量由数据库自动生成: 5
混合搜索执行
搜索时,系统准备两个搜索请求:
- 密集向量搜索请求:以查询的嵌入向量为数据,在 `vector` 字段上检索
- 稀疏向量搜索请求:以查询原文为数据,在 `sparse_vector` 字段上检索(BM25)
RRF 重排序
两种搜索结果通过 Reciprocal Rank Fusion (RRF) 算法合并和重排序: 8
在 REST API 实现中,重排序策略配置为: 9
搜索参数配置
- 密集搜索:使用 COSINE 相似度,`nprobe: 10`
- 稀疏搜索:使用 BM25,`drop_ratio_search: 0.2`
- RRF 参数:`k: 100` [10]
这种混合方法结合了语义理解(密集向量)和精确文本匹配(BM25),提供更准确和全面的代码搜索结果。
Notes
混合搜索功能在两个 Milvus 实现中都可用:gRPC 客户端(`MilvusVectorDatabase`)和 REST API 客户端(`MilvusRestfulVectorDatabase`),它们使用相同的核心逻辑,但通过不同的传输协议与 Milvus 通信。
Wiki pages you might want to explore:
- Core Indexing Engine (zilliztech/claude-context)
- Vector Database Layer (zilliztech/claude-context)
- Supported AI Assistants (zilliztech/claude-context)
Citations
File: packages/core/src/context.ts (L223-229)
/**
 * Reports whether hybrid mode is enabled.
 *
 * Reads the `HYBRID_MODE` value through `envManager`. A missing value
 * (`undefined` or `null`) counts as enabled; any other value enables hybrid
 * mode only when it equals `'true'` after lowercasing.
 *
 * @returns `true` when hybrid mode is on, `false` otherwise.
 */
private getIsHybrid(): boolean {
  const rawFlag = envManager.get('HYBRID_MODE');
  // `== null` matches both undefined and null: an unset flag defaults to on.
  return rawFlag == null ? true : rawFlag.toLowerCase() === 'true';
}
File: packages/core/src/context.ts (L234-240)
/**
 * Builds the collection name for a codebase.
 *
 * The name is a mode-dependent prefix (`hybrid_code_chunks` in hybrid mode,
 * `code_chunks` otherwise) joined by `_` to the first 8 hex characters of
 * the MD5 digest of the resolved codebase path.
 *
 * @param codebasePath - Path to the codebase; resolved with `path.resolve` before hashing.
 * @returns The collection name, `<prefix>_<8 hex chars>`.
 */
public getCollectionName(codebasePath: string): string {
  const prefix = this.getIsHybrid() ? 'hybrid_code_chunks' : 'code_chunks';
  const digest = crypto
    .createHash('md5')
    .update(path.resolve(codebasePath))
    .digest('hex');
  return `${prefix}_${digest.slice(0, 8)}`;
}
File: packages/core/src/context.ts (L440-446)
const searchRequests: HybridSearchRequest[] = [
{
data: queryEmbedding.vector,
anns_field: "vector",
param: { "nprobe": 10 },
limit: topK
},
File: packages/core/src/context.ts (L447-453)
{
data: query,
anns_field: "sparse_vector",
param: { "drop_ratio_search": 0.2 },
limit: topK
}
];
File: packages/core/src/context.ts (L464-467)
rerank: {
strategy: 'rrf',
params: { k: 100 }
},
File: packages/core/src/context.ts (L818-847)
if (isHybrid === true) {
// Create hybrid vector documents
const documents: VectorDocument[] = chunks.map((chunk, index) => {
if (!chunk.metadata.filePath) {
throw new Error(`Missing filePath in chunk metadata at index ${index}`);
}
const relativePath = path.relative(codebasePath, chunk.metadata.filePath);
const fileExtension = path.extname(chunk.metadata.filePath);
const { filePath, startLine, endLine, ...restMetadata } = chunk.metadata;
return {
id: this.generateId(relativePath, chunk.metadata.startLine || 0, chunk.metadata.endLine || 0, chunk.content),
content: chunk.content, // Full text content for BM25 and storage
vector: embeddings[index].vector, // Dense vector
relativePath,
startLine: chunk.metadata.startLine || 0,
endLine: chunk.metadata.endLine || 0,
fileExtension,
metadata: {
...restMetadata,
codebasePath,
language: chunk.metadata.language || 'unknown',
chunkIndex: index
}
};
});
// Store to vector database
await this.vectorDatabase.insertHybrid(this.getCollectionName(codebasePath), documents);
File: packages/core/src/vectordb/milvus-restful-vectordb.ts (L527-536)
functions: [
{
name: "content_bm25_emb",
description: "content bm25 function",
type: "BM25",
inputFieldNames: ["content"],
outputFieldNames: ["sparse_vector"],
params: {},
},
],
File: packages/core/src/vectordb/milvus-restful-vectordb.ts (L555-564)
fieldName: "vector",
dataType: "FloatVector",
elementTypeParams: {
dim: dimension
}
},
{
fieldName: "sparse_vector",
dataType: "SparseFloatVector"
},
File: packages/core/src/vectordb/milvus-restful-vectordb.ts (L705-721)
searchParams: {
metricType: "COSINE",
params: searchRequests[0].param || { "nprobe": 10 }
}
};
// For sparse vector search - data must be array of queries: ["query text"]
const search_param_2: any = {
data: Array.isArray(searchRequests[1].data) ? searchRequests[1].data : [searchRequests[1].data],
annsField: searchRequests[1].anns_field, // "sparse_vector"
limit: searchRequests[1].limit,
outputFields: ["*"],
searchParams: {
metricType: "BM25",
params: searchRequests[1].param || { "drop_ratio_search": 0.2 }
}
};
File: packages/core/src/vectordb/milvus-restful-vectordb.ts (L729-734)
// Rerank configuration: the "rrf" (Reciprocal Rank Fusion) strategy with k fixed at 100.
const rerank_strategy = {
strategy: "rrf",
params: {
k: 100
}
};