/**
 * Context-aware search that combines vector similarity over graph nodes with
 * graph traversal (connected nodes, edges, call chains and dependencies).
 *
 * All graph access goes through the injected `IGraphIndex`; the embedder turns
 * the query text into a vector; the vector store is only used by the
 * location lookup.
 */
export class ContextAwareSearchService implements IContextAwareSearch {
	/** Number of hops `findRelatedCode` explores from the starting node. */
	private static readonly RELATED_CODE_MAX_DEPTH = 3

	/**
	 * Length of the all-zero probe vector sent to the vector store by the location lookup.
	 * NOTE(review): hard-coded; presumably has to match the store's vector dimension - confirm.
	 */
	private static readonly LOCATION_PROBE_VECTOR_SIZE = 768

	constructor(
		private readonly graphIndex: IGraphIndex,
		private readonly embedder: IEmbedder,
		private readonly vectorStore: IVectorStore,
	) {}

	/**
	 * Embeds `query`, fetches `limit * 2` similar nodes, scores them by cosine
	 * similarity against the query embedding and returns the best `limit`,
	 * each optionally enriched with graph context.
	 *
	 * @param query Free-text query to embed.
	 * @param options.includeRelated Attach graph context to each hit (default true).
	 * @param options.maxDepth Depth passed to the traversals that build the context (default 2).
	 * @param options.nodeTypes Accept only these node types. A single type is pushed down
	 *   to the index; several types are filtered here, so fewer than `limit` hits may remain.
	 * @param options.edgeTypes Edge types used for related nodes/relationships. With several
	 *   types the per-type results are merged (union), de-duplicated by id.
	 * @param options.limit Maximum number of results (default 10); non-positive yields `[]`.
	 * @returns Results ordered by descending score.
	 */
	async searchWithContext(
		query: string,
		options?: {
			includeRelated?: boolean
			maxDepth?: number
			nodeTypes?: CodeNodeType[]
			edgeTypes?: EdgeType[]
			limit?: number
		},
	): Promise<ContextAwareSearchResult[]> {
		const { includeRelated = true, maxDepth = 2, nodeTypes, edgeTypes, limit = 10 } = options ?? {}
		if (limit <= 0) {
			return []
		}

		const { embeddings } = await this.embedder.createEmbeddings([query])
		const queryEmbedding = embeddings[0]
		if (!queryEmbedding) {
			return []
		}

		const wantedTypes = nodeTypes && nodeTypes.length > 0 ? nodeTypes : undefined
		const candidates = await this.graphIndex.searchSimilarNodes(
			queryEmbedding,
			limit * 2, // over-fetch so de-duplication and type filtering still leave enough hits
			wantedTypes && wantedTypes.length === 1 ? wantedTypes[0] : undefined,
		)

		// Score every candidate first; truncating before sorting would keep
		// whichever candidates happened to come first instead of the best ones.
		const seenIds = new Set<string>()
		const scored: { node: CodeGraphNode; score: number }[] = []
		for (const node of candidates) {
			if (seenIds.has(node.id)) continue
			seenIds.add(node.id)
			if (wantedTypes && !wantedTypes.includes(node.type)) continue
			scored.push({ node, score: this.calculateSimilarityScore(queryEmbedding, node.embedding ?? []) })
		}
		scored.sort((a, b) => b.score - a.score)

		const results: ContextAwareSearchResult[] = []
		for (const { node, score } of scored.slice(0, limit)) {
			const context: ContextAwareSearchResult["context"] = includeRelated
				? await this.buildSearchContext(node, edgeTypes, maxDepth)
				: { relatedNodes: [], relationships: [] }
			results.push({ node, score, context })
		}
		return results
	}

	/**
	 * Returns the graph context of the most specific node (smallest line range)
	 * covering `line` in `filePath`, or `null` when no node covers it.
	 */
	async getContextForLocation(filePath: string, line: number): Promise<ContextAwareSearchResult | null> {
		const enclosing = await this.searchNodesByLocation(filePath, line)
		if (enclosing.length === 0) {
			return null
		}

		const node = enclosing.reduce((best, current) =>
			current.endLine - current.startLine < best.endLine - best.startLine ? current : best,
		)

		const relatedNodes = await this.graphIndex.getConnectedNodes(node.id, undefined, 3)
		const relationships = await this.graphIndex.getEdges(node.id)
		const callChain = this.isCallable(node) ? await this.buildCallChain(node.id, 5) : undefined
		const dependencies = await this.buildDependencyTree(node.id, 3)

		return {
			node,
			score: 1.0, // a location hit is treated as an exact match
			context: { relatedNodes, relationships, callChain, dependencies },
		}
	}

	/**
	 * Breadth-first search for nodes reachable from `nodeId` within three hops,
	 * following edges in either direction.
	 *
	 * @param relationshipTypes When given, only edges of these types are followed
	 *   (an empty array therefore matches nothing).
	 * @returns Each reachable node once, ordered by node-type priority.
	 */
	async findRelatedCode(nodeId: string, relationshipTypes?: EdgeType[]): Promise<CodeGraphNode[]> {
		const related: CodeGraphNode[] = []
		// Ids are marked when first discovered (not when expanded), so a node that is
		// reachable over several paths is reported - and looked up - only once.
		const discovered = new Set<string>([nodeId])
		let frontier: string[] = [nodeId]

		for (let depth = 0; depth < ContextAwareSearchService.RELATED_CODE_MAX_DEPTH && frontier.length > 0; depth++) {
			const nextFrontier: string[] = []
			for (const id of frontier) {
				const edges = await this.graphIndex.getEdges(id)
				for (const edge of edges) {
					if (relationshipTypes && !relationshipTypes.includes(edge.type)) continue

					const neighbourId = edge.source === id ? edge.target : edge.source
					if (discovered.has(neighbourId)) continue
					discovered.add(neighbourId)

					const neighbour = await this.graphIndex.getNode(neighbourId)
					if (!neighbour) continue // unknown ids are neither reported nor expanded
					related.push(neighbour)
					nextFrontier.push(neighbourId)
				}
			}
			frontier = nextFrontier
		}

		return this.rankRelatedNodes(related, nodeId)
	}

	/** True for node types that can take part in a call chain. */
	private isCallable(node: CodeGraphNode): boolean {
		return node.type === CodeNodeType.FUNCTION || node.type === CodeNodeType.METHOD
	}

	/**
	 * Collects the graph context attached to a search hit: at most 10 connected
	 * nodes, at most 20 edges, the caller chain (functions/methods only) and the
	 * dependency list.
	 */
	private async buildSearchContext(
		node: CodeGraphNode,
		edgeTypes: EdgeType[] | undefined,
		maxDepth: number,
	): Promise<ContextAwareSearchResult["context"]> {
		// `undefined` means "no edge-type filter" for the index calls below.
		const typeFilters: (EdgeType | undefined)[] = edgeTypes && edgeTypes.length > 0 ? edgeTypes : [undefined]

		const relatedById = new Map<string, CodeGraphNode>()
		for (const edgeType of typeFilters) {
			for (const related of await this.graphIndex.getConnectedNodes(node.id, edgeType, maxDepth)) {
				if (!relatedById.has(related.id)) relatedById.set(related.id, related)
			}
		}

		const edgesById = new Map<string, CodeGraphEdge>()
		for (const edgeType of typeFilters) {
			for (const edge of await this.graphIndex.getEdges(node.id, edgeType)) {
				if (!edgesById.has(edge.id)) edgesById.set(edge.id, edge)
			}
		}

		const callChain = this.isCallable(node) ? await this.buildCallChain(node.id, maxDepth) : undefined
		const dependencies = await this.buildDependencyTree(node.id, maxDepth)

		return {
			relatedNodes: [...relatedById.values()].slice(0, 10),
			relationships: [...edgesById.values()].slice(0, 20),
			callChain,
			dependencies,
		}
	}

	/**
	 * Depth-first walk over incoming CALLS edges. The result starts with the node
	 * itself (when it exists) followed by its callers, down to `maxDepth` levels.
	 */
	private async buildCallChain(nodeId: string, maxDepth: number): Promise<CodeGraphNode[]> {
		const chain: CodeGraphNode[] = []
		const visited = new Set<string>()

		const walkCallers = async (currentId: string, depth: number): Promise<void> => {
			if (depth > maxDepth || visited.has(currentId)) return
			visited.add(currentId)

			const current = await this.graphIndex.getNode(currentId)
			if (!current) return
			chain.push(current)

			const callEdges = await this.graphIndex.getEdges(currentId, EdgeType.CALLS)
			for (const edge of callEdges) {
				// Only edges pointing at the current node describe a caller.
				if (edge.target === currentId) {
					await walkCallers(edge.source, depth + 1)
				}
			}
		}

		await walkCallers(nodeId, 0)
		return chain
	}

	/**
	 * Lists the nodes that `nodeId` depends on, directly or transitively, up to
	 * `maxDepth` hops, following outgoing IMPORTS / DEPENDS_ON / USES edges.
	 *
	 * The walk is breadth-first so the hop count of a node does not depend on
	 * edge order, and only outgoing edges are followed: an incoming IMPORTS edge
	 * identifies a dependent, not a dependency. Each node appears once.
	 */
	private async buildDependencyTree(nodeId: string, maxDepth: number): Promise<CodeGraphNode[]> {
		const dependencies: CodeGraphNode[] = []
		const discovered = new Set<string>([nodeId])
		let frontier: string[] = [nodeId]

		for (let depth = 0; depth < maxDepth && frontier.length > 0; depth++) {
			const nextFrontier: string[] = []
			for (const currentId of frontier) {
				const edges = await this.graphIndex.getEdges(currentId)
				for (const edge of edges) {
					const isDependencyEdge =
						edge.type === EdgeType.IMPORTS ||
						edge.type === EdgeType.DEPENDS_ON ||
						edge.type === EdgeType.USES
					if (!isDependencyEdge || edge.source !== currentId) continue
					if (discovered.has(edge.target)) continue
					discovered.add(edge.target)

					const dependency = await this.graphIndex.getNode(edge.target)
					if (!dependency) continue
					dependencies.push(dependency)
					nextFrontier.push(edge.target)
				}
			}
			frontier = nextFrontier
		}

		return dependencies
	}

	/**
	 * Finds indexed chunks of `filePath` whose line range contains `line`.
	 *
	 * This is a stop-gap that queries the vector store with an all-zero vector
	 * and filters the (at most 100) hits by payload; the produced nodes are
	 * always typed FUNCTION and named `<filePath>:<line>`.
	 * NOTE(review): a direct file/line query against the graph store would be
	 * more reliable than a zero-vector similarity search.
	 */
	private async searchNodesByLocation(filePath: string, line: number): Promise<CodeGraphNode[]> {
		const hits = await this.vectorStore.search(
			new Array(ContextAwareSearchService.LOCATION_PROBE_VECTOR_SIZE).fill(0), // dummy embedding
			filePath,
			0, // no minimum score
			100, // fetch generously, filtering happens below
		)

		const matches: CodeGraphNode[] = []
		for (const hit of hits) {
			const payload = hit.payload
			if (!payload || payload.filePath !== filePath) continue
			if (!(payload.startLine <= line && payload.endLine >= line)) continue

			matches.push({
				id: hit.id as string,
				type: CodeNodeType.FUNCTION, // the payload carries no node type
				name: `${filePath}:${line}`,
				filePath,
				startLine: payload.startLine,
				endLine: payload.endLine,
				content: payload.codeChunk,
				metadata: {},
			})
		}
		return matches
	}

	/**
	 * Cosine similarity of two vectors mapped from [-1, 1] to [0, 1].
	 * Returns 0 for empty vectors, vectors of different length, or a zero vector.
	 */
	private calculateSimilarityScore(embedding1: number[], embedding2: number[]): number {
		if (embedding1.length === 0 || embedding1.length !== embedding2.length) {
			return 0
		}

		let dot = 0
		let squaredNorm1 = 0
		let squaredNorm2 = 0
		embedding1.forEach((value, i) => {
			const other = embedding2[i] ?? 0
			dot += value * other
			squaredNorm1 += value * value
			squaredNorm2 += other * other
		})

		const denominator = Math.sqrt(squaredNorm1) * Math.sqrt(squaredNorm2)
		return denominator === 0 ? 0 : (dot / denominator + 1) / 2
	}

	/**
	 * Orders nodes by a fixed node-type priority (classes first, files last).
	 * Returns a new array; `nodes` is left untouched. `sourceNodeId` is currently
	 * not used by the ranking.
	 */
	private rankRelatedNodes(nodes: CodeGraphNode[], sourceNodeId: string): CodeGraphNode[] {
		const typePriority: Partial<Record<CodeNodeType, number>> = {
			[CodeNodeType.CLASS]: 10,
			[CodeNodeType.INTERFACE]: 9,
			[CodeNodeType.FUNCTION]: 8,
			[CodeNodeType.METHOD]: 7,
			[CodeNodeType.TYPE_ALIAS]: 6,
			[CodeNodeType.ENUM]: 5,
			[CodeNodeType.CONSTANT]: 4,
			[CodeNodeType.VARIABLE]: 3,
			[CodeNodeType.MODULE]: 2,
			[CodeNodeType.IMPORT]: 1,
			[CodeNodeType.EXPORT]: 1,
			[CodeNodeType.FILE]: 0,
			[CodeNodeType.NAMESPACE]: 2,
		}

		return [...nodes].sort((a, b) => (typePriority[b.type] ?? 0) - (typePriority[a.type] ?? 0))
	}
}
/**
 * Graph index backed by two Qdrant collections: one for nodes (vector =
 * node embedding) and one for edges (small placeholder vector, data in the
 * payload). Collection names are derived from a SHA-256 hash of the workspace
 * path.
 */
export class GraphIndexStore implements IGraphIndex {
	/** Page size used when scrolling through the edges of a node. */
	private static readonly EDGE_PAGE_SIZE = 1000

	private client: QdrantClient
	private readonly nodesCollectionName: string
	private readonly edgesCollectionName: string
	private readonly vectorSize: number
	private readonly workspacePath: string

	/**
	 * @param workspacePath Workspace root; hashed to name the two collections.
	 * @param qdrantUrl Qdrant endpoint. Parsed into host/port/prefix when it is a
	 *   valid URL, otherwise handed to the client unchanged.
	 * @param vectorSize Dimension of the node embeddings.
	 * @param apiKey Optional Qdrant API key.
	 */
	constructor(workspacePath: string, qdrantUrl: string, vectorSize: number, apiKey?: string) {
		this.workspacePath = workspacePath
		this.vectorSize = vectorSize

		const workspaceHash = createHash("sha256").update(workspacePath).digest("hex").substring(0, 16)
		this.nodesCollectionName = `graph-nodes-${workspaceHash}`
		this.edgesCollectionName = `graph-edges-${workspaceHash}`

		const headers = { "User-Agent": "Roo-Code-GraphIndex" }
		try {
			const parsed = new URL(qdrantUrl)
			const useHttps = parsed.protocol === "https:"
			// Without an explicit port fall back to the protocol default.
			const port = parsed.port ? Number(parsed.port) : useHttps ? 443 : 80

			this.client = new QdrantClient({
				host: parsed.hostname,
				https: useHttps,
				port,
				prefix: parsed.pathname === "/" ? undefined : parsed.pathname.replace(/\/+$/, ""),
				apiKey,
				headers,
			})
		} catch {
			// Not parseable as a URL: let the client interpret the raw value.
			this.client = new QdrantClient({ url: qdrantUrl, apiKey, headers })
		}
	}

	/**
	 * Creates the node and edge collections (tolerating "already exists") and
	 * then the payload indexes.
	 */
	async initialize(): Promise<void> {
		await this.ensureCollection(this.nodesCollectionName, {
			vectors: {
				size: this.vectorSize,
				distance: "Cosine",
				on_disk: true,
			},
			hnsw_config: {
				m: 64,
				ef_construct: 512,
				on_disk: true,
			},
		})

		// Edges carry their data in the payload; the 4-dim vector is a placeholder.
		await this.ensureCollection(this.edgesCollectionName, {
			vectors: {
				size: 4,
				distance: "Cosine",
				on_disk: true,
			},
		})

		await this.createIndexes()
	}

	/** Creates a collection; an error whose message contains "already exists" is ignored. */
	private async ensureCollection(
		name: string,
		config: Parameters<QdrantClient["createCollection"]>[1],
	): Promise<void> {
		try {
			await this.client.createCollection(name, config)
		} catch (error: unknown) {
			const message =
				typeof error === "object" && error !== null && "message" in error
					? String((error as { message: unknown }).message)
					: ""
			if (!message.includes("already exists")) {
				throw error
			}
		}
	}

	/**
	 * Creates payload indexes for the fields used in filters. Line numbers are
	 * indexed as integers and the edge weight as float; everything else as keyword.
	 * Failures are logged and otherwise ignored (best effort, e.g. index already present).
	 */
	private async createIndexes(): Promise<void> {
		const nodeIndexes = [
			["type", "keyword"],
			["name", "keyword"],
			["filePath", "keyword"],
			["startLine", "integer"],
			["endLine", "integer"],
		] as const
		const edgeIndexes = [
			["source", "keyword"],
			["target", "keyword"],
			["type", "keyword"],
			["weight", "float"],
		] as const

		for (const [fieldName, fieldSchema] of nodeIndexes) {
			await this.tryCreateIndex(this.nodesCollectionName, fieldName, fieldSchema)
		}
		for (const [fieldName, fieldSchema] of edgeIndexes) {
			await this.tryCreateIndex(this.edgesCollectionName, fieldName, fieldSchema)
		}
	}

	/** Creates one payload index, logging instead of throwing on failure. */
	private async tryCreateIndex(
		collection: string,
		fieldName: string,
		fieldSchema: "keyword" | "integer" | "float",
	): Promise<void> {
		try {
			await this.client.createPayloadIndex(collection, {
				field_name: fieldName,
				field_schema: fieldSchema,
			})
		} catch (error: unknown) {
			console.warn(`[GraphIndexStore] Could not create payload index "${fieldName}" on ${collection}:`, error)
		}
	}

	/** Upserts a node; nodes without an embedding are stored with an all-zero vector. */
	async addNode(node: CodeGraphNode): Promise<void> {
		await this.client.upsert(this.nodesCollectionName, {
			points: [
				{
					id: node.id,
					vector: node.embedding || new Array(this.vectorSize).fill(0),
					payload: {
						type: node.type,
						name: node.name,
						filePath: node.filePath,
						startLine: node.startLine,
						endLine: node.endLine,
						content: node.content,
						metadata: node.metadata || {},
					},
				},
			],
			wait: true,
		})
	}

	/** Upserts an edge; the weight doubles as the first component of the placeholder vector. */
	async addEdge(edge: CodeGraphEdge): Promise<void> {
		await this.client.upsert(this.edgesCollectionName, {
			points: [
				{
					id: edge.id,
					vector: [edge.weight, 0, 0, 0],
					payload: {
						source: edge.source,
						target: edge.target,
						type: edge.type,
						weight: edge.weight,
						metadata: edge.metadata || {},
					},
				},
			],
			wait: true,
		})
	}

	/**
	 * Loads a node including its embedding.
	 * @returns The node, or `null` when it does not exist or the lookup fails.
	 */
	async getNode(nodeId: string): Promise<CodeGraphNode | null> {
		try {
			const points = await this.client.retrieve(this.nodesCollectionName, {
				ids: [nodeId],
				with_payload: true,
				with_vector: true, // vectors are not returned unless requested
			})
			const point = points[0]
			return point ? GraphIndexStore.toGraphNode(nodeId, point) : null
		} catch {
			// Lookup failures are deliberately reported as "not found".
			return null
		}
	}

	/**
	 * Returns every edge that starts or ends at `nodeId`, optionally restricted
	 * to one edge type. Results are fetched page by page so nodes with more than
	 * one page of edges are not truncated.
	 */
	async getEdges(nodeId: string, edgeType?: EdgeType): Promise<CodeGraphEdge[]> {
		const filter = {
			should: [
				{ key: "source", match: { value: nodeId } },
				{ key: "target", match: { value: nodeId } },
			],
			...(edgeType ? { must: [{ key: "type", match: { value: edgeType } }] } : {}),
		}

		const edges: CodeGraphEdge[] = []
		let offset: string | number | undefined
		for (;;) {
			const page = await this.client.scroll(this.edgesCollectionName, {
				filter,
				limit: GraphIndexStore.EDGE_PAGE_SIZE,
				offset,
				with_payload: true,
			})

			for (const point of page.points) {
				edges.push({
					id: point.id as string,
					source: point.payload?.source as string,
					target: point.payload?.target as string,
					type: point.payload?.type as EdgeType,
					weight: point.payload?.weight as number,
					metadata: point.payload?.metadata as Record<string, any>,
				})
			}

			const next = page.next_page_offset
			if (page.points.length === 0 || (typeof next !== "string" && typeof next !== "number")) {
				break
			}
			offset = next
		}
		return edges
	}

	/**
	 * Breadth-first traversal returning the nodes reachable from `nodeId` within
	 * `depth` hops (edges are followed in both directions). The start node is
	 * not part of the result.
	 */
	async getConnectedNodes(nodeId: string, edgeType?: EdgeType, depth: number = 1): Promise<CodeGraphNode[]> {
		const visited = new Set<string>()
		const connected: CodeGraphNode[] = []
		const queue: { id: string; hops: number }[] = [{ id: nodeId, hops: 0 }]

		// for...of keeps iterating over entries appended during the loop.
		for (const { id, hops } of queue) {
			if (visited.has(id) || hops > depth) continue
			visited.add(id)

			// The start node is never returned, so it does not need to be loaded.
			if (hops > 0) {
				const node = await this.getNode(id)
				if (node) connected.push(node)
			}

			if (hops < depth) {
				for (const edge of await this.getEdges(id, edgeType)) {
					const neighbourId = edge.source === id ? edge.target : edge.source
					if (!visited.has(neighbourId)) {
						queue.push({ id: neighbourId, hops: hops + 1 })
					}
				}
			}
		}

		return connected
	}

	/**
	 * Nearest-neighbour search over node embeddings, optionally restricted to
	 * one node type. Returned nodes include their stored embedding.
	 */
	async searchSimilarNodes(
		embedding: number[],
		limit: number = 10,
		nodeType?: CodeNodeType,
	): Promise<CodeGraphNode[]> {
		const filter = nodeType ? { must: [{ key: "type", match: { value: nodeType } }] } : undefined

		const result = await this.client.query(this.nodesCollectionName, {
			query: embedding,
			filter,
			limit,
			with_payload: true,
			with_vector: true, // callers re-score hits against the stored embedding
		})

		return result.points.map((point) => GraphIndexStore.toGraphNode(point.id as string, point))
	}

	/**
	 * Returns the start node (first, when it exists), every node within `depth`
	 * hops, and the de-duplicated edges whose both endpoints are in that set.
	 */
	async getSubgraph(nodeId: string, depth: number): Promise<{ nodes: CodeGraphNode[]; edges: CodeGraphEdge[] }> {
		const nodes = await this.getConnectedNodes(nodeId, undefined, depth)
		const memberIds = new Set([nodeId, ...nodes.map((n) => n.id)])

		const edgesById = new Map<string, CodeGraphEdge>()
		for (const id of memberIds) {
			for (const edge of await this.getEdges(id)) {
				if (memberIds.has(edge.source) && memberIds.has(edge.target)) {
					edgesById.set(edge.id, edge)
				}
			}
		}

		const rootNode = await this.getNode(nodeId)
		if (rootNode) {
			nodes.unshift(rootNode)
		}

		return { nodes, edges: Array.from(edgesById.values()) }
	}

	/** Drops both collections (ignoring failures such as "does not exist") and recreates them. */
	async clear(): Promise<void> {
		for (const collection of [this.nodesCollectionName, this.edgesCollectionName]) {
			try {
				await this.client.deleteCollection(collection)
			} catch {
				// Collection might not exist
			}
		}

		await this.initialize()
	}

	/** Maps a Qdrant point to a graph node; `embedding` is set only when a plain vector was returned. */
	private static toGraphNode(
		id: string,
		point: { payload?: Record<string, unknown> | null; vector?: unknown },
	): CodeGraphNode {
		const payload = point.payload
		return {
			id,
			type: payload?.type as CodeNodeType,
			name: payload?.name as string,
			filePath: payload?.filePath as string,
			startLine: payload?.startLine as number,
			endLine: payload?.endLine as number,
			content: payload?.content as string,
			embedding: Array.isArray(point.vector) ? (point.vector as number[]) : undefined,
			metadata: payload?.metadata as Record<string, any>,
		}
	}

	/**
	 * Deterministic node id: UUIDv5 of `<filePath>-<type>-<name>-<line>`.
	 */
	static generateNodeId(filePath: string, type: string, name: string, line: number): string {
		return uuidv5(`${filePath}-${type}-${name}-${line}`, QDRANT_CODE_BLOCK_NAMESPACE)
	}

	/**
	 * Deterministic edge id: UUIDv5 of `<source>-<target>-<type>`.
	 */
	static generateEdgeId(source: string, target: string, type: EdgeType): string {
		return uuidv5(`${source}-${target}-${type}`, QDRANT_CODE_BLOCK_NAMESPACE)
	}
}
/**
 * Builds the graph fragment for one parsed file.
 *
 * A FILE node is always created first (named after the file's basename, with
 * the first 500 characters of `content` as preview); the language-specific
 * extractor then appends further nodes and edges. Paths stored on nodes are
 * relative to the workspace root.
 *
 * @param tree Parsed syntax tree; only `tree.rootNode` is read.
 * @param filePath Absolute path of the file.
 * @param content Full text of the file.
 * @param language Language identifier selecting the extractor; unknown values
 *   use the generic extractor.
 * @returns The collected nodes (file node first) and edges.
 */
extractFromAST(
	tree: any,
	filePath: string,
	content: string,
	language: string,
): { nodes: CodeGraphNode[]; edges: CodeGraphEdge[] } {
	const relativePath = path.relative(this.workspacePath, filePath)
	const fileNodeId = GraphIndexStore.generateNodeId(relativePath, "file", relativePath, 0)
	const lineCount = content.split("\n").length

	const nodes: CodeGraphNode[] = [
		{
			id: fileNodeId,
			type: CodeNodeType.FILE,
			name: path.basename(filePath),
			filePath: relativePath,
			startLine: 1,
			endLine: lineCount,
			content: content.substring(0, 500), // preview only
			metadata: {
				language,
				fullPath: filePath,
			},
		},
	]
	const edges: CodeGraphEdge[] = []

	// Pick the extractor for the language; anything unlisted is handled generically.
	if (language === "typescript" || language === "tsx" || language === "javascript" || language === "jsx") {
		this.extractTypeScriptRelationships(tree.rootNode, relativePath, fileNodeId, nodes, edges)
	} else if (language === "python") {
		this.extractPythonRelationships(tree.rootNode, relativePath, fileNodeId, nodes, edges)
	} else if (language === "java") {
		this.extractJavaRelationships(tree.rootNode, relativePath, fileNodeId, nodes, edges)
	} else if (language === "go") {
		this.extractGoRelationships(tree.rootNode, relativePath, fileNodeId, nodes, edges)
	} else if (language === "rust") {
		this.extractRustRelationships(tree.rootNode, relativePath, fileNodeId, nodes, edges)
	} else {
		this.extractGenericRelationships(tree.rootNode, relativePath, fileNodeId, nodes, edges)
	}

	return { nodes, edges }
}
/**
 * Walks a TypeScript/JavaScript syntax tree and appends graph nodes and edges
 * for imports, classes, interfaces, functions, methods, type aliases, enums
 * and exports.
 *
 * Containment: classes, interfaces, type aliases and enums are attached to the
 * file node; functions to the nearest enclosing recorded element (or the
 * file); methods to the nearest enclosing recorded element and skipped when
 * there is none. EXTENDS / IMPLEMENTS edges use the referenced *name* as
 * target and are flagged `metadata.unresolved`, because the target node id is
 * not known here.
 *
 * @param node Root syntax node of the file.
 * @param filePath Workspace-relative path stored on every created node.
 * @param fileNodeId Id of the file node that owns top-level elements.
 * @param nodes Output array; new nodes are appended.
 * @param edges Output array; new edges are appended.
 */
private extractTypeScriptRelationships(
	node: Node,
	filePath: string,
	fileNodeId: string,
	nodes: CodeGraphNode[],
	edges: CodeGraphEdge[],
): void {
	const PREVIEW_LENGTH = 500
	const preview = (ast: Node): string => ast.text.substring(0, PREVIEW_LENGTH)
	const hasChildOfType = (ast: Node, type: string): boolean =>
		ast.children.some((c) => c !== null && c.type === type)

	// Appends one node plus the edge from its container; returns the new node id.
	const addElement = (element: {
		kind: string
		type: CodeNodeType
		name: string
		ast: Node
		content: string
		metadata: Record<string, unknown>
		containerId: string
		edgeType: EdgeType
	}): string => {
		const startLine = element.ast.startPosition.row + 1
		const id = GraphIndexStore.generateNodeId(filePath, element.kind, element.name, startLine)
		nodes.push({
			id,
			type: element.type,
			name: element.name,
			filePath,
			startLine,
			endLine: element.ast.endPosition.row + 1,
			content: element.content,
			metadata: element.metadata,
		})
		edges.push({
			id: GraphIndexStore.generateEdgeId(element.containerId, id, element.edgeType),
			source: element.containerId,
			target: id,
			type: element.edgeType,
			weight: 1.0,
		})
		return id
	}

	// Edge whose target is a symbol name that still has to be resolved to a node id.
	const addUnresolvedEdge = (sourceId: string, targetName: string, type: EdgeType): void => {
		edges.push({
			id: GraphIndexStore.generateEdgeId(sourceId, targetName, type),
			source: sourceId,
			target: targetName,
			type,
			weight: 1.0,
			metadata: { unresolved: true },
		})
	}

	// Name of a type mentioned in a heritage clause, or undefined for keywords/punctuation.
	const referencedTypeName = (ast: Node): string | undefined => {
		switch (ast.type) {
			case "identifier":
			case "type_identifier":
				return ast.text
			case "generic_type":
				// assumes the grammar exposes the base type as field "name" - TODO confirm
				return ast.childForFieldName("name")?.text
			default:
				return undefined
		}
	}

	const recordHeritage = (classAst: Node, classId: string): void => {
		// The tree-sitter grammars attach the heritage as a plain `class_heritage`
		// child rather than a named field, so look it up by node type as well.
		const heritage =
			classAst.childForFieldName("heritage") ??
			classAst.children.find((c) => c !== null && c.type === "class_heritage")
		if (!heritage) return

		for (const clause of heritage.children) {
			if (!clause) continue
			if (clause.type === "extends_clause") {
				const superClass = clause.children.find((c) => c !== null && c.type === "identifier")?.text
				if (superClass) addUnresolvedEdge(classId, superClass, EdgeType.EXTENDS)
			} else if (clause.type === "implements_clause") {
				for (const implemented of clause.children) {
					const interfaceName = implemented ? referencedTypeName(implemented) : undefined
					if (interfaceName) addUnresolvedEdge(classId, interfaceName, EdgeType.IMPLEMENTS)
				}
			} else if (clause.type === "identifier") {
				// JavaScript grammar: `extends <expression>` directly below class_heritage.
				addUnresolvedEdge(classId, clause.text, EdgeType.EXTENDS)
			}
		}
	}

	// Declared name, else the variable the function is assigned to, else a line-based placeholder.
	const functionNameOf = (fn: Node): string => {
		const declared = fn.childForFieldName("name")?.text
		if (declared) return declared

		const parent = fn.parent
		if (parent && parent.type === "variable_declarator" && parent.childForFieldName("value")?.id === fn.id) {
			const binding = parent.childForFieldName("name")
			if (binding && binding.type === "identifier" && binding.text) return binding.text
		}
		return `anonymous_${fn.startPosition.row + 1}`
	}

	// Top-level declarations that only differ in kind/type and content handling.
	const addFileLevelDeclaration = (
		ast: Node,
		kind: string,
		type: CodeNodeType,
		content: string,
	): string | undefined => {
		const name = ast.childForFieldName("name")?.text
		if (!name) return undefined
		return addElement({
			kind,
			type,
			name,
			ast,
			content,
			metadata: {},
			containerId: fileNodeId,
			edgeType: EdgeType.CONTAINS,
		})
	}

	const visit = (current: Node, parentNodeId?: string): void => {
		let currentNodeId: string | undefined

		switch (current.type) {
			case "import_statement": {
				const source = current.childForFieldName("source")?.text?.replace(/['"]/g, "")
				if (source) {
					addElement({
						kind: "import",
						type: CodeNodeType.IMPORT,
						name: source,
						ast: current,
						content: current.text,
						metadata: { source },
						containerId: fileNodeId,
						edgeType: EdgeType.IMPORTS,
					})
				}
				break
			}

			case "class_declaration": {
				currentNodeId = addFileLevelDeclaration(current, "class", CodeNodeType.CLASS, preview(current))
				if (currentNodeId) recordHeritage(current, currentNodeId)
				break
			}

			case "interface_declaration":
				currentNodeId = addFileLevelDeclaration(
					current,
					"interface",
					CodeNodeType.INTERFACE,
					preview(current),
				)
				break

			case "function_declaration":
			case "arrow_function":
			case "function_expression":
				currentNodeId = addElement({
					kind: "function",
					type: CodeNodeType.FUNCTION,
					name: functionNameOf(current),
					ast: current,
					content: preview(current),
					metadata: {
						isAsync: hasChildOfType(current, "async"),
						isArrow: current.type === "arrow_function",
					},
					containerId: parentNodeId || fileNodeId,
					edgeType: EdgeType.CONTAINS,
				})
				break

			case "method_definition": {
				const nameNode = current.childForFieldName("name")
				const methodName = nameNode?.text
				if (nameNode && methodName && parentNodeId) {
					// `private` is wrapped in an accessibility_modifier node; `#name` members
					// are private by syntax.
					const isPrivate =
						nameNode.type === "private_property_identifier" ||
						current.children.some(
							(c) =>
								c !== null &&
								(c.type === "private" ||
									(c.type === "accessibility_modifier" && c.text === "private")),
						)
					currentNodeId = addElement({
						kind: "method",
						type: CodeNodeType.METHOD,
						name: methodName,
						ast: current,
						content: preview(current),
						metadata: {
							isStatic: hasChildOfType(current, "static"),
							isPrivate,
							isAsync: hasChildOfType(current, "async"),
						},
						containerId: parentNodeId,
						edgeType: EdgeType.CONTAINS,
					})
				}
				break
			}

			case "type_alias_declaration":
				// Type aliases keep their full text.
				currentNodeId = addFileLevelDeclaration(current, "type_alias", CodeNodeType.TYPE_ALIAS, current.text)
				break

			case "enum_declaration":
				currentNodeId = addFileLevelDeclaration(current, "enum", CodeNodeType.ENUM, preview(current))
				break

			case "export_statement": {
				const declaration = current.childForFieldName("declaration")
				if (declaration) {
					// Exports do not become a container for their children.
					addElement({
						kind: "export",
						type: CodeNodeType.EXPORT,
						name:
							declaration.childForFieldName("name")?.text ||
							`export_${current.startPosition.row + 1}`,
						ast: current,
						content: current.text,
						metadata: { isDefault: hasChildOfType(current, "default") },
						containerId: fileNodeId,
						edgeType: EdgeType.EXPORTS,
					})
				}
				break
			}
		}

		// Children inherit the nearest recorded element as their container.
		for (const child of current.children) {
			if (child) {
				visit(child, currentNodeId || parentNodeId)
			}
		}
	}

	visit(node)
}
/**
 * Walks a Python syntax tree and appends graph nodes and edges for imports,
 * classes and functions.
 *
 * Classes are attached to the file node. A function is recorded as METHOD
 * only when its nearest recorded enclosing element is a class; otherwise it
 * is a FUNCTION (including functions nested in other functions). Base classes
 * produce EXTENDS edges whose target is the base class *name*, flagged
 * `metadata.unresolved`.
 *
 * @param node Root syntax node of the file.
 * @param filePath Workspace-relative path stored on every created node.
 * @param fileNodeId Id of the file node that owns top-level elements.
 * @param nodes Output array; new nodes are appended.
 * @param edges Output array; new edges are appended.
 */
private extractPythonRelationships(
	node: Node,
	filePath: string,
	fileNodeId: string,
	nodes: CodeGraphNode[],
	edges: CodeGraphEdge[],
): void {
	// Nearest recorded enclosing element and whether it is a class.
	type Scope = { id: string; isClass: boolean }

	const startLineOf = (ast: Node): number => ast.startPosition.row + 1

	// Appends one node plus the edge from its container; returns the new node id.
	const addElement = (
		kind: string,
		type: CodeNodeType,
		name: string,
		ast: Node,
		content: string,
		metadata: Record<string, unknown>,
		containerId: string,
		edgeType: EdgeType,
	): string => {
		const id = GraphIndexStore.generateNodeId(filePath, kind, name, startLineOf(ast))
		nodes.push({
			id,
			type,
			name,
			filePath,
			startLine: startLineOf(ast),
			endLine: ast.endPosition.row + 1,
			content,
			metadata,
		})
		edges.push({
			id: GraphIndexStore.generateEdgeId(containerId, id, edgeType),
			source: containerId,
			target: id,
			type: edgeType,
			weight: 1.0,
		})
		return id
	}

	// Module named by an import statement.
	const importedModuleOf = (statement: Node): string | undefined => {
		// `from X import ...`: prefer the module itself; the first dotted_name child
		// would be an imported symbol when X is a relative import such as `.`.
		// assumes the grammar exposes it as field "module_name" - TODO confirm
		const fromModule = statement.childForFieldName("module_name")?.text
		if (fromModule) return fromModule

		const dotted = statement.children.find((c) => c !== null && c.type === "dotted_name")?.text
		if (dotted) return dotted

		// `import a.b as c` wraps the module in an aliased_import node.
		const aliased = statement.children.find((c) => c !== null && c.type === "aliased_import")
		return aliased?.childForFieldName("name")?.text
	}

	const visit = (current: Node, scope?: Scope): void => {
		let ownScope: Scope | undefined

		switch (current.type) {
			case "import_statement":
			case "import_from_statement": {
				const moduleName = importedModuleOf(current)
				if (moduleName) {
					addElement(
						"import",
						CodeNodeType.IMPORT,
						moduleName,
						current,
						current.text,
						{},
						fileNodeId,
						EdgeType.IMPORTS,
					)
				}
				break
			}

			case "class_definition": {
				const className = current.childForFieldName("name")?.text
				if (!className) break

				const classId = addElement(
					"class",
					CodeNodeType.CLASS,
					className,
					current,
					current.text.substring(0, 500),
					{},
					fileNodeId,
					EdgeType.CONTAINS,
				)
				ownScope = { id: classId, isClass: true }

				const superclasses = current.childForFieldName("superclasses")
				for (const base of superclasses?.children ?? []) {
					if (!base || base.type !== "identifier" || !base.text) continue
					edges.push({
						id: GraphIndexStore.generateEdgeId(classId, base.text, EdgeType.EXTENDS),
						source: classId,
						target: base.text, // symbol name, not a node id
						type: EdgeType.EXTENDS,
						weight: 1.0,
						metadata: { unresolved: true },
					})
				}
				break
			}

			case "function_definition": {
				const functionName = current.childForFieldName("name")?.text
				if (!functionName) break

				const isMethod = scope?.isClass === true
				const functionId = addElement(
					isMethod ? "method" : "function",
					isMethod ? CodeNodeType.METHOD : CodeNodeType.FUNCTION,
					functionName,
					current,
					current.text.substring(0, 500),
					{ isAsync: current.children.some((c) => c !== null && c.type === "async") },
					scope?.id || fileNodeId,
					EdgeType.CONTAINS,
				)
				ownScope = { id: functionId, isClass: false }
				break
			}
		}

		// Children inherit the nearest recorded element as their scope.
		for (const child of current.children) {
			if (child) {
				visit(child, ownScope ?? scope)
			}
		}
	}

	visit(node)
}
target: superClass.text, + type: EdgeType.EXTENDS, + weight: 1.0, + metadata: { unresolved: true }, + }) + } + }) + } + } + } + + // Extract functions + if (currentNode.type === "function_definition") { + const functionName = currentNode.childForFieldName("name")?.text + if (functionName) { + const nodeType = parentNodeId ? CodeNodeType.METHOD : CodeNodeType.FUNCTION + currentNodeId = GraphIndexStore.generateNodeId( + filePath, + nodeType === CodeNodeType.METHOD ? "method" : "function", + functionName, + currentNode.startPosition.row + 1, + ) + const functionNode: CodeGraphNode = { + id: currentNodeId, + type: nodeType, + name: functionName, + filePath, + startLine: currentNode.startPosition.row + 1, + endLine: currentNode.endPosition.row + 1, + content: currentNode.text.substring(0, 500), + metadata: { + isAsync: currentNode.children.some((c) => c && c.type === "async"), + }, + } + nodes.push(functionNode) + + const containerId = parentNodeId || fileNodeId + edges.push({ + id: GraphIndexStore.generateEdgeId(containerId, currentNodeId, EdgeType.CONTAINS), + source: containerId, + target: currentNodeId, + type: EdgeType.CONTAINS, + weight: 1.0, + }) + } + } + + // Recurse through children + for (const child of currentNode.children) { + if (child) { + visit(child, currentNodeId || parentNodeId) + } + } + } + + visit(node) + } + + private extractJavaRelationships( + node: Node, + filePath: string, + fileNodeId: string, + nodes: CodeGraphNode[], + edges: CodeGraphEdge[], + ): void { + // Similar implementation for Java + this.extractGenericRelationships(node, filePath, fileNodeId, nodes, edges) + } + + private extractGoRelationships( + node: Node, + filePath: string, + fileNodeId: string, + nodes: CodeGraphNode[], + edges: CodeGraphEdge[], + ): void { + // Similar implementation for Go + this.extractGenericRelationships(node, filePath, fileNodeId, nodes, edges) + } + + private extractRustRelationships( + node: Node, + filePath: string, + fileNodeId: string, + nodes: 
CodeGraphNode[], + edges: CodeGraphEdge[], + ): void { + // Similar implementation for Rust + this.extractGenericRelationships(node, filePath, fileNodeId, nodes, edges) + } + + private extractGenericRelationships( + node: Node, + filePath: string, + fileNodeId: string, + nodes: CodeGraphNode[], + edges: CodeGraphEdge[], + ): void { + // Generic extraction for functions and classes + const visit = (currentNode: Node) => { + // Look for function-like constructs + if (currentNode.type.includes("function") || currentNode.type.includes("method")) { + const name = currentNode.childForFieldName("name")?.text || `func_${currentNode.startPosition.row + 1}` + const nodeId = GraphIndexStore.generateNodeId( + filePath, + "function", + name, + currentNode.startPosition.row + 1, + ) + const functionNode: CodeGraphNode = { + id: nodeId, + type: CodeNodeType.FUNCTION, + name, + filePath, + startLine: currentNode.startPosition.row + 1, + endLine: currentNode.endPosition.row + 1, + content: currentNode.text.substring(0, 500), + metadata: {}, + } + nodes.push(functionNode) + + edges.push({ + id: GraphIndexStore.generateEdgeId(fileNodeId, nodeId, EdgeType.CONTAINS), + source: fileNodeId, + target: nodeId, + type: EdgeType.CONTAINS, + weight: 1.0, + }) + } + + // Look for class-like constructs + if (currentNode.type.includes("class") || currentNode.type.includes("struct")) { + const name = currentNode.childForFieldName("name")?.text || `class_${currentNode.startPosition.row + 1}` + const nodeId = GraphIndexStore.generateNodeId( + filePath, + "class", + name, + currentNode.startPosition.row + 1, + ) + const classNode: CodeGraphNode = { + id: nodeId, + type: CodeNodeType.CLASS, + name, + filePath, + startLine: currentNode.startPosition.row + 1, + endLine: currentNode.endPosition.row + 1, + content: currentNode.text.substring(0, 500), + metadata: {}, + } + nodes.push(classNode) + + edges.push({ + id: GraphIndexStore.generateEdgeId(fileNodeId, nodeId, EdgeType.CONTAINS), + source: 
fileNodeId, + target: nodeId, + type: EdgeType.CONTAINS, + weight: 1.0, + }) + } + + // Recurse through children + for (const child of currentNode.children) { + if (child) { + visit(child) + } + } + } + + visit(node) + } +} diff --git a/src/services/code-index/interfaces/graph-index.ts b/src/services/code-index/interfaces/graph-index.ts new file mode 100644 index 0000000000..538310e009 --- /dev/null +++ b/src/services/code-index/interfaces/graph-index.ts @@ -0,0 +1,160 @@ +/** + * Graph-based code indexing interfaces for enhanced codebase understanding + */ + +/** + * Represents a node in the code graph + */ +export interface CodeGraphNode { + id: string + type: CodeNodeType + name: string + filePath: string + startLine: number + endLine: number + content: string + embedding?: number[] + metadata: Record +} + +/** + * Types of nodes in the code graph + */ +export enum CodeNodeType { + FILE = "file", + CLASS = "class", + INTERFACE = "interface", + FUNCTION = "function", + METHOD = "method", + VARIABLE = "variable", + IMPORT = "import", + EXPORT = "export", + MODULE = "module", + NAMESPACE = "namespace", + TYPE_ALIAS = "type_alias", + ENUM = "enum", + CONSTANT = "constant", +} + +/** + * Represents an edge/relationship between code nodes + */ +export interface CodeGraphEdge { + id: string + source: string // Node ID + target: string // Node ID + type: EdgeType + weight: number // Relationship strength (0-1) + metadata?: Record +} + +/** + * Types of relationships between code nodes + */ +export enum EdgeType { + CONTAINS = "contains", // File contains class/function + IMPORTS = "imports", // Module imports another + EXPORTS = "exports", // Module exports symbol + EXTENDS = "extends", // Class inheritance + IMPLEMENTS = "implements", // Interface implementation + CALLS = "calls", // Function calls another + REFERENCES = "references", // Variable/type reference + DEFINES = "defines", // File defines symbol + USES = "uses", // Generic usage relationship + OVERRIDES = 
"overrides", // Method override + DECORATES = "decorates", // Decorator relationship + DEPENDS_ON = "depends_on", // Dependency relationship +} + +/** + * Graph-based index interface + */ +export interface IGraphIndex { + /** + * Add a node to the graph + */ + addNode(node: CodeGraphNode): Promise + + /** + * Add an edge to the graph + */ + addEdge(edge: CodeGraphEdge): Promise + + /** + * Get a node by ID + */ + getNode(nodeId: string): Promise + + /** + * Get edges for a node + */ + getEdges(nodeId: string, edgeType?: EdgeType): Promise + + /** + * Get connected nodes + */ + getConnectedNodes(nodeId: string, edgeType?: EdgeType, depth?: number): Promise + + /** + * Search nodes by similarity + */ + searchSimilarNodes(embedding: number[], limit?: number, nodeType?: CodeNodeType): Promise + + /** + * Get subgraph around a node + */ + getSubgraph( + nodeId: string, + depth: number, + ): Promise<{ + nodes: CodeGraphNode[] + edges: CodeGraphEdge[] + }> + + /** + * Clear the entire graph + */ + clear(): Promise +} + +/** + * Context-aware search result + */ +export interface ContextAwareSearchResult { + node: CodeGraphNode + score: number + context: { + relatedNodes: CodeGraphNode[] + relationships: CodeGraphEdge[] + callChain?: CodeGraphNode[] + dependencies?: CodeGraphNode[] + } +} + +/** + * Enhanced search interface with context awareness + */ +export interface IContextAwareSearch { + /** + * Search with context awareness + */ + searchWithContext( + query: string, + options?: { + includeRelated?: boolean + maxDepth?: number + nodeTypes?: CodeNodeType[] + edgeTypes?: EdgeType[] + }, + ): Promise + + /** + * Get code context for a specific location + */ + getContextForLocation(filePath: string, line: number): Promise + + /** + * Find related code across the codebase + */ + findRelatedCode(nodeId: string, relationshipTypes?: EdgeType[]): Promise +} diff --git a/src/services/code-index/processors/semantic-parser.ts 
// Source path (from the enclosing patch header): src/services/code-index/processors/semantic-parser.ts
import type { Node } from "web-tree-sitter"
import { createHash } from "crypto"
import * as path from "path"
import type { CodeBlock } from "../interfaces"
import { MAX_BLOCK_CHARS, MIN_BLOCK_CHARS, MAX_CHARS_TOLERANCE_FACTOR } from "../constants"

/**
 * Semantic context for code blocks
 */
export interface SemanticContext {
	/** Nested scope, outermost first (e.g., ["ClassName", "methodName"]) */
	scope: string[]
	/** Module specifiers collected from the file's import statements */
	imports: string[]
	/** Names collected from the file's export statements */
	exports: string[]
	/** Module specifiers referenced by `import ... from` / `require(...)` inside the block's own text */
	dependencies: string[]
	/** Branch-count based complexity estimate; starts at 1 */
	complexity: number
	semanticType: SemanticBlockType
}

/**
 * Types of semantic blocks for better chunking
 */
export enum SemanticBlockType {
	CLASS_DEFINITION = "class_definition",
	FUNCTION_DEFINITION = "function_definition",
	METHOD_DEFINITION = "method_definition",
	INTERFACE_DEFINITION = "interface_definition",
	TYPE_DEFINITION = "type_definition",
	IMPORT_BLOCK = "import_block",
	EXPORT_BLOCK = "export_block",
	VARIABLE_DECLARATION = "variable_declaration",
	CONTROL_FLOW = "control_flow",
	DOCUMENTATION = "documentation",
	TEST_CASE = "test_case",
	CONFIGURATION = "configuration",
}

/**
 * Enhanced code block with semantic information
 */
export interface SemanticCodeBlock extends CodeBlock {
	semanticContext: SemanticContext
	/** Segment hash of the enclosing block, when one was recorded */
	parentBlockId?: string
	/** Segment hashes of blocks created with this block as their parent */
	childBlockIds: string[]
	/** Segment hashes of blocks whose identifier appears in this block's content */
	relatedBlockIds: string[]
}

/**
 * Semantic parser that creates intelligent code chunks based on AST analysis
 */
export class SemanticParser {
	/** Blocks registered by `createSemanticBlock` during the current parse, keyed by segment hash. */
	private readonly semanticBlocks: Map<string, SemanticCodeBlock> = new Map()

	/**
	 * Parse file with semantic understanding.
	 *
	 * Resets the internal block registry, hashes `content` with SHA-256 and dispatches
	 * on `language` to the TypeScript/JavaScript, Python or generic strategy.
	 *
	 * @param filePath Path recorded on every produced block
	 * @param content Full text of the file
	 * @param tree Parsed syntax tree; only `tree.rootNode` is read
	 * @param language Language id; "typescript", "tsx", "javascript", "jsx" and "python" are recognised
	 * @returns The semantic blocks extracted from the file
	 */
	async parseWithSemantics(
		filePath: string,
		content: string,
		// Kept as `any` so existing callers are unaffected; only `rootNode` is accessed.
		// eslint-disable-next-line @typescript-eslint/no-explicit-any
		tree: any,
		language: string,
	): Promise<SemanticCodeBlock[]> {
		this.semanticBlocks.clear()
		const fileHash = createHash("sha256").update(content).digest("hex")
		const lines = content.split("\n")

		// Extract semantic blocks based on language
		switch (language) {
			case "typescript":
			case "tsx":
			case "javascript":
			case "jsx":
				return this.parseTypeScriptSemantics(tree.rootNode, filePath, lines, fileHash)
			case "python":
				return this.parsePythonSemantics(tree.rootNode, filePath, lines, fileHash)
			default:
				return this.parseGenericSemantics(tree.rootNode, filePath, lines, fileHash)
		}
	}

	/**
	 * Parse TypeScript/JavaScript with semantic understanding
	 */
	private parseTypeScriptSemantics(
		rootNode: Node,
		filePath: string,
		lines: string[],
		fileHash: string,
	): SemanticCodeBlock[] {
		const blocks: SemanticCodeBlock[] = []
		const imports: string[] = []
		const exports: string[] = []
		const globalScope: string[] = []

		// First pass: collect imports and exports
		this.collectImportsExports(rootNode, imports, exports)

		// Second pass: extract semantic blocks
		this.extractSemanticBlocks(rootNode, filePath, lines, fileHash, blocks, imports, exports, globalScope)

		// Third pass: establish relationships
		this.establishBlockRelationships(blocks)

		// Fourth pass: optimize chunking
		return this.optimizeChunking(blocks)
	}

	/**
	 * Collect imports and exports from AST.
	 *
	 * Appends the unquoted `source` of every `import_statement` to `imports` and the
	 * declared name of every `export_statement` that has a named declaration to `exports`.
	 */
	private collectImportsExports(node: Node, imports: string[], exports: string[]): void {
		const visit = (currentNode: Node): void => {
			if (currentNode.type === "import_statement") {
				const source = currentNode.childForFieldName("source")?.text?.replace(/['"]/g, "")
				if (source) imports.push(source)
			}

			if (currentNode.type === "export_statement") {
				const declaration = currentNode.childForFieldName("declaration")
				const name = declaration?.childForFieldName("name")?.text
				if (name) exports.push(name)
			}

			for (const child of currentNode.children) {
				if (child) visit(child)
			}
		}

		visit(node)
	}

	/**
	 * Extract semantic blocks from AST.
	 *
	 * Walks the tree depth-first. Each recognised declaration (class, function, method,
	 * interface, type alias, test call) becomes one block, or several when its text exceeds
	 * `MAX_BLOCK_CHARS * MAX_CHARS_TOLERANCE_FACTOR`. Declarations shorter than
	 * `MIN_BLOCK_CHARS` produce no block but still extend the scope of their descendants.
	 */
	private extractSemanticBlocks(
		node: Node,
		filePath: string,
		lines: string[],
		fileHash: string,
		blocks: SemanticCodeBlock[],
		imports: string[],
		exports: string[],
		scope: string[],
		parentBlockId?: string,
	): void {
		const visit = (currentNode: Node, currentScope: string[], currentParentId?: string): void => {
			let blockId: string | undefined
			let semanticType: SemanticBlockType | undefined
			let blockName: string | undefined

			// Determine semantic type and extract block
			if (currentNode.type === "class_declaration") {
				semanticType = SemanticBlockType.CLASS_DEFINITION
				blockName = currentNode.childForFieldName("name")?.text
			} else if (currentNode.type === "function_declaration") {
				semanticType = SemanticBlockType.FUNCTION_DEFINITION
				blockName = currentNode.childForFieldName("name")?.text
			} else if (currentNode.type === "method_definition") {
				semanticType = SemanticBlockType.METHOD_DEFINITION
				blockName = currentNode.childForFieldName("name")?.text
			} else if (currentNode.type === "interface_declaration") {
				semanticType = SemanticBlockType.INTERFACE_DEFINITION
				blockName = currentNode.childForFieldName("name")?.text
			} else if (currentNode.type === "type_alias_declaration") {
				semanticType = SemanticBlockType.TYPE_DEFINITION
				blockName = currentNode.childForFieldName("name")?.text
			} else if (this.isTestCase(currentNode)) {
				semanticType = SemanticBlockType.TEST_CASE
				blockName = this.extractTestName(currentNode)
			}

			let childScope = currentScope

			// Create semantic block if applicable
			if (semanticType && blockName) {
				const startLine = currentNode.startPosition.row + 1
				const endLine = currentNode.endPosition.row + 1
				const content = lines.slice(startLine - 1, endLine).join("\n")

				if (content.length > MAX_BLOCK_CHARS * MAX_CHARS_TOLERANCE_FACTOR) {
					// Too large for one chunk: split it; the first piece stands in as the parent
					// of whatever is found further down this subtree.
					const subBlocks = this.splitLargeSemanticBlock(
						currentNode,
						lines,
						filePath,
						fileHash,
						semanticType,
						blockName,
						currentScope,
					)
					blocks.push(...subBlocks)
					blockId = subBlocks[0]?.segmentHash
				} else if (content.length >= MIN_BLOCK_CHARS) {
					blockId = this.createSemanticBlock(
						filePath,
						fileHash,
						blockName,
						semanticType,
						startLine,
						endLine,
						content,
						currentScope,
						imports,
						exports,
						currentParentId,
					)
					blocks.push(this.requireBlock(blockId))
				}

				// Update scope for children
				childScope = [...currentScope, blockName]
			}

			// Recurse through children
			for (const child of currentNode.children) {
				if (child) {
					visit(child, childScope, blockId || currentParentId)
				}
			}
		}

		visit(node, scope, parentBlockId)
	}

	/**
	 * Split large semantic blocks intelligently.
	 *
	 * Classes and interfaces yield a header block (declaration up to the line before the
	 * first member) plus one block per sufficiently long `property_signature`. Any other
	 * semantic type is delegated to `splitFunctionIntoLogicalSections`.
	 *
	 * `method_definition` members are intentionally not emitted here: the caller
	 * (`extractSemanticBlocks`) keeps descending into the node and emits each method itself,
	 * so emitting them here as well would duplicate them.
	 */
	private splitLargeSemanticBlock(
		node: Node,
		lines: string[],
		filePath: string,
		fileHash: string,
		semanticType: SemanticBlockType,
		blockName: string,
		scope: string[],
	): SemanticCodeBlock[] {
		const isTypeContainer =
			semanticType === SemanticBlockType.CLASS_DEFINITION ||
			semanticType === SemanticBlockType.INTERFACE_DEFINITION

		if (!isTypeContainer) {
			// For large functions, split by logical sections
			return this.splitFunctionIntoLogicalSections(node, lines, filePath, fileHash, blockName, scope)
		}

		const blocks: SemanticCodeBlock[] = []
		const startLine = node.startPosition.row + 1

		// Create header block with class/interface signature. `>=` keeps a one-line header
		// when the first member starts on the line right after the declaration.
		const headerEndLine = this.findBlockHeaderEnd(node, startLine)
		if (headerEndLine >= startLine) {
			const headerContent = lines.slice(startLine - 1, headerEndLine).join("\n")
			const headerId = this.createSemanticBlock(
				filePath,
				fileHash,
				`${blockName}_header`,
				semanticType,
				startLine,
				headerEndLine,
				headerContent,
				scope,
				[],
				[],
				undefined,
			)
			blocks.push(this.requireBlock(headerId))
		}

		// Extract property signatures as separate blocks
		const memberScope = [...scope, blockName]
		for (const member of this.getMemberContainer(node).children) {
			if (!member || member.type !== "property_signature") continue

			const memberName = member.childForFieldName("name")?.text
			if (!memberName) continue

			const memberStart = member.startPosition.row + 1
			const memberEnd = member.endPosition.row + 1
			const memberContent = lines.slice(memberStart - 1, memberEnd).join("\n")
			if (memberContent.length < MIN_BLOCK_CHARS) continue

			const memberId = this.createSemanticBlock(
				filePath,
				fileHash,
				memberName,
				SemanticBlockType.METHOD_DEFINITION,
				memberStart,
				memberEnd,
				memberContent,
				memberScope,
				[],
				[],
				undefined,
			)
			blocks.push(this.requireBlock(memberId))
		}

		return blocks
	}

	/**
	 * Return the node whose children are the members of a class/interface declaration.
	 *
	 * Members hang off the declaration's `body` field rather than the declaration itself;
	 * the declaration is used as a fallback when no such field exists.
	 */
	private getMemberContainer(node: Node): Node {
		return node.childForFieldName("body") ?? node
	}

	/**
	 * Find where the header of a block ends (e.g., class signature).
	 *
	 * @returns The 1-based line just before the first method/property/function member,
	 * or `startLine + 1` when no such member exists.
	 */
	private findBlockHeaderEnd(node: Node, startLine: number): number {
		// Find the first method or property
		for (const child of this.getMemberContainer(node).children) {
			if (
				child &&
				(child.type === "method_definition" ||
					child.type === "property_signature" ||
					child.type === "function_declaration")
			) {
				// `row` is 0-based, so it equals the 1-based number of the line before the member.
				return child.startPosition.row
			}
		}
		return startLine + 1 // Default to just the declaration line
	}

	/**
	 * Split a function into logical sections.
	 *
	 * Sections are delimited by the boundaries returned from `identifyLogicalSections`;
	 * pieces shorter than `MIN_BLOCK_CHARS` are dropped. When nothing survives, a single
	 * block covering the whole function is returned instead (that fallback block is not
	 * registered in the internal block map).
	 */
	private splitFunctionIntoLogicalSections(
		node: Node,
		lines: string[],
		filePath: string,
		fileHash: string,
		functionName: string,
		scope: string[],
	): SemanticCodeBlock[] {
		const blocks: SemanticCodeBlock[] = []
		const startLine = node.startPosition.row + 1
		const endLine = node.endPosition.row + 1

		// Identify logical sections (initialization, main logic, return statements)
		const sections = this.identifyLogicalSections(node)

		let currentSectionStart = startLine
		for (const [index, section] of sections.entries()) {
			const sectionEnd = section.endLine
			const sectionContent = lines.slice(currentSectionStart - 1, sectionEnd).join("\n")

			if (sectionContent.length >= MIN_BLOCK_CHARS) {
				const sectionId = this.createSemanticBlock(
					filePath,
					fileHash,
					`${functionName}_section_${index + 1}`,
					SemanticBlockType.FUNCTION_DEFINITION,
					currentSectionStart,
					sectionEnd,
					sectionContent,
					scope,
					[],
					[],
					undefined,
				)
				blocks.push(this.requireBlock(sectionId))
			}

			currentSectionStart = sectionEnd + 1
		}

		// Add remaining content if any
		if (currentSectionStart <= endLine) {
			const remainingContent = lines.slice(currentSectionStart - 1, endLine).join("\n")
			if (remainingContent.length >= MIN_BLOCK_CHARS) {
				const remainingId = this.createSemanticBlock(
					filePath,
					fileHash,
					`${functionName}_end`,
					SemanticBlockType.FUNCTION_DEFINITION,
					currentSectionStart,
					endLine,
					remainingContent,
					scope,
					[],
					[],
					undefined,
				)
				blocks.push(this.requireBlock(remainingId))
			}
		}

		if (blocks.length > 0) {
			return blocks
		}

		return [
			{
				file_path: filePath,
				identifier: functionName,
				type: "function",
				start_line: startLine,
				end_line: endLine,
				content: lines.slice(startLine - 1, endLine).join("\n"),
				segmentHash: createHash("sha256").update(`${filePath}-${functionName}-${startLine}`).digest("hex"),
				fileHash,
				semanticContext: {
					scope,
					imports: [],
					exports: [],
					dependencies: [],
					complexity: this.calculateComplexity(node),
					semanticType: SemanticBlockType.FUNCTION_DEFINITION,
				},
				childBlockIds: [],
				relatedBlockIds: [],
			},
		]
	}

	/**
	 * Identify logical sections within a function.
	 *
	 * A section boundary is recorded at the end of an `if`/`for`/`while`/`try` statement
	 * that finishes more than 10 lines after the previous boundary.
	 *
	 * @returns Boundaries in visiting order; `endLine` is 1-based and inclusive, matching
	 * the line numbering used by `splitFunctionIntoLogicalSections`.
	 */
	private identifyLogicalSections(node: Node): Array<{ type: string; endLine: number }> {
		const sections: Array<{ type: string; endLine: number }> = []
		// Tree-sitter rows are 0-based; convert once so every value here is a 1-based line.
		let lastSignificantLine = node.startPosition.row + 1

		const visit = (currentNode: Node): void => {
			// Look for significant control flow changes
			if (
				currentNode.type === "if_statement" ||
				currentNode.type === "for_statement" ||
				currentNode.type === "while_statement" ||
				currentNode.type === "try_statement"
			) {
				const endLine = currentNode.endPosition.row + 1
				if (endLine - lastSignificantLine > 10) {
					sections.push({ type: "control_flow", endLine })
					lastSignificantLine = endLine
				}
			}

			for (const child of currentNode.children) {
				if (child) visit(child)
			}
		}

		visit(node)
		return sections
	}

	/**
	 * Create a semantic block, register it in the block map and link it to its parent.
	 *
	 * @returns The new block's segment hash: SHA-256 of path, name, start line and content length
	 */
	private createSemanticBlock(
		filePath: string,
		fileHash: string,
		name: string,
		semanticType: SemanticBlockType,
		startLine: number,
		endLine: number,
		content: string,
		scope: string[],
		imports: string[],
		exports: string[],
		parentBlockId?: string,
	): string {
		const segmentHash = createHash("sha256")
			.update(`${filePath}-${name}-${startLine}-${content.length}`)
			.digest("hex")

		const block: SemanticCodeBlock = {
			file_path: filePath,
			identifier: name,
			type: semanticType,
			start_line: startLine,
			end_line: endLine,
			content,
			segmentHash,
			fileHash,
			semanticContext: {
				scope,
				// Copy so later mutation of the shared per-file lists cannot leak into this block
				imports: [...imports],
				exports: [...exports],
				dependencies: this.extractDependencies(content),
				complexity: this.calculateComplexityFromContent(content),
				semanticType,
			},
			parentBlockId,
			childBlockIds: [],
			relatedBlockIds: [],
		}

		this.semanticBlocks.set(segmentHash, block)

		// Update parent's children
		if (parentBlockId) {
			this.semanticBlocks.get(parentBlockId)?.childBlockIds.push(segmentHash)
		}

		return segmentHash
	}

	/**
	 * Look up a block that `createSemanticBlock` has just registered.
	 *
	 * @throws Error when no block is registered under `blockId`
	 */
	private requireBlock(blockId: string): SemanticCodeBlock {
		const block = this.semanticBlocks.get(blockId)
		if (!block) {
			throw new Error(`Semantic block ${blockId} is not registered`)
		}
		return block
	}

	/**
	 * Establish relationships between blocks.
	 *
	 * A block is related to another when the other block's identifier occurs as a
	 * substring of its content.
	 */
	private establishBlockRelationships(blocks: SemanticCodeBlock[]): void {
		for (const block of blocks) {
			for (const otherBlock of blocks) {
				if (block === otherBlock) continue

				// A missing/empty identifier would match every content string; skip it.
				const otherName = otherBlock.identifier
				if (!otherName) continue

				if (block.content.includes(otherName)) {
					block.relatedBlockIds.push(otherBlock.segmentHash)
				}
			}
		}
	}

	/**
	 * Optimize chunking for better retrieval.
	 *
	 * Blocks shorter than half of `MIN_BLOCK_CHARS` are appended to their first related
	 * block when the combined content stays below `MAX_BLOCK_CHARS`. Blocks that act as a
	 * parent are never merged away, and nothing is merged into a block that has itself
	 * already been merged away.
	 */
	private optimizeChunking(blocks: SemanticCodeBlock[]): SemanticCodeBlock[] {
		const smallBlockLimit = MIN_BLOCK_CHARS * 0.5
		const absorbed = new Set<string>()
		const optimized: SemanticCodeBlock[] = []

		for (const block of blocks) {
			const target = this.findMergeTarget(block, blocks, absorbed, smallBlockLimit)
			if (!target) {
				optimized.push(block)
				continue
			}

			// Merge blocks
			target.content += "\n\n" + block.content
			// Widen the range in both directions; the absorbed block may sit before the target.
			target.start_line = Math.min(target.start_line, block.start_line)
			target.end_line = Math.max(target.end_line, block.end_line)
			for (const id of block.relatedBlockIds) {
				if (id !== target.segmentHash && !target.relatedBlockIds.includes(id)) {
					target.relatedBlockIds.push(id)
				}
			}
			absorbed.add(block.segmentHash)
		}

		// Drop references to blocks that no longer exist on their own
		if (absorbed.size > 0) {
			for (const block of optimized) {
				block.relatedBlockIds = block.relatedBlockIds.filter((id) => !absorbed.has(id))
			}
		}

		return optimized
	}

	/**
	 * Pick the block that `block` should be merged into, or `undefined` to keep it as is.
	 */
	private findMergeTarget(
		block: SemanticCodeBlock,
		blocks: SemanticCodeBlock[],
		absorbed: ReadonlySet<string>,
		smallBlockLimit: number,
	): SemanticCodeBlock | undefined {
		if (block.content.length >= smallBlockLimit) return undefined
		if (block.childBlockIds.length > 0) return undefined // other blocks name it as their parent

		const targetId = block.relatedBlockIds[0]
		if (targetId === undefined || absorbed.has(targetId)) return undefined

		const target = blocks.find((candidate) => candidate.segmentHash === targetId)
		if (!target || target === block) return undefined
		if (target.content.length + block.content.length >= MAX_BLOCK_CHARS) return undefined

		return target
	}

	/**
	 * Check if a node represents a test case.
	 *
	 * True for a `call_expression` whose callee text is `test`, `it` or `describe`,
	 * or contains the substring "test" or "spec".
	 */
	private isTestCase(node: Node): boolean {
		if (node.type !== "call_expression") return false

		const functionName = node.childForFieldName("function")?.text
		return !!(
			functionName &&
			(functionName === "test" ||
				functionName === "it" ||
				functionName === "describe" ||
				functionName.includes("test") ||
				functionName.includes("spec"))
		)
	}

	/**
	 * Extract test name from test node.
	 *
	 * Uses the first argument when it is a string literal (quotes removed); otherwise
	 * falls back to `test_<line>`.
	 */
	private extractTestName(node: Node): string {
		const args = node.childForFieldName("arguments")
		if (args && args.children.length > 0) {
			const firstArg = args.children[1] // Skip opening paren
			if (firstArg && firstArg.type === "string") {
				return firstArg.text?.replace(/['"]/g, "") || "test"
			}
		}
		return `test_${node.startPosition.row + 1}`
	}

	/**
	 * Extract dependencies from content.
	 *
	 * Regex-based: collects the module specifier of single-line `import ... from "x"`
	 * statements and of `require("x")` calls.
	 */
	private extractDependencies(content: string): string[] {
		const dependencies: string[] = []

		for (const match of content.matchAll(/import\s+.*?\s+from\s+['"](.+?)['"]/g)) {
			const specifier = match[1]
			if (specifier !== undefined) dependencies.push(specifier)
		}

		for (const match of content.matchAll(/require\(['"](.+?)['"]\)/g)) {
			const specifier = match[1]
			if (specifier !== undefined) dependencies.push(specifier)
		}

		return dependencies
	}

	/**
	 * Calculate complexity from AST node.
	 *
	 * Starts at 1 and adds one per `if`/`for`/`while`/`switch` statement, `catch` clause
	 * and `&&` / `||` binary expression found in the subtree.
	 */
	private calculateComplexity(node: Node): number {
		let complexity = 1

		const visit = (currentNode: Node): void => {
			// Increment for control flow statements
			if (
				currentNode.type === "if_statement" ||
				currentNode.type === "for_statement" ||
				currentNode.type === "while_statement" ||
				currentNode.type === "switch_statement" ||
				currentNode.type === "catch_clause"
			) {
				complexity++
			}

			// Increment for logical operators
			if (currentNode.type === "binary_expression") {
				const operator = currentNode.childForFieldName("operator")?.text
				if (operator === "&&" || operator === "||") {
					complexity++
				}
			}

			for (const child of currentNode.children) {
				if (child) visit(child)
			}
		}

		visit(node)
		return complexity
	}

	/**
	 * Calculate complexity from content string.
	 *
	 * Starts at 1 and adds one per whole-word occurrence of a control-flow keyword and
	 * per `&&` / `||`. Purely textual, so occurrences inside strings or comments count too.
	 */
	private calculateComplexityFromContent(content: string): number {
		let complexity = 1

		// Count control flow keywords
		const controlFlowKeywords = ["if", "else", "for", "while", "switch", "case", "catch", "finally"]

		for (const keyword of controlFlowKeywords) {
			const regex = new RegExp(`\\b${keyword}\\b`, "g")
			const matches = content.match(regex)
			if (matches) {
				complexity += matches.length
			}
		}

		// Count logical operators
		complexity += (content.match(/&&|\|\|/g) || []).length

		return complexity
	}

	/**
	 * Parse Python with semantic understanding.
	 *
	 * Currently delegates to the generic strategy.
	 */
	private parsePythonSemantics(
		rootNode: Node,
		filePath: string,
		lines: string[],
		fileHash: string,
	): SemanticCodeBlock[] {
		return this.parseGenericSemantics(rootNode, filePath, lines, fileHash)
	}

	/**
	 * Generic semantic parsing for other languages.
	 *
	 * Emits one block for every node whose text length lies between `MIN_BLOCK_CHARS`
	 * and `MAX_BLOCK_CHARS * MAX_CHARS_TOLERANCE_FACTOR`. Nodes that would repeat an
	 * already emitted segment hash (same start row and text length, e.g. a wrapper node
	 * and its only child) are emitted once.
	 */
	private parseGenericSemantics(
		rootNode: Node,
		filePath: string,
		lines: string[],
		fileHash: string,
	): SemanticCodeBlock[] {
		const blocks: SemanticCodeBlock[] = []
		const emittedHashes = new Set<string>()
		const maxChars = MAX_BLOCK_CHARS * MAX_CHARS_TOLERANCE_FACTOR

		// Basic semantic extraction
		const visit = (node: Node): void => {
			const nodeText = node.text
			if (nodeText.length >= MIN_BLOCK_CHARS && nodeText.length <= maxChars) {
				const segmentHash = createHash("sha256")
					.update(`${filePath}-${node.startPosition.row}-${nodeText.length}`)
					.digest("hex")

				if (!emittedHashes.has(segmentHash)) {
					emittedHashes.add(segmentHash)
					blocks.push({
						file_path: filePath,
						identifier: null,
						type: node.type,
						start_line: node.startPosition.row + 1,
						end_line: node.endPosition.row + 1,
						content: nodeText,
						segmentHash,
						fileHash,
						semanticContext: {
							scope: [],
							imports: [],
							exports: [],
							dependencies: [],
							complexity: 1,
							semanticType: SemanticBlockType.FUNCTION_DEFINITION,
						},
						childBlockIds: [],
						relatedBlockIds: [],
					})
				}
			}

			for (const child of node.children) {
				if (child) visit(child)
			}
		}

		visit(rootNode)
		return blocks
	}
}