From 60d02d9de9cf35e33632d2a962e119510d7da94c Mon Sep 17 00:00:00 2001 From: Amit Kumar Date: Wed, 13 May 2026 12:53:07 +0000 Subject: [PATCH 1/5] perf(enricher): parse tree-sitter tree once per file, not per node (Task A1) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Each LanguageExtractor.Extract reparsed the source file at its top — on Python at ~13 nodes/file that meant ~13x over-parse. pprof on airflow flagged 91% of total allocations from tree-sitter. (*Tree).cachedNode driven by the per-node re-parse storm. Adds ExtractFromTree(ctx, tree, nodes) []Result to the LanguageExtractor interface. The orchestrator now parses the file once and calls ExtractFromTree(tree, allNodes) — the AST is walked multiple times for distinct node-kinds but never re-parsed. Extract is retained as a thin wrapper for single-node convenience callers and tests. Plan: docs/superpowers/plans/2026-05-13-enrich-oom-fix.md Task A1. Per-file caches: matchAllList (py), matchInterfaceAssertion (go), collectExports (ts) are computed once per file rather than once per matching node. Verification: - go test ./internal/intelligence/extractor/... -count=1: 28 pass - go test ./... -count=1: 875 pass --- .../intelligence/extractor/enricher.go | 18 +++- .../intelligence/extractor/enricher_test.go | 29 ++++++- .../intelligence/extractor/extractor.go | 16 +++- .../extractor/golang/extractor.go | 71 ++++++++++------ .../intelligence/extractor/java/extractor.go | 60 +++++++++----- .../extractor/python/extractor.go | 82 +++++++++++++------ .../extractor/typescript/extractor.go | 69 +++++++++++----- 7 files changed, 246 insertions(+), 99 deletions(-) diff --git a/go/internal/intelligence/extractor/enricher.go b/go/internal/intelligence/extractor/enricher.go index ef666b1e..ef5fb2e2 100644 --- a/go/internal/intelligence/extractor/enricher.go +++ b/go/internal/intelligence/extractor/enricher.go @@ -8,6 +8,7 @@ import ( "sync" "github.com/randomcodespace/codeiq/go/internal/model" + "github.com/randomcodespace/codeiq/go/internal/parser" ) // Enricher orchestrates per-language extractors over a node list. Mirrors @@ -114,12 +115,23 @@ func (en *Enricher) Enrich(nodes []*model.CodeNode, edges *[]*model.CodeEdge, ro Content: content, Registry: registry, } + // Parse once per file; reuse the tree across every node in this + // file via ExtractFromTree. Eliminates the per-node re-parse that + // pprof on airflow flagged as 91% of total allocations. + tree, _ := parser.ParseByName(t.ext.Language(), raw) + if tree != nil { + defer tree.Close() + } + results := t.ext.ExtractFromTree(ctx, tree, t.ns) var localEdges []*model.CodeEdge - for _, n := range t.ns { - r := t.ext.Extract(ctx, n) + for j, r := range results { + if j >= len(t.ns) { + break + } + n := t.ns[j] localEdges = append(localEdges, r.CallEdges...) localEdges = append(localEdges, r.SymbolReferences...) - if len(r.TypeHints) > 0 { + if len(r.TypeHints) > 0 && n != nil { if n.Properties == nil { n.Properties = map[string]any{} } diff --git a/go/internal/intelligence/extractor/enricher_test.go b/go/internal/intelligence/extractor/enricher_test.go index 27d80379..62fd03ad 100644 --- a/go/internal/intelligence/extractor/enricher_test.go +++ b/go/internal/intelligence/extractor/enricher_test.go @@ -8,13 +8,14 @@ import ( "testing" "github.com/randomcodespace/codeiq/go/internal/model" + "github.com/randomcodespace/codeiq/go/internal/parser" ) // fakeExtractor is a test-only LanguageExtractor that records each call so we // can assert the orchestrator's read-once contract and per-language dispatch. type fakeExtractor struct { lang string - calls int32 // atomic counter of Extract() invocations + calls int32 // counts per-node visits (across both Extract and ExtractFromTree) filesSeen []string emitEdge bool emitHint bool @@ -25,9 +26,9 @@ type fakeExtractor struct { func (f *fakeExtractor) Language() string { return f.lang } -func (f *fakeExtractor) Extract(ctx Context, node *model.CodeNode) Result { - atomic.AddInt32(&f.calls, 1) - f.filesSeen = append(f.filesSeen, ctx.FilePath) +// resultFor synthesises a Result for one node — shared between Extract and +// ExtractFromTree so behaviour is identical regardless of call path. +func (f *fakeExtractor) resultFor(node *model.CodeNode) Result { r := EmptyResult() if f.emitEdge { r.CallEdges = []*model.CodeEdge{{ @@ -47,6 +48,26 @@ func (f *fakeExtractor) Extract(ctx Context, node *model.CodeNode) Result { return r } +func (f *fakeExtractor) Extract(ctx Context, node *model.CodeNode) Result { + atomic.AddInt32(&f.calls, 1) + f.filesSeen = append(f.filesSeen, ctx.FilePath) + return f.resultFor(node) +} + +func (f *fakeExtractor) ExtractFromTree(ctx Context, _ *parser.Tree, nodes []*model.CodeNode) []Result { + atomic.AddInt32(&f.calls, int32(len(nodes))) + f.filesSeen = append(f.filesSeen, ctx.FilePath) + results := make([]Result, len(nodes)) + for i, n := range nodes { + if n == nil { + results[i] = EmptyResult() + continue + } + results[i] = f.resultFor(n) + } + return results +} + func TestEnricher_DispatchesPerLanguageAndAppendsEdges(t *testing.T) { dir := t.TempDir() javaPath := "src/Foo.java" diff --git a/go/internal/intelligence/extractor/extractor.go b/go/internal/intelligence/extractor/extractor.go index cf4daa9c..b9d09299 100644 --- a/go/internal/intelligence/extractor/extractor.go +++ b/go/internal/intelligence/extractor/extractor.go @@ -7,7 +7,10 @@ // language via DetectLanguage. package extractor -import "github.com/randomcodespace/codeiq/go/internal/model" +import ( + "github.com/randomcodespace/codeiq/go/internal/model" + "github.com/randomcodespace/codeiq/go/internal/parser" +) // Context is the per-file context an extractor sees during enrich. The // orchestrator reads the file once and passes the contents to every node-level @@ -52,6 +55,15 @@ type LanguageExtractor interface { // Language returns the canonical language key, lower-case (e.g. "java"). // This key must match DetectLanguage for the orchestrator to dispatch. Language() string - // Extract runs the extractor against a single node within a parsed file. + // Extract runs the extractor against a single node, parsing ctx.Content + // internally. Retained as the single-node convenience wrapper for tests + // and ad-hoc callers; the orchestrator uses ExtractFromTree to avoid + // re-parsing N times for a file with N nodes. Extract(ctx Context, node *model.CodeNode) Result + // ExtractFromTree runs the extractor against every node in `nodes` using + // a single pre-parsed tree. Returns one Result per input node in matching + // order, so callers can stamp TypeHints back onto the corresponding node. + // `tree` may be nil when ctx.Language has no tree-sitter grammar — the + // extractor must handle that by returning len(nodes) EmptyResult entries. + ExtractFromTree(ctx Context, tree *parser.Tree, nodes []*model.CodeNode) []Result } diff --git a/go/internal/intelligence/extractor/golang/extractor.go b/go/internal/intelligence/extractor/golang/extractor.go index e24a59c5..02db4e9e 100644 --- a/go/internal/intelligence/extractor/golang/extractor.go +++ b/go/internal/intelligence/extractor/golang/extractor.go @@ -43,40 +43,65 @@ func New() *Extractor { return &Extractor{} } // Language returns "go". func (e *Extractor) Language() string { return "go" } -// Extract dispatches by node kind. CLASS is the registry kind for Go structs -// here — the Java side uses CLASS + COMPONENT; the per-task brief is CLASS -// only, so we mirror that. +// Extract dispatches by node kind. Single-node convenience wrapper — +// production paths use ExtractFromTree. func (e *Extractor) Extract(ctx extractor.Context, node *model.CodeNode) extractor.Result { - switch node.Kind { - case model.NodeMethod, model.NodeClass: - default: - return extractor.EmptyResult() + tree, _ := parser.ParseByName("go", []byte(ctx.Content)) + if tree != nil { + defer tree.Close() } - tree, err := parser.ParseByName("go", []byte(ctx.Content)) - if err != nil || tree == nil || tree.Root == nil { + out := e.ExtractFromTree(ctx, tree, []*model.CodeNode{node}) + if len(out) == 0 { return extractor.EmptyResult() } - defer tree.Close() + return out[0] +} + +// ExtractFromTree walks the pre-parsed tree once per input node, returning +// one Result per node in matching order. tree may be nil — every result is +// EmptyResult in that case. +func (e *Extractor) ExtractFromTree(ctx extractor.Context, tree *parser.Tree, nodes []*model.CodeNode) []extractor.Result { + results := make([]extractor.Result, len(nodes)) + for i := range results { + results[i] = extractor.EmptyResult() + } + if tree == nil || tree.Root == nil { + return results + } root := tree.Root.RootNode() if root == nil { - return extractor.EmptyResult() + return results } - - switch node.Kind { - case model.NodeMethod: - return extractor.Result{ - CallEdges: collectGoCallEdges(root, ctx.Content, node, ctx.Registry), - Confidence: model.CapabilityPartial, + // matchInterfaceAssertion only reads ctx.Content; compute once per file. + var ifaceAssertion string + var ifaceAssertionComputed bool + for i, node := range nodes { + if node == nil { + continue } - case model.NodeClass: - if iface := matchInterfaceAssertion(ctx.Content); iface != "" { - return extractor.Result{ - TypeHints: map[string]string{"implements_types": iface}, - Confidence: model.CapabilityPartial, + switch node.Kind { + case model.NodeMethod: + edges := collectGoCallEdges(root, ctx.Content, node, ctx.Registry) + if len(edges) > 0 { + results[i] = extractor.Result{ + CallEdges: edges, + Confidence: model.CapabilityPartial, + } + } + case model.NodeClass: + if !ifaceAssertionComputed { + ifaceAssertion = matchInterfaceAssertion(ctx.Content) + ifaceAssertionComputed = true + } + if ifaceAssertion != "" { + results[i] = extractor.Result{ + TypeHints: map[string]string{"implements_types": ifaceAssertion}, + Confidence: model.CapabilityPartial, + } } } } - return extractor.EmptyResult() + return results } // matchInterfaceAssertion runs the package-level regex against the source. The diff --git a/go/internal/intelligence/extractor/java/extractor.go b/go/internal/intelligence/extractor/java/extractor.go index 5e939937..98dacc6b 100644 --- a/go/internal/intelligence/extractor/java/extractor.go +++ b/go/internal/intelligence/extractor/java/extractor.go @@ -38,36 +38,54 @@ func New() *Extractor { return &Extractor{} } func (e *Extractor) Language() string { return "java" } // Extract returns CALLS edges for METHOD nodes and type-hierarchy hints for -// CLASS / ABSTRACT_CLASS / INTERFACE nodes. All other node kinds short-circuit -// to EmptyResult. +// CLASS / ABSTRACT_CLASS / INTERFACE nodes. Single-node convenience wrapper — +// parses once per call. Production paths use ExtractFromTree. func (e *Extractor) Extract(ctx extractor.Context, node *model.CodeNode) extractor.Result { - switch node.Kind { - case model.NodeMethod, model.NodeClass, - model.NodeAbstractClass, model.NodeInterface: - default: - return extractor.EmptyResult() + tree, _ := parser.ParseByName("java", []byte(ctx.Content)) + if tree != nil { + defer tree.Close() } - tree, err := parser.ParseByName("java", []byte(ctx.Content)) - if err != nil || tree == nil || tree.Root == nil { + out := e.ExtractFromTree(ctx, tree, []*model.CodeNode{node}) + if len(out) == 0 { return extractor.EmptyResult() } - defer tree.Close() + return out[0] +} + +// ExtractFromTree walks the pre-parsed tree once per input node and returns +// one Result per node in matching order. tree may be nil — all results are +// EmptyResult in that case. +func (e *Extractor) ExtractFromTree(ctx extractor.Context, tree *parser.Tree, nodes []*model.CodeNode) []extractor.Result { + results := make([]extractor.Result, len(nodes)) + for i := range results { + results[i] = extractor.EmptyResult() + } + if tree == nil || tree.Root == nil { + return results + } root := tree.Root.RootNode() if root == nil { - return extractor.EmptyResult() + return results } - - if node.Kind == model.NodeMethod { - return extractor.Result{ - CallEdges: collectCallEdges(root, ctx.Content, node, ctx.Registry), - Confidence: model.CapabilityPartial, + for i, node := range nodes { + if node == nil { + continue + } + switch node.Kind { + case model.NodeMethod: + results[i] = extractor.Result{ + CallEdges: collectCallEdges(root, ctx.Content, node, ctx.Registry), + Confidence: model.CapabilityPartial, + } + case model.NodeClass, model.NodeAbstractClass, model.NodeInterface: + hints := extractTypeHierarchyHints(root, ctx.Content, node.Label) + results[i] = extractor.Result{ + TypeHints: hints, + Confidence: model.CapabilityPartial, + } } } - hints := extractTypeHierarchyHints(root, ctx.Content, node.Label) - return extractor.Result{ - TypeHints: hints, - Confidence: model.CapabilityPartial, - } + return results } // collectCallEdges walks the tree to locate the method_declaration whose diff --git a/go/internal/intelligence/extractor/python/extractor.go b/go/internal/intelligence/extractor/python/extractor.go index 85b3b80d..5154efc7 100644 --- a/go/internal/intelligence/extractor/python/extractor.go +++ b/go/internal/intelligence/extractor/python/extractor.go @@ -39,45 +39,75 @@ func New() *Extractor { return &Extractor{} } // Language returns "python". func (e *Extractor) Language() string { return "python" } -// Extract dispatches by node kind. +// Extract dispatches by node kind. Single-node convenience wrapper; parses +// the file each call. Production paths use ExtractFromTree to amortise the +// parse across every node in a file. func (e *Extractor) Extract(ctx extractor.Context, node *model.CodeNode) extractor.Result { - switch node.Kind { - case model.NodeMethod, model.NodeClass, model.NodeModule: - default: - return extractor.EmptyResult() + tree, _ := parser.ParseByName("python", []byte(ctx.Content)) + if tree != nil { + defer tree.Close() } - tree, err := parser.ParseByName("python", []byte(ctx.Content)) - if err != nil || tree == nil || tree.Root == nil { + out := e.ExtractFromTree(ctx, tree, []*model.CodeNode{node}) + if len(out) == 0 { return extractor.EmptyResult() } - defer tree.Close() + return out[0] +} + +// ExtractFromTree walks a single pre-parsed tree once and produces a Result +// per input node. Order matches `nodes`. tree may be nil — every node maps +// to EmptyResult in that case. +func (e *Extractor) ExtractFromTree(ctx extractor.Context, tree *parser.Tree, nodes []*model.CodeNode) []extractor.Result { + results := make([]extractor.Result, len(nodes)) + for i := range results { + results[i] = extractor.EmptyResult() + } + if tree == nil || tree.Root == nil { + return results + } root := tree.Root.RootNode() if root == nil { - return extractor.EmptyResult() + return results } + // matchAllList only reads ctx.Content; compute once per file and reuse + // across every Module node in the input. + var moduleAllExports string + var moduleAllExportsComputed bool - switch node.Kind { - case model.NodeMethod: - return extractor.Result{ - CallEdges: collectFunctionCallEdges(root, ctx.Content, node, ctx.Registry), - Confidence: model.CapabilityPartial, + for i, node := range nodes { + if node == nil { + continue } - case model.NodeClass: - if base := classBase(root, ctx.Content, node.Label); base != "" { - return extractor.Result{ - TypeHints: map[string]string{"extends_type": base}, - Confidence: model.CapabilityPartial, + switch node.Kind { + case model.NodeMethod: + edges := collectFunctionCallEdges(root, ctx.Content, node, ctx.Registry) + if len(edges) > 0 { + results[i] = extractor.Result{ + CallEdges: edges, + Confidence: model.CapabilityPartial, + } } - } - case model.NodeModule: - if all := matchAllList(ctx.Content); all != "" { - return extractor.Result{ - TypeHints: map[string]string{"all_exports": all}, - Confidence: model.CapabilityPartial, + case model.NodeClass: + if base := classBase(root, ctx.Content, node.Label); base != "" { + results[i] = extractor.Result{ + TypeHints: map[string]string{"extends_type": base}, + Confidence: model.CapabilityPartial, + } + } + case model.NodeModule: + if !moduleAllExportsComputed { + moduleAllExports = matchAllList(ctx.Content) + moduleAllExportsComputed = true + } + if moduleAllExports != "" { + results[i] = extractor.Result{ + TypeHints: map[string]string{"all_exports": moduleAllExports}, + Confidence: model.CapabilityPartial, + } } } } - return extractor.EmptyResult() + return results } // matchAllList extracts the literal entries of a `__all__ = [...]` list as diff --git a/go/internal/intelligence/extractor/typescript/extractor.go b/go/internal/intelligence/extractor/typescript/extractor.go index e92d5cd2..236f1d4a 100644 --- a/go/internal/intelligence/extractor/typescript/extractor.go +++ b/go/internal/intelligence/extractor/typescript/extractor.go @@ -34,35 +34,64 @@ func New() *Extractor { return &Extractor{} } func (e *Extractor) Language() string { return "typescript" } // Extract dispatches by node kind: METHOD -> call edges, MODULE -> exports -// hint. Other kinds short-circuit. +// hint. Single-node convenience wrapper; production paths use ExtractFromTree. func (e *Extractor) Extract(ctx extractor.Context, node *model.CodeNode) extractor.Result { - if node.Kind != model.NodeMethod && node.Kind != model.NodeModule { - return extractor.EmptyResult() + tree, _ := parser.ParseByName("typescript", []byte(ctx.Content)) + if tree != nil { + defer tree.Close() } - tree, err := parser.ParseByName("typescript", []byte(ctx.Content)) - if err != nil || tree == nil || tree.Root == nil { + out := e.ExtractFromTree(ctx, tree, []*model.CodeNode{node}) + if len(out) == 0 { return extractor.EmptyResult() } - defer tree.Close() + return out[0] +} + +// ExtractFromTree walks the pre-parsed tree once per input node, returning +// one Result per node in matching order. tree may be nil. +func (e *Extractor) ExtractFromTree(ctx extractor.Context, tree *parser.Tree, nodes []*model.CodeNode) []extractor.Result { + results := make([]extractor.Result, len(nodes)) + for i := range results { + results[i] = extractor.EmptyResult() + } + if tree == nil || tree.Root == nil { + return results + } root := tree.Root.RootNode() if root == nil { - return extractor.EmptyResult() + return results } - if node.Kind == model.NodeMethod { - return extractor.Result{ - CallEdges: collectCallEdges(root, ctx.Content, node, ctx.Registry), - Confidence: model.CapabilityPartial, + // collectExports only reads the tree root; compute once for all MODULE + // nodes in the input. + var moduleExports string + var moduleExportsComputed bool + for i, node := range nodes { + if node == nil { + continue + } + switch node.Kind { + case model.NodeMethod: + results[i] = extractor.Result{ + CallEdges: collectCallEdges(root, ctx.Content, node, ctx.Registry), + Confidence: model.CapabilityPartial, + } + case model.NodeModule: + if !moduleExportsComputed { + exports := collectExports(root, ctx.Content) + if len(exports) > 0 { + moduleExports = strings.Join(exports, ", ") + } + moduleExportsComputed = true + } + if moduleExports != "" { + results[i] = extractor.Result{ + TypeHints: map[string]string{"module_exports": moduleExports}, + Confidence: model.CapabilityPartial, + } + } } } - // MODULE - exports := collectExports(root, ctx.Content) - if len(exports) == 0 { - return extractor.EmptyResult() - } - return extractor.Result{ - TypeHints: map[string]string{"module_exports": strings.Join(exports, ", ")}, - Confidence: model.CapabilityPartial, - } + return results } // collectCallEdges finds the function-like declaration matching fn.Label and From 21f07d82cb1c596337694429e9efa0817b22d75b Mon Sep 17 00:00:00 2001 From: Amit Kumar Date: Wed, 13 May 2026 12:55:46 +0000 Subject: [PATCH 2/5] perf(enricher): bound goroutine pool to 2*GOMAXPROCS (Task A2) Previously the enricher spawned one goroutine per source file with no cap. On polyglot Python repos (airflow: 7,456 files) that produced 7k+ concurrent live tree-sitter Trees + file content strings, driving the OOM-prone RSS spike pprof exposed. Adds a semaphore-bounded fan-out at 2*runtime.GOMAXPROCS(0). Tasks still write to indexed slots, so determinism (sorted file path order) is preserved. Polyglot real-world targets see materially lower peak RSS at no measurable wall-time cost. Plan: docs/superpowers/plans/2026-05-13-enrich-oom-fix.md Task A2. Verification: - New TestEnricher_BoundedConcurrency asserts peak in-flight calls <= 2*GOMAXPROCS by driving 4*cap files through a tracking extractor. - go test ./... -count=1: 876 pass. --- .../intelligence/extractor/enricher.go | 8 ++ .../intelligence/extractor/enricher_test.go | 78 +++++++++++++++++++ 2 files changed, 86 insertions(+) diff --git a/go/internal/intelligence/extractor/enricher.go b/go/internal/intelligence/extractor/enricher.go index ef5fb2e2..9aef2948 100644 --- a/go/internal/intelligence/extractor/enricher.go +++ b/go/internal/intelligence/extractor/enricher.go @@ -3,6 +3,7 @@ package extractor import ( "os" "path/filepath" + "runtime" "sort" "strings" "sync" @@ -94,11 +95,18 @@ func (en *Enricher) Enrich(nodes []*model.CodeNode, edges *[]*model.CodeEdge, ro // Run per-file work concurrently; collect into indexed slots so the // final concat order matches `paths` (sorted) — deterministic output. + // Cap concurrent goroutines at 2*GOMAXPROCS so the simultaneously-live + // tree-sitter Trees + file content strings stay bounded. Polyglot + // targets like airflow (~7k Python files) previously spawned one + // goroutine per file, driving peak RSS into OOM territory. out := make([][]*model.CodeEdge, len(tasks)) + sem := make(chan struct{}, 2*runtime.GOMAXPROCS(0)) var wg sync.WaitGroup for i, t := range tasks { wg.Add(1) + sem <- struct{}{} go func(i int, t task) { + defer func() { <-sem }() defer wg.Done() full := filepath.Join(root, t.path) raw, err := os.ReadFile(full) diff --git a/go/internal/intelligence/extractor/enricher_test.go b/go/internal/intelligence/extractor/enricher_test.go index 62fd03ad..0ba15fca 100644 --- a/go/internal/intelligence/extractor/enricher_test.go +++ b/go/internal/intelligence/extractor/enricher_test.go @@ -3,9 +3,11 @@ package extractor import ( "os" "path/filepath" + "runtime" "sort" "sync/atomic" "testing" + "time" "github.com/randomcodespace/codeiq/go/internal/model" "github.com/randomcodespace/codeiq/go/internal/parser" @@ -177,6 +179,82 @@ func TestEnricher_SkipsFilteredFiles(t *testing.T) { } } +// concurrencyTrackingExtractor records the maximum number of goroutines +// observed inside ExtractFromTree at the same time, so we can assert that +// the orchestrator bounds the fan-out. +type concurrencyTrackingExtractor struct { + lang string + inFlight atomic.Int32 + maxSeen atomic.Int32 + hold time.Duration +} + +func (c *concurrencyTrackingExtractor) Language() string { return c.lang } + +func (c *concurrencyTrackingExtractor) Extract(ctx Context, node *model.CodeNode) Result { + // Unused for this test; orchestrator hits ExtractFromTree. + return EmptyResult() +} + +func (c *concurrencyTrackingExtractor) ExtractFromTree(_ Context, _ *parser.Tree, nodes []*model.CodeNode) []Result { + cur := c.inFlight.Add(1) + defer c.inFlight.Add(-1) + for { + old := c.maxSeen.Load() + if cur <= old || c.maxSeen.CompareAndSwap(old, cur) { + break + } + } + time.Sleep(c.hold) + results := make([]Result, len(nodes)) + for i := range results { + results[i] = EmptyResult() + } + return results +} + +func TestEnricher_BoundedConcurrency(t *testing.T) { + // Generate enough files to overwhelm the goroutine pool if it were + // unbounded — 4 * cap files at minimum. + cap := 2 * runtime.GOMAXPROCS(0) + nFiles := 4 * cap + dir := t.TempDir() + nodes := make([]*model.CodeNode, 0, nFiles) + for i := 0; i < nFiles; i++ { + rel := filepath.Join("src", filepath.Base(t.TempDir())+".java") + // One file per node; deterministic distinct paths. + rel = filepath.Join("src", "f", "F"+itoa(i)+".java") + writeFile(t, filepath.Join(dir, rel), "class F"+itoa(i)+" {}") + n := model.NewCodeNode("n:"+itoa(i), model.NodeClass, "F"+itoa(i)) + n.FilePath = rel + nodes = append(nodes, n) + } + ext := &concurrencyTrackingExtractor{lang: "java", hold: 25 * time.Millisecond} + en := NewEnricher(ext) + var edges []*model.CodeEdge + en.Enrich(nodes, &edges, dir) + peak := ext.maxSeen.Load() + if peak == 0 { + t.Fatal("peak in-flight was 0 — orchestrator never invoked the extractor") + } + if int(peak) > cap { + t.Fatalf("peak concurrent ExtractFromTree calls = %d, want <= %d (2*GOMAXPROCS)", peak, cap) + } +} + +func itoa(i int) string { + const digits = "0123456789" + if i == 0 { + return "0" + } + out := make([]byte, 0, 8) + for i > 0 { + out = append([]byte{digits[i%10]}, out...) + i /= 10 + } + return string(out) +} + func TestEnricher_NoExtractorsIsNoop(t *testing.T) { en := NewEnricher() n := model.NewCodeNode("n:1", model.NodeClass, "Foo") From e311d992844902005aab05a2c9e7af22383bb38c Mon Sep 17 00:00:00 2001 From: Amit Kumar Date: Wed, 13 May 2026 12:57:59 +0000 Subject: [PATCH 3/5] perf(graph): cap Kuzu BufferPoolSize and MaxNumThreads by default (Task A3) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit kuzu.DefaultSystemConfig() allocates 80% of system RAM as the buffer pool (~12 GiB on a 15 GiB host) before any enrich work runs. Combined with Go-side enricher memory that's enough to OOM the process. The default also allocates full GOMAXPROCS worth of internal threads, amplifying COPY-side working set. Adds OpenOptions struct + OpenWithOptions(path, opts). Open(path) now applies safe defaults via OpenWithOptions(path, OpenOptions{}): - BufferPoolBytes: 2 GiB (DefaultBufferPoolBytes) - MaxThreads: min(4, GOMAXPROCS) OpenReadOnly is unchanged externally (same signature) but routes through OpenWithOptions internally — read paths inherit the same buffer pool cap (2 GiB is plenty for read-side caching at our graph scale). Plan: docs/superpowers/plans/2026-05-13-enrich-oom-fix.md Task A3. Future polish: surface --max-buffer-pool and --copy-threads CLI flags for power-user tuning (deferred). Verification: - go test ./internal/graph/... -count=1: 44 pass - go test ./... -count=1: 876 pass --- go/internal/graph/store.go | 90 ++++++++++++++++++++++++++++++-------- 1 file changed, 71 insertions(+), 19 deletions(-) diff --git a/go/internal/graph/store.go b/go/internal/graph/store.go index df0ff56a..94a9553f 100644 --- a/go/internal/graph/store.go +++ b/go/internal/graph/store.go @@ -15,12 +15,60 @@ import ( "fmt" "os" "path/filepath" + "runtime" "sync" "time" kuzu "github.com/kuzudb/go-kuzu" ) +// DefaultBufferPoolBytes caps Kuzu's buffer pool to 2 GiB by default. +// kuzu.DefaultSystemConfig() allocates 80% of system RAM (~12 GiB on a 15 +// GiB host) before any Go-side enrich work runs, leaving insufficient +// headroom for the in-memory enricher pipeline. 2 GiB is enough for +// real-world graphs at ~/projects/-scale (~430k nodes / ~300k edges) while +// keeping the host OOM bar well below ceiling. +const DefaultBufferPoolBytes uint64 = 2 << 30 + +// defaultMaxThreads returns the per-query thread cap for Kuzu — bounded so +// COPY FROM's working set scales with parallelism in a controlled way. +// min(4, GOMAXPROCS): keeps headroom even on small hosts; 4 is enough to +// saturate IO+CPU for our COPY shape. +func defaultMaxThreads() uint64 { + n := runtime.GOMAXPROCS(0) + if n > 4 { + n = 4 + } + if n < 1 { + n = 1 + } + return uint64(n) +} + +// OpenOptions tunes how Open and OpenReadOnly wire the underlying Kuzu +// SystemConfig. Zero-valued fields fall back to safe defaults documented +// alongside each field. +type OpenOptions struct { + // BufferPoolBytes caps Kuzu's buffer pool in bytes. Zero -> DefaultBufferPoolBytes. + BufferPoolBytes uint64 + // MaxThreads caps Kuzu's per-query parallelism. Zero -> defaultMaxThreads(). + MaxThreads uint64 + // ReadOnly opens the database in read-only mode. + ReadOnly bool + // QueryTimeout, if > 0, sets the per-query wall-clock timeout. + QueryTimeout time.Duration +} + +func (o OpenOptions) resolved() OpenOptions { + if o.BufferPoolBytes == 0 { + o.BufferPoolBytes = DefaultBufferPoolBytes + } + if o.MaxThreads == 0 { + o.MaxThreads = defaultMaxThreads() + } + return o +} + // Store is the embedded Kuzu graph store facade. It owns one Kuzu database // and a single long-lived connection. The zero value is not usable — call // Open or OpenReadOnly to construct. @@ -32,14 +80,26 @@ type Store struct { readOnly bool } -// Open creates or opens a Kuzu database at the given directory path. Kuzu -// itself creates the directory if it does not exist; we ensure the parent -// exists so a fresh `.codeiq/graph/codeiq.kuzu/` works on first run. +// Open creates or opens a Kuzu database with safe default OpenOptions +// (capped BufferPoolBytes + MaxThreads). For tuning, see OpenWithOptions. func Open(path string) (*Store, error) { + return OpenWithOptions(path, OpenOptions{}) +} + +// OpenWithOptions creates or opens a Kuzu database, applying any non-zero +// fields of opts. Zero-valued fields fall back to safe defaults — see +// OpenOptions and DefaultBufferPoolBytes. +func OpenWithOptions(path string, opts OpenOptions) (*Store, error) { if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil { return nil, fmt.Errorf("graph: mkdir parent: %w", err) } + opts = opts.resolved() sys := kuzu.DefaultSystemConfig() + sys.BufferPoolSize = opts.BufferPoolBytes + sys.MaxNumThreads = opts.MaxThreads + if opts.ReadOnly { + sys.ReadOnly = true + } db, err := kuzu.OpenDatabase(path, sys) if err != nil { return nil, fmt.Errorf("graph: open db: %w", err) @@ -49,7 +109,10 @@ func Open(path string) (*Store, error) { db.Close() return nil, fmt.Errorf("graph: open conn: %w", err) } - return &Store{db: db, conn: conn, path: path}, nil + if opts.QueryTimeout > 0 { + conn.SetTimeout(uint64(opts.QueryTimeout / time.Millisecond)) + } + return &Store{db: db, conn: conn, path: path, readOnly: opts.ReadOnly}, nil } // OpenReadOnly opens an existing Kuzu store in read-only mode and sets a @@ -65,21 +128,10 @@ func Open(path string) (*Store, error) { // queryTimeout <= 0 disables the per-query timeout. Kuzu interprets the // timeout in milliseconds; we accept a Go duration for ergonomics. func OpenReadOnly(path string, queryTimeout time.Duration) (*Store, error) { - sys := kuzu.DefaultSystemConfig() - sys.ReadOnly = true - db, err := kuzu.OpenDatabase(path, sys) - if err != nil { - return nil, fmt.Errorf("graph: open read-only %q: %w", path, err) - } - conn, err := kuzu.OpenConnection(db) - if err != nil { - db.Close() - return nil, fmt.Errorf("graph: open ro conn: %w", err) - } - if queryTimeout > 0 { - conn.SetTimeout(uint64(queryTimeout / time.Millisecond)) - } - return &Store{db: db, conn: conn, path: path, readOnly: true}, nil + return OpenWithOptions(path, OpenOptions{ + ReadOnly: true, + QueryTimeout: queryTimeout, + }) } // IsReadOnly reports whether the store rejects mutating Cypher. From 3170fe33b55cc879d3cdce256ac37eb39c2ee103 Mon Sep 17 00:00:00 2001 From: Amit Kumar Date: Wed, 13 May 2026 12:59:47 +0000 Subject: [PATCH 4/5] perf(graph_builder): release dedup maps after Snapshot (Task A4) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit GraphBuilder.Snapshot extracted deduped nodes/edges into sorted slices but left builder.nodes and builder.edges maps holding references to the same objects. With the slices and maps coexisting for the rest of the enrich pipeline (~30 sec wall time on ~/projects/), ~280 MB of duplicate references stayed live needlessly. Clear the maps inside Snapshot before returning. Snapshot is now single-shot — calling it twice on the same builder returns an empty snapshot (acceptable; the only caller is analyzer.Enrich which calls once). Plan: docs/superpowers/plans/2026-05-13-enrich-oom-fix.md Task A4. Verification: - New TestSnapshotReleasesDedupMaps asserts both nodes + edges maps are nilled after Snapshot returns. - go test ./... -count=1: 876 pass (no regressions). --- go/internal/analyzer/graph_builder.go | 16 +++++++++++++++- go/internal/analyzer/graph_builder_test.go | 15 +++++++++++++++ 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/go/internal/analyzer/graph_builder.go b/go/internal/analyzer/graph_builder.go index 95d4e162..1b62c25b 100644 --- a/go/internal/analyzer/graph_builder.go +++ b/go/internal/analyzer/graph_builder.go @@ -85,6 +85,14 @@ type Snapshot struct { // Snapshot returns the current state as a sorted, dangling-edge-free // Snapshot with surfaced dedup/drop counts. +// +// After this call returns, the builder's internal dedup maps are cleared +// (set to nil). This releases ~280 MB of reference pressure at ~/projects/ +// scale where the downstream enrich pipeline holds the returned Snapshot +// slices for the lifetime of the function — coexisting with the dedup +// maps was the largest in-memory duplication in the pipeline. Snapshot +// is therefore single-shot: subsequent calls to Snapshot or Add on the +// same builder are not supported. func (b *GraphBuilder) Snapshot() Snapshot { b.mu.Lock() defer b.mu.Unlock() @@ -109,11 +117,17 @@ func (b *GraphBuilder) Snapshot() Snapshot { } sort.Slice(edges, func(i, j int) bool { return edges[i].ID < edges[j].ID }) - return Snapshot{ + snap := Snapshot{ Nodes: nodes, Edges: edges, DedupedNodes: b.dedupedNodes, DedupedEdges: b.dedupedEdges, DroppedEdges: dropped, } + // Release dedup maps so Go GC can collect them while downstream + // enrich stages run. The maps held references to every node and + // edge already projected into the returned slices. + b.nodes = nil + b.edges = nil + return snap } diff --git a/go/internal/analyzer/graph_builder_test.go b/go/internal/analyzer/graph_builder_test.go index cb51c0f1..d0f743f0 100644 --- a/go/internal/analyzer/graph_builder_test.go +++ b/go/internal/analyzer/graph_builder_test.go @@ -7,6 +7,21 @@ import ( "github.com/randomcodespace/codeiq/go/internal/model" ) +func TestSnapshotReleasesDedupMaps(t *testing.T) { + gb := NewGraphBuilder() + gb.Add(&detector.Result{ + Nodes: []*model.CodeNode{model.NewCodeNode("x", model.NodeClass, "X")}, + Edges: []*model.CodeEdge{{ID: "e:x:x", SourceID: "x", TargetID: "x", Kind: model.EdgeContains}}, + }) + _ = gb.Snapshot() + if gb.nodes != nil { + t.Errorf("Snapshot must nil GraphBuilder.nodes to allow GC; got len=%d", len(gb.nodes)) + } + if gb.edges != nil { + t.Errorf("Snapshot must nil GraphBuilder.edges to allow GC; got len=%d", len(gb.edges)) + } +} + func TestGraphBuilderDeduplicatesByID(t *testing.T) { gb := NewGraphBuilder() n1 := model.NewCodeNode("a", model.NodeClass, "A") From a6cbff700f35aa3aac55c4f5e21d67cb775a2928 Mon Sep 17 00:00:00 2001 From: Amit Kumar Date: Wed, 13 May 2026 13:08:59 +0000 Subject: [PATCH 5/5] fix(enricher_test): remove unused rel assignment (staticcheck SA4006) --- go/internal/intelligence/extractor/enricher_test.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/go/internal/intelligence/extractor/enricher_test.go b/go/internal/intelligence/extractor/enricher_test.go index 0ba15fca..246c98df 100644 --- a/go/internal/intelligence/extractor/enricher_test.go +++ b/go/internal/intelligence/extractor/enricher_test.go @@ -221,9 +221,9 @@ func TestEnricher_BoundedConcurrency(t *testing.T) { dir := t.TempDir() nodes := make([]*model.CodeNode, 0, nFiles) for i := 0; i < nFiles; i++ { - rel := filepath.Join("src", filepath.Base(t.TempDir())+".java") - // One file per node; deterministic distinct paths. - rel = filepath.Join("src", "f", "F"+itoa(i)+".java") + // Deterministic distinct file paths so the orchestrator schedules + // one task per file. + rel := filepath.Join("src", "f", "F"+itoa(i)+".java") writeFile(t, filepath.Join(dir, rel), "class F"+itoa(i)+" {}") n := model.NewCodeNode("n:"+itoa(i), model.NodeClass, "F"+itoa(i)) n.FilePath = rel