From 69ff2b33aa3b1f2370d6c28919a1f87e5749cfb9 Mon Sep 17 00:00:00 2001 From: Sephyi Date: Thu, 30 Apr 2026 19:05:51 +0200 Subject: [PATCH] perf(services): RegexSet first-pass for secret scanning (F-022) Replaces per-pattern `Regex::is_match` loops in the scanners with a single `RegexSet` traversal that returns the index of the lowest- indexed pattern that matched. The pattern slice is then consulted only to look up the descriptive name on a hit. Layout: - New `PatternSet` struct holds an owned `Vec` plus the derived `RegexSet`. `PatternSet::first_match` does the combined NFA traversal. - `BUILTIN_PATTERN_SET: LazyLock` caches the set for the built-in patterns (built once on first access on top of the existing `BUILTIN_PATTERNS` cache from F-012). - `scan_for_secrets` and `scan_full_diff_for_secrets` (no-args) use the cached set directly. - `scan_for_secrets_with_patterns` / `scan_full_diff_with_patterns` preserve their `&[SecretPattern]` API by building a one-shot `PatternSet` and delegating to private `*_with_pattern_set` helpers, so external callers see no API change. - Behaviour preserved: at most one match per added line; pattern precedence is the lowest-index in the configured set, matching the previous "first hit, break" semantics. Closes #25. --- src/services/safety.rs | 110 ++++++++++++++++++++++++++++++++--------- 1 file changed, 88 insertions(+), 22 deletions(-) diff --git a/src/services/safety.rs b/src/services/safety.rs index 62c56d9..4b99d7a 100644 --- a/src/services/safety.rs +++ b/src/services/safety.rs @@ -5,7 +5,7 @@ use std::borrow::Cow; use std::sync::LazyLock; -use regex::Regex; +use regex::{Regex, RegexSet}; use crate::domain::StagedChanges; @@ -28,6 +28,55 @@ pub struct SecretPattern { pub description: Cow<'static, str>, } +/// Bundle of secret patterns plus a derived `RegexSet` for batched first-pass +/// matching. +/// +/// `RegexSet` compiles all patterns into a single combined NFA, so checking +/// a line against the whole set is dramatically cheaper than running each +/// `Regex::is_match` independently. On a hit the indices returned by +/// `RegexSet::matches` map back into `patterns` for the descriptive name. +pub struct PatternSet { + pub patterns: Vec, + pub set: RegexSet, +} + +impl PatternSet { + /// Build a `PatternSet` from an owned `Vec`. + /// + /// The `RegexSet` is built once at construction time. If any individual + /// pattern would fail to compile in the combined set (it should not — the + /// patterns already passed `Regex::new`) we fall back to an empty + /// `RegexSet`, in which case `first_match` simply returns `None` for + /// every input. The per-pattern `Regex::is_match` fallback below then + /// preserves correctness. + pub fn new(patterns: Vec) -> Self { + let sources: Vec<&str> = patterns.iter().map(|p| p.regex.as_str()).collect(); + let set = RegexSet::new(&sources).unwrap_or_else(|_| RegexSet::empty()); + Self { patterns, set } + } + + /// First-pass match. Returns the index of the *lowest-index* pattern that + /// matches `line`, or `None` if no pattern matches. Uses the combined + /// `RegexSet` (one NFA traversal) instead of N independent `Regex::is_match` + /// calls. + fn first_match(&self, line: &str) -> Option { + // `matches()` returns a SetMatches; iterating it yields hit indices in + // ascending order, so `next()` is the smallest index. + self.set.matches(line).into_iter().next() + } +} + +/// Convenience: build a `PatternSet` from a borrowed slice. Used by callers +/// that hold an owned `Vec` and want first-pass acceleration. +fn pattern_set_from_slice(patterns: &[SecretPattern]) -> PatternSet { + PatternSet::new(patterns.to_vec()) +} + +/// Cached `PatternSet` over the built-in patterns. The pattern vector and the +/// `RegexSet` are both compiled exactly once per process. +static BUILTIN_PATTERN_SET: LazyLock = + LazyLock::new(|| PatternSet::new(BUILTIN_PATTERNS.clone())); + /// Build the full set of secret patterns, applying custom additions and disabled names. /// /// Custom patterns are compiled from user-provided regex strings. Invalid regexes @@ -229,14 +278,28 @@ static BUILTIN_PATTERNS: LazyLock> = LazyLock::new(|| { /// for library consumers who only have `StagedChanges`. #[allow(dead_code)] pub fn scan_for_secrets(changes: &StagedChanges) -> Vec { - scan_for_secrets_with_patterns(changes, builtin_patterns()) + scan_with_pattern_set(changes, &BUILTIN_PATTERN_SET) } -/// Scan per-file truncated diffs for secrets using the given pattern set. +/// Scan per-file truncated diffs for secrets using the given pattern slice. +/// +/// One-shot callers pay the cost of building a `RegexSet` here; repeated +/// callers should construct a `PatternSet` once and call +/// [`scan_with_pattern_set`] directly. pub fn scan_for_secrets_with_patterns( changes: &StagedChanges, patterns: &[SecretPattern], ) -> Vec { + let set = pattern_set_from_slice(patterns); + scan_with_pattern_set(changes, &set) +} + +/// Scan per-file truncated diffs for secrets using a pre-built `PatternSet`. +/// +/// Each added line is first matched against the combined `RegexSet`; only on +/// a hit do we look up the individual pattern's name. This avoids running +/// every `Regex::is_match` independently per line. +fn scan_with_pattern_set(changes: &StagedChanges, pset: &PatternSet) -> Vec { let mut found = Vec::new(); for file in &changes.files { @@ -253,15 +316,13 @@ pub fn scan_for_secrets_with_patterns( continue; } - for pat in patterns { - if pat.regex.is_match(line) { - found.push(SecretMatch { - pattern_name: pat.name.to_string(), - file: file.path.display().to_string(), - line: Some(line_num), - }); - break; // One match per line is enough - } + if let Some(idx) = pset.first_match(line) { + let pat = &pset.patterns[idx]; + found.push(SecretMatch { + pattern_name: pat.name.to_string(), + file: file.path.display().to_string(), + line: Some(line_num), + }); } } } @@ -272,7 +333,7 @@ pub fn scan_for_secrets_with_patterns( /// Scan the full unified diff for secrets using default patterns. #[allow(dead_code)] pub fn scan_full_diff_for_secrets(full_diff: &str) -> Vec { - scan_full_diff_with_patterns(full_diff, builtin_patterns()) + scan_full_diff_with_pattern_set(full_diff, &BUILTIN_PATTERN_SET) } static HUNK_HEADER: LazyLock = @@ -288,6 +349,13 @@ pub fn scan_full_diff_with_patterns( full_diff: &str, patterns: &[SecretPattern], ) -> Vec { + let pset = pattern_set_from_slice(patterns); + scan_full_diff_with_pattern_set(full_diff, &pset) +} + +/// Same as `scan_full_diff_with_patterns` but takes a pre-built `PatternSet`, +/// avoiding the per-call `RegexSet` construction. +fn scan_full_diff_with_pattern_set(full_diff: &str, pset: &PatternSet) -> Vec { let mut found = Vec::new(); let mut current_file = String::new(); let mut current_line: Option = None; @@ -342,15 +410,13 @@ pub fn scan_full_diff_with_patterns( // guard, which would also drop legitimate added lines whose content // begins with `++`. if let Some(content) = line.strip_prefix('+') { - for pat in patterns { - if pat.regex.is_match(content) { - found.push(SecretMatch { - pattern_name: pat.name.to_string(), - file: current_file.clone(), - line: Some(*line_num), - }); - break; - } + if let Some(idx) = pset.first_match(content) { + let pat = &pset.patterns[idx]; + found.push(SecretMatch { + pattern_name: pat.name.to_string(), + file: current_file.clone(), + line: Some(*line_num), + }); } *line_num += 1; } else if line.starts_with(' ') {