fix(glossary): fetch word boundary positions just once
Running repeated single-character regular expression matches is slow;
it is better to find all word boundaries in a single pass up front.
nijel committed May 23, 2024
1 parent 1b4a796 commit 8eb6659
Showing 2 changed files with 10 additions and 3 deletions.
2 changes: 2 additions & 0 deletions docs/changes.rst
@@ -12,6 +12,8 @@ Not yet released.
 
 **Bug fixes**
 
+* Loading of strings with many glossary matches.
+
 **Compatibility**
 
 **Upgrading**
11 changes: 8 additions & 3 deletions weblate/glossary/models.py
@@ -93,17 +93,22 @@ def get_glossary_terms(unit: Unit) -> list[Unit]:
     source = PLURAL_SEPARATOR.join(parts)
 
     uses_whitespace = source_language.uses_whitespace()
+    boundaries: set[int] = set()
+    if uses_whitespace:
+        # Get list of word boundaries
+        boundaries = {match.span()[0] for match in NON_WORD_RE.finditer(source)}
+        boundaries.add(-1)
+        boundaries.add(len(source))
 
     automaton = project.glossary_automaton
-    positions = defaultdict(list[tuple[int, int]])
+    positions: dict[str, list[tuple[int, int]]] = defaultdict(list)
     # Extract terms present in the source
     with sentry_sdk.start_span(op="glossary.match", description=project.slug):
         for _termno, start, end in automaton.find_matches_as_indexes(
             source, overlapping=True
         ):
             if not uses_whitespace or (
-                (start == 0 or NON_WORD_RE.match(source[start - 1]))
-                and (end >= len(source) or NON_WORD_RE.match(source[end]))
+                (start - 1 in boundaries) and (end in boundaries)
             ):
                 term = source[start:end].lower()
                 positions[term].append((start, end))
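The change replaces a per-match regex check on individual characters with a single up-front pass that collects every word-boundary position into a set, so each candidate match only needs two O(1) membership tests. A minimal sketch of the same technique follows; it assumes a simplified `NON_WORD_RE` and stands in a naive substring scan for Weblate's Aho-Corasick automaton, so it is an illustration, not the real implementation:

```python
import re
from collections import defaultdict

# Simplified stand-in for Weblate's NON_WORD_RE; the real pattern is broader.
NON_WORD_RE = re.compile(r"\W")


def word_boundaries(source: str) -> set[int]:
    """Collect the index of every non-word character in one pass,
    plus virtual boundaries before the first and after the last character."""
    boundaries = {match.start() for match in NON_WORD_RE.finditer(source)}
    boundaries.add(-1)
    boundaries.add(len(source))
    return boundaries


def find_terms(source: str, terms: list[str]) -> dict[str, list[tuple[int, int]]]:
    """Naive substring scan standing in for the automaton's
    find_matches_as_indexes(); a candidate counts as a match only if it
    starts and ends on a precomputed word boundary."""
    boundaries = word_boundaries(source)
    lower = source.lower()
    positions: dict[str, list[tuple[int, int]]] = defaultdict(list)
    for term in terms:
        needle = term.lower()
        start = lower.find(needle)
        while start != -1:
            end = start + len(needle)
            # Two O(1) set lookups instead of two regex matches per candidate.
            if start - 1 in boundaries and end in boundaries:
                positions[needle].append((start, end))
            start = lower.find(needle, start + 1)
    return positions
```

For example, `find_terms("Hello world, hello!", ["hello"])` finds both occurrences, while `find_terms("shell", ["hell"])` finds none, because the candidate does not start on a word boundary.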