From 8eb6659709b4c83d08b23a7e50883b7ebf35dd2a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20=C4=8Ciha=C5=99?= Date: Thu, 23 May 2024 10:27:22 +0200 Subject: [PATCH] fix(glossary): fetch word boundary positions just once Doing repeated single-character regular expressions is slow; it is better to match all word boundaries at once early. --- docs/changes.rst | 2 ++ weblate/glossary/models.py | 11 ++++++++--- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/docs/changes.rst b/docs/changes.rst index 9058b50a8a9c..7381695bb607 100644 --- a/docs/changes.rst +++ b/docs/changes.rst @@ -12,6 +12,8 @@ Not yet released. **Bug fixes** +* Loading of strings with many glossary matches. + **Compatibility** **Upgrading** diff --git a/weblate/glossary/models.py b/weblate/glossary/models.py index 3934aec68511..68898e3a0dc6 100644 --- a/weblate/glossary/models.py +++ b/weblate/glossary/models.py @@ -93,17 +93,22 @@ def get_glossary_terms(unit: Unit) -> list[Unit]: source = PLURAL_SEPARATOR.join(parts) uses_whitespace = source_language.uses_whitespace() + boundaries: set[int] = set() + if uses_whitespace: + # Get list of word boundaries + boundaries = {match.span()[0] for match in NON_WORD_RE.finditer(source)} + boundaries.add(-1) + boundaries.add(len(source)) automaton = project.glossary_automaton - positions = defaultdict(list[tuple[int, int]]) + positions: dict[str, list[tuple[int, int]]] = defaultdict(list) # Extract terms present in the source with sentry_sdk.start_span(op="glossary.match", description=project.slug): for _termno, start, end in automaton.find_matches_as_indexes( source, overlapping=True ): if not uses_whitespace or ( - (start == 0 or NON_WORD_RE.match(source[start - 1])) - and (end >= len(source) or NON_WORD_RE.match(source[end])) + (start - 1 in boundaries) and (end in boundaries) ): term = source[start:end].lower() positions[term].append((start, end))