From 8eb6659709b4c83d08b23a7e50883b7ebf35dd2a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20=C4=8Ciha=C5=99?= Date: Thu, 23 May 2024 10:27:22 +0200 Subject: [PATCH] fix(glossary): fetch word boundary positions just once Doing repeated single-character regular expressions is slow; it is better to match all word boundaries at once early. --- docs/changes.rst | 2 ++ weblate/glossary/models.py | 11 ++++++++--- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/docs/changes.rst b/docs/changes.rst index 9058b50a8a9c..7381695bb607 100644 --- a/docs/changes.rst +++ b/docs/changes.rst @@ -12,6 +12,8 @@ Not yet released. **Bug fixes** +* Loading of strings with many glossary matches. + **Compatibility** **Upgrading** diff --git a/weblate/glossary/models.py b/weblate/glossary/models.py index 3934aec68511..68898e3a0dc6 100644 --- a/weblate/glossary/models.py +++ b/weblate/glossary/models.py @@ -93,17 +93,22 @@ def get_glossary_terms(unit: Unit) -> list[Unit]: source = PLURAL_SEPARATOR.join(parts) uses_whitespace = source_language.uses_whitespace() + boundaries: set[int] = set() + if uses_whitespace: + # Get list of word boundaries + boundaries = {match.span()[0] for match in NON_WORD_RE.finditer(source)} + boundaries.add(-1) + boundaries.add(len(source)) automaton = project.glossary_automaton - positions = defaultdict(list[tuple[int, int]]) + positions: dict[str, list[tuple[int, int]]] = defaultdict(list) # Extract terms present in the source with sentry_sdk.start_span(op="glossary.match", description=project.slug): for _termno, start, end in automaton.find_matches_as_indexes( source, overlapping=True ): if not uses_whitespace or ( - (start == 0 or NON_WORD_RE.match(source[start - 1])) - and (end >= len(source) or NON_WORD_RE.match(source[end])) + (start - 1 in boundaries) and (end in boundaries) ): term = source[start:end].lower() positions[term].append((start, end))