Checks: Improve duplicate words detection

We need tighter control over what is considered a word boundary than the built-in regexp definition provides. The list of non-word characters is extracted using unicodedata, and our own exception rules are applied on top. Fixes #4197
WeblateOrg · Jul 22, 2020 · 85528fb · 85528fb
1 parent f77c912
commit 85528fb
Show file tree

Hide file tree

Showing 2 changed files with 40 additions and 3 deletions.
diff --git a/weblate/checks/duplicate.py b/weblate/checks/duplicate.py
@@ -18,15 +18,45 @@
 #
 
 import re
+import sys
+import unicodedata
 
 from django.utils.html import escape
 from django.utils.safestring import mark_safe
 from django.utils.translation import gettext_lazy as _
 
 from weblate.checks.base import TargetCheck
 
+# Unicode categories to consider non word chars
+CATEGORIES = {"Po", "Zs"}
+# Excluded chars
+EXCLUDES = {
+    # Removed to avoid breaking regexp syntax
+    "]",
+    # We intentionally skip following
+    "-",
+    # Used in Catalan ŀ
+    "·",
+    "•",
+}
+# Set of non word characters
+NON_WORD_CHARS = {
+    char
+    for char in map(chr, range(sys.maxunicode + 1))
+    if char not in EXCLUDES and unicodedata.category(char) in CATEGORIES
+}
+# Regexp for non word chars
+NON_WORD = "[{}\\]]".format("".join(NON_WORD_CHARS))
 # Look for non-digit word sequences
-CHECK_RE = re.compile(r"\b([^\d\W]{2,})(?:\s+\1)\b")
+CHECK_RE = re.compile(
+    rf"""
+    (?:{NON_WORD}|^)    # Word boundary
+    ([^\d\W]{{2,}})       # Word to match
+    (?:{NON_WORD}+\1)   # Space + repeated word
+    (?={NON_WORD}|$)    # Word boundary
+    """,
+    re.VERBOSE,
+)
 
 # Per language ignore list
 IGNORES = {
@@ -44,11 +74,12 @@ class DuplicateCheck(TargetCheck):
     description = _("Text contains the same word twice in a row:")
 
     def check_single(self, source, target, unit):
+        lang_code = unit.translation.language.base_code
         source_matches = set(CHECK_RE.findall(source))
         target_matches = set(CHECK_RE.findall(target))
         diff = target_matches - source_matches
-        if unit.translation.language.base_code in IGNORES:
-            diff = diff - IGNORES[unit.translation.language.base_code]
+        if lang_code in IGNORES:
+            diff = diff - IGNORES[lang_code]
         return bool(diff)
 
     def get_description(self, check_obj):

diff --git a/weblate/checks/tests/test_duplicate_checks.py b/weblate/checks/tests/test_duplicate_checks.py
@@ -70,3 +70,9 @@ def test_description(self):
             self.check.get_description(check),
             "Text contains the same word twice in a row: lemons, two",
         )
+
+    def test_check_duplicated_language_cleanup(self):
+        self.assertFalse(self._run_check("Cancel·la la baixada", lang="ca"))
+
+    def test_separator(self):
+        self.assertFalse(self._run_check("plug-in in"))