Skip to content

Commit

Permalink
Checks: Improve duplicate words detection
Browse files Browse the repository at this point in the history
We need tighter control over what is considered a word boundary than the
built-in definition in regexp provides. The list of chars is extracted using
unicodedata, and our own exception rules apply.

Fixes #4197
  • Loading branch information
nijel committed Jul 22, 2020
1 parent f77c912 commit 85528fb
Show file tree
Hide file tree
Showing 2 changed files with 40 additions and 3 deletions.
37 changes: 34 additions & 3 deletions weblate/checks/duplicate.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,15 +18,45 @@
#

import re
import sys
import unicodedata

from django.utils.html import escape
from django.utils.safestring import mark_safe
from django.utils.translation import gettext_lazy as _

from weblate.checks.base import TargetCheck

# Unicode general categories to consider non word chars
# ("Po" is other punctuation, "Zs" is space separator)
CATEGORIES = {"Po", "Zs"}
# Excluded chars
EXCLUDES = {
    # Removed to avoid breaking regexp syntax
    "]",
    # We intentionally skip following
    "-",
    # Used in Catalan ŀ
    "·",
    "•",
}
# Set of non word characters
NON_WORD_CHARS = {
    char
    for char in map(chr, range(sys.maxunicode + 1))
    if char not in EXCLUDES and unicodedata.category(char) in CATEGORIES
}
# Regexp character class matching a single non word char.
# Every char is escaped individually: the set contains the backslash, which
# would otherwise act as an escape prefix and swallow whichever char happens
# to follow it. Sorting makes the generated pattern identical between runs
# (iteration order of a set of strings is not stable across processes).
# The closing bracket is appended explicitly (escaped) at the end.
NON_WORD = "[{}\\]]".format("".join(map(re.escape, sorted(NON_WORD_CHARS))))
# Look for non-digit word sequences: a word of two or more letters that is
# immediately repeated, delimited on both sides by non word chars or by the
# start/end of the text. The only capturing group is the repeated word.
CHECK_RE = re.compile(
    rf"""
    (?:{NON_WORD}|^)           # Word boundary
    ([^\d\W]{{2,}})            # Word to match
    (?:{NON_WORD}+\1)          # Space + repeated word
    (?={NON_WORD}|$)           # Word boundary
    """,
    re.VERBOSE,
)

# Per language ignore list
IGNORES = {
Expand All @@ -44,11 +74,12 @@ class DuplicateCheck(TargetCheck):
description = _("Text contains the same word twice in a row:")

def check_single(self, source, target, unit):
    """Tell whether target has a doubled word that source does not have.

    CHECK_RE is run over both strings; words it reports for the target but
    not for the source are candidates. Candidates listed in IGNORES for the
    base language code of the unit are dropped.

    Returns True when at least one candidate remains, False otherwise.
    """
    lang_code = unit.translation.language.base_code
    source_matches = set(CHECK_RE.findall(source))
    target_matches = set(CHECK_RE.findall(target))
    # Repetitions already present in the source are not reported.
    diff = target_matches - source_matches
    # Apply the per language ignore list exactly once.
    if lang_code in IGNORES:
        diff = diff - IGNORES[lang_code]
    return bool(diff)

def get_description(self, check_obj):
Expand Down
6 changes: 6 additions & 0 deletions weblate/checks/tests/test_duplicate_checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,3 +70,9 @@ def test_description(self):
self.check.get_description(check),
"Text contains the same word twice in a row: lemons, two",
)

def test_check_duplicated_language_cleanup(self):
    """The text "Cancel·la la" must not be flagged for language "ca"."""
    flagged = self._run_check("Cancel·la la baixada", lang="ca")
    self.assertFalse(flagged)

def test_separator(self):
    """The text "plug-in in" must not be flagged as a repeated word."""
    flagged = self._run_check("plug-in in")
    self.assertFalse(flagged)

0 comments on commit 85528fb

Please sign in to comment.