Add own implementation of regexp escaping for the databases

The problem is that Python's re.escape escapes too much; we only need to escape the regexp special characters. The current list is based on what PHP's preg_quote does, but we might be missing some characters.

Fixes #1666

Signed-off-by: Michal Čihař <michal@cihar.com>
WeblateOrg · Nov 7, 2017 · 7425a74 · 7425a74
1 parent ad56c59
commit 7425a74
Show file tree

Hide file tree

Showing 3 changed files with 69 additions and 1 deletion.
diff --git a/weblate/trans/models/dictionary.py b/weblate/trans/models/dictionary.py
@@ -36,6 +36,7 @@
 from weblate.lang.models import Language
 from weblate.trans.formats import AutoFormat
 from weblate.trans.models.project import Project
+from weblate.utils.db import re_escape
 from weblate.utils.errors import report_error
 
 
@@ -177,7 +178,7 @@ def get_words(self, unit):
             # Can not use __in as we want case insensitive lookup
             dictionary = dictionary.filter(
                 source__iregex=r'^({0})$'.format(
-                    '|'.join([re.escape(word) for word in words])
+                    '|'.join([re_escape(word) for word in words])
                 )
             )
 

diff --git a/weblate/utils/db.py b/weblate/utils/db.py
@@ -0,0 +1,38 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright © 2012 - 2017 Michal Čihař <michal@cihar.com>
+#
+# This file is part of Weblate <https://weblate.org/>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>.
+#
+
+from __future__ import unicode_literals
+
+ESCAPED = frozenset('.\\+*?[^]$(){}=!<>|:-\000')
+
+
+def re_escape(pattern):
+    """Backslash-escape regexp special characters in pattern.
+
+    Meant for database regexp matches, where re.escape escapes too much.
+    Only members of ESCAPED are changed; NUL is emitted as backslash 000.
+    """
+    def quote(char):
+        if char not in ESCAPED:
+            return char
+        if char == "\000":
+            return "\\000"
+        return "\\" + char
+    return "".join(quote(char) for char in pattern)
diff --git a/weblate/utils/tests/test_db.py b/weblate/utils/tests/test_db.py
@@ -0,0 +1,29 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright © 2012 - 2017 Michal Čihař <michal@cihar.com>
+#
+# This file is part of Weblate <https://weblate.org/>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>.
+#
+
+from unittest import TestCase
+
+from weblate.utils.db import re_escape
+
+
+class DbTest(TestCase):
+    def test_re_escape(self):
+        cases = {'[a-z]': '\\[a\\-z\\]', 'a{1,4}': 'a\\{1,4\\}'}
+        self.assertEqual({raw: re_escape(raw) for raw in cases}, cases)