Skip to content

Commit

Permalink
Add own implementation of regexp escaping for the databases
Browse files Browse the repository at this point in the history
The problem is that Python's re.escape escapes too much; we only need to
escape the regexp special characters. The current list is based on what PHP's
preg_quote does, but we might be missing some chars...

Fixes #1666

Signed-off-by: Michal Čihař <michal@cihar.com>
  • Loading branch information
nijel committed Nov 7, 2017
1 parent ad56c59 commit 7425a74
Show file tree
Hide file tree
Showing 3 changed files with 69 additions and 1 deletion.
3 changes: 2 additions & 1 deletion weblate/trans/models/dictionary.py
Expand Up @@ -36,6 +36,7 @@
from weblate.lang.models import Language
from weblate.trans.formats import AutoFormat
from weblate.trans.models.project import Project
from weblate.utils.db import re_escape
from weblate.utils.errors import report_error


Expand Down Expand Up @@ -177,7 +178,7 @@ def get_words(self, unit):
# Can not use __in as we want case insensitive lookup
dictionary = dictionary.filter(
source__iregex=r'^({0})$'.format(
'|'.join([re.escape(word) for word in words])
'|'.join([re_escape(word) for word in words])
)
)

Expand Down
38 changes: 38 additions & 0 deletions weblate/utils/db.py
@@ -0,0 +1,38 @@
# -*- coding: utf-8 -*-
#
# Copyright © 2012 - 2017 Michal Čihař <michal@cihar.com>
#
# This file is part of Weblate <https://weblate.org/>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#

from __future__ import unicode_literals

ESCAPED = frozenset('.\\+*?[^]$(){}=!<>|:-\000')


def re_escape(pattern):
    """Quote regexp metacharacters for use in a database regexp match.

    Modelled on re.escape, which escapes too much: here only the
    characters listed in ESCAPED are prefixed with a backslash (the NUL
    character is emitted as a backslash followed by 000), and every other
    character is passed through unchanged.
    """
    def quote(char):
        # Guard clauses: the common case (nothing to escape) comes first.
        if char not in ESCAPED:
            return char
        if char == '\000':
            return '\\000'
        return '\\' + char

    return ''.join(quote(char) for char in pattern)
29 changes: 29 additions & 0 deletions weblate/utils/tests/test_db.py
@@ -0,0 +1,29 @@
# -*- coding: utf-8 -*-
#
# Copyright © 2012 - 2017 Michal Čihař <michal@cihar.com>
#
# This file is part of Weblate <https://weblate.org/>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#

from unittest import TestCase

from weblate.utils.db import re_escape


class DbTest(TestCase):
    """Tests for the database utility helpers."""

    def test_re_escape(self):
        """Check re_escape output for bracket and brace patterns."""
        cases = (
            ('[a-z]', '\\[a\\-z\\]'),
            ('a{1,4}', 'a\\{1,4\\}'),
        )
        for raw, escaped in cases:
            self.assertEqual(re_escape(raw), escaped)

0 comments on commit 7425a74

Please sign in to comment.