Fixed the handling of escape sequences in the ntriples and nquads parsers.

These parsers will now correctly handle strings like `"\\r"`. The time it takes for these parsers to parse strings with escape sequences will be increased, and the increase will be correlated with the amount of escape sequences that occur in a string. For strings with many escape sequences the parsing speed seems to be almost 4 times slower.

Also:

- Add graph variant test scaffolding. Multiple files representing the same graph can now easily be tested to be isomorphic by just adding them in `test/variants`.
- Add more things to `testutils.GraphHelper`, including some methods that do asserts with better messages. Also include some tests for GraphHelper.
- Add some extra files to test_roundtrip, set the default identifier when parsing, and change verbose flag to rather be based on debug logging.
- Move one test from `test/test_issue247.py` to variants.
- Fix problems with `.editorconfig` which prevent it from working properly.
- Add xfail tests for a couple of issues.

This includes xfails for the following issues:

- #1216
- #1649
RDFLib · Jan 12, 2022 · 3cffd4d · 3cffd4d
1 parent d957533
commit 3cffd4d
Show file tree

Hide file tree

Showing 29 changed files with 941 additions and 87 deletions.
diff --git a/.editorconfig b/.editorconfig
@@ -11,16 +11,16 @@ insert_final_newline = true
 trim_trailing_whitespace = true
 
 # Leave line endings as-is in Markdown and ReStructuredText files
-[*.{md, rst}]
+[*.{md,rst}]
 charset = utf-8
 trim_trailing_whitespace = false
 
 # Matches multiple files with brace expansion notation
 # Set default charset
-[*.{js, py, pyi, toml, yml, yaml}]
+[*.{js,py,pyi,toml,yml,yaml}]
 charset = utf-8
 
-[*.{yaml, yml}]
+[*.{yaml,yml,json}]
 indent_style = space
 indent_size = 2
 

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,33 @@
+RELEASE TODO
+========================
+
+Changes:
+--------
+
+### Fixed the handling of escape sequences in the ntriples and nquads parsers
+
+These parsers will now correctly handle strings like `"\\r"`.
+
+The time it takes for these parsers to parse strings with escape sequences will be increased, and the increase will be correlated with the amount of escape sequences that occur in a string.
+
+For strings with many escape sequences the parsing speed seems to be almost 4 times slower.
+
+Fixes [issue #1655](https://github.com/RDFLib/rdflib/issues/1655).
+
+### Deprecated Functions
+
+Marked the following functions as deprecated:
+
+- `rdflib.compat.decodeStringEscape`: This function is not used anywhere in
+  rdflib anymore and the utility that it does provide is not implemented
+  correctly. It will be removed in RDFLib 7.0.0.
+
+PRs merged since last release:
+------------------------------
+
+* TODO
+
+
 2021-12-20 RELEASE 6.1.1
 ========================
 Better testing and tidier code.

diff --git a/rdflib/compat.py b/rdflib/compat.py
@@ -6,9 +6,9 @@
 import re
 import codecs
 import warnings
-import typing as t
+from typing import TYPE_CHECKING, Match
 
-if t.TYPE_CHECKING:
+if TYPE_CHECKING:
     import xml.etree.ElementTree as etree
 else:
     try:
@@ -82,6 +82,14 @@ def _unicodeExpand(s):
 
 
 def decodeStringEscape(s):
+    warnings.warn(
+        DeprecationWarning(
+            "rdflib.compat.decodeStringEscape() is deprecated, "
+            "it will be removed in rdflib 7.0.0. "
+            "This function is not used anywhere in rdflib anymore "
+            "and the utility that it does provide is not implemented correctly."
+        )
+    )
     r"""
     s is byte-string - replace \ escapes in string
     """
@@ -99,28 +107,38 @@ def decodeStringEscape(s):
     # return _unicodeExpand(s) # hmm - string escape doesn't do unicode escaping
 
 
-def decodeUnicodeEscape(s):
-    """
-    s is a unicode string
-    replace ``\\n`` and ``\\u00AC`` unicode escapes
-    """
-    if "\\" not in s:
-        # Most of times, there are no backslashes in strings.
-        # In the general case, it could use maketrans and translate.
-        return s
+_string_escape_map = {
+    "t": "\t",
+    "b": "\b",
+    "n": "\n",
+    "r": "\r",
+    "f": "\f",
+    '"': '"',
+    "'": "'",
+    "\\": "\\",
+}
+_string_escape_translator = str.maketrans(_string_escape_map)
 
-    s = s.replace("\\t", "\t")
-    s = s.replace("\\n", "\n")
-    s = s.replace("\\r", "\r")
-    s = s.replace("\\b", "\b")
-    s = s.replace("\\f", "\f")
-    s = s.replace('\\"', '"')
-    s = s.replace("\\'", "'")
-    s = s.replace("\\\\", "\\")
 
-    s = _unicodeExpand(s)  # hmm - string escape doesn't do unicode escaping
+def _turtle_escape_subber(match: Match[str]) -> str:
+    smatch, umatch = match.groups()
+    if smatch is not None:
+        return smatch.translate(_string_escape_translator)
+    else:
+        return chr(int(umatch[1:], 16))
 
-    return s
+
+_turtle_escape_pattern = re.compile(
+    r"""\\(?:([tbnrf"'\\])|(u[0-9A-Fa-f]{4}|U[0-9A-Fa-f]{8}))""",
+)
+
+
+def decodeUnicodeEscape(escaped: str) -> str:
+    if "\\" not in escaped:
+        # Most of times, there are no backslashes in strings.
+        # In the general case, it could use maketrans and translate.
+        return escaped
+    return _turtle_escape_pattern.sub(_turtle_escape_subber, escaped)
 
 
 # Migration to abc in Python 3.8

diff --git a/rdflib/plugins/parsers/ntriples.py b/rdflib/plugins/parsers/ntriples.py
@@ -13,7 +13,7 @@
 from rdflib.term import Node, URIRef as URI
 from rdflib.term import BNode as bNode
 from rdflib.term import Literal
-from rdflib.compat import decodeUnicodeEscape
+from rdflib.compat import decodeUnicodeEscape, _string_escape_translator
 from rdflib.exceptions import ParserError as ParseError
 from rdflib.parser import InputSource, Parser
 
@@ -49,19 +49,18 @@ def triple(self, s, p, o):
         print(s, p, o)
 
 
-quot = {"t": "\t", "n": "\n", "r": "\r", '"': '"', "\\": "\\"}
 r_safe = re.compile(r"([\x20\x21\x23-\x5B\x5D-\x7E]+)")
-r_quot = re.compile(r'\\(t|n|r|"|\\)')
-r_uniquot = re.compile(r"\\u([0-9A-F]{4})|\\U([0-9A-F]{8})")
+r_quot = re.compile(r"""\\([tbnrf"'\\])""")
+r_uniquot = re.compile(r"\\u([0-9A-Fa-f]{4})|\\U([0-9A-Fa-f]{8})")
 
 
-def unquote(s):
+def unquote(s: str) -> str:
     """Unquote an N-Triples string."""
     if not validate:
         if isinstance(s, str):  # nquads
             s = decodeUnicodeEscape(s)
         else:
-            s = s.decode("unicode-escape")
+            s = s.decode("unicode-escape")  # type: ignore[unreachable]
 
         return s
     else:
@@ -76,7 +75,7 @@ def unquote(s):
             m = r_quot.match(s)
             if m:
                 s = s[2:]
-                result.append(quot[m.group(1)])
+                result.append(m.group(1).translate(_string_escape_translator))
                 continue
 
             m = r_uniquot.match(s)

diff --git a/test/conftest.py b/test/conftest.py
@@ -1,3 +1,8 @@
 from .earl import EarlReporter
+import pytest
 
 pytest_plugins = [EarlReporter.__module__]
+
+# This is here so that asserts from these modules are formatted for human
+# readability.
+pytest.register_assert_rewrite("test.testutils")
diff --git a/test/test_issue247.py b/test/test_issue247.py
@@ -15,31 +15,8 @@
 
 </rdf:RDF>"""
 
-passxml = """\
-<rdf:RDF
-    xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
-    xmlns:dc="http://purl.org/dc/elements/1.1/"
->
-
-<rdf:Description rdf:about="http://example.org/">
-    <dc:description rdf:parseType="Literal">
-        <p xmlns="http://www.w3.org/1999/xhtml"></p>
-    </dc:description>
-</rdf:Description>
-
-</rdf:RDF>"""
-
 
 class TestXMLLiteralwithLangAttr(unittest.TestCase):
-    def test_successful_parse_of_literal_without_xmllang_attr(self):
-        """
-        Test parse of Literal without xmllang attr passes
-        Parsing an RDF/XML document fails with a KeyError when
-        it contains a XML Literal with a xml:lang attribute:
-        """
-        g = rdflib.Graph()
-        g.parse(data=passxml, format="xml")
-
     def test_failing_parse_of_literal_with_xmllang_attr(self):
         """
         Show parse of Literal with xmllang attr fails