RDFLib · aucampia · Jan 18, 2022 · Jan 15, 2022 · Jan 16, 2022 · Jan 16, 2022
diff --git a/.editorconfig b/.editorconfig
@@ -11,16 +11,16 @@ insert_final_newline = true
 trim_trailing_whitespace = true
 
 # Leave line endings as-is in Markdown and ReStructuredText files
-[*.{md, rst}]
+[*.{md,rst}]
 charset = utf-8
 trim_trailing_whitespace = false
 
 # Matches multiple files with brace expansion notation
 # Set default charset
-[*.{js, py, pyi, toml, yml, yaml}]
+[*.{js,py,pyi,toml,yml,yaml}]
 charset = utf-8
 
-[*.{yaml, yml}]
+[*.{yaml,yml,json}]
 indent_style = space
 indent_size = 2
 

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,33 @@
+RELEASE TODO
+========================
+
+Changes:
+--------
+
+### Fixed the handling of escape sequences in the ntriples and nquads parsers
+
+These parsers will now correctly handle strings like `"\\r"`.
+
+These parsers will now take longer to parse strings that contain escape sequences, and the slowdown grows with the number of escape sequences in a string.
+
+For strings with many escape sequences, parsing appears to be almost 4 times slower.
+
+Fixes [issue #1655](https://github.com/RDFLib/rdflib/issues/1655).
+
+### Deprecated Functions
+
+Marked the following functions as deprecated:
+
+- `rdflib.compat.decodeStringEscape`: This function is not used anywhere in
+  rdflib anymore and the utility that it does provide is not implemented
+  correctly. It will be removed in RDFLib 7.0.0.
+
+PRs merged since last release:
+------------------------------
+
+* TODO
+
+
 2021-12-20 RELEASE 6.1.1
 ========================
 Better testing and tidier code.

diff --git a/rdflib/compat.py b/rdflib/compat.py
@@ -5,9 +5,11 @@
 
 import re
 import codecs
-import typing as t
+import warnings
+from typing import TYPE_CHECKING, Match
 
-if t.TYPE_CHECKING:
+
+if TYPE_CHECKING:
     import xml.etree.ElementTree as etree
 else:
     try:
@@ -59,6 +61,14 @@ def _unicodeExpand(s):
 
 
 def decodeStringEscape(s):
+    warnings.warn(
+        DeprecationWarning(
+            "rdflib.compat.decodeStringEscape() is deprecated, "
+            "it will be removed in rdflib 7.0.0. "
+            "This function is not used anywhere in rdflib anymore "
+            "and the utility that it does provide is not implemented correctly."
+        )
+    )
     r"""
     s is byte-string - replace \ escapes in string
     """
@@ -76,28 +86,36 @@ def decodeStringEscape(s):
     # return _unicodeExpand(s) # hmm - string escape doesn't do unicode escaping
 
 
-def decodeUnicodeEscape(s):
-    """
-    s is a unicode string
-    replace ``\\n`` and ``\\u00AC`` unicode escapes
-    """
-    if "\\" not in s:
-        # Most of times, there are no backslashes in strings.
-        # In the general case, it could use maketrans and translate.
-        return s
+_string_escape_map = {
+    "t": "\t",
+    "b": "\b",
+    "n": "\n",
+    "r": "\r",
+    "f": "\f",
+    '"': '"',
+    "'": "'",
+    "\\": "\\",
+}
 
-    s = s.replace("\\t", "\t")
-    s = s.replace("\\n", "\n")
-    s = s.replace("\\r", "\r")
-    s = s.replace("\\b", "\b")
-    s = s.replace("\\f", "\f")
-    s = s.replace('\\"', '"')
-    s = s.replace("\\'", "'")
-    s = s.replace("\\\\", "\\")
 
-    s = _unicodeExpand(s)  # hmm - string escape doesn't do unicode escaping
+def _turtle_escape_subber(match: Match[str]) -> str:
+    smatch, umatch = match.groups()
+    if smatch is not None:
+        return _string_escape_map[smatch]
+    else:
+        return chr(int(umatch[1:], 16))
 
-    return s
+
+_turtle_escape_pattern = re.compile(
+    r"""\\(?:([tbnrf"'\\])|(u[0-9A-Fa-f]{4}|U[0-9A-Fa-f]{8}))""",
+)
+
+
+def decodeUnicodeEscape(escaped: str) -> str:
+    if "\\" not in escaped:
+        # Most of the time, there are no backslashes in strings.
+        return escaped
+    return _turtle_escape_pattern.sub(_turtle_escape_subber, escaped)
 
 
 # Migration to abc in Python 3.8

diff --git a/rdflib/plugins/parsers/ntriples.py b/rdflib/plugins/parsers/ntriples.py
@@ -13,7 +13,7 @@
 from rdflib.term import Node, URIRef as URI
 from rdflib.term import BNode as bNode
 from rdflib.term import Literal
-from rdflib.compat import decodeUnicodeEscape
+from rdflib.compat import decodeUnicodeEscape, _string_escape_map
 from rdflib.exceptions import ParserError as ParseError
 from rdflib.parser import InputSource, Parser
 
@@ -49,19 +49,18 @@ def triple(self, s, p, o):
         print(s, p, o)
 
 
-quot = {"t": "\t", "n": "\n", "r": "\r", '"': '"', "\\": "\\"}
 r_safe = re.compile(r"([\x20\x21\x23-\x5B\x5D-\x7E]+)")
-r_quot = re.compile(r'\\(t|n|r|"|\\)')
-r_uniquot = re.compile(r"\\u([0-9A-F]{4})|\\U([0-9A-F]{8})")
+r_quot = re.compile(r"""\\([tbnrf"'\\])""")
+r_uniquot = re.compile(r"\\u([0-9A-Fa-f]{4})|\\U([0-9A-Fa-f]{8})")
 
 
-def unquote(s):
+def unquote(s: str) -> str:
     """Unquote an N-Triples string."""
     if not validate:
         if isinstance(s, str):  # nquads
             s = decodeUnicodeEscape(s)
         else:
-            s = s.decode("unicode-escape")
+            s = s.decode("unicode-escape")  # type: ignore[unreachable]
 
         return s
     else:
@@ -76,7 +75,7 @@ def unquote(s):
             m = r_quot.match(s)
             if m:
                 s = s[2:]
-                result.append(quot[m.group(1)])
+                result.append(_string_escape_map[m.group(1)])
                 continue
 
             m = r_uniquot.match(s)

diff --git a/test/conftest.py b/test/conftest.py
@@ -1,3 +1,8 @@
 from .earl import EarlReporter
+import pytest
 
 pytest_plugins = [EarlReporter.__module__]
+
+# This is here so that asserts from these modules are formatted for human
+# readability.
+pytest.register_assert_rewrite("test.testutils")
diff --git a/test/test_issue247.py b/test/test_issue247.py
@@ -15,31 +15,8 @@
 
 </rdf:RDF>"""
 
-passxml = """\
-<rdf:RDF
-    xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
-    xmlns:dc="http://purl.org/dc/elements/1.1/"
->
-
-<rdf:Description rdf:about="http://example.org/">
-    <dc:description rdf:parseType="Literal">
-        <p xmlns="http://www.w3.org/1999/xhtml"></p>
-    </dc:description>
-</rdf:Description>
-
-</rdf:RDF>"""
-
 
 class TestXMLLiteralwithLangAttr(unittest.TestCase):
-    def test_successful_parse_of_literal_without_xmllang_attr(self):
-        """
-        Test parse of Literal without xmllang attr passes
-        Parsing an RDF/XML document fails with a KeyError when
-        it contains a XML Literal with a xml:lang attribute:
-        """
-        g = rdflib.Graph()
-        g.parse(data=passxml, format="xml")
-
     def test_failing_parse_of_literal_with_xmllang_attr(self):
         """
         Show parse of Literal with xmllang attr fails

diff --git a/test/test_roundtrip.py b/test/test_roundtrip.py
@@ -1,7 +1,7 @@
+from json.decoder import JSONDecodeError
 import logging
 import os.path
 from pathlib import Path
-from test.testutils import GraphHelper
 from typing import Callable, Collection, Iterable, List, Optional, Set, Tuple, Union
 from xml.sax import SAXParseException
 
@@ -10,8 +10,12 @@
 
 import rdflib
 import rdflib.compare
+from rdflib.plugins.parsers.notation3 import BadSyntax
 from rdflib.util import guess_format
 from rdflib.namespace import XSD
+from test.testutils import GraphHelper
+
+logger = logging.getLogger(__name__)
 
 """
 Test round-tripping by all serializers/parser that are registered.
@@ -35,7 +39,8 @@
 
 """
 
-NT_DATA_DIR = Path(__file__).parent / "nt"
+TEST_DIR = Path(__file__).parent
+NT_DATA_DIR = TEST_DIR / "nt"
 INVALID_NT_FILES = {
     # illegal literal as subject
     "literals-01.nt",
@@ -125,6 +130,30 @@
         reason='HexTuples conflates "" and ""^^xsd:string strings',
         raises=AssertionError,
     ),
+    ("xml", "special_chars.nt"): pytest.mark.xfail(
+        reason="missing escaping: PCDATA invalid Char value 12 and 8",
+        raises=SAXParseException,
+    ),
+    ("trix", "special_chars.nt"): pytest.mark.xfail(
+        reason="missing escaping: PCDATA invalid Char value 12 and 8",
+        raises=SAXParseException,
+    ),
+    ("n3", "rdf_prefix.jsonld"): pytest.mark.xfail(
+        reason="missing 'rdf:' prefix",
+        raises=BadSyntax,
+    ),
+    ("ttl", "rdf_prefix.jsonld"): pytest.mark.xfail(
+        reason="missing 'rdf:' prefix",
+        raises=BadSyntax,
+    ),
+    ("trig", "rdf_prefix.jsonld"): pytest.mark.xfail(
+        reason="missing 'rdf:' prefix",
+        raises=BadSyntax,
+    ),
+    ("turtle", "rdf_prefix.jsonld"): pytest.mark.xfail(
+        reason="missing 'rdf:' prefix",
+        raises=BadSyntax,
+    ),
 }
 
 # This is for files which can only be represented properly in one format
@@ -149,17 +178,15 @@ def collect_files(
     return result
 
 
-def roundtrip(infmt: str, testfmt: str, source: Path, verbose: bool = False) -> None:
-
+def roundtrip(infmt: str, testfmt: str, source: Path) -> None:
     g1 = rdflib.ConjunctiveGraph()
 
     g1.parse(source, format=infmt)
 
     s = g1.serialize(format=testfmt)
 
-    if verbose:
-        print("S:")
-        print(s, flush=True)
+    if logger.isEnabledFor(logging.DEBUG):
+        logger.debug("serailized = \n%s", s)
 
     g2 = rdflib.ConjunctiveGraph()
     g2.parse(data=s, format=testfmt)
@@ -176,23 +203,16 @@ def roundtrip(infmt: str, testfmt: str, source: Path, verbose: bool = False) ->
                     c.remove((s, p, o))
                     c.add((s, p, rdflib.Literal(str(o))))
 
-    if verbose:
+    if logger.isEnabledFor(logging.DEBUG):
         both, first, second = rdflib.compare.graph_diff(g1, g2)
-        print("Diff:")
-        print("%d triples in both" % len(both))
-        print("G1 Only:")
-        for t in sorted(first):
-            print(t)
+        logger.debug("Items in both:\n%s", GraphHelper.format_graph_set(both))
+        logger.debug("Items in G1 Only:\n%s", GraphHelper.format_graph_set(first))
+        logger.debug("Items in G2 Only:\n%s", GraphHelper.format_graph_set(second))
 
-        print("--------------------")
-        print("G2 Only")
-        for t in sorted(second):
-            print(t)
+    GraphHelper.assert_isomorphic(g1, g2)
 
-    assert rdflib.compare.isomorphic(g1, g2)
-
-    if verbose:
-        print("Ok!")
+    if logger.isEnabledFor(logging.DEBUG):
+        logger.debug("OK")
 
 
 _formats: Optional[Set[str]] = None
@@ -211,7 +231,9 @@ def get_formats() -> Set[str]:
     return _formats
 
 
-def make_cases(files: Collection[Tuple[Path, str]]) -> Iterable[ParameterSet]:
+def make_cases(
+    files: Collection[Tuple[Path, str]], hext_okay: bool = False
+) -> Iterable[ParameterSet]:
     formats = get_formats()
     for testfmt in formats:
         # if testfmt == "hext":
@@ -251,3 +273,18 @@ def test_nt(checker: Callable[[str, str, Path], None], args: Tuple[str, str, Pat
 @pytest.mark.parametrize("checker, args", make_cases(collect_files(N3_DATA_DIR)))
 def test_n3(checker: Callable[[str, str, Path], None], args: Tuple[str, str, Path]):
     checker(*args)
+
+
+EXTRA_FILES = [
+    (TEST_DIR / "variants" / "special_chars.nt", "ntriples"),
+    (TEST_DIR / "variants" / "xml_literal.rdf", "xml"),
+    (TEST_DIR / "variants" / "rdf_prefix.jsonld", "json-ld"),
+]
+
+
+@pytest.mark.parametrize("checker, args", make_cases(EXTRA_FILES, hext_okay=True))
+def test_extra(checker: Callable[[str, str, Path], None], args: Tuple[str, str, Path]):
+    """
+    Round tripping works correctly for selected extra files.
+    """
+    checker(*args)