Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixed the handling of escape sequences in the ntriples and nquads parsers #1663

Merged
merged 3 commits into from Jan 18, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
6 changes: 3 additions & 3 deletions .editorconfig
Expand Up @@ -11,16 +11,16 @@ insert_final_newline = true
trim_trailing_whitespace = true

# Leave line endings as-is in Markdown and ReStructuredText files
[*.{md, rst}]
[*.{md,rst}]
charset = utf-8
trim_trailing_whitespace = false

# Matches multiple files with brace expansion notation
# Set default charset
[*.{js, py, pyi, toml, yml, yaml}]
[*.{js,py,pyi,toml,yml,yaml}]
charset = utf-8

[*.{yaml, yml}]
[*.{yaml,yml,json}]
indent_style = space
indent_size = 2

Expand Down
30 changes: 30 additions & 0 deletions CHANGELOG.md
@@ -1,3 +1,33 @@
RELEASE TODO
========================

Changes:
--------

### Fixed the handling of escape sequences in the ntriples and nquads parsers

These parsers will now correctly handle strings like `"\\r"`.

These parsers will now take longer to parse strings that contain escape sequences, and the increase is correlated with the number of escape sequences that occur in a string.

For strings with many escape sequences, parsing appears to be almost 4 times slower.

Fixes [issue #1655](https://github.com/RDFLib/rdflib/issues/1655).

### Deprecated Functions

Marked the following functions as deprecated:

- `rdflib.compat.decodeStringEscape`: This function is not used anywhere in
rdflib anymore and the utility that it does provide is not implemented
correctly. It will be removed in RDFLib 7.0.0.

PRs merged since last release:
------------------------------

* TODO


2021-12-20 RELEASE 6.1.1
========================
Better testing and tidier code.
Expand Down
60 changes: 39 additions & 21 deletions rdflib/compat.py
Expand Up @@ -5,9 +5,11 @@

import re
import codecs
import typing as t
import warnings
from typing import TYPE_CHECKING, Match

if t.TYPE_CHECKING:

if TYPE_CHECKING:
import xml.etree.ElementTree as etree
else:
try:
Expand Down Expand Up @@ -59,6 +61,14 @@ def _unicodeExpand(s):


def decodeStringEscape(s):
warnings.warn(
DeprecationWarning(
"rdflib.compat.decodeStringEscape() is deprecated, "
"it will be removed in rdflib 7.0.0. "
"This function is not used anywhere in rdflib anymore "
"and the utility that it does provide is not implemented correctly."
)
)
r"""
s is byte-string - replace \ escapes in string
"""
Expand All @@ -76,28 +86,36 @@ def decodeStringEscape(s):
# return _unicodeExpand(s) # hmm - string escape doesn't do unicode escaping


def decodeUnicodeEscape(s):
"""
s is a unicode string
replace ``\\n`` and ``\\u00AC`` unicode escapes
"""
if "\\" not in s:
# Most of times, there are no backslashes in strings.
# In the general case, it could use maketrans and translate.
return s
# Maps the character that follows the backslash in a single-character escape
# sequence to the character it stands for (e.g. "n" -> newline).
# Looked up by _turtle_escape_subber and by the N-Triples parser's unquote().
_string_escape_map = {
"t": "\t",
"b": "\b",
"n": "\n",
"r": "\r",
"f": "\f",
'"': '"',
"'": "'",
"\\": "\\",
}

s = s.replace("\\t", "\t")
s = s.replace("\\n", "\n")
s = s.replace("\\r", "\r")
s = s.replace("\\b", "\b")
s = s.replace("\\f", "\f")
s = s.replace('\\"', '"')
s = s.replace("\\'", "'")
s = s.replace("\\\\", "\\")

s = _unicodeExpand(s) # hmm - string escape doesn't do unicode escaping
def _turtle_escape_subber(match: Match[str]) -> str:
    r"""Return the replacement text for one matched escape sequence.

    Intended as the ``repl`` callable for ``_turtle_escape_pattern.sub``.

    :param match: A match with two groups. Group 1 holds the character of a
        single-character escape (looked up in ``_string_escape_map``);
        group 2 holds ``u`` + 4 hex digits or ``U`` + 8 hex digits.
    :return: The single character that the escape sequence denotes.
    :raises ValueError: If a numeric escape names a value above U+10FFFF,
        which is not a Unicode code point.
    """
    smatch, umatch = match.groups()
    if smatch is not None:
        return _string_escape_map[smatch]

    # umatch is "uXXXX" or "UXXXXXXXX": drop the marker, parse the hex digits.
    codepoint = int(umatch[1:], 16)
    if codepoint > 0x10FFFF:
        # An 8-digit \U escape can exceed the Unicode range; report which
        # escape was bad instead of surfacing chr()'s generic range error.
        raise ValueError(
            f"invalid escape sequence {match.group(0)!r}: "
            f"U+{codepoint:X} is not a valid Unicode code point"
        )
    return chr(codepoint)

return s

# Matches one backslash escape sequence. Group 1 captures the character of a
# single-character escape (t, b, n, r, f, ", ' or \); group 2 captures a
# numeric escape: "u" plus 4 hex digits or "U" plus 8 hex digits.
_turtle_escape_pattern = re.compile(
r"""\\(?:([tbnrf"'\\])|(u[0-9A-Fa-f]{4}|U[0-9A-Fa-f]{8}))""",
)


def decodeUnicodeEscape(escaped: str) -> str:
    r"""Replace backslash escape sequences in *escaped* with their characters.

    Handles the single-character escapes ``\t``, ``\b``, ``\n``, ``\r``,
    ``\f``, ``\"``, ``\'`` and ``\\`` as well as the numeric forms
    ``\uXXXX`` and ``\UXXXXXXXX``. All substitutions happen in a single
    regex pass, so the output of one replacement is never re-read as the
    start of another escape sequence.

    :param escaped: The string that may contain escape sequences.
    :return: The string with every recognised escape sequence decoded;
        the input itself when it contains no backslash.
    """
    if "\\" not in escaped:
        # Most of the time there are no backslashes in strings, so skip the
        # regex machinery entirely.
        return escaped
    return _turtle_escape_pattern.sub(_turtle_escape_subber, escaped)


# Migration to abc in Python 3.8
Expand Down
13 changes: 6 additions & 7 deletions rdflib/plugins/parsers/ntriples.py
Expand Up @@ -13,7 +13,7 @@
from rdflib.term import Node, URIRef as URI
from rdflib.term import BNode as bNode
from rdflib.term import Literal
from rdflib.compat import decodeUnicodeEscape
from rdflib.compat import decodeUnicodeEscape, _string_escape_map
from rdflib.exceptions import ParserError as ParseError
from rdflib.parser import InputSource, Parser

Expand Down Expand Up @@ -49,19 +49,18 @@ def triple(self, s, p, o):
print(s, p, o)


quot = {"t": "\t", "n": "\n", "r": "\r", '"': '"', "\\": "\\"}
r_safe = re.compile(r"([\x20\x21\x23-\x5B\x5D-\x7E]+)")
r_quot = re.compile(r'\\(t|n|r|"|\\)')
r_uniquot = re.compile(r"\\u([0-9A-F]{4})|\\U([0-9A-F]{8})")
r_quot = re.compile(r"""\\([tbnrf"'\\])""")
r_uniquot = re.compile(r"\\u([0-9A-Fa-f]{4})|\\U([0-9A-Fa-f]{8})")


def unquote(s):
def unquote(s: str) -> str:
"""Unquote an N-Triples string."""
if not validate:
if isinstance(s, str): # nquads
s = decodeUnicodeEscape(s)
else:
s = s.decode("unicode-escape")
s = s.decode("unicode-escape") # type: ignore[unreachable]

return s
else:
Expand All @@ -76,7 +75,7 @@ def unquote(s):
m = r_quot.match(s)
if m:
s = s[2:]
result.append(quot[m.group(1)])
result.append(_string_escape_map[m.group(1)])
continue

m = r_uniquot.match(s)
Expand Down
5 changes: 5 additions & 0 deletions test/conftest.py
@@ -1,3 +1,8 @@
from .earl import EarlReporter
import pytest

pytest_plugins = [EarlReporter.__module__]

# This is here so that asserts from these modules are formatted for human
# readability.
pytest.register_assert_rewrite("test.testutils")
23 changes: 0 additions & 23 deletions test/test_issue247.py
Expand Up @@ -15,31 +15,8 @@

</rdf:RDF>"""

passxml = """\
<rdf:RDF
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns:dc="http://purl.org/dc/elements/1.1/"
>

<rdf:Description rdf:about="http://example.org/">
<dc:description rdf:parseType="Literal">
<p xmlns="http://www.w3.org/1999/xhtml"></p>
</dc:description>
</rdf:Description>

</rdf:RDF>"""


class TestXMLLiteralwithLangAttr(unittest.TestCase):
def test_successful_parse_of_literal_without_xmllang_attr(self):
"""
Test parse of Literal without xmllang attr passes
Parsing an RDF/XML document fails with a KeyError when
it contains a XML Literal with a xml:lang attribute:
"""
g = rdflib.Graph()
g.parse(data=passxml, format="xml")

def test_failing_parse_of_literal_with_xmllang_attr(self):
"""
Show parse of Literal with xmllang attr fails
Expand Down
81 changes: 59 additions & 22 deletions test/test_roundtrip.py
@@ -1,7 +1,7 @@
from json.decoder import JSONDecodeError
import logging
import os.path
from pathlib import Path
from test.testutils import GraphHelper
from typing import Callable, Collection, Iterable, List, Optional, Set, Tuple, Union
from xml.sax import SAXParseException

Expand All @@ -10,8 +10,12 @@

import rdflib
import rdflib.compare
from rdflib.plugins.parsers.notation3 import BadSyntax
from rdflib.util import guess_format
from rdflib.namespace import XSD
from test.testutils import GraphHelper

logger = logging.getLogger(__name__)

"""
Test round-tripping by all serializers/parser that are registered.
Expand All @@ -35,7 +39,8 @@

"""

NT_DATA_DIR = Path(__file__).parent / "nt"
TEST_DIR = Path(__file__).parent
NT_DATA_DIR = TEST_DIR / "nt"
INVALID_NT_FILES = {
# illegal literal as subject
"literals-01.nt",
Expand Down Expand Up @@ -125,6 +130,30 @@
reason='HexTuples conflates "" and ""^^xsd:string strings',
raises=AssertionError,
),
("xml", "special_chars.nt"): pytest.mark.xfail(
reason="missing escaping: PCDATA invalid Char value 12 and 8",
raises=SAXParseException,
),
("trix", "special_chars.nt"): pytest.mark.xfail(
reason="missing escaping: PCDATA invalid Char value 12 and 8",
raises=SAXParseException,
),
("n3", "rdf_prefix.jsonld"): pytest.mark.xfail(
reason="missing 'rdf:' prefix",
raises=BadSyntax,
),
("ttl", "rdf_prefix.jsonld"): pytest.mark.xfail(
reason="missing 'rdf:' prefix",
raises=BadSyntax,
),
("trig", "rdf_prefix.jsonld"): pytest.mark.xfail(
reason="missing 'rdf:' prefix",
raises=BadSyntax,
),
("turtle", "rdf_prefix.jsonld"): pytest.mark.xfail(
reason="missing 'rdf:' prefix",
raises=BadSyntax,
),
}

# This is for files which can only be represented properly in one format
Expand All @@ -149,17 +178,15 @@ def collect_files(
return result


def roundtrip(infmt: str, testfmt: str, source: Path, verbose: bool = False) -> None:

def roundtrip(infmt: str, testfmt: str, source: Path) -> None:
g1 = rdflib.ConjunctiveGraph()

g1.parse(source, format=infmt)

s = g1.serialize(format=testfmt)

if verbose:
print("S:")
print(s, flush=True)
if logger.isEnabledFor(logging.DEBUG):
logger.debug("serailized = \n%s", s)

g2 = rdflib.ConjunctiveGraph()
g2.parse(data=s, format=testfmt)
Expand All @@ -176,23 +203,16 @@ def roundtrip(infmt: str, testfmt: str, source: Path, verbose: bool = False) ->
c.remove((s, p, o))
c.add((s, p, rdflib.Literal(str(o))))

if verbose:
if logger.isEnabledFor(logging.DEBUG):
both, first, second = rdflib.compare.graph_diff(g1, g2)
print("Diff:")
print("%d triples in both" % len(both))
print("G1 Only:")
for t in sorted(first):
print(t)
logger.debug("Items in both:\n%s", GraphHelper.format_graph_set(both))
logger.debug("Items in G1 Only:\n%s", GraphHelper.format_graph_set(first))
logger.debug("Items in G2 Only:\n%s", GraphHelper.format_graph_set(second))

print("--------------------")
print("G2 Only")
for t in sorted(second):
print(t)
GraphHelper.assert_isomorphic(g1, g2)

assert rdflib.compare.isomorphic(g1, g2)

if verbose:
print("Ok!")
if logger.isEnabledFor(logging.DEBUG):
logger.debug("OK")


_formats: Optional[Set[str]] = None
Expand All @@ -211,7 +231,9 @@ def get_formats() -> Set[str]:
return _formats


def make_cases(files: Collection[Tuple[Path, str]]) -> Iterable[ParameterSet]:
def make_cases(
files: Collection[Tuple[Path, str]], hext_okay: bool = False
) -> Iterable[ParameterSet]:
formats = get_formats()
for testfmt in formats:
# if testfmt == "hext":
Expand Down Expand Up @@ -251,3 +273,18 @@ def test_nt(checker: Callable[[str, str, Path], None], args: Tuple[str, str, Pat
@pytest.mark.parametrize("checker, args", make_cases(collect_files(N3_DATA_DIR)))
def test_n3(checker: Callable[[str, str, Path], None], args: Tuple[str, str, Path]):
checker(*args)


EXTRA_FILES = [
(TEST_DIR / "variants" / "special_chars.nt", "ntriples"),
(TEST_DIR / "variants" / "xml_literal.rdf", "xml"),
(TEST_DIR / "variants" / "rdf_prefix.jsonld", "json-ld"),
]


@pytest.mark.parametrize("checker, args", make_cases(EXTRA_FILES, hext_okay=True))
def test_extra(checker: Callable[[str, str, Path], None], args: Tuple[str, str, Path]):
"""
Round tripping works correctly for selected extra files.
"""
checker(*args)