Skip to content

Commit

Permalink
Fixed the handling of escape sequences in the ntriples and nquads par…
Browse files Browse the repository at this point in the history
…sers.

These parsers will now correctly handle strings like `"\\r"`.

The time it takes for these parsers to parse strings with escape
sequences will be increased, and the increase will be correlated with
the number of escape sequences that occur in a string.

For strings with many escape sequences the parsing speed seems to be
almost 4 times slower.

Also:

- Add graph variant test scaffolding. Multiple files representing the
  same graph can now easily be tested to be isomorphic by just adding
  them in `test/variants`.
- Add more things to `testutils.GraphHelper`, including some methods that do
  asserts with better messages. Also include some tests for GraphHelper.
- Add some extra files to test_roundtrip, set the default identifier
  when parsing, and change verbose flag to rather be based on debug
  logging.
- move one test from `test/test_issue247.py` to variants.
- Fix problems with `.editorconfig` which prevented it from working
  properly.
- Add xfail tests for a couple of issues
  This includes xfails for the following issues:
  - #1216
  - #1649
  • Loading branch information
aucampia committed Jan 12, 2022
1 parent d957533 commit 3cffd4d
Show file tree
Hide file tree
Showing 29 changed files with 941 additions and 87 deletions.
6 changes: 3 additions & 3 deletions .editorconfig
Expand Up @@ -11,16 +11,16 @@ insert_final_newline = true
trim_trailing_whitespace = true

# Leave line endings as-is in Markdown and ReStructuredText files
[*.{md, rst}]
[*.{md,rst}]
charset = utf-8
trim_trailing_whitespace = false

# Matches multiple files with brace expansion notation
# Set default charset
[*.{js, py, pyi, toml, yml, yaml}]
[*.{js,py,pyi,toml,yml,yaml}]
charset = utf-8

[*.{yaml, yml}]
[*.{yaml,yml,json}]
indent_style = space
indent_size = 2

Expand Down
30 changes: 30 additions & 0 deletions CHANGELOG.md
@@ -1,3 +1,33 @@
RELEASE TODO
========================

Changes:
--------

### Fixed the handling of escape sequences in the ntriples and nquads parsers

These parsers will now correctly handle strings like `"\\r"`.

The time it takes for these parsers to parse strings with escape sequences will be increased, and the increase will be correlated with the number of escape sequences that occur in a string.

For strings with many escape sequences the parsing speed seems to be almost 4 times slower.

Fixes [issue #1655](https://github.com/RDFLib/rdflib/issues/1655).

### Deprecated Functions

Marked the following functions as deprecated:

- `rdflib.compat.decodeStringEscape`: This function is not used anywhere in
rdflib anymore and the utility that it does provide is not implemented
correctly. It will be removed in RDFLib 7.0.0

PRs merged since last release:
------------------------------

* TODO


2021-12-20 RELEASE 6.1.1
========================
Better testing and tidier code.
Expand Down
60 changes: 39 additions & 21 deletions rdflib/compat.py
Expand Up @@ -6,9 +6,9 @@
import re
import codecs
import warnings
import typing as t
from typing import TYPE_CHECKING, Match

if t.TYPE_CHECKING:
if TYPE_CHECKING:
import xml.etree.ElementTree as etree
else:
try:
Expand Down Expand Up @@ -82,6 +82,14 @@ def _unicodeExpand(s):


def decodeStringEscape(s):
warnings.warn(
DeprecationWarning(
"rdflib.compat.decodeStringEscape() is deprecated, "
"it will be removed in rdflib 7.0.0. "
"This function is not used anywhere in rdflib anymore "
"and the utility that it does provide is not implemented correctly."
)
)
r"""
s is byte-string - replace \ escapes in string
"""
Expand All @@ -99,28 +107,38 @@ def decodeStringEscape(s):
# return _unicodeExpand(s) # hmm - string escape doesn't do unicode escaping


def decodeUnicodeEscape(s):
"""
s is a unicode string
replace ``\\n`` and ``\\u00AC`` unicode escapes
"""
if "\\" not in s:
# Most of times, there are no backslashes in strings.
# In the general case, it could use maketrans and translate.
return s
# Lookup table for single-character escapes: each key is the character that
# follows the backslash, each value is the character the escape denotes.
_string_escape_map = dict(
    [
        ("t", "\t"),
        ("b", "\b"),
        ("n", "\n"),
        ("r", "\r"),
        ("f", "\f"),
        ('"', '"'),
        ("'", "'"),
        ("\\", "\\"),
    ]
)
# The same table keyed by code point, in the form ``str.translate`` expects.
_string_escape_translator = str.maketrans(_string_escape_map)

s = s.replace("\\t", "\t")
s = s.replace("\\n", "\n")
s = s.replace("\\r", "\r")
s = s.replace("\\b", "\b")
s = s.replace("\\f", "\f")
s = s.replace('\\"', '"')
s = s.replace("\\'", "'")
s = s.replace("\\\\", "\\")

s = _unicodeExpand(s) # hmm - string escape doesn't do unicode escaping
def _turtle_escape_subber(match: Match[str]) -> str:
    """Return the replacement text for a single matched escape sequence.

    The match is expected to expose two groups, as ``_turtle_escape_pattern``
    does: the character of a single-character escape, and a numeric escape
    made of a ``u``/``U`` marker followed by hexadecimal digits. Whichever
    group did not participate in the match is ``None``.
    """
    simple_escape, numeric_escape = match.groups()
    if simple_escape is None:
        # Drop the leading ``u``/``U`` marker; the rest is the code point
        # written in hexadecimal.
        code_point = int(numeric_escape[1:], 16)
        return chr(code_point)
    return simple_escape.translate(_string_escape_translator)

return s

# Matches one backslash escape sequence. Group 1 captures the character of a
# single-character escape (one of: t b n r f " ' \). Group 2 captures a
# numeric escape: "u" followed by 4 hex digits or "U" followed by 8 hex
# digits. Exactly one of the two groups participates in any match.
_turtle_escape_pattern = re.compile(
    r"""\\(?:([tbnrf"'\\])|(u[0-9A-Fa-f]{4}|U[0-9A-Fa-f]{8}))""",
)


def decodeUnicodeEscape(escaped: str) -> str:
    r"""Replace backslash escape sequences in *escaped* with their characters.

    Every sequence matched by ``_turtle_escape_pattern`` is substituted in a
    single left-to-right pass, so the output of one replacement is never
    re-interpreted as the start of another escape (``\\r`` becomes a
    backslash followed by ``r``). Text that the pattern does not match is
    returned unchanged.

    :param escaped: the string that may contain escape sequences.
    :return: the string with the recognised escape sequences decoded.
    """
    # Fast path: strings without a backslash cannot contain an escape, so the
    # regular expression machinery is skipped entirely.
    if "\\" in escaped:
        return _turtle_escape_pattern.sub(_turtle_escape_subber, escaped)
    return escaped


# Migration to abc in Python 3.8
Expand Down
13 changes: 6 additions & 7 deletions rdflib/plugins/parsers/ntriples.py
Expand Up @@ -13,7 +13,7 @@
from rdflib.term import Node, URIRef as URI
from rdflib.term import BNode as bNode
from rdflib.term import Literal
from rdflib.compat import decodeUnicodeEscape
from rdflib.compat import decodeUnicodeEscape, _string_escape_translator
from rdflib.exceptions import ParserError as ParseError
from rdflib.parser import InputSource, Parser

Expand Down Expand Up @@ -49,19 +49,18 @@ def triple(self, s, p, o):
print(s, p, o)


quot = {"t": "\t", "n": "\n", "r": "\r", '"': '"', "\\": "\\"}
r_safe = re.compile(r"([\x20\x21\x23-\x5B\x5D-\x7E]+)")
r_quot = re.compile(r'\\(t|n|r|"|\\)')
r_uniquot = re.compile(r"\\u([0-9A-F]{4})|\\U([0-9A-F]{8})")
r_quot = re.compile(r"""\\([tbnrf"'\\])""")
r_uniquot = re.compile(r"\\u([0-9A-Fa-f]{4})|\\U([0-9A-Fa-f]{8})")


def unquote(s):
def unquote(s: str) -> str:
"""Unquote an N-Triples string."""
if not validate:
if isinstance(s, str): # nquads
s = decodeUnicodeEscape(s)
else:
s = s.decode("unicode-escape")
s = s.decode("unicode-escape") # type: ignore[unreachable]

return s
else:
Expand All @@ -76,7 +75,7 @@ def unquote(s):
m = r_quot.match(s)
if m:
s = s[2:]
result.append(quot[m.group(1)])
result.append(m.group(1).translate(_string_escape_translator))
continue

m = r_uniquot.match(s)
Expand Down
5 changes: 5 additions & 0 deletions test/conftest.py
@@ -1,3 +1,8 @@
from .earl import EarlReporter
import pytest

pytest_plugins = [EarlReporter.__module__]

# This is here so that asserts from these modules are formatted for human
# readability.
pytest.register_assert_rewrite("test.testutils")
23 changes: 0 additions & 23 deletions test/test_issue247.py
Expand Up @@ -15,31 +15,8 @@
</rdf:RDF>"""

passxml = """\
<rdf:RDF
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns:dc="http://purl.org/dc/elements/1.1/"
>
<rdf:Description rdf:about="http://example.org/">
<dc:description rdf:parseType="Literal">
<p xmlns="http://www.w3.org/1999/xhtml"></p>
</dc:description>
</rdf:Description>
</rdf:RDF>"""


class TestXMLLiteralwithLangAttr(unittest.TestCase):
def test_successful_parse_of_literal_without_xmllang_attr(self):
"""
Test parse of Literal without xmllang attr passes
Parsing an RDF/XML document fails with a KeyError when
it contains a XML Literal with a xml:lang attribute:
"""
g = rdflib.Graph()
g.parse(data=passxml, format="xml")

def test_failing_parse_of_literal_with_xmllang_attr(self):
"""
Show parse of Literal with xmllang attr fails
Expand Down

0 comments on commit 3cffd4d

Please sign in to comment.