From 588286bfb342f503a0a746061856dd3dfcff9df4 Mon Sep 17 00:00:00 2001 From: Iwan Aucamp Date: Wed, 19 Jul 2023 19:55:53 +0200 Subject: [PATCH] fix: handling of `rdf:HTML` literals (#2490) Previously, without `html5lib` installed, literals with `rdf:HTML` datatypes were treated as [ill-typed](https://www.w3.org/TR/rdf11-concepts/#section-Graph-Literal), even if they were not ill-typed. With this change, if `html5lib` is not installed, literals with the `rdf:HTML` datatype will not be treated as ill-typed, and will have `None` as their `ill_typed` attribute value, which means that it is unknown whether they are ill-typed or not. This change also fixes the mapping from `rdf:HTML` literal values to lexical forms. Other changes: - Add tests for `rdflib.NORMALIZE_LITERALS` to ensure it behaves correctly. Related issues: - Fixes --- rdflib/term.py | 68 ++++++++++---- test/conftest.py | 7 +- test/test_literal/test_literal.py | 103 +++++++++++++++++++-- test/test_literal/test_literal_html5lib.py | 79 ++++++++++++++++ test/test_sparql/test_sparql.py | 17 ++++ test/utils/literal.py | 39 +++++++- tox.ini | 4 +- 7 files changed, 281 insertions(+), 36 deletions(-) create mode 100644 test/test_literal/test_literal_html5lib.py diff --git a/rdflib/term.py b/rdflib/term.py index 0a136684c..e0f9cbf3e 100644 --- a/rdflib/term.py +++ b/rdflib/term.py @@ -76,6 +76,15 @@ from .namespace import NamespaceManager from .paths import AlternativePath, InvPath, NegatedPath, Path, SequencePath +_HAS_HTML5LIB = False + +try: + import html5lib + + _HAS_HTML5LIB = True +except ImportError: + html5lib = None + _SKOLEM_DEFAULT_AUTHORITY = "https://rdflib.github.io" logger = logging.getLogger(__name__) @@ -1638,20 +1647,34 @@ def _parseXML(xmlstring: str) -> xml.dom.minidom.Document: # noqa: N802 return retval -def _parseHTML(htmltext: str) -> xml.dom.minidom.DocumentFragment: # noqa: N802 - try: - import html5lib - except ImportError: - raise ImportError( - "HTML5 parser not available. 
Try installing" - + " html5lib " - ) +def _parse_html(lexical_form: str) -> xml.dom.minidom.DocumentFragment: + """ + Parse the lexical form of an HTML literal into a document fragment + using the html5lib ``dom`` tree builder. + + :param lexical_form: The lexical form of the HTML literal. + :return: A document fragment representing the HTML literal. + :raises: `html5lib.html5parser.ParseError` if the lexical form is + not valid HTML. + """ parser = html5lib.HTMLParser( tree=html5lib.treebuilders.getTreeBuilder("dom"), strict=True ) - retval = parser.parseFragment(htmltext) - retval.normalize() - return retval + result: xml.dom.minidom.DocumentFragment = parser.parseFragment(lexical_form) + result.normalize() + return result + + +def _write_html(value: xml.dom.minidom.DocumentFragment) -> bytes: + """ + Serialize a document fragment representing an HTML literal into + its lexical form. + + :param value: A document fragment representing an HTML literal. + :return: The lexical form of the HTML literal. + """ + result = html5lib.serialize(value, tree="dom") + return result def _writeXML( # noqa: N802 @@ -1967,14 +1990,21 @@ def _castPythonToLiteral( # noqa: N802 (Duration, (lambda i: duration_isoformat(i), _XSD_DURATION)), (timedelta, (lambda i: duration_isoformat(i), _XSD_DAYTIMEDURATION)), (xml.dom.minidom.Document, (_writeXML, _RDF_XMLLITERAL)), - # this is a bit dirty - by accident the html5lib parser produces - # DocumentFragments, and the xml parser Documents, letting this - # decide what datatype to use makes roundtripping easier, but it a - # bit random - (xml.dom.minidom.DocumentFragment, (_writeXML, _RDF_HTMLLITERAL)), (Fraction, (None, _OWL_RATIONAL)), ] +if html5lib is not None: + # This is a bit dirty, by accident the html5lib parser produces + # DocumentFragments, and the xml parser Documents, letting this + # decide what datatype to use makes roundtripping easier, but it is a + # bit random. 
+ # + # This must happen before _GenericPythonToXSDRules is assigned to + # _OriginalGenericPythonToXSDRules. + _GenericPythonToXSDRules.append( + (xml.dom.minidom.DocumentFragment, (_write_html, _RDF_HTMLLITERAL)) + ) + _OriginalGenericPythonToXSDRules = list(_GenericPythonToXSDRules) _SpecificPythonToXSDRules: List[ @@ -2025,9 +2055,13 @@ def _castPythonToLiteral( # noqa: N802 URIRef(_XSD_PFX + "base64Binary"): b64decode, URIRef(_XSD_PFX + "anyURI"): None, _RDF_XMLLITERAL: _parseXML, - _RDF_HTMLLITERAL: _parseHTML, } +if html5lib is not None: + # It is probably best to keep this close to the definition of + # _GenericPythonToXSDRules so nobody misses it. + XSDToPython[_RDF_HTMLLITERAL] = _parse_html + _check_well_formed_types: Dict[URIRef, Callable[[Union[str, bytes], Any], bool]] = { URIRef(_XSD_PFX + "boolean"): _well_formed_boolean, URIRef(_XSD_PFX + "nonPositiveInteger"): _well_formed_non_positive_integer, diff --git a/test/conftest.py b/test/conftest.py index 01153f9fa..51b946ec8 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -3,6 +3,8 @@ import pytest +# This is here so that asserts from these modules are formatted for human +# readability. pytest.register_assert_rewrite("test.utils") from pathlib import Path # noqa: E402 @@ -19,7 +21,7 @@ Union, ) -from rdflib import Graph +from rdflib import Graph # noqa: E402 from .data import TEST_DATA_DIR from .utils.earl import EARLReporter # noqa: E402 @@ -27,9 +29,6 @@ pytest_plugins = [EARLReporter.__module__] -# This is here so that asserts from these modules are formatted for human -# readibility. 
- @pytest.fixture(scope="session") def http_file_server() -> Generator[HTTPFileServer, None, None]: diff --git a/test/test_literal/test_literal.py b/test/test_literal/test_literal.py index 5fc3cec71..5193fedc1 100644 --- a/test/test_literal/test_literal.py +++ b/test/test_literal/test_literal.py @@ -1,3 +1,15 @@ +from __future__ import annotations + +import builtins +import datetime +import logging +from decimal import Decimal +from test.utils import affix_tuples +from test.utils.literal import LiteralChecker, literal_idfn +from test.utils.namespace import EGDC +from test.utils.outcome import OutcomeChecker, OutcomePrimitive, OutcomePrimitives +from typing import Any, Callable, Generator, Optional, Type, Union + # NOTE: The config below enables strict mode for mypy. # mypy: no_ignore_errors # mypy: warn_unused_configs, disallow_any_generics @@ -7,14 +19,13 @@ # mypy: no_implicit_optional, warn_redundant_casts, warn_unused_ignores # mypy: warn_return_any, no_implicit_reexport, strict_equality -import datetime -import logging -from decimal import Decimal -from test.utils import affix_tuples -from test.utils.literal import LiteralChecker -from test.utils.namespace import EGDC -from test.utils.outcome import OutcomeChecker, OutcomePrimitive, OutcomePrimitives -from typing import Any, Callable, Generator, Optional, Type, Union + +try: + import html5lib as _ # noqa: F401 + + _HAVE_HTML5LIB = True +except ImportError: + _HAVE_HTML5LIB = False import isodate import pytest @@ -915,6 +926,21 @@ def unlexify(s: str) -> str: ) +class _UnknownType: + """ + A class that is not known to rdflib, used to test how + rdflib.term.Literal handles unknown python types. 
+ """ + + def __repr__(self) -> str: + return "_UnknownType()" + + def __eq__(self, __value: object) -> bool: + if isinstance(__value, _UnknownType): + return True + return False + + @pytest.mark.parametrize( ["literal_maker", "outcome"], [ @@ -951,7 +977,30 @@ def unlexify(s: str) -> str: lambda: Literal(Literal("blue sky", "en")), Literal("blue sky", "en"), ), + ( + lambda: Literal("", datatype=RDF.HTML), + LiteralChecker( + ..., None, RDF.HTML, True if _HAVE_HTML5LIB else None, "" + ), + ), + ( + lambda: Literal("
", datatype=RDF.HTML), + LiteralChecker( + ..., + None, + RDF.HTML, + False if _HAVE_HTML5LIB else None, + "
", + ), + ), + ( + lambda: Literal(_UnknownType(), datatype=EGDC.UnknownType), + LiteralChecker( + _UnknownType(), None, EGDC.UnknownType, None, "_UnknownType()" + ), + ), ], + ids=literal_idfn, ) def test_literal_construction( literal_maker: Callable[[], Literal], @@ -961,3 +1010,41 @@ def test_literal_construction( with checker.context(): actual_outcome = literal_maker() checker.check(actual_outcome) + + +@pytest.mark.parametrize( + ["literal_maker", "normalize_literals", "outcome"], + [ + ( + lambda: Literal("001000", datatype=XSD.integer), + ..., + LiteralChecker(1000, None, XSD.integer, False, "1000"), + ), + ( + lambda: Literal("001000", datatype=XSD.integer), + True, + LiteralChecker(1000, None, XSD.integer, False, "1000"), + ), + ( + lambda: Literal("001000", datatype=XSD.integer), + False, + LiteralChecker(1000, None, XSD.integer, False, "001000"), + ), + ], + ids=literal_idfn, +) +def test_global_normalize( + literal_maker: Callable[[], Literal], + normalize_literals: Union[builtins.ellipsis, bool], + outcome: OutcomePrimitives[Literal], +) -> None: + _normalize_literals = rdflib.NORMALIZE_LITERALS + try: + if normalize_literals is not ...: + rdflib.NORMALIZE_LITERALS = normalize_literals + checker = OutcomeChecker[Literal].from_primitives(outcome) + with checker.context(): + actual_outcome = literal_maker() + checker.check(actual_outcome) + finally: + rdflib.NORMALIZE_LITERALS = _normalize_literals diff --git a/test/test_literal/test_literal_html5lib.py b/test/test_literal/test_literal_html5lib.py new file mode 100644 index 000000000..ce39039c0 --- /dev/null +++ b/test/test_literal/test_literal_html5lib.py @@ -0,0 +1,79 @@ +import xml.dom.minidom +from test.utils.literal import LiteralChecker +from test.utils.outcome import OutcomeChecker, OutcomePrimitives +from typing import Callable + +import pytest + +import rdflib.term +from rdflib.namespace import RDF +from rdflib.term import Literal + +try: + import html5lib as _ # noqa: F401 +except ImportError: 
+ pytest.skip("html5lib not installed", allow_module_level=True) + + +def test_has_html5lib() -> None: + assert rdflib.term._HAS_HTML5LIB is True + assert RDF.HTML in rdflib.term.XSDToPython + rule = next( + ( + item + for item in rdflib.term._GenericPythonToXSDRules + if item[0] is xml.dom.minidom.DocumentFragment + ), + None, + ) + assert rule is not None + assert rule[1][1] == RDF.HTML + + +@pytest.mark.parametrize( + ["factory", "outcome"], + [ + # Ill-typed literals, these have lexical forms that result in + # errors when parsed as HTML by html5lib. + ( + lambda: Literal("

Hello, World!

", datatype=RDF.HTML), + LiteralChecker( + ..., None, RDF.HTML, True, "

Hello, World!

" + ), + ), + ( + lambda: Literal("", datatype=RDF.HTML), + LiteralChecker(..., None, RDF.HTML, True, ""), + ), + ( + lambda: Literal("THE TEXT IS IN HERE", datatype=RDF.HTML), + LiteralChecker( + ..., None, RDF.HTML, True, "THE TEXT IS IN HERE" + ), + ), + # Well-typed literals, these have lexical forms that parse + # without errors with html5lib. + ( + lambda: Literal("
", datatype=RDF.HTML), + LiteralChecker(..., None, RDF.HTML, False, "
"), + ), + ( + lambda: Literal("
", datatype=RDF.HTML, normalize=True), + LiteralChecker(..., None, RDF.HTML, False, "
"), + ), + ( + lambda: Literal( + "
", datatype=RDF.HTML, normalize=False + ), + LiteralChecker(..., None, RDF.HTML, False, "
"), + ), + ], +) +def test_literal_construction( + factory: Callable[[], Literal], + outcome: OutcomePrimitives[Literal], +) -> None: + checker = OutcomeChecker[Literal].from_primitives(outcome) + with checker.context(): + actual_outcome = factory() + checker.check(actual_outcome) diff --git a/test/test_sparql/test_sparql.py b/test/test_sparql/test_sparql.py index 197323cba..4f7624af6 100644 --- a/test/test_sparql/test_sparql.py +++ b/test/test_sparql/test_sparql.py @@ -844,6 +844,23 @@ def thrower(*args: Any, **kwargs: Any) -> None: ], id="select-group-concat-optional-many", ), + pytest.param( + """ + PREFIX rdf: + + SELECT * WHERE { + BIND(STRDT("", rdf:HTML) as ?tag1) # incorrectly disappearing literal + BIND("" as ?tag2) # correctly appearing literal + } + """, + [ + { + Variable("tag1"): Literal("", datatype=RDF.HTML), + Variable("tag2"): Literal(""), + } + ], + id="select-bind-strdt-html", + ), ], ) def test_queries( diff --git a/test/utils/literal.py b/test/utils/literal.py index b4b8cbf43..702f82f6d 100644 --- a/test/utils/literal.py +++ b/test/utils/literal.py @@ -1,9 +1,11 @@ from __future__ import annotations import builtins +import logging from dataclasses import dataclass from test.utils.outcome import NoExceptionChecker -from typing import Any, Union +from typing import Any, Optional, Union +from xml.dom.minidom import DocumentFragment from rdflib.term import Literal, URIRef @@ -17,13 +19,40 @@ class LiteralChecker(NoExceptionChecker[Literal]): lexical: Union[builtins.ellipsis, str] = ... 
def check(self, actual: Literal) -> None: + logging.debug( + "actual = %r, value = %r, ill_typed = %r", + actual, + actual.value, + actual.ill_typed, + ) if self.value is not Ellipsis: + if callable(self.value): + logging.debug(f"Checking value {actual.value} with {self.value}") + if isinstance(actual.value, DocumentFragment): + logging.debug(f"childNodes = {actual.value.childNodes}") + assert self.value(actual.value) + else: + assert self.value == actual.value assert self.value == actual.value if self.lexical is not Ellipsis: - assert self.lexical == f"{actual}" + assert self.lexical == f"{actual}", "Literal lexical form does not match" if self.ill_typed is not Ellipsis: - assert self.ill_typed == actual.ill_typed + assert ( + self.ill_typed == actual.ill_typed + ), "Literal ill_typed flag does not match" if self.language is not Ellipsis: - assert self.language == actual.language + assert self.language == actual.language, "Literal language does not match" if self.datatype is not Ellipsis: - assert self.datatype == actual.datatype + assert self.datatype == actual.datatype, "Literal datatype does not match" + + +def literal_idfn(value: Any) -> Optional[str]: + if callable(value): + try: + literal = value() + except Exception: + return None + return f"{literal}" + if isinstance(value, LiteralChecker): + return f"{value}" + return None diff --git a/tox.ini b/tox.ini index d2ecc891a..e3a75d8f7 100644 --- a/tox.ini +++ b/tox.ini @@ -15,12 +15,12 @@ setenv = COVERAGE_FILE = {env:COVERAGE_FILE:{toxinidir}/.coverage.{envname}} MYPY_CACHE_DIR = {envdir}/.mypy_cache docs: POETRY_ARGS_docs = --only=docs - extensive: POETRY_ARGS_extensive = --extras=berkeleydb --extras=networkx + extensive: POETRY_ARGS_extensive = --extras=berkeleydb --extras=networkx --extras=html lxml: POETRY_ARGS_lxml = --extras=lxml commands_pre = py3{7,8,9,10,11}: python -c 'import os; print("\n".join(f"{key}={value}" for key, value in os.environ.items()))' py3{7,8,9,10,11}: poetry lock --check - 
py3{7,8,9,10,11}: poetry install --no-root --only=main --only=dev --only=flake8 --only=tests --extras=html {env:POETRY_ARGS_docs:} {env:POETRY_ARGS_extensive:} {env:POETRY_ARGS_lxml:} {env:POETRY_ARGS:} --sync + py3{7,8,9,10,11}: poetry install --no-root --only=main --only=dev --only=flake8 --only=tests {env:POETRY_ARGS_docs:} {env:POETRY_ARGS_extensive:} {env:POETRY_ARGS_lxml:} {env:POETRY_ARGS:} --sync commands = {env:TOX_EXTRA_COMMAND:} {env:TOX_MYPY_COMMAND:poetry run python -m mypy --show-error-context --show-error-codes --junit-xml=test_reports/{env:TOX_JUNIT_XML_PREFIX:}mypy-junit.xml}