From 588286bfb342f503a0a746061856dd3dfcff9df4 Mon Sep 17 00:00:00 2001
From: Iwan Aucamp <aucampia@gmail.com>
Date: Wed, 19 Jul 2023 19:55:53 +0200
Subject: [PATCH] fix: handling of `rdf:HTML` literals (#2490)

Previously, without `html5lib` installed, literals with `rdf:HTML`
datatypes were treated as
[ill-typed](https://www.w3.org/TR/rdf11-concepts/#section-Graph-Literal),
even if they were not ill-typed.

With this change, if `html5lib` is not installed, literals with the
`rdf:HTML` datatype will not be treated as ill-typed, and will have
`None` as their `ill_typed` attribute value, which means that it is
unknown whether they are ill-typed or not.

This change also fixes the mapping from `rdf:HTML` literal values to
lexical forms.

Other changes:

- Add tests for `rdflib.NORMALIZE_LITERALS` to ensure it behaves
  correctly.

Related issues:

- Fixes <https://github.com/RDFLib/rdflib/issues/2475>
---
 rdflib/term.py                             |  68 ++++++++++----
 test/conftest.py                           |   7 +-
 test/test_literal/test_literal.py          | 103 +++++++++++++++++++--
 test/test_literal/test_literal_html5lib.py |  79 ++++++++++++++++
 test/test_sparql/test_sparql.py            |  17 ++++
 test/utils/literal.py                      |  39 +++++++-
 tox.ini                                    |   4 +-
 7 files changed, 281 insertions(+), 36 deletions(-)
 create mode 100644 test/test_literal/test_literal_html5lib.py

diff --git a/rdflib/term.py b/rdflib/term.py
index 0a136684c..e0f9cbf3e 100644
--- a/rdflib/term.py
+++ b/rdflib/term.py
@@ -76,6 +76,15 @@
     from .namespace import NamespaceManager
     from .paths import AlternativePath, InvPath, NegatedPath, Path, SequencePath
 
+_HAS_HTML5LIB = False
+
+try:
+    import html5lib
+
+    _HAS_HTML5LIB = True
+except ImportError:
+    html5lib = None
+
 _SKOLEM_DEFAULT_AUTHORITY = "https://rdflib.github.io"
 
 logger = logging.getLogger(__name__)
@@ -1638,20 +1647,34 @@ def _parseXML(xmlstring: str) -> xml.dom.minidom.Document:  # noqa: N802
     return retval
 
 
-def _parseHTML(htmltext: str) -> xml.dom.minidom.DocumentFragment:  # noqa: N802
-    try:
-        import html5lib
-    except ImportError:
-        raise ImportError(
-            "HTML5 parser not available. Try installing"
-            + " html5lib <http://code.google.com/p/html5lib>"
-        )
+def _parse_html(lexical_form: str) -> xml.dom.minidom.DocumentFragment:
+    """
+    Parse the lexical form of an HTML literal into a document fragment
+    using the ``dom`` from html5lib tree builder.
+
+    :param lexical_form: The lexical form of the HTML literal.
+    :return: A document fragment representing the HTML literal.
+    :raises: `html5lib.html5parser.ParseError` if the lexical form is
+        not valid HTML.
+    """
     parser = html5lib.HTMLParser(
         tree=html5lib.treebuilders.getTreeBuilder("dom"), strict=True
     )
-    retval = parser.parseFragment(htmltext)
-    retval.normalize()
-    return retval
+    result: xml.dom.minidom.DocumentFragment = parser.parseFragment(lexical_form)
+    result.normalize()
+    return result
+
+
+def _write_html(value: xml.dom.minidom.DocumentFragment) -> bytes:
+    """
+    Serialize a document fragment representing an HTML literal into
+    its lexical form.
+
+    :param value: A document fragment representing an HTML literal.
+    :return: The lexical form of the HTML literal.
+    """
+    result = html5lib.serialize(value, tree="dom")
+    return result
 
 
 def _writeXML(  # noqa: N802
@@ -1967,14 +1990,21 @@ def _castPythonToLiteral(  # noqa: N802
     (Duration, (lambda i: duration_isoformat(i), _XSD_DURATION)),
     (timedelta, (lambda i: duration_isoformat(i), _XSD_DAYTIMEDURATION)),
     (xml.dom.minidom.Document, (_writeXML, _RDF_XMLLITERAL)),
-    # this is a bit dirty - by accident the html5lib parser produces
-    # DocumentFragments, and the xml parser Documents, letting this
-    # decide what datatype to use makes roundtripping easier, but it a
-    # bit random
-    (xml.dom.minidom.DocumentFragment, (_writeXML, _RDF_HTMLLITERAL)),
     (Fraction, (None, _OWL_RATIONAL)),
 ]
 
+if html5lib is not None:
+    # This is a bit dirty, by accident the html5lib parser produces
+    # DocumentFragments, and the xml parser Documents, letting this
+    # decide what datatype to use makes roundtripping easier, but it is a
+    # bit random.
+    #
+    # This must happen before _GenericPythonToXSDRules is assigned to
+    # _OriginalGenericPythonToXSDRules.
+    _GenericPythonToXSDRules.append(
+        (xml.dom.minidom.DocumentFragment, (_write_html, _RDF_HTMLLITERAL))
+    )
+
 _OriginalGenericPythonToXSDRules = list(_GenericPythonToXSDRules)
 
 _SpecificPythonToXSDRules: List[
@@ -2025,9 +2055,13 @@ def _castPythonToLiteral(  # noqa: N802
     URIRef(_XSD_PFX + "base64Binary"): b64decode,
     URIRef(_XSD_PFX + "anyURI"): None,
     _RDF_XMLLITERAL: _parseXML,
-    _RDF_HTMLLITERAL: _parseHTML,
 }
 
+if html5lib is not None:
+    # It is probably best to keep this close to the definition of
+    # _GenericPythonToXSDRules so nobody misses it.
+    XSDToPython[_RDF_HTMLLITERAL] = _parse_html
+
 _check_well_formed_types: Dict[URIRef, Callable[[Union[str, bytes], Any], bool]] = {
     URIRef(_XSD_PFX + "boolean"): _well_formed_boolean,
     URIRef(_XSD_PFX + "nonPositiveInteger"): _well_formed_non_positive_integer,
diff --git a/test/conftest.py b/test/conftest.py
index 01153f9fa..51b946ec8 100644
--- a/test/conftest.py
+++ b/test/conftest.py
@@ -3,6 +3,8 @@
 
 import pytest
 
+# This is here so that asserts from these modules are formatted for human
+# readability.
 pytest.register_assert_rewrite("test.utils")
 
 from pathlib import Path  # noqa: E402
@@ -19,7 +21,7 @@
     Union,
 )
 
-from rdflib import Graph
+from rdflib import Graph  # noqa: E402
 
 from .data import TEST_DATA_DIR
 from .utils.earl import EARLReporter  # noqa: E402
@@ -27,9 +29,6 @@
 
 pytest_plugins = [EARLReporter.__module__]
 
-# This is here so that asserts from these modules are formatted for human
-# readibility.
-
 
 @pytest.fixture(scope="session")
 def http_file_server() -> Generator[HTTPFileServer, None, None]:
diff --git a/test/test_literal/test_literal.py b/test/test_literal/test_literal.py
index 5fc3cec71..5193fedc1 100644
--- a/test/test_literal/test_literal.py
+++ b/test/test_literal/test_literal.py
@@ -1,3 +1,15 @@
+from __future__ import annotations
+
+import builtins
+import datetime
+import logging
+from decimal import Decimal
+from test.utils import affix_tuples
+from test.utils.literal import LiteralChecker, literal_idfn
+from test.utils.namespace import EGDC
+from test.utils.outcome import OutcomeChecker, OutcomePrimitive, OutcomePrimitives
+from typing import Any, Callable, Generator, Optional, Type, Union
+
 # NOTE: The config below enables strict mode for mypy.
 # mypy: no_ignore_errors
 # mypy: warn_unused_configs, disallow_any_generics
@@ -7,14 +19,13 @@
 # mypy: no_implicit_optional, warn_redundant_casts, warn_unused_ignores
 # mypy: warn_return_any, no_implicit_reexport, strict_equality
 
-import datetime
-import logging
-from decimal import Decimal
-from test.utils import affix_tuples
-from test.utils.literal import LiteralChecker
-from test.utils.namespace import EGDC
-from test.utils.outcome import OutcomeChecker, OutcomePrimitive, OutcomePrimitives
-from typing import Any, Callable, Generator, Optional, Type, Union
+
+try:
+    import html5lib as _  # noqa: F401
+
+    _HAVE_HTML5LIB = True
+except ImportError:
+    _HAVE_HTML5LIB = False
 
 import isodate
 import pytest
@@ -915,6 +926,21 @@ def unlexify(s: str) -> str:
     )
 
 
+class _UnknownType:
+    """
+    A class that is not known to rdflib, used to test how
+    rdflib.term.Literal handles unknown Python types.
+    """
+
+    def __repr__(self) -> str:
+        return "_UnknownType()"
+
+    def __eq__(self, __value: object) -> bool:
+        if isinstance(__value, _UnknownType):
+            return True
+        return False
+
+
 @pytest.mark.parametrize(
     ["literal_maker", "outcome"],
     [
@@ -951,7 +977,30 @@ def unlexify(s: str) -> str:
             lambda: Literal(Literal("blue sky", "en")),
             Literal("blue sky", "en"),
         ),
+        (
+            lambda: Literal("<body>", datatype=RDF.HTML),
+            LiteralChecker(
+                ..., None, RDF.HTML, True if _HAVE_HTML5LIB else None, "<body>"
+            ),
+        ),
+        (
+            lambda: Literal("<table></table>", datatype=RDF.HTML),
+            LiteralChecker(
+                ...,
+                None,
+                RDF.HTML,
+                False if _HAVE_HTML5LIB else None,
+                "<table></table>",
+            ),
+        ),
+        (
+            lambda: Literal(_UnknownType(), datatype=EGDC.UnknownType),
+            LiteralChecker(
+                _UnknownType(), None, EGDC.UnknownType, None, "_UnknownType()"
+            ),
+        ),
     ],
+    ids=literal_idfn,
 )
 def test_literal_construction(
     literal_maker: Callable[[], Literal],
@@ -961,3 +1010,41 @@ def test_literal_construction(
     with checker.context():
         actual_outcome = literal_maker()
         checker.check(actual_outcome)
+
+
+@pytest.mark.parametrize(
+    ["literal_maker", "normalize_literals", "outcome"],
+    [
+        (
+            lambda: Literal("001000", datatype=XSD.integer),
+            ...,
+            LiteralChecker(1000, None, XSD.integer, False, "1000"),
+        ),
+        (
+            lambda: Literal("001000", datatype=XSD.integer),
+            True,
+            LiteralChecker(1000, None, XSD.integer, False, "1000"),
+        ),
+        (
+            lambda: Literal("001000", datatype=XSD.integer),
+            False,
+            LiteralChecker(1000, None, XSD.integer, False, "001000"),
+        ),
+    ],
+    ids=literal_idfn,
+)
+def test_global_normalize(
+    literal_maker: Callable[[], Literal],
+    normalize_literals: Union[builtins.ellipsis, bool],
+    outcome: OutcomePrimitives[Literal],
+) -> None:
+    _normalize_literals = rdflib.NORMALIZE_LITERALS
+    try:
+        if normalize_literals is not ...:
+            rdflib.NORMALIZE_LITERALS = normalize_literals
+        checker = OutcomeChecker[Literal].from_primitives(outcome)
+        with checker.context():
+            actual_outcome = literal_maker()
+            checker.check(actual_outcome)
+    finally:
+        rdflib.NORMALIZE_LITERALS = _normalize_literals
diff --git a/test/test_literal/test_literal_html5lib.py b/test/test_literal/test_literal_html5lib.py
new file mode 100644
index 000000000..ce39039c0
--- /dev/null
+++ b/test/test_literal/test_literal_html5lib.py
@@ -0,0 +1,79 @@
+import xml.dom.minidom
+from test.utils.literal import LiteralChecker
+from test.utils.outcome import OutcomeChecker, OutcomePrimitives
+from typing import Callable
+
+import pytest
+
+import rdflib.term
+from rdflib.namespace import RDF
+from rdflib.term import Literal
+
+try:
+    import html5lib as _  # noqa: F401
+except ImportError:
+    pytest.skip("html5lib not installed", allow_module_level=True)
+
+
+def test_has_html5lib() -> None:
+    assert rdflib.term._HAS_HTML5LIB is True
+    assert RDF.HTML in rdflib.term.XSDToPython
+    rule = next(
+        (
+            item
+            for item in rdflib.term._GenericPythonToXSDRules
+            if item[0] is xml.dom.minidom.DocumentFragment
+        ),
+        None,
+    )
+    assert rule is not None
+    assert rule[1][1] == RDF.HTML
+
+
+@pytest.mark.parametrize(
+    ["factory", "outcome"],
+    [
+        # Ill-typed literals, these have lexical forms that result in
+        # errors when parsed as HTML by html5lib.
+        (
+            lambda: Literal("<body><h1>Hello, World!</h1></body>", datatype=RDF.HTML),
+            LiteralChecker(
+                ..., None, RDF.HTML, True, "<body><h1>Hello, World!</h1></body>"
+            ),
+        ),
+        (
+            lambda: Literal("<body></body>", datatype=RDF.HTML),
+            LiteralChecker(..., None, RDF.HTML, True, "<body></body>"),
+        ),
+        (
+            lambda: Literal("<tr><td>THE TEXT IS IN HERE</td></tr>", datatype=RDF.HTML),
+            LiteralChecker(
+                ..., None, RDF.HTML, True, "<tr><td>THE TEXT IS IN HERE</td></tr>"
+            ),
+        ),
+        # Well-typed literals, these have lexical forms that parse
+        # without errors with html5lib.
+        (
+            lambda: Literal("<table></table>", datatype=RDF.HTML),
+            LiteralChecker(..., None, RDF.HTML, False, "<table></table>"),
+        ),
+        (
+            lambda: Literal("  <table>  </table>  ", datatype=RDF.HTML, normalize=True),
+            LiteralChecker(..., None, RDF.HTML, False, "  <table>  </table>  "),
+        ),
+        (
+            lambda: Literal(
+                "  <table>  </table>  ", datatype=RDF.HTML, normalize=False
+            ),
+            LiteralChecker(..., None, RDF.HTML, False, "  <table>  </table>  "),
+        ),
+    ],
+)
+def test_literal_construction(
+    factory: Callable[[], Literal],
+    outcome: OutcomePrimitives[Literal],
+) -> None:
+    checker = OutcomeChecker[Literal].from_primitives(outcome)
+    with checker.context():
+        actual_outcome = factory()
+        checker.check(actual_outcome)
diff --git a/test/test_sparql/test_sparql.py b/test/test_sparql/test_sparql.py
index 197323cba..4f7624af6 100644
--- a/test/test_sparql/test_sparql.py
+++ b/test/test_sparql/test_sparql.py
@@ -844,6 +844,23 @@ def thrower(*args: Any, **kwargs: Any) -> None:
             ],
             id="select-group-concat-optional-many",
         ),
+        pytest.param(
+            """
+            PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
+
+            SELECT * WHERE {
+                BIND(STRDT("<body>", rdf:HTML) as ?tag1) # incorrectly disappearing literal
+                BIND("<body>" as ?tag2)                  # correctly appearing literal
+            }
+            """,
+            [
+                {
+                    Variable("tag1"): Literal("<body>", datatype=RDF.HTML),
+                    Variable("tag2"): Literal("<body>"),
+                }
+            ],
+            id="select-bind-strdt-html",
+        ),
     ],
 )
 def test_queries(
diff --git a/test/utils/literal.py b/test/utils/literal.py
index b4b8cbf43..702f82f6d 100644
--- a/test/utils/literal.py
+++ b/test/utils/literal.py
@@ -1,9 +1,11 @@
 from __future__ import annotations
 
 import builtins
+import logging
 from dataclasses import dataclass
 from test.utils.outcome import NoExceptionChecker
-from typing import Any, Union
+from typing import Any, Optional, Union
+from xml.dom.minidom import DocumentFragment
 
 from rdflib.term import Literal, URIRef
 
@@ -17,13 +19,40 @@ class LiteralChecker(NoExceptionChecker[Literal]):
     lexical: Union[builtins.ellipsis, str] = ...
 
     def check(self, actual: Literal) -> None:
+        logging.debug(
+            "actual = %r, value = %r, ill_typed = %r",
+            actual,
+            actual.value,
+            actual.ill_typed,
+        )
         if self.value is not Ellipsis:
-            assert self.value == actual.value
+            if callable(self.value):
+                # A callable expectation is a predicate applied to the value.
+                logging.debug(f"Checking value {actual.value} with {self.value}")
+                if isinstance(actual.value, DocumentFragment):
+                    logging.debug(f"childNodes = {actual.value.childNodes}")
+                assert self.value(actual.value)
+            else:
+                assert self.value == actual.value
         if self.lexical is not Ellipsis:
-            assert self.lexical == f"{actual}"
+            assert self.lexical == f"{actual}", "Literal lexical form does not match"
         if self.ill_typed is not Ellipsis:
-            assert self.ill_typed == actual.ill_typed
+            assert (
+                self.ill_typed == actual.ill_typed
+            ), "Literal ill_typed flag does not match"
         if self.language is not Ellipsis:
-            assert self.language == actual.language
+            assert self.language == actual.language, "Literal language does not match"
         if self.datatype is not Ellipsis:
-            assert self.datatype == actual.datatype
+            assert self.datatype == actual.datatype, "Literal datatype does not match"
+
+
+def literal_idfn(value: Any) -> Optional[str]:
+    if callable(value):
+        try:
+            literal = value()
+        except Exception:
+            return None
+        return f"{literal}"
+    if isinstance(value, LiteralChecker):
+        return f"{value}"
+    return None
diff --git a/tox.ini b/tox.ini
index d2ecc891a..e3a75d8f7 100644
--- a/tox.ini
+++ b/tox.ini
@@ -15,12 +15,12 @@ setenv =
     COVERAGE_FILE = {env:COVERAGE_FILE:{toxinidir}/.coverage.{envname}}
     MYPY_CACHE_DIR = {envdir}/.mypy_cache
     docs: POETRY_ARGS_docs = --only=docs
-    extensive: POETRY_ARGS_extensive = --extras=berkeleydb --extras=networkx
+    extensive: POETRY_ARGS_extensive = --extras=berkeleydb --extras=networkx --extras=html
     lxml: POETRY_ARGS_lxml = --extras=lxml
 commands_pre =
     py3{7,8,9,10,11}: python -c 'import os; print("\n".join(f"{key}={value}" for key, value in os.environ.items()))'
     py3{7,8,9,10,11}: poetry lock --check
-    py3{7,8,9,10,11}: poetry install --no-root --only=main --only=dev --only=flake8 --only=tests --extras=html  {env:POETRY_ARGS_docs:} {env:POETRY_ARGS_extensive:} {env:POETRY_ARGS_lxml:} {env:POETRY_ARGS:} --sync
+    py3{7,8,9,10,11}: poetry install --no-root --only=main --only=dev --only=flake8 --only=tests {env:POETRY_ARGS_docs:} {env:POETRY_ARGS_extensive:} {env:POETRY_ARGS_lxml:} {env:POETRY_ARGS:} --sync
 commands =
     {env:TOX_EXTRA_COMMAND:}
     {env:TOX_MYPY_COMMAND:poetry run python -m mypy --show-error-context --show-error-codes --junit-xml=test_reports/{env:TOX_JUNIT_XML_PREFIX:}mypy-junit.xml}