Skip to content

Commit

Permalink
fix: lexical-to-value mapping of rdf:HTML literals (#2483)
Browse files Browse the repository at this point in the history
Use strict mode when parsing `rdf:HTML` literals. This ensures that when the
[lexical-to-value
mapping](https://www.w3.org/TR/rdf11-concepts/#dfn-lexical-to-value-mapping)
(i.e. parsing) is applied to a literal with the `rdf:HTML` datatype, a value
is assigned only if the lexical form is a valid HTML5 fragment.
Otherwise (i.e. for invalid fragments), no value is associated with
the literal
[[ref](https://www.w3.org/TR/rdf11-concepts/#section-Graph-Literal)], and
the literal is ill-typed.

---------

Co-authored-by: WhiteGobo <richardfechner@posteo.net>
  • Loading branch information
WhiteGobo and WhiteGobo committed Jul 12, 2023
1 parent 6e6d9e9 commit 53aaf02
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 5 deletions.
11 changes: 6 additions & 5 deletions rdflib/term.py
Expand Up @@ -1641,16 +1641,17 @@ def _parseXML(xmlstring: str) -> xml.dom.minidom.Document: # noqa: N802
def _parseHTML(htmltext: str) -> xml.dom.minidom.DocumentFragment: # noqa: N802
try:
import html5lib

parser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("dom"))
retval = parser.parseFragment(htmltext)
retval.normalize()
return retval
except ImportError:
raise ImportError(
"HTML5 parser not available. Try installing"
+ " html5lib <http://code.google.com/p/html5lib>"
)
parser = html5lib.HTMLParser(
tree=html5lib.treebuilders.getTreeBuilder("dom"), strict=True
)
retval = parser.parseFragment(htmltext)
retval.normalize()
return retval


def _writeXML( # noqa: N802
Expand Down
5 changes: 5 additions & 0 deletions test/test_literal/test_xmlliterals.py
Expand Up @@ -100,6 +100,11 @@ def testHTML():
assert l2.value is not None, "xml must have been parsed"
assert l2.datatype == RDF.HTML, "literal must have right datatype"

l3 = Literal("<invalid", datatype=RDF.HTML)
assert l3.value is None, "invalid html must not be parsed"
assert l3.datatype == RDF.HTML, "literal must have right datatype"
assert str(l3) == "<invalid", "invalid html must not be normalized"

assert l1 != l2
assert not l1.eq(l2)

Expand Down

0 comments on commit 53aaf02

Please sign in to comment.