fix: small InputSource related issues (#2255)
I have added a bunch of tests for `InputSource` handling, checking
most kinds of input source against most parsers. While doing this, I
found and fixed the following issues:

- `rdflib.util._iri2uri()` was URL-quoting the `netloc` component; this is
  wrong, as the `idna` encoding already takes care of special characters.
  I removed the URL quoting of `netloc` (see the sketch after this list).

- HexTuples parsing was handling the input source in a way that only
  worked for some kinds of input source, without raising errors for the
  rest. I changed the input source handling to be more generic.

- `rdflib.parser.create_input_source()` incorrectly used `file.buffer`
  instead of `source.buffer` when dealing with IO stream sources.
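
A minimal sketch of the `netloc` point (the host and port here are made up;
the outputs are what I'd expect from Python's `idna` codec and
`urllib.parse.quote`, not rdflib's code):

```python
from urllib.parse import quote

host, port = "bücher.example", "8080"

# The idna codec already produces plain ASCII, so nothing here needs
# percent-encoding afterwards.
netloc = host.encode("idna").decode("ascii") + ":" + port
print(netloc)         # xn--bcher-kva.example:8080

# Percent-quoting on top of that corrupts the authority, because quote()
# also escapes the ":" between host and port (and "@" before userinfo).
print(quote(netloc))  # xn--bcher-kva.example%3A8080
```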

Other changes with no runtime impact include:

- Changed the HTTP mocking in tests slightly to accommodate serving
  arbitrary files, as the `InputSource` tests need this.
- Don't use Google in tests, as we keep getting
  `urllib.error.HTTPError: HTTP Error 429: Too Many Requests`
  from it.
aucampia committed Mar 10, 2023
1 parent a146e0a commit 2b98507
Showing 23 changed files with 1,193 additions and 141 deletions.
18 changes: 17 additions & 1 deletion rdflib/parser.py
@@ -363,6 +363,10 @@ def create_input_source(
input_source = None

if source is not None:
if TYPE_CHECKING:
assert file is None
assert data is None
assert location is None
if isinstance(source, InputSource):
input_source = source
else:
@@ -379,7 +383,7 @@
input_source.setCharacterStream(source)
input_source.setEncoding(source.encoding)
try:
- b = file.buffer # type: ignore[union-attr]
+ b = source.buffer # type: ignore[union-attr]
input_source.setByteStream(b)
except (AttributeError, LookupError):
input_source.setByteStream(source)
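
For context on the `source.buffer` fix above: the old code read `file.buffer`,
but when only `source` is given `file` is `None`, so this raised
`AttributeError` and the fallback would quietly set the text stream as the
byte stream. `.buffer` only exists on text streams that wrap a binary stream;
a small illustration (not rdflib code):

```python
import io
import sys

# Text streams that wrap a binary stream expose the raw bytes via .buffer ...
print(hasattr(sys.stdin, "buffer"))            # usually True (TextIOWrapper)
# ... but purely in-memory text streams do not, hence the fallback branch.
print(hasattr(io.StringIO("spam"), "buffer"))  # False
```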
@@ -399,6 +403,10 @@
auto_close = False # make sure we close all file handles we open

if location is not None:
if TYPE_CHECKING:
assert file is None
assert data is None
assert source is None
(
absolute_location,
auto_close,
@@ -412,9 +420,17 @@
)

if file is not None:
if TYPE_CHECKING:
assert location is None
assert data is None
assert source is None
input_source = FileInputSource(file)

if data is not None:
if TYPE_CHECKING:
assert location is None
assert file is None
assert source is None
if isinstance(data, dict):
input_source = PythonInputSource(data)
auto_close = True
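
The `if TYPE_CHECKING:` asserts added above have no runtime cost; they only
record, for static type checkers, that the keyword arguments are mutually
exclusive on each branch. A generic sketch of the pattern (the function and
argument names are illustrative, not rdflib's API):

```python
from typing import TYPE_CHECKING, Optional


def choose(primary: Optional[str] = None, fallback: Optional[str] = None) -> str:
    # Callers are expected to pass exactly one of the two arguments.
    if primary is not None:
        if TYPE_CHECKING:
            # Never executed at runtime; it only tells the type checker that
            # `fallback` is out of play on this branch.
            assert fallback is None
        return primary
    if fallback is not None:
        return fallback
    raise ValueError("either primary or fallback must be given")
```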
37 changes: 19 additions & 18 deletions rdflib/plugins/parsers/hext.py
@@ -7,10 +7,11 @@

import json
import warnings
- from typing import TYPE_CHECKING, Any, List, Optional, Union
+ from io import TextIOWrapper
+ from typing import Any, BinaryIO, List, Optional, TextIO, Union

from rdflib.graph import ConjunctiveGraph, Graph
- from rdflib.parser import FileInputSource, InputSource, Parser
+ from rdflib.parser import InputSource, Parser
from rdflib.term import BNode, Literal, URIRef

__all__ = ["HextuplesParser"]
@@ -92,19 +93,19 @@ def parse(self, source: InputSource, graph: Graph, **kwargs: Any) -> None: # ty
cg = ConjunctiveGraph(store=graph.store, identifier=graph.identifier)
cg.default_context = graph

- # handle different source types - only file and string (data) for now
- if hasattr(source, "file"):
- if TYPE_CHECKING:
- assert isinstance(source, FileInputSource)
- # type error: Item "TextIOBase" of "Union[BinaryIO, TextIO, TextIOBase, RawIOBase, BufferedIOBase]" has no attribute "name"
- # type error: Item "RawIOBase" of "Union[BinaryIO, TextIO, TextIOBase, RawIOBase, BufferedIOBase]" has no attribute "name"
- # type error: Item "BufferedIOBase" of "Union[BinaryIO, TextIO, TextIOBase, RawIOBase, BufferedIOBase]" has no attribute "name"
- with open(source.file.name, encoding="utf-8") as fp: # type: ignore[union-attr]
- for l in fp: # noqa: E741
- self._parse_hextuple(cg, self._load_json_line(l))
- elif hasattr(source, "_InputSource__bytefile"):
- if hasattr(source._InputSource__bytefile, "wrapped"):
- for (
- l # noqa: E741
- ) in source._InputSource__bytefile.wrapped.strip().splitlines():
- self._parse_hextuple(cg, self._load_json_line(l))
+ text_stream: Optional[TextIO] = source.getCharacterStream()
+ if text_stream is None:
+ binary_stream: Optional[BinaryIO] = source.getByteStream()
+ if binary_stream is None:
+ raise ValueError(
+ f"Source does not have a character stream or a byte stream and cannot be used {type(source)}"
+ )
+ text_stream = TextIOWrapper(binary_stream, encoding="utf-8")
+
+ for line in text_stream:
+ if len(line) == 0 or line.isspace():
+ # Skipping empty lines because this is what was being done before for the first and last lines, albeit in an rather indirect way.
+ # The result is that we accept input that would otherwise be invalid.
+ # Possibly we should just let this result in an error.
+ continue
+ self._parse_hextuple(cg, self._load_json_line(line))
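
With the parser now reading from the source's character stream, or from its
byte stream wrapped in a `TextIOWrapper`, the usual ways of feeding `parse()`
should all hit the same code path. A rough usage sketch (the IRIs are made up,
and the exact Hextuples line format shown is from memory, so treat it as
illustrative):

```python
import io

from rdflib import Dataset

line = (
    '["http://example.com/subject", "http://example.com/predicate", '
    '"http://example.com/object", "globalId", "", ""]\n'
)

ds = Dataset()
# A plain string is exposed to the parser as a character stream ...
ds.parse(data=line, format="hext")
# ... while a binary file-like object is exposed as a byte stream, which the
# parser now wraps in a TextIOWrapper itself.
ds.parse(source=io.BytesIO(line.encode("utf-8")), format="hext")
```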
3 changes: 2 additions & 1 deletion rdflib/util.py
@@ -518,6 +518,7 @@ def _iri2uri(iri: str) -> str:
>>> _iri2uri("https://dbpedia.org/resource/Almería")
'https://dbpedia.org/resource/Almer%C3%ADa'
"""
# https://datatracker.ietf.org/doc/html/rfc3305

(scheme, netloc, path, query, fragment) = urlsplit(iri)

@@ -526,7 +527,7 @@
return iri

scheme = quote(scheme)
- netloc = quote(netloc.encode("idna").decode("utf-8"))
+ netloc = netloc.encode("idna").decode("utf-8")
path = quote(path)
query = quote(query)
fragment = quote(fragment)
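
Assuming the rest of `_iri2uri()` stays as shown, an IRI with a non-ASCII host
should now come out with an IDNA-encoded authority and a percent-encoded path,
roughly like this (the host is made up and the output is the expected result,
not a captured one):

```python
from rdflib.util import _iri2uri

print(_iri2uri("https://bücher.example/Almería"))
# expected: https://xn--bcher-kva.example/Almer%C3%ADa
```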
22 changes: 18 additions & 4 deletions test/conftest.py
@@ -2,6 +2,8 @@

pytest.register_assert_rewrite("test.utils")

from test.utils.http import ctx_http_server # noqa: E402
from test.utils.httpfileserver import HTTPFileServer # noqa: E402
from typing import Generator # noqa: E402

from rdflib import Graph
@@ -16,20 +18,32 @@
# readibility.


@pytest.fixture(scope="session")
def http_file_server() -> Generator[HTTPFileServer, None, None]:
host = "127.0.0.1"
server = HTTPFileServer((host, 0))
with ctx_http_server(server) as served:
yield served


@pytest.fixture(scope="session")
def rdfs_graph() -> Graph:
return Graph().parse(TEST_DATA_DIR / "defined_namespaces/rdfs.ttl", format="turtle")


@pytest.fixture(scope="session")
- def session_httpmock() -> Generator[ServedBaseHTTPServerMock, None, None]:
+ def _session_function_httpmock() -> Generator[ServedBaseHTTPServerMock, None, None]:
"""
This fixture is session scoped, but it is reset for each function in
:func:`function_httpmock`. This should not be used directly.
"""
with ServedBaseHTTPServerMock() as httpmock:
yield httpmock


@pytest.fixture(scope="function")
def function_httpmock(
- session_httpmock: ServedBaseHTTPServerMock,
+ _session_function_httpmock: ServedBaseHTTPServerMock,
) -> Generator[ServedBaseHTTPServerMock, None, None]:
- session_httpmock.reset()
- yield session_httpmock
+ _session_function_httpmock.reset()
+ yield _session_function_httpmock
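
A hypothetical test using the new session-scoped `http_file_server` fixture;
the `ProtoFileResource` header tuple and the `add_file_with_caching()` call
mirror the usage in `test_graph.py` further down, and the served file is one
of the new data variants:

```python
from test.data import TEST_DATA_DIR
from test.utils.httpfileserver import HTTPFileServer, ProtoFileResource

from rdflib import Graph


def test_parse_served_turtle(http_file_server: HTTPFileServer) -> None:
    file_info = http_file_server.add_file_with_caching(
        ProtoFileResource(
            (("Content-Type", "text/turtle"),),
            TEST_DATA_DIR / "variants/simple_triple.ttl",
        ),
    )
    # rdflib should pick the Turtle parser from the Content-Type header.
    graph = Graph().parse(location=file_info.request_url)
    assert len(graph) == 1
```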
6 changes: 6 additions & 0 deletions test/data/fetcher.py
@@ -268,6 +268,12 @@ def _member_io(
remote=Request("https://www.w3.org/2009/sparql/docs/tests/test-update.n3"),
local_path=(DATA_PATH / "defined_namespaces/ut.n3"),
),
FileResource(
remote=Request(
"https://github.com/web-platform-tests/wpt/raw/9d13065419df90d2ad71f3c6b78cc12e7800dae4/html/syntax/parsing/html5lib_tests1.html"
),
local_path=(DATA_PATH / "html5lib_tests1.html"),
),
]


28 changes: 28 additions & 0 deletions test/data/html5lib_tests1.html

Large diffs are not rendered by default.

20 changes: 20 additions & 0 deletions test/data/variants/diverse_triples.xml
@@ -0,0 +1,20 @@
<rdf:RDF
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns:eghttp="http://example.com/"
xmlns:egurn="urn:example:"
xmlns:egschema="example:"
xmlns:xsd="http://www.w3.org/2001/XMLSchema#" >
<rdf:Description rdf:about="example:object">
<eghttp:predicate>XSD string</eghttp:predicate>
</rdf:Description>
<rdf:Description rdf:about="http://example.com/subject">
<eghttp:predicate xml:lang="jpx">日本語の表記体系</eghttp:predicate>
</rdf:Description>
<rdf:Description rdf:about="urn:example:subject">
<egschema:predicate rdf:resource="example:subject"/>
</rdf:Description>
<rdf:Description rdf:about="example:subject">
<egschema:predicate rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">12</egschema:predicate>
<egschema:predicate rdf:resource="example:object"/>
</rdf:Description>
</rdf:RDF>
6 changes: 6 additions & 0 deletions test/data/variants/simple_triple.jsonld
@@ -0,0 +1,6 @@
{
"@id": "http://example.org/subject",
"http://example.org/predicate": {
"@id": "http://example.org/object"
}
}
2 changes: 2 additions & 0 deletions test/data/variants/simple_triple.ttl
@@ -0,0 +1,2 @@
<http://example.org/subject>
<http://example.org/predicate> <http://example.org/object> .
7 changes: 7 additions & 0 deletions test/data/variants/simple_triple.xml
@@ -0,0 +1,7 @@
<rdf:RDF
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns:j.0="http://example.org/" >
<rdf:Description rdf:about="http://example.org/subject">
<j.0:predicate rdf:resource="http://example.org/object"/>
</rdf:Description>
</rdf:RDF>
4 changes: 4 additions & 0 deletions test/jsonld/__init__.py
@@ -1,6 +1,10 @@
from typing import List

from rdflib import parser, plugin, serializer

assert plugin
assert serializer
assert parser
import json

__all__: List[str] = []
13 changes: 11 additions & 2 deletions test/test_graph/test_graph.py
@@ -4,6 +4,7 @@
from pathlib import Path
from test.data import TEST_DATA_DIR, bob, cheese, hates, likes, michel, pizza, tarek
from test.utils import GraphHelper, get_unique_plugin_names
from test.utils.httpfileserver import HTTPFileServer, ProtoFileResource
from typing import Callable, Optional, Set
from urllib.error import HTTPError, URLError

@@ -272,7 +273,9 @@ def test_graph_intersection(make_graph: GraphFactory):
assert (michel, likes, cheese) in g1


- def test_guess_format_for_parse(make_graph: GraphFactory):
+ def test_guess_format_for_parse(
+ make_graph: GraphFactory, http_file_server: HTTPFileServer
+ ):
graph = make_graph()

# files
@@ -329,10 +332,16 @@ def test_guess_format_for_parse(make_graph: GraphFactory):
graph.parse(data=rdf, format="xml")

# URI
file_info = http_file_server.add_file_with_caching(
ProtoFileResource(
(("Content-Type", "text/html; charset=UTF-8"),),
TEST_DATA_DIR / "html5lib_tests1.html",
),
)

# only getting HTML
with pytest.raises(PluginException):
graph.parse(location="https://www.google.com")
graph.parse(location=file_info.request_url)

try:
graph.parse(location="http://www.w3.org/ns/adms.ttl")
6 changes: 3 additions & 3 deletions test/test_graph/test_graph_http.py
@@ -3,11 +3,11 @@
from test.data import TEST_DATA_DIR
from test.utils import GraphHelper
from test.utils.graph import cached_graph
+ from test.utils.http import ctx_http_handler
from test.utils.httpservermock import (
MethodName,
MockHTTPResponse,
ServedBaseHTTPServerMock,
- ctx_http_server,
)
from urllib.error import HTTPError

@@ -106,7 +106,7 @@ def test_content_negotiation(self) -> None:
expected.add((EG.a, EG.b, EG.c))
expected_triples = GraphHelper.triple_set(expected)

- with ctx_http_server(ContentNegotiationHandler) as server:
+ with ctx_http_handler(ContentNegotiationHandler) as server:
(host, port) = server.server_address
if isinstance(host, (bytes, bytearray)):
host = host.decode("utf-8")
@@ -121,7 +121,7 @@ def test_content_negotiation_no_format(self) -> None:
expected.add((EG.a, EG.b, EG.c))
expected_triples = GraphHelper.triple_set(expected)

- with ctx_http_server(ContentNegotiationHandler) as server:
+ with ctx_http_handler(ContentNegotiationHandler) as server:
(host, port) = server.server_address
if isinstance(host, (bytes, bytearray)):
host = host.decode("utf-8")
11 changes: 8 additions & 3 deletions test/test_graph/test_variants.py
@@ -1,3 +1,5 @@
from __future__ import annotations

import json
import logging
import os
@@ -69,6 +71,11 @@ def check(
}
assert set(self.has_subject_iris) == subjects_iris

@classmethod
def from_path(cls, path: Path) -> GraphAsserts:
with path.open("r") as f:
return cls(**json.load(f))


@dataclass(order=True)
class GraphVariants:
@@ -122,9 +129,7 @@ def for_files(
else:
graph_variant = graph_varaint_dict[file_key]
if variant_key.endswith("-asserts.json"):
- graph_variant.asserts = GraphAsserts(
- **json.loads(file_path.read_text())
- )
+ graph_variant.asserts = GraphAsserts.from_path(file_path)
else:
graph_variant.variants[variant_key] = file_path
return graph_varaint_dict
Empty file added test/test_misc/__init__.py
Empty file.
15 changes: 0 additions & 15 deletions test/test_misc/test_create_input_source.py

This file was deleted.
