fix: small InputSource related issues
I have added a bunch of tests for `InputSource` handling, checking
every kind of input source with every parser. While doing so, I found
and fixed the following issues:

- `rdflib.util._iri2uri()` should not URL-quote the `netloc` component; the
  `idna` encoding already takes care of special characters there. I removed
  the URL quoting of `netloc` (see the sketch after this list).

- HexTuple parsing handled the input source in a way that only worked for
  some kinds of input source and did not raise errors for the ones it could
  not handle. I changed the input source handling to be generic.

- `rdflib.parser.create_input_source()` incorrectly used `file.buffer`
  instead of `source.buffer` when dealing with IO stream sources.
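
A hedged illustration of the `netloc` point (not from the commit itself; the
host name and port are made up): the `idna` codec already yields an ASCII-only
authority, while an extra `quote()` pass can actively mangle it, e.g. by
percent-encoding the ":" before a port.

```python
from urllib.parse import quote, urlsplit

netloc = urlsplit("http://almería.example:8080/path").netloc
ace = netloc.encode("idna").decode("utf-8")
print(ace)         # ASCII-only ACE form (xn--... label); ':8080' passes through untouched
print(quote(ace))  # quote() turns ':' into '%3A', mangling the authority component
```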

Other changes with no runtime impact include:

- extracted the logic to calculate the `Accept` HTTP header into a
  separate private function.
- moved the inline function `_urlopen` out into a standalone function.
- changed the HTTP mocking utilities in the tests slightly to accommodate
  serving arbitrary files, which the new `InputSource` tests rely on.
aucampia committed Mar 8, 2023
1 parent a146e0a commit 0fd7567
Showing 22 changed files with 1,224 additions and 184 deletions.
71 changes: 71 additions & 0 deletions rdflib/_uri_handling.py
@@ -0,0 +1,71 @@
from __future__ import annotations

import urllib.request
from typing import TYPE_CHECKING, Optional
from urllib.error import HTTPError

if TYPE_CHECKING:
from urllib.request import Request
from urllib.response import addinfourl


__all__ = ["_get_accept_header", "_urlopen"]


def _urlopen(url: Request) -> addinfourl:
"""
Wrapper around urllib.request.urlopen that handles HTTP 308 redirects.
This is a temporary workaround for https://bugs.python.org/issue40321

:param url: The request to open.
:return: The response, the same kind of object that
:py:func:`urllib.request.urlopen` returns.
"""
try:
return urllib.request.urlopen(url)
except HTTPError as ex:
# 308 (Permanent Redirect) is not supported by current python version(s)
# See https://bugs.python.org/issue40321
# This custom error handling should be removed once all
# supported versions of python support 308.
if ex.code == 308:
url.full_url = ex.headers.get("Location")
return _urlopen(url)
else:
raise


def _get_accept_header(format: Optional[str]) -> str:
"""
Create an Accept header for the given format.
:param format: The format to create an Accept header for.
:return: The Accept header value.
"""
if format == "xml":
return "application/rdf+xml, */*;q=0.1"
elif format == "n3":
return "text/n3, */*;q=0.1"
elif format in ["turtle", "ttl"]:
return "text/turtle, application/x-turtle, */*;q=0.1"
elif format == "nt":
return "text/plain, */*;q=0.1"
elif format == "trig":
return "application/trig, */*;q=0.1"
elif format == "trix":
return "application/trix, */*;q=0.1"
elif format == "json-ld":
return "application/ld+json, application/json;q=0.9, */*;q=0.1"
else:
# if format not given, create an Accept header from all registered
# parser Media Types
from rdflib.parser import Parser
from rdflib.plugin import plugins

acc = []
for p in plugins(kind=Parser): # only get parsers
if "/" in p.name: # all Media Types known have a / in them
acc.append(p.name)

return ", ".join(acc)
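
A quick usage sketch of the extracted helper (it is a private function; this
is purely illustrative):

```python
from rdflib._uri_handling import _get_accept_header

# Known formats map to fixed Accept values from the table above.
print(_get_accept_header("turtle"))
# text/turtle, application/x-turtle, */*;q=0.1

# With no format given, the header is assembled from the names of all
# registered parser plugins that look like media types.
print(_get_accept_header(None))
```
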
65 changes: 20 additions & 45 deletions rdflib/parser.py
@@ -27,13 +27,13 @@
Tuple,
Union,
)
from urllib.error import HTTPError
from urllib.parse import urljoin
from urllib.request import Request, url2pathname, urlopen
from urllib.request import Request, url2pathname
from xml.sax import xmlreader

import rdflib.util
from rdflib import __version__
from rdflib._uri_handling import _get_accept_header, _urlopen
from rdflib.namespace import Namespace
from rdflib.term import URIRef

@@ -236,51 +236,10 @@ def __init__(self, system_id: Optional[str] = None, format: Optional[str] = None

# copy headers to change
myheaders = dict(headers)
if format == "xml":
myheaders["Accept"] = "application/rdf+xml, */*;q=0.1"
elif format == "n3":
myheaders["Accept"] = "text/n3, */*;q=0.1"
elif format in ["turtle", "ttl"]:
myheaders["Accept"] = "text/turtle, application/x-turtle, */*;q=0.1"
elif format == "nt":
myheaders["Accept"] = "text/plain, */*;q=0.1"
elif format == "trig":
myheaders["Accept"] = "application/trig, */*;q=0.1"
elif format == "trix":
myheaders["Accept"] = "application/trix, */*;q=0.1"
elif format == "json-ld":
myheaders[
"Accept"
] = "application/ld+json, application/json;q=0.9, */*;q=0.1"
else:
# if format not given, create an Accept header from all registered
# parser Media Types
from rdflib.parser import Parser
from rdflib.plugin import plugins

acc = []
for p in plugins(kind=Parser): # only get parsers
if "/" in p.name: # all Media Types known have a / in them
acc.append(p.name)

myheaders["Accept"] = ", ".join(acc)
myheaders["Accept"] = _get_accept_header(format)

req = Request(system_id, None, myheaders) # type: ignore[arg-type]

def _urlopen(req: Request) -> Any:
try:
return urlopen(req)
except HTTPError as ex:
# 308 (Permanent Redirect) is not supported by current python version(s)
# See https://bugs.python.org/issue40321
# This custom error handling should be removed once all
# supported versions of python support 308.
if ex.code == 308:
req.full_url = ex.headers.get("Location")
return _urlopen(req)
else:
raise

response: addinfourl = _urlopen(req)
self.url = response.geturl() # in case redirections took place
self.links = self.get_links(response)
@@ -363,6 +322,10 @@ def create_input_source(
input_source = None

if source is not None:
if TYPE_CHECKING:
assert file is None
assert data is None
assert location is None
if isinstance(source, InputSource):
input_source = source
else:
@@ -379,7 +342,7 @@
input_source.setCharacterStream(source)
input_source.setEncoding(source.encoding)
try:
b = file.buffer # type: ignore[union-attr]
b = source.buffer # type: ignore[union-attr]
input_source.setByteStream(b)
except (AttributeError, LookupError):
input_source.setByteStream(source)
@@ -399,6 +362,10 @@
auto_close = False # make sure we close all file handles we open

if location is not None:
if TYPE_CHECKING:
assert file is None
assert data is None
assert source is None
(
absolute_location,
auto_close,
@@ -412,9 +379,17 @@
)

if file is not None:
if TYPE_CHECKING:
assert location is None
assert data is None
assert source is None
input_source = FileInputSource(file)

if data is not None:
if TYPE_CHECKING:
assert location is None
assert file is None
assert source is None
if isinstance(data, dict):
input_source = PythonInputSource(data)
auto_close = True
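A hedged sketch of what the `source.buffer` fix means in practice. It assumes
a local `example.ttl` file exists, and the assertions reflect my reading of
the branch above rather than a documented API contract: for a text-mode
stream, the character stream is the stream itself and the byte stream is its
underlying `.buffer`.

```python
from rdflib.parser import create_input_source

with open("example.ttl", encoding="utf-8") as text_stream:
    src = create_input_source(source=text_stream, format="turtle")
    assert src.getCharacterStream() is text_stream
    assert src.getByteStream() is text_stream.buffer
```
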
36 changes: 18 additions & 18 deletions rdflib/plugins/parsers/hext.py
@@ -7,10 +7,11 @@

import json
import warnings
from typing import TYPE_CHECKING, Any, List, Optional, Union
from io import TextIOWrapper
from typing import Any, BinaryIO, List, Optional, TextIO, Union

from rdflib.graph import ConjunctiveGraph, Graph
from rdflib.parser import FileInputSource, InputSource, Parser
from rdflib.parser import InputSource, Parser
from rdflib.term import BNode, Literal, URIRef

__all__ = ["HextuplesParser"]
@@ -92,19 +93,18 @@ def parse(self, source: InputSource, graph: Graph, **kwargs: Any) -> None: # ty
cg = ConjunctiveGraph(store=graph.store, identifier=graph.identifier)
cg.default_context = graph

# handle different source types - only file and string (data) for now
if hasattr(source, "file"):
if TYPE_CHECKING:
assert isinstance(source, FileInputSource)
# type error: Item "TextIOBase" of "Union[BinaryIO, TextIO, TextIOBase, RawIOBase, BufferedIOBase]" has no attribute "name"
# type error: Item "RawIOBase" of "Union[BinaryIO, TextIO, TextIOBase, RawIOBase, BufferedIOBase]" has no attribute "name"
# type error: Item "BufferedIOBase" of "Union[BinaryIO, TextIO, TextIOBase, RawIOBase, BufferedIOBase]" has no attribute "name"
with open(source.file.name, encoding="utf-8") as fp: # type: ignore[union-attr]
for l in fp: # noqa: E741
self._parse_hextuple(cg, self._load_json_line(l))
elif hasattr(source, "_InputSource__bytefile"):
if hasattr(source._InputSource__bytefile, "wrapped"):
for (
l # noqa: E741
) in source._InputSource__bytefile.wrapped.strip().splitlines():
self._parse_hextuple(cg, self._load_json_line(l))
text_stream: Optional[TextIO] = source.getCharacterStream()
if text_stream is None:
binary_stream: Optional[BinaryIO] = source.getByteStream()
if binary_stream is None:
raise ValueError(f"Unsupported source type: {type(source)}")
else:
text_stream = TextIOWrapper(binary_stream, encoding="utf-8")

for line in text_stream:
if len(line) == 0 or line.isspace():
# Skipping empty lines because this is what was being done before for the first and last lines, albeit in an rather indirect way.
# The result is that we accept input that would otherwise be invalid.
# Possibly we should just let this result in an error.
continue
self._parse_hextuple(cg, self._load_json_line(line))
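
The same pattern, pulled out as a standalone sketch that relies only on the
public `InputSource` accessors used above (the Hextuples line is just an
illustrative value):

```python
from io import TextIOWrapper

from rdflib.parser import create_input_source

source = create_input_source(
    data='["http://example.org/s", "http://example.org/p", "http://example.org/o", "globalId", "", ""]'
)
stream = source.getCharacterStream()
if stream is None:
    # Fall back to the byte stream, decoded as UTF-8, as the parser now does.
    stream = TextIOWrapper(source.getByteStream(), encoding="utf-8")
for line in stream:
    print(line, end="")
```
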
3 changes: 2 additions & 1 deletion rdflib/util.py
@@ -518,6 +518,7 @@ def _iri2uri(iri: str) -> str:
>>> _iri2uri("https://dbpedia.org/resource/Almería")
'https://dbpedia.org/resource/Almer%C3%ADa'
"""
# https://datatracker.ietf.org/doc/html/rfc3305

(scheme, netloc, path, query, fragment) = urlsplit(iri)

@@ -526,7 +527,7 @@
return iri

scheme = quote(scheme)
netloc = quote(netloc.encode("idna").decode("utf-8"))
netloc = netloc.encode("idna").decode("utf-8")
path = quote(path)
query = quote(query)
fragment = quote(fragment)
22 changes: 18 additions & 4 deletions test/conftest.py
@@ -2,6 +2,8 @@

pytest.register_assert_rewrite("test.utils")

from test.utils.http import ctx_http_server # noqa: E402
from test.utils.httpfileserver import HTTPFileServer # noqa: E402
from typing import Generator # noqa: E402

from rdflib import Graph
@@ -16,20 +18,32 @@
# readability.


@pytest.fixture(scope="session")
def http_file_server() -> Generator[HTTPFileServer, None, None]:
host = "127.0.0.1"
server = HTTPFileServer((host, 0))
with ctx_http_server(server) as served:
yield served


@pytest.fixture(scope="session")
def rdfs_graph() -> Graph:
return Graph().parse(TEST_DATA_DIR / "defined_namespaces/rdfs.ttl", format="turtle")


@pytest.fixture(scope="session")
def session_httpmock() -> Generator[ServedBaseHTTPServerMock, None, None]:
def _session_function_httpmock() -> Generator[ServedBaseHTTPServerMock, None, None]:
"""
This fixture is session scoped, but it is reset for each function in
:func:`function_httpmock`. This should not be used directly.
"""
with ServedBaseHTTPServerMock() as httpmock:
yield httpmock


@pytest.fixture(scope="function")
def function_httpmock(
session_httpmock: ServedBaseHTTPServerMock,
_session_function_httpmock: ServedBaseHTTPServerMock,
) -> Generator[ServedBaseHTTPServerMock, None, None]:
session_httpmock.reset()
yield session_httpmock
_session_function_httpmock.reset()
yield _session_function_httpmock
20 changes: 20 additions & 0 deletions test/data/variants/diverse_triples.xml
@@ -0,0 +1,20 @@
<rdf:RDF
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns:eghttp="http://example.com/"
xmlns:egurn="urn:example:"
xmlns:egschema="example:"
xmlns:xsd="http://www.w3.org/2001/XMLSchema#" >
<rdf:Description rdf:about="example:object">
<eghttp:predicate>XSD string</eghttp:predicate>
</rdf:Description>
<rdf:Description rdf:about="http://example.com/subject">
<eghttp:predicate xml:lang="jpx">日本語の表記体系</eghttp:predicate>
</rdf:Description>
<rdf:Description rdf:about="urn:example:subject">
<egschema:predicate rdf:resource="example:subject"/>
</rdf:Description>
<rdf:Description rdf:about="example:subject">
<egschema:predicate rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">12</egschema:predicate>
<egschema:predicate rdf:resource="example:object"/>
</rdf:Description>
</rdf:RDF>
6 changes: 6 additions & 0 deletions test/data/variants/simple_triple.jsonld
@@ -0,0 +1,6 @@
{
"@id": "http://example.org/subject",
"http://example.org/predicate": {
"@id": "http://example.org/object"
}
}
2 changes: 2 additions & 0 deletions test/data/variants/simple_triple.ttl
@@ -0,0 +1,2 @@
<http://example.org/subject>
<http://example.org/predicate> <http://example.org/object> .
7 changes: 7 additions & 0 deletions test/data/variants/simple_triple.xml
@@ -0,0 +1,7 @@
<rdf:RDF
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns:j.0="http://example.org/" >
<rdf:Description rdf:about="http://example.org/subject">
<j.0:predicate rdf:resource="http://example.org/object"/>
</rdf:Description>
</rdf:RDF>
4 changes: 4 additions & 0 deletions test/jsonld/__init__.py
@@ -1,6 +1,10 @@
from typing import List

from rdflib import parser, plugin, serializer

assert plugin
assert serializer
assert parser
import json

__all__: List[str] = []
6 changes: 3 additions & 3 deletions test/test_graph/test_graph_http.py
@@ -3,11 +3,11 @@
from test.data import TEST_DATA_DIR
from test.utils import GraphHelper
from test.utils.graph import cached_graph
from test.utils.http import ctx_http_handler
from test.utils.httpservermock import (
MethodName,
MockHTTPResponse,
ServedBaseHTTPServerMock,
ctx_http_server,
)
from urllib.error import HTTPError

@@ -106,7 +106,7 @@ def test_content_negotiation(self) -> None:
expected.add((EG.a, EG.b, EG.c))
expected_triples = GraphHelper.triple_set(expected)

with ctx_http_server(ContentNegotiationHandler) as server:
with ctx_http_handler(ContentNegotiationHandler) as server:
(host, port) = server.server_address
if isinstance(host, (bytes, bytearray)):
host = host.decode("utf-8")
@@ -121,7 +121,7 @@ def test_content_negotiation_no_format(self) -> None:
expected.add((EG.a, EG.b, EG.c))
expected_triples = GraphHelper.triple_set(expected)

with ctx_http_server(ContentNegotiationHandler) as server:
with ctx_http_handler(ContentNegotiationHandler) as server:
(host, port) = server.server_address
if isinstance(host, (bytes, bytearray)):
host = host.decode("utf-8")