fix: small InputSource related issues
I have added a bunch of tests for `InputSource` handling, checking
every kind of input source with every parser. While doing so, I found
and fixed the following issues:

- `rdflib.util._iri2uri()` should not URL-quote the `netloc` component; the
  `idna` encoding already takes care of special characters there. I removed
  the URL quoting of `netloc` (see the sketch after this list).

- HexTuple parsing handled the input source in a way that only worked for
  some kinds of input source and did not raise errors for the ones it could
  not handle. I changed the input source handling to be generic.

- `rdflib.parser.create_input_source()` incorrectly used `file.buffer`
  instead of `source.buffer` when dealing with IO stream sources.
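
A hedged illustration of the `netloc` point (not from the commit itself; the
host name and port are made up): the `idna` codec already yields an ASCII-only
authority, while an extra `quote()` pass can actively mangle it, e.g. by
percent-encoding the ":" before a port.

```python
from urllib.parse import quote, urlsplit

netloc = urlsplit("http://almería.example:8080/path").netloc
ace = netloc.encode("idna").decode("utf-8")
print(ace)         # ASCII-only ACE form (xn--... label); ':8080' passes through untouched
print(quote(ace))  # quote() turns ':' into '%3A', mangling the authority component
```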

Other changes with no runtime impact include:

- extracted the logic to calculate the `Accept` HTTP header into a
  separate private function.
- moved the inline function `_urlopen` out into a standalone function.
- changed the HTTP mocking utilities in the tests slightly to accommodate
  serving arbitrary files, which the new `InputSource` tests rely on.
aucampia committed Mar 8, 2023
1 parent a146e0a commit 0fd7567
Showing 22 changed files with 1,224 additions and 184 deletions.
71 changes: 71 additions & 0 deletions rdflib/_uri_handling.py
@@ -0,0 +1,71 @@
from __future__ import annotations

import urllib.request
from typing import TYPE_CHECKING, Optional
from urllib.error import HTTPError

if TYPE_CHECKING:
from urllib.request import Request
from urllib.response import addinfourl


__all__ = ["_get_accept_header", "_urlopen"]


def _urlopen(url: Request) -> addinfourl:
"""
Wrapper around urllib.request.urlopen that handles HTTP 308 redirects.
This is a temporary workaround for https://bugs.python.org/issue40321

:param url: The request to open.
:return: The response, the same kind of object that
:py:func:`urllib.request.urlopen` returns.
"""
try:
return urllib.request.urlopen(url)
except HTTPError as ex:
# 308 (Permanent Redirect) is not supported by current python version(s)
# See https://bugs.python.org/issue40321
# This custom error handling should be removed once all
# supported versions of python support 308.
if ex.code == 308:
url.full_url = ex.headers.get("Location")
return _urlopen(url)
else:
raise


def _get_accept_header(format: Optional[str]) -> str:
"""
Create an Accept header for the given format.
:param format: The format to create an Accept header for.
:return: The Accept header value.
"""
if format == "xml":
return "application/rdf+xml, */*;q=0.1"
elif format == "n3":
return "text/n3, */*;q=0.1"
elif format in ["turtle", "ttl"]:
return "text/turtle, application/x-turtle, */*;q=0.1"
elif format == "nt":
return "text/plain, */*;q=0.1"
elif format == "trig":
return "application/trig, */*;q=0.1"
elif format == "trix":
return "application/trix, */*;q=0.1"
elif format == "json-ld":
return "application/ld+json, application/json;q=0.9, */*;q=0.1"
else:
# if format not given, create an Accept header from all registered
# parser Media Types
from rdflib.parser import Parser
from rdflib.plugin import plugins

acc = []
for p in plugins(kind=Parser): # only get parsers
if "/" in p.name: # all Media Types known have a / in them
acc.append(p.name)

return ", ".join(acc)
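
A quick usage sketch of the extracted helper (it is a private function; this
is purely illustrative):

```python
from rdflib._uri_handling import _get_accept_header

# Known formats map to fixed Accept values from the table above.
print(_get_accept_header("turtle"))
# text/turtle, application/x-turtle, */*;q=0.1

# With no format given, the header is assembled from the names of all
# registered parser plugins that look like media types.
print(_get_accept_header(None))
```
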
65 changes: 20 additions & 45 deletions rdflib/parser.py
@@ -27,13 +27,13 @@
Tuple,
Union,
)
from urllib.error import HTTPError
from urllib.parse import urljoin
from urllib.request import Request, url2pathname, urlopen
from urllib.request import Request, url2pathname
from xml.sax import xmlreader

import rdflib.util
from rdflib import __version__
from rdflib._uri_handling import _get_accept_header, _urlopen
from rdflib.namespace import Namespace
from rdflib.term import URIRef

@@ -236,51 +236,10 @@ def __init__(self, system_id: Optional[str] = None, format: Optional[str] = None

# copy headers to change
myheaders = dict(headers)
if format == "xml":
myheaders["Accept"] = "application/rdf+xml, */*;q=0.1"
elif format == "n3":
myheaders["Accept"] = "text/n3, */*;q=0.1"
elif format in ["turtle", "ttl"]:
myheaders["Accept"] = "text/turtle, application/x-turtle, */*;q=0.1"
elif format == "nt":
myheaders["Accept"] = "text/plain, */*;q=0.1"
elif format == "trig":
myheaders["Accept"] = "application/trig, */*;q=0.1"
elif format == "trix":
myheaders["Accept"] = "application/trix, */*;q=0.1"
elif format == "json-ld":
myheaders[
"Accept"
] = "application/ld+json, application/json;q=0.9, */*;q=0.1"
else:
# if format not given, create an Accept header from all registered
# parser Media Types
from rdflib.parser import Parser
from rdflib.plugin import plugins

acc = []
for p in plugins(kind=Parser): # only get parsers
if "/" in p.name: # all Media Types known have a / in them
acc.append(p.name)

myheaders["Accept"] = ", ".join(acc)
myheaders["Accept"] = _get_accept_header(format)

req = Request(system_id, None, myheaders) # type: ignore[arg-type]

def _urlopen(req: Request) -> Any:
try:
return urlopen(req)
except HTTPError as ex:
# 308 (Permanent Redirect) is not supported by current python version(s)
# See https://bugs.python.org/issue40321
# This custom error handling should be removed once all
# supported versions of python support 308.
if ex.code == 308:
req.full_url = ex.headers.get("Location")
return _urlopen(req)
else:
raise

response: addinfourl = _urlopen(req)
self.url = response.geturl() # in case redirections took place
self.links = self.get_links(response)
@@ -363,6 +322,10 @@ def create_input_source(
input_source = None

if source is not None:
if TYPE_CHECKING:
assert file is None
assert data is None
assert location is None
if isinstance(source, InputSource):
input_source = source
else:
@@ -379,7 +342,7 @@
input_source.setCharacterStream(source)
input_source.setEncoding(source.encoding)
try:
b = file.buffer # type: ignore[union-attr]
b = source.buffer # type: ignore[union-attr]
input_source.setByteStream(b)
except (AttributeError, LookupError):
input_source.setByteStream(source)
@@ -399,6 +362,10 @@
auto_close = False # make sure we close all file handles we open

if location is not None:
if TYPE_CHECKING:
assert file is None
assert data is None
assert source is None
(
absolute_location,
auto_close,
@@ -412,9 +379,17 @@
)

if file is not None:
if TYPE_CHECKING:
assert location is None
assert data is None
assert source is None
input_source = FileInputSource(file)

if data is not None:
if TYPE_CHECKING:
assert location is None
assert file is None
assert source is None
if isinstance(data, dict):
input_source = PythonInputSource(data)
auto_close = True
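A hedged sketch of what the `source.buffer` fix means in practice. It assumes
a local `example.ttl` file exists, and the assertions reflect my reading of
the branch above rather than a documented API contract: for a text-mode
stream, the character stream is the stream itself and the byte stream is its
underlying `.buffer`.

```python
from rdflib.parser import create_input_source

with open("example.ttl", encoding="utf-8") as text_stream:
    src = create_input_source(source=text_stream, format="turtle")
    assert src.getCharacterStream() is text_stream
    assert src.getByteStream() is text_stream.buffer
```
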
36 changes: 18 additions & 18 deletions rdflib/plugins/parsers/hext.py
@@ -7,10 +7,11 @@

import json
import warnings
from typing import TYPE_CHECKING, Any, List, Optional, Union
from io import TextIOWrapper
from typing import Any, BinaryIO, List, Optional, TextIO, Union

from rdflib.graph import ConjunctiveGraph, Graph
from rdflib.parser import FileInputSource, InputSource, Parser
from rdflib.parser import InputSource, Parser
from rdflib.term import BNode, Literal, URIRef

__all__ = ["HextuplesParser"]
@@ -92,19 +93,18 @@ def parse(self, source: InputSource, graph: Graph, **kwargs: Any) -> None: # ty
cg = ConjunctiveGraph(store=graph.store, identifier=graph.identifier)
cg.default_context = graph

# handle different source types - only file and string (data) for now
if hasattr(source, "file"):
if TYPE_CHECKING:
assert isinstance(source, FileInputSource)
# type error: Item "TextIOBase" of "Union[BinaryIO, TextIO, TextIOBase, RawIOBase, BufferedIOBase]" has no attribute "name"
# type error: Item "RawIOBase" of "Union[BinaryIO, TextIO, TextIOBase, RawIOBase, BufferedIOBase]" has no attribute "name"
# type error: Item "BufferedIOBase" of "Union[BinaryIO, TextIO, TextIOBase, RawIOBase, BufferedIOBase]" has no attribute "name"
with open(source.file.name, encoding="utf-8") as fp: # type: ignore[union-attr]
for l in fp: # noqa: E741
self._parse_hextuple(cg, self._load_json_line(l))
elif hasattr(source, "_InputSource__bytefile"):
if hasattr(source._InputSource__bytefile, "wrapped"):
for (
l # noqa: E741
) in source._InputSource__bytefile.wrapped.strip().splitlines():
self._parse_hextuple(cg, self._load_json_line(l))
text_stream: Optional[TextIO] = source.getCharacterStream()
if text_stream is None:
binary_stream: Optional[BinaryIO] = source.getByteStream()
if binary_stream is None:
raise ValueError(f"Unsupported source type: {type(source)}")
else:
text_stream = TextIOWrapper(binary_stream, encoding="utf-8")

for line in text_stream:
if len(line) == 0 or line.isspace():
# Skipping empty lines because this is what was being done before for the first and last lines, albeit in an rather indirect way.
# The result is that we accept input that would otherwise be invalid.
# Possibly we should just let this result in an error.
continue
self._parse_hextuple(cg, self._load_json_line(line))
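
The same pattern, pulled out as a standalone sketch that relies only on the
public `InputSource` accessors used above (the Hextuples line is just an
illustrative value):

```python
from io import TextIOWrapper

from rdflib.parser import create_input_source

source = create_input_source(
    data='["http://example.org/s", "http://example.org/p", "http://example.org/o", "globalId", "", ""]'
)
stream = source.getCharacterStream()
if stream is None:
    # Fall back to the byte stream, decoded as UTF-8, as the parser now does.
    stream = TextIOWrapper(source.getByteStream(), encoding="utf-8")
for line in stream:
    print(line, end="")
```
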
3 changes: 2 additions & 1 deletion rdflib/util.py
@@ -518,6 +518,7 @@ def _iri2uri(iri: str) -> str:
>>> _iri2uri("https://dbpedia.org/resource/Almería")
'https://dbpedia.org/resource/Almer%C3%ADa'
"""
# https://datatracker.ietf.org/doc/html/rfc3305

(scheme, netloc, path, query, fragment) = urlsplit(iri)

@@ -526,7 +527,7 @@
return iri

scheme = quote(scheme)
netloc = quote(netloc.encode("idna").decode("utf-8"))
netloc = netloc.encode("idna").decode("utf-8")
path = quote(path)
query = quote(query)
fragment = quote(fragment)
22 changes: 18 additions & 4 deletions test/conftest.py
@@ -2,6 +2,8 @@

pytest.register_assert_rewrite("test.utils")

from test.utils.http import ctx_http_server # noqa: E402
from test.utils.httpfileserver import HTTPFileServer # noqa: E402
from typing import Generator # noqa: E402

from rdflib import Graph
@@ -16,20 +18,32 @@
# readability.


@pytest.fixture(scope="session")
def http_file_server() -> Generator[HTTPFileServer, None, None]:
host = "127.0.0.1"
server = HTTPFileServer((host, 0))
with ctx_http_server(server) as served:
yield served


@pytest.fixture(scope="session")
def rdfs_graph() -> Graph:
return Graph().parse(TEST_DATA_DIR / "defined_namespaces/rdfs.ttl", format="turtle")


@pytest.fixture(scope="session")
def session_httpmock() -> Generator[ServedBaseHTTPServerMock, None, None]:
def _session_function_httpmock() -> Generator[ServedBaseHTTPServerMock, None, None]:
"""
This fixture is session scoped, but it is reset for each function in
:func:`function_httpmock`. This should not be used directly.
"""
with ServedBaseHTTPServerMock() as httpmock:
yield httpmock


@pytest.fixture(scope="function")
def function_httpmock(
session_httpmock: ServedBaseHTTPServerMock,
_session_function_httpmock: ServedBaseHTTPServerMock,
) -> Generator[ServedBaseHTTPServerMock, None, None]:
session_httpmock.reset()
yield session_httpmock
_session_function_httpmock.reset()
yield _session_function_httpmock
20 changes: 20 additions & 0 deletions test/data/variants/diverse_triples.xml
@@ -0,0 +1,20 @@
<rdf:RDF
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns:eghttp="http://example.com/"
xmlns:egurn="urn:example:"
xmlns:egschema="example:"
xmlns:xsd="http://www.w3.org/2001/XMLSchema#" >
<rdf:Description rdf:about="example:object">
<eghttp:predicate>XSD string</eghttp:predicate>
</rdf:Description>
<rdf:Description rdf:about="http://example.com/subject">
<eghttp:predicate xml:lang="jpx">日本語の表記体系</eghttp:predicate>
</rdf:Description>
<rdf:Description rdf:about="urn:example:subject">
<egschema:predicate rdf:resource="example:subject"/>
</rdf:Description>
<rdf:Description rdf:about="example:subject">
<egschema:predicate rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">12</egschema:predicate>
<egschema:predicate rdf:resource="example:object"/>
</rdf:Description>
</rdf:RDF>
6 changes: 6 additions & 0 deletions test/data/variants/simple_triple.jsonld
@@ -0,0 +1,6 @@
{
"@id": "http://example.org/subject",
"http://example.org/predicate": {
"@id": "http://example.org/object"
}
}
2 changes: 2 additions & 0 deletions test/data/variants/simple_triple.ttl
@@ -0,0 +1,2 @@
<http://example.org/subject>
<http://example.org/predicate> <http://example.org/object> .
7 changes: 7 additions & 0 deletions test/data/variants/simple_triple.xml
@@ -0,0 +1,7 @@
<rdf:RDF
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns:j.0="http://example.org/" >
<rdf:Description rdf:about="http://example.org/subject">
<j.0:predicate rdf:resource="http://example.org/object"/>
</rdf:Description>
</rdf:RDF>
4 changes: 4 additions & 0 deletions test/jsonld/__init__.py
@@ -1,6 +1,10 @@
from typing import List

from rdflib import parser, plugin, serializer

assert plugin
assert serializer
assert parser
import json

__all__: List[str] = []
6 changes: 3 additions & 3 deletions test/test_graph/test_graph_http.py
@@ -3,11 +3,11 @@
from test.data import TEST_DATA_DIR
from test.utils import GraphHelper
from test.utils.graph import cached_graph
from test.utils.http import ctx_http_handler
from test.utils.httpservermock import (
MethodName,
MockHTTPResponse,
ServedBaseHTTPServerMock,
ctx_http_server,
)
from urllib.error import HTTPError

@@ -106,7 +106,7 @@ def test_content_negotiation(self) -> None:
expected.add((EG.a, EG.b, EG.c))
expected_triples = GraphHelper.triple_set(expected)

with ctx_http_server(ContentNegotiationHandler) as server:
with ctx_http_handler(ContentNegotiationHandler) as server:
(host, port) = server.server_address
if isinstance(host, (bytes, bytearray)):
host = host.decode("utf-8")
@@ -121,7 +121,7 @@ def test_content_negotiation_no_format(self) -> None:
expected.add((EG.a, EG.b, EG.c))
expected_triples = GraphHelper.triple_set(expected)

with ctx_http_server(ContentNegotiationHandler) as server:
with ctx_http_handler(ContentNegotiationHandler) as server:
(host, port) = server.server_address
if isinstance(host, (bytes, bytearray)):
host = host.decode("utf-8")