fix: small InputSource related issues (#2255)
I have added a bunch of tests for `InputSource` handling, checking
most kinds of input source against most parsers. While doing this, I
found and fixed the following issues:

- `rdflib.util._iri2uri()` was URL-quoting the `netloc` component; this is
  wrong, as the `idna` encoding already takes care of special characters.
  I removed the URL quoting of `netloc` (see the sketch after this list).

- HexTuples parsing was handling the input source in a way that only
  worked for some kinds of input source, without raising errors for the
  rest. I changed the input source handling to be more generic.

- `rdflib.parser.create_input_source()` incorrectly used `file.buffer`
  instead of `source.buffer` when dealing with IO stream sources.
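
A minimal sketch of the `netloc` point (the host and port here are made up;
the outputs are what I'd expect from Python's `idna` codec and
`urllib.parse.quote`, not rdflib's code):

```python
from urllib.parse import quote

host, port = "bücher.example", "8080"

# The idna codec already produces plain ASCII, so nothing here needs
# percent-encoding afterwards.
netloc = host.encode("idna").decode("ascii") + ":" + port
print(netloc)         # xn--bcher-kva.example:8080

# Percent-quoting on top of that corrupts the authority, because quote()
# also escapes the ":" between host and port (and "@" before userinfo).
print(quote(netloc))  # xn--bcher-kva.example%3A8080
```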

Other changes with no runtime impact include:

- Changed the HTTP mocking in tests slightly to accommodate serving
  arbitrary files, as the `InputSource` tests need this.
- Don't use Google in tests, as we keep getting
  `urllib.error.HTTPError: HTTP Error 429: Too Many Requests`
  from it.
aucampia committed Mar 10, 2023
1 parent a146e0a commit 2b98507
Showing 23 changed files with 1,193 additions and 141 deletions.
18 changes: 17 additions & 1 deletion rdflib/parser.py
@@ -363,6 +363,10 @@ def create_input_source(
input_source = None

if source is not None:
if TYPE_CHECKING:
assert file is None
assert data is None
assert location is None
if isinstance(source, InputSource):
input_source = source
else:
@@ -379,7 +383,7 @@
input_source.setCharacterStream(source)
input_source.setEncoding(source.encoding)
try:
- b = file.buffer # type: ignore[union-attr]
+ b = source.buffer # type: ignore[union-attr]
input_source.setByteStream(b)
except (AttributeError, LookupError):
input_source.setByteStream(source)
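
For context on the `source.buffer` fix above: the old code read `file.buffer`,
but when only `source` is given `file` is `None`, so this raised
`AttributeError` and the fallback would quietly set the text stream as the
byte stream. `.buffer` only exists on text streams that wrap a binary stream;
a small illustration (not rdflib code):

```python
import io
import sys

# Text streams that wrap a binary stream expose the raw bytes via .buffer ...
print(hasattr(sys.stdin, "buffer"))            # usually True (TextIOWrapper)
# ... but purely in-memory text streams do not, hence the fallback branch.
print(hasattr(io.StringIO("spam"), "buffer"))  # False
```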
@@ -399,6 +403,10 @@
auto_close = False # make sure we close all file handles we open

if location is not None:
if TYPE_CHECKING:
assert file is None
assert data is None
assert source is None
(
absolute_location,
auto_close,
@@ -412,9 +420,17 @@
)

if file is not None:
if TYPE_CHECKING:
assert location is None
assert data is None
assert source is None
input_source = FileInputSource(file)

if data is not None:
if TYPE_CHECKING:
assert location is None
assert file is None
assert source is None
if isinstance(data, dict):
input_source = PythonInputSource(data)
auto_close = True
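
The `if TYPE_CHECKING:` asserts added above have no runtime cost; they only
record, for static type checkers, that the keyword arguments are mutually
exclusive on each branch. A generic sketch of the pattern (the function and
argument names are illustrative, not rdflib's API):

```python
from typing import TYPE_CHECKING, Optional


def choose(primary: Optional[str] = None, fallback: Optional[str] = None) -> str:
    # Callers are expected to pass exactly one of the two arguments.
    if primary is not None:
        if TYPE_CHECKING:
            # Never executed at runtime; it only tells the type checker that
            # `fallback` is out of play on this branch.
            assert fallback is None
        return primary
    if fallback is not None:
        return fallback
    raise ValueError("either primary or fallback must be given")
```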
37 changes: 19 additions & 18 deletions rdflib/plugins/parsers/hext.py
@@ -7,10 +7,11 @@

import json
import warnings
- from typing import TYPE_CHECKING, Any, List, Optional, Union
+ from io import TextIOWrapper
+ from typing import Any, BinaryIO, List, Optional, TextIO, Union

from rdflib.graph import ConjunctiveGraph, Graph
- from rdflib.parser import FileInputSource, InputSource, Parser
+ from rdflib.parser import InputSource, Parser
from rdflib.term import BNode, Literal, URIRef

__all__ = ["HextuplesParser"]
@@ -92,19 +93,19 @@ def parse(self, source: InputSource, graph: Graph, **kwargs: Any) -> None: # ty
cg = ConjunctiveGraph(store=graph.store, identifier=graph.identifier)
cg.default_context = graph

- # handle different source types - only file and string (data) for now
- if hasattr(source, "file"):
- if TYPE_CHECKING:
- assert isinstance(source, FileInputSource)
- # type error: Item "TextIOBase" of "Union[BinaryIO, TextIO, TextIOBase, RawIOBase, BufferedIOBase]" has no attribute "name"
- # type error: Item "RawIOBase" of "Union[BinaryIO, TextIO, TextIOBase, RawIOBase, BufferedIOBase]" has no attribute "name"
- # type error: Item "BufferedIOBase" of "Union[BinaryIO, TextIO, TextIOBase, RawIOBase, BufferedIOBase]" has no attribute "name"
- with open(source.file.name, encoding="utf-8") as fp: # type: ignore[union-attr]
- for l in fp: # noqa: E741
- self._parse_hextuple(cg, self._load_json_line(l))
- elif hasattr(source, "_InputSource__bytefile"):
- if hasattr(source._InputSource__bytefile, "wrapped"):
- for (
- l # noqa: E741
- ) in source._InputSource__bytefile.wrapped.strip().splitlines():
- self._parse_hextuple(cg, self._load_json_line(l))
+ text_stream: Optional[TextIO] = source.getCharacterStream()
+ if text_stream is None:
+ binary_stream: Optional[BinaryIO] = source.getByteStream()
+ if binary_stream is None:
+ raise ValueError(
+ f"Source does not have a character stream or a byte stream and cannot be used {type(source)}"
+ )
+ text_stream = TextIOWrapper(binary_stream, encoding="utf-8")
+
+ for line in text_stream:
+ if len(line) == 0 or line.isspace():
+ # Skipping empty lines because this is what was being done before for the first and last lines, albeit in an rather indirect way.
+ # The result is that we accept input that would otherwise be invalid.
+ # Possibly we should just let this result in an error.
+ continue
+ self._parse_hextuple(cg, self._load_json_line(line))
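
With the parser now reading from the source's character stream, or from its
byte stream wrapped in a `TextIOWrapper`, the usual ways of feeding `parse()`
should all hit the same code path. A rough usage sketch (the IRIs are made up,
and the exact Hextuples line format shown is from memory, so treat it as
illustrative):

```python
import io

from rdflib import Dataset

line = (
    '["http://example.com/subject", "http://example.com/predicate", '
    '"http://example.com/object", "globalId", "", ""]\n'
)

ds = Dataset()
# A plain string is exposed to the parser as a character stream ...
ds.parse(data=line, format="hext")
# ... while a binary file-like object is exposed as a byte stream, which the
# parser now wraps in a TextIOWrapper itself.
ds.parse(source=io.BytesIO(line.encode("utf-8")), format="hext")
```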
3 changes: 2 additions & 1 deletion rdflib/util.py
@@ -518,6 +518,7 @@ def _iri2uri(iri: str) -> str:
>>> _iri2uri("https://dbpedia.org/resource/Almería")
'https://dbpedia.org/resource/Almer%C3%ADa'
"""
# https://datatracker.ietf.org/doc/html/rfc3305

(scheme, netloc, path, query, fragment) = urlsplit(iri)

@@ -526,7 +527,7 @@
return iri

scheme = quote(scheme)
- netloc = quote(netloc.encode("idna").decode("utf-8"))
+ netloc = netloc.encode("idna").decode("utf-8")
path = quote(path)
query = quote(query)
fragment = quote(fragment)
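
Assuming the rest of `_iri2uri()` stays as shown, an IRI with a non-ASCII host
should now come out with an IDNA-encoded authority and a percent-encoded path,
roughly like this (the host is made up and the output is the expected result,
not a captured one):

```python
from rdflib.util import _iri2uri

print(_iri2uri("https://bücher.example/Almería"))
# expected: https://xn--bcher-kva.example/Almer%C3%ADa
```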
22 changes: 18 additions & 4 deletions test/conftest.py
@@ -2,6 +2,8 @@

pytest.register_assert_rewrite("test.utils")

from test.utils.http import ctx_http_server # noqa: E402
from test.utils.httpfileserver import HTTPFileServer # noqa: E402
from typing import Generator # noqa: E402

from rdflib import Graph
@@ -16,20 +18,32 @@
# readibility.


@pytest.fixture(scope="session")
def http_file_server() -> Generator[HTTPFileServer, None, None]:
host = "127.0.0.1"
server = HTTPFileServer((host, 0))
with ctx_http_server(server) as served:
yield served


@pytest.fixture(scope="session")
def rdfs_graph() -> Graph:
return Graph().parse(TEST_DATA_DIR / "defined_namespaces/rdfs.ttl", format="turtle")


@pytest.fixture(scope="session")
- def session_httpmock() -> Generator[ServedBaseHTTPServerMock, None, None]:
+ def _session_function_httpmock() -> Generator[ServedBaseHTTPServerMock, None, None]:
"""
This fixture is session scoped, but it is reset for each function in
:func:`function_httpmock`. This should not be used directly.
"""
with ServedBaseHTTPServerMock() as httpmock:
yield httpmock


@pytest.fixture(scope="function")
def function_httpmock(
- session_httpmock: ServedBaseHTTPServerMock,
+ _session_function_httpmock: ServedBaseHTTPServerMock,
) -> Generator[ServedBaseHTTPServerMock, None, None]:
- session_httpmock.reset()
- yield session_httpmock
+ _session_function_httpmock.reset()
+ yield _session_function_httpmock
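
A hypothetical test using the new session-scoped `http_file_server` fixture;
the `ProtoFileResource` header tuple and the `add_file_with_caching()` call
mirror the usage in `test_graph.py` further down, and the served file is one
of the new data variants:

```python
from test.data import TEST_DATA_DIR
from test.utils.httpfileserver import HTTPFileServer, ProtoFileResource

from rdflib import Graph


def test_parse_served_turtle(http_file_server: HTTPFileServer) -> None:
    file_info = http_file_server.add_file_with_caching(
        ProtoFileResource(
            (("Content-Type", "text/turtle"),),
            TEST_DATA_DIR / "variants/simple_triple.ttl",
        ),
    )
    # rdflib should pick the Turtle parser from the Content-Type header.
    graph = Graph().parse(location=file_info.request_url)
    assert len(graph) == 1
```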
6 changes: 6 additions & 0 deletions test/data/fetcher.py
@@ -268,6 +268,12 @@ def _member_io(
remote=Request("https://www.w3.org/2009/sparql/docs/tests/test-update.n3"),
local_path=(DATA_PATH / "defined_namespaces/ut.n3"),
),
FileResource(
remote=Request(
"https://github.com/web-platform-tests/wpt/raw/9d13065419df90d2ad71f3c6b78cc12e7800dae4/html/syntax/parsing/html5lib_tests1.html"
),
local_path=(DATA_PATH / "html5lib_tests1.html"),
),
]


28 changes: 28 additions & 0 deletions test/data/html5lib_tests1.html

Large diffs are not rendered by default.

20 changes: 20 additions & 0 deletions test/data/variants/diverse_triples.xml
@@ -0,0 +1,20 @@
<rdf:RDF
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns:eghttp="http://example.com/"
xmlns:egurn="urn:example:"
xmlns:egschema="example:"
xmlns:xsd="http://www.w3.org/2001/XMLSchema#" >
<rdf:Description rdf:about="example:object">
<eghttp:predicate>XSD string</eghttp:predicate>
</rdf:Description>
<rdf:Description rdf:about="http://example.com/subject">
<eghttp:predicate xml:lang="jpx">日本語の表記体系</eghttp:predicate>
</rdf:Description>
<rdf:Description rdf:about="urn:example:subject">
<egschema:predicate rdf:resource="example:subject"/>
</rdf:Description>
<rdf:Description rdf:about="example:subject">
<egschema:predicate rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">12</egschema:predicate>
<egschema:predicate rdf:resource="example:object"/>
</rdf:Description>
</rdf:RDF>
6 changes: 6 additions & 0 deletions test/data/variants/simple_triple.jsonld
@@ -0,0 +1,6 @@
{
"@id": "http://example.org/subject",
"http://example.org/predicate": {
"@id": "http://example.org/object"
}
}
2 changes: 2 additions & 0 deletions test/data/variants/simple_triple.ttl
@@ -0,0 +1,2 @@
<http://example.org/subject>
<http://example.org/predicate> <http://example.org/object> .
7 changes: 7 additions & 0 deletions test/data/variants/simple_triple.xml
@@ -0,0 +1,7 @@
<rdf:RDF
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns:j.0="http://example.org/" >
<rdf:Description rdf:about="http://example.org/subject">
<j.0:predicate rdf:resource="http://example.org/object"/>
</rdf:Description>
</rdf:RDF>
4 changes: 4 additions & 0 deletions test/jsonld/__init__.py
@@ -1,6 +1,10 @@
from typing import List

from rdflib import parser, plugin, serializer

assert plugin
assert serializer
assert parser
import json

__all__: List[str] = []
13 changes: 11 additions & 2 deletions test/test_graph/test_graph.py
@@ -4,6 +4,7 @@
from pathlib import Path
from test.data import TEST_DATA_DIR, bob, cheese, hates, likes, michel, pizza, tarek
from test.utils import GraphHelper, get_unique_plugin_names
from test.utils.httpfileserver import HTTPFileServer, ProtoFileResource
from typing import Callable, Optional, Set
from urllib.error import HTTPError, URLError

@@ -272,7 +273,9 @@ def test_graph_intersection(make_graph: GraphFactory):
assert (michel, likes, cheese) in g1


- def test_guess_format_for_parse(make_graph: GraphFactory):
+ def test_guess_format_for_parse(
+ make_graph: GraphFactory, http_file_server: HTTPFileServer
+ ):
graph = make_graph()

# files
@@ -329,10 +332,16 @@ def test_guess_format_for_parse(make_graph: GraphFactory):
graph.parse(data=rdf, format="xml")

# URI
file_info = http_file_server.add_file_with_caching(
ProtoFileResource(
(("Content-Type", "text/html; charset=UTF-8"),),
TEST_DATA_DIR / "html5lib_tests1.html",
),
)

# only getting HTML
with pytest.raises(PluginException):
graph.parse(location="https://www.google.com")
graph.parse(location=file_info.request_url)

try:
graph.parse(location="http://www.w3.org/ns/adms.ttl")
6 changes: 3 additions & 3 deletions test/test_graph/test_graph_http.py
@@ -3,11 +3,11 @@
from test.data import TEST_DATA_DIR
from test.utils import GraphHelper
from test.utils.graph import cached_graph
+ from test.utils.http import ctx_http_handler
from test.utils.httpservermock import (
MethodName,
MockHTTPResponse,
ServedBaseHTTPServerMock,
- ctx_http_server,
)
from urllib.error import HTTPError

@@ -106,7 +106,7 @@ def test_content_negotiation(self) -> None:
expected.add((EG.a, EG.b, EG.c))
expected_triples = GraphHelper.triple_set(expected)

- with ctx_http_server(ContentNegotiationHandler) as server:
+ with ctx_http_handler(ContentNegotiationHandler) as server:
(host, port) = server.server_address
if isinstance(host, (bytes, bytearray)):
host = host.decode("utf-8")
@@ -121,7 +121,7 @@ def test_content_negotiation_no_format(self) -> None:
expected.add((EG.a, EG.b, EG.c))
expected_triples = GraphHelper.triple_set(expected)

- with ctx_http_server(ContentNegotiationHandler) as server:
+ with ctx_http_handler(ContentNegotiationHandler) as server:
(host, port) = server.server_address
if isinstance(host, (bytes, bytearray)):
host = host.decode("utf-8")
11 changes: 8 additions & 3 deletions test/test_graph/test_variants.py
@@ -1,3 +1,5 @@
from __future__ import annotations

import json
import logging
import os
@@ -69,6 +71,11 @@ def check(
}
assert set(self.has_subject_iris) == subjects_iris

@classmethod
def from_path(cls, path: Path) -> GraphAsserts:
with path.open("r") as f:
return cls(**json.load(f))


@dataclass(order=True)
class GraphVariants:
@@ -122,9 +129,7 @@ def for_files(
else:
graph_variant = graph_varaint_dict[file_key]
if variant_key.endswith("-asserts.json"):
- graph_variant.asserts = GraphAsserts(
- **json.loads(file_path.read_text())
- )
+ graph_variant.asserts = GraphAsserts.from_path(file_path)
else:
graph_variant.variants[variant_key] = file_path
return graph_varaint_dict
Empty file added test/test_misc/__init__.py
Empty file.
15 changes: 0 additions & 15 deletions test/test_misc/test_create_input_source.py

This file was deleted.
