Skip to content

Commit

Permalink
Fixes #1429, add iri2uri (#1902)
Browse files Browse the repository at this point in the history
Add an iri-to-uri conversion utility to encode IRIs to URIs for `Graph.parse()` sources. Added a couple of tests because feeding it with a suite of IRIs to check seems overkill (not that I could find one).

Fixes #1429

Co-authored-by: Iwan Aucamp <aucampia@gmail.com>
  • Loading branch information
Graham Higgins and aucampia committed May 19, 2022
1 parent 10f33ee commit 32923ce
Show file tree
Hide file tree
Showing 7 changed files with 163 additions and 10 deletions.
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ pep8-naming = ["-N802", "-N803", "-N806", "-N815"]
pep8-naming = ["-N802", "-N803", "-N806", "-N816"]
[tool.flakeheaven.exceptions."rdflib/plugins/serializers/turtle.py"]
pep8-naming = ["-N802", "-N806", "-N815"]
[tool.flakeheaven.exceptions."rdflib/__init__.py"]
pycodestyle = ["-E402"]


[tool.black]
Expand Down
12 changes: 5 additions & 7 deletions rdflib/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,8 @@
"VOID",
"XSD",
"util",
"plugin",
"query",
]

import logging
Expand Down Expand Up @@ -157,7 +159,7 @@
Literal work, eq, __neq__, __lt__, etc.
"""

from rdflib import plugin, query

from rdflib.graph import ConjunctiveGraph, Dataset, Graph
from rdflib.namespace import (
BRICK,
Expand Down Expand Up @@ -190,9 +192,5 @@
)
from rdflib.term import BNode, IdentifiedNode, Literal, URIRef, Variable

# tedious sop to flake8
assert plugin
assert query

from rdflib import util
from rdflib.container import *
from rdflib import plugin, query, util # isort:skip
from rdflib.container import * # isort:skip # noqa:F401,F403
5 changes: 4 additions & 1 deletion rdflib/graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,11 @@
from urllib.request import url2pathname
from warnings import warn

import rdflib.exceptions as exceptions
import rdflib.namespace as namespace # noqa: F401 # This is here because it is used in a docstring.
import rdflib.plugin as plugin
import rdflib.query as query
import rdflib.util # avoid circular dependency
from rdflib import exceptions, namespace, plugin, query
from rdflib.collection import Collection
from rdflib.exceptions import ParserError
from rdflib.namespace import RDF, Namespace, NamespaceManager
Expand Down
3 changes: 2 additions & 1 deletion rdflib/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
from urllib.request import Request, url2pathname, urlopen
from xml.sax import xmlreader

import rdflib.util
from rdflib import __version__
from rdflib.namespace import Namespace
from rdflib.term import URIRef
Expand Down Expand Up @@ -448,7 +449,7 @@ def _create_input_source_from_location(

base = pathlib.Path.cwd().as_uri()

absolute_location = URIRef(location, base=base)
absolute_location = URIRef(rdflib.util._iri2uri(location), base=base)

if absolute_location.startswith("file:///"):
filename = url2pathname(absolute_location.replace("file:///", "/"))
Expand Down
35 changes: 35 additions & 0 deletions rdflib/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
Tuple,
TypeVar,
)
from urllib.parse import quote, urlsplit, urlunsplit

import rdflib.graph # avoid circular dependency
from rdflib.compat import sign
Expand All @@ -58,6 +59,7 @@
"find_roots",
"get_tree",
"_coalesce",
"_iri2uri",
]


Expand Down Expand Up @@ -476,3 +478,36 @@ def _coalesce(*args: Optional[_AnyT]) -> Optional[_AnyT]:
if arg is not None:
return arg
return None


def _iri2uri(iri: str) -> str:
"""
Convert an IRI to a URI (Python 3).
https://stackoverflow.com/a/42309027
https://stackoverflow.com/a/40654295
netloc should be encoded using IDNA;
non-ascii URL path should be encoded to UTF-8 and then percent-escaped;
non-ascii query parameters should be encoded to the encoding of a page
URL was extracted from (or to the encoding server uses), then
percent-escaped.
>>> _iri2uri("https://dbpedia.org/resource/Almería")
'https://dbpedia.org/resource/Almer%C3%ADa'
"""

(scheme, netloc, path, query, fragment) = urlsplit(iri)

# Just support http/https, otherwise return the iri unmolested
if scheme not in ["http", "https"]:
return iri

scheme = quote(scheme)
netloc = quote(netloc.encode("idna").decode("utf-8"))
path = quote(path)
query = quote(query)
fragment = quote(fragment)
uri = urlunsplit((scheme, netloc, path, query, fragment))

if iri.endswith("#") and not uri.endswith("#"):
uri += "#"

return uri
37 changes: 37 additions & 0 deletions test/test_graph/test_graph_http.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,15 @@
import re
from http.server import BaseHTTPRequestHandler
from test.data import TEST_DATA_DIR
from test.utils import GraphHelper
from test.utils.graph import cached_graph
from test.utils.httpservermock import (
MethodName,
MockHTTPResponse,
ServedBaseHTTPServerMock,
ctx_http_server,
)
from typing import Generator
from urllib.error import HTTPError

import pytest
Expand Down Expand Up @@ -227,3 +230,37 @@ def test_5xx(self):
graph.parse(location=url, format="turtle")

assert raised.value.code == 500


@pytest.fixture(scope="module")
def module_httpmock() -> Generator[ServedBaseHTTPServerMock, None, None]:
    """
    Yield a ``ServedBaseHTTPServerMock`` that is entered once per test module
    and exited when the module's tests are done.
    """
    with ServedBaseHTTPServerMock() as served_mock:
        yield served_mock


@pytest.fixture(scope="function")
def httpmock(
    module_httpmock: ServedBaseHTTPServerMock,
) -> Generator[ServedBaseHTTPServerMock, None, None]:
    """
    Yield the module-scoped HTTP mock after calling ``reset()`` on it, so
    each test function starts from a freshly reset mock.
    """
    fresh_mock = module_httpmock
    fresh_mock.reset()
    yield fresh_mock


def test_iri_source(httpmock: ServedBaseHTTPServerMock) -> None:
    """
    Parsing from a location that contains non-ASCII characters issues a
    single GET request whose path is percent-encoded, and loads the served
    Turtle document into the graph.
    """
    diverse_triples_path = TEST_DATA_DIR / "variants/diverse_triples.ttl"

    # Queue the one response the mock server will hand out.
    turtle_response = MockHTTPResponse(
        200,
        "OK",
        diverse_triples_path.read_bytes(),
        {"Content-Type": ["text/turtle"]},
    )
    httpmock.responses[MethodName.GET].append(turtle_response)

    g = Graph()
    iri_location = f"{httpmock.url}/resource/Almería"
    g.parse(iri_location)

    assert httpmock.call_count == 1
    expected_graph = cached_graph((diverse_triples_path,))
    GraphHelper.assert_triple_sets_equals(expected_graph, g)

    # The request that reached the server must carry the escaped path.
    recorded_request = httpmock.requests[MethodName.GET].pop(0)
    assert recorded_request.path == "/resource/Almer%C3%ADa"
79 changes: 78 additions & 1 deletion test/test_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from rdflib.graph import ConjunctiveGraph, Graph, QuotedGraph
from rdflib.namespace import RDF, RDFS
from rdflib.term import BNode, IdentifiedNode, Literal, Node, URIRef
from rdflib.util import _coalesce, find_roots, get_tree
from rdflib.util import _coalesce, _iri2uri, find_roots, get_tree

n3source = """\
@prefix : <http://www.w3.org/2000/10/swap/Primer#>.
Expand Down Expand Up @@ -547,3 +547,80 @@ def test_get_tree(
assert catcher.value is not None
else:
assert expected_result == result


@pytest.mark.parametrize(
    ["iri", "expected_result"],
    [
        (
            "https://example.com/resource/Almería",
            {
                "https://example.com/resource/Almer%C3%ADa",
            },
        ),
        (
            "https://example.com/resource/Almeria",
            {
                "https://example.com/resource/Almeria",
            },
        ),
        (
            "https://åæø.example.com/",
            {
                "https://xn--5cac8c.example.com/",
            },
        ),
        (
            # Unchanged: only http and https IRIs are converted.
            "example:é",
            {
                "example:é",
            },
        ),
        (
            # Unchanged: only http and https IRIs are converted.
            "urn:example:é",
            {
                "urn:example:é",
            },
        ),
        (
            "http://example.com/?é=1",
            {
                "http://example.com/?%C3%A9=1",
                "http://example.com/?%C3%A9%3D1",
            },
        ),
        (
            "http://example.com/#é",
            {
                "http://example.com/#%C3%A9",
            },
        ),
        (
            "http://example.com/é#",
            {
                "http://example.com/%C3%A9#",
            },
        ),
    ],
)
def test_iri2uri(iri: str, expected_result: Union[Set[str], Type[Exception]]) -> None:
    """
    Tests that ``_iri2uri`` converts ``iri`` to one of the acceptable URIs in
    ``expected_result``, or raises ``expected_result`` when that is an
    exception type.
    """
    if isinstance(expected_result, type) and issubclass(expected_result, Exception):
        # Failure case: the call itself has to raise the given exception.
        with pytest.raises(expected_result) as catcher:
            result = _iri2uri(iri)
            logging.debug("result = %s", result)
        assert catcher.value is not None
        return

    # Success case: any member of the expected set is an acceptable result.
    result = _iri2uri(iri)
    logging.debug("result = %s", result)
    assert isinstance(expected_result, set)
    assert result in expected_result

0 comments on commit 32923ce

Please sign in to comment.