Skip to content

Commit

Permalink
fix: IRI to URI conversion (#2304)
Browse files Browse the repository at this point in the history
The IRI to URI conversion was percent-encoding characters that should not have
been quoted, like the equals sign in the query string. It was also not quoting
things that should have been quoted, like the username and password components
of a URI.

This change improves the conversion by only quoting characters that are not
allowed in specific parts of the URI and quoting previously unquoted components.
The safe characters for each segment are taken from
[RFC3986](https://datatracker.ietf.org/doc/html/rfc3986).

The new behavior is heavily inspired by
[`werkzeug.urls.iri_to_uri`](https://github.com/pallets/werkzeug/blob/92c6380248c7272ee668e1f8bbd80447027ccce2/src/werkzeug/urls.py#L926-L931)
though there are some differences.

- Closes <#2120>.
  • Loading branch information
aucampia committed Mar 23, 2023
1 parent cfe6e37 commit dfa4054
Show file tree
Hide file tree
Showing 5 changed files with 169 additions and 19 deletions.
90 changes: 75 additions & 15 deletions rdflib/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -522,32 +522,92 @@ def _coalesce(
return default


_RFC3986_SUBDELIMS = "!$&'()*+,;="
"""
The characters of the ``sub-delims`` production from `RFC 3986, section 2.2
<https://www.rfc-editor.org/rfc/rfc3986.html#section-2.2>`_.
"""

_RFC3986_PCHAR_NU = f"%{_RFC3986_SUBDELIMS}:@"
"""
The characters allowed by the ``pchar`` production from RFC 3986 that are not
``unreserved`` characters: ``%`` (as used by ``pct-encoded``), the
``sub-delims`` characters, ``:`` and ``@``.
"""

_QUERY_SAFE_CHARS = f"{_RFC3986_PCHAR_NU}/?"
"""
The non-unreserved characters that are safe to use in the query and fragment
components.

.. code-block::

    pchar         = unreserved / pct-encoded / sub-delims / ":" / "@"
    query         = *( pchar / "/" / "?" )
    fragment      = *( pchar / "/" / "?" )
"""

_USERNAME_SAFE_CHARS = f"{_RFC3986_SUBDELIMS}%"
"""
The non-unreserved characters that are safe to use in the username and password
components.

.. code-block::

    userinfo      = *( unreserved / pct-encoded / sub-delims / ":" )

``:`` is left out on purpose: this constant is only applied to the username and
the password individually, and ``:`` is the separator between the two.
"""

_PATH_SAFE_CHARS = f"{_RFC3986_PCHAR_NU}/"
"""
The non-unreserved characters that are safe to use in the path component.

This is based on various path-related productions from RFC 3986.
"""


def _iri2uri(iri: str) -> str:
"""
Convert an IRI to a URI (Python 3).
https://stackoverflow.com/a/42309027
https://stackoverflow.com/a/40654295
netloc should be encoded using IDNA;
non-ascii URL path should be encoded to UTF-8 and then percent-escaped;
non-ascii query parameters should be encoded to the encoding of a page
URL was extracted from (or to the encoding server uses), then
percent-escaped.
Prior art:
* `iri_to_uri from Werkzeug <https://github.com/pallets/werkzeug/blob/92c6380248c7272ee668e1f8bbd80447027ccce2/src/werkzeug/urls.py#L926-L931>`_
>>> _iri2uri("https://dbpedia.org/resource/Almería")
'https://dbpedia.org/resource/Almer%C3%ADa'
"""
# https://datatracker.ietf.org/doc/html/rfc3986
# https://datatracker.ietf.org/doc/html/rfc3305

(scheme, netloc, path, query, fragment) = urlsplit(iri)
parts = urlsplit(iri)
(scheme, netloc, path, query, fragment) = parts

# Just support http/https, otherwise return the iri unmolested
# Just support http/https, otherwise return the iri unaltered
if scheme not in ["http", "https"]:
return iri

scheme = quote(scheme)
netloc = netloc.encode("idna").decode("utf-8")
path = quote(path)
query = quote(query)
fragment = quote(fragment)
path = quote(path, safe=_PATH_SAFE_CHARS)
query = quote(query, safe=_QUERY_SAFE_CHARS)
fragment = quote(fragment, safe=_QUERY_SAFE_CHARS)

if parts.hostname:
netloc = parts.hostname.encode("idna").decode("ascii")
else:
netloc = ""

if ":" in netloc:
# Quote IPv6 addresses
netloc = f"[{netloc}]"

if parts.port:
netloc = f"{netloc}:{parts.port}"

if parts.username:
auth = quote(parts.username, safe=_USERNAME_SAFE_CHARS)
if parts.password:
pass_quoted = quote(parts.password, safe=_USERNAME_SAFE_CHARS)
auth = f"{auth}:{pass_quoted}"
netloc = f"{auth}@{netloc}"

uri = urlunsplit((scheme, netloc, path, query, fragment))

if iri.endswith("#") and not uri.endswith("#"):
Expand Down
43 changes: 39 additions & 4 deletions test/test_graph/test_graph_http.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,20 @@
import logging
import re
from http.server import BaseHTTPRequestHandler
from test.data import TEST_DATA_DIR
from test.utils import GraphHelper
from test.utils.graph import cached_graph
from test.utils.http import ctx_http_handler
from test.utils.http import (
MOCK_HTTP_REQUEST_WILDCARD,
MockHTTPRequest,
ctx_http_handler,
)
from test.utils.httpservermock import (
MethodName,
MockHTTPResponse,
ServedBaseHTTPServerMock,
)
from test.utils.wildcard import URL_PARSE_RESULT_WILDCARD
from urllib.error import HTTPError

import pytest
Expand Down Expand Up @@ -235,7 +241,34 @@ def test_5xx(self):
assert raised.value.code == 500


def test_iri_source(function_httpmock: ServedBaseHTTPServerMock) -> None:
@pytest.mark.parametrize(
["url_suffix", "expected_request"],
[
(
"/resource/Almería",
MOCK_HTTP_REQUEST_WILDCARD._replace(
path="/resource/Almer%C3%ADa",
parsed_path=URL_PARSE_RESULT_WILDCARD._replace(
path="/resource/Almer%C3%ADa"
),
),
),
(
"/resource/Almería?foo=bar",
MOCK_HTTP_REQUEST_WILDCARD._replace(
parsed_path=URL_PARSE_RESULT_WILDCARD._replace(
path="/resource/Almer%C3%ADa"
),
path_query={"foo": ["bar"]},
),
),
],
)
def test_iri_source(
url_suffix: str,
expected_request: MockHTTPRequest,
function_httpmock: ServedBaseHTTPServerMock,
) -> None:
diverse_triples_path = TEST_DATA_DIR / "variants/diverse_triples.ttl"

function_httpmock.responses[MethodName.GET].append(
Expand All @@ -247,9 +280,11 @@ def test_iri_source(function_httpmock: ServedBaseHTTPServerMock) -> None:
)
)
g = Graph()
g.parse(f"{function_httpmock.url}/resource/Almería")
g.parse(f"{function_httpmock.url}{url_suffix}")
assert function_httpmock.call_count == 1
GraphHelper.assert_triple_sets_equals(cached_graph((diverse_triples_path,)), g)
assert len(g) > 1

req = function_httpmock.requests[MethodName.GET].pop(0)
assert req.path == "/resource/Almer%C3%ADa"
logging.debug("req = %s", req)
assert expected_request == req
18 changes: 18 additions & 0 deletions test/test_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -635,6 +635,24 @@ def test_get_tree(
"http://example.com:1231/",
},
),
(
"http://example.com:1231/a=b",
{
"http://example.com:1231/a=b",
},
),
(
"http://aé:aé@example.com:1231/bé/a=bé&c=d#a=bé&c=d",
{
"http://a%C3%A9:a%C3%A9@example.com:1231/b%C3%A9/a=b%C3%A9&c=d#a=b%C3%A9&c=d",
},
),
(
"http://a%C3%A9:a%C3%A9@example.com:1231/b%C3%A9/a=b%C3%A9&c=d#a=b%C3%A9&c=d",
{
"http://a%C3%A9:a%C3%A9@example.com:1231/b%C3%A9/a=b%C3%A9&c=d#a=b%C3%A9&c=d",
},
),
],
)
def test_iri2uri(iri: str, expected_result: Union[Set[str], Type[Exception]]) -> None:
Expand Down
9 changes: 9 additions & 0 deletions test/utils/http.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import random
from contextlib import contextmanager
from http.server import BaseHTTPRequestHandler, HTTPServer
from test.utils.wildcard import EQ_WILDCARD
from threading import Thread
from typing import (
Dict,
Expand Down Expand Up @@ -62,6 +63,14 @@ class MockHTTPRequest(NamedTuple):
body: Optional[bytes]


MOCK_HTTP_REQUEST_WILDCARD = MockHTTPRequest(*((EQ_WILDCARD,) * 6))
"""
A `MockHTTPRequest` built by passing `EQ_WILDCARD` for each of its six
positional fields, so that it should compare equal to any `MockHTTPRequest`
object. Use ``_replace`` to pin the fields a test actually cares about.
"""


class MockHTTPResponse(NamedTuple):
status_code: int
reason_phrase: str
Expand Down
28 changes: 28 additions & 0 deletions test/utils/wildcard.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
from typing import Any
from urllib.parse import ParseResult


class EqWildcard:
    """
    An object that compares equal to anything.

    ``wildcard == x`` and ``x == wildcard`` are both true for any ``x`` whose
    own ``__eq__`` returns ``NotImplemented`` for unknown types (which is what
    the built-in types do), because Python then falls back to this class's
    ``__eq__``. Correspondingly ``!=`` is always false.

    As the class defines ``__eq__`` without ``__hash__``, instances are
    unhashable.
    """

    def __eq__(self, other: object) -> bool:
        """Return `True` regardless of what ``other`` is."""
        return True

    def __ne__(self, other: object) -> bool:
        """Return `False` regardless of what ``other`` is."""
        return False

    # NOTE: ``__req__`` is not part of the Python data model; there is no
    # reflected variant of ``__eq__`` because ``__eq__`` is its own reflection.
    # The name is kept only as an alias so that any existing direct callers
    # keep working.
    __req__ = __eq__

    def __repr__(self) -> str:
        """Return a fixed, constructor-like representation."""
        return "EqWildcard()"


# Shared wildcard instance; typed as ``Any`` so it can stand in for a field of
# any type without upsetting type checkers.
EQ_WILDCARD: Any = EqWildcard()


URL_PARSE_RESULT_WILDCARD = ParseResult(
    scheme=EQ_WILDCARD,
    netloc=EQ_WILDCARD,
    path=EQ_WILDCARD,
    params=EQ_WILDCARD,
    query=EQ_WILDCARD,
    fragment=EQ_WILDCARD,
)
"""
A `ParseResult` with every field set to `EQ_WILDCARD`, so that it should
compare equal to any `ParseResult` object. Use ``_replace`` to pin the fields a
test actually cares about.
"""

0 comments on commit dfa4054

Please sign in to comment.