Skip to content

Commit

Permalink
fix: two issues with the N3 serializer
Browse files Browse the repository at this point in the history
This patch fixes two issues with the N3 serializer:
- The N3 serializer incorrectly considered a subject as already
  serialized if it has been serialized inside a quoted graph.
- The N3 serializer did not consider that the predicate of
  a triple can also be a graph.

Other changes included in this patch:
- Added the N3 test suite from https://github.com/w3c/N3/tree/master/tests
- Added `test/data/fetcher.py` which fetches remote test data.
- Changed `test.testutils.GraphHelper` to support nested graphs.

Fixes:
- #1807
- #1701

Related:
- #1840
  • Loading branch information
aucampia committed Apr 23, 2022
1 parent e4aae60 commit d57d967
Show file tree
Hide file tree
Showing 1,837 changed files with 624,378 additions and 166 deletions.
33 changes: 3 additions & 30 deletions rdflib/plugins/serializers/n3.py
Expand Up @@ -23,36 +23,6 @@ def reset(self):
super(N3Serializer, self).reset()
self._stores = {}

def subjectDone(self, subject):
    """Mark *subject* as serialized, and propagate the mark to the
    parent serializer when one exists."""
    super(N3Serializer, self).subjectDone(subject)
    parent = self.parent
    if parent:
        parent.subjectDone(subject)

def isDone(self, subject):
    """Report whether *subject* is fully serialized.

    A subject counts as done only when this serializer considers it
    done AND the parent serializer (when present) agrees.
    """
    if not super(N3Serializer, self).isDone(subject):
        return False
    return not self.parent or self.parent.isDone(subject)

def startDocument(self):
    """Begin serializing a document; currently just delegates to the base
    serializer.

    The commented-out code below would emit ``@forAll``/``@forSome``
    declarations for quantified variables; it appears to depend on an
    ``N3Store`` API (``get_universals``/``get_existentials``) that is not
    available here — TODO confirm before reviving it.
    """
    super(N3Serializer, self).startDocument()
    # if not isinstance(self.store, N3Store):
    #     return
    #
    # all_list = [self.label(var) for var in
    #             self.store.get_universals(recurse=False)]
    # all_list.sort()
    # some_list = [self.label(var) for var in
    #              self.store.get_existentials(recurse=False)]
    # some_list.sort()
    #
    # for var in all_list:
    #     self.write('\n'+self.indent()+'@forAll %s. '%var)
    # for var in some_list:
    #     self.write('\n'+self.indent()+'@forSome %s. '%var)
    #
    # if (len(all_list) + len(some_list)) > 0:
    #     self.write('\n')

def endDocument(self):
if not self.parent:
super(N3Serializer, self).endDocument()
Expand All @@ -68,6 +38,9 @@ def preprocessTriple(self, triple):
if isinstance(triple[0], Graph):
for t in triple[0]:
self.preprocessTriple(t)
if isinstance(triple[1], Graph):
for t in triple[1]:
self.preprocessTriple(t)
if isinstance(triple[2], Graph):
for t in triple[2]:
self.preprocessTriple(t)
Expand Down
26 changes: 23 additions & 3 deletions test/data/README.md
@@ -1,9 +1,29 @@
# Consistent Test Data
# Test Data

This directory contains consistent graphs that can be used inside tests, the
graphs in this directory should not change.
This directory contains data for use inside tests. Ideally the data in this
directory should be constant and should not change. In general, widely known
non-original data is preferred to original data, because well-known data has
well-known attributes and qualities that make it easier to reason about.


## File origins

- `rdfs.ttl`: `http://www.w3.org/2000/01/rdf-schema#`

## Fetcher

Files that originate from the internet should be downloaded using `fetcher.py`
so we can easily verify the integrity of the files by re-running `fetcher.py`.

```bash
# run in repo root

# fetch everything
.venv/bin/python3 test/data/fetcher.py

# only fetch single file
.venv/bin/python3 test/data/fetcher.py test/data/rdfs.ttl

# only fetch files below path:
.venv/bin/python3 test/data/fetcher.py test/data/suites
```
308 changes: 308 additions & 0 deletions test/data/fetcher.py
@@ -0,0 +1,308 @@
import argparse
import enum
import logging
import os
import random
import re
import shutil
import string
import sys
import tarfile
from contextlib import ExitStack, contextmanager
from dataclasses import dataclass, field
from pathlib import Path
from tarfile import TarFile, TarInfo
from tempfile import TemporaryDirectory, mkdtemp
from typing import IO, Generator, List, Pattern, Union
from urllib.request import Request, urlopen
from zipfile import ZipFile, ZipInfo

DATA_PATH = Path(__file__).parent


@dataclass
class Resource:
    """Base class for a remote artifact that can be fetched into the
    local test-data tree.

    Subclasses implement :meth:`fetch`.
    """

    # URL string or prepared ``urllib`` Request identifying the remote data.
    remote: Union[str, Request]
    # Destination path under the test-data directory.
    local_path: Path

    def fetch(self, tmp_path: Path) -> None:
        """Download this resource; must be overridden by subclasses."""
        raise NotImplementedError()


@dataclass
class FileResource(Resource):
    """A single remote file downloaded verbatim to ``local_path``."""

    def fetch(self, tmp_path: Path) -> None:
        """Download ``self.remote`` and write it to ``self.local_path``.

        Any existing file is removed first so the result is always a fresh
        copy.  ``tmp_path`` is unused but kept for interface compatibility
        with other :class:`Resource` subclasses.
        """
        if self.local_path.exists():
            logging.debug("removing existing file %s", self.local_path)
            os.remove(self.local_path)

        # BUGFIX: ensure the destination directory exists before opening the
        # file — ArchiveResource creates parent directories, but this class
        # did not, so fetching into a new subtree failed.
        self.local_path.parent.mkdir(parents=True, exist_ok=True)

        with ExitStack() as xstack:
            # Normalize the remote spec to a Request so headers can be set
            # by callers and full_url is available for logging.
            request = (
                self.remote
                if isinstance(self.remote, Request)
                else Request(self.remote)
            )
            remote_io: IO[bytes] = xstack.enter_context(urlopen(request))
            local_io = xstack.enter_context(self.local_path.open("wb+"))
            shutil.copyfileobj(remote_io, local_io)

        logging.info("Downloaded %s to %s", request.full_url, self.local_path)


class ArchiveType(enum.Enum):
    """Supported archive container formats for :class:`ArchiveResource`."""

    ZIP = "zip"
    TAR_GZ = "tar.gz"


@dataclass
class ArchiveResource(Resource):
    """A remote archive (zip or tar.gz) whose matching members are
    extracted below ``local_path``.

    Attributes:
        type: Container format of the archive.
        pattern: Regex matched against each member's path inside the
            archive; unmatched members are skipped.  If the pattern has a
            capture group, group 1 becomes the destination path relative to
            ``local_path``; otherwise the member's full path is used.
    """

    type: ArchiveType
    pattern: Pattern[str]

    def fetch(self, tmp_path: Path) -> None:
        """Download the archive into ``tmp_path`` and extract matching
        members below ``local_path``.

        Any previously extracted tree at ``local_path`` is removed first so
        the result reflects exactly the current archive contents.
        """
        if self.local_path.exists():
            logging.debug("removing existing tree %s", self.local_path)
            shutil.rmtree(self.local_path)
        with ExitStack() as xstack:
            request = (
                self.remote
                if isinstance(self.remote, Request)
                else Request(self.remote)
            )
            remote_io: IO[bytes] = xstack.enter_context(urlopen(request))

            # Download to a uniquely named temp file so concurrent runs
            # cannot collide; keep the archive-type suffix visible.
            name = (
                "".join(
                    random.choices(
                        string.ascii_uppercase + string.digits + string.ascii_lowercase,
                        k=10,
                    )
                )
                + f".{self.type.value}"
            )
            tmp_file = tmp_path / name
            logging.info("fetching %s to temp file %s", self.remote, tmp_file)
            with tmp_file.open("wb+") as tmp_io:
                shutil.copyfileobj(remote_io, tmp_io)

            archive_file: Union[ZipFile, TarFile]
            if self.type is ArchiveType.ZIP:
                archive_file = xstack.enter_context(ZipFile(tmp_file))
            elif self.type is ArchiveType.TAR_GZ:
                archive_file = xstack.enter_context(tarfile.open(tmp_file, mode="r:gz"))
            else:
                raise ValueError(f"invalid type {self.type}")

            for member_info in self._member_list(archive_file):
                member_filename = self._member_filename(member_info)
                if self._member_isdir(member_info):
                    logging.debug("Ignoring directory %s", member_filename)
                    continue

                match = self.pattern.match(member_filename)
                if match is None:
                    logging.debug("Ignoring unmatched %s", member_filename)
                    continue
                groups = match.groups()
                # BUGFIX: dest_filename was previously unbound (NameError)
                # when the pattern had no capture groups; fall back to the
                # member's own path in that case.
                dest_filename = groups[0] if groups else member_filename

                member_io: IO[bytes]
                with self._member_io(archive_file, member_info) as member_io:
                    local_file = self.local_path / dest_filename
                    # exist_ok avoids a check-then-create race and repeated
                    # exists() calls for members sharing a directory.
                    local_file.parent.mkdir(parents=True, exist_ok=True)
                    logging.debug("writing %s to %s", member_filename, local_file)
                    local_file.write_bytes(member_io.read())

            logging.info(
                "Downloaded %s and extracted files matching %s to %s",
                request.full_url,
                self.pattern,
                self.local_path,
            )

    @classmethod
    def _member_list(
        cls, archive: Union[ZipFile, TarFile]
    ) -> Union[List[ZipInfo], List[TarInfo]]:
        """Return the archive's member records, regardless of format."""
        if isinstance(archive, ZipFile):
            return archive.infolist()
        return archive.getmembers()

    @classmethod
    def _member_isdir(cls, member_info: Union[ZipInfo, TarInfo]) -> bool:
        """Return True if the member record is a directory entry."""
        if isinstance(member_info, ZipInfo):
            return member_info.is_dir()
        return member_info.isdir()

    @classmethod
    def _member_filename(cls, member_info: Union[ZipInfo, TarInfo]) -> str:
        """Return the member's path inside the archive."""
        if isinstance(member_info, ZipInfo):
            return member_info.filename
        return member_info.name

    @classmethod
    @contextmanager
    def _member_io(
        cls, archive: Union[ZipFile, TarFile], member_info: Union[ZipInfo, TarInfo]
    ) -> Generator[IO[bytes], None, None]:
        """Yield a binary stream for reading one archive member."""
        if isinstance(archive, ZipFile):
            assert isinstance(member_info, ZipInfo)
            with archive.open(member_info) as member_io:
                yield member_io
        else:
            assert isinstance(member_info, TarInfo)
            opt_io = archive.extractfile(member_info)
            # extractfile returns None only for non-file members, which were
            # filtered out by _member_isdir above.
            assert opt_io is not None
            yield opt_io


# Registry of all remote test-data resources; ``fetcher.py`` downloads each
# entry to its ``local_path`` under the test-data directory.
RESOURCES: List[Resource] = [
    # W3C N3 test suite pinned to a specific commit; only files under the
    # repository's ``tests/`` directory are extracted (leading directory
    # stripped by the capture group).
    ArchiveResource(
        remote="https://github.com/w3c/N3/archive/c44d123c5958ca04117e28ca3769e2c0820f72e6.zip",
        local_path=(DATA_PATH / "suites" / "w3c" / "n3"),
        type=ArchiveType.ZIP,
        pattern=re.compile(r"^[^\/]+[\/]tests[\/](.+)$"),
    ),
    # W3C Turtle test suite; the archive's single top-level directory is
    # stripped by the capture group.
    ArchiveResource(
        remote="https://www.w3.org/2013/TurtleTests/TESTS.tar.gz",
        local_path=(DATA_PATH / "suites" / "w3c" / "turtle"),
        type=ArchiveType.TAR_GZ,
        pattern=re.compile(r"^[^\/]+[\/](.+)$"),
    ),
    # W3C N-Quads test suite (archive has no wrapping directory).
    ArchiveResource(
        remote="https://www.w3.org/2013/N-QuadsTests/TESTS.tar.gz",
        local_path=(DATA_PATH / "suites" / "w3c" / "nquads"),
        type=ArchiveType.TAR_GZ,
        pattern=re.compile(r"^(.+)$"),
    ),
    # W3C N-Triples test suite (archive has no wrapping directory).
    ArchiveResource(
        remote="https://www.w3.org/2013/N-TriplesTests/TESTS.tar.gz",
        local_path=(DATA_PATH / "suites" / "w3c" / "ntriples"),
        type=ArchiveType.TAR_GZ,
        pattern=re.compile(r"^(.+)$"),
    ),
    # W3C TriG test suite (archive has no wrapping directory).
    ArchiveResource(
        remote="https://www.w3.org/2013/TrigTests/TESTS.tar.gz",
        local_path=(DATA_PATH / "suites" / "w3c" / "trig"),
        type=ArchiveType.TAR_GZ,
        pattern=re.compile(r"^(.+)$"),
    ),
    # NOTE: Commented out as these files contains local modifications.
    # ArchiveResource(
    #     remote="https://www.w3.org/2013/RDFXMLTests/TESTS.zip",
    #     local_path=(DATA_PATH / "suites" / "w3c" / "rdfxml"),
    #     type=ArchiveType.ZIP,
    #     pattern=re.compile(r"^(.+)$"),
    # ),
    # NOTE: Commented out as this contains local modifications.
    # ArchiveResource(
    #     remote="https://www.w3.org/2009/sparql/docs/tests/sparql11-test-suite-20121023.tar.gz",
    #     local_path=(DATA_PATH / "suites" / "DAWG" / "data-sparql11"),
    #     type=ArchiveType.TAR_GZ,
    #     pattern=re.compile(r"^[^\/]+[\/](.+)$"),
    # ),
    # RDFS schema fetched with content negotiation so the Turtle
    # serialization is returned.
    FileResource(
        remote=Request(
            "http://www.w3.org/2000/01/rdf-schema#", headers={"Accept": "text/turtle"}
        ),
        local_path=(DATA_PATH / "rdfs.ttl"),
    ),
]


@dataclass
class Application:
    """Command-line application that fetches the resources in ``RESOURCES``.

    With no positional arguments every resource is fetched; otherwise only
    resources whose ``local_path`` lies under one of the given paths.
    """

    parser: argparse.ArgumentParser = field(
        default_factory=lambda: argparse.ArgumentParser(add_help=True)
    )

    def __post_init__(self) -> None:
        """Register the CLI options on the parser."""
        parser = self.parser
        parser.add_argument(
            "-v",
            "--verbose",
            action="count",
            dest="verbosity",
            help="increase verbosity level",
        )
        parser.add_argument(
            "--keep-tmp",
            action="store_true",
            default=False,
            help="keep the temporary download directory for inspection",
        )
        parser.add_argument("paths", nargs="*", type=str)
        parser.set_defaults(handler=self.handle)

    def run(self, args: List[str]) -> None:
        """Parse *args*, adjust logging verbosity, and dispatch the handler."""
        parse_result = self.parser.parse_args(args)

        verbosity = parse_result.verbosity
        if verbosity is not None:
            root_logger = logging.getLogger("")
            root_logger.propagate = True
            # First -v drops one whole level (10); each additional -v (up to
            # 9 of them) shaves one more point off the numeric level.
            new_level = (
                root_logger.getEffectiveLevel()
                - (min(1, verbosity)) * 10
                - min(max(0, verbosity - 1), 9) * 1
            )
            root_logger.setLevel(new_level)

        logging.debug(
            "args = %s, parse_result = %s, logging.level = %s",
            args,
            parse_result,
            logging.getLogger("").getEffectiveLevel(),
        )

        parse_result.handler(parse_result)

    def handle(self, parse_result: argparse.Namespace) -> None:
        """Fetch all resources selected by ``parse_result.paths``."""
        logging.debug("entry ...")

        paths = {Path(path).absolute() for path in parse_result.paths}

        logging.debug("paths = %s", paths)

        with ExitStack() as xstack:
            if parse_result.keep_tmp:
                # mkdtemp is never cleaned up, so downloads stay around.
                tmp_path = Path(mkdtemp())
            else:
                # FIX: previously the TemporaryDirectory was only removed
                # when the object happened to be garbage collected; manage
                # it explicitly so cleanup is deterministic.
                tmp_path = Path(xstack.enter_context(TemporaryDirectory()))

            for resource in RESOURCES:
                if paths and not self._selected(resource.local_path, paths):
                    logging.info("skipping %s", resource.local_path)
                    continue
                resource.fetch(tmp_path)

    @staticmethod
    def _selected(local_path: Path, paths: "set[Path]") -> bool:
        """Return True if *local_path* lies under any of *paths*."""
        abs_path = local_path.absolute()
        for path in paths:
            try:
                abs_path.relative_to(path)
                return True
            except ValueError:
                # not relative to this path, keep looking
                pass
        return False


def main() -> None:
    """Configure root logging from the environment and run the fetcher CLI."""
    log_format = (
        "%(asctime)s.%(msecs)03d %(process)d %(thread)d %(levelno)03d:%(levelname)-8s "
        "%(name)-12s %(module)s:%(lineno)s:%(funcName)s %(message)s"
    )
    # PYLOGGING_LEVEL overrides the default INFO level.
    logging.basicConfig(
        level=os.environ.get("PYLOGGING_LEVEL", logging.INFO),
        stream=sys.stderr,
        datefmt="%Y-%m-%dT%H:%M:%S",
        format=log_format,
    )

    Application().run(sys.argv[1:])


if __name__ == "__main__":
    main()
4 changes: 4 additions & 0 deletions test/data/suites/w3c/n3/LICENSE.md
@@ -0,0 +1,4 @@
All documents in this Repository are licensed by contributors
under both the [W3C Test Suite License](http://www.w3.org/Consortium/Legal/2008/04-testsuite-license) and
the [W3C Software and Document License](https://www.w3.org/Consortium/Legal/copyright-software).

0 comments on commit d57d967

Please sign in to comment.