Skip to content

Commit

Permalink
fix: two issues with the N3 serializer
Browse files Browse the repository at this point in the history
This patch fixes two issues with the N3 serializer:
- The N3 serializer incorrectly considered a subject as already
  serialized if it has been serialized inside a quoted graph.
- The N3 serializer did not consider that the predicate of
  a triple can also be a graph.

Other changes included in this patch:
- Added the N3 test suite from https://github.com/w3c/N3/tree/master/tests
- Added `test/data/fetcher.py` which fetches remote test data.
- Changed `test.testutils.GraphHelper` to support nested graphs.

Fixes:
- #1807
- #1701

Related:
- #1840
  • Loading branch information
aucampia committed Apr 23, 2022
1 parent e4aae60 commit d57d967
Show file tree
Hide file tree
Showing 1,837 changed files with 624,378 additions and 166 deletions.
33 changes: 3 additions & 30 deletions rdflib/plugins/serializers/n3.py
Expand Up @@ -23,36 +23,6 @@ def reset(self):
super(N3Serializer, self).reset()
self._stores = {}

def subjectDone(self, subject):
    """Mark *subject* as serialized, and propagate the mark to the
    parent serializer when one exists."""
    super(N3Serializer, self).subjectDone(subject)
    parent = self.parent
    if parent:
        parent.subjectDone(subject)

def isDone(self, subject):
    """Report whether *subject* is fully serialized.

    A subject counts as done only when this serializer considers it
    done AND the parent serializer (when present) agrees.
    """
    if not super(N3Serializer, self).isDone(subject):
        return False
    return not self.parent or self.parent.isDone(subject)

def startDocument(self):
    """Begin serializing a document; currently just delegates to the base
    serializer.

    The commented-out code below would emit ``@forAll``/``@forSome``
    declarations for quantified variables; it appears to depend on an
    ``N3Store`` API (``get_universals``/``get_existentials``) that is not
    available here — TODO confirm before reviving it.
    """
    super(N3Serializer, self).startDocument()
    # if not isinstance(self.store, N3Store):
    #     return
    #
    # all_list = [self.label(var) for var in
    #             self.store.get_universals(recurse=False)]
    # all_list.sort()
    # some_list = [self.label(var) for var in
    #              self.store.get_existentials(recurse=False)]
    # some_list.sort()
    #
    # for var in all_list:
    #     self.write('\n'+self.indent()+'@forAll %s. '%var)
    # for var in some_list:
    #     self.write('\n'+self.indent()+'@forSome %s. '%var)
    #
    # if (len(all_list) + len(some_list)) > 0:
    #     self.write('\n')

def endDocument(self):
if not self.parent:
super(N3Serializer, self).endDocument()
Expand All @@ -68,6 +38,9 @@ def preprocessTriple(self, triple):
if isinstance(triple[0], Graph):
for t in triple[0]:
self.preprocessTriple(t)
if isinstance(triple[1], Graph):
for t in triple[1]:
self.preprocessTriple(t)
if isinstance(triple[2], Graph):
for t in triple[2]:
self.preprocessTriple(t)
Expand Down
26 changes: 23 additions & 3 deletions test/data/README.md
@@ -1,9 +1,29 @@
# Consistent Test Data
# Test Data

This directory contains consistent graphs that can be used inside tests, the
graphs in this directory should not change.
This directory contains data for use inside tests. Ideally the data in this
directory should be constant and should not change. In general, widely known
non-original data is preferred to original data, because well-known data has
well-known attributes and qualities that make it easier to reason about.


## File origins

- `rdfs.ttl`: `http://www.w3.org/2000/01/rdf-schema#`

## Fetcher

Files that originate from the internet should be downloaded using `fetcher.py`
so we can easily verify the integrity of the files by re-running `fetcher.py`.

```bash
# run in repo root

# fetch everything
.venv/bin/python3 test/data/fetcher.py

# only fetch single file
.venv/bin/python3 test/data/fetcher.py test/data/rdfs.ttl

# only fetch files below path:
.venv/bin/python3 test/data/fetcher.py test/data/suites
```
308 changes: 308 additions & 0 deletions test/data/fetcher.py
@@ -0,0 +1,308 @@
import argparse
import enum
import logging
import os
import random
import re
import shutil
import string
import sys
import tarfile
from contextlib import ExitStack, contextmanager
from dataclasses import dataclass, field
from pathlib import Path
from tarfile import TarFile, TarInfo
from tempfile import TemporaryDirectory, mkdtemp
from typing import IO, Generator, List, Pattern, Union
from urllib.request import Request, urlopen
from zipfile import ZipFile, ZipInfo

DATA_PATH = Path(__file__).parent


@dataclass
class Resource:
    """Base class for a remote artifact that can be fetched into the
    local test-data tree.

    Subclasses implement :meth:`fetch`.
    """

    # URL string or prepared ``urllib`` Request identifying the remote data.
    remote: Union[str, Request]
    # Destination path under the test-data directory.
    local_path: Path

    def fetch(self, tmp_path: Path) -> None:
        """Download this resource; must be overridden by subclasses."""
        raise NotImplementedError()


@dataclass
class FileResource(Resource):
    """A single remote file downloaded verbatim to ``local_path``."""

    def fetch(self, tmp_path: Path) -> None:
        """Download ``self.remote`` and write it to ``self.local_path``.

        Any existing file is removed first so the result is always a fresh
        copy.  ``tmp_path`` is unused but kept for interface compatibility
        with other :class:`Resource` subclasses.
        """
        if self.local_path.exists():
            logging.debug("removing existing file %s", self.local_path)
            os.remove(self.local_path)

        # BUGFIX: ensure the destination directory exists before opening the
        # file — ArchiveResource creates parent directories, but this class
        # did not, so fetching into a new subtree failed.
        self.local_path.parent.mkdir(parents=True, exist_ok=True)

        with ExitStack() as xstack:
            # Normalize the remote spec to a Request so headers can be set
            # by callers and full_url is available for logging.
            request = (
                self.remote
                if isinstance(self.remote, Request)
                else Request(self.remote)
            )
            remote_io: IO[bytes] = xstack.enter_context(urlopen(request))
            local_io = xstack.enter_context(self.local_path.open("wb+"))
            shutil.copyfileobj(remote_io, local_io)

        logging.info("Downloaded %s to %s", request.full_url, self.local_path)


class ArchiveType(enum.Enum):
    """Supported archive container formats for :class:`ArchiveResource`."""

    ZIP = "zip"
    TAR_GZ = "tar.gz"


@dataclass
class ArchiveResource(Resource):
    """A remote archive (zip or tar.gz) whose matching members are
    extracted below ``local_path``.

    Attributes:
        type: Container format of the archive.
        pattern: Regex matched against each member's path inside the
            archive; unmatched members are skipped.  If the pattern has a
            capture group, group 1 becomes the destination path relative to
            ``local_path``; otherwise the member's full path is used.
    """

    type: ArchiveType
    pattern: Pattern[str]

    def fetch(self, tmp_path: Path) -> None:
        """Download the archive into ``tmp_path`` and extract matching
        members below ``local_path``.

        Any previously extracted tree at ``local_path`` is removed first so
        the result reflects exactly the current archive contents.
        """
        if self.local_path.exists():
            logging.debug("removing existing tree %s", self.local_path)
            shutil.rmtree(self.local_path)
        with ExitStack() as xstack:
            request = (
                self.remote
                if isinstance(self.remote, Request)
                else Request(self.remote)
            )
            remote_io: IO[bytes] = xstack.enter_context(urlopen(request))

            # Download to a uniquely named temp file so concurrent runs
            # cannot collide; keep the archive-type suffix visible.
            name = (
                "".join(
                    random.choices(
                        string.ascii_uppercase + string.digits + string.ascii_lowercase,
                        k=10,
                    )
                )
                + f".{self.type.value}"
            )
            tmp_file = tmp_path / name
            logging.info("fetching %s to temp file %s", self.remote, tmp_file)
            with tmp_file.open("wb+") as tmp_io:
                shutil.copyfileobj(remote_io, tmp_io)

            archive_file: Union[ZipFile, TarFile]
            if self.type is ArchiveType.ZIP:
                archive_file = xstack.enter_context(ZipFile(tmp_file))
            elif self.type is ArchiveType.TAR_GZ:
                archive_file = xstack.enter_context(tarfile.open(tmp_file, mode="r:gz"))
            else:
                raise ValueError(f"invalid type {self.type}")

            for member_info in self._member_list(archive_file):
                member_filename = self._member_filename(member_info)
                if self._member_isdir(member_info):
                    logging.debug("Ignoring directory %s", member_filename)
                    continue

                match = self.pattern.match(member_filename)
                if match is None:
                    logging.debug("Ignoring unmatched %s", member_filename)
                    continue
                groups = match.groups()
                # BUGFIX: dest_filename was previously unbound (NameError)
                # when the pattern had no capture groups; fall back to the
                # member's own path in that case.
                dest_filename = groups[0] if groups else member_filename

                member_io: IO[bytes]
                with self._member_io(archive_file, member_info) as member_io:
                    local_file = self.local_path / dest_filename
                    # exist_ok avoids a check-then-create race and repeated
                    # exists() calls for members sharing a directory.
                    local_file.parent.mkdir(parents=True, exist_ok=True)
                    logging.debug("writing %s to %s", member_filename, local_file)
                    local_file.write_bytes(member_io.read())

            logging.info(
                "Downloaded %s and extracted files matching %s to %s",
                request.full_url,
                self.pattern,
                self.local_path,
            )

    @classmethod
    def _member_list(
        cls, archive: Union[ZipFile, TarFile]
    ) -> Union[List[ZipInfo], List[TarInfo]]:
        """Return the archive's member records, regardless of format."""
        if isinstance(archive, ZipFile):
            return archive.infolist()
        return archive.getmembers()

    @classmethod
    def _member_isdir(cls, member_info: Union[ZipInfo, TarInfo]) -> bool:
        """Return True if the member record is a directory entry."""
        if isinstance(member_info, ZipInfo):
            return member_info.is_dir()
        return member_info.isdir()

    @classmethod
    def _member_filename(cls, member_info: Union[ZipInfo, TarInfo]) -> str:
        """Return the member's path inside the archive."""
        if isinstance(member_info, ZipInfo):
            return member_info.filename
        return member_info.name

    @classmethod
    @contextmanager
    def _member_io(
        cls, archive: Union[ZipFile, TarFile], member_info: Union[ZipInfo, TarInfo]
    ) -> Generator[IO[bytes], None, None]:
        """Yield a binary stream for reading one archive member."""
        if isinstance(archive, ZipFile):
            assert isinstance(member_info, ZipInfo)
            with archive.open(member_info) as member_io:
                yield member_io
        else:
            assert isinstance(member_info, TarInfo)
            opt_io = archive.extractfile(member_info)
            # extractfile returns None only for non-file members, which were
            # filtered out by _member_isdir above.
            assert opt_io is not None
            yield opt_io


# Registry of all remote test-data resources; ``fetcher.py`` downloads each
# entry to its ``local_path`` under the test-data directory.
RESOURCES: List[Resource] = [
    # W3C N3 test suite pinned to a specific commit; only files under the
    # repository's ``tests/`` directory are extracted (leading directory
    # stripped by the capture group).
    ArchiveResource(
        remote="https://github.com/w3c/N3/archive/c44d123c5958ca04117e28ca3769e2c0820f72e6.zip",
        local_path=(DATA_PATH / "suites" / "w3c" / "n3"),
        type=ArchiveType.ZIP,
        pattern=re.compile(r"^[^\/]+[\/]tests[\/](.+)$"),
    ),
    # W3C Turtle test suite; the archive's single top-level directory is
    # stripped by the capture group.
    ArchiveResource(
        remote="https://www.w3.org/2013/TurtleTests/TESTS.tar.gz",
        local_path=(DATA_PATH / "suites" / "w3c" / "turtle"),
        type=ArchiveType.TAR_GZ,
        pattern=re.compile(r"^[^\/]+[\/](.+)$"),
    ),
    # W3C N-Quads test suite (archive has no wrapping directory).
    ArchiveResource(
        remote="https://www.w3.org/2013/N-QuadsTests/TESTS.tar.gz",
        local_path=(DATA_PATH / "suites" / "w3c" / "nquads"),
        type=ArchiveType.TAR_GZ,
        pattern=re.compile(r"^(.+)$"),
    ),
    # W3C N-Triples test suite (archive has no wrapping directory).
    ArchiveResource(
        remote="https://www.w3.org/2013/N-TriplesTests/TESTS.tar.gz",
        local_path=(DATA_PATH / "suites" / "w3c" / "ntriples"),
        type=ArchiveType.TAR_GZ,
        pattern=re.compile(r"^(.+)$"),
    ),
    # W3C TriG test suite (archive has no wrapping directory).
    ArchiveResource(
        remote="https://www.w3.org/2013/TrigTests/TESTS.tar.gz",
        local_path=(DATA_PATH / "suites" / "w3c" / "trig"),
        type=ArchiveType.TAR_GZ,
        pattern=re.compile(r"^(.+)$"),
    ),
    # NOTE: Commented out as these files contains local modifications.
    # ArchiveResource(
    #     remote="https://www.w3.org/2013/RDFXMLTests/TESTS.zip",
    #     local_path=(DATA_PATH / "suites" / "w3c" / "rdfxml"),
    #     type=ArchiveType.ZIP,
    #     pattern=re.compile(r"^(.+)$"),
    # ),
    # NOTE: Commented out as this contains local modifications.
    # ArchiveResource(
    #     remote="https://www.w3.org/2009/sparql/docs/tests/sparql11-test-suite-20121023.tar.gz",
    #     local_path=(DATA_PATH / "suites" / "DAWG" / "data-sparql11"),
    #     type=ArchiveType.TAR_GZ,
    #     pattern=re.compile(r"^[^\/]+[\/](.+)$"),
    # ),
    # RDFS schema fetched with content negotiation so the Turtle
    # serialization is returned.
    FileResource(
        remote=Request(
            "http://www.w3.org/2000/01/rdf-schema#", headers={"Accept": "text/turtle"}
        ),
        local_path=(DATA_PATH / "rdfs.ttl"),
    ),
]


@dataclass
class Application:
    """Command-line application that fetches the resources in ``RESOURCES``.

    With no positional arguments every resource is fetched; otherwise only
    resources whose ``local_path`` lies under one of the given paths.
    """

    parser: argparse.ArgumentParser = field(
        default_factory=lambda: argparse.ArgumentParser(add_help=True)
    )

    def __post_init__(self) -> None:
        """Register the CLI options on the parser."""
        parser = self.parser
        parser.add_argument(
            "-v",
            "--verbose",
            action="count",
            dest="verbosity",
            help="increase verbosity level",
        )
        parser.add_argument(
            "--keep-tmp",
            action="store_true",
            default=False,
            help="keep the temporary download directory for inspection",
        )
        parser.add_argument("paths", nargs="*", type=str)
        parser.set_defaults(handler=self.handle)

    def run(self, args: List[str]) -> None:
        """Parse *args*, adjust logging verbosity, and dispatch the handler."""
        parse_result = self.parser.parse_args(args)

        verbosity = parse_result.verbosity
        if verbosity is not None:
            root_logger = logging.getLogger("")
            root_logger.propagate = True
            # First -v drops one whole level (10); each additional -v (up to
            # 9 of them) shaves one more point off the numeric level.
            new_level = (
                root_logger.getEffectiveLevel()
                - (min(1, verbosity)) * 10
                - min(max(0, verbosity - 1), 9) * 1
            )
            root_logger.setLevel(new_level)

        logging.debug(
            "args = %s, parse_result = %s, logging.level = %s",
            args,
            parse_result,
            logging.getLogger("").getEffectiveLevel(),
        )

        parse_result.handler(parse_result)

    def handle(self, parse_result: argparse.Namespace) -> None:
        """Fetch all resources selected by ``parse_result.paths``."""
        logging.debug("entry ...")

        paths = {Path(path).absolute() for path in parse_result.paths}

        logging.debug("paths = %s", paths)

        with ExitStack() as xstack:
            if parse_result.keep_tmp:
                # mkdtemp is never cleaned up, so downloads stay around.
                tmp_path = Path(mkdtemp())
            else:
                # FIX: previously the TemporaryDirectory was only removed
                # when the object happened to be garbage collected; manage
                # it explicitly so cleanup is deterministic.
                tmp_path = Path(xstack.enter_context(TemporaryDirectory()))

            for resource in RESOURCES:
                if paths and not self._selected(resource.local_path, paths):
                    logging.info("skipping %s", resource.local_path)
                    continue
                resource.fetch(tmp_path)

    @staticmethod
    def _selected(local_path: Path, paths: "set[Path]") -> bool:
        """Return True if *local_path* lies under any of *paths*."""
        abs_path = local_path.absolute()
        for path in paths:
            try:
                abs_path.relative_to(path)
                return True
            except ValueError:
                # not relative to this path, keep looking
                pass
        return False


def main() -> None:
    """Configure root logging from the environment and run the fetcher CLI."""
    log_format = (
        "%(asctime)s.%(msecs)03d %(process)d %(thread)d %(levelno)03d:%(levelname)-8s "
        "%(name)-12s %(module)s:%(lineno)s:%(funcName)s %(message)s"
    )
    # PYLOGGING_LEVEL overrides the default INFO level.
    logging.basicConfig(
        level=os.environ.get("PYLOGGING_LEVEL", logging.INFO),
        stream=sys.stderr,
        datefmt="%Y-%m-%dT%H:%M:%S",
        format=log_format,
    )

    Application().run(sys.argv[1:])


if __name__ == "__main__":
    main()
4 changes: 4 additions & 0 deletions test/data/suites/w3c/n3/LICENSE.md
@@ -0,0 +1,4 @@
All documents in this Repository are licensed by contributors
under both the [W3C Test Suite License](http://www.w3.org/Consortium/Legal/2008/04-testsuite-license) and
the [W3C Software and Document License](https://www.w3.org/Consortium/Legal/copyright-software).

0 comments on commit d57d967

Please sign in to comment.