Merge pull request #1140 from RDFLib/improve_graph_parse

improved Graph().parse()
RDFLib · Aug 27, 2020 · aa52774 · aa52774
2 parents 4be2749 + 3afffcd
commit aa52774
Show file tree

Hide file tree

Showing 13 changed files with 125 additions and 29 deletions.
diff --git a/rdflib/extras/describer.py b/rdflib/extras/describer.py
@@ -102,7 +102,7 @@
     ...     </cv:hasWorkHistory>
     ...   </cv:CV>
     ... </rdf:RDF>
-    ... ''')
+    ... ''', format="xml")
     >>>
     >>> from rdflib.compare import isomorphic
     >>> isomorphic(person_graph, expected)  #doctest: +SKIP

diff --git a/rdflib/graph.py b/rdflib/graph.py
@@ -24,6 +24,7 @@
 from rdflib.resource import Resource
 from rdflib.collection import Collection
 import rdflib.util  # avoid circular dependency
+from rdflib.exceptions import ParserError
 
 import os
 import shutil
@@ -1000,7 +1001,7 @@ def parse(
         **args
     ):
         """
-        Parse source adding the resulting triples to the Graph.
+        Parse an RDF source adding the resulting triples to the Graph.
 
         The source is specified using one of source, location, file or
         data.
@@ -1014,9 +1015,10 @@ def parse(
             is specified.
           - `file`: A file-like object.
           - `data`: A string containing the data to be parsed.
-          - `format`: Used if format can not be determined from source.
-            Defaults to rdf/xml. Format support can be extended with plugins,
-            but "xml", "n3", "nt" & "trix" are built in.
+          - `format`: Used if format cannot be determined from source, e.g. file
+            extension or Media Type. Defaults to "turtle". Format support can
+            be extended with plugins, but "xml", "n3" (use for turtle), "nt" &
+            "trix" are built in.
           - `publicID`: the logical URI to use as the document base. If None
             specified the document location is used (at least in the case where
             there is a document location).
@@ -1062,6 +1064,11 @@ def parse(
 
         >>> os.remove(file_name)
 
+        >>> # default turtle parsing
+        >>> result = g.parse(data="<http://example.com/a> <http://example.com/a> <http://example.com/a> .")
+        >>> len(g)
+        3
+
         """
 
         source = create_input_source(
@@ -1074,7 +1081,7 @@ def parse(
         )
         if format is None:
             format = source.content_type
-        assumed_xml = False
+        could_not_guess_format = False
         if format is None:
             if (
                 hasattr(source, "file")
@@ -1083,19 +1090,18 @@ def parse(
             ):
                 format = rdflib.util.guess_format(source.file.name)
             if format is None:
-                format = "application/rdf+xml"
-                assumed_xml = True
+                format = "turtle"
+                could_not_guess_format = True
         parser = plugin.get(format, Parser)()
         try:
             parser.parse(source, self, **args)
-        except SAXParseException as saxpe:
-            if assumed_xml:
-                logger.warning(
-                    "Could not guess format for %r, so assumed xml."
-                    " You can explicitly specify format using the format argument."
-                    % source
-                )
-            raise saxpe
+        except SyntaxError as se:
+            if could_not_guess_format:
+                raise ParserError(
+                    "Could not guess RDF format for %r from file extension so tried Turtle but failed. "
+                    "You can explicitly specify format using the format argument." % source)
+            else:
+                raise se
         finally:
             if source.auto_close:
                 source.close()

diff --git a/rdflib/util.py b/rdflib/util.py
@@ -352,8 +352,8 @@ def parse_date_time(val):
 
 
 SUFFIX_FORMAT_MAP = {
+    "xml": "xml",
     "rdf": "xml",
-    "rdfs": "xml",
     "owl": "xml",
     "n3": "n3",
     "ttl": "turtle",

diff --git a/test/rdf/datatypes/test001.borked b/test/rdf/datatypes/test001.borked
@@ -0,0 +1,29 @@
+<?xml version="1.0"?>
+
+<!--
+  Copyright World Wide Web Consortium, (Massachusetts Institute of
+  Technology, Institut National de Recherche en Informatique et en
+  Automatique, Keio University).
+ 
+  All Rights Reserved.
+ 
+  Please see the full Copyright clause at
+  <http://www.w3.org/Consortium/Legal/copyright-software.html>
+
+  Description: A simple datatype production; a language+
+	datatype production. Simply duplicate the constructs under
+	http://www.w3.org/2000/10/rdf-tests/rdfcore/ntriples/test.nt
+
+  $Id: test001.rdf,v 1.2 2002/11/20 14:51:34 jgrant Exp $
+
+-->
+
+<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+         xmlns:eg="http://example.org/">
+
+ <rdf:Description rdf:about="http://example.org/foo">
+   <eg:bar rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">10</eg:bar>
+   <eg:baz rdf:datatype="http://www.w3.org/2001/XMLSchema#integer" xml:lang="fr">10</eg:baz>
+ </rdf:Description>
+
+</rdf:RDF>
diff --git a/test/test_graph.py b/test/test_graph.py
@@ -5,7 +5,9 @@
 from tempfile import mkdtemp, mkstemp
 import shutil
 
-from rdflib import URIRef, RDF, Graph, plugin
+from rdflib import URIRef, Graph, plugin
+from rdflib.exceptions import ParserError
+from rdflib.plugin import PluginException
 
 from nose.exc import SkipTest
 
@@ -248,6 +250,65 @@ def testGraphIntersection(self):
 
         self.assertEqual((michel, likes, cheese) in g1, True)
 
+    def testGuessFormatForParse(self):
+        self.graph = Graph()
+
+        # files
+        with self.assertRaises(ParserError):
+            self.graph.parse(__file__)  # here we are trying to parse a Python file!!
+
+        # .nt can be parsed by Turtle Parser
+        self.graph.parse("test/nt/anons-01.nt")
+        # RDF/XML
+        self.graph.parse("test/rdf/datatypes/test001.rdf")  # XML
+        # bad filename but set format
+        self.graph.parse("test/rdf/datatypes/test001.borked", format="xml")
+
+        # strings
+        self.graph = Graph()
+
+        with self.assertRaises(ParserError):
+            self.graph.parse(data="rubbish")
+
+        # Turtle - default
+        self.graph.parse(data="<http://example.com/a> <http://example.com/a> <http://example.com/a> .")
+
+        # Turtle - format given
+        self.graph.parse(data="<http://example.com/a> <http://example.com/a> <http://example.com/a> .", format="turtle")
+
+        # RDF/XML - format given
+        rdf = """<rdf:RDF
+  xmlns:ns1="http://example.org/#"
+  xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+>
+  <rdf:Description rdf:nodeID="ub63bL2C1">
+    <ns1:p rdf:resource="http://example.org/q"/>
+    <ns1:r rdf:resource="http://example.org/s"/>
+  </rdf:Description>
+  <rdf:Description rdf:nodeID="ub63bL5C1">
+    <ns1:r>
+      <rdf:Description rdf:nodeID="ub63bL6C11">
+        <ns1:s rdf:resource="http://example.org/#t"/>
+      </rdf:Description>
+    </ns1:r>
+    <ns1:p rdf:resource="http://example.org/q"/>
+  </rdf:Description>
+</rdf:RDF>        
+        """
+        self.graph.parse(data=rdf, format="xml")
+
+        # URI
+        self.graph = Graph()
+
+        # only getting HTML
+        with self.assertRaises(PluginException):
+            self.graph.parse(location="https://www.google.com")
+
+        self.graph.parse(location="http://www.w3.org/ns/adms.ttl")
+        self.graph.parse(location="http://www.w3.org/ns/adms.rdf")
+        # persistent Australian Government online RDF resource without a file-like ending
+        self.graph.parse(location="https://linked.data.gov.au/def/agrif?_format=text/turtle")
+
 
 # dynamically create classes for each registered Store
 

diff --git a/test/test_issue247.py b/test/test_issue247.py
@@ -38,7 +38,7 @@ def test_successful_parse_of_literal_without_xmllang_attr(self):
         it contains a XML Literal with a xml:lang attribute:
         """
         g = rdflib.Graph()
-        g.parse(data=passxml)
+        g.parse(data=passxml, format="xml")
 
     def test_failing_parse_of_literal_with_xmllang_attr(self):
         """
@@ -47,7 +47,7 @@ def test_failing_parse_of_literal_with_xmllang_attr(self):
         it contains a XML Literal with a xml:lang attribute:
         """
         g = rdflib.Graph()
-        g.parse(data=failxml)
+        g.parse(data=failxml, format="xml")
 
 
 if __name__ == "__main__":

diff --git a/test/test_issue363.py b/test/test_issue363.py
@@ -38,7 +38,7 @@ def p():
 
 
 def test_parsetype_resource():
-    g = rdflib.Graph().parse(data=data2)
+    g = rdflib.Graph().parse(data=data2, format="xml")
     print(g.serialize(format="n3"))
 
 

diff --git a/test/test_issue_git_336.py b/test/test_issue_git_336.py
@@ -37,7 +37,7 @@ def test_ns_localname_roundtrip():
     xmldump = g.serialize().decode("utf-8")
     g1 = rdflib.Graph()
 
-    g1.parse(data=xmldump)
+    g1.parse(data=xmldump, format="xml")
 
     g1.parse(data=turtledump, format="turtle")
 

diff --git a/test/test_literal.py b/test/test_literal.py
@@ -33,7 +33,7 @@ def test_backslash(self):
 </rdf:RDF>
 """
         g = rdflib.Graph()
-        g.parse(data=d)
+        g.parse(data=d, format="xml")
         a = rdflib.Literal("a\\b")
         b = list(g.objects())[0]
         self.assertEqual(a, b)

diff --git a/test/test_parse_file_guess_format.py b/test/test_parse_file_guess_format.py
@@ -3,7 +3,7 @@
 from shutil import copyfile
 from tempfile import TemporaryDirectory
 
-from xml.sax import SAXParseException
+from rdflib.exceptions import ParserError
 
 from rdflib import Graph, logger as graph_logger
 
@@ -21,11 +21,10 @@ def test_warning(self):
         g = Graph()
         with TemporaryDirectory() as tmpdirname:
             newpath = Path(tmpdirname).joinpath("no_file_ext")
-            copyfile("test/w3c/turtle/IRI_subject.ttl", str(newpath))
+            copyfile("test/rdf/Manifest.rdf", str(newpath))
             with self.assertLogs(graph_logger, "WARNING") as log_cm:
-                with self.assertRaises(SAXParseException):
+                with self.assertRaises(ParserError):
                     g.parse(str(newpath))
-            self.assertTrue(any("Could not guess format" in msg for msg in log_cm.output))
 
 
 if __name__ == '__main__':

diff --git a/test/test_parser.py b/test/test_parser.py
@@ -33,6 +33,7 @@ def testNoPathWithHash(self):
 
 </rdf:RDF>
 """,
+            format="xml",
             publicID="http://example.org",
         )
 

diff --git a/test/test_seq.py b/test/test_seq.py
@@ -29,7 +29,7 @@ class SeqTestCase(unittest.TestCase):
     def setUp(self):
         store = self.store = Graph(store=self.backend)
         store.open(self.path)
-        store.parse(data=s)
+        store.parse(data=s, format="xml")
 
     def tearDown(self):
         self.store.close()

diff --git a/test/test_xmlliterals.py b/test/test_xmlliterals.py
@@ -42,7 +42,7 @@ def testRDFXMLParse():
 </rdf:RDF>"""
 
     g = rdflib.Graph()
-    g.parse(data=rdfxml)
+    g.parse(data=rdfxml, format="xml")
     l1 = list(g)[0][2]
     assert l1.datatype == RDF.XMLLiteral