Skip to content

Commit

Permalink
Merge pull request #1140 from RDFLib/improve_graph_parse
Browse files Browse the repository at this point in the history
improved Graph().parse()
  • Loading branch information
ashleysommer committed Aug 27, 2020
2 parents 4be2749 + 3afffcd commit aa52774
Show file tree
Hide file tree
Showing 13 changed files with 125 additions and 29 deletions.
2 changes: 1 addition & 1 deletion rdflib/extras/describer.py
Expand Up @@ -102,7 +102,7 @@
... </cv:hasWorkHistory>
... </cv:CV>
... </rdf:RDF>
... ''')
... ''', format="xml")
>>>
>>> from rdflib.compare import isomorphic
>>> isomorphic(person_graph, expected) #doctest: +SKIP
Expand Down
36 changes: 21 additions & 15 deletions rdflib/graph.py
Expand Up @@ -24,6 +24,7 @@
from rdflib.resource import Resource
from rdflib.collection import Collection
import rdflib.util # avoid circular dependency
from rdflib.exceptions import ParserError

import os
import shutil
Expand Down Expand Up @@ -1000,7 +1001,7 @@ def parse(
**args
):
"""
Parse source adding the resulting triples to the Graph.
Parse an RDF source adding the resulting triples to the Graph.
The source is specified using one of source, location, file or
data.
Expand All @@ -1014,9 +1015,10 @@ def parse(
is specified.
- `file`: A file-like object.
- `data`: A string containing the data to be parsed.
- `format`: Used if format can not be determined from source.
Defaults to rdf/xml. Format support can be extended with plugins,
but "xml", "n3", "nt" & "trix" are built in.
- `format`: Used if the format cannot be determined from the source, e.g.
from its file extension or Media Type. Defaults to Turtle ("turtle").
Format support can be extended with plugins, but "xml", "n3" (use for
turtle), "nt" & "trix" are built in.
- `publicID`: the logical URI to use as the document base. If None
specified the document location is used (at least in the case where
there is a document location).
Expand Down Expand Up @@ -1062,6 +1064,11 @@ def parse(
>>> os.remove(file_name)
>>> # default turtle parsing
>>> result = g.parse(data="<http://example.com/a> <http://example.com/a> <http://example.com/a> .")
>>> len(g)
3
"""

source = create_input_source(
Expand All @@ -1074,7 +1081,7 @@ def parse(
)
if format is None:
format = source.content_type
assumed_xml = False
could_not_guess_format = False
if format is None:
if (
hasattr(source, "file")
Expand All @@ -1083,19 +1090,18 @@ def parse(
):
format = rdflib.util.guess_format(source.file.name)
if format is None:
format = "application/rdf+xml"
assumed_xml = True
format = "turtle"
could_not_guess_format = True
parser = plugin.get(format, Parser)()
try:
parser.parse(source, self, **args)
except SAXParseException as saxpe:
if assumed_xml:
logger.warning(
"Could not guess format for %r, so assumed xml."
" You can explicitly specify format using the format argument."
% source
)
raise saxpe
except SyntaxError as se:
if could_not_guess_format:
raise ParserError(
"Could not guess RDF format for %r from file extension so tried Turtle but failed."
"You can explicitly specify format using the format argument." % source)
else:
raise se
finally:
if source.auto_close:
source.close()
Expand Down
2 changes: 1 addition & 1 deletion rdflib/util.py
Expand Up @@ -352,8 +352,8 @@ def parse_date_time(val):


SUFFIX_FORMAT_MAP = {
"xml": "xml",
"rdf": "xml",
"rdfs": "xml",
"owl": "xml",
"n3": "n3",
"ttl": "turtle",
Expand Down
29 changes: 29 additions & 0 deletions test/rdf/datatypes/test001.borked
@@ -0,0 +1,29 @@
<?xml version="1.0"?>

<!--
Copyright World Wide Web Consortium, (Massachusetts Institute of
Technology, Institut National de Recherche en Informatique et en
Automatique, Keio University).
All Rights Reserved.
Please see the full Copyright clause at
<http://www.w3.org/Consortium/Legal/copyright-software.html>
Description: A simple datatype production; a language+
datatype production. Simply duplicate the constructs under
http://www.w3.org/2000/10/rdf-tests/rdfcore/ntriples/test.nt
$Id: test001.rdf,v 1.2 2002/11/20 14:51:34 jgrant Exp $
-->

<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns:eg="http://example.org/">

<rdf:Description rdf:about="http://example.org/foo">
<eg:bar rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">10</eg:bar>
<eg:baz rdf:datatype="http://www.w3.org/2001/XMLSchema#integer" xml:lang="fr">10</eg:baz>
</rdf:Description>

</rdf:RDF>
63 changes: 62 additions & 1 deletion test/test_graph.py
Expand Up @@ -5,7 +5,9 @@
from tempfile import mkdtemp, mkstemp
import shutil

from rdflib import URIRef, RDF, Graph, plugin
from rdflib import URIRef, Graph, plugin
from rdflib.exceptions import ParserError
from rdflib.plugin import PluginException

from nose.exc import SkipTest

Expand Down Expand Up @@ -248,6 +250,65 @@ def testGraphIntersection(self):

self.assertEqual((michel, likes, cheese) in g1, True)

def testGuessFormatForParse(self):
    """Exercise Graph.parse() with and without an explicit ``format``.

    Three kinds of input are tried in turn -- local files, in-memory
    strings and remote locations -- each on a fresh Graph that is also
    stored in ``self.graph``. The remote section fetches live URLs.
    """
    turtle_triple = "<http://example.com/a> <http://example.com/a> <http://example.com/a> ."
    rdf_xml = """<rdf:RDF
xmlns:ns1="http://example.org/#"
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
>
<rdf:Description rdf:nodeID="ub63bL2C1">
<ns1:p rdf:resource="http://example.org/q"/>
<ns1:r rdf:resource="http://example.org/s"/>
</rdf:Description>
<rdf:Description rdf:nodeID="ub63bL5C1">
<ns1:r>
<rdf:Description rdf:nodeID="ub63bL6C11">
<ns1:s rdf:resource="http://example.org/#t"/>
</rdf:Description>
</ns1:r>
<ns1:p rdf:resource="http://example.org/q"/>
</rdf:Description>
</rdf:RDF>
"""

    # ---- local files ----
    file_graph = self.graph = Graph()

    # This module is Python source, not RDF, so parsing it must fail.
    with self.assertRaises(ParserError):
        file_graph.parse(__file__)

    # No format given: an N-Triples file, then an RDF/XML file.
    for path in ("test/nt/anons-01.nt", "test/rdf/datatypes/test001.rdf"):
        file_graph.parse(path)

    # Unhelpful file name, so the format is stated explicitly.
    file_graph.parse("test/rdf/datatypes/test001.borked", format="xml")

    # ---- in-memory strings ----
    string_graph = self.graph = Graph()

    # Text that is not RDF must be rejected.
    with self.assertRaises(ParserError):
        string_graph.parse(data="rubbish")

    # The same triple, first without a format and then as explicit Turtle.
    string_graph.parse(data=turtle_triple)
    string_graph.parse(data=turtle_triple, format="turtle")

    # RDF/XML supplied as a string with its format named.
    string_graph.parse(data=rdf_xml, format="xml")

    # ---- remote locations (needs network access) ----
    remote_graph = self.graph = Graph()

    # An ordinary web page (presumably served as HTML) is expected to
    # surface as a PluginException rather than a ParserError.
    with self.assertRaises(PluginException):
        remote_graph.parse(location="https://www.google.com")

    # Two URLs ending in .ttl / .rdf, then a persistent Australian
    # Government resource whose URL has no file-like ending.
    for url in (
        "http://www.w3.org/ns/adms.ttl",
        "http://www.w3.org/ns/adms.rdf",
        "https://linked.data.gov.au/def/agrif?_format=text/turtle",
    ):
        remote_graph.parse(location=url)


# dynamically create classes for each registered Store

Expand Down
4 changes: 2 additions & 2 deletions test/test_issue247.py
Expand Up @@ -38,7 +38,7 @@ def test_successful_parse_of_literal_without_xmllang_attr(self):
it contains a XML Literal with a xml:lang attribute:
"""
g = rdflib.Graph()
g.parse(data=passxml)
g.parse(data=passxml, format="xml")

def test_failing_parse_of_literal_with_xmllang_attr(self):
"""
Expand All @@ -47,7 +47,7 @@ def test_failing_parse_of_literal_with_xmllang_attr(self):
it contains a XML Literal with a xml:lang attribute:
"""
g = rdflib.Graph()
g.parse(data=failxml)
g.parse(data=failxml, format="xml")


if __name__ == "__main__":
Expand Down
2 changes: 1 addition & 1 deletion test/test_issue363.py
Expand Up @@ -38,7 +38,7 @@ def p():


def test_parsetype_resource():
g = rdflib.Graph().parse(data=data2)
g = rdflib.Graph().parse(data=data2, format="xml")
print(g.serialize(format="n3"))


Expand Down
2 changes: 1 addition & 1 deletion test/test_issue_git_336.py
Expand Up @@ -37,7 +37,7 @@ def test_ns_localname_roundtrip():
xmldump = g.serialize().decode("utf-8")
g1 = rdflib.Graph()

g1.parse(data=xmldump)
g1.parse(data=xmldump, format="xml")

g1.parse(data=turtledump, format="turtle")

Expand Down
2 changes: 1 addition & 1 deletion test/test_literal.py
Expand Up @@ -33,7 +33,7 @@ def test_backslash(self):
</rdf:RDF>
"""
g = rdflib.Graph()
g.parse(data=d)
g.parse(data=d, format="xml")
a = rdflib.Literal("a\\b")
b = list(g.objects())[0]
self.assertEqual(a, b)
Expand Down
7 changes: 3 additions & 4 deletions test/test_parse_file_guess_format.py
Expand Up @@ -3,7 +3,7 @@
from shutil import copyfile
from tempfile import TemporaryDirectory

from xml.sax import SAXParseException
from rdflib.exceptions import ParserError

from rdflib import Graph, logger as graph_logger

Expand All @@ -21,11 +21,10 @@ def test_warning(self):
g = Graph()
with TemporaryDirectory() as tmpdirname:
newpath = Path(tmpdirname).joinpath("no_file_ext")
copyfile("test/w3c/turtle/IRI_subject.ttl", str(newpath))
copyfile("test/rdf/Manifest.rdf", str(newpath))
with self.assertLogs(graph_logger, "WARNING") as log_cm:
with self.assertRaises(SAXParseException):
with self.assertRaises(ParserError):
g.parse(str(newpath))
self.assertTrue(any("Could not guess format" in msg for msg in log_cm.output))


if __name__ == '__main__':
Expand Down
1 change: 1 addition & 0 deletions test/test_parser.py
Expand Up @@ -33,6 +33,7 @@ def testNoPathWithHash(self):
</rdf:RDF>
""",
format="xml",
publicID="http://example.org",
)

Expand Down
2 changes: 1 addition & 1 deletion test/test_seq.py
Expand Up @@ -29,7 +29,7 @@ class SeqTestCase(unittest.TestCase):
def setUp(self):
store = self.store = Graph(store=self.backend)
store.open(self.path)
store.parse(data=s)
store.parse(data=s, format="xml")

def tearDown(self):
self.store.close()
Expand Down
2 changes: 1 addition & 1 deletion test/test_xmlliterals.py
Expand Up @@ -42,7 +42,7 @@ def testRDFXMLParse():
</rdf:RDF>"""

g = rdflib.Graph()
g.parse(data=rdfxml)
g.parse(data=rdfxml, format="xml")
l1 = list(g)[0][2]
assert l1.datatype == RDF.XMLLiteral

Expand Down

0 comments on commit aa52774

Please sign in to comment.