Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

improved Graph().parse() #1140

Merged
merged 2 commits into from
Aug 27, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 1 addition & 1 deletion rdflib/extras/describer.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@
... </cv:hasWorkHistory>
... </cv:CV>
... </rdf:RDF>
... ''')
... ''', format="xml")
>>>
>>> from rdflib.compare import isomorphic
>>> isomorphic(person_graph, expected) #doctest: +SKIP
Expand Down
36 changes: 21 additions & 15 deletions rdflib/graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
from rdflib.resource import Resource
from rdflib.collection import Collection
import rdflib.util # avoid circular dependency
from rdflib.exceptions import ParserError

import os
import shutil
Expand Down Expand Up @@ -1000,7 +1001,7 @@ def parse(
**args
):
"""
Parse source adding the resulting triples to the Graph.
Parse an RDF source adding the resulting triples to the Graph.

The source is specified using one of source, location, file or
data.
Expand All @@ -1014,9 +1015,10 @@ def parse(
is specified.
- `file`: A file-like object.
- `data`: A string containing the data to be parsed.
- `format`: Used if format can not be determined from source.
Defaults to rdf/xml. Format support can be extended with plugins,
but "xml", "n3", "nt" & "trix" are built in.
- `format`: Used if format can not be determined from source, e.g. file
extension or Media Type. Defaults to text/turtle. Format support can
be extended with plugins, but "xml", "n3" (use for turtle), "nt" &
"trix" are built in.
- `publicID`: the logical URI to use as the document base. If None
specified the document location is used (at least in the case where
there is a document location).
Expand Down Expand Up @@ -1062,6 +1064,11 @@ def parse(

>>> os.remove(file_name)

>>> # default turtle parsing
>>> result = g.parse(data="<http://example.com/a> <http://example.com/a> <http://example.com/a> .")
>>> len(g)
3

"""

source = create_input_source(
Expand All @@ -1074,7 +1081,7 @@ def parse(
)
if format is None:
format = source.content_type
assumed_xml = False
could_not_guess_format = False
if format is None:
if (
hasattr(source, "file")
Expand All @@ -1083,19 +1090,18 @@ def parse(
):
format = rdflib.util.guess_format(source.file.name)
if format is None:
format = "application/rdf+xml"
assumed_xml = True
format = "turtle"
could_not_guess_format = True
parser = plugin.get(format, Parser)()
try:
parser.parse(source, self, **args)
except SAXParseException as saxpe:
if assumed_xml:
logger.warning(
"Could not guess format for %r, so assumed xml."
" You can explicitly specify format using the format argument."
% source
)
raise saxpe
except SyntaxError as se:
if could_not_guess_format:
raise ParserError(
"Could not guess RDF format for %r from file extension so tried Turtle but failed."
"You can explicitly specify format using the format argument." % source)
else:
raise se
finally:
if source.auto_close:
source.close()
Expand Down
2 changes: 1 addition & 1 deletion rdflib/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -352,8 +352,8 @@ def parse_date_time(val):


SUFFIX_FORMAT_MAP = {
"xml": "xml",
"rdf": "xml",
"rdfs": "xml",
"owl": "xml",
"n3": "n3",
"ttl": "turtle",
Expand Down
29 changes: 29 additions & 0 deletions test/rdf/datatypes/test001.borked
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
<?xml version="1.0"?>

<!--
Copyright World Wide Web Consortium, (Massachusetts Institute of
Technology, Institut National de Recherche en Informatique et en
Automatique, Keio University).

All Rights Reserved.

Please see the full Copyright clause at
<http://www.w3.org/Consortium/Legal/copyright-software.html>

Description: A simple datatype production; a language+
datatype production. Simply duplicate the constructs under
http://www.w3.org/2000/10/rdf-tests/rdfcore/ntriples/test.nt

$Id: test001.rdf,v 1.2 2002/11/20 14:51:34 jgrant Exp $

-->

<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns:eg="http://example.org/">

<rdf:Description rdf:about="http://example.org/foo">
<eg:bar rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">10</eg:bar>
<eg:baz rdf:datatype="http://www.w3.org/2001/XMLSchema#integer" xml:lang="fr">10</eg:baz>
</rdf:Description>

</rdf:RDF>
63 changes: 62 additions & 1 deletion test/test_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,9 @@
from tempfile import mkdtemp, mkstemp
import shutil

from rdflib import URIRef, RDF, Graph, plugin
from rdflib import URIRef, Graph, plugin
from rdflib.exceptions import ParserError
from rdflib.plugin import PluginException

from nose.exc import SkipTest

Expand Down Expand Up @@ -248,6 +250,65 @@ def testGraphIntersection(self):

self.assertEqual((michel, likes, cheese) in g1, True)

def testGuessFormatForParse(self):
    """Exercise ``Graph.parse()`` format selection for files, strings and URLs.

    Each section starts from a fresh ``Graph``. Inputs that are not RDF are
    expected to raise; the remaining calls only need to complete without
    raising. The URL section performs live HTTP requests.
    """
    one_triple = "<http://example.com/a> <http://example.com/a> <http://example.com/a> ."

    # ---- files ----
    self.graph = Graph()

    # here we are trying to parse a Python file!!
    self.assertRaises(ParserError, self.graph.parse, __file__)

    # .nt can be parsed by Turtle Parser
    self.graph.parse("test/nt/anons-01.nt")
    # RDF/XML
    self.graph.parse("test/rdf/datatypes/test001.rdf")
    # bad filename but set format
    self.graph.parse("test/rdf/datatypes/test001.borked", format="xml")

    # ---- strings ----
    self.graph = Graph()

    self.assertRaises(ParserError, self.graph.parse, data="rubbish")

    # Turtle - default (no format argument)
    self.graph.parse(data=one_triple)

    # Turtle - format given
    self.graph.parse(data=one_triple, format="turtle")

    # RDF/XML - format given
    rdf_xml_doc = """<rdf:RDF
xmlns:ns1="http://example.org/#"
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
>
<rdf:Description rdf:nodeID="ub63bL2C1">
<ns1:p rdf:resource="http://example.org/q"/>
<ns1:r rdf:resource="http://example.org/s"/>
</rdf:Description>
<rdf:Description rdf:nodeID="ub63bL5C1">
<ns1:r>
<rdf:Description rdf:nodeID="ub63bL6C11">
<ns1:s rdf:resource="http://example.org/#t"/>
</rdf:Description>
</ns1:r>
<ns1:p rdf:resource="http://example.org/q"/>
</rdf:Description>
</rdf:RDF>
"""
    self.graph.parse(data=rdf_xml_doc, format="xml")

    # ---- URI ----
    self.graph = Graph()

    # only getting HTML
    self.assertRaises(
        PluginException, self.graph.parse, location="https://www.google.com"
    )

    remote_sources = (
        "http://www.w3.org/ns/adms.ttl",
        "http://www.w3.org/ns/adms.rdf",
        # persistent Australian Government online RDF resource without a file-like ending
        "https://linked.data.gov.au/def/agrif?_format=text/turtle",
    )
    for url in remote_sources:
        self.graph.parse(location=url)


# dynamically create classes for each registered Store

Expand Down
4 changes: 2 additions & 2 deletions test/test_issue247.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def test_successful_parse_of_literal_without_xmllang_attr(self):
it contains a XML Literal with a xml:lang attribute:
"""
g = rdflib.Graph()
g.parse(data=passxml)
g.parse(data=passxml, format="xml")

def test_failing_parse_of_literal_with_xmllang_attr(self):
"""
Expand All @@ -47,7 +47,7 @@ def test_failing_parse_of_literal_with_xmllang_attr(self):
it contains a XML Literal with a xml:lang attribute:
"""
g = rdflib.Graph()
g.parse(data=failxml)
g.parse(data=failxml, format="xml")


if __name__ == "__main__":
Expand Down
2 changes: 1 addition & 1 deletion test/test_issue363.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def p():


def test_parsetype_resource():
    """Parse the module-level ``data2`` document as RDF/XML and print it as N3.

    The format is passed explicitly: when ``Graph.parse()`` cannot determine a
    format it no longer falls back to RDF/XML, so the stale format-less call
    that preceded this one has been dropped.
    """
    g = rdflib.Graph().parse(data=data2, format="xml")
    print(g.serialize(format="n3"))


Expand Down
2 changes: 1 addition & 1 deletion test/test_issue_git_336.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ def test_ns_localname_roundtrip():
xmldump = g.serialize().decode("utf-8")
g1 = rdflib.Graph()

g1.parse(data=xmldump)
g1.parse(data=xmldump, format="xml")

g1.parse(data=turtledump, format="turtle")

Expand Down
2 changes: 1 addition & 1 deletion test/test_literal.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def test_backslash(self):
</rdf:RDF>
"""
g = rdflib.Graph()
g.parse(data=d)
g.parse(data=d, format="xml")
a = rdflib.Literal("a\\b")
b = list(g.objects())[0]
self.assertEqual(a, b)
Expand Down
7 changes: 3 additions & 4 deletions test/test_parse_file_guess_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from shutil import copyfile
from tempfile import TemporaryDirectory

from xml.sax import SAXParseException
from rdflib.exceptions import ParserError

from rdflib import Graph, logger as graph_logger

Expand All @@ -21,11 +21,10 @@ def test_warning(self):
g = Graph()
with TemporaryDirectory() as tmpdirname:
newpath = Path(tmpdirname).joinpath("no_file_ext")
copyfile("test/w3c/turtle/IRI_subject.ttl", str(newpath))
copyfile("test/rdf/Manifest.rdf", str(newpath))
with self.assertLogs(graph_logger, "WARNING") as log_cm:
with self.assertRaises(SAXParseException):
with self.assertRaises(ParserError):
g.parse(str(newpath))
self.assertTrue(any("Could not guess format" in msg for msg in log_cm.output))


if __name__ == '__main__':
Expand Down
1 change: 1 addition & 0 deletions test/test_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ def testNoPathWithHash(self):

</rdf:RDF>
""",
format="xml",
publicID="http://example.org",
)

Expand Down
2 changes: 1 addition & 1 deletion test/test_seq.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ class SeqTestCase(unittest.TestCase):
def setUp(self):
    """Open a graph on the configured backend and load the fixture ``s`` once.

    The fixture is parsed with ``format="xml"`` only; the earlier call without
    a format was a stale duplicate that would either fail under the non-XML
    default or load the same data a second time.
    """
    store = self.store = Graph(store=self.backend)
    store.open(self.path)
    store.parse(data=s, format="xml")

def tearDown(self):
self.store.close()
Expand Down
2 changes: 1 addition & 1 deletion test/test_xmlliterals.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ def testRDFXMLParse():
</rdf:RDF>"""

g = rdflib.Graph()
g.parse(data=rdfxml)
g.parse(data=rdfxml, format="xml")
l1 = list(g)[0][2]
assert l1.datatype == RDF.XMLLiteral

Expand Down