Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make parsers CharacterStream aware #1145

Merged
merged 1 commit into from
Aug 23, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/plugin_parsers.rst
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ mdata :class:`~rdflib.plugins.parsers.structureddata.MicrodataParser`
microdata :class:`~rdflib.plugins.parsers.structureddata.MicrodataParser`
n3 :class:`~rdflib.plugins.parsers.notation3.N3Parser`
nquads :class:`~rdflib.plugins.parsers.nquads.NQuadsParser`
nt :class:`~rdflib.plugins.parsers.nt.NTParser`
nt :class:`~rdflib.plugins.parsers.ntriples.NTParser`
rdfa :class:`~rdflib.plugins.parsers.structureddata.RDFaParser`
rdfa1.0 :class:`~rdflib.plugins.parsers.structureddata.RDFa10Parser`
rdfa1.1 :class:`~rdflib.plugins.parsers.structureddata.RDFaParser`
Expand Down
104 changes: 85 additions & 19 deletions rdflib/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,11 @@
want to do so through the Graph class parse method.

"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import codecs
import os
import sys

from io import BytesIO

from io import BytesIO, TextIOBase, TextIOWrapper, StringIO, BufferedIOBase

from urllib.request import pathname2url
from urllib.request import Request
Expand All @@ -41,13 +37,46 @@


class Parser(object):
    """Minimal parser base type.

    Both methods are no-ops here; presumably concrete parsers override
    ``parse`` (confirm against the plugin implementations).
    """

    # Empty slots: instances carry no per-instance ``__dict__``.
    __slots__ = set()

    def __init__(self):
        """Create a parser; there is no state to initialise."""

    def parse(self, source, sink):
        """Do nothing and return ``None``.

        :param source: the input to parse (unused here).
        :param sink: the destination for parsed data (unused here).
        """


class BytesIOWrapper(BufferedIOBase):
    """Read-only binary stream over a ``str``, encoded lazily.

    The wrapped string is encoded with ``encoding`` the first time
    ``read`` or ``read1`` is called; the resulting bytes are kept in an
    in-memory ``BytesIO`` that serves that call and all later reads.

    ``readinto``, ``readinto1`` and ``write`` are not supported and raise
    ``NotImplementedError``.
    """

    __slots__ = ("wrapped", "encoded", "encoding")

    def __init__(self, wrapped: str, encoding="utf-8"):
        """
        :param wrapped: the text to expose as bytes.
        :param encoding: codec name used to encode ``wrapped``.
        """
        super(BytesIOWrapper, self).__init__()
        self.wrapped = wrapped
        self.encoding = encoding
        # Becomes a BytesIO on the first read()/read1() call.
        self.encoded = None

    def _encoded_stream(self):
        """Return the backing ``BytesIO``, encoding ``wrapped`` on first use."""
        if self.encoded is None:
            # A codec encoder returns ``(bytes, chars_consumed)``; only the
            # bytes belong in the buffer.
            data, _consumed = codecs.getencoder(self.encoding)(self.wrapped)
            self.encoded = BytesIO(data)
        return self.encoded

    def read(self, *args, **kwargs):
        """Read encoded bytes; arguments are passed to ``BytesIO.read``."""
        return self._encoded_stream().read(*args, **kwargs)

    def read1(self, *args, **kwargs):
        """Read encoded bytes; arguments are passed to ``BytesIO.read1``."""
        return self._encoded_stream().read1(*args, **kwargs)

    def readinto(self, *args, **kwargs):
        """Unsupported; always raises ``NotImplementedError``."""
        raise NotImplementedError()

    def readinto1(self, *args, **kwargs):
        """Unsupported; always raises ``NotImplementedError``."""
        raise NotImplementedError()

    def write(self, *args, **kwargs):
        """Unsupported (the stream is read-only); always raises
        ``NotImplementedError``."""
        raise NotImplementedError()


class InputSource(xmlreader.InputSource, object):
"""
TODO:
Expand All @@ -59,23 +88,39 @@ def __init__(self, system_id=None):
self.auto_close = False # see Graph.parse(), true if opened by us

def close(self):
    """Close the character stream and then the byte stream, best-effort.

    Each stream is looked up through its getter and closed at most once,
    and only if it is truthy and has a ``close`` attribute. An exception
    raised by a stream's ``close()`` is deliberately suppressed so that
    releasing an input source never fails.
    """
    for get_stream in (self.getCharacterStream, self.getByteStream):
        stream = get_stream()
        if stream and hasattr(stream, "close"):
            try:
                stream.close()
            except Exception:
                # Best-effort cleanup: a failing close() is ignored.
                pass


class StringInputSource(InputSource):
"""
TODO:
Constructs an RDFLib Parser InputSource from a Python String or Bytes
"""

def __init__(self, value, system_id=None):
def __init__(self, value, encoding="utf-8", system_id=None):
super(StringInputSource, self).__init__(system_id)
stream = BytesIO(value)
self.setByteStream(stream)
# TODO:
# encoding = value.encoding
# self.setEncoding(encoding)
if isinstance(value, str):
stream = StringIO(value)
self.setCharacterStream(stream)
self.setEncoding(encoding)
b_stream = BytesIOWrapper(value, encoding)
self.setByteStream(b_stream)
else:
stream = BytesIO(value)
self.setByteStream(stream)
c_stream = TextIOWrapper(stream, encoding)
self.setCharacterStream(c_stream)
self.setEncoding(c_stream.encoding)


headers = {
Expand Down Expand Up @@ -134,8 +179,18 @@ def __init__(self, file):
system_id = URIRef(urljoin("file:", pathname2url(file.name)), base=base)
super(FileInputSource, self).__init__(system_id)
self.file = file
self.setByteStream(file)
# TODO: self.setEncoding(encoding)
if isinstance(file, TextIOBase): # Python3 unicode fp
self.setCharacterStream(file)
self.setEncoding(file.encoding)
try:
b = file.buffer
self.setByteStream(b)
except (AttributeError, LookupError):
self.setByteStream(file)
else:
self.setByteStream(file)
# We cannot set characterStream here because
# we do not know the Raw Bytes File encoding.

def __repr__(self):
return repr(self.file)
Expand Down Expand Up @@ -171,10 +226,21 @@ def create_input_source(
else:
if isinstance(source, str):
location = source
elif isinstance(source, bytes):
data = source
elif hasattr(source, "read") and not isinstance(source, Namespace):
f = source
input_source = InputSource()
input_source.setByteStream(f)
if hasattr(source, "encoding"):
input_source.setCharacterStream(source)
input_source.setEncoding(source.encoding)
try:
b = file.buffer
input_source.setByteStream(b)
except (AttributeError, LookupError):
input_source.setByteStream(source)
else:
input_source.setByteStream(f)
if f is sys.stdin:
input_source.setSystemId("file:///dev/stdin")
elif hasattr(f, "name"):
Expand Down Expand Up @@ -206,8 +272,8 @@ def create_input_source(
input_source = FileInputSource(file)

if data is not None:
if isinstance(data, str):
data = data.encode("utf-8")
if not isinstance(data, (str, bytes, bytearray)):
raise RuntimeError("parse data can only str, or bytes.")
input_source = StringInputSource(data)
auto_close = True

Expand Down
10 changes: 5 additions & 5 deletions rdflib/plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

entry_points = {
'rdf.plugins.parser': [
'nt = rdf.plugins.parsers.nt:NTParser',
'nt = rdf.plugins.parsers.ntriples:NTParser',
],
'rdf.plugins.serializer': [
'nt = rdf.plugins.serializers.NTSerializer:NTSerializer',
Expand Down Expand Up @@ -185,10 +185,10 @@ def plugins(name=None, kind=None):
register("text/turtle", Parser, "rdflib.plugins.parsers.notation3", "TurtleParser")
register("turtle", Parser, "rdflib.plugins.parsers.notation3", "TurtleParser")
register("ttl", Parser, "rdflib.plugins.parsers.notation3", "TurtleParser")
register("application/n-triples", Parser, "rdflib.plugins.parsers.nt", "NTParser")
register("ntriples", Parser, "rdflib.plugins.parsers.nt", "NTParser")
register("nt", Parser, "rdflib.plugins.parsers.nt", "NTParser")
register("nt11", Parser, "rdflib.plugins.parsers.nt", "NTParser")
register("application/n-triples", Parser, "rdflib.plugins.parsers.ntriples", "NTParser")
register("ntriples", Parser, "rdflib.plugins.parsers.ntriples", "NTParser")
register("nt", Parser, "rdflib.plugins.parsers.ntriples", "NTParser")
register("nt11", Parser, "rdflib.plugins.parsers.ntriples", "NTParser")
register("application/n-quads", Parser, "rdflib.plugins.parsers.nquads", "NQuadsParser")
register("nquads", Parser, "rdflib.plugins.parsers.nquads", "NQuadsParser")
register("application/trix", Parser, "rdflib.plugins.parsers.trix", "TriXParser")
Expand Down
7 changes: 5 additions & 2 deletions rdflib/plugins/parsers/notation3.py
Original file line number Diff line number Diff line change
Expand Up @@ -1896,8 +1896,11 @@ def parse(self, source, graph, encoding="utf-8", turtle=True):

baseURI = graph.absolutize(source.getPublicId() or source.getSystemId() or "")
p = SinkParser(sink, baseURI=baseURI, turtle=turtle)

p.loadStream(source.getByteStream())
# N3 parser prefers str stream
stream = source.getCharacterStream()
if not stream:
stream = source.getByteStream()
p.loadStream(stream)

for prefix, namespace in p._bindings.items():
graph.bind(prefix, namespace)
Expand Down
11 changes: 6 additions & 5 deletions rdflib/plugins/parsers/nquads.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,15 +31,15 @@
from rdflib import ConjunctiveGraph

# Build up from the NTriples parser:
from rdflib.plugins.parsers.ntriples import NTriplesParser
from rdflib.plugins.parsers.ntriples import W3CNTriplesParser
from rdflib.plugins.parsers.ntriples import ParseError
from rdflib.plugins.parsers.ntriples import r_tail
from rdflib.plugins.parsers.ntriples import r_wspace

__all__ = ["NQuadsParser"]


class NQuadsParser(NTriplesParser):
class NQuadsParser(W3CNTriplesParser):
def parse(self, inputsource, sink, bnode_context=None, **kwargs):
"""
Parse inputsource as an N-Quads file.
Expand All @@ -57,13 +57,14 @@ def parse(self, inputsource, sink, bnode_context=None, **kwargs):
)
self.sink = ConjunctiveGraph(store=sink.store, identifier=sink.identifier)

source = inputsource.getByteStream()
source = inputsource.getCharacterStream()
if not source:
source = inputsource.getByteStream()
source = getreader("utf-8")(source)

if not hasattr(source, "read"):
raise ParseError("Item to parse must be a file-like object.")

source = getreader("utf-8")(source)

self.file = source
self.buffer = ""
while True:
Expand Down
33 changes: 0 additions & 33 deletions rdflib/plugins/parsers/nt.py

This file was deleted.