Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Fetching contributors…

Cannot retrieve contributors at this time

509 lines (463 sloc) 20.394 kB
/* Recursive Descent XML Parser
This namespace implements the parser for XML. It is a home-brewed recursive
descent parser using the Rosella Parse library primitives.
These routines should not be used directly. Use the parse driver routines
in the Rosella.Xml namespace instead.
*/
namespace Rosella.Xml.Parser
{
// Top-level xml parser routine. Start parsing the document
// Parameters:
//   xml      - the document text, passed through to the helper routines
//   s, b     - parse-state handles consumed by the Rosella Parse primitives
//              (have_more_chars, get_next, unshift_int, ...)
//   len      - total length, used only for position reporting
//   document - receives the parsed root via set_documentroot(); returned
//   strict   - when true, a closing tag whose name does not match the
//              currently open tag is a fatal error; otherwise the parser
//              tries to recover by unwinding to a matching ancestor
// Note: character data at the very end of the input that is not followed by
// another tag is discarded (the text loop bails out at end of input).
function __parse_xml(string xml, var s, var b, int len, var document, int strict)
{
    // tag_stack holds the ancestors of current_tag, innermost last. The
    // current tag itself is never on the stack.
    var tag_stack = [];
    var current_tag = new Rosella.Xml.Tag.DocumentRoot();
    var root = current_tag;
    Rosella.Xml.Parser.__parse_frontmatter(xml, s, b, len, document, current_tag);
    int c;
    while (have_more_chars(s, b)) {
        eat_whitespace(s, b);
        if (!have_more_chars(s, b))
            break;
        c = get_next(s, b);
        if (c == ASCII_LESS_THAN) {
            :(var tag, int is_close, int contained, int is_comment) = Rosella.Xml.Parser.__parse_tag(xml, s, b, len);

            // Comments and self-contained tags (<foo />) never open a new
            // nesting level; just attach them to the current tag.
            if (is_comment || contained) {
                current_tag.add_child(tag);
                continue;
            }

            // A closing tag: </foo>
            if (is_close) {
                if (tag.xmlns != current_tag.xmlns)
                    Rosella.Error.error("Syntax error, mismatched tags. Tag namespaces and/or names do not match at position %d", current_position(len, s, b));
                if (tag.name != current_tag.name) {
                    if (strict)
                        Rosella.Error.error("Syntax error, mismatched tags. Found closing tag '%s' at position %d but current tag is '%s'", tag.name, current_position(len, s, b), current_tag.name);
                    else {
                        /* If we're not in strict mode, we can try to search up the tag for the matching open tag and just
                           deal with it. In reality, SGML allows stand-alone tags in some places and these semantics will be
                           wrong. */
                        current_tag = null;
                        while (elements(tag_stack) > 0) {
                            var dead_tag = tag_stack.pop();
                            if (dead_tag instanceof Rosella.Xml.Tag.DocumentRoot) {
                                current_tag = dead_tag;
                                break;
                            }
                            if (dead_tag.name == tag.name) {
                                dead_tag.set_end(tag);
                                current_tag = tag_stack.pop();
                                break;
                            }
                        }
                        // The stack may have been empty to begin with (a stray
                        // closing tag at root level). Never leave current_tag
                        // null, or the next tag would be attached to nothing.
                        if (current_tag == null)
                            current_tag = root;
                        continue;
                    }
                }
                current_tag.set_end(tag);
                current_tag = tag_stack.pop();
                continue;
            }

            // An opening tag: <foo ...>
            current_tag.add_child(tag);
            push(tag_stack, current_tag);
            current_tag = tag;
            continue;
        }

        // Anything else is character data: accumulate up to the next '<'.
        var sb = new 'StringBuilder';
        while (have_more_chars(s, b) && c != ASCII_LESS_THAN) {
            push(sb, chr(c));
            c = get_next(s, b);
        }
        if (!have_more_chars(s, b))
            break;
        // Put the '<' back so the next iteration sees the tag start.
        unshift_int(s, c);
        string str = string(sb);
        if (current_tag != null)
            current_tag.add_child_text(str);
    }
    if (elements(tag_stack) > 0)
        Rosella.Error.error("Syntax error, mismatched tags. There are %d unclosed tags at position %d", elements(tag_stack), current_position(len, s, b));
    document.set_documentroot(root);
    return document;
}
// Parse the document frontmatter, such as the XML and possibly the DTD
// headers.
// Consumes any leading <?xml ... ?> header, <!DOCTYPE ...> header and
// <!-- comments -->. Headers are added to `document`; comments are added as
// children of `root`. On the first '<' that starts anything else, the
// consumed characters are pushed back and control returns to the caller.
function __parse_frontmatter(string xml, var s, var b, int len, var document, var root)
{
    while (have_more_chars(s, b)) {
        eat_whitespace(s, b);
        int c = get_next(s, b);
        if (c != ASCII_LESS_THAN)
            Rosella.Parse.error_unknown_char(c, "XML frontmatter", current_position(len, s, b));
        c = get_next(s, b);
        if (c == ASCII_QUESTION_MARK) {
            Rosella.Xml.Parser.__parse_xml_header(xml, s, b, len, document);
            continue;
        }
        if (c == ASCII_EXCLAMATION_POINT) {
            // Look ahead without disturbing `c`: if this turns out to be
            // neither a comment nor a DOCTYPE, the '!' itself must be pushed
            // back below, not the peeked character.
            int lookahead = peek_next(s, b);
            if (lookahead == ASCII_DASH) {
                var comment = Rosella.Xml.Parser.__parse_comment(xml, s, b, len);
                root.add_child(comment);
                continue;
            }
            if (lookahead == ASCII_D) {
                Rosella.Xml.Parser.__parse_dtd_header(xml, s, b, len, document);
                continue;
            }
        }
        // If we don't have a <!-- comment, <?xml header or <!DOCTYPE, return and
        // let the main loop parse it. Restore the two characters consumed
        // above, in reverse order.
        unshift_int(s, c);
        unshift_int(s, ASCII_LESS_THAN);
        return;
    }
}
// Parse the <?xml ... ?> tag
// Called once the leading "<?" has been consumed. Reads the "xml" name and
// its attributes, then requires the closing "?>". On success the new
// XmlHeader tag is registered with document.add_header().
function __parse_xml_header(string xml, var s, var b, int len, var document)
{
    eat_whitespace(s, b);
    int start = current_position(len, s, b);
    string name = Rosella.Xml.Parser.__parse_ident(xml, s, b, len);
    if (name != "xml")
        Rosella.Error.error("Malformed XML header. Should start with '<?xml ... ?>' at position %d", current_position(len, s, b));

    var header = new Rosella.Xml.Tag.XmlHeader(start);
    eat_whitespace(s, b);
    Rosella.Xml.Parser.__parse_attributes(xml, s, b, len, header);
    eat_whitespace(s, b);

    // First half of the terminator: '?'
    int ch = get_next(s, b);
    if (ch != ASCII_QUESTION_MARK) {
        Rosella.Parse.error_unknown_char(ch, "XML Header", current_position(len, s, b));
        return;
    }

    // Second half of the terminator: '>'
    ch = get_next(s, b);
    if (ch == ASCII_GREATER_THAN) {
        document.add_header(header);
        return;
    }

    // Not a terminator after all: restore the character and report it.
    unshift_int(s, ch);
    Rosella.Parse.error_unknown_char(ch, "XML Header", current_position(len, s, b));
}
// Parse the <!DOCTYPE ... > tag
// Called once "<!" has been consumed. Accepted shapes:
//   <!DOCTYPE name>
//   <!DOCTYPE name [ inline declarations ]>
//   <!DOCTYPE name SCOPE "file">
//   <!DOCTYPE name SCOPE "file" [ inline declarations ]>
// The DtdHeader is added to `document` before the remainder is parsed.
// NOTE(review): only a single quoted literal is read after the scope keyword,
// so the two-literal PUBLIC form ("public id" "system id") is rejected with
// "DOCTYPE header end" -- confirm whether that is intended.
function __parse_dtd_header(string xml, var s, var b, int len, var document)
{
    int pos = current_position(len, s, b);
    string tagname = Rosella.Xml.Parser.__parse_ident(xml, s, b, len);
    if (tagname != "DOCTYPE")
        Rosella.Error.error("Invalid DOCTYPE tag");
    eat_whitespace(s, b);
    string doctype = Rosella.Xml.Parser.__parse_ident(xml, s, b, len);
    eat_whitespace(s, b);
    var dtd_header = new Rosella.Xml.Tag.DtdHeader(pos, doctype);
    document.add_header(dtd_header);

    int c = peek_next(s, b);
    if (c == ASCII_GREATER_THAN) {
        c = get_next(s, b);
        return;
    }
    if (c == ASCII_OPEN_BRACKET) {
        __parse_dtd_header_inline(xml, s, b, len, dtd_header.get_dtd_document());
        eat_whitespace(s, b);
        c = get_next(s, b);
        if (c != ASCII_GREATER_THAN)
            Rosella.Parse.error_unknown_char(c, "!DOCTYPE header with inline declarations", current_position(len, s, b));
        return;
    }

    // Should be one of SYSTEM, PUBLIC, etc
    string dtd_scope = Rosella.Xml.Parser.__parse_ident(xml, s, b, len);
    eat_whitespace(s, b);
    c = get_next(s, b);
    if (c == ASCII_SINGLE_QUOTE || c == ASCII_DOUBLE_QUOTE) {
        string filename = Rosella.Parse.parse_quoted(c, xml, s, b, len);
        dtd_header.set_scope(dtd_scope, filename);
        eat_whitespace(s, b);
    }
    else {
        // Not a quoted literal: put the character back so the '[' / '>'
        // checks below examine it instead of whatever follows it.
        unshift_int(s, c);
    }

    c = peek_next(s, b);
    if (c == ASCII_OPEN_BRACKET) {
        __parse_dtd_header_inline(xml, s, b, len, dtd_header.get_dtd_document());
        eat_whitespace(s, b);
        c = get_next(s, b);
        if (c != ASCII_GREATER_THAN)
            Rosella.Parse.error_unknown_char(c, "!DOCTYPE header with scope and inline declarations", current_position(len, s, b));
        return;
    }
    c = get_next(s, b);
    if (c != ASCII_GREATER_THAN)
        Rosella.Parse.error_unknown_char(c, "DOCTYPE header end", current_position(len, s, b));
}
// Parse the DTD header as part of an XML document
// Reads the bracketed declaration list of a DOCTYPE header. The callers only
// peek at the opening '[', so it is consumed here first. Each '<' hands off
// to __parse_dtd_inline_tag; ']' ends the list. Running out of input before
// ']' simply returns.
function __parse_dtd_header_inline(string xml, var s, var b, int len, var dtd_doc)
{
    // Discard the opening bracket.
    get_next(s, b);

    while (have_more_chars(s, b)) {
        eat_whitespace(s, b);
        int ch = get_next(s, b);
        if (ch == ASCII_CLOSE_BRACKET)
            return;
        if (ch != ASCII_LESS_THAN)
            Rosella.Parse.error_unknown_char(ch, "!DOCTYPE header", current_position(len, s, b));
        else
            __parse_dtd_inline_tag(xml, s, b, len, dtd_doc);
    }
}
// Parse a standalone DTD document
// Parses a sequence of <!...> declarations until the input is exhausted,
// feeding each one into `dtd_document`, which is also the return value.
function __parse_dtd_standalone(string dtd, var s, var b, int len, var dtd_document)
{
    while (have_more_chars(s, b)) {
        eat_whitespace(s, b);
        // Trailing whitespace (e.g. a final newline) may have been all that
        // was left; stop instead of reading past the end of the input. The
        // main XML loop applies the same guard.
        if (!have_more_chars(s, b))
            break;
        int c = get_next(s, b);
        if (c == ASCII_LESS_THAN)
            __parse_dtd_inline_tag(dtd, s, b, len, dtd_document);
        else
            Rosella.Parse.error_unknown_char(c, "DTD", current_position(len, s, b));
    }
    // NOTE(review): the document is registered as its own DTD header here --
    // confirm against set_dtd_header() that this is intended.
    dtd_document.set_dtd_header(dtd_document);
    return dtd_document;
}
// Parse a single DTD declaration tag (<!ELEMENT ...>, <!ATTLIST ...>, etc.).
// Called once the opening '<' has been consumed. Expects '!', a declaration
// keyword, and the element name the declaration applies to, then dispatches
// on the keyword:
//   ELEMENT  - records an element definition and its child rule
//              (EMPTY, ANY, or a parenthesised child list)
//   ATTLIST  - records one attribute definition per loop iteration
//   ENTITY / NOTATION - not implemented
// Finally consumes the closing '>'.
function __parse_dtd_inline_tag(string xml, var s, var b, int len, var dtd_doc)
{
    int start_pos = current_position(len, s, b);
    int c = get_next(s, b);
    if (c != ASCII_EXCLAMATION_POINT)
        Rosella.Parse.error_unknown_char(c, "DTD Document", current_position(len, s, b));
    string tagname = Rosella.Xml.Parser.__parse_ident(xml, s, b, len);
    eat_whitespace(s, b);
    string elemname = Rosella.Xml.Parser.__parse_ident(xml, s, b, len);
    eat_whitespace(s, b);
    switch (tagname) {
        case "ELEMENT":
            var elem = new Rosella.Xml.Tag.DtdElement(start_pos, elemname);
            dtd_doc.add_element_def(elemname, elem);
            // Decide on the first character only, then verify the full
            // keyword once it has been read.
            c = peek_next(s, b);
            if (c == ASCII_E) {
                string empty = Rosella.Xml.Parser.__parse_ident(xml, s, b, len);
                if (empty != "EMPTY")
                    Rosella.Error.error("Expected 'EMPTY', got '%s' at position %d", empty, current_position(len, s, b));
                elem.set_child_rule(empty);
            } else if (c == ASCII_A) {
                string any = Rosella.Xml.Parser.__parse_ident(xml, s, b, len);
                if (any != "ANY")
                    Rosella.Error.error("Expected 'ANY', got '%s' at position %d", any, current_position(len, s, b));
                elem.set_child_rule(any);
            } else if (c == ASCII_OPEN_PAREN)
                __parse_dtd_element_children(xml, s, b, len, dtd_doc, elem);
            else
                Rosella.Parse.error_unknown_char(c, "!ELEMENT tag", current_position(len, s, b));
            break;
        case "ATTLIST":
            // Each iteration reads: name, type, then either a '#'-prefixed
            // default rule or a quoted default value.
            while (true) {
                c = peek_next(s, b);
                if (c == ASCII_GREATER_THAN)
                    break;
                string attrname = Rosella.Xml.Parser.__parse_ident(xml, s, b, len);
                string attrtype, default_rule, default_value, fixed_value;
                eat_whitespace(s, b);
                c = peek_next(s, b);
                if (c == ASCII_OPEN_PAREN) {
                    // TODO: Enumerated attribute values
                    Rosella.Error.not_implemented("Enumerated attribute values");
                } else {
                    attrtype = Rosella.Xml.Parser.__parse_ident(xml, s, b, len);
                }
                eat_whitespace(s, b);
                c = peek_next(s, b);
                if (c == ASCII_OCTOTHORPE) {
                    c = get_next(s, b);
                    default_rule = __parse_ident(xml, s, b, len);
                    // If "FIXED", we need a quoted value too
                    if (default_rule == "FIXED") {
                        eat_whitespace(s, b);
                        c = get_next(s, b);
                        // Make sure the delimiter really is a quote before
                        // handing it to parse_quoted, as done for plain
                        // default values below.
                        if (c != ASCII_SINGLE_QUOTE && c != ASCII_DOUBLE_QUOTE)
                            Rosella.Parse.error_unknown_char(c, "!ATTLIST tag", current_position(len, s, b));
                        fixed_value = Rosella.Parse.parse_quoted(c, xml, s, b, len);
                    }
                    eat_whitespace(s, b);
                } else {
                    if (c == ASCII_SINGLE_QUOTE || c == ASCII_DOUBLE_QUOTE) {
                        c = get_next(s, b);
                        default_value = Rosella.Parse.parse_quoted(c, xml, s, b, len);
                    } else
                        Rosella.Parse.error_unknown_char(c, "!ATTLIST tag", current_position(len, s, b));
                }
                dtd_doc.add_element_attribute_def(elemname, attrname, attrtype, default_value, default_rule, fixed_value);
                eat_whitespace(s, b);
            }
            break;
        case "ENTITY":
            Rosella.Error.not_implemented("DTD !ENTITY declarations");
            break;
        case "NOTATION":
            Rosella.Error.not_implemented("DTD !NOTATION declarations");
            break;
        default:
            Rosella.Error.error("Unknown inline DTD tag name '%s' at position %d", tagname, current_position(len, s, b));
    }
    eat_whitespace(s, b);
    c = get_next(s, b);
    if (c != ASCII_GREATER_THAN)
        Rosella.Parse.error_unknown_char(c, "DTD tag", current_position(len, s, b));
}
// Parse the contents of the <!ELEMENT tag
// Reads a comma-separated child list such as (foo, bar*, #PCDATA) and
// records each entry on `elem` via add_child_type(name, modifier), where
// modifier is "", "*", "?" or "+". The caller has only peeked at the opening
// '(' so it is consumed here. Returns after the closing ')'.
// `dtd_header` is accepted for interface compatibility and is not used.
function __parse_dtd_element_children(string xml, var s, var b, int len, var dtd_header, var elem)
{
    // TODO: Handle alternations (foo|bar|baz)[<modifier>], ...
    // or (foo,bar,(baz|fie)?), etc

    // Discard the opening paren.
    int c = get_next(s, b);
    while (have_more_chars(s, b)) {
        eat_whitespace(s, b);
        string tagname = __parse_dtd_tagname(xml, s, b, len);
        eat_whitespace(s, b);
        c = get_next(s, b);
        string modifier = "";
        if (c == ASCII_ASTERISK || c == ASCII_QUESTION_MARK || c == ASCII_PLUS) {
            modifier = chr(c);
            // The modifier is not the separator: read on to the ',' or ')'
            // that follows it.
            eat_whitespace(s, b);
            c = get_next(s, b);
        }
        elem.add_child_type(tagname, modifier);
        if (c == ASCII_COMMA)
            continue;
        // Comparison, not assignment: only a real ')' ends the list.
        if (c == ASCII_CLOSE_PAREN)
            return;
        Rosella.Parse.error_unknown_char(c, "!ELEMENT child list", current_position(len, s, b));
    }
    // Ran out of input before the closing paren.
    Rosella.Parse.error_unknown_char(ASCII_NULL, "!ELEMENT child_list", current_position(len, s, b));
}
// Parse a tag
// Called once the opening '<' has been consumed. Returns four values:
//   (tag object, is-closing-tag, is-self-contained, is-comment)
// "<!" is delegated wholesale to __parse_comment. Otherwise the routine reads
// an optional '/', an optional "ns:" prefix, the tag name, attributes (for
// non-closing tags only) and finally the ">" or "/>" terminator.
function __parse_tag(string xml, var s, var b, int len)
{
    int start = current_position(len, s, b);
    eat_whitespace(s, b);

    int ch = get_next(s, b);
    if (ch == ASCII_EXCLAMATION_POINT)
        return Rosella.Xml.Parser.__parse_comment(xml, s, b, len);

    // A leading slash marks a closing tag. Whatever character ends up in
    // `ch` belongs to the name and goes back into the stream.
    int closing = false;
    if (ch == ASCII_SLASH) {
        closing = true;
        ch = get_next(s, b);
    }
    unshift_int(s, ch);

    // Name, with an optional namespace prefix in front of a colon.
    string prefix = "";
    string name = Rosella.Xml.Parser.__parse_ident(xml, s, b, len);
    eat_whitespace(s, b);
    if (peek_next(s, b) == ASCII_COLON) {
        get_next(s, b);
        eat_whitespace(s, b);
        prefix = name;
        name = Rosella.Xml.Parser.__parse_ident(xml, s, b, len);
    }

    var result;
    if (!closing)
        result = new Rosella.Xml.Tag(start, prefix, name);
    else
        result = new Rosella.Xml.Tag.EndTag(start, prefix, name);

    eat_whitespace(s, b);
    if (!closing)
        Rosella.Xml.Parser.__parse_attributes(xml, s, b, len, result);

    // Terminator: either "/>" (self-contained) or ">".
    ch = get_next(s, b);
    if (ch == ASCII_SLASH) {
        if (get_next(s, b) == ASCII_GREATER_THAN)
            return result, false, true, false;
        Rosella.Error.error("Syntax error in tag %s at position %d", name, current_position(len, s, b));
    }
    else if (ch == ASCII_GREATER_THAN)
        return result, closing, false, false;

    Rosella.Parse.error_unknown_char(ch, "tag " + name, current_position(len, s, b));
}
// Parse a comment <!-- ... -->
// Called once "<!" has been consumed; the next two characters must be "--".
// Collects everything up to the terminating "-->" into a Comment tag and
// returns (comment, true, true, true), matching the four-value shape that
// __parse_tag returns.
function __parse_comment(string xml, var s, var b, int len)
{
    int start = current_position(len, s, b);
    if (!(get_next(s, b) == ASCII_DASH && get_next(s, b) == ASCII_DASH))
        Rosella.Error.error("Malformed comment at position %d", current_position(len, s, b));

    var body = new 'StringBuilder';
    while (have_more_chars(s, b)) {
        int first = get_next(s, b);

        // Ordinary character: part of the comment body.
        if (first != ASCII_DASH) {
            push(body, chr(first));
            continue;
        }

        // One dash seen; is it followed by a second?
        int second = get_next(s, b);
        if (second != ASCII_DASH) {
            unshift_int(s, second);
            push(body, chr(first));
            continue;
        }

        // Two dashes seen; a '>' completes the terminator.
        int third = get_next(s, b);
        if (third == ASCII_GREATER_THAN) {
            string text = string(body);
            var comment = new Rosella.Xml.Tag.Comment(start, text);
            return comment, true, true, true;
        }

        // False alarm: restore both look-ahead characters (last read goes
        // back first) and keep the first dash as body text.
        unshift_int(s, third);
        unshift_int(s, second);
        push(body, chr(first));
    }
    Rosella.Error.error("Unterminated comment at end of document");
}
// Parse key="value" attributes
// Reads attributes until the next non-name character and stores each one with
// xtag.add_attribute(namespace, name, value). Accepted forms:
//   name="value"   name='value'   name=value   name      (value "true")
// each optionally written as ns:name. Whitespace is allowed on both sides of
// the '='.
function __parse_attributes(string xml, var s, var b, int len, var xtag)
{
    while (have_more_chars(s, b)) {
        eat_whitespace(s, b);
        int c = peek_next(s, b);
        if (!is_name_char(c))
            break;
        string attr_ns = "";
        string attr_name = Rosella.Xml.Parser.__parse_ident(xml, s, b, len);
        eat_whitespace(s, b);
        c = get_next(s, b);
        if (c == ASCII_COLON) {
            // What we read so far was the namespace prefix.
            attr_ns = attr_name;
            attr_name = Rosella.Xml.Parser.__parse_ident(xml, s, b, len);
            eat_whitespace(s, b);
            c = get_next(s, b);
        }
        if (c != ASCII_EQUALS) {
            // Bare attribute with no value: treat as a boolean flag.
            unshift_int(s, c);
            xtag.add_attribute(attr_ns, attr_name, "true");
            eat_whitespace(s, b);
            continue;
        }
        // XML permits whitespace after the '=' as well as before it.
        eat_whitespace(s, b);
        c = get_next(s, b);
        string attr_value;
        if (c == ASCII_SINGLE_QUOTE || c == ASCII_DOUBLE_QUOTE)
            attr_value = Rosella.Parse.parse_quoted(c, xml, s, b, len);
        else {
            // Unquoted value: read it as an identifier.
            unshift_int(s, c);
            attr_value = Rosella.Xml.Parser.__parse_ident(xml, s, b, len);
        }
        xtag.add_attribute(attr_ns, attr_name, attr_value);
    }
}
// Parse an identifier
// Collects consecutive characters accepted by is_name_char() and returns
// them as a string. The first rejected character is pushed back onto the
// input. The result is empty when the very next character is not a name
// character or the input is exhausted.
function __parse_ident(string xml, var s, var b, int len)
{
    var buffer = new 'StringBuilder';
    while (have_more_chars(s, b)) {
        int ch = get_next(s, b);
        if (is_name_char(ch)) {
            push(buffer, chr(ch));
            continue;
        }
        // Read one character too far: give it back and stop.
        unshift_int(s, ch);
        break;
    }
    return string(buffer);
}
// A DTD tag name is like an identifier, except that it may start with a '#'
// (as in #PCDATA). Read the first character unconditionally and force it
// into the buffer, then continue as for an ordinary identifier.
// Takes the first character as-is, without an is_name_char() check, then
// appends the identifier characters that follow it. The tail is read by
// __parse_ident, which stops at (and pushes back) the first character that
// is not a name character.
function __parse_dtd_tagname(string xml, var s, var b, int len)
{
    int first = get_next(s, b);
    string rest = Rosella.Xml.Parser.__parse_ident(xml, s, b, len);
    return chr(first) + rest;
}
}
Jump to Line
Something went wrong with that request. Please try again.