Permalink
Browse files

[Xml] Break parsing out into a new file. Fix a bug where not having whitespace at the end was causing errors
  • Loading branch information...
Whiteknight committed Apr 5, 2012
1 parent 1b43e8d commit f68937d43342eae94a872ba4e580f41a728452d6
Showing with 310 additions and 306 deletions.
  1. +1 −0 setup.winxed
  2. +1 −1 src/include/Parsing.winxed
  3. +307 −0 src/unstable/xml/Parser.winxed
  4. +1 −305 src/unstable/xml/Xml.winxed
View
@@ -403,6 +403,7 @@ function setup_experimental_libraries(var rosella)
"xml/Includes",
"xml/Xml",
"xml/Document",
+ "xml/Parser",
"xml/Tag",
"xml/Text"
);
@@ -29,7 +29,7 @@ inline eat_whitespace(var s, var b)
int c = ASCII_SPACE;
while(have_more_chars(s, b) && codepoint_is_whitespace(c))
c = get_next(s, b);
- if (have_more_chars(s, b))
+ if (c != ASCII_NULL && !codepoint_is_whitespace(c))
unshift_int(s, c);
}
@@ -0,0 +1,307 @@
+namespace Rosella.Xml.Parser
+{
+ // Consume everything that may precede the root element: any number of
+ // <?xml ... ?> headers, <!-- ... --> comments and <!DOCTYPE ...> declarations.
+ // Headers and DOCTYPE declarations are added to `document`; comments are
+ // parsed and discarded. Returns when input is exhausted or when a '<' is
+ // found that starts none of these constructs, in which case the characters
+ // read so far are pushed back for the caller to parse.
+ //   xml, len - passed through unchanged to the sub-parsers
+ //   s, b     - the character source and its push-back buffer
+ function __parse_frontmatter(string xml, var s, var b, int len, var document)
+ {
+     while(have_more_chars(s, b)) {
+         eat_whitespace(s, b);
+         // Only trailing whitespace was left; that is not an error here.
+         if (!have_more_chars(s, b))
+             return;
+         int c = get_next(s, b);
+         if (c != ASCII_LESS_THAN)
+             Rosella.Xml.Parser.__error_unknown_char(c, "XML frontmatter");
+         c = get_next(s, b);
+         if (c == ASCII_QUESTION_MARK) {
+             Rosella.Xml.Parser.__parse_xml_header(xml, s, b, len, document);
+             continue;
+         }
+         if (c == ASCII_EXCLAMATION_POINT) {
+             // Look ahead without consuming, and keep the result separate from
+             // `c` so that the fall-through below still pushes back the '!'
+             // that was actually read.
+             int next = peek_next(s, b);
+             if (next == ASCII_DASH) {
+                 // TODO: We shouldn't just discard this
+                 Rosella.Xml.Parser.__parse_comment(xml, s, b, len);
+                 continue;
+             }
+             if (next == ASCII_D) {
+                 Rosella.Xml.Parser.__parse_dtd_header(xml, s, b, len, document);
+                 continue;
+             }
+         }
+
+         // If we don't have a <!-- comment, <?xml header or <!DOCTYPE, return and
+         // let the main loop parse it. ASCII_NULL marks end of input and is not
+         // pushed back.
+         if (c != ASCII_NULL)
+             unshift_int(s, c);
+         unshift_int(s, ASCII_LESS_THAN);
+         return;
+     }
+ }
+
+ // Main parse loop. Parses the frontmatter, then reads tags and character
+ // data until input is exhausted, building the tree under `document`.
+ //   xml, len - passed through unchanged to the sub-parsers
+ //   s, b     - the character source and its push-back buffer
+ //   document - receives headers and the root tag; also the return value
+ // Raises (via Rosella.Error.error) on a closing tag with no open tag.
+ function __parse_xml(string xml, var s, var b, int len, var document)
+ {
+     // tag_stack holds the ancestors of current_tag (null for "no parent").
+     var tag_stack = [];
+     var current_tag = null;
+
+     eat_whitespace(s, b);
+
+     Rosella.Xml.Parser.__parse_frontmatter(xml, s, b, len, document);
+
+     int c;
+     while(have_more_chars(s, b)) {
+         eat_whitespace(s, b);
+         if (!have_more_chars(s, b))
+             break;
+         c = get_next(s, b);
+         if (c == ASCII_LESS_THAN) {
+             :(var tag, int is_close, int contained, int is_comment) = Rosella.Xml.Parser.__parse_tag(xml, s, b, len);
+
+             if (is_comment) {
+                 // TODO: What do we do about comments appearing before and
+                 // after the root node?
+                 if (current_tag != null)
+                     current_tag.add_child(tag);
+                 continue;
+             }
+
+             // A self-contained tag: <foo />
+             if (contained) {
+                 if (current_tag != null)
+                     current_tag.add_child(tag);
+                 else {
+                     current_tag = tag;
+                     document.set_root(tag);
+                 }
+                 continue;
+             }
+
+             // A closing tag: </foo>
+             if (is_close) {
+                 if (current_tag == null)
+                     Rosella.Error.error("Syntax error. Found closing tag '%s' without corresponding open tag", tag.name);
+                 current_tag.set_end(tag);
+                 current_tag = tag_stack.pop();
+                 continue;
+             }
+
+             // An opening tag: <foo ...>
+             if (current_tag != null)
+                 current_tag.add_child(tag);
+             else
+                 document.set_root(tag);
+             push(tag_stack, current_tag);
+             current_tag = tag;
+             continue;
+         }
+
+         // Character data: collect up to the next '<' or end of input. The
+         // loop tests the character just read (ASCII_NULL marks end of input)
+         // rather than have_more_chars, which is already false once the last
+         // character has been read and would make us drop that character.
+         var sb = new 'StringBuilder';
+         while(c != ASCII_NULL && c != ASCII_LESS_THAN) {
+             push(sb, chr(c));
+             c = get_next(s, b);
+         }
+
+         // Leave the '<' for the next iteration to parse as a tag.
+         if (c == ASCII_LESS_THAN)
+             unshift_int(s, c);
+
+         string str = string(sb);
+         if (current_tag != null)
+             current_tag.add_child_text(str);
+     }
+     return document;
+ }
+
+ // Parse an "<?xml ... ?>" header whose leading "<?" has already been read,
+ // and add the resulting XmlHeader tag to `document`.
+ // Raises if the name after "<?" is not "xml" or if the header is not closed
+ // by "?>".
+ // TODO: If available, read the encoding attribute and compare to the
+ // encoding that the string is being read as.
+ function __parse_xml_header(string xml, var s, var b, int len, var document)
+ {
+     eat_whitespace(s, b);
+     string name = Rosella.Xml.Parser.__parse_alphanumeric(xml, s, b, len);
+     if (name != "xml")
+         Rosella.Error.error("Malformed XML header. Should start with '<?xml ... ?>'");
+
+     var header = new Rosella.Xml.Tag.XmlHeader();
+     eat_whitespace(s, b);
+     Rosella.Xml.Parser.__parse_attributes(xml, s, b, len, header);
+     eat_whitespace(s, b);
+
+     // Expect the two-character terminator "?>".
+     int c = get_next(s, b);
+     if (c == ASCII_QUESTION_MARK) {
+         c = get_next(s, b);
+         if (c != ASCII_GREATER_THAN)
+             // Not a terminator: put the character back and report it below.
+             unshift_int(s, c);
+         else {
+             document.add_header(header);
+             return;
+         }
+     }
+     Rosella.Xml.Parser.__error_unknown_char(c, "XML Header");
+ }
+
+ // Parse a "<!DOCTYPE ...>" declaration whose leading "<!" has already been
+ // read, and add the resulting DtdHeader to `document`.
+ // Accepted forms: a document type name, an optional scope word and an
+ // optional quoted filename, closed by '>'. Inline "[...]" definitions are
+ // not implemented and raise.
+ // Raises if the keyword is not DOCTYPE or the declaration is not closed by '>'.
+ function __parse_dtd_header(string xml, var s, var b, int len, var document)
+ {
+     string tagname = Rosella.Xml.Parser.__parse_alphanumeric(xml, s, b, len);
+     if (tagname != "DOCTYPE")
+         Rosella.Error.error("Invalid DOCTYPE tag");
+     eat_whitespace(s, b);
+
+     string doctype = Rosella.Xml.Parser.__parse_alphanumeric(xml, s, b, len);
+     eat_whitespace(s, b);
+
+     // Should be one of SYSTEM, PUBLIC, etc
+     string dtd_scope = Rosella.Xml.Parser.__parse_alphanumeric(xml, s, b, len);
+
+     var dtd_header = new Rosella.Xml.Tag.DtdHeader(doctype, dtd_scope);
+
+     eat_whitespace(s, b);
+     int c = get_next(s, b);
+     if (c == ASCII_SINGLE_QUOTE || c == ASCII_DOUBLE_QUOTE) {
+         string filename = Rosella.Xml.Parser.__parse_quoted(c, xml, s, b, len);
+         dtd_header.set_filename(filename);
+     } else if (c == ASCII_OPEN_BRACKET) {
+         // TODO: Parse the inline DTD definitions. Add each as a child to
+         // the dtd_header
+         Rosella.Error.not_implemented("Cannot parse inline DOCTYPE yet!");
+     } else if (c != ASCII_NULL) {
+         // Neither a filename nor an inline subset (e.g. <!DOCTYPE html>).
+         // The character belongs to what follows, normally the closing '>',
+         // so push it back instead of silently consuming it.
+         unshift_int(s, c);
+     }
+     eat_whitespace(s, b);
+     c = get_next(s, b);
+     if (c != ASCII_GREATER_THAN)
+         Rosella.Xml.Parser.__error_unknown_char(c, "DOCTYPE header");
+     document.add_header(dtd_header);
+ }
+
+ // Parse one tag whose leading '<' has already been read.
+ // Returns four values: (tag, is_close, contained, is_comment).
+ //   <!-- ... -->  is delegated to __parse_comment
+ //   <name ...>    gives (tag, false, false, false)
+ //   </name>       gives (tag, true,  false, false); the tag name is prefixed with "/"
+ //   <name ... />  gives (tag, false, true,  false)
+ // Raises on anything else.
+ function __parse_tag(string xml, var s, var b, int len)
+ {
+     // TODO: Namespaces (<foo:bar ...> </foo:bar>)
+     eat_whitespace(s, b);
+     int c = get_next(s, b);
+     if (c == ASCII_EXCLAMATION_POINT)
+         return Rosella.Xml.Parser.__parse_comment(xml, s, b, len);
+
+     // A leading '/' marks a closing tag; whatever follows starts the name
+     // and is handed back to the stream.
+     int closing = false;
+     if (c == ASCII_SLASH) {
+         closing = true;
+         c = get_next(s, b);
+     }
+     unshift_int(s, c);
+
+     string name = Rosella.Xml.Parser.__parse_alphanumeric(xml, s, b, len);
+     string prefix = "";
+
+     // "prefix:name" - what was read so far was the namespace prefix.
+     eat_whitespace(s, b);
+     if (peek_next(s, b) == ASCII_COLON) {
+         get_next(s, b);
+         eat_whitespace(s, b);
+         prefix = name;
+         name = Rosella.Xml.Parser.__parse_alphanumeric(xml, s, b, len);
+     }
+
+     if (closing)
+         name = "/" + name;
+     var xtag = new Rosella.Xml.Tag(prefix, name);
+
+     // Attributes are only read for tags that are not closing tags.
+     eat_whitespace(s, b);
+     if (!closing)
+         Rosella.Xml.Parser.__parse_attributes(xml, s, b, len, xtag);
+
+     c = get_next(s, b);
+     if (c == ASCII_GREATER_THAN)
+         return xtag, closing, false, false;
+
+     if (c == ASCII_SLASH) {
+         int after_slash = get_next(s, b);
+         if (after_slash != ASCII_GREATER_THAN)
+             Rosella.Error.error("Syntax error in tag %s", name);
+         else
+             return xtag, false, true, false;
+     }
+
+     Rosella.Xml.Parser.__error_unknown_char(c, "tag " + name);
+ }
+
+ // Report an unexpected character `c` met while parsing `context`.
+ // ASCII_NULL is reported as an unexpected end of input.
+ function __error_unknown_char(int c, string context)
+ {
+     int at_end = (c == ASCII_NULL);
+     if (at_end)
+         Rosella.Error.error("Unexpected end of input while parsing %s", context);
+     Rosella.Error.error("Unexpected token '%s' in %s", chr(c), context);
+ }
+
+ // Parse a comment whose leading "<!" has already been read: expects "--",
+ // then collects text up to the terminating "-->".
+ // Returns four values, in the same shape __parse_tag returns:
+ // (comment, true, true, true).
+ // Raises if the opening "--" is missing or the comment is never closed.
+ function __parse_comment(string xml, var s, var b, int len)
+ {
+     // Both opening dashes must be present; the second is only read when the
+     // first one matched.
+     int opened = false;
+     if (get_next(s, b) == ASCII_DASH)
+         opened = (get_next(s, b) == ASCII_DASH);
+     if (!opened)
+         Rosella.Error.error("Malformed comment");
+
+     var text = new 'StringBuilder';
+     while (have_more_chars(s, b)) {
+         int first = get_next(s, b);
+
+         if (first == ASCII_DASH) {
+             // Possible terminator: look for "->" after this dash.
+             int second = get_next(s, b);
+             if (second == ASCII_DASH) {
+                 int third = get_next(s, b);
+                 if (third == ASCII_GREATER_THAN) {
+                     var comment = new Rosella.Xml.Tag.Comment(string(text));
+                     return comment, true, true, true;
+                 }
+                 unshift_int(s, third);
+             }
+             // Not a terminator: hand the look-ahead back so it is read as
+             // ordinary comment text.
+             unshift_int(s, second);
+         }
+         push(text, chr(first));
+     }
+     Rosella.Error.error("Unterminated comment at end of document");
+ }
+
+ // Read zero or more attributes and add each one to `xtag`.
+ // Handles name="value", name='value', name=value, and a bare name, which
+ // is stored with the value "true". Stops at the first character that
+ // cannot start a name and leaves it in the stream.
+ function __parse_attributes(string xml, var s, var b, int len, var xtag)
+ {
+     while (have_more_chars(s, b)) {
+         eat_whitespace(s, b);
+         if (!is_name_char(peek_next(s, b)))
+             break;
+
+         string attr_name = Rosella.Xml.Parser.__parse_alphanumeric(xml, s, b, len);
+         eat_whitespace(s, b);
+
+         int c = get_next(s, b);
+         if (c != ASCII_EQUALS) {
+             // Bare attribute: the character belongs to whatever comes next.
+             unshift_int(s, c);
+             xtag.add_attribute(attr_name, "true");
+             eat_whitespace(s, b);
+         } else {
+             string attr_value;
+             c = get_next(s, b);
+             if (c == ASCII_SINGLE_QUOTE || c == ASCII_DOUBLE_QUOTE)
+                 attr_value = Rosella.Xml.Parser.__parse_quoted(c, xml, s, b, len);
+             else {
+                 // Unquoted value: read it as a run of name characters.
+                 unshift_int(s, c);
+                 attr_value = Rosella.Xml.Parser.__parse_alphanumeric(xml, s, b, len);
+             }
+             xtag.add_attribute(attr_name, attr_value);
+         }
+     }
+ }
+
+ // Read the longest run of name characters (as defined by is_name_char) at
+ // the current position and return it as a string; the result is empty if
+ // the next character is not a name character. The first character that is
+ // not part of the run is pushed back for the caller.
+ function __parse_alphanumeric(string xml, var s, var b, int len)
+ {
+     var sb = new 'StringBuilder';
+     int c = get_next(s, b);
+     // Test the character just read, not have_more_chars: once the final
+     // character has been read have_more_chars is false, which would drop
+     // that character from the result and push it back for the caller to
+     // trip over again. ASCII_NULL marks end of input.
+     while(c != ASCII_NULL && is_name_char(c)) {
+         push(sb, chr(c));
+         c = get_next(s, b);
+     }
+     // Never push the end-of-input marker back into the stream.
+     if (c != ASCII_NULL)
+         unshift_int(s, c);
+     string result = string(sb);
+     return result;
+ }
+
+ // Read a quoted value whose opening quote `q` has already been read, and
+ // return its contents as a string. The closing quote is consumed and not
+ // included in the result.
+ // Every character up to the matching quote is taken literally: XML has no
+ // escape character inside attribute values, so '/' must not be treated as
+ // one (doing so swallowed the closing quote of values ending in '/').
+ // Raises if input ends before the closing quote.
+ function __parse_quoted(int q, string xml, var s, var b, int len)
+ {
+     var sb = new 'StringBuilder';
+     int c = get_next(s, b);
+     while (c != q) {
+         // ASCII_NULL marks end of input; without this check an unterminated
+         // quote would loop forever.
+         if (c == ASCII_NULL)
+             Rosella.Xml.Parser.__error_unknown_char(c, "quoted string");
+         push(sb, chr(c));
+         c = get_next(s, b);
+     }
+     // Return a plain string, as the other __parse_* helpers do.
+     string result = string(sb);
+     return result;
+ }
+}
Oops, something went wrong.

0 comments on commit f68937d

Please sign in to comment.