Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.
Download ZIP
Browse files

[Xml] Break parsing out into a new file. Fix a bug where not having whitespace at the end was causing errors
  • Loading branch information...
commit f68937d43342eae94a872ba4e580f41a728452d6 1 parent 1b43e8d
@Whiteknight authored
View
1  setup.winxed
@@ -403,6 +403,7 @@ function setup_experimental_libraries(var rosella)
"xml/Includes",
"xml/Xml",
"xml/Document",
+ "xml/Parser",
"xml/Tag",
"xml/Text"
);
View
2  src/include/Parsing.winxed
@@ -29,7 +29,7 @@ inline eat_whitespace(var s, var b)
int c = ASCII_SPACE;
while(have_more_chars(s, b) && codepoint_is_whitespace(c))
c = get_next(s, b);
- if (have_more_chars(s, b))
+ if (c != ASCII_NULL && !codepoint_is_whitespace(c))
unshift_int(s, c);
}
View
307 src/unstable/xml/Parser.winxed
@@ -0,0 +1,307 @@
+namespace Rosella.Xml.Parser
+{
+ // Parse the document prologue: any sequence of the <?xml ... ?> declaration,
+ // <!-- comments --> and a <!DOCTYPE ...> header that appears before the
+ // first ordinary tag. Headers are added to `document`; prologue comments are
+ // parsed and currently discarded.
+ //
+ // Returns as soon as a '<' is followed by anything else, after pushing the
+ // consumed characters back onto `s` so the caller can re-read them.
+ function __parse_frontmatter(string xml, var s, var b, int len, var document)
+ {
+     while(have_more_chars(s, b)) {
+         eat_whitespace(s, b);
+         int c = get_next(s, b);
+         if (c != ASCII_LESS_THAN)
+             Rosella.Xml.Parser.__error_unknown_char(c, "XML frontmatter");
+         c = get_next(s, b);
+         if (c == ASCII_QUESTION_MARK) {
+             Rosella.Xml.Parser.__parse_xml_header(xml, s, b, len, document);
+             continue;
+         }
+         if (c == ASCII_EXCLAMATION_POINT) {
+             // Keep the peeked character in its own variable. It has not been
+             // consumed, so `c` must still hold the '!' for the push-back
+             // below when this is neither a comment nor a DOCTYPE.
+             int next = peek_next(s, b);
+             if (next == ASCII_DASH) {
+                 // TODO: We shouldn't just discard this
+                 Rosella.Xml.Parser.__parse_comment(xml, s, b, len);
+                 continue;
+             }
+             if (next == ASCII_D) {
+                 Rosella.Xml.Parser.__parse_dtd_header(xml, s, b, len, document);
+                 continue;
+             }
+         }
+
+         // If we don't have a <!-- comment, <?xml header or <!DOCTYPE, return and
+         // let the main loop parse it. Push back in reverse order so that '<'
+         // is read first, followed by the character that came after it.
+         unshift_int(s, c);
+         unshift_int(s, ASCII_LESS_THAN);
+         return;
+     }
+ }
+
+ // Main parse loop. Reads the prologue, then alternates between tags
+ // (anything starting with '<') and runs of text, building the tree under
+ // `document`.
+ //
+ // `tag_stack` holds the ancestors of `current_tag`; `current_tag` is the
+ // element currently open, or null when outside the root element.
+ // Returns `document`.
+ function __parse_xml(string xml, var s, var b, int len, var document)
+ {
+     var tag_stack = [];
+     var current_tag = null;
+
+     eat_whitespace(s, b);
+
+     Rosella.Xml.Parser.__parse_frontmatter(xml, s, b, len, document);
+
+     int c;
+     while(have_more_chars(s, b)) {
+         eat_whitespace(s, b);
+         if (!have_more_chars(s, b))
+             break;
+         c = get_next(s, b);
+         if (c == ASCII_LESS_THAN) {
+             :(var tag, int is_close, int contained, int is_comment) = Rosella.Xml.Parser.__parse_tag(xml, s, b, len);
+
+             // Comments also come back flagged as close/contained, so this
+             // check has to run before the other two.
+             if (is_comment) {
+                 // TODO: What do we do about comments appearing before and
+                 // after the root node?
+                 if (current_tag != null)
+                     current_tag.add_child(tag);
+                 continue;
+             }
+
+             // A self-contained tag: <foo />
+             // It is already closed, so it never becomes the current tag,
+             // not even when it is the root. Otherwise later nodes would be
+             // attached as children of a closed element.
+             if (contained) {
+                 if (current_tag != null)
+                     current_tag.add_child(tag);
+                 else
+                     document.set_root(tag);
+                 continue;
+             }
+
+             // A closing tag: </foo>
+             if (is_close) {
+                 if (current_tag == null)
+                     Rosella.Error.error("Syntax error. Found closing tag '%s' without corresponding open tag", tag.name);
+                 current_tag.set_end(tag);
+                 current_tag = tag_stack.pop();
+                 continue;
+             }
+
+             // An opening tag: <foo ...>
+             if (current_tag != null)
+                 current_tag.add_child(tag);
+             else
+                 document.set_root(tag);
+             // The parent (null for the root) is pushed so the matching
+             // close tag can restore it.
+             push(tag_stack, current_tag);
+             current_tag = tag;
+             continue;
+         }
+
+         // Anything else is text: accumulate up to the next '<'.
+         var sb = new 'StringBuilder';
+         while(have_more_chars(s, b) && c != ASCII_LESS_THAN) {
+             push(sb, chr(c));
+             c = get_next(s, b);
+         }
+
+         // Input ended while reading text; the pending text is not stored.
+         if (!have_more_chars(s, b))
+             break;
+
+         // Give the '<' back so the next iteration parses it as a tag.
+         unshift_int(s, c);
+         string str = string(sb);
+         if (current_tag != null)
+             current_tag.add_child_text(str);
+     }
+     return document;
+ }
+
+ // TODO: If available, read the encoding attribute and compare to the
+ // encoding that the string is being read as.
+ // Parse the remainder of an XML declaration once "<?" has been consumed:
+ // the name "xml", its attributes, and the closing "?>". On success the
+ // resulting XmlHeader tag is added to `document`.
+ function __parse_xml_header(string xml, var s, var b, int len, var document)
+ {
+     eat_whitespace(s, b);
+     string name = Rosella.Xml.Parser.__parse_alphanumeric(xml, s, b, len);
+     if (name != "xml")
+         Rosella.Error.error("Malformed XML header. Should start with '<?xml ... ?>'");
+
+     var header = new Rosella.Xml.Tag.XmlHeader();
+     eat_whitespace(s, b);
+     Rosella.Xml.Parser.__parse_attributes(xml, s, b, len, header);
+     eat_whitespace(s, b);
+
+     // The declaration must end with exactly "?>".
+     int first = get_next(s, b);
+     if (first != ASCII_QUESTION_MARK) {
+         Rosella.Xml.Parser.__error_unknown_char(first, "XML Header");
+         return;
+     }
+
+     int second = get_next(s, b);
+     if (second != ASCII_GREATER_THAN) {
+         // Put the offending character back before reporting it.
+         unshift_int(s, second);
+         Rosella.Xml.Parser.__error_unknown_char(second, "XML Header");
+         return;
+     }
+
+     document.add_header(header);
+ }
+
+ // Parse a DOCTYPE header once "<!" has been consumed: the keyword DOCTYPE,
+ // the document type name, an optional scope word, an optional quoted
+ // filename, and the closing '>'. The resulting DtdHeader is added to
+ // `document`. Inline "[ ... ]" definitions are not supported yet.
+ //
+ // NOTE(review): only one quoted literal is read after the scope word. The
+ // XML PUBLIC form carries two (public id, then system id), which would be
+ // rejected at the '>' check -- confirm how DtdHeader should store both.
+ function __parse_dtd_header(string xml, var s, var b, int len, var document)
+ {
+     string tagname = Rosella.Xml.Parser.__parse_alphanumeric(xml, s, b, len);
+     if (tagname != "DOCTYPE")
+         Rosella.Error.error("Invalid DOCTYPE tag");
+     eat_whitespace(s, b);
+
+     string doctype = Rosella.Xml.Parser.__parse_alphanumeric(xml, s, b, len);
+     eat_whitespace(s, b);
+
+     // Should be one of SYSTEM, PUBLIC, etc
+     string dtd_scope = Rosella.Xml.Parser.__parse_alphanumeric(xml, s, b, len);
+
+     var dtd_header = new Rosella.Xml.Tag.DtdHeader(doctype, dtd_scope);
+
+     eat_whitespace(s, b);
+     int c = get_next(s, b);
+     if (c == ASCII_SINGLE_QUOTE || c == ASCII_DOUBLE_QUOTE) {
+         string filename = Rosella.Xml.Parser.__parse_quoted(c, xml, s, b, len);
+         dtd_header.set_filename(filename);
+     } else if (c == ASCII_OPEN_BRACKET) {
+         // TODO: Parse the inline DTD definitions. Add each as a child to
+         // the dtd_header
+         Rosella.Error.not_implemented("Cannot parse inline DOCTYPE yet!");
+     } else {
+         // Neither a filename nor an inline subset (e.g. <!DOCTYPE html>).
+         // Push the character back so the '>' check below reads it instead
+         // of skipping past it.
+         unshift_int(s, c);
+     }
+     eat_whitespace(s, b);
+     c = get_next(s, b);
+     if (c != ASCII_GREATER_THAN)
+         Rosella.Xml.Parser.__error_unknown_char(c, "DOCTYPE header");
+     document.add_header(dtd_header);
+ }
+
+ // Parse one tag after its leading '<' has been consumed.
+ // Returns four values: the tag object, is_close, contained (self-closing)
+ // and is_comment. A "<!" is delegated to __parse_comment.
+ function __parse_tag(string xml, var s, var b, int len)
+ {
+ // A "prefix:name" tag is split below: the part before ':' becomes the
+ // namespace and the part after it becomes the tag name.
+ eat_whitespace(s, b);
+ int c = get_next(s, b);
+ if (c == ASCII_EXCLAMATION_POINT)
+ return Rosella.Xml.Parser.__parse_comment(xml, s, b, len);
+ int is_close_tag = false;
+
+ // A leading '/' marks a closing tag; skip past it.
+ if (c == ASCII_SLASH) {
+ c = get_next(s, b);
+ is_close_tag = true;
+ }
+ // Push the first name character back so __parse_alphanumeric reads the whole name.
+ unshift_int(s, c);
+ string ns = "";
+ string tag_name = Rosella.Xml.Parser.__parse_alphanumeric(xml, s, b, len);
+
+ eat_whitespace(s, b);
+ if (peek_next(s, b) == ASCII_COLON) {
+ // Consume the ':'; what was read so far is the namespace prefix.
+ get_next(s, b);
+ eat_whitespace(s, b);
+ ns = tag_name;
+ tag_name = Rosella.Xml.Parser.__parse_alphanumeric(xml, s, b, len);
+ }
+
+ // Closing tags are created with a '/' prepended to their name.
+ if (is_close_tag)
+ tag_name = "/" + tag_name;
+ var xtag = new Rosella.Xml.Tag(ns, tag_name);
+
+ eat_whitespace(s, b);
+ // Attributes are only read for non-closing tags.
+ if (!is_close_tag)
+ Rosella.Xml.Parser.__parse_attributes(xml, s, b, len, xtag);
+
+ c = get_next(s, b);
+ // '>' ends an opening or closing tag.
+ if (c == ASCII_GREATER_THAN)
+ return xtag, is_close_tag, false, false;
+
+ // "/>" ends a self-contained tag.
+ if (c == ASCII_SLASH) {
+ int next = get_next(s, b);
+ if (next == ASCII_GREATER_THAN)
+ return xtag, false, true, false;
+ else
+ Rosella.Error.error("Syntax error in tag %s", tag_name);
+ }
+
+ // Any other character at this point is reported as an error.
+ Rosella.Xml.Parser.__error_unknown_char(c, "tag " + tag_name);
+ }
+
+ // Report a parse error for character `c` met while parsing `context`.
+ // ASCII_NULL is reported as an unexpected end of input; any other value
+ // is reported as an unexpected token.
+ function __error_unknown_char(int c, string context)
+ {
+ if (c == ASCII_NULL)
+ Rosella.Error.error("Unexpected end of input while parsing %s", context);
+ // NOTE(review): presumably Rosella.Error.error throws, so this line is
+ // only reached when c is not ASCII_NULL -- confirm.
+ Rosella.Error.error("Unexpected token '%s' in %s", chr(c), context);
+ }
+
+ // Parse a comment once "<!" has been consumed: expects "--", then collects
+ // text up to the terminating "-->".
+ // Returns four values: a Rosella.Xml.Tag.Comment holding the text, followed
+ // by true, true, true (the caller in __parse_xml reads these as is_close,
+ // contained and is_comment).
+ function __parse_comment(string xml, var s, var b, int len)
+ {
+ if (get_next(s, b) != ASCII_DASH || get_next(s, b) != ASCII_DASH)
+ Rosella.Error.error("Malformed comment");
+
+ var sb = new 'StringBuilder';
+ while (have_more_chars(s, b)) {
+ int c = get_next(s, b);
+
+ if (c == ASCII_DASH) {
+ // Look ahead up to two characters for the "-->" terminator.
+ int d = get_next(s, b);
+ if (d == ASCII_DASH) {
+ int e = get_next(s, b);
+ if (e == ASCII_GREATER_THAN) {
+ string text = string(sb);
+ var comment = new Rosella.Xml.Tag.Comment(text);
+ return comment, true, true, true;
+ }
+ // Not a terminator: push the look-ahead back, last-read first,
+ // so the characters are re-read in their original order.
+ unshift_int(s, e);
+ }
+ unshift_int(s, d);
+ }
+ // c belongs to the comment text (including a '-' that did not start "-->").
+ push(sb, chr(c));
+ }
+ Rosella.Error.error("Unterminated comment at end of document");
+ }
+
+ // Read attributes from the input and add them to `xtag` until the next
+ // non-whitespace character is not a name character. Supported forms:
+ //   name            stored with the value "true"
+ //   name="value"    single or double quotes
+ //   name=value      bare value read with __parse_alphanumeric
+ function __parse_attributes(string xml, var s, var b, int len, var xtag)
+ {
+     while (have_more_chars(s, b)) {
+         eat_whitespace(s, b);
+         int ahead = peek_next(s, b);
+         if (!is_name_char(ahead))
+             break;
+
+         string name = Rosella.Xml.Parser.__parse_alphanumeric(xml, s, b, len);
+         eat_whitespace(s, b);
+
+         int sep = get_next(s, b);
+         if (sep == ASCII_EQUALS) {
+             string value;
+             int first = get_next(s, b);
+             if (first == ASCII_SINGLE_QUOTE || first == ASCII_DOUBLE_QUOTE)
+                 value = Rosella.Xml.Parser.__parse_quoted(first, xml, s, b, len);
+             else {
+                 // Bare value: give its first character back to the reader.
+                 unshift_int(s, first);
+                 value = Rosella.Xml.Parser.__parse_alphanumeric(xml, s, b, len);
+             }
+             xtag.add_attribute(name, value);
+         } else {
+             // No '=': a valueless attribute. The character read belongs to
+             // whatever follows, so push it back.
+             unshift_int(s, sep);
+             xtag.add_attribute(name, "true");
+             eat_whitespace(s, b);
+         }
+     }
+ }
+
+ // Read a run of name characters (as decided by is_name_char) and return it
+ // as a string. The first character that is not part of the run is pushed
+ // back onto `s`. Returns "" when the next character is not a name character.
+ function __parse_alphanumeric(string xml, var s, var b, int len)
+ {
+     var sb = new 'StringBuilder';
+     int c = get_next(s, b);
+     while(have_more_chars(s, b) && is_name_char(c)) {
+         push(sb, chr(c));
+         c = get_next(s, b);
+     }
+     // c ended the run; return it to the input for the caller to read.
+     unshift_int(s, c);
+     string result = string(sb);
+     return result;
+ }
+
+ // Read a quoted value whose opening quote `q` has already been consumed.
+ // Collects every character up to the matching closing quote, consumes
+ // that quote, and returns the collected text as a string.
+ //
+ // XML has no escape character inside quoted values (special characters
+ // are written as entity references), so '/' is taken literally. A value
+ // such as "http://example.org/" therefore ends at its closing quote.
+ // Reaching end of input before the closing quote is reported as an error
+ // instead of looping forever.
+ function __parse_quoted(int q, string xml, var s, var b, int len)
+ {
+     var sb = new 'StringBuilder';
+     int c = get_next(s, b);
+     while (c != q) {
+         if (c == ASCII_NULL)
+             Rosella.Xml.Parser.__error_unknown_char(c, "quoted string");
+         push(sb, chr(c));
+         c = get_next(s, b);
+     }
+     string result = string(sb);
+     return result;
+ }
+}
View
306 src/unstable/xml/Xml.winxed
@@ -19,310 +19,6 @@ namespace Rosella.Xml
int len = length(xml);
var b = get_iterator(var(xml));
int s[] = [];
- return __parse_xml(xml, s, b, len, document);
- }
-
- function __parse_frontmatter(string xml, var s, var b, int len, var document)
- {
- while(have_more_chars(s, b)) {
- eat_whitespace(s, b);
- int c = get_next(s, b);
- if (c != ASCII_LESS_THAN)
- Rosella.Xml.__error_unknown_char(c, "XML frontmatter");
- c = get_next(s, b);
- if (c == ASCII_QUESTION_MARK) {
- Rosella.Xml.__parse_xml_header(xml, s, b, len, document);
- continue;
- }
- if (c == ASCII_EXCLAMATION_POINT) {
- c = peek_next(s, b);
- if (c == ASCII_DASH) {
- // TODO: We shouldn't just discard this
- Rosella.Xml.__parse_comment(xml, s, b, len);
- continue;
- }
- if (c == ASCII_D) {
- Rosella.Xml.__parse_dtd_header(xml, s, b, len, document);
- continue;
- }
- }
-
- // If we don't have a <!-- comment, <?xml header or <!DOCTYPE, return and
- // let the main loop parse it.
- unshift_int(s, c);
- unshift_int(s, ASCII_LESS_THAN);
- return;
- }
- }
-
- function __parse_xml(string xml, var s, var b, int len, var document)
- {
- var tag_stack = [];
- var current_tag = null;
-
- eat_whitespace(s, b);
-
- Rosella.Xml.__parse_frontmatter(xml, s, b, len, document);
-
- int c;
- while(have_more_chars(s, b)) {
- eat_whitespace(s, b);
- if (!have_more_chars(s, b))
- break;
- c = get_next(s, b);
- if (c == ASCII_LESS_THAN) {
- :(var tag, int is_close, int contained, int is_comment) = Rosella.Xml.__parse_tag(xml, s, b, len);
-
- if (is_comment) {
- // TODO: What do we do about comments appearing before and
- // after the root node?
- if (current_tag != null)
- current_tag.add_child(tag);
- continue;
- }
-
- // A self-contained tag: <foo />
- if (contained) {
- if (current_tag != null)
- current_tag.add_child(tag);
- else {
- current_tag = tag;
- document.set_root(tag);
- }
- continue;
- }
-
- // A closing tag: </foo>
- if (is_close) {
- if (current_tag == null)
- Rosella.Error.error("Syntax error. Found closing tag '%s' without corresponding open tag", tag.name);
- current_tag.set_end(tag);
- current_tag = tag_stack.pop();
- continue;
- }
-
- // An opening tag: <foo ...>
- if (current_tag != null)
- current_tag.add_child(tag);
- else
- document.set_root(tag);
- push(tag_stack, current_tag);
- current_tag = tag;
- continue;
- }
- var sb = new 'StringBuilder';
- while(have_more_chars(s, b) && c != ASCII_LESS_THAN) {
- push(sb, chr(c));
- c = get_next(s, b);
- }
-
- if (!have_more_chars(s, b))
- break;
-
- unshift_int(s, c);
- string str = string(sb);
- if (current_tag != null)
- current_tag.add_child_text(str);
- Rosella.IO.sayf("Done reading text after tag");
- }
- return document;
- }
-
- // TODO: If available, read the encoding attribute and compare to the
- // encoding that the string is being read as.
- function __parse_xml_header(string xml, var s, var b, int len, var document)
- {
- eat_whitespace(s, b);
- string tag_name = Rosella.Xml.__parse_alphanumeric(xml, s, b, len);
- if (tag_name != "xml")
- Rosella.Error.error("Malformed XML header. Should start with '<?xml ... ?>'");
-
- var header_tag = new Rosella.Xml.Tag.XmlHeader();
- eat_whitespace(s, b);
- Rosella.Xml.__parse_attributes(xml, s, b, len, header_tag);
- eat_whitespace(s, b);
- int c = get_next(s, b);
- if (c == ASCII_QUESTION_MARK) {
- c = get_next(s, b);
- if (c == ASCII_GREATER_THAN) {
- document.add_header(header_tag);
- return;
- }
- unshift_int(s, c);
- }
- Rosella.Xml.__error_unknown_char(c, "XML Header");
- }
-
- function __parse_dtd_header(string xml, var s, var b, int len, var document)
- {
- string tagname = Rosella.Xml.__parse_alphanumeric(xml, s, b, len);
- if (tagname != "DOCTYPE")
- Rosella.Error.error("Invalid DOCTYPE tag");
- eat_whitespace(s, b);
-
- string doctype = Rosella.Xml.__parse_alphanumeric(xml, s, b, len);
- eat_whitespace(s, b);
-
- // Should be one of SYSTEM, PUBLIC, etc
- string dtd_scope = Rosella.Xml.__parse_alphanumeric(xml, s, b, len);
-
- var dtd_header = new Rosella.Xml.Tag.DtdHeader(doctype, dtd_scope);
-
- eat_whitespace(s, b);
- int c = get_next(s, b);
- if (c == ASCII_SINGLE_QUOTE || c == ASCII_DOUBLE_QUOTE) {
- string filename = Rosella.Xml.__parse_quoted(c, xml, s, b, len);
- dtd_header.set_filename(filename);
- } else if (c == ASCII_OPEN_BRACKET) {
- // TODO: Parse the inline DTD definitions. Add each as a child to
- // the dtd_header
- Rosella.Error.not_implemented("Cannot parse inline DOCTYPE yet!");
- }
- eat_whitespace(s, b);
- c = get_next(s, b);
- if (c != ASCII_GREATER_THAN)
- Rosella.Xml.__error_unknown_char(c, "DOCTYPE header");
- document.add_header(dtd_header);
- }
-
- function __parse_tag(string xml, var s, var b, int len)
- {
- // TODO: Namespaces (<foo:bar ...> </foo:bar>)
- eat_whitespace(s, b);
- int c = get_next(s, b);
- if (c == ASCII_EXCLAMATION_POINT)
- return Rosella.Xml.__parse_comment(xml, s, b, len);
- int is_close_tag = false;
-
- if (c == ASCII_SLASH) {
- c = get_next(s, b);
- is_close_tag = true;
- }
- unshift_int(s, c);
- string ns = "";
- string tag_name = Rosella.Xml.__parse_alphanumeric(xml, s, b, len);
- eat_whitespace(s, b);
- c = peek_next(s, b);
- if (c == ASCII_COLON) {
- get_next(s, b);
- eat_whitespace(s, b);
- ns = tag_name;
- tag_name = Rosella.Xml.__parse_alphanumeric(xml, s, b, len);
- }
-
- //if (is_close_tag)
- //tag_name = "/" + tag_name;
- var xtag = new Rosella.Xml.Tag(ns, tag_name);
-
- eat_whitespace(s, b);
- if (!is_close_tag)
- Rosella.Xml.__parse_attributes(xml, s, b, len, xtag);
-
- c = get_next(s, b);
- if (c == ASCII_GREATER_THAN)
- return xtag, is_close_tag, false, false;
-
- if (c == ASCII_SLASH) {
- int next = get_next(s, b);
- if (next == ASCII_GREATER_THAN)
- return xtag, false, true, false;
- else
- Rosella.Error.error("Syntax error in tag %s", tag_name);
- }
-
- Rosella.Xml.__error_unknown_char(c, "tag " + tag_name);
- }
-
- function __error_unknown_char(int c, string context)
- {
- if (c == ASCII_NULL)
- Rosella.Error.error("Unexpected end of input while parsing %s", context);
- Rosella.Error.error("Unexpected token '%s' in %s", chr(c), context);
- }
-
- function __parse_comment(string xml, var s, var b, int len)
- {
- if (get_next(s, b) != ASCII_DASH || get_next(s, b) != ASCII_DASH)
- Rosella.Error.error("Malformed comment");
-
- var sb = new 'StringBuilder';
- while (have_more_chars(s, b)) {
- int c = get_next(s, b);
-
- if (c == ASCII_DASH) {
- int d = get_next(s, b);
- if (d == ASCII_DASH) {
- int e = get_next(s, b);
- if (e == ASCII_GREATER_THAN) {
- string text = string(sb);
- var comment = new Rosella.Xml.Tag.Comment(text);
- return comment, true, true, true;
- }
- unshift_int(s, e);
- }
- unshift_int(s, d);
- }
- push(sb, chr(c));
- }
- Rosella.Error.error("Unterminated comment at end of document");
- }
-
- function __parse_attributes(string xml, var s, var b, int len, var xtag)
- {
- while (have_more_chars(s, b)) {
- eat_whitespace(s, b);
- int c = peek_next(s, b);
- if (!is_name_char(c))
- break;
-
- string attr_name = Rosella.Xml.__parse_alphanumeric(xml, s, b, len);
- eat_whitespace(s, b);
-
- c = get_next(s, b);
- if (c != ASCII_EQUALS) {
- unshift_int(s, c);
- xtag.add_attribute(attr_name, "true");
- eat_whitespace(s, b);
- continue;
- }
-
- c = get_next(s, b);
- if (c == ASCII_SINGLE_QUOTE || c == ASCII_DOUBLE_QUOTE) {
- string attr_value = Rosella.Xml.__parse_quoted(c, xml, s, b, len);
- xtag.add_attribute(attr_name, attr_value);
- } else {
- unshift_int(s, c);
- string attr_value = Rosella.Xml.__parse_alphanumeric(xml, s, b, len);
- xtag.add_attribute(attr_name, attr_value);
- }
- }
- }
-
- function __parse_alphanumeric(string xml, var s, var b, int len)
- {
- var sb = new 'StringBuilder';
- int c = get_next(s, b);
- while(have_more_chars(s, b) && is_name_char(c)) {
- push(sb, chr(c));
- c = get_next(s, b);
- }
- unshift_int(s, c);
- string result = string(sb);
- return result;
- }
-
- function __parse_quoted(int q, string xml, var s, var b, int len)
- {
- var sb = new 'StringBuilder';
- int c = get_next(s, b);
- while (c != q) {
- push(sb, chr(c));
- if (c == ASCII_SLASH) {
- c = get_next(s, b);
- push(sb, chr(c));
- }
- c = get_next(s, b);
- }
- return sb;
+ return Rosella.Xml.Parser.__parse_xml(xml, s, b, len, document);
}
}
Please sign in to comment.
Something went wrong with that request. Please try again.