From 389c66c834637dee3eeb85a8e82510c353e94ef4 Mon Sep 17 00:00:00 2001 From: Whiteknight Date: Fri, 6 Apr 2012 21:22:52 -0400 Subject: [PATCH] [Parse] Add a new Parse library, to hold some common parsing-related routines. Factor out common code from Xml and Json --- setup.winxed | 12 ++- src/include/Parse.winxed | 11 +++ .../{Parsing.winxed => Parse_builtins.winxed} | 0 src/unstable/json/Parser.winxed | 81 +--------------- src/unstable/net/Uri.winxed | 10 +- src/unstable/parse/Parse.winxed | 97 +++++++++++++++++++ src/unstable/xml/Parser.winxed | 53 +++------- 7 files changed, 140 insertions(+), 124 deletions(-) create mode 100644 src/include/Parse.winxed rename src/include/{Parsing.winxed => Parse_builtins.winxed} (100%) create mode 100644 src/unstable/parse/Parse.winxed diff --git a/setup.winxed b/setup.winxed index bdc00fd9..1b97e318 100644 --- a/setup.winxed +++ b/setup.winxed @@ -270,7 +270,7 @@ function setup_stable_libraries(var rosella) ); // Random number generation and tools - setup_winxed_lib(rosella, "random", ["Math_Builtins", "String"], + setup_winxed_lib(rosella, "random", ["String", "Math_Builtins"], "random/Includes", "random/Random", "random/RandomNumber", @@ -380,7 +380,7 @@ function setup_experimental_libraries(var rosella) "genetic/mutator/Generic" ); - setup_unstable_lib(rosella, "net", ["Core", "Ascii", "Math_Builtins", "String", "FileSystem", "Date", "Random"], + setup_unstable_lib(rosella, "net", ["Core", "Math_Builtins", "Ascii", "Parse_builtins", "Parse", "String", "FileSystem", "Date", "Random"], "net/Includes", "net/Net", "net/Http", @@ -399,7 +399,11 @@ function setup_experimental_libraries(var rosella) "net/SocketFactory" ); - setup_unstable_lib(rosella, "xml", ["Core", "Ascii", "Parsing", "String", "FileSystem"], + setup_unstable_lib(rosella, "parse", ["Core", "Ascii", "Parse_builtins"], + "parse/Parse" + ); + + setup_unstable_lib(rosella, "xml", ["Core", "Ascii", "Parse_builtins", "Parse", "String", "FileSystem"], "xml/Includes", "xml/Xml", "xml/Document", @@ -408,7 +412,7 @@ function setup_experimental_libraries(var rosella) "xml/Text" ); - setup_unstable_lib(rosella, "json", ["Core", "Ascii", "Parsing", "Dumper"], + setup_unstable_lib(rosella, "json", ["Core", "Ascii", "Parse_builtins", "Parse", "Dumper"], "json/Includes", "json/Json", "json/Dumper", diff --git a/src/include/Parse.winxed b/src/include/Parse.winxed new file mode 100644 index 00000000..8879a9b3 --- /dev/null +++ b/src/include/Parse.winxed @@ -0,0 +1,11 @@ +namespace Rosella.Parse { + extern function parse_string; + extern function parse_alphanumeric; + extern function parse_number; + extern function error_unknown_char; +} + +function __include_parse [anon,init,load] () +{ + Rosella.load_bytecode_file('rosella/parse.pbc', 'load'); +} diff --git a/src/include/Parsing.winxed b/src/include/Parse_builtins.winxed similarity index 100% rename from src/include/Parsing.winxed rename to src/include/Parse_builtins.winxed diff --git a/src/unstable/json/Parser.winxed b/src/unstable/json/Parser.winxed index 35e2be80..8d4bf035 100644 --- a/src/unstable/json/Parser.winxed +++ b/src/unstable/json/Parser.winxed @@ -13,9 +13,9 @@ namespace Rosella.Json.Parser if (c == ASCII_OPEN_BRACKET) return __parse_array(json, s, b, len); if (c == ASCII_SINGLE_QUOTE || c == ASCII_DOUBLE_QUOTE) - return __parse_string(c, json, s, b, len); + return Rosella.Parse.parse_string(c, json, s, b, len); if (codepoint_is_digit(c)) - return __parse_number(c, json, s, b, len); + return Rosella.Parse.parse_number(c, json, s, b, len); if (c == ASCII_t) { if (get_next(s, b) == ASCII_r && get_next(s, b) == ASCII_u && @@ -50,7 +50,7 @@ namespace Rosella.Json.Parser eat_whitespace(s, b); int c = get_next(s, b); if (c == ASCII_SINGLE_QUOTE || c == ASCII_DOUBLE_QUOTE) { - string str = __parse_string(c, json, s, b, len); + string str = Rosella.Parse.parse_string(c, json, s, b, len); eat_whitespace(s, b); c = get_next(s, b); if (c != ASCII_COLON) @@ -66,7 +66,7 @@ namespace Rosella.Json.Parser } else if (codepoint_is_alpha(c)) { unshift_int(s, c); - string str = __parse_alphanumeric(json, s, b, len); + string str = Rosella.Parse.parse_alphanumeric(json, s, b, len); eat_whitespace(s, b); c = get_next(s, b); if (c != ASCII_COLON) @@ -110,77 +110,4 @@ namespace Rosella.Json.Parser } return a; } - - function __parse_string(int q, string json, var s, var b, int len) - { - var sb = new 'StringBuilder'; - while(have_more_chars(s, b)) { - int c = get_next(s, b); - if (c == ASCII_BACKSLASH) { - push(sb, "\\"); - c = get_next(s, b); - push(sb, chr(c)); - continue; - } - if (c == q) - break; - push(sb, chr(c)); - } - return sb; - } - - function __parse_alphanumeric(string xml, var s, var b, int len) - { - var sb = new 'StringBuilder'; - while(have_more_chars(s, b)) { - int c = get_next(s, b); - if (!codepoint_is_alphanumeric(c)) { - unshift_int(s, c); - break; - } - push(sb, chr(c)); - } - string result = string(sb); - return result; - } - - function __parse_number(int c, string json, var s, var b, int len) - { - int have_e = false; - int have_dot = false; - int have_sign = false; - var sb = new 'StringBuilder'; - push(sb, codepoint_to_string(c)); - while (have_more_chars(s, b)) { - int d = get_next(s, b); - if (!have_sign && (d == ASCII_PLUS|| d == ASCII_DASH)) { - have_sign = true; - push(sb, codepoint_to_string(d)); - } - else if (codepoint_is_digit(d)) { - push(sb, codepoint_to_string(d)); - continue; - } - else if (!have_e && (d == ASCII_e || d == ASCII_E)) { - have_e = true; - have_sign = false; - push(sb, 'e'); - } - else if (!have_e && !have_dot && d == ASCII_PERIOD) { - have_dot = true; - push(sb, '.'); - } - else { - unshift_int(s, d); - break; - } - } - if (have_dot) { - float f_value = float(string(sb)); - return f_value; - } else { - int i_value = int(string(sb)); - return i_value; - } - } } diff --git a/src/unstable/net/Uri.winxed b/src/unstable/net/Uri.winxed index b37dc81e..816540d0 100644 --- a/src/unstable/net/Uri.winxed +++ b/src/unstable/net/Uri.winxed @@ -79,13 +79,11 @@ class Rosella.Net.Uri // Get the Path and Query function path_query() { return self.parts["Path_Query"]; } - // TODO: This // Get the Query String - function query_string() { } + function query_string() { return self.parts["Query_String"]; } - // TODO: This - // Get the anchor - function anchor() { } + // Get the fragment/anchor + function fragment() { return self.parts["Fragment"]; } /* Private Helper Methods */ @@ -101,7 +99,7 @@ class Rosella.Net.Uri "Port": "", "Path_Query": "", "Query_String": "", - "Anchor": "" + "Fragment": "" }; } diff --git a/src/unstable/parse/Parse.winxed b/src/unstable/parse/Parse.winxed new file mode 100644 index 00000000..4f92a276 --- /dev/null +++ b/src/unstable/parse/Parse.winxed @@ -0,0 +1,97 @@ +namespace Rosella.Parse +{ + function parse_quoted(int q, string xml, var s, var b, int len) + { + var sb = new 'StringBuilder'; + while (have_more_chars(s, b)) { + int c = get_next(s, b); + if (c == q) + break; + if (c == ASCII_BACKSLASH) + c = get_next(s, b); + push(sb, chr(c)); + } + return sb; + } + + function parse_string(int q, string _str, var s, var b, int len) + { + var sb = new 'StringBuilder'; + while(have_more_chars(s, b)) { + int c = get_next(s, b); + if (c == ASCII_BACKSLASH) { + // TODO: Deal with common C-style escapes + push(sb, "\\"); + c = get_next(s, b); + push(sb, chr(c)); + continue; + } + if (c == q) + break; + push(sb, chr(c)); + } + return sb; + } + + function parse_alphanumeric(string _str, var s, var b, int len) + { + var sb = new 'StringBuilder'; + while(have_more_chars(s, b)) { + int c = get_next(s, b); + if (!codepoint_is_alphanumeric(c)) { + unshift_int(s, c); + break; + } + push(sb, chr(c)); + } + string result = string(sb); + return result; + } + + function parse_number(int c, string _str, var s, var b, int len) + { + int have_e = false; + int have_dot = false; + int have_sign = false; + var sb = new 'StringBuilder'; + push(sb, codepoint_to_string(c)); + while (have_more_chars(s, b)) { + int d = get_next(s, b); + if (!have_sign && (d == ASCII_PLUS|| d == ASCII_DASH)) { + have_sign = true; + push(sb, codepoint_to_string(d)); + } + else if (codepoint_is_digit(d)) { + push(sb, codepoint_to_string(d)); + continue; + } + else if (!have_e && (d == ASCII_e || d == ASCII_E)) { + have_e = true; + have_sign = false; + push(sb, 'e'); + } + else if (!have_e && !have_dot && d == ASCII_PERIOD) { + have_dot = true; + push(sb, '.'); + } + else { + unshift_int(s, d); + break; + } + } + if (have_dot) { + float f_value = float(string(sb)); + return f_value; + } else { + int i_value = int(string(sb)); + return i_value; + } + } + + function error_unknown_char(int c, string context, int pos) + { + if (c == ASCII_NULL) + Rosella.Error.error("Unexpected end of input while parsing %s at position %d", context, pos); + Rosella.Error.error("Unexpected token '%s' in %s at position %d", chr(c), context, pos); + } +} diff --git a/src/unstable/xml/Parser.winxed b/src/unstable/xml/Parser.winxed index f4ecf12f..6f919097 100644 --- a/src/unstable/xml/Parser.winxed +++ b/src/unstable/xml/Parser.winxed @@ -6,7 +6,7 @@ namespace Rosella.Xml.Parser var current_tag = new Rosella.Xml.Tag.DocumentRoot(); var root = current_tag; - Rosella.Xml.Parser.__parse_frontmatter(xml, s, b, len, document); + Rosella.Xml.Parser.__parse_frontmatter(xml, s, b, len, document, current_tag); int c; while(have_more_chars(s, b)) { @@ -66,13 +66,13 @@ namespace Rosella.Xml.Parser return document; } - function __parse_frontmatter(string xml, var s, var b, int len, var document) + function __parse_frontmatter(string xml, var s, var b, int len, var document, var root) { while(have_more_chars(s, b)) { eat_whitespace(s, b); int c = get_next(s, b); if (c != ASCII_LESS_THAN) - Rosella.Xml.Parser.__error_unknown_char(c, "XML frontmatter"); + Rosella.Parse.error_unknown_char(c, "XML frontmatter", current_position(len, s, b)); c = get_next(s, b); if (c == ASCII_QUESTION_MARK) { Rosella.Xml.Parser.__parse_xml_header(xml, s, b, len, document); @@ -81,8 +81,8 @@ namespace Rosella.Xml.Parser if (c == ASCII_EXCLAMATION_POINT) { c = peek_next(s, b); if (c == ASCII_DASH) { - // TODO: We shouldn't just discard this - Rosella.Xml.Parser.__parse_comment(xml, s, b, len); + var c = Rosella.Xml.Parser.__parse_comment(xml, s, b, len); + root.add_child(c); continue; } if (c == ASCII_D) { @@ -105,7 +105,7 @@ namespace Rosella.Xml.Parser { eat_whitespace(s, b); int pos = current_position(len, s, b); - string tag_name = Rosella.Xml.Parser.__parse_alphanumeric(xml, s, b, len); + string tag_name = Rosella.Xml.Parser.__parse_ident(xml, s, b, len); if (tag_name != "xml") Rosella.Error.error("Malformed XML header. Should start with '' at position %d", current_position(len, s, b)); @@ -128,23 +128,23 @@ namespace Rosella.Xml.Parser function __parse_dtd_header(string xml, var s, var b, int len, var document) { int pos = current_position(len, s, b); - string tagname = Rosella.Xml.Parser.__parse_alphanumeric(xml, s, b, len); + string tagname = Rosella.Xml.Parser.__parse_ident(xml, s, b, len); if (tagname != "DOCTYPE") Rosella.Error.error("Invalid DOCTYPE tag"); eat_whitespace(s, b); - string doctype = Rosella.Xml.Parser.__parse_alphanumeric(xml, s, b, len); + string doctype = Rosella.Xml.Parser.__parse_ident(xml, s, b, len); eat_whitespace(s, b); // Should be one of SYSTEM, PUBLIC, etc - string dtd_scope = Rosella.Xml.Parser.__parse_alphanumeric(xml, s, b, len); + string dtd_scope = Rosella.Xml.Parser.__parse_ident(xml, s, b, len); var dtd_header = new Rosella.Xml.Tag.DtdHeader(pos, doctype, dtd_scope); eat_whitespace(s, b); int c = get_next(s, b); if (c == ASCII_SINGLE_QUOTE || c == ASCII_DOUBLE_QUOTE) { - string filename = Rosella.Xml.Parser.__parse_quoted(c, xml, s, b, len); + string filename = Rosella.Parse.parse_quoted(c, xml, s, b, len); dtd_header.set_filename(filename); } else if (c == ASCII_OPEN_BRACKET) { // TODO: Parse the inline DTD definitions. Add each as a child to @@ -174,14 +174,14 @@ namespace Rosella.Xml.Parser } unshift_int(s, c); string ns = ""; - string tag_name = Rosella.Xml.Parser.__parse_alphanumeric(xml, s, b, len); + string tag_name = Rosella.Xml.Parser.__parse_ident(xml, s, b, len); eat_whitespace(s, b); if (peek_next(s, b) == ASCII_COLON) { get_next(s, b); eat_whitespace(s, b); ns = tag_name; - tag_name = Rosella.Xml.Parser.__parse_alphanumeric(xml, s, b, len); + tag_name = Rosella.Xml.Parser.__parse_ident(xml, s, b, len); } var xtag; @@ -209,13 +209,6 @@ namespace Rosella.Xml.Parser Rosella.Xml.Parser.__error_unknown_char(c, "tag " + tag_name); } - function __error_unknown_char(int c, string context, int pos) - { - if (c == ASCII_NULL) - Rosella.Error.error("Unexpected end of input while parsing %s at position %d", context, pos); - Rosella.Error.error("Unexpected token '%s' in %s at position %d", chr(c), context, pos); - } - function __parse_comment(string xml, var s, var b, int len) { int pos = current_position(len, s, b); @@ -252,7 +245,7 @@ namespace Rosella.Xml.Parser if (!is_name_char(c)) break; - string attr_name = Rosella.Xml.Parser.__parse_alphanumeric(xml, s, b, len); + string attr_name = Rosella.Xml.Parser.__parse_ident(xml, s, b, len); eat_whitespace(s, b); c = get_next(s, b); @@ -265,17 +258,17 @@ namespace Rosella.Xml.Parser c = get_next(s, b); if (c == ASCII_SINGLE_QUOTE || c == ASCII_DOUBLE_QUOTE) { - string attr_value = Rosella.Xml.Parser.__parse_quoted(c, xml, s, b, len); + string attr_value = Rosella.Parse.parse_quoted(c, xml, s, b, len); xtag.add_attribute(attr_name, attr_value); } else { unshift_int(s, c); - string attr_value = Rosella.Xml.Parser.__parse_alphanumeric(xml, s, b, len); + string attr_value = Rosella.Xml.Parser.__parse_ident(xml, s, b, len); xtag.add_attribute(attr_name, attr_value); } } } - function __parse_alphanumeric(string xml, var s, var b, int len) + function __parse_ident(string xml, var s, var b, int len) { var sb = new 'StringBuilder'; while(have_more_chars(s, b)) { @@ -289,18 +282,4 @@ namespace Rosella.Xml.Parser string result = string(sb); return result; } - - function __parse_quoted(int q, string xml, var s, var b, int len) - { - var sb = new 'StringBuilder'; - while (have_more_chars(s, b)) { - int c = get_next(s, b); - if (c == ASCII_NULL || c == q) - break; - if (c == ASCII_BACKSLASH) - c = get_next(s, b); - push(sb, chr(c)); - } - return sb; - } }