Skip to content

Commit e44c87c

Browse files
committed
LibWeb: Implement enough HTML parsing to handle a small simple DOM :^)
We can now parse a little DOM like this: `<!DOCTYPE html> <html> <head></head> <body> <div></div> </body> </html>`. This is pretty slow work, but the incremental progress is satisfying!
1 parent fd1b31d commit e44c87c

File tree

5 files changed

+110
-6
lines changed

5 files changed

+110
-6
lines changed

Base/home/anon/www/simple.html

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,7 @@
11
<!DOCTYPE html>
22
<html>
33
<head><meta name="greeting" content='Hello friends!' foo=bar></head>
4+
<body>
5+
<div></div>
6+
</body>
47
</html>

Libraries/LibWeb/Parser/HTMLDocumentParser.cpp

Lines changed: 95 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -55,9 +55,6 @@ void HTMLDocumentParser::run()
5555

5656
dbg() << "[" << insertion_mode_name() << "] " << token.to_string();
5757

58-
if (token.type() == HTMLToken::Type::EndOfFile)
59-
return;
60-
6158
switch (m_insertion_mode) {
6259
case InsertionMode::Initial:
6360
handle_initial(token);
@@ -80,6 +77,12 @@ void HTMLDocumentParser::run()
8077
case InsertionMode::InBody:
8178
handle_in_body(token);
8279
break;
80+
case InsertionMode::AfterBody:
81+
handle_after_body(token);
82+
break;
83+
case InsertionMode::AfterAfterBody:
84+
handle_after_after_body(token);
85+
break;
8386
case InsertionMode::Text:
8487
handle_text(token);
8588
break;
@@ -199,7 +202,10 @@ void HTMLDocumentParser::handle_after_head(HTMLToken& token)
199202
}
200203

201204
if (token.is_start_tag() && token.tag_name() == "body") {
202-
ASSERT_NOT_REACHED();
205+
insert_html_element(token);
206+
m_frameset_ok = false;
207+
m_insertion_mode = InsertionMode::InBody;
208+
return;
203209
}
204210

205211
if (token.is_start_tag() && token.tag_name() == "frameset") {
@@ -231,10 +237,94 @@ void HTMLDocumentParser::handle_after_head(HTMLToken& token)
231237
fake_body_token.m_tag.tag_name.append("body");
232238
insert_html_element(fake_body_token);
233239
m_insertion_mode = InsertionMode::InBody;
240+
// FIXME: Reprocess the current token in InBody!
234241
}
235242

236-
void HTMLDocumentParser::handle_in_body(HTMLToken&)
243+
void HTMLDocumentParser::generate_implied_end_tags()
237244
{
245+
Vector<String> names { "dd", "dt", "li", "optgroup", "option", "p", "rb", "rp", "rt", "rtc" };
246+
while (names.contains_slow(current_node()->tag_name()))
247+
m_stack_of_open_elements.take_last();
248+
}
249+
250+
bool HTMLDocumentParser::stack_of_open_elements_has_element_with_tag_name_in_scope(const FlyString& tag_name)
251+
{
252+
Vector<String> list { "applet", "caption", "html", "table", "td", "th", "marquee", "object", "template" };
253+
for (ssize_t i = m_stack_of_open_elements.size() - 1; i >= 0; --i) {
254+
auto& node = m_stack_of_open_elements.at(i);
255+
if (node.tag_name() == tag_name)
256+
return true;
257+
if (list.contains_slow(node.tag_name()))
258+
return false;
259+
}
260+
ASSERT_NOT_REACHED();
261+
}
262+
263+
// Handles a token while m_insertion_mode is InsertionMode::AfterBody.
//
// Only the </html> end tag is implemented: it switches the insertion mode to
// AfterAfterBody (asserting first if m_parsing_fragment is set, since that
// path is not implemented). Every other token hits ASSERT_NOT_REACHED().
void HTMLDocumentParser::handle_after_body(HTMLToken& token)
{
    const bool is_html_end_tag = token.is_end_tag() && token.tag_name() == "html";
    if (!is_html_end_tag) {
        ASSERT_NOT_REACHED();
        return;
    }

    if (m_parsing_fragment) {
        ASSERT_NOT_REACHED();
    }

    m_insertion_mode = InsertionMode::AfterAfterBody;
}
274+
275+
// Handles a token while m_insertion_mode is InsertionMode::AfterAfterBody.
//
// Only the end-of-file token is implemented: it logs a debug message and
// returns. Every other token hits ASSERT_NOT_REACHED().
void HTMLDocumentParser::handle_after_after_body(HTMLToken& token)
{
    if (!token.is_end_of_file()) {
        ASSERT_NOT_REACHED();
        return;
    }

    dbg() << "Stop parsing! :^)";
}
283+
284+
// Handles a token while m_insertion_mode is InsertionMode::InBody.
//
// Currently implemented:
//  - </body>: asserts that a body element is in scope, then switches the
//    insertion mode to AfterBody.
//  - Start tags listed in `start_tag_names`: inserts an HTML element for the
//    token.
//  - End tags listed in `end_tag_names`: asserts that a matching element is in
//    scope, generates implied end tags, asserts that the current node matches
//    the token, and pops it off the stack of open elements.
// Any other token hits ASSERT_NOT_REACHED().
void HTMLDocumentParser::handle_in_body(HTMLToken& token)
{
    if (token.is_end_tag() && token.tag_name() == "body") {
        if (!stack_of_open_elements_has_element_with_tag_name_in_scope("body")) {
            ASSERT_NOT_REACHED();
        }

        // FIXME: Otherwise, if there is a node in the stack of open elements that is
        //        not either a dd element, a dt element, an li element, an optgroup element,
        //        an option element, a p element, an rb element, an rp element, an rt element,
        //        an rtc element, a tbody element, a td element, a tfoot element, a th element,
        //        a thead element, a tr element, the body element, or the html element,
        //        then this is a parse error.

        m_insertion_mode = InsertionMode::AfterBody;
        return;
    }

    if (token.is_start_tag()) {
        Vector<String> start_tag_names { "address", "article", "aside", "blockquote", "center", "details", "dialog", "dir", "div", "dl", "fieldset", "figcaption", "figure", "footer", "header", "hgroup", "main", "menu", "nav", "ol", "p", "section", "summary", "ul" };
        if (start_tag_names.contains_slow(token.tag_name())) {
            // FIXME: If the stack of open elements has a p element in button scope, then close a p element.
            insert_html_element(token);
            return;
        }
    }

    if (token.is_end_tag()) {
        // This list intentionally differs from the start-tag list: the HTML
        // spec handles </p> with its own rule, and additionally routes
        // </button>, </listing> and </pre> through this one. Keeping "p" here
        // would make generate_implied_end_tags() pop the very p element being
        // closed, so the current-node check below could never succeed for </p>.
        Vector<String> end_tag_names { "address", "article", "aside", "blockquote", "button", "center", "details", "dialog", "dir", "div", "dl", "fieldset", "figcaption", "figure", "footer", "header", "hgroup", "listing", "main", "menu", "nav", "ol", "pre", "section", "summary", "ul" };
        if (end_tag_names.contains_slow(token.tag_name())) {
            // FIXME: A missing in-scope element should be a parse error (ignore the token), not a crash.
            if (!stack_of_open_elements_has_element_with_tag_name_in_scope(token.tag_name())) {
                ASSERT_NOT_REACHED();
            }

            generate_implied_end_tags();

            // FIXME: A mismatch here should be a parse error, followed by popping elements
            //        until one with the token's tag name has been popped.
            if (current_node()->tag_name() != token.tag_name()) {
                ASSERT_NOT_REACHED();
            }

            m_stack_of_open_elements.take_last();
            return;
        }
    }

    // FIXME: Handle </p> ("close a p element") and every other token type.
    ASSERT_NOT_REACHED();
}
240330

Libraries/LibWeb/Parser/HTMLDocumentParser.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,8 +84,12 @@ class HTMLDocumentParser {
8484
void handle_in_head_noscript(HTMLToken&);
8585
void handle_after_head(HTMLToken&);
8686
void handle_in_body(HTMLToken&);
87+
void handle_after_body(HTMLToken&);
88+
void handle_after_after_body(HTMLToken&);
8789
void handle_text(HTMLToken&);
8890

91+
void generate_implied_end_tags();
92+
bool stack_of_open_elements_has_element_with_tag_name_in_scope(const FlyString& tag_name);
8993
NonnullRefPtr<Element> create_element_for(HTMLToken&);
9094
RefPtr<Node> find_appropriate_place_for_inserting_node();
9195
RefPtr<Element> insert_html_element(HTMLToken&);
@@ -97,6 +101,8 @@ class HTMLDocumentParser {
97101
HTMLTokenizer m_tokenizer;
98102

99103
bool m_foster_parenting { false };
104+
bool m_frameset_ok { true };
105+
bool m_parsing_fragment { false };
100106

101107
RefPtr<Document> m_document;
102108
RefPtr<HTMLHeadElement> m_head_element;

Libraries/LibWeb/Parser/HTMLTokenizer.cpp

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,9 @@
7474
#define ANYTHING_ELSE if (1)
7575

7676
#define EMIT_EOF \
77+
if (m_has_emitted_eof) \
78+
return {}; \
79+
m_has_emitted_eof = true; \
7780
create_new_token(HTMLToken::Type::EndOfFile); \
7881
return m_current_token;
7982

@@ -775,7 +778,7 @@ void HTMLTokenizer::will_reconsume_in([[maybe_unused]] State new_state)
775778
void HTMLTokenizer::flush_current_character_or_comment_if_needed()
776779
{
777780
//if (m_current_token.type() == HTMLToken::Type::Character || m_current_token.type() == HTMLToken::Type::Comment)
778-
// emit_current_token();
781+
// emit_current_token();
779782
}
780783

781784
}

Libraries/LibWeb/Parser/HTMLTokenizer.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -156,5 +156,7 @@ class HTMLTokenizer {
156156
size_t m_cursor { 0 };
157157

158158
HTMLToken m_current_token;
159+
160+
bool m_has_emitted_eof { false };
159161
};
160162
}

0 commit comments

Comments
 (0)