From 5fdf7d47aa90a0983fa356b577a2ff5e16e68147 Mon Sep 17 00:00:00 2001 From: Waylan Limberg Date: Mon, 12 Oct 2020 14:17:03 -0400 Subject: [PATCH] Correctly parse raw `script` and `style` tags. (#1038) * Ensure unclosed script tags are parsed correctly by providing a workaround for https://bugs.python.org/issue41989. * Avoid cdata_mode outside of HTML blocks, such as in inline code spans. Fixes #1036. --- docs/change_log/index.md | 1 + markdown/htmlparser.py | 70 ++++++++++++++++ tests/test_syntax/blocks/test_html_blocks.py | 85 ++++++++++++++++++++ 3 files changed, 156 insertions(+) diff --git a/docs/change_log/index.md b/docs/change_log/index.md index 554864480..3b2eea59c 100644 --- a/docs/change_log/index.md +++ b/docs/change_log/index.md @@ -5,6 +5,7 @@ Python-Markdown Change Log Under development: version 3.3.1 (a bug-fix release). +* Correctly parse raw `script` and `style` tags (#1036). * Ensure consistent class handling by `fenced_code` and `codehilite` (#1032). Oct 6, 2020: version 3.3 ([Notes](release-3.3.md)). diff --git a/markdown/htmlparser.py b/markdown/htmlparser.py index f83ddeace..6776d340f 100644 --- a/markdown/htmlparser.py +++ b/markdown/htmlparser.py @@ -72,6 +72,13 @@ def reset(self): def close(self): """Handle any buffered data.""" super().close() + if len(self.rawdata): + # Temp fix for https://bugs.python.org/issue41989 + # TODO: remove this when the bug is fixed in all supported Python versions. + if self.convert_charrefs and not self.cdata_elem: # pragma: no cover + self.handle_data(htmlparser.unescape(self.rawdata)) + else: + self.handle_data(self.rawdata) # Handle any unclosed tags. if len(self._cache): self.cleandoc.append(self.md.htmlStash.store(''.join(self._cache))) @@ -124,6 +131,9 @@ def handle_starttag(self, tag, attrs): self._cache.append(text) else: self.cleandoc.append(text) + if tag in self.CDATA_CONTENT_ELEMENTS: + # This is presumably a standalone tag in a code span (see #1036). + self.clear_cdata_mode() def handle_endtag(self, tag): text = self.get_endtag_text(tag) @@ -200,3 +210,63 @@ def handle_pi(self, data): def unknown_decl(self, data): end = ']]>' if data.startswith('CDATA[') else ']>' self.handle_empty_tag(''.""" + return self.__starttag_text + + def parse_starttag(self, i): # pragma: no cover + self.__starttag_text = None + endpos = self.check_for_whole_start_tag(i) + if endpos < 0: + return endpos + rawdata = self.rawdata + self.__starttag_text = rawdata[i:endpos] + + # Now parse the data between i+1 and j into a tag and attrs + attrs = [] + match = htmlparser.tagfind_tolerant.match(rawdata, i+1) + assert match, 'unexpected call to parse_starttag()' + k = match.end() + self.lasttag = tag = match.group(1).lower() + while k < endpos: + m = htmlparser.attrfind_tolerant.match(rawdata, k) + if not m: + break + attrname, rest, attrvalue = m.group(1, 2, 3) + if not rest: + attrvalue = None + elif attrvalue[:1] == '\'' == attrvalue[-1:] or \ + attrvalue[:1] == '"' == attrvalue[-1:]: # noqa: E127 + attrvalue = attrvalue[1:-1] + if attrvalue: + attrvalue = htmlparser.unescape(attrvalue) + attrs.append((attrname.lower(), attrvalue)) + k = m.end() + + end = rawdata[k:endpos].strip() + if end not in (">", "/>"): + lineno, offset = self.getpos() + if "\n" in self.__starttag_text: + lineno = lineno + self.__starttag_text.count("\n") + offset = len(self.__starttag_text) \ + - self.__starttag_text.rfind("\n") # noqa: E127 + else: + offset = offset + len(self.__starttag_text) + self.handle_data(rawdata[i:endpos]) + return endpos + if end.endswith('/>'): + # XHTML-style empty tag: + self.handle_startendtag(tag, attrs) + else: + # *** set cdata_mode first so we can override it in handle_starttag (see #1036) *** + if tag in self.CDATA_CONTENT_ELEMENTS: + self.set_cdata_mode(tag) + self.handle_starttag(tag, attrs) + return endpos diff --git a/tests/test_syntax/blocks/test_html_blocks.py b/tests/test_syntax/blocks/test_html_blocks.py index 0a2092d3f..3fea76675 100644 --- a/tests/test_syntax/blocks/test_html_blocks.py +++ b/tests/test_syntax/blocks/test_html_blocks.py @@ -1317,3 +1317,88 @@ def text_invalid_tags(self): """ ) ) + + def test_script_tags(self): + self.assertMarkdownRenders( + self.dedent( + """ + + + + """ + ), + self.dedent( + """ + + + + """ + ) + ) + + def test_unclosed_script_tag(self): + # Ensure we have a working fix for https://bugs.python.org/issue41989 + self.assertMarkdownRenders( + self.dedent( + """ + ` tag. + """ + ), + self.dedent( + """ +

Text <script> more text.

+
+ *foo* +
+ +
+ + bar + +
+ +

A new paragraph with a closing </script> tag.

+ """ + ) + )