Skip to content

Commit

Permalink
Correctly parse raw script and style tags. (#1038)
Browse files Browse the repository at this point in the history
* Ensure unclosed script tags are parsed correctly by providing a workaround for https://bugs.python.org/issue41989.
* Avoid cdata_mode outside of HTML blocks, such as in inline code spans.

Fixes #1036.
  • Loading branch information
waylan committed Oct 12, 2020
1 parent e02ed39 commit 5fdf7d4
Show file tree
Hide file tree
Showing 3 changed files with 156 additions and 0 deletions.
1 change: 1 addition & 0 deletions docs/change_log/index.md
Expand Up @@ -5,6 +5,7 @@ Python-Markdown Change Log

Under development: version 3.3.1 (a bug-fix release).

* Correctly parse raw `script` and `style` tags (#1036).
* Ensure consistent class handling by `fenced_code` and `codehilite` (#1032).

Oct 6, 2020: version 3.3 ([Notes](release-3.3.md)).
Expand Down
70 changes: 70 additions & 0 deletions markdown/htmlparser.py
Expand Up @@ -72,6 +72,13 @@ def reset(self):
def close(self):
"""Handle any buffered data."""
super().close()
if len(self.rawdata):
# Temp fix for https://bugs.python.org/issue41989
# TODO: remove this when the bug is fixed in all supported Python versions.
if self.convert_charrefs and not self.cdata_elem: # pragma: no cover
self.handle_data(htmlparser.unescape(self.rawdata))
else:
self.handle_data(self.rawdata)
# Handle any unclosed tags.
if len(self._cache):
self.cleandoc.append(self.md.htmlStash.store(''.join(self._cache)))
Expand Down Expand Up @@ -124,6 +131,9 @@ def handle_starttag(self, tag, attrs):
self._cache.append(text)
else:
self.cleandoc.append(text)
if tag in self.CDATA_CONTENT_ELEMENTS:
# This is presumably a standalone tag in a code span (see #1036).
self.clear_cdata_mode()

def handle_endtag(self, tag):
text = self.get_endtag_text(tag)
Expand Down Expand Up @@ -200,3 +210,63 @@ def handle_pi(self, data):
def unknown_decl(self, data):
    """Re-emit an unknown `<![...` declaration as a block-level empty tag.

    `data` is the declaration text without the leading `<![`. A declaration
    that starts with `CDATA[` is closed with `]]>`; any other is closed
    with `]>`.
    """
    if data.startswith('CDATA['):
        closer = ']]>'
    else:
        closer = ']>'
    rebuilt = '<![{}{}'.format(data, closer)
    self.handle_empty_tag(rebuilt, is_block=True)

# The rest has been copied from base class in standard lib to address #1036.
# As __starttag_text is private, all references to it must be in this subclass.
# The last few lines of parse_starttag are reversed so that handle_starttag
# can override cdata_mode in certain situations (in a code span).
__starttag_text = None

def get_starttag_text(self):
    """Return the raw source ('<...>') of the last parsed start tag, or None if unset."""
    starttag_source = self.__starttag_text
    return starttag_source

def parse_starttag(self, i):  # pragma: no cover
    """Parse the start tag that begins at index `i` of `self.rawdata`.

    This is a copy of the base class implementation with one deliberate
    difference: for CDATA content elements, `set_cdata_mode` is called
    *before* `handle_starttag`, so that `handle_starttag` is able to
    override cdata mode again (for example, for a standalone tag in a
    code span; see #1036).

    Returns the index just past the tag. If `check_for_whole_start_tag`
    reports a negative value (the tag is not complete yet), that value is
    returned unchanged and no handler is called.
    """
    self.__starttag_text = None
    endpos = self.check_for_whole_start_tag(i)
    if endpos < 0:
        return endpos
    rawdata = self.rawdata
    self.__starttag_text = rawdata[i:endpos]

    # Now parse the data between i+1 and endpos into a tag and attrs.
    attrs = []
    match = htmlparser.tagfind_tolerant.match(rawdata, i+1)
    assert match, 'unexpected call to parse_starttag()'
    k = match.end()
    self.lasttag = tag = match.group(1).lower()
    while k < endpos:
        m = htmlparser.attrfind_tolerant.match(rawdata, k)
        if not m:
            break
        attrname, rest, attrvalue = m.group(1, 2, 3)
        if not rest:
            # Attribute without a value, e.g. `<input disabled>`.
            attrvalue = None
        elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
             attrvalue[:1] == '"' == attrvalue[-1:]:  # noqa: E127
            # Strip the matching pair of surrounding quotes.
            attrvalue = attrvalue[1:-1]
        if attrvalue:
            attrvalue = htmlparser.unescape(attrvalue)
        attrs.append((attrname.lower(), attrvalue))
        k = m.end()

    end = rawdata[k:endpos].strip()
    if end not in (">", "/>"):
        # Junk remains before the end of the tag: pass the whole tag through
        # as plain data. (The unused line/offset bookkeeping that the copied
        # code computed here had no effect and has been dropped.)
        self.handle_data(rawdata[i:endpos])
        return endpos
    if end.endswith('/>'):
        # XHTML-style empty tag: <span attr="value" />
        self.handle_startendtag(tag, attrs)
    else:
        # *** set cdata_mode first so we can override it in handle_starttag (see #1036) ***
        if tag in self.CDATA_CONTENT_ELEMENTS:
            self.set_cdata_mode(tag)
        self.handle_starttag(tag, attrs)
    return endpos
85 changes: 85 additions & 0 deletions tests/test_syntax/blocks/test_html_blocks.py
Expand Up @@ -1317,3 +1317,88 @@ def text_invalid_tags(self):
"""
)
)

def test_script_tags(self):
# Closed `<script>` and `<style>` blocks must be passed through as raw HTML:
# the expected output below is identical to the input, so the Markdown-like
# emphasis markers, the nested `<div>` and the `&amp;` entity inside the tags
# must all be left untouched.
self.assertMarkdownRenders(
self.dedent(
"""
<script>
*random stuff* <div> &amp;
</script>
<style>
**more stuff**
</style>
"""
),
self.dedent(
"""
<script>
*random stuff* <div> &amp;
</script>
<style>
**more stuff**
</style>
"""
)
)

def test_unclosed_script_tag(self):
# Ensure we have a working fix for https://bugs.python.org/issue41989
# The `<script>` tag is never closed, so everything after it is expected to
# be emitted verbatim: the expected output is identical to the input.
self.assertMarkdownRenders(
self.dedent(
"""
<script>
*random stuff* <div> &amp;
Still part of the *script* tag
"""
),
self.dedent(
"""
<script>
*random stuff* <div> &amp;
Still part of the *script* tag
"""
)
)

def test_inline_script_tags(self):
# Ensure inline script tags don't cause the parser to eat content (see #1036).
# The `<script>` and `</script>` tags appear only inside code spans, so they
# are expected to be escaped as code, while the paragraphs around them are
# still rendered as Markdown and the `<div>` blocks are kept as raw HTML.
self.assertMarkdownRenders(
self.dedent(
"""
Text `<script>` more *text*.
<div>
*foo*
</div>
<div>
bar
</div>
A new paragraph with a closing `</script>` tag.
"""
),
self.dedent(
"""
<p>Text <code>&lt;script&gt;</code> more <em>text</em>.</p>
<div>
*foo*
</div>
<div>
bar
</div>
<p>A new paragraph with a closing <code>&lt;/script&gt;</code> tag.</p>
"""
)
)

0 comments on commit 5fdf7d4

Please sign in to comment.