Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Correctly parse raw script and style tags. #1038

Merged
merged 5 commits into from
Oct 12, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/change_log/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ Python-Markdown Change Log

Under development: version 3.3.1 (a bug-fix release).

* Correctly parse raw `script` and `style` tags (#1036).
* Ensure consistent class handling by `fenced_code` and `codehilite` (#1032).

Oct 6, 2020: version 3.3 ([Notes](release-3.3.md)).
Expand Down
70 changes: 70 additions & 0 deletions markdown/htmlparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,13 @@ def reset(self):
def close(self):
"""Handle any buffered data."""
super().close()
if len(self.rawdata):
# Temp fix for https://bugs.python.org/issue41989
# TODO: remove this when the bug is fixed in all supported Python versions.
if self.convert_charrefs and not self.cdata_elem: # pragma: no cover
self.handle_data(htmlparser.unescape(self.rawdata))
else:
self.handle_data(self.rawdata)
# Handle any unclosed tags.
if len(self._cache):
self.cleandoc.append(self.md.htmlStash.store(''.join(self._cache)))
Expand Down Expand Up @@ -124,6 +131,9 @@ def handle_starttag(self, tag, attrs):
self._cache.append(text)
else:
self.cleandoc.append(text)
if tag in self.CDATA_CONTENT_ELEMENTS:
# This is presumably a standalone tag in a code span (see #1036).
self.clear_cdata_mode()

def handle_endtag(self, tag):
text = self.get_endtag_text(tag)
Expand Down Expand Up @@ -200,3 +210,63 @@ def handle_pi(self, data):
def unknown_decl(self, data):
    """Rebuild a ``<![...`` declaration from ``data`` and hand it off as an empty block tag.

    ``data`` is the declaration text without its ``<![`` opener. A declaration
    starting with ``CDATA[`` is closed with ``]]>``; anything else with ``]>``.
    """
    if data.startswith('CDATA['):
        closer = ']]>'
    else:
        closer = ']>'
    markup = '<![{}{}'.format(data, closer)
    self.handle_empty_tag(markup, is_block=True)

# The rest has been copied from base class in standard lib to address #1036.
# As __starttag_text is private, all references to it must be in this subclass.
# The last few lines of parse_starttag are reversed so that handle_starttag
# can override cdata_mode in certain situations (in a code span).
__starttag_text = None

def get_starttag_text(self):
    """Return the stored source text of the start tag (``'<...>'``), or ``None`` if unset."""
    starttag_source = self.__starttag_text
    return starttag_source

def parse_starttag(self, i):  # pragma: no cover
    """Parse the start tag that begins at index ``i`` of ``self.rawdata``.

    This is a copy of the standard library's ``HTMLParser.parse_starttag`` with
    one deliberate difference: CDATA mode is switched on *before*
    ``handle_starttag`` is called, so that ``handle_starttag`` is able to turn
    it off again (needed for a lone ``<script>``/``<style>`` tag, see #1036).

    The raw tag text is remembered for ``get_starttag_text``, the tag name is
    stored lower-cased in ``self.lasttag``, and the attributes are collected as
    ``(lower-cased name, value)`` pairs, where ``value`` is ``None`` for an
    attribute without ``=``, has surrounding quotes stripped, and is unescaped
    when non-empty.

    Returns:
        The result of ``self.check_for_whole_start_tag(i)``. A negative result
        is returned unchanged before anything is dispatched (presumably it
        signals an incomplete tag); otherwise it is the index used as the end
        of the tag.
    """
    self.__starttag_text = None
    endpos = self.check_for_whole_start_tag(i)
    if endpos < 0:
        return endpos
    rawdata = self.rawdata
    self.__starttag_text = rawdata[i:endpos]

    # Now parse the data between i+1 and endpos into a tag and attrs
    attrs = []
    match = htmlparser.tagfind_tolerant.match(rawdata, i+1)
    assert match, 'unexpected call to parse_starttag()'
    k = match.end()
    self.lasttag = tag = match.group(1).lower()
    while k < endpos:
        m = htmlparser.attrfind_tolerant.match(rawdata, k)
        if not m:
            break
        attrname, rest, attrvalue = m.group(1, 2, 3)
        if not rest:
            # Attribute given without a value (no ``=`` part).
            attrvalue = None
        elif (attrvalue[:1] == '\'' == attrvalue[-1:]
              or attrvalue[:1] == '"' == attrvalue[-1:]):
            # Strip the matching pair of surrounding quotes.
            attrvalue = attrvalue[1:-1]
        if attrvalue:
            attrvalue = htmlparser.unescape(attrvalue)
        attrs.append((attrname.lower(), attrvalue))
        k = m.end()

    end = rawdata[k:endpos].strip()
    if end not in (">", "/>"):
        # Junk between the last attribute and the closing bracket: emit the
        # whole tag as plain data instead of dispatching a tag event.
        self.handle_data(rawdata[i:endpos])
        return endpos
    if end.endswith('/>'):
        # XHTML-style empty tag: <span attr="value" />
        self.handle_startendtag(tag, attrs)
    else:
        # *** set cdata_mode first so we can override it in handle_starttag (see #1036) ***
        if tag in self.CDATA_CONTENT_ELEMENTS:
            self.set_cdata_mode(tag)
        self.handle_starttag(tag, attrs)
    return endpos
85 changes: 85 additions & 0 deletions tests/test_syntax/blocks/test_html_blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -1317,3 +1317,88 @@ def text_invalid_tags(self):
"""
)
)

def test_script_tags(self):
# Raw `<script>` and `<style>` blocks: the second (expected) text below is
# identical to the first (input) text, so the Markdown-looking content, the
# stray `<div>` and the `&amp;` entity inside them are expected to come through
# untouched. (Presumably `assertMarkdownRenders` renders the first argument and
# compares it with the second -- confirm against the test helper.)
self.assertMarkdownRenders(
self.dedent(
"""
<script>
*random stuff* <div> &amp;
</script>

<style>
**more stuff**
</style>
"""
),
self.dedent(
"""
<script>
*random stuff* <div> &amp;
</script>

<style>
**more stuff**
</style>
"""
)
)

def test_unclosed_script_tag(self):
# Ensure we have a working fix for https://bugs.python.org/issue41989
# The `<script>` tag is never closed here. The expected text is identical to the
# input, including the part after the blank line, i.e. nothing following the
# unclosed tag may be dropped or processed as Markdown.
self.assertMarkdownRenders(
self.dedent(
"""
<script>
*random stuff* <div> &amp;

Still part of the *script* tag
"""
),
self.dedent(
"""
<script>
*random stuff* <div> &amp;

Still part of the *script* tag
"""
)
)

def test_inline_script_tags(self):
# Ensure inline script tags don't cause the parser to eat content (see #1036).
# `<script>` and `</script>` appear only inside code spans here. The expected
# output escapes them within `<code>`, still renders the surrounding emphasis,
# and leaves both raw `<div>` blocks between the two paragraphs unchanged.
self.assertMarkdownRenders(
self.dedent(
"""
Text `<script>` more *text*.

<div>
*foo*
</div>

<div>

bar

</div>

A new paragraph with a closing `</script>` tag.
"""
),
self.dedent(
"""
<p>Text <code>&lt;script&gt;</code> more <em>text</em>.</p>
<div>
*foo*
</div>

<div>

bar

</div>

<p>A new paragraph with a closing <code>&lt;/script&gt;</code> tag.</p>
"""
)
)