From af4b344c0ddd45b79f919fa91f7c802f12c8cf46 Mon Sep 17 00:00:00 2001 From: Waylan Limberg Date: Wed, 13 Jul 2022 15:41:06 -0400 Subject: [PATCH 1/3] Move backslash unescaping to treeprocessor By unescaping backslash escapes in a treeprocessor, the text is properly escaped during serialization. Fixes #1131. As it is recognized that various third-party extensions may be calling the old class at `postprocessors.UnescapePostprocessor`, the old class remains in the codebase, but has been deprecated and will be removed in a future release. The new class `treeprocessors.UnescapeTreeprocessor` should be used instead. --- docs/change_log/release-3.4.md | 10 +++++- markdown/extensions/toc.py | 6 ++-- markdown/postprocessors.py | 6 ++-- markdown/treeprocessors.py | 27 ++++++++++++++++ tests/basic/backlash-escapes.html | 2 +- tests/test_syntax/extensions/test_smarty.py | 36 +++++++++++++++++++++ 6 files changed, 80 insertions(+), 7 deletions(-) create mode 100644 tests/test_syntax/extensions/test_smarty.py diff --git a/docs/change_log/release-3.4.md b/docs/change_log/release-3.4.md index 9d1cd1782..c06fff0b2 100644 --- a/docs/change_log/release-3.4.md +++ b/docs/change_log/release-3.4.md @@ -30,10 +30,18 @@ markdown.markdown(src, extensions=[TableExtension(use_align_attribute=True)]) In addition, tests were moved to the modern test environment. +### `UnescapePostprocessor` deprecated and replaced with `UnescapeTreeprocessor` (#1131). + +Unescaping backslash escapes has been moved to a treeprocessor. However, it is +recognized that varous third-party extensions may be calling the old class at +`postprocessors.UnescapePostprocessor`. Therefore, the old class remains in the +codebase, but has been deprecated and will be removed in a future release. The +new class `treeprocessors.UnescapeTreeprocessor` should be used instead. 
+ ### Previously deprecated objects have been removed Various objects were deprecated in version 3.0 and began raising deprecation -warnings (see the [version 3.0 release notes] for details). Any of those object +warnings (see the [version 3.0 release notes] for details). Any of those objects which remained in version 3.3 have been removed from the code base in version 3.4 and will now raise errors. A summary of the objects are provided below. diff --git a/markdown/extensions/toc.py b/markdown/extensions/toc.py index 80138b3a7..cd3be24a0 100644 --- a/markdown/extensions/toc.py +++ b/markdown/extensions/toc.py @@ -16,7 +16,7 @@ from . import Extension from ..treeprocessors import Treeprocessor from ..util import code_escape, parseBoolValue, AMP_SUBSTITUTE, HTML_PLACEHOLDER_RE, AtomicString -from ..postprocessors import UnescapePostprocessor +from ..treeprocessors import UnescapeTreeprocessor import re import html import unicodedata @@ -84,8 +84,8 @@ def _html_sub(m): def unescape(text): """ Unescape escaped text. """ - c = UnescapePostprocessor() - return c.run(text) + c = UnescapeTreeprocessor() + return c.unescape(text) def nest_toc_tokens(toc_list): diff --git a/markdown/postprocessors.py b/markdown/postprocessors.py index f4fb92477..cb226acc9 100644 --- a/markdown/postprocessors.py +++ b/markdown/postprocessors.py @@ -37,7 +37,6 @@ def build_postprocessors(md, **kwargs): postprocessors = util.Registry() postprocessors.register(RawHtmlPostprocessor(md), 'raw_html', 30) postprocessors.register(AndSubstitutePostprocessor(), 'amp_substitute', 20) - postprocessors.register(UnescapePostprocessor(), 'unescape', 10) return postprocessors @@ -121,7 +120,10 @@ def run(self, text): text = text.replace(util.AMP_SUBSTITUTE, "&") return text - +@util.deprecated( + "This class will be removed in the future; " + "use 'treeprocessors.UnescapeTreeprocessor' instead." 
+) class UnescapePostprocessor(Postprocessor): """ Restore escaped chars """ diff --git a/markdown/treeprocessors.py b/markdown/treeprocessors.py index e02a505b7..e9f48ca11 100644 --- a/markdown/treeprocessors.py +++ b/markdown/treeprocessors.py @@ -19,6 +19,7 @@ License: BSD (see LICENSE.md for details). """ +import re import xml.etree.ElementTree as etree from . import util from . import inlinepatterns @@ -29,6 +30,7 @@ def build_treeprocessors(md, **kwargs): treeprocessors = util.Registry() treeprocessors.register(InlineProcessor(md), 'inline', 20) treeprocessors.register(PrettifyTreeprocessor(md), 'prettify', 10) + treeprocessors.register(UnescapeTreeprocessor(md), 'unescape', 0) return treeprocessors @@ -429,3 +431,28 @@ def run(self, root): # Only prettify code containing text only if not len(code) and code.text is not None: code.text = util.AtomicString(code.text.rstrip() + '\n') + + +class UnescapeTreeprocessor(Treeprocessor): + """ Restore escaped chars """ + + RE = re.compile(r'{}(\d+){}'.format(util.STX, util.ETX)) + + def _unescape(self, m): + return chr(int(m.group(1))) + + def unescape(self, text): + return self.RE.sub(self._unescape, text) + + def run(self, root): + """ Loop over all elements and unescape all text. """ + for elem in root.iter(): + # Unescape text content + if elem.text and not elem.tag == 'code': + elem.text = self.unescape(elem.text) + # Unescape tail content + if elem.tail: + elem.tail = self.unescape(elem.tail) + # Unescape attribute values + for key, value in elem.items(): + elem.set(key, self.unescape(value)) diff --git a/tests/basic/backlash-escapes.html b/tests/basic/backlash-escapes.html index ef7c4b5ea..876775f43 100644 --- a/tests/basic/backlash-escapes.html +++ b/tests/basic/backlash-escapes.html @@ -9,7 +9,7 @@

 <p>Right bracket: ]</p>
 <p>Left paren: (</p>
 <p>Right paren: )</p>
-<p>Greater-than: ></p>
+<p>Greater-than: &gt;</p>
 <p>Hash: #</p>
 <p>Period: .</p>
 <p>Bang: !</p>

diff --git a/tests/test_syntax/extensions/test_smarty.py b/tests/test_syntax/extensions/test_smarty.py new file mode 100644 index 000000000..fc635ad3d --- /dev/null +++ b/tests/test_syntax/extensions/test_smarty.py @@ -0,0 +1,36 @@ +# -*- coding: utf-8 -*- +""" +Python Markdown + +A Python implementation of John Gruber's Markdown. + +Documentation: https://python-markdown.github.io/ +GitHub: https://github.com/Python-Markdown/markdown/ +PyPI: https://pypi.org/project/Markdown/ + +Started by Manfred Stienstra (http://www.dwerg.net/). +Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org). +Currently maintained by Waylan Limberg (https://github.com/waylan), +Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser). + +Copyright 2007-2022 The Python Markdown Project (v. 1.7 and later) +Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b) +Copyright 2004 Manfred Stienstra (the original version) + +License: BSD (see LICENSE.md for details). +""" + +from markdown.test_tools import TestCase + + +class TestSmarty(TestCase): + + default_kwargs = {'extensions': ['smarty']} + + def test_escaped_attr(self): + self.assertMarkdownRenders( + '![x\"x](x)', + '

<p><img alt="x&quot;x" src="x" /></p>

' + ) + + # TODO: Move rest of smarty tests here. From 2e3d14d3c4b59a95ab47089bf0474c6f89029214 Mon Sep 17 00:00:00 2001 From: Waylan Limberg Date: Wed, 13 Jul 2022 16:05:35 -0400 Subject: [PATCH 2/3] cleanup --- docs/change_log/release-3.4.md | 8 ++++---- markdown/postprocessors.py | 1 + 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/docs/change_log/release-3.4.md b/docs/change_log/release-3.4.md index c06fff0b2..9db6707d0 100644 --- a/docs/change_log/release-3.4.md +++ b/docs/change_log/release-3.4.md @@ -30,12 +30,12 @@ markdown.markdown(src, extensions=[TableExtension(use_align_attribute=True)]) In addition, tests were moved to the modern test environment. -### `UnescapePostprocessor` deprecated and replaced with `UnescapeTreeprocessor` (#1131). +### Backslash unescaping moved to Treeprocessor (#1131). -Unescaping backslash escapes has been moved to a treeprocessor. However, it is -recognized that varous third-party extensions may be calling the old class at +Unescaping backslash escapes has been moved to a Treeprocessor. However, it is +recognized that various third-party extensions may be calling the old class at `postprocessors.UnescapePostprocessor`. Therefore, the old class remains in the -codebase, but has been deprecated and will be removed in a future release. The +code base, but has been deprecated and will be removed in a future release. The new class `treeprocessors.UnescapeTreeprocessor` should be used instead. ### Previously deprecated objects have been removed diff --git a/markdown/postprocessors.py b/markdown/postprocessors.py index cb226acc9..498f7e892 100644 --- a/markdown/postprocessors.py +++ b/markdown/postprocessors.py @@ -120,6 +120,7 @@ def run(self, text): text = text.replace(util.AMP_SUBSTITUTE, "&") return text + @util.deprecated( "This class will be removed in the future; " "use 'treeprocessors.UnescapeTreeprocessor' instead." 
From acb0c31297b2a7fa8233d01c2153df3a0b9445a6 Mon Sep 17 00:00:00 2001 From: Waylan Limberg Date: Thu, 14 Jul 2022 15:16:37 -0400 Subject: [PATCH 3/3] Remove unnecessary unescape call from toc ext. --- markdown/extensions/toc.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/markdown/extensions/toc.py b/markdown/extensions/toc.py index cd3be24a0..1ded18d63 100644 --- a/markdown/extensions/toc.py +++ b/markdown/extensions/toc.py @@ -289,10 +289,10 @@ def run(self, doc): toc_tokens.append({ 'level': int(el.tag[-1]), 'id': el.attrib["id"], - 'name': unescape(stashedHTML2text( + 'name': stashedHTML2text( code_escape(el.attrib.get('data-toc-label', text)), self.md, strip_entities=False - )) + ) }) # Remove the data-toc-label attribute as it is no longer needed