From 3d8afc6f89e169522f44c1bbec15f66dc359eccb Mon Sep 17 00:00:00 2001 From: Oleh Prypin Date: Tue, 12 Mar 2024 16:54:39 +0100 Subject: [PATCH] Allow attr_list quoted values to contain curly braces How it worked before: * Extract the content without allowing any `}` in it, and require that it ends with `}` - for block elements it's anchored to the end of the line, otherwise not. * Parse the content in more detail. No edge cases with `}` can arise. If parsing is interrupted by some unrecognized token, discard the rest of the string. How it works now: * Extract the content *and allow* `}` in it, and require that it ends with `}` - for block elements it's anchored to the end of the line, otherwise not. * Parse the content in more detail. Allow `}` only within the quoted parts, otherwise interrupt parsing like for any other unrecognized token. If parsing is interrupted, there is remaining unrecognized text. Ideally perhaps we would bail out at this point entirely (and not recognize it as an attr_list), but to preserve historic behavior, any extra text before `}` is just discarded. If there is an extra `}` in the remaining text: * For block elements: that must mean that the attr_list syntax did not in fact terminate at the end of the line but earlier. So, bail out and do not register any attributes and do not change the original text. * For inline elements: that must mean that we just overmatched a bit, but that's OK, we just assign attrs as normal and put the extra text back into the string. As mentioned, any extra text *before* `}` is just discarded. 
--- .spell-dict | 1 + docs/changelog.md | 4 +- markdown/extensions/attr_list.py | 86 ++++++++++++------- markdown/extensions/fenced_code.py | 15 +++- .../test_syntax/extensions/test_attr_list.py | 45 +++++++++- .../extensions/test_fenced_code.py | 42 +++++++++ 6 files changed, 153 insertions(+), 40 deletions(-) diff --git a/.spell-dict b/.spell-dict index ae124542..9c1db010 100644 --- a/.spell-dict +++ b/.spell-dict @@ -146,6 +146,7 @@ Treeprocessor Treeprocessors tuple tuples +unparsable unclosed unescape unescaping diff --git a/docs/changelog.md b/docs/changelog.md index 9c2b302e..f61c6198 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -34,8 +34,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 * Include `scripts/*.py` in the generated source tarballs (#1430). * Ensure lines after heading in loose list are properly detabbed (#1443). * Give smarty tree processor higher priority than toc (#1440). -* Permit carrots (`^`) and square brackets (`]`) but explicitly exclude +* Permit carets (`^`) and square brackets (`]`) but explicitly exclude backslashes (`\`) from abbreviations (#1444). +* In attribute lists (`attr_list`, `fenced_code`), quoted attribute values are + now allowed to contain curly braces (`}`) (#1414). 
## [3.5.2] -- 2024-01-10 diff --git a/markdown/extensions/attr_list.py b/markdown/extensions/attr_list.py index 7ce3f992..9206d11e 100644 --- a/markdown/extensions/attr_list.py +++ b/markdown/extensions/attr_list.py @@ -57,17 +57,30 @@ def _handle_word(s, t): _scanner = re.Scanner([ - (r'[^ =]+=".*?"', _handle_double_quote), - (r"[^ =]+='.*?'", _handle_single_quote), - (r'[^ =]+=[^ =]+', _handle_key_value), - (r'[^ =]+', _handle_word), + (r'[^ =}]+=".*?"', _handle_double_quote), + (r"[^ =}]+='.*?'", _handle_single_quote), + (r'[^ =}]+=[^ =}]+', _handle_key_value), + (r'[^ =}]+', _handle_word), (r' ', None) ]) -def get_attrs(str: str) -> list[tuple[str, str]]: - """ Parse attribute list and return a list of attribute tuples. """ - return _scanner.scan(str)[0] +def get_attrs_and_remainder(attrs_string: str) -> tuple[list[tuple[str, str]], str]: + """ Parse attribute list and return a list of attribute tuples. + + Additionally, return any text that remained after a curly brace. In typical cases, its presence + should mean that the input does not match the intended attribute list syntax. + """ + attrs, remainder = _scanner.scan(attrs_string) + # To keep historic behavior, discard all unparsable text prior to '}'. + index = remainder.find('}') + remainder = remainder[index:] if index != -1 else '' + return attrs, remainder + + +def get_attrs(str: str) -> list[tuple[str, str]]: # pragma: no cover + """ Soft-deprecated. Prefer `get_attrs_and_remainder`. """ + return get_attrs_and_remainder(str)[0] def isheader(elem: Element) -> bool: @@ -76,7 +89,7 @@ def isheader(elem: Element) -> bool: class AttrListTreeprocessor(Treeprocessor): - BASE_RE = r'\{\:?[ ]*([^\}\n ][^\}\n]*)[ ]*\}' + BASE_RE = r'\{\:?[ ]*([^\}\n ][^\n]*)[ ]*\}' HEADER_RE = re.compile(r'[ ]+{}[ ]*$'.format(BASE_RE)) BLOCK_RE = re.compile(r'\n[ ]*{}[ ]*$'.format(BASE_RE)) INLINE_RE = re.compile(r'^{}'.format(BASE_RE)) @@ -106,49 +119,58 @@ def run(self, doc: Element) -> None: # use tail of last child. 
no `ul` or `ol`. m = RE.search(elem[-1].tail) if m: - self.assign_attrs(elem, m.group(1)) - elem[-1].tail = elem[-1].tail[:m.start()] + if not self.assign_attrs(elem, m.group(1), strict=True): + elem[-1].tail = elem[-1].tail[:m.start()] elif pos is not None and pos > 0 and elem[pos-1].tail: # use tail of last child before `ul` or `ol` m = RE.search(elem[pos-1].tail) if m: - self.assign_attrs(elem, m.group(1)) - elem[pos-1].tail = elem[pos-1].tail[:m.start()] + if not self.assign_attrs(elem, m.group(1), strict=True): + elem[pos-1].tail = elem[pos-1].tail[:m.start()] elif elem.text: # use text. `ul` is first child. m = RE.search(elem.text) if m: - self.assign_attrs(elem, m.group(1)) - elem.text = elem.text[:m.start()] + if not self.assign_attrs(elem, m.group(1), strict=True): + elem.text = elem.text[:m.start()] elif len(elem) and elem[-1].tail: # has children. Get from tail of last child m = RE.search(elem[-1].tail) if m: - self.assign_attrs(elem, m.group(1)) - elem[-1].tail = elem[-1].tail[:m.start()] - if isheader(elem): - # clean up trailing #s - elem[-1].tail = elem[-1].tail.rstrip('#').rstrip() + if not self.assign_attrs(elem, m.group(1), strict=True): + elem[-1].tail = elem[-1].tail[:m.start()] + if isheader(elem): + # clean up trailing #s + elem[-1].tail = elem[-1].tail.rstrip('#').rstrip() elif elem.text: # no children. Get from text. 
m = RE.search(elem.text) if m: - self.assign_attrs(elem, m.group(1)) - elem.text = elem.text[:m.start()] - if isheader(elem): - # clean up trailing #s - elem.text = elem.text.rstrip('#').rstrip() + if not self.assign_attrs(elem, m.group(1), strict=True): + elem.text = elem.text[:m.start()] + if isheader(elem): + # clean up trailing #s + elem.text = elem.text.rstrip('#').rstrip() else: # inline: check for `attrs` at start of tail if elem.tail: m = self.INLINE_RE.match(elem.tail) if m: - self.assign_attrs(elem, m.group(1)) - elem.tail = elem.tail[m.end():] + remainder = self.assign_attrs(elem, m.group(1)) + elem.tail = elem.tail[m.end():] + remainder + + def assign_attrs(self, elem: Element, attrs_string: str, *, strict: bool = False) -> str: + """ Assign `attrs` to element. + + If the `attrs_string` has an extra closing curly brace, the remaining text is returned. + + The `strict` argument controls whether to still assign `attrs` if there is a remaining `}`. + """ + attrs, remainder = get_attrs_and_remainder(attrs_string) + if strict and remainder: + return remainder - def assign_attrs(self, elem: Element, attrs: str) -> None: - """ Assign `attrs` to element. """ - for k, v in get_attrs(attrs): + for k, v in attrs: if k == '.': # add to class cls = elem.get('class') @@ -159,11 +181,13 @@ def assign_attrs(self, elem: Element, attrs: str) -> None: else: # assign attribute `k` with `v` elem.set(self.sanitize_name(k), v) + # The text that we initially over-matched will be put back. + return remainder def sanitize_name(self, name: str) -> str: """ - Sanitize name as 'an XML Name, minus the ":"'. - See https://www.w3.org/TR/REC-xml-names/#NT-NCName + Sanitize name as 'an XML Name, minus the `:`.' + See <https://www.w3.org/TR/REC-xml-names/#NT-NCName>. """ return self.NAME_RE.sub('_', name) diff --git a/markdown/extensions/fenced_code.py b/markdown/extensions/fenced_code.py index da1a9be1..bae7330a 100644 --- a/markdown/extensions/fenced_code.py +++ b/markdown/extensions/fenced_code.py @@ -25,7 +25,7 @@ from . 
import Extension from ..preprocessors import Preprocessor from .codehilite import CodeHilite, CodeHiliteExtension, parse_hl_lines -from .attr_list import get_attrs, AttrListExtension +from .attr_list import get_attrs_and_remainder, AttrListExtension from ..util import parseBoolValue from ..serializers import _escape_attrib_html import re @@ -56,7 +56,7 @@ class FencedBlockPreprocessor(Preprocessor): FENCED_BLOCK_RE = re.compile( dedent(r''' (?P<fence>^(?:~{3,}|`{3,}))[ ]* # opening fence - ((\{(?P<attrs>[^\}\n]*)\})| # (optional {attrs} or + ((\{(?P<attrs>[^\n]*)\})| # (optional {attrs} or (\.?(?P<lang>[\w#.+-]*)[ ]*)? # optional (.)lang (hl_lines=(?P<quot>"|')(?P<hl_lines>.*?)(?P=quot)[ ]*)?) # optional hl_lines) \n # newline (end of opening fence) @@ -94,12 +94,17 @@ def run(self, lines: list[str]) -> list[str]: self.checked_for_deps = True text = "\n".join(lines) + index = 0 while 1: - m = self.FENCED_BLOCK_RE.search(text) + m = self.FENCED_BLOCK_RE.search(text, index) if m: lang, id, classes, config = None, '', [], {} if m.group('attrs'): - id, classes, config = self.handle_attrs(get_attrs(m.group('attrs'))) + attrs, remainder = get_attrs_and_remainder(m.group('attrs')) + if remainder: # Does not have correctly matching curly braces, so the syntax is invalid. + index = m.end('attrs') # Explicitly skip over this, to prevent an infinite loop. + continue + id, classes, config = self.handle_attrs(attrs) if len(classes): lang = classes.pop(0) else: @@ -151,6 +156,8 @@ def run(self, lines: list[str]) -> list[str]: placeholder = self.md.htmlStash.store(code) text = f'{text[:m.start()]}\n{placeholder}\n{text[m.end():]}' + # Continue from after the replaced text in the next iteration. 
+ index = m.start() + 1 + len(placeholder) else: break return text.split("\n") diff --git a/tests/test_syntax/extensions/test_attr_list.py b/tests/test_syntax/extensions/test_attr_list.py index ba8b2369..e9a10960 100644 --- a/tests/test_syntax/extensions/test_attr_list.py +++ b/tests/test_syntax/extensions/test_attr_list.py @@ -23,16 +23,53 @@ class TestAttrList(TestCase): - maxDiff = None + default_kwargs = {'extensions': ['attr_list']} # TODO: Move the rest of the `attr_list` tests here. - def test_empty_list(self): + def test_empty_attr_list(self): self.assertMarkdownRenders( '*foo*{ }', - '

foo{ }

', - extensions=['attr_list'] + '

foo{ }

' + ) + + def test_curly_after_inline(self): + self.assertMarkdownRenders( + '*inline*{.a} } *text*{.a }}', + '

inline } text}

' + ) + + def test_extra_eq_gets_ignored_inside_curly_inline(self): + # Undesired behavior but kept for historic compatibility. + self.assertMarkdownRenders( + '*inline*{data-test="x" =a} *text*', + '

inline text

' + ) + + def test_curly_after_block(self): + self.assertMarkdownRenders( + '# Heading {.a} }', + '

Heading {.a} }

' + ) + + def test_curly_in_single_quote(self): + self.assertMarkdownRenders( + "# Heading {data-test='{}'}", + '

Heading

' + ) + + def test_curly_in_double_quote(self): + self.assertMarkdownRenders( + '# Heading {data-test="{}"}', + '

Heading

' + ) + + def test_unclosed_quote_ignored(self): + # Undesired behavior but kept for historic compatibility. + self.assertMarkdownRenders( + '# Heading {foo="bar}', + '

Heading

' ) def test_table_td(self): diff --git a/tests/test_syntax/extensions/test_fenced_code.py b/tests/test_syntax/extensions/test_fenced_code.py index e24a1778..aa58bbfb 100644 --- a/tests/test_syntax/extensions/test_fenced_code.py +++ b/tests/test_syntax/extensions/test_fenced_code.py @@ -394,6 +394,48 @@ def testFencedCodeEscapedAttrs(self): extensions=['fenced_code', 'attr_list'] ) + def testFencedCodeCurlyInAttrs(self): + self.assertMarkdownRenders( + self.dedent( + ''' + ``` { data-test="{}" } + # Some python code + ``` + ''' + ), + self.dedent( + ''' +
# Some python code
+                
+ ''' + ), + extensions=['fenced_code', 'attr_list'] + ) + + def testFencedCodeMismatchedCurlyInAttrs(self): + self.assertMarkdownRenders( + self.dedent( + ''' + ``` { data-test="{}" } } + # Some python code + ``` + ``` + test + ``` + ''' + ), + self.dedent( + ''' +

``` { data-test="{}" } }

+

Some python code

+
+

test + ```

+ ''' + ), + extensions=['fenced_code', 'attr_list'] + ) + class TestFencedCodeWithCodehilite(TestCase):