diff --git a/.spell-dict b/.spell-dict index ae124542..9c1db010 100644 --- a/.spell-dict +++ b/.spell-dict @@ -146,6 +146,7 @@ Treeprocessor Treeprocessors tuple tuples +unparsable unclosed unescape unescaping diff --git a/docs/changelog.md b/docs/changelog.md index 9c2b302e..f61c6198 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -34,8 +34,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 * Include `scripts/*.py` in the generated source tarballs (#1430). * Ensure lines after heading in loose list are properly detabbed (#1443). * Give smarty tree processor higher priority than toc (#1440). -* Permit carrots (`^`) and square brackets (`]`) but explicitly exclude +* Permit carets (`^`) and square brackets (`]`) but explicitly exclude backslashes (`\`) from abbreviations (#1444). +* In attribute lists (`attr_list`, `fenced_code`), quoted attribute values are + now allowed to contain curly braces (`}`) (#1414). ## [3.5.2] -- 2024-01-10 diff --git a/markdown/extensions/attr_list.py b/markdown/extensions/attr_list.py index 7ce3f992..9206d11e 100644 --- a/markdown/extensions/attr_list.py +++ b/markdown/extensions/attr_list.py @@ -57,17 +57,30 @@ def _handle_word(s, t): _scanner = re.Scanner([ - (r'[^ =]+=".*?"', _handle_double_quote), - (r"[^ =]+='.*?'", _handle_single_quote), - (r'[^ =]+=[^ =]+', _handle_key_value), - (r'[^ =]+', _handle_word), + (r'[^ =}]+=".*?"', _handle_double_quote), + (r"[^ =}]+='.*?'", _handle_single_quote), + (r'[^ =}]+=[^ =}]+', _handle_key_value), + (r'[^ =}]+', _handle_word), (r' ', None) ]) -def get_attrs(str: str) -> list[tuple[str, str]]: - """ Parse attribute list and return a list of attribute tuples. """ - return _scanner.scan(str)[0] +def get_attrs_and_remainder(attrs_string: str) -> tuple[list[tuple[str, str]], str]: + """ Parse attribute list and return a list of attribute tuples. + + Additionally, return any text that remained after a curly brace. 
In typical cases, its presence + should mean that the input does not match the intended attribute list syntax. + """ + attrs, remainder = _scanner.scan(attrs_string) + # To keep historic behavior, discard all unparsable text prior to '}'. + index = remainder.find('}') + remainder = remainder[index:] if index != -1 else '' + return attrs, remainder + + +def get_attrs(str: str) -> list[tuple[str, str]]: # pragma: no cover + """ Soft-deprecated. Prefer `get_attrs_and_remainder`. """ + return get_attrs_and_remainder(str)[0] def isheader(elem: Element) -> bool: @@ -76,7 +89,7 @@ def isheader(elem: Element) -> bool: class AttrListTreeprocessor(Treeprocessor): - BASE_RE = r'\{\:?[ ]*([^\}\n ][^\}\n]*)[ ]*\}' + BASE_RE = r'\{\:?[ ]*([^\}\n ][^\n]*)[ ]*\}' HEADER_RE = re.compile(r'[ ]+{}[ ]*$'.format(BASE_RE)) BLOCK_RE = re.compile(r'\n[ ]*{}[ ]*$'.format(BASE_RE)) INLINE_RE = re.compile(r'^{}'.format(BASE_RE)) @@ -106,49 +119,58 @@ def run(self, doc: Element) -> None: # use tail of last child. no `ul` or `ol`. m = RE.search(elem[-1].tail) if m: - self.assign_attrs(elem, m.group(1)) - elem[-1].tail = elem[-1].tail[:m.start()] + if not self.assign_attrs(elem, m.group(1), strict=True): + elem[-1].tail = elem[-1].tail[:m.start()] elif pos is not None and pos > 0 and elem[pos-1].tail: # use tail of last child before `ul` or `ol` m = RE.search(elem[pos-1].tail) if m: - self.assign_attrs(elem, m.group(1)) - elem[pos-1].tail = elem[pos-1].tail[:m.start()] + if not self.assign_attrs(elem, m.group(1), strict=True): + elem[pos-1].tail = elem[pos-1].tail[:m.start()] elif elem.text: # use text. `ul` is first child. m = RE.search(elem.text) if m: - self.assign_attrs(elem, m.group(1)) - elem.text = elem.text[:m.start()] + if not self.assign_attrs(elem, m.group(1), strict=True): + elem.text = elem.text[:m.start()] elif len(elem) and elem[-1].tail: # has children. 
Get from tail of last child m = RE.search(elem[-1].tail) if m: - self.assign_attrs(elem, m.group(1)) - elem[-1].tail = elem[-1].tail[:m.start()] - if isheader(elem): - # clean up trailing #s - elem[-1].tail = elem[-1].tail.rstrip('#').rstrip() + if not self.assign_attrs(elem, m.group(1), strict=True): + elem[-1].tail = elem[-1].tail[:m.start()] + if isheader(elem): + # clean up trailing #s + elem[-1].tail = elem[-1].tail.rstrip('#').rstrip() elif elem.text: # no children. Get from text. m = RE.search(elem.text) if m: - self.assign_attrs(elem, m.group(1)) - elem.text = elem.text[:m.start()] - if isheader(elem): - # clean up trailing #s - elem.text = elem.text.rstrip('#').rstrip() + if not self.assign_attrs(elem, m.group(1), strict=True): + elem.text = elem.text[:m.start()] + if isheader(elem): + # clean up trailing #s + elem.text = elem.text.rstrip('#').rstrip() else: # inline: check for `attrs` at start of tail if elem.tail: m = self.INLINE_RE.match(elem.tail) if m: - self.assign_attrs(elem, m.group(1)) - elem.tail = elem.tail[m.end():] + remainder = self.assign_attrs(elem, m.group(1)) + elem.tail = elem.tail[m.end():] + remainder + + def assign_attrs(self, elem: Element, attrs_string: str, *, strict: bool = False) -> str: + """ Assign `attrs` to element. + + If the `attrs_string` has an extra closing curly brace, the remaining text is returned. + + The `strict` argument controls whether to still assign `attrs` if there is a remaining `}`. + """ + attrs, remainder = get_attrs_and_remainder(attrs_string) + if strict and remainder: + return remainder - def assign_attrs(self, elem: Element, attrs: str) -> None: - """ Assign `attrs` to element. """ - for k, v in get_attrs(attrs): + for k, v in attrs: if k == '.': # add to class cls = elem.get('class') @@ -159,11 +181,13 @@ def assign_attrs(self, elem: Element, attrs: str) -> None: else: # assign attribute `k` with `v` elem.set(self.sanitize_name(k), v) + # The text that we initially over-matched will be put back. 
+ return remainder def sanitize_name(self, name: str) -> str: """ - Sanitize name as 'an XML Name, minus the ":"'. - See https://www.w3.org/TR/REC-xml-names/#NT-NCName + Sanitize name as 'an XML Name, minus the `:`.' + See <https://www.w3.org/TR/REC-xml-names/#NT-NCName>. """ return self.NAME_RE.sub('_', name) diff --git a/markdown/extensions/fenced_code.py b/markdown/extensions/fenced_code.py index da1a9be1..bae7330a 100644 --- a/markdown/extensions/fenced_code.py +++ b/markdown/extensions/fenced_code.py @@ -25,7 +25,7 @@ from . import Extension from ..preprocessors import Preprocessor from .codehilite import CodeHilite, CodeHiliteExtension, parse_hl_lines -from .attr_list import get_attrs, AttrListExtension +from .attr_list import get_attrs_and_remainder, AttrListExtension from ..util import parseBoolValue from ..serializers import _escape_attrib_html import re @@ -56,7 +56,7 @@ class FencedBlockPreprocessor(Preprocessor): FENCED_BLOCK_RE = re.compile( dedent(r''' (?P<fence>^(?:~{3,}|`{3,}))[ ]* # opening fence - ((\{(?P<attrs>[^\}\n]*)\})| # (optional {attrs} or + ((\{(?P<attrs>[^\n]*)\})| # (optional {attrs} or (\.?(?P<lang>[\w#.+-]*)[ ]*)? # optional (.)lang (hl_lines=(?P<quot>"|')(?P<hl_lines>.*?)(?P=quot)[ ]*)?) # optional hl_lines) \n # newline (end of opening fence) @@ -94,12 +94,17 @@ def run(self, lines: list[str]) -> list[str]: self.checked_for_deps = True text = "\n".join(lines) + index = 0 while 1: - m = self.FENCED_BLOCK_RE.search(text) + m = self.FENCED_BLOCK_RE.search(text, index) if m: lang, id, classes, config = None, '', [], {} if m.group('attrs'): - id, classes, config = self.handle_attrs(get_attrs(m.group('attrs'))) + attrs, remainder = get_attrs_and_remainder(m.group('attrs')) + if remainder: # Does not have correctly matching curly braces, so the syntax is invalid. + index = m.end('attrs') # Explicitly skip over this, to prevent an infinite loop. 
+ continue + id, classes, config = self.handle_attrs(attrs) if len(classes): lang = classes.pop(0) else: @@ -151,6 +156,8 @@ def run(self, lines: list[str]) -> list[str]: placeholder = self.md.htmlStash.store(code) text = f'{text[:m.start()]}\n{placeholder}\n{text[m.end():]}' + # Continue from after the replaced text in the next iteration. + index = m.start() + 1 + len(placeholder) else: break return text.split("\n") diff --git a/tests/test_syntax/extensions/test_attr_list.py b/tests/test_syntax/extensions/test_attr_list.py index ba8b2369..e9a10960 100644 --- a/tests/test_syntax/extensions/test_attr_list.py +++ b/tests/test_syntax/extensions/test_attr_list.py @@ -23,16 +23,53 @@ class TestAttrList(TestCase): - maxDiff = None + default_kwargs = {'extensions': ['attr_list']} # TODO: Move the rest of the `attr_list` tests here. - def test_empty_list(self): + def test_empty_attr_list(self): self.assertMarkdownRenders( '*foo*{ }', - '

<p><em>foo</em>{ }</p>

', - extensions=['attr_list'] + '

<p><em>foo</em>{ }</p>

' + ) + + def test_curly_after_inline(self): + self.assertMarkdownRenders( + '*inline*{.a} } *text*{.a }}', + '

<p><em class="a">inline</em> } <em class="a">text</em>}</p>

' + ) + + def test_extra_eq_gets_ignored_inside_curly_inline(self): + # Undesired behavior but kept for historic compatibility. + self.assertMarkdownRenders( + '*inline*{data-test="x" =a} *text*', + '

<p><em data-test="x">inline</em> <em>text</em></p>

' + ) + + def test_curly_after_block(self): + self.assertMarkdownRenders( + '# Heading {.a} }', + '

<h1>Heading {.a} }</h1>

' + ) + + def test_curly_in_single_quote(self): + self.assertMarkdownRenders( + "# Heading {data-test='{}'}", + '

<h1 data-test="{}">Heading</h1>

' + ) + + def test_curly_in_double_quote(self): + self.assertMarkdownRenders( + '# Heading {data-test="{}"}', + '

<h1 data-test="{}">Heading</h1>

' + ) + + def test_unclosed_quote_ignored(self): + # Undesired behavior but kept for historic compatibility. + self.assertMarkdownRenders( + '# Heading {foo="bar}', + '

<h1 foo="&quot;bar">Heading</h1>

' ) def test_table_td(self): diff --git a/tests/test_syntax/extensions/test_fenced_code.py b/tests/test_syntax/extensions/test_fenced_code.py index e24a1778..aa58bbfb 100644 --- a/tests/test_syntax/extensions/test_fenced_code.py +++ b/tests/test_syntax/extensions/test_fenced_code.py @@ -394,6 +394,48 @@ def testFencedCodeEscapedAttrs(self): extensions=['fenced_code', 'attr_list'] ) + def testFencedCodeCurlyInAttrs(self): + self.assertMarkdownRenders( + self.dedent( + ''' + ``` { data-test="{}" } + # Some python code + ``` + ''' + ), + self.dedent( + ''' +
<pre><code data-test="{}"># Some python code
+                </code></pre>
+ ''' + ), + extensions=['fenced_code', 'attr_list'] + ) + + def testFencedCodeMismatchedCurlyInAttrs(self): + self.assertMarkdownRenders( + self.dedent( + ''' + ``` { data-test="{}" } } + # Some python code + ``` + ``` + test + ``` + ''' + ), + self.dedent( + ''' +

<p>``` { data-test="{}" } }</p>

+

<h1>Some python code</h1>

+ <pre><code></code></pre>
+

<p>test + ```</p>

+ ''' + ), + extensions=['fenced_code', 'attr_list'] + ) + class TestFencedCodeWithCodehilite(TestCase):