Skip to content

Commit

Permalink
Allow attr_list quoted values to contain curly braces
Browse files Browse the repository at this point in the history
How it worked before:

  * Extract the content without allowing any `}` in it, and require that it ends with `}` - for block elements it's anchored to the end of the line, otherwise not.
  * Parse the content in more detail. No edge cases with `}` can arise. If parsing is interrupted by some unrecognized token, discard the rest of the string.

How it works now:

  * Extract the content *and allow* `}` in it, and require that it ends with `}` - for block elements it's anchored to the end of the line, otherwise not.
  * Parse the content in more detail. Allow `}` only within the quoted parts, otherwise interrupt parsing like for any other unrecognized token.
    If parsing is interrupted, there is remaining unrecognized text. Ideally perhaps we would bail out at this point entirely (and not recognize it as an attr_list), but to preserve historic behavior, any extra text before `}` is just discarded.
    If there is an extra `}` in the remaining text:
      * For block elements: that must mean that the attr_list syntax did not in fact terminate at the end of the line but earlier. So, bail out and do not register any attributes and do not change the original text.
      * For inline elements: that must mean that we overmatched a bit, but that's OK: we assign attrs as normal and put the extra text back into the string. As mentioned, any extra text *before* `}` is just discarded.
  • Loading branch information
oprypin committed Mar 12, 2024
1 parent 9edba85 commit 3d8afc6
Show file tree
Hide file tree
Showing 6 changed files with 153 additions and 40 deletions.
1 change: 1 addition & 0 deletions .spell-dict
Expand Up @@ -146,6 +146,7 @@ Treeprocessor
Treeprocessors
tuple
tuples
unparsable
unclosed
unescape
unescaping
Expand Down
4 changes: 3 additions & 1 deletion docs/changelog.md
Expand Up @@ -34,8 +34,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
* Include `scripts/*.py` in the generated source tarballs (#1430).
* Ensure lines after heading in loose list are properly detabbed (#1443).
* Give smarty tree processor higher priority than toc (#1440).
* Permit carrots (`^`) and square brackets (`]`) but explicitly exclude
* Permit carets (`^`) and square brackets (`]`) but explicitly exclude
backslashes (`\`) from abbreviations (#1444).
* In attribute lists (`attr_list`, `fenced_code`), quoted attribute values are
now allowed to contain curly braces (`}`) (#1414).

## [3.5.2] -- 2024-01-10

Expand Down
86 changes: 55 additions & 31 deletions markdown/extensions/attr_list.py
Expand Up @@ -57,17 +57,30 @@ def _handle_word(s, t):


_scanner = re.Scanner([
(r'[^ =]+=".*?"', _handle_double_quote),
(r"[^ =]+='.*?'", _handle_single_quote),
(r'[^ =]+=[^ =]+', _handle_key_value),
(r'[^ =]+', _handle_word),
(r'[^ =}]+=".*?"', _handle_double_quote),
(r"[^ =}]+='.*?'", _handle_single_quote),
(r'[^ =}]+=[^ =}]+', _handle_key_value),
(r'[^ =}]+', _handle_word),
(r' ', None)
])


def get_attrs(str: str) -> list[tuple[str, str]]:
""" Parse attribute list and return a list of attribute tuples. """
return _scanner.scan(str)[0]
def get_attrs_and_remainder(attrs_string: str) -> tuple[list[tuple[str, str]], str]:
""" Parse attribute list and return a list of attribute tuples.
Additionally, return any text that remained after a curly brace. In typical cases, its presence
should mean that the input does not match the intended attribute list syntax.
"""
attrs, remainder = _scanner.scan(attrs_string)
# To keep historic behavior, discard all unparsable text prior to '}'.
index = remainder.find('}')
remainder = remainder[index:] if index != -1 else ''
return attrs, remainder


def get_attrs(str: str) -> list[tuple[str, str]]: # pragma: no cover
""" Soft-deprecated. Prefer `get_attrs_and_remainder`. """
return get_attrs_and_remainder(str)[0]


def isheader(elem: Element) -> bool:
Expand All @@ -76,7 +89,7 @@ def isheader(elem: Element) -> bool:

class AttrListTreeprocessor(Treeprocessor):

BASE_RE = r'\{\:?[ ]*([^\}\n ][^\}\n]*)[ ]*\}'
BASE_RE = r'\{\:?[ ]*([^\}\n ][^\n]*)[ ]*\}'
HEADER_RE = re.compile(r'[ ]+{}[ ]*$'.format(BASE_RE))
BLOCK_RE = re.compile(r'\n[ ]*{}[ ]*$'.format(BASE_RE))
INLINE_RE = re.compile(r'^{}'.format(BASE_RE))
Expand Down Expand Up @@ -106,49 +119,58 @@ def run(self, doc: Element) -> None:
# use tail of last child. no `ul` or `ol`.
m = RE.search(elem[-1].tail)
if m:
self.assign_attrs(elem, m.group(1))
elem[-1].tail = elem[-1].tail[:m.start()]
if not self.assign_attrs(elem, m.group(1), strict=True):
elem[-1].tail = elem[-1].tail[:m.start()]
elif pos is not None and pos > 0 and elem[pos-1].tail:
# use tail of last child before `ul` or `ol`
m = RE.search(elem[pos-1].tail)
if m:
self.assign_attrs(elem, m.group(1))
elem[pos-1].tail = elem[pos-1].tail[:m.start()]
if not self.assign_attrs(elem, m.group(1), strict=True):
elem[pos-1].tail = elem[pos-1].tail[:m.start()]
elif elem.text:
# use text. `ul` is first child.
m = RE.search(elem.text)
if m:
self.assign_attrs(elem, m.group(1))
elem.text = elem.text[:m.start()]
if not self.assign_attrs(elem, m.group(1), strict=True):
elem.text = elem.text[:m.start()]
elif len(elem) and elem[-1].tail:
# has children. Get from tail of last child
m = RE.search(elem[-1].tail)
if m:
self.assign_attrs(elem, m.group(1))
elem[-1].tail = elem[-1].tail[:m.start()]
if isheader(elem):
# clean up trailing #s
elem[-1].tail = elem[-1].tail.rstrip('#').rstrip()
if not self.assign_attrs(elem, m.group(1), strict=True):
elem[-1].tail = elem[-1].tail[:m.start()]
if isheader(elem):
# clean up trailing #s
elem[-1].tail = elem[-1].tail.rstrip('#').rstrip()
elif elem.text:
# no children. Get from text.
m = RE.search(elem.text)
if m:
self.assign_attrs(elem, m.group(1))
elem.text = elem.text[:m.start()]
if isheader(elem):
# clean up trailing #s
elem.text = elem.text.rstrip('#').rstrip()
if not self.assign_attrs(elem, m.group(1), strict=True):
elem.text = elem.text[:m.start()]
if isheader(elem):
# clean up trailing #s
elem.text = elem.text.rstrip('#').rstrip()
else:
# inline: check for `attrs` at start of tail
if elem.tail:
m = self.INLINE_RE.match(elem.tail)
if m:
self.assign_attrs(elem, m.group(1))
elem.tail = elem.tail[m.end():]
remainder = self.assign_attrs(elem, m.group(1))
elem.tail = elem.tail[m.end():] + remainder

def assign_attrs(self, elem: Element, attrs_string: str, *, strict: bool = False) -> str:
""" Assign `attrs` to element.
If the `attrs_string` has an extra closing curly brace, the remaining text is returned.
The `strict` argument controls whether to still assign `attrs` if there is a remaining `}`.
"""
attrs, remainder = get_attrs_and_remainder(attrs_string)
if strict and remainder:
return remainder

def assign_attrs(self, elem: Element, attrs: str) -> None:
""" Assign `attrs` to element. """
for k, v in get_attrs(attrs):
for k, v in attrs:
if k == '.':
# add to class
cls = elem.get('class')
Expand All @@ -159,11 +181,13 @@ def assign_attrs(self, elem: Element, attrs: str) -> None:
else:
# assign attribute `k` with `v`
elem.set(self.sanitize_name(k), v)
# The text that we initially over-matched will be put back.
return remainder

def sanitize_name(self, name: str) -> str:
"""
Sanitize name as 'an XML Name, minus the ":"'.
See https://www.w3.org/TR/REC-xml-names/#NT-NCName
Sanitize name as 'an XML Name, minus the `:`.'
See <https://www.w3.org/TR/REC-xml-names/#NT-NCName>.
"""
return self.NAME_RE.sub('_', name)

Expand Down
15 changes: 11 additions & 4 deletions markdown/extensions/fenced_code.py
Expand Up @@ -25,7 +25,7 @@
from . import Extension
from ..preprocessors import Preprocessor
from .codehilite import CodeHilite, CodeHiliteExtension, parse_hl_lines
from .attr_list import get_attrs, AttrListExtension
from .attr_list import get_attrs_and_remainder, AttrListExtension
from ..util import parseBoolValue
from ..serializers import _escape_attrib_html
import re
Expand Down Expand Up @@ -56,7 +56,7 @@ class FencedBlockPreprocessor(Preprocessor):
FENCED_BLOCK_RE = re.compile(
dedent(r'''
(?P<fence>^(?:~{3,}|`{3,}))[ ]* # opening fence
((\{(?P<attrs>[^\}\n]*)\})| # (optional {attrs} or
((\{(?P<attrs>[^\n]*)\})| # (optional {attrs} or
(\.?(?P<lang>[\w#.+-]*)[ ]*)? # optional (.)lang
(hl_lines=(?P<quot>"|')(?P<hl_lines>.*?)(?P=quot)[ ]*)?) # optional hl_lines)
\n # newline (end of opening fence)
Expand Down Expand Up @@ -94,12 +94,17 @@ def run(self, lines: list[str]) -> list[str]:
self.checked_for_deps = True

text = "\n".join(lines)
index = 0
while 1:
m = self.FENCED_BLOCK_RE.search(text)
m = self.FENCED_BLOCK_RE.search(text, index)
if m:
lang, id, classes, config = None, '', [], {}
if m.group('attrs'):
id, classes, config = self.handle_attrs(get_attrs(m.group('attrs')))
attrs, remainder = get_attrs_and_remainder(m.group('attrs'))
if remainder: # Does not have correctly matching curly braces, so the syntax is invalid.
index = m.end('attrs') # Explicitly skip over this, to prevent an infinite loop.
continue
id, classes, config = self.handle_attrs(attrs)
if len(classes):
lang = classes.pop(0)
else:
Expand Down Expand Up @@ -151,6 +156,8 @@ def run(self, lines: list[str]) -> list[str]:

placeholder = self.md.htmlStash.store(code)
text = f'{text[:m.start()]}\n{placeholder}\n{text[m.end():]}'
# Continue from after the replaced text in the next iteration.
index = m.start() + 1 + len(placeholder)
else:
break
return text.split("\n")
Expand Down
45 changes: 41 additions & 4 deletions tests/test_syntax/extensions/test_attr_list.py
Expand Up @@ -23,16 +23,53 @@


class TestAttrList(TestCase):

maxDiff = None
default_kwargs = {'extensions': ['attr_list']}

# TODO: Move the rest of the `attr_list` tests here.

def test_empty_list(self):
def test_empty_attr_list(self):
self.assertMarkdownRenders(
'*foo*{ }',
'<p><em>foo</em>{ }</p>',
extensions=['attr_list']
'<p><em>foo</em>{ }</p>'
)

def test_curly_after_inline(self):
self.assertMarkdownRenders(
'*inline*{.a} } *text*{.a }}',
'<p><em class="a">inline</em> } <em class="a">text</em>}</p>'
)

def test_extra_eq_gets_ignored_inside_curly_inline(self):
# Undesired behavior but kept for historic compatibility.
self.assertMarkdownRenders(
'*inline*{data-test="x" =a} *text*',
'<p><em data-test="x">inline</em> <em>text</em></p>'
)

def test_curly_after_block(self):
self.assertMarkdownRenders(
'# Heading {.a} }',
'<h1>Heading {.a} }</h1>'
)

def test_curly_in_single_quote(self):
self.assertMarkdownRenders(
"# Heading {data-test='{}'}",
'<h1 data-test="{}">Heading</h1>'
)

def test_curly_in_double_quote(self):
self.assertMarkdownRenders(
'# Heading {data-test="{}"}',
'<h1 data-test="{}">Heading</h1>'
)

def test_unclosed_quote_ignored(self):
# Undesired behavior but kept for historic compatibility.
self.assertMarkdownRenders(
'# Heading {foo="bar}',
'<h1 foo="&quot;bar">Heading</h1>'
)

def test_table_td(self):
Expand Down
42 changes: 42 additions & 0 deletions tests/test_syntax/extensions/test_fenced_code.py
Expand Up @@ -394,6 +394,48 @@ def testFencedCodeEscapedAttrs(self):
extensions=['fenced_code', 'attr_list']
)

def testFencedCodeCurlyInAttrs(self):
self.assertMarkdownRenders(
self.dedent(
'''
``` { data-test="{}" }
# Some python code
```
'''
),
self.dedent(
'''
<pre><code data-test="{}"># Some python code
</code></pre>
'''
),
extensions=['fenced_code', 'attr_list']
)

def testFencedCodeMismatchedCurlyInAttrs(self):
self.assertMarkdownRenders(
self.dedent(
'''
``` { data-test="{}" } }
# Some python code
```
```
test
```
'''
),
self.dedent(
'''
<p>``` { data-test="{}" } }</p>
<h1>Some python code</h1>
<pre><code></code></pre>
<p>test
```</p>
'''
),
extensions=['fenced_code', 'attr_list']
)


class TestFencedCodeWithCodehilite(TestCase):

Expand Down

0 comments on commit 3d8afc6

Please sign in to comment.