From e4ab4a610edc6332ce81a53aa4ae6f97516ce461 Mon Sep 17 00:00:00 2001 From: Waylan Limberg Date: Fri, 8 Mar 2024 09:05:34 -0500 Subject: [PATCH] Refactor TOC sanitation * All postprocessors are run on heading content. * Footnote references are stripped from heading content. Fixes #660. * A more robust `strip_tags` is provided to convert headings to plain text. Unlike the `markupsafe` implementation, HTML entities are not unescaped. * The plain text `name`, rich `html` and unescaped raw `data-toc-label` are saved to `toc_tokens`, allowing users to access the full rich text content of the headings directly from `toc_tokens`. * `data-toc-label` is sanitized separately from heading content. * An `html.unescape` call is made just prior to calling `slugify` so that `slugify` only operates on Unicode characters. Note that `html.unescape` is not run on the `name` or `html`. * The `get_name` and `stashedHTML2text` functions defined in the `toc` extension are both **deprecated**. Instead, use some combination of `run_postprocessors`, `render_inner_html` and `strip_tags`. Co-authored-by: Oleh Prypin --- docs/changelog.md | 19 + docs/extensions/toc.md | 11 + markdown/extensions/toc.py | 104 ++- tests/test_extensions.py | 412 ---------- tests/test_syntax/extensions/test_smarty.py | 2 + tests/test_syntax/extensions/test_toc.py | 784 +++++++++++++++++++- 6 files changed, 904 insertions(+), 428 deletions(-) diff --git a/docs/changelog.md b/docs/changelog.md index 8deaefd2..a71dbf25 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -10,6 +10,25 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [unreleased] +### Changed + +#### Refactor TOC Sanitation + +* All postprocessors are run on heading content. +* Footnote references are stripped from heading content. Fixes #660. +* A more robust `strip_tags` is provided to convert headings to plain text. + Unlike the `markupsafe` implementation, HTML entities are not unescaped. 
+* The plain text `name`, rich `html` and unescaped raw `data-toc-label` are + saved to `toc_tokens`, allowing users to access the full rich text content of + the headings directly from `toc_tokens`. +* `data-toc-label` is sanitized separately from heading content. +* An `html.unescape` call is made just prior to calling `slugify` so that + `slugify` only operates on Unicode characters. Note that `html.unescape` is + not run on the `name` or `html`. +* The `get_name` and `stashedHTML2text` functions defined in the `toc` extension + are both **deprecated**. Instead, use some combination of `run_postprocessors`, + `render_inner_html` and `strip_tags`. + ### Fixed * Include `scripts/*.py` in the generated source tarballs (#1430). diff --git a/docs/extensions/toc.md b/docs/extensions/toc.md index 1f80c7ea..d1c64a9d 100644 --- a/docs/extensions/toc.md +++ b/docs/extensions/toc.md @@ -80,6 +80,8 @@ the following object at `md.toc_tokens`: 'level': 1, 'id': 'header-1', 'name': 'Header 1', + 'html': 'Header 1', + 'data-toc-label': '', 'children': [ {'level': 2, 'id': 'header-2', 'name': 'Header 2', 'children':[]} ] @@ -91,6 +93,11 @@ Note that the `level` refers to the `hn` level. In other words, `

<h1>` is level `1` and `<h2>

` is level `2`, etc. Be aware that improperly nested levels in the input may result in odd nesting of the output. +`name` is the sanitized value which would also be used as a label for the HTML +version of the Table of Contents. `html` contains the fully rendered HTML +content of the heading and has not been sanitized in any way. This may be used +with your own custom sanitation to create custom table of contents. + ### Custom Labels In most cases, the text label in the Table of Contents should match the text of @@ -131,6 +138,10 @@ attribute list to provide a cleaner URL when linking to the header. If the ID is not manually defined, it is always derived from the text of the header, never from the `data-toc-label` attribute. +The value of the `data-toc-label` attribute is sanitized and stripped of any HTML +tags. However, `toc_tokens` will contain the raw content under +`data-toc-label`. + Usage ----- diff --git a/markdown/extensions/toc.py b/markdown/extensions/toc.py index a17d7241..5462a906 100644 --- a/markdown/extensions/toc.py +++ b/markdown/extensions/toc.py @@ -6,7 +6,7 @@ # Original code Copyright 2008 [Jack Miller](https://codezen.org/) -# All changes Copyright 2008-2014 The Python Markdown Project +# All changes Copyright 2008-2024 The Python Markdown Project # License: [BSD](https://opensource.org/licenses/bsd-license.php) @@ -21,11 +21,13 @@ from . 
import Extension from ..treeprocessors import Treeprocessor -from ..util import code_escape, parseBoolValue, AMP_SUBSTITUTE, HTML_PLACEHOLDER_RE, AtomicString +from ..util import parseBoolValue, AMP_SUBSTITUTE, deprecated, HTML_PLACEHOLDER_RE, AtomicString from ..treeprocessors import UnescapeTreeprocessor +from ..serializers import RE_AMP import re import html import unicodedata +from copy import deepcopy import xml.etree.ElementTree as etree from typing import TYPE_CHECKING, Any, Iterator, MutableSet @@ -63,6 +65,7 @@ def unique(id: str, ids: MutableSet[str]) -> str: return id +@deprecated('Use `render_inner_html` and `striptags` instead.') def get_name(el: etree.Element) -> str: """Get title name.""" @@ -75,6 +78,7 @@ def get_name(el: etree.Element) -> str: return ''.join(text).strip() +@deprecated('Use `run_postprocessors`, `render_inner_html` and/or `striptags` instead.') def stashedHTML2text(text: str, md: Markdown, strip_entities: bool = True) -> str: """ Extract raw HTML from stash, reduce to plain text and swap with placeholder. """ def _html_sub(m: re.Match[str]) -> str: @@ -93,11 +97,80 @@ def _html_sub(m: re.Match[str]) -> str: def unescape(text: str) -> str: - """ Unescape escaped text. """ + """ Unescape Markdown backslash escaped text. """ c = UnescapeTreeprocessor() return c.unescape(text) +def strip_tags(text: str) -> str: + """ Strip HTML tags and return plain text. Note: HTML entities are unaffected. """ + # A comment could contain a tag, so strip comments first + while (start := text.find('', start)) != -1: + text = f'{text[:start]}{text[end + 3:]}' + + while (start := text.find('<')) != -1 and (end := text.find('>', start)) != -1: + text = f'{text[:start]}{text[end + 1:]}' + + # Collapse whitespace + text = ' '.join(text.split()) + return text + + +def escape_cdata(text: str) -> str: + """ Escape character data. 
""" + if "&" in text: + # Only replace & when not part of an entity + text = RE_AMP.sub('&', text) + if "<" in text: + text = text.replace("<", "<") + if ">" in text: + text = text.replace(">", ">") + return text + + +def run_postprocessors(text: str, md: Markdown) -> str: + """ Run postprocessors from Markdown instance on text. """ + for pp in md.postprocessors: + text = pp.run(text) + return text.strip() + + +def render_inner_html(el: etree.Element, md: Markdown) -> str: + """ Fully render inner html of an `etree` element as a string. """ + # The `UnescapeTreeprocessor` runs after `toc` extension so run here. + text = unescape(md.serializer(el)) + + # strip parent tag + start = text.index('>') + 1 + end = text.rindex('<') + text = text[start:end].strip() + + return run_postprocessors(text, md) + + +def remove_fnrefs(root: etree.Element) -> etree.Element: + """ Remove footnote references from a copy of the element, if any are present. """ + # Remove footnote references, which look like this: `...`. + # If there are no `sup` elements, then nothing to do. + if next(root.iter('sup'), None) is None: + return root + root = deepcopy(root) + # Find parent elements that contain `sup` elements. + for parent in root.findall('.//sup/..'): + carry_text = "" + for child in reversed(parent): # Reversed for the ability to mutate during iteration. + # Remove matching footnote references but carry any `tail` text to preceding elements. + if child.tag == 'sup' and child.get('id', '').startswith('fnref'): + carry_text = f'{child.tail or ""}{carry_text}' + parent.remove(child) + elif carry_text: + child.tail = f'{child.tail or ""}{carry_text}' + carry_text = "" + if carry_text: + parent.text = f'{parent.text or ""}{carry_text}' + return root + + def nest_toc_tokens(toc_list): """Given an unsorted list with errors and skips, return a nested one. 
@@ -300,27 +373,30 @@ def run(self, doc: etree.Element) -> None: for el in doc.iter(): if isinstance(el.tag, str) and self.header_rgx.match(el.tag): self.set_level(el) - text = get_name(el) + innerhtml = render_inner_html(remove_fnrefs(el), self.md) + name = strip_tags(innerhtml) # Do not override pre-existing ids if "id" not in el.attrib: - innertext = unescape(stashedHTML2text(text, self.md)) - el.attrib["id"] = unique(self.slugify(innertext, self.sep), used_ids) + el.attrib["id"] = unique(self.slugify(html.unescape(name), self.sep), used_ids) + + data_toc_label = '' + if 'data-toc-label' in el.attrib: + data_toc_label = run_postprocessors(unescape(el.attrib['data-toc-label']), self.md) + # Overwrite name with sanitized value of `data-toc-label`. + name = escape_cdata(strip_tags(data_toc_label)) + # Remove the data-toc-label attribute as it is no longer needed + del el.attrib['data-toc-label'] if int(el.tag[-1]) >= self.toc_top and int(el.tag[-1]) <= self.toc_bottom: toc_tokens.append({ 'level': int(el.tag[-1]), 'id': el.attrib["id"], - 'name': unescape(stashedHTML2text( - code_escape(el.attrib.get('data-toc-label', text)), - self.md, strip_entities=False - )) + 'name': name, + 'html': innerhtml, + 'data-toc-label': data_toc_label }) - # Remove the data-toc-label attribute as it is no longer needed - if 'data-toc-label' in el.attrib: - del el.attrib['data-toc-label'] - if self.use_anchors: self.add_anchor(el, el.attrib["id"]) if self.use_permalinks not in [False, None]: diff --git a/tests/test_extensions.py b/tests/test_extensions.py index c96772ff..b8bc3c81 100644 --- a/tests/test_extensions.py +++ b/tests/test_extensions.py @@ -29,17 +29,6 @@ import markdown -class TestCaseWithAssertStartsWith(unittest.TestCase): - - def assertStartsWith(self, expectedPrefix, text, msg=None): - if not text.startswith(expectedPrefix): - if len(expectedPrefix) + 5 < len(text): - text = text[:len(expectedPrefix) + 5] + '...' 
- standardMsg = '{} not found at the start of {}'.format(repr(expectedPrefix), - repr(text)) - self.fail(self._formatMessage(msg, standardMsg)) - - class TestExtensionClass(unittest.TestCase): """ Test markdown.extensions.Extension. """ @@ -281,407 +270,6 @@ def testRE(self): self.assertEqual(RE.match(test).groups(), expected) -class TestTOC(TestCaseWithAssertStartsWith): - """ Test TOC Extension. """ - - def setUp(self): - self.md = markdown.Markdown(extensions=['toc']) - - def testMarker(self): - """ Test TOC with a Marker. """ - text = '[TOC]\n\n# Header 1\n\n## Header 2' - self.assertEqual( - self.md.convert(text), - '
\n' - '
    \n' # noqa - '
  • Header 1' # noqa - '\n' # noqa - '
  • \n' # noqa - '
\n' # noqa - '
\n' - '

Header 1

\n' - '

Header 2

' - ) - - def testNoMarker(self): - """ Test TOC without a Marker. """ - text = '# Header 1\n\n## Header 2' - self.assertEqual( - self.md.convert(text), - '

Header 1

\n' - '

Header 2

' - ) - self.assertEqual( - self.md.toc, - '
\n' - '
    \n' # noqa - '
  • Header 1' # noqa - '\n' # noqa - '
  • \n' # noqa - '
\n' # noqa - '
\n' - ) - - def testAlternateMarker(self): - """ Test TOC with user defined marker. """ - md = markdown.Markdown( - extensions=[markdown.extensions.toc.TocExtension(marker='{{marker}}')] - ) - text = '{{marker}}\n\n# Header 1\n\n## Header 2' - self.assertEqual( - md.convert(text), - '
\n' - '
    \n' # noqa - '
  • Header 1' # noqa - '\n' # noqa - '
  • \n' # noqa - '
\n' # noqa - '
\n' - '

Header 1

\n' - '

Header 2

' - ) - - def testDisabledMarker(self): - """ Test TOC with disabled marker. """ - md = markdown.Markdown( - extensions=[markdown.extensions.toc.TocExtension(marker='')] - ) - text = '[TOC]\n\n# Header 1\n\n## Header 2' - self.assertEqual( - md.convert(text), - '

[TOC]

\n' - '

Header 1

\n' - '

Header 2

' - ) - self.assertStartsWith('
', md.toc) - - def testReset(self): - """ Test TOC Reset. """ - self.assertEqual(self.md.toc, '') - self.md.convert('# Header 1\n\n## Header 2') - self.assertStartsWith('
', self.md.toc) - self.md.reset() - self.assertEqual(self.md.toc, '') - self.assertEqual(self.md.toc_tokens, []) - - def testUniqueIds(self): - """ Test Unique IDs. """ - - text = '#Header\n#Header\n#Header' - self.assertEqual( - self.md.convert(text), - '

Header

\n' - '

Header

\n' - '

Header

' - ) - self.assertEqual( - self.md.toc, - '
\n' - '\n' # noqa - '
\n' - ) - self.assertEqual(self.md.toc_tokens, [ - {'level': 1, 'id': 'header', 'name': 'Header', 'children': []}, - {'level': 1, 'id': 'header_1', 'name': 'Header', 'children': []}, - {'level': 1, 'id': 'header_2', 'name': 'Header', 'children': []}, - ]) - - def testHtmlEntities(self): - """ Test Headers with HTML Entities. """ - text = '# Foo & bar' - self.assertEqual( - self.md.convert(text), - '

Foo & bar

' - ) - self.assertEqual( - self.md.toc, - '
\n' - '\n' # noqa - '
\n' - ) - self.assertEqual(self.md.toc_tokens, [ - {'level': 1, 'id': 'foo-bar', 'name': 'Foo & bar', 'children': []}, - ]) - - def testHtmlSpecialChars(self): - """ Test Headers with HTML special characters. """ - text = '# Foo > & bar' - self.assertEqual( - self.md.convert(text), - '

Foo > & bar

' - ) - self.assertEqual( - self.md.toc, - '
\n' - '\n' # noqa - '
\n' - ) - self.assertEqual(self.md.toc_tokens, [ - {'level': 1, 'id': 'foo-bar', 'name': 'Foo > & bar', 'children': []}, - ]) - - def testRawHtml(self): - """ Test Headers with raw HTML. """ - text = '# Foo Bar Baz.' - self.assertEqual( - self.md.convert(text), - '

Foo Bar Baz.

' - ) - self.assertEqual( - self.md.toc, - '
\n' - '\n' # noqa - '
\n' - ) - self.assertEqual(self.md.toc_tokens, [ - {'level': 1, 'id': 'foo-bar-baz', 'name': 'Foo Bar Baz.', 'children': []}, - ]) - - def testBaseLevel(self): - """ Test Header Base Level. """ - md = markdown.Markdown( - extensions=[markdown.extensions.toc.TocExtension(baselevel=5)] - ) - text = '# Some Header\n\n## Next Level\n\n### Too High' - self.assertEqual( - md.convert(text), - '
Some Header
\n' - '
Next Level
\n' - '
Too High
' - ) - self.assertEqual( - md.toc, - '
\n' - '\n' # noqa - '
\n' - ) - self.assertEqual(md.toc_tokens, [ - {'level': 5, 'id': 'some-header', 'name': 'Some Header', 'children': [ - {'level': 6, 'id': 'next-level', 'name': 'Next Level', 'children': []}, - {'level': 6, 'id': 'too-high', 'name': 'Too High', 'children': []}, - ]}, - ]) - - def testHeaderInlineMarkup(self): - """ Test Headers with inline markup. """ - - text = '#Some *Header* with [markup](http://example.com).' - self.assertEqual( - self.md.convert(text), - '

Some Header with ' - 'markup.

' - ) - self.assertEqual( - self.md.toc, - '
\n' - '\n' # noqa - '
\n' - ) - self.assertEqual(self.md.toc_tokens, [ - {'level': 1, 'id': 'some-header-with-markup', 'name': 'Some Header with markup.', 'children': []}, - ]) - - def testTitle(self): - """ Test TOC Title. """ - md = markdown.Markdown( - extensions=[markdown.extensions.toc.TocExtension(title='Table of Contents')] - ) - md.convert('# Header 1\n\n## Header 2') - self.assertStartsWith( - '
Table of Contents
    ', - md.toc - ) - - def testWithAttrList(self): - """ Test TOC with `attr_list` Extension. """ - md = markdown.Markdown(extensions=['toc', 'attr_list']) - text = ('# Header 1\n\n' - '## Header 2 { #foo }\n\n' - '## Header 3 { data-toc-label="Foo Bar" }\n\n' - '# Header 4 { data-toc-label="Foo > Baz" }\n\n' - '# Header 5 { data-toc-label="Foo Quux" }') - - self.assertEqual( - md.convert(text), - '

    Header 1

    \n' - '

    Header 2

    \n' - '

    Header 3

    \n' - '

    Header 4

    \n' - '

    Header 5

    ' - ) - self.assertEqual( - md.toc, - '
    \n' - '\n' # noqa - '
    \n' - ) - self.assertEqual(md.toc_tokens, [ - {'level': 1, 'id': 'header-1', 'name': 'Header 1', 'children': [ - {'level': 2, 'id': 'foo', 'name': 'Header 2', 'children': []}, - {'level': 2, 'id': 'header-3', 'name': 'Foo Bar', 'children': []} - ]}, - {'level': 1, 'id': 'header-4', 'name': 'Foo > Baz', 'children': []}, - {'level': 1, 'id': 'header-5', 'name': 'Foo Quux', 'children': []}, - ]) - - def testUniqueFunc(self): - """ Test 'unique' function. """ - from markdown.extensions.toc import unique - ids = {'foo'} - self.assertEqual(unique('foo', ids), 'foo_1') - self.assertEqual(ids, {'foo', 'foo_1'}) - - def testTocInHeaders(self): - - text = '[TOC]\n#[TOC]' - self.assertEqual( - self.md.convert(text), - '
    \n' # noqa - '
      \n' # noqa - '
    • [TOC]
    • \n' # noqa - '
    \n' # noqa - '
    \n' # noqa - '

    [TOC]

    ' # noqa - ) - - text = '#[TOC]\n[TOC]' - self.assertEqual( - self.md.convert(text), - '

    [TOC]

    \n' # noqa - '
    \n' # noqa - '
      \n' # noqa - '
    • [TOC]
    • \n' # noqa - '
    \n' # noqa - '
    ' # noqa - ) - - text = '[TOC]\n# *[TOC]*' - self.assertEqual( - self.md.convert(text), - '
    \n' # noqa - '
      \n' # noqa - '
    • [TOC]
    • \n' # noqa - '
    \n' # noqa - '
    \n' # noqa - '

    [TOC]

    ' # noqa - ) - - def testPermalink(self): - """ Test TOC `permalink` feature. """ - text = '# Hd 1\n\n## Hd 2' - md = markdown.Markdown( - extensions=[markdown.extensions.toc.TocExtension( - permalink=True, permalink_title="PL")] - ) - self.assertEqual( - md.convert(text), - '

    ' - 'Hd 1' # noqa - '' # noqa - '¶' # noqa - '' # noqa - '

    \n' - '

    ' - 'Hd 2' # noqa - '' # noqa - '¶' # noqa - '' # noqa - '

    ' - ) - - def testPermalinkLeading(self): - """ Test TOC `permalink` with `permalink_leading` option. """ - text = '# Hd 1\n\n## Hd 2' - md = markdown.Markdown(extensions=[ - markdown.extensions.toc.TocExtension( - permalink=True, permalink_title="PL", permalink_leading=True)] - ) - self.assertEqual( - md.convert(text), - '

    ' - '' # noqa - '¶' # noqa - '' # noqa - 'Hd 1' # noqa - '

    \n' - '

    ' - '' # noqa - '¶' # noqa - '' # noqa - 'Hd 2' # noqa - '

    ' - ) - - def testInlineMarkupPermalink(self): - """ Test TOC `permalink` with headers containing markup. """ - text = '# Code `in` hd' - md = markdown.Markdown( - extensions=[markdown.extensions.toc.TocExtension( - permalink=True, permalink_title="PL")] - ) - self.assertEqual( - md.convert(text), - '

    ' - 'Code in hd' # noqa - '' # noqa - '¶' # noqa - '' # noqa - '

    ' - ) - - def testInlineMarkupPermalinkLeading(self): - """ Test TOC `permalink_leading` with headers containing markup. """ - text = '# Code `in` hd' - md = markdown.Markdown(extensions=[ - markdown.extensions.toc.TocExtension( - permalink=True, permalink_title="PL", permalink_leading=True)] - ) - self.assertEqual( - md.convert(text), - '

    ' - '' # noqa - '¶' # noqa - '' # noqa - 'Code in hd' # noqa - '

    ' - ) - - class TestSmarty(unittest.TestCase): def setUp(self): config = { diff --git a/tests/test_syntax/extensions/test_smarty.py b/tests/test_syntax/extensions/test_smarty.py index 8a176745..0228ddf0 100644 --- a/tests/test_syntax/extensions/test_smarty.py +++ b/tests/test_syntax/extensions/test_smarty.py @@ -216,6 +216,8 @@ def test_smarty_and_toc(self): 'level': 1, 'id': 'foo-bar', 'name': 'Foo — bar', + 'html': 'Foobar', + 'data-toc-label': '', 'children': [], }, ], diff --git a/tests/test_syntax/extensions/test_toc.py b/tests/test_syntax/extensions/test_toc.py index 79764364..9902072a 100644 --- a/tests/test_syntax/extensions/test_toc.py +++ b/tests/test_syntax/extensions/test_toc.py @@ -20,14 +20,500 @@ """ from markdown.test_tools import TestCase -from markdown.extensions.toc import TocExtension +from markdown import Markdown +from markdown.extensions.toc import TocExtension, strip_tags, unique from markdown.extensions.nl2br import Nl2BrExtension class TestTOC(TestCase): maxDiff = None + default_kwargs = { + 'extensions': [TocExtension()] + } - # TODO: Move the rest of the TOC tests here. + def testTOCMarker(self): + self.assertMarkdownRenders( + self.dedent( + ''' + [TOC] + + # Header 1 + + ## Header 2 + ''' + ), + '
    \n' + '
      \n' # noqa + '
    • Header 1' # noqa + '\n' # noqa + '
    • \n' # noqa + '
    \n' # noqa + '
    \n' + '

    Header 1

    \n' + '

    Header 2

    ' + ) + + def testNoTOCMarker(self): + self.assertMarkdownRenders( + self.dedent( + ''' + # Header 1 + + ## Header 2 + ''' + ), + self.dedent( + ''' +

    Header 1

    +

    Header 2

    + ''' + ), + expected_attrs={ + 'toc': ( + '
    \n' + '
      \n' # noqa + '
    • Header 1' # noqa + '\n' # noqa + '
    • \n' # noqa + '
    \n' # noqa + '
    \n' + ) + } + ) + + def testAlternateTOCMarker(self): + self.assertMarkdownRenders( + self.dedent( + ''' + {{marker}} + + # Header 1 + + ## Header 2 + ''' + ), + '
    \n' + '
      \n' # noqa + '
    • Header 1' # noqa + '\n' # noqa + '
    • \n' # noqa + '
    \n' # noqa + '
    \n' + '

    Header 1

    \n' + '

    Header 2

    ', + extensions=[TocExtension(marker='{{marker}}')] + ) + + def testDisabledTOCMarker(self): + self.assertMarkdownRenders( + self.dedent( + ''' + [TOC] + + # Header 1 + + ## Header 2 + ''' + ), + self.dedent( + ''' +

    [TOC]

    +

    Header 1

    +

    Header 2

    + ''' + ), + expected_attrs={ + 'toc': ( + '
    \n' + '
      \n' # noqa + '
    • Header 1' # noqa + '\n' # noqa + '
    • \n' # noqa + '
    \n' # noqa + '
    \n' + ) + }, + extensions=[TocExtension(marker='')] + ) + + def testTOCReset(self): + md = Markdown(extensions=[TocExtension()]) + self.assertEqual(md.toc, '') + self.assertEqual(md.toc_tokens, []) + md.convert('# Header 1') + self.assertEqual('
    ', md.toc[:17]) + self.assertEqual(len(md.toc_tokens), 1) + md.reset() + self.assertEqual(md.toc, '') + self.assertEqual(md.toc_tokens, []) + + def testUniqueIds(self): + self.assertMarkdownRenders( + self.dedent( + ''' + #Header + #Header + #Header + ''' + ), + self.dedent( + ''' +

    Header

    +

    Header

    +

    Header

    + ''' + ), + expected_attrs={ + 'toc': ( + '
    \n' + '\n' # noqa + '
    \n' + ), + 'toc_tokens': [ + { + 'level': 1, + 'id': 'header', + 'name': 'Header', + 'html': 'Header', + 'data-toc-label': '', + 'children': [] + }, + { + 'level': 1, + 'id': 'header_1', + 'name': 'Header', + 'html': 'Header', + 'data-toc-label': '', + 'children': [] + }, + { + 'level': 1, + 'id': 'header_2', + 'name': 'Header', + 'html': 'Header', + 'data-toc-label': '', + 'children': [] + }, + ] + } + ) + + def testHtmlEntitiesInTOC(self): + self.assertMarkdownRenders( + '# Foo & bar', + '

    Foo & bar

    ', + expected_attrs={ + 'toc': ( + '
    \n' + '\n' # noqa + '
    \n' + ), + 'toc_tokens': [{ + 'level': 1, + 'id': 'foo-bar', + 'name': 'Foo & bar', + 'html': 'Foo & bar', + 'data-toc-label': '', + 'children': [] + }] + } + ) + + def testHtmlSpecialCharsInTOC(self): + self.assertMarkdownRenders( + '# Foo > & bar', + '

    Foo > & bar

    ', + expected_attrs={ + 'toc': ( + '
    \n' + '\n' # noqa + '
    \n' + ), + 'toc_tokens': [{ + 'level': 1, + 'id': 'foo-bar', + 'name': 'Foo > & bar', + 'html': 'Foo > & bar', + 'data-toc-label': '', + 'children': [] + }] + } + ) + + def testRawHtmlInTOC(self): + self.assertMarkdownRenders( + '# Foo Bar Baz.', + '

    Foo Bar Baz.

    ', + expected_attrs={ + 'toc': ( + '
    \n' + '\n' # noqa + '
    \n' + ), + 'toc_tokens': [{ + 'level': 1, + 'id': 'foo-bar-baz', + 'name': 'Foo Bar Baz.', + 'html': 'Foo Bar Baz.', + 'data-toc-label': '', + 'children': [] + }] + } + ) + + def testTOCBaseLevel(self): + self.assertMarkdownRenders( + self.dedent( + ''' + # Some Header + ## Next Level + ### Too High + ''' + ), + self.dedent( + ''' +
    Some Header
    +
    Next Level
    +
    Too High
    + ''' + ), + expected_attrs={ + 'toc': ( + '
    \n' + '\n' # noqa + '
    \n' + ), + 'toc_tokens': [{ + 'level': 5, + 'id': 'some-header', + 'name': 'Some Header', + 'html': 'Some Header', + 'data-toc-label': '', + 'children': [ + { + 'level': 6, + 'id': 'next-level', + 'name': 'Next Level', + 'html': 'Next Level', + 'data-toc-label': '', + 'children': [] + }, + { + 'level': 6, + 'id': 'too-high', + 'name': 'Too High', + 'html': 'Too High', + 'data-toc-label': '', + 'children': [] + } + ] + }] + }, + extensions=[TocExtension(baselevel=5)] + ) + + def testHeaderInlineMarkup(self): + self.assertMarkdownRenders( + '#Some *Header* with [markup](http://example.com).', + '

    Some Header with ' + 'markup.

    ', + expected_attrs={ + 'toc': ( + '
    \n' + '\n' # noqa + '
    \n' + ), + 'toc_tokens': [{ + 'level': 1, + 'id': 'some-header-with-markup', + 'name': 'Some Header with markup.', + 'html': 'Some Header with markup.', + 'data-toc-label': '', + 'children': [] + }] + } + ) + + def testTOCTitle(self): + self.assertMarkdownRenders( + self.dedent( + ''' + # Header 1 + + ## Header 2 + ''' + ), + self.dedent( + ''' +

    Header 1

    +

    Header 2

    + ''' + ), + expected_attrs={ + 'toc': ( + '
    Table of Contents' + '
      \n' # noqa + '
    • Header 1' # noqa + '\n' # noqa + '
    • \n' # noqa + '
    \n' # noqa + '
    \n' + ) + }, + extensions=[TocExtension(title='Table of Contents')] + ) + + def testTOCUniqueFunc(self): + ids = {'foo'} + self.assertEqual(unique('foo', ids), 'foo_1') + self.assertEqual(ids, {'foo', 'foo_1'}) + + def testTocInHeaders(self): + self.assertMarkdownRenders( + self.dedent( + ''' + [TOC] + #[TOC] + ''' + ), + '
    \n' # noqa + '
      \n' # noqa + '
    • [TOC]
    • \n' # noqa + '
    \n' # noqa + '
    \n' # noqa + '

    [TOC]

    ' # noqa + ) + + self.assertMarkdownRenders( + self.dedent( + ''' + #[TOC] + [TOC] + ''' + ), + '

    [TOC]

    \n' # noqa + '
    \n' # noqa + '
      \n' # noqa + '
    • [TOC]
    • \n' # noqa + '
    \n' # noqa + '
    ' # noqa + ) + + self.assertMarkdownRenders( + self.dedent( + ''' + [TOC] + # *[TOC]* + ''' + ), + '
    \n' # noqa + '
      \n' # noqa + '
    • [TOC]
    • \n' # noqa + '
    \n' # noqa + '
    \n' # noqa + '

    [TOC]

    ' # noqa + ) + + def testTOCPermalink(self): + self.assertMarkdownRenders( + self.dedent( + ''' + # Hd 1 + + ## Hd 2 + ''' + ), + '

    ' + 'Hd 1' # noqa + '' # noqa + '¶' # noqa + '' # noqa + '

    \n' + '

    ' + 'Hd 2' # noqa + '' # noqa + '¶' # noqa + '' # noqa + '

    ', + extensions=[TocExtension(permalink=True, permalink_title="PL")] + ) + + def testTOCPermalinkLeading(self): + self.assertMarkdownRenders( + self.dedent( + ''' + # Hd 1 + ## Hd 2 + ''' + ), + '

    ' + '' # noqa + '¶' # noqa + '' # noqa + 'Hd 1' # noqa + '

    \n' + '

    ' + '' # noqa + '¶' # noqa + '' # noqa + 'Hd 2' # noqa + '

    ', + extensions=[TocExtension(permalink=True, permalink_title="PL", permalink_leading=True)] + ) + + def testTOCInlineMarkupPermalink(self): + self.assertMarkdownRenders( + '# Code `in` hd', + '

    ' + 'Code in hd' # noqa + '' # noqa + '¶' # noqa + '' # noqa + '

    ', + extensions=[TocExtension(permalink=True, permalink_title="PL")] + ) + + def testTOCInlineMarkupPermalinkLeading(self): + self.assertMarkdownRenders( + '# Code `in` hd', + '

    ' + '' # noqa + '¶' # noqa + '' # noqa + 'Code in hd' # noqa + '

    ', + extensions=[TocExtension(permalink=True, permalink_title="PL", permalink_leading=True)] + ) def testAnchorLink(self): self.assertMarkdownRenders( @@ -140,11 +626,15 @@ def testMinMaxLevel(self): 'level': 3, 'id': 'header-3', 'name': 'Header 3', + 'html': 'Header 3', + 'data-toc-label': '', 'children': [ { 'level': 4, 'id': 'header-4', 'name': 'Header 4', + 'html': 'Header 4', + 'data-toc-label': '', 'children': [] } ] @@ -189,11 +679,15 @@ def testMaxLevel(self): 'level': 1, 'id': 'header-1', 'name': 'Header 1', + 'html': 'Header 1', + 'data-toc-label': '', 'children': [ { 'level': 2, 'id': 'header-2', 'name': 'Header 2', + 'html': 'Header 2', + 'data-toc-label': '', 'children': [] } ] @@ -245,11 +739,15 @@ def testMinMaxLevelwithAnchorLink(self): 'level': 3, 'id': 'header-3', 'name': 'Header 3', + 'html': 'Header 3', + 'data-toc-label': '', 'children': [ { 'level': 4, 'id': 'header-4', 'name': 'Header 4', + 'html': 'Header 4', + 'data-toc-label': '', 'children': [] } ] @@ -301,11 +799,15 @@ def testMinMaxLevelwithPermalink(self): 'level': 3, 'id': 'header-3', 'name': 'Header 3', + 'html': 'Header 3', + 'data-toc-label': '', 'children': [ { 'level': 4, 'id': 'header-4', 'name': 'Header 4', + 'html': 'Header 4', + 'data-toc-label': '', 'children': [] } ] @@ -353,11 +855,15 @@ def testMinMaxLevelwithBaseLevel(self): 'level': 4, 'id': 'second-level', 'name': 'Second Level', + 'html': 'Second Level', + 'data-toc-label': '', 'children': [ { 'level': 5, 'id': 'third-level', 'name': 'Third Level', + 'html': 'Third Level', + 'data-toc-label': '', 'children': [] } ] @@ -402,11 +908,15 @@ def testMaxLevelwithBaseLevel(self): 'level': 2, 'id': 'some-header', 'name': 'Some Header', + 'html': 'Some Header', + 'data-toc-label': '', 'children': [ { 'level': 3, 'id': 'next-level', 'name': 'Next Level', + 'html': 'Next Level', + 'data-toc-label': '', 'children': [] } ] @@ -455,6 +965,32 @@ def test_escaped_char_in_id(self): 'level': 1, 'id': 'escaped_character', 'name': 
'escaped_character', + 'html': 'escaped_character', + 'data-toc-label': '', + 'children': [] + } + ] + }, + extensions=['toc'] + ) + + def testAutoLinkEmail(self): + self.assertMarkdownRenders( + '## ', + '

    foo@exa' + 'mple.org

    ', + expected_attrs={ + 'toc_tokens': [ + { + 'level': 2, + 'id': 'fooexampleorg', + 'name': 'foo@exam' + 'ple.org', + 'html': 'foo' + '@example.org', + 'data-toc-label': '', 'children': [] } ] @@ -671,3 +1207,247 @@ def testTOCWithCustomTitleClass(self): ), extensions=[TocExtension(title_class="tocname", title='ToC')] ) + + def testTocWithAttrList(self): + + self.assertMarkdownRenders( + self.dedent( + ''' + # Header 1 + + ## Header 2 { #foo } + + ## Header 3 { data-toc-label="Foo Bar" } + + # Header 4 { data-toc-label="Foo > & < Baz" } + + # Header 5 { data-toc-label="Foo Quux" } + ''' + ), + self.dedent( + ''' +

    Header 1

    +

    Header 2

    +

    Header 3

    +

    Header 4

    +

    Header 5

    + ''' + ), + expected_attrs={ + 'toc': ( + '
    \n' + '\n' # noqa + '
    \n' + ), + 'toc_tokens': [ + { + 'level': 1, + 'id': 'header-1', + 'name': 'Header 1', + 'html': 'Header 1', + 'data-toc-label': '', + 'children': [ + { + 'level': 2, + 'id': 'foo', + 'name': 'Header 2', + 'html': 'Header 2', + 'data-toc-label': '', + 'children': [] + }, + { + 'level': 2, + 'id': 'header-3', + 'name': 'Foo Bar', + 'html': 'Header 3', + 'data-toc-label': 'Foo Bar', + 'children': [] + } + ] + }, + { + 'level': 1, + 'id': 'header-4', + 'name': 'Foo > & < Baz', + 'html': 'Header 4', + 'data-toc-label': 'Foo > & < Baz', + 'children': [] + }, + { + 'level': 1, + 'id': 'header-5', + 'name': 'Foo Quux', + 'html': 'Header 5', + 'data-toc-label': 'Foo Quux', + 'children': [] + }, + ] + }, + extensions=[TocExtension(), 'attr_list'] + ) + + def testHeadingRemoveFootnoteRef(self): + + self.assertMarkdownRenders( + self.dedent( + ''' + # Header 1[^1] + # Header[^1] 2 + # Header *subelement*[^1] 3 + # Header[^1] double[^1] 4 + + [^1]: footnote + ''' + ), + ( + '

    Header 11

    \n' + '

    Header1 2

    \n' + '

    ' + 'Header subelement1 3' + '

    \n' + '

    ' + 'Header1 double' + '1 4' + '

    \n' + '
    \n' + '
    \n' + '
      \n' + '
    1. \n' + '

      ' + 'footnote ' + '' + '' + '' + '' + '' + '

      \n' + '
    2. \n' + '
    \n' + '
    ' + ), + expected_attrs={ + 'toc': ( + '
    \n' + '\n' # noqa + '
    \n' # noqa + ), + 'toc_tokens': [ + { + 'level': 1, + 'id': 'header-1', + 'name': 'Header 1', + 'html': 'Header 1', + 'data-toc-label': '', + 'children': [] + }, + { + 'level': 1, + 'id': 'header-2', + 'name': 'Header 2', + 'html': 'Header 2', + 'data-toc-label': '', + 'children': [] + }, + { + 'level': 1, + 'id': 'header-subelement-3', + 'name': 'Header subelement 3', + 'html': 'Header subelement 3', + 'data-toc-label': '', + 'children': [] + }, + { + 'level': 1, + 'id': 'header-double-4', + 'name': 'Header double 4', + 'html': 'Header double 4', + 'data-toc-label': '', + 'children': [] + } + ] + }, + extensions=[TocExtension(), 'footnotes'] + ) + + +class testStripTags(TestCase): + + def testStripElement(self): + self.assertEqual( + strip_tags('foo bar'), + 'foo bar' + ) + + def testStripOpenElement(self): + self.assertEqual( + strip_tags('foo bar'), + 'foo bar' + ) + + def testStripEmptyElement(self): + self.assertEqual( + strip_tags('foo
    bar'), + 'foo bar' + ) + + def testDontStripOpenBracket(self): + self.assertEqual( + strip_tags('foo < bar'), + 'foo < bar' + ) + + def testDontStripCloseBracket(self): + self.assertEqual( + strip_tags('foo > bar'), + 'foo > bar' + ) + + def testStripCollapseWhitespace(self): + self.assertEqual( + strip_tags('foo \tbar\t'), + 'foo bar' + ) + + def testStripElementWithNewlines(self): + self.assertEqual( + strip_tags('foo bar'), + 'foo bar' + ) + + def testStripComment(self): + self.assertEqual( + strip_tags('foo bar'), + 'foo bar' + ) + + def testStripCommentWithInnerTags(self): + self.assertEqual( + strip_tags('foo bar'), + 'foo bar' + ) + + def testStripCommentInElement(self): + self.assertEqual( + strip_tags('foo bar'), + 'foo bar' + ) + + def testDontStripHTMLEntities(self): + self.assertEqual( + strip_tags('foo < & < bar'), + 'foo < & < bar' + )