From e4ab4a610edc6332ce81a53aa4ae6f97516ce461 Mon Sep 17 00:00:00 2001
From: Waylan Limberg <waylan.limberg@icloud.com>
Date: Fri, 8 Mar 2024 09:05:34 -0500
Subject: [PATCH] Refactor TOC sanitation

* All postprocessors are run on heading content.
* Footnote references are stripped from heading content. Fixes #660.
* A more robust `strip_tags` is provided to convert headings to plain text.
  Unlike the `markupsafe` implementation, HTML entities are not unescaped.
* The plain text `name`, rich `html` and unescaped raw `data-toc-label` are
  saved to `toc_tokens`, allowing users to access the full rich text content of
  the headings directly from `toc_tokens`.
* `data-toc-label` is sanitized separately from heading content.
* An `html.unescape` call is made just prior to calling `slugify` so that
  `slugify` only operates on Unicode characters. Note that `html.unescape` is
  not run on the `name` or `html`.
* The `get_name` and `stashedHTML2text` functions defined in the `toc` extension
  are both **deprecated**. Instead, use some combination of `run_postprocessors`,
  `render_inner_html` and `strip_tags`.

Co-authored-by: Oleh Prypin <oleh@pryp.in>
---
 docs/changelog.md                           |  19 +
 docs/extensions/toc.md                      |  11 +
 markdown/extensions/toc.py                  | 104 ++-
 tests/test_extensions.py                    | 412 ----------
 tests/test_syntax/extensions/test_smarty.py |   2 +
 tests/test_syntax/extensions/test_toc.py    | 784 +++++++++++++++++++-
 6 files changed, 904 insertions(+), 428 deletions(-)
diff --git a/docs/changelog.md b/docs/changelog.md
index 8deaefd2..a71dbf25 100644
--- a/docs/changelog.md
+++ b/docs/changelog.md
@@ -10,6 +10,25 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [unreleased]
 
+### Changed
+
+#### Refactor TOC Sanitation
+
+* All postprocessors are run on heading content.
+* Footnote references are stripped from heading content. Fixes #660.
+* A more robust `strip_tags` is provided to convert headings to plain text.
+  Unlike the `markupsafe` implementation, HTML entities are not unescaped.
+* The plain text `name`, rich `html` and unescaped raw `data-toc-label` are
+  saved to `toc_tokens`, allowing users to access the full rich text content of
+  the headings directly from `toc_tokens`.
+* `data-toc-label` is sanitized separately from heading content.
+* An `html.unescape` call is made just prior to calling `slugify` so that
+  `slugify` only operates on Unicode characters. Note that `html.unescape` is
+  not run on the `name` or `html`.
+* The `get_name` and `stashedHTML2text` functions defined in the `toc` extension
+  are both **deprecated**. Instead, use some combination of `run_postprocessors`,
+  `render_inner_html` and `strip_tags`.
+
 ### Fixed
 
 * Include `scripts/*.py` in the generated source tarballs (#1430).
diff --git a/docs/extensions/toc.md b/docs/extensions/toc.md
index 1f80c7ea..d1c64a9d 100644
--- a/docs/extensions/toc.md
+++ b/docs/extensions/toc.md
@@ -80,6 +80,8 @@ the following object at `md.toc_tokens`:
         'level': 1,
         'id': 'header-1',
         'name': 'Header 1',
+        'html': 'Header 1',
+        'data-toc-label': '',
         'children': [
             {'level': 2, 'id': 'header-2', 'name': 'Header 2', 'children':[]}
         ]
@@ -91,6 +93,11 @@ Note that the `level` refers to the `hn` level. In other words, `<h1>` is level
 `1` and `<h2>` is level `2`, etc. Be aware that improperly nested levels in the
 input may result in odd nesting of the output.
 
+`name` is the sanitized value which would also be used as a label for the HTML
+version of the Table of Contents. `html` contains the fully rendered HTML
+content of the heading and has not been sanitized in any way. This may be used
+with your own custom sanitization to create a custom table of contents.
+
 ### Custom Labels
 
 In most cases, the text label in the Table of Contents should match the text of
@@ -131,6 +138,10 @@ attribute list to provide a cleaner URL when linking to the header. If the ID is
 not manually defined, it is always derived from the text of the header, never
 from the `data-toc-label` attribute.
 
+The value of the `data-toc-label` attribute is sanitized and stripped of any HTML
+tags. However, `toc_tokens` will contain the raw content under
+`data-toc-label`.
+
 Usage
 -----
 
diff --git a/markdown/extensions/toc.py b/markdown/extensions/toc.py
index a17d7241..5462a906 100644
--- a/markdown/extensions/toc.py
+++ b/markdown/extensions/toc.py
@@ -6,7 +6,7 @@
 
 # Original code Copyright 2008 [Jack Miller](https://codezen.org/)
 
-# All changes Copyright 2008-2014 The Python Markdown Project
+# All changes Copyright 2008-2024 The Python Markdown Project
 
 # License: [BSD](https://opensource.org/licenses/bsd-license.php)
 
@@ -21,11 +21,13 @@
 
 from . import Extension
 from ..treeprocessors import Treeprocessor
-from ..util import code_escape, parseBoolValue, AMP_SUBSTITUTE, HTML_PLACEHOLDER_RE, AtomicString
+from ..util import parseBoolValue, AMP_SUBSTITUTE, deprecated, HTML_PLACEHOLDER_RE, AtomicString
 from ..treeprocessors import UnescapeTreeprocessor
+from ..serializers import RE_AMP
 import re
 import html
 import unicodedata
+from copy import deepcopy
 import xml.etree.ElementTree as etree
 from typing import TYPE_CHECKING, Any, Iterator, MutableSet
 
@@ -63,6 +65,7 @@ def unique(id: str, ids: MutableSet[str]) -> str:
     return id
 
 
+@deprecated('Use `render_inner_html` and `striptags` instead.')
 def get_name(el: etree.Element) -> str:
     """Get title name."""
 
@@ -75,6 +78,7 @@ def get_name(el: etree.Element) -> str:
     return ''.join(text).strip()
 
 
+@deprecated('Use `run_postprocessors`, `render_inner_html` and/or `striptags` instead.')
 def stashedHTML2text(text: str, md: Markdown, strip_entities: bool = True) -> str:
     """ Extract raw HTML from stash, reduce to plain text and swap with placeholder. """
     def _html_sub(m: re.Match[str]) -> str:
@@ -93,11 +97,80 @@ def _html_sub(m: re.Match[str]) -> str:
 
 
 def unescape(text: str) -> str:
-    """ Unescape escaped text. """
+    """ Unescape Markdown backslash escaped text. """
     c = UnescapeTreeprocessor()
     return c.unescape(text)
 
 
+def strip_tags(text: str) -> str:
+    """ Strip HTML tags and return plain text. Note: HTML entities are unaffected. """
+    # A comment could contain a tag, so strip comments first
+    while (start := text.find('<!--')) != -1 and (end := text.find('-->', start)) != -1:
+        text = f'{text[:start]}{text[end + 3:]}'
+
+    while (start := text.find('<')) != -1 and (end := text.find('>', start)) != -1:
+        text = f'{text[:start]}{text[end + 1:]}'
+
+    # Collapse whitespace
+    text = ' '.join(text.split())
+    return text
+
+
+def escape_cdata(text: str) -> str:
+    """ Escape character data. """
+    if "&" in text:
+        # Only replace & when not part of an entity
+        text = RE_AMP.sub('&amp;', text)
+    if "<" in text:
+        text = text.replace("<", "&lt;")
+    if ">" in text:
+        text = text.replace(">", "&gt;")
+    return text
+
+
+def run_postprocessors(text: str, md: Markdown) -> str:
+    """ Run postprocessors from Markdown instance on text. """
+    for pp in md.postprocessors:
+        text = pp.run(text)
+    return text.strip()
+
+
+def render_inner_html(el: etree.Element, md: Markdown) -> str:
+    """ Fully render inner html of an `etree` element as a string. """
+    # The `UnescapeTreeprocessor` runs after `toc` extension so run here.
+    text = unescape(md.serializer(el))
+
+    # strip parent tag
+    start = text.index('>') + 1
+    end = text.rindex('<')
+    text = text[start:end].strip()
+
+    return run_postprocessors(text, md)
+
+
+def remove_fnrefs(root: etree.Element) -> etree.Element:
+    """ Remove footnote references from a copy of the element, if any are present. """
+    # Remove footnote references, which look like this: `<sup id="fnref:1">...</sup>`.
+    # If there are no `sup` elements, then nothing to do.
+    if next(root.iter('sup'), None) is None:
+        return root
+    root = deepcopy(root)
+    # Find parent elements that contain `sup` elements.
+    for parent in root.findall('.//sup/..'):
+        carry_text = ""
+        for child in reversed(parent):  # Reversed for the ability to mutate during iteration.
+            # Remove matching footnote references but carry any `tail` text to preceding elements.
+            if child.tag == 'sup' and child.get('id', '').startswith('fnref'):
+                carry_text = f'{child.tail or ""}{carry_text}'
+                parent.remove(child)
+            elif carry_text:
+                child.tail = f'{child.tail or ""}{carry_text}'
+                carry_text = ""
+        if carry_text:
+            parent.text = f'{parent.text or ""}{carry_text}'
+    return root
+
+
 def nest_toc_tokens(toc_list):
     """Given an unsorted list with errors and skips, return a nested one.
 
@@ -300,27 +373,30 @@ def run(self, doc: etree.Element) -> None:
         for el in doc.iter():
             if isinstance(el.tag, str) and self.header_rgx.match(el.tag):
                 self.set_level(el)
-                text = get_name(el)
+                innerhtml = render_inner_html(remove_fnrefs(el), self.md)
+                name = strip_tags(innerhtml)
 
                 # Do not override pre-existing ids
                 if "id" not in el.attrib:
-                    innertext = unescape(stashedHTML2text(text, self.md))
-                    el.attrib["id"] = unique(self.slugify(innertext, self.sep), used_ids)
+                    el.attrib["id"] = unique(self.slugify(html.unescape(name), self.sep), used_ids)
+
+                data_toc_label = ''
+                if 'data-toc-label' in el.attrib:
+                    data_toc_label = run_postprocessors(unescape(el.attrib['data-toc-label']), self.md)
+                    # Overwrite name with sanitized value of `data-toc-label`.
+                    name = escape_cdata(strip_tags(data_toc_label))
+                    # Remove the data-toc-label attribute as it is no longer needed
+                    del el.attrib['data-toc-label']
 
                 if int(el.tag[-1]) >= self.toc_top and int(el.tag[-1]) <= self.toc_bottom:
                     toc_tokens.append({
                         'level': int(el.tag[-1]),
                         'id': el.attrib["id"],
-                        'name': unescape(stashedHTML2text(
-                            code_escape(el.attrib.get('data-toc-label', text)),
-                            self.md, strip_entities=False
-                        ))
+                        'name': name,
+                        'html': innerhtml,
+                        'data-toc-label': data_toc_label
                     })
 
-                # Remove the data-toc-label attribute as it is no longer needed
-                if 'data-toc-label' in el.attrib:
-                    del el.attrib['data-toc-label']
-
                 if self.use_anchors:
                     self.add_anchor(el, el.attrib["id"])
                 if self.use_permalinks not in [False, None]:
diff --git a/tests/test_extensions.py b/tests/test_extensions.py
index c96772ff..b8bc3c81 100644
--- a/tests/test_extensions.py
+++ b/tests/test_extensions.py
@@ -29,17 +29,6 @@
 import markdown
 
 
-class TestCaseWithAssertStartsWith(unittest.TestCase):
-
-    def assertStartsWith(self, expectedPrefix, text, msg=None):
-        if not text.startswith(expectedPrefix):
-            if len(expectedPrefix) + 5 < len(text):
-                text = text[:len(expectedPrefix) + 5] + '...'
-            standardMsg = '{} not found at the start of {}'.format(repr(expectedPrefix),
-                                                                   repr(text))
-            self.fail(self._formatMessage(msg, standardMsg))
-
-
 class TestExtensionClass(unittest.TestCase):
     """ Test markdown.extensions.Extension. """
 
@@ -281,407 +270,6 @@ def testRE(self):
             self.assertEqual(RE.match(test).groups(), expected)
 
 
-class TestTOC(TestCaseWithAssertStartsWith):
-    """ Test TOC Extension. """
-
-    def setUp(self):
-        self.md = markdown.Markdown(extensions=['toc'])
-
-    def testMarker(self):
-        """ Test TOC with a Marker. """
-        text = '[TOC]\n\n# Header 1\n\n## Header 2'
-        self.assertEqual(
-            self.md.convert(text),
-            '<div class="toc">\n'
-              '<ul>\n'                                             # noqa
-                '<li><a href="#header-1">Header 1</a>'             # noqa
-                  '<ul>\n'                                         # noqa
-                    '<li><a href="#header-2">Header 2</a></li>\n'  # noqa
-                  '</ul>\n'                                        # noqa
-                '</li>\n'                                          # noqa
-              '</ul>\n'                                            # noqa
-            '</div>\n'
-            '<h1 id="header-1">Header 1</h1>\n'
-            '<h2 id="header-2">Header 2</h2>'
-        )
-
-    def testNoMarker(self):
-        """ Test TOC without a Marker. """
-        text = '# Header 1\n\n## Header 2'
-        self.assertEqual(
-            self.md.convert(text),
-            '<h1 id="header-1">Header 1</h1>\n'
-            '<h2 id="header-2">Header 2</h2>'
-        )
-        self.assertEqual(
-            self.md.toc,
-            '<div class="toc">\n'
-              '<ul>\n'                                             # noqa
-                '<li><a href="#header-1">Header 1</a>'             # noqa
-                  '<ul>\n'                                         # noqa
-                    '<li><a href="#header-2">Header 2</a></li>\n'  # noqa
-                  '</ul>\n'                                        # noqa
-                '</li>\n'                                          # noqa
-              '</ul>\n'                                            # noqa
-            '</div>\n'
-        )
-
-    def testAlternateMarker(self):
-        """ Test TOC with user defined marker. """
-        md = markdown.Markdown(
-            extensions=[markdown.extensions.toc.TocExtension(marker='{{marker}}')]
-        )
-        text = '{{marker}}\n\n# Header 1\n\n## Header 2'
-        self.assertEqual(
-            md.convert(text),
-            '<div class="toc">\n'
-              '<ul>\n'                                             # noqa
-                '<li><a href="#header-1">Header 1</a>'             # noqa
-                  '<ul>\n'                                         # noqa
-                    '<li><a href="#header-2">Header 2</a></li>\n'  # noqa
-                  '</ul>\n'                                        # noqa
-                '</li>\n'                                          # noqa
-              '</ul>\n'                                            # noqa
-            '</div>\n'
-            '<h1 id="header-1">Header 1</h1>\n'
-            '<h2 id="header-2">Header 2</h2>'
-        )
-
-    def testDisabledMarker(self):
-        """ Test TOC with disabled marker. """
-        md = markdown.Markdown(
-            extensions=[markdown.extensions.toc.TocExtension(marker='')]
-        )
-        text = '[TOC]\n\n# Header 1\n\n## Header 2'
-        self.assertEqual(
-            md.convert(text),
-            '<p>[TOC]</p>\n'
-            '<h1 id="header-1">Header 1</h1>\n'
-            '<h2 id="header-2">Header 2</h2>'
-        )
-        self.assertStartsWith('<div class="toc">', md.toc)
-
-    def testReset(self):
-        """ Test TOC Reset. """
-        self.assertEqual(self.md.toc, '')
-        self.md.convert('# Header 1\n\n## Header 2')
-        self.assertStartsWith('<div class="toc">', self.md.toc)
-        self.md.reset()
-        self.assertEqual(self.md.toc, '')
-        self.assertEqual(self.md.toc_tokens, [])
-
-    def testUniqueIds(self):
-        """ Test Unique IDs. """
-
-        text = '#Header\n#Header\n#Header'
-        self.assertEqual(
-            self.md.convert(text),
-            '<h1 id="header">Header</h1>\n'
-            '<h1 id="header_1">Header</h1>\n'
-            '<h1 id="header_2">Header</h1>'
-        )
-        self.assertEqual(
-            self.md.toc,
-            '<div class="toc">\n'
-              '<ul>\n'                                       # noqa
-                '<li><a href="#header">Header</a></li>\n'    # noqa
-                '<li><a href="#header_1">Header</a></li>\n'  # noqa
-                '<li><a href="#header_2">Header</a></li>\n'  # noqa
-              '</ul>\n'                                      # noqa
-            '</div>\n'
-        )
-        self.assertEqual(self.md.toc_tokens, [
-            {'level': 1, 'id': 'header', 'name': 'Header', 'children': []},
-            {'level': 1, 'id': 'header_1', 'name': 'Header', 'children': []},
-            {'level': 1, 'id': 'header_2', 'name': 'Header', 'children': []},
-        ])
-
-    def testHtmlEntities(self):
-        """ Test Headers with HTML Entities. """
-        text = '# Foo &amp; bar'
-        self.assertEqual(
-            self.md.convert(text),
-            '<h1 id="foo-bar">Foo &amp; bar</h1>'
-        )
-        self.assertEqual(
-            self.md.toc,
-            '<div class="toc">\n'
-              '<ul>\n'                                             # noqa
-                '<li><a href="#foo-bar">Foo &amp; bar</a></li>\n'  # noqa
-              '</ul>\n'                                            # noqa
-            '</div>\n'
-        )
-        self.assertEqual(self.md.toc_tokens, [
-            {'level': 1, 'id': 'foo-bar', 'name': 'Foo &amp; bar', 'children': []},
-        ])
-
-    def testHtmlSpecialChars(self):
-        """ Test Headers with HTML special characters. """
-        text = '# Foo > & bar'
-        self.assertEqual(
-            self.md.convert(text),
-            '<h1 id="foo-bar">Foo &gt; &amp; bar</h1>'
-        )
-        self.assertEqual(
-            self.md.toc,
-            '<div class="toc">\n'
-              '<ul>\n'                                                  # noqa
-                '<li><a href="#foo-bar">Foo &gt; &amp; bar</a></li>\n'  # noqa
-              '</ul>\n'                                                 # noqa
-            '</div>\n'
-        )
-        self.assertEqual(self.md.toc_tokens, [
-            {'level': 1, 'id': 'foo-bar', 'name': 'Foo &gt; &amp; bar', 'children': []},
-        ])
-
-    def testRawHtml(self):
-        """ Test Headers with raw HTML. """
-        text = '# Foo <b>Bar</b> Baz.'
-        self.assertEqual(
-            self.md.convert(text),
-            '<h1 id="foo-bar-baz">Foo <b>Bar</b> Baz.</h1>'
-        )
-        self.assertEqual(
-            self.md.toc,
-            '<div class="toc">\n'
-              '<ul>\n'                                                # noqa
-                '<li><a href="#foo-bar-baz">Foo Bar Baz.</a></li>\n'  # noqa
-              '</ul>\n'                                               # noqa
-            '</div>\n'
-        )
-        self.assertEqual(self.md.toc_tokens, [
-            {'level': 1, 'id': 'foo-bar-baz', 'name': 'Foo Bar Baz.', 'children': []},
-        ])
-
-    def testBaseLevel(self):
-        """ Test Header Base Level. """
-        md = markdown.Markdown(
-            extensions=[markdown.extensions.toc.TocExtension(baselevel=5)]
-        )
-        text = '# Some Header\n\n## Next Level\n\n### Too High'
-        self.assertEqual(
-            md.convert(text),
-            '<h5 id="some-header">Some Header</h5>\n'
-            '<h6 id="next-level">Next Level</h6>\n'
-            '<h6 id="too-high">Too High</h6>'
-        )
-        self.assertEqual(
-            md.toc,
-            '<div class="toc">\n'
-              '<ul>\n'                                                 # noqa
-                '<li><a href="#some-header">Some Header</a>'           # noqa
-                  '<ul>\n'                                             # noqa
-                    '<li><a href="#next-level">Next Level</a></li>\n'  # noqa
-                    '<li><a href="#too-high">Too High</a></li>\n'      # noqa
-                  '</ul>\n'                                            # noqa
-                '</li>\n'                                              # noqa
-              '</ul>\n'                                                # noqa
-            '</div>\n'
-        )
-        self.assertEqual(md.toc_tokens, [
-            {'level': 5, 'id': 'some-header', 'name': 'Some Header', 'children': [
-                {'level': 6, 'id': 'next-level', 'name': 'Next Level', 'children': []},
-                {'level': 6, 'id': 'too-high', 'name': 'Too High', 'children': []},
-            ]},
-        ])
-
-    def testHeaderInlineMarkup(self):
-        """ Test Headers with inline markup. """
-
-        text = '#Some *Header* with [markup](http://example.com).'
-        self.assertEqual(
-            self.md.convert(text),
-            '<h1 id="some-header-with-markup">Some <em>Header</em> with '
-            '<a href="http://example.com">markup</a>.</h1>'
-        )
-        self.assertEqual(
-            self.md.toc,
-            '<div class="toc">\n'
-              '<ul>\n'                                     # noqa
-                '<li><a href="#some-header-with-markup">'  # noqa
-                  'Some Header with markup.</a></li>\n'    # noqa
-              '</ul>\n'                                    # noqa
-            '</div>\n'
-        )
-        self.assertEqual(self.md.toc_tokens, [
-            {'level': 1, 'id': 'some-header-with-markup', 'name': 'Some Header with markup.', 'children': []},
-        ])
-
-    def testTitle(self):
-        """ Test TOC Title. """
-        md = markdown.Markdown(
-            extensions=[markdown.extensions.toc.TocExtension(title='Table of Contents')]
-        )
-        md.convert('# Header 1\n\n## Header 2')
-        self.assertStartsWith(
-            '<div class="toc"><span class="toctitle">Table of Contents</span><ul>',
-            md.toc
-        )
-
-    def testWithAttrList(self):
-        """ Test TOC with `attr_list` Extension. """
-        md = markdown.Markdown(extensions=['toc', 'attr_list'])
-        text = ('# Header 1\n\n'
-                '## Header 2 { #foo }\n\n'
-                '## Header 3 { data-toc-label="Foo Bar" }\n\n'
-                '# Header 4 { data-toc-label="Foo > Baz" }\n\n'
-                '# Header 5 { data-toc-label="Foo <b>Quux</b>" }')
-
-        self.assertEqual(
-            md.convert(text),
-            '<h1 id="header-1">Header 1</h1>\n'
-            '<h2 id="foo">Header 2</h2>\n'
-            '<h2 id="header-3">Header 3</h2>\n'
-            '<h1 id="header-4">Header 4</h1>\n'
-            '<h1 id="header-5">Header 5</h1>'
-        )
-        self.assertEqual(
-            md.toc,
-            '<div class="toc">\n'
-              '<ul>\n'                                             # noqa
-                '<li><a href="#header-1">Header 1</a>'             # noqa
-                  '<ul>\n'                                         # noqa
-                    '<li><a href="#foo">Header 2</a></li>\n'       # noqa
-                    '<li><a href="#header-3">Foo Bar</a></li>\n'   # noqa
-                  '</ul>\n'                                        # noqa
-                '</li>\n'                                          # noqa
-                '<li><a href="#header-4">Foo &gt; Baz</a></li>\n'  # noqa
-                '<li><a href="#header-5">Foo Quux</a></li>\n'      # noqa
-              '</ul>\n'                                            # noqa
-            '</div>\n'
-        )
-        self.assertEqual(md.toc_tokens, [
-            {'level': 1, 'id': 'header-1', 'name': 'Header 1', 'children': [
-                {'level': 2, 'id': 'foo', 'name': 'Header 2', 'children': []},
-                {'level': 2, 'id': 'header-3', 'name': 'Foo Bar', 'children': []}
-            ]},
-            {'level': 1, 'id': 'header-4', 'name': 'Foo &gt; Baz', 'children': []},
-            {'level': 1, 'id': 'header-5', 'name': 'Foo Quux', 'children': []},
-        ])
-
-    def testUniqueFunc(self):
-        """ Test 'unique' function. """
-        from markdown.extensions.toc import unique
-        ids = {'foo'}
-        self.assertEqual(unique('foo', ids), 'foo_1')
-        self.assertEqual(ids, {'foo', 'foo_1'})
-
-    def testTocInHeaders(self):
-
-        text = '[TOC]\n#[TOC]'
-        self.assertEqual(
-            self.md.convert(text),
-            '<div class="toc">\n'                       # noqa
-              '<ul>\n'                                  # noqa
-                '<li><a href="#toc">[TOC]</a></li>\n'   # noqa
-              '</ul>\n'                                 # noqa
-            '</div>\n'                                  # noqa
-            '<h1 id="toc">[TOC]</h1>'                   # noqa
-        )
-
-        text = '#[TOC]\n[TOC]'
-        self.assertEqual(
-            self.md.convert(text),
-            '<h1 id="toc">[TOC]</h1>\n'                 # noqa
-            '<div class="toc">\n'                       # noqa
-              '<ul>\n'                                  # noqa
-                '<li><a href="#toc">[TOC]</a></li>\n'   # noqa
-              '</ul>\n'                                 # noqa
-            '</div>'                                    # noqa
-        )
-
-        text = '[TOC]\n# *[TOC]*'
-        self.assertEqual(
-            self.md.convert(text),
-            '<div class="toc">\n'                       # noqa
-              '<ul>\n'                                  # noqa
-                '<li><a href="#toc">[TOC]</a></li>\n'   # noqa
-              '</ul>\n'                                 # noqa
-            '</div>\n'                                  # noqa
-            '<h1 id="toc"><em>[TOC]</em></h1>'          # noqa
-        )
-
-    def testPermalink(self):
-        """ Test TOC `permalink` feature. """
-        text = '# Hd 1\n\n## Hd 2'
-        md = markdown.Markdown(
-            extensions=[markdown.extensions.toc.TocExtension(
-                permalink=True, permalink_title="PL")]
-        )
-        self.assertEqual(
-            md.convert(text),
-            '<h1 id="hd-1">'
-                'Hd 1'                                            # noqa
-                '<a class="headerlink" href="#hd-1" title="PL">'  # noqa
-                    '&para;'                                      # noqa
-                '</a>'                                            # noqa
-            '</h1>\n'
-            '<h2 id="hd-2">'
-                'Hd 2'                                            # noqa
-                '<a class="headerlink" href="#hd-2" title="PL">'  # noqa
-                    '&para;'                                      # noqa
-                '</a>'                                            # noqa
-            '</h2>'
-        )
-
-    def testPermalinkLeading(self):
-        """ Test TOC `permalink` with `permalink_leading` option. """
-        text = '# Hd 1\n\n## Hd 2'
-        md = markdown.Markdown(extensions=[
-            markdown.extensions.toc.TocExtension(
-                permalink=True, permalink_title="PL", permalink_leading=True)]
-        )
-        self.assertEqual(
-            md.convert(text),
-            '<h1 id="hd-1">'
-                '<a class="headerlink" href="#hd-1" title="PL">'  # noqa
-                    '&para;'                                      # noqa
-                '</a>'                                            # noqa
-                'Hd 1'                                            # noqa
-            '</h1>\n'
-            '<h2 id="hd-2">'
-                '<a class="headerlink" href="#hd-2" title="PL">'  # noqa
-                    '&para;'                                      # noqa
-                '</a>'                                            # noqa
-                'Hd 2'                                            # noqa
-            '</h2>'
-        )
-
-    def testInlineMarkupPermalink(self):
-        """ Test TOC `permalink` with headers containing markup. """
-        text = '# Code `in` hd'
-        md = markdown.Markdown(
-            extensions=[markdown.extensions.toc.TocExtension(
-                permalink=True, permalink_title="PL")]
-        )
-        self.assertEqual(
-            md.convert(text),
-            '<h1 id="code-in-hd">'
-                'Code <code>in</code> hd'                               # noqa
-                '<a class="headerlink" href="#code-in-hd" title="PL">'  # noqa
-                    '&para;'                                            # noqa
-                '</a>'                                                  # noqa
-            '</h1>'
-        )
-
-    def testInlineMarkupPermalinkLeading(self):
-        """ Test TOC `permalink_leading` with headers containing markup. """
-        text = '# Code `in` hd'
-        md = markdown.Markdown(extensions=[
-            markdown.extensions.toc.TocExtension(
-                permalink=True, permalink_title="PL", permalink_leading=True)]
-        )
-        self.assertEqual(
-            md.convert(text),
-            '<h1 id="code-in-hd">'
-                '<a class="headerlink" href="#code-in-hd" title="PL">'  # noqa
-                    '&para;'                                            # noqa
-                '</a>'                                                  # noqa
-                'Code <code>in</code> hd'                               # noqa
-            '</h1>'
-        )
-
-
 class TestSmarty(unittest.TestCase):
     def setUp(self):
         config = {
diff --git a/tests/test_syntax/extensions/test_smarty.py b/tests/test_syntax/extensions/test_smarty.py
index 8a176745..0228ddf0 100644
--- a/tests/test_syntax/extensions/test_smarty.py
+++ b/tests/test_syntax/extensions/test_smarty.py
@@ -216,6 +216,8 @@ def test_smarty_and_toc(self):
                         'level': 1,
                         'id': 'foo-bar',
                         'name': 'Foo &mdash; bar',
+                        'html': '<em>Foo</em> &mdash; <code>bar</code>',
+                        'data-toc-label': '',
                         'children': [],
                     },
                 ],
diff --git a/tests/test_syntax/extensions/test_toc.py b/tests/test_syntax/extensions/test_toc.py
index 79764364..9902072a 100644
--- a/tests/test_syntax/extensions/test_toc.py
+++ b/tests/test_syntax/extensions/test_toc.py
@@ -20,14 +20,500 @@
 """
 
 from markdown.test_tools import TestCase
-from markdown.extensions.toc import TocExtension
+from markdown import Markdown
+from markdown.extensions.toc import TocExtension, strip_tags, unique
 from markdown.extensions.nl2br import Nl2BrExtension
 
 
 class TestTOC(TestCase):
     maxDiff = None
+    default_kwargs = {
+        'extensions': [TocExtension()]
+    }
 
-    # TODO: Move the rest of the TOC tests here.
+    def testTOCMarker(self):  # Default `[TOC]` marker is replaced by the rendered TOC.
+        self.assertMarkdownRenders(
+            self.dedent(
+                '''
+                [TOC]
+
+                # Header 1
+
+                ## Header 2
+                '''
+            ),
+            '<div class="toc">\n'
+              '<ul>\n'                                             # noqa
+                '<li><a href="#header-1">Header 1</a>'             # noqa
+                  '<ul>\n'                                         # noqa
+                    '<li><a href="#header-2">Header 2</a></li>\n'  # noqa
+                  '</ul>\n'                                        # noqa
+                '</li>\n'                                          # noqa
+              '</ul>\n'                                            # noqa
+            '</div>\n'
+            '<h1 id="header-1">Header 1</h1>\n'
+            '<h2 id="header-2">Header 2</h2>'
+        )
+
+    def testNoTOCMarker(self):  # No marker in source: output has no TOC, but the `toc` attr is still expected.
+        self.assertMarkdownRenders(
+            self.dedent(
+                '''
+                # Header 1
+
+                ## Header 2
+                '''
+            ),
+            self.dedent(
+                '''
+                <h1 id="header-1">Header 1</h1>
+                <h2 id="header-2">Header 2</h2>
+                '''
+            ),
+            expected_attrs={
+                'toc': (
+                    '<div class="toc">\n'
+                      '<ul>\n'                                             # noqa
+                        '<li><a href="#header-1">Header 1</a>'             # noqa
+                          '<ul>\n'                                         # noqa
+                            '<li><a href="#header-2">Header 2</a></li>\n'  # noqa
+                          '</ul>\n'                                        # noqa
+                        '</li>\n'                                          # noqa
+                      '</ul>\n'                                            # noqa
+                    '</div>\n'
+                )
+            }
+        )
+
+    def testAlternateTOCMarker(self):  # A custom `marker` string is replaced by the TOC.
+        self.assertMarkdownRenders(
+            self.dedent(
+                '''
+                {{marker}}
+
+                # Header 1
+
+                ## Header 2
+                '''
+            ),
+            '<div class="toc">\n'
+              '<ul>\n'                                             # noqa
+                '<li><a href="#header-1">Header 1</a>'             # noqa
+                  '<ul>\n'                                         # noqa
+                    '<li><a href="#header-2">Header 2</a></li>\n'  # noqa
+                  '</ul>\n'                                        # noqa
+                '</li>\n'                                          # noqa
+              '</ul>\n'                                            # noqa
+            '</div>\n'
+            '<h1 id="header-1">Header 1</h1>\n'
+            '<h2 id="header-2">Header 2</h2>',
+            extensions=[TocExtension(marker='{{marker}}')]
+        )
+
+    def testDisabledTOCMarker(self):  # `marker=''`: `[TOC]` stays a literal paragraph; `toc` attr still expected.
+        self.assertMarkdownRenders(
+            self.dedent(
+                '''
+                [TOC]
+
+                # Header 1
+
+                ## Header 2
+                '''
+            ),
+            self.dedent(
+                '''
+                <p>[TOC]</p>
+                <h1 id="header-1">Header 1</h1>
+                <h2 id="header-2">Header 2</h2>
+                '''
+            ),
+            expected_attrs={
+                'toc': (
+                    '<div class="toc">\n'
+                      '<ul>\n'                                             # noqa
+                        '<li><a href="#header-1">Header 1</a>'             # noqa
+                          '<ul>\n'                                         # noqa
+                            '<li><a href="#header-2">Header 2</a></li>\n'  # noqa
+                          '</ul>\n'                                        # noqa
+                        '</li>\n'                                          # noqa
+                      '</ul>\n'                                            # noqa
+                    '</div>\n'
+                )
+            },
+            extensions=[TocExtension(marker='')]
+        )
+
+    def testTOCReset(self):  # `toc`/`toc_tokens` start empty, are filled by `convert`, cleared by `reset`.
+        md = Markdown(extensions=[TocExtension()])
+        self.assertEqual(md.toc, '')
+        self.assertEqual(md.toc_tokens, [])
+        md.convert('# Header 1')
+        self.assertEqual('<div class="toc">', md.toc.partition('\n')[0])  # First line of the rendered TOC.
+        self.assertEqual(len(md.toc_tokens), 1)
+        md.reset()
+        self.assertEqual(md.toc, '')
+        self.assertEqual(md.toc_tokens, [])
+
+    def testUniqueIds(self):  # Identical headings get `_1`, `_2` id suffixes in output, `toc` and `toc_tokens`.
+        self.assertMarkdownRenders(
+            self.dedent(
+                '''
+                #Header
+                #Header
+                #Header
+                '''
+            ),
+            self.dedent(
+                '''
+                <h1 id="header">Header</h1>
+                <h1 id="header_1">Header</h1>
+                <h1 id="header_2">Header</h1>
+                '''
+            ),
+            expected_attrs={
+                'toc': (
+                    '<div class="toc">\n'
+                      '<ul>\n'                                       # noqa
+                        '<li><a href="#header">Header</a></li>\n'    # noqa
+                        '<li><a href="#header_1">Header</a></li>\n'  # noqa
+                        '<li><a href="#header_2">Header</a></li>\n'  # noqa
+                      '</ul>\n'                                      # noqa
+                    '</div>\n'
+                ),
+                'toc_tokens': [
+                    {
+                        'level': 1,
+                        'id': 'header',
+                        'name': 'Header',
+                        'html': 'Header',
+                        'data-toc-label': '',
+                        'children': []
+                    },
+                    {
+                        'level': 1,
+                        'id': 'header_1',
+                        'name': 'Header',
+                        'html': 'Header',
+                        'data-toc-label': '',
+                        'children': []
+                    },
+                    {
+                        'level': 1,
+                        'id': 'header_2',
+                        'name': 'Header',
+                        'html': 'Header',
+                        'data-toc-label': '',
+                        'children': []
+                    },
+                ]
+            }
+        )
+
+    def testHtmlEntitiesInTOC(self):  # A `&amp;` entity is kept as-is in `name`/`html` and dropped from the id.
+        self.assertMarkdownRenders(
+            '# Foo &amp; bar',
+            '<h1 id="foo-bar">Foo &amp; bar</h1>',
+            expected_attrs={
+                'toc': (
+                    '<div class="toc">\n'
+                      '<ul>\n'                                             # noqa
+                        '<li><a href="#foo-bar">Foo &amp; bar</a></li>\n'  # noqa
+                      '</ul>\n'                                            # noqa
+                    '</div>\n'
+                ),
+                'toc_tokens': [{
+                    'level': 1,
+                    'id': 'foo-bar',
+                    'name': 'Foo &amp; bar',
+                    'html': 'Foo &amp; bar',
+                    'data-toc-label': '',
+                    'children': []
+                }]
+            }
+        )
+
+    def testHtmlSpecialCharsInTOC(self):  # Raw `>` and `&` are HTML-escaped everywhere and absent from the id.
+        self.assertMarkdownRenders(
+            '# Foo > & bar',
+            '<h1 id="foo-bar">Foo &gt; &amp; bar</h1>',
+            expected_attrs={
+                'toc': (
+                    '<div class="toc">\n'
+                      '<ul>\n'                                                  # noqa
+                        '<li><a href="#foo-bar">Foo &gt; &amp; bar</a></li>\n'  # noqa
+                      '</ul>\n'                                                 # noqa
+                    '</div>\n'
+                ),
+                'toc_tokens': [{
+                    'level': 1,
+                    'id': 'foo-bar',
+                    'name': 'Foo &gt; &amp; bar',
+                    'html': 'Foo &gt; &amp; bar',
+                    'data-toc-label': '',
+                    'children': []
+                }]
+            }
+        )
+
+    def testRawHtmlInTOC(self):  # Raw `<b>` tags are stripped from the TOC and `name` but kept in `html`.
+        self.assertMarkdownRenders(
+            '# Foo <b>Bar</b> Baz.',
+            '<h1 id="foo-bar-baz">Foo <b>Bar</b> Baz.</h1>',
+            expected_attrs={
+                'toc': (
+                    '<div class="toc">\n'
+                      '<ul>\n'                                                # noqa
+                        '<li><a href="#foo-bar-baz">Foo Bar Baz.</a></li>\n'  # noqa
+                      '</ul>\n'                                               # noqa
+                    '</div>\n'
+                ),
+                'toc_tokens': [{
+                    'level': 1,
+                    'id': 'foo-bar-baz',
+                    'name': 'Foo Bar Baz.',
+                    'html': 'Foo <b>Bar</b> Baz.',
+                    'data-toc-label': '',
+                    'children': []
+                }]
+            }
+        )
+
+    def testTOCBaseLevel(self):  # `baselevel=5` shifts h1->h5 and h2->h6; h3 is also rendered as h6.
+        self.assertMarkdownRenders(
+            self.dedent(
+                '''
+                # Some Header
+                ## Next Level
+                ### Too High
+                '''
+            ),
+            self.dedent(
+                '''
+                <h5 id="some-header">Some Header</h5>
+                <h6 id="next-level">Next Level</h6>
+                <h6 id="too-high">Too High</h6>
+                '''
+            ),
+            expected_attrs={
+                'toc': (
+                    '<div class="toc">\n'
+                      '<ul>\n'                                                 # noqa
+                        '<li><a href="#some-header">Some Header</a>'           # noqa
+                          '<ul>\n'                                             # noqa
+                            '<li><a href="#next-level">Next Level</a></li>\n'  # noqa
+                            '<li><a href="#too-high">Too High</a></li>\n'      # noqa
+                          '</ul>\n'                                            # noqa
+                        '</li>\n'                                              # noqa
+                      '</ul>\n'                                                # noqa
+                    '</div>\n'
+                ),
+                'toc_tokens': [{
+                    'level': 5,
+                    'id': 'some-header',
+                    'name': 'Some Header',
+                    'html': 'Some Header',
+                    'data-toc-label': '',
+                    'children': [
+                        {
+                            'level': 6,
+                            'id': 'next-level',
+                            'name': 'Next Level',
+                            'html': 'Next Level',
+                            'data-toc-label': '',
+                            'children': []
+                        },
+                        {
+                            'level': 6,
+                            'id': 'too-high',
+                            'name': 'Too High',
+                            'html': 'Too High',
+                            'data-toc-label': '',
+                            'children': []
+                        }
+                    ]
+                }]
+            },
+            extensions=[TocExtension(baselevel=5)]
+        )
+
+    def testHeaderInlineMarkup(self):  # Emphasis/link markup is stripped from id, TOC and `name`, kept in `html`.
+        self.assertMarkdownRenders(
+            '#Some *Header* with [markup](http://example.com).',
+            '<h1 id="some-header-with-markup">Some <em>Header</em> with '
+            '<a href="http://example.com">markup</a>.</h1>',
+            expected_attrs={
+                'toc': (
+                    '<div class="toc">\n'
+                      '<ul>\n'                                     # noqa
+                        '<li><a href="#some-header-with-markup">'  # noqa
+                          'Some Header with markup.</a></li>\n'    # noqa
+                      '</ul>\n'                                    # noqa
+                    '</div>\n'
+                ),
+                'toc_tokens': [{
+                    'level': 1,
+                    'id': 'some-header-with-markup',
+                    'name': 'Some Header with markup.',
+                    'html': 'Some <em>Header</em> with <a href="http://example.com">markup</a>.',
+                    'data-toc-label': '',
+                    'children': []
+                }]
+            }
+        )
+
+    def testTOCTitle(self):  # `title` adds a `toctitle` span right after the opening TOC div.
+        self.assertMarkdownRenders(
+            self.dedent(
+                '''
+                # Header 1
+
+                ## Header 2
+                '''
+            ),
+            self.dedent(
+                '''
+                <h1 id="header-1">Header 1</h1>
+                <h2 id="header-2">Header 2</h2>
+                '''
+            ),
+            expected_attrs={
+                'toc': (
+                    '<div class="toc"><span class="toctitle">Table of Contents</span>'
+                      '<ul>\n'                                             # noqa
+                        '<li><a href="#header-1">Header 1</a>'             # noqa
+                          '<ul>\n'                                         # noqa
+                            '<li><a href="#header-2">Header 2</a></li>\n'  # noqa
+                          '</ul>\n'                                        # noqa
+                        '</li>\n'                                          # noqa
+                      '</ul>\n'                                            # noqa
+                    '</div>\n'
+                )
+            },
+            extensions=[TocExtension(title='Table of Contents')]
+        )
+
+    def testTOCUniqueFunc(self):  # `unique` returns a suffixed id and adds it to the passed-in set.
+        ids = {'foo'}
+        self.assertEqual(unique('foo', ids), 'foo_1')
+        self.assertEqual(ids, {'foo', 'foo_1'})
+
+    def testTocInHeaders(self):  # `[TOC]` inside a heading stays literal; only the standalone marker is replaced.
+        self.assertMarkdownRenders(
+            self.dedent(
+                '''
+                [TOC]
+                #[TOC]
+                '''
+            ),
+            '<div class="toc">\n'                       # noqa
+              '<ul>\n'                                  # noqa
+                '<li><a href="#toc">[TOC]</a></li>\n'   # noqa
+              '</ul>\n'                                 # noqa
+            '</div>\n'                                  # noqa
+            '<h1 id="toc">[TOC]</h1>'                   # noqa
+        )
+
+        self.assertMarkdownRenders(
+            self.dedent(
+                '''
+                #[TOC]
+                [TOC]
+                '''
+            ),
+            '<h1 id="toc">[TOC]</h1>\n'                 # noqa
+            '<div class="toc">\n'                       # noqa
+              '<ul>\n'                                  # noqa
+                '<li><a href="#toc">[TOC]</a></li>\n'   # noqa
+              '</ul>\n'                                 # noqa
+            '</div>'                                    # noqa
+        )
+
+        self.assertMarkdownRenders(
+            self.dedent(
+                '''
+                [TOC]
+                # *[TOC]*
+                '''
+            ),
+            '<div class="toc">\n'                       # noqa
+              '<ul>\n'                                  # noqa
+                '<li><a href="#toc">[TOC]</a></li>\n'   # noqa
+              '</ul>\n'                                 # noqa
+            '</div>\n'                                  # noqa
+            '<h1 id="toc"><em>[TOC]</em></h1>'          # noqa
+        )
+
+    def testTOCPermalink(self):  # `permalink=True` appends a `headerlink` anchor (title from `permalink_title`).
+        self.assertMarkdownRenders(
+            self.dedent(
+                '''
+                # Hd 1
+
+                ## Hd 2
+                '''
+            ),
+            '<h1 id="hd-1">'
+                'Hd 1'                                            # noqa
+                '<a class="headerlink" href="#hd-1" title="PL">'  # noqa
+                    '&para;'                                      # noqa
+                '</a>'                                            # noqa
+            '</h1>\n'
+            '<h2 id="hd-2">'
+                'Hd 2'                                            # noqa
+                '<a class="headerlink" href="#hd-2" title="PL">'  # noqa
+                    '&para;'                                      # noqa
+                '</a>'                                            # noqa
+            '</h2>',
+            extensions=[TocExtension(permalink=True, permalink_title="PL")]
+        )
+
+    def testTOCPermalinkLeading(self):  # `permalink_leading=True` puts the anchor before the heading text.
+        self.assertMarkdownRenders(
+            self.dedent(
+                '''
+                # Hd 1
+                ## Hd 2
+                '''
+            ),
+            '<h1 id="hd-1">'
+                '<a class="headerlink" href="#hd-1" title="PL">'  # noqa
+                    '&para;'                                      # noqa
+                '</a>'                                            # noqa
+                'Hd 1'                                            # noqa
+            '</h1>\n'
+            '<h2 id="hd-2">'
+                '<a class="headerlink" href="#hd-2" title="PL">'  # noqa
+                    '&para;'                                      # noqa
+                '</a>'                                            # noqa
+                'Hd 2'                                            # noqa
+            '</h2>',
+            extensions=[TocExtension(permalink=True, permalink_title="PL", permalink_leading=True)]
+        )
+
+    def testTOCInlineMarkupPermalink(self):  # Permalink is appended after content with inline code.
+        self.assertMarkdownRenders(
+            '# Code `in` hd',
+            '<h1 id="code-in-hd">'
+                'Code <code>in</code> hd'                               # noqa
+                '<a class="headerlink" href="#code-in-hd" title="PL">'  # noqa
+                    '&para;'                                            # noqa
+                '</a>'                                                  # noqa
+            '</h1>',
+            extensions=[TocExtension(permalink=True, permalink_title="PL")]
+        )
+
+    def testTOCInlineMarkupPermalinkLeading(self):  # Leading permalink goes before content with inline code.
+        self.assertMarkdownRenders(
+            '# Code `in` hd',
+            '<h1 id="code-in-hd">'
+                '<a class="headerlink" href="#code-in-hd" title="PL">'  # noqa
+                    '&para;'                                            # noqa
+                '</a>'                                                  # noqa
+                'Code <code>in</code> hd'                               # noqa
+            '</h1>',
+            extensions=[TocExtension(permalink=True, permalink_title="PL", permalink_leading=True)]
+        )
 
     def testAnchorLink(self):
         self.assertMarkdownRenders(
@@ -140,11 +626,15 @@ def testMinMaxLevel(self):
                         'level': 3,
                         'id': 'header-3',
                         'name': 'Header 3',
+                        'html': 'Header 3',
+                        'data-toc-label': '',
                         'children': [
                             {
                                 'level': 4,
                                 'id': 'header-4',
                                 'name': 'Header 4',
+                                'html': 'Header 4',
+                                'data-toc-label': '',
                                 'children': []
                             }
                         ]
@@ -189,11 +679,15 @@ def testMaxLevel(self):
                         'level': 1,
                         'id': 'header-1',
                         'name': 'Header 1',
+                        'html': 'Header 1',
+                        'data-toc-label': '',
                         'children': [
                             {
                                 'level': 2,
                                 'id': 'header-2',
                                 'name': 'Header 2',
+                                'html': 'Header 2',
+                                'data-toc-label': '',
                                 'children': []
                             }
                         ]
@@ -245,11 +739,15 @@ def testMinMaxLevelwithAnchorLink(self):
                         'level': 3,
                         'id': 'header-3',
                         'name': 'Header 3',
+                        'html': 'Header 3',
+                        'data-toc-label': '',
                         'children': [
                             {
                                 'level': 4,
                                 'id': 'header-4',
                                 'name': 'Header 4',
+                                'html': 'Header 4',
+                                'data-toc-label': '',
                                 'children': []
                             }
                         ]
@@ -301,11 +799,15 @@ def testMinMaxLevelwithPermalink(self):
                         'level': 3,
                         'id': 'header-3',
                         'name': 'Header 3',
+                        'html': 'Header 3',
+                        'data-toc-label': '',
                         'children': [
                             {
                                 'level': 4,
                                 'id': 'header-4',
                                 'name': 'Header 4',
+                                'html': 'Header 4',
+                                'data-toc-label': '',
                                 'children': []
                             }
                         ]
@@ -353,11 +855,15 @@ def testMinMaxLevelwithBaseLevel(self):
                         'level': 4,
                         'id': 'second-level',
                         'name': 'Second Level',
+                        'html': 'Second Level',
+                        'data-toc-label': '',
                         'children': [
                             {
                                 'level': 5,
                                 'id': 'third-level',
                                 'name': 'Third Level',
+                                'html': 'Third Level',
+                                'data-toc-label': '',
                                 'children': []
                             }
                         ]
@@ -402,11 +908,15 @@ def testMaxLevelwithBaseLevel(self):
                         'level': 2,
                         'id': 'some-header',
                         'name': 'Some Header',
+                        'html': 'Some Header',
+                        'data-toc-label': '',
                         'children': [
                             {
                                 'level': 3,
                                 'id': 'next-level',
                                 'name': 'Next Level',
+                                'html': 'Next Level',
+                                'data-toc-label': '',
                                 'children': []
                             }
                         ]
@@ -455,6 +965,32 @@ def test_escaped_char_in_id(self):
                         'level': 1,
                         'id': 'escaped_character',
                         'name': 'escaped_character',
+                        'html': 'escaped_character',
+                        'data-toc-label': '',
+                        'children': []
+                    }
+                ]
+            },
+            extensions=['toc']
+        )
+
+    def testAutoLinkEmail(self):
+        self.assertMarkdownRenders(
+            '## <foo@example.org>',
+            '<h2 id="fooexampleorg"><a href="&#109;&#97;&#105;&#108;&#116;&#111;&#58;&#102;&#111;&#111;&#64;&#101;'
+            '&#120;&#97;&#109;&#112;&#108;&#101;&#46;&#111;&#114;&#103;">&#102;&#111;&#111;&#64;&#101;&#120;&#97;'
+            '&#109;&#112;&#108;&#101;&#46;&#111;&#114;&#103;</a></h2>',
+            expected_attrs={
+                'toc_tokens': [
+                    {
+                        'level': 2,
+                        'id': 'fooexampleorg',
+                        'name': '&#102;&#111;&#111;&#64;&#101;&#120;&#97;&#109;'
+                                '&#112;&#108;&#101;&#46;&#111;&#114;&#103;',
+                        'html': '<a href="&#109;&#97;&#105;&#108;&#116;&#111;&#58;&#102;&#111;&#111;&#64;&#101;'
+                                '&#120;&#97;&#109;&#112;&#108;&#101;&#46;&#111;&#114;&#103;">&#102;&#111;&#111;'
+                                '&#64;&#101;&#120;&#97;&#109;&#112;&#108;&#101;&#46;&#111;&#114;&#103;</a>',
+                        'data-toc-label': '',
                         'children': []
                     }
                 ]
@@ -671,3 +1207,247 @@ def testTOCWithCustomTitleClass(self):
             ),
             extensions=[TocExtension(title_class="tocname", title='ToC')]
         )
+
+    def testTocWithAttrList(self):
+        """ `attr_list` can set a heading's id and, via `data-toc-label`, its TOC text. """
+        self.assertMarkdownRenders(
+            self.dedent(
+                '''
+                # Header 1
+
+                ## Header 2 { #foo }
+
+                ## Header 3 { data-toc-label="Foo Bar" }
+
+                # Header 4 { data-toc-label="Foo > &amp; < Baz" }
+
+                # Header 5 { data-toc-label="Foo <b>Quux</b>" }
+                '''
+            ),
+            self.dedent(
+                '''
+                <h1 id="header-1">Header 1</h1>
+                <h2 id="foo">Header 2</h2>
+                <h2 id="header-3">Header 3</h2>
+                <h1 id="header-4">Header 4</h1>
+                <h1 id="header-5">Header 5</h1>
+                '''
+            ),
+            expected_attrs={
+                'toc': (
+                    '<div class="toc">\n'
+                      '<ul>\n'                                                        # noqa
+                        '<li><a href="#header-1">Header 1</a>'                        # noqa
+                          '<ul>\n'                                                    # noqa
+                            '<li><a href="#foo">Header 2</a></li>\n'                  # noqa
+                            '<li><a href="#header-3">Foo Bar</a></li>\n'              # noqa
+                          '</ul>\n'                                                   # noqa
+                        '</li>\n'                                                     # noqa
+                        '<li><a href="#header-4">Foo &gt; &amp; &lt; Baz</a></li>\n'  # noqa
+                        '<li><a href="#header-5">Foo Quux</a></li>\n'                 # noqa
+                      '</ul>\n'                                                       # noqa
+                    '</div>\n'
+                ),
+                'toc_tokens': [
+                    {
+                        'level': 1,
+                        'id': 'header-1',
+                        'name': 'Header 1',
+                        'html': 'Header 1',
+                        'data-toc-label': '',
+                        'children': [
+                            {
+                                'level': 2,
+                                'id': 'foo',
+                                'name': 'Header 2',
+                                'html': 'Header 2',
+                                'data-toc-label': '',
+                                'children': []
+                            },
+                            {
+                                'level': 2,
+                                'id': 'header-3',
+                                'name': 'Foo Bar',  # taken from the label, not the heading text
+                                'html': 'Header 3',  # heading content is unaffected by the label
+                                'data-toc-label': 'Foo Bar',
+                                'children': []
+                            }
+                        ]
+                    },
+                    {
+                        'level': 1,
+                        'id': 'header-4',
+                        'name': 'Foo &gt; &amp; &lt; Baz',  # label with `<` and `>` escaped
+                        'html': 'Header 4',
+                        'data-toc-label': 'Foo > &amp; < Baz',  # raw label, exactly as written
+                        'children': []
+                    },
+                    {
+                        'level': 1,
+                        'id': 'header-5',
+                        'name': 'Foo Quux',  # `<b>` tags stripped from the label
+                        'html': 'Header 5',
+                        'data-toc-label': 'Foo <b>Quux</b>',  # raw label, exactly as written
+                        'children': []
+                    },
+                ]
+            },
+            extensions=[TocExtension(), 'attr_list']
+        )
+
+    def testHeadingRemoveFootnoteRef(self):
+        """ Footnote references stay in the rendered heading but are stripped from the TOC. """
+        self.assertMarkdownRenders(
+            self.dedent(
+                '''
+                # Header 1[^1]
+                # Header[^1] 2
+                # Header *subelement*[^1] 3
+                # Header[^1] double[^1] 4
+
+                [^1]: footnote
+                '''
+            ),
+            (
+                '<h1 id="header-1">Header 1<sup id="fnref:1"><a class="footnote-ref" href="#fn:1">1</a></sup></h1>\n'
+                '<h1 id="header-2">Header<sup id="fnref2:1"><a class="footnote-ref" href="#fn:1">1</a></sup> 2</h1>\n'
+                '<h1 id="header-subelement-3">'
+                'Header <em>subelement</em><sup id="fnref3:1"><a class="footnote-ref" href="#fn:1">1</a></sup> 3'
+                '</h1>\n'
+                '<h1 id="header-double-4">'
+                'Header<sup id="fnref4:1"><a class="footnote-ref" href="#fn:1">1</a></sup> double'
+                '<sup id="fnref5:1"><a class="footnote-ref" href="#fn:1">1</a></sup> 4'
+                '</h1>\n'
+                '<div class="footnote">\n'
+                '<hr />\n'
+                '<ol>\n'
+                '<li id="fn:1">\n'
+                '<p>'
+                'footnote&#160;'
+                '<a class="footnote-backref" href="#fnref:1" title="Jump back to footnote 1 in the text">&#8617;</a>'
+                '<a class="footnote-backref" href="#fnref2:1" title="Jump back to footnote 1 in the text">&#8617;</a>'
+                '<a class="footnote-backref" href="#fnref3:1" title="Jump back to footnote 1 in the text">&#8617;</a>'
+                '<a class="footnote-backref" href="#fnref4:1" title="Jump back to footnote 1 in the text">&#8617;</a>'
+                '<a class="footnote-backref" href="#fnref5:1" title="Jump back to footnote 1 in the text">&#8617;</a>'
+                '</p>\n'
+                '</li>\n'
+                '</ol>\n'
+                '</div>'
+            ),
+            expected_attrs={
+                'toc': (
+                    '<div class="toc">\n'
+                      '<ul>\n'                                                               # noqa
+                        '<li><a href="#header-1">Header 1</a></li>\n'                        # noqa
+                        '<li><a href="#header-2">Header 2</a></li>\n'                        # noqa
+                        '<li><a href="#header-subelement-3">Header subelement 3</a></li>\n'  # noqa
+                        '<li><a href="#header-double-4">Header double 4</a></li>\n'          # noqa
+                      '</ul>\n'                                                              # noqa
+                    '</div>\n'                                                               # noqa
+                ),
+                'toc_tokens': [
+                    {
+                        'level': 1,
+                        'id': 'header-1',
+                        'name': 'Header 1',
+                        'html': 'Header 1',
+                        'data-toc-label': '',
+                        'children': []
+                    },
+                    {
+                        'level': 1,
+                        'id': 'header-2',
+                        'name': 'Header 2',
+                        'html': 'Header 2',
+                        'data-toc-label': '',
+                        'children': []
+                    },
+                    {
+                        'level': 1,
+                        'id': 'header-subelement-3',
+                        'name': 'Header subelement 3',
+                        'html': 'Header <em>subelement</em> 3',  # inline markup kept, footnote ref removed
+                        'data-toc-label': '',
+                        'children': []
+                    },
+                    {
+                        'level': 1,
+                        'id': 'header-double-4',
+                        'name': 'Header double 4',
+                        'html': 'Header double 4',
+                        'data-toc-label': '',
+                        'children': []
+                    }
+                ]
+            },
+            extensions=[TocExtension(), 'footnotes']
+        )
+
+
+class testStripTags(TestCase):
+    """
+    `strip_tags` removes tags and comments but keeps stray brackets and entities.
+    """
+
+    def testStripElement(self):
+        """ A balanced inline element is removed and its text content is kept. """
+        source = 'foo <em>bar</em>'
+        expected = 'foo bar'
+        self.assertEqual(strip_tags(source), expected)
+
+    def testStripOpenElement(self):
+        """ An opening tag with no matching closing tag is still removed. """
+        source = 'foo <em>bar'
+        expected = 'foo bar'
+        self.assertEqual(strip_tags(source), expected)
+
+    def testStripEmptyElement(self):
+        """ A self-closing (empty) element is removed. """
+        source = 'foo <br />bar'
+        expected = 'foo bar'
+        self.assertEqual(strip_tags(source), expected)
+
+    def testDontStripOpenBracket(self):
+        """ A lone `<` that is never closed is not a tag and passes through. """
+        text = 'foo < bar'
+        self.assertEqual(strip_tags(text), text)
+
+    def testDontStripCloseBracket(self):
+        """ A lone `>` that was never opened is not a tag and passes through. """
+        text = 'foo > bar'
+        self.assertEqual(strip_tags(text), text)
+
+    def testStripCollapseWhitespace(self):
+        """ Whitespace left behind by stripped tags (tabs here) is collapsed and trimmed. """
+        source = 'foo <em>\tbar\t</em>'
+        expected = 'foo bar'
+        self.assertEqual(strip_tags(source), expected)
+
+    def testStripElementWithNewlines(self):
+        """ A tag whose attribute value spans several lines is removed as a whole. """
+        source = 'foo <meta content="tag\nwith\nnewlines"> bar'
+        expected = 'foo bar'
+        self.assertEqual(strip_tags(source), expected)
+
+    def testStripComment(self):
+        """ An HTML comment is removed together with its content. """
+        source = 'foo <!-- comment --> bar'
+        expected = 'foo bar'
+        self.assertEqual(strip_tags(source), expected)
+
+    def testStripCommentWithInnerTags(self):
+        """ A tag inside a comment does not end the comment early. """
+        source = 'foo <!-- comment with <em> --> bar'
+        expected = 'foo bar'
+        self.assertEqual(strip_tags(source), expected)
+
+    def testStripCommentInElement(self):
+        """ A comment nested inside a properly closed element is removed with the tags. """
+        source = '<em>foo <!-- comment --> bar</em>'
+        expected = 'foo bar'
+        self.assertEqual(strip_tags(source), expected)
+
+    def testDontStripHTMLEntities(self):
+        """ HTML entities are returned as written; they are not unescaped. """
+        text = 'foo &lt; &amp; &lt; bar'
+        self.assertEqual(strip_tags(text), text)