Permalink
Browse files

Unit tests for lxml.html.html5parser

--HG--
extra : rebase_source : fc5d3b785f1eb962b54077611438af2667e1f2e8
  • Loading branch information...
1 parent 544f1b4 commit 3a872ae9b45c39eb66720b28b8609638938b83f8 @dairiki dairiki committed Mar 31, 2012
Showing with 358 additions and 0 deletions.
  1. +358 −0 src/lxml/html/tests/test_html5parser.py
View
358 src/lxml/html/tests/test_html5parser.py
@@ -0,0 +1,358 @@
+import imp
+try:
+ from StringIO import StringIO
+except ImportError: # python 3
+ from io import StringIO
+import sys
+import tempfile
+import unittest
+try:
+ from unittest import skipUnless
+except ImportError:
+ # sys.version < (2, 7)
+ def skipUnless(condition, reason):
+ return lambda f: condition and f or None
+
+from lxml.builder import ElementMaker
+from lxml.etree import Element, ElementTree, ParserError
+from lxml.html import html_parser, XHTML_NAMESPACE
+
+try:
+ import html5lib
+except ImportError:
+ html5lib = None
+
+ class BogusModules(object):
+ # See PEP 302 for details on how this works
+ def __init__(self, mocks):
+ self.mocks = mocks
+
+ def find_module(self, fullname, path=None):
+ if fullname in self.mocks:
+ return self
+ return None
+
+ def load_module(self, fullname):
+ mod = sys.modules.setdefault(fullname, imp.new_module(fullname))
+ mod.__file__, mod.__loader__, mod.__path__ = "<dummy>", self, []
+ mod.__dict__.update(self.mocks[fullname])
+ return mod
+
+ # Fake just enough of html5lib so that html5parser.py is importable
+ # without errors.
+ sys.meta_path.append(BogusModules({
+ 'html5lib': {
+ # A do-nothing HTMLParser class
+ 'HTMLParser': type('HTMLParser', (object,), {
+ '__init__': lambda self, **kw: None,
+ }),
+ },
+ 'html5lib.treebuilders': {
+ },
+ 'html5lib.treebuilders.etree_lxml': {
+ 'TreeBuilder': 'dummy treebuilder',
+ },
+ }))
+
+class Test_HTMLParser(unittest.TestCase):
+ def make_one(self, **kwargs):
+ from lxml.html.html5parser import HTMLParser
+ return HTMLParser(**kwargs)
+
+ @skipUnless(html5lib, 'html5lib is not installed')
+ def test_integration(self):
+ parser = self.make_one(strict=True)
+ tree = parser.parse(XHTML_TEST_DOCUMENT)
+ root = tree.getroot()
+ self.assertEqual(root.tag, xhtml_tag('html'))
+
+class Test_XHTMLParser(unittest.TestCase):
+ def make_one(self, **kwargs):
+ from lxml.html.html5parser import XHTMLParser
+ return XHTMLParser(**kwargs)
+
+ @skipUnless(hasattr(html5lib, 'XHTMLParser'),
+ 'xhtml5lib does not have XHTMLParser')
+ def test_integration(self):
+ # XXX: This test are untested. (html5lib no longer has an XHTMLParser)
+ parser = self.make_one(strict=True)
+ tree = parser.parse(XHTML_TEST_DOCUMENT)
+ root = tree.getroot()
+ self.assertEqual(root.tag, xhtml_tag('html'))
+
+class Test_document_fromstring(unittest.TestCase):
+ def call_it(self, *args, **kwargs):
+ from lxml.html.html5parser import document_fromstring
+ return document_fromstring(*args, **kwargs)
+
+ def test_basic(self):
+ parser = DummyParser(doc=DummyElementTree(root='dummy root'))
+ elem = self.call_it('dummy input', parser=parser)
+ self.assertEqual(elem, 'dummy root')
+ self.assertEqual(parser.parse_args, ('dummy input',))
+ self.assertEqual(parser.parse_kwargs, {'useChardet': True})
+
+ def test_guess_charset_arg_gets_passed_to_parser(self):
+ parser = DummyParser()
+ elem = self.call_it('', guess_charset='gc_arg', parser=parser)
+ self.assertEqual(parser.parse_kwargs, {'useChardet': 'gc_arg'})
+
+ def test_raises_type_error_on_nonstring_input(self):
+ not_a_string = None
+ self.assertRaises(TypeError, self.call_it, not_a_string)
+
+ @skipUnless(html5lib, 'html5lib is not installed')
+ def test_integration(self):
+ elem = self.call_it(XHTML_TEST_DOCUMENT)
+ self.assertEqual(elem.tag, xhtml_tag('html'))
+
+class Test_fragments_fromstring(unittest.TestCase):
+ def call_it(self, *args, **kwargs):
+ from lxml.html.html5parser import fragments_fromstring
+ return fragments_fromstring(*args, **kwargs)
+
+ def test_basic(self):
+ parser = DummyParser(fragments='fragments')
+ fragments = self.call_it('dummy input', parser=parser)
+ self.assertEqual(fragments, 'fragments')
+
+ def test_guess_charset_arg_gets_passed_to_parser(self):
+ parser = DummyParser()
+ elem = self.call_it('', guess_charset='gc_arg', parser=parser)
+ self.assertEqual(parser.parseFragment_kwargs, {'useChardet': 'gc_arg'})
+
+ def test_raises_type_error_on_nonstring_input(self):
+ not_a_string = None
+ self.assertRaises(TypeError, self.call_it, not_a_string)
+
+ def test_no_leading_text_strips_empty_leading_text(self):
+ parser = DummyParser(fragments=['', 'tail'])
+ fragments = self.call_it('', parser=parser, no_leading_text=True)
+ self.assertEqual(fragments, ['tail'])
+
+ def test_no_leading_text_raises_error_if_leading_text(self):
+ parser = DummyParser(fragments=['leading text', 'tail'])
+ self.assertRaises(ParserError, self.call_it,
+ '', parser=parser, no_leading_text=True)
+
+ @skipUnless(html5lib, 'html5lib is not installed')
+ def test_integration(self):
+ fragments = self.call_it('a<b>c</b>')
+ self.assertEqual(len(fragments), 2)
+ self.assertEqual(fragments[0], 'a')
+ self.assertEqual(fragments[1].tag, xhtml_tag('b'))
+
+
+class Test_fragment_fromstring(unittest.TestCase):
+ def call_it(self, *args, **kwargs):
+ from lxml.html.html5parser import fragment_fromstring
+ return fragment_fromstring(*args, **kwargs)
+
+ def test_basic(self):
+ element = DummyElement()
+ parser = DummyParser(fragments=[element])
+ self.assertEqual(self.call_it('html', parser=parser), element)
+
+ def test_raises_type_error_on_nonstring_input(self):
+ not_a_string = None
+ self.assertRaises(TypeError, self.call_it, not_a_string)
+
+ def test_create_parent(self):
+ parser = DummyParser(fragments=['head', Element('child')])
+ elem = self.call_it('html', parser=parser, create_parent='parent')
+ self.assertEqual(elem.tag, 'parent')
+ self.assertEqual(elem.text, 'head')
+ self.assertEqual(elem[0].tag, 'child')
+
+ def test_create_parent_default_type_no_ns(self):
+ parser = DummyParser(fragments=[], namespaceHTMLElements=False)
+ elem = self.call_it('html', parser=parser, create_parent=True)
+ self.assertEqual(elem.tag, 'div')
+
+ def test_raises_error_on_leading_text(self):
+ parser = DummyParser(fragments=['leading text'])
+ self.assertRaises(ParserError, self.call_it, 'html', parser=parser)
+
+ def test_raises_error_if_no_elements_found(self):
+ parser = DummyParser(fragments=[])
+ self.assertRaises(ParserError, self.call_it, 'html', parser=parser)
+
+ def test_raises_error_if_multiple_elements_found(self):
+ parser = DummyParser(fragments=[DummyElement(), DummyElement()])
+ self.assertRaises(ParserError, self.call_it, 'html', parser=parser)
+
+ def test_raises_error_if_tail(self):
+ parser = DummyParser(fragments=[DummyElement(tail='tail')])
+ self.assertRaises(ParserError, self.call_it, 'html', parser=parser)
+
+class Test_fromstring(unittest.TestCase):
+ def call_it(self, *args, **kwargs):
+ from lxml.html.html5parser import fromstring
+ return fromstring(*args, **kwargs)
+
+ def test_returns_whole_doc_if_input_contains_html_tag(self):
+ parser = DummyParser(root='the doc')
+ self.assertEqual(self.call_it('<html></html>', parser=parser),
+ 'the doc')
+
+ def test_returns_whole_doc_if_input_contains_doctype(self):
+ parser = DummyParser(root='the doc')
+ self.assertEqual(self.call_it('<!DOCTYPE html>', parser=parser),
+ 'the doc')
+
+ def test_returns_whole_doc_if_head_not_empty(self, use_ns=True):
+ E = HTMLElementMaker(namespaceHTMLElements=use_ns)
+ root = E.html(E.head(E.title()))
+ parser = DummyParser(root=root)
+ self.assertEqual(self.call_it('', parser=parser), root)
+
+ def test_returns_whole_doc_if_head_not_empty_no_ns(self):
+ self.test_returns_whole_doc_if_head_not_empty(use_ns=False)
+
+ def test_returns_unwraps_body_if_single_element(self):
+ E = HTMLElementMaker()
+ elem = E.p('test')
+ root = E.html(E.head(), E.body(elem))
+ parser = DummyParser(root=root)
+ self.assertEqual(self.call_it('', parser=parser), elem)
+
+ def test_returns_body_if_has_text(self):
+ E = HTMLElementMaker()
+ elem = E.p('test')
+ body = E.body('text', elem)
+ root = E.html(E.head(), body)
+ parser = DummyParser(root=root)
+ self.assertEqual(self.call_it('', parser=parser), body)
+
+ def test_returns_body_if_single_element_has_tail(self):
+ E = HTMLElementMaker()
+ elem = E.p('test')
+ elem.tail = 'tail'
+ body = E.body(elem)
+ root = E.html(E.head(), body)
+ parser = DummyParser(root=root)
+ self.assertEqual(self.call_it('', parser=parser), body)
+
+ def test_wraps_multiple_fragments_in_div_no_ns(self):
+ E = HTMLElementMaker(namespaceHTMLElements=False)
+ parser = DummyParser(root=E.html(E.head(), E.body(E.h1(), E.p())),
+ namespaceHTMLElements=False)
+ elem = self.call_it('', parser=parser)
+ self.assertEqual(elem.tag, 'div')
+
+ def test_wraps_multiple_fragments_in_span_no_ns(self):
+ E = HTMLElementMaker(namespaceHTMLElements=False)
+ parser = DummyParser(root=E.html(E.head(), E.body('foo', E.a('link'))),
+ namespaceHTMLElements=False)
+ elem = self.call_it('', parser=parser)
+ self.assertEqual(elem.tag, 'span')
+
+ def test_raises_type_error_on_nonstring_input(self):
+ not_a_string = None
+ self.assertRaises(TypeError, self.call_it, not_a_string)
+
+ @skipUnless(html5lib, 'html5lib is not installed')
+ def test_integration_whole_doc(self):
+ elem = self.call_it(XHTML_TEST_DOCUMENT)
+ self.assertEqual(elem.tag, xhtml_tag('html'))
+
+ @skipUnless(html5lib, 'html5lib is not installed')
+ def test_integration_single_fragment(self):
+ elem = self.call_it('<p></p>')
+ self.assertEqual(elem.tag, xhtml_tag('p'))
+
+class Test_parse(unittest.TestCase):
+ def call_it(self, *args, **kwargs):
+ from lxml.html.html5parser import parse
+ return parse(*args, **kwargs)
+
+ def make_temp_file(self, contents=''):
+ tmpfile = tempfile.NamedTemporaryFile()
+ tmpfile.write(contents.encode('utf8'))
+ tmpfile.flush()
+ tmpfile.seek(0)
+ return tmpfile
+
+ def test_with_file_object(self):
+ parser = DummyParser(doc='the doc')
+ fp = open(__file__)
+ self.assertEqual(self.call_it(fp, parser=parser), 'the doc')
+ self.assertEqual(parser.parse_args, (fp,))
+
+ def test_with_file_name(self):
+ parser = DummyParser(doc='the doc')
+ tmpfile = self.make_temp_file('data')
+ self.assertEqual(self.call_it(tmpfile.name, parser=parser), 'the doc')
+ fp, = parser.parse_args
+ self.assertEqual(fp.read(), tmpfile.read())
+
+ def test_with_url(self):
+ parser = DummyParser(doc='the doc')
+ tmpfile = self.make_temp_file('content')
+ url = 'file://' + tmpfile.name
+ self.assertEqual(self.call_it(url, parser=parser), 'the doc')
+ fp, = parser.parse_args
+ self.assertEqual(fp.read(), tmpfile.read())
+
+ @skipUnless(html5lib, 'html5lib is not installed')
+ def test_integration(self):
+ doc = self.call_it(StringIO(XHTML_TEST_DOCUMENT))
+ root = doc.getroot()
+ self.assertEqual(root.tag, xhtml_tag('html'))
+
+def test_suite():
+ loader = unittest.TestLoader()
+ return loader.loadTestsFromModule(sys.modules[__name__])
+
+
+class HTMLElementMaker(ElementMaker):
+ def __init__(self, namespaceHTMLElements=True):
+ initargs = dict(makeelement=html_parser.makeelement)
+ if namespaceHTMLElements:
+ initargs.update(namespace=XHTML_NAMESPACE,
+ nsmap={None: XHTML_NAMESPACE})
+ ElementMaker.__init__(self, **initargs)
+
+class DummyParser(object):
+ def __init__(self, doc=None, root=None,
+ fragments=None, namespaceHTMLElements=True):
+ self.doc = doc or DummyElementTree(root=root)
+ self.fragments = fragments
+ self.tree = DummyTreeBuilder(namespaceHTMLElements)
+
+ def parse(self, *args, **kwargs):
+ self.parse_args = args
+ self.parse_kwargs = kwargs
+ return self.doc
+
+ def parseFragment(self, *args, **kwargs):
+ self.parseFragment_args = args
+ self.parseFragment_kwargs = kwargs
+ return self.fragments
+
+class DummyTreeBuilder(object):
+ def __init__(self, namespaceHTMLElements=True):
+ self.namespaceHTMLElements = namespaceHTMLElements
+
+class DummyElementTree(object):
+ def __init__(self, root):
+ self.root = root
+
+ def getroot(self):
+ return self.root
+
+class DummyElement(object):
+ def __init__(self, tag='tag', tail=None):
+ self.tag = tag
+ self.tail = tail
+
+def xhtml_tag(tag):
+ return '{%s}%s' % (XHTML_NAMESPACE, tag)
+
+# Small HTML document (doctype, titled head, empty body) fed to the
+# integration tests in this module.
+XHTML_TEST_DOCUMENT = '''
+ <!DOCTYPE html>
+ <html>
+ <head><title>TITLE</title></head>
+ <body></body>
+ </html>
+ '''

0 comments on commit 3a872ae

Please sign in to comment.