Merge branch 'master' into dev

# Conflicts: # lexbor
ProfoundNetworks · Sep 24, 2021 · afba938 · afba938
2 parents c8b9632 + 09a228e
commit afba938
Show file tree

Hide file tree

Showing 11 changed files with 55 additions and 12 deletions.
diff --git a/CHANGES.rst b/CHANGES.rst
@@ -1,6 +1,30 @@
 selectolax Changelog
 ====================
 
+Version 0.3.4
+--------------
+
+Released
+
+- Fix ``HTMLParser.html``
+
+
+Version 0.3.3
+--------------
+
+Released
+
+- Use ``document`` as the root node for the ``HTMLParser.html`` and ``LexborHTMLParser.html`` properties
+
+Version 0.3.2
+--------------
+
+Released
+
+- Fix the ``selector`` method for lexbor
+- Improve text extraction for lexbor
+
+
 Version 0.3.1
 --------------
 

diff --git a/LICENSE b/LICENSE
@@ -1,7 +1,7 @@
 
 MIT License
 
-Copyright (c) 2018-2020, Artem Golubin
+Copyright (c) 2018-2021, Artem Golubin
 
 Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
 

diff --git a/README.rst b/README.rst
@@ -98,7 +98,7 @@ To use ``lexbor``, just import the parser and use it in the similar way to the `
        ...: <div id="updated">2021-08-15</div>
        ...: """
 
-    In [3]: parser = selectolax.lexbor.LexborHTMLParser(html)
+    In [3]: parser = LexborHTMLParser(html)
     In [4]: parser.root.css_first("#updated").text()
     Out[4]: '2021-08-15'
 

diff --git a/selectolax/__init__.py b/selectolax/__init__.py
@@ -3,4 +3,4 @@
 
 __author__ = """Artem Golubin"""
 __email__ = 'me@rushter.com'
-__version__ = '0.3.1'
+__version__ = '0.3.4'
diff --git a/selectolax/lexbor.pyx b/selectolax/lexbor.pyx
@@ -15,7 +15,7 @@ cdef class LexborHTMLParser:
 
     Use this class to parse raw HTML.
 
-    This parser mimics most of the stuff from ``HTMLParser`` but not inherits in directly.
+    This parser mimics most of the behavior of ``HTMLParser`` but does not inherit from it directly.
 
     Parameters
     ----------
@@ -144,7 +144,10 @@ cdef class LexborHTMLParser:
     @property
     def html(self):
         """Return HTML representation of the page."""
-        return self.root.html
+        if self.document == NULL:
+            return None
+        node = LexborNode()._cinit(<lxb_dom_node_t *> &self.document.dom_document, self)
+        return node.html
 
     def css(self, str query):
         """A CSS selector.

diff --git a/selectolax/lexbor/node.pxi b/selectolax/lexbor/node.pxi
@@ -267,9 +267,9 @@ cdef class LexborNode:
 
         """
         if recursive:
-            lxb_dom_node_destroy(<lxb_dom_node_t *>self.node)
+            lxb_dom_node_destroy_deep(<lxb_dom_node_t *> self.node)
         else:
-            lxb_dom_node_destroy_deep(<lxb_dom_node_t *>self.node)
+            lxb_dom_node_destroy(<lxb_dom_node_t *> self.node)
 
     def strip_tags(self, list tags, bool recursive = False):
         """Remove specified tags from the HTML tree.
@@ -739,7 +739,7 @@ cdef class LexborNode:
         -------
         selector : The `Selector` class.
         """
-        return LexborSelector(<LexborNode>self.node, query)
+        return LexborSelector(self, query)
 
     def __eq__(self, other):
         if isinstance(other, str):

diff --git a/selectolax/parser.pxd b/selectolax/parser.pxd
@@ -440,6 +440,7 @@ cdef extern from "myhtml/tree.h" nogil:
     myhtml_tree_node_t * myhtml_tree_node_clone(myhtml_tree_node_t* node)
     myhtml_tree_node_t * myhtml_tree_node_insert_root(myhtml_tree_t* tree, myhtml_token_node_t* token,
                                                       myhtml_namespace ns)
+    void myhtml_tree_node_add_child(myhtml_tree_node_t* root, myhtml_tree_node_t* node)
 
 cdef extern from "myhtml/serialization.h" nogil:
     mystatus_t myhtml_serialization(myhtml_tree_node_t* scope_node, mycore_string_raw_t* str)

diff --git a/selectolax/parser.pyx b/selectolax/parser.pyx
@@ -289,7 +289,11 @@ cdef class HTMLParser:
     @property
     def html(self):
         """Return HTML representation of the page."""
-        return self.root.html
+        if self.html_tree and self.html_tree.document:
+            node = Node()
+            node._init(self.html_tree.document, self)
+            return node.html
+        return None
 
     def select(self, query=None):
         """Select nodes give a CSS selector.
@@ -381,7 +385,7 @@ cdef class HTMLParser:
             raise RuntimeError("Can't init MyHTML Tree object.")
 
         node = myhtml_node_clone_deep(html_tree, self.html_tree.node_html)
-        myhtml_tree_node_insert_root(html_tree, NULL, MyHTML_NAMESPACE_HTML)
+        myhtml_tree_node_add_child(html_tree.document, node)
         html_tree.node_html = node
 
         cls = HTMLParser.from_tree(

diff --git a/setup.cfg b/setup.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.3.1
+current_version = 0.3.4
 commit = True
 tag = True
 

diff --git a/setup.py b/setup.py
@@ -143,7 +143,7 @@ def make_extensions():
 
 setup(
     name='selectolax',
-    version='0.3.1',
+    version='0.3.4',
     description="Fast HTML5 parser with CSS selectors.",
     long_description=readme,
     author="Artem Golubin",

diff --git a/tests/test_parser.py b/tests/test_parser.py
@@ -108,3 +108,14 @@ def test_tags(parser):
     """)
     assert len(html_parser.tags('div')) == 5
 
+
+@pytest.mark.parametrize(*_PARSERS_PARAMETRIZER)
+def test_preserves_doctype(parser):
+    html_parser = parser("""
+    <!DOCTYPE html>
+    <html>
+        <head><title>Test</title></head>
+        <body><p>Hello World</p></body>
+    </html>
+    """)
+    assert '<!DOCTYPE html>' in html_parser.html