Skip to content

Commit

Permalink
Merge branch 'master' into dev
Browse files Browse the repository at this point in the history
# Conflicts:
#	lexbor
  • Loading branch information
rushter committed Sep 24, 2021
2 parents c8b9632 + 09a228e commit afba938
Show file tree
Hide file tree
Showing 11 changed files with 55 additions and 12 deletions.
24 changes: 24 additions & 0 deletions CHANGES.rst
@@ -1,6 +1,30 @@
selectolax Changelog
====================

Version 0.3.4
--------------

Released

- Fix ``HTMLParser.html``


Version 0.3.3
--------------

Released

- Use ``document`` as the root node for the ``HTMLParser.html`` and ``LexborHTMLParser.html`` properties

Version 0.3.2
--------------

Released

- Fix ``selector`` method for lexbor
- Improve text extraction for lexbor


Version 0.3.1
--------------

Expand Down
2 changes: 1 addition & 1 deletion LICENSE
@@ -1,7 +1,7 @@

MIT License

Copyright (c) 2018-2020, Artem Golubin
Copyright (c) 2018-2021, Artem Golubin

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

Expand Down
2 changes: 1 addition & 1 deletion README.rst
Expand Up @@ -98,7 +98,7 @@ To use ``lexbor``, just import the parser and use it in the similar way to the `
...: <div id="updated">2021-08-15</div>
...: """
In [3]: parser = selectolax.lexbor.LexborHTMLParser(html)
In [3]: parser = LexborHTMLParser(html)
In [4]: parser.root.css_first("#updated").text()
Out[4]: '2021-08-15'
Expand Down
2 changes: 1 addition & 1 deletion selectolax/__init__.py
Expand Up @@ -3,4 +3,4 @@

__author__ = """Artem Golubin"""
__email__ = 'me@rushter.com'
__version__ = '0.3.1'
__version__ = '0.3.4'
7 changes: 5 additions & 2 deletions selectolax/lexbor.pyx
Expand Up @@ -15,7 +15,7 @@ cdef class LexborHTMLParser:
Use this class to parse raw HTML.
This parser mimics most of the stuff from ``HTMLParser`` but not inherits in directly.
This parser mimics most of the behavior of ``HTMLParser`` but does not inherit from it directly.
Parameters
----------
Expand Down Expand Up @@ -144,7 +144,10 @@ cdef class LexborHTMLParser:
@property
def html(self):
"""Return HTML representation of the page."""
return self.root.html
if self.document == NULL:
return None
node = LexborNode()._cinit(<lxb_dom_node_t *> &self.document.dom_document, self)
return node.html

def css(self, str query):
"""A CSS selector.
Expand Down
6 changes: 3 additions & 3 deletions selectolax/lexbor/node.pxi
Expand Up @@ -267,9 +267,9 @@ cdef class LexborNode:
"""
if recursive:
lxb_dom_node_destroy(<lxb_dom_node_t *>self.node)
lxb_dom_node_destroy_deep(<lxb_dom_node_t *> self.node)
else:
lxb_dom_node_destroy_deep(<lxb_dom_node_t *>self.node)
lxb_dom_node_destroy(<lxb_dom_node_t *> self.node)

def strip_tags(self, list tags, bool recursive = False):
"""Remove specified tags from the HTML tree.
Expand Down Expand Up @@ -739,7 +739,7 @@ cdef class LexborNode:
-------
selector : The `Selector` class.
"""
return LexborSelector(<LexborNode>self.node, query)
return LexborSelector(self, query)

def __eq__(self, other):
if isinstance(other, str):
Expand Down
1 change: 1 addition & 0 deletions selectolax/parser.pxd
Expand Up @@ -440,6 +440,7 @@ cdef extern from "myhtml/tree.h" nogil:
myhtml_tree_node_t * myhtml_tree_node_clone(myhtml_tree_node_t* node)
myhtml_tree_node_t * myhtml_tree_node_insert_root(myhtml_tree_t* tree, myhtml_token_node_t* token,
myhtml_namespace ns)
void myhtml_tree_node_add_child(myhtml_tree_node_t* root, myhtml_tree_node_t* node)

cdef extern from "myhtml/serialization.h" nogil:
mystatus_t myhtml_serialization(myhtml_tree_node_t* scope_node, mycore_string_raw_t* str)
Expand Down
8 changes: 6 additions & 2 deletions selectolax/parser.pyx
Expand Up @@ -289,7 +289,11 @@ cdef class HTMLParser:
@property
def html(self):
"""Return HTML representation of the page."""
return self.root.html
if self.html_tree and self.html_tree.document:
node = Node()
node._init(self.html_tree.document, self)
return node.html
return None

def select(self, query=None):
"""Select nodes given a CSS selector.
Expand Down Expand Up @@ -381,7 +385,7 @@ cdef class HTMLParser:
raise RuntimeError("Can't init MyHTML Tree object.")

node = myhtml_node_clone_deep(html_tree, self.html_tree.node_html)
myhtml_tree_node_insert_root(html_tree, NULL, MyHTML_NAMESPACE_HTML)
myhtml_tree_node_add_child(html_tree.document, node)
html_tree.node_html = node

cls = HTMLParser.from_tree(
Expand Down
2 changes: 1 addition & 1 deletion setup.cfg
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 0.3.1
current_version = 0.3.4
commit = True
tag = True

Expand Down
2 changes: 1 addition & 1 deletion setup.py
Expand Up @@ -143,7 +143,7 @@ def make_extensions():

setup(
name='selectolax',
version='0.3.1',
version='0.3.4',
description="Fast HTML5 parser with CSS selectors.",
long_description=readme,
author="Artem Golubin",
Expand Down
11 changes: 11 additions & 0 deletions tests/test_parser.py
Expand Up @@ -108,3 +108,14 @@ def test_tags(parser):
""")
assert len(html_parser.tags('div')) == 5


# Checks that the ``html`` property of a parsed document still contains the
# ``<!DOCTYPE html>`` declaration that was present in the input markup.
# The ``parser`` argument is supplied by the parametrization built from
# _PARSERS_PARAMETRIZER (not shown in this view).
@pytest.mark.parametrize(*_PARSERS_PARAMETRIZER)
def test_preserves_doctype(parser):
html_parser = parser("""
<!DOCTYPE html>
<html>
<head><title>Test</title></head>
<body><p>Hello World</p></body>
</html>
""")
# The doctype must appear somewhere in the serialized output.
assert '<!DOCTYPE html>' in html_parser.html

0 comments on commit afba938

Please sign in to comment.