Permalink
Browse files

rewrite of node matcher to remove code duplication and make it PyPy compatible
  • Loading branch information...
1 parent 7dcdd7c commit 714024ea3e239cd23d59ca59bbccd0404eb35e93 @scoder scoder committed Apr 21, 2012
Showing with 152 additions and 209 deletions.
  1. +40 −7 src/lxml/apihelpers.pxi
  2. +63 −181 src/lxml/cleanup.pxi
  3. +48 −21 src/lxml/lxml.etree.pyx
  4. +1 −0 src/lxml/python.pxd
View
@@ -946,15 +946,15 @@ cdef inline bint _tagMatches(xmlNode* c_node, char* c_href, char* c_name):
else:
return 0
-cdef inline bint _tagMatchesExactly(xmlNode* c_node, char* c_href, char* c_name):
+cdef inline bint _tagMatchesExactly(xmlNode* c_node, qname* c_qname):
u"""Tests if the node matches namespace URI and tag name.
This differs from _tagMatches() in that it does not consider a
- NULL value in c_href a wildcard, and that it expects the c_name to
- be taken from the doc dict, i.e. it only compares the names by
+ NULL value in qname.href a wildcard, and that it expects the c_name
+ to be taken from the doc dict, i.e. it only compares the names by
address.
- A node matches if it matches both c_href and c_name.
+ A node matches if it matches both href and c_name of the qname.
A node matches c_href if any of the following is true:
* its namespace is NULL and c_href is the empty string
@@ -965,15 +965,48 @@ cdef inline bint _tagMatchesExactly(xmlNode* c_node, char* c_href, char* c_name)
* its name string points to the same address (!) as c_name
"""
cdef char* c_node_href
- if c_name is not NULL and c_name is not c_node.name:
+ if c_qname.c_name is not NULL and c_qname.c_name is not c_node.name:
return 0
c_node_href = _getNs(c_node)
- if c_href is NULL:
+ if c_qname.href is NULL:
return c_node_href is NULL or c_node_href[0] == '\0'
elif c_node_href is NULL:
return 0
else:
- return cstring_h.strcmp(c_href, c_node_href) == 0
+ return cstring_h.strcmp(python.__cstr(c_qname.href), c_node_href) == 0
+
+cdef Py_ssize_t _mapTagsToQnameMatchArray(xmlDoc* c_doc, list ns_tags,
+ qname* c_ns_tags, bint force_into_dict) except -1:
+ u"""Map a sequence of (name, namespace) pairs to a qname array for efficient
+ matching with _tagMatchesExactly() above.
+
+ Note that each qname struct in the array owns its href byte string object
+ if it is not NULL.
+ """
+ cdef Py_ssize_t count = 0
+ cdef char* c_tag
+ cdef bytes ns, tag
+ for ns, tag in ns_tags:
+ if tag is None:
+ c_tag = NULL
+ elif force_into_dict:
+ c_tag = tree.xmlDictLookup(c_doc.dict, _cstr(tag), len(tag))
+ if c_tag is NULL:
+ raise MemoryError()
+ else:
+ c_tag = tree.xmlDictExists(c_doc.dict, _cstr(tag), len(tag))
+ if c_tag is NULL:
+ # not in the dict => not in the document
+ continue
+ c_ns_tags[0].c_name = c_tag
+ if ns is None:
+ c_ns_tags[0].href = NULL
+ else:
+ python.Py_INCREF(ns) # keep an owned reference!
+ c_ns_tags[0].href = <python.PyObject*>ns
+ c_ns_tags += 1
+ count += 1
+ return count
cdef int _removeNode(_Document doc, xmlNode* c_node) except -1:
u"""Unlink and free a node and subnodes if possible. Otherwise, make sure
View
@@ -22,59 +22,30 @@ def strip_attributes(tree_or_element, *attribute_names):
'simpleattr',
'{http://some/ns}attrname')
"""
+ cdef _MultiTagMatcher matcher
cdef _Element element
- cdef list ns_tags
- cdef char** c_ns_tags
- cdef Py_ssize_t c_tag_count
element = _rootNodeOrRaise(tree_or_element)
- if not attribute_names: return
-
- ns_tags = _sortedTagList([ _getNsTag(attr)
- for attr in <tuple>attribute_names ])
- ns_tags = [ (ns, tag if tag != b'*' else None)
- for ns, tag in ns_tags ]
-
- # tag names are passes as C pointers as this allows us to take
- # them from the doc dict and do pointer comparisons
- c_ns_tags = <char**> stdlib.malloc(sizeof(char*) * len(ns_tags) * 2 + 2)
- if c_ns_tags is NULL:
- raise MemoryError()
+ if not attribute_names:
+ return
- try:
- c_tag_count = _mapTagsToCharArray(element._doc._c_doc, ns_tags, c_ns_tags)
- if c_tag_count > 0:
- _strip_attributes(element._c_node, c_ns_tags, c_tag_count)
- finally:
- stdlib.free(c_ns_tags)
+ matcher = _MultiTagMatcher(attribute_names)
+ matcher.cacheTags(element._doc)
+ if matcher.rejectsAllAttributes():
+ return
+ _strip_attributes(element._c_node, matcher)
-cdef _strip_attributes(xmlNode* c_node, char** c_ns_tags, Py_ssize_t c_tag_count):
+cdef _strip_attributes(xmlNode* c_node, _MultiTagMatcher matcher):
cdef xmlAttr* c_attr
- cdef Py_ssize_t i
- cdef char* c_href
- cdef char* c_name
-
+ cdef xmlAttr* c_next_attr
tree.BEGIN_FOR_EACH_ELEMENT_FROM(c_node, c_node, 1)
if c_node.type == tree.XML_ELEMENT_NODE:
- if c_node.properties is not NULL:
- for i in range(c_tag_count):
- c_href = c_ns_tags[2*i]
- c_name = c_ns_tags[2*i+1]
- # must compare attributes manually to make sure we
- # only match on wildcard tag names if the attribute
- # has no namespace
- c_attr = c_node.properties
- while c_attr is not NULL:
- if c_name is NULL or c_attr.name == c_name:
- if c_href is NULL:
- if c_attr.ns is NULL or c_attr.ns.href is NULL:
- tree.xmlRemoveProp(c_attr)
- break
- elif c_attr.ns is not NULL and c_attr.ns.href is not NULL:
- if cstring_h.strcmp(c_attr.ns.href, c_href) == 0:
- tree.xmlRemoveProp(c_attr)
- break
- c_attr = c_attr.next
+ c_attr = c_node.properties
+ while c_attr is not NULL:
+ c_next_attr = c_attr.next
+ if matcher.matchesAttribute(c_attr):
+ tree.xmlRemoveProp(c_attr)
+ c_attr = c_next_attr
tree.END_FOR_EACH_ELEMENT_FROM(c_node)
def strip_elements(tree_or_element, *tag_names, bint with_tail=True):
@@ -100,48 +71,36 @@ def strip_elements(tree_or_element, *tag_names, bint with_tail=True):
lxml.etree.Comment # comments
)
"""
+ cdef _MultiTagMatcher matcher
cdef _Element element
cdef _Document doc
cdef list ns_tags
- cdef char** c_ns_tags
+ cdef qname* c_ns_tags
cdef Py_ssize_t c_tag_count
cdef bint strip_comments = 0, strip_pis = 0, strip_entities = 0
doc = _documentOrRaise(tree_or_element)
element = _rootNodeOrRaise(tree_or_element)
- if not tag_names: return
+ if not tag_names:
+ return
- ns_tags = _filterSpecialTagNames(
- tag_names, &strip_comments, &strip_pis, &strip_entities)
+ matcher = _MultiTagMatcher(tag_names)
+ matcher.cacheTags(doc)
+ if matcher.rejectsAll():
+ return
- if (strip_comments or strip_pis) and isinstance(tree_or_element, _ElementTree):
+ if isinstance(tree_or_element, _ElementTree):
# include PIs and comments next to the root node
- if strip_comments:
+ if matcher.matchesType(tree.XML_COMMENT_NODE):
_removeSiblings(element._c_node, tree.XML_COMMENT_NODE, with_tail)
- if strip_pis:
+ if matcher.matchesType(tree.XML_PI_NODE):
_removeSiblings(element._c_node, tree.XML_PI_NODE, with_tail)
+ _strip_elements(doc, element._c_node, matcher, with_tail)
- # tag names are passed as C pointers as this allows us to take
- # them from the doc dict and do pointer comparisons
- c_ns_tags = <char**> stdlib.malloc(sizeof(char*) * len(ns_tags) * 2 + 2)
- if c_ns_tags is NULL:
- raise MemoryError()
-
- try:
- c_tag_count = _mapTagsToCharArray(doc._c_doc, ns_tags, c_ns_tags)
- if c_tag_count > 0 or strip_comments or strip_pis or strip_entities:
- _strip_elements(doc, element._c_node, c_ns_tags, c_tag_count,
- strip_comments, strip_pis, strip_entities, with_tail)
- finally:
- stdlib.free(c_ns_tags)
-
-cdef _strip_elements(_Document doc, xmlNode* c_node,
- char** c_ns_tags, Py_ssize_t c_tag_count,
- bint strip_comments, bint strip_pis, bint strip_entities,
+cdef _strip_elements(_Document doc, xmlNode* c_node, _MultiTagMatcher matcher,
bint with_tail):
cdef xmlNode* c_child
cdef xmlNode* c_next
- cdef Py_ssize_t i
tree.BEGIN_FOR_EACH_ELEMENT_FROM(c_node, c_node, 1)
if c_node.type == tree.XML_ELEMENT_NODE:
@@ -151,20 +110,16 @@ cdef _strip_elements(_Document doc, xmlNode* c_node,
c_child = _findChildForwards(c_node, 0)
while c_child is not NULL:
c_next = _nextElement(c_child)
- if c_child.type == tree.XML_ELEMENT_NODE:
- for i in range(0, c_tag_count*2, 2):
- if _tagMatchesExactly(c_child, c_ns_tags[i], c_ns_tags[i+1]):
- if not with_tail:
- tree.xmlUnlinkNode(c_child)
- _removeNode(doc, c_child)
- break
- elif c_child.type == tree.XML_COMMENT_NODE and strip_comments \
- or c_child.type == tree.XML_PI_NODE and strip_pis \
- or c_child.type == tree.XML_ENTITY_REF_NODE and strip_entities:
- if with_tail:
- _removeText(c_child.next)
- tree.xmlUnlinkNode(c_child)
- attemptDeallocation(c_child)
+ if matcher.matches(c_child):
+ if c_child.type == tree.XML_ELEMENT_NODE:
+ if not with_tail:
+ tree.xmlUnlinkNode(c_child)
+ _removeNode(doc, c_child)
+ else:
+ if with_tail:
+ _removeText(c_child.next)
+ tree.xmlUnlinkNode(c_child)
+ attemptDeallocation(c_child)
c_child = c_next
tree.END_FOR_EACH_ELEMENT_FROM(c_node)
@@ -191,6 +146,7 @@ def strip_tags(tree_or_element, *tag_names):
Comment # comments (including their text!)
)
"""
+ cdef _MultiTagMatcher matcher
cdef _Element element
cdef _Document doc
cdef list ns_tags
@@ -200,35 +156,23 @@ def strip_tags(tree_or_element, *tag_names):
doc = _documentOrRaise(tree_or_element)
element = _rootNodeOrRaise(tree_or_element)
- if not tag_names: return
+ if not tag_names:
+ return
- ns_tags = _filterSpecialTagNames(
- tag_names, &strip_comments, &strip_pis, &strip_entities)
+ matcher = _MultiTagMatcher(tag_names)
+ matcher.cacheTags(doc)
+ if matcher.rejectsAll():
+ return
- if (strip_comments or strip_pis) and isinstance(tree_or_element, _ElementTree):
+ if isinstance(tree_or_element, _ElementTree):
# include PIs and comments next to the root node
- if strip_comments:
+ if matcher.matchesType(tree.XML_COMMENT_NODE):
_removeSiblings(element._c_node, tree.XML_COMMENT_NODE, 0)
- if strip_pis:
+ if matcher.matchesType(tree.XML_PI_NODE):
_removeSiblings(element._c_node, tree.XML_PI_NODE, 0)
+ _strip_tags(doc, element._c_node, matcher)
- # tag names are passes as C pointers as this allows us to take
- # them from the doc dict and do pointer comparisons
- c_ns_tags = <char**> stdlib.malloc(sizeof(char*) * len(ns_tags) * 2 + 2)
- if c_ns_tags is NULL:
- raise MemoryError()
-
- try:
- c_tag_count = _mapTagsToCharArray(doc._c_doc, ns_tags, c_ns_tags)
- if c_tag_count > 0 or strip_comments or strip_pis or strip_entities:
- _strip_tags(doc, element._c_node, c_ns_tags, c_tag_count,
- strip_comments, strip_pis, strip_entities)
- finally:
- stdlib.free(c_ns_tags)
-
-cdef _strip_tags(_Document doc, xmlNode* c_node,
- char** c_ns_tags, Py_ssize_t c_tag_count,
- bint strip_comments, bint strip_pis, bint strip_entities):
+cdef _strip_tags(_Document doc, xmlNode* c_node, _MultiTagMatcher matcher):
cdef xmlNode* c_child
cdef xmlNode* c_next
cdef Py_ssize_t i
@@ -240,82 +184,20 @@ cdef _strip_tags(_Document doc, xmlNode* c_node,
# c_node itself
c_child = _findChildForwards(c_node, 0)
while c_child is not NULL:
+ if not matcher.matches(c_child):
+ c_child = _nextElement(c_child)
+ continue
if c_child.type == tree.XML_ELEMENT_NODE:
- for i in range(c_tag_count):
- if _tagMatchesExactly(c_child, c_ns_tags[2*i], c_ns_tags[2*i+1]):
- c_next = _findChildForwards(c_child, 0) or _nextElement(c_child)
- _replaceNodeByChildren(doc, c_child)
- if not attemptDeallocation(c_child):
- if c_child.nsDef is not NULL:
- # make namespaces absolute
- moveNodeToDocument(doc, doc._c_doc, c_child)
- c_child = c_next
- break
- else:
- c_child = _nextElement(c_child)
+ c_next = _findChildForwards(c_child, 0) or _nextElement(c_child)
+ _replaceNodeByChildren(doc, c_child)
+ if not attemptDeallocation(c_child):
+ if c_child.nsDef is not NULL:
+ # make namespaces absolute
+ moveNodeToDocument(doc, doc._c_doc, c_child)
+ c_child = c_next
else:
c_next = _nextElement(c_child)
- if c_child.type == tree.XML_COMMENT_NODE and strip_comments \
- or c_child.type == tree.XML_PI_NODE and strip_pis \
- or c_child.type == tree.XML_ENTITY_REF_NODE and strip_entities:
- tree.xmlUnlinkNode(c_child)
- attemptDeallocation(c_child)
+ tree.xmlUnlinkNode(c_child)
+ attemptDeallocation(c_child)
c_child = c_next
tree.END_FOR_EACH_ELEMENT_FROM(c_node)
-
-
-# helper functions
-
-cdef list _sortedTagList(list l):
- # This is required since the namespace may be None (which Py3
- # can't compare to strings). A bit of overhead, but at least
- # portable ...
- cdef list decorated_list
- cdef tuple ns_tag
- cdef Py_ssize_t i
- decorated_list = [ (ns_tag[0] or b'', ns_tag[1], i, ns_tag)
- for i, ns_tag in enumerate(l) ]
- decorated_list.sort()
- return [ item[-1] for item in decorated_list ]
-
-cdef list _filterSpecialTagNames(tag_names, bint* comments, bint* pis, bint* entities):
- cdef list ns_tags
- comments[0] = 0
- pis[0] = 0
- entities[0] = 0
-
- ns_tags = []
- for tag in tag_names:
- if tag is Comment:
- comments[0] = 1
- elif tag is ProcessingInstruction:
- pis[0] = 1
- elif tag is Entity:
- entities[0] = 1
- else:
- ns_tags.append(_getNsTag(tag))
-
- return [ (ns, tag if tag != b'*' else None)
- for ns, tag in _sortedTagList(ns_tags) ]
-
-cdef Py_ssize_t _mapTagsToCharArray(xmlDoc* c_doc, list ns_tags,
- char** c_ns_tags) except -1:
- cdef Py_ssize_t count = 0
- cdef char* c_tag
- for ns, tag in ns_tags:
- if ns is None:
- c_ns_tags[0] = NULL
- else:
- c_ns_tags[0] = _cstr(ns)
- if tag is None:
- c_ns_tags[1] = NULL
- else:
- c_tag = _cstr(tag)
- c_ns_tags[1] = tree.xmlDictExists(
- c_doc.dict, c_tag, cstring_h.strlen(c_tag))
- if c_ns_tags[1] == NULL:
- # not in the dict => not in the document
- continue
- c_ns_tags += 2
- count += 1
- return count
Oops, something went wrong.

0 comments on commit 714024e

Please sign in to comment.