[svn r3632] r4141@delle: sbehnel | 2008-05-02 21:47:32 +0200

support XHTML tags in XPath expressions of lxml.html --HG-- branch : trunk
SimonSapin · May 2, 2008 · 8864abe · 8864abe
1 parent 75090d8
commit 8864abe
Show file tree

Hide file tree

Showing 4 changed files with 89 additions and 41 deletions.
diff --git a/CHANGES.txt b/CHANGES.txt
@@ -2,6 +2,21 @@
 lxml changelog
 ==============
 
+Under development
+=================
+
+Features added
+--------------
+
+* Most features in lxml.html work for XHTML namespaced tag names.
+
+Bugs fixed
+----------
+
+Other changes
+-------------
+
+
 2.1beta2 (2008-05-02)
 =====================
 

diff --git a/src/lxml/html/__init__.py b/src/lxml/html/__init__.py
@@ -22,16 +22,30 @@
     'find_rel_links', 'find_class', 'make_links_absolute',
     'resolve_base_href', 'iterlinks', 'rewrite_links', 'open_in_browser']
 
-_rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]")
+XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml"
+
+_rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]|descendant-or-self::x:a[@rel]",
+                               namespaces={'x':XHTML_NAMESPACE})
+_options_xpath = etree.XPath("descendant-or-self::option|descendant-or-self::x:option",
+                             namespaces={'x':XHTML_NAMESPACE})
+_forms_xpath = etree.XPath("descendant-or-self::form|descendant-or-self::x:form",
+                           namespaces={'x':XHTML_NAMESPACE})
 #_class_xpath = etree.XPath(r"descendant-or-self::*[regexp:match(@class, concat('\b', $class_name, '\b'))]", {'regexp': 'http://exslt.org/regular-expressions'})
 _class_xpath = etree.XPath("descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]")
 _id_xpath = etree.XPath("descendant-or-self::*[@id=$id]")
 _collect_string_content = etree.XPath("string()")
 _css_url_re = re.compile(r'url\((.*?)\)', re.I)
 _css_import_re = re.compile(r'@import "(.*?)"')
-_label_xpath = etree.XPath("//label[@for=$id]")
+_label_xpath = etree.XPath("//label[@for=$id]|//x:label[@for=$id]",
+                           namespaces={'x':XHTML_NAMESPACE})
 _archive_re = re.compile(r'[^ ]+')
 
+def _nons(tag):
+    # Strip the XHTML namespace from a tag name; anything else (incl. non-string tags) passes through.
+    if isinstance(tag, basestring) and tag.startswith('{%s}' % XHTML_NAMESPACE):
+        return tag.split('}')[-1]
+    return tag
+
 class HtmlMixin(object):
 
     def base_url(self):
@@ -48,23 +62,23 @@ def forms(self):
         """
         Return a list of all the forms
         """
-        return list(self.getiterator('form'))
+        return _forms_xpath(self)
     forms = property(forms, doc=forms.__doc__)
 
     def body(self):
         """
         Return the <body> element.  Can be called from a child element
         to get the document's head.
         """
-        return self.xpath('//body')[0]
+        return self.xpath('//body|//x:body', namespaces={'x':XHTML_NAMESPACE})[0]
     body = property(body, doc=body.__doc__)
 
     def head(self):
         """
         Returns the <head> element.  Can be called from a child
         element to get the document's head.
         """
-        return self.xpath('//head')[0]
+        return self.xpath('//head|//x:head', namespaces={'x':XHTML_NAMESPACE})[0]
     head = property(head, doc=head.__doc__)
 
     def _label__get(self):
@@ -85,7 +99,7 @@ def _label__set(self, label):
             raise TypeError(
                 "You cannot set a label for an element (%r) that has no id"
                 % self)
-        if not label.tag == 'label':
+        if _nons(label.tag) != 'label':
             raise TypeError(
                 "You can only assign label to a label element (not %r)"
                 % label)
@@ -228,7 +242,7 @@ def resolve_base_href(self):
         tag once it has been applied.
         """
         base_href = None
-        basetags = self.xpath('//base[@href]')
+        basetags = self.xpath('//base[@href]|//x:base[@href]', namespaces={'x':XHTML_NAMESPACE})
         for b in basetags:
             base_href = b.get('href')
             b.drop_tree()
@@ -249,11 +263,12 @@ def iterlinks(self):
         link_attrs = defs.link_attrs
         for el in self.getiterator():
             attribs = el.attrib
-            if el.tag != 'object':
+            tag = _nons(el.tag)
+            if tag != 'object':
                 for attrib in link_attrs:
                     if attrib in attribs:
                         yield (el, attrib, attribs[attrib], 0)
-            elif el.tag == 'object':
+            elif tag == 'object':
                 codebase = None
                 ## <object> tags have attributes that are relative to
                 ## codebase
@@ -272,7 +287,7 @@ def iterlinks(self):
                         if codebase is not None:
                             value = urlparse.urljoin(codebase, value)
                         yield (el, 'archive', value, match.start())
-            if el.tag == 'param':
+            if tag == 'param':
                 valuetype = el.get('valuetype') or ''
                 if valuetype.lower() == 'ref':
                     ## FIXME: while it's fine we *find* this link,
@@ -282,7 +297,7 @@ def iterlinks(self):
                     ## doesn't have a valuetype="ref" (which seems to be the norm)
                     ## http://www.w3.org/TR/html401/struct/objects.html#adef-valuetype
                     yield (el, 'value', el.get('value'), 0)
-            if el.tag == 'style' and el.text:
+            if tag == 'style' and el.text:
                 for match in _css_url_re.finditer(el.text):
                     yield (el, None, match.group(1), match.start(1))
                 for match in _css_import_re.finditer(el.text):
@@ -471,8 +486,8 @@ def fragments_fromstring(html, no_leading_text=False, base_url=None,
     if not start.startswith('<html') and not start.startswith('<!doctype'):
         html = '<html><body>%s</body></html>' % html
     doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
-    assert doc.tag == 'html'
-    bodies = [e for e in doc if e.tag == 'body']
+    assert _nons(doc.tag) == 'html'
+    bodies = [e for e in doc if _nons(e.tag) == 'body']
     assert len(bodies) == 1, ("too many bodies: %r in %r" % (bodies, html))
     body = bodies[0]
     elements = []
@@ -540,6 +555,8 @@ def fromstring(html, base_url=None, parser=None, **kw):
     # otherwise, lets parse it out...
     doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
     bodies = doc.findall('body')
+    if not bodies:
+        bodies = doc.findall('{%s}body' % XHTML_NAMESPACE)
     if bodies:
         body = bodies[0]
         if len(bodies) > 1:
@@ -558,6 +575,8 @@ def fromstring(html, base_url=None, parser=None, **kw):
     else:
         body = None
     heads = doc.findall('head')
+    if not heads:
+        heads = doc.findall('{%s}head' % XHTML_NAMESPACE)
     if heads:
         # Well, we have some sort of structure, so lets keep it all
         head = heads[0]
@@ -598,7 +617,7 @@ def _contains_block_level_tag(el):
     # FIXME: I could do this with XPath, but would that just be
     # unnecessarily slow?
     for el in el.getiterator():
-        if el.tag in defs.block_tags:
+        if _nons(el.tag) in defs.block_tags:
             return True
     return False
 
@@ -608,7 +627,7 @@ def _element_name(el):
     elif isinstance(el, basestring):
         return 'string'
     else:
-        return el.tag
+        return _nons(el.tag)
 
 ################################################################################
 # form handling
@@ -655,7 +674,10 @@ def _name(self):
             return self.get('name')
         elif self.get('id'):
             return '#' + self.get('id')
-        return str(self.body.findall('form').index(self))
+        forms = self.body.findall('form')
+        if not forms:
+            forms = self.body.findall('{%s}form' % XHTML_NAMESPACE)
+        return str(forms.index(self))
 
     def form_values(self):
         """
@@ -667,17 +689,18 @@ def form_values(self):
             name = el.name
             if not name:
                 continue
-            if el.tag == 'textarea':
+            tag = _nons(el.tag)
+            if tag == 'textarea':
                 results.append((name, el.value))
-            elif el.tag == 'select':
+            elif tag == 'select':
                 value = el.value
                 if el.multiple:
                     for v in value:
                         results.append((name, v))
                 elif value is not None:
                     results.append((name, el.value))
             else:
-                assert el.tag == 'input', (
+                assert tag == 'input', (
                     "Unexpected tag: %r" % el)
                 if el.checkable and not el.checked:
                     continue
@@ -801,8 +824,8 @@ class InputGetter(object):
     checkboxes and radio elements are returned individually.
     """
 
-    _name_xpath = etree.XPath(".//*[@name = $name and (name(.) = 'select' or name(.) = 'input' or name(.) = 'textarea')]")
-    _all_xpath = etree.XPath(".//*[name() = 'select' or name() = 'input' or name() = 'textarea']")
+    _name_xpath = etree.XPath(".//*[@name = $name and (local-name(.) = 'select' or local-name(.) = 'input' or local-name(.) = 'textarea')]")
+    _all_xpath = etree.XPath(".//*[local-name() = 'select' or local-name() = 'input' or local-name() = 'textarea']")
 
     def __init__(self, form):
         self.form = form
@@ -919,7 +942,7 @@ def _value__get(self):
         """
         if self.multiple:
             return MultipleSelectOptions(self)
-        for el in self.getiterator('option'):
+        for el in _options_xpath(self):
             if 'selected' in el.attrib:
                 value = el.get('value')
                 # FIXME: If value is None, what to return?, get_text()?
@@ -935,15 +958,15 @@ def _value__set(self, value):
             self.value.update(value)
             return
         if value is not None:
-            for el in self.getiterator('option'):
+            for el in _options_xpath(self):
                 # FIXME: also if el.get('value') is None?
                 if el.get('value') == value:
                     checked_option = el
                     break
             else:
                 raise ValueError(
                     "There is no option with the value of %r" % value)
-        for el in self.getiterator('option'):
+        for el in _options_xpath(self):
             if 'selected' in el.attrib:
                 del el.attrib['selected']
         if value is not None:
@@ -963,7 +986,7 @@ def value_options(self):
         All the possible values this select can have (the ``value``
         attribute of all the ``<option>`` elements.
         """
-        return [el.get('value') for el in self.getiterator('option')]
+        return [el.get('value') for el in _options_xpath(self)]
     value_options = property(value_options, doc=value_options.__doc__)
 
     def _multiple__get(self):
@@ -995,7 +1018,7 @@ def options(self):
         """
         Iterator of all the ``<option>`` elements.
         """
-        return self.select.getiterator('option')
+        return iter(_options_xpath(self.select))
     options = property(options)
 
     def __iter__(self):

diff --git a/src/lxml/html/clean.py b/src/lxml/html/clean.py
@@ -9,7 +9,7 @@
 import urlparse
 from lxml import etree
 from lxml.html import defs
-from lxml.html import fromstring, tostring
+from lxml.html import fromstring, tostring, XHTML_NAMESPACE, _nons
 
 try:
     set
@@ -62,7 +62,9 @@
     "descendant-or-self::*[@style]")
 
 _find_external_links = etree.XPath(
-    "descendant-or-self::a[normalize-space(@href) and substring(normalize-space(@href),1,1) != '#']")
+    ("descendant-or-self::a  [normalize-space(@href) and substring(normalize-space(@href),1,1) != '#'] |"
+     "descendant-or-self::x:a[normalize-space(@href) and substring(normalize-space(@href),1,1) != '#']"),
+    namespaces={'x':XHTML_NAMESPACE})
 
 class Cleaner(object):
     """
@@ -201,6 +203,11 @@ def __call__(self, doc):
         if hasattr(doc, 'getroot'):
             # ElementTree instance, instead of an element
             doc = doc.getroot()
+        # convert XHTML to HTML
+        for el in doc.iter():
+            tag = el.tag
+            if isinstance(tag, basestring):
+                el.tag = _nons(tag)
         # Normalize a case that IE treats <image> like <img>, and that
         # can confuse either this step or later steps.
         for el in doc.iter('image'):

diff --git a/src/lxml/html/formfill.py b/src/lxml/html/formfill.py
@@ -1,5 +1,6 @@
 from lxml.etree import XPath, ElementBase
-from lxml.html import fromstring, tostring
+from lxml.html import fromstring, tostring, XHTML_NAMESPACE
+from lxml.html import _forms_xpath, _options_xpath, _nons
 from lxml.html import defs
 
 __all__ = ['FormNotFound', 'fill_form', 'fill_form_html',
@@ -11,9 +12,11 @@ class FormNotFound(LookupError):
     Raised when no form can be found
     """
 
-_form_name_xpath = XPath('descendant-or-self::form[name=$name]')
-_input_xpath = XPath('descendant-or-self::input | descendant-or-self::select | descendant-or-self::textarea')
-_label_for_xpath = XPath('//label[@for=$for_id]')
+# Forms are matched on their ``name`` attribute (hence '@name'), for plain HTML and XHTML-namespaced tags.
+_form_name_xpath = XPath('descendant-or-self::form[@name=$name]|descendant-or-self::x:form[@name=$name]', namespaces={'x':XHTML_NAMESPACE})
+_input_xpath = XPath('|'.join(['descendant-or-self::'+_tag for _tag in ('input','select','textarea','x:input','x:select','x:textarea')]),
+                               namespaces={'x':XHTML_NAMESPACE})
+_label_for_xpath = XPath('//label[@for=$for_id]|//x:label[@for=$for_id]', namespaces={'x':XHTML_NAMESPACE})
 _name_xpath = XPath('descendant-or-self::*[@name=$name]')
 
 def fill_form(
@@ -69,7 +72,7 @@ def _fill_form(el, values):
             _fill_single(input, value)
 
 def _takes_multiple(input):
-    if input.tag == 'select' and input.get('multiple'):
+    if _nons(input.tag) == 'select' and input.get('multiple'):
         # FIXME: multiple="0"?
         return True
     type = input.get('type', '').lower()
@@ -96,8 +99,8 @@ def _fill_multiple(input, value):
         v = input.get('value')
         _check(input, v in value)
     else:
-        assert input.tag == 'select'
-        for option in input.findall('option'):
+        assert _nons(input.tag) == 'select'
+        for option in _options_xpath(input):
             v = option.get('value')
             if v is None:
                 # This seems to be the default, at least on IE
@@ -120,15 +123,15 @@ def _select(el, select):
             del el.attrib['selected']
 
 def _fill_single(input, value):
-    if input.tag == 'textarea':
+    if _nons(input.tag) == 'textarea':
         input.clear()
         input.text = value
     else:
         input.set('value', value)
 
 def _find_form(el, form_id=None, form_index=None):
     if form_id is None and form_index is None:
-        forms = el.getiterator('form')
+        forms = _forms_xpath(el)
         for form in forms:
             return form
         raise FormNotFound(
@@ -145,7 +148,7 @@ def _find_form(el, form_id=None, form_index=None):
                 "No form with the name or id of %r (forms: %s)"
                 % (id, ', '.join(_find_form_ids(el))))               
     if form_index is not None:
-        forms = el.getiterator('form')
+        forms = _forms_xpath(el)
         try:
             return forms[form_index]
         except IndexError:
@@ -154,7 +157,7 @@ def _find_form(el, form_id=None, form_index=None):
                 % (form_index, len(forms)))
 
 def _find_form_ids(el):
-    forms = el.getiterator('form')
+    forms = _forms_xpath(el)
     if not forms:
         yield '(no forms)'
         return
@@ -254,11 +257,11 @@ def insert_errors_html(html, values, **kw):
         return doc
 
 def _insert_error(el, error, error_class, error_creator):
-    if el.tag in defs.empty_tags or el.tag == 'textarea':
+    if _nons(el.tag) in defs.empty_tags or _nons(el.tag) == 'textarea':
         is_block = False
     else:
         is_block = True
-    if el.tag != 'form' and error_class:
+    if _nons(el.tag) != 'form' and error_class:
         _add_class(el, error_class)
     if el.get('id'):
         labels = _label_for_xpath(el, for_id=el.get('id'))