Skip to content

Commit

Permalink
[svn r3632] r4141@delle: sbehnel | 2008-05-02 21:47:32 +0200
Browse files Browse the repository at this point in the history
 support XHTML tags in XPath expressions of lxml.html

--HG--
branch : trunk
  • Loading branch information
scoder committed May 2, 2008
1 parent 75090d8 commit 8864abe
Show file tree
Hide file tree
Showing 4 changed files with 89 additions and 41 deletions.
15 changes: 15 additions & 0 deletions CHANGES.txt
Expand Up @@ -2,6 +2,21 @@
lxml changelog
==============

Under development
=================

Features added
--------------

* Most features in lxml.html now work with XHTML-namespaced tag names.

Bugs fixed
----------

Other changes
-------------


2.1beta2 (2008-05-02)
=====================

Expand Down
75 changes: 49 additions & 26 deletions src/lxml/html/__init__.py
Expand Up @@ -22,16 +22,30 @@
'find_rel_links', 'find_class', 'make_links_absolute',
'resolve_base_href', 'iterlinks', 'rewrite_links', 'open_in_browser']

_rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]")
XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml"

_rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]|descendant-or-self::x:a[@rel]",
namespaces={'x':XHTML_NAMESPACE})
_options_xpath = etree.XPath("descendant-or-self::option|descendant-or-self::x:option",
namespaces={'x':XHTML_NAMESPACE})
_forms_xpath = etree.XPath("descendant-or-self::form|descendant-or-self::x:form",
namespaces={'x':XHTML_NAMESPACE})
#_class_xpath = etree.XPath(r"descendant-or-self::*[regexp:match(@class, concat('\b', $class_name, '\b'))]", {'regexp': 'http://exslt.org/regular-expressions'})
_class_xpath = etree.XPath("descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]")
_id_xpath = etree.XPath("descendant-or-self::*[@id=$id]")
_collect_string_content = etree.XPath("string()")
_css_url_re = re.compile(r'url\((.*?)\)', re.I)
_css_import_re = re.compile(r'@import "(.*?)"')
_label_xpath = etree.XPath("//label[@for=$id]")
_label_xpath = etree.XPath("//label[@for=$id]|//x:label[@for=$id]",
namespaces={'x':XHTML_NAMESPACE})
_archive_re = re.compile(r'[^ ]+')

def _nons(tag):
    """
    Strip the XHTML namespace from a tag name.

    Returns the local name when ``tag`` is a string in Clark notation
    (``{namespace}name``) whose namespace is exactly XHTML_NAMESPACE.
    Any other value -- a plain or differently namespaced tag name, or
    a non-string tag -- is returned unchanged.
    """
    if isinstance(tag, basestring):
        # Match the closing brace as part of the prefix so that a longer
        # namespace URI that merely starts with the XHTML one is not
        # mistaken for XHTML; startswith() is also safe on an empty string.
        xhtml_prefix = '{%s}' % XHTML_NAMESPACE
        if tag.startswith(xhtml_prefix):
            return tag[len(xhtml_prefix):]
    return tag

class HtmlMixin(object):

def base_url(self):
Expand All @@ -48,23 +62,23 @@ def forms(self):
"""
Return a list of all the forms
"""
return list(self.getiterator('form'))
return _forms_xpath(self)
forms = property(forms, doc=forms.__doc__)

def body(self):
"""
Return the <body> element. Can be called from a child element
to get the document's body.
"""
return self.xpath('//body')[0]
return self.xpath('//body|//x:body', namespaces={'x':XHTML_NAMESPACE})[0]
body = property(body, doc=body.__doc__)

def head(self):
"""
Returns the <head> element. Can be called from a child
element to get the document's head.
"""
return self.xpath('//head')[0]
return self.xpath('//head|//x:head', namespaces={'x':XHTML_NAMESPACE})[0]
head = property(head, doc=head.__doc__)

def _label__get(self):
Expand All @@ -85,7 +99,7 @@ def _label__set(self, label):
raise TypeError(
"You cannot set a label for an element (%r) that has no id"
% self)
if not label.tag == 'label':
if _nons(label.tag) != 'label':
raise TypeError(
"You can only assign label to a label element (not %r)"
% label)
Expand Down Expand Up @@ -228,7 +242,7 @@ def resolve_base_href(self):
tag once it has been applied.
"""
base_href = None
basetags = self.xpath('//base[@href]')
basetags = self.xpath('//base[@href]|//x:base[@href]', namespaces={'x':XHTML_NAMESPACE})
for b in basetags:
base_href = b.get('href')
b.drop_tree()
Expand All @@ -249,11 +263,12 @@ def iterlinks(self):
link_attrs = defs.link_attrs
for el in self.getiterator():
attribs = el.attrib
if el.tag != 'object':
tag = _nons(el.tag)
if tag != 'object':
for attrib in link_attrs:
if attrib in attribs:
yield (el, attrib, attribs[attrib], 0)
elif el.tag == 'object':
elif tag == 'object':
codebase = None
## <object> tags have attributes that are relative to
## codebase
Expand All @@ -272,7 +287,7 @@ def iterlinks(self):
if codebase is not None:
value = urlparse.urljoin(codebase, value)
yield (el, 'archive', value, match.start())
if el.tag == 'param':
if tag == 'param':
valuetype = el.get('valuetype') or ''
if valuetype.lower() == 'ref':
## FIXME: while it's fine we *find* this link,
Expand All @@ -282,7 +297,7 @@ def iterlinks(self):
## doesn't have a valuetype="ref" (which seems to be the norm)
## http://www.w3.org/TR/html401/struct/objects.html#adef-valuetype
yield (el, 'value', el.get('value'), 0)
if el.tag == 'style' and el.text:
if tag == 'style' and el.text:
for match in _css_url_re.finditer(el.text):
yield (el, None, match.group(1), match.start(1))
for match in _css_import_re.finditer(el.text):
Expand Down Expand Up @@ -471,8 +486,8 @@ def fragments_fromstring(html, no_leading_text=False, base_url=None,
if not start.startswith('<html') and not start.startswith('<!doctype'):
html = '<html><body>%s</body></html>' % html
doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
assert doc.tag == 'html'
bodies = [e for e in doc if e.tag == 'body']
assert _nons(doc.tag) == 'html'
bodies = [e for e in doc if _nons(e.tag) == 'body']
assert len(bodies) == 1, ("too many bodies: %r in %r" % (bodies, html))
body = bodies[0]
elements = []
Expand Down Expand Up @@ -540,6 +555,8 @@ def fromstring(html, base_url=None, parser=None, **kw):
# otherwise, lets parse it out...
doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
bodies = doc.findall('body')
if not bodies:
bodies = doc.findall('{%s}body' % XHTML_NAMESPACE)
if bodies:
body = bodies[0]
if len(bodies) > 1:
Expand All @@ -558,6 +575,8 @@ def fromstring(html, base_url=None, parser=None, **kw):
else:
body = None
heads = doc.findall('head')
if not heads:
heads = doc.findall('{%s}head' % XHTML_NAMESPACE)
if heads:
# Well, we have some sort of structure, so lets keep it all
head = heads[0]
Expand Down Expand Up @@ -598,7 +617,7 @@ def _contains_block_level_tag(el):
# FIXME: I could do this with XPath, but would that just be
# unnecessarily slow?
for el in el.getiterator():
if el.tag in defs.block_tags:
if _nons(el.tag) in defs.block_tags:
return True
return False

Expand All @@ -608,7 +627,7 @@ def _element_name(el):
elif isinstance(el, basestring):
return 'string'
else:
return el.tag
return _nons(el.tag)

################################################################################
# form handling
Expand Down Expand Up @@ -655,7 +674,10 @@ def _name(self):
return self.get('name')
elif self.get('id'):
return '#' + self.get('id')
return str(self.body.findall('form').index(self))
forms = self.body.findall('form')
if not forms:
forms = self.body.findall('{%s}form' % XHTML_NAMESPACE)
return str(forms.index(self))

def form_values(self):
"""
Expand All @@ -667,17 +689,18 @@ def form_values(self):
name = el.name
if not name:
continue
if el.tag == 'textarea':
tag = _nons(el.tag)
if tag == 'textarea':
results.append((name, el.value))
elif el.tag == 'select':
elif tag == 'select':
value = el.value
if el.multiple:
for v in value:
results.append((name, v))
elif value is not None:
results.append((name, el.value))
else:
assert el.tag == 'input', (
assert tag == 'input', (
"Unexpected tag: %r" % el)
if el.checkable and not el.checked:
continue
Expand Down Expand Up @@ -801,8 +824,8 @@ class InputGetter(object):
checkboxes and radio elements are returned individually.
"""

_name_xpath = etree.XPath(".//*[@name = $name and (name(.) = 'select' or name(.) = 'input' or name(.) = 'textarea')]")
_all_xpath = etree.XPath(".//*[name() = 'select' or name() = 'input' or name() = 'textarea']")
_name_xpath = etree.XPath(".//*[@name = $name and (local-name(.) = 'select' or local-name(.) = 'input' or local-name(.) = 'textarea')]")
_all_xpath = etree.XPath(".//*[local-name() = 'select' or local-name() = 'input' or local-name() = 'textarea']")

def __init__(self, form):
self.form = form
Expand Down Expand Up @@ -919,7 +942,7 @@ def _value__get(self):
"""
if self.multiple:
return MultipleSelectOptions(self)
for el in self.getiterator('option'):
for el in _options_xpath(self):
if 'selected' in el.attrib:
value = el.get('value')
# FIXME: If value is None, what to return?, get_text()?
Expand All @@ -935,15 +958,15 @@ def _value__set(self, value):
self.value.update(value)
return
if value is not None:
for el in self.getiterator('option'):
for el in _options_xpath(self):
# FIXME: also if el.get('value') is None?
if el.get('value') == value:
checked_option = el
break
else:
raise ValueError(
"There is no option with the value of %r" % value)
for el in self.getiterator('option'):
for el in _options_xpath(self):
if 'selected' in el.attrib:
del el.attrib['selected']
if value is not None:
Expand All @@ -963,7 +986,7 @@ def value_options(self):
All the possible values this select can have (the ``value``
attribute of all the ``<option>`` elements.
"""
return [el.get('value') for el in self.getiterator('option')]
return [el.get('value') for el in _options_xpath(self)]
value_options = property(value_options, doc=value_options.__doc__)

def _multiple__get(self):
Expand Down Expand Up @@ -995,7 +1018,7 @@ def options(self):
"""
Iterator of all the ``<option>`` elements.
"""
return self.select.getiterator('option')
return iter(_options_xpath(self.select))
options = property(options)

def __iter__(self):
Expand Down
11 changes: 9 additions & 2 deletions src/lxml/html/clean.py
Expand Up @@ -9,7 +9,7 @@
import urlparse
from lxml import etree
from lxml.html import defs
from lxml.html import fromstring, tostring
from lxml.html import fromstring, tostring, XHTML_NAMESPACE, _nons

try:
set
Expand Down Expand Up @@ -62,7 +62,9 @@
"descendant-or-self::*[@style]")

_find_external_links = etree.XPath(
"descendant-or-self::a[normalize-space(@href) and substring(normalize-space(@href),1,1) != '#']")
("descendant-or-self::a [normalize-space(@href) and substring(normalize-space(@href),1,1) != '#'] |"
"descendant-or-self::x:a[normalize-space(@href) and substring(normalize-space(@href),1,1) != '#']"),
namespaces={'x':XHTML_NAMESPACE})

class Cleaner(object):
"""
Expand Down Expand Up @@ -201,6 +203,11 @@ def __call__(self, doc):
if hasattr(doc, 'getroot'):
# ElementTree instance, instead of an element
doc = doc.getroot()
# convert XHTML to HTML
for el in doc.iter():
tag = el.tag
if isinstance(tag, basestring):
el.tag = _nons(tag)
# Normalize a case that IE treats <image> like <img>, and that
# can confuse either this step or later steps.
for el in doc.iter('image'):
Expand Down
29 changes: 16 additions & 13 deletions src/lxml/html/formfill.py
@@ -1,5 +1,6 @@
from lxml.etree import XPath, ElementBase
from lxml.html import fromstring, tostring
from lxml.html import fromstring, tostring, XHTML_NAMESPACE
from lxml.html import _forms_xpath, _options_xpath, _nons
from lxml.html import defs

__all__ = ['FormNotFound', 'fill_form', 'fill_form_html',
Expand All @@ -11,9 +12,11 @@ class FormNotFound(LookupError):
Raised when no form can be found
"""

_form_name_xpath = XPath('descendant-or-self::form[name=$name]')
_input_xpath = XPath('descendant-or-self::input | descendant-or-self::select | descendant-or-self::textarea')
_label_for_xpath = XPath('//label[@for=$for_id]')
_form_name_xpath = XPath('descendant-or-self::form[name=$name]|descendant-or-self::x:form[name=$name]', namespaces={'x':XHTML_NAMESPACE})
_input_xpath = XPath('|'.join(['descendant-or-self::'+_tag for _tag in ('input','select','textarea','x:input','x:select','x:textarea')]),
namespaces={'x':XHTML_NAMESPACE})
_label_for_xpath = XPath('//label[@for=$for_id]|//x:label[@for=$for_id]',
namespaces={'x':XHTML_NAMESPACE})
_name_xpath = XPath('descendant-or-self::*[@name=$name]')

def fill_form(
Expand Down Expand Up @@ -69,7 +72,7 @@ def _fill_form(el, values):
_fill_single(input, value)

def _takes_multiple(input):
if input.tag == 'select' and input.get('multiple'):
if _nons(input.tag) == 'select' and input.get('multiple'):
# FIXME: multiple="0"?
return True
type = input.get('type', '').lower()
Expand All @@ -96,8 +99,8 @@ def _fill_multiple(input, value):
v = input.get('value')
_check(input, v in value)
else:
assert input.tag == 'select'
for option in input.findall('option'):
assert _nons(input.tag) == 'select'
for option in _options_xpath(input):
v = option.get('value')
if v is None:
# This seems to be the default, at least on IE
Expand All @@ -120,15 +123,15 @@ def _select(el, select):
del el.attrib['selected']

def _fill_single(input, value):
if input.tag == 'textarea':
if _nons(input.tag) == 'textarea':
input.clear()
input.text = value
else:
input.set('value', value)

def _find_form(el, form_id=None, form_index=None):
if form_id is None and form_index is None:
forms = el.getiterator('form')
forms = _forms_xpath(el)
for form in forms:
return form
raise FormNotFound(
Expand All @@ -145,7 +148,7 @@ def _find_form(el, form_id=None, form_index=None):
"No form with the name or id of %r (forms: %s)"
% (id, ', '.join(_find_form_ids(el))))
if form_index is not None:
forms = el.getiterator('form')
forms = _forms_xpath(el)
try:
return forms[form_index]
except IndexError:
Expand All @@ -154,7 +157,7 @@ def _find_form(el, form_id=None, form_index=None):
% (form_index, len(forms)))

def _find_form_ids(el):
forms = el.getiterator('form')
forms = _forms_xpath(el)
if not forms:
yield '(no forms)'
return
Expand Down Expand Up @@ -254,11 +257,11 @@ def insert_errors_html(html, values, **kw):
return doc

def _insert_error(el, error, error_class, error_creator):
if el.tag in defs.empty_tags or el.tag == 'textarea':
if _nons(el.tag) in defs.empty_tags or _nons(el.tag) == 'textarea':
is_block = False
else:
is_block = True
if el.tag != 'form' and error_class:
if _nons(el.tag) != 'form' and error_class:
_add_class(el, error_class)
if el.get('id'):
labels = _label_for_xpath(el, for_id=el.get('id'))
Expand Down

0 comments on commit 8864abe

Please sign in to comment.