Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
tag: v0.7.1
Fetching contributors…

Cannot retrieve contributors at this time

721 lines (604 sloc) 22.197 kb
# coding: utf8
"""
cssselect.parser
================
Tokenizer, parser and parsed objects for CSS selectors.
:copyright: (c) 2007-2012 Ian Bicking and contributors.
See AUTHORS for more details.
:license: BSD, see LICENSE for more details.
"""
import sys
import re
import operator
if sys.version_info[0] < 3:
_unicode = unicode
_unichr = unichr
else:
_unicode = str
_unichr = chr
def ascii_lower(string):
"""Lower-case, but only in the ASCII range."""
return string.encode('utf8').lower().decode('utf8')
class SelectorError(Exception):
"""Common parent for :class:`SelectorSyntaxError` and
:class:`ExpressionError`.
You can just use ``except SelectorError:`` when calling
:meth:`~GenericTranslator.css_to_xpath` and handle both exceptions types.
"""
class SelectorSyntaxError(SelectorError, SyntaxError):
"""Parsing a selector that does not match the grammar."""
#### Parsed objects
class Selector(object):
"""
Represents a parsed selector.
:meth:`~GenericTranslator.selector_to_xpath` accepts this object,
but ignores :attr:`pseudo_element`. It is the user’s responsibility
to account for pseudo-elements and reject selectors with unknown
or unsupported pseudo-elements.
"""
def __init__(self, tree, pseudo_element=None):
self.parsed_tree = tree
if pseudo_element is not None:
pseudo_element = ascii_lower(pseudo_element)
#: The identifier for the pseudo-element as a string, or ``None``.
#:
#: +-------------------------+----------------+----------------+
#: | | Selector | Pseudo-element |
#: +=========================+================+================+
#: | CSS3 syntax | ``a::before`` | ``'before'`` |
#: +-------------------------+----------------+----------------+
#: | Older syntax | ``a:before`` | ``'before'`` |
#: +-------------------------+----------------+----------------+
#: | From the Lists3_ draft, | ``li::marker`` | ``'marker'`` |
#: | not in Selectors3 | | |
#: +-------------------------+----------------+----------------+
#: | Invalid pseudo-class | ``li:marker`` | ``None`` |
#: +-------------------------+----------------+----------------+
#:
#: .. _Lists3: http://www.w3.org/TR/2011/WD-css3-lists-20110524/#marker-pseudoelement
self.pseudo_element = pseudo_element
def __repr__(self):
if self.pseudo_element:
pseudo_element = '::%s' % self.pseudo_element
else:
pseudo_element = ''
return '%s[%r%s]' % (
self.__class__.__name__, self.parsed_tree, pseudo_element)
def specificity(self):
"""Return the specificity_ of this selector as a tuple of 3 integers.
.. _specificity: http://www.w3.org/TR/selectors/#specificity
"""
a, b, c = self.parsed_tree.specificity()
if self.pseudo_element:
c += 1
return a, b, c
class Class(object):
"""
Represents selector.class_name
"""
def __init__(self, selector, class_name):
self.selector = selector
self.class_name = class_name
def __repr__(self):
return '%s[%r.%s]' % (
self.__class__.__name__, self.selector, self.class_name)
def specificity(self):
a, b, c = self.selector.specificity()
b += 1
return a, b, c
class Function(object):
"""
Represents selector:name(expr)
"""
def __init__(self, selector, name, arguments):
self.selector = selector
self.name = ascii_lower(name)
self.arguments = arguments
def __repr__(self):
return '%s[%r:%s(%r)]' % (
self.__class__.__name__, self.selector, self.name,
[token.value for token in self.arguments])
def argument_types(self):
return [token.type for token in self.arguments]
def specificity(self):
a, b, c = self.selector.specificity()
b += 1
return a, b, c
class Pseudo(object):
"""
Represents selector:ident
"""
def __init__(self, selector, ident):
self.selector = selector
self.ident = ascii_lower(ident)
def __repr__(self):
return '%s[%r:%s]' % (
self.__class__.__name__, self.selector, self.ident)
def specificity(self):
a, b, c = self.selector.specificity()
b += 1
return a, b, c
class Negation(object):
"""
Represents selector:not(subselector)
"""
def __init__(self, selector, subselector):
self.selector = selector
self.subselector = subselector
def __repr__(self):
return '%s[%r:not(%r)]' % (
self.__class__.__name__, self.selector, self.subselector)
def specificity(self):
a1, b1, c1 = self.selector.specificity()
a2, b2, c2 = self.subselector.specificity()
return a1 + a2, b1 + b2, c1 + c2
class Attrib(object):
"""
Represents selector[namespace|attrib operator value]
"""
def __init__(self, selector, namespace, attrib, operator, value):
self.selector = selector
self.namespace = namespace
self.attrib = attrib
self.operator = operator
self.value = value
def __repr__(self):
if self.namespace:
attrib = '%s|%s' % (self.namespace, self.attrib)
else:
attrib = self.attrib
if self.operator == 'exists':
return '%s[%r[%s]]' % (
self.__class__.__name__, self.selector, attrib)
else:
return '%s[%r[%s %s %r]]' % (
self.__class__.__name__, self.selector, attrib,
self.operator, self.value)
def specificity(self):
a, b, c = self.selector.specificity()
b += 1
return a, b, c
class Element(object):
"""
Represents namespace|element
`None` is for the universal selector '*'
"""
def __init__(self, namespace=None, element=None):
self.namespace = namespace
self.element = element
def __repr__(self):
element = self.element or '*'
if self.namespace:
element = '%s|%s' % (self.namespace, element)
return '%s[%s]' % (self.__class__.__name__, element)
def specificity(self):
if self.element:
return 0, 0, 1
else:
return 0, 0, 0
class Hash(object):
"""
Represents selector#id
"""
def __init__(self, selector, id):
self.selector = selector
self.id = id
def __repr__(self):
return '%s[%r#%s]' % (
self.__class__.__name__, self.selector, self.id)
def specificity(self):
a, b, c = self.selector.specificity()
a += 1
return a, b, c
class CombinedSelector(object):
def __init__(self, selector, combinator, subselector):
assert selector is not None
self.selector = selector
self.combinator = combinator
self.subselector = subselector
def __repr__(self):
if self.combinator == ' ':
comb = '<followed>'
else:
comb = self.combinator
return '%s[%r %s %r]' % (
self.__class__.__name__, self.selector, comb, self.subselector)
def specificity(self):
a1, b1, c1 = self.selector.specificity()
a2, b2, c2 = self.subselector.specificity()
return a1 + a2, b1 + b2, c1 + c2
#### Parser
# foo
_el_re = re.compile(r'^[ \t\r\n\f]*([a-zA-Z]+)[ \t\r\n\f]*$')
# foo#bar or #bar
_id_re = re.compile(r'^[ \t\r\n\f]*([a-zA-Z]*)#([a-zA-Z0-9_-]+)[ \t\r\n\f]*$')
# foo.bar or .bar
_class_re = re.compile(
r'^[ \t\r\n\f]*([a-zA-Z]*)\.([a-zA-Z][a-zA-Z0-9_-]*)[ \t\r\n\f]*$')
def parse(css):
"""Parse a CSS *group of selectors*.
If you don't care about pseudo-elements or selector specificity,
you can skip this and use :meth:`~GenericTranslator.css_to_xpath`.
:param css:
A *group of selectors* as an Unicode string.
:raises:
:class:`SelectorSyntaxError` on invalid selectors.
:returns:
A list of parsed :class:`Selector` objects, one for each
selector in the comma-separated group.
"""
# Fast path for simple cases
match = _el_re.match(css)
if match:
return [Selector(Element(element=match.group(1)))]
match = _id_re.match(css)
if match is not None:
return [Selector(Hash(Element(element=match.group(1) or None),
match.group(2)))]
match = _class_re.match(css)
if match is not None:
return [Selector(Class(Element(element=match.group(1) or None),
match.group(2)))]
stream = TokenStream(tokenize(css))
stream.source = css
return list(parse_selector_group(stream))
# except SelectorSyntaxError:
# e = sys.exc_info()[1]
# message = "%s at %s -> %r" % (
# e, stream.used, stream.peek())
# e.msg = message
# if sys.version_info < (2,6):
# e.message = message
# e.args = tuple([message])
# raise
def parse_selector_group(stream):
stream.skip_whitespace()
while 1:
yield Selector(*parse_selector(stream))
if stream.peek() == ('DELIM', ','):
stream.next()
stream.skip_whitespace()
else:
break
def parse_selector(stream):
result, pseudo_element = parse_simple_selector(stream)
while 1:
stream.skip_whitespace()
peek = stream.peek()
if peek in (('EOF', None), ('DELIM', ',')):
break
if pseudo_element:
raise SelectorSyntaxError(
'Got pseudo-element ::%s not at the end of a selector'
% pseudo_element)
if peek.is_delim('+', '>', '~'):
# A combinator
combinator = stream.next().value
stream.skip_whitespace()
else:
# By exclusion, the last parse_simple_selector() ended
# at peek == ' '
combinator = ' '
next_selector, pseudo_element = parse_simple_selector(stream)
result = CombinedSelector(result, combinator, next_selector)
return result, pseudo_element
def parse_simple_selector(stream, inside_negation=False):
stream.skip_whitespace()
selector_start = len(stream.used)
peek = stream.peek()
if peek.type == 'IDENT' or peek == ('DELIM', '*'):
if peek.type == 'IDENT':
namespace = stream.next().value
else:
stream.next()
namespace = None
if stream.peek() == ('DELIM', '|'):
stream.next()
element = stream.next_ident_or_star()
else:
element = namespace
namespace = None
else:
element = namespace = None
result = Element(namespace, element)
pseudo_element = None
while 1:
peek = stream.peek()
if peek.type in ('S', 'EOF') or peek.is_delim(',', '+', '>', '~') or (
inside_negation and peek == ('DELIM', ')')):
break
if pseudo_element:
raise SelectorSyntaxError(
'Got pseudo-element ::%s not at the end of a selector'
% pseudo_element)
if peek.type == 'HASH':
result = Hash(result, stream.next().value)
elif peek == ('DELIM', '.'):
stream.next()
result = Class(result, stream.next_ident())
elif peek == ('DELIM', '['):
stream.next()
result = parse_attrib(result, stream)
elif peek == ('DELIM', ':'):
stream.next()
if stream.peek() == ('DELIM', ':'):
stream.next()
pseudo_element = stream.next_ident()
continue
ident = stream.next_ident()
if ident.lower() in ('first-line', 'first-letter',
'before', 'after'):
# Special case: CSS 2.1 pseudo-elements can have a single ':'
# Any new pseudo-element must have two.
pseudo_element = _unicode(ident)
continue
if stream.peek() != ('DELIM', '('):
result = Pseudo(result, ident)
continue
stream.next()
stream.skip_whitespace()
if ident.lower() == 'not':
if inside_negation:
raise SelectorSyntaxError('Got nested :not()')
argument, argument_pseudo_element = parse_simple_selector(
stream, inside_negation=True)
next = stream.next()
if argument_pseudo_element:
raise SelectorSyntaxError(
'Got pseudo-element ::%s inside :not() at %s'
% (argument_pseudo_element, next.pos))
if next != ('DELIM', ')'):
raise SelectorSyntaxError("Expected ')', got %s" % (next,))
result = Negation(result, argument)
else:
arguments = []
while 1:
stream.skip_whitespace()
next = stream.next()
if next.type in ('IDENT', 'STRING', 'NUMBER') or next in [
('DELIM', '+'), ('DELIM', '-')]:
arguments.append(next)
elif next == ('DELIM', ')'):
break
else:
raise SelectorSyntaxError(
"Expected an argument, got %s" % (next,))
if not arguments:
raise SelectorSyntaxError(
"Expected at least one argument, got %s" % (next,))
result = Function(result, ident, arguments)
else:
raise SelectorSyntaxError(
"Expected selector, got %s" % (peek,))
if len(stream.used) == selector_start:
raise SelectorSyntaxError(
"Expected selector, got %s" % (stream.peek(),))
return result, pseudo_element
def parse_attrib(selector, stream):
stream.skip_whitespace()
attrib = stream.next_ident_or_star()
if attrib is None and stream.peek() != ('DELIM', '|'):
raise SelectorSyntaxError(
"Expected '|', got %s" % (stream.peek(),))
if stream.peek() == ('DELIM', '|'):
stream.next()
if stream.peek() == ('DELIM', '='):
namespace = None
stream.next()
op = '|='
else:
namespace = attrib
attrib = stream.next_ident()
op = None
else:
namespace = op = None
if op is None:
stream.skip_whitespace()
next = stream.next()
if next == ('DELIM', ']'):
return Attrib(selector, namespace, attrib, 'exists', None)
elif next == ('DELIM', '='):
op = '='
elif next.is_delim('^', '$', '*', '~', '|', '!') and (
stream.peek() == ('DELIM', '=')):
op = next.value + '='
stream.next()
else:
raise SelectorSyntaxError(
"Operator expected, got %s" % (next,))
stream.skip_whitespace()
value = stream.next()
if value.type not in ('IDENT', 'STRING'):
raise SelectorSyntaxError(
"Expected string or ident, got %s" % (value,))
stream.skip_whitespace()
next = stream.next()
if next != ('DELIM', ']'):
raise SelectorSyntaxError(
"Expected ']', got %s" % (next,))
return Attrib(selector, namespace, attrib, op, value.value)
def parse_series(tokens):
"""
Parses the arguments for :nth-child() and friends.
:raises: A list of tokens
:returns: :``(a, b)``
"""
for token in tokens:
if token.type == 'STRING':
raise ValueError('String tokens not allowed in series.')
s = ''.join(token.value for token in tokens).strip()
if s == 'odd':
return (2, 1)
elif s == 'even':
return (2, 0)
elif s == 'n':
return (1, 0)
if 'n' not in s:
# Just b
return (0, int(s))
a, b = s.split('n', 1)
if not a:
a = 1
elif a == '-' or a == '+':
a = int(a+'1')
else:
a = int(a)
if not b:
b = 0
else:
b = int(b)
return (a, b)
#### Token objects
class Token(tuple):
def __new__(cls, type_, value, pos):
obj = tuple.__new__(cls, (type_, value))
obj.pos = pos
return obj
def __repr__(self):
return "<%s '%s' at %i>" % (self.type, self.value, self.pos)
def is_delim(self, *values):
return self.type == 'DELIM' and self.value in values
type = property(operator.itemgetter(0))
value = property(operator.itemgetter(1))
class EOFToken(Token):
def __new__(cls, pos):
return Token.__new__(cls, 'EOF', None, pos)
def __repr__(self):
return '<%s at %i>' % (self.type, self.pos)
#### Tokenizer
class TokenMacros:
unicode_escape = r'\\([0-9a-f]{1,6})(?:\r\n|[ \n\r\t\f])?'
escape = unicode_escape + r'|\\[^\n\r\f0-9a-f]'
string_escape = r'\\(?:\n|\r\n|\r|\f)|' + escape
nonascii = r'[^\0-\177]'
nmchar = '[_a-z0-9-]|%s|%s' % (escape, nonascii)
nmstart = '[_a-z]|%s|%s' % (escape, nonascii)
def _compile(pattern):
return re.compile(pattern % vars(TokenMacros), re.IGNORECASE).match
_match_whitespace = _compile(r'[ \t\r\n\f]+')
_match_number = _compile('[+-]?(?:[0-9]*\.[0-9]+|[0-9]+)')
_match_hash = _compile('#(?:%(nmchar)s)+')
_match_ident = _compile('-?(?:%(nmstart)s)(?:%(nmchar)s)*')
_match_string_by_quote = {
"'": _compile(r"([^\n\r\f\\']|%(string_escape)s)*"),
'"': _compile(r'([^\n\r\f\\"]|%(string_escape)s)*'),
}
_sub_simple_escape = re.compile(r'\\(.)').sub
_sub_unicode_escape = re.compile(TokenMacros.unicode_escape, re.I).sub
_sub_newline_escape =re.compile(r'\\(?:\n|\r\n|\r|\f)').sub
# Same as r'\1', but faster on CPython
if hasattr(operator, 'methodcaller'):
# Python 2.6+
_replace_simple = operator.methodcaller('group', 1)
else:
def _replace_simple(match):
return match.group(1)
def _replace_unicode(match):
codepoint = int(match.group(1), 16)
if codepoint > sys.maxunicode:
codepoint = 0xFFFD
return _unichr(codepoint)
def unescape_ident(value):
value = _sub_unicode_escape(_replace_unicode, value)
value = _sub_simple_escape(_replace_simple, value)
return value
def tokenize(s):
pos = 0
len_s = len(s)
while pos < len_s:
match = _match_whitespace(s, pos=pos)
if match:
yield Token('S', ' ', pos)
pos = match.end()
continue
match = _match_ident(s, pos=pos)
if match:
value = _sub_simple_escape(_replace_simple,
_sub_unicode_escape(_replace_unicode, match.group()))
yield Token('IDENT', value, pos)
pos = match.end()
continue
match = _match_hash(s, pos=pos)
if match:
value = _sub_simple_escape(_replace_simple,
_sub_unicode_escape(_replace_unicode, match.group()[1:]))
yield Token('HASH', value, pos)
pos = match.end()
continue
quote = s[pos]
if quote in _match_string_by_quote:
match = _match_string_by_quote[quote](s, pos=pos + 1)
assert match, 'Should have found at least an empty match'
end_pos = match.end()
if end_pos == len_s:
raise SelectorSyntaxError('Unclosed string at %s' % pos)
if s[end_pos] != quote:
raise SelectorSyntaxError('Invalid string at %s' % next_pos)
value = _sub_simple_escape(_replace_simple,
_sub_unicode_escape(_replace_unicode,
_sub_newline_escape('', match.group())))
yield Token('STRING', value, pos)
pos = end_pos + 1
continue
match = _match_number(s, pos=pos)
if match:
value = match.group()
yield Token('NUMBER', value, pos)
pos = match.end()
continue
pos2 = pos + 2
if s[pos:pos2] == '/*':
pos = s.find('*/', pos2)
if pos == -1:
pos = len_s
else:
pos += 2
continue
yield Token('DELIM', s[pos], pos)
pos += 1
assert pos == len_s
yield EOFToken(pos)
class TokenStream(object):
def __init__(self, tokens, source=None):
self.used = []
self.tokens = iter(tokens)
self.source = source
self.peeked = None
self._peeking = False
try:
self.next_token = self.tokens.next
except AttributeError:
# Python 3
self.next_token = self.tokens.__next__
def next(self):
if self._peeking:
self._peeking = False
self.used.append(self.peeked)
return self.peeked
else:
next = self.next_token()
self.used.append(next)
return next
def peek(self):
if not self._peeking:
self.peeked = self.next_token()
self._peeking = True
return self.peeked
def next_ident(self):
next = self.next()
if next.type != 'IDENT':
raise SelectorSyntaxError('Expected ident, got %s' % (next,))
return next.value
def next_ident_or_star(self):
next = self.next()
if next.type == 'IDENT':
return next.value
elif next == ('DELIM', '*'):
return None
else:
raise SelectorSyntaxError(
"Expected ident or '*', got %s" % (next,))
def skip_whitespace(self):
peek = self.peek()
if peek.type == 'S':
self.next()
Jump to Line
Something went wrong with that request. Please try again.