# NOTE: recovered from a web view of a 190-line (167 sloc) file; lines may be missing.
# coding: utf8
"""
    Cython module for speeding up inner loops.

    Right now only :func:`tokenize_flat` has a second implementation.

    :copyright: (c) 2010 by Simon Sapin.
    :license: BSD, see LICENSE for more details.
"""

from __future__ import unicode_literals
# Every name below is read by tokenize_flat() in this module.
from .token_data import (
    COMPILED_TOKEN_REGEXPS, UNICODE_UNESCAPE, NEWLINE_UNESCAPE,
    SIMPLE_UNESCAPE, FIND_NEWLINES, TOKEN_DISPATCH)


# Map each token type name to its position in COMPILED_TOKEN_REGEXPS so the
# tokenizer loop can compare small integers instead of strings.
COMPILED_TOKEN_INDEXES = dict(
    (name, i) for i, (name, regexp) in enumerate(COMPILED_TOKEN_REGEXPS))


cdef class CToken:
    """A token built by the Cython speedups.

    Identical to the pure-Python :class:`Token` (presumably the one defined
    in ``token_data`` -- TODO confirm), but stored as a C extension type.

    """
    # Plain tokens never contain other tokens.
    is_container = False

    # type:    token type name as passed by the tokenizer ('IDENT', 'DELIM',
    #          'INTEGER', ... or the punctuation character itself).
    # _as_css: the exact source text of the token.
    # value:   the parsed value (number, unescaped string, ...).
    # unit:    '%', a lower-cased dimension unit, or None.
    cdef public object type, _as_css, value, unit
    # Position of the token's first character in the source; both are
    # 1-based as produced by tokenize_flat().
    cdef public Py_ssize_t line, column

    def __init__(self, type_, css_value, value, unit, line, column):
        """Store the token fields as given; no validation is performed.

        :param type_: token type name
        :param css_value: source text of the token, returned by :meth:`as_css`
        :param value: parsed value of the token
        :param unit: unit string, or ``None`` when the token has no unit
        :param line: line number of the token's first character
        :param column: column number of the token's first character

        """
        self.type = type_
        self._as_css = css_value
        self.value = value
        self.unit = unit
        self.line = line
        self.column = column

    def as_css(self):
        """
        Return as an Unicode string the CSS representation of the token,
        as parsed in the source.
        """
        return self._as_css

    def __repr__(self):
        # ``unit or ''`` keeps unit-less tokens from printing "None".
        return ('<Token {0.type} at {0.line}:{0.column} {0.value!r}{1}>'
                .format(self, self.unit or ''))


def tokenize_flat(css_source, int ignore_comments=1):
    """Split a CSS source string into a flat list of tokens.

    :param css_source:
        CSS as an unicode string
    :param ignore_comments:
        if true (the default) comments will not be included in the
        return value
    :return:
        A list of :class:`CToken` objects, in source order.

    """
    # Make these local variable to avoid global lookups in the loop
    tokens_dispatch = TOKEN_DISPATCH
    compiled_token_indexes = COMPILED_TOKEN_INDEXES
    compiled_tokens = COMPILED_TOKEN_REGEXPS
    unicode_unescape = UNICODE_UNESCAPE
    newline_unescape = NEWLINE_UNESCAPE
    simple_unescape = SIMPLE_UNESCAPE
    find_newlines = FIND_NEWLINES

    # Use the integer indexes instead of string markers
    cdef Py_ssize_t BAD_COMMENT = compiled_token_indexes['BAD_COMMENT']
    cdef Py_ssize_t BAD_STRING = compiled_token_indexes['BAD_STRING']
    cdef Py_ssize_t PERCENTAGE = compiled_token_indexes['PERCENTAGE']
    cdef Py_ssize_t DIMENSION = compiled_token_indexes['DIMENSION']
    cdef Py_ssize_t ATKEYWORD = compiled_token_indexes['ATKEYWORD']
    cdef Py_ssize_t FUNCTION = compiled_token_indexes['FUNCTION']
    cdef Py_ssize_t COMMENT = compiled_token_indexes['COMMENT']
    cdef Py_ssize_t NUMBER = compiled_token_indexes['NUMBER']
    cdef Py_ssize_t STRING = compiled_token_indexes['STRING']
    cdef Py_ssize_t IDENT = compiled_token_indexes['IDENT']
    cdef Py_ssize_t HASH = compiled_token_indexes['HASH']
    cdef Py_ssize_t URI = compiled_token_indexes['URI']
    # DELIM has no regexp of its own, so it gets an index no regexp can have.
    cdef Py_ssize_t DELIM = -1

    cdef Py_ssize_t pos = 0
    cdef Py_ssize_t line = 1
    cdef Py_ssize_t column = 1
    cdef Py_ssize_t source_len = len(css_source)
    cdef Py_ssize_t n_tokens = len(compiled_tokens)
    cdef Py_ssize_t length, next_pos, type_
    cdef CToken token

    tokens = []
    while pos < source_len:
        char = css_source[pos]
        if char in ':;{}()[]':
            # Fast path: single-character punctuation needs no regexp.
            type_ = -1  # not parsed further anyway
            type_name = char
            css_value = char
        else:
            # The dispatch table is indexed by code point; everything at or
            # above 160 shares a single entry.
            codepoint = min(ord(char), 160)
            for type_, type_name, regexp in tokens_dispatch[codepoint]:
                match = regexp(css_source, pos)
                if match:
                    # First match is the longest. See comments on TOKENS above.
                    css_value = match.group()
                    break
            else:
                # No match.
                # "Any other character not matched by the above rules,
                #  and neither a single nor a double quote."
                # ... but quotes at the start of a token are always matched
                # by STRING or BAD_STRING. So DELIM is any single character.
                type_ = DELIM
                type_name = 'DELIM'
                css_value = char
        length = len(css_value)
        next_pos = pos + length

        # A BAD_COMMENT is a comment at EOF. Ignore it too.
        if not (ignore_comments and type_ in (COMMENT, BAD_COMMENT)):
            # Parse numbers, extract strings and URIs, unescape
            unit = None
            if type_ == DIMENSION:
                # NOTE(review): assumes the DIMENSION regexp captures the
                # number as group 1 and the unit as group 2 -- confirm
                # against token_data.
                value = match.group(1)
                value = float(value) if '.' in value else int(value)
                unit = match.group(2)
                unit = simple_unescape(unit)
                unit = unicode_unescape(unit)
                unit = unit.lower()  # normalize
            elif type_ == PERCENTAGE:
                value = css_value[:-1]  # Drop the trailing '%'
                value = float(value) if '.' in value else int(value)
                unit = '%'
            elif type_ == NUMBER:
                value = css_value
                if '.' in value:
                    value = float(value)
                else:
                    value = int(value)
                    type_name = 'INTEGER'
            elif type_ in (IDENT, ATKEYWORD, HASH, FUNCTION):
                # Name-like tokens: only escapes need resolving.
                value = simple_unescape(css_value)
                value = unicode_unescape(value)
            elif type_ == URI:
                # NOTE(review): assumes group 1 of the URI regexp is the
                # content between "url(" and ")" -- confirm against
                # token_data.
                value = match.group(1)
                if value and value[0] in '"\'':
                    value = value[1:-1]  # Remove quotes
                    # Presumably escaped newlines only occur in the quoted
                    # form, hence inside this branch -- TODO confirm.
                    value = newline_unescape(value)
                value = simple_unescape(value)
                value = unicode_unescape(value)
            elif type_ == STRING:
                value = css_value[1:-1]  # Remove quotes
                value = newline_unescape(value)
                value = simple_unescape(value)
                value = unicode_unescape(value)
            # BAD_STRING can only be one of:
            # * Unclosed string at the end of the stylesheet:
            #   Close the string, but this is not an error.
            #   Make it a "good" STRING token.
            # * Unclosed string at the (unescaped) end of the line:
            #   Close the string, but this is an error.
            #   Leave it as a BAD_STRING, don't bother parsing it.
            # See the CSS 2.1 specification, section 4.2
            # ("Rules for handling parsing errors").
            elif type_ == BAD_STRING and next_pos == source_len:
                type_name = 'STRING'
                value = css_value[1:]  # Remove quote
                value = newline_unescape(value)
                value = simple_unescape(value)
                value = unicode_unescape(value)
            else:
                value = css_value
            token = CToken(type_name, css_value, value, unit, line, column)
            tokens.append(token)

        # Advance the position even for skipped comments, so line/column
        # stay correct for the tokens that follow them.
        pos = next_pos
        newlines = list(find_newlines(css_value))
        if newlines:
            line += len(newlines)
            # Add 1 to have lines start at column 1, not 0
            column = length - newlines[-1].end() + 1
        else:
            column += length
    return tokens
# End of module.