# coding: utf8
"""
    tinycss.tokenizer
    -----------------

    Tokenizer for the CSS core syntax:
    http://www.w3.org/TR/CSS21/syndata.html#tokenization

    This is the pure-python implementation. See also speedups.pyx

    :copyright: (c) 2012 by Simon Sapin.
    :license: BSD, see LICENSE for more details.
"""

from __future__ import unicode_literals

from . import token_data


def tokenize_flat(css_source, ignore_comments=True,
        # Make these local variables to avoid global lookups in the loop
        tokens_dispatch=token_data.TOKEN_DISPATCH,
        unicode_unescape=token_data.UNICODE_UNESCAPE,
        newline_unescape=token_data.NEWLINE_UNESCAPE,
        simple_unescape=token_data.SIMPLE_UNESCAPE,
        find_newlines=token_data.FIND_NEWLINES,
        Token=token_data.Token,
        len=len,
        int=int,
        float=float,
        list=list,
        _None=None,
        ):
    """
    :param css_source:
        CSS as a unicode string
    :param ignore_comments:
        if true (the default) comments will not be included in the
        return value
    :return:
        An iterator of :class:`Token`

    """

    pos = 0
    line = 1
    column = 1
    source_len = len(css_source)
    tokens = []
    while pos < source_len:
        char = css_source[pos]
        if char in ':;{}()[]':
            type_ = char
            css_value = char
        else:
            codepoint = min(ord(char), 160)
            for _index, type_, regexp in tokens_dispatch[codepoint]:
                match = regexp(css_source, pos)
                if match:
                    # First match is the longest. See comments on TOKENS
                    # in token_data.
                    css_value = match.group()
                    break
            else:
                # No match.
                # "Any other character not matched by the above rules,
                # and neither a single nor a double quote."
                # ... but quotes at the start of a token are always matched
                # by STRING or BAD_STRING. So DELIM is any single character.
                type_ = 'DELIM'
                css_value = char
        length = len(css_value)
        next_pos = pos + length

        # A BAD_COMMENT is a comment at EOF. Ignore it too.
        if not (ignore_comments and type_ in ('COMMENT', 'BAD_COMMENT')):
            # Parse numbers, extract strings and URIs, unescape
            unit = _None
            if type_ == 'DIMENSION':
                value = match.group(1)
                value = float(value) if '.' in value else int(value)
                unit = match.group(2)
                unit = simple_unescape(unit)
                unit = unicode_unescape(unit)
                unit = unit.lower()  # normalize
            elif type_ == 'PERCENTAGE':
                value = css_value[:-1]
                value = float(value) if '.' in value else int(value)
                unit = '%'
            elif type_ == 'NUMBER':
                value = css_value
                if '.' in value:
                    value = float(value)
                else:
                    value = int(value)
                    type_ = 'INTEGER'
            elif type_ in ('IDENT', 'ATKEYWORD', 'HASH', 'FUNCTION'):
                value = simple_unescape(css_value)
                value = unicode_unescape(value)
            elif type_ == 'URI':
                value = match.group(1)
                if value and value[0] in '"\'':
                    value = value[1:-1]  # Remove quotes
                    value = newline_unescape(value)
                value = simple_unescape(value)
                value = unicode_unescape(value)
            elif type_ == 'STRING':
                value = css_value[1:-1]  # Remove quotes
                value = newline_unescape(value)
                value = simple_unescape(value)
                value = unicode_unescape(value)
            # BAD_STRING can only be one of:
            # * Unclosed string at the end of the stylesheet:
            #   Close the string, but this is not an error.
            #   Make it a "good" STRING token.
            # * Unclosed string at the (unescaped) end of the line:
            #   Close the string, but this is an error.
            #   Leave it as a BAD_STRING, don’t bother parsing it.
            # See http://www.w3.org/TR/CSS21/syndata.html#parsing-errors
            elif type_ == 'BAD_STRING' and next_pos == source_len:
                type_ = 'STRING'
                value = css_value[1:]  # Remove quote
                value = newline_unescape(value)
                value = simple_unescape(value)
                value = unicode_unescape(value)
            else:
                value = css_value
            tokens.append(Token(type_, css_value, value, unit, line, column))

        pos = next_pos
        newlines = list(find_newlines(css_value))
        if newlines:
            line += len(newlines)
            # Add 1 to have lines start at column 1, not 0
            column = length - newlines[-1].end() + 1
        else:
            column += length
    return tokens
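
# Usage sketch for ``tokenize_flat`` (illustrative only; it assumes the usual
# CSS core token names defined in token_data, e.g. 'S' for whitespace, and
# that ``Token`` exposes its constructor arguments as ``.type``, ``.value``
# and ``.unit``):
#
#     >>> tokens = list(tokenize_flat('p { margin: 2em }'))
#     >>> [t.type for t in tokens]
#     ['IDENT', 'S', '{', 'S', 'IDENT', ':', 'S', 'DIMENSION', 'S', '}']
#     >>> tokens[7].value
#     2
#     >>> print(tokens[7].unit)
#     em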


def regroup(tokens):
    """
    Match pairs of tokens: () [] {} function()
    (Strings in "" or '' are taken care of by the tokenizer.)

    Opening tokens are replaced by a :class:`ContainerToken`.
    Closing tokens are removed. Unmatched closing tokens are invalid
    but left as-is. All nested structures that are still open at
    the end of the stylesheet are implicitly closed.

    :param tokens:
        a *flat* iterable of tokens, as returned by :func:`tokenize_flat`.
    :return:
        A tree of tokens.

    """
    # "global" objects for the inner recursion
    pairs = {'FUNCTION': ')', '(': ')', '[': ']', '{': '}'}
    tokens = iter(tokens)
    eof = [False]

    def _regroup_inner(stop_at=None,
                       tokens=tokens, pairs=pairs, eof=eof,
                       ContainerToken=token_data.ContainerToken,
                       FunctionToken=token_data.FunctionToken):
        for token in tokens:
            type_ = token.type
            if type_ == stop_at:
                return

            end = pairs.get(type_)
            if end is None:
                yield token  # Not a grouping token
            else:
                assert not isinstance(token, ContainerToken), (
                    'Token looks already grouped: {0}'.format(token))
                content = list(_regroup_inner(end))
                if eof[0]:
                    end = ''  # Implicit end of structure at EOF.
                if type_ == 'FUNCTION':
                    yield FunctionToken(token.type, token.as_css(), end,
                                        token.value, content,
                                        token.line, token.column)
                else:
                    yield ContainerToken(token.type, token.as_css(), end,
                                         content,
                                         token.line, token.column)
        else:
            eof[0] = True  # end of file/stylesheet
    return _regroup_inner()
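
# Usage sketch for ``regroup`` (illustrative only; the ``.content`` attribute
# name on container tokens is assumed from the constructor calls above):
#
#     >>> tree = list(regroup(tokenize_flat('{ rgb(0, 0, 0) }')))
#     >>> [t.type for t in tree]
#     ['{']
#     >>> [t.type for t in tree[0].content]
#     ['S', 'FUNCTION', 'S']
#     >>> [t.type for t in tree[0].content[1].content]
#     ['INTEGER', 'DELIM', 'S', 'INTEGER', 'DELIM', 'S', 'INTEGER']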


def tokenize_grouped(css_source, ignore_comments=True):
    """
    :param css_source:
        CSS as a unicode string
    :param ignore_comments:
        if true (the default) comments will not be included in the
        return value
    :return:
        An iterator of :class:`Token`

    """
    return regroup(tokenize_flat(css_source, ignore_comments))
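
# Usage sketch for ``tokenize_grouped`` (illustrative only): same as calling
# ``tokenize_flat`` then ``regroup``, starting from the source text.
#
#     >>> [t.type for t in tokenize_grouped('p { margin: 2em }')]
#     ['IDENT', 'S', '{']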


# Optional Cython version of tokenize_flat
# Make both versions available with explicit names for tests.
python_tokenize_flat = tokenize_flat
try:
    from . import speedups
except ImportError:
    cython_tokenize_flat = None
else:
    cython_tokenize_flat = speedups.tokenize_flat
    # Default to the Cython version if available
    tokenize_flat = cython_tokenize_flat
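
# Sketch: tests (or callers) can pick an implementation explicitly, or check
# which one became the default.  This relies only on the names bound above.
#
#     from tinycss import tokenizer
#     flat = tokenizer.python_tokenize_flat('a { color: red }')
#     using_cython = tokenizer.cython_tokenize_flat is not None
#     assert tokenizer.tokenize_flat is (
#         tokenizer.cython_tokenize_flat or tokenizer.python_tokenize_flat)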