Skip to content
This repository has been archived by the owner on Sep 18, 2019. It is now read-only.

Commit

Permalink
Add stylesheet.encoding attribute; better error messages for @charset
Browse files Browse the repository at this point in the history
  • Loading branch information
SimonSapin committed Mar 23, 2012
1 parent 32f4fe2 commit 153c14b
Show file tree
Hide file tree
Showing 6 changed files with 62 additions and 36 deletions.
63 changes: 40 additions & 23 deletions tinycss/core.py
Expand Up @@ -11,7 +11,7 @@
"""

from __future__ import unicode_literals
from itertools import chain
from itertools import chain, islice

from .decoding import decode
from .tokenizer import tokenize_grouped
Expand Down Expand Up @@ -46,10 +46,15 @@ class Stylesheet(object):
.. attribute:: errors
a list of :class:`ParseError`
.. attribute:: encoding
The character encoding used to decode the stylesheet from bytes,
or ``None`` for Unicode stylesheets.
"""
def __init__(self, statements, errors):
def __init__(self, statements, errors, encoding):
self.statements = statements
self.errors = errors
self.encoding = encoding

def __repr__(self): # pragma: no cover
return '<{0.__class__.__name__} {1} rules {2} errors>'.format(
Expand Down Expand Up @@ -209,6 +214,26 @@ def __repr__(self): # pragma: no cover
return ('<{0.__class__.__name__}: {0.message}>'.format(self))


def _remove_at_charset(tokens):
    """Drop a leading, well-formed ``@charset`` rule from a token stream.

    The rule is only recognised in its strict form, as the very first four
    tokens: the ``@charset`` at-keyword, exactly one space, a string written
    with a double quote, and a semicolon. Any other stream is handed back
    with all of its tokens intact.

    :param tokens:
        An iterable of tokens
    :returns:
        An iterable of tokens, without the leading ``@charset`` rule
        when a valid one was found

    """
    stream = iter(tokens)
    # Pull (at most) the first four tokens aside so they can be inspected;
    # they are re-attached below unless they form a valid @charset rule.
    head = list(islice(stream, 4))
    kinds = [token.type for token in head]
    if kinds != ['ATKEYWORD', 'S', 'STRING', ';']:
        return chain(head, stream)

    keyword, whitespace, quoted = head[:3]
    is_charset = (keyword.value, whitespace.value) == ('@charset', ' ')
    if is_charset and quoted.as_css[0] == '"':
        # Valid @charset rule: only what follows it is kept.
        return stream
    return chain(head, stream)


class CoreParser(object):
"""
Currently the parser holds no state. It is only a class to allow
Expand Down Expand Up @@ -241,24 +266,29 @@ def parse_stylesheet_bytes(self, css_bytes, protocol_encoding=None,
A :class:`Stylesheet`.
"""
css_unicode = decode(css_bytes, protocol_encoding,
linking_encoding, document_encoding)
return self.parse_stylesheet(css_unicode)
css_unicode, encoding = decode(css_bytes, protocol_encoding,
linking_encoding, document_encoding)
return self.parse_stylesheet(css_unicode, encoding=encoding)

def parse_stylesheet(self, css_unicode):
def parse_stylesheet(self, css_unicode, encoding=None):
"""Parse a stylesheet from an Unicode string.
:param css_unicode:
A CSS stylesheet as a Unicode string.
:param encoding:
The character encoding used to decode the stylesheet from bytes,
if any.
:return:
A :class:`Stylesheet`.
"""
tokens = tokenize_grouped(css_unicode)
if encoding:
tokens = _remove_at_charset(tokens)
errors = []
statements = self.parse_statements(
tokens, errors, context='stylesheet')
return Stylesheet(statements, errors)
return Stylesheet(statements, errors, encoding)

def parse_style_attr(self, css_source):
"""Parse a "style" attribute (eg. of an HTML element).
Expand Down Expand Up @@ -296,8 +326,7 @@ def parse_statements(self, tokens, errors, context):
rule = self.read_at_rule(token, tokens)
result = self.parse_at_rule(
rule, rules, errors, context)
if result:
rules.append(result)
rules.append(result)
else:
rule, rule_errors = self.parse_ruleset(token, tokens)
rules.append(rule)
Expand Down Expand Up @@ -335,22 +364,10 @@ def parse_at_rule(self, rule, previous_rules, errors, context):
"""
if rule.at_keyword == '@charset':
# (1, 1) assumes that the byte order mark (BOM), if any,
# was removed when decoding bytes to Unicode.
# This also implies context == 'stylesheet':
if (rule.line, rule.column) == (1, 1):
if not (len(rule.head) == 1 and rule.head[0].type == 'STRING'
and rule.head[0].as_css[0] == '"' and not rule.body):
raise ParseError(rule, 'invalid @charset rule')
else:
raise ParseError(rule,
'@charset rule not at the beginning of the stylesheet')
# The rule is valid, but ignored.
# (It should not appear in stylesheet.rules)
return False
raise ParseError(rule, 'mis-placed or malformed @charset rule')
else:
raise ParseError(rule, 'unknown at-rule in {0} context: {1}'
.format(context, rule.at_keyword))
.format(context, rule.at_keyword))

def read_at_rule(self, at_keyword_token, tokens):
"""Read an at-rule.
Expand Down
3 changes: 1 addition & 2 deletions tinycss/css21.py
Expand Up @@ -289,8 +289,7 @@ def parse_page_block(self, body, errors):
rule = self.read_at_rule(token, tokens)
result = self.parse_at_rule(
rule, at_rules, errors, '@page')
if result:
at_rules.append(result)
at_rules.append(result)
except ParseError as err:
errors.append(err)
elif token.type != 'S':
Expand Down
11 changes: 6 additions & 5 deletions tinycss/decoding.py
Expand Up @@ -39,13 +39,14 @@ def decode(css_bytes, protocol_encoding=None,
:raises:
:class:`UnicodeDecodeError` if decoding failed
:return:
Unicode string, with any BOM removed
A tuple of a Unicode string, with any BOM removed, and the
encoding that was used.
"""
if protocol_encoding:
css_unicode = try_encoding(css_bytes, protocol_encoding)
if css_unicode is not None:
return css_unicode
return css_unicode, protocol_encoding
for encoding, pattern in ENCODING_MAGIC_NUMBERS:
match = pattern(css_bytes)
if match:
Expand All @@ -59,14 +60,14 @@ def decode(css_bytes, protocol_encoding=None,
css_unicode = try_encoding(css_bytes, encoding)
if css_unicode and not (has_at_charset and not
css_unicode.startswith('@charset "')):
return css_unicode
return css_unicode, encoding
break
for encoding in [linking_encoding, document_encoding]:
if encoding:
css_unicode = try_encoding(css_bytes, encoding)
if css_unicode is not None:
return css_unicode
return try_encoding(css_bytes, 'utf8', fallback=False)
return css_unicode, encoding
return try_encoding(css_bytes, 'utf8', fallback=False), 'utf8'


def try_encoding(css_bytes, encoding, fallback=True):
Expand Down
14 changes: 10 additions & 4 deletions tinycss/tests/test_core.py
Expand Up @@ -32,9 +32,14 @@ def parse_at_rule(self, rule, stylesheet_rules, errors, context):
('@import "é";'.encode('utf16'), {}, 'é'), # with a BOM
('@import "é";'.encode('latin1'), {}, None),
('@charset "latin1";@import "é";'.encode('latin1'), {}, 'é'),
(' @charset "latin1";@import "é";'.encode('latin1'), {}, None),
('@import "é";'.encode('latin1'), {'document_encoding': 'latin1'}, 'é'),
('@import "é";'.encode('latin1'), {'document_encoding': 'utf8'}, None),
('@charset "utf8"; @import "é";'.encode('utf8'),
{'document_encoding': 'latin1'}, 'é'),
# Mojibake yay!
(' @charset "utf8"; @import "é";'.encode('utf8'),
{'document_encoding': 'latin1'}, 'é'),
('@import "é";'.encode('utf8'), {'document_encoding': 'latin1'}, 'é'),
])
def test_bytes(css_bytes, kwargs, expected_result):
Expand All @@ -56,13 +61,14 @@ def test_bytes(css_bytes, kwargs, expected_result):
('foo{} @page{} bar{}', 2,
['unknown at-rule in stylesheet context: @page']),
('@charset "ascii"; foo {}', 1, []),
(' @charset "ascii"; foo {}', 1, ['@charset rule not at the beginning']),
('@charset ascii; foo {}', 1, ['invalid @charset']),
('foo {} @charset "ascii";', 1, ['@charset rule not at the beginning']),
(' @charset "ascii"; foo {}', 1, ['mis-placed or malformed @charset rule']),
('@charset ascii; foo {}', 1, ['mis-placed or malformed @charset rule']),
('foo {} @charset "ascii";', 1, ['mis-placed or malformed @charset rule']),
])
def test_at_rules(css_source, expected_rules, expected_errors):
# Pass 'encoding' to allow @charset
# Not using TestParser here:
stylesheet = CoreParser().parse_stylesheet(css_source)
stylesheet = CoreParser().parse_stylesheet(css_source, encoding='utf8')
assert_errors(stylesheet.errors, expected_errors)
result = len(stylesheet.statements)
assert result == expected_rules
Expand Down
3 changes: 2 additions & 1 deletion tinycss/tests/test_css21.py
Expand Up @@ -37,7 +37,8 @@
[], ["expected ';', got a block"]),
])
def test_at_import(css_source, expected_rules, expected_errors):
stylesheet = CSS21Parser().parse_stylesheet(css_source)
# Pass 'encoding' to allow @charset
stylesheet = CSS21Parser().parse_stylesheet(css_source, encoding='utf8')
assert_errors(stylesheet.errors, expected_errors)

result = [
Expand Down
4 changes: 3 additions & 1 deletion tinycss/tests/test_decoding.py
Expand Up @@ -29,6 +29,8 @@ def params(css, encoding, use_bom=False, expect_error=False, **kwargs):
params('é', 'latin1', document_encoding='ISO-8859-1'),
params('é', 'latin1', protocol_encoding='utf8',
document_encoding='latin1'),
params('@charset "utf8"; é', 'latin1', expect_error=True),
params('@charset "uùùùùtf8"; é', 'latin1', expect_error=True),
params('@charset "utf8"; é', 'latin1', document_encoding='latin1'),
params('é', 'latin1', linking_encoding='utf8',
document_encoding='latin1'),
Expand Down Expand Up @@ -65,7 +67,7 @@ def test_decode(css, encoding, use_bom, expect_error, kwargs):
source = css
css_bytes = source.encode(encoding)
try:
result = decode(css_bytes, **kwargs)
result, result_encoding = decode(css_bytes, **kwargs)
except UnicodeDecodeError as exc:
result = exc
if expect_error:
Expand Down

0 comments on commit 153c14b

Please sign in to comment.