Add stylesheet.encoding attribute; better error messages for @charset

Kozea · Mar 23, 2012 · 153c14b · 153c14b
1 parent 32f4fe2
commit 153c14b
Show file tree

Hide file tree

Showing 6 changed files with 62 additions and 36 deletions.
diff --git a/tinycss/core.py b/tinycss/core.py
@@ -11,7 +11,7 @@
 """
 
 from __future__ import unicode_literals
-from itertools import chain
+from itertools import chain, islice
 
 from .decoding import decode
 from .tokenizer import tokenize_grouped
@@ -46,10 +46,15 @@ class Stylesheet(object):
     .. attribute:: errors
         a list of :class:`ParseError`
 
+    .. attribute:: encoding
+        The character encoding used to decode the stylesheet from bytes,
+        or ``None`` for Unicode stylesheets.
+
     """
-    def __init__(self, statements, errors):
+    def __init__(self, statements, errors, encoding):
         self.statements = statements
         self.errors = errors
+        self.encoding = encoding
 
     def __repr__(self):  # pragma: no cover
         return '<{0.__class__.__name__} {1} rules {2} errors>'.format(
@@ -209,6 +214,26 @@ def __repr__(self):  # pragma: no cover
         return ('<{0.__class__.__name__}: {0.message}>'.format(self))
 
 
+def _remove_at_charset(tokens):
+    """Remove any valid @charset at the beginning of a token stream.
+
+    :param tokens:
+        An iterable of tokens
+    :returns:
+        A possibly truncated iterable of tokens
+
+    """
+    tokens = iter(tokens)
+    header = list(islice(tokens, 4))
+    if [t.type for t in header] == ['ATKEYWORD', 'S', 'STRING', ';']:
+        atkw, space, string, semicolon = header
+        if ((atkw.value, space.value) == ('@charset', ' ')
+                and string.as_css[0] == '"'):
+            # Found a valid @charset rule, only keep what’s after it.
+            return tokens
+    return chain(header, tokens)
+
+
 class CoreParser(object):
     """
     Currently the parser holds no state. It is only a class to allow
@@ -241,24 +266,29 @@ def parse_stylesheet_bytes(self, css_bytes, protocol_encoding=None,
             A :class:`Stylesheet`.
 
         """
-        css_unicode = decode(css_bytes, protocol_encoding,
-                             linking_encoding, document_encoding)
-        return self.parse_stylesheet(css_unicode)
+        css_unicode, encoding = decode(css_bytes, protocol_encoding,
+                                       linking_encoding, document_encoding)
+        return self.parse_stylesheet(css_unicode, encoding=encoding)
 
-    def parse_stylesheet(self, css_unicode):
+    def parse_stylesheet(self, css_unicode, encoding=None):
         """Parse a stylesheet from a Unicode string.
 
         :param css_unicode:
             A CSS stylesheet as a Unicode string.
+        :param encoding:
+            The character encoding used to decode the stylesheet from bytes,
+            if any.
         :return:
             A :class:`Stylesheet`.
 
         """
         tokens = tokenize_grouped(css_unicode)
+        if encoding:
+            tokens = _remove_at_charset(tokens)
         errors = []
         statements = self.parse_statements(
             tokens, errors, context='stylesheet')
-        return Stylesheet(statements, errors)
+        return Stylesheet(statements, errors, encoding)
 
     def parse_style_attr(self, css_source):
         """Parse a "style" attribute (eg. of an HTML element).
@@ -296,8 +326,7 @@ def parse_statements(self, tokens, errors, context):
                         rule = self.read_at_rule(token, tokens)
                         result = self.parse_at_rule(
                             rule, rules, errors, context)
-                        if result:
-                            rules.append(result)
+                        rules.append(result)
                     else:
                         rule, rule_errors = self.parse_ruleset(token, tokens)
                         rules.append(rule)
@@ -335,22 +364,10 @@ def parse_at_rule(self, rule, previous_rules, errors, context):
 
         """
         if rule.at_keyword == '@charset':
-            # (1, 1) assumes that the byte order mark (BOM), if any,
-            # was removed when decoding bytes to Unicode.
-            # This also implies context == 'stylesheet':
-            if (rule.line, rule.column) == (1, 1):
-                if not (len(rule.head) == 1 and rule.head[0].type == 'STRING'
-                        and rule.head[0].as_css[0] == '"' and not rule.body):
-                    raise ParseError(rule, 'invalid @charset rule')
-            else:
-                raise ParseError(rule,
-                    '@charset rule not at the beginning of the stylesheet')
-            # The rule is valid, but ignored.
-            # (It should not appear in stylesheet.rules)
-            return False
+            raise ParseError(rule, 'mis-placed or malformed @charset rule')
         else:
             raise ParseError(rule, 'unknown at-rule in {0} context: {1}'
-                .format(context, rule.at_keyword))
+                                    .format(context, rule.at_keyword))
 
     def read_at_rule(self, at_keyword_token, tokens):
         """Read an at-rule.

diff --git a/tinycss/css21.py b/tinycss/css21.py
@@ -289,8 +289,7 @@ def parse_page_block(self, body, errors):
                     rule = self.read_at_rule(token, tokens)
                     result = self.parse_at_rule(
                         rule, at_rules, errors, '@page')
-                    if result:
-                        at_rules.append(result)
+                    at_rules.append(result)
                 except ParseError as err:
                     errors.append(err)
             elif token.type != 'S':

diff --git a/tinycss/decoding.py b/tinycss/decoding.py
@@ -39,13 +39,14 @@ def decode(css_bytes, protocol_encoding=None,
     :raises:
         :class:`UnicodeDecodeError` if decoding failed
     :return:
-        Unicode string, with any BOM removed
+        A tuple of a Unicode string, with any BOM removed, and the
+        encoding that was used.
 
     """
     if protocol_encoding:
         css_unicode = try_encoding(css_bytes, protocol_encoding)
         if css_unicode is not None:
-            return css_unicode
+            return css_unicode, protocol_encoding
     for encoding, pattern in ENCODING_MAGIC_NUMBERS:
         match = pattern(css_bytes)
         if match:
@@ -59,14 +60,14 @@ def decode(css_bytes, protocol_encoding=None,
             css_unicode = try_encoding(css_bytes, encoding)
             if css_unicode and not (has_at_charset and not
                                     css_unicode.startswith('@charset "')):
-                return css_unicode
+                return css_unicode, encoding
             break
     for encoding in [linking_encoding, document_encoding]:
         if encoding:
             css_unicode = try_encoding(css_bytes, encoding)
             if css_unicode is not None:
-                return css_unicode
-    return try_encoding(css_bytes, 'utf8', fallback=False)
+                return css_unicode, encoding
+    return try_encoding(css_bytes, 'utf8', fallback=False), 'utf8'
 
 
 def try_encoding(css_bytes, encoding, fallback=True):

diff --git a/tinycss/tests/test_core.py b/tinycss/tests/test_core.py
@@ -32,9 +32,14 @@ def parse_at_rule(self, rule, stylesheet_rules, errors, context):
     ('@import "é";'.encode('utf16'), {}, 'é'),  # with a BOM
     ('@import "é";'.encode('latin1'), {}, None),
     ('@charset "latin1";@import "é";'.encode('latin1'), {}, 'é'),
+    (' @charset "latin1";@import "é";'.encode('latin1'), {}, None),
     ('@import "é";'.encode('latin1'), {'document_encoding': 'latin1'}, 'é'),
     ('@import "é";'.encode('latin1'), {'document_encoding': 'utf8'}, None),
+    ('@charset "utf8"; @import "é";'.encode('utf8'),
+        {'document_encoding': 'latin1'}, 'é'),
     # Mojibake yay!
+    (' @charset "utf8"; @import "é";'.encode('utf8'),
+        {'document_encoding': 'latin1'}, 'Ã©'),
     ('@import "é";'.encode('utf8'), {'document_encoding': 'latin1'}, 'Ã©'),
 ])
 def test_bytes(css_bytes, kwargs, expected_result):
@@ -56,13 +61,14 @@ def test_bytes(css_bytes, kwargs, expected_result):
     ('foo{} @page{} bar{}', 2,
         ['unknown at-rule in stylesheet context: @page']),
     ('@charset "ascii"; foo {}', 1, []),
-    (' @charset "ascii"; foo {}', 1, ['@charset rule not at the beginning']),
-    ('@charset ascii; foo {}', 1, ['invalid @charset']),
-    ('foo {} @charset "ascii";', 1, ['@charset rule not at the beginning']),
+    (' @charset "ascii"; foo {}', 1, ['mis-placed or malformed @charset rule']),
+    ('@charset ascii; foo {}', 1, ['mis-placed or malformed @charset rule']),
+    ('foo {} @charset "ascii";', 1, ['mis-placed or malformed @charset rule']),
 ])
 def test_at_rules(css_source, expected_rules, expected_errors):
+    # Pass 'encoding' to allow @charset
     # Not using TestParser here:
-    stylesheet = CoreParser().parse_stylesheet(css_source)
+    stylesheet = CoreParser().parse_stylesheet(css_source, encoding='utf8')
     assert_errors(stylesheet.errors, expected_errors)
     result = len(stylesheet.statements)
     assert result == expected_rules

diff --git a/tinycss/tests/test_css21.py b/tinycss/tests/test_css21.py
@@ -37,7 +37,8 @@
         [], ["expected ';', got a block"]),
 ])
 def test_at_import(css_source, expected_rules, expected_errors):
-    stylesheet = CSS21Parser().parse_stylesheet(css_source)
+    # Pass 'encoding' to allow @charset
+    stylesheet = CSS21Parser().parse_stylesheet(css_source, encoding='utf8')
     assert_errors(stylesheet.errors, expected_errors)
 
     result = [

diff --git a/tinycss/tests/test_decoding.py b/tinycss/tests/test_decoding.py
@@ -29,6 +29,8 @@ def params(css, encoding, use_bom=False, expect_error=False, **kwargs):
     params('é', 'latin1', document_encoding='ISO-8859-1'),
     params('é', 'latin1', protocol_encoding='utf8',
                           document_encoding='latin1'),
+    params('@charset "utf8"; é', 'latin1', expect_error=True),
+    params('@charset "uùùùùtf8"; é', 'latin1', expect_error=True),
     params('@charset "utf8"; é', 'latin1', document_encoding='latin1'),
     params('é', 'latin1', linking_encoding='utf8',
                           document_encoding='latin1'),
@@ -65,7 +67,7 @@ def test_decode(css, encoding, use_bom, expect_error, kwargs):
         source = css
     css_bytes = source.encode(encoding)
     try:
-        result = decode(css_bytes, **kwargs)
+        result, result_encoding = decode(css_bytes, **kwargs)
     except UnicodeDecodeError as exc:
         result = exc
     if expect_error: