WIP: C implementation of the tokenizer
SimonSapin committed Mar 1, 2012
1 parent 9791ed5 commit 0c87e26
Showing 4 changed files with 166 additions and 5 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -1,3 +1,6 @@
 *.pyc
 *.so
 *.egg-info
+/.coverage
+/htmlcov
+/build
6 changes: 5 additions & 1 deletion setup.py
@@ -1,7 +1,7 @@
 from __future__ import with_statement
 import re
 import os.path
-from setuptools import setup, find_packages
+from setuptools import setup, find_packages, Extension


 init_py = os.path.join(os.path.dirname(__file__), 'tinycss', '__init__.py')
@@ -18,4 +18,8 @@
     author_email='simon.sapin@exyr.org',
     description='A CSS parser, and nothing else.',
     packages=find_packages(),
+    ext_modules=[Extension(
+        'tinycss.tokenizer._speedups',
+        sources=[os.path.join('tinycss', 'tokenizer', '_speedups.c')]
+    )],
 )
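
With this in place, the extension can be compiled for local testing with "python setup.py build_ext --inplace". A guarded import would keep the pure-Python tokenizer usable when the compiled module is missing; the following fallback pattern is only a sketch of how the module could be wired up, not something this commit adds:

    # Hypothetical fallback (not part of this commit): prefer the compiled
    # tokenizer when the extension built, else use the pure-Python one.
    try:
        from tinycss.tokenizer import _speedups
        tokenize_flat = _speedups.tokenize_flat
    except ImportError:
        from tinycss.tokenizer import tokenize_flat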
12 changes: 8 additions & 4 deletions tinycss/tokenizer/__init__.py
@@ -111,9 +111,9 @@
 # Strings with {macro} expanded
 COMPILED_MACROS = {}

-# match methods of re.RegexObject
-COMPILED_TOKEN_REGEXPS = []  # ordered
+COMPILED_TOKEN_REGEXPS = []  # [(name, regexp.match)] ordered
+COMPILED_TOKEN_INDEXES = {}  # {name: i} helper for the C speedups


 def _init():
     """Import-time initialization."""
@@ -140,6 +140,10 @@ def _init():
         ).match
     ))

+    COMPILED_TOKEN_INDEXES.clear()
+    for i, (name, regexp) in enumerate(COMPILED_TOKEN_REGEXPS):
+        COMPILED_TOKEN_INDEXES[name] = i
+
 _init()


@@ -307,7 +311,7 @@ def __init__(self, type_, css_start, css_end, function_name, content,

 def tokenize_flat(css_source, ignore_comments=True,
         # Make these local variables to avoid global lookups in the loop
-        compiled_token=COMPILED_TOKEN_REGEXPS,
+        compiled_tokens=COMPILED_TOKEN_REGEXPS,
         unicode_unescape=UNICODE_UNESCAPE,
         newline_unescape=NEWLINE_UNESCAPE,
         simple_unescape=SIMPLE_UNESCAPE,
@@ -336,7 +340,7 @@ def tokenize_flat(css_source, ignore_comments=True,
     source_len = len(css_source)
     tokens = []
     while pos < source_len:
-        for type_, regexp in compiled_token:
+        for type_, regexp in compiled_tokens:
             match = regexp(css_source, pos)
             if match:
                 # First match is the longest. See comments on TOKENS above.
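
After _init() runs, COMPILED_TOKEN_INDEXES maps each token name to its position in COMPILED_TOKEN_REGEXPS, so the C code can turn a name like "NUMBER" into the integer index it loops over. A quick illustration, assuming the token names already defined in this module:

    from tinycss.tokenizer import (COMPILED_TOKEN_REGEXPS,
                                   COMPILED_TOKEN_INDEXES)

    # Each entry pairs a token name with the bound .match method of its
    # compiled regexp.
    i = COMPILED_TOKEN_INDEXES['NUMBER']
    name, match = COMPILED_TOKEN_REGEXPS[i]
    assert name == 'NUMBER'
    print(match('42px', 0).group())  # the NUMBER pattern matches '42' here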
150 changes: 150 additions & 0 deletions tinycss/tokenizer/_speedups.c
@@ -0,0 +1,150 @@
#include <Python.h>


static PyObject *
tokenize_flat(PyObject *self, PyObject *args, PyObject *kwargs)
{
    int ignore_comments = 1;
    Py_ssize_t
        pos = 0, line = 1, column = 1, source_len, type, n_tokens, length,
        next_pos,
        COMMENT, BAD_COMMENT, DIMENSION, PERCENTAGE, NUMBER, IDENT,
        ATKEYWORD, HASH, FUNCTION, URI, STRING, BAD_STRING, DELIM;
    PyObject
        *css_source = NULL, *rv = NULL, *tokenizer_module = NULL,
        *compiled_tokens = NULL, *compiled_token_indexes = NULL,
        *unicode_unescape = NULL, *newline_unescape = NULL,
        *simple_unescape = NULL, *find_newlines = NULL, *Token = NULL,
        *item = NULL, *type_name = NULL, *DELIM_type_name = NULL,
        *tokens = NULL, *regexp = NULL, *match = NULL, *css_value = NULL,
        *value = NULL, *unit = NULL;

/* Jump to the cleanup block if a Python API call fails. */
#define CHECK(expr) { if (!(expr)) { goto error; } }

    static char *kwlist[] = {"css_source", "ignore_comments", NULL};
    CHECK(PyArg_ParseTupleAndKeywords(args, kwargs, "U|i", kwlist,
                                      &css_source, &ignore_comments));
    CHECK((source_len = PyUnicode_GetSize(css_source)) >= 0);
    CHECK(tokenizer_module = PyImport_ImportModule("tinycss.tokenizer"));

/* Fetch an attribute of the tinycss.tokenizer module into a variable. */
#define GET_TOKENIZER_ATTR(variable, attr) CHECK( \
    variable = PyObject_GetAttrString(tokenizer_module, (attr)));

    GET_TOKENIZER_ATTR(compiled_tokens, "COMPILED_TOKEN_REGEXPS");
    GET_TOKENIZER_ATTR(compiled_token_indexes, "COMPILED_TOKEN_INDEXES");
    GET_TOKENIZER_ATTR(unicode_unescape, "UNICODE_UNESCAPE");
    GET_TOKENIZER_ATTR(newline_unescape, "NEWLINE_UNESCAPE");
    GET_TOKENIZER_ATTR(simple_unescape, "SIMPLE_UNESCAPE");
    GET_TOKENIZER_ATTR(find_newlines, "FIND_NEWLINES");
    GET_TOKENIZER_ATTR(Token, "Token");

/* Look up a token type's index in COMPILED_TOKEN_REGEXPS by its name. */
#define GET_TOKEN_INDEX(variable, name) { \
    CHECK(item = PyMapping_GetItemString(compiled_token_indexes, name)); \
    variable = PyNumber_AsSsize_t(item, NULL); \
    Py_DECREF(item); \
    item = NULL; }

    GET_TOKEN_INDEX(COMMENT, "COMMENT");
    GET_TOKEN_INDEX(BAD_COMMENT, "BAD_COMMENT");
    GET_TOKEN_INDEX(DIMENSION, "DIMENSION");
    GET_TOKEN_INDEX(PERCENTAGE, "PERCENTAGE");
    GET_TOKEN_INDEX(NUMBER, "NUMBER");
    GET_TOKEN_INDEX(IDENT, "IDENT");
    GET_TOKEN_INDEX(ATKEYWORD, "ATKEYWORD");
    GET_TOKEN_INDEX(HASH, "HASH");
    GET_TOKEN_INDEX(FUNCTION, "FUNCTION");
    GET_TOKEN_INDEX(URI, "URI");
    GET_TOKEN_INDEX(STRING, "STRING");
    GET_TOKEN_INDEX(BAD_STRING, "BAD_STRING");
    /* DELIM has no regexp; it is the fallback for unmatched characters. */
    DELIM = -1;
    CHECK(DELIM_type_name = PyUnicode_FromString("DELIM"));

    CHECK((n_tokens = PySequence_Length(compiled_tokens)) >= 0);
    CHECK(tokens = PyList_New(0));

    while (pos < source_len) {
        css_value = NULL;
        for (type = 0; type < n_tokens && css_value == NULL; type++) {
            CHECK(item = PySequence_GetItem(compiled_tokens, type));
            /* type_name and regexp are borrowed refs: */
            CHECK(PyArg_ParseTuple(item, "OO", &type_name, &regexp));
            CHECK(match = PyObject_CallFunction(regexp, "On", css_source, pos));
            if (match != Py_None) {
                /* First match is the longest. */
                CHECK(css_value = PyObject_CallMethod(match, "group", NULL));
                /* Take a ref not borrowed from item */
                Py_INCREF(type_name);
            } else {
                Py_DECREF(match);
                match = NULL;
            }
            Py_DECREF(item);
            item = NULL;
        }
        if (css_value == NULL) {
            /*
                No match.
                "Any other character not matched by the above rules,
                and neither a single nor a double quote."
                ... but quotes at the start of a token are always matched
                by STRING or BAD_STRING. So DELIM is any single character.
            */
            CHECK(css_value = PySequence_GetItem(css_source, pos));
            type = DELIM;
            type_name = DELIM_type_name;
            match = Py_None;
            Py_INCREF(type_name);
            Py_INCREF(match);
        }
        CHECK((length = PySequence_Length(css_value)) >= 0);
        next_pos = pos + length;

        value = css_value;
        unit = Py_None;
        Py_INCREF(unit);
        /* TODO: parse values and units */

        CHECK(item = PyObject_CallFunction(Token, "OOOOnn",
            type_name, css_value, value, unit, pos, pos));
        CHECK(PyList_Append(tokens, item) == 0);
        /* PyList_Append adds its own reference to the token,
           so release ours. */
        Py_DECREF(item);
        item = NULL;

        pos = next_pos;

        /* Release the references taken during this iteration. */
        Py_DECREF(type_name);
        type_name = NULL;
        Py_DECREF(match);
        match = NULL;
        Py_DECREF(css_value);
        css_value = NULL;
        Py_DECREF(unit);
        unit = NULL;
    }

    rv = tokens;
    /* The reference now held in rv is transferred to the caller. */
    tokens = NULL;

error:
    /* css_source is a reference borrowed from the caller,
       type_name and regexp from an 'item' tuple. */
    Py_XDECREF(tokenizer_module);
    Py_XDECREF(compiled_tokens);
    Py_XDECREF(compiled_token_indexes);
    Py_XDECREF(unicode_unescape);
    Py_XDECREF(newline_unescape);
    Py_XDECREF(simple_unescape);
    Py_XDECREF(find_newlines);
    Py_XDECREF(Token);
    Py_XDECREF(item);
    Py_XDECREF(DELIM_type_name);
    Py_XDECREF(match);
    Py_XDECREF(css_value);
    /* Still non-NULL only if we arrived here through an error: */
    Py_XDECREF(tokens);

    return rv;
}


static PyMethodDef SpeedupsMethods[] = {
    {"tokenize_flat", (PyCFunction)tokenize_flat,
     METH_VARARGS | METH_KEYWORDS, "C version of tokenize_flat."},
    {NULL, NULL, 0, NULL}
};


PyMODINIT_FUNC
init_speedups(void)
{
    (void) Py_InitModule("tinycss.tokenizer._speedups", SpeedupsMethods);
}
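
Once built, the C tokenize_flat can be compared against the Python one. At this WIP stage it does not parse values or units, does not unescape, ignores the ignore_comments flag, and fills line/column with the byte position, so only the token types and raw CSS strings are expected to line up. A rough sanity check, assuming the extension compiled successfully and that Token exposes its type as the .type attribute:

    from tinycss import tokenizer
    from tinycss.tokenizer import _speedups

    css = 'a { color: red }'
    py_tokens = tokenizer.tokenize_flat(css, ignore_comments=False)
    c_tokens = _speedups.tokenize_flat(css, ignore_comments=False)

    # Token types should match; values differ until the TODOs above are done.
    assert [t.type for t in py_tokens] == [t.type for t in c_tokens]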
