diff --git a/components/DataLiberation/Tests/CSSProcessorTest.php b/components/DataLiberation/Tests/CSSProcessorTest.php new file mode 100644 index 00000000..b23557cd --- /dev/null +++ b/components/DataLiberation/Tests/CSSProcessorTest.php @@ -0,0 +1,1534 @@ +collect_tokens( $processor ); + $this->assertSame( $expected_tokens, $actual_tokens ); + }
+
+	/**
+	 * Provides the test cases from the @rmenke/css-processor-test test corpus.
+	 *
+	 * The corpus is read from css-test-cases.json, located next to this file.
+	 *
+	 * @see https://github.com/romainmenke/css-processor-tests/
+	 * @return array Decoded contents of css-test-cases.json.
+	 * @throws \RuntimeException When the corpus file cannot be read or does not decode to an array.
+	 */
+	public static function corpus_provider(): array {
+		$path     = __DIR__ . '/css-test-cases.json';
+		$contents = file_get_contents( $path );
+		if ( false === $contents ) {
+			throw new \RuntimeException( "Unable to read the CSS test corpus at {$path}." );
+		}
+
+		// Fail with a descriptive message instead of a return-type TypeError
+		// when the file is not valid JSON (json_decode() returns null then).
+		$cases = json_decode( $contents, true );
+		if ( ! is_array( $cases ) ) {
+			throw new \RuntimeException(
+				"Invalid JSON in the CSS test corpus at {$path}: " . json_last_error_msg()
+			);
+		}
+
+		return $cases;
+	}
+
+	/**
+	 * Collects all tokens from a CSS processor into an array.
+	 *
+	 * Each token is described by the keys `type`, `raw`, `startIndex`,
+	 * `endIndex`, `normalized` and `value`. A `unit` key is appended only when
+	 * the processor reports a non-null unit for the token.
+	 *
+	 * @param CSSProcessor $processor The CSS processor to read tokens from.
+	 * @param array|null   $keys      Optional list of keys to keep in every token.
+	 *                                When null, all available keys are returned.
+	 * @return array List of token arrays in the order the processor produced them.
+	 */
+	public static function collect_tokens( CSSProcessor $processor, $keys = null ): array {
+		// Build the key filter once instead of on every loop iteration.
+		$allowed_keys = null === $keys ? null : array_flip( $keys );
+
+		$tokens = array();
+		while ( $processor->next_token() ) {
+			$type = $processor->get_token_type();
+
+			// endIndex is exclusive: start offset plus the token length.
+			$byte_start = $processor->get_token_start();
+			$byte_end   = $byte_start + $processor->get_token_length();
+
+			$token = array(
+				'type' => $type,
+				'raw' => $processor->get_unnormalized_token(),
+				'startIndex' => $byte_start,
+				'endIndex' => $byte_end,
+				'normalized' => $processor->get_normalized_token(),
+				'value' => $processor->get_token_value(),
+			);
+
+			// Only tokens that report a unit get the extra key.
+			$unit = $processor->get_token_unit();
+			if ( null !== $unit ) {
+				$token['unit'] = $unit;
+			}
+
+			if ( null !== $allowed_keys ) {
+				$token = array_intersect_key( $token, $allowed_keys );
+			}
+
+			$tokens[] = $token;
+		}
+
+		return $tokens;
+	}
+
+	/**
+	 * Tests handling of non-UTF-8 byte sequences in identifiers.
+	 *
+	 * Invalid UTF-8 sequences should be replaced with U+FFFD replacement characters
+	 * during tokenization, allowing the CSS to continue processing.
+	 */
+	public function test_non_utf8_sequences_in_identifiers(): void {
+		// 0xF1 is the lead byte of a four-byte UTF-8 sequence. Here it is
+		// followed by an ASCII letter, so the sequence is invalid.
+		$css = ".class\xF1name";
+
+		$expected = array(
+			// .class�name (0xF1 replaced with U+FFFD).
+			array(
+				'type' => CSSProcessor::TOKEN_DELIM,
+				'raw' => '.',
+				'normalized' => '.',
+				'value' => '.',
+			),
+			array(
+				'type' => CSSProcessor::TOKEN_IDENT,
+				'raw' => "class\xF1name",
+				'normalized' => 'class�name',
+				'value' => 'class�name',
+			),
+		);
+
+		$processor = CSSProcessor::create( $css );
+		// 'unit' is requested too, so an unexpected unit on either token fails the test.
+		$actual_tokens = self::collect_tokens( $processor, ['type', 'raw', 'normalized', 'value', 'unit'] );
+		$this->assertSame( $expected, $actual_tokens );
+	}
+
+	/**
+	 * Tests that a truncated multi-byte sequence becomes a single U+FFFD.
+	 */
+	public function test_invalid_utf8_with_valid_prefix_in_identifiers(): void {
+		// 0xE2 0x80 is a valid two-byte prefix of a three-byte sequence that is
+		// never completed. The whole prefix is replaced with a single U+FFFD.
+		$css = ".test\xE2\x80name";
+
+		$expected = array(
+			array(
+				'type' => CSSProcessor::TOKEN_DELIM,
+				'raw' => '.',
+				'normalized' => '.',
+				'value' => '.',
+			),
+			array(
+				'type' => CSSProcessor::TOKEN_IDENT,
+				'raw' => "test\xE2\x80name",
+				'normalized' => 'test�name',
+				'value' => 'test�name',
+			),
+		);
+
+		$processor = CSSProcessor::create( $css );
+		$actual_tokens = self::collect_tokens( $processor, ['type', 'raw', 'normalized', 'value'] );
+		$this->assertSame( $expected, $actual_tokens );
+	}
+
+	/**
+	 * Tests that two adjacent invalid lead bytes become two U+FFFD characters.
+	 */
+	public function test_invalid_utf8_with_two_single_byte_invalid_sequences(): void {
+		// Two distinct single byte invalid sequences are replaced with
+		// two separate U+FFFD replacement characters.
+		$css = ".test\xE2\xE2name";
+
+		$expected = array(
+			array(
+				'type' => CSSProcessor::TOKEN_DELIM,
+				'raw' => '.',
+				'normalized' => '.',
+				'value' => '.',
+			),
+			array(
+				'type' => CSSProcessor::TOKEN_IDENT,
+				'raw' => "test\xE2\xE2name",
+				'normalized' => 'test��name',
+				'value' => 'test��name',
+			),
+		);
+
+		$processor = CSSProcessor::create( $css );
+		$actual_tokens = self::collect_tokens( $processor, ['type', 'raw', 'normalized', 'value'] );
+		$this->assertSame( $expected, $actual_tokens );
+	}
+
+	/**
+	 * Legacy test to ensure basic tokenization still works.
+	 */
+	public function test_tokenize_labels_core_tokens(): void {
+		// The stylesheet is the concatenation of the raw token texts below;
+		// note the tab that indents the declaration.
+		$css = <<<CSS
+@media screen and (min-width: 10px) {
+	background: url("/images/a.png");
+}
+CSS;
+
+		$expected = array(
+			array( 'type' => CSSProcessor::TOKEN_AT_KEYWORD, 'raw' => '@media' ),
+			array( 'type' => CSSProcessor::TOKEN_WHITESPACE, 'raw' => ' ' ),
+			array( 'type' => CSSProcessor::TOKEN_IDENT, 'raw' => 'screen' ),
+			array( 'type' => CSSProcessor::TOKEN_WHITESPACE, 'raw' => ' ' ),
+			array( 'type' => CSSProcessor::TOKEN_IDENT, 'raw' => 'and' ),
+			array( 'type' => CSSProcessor::TOKEN_WHITESPACE, 'raw' => ' ' ),
+			array( 'type' => CSSProcessor::TOKEN_LEFT_PAREN, 'raw' => '(' ),
+			array( 'type' => CSSProcessor::TOKEN_IDENT, 'raw' => 'min-width' ),
+			array( 'type' => CSSProcessor::TOKEN_COLON, 'raw' => ':' ),
+			array( 'type' => CSSProcessor::TOKEN_WHITESPACE, 'raw' => ' ' ),
+			array( 'type' => CSSProcessor::TOKEN_DIMENSION, 'raw' => '10px' ),
+			array( 'type' => CSSProcessor::TOKEN_RIGHT_PAREN, 'raw' => ')' ),
+			array( 'type' => CSSProcessor::TOKEN_WHITESPACE, 'raw' => ' ' ),
+			array( 'type' => CSSProcessor::TOKEN_LEFT_BRACE, 'raw' => '{' ),
+			array( 'type' => CSSProcessor::TOKEN_WHITESPACE, 'raw' => "\n\t" ),
+			array( 'type' => CSSProcessor::TOKEN_IDENT, 'raw' => 'background' ),
+			array( 'type' => CSSProcessor::TOKEN_COLON, 'raw' => ':' ),
+			array( 'type' => CSSProcessor::TOKEN_WHITESPACE, 'raw' => ' ' ),
+			array( 'type' => CSSProcessor::TOKEN_FUNCTION, 'raw' => 'url(' ),
+			array( 'type' => CSSProcessor::TOKEN_STRING, 'raw' => '"/images/a.png"' ),
+			array( 'type' => CSSProcessor::TOKEN_RIGHT_PAREN, 'raw' => ')' ),
+			array( 'type' => CSSProcessor::TOKEN_SEMICOLON, 'raw' => ';' ),
+			array( 'type' => CSSProcessor::TOKEN_WHITESPACE, 'raw' => "\n" ),
+			array( 'type' => CSSProcessor::TOKEN_RIGHT_BRACE, 'raw' => '}' ),
+		);
+
+		$processor = CSSProcessor::create( $css );
+		$actual_tokens = self::collect_tokens( $processor, ['type', 'raw'] );
+		// assertSame() takes the expected value first; this keeps failure diffs labelled correctly.
+		$this->assertSame( $expected, $actual_tokens );
+	}
+
+	/**
+	 * Tests tokenization of complex selectors with pseudo-classes.
+	 */
+	public function test_complex_selector_with_pseudo_classes(): void {
+		$css = 'a:hover::before, div.class#id:not(.disabled)';
+
+		$expected = array(
+			array( 'type' => CSSProcessor::TOKEN_IDENT, 'raw' => 'a' ),
+			array( 'type' => CSSProcessor::TOKEN_COLON, 'raw' => ':' ),
+			array( 'type' => CSSProcessor::TOKEN_IDENT, 'raw' => 'hover' ),
+			array( 'type' => CSSProcessor::TOKEN_COLON, 'raw' => ':' ),
+			array( 'type' => CSSProcessor::TOKEN_COLON, 'raw' => ':' ),
+			array( 'type' => CSSProcessor::TOKEN_IDENT, 'raw' => 'before' ),
+			array( 'type' => CSSProcessor::TOKEN_COMMA, 'raw' => ',' ),
+			array( 'type' => CSSProcessor::TOKEN_WHITESPACE, 'raw' => ' ' ),
+			array( 'type' => CSSProcessor::TOKEN_IDENT, 'raw' => 'div' ),
+			array( 'type' => CSSProcessor::TOKEN_DELIM, 'raw' => '.' ),
+			array( 'type' => CSSProcessor::TOKEN_IDENT, 'raw' => 'class' ),
+			array( 'type' => CSSProcessor::TOKEN_HASH, 'raw' => '#id' ),
+			array( 'type' => CSSProcessor::TOKEN_COLON, 'raw' => ':' ),
+			array( 'type' => CSSProcessor::TOKEN_FUNCTION, 'raw' => 'not(' ),
+			array( 'type' => CSSProcessor::TOKEN_DELIM, 'raw' => '.' ),
+			array( 'type' => CSSProcessor::TOKEN_IDENT, 'raw' => 'disabled' ),
+			array( 'type' => CSSProcessor::TOKEN_RIGHT_PAREN, 'raw' => ')' ),
+		);
+
+		$processor = CSSProcessor::create( $css );
+		$actual_tokens = self::collect_tokens( $processor, ['type', 'raw'] );
+		// assertSame() takes the expected value first; this keeps failure diffs labelled correctly.
+		$this->assertSame( $expected, $actual_tokens );
+	}
+
+	/**
+	 * Tests tokenization of CSS comments.
+	 */
+	public function test_css_comments(): void {
+		$css = '/* This is a comment */ .class { color: red; /* Another comment */ }';
+
+		// Comments are expected to surface as their own tokens, delimiters included.
+		$expected = array(
+			array( 'type' => CSSProcessor::TOKEN_COMMENT, 'raw' => '/* This is a comment */' ),
+			array( 'type' => CSSProcessor::TOKEN_WHITESPACE, 'raw' => ' ' ),
+			array( 'type' => CSSProcessor::TOKEN_DELIM, 'raw' => '.' ),
+			array( 'type' => CSSProcessor::TOKEN_IDENT, 'raw' => 'class' ),
+			array( 'type' => CSSProcessor::TOKEN_WHITESPACE, 'raw' => ' ' ),
+			array( 'type' => CSSProcessor::TOKEN_LEFT_BRACE, 'raw' => '{' ),
+			array( 'type' => CSSProcessor::TOKEN_WHITESPACE, 'raw' => ' ' ),
+			array( 'type' => CSSProcessor::TOKEN_IDENT, 'raw' => 'color' ),
+			array( 'type' => CSSProcessor::TOKEN_COLON, 'raw' => ':' ),
+			array( 'type' => CSSProcessor::TOKEN_WHITESPACE, 'raw' => ' ' ),
+			array( 'type' => CSSProcessor::TOKEN_IDENT, 'raw' => 'red' ),
+			array( 'type' => CSSProcessor::TOKEN_SEMICOLON, 'raw' => ';' ),
+			array( 'type' => CSSProcessor::TOKEN_WHITESPACE, 'raw' => ' ' ),
+			array( 'type' => CSSProcessor::TOKEN_COMMENT, 'raw' => '/* Another comment */' ),
+			array( 'type' => CSSProcessor::TOKEN_WHITESPACE, 'raw' => ' ' ),
+			array( 'type' => CSSProcessor::TOKEN_RIGHT_BRACE, 'raw' => '}' ),
+		);
+
+		$processor = CSSProcessor::create( $css );
+		// Only the token type and the raw source text are compared.
+		$actual_tokens = self::collect_tokens( $processor, ['type', 'raw'] );
+		// assertSame() takes the expected value first; this keeps failure diffs labelled correctly.
+		$this->assertSame( $expected, $actual_tokens );
+	}
+
+	/**
+	 * Tests tokenization of media queries.
+	 */
+	public function test_media_query(): void {
+		$css = '@media screen and (min-width: 768px) and (max-width: 1024px)';
+
+		$expected = array(
+			array( 'type' => CSSProcessor::TOKEN_AT_KEYWORD, 'raw' => '@media' ),
+			array( 'type' => CSSProcessor::TOKEN_WHITESPACE, 'raw' => ' ' ),
+			array( 'type' => CSSProcessor::TOKEN_IDENT, 'raw' => 'screen' ),
+			array( 'type' => CSSProcessor::TOKEN_WHITESPACE, 'raw' => ' ' ),
+			array( 'type' => CSSProcessor::TOKEN_IDENT, 'raw' => 'and' ),
+			array( 'type' => CSSProcessor::TOKEN_WHITESPACE, 'raw' => ' ' ),
+			array( 'type' => CSSProcessor::TOKEN_LEFT_PAREN, 'raw' => '(' ),
+			array( 'type' => CSSProcessor::TOKEN_IDENT, 'raw' => 'min-width' ),
+			array( 'type' => CSSProcessor::TOKEN_COLON, 'raw' => ':' ),
+			array( 'type' => CSSProcessor::TOKEN_WHITESPACE, 'raw' => ' ' ),
+			array( 'type' => CSSProcessor::TOKEN_DIMENSION, 'raw' => '768px' ),
+			array( 'type' => CSSProcessor::TOKEN_RIGHT_PAREN, 'raw' => ')' ),
+			array( 'type' => CSSProcessor::TOKEN_WHITESPACE, 'raw' => ' ' ),
+			array( 'type' => CSSProcessor::TOKEN_IDENT, 'raw' => 'and' ),
+			array( 'type' => CSSProcessor::TOKEN_WHITESPACE, 'raw' => ' ' ),
+			array( 'type' => CSSProcessor::TOKEN_LEFT_PAREN, 'raw' => '(' ),
+			array( 'type' => CSSProcessor::TOKEN_IDENT, 'raw' => 'max-width' ),
+			array( 'type' => CSSProcessor::TOKEN_COLON, 'raw' => ':' ),
+			array( 'type' => CSSProcessor::TOKEN_WHITESPACE, 'raw' => ' ' ),
+			array( 'type' => CSSProcessor::TOKEN_DIMENSION, 'raw' => '1024px' ),
+			array( 'type' => CSSProcessor::TOKEN_RIGHT_PAREN, 'raw' => ')' ),
+		);
+
+		$processor = CSSProcessor::create( $css );
+		// Only the token type and the raw source text are compared.
+		$actual_tokens = self::collect_tokens( $processor, ['type', 'raw'] );
+		// assertSame() takes the expected value first; this keeps failure diffs labelled correctly.
+		$this->assertSame( $expected, $actual_tokens );
+	}
+
+	/**
+	 * Tests tokenization of keyframes animation.
+	 */
+	public function test_keyframes_animation(): void {
+		$css = '@keyframes slide-in { 0% { opacity: 0; } 100% { opacity: 1; } }';
+
+		// Keyframe selectors are percentages; the opacity values are plain numbers.
+		$expected = array(
+			array( 'type' => CSSProcessor::TOKEN_AT_KEYWORD, 'raw' => '@keyframes' ),
+			array( 'type' => CSSProcessor::TOKEN_WHITESPACE, 'raw' => ' ' ),
+			array( 'type' => CSSProcessor::TOKEN_IDENT, 'raw' => 'slide-in' ),
+			array( 'type' => CSSProcessor::TOKEN_WHITESPACE, 'raw' => ' ' ),
+			array( 'type' => CSSProcessor::TOKEN_LEFT_BRACE, 'raw' => '{' ),
+			array( 'type' => CSSProcessor::TOKEN_WHITESPACE, 'raw' => ' ' ),
+			array( 'type' => CSSProcessor::TOKEN_PERCENTAGE, 'raw' => '0%' ),
+			array( 'type' => CSSProcessor::TOKEN_WHITESPACE, 'raw' => ' ' ),
+			array( 'type' => CSSProcessor::TOKEN_LEFT_BRACE, 'raw' => '{' ),
+			array( 'type' => CSSProcessor::TOKEN_WHITESPACE, 'raw' => ' ' ),
+			array( 'type' => CSSProcessor::TOKEN_IDENT, 'raw' => 'opacity' ),
+			array( 'type' => CSSProcessor::TOKEN_COLON, 'raw' => ':' ),
+			array( 'type' => CSSProcessor::TOKEN_WHITESPACE, 'raw' => ' ' ),
+			array( 'type' => CSSProcessor::TOKEN_NUMBER, 'raw' => '0' ),
+			array( 'type' => CSSProcessor::TOKEN_SEMICOLON, 'raw' => ';' ),
+			array( 'type' => CSSProcessor::TOKEN_WHITESPACE, 'raw' => ' ' ),
+			array( 'type' => CSSProcessor::TOKEN_RIGHT_BRACE, 'raw' => '}' ),
+			array( 'type' => CSSProcessor::TOKEN_WHITESPACE, 'raw' => ' ' ),
+			array( 'type' => CSSProcessor::TOKEN_PERCENTAGE, 'raw' => '100%' ),
+			array( 'type' => CSSProcessor::TOKEN_WHITESPACE, 'raw' => ' ' ),
+			array( 'type' => CSSProcessor::TOKEN_LEFT_BRACE, 'raw' => '{' ),
+			array( 'type' => CSSProcessor::TOKEN_WHITESPACE, 'raw' => ' ' ),
+			array( 'type' => CSSProcessor::TOKEN_IDENT, 'raw' => 'opacity' ),
+			array( 'type' => CSSProcessor::TOKEN_COLON, 'raw' => ':' ),
+			array( 'type' => CSSProcessor::TOKEN_WHITESPACE, 'raw' => ' ' ),
+			array( 'type' => CSSProcessor::TOKEN_NUMBER, 'raw' => '1' ),
+			array( 'type' => CSSProcessor::TOKEN_SEMICOLON, 'raw' => ';' ),
+			array( 'type' => CSSProcessor::TOKEN_WHITESPACE, 'raw' => ' ' ),
+			array( 'type' => CSSProcessor::TOKEN_RIGHT_BRACE, 'raw' => '}' ),
+			array( 'type' => CSSProcessor::TOKEN_WHITESPACE, 'raw' => ' ' ),
+			array( 'type' => CSSProcessor::TOKEN_RIGHT_BRACE, 'raw' => '}' ),
+		);
+
+		$processor = CSSProcessor::create( $css );
+		// Only the token type and the raw source text are compared.
+		$actual_tokens = self::collect_tokens( $processor, ['type', 'raw'] );
+		// assertSame() takes the expected value first; this keeps failure diffs labelled correctly.
+		$this->assertSame( $expected, $actual_tokens );
+	}
+
+	/**
+	 * Tests tokenization of vendor-prefixed properties.
+	 */
+	public function test_vendor_prefixed_properties(): void {
+		$css = '-webkit-transform: rotate(45deg); -moz-border-radius: 5px;';
+
+		// A leading hyphen must stay part of the identifier rather than become a delimiter.
+		$expected = array(
+			array( 'type' => CSSProcessor::TOKEN_IDENT, 'raw' => '-webkit-transform' ),
+			array( 'type' => CSSProcessor::TOKEN_COLON, 'raw' => ':' ),
+			array( 'type' => CSSProcessor::TOKEN_WHITESPACE, 'raw' => ' ' ),
+			array( 'type' => CSSProcessor::TOKEN_FUNCTION, 'raw' => 'rotate(' ),
+			array( 'type' => CSSProcessor::TOKEN_DIMENSION, 'raw' => '45deg' ),
+			array( 'type' => CSSProcessor::TOKEN_RIGHT_PAREN, 'raw' => ')' ),
+			array( 'type' => CSSProcessor::TOKEN_SEMICOLON, 'raw' => ';' ),
+			array( 'type' => CSSProcessor::TOKEN_WHITESPACE, 'raw' => ' ' ),
+			array( 'type' => CSSProcessor::TOKEN_IDENT, 'raw' => '-moz-border-radius' ),
+			array( 'type' => CSSProcessor::TOKEN_COLON, 'raw' => ':' ),
+			array( 'type' => CSSProcessor::TOKEN_WHITESPACE, 'raw' => ' ' ),
+			array( 'type' => CSSProcessor::TOKEN_DIMENSION, 'raw' => '5px' ),
+			array( 'type' => CSSProcessor::TOKEN_SEMICOLON, 'raw' => ';' ),
+		);
+
+		$processor = CSSProcessor::create( $css );
+		// Only the token type and the raw source text are compared.
+		$actual_tokens = self::collect_tokens( $processor, ['type', 'raw'] );
+		// assertSame() takes the expected value first; this keeps failure diffs labelled correctly.
+		$this->assertSame( $expected, $actual_tokens );
+	}
+
+	/**
+	 * Tests tokenization of attribute selectors.
+	 */
+	public function test_attribute_selectors(): void {
+		$css = 'input[type="text"][required], a[href^="https://"]';
+
+		// The `^=` matcher is expected as two separate delimiter tokens.
+		$expected = array(
+			array( 'type' => CSSProcessor::TOKEN_IDENT, 'raw' => 'input' ),
+			array( 'type' => CSSProcessor::TOKEN_LEFT_BRACKET, 'raw' => '[' ),
+			array( 'type' => CSSProcessor::TOKEN_IDENT, 'raw' => 'type' ),
+			array( 'type' => CSSProcessor::TOKEN_DELIM, 'raw' => '=' ),
+			array( 'type' => CSSProcessor::TOKEN_STRING, 'raw' => '"text"' ),
+			array( 'type' => CSSProcessor::TOKEN_RIGHT_BRACKET, 'raw' => ']' ),
+			array( 'type' => CSSProcessor::TOKEN_LEFT_BRACKET, 'raw' => '[' ),
+			array( 'type' => CSSProcessor::TOKEN_IDENT, 'raw' => 'required' ),
+			array( 'type' => CSSProcessor::TOKEN_RIGHT_BRACKET, 'raw' => ']' ),
+			array( 'type' => CSSProcessor::TOKEN_COMMA, 'raw' => ',' ),
+			array( 'type' => CSSProcessor::TOKEN_WHITESPACE, 'raw' => ' ' ),
+			array( 'type' => CSSProcessor::TOKEN_IDENT, 'raw' => 'a' ),
+			array( 'type' => CSSProcessor::TOKEN_LEFT_BRACKET, 'raw' => '[' ),
+			array( 'type' => CSSProcessor::TOKEN_IDENT, 'raw' => 'href' ),
+			array( 'type' => CSSProcessor::TOKEN_DELIM, 'raw' => '^' ),
+			array( 'type' => CSSProcessor::TOKEN_DELIM, 'raw' => '=' ),
+			array( 'type' => CSSProcessor::TOKEN_STRING, 'raw' => '"https://"' ),
+			array( 'type' => CSSProcessor::TOKEN_RIGHT_BRACKET, 'raw' => ']' ),
+		);
+
+		$processor = CSSProcessor::create( $css );
+		// Only the token type and the raw source text are compared.
+		$actual_tokens = self::collect_tokens( $processor, ['type', 'raw'] );
+		// assertSame() takes the expected value first; this keeps failure diffs labelled correctly.
+		$this->assertSame( $expected, $actual_tokens );
+	}
+
+	/**
+	 * Tests tokenization of calc() function with complex expressions.
+	 */
+	public function test_calc_function(): void {
+		$css = 'width: calc(100% - 20px * 2 + 5em);';
+
+		// Operators surrounded by whitespace are expected as delimiters, not number signs.
+		$expected = array(
+			array( 'type' => CSSProcessor::TOKEN_IDENT, 'raw' => 'width' ),
+			array( 'type' => CSSProcessor::TOKEN_COLON, 'raw' => ':' ),
+			array( 'type' => CSSProcessor::TOKEN_WHITESPACE, 'raw' => ' ' ),
+			array( 'type' => CSSProcessor::TOKEN_FUNCTION, 'raw' => 'calc(' ),
+			array( 'type' => CSSProcessor::TOKEN_PERCENTAGE, 'raw' => '100%' ),
+			array( 'type' => CSSProcessor::TOKEN_WHITESPACE, 'raw' => ' ' ),
+			array( 'type' => CSSProcessor::TOKEN_DELIM, 'raw' => '-' ),
+			array( 'type' => CSSProcessor::TOKEN_WHITESPACE, 'raw' => ' ' ),
+			array( 'type' => CSSProcessor::TOKEN_DIMENSION, 'raw' => '20px' ),
+			array( 'type' => CSSProcessor::TOKEN_WHITESPACE, 'raw' => ' ' ),
+			array( 'type' => CSSProcessor::TOKEN_DELIM, 'raw' => '*' ),
+			array( 'type' => CSSProcessor::TOKEN_WHITESPACE, 'raw' => ' ' ),
+			array( 'type' => CSSProcessor::TOKEN_NUMBER, 'raw' => '2' ),
+			array( 'type' => CSSProcessor::TOKEN_WHITESPACE, 'raw' => ' ' ),
+			array( 'type' => CSSProcessor::TOKEN_DELIM, 'raw' => '+' ),
+			array( 'type' => CSSProcessor::TOKEN_WHITESPACE, 'raw' => ' ' ),
+			array( 'type' => CSSProcessor::TOKEN_DIMENSION, 'raw' => '5em' ),
+			array( 'type' => CSSProcessor::TOKEN_RIGHT_PAREN, 'raw' => ')' ),
+			array( 'type' => CSSProcessor::TOKEN_SEMICOLON, 'raw' => ';' ),
+		);
+
+		$processor = CSSProcessor::create( $css );
+		// Only the token type and the raw source text are compared.
+		$actual_tokens = self::collect_tokens( $processor, ['type', 'raw'] );
+		// assertSame() takes the expected value first; this keeps failure diffs labelled correctly.
+		$this->assertSame( $expected, $actual_tokens );
+	}
+
+	/**
+	 * Tests tokenization of RGB/RGBA color functions.
+	 */
+	public function test_color_functions(): void {
+		$css = 'color: rgb(255, 128, 0); background: rgba(0, 0, 0, 0.5);';
+
+		$expected = array(
+			array( 'type' => CSSProcessor::TOKEN_IDENT, 'raw' => 'color' ),
+			array( 'type' => CSSProcessor::TOKEN_COLON, 'raw' => ':' ),
+			array( 'type' => CSSProcessor::TOKEN_WHITESPACE, 'raw' => ' ' ),
+			array( 'type' => CSSProcessor::TOKEN_FUNCTION, 'raw' => 'rgb(' ),
+			array( 'type' => CSSProcessor::TOKEN_NUMBER, 'raw' => '255' ),
+			array( 'type' => CSSProcessor::TOKEN_COMMA, 'raw' => ',' ),
+			array( 'type' => CSSProcessor::TOKEN_WHITESPACE, 'raw' => ' ' ),
+			array( 'type' => CSSProcessor::TOKEN_NUMBER, 'raw' => '128' ),
+			array( 'type' => CSSProcessor::TOKEN_COMMA, 'raw' => ',' ),
+			array( 'type' => CSSProcessor::TOKEN_WHITESPACE, 'raw' => ' ' ),
+			array( 'type' => CSSProcessor::TOKEN_NUMBER, 'raw' => '0' ),
+			array( 'type' => CSSProcessor::TOKEN_RIGHT_PAREN, 'raw' => ')' ),
+			array( 'type' => CSSProcessor::TOKEN_SEMICOLON, 'raw' => ';' ),
+			array( 'type' => CSSProcessor::TOKEN_WHITESPACE, 'raw' => ' ' ),
+			array( 'type' => CSSProcessor::TOKEN_IDENT, 'raw' => 'background' ),
+			array( 'type' => CSSProcessor::TOKEN_COLON, 'raw' => ':' ),
+			array( 'type' => CSSProcessor::TOKEN_WHITESPACE, 'raw' => ' ' ),
+			array( 'type' => CSSProcessor::TOKEN_FUNCTION, 'raw' => 'rgba(' ),
+			array( 'type' => CSSProcessor::TOKEN_NUMBER, 'raw' => '0' ),
+			array( 'type' => CSSProcessor::TOKEN_COMMA, 'raw' => ',' ),
+			array( 'type' => CSSProcessor::TOKEN_WHITESPACE, 'raw' => ' ' ),
+			array( 'type' => CSSProcessor::TOKEN_NUMBER, 'raw' => '0' ),
+			array( 'type' => CSSProcessor::TOKEN_COMMA, 'raw' => ',' ),
+			array( 'type' => CSSProcessor::TOKEN_WHITESPACE, 'raw' => ' ' ),
+			array( 'type' => CSSProcessor::TOKEN_NUMBER, 'raw' => '0' ),
+			array( 'type' => CSSProcessor::TOKEN_COMMA, 'raw' => ',' ),
+			array( 'type' => CSSProcessor::TOKEN_WHITESPACE, 'raw' => ' ' ),
+			array( 'type' => CSSProcessor::TOKEN_NUMBER, 'raw' => '0.5' ),
+			array( 'type' => CSSProcessor::TOKEN_RIGHT_PAREN, 'raw' => ')' ),
+			array( 'type' => CSSProcessor::TOKEN_SEMICOLON, 'raw' => ';' ),
+		);
+
+		$processor = CSSProcessor::create( $css );
+		// Only the token type and the raw source text are compared.
+		$actual_tokens = self::collect_tokens( $processor, ['type', 'raw'] );
+		// assertSame() takes the expected value first; this keeps failure diffs labelled correctly.
+		$this->assertSame( $expected, $actual_tokens );
+	}
+
+	/**
+	 * Tests tokenization of CSS custom properties (variables).
+	 */
+	public function test_css_variables(): void {
+		$css = '--main-color: #ff0000; color: var(--main-color);';
+
+		// A custom property name (leading `--`) is expected as a single identifier token.
+		$expected = array(
+			array( 'type' => CSSProcessor::TOKEN_IDENT, 'raw' => '--main-color' ),
+			array( 'type' => CSSProcessor::TOKEN_COLON, 'raw' => ':' ),
+			array( 'type' => CSSProcessor::TOKEN_WHITESPACE, 'raw' => ' ' ),
+			array( 'type' => CSSProcessor::TOKEN_HASH, 'raw' => '#ff0000' ),
+			array( 'type' => CSSProcessor::TOKEN_SEMICOLON, 'raw' => ';' ),
+			array( 'type' => CSSProcessor::TOKEN_WHITESPACE, 'raw' => ' ' ),
+			array( 'type' => CSSProcessor::TOKEN_IDENT, 'raw' => 'color' ),
+			array( 'type' => CSSProcessor::TOKEN_COLON, 'raw' => ':' ),
+			array( 'type' => CSSProcessor::TOKEN_WHITESPACE, 'raw' => ' ' ),
+			array( 'type' => CSSProcessor::TOKEN_FUNCTION, 'raw' => 'var(' ),
+			array( 'type' => CSSProcessor::TOKEN_IDENT, 'raw' => '--main-color' ),
+			array( 'type' => CSSProcessor::TOKEN_RIGHT_PAREN, 'raw' => ')' ),
+			array( 'type' => CSSProcessor::TOKEN_SEMICOLON, 'raw' => ';' ),
+		);
+
+		$processor = CSSProcessor::create( $css );
+		// Only the token type and the raw source text are compared.
+		$actual_tokens = self::collect_tokens( $processor, ['type', 'raw'] );
+		// assertSame() takes the expected value first; this keeps failure diffs labelled correctly.
+		$this->assertSame( $expected, $actual_tokens );
+	}
+
+	/**
+	 * Tests tokenization of gradient functions.
+	 */
+	public function test_gradient_functions(): void {
+		$css = 'background: linear-gradient(to right, red 0%, blue 100%);';
+
+		$expected = array(
+			array( 'type' => CSSProcessor::TOKEN_IDENT, 'raw' => 'background' ),
+			array( 'type' => CSSProcessor::TOKEN_COLON, 'raw' => ':' ),
+			array( 'type' => CSSProcessor::TOKEN_WHITESPACE, 'raw' => ' ' ),
+			array( 'type' => CSSProcessor::TOKEN_FUNCTION, 'raw' => 'linear-gradient(' ),
+			array( 'type' => CSSProcessor::TOKEN_IDENT, 'raw' => 'to' ),
+			array( 'type' => CSSProcessor::TOKEN_WHITESPACE, 'raw' => ' ' ),
+			array( 'type' => CSSProcessor::TOKEN_IDENT, 'raw' => 'right' ),
+			array( 'type' => CSSProcessor::TOKEN_COMMA, 'raw' => ',' ),
+			array( 'type' => CSSProcessor::TOKEN_WHITESPACE, 'raw' => ' ' ),
+			array( 'type' => CSSProcessor::TOKEN_IDENT, 'raw' => 'red' ),
+			array( 'type' => CSSProcessor::TOKEN_WHITESPACE, 'raw' => ' ' ),
+			array( 'type' => CSSProcessor::TOKEN_PERCENTAGE, 'raw' => '0%' ),
+			array( 'type' => CSSProcessor::TOKEN_COMMA, 'raw' => ',' ),
+			array( 'type' => CSSProcessor::TOKEN_WHITESPACE, 'raw' => ' ' ),
+			array( 'type' => CSSProcessor::TOKEN_IDENT, 'raw' => 'blue' ),
+			array( 'type' => CSSProcessor::TOKEN_WHITESPACE, 'raw' => ' ' ),
+			array( 'type' => CSSProcessor::TOKEN_PERCENTAGE, 'raw' => '100%' ),
+			array( 'type' => CSSProcessor::TOKEN_RIGHT_PAREN, 'raw' => ')' ),
+			array( 'type' => CSSProcessor::TOKEN_SEMICOLON, 'raw' => ';' ),
+		);
+
+		$processor = CSSProcessor::create( $css );
+		// Only the token type and the raw source text are compared.
+		$actual_tokens = self::collect_tokens( $processor, ['type', 'raw'] );
+		// assertSame() takes the expected value first; this keeps failure diffs labelled correctly.
+		$this->assertSame( $expected, $actual_tokens );
+	}
+
+	/**
+	 * Tests tokenization of grid layout properties.
+	 */
+	public function test_grid_layout(): void {
+		$css = 'grid-template-columns: repeat(3, 1fr); gap: 10px 20px;';
+
+		$expected = array(
+			array( 'type' => CSSProcessor::TOKEN_IDENT, 'raw' => 'grid-template-columns' ),
+			array( 'type' => CSSProcessor::TOKEN_COLON, 'raw' => ':' ),
+			array( 'type' => CSSProcessor::TOKEN_WHITESPACE, 'raw' => ' ' ),
+			array( 'type' => CSSProcessor::TOKEN_FUNCTION, 'raw' => 'repeat(' ),
+			array( 'type' => CSSProcessor::TOKEN_NUMBER, 'raw' => '3' ),
+			array( 'type' => CSSProcessor::TOKEN_COMMA, 'raw' => ',' ),
+			array( 'type' => CSSProcessor::TOKEN_WHITESPACE, 'raw' => ' ' ),
+			array( 'type' => CSSProcessor::TOKEN_DIMENSION, 'raw' => '1fr' ),
+			array( 'type' => CSSProcessor::TOKEN_RIGHT_PAREN, 'raw' => ')' ),
+			array( 'type' => CSSProcessor::TOKEN_SEMICOLON, 'raw' => ';' ),
+			array( 'type' => CSSProcessor::TOKEN_WHITESPACE, 'raw' => ' ' ),
+			array( 'type' => CSSProcessor::TOKEN_IDENT, 'raw' => 'gap' ),
+			array( 'type' => CSSProcessor::TOKEN_COLON, 'raw' => ':' ),
+			array( 'type' => CSSProcessor::TOKEN_WHITESPACE, 'raw' => ' ' ),
+			array( 'type' => CSSProcessor::TOKEN_DIMENSION, 'raw' => '10px' ),
+			array( 'type' => CSSProcessor::TOKEN_WHITESPACE, 'raw' => ' ' ),
+			array( 'type' => CSSProcessor::TOKEN_DIMENSION, 'raw' => '20px' ),
+			array( 'type' => CSSProcessor::TOKEN_SEMICOLON, 'raw' => ';' ),
+		);
+
+		$processor = CSSProcessor::create( $css );
+		// Only the token type and the raw source text are compared.
+		$actual_tokens = self::collect_tokens( $processor, ['type', 'raw'] );
+		// assertSame() takes the expected value first; this keeps failure diffs labelled correctly.
+		$this->assertSame( $expected, $actual_tokens );
+	}
+
+	/**
+	 * Tests tokenization of URL functions with various formats.
+	 */
+	public function test_url_formats(): void {
+		$css = 'background: url("image.png"), url(\'font.woff\'), url(https://example.com/bg.jpg);';
+
+		// Quoted URLs are expected as function + string + right paren, while
+		// the unquoted URL is expected as a single URL token.
+		$expected = array(
+			array( 'type' => CSSProcessor::TOKEN_IDENT, 'raw' => 'background' ),
+			array( 'type' => CSSProcessor::TOKEN_COLON, 'raw' => ':' ),
+			array( 'type' => CSSProcessor::TOKEN_WHITESPACE, 'raw' => ' ' ),
+			array( 'type' => CSSProcessor::TOKEN_FUNCTION, 'raw' => 'url(' ),
+			array( 'type' => CSSProcessor::TOKEN_STRING, 'raw' => '"image.png"' ),
+			array( 'type' => CSSProcessor::TOKEN_RIGHT_PAREN, 'raw' => ')' ),
+			array( 'type' => CSSProcessor::TOKEN_COMMA, 'raw' => ',' ),
+			array( 'type' => CSSProcessor::TOKEN_WHITESPACE, 'raw' => ' ' ),
+			array( 'type' => CSSProcessor::TOKEN_FUNCTION, 'raw' => 'url(' ),
+			array( 'type' => CSSProcessor::TOKEN_STRING, 'raw' => "'font.woff'" ),
+			array( 'type' => CSSProcessor::TOKEN_RIGHT_PAREN, 'raw' => ')' ),
+			array( 'type' => CSSProcessor::TOKEN_COMMA, 'raw' => ',' ),
+			array( 'type' => CSSProcessor::TOKEN_WHITESPACE, 'raw' => ' ' ),
+			array( 'type' => CSSProcessor::TOKEN_URL, 'raw' => 'url(https://example.com/bg.jpg)' ),
+			array( 'type' => CSSProcessor::TOKEN_SEMICOLON, 'raw' => ';' ),
+		);
+
+		$processor = CSSProcessor::create( $css );
+		// Only the token type and the raw source text are compared.
+		$actual_tokens = self::collect_tokens( $processor, ['type', 'raw'] );
+		// assertSame() takes the expected value first; this keeps failure diffs labelled correctly.
+		$this->assertSame( $expected, $actual_tokens );
+	}
+
+	/**
+	 * Tests tokenization of !important declarations.
+	 */
+	public function test_important_declarations(): void {
+		$css = 'color: red !important; margin: 0px !important;';
+
+		// `!important` is expected as a `!` delimiter followed by an identifier.
+		$expected = array(
+			array( 'type' => CSSProcessor::TOKEN_IDENT, 'raw' => 'color' ),
+			array( 'type' => CSSProcessor::TOKEN_COLON, 'raw' => ':' ),
+			array( 'type' => CSSProcessor::TOKEN_WHITESPACE, 'raw' => ' ' ),
+			array( 'type' => CSSProcessor::TOKEN_IDENT, 'raw' => 'red' ),
+			array( 'type' => CSSProcessor::TOKEN_WHITESPACE, 'raw' => ' ' ),
+			array( 'type' => CSSProcessor::TOKEN_DELIM, 'raw' => '!' ),
+			array( 'type' => CSSProcessor::TOKEN_IDENT, 'raw' => 'important' ),
+			array( 'type' => CSSProcessor::TOKEN_SEMICOLON, 'raw' => ';' ),
+			array( 'type' => CSSProcessor::TOKEN_WHITESPACE, 'raw' => ' ' ),
+			array( 'type' => CSSProcessor::TOKEN_IDENT, 'raw' => 'margin' ),
+			array( 'type' => CSSProcessor::TOKEN_COLON, 'raw' => ':' ),
+			array( 'type' => CSSProcessor::TOKEN_WHITESPACE, 'raw' => ' ' ),
+			array( 'type' => CSSProcessor::TOKEN_DIMENSION, 'raw' => '0px' ),
+			array( 'type' => CSSProcessor::TOKEN_WHITESPACE, 'raw' => ' ' ),
+			array( 'type' => CSSProcessor::TOKEN_DELIM, 'raw' => '!' ),
+			array( 'type' => CSSProcessor::TOKEN_IDENT, 'raw' => 'important' ),
+			array( 'type' => CSSProcessor::TOKEN_SEMICOLON, 'raw' => ';' ),
+		);
+
+		$processor = CSSProcessor::create( $css );
+		// Only the token type and the raw source text are compared.
+		$actual_tokens = self::collect_tokens( $processor, ['type', 'raw'] );
+		// assertSame() takes the expected value first; this keeps failure diffs labelled correctly.
+		$this->assertSame( $expected, $actual_tokens );
+	}
+
+	/**
+	 * Tests tokenization of multiple selectors with combinators.
+	 */
+	public function test_complex_combinators(): void {
+		$css = 'div > p + span ~ a.link';
+
+		// Each combinator is expected as a delimiter token.
+		$expected = array(
+			array( 'type' => CSSProcessor::TOKEN_IDENT, 'raw' => 'div' ),
+			array( 'type' => CSSProcessor::TOKEN_WHITESPACE, 'raw' => ' ' ),
+			array( 'type' => CSSProcessor::TOKEN_DELIM, 'raw' => '>' ),
+			array( 'type' => CSSProcessor::TOKEN_WHITESPACE, 'raw' => ' ' ),
+			array( 'type' => CSSProcessor::TOKEN_IDENT, 'raw' => 'p' ),
+			array( 'type' => CSSProcessor::TOKEN_WHITESPACE, 'raw' => ' ' ),
+			array( 'type' => CSSProcessor::TOKEN_DELIM, 'raw' => '+' ),
+			array( 'type' => CSSProcessor::TOKEN_WHITESPACE, 'raw' => ' ' ),
+			array( 'type' => CSSProcessor::TOKEN_IDENT, 'raw' => 'span' ),
+			array( 'type' => CSSProcessor::TOKEN_WHITESPACE, 'raw' => ' ' ),
+			array( 'type' => CSSProcessor::TOKEN_DELIM, 'raw' => '~' ),
+			array( 'type' => CSSProcessor::TOKEN_WHITESPACE, 'raw' => ' ' ),
+			array( 'type' => CSSProcessor::TOKEN_IDENT, 'raw' => 'a' ),
+			array( 'type' => CSSProcessor::TOKEN_DELIM, 'raw' => '.' ),
+			array( 'type' => CSSProcessor::TOKEN_IDENT, 'raw' => 'link' ),
+		);
+
+		$processor = CSSProcessor::create( $css );
+		// Only the token type and the raw source text are compared.
+		$actual_tokens = self::collect_tokens( $processor, ['type', 'raw'] );
+		// assertSame() takes the expected value first; this keeps failure diffs labelled correctly.
+		$this->assertSame( $expected, $actual_tokens );
+	}
+
+	/**
+	 * Tests tokenization of escaped characters in identifiers.
+	 */
+	public function test_escaped_identifiers(): void {
+		$css = '.class\\:name, #id\\@special { color: blue; }';
+
+		// The raw text keeps the backslash escapes exactly as written in the source.
+		$expected = array(
+			array( 'type' => CSSProcessor::TOKEN_DELIM, 'raw' => '.' ),
+			array( 'type' => CSSProcessor::TOKEN_IDENT, 'raw' => 'class\\:name' ),
+			array( 'type' => CSSProcessor::TOKEN_COMMA, 'raw' => ',' ),
+			array( 'type' => CSSProcessor::TOKEN_WHITESPACE, 'raw' => ' ' ),
+			array( 'type' => CSSProcessor::TOKEN_HASH, 'raw' => '#id\\@special' ),
+			array( 'type' => CSSProcessor::TOKEN_WHITESPACE, 'raw' => ' ' ),
+			array( 'type' => CSSProcessor::TOKEN_LEFT_BRACE, 'raw' => '{' ),
+			array( 'type' => CSSProcessor::TOKEN_WHITESPACE, 'raw' => ' ' ),
+			array( 'type' => CSSProcessor::TOKEN_IDENT, 'raw' => 'color' ),
+			array( 'type' => CSSProcessor::TOKEN_COLON, 'raw' => ':' ),
+			array( 'type' => CSSProcessor::TOKEN_WHITESPACE, 'raw' => ' ' ),
+			array( 'type' => CSSProcessor::TOKEN_IDENT, 'raw' => 'blue' ),
+			array( 'type' => CSSProcessor::TOKEN_SEMICOLON, 'raw' => ';' ),
+			array( 'type' => CSSProcessor::TOKEN_WHITESPACE, 'raw' => ' ' ),
+			array( 'type' => CSSProcessor::TOKEN_RIGHT_BRACE, 'raw' => '}' ),
+		);
+
+		$processor = CSSProcessor::create( $css );
+		// Only the token type and the raw source text are compared.
+		$actual_tokens = self::collect_tokens( $processor, ['type', 'raw'] );
+		// assertSame() takes the expected value first; this keeps failure diffs labelled correctly.
+		$this->assertSame( $expected, $actual_tokens );
+	}
+
+	/**
+	 * Tests that get_normalized_token() applies CSS normalization.
+	 *
+	 * Uses a single CSS rule whose source text includes:
+	 * - CSS escapes in class names and IDs (`\6c`, `\61`, `\@`)
+	 * - An unquoted URL with escape sequences (`\2f`)
+	 * - A string value containing an escaped newline (`\A`)
+	 * - A comment containing `\r\n` and `\f` line endings
+	 * - Mixed line endings (\r\n, \r, \f) that need normalization
+	 *
+	 * For every token the raw source text, the normalized text and the
+	 * decoded value are compared against the expectations below.
+	 */
+	public function test_get_normalized_token_applies_normalization(): void {
+		// Comprehensive CSS with normalization requirements.
+		$css = "/* Comment\r\nwith\flines */\r\n" .
+			".c\\6c ass.n\\61 me\r#id\\@value\r\n{\r\n" .
+			"\tbackground:\furl(path\\2f to\\2f image.png);\r\n" .
+			"\tcontent:\r\"text\\A string\";\r\n" .
+			"}";
+
+		$expected = array(
+			// Comment with \r\n and \f.
+			array(
+				'type' => CSSProcessor::TOKEN_COMMENT,
+				'raw' => "/* Comment\r\nwith\flines */",
+				'normalized' => "/* Comment\nwith\nlines */",
+				'value' => null,
+			),
+			// Whitespace with \r\n.
+			array(
+				'type' => CSSProcessor::TOKEN_WHITESPACE,
+				'raw' => "\r\n",
+				'normalized' => "\n",
+				'value' => null,
+			),
+			// Class selector delimiter.
+			array(
+				'type' => CSSProcessor::TOKEN_DELIM,
+				'raw' => '.',
+				'normalized' => '.',
+				'value' => '.',
+			),
+			// Class name with escape (\6c = 'l'), space gets consumed by escape.
+			array(
+				'type' => CSSProcessor::TOKEN_IDENT,
+				'raw' => 'c\\6c ass',
+				'normalized' => 'class', // Escapes decoded.
+				'value' => 'class', // Decoded: \6c → l, space consumed.
+			),
+			// Delimiter.
+			array(
+				'type' => CSSProcessor::TOKEN_DELIM,
+				'raw' => '.',
+				'normalized' => '.',
+				'value' => '.',
+			),
+			// Identifier with escape (\61 = 'a'), space gets consumed.
+			array(
+				'type' => CSSProcessor::TOKEN_IDENT,
+				'raw' => 'n\\61 me',
+				'normalized' => 'name', // Escapes decoded.
+				'value' => 'name', // Decoded: \61 → a, space consumed.
+			),
+			// Whitespace with \r.
+			array(
+				'type' => CSSProcessor::TOKEN_WHITESPACE,
+				'raw' => "\r",
+				'normalized' => "\n",
+				'value' => null,
+			),
+			// ID selector with escape.
+			array(
+				'type' => CSSProcessor::TOKEN_HASH,
+				'raw' => '#id\\@value',
+				'normalized' => '#id@value', // Escapes decoded.
+				'value' => 'id@value', // Decoded value.
+			),
+			// Whitespace with \r\n.
+			array(
+				'type' => CSSProcessor::TOKEN_WHITESPACE,
+				'raw' => "\r\n",
+				'normalized' => "\n",
+				'value' => null,
+			),
+			// Opening brace.
+			array(
+				'type' => CSSProcessor::TOKEN_LEFT_BRACE,
+				'raw' => '{',
+				'normalized' => '{',
+				'value' => null,
+			),
+			// Whitespace with \r\n and tab (consumed together).
+			array(
+				'type' => CSSProcessor::TOKEN_WHITESPACE,
+				'raw' => "\r\n\t",
+				'normalized' => "\n\t",
+				'value' => null,
+			),
+			array(
+				'type' => CSSProcessor::TOKEN_IDENT,
+				'raw' => 'background',
+				'normalized' => 'background',
+				'value' => 'background',
+			),
+			// Colon.
+			array(
+				'type' => CSSProcessor::TOKEN_COLON,
+				'raw' => ':',
+				'normalized' => ':',
+				'value' => null,
+			),
+			// Whitespace with \f.
+			array(
+				'type' => CSSProcessor::TOKEN_WHITESPACE,
+				'raw' => "\f",
+				'normalized' => "\n",
+				'value' => null,
+			),
+			// URL token with escapes (entire url(...) is one token).
+			array(
+				'type' => CSSProcessor::TOKEN_URL,
+				'raw' => 'url(path\\2f to\\2f image.png)',
+				'normalized' => 'url(path/to/image.png)', // Escapes decoded.
+				'value' => 'path/to/image.png', // Decoded: \2f → /, spaces consumed.
+			),
+			// Semicolon.
+			array(
+				'type' => CSSProcessor::TOKEN_SEMICOLON,
+				'raw' => ';',
+				'normalized' => ';',
+				'value' => null,
+			),
+			// Whitespace with \r\n and tab (consumed together).
+			array(
+				'type' => CSSProcessor::TOKEN_WHITESPACE,
+				'raw' => "\r\n\t",
+				'normalized' => "\n\t",
+				'value' => null,
+			),
+			array(
+				'type' => CSSProcessor::TOKEN_IDENT,
+				'raw' => 'content',
+				'normalized' => 'content',
+				'value' => 'content',
+			),
+			// Colon.
+			array(
+				'type' => CSSProcessor::TOKEN_COLON,
+				'raw' => ':',
+				'normalized' => ':',
+				'value' => null,
+			),
+			// Whitespace with \r.
+			array(
+				'type' => CSSProcessor::TOKEN_WHITESPACE,
+				'raw' => "\r",
+				'normalized' => "\n",
+				'value' => null,
+			),
+			// String with escape (\A = newline, space consumed).
+			array(
+				'type' => CSSProcessor::TOKEN_STRING,
+				'raw' => '"text\\A string"',
+				'normalized' => "\"text\nstring\"", // Escapes decoded, quotes preserved.
+				'value' => "text\nstring", // \A → \n, space consumed.
+			),
+			// Semicolon.
+			array(
+				'type' => CSSProcessor::TOKEN_SEMICOLON,
+				'raw' => ';',
+				'normalized' => ';',
+				'value' => null,
+			),
+			// Whitespace with \r\n.
+			array(
+				'type' => CSSProcessor::TOKEN_WHITESPACE,
+				'raw' => "\r\n",
+				'normalized' => "\n",
+				'value' => null,
+			),
+			// Closing brace.
+			array(
+				'type' => CSSProcessor::TOKEN_RIGHT_BRACE,
+				'raw' => '}',
+				'normalized' => '}',
+				'value' => null,
+			),
+		);
+
+		$processor = CSSProcessor::create( $css );
+		$actual_tokens = $this->collect_tokens( $processor, ['type', 'raw', 'normalized', 'value'] );
+		$this->assertSame( $expected, $actual_tokens );
+	}
+
+	public function test_dimension_token_value(): void {
+		$css = '10px;15em;20%;30pt;40pc;50vw;';
+		$expected = array(
+			array(
+				'type' => CSSProcessor::TOKEN_DIMENSION,
+				'raw' => '10px',
+				'normalized' => '10px',
+				'value' => '10',
+				'unit' => 'px',
+			),
+			array(
+				'type' => CSSProcessor::TOKEN_SEMICOLON,
+				'raw' => ';',
+				'normalized' => ';',
+				'value' => null,
+			),
+			array(
+				'type' => CSSProcessor::TOKEN_DIMENSION,
+				'raw' => '15em',
+				'normalized' => '15em',
+				'value' => '15',
+				'unit' => 'em',
+			),
+			array(
+				'type' => CSSProcessor::TOKEN_SEMICOLON,
+				'raw' => ';',
+				'normalized' => ';',
+				'value' => null,
+			),
+			array(
+				'type' => CSSProcessor::TOKEN_PERCENTAGE,
+				'raw' => '20%',
+				'normalized' => '20%',
+				'value' => '20',
+			),
+			array(
+				'type' => CSSProcessor::TOKEN_SEMICOLON,
+				'raw' => ';',
+				'normalized' => ';',
+				'value' => null,
+			),
+			array(
+				'type' => CSSProcessor::TOKEN_DIMENSION,
+				'raw' => '30pt',
+				'normalized' => '30pt',
+				'value' => '30',
+				'unit' =>
'pt', + ), + array( + 'type' => CSSProcessor::TOKEN_SEMICOLON, + 'raw' => ';', + 'normalized' => ';', + 'value' => null, + ), + array( + 'type' => CSSProcessor::TOKEN_DIMENSION, + 'raw' => '40pc', + 'normalized' => '40pc', + 'value' => '40', + 'unit' => 'pc', + ), + array( + 'type' => CSSProcessor::TOKEN_SEMICOLON, + 'raw' => ';', + 'normalized' => ';', + 'value' => null, + ), + array( + 'type' => CSSProcessor::TOKEN_DIMENSION, + 'raw' => '50vw', + 'normalized' => '50vw', + 'value' => '50', + 'unit' => 'vw', + ), + array( + 'type' => CSSProcessor::TOKEN_SEMICOLON, + 'raw' => ';', + 'normalized' => ';', + 'value' => null, + ), + ); + $processor = CSSProcessor::create( $css ); + $actual_tokens = $this->collect_tokens( $processor, ['type', 'raw', 'normalized', 'value', 'unit'] ); + $this->assertSame( $expected, $actual_tokens ); + } + + /** + * Tests that create() validates encoding and only accepts UTF-8. + */ + public function test_create_validates_encoding(): void { + // UTF-8 encoding should work (default). + $processor = CSSProcessor::create( '.class { color: red; }' ); + $this->assertInstanceOf( CSSProcessor::class, $processor ); + + // UTF-8 encoding should work (explicit). + $processor = CSSProcessor::create( '.class { color: red; }', 'UTF-8' ); + $this->assertInstanceOf( CSSProcessor::class, $processor ); + + // Other encodings should return null. + $processor = CSSProcessor::create( '.class { color: red; }', 'ISO-8859-1' ); + $this->assertNull( $processor ); + + $processor = CSSProcessor::create( '.class { color: red; }', 'Windows-1252' ); + $this->assertNull( $processor ); + } + + /** + * Tests escape sequences in unusual and edge-case positions. + * + * Covers: + * - Multiple consecutive escapes + * - Escapes in function names + * - Escapes in at-keywords + * - Escapes in dimension units + * - Null byte escapes (\0) + * - Escaped special characters (@, #, !, etc.) 
+ * - Escaped whitespace that gets consumed by the escape + * - Unicode escapes for various characters + */ + public function test_escape_sequences_in_unusual_places(): void { + // Complex CSS with escapes in many unusual but valid positions. + $css = '@\\6D edia ' . // @media with \6D (m) and space consumed + '\\73 creen ' . // screen with \73 (s) and space consumed + '{' . + ' .\\63 l\\61 ss\\5F name ' . // .class_name with escapes and spaces consumed + "#\\69 d\\5C 0test\x00 " . // #id\0test: \5C is an escaped backslash, so the literal text "\0" must be preserved + // AND an actual null byte (should be replaced with a U+FFFD REPLACEMENT CHARACTER) + '{' . + ' c\\6F lor: ' . // color: with \6F (o) and space consumed + 'r\\65 d ' . // red with escape + '\\21 important;' . // !important with escaped ! + ' w\\69 dth: ' . // width: + '10\\70 x;' . // 10px (dimension with escaped unit) + ' background: ' . + '\\75 rl(' . // url( with escaped u + '"p\\61 th\\2F img\\2E png"' . // "path/img.png" with escapes + ');' . + ' content: "\\5C \\5C ";' . // "\\ \\" - escaped backslashes + ' font-family: \\22 Arial\\22 ;' . // "Arial" with escaped quotes + ' }' . + '}'; + + $expected = array( + // @\6D edia -> @media + array( + 'type' => CSSProcessor::TOKEN_AT_KEYWORD, + 'raw' => '@\\6D edia', + 'normalized' => '@media', + 'value' => 'media', + ), + array( + 'type' => CSSProcessor::TOKEN_WHITESPACE, + 'raw' => ' ', + 'normalized' => ' ', + 'value' => null, + ), + // \73 creen -> screen + array( + 'type' => CSSProcessor::TOKEN_IDENT, + 'raw' => '\\73 creen', + 'normalized' => 'screen', + 'value' => 'screen', + ), + array( + 'type' => CSSProcessor::TOKEN_WHITESPACE, + 'raw' => ' ', + 'normalized' => ' ', + 'value' => null, + ), + array( + 'type' => CSSProcessor::TOKEN_LEFT_BRACE, + 'raw' => '{', + 'normalized' => '{', + 'value' => null, + ), + array( + 'type' => CSSProcessor::TOKEN_WHITESPACE, + 'raw' => ' ', + 'normalized' => ' ', + 'value' => null, + ), + // Delimiter . 
+ array( + 'type' => CSSProcessor::TOKEN_DELIM, + 'raw' => '.', + 'normalized' => '.', + 'value' => '.', + ), + // \63 l\61 ss\5F name -> class_name + array( + 'type' => CSSProcessor::TOKEN_IDENT, + 'raw' => '\\63 l\\61 ss\\5F name', + 'normalized' => 'class_name', + 'value' => 'class_name', + ), + array( + 'type' => CSSProcessor::TOKEN_WHITESPACE, + 'raw' => ' ', + 'normalized' => ' ', + 'value' => null, + ), + // #\69 d\5C 0test -> #id\0test (with encoded null byte) + array( + 'type' => CSSProcessor::TOKEN_HASH, + 'raw' => "#\\69 d\\5C 0test\x00", + 'normalized' => "#id\\0test�", + // Ensure the value is normalized. + 'value' => "id\\0test�", + ), + array( + 'type' => CSSProcessor::TOKEN_WHITESPACE, + 'raw' => ' ', + 'normalized' => ' ', + 'value' => null, + ), + array( + 'type' => CSSProcessor::TOKEN_LEFT_BRACE, + 'raw' => '{', + 'normalized' => '{', + 'value' => null, + ), + array( + 'type' => CSSProcessor::TOKEN_WHITESPACE, + 'raw' => ' ', + 'normalized' => ' ', + 'value' => null, + ), + // c\6F lor -> color + array( + 'type' => CSSProcessor::TOKEN_IDENT, + 'raw' => 'c\\6F lor', + 'normalized' => 'color', + 'value' => 'color', + ), + array( + 'type' => CSSProcessor::TOKEN_COLON, + 'raw' => ':', + 'normalized' => ':', + 'value' => null, + ), + array( + 'type' => CSSProcessor::TOKEN_WHITESPACE, + 'raw' => ' ', + 'normalized' => ' ', + 'value' => null, + ), + // r\65 d -> red + array( + 'type' => CSSProcessor::TOKEN_IDENT, + 'raw' => 'r\\65 d', + 'normalized' => 'red', + 'value' => 'red', + ), + array( + 'type' => CSSProcessor::TOKEN_WHITESPACE, + 'raw' => ' ', + 'normalized' => ' ', + 'value' => null, + ), + // \21 important -> !important (single identifier with escaped !) 
+ array( + 'type' => CSSProcessor::TOKEN_IDENT, + 'raw' => '\\21 important', + 'normalized' => '!important', + 'value' => '!important', + ), + array( + 'type' => CSSProcessor::TOKEN_SEMICOLON, + 'raw' => ';', + 'normalized' => ';', + 'value' => null, + ), + array( + 'type' => CSSProcessor::TOKEN_WHITESPACE, + 'raw' => ' ', + 'normalized' => ' ', + 'value' => null, + ), + // w\69 dth -> width + array( + 'type' => CSSProcessor::TOKEN_IDENT, + 'raw' => 'w\\69 dth', + 'normalized' => 'width', + 'value' => 'width', + ), + array( + 'type' => CSSProcessor::TOKEN_COLON, + 'raw' => ':', + 'normalized' => ':', + 'value' => null, + ), + array( + 'type' => CSSProcessor::TOKEN_WHITESPACE, + 'raw' => ' ', + 'normalized' => ' ', + 'value' => null, + ), + // 10\70 x -> 10px (dimension with escaped unit) + array( + 'type' => CSSProcessor::TOKEN_DIMENSION, + 'raw' => '10\\70 x', + 'normalized' => '10px', + 'value' => '10', + ), + array( + 'type' => CSSProcessor::TOKEN_SEMICOLON, + 'raw' => ';', + 'normalized' => ';', + 'value' => null, + ), + array( + 'type' => CSSProcessor::TOKEN_WHITESPACE, + 'raw' => ' ', + 'normalized' => ' ', + 'value' => null, + ), + array( + 'type' => CSSProcessor::TOKEN_IDENT, + 'raw' => 'background', + 'normalized' => 'background', + 'value' => 'background', + ), + array( + 'type' => CSSProcessor::TOKEN_COLON, + 'raw' => ':', + 'normalized' => ':', + 'value' => null, + ), + array( + 'type' => CSSProcessor::TOKEN_WHITESPACE, + 'raw' => ' ', + 'normalized' => ' ', + 'value' => null, + ), + // \75 rl( -> url( (escaped function name) + array( + 'type' => CSSProcessor::TOKEN_FUNCTION, + 'raw' => '\\75 rl(', + 'normalized' => 'url(', + 'value' => 'url', + ), + // String with escapes: "p\61 th\2F img\2E png" + array( + 'type' => CSSProcessor::TOKEN_STRING, + 'raw' => '"p\\61 th\\2F img\\2E png"', + 'normalized' => '"path/img.png"', + 'value' => 'path/img.png', + ), + array( + 'type' => CSSProcessor::TOKEN_RIGHT_PAREN, + 'raw' => ')', + 'normalized' => ')', + 
'value' => null, + ), + array( + 'type' => CSSProcessor::TOKEN_SEMICOLON, + 'raw' => ';', + 'normalized' => ';', + 'value' => null, + ), + array( + 'type' => CSSProcessor::TOKEN_WHITESPACE, + 'raw' => ' ', + 'normalized' => ' ', + 'value' => null, + ), + array( + 'type' => CSSProcessor::TOKEN_IDENT, + 'raw' => 'content', + 'normalized' => 'content', + 'value' => 'content', + ), + array( + 'type' => CSSProcessor::TOKEN_COLON, + 'raw' => ':', + 'normalized' => ':', + 'value' => null, + ), + array( + 'type' => CSSProcessor::TOKEN_WHITESPACE, + 'raw' => ' ', + 'normalized' => ' ', + 'value' => null, + ), + // String with escaped backslashes: "\5C \5C " -> "\\" + // Each \5C sequence (with trailing space consumed) becomes one backslash + array( + 'type' => CSSProcessor::TOKEN_STRING, + 'raw' => '"\\5C \\5C "', + 'normalized' => '"\\\\"', + 'value' => '\\\\', // Two backslashes total + ), + array( + 'type' => CSSProcessor::TOKEN_SEMICOLON, + 'raw' => ';', + 'normalized' => ';', + 'value' => null, + ), + array( + 'type' => CSSProcessor::TOKEN_WHITESPACE, + 'raw' => ' ', + 'normalized' => ' ', + 'value' => null, + ), + array( + 'type' => CSSProcessor::TOKEN_IDENT, + 'raw' => 'font-family', + 'normalized' => 'font-family', + 'value' => 'font-family', + ), + array( + 'type' => CSSProcessor::TOKEN_COLON, + 'raw' => ':', + 'normalized' => ':', + 'value' => null, + ), + array( + 'type' => CSSProcessor::TOKEN_WHITESPACE, + 'raw' => ' ', + 'normalized' => ' ', + 'value' => null, + ), + // \22 Arial\22 -> "Arial" (escaped quotes make it an ident) + array( + 'type' => CSSProcessor::TOKEN_IDENT, + 'raw' => '\\22 Arial\\22 ', + 'normalized' => '"Arial"', + 'value' => '"Arial"', + ), + array( + 'type' => CSSProcessor::TOKEN_SEMICOLON, + 'raw' => ';', + 'normalized' => ';', + 'value' => null, + ), + array( + 'type' => CSSProcessor::TOKEN_WHITESPACE, + 'raw' => ' ', + 'normalized' => ' ', + 'value' => null, + ), + array( + 'type' => CSSProcessor::TOKEN_RIGHT_BRACE, + 'raw' => '}', + 
'normalized' => '}', + 'value' => null, + ), + array( + 'type' => CSSProcessor::TOKEN_RIGHT_BRACE, + 'raw' => '}', + 'normalized' => '}', + 'value' => null, + ), + ); + + $processor = CSSProcessor::create( $css ); + $actual_tokens = $this->collect_tokens( $processor, ['type', 'raw', 'normalized', 'value'] ); + $this->assertSame( $expected, $actual_tokens ); + } + + /** + * Tests that set_token_value() only works on URL tokens. + * + * Walks every token of a small declaration list and checks the boolean returned by set_token_value(). + */ + public function test_set_token_value_only_works_on_url_tokens(): void { + $css = 'color: red; background: url(old.jpg);'; + $processor = CSSProcessor::create( $css ); + + while ( $processor->next_token() ) { + $token_type = $processor->get_token_type(); + + if ( CSSProcessor::TOKEN_URL === $token_type ) { + // Should succeed on URL tokens. + $this->assertTrue( $processor->set_token_value( 'new.jpg' ) ); + } else { + // Should fail on non-URL tokens. + $this->assertFalse( $processor->set_token_value( 'test' ) ); + } + } + + // Verify the update was applied; the unquoted url(old.jpg) is expected back as a quoted url("new.jpg"). + $updated = $processor->get_updated_css(); + $this->assertSame( 'color: red; background: url("new.jpg");', $updated ); + } + + /** + * Tests that set_token_value() properly escapes special characters in quoted URLs. + */ + public function test_set_token_value_escapes_special_characters(): void { + $css = 'background: url(old.jpg);'; + $processor = CSSProcessor::create( $css ); + + while ( $processor->next_token() ) { + if ( CSSProcessor::TOKEN_URL === $processor->get_token_type() ) { + // URL with spaces, quotes, and parentheses. + $processor->set_token_value( 'path with spaces("special").jpg' ); + } + } + + $updated = $processor->get_updated_css(); + + // Double quotes are expected as the hex escape "\22 "; spaces and parentheses stay verbatim inside the quoted URL. + $this->assertSame( 'background: url("path with spaces(\\22 special\\22 ).jpg");', $updated ); + } + + /** + * Tests that set_token_value() preserves Unicode characters in quoted URLs. 
+ */ + public function test_set_token_value_encodes_unicode(): void { + $css = 'background: url(old.jpg);'; + $processor = CSSProcessor::create( $css ); + + while ( $processor->next_token() ) { + if ( CSSProcessor::TOKEN_URL === $processor->get_token_type() ) { + // URL with Unicode characters: "测试.jpg" (Chinese characters). + $processor->set_token_value( '测试.jpg' ); + } + } + + $updated = $processor->get_updated_css(); + + // Despite "encodes" in the test name, the expected output keeps the characters verbatim (no escaping). + $this->assertSame( 'background: url("测试.jpg");', $updated ); + } + + /** + * Tests that set_token_value() preserves emoji in quoted URLs. + */ + public function test_set_token_value_handles_emoji(): void { + $css = 'background: url(old.jpg);'; + $processor = CSSProcessor::create( $css ); + + while ( $processor->next_token() ) { + if ( CSSProcessor::TOKEN_URL === $processor->get_token_type() ) { + // URL with emoji: "image😀.jpg". + $processor->set_token_value( 'image😀.jpg' ); + } + } + + $updated = $processor->get_updated_css(); + + // The emoji is expected to appear unescaped inside the quoted URL. + $this->assertSame( 'background: url("image😀.jpg");', $updated ); + } + + /** + * Tests that multiple URL values can be updated in the same CSS. + */ + public function test_set_token_value_multiple_urls(): void { + $css = 'background: url(old1.jpg); border-image: url(old2.png);'; + $processor = CSSProcessor::create( $css ); + + // Counts the URL tokens seen so far so that each one receives its own replacement value. + $url_count = 0; + while ( $processor->next_token() ) { + if ( CSSProcessor::TOKEN_URL === $processor->get_token_type() ) { + $url_count++; + if ( 1 === $url_count ) { + $processor->set_token_value( 'new1.jpg' ); + } elseif ( 2 === $url_count ) { + $processor->set_token_value( 'new2.png' ); + } + } + } + + $updated = $processor->get_updated_css(); + + // Verify both URLs were updated. + $this->assertSame( 'background: url("new1.jpg"); border-image: url("new2.png");', $updated ); + } + + /** + * Tests that newlines are properly escaped in quoted URLs. 
+ */ + public function test_set_token_value_escapes_control_characters(): void { + $css = 'background: url(old.jpg);'; + $processor = CSSProcessor::create( $css ); + + while ( $processor->next_token() ) { + if ( CSSProcessor::TOKEN_URL === $processor->get_token_type() ) { + // URL containing a line feed, a carriage return, and a form feed; the expected CSS below has each one as the escape "\a ". + $processor->set_token_value( "path\nwith\rnewlines\ftest.jpg" ); + } + } + + $updated = $processor->get_updated_css(); + + $this->assertSame( 'background: url("path\\a with\\a newlines\\a test.jpg");', $updated ); + } + + /** + * Tests that backslashes are properly escaped in quoted URLs. + */ + public function test_set_token_value_escapes_backslashes(): void { + $css = 'background: url(old.jpg);'; + $processor = CSSProcessor::create( $css ); + + while ( $processor->next_token() ) { + if ( CSSProcessor::TOKEN_URL === $processor->get_token_type() ) { + $processor->set_token_value( 'path\\with\\backslashes.jpg' ); + } + } + + $updated = $processor->get_updated_css(); + + // Verify each backslash is written as the hex escape "\5C " (with its terminating space). + $this->assertSame( 'background: url("path\\5C with\\5C backslashes.jpg");', $updated ); + } + + /** + * Tests that get_updated_css() returns original CSS when no changes are made. + */ + public function test_get_updated_css_returns_original_when_unchanged(): void { + $css = 'background: url(image.jpg); color: red;'; + $processor = CSSProcessor::create( $css ); + + // Iterate through tokens without making changes. + while ( $processor->next_token() ) { + // Do nothing. + } + + $updated = $processor->get_updated_css(); + + // Should return original CSS. + $this->assertSame( $css, $updated ); + } + + /** + * Tests that safe ASCII characters are preserved in quoted URLs. 
+ */ + public function test_set_token_value_preserves_safe_characters(): void { + $css = 'background: url(old.jpg);'; + $processor = CSSProcessor::create( $css ); + + while ( $processor->next_token() ) { + if ( CSSProcessor::TOKEN_URL === $processor->get_token_type() ) { + // URL with safe characters: letters, digits, hyphens, underscores, dots, slashes. + $processor->set_token_value( 'path/to/my-image_2024.jpg' ); + } + } + + $updated = $processor->get_updated_css(); + + // Verify safe characters are preserved as-is in quoted URL. + $this->assertSame( 'background: url("path/to/my-image_2024.jpg");', $updated ); + } + + /** + * Tests that set_token_value() passes an invalid UTF-8 byte through unchanged into the quoted URL. + */ + public function test_set_token_with_invalid_utf8_sequence(): void { + $css = 'background: url(old.jpg);'; + $processor = CSSProcessor::create( $css ); + + while ( $processor->next_token() ) { + if ( CSSProcessor::TOKEN_URL === $processor->get_token_type() ) { + // URL starting with a lone 0xC0 byte, which is never valid in UTF-8. + $processor->set_token_value( "\xC0.jpg" ); + } + } + + $updated = $processor->get_updated_css(); + + // Invalid UTF-8 sequence is preserved as-is – garbage in, garbage out. + $this->assertSame( "background: url(\"\xC0.jpg\");", $updated ); + } + + +} diff --git a/components/DataLiberation/Tests/CSSTokenizerTest.php b/components/DataLiberation/Tests/CSSTokenizerTest.php deleted file mode 100644 index 1c568c2a..00000000 --- a/components/DataLiberation/Tests/CSSTokenizerTest.php +++ /dev/null @@ -1,1330 +0,0 @@ -collect_tokens( $processor ); - $this->assertSame( $expected_tokens, $actual_tokens ); - } - - /** - * Provides the test cases from the @rmenke/css-tokenizer-test test corpus. - * - * @see https://github.com/romainmenke/css-tokenizer-tests/ - * @return array - */ - static public function corpus_provider(): array { - return json_decode(file_get_contents(__DIR__ . 
'/css-test-cases.json'), true); - } - - /** - * Collects all tokens from a CSS processor into an array. - * - * @param CSSTokenizer $processor The CSS processor. - * @return array Array of tokens with type, raw, startIndex, endIndex, structured. - */ - static public function collect_tokens( CSSTokenizer $processor, $keys = null ): array { - $tokens = array(); - - while ( $processor->next_token() ) { - $type = $processor->get_token_type(); - - $byte_start = $processor->get_token_start(); - $byte_end = $byte_start + $processor->get_token_length(); - - $token = array( - 'type' => $type, - 'raw' => $processor->get_unnormalized_token(), - 'startIndex' => $byte_start, - 'endIndex' => $byte_end, - 'normalized' => $processor->get_normalized_token(), - 'value' => $processor->get_token_value(), - ); - if ( null !== $processor->get_token_unit() ) { - $token['unit'] = $processor->get_token_unit(); - } - - if ( null !== $keys ) { - $token = array_intersect_key( $token, array_flip( $keys ) ); - } - - $tokens[] = $token; - } - - return $tokens; - } - - /** - * Tests handling of non-UTF-8 byte sequences in identifiers. - * - * Invalid UTF-8 sequences should be replaced with U+FFFD replacement characters - * during tokenization, allowing the CSS to continue processing. - */ - public function test_non_utf8_sequences_in_identifiers(): void { - // Invalid UTF-8 sequence 0xC0 0x80 (overlong encoding). - $css = ".class\xF1name"; - - $expected = array( - // .class�name (0xF1 replaced with U+FFFD). 
- array( - 'type' => CSSTokenizer::TOKEN_DELIM, - 'raw' => '.', - 'normalized' => '.', - 'value' => '.', - ), - array( - 'type' => CSSTokenizer::TOKEN_IDENT, - 'raw' => "class\xF1name", - 'normalized' => 'class�name', - 'value' => 'class�name', - ), - ); - - $processor = CSSTokenizer::create( $css ); - $actual_tokens = $this->collect_tokens( $processor, ['type', 'raw', 'normalized', 'value', 'unit'] ); - $this->assertSame( $expected, $actual_tokens ); - } - - public function test_invalid_utf8_with_valid_prefix_in_identifiers(): void { - // Invalid 2-byte prefix is replaced with a single U+FFFD. - $css = ".test\xE2\x80name"; - - $expected = array( - array( - 'type' => CSSTokenizer::TOKEN_DELIM, - 'raw' => '.', - 'normalized' => '.', - 'value' => '.', - ), - array( - 'type' => CSSTokenizer::TOKEN_IDENT, - 'raw' => "test\xE2\x80name", - 'normalized' => 'test�name', - 'value' => 'test�name', - ), - ); - - $processor = CSSTokenizer::create( $css ); - $actual_tokens = $this->collect_tokens( $processor, ['type', 'raw', 'normalized', 'value'] ); - $this->assertSame( $expected, $actual_tokens ); - } - - public function test_invalid_utf8_with_two_single_byte_invalid_sequences(): void { - // Two distinct single byte invalid sequences are replaced with - // two separate U+FFFD replacement characters. - $css = ".test\xE2\xE2name"; - - $expected = array( - array( - 'type' => CSSTokenizer::TOKEN_DELIM, - 'raw' => '.', - 'normalized' => '.', - 'value' => '.', - ), - array( - 'type' => CSSTokenizer::TOKEN_IDENT, - 'raw' => "test\xE2\xE2name", - 'normalized' => 'test��name', - 'value' => 'test��name', - ), - ); - - $processor = CSSTokenizer::create( $css ); - $actual_tokens = $this->collect_tokens( $processor, ['type', 'raw', 'normalized', 'value'] ); - $this->assertSame( $expected, $actual_tokens ); - } - - /** - * Legacy test to ensure basic tokenization still works. 
- */ - public function test_tokenize_labels_core_tokens(): void { - $css = << CSSTokenizer::TOKEN_AT_KEYWORD, 'raw' => '@media' ), - array( 'type' => CSSTokenizer::TOKEN_WHITESPACE, 'raw' => ' ' ), - array( 'type' => CSSTokenizer::TOKEN_IDENT, 'raw' => 'screen' ), - array( 'type' => CSSTokenizer::TOKEN_WHITESPACE, 'raw' => ' ' ), - array( 'type' => CSSTokenizer::TOKEN_IDENT, 'raw' => 'and' ), - array( 'type' => CSSTokenizer::TOKEN_WHITESPACE, 'raw' => ' ' ), - array( 'type' => CSSTokenizer::TOKEN_LEFT_PAREN, 'raw' => '(' ), - array( 'type' => CSSTokenizer::TOKEN_IDENT, 'raw' => 'min-width' ), - array( 'type' => CSSTokenizer::TOKEN_COLON, 'raw' => ':' ), - array( 'type' => CSSTokenizer::TOKEN_WHITESPACE, 'raw' => ' ' ), - array( 'type' => CSSTokenizer::TOKEN_DIMENSION, 'raw' => '10px' ), - array( 'type' => CSSTokenizer::TOKEN_RIGHT_PAREN, 'raw' => ')' ), - array( 'type' => CSSTokenizer::TOKEN_WHITESPACE, 'raw' => ' ' ), - array( 'type' => CSSTokenizer::TOKEN_LEFT_BRACE, 'raw' => '{' ), - array( 'type' => CSSTokenizer::TOKEN_WHITESPACE, 'raw' => "\n\t" ), - array( 'type' => CSSTokenizer::TOKEN_IDENT, 'raw' => 'background' ), - array( 'type' => CSSTokenizer::TOKEN_COLON, 'raw' => ':' ), - array( 'type' => CSSTokenizer::TOKEN_WHITESPACE, 'raw' => ' ' ), - array( 'type' => CSSTokenizer::TOKEN_FUNCTION, 'raw' => 'url(' ), - array( 'type' => CSSTokenizer::TOKEN_STRING, 'raw' => '"/images/a.png"' ), - array( 'type' => CSSTokenizer::TOKEN_RIGHT_PAREN, 'raw' => ')' ), - array( 'type' => CSSTokenizer::TOKEN_SEMICOLON, 'raw' => ';' ), - array( 'type' => CSSTokenizer::TOKEN_WHITESPACE, 'raw' => "\n" ), - array( 'type' => CSSTokenizer::TOKEN_RIGHT_BRACE, 'raw' => '}' ), - ); - - $processor = CSSTokenizer::create( $css ); - $actual_tokens = $this->collect_tokens( $processor, ['type', 'raw'] ); - $this->assertSame( $actual_tokens, $expected ); - } - - /** - * Tests tokenization of complex selectors with pseudo-classes. 
- */ - public function test_complex_selector_with_pseudo_classes(): void { - $css = 'a:hover::before, div.class#id:not(.disabled)'; - - $expected = array( - array( 'type' => CSSTokenizer::TOKEN_IDENT, 'raw' => 'a' ), - array( 'type' => CSSTokenizer::TOKEN_COLON, 'raw' => ':' ), - array( 'type' => CSSTokenizer::TOKEN_IDENT, 'raw' => 'hover' ), - array( 'type' => CSSTokenizer::TOKEN_COLON, 'raw' => ':' ), - array( 'type' => CSSTokenizer::TOKEN_COLON, 'raw' => ':' ), - array( 'type' => CSSTokenizer::TOKEN_IDENT, 'raw' => 'before' ), - array( 'type' => CSSTokenizer::TOKEN_COMMA, 'raw' => ',' ), - array( 'type' => CSSTokenizer::TOKEN_WHITESPACE, 'raw' => ' ' ), - array( 'type' => CSSTokenizer::TOKEN_IDENT, 'raw' => 'div' ), - array( 'type' => CSSTokenizer::TOKEN_DELIM, 'raw' => '.' ), - array( 'type' => CSSTokenizer::TOKEN_IDENT, 'raw' => 'class' ), - array( 'type' => CSSTokenizer::TOKEN_HASH, 'raw' => '#id' ), - array( 'type' => CSSTokenizer::TOKEN_COLON, 'raw' => ':' ), - array( 'type' => CSSTokenizer::TOKEN_FUNCTION, 'raw' => 'not(' ), - array( 'type' => CSSTokenizer::TOKEN_DELIM, 'raw' => '.' ), - array( 'type' => CSSTokenizer::TOKEN_IDENT, 'raw' => 'disabled' ), - array( 'type' => CSSTokenizer::TOKEN_RIGHT_PAREN, 'raw' => ')' ), - ); - - $processor = CSSTokenizer::create( $css ); - $actual_tokens = $this->collect_tokens( $processor, ['type', 'raw'] ); - $this->assertSame( $actual_tokens, $expected ); - } - - /** - * Tests tokenization of CSS comments. - */ - public function test_css_comments(): void { - $css = '/* This is a comment */ .class { color: red; /* Another comment */ }'; - - $expected = array( - array( 'type' => CSSTokenizer::TOKEN_COMMENT, 'raw' => '/* This is a comment */' ), - array( 'type' => CSSTokenizer::TOKEN_WHITESPACE, 'raw' => ' ' ), - array( 'type' => CSSTokenizer::TOKEN_DELIM, 'raw' => '.' 
), - array( 'type' => CSSTokenizer::TOKEN_IDENT, 'raw' => 'class' ), - array( 'type' => CSSTokenizer::TOKEN_WHITESPACE, 'raw' => ' ' ), - array( 'type' => CSSTokenizer::TOKEN_LEFT_BRACE, 'raw' => '{' ), - array( 'type' => CSSTokenizer::TOKEN_WHITESPACE, 'raw' => ' ' ), - array( 'type' => CSSTokenizer::TOKEN_IDENT, 'raw' => 'color' ), - array( 'type' => CSSTokenizer::TOKEN_COLON, 'raw' => ':' ), - array( 'type' => CSSTokenizer::TOKEN_WHITESPACE, 'raw' => ' ' ), - array( 'type' => CSSTokenizer::TOKEN_IDENT, 'raw' => 'red' ), - array( 'type' => CSSTokenizer::TOKEN_SEMICOLON, 'raw' => ';' ), - array( 'type' => CSSTokenizer::TOKEN_WHITESPACE, 'raw' => ' ' ), - array( 'type' => CSSTokenizer::TOKEN_COMMENT, 'raw' => '/* Another comment */' ), - array( 'type' => CSSTokenizer::TOKEN_WHITESPACE, 'raw' => ' ' ), - array( 'type' => CSSTokenizer::TOKEN_RIGHT_BRACE, 'raw' => '}' ), - ); - - $processor = CSSTokenizer::create( $css ); - $actual_tokens = $this->collect_tokens( $processor, ['type', 'raw'] ); - $this->assertSame( $actual_tokens, $expected ); - } - - /** - * Tests tokenization of media queries. 
- */ - public function test_media_query(): void { - $css = '@media screen and (min-width: 768px) and (max-width: 1024px)'; - - $expected = array( - array( 'type' => CSSTokenizer::TOKEN_AT_KEYWORD, 'raw' => '@media' ), - array( 'type' => CSSTokenizer::TOKEN_WHITESPACE, 'raw' => ' ' ), - array( 'type' => CSSTokenizer::TOKEN_IDENT, 'raw' => 'screen' ), - array( 'type' => CSSTokenizer::TOKEN_WHITESPACE, 'raw' => ' ' ), - array( 'type' => CSSTokenizer::TOKEN_IDENT, 'raw' => 'and' ), - array( 'type' => CSSTokenizer::TOKEN_WHITESPACE, 'raw' => ' ' ), - array( 'type' => CSSTokenizer::TOKEN_LEFT_PAREN, 'raw' => '(' ), - array( 'type' => CSSTokenizer::TOKEN_IDENT, 'raw' => 'min-width' ), - array( 'type' => CSSTokenizer::TOKEN_COLON, 'raw' => ':' ), - array( 'type' => CSSTokenizer::TOKEN_WHITESPACE, 'raw' => ' ' ), - array( 'type' => CSSTokenizer::TOKEN_DIMENSION, 'raw' => '768px' ), - array( 'type' => CSSTokenizer::TOKEN_RIGHT_PAREN, 'raw' => ')' ), - array( 'type' => CSSTokenizer::TOKEN_WHITESPACE, 'raw' => ' ' ), - array( 'type' => CSSTokenizer::TOKEN_IDENT, 'raw' => 'and' ), - array( 'type' => CSSTokenizer::TOKEN_WHITESPACE, 'raw' => ' ' ), - array( 'type' => CSSTokenizer::TOKEN_LEFT_PAREN, 'raw' => '(' ), - array( 'type' => CSSTokenizer::TOKEN_IDENT, 'raw' => 'max-width' ), - array( 'type' => CSSTokenizer::TOKEN_COLON, 'raw' => ':' ), - array( 'type' => CSSTokenizer::TOKEN_WHITESPACE, 'raw' => ' ' ), - array( 'type' => CSSTokenizer::TOKEN_DIMENSION, 'raw' => '1024px' ), - array( 'type' => CSSTokenizer::TOKEN_RIGHT_PAREN, 'raw' => ')' ), - ); - - $processor = CSSTokenizer::create( $css ); - $actual_tokens = $this->collect_tokens( $processor, ['type', 'raw'] ); - $this->assertSame( $actual_tokens, $expected ); - } - - /** - * Tests tokenization of keyframes animation. 
- */ - public function test_keyframes_animation(): void { - $css = '@keyframes slide-in { 0% { opacity: 0; } 100% { opacity: 1; } }'; - - $expected = array( - array( 'type' => CSSTokenizer::TOKEN_AT_KEYWORD, 'raw' => '@keyframes' ), - array( 'type' => CSSTokenizer::TOKEN_WHITESPACE, 'raw' => ' ' ), - array( 'type' => CSSTokenizer::TOKEN_IDENT, 'raw' => 'slide-in' ), - array( 'type' => CSSTokenizer::TOKEN_WHITESPACE, 'raw' => ' ' ), - array( 'type' => CSSTokenizer::TOKEN_LEFT_BRACE, 'raw' => '{' ), - array( 'type' => CSSTokenizer::TOKEN_WHITESPACE, 'raw' => ' ' ), - array( 'type' => CSSTokenizer::TOKEN_PERCENTAGE, 'raw' => '0%' ), - array( 'type' => CSSTokenizer::TOKEN_WHITESPACE, 'raw' => ' ' ), - array( 'type' => CSSTokenizer::TOKEN_LEFT_BRACE, 'raw' => '{' ), - array( 'type' => CSSTokenizer::TOKEN_WHITESPACE, 'raw' => ' ' ), - array( 'type' => CSSTokenizer::TOKEN_IDENT, 'raw' => 'opacity' ), - array( 'type' => CSSTokenizer::TOKEN_COLON, 'raw' => ':' ), - array( 'type' => CSSTokenizer::TOKEN_WHITESPACE, 'raw' => ' ' ), - array( 'type' => CSSTokenizer::TOKEN_NUMBER, 'raw' => '0' ), - array( 'type' => CSSTokenizer::TOKEN_SEMICOLON, 'raw' => ';' ), - array( 'type' => CSSTokenizer::TOKEN_WHITESPACE, 'raw' => ' ' ), - array( 'type' => CSSTokenizer::TOKEN_RIGHT_BRACE, 'raw' => '}' ), - array( 'type' => CSSTokenizer::TOKEN_WHITESPACE, 'raw' => ' ' ), - array( 'type' => CSSTokenizer::TOKEN_PERCENTAGE, 'raw' => '100%' ), - array( 'type' => CSSTokenizer::TOKEN_WHITESPACE, 'raw' => ' ' ), - array( 'type' => CSSTokenizer::TOKEN_LEFT_BRACE, 'raw' => '{' ), - array( 'type' => CSSTokenizer::TOKEN_WHITESPACE, 'raw' => ' ' ), - array( 'type' => CSSTokenizer::TOKEN_IDENT, 'raw' => 'opacity' ), - array( 'type' => CSSTokenizer::TOKEN_COLON, 'raw' => ':' ), - array( 'type' => CSSTokenizer::TOKEN_WHITESPACE, 'raw' => ' ' ), - array( 'type' => CSSTokenizer::TOKEN_NUMBER, 'raw' => '1' ), - array( 'type' => CSSTokenizer::TOKEN_SEMICOLON, 'raw' => ';' ), - array( 'type' =>
CSSTokenizer::TOKEN_WHITESPACE, 'raw' => ' ' ), - array( 'type' => CSSTokenizer::TOKEN_RIGHT_BRACE, 'raw' => '}' ), - array( 'type' => CSSTokenizer::TOKEN_WHITESPACE, 'raw' => ' ' ), - array( 'type' => CSSTokenizer::TOKEN_RIGHT_BRACE, 'raw' => '}' ), - ); - - $processor = CSSTokenizer::create( $css ); - $actual_tokens = $this->collect_tokens( $processor, ['type', 'raw'] ); - // Expected value first, matching PHPUnit's assertSame( $expected, $actual ). - $this->assertSame( $expected, $actual_tokens ); - } - - /** - * Tests tokenization of vendor-prefixed properties. - */ - public function test_vendor_prefixed_properties(): void { - $css = '-webkit-transform: rotate(45deg); -moz-border-radius: 5px;'; - - $expected = array( - array( 'type' => CSSTokenizer::TOKEN_IDENT, 'raw' => '-webkit-transform' ), - array( 'type' => CSSTokenizer::TOKEN_COLON, 'raw' => ':' ), - array( 'type' => CSSTokenizer::TOKEN_WHITESPACE, 'raw' => ' ' ), - array( 'type' => CSSTokenizer::TOKEN_FUNCTION, 'raw' => 'rotate(' ), - array( 'type' => CSSTokenizer::TOKEN_DIMENSION, 'raw' => '45deg' ), - array( 'type' => CSSTokenizer::TOKEN_RIGHT_PAREN, 'raw' => ')' ), - array( 'type' => CSSTokenizer::TOKEN_SEMICOLON, 'raw' => ';' ), - array( 'type' => CSSTokenizer::TOKEN_WHITESPACE, 'raw' => ' ' ), - array( 'type' => CSSTokenizer::TOKEN_IDENT, 'raw' => '-moz-border-radius' ), - array( 'type' => CSSTokenizer::TOKEN_COLON, 'raw' => ':' ), - array( 'type' => CSSTokenizer::TOKEN_WHITESPACE, 'raw' => ' ' ), - array( 'type' => CSSTokenizer::TOKEN_DIMENSION, 'raw' => '5px' ), - array( 'type' => CSSTokenizer::TOKEN_SEMICOLON, 'raw' => ';' ), - ); - - $processor = CSSTokenizer::create( $css ); - $actual_tokens = $this->collect_tokens( $processor, ['type', 'raw'] ); - // Expected value first, matching PHPUnit's assertSame( $expected, $actual ). - $this->assertSame( $expected, $actual_tokens ); - } - - /** - * Tests tokenization of attribute selectors.
- */ - public function test_attribute_selectors(): void { - $css = 'input[type="text"][required], a[href^="https://"]'; - - $expected = array( - array( 'type' => CSSTokenizer::TOKEN_IDENT, 'raw' => 'input' ), - array( 'type' => CSSTokenizer::TOKEN_LEFT_BRACKET, 'raw' => '[' ), - array( 'type' => CSSTokenizer::TOKEN_IDENT, 'raw' => 'type' ), - array( 'type' => CSSTokenizer::TOKEN_DELIM, 'raw' => '=' ), - array( 'type' => CSSTokenizer::TOKEN_STRING, 'raw' => '"text"' ), - array( 'type' => CSSTokenizer::TOKEN_RIGHT_BRACKET, 'raw' => ']' ), - array( 'type' => CSSTokenizer::TOKEN_LEFT_BRACKET, 'raw' => '[' ), - array( 'type' => CSSTokenizer::TOKEN_IDENT, 'raw' => 'required' ), - array( 'type' => CSSTokenizer::TOKEN_RIGHT_BRACKET, 'raw' => ']' ), - array( 'type' => CSSTokenizer::TOKEN_COMMA, 'raw' => ',' ), - array( 'type' => CSSTokenizer::TOKEN_WHITESPACE, 'raw' => ' ' ), - array( 'type' => CSSTokenizer::TOKEN_IDENT, 'raw' => 'a' ), - array( 'type' => CSSTokenizer::TOKEN_LEFT_BRACKET, 'raw' => '[' ), - array( 'type' => CSSTokenizer::TOKEN_IDENT, 'raw' => 'href' ), - array( 'type' => CSSTokenizer::TOKEN_DELIM, 'raw' => '^' ), - array( 'type' => CSSTokenizer::TOKEN_DELIM, 'raw' => '=' ), - array( 'type' => CSSTokenizer::TOKEN_STRING, 'raw' => '"https://"' ), - array( 'type' => CSSTokenizer::TOKEN_RIGHT_BRACKET, 'raw' => ']' ), - ); - - $processor = CSSTokenizer::create( $css ); - $actual_tokens = $this->collect_tokens( $processor, ['type', 'raw'] ); - // Expected value first, matching PHPUnit's assertSame( $expected, $actual ). - $this->assertSame( $expected, $actual_tokens ); - } - - /** - * Tests tokenization of calc() function with complex expressions.
- */ - public function test_calc_function(): void { - $css = 'width: calc(100% - 20px * 2 + 5em);'; - - $expected = array( - array( 'type' => CSSTokenizer::TOKEN_IDENT, 'raw' => 'width' ), - array( 'type' => CSSTokenizer::TOKEN_COLON, 'raw' => ':' ), - array( 'type' => CSSTokenizer::TOKEN_WHITESPACE, 'raw' => ' ' ), - array( 'type' => CSSTokenizer::TOKEN_FUNCTION, 'raw' => 'calc(' ), - array( 'type' => CSSTokenizer::TOKEN_PERCENTAGE, 'raw' => '100%' ), - array( 'type' => CSSTokenizer::TOKEN_WHITESPACE, 'raw' => ' ' ), - array( 'type' => CSSTokenizer::TOKEN_DELIM, 'raw' => '-' ), - array( 'type' => CSSTokenizer::TOKEN_WHITESPACE, 'raw' => ' ' ), - array( 'type' => CSSTokenizer::TOKEN_DIMENSION, 'raw' => '20px' ), - array( 'type' => CSSTokenizer::TOKEN_WHITESPACE, 'raw' => ' ' ), - array( 'type' => CSSTokenizer::TOKEN_DELIM, 'raw' => '*' ), - array( 'type' => CSSTokenizer::TOKEN_WHITESPACE, 'raw' => ' ' ), - array( 'type' => CSSTokenizer::TOKEN_NUMBER, 'raw' => '2' ), - array( 'type' => CSSTokenizer::TOKEN_WHITESPACE, 'raw' => ' ' ), - array( 'type' => CSSTokenizer::TOKEN_DELIM, 'raw' => '+' ), - array( 'type' => CSSTokenizer::TOKEN_WHITESPACE, 'raw' => ' ' ), - array( 'type' => CSSTokenizer::TOKEN_DIMENSION, 'raw' => '5em' ), - array( 'type' => CSSTokenizer::TOKEN_RIGHT_PAREN, 'raw' => ')' ), - array( 'type' => CSSTokenizer::TOKEN_SEMICOLON, 'raw' => ';' ), - ); - - $processor = CSSTokenizer::create( $css ); - $actual_tokens = $this->collect_tokens( $processor, ['type', 'raw'] ); - // Expected value first, matching PHPUnit's assertSame( $expected, $actual ). - $this->assertSame( $expected, $actual_tokens ); - } - - /** - * Tests tokenization of RGB/RGBA color functions.
- */ - public function test_color_functions(): void { - $css = 'color: rgb(255, 128, 0); background: rgba(0, 0, 0, 0.5);'; - - $expected = array( - array( 'type' => CSSTokenizer::TOKEN_IDENT, 'raw' => 'color' ), - array( 'type' => CSSTokenizer::TOKEN_COLON, 'raw' => ':' ), - array( 'type' => CSSTokenizer::TOKEN_WHITESPACE, 'raw' => ' ' ), - array( 'type' => CSSTokenizer::TOKEN_FUNCTION, 'raw' => 'rgb(' ), - array( 'type' => CSSTokenizer::TOKEN_NUMBER, 'raw' => '255' ), - array( 'type' => CSSTokenizer::TOKEN_COMMA, 'raw' => ',' ), - array( 'type' => CSSTokenizer::TOKEN_WHITESPACE, 'raw' => ' ' ), - array( 'type' => CSSTokenizer::TOKEN_NUMBER, 'raw' => '128' ), - array( 'type' => CSSTokenizer::TOKEN_COMMA, 'raw' => ',' ), - array( 'type' => CSSTokenizer::TOKEN_WHITESPACE, 'raw' => ' ' ), - array( 'type' => CSSTokenizer::TOKEN_NUMBER, 'raw' => '0' ), - array( 'type' => CSSTokenizer::TOKEN_RIGHT_PAREN, 'raw' => ')' ), - array( 'type' => CSSTokenizer::TOKEN_SEMICOLON, 'raw' => ';' ), - array( 'type' => CSSTokenizer::TOKEN_WHITESPACE, 'raw' => ' ' ), - array( 'type' => CSSTokenizer::TOKEN_IDENT, 'raw' => 'background' ), - array( 'type' => CSSTokenizer::TOKEN_COLON, 'raw' => ':' ), - array( 'type' => CSSTokenizer::TOKEN_WHITESPACE, 'raw' => ' ' ), - array( 'type' => CSSTokenizer::TOKEN_FUNCTION, 'raw' => 'rgba(' ), - array( 'type' => CSSTokenizer::TOKEN_NUMBER, 'raw' => '0' ), - array( 'type' => CSSTokenizer::TOKEN_COMMA, 'raw' => ',' ), - array( 'type' => CSSTokenizer::TOKEN_WHITESPACE, 'raw' => ' ' ), - array( 'type' => CSSTokenizer::TOKEN_NUMBER, 'raw' => '0' ), - array( 'type' => CSSTokenizer::TOKEN_COMMA, 'raw' => ',' ), - array( 'type' => CSSTokenizer::TOKEN_WHITESPACE, 'raw' => ' ' ), - array( 'type' => CSSTokenizer::TOKEN_NUMBER, 'raw' => '0' ), - array( 'type' => CSSTokenizer::TOKEN_COMMA, 'raw' => ',' ), - array( 'type' => CSSTokenizer::TOKEN_WHITESPACE, 'raw' => ' ' ), - array( 'type' => CSSTokenizer::TOKEN_NUMBER, 'raw' => '0.5' ), - array( 'type' =>
CSSTokenizer::TOKEN_RIGHT_PAREN, 'raw' => ')' ), - array( 'type' => CSSTokenizer::TOKEN_SEMICOLON, 'raw' => ';' ), - ); - - $processor = CSSTokenizer::create( $css ); - $actual_tokens = $this->collect_tokens( $processor, ['type', 'raw'] ); - // Expected value first, matching PHPUnit's assertSame( $expected, $actual ). - $this->assertSame( $expected, $actual_tokens ); - } - - /** - * Tests tokenization of CSS custom properties (variables). - */ - public function test_css_variables(): void { - $css = '--main-color: #ff0000; color: var(--main-color);'; - - $expected = array( - array( 'type' => CSSTokenizer::TOKEN_IDENT, 'raw' => '--main-color' ), - array( 'type' => CSSTokenizer::TOKEN_COLON, 'raw' => ':' ), - array( 'type' => CSSTokenizer::TOKEN_WHITESPACE, 'raw' => ' ' ), - array( 'type' => CSSTokenizer::TOKEN_HASH, 'raw' => '#ff0000' ), - array( 'type' => CSSTokenizer::TOKEN_SEMICOLON, 'raw' => ';' ), - array( 'type' => CSSTokenizer::TOKEN_WHITESPACE, 'raw' => ' ' ), - array( 'type' => CSSTokenizer::TOKEN_IDENT, 'raw' => 'color' ), - array( 'type' => CSSTokenizer::TOKEN_COLON, 'raw' => ':' ), - array( 'type' => CSSTokenizer::TOKEN_WHITESPACE, 'raw' => ' ' ), - array( 'type' => CSSTokenizer::TOKEN_FUNCTION, 'raw' => 'var(' ), - array( 'type' => CSSTokenizer::TOKEN_IDENT, 'raw' => '--main-color' ), - array( 'type' => CSSTokenizer::TOKEN_RIGHT_PAREN, 'raw' => ')' ), - array( 'type' => CSSTokenizer::TOKEN_SEMICOLON, 'raw' => ';' ), - ); - - $processor = CSSTokenizer::create( $css ); - $actual_tokens = $this->collect_tokens( $processor, ['type', 'raw'] ); - // Expected value first, matching PHPUnit's assertSame( $expected, $actual ). - $this->assertSame( $expected, $actual_tokens ); - } - - /** - * Tests tokenization of gradient functions.
- */ - public function test_gradient_functions(): void { - $css = 'background: linear-gradient(to right, red 0%, blue 100%);'; - - $expected = array( - array( 'type' => CSSTokenizer::TOKEN_IDENT, 'raw' => 'background' ), - array( 'type' => CSSTokenizer::TOKEN_COLON, 'raw' => ':' ), - array( 'type' => CSSTokenizer::TOKEN_WHITESPACE, 'raw' => ' ' ), - array( 'type' => CSSTokenizer::TOKEN_FUNCTION, 'raw' => 'linear-gradient(' ), - array( 'type' => CSSTokenizer::TOKEN_IDENT, 'raw' => 'to' ), - array( 'type' => CSSTokenizer::TOKEN_WHITESPACE, 'raw' => ' ' ), - array( 'type' => CSSTokenizer::TOKEN_IDENT, 'raw' => 'right' ), - array( 'type' => CSSTokenizer::TOKEN_COMMA, 'raw' => ',' ), - array( 'type' => CSSTokenizer::TOKEN_WHITESPACE, 'raw' => ' ' ), - array( 'type' => CSSTokenizer::TOKEN_IDENT, 'raw' => 'red' ), - array( 'type' => CSSTokenizer::TOKEN_WHITESPACE, 'raw' => ' ' ), - array( 'type' => CSSTokenizer::TOKEN_PERCENTAGE, 'raw' => '0%' ), - array( 'type' => CSSTokenizer::TOKEN_COMMA, 'raw' => ',' ), - array( 'type' => CSSTokenizer::TOKEN_WHITESPACE, 'raw' => ' ' ), - array( 'type' => CSSTokenizer::TOKEN_IDENT, 'raw' => 'blue' ), - array( 'type' => CSSTokenizer::TOKEN_WHITESPACE, 'raw' => ' ' ), - array( 'type' => CSSTokenizer::TOKEN_PERCENTAGE, 'raw' => '100%' ), - array( 'type' => CSSTokenizer::TOKEN_RIGHT_PAREN, 'raw' => ')' ), - array( 'type' => CSSTokenizer::TOKEN_SEMICOLON, 'raw' => ';' ), - ); - - $processor = CSSTokenizer::create( $css ); - $actual_tokens = $this->collect_tokens( $processor, ['type', 'raw'] ); - // Expected value first, matching PHPUnit's assertSame( $expected, $actual ). - $this->assertSame( $expected, $actual_tokens ); - } - - /** - * Tests tokenization of grid layout properties.
- */ - public function test_grid_layout(): void { - $css = 'grid-template-columns: repeat(3, 1fr); gap: 10px 20px;'; - - $expected = array( - array( 'type' => CSSTokenizer::TOKEN_IDENT, 'raw' => 'grid-template-columns' ), - array( 'type' => CSSTokenizer::TOKEN_COLON, 'raw' => ':' ), - array( 'type' => CSSTokenizer::TOKEN_WHITESPACE, 'raw' => ' ' ), - array( 'type' => CSSTokenizer::TOKEN_FUNCTION, 'raw' => 'repeat(' ), - array( 'type' => CSSTokenizer::TOKEN_NUMBER, 'raw' => '3' ), - array( 'type' => CSSTokenizer::TOKEN_COMMA, 'raw' => ',' ), - array( 'type' => CSSTokenizer::TOKEN_WHITESPACE, 'raw' => ' ' ), - array( 'type' => CSSTokenizer::TOKEN_DIMENSION, 'raw' => '1fr' ), - array( 'type' => CSSTokenizer::TOKEN_RIGHT_PAREN, 'raw' => ')' ), - array( 'type' => CSSTokenizer::TOKEN_SEMICOLON, 'raw' => ';' ), - array( 'type' => CSSTokenizer::TOKEN_WHITESPACE, 'raw' => ' ' ), - array( 'type' => CSSTokenizer::TOKEN_IDENT, 'raw' => 'gap' ), - array( 'type' => CSSTokenizer::TOKEN_COLON, 'raw' => ':' ), - array( 'type' => CSSTokenizer::TOKEN_WHITESPACE, 'raw' => ' ' ), - array( 'type' => CSSTokenizer::TOKEN_DIMENSION, 'raw' => '10px' ), - array( 'type' => CSSTokenizer::TOKEN_WHITESPACE, 'raw' => ' ' ), - array( 'type' => CSSTokenizer::TOKEN_DIMENSION, 'raw' => '20px' ), - array( 'type' => CSSTokenizer::TOKEN_SEMICOLON, 'raw' => ';' ), - ); - - $processor = CSSTokenizer::create( $css ); - $actual_tokens = $this->collect_tokens( $processor, ['type', 'raw'] ); - // Expected value first, matching PHPUnit's assertSame( $expected, $actual ). - $this->assertSame( $expected, $actual_tokens ); - } - - /** - * Tests tokenization of URL functions with various formats.
- */ - public function test_url_formats(): void { - $css = 'background: url("image.png"), url(\'font.woff\'), url(https://example.com/bg.jpg);'; - - $expected = array( - array( 'type' => CSSTokenizer::TOKEN_IDENT, 'raw' => 'background' ), - array( 'type' => CSSTokenizer::TOKEN_COLON, 'raw' => ':' ), - array( 'type' => CSSTokenizer::TOKEN_WHITESPACE, 'raw' => ' ' ), - array( 'type' => CSSTokenizer::TOKEN_FUNCTION, 'raw' => 'url(' ), - array( 'type' => CSSTokenizer::TOKEN_STRING, 'raw' => '"image.png"' ), - array( 'type' => CSSTokenizer::TOKEN_RIGHT_PAREN, 'raw' => ')' ), - array( 'type' => CSSTokenizer::TOKEN_COMMA, 'raw' => ',' ), - array( 'type' => CSSTokenizer::TOKEN_WHITESPACE, 'raw' => ' ' ), - array( 'type' => CSSTokenizer::TOKEN_FUNCTION, 'raw' => 'url(' ), - array( 'type' => CSSTokenizer::TOKEN_STRING, 'raw' => "'font.woff'" ), - array( 'type' => CSSTokenizer::TOKEN_RIGHT_PAREN, 'raw' => ')' ), - array( 'type' => CSSTokenizer::TOKEN_COMMA, 'raw' => ',' ), - array( 'type' => CSSTokenizer::TOKEN_WHITESPACE, 'raw' => ' ' ), - array( 'type' => CSSTokenizer::TOKEN_URL, 'raw' => 'url(https://example.com/bg.jpg)' ), - array( 'type' => CSSTokenizer::TOKEN_SEMICOLON, 'raw' => ';' ), - ); - - $processor = CSSTokenizer::create( $css ); - $actual_tokens = $this->collect_tokens( $processor, ['type', 'raw'] ); - // Expected value first, matching PHPUnit's assertSame( $expected, $actual ). - $this->assertSame( $expected, $actual_tokens ); - } - - /** - * Tests tokenization of !important declarations. - */ - public function test_important_declarations(): void { - $css = 'color: red !important; margin: 0px !important;'; - - $expected = array( - array( 'type' => CSSTokenizer::TOKEN_IDENT, 'raw' => 'color' ), - array( 'type' => CSSTokenizer::TOKEN_COLON, 'raw' => ':' ), - array( 'type' => CSSTokenizer::TOKEN_WHITESPACE, 'raw' => ' ' ), - array( 'type' => CSSTokenizer::TOKEN_IDENT, 'raw' => 'red' ), - array( 'type' => CSSTokenizer::TOKEN_WHITESPACE, 'raw' => ' ' ), - array( 'type' => CSSTokenizer::TOKEN_DELIM, 'raw' => '!'
), - array( 'type' => CSSTokenizer::TOKEN_IDENT, 'raw' => 'important' ), - array( 'type' => CSSTokenizer::TOKEN_SEMICOLON, 'raw' => ';' ), - array( 'type' => CSSTokenizer::TOKEN_WHITESPACE, 'raw' => ' ' ), - array( 'type' => CSSTokenizer::TOKEN_IDENT, 'raw' => 'margin' ), - array( 'type' => CSSTokenizer::TOKEN_COLON, 'raw' => ':' ), - array( 'type' => CSSTokenizer::TOKEN_WHITESPACE, 'raw' => ' ' ), - array( 'type' => CSSTokenizer::TOKEN_DIMENSION, 'raw' => '0px' ), - array( 'type' => CSSTokenizer::TOKEN_WHITESPACE, 'raw' => ' ' ), - array( 'type' => CSSTokenizer::TOKEN_DELIM, 'raw' => '!' ), - array( 'type' => CSSTokenizer::TOKEN_IDENT, 'raw' => 'important' ), - array( 'type' => CSSTokenizer::TOKEN_SEMICOLON, 'raw' => ';' ), - ); - - $processor = CSSTokenizer::create( $css ); - $actual_tokens = $this->collect_tokens( $processor, ['type', 'raw'] ); - // Expected value first, matching PHPUnit's assertSame( $expected, $actual ). - $this->assertSame( $expected, $actual_tokens ); - } - - /** - * Tests tokenization of multiple selectors with combinators. - */ - public function test_complex_combinators(): void { - $css = 'div > p + span ~ a.link'; - - $expected = array( - array( 'type' => CSSTokenizer::TOKEN_IDENT, 'raw' => 'div' ), - array( 'type' => CSSTokenizer::TOKEN_WHITESPACE, 'raw' => ' ' ), - array( 'type' => CSSTokenizer::TOKEN_DELIM, 'raw' => '>' ), - array( 'type' => CSSTokenizer::TOKEN_WHITESPACE, 'raw' => ' ' ), - array( 'type' => CSSTokenizer::TOKEN_IDENT, 'raw' => 'p' ), - array( 'type' => CSSTokenizer::TOKEN_WHITESPACE, 'raw' => ' ' ), - array( 'type' => CSSTokenizer::TOKEN_DELIM, 'raw' => '+' ), - array( 'type' => CSSTokenizer::TOKEN_WHITESPACE, 'raw' => ' ' ), - array( 'type' => CSSTokenizer::TOKEN_IDENT, 'raw' => 'span' ), - array( 'type' => CSSTokenizer::TOKEN_WHITESPACE, 'raw' => ' ' ), - array( 'type' => CSSTokenizer::TOKEN_DELIM, 'raw' => '~' ), - array( 'type' => CSSTokenizer::TOKEN_WHITESPACE, 'raw' => ' ' ), - array( 'type' => CSSTokenizer::TOKEN_IDENT, 'raw' => 'a' ), - array( 'type' => CSSTokenizer::TOKEN_DELIM, 'raw' =>
'.' ), - array( 'type' => CSSTokenizer::TOKEN_IDENT, 'raw' => 'link' ), - ); - - $processor = CSSTokenizer::create( $css ); - $actual_tokens = $this->collect_tokens( $processor, ['type', 'raw'] ); - // Expected value first, matching PHPUnit's assertSame( $expected, $actual ). - $this->assertSame( $expected, $actual_tokens ); - } - - /** - * Tests tokenization of escaped characters in identifiers. - */ - public function test_escaped_identifiers(): void { - $css = '.class\\:name, #id\\@special { color: blue; }'; - - $expected = array( - array( 'type' => CSSTokenizer::TOKEN_DELIM, 'raw' => '.' ), - array( 'type' => CSSTokenizer::TOKEN_IDENT, 'raw' => 'class\\:name' ), - array( 'type' => CSSTokenizer::TOKEN_COMMA, 'raw' => ',' ), - array( 'type' => CSSTokenizer::TOKEN_WHITESPACE, 'raw' => ' ' ), - array( 'type' => CSSTokenizer::TOKEN_HASH, 'raw' => '#id\\@special' ), - array( 'type' => CSSTokenizer::TOKEN_WHITESPACE, 'raw' => ' ' ), - array( 'type' => CSSTokenizer::TOKEN_LEFT_BRACE, 'raw' => '{' ), - array( 'type' => CSSTokenizer::TOKEN_WHITESPACE, 'raw' => ' ' ), - array( 'type' => CSSTokenizer::TOKEN_IDENT, 'raw' => 'color' ), - array( 'type' => CSSTokenizer::TOKEN_COLON, 'raw' => ':' ), - array( 'type' => CSSTokenizer::TOKEN_WHITESPACE, 'raw' => ' ' ), - array( 'type' => CSSTokenizer::TOKEN_IDENT, 'raw' => 'blue' ), - array( 'type' => CSSTokenizer::TOKEN_SEMICOLON, 'raw' => ';' ), - array( 'type' => CSSTokenizer::TOKEN_WHITESPACE, 'raw' => ' ' ), - array( 'type' => CSSTokenizer::TOKEN_RIGHT_BRACE, 'raw' => '}' ), - ); - - $processor = CSSTokenizer::create( $css ); - $actual_tokens = $this->collect_tokens( $processor, ['type', 'raw'] ); - // Expected value first, matching PHPUnit's assertSame( $expected, $actual ). - $this->assertSame( $expected, $actual_tokens ); - } - - /** - * Tests that get_normalized_token() applies CSS normalization.
- * - * Uses a comprehensive CSS selector with rules that includes: - * - CSS escapes in class names and IDs - * - URLs with escape sequences - * - String values with escapes and line endings - * - Comments with various line ending characters - * - Null bytes in identifiers - * - Mixed line endings (\r\n, \r, \f) that need normalization - */ - public function test_get_normalized_token_applies_normalization(): void { - // Comprehensive CSS with normalization requirements. - $css = "/* Comment\r\nwith\flines */\r\n" . - ".c\\6c ass.n\\61 me\r#id\\@value\r\n{\r\n" . - "\tbackground:\furl(path\\2f to\\2f image.png);\r\n" . - "\tcontent:\r\"text\\A string\";\r\n" . - "}"; - - $expected = array( - // Comment with \r\n and \f. - array( - 'type' => CSSTokenizer::TOKEN_COMMENT, - 'raw' => "/* Comment\r\nwith\flines */", - 'normalized' => "/* Comment\nwith\nlines */", - 'value' => null, - ), - // Whitespace with \r\n. - array( - 'type' => CSSTokenizer::TOKEN_WHITESPACE, - 'raw' => "\r\n", - 'normalized' => "\n", - 'value' => null, - ), - // Class selector delimiter. - array( - 'type' => CSSTokenizer::TOKEN_DELIM, - 'raw' => '.', - 'normalized' => '.', - 'value' => '.', - ), - // Class name with escape (\6c = 'l'), space gets consumed by escape. - array( - 'type' => CSSTokenizer::TOKEN_IDENT, - 'raw' => 'c\\6c ass', - 'normalized' => 'class', // Escapes decoded. - 'value' => 'class', // Decoded: \6c → l, space consumed. - ), - // Delimiter. - array( - 'type' => CSSTokenizer::TOKEN_DELIM, - 'raw' => '.', - 'normalized' => '.', - 'value' => '.', - ), - // Identifier with escape (\61 = 'a'), space gets consumed. - array( - 'type' => CSSTokenizer::TOKEN_IDENT, - 'raw' => 'n\\61 me', - 'normalized' => 'name', // Escapes decoded. - 'value' => 'name', // Decoded: \61 → a, space consumed. - ), - // Whitespace with \r. - array( - 'type' => CSSTokenizer::TOKEN_WHITESPACE, - 'raw' => "\r", - 'normalized' => "\n", - 'value' => null, - ), - // ID selector with escape.
- array( - 'type' => CSSTokenizer::TOKEN_HASH, - 'raw' => '#id\\@value', - 'normalized' => '#id@value', // Escapes decoded. - 'value' => 'id@value', // Decoded value. - ), - // Whitespace with \r\n. - array( - 'type' => CSSTokenizer::TOKEN_WHITESPACE, - 'raw' => "\r\n", - 'normalized' => "\n", - 'value' => null, - ), - // Opening brace. - array( - 'type' => CSSTokenizer::TOKEN_LEFT_BRACE, - 'raw' => '{', - 'normalized' => '{', - 'value' => null, - ), - // Whitespace with \r\n and tab (consumed together). - array( - 'type' => CSSTokenizer::TOKEN_WHITESPACE, - 'raw' => "\r\n\t", - 'normalized' => "\n\t", - 'value' => null, - ), - array( - 'type' => CSSTokenizer::TOKEN_IDENT, - 'raw' => 'background', - 'normalized' => 'background', - 'value' => 'background', - ), - // Colon. - array( - 'type' => CSSTokenizer::TOKEN_COLON, - 'raw' => ':', - 'normalized' => ':', - 'value' => null, - ), - // Whitespace with \f. - array( - 'type' => CSSTokenizer::TOKEN_WHITESPACE, - 'raw' => "\f", - 'normalized' => "\n", - 'value' => null, - ), - // URL token with escapes (entire url(...) is one token). - array( - 'type' => CSSTokenizer::TOKEN_URL, - 'raw' => 'url(path\\2f to\\2f image.png)', - 'normalized' => 'url(path/to/image.png)', // Escapes decoded. - 'value' => 'path/to/image.png', // Decoded: \2f → /, spaces consumed. - ), - // Semicolon. - array( - 'type' => CSSTokenizer::TOKEN_SEMICOLON, - 'raw' => ';', - 'normalized' => ';', - 'value' => null, - ), - // Whitespace with \r\n and tab (consumed together). - array( - 'type' => CSSTokenizer::TOKEN_WHITESPACE, - 'raw' => "\r\n\t", - 'normalized' => "\n\t", - 'value' => null, - ), - array( - 'type' => CSSTokenizer::TOKEN_IDENT, - 'raw' => 'content', - 'normalized' => 'content', - 'value' => 'content', - ), - // Colon. - array( - 'type' => CSSTokenizer::TOKEN_COLON, - 'raw' => ':', - 'normalized' => ':', - 'value' => null, - ), - // Whitespace with \r.
- array( - 'type' => CSSTokenizer::TOKEN_WHITESPACE, - 'raw' => "\r", - 'normalized' => "\n", - 'value' => null, - ), - // String with escape (\A = newline, space consumed). - array( - 'type' => CSSTokenizer::TOKEN_STRING, - 'raw' => '"text\\A string"', - 'normalized' => "\"text\nstring\"", // Escapes decoded, quotes preserved. - 'value' => "text\nstring", // \A → \n, space consumed. - ), - // Semicolon. - array( - 'type' => CSSTokenizer::TOKEN_SEMICOLON, - 'raw' => ';', - 'normalized' => ';', - 'value' => null, - ), - // Whitespace with \r\n. - array( - 'type' => CSSTokenizer::TOKEN_WHITESPACE, - 'raw' => "\r\n", - 'normalized' => "\n", - 'value' => null, - ), - // Closing brace. - array( - 'type' => CSSTokenizer::TOKEN_RIGHT_BRACE, - 'raw' => '}', - 'normalized' => '}', - 'value' => null, - ), - ); - - $processor = CSSTokenizer::create( $css ); - $actual_tokens = $this->collect_tokens( $processor, ['type', 'raw', 'normalized', 'value'] ); - $this->assertSame( $expected, $actual_tokens ); - } - - /** Tests that dimension and percentage tokens report their numeric part as 'value', and that dimension tokens additionally report a 'unit'. */ public function test_dimension_token_value(): void { - $css = '10px;15em;20%;30pt;40pc;50vw;'; - $expected = array( - array( - 'type' => CSSTokenizer::TOKEN_DIMENSION, - 'raw' => '10px', - 'normalized' => '10px', - 'value' => '10', - 'unit' => 'px', - ), - array( - 'type' => CSSTokenizer::TOKEN_SEMICOLON, - 'raw' => ';', - 'normalized' => ';', - 'value' => null, - ), - array( - 'type' => CSSTokenizer::TOKEN_DIMENSION, - 'raw' => '15em', - 'normalized' => '15em', - 'value' => '15', - 'unit' => 'em', - ), - array( - 'type' => CSSTokenizer::TOKEN_SEMICOLON, - 'raw' => ';', - 'normalized' => ';', - 'value' => null, - ), - array( - 'type' => CSSTokenizer::TOKEN_PERCENTAGE, - 'raw' => '20%', - 'normalized' => '20%', - 'value' => '20', - ), - array( - 'type' => CSSTokenizer::TOKEN_SEMICOLON, - 'raw' => ';', - 'normalized' => ';', - 'value' => null, - ), - array( - 'type' => CSSTokenizer::TOKEN_DIMENSION, - 'raw' => '30pt', - 'normalized' => '30pt', - 'value' => '30', - 'unit' =>
'pt', - ), - array( - 'type' => CSSTokenizer::TOKEN_SEMICOLON, - 'raw' => ';', - 'normalized' => ';', - 'value' => null, - ), - array( - 'type' => CSSTokenizer::TOKEN_DIMENSION, - 'raw' => '40pc', - 'normalized' => '40pc', - 'value' => '40', - 'unit' => 'pc', - ), - array( - 'type' => CSSTokenizer::TOKEN_SEMICOLON, - 'raw' => ';', - 'normalized' => ';', - 'value' => null, - ), - array( - 'type' => CSSTokenizer::TOKEN_DIMENSION, - 'raw' => '50vw', - 'normalized' => '50vw', - 'value' => '50', - 'unit' => 'vw', - ), - array( - 'type' => CSSTokenizer::TOKEN_SEMICOLON, - 'raw' => ';', - 'normalized' => ';', - 'value' => null, - ), - ); - $processor = CSSTokenizer::create( $css ); - $actual_tokens = $this->collect_tokens( $processor, ['type', 'raw', 'normalized', 'value', 'unit'] ); - $this->assertSame( $expected, $actual_tokens ); - } - - /** - * Tests that create() validates encoding and only accepts UTF-8. - */ - public function test_create_validates_encoding(): void { - // UTF-8 encoding should work (default). - $tokenizer = CSSTokenizer::create( '.class { color: red; }' ); - $this->assertInstanceOf( CSSTokenizer::class, $tokenizer ); - - // UTF-8 encoding should work (explicit). - $tokenizer = CSSTokenizer::create( '.class { color: red; }', 'UTF-8' ); - $this->assertInstanceOf( CSSTokenizer::class, $tokenizer ); - - // Other encodings should return null. - $tokenizer = CSSTokenizer::create( '.class { color: red; }', 'ISO-8859-1' ); - $this->assertNull( $tokenizer ); - - $tokenizer = CSSTokenizer::create( '.class { color: red; }', 'Windows-1252' ); - $this->assertNull( $tokenizer ); - } - - /** - * Tests escape sequences in unusual and edge-case positions. - * - * Covers: - * - Multiple consecutive escapes - * - Escapes in function names - * - Escapes in at-keywords - * - Escapes in dimension units - * - Null byte escapes (\0) - * - Escaped special characters (@, #, !, etc.)
- * Escaped whitespace that gets consumed by the escape - * Unicode escapes for various characters - */ - public function test_escape_sequences_in_unusual_places(): void { - // Complex CSS with escapes in many unusual but valid positions - $css = '@\\6D edia ' . // @media with \6D (m) and space consumed - '\\73 creen ' . // screen with \73 (s) and space consumed - '{' . - ' .\\63 l\\61 ss\\5F name ' . // .class_name with escapes and spaces consumed - "#\\69 d\\5C 0test\x00 " . // #id\0test with null byte escape (should be preserved) - // AND an actual null byte (should be replaced with a U+FFFD REPLACEMENT CHARACTER) - '{' . - ' c\\6F lor: ' . // color: with \6F (o) and space consumed - 'r\\65 d ' . // red with escape - '\\21 important;' . // !important with escaped ! - ' w\\69 dth: ' . // width: - '10\\70 x;' . // 10px (dimension with escaped unit) - ' background: ' . - '\\75 rl(' . // url( with escaped u - '"p\\61 th\\2F img\\2E png"' . // "path/img.png" with escapes - ');' . - ' content: "\\5C \\5C ";' . // "\\ \\" - escaped backslashes - ' font-family: \\22 Arial\\22 ;' . // "Arial" with escaped quotes - ' }' . - '}'; - - $expected = array( - // @\6D edia -> @media - array( - 'type' => CSSTokenizer::TOKEN_AT_KEYWORD, - 'raw' => '@\\6D edia', - 'normalized' => '@media', - 'value' => 'media', - ), - array( - 'type' => CSSTokenizer::TOKEN_WHITESPACE, - 'raw' => ' ', - 'normalized' => ' ', - 'value' => null, - ), - // \73 creen -> screen - array( - 'type' => CSSTokenizer::TOKEN_IDENT, - 'raw' => '\\73 creen', - 'normalized' => 'screen', - 'value' => 'screen', - ), - array( - 'type' => CSSTokenizer::TOKEN_WHITESPACE, - 'raw' => ' ', - 'normalized' => ' ', - 'value' => null, - ), - array( - 'type' => CSSTokenizer::TOKEN_LEFT_BRACE, - 'raw' => '{', - 'normalized' => '{', - 'value' => null, - ), - array( - 'type' => CSSTokenizer::TOKEN_WHITESPACE, - 'raw' => ' ', - 'normalized' => ' ', - 'value' => null, - ), - // Delimiter .
- array( - 'type' => CSSTokenizer::TOKEN_DELIM, - 'raw' => '.', - 'normalized' => '.', - 'value' => '.', - ), - // \63 l\61 ss\5F name -> class_name - array( - 'type' => CSSTokenizer::TOKEN_IDENT, - 'raw' => '\\63 l\\61 ss\\5F name', - 'normalized' => 'class_name', - 'value' => 'class_name', - ), - array( - 'type' => CSSTokenizer::TOKEN_WHITESPACE, - 'raw' => ' ', - 'normalized' => ' ', - 'value' => null, - ), - // #\69 d\5C 0test -> #id\0test (with encoded null byte) - array( - 'type' => CSSTokenizer::TOKEN_HASH, - 'raw' => "#\\69 d\\5C 0test\x00", - 'normalized' => "#id\\0test�", - // Ensure the value is normalized. - 'value' => "id\\0test�", - ), - array( - 'type' => CSSTokenizer::TOKEN_WHITESPACE, - 'raw' => ' ', - 'normalized' => ' ', - 'value' => null, - ), - array( - 'type' => CSSTokenizer::TOKEN_LEFT_BRACE, - 'raw' => '{', - 'normalized' => '{', - 'value' => null, - ), - array( - 'type' => CSSTokenizer::TOKEN_WHITESPACE, - 'raw' => ' ', - 'normalized' => ' ', - 'value' => null, - ), - // c\6F lor -> color - array( - 'type' => CSSTokenizer::TOKEN_IDENT, - 'raw' => 'c\\6F lor', - 'normalized' => 'color', - 'value' => 'color', - ), - array( - 'type' => CSSTokenizer::TOKEN_COLON, - 'raw' => ':', - 'normalized' => ':', - 'value' => null, - ), - array( - 'type' => CSSTokenizer::TOKEN_WHITESPACE, - 'raw' => ' ', - 'normalized' => ' ', - 'value' => null, - ), - // r\65 d -> red - array( - 'type' => CSSTokenizer::TOKEN_IDENT, - 'raw' => 'r\\65 d', - 'normalized' => 'red', - 'value' => 'red', - ), - array( - 'type' => CSSTokenizer::TOKEN_WHITESPACE, - 'raw' => ' ', - 'normalized' => ' ', - 'value' => null, - ), - // \21 important -> !important (single identifier with escaped !)
- array( - 'type' => CSSTokenizer::TOKEN_IDENT, - 'raw' => '\\21 important', - 'normalized' => '!important', - 'value' => '!important', - ), - array( - 'type' => CSSTokenizer::TOKEN_SEMICOLON, - 'raw' => ';', - 'normalized' => ';', - 'value' => null, - ), - array( - 'type' => CSSTokenizer::TOKEN_WHITESPACE, - 'raw' => ' ', - 'normalized' => ' ', - 'value' => null, - ), - // w\69 dth -> width - array( - 'type' => CSSTokenizer::TOKEN_IDENT, - 'raw' => 'w\\69 dth', - 'normalized' => 'width', - 'value' => 'width', - ), - array( - 'type' => CSSTokenizer::TOKEN_COLON, - 'raw' => ':', - 'normalized' => ':', - 'value' => null, - ), - array( - 'type' => CSSTokenizer::TOKEN_WHITESPACE, - 'raw' => ' ', - 'normalized' => ' ', - 'value' => null, - ), - // 10\70 x -> 10px (dimension with escaped unit); 'unit' is not among the keys collected below, so it is not asserted here. - array( - 'type' => CSSTokenizer::TOKEN_DIMENSION, - 'raw' => '10\\70 x', - 'normalized' => '10px', - 'value' => '10', - ), - array( - 'type' => CSSTokenizer::TOKEN_SEMICOLON, - 'raw' => ';', - 'normalized' => ';', - 'value' => null, - ), - array( - 'type' => CSSTokenizer::TOKEN_WHITESPACE, - 'raw' => ' ', - 'normalized' => ' ', - 'value' => null, - ), - array( - 'type' => CSSTokenizer::TOKEN_IDENT, - 'raw' => 'background', - 'normalized' => 'background', - 'value' => 'background', - ), - array( - 'type' => CSSTokenizer::TOKEN_COLON, - 'raw' => ':', - 'normalized' => ':', - 'value' => null, - ), - array( - 'type' => CSSTokenizer::TOKEN_WHITESPACE, - 'raw' => ' ', - 'normalized' => ' ', - 'value' => null, - ), - // \75 rl( -> url( (escaped function name) - array( - 'type' => CSSTokenizer::TOKEN_FUNCTION, - 'raw' => '\\75 rl(', - 'normalized' => 'url(', - 'value' => 'url', - ), - // String with escapes: "p\61 th\2F img\2E png" - array( - 'type' => CSSTokenizer::TOKEN_STRING, - 'raw' => '"p\\61 th\\2F img\\2E png"', - 'normalized' => '"path/img.png"', - 'value' => 'path/img.png', - ), - array( - 'type' => CSSTokenizer::TOKEN_RIGHT_PAREN, - 'raw' => ')', - 'normalized' => ')', -
'value' => null, - ), - array( - 'type' => CSSTokenizer::TOKEN_SEMICOLON, - 'raw' => ';', - 'normalized' => ';', - 'value' => null, - ), - array( - 'type' => CSSTokenizer::TOKEN_WHITESPACE, - 'raw' => ' ', - 'normalized' => ' ', - 'value' => null, - ), - array( - 'type' => CSSTokenizer::TOKEN_IDENT, - 'raw' => 'content', - 'normalized' => 'content', - 'value' => 'content', - ), - array( - 'type' => CSSTokenizer::TOKEN_COLON, - 'raw' => ':', - 'normalized' => ':', - 'value' => null, - ), - array( - 'type' => CSSTokenizer::TOKEN_WHITESPACE, - 'raw' => ' ', - 'normalized' => ' ', - 'value' => null, - ), - // String with escaped backslashes: "\5C \5C " -> "\\" - // Each \5C sequence (with trailing space consumed) becomes one backslash - array( - 'type' => CSSTokenizer::TOKEN_STRING, - 'raw' => '"\\5C \\5C "', - 'normalized' => '"\\\\"', - 'value' => '\\\\', // Two backslashes total - ), - array( - 'type' => CSSTokenizer::TOKEN_SEMICOLON, - 'raw' => ';', - 'normalized' => ';', - 'value' => null, - ), - array( - 'type' => CSSTokenizer::TOKEN_WHITESPACE, - 'raw' => ' ', - 'normalized' => ' ', - 'value' => null, - ), - array( - 'type' => CSSTokenizer::TOKEN_IDENT, - 'raw' => 'font-family', - 'normalized' => 'font-family', - 'value' => 'font-family', - ), - array( - 'type' => CSSTokenizer::TOKEN_COLON, - 'raw' => ':', - 'normalized' => ':', - 'value' => null, - ), - array( - 'type' => CSSTokenizer::TOKEN_WHITESPACE, - 'raw' => ' ', - 'normalized' => ' ', - 'value' => null, - ), - // \22 Arial\22 -> "Arial" (escaped quotes make it an ident) - array( - 'type' => CSSTokenizer::TOKEN_IDENT, - 'raw' => '\\22 Arial\\22 ', - 'normalized' => '"Arial"', - 'value' => '"Arial"', - ), - array( - 'type' => CSSTokenizer::TOKEN_SEMICOLON, - 'raw' => ';', - 'normalized' => ';', - 'value' => null, - ), - array( - 'type' => CSSTokenizer::TOKEN_WHITESPACE, - 'raw' => ' ', - 'normalized' => ' ', - 'value' => null, - ), - array( - 'type' => CSSTokenizer::TOKEN_RIGHT_BRACE, - 'raw' => '}', -
'normalized' => '}', - 'value' => null, - ), - array( - 'type' => CSSTokenizer::TOKEN_RIGHT_BRACE, - 'raw' => '}', - 'normalized' => '}', - 'value' => null, - ), - ); - - $processor = CSSTokenizer::create( $css ); - $actual_tokens = $this->collect_tokens( $processor, ['type', 'raw', 'normalized', 'value'] ); - $this->assertSame( $expected, $actual_tokens ); - } -} diff --git a/components/DataLiberation/URL/class-csstokenizer.php b/components/DataLiberation/URL/class-cssprocessor.php similarity index 86% rename from components/DataLiberation/URL/class-csstokenizer.php rename to components/DataLiberation/URL/class-cssprocessor.php index 2864b2bc..a7a1afcb 100644 --- a/components/DataLiberation/URL/class-csstokenizer.php +++ b/components/DataLiberation/URL/class-cssprocessor.php @@ -10,10 +13,13 @@ /** * Tokenizes CSS according to the CSS Syntax Level 3 specification. * - * This class implements the CSS tokenization algorithm as defined in: - * https://www.w3.org/TR/css-syntax-3/ + * This class follows the algorithm in https://www.w3.org/TR/css-syntax-3/ and + * exposes a pull-based API so callers can stream over large stylesheets without + * allocating every token up front. Each call to next_token() advances the cursor + * and fills in metadata (type, value, raw slice, byte offsets) that you can read + * through the getter methods. * - * ## Design choices: + * ## Design choices * * ### On-the-fly normalization * @@ -25,26 +28,29 @@ * > Replace any U+0000 NULL or surrogate code points in input with U+FFFD REPLACEMENT * > CHARACTER (�). * - * This tokenizer delays normalization as much as possible, rather than preprocessing - * the entire input upfront. This avoids the upfront allocation cost for clean CSS - * and preserves original byte positions for accurate raw token extraction. A part - * of the normalization is performed on-the-fly as the tokens are consumed. The rest - * of it is done once the token value is requested.
+ * This processor delays normalization as much as possible. That keeps the raw byte + * positions intact for accurate rewrites while still letting consumers ask for a + * normalized token when they need one. * * ### No EOF token * * The EOF token is a CSS parsing concept, not CSS tokenization concept. Therefore, - * this tokenizer does not produce it. + * this processor does not produce it. + * + * ### UTF-8 handling + * + * Only UTF-8 strings are supported. Invalid sequences are replaced with U+FFFD (�) + * using the maximal subpart approach described in + * https://www.unicode.org/versions/Unicode9.0.0/ch03.pdf, section 3.9 Best Practices + * for Using U+FFFD. * * ## Usage * - * The next_token() method is the main entry point for tokenizing a CSS string. - * It will consume the next token from the input stream and return true if a token - * was found. Otherwise, it will return false: + * Basic iteration: * * ```php * $css = 'width: 10px;'; - * $processor = CSSTokenizer::create( $css ); + * $processor = CSSProcessor::create( $css ); * while ( $processor->next_token() ) { * echo $processor->get_normalized_token(); * } @@ -52,10 +58,40 @@ * // width: 10px; * ``` * - * @TODO: More usage examples. 
+ * Rewriting a URL while keeping the rest of the stylesheet intact: + * + * ```php + * $css = 'background: url(old.jpg) center / cover;'; + * $processor = CSSProcessor::create( $css ); + * while ( $processor->next_token() ) { + * if ( CSSProcessor::TOKEN_URL === $processor->get_token_type() ) { + * $processor->set_token_value( 'uploads/new.jpg' ); + * } + * } + * $result = $processor->get_updated_css(); + * // background: url("uploads/new.jpg") center / cover; + * ``` + * + * Gathering diagnostics with byte offsets: + * + * ```php + * $css = "color: red;\ncontent: 'unterminated\n;"; + * $processor = CSSProcessor::create( $css ); + * $bad_strings = array(); + * while ( $processor->next_token() ) { + * if ( CSSProcessor::TOKEN_BAD_STRING === $processor->get_token_type() ) { + * $bad_strings[] = array( + * 'start' => $processor->get_token_start(), + * 'length' => $processor->get_token_length(), + * 'value' => $processor->get_unnormalized_token(), + * ); + * } + * } + * ``` + * * @see https://www.w3.org/TR/css-syntax-3/#tokenization */ -class CSSTokenizer { +class CSSProcessor { /** * Token type constants matching the CSS Syntax Level 3 specification. * @@ -72,7 +108,7 @@ class CSSTokenizer { * Invalid (produces bad-string): "hello * world" (literal newline breaks the string) * - * The tokenizer stops at the newline and produces a bad-string token for error recovery. + * The processor stops at the newline and produces a bad-string token for error recovery. * * @see https://www.w3.org/TR/css-syntax-3/#typedef-bad-string-token */ @@ -110,7 +146,7 @@ class CSSTokenizer { * Invalid characters: quotes ("), apostrophes ('), parentheses (() * Example invalid: url(image(.jpg) or url(image".jpg) * - * When detected, the tokenizer consumes everything up to ) or EOF. + * When detected, the processor consumes everything up to ) or EOF. * This prevents the bad URL from breaking subsequent tokens. 
* * @see https://www.w3.org/TR/css-syntax-3/#typedef-bad-url-token @@ -248,9 +284,19 @@ class CSSTokenizer { private $token_unit = null; /** - * Constructor for the CSS tokenizer. + * Lexical replacements to apply to input CSS document. * - * Do not instantiate directly. Use CSSTokenizer::create() instead. + * Tracks modifications to be applied to the CSS, such as changing URL values. + * Each entry is an associative array with 'start', 'length', and 'text' keys. + * + * @var array[] + */ + private $lexical_updates = array(); + + /** + * Constructor for the CSS processor. + * + * Do not instantiate directly. Use CSSProcessor::create() instead. * * @param string $css CSS source to tokenize. */ @@ -260,9 +306,9 @@ private function __construct( string $css ) { } /** - * Creates a CSS tokenizer for the given CSS string. + * Creates a CSS processor for the given CSS string. * - * Use this method to create a CSS tokenizer instance. + * Use this method to create a CSS processor instance. * * ## Current Support * @@ -270,7 +316,7 @@ private function __construct( string $css ) { * * @param string $css CSS source to tokenize. * @param string $encoding Text encoding of the document; must be default of 'UTF-8'. - * @return static|null The created tokenizer if successful, otherwise null. + * @return static|null The created processor if successful, otherwise null. */ public static function create( string $css, string $encoding = 'UTF-8' ) { if ( 'UTF-8' !== $encoding ) { @@ -610,7 +656,7 @@ public function get_unnormalized_token(): ?string { /** * Gets the current token value as a normalized and decoded string. This is * a slight divergence from the CSS Syntax Level 3 spec, where all the numberic - * values are parsed as numbers. This tokenizer is only concerned with their + * values are parsed as numbers. This processor is only concerned with their * textual representation. 
* * Returns the semantic value of the token per CSS Syntax Level 3 spec:
@@ -741,6 +787,182 @@ public function get_token_value_length(): ?int {
 		return $this->token_value_length;
 	}
 
+	/**
+	 * Sets the value of the current URL token.
+	 *
+	 * This method allows modifying the URL value in url() tokens. The new value
+	 * is escaped and written back as a quoted string, so url(old.jpg) becomes
+	 * url("new.jpg").
+	 *
+	 * Currently only URL tokens are supported. Attempting to set the value on
+	 * other token types will return false.
+	 *
+	 * Calling this method again for the same token replaces the previously
+	 * queued value instead of queueing a second, overlapping replacement.
+	 *
+	 * Example:
+	 *
+	 *     $css = 'background: url(old.jpg);';
+	 *     $processor = CSSProcessor::create( $css );
+	 *     while ( $processor->next_token() ) {
+	 *         if ( CSSProcessor::TOKEN_URL === $processor->get_token_type() ) {
+	 *             $processor->set_token_value( 'new.jpg' );
+	 *         }
+	 *     }
+	 *     echo $processor->get_updated_css();
+	 *     // Outputs: background: url("new.jpg");
+	 *
+	 * @param string $new_value The new URL value (should not include url() wrapper).
+	 * @return bool Whether the value was successfully updated.
+	 */
+	public function set_token_value( string $new_value ): bool {
+		// Only URL tokens are currently supported.
+		if ( self::TOKEN_URL !== $this->token_type ) {
+			return false;
+		}
+
+		// Ensure we have valid token value boundaries.
+		if ( null === $this->token_value_starts_at || null === $this->token_value_length ) {
+			return false;
+		}
+
+		/*
+		 * Queue the lexical update, keyed by the start offset so that a repeated
+		 * call for the same token overwrites the earlier replacement. Two entries
+		 * for the same span would make get_updated_css() emit both texts.
+		 */
+		$this->lexical_updates[ $this->token_value_starts_at ] = array(
+			'start'  => $this->token_value_starts_at,
+			'length' => $this->token_value_length,
+			// The replacement is always a quoted, escaped CSS string.
+			'text'   => $this->escape_url_value( $new_value ),
+		);
+
+		return true;
+	}
+
+	/**
+	 * Escapes a URL value for use in quoted url() syntax.
+	 *
+	 * Always returns a quoted URL string since they're easier
+	 * to escape. Quoted URLs are consumed using the string token
+	 * rules, and the only values we need to escape in strings are:
+	 *
+	 * * The closing quote character (").
+	 * * Newlines. That amounts to \n, \r, \f, \r\n when preprocessing is considered.
+	 *   A \r\n pair is a single newline and is emitted as a single escape.
+	 * * U+005C REVERSE SOLIDUS (\)
+	 *
+	 * @see https://www.w3.org/TR/css-syntax-3/#consume-url-token
+	 * @see https://www.w3.org/TR/css-syntax-3/#consume-string-token
+	 *
+	 * @param string $unescaped Raw URL value to escape.
+	 * @return string The escaped value wrapped in double quotes.
+	 */
+	private function escape_url_value( string $unescaped ): string {
+		$escaped = '';
+		$length  = strlen( $unescaped );
+		$at      = 0;
+		while ( $at < $length ) {
+			// Copy the longest run of bytes that need no escaping verbatim.
+			$safe_len = strcspn( $unescaped, "\n\r\f\\\"", $at );
+			if ( $safe_len > 0 ) {
+				$escaped .= substr( $unescaped, $at, $safe_len );
+				$at      += $safe_len;
+				continue;
+			}
+
+			// Consume the unsafe byte up front so that every branch makes progress.
+			$unsafe_char = $unescaped[ $at ];
+			++$at;
+
+			switch ( $unsafe_char ) {
+				case "\r":
+					// $at now points at the byte after \r; swallow the \n of a \r\n pair.
+					if ( $at < $length && "\n" === $unescaped[ $at ] ) {
+						++$at;
+					}
+					// Intentional fall through: every newline flavor becomes \a.
+				case "\f":
+				case "\n":
+					/*
+					 * Add a trailing space to prevent accidentally creating a
+					 * wrong escape sequence. This is valid CSS syntax and
+					 * CSS parsers will ignore that whitespace.
+					 *
+					 * Without the space, "carriage\return" would be encoded as
+					 * "carriage\aeturn", making `e` a part of the escape sequence
+					 * `\ae` which is not what the caller intended.
+					 */
+					$escaped .= '\\a ';
+					break;
+				case '\\':
+					$escaped .= '\\5C ';
+					break;
+				case '"':
+					$escaped .= '\\22 ';
+					break;
+				default:
+					// Unreachable: strcspn() only stops at the bytes handled above.
+					_doing_it_wrong( __METHOD__, 'Unexpected character in URL value: ' . $unsafe_char, '1.0.0' );
+					break;
+			}
+		}
+
+		return '"' . $escaped . '"';
+	}
+
+	/**
+	 * Returns the CSS with all modifications applied.
+	 *
+	 * This method applies all queued lexical updates and returns the modified CSS.
+	 * If no modifications were made, returns the original CSS. The queue itself is
+	 * left untouched, so the method can be called repeatedly.
+	 *
+	 * Example:
+	 *
+	 *     $css = 'background: url(old.jpg);';
+	 *     $processor = CSSProcessor::create( $css );
+	 *     while ( $processor->next_token() ) {
+	 *         if ( CSSProcessor::TOKEN_URL === $processor->get_token_type() ) {
+	 *             $processor->set_token_value( 'new.jpg' );
+	 *         }
+	 *     }
+	 *     echo $processor->get_updated_css();
+	 *     // Outputs: background: url("new.jpg");
+	 *
+	 * @return string The modified CSS.
+	 */
+	public function get_updated_css(): string {
+		if ( empty( $this->lexical_updates ) ) {
+			return $this->css;
+		}
+
+		// Sort a copy of the updates by start position in ascending order.
+		$updates = $this->lexical_updates;
+		usort(
+			$updates,
+			static function ( $a, $b ) {
+				return $a['start'] <=> $b['start'];
+			}
+		);
+
+		// Build the output by concatenating original CSS fragments with replacements.
+		$bytes_already_copied = 0;
+		$output               = '';
+
+		foreach ( $updates as $update ) {
+			$output              .= substr( $this->css, $bytes_already_copied, $update['start'] - $bytes_already_copied );
+			$output              .= $update['text'];
+			$bytes_already_copied = $update['start'] + $update['length'];
+		}
+
+		// Copy remaining CSS after last update.
+		$output .= substr( $this->css, $bytes_already_copied );
+
+		return $output;
+	}
+
 	/**
 	 * Clears token state between tokens.
 	 */