diff --git a/components/DataLiberation/BlockMarkup/class-blockmarkupurlprocessor.php b/components/DataLiberation/BlockMarkup/class-blockmarkupurlprocessor.php index 4d3aa5e5..b924e7d0 100644 --- a/components/DataLiberation/BlockMarkup/class-blockmarkupurlprocessor.php +++ b/components/DataLiberation/BlockMarkup/class-blockmarkupurlprocessor.php @@ -4,10 +4,8 @@ use Rowbot\URL\URL; use WordPress\DataLiberation\URL\URLInTextProcessor; +use WordPress\DataLiberation\URL\CSSURLProcessor; use WordPress\DataLiberation\URL\WPURL; -use WordPress\DataLiberation\URL\ConvertedUrl; - -use function WordPress\DataLiberation\URL\urldecode_n; /** * Reports all the URLs in the imported post and enables rewriting them. @@ -23,6 +21,8 @@ class BlockMarkupUrlProcessor extends BlockMarkupProcessor { private $base_url_object; private $url_in_text_processor; private $url_in_text_node_updated; + private $css_url_processor; + private $css_url_processor_updated; /** * The list of names of URL-related HTML attributes that may be available on @@ -52,6 +52,14 @@ public function get_updated_html(): string { $this->url_in_text_node_updated = false; } + if ( $this->css_url_processor_updated ) { + if ( null !== $this->css_url_processor ) { + $updated_css = $this->css_url_processor->get_updated_css(); + $this->set_attribute( 'style', $updated_css ); + } + $this->css_url_processor_updated = false; + } + return parent::get_updated_html(); } @@ -70,8 +78,11 @@ public function next_token(): bool { $this->parsed_url = null; $this->inspecting_html_attributes = null; $this->url_in_text_processor = null; - // Do not reset url_in_text_node_updated – it's reset in get_updated_html() which - // is called in parent::next_token(). + $this->css_url_processor = null; + /* + * Do not reset url_in_text_node_updated or css_url_processor_updated – they're reset + * in get_updated_html() which is called in parent::next_token(). 
+ */ return parent::next_token(); } @@ -111,7 +122,7 @@ private function next_url_in_text_node() { * way to recognize a substring "WordPress.org" as a URL. We might * get some false positives this way, e.g. in this string: * - * > And that's how you build a theme.Now let's take a look at..." + * > And that's how you build a theme.Now let's take a look at..." * * `theme.Now` would be recognized as a URL. It's up to the API consumer * to filter out such false positives e.g. by checking the domain against @@ -130,20 +141,75 @@ private function next_url_in_text_node() { return false; } + /** + * Advances to the next CSS URL in the `style` attribute of the current tag token. + * + * @return bool Whether a CSS URL was found. + */ + private function next_url_in_css() { + if ( '#tag' !== $this->get_token_type() ) { + return false; + } + + if ( null === $this->css_url_processor ) { + $css_value = $this->get_attribute( 'style' ); + if ( ! is_string( $css_value ) ) { + return false; + } + + $this->css_url_processor = new CSSURLProcessor( $css_value ); + } + + while ( $this->css_url_processor->next_url() ) { + /** + * Skip data URIs. They may be really large and they don't + * have a hostname to migrate. + */ + if ( $this->css_url_processor->is_data_uri() ) { + continue; + } + $this->raw_url = $this->css_url_processor->get_raw_url(); + $this->parsed_url = WPURL::parse( $this->raw_url, $this->base_url_string ); + if ( false === $this->parsed_url ) { + continue; + } + + return true; + } + + return false; + } + private function next_url_attribute() { $tag = $this->get_tag(); - if ( ! array_key_exists( $tag, self::HTML_ATTRIBUTES_TO_ACCEPT_RELATIVE_URLS_FROM ) ) { - return false; + // Check if we have a style attribute with CSS URLs to process. + if ( null !== $this->css_url_processor ) { + if ( $this->next_url_in_css() ) { + return true; + } + // Done with CSS URLs in this attribute, apply any pending updates and move on. 
+ $this->get_updated_html(); + $this->css_url_processor = null; } if ( null === $this->inspecting_html_attributes ) { - /** - * Initialize the list on the first call to next_url_attribute() - * for the current token. The last element is the attribute we'll - * inspect in the while() loop below. - */ - $this->inspecting_html_attributes = self::HTML_ATTRIBUTES_TO_ACCEPT_RELATIVE_URLS_FROM[ $tag ]; + if ( array_key_exists( $tag, self::HTML_ATTRIBUTES_TO_ACCEPT_RELATIVE_URLS_FROM ) ) { + /** + * Initialize the list on the first call to next_url_attribute() + * for the current token. The last element is the attribute we'll + * inspect in the while() loop below. + */ + $this->inspecting_html_attributes = self::HTML_ATTRIBUTES_TO_ACCEPT_RELATIVE_URLS_FROM[ $tag ]; + // Add style attribute to the list if it exists. + if ( null !== $this->get_attribute( 'style' ) ) { + $this->inspecting_html_attributes[] = 'style'; + } + } elseif ( null !== $this->get_attribute( 'style' ) ) { + $this->inspecting_html_attributes = array( 'style' ); + } else { + return false; + } } else { /** * Forget the attribute we've inspected on the previous call to @@ -160,6 +226,18 @@ private function next_url_attribute() { continue; } + // Rewrite any CSS `url()` declarations in the `style` attribute. + if ( 'style' === $attr ) { + $this->css_url_processor = new CSSURLProcessor( $url_maybe ); + if ( $this->next_url_in_css() ) { + return true; + } + // No CSS URLs found, move to next attribute. + $this->css_url_processor = null; + array_pop( $this->inspecting_html_attributes ); + continue; + } + /* * Use base URL to resolve known URI attributes as we are certain we're * dealing with URI values. @@ -277,6 +355,12 @@ public function set_url( $raw_url, $parsed_url ) { $this->parsed_url = $parsed_url; switch ( parent::get_token_type() ) { case '#tag': + // Check if we're processing a CSS URL. 
+ if ( null !== $this->css_url_processor ) { + $this->css_url_processor_updated = true; + return $this->css_url_processor->set_raw_url( $raw_url ); + } + $attr = $this->get_inspected_attribute_name(); if ( false === $attr ) { return false; diff --git a/components/DataLiberation/URL/class-cssprocessor.php b/components/DataLiberation/CSS/class-cssprocessor.php similarity index 97% rename from components/DataLiberation/URL/class-cssprocessor.php rename to components/DataLiberation/CSS/class-cssprocessor.php index 511a821f..0265fc2e 100644 --- a/components/DataLiberation/URL/class-cssprocessor.php +++ b/components/DataLiberation/CSS/class-cssprocessor.php @@ -1,6 +1,6 @@ token_value; } + /** + * Determines whether the current token is a data URI. + * + * Only meaningful for URL and STRING tokens. Returns false for all other token types. + * + * @return bool Whether the current token value starts with "data:" (case-insensitive). + */ + public function is_data_uri(): bool { + if ( null === $this->token_value_starts_at || null === $this->token_value_length ) { + return false; + } + + if ( $this->token_value_length < 5 ) { + return false; + } + + $offset = $this->token_value_starts_at; + return ( + ( 'd' === $this->css[ $offset ] || 'D' === $this->css[ $offset ] ) && + ( 'a' === $this->css[ $offset + 1 ] || 'A' === $this->css[ $offset + 1 ] ) && + ( 't' === $this->css[ $offset + 2 ] || 'T' === $this->css[ $offset + 2 ] ) && + ( 'a' === $this->css[ $offset + 3 ] || 'A' === $this->css[ $offset + 3 ] ) && + ':' === $this->css[ $offset + 4 ] + ); + } + /** * Gets the token start at. * @@ -812,27 +838,26 @@ public function get_token_value_length(): ?int { * @return bool Whether the value was successfully updated. */ public function set_token_value( string $new_value ): bool { - // Only URL tokens are currently supported. - if ( self::TOKEN_URL !== $this->token_type ) { - return false; - } - - // Ensure we have valid token value boundaries. 
- if ( null === $this->token_value_starts_at || null === $this->token_value_length ) { - return false; + // Only URL and string tokens are currently supported. + switch ( $this->token_type ) { + case self::TOKEN_URL: + $this->lexical_updates[] = array( + 'start' => $this->token_value_starts_at, + 'length' => $this->token_value_length, + 'text' => $this->escape_url_value( $new_value ), + ); + return true; + case self::TOKEN_STRING: + $this->lexical_updates[] = array( + 'start' => $this->token_starts_at, + 'length' => $this->token_length, + 'text' => $this->escape_url_value( $new_value ), + ); + return true; + default: + _doing_it_wrong( __METHOD__, 'set_token_value() only supports URL and string tokens. Got token type: ' . $this->token_type, '1.0.0' ); + return false; } - - // Escape the URL value for unquoted URL syntax. - $escaped_value = $this->escape_url_value( $new_value ); - - // Queue the lexical update. - $this->lexical_updates[] = array( - 'start' => $this->token_value_starts_at, - 'length' => $this->token_value_length, - 'text' => $escaped_value, - ); - - return true; } /** diff --git a/components/DataLiberation/Tests/BlockMarkupUrlProcessorTest.php b/components/DataLiberation/Tests/BlockMarkupUrlProcessorTest.php index f7b94b82..22193d9f 100644 --- a/components/DataLiberation/Tests/BlockMarkupUrlProcessorTest.php +++ b/components/DataLiberation/Tests/BlockMarkupUrlProcessorTest.php @@ -304,4 +304,167 @@ public static function provider_test_next_url_replace_base_url() { ), ); } + + /** + * @dataProvider provider_test_css_url_detection + */ + public function test_detects_css_urls_in_style_attribute( $expected_url, $markup, $base_url = 'https://example.com' ) { + $p = new BlockMarkupUrlProcessor( $markup, $base_url ); + $this->assertTrue( $p->next_url(), 'Failed to find CSS URL in style attribute' ); + $this->assertEquals( $expected_url, $p->get_raw_url(), 'Found CSS URL does not match expected URL' ); + } + + public static function 
provider_test_css_url_detection() { + return array( + 'Basic quoted URL in background' => array( + 'https://wordpress.org)', + '
', + ), + 'URL in CSS comment (should be skipped)' => array( + 'https://fallback.com', + '
', + ), + 'URL inside content string (should be skipped)' => array( + 'https://realurl.com', + '
', + ), + 'Unquoted URL with encoded space' => array( + 'https://wordpress.org/%20/d', + '
', + ), + 'URL with other properties before' => array( + 'https://wordpress.org/%20/d', + '
', + ), + 'URL with CSS comments around' => array( + 'https://wordpress.org/%20/d', + '
', + ), + 'URL with multiple properties' => array( + 'https://wordpress.org/%20/d', + '
', + ), + 'Single-quoted URL' => array( + 'https://example.com/image.png', + '
', + ), + 'URL with whitespace inside url()' => array( + 'https://example.com/image.png', + '
', + ), + 'Relative URL' => array( + '/images/bg.png', + '
', + ), + 'URL with escaped quotes in quoted form' => array( + 'https://example.com/path"with"quotes', + '
', + ), + 'Multiple URLs in single style attribute' => array( + 'https://example.com/bg1.png', + '
', + ), + 'URL in different CSS properties' => array( + 'https://example.com/cursor.png', + '
', + ), + 'Case-insensitive url() function' => array( + 'https://example.com/image.png', + '
', + ), + 'Mixed case Url() function' => array( + 'https://example.com/image.png', + '
', + ), + 'Unicode escape in quoted URL' => array( + 'https://example.com/image.png', + '
', + ), + 'Unicode escape in unquoted URL' => array( + 'https://example.com/image.png', + '
', + ), + ); + } + + /** + * @dataProvider provider_test_css_url_replacement + */ + public function test_replaces_css_urls_in_style_attribute( $markup, $new_url, $expected_output, $base_url = null ) { + $p = new BlockMarkupUrlProcessor( $markup, $base_url ); + $this->assertTrue( $p->next_url(), 'Failed to find CSS URL' ); + $this->assertTrue( $p->set_url( $new_url, WPURL::parse( $new_url, $base_url ) ), 'Failed to set CSS URL' ); + $this->assertEquals( $expected_output, $p->get_updated_html(), 'CSS URL replacement produced incorrect output' ); + } + + public static function provider_test_css_url_replacement() { + return array( + 'Replace quoted URL' => array( + '
', + 'https://new.com/image.png', + '
', + ), + 'Replace unquoted URL' => array( + '
', + 'https://new.com/image.png', + // CSSProcessor always quotes the new URL: + '
', + ), + 'Replace single-quoted URL' => array( + '
', + 'https://new.com/image.png', + '
', + ), + 'Replace relative URL' => array( + '
', + '/new/path.png', + '
', + 'https://example.com', // base URL needed to parse relative URLs + ), + 'Replace Unicode escaped URL' => array( + '
', + 'https://new.com/image.png', + '
', + ), + ); + } + + public function test_replaces_multiple_css_urls_in_style_attribute() { + $markup = '
'; + $p = new BlockMarkupUrlProcessor( $markup ); + + // First URL + $this->assertTrue( $p->next_url(), 'Failed to find first CSS URL' ); + $this->assertEquals( 'https://example.com/bg1.png', $p->get_raw_url() ); + $p->set_url( 'https://new.com/bg1.png', WPURL::parse( 'https://new.com/bg1.png' ) ); + + // Second URL + $this->assertTrue( $p->next_url(), 'Failed to find second CSS URL' ); + $this->assertEquals( 'https://example.com/bg2.png', $p->get_raw_url() ); + $p->set_url( 'https://new.com/bg2.png', WPURL::parse( 'https://new.com/bg2.png' ) ); + + // No more URLs + $this->assertFalse( $p->next_url(), 'Found more URLs than expected' ); + + $expected = '
'; + $this->assertEquals( $expected, $p->get_updated_html() ); + } + + public function test_css_urls_with_regular_attributes() { + $markup = ''; + $p = new BlockMarkupUrlProcessor( $markup ); + + $found_urls = array(); + while ( $p->next_url() ) { + $found_urls[] = $p->get_raw_url(); + $p->set_url( 'https://new.com/replaced.png', WPURL::parse( 'https://new.com/replaced.png' ) ); + } + + $this->assertCount( 2, $found_urls, 'Should find both src attribute and CSS URL' ); + $this->assertContains( 'https://example.com/image.png', $found_urls ); + $this->assertContains( 'https://example.com/border.png', $found_urls ); + + $expected = ''; + $this->assertEquals( $expected, $p->get_updated_html() ); + } } diff --git a/components/DataLiberation/Tests/CSSProcessorTest.php b/components/DataLiberation/Tests/CSSProcessorTest.php index b23557cd..900fcece 100644 --- a/components/DataLiberation/Tests/CSSProcessorTest.php +++ b/components/DataLiberation/Tests/CSSProcessorTest.php @@ -1,7 +1,7 @@ assertSame( "background: url(\"\xC0.jpg\");", $updated ); } - + } diff --git a/components/DataLiberation/Tests/CSSUrlProcessorTest.php b/components/DataLiberation/Tests/CSSUrlProcessorTest.php new file mode 100644 index 00000000..9fb9eb6a --- /dev/null +++ b/components/DataLiberation/Tests/CSSUrlProcessorTest.php @@ -0,0 +1,757 @@ +assertTrue( $processor->next_url(), 'Failed to find URL in CSS' ); + $this->assertEquals( $expected_url, $processor->get_raw_url(), 'Decoded URL does not match expected value' ); + } + + public static function provider_test_css_escape_decoding() { + // U+005C is REVERSE SOLIDUS (\) + // These tests all represent a backslash \ as a \u{5c} escape sequence + // to avoid confusing the reader with sequences such as \\\" where it's + // unclear which escapes belong to the PHP string, which to the CSS string, + // and what is the final string value. 
+ return array( + // Basic hex escapes + "Space as `\u{5c}20`" => array( + "background: url(https://example.com/hello\u{5c}20world.png)", + 'https://example.com/hello world.png', + ), + "Space as `\u{5c}000020` (6 digits)" => array( + "background: url(https://example.com/hello\u{5c}000020world.png)", + 'https://example.com/hello world.png', + ), + "Space as `\u{5c}000020 ` (6 digits + space)" => array( + "background: url(https://example.com/hello\u{5c}000020 world.png)", + 'https://example.com/hello world.png', + ), + "8-digit space is treated as a replacement character followed by a string `\u{5c}20`: `\u{5c}00000020`" => array( + "background: url(https://example.com/hello\u{5c}00000020world.png)", + "https://example.com/hello\u{FFFD}20world.png", + ), + + // Single character escapes in unquoted URLs + "Escaped parenthesis `\u{5c}(`" => array( + "background: url(https://example.com/file\u{5c}(1\u{5c}).png)", + 'https://example.com/file(1).png', + ), + "Escaped quote `\u{5c}\u{0022}`" => array( + "background: url(https://example.com/file\u{5c}\u{0022}name.png)", + 'https://example.com/file"name.png', + ), + "Escaped single quote `\u{5c}'`" => array( + "background: url(https://example.com/file\u{5c}\u{0027}name.png)", + "https://example.com/file'name.png", + ), + "Escaped backslash `\u{5c}\u{5c}`" => array( + "background: url(https://example.com/path\u{5c}\u{5c}file.png)", + "https://example.com/path\u{5c}file.png", + ), + + // Hex escapes with trailing whitespace + // Note: A single whitespace character immediately after a hex escape is consumed + // as the escape sequence terminator and is not included in the decoded output. + // The decoded result can contain actual whitespace characters (from the escape itself). 
+ 'Hex escape followed by more hex' => array( + "background: url(https://example.com/\u{5c}20test.png)", + 'https://example.com/ test.png', // \20 decodes to a space character + ), + 'Hex escape at end with space after' => array( + "background: url(\u{22}https://example.com/test\u{5c}20 more.png\u{22})", + 'https://example.com/test more.png', // \20 decodes to space; the space after \20 is consumed as terminator + ), + + // Edge cases with hex digits + '1-digit hex escape' => array( + "background: url(https://example.com/\u{5c}9.png)", + "https://example.com/\u{09}.png", + ), + '2-digit hex escape' => array( + "background: url(https://example.com/\u{5c}41.png)", + 'https://example.com/A.png', + ), + '3-digit hex escape' => array( + "background: url(https://example.com/\u{5c}263A.png)", + 'https://example.com/☺.png', + ), + '4-digit hex escape' => array( + "background: url(https://example.com/\u{5c}1F600.png)", + 'https://example.com/😀.png', + ), + '5-digit hex escape' => array( + "background: url(https://example.com/\u{5c}0263A.png)", + 'https://example.com/☺.png', + ), + '6-digit hex escape (max length)' => array( + "background: url(https://example.com/\u{5c}01F600.png)", + 'https://example.com/😀.png', + ), + + // Hex escapes followed by hex-like characters + 'Hex escape followed by non-hex letter' => array( + "background: url(https://example.com/\u{5c}41G.png)", + 'https://example.com/AG.png', + ), + 'Hex escape at end of value' => array( + "background: url(https://example.com/test\u{5c}41)", + 'https://example.com/testA', + ), + + // Line breaks in escapes + // Note: Hex escapes can encode line break characters (U+000A newline, U+000D carriage return). + // The decoded result contains actual line break characters. 
+ 'Newline as hex `\u{5c}00000A`' => array( + "background: url(\u{22}https://example.com/test\u{5c}00000Amore.png\u{22})", + "https://example.com/test\u{0A}more.png", // \00000A decodes to newline character + ), + 'Carriage return as hex `\u{5c}00000D`' => array( + "background: url(\u{22}https://example.com/test\u{5c}00000Dmore.png\u{22})", + "https://example.com/test\u{0D}more.png", // \00000D decodes to carriage return character + ), + + // Multiple escapes + 'Multiple hex escapes' => array( + "background: url(https://example.com/\u{5c}41\u{5c}42\u{5c}43.png)", + 'https://example.com/ABC.png', + ), + 'Mixed escape types' => array( + "background: url(https://example.com/\u{5c}41\u{5c}(test\u{5c}).png)", + 'https://example.com/A(test).png', + ), + + // Backslash at end of string (edge case) + // Note: \\ at end escapes the backslash itself + 'Trailing escaped backslash' => array( + "background: url(\u{22}https://example.com/test\u{5c}\u{5c}\u{22})", + "https://example.com/test\u{5c}", + ), + + // Unicode characters + 'Unicode emoji via hex escape' => array( + "background: url(https://example.com/\u{5c}1F44D.png)", + 'https://example.com/👍.png', + ), + 'Chinese character via hex escape' => array( + "background: url(https://example.com/\u{5c}4E2D\u{5c}6587.png)", + 'https://example.com/中文.png', + ), + // One space after hex escape is consumed as terminator; additional spaces are preserved + 'Multiple trailing whitespaces after the hex escape are preserved' => array( + "background: url(\u{22}https://example.com/test\u{5c}26   more.png\u{22})", // \26 = &, followed by 3 spaces + 'https://example.com/test&  more.png', // Result has & followed by 2 spaces (1st space consumed as terminator) + ), + + // Case insensitivity of hex digits + 'Lowercase hex digits' => array( + "background: url(https://example.com/\u{5c}00002f\u{5c}000061.png)", + 'https://example.com//a.png', + ), + 'Uppercase hex digits' => array( + "background: 
url(https://example.com/\u{5c}00002F\u{5c}000041.png)", + 'https://example.com//A.png', + ), + 'Mixed case hex digits (2f 2F) with trailing whitespace' => array( + // Note: The whitespace after hex escapes is consumed as part of the escape sequence + "background: url(\u{22}https://example.com\u{5c}2F \u{5c}2f file.png\u{22})", + 'https://example.com//file.png', + ), + + // Very low codepoint + 'Control character `\u{5c}1` (SOH)' => array( + // https://example.com/test\1 .png + "background: url(\u{22}https://example.com/test\u{5c}1 .png\u{22})", + "https://example.com/test\u{01}.png", + ), + + // Special URL characters escaped + 'Escaped forward slash' => array( + // https://example.com/path\/to\/file.png + "background: url(https://example.com/path\u{5c}\u{2f}to\u{5c}\u{2f}file.png)", + 'https://example.com/path/to/file.png', + ), + 'Escaped question mark' => array( + // https://example.com/file.png\?query + "background: url(https://example.com/file.png\u{5c}\u{003f}query)", + 'https://example.com/file.png?query', + ), + 'Escaped hash' => array( + // https://example.com/file.png\#anchor + "background: url(https://example.com/file.png\u{5c}\u{0023}anchor)", + 'https://example.com/file.png#anchor', + ), + + // Consecutive backslashes + 'Two backslashes' => array( + // https://example.com/test\\.png + "background: url(https://example.com/test\u{5c}\u{5c}.png)", + "https://example.com/test\u{5c}.png", + ), + 'Three backslashes' => array( + // https://example.com/test\\\.png + "background: url(https://example.com/test\u{5c}\u{5c}\u{5c}.png)", + "https://example.com/test\u{5c}.png", + ), + 'Four backslashes' => array( + // https://example.com/test\\\\.png + "background: url(https://example.com/test\u{5c}\u{5c}\u{5c}\u{5c}.png)", + "https://example.com/test\u{5c}\u{5c}.png", + ), + ); + } + + /** + * @dataProvider provider_test_basic_css_url_detection + */ + public function test_basic_css_url_detection( $css_value, $should_find_url, $expected_url = null ) { + $processor = 
new CSSURLProcessor( $css_value ); + + if ( $should_find_url ) { + $this->assertTrue( $processor->next_url(), 'Failed to find URL in CSS' ); + $this->assertEquals( $expected_url, $processor->get_raw_url() ); + } else { + $this->assertFalse( $processor->next_url(), 'Should not find URL in CSS' ); + } + } + + public static function provider_test_basic_css_url_detection() { + return array( + 'Quoted URL' => array( + 'css' => 'background: url("https://example.com/image.png")', + 'should-detect' => true, + 'url' => 'https://example.com/image.png', + ), + 'Single-quoted URL' => array( + 'css' => "background: url('https://example.com/image.png')", + 'should-detect' => true, + 'url' => 'https://example.com/image.png', + ), + 'Unquoted URL' => array( + 'css' => 'background: url(https://example.com/image.png)', + 'should-detect' => true, + 'url' => 'https://example.com/image.png', + ), + 'Quoted URL with a whitespace before the opening quote' => array( + 'css' => 'background: url( "https://example.com/image.png")', + 'should-detect' => true, + 'url' => 'https://example.com/image.png', + ), + 'Unquoted URL with whitespace inside the parentheses' => array( + 'css' => 'background: url( https://example.com/image.png )', + 'should-detect' => true, + 'url' => 'https://example.com/image.png', + ), + 'Unquoted URL with whitespace in the middle of the URL' => array( + 'css' => 'background: url( https://example.com/ image.png )', + 'should-detect' => false, + ), + 'Quoted URL with whitespace in the middle of the URL' => array( + 'css' => 'background: url( "https://example.com/ image.png" )', + 'should-detect' => true, + 'url' => 'https://example.com/ image.png', + ), + 'Quoted URL with a comment before the opening quote' => array( + 'css' => 'background: url(/**/"https://example.com/image.png")', + 'should-detect' => false, + ), + 'Quoted URL with a whitespace after the closing quote' => array( + 'css' => 'background: url("https://example.com/image.png" )', + 'should-detect' => true, 
+ 'url' => 'https://example.com/image.png', + ), + 'Uppercase URL function' => array( + 'css' => 'background: URL("https://example.com/image.png")', + 'should-detect' => true, + 'url' => 'https://example.com/image.png', + ), + + 'CSS comment containing a URL' => array( + 'css' => '/* background: url("https://commented.com/image.png"); */', + 'should-detect' => false, + ), + 'String content discussing a url() function' => array( + 'css' => 'content: "Visit url(https://example.com)";', + 'should-detect' => false, + ), + 'CSS containing no URL' => array( + 'css' => 'background: #fff; color: red;', + 'should-detect' => false, + ), + + // Verify real URLs are found after skipped content + 'Background URL placed after a CSS comment containing a URL' => array( + 'css' => '/* background: url("https://commented.com/image.png"); */ background: url("https://real.com/image.png")', + 'should-detect' => true, + 'url' => 'https://real.com/image.png', + ), + 'Background URL placed after a string discussing a url() function' => array( + 'css' => 'content: "Visit url(https://example.com)"; background: url("https://real.com/image.png")', + 'should-detect' => true, + 'url' => 'https://real.com/image.png', + ), + + 'Data URI' => array( + 'css' => 'background: url("data:image/png;base64,iVBORw0KGgo=")', + 'should-detect' => true, + 'url' => 'data:image/png;base64,iVBORw0KGgo=', + ), + ); + } + + public function test_handles_multiple_urls() { + $css = 'background: url("https://example.com/bg1.png"), url("https://example.com/bg2.png")'; + $processor = new CSSURLProcessor( $css ); + + $this->assertTrue( $processor->next_url() ); + $this->assertEquals( 'https://example.com/bg1.png', $processor->get_raw_url() ); + + $this->assertTrue( $processor->next_url() ); + $this->assertEquals( 'https://example.com/bg2.png', $processor->get_raw_url() ); + + $this->assertFalse( $processor->next_url() ); + } + + /** + * Tests set_raw_url() with various edge cases. 
+ * Note: The output is always a double-quoted URL, regardless of the original quote style (single-quoted and unquoted inputs are rewritten with double quotes). + * + * @dataProvider provider_test_url_replacement + */ + public function test_url_replacement( $input_css, $new_url, $expected_css ) { + $processor = new CSSURLProcessor( $input_css ); + + $this->assertTrue( $processor->next_url(), 'Failed to find URL in input CSS' ); + $this->assertTrue( $processor->set_raw_url( $new_url ), 'Failed to set new URL' ); + $this->assertEquals( $expected_css, $processor->get_updated_css(), 'Output CSS does not match expected' ); + } + + public static function provider_test_url_replacement() { + return array( + 'Replace double-quoted URL' => array( + 'input' => 'background: url("https://old.com/image.png")', + 'new_url' => 'https://new.com/image.png', + 'expected' => 'background: url("https://new.com/image.png")', + ), + 'Replace single-quoted URL' => array( + 'input' => "background: url('https://old.com/image.png')", + 'new_url' => 'https://new.com/image.png', + 'expected' => "background: url(\u{22}https://new.com/image.png\u{22})", + ), + 'Replace unquoted URL (outputs quoted)' => array( + 'input' => 'background: url(https://old.com/image.png)', + 'new_url' => 'https://new.com/image.png', + 'expected' => 'background: url("https://new.com/image.png")', + ), + + 'Sets new URL with double quotes in path' => array( + 'input' => 'background: url("https://old.com/old.png")', + 'new_url' => 'https://example.com/path"with"quotes.png', + 'expected' => "background: url(\u{22}https://example.com/path\u{5c}22 with\u{5c}22 quotes.png\u{22})", // \22 = " + ), + 'Sets new URL with single quotes in single-quoted string' => array( + 'input' => "background: url('https://old.com/old.png')", + 'new_url' => "https://example.com/path'with'quotes.png", + 'expected' => "background: url(\u{22}https://example.com/path'with'quotes.png\u{22})", + // Single quotes need no escaping because the output is always double-quoted + ), + 'Sets new URL with backslashes in path' => array( + 'input' => 
'background: url("https://old.com/old.png")', + 'new_url' => 'https://example.com/path\\with\\backslashes.png', + 'expected' => "background: url(\u{22}https://example.com/path\u{5c}5C with\u{5c}5C backslashes.png\u{22})", // \5C = \ + ), + 'Sets new URL with parentheses in path' => array( + 'input' => 'background: url("https://old.com/old.png")', + 'new_url' => 'https://example.com/file(1).png', + 'expected' => 'background: url("https://example.com/file(1).png")', + ), + 'Sets new URL with spaces in path' => array( + 'input' => 'background: url("https://old.com/old.png")', + 'new_url' => 'https://example.com/path with spaces.png', + 'expected' => 'background: url("https://example.com/path with spaces.png")', + ), + 'Sets new URL with newline character' => array( + 'input' => 'background: url("https://old.com/old.png")', + 'new_url' => "https://example.com/path\nwith\nnewlines.png", + 'expected' => "background: url(\u{22}https://example.com/path\u{5c}a with\u{5c}a newlines.png\u{22})", // \a = newline + ), + 'Sets new URL with tab character' => array( + 'input' => 'background: url("https://old.com/old.png")', + 'new_url' => "https://example.com/path\twith\ttabs.png", + 'expected' => "background: url(\u{22}https://example.com/path\twith\ttabs.png\u{22})", // Tab preserved as-is + ), + + 'Sets new URL with data URI' => array( + 'input' => 'background: url("https://old.com/image.png")', + 'new_url' => 'data:image/png;base64,iVBORw0KGgo=', + 'expected' => 'background: url("data:image/png;base64,iVBORw0KGgo=")', + ), + 'Sets new URL with data URI with regular URL' => array( + 'input' => 'background: url("data:image/png;base64,iVBORw0KGgo=")', + 'new_url' => 'https://new.com/image.png', + 'expected' => 'background: url("https://new.com/image.png")', + ), + + 'Sets new URL with relative URL' => array( + 'input' => 'background: url("https://old.com/image.png")', + 'new_url' => '/images/new.png', + 'expected' => 'background: url("/images/new.png")', + ), + 'Sets new URL with 
path-only URL' => array( + 'input' => 'background: url("https://old.com/image.png")', + 'new_url' => '../images/new.png', + 'expected' => 'background: url("../images/new.png")', + ), + + 'Sets new URL with emoji' => array( + 'input' => 'background: url("https://old.com/old.png")', + 'new_url' => 'https://example.com/😀.png', + 'expected' => 'background: url("https://example.com/😀.png")', + ), + 'Sets new URL with Chinese characters' => array( + 'input' => 'background: url("https://old.com/old.png")', + 'new_url' => 'https://example.com/中文.png', + 'expected' => 'background: url("https://example.com/中文.png")', + ), + + 'Sets new URL that is an empty string' => array( + 'input' => 'background: url("https://old.com/image.png")', + 'new_url' => '', + 'expected' => 'background: url("")', + ), + 'Sets new URL with query parameters' => array( + 'input' => 'background: url("https://old.com/old.png")', + 'new_url' => 'https://example.com/image.png?v=123&t=456', + 'expected' => 'background: url("https://example.com/image.png?v=123&t=456")', + ), + 'Sets new URL with fragment' => array( + 'input' => 'background: url("https://old.com/old.png")', + 'new_url' => 'https://example.com/image.png#section', + 'expected' => 'background: url("https://example.com/image.png#section")', + ), + 'Sets new URL that is not actually a valid URL' => array( + 'input' => 'background: url("https://old.com/old.png")', + 'new_url' => 'WordPress is great!', + 'expected' => 'background: url("WordPress is great!")', + ), + ); + } + + public function test_replaces_multiple_urls() { + $css = 'background: url("https://example.com/bg1.png"), url("https://example.com/bg2.png")'; + $processor = new CSSURLProcessor( $css ); + + $processor->next_url(); + $processor->set_raw_url( 'https://new.com/bg1.png' ); + + $processor->next_url(); + $processor->set_raw_url( 'https://new.com/bg2.png' ); + + $expected = 'background: url("https://new.com/bg1.png"), url("https://new.com/bg2.png")'; + 
/**
 * Verifies that a 1MB data URI inside a quoted url() is found and returned unchanged.
 *
 * The payload size now matches the size named by the test; it previously
 * built a 2MB payload while the name and comment said 1MB.
 */
public function test_handles_1mb_data_uri() {
	// Exactly 1MB of payload. A URL this large is meant to show the parser is
	// not subject to PCRE limits.
	$payload_bytes = 1024 * 1024;
	$data_uri      = 'data:image/png;base64,' . str_repeat( 'A', $payload_bytes );
	$css_value     = 'background: url("' . $data_uri . '")';
	$processor     = new CSSURLProcessor( $css_value );

	$this->assertTrue( $processor->next_url(), 'Failed to find URL in CSS' );

	// Both sides are strings, so compare strictly to rule out loose numeric-string matches.
	$this->assertSame( $data_uri, $processor->get_raw_url() );
}
/**
 * Verifies that locating a very large data URI does not make the parser copy it.
 *
 * Builds a CSS value holding a 200MB data URI, samples current and peak memory
 * around next_url(), and requires both to grow by less than 10MB. The memory
 * limit is raised to 1GB while the test runs and is restored afterwards, even
 * when an assertion fails.
 */
public function test_large_data_uri_does_not_allocate_additional_memory() {
	// Remember the configured limit so it can be put back when the test ends.
	$original_limit = ini_get( 'memory_limit' );

	// Set memory limit to 1GB for this test.
	ini_set( 'memory_limit', '1G' );

	try {
		// Generate a 200MB data URI to test memory efficiency.
		$size_mb      = 200;
		$size_bytes   = $size_mb * 1024 * 1024;
		$data_payload = str_repeat( 'A', $size_bytes );
		$data_uri     = 'data:image/png;base64,' . $data_payload;
		$css_value    = 'background: url("' . $data_uri . '")';

		// Sample memory before parsing.
		$memory_before      = memory_get_usage( true );
		$memory_peak_before = memory_get_peak_usage( true );

		// Parse the CSS.
		$processor = new CSSURLProcessor( $css_value );
		$this->assertTrue( $processor->next_url(), 'Failed to find URL in CSS' );

		// Sample memory after parsing.
		$memory_after      = memory_get_usage( true );
		$memory_peak_after = memory_get_peak_usage( true );

		$memory_increase = $memory_after - $memory_before;
		$peak_increase   = $memory_peak_after - $memory_peak_before;

		/*
		 * The parser should not duplicate the 200MB data. memory_get_usage( true )
		 * reports memory allocated from the system. Some overhead is expected for
		 * parser state and temporary allocations, but it should be far less than
		 * a second copy of the data, so allow up to 10MB.
		 */
		$max_allowed_increase = 10 * 1024 * 1024;

		$this->assertLessThan(
			$max_allowed_increase,
			$memory_increase,
			sprintf(
				'Memory increased by %.2f MB during parsing. This suggests the data may be duplicated. Expected less than %.2f MB increase.',
				$memory_increase / 1024 / 1024,
				$max_allowed_increase / 1024 / 1024
			)
		);

		// Report the peak delta here; the message previously printed the non-peak delta.
		$this->assertLessThan(
			$max_allowed_increase,
			$peak_increase,
			sprintf(
				'Memory peak increased by %.2f MB during parsing. This suggests the data may be duplicated. Expected less than %.2f MB increase.',
				$peak_increase / 1024 / 1024,
				$max_allowed_increase / 1024 / 1024
			)
		);

		// Also verify that is_data_uri() works correctly.
		$this->assertTrue( $processor->is_data_uri(), 'is_data_uri() should return true for large data URI' );
	} finally {
		// Release the large buffers first: the limit cannot be restored while
		// current usage exceeds the original limit.
		unset( $processor, $css_value, $data_uri, $data_payload );
		gc_collect_cycles();

		// Restore original memory limit (if possible). The call is still silenced
		// because usage may remain above the original limit.
		@ini_set( 'memory_limit', $original_limit );
	}
}
/**
 * Overwrites the currently matched URL with a new value.
 *
 * Delegates to set_token_value() on the wrapped processor.
 *
 * @param string $new_url Replacement URL without quoting.
 * @return bool Result reported by the wrapped processor.
 */
public function set_raw_url( string $new_url ): bool {
	$was_replaced = $this->processor->set_token_value( $new_url );

	return $was_replaced;
}

/**
 * Returns the CSS with all replacements applied.
 *
 * Delegates to get_updated_css() on the wrapped processor.
 *
 * @return string
 */
public function get_updated_css(): string {
	$updated_css = $this->processor->get_updated_css();

	return $updated_css;
}

/**
 * Tells whether the current URL is a data URI.
 *
 * Delegates to is_data_uri() on the wrapped processor.
 *
 * @return bool
 */
public function is_data_uri(): bool {
	$is_data_uri = $this->processor->is_data_uri();

	return $is_data_uri;
}