diff --git a/components/DataLiberation/BlockMarkup/class-blockmarkupprocessor.php b/components/DataLiberation/BlockMarkup/class-blockmarkupprocessor.php
index d882af4a..97d53f02 100644
--- a/components/DataLiberation/BlockMarkup/class-blockmarkupprocessor.php
+++ b/components/DataLiberation/BlockMarkup/class-blockmarkupprocessor.php
@@ -527,6 +527,21 @@ public function next_block_attribute() {
 		return isset( $this->block_attribute_paths[ $this->block_attribute_index ] );
 	}

+	/**
+	 * Gets the path of the currently matched block attribute.
+	 *
+	 * @return array|false The matched attribute's path, or false when no
+	 *                     block attribute is currently matched.
+	 */
+	protected function get_block_attribute_path() {
+		// isset() is already false for a null paths list, so no separate null check is needed.
+		if ( ! isset( $this->block_attribute_paths[ $this->block_attribute_index ] ) ) {
+			return false;
+		}
+
+		return $this->block_attribute_paths[ $this->block_attribute_index ];
+	}
+
 	/**
 	 * Gets the key of the currently matched block attribute.
 	 *
diff --git a/components/DataLiberation/BlockMarkup/class-blockmarkupurlprocessor.php b/components/DataLiberation/BlockMarkup/class-blockmarkupurlprocessor.php
index cedb23c5..8e6b25fc 100644
--- a/components/DataLiberation/BlockMarkup/class-blockmarkupurlprocessor.php
+++ b/components/DataLiberation/BlockMarkup/class-blockmarkupurlprocessor.php
@@ -132,7 +132,7 @@ private function next_url_in_text_node() {
 	private function next_url_attribute() {
 		$tag = $this->get_tag();

-		if ( ! array_key_exists( $tag, self::URL_ATTRIBUTES ) ) {
+		if ( ! array_key_exists( $tag, self::HTML_ATTRIBUTES_TO_ACCEPT_RELATIVE_URLS_FROM ) ) {
 			return false;
 		}

@@ -142,7 +142,7 @@ private function next_url_attribute() {
 			 * for the current token. The last element is the attribute we'll
 			 * inspect in the while() loop below.
 			 */
-			$this->inspecting_html_attributes = self::URL_ATTRIBUTES[ $tag ];
+			$this->inspecting_html_attributes = self::HTML_ATTRIBUTES_TO_ACCEPT_RELATIVE_URLS_FROM[ $tag ];
 		} else {
 			/**
 			 * Forget the attribute we've inspected on the previous call to
@@ -184,22 +184,75 @@ private function next_url_attribute() {
 	private function next_url_block_attribute() {
 		while ( $this->next_block_attribute() ) {
 			$url_maybe = $this->get_block_attribute_value();
-			/*
-			 * Do not use base URL for block attributes. to avoid false positives.
-			 * When a base URL is present, any word is a valid URL relative to the
-			 * base URL.
-			 * When a base URL is missing, the string must start with a protocol to
-			 * be considered a URL.
+			if ( ! is_string( $url_maybe ) ||
+				count( $this->get_block_attribute_path() ) > 1
+			) {
+				// @TODO: support arrays, objects, and other non-string data structures.
+				continue;
+			}
+
+			/**
+			 * Decide whether the current block attribute holds a URL.
+			 *
+			 * Known URL attributes can be assumed to hold a URL and be
+			 * parsed with the base URL. For example, a "/about-us" value
+			 * in a wp:navigation-link block's `url` attribute is a
+			 * relative URL to the `/about-us` page.
+			 *
+			 * Other attributes may or may not contain URLs, but we cannot assume
+			 * they do. A value `/about-us` could be a relative URL or a class name.
+			 * In those cases, we'll let go of relative URLs and only detect
+			 * absolute URLs to avoid treating every string as a URL. This requires
+			 * parsing without a base URL.
+			 */
+			$is_relative_url_block_attribute = (
+				isset( self::BLOCK_ATTRIBUTES_TO_ACCEPT_RELATIVE_URLS_FROM[ $this->get_block_name() ] ) &&
+				in_array( $this->get_block_attribute_key(), self::BLOCK_ATTRIBUTES_TO_ACCEPT_RELATIVE_URLS_FROM[ $this->get_block_name() ], true )
+			);
+
+			/**
+			 * Filters whether a block attribute is known to contain a relative URL.
+			 *
+			 * This filter allows extending the list of block attributes that are
+			 * recognized as containing URLs. When a block attribute is marked as
+			 * a known URL attribute, it will be parsed with the base URL, allowing
+			 * relative URLs to be properly resolved.
+			 *
+			 * @since 6.8.0
+			 *
+			 * @param bool $is_relative_url_block_attribute Whether the block attribute is known to contain a relative URL.
+			 * @param array $context {
+			 *     Context information about the block attribute.
+			 *
+			 *     @type string $block_name The name of the block (e.g., 'wp:image', 'wp:button').
+			 *     @type string $attribute_name The name of the attribute (e.g., 'url', 'href').
+			 * }
 			 */
-			if ( is_string( $url_maybe ) ) {
+			$is_relative_url_block_attribute = apply_filters(
+				'url_processor_is_relative_url_block_attribute',
+				$is_relative_url_block_attribute,
+				array(
+					'block_name' => $this->get_block_name(),
+					'attribute_name' => $this->get_block_attribute_key(),
+				)
+			);
+
+			$parsed_url = false;
+			if ( $is_relative_url_block_attribute ) {
+				// Known relative URL attribute – let's parse with the base URL.
+				$parsed_url = WPURL::parse( $url_maybe, $this->base_url_string );
+			} else {
+				// Other attributes – let's parse without a base URL (and only detect absolute URLs).
 				$parsed_url = WPURL::parse( $url_maybe );
-				if ( false !== $parsed_url ) {
-					$this->raw_url = $url_maybe;
-					$this->parsed_url = $parsed_url;
+			}

-					return true;
-				}
+			if ( false === $parsed_url ) {
+				continue;
 			}
+
+			$this->raw_url = $url_maybe;
+			$this->parsed_url = $parsed_url;
+			return true;
 		}

 		return false;
@@ -362,6 +415,26 @@ public function get_inspected_attribute_name() {
 		return $this->inspecting_html_attributes[ count( $this->inspecting_html_attributes ) - 1 ];
 	}

+	/**
+	 * A list of block attributes that are known to contain URLs.
+	 *
+	 * It covers WordPress core blocks as of WordPress version 6.9. It can be
+	 * extended by plugins and themes via the "url_processor_is_relative_url_block_attribute"
+	 * filter.
+	 *
+	 * @var array
+	 */
+	public const BLOCK_ATTRIBUTES_TO_ACCEPT_RELATIVE_URLS_FROM = array(
+		'wp:button' => array( 'url' ), // Not 'linkTarget': it holds the anchor target (e.g. "_blank"), not a URL.
+		'wp:cover' => array( 'url' ),
+		'wp:embed' => array( 'url' ),
+		'wp:gallery' => array( 'url', 'fullUrl' ),
+		'wp:image' => array( 'url', 'src', 'href' ),
+		'wp:media-text' => array( 'mediaUrl', 'href' ),
+		'wp:navigation-link' => array( 'url' ),
+		'wp:navigation-submenu' => array( 'url' ),
+		'wp:rss' => array( 'feedURL' ),
+	);

 	/**
 	 * A list of HTML attributes meant to contain URLs, as defined in the HTML specification.
@@ -370,7 +443,7 @@ public function get_inspected_attribute_name() {
 	 * See https://html.spec.whatwg.org/multipage/indices.html#attributes-1.
 	 * See https://stackoverflow.com/questions/2725156/complete-list-of-html-tag-attributes-which-have-a-url-value.
 	 */
-	public const URL_ATTRIBUTES = array(
+	public const HTML_ATTRIBUTES_TO_ACCEPT_RELATIVE_URLS_FROM = array(
 		'A' => array( 'href' ),
 		'APPLET' => array( 'codebase', 'archive' ),
 		'AREA' => array( 'href' ),
@@ -405,7 +478,7 @@ public function get_inspected_attribute_name() {
 	 * @TODO: Either explicitly support these attributes, or explicitly drop support for
 	 * handling their subsyntax. A generic URL matcher might be good enough.
 	 */
-	public const URL_ATTRIBUTES_WITH_SUBSYNTAX = array(
+	public const HTML_ATTRIBUTES_WITH_SUBSYNTAX_TO_ACCEPT_RELATIVE_URLS_FROM = array(
 		'*' => array( 'style' ), // background(), background-image().
 		'APPLET' => array( 'archive' ),
 		'IMG' => array( 'srcset' ),
@@ -425,7 +498,7 @@ public function get_inspected_attribute_name() {
 	 * @TODO: Either explicitly support these tags, or explicitly drop support for
 	 * handling their subsyntax. A generic URL matcher might be good enough.
 	 */
-	public const URL_CONTAINING_TAGS_WITH_SUBSYNTAX = array(
+	public const HTML_TAGS_WITH_SUBSYNTAX_TO_ACCEPT_RELATIVE_URLS_FROM = array(
 		'STYLE',
 		'SCRIPT',
 	);
diff --git a/components/DataLiberation/Tests/BlockMarkupUrlProcessorTest.php b/components/DataLiberation/Tests/BlockMarkupUrlProcessorTest.php
index 7005f3bb..af8eb1fb 100644
--- a/components/DataLiberation/Tests/BlockMarkupUrlProcessorTest.php
+++ b/components/DataLiberation/Tests/BlockMarkupUrlProcessorTest.php
@@ -16,72 +16,96 @@ public function test_next_url_in_current_token_returns_false_when_no_url_is_foun
 	 *
 	 * @dataProvider provider_test_finds_next_url
 	 */
-	public function test_next_url_finds_the_url( $expected_result, $markup, $base_url = 'https://wordpress.org' ) {
+	public function test_next_url_finds_the_url( $expected_raw_url, $expected_absolute_url, $markup, $base_url = 'https://wordpress.org' ) {
 		$p = new BlockMarkupUrlProcessor( $markup, $base_url );
 		$this->assertTrue( $p->next_url(), 'Failed to find the URL in the markup.' );
-		$this->assertEquals( $expected_result, $p->get_raw_url(), 'Found a URL in the markup, but it wasn\'t the expected one.' );
+		$this->assertEquals( $expected_raw_url, $p->get_raw_url(), 'Found a URL in the markup, but its raw form wasn\'t the expected one.' );
+		$this->assertEquals( $expected_absolute_url, $p->get_parsed_url()->toString(), 'Found a URL in the markup, but its absolute form wasn\'t the expected one.' );
 	}

 	public static function provider_test_finds_next_url() {
 		return array(
 			'In the tag' => array(
 				'https://wordpress.org',
+				'https://wordpress.org/',
 				'',
 			),
-			'In the second block attribute, when it contains just the URL' => array(
-				'https://mysite.com/wp-content/image.png',
-				'',
+			'In the wp:image url attribute when it is the first block attribute and contains a relative URL' => array(
+				'/wp-content/image.png',
+				'https://wordpress.org/wp-content/image.png',
+				'',
 			),
-			'In the first block attribute, when it contains just the URL' => array(
+			'In the wp:image url attribute when it is the second block attribute and contains just the URL' => array(
 				'https://mysite.com/wp-content/image.png',
-				'',
-			),
-			'In a block attribute, in a nested object, when it contains just the URL' => array(
 				'https://mysite.com/wp-content/image.png',
-				'',
-			),
-			'In a block attribute, in an array, when it contains just the URL' => array(
-				'https://mysite.com/wp-content/image.png',
-				'',
+				'',
 			),
 			'In a text node, when it contains a well-formed absolute URL' => array(
 				'https://wordpress.org',
+				'https://wordpress.org/',
 				'Have you seen https://wordpress.org? ',
 			),
 			'In a text node after a tag' => array(
 				'wordpress.org',
+				'https://wordpress.org/',
 				'

 Have you seen wordpress.org',
 			),
 			'In a text node, when it contains a protocol-relative absolute URL' => array(
 				'//wordpress.org',
+				'https://wordpress.org/',
 				'Have you seen //wordpress.org? ',
 			),
 			'In a text node, when it contains a domain-only absolute URL' => array(
 				'wordpress.org',
+				'https://wordpress.org/',
 				'Have you seen wordpress.org? ',
 			),
 			'In a text node, when it contains a domain-only absolute URL with path' => array(
 				'wordpress.org/plugins',
+				'https://wordpress.org/plugins',
 				'Have you seen wordpress.org/plugins? ',
 			),
 			'Matches an empty string in as a valid relative URL when given a base URL' => array(
 				'',
+				'https://wordpress.org/',
 				'',
-				'https://wordpress.org',
+				'https://wordpress.org/',
 			),
 			'Skips over an empty string in when not given a base URL' => array(
 				'https://developer.w.org',
+				'https://developer.w.org/',
 				'',
 				null,
 			),
 			'Skips over a class name in the tag' => array(
 				'https://developer.w.org',
+				'https://developer.w.org/',
 				'',
 				null,
 			),
 		);
 	}

+	/**
+	 * Block attributes nested in objects or arrays are not scanned for URLs yet.
+	 *
+	 * @dataProvider provider_test_finds_next_negative_url
+	 */
+	public function test_next_url_finds_the_negative_url( $markup, $base_url = 'https://wordpress.org' ) {
+		$p = new BlockMarkupUrlProcessor( $markup, $base_url );
+		$this->assertFalse( $p->next_url(), 'Found a URL in the markup, but none was expected.' );
+	}
+
+	public static function provider_test_finds_next_negative_url() {
+		return array(
+			'In a block attribute, in a nested object, when it contains just the URL' => array(
+				'',
+			),
+			'In a block attribute, in an array, when it contains just the URL' => array(
+				'',
+			),
+		);
+	}
+
 	/**
 	 * @dataProvider provider_test_parse_url_with_base_url
 	 */
@@ -180,7 +204,7 @@ public static function provider_test_set_url_examples() {
 	public function test_set_url_complex_test_case() {
 		$p = new BlockMarkupUrlProcessor(
 			<<
+
@@ -204,9 +228,10 @@ public function test_set_url_complex_test_case() {
 			$p->set_url( 'https://site-export.internal', WPURL::parse( 'https://site-export.internal' ) );
 		}

+		// meta.src is a nested property and not supported yet
 		$this->assertEquals(
 			<<
+