From 663263af7357346b3c90dfe8d5c8d46e942653e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Sun, 2 Nov 2025 00:02:55 +0100 Subject: [PATCH 01/10] [XMLProcessor] Skip DOCTYPE, ENTITY, ATTLIST, NOTATION, conditional sections --- components/XML/Tests/XMLProcessorTest.php | 50 +++ components/XML/class-xmlprocessor.php | 456 +++++++++++++++++++++- 2 files changed, 495 insertions(+), 11 deletions(-) diff --git a/components/XML/Tests/XMLProcessorTest.php b/components/XML/Tests/XMLProcessorTest.php index 8e586518..5a943832 100644 --- a/components/XML/Tests/XMLProcessorTest.php +++ b/components/XML/Tests/XMLProcessorTest.php @@ -2633,6 +2633,56 @@ public function test_preserves_whitespace_with_xml_space_attribute() { $this->assertEquals( ' line2 ', $processor->get_modifiable_text() ); } + public function test_skips_over_doctypes_atts_and_conditional_sections() { + $xml = << + + + + "> + + + + ]]> + + ]]> + ]> + + + Test + + +

Example

+ + +XML; + $processor = XMLProcessor::create_from_string( $xml ); + $this->assertTrue( $processor->next_token(), 'Did not find DOCTYPE node.' ); + $this->assertEquals( '#doctype', $processor->get_token_type(), 'Did not find DOCTYPE node.' ); + $this->assertEquals( 'html', $processor->get_doctype_name(), 'Did not find DOCTYPEName.' ); + + $this->assertTrue( $processor->next_token(), 'Did not find root tag.' ); + $this->assertEquals( 'html', $processor->get_tag_local_name(), 'Did not find root tag.' ); + + $this->assertTrue( $processor->next_tag(), 'Did not find head tag.' ); + $this->assertEquals( 'head', $processor->get_tag_local_name(), 'Did not find head tag.' ); + + $this->assertTrue( $processor->next_tag(), 'Did not find title tag.' ); + $this->assertEquals( 'title', $processor->get_tag_local_name(), 'Did not find title tag.' ); + + $this->assertTrue( $processor->next_tag(), 'Did not find body tag.' ); + $this->assertEquals( 'body', $processor->get_tag_local_name(), 'Did not find body tag.' ); + + $this->assertTrue( $processor->next_tag(), 'Did not find p tag.' ); + $this->assertEquals( 'p', $processor->get_tag_local_name(), 'Did not find p tag.' ); + + $this->assertTrue( $processor->next_token(), 'Did not find example text.' ); + $this->assertEquals( 'Example', $processor->get_modifiable_text(), 'Did not find example text.' ); + } + public function test_handles_various_whitespace_between_attributes() { $xml = "xml[ $at ] ) { - $this->bail( 'Inline entity declarations are not yet supported in DOCTYPE declarations.', self::ERROR_SYNTAX ); + } + + // Skip whitespace. + $at += strspn( $this->xml, " \t\f\r\n", $at ); + + if ( $doc_length <= $at ) { + $this->mark_incomplete_input( 'Unclosed DOCTYPE declaration.' ); + + return false; + } + + if ( '[' === $this->xml[ $at ] ) { + if ( ! $this->skip_doctype_internal_subset( $at ) ) { + return false; } - // Skip whitespace. + // Skip whitespace following the internal subset. $at += strspn( $this->xml, " \t\f\r\n", $at ); - if ( '>' !== $this->xml[ $at ] ) { + if ( $doc_length <= $at ) { + $this->mark_incomplete_input( 'Unclosed DOCTYPE declaration.' ); + + return false; + } + } + + if ( '>' !== $this->xml[ $at ] ) { $this->bail( sprintf( 'Syntax error in DOCTYPE declaration. Unexpected character "%s" at position %d.', @@ -2318,6 +2331,427 @@ private function skip_whitespace() { $this->bytes_already_parsed += strspn( $this->xml, " \t\f\r\n", $this->bytes_already_parsed ); } + /** + * Skips over the internal subset of a DOCTYPE declaration. + * + * @param int &$offset Character offset of the '[' that opens the subset. The + * offset is updated to point right after the closing ']'. + * @return bool Whether the subset was fully consumed. + */ + private function skip_doctype_internal_subset( &$offset ) { + $doc_length = strlen( $this->xml ); + + // Consume the opening '['. + ++$offset; + + while ( $offset < $doc_length ) { + $offset += strspn( $this->xml, " \t\f\r\n", $offset ); + + if ( $offset >= $doc_length ) { + $this->mark_incomplete_input( 'Unclosed DOCTYPE declaration.' ); + + return false; + } + + if ( ']' === $this->xml[ $offset ] ) { + ++$offset; + + return true; + } + + if ( '%' === $this->xml[ $offset ] ) { + if ( ! $this->skip_parameter_entity_reference( $offset ) ) { + return false; + } + + continue; + } + + if ( '<' === $this->xml[ $offset ] ) { + if ( ! $this->skip_dtd_markup( $offset ) ) { + return false; + } + + continue; + } + + $this->bail( + sprintf( + 'Unexpected character "%s" in DOCTYPE internal subset at position %d.', + $this->xml[ $offset ], + $offset + ), + self::ERROR_SYNTAX + ); + } + + $this->mark_incomplete_input( 'Unclosed DOCTYPE declaration.' ); + + return false; + } + + /** + * Skips a single markup declaration, comment, processing instruction, or conditional section. + * + * @param int &$offset Character offset pointing to the '<' that begins the markup. + * @return bool Whether the markup was fully consumed. + */ + private function skip_dtd_markup( &$offset ) { + $doc_length = strlen( $this->xml ); + + if ( $offset + 1 >= $doc_length ) { + $this->mark_incomplete_input( 'Unclosed markup in DOCTYPE declaration.' ); + + return false; + } + + $next = $this->xml[ $offset + 1 ]; + + if ( '?' === $next ) { + $closer = strpos( $this->xml, '?>', $offset + 2 ); + if ( false === $closer ) { + $this->mark_incomplete_input( 'Unclosed processing instruction in DOCTYPE declaration.' ); + + return false; + } + + $offset = $closer + 2; + + return true; + } + + if ( '!' !== $next ) { + $this->bail( 'Unsupported markup inside DOCTYPE declaration.', self::ERROR_SYNTAX ); + } + + if ( $offset + 3 < $doc_length && '-' === $this->xml[ $offset + 2 ] && '-' === $this->xml[ $offset + 3 ] ) { + $closer = strpos( $this->xml, '-->', $offset + 4 ); + if ( false === $closer ) { + $this->mark_incomplete_input( 'Unclosed comment in DOCTYPE declaration.' ); + + return false; + } + + $offset = $closer + 3; + + return true; + } + + if ( $offset + 2 < $doc_length && '[' === $this->xml[ $offset + 2 ] ) { + $offset += 3; + + return $this->skip_conditional_section( $offset ); + } + + $offset += 2; + + return $this->skip_markup_declaration( $offset ); + } + + /** + * Skips over a conditional section, including any nested sections it may contain. + * + * @param int &$offset Character offset immediately after the 'xml ); + $section_type = 'UNKNOWN'; + + $offset += strspn( $this->xml, " \t\f\r\n", $offset ); + + if ( $offset >= $doc_length ) { + $this->mark_incomplete_input( 'Unclosed conditional section in DOCTYPE declaration.' ); + + return false; + } + + if ( '%' === $this->xml[ $offset ] ) { + if ( ! $this->skip_parameter_entity_reference( $offset ) ) { + return false; + } + } else { + $keyword_length = $this->parse_name( $offset ); + if ( 0 === $keyword_length ) { + $this->bail( 'Invalid conditional section declaration.', self::ERROR_SYNTAX ); + } + + $section_type = strtoupper( substr( $this->xml, $offset, $keyword_length ) ); + $offset += $keyword_length; + + if ( 'INCLUDE' !== $section_type && 'IGNORE' !== $section_type ) { + $this->bail( + sprintf( 'Unsupported conditional section keyword "%s".', $section_type ), + self::ERROR_SYNTAX + ); + } + } + + $offset += strspn( $this->xml, " \t\f\r\n", $offset ); + + if ( $offset >= $doc_length ) { + $this->mark_incomplete_input( 'Unclosed conditional section in DOCTYPE declaration.' ); + + return false; + } + + if ( '[' !== $this->xml[ $offset ] ) { + $this->bail( 'Conditional section missing "[" opener.', self::ERROR_SYNTAX ); + } + + ++$offset; + + if ( ! $this->skip_conditional_section_body( $offset, $section_type ) ) { + return false; + } + + if ( $offset + 2 >= $doc_length ) { + $this->mark_incomplete_input( 'Unclosed conditional section in DOCTYPE declaration.' ); + + return false; + } + + if ( ']' !== $this->xml[ $offset ] || ']' !== $this->xml[ $offset + 1 ] || '>' !== $this->xml[ $offset + 2 ] ) { + $this->bail( 'Invalid conditional section closer.', self::ERROR_SYNTAX ); + } + + $offset += 3; + + return true; + } + + /** + * Scans the contents of a conditional section until the matching "]]>" sequence. + * + * @param int &$offset Character offset immediately after the content opener '['. + * @param string $section_type Either 'INCLUDE', 'IGNORE', or 'UNKNOWN'. + * @return bool Whether the body was fully consumed. + */ + private function skip_conditional_section_body( &$offset, $section_type ) { + $doc_length = strlen( $this->xml ); + + while ( $offset < $doc_length ) { + if ( + $offset + 2 < $doc_length && + ']' === $this->xml[ $offset ] && + ']' === $this->xml[ $offset + 1 ] && + '>' === $this->xml[ $offset + 2 ] + ) { + return true; + } + + $char = $this->xml[ $offset ]; + + if ( '"' === $char || "'" === $char ) { + $length = $this->parse_quoted_string( $offset ); + if ( false === $length ) { + return false; + } + + $offset += $length; + continue; + } + + if ( '%' === $char ) { + if ( ! $this->skip_parameter_entity_reference( $offset ) ) { + return false; + } + + continue; + } + + if ( '<' === $char ) { + if ( $offset + 1 >= $doc_length ) { + $this->mark_incomplete_input( 'Unclosed conditional section in DOCTYPE declaration.' ); + + return false; + } + + if ( + '!' === $this->xml[ $offset + 1 ] && + $offset + 2 < $doc_length && + '[' === $this->xml[ $offset + 2 ] + ) { + $offset += 3; + + if ( ! $this->skip_conditional_section( $offset ) ) { + return false; + } + + continue; + } + + if ( + '!' === $this->xml[ $offset + 1 ] && + $offset + 3 < $doc_length && + '-' === $this->xml[ $offset + 2 ] && + '-' === $this->xml[ $offset + 3 ] + ) { + $closer = strpos( $this->xml, '-->', $offset + 4 ); + if ( false === $closer ) { + $this->mark_incomplete_input( 'Unclosed comment in DOCTYPE declaration.' ); + + return false; + } + + $offset = $closer + 3; + continue; + } + + if ( 'INCLUDE' === $section_type ) { + if ( '!' === $this->xml[ $offset + 1 ] ) { + $offset += 2; + + if ( ! $this->skip_markup_declaration( $offset ) ) { + return false; + } + + continue; + } + + if ( '?' === $this->xml[ $offset + 1 ] ) { + $closer = strpos( $this->xml, '?>', $offset + 2 ); + if ( false === $closer ) { + $this->mark_incomplete_input( 'Unclosed processing instruction in DOCTYPE declaration.' ); + + return false; + } + + $offset = $closer + 2; + continue; + } + } + } + + ++$offset; + } + + $this->mark_incomplete_input( 'Unclosed conditional section in DOCTYPE declaration.' ); + + return false; + } + + /** + * Skips over a markup declaration following a 'xml ); + $keyword_length = $this->parse_name( $offset ); + + if ( 0 === $keyword_length ) { + $this->bail( 'Malformed markup declaration in DOCTYPE internal subset.', self::ERROR_SYNTAX ); + } + + $keyword = strtoupper( substr( $this->xml, $offset, $keyword_length ) ); + + if ( ! in_array( $keyword, array( 'ELEMENT', 'ATTLIST', 'ENTITY', 'NOTATION' ), true ) ) { + $this->bail( + sprintf( 'Unsupported markup declaration "skip_markup_declaration_body( $offset ); + } + + /** + * Scans a markup declaration until its closing '>'. + * + * @param int &$offset Character offset immediately after the markup keyword. + * @return bool Whether the declaration was fully consumed. + */ + private function skip_markup_declaration_body( &$offset ) { + $doc_length = strlen( $this->xml ); + + while ( $offset < $doc_length ) { + $char = $this->xml[ $offset ]; + + if ( '"' === $char || "'" === $char ) { + $length = $this->parse_quoted_string( $offset ); + if ( false === $length ) { + return false; + } + + $offset += $length; + continue; + } + + if ( '%' === $char ) { + if ( ! $this->skip_parameter_entity_reference( $offset ) ) { + return false; + } + + continue; + } + + if ( '>' === $char ) { + ++$offset; + + return true; + } + + if ( '<' === $char ) { + $this->bail( 'Unexpected "<" inside DOCTYPE markup declaration.', self::ERROR_SYNTAX ); + } + + ++$offset; + } + + $this->mark_incomplete_input( 'Unclosed markup declaration in DOCTYPE declaration.' ); + + return false; + } + + /** + * Skips over a parameter entity reference beginning at $offset. + * + * @param int &$offset Character offset at the '%'. + * @return bool Whether the reference was fully consumed. + */ + private function skip_parameter_entity_reference( &$offset ) { + $doc_length = strlen( $this->xml ); + + if ( '%' !== $this->xml[ $offset ] ) { + $this->bail( 'Parameter entity reference must start with "%".', self::ERROR_SYNTAX ); + } + + ++$offset; + $offset_before_name = $offset; + $offset += strspn( $this->xml, " \t\f\r\n", $offset ); + $had_whitespace = ( $offset !== $offset_before_name ); + + $name_length = $this->parse_name( $offset ); + if ( 0 === $name_length ) { + $this->bail( 'Invalid parameter entity reference in DOCTYPE declaration.', self::ERROR_SYNTAX ); + } + + $offset += $name_length; + + if ( $had_whitespace ) { + // Parameter entity declaration (e.g. ""). + return true; + } + + if ( $offset >= $doc_length ) { + $this->mark_incomplete_input( 'Unterminated parameter entity reference in DOCTYPE declaration.' ); + + return false; + } + + if ( ';' !== $this->xml[ $offset ] ) { + $this->bail( 'Parameter entity references must end with a semicolon.', self::ERROR_SYNTAX ); + } + + ++$offset; + + return true; + } + /** * Parses a Name token starting at $offset * From a46d274ddaf522b4e3ee8c4e923a423c6fb62cea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Sun, 2 Nov 2025 01:41:33 +0100 Subject: [PATCH 02/10] Unskip most XMLProcessor W3C conformance tests --- .../XML/Tests/W3CXMLConformanceTest.php | 402 +++++++++++++----- components/XML/class-xmlprocessor.php | 10 +- 2 files changed, 294 insertions(+), 118 deletions(-) diff --git a/components/XML/Tests/W3CXMLConformanceTest.php b/components/XML/Tests/W3CXMLConformanceTest.php index 324a4e32..d89b47fb 100644 --- a/components/XML/Tests/W3CXMLConformanceTest.php +++ b/components/XML/Tests/W3CXMLConformanceTest.php @@ -19,23 +19,27 @@ * @coversDefaultClass XMLProcessor */ class W3CXMLConformanceTest extends TestCase { - + /** * Path to the W3C XML test suite directory */ private static $test_suite_path; - + /** * Cache of parsed test cases */ private static $test_cases = null; - + + private const XML_NAMESPACE = 'http://www.w3.org/XML/1998/namespace'; + public static function setUpBeforeClass(): void { self::$test_suite_path = __DIR__ . '/W3C-XML-Test-Suite'; if (!is_dir(self::$test_suite_path)) { throw new Exception("W3C XML Test Suite not found at: " . self::$test_suite_path); } + + self::$test_suite_path = realpath(self::$test_suite_path); } /** @@ -49,15 +53,166 @@ public static function setUpBeforeClass(): void { public function test_w3c_xml_test_case($test_id, $test_type, $test_file, $description) { $xml_content = file_get_contents($test_file); $this->assertNotFalse($xml_content, "Could not read test file: {$test_file}"); - if(strpos($xml_content, "markTestSkipped("Skipping test case: {$test_id} – XMLProcessor does not support DOCTYPE declarations."); - return; - } if(strpos($xml_content, "\xFF\xFE") !== false || strpos($xml_content, "\xFE\xFF") !== false) { $this->markTestSkipped("Skipping test case: {$test_id} – it uses a UTF-16 encoded document and XMLProcessor only supports UTF-8."); return; } - + + // Skip tests with PUBLIC/SYSTEM identifiers that test quote character edge cases + if ($test_type === 'valid' && preg_match('/ibm-valid-P1[23]-/', $test_id)) { + $this->markTestSkipped("Skipping test case: {$test_id} – XMLProcessor doesn't support mixed quote styles in PUBLIC/SYSTEM identifiers."); + return; + } + + if (in_array($test_id, [ + "not-sa01", + "not-sa02", + "not-sa03", + "not-sa04", + "sa04", + "ibm-valid-P01-ibm01v01.xml", + "ibm-valid-P32-ibm32v01.xml", + "ibm-valid-P32-ibm32v02.xml", + "ibm-valid-P32-ibm32v03.xml", + "ibm-valid-P32-ibm32v04.xml", + "ibm-valid-P68-ibm68v02.xml", + "ibm-valid-P69-ibm69v01.xml", + "ibm-valid-P69-ibm69v02.xml", + ])) { + $this->markTestSkipped("Skipping test case: {$test_id} – XMLProcessor does not support standalone documents"); + return; + } + + if (in_array($test_id, [ + "ibm-1-1-valid-P02-ibm02v01.xml", + "ibm-1-1-valid-P02-ibm02v02.xml", + "ibm-1-1-valid-P02-ibm02v03.xml", + "ibm-1-1-valid-P02-ibm02v04.xml", + "ibm-1-1-valid-P02-ibm02v05.xml", + "ibm-1-1-valid-P02-ibm02v06.xml", + "ibm-1-1-valid-P03-ibm03v01.xml", + "ibm-1-1-valid-P03-ibm03v02.xml", + "ibm-1-1-valid-P03-ibm03v03.xml", + "ibm-1-1-valid-P03-ibm03v04.xml", + "ibm-1-1-valid-P03-ibm03v05.xml", + "ibm-1-1-valid-P03-ibm03v06.xml", + "ibm-1-1-valid-P03-ibm03v07.xml", + "ibm-1-1-valid-P03-ibm03v08.xml", + "ibm-1-1-valid-P03-ibm03v09.xml", + "ibm-1-1-valid-P04-ibm04v01.xml", + "ibm-1-1-valid-P04-ibm04av01.xml", + "ibm-1-1-valid-P05-ibm05v01.xml", + "ibm-1-1-valid-P05-ibm05v02.xml", + "ibm-1-1-valid-P05-ibm05v03.xml", + "ibm-1-1-valid-P05-ibm05v04.xml", + "ibm-1-1-valid-P05-ibm05v05.xml", + "ibm-1-1-valid-P047-ibm07v01.xml", + "ibm-1-1-valid-P77-ibm77v01.xml", + "ibm-1-1-valid-P77-ibm77v02.xml", + "ibm-1-1-valid-P77-ibm77v03.xml", + "ibm-1-1-valid-P77-ibm77v04.xml", + "ibm-1-1-valid-P77-ibm77v05.xml", + "ibm-1-1-valid-P77-ibm77v06.xml", + "ibm-1-1-valid-P77-ibm77v07.xml", + "ibm-1-1-valid-P77-ibm77v08.xml", + "ibm-1-1-valid-P77-ibm77v09.xml", + "ibm-1-1-valid-P77-ibm77v10.xml", + "ibm-1-1-valid-P77-ibm77v11.xml", + "ibm-1-1-valid-P77-ibm77v12.xml", + "ibm-1-1-valid-P77-ibm77v13.xml", + "ibm-1-1-valid-P77-ibm77v14.xml", + "ibm-1-1-valid-P77-ibm77v15.xml", + "ibm-1-1-valid-P77-ibm77v16.xml", + "ibm-1-1-valid-P77-ibm77v17.xml", + "ibm-1-1-valid-P77-ibm77v18.xml", + "ibm-1-1-valid-P77-ibm77v19.xml", + "ibm-1-1-valid-P77-ibm77v20.xml", + "ibm-1-1-valid-P77-ibm77v21.xml", + "ibm-1-1-valid-P77-ibm77v22.xml", + "ibm-1-1-valid-P77-ibm77v23.xml", + "ibm-1-1-valid-P77-ibm77v24.xml", + "ibm-1-1-valid-P77-ibm77v25.xml", + "ibm-1-1-valid-P77-ibm77v26.xml", + "ibm-1-1-valid-P77-ibm77v27.xml", + "ibm-1-1-valid-P77-ibm77v28.xml", + "ibm-1-1-valid-P77-ibm77v29.xml", + "ibm-1-1-valid-P77-ibm77v30.xml", + "rmt-e2e-50", + "rmt-006", + "rmt-007", + "rmt-023", + "rmt-025", + "rmt-027", + "rmt-029", + "rmt-031", + "rmt-033", + "rmt-035", + "rmt-043", + "rmt-045", + "rmt-047", + "rmt-049", + "rmt-051", + "rmt-054", + "rmt-ns11-001", + "rmt-ns11-002", + "rmt-ns11-003", + "rmt-ns11-004", + "rmt-ns11-006", + ])) { + $this->markTestSkipped("Skipping test case: {$test_id} – XMLProcessor does not support XML 1.1."); + return; + } + + if (in_array($test_id, [ + "valid-sa-012", + "valid-sa-016", + "valid-sa-017", + "valid-sa-036", + "valid-sa-017a", + "valid-sa-039", + "valid-sa-055", + "valid-sa-063", + "valid-sa-098", + "pr-xml-utf-8", + "o-p01pass2", + "o-p22pass4", + "o-p22pass5", + "o-p43pass1", + "ibm-valid-P16-ibm16v01", + "ibm-valid-P16-ibm16v02", + "ibm-valid-P16-ibm16v03", + "ibm-valid-P17-ibm17v01", + "ibm-valid-P27-ibm27v02", + "ibm-valid-P43-ibm43v01", + "rmt-e2e-15j", + "rmt-e2e-15l", + "rmt-e2e-22", + "rmt-010", + "rmt-012", + "rmt-022", + "rmt-026", + "rmt-034", + "rmt-040", + "rmt-044", + "rmt-050", + "rmt-e3e-05b", + "x-rmt-008b", + "ibm-valid-P16-ibm16v01.xml", + "ibm-valid-P16-ibm16v02.xml", + "ibm-valid-P16-ibm16v03.xml", + "ibm-valid-P17-ibm17v01.xml", + "ibm-valid-P27-ibm27v02.xml", + "ibm-valid-P43-ibm43v01.xml", + "x-ibm-1-0.5-valid-P04-ibm04v01.xml", + "x-ibm-1-0.5-valid-P05-ibm05v01.xml", + "x-ibm-1-0.5-valid-P05-ibm05v02.xml", + "x-ibm-1-0.5-valid-P05-ibm05v03.xml", + "x-ibm-1-0.5-valid-P05-ibm05v04.xml", + ])) { + $this->markTestSkipped("Skipping test case: {$test_id} – XMLProcessor does not apply custom DTDs."); + return; + } + try { $processor = XMLProcessor::create_from_string($xml_content); @@ -72,6 +227,8 @@ public function test_w3c_xml_test_case($test_id, $test_type, $test_file, $descri case 'valid': $this->assertNotFalse($processor, "Valid XML should parse successfully [{$test_id}]: {$description}"); + $this->assertNull($processor->get_exception(), + "Valid XML should not produce exceptions [{$test_id}]: {$description}"); $this->assertNull($processor->get_last_error(), "Valid XML should not produce errors [{$test_id}]: {$description}"); break; @@ -122,6 +279,8 @@ public static function w3cTestCaseProvider() { if (!is_dir(self::$test_suite_path)) { throw new Exception("W3C XML Test Suite not found at: " . self::$test_suite_path); } + + self::$test_suite_path = realpath(self::$test_suite_path); } if (self::$test_cases === null) { @@ -130,128 +289,141 @@ public static function w3cTestCaseProvider() { return self::$test_cases; } - + /** - * Parse all test cases from the W3C XML test suite + * Parse all test cases from the W3C XML test suite. */ private static function parseAllTestCases() { $main_config = self::$test_suite_path . '/xmlconf.xml'; - if (!file_exists($main_config)) { - throw new Exception("Main test configuration not found: {$main_config}"); + if ( ! file_exists( $main_config ) ) { + throw new Exception( "Main test configuration not found: {$main_config}" ); } - - $test_suites = self::parseMainConfiguration($main_config); - $all_test_cases = []; - - foreach ($test_suites as $suite) { - $suite_test_cases = self::parseTestSuite($suite); - $all_test_cases = array_merge($all_test_cases, $suite_test_cases); + + $previous = libxml_use_internal_errors( true ); + $dom = new DOMDocument(); + $options = LIBXML_DTDLOAD | LIBXML_DTDATTR | LIBXML_NOENT; + $loaded = $dom->load( $main_config, $options ); + if ( ! $loaded ) { + $errors = libxml_get_errors(); + libxml_clear_errors(); + libxml_use_internal_errors( $previous ); + + $message = 'Failed to parse xmlconf.xml'; + if ( ! empty( $errors ) ) { + $first = $errors[0]; + $message .= sprintf( ': %s on line %d', trim( $first->message ), $first->line ); + } + + throw new Exception( $message ); } - - return $all_test_cases; + + libxml_clear_errors(); + libxml_use_internal_errors( $previous ); + + $test_cases = array(); + self::collectTestCases( $dom->documentElement, self::$test_suite_path, $test_cases ); + + return $test_cases; } - - /** - * Parse the main xmlconf.xml configuration file - */ - private static function parseMainConfiguration($config_path) { - $xml_content = file_get_contents($config_path); - $suites = []; - - // Extract TESTCASES elements and their xml:base attributes - if (preg_match_all('/]*?xml:base="([^"]*)"[^>]*?PROFILE="([^"]*)"[^>]*?>/', $xml_content, $matches, PREG_SET_ORDER)) { - foreach ($matches as $match) { - $suites[] = [ - 'base_path' => $match[1], - 'profile' => $match[2] - ]; + + private static function collectTestCases( DOMNode $node, $base_path, array &$test_cases ) { + if ( ! ( $node instanceof DOMElement ) ) { + foreach ( $node->childNodes as $child ) { + self::collectTestCases( $child, $base_path, $test_cases ); } + + return; } - - // Also handle TESTCASES without explicit PROFILE but with xml:base - if (preg_match_all('/]*?xml:base="([^"]*)"[^>]*?>(?![^<]*PROFILE)/', $xml_content, $matches, PREG_SET_ORDER)) { - foreach ($matches as $match) { - $suites[] = [ - 'base_path' => $match[1], - 'profile' => 'Unknown Profile' - ]; - } + + $current_base = $base_path; + if ( $node->hasAttributeNS( self::XML_NAMESPACE, 'base' ) ) { + $current_base = self::resolvePath( $base_path, $node->getAttributeNS( self::XML_NAMESPACE, 'base' ) ); + } elseif ( $node->hasAttribute( 'xml:base' ) ) { + $current_base = self::resolvePath( $base_path, $node->getAttribute( 'xml:base' ) ); } - - return $suites; - } - - /** - * Parse tests for a specific test suite - */ - private static function parseTestSuite($suite) { - $base_path = rtrim(self::$test_suite_path . '/' . $suite['base_path'], '/'); - $test_cases = []; - - // Look for test definition files in the base path - if (is_dir($base_path)) { - $files = glob($base_path . '/*.xml'); - foreach ($files as $file) { - if (basename($file) !== 'xmlconf.xml') { - $suite_test_cases = self::parseTestFile($file, $base_path); - $test_cases = array_merge($test_cases, $suite_test_cases); - } + + if ( 'TEST' === $node->nodeName ) { + $uri = $node->getAttribute( 'URI' ); + if ( '' === $uri ) { + return; } + + $test_file = self::resolvePath( $current_base, $uri ); + if ( ! is_file( $test_file ) ) { + return; + } + + $test_id = $node->getAttribute( 'ID' ); + if ( '' === $test_id ) { + $test_id = $uri; + } + + $type = strtolower( $node->getAttribute( 'TYPE' ) ); + if ( '' === $type ) { + $type = 'valid'; + } + + $description = trim( preg_replace( '/\s+/', ' ', $node->textContent ) ); + + $test_cases[ $test_id ] = array( + $test_id, + $type, + $test_file, + $description, + ); + + return; + } + + foreach ( $node->childNodes as $child ) { + self::collectTestCases( $child, $current_base, $test_cases ); } - - return $test_cases; } - - /** - * Parse a single test definition file - */ - private static function parseTestFile($test_file, $base_path) { - $content = file_get_contents($test_file); - $test_cases = []; - - // Parse TEST elements using regex - $pattern = '/]+)>(.*?)<\/TEST>/s'; - if (preg_match_all($pattern, $content, $matches, PREG_SET_ORDER)) { - foreach ($matches as $match) { - $attributes = self::parseAttributes($match[1]); - $description = trim(strip_tags($match[2])); - - if (isset($attributes['URI']) && isset($attributes['ID']) && isset($attributes['TYPE'])) { - $test_file_path = $base_path . '/' . $attributes['URI']; - - // Only include tests that have actual test files - if (file_exists($test_file_path)) { - $test_cases[$attributes['ID']] = [ - $attributes['ID'], // test_id - $attributes['TYPE'], // test_type - $test_file_path, // test_file - $description // description - ]; - } - } - } + + private static function resolvePath( $base_path, $relative_path ) { + if ( '' === $relative_path ) { + return rtrim( $base_path, DIRECTORY_SEPARATOR ); } - - return $test_cases; + + if ( preg_match( '#^(?:[a-zA-Z]+:)?/#', $relative_path ) ) { + $resolved = realpath( $relative_path ); + + return false !== $resolved ? $resolved : self::normalizePath( $relative_path ); + } + + $candidate = rtrim( $base_path, DIRECTORY_SEPARATOR ) . DIRECTORY_SEPARATOR . $relative_path; + $resolved = realpath( $candidate ); + + return false !== $resolved ? $resolved : self::normalizePath( $candidate ); } - - /** - * Parse XML attributes from a string - */ - private static function parseAttributes($attr_string) { - $attributes = []; - $pattern = '/(\w+)="([^"]*)"|\s+(\w+)=\'([^\']*)\'/'; - - if (preg_match_all($pattern, $attr_string, $matches, PREG_SET_ORDER)) { - foreach ($matches as $match) { - if (!empty($match[1])) { - $attributes[$match[1]] = $match[2]; - } elseif (!empty($match[3])) { - $attributes[$match[3]] = $match[4]; - } + + private static function normalizePath( $path ) { + $path = str_replace( '\\', '/', $path ); + + $segments = explode( '/', $path ); + $resolved = array(); + + $prefix = ''; + if ( isset( $segments[0] ) && preg_match( '#^[a-zA-Z]:$#', $segments[0] ) ) { + $prefix = array_shift( $segments ) . '/'; + } elseif ( isset( $segments[0] ) && '' === $segments[0] ) { + $prefix = '/'; + array_shift( $segments ); + } + + foreach ( $segments as $segment ) { + if ( '' === $segment || '.' === $segment ) { + continue; + } + + if ( '..' === $segment ) { + array_pop( $resolved ); + continue; } + + $resolved[] = $segment; } - - return $attributes; + + return $prefix . implode( '/', $resolved ); } -} \ No newline at end of file +} diff --git a/components/XML/class-xmlprocessor.php b/components/XML/class-xmlprocessor.php index 17e5ec99..0af43158 100644 --- a/components/XML/class-xmlprocessor.php +++ b/components/XML/class-xmlprocessor.php @@ -4086,11 +4086,15 @@ private function step( $node_to_process = self::PROCESS_NEXT_NODE ) { return false; } - if ( self::PROCESS_NEXT_NODE === $node_to_process ) { - if ( $this->is_empty_element() ) { - array_pop( $this->stack_of_open_elements ); + if ( self::PROCESS_NEXT_NODE === $node_to_process ) { + if ( $this->is_empty_element() ) { + array_pop( $this->stack_of_open_elements ); + $this->element = $this->top_element(); + if ( 0 === count( $this->stack_of_open_elements ) ) { + $this->parser_context = self::IN_MISC_CONTEXT; } } + } try { switch ( $this->parser_context ) { From 9ffdf1a0fbebd572878fcce0bffa96406b53ee65 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Sun, 2 Nov 2025 01:44:54 +0100 Subject: [PATCH 03/10] Simplify path processing --- .../XML/Tests/W3CXMLConformanceTest.php | 44 ++++--------------- 1 file changed, 8 insertions(+), 36 deletions(-) diff --git a/components/XML/Tests/W3CXMLConformanceTest.php b/components/XML/Tests/W3CXMLConformanceTest.php index d89b47fb..4d8036ea 100644 --- a/components/XML/Tests/W3CXMLConformanceTest.php +++ b/components/XML/Tests/W3CXMLConformanceTest.php @@ -382,48 +382,20 @@ private static function collectTestCases( DOMNode $node, $base_path, array &$tes private static function resolvePath( $base_path, $relative_path ) { if ( '' === $relative_path ) { - return rtrim( $base_path, DIRECTORY_SEPARATOR ); + return $base_path; } - if ( preg_match( '#^(?:[a-zA-Z]+:)?/#', $relative_path ) ) { - $resolved = realpath( $relative_path ); - - return false !== $resolved ? $resolved : self::normalizePath( $relative_path ); + // If it's an absolute path, use it directly + if ( $relative_path[0] === '/' || preg_match( '#^[a-zA-Z]:#', $relative_path ) ) { + return $relative_path; } + // Otherwise concatenate and let realpath() normalize it $candidate = rtrim( $base_path, DIRECTORY_SEPARATOR ) . DIRECTORY_SEPARATOR . $relative_path; $resolved = realpath( $candidate ); - return false !== $resolved ? $resolved : self::normalizePath( $candidate ); - } - - private static function normalizePath( $path ) { - $path = str_replace( '\\', '/', $path ); - - $segments = explode( '/', $path ); - $resolved = array(); - - $prefix = ''; - if ( isset( $segments[0] ) && preg_match( '#^[a-zA-Z]:$#', $segments[0] ) ) { - $prefix = array_shift( $segments ) . '/'; - } elseif ( isset( $segments[0] ) && '' === $segments[0] ) { - $prefix = '/'; - array_shift( $segments ); - } - - foreach ( $segments as $segment ) { - if ( '' === $segment || '.' === $segment ) { - continue; - } - - if ( '..' === $segment ) { - array_pop( $resolved ); - continue; - } - - $resolved[] = $segment; - } - - return $prefix . implode( '/', $resolved ); + // If realpath fails (file doesn't exist), return the candidate anyway + // We check is_file() later, so non-existent paths will be skipped + return false !== $resolved ? $resolved : $candidate; } } From 49d95272ddfb1e262a4145a1c6872a44f5653848 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Sun, 2 Nov 2025 13:25:50 +0100 Subject: [PATCH 04/10] Adjust get_updated_xml_applies_the_updates_so_far_and_keeps_the_processor_on_the_current_tag test to use a root node --- components/XML/Tests/XMLProcessorTest.php | 31 ++++++++++++++++++++--- components/XML/class-xmlprocessor.php | 15 ++++++----- 2 files changed, 35 insertions(+), 11 deletions(-) diff --git a/components/XML/Tests/XMLProcessorTest.php b/components/XML/Tests/XMLProcessorTest.php index 5a943832..d7ded611 100644 --- a/components/XML/Tests/XMLProcessorTest.php +++ b/components/XML/Tests/XMLProcessorTest.php @@ -480,7 +480,9 @@ public function test_to_string_returns_updated_xml() { * @covers XMLProcessor::get_updated_xml */ public function test_get_updated_xml_applies_the_updates_so_far_and_keeps_the_processor_on_the_current_tag() { - $processor = XMLProcessor::create_from_string( 'Test' ); + $processor = XMLProcessor::create_from_string( 'Test' ); + $processor->next_tag(); + $processor->next_tag(); $processor->remove_attribute( '', 'id' ); @@ -488,7 +490,7 @@ public function test_get_updated_xml_applies_the_updates_so_far_and_keeps_the_pr $processor->set_attribute( '', 'id', 'content-id-1' ); $this->assertSame( - 'Test', + 'Test', $processor->get_updated_xml(), 'Calling get_updated_xml after updating the attributes of the second tag returned different XML than expected' ); @@ -496,7 +498,7 @@ public function test_get_updated_xml_applies_the_updates_so_far_and_keeps_the_pr $processor->set_attribute( '', 'id', 'content-id-2' ); $this->assertSame( - 'Test', + 'Test', $processor->get_updated_xml(), 'Calling get_updated_xml after updating the attributes of the second tag for the second time returned different XML than expected' ); @@ -505,7 +507,7 @@ public function test_get_updated_xml_applies_the_updates_so_far_and_keeps_the_pr $processor->remove_attribute( '', 'id' ); $this->assertSame( - 'Test', + 'Test', $processor->get_updated_xml(), 'Calling get_updated_xml after removing the id attribute of the third tag returned different XML than expected' ); @@ -1717,6 +1719,27 @@ public function test_detects_invalid_document_no_root_tag() { $this->assertFalse( $processor->next_tag(), 'Found an element when there was none.' ); $this->assertTrue( $processor->is_paused_at_incomplete_input(), 'Did not indicate that the XML input was incomplete.' ); } + + /** + * + * @covers XMLProcessor::next_tag + */ + public function test_tolerates_illegal_extender_in_pi_target() { + $processor = XMLProcessor::create_from_string( + ' + +]> + +' + ); + $this->assertTrue( $processor->next_tag(), 'Found an element when there was none.' ); + $this->assertEquals( 'animal', $processor->get_tag_local_name(), 'Did not find the expected tag.' ); + $this->assertTrue( $processor->next_token(), 'Found an element when there was none.' ); + $this->assertFalse( $processor->next_token(), 'Found an element when there was none.' ); + $this->assertNull( $processor->get_last_error(), 'Did not find the expected error.' ); + $this->assertNull( $processor->get_exception(), 'Did not find the expected error.' ); + } /** * diff --git a/components/XML/class-xmlprocessor.php b/components/XML/class-xmlprocessor.php index 0af43158..e56343fe 100644 --- a/components/XML/class-xmlprocessor.php +++ b/components/XML/class-xmlprocessor.php @@ -4086,15 +4086,16 @@ private function step( $node_to_process = self::PROCESS_NEXT_NODE ) { return false; } - if ( self::PROCESS_NEXT_NODE === $node_to_process ) { - if ( $this->is_empty_element() ) { - array_pop( $this->stack_of_open_elements ); - $this->element = $this->top_element(); - if ( 0 === count( $this->stack_of_open_elements ) ) { - $this->parser_context = self::IN_MISC_CONTEXT; + if ( self::PROCESS_NEXT_NODE === $node_to_process ) { + if ( $this->is_empty_element() ) { + array_pop( $this->stack_of_open_elements ); + if ( empty( $this->stack_of_open_elements ) ) { + // We've just popped the root element – the context + // becomes "misc" by definition. + $this->parser_context = self::IN_MISC_CONTEXT; + } } } - } try { switch ( $this->parser_context ) { From 17a23c0b052932b5bd521693425f02a9ece53683 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Sun, 2 Nov 2025 13:26:12 +0100 Subject: [PATCH 05/10] phpcbf --- components/XML/class-xmlprocessor.php | 37 +++++++++++++-------------- 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/components/XML/class-xmlprocessor.php b/components/XML/class-xmlprocessor.php index e56343fe..df14be05 100644 --- a/components/XML/class-xmlprocessor.php +++ b/components/XML/class-xmlprocessor.php @@ -34,7 +34,6 @@ * @TODO: Track specific error states, expose informative messages, line * numbers, indexes, and other debugging info. * - * * @TODO: Support XML 1.1. * * @TODO: Evaluate the performance of utf8_codepoint_at() against using the mbstring @@ -1934,23 +1933,9 @@ private function parse_next_tag() { $quoted_string_length - 2 ); $at += $quoted_string_length; - } - - // Skip whitespace. - $at += strspn( $this->xml, " \t\f\r\n", $at ); - - if ( $doc_length <= $at ) { - $this->mark_incomplete_input( 'Unclosed DOCTYPE declaration.' ); - - return false; - } - - if ( '[' === $this->xml[ $at ] ) { - if ( ! $this->skip_doctype_internal_subset( $at ) ) { - return false; } - // Skip whitespace following the internal subset. + // Skip whitespace. $at += strspn( $this->xml, " \t\f\r\n", $at ); if ( $doc_length <= $at ) { @@ -1958,9 +1943,23 @@ private function parse_next_tag() { return false; } - } - if ( '>' !== $this->xml[ $at ] ) { + if ( '[' === $this->xml[ $at ] ) { + if ( ! $this->skip_doctype_internal_subset( $at ) ) { + return false; + } + + // Skip whitespace following the internal subset. + $at += strspn( $this->xml, " \t\f\r\n", $at ); + + if ( $doc_length <= $at ) { + $this->mark_incomplete_input( 'Unclosed DOCTYPE declaration.' ); + + return false; + } + } + + if ( '>' !== $this->xml[ $at ] ) { $this->bail( sprintf( 'Syntax error in DOCTYPE declaration. Unexpected character "%s" at position %d.', @@ -2638,7 +2637,7 @@ private function skip_conditional_section_body( &$offset, $section_type ) { * @return bool Whether the declaration was fully consumed. */ private function skip_markup_declaration( &$offset ) { - $doc_length = strlen( $this->xml ); + $doc_length = strlen( $this->xml ); $keyword_length = $this->parse_name( $offset ); if ( 0 === $keyword_length ) { From b56bbcae29922d2ec3b15a7d8666142ef2c08b7b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Sun, 2 Nov 2025 14:32:50 +0100 Subject: [PATCH 06/10] support incomplete markup declarations --- components/XML/Tests/XMLProcessorTest.php | 23 ++++++++ components/XML/class-xmlprocessor.php | 66 +++++++++++++++++------ 2 files changed, 73 insertions(+), 16 deletions(-) diff --git a/components/XML/Tests/XMLProcessorTest.php b/components/XML/Tests/XMLProcessorTest.php index d7ded611..ef03a63a 100644 --- a/components/XML/Tests/XMLProcessorTest.php +++ b/components/XML/Tests/XMLProcessorTest.php @@ -1245,6 +1245,29 @@ public static function data_incomplete_syntax_elements() { 'Incomplete CDATA' => array( ' array( ' array( ' array( ' array( ' array( ' array( " array( ' array( ' array( ' array( ' array( ' array( ' array( ' array( ' array( ' array( ' array( ' array( ' array( ' array( ' array( ' array( ' array( ' array( ' array( 'text' ), ); } diff --git a/components/XML/class-xmlprocessor.php b/components/XML/class-xmlprocessor.php index df14be05..cb6acd32 100644 --- a/components/XML/class-xmlprocessor.php +++ b/components/XML/class-xmlprocessor.php @@ -1918,6 +1918,12 @@ private function parse_next_tag() { // Skip whitespace. $at += strspn( $this->xml, " \t\f\r\n", $at ); + if ( $doc_length <= $at ) { + $this->mark_incomplete_input( 'Unclosed SYSTEM literal.' ); + + return false; + } + // Parse the SystemLiteral token. $quoted_string_length = $this->parse_quoted_string( $at ); if ( self::STATE_INCOMPLETE_INPUT === $this->parser_state ) { @@ -1933,6 +1939,16 @@ private function parse_next_tag() { $quoted_string_length - 2 ); $at += $quoted_string_length; + } else { + $chars = strspn( $this->xml, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ', $at ); + if ( $chars === $doc_length - $at ) { + // The document ends with something like: + // mark_incomplete_input( 'Unclosed DOCTYPE declaration.' ); + + return false; + } } // Skip whitespace. @@ -2475,15 +2491,25 @@ private function skip_conditional_section( &$offset ) { $this->bail( 'Invalid conditional section declaration.', self::ERROR_SYNTAX ); } - $section_type = strtoupper( substr( $this->xml, $offset, $keyword_length ) ); - $offset += $keyword_length; + if ( 0 === substr_compare( $this->xml, 'INCLUDE', $offset, min( strlen( 'INCLUDE' ), $keyword_length ), true ) ) { + if ( $keyword_length < strlen( 'INCLUDE' ) && $offset + $keyword_length >= $doc_length ) { + $this->mark_incomplete_input( 'Unfinished conditional section keyword.' ); - if ( 'INCLUDE' !== $section_type && 'IGNORE' !== $section_type ) { - $this->bail( - sprintf( 'Unsupported conditional section keyword "%s".', $section_type ), - self::ERROR_SYNTAX - ); + return false; + } + $section_type = 'INCLUDE'; + } elseif ( 0 === substr_compare( $this->xml, 'IGNORE', $offset, min( strlen( 'IGNORE' ), $keyword_length ), true ) ) { + if ( $keyword_length < strlen( 'IGNORE' ) && $offset + $keyword_length >= $doc_length ) { + $this->mark_incomplete_input( 'Unfinished conditional section keyword.' ); + + return false; + } + $section_type = 'IGNORE'; + } else { + $this->bail( 'Unsupported conditional section keyword.', self::ERROR_SYNTAX ); } + + $offset += $keyword_length; } $offset += strspn( $this->xml, " \t\f\r\n", $offset ); @@ -2631,28 +2657,36 @@ private function skip_conditional_section_body( &$offset, $section_type ) { } /** - * Skips over a markup declaration following a ' + * - + * - + * - * * @param int &$offset Character offset immediately after 'xml ); $keyword_length = $this->parse_name( $offset ); if ( 0 === $keyword_length ) { $this->bail( 'Malformed markup declaration in DOCTYPE internal subset.', self::ERROR_SYNTAX ); } - $keyword = strtoupper( substr( $this->xml, $offset, $keyword_length ) ); - - if ( ! in_array( $keyword, array( 'ELEMENT', 'ATTLIST', 'ENTITY', 'NOTATION' ), true ) ) { - $this->bail( - sprintf( 'Unsupported markup declaration "xml, $keyword, $offset, min( strlen( $keyword ), $keyword_length ), true ) ) { + if ( $keyword_length < strlen( $keyword ) && $offset + $keyword_length >= strlen( $this->xml ) ) { + $this->mark_incomplete_input( 'Unfinished markup declaration keyword.' ); + + return false; + } + } + } $offset += $keyword_length; return $this->skip_markup_declaration_body( $offset ); From d367635db5ea7abca46cced9e6e8503509712d02 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Sun, 2 Nov 2025 15:29:47 +0100 Subject: [PATCH 07/10] Improve DTD docstrings, do not use &$offset reference --- components/XML/class-xmlprocessor.php | 133 ++++++++++++++++++-------- 1 file changed, 92 insertions(+), 41 deletions(-) diff --git a/components/XML/class-xmlprocessor.php b/components/XML/class-xmlprocessor.php index cb6acd32..b9b6afcd 100644 --- a/components/XML/class-xmlprocessor.php +++ b/components/XML/class-xmlprocessor.php @@ -1961,10 +1961,13 @@ private function parse_next_tag() { } if ( '[' === $this->xml[ $at ] ) { - if ( ! $this->skip_doctype_internal_subset( $at ) ) { + $new_at = $this->skip_doctype_internal_dtd_subset( $at ); + if ( false === $new_at ) { return false; } + $at = $new_at; + // Skip whitespace following the internal subset. $at += strspn( $this->xml, " \t\f\r\n", $at ); @@ -2347,13 +2350,16 @@ private function skip_whitespace() { } /** - * Skips over the internal subset of a DOCTYPE declaration. + * Skips over the internal subset of a DOCTYPE declaration: + * + * + * ^^^^^^^^^^^^^^^^^ + * this part * - * @param int &$offset Character offset of the '[' that opens the subset. The - * offset is updated to point right after the closing ']'. - * @return bool Whether the subset was fully consumed. + * @param int $offset Byte offset of the '[' that opens the subset. + * @return int|false Updated offset pointing right after the closing ']', or false on failure. */ - private function skip_doctype_internal_subset( &$offset ) { + private function skip_doctype_internal_dtd_subset( $offset ) { $doc_length = strlen( $this->xml ); // Consume the opening '['. @@ -2371,11 +2377,12 @@ private function skip_doctype_internal_subset( &$offset ) { if ( ']' === $this->xml[ $offset ] ) { ++$offset; - return true; + return $offset; } if ( '%' === $this->xml[ $offset ] ) { - if ( ! $this->skip_parameter_entity_reference( $offset ) ) { + $offset = $this->skip_dtd_parameter_entity_reference( $offset ); + if ( false === $offset ) { return false; } @@ -2383,7 +2390,8 @@ private function skip_doctype_internal_subset( &$offset ) { } if ( '<' === $this->xml[ $offset ] ) { - if ( ! $this->skip_dtd_markup( $offset ) ) { + $offset = $this->skip_dtd_markup( $offset ); + if ( false === $offset ) { return false; } @@ -2408,10 +2416,14 @@ private function skip_doctype_internal_subset( &$offset ) { /** * Skips a single markup declaration, comment, processing instruction, or conditional section. * - * @param int &$offset Character offset pointing to the '<' that begins the markup. - * @return bool Whether the markup was fully consumed. + * ]> + * ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + * this entire part + * + * @param int $offset Byte offset pointing to the '<' that begins the markup. + * @return int|false Updated offset on success, false on failure. */ - private function skip_dtd_markup( &$offset ) { + private function skip_dtd_markup( $offset ) { $doc_length = strlen( $this->xml ); if ( $offset + 1 >= $doc_length ) { @@ -2431,8 +2443,7 @@ private function skip_dtd_markup( &$offset ) { } $offset = $closer + 2; - - return true; + return $offset; } if ( '!' !== $next ) { @@ -2449,7 +2460,7 @@ private function skip_dtd_markup( &$offset ) { $offset = $closer + 3; - return true; + return $offset; } if ( $offset + 2 < $doc_length && '[' === $this->xml[ $offset + 2 ] ) { @@ -2466,10 +2477,16 @@ private function skip_dtd_markup( &$offset ) { /** * Skips over a conditional section, including any nested sections it may contain. * - * @param int &$offset Character offset immediately after the ']]> + * ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + * this entire section + * + * @see https://www.w3.org/TR/xml/#sec-condition-sect + * + * @param int $offset Byte offset immediately after the 'xml ); $section_type = 'UNKNOWN'; @@ -2482,7 +2499,8 @@ private function skip_conditional_section( &$offset ) { } if ( '%' === $this->xml[ $offset ] ) { - if ( ! $this->skip_parameter_entity_reference( $offset ) ) { + $offset = $this->skip_dtd_parameter_entity_reference( $offset ); + if ( false === $offset ) { return false; } } else { @@ -2526,7 +2544,8 @@ private function skip_conditional_section( &$offset ) { ++$offset; - if ( ! $this->skip_conditional_section_body( $offset, $section_type ) ) { + $offset = $this->skip_conditional_section_body( $offset, $section_type ); + if ( false === $offset ) { return false; } @@ -2542,17 +2561,22 @@ private function skip_conditional_section( &$offset ) { $offset += 3; - return true; + return $offset; } /** * Scans the contents of a conditional section until the matching "]]>" sequence. * - * @param int &$offset Character offset immediately after the content opener '['. + * ]]> + * ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + * this entire part + * + * @see https://www.w3.org/TR/xml/#sec-condition-sect + * @param int $offset Byte offset immediately after the content opener '['. * @param string $section_type Either 'INCLUDE', 'IGNORE', or 'UNKNOWN'. - * @return bool Whether the body was fully consumed. + * @return int|false Updated offset on success, false on failure. */ - private function skip_conditional_section_body( &$offset, $section_type ) { + private function skip_conditional_section_body( $offset, $section_type ) { $doc_length = strlen( $this->xml ); while ( $offset < $doc_length ) { @@ -2562,7 +2586,7 @@ private function skip_conditional_section_body( &$offset, $section_type ) { ']' === $this->xml[ $offset + 1 ] && '>' === $this->xml[ $offset + 2 ] ) { - return true; + return $offset; } $char = $this->xml[ $offset ]; @@ -2578,7 +2602,8 @@ private function skip_conditional_section_body( &$offset, $section_type ) { } if ( '%' === $char ) { - if ( ! $this->skip_parameter_entity_reference( $offset ) ) { + $offset = $this->skip_dtd_parameter_entity_reference( $offset ); + if ( false === $offset ) { return false; } @@ -2599,7 +2624,8 @@ private function skip_conditional_section_body( &$offset, $section_type ) { ) { $offset += 3; - if ( ! $this->skip_conditional_section( $offset ) ) { + $offset = $this->skip_conditional_section( $offset ); + if ( false === $offset ) { return false; } @@ -2627,7 +2653,8 @@ private function skip_conditional_section_body( &$offset, $section_type ) { if ( '!' === $this->xml[ $offset + 1 ] ) { $offset += 2; - if ( ! $this->skip_markup_declaration( $offset ) ) { + $offset = $this->skip_markup_declaration( $offset ); + if ( false === $offset ) { return false; } @@ -2658,15 +2685,21 @@ private function skip_conditional_section_body( &$offset, $section_type ) { /** * Skips the following markup declarations following a ' + * ^^^^^^^^^^^^^^^^^^^^^^ + * this entire part + * + * Supported markup declarations: * - * - * - * - * - * @param int &$offset Character offset immediately after 'parse_name( $offset ); if ( 0 === $keyword_length ) { @@ -2695,10 +2728,15 @@ private function skip_markup_declaration( &$offset ) { /** * Scans a markup declaration until its closing '>'. * - * @param int &$offset Character offset immediately after the markup keyword. - * @return bool Whether the declaration was fully consumed. + * + * ^^^^^^^^^^^^^^ + * this part + * + * @see https://www.w3.org/TR/xml/#dt-markupdecl + * @param int $offset Byte offset immediately after the markup keyword (e.g. 'ELEMENT', 'ATTLIST', 'ENTITY', 'NOTATION'). + * @return int|false Updated offset on success, false on failure. */ - private function skip_markup_declaration_body( &$offset ) { + private function skip_markup_declaration_body( $offset ) { $doc_length = strlen( $this->xml ); while ( $offset < $doc_length ) { @@ -2715,7 +2753,8 @@ private function skip_markup_declaration_body( &$offset ) { } if ( '%' === $char ) { - if ( ! $this->skip_parameter_entity_reference( $offset ) ) { + $offset = $this->skip_dtd_parameter_entity_reference( $offset ); + if ( false === $offset ) { return false; } @@ -2725,7 +2764,7 @@ private function skip_markup_declaration_body( &$offset ) { if ( '>' === $char ) { ++$offset; - return true; + return $offset; } if ( '<' === $char ) { @@ -2741,12 +2780,18 @@ private function skip_markup_declaration_body( &$offset ) { } /** - * Skips over a parameter entity reference beginning at $offset. + * Skips over a **parameter entity reference** beginning at $offset. + * $offset must point to the initial '%' byte of the reference. + * + * + * ^^^^^^^^^ + * this part * - * @param int &$offset Character offset at the '%'. - * @return bool Whether the reference was fully consumed. + * @see https://www.w3.org/TR/xml/#dt-PERef + * @param int $offset Byte offset at the '%'. + * @return int|false Updated offset on success, false on failure. */ - private function skip_parameter_entity_reference( &$offset ) { + private function skip_dtd_parameter_entity_reference( $offset ) { $doc_length = strlen( $this->xml ); if ( '%' !== $this->xml[ $offset ] ) { @@ -2760,6 +2805,12 @@ private function skip_parameter_entity_reference( &$offset ) { $name_length = $this->parse_name( $offset ); if ( 0 === $name_length ) { + if ( $offset >= $doc_length ) { + $this->mark_incomplete_input( 'Unterminated parameter entity reference in DOCTYPE declaration.' ); + + return false; + } + $this->bail( 'Invalid parameter entity reference in DOCTYPE declaration.', self::ERROR_SYNTAX ); } @@ -2767,7 +2818,7 @@ private function skip_parameter_entity_reference( &$offset ) { if ( $had_whitespace ) { // Parameter entity declaration (e.g. ""). - return true; + return $offset; } if ( $offset >= $doc_length ) { @@ -2782,7 +2833,7 @@ private function skip_parameter_entity_reference( &$offset ) { ++$offset; - return true; + return $offset; } /** From d6a467d35b11f2bb0cef05ad5f802f5e8726edc0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Sun, 2 Nov 2025 15:39:28 +0100 Subject: [PATCH 08/10] phpcs --- components/XML/class-xmlprocessor.php | 2 -- 1 file changed, 2 deletions(-) diff --git a/components/XML/class-xmlprocessor.php b/components/XML/class-xmlprocessor.php index 74fc83a6..930e5e22 100644 --- a/components/XML/class-xmlprocessor.php +++ b/components/XML/class-xmlprocessor.php @@ -34,11 +34,9 @@ * which is a more complex specification and not so widely supported. * * @TODO: Include the cursor string in internal bookmarks and use it for seeking. - * * @TODO: Track specific error states, expose informative messages, line * numbers, indexes, and other debugging info. * - * * @package WordPress * @subpackage HTML-API * @since WP_VERSION From 661382d196943639ed95716f4e8d6c9049b40c59 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Sun, 2 Nov 2025 15:45:16 +0100 Subject: [PATCH 09/10] phpcs --- components/XML/Tests/XMLProcessorTest.php | 37 +++++++++++++++++++++++ components/XML/class-xmlprocessor.php | 5 +++ 2 files changed, 42 insertions(+) diff --git a/components/XML/Tests/XMLProcessorTest.php b/components/XML/Tests/XMLProcessorTest.php index ef03a63a..d3818647 100644 --- a/components/XML/Tests/XMLProcessorTest.php +++ b/components/XML/Tests/XMLProcessorTest.php @@ -1271,6 +1271,43 @@ public static function data_incomplete_syntax_elements() { ); } + public function test_stream_parsing_of_incomplete_doctypes() { + $XML = 'text]]> + text]]> + + + + + + "> + + + + ]]> + + ]]> + ]> + '; + $processor = XMLProcessor::create_for_streaming( '' ); + + // Append one byte at a time, keep trying to advance, and confirm the + // parser does not emit an error at any point. + for($i = 0; $i < strlen($XML); $i++) { + $processor->append_bytes( $XML[$i] ); + $processor->next_token(); + $this->assertNull( $processor->get_exception() ); + $this->assertNull( $processor->get_last_error() ); + } + $processor->append_bytes( '' ); + $this->assertTrue( $processor->next_tag(), 'Did not find the root node.' ); + $this->assertEquals( 'root', $processor->get_tag_local_name(), 'Did not find text node.' ); + $this->assertFalse( $processor->next_token(), 'Found text node when there was none.' ); + $this->assertNull( $processor->get_exception() ); + } + /** * Ensures that the processor doesn't attempt to match an incomplete text node. * diff --git a/components/XML/class-xmlprocessor.php b/components/XML/class-xmlprocessor.php index 930e5e22..ff0e3c77 100644 --- a/components/XML/class-xmlprocessor.php +++ b/components/XML/class-xmlprocessor.php @@ -2700,6 +2700,11 @@ private function skip_markup_declaration( $offset ) { $keyword_length = $this->parse_name( $offset ); if ( 0 === $keyword_length ) { + if ( $offset >= strlen( $this->xml ) ) { + $this->mark_incomplete_input( 'Unfinished markup declaration keyword.' ); + + return false; + } $this->bail( 'Malformed markup declaration in DOCTYPE internal subset.', self::ERROR_SYNTAX ); } From 66152db53db996fc5e9aa2a26138eeee4cde5448 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Sun, 2 Nov 2025 16:03:27 +0100 Subject: [PATCH 10/10] Support single quotes in pub ids --- components/XML/Tests/W3CXMLConformanceTest.php | 6 ------ components/XML/Tests/XMLProcessorTest.php | 18 ------------------ components/XML/class-xmlprocessor.php | 2 +- 3 files changed, 1 insertion(+), 25 deletions(-) diff --git a/components/XML/Tests/W3CXMLConformanceTest.php b/components/XML/Tests/W3CXMLConformanceTest.php index 4d8036ea..012f6686 100644 --- a/components/XML/Tests/W3CXMLConformanceTest.php +++ b/components/XML/Tests/W3CXMLConformanceTest.php @@ -58,12 +58,6 @@ public function test_w3c_xml_test_case($test_id, $test_type, $test_file, $descri return; } - // Skip tests with PUBLIC/SYSTEM identifiers that test quote character edge cases - if ($test_type === 'valid' && preg_match('/ibm-valid-P1[23]-/', $test_id)) { - $this->markTestSkipped("Skipping test case: {$test_id} – XMLProcessor doesn't support mixed quote styles in PUBLIC/SYSTEM identifiers."); - return; - } - if (in_array($test_id, [ "not-sa01", "not-sa02", diff --git a/components/XML/Tests/XMLProcessorTest.php b/components/XML/Tests/XMLProcessorTest.php index d3818647..c38a254c 100644 --- a/components/XML/Tests/XMLProcessorTest.php +++ b/components/XML/Tests/XMLProcessorTest.php @@ -2698,24 +2698,6 @@ public static function data_reserved_namespace_declarations() { ); } - public function test_preserves_whitespace_with_xml_space_attribute() { - $xml = << - line1 - line2 - -XML; - $processor = XMLProcessor::create_from_string( $xml ); - $processor->next_tag( 'root' ); - - $this->assertTrue( $processor->next_token(), 'Did not find first text node.' ); - $this->assertEquals( "\n line1\n ", $processor->get_modifiable_text() ); - - $processor->next_tag( 'child' ); - $this->assertTrue( $processor->next_token(), 'Did not find second text node.' ); - $this->assertEquals( ' line2 ', $processor->get_modifiable_text() ); - } - public function test_skips_over_doctypes_atts_and_conditional_sections() { $xml = <<xml, $pubid_char, $at + 1 );