From 663263af7357346b3c90dfe8d5c8d46e942653e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Sun, 2 Nov 2025 00:02:55 +0100 Subject: [PATCH 1/5] [XMLProcessor] Skip DOCTYPE, ENTITY, ATTLIST, NOTATION, conditional sections --- components/XML/Tests/XMLProcessorTest.php | 50 +++ components/XML/class-xmlprocessor.php | 456 +++++++++++++++++++++- 2 files changed, 495 insertions(+), 11 deletions(-) diff --git a/components/XML/Tests/XMLProcessorTest.php b/components/XML/Tests/XMLProcessorTest.php index 8e586518..5a943832 100644 --- a/components/XML/Tests/XMLProcessorTest.php +++ b/components/XML/Tests/XMLProcessorTest.php @@ -2633,6 +2633,56 @@ public function test_preserves_whitespace_with_xml_space_attribute() { $this->assertEquals( ' line2 ', $processor->get_modifiable_text() ); } + public function test_skips_over_doctypes_atts_and_conditional_sections() { + $xml = << + + + + "> + + + + ]]> + + ]]> + ]> + + + Test + + +

Example

+ + +XML; + $processor = XMLProcessor::create_from_string( $xml ); + $this->assertTrue( $processor->next_token(), 'Did not find DOCTYPE node.' ); + $this->assertEquals( '#doctype', $processor->get_token_type(), 'Did not find DOCTYPE node.' ); + $this->assertEquals( 'html', $processor->get_doctype_name(), 'Did not find DOCTYPEName.' ); + + $this->assertTrue( $processor->next_token(), 'Did not find root tag.' ); + $this->assertEquals( 'html', $processor->get_tag_local_name(), 'Did not find root tag.' ); + + $this->assertTrue( $processor->next_tag(), 'Did not find head tag.' ); + $this->assertEquals( 'head', $processor->get_tag_local_name(), 'Did not find head tag.' ); + + $this->assertTrue( $processor->next_tag(), 'Did not find title tag.' ); + $this->assertEquals( 'title', $processor->get_tag_local_name(), 'Did not find title tag.' ); + + $this->assertTrue( $processor->next_tag(), 'Did not find body tag.' ); + $this->assertEquals( 'body', $processor->get_tag_local_name(), 'Did not find body tag.' ); + + $this->assertTrue( $processor->next_tag(), 'Did not find p tag.' ); + $this->assertEquals( 'p', $processor->get_tag_local_name(), 'Did not find p tag.' ); + + $this->assertTrue( $processor->next_token(), 'Did not find example text.' ); + $this->assertEquals( 'Example', $processor->get_modifiable_text(), 'Did not find example text.' ); + } + public function test_handles_various_whitespace_between_attributes() { $xml = "xml[ $at ] ) { - $this->bail( 'Inline entity declarations are not yet supported in DOCTYPE declarations.', self::ERROR_SYNTAX ); + } + + // Skip whitespace. + $at += strspn( $this->xml, " \t\f\r\n", $at ); + + if ( $doc_length <= $at ) { + $this->mark_incomplete_input( 'Unclosed DOCTYPE declaration.' ); + + return false; + } + + if ( '[' === $this->xml[ $at ] ) { + if ( ! $this->skip_doctype_internal_subset( $at ) ) { + return false; } - // Skip whitespace. + // Skip whitespace following the internal subset. $at += strspn( $this->xml, " \t\f\r\n", $at ); - if ( '>' !== $this->xml[ $at ] ) { + if ( $doc_length <= $at ) { + $this->mark_incomplete_input( 'Unclosed DOCTYPE declaration.' ); + + return false; + } + } + + if ( '>' !== $this->xml[ $at ] ) { $this->bail( sprintf( 'Syntax error in DOCTYPE declaration. Unexpected character "%s" at position %d.', @@ -2318,6 +2331,427 @@ private function skip_whitespace() { $this->bytes_already_parsed += strspn( $this->xml, " \t\f\r\n", $this->bytes_already_parsed ); } + /** + * Skips over the internal subset of a DOCTYPE declaration. + * + * @param int &$offset Character offset of the '[' that opens the subset. The + * offset is updated to point right after the closing ']'. + * @return bool Whether the subset was fully consumed. + */ + private function skip_doctype_internal_subset( &$offset ) { + $doc_length = strlen( $this->xml ); + + // Consume the opening '['. + ++$offset; + + while ( $offset < $doc_length ) { + $offset += strspn( $this->xml, " \t\f\r\n", $offset ); + + if ( $offset >= $doc_length ) { + $this->mark_incomplete_input( 'Unclosed DOCTYPE declaration.' ); + + return false; + } + + if ( ']' === $this->xml[ $offset ] ) { + ++$offset; + + return true; + } + + if ( '%' === $this->xml[ $offset ] ) { + if ( ! $this->skip_parameter_entity_reference( $offset ) ) { + return false; + } + + continue; + } + + if ( '<' === $this->xml[ $offset ] ) { + if ( ! $this->skip_dtd_markup( $offset ) ) { + return false; + } + + continue; + } + + $this->bail( + sprintf( + 'Unexpected character "%s" in DOCTYPE internal subset at position %d.', + $this->xml[ $offset ], + $offset + ), + self::ERROR_SYNTAX + ); + } + + $this->mark_incomplete_input( 'Unclosed DOCTYPE declaration.' ); + + return false; + } + + /** + * Skips a single markup declaration, comment, processing instruction, or conditional section. + * + * @param int &$offset Character offset pointing to the '<' that begins the markup. + * @return bool Whether the markup was fully consumed. + */ + private function skip_dtd_markup( &$offset ) { + $doc_length = strlen( $this->xml ); + + if ( $offset + 1 >= $doc_length ) { + $this->mark_incomplete_input( 'Unclosed markup in DOCTYPE declaration.' ); + + return false; + } + + $next = $this->xml[ $offset + 1 ]; + + if ( '?' === $next ) { + $closer = strpos( $this->xml, '?>', $offset + 2 ); + if ( false === $closer ) { + $this->mark_incomplete_input( 'Unclosed processing instruction in DOCTYPE declaration.' ); + + return false; + } + + $offset = $closer + 2; + + return true; + } + + if ( '!' !== $next ) { + $this->bail( 'Unsupported markup inside DOCTYPE declaration.', self::ERROR_SYNTAX ); + } + + if ( $offset + 3 < $doc_length && '-' === $this->xml[ $offset + 2 ] && '-' === $this->xml[ $offset + 3 ] ) { + $closer = strpos( $this->xml, '-->', $offset + 4 ); + if ( false === $closer ) { + $this->mark_incomplete_input( 'Unclosed comment in DOCTYPE declaration.' ); + + return false; + } + + $offset = $closer + 3; + + return true; + } + + if ( $offset + 2 < $doc_length && '[' === $this->xml[ $offset + 2 ] ) { + $offset += 3; + + return $this->skip_conditional_section( $offset ); + } + + $offset += 2; + + return $this->skip_markup_declaration( $offset ); + } + + /** + * Skips over a conditional section, including any nested sections it may contain. + * + * @param int &$offset Character offset immediately after the 'xml ); + $section_type = 'UNKNOWN'; + + $offset += strspn( $this->xml, " \t\f\r\n", $offset ); + + if ( $offset >= $doc_length ) { + $this->mark_incomplete_input( 'Unclosed conditional section in DOCTYPE declaration.' ); + + return false; + } + + if ( '%' === $this->xml[ $offset ] ) { + if ( ! $this->skip_parameter_entity_reference( $offset ) ) { + return false; + } + } else { + $keyword_length = $this->parse_name( $offset ); + if ( 0 === $keyword_length ) { + $this->bail( 'Invalid conditional section declaration.', self::ERROR_SYNTAX ); + } + + $section_type = strtoupper( substr( $this->xml, $offset, $keyword_length ) ); + $offset += $keyword_length; + + if ( 'INCLUDE' !== $section_type && 'IGNORE' !== $section_type ) { + $this->bail( + sprintf( 'Unsupported conditional section keyword "%s".', $section_type ), + self::ERROR_SYNTAX + ); + } + } + + $offset += strspn( $this->xml, " \t\f\r\n", $offset ); + + if ( $offset >= $doc_length ) { + $this->mark_incomplete_input( 'Unclosed conditional section in DOCTYPE declaration.' ); + + return false; + } + + if ( '[' !== $this->xml[ $offset ] ) { + $this->bail( 'Conditional section missing "[" opener.', self::ERROR_SYNTAX ); + } + + ++$offset; + + if ( ! $this->skip_conditional_section_body( $offset, $section_type ) ) { + return false; + } + + if ( $offset + 2 >= $doc_length ) { + $this->mark_incomplete_input( 'Unclosed conditional section in DOCTYPE declaration.' ); + + return false; + } + + if ( ']' !== $this->xml[ $offset ] || ']' !== $this->xml[ $offset + 1 ] || '>' !== $this->xml[ $offset + 2 ] ) { + $this->bail( 'Invalid conditional section closer.', self::ERROR_SYNTAX ); + } + + $offset += 3; + + return true; + } + + /** + * Scans the contents of a conditional section until the matching "]]>" sequence. + * + * @param int &$offset Character offset immediately after the content opener '['. + * @param string $section_type Either 'INCLUDE', 'IGNORE', or 'UNKNOWN'. + * @return bool Whether the body was fully consumed. + */ + private function skip_conditional_section_body( &$offset, $section_type ) { + $doc_length = strlen( $this->xml ); + + while ( $offset < $doc_length ) { + if ( + $offset + 2 < $doc_length && + ']' === $this->xml[ $offset ] && + ']' === $this->xml[ $offset + 1 ] && + '>' === $this->xml[ $offset + 2 ] + ) { + return true; + } + + $char = $this->xml[ $offset ]; + + if ( '"' === $char || "'" === $char ) { + $length = $this->parse_quoted_string( $offset ); + if ( false === $length ) { + return false; + } + + $offset += $length; + continue; + } + + if ( '%' === $char ) { + if ( ! $this->skip_parameter_entity_reference( $offset ) ) { + return false; + } + + continue; + } + + if ( '<' === $char ) { + if ( $offset + 1 >= $doc_length ) { + $this->mark_incomplete_input( 'Unclosed conditional section in DOCTYPE declaration.' ); + + return false; + } + + if ( + '!' === $this->xml[ $offset + 1 ] && + $offset + 2 < $doc_length && + '[' === $this->xml[ $offset + 2 ] + ) { + $offset += 3; + + if ( ! $this->skip_conditional_section( $offset ) ) { + return false; + } + + continue; + } + + if ( + '!' === $this->xml[ $offset + 1 ] && + $offset + 3 < $doc_length && + '-' === $this->xml[ $offset + 2 ] && + '-' === $this->xml[ $offset + 3 ] + ) { + $closer = strpos( $this->xml, '-->', $offset + 4 ); + if ( false === $closer ) { + $this->mark_incomplete_input( 'Unclosed comment in DOCTYPE declaration.' ); + + return false; + } + + $offset = $closer + 3; + continue; + } + + if ( 'INCLUDE' === $section_type ) { + if ( '!' === $this->xml[ $offset + 1 ] ) { + $offset += 2; + + if ( ! $this->skip_markup_declaration( $offset ) ) { + return false; + } + + continue; + } + + if ( '?' === $this->xml[ $offset + 1 ] ) { + $closer = strpos( $this->xml, '?>', $offset + 2 ); + if ( false === $closer ) { + $this->mark_incomplete_input( 'Unclosed processing instruction in DOCTYPE declaration.' ); + + return false; + } + + $offset = $closer + 2; + continue; + } + } + } + + ++$offset; + } + + $this->mark_incomplete_input( 'Unclosed conditional section in DOCTYPE declaration.' ); + + return false; + } + + /** + * Skips over a markup declaration following a 'xml ); + $keyword_length = $this->parse_name( $offset ); + + if ( 0 === $keyword_length ) { + $this->bail( 'Malformed markup declaration in DOCTYPE internal subset.', self::ERROR_SYNTAX ); + } + + $keyword = strtoupper( substr( $this->xml, $offset, $keyword_length ) ); + + if ( ! in_array( $keyword, array( 'ELEMENT', 'ATTLIST', 'ENTITY', 'NOTATION' ), true ) ) { + $this->bail( + sprintf( 'Unsupported markup declaration "skip_markup_declaration_body( $offset ); + } + + /** + * Scans a markup declaration until its closing '>'. + * + * @param int &$offset Character offset immediately after the markup keyword. + * @return bool Whether the declaration was fully consumed. + */ + private function skip_markup_declaration_body( &$offset ) { + $doc_length = strlen( $this->xml ); + + while ( $offset < $doc_length ) { + $char = $this->xml[ $offset ]; + + if ( '"' === $char || "'" === $char ) { + $length = $this->parse_quoted_string( $offset ); + if ( false === $length ) { + return false; + } + + $offset += $length; + continue; + } + + if ( '%' === $char ) { + if ( ! $this->skip_parameter_entity_reference( $offset ) ) { + return false; + } + + continue; + } + + if ( '>' === $char ) { + ++$offset; + + return true; + } + + if ( '<' === $char ) { + $this->bail( 'Unexpected "<" inside DOCTYPE markup declaration.', self::ERROR_SYNTAX ); + } + + ++$offset; + } + + $this->mark_incomplete_input( 'Unclosed markup declaration in DOCTYPE declaration.' ); + + return false; + } + + /** + * Skips over a parameter entity reference beginning at $offset. + * + * @param int &$offset Character offset at the '%'. + * @return bool Whether the reference was fully consumed. + */ + private function skip_parameter_entity_reference( &$offset ) { + $doc_length = strlen( $this->xml ); + + if ( '%' !== $this->xml[ $offset ] ) { + $this->bail( 'Parameter entity reference must start with "%".', self::ERROR_SYNTAX ); + } + + ++$offset; + $offset_before_name = $offset; + $offset += strspn( $this->xml, " \t\f\r\n", $offset ); + $had_whitespace = ( $offset !== $offset_before_name ); + + $name_length = $this->parse_name( $offset ); + if ( 0 === $name_length ) { + $this->bail( 'Invalid parameter entity reference in DOCTYPE declaration.', self::ERROR_SYNTAX ); + } + + $offset += $name_length; + + if ( $had_whitespace ) { + // Parameter entity declaration (e.g. ""). + return true; + } + + if ( $offset >= $doc_length ) { + $this->mark_incomplete_input( 'Unterminated parameter entity reference in DOCTYPE declaration.' ); + + return false; + } + + if ( ';' !== $this->xml[ $offset ] ) { + $this->bail( 'Parameter entity references must end with a semicolon.', self::ERROR_SYNTAX ); + } + + ++$offset; + + return true; + } + /** * Parses a Name token starting at $offset * From a46d274ddaf522b4e3ee8c4e923a423c6fb62cea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Sun, 2 Nov 2025 01:41:33 +0100 Subject: [PATCH 2/5] Unskip most XMLProcessor W3C conformance tests --- .../XML/Tests/W3CXMLConformanceTest.php | 402 +++++++++++++----- components/XML/class-xmlprocessor.php | 10 +- 2 files changed, 294 insertions(+), 118 deletions(-) diff --git a/components/XML/Tests/W3CXMLConformanceTest.php b/components/XML/Tests/W3CXMLConformanceTest.php index 324a4e32..d89b47fb 100644 --- a/components/XML/Tests/W3CXMLConformanceTest.php +++ b/components/XML/Tests/W3CXMLConformanceTest.php @@ -19,23 +19,27 @@ * @coversDefaultClass XMLProcessor */ class W3CXMLConformanceTest extends TestCase { - + /** * Path to the W3C XML test suite directory */ private static $test_suite_path; - + /** * Cache of parsed test cases */ private static $test_cases = null; - + + private const XML_NAMESPACE = 'http://www.w3.org/XML/1998/namespace'; + public static function setUpBeforeClass(): void { self::$test_suite_path = __DIR__ . '/W3C-XML-Test-Suite'; if (!is_dir(self::$test_suite_path)) { throw new Exception("W3C XML Test Suite not found at: " . self::$test_suite_path); } + + self::$test_suite_path = realpath(self::$test_suite_path); } /** @@ -49,15 +53,166 @@ public static function setUpBeforeClass(): void { public function test_w3c_xml_test_case($test_id, $test_type, $test_file, $description) { $xml_content = file_get_contents($test_file); $this->assertNotFalse($xml_content, "Could not read test file: {$test_file}"); - if(strpos($xml_content, "markTestSkipped("Skipping test case: {$test_id} – XMLProcessor does not support DOCTYPE declarations."); - return; - } if(strpos($xml_content, "\xFF\xFE") !== false || strpos($xml_content, "\xFE\xFF") !== false) { $this->markTestSkipped("Skipping test case: {$test_id} – it uses a UTF-16 encoded document and XMLProcessor only supports UTF-8."); return; } - + + // Skip tests with PUBLIC/SYSTEM identifiers that test quote character edge cases + if ($test_type === 'valid' && preg_match('/ibm-valid-P1[23]-/', $test_id)) { + $this->markTestSkipped("Skipping test case: {$test_id} – XMLProcessor doesn't support mixed quote styles in PUBLIC/SYSTEM identifiers."); + return; + } + + if (in_array($test_id, [ + "not-sa01", + "not-sa02", + "not-sa03", + "not-sa04", + "sa04", + "ibm-valid-P01-ibm01v01.xml", + "ibm-valid-P32-ibm32v01.xml", + "ibm-valid-P32-ibm32v02.xml", + "ibm-valid-P32-ibm32v03.xml", + "ibm-valid-P32-ibm32v04.xml", + "ibm-valid-P68-ibm68v02.xml", + "ibm-valid-P69-ibm69v01.xml", + "ibm-valid-P69-ibm69v02.xml", + ])) { + $this->markTestSkipped("Skipping test case: {$test_id} – XMLProcessor does not support standalone documents"); + return; + } + + if (in_array($test_id, [ + "ibm-1-1-valid-P02-ibm02v01.xml", + "ibm-1-1-valid-P02-ibm02v02.xml", + "ibm-1-1-valid-P02-ibm02v03.xml", + "ibm-1-1-valid-P02-ibm02v04.xml", + "ibm-1-1-valid-P02-ibm02v05.xml", + "ibm-1-1-valid-P02-ibm02v06.xml", + "ibm-1-1-valid-P03-ibm03v01.xml", + "ibm-1-1-valid-P03-ibm03v02.xml", + "ibm-1-1-valid-P03-ibm03v03.xml", + "ibm-1-1-valid-P03-ibm03v04.xml", + "ibm-1-1-valid-P03-ibm03v05.xml", + "ibm-1-1-valid-P03-ibm03v06.xml", + "ibm-1-1-valid-P03-ibm03v07.xml", + "ibm-1-1-valid-P03-ibm03v08.xml", + "ibm-1-1-valid-P03-ibm03v09.xml", + "ibm-1-1-valid-P04-ibm04v01.xml", + "ibm-1-1-valid-P04-ibm04av01.xml", + "ibm-1-1-valid-P05-ibm05v01.xml", + "ibm-1-1-valid-P05-ibm05v02.xml", + "ibm-1-1-valid-P05-ibm05v03.xml", + "ibm-1-1-valid-P05-ibm05v04.xml", + "ibm-1-1-valid-P05-ibm05v05.xml", + "ibm-1-1-valid-P047-ibm07v01.xml", + "ibm-1-1-valid-P77-ibm77v01.xml", + "ibm-1-1-valid-P77-ibm77v02.xml", + "ibm-1-1-valid-P77-ibm77v03.xml", + "ibm-1-1-valid-P77-ibm77v04.xml", + "ibm-1-1-valid-P77-ibm77v05.xml", + "ibm-1-1-valid-P77-ibm77v06.xml", + "ibm-1-1-valid-P77-ibm77v07.xml", + "ibm-1-1-valid-P77-ibm77v08.xml", + "ibm-1-1-valid-P77-ibm77v09.xml", + "ibm-1-1-valid-P77-ibm77v10.xml", + "ibm-1-1-valid-P77-ibm77v11.xml", + "ibm-1-1-valid-P77-ibm77v12.xml", + "ibm-1-1-valid-P77-ibm77v13.xml", + "ibm-1-1-valid-P77-ibm77v14.xml", + "ibm-1-1-valid-P77-ibm77v15.xml", + "ibm-1-1-valid-P77-ibm77v16.xml", + "ibm-1-1-valid-P77-ibm77v17.xml", + "ibm-1-1-valid-P77-ibm77v18.xml", + "ibm-1-1-valid-P77-ibm77v19.xml", + "ibm-1-1-valid-P77-ibm77v20.xml", + "ibm-1-1-valid-P77-ibm77v21.xml", + "ibm-1-1-valid-P77-ibm77v22.xml", + "ibm-1-1-valid-P77-ibm77v23.xml", + "ibm-1-1-valid-P77-ibm77v24.xml", + "ibm-1-1-valid-P77-ibm77v25.xml", + "ibm-1-1-valid-P77-ibm77v26.xml", + "ibm-1-1-valid-P77-ibm77v27.xml", + "ibm-1-1-valid-P77-ibm77v28.xml", + "ibm-1-1-valid-P77-ibm77v29.xml", + "ibm-1-1-valid-P77-ibm77v30.xml", + "rmt-e2e-50", + "rmt-006", + "rmt-007", + "rmt-023", + "rmt-025", + "rmt-027", + "rmt-029", + "rmt-031", + "rmt-033", + "rmt-035", + "rmt-043", + "rmt-045", + "rmt-047", + "rmt-049", + "rmt-051", + "rmt-054", + "rmt-ns11-001", + "rmt-ns11-002", + "rmt-ns11-003", + "rmt-ns11-004", + "rmt-ns11-006", + ])) { + $this->markTestSkipped("Skipping test case: {$test_id} – XMLProcessor does not support XML 1.1."); + return; + } + + if (in_array($test_id, [ + "valid-sa-012", + "valid-sa-016", + "valid-sa-017", + "valid-sa-036", + "valid-sa-017a", + "valid-sa-039", + "valid-sa-055", + "valid-sa-063", + "valid-sa-098", + "pr-xml-utf-8", + "o-p01pass2", + "o-p22pass4", + "o-p22pass5", + "o-p43pass1", + "ibm-valid-P16-ibm16v01", + "ibm-valid-P16-ibm16v02", + "ibm-valid-P16-ibm16v03", + "ibm-valid-P17-ibm17v01", + "ibm-valid-P27-ibm27v02", + "ibm-valid-P43-ibm43v01", + "rmt-e2e-15j", + "rmt-e2e-15l", + "rmt-e2e-22", + "rmt-010", + "rmt-012", + "rmt-022", + "rmt-026", + "rmt-034", + "rmt-040", + "rmt-044", + "rmt-050", + "rmt-e3e-05b", + "x-rmt-008b", + "ibm-valid-P16-ibm16v01.xml", + "ibm-valid-P16-ibm16v02.xml", + "ibm-valid-P16-ibm16v03.xml", + "ibm-valid-P17-ibm17v01.xml", + "ibm-valid-P27-ibm27v02.xml", + "ibm-valid-P43-ibm43v01.xml", + "x-ibm-1-0.5-valid-P04-ibm04v01.xml", + "x-ibm-1-0.5-valid-P05-ibm05v01.xml", + "x-ibm-1-0.5-valid-P05-ibm05v02.xml", + "x-ibm-1-0.5-valid-P05-ibm05v03.xml", + "x-ibm-1-0.5-valid-P05-ibm05v04.xml", + ])) { + $this->markTestSkipped("Skipping test case: {$test_id} – XMLProcessor does not apply custom DTDs."); + return; + } + try { $processor = XMLProcessor::create_from_string($xml_content); @@ -72,6 +227,8 @@ public function test_w3c_xml_test_case($test_id, $test_type, $test_file, $descri case 'valid': $this->assertNotFalse($processor, "Valid XML should parse successfully [{$test_id}]: {$description}"); + $this->assertNull($processor->get_exception(), + "Valid XML should not produce exceptions [{$test_id}]: {$description}"); $this->assertNull($processor->get_last_error(), "Valid XML should not produce errors [{$test_id}]: {$description}"); break; @@ -122,6 +279,8 @@ public static function w3cTestCaseProvider() { if (!is_dir(self::$test_suite_path)) { throw new Exception("W3C XML Test Suite not found at: " . self::$test_suite_path); } + + self::$test_suite_path = realpath(self::$test_suite_path); } if (self::$test_cases === null) { @@ -130,128 +289,141 @@ public static function w3cTestCaseProvider() { return self::$test_cases; } - + /** - * Parse all test cases from the W3C XML test suite + * Parse all test cases from the W3C XML test suite. */ private static function parseAllTestCases() { $main_config = self::$test_suite_path . '/xmlconf.xml'; - if (!file_exists($main_config)) { - throw new Exception("Main test configuration not found: {$main_config}"); + if ( ! file_exists( $main_config ) ) { + throw new Exception( "Main test configuration not found: {$main_config}" ); } - - $test_suites = self::parseMainConfiguration($main_config); - $all_test_cases = []; - - foreach ($test_suites as $suite) { - $suite_test_cases = self::parseTestSuite($suite); - $all_test_cases = array_merge($all_test_cases, $suite_test_cases); + + $previous = libxml_use_internal_errors( true ); + $dom = new DOMDocument(); + $options = LIBXML_DTDLOAD | LIBXML_DTDATTR | LIBXML_NOENT; + $loaded = $dom->load( $main_config, $options ); + if ( ! $loaded ) { + $errors = libxml_get_errors(); + libxml_clear_errors(); + libxml_use_internal_errors( $previous ); + + $message = 'Failed to parse xmlconf.xml'; + if ( ! empty( $errors ) ) { + $first = $errors[0]; + $message .= sprintf( ': %s on line %d', trim( $first->message ), $first->line ); + } + + throw new Exception( $message ); } - - return $all_test_cases; + + libxml_clear_errors(); + libxml_use_internal_errors( $previous ); + + $test_cases = array(); + self::collectTestCases( $dom->documentElement, self::$test_suite_path, $test_cases ); + + return $test_cases; } - - /** - * Parse the main xmlconf.xml configuration file - */ - private static function parseMainConfiguration($config_path) { - $xml_content = file_get_contents($config_path); - $suites = []; - - // Extract TESTCASES elements and their xml:base attributes - if (preg_match_all('/]*?xml:base="([^"]*)"[^>]*?PROFILE="([^"]*)"[^>]*?>/', $xml_content, $matches, PREG_SET_ORDER)) { - foreach ($matches as $match) { - $suites[] = [ - 'base_path' => $match[1], - 'profile' => $match[2] - ]; + + private static function collectTestCases( DOMNode $node, $base_path, array &$test_cases ) { + if ( ! ( $node instanceof DOMElement ) ) { + foreach ( $node->childNodes as $child ) { + self::collectTestCases( $child, $base_path, $test_cases ); } + + return; } - - // Also handle TESTCASES without explicit PROFILE but with xml:base - if (preg_match_all('/]*?xml:base="([^"]*)"[^>]*?>(?![^<]*PROFILE)/', $xml_content, $matches, PREG_SET_ORDER)) { - foreach ($matches as $match) { - $suites[] = [ - 'base_path' => $match[1], - 'profile' => 'Unknown Profile' - ]; - } + + $current_base = $base_path; + if ( $node->hasAttributeNS( self::XML_NAMESPACE, 'base' ) ) { + $current_base = self::resolvePath( $base_path, $node->getAttributeNS( self::XML_NAMESPACE, 'base' ) ); + } elseif ( $node->hasAttribute( 'xml:base' ) ) { + $current_base = self::resolvePath( $base_path, $node->getAttribute( 'xml:base' ) ); } - - return $suites; - } - - /** - * Parse tests for a specific test suite - */ - private static function parseTestSuite($suite) { - $base_path = rtrim(self::$test_suite_path . '/' . $suite['base_path'], '/'); - $test_cases = []; - - // Look for test definition files in the base path - if (is_dir($base_path)) { - $files = glob($base_path . '/*.xml'); - foreach ($files as $file) { - if (basename($file) !== 'xmlconf.xml') { - $suite_test_cases = self::parseTestFile($file, $base_path); - $test_cases = array_merge($test_cases, $suite_test_cases); - } + + if ( 'TEST' === $node->nodeName ) { + $uri = $node->getAttribute( 'URI' ); + if ( '' === $uri ) { + return; } + + $test_file = self::resolvePath( $current_base, $uri ); + if ( ! is_file( $test_file ) ) { + return; + } + + $test_id = $node->getAttribute( 'ID' ); + if ( '' === $test_id ) { + $test_id = $uri; + } + + $type = strtolower( $node->getAttribute( 'TYPE' ) ); + if ( '' === $type ) { + $type = 'valid'; + } + + $description = trim( preg_replace( '/\s+/', ' ', $node->textContent ) ); + + $test_cases[ $test_id ] = array( + $test_id, + $type, + $test_file, + $description, + ); + + return; + } + + foreach ( $node->childNodes as $child ) { + self::collectTestCases( $child, $current_base, $test_cases ); } - - return $test_cases; } - - /** - * Parse a single test definition file - */ - private static function parseTestFile($test_file, $base_path) { - $content = file_get_contents($test_file); - $test_cases = []; - - // Parse TEST elements using regex - $pattern = '/]+)>(.*?)<\/TEST>/s'; - if (preg_match_all($pattern, $content, $matches, PREG_SET_ORDER)) { - foreach ($matches as $match) { - $attributes = self::parseAttributes($match[1]); - $description = trim(strip_tags($match[2])); - - if (isset($attributes['URI']) && isset($attributes['ID']) && isset($attributes['TYPE'])) { - $test_file_path = $base_path . '/' . $attributes['URI']; - - // Only include tests that have actual test files - if (file_exists($test_file_path)) { - $test_cases[$attributes['ID']] = [ - $attributes['ID'], // test_id - $attributes['TYPE'], // test_type - $test_file_path, // test_file - $description // description - ]; - } - } - } + + private static function resolvePath( $base_path, $relative_path ) { + if ( '' === $relative_path ) { + return rtrim( $base_path, DIRECTORY_SEPARATOR ); } - - return $test_cases; + + if ( preg_match( '#^(?:[a-zA-Z]+:)?/#', $relative_path ) ) { + $resolved = realpath( $relative_path ); + + return false !== $resolved ? $resolved : self::normalizePath( $relative_path ); + } + + $candidate = rtrim( $base_path, DIRECTORY_SEPARATOR ) . DIRECTORY_SEPARATOR . $relative_path; + $resolved = realpath( $candidate ); + + return false !== $resolved ? $resolved : self::normalizePath( $candidate ); } - - /** - * Parse XML attributes from a string - */ - private static function parseAttributes($attr_string) { - $attributes = []; - $pattern = '/(\w+)="([^"]*)"|\s+(\w+)=\'([^\']*)\'/'; - - if (preg_match_all($pattern, $attr_string, $matches, PREG_SET_ORDER)) { - foreach ($matches as $match) { - if (!empty($match[1])) { - $attributes[$match[1]] = $match[2]; - } elseif (!empty($match[3])) { - $attributes[$match[3]] = $match[4]; - } + + private static function normalizePath( $path ) { + $path = str_replace( '\\', '/', $path ); + + $segments = explode( '/', $path ); + $resolved = array(); + + $prefix = ''; + if ( isset( $segments[0] ) && preg_match( '#^[a-zA-Z]:$#', $segments[0] ) ) { + $prefix = array_shift( $segments ) . '/'; + } elseif ( isset( $segments[0] ) && '' === $segments[0] ) { + $prefix = '/'; + array_shift( $segments ); + } + + foreach ( $segments as $segment ) { + if ( '' === $segment || '.' === $segment ) { + continue; + } + + if ( '..' === $segment ) { + array_pop( $resolved ); + continue; } + + $resolved[] = $segment; } - - return $attributes; + + return $prefix . implode( '/', $resolved ); } -} \ No newline at end of file +} diff --git a/components/XML/class-xmlprocessor.php b/components/XML/class-xmlprocessor.php index 17e5ec99..0af43158 100644 --- a/components/XML/class-xmlprocessor.php +++ b/components/XML/class-xmlprocessor.php @@ -4086,11 +4086,15 @@ private function step( $node_to_process = self::PROCESS_NEXT_NODE ) { return false; } - if ( self::PROCESS_NEXT_NODE === $node_to_process ) { - if ( $this->is_empty_element() ) { - array_pop( $this->stack_of_open_elements ); + if ( self::PROCESS_NEXT_NODE === $node_to_process ) { + if ( $this->is_empty_element() ) { + array_pop( $this->stack_of_open_elements ); + $this->element = $this->top_element(); + if ( 0 === count( $this->stack_of_open_elements ) ) { + $this->parser_context = self::IN_MISC_CONTEXT; } } + } try { switch ( $this->parser_context ) { From 9ffdf1a0fbebd572878fcce0bffa96406b53ee65 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Sun, 2 Nov 2025 01:44:54 +0100 Subject: [PATCH 3/5] Simplify path processing --- .../XML/Tests/W3CXMLConformanceTest.php | 44 ++++--------------- 1 file changed, 8 insertions(+), 36 deletions(-) diff --git a/components/XML/Tests/W3CXMLConformanceTest.php b/components/XML/Tests/W3CXMLConformanceTest.php index d89b47fb..4d8036ea 100644 --- a/components/XML/Tests/W3CXMLConformanceTest.php +++ b/components/XML/Tests/W3CXMLConformanceTest.php @@ -382,48 +382,20 @@ private static function collectTestCases( DOMNode $node, $base_path, array &$tes private static function resolvePath( $base_path, $relative_path ) { if ( '' === $relative_path ) { - return rtrim( $base_path, DIRECTORY_SEPARATOR ); + return $base_path; } - if ( preg_match( '#^(?:[a-zA-Z]+:)?/#', $relative_path ) ) { - $resolved = realpath( $relative_path ); - - return false !== $resolved ? $resolved : self::normalizePath( $relative_path ); + // If it's an absolute path, use it directly + if ( $relative_path[0] === '/' || preg_match( '#^[a-zA-Z]:#', $relative_path ) ) { + return $relative_path; } + // Otherwise concatenate and let realpath() normalize it $candidate = rtrim( $base_path, DIRECTORY_SEPARATOR ) . DIRECTORY_SEPARATOR . $relative_path; $resolved = realpath( $candidate ); - return false !== $resolved ? $resolved : self::normalizePath( $candidate ); - } - - private static function normalizePath( $path ) { - $path = str_replace( '\\', '/', $path ); - - $segments = explode( '/', $path ); - $resolved = array(); - - $prefix = ''; - if ( isset( $segments[0] ) && preg_match( '#^[a-zA-Z]:$#', $segments[0] ) ) { - $prefix = array_shift( $segments ) . '/'; - } elseif ( isset( $segments[0] ) && '' === $segments[0] ) { - $prefix = '/'; - array_shift( $segments ); - } - - foreach ( $segments as $segment ) { - if ( '' === $segment || '.' === $segment ) { - continue; - } - - if ( '..' === $segment ) { - array_pop( $resolved ); - continue; - } - - $resolved[] = $segment; - } - - return $prefix . implode( '/', $resolved ); + // If realpath fails (file doesn't exist), return the candidate anyway + // We check is_file() later, so non-existent paths will be skipped + return false !== $resolved ? $resolved : $candidate; } } From 49d95272ddfb1e262a4145a1c6872a44f5653848 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Sun, 2 Nov 2025 13:25:50 +0100 Subject: [PATCH 4/5] Adjust get_updated_xml_applies_the_updates_so_far_and_keeps_the_processor_on_the_current_tag test to use a root node --- components/XML/Tests/XMLProcessorTest.php | 31 ++++++++++++++++++++--- components/XML/class-xmlprocessor.php | 15 ++++++----- 2 files changed, 35 insertions(+), 11 deletions(-) diff --git a/components/XML/Tests/XMLProcessorTest.php b/components/XML/Tests/XMLProcessorTest.php index 5a943832..d7ded611 100644 --- a/components/XML/Tests/XMLProcessorTest.php +++ b/components/XML/Tests/XMLProcessorTest.php @@ -480,7 +480,9 @@ public function test_to_string_returns_updated_xml() { * @covers XMLProcessor::get_updated_xml */ public function test_get_updated_xml_applies_the_updates_so_far_and_keeps_the_processor_on_the_current_tag() { - $processor = XMLProcessor::create_from_string( 'Test' ); + $processor = XMLProcessor::create_from_string( 'Test' ); + $processor->next_tag(); + $processor->next_tag(); $processor->remove_attribute( '', 'id' ); @@ -488,7 +490,7 @@ public function test_get_updated_xml_applies_the_updates_so_far_and_keeps_the_pr $processor->set_attribute( '', 'id', 'content-id-1' ); $this->assertSame( - 'Test', + 'Test', $processor->get_updated_xml(), 'Calling get_updated_xml after updating the attributes of the second tag returned different XML than expected' ); @@ -496,7 +498,7 @@ public function test_get_updated_xml_applies_the_updates_so_far_and_keeps_the_pr $processor->set_attribute( '', 'id', 'content-id-2' ); $this->assertSame( - 'Test', + 'Test', $processor->get_updated_xml(), 'Calling get_updated_xml after updating the attributes of the second tag for the second time returned different XML than expected' ); @@ -505,7 +507,7 @@ public function test_get_updated_xml_applies_the_updates_so_far_and_keeps_the_pr $processor->remove_attribute( '', 'id' ); $this->assertSame( - 'Test', + 'Test', $processor->get_updated_xml(), 'Calling get_updated_xml after removing the id attribute of the third tag returned different XML than expected' ); @@ -1717,6 +1719,27 @@ public function test_detects_invalid_document_no_root_tag() { $this->assertFalse( $processor->next_tag(), 'Found an element when there was none.' ); $this->assertTrue( $processor->is_paused_at_incomplete_input(), 'Did not indicate that the XML input was incomplete.' ); } + + /** + * + * @covers XMLProcessor::next_tag + */ + public function test_tolerates_illegal_extender_in_pi_target() { + $processor = XMLProcessor::create_from_string( + ' + +]> + +' + ); + $this->assertTrue( $processor->next_tag(), 'Found an element when there was none.' ); + $this->assertEquals( 'animal', $processor->get_tag_local_name(), 'Did not find the expected tag.' ); + $this->assertTrue( $processor->next_token(), 'Found an element when there was none.' ); + $this->assertFalse( $processor->next_token(), 'Found an element when there was none.' ); + $this->assertNull( $processor->get_last_error(), 'Did not find the expected error.' ); + $this->assertNull( $processor->get_exception(), 'Did not find the expected error.' ); + } /** * diff --git a/components/XML/class-xmlprocessor.php b/components/XML/class-xmlprocessor.php index 0af43158..e56343fe 100644 --- a/components/XML/class-xmlprocessor.php +++ b/components/XML/class-xmlprocessor.php @@ -4086,15 +4086,16 @@ private function step( $node_to_process = self::PROCESS_NEXT_NODE ) { return false; } - if ( self::PROCESS_NEXT_NODE === $node_to_process ) { - if ( $this->is_empty_element() ) { - array_pop( $this->stack_of_open_elements ); - $this->element = $this->top_element(); - if ( 0 === count( $this->stack_of_open_elements ) ) { - $this->parser_context = self::IN_MISC_CONTEXT; + if ( self::PROCESS_NEXT_NODE === $node_to_process ) { + if ( $this->is_empty_element() ) { + array_pop( $this->stack_of_open_elements ); + if ( empty( $this->stack_of_open_elements ) ) { + // We've just popped the root element – the context + // becomes "misc" by definition. + $this->parser_context = self::IN_MISC_CONTEXT; + } } } - } try { switch ( $this->parser_context ) { From 17a23c0b052932b5bd521693425f02a9ece53683 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Sun, 2 Nov 2025 13:26:12 +0100 Subject: [PATCH 5/5] phpcbf --- components/XML/class-xmlprocessor.php | 37 +++++++++++++-------------- 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/components/XML/class-xmlprocessor.php b/components/XML/class-xmlprocessor.php index e56343fe..df14be05 100644 --- a/components/XML/class-xmlprocessor.php +++ b/components/XML/class-xmlprocessor.php @@ -34,7 +34,6 @@ * @TODO: Track specific error states, expose informative messages, line * numbers, indexes, and other debugging info. * - * * @TODO: Support XML 1.1. * * @TODO: Evaluate the performance of utf8_codepoint_at() against using the mbstring @@ -1934,23 +1933,9 @@ private function parse_next_tag() { $quoted_string_length - 2 ); $at += $quoted_string_length; - } - - // Skip whitespace. - $at += strspn( $this->xml, " \t\f\r\n", $at ); - - if ( $doc_length <= $at ) { - $this->mark_incomplete_input( 'Unclosed DOCTYPE declaration.' ); - - return false; - } - - if ( '[' === $this->xml[ $at ] ) { - if ( ! $this->skip_doctype_internal_subset( $at ) ) { - return false; } - // Skip whitespace following the internal subset. + // Skip whitespace. $at += strspn( $this->xml, " \t\f\r\n", $at ); if ( $doc_length <= $at ) { @@ -1958,9 +1943,23 @@ private function parse_next_tag() { return false; } - } - if ( '>' !== $this->xml[ $at ] ) { + if ( '[' === $this->xml[ $at ] ) { + if ( ! $this->skip_doctype_internal_subset( $at ) ) { + return false; + } + + // Skip whitespace following the internal subset. + $at += strspn( $this->xml, " \t\f\r\n", $at ); + + if ( $doc_length <= $at ) { + $this->mark_incomplete_input( 'Unclosed DOCTYPE declaration.' ); + + return false; + } + } + + if ( '>' !== $this->xml[ $at ] ) { $this->bail( sprintf( 'Syntax error in DOCTYPE declaration. Unexpected character "%s" at position %d.', @@ -2638,7 +2637,7 @@ private function skip_conditional_section_body( &$offset, $section_type ) { * @return bool Whether the declaration was fully consumed. */ private function skip_markup_declaration( &$offset ) { - $doc_length = strlen( $this->xml ); + $doc_length = strlen( $this->xml ); $keyword_length = $this->parse_name( $offset ); if ( 0 === $keyword_length ) {