diff --git a/components/XML/Tests/W3CXMLConformanceTest.php b/components/XML/Tests/W3CXMLConformanceTest.php index 324a4e32..012f6686 100644 --- a/components/XML/Tests/W3CXMLConformanceTest.php +++ b/components/XML/Tests/W3CXMLConformanceTest.php @@ -19,23 +19,27 @@ * @coversDefaultClass XMLProcessor */ class W3CXMLConformanceTest extends TestCase { - + /** * Path to the W3C XML test suite directory */ private static $test_suite_path; - + /** * Cache of parsed test cases */ private static $test_cases = null; - + + private const XML_NAMESPACE = 'http://www.w3.org/XML/1998/namespace'; + public static function setUpBeforeClass(): void { self::$test_suite_path = __DIR__ . '/W3C-XML-Test-Suite'; if (!is_dir(self::$test_suite_path)) { throw new Exception("W3C XML Test Suite not found at: " . self::$test_suite_path); } + + self::$test_suite_path = realpath(self::$test_suite_path); } /** @@ -49,15 +53,160 @@ public static function setUpBeforeClass(): void { public function test_w3c_xml_test_case($test_id, $test_type, $test_file, $description) { $xml_content = file_get_contents($test_file); $this->assertNotFalse($xml_content, "Could not read test file: {$test_file}"); - if(strpos($xml_content, "markTestSkipped("Skipping test case: {$test_id} – XMLProcessor does not support DOCTYPE declarations."); - return; - } if(strpos($xml_content, "\xFF\xFE") !== false || strpos($xml_content, "\xFE\xFF") !== false) { $this->markTestSkipped("Skipping test case: {$test_id} – it uses a UTF-16 encoded document and XMLProcessor only supports UTF-8."); return; } - + + if (in_array($test_id, [ + "not-sa01", + "not-sa02", + "not-sa03", + "not-sa04", + "sa04", + "ibm-valid-P01-ibm01v01.xml", + "ibm-valid-P32-ibm32v01.xml", + "ibm-valid-P32-ibm32v02.xml", + "ibm-valid-P32-ibm32v03.xml", + "ibm-valid-P32-ibm32v04.xml", + "ibm-valid-P68-ibm68v02.xml", + "ibm-valid-P69-ibm69v01.xml", + "ibm-valid-P69-ibm69v02.xml", + ])) { + $this->markTestSkipped("Skipping test case: {$test_id} – XMLProcessor does not support standalone documents"); + return; + } + + if (in_array($test_id, [ + "ibm-1-1-valid-P02-ibm02v01.xml", + "ibm-1-1-valid-P02-ibm02v02.xml", + "ibm-1-1-valid-P02-ibm02v03.xml", + "ibm-1-1-valid-P02-ibm02v04.xml", + "ibm-1-1-valid-P02-ibm02v05.xml", + "ibm-1-1-valid-P02-ibm02v06.xml", + "ibm-1-1-valid-P03-ibm03v01.xml", + "ibm-1-1-valid-P03-ibm03v02.xml", + "ibm-1-1-valid-P03-ibm03v03.xml", + "ibm-1-1-valid-P03-ibm03v04.xml", + "ibm-1-1-valid-P03-ibm03v05.xml", + "ibm-1-1-valid-P03-ibm03v06.xml", + "ibm-1-1-valid-P03-ibm03v07.xml", + "ibm-1-1-valid-P03-ibm03v08.xml", + "ibm-1-1-valid-P03-ibm03v09.xml", + "ibm-1-1-valid-P04-ibm04v01.xml", + "ibm-1-1-valid-P04-ibm04av01.xml", + "ibm-1-1-valid-P05-ibm05v01.xml", + "ibm-1-1-valid-P05-ibm05v02.xml", + "ibm-1-1-valid-P05-ibm05v03.xml", + "ibm-1-1-valid-P05-ibm05v04.xml", + "ibm-1-1-valid-P05-ibm05v05.xml", + "ibm-1-1-valid-P047-ibm07v01.xml", + "ibm-1-1-valid-P77-ibm77v01.xml", + "ibm-1-1-valid-P77-ibm77v02.xml", + "ibm-1-1-valid-P77-ibm77v03.xml", + "ibm-1-1-valid-P77-ibm77v04.xml", + "ibm-1-1-valid-P77-ibm77v05.xml", + "ibm-1-1-valid-P77-ibm77v06.xml", + "ibm-1-1-valid-P77-ibm77v07.xml", + "ibm-1-1-valid-P77-ibm77v08.xml", + "ibm-1-1-valid-P77-ibm77v09.xml", + "ibm-1-1-valid-P77-ibm77v10.xml", + "ibm-1-1-valid-P77-ibm77v11.xml", + "ibm-1-1-valid-P77-ibm77v12.xml", + "ibm-1-1-valid-P77-ibm77v13.xml", + "ibm-1-1-valid-P77-ibm77v14.xml", + "ibm-1-1-valid-P77-ibm77v15.xml", + "ibm-1-1-valid-P77-ibm77v16.xml", + "ibm-1-1-valid-P77-ibm77v17.xml", + "ibm-1-1-valid-P77-ibm77v18.xml", + "ibm-1-1-valid-P77-ibm77v19.xml", + "ibm-1-1-valid-P77-ibm77v20.xml", + "ibm-1-1-valid-P77-ibm77v21.xml", + "ibm-1-1-valid-P77-ibm77v22.xml", + "ibm-1-1-valid-P77-ibm77v23.xml", + "ibm-1-1-valid-P77-ibm77v24.xml", + "ibm-1-1-valid-P77-ibm77v25.xml", + "ibm-1-1-valid-P77-ibm77v26.xml", + "ibm-1-1-valid-P77-ibm77v27.xml", + "ibm-1-1-valid-P77-ibm77v28.xml", + "ibm-1-1-valid-P77-ibm77v29.xml", + "ibm-1-1-valid-P77-ibm77v30.xml", + "rmt-e2e-50", + "rmt-006", + "rmt-007", + "rmt-023", + "rmt-025", + "rmt-027", + "rmt-029", + "rmt-031", + "rmt-033", + "rmt-035", + "rmt-043", + "rmt-045", + "rmt-047", + "rmt-049", + "rmt-051", + "rmt-054", + "rmt-ns11-001", + "rmt-ns11-002", + "rmt-ns11-003", + "rmt-ns11-004", + "rmt-ns11-006", + ])) { + $this->markTestSkipped("Skipping test case: {$test_id} – XMLProcessor does not support XML 1.1."); + return; + } + + if (in_array($test_id, [ + "valid-sa-012", + "valid-sa-016", + "valid-sa-017", + "valid-sa-036", + "valid-sa-017a", + "valid-sa-039", + "valid-sa-055", + "valid-sa-063", + "valid-sa-098", + "pr-xml-utf-8", + "o-p01pass2", + "o-p22pass4", + "o-p22pass5", + "o-p43pass1", + "ibm-valid-P16-ibm16v01", + "ibm-valid-P16-ibm16v02", + "ibm-valid-P16-ibm16v03", + "ibm-valid-P17-ibm17v01", + "ibm-valid-P27-ibm27v02", + "ibm-valid-P43-ibm43v01", + "rmt-e2e-15j", + "rmt-e2e-15l", + "rmt-e2e-22", + "rmt-010", + "rmt-012", + "rmt-022", + "rmt-026", + "rmt-034", + "rmt-040", + "rmt-044", + "rmt-050", + "rmt-e3e-05b", + "x-rmt-008b", + "ibm-valid-P16-ibm16v01.xml", + "ibm-valid-P16-ibm16v02.xml", + "ibm-valid-P16-ibm16v03.xml", + "ibm-valid-P17-ibm17v01.xml", + "ibm-valid-P27-ibm27v02.xml", + "ibm-valid-P43-ibm43v01.xml", + "x-ibm-1-0.5-valid-P04-ibm04v01.xml", + "x-ibm-1-0.5-valid-P05-ibm05v01.xml", + "x-ibm-1-0.5-valid-P05-ibm05v02.xml", + "x-ibm-1-0.5-valid-P05-ibm05v03.xml", + "x-ibm-1-0.5-valid-P05-ibm05v04.xml", + ])) { + $this->markTestSkipped("Skipping test case: {$test_id} – XMLProcessor does not apply custom DTDs."); + return; + } + try { $processor = XMLProcessor::create_from_string($xml_content); @@ -72,6 +221,8 @@ public function test_w3c_xml_test_case($test_id, $test_type, $test_file, $descri case 'valid': $this->assertNotFalse($processor, "Valid XML should parse successfully [{$test_id}]: {$description}"); + $this->assertNull($processor->get_exception(), + "Valid XML should not produce exceptions [{$test_id}]: {$description}"); $this->assertNull($processor->get_last_error(), "Valid XML should not produce errors [{$test_id}]: {$description}"); break; @@ -122,6 +273,8 @@ public static function w3cTestCaseProvider() { if (!is_dir(self::$test_suite_path)) { throw new Exception("W3C XML Test Suite not found at: " . self::$test_suite_path); } + + self::$test_suite_path = realpath(self::$test_suite_path); } if (self::$test_cases === null) { @@ -130,128 +283,113 @@ public static function w3cTestCaseProvider() { return self::$test_cases; } - + /** - * Parse all test cases from the W3C XML test suite + * Parse all test cases from the W3C XML test suite. */ private static function parseAllTestCases() { $main_config = self::$test_suite_path . '/xmlconf.xml'; - if (!file_exists($main_config)) { - throw new Exception("Main test configuration not found: {$main_config}"); + if ( ! file_exists( $main_config ) ) { + throw new Exception( "Main test configuration not found: {$main_config}" ); } - - $test_suites = self::parseMainConfiguration($main_config); - $all_test_cases = []; - - foreach ($test_suites as $suite) { - $suite_test_cases = self::parseTestSuite($suite); - $all_test_cases = array_merge($all_test_cases, $suite_test_cases); + + $previous = libxml_use_internal_errors( true ); + $dom = new DOMDocument(); + $options = LIBXML_DTDLOAD | LIBXML_DTDATTR | LIBXML_NOENT; + $loaded = $dom->load( $main_config, $options ); + if ( ! $loaded ) { + $errors = libxml_get_errors(); + libxml_clear_errors(); + libxml_use_internal_errors( $previous ); + + $message = 'Failed to parse xmlconf.xml'; + if ( ! empty( $errors ) ) { + $first = $errors[0]; + $message .= sprintf( ': %s on line %d', trim( $first->message ), $first->line ); + } + + throw new Exception( $message ); } - - return $all_test_cases; + + libxml_clear_errors(); + libxml_use_internal_errors( $previous ); + + $test_cases = array(); + self::collectTestCases( $dom->documentElement, self::$test_suite_path, $test_cases ); + + return $test_cases; } - - /** - * Parse the main xmlconf.xml configuration file - */ - private static function parseMainConfiguration($config_path) { - $xml_content = file_get_contents($config_path); - $suites = []; - - // Extract TESTCASES elements and their xml:base attributes - if (preg_match_all('/]*?xml:base="([^"]*)"[^>]*?PROFILE="([^"]*)"[^>]*?>/', $xml_content, $matches, PREG_SET_ORDER)) { - foreach ($matches as $match) { - $suites[] = [ - 'base_path' => $match[1], - 'profile' => $match[2] - ]; + + private static function collectTestCases( DOMNode $node, $base_path, array &$test_cases ) { + if ( ! ( $node instanceof DOMElement ) ) { + foreach ( $node->childNodes as $child ) { + self::collectTestCases( $child, $base_path, $test_cases ); } + + return; } - - // Also handle TESTCASES without explicit PROFILE but with xml:base - if (preg_match_all('/]*?xml:base="([^"]*)"[^>]*?>(?![^<]*PROFILE)/', $xml_content, $matches, PREG_SET_ORDER)) { - foreach ($matches as $match) { - $suites[] = [ - 'base_path' => $match[1], - 'profile' => 'Unknown Profile' - ]; - } + + $current_base = $base_path; + if ( $node->hasAttributeNS( self::XML_NAMESPACE, 'base' ) ) { + $current_base = self::resolvePath( $base_path, $node->getAttributeNS( self::XML_NAMESPACE, 'base' ) ); + } elseif ( $node->hasAttribute( 'xml:base' ) ) { + $current_base = self::resolvePath( $base_path, $node->getAttribute( 'xml:base' ) ); } - - return $suites; - } - - /** - * Parse tests for a specific test suite - */ - private static function parseTestSuite($suite) { - $base_path = rtrim(self::$test_suite_path . '/' . $suite['base_path'], '/'); - $test_cases = []; - - // Look for test definition files in the base path - if (is_dir($base_path)) { - $files = glob($base_path . '/*.xml'); - foreach ($files as $file) { - if (basename($file) !== 'xmlconf.xml') { - $suite_test_cases = self::parseTestFile($file, $base_path); - $test_cases = array_merge($test_cases, $suite_test_cases); - } + + if ( 'TEST' === $node->nodeName ) { + $uri = $node->getAttribute( 'URI' ); + if ( '' === $uri ) { + return; } - } - - return $test_cases; - } - - /** - * Parse a single test definition file - */ - private static function parseTestFile($test_file, $base_path) { - $content = file_get_contents($test_file); - $test_cases = []; - - // Parse TEST elements using regex - $pattern = '/]+)>(.*?)<\/TEST>/s'; - if (preg_match_all($pattern, $content, $matches, PREG_SET_ORDER)) { - foreach ($matches as $match) { - $attributes = self::parseAttributes($match[1]); - $description = trim(strip_tags($match[2])); - - if (isset($attributes['URI']) && isset($attributes['ID']) && isset($attributes['TYPE'])) { - $test_file_path = $base_path . '/' . $attributes['URI']; - - // Only include tests that have actual test files - if (file_exists($test_file_path)) { - $test_cases[$attributes['ID']] = [ - $attributes['ID'], // test_id - $attributes['TYPE'], // test_type - $test_file_path, // test_file - $description // description - ]; - } - } + + $test_file = self::resolvePath( $current_base, $uri ); + if ( ! is_file( $test_file ) ) { + return; + } + + $test_id = $node->getAttribute( 'ID' ); + if ( '' === $test_id ) { + $test_id = $uri; } + + $type = strtolower( $node->getAttribute( 'TYPE' ) ); + if ( '' === $type ) { + $type = 'valid'; + } + + $description = trim( preg_replace( '/\s+/', ' ', $node->textContent ) ); + + $test_cases[ $test_id ] = array( + $test_id, + $type, + $test_file, + $description, + ); + + return; + } + + foreach ( $node->childNodes as $child ) { + self::collectTestCases( $child, $current_base, $test_cases ); } - - return $test_cases; } - - /** - * Parse XML attributes from a string - */ - private static function parseAttributes($attr_string) { - $attributes = []; - $pattern = '/(\w+)="([^"]*)"|\s+(\w+)=\'([^\']*)\'/'; - - if (preg_match_all($pattern, $attr_string, $matches, PREG_SET_ORDER)) { - foreach ($matches as $match) { - if (!empty($match[1])) { - $attributes[$match[1]] = $match[2]; - } elseif (!empty($match[3])) { - $attributes[$match[3]] = $match[4]; - } - } + + private static function resolvePath( $base_path, $relative_path ) { + if ( '' === $relative_path ) { + return $base_path; } - - return $attributes; + + // If it's an absolute path, use it directly + if ( $relative_path[0] === '/' || preg_match( '#^[a-zA-Z]:#', $relative_path ) ) { + return $relative_path; + } + + // Otherwise concatenate and let realpath() normalize it + $candidate = rtrim( $base_path, DIRECTORY_SEPARATOR ) . DIRECTORY_SEPARATOR . $relative_path; + $resolved = realpath( $candidate ); + + // If realpath fails (file doesn't exist), return the candidate anyway + // We check is_file() later, so non-existent paths will be skipped + return false !== $resolved ? $resolved : $candidate; } -} \ No newline at end of file +} diff --git a/components/XML/Tests/XMLProcessorTest.php b/components/XML/Tests/XMLProcessorTest.php index 8e586518..c38a254c 100644 --- a/components/XML/Tests/XMLProcessorTest.php +++ b/components/XML/Tests/XMLProcessorTest.php @@ -480,7 +480,9 @@ public function test_to_string_returns_updated_xml() { * @covers XMLProcessor::get_updated_xml */ public function test_get_updated_xml_applies_the_updates_so_far_and_keeps_the_processor_on_the_current_tag() { - $processor = XMLProcessor::create_from_string( 'Test' ); + $processor = XMLProcessor::create_from_string( 'Test' ); + $processor->next_tag(); + $processor->next_tag(); $processor->remove_attribute( '', 'id' ); @@ -488,7 +490,7 @@ public function test_get_updated_xml_applies_the_updates_so_far_and_keeps_the_pr $processor->set_attribute( '', 'id', 'content-id-1' ); $this->assertSame( - 'Test', + 'Test', $processor->get_updated_xml(), 'Calling get_updated_xml after updating the attributes of the second tag returned different XML than expected' ); @@ -496,7 +498,7 @@ public function test_get_updated_xml_applies_the_updates_so_far_and_keeps_the_pr $processor->set_attribute( '', 'id', 'content-id-2' ); $this->assertSame( - 'Test', + 'Test', $processor->get_updated_xml(), 'Calling get_updated_xml after updating the attributes of the second tag for the second time returned different XML than expected' ); @@ -505,7 +507,7 @@ public function test_get_updated_xml_applies_the_updates_so_far_and_keeps_the_pr $processor->remove_attribute( '', 'id' ); $this->assertSame( - 'Test', + 'Test', $processor->get_updated_xml(), 'Calling get_updated_xml after removing the id attribute of the third tag returned different XML than expected' ); @@ -1243,7 +1245,67 @@ public static function data_incomplete_syntax_elements() { 'Incomplete CDATA' => array( ' array( ' array( ' array( ' array( ' array( ' array( " array( ' array( ' array( ' array( ' array( ' array( ' array( ' array( ' array( ' array( ' array( ' array( ' array( ' array( ' array( ' array( ' array( ' array( ' array( 'text' ), + ); + } + + public function test_stream_parsing_of_incomplete_doctypes() { + $XML = 'text]]> + text]]> + + + + + + "> + + + + ]]> + + ]]> + ]> + '; + $processor = XMLProcessor::create_for_streaming( '' ); + + // Append one byte at a time, keep trying to advance, and confirm the + // parser does not emit an error at any point. + for($i = 0; $i < strlen($XML); $i++) { + $processor->append_bytes( $XML[$i] ); + $processor->next_token(); + $this->assertNull( $processor->get_exception() ); + $this->assertNull( $processor->get_last_error() ); + } + $processor->append_bytes( '' ); + $this->assertTrue( $processor->next_tag(), 'Did not find the root node.' ); + $this->assertEquals( 'root', $processor->get_tag_local_name(), 'Did not find text node.' ); + $this->assertFalse( $processor->next_token(), 'Found text node when there was none.' ); + $this->assertNull( $processor->get_exception() ); } /** @@ -1717,6 +1779,27 @@ public function test_detects_invalid_document_no_root_tag() { $this->assertFalse( $processor->next_tag(), 'Found an element when there was none.' ); $this->assertTrue( $processor->is_paused_at_incomplete_input(), 'Did not indicate that the XML input was incomplete.' ); } + + /** + * + * @covers XMLProcessor::next_tag + */ + public function test_tolerates_illegal_extender_in_pi_target() { + $processor = XMLProcessor::create_from_string( + ' + +]> + +' + ); + $this->assertTrue( $processor->next_tag(), 'Found an element when there was none.' ); + $this->assertEquals( 'animal', $processor->get_tag_local_name(), 'Did not find the expected tag.' ); + $this->assertTrue( $processor->next_token(), 'Found an element when there was none.' ); + $this->assertFalse( $processor->next_token(), 'Found an element when there was none.' ); + $this->assertNull( $processor->get_last_error(), 'Did not find the expected error.' ); + $this->assertNull( $processor->get_exception(), 'Did not find the expected error.' ); + } /** * @@ -2615,22 +2698,54 @@ public static function data_reserved_namespace_declarations() { ); } - public function test_preserves_whitespace_with_xml_space_attribute() { + public function test_skips_over_doctypes_atts_and_conditional_sections() { $xml = << - line1 - line2 - + + + + + "> + + + + ]]> + + ]]> + ]> + + + Test + + +

Example

+ + XML; $processor = XMLProcessor::create_from_string( $xml ); - $processor->next_tag( 'root' ); + $this->assertTrue( $processor->next_token(), 'Did not find DOCTYPE node.' ); + $this->assertEquals( '#doctype', $processor->get_token_type(), 'Did not find DOCTYPE node.' ); + $this->assertEquals( 'html', $processor->get_doctype_name(), 'Did not find DOCTYPEName.' ); + + $this->assertTrue( $processor->next_token(), 'Did not find root tag.' ); + $this->assertEquals( 'html', $processor->get_tag_local_name(), 'Did not find root tag.' ); + + $this->assertTrue( $processor->next_tag(), 'Did not find head tag.' ); + $this->assertEquals( 'head', $processor->get_tag_local_name(), 'Did not find head tag.' ); + + $this->assertTrue( $processor->next_tag(), 'Did not find title tag.' ); + $this->assertEquals( 'title', $processor->get_tag_local_name(), 'Did not find title tag.' ); + + $this->assertTrue( $processor->next_tag(), 'Did not find body tag.' ); + $this->assertEquals( 'body', $processor->get_tag_local_name(), 'Did not find body tag.' ); - $this->assertTrue( $processor->next_token(), 'Did not find first text node.' ); - $this->assertEquals( "\n line1\n ", $processor->get_modifiable_text() ); + $this->assertTrue( $processor->next_tag(), 'Did not find p tag.' ); + $this->assertEquals( 'p', $processor->get_tag_local_name(), 'Did not find p tag.' ); - $processor->next_tag( 'child' ); - $this->assertTrue( $processor->next_token(), 'Did not find second text node.' ); - $this->assertEquals( ' line2 ', $processor->get_modifiable_text() ); + $this->assertTrue( $processor->next_token(), 'Did not find example text.' ); + $this->assertEquals( 'Example', $processor->get_modifiable_text(), 'Did not find example text.' ); } public function test_handles_various_whitespace_between_attributes() { diff --git a/components/XML/class-xmlprocessor.php b/components/XML/class-xmlprocessor.php index c2998a05..772cb7b5 100644 --- a/components/XML/class-xmlprocessor.php +++ b/components/XML/class-xmlprocessor.php @@ -22,7 +22,7 @@ * – Well-formed * – UTF-8 encoded * – Not standalone (so can use external entities) - * – No DTD, DOCTYPE, ATTLIST, ENTITY, or conditional sections (will fail on them) + * – No external DTD subset expansion (external entities may exist but are not fetched). * * XML 1.1 is explicitly not a design goal here. Version 1.1 is * more complex specification and not so widely supported. @@ -34,17 +34,9 @@ * which is a more complex specification and not so widely supported. * * @TODO: Include the cursor string in internal bookmarks and use it for seeking. - * * @TODO: Track specific error states, expose informative messages, line * numbers, indexes, and other debugging info. * - * @TODO: Skip over the following syntax elements instead of failing: - * * xml, $pubid_char, $at + 1 ); @@ -1923,6 +1915,12 @@ private function parse_next_tag() { // Skip whitespace. $at += strspn( $this->xml, " \t\f\r\n", $at ); + if ( $doc_length <= $at ) { + $this->mark_incomplete_input( 'Unclosed SYSTEM literal.' ); + + return false; + } + // Parse the SystemLiteral token. $quoted_string_length = $this->parse_quoted_string( $at ); if ( self::STATE_INCOMPLETE_INPUT === $this->parser_state ) { @@ -1938,13 +1936,45 @@ private function parse_next_tag() { $quoted_string_length - 2 ); $at += $quoted_string_length; - } elseif ( '[' === $this->xml[ $at ] ) { - $this->bail( 'Inline entity declarations are not yet supported in DOCTYPE declarations.', self::ERROR_SYNTAX ); + } else { + $chars = strspn( $this->xml, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ', $at ); + if ( $chars === $doc_length - $at ) { + // The document ends with something like: + // mark_incomplete_input( 'Unclosed DOCTYPE declaration.' ); + + return false; + } } // Skip whitespace. $at += strspn( $this->xml, " \t\f\r\n", $at ); + if ( $doc_length <= $at ) { + $this->mark_incomplete_input( 'Unclosed DOCTYPE declaration.' ); + + return false; + } + + if ( '[' === $this->xml[ $at ] ) { + $new_at = $this->skip_doctype_internal_dtd_subset( $at ); + if ( false === $new_at ) { + return false; + } + + $at = $new_at; + + // Skip whitespace following the internal subset. + $at += strspn( $this->xml, " \t\f\r\n", $at ); + + if ( $doc_length <= $at ) { + $this->mark_incomplete_input( 'Unclosed DOCTYPE declaration.' ); + + return false; + } + } + if ( '>' !== $this->xml[ $at ] ) { $this->bail( sprintf( @@ -2316,6 +2346,498 @@ private function skip_whitespace() { $this->bytes_already_parsed += strspn( $this->xml, " \t\f\r\n", $this->bytes_already_parsed ); } + /** + * Skips over the internal subset of a DOCTYPE declaration: + * + * + * ^^^^^^^^^^^^^^^^^ + * this part + * + * @param int $offset Byte offset of the '[' that opens the subset. + * @return int|false Updated offset pointing right after the closing ']', or false on failure. + */ + private function skip_doctype_internal_dtd_subset( $offset ) { + $doc_length = strlen( $this->xml ); + + // Consume the opening '['. + ++$offset; + + while ( $offset < $doc_length ) { + $offset += strspn( $this->xml, " \t\f\r\n", $offset ); + + if ( $offset >= $doc_length ) { + $this->mark_incomplete_input( 'Unclosed DOCTYPE declaration.' ); + + return false; + } + + if ( ']' === $this->xml[ $offset ] ) { + ++$offset; + + return $offset; + } + + if ( '%' === $this->xml[ $offset ] ) { + $offset = $this->skip_dtd_parameter_entity_reference( $offset ); + if ( false === $offset ) { + return false; + } + + continue; + } + + if ( '<' === $this->xml[ $offset ] ) { + $offset = $this->skip_dtd_markup( $offset ); + if ( false === $offset ) { + return false; + } + + continue; + } + + $this->bail( + sprintf( + 'Unexpected character "%s" in DOCTYPE internal subset at position %d.', + $this->xml[ $offset ], + $offset + ), + self::ERROR_SYNTAX + ); + } + + $this->mark_incomplete_input( 'Unclosed DOCTYPE declaration.' ); + + return false; + } + + /** + * Skips a single markup declaration, comment, processing instruction, or conditional section. + * + * ]> + * ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + * this entire part + * + * @param int $offset Byte offset pointing to the '<' that begins the markup. + * @return int|false Updated offset on success, false on failure. + */ + private function skip_dtd_markup( $offset ) { + $doc_length = strlen( $this->xml ); + + if ( $offset + 1 >= $doc_length ) { + $this->mark_incomplete_input( 'Unclosed markup in DOCTYPE declaration.' ); + + return false; + } + + $next = $this->xml[ $offset + 1 ]; + + if ( '?' === $next ) { + $closer = strpos( $this->xml, '?>', $offset + 2 ); + if ( false === $closer ) { + $this->mark_incomplete_input( 'Unclosed processing instruction in DOCTYPE declaration.' ); + + return false; + } + + $offset = $closer + 2; + return $offset; + } + + if ( '!' !== $next ) { + $this->bail( 'Unsupported markup inside DOCTYPE declaration.', self::ERROR_SYNTAX ); + } + + if ( $offset + 3 < $doc_length && '-' === $this->xml[ $offset + 2 ] && '-' === $this->xml[ $offset + 3 ] ) { + $closer = strpos( $this->xml, '-->', $offset + 4 ); + if ( false === $closer ) { + $this->mark_incomplete_input( 'Unclosed comment in DOCTYPE declaration.' ); + + return false; + } + + $offset = $closer + 3; + + return $offset; + } + + if ( $offset + 2 < $doc_length && '[' === $this->xml[ $offset + 2 ] ) { + $offset += 3; + + return $this->skip_conditional_section( $offset ); + } + + $offset += 2; + + return $this->skip_markup_declaration( $offset ); + } + + /** + * Skips over a conditional section, including any nested sections it may contain. + * + * ]]> + * ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + * this entire section + * + * @see https://www.w3.org/TR/xml/#sec-condition-sect + * + * @param int $offset Byte offset immediately after the 'xml ); + $section_type = 'UNKNOWN'; + + $offset += strspn( $this->xml, " \t\f\r\n", $offset ); + + if ( $offset >= $doc_length ) { + $this->mark_incomplete_input( 'Unclosed conditional section in DOCTYPE declaration.' ); + + return false; + } + + if ( '%' === $this->xml[ $offset ] ) { + $offset = $this->skip_dtd_parameter_entity_reference( $offset ); + if ( false === $offset ) { + return false; + } + } else { + $keyword_length = $this->parse_name( $offset ); + if ( 0 === $keyword_length ) { + $this->bail( 'Invalid conditional section declaration.', self::ERROR_SYNTAX ); + } + + if ( 0 === substr_compare( $this->xml, 'INCLUDE', $offset, min( strlen( 'INCLUDE' ), $keyword_length ), true ) ) { + if ( $keyword_length < strlen( 'INCLUDE' ) && $offset + $keyword_length >= $doc_length ) { + $this->mark_incomplete_input( 'Unfinished conditional section keyword.' ); + + return false; + } + $section_type = 'INCLUDE'; + } elseif ( 0 === substr_compare( $this->xml, 'IGNORE', $offset, min( strlen( 'IGNORE' ), $keyword_length ), true ) ) { + if ( $keyword_length < strlen( 'IGNORE' ) && $offset + $keyword_length >= $doc_length ) { + $this->mark_incomplete_input( 'Unfinished conditional section keyword.' ); + + return false; + } + $section_type = 'IGNORE'; + } else { + $this->bail( 'Unsupported conditional section keyword.', self::ERROR_SYNTAX ); + } + + $offset += $keyword_length; + } + + $offset += strspn( $this->xml, " \t\f\r\n", $offset ); + + if ( $offset >= $doc_length ) { + $this->mark_incomplete_input( 'Unclosed conditional section in DOCTYPE declaration.' ); + + return false; + } + + if ( '[' !== $this->xml[ $offset ] ) { + $this->bail( 'Conditional section missing "[" opener.', self::ERROR_SYNTAX ); + } + + ++$offset; + + $offset = $this->skip_conditional_section_body( $offset, $section_type ); + if ( false === $offset ) { + return false; + } + + if ( $offset + 2 >= $doc_length ) { + $this->mark_incomplete_input( 'Unclosed conditional section in DOCTYPE declaration.' ); + + return false; + } + + if ( ']' !== $this->xml[ $offset ] || ']' !== $this->xml[ $offset + 1 ] || '>' !== $this->xml[ $offset + 2 ] ) { + $this->bail( 'Invalid conditional section closer.', self::ERROR_SYNTAX ); + } + + $offset += 3; + + return $offset; + } + + /** + * Scans the contents of a conditional section until the matching "]]>" sequence. + * + * ]]> + * ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + * this entire part + * + * @see https://www.w3.org/TR/xml/#sec-condition-sect + * @param int $offset Byte offset immediately after the content opener '['. + * @param string $section_type Either 'INCLUDE', 'IGNORE', or 'UNKNOWN'. + * @return int|false Updated offset on success, false on failure. + */ + private function skip_conditional_section_body( $offset, $section_type ) { + $doc_length = strlen( $this->xml ); + + while ( $offset < $doc_length ) { + if ( + $offset + 2 < $doc_length && + ']' === $this->xml[ $offset ] && + ']' === $this->xml[ $offset + 1 ] && + '>' === $this->xml[ $offset + 2 ] + ) { + return $offset; + } + + $char = $this->xml[ $offset ]; + + if ( '"' === $char || "'" === $char ) { + $length = $this->parse_quoted_string( $offset ); + if ( false === $length ) { + return false; + } + + $offset += $length; + continue; + } + + if ( '%' === $char ) { + $offset = $this->skip_dtd_parameter_entity_reference( $offset ); + if ( false === $offset ) { + return false; + } + + continue; + } + + if ( '<' === $char ) { + if ( $offset + 1 >= $doc_length ) { + $this->mark_incomplete_input( 'Unclosed conditional section in DOCTYPE declaration.' ); + + return false; + } + + if ( + '!' === $this->xml[ $offset + 1 ] && + $offset + 2 < $doc_length && + '[' === $this->xml[ $offset + 2 ] + ) { + $offset += 3; + + $offset = $this->skip_conditional_section( $offset ); + if ( false === $offset ) { + return false; + } + + continue; + } + + if ( + '!' === $this->xml[ $offset + 1 ] && + $offset + 3 < $doc_length && + '-' === $this->xml[ $offset + 2 ] && + '-' === $this->xml[ $offset + 3 ] + ) { + $closer = strpos( $this->xml, '-->', $offset + 4 ); + if ( false === $closer ) { + $this->mark_incomplete_input( 'Unclosed comment in DOCTYPE declaration.' ); + + return false; + } + + $offset = $closer + 3; + continue; + } + + if ( 'INCLUDE' === $section_type ) { + if ( '!' === $this->xml[ $offset + 1 ] ) { + $offset += 2; + + $offset = $this->skip_markup_declaration( $offset ); + if ( false === $offset ) { + return false; + } + + continue; + } + + if ( '?' === $this->xml[ $offset + 1 ] ) { + $closer = strpos( $this->xml, '?>', $offset + 2 ); + if ( false === $closer ) { + $this->mark_incomplete_input( 'Unclosed processing instruction in DOCTYPE declaration.' ); + + return false; + } + + $offset = $closer + 2; + continue; + } + } + } + + ++$offset; + } + + $this->mark_incomplete_input( 'Unclosed conditional section in DOCTYPE declaration.' ); + + return false; + } + + /** + * Skips the following markup declarations following a ' + * ^^^^^^^^^^^^^^^^^^^^^^ + * this entire part + * + * Supported markup declarations: + * - + * - + * - + * - + * + * @param int $offset Byte offset immediately after 'parse_name( $offset ); + + if ( 0 === $keyword_length ) { + if ( $offset >= strlen( $this->xml ) ) { + $this->mark_incomplete_input( 'Unfinished markup declaration keyword.' ); + + return false; + } + $this->bail( 'Malformed markup declaration in DOCTYPE internal subset.', self::ERROR_SYNTAX ); + } + + static $supported_keywords = null; + if ( null === $supported_keywords ) { + $supported_keywords = array( 'ELEMENT', 'ATTLIST', 'ENTITY', 'NOTATION' ); + } + + foreach ( $supported_keywords as $keyword ) { + if ( 0 === substr_compare( $this->xml, $keyword, $offset, min( strlen( $keyword ), $keyword_length ), true ) ) { + if ( $keyword_length < strlen( $keyword ) && $offset + $keyword_length >= strlen( $this->xml ) ) { + $this->mark_incomplete_input( 'Unfinished markup declaration keyword.' ); + + return false; + } + } + } + $offset += $keyword_length; + + return $this->skip_markup_declaration_body( $offset ); + } + + /** + * Scans a markup declaration until its closing '>'. + * + * + * ^^^^^^^^^^^^^^ + * this part + * + * @see https://www.w3.org/TR/xml/#dt-markupdecl + * @param int $offset Byte offset immediately after the markup keyword (e.g. 'ELEMENT', 'ATTLIST', 'ENTITY', 'NOTATION'). + * @return int|false Updated offset on success, false on failure. + */ + private function skip_markup_declaration_body( $offset ) { + $doc_length = strlen( $this->xml ); + + while ( $offset < $doc_length ) { + $char = $this->xml[ $offset ]; + + if ( '"' === $char || "'" === $char ) { + $length = $this->parse_quoted_string( $offset ); + if ( false === $length ) { + return false; + } + + $offset += $length; + continue; + } + + if ( '%' === $char ) { + $offset = $this->skip_dtd_parameter_entity_reference( $offset ); + if ( false === $offset ) { + return false; + } + + continue; + } + + if ( '>' === $char ) { + ++$offset; + + return $offset; + } + + if ( '<' === $char ) { + $this->bail( 'Unexpected "<" inside DOCTYPE markup declaration.', self::ERROR_SYNTAX ); + } + + ++$offset; + } + + $this->mark_incomplete_input( 'Unclosed markup declaration in DOCTYPE declaration.' ); + + return false; + } + + /** + * Skips over a **parameter entity reference** beginning at $offset. + * $offset must point to the initial '%' byte of the reference. + * + * + * ^^^^^^^^^ + * this part + * + * @see https://www.w3.org/TR/xml/#dt-PERef + * @param int $offset Byte offset at the '%'. + * @return int|false Updated offset on success, false on failure. + */ + private function skip_dtd_parameter_entity_reference( $offset ) { + $doc_length = strlen( $this->xml ); + + if ( '%' !== $this->xml[ $offset ] ) { + $this->bail( 'Parameter entity reference must start with "%".', self::ERROR_SYNTAX ); + } + + ++$offset; + $offset_before_name = $offset; + $offset += strspn( $this->xml, " \t\f\r\n", $offset ); + $had_whitespace = ( $offset !== $offset_before_name ); + + $name_length = $this->parse_name( $offset ); + if ( 0 === $name_length ) { + if ( $offset >= $doc_length ) { + $this->mark_incomplete_input( 'Unterminated parameter entity reference in DOCTYPE declaration.' ); + + return false; + } + + $this->bail( 'Invalid parameter entity reference in DOCTYPE declaration.', self::ERROR_SYNTAX ); + } + + $offset += $name_length; + + if ( $had_whitespace ) { + // Parameter entity declaration (e.g. ""). + return $offset; + } + + if ( $offset >= $doc_length ) { + $this->mark_incomplete_input( 'Unterminated parameter entity reference in DOCTYPE declaration.' ); + + return false; + } + + if ( ';' !== $this->xml[ $offset ] ) { + $this->bail( 'Parameter entity references must end with a semicolon.', self::ERROR_SYNTAX ); + } + + ++$offset; + + return $offset; + } + /** * Parses a Name token starting at $offset * @@ -3705,6 +4227,11 @@ private function step( $node_to_process = self::PROCESS_NEXT_NODE ) { if ( self::PROCESS_NEXT_NODE === $node_to_process ) { if ( $this->is_empty_element() ) { array_pop( $this->stack_of_open_elements ); + if ( empty( $this->stack_of_open_elements ) ) { + // We've just popped the root element – the context + // becomes "misc" by definition. + $this->parser_context = self::IN_MISC_CONTEXT; + } } }