@@ -5,7 +5,7 @@ use super::html::layout::TextAlign;
55use super :: html:: layout:: { DrawCommand , DrawState , ImageCommand , RootData , TextCommand } ;
66use super :: html:: layout:: { LoopContext , StyleData } ;
77use super :: html:: style:: StyleSheet ;
8- use super :: html:: xml:: XmlParser ;
8+ use super :: html:: xml:: parse_html5 ;
99use super :: pdf:: PdfOpener ;
1010use crate :: document:: { BoundedText , Document , Location , TextLocation , TocEntry , chapter_from_uri} ;
1111use crate :: framebuffer:: Pixmap ;
@@ -233,7 +233,7 @@ impl<R: Read + Seek> EpubDocument<R> {
233233 let mut zf = self . archive . by_name ( name) . ok ( ) ?;
234234 zf. read_to_string ( & mut text) . ok ( ) ?;
235235 }
236- let root = XmlParser :: new ( & text) . parse ( ) ;
236+ let root = parse_html5 ( & text) ;
237237 self . cache_uris ( root. root ( ) , name, start_offset, cache) ;
238238 cache. get ( uri) . cloned ( )
239239 } else {
@@ -273,7 +273,7 @@ impl<R: Read + Seek> EpubDocument<R> {
273273 }
274274 }
275275
276- let mut root = XmlParser :: new ( & text) . parse ( ) ;
276+ let mut root = parse_html5 ( & text) ;
277277 root. wrap_lost_inlines ( ) ;
278278
279279 let mut stylesheet = StyleSheet :: new ( ) ;
@@ -919,11 +919,281 @@ impl<R: Read + Seek> Document for EpubDocument<R> {
919919#[ cfg( test) ]
920920mod tests {
921921 use super :: * ;
922+ use crate :: document:: html:: dom:: XmlTree ;
922923 use crate :: document:: html:: layout:: DrawCommand ;
924+ use crate :: document:: html:: xml:: XmlParser ;
925+ use opf:: OpfDocument ;
923926 use std:: io:: Write ;
924927 use std:: path:: PathBuf ;
925928 use zip:: write:: SimpleFileOptions ;
926929
930+ /// Minimal EPUB chapter that resembles a real spine file: XML declaration,
931+ /// DOCTYPE, explicit html/head/body, and three paragraphs with `id` attributes
932+ /// (needed for `cache_uris` and `DrawCommand::Marker`); two of them hold an inline child (`<em>`, `<span>`).
/// Fed to both `XmlParser` and `parse_html5` by the id-offset and
/// `cache_uris` comparison tests in this module.
933+ const CHAPTER_HTML : & str = concat ! (
934+ "<?xml version=\" 1.0\" encoding=\" UTF-8\" ?>\n " ,
935+ "<!DOCTYPE html PUBLIC \" -//W3C//DTD XHTML 1.1//EN\" \" \" >\n " ,
936+ "<html xmlns=\" http://www.w3.org/1999/xhtml\" >" ,
937+ "<head><title>Test</title></head>" ,
938+ "<body>" ,
939+ "<p id=\" s1\" >First paragraph.</p>" ,
940+ "<p id=\" s2\" >Second <em>emphasis</em> paragraph.</p>" ,
941+ "<p id=\" s3\" >Third paragraph with <span>inline</span> content.</p>" ,
942+ "</body></html>" ,
943+ ) ;
944+
945+ /// Variant of `CHAPTER_HTML` made only of nested, empty `<div>` elements that
946+ /// each carry an `id`; it contains no text nodes. Used by the display-list Marker test because, per the
947+ /// original author's note, the engine's inline-text layout path requires loaded fonts, whereas the
948+ /// block path that emits `DrawCommand::Marker` does not (NOTE(review): not verifiable from this chunk).
949+ const CHAPTER_HTML_BLOCK_ONLY : & str = concat ! (
950+ "<?xml version=\" 1.0\" encoding=\" UTF-8\" ?>\n " ,
951+ "<!DOCTYPE html PUBLIC \" -//W3C//DTD XHTML 1.1//EN\" \" \" >\n " ,
952+ "<html xmlns=\" http://www.w3.org/1999/xhtml\" >" ,
953+ "<head></head>" ,
954+ "<body>" ,
955+ "<div id=\" s1\" ><div id=\" s1a\" ><div id=\" s1b\" ></div></div></div>" ,
956+ "<div id=\" s2\" ><div id=\" s2a\" ></div></div>" ,
957+ "<div id=\" s3\" ></div>" ,
958+ "</body></html>" ,
959+ ) ;
960+
961+ /// Collect `(tag_name, id_attr_value, byte_offset)` for every element that
962+ /// has an `id` attribute, in document order. Used to compare bookmark /
963+ /// annotation anchor points between parsers.
964+ fn collect_id_offsets ( tree : & XmlTree ) -> Vec < ( String , String , usize ) > {
965+ tree. root ( )
966+ . descendants ( )
967+ . filter_map ( |n| {
968+ let tag = n. tag_name ( ) ?;
969+ let id = n. attribute ( "id" ) ?;
970+ Some ( ( tag. to_string ( ) , id. to_string ( ) , n. offset ( ) ) )
971+ } )
972+ . collect ( )
973+ }
974+
975+ /// Collect all `DrawCommand::Marker` offsets from a flat display list, in
976+ /// order. Marker offsets are exactly what gets stored as reading positions
977+ /// and bookmark targets.
978+ fn collect_marker_offsets ( pages : & [ Page ] ) -> Vec < usize > {
979+ pages
980+ . iter ( )
981+ . flatten ( )
982+ . filter_map ( |cmd| match cmd {
983+ DrawCommand :: Marker ( offset) => Some ( * offset) ,
984+ _ => None ,
985+ } )
986+ . collect ( )
987+ }
988+
989+ /// Build an in-memory EPUB zip containing a single spine chapter and
990+ /// return the finished archive as a `Vec<u8>`.
///
/// Three entries are written, in this order: `META-INF/container.xml`
/// (pointing at `OEBPS/content.opf`), `OEBPS/chapter.xhtml` (the bytes of
/// `chapter_html`), and `OEBPS/content.opf` (one manifest item and one
/// spine itemref for the chapter).
///
/// # Panics
///
/// Panics if any zip operation fails, since every step is `unwrap`ped.
991+ fn build_minimal_epub ( chapter_html : & str ) -> Vec < u8 > {
992+ let buf = Vec :: new ( ) ;
993+ let cursor = std:: io:: Cursor :: new ( buf) ;
994+ let mut zip = zip:: ZipWriter :: new ( cursor) ;
995+ let opts = SimpleFileOptions :: default ( ) ;
996+
// Container descriptor: names the package document via `full-path`.
997+ zip. start_file ( "META-INF/container.xml" , opts) . unwrap ( ) ;
998+ zip. write_all (
999+ br#"<?xml version="1.0"?>
1000+ <container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
1001+ <rootfiles>
1002+ <rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/>
1003+ </rootfiles>
1004+ </container>"# ,
1005+ )
1006+ . unwrap ( ) ;
1007+
// The chapter itself, stored verbatim under the path the manifest refers to.
1008+ let chapter_bytes = chapter_html. as_bytes ( ) ;
1009+ zip. start_file ( "OEBPS/chapter.xhtml" , opts) . unwrap ( ) ;
1010+ zip. write_all ( chapter_bytes) . unwrap ( ) ;
1011+
// Package document: a single manifest item (`ch1`) referenced by the spine.
1012+ let opf = r#"<?xml version="1.0"?>
1013+ <package xmlns="http://www.idpf.org/2007/opf" version="2.0">
1014+ <metadata/>
1015+ <manifest>
1016+ <item id="ch1" href="chapter.xhtml" media-type="application/xhtml+xml"/>
1017+ </manifest>
1018+ <spine>
1019+ <itemref idref="ch1"/>
1020+ </spine>
1021+ </package>"# ;
1022+ zip. start_file ( "OEBPS/content.opf" , opts) . unwrap ( ) ;
1023+ zip. write_all ( opf. as_bytes ( ) ) . unwrap ( ) ;
1024+
// `finish` hands back the cursor; `into_inner` unwraps it to the raw bytes.
1025+ zip. finish ( ) . unwrap ( ) . into_inner ( )
1026+ }
1027+
/// Verify that `parse_html5` and `XmlParser` report identical
/// `(tag, id, offset)` triples for every element carrying an `id` attribute
/// in `CHAPTER_HTML`.
///
/// Both trees go through `wrap_lost_inlines` before collection, matching
/// what the chapter-loading code does after parsing.
///
/// The test first checks that the reference parser found at least one
/// id-bearing element. Without that guard, two parsers that both produced
/// an empty tree would compare equal and the test would pass vacuously.
#[test]
fn epub_spine_chapter_id_offsets_match_between_parsers() {
    let xml_offsets = {
        let mut tree = XmlParser::new(CHAPTER_HTML).parse();
        tree.wrap_lost_inlines();
        collect_id_offsets(&tree)
    };

    let h5_offsets = {
        let mut tree = parse_html5(CHAPTER_HTML);
        tree.wrap_lost_inlines();
        collect_id_offsets(&tree)
    };

    // Guard against a vacuous pass: equality of two empty lists proves nothing.
    assert!(
        !xml_offsets.is_empty(),
        "no id-bearing elements collected — check CHAPTER_HTML"
    );
    assert_eq!(
        xml_offsets, h5_offsets,
        "id-attribute node offsets differ between XmlParser and parse_html5\n \
         XmlParser: {xml_offsets:?}\n \
         html5ever: {h5_offsets:?}"
    );
}
1053+
/// Verify that `cache_uris` fills the URI cache identically whether the
/// chapter tree came from `XmlParser` or from `parse_html5`.
///
/// Each parser's tree is fed to a freshly built throwaway `EpubDocument`
/// backed by an in-memory archive, and the two resulting caches are compared.
#[test]
fn epub_spine_chapter_cache_uris_match_between_parsers() {
    /// Runs `cache_uris` over `tree` on a throwaway document and returns
    /// the cache it filled.
    fn uri_cache_for(tree: XmlTree, name: &str, start_offset: usize) -> UriCache {
        let mut cache = UriCache::default();
        let archive_bytes = build_minimal_epub(CHAPTER_HTML);
        let mut dummy_doc: EpubDocument<std::io::Cursor<Vec<u8>>> = EpubDocument {
            archive: ZipArchive::new(std::io::Cursor::new(archive_bytes)).unwrap(),
            info: OpfDocument::empty(),
            parent: PathBuf::default(),
            engine: Engine::new(),
            spine: vec![Chunk {
                path: name.to_string(),
                size: CHAPTER_HTML.len(),
            }],
            cache: FxHashMap::default(),
            ignore_document_css: false,
        };
        dummy_doc.cache_uris(tree.root(), name, start_offset, &mut cache);
        cache
    }

    let name = "OEBPS/chapter.xhtml";
    let start_offset: usize = 0;

    let xml_cache = uri_cache_for(XmlParser::new(CHAPTER_HTML).parse(), name, start_offset);
    let h5_cache = uri_cache_for(parse_html5(CHAPTER_HTML), name, start_offset);

    assert_eq!(
        xml_cache, h5_cache,
        "cache_uris maps differ between XmlParser and parse_html5\n \
         XmlParser: {xml_cache:?}\n \
         html5ever: {h5_cache:?}"
    );
}
1108+
/// Verify that the display list built from `CHAPTER_HTML_BLOCK_ONLY`
/// contains the same `DrawCommand::Marker` offsets whether the chapter was
/// parsed by `XmlParser` or by `parse_html5`.
///
/// Both trees are passed through `wrap_lost_inlines` and laid out with the
/// same non-zero `start_offset`. The test also requires that the reference
/// parser produced at least one marker, so an empty-vs-empty comparison
/// cannot pass silently.
#[test]
fn epub_spine_chapter_marker_offsets_match_between_parsers() {
    let start_offset: usize = 512;

    let mut xml_tree = XmlParser::new(CHAPTER_HTML_BLOCK_ONLY).parse();
    xml_tree.wrap_lost_inlines();
    let xml_markers = marker_offsets_from_tree(xml_tree, start_offset);

    let mut h5_tree = parse_html5(CHAPTER_HTML_BLOCK_ONLY);
    h5_tree.wrap_lost_inlines();
    let h5_markers = marker_offsets_from_tree(h5_tree, start_offset);

    assert!(
        !xml_markers.is_empty(),
        "no Marker commands produced — check id attributes"
    );
    assert_eq!(
        xml_markers, h5_markers,
        "Marker offsets differ between XmlParser and parse_html5\n \
         XmlParser: {xml_markers:?}\n \
         html5ever: {h5_markers:?}"
    );
}
1143+
1144+ /// Drive `Engine::build_display_list` directly for a pre-parsed tree and
1145+ /// collect all `DrawCommand::Marker` offsets. Uses a no-op resource
1146+ /// fetcher since the test chapter has no external assets.
1147+ fn marker_offsets_from_tree ( tree : XmlTree , start_offset : usize ) -> Vec < usize > {
1148+ struct NoopFetcher ;
1149+ impl ResourceFetcher for NoopFetcher {
1150+ fn fetch ( & mut self , _name : & str ) -> Result < Vec < u8 > , Error > {
1151+ Ok ( Vec :: new ( ) )
1152+ }
1153+ }
1154+
1155+ let mut engine = Engine :: new ( ) ;
1156+ engine. layout ( 600 , 800 , 12.0 , 265 ) ;
1157+
1158+ let rect = engine. rect ( ) ;
1159+ let mut draw_state = DrawState {
1160+ position : rect. min ,
1161+ ..Default :: default ( )
1162+ } ;
1163+ let root_data = RootData {
1164+ start_offset,
1165+ spine_dir : PathBuf :: default ( ) ,
1166+ rect,
1167+ } ;
1168+ let stylesheet = StyleSheet :: new ( ) ;
1169+ let style = StyleData {
1170+ font_size : engine. font_size ,
1171+ line_height : crate :: unit:: pt_to_px ( engine. line_height * engine. font_size , engine. dpi )
1172+ . round ( ) as i32 ,
1173+ text_align : engine. text_align ,
1174+ start_x : rect. min . x ,
1175+ end_x : rect. max . x ,
1176+ width : rect. max . x - rect. min . x ,
1177+ ..Default :: default ( )
1178+ } ;
1179+ let loop_context = LoopContext :: default ( ) ;
1180+ let mut pages: Vec < Page > = vec ! [ Vec :: new( ) ] ;
1181+
1182+ if let Some ( body) = tree. root ( ) . find ( "body" ) {
1183+ engine. build_display_list (
1184+ body,
1185+ & style,
1186+ & loop_context,
1187+ & stylesheet,
1188+ & root_data,
1189+ & mut NoopFetcher ,
1190+ & mut draw_state,
1191+ & mut pages,
1192+ ) ;
1193+ }
1194+
1195+ collect_marker_offsets ( & pages)
1196+ }
9271197 fn setup_epub ( ) -> EpubDocumentFile {
9281198 let root_dir = PathBuf :: from (
9291199 std:: env:: var ( "TEST_ROOT_DIR" ) . expect ( "TEST_ROOT_DIR must be set for epub tests" ) ,
0 commit comments