Skip to content

Commit 65cdaaa

Browse files
committed
feat(html): use html5ever source positions
1 parent 4d9e530 commit 65cdaaa

7 files changed

Lines changed: 674 additions & 162 deletions

File tree

crates/core/src/document/epub/mod.rs

Lines changed: 273 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ use super::html::layout::TextAlign;
55
use super::html::layout::{DrawCommand, DrawState, ImageCommand, RootData, TextCommand};
66
use super::html::layout::{LoopContext, StyleData};
77
use super::html::style::StyleSheet;
8-
use super::html::xml::XmlParser;
8+
use super::html::xml::parse_html5;
99
use super::pdf::PdfOpener;
1010
use crate::document::{BoundedText, Document, Location, TextLocation, TocEntry, chapter_from_uri};
1111
use crate::framebuffer::Pixmap;
@@ -233,7 +233,7 @@ impl<R: Read + Seek> EpubDocument<R> {
233233
let mut zf = self.archive.by_name(name).ok()?;
234234
zf.read_to_string(&mut text).ok()?;
235235
}
236-
let root = XmlParser::new(&text).parse();
236+
let root = parse_html5(&text);
237237
self.cache_uris(root.root(), name, start_offset, cache);
238238
cache.get(uri).cloned()
239239
} else {
@@ -273,7 +273,7 @@ impl<R: Read + Seek> EpubDocument<R> {
273273
}
274274
}
275275

276-
let mut root = XmlParser::new(&text).parse();
276+
let mut root = parse_html5(&text);
277277
root.wrap_lost_inlines();
278278

279279
let mut stylesheet = StyleSheet::new();
@@ -919,11 +919,281 @@ impl<R: Read + Seek> Document for EpubDocument<R> {
919919
#[cfg(test)]
920920
mod tests {
921921
use super::*;
922+
use crate::document::html::dom::XmlTree;
922923
use crate::document::html::layout::DrawCommand;
924+
use crate::document::html::xml::XmlParser;
925+
use opf::OpfDocument;
923926
use std::io::Write;
924927
use std::path::PathBuf;
925928
use zip::write::SimpleFileOptions;
926929

930+
/// Minimal EPUB chapter that resembles a real spine file: XML declaration,
931+
/// DOCTYPE, explicit html/head/body, paragraphs with `id` attributes
932+
/// (needed for `cache_uris` and `DrawCommand::Marker`), and inline `<em>`/`<span>` elements.
933+
const CHAPTER_HTML: &str = concat!(
934+
"<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n",
935+
"<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\" \"\">\n",
936+
"<html xmlns=\"http://www.w3.org/1999/xhtml\">",
937+
"<head><title>Test</title></head>",
938+
"<body>",
939+
"<p id=\"s1\">First paragraph.</p>",
940+
"<p id=\"s2\">Second <em>emphasis</em> paragraph.</p>",
941+
"<p id=\"s3\">Third paragraph with <span>inline</span> content.</p>",
942+
"</body></html>",
943+
);
944+
945+
/// Variant of `CHAPTER_HTML` containing only block-level structure with no
946+
/// inline text nodes. Used by the display-list Marker test because the
947+
/// engine's inline-text layout path requires loaded fonts, whereas the
948+
/// block path that emits `DrawCommand::Marker` does not.
949+
const CHAPTER_HTML_BLOCK_ONLY: &str = concat!(
950+
"<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n",
951+
"<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\" \"\">\n",
952+
"<html xmlns=\"http://www.w3.org/1999/xhtml\">",
953+
"<head></head>",
954+
"<body>",
955+
"<div id=\"s1\"><div id=\"s1a\"><div id=\"s1b\"></div></div></div>",
956+
"<div id=\"s2\"><div id=\"s2a\"></div></div>",
957+
"<div id=\"s3\"></div>",
958+
"</body></html>",
959+
);
960+
961+
/// Collect `(tag_name, id_attr_value, byte_offset)` for every element that
962+
/// has an `id` attribute, in document order. Used to compare bookmark /
963+
/// annotation anchor points between parsers.
964+
fn collect_id_offsets(tree: &XmlTree) -> Vec<(String, String, usize)> {
965+
tree.root()
966+
.descendants()
967+
.filter_map(|n| {
968+
let tag = n.tag_name()?;
969+
let id = n.attribute("id")?;
970+
Some((tag.to_string(), id.to_string(), n.offset()))
971+
})
972+
.collect()
973+
}
974+
975+
/// Collect all `DrawCommand::Marker` offsets from a paged display list, flattened in page
976+
/// order. Marker offsets are exactly what gets stored as reading positions
977+
/// and bookmark targets.
978+
fn collect_marker_offsets(pages: &[Page]) -> Vec<usize> {
979+
pages
980+
.iter()
981+
.flatten()
982+
.filter_map(|cmd| match cmd {
983+
DrawCommand::Marker(offset) => Some(*offset),
984+
_ => None,
985+
})
986+
.collect()
987+
}
988+
989+
/// Build an in-memory EPUB zip containing a single spine chapter and
990+
/// return it as a `Vec<u8>` suitable for `EpubDocument::from_archive`.
991+
fn build_minimal_epub(chapter_html: &str) -> Vec<u8> {
992+
let buf = Vec::new();
993+
let cursor = std::io::Cursor::new(buf);
994+
let mut zip = zip::ZipWriter::new(cursor);
995+
let opts = SimpleFileOptions::default();
996+
997+
zip.start_file("META-INF/container.xml", opts).unwrap();
998+
zip.write_all(
999+
br#"<?xml version="1.0"?>
1000+
<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
1001+
<rootfiles>
1002+
<rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/>
1003+
</rootfiles>
1004+
</container>"#,
1005+
)
1006+
.unwrap();
1007+
1008+
let chapter_bytes = chapter_html.as_bytes();
1009+
zip.start_file("OEBPS/chapter.xhtml", opts).unwrap();
1010+
zip.write_all(chapter_bytes).unwrap();
1011+
1012+
let opf = r#"<?xml version="1.0"?>
1013+
<package xmlns="http://www.idpf.org/2007/opf" version="2.0">
1014+
<metadata/>
1015+
<manifest>
1016+
<item id="ch1" href="chapter.xhtml" media-type="application/xhtml+xml"/>
1017+
</manifest>
1018+
<spine>
1019+
<itemref idref="ch1"/>
1020+
</spine>
1021+
</package>"#;
1022+
zip.start_file("OEBPS/content.opf", opts).unwrap();
1023+
zip.write_all(opf.as_bytes()).unwrap();
1024+
1025+
zip.finish().unwrap().into_inner()
1026+
}
1027+
1028+
/// Verify that `parse_html5` and `XmlParser` assign identical byte offsets
1029+
/// to every element that carries an `id` attribute in a realistic EPUB
1030+
/// chapter. These offsets are what gets stored as reading positions,
1031+
/// bookmark targets, and annotation anchors.
1032+
#[test]
1033+
fn epub_spine_chapter_id_offsets_match_between_parsers() {
1034+
let xml_offsets = {
1035+
let mut tree = XmlParser::new(CHAPTER_HTML).parse();
1036+
tree.wrap_lost_inlines();
1037+
collect_id_offsets(&tree)
1038+
};
1039+
1040+
let h5_offsets = {
1041+
let mut tree = parse_html5(CHAPTER_HTML);
1042+
tree.wrap_lost_inlines();
1043+
collect_id_offsets(&tree)
1044+
};
1045+
1046+
assert_eq!(
1047+
xml_offsets, h5_offsets,
1048+
"id-attribute node offsets differ between XmlParser and parse_html5\n\
1049+
XmlParser: {xml_offsets:?}\n\
1050+
html5ever: {h5_offsets:?}"
1051+
);
1052+
}
1053+
1054+
/// Verify that `cache_uris` (the `#anchor-id` → byte-offset map used for
1055+
/// in-book link resolution) produces identical mappings from both parsers.
1056+
#[test]
1057+
fn epub_spine_chapter_cache_uris_match_between_parsers() {
1058+
let name = "OEBPS/chapter.xhtml";
1059+
let start_offset: usize = 0;
1060+
1061+
let xml_cache = {
1062+
let mut cache = UriCache::default();
1063+
let tree = XmlParser::new(CHAPTER_HTML).parse();
1064+
let mut dummy_doc: EpubDocument<std::io::Cursor<Vec<u8>>> = EpubDocument {
1065+
archive: ZipArchive::new(std::io::Cursor::new(build_minimal_epub(CHAPTER_HTML)))
1066+
.unwrap(),
1067+
info: OpfDocument::empty(),
1068+
parent: PathBuf::default(),
1069+
engine: Engine::new(),
1070+
spine: vec![Chunk {
1071+
path: name.to_string(),
1072+
size: CHAPTER_HTML.len(),
1073+
}],
1074+
cache: FxHashMap::default(),
1075+
ignore_document_css: false,
1076+
};
1077+
dummy_doc.cache_uris(tree.root(), name, start_offset, &mut cache);
1078+
cache
1079+
};
1080+
1081+
let h5_cache = {
1082+
let mut cache = UriCache::default();
1083+
let tree = parse_html5(CHAPTER_HTML);
1084+
let mut dummy_doc: EpubDocument<std::io::Cursor<Vec<u8>>> = EpubDocument {
1085+
archive: ZipArchive::new(std::io::Cursor::new(build_minimal_epub(CHAPTER_HTML)))
1086+
.unwrap(),
1087+
info: OpfDocument::empty(),
1088+
parent: PathBuf::default(),
1089+
engine: Engine::new(),
1090+
spine: vec![Chunk {
1091+
path: name.to_string(),
1092+
size: CHAPTER_HTML.len(),
1093+
}],
1094+
cache: FxHashMap::default(),
1095+
ignore_document_css: false,
1096+
};
1097+
dummy_doc.cache_uris(tree.root(), name, start_offset, &mut cache);
1098+
cache
1099+
};
1100+
1101+
assert_eq!(
1102+
xml_cache, h5_cache,
1103+
"cache_uris maps differ between XmlParser and parse_html5\n\
1104+
XmlParser: {xml_cache:?}\n\
1105+
html5ever: {h5_cache:?}"
1106+
);
1107+
}
1108+
1109+
/// Verify that `build_display_list` emits `DrawCommand::Marker` commands
1110+
/// with identical offsets whether the spine chapter was parsed by
1111+
/// `XmlParser` or `parse_html5`. Marker offsets are stored as reading
1112+
/// positions and bookmark byte offsets, so they must be parser-independent.
1113+
///
1114+
/// Uses a block-only chapter variant (no inline text nodes) so the engine
1115+
/// does not require loaded fonts — the Marker path is font-free.
1116+
#[test]
1117+
fn epub_spine_chapter_marker_offsets_match_between_parsers() {
1118+
let start_offset: usize = 512;
1119+
1120+
let xml_markers = {
1121+
let mut tree = XmlParser::new(CHAPTER_HTML_BLOCK_ONLY).parse();
1122+
tree.wrap_lost_inlines();
1123+
marker_offsets_from_tree(tree, start_offset)
1124+
};
1125+
1126+
let h5_markers = {
1127+
let mut tree = parse_html5(CHAPTER_HTML_BLOCK_ONLY);
1128+
tree.wrap_lost_inlines();
1129+
marker_offsets_from_tree(tree, start_offset)
1130+
};
1131+
1132+
assert!(
1133+
!xml_markers.is_empty(),
1134+
"no Marker commands produced — check id attributes"
1135+
);
1136+
assert_eq!(
1137+
xml_markers, h5_markers,
1138+
"Marker offsets differ between XmlParser and parse_html5\n\
1139+
XmlParser: {xml_markers:?}\n\
1140+
html5ever: {h5_markers:?}"
1141+
);
1142+
}
1143+
1144+
/// Drive `Engine::build_display_list` directly for a pre-parsed tree and
1145+
/// collect all `DrawCommand::Marker` offsets. Uses a no-op resource
1146+
/// fetcher since the test chapter has no external assets.
1147+
fn marker_offsets_from_tree(tree: XmlTree, start_offset: usize) -> Vec<usize> {
1148+
struct NoopFetcher;
1149+
impl ResourceFetcher for NoopFetcher {
1150+
fn fetch(&mut self, _name: &str) -> Result<Vec<u8>, Error> {
1151+
Ok(Vec::new())
1152+
}
1153+
}
1154+
1155+
let mut engine = Engine::new();
1156+
engine.layout(600, 800, 12.0, 265);
1157+
1158+
let rect = engine.rect();
1159+
let mut draw_state = DrawState {
1160+
position: rect.min,
1161+
..Default::default()
1162+
};
1163+
let root_data = RootData {
1164+
start_offset,
1165+
spine_dir: PathBuf::default(),
1166+
rect,
1167+
};
1168+
let stylesheet = StyleSheet::new();
1169+
let style = StyleData {
1170+
font_size: engine.font_size,
1171+
line_height: crate::unit::pt_to_px(engine.line_height * engine.font_size, engine.dpi)
1172+
.round() as i32,
1173+
text_align: engine.text_align,
1174+
start_x: rect.min.x,
1175+
end_x: rect.max.x,
1176+
width: rect.max.x - rect.min.x,
1177+
..Default::default()
1178+
};
1179+
let loop_context = LoopContext::default();
1180+
let mut pages: Vec<Page> = vec![Vec::new()];
1181+
1182+
if let Some(body) = tree.root().find("body") {
1183+
engine.build_display_list(
1184+
body,
1185+
&style,
1186+
&loop_context,
1187+
&stylesheet,
1188+
&root_data,
1189+
&mut NoopFetcher,
1190+
&mut draw_state,
1191+
&mut pages,
1192+
);
1193+
}
1194+
1195+
collect_marker_offsets(&pages)
1196+
}
9271197
fn setup_epub() -> EpubDocumentFile {
9281198
let root_dir = PathBuf::from(
9291199
std::env::var("TEST_ROOT_DIR").expect("TEST_ROOT_DIR must be set for epub tests"),

crates/core/src/document/epub/opf.rs

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,23 @@ impl OpfDocument {
7979
})
8080
}
8181

82+
/// Returns an empty `OpfDocument` with no manifest, spine, or metadata.
83+
///
84+
/// Used in tests that construct a stub [`super::EpubDocument`] without a
85+
/// real OPF file.
86+
#[cfg(test)]
87+
pub fn empty() -> Self {
88+
OpfDocument {
89+
manifest: Vec::new(),
90+
spine_idrefs: Vec::new(),
91+
spine_toc_id: None,
92+
dc_metadata: HashMap::new(),
93+
cover_href: None,
94+
series: None,
95+
categories: BTreeSet::new(),
96+
}
97+
}
98+
8299
/// Returns the `idref` values of all `<itemref>` children of `<spine>`,
83100
/// together with the `<spine toc="...">` attribute value if present.
84101
pub fn spine_idrefs(&self) -> (&[String], Option<&str>) {

crates/core/src/document/html/dom.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -337,6 +337,8 @@ impl XmlTree {
337337
}
338338
}
339339

340+
// TODO(OGKevin): determine if this is actually still needed
341+
// with the move to an external parser.
340342
pub fn wrap_lost_inlines(&mut self) {
341343
self.promote_blockish_inlines();
342344

crates/core/src/document/html/html5.rs

Lines changed: 9 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -3,13 +3,6 @@
33
//! This module exposes a single public type, [`Html5Document`], which wraps
44
//! the shared [`HtmlBase`] rendering pipeline and uses [`parse_html5`] to
55
//! build the document tree.
6-
//!
7-
//! Use [`Html5Document`] when HTML5 conformance matters more than offset
8-
//! precision — for example, the dictionary view, which renders content from
9-
//! third-party dictionaries that may contain entities, void elements, and
10-
//! implicitly-closed tags. Because node offsets are synthetic (not byte
11-
//! positions), reading positions must **not** be persisted when using this
12-
//! type.
136
147
use super::HtmlBase;
158
use super::layout::TextAlign;
@@ -28,13 +21,10 @@ const USER_STYLESHEET: &str = "css/dictionary-user.css";
2821
/// HTML document backed by the html5ever spec-compliant parser.
2922
///
3023
/// Handles HTML entities, void elements (`<br>`, `<img>`), and implicitly-
31-
/// closed block tags correctly per the HTML5 spec. Node offsets are **synthetic**
32-
/// (not byte positions in the source), so this type is **not** suitable for
33-
/// persisting reading positions to disk. Use it for ephemeral rendering such
34-
/// as the dictionary view.
35-
///
36-
/// For documents where offset accuracy matters (EPUB spine chapters, standalone
37-
/// HTML files) use [`HtmlDocument`](super::HtmlDocument) instead.
24+
/// closed block tags correctly per the HTML5 spec. Node offsets are
25+
/// byte-accurate source positions supplied by the `source-positions` feature
26+
/// of `html5ever`, matching those produced by [`XmlParser`](super::xml::XmlParser)
27+
/// for the same input.
3828
pub struct Html5Document {
3929
/// Shared rendering state (tree, engine, page cache, stylesheets).
4030
pub(super) base: HtmlBase,
@@ -52,7 +42,8 @@ impl Html5Document {
5242
/// [`set_user_stylesheet`](Self::set_user_stylesheet) to override them.
5343
#[cfg_attr(feature = "tracing", tracing::instrument(skip(text), fields(len = text.len())))]
5444
pub fn new_from_memory(text: &str) -> Html5Document {
55-
let content = parse_html5(text);
45+
let mut content = parse_html5(text);
46+
content.wrap_lost_inlines();
5647
Html5Document {
5748
base: HtmlBase::new(
5849
content,
@@ -69,7 +60,9 @@ impl Html5Document {
6960
#[cfg_attr(feature = "tracing", tracing::instrument(skip(self, text), fields(len = text.len())))]
7061
pub fn update(&mut self, text: &str) {
7162
self.base.size = text.len();
72-
self.base.content = parse_html5(text);
63+
let mut content = parse_html5(text);
64+
content.wrap_lost_inlines();
65+
self.base.content = content;
7366
self.base.pages.clear();
7467
}
7568

0 commit comments

Comments
 (0)