From 9170093776939132063897ddc73065587364e899 Mon Sep 17 00:00:00 2001 From: che cheng Date: Sun, 19 Apr 2026 18:53:56 +0800 Subject: [PATCH 1/5] refactor: swap pdf-to-latex-swift path dep for PsychQuant/pdf-to-latex-swift v0.1.0 (#79 Track A) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Track A of #79 — formalize the existing PsychQuant/pdf-to-latex-swift remote that was pushed 2026-04-16 but never version-tagged. Discovery during spectra-discuss: diff -rq confirmed local and remote are byte-identical, so reconciliation collapses to ceremony: 1. git tag v0.1.0 + push origin v0.1.0 (done on remote) 2. Package.swift: path dep → url dep (from: 0.1.0) 3. swift package update pdf-to-latex-swift 4. Delete local packages/pdf-to-latex-swift/ (no longer needed — SPM resolves from URL) Pre-existing dependency pins preserved per design (Package.resolved cascade risk): note-core-swift@0.1.3, note-to-pdf-swift@0.1.2, word-builder-swift@0.9.0, ooxml-swift@0.7.0. swift build: clean link, no consumer breakage in MacDoc+PDF.swift, MacDoc+PDF+Phase2.swift, MacDoc+Config.swift, MacDoc+OCR.swift. 
Refs #79 --- Package.resolved | 9 +++++++++ Package.swift | 2 +- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/Package.resolved b/Package.resolved index c65fabf..d4c4191 100644 --- a/Package.resolved +++ b/Package.resolved @@ -90,6 +90,15 @@ "version" : "0.7.0" } }, + { + "identity" : "pdf-to-latex-swift", + "kind" : "remoteSourceControl", + "location" : "https://github.com/PsychQuant/pdf-to-latex-swift.git", + "state" : { + "revision" : "d62eb4fd77bc1a7d0f554cf1400104e3dbd68f1a", + "version" : "0.1.0" + } + }, { "identity" : "swift-argument-parser", "kind" : "remoteSourceControl", diff --git a/Package.swift b/Package.swift index 8fbdddd..f631d24 100644 --- a/Package.swift +++ b/Package.swift @@ -12,7 +12,7 @@ let package = Package( .package(url: "https://github.com/PsychQuant/common-converter-swift.git", from: "0.4.0"), .package(url: "https://github.com/PsychQuant/word-to-md-swift.git", from: "0.5.1"), .package(name: "MarkerWordConverter", path: "packages/marker-word-converter-swift"), - .package(name: "pdf-to-latex-swift", path: "packages/pdf-to-latex-swift"), + .package(url: "https://github.com/PsychQuant/pdf-to-latex-swift.git", from: "0.1.0"), .package(name: "PDFToMD", path: "packages/pdf-to-md-swift"), .package(name: "WordToHTML", path: "packages/word-to-html-swift"), .package(name: "HTMLToWord", path: "packages/html-to-word-swift"), From 3a2806ad0472d067175e1eb2067599b7d98f5704 Mon Sep 17 00:00:00 2001 From: che cheng Date: Sun, 19 Apr 2026 18:55:41 +0800 Subject: [PATCH 2/5] refactor: extract ocr-swift to PsychQuant/ocr-swift v0.1.0 (#79 Track B) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Track B of #79 — clean-slate extraction mirroring the note-* pattern. 
Changes: - packages/ocr-swift/ -> PsychQuant/ocr-swift v0.1.0 (new public repo) - 5 Swift sources (OCRPipeline, OCRBackend, MLXBackend, OllamaBackend, PDFKitExtractor) — byte-identical to previous local version - .gitignore excludes .build/, .swiftpm/, Package.resolved - Package.swift: .package(name: "OCRSwift", path: ...) -> .package(url:) - Package.swift: product ref .product(name: "OCRCore", package: "OCRSwift") -> package: "ocr-swift" (lowercase identity matches remote URL basename) Critical: PR #84's Qwen3-VL/OCRBackend wiring in PageOCRRunner.swift was specifically verified to compile against MLXBackend(modelConfig:) / OllamaBackend(host:) / OCRPipeline(backend:) from the new url dep. swift build: 6.28s clean link, no consumer breakage. Pre-existing dependency pins preserved: note-core-swift@0.1.3, note-to-pdf-swift@0.1.2, word-builder-swift@0.9.0, ooxml-swift@0.7.0, pdf-to-latex-swift@0.1.0. Refs #79 --- Package.resolved | 9 +++++++++ Package.swift | 4 ++-- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/Package.resolved b/Package.resolved index d4c4191..efa6573 100644 --- a/Package.resolved +++ b/Package.resolved @@ -81,6 +81,15 @@ "version" : "0.1.2" } }, + { + "identity" : "ocr-swift", + "kind" : "remoteSourceControl", + "location" : "https://github.com/PsychQuant/ocr-swift.git", + "state" : { + "revision" : "692dc1748d7e94a954725c9166021a1c47522397", + "version" : "0.1.0" + } + }, { "identity" : "ooxml-swift", "kind" : "remoteSourceControl", diff --git a/Package.swift b/Package.swift index f631d24..ccf5669 100644 --- a/Package.swift +++ b/Package.swift @@ -28,7 +28,7 @@ let package = Package( .package(url: "https://github.com/PsychQuant/note-to-html-swift.git", from: "0.1.0"), .package(url: "https://github.com/PsychQuant/note-to-pdf-swift.git", from: "0.1.0"), .package(url: "https://github.com/PsychQuant/word-builder-swift.git", from: "0.9.0"), - .package(name: "OCRSwift", path: "packages/ocr-swift"), + .package(url: 
"https://github.com/PsychQuant/ocr-swift.git", from: "0.1.0"), ], targets: [ .executableTarget( @@ -52,7 +52,7 @@ let package = Package( .product(name: "TeXToDOCX", package: "TeXToDOCX"), .product(name: "NoteToHTML", package: "note-to-html-swift"), .product(name: "NoteToPDF", package: "note-to-pdf-swift"), - .product(name: "OCRCore", package: "OCRSwift"), + .product(name: "OCRCore", package: "ocr-swift"), .product(name: "ArgumentParser", package: "swift-argument-parser"), ] ), From 6fd48803e7631082582e128c79f6c124cab25ab3 Mon Sep 17 00:00:00 2001 From: che cheng Date: Sun, 19 Apr 2026 18:56:14 +0800 Subject: [PATCH 3/5] refactor: delete empty WordToMDTests testTarget (#79 Track C) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Track C of #79 — delete dead scaffolding that declared a testTarget with .copy("Fixtures") resources but had zero test files and empty Fixtures/ subdirectory. Matches reality (no tests exist) and follows #81's "real tests only" philosophy. If WordToMDSwift coverage is wanted later, a separate issue can create real smoke tests with proper fixtures — not placeholder scaffolding that looks like infrastructure but produces no assertions. swift test: 28 tests still green (26 Swift Testing + 2 XCTest), no regression. 
Refs #79 --- Package.swift | 7 ------- 1 file changed, 7 deletions(-) diff --git a/Package.swift b/Package.swift index ccf5669..0c1702b 100644 --- a/Package.swift +++ b/Package.swift @@ -56,13 +56,6 @@ let package = Package( .product(name: "ArgumentParser", package: "swift-argument-parser"), ] ), - .testTarget( - name: "WordToMDTests", - dependencies: [ - .product(name: "WordToMDSwift", package: "word-to-md-swift"), - ], - resources: [.copy("Fixtures")] - ), .testTarget( name: "MacDocCLITests", dependencies: [] From a380cfca782c84745b5d8e37c0487545e19b8048 Mon Sep 17 00:00:00 2001 From: che cheng Date: Sun, 19 Apr 2026 19:04:56 +0800 Subject: [PATCH 4/5] chore: whitelist bib-apa-to-{html,json,md}-swift packages in .gitignore (#79 Track D fix) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Track D clean-clone verification surfaced 3 additional gitignored packages that #79's original scope didn't anticipate: - packages/bib-apa-to-html-swift (20 KB source) - packages/bib-apa-to-json-swift (4 KB source) - packages/bib-apa-to-md-swift (12 KB source) All 3 were matched by .gitignore's blanket 'packages/' rule and referenced by macdoc Package.swift as .package(name:, path:) deps — same pattern as #78's note-* and #79's pdf-to-latex/ocr-swift. Decision: Option (b) from issue #79 — whitelist + commit in-tree (matches srt-to-html-swift, md-to-html-swift, html-to-md-swift, marker-word-converter-swift, tex-to-docx-swift precedent). These are small (36 KB total), don't have a compelling independent-repo story, and the 2 existing private remotes under apa-bib-* naming would require rename + make-public ceremony for no real benefit. This completes #79's Phase 5 audit and unblocks clean-clone 'git clone && swift build' verification. 
Refs #79 --- .gitignore | 42 + packages/bib-apa-swift/Package.swift | 25 + .../Sources/BibAPA/APAReferenceRenderer.swift | 31 + .../Sources/BibAPA/APAStyler.swift | 217 +++++ .../Sources/BibAPA/APAStylerHelpers.swift | 400 ++++++++++ .../Sources/BibAPA/Models/APAReference.swift | 149 ++++ .../Sources/BibAPA/Models/TextSegment.swift | 35 + .../Tests/BibAPATests/APAStylerTests.swift | 67 ++ packages/bib-apa-to-html-swift/.gitignore | 8 + packages/bib-apa-to-html-swift/Package.swift | 25 + .../Sources/BibAPAToHTML/APACSS.swift | 74 ++ .../BibAPAToHTML/BibToAPAHTMLFormatter.swift | 65 ++ .../Sources/BibAPAToHTML/HTMLRenderer.swift | 176 ++++ .../Sources/BibAPAToHTML/Reexports.swift | 4 + .../BibAPAToHTMLTests/HTMLRendererTests.swift | 160 ++++ packages/bib-apa-to-json-swift/.gitignore | 8 + packages/bib-apa-to-json-swift/Package.swift | 25 + .../BibAPAToJSON/BibToAPAJSONFormatter.swift | 69 ++ .../JSONFormatterTests.swift | 102 +++ packages/bib-apa-to-md-swift/Package.swift | 25 + .../BibAPAToMD/BibToAPAFormatter.swift | 72 ++ .../Sources/BibAPAToMD/MarkdownRenderer.swift | 170 ++++ .../BibToAPAFormatterTests.swift | 207 +++++ .../BibAPAToMDTests/IntegrationTests.swift | 20 + packages/biblatex-apa-swift/.gitignore | 5 + packages/biblatex-apa-swift/LICENSE | 21 + .../HTMLToWord/HTMLToWordConverter.swift | 581 ++++++++++++++ .../HTMLToWordConverterTests.swift | 210 +++++ packages/md-to-word-swift/.gitignore | 7 + .../Sources/MDToWord/FigureImporter.swift | 75 ++ .../Sources/MDToWord/FootnoteParser.swift | 86 ++ .../Sources/MDToWord/MarkdownASTWalker.swift | 575 ++++++++++++++ .../MDToWord/MarkdownToWordConverter.swift | 751 ++++++++++++++++++ .../Sources/MDToWord/MetadataReader.swift | 700 ++++++++++++++++ .../Tests/MDToWordTests/E2ETests.swift | 332 ++++++++ .../MarkdownToWordConverterTests.swift | 155 ++++ .../MDToWordTests/MetadataReaderTests.swift | 438 ++++++++++ .../Tests/MDToWordTests/RoundTripTests.swift | 660 +++++++++++++++ 
.../PDFToDOCX/PDFToDOCXConverter.swift | 469 +++++++++++ .../PDFToDOCXConverterTests.swift | 188 +++++ .../Sources/PDFToMD/PDFConverter.swift | 330 ++++++++ .../Sources/PDFToMDSmokeTests/main.swift | 176 ++++ .../PDFToMDTests/PDFConverterTests.swift | 167 ++++ .../WordToHTML/WordHTMLConverter.swift | 655 +++++++++++++++ .../WordHTMLConverterTests.swift | 279 +++++++ 45 files changed, 9036 insertions(+) create mode 100644 packages/bib-apa-swift/Package.swift create mode 100644 packages/bib-apa-swift/Sources/BibAPA/APAReferenceRenderer.swift create mode 100644 packages/bib-apa-swift/Sources/BibAPA/APAStyler.swift create mode 100644 packages/bib-apa-swift/Sources/BibAPA/APAStylerHelpers.swift create mode 100644 packages/bib-apa-swift/Sources/BibAPA/Models/APAReference.swift create mode 100644 packages/bib-apa-swift/Sources/BibAPA/Models/TextSegment.swift create mode 100644 packages/bib-apa-swift/Tests/BibAPATests/APAStylerTests.swift create mode 100644 packages/bib-apa-to-html-swift/.gitignore create mode 100644 packages/bib-apa-to-html-swift/Package.swift create mode 100644 packages/bib-apa-to-html-swift/Sources/BibAPAToHTML/APACSS.swift create mode 100644 packages/bib-apa-to-html-swift/Sources/BibAPAToHTML/BibToAPAHTMLFormatter.swift create mode 100644 packages/bib-apa-to-html-swift/Sources/BibAPAToHTML/HTMLRenderer.swift create mode 100644 packages/bib-apa-to-html-swift/Sources/BibAPAToHTML/Reexports.swift create mode 100644 packages/bib-apa-to-html-swift/Tests/BibAPAToHTMLTests/HTMLRendererTests.swift create mode 100644 packages/bib-apa-to-json-swift/.gitignore create mode 100644 packages/bib-apa-to-json-swift/Package.swift create mode 100644 packages/bib-apa-to-json-swift/Sources/BibAPAToJSON/BibToAPAJSONFormatter.swift create mode 100644 packages/bib-apa-to-json-swift/Tests/BibAPAToJSONTests/JSONFormatterTests.swift create mode 100644 packages/bib-apa-to-md-swift/Package.swift create mode 100644 
packages/bib-apa-to-md-swift/Sources/BibAPAToMD/BibToAPAFormatter.swift create mode 100644 packages/bib-apa-to-md-swift/Sources/BibAPAToMD/MarkdownRenderer.swift create mode 100644 packages/bib-apa-to-md-swift/Tests/BibAPAToMDTests/BibToAPAFormatterTests.swift create mode 100644 packages/bib-apa-to-md-swift/Tests/BibAPAToMDTests/IntegrationTests.swift create mode 100644 packages/biblatex-apa-swift/.gitignore create mode 100644 packages/biblatex-apa-swift/LICENSE create mode 100644 packages/html-to-word-swift/Sources/HTMLToWord/HTMLToWordConverter.swift create mode 100644 packages/html-to-word-swift/Tests/HTMLToWordTests/HTMLToWordConverterTests.swift create mode 100644 packages/md-to-word-swift/.gitignore create mode 100644 packages/md-to-word-swift/Sources/MDToWord/FigureImporter.swift create mode 100644 packages/md-to-word-swift/Sources/MDToWord/FootnoteParser.swift create mode 100644 packages/md-to-word-swift/Sources/MDToWord/MarkdownASTWalker.swift create mode 100644 packages/md-to-word-swift/Sources/MDToWord/MarkdownToWordConverter.swift create mode 100644 packages/md-to-word-swift/Sources/MDToWord/MetadataReader.swift create mode 100644 packages/md-to-word-swift/Tests/MDToWordTests/E2ETests.swift create mode 100644 packages/md-to-word-swift/Tests/MDToWordTests/MarkdownToWordConverterTests.swift create mode 100644 packages/md-to-word-swift/Tests/MDToWordTests/MetadataReaderTests.swift create mode 100644 packages/md-to-word-swift/Tests/MDToWordTests/RoundTripTests.swift create mode 100644 packages/pdf-to-docx-swift/Sources/PDFToDOCX/PDFToDOCXConverter.swift create mode 100644 packages/pdf-to-docx-swift/Tests/PDFToDOCXTests/PDFToDOCXConverterTests.swift create mode 100644 packages/pdf-to-md-swift/Sources/PDFToMD/PDFConverter.swift create mode 100644 packages/pdf-to-md-swift/Sources/PDFToMDSmokeTests/main.swift create mode 100644 packages/pdf-to-md-swift/Tests/PDFToMDTests/PDFConverterTests.swift create mode 100644 
packages/word-to-html-swift/Sources/WordToHTML/WordHTMLConverter.swift create mode 100644 packages/word-to-html-swift/Tests/WordToHTMLTests/WordHTMLConverterTests.swift diff --git a/.gitignore b/.gitignore index 58b9829..f39700f 100644 --- a/.gitignore +++ b/.gitignore @@ -57,6 +57,48 @@ packages/ !packages/tex-to-docx-swift/Sources/TeXToDOCX/TeXPreambleParser.swift !packages/tex-to-docx-swift/Tests/ !packages/tex-to-docx-swift/Tests/TeXToDOCXTests/ +!packages/bib-apa-to-html-swift/ +!packages/bib-apa-to-html-swift/Package.swift +!packages/bib-apa-to-html-swift/Sources/ +!packages/bib-apa-to-html-swift/Sources/BibAPAToHTML/ +!packages/bib-apa-to-html-swift/Sources/BibAPAToHTML/APACSS.swift +!packages/bib-apa-to-html-swift/Sources/BibAPAToHTML/BibToAPAHTMLFormatter.swift +!packages/bib-apa-to-html-swift/Sources/BibAPAToHTML/HTMLRenderer.swift +!packages/bib-apa-to-html-swift/Sources/BibAPAToHTML/Reexports.swift +!packages/bib-apa-to-html-swift/Tests/ +!packages/bib-apa-to-html-swift/Tests/BibAPAToHTMLTests/ +!packages/bib-apa-to-html-swift/Tests/BibAPAToHTMLTests/HTMLRendererTests.swift +!packages/bib-apa-to-json-swift/ +!packages/bib-apa-to-json-swift/Package.swift +!packages/bib-apa-to-json-swift/Sources/ +!packages/bib-apa-to-json-swift/Sources/BibAPAToJSON/ +!packages/bib-apa-to-json-swift/Sources/BibAPAToJSON/BibToAPAJSONFormatter.swift +!packages/bib-apa-to-json-swift/Tests/ +!packages/bib-apa-to-json-swift/Tests/BibAPAToJSONTests/ +!packages/bib-apa-to-json-swift/Tests/BibAPAToJSONTests/JSONFormatterTests.swift +!packages/bib-apa-to-md-swift/ +!packages/bib-apa-to-md-swift/Package.swift +!packages/bib-apa-to-md-swift/Sources/ +!packages/bib-apa-to-md-swift/Sources/BibAPAToMD/ +!packages/bib-apa-to-md-swift/Sources/BibAPAToMD/BibToAPAFormatter.swift +!packages/bib-apa-to-md-swift/Sources/BibAPAToMD/MarkdownRenderer.swift +!packages/bib-apa-to-md-swift/Tests/ +!packages/bib-apa-to-md-swift/Tests/BibAPAToMDTests/ 
+!packages/bib-apa-to-md-swift/Tests/BibAPAToMDTests/BibToAPAFormatterTests.swift +!packages/bib-apa-to-md-swift/Tests/BibAPAToMDTests/IntegrationTests.swift +!packages/bib-apa-swift/ +!packages/bib-apa-swift/Package.swift +!packages/bib-apa-swift/Sources/ +!packages/bib-apa-swift/Sources/BibAPA/ +!packages/bib-apa-swift/Sources/BibAPA/APAReferenceRenderer.swift +!packages/bib-apa-swift/Sources/BibAPA/APAStyler.swift +!packages/bib-apa-swift/Sources/BibAPA/APAStylerHelpers.swift +!packages/bib-apa-swift/Sources/BibAPA/Models/ +!packages/bib-apa-swift/Sources/BibAPA/Models/APAReference.swift +!packages/bib-apa-swift/Sources/BibAPA/Models/TextSegment.swift +!packages/bib-apa-swift/Tests/ +!packages/bib-apa-swift/Tests/BibAPATests/ +!packages/bib-apa-swift/Tests/BibAPATests/APAStylerTests.swift mcp/ # Local test files (personal .note, .pptx, .docx etc.) diff --git a/packages/bib-apa-swift/Package.swift b/packages/bib-apa-swift/Package.swift new file mode 100644 index 0000000..7f4f0dc --- /dev/null +++ b/packages/bib-apa-swift/Package.swift @@ -0,0 +1,25 @@ +// swift-tools-version: 5.9 +import PackageDescription + +let package = Package( + name: "BibAPA", + platforms: [.macOS(.v14)], + products: [ + .library(name: "BibAPA", targets: ["BibAPA"]) + ], + dependencies: [ + .package(name: "BiblatexAPA", path: "../biblatex-apa-swift"), + ], + targets: [ + .target( + name: "BibAPA", + dependencies: ["BiblatexAPA"], + path: "Sources/BibAPA" + ), + .testTarget( + name: "BibAPATests", + dependencies: ["BibAPA"], + path: "Tests/BibAPATests" + ) + ] +) diff --git a/packages/bib-apa-swift/Sources/BibAPA/APAReferenceRenderer.swift b/packages/bib-apa-swift/Sources/BibAPA/APAReferenceRenderer.swift new file mode 100644 index 0000000..a0619d8 --- /dev/null +++ b/packages/bib-apa-swift/Sources/BibAPA/APAReferenceRenderer.swift @@ -0,0 +1,31 @@ +// APAReferenceRenderer.swift — Protocol for rendering APAReference to output format + +/// Renders an APAReference into a specific output 
format (Markdown, HTML, Astro props, etc.). +public protocol APAReferenceRenderer { + associatedtype Output + + func render(_ reference: APAReference) -> Output + + func renderArticle(_ ref: APAArticleRef) -> Output + func renderBook(_ ref: APABookRef) -> Output + func renderChapter(_ ref: APAChapterRef) -> Output + func renderThesis(_ ref: APAThesisRef) -> Output + func renderReport(_ ref: APAReportRef) -> Output + func renderPresentation(_ ref: APAPresentationRef) -> Output + func renderOnline(_ ref: APAOnlineRef) -> Output +} + +// Default dispatch +extension APAReferenceRenderer { + public func render(_ reference: APAReference) -> Output { + switch reference { + case .article(let ref): return renderArticle(ref) + case .book(let ref): return renderBook(ref) + case .chapter(let ref): return renderChapter(ref) + case .thesis(let ref): return renderThesis(ref) + case .report(let ref): return renderReport(ref) + case .presentation(let ref): return renderPresentation(ref) + case .online(let ref): return renderOnline(ref) + } + } +} diff --git a/packages/bib-apa-swift/Sources/BibAPA/APAStyler.swift b/packages/bib-apa-swift/Sources/BibAPA/APAStyler.swift new file mode 100644 index 0000000..30e2206 --- /dev/null +++ b/packages/bib-apa-swift/Sources/BibAPA/APAStyler.swift @@ -0,0 +1,217 @@ +// APAStyler.swift — BibEntry → APAReference semantic model +// Applies APA 7 rules to produce format-agnostic reference data. + +import Foundation +import BiblatexAPA + +public struct APAStyler { + + // MARK: - Public API + + /// Convert a BibEntry into a format-agnostic APAReference. 
+ public static func style(_ entry: BibEntry) -> APAReference { + switch entry.normalizedType { + case "ARTICLE": + return .article(styleArticle(entry)) + case "BOOK", "COLLECTION", "PROCEEDINGS", "REFERENCE": + return .book(styleBook(entry)) + case "INBOOK", "INCOLLECTION", "INPROCEEDINGS": + return .chapter(styleChapter(entry)) + case "THESIS", "PHDTHESIS", "MASTERSTHESIS": + return .thesis(styleThesis(entry)) + case "REPORT": + return .report(styleReport(entry)) + case "PRESENTATION": + return .presentation(stylePresentation(entry)) + case "ONLINE", "MISC": + return .online(styleOnline(entry)) + default: + return .online(styleOnline(entry)) + } + } + + /// Format a single BibEntry as an in-text citation: (Author, Year) + public static func formatCitation(_ entry: BibEntry) -> String { + let authors = formatCitationAuthors(entry) + let year = extractYear(entry) + return "(\(authors), \(year))" + } + + /// Format a single BibEntry as a narrative citation: Author (Year) + public static func formatNarrativeCitation(_ entry: BibEntry) -> String { + let authors = formatCitationAuthors(entry) + let year = extractYear(entry) + return "\(authors) (\(year))" + } + + // MARK: - Per-Type Styling + + private static func styleArticle(_ entry: BibEntry) -> APAArticleRef { + let title = buildTitle(entry) + let journal = field(entry, "JOURNALTITLE").map(stripBraces) ?? "" + + return APAArticleRef( + authors: formatAuthorsString(entry), + date: formatDate(entry), + title: title, + journal: journal, + volume: field(entry, "VOLUME"), + issue: field(entry, "NUMBER"), + pages: field(entry, "PAGES").map(normalizePages), + doi: normalizeDOI(entry), + url: doi(entry) == nil ? 
field(entry, "URL").map(stripBraces) : nil + ) + } + + private static func styleBook(_ entry: BibEntry) -> APABookRef { + let title = buildTitleWithSubtitle(entry) + + return APABookRef( + authors: formatAuthorsString(entry), + date: formatDate(entry), + title: title, + edition: formatEdition(field(entry, "EDITION")), + volume: field(entry, "VOLUME"), + publisher: field(entry, "PUBLISHER").map(stripBraces), + doi: normalizeDOI(entry), + url: doi(entry) == nil ? field(entry, "URL").map(stripBraces) : nil + ) + } + + private static func styleChapter(_ entry: BibEntry) -> APAChapterRef { + let chapterTitle = toSentenceCase(stripBraces(entry.title ?? "")) + let bookTitle = toSentenceCase(stripBraces(field(entry, "BOOKTITLE") ?? "")) + let editorStr = buildEditorString(entry) + let pages = field(entry, "PAGES").map { "pp. \(normalizePages($0))" } + + return APAChapterRef( + authors: formatAuthorsString(entry), + date: formatDate(entry), + chapterTitle: chapterTitle, + editors: editorStr, + bookTitle: bookTitle, + edition: formatEdition(field(entry, "EDITION")), + volume: field(entry, "VOLUME").map { "Vol. \($0)" }, + pages: pages, + publisher: field(entry, "PUBLISHER").map(stripBraces), + doi: normalizeDOI(entry), + url: doi(entry) == nil ? field(entry, "URL").map(stripBraces) : nil + ) + } + + private static func styleThesis(_ entry: BibEntry) -> APAThesisRef { + let thesisType: String + if let addon = field(entry, "TITLEADDON"), !addon.isEmpty { + thesisType = stripBraces(addon) + } else if entry.normalizedType == "MASTERSTHESIS" { + thesisType = "Master's thesis" + } else if let bibType = field(entry, "TYPE")?.lowercased(), bibType.contains("mathesis") { + thesisType = "Master's thesis" + } else { + thesisType = "Doctoral dissertation" + } + + let institution = (field(entry, "INSTITUTION") ?? 
field(entry, "SCHOOL")).map(stripBraces) + + return APAThesisRef( + authors: formatAuthorsString(entry), + date: formatDate(entry), + title: toSentenceCase(stripBraces(entry.title ?? "")), + thesisType: thesisType, + institution: institution, + doi: normalizeDOI(entry), + url: doi(entry) == nil ? field(entry, "URL").map(stripBraces) : nil + ) + } + + private static func styleReport(_ entry: BibEntry) -> APAReportRef { + let titleAddon = field(entry, "TITLEADDON").map(stripBraces) + let number: String? + if let num = field(entry, "NUMBER"), !num.isEmpty { + let typeLabel = field(entry, "TYPE") ?? "" + if !typeLabel.isEmpty { + number = "\(stripBraces(typeLabel)) \(num)" + } else { + number = "No. \(num)" + } + } else { + number = nil + } + + return APAReportRef( + authors: formatAuthorsString(entry), + date: formatDate(entry), + title: toSentenceCase(stripBraces(entry.title ?? "")), + titleAddon: titleAddon, + number: number, + institution: field(entry, "INSTITUTION").map(stripBraces), + doi: normalizeDOI(entry), + url: doi(entry) == nil ? field(entry, "URL").map(stripBraces) : nil + ) + } + + private static func stylePresentation(_ entry: BibEntry) -> APAPresentationRef { + let presentationType: String + if let addon = field(entry, "TITLEADDON"), !addon.isEmpty { + presentationType = stripBraces(addon) + } else { + presentationType = "Conference presentation" + } + + return APAPresentationRef( + authors: formatAuthorsString(entry), + date: formatDate(entry), + title: toSentenceCase(stripBraces(entry.title ?? "")), + presentationType: presentationType, + conference: field(entry, "EVENTTITLE").map(stripBraces), + venue: field(entry, "VENUE").map(stripBraces), + doi: normalizeDOI(entry), + url: doi(entry) == nil ? 
field(entry, "URL").map(stripBraces) : nil + ) + } + + private static func styleOnline(_ entry: BibEntry) -> APAOnlineRef { + return APAOnlineRef( + authors: formatAuthorsString(entry), + date: formatDate(entry), + title: toSentenceCase(stripBraces(entry.title ?? "")), + publisher: field(entry, "PUBLISHER").map(stripBraces), + doi: normalizeDOI(entry), + url: doi(entry) == nil ? field(entry, "URL").map(stripBraces) : nil + ) + } + + // MARK: - Helpers + + private static func doi(_ entry: BibEntry) -> String? { + entry.doi.flatMap { $0.isEmpty ? nil : $0 } + } + + private static func normalizeDOI(_ entry: BibEntry) -> String? { + guard let raw = entry.doi, !raw.isEmpty else { return nil } + let clean = stripBraces(raw) + return clean.hasPrefix("http") ? clean : "https://doi.org/\(clean)" + } + + private static func buildTitle(_ entry: BibEntry) -> String { + let cleanTitle = stripBraces(entry.title ?? "") + var full = toSentenceCase(cleanTitle) + if let sub = field(entry, "SUBTITLE"), !sub.isEmpty { + full += ": \(capitalizeFirst(toSentenceCase(stripBraces(sub)).lowercased()))" + } + return full + } + + private static func buildTitleWithSubtitle(_ entry: BibEntry) -> String { + buildTitle(entry) + } + + private static func buildEditorString(_ entry: BibEntry) -> String? { + guard let edRaw = field(entry, "EDITOR"), !edRaw.isEmpty else { return nil } + let editors = edRaw.components(separatedBy: " and ") + .map { parseSingleAuthor($0.trimmingCharacters(in: .whitespaces)) } + let edNames = editors.map(formatSingleAuthorForIn).joined(separator: ", ") + let edLabel = editors.count == 1 ? "Ed." : "Eds." 
+ return "\(edNames) (\(edLabel))" + } +} diff --git a/packages/bib-apa-swift/Sources/BibAPA/APAStylerHelpers.swift b/packages/bib-apa-swift/Sources/BibAPA/APAStylerHelpers.swift new file mode 100644 index 0000000..9643762 --- /dev/null +++ b/packages/bib-apa-swift/Sources/BibAPA/APAStylerHelpers.swift @@ -0,0 +1,400 @@ +// APAStylerHelpers.swift — Pure utility functions for APA 7 formatting +// Extracted from BibToAPAFormatter.swift (bib-to-apa-swift) + +import Foundation +import BiblatexAPA + +// MARK: - Author Name Model + +public struct AuthorName: Equatable, Sendable { + public let lastName: String + public let firstName: String // may be empty for corporate authors + public let suffix: String // e.g. "Jr." + + public var isCorporate: Bool { firstName.isEmpty && suffix.isEmpty } + + public init(lastName: String, firstName: String, suffix: String = "") { + self.lastName = lastName + self.firstName = firstName + self.suffix = suffix + } +} + +// MARK: - Author Parsing + +/// Parse biblatex author string into structured name parts. +/// Handles: "Last, First and Last, First" / "{Corporate Name}" / "Last, First, Jr." 
+public func parseAuthors(_ entry: BibEntry) -> [AuthorName] { + guard let raw = field(entry, "AUTHOR") else { return [] } + + let authorStrings = raw.components(separatedBy: " and ") + .map { $0.trimmingCharacters(in: .whitespaces) } + .filter { !$0.isEmpty } + + return authorStrings.map(parseSingleAuthor) +} + +public func parseSingleAuthor(_ raw: String) -> AuthorName { + let trimmed = raw.trimmingCharacters(in: .whitespaces) + + // Corporate author: {American Psychological Association} + if trimmed.hasPrefix("{") && trimmed.hasSuffix("}") { + let name = String(trimmed.dropFirst().dropLast()) + return AuthorName(lastName: name, firstName: "", suffix: "") + } + + let parts = trimmed.components(separatedBy: ",").map { + $0.trimmingCharacters(in: .whitespaces) + } + + switch parts.count { + case 1: + let words = parts[0].components(separatedBy: " ").filter { !$0.isEmpty } + if words.count == 1 { + return AuthorName(lastName: words[0], firstName: "") + } + let last = words.last! + let first = words.dropLast().joined(separator: " ") + return AuthorName(lastName: last, firstName: first) + + case 2: + return AuthorName(lastName: parts[0], firstName: parts[1]) + + case 3: + if parts[1].lowercased().contains("jr") || parts[1].lowercased().contains("sr") + || parts[1].lowercased().contains("iii") || parts[1].lowercased().contains("ii") { + return AuthorName(lastName: parts[0], firstName: parts[2], suffix: parts[1]) + } + return AuthorName(lastName: parts[0], firstName: "\(parts[1]), \(parts[2])") + + default: + return AuthorName(lastName: trimmed, firstName: "") + } +} + +// MARK: - Author Formatting (Reference List) + +/// APA 7 reference list format: Last, F. M., & Last, F. M. 
+public func formatAuthorNames(_ names: [AuthorName]) -> String { + let formatted = names.map(formatSingleAuthorRef) + let count = formatted.count + + switch count { + case 0: return "" + case 1: return formatted[0] + case 2: return "\(formatted[0]), & \(formatted[1])" + case 3...20: + let allButLast = formatted.dropLast().joined(separator: ", ") + return "\(allButLast), & \(formatted.last!)" + default: + let first19 = formatted.prefix(19).joined(separator: ", ") + return "\(first19), . . . \(formatted.last!)" + } +} + +/// Format single author for reference list: Last, F. M. +public func formatSingleAuthorRef(_ author: AuthorName) -> String { + if author.isCorporate { return author.lastName } + + let initials = formatInitials(author.firstName) + var result = author.lastName + if !initials.isEmpty { + result += ", \(initials)" + } + if !author.suffix.isEmpty { + result += ", \(author.suffix)" + } + return result +} + +/// Format editor name for "In" clause: F. M. Last +public func formatSingleAuthorForIn(_ author: AuthorName) -> String { + if author.isCorporate { return author.lastName } + let initials = formatInitials(author.firstName) + return initials.isEmpty ? author.lastName : "\(initials) \(author.lastName)" +} + +/// Convert first name to initials: "Hau-Hung" → "H.-H.", "Sarah Michelle" → "S. M." +public func formatInitials(_ firstName: String) -> String { + let parts = firstName.components(separatedBy: " ") + .filter { !$0.isEmpty } + + return parts.map { part in + if part.count <= 2 && part.first?.isUppercase == true { + return part.hasSuffix(".") ? part : "\(part)." + } + if part.contains("-") { + let sub = part.components(separatedBy: "-") + return sub.map { s in + guard let first = s.first else { return "" } + return "\(first.uppercased())." + }.joined(separator: "-") + } + guard let first = part.first else { return "" } + return "\(first.uppercased())." 
+ }.joined(separator: " ") +} + +// MARK: - Citation Authors + +/// APA 7 in-text citation: 1 → Last; 2 → Last & Last; 3+ → Last et al. +public func formatCitationAuthors(_ entry: BibEntry) -> String { + let names = parseAuthors(entry) + if names.isEmpty { return "Unknown" } + + switch names.count { + case 1: return names[0].lastName + case 2: return "\(names[0].lastName) & \(names[1].lastName)" + default: return "\(names[0].lastName) et al." + } +} + +// MARK: - Field Access + +/// Case-insensitive field lookup +public func field(_ entry: BibEntry, _ name: String) -> String? { + entry.fields.caseInsensitiveValue(forKey: name) +} + +// MARK: - Text Processing + +/// Strip outer braces: "{ADHD}" → "ADHD" +public func stripBraces(_ text: String) -> String { + var result = text + while result.hasPrefix("{") && result.hasSuffix("}") { + let inner = String(result.dropFirst().dropLast()) + var depth = 0 + var balanced = true + for ch in inner { + if ch == "{" { depth += 1 } + else if ch == "}" { depth -= 1 } + if depth < 0 { balanced = false; break } + } + if balanced && depth == 0 { + result = inner + } else { + break + } + } + return result +} + +/// Convert to APA sentence case. Preserves content inside braces as-is. 
+public func toSentenceCase(_ title: String) -> String {
+    // Phase 1: split the title into brace-protected and unprotected runs.
+    var segments: [(text: String, protected: Bool)] = []
+    var current = ""
+    var depth = 0
+
+    for ch in title {
+        if ch == "{" {
+            if depth == 0 && !current.isEmpty {
+                segments.append((current, false))
+                current = ""
+            }
+            depth += 1
+            // Only the outermost brace pair is removed; nested braces stay in the text.
+            if depth > 1 { current.append(ch) }
+        } else if ch == "}" {
+            // An unmatched "}" is dropped without letting the depth go negative;
+            // a negative depth would switch off protection for every later group.
+            guard depth > 0 else { continue }
+            depth -= 1
+            if depth == 0 {
+                segments.append((current, true))
+                current = ""
+            } else {
+                current.append(ch)
+            }
+        } else {
+            current.append(ch)
+        }
+    }
+    if !current.isEmpty {
+        segments.append((current, false))
+    }
+
+    // Phase 2: lowercase unprotected words, capitalizing only the first word of the title.
+    // `isFirst` stays true until the title's first word has been emitted, whatever kind
+    // of word that is. Without clearing it for protected runs and acronyms, the word
+    // after them was capitalized instead ("ADHD in children" → "ADHD In children").
+    var isFirst = true
+    let processed = segments.map { segment -> String in
+        if segment.protected {
+            if !segment.text.isEmpty { isFirst = false }
+            return segment.text
+        }
+
+        let words = segment.text.components(separatedBy: " ")
+        let result = words.map { word -> String in
+            // Empty strings come from consecutive spaces; they are not words.
+            if word.isEmpty { return word }
+
+            let wasFirst = isFirst
+            isFirst = false
+
+            // Words of two or more characters with no lowercase letter are kept as-is
+            // (acronyms such as "ADHD", numbers such as "2025").
+            if word.count >= 2 && word == word.uppercased()
+                && word.rangeOfCharacter(from: .lowercaseLetters) == nil {
+                return word
+            }
+
+            return wasFirst ? capitalizeFirst(word.lowercased()) : word.lowercased()
+        }.joined(separator: " ")
+
+        return capitalizeAfterColon(result)
+    }
+
+    return processed.joined()
+}
+
+/// Capitalize the first letter after ": "
+///
+/// Only ASCII lowercase letters (`a`–`z`) directly following a colon and one space
+/// are changed. Returns `text` unchanged if the pattern cannot be compiled.
+public func capitalizeAfterColon(_ text: String) -> String {
+    guard let regex = try? NSRegularExpression(pattern: ": ([a-z])") else { return text }
+    let nsText = text as NSString
+    let matches = regex.matches(in: text, range: NSRange(location: 0, length: nsText.length))
+
+    var result = text
+    // Each replacement swaps one ASCII letter for one ASCII letter, so the ranges
+    // computed on `text` remain valid for `result`.
+    for match in matches.reversed() {
+        guard let letterRange = Range(match.range(at: 1), in: result) else { continue }
+        let letter = result[letterRange].uppercased()
+        result.replaceSubrange(letterRange, with: letter)
+    }
+    return result
+}
+
+/// Uppercase the first character of `word`, leaving the rest untouched.
+public func capitalizeFirst(_ word: String) -> String {
+    guard let first = word.first else { return word }
+    return first.uppercased() + word.dropFirst()
+}
+
+// MARK: - Edition & Pages
+
+/// Format an edition field: "2" → "2nd ed.", "21" → "21st ed."
+///
+/// - Parameter edition: Raw edition value; outer braces are stripped first.
+/// - Returns: The value itself if it already contains "ed", an ordinal followed by
+///   " ed." for whole numbers above 1, and `nil` for a missing, empty, first, or
+///   otherwise unparseable edition.
+public func formatEdition(_ edition: String?) -> String? {
+    guard let ed = edition, !ed.isEmpty else { return nil }
+    let clean = stripBraces(ed)
+    if clean.contains("ed") { return clean }
+    guard let num = Int(clean), num > 1 else { return nil }
+
+    // English ordinals: 11–13 always take "th"; otherwise the last digit decides.
+    let suffix: String
+    switch (num % 100, num % 10) {
+    case (11...13, _): suffix = "th"
+    case (_, 1): suffix = "st"
+    case (_, 2): suffix = "nd"
+    case (_, 3): suffix = "rd"
+    default: suffix = "th"
+    }
+    return "\(num)\(suffix) ed."
+}
+
+/// Normalize pages: "1--51" → "1–51"
+///
+/// Double hyphens, em dashes, and single hyphens between two digits all become en dashes.
+public func normalizePages(_ pages: String) -> String {
+    var result = stripBraces(pages)
+    result = result.replacingOccurrences(of: "--", with: "–")
+    result = result.replacingOccurrences(of: "—", with: "–")
+    if let regex = try? NSRegularExpression(pattern: "(\\d)-(\\d)") {
+        result = regex.stringByReplacingMatches(
+            in: result,
+            range: NSRange(result.startIndex..., in: result),
+            withTemplate: "$1–$2"
+        )
+    }
+    return result
+}
+
+// MARK: - Date Formatting
+
+/// English month names, indexed by month number minus one.
+public let monthNames = [
+    "January", "February", "March", "April", "May", "June",
+    "July", "August", "September", "October", "November", "December"
+]
+
+/// Format a normalized ISO date: "2025" → "2025", "2025-03" → "2025, March"
+///
+/// Month values 21–24 are rendered as Spring, Summer, Fall, and Winter. A day is
+/// appended when present and positive ("2025-03-15" → "2025, March 15"). Any other
+/// month value falls back to the year alone; an empty year yields "n.d.".
+public func formatNormalizedDate(_ date: String) -> String {
+    let parts = date.components(separatedBy: "-")
+    guard let year = parts.first, !year.isEmpty else { return "n.d." }
+
+    if parts.count >= 2, let monthNum = Int(parts[1]) {
+        let seasons = [21: "Spring", 22: "Summer", 23: "Fall", 24: "Winter"]
+        if let season = seasons[monthNum] {
+            return "\(year), \(season)"
+        }
+        if (1...monthNames.count).contains(monthNum) {
+            let monthName = monthNames[monthNum - 1]
+            if parts.count >= 3, let day = Int(parts[2]), day > 0 {
+                return "\(year), \(monthName) \(day)"
+            }
+            return "\(year), \(monthName)"
+        }
+    }
+    return year
+}
+
+/// Format EVENTDATE ranges: "2025-05-01/2025-05-03" → "2025, May 1–3"
+///
+/// A range across two months renders as "2025, May 30–June 2". When the start is not
+/// a full year-month-day date with a month of 1–12, the start is formatted with
+/// `formatNormalizedDate` instead. When the end is not usable, only the start day is shown.
+public func formatEventDate(_ eventDate: String) -> String {
+    let normalized = APAUtilities.normalizeDate(eventDate)
+
+    if normalized.contains("/") {
+        let parts = normalized.components(separatedBy: "/")
+        if parts.count == 2 {
+            let start = parts[0]
+            let end = parts[1]
+
+            let startParts = start.components(separatedBy: "-")
+            let endParts = end.components(separatedBy: "-")
+
+            // The month is range-checked before it is used as an index into
+            // `monthNames`; month 0, 13+, or a season code would otherwise trap.
+            guard startParts.count >= 3,
+                  let year = startParts.first,
+                  let startMonth = Int(startParts[1]),
+                  let startDay = Int(startParts[2]),
+                  (1...monthNames.count).contains(startMonth) else {
+                return formatNormalizedDate(start)
+            }
+
+            let startMonthName = monthNames[startMonth - 1]
+
+            if endParts.count >= 3,
+               let endMonth = Int(endParts[1]),
+               let endDay = Int(endParts[2]),
+               (1...monthNames.count).contains(endMonth) {
+                if startMonth == endMonth {
+                    return "\(year), \(startMonthName) \(startDay)–\(endDay)"
+                } else {
+                    let endMonthName = monthNames[endMonth - 1]
+                    return "\(year), \(startMonthName) \(startDay)–\(endMonthName) \(endDay)"
+                }
+            }
+
+            return "\(year), \(startMonthName) \(startDay)"
+        }
+    }
+
+    return formatNormalizedDate(normalized)
+}
+
+/// Format date for reference list
+///
+/// PRESENTATION entries use a non-empty EVENTDATE field when they have one. All other
+/// cases use the entry's date, and "n.d." when that is missing or empty.
+public func formatDate(_ entry: BibEntry) -> String {
+    if entry.normalizedType == "PRESENTATION",
+       let eventDate = field(entry, "EVENTDATE"), !eventDate.isEmpty {
+        return formatEventDate(eventDate)
+    }
+
+    guard let dateStr = entry.date, !dateStr.isEmpty else { return "n.d." }
+    let normalized = APAUtilities.normalizeDate(dateStr)
+    return formatNormalizedDate(normalized)
+}
+
+/// Extract just the year
+///
+/// - Returns: The first four characters of the normalized date, or "n.d." when the
+///   entry has no date.
+public func extractYear(_ entry: BibEntry) -> String {
+    guard let dateStr = entry.date, !dateStr.isEmpty else { return "n.d." }
+    let normalized = APAUtilities.normalizeDate(dateStr)
+    return String(normalized.prefix(4))
+}
+
+// MARK: - Reference List Authors
+
+/// Format authors for reference list with trailing period.
+///
+/// When the entry has no authors, its editors take the author position followed by
+/// "(Ed.)." or "(Eds.).". Returns an empty string when neither is available.
+public func formatAuthorsString(_ entry: BibEntry) -> String {
+    var names = parseAuthors(entry)
+    if names.isEmpty {
+        if let editorRaw = field(entry, "EDITOR"), !editorRaw.isEmpty {
+            // Reuse the author parser by presenting the editor list as AUTHOR on a copy.
+            var mutable = entry
+            mutable.fields["AUTHOR"] = editorRaw
+            names = parseAuthors(mutable)
+            if !names.isEmpty {
+                let edFormatted = formatAuthorNames(names)
+                let edLabel = names.count == 1 ? "Ed." : "Eds."
+                // Drop a trailing period (from an initial) so the label is not preceded by one.
+                let edResult = edFormatted.hasSuffix(".") ? String(edFormatted.dropLast()) : edFormatted
+                return "\(edResult) (\(edLabel))."
+            }
+        }
+        return ""
+    }
+    let formatted = formatAuthorNames(names)
+    return formatted.hasSuffix(".") ? formatted : formatted + "."
+}
diff --git a/packages/bib-apa-swift/Sources/BibAPA/Models/APAReference.swift b/packages/bib-apa-swift/Sources/BibAPA/Models/APAReference.swift
new file mode 100644
index 0000000..28d012d
--- /dev/null
+++ b/packages/bib-apa-swift/Sources/BibAPA/Models/APAReference.swift
@@ -0,0 +1,149 @@
+// APAReference.swift — Format-agnostic APA 7 reference semantic model
+// Each case carries all the data needed to render a reference list entry.
+
+/// A fully resolved APA 7 reference, ready for rendering.
+public enum APAReference: Equatable, Sendable {
+    case article(APAArticleRef)
+    case book(APABookRef)
+    case chapter(APAChapterRef)
+    case thesis(APAThesisRef)
+    case report(APAReportRef)
+    case presentation(APAPresentationRef)
+    case online(APAOnlineRef)
+}
+
+// MARK: - Ref Structs
+
+/// Data for a journal article reference.
+public struct APAArticleRef: Equatable, Sendable {
+    /// Formatted author list, e.g. "Cheng, C., Yang, H.-H., & Hsu, Y.-F."
+    public let authors: String
+    /// Formatted date, e.g. "2025".
+    public let date: String
+    /// Article title in sentence case, without formatting.
+    public let title: String
+    /// Journal name with its case preserved; the renderer italicizes it.
+    public let journal: String
+    public let volume: String?
+    public let issue: String?
+    /// Page range with en dashes already normalized.
+    public let pages: String?
+    /// DOI as a full URL.
+    public let doi: String?
+    public let url: String?
+
+    public init(
+        authors: String,
+        date: String,
+        title: String,
+        journal: String,
+        volume: String? = nil,
+        issue: String? = nil,
+        pages: String? = nil,
+        doi: String? = nil,
+        url: String? = nil
+    ) {
+        self.authors = authors
+        self.date = date
+        self.title = title
+        self.journal = journal
+        self.volume = volume
+        self.issue = issue
+        self.pages = pages
+        self.doi = doi
+        self.url = url
+    }
+}
+
+/// Data for a book reference.
+public struct APABookRef: Equatable, Sendable {
+    public let authors: String
+    public let date: String
+    /// Book title in sentence case; the renderer italicizes it.
+    public let title: String
+    /// Edition label, e.g. "2nd ed."
+    public let edition: String?
+    public let volume: String?
+    public let publisher: String?
+    public let doi: String?
+    public let url: String?
+
+    public init(
+        authors: String,
+        date: String,
+        title: String,
+        edition: String? = nil,
+        volume: String? = nil,
+        publisher: String? = nil,
+        doi: String? = nil,
+        url: String? = nil
+    ) {
+        self.authors = authors
+        self.date = date
+        self.title = title
+        self.edition = edition
+        self.volume = volume
+        self.publisher = publisher
+        self.doi = doi
+        self.url = url
+    }
+}
+
+/// Data for a chapter in an edited book.
+public struct APAChapterRef: Equatable, Sendable {
+    public let authors: String
+    public let date: String
+    /// Chapter title in sentence case, plain text.
+    public let chapterTitle: String
+    /// Editor clause, e.g. "F. M. Last (Ed.),"
+    public let editors: String?
+    /// Book title in sentence case; the renderer italicizes it.
+    public let bookTitle: String
+    public let edition: String?
+    public let volume: String?
+    /// Page label, e.g. "pp. 1–51".
+    public let pages: String?
+    public let publisher: String?
+    public let doi: String?
+    public let url: String?
+
+    public init(
+        authors: String,
+        date: String,
+        chapterTitle: String,
+        editors: String? = nil,
+        bookTitle: String,
+        edition: String? = nil,
+        volume: String? = nil,
+        pages: String? = nil,
+        publisher: String? = nil,
+        doi: String? = nil,
+        url: String? = nil
+    ) {
+        self.authors = authors
+        self.date = date
+        self.chapterTitle = chapterTitle
+        self.editors = editors
+        self.bookTitle = bookTitle
+        self.edition = edition
+        self.volume = volume
+        self.pages = pages
+        self.publisher = publisher
+        self.doi = doi
+        self.url = url
+    }
+}
+
+/// Data for a thesis or dissertation reference.
+public struct APAThesisRef: Equatable, Sendable {
+    public let authors: String
+    public let date: String
+    /// Thesis title in sentence case; the renderer italicizes it.
+    public let title: String
+    /// "Doctoral dissertation" or "Master's thesis".
+    public let thesisType: String
+    public let institution: String?
+    public let doi: String?
+    public let url: String?
+
+    public init(
+        authors: String,
+        date: String,
+        title: String,
+        thesisType: String,
+        institution: String? = nil,
+        doi: String? = nil,
+        url: String? = nil
+    ) {
+        self.authors = authors
+        self.date = date
+        self.title = title
+        self.thesisType = thesisType
+        self.institution = institution
+        self.doi = doi
+        self.url = url
+    }
+}
+
+/// Data for a report reference.
+public struct APAReportRef: Equatable, Sendable {
+    public let authors: String
+    public let date: String
+    /// Report title in sentence case; the renderer italicizes it.
+    public let title: String
+    /// Extra title information, e.g. a report type description.
+    public let titleAddon: String?
+    /// Report number, e.g. "No. 123" or "Technical Report 123".
+    public let number: String?
+    public let institution: String?
+    public let doi: String?
+    public let url: String?
+
+    public init(
+        authors: String,
+        date: String,
+        title: String,
+        titleAddon: String? = nil,
+        number: String? = nil,
+        institution: String? = nil,
+        doi: String? = nil,
+        url: String? = nil
+    ) {
+        self.authors = authors
+        self.date = date
+        self.title = title
+        self.titleAddon = titleAddon
+        self.number = number
+        self.institution = institution
+        self.doi = doi
+        self.url = url
+    }
+}
+
+/// Data for a conference presentation reference.
+public struct APAPresentationRef: Equatable, Sendable {
+    public let authors: String
+    /// Formatted event date, e.g. "2025, May 1–3".
+    public let date: String
+    /// Presentation title in sentence case; the renderer italicizes it.
+    public let title: String
+    /// E.g. "Oral presentation" or "Poster presentation".
+    public let presentationType: String
+    /// Event title.
+    public let conference: String?
+    /// Location, e.g. "Tainan, Taiwan".
+    public let venue: String?
+    public let doi: String?
+    public let url: String?
+
+    public init(
+        authors: String,
+        date: String,
+        title: String,
+        presentationType: String,
+        conference: String? = nil,
+        venue: String? = nil,
+        doi: String? = nil,
+        url: String? = nil
+    ) {
+        self.authors = authors
+        self.date = date
+        self.title = title
+        self.presentationType = presentationType
+        self.conference = conference
+        self.venue = venue
+        self.doi = doi
+        self.url = url
+    }
+}
+
+/// Data for an online source reference.
+public struct APAOnlineRef: Equatable, Sendable {
+    public let authors: String
+    public let date: String
+    /// Title in sentence case; the renderer italicizes it.
+    public let title: String
+    public let publisher: String?
+    public let doi: String?
+    public let url: String?
+
+    public init(
+        authors: String,
+        date: String,
+        title: String,
+        publisher: String? = nil,
+        doi: String? = nil,
+        url: String? = nil
+    ) {
+        self.authors = authors
+        self.date = date
+        self.title = title
+        self.publisher = publisher
+        self.doi = doi
+        self.url = url
+    }
+}
diff --git a/packages/bib-apa-swift/Sources/BibAPA/Models/TextSegment.swift b/packages/bib-apa-swift/Sources/BibAPA/Models/TextSegment.swift
new file mode 100644
index 0000000..f04312c
--- /dev/null
+++ b/packages/bib-apa-swift/Sources/BibAPA/Models/TextSegment.swift
@@ -0,0 +1,35 @@
+// TextSegment.swift — Format-agnostic styled text primitives
+
+/// One run of text carrying exactly one semantic style.
+public enum TextSegment: Equatable, Sendable {
+    case plain(String)
+    case italic(String)
+    case bold(String)
+    case link(text: String, url: String)
+}
+
+/// An ordered list of styled runs that together make up one styled string.
+public struct StyledText: Equatable, Sendable {
+    public let segments: [TextSegment]
+
+    public init(_ segments: [TextSegment]) {
+        self.segments = segments
+    }
+
+    public init(plain text: String) {
+        self.init([.plain(text)])
+    }
+
+    public init(italic text: String) {
+        self.init([.italic(text)])
+    }
+
+    /// True when no segment has any visible text (a link's URL is not considered).
+    public var isEmpty: Bool {
+        !segments.contains { segment in
+            switch segment {
+            case .plain(let text), .italic(let text), .bold(let text), .link(let text, _): return !text.isEmpty
+            }
+        }
+    }
+}
diff --git a/packages/bib-apa-swift/Tests/BibAPATests/APAStylerTests.swift b/packages/bib-apa-swift/Tests/BibAPATests/APAStylerTests.swift
new file mode 100644
index 0000000..0f32da2
--- /dev/null
+++ b/packages/bib-apa-swift/Tests/BibAPATests/APAStylerTests.swift
@@ -0,0 +1,67 @@
+import XCTest
+import BiblatexAPA
+@testable import BibAPA
+
+final class APAStylerTests: XCTestCase {
+
+    // MARK: - Style produces correct case
+
+    func testStyleArticle() {
+        let entry = makeEntry(type: "ARTICLE", fields: [
+            "AUTHOR": "Cheng, Che",
+            "TITLE": "Some Article Title",
+            "JOURNALTITLE": "Psychometrika",
+            "DATE": "2025",
+        ])
+        // An ARTICLE entry must come back as the `.article` case.
+        guard case .article(let article) = APAStyler.style(entry) else {
+            return XCTFail("Expected .article")
+        }
+        XCTAssertEqual(article.journal, "Psychometrika")
+        XCTAssertEqual(article.date, "2025")
+    }
+
+    func testStylePresentation() {
+        let entry = makeEntry(type: "PRESENTATION", fields: [
+            "AUTHOR": "Cheng, Che",
+            "TITLE": "Test Talk",
+            "TITLEADDON": "Oral presentation",
+            "EVENTTITLE": "Conference Name",
+            "VENUE": "Taipei, Taiwan",
+            "EVENTDATE": "2025-05-01/2025-05-03",
+            "DATE": "2025",
+        ])
+        // A PRESENTATION entry must come back as the `.presentation` case.
+        guard case .presentation(let pres) = APAStyler.style(entry) else {
+            return XCTFail("Expected .presentation")
+        }
+        XCTAssertEqual(pres.presentationType, "Oral presentation")
+        XCTAssertEqual(pres.conference, "Conference Name")
+        XCTAssertEqual(pres.venue, "Taipei, Taiwan")
+        XCTAssertEqual(pres.date, "2025, May 1–3")
+    }
+
+    func testStyleThesis() {
+        let entry = makeEntry(type: "THESIS", fields: [
+            "AUTHOR": "Cheng, Che",
+            "TITLE": "My Thesis Title",
+            "INSTITUTION": "National Taiwan University",
+            "TYPE": "mathesis",
+            "DATE": "2020",
+        ])
+        // A THESIS entry must come back as the `.thesis` case.
+        guard case .thesis(let thesis) = APAStyler.style(entry) else {
+            return XCTFail("Expected .thesis")
+        }
+        XCTAssertEqual(thesis.thesisType, "Master's thesis")
+        XCTAssertEqual(thesis.institution, "National Taiwan University")
+    }
+
+    // MARK: - Helpers
+
+    /// Builds a `BibEntry` with key "test" from a plain dictionary of fields.
+    func makeEntry(type: String, fields: [String: String]) -> BibEntry {
+        let dict = fields.reduce(into: OrderedDict()) { $0[$1.key] = $1.value }
+        return BibEntry(entryType: type, key: "test", fields: dict, rawText: "", lineNumber: 1)
+    }
+}
diff --git a/packages/bib-apa-to-html-swift/.gitignore b/packages/bib-apa-to-html-swift/.gitignore
new file mode 100644
index 0000000..b5b205d
--- /dev/null
+++ b/packages/bib-apa-to-html-swift/.gitignore
@@ -0,0 +1,8 @@
+.build/
+.swiftpm/
+DerivedData/
+*.xcodeproj
+*.xcworkspace
+xcuserdata/
+.DS_Store
+Package.resolved diff --git a/packages/bib-apa-to-html-swift/Package.swift b/packages/bib-apa-to-html-swift/Package.swift new file mode 100644 index 0000000..46a54bd --- /dev/null +++ b/packages/bib-apa-to-html-swift/Package.swift @@ -0,0 +1,25 @@ +// swift-tools-version: 5.9 +import PackageDescription + +let package = Package( + name: "BibAPAToHTML", + platforms: [.macOS(.v14)], + products: [ + .library(name: "BibAPAToHTML", targets: ["BibAPAToHTML"]) + ], + dependencies: [ + .package(name: "BibAPA", path: "../bib-apa-swift"), + ], + targets: [ + .target( + name: "BibAPAToHTML", + dependencies: ["BibAPA"], + path: "Sources/BibAPAToHTML" + ), + .testTarget( + name: "BibAPAToHTMLTests", + dependencies: ["BibAPAToHTML"], + path: "Tests/BibAPAToHTMLTests" + ) + ] +) diff --git a/packages/bib-apa-to-html-swift/Sources/BibAPAToHTML/APACSS.swift b/packages/bib-apa-to-html-swift/Sources/BibAPAToHTML/APACSS.swift new file mode 100644 index 0000000..d6fe959 --- /dev/null +++ b/packages/bib-apa-to-html-swift/Sources/BibAPAToHTML/APACSS.swift @@ -0,0 +1,74 @@ +// APACSS.swift — APA 7 reference list CSS styles + +public enum APACSS { + + /// Minimal APA 7 reference list CSS. + /// Includes hanging indent, proper spacing, and link styling. + public static let minimal = """ + .apa-reference-list { + font-family: "Times New Roman", Times, serif; + font-size: 12pt; + line-height: 2; + } + + .apa-reference { + padding-left: 0.5in; + text-indent: -0.5in; + margin-bottom: 0; + } + + .apa-reference a { + color: inherit; + text-decoration: none; + } + + .apa-reference a:hover { + text-decoration: underline; + } + + .apa-reference:target { + background-color: #FEF3C7; + transition: background-color 2s ease; + } + """ + + /// Web-friendly APA CSS with modern font stack and responsive sizing. 
+ public static let web = """ + .apa-reference-list { + font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif; + font-size: 1rem; + line-height: 1.8; + max-width: 48rem; + } + + .apa-reference { + padding-left: 2rem; + text-indent: -2rem; + margin-bottom: 0.75rem; + } + + .apa-reference em { + font-style: italic; + } + + .apa-reference a { + color: #2563eb; + text-decoration: none; + word-break: break-all; + } + + .apa-reference a:hover { + text-decoration: underline; + } + + .apa-reference:target { + background-color: #DBEAFE; + transition: background-color 2s ease; + } + """ + + /// Wrap CSS in a " + } +} diff --git a/packages/bib-apa-to-html-swift/Sources/BibAPAToHTML/BibToAPAHTMLFormatter.swift b/packages/bib-apa-to-html-swift/Sources/BibAPAToHTML/BibToAPAHTMLFormatter.swift new file mode 100644 index 0000000..a3fd424 --- /dev/null +++ b/packages/bib-apa-to-html-swift/Sources/BibAPAToHTML/BibToAPAHTMLFormatter.swift @@ -0,0 +1,65 @@ +// BibToAPAHTMLFormatter.swift — Convenience API for .bib → APA HTML + +import BibAPA +import BiblatexAPA + +public struct BibToAPAHTMLFormatter { + + private static let renderer = HTMLRenderer() + + // MARK: - Reference List (full entries) + + /// Format a single BibEntry as an APA 7 reference in HTML (inner content only, no wrapper). + public static func formatReference(_ entry: BibEntry) -> String { + let reference = APAStyler.style(entry) + return renderer.render(reference) + } + + /// Format multiple BibEntries as an HTML reference list (sorted alphabetically). + /// Each entry wrapped in `

` for anchor linking. + public static func formatReferenceList(_ entries: [BibEntry]) -> String { + let formatted = entries.map { (entry: $0, ref: formatReference($0)) } + let sorted = formatted.sorted { $0.ref.lowercased() < $1.ref.lowercased() } + return sorted + .map { "

\($0.ref)

" } + .joined(separator: "\n") + } + + /// Format as a complete HTML fragment with CSS and wrapping div. + /// - Parameter css: CSS to include. Defaults to `APACSS.web`. + public static func formatReferenceListWithCSS( + _ entries: [BibEntry], + css: String = APACSS.web + ) -> String { + let style = APACSS.styleTag(css) + let list = formatReferenceList(entries) + return "\(style)\n
\n\(list)\n
" + } + + // MARK: - In-Text Citations + + /// Parenthetical citation with anchor link: `(Author, Year)` + public static func formatInTextCitation(_ entry: BibEntry) -> String { + let text = APAStyler.formatCitation(entry) + return "\(escapeHTML(text))" + } + + /// Narrative citation with anchor link: `Author (Year)` + public static func formatNarrativeInTextCitation(_ entry: BibEntry) -> String { + let text = APAStyler.formatNarrativeCitation(entry) + return "\(escapeHTML(text))" + } + + // MARK: - Helpers + + private static func escapeHTML(_ text: String) -> String { + text.replacingOccurrences(of: "&", with: "&") + .replacingOccurrences(of: "<", with: "<") + .replacingOccurrences(of: ">", with: ">") + } + + private static func escapeAttr(_ text: String) -> String { + text.replacingOccurrences(of: "&", with: "&") + .replacingOccurrences(of: "\"", with: """) + } +} diff --git a/packages/bib-apa-to-html-swift/Sources/BibAPAToHTML/HTMLRenderer.swift b/packages/bib-apa-to-html-swift/Sources/BibAPAToHTML/HTMLRenderer.swift new file mode 100644 index 0000000..c067fc2 --- /dev/null +++ b/packages/bib-apa-to-html-swift/Sources/BibAPAToHTML/HTMLRenderer.swift @@ -0,0 +1,176 @@ +// HTMLRenderer.swift — APAReference → HTML string +// Uses semantic HTML: for italics, for links. + +import BibAPA + +public struct HTMLRenderer: APAReferenceRenderer { + public typealias Output = String + + public init() {} + + // MARK: - Article + + public func renderArticle(_ ref: APAArticleRef) -> String { + let body = "\(esc(ref.title))." + + var sourceParts: [String] = [] + if !ref.journal.isEmpty { + var journalPart = "\(esc(ref.journal))" + if let vol = ref.volume, !vol.isEmpty { + journalPart += ", \(esc(vol))" + if let issue = ref.issue, !issue.isEmpty { + journalPart += "(\(esc(issue)))" + } + } + if let pages = ref.pages, !pages.isEmpty { + journalPart += ", \(esc(pages))" + } + journalPart += "." 
+ sourceParts.append(journalPart) + } + appendLink(&sourceParts, doi: ref.doi, url: ref.url) + + return assemble(authors: ref.authors, date: ref.date, body: body, source: sourceParts.joined(separator: " ")) + } + + // MARK: - Book + + public func renderBook(_ ref: APABookRef) -> String { + var parenthetical: [String] = [] + if let ed = ref.edition { parenthetical.append(esc(ed)) } + if let vol = ref.volume, !vol.isEmpty { parenthetical.append("Vol. \(esc(vol))") } + let paren = parenthetical.isEmpty ? "" : " (\(parenthetical.joined(separator: ", ")))" + let body = "\(esc(ref.title))\(paren)." + + var sourceParts: [String] = [] + if let pub = ref.publisher, !pub.isEmpty { sourceParts.append("\(esc(pub)).") } + appendLink(&sourceParts, doi: ref.doi, url: ref.url) + + return assemble(authors: ref.authors, date: ref.date, body: body, source: sourceParts.joined(separator: " ")) + } + + // MARK: - Chapter + + public func renderChapter(_ ref: APAChapterRef) -> String { + let chapterTitle = "\(esc(ref.chapterTitle))." + + var inPart = "In " + if let editors = ref.editors, !editors.isEmpty { + inPart += "\(esc(editors)), " + } + inPart += "\(esc(ref.bookTitle))" + + var parenthetical: [String] = [] + if let ed = ref.edition { parenthetical.append(esc(ed)) } + if let vol = ref.volume, !vol.isEmpty { parenthetical.append(esc(vol)) } + if let pages = ref.pages, !pages.isEmpty { parenthetical.append(esc(pages)) } + if !parenthetical.isEmpty { + inPart += " (\(parenthetical.joined(separator: ", ")))" + } + inPart += "." 
+ + let body = "\(chapterTitle) \(inPart)" + + var sourceParts: [String] = [] + if let pub = ref.publisher, !pub.isEmpty { sourceParts.append("\(esc(pub)).") } + appendLink(&sourceParts, doi: ref.doi, url: ref.url) + + return assemble(authors: ref.authors, date: ref.date, body: body, source: sourceParts.joined(separator: " ")) + } + + // MARK: - Thesis + + public func renderThesis(_ ref: APAThesisRef) -> String { + let body: String + if let inst = ref.institution, !inst.isEmpty { + body = "\(esc(ref.title)) [\(esc(ref.thesisType)), \(esc(inst))]." + } else { + body = "\(esc(ref.title)) [\(esc(ref.thesisType))]." + } + + var sourceParts: [String] = [] + appendLink(&sourceParts, doi: ref.doi, url: ref.url) + + return assemble(authors: ref.authors, date: ref.date, body: body, source: sourceParts.joined(separator: " ")) + } + + // MARK: - Report + + public func renderReport(_ ref: APAReportRef) -> String { + var titlePart = "\(esc(ref.title))" + var parenthetical: [String] = [] + if let addon = ref.titleAddon, !addon.isEmpty { parenthetical.append(esc(addon)) } + if let num = ref.number, !num.isEmpty { parenthetical.append(esc(num)) } + if !parenthetical.isEmpty { + titlePart += " (\(parenthetical.joined(separator: " ")))" + } + let body = titlePart + "." + + var sourceParts: [String] = [] + if let inst = ref.institution, !inst.isEmpty { sourceParts.append("\(esc(inst)).") } + appendLink(&sourceParts, doi: ref.doi, url: ref.url) + + return assemble(authors: ref.authors, date: ref.date, body: body, source: sourceParts.joined(separator: " ")) + } + + // MARK: - Presentation + + public func renderPresentation(_ ref: APAPresentationRef) -> String { + let body = "\(esc(ref.title)) [\(esc(ref.presentationType))]." + + var sourceParts: [String] = [] + if let conf = ref.conference, !conf.isEmpty { + var confPart = esc(conf) + if let venue = ref.venue, !venue.isEmpty { + confPart += ", \(esc(venue))" + } + confPart += "." 
+ sourceParts.append(confPart) + } + appendLink(&sourceParts, doi: ref.doi, url: ref.url) + + return assemble(authors: ref.authors, date: ref.date, body: body, source: sourceParts.joined(separator: " ")) + } + + // MARK: - Online + + public func renderOnline(_ ref: APAOnlineRef) -> String { + let body = "\(esc(ref.title))." + + var sourceParts: [String] = [] + if let pub = ref.publisher, !pub.isEmpty { sourceParts.append("\(esc(pub)).") } + appendLink(&sourceParts, doi: ref.doi, url: ref.url) + + return assemble(authors: ref.authors, date: ref.date, body: body, source: sourceParts.joined(separator: " ")) + } + + // MARK: - Helpers + + private func esc(_ text: String) -> String { + text.replacingOccurrences(of: "&", with: "&") + .replacingOccurrences(of: "<", with: "<") + .replacingOccurrences(of: ">", with: ">") + } + + private func appendLink(_ parts: inout [String], doi: String?, url: String?) { + if let doi = doi, !doi.isEmpty { + parts.append("\(esc(doi))") + } else if let url = url, !url.isEmpty { + parts.append("\(esc(url))") + } + } + + private func assemble(authors: String, date: String, body: String, source: String) -> String { + var parts: [String] = [] + if !authors.isEmpty { parts.append(esc(authors)) } + parts.append("(\(esc(date))).") + if !body.isEmpty { parts.append(body) } + if !source.isEmpty { parts.append(source) } + + var result = parts.joined(separator: " ") + if !result.hasSuffix(".") && !result.contains("") { + result += "." + } + return result + } +} diff --git a/packages/bib-apa-to-html-swift/Sources/BibAPAToHTML/Reexports.swift b/packages/bib-apa-to-html-swift/Sources/BibAPAToHTML/Reexports.swift new file mode 100644 index 0000000..94cabb0 --- /dev/null +++ b/packages/bib-apa-to-html-swift/Sources/BibAPAToHTML/Reexports.swift @@ -0,0 +1,4 @@ +// Re-export BiblatexAPA so that Layer 4 consumers (CLI, MCP) can access +// BibParser, BibEntry, OrderedDict etc. through BibAPAToHTML without +// importing the Layer 1 module directly. 
+@_exported import BiblatexAPA diff --git a/packages/bib-apa-to-html-swift/Tests/BibAPAToHTMLTests/HTMLRendererTests.swift b/packages/bib-apa-to-html-swift/Tests/BibAPAToHTMLTests/HTMLRendererTests.swift new file mode 100644 index 0000000..4447197 --- /dev/null +++ b/packages/bib-apa-to-html-swift/Tests/BibAPAToHTMLTests/HTMLRendererTests.swift @@ -0,0 +1,160 @@ +import XCTest +import BiblatexAPA +@testable import BibAPAToHTML + +final class HTMLRendererTests: XCTestCase { + + // MARK: - Article + + func testArticleHTML() { + let entry = makeEntry(type: "ARTICLE", fields: [ + "AUTHOR": "Cheng, Che and Yang, Hau-Hung and Hsu, Yung-Fong", + "TITLE": "Some Article Title Here", + "JOURNALTITLE": "Psychometrika", + "VOLUME": "90", + "NUMBER": "2", + "PAGES": "757--778", + "DOI": "10.1007/s11336-025-10029-2", + "DATE": "2025", + ]) + let result = BibToAPAHTMLFormatter.formatReference(entry) + XCTAssertTrue(result.contains("Psychometrika")) + XCTAssertTrue(result.contains("90(2)")) + XCTAssertTrue(result.contains("")) + XCTAssertFalse(result.contains("*"), "Should not contain markdown italics") + } + + // MARK: - Presentation + + func testPresentationHTML() { + let entry = makeEntry(type: "PRESENTATION", fields: [ + "AUTHOR": "Cheng, Che", + "TITLE": "Test Talk", + "TITLEADDON": "Oral presentation", + "EVENTTITLE": "Conference Name", + "VENUE": "Taipei, Taiwan", + "EVENTDATE": "2025-05-01/2025-05-03", + "DATE": "2025", + ]) + let result = BibToAPAHTMLFormatter.formatReference(entry) + XCTAssertTrue(result.contains("Test talk")) + XCTAssertTrue(result.contains("[Oral presentation]")) + XCTAssertTrue(result.contains("Taipei, Taiwan")) + } + + // MARK: - Thesis + + func testThesisHTML() { + let entry = makeEntry(type: "THESIS", fields: [ + "AUTHOR": "Cheng, Che", + "TITLE": "My Thesis Title", + "INSTITUTION": "National Taiwan University", + "TYPE": "mathesis", + "DATE": "2020", + ]) + let result = BibToAPAHTMLFormatter.formatReference(entry) + 
XCTAssertTrue(result.contains("My thesis title")) + XCTAssertTrue(result.contains("[Master's thesis" ) || result.contains("[Master's thesis") || result.contains("[Master's thesis")) + XCTAssertTrue(result.contains("National Taiwan University")) + } + + // MARK: - HTML Escaping + + func testHTMLEscaping() { + let entry = makeEntry(type: "ARTICLE", fields: [ + "AUTHOR": "O'Brien, James", + "TITLE": "A claim & its consequences", + "JOURNALTITLE": "Test Journal", + "DATE": "2025", + ]) + let result = BibToAPAHTMLFormatter.formatReference(entry) + XCTAssertTrue(result.contains("&")) + XCTAssertTrue(result.contains("<bold>")) + XCTAssertFalse(result.contains(""), "Raw HTML tags should be escaped") + } + + // MARK: - Reference List + + func testReferenceListWrapsInParagraphs() { + let entry1 = makeEntry(type: "ARTICLE", fields: [ + "AUTHOR": "Zeta, A.", "TITLE": "First", "JOURNALTITLE": "J", "DATE": "2025", + ]) + let entry2 = makeEntry(type: "ARTICLE", fields: [ + "AUTHOR": "Alpha, B.", "TITLE": "Second", "JOURNALTITLE": "J", "DATE": "2025", + ]) + let result = BibToAPAHTMLFormatter.formatReferenceList([entry1, entry2]) + let lines = result.components(separatedBy: "\n") + XCTAssertEqual(lines.count, 2) + XCTAssertTrue(lines[0].contains("

")) + // Alphabetically sorted: Alpha before Zeta + XCTAssertTrue(lines[0].contains("Alpha")) + } + + // MARK: - Integration with real .bib + + func testEndToEndWithRealBibFile() throws { + let bibPath = "/Users/che/Academic/che-cheng-website/vendor/cheng_che_cv/bibliography/Che.bib" + let bibFile = try BibParser.parse(filePath: bibPath) + let html = BibToAPAHTMLFormatter.formatReferenceList(bibFile.entries) + XCTAssertFalse(html.isEmpty) + XCTAssertTrue(html.contains("")) + XCTAssertTrue(html.contains("")) + XCTAssertTrue(result.contains("Cheng")) + XCTAssertTrue(result.contains("2025")) + XCTAssertTrue(result.contains("")) + } + + func testNarrativeInTextCitation() { + let entry = makeEntry(type: "THESIS", key: "cheng_phd_2025", fields: [ + "AUTHOR": "Cheng, Che", + "TITLE": "My Dissertation", + "INSTITUTION": "National Taiwan University", + "TYPE": "phdthesis", + "DATE": "2025", + ]) + let result = BibToAPAHTMLFormatter.formatNarrativeInTextCitation(entry) + XCTAssertTrue(result.contains("")) + XCTAssertTrue(result.contains("Cheng")) + XCTAssertTrue(result.contains("(2025)")) + XCTAssertTrue(result.contains("")) + } + + func testReferenceListHasAnchorIDs() throws { + let bibPath = "/Users/che/Academic/che-cheng-website/vendor/cheng_che_cv/bibliography/Che.bib" + let bibFile = try BibParser.parse(filePath: bibPath) + let html = BibToAPAHTMLFormatter.formatReferenceList(bibFile.entries) + // Every entry should have a unique id + XCTAssertTrue(html.contains("id=\"ref-cheng_likert_choices_2021\"")) + XCTAssertTrue(html.contains("id=\"ref-cheng_phd_dissertation_2025\"")) + } + + func testCSSContainsTargetHighlight() { + XCTAssertTrue(APACSS.minimal.contains(":target")) + XCTAssertTrue(APACSS.web.contains(":target")) + } + + // MARK: - Helpers + + func makeEntry(type: String, key: String = "test", fields: [String: String]) -> BibEntry { + var dict = OrderedDict() + for (k, v) in fields { dict[k] = v } + return BibEntry(entryType: type, key: key, fields: dict, rawText: 
"", lineNumber: 1) + } +} diff --git a/packages/bib-apa-to-json-swift/.gitignore b/packages/bib-apa-to-json-swift/.gitignore new file mode 100644 index 0000000..b5b205d --- /dev/null +++ b/packages/bib-apa-to-json-swift/.gitignore @@ -0,0 +1,8 @@ +.build/ +.swiftpm/ +DerivedData/ +*.xcodeproj +*.xcworkspace +xcuserdata/ +.DS_Store +Package.resolved diff --git a/packages/bib-apa-to-json-swift/Package.swift b/packages/bib-apa-to-json-swift/Package.swift new file mode 100644 index 0000000..e9d612f --- /dev/null +++ b/packages/bib-apa-to-json-swift/Package.swift @@ -0,0 +1,25 @@ +// swift-tools-version: 5.9 +import PackageDescription + +let package = Package( + name: "BibAPAToJSON", + platforms: [.macOS(.v14)], + products: [ + .library(name: "BibAPAToJSON", targets: ["BibAPAToJSON"]) + ], + dependencies: [ + .package(name: "BibAPAToHTML", path: "../bib-apa-to-html-swift"), + ], + targets: [ + .target( + name: "BibAPAToJSON", + dependencies: ["BibAPAToHTML"], + path: "Sources/BibAPAToJSON" + ), + .testTarget( + name: "BibAPAToJSONTests", + dependencies: ["BibAPAToJSON"], + path: "Tests/BibAPAToJSONTests" + ) + ] +) diff --git a/packages/bib-apa-to-json-swift/Sources/BibAPAToJSON/BibToAPAJSONFormatter.swift b/packages/bib-apa-to-json-swift/Sources/BibAPAToJSON/BibToAPAJSONFormatter.swift new file mode 100644 index 0000000..88b5872 --- /dev/null +++ b/packages/bib-apa-to-json-swift/Sources/BibAPAToJSON/BibToAPAJSONFormatter.swift @@ -0,0 +1,69 @@ +// BibToAPAJSONFormatter.swift — .bib → APA 7 JSON with pre-rendered HTML + +import Foundation +import BibAPAToHTML +import BibAPA +import BiblatexAPA + +/// A single reference entry with pre-rendered HTML and in-text citations. 
+public struct APAJSONEntry: Codable, Sendable { + public let key: String + public let type: String + public let year: String + public let rendered: String + public let citation: String + public let narrativeCitation: String + + public init(key: String, type: String, year: String, + rendered: String, citation: String, narrativeCitation: String) { + self.key = key + self.type = type + self.year = year + self.rendered = rendered + self.citation = citation + self.narrativeCitation = narrativeCitation + } +} + +public struct BibToAPAJSONFormatter { + + /// Convert a single BibEntry to an APAJSONEntry. + public static func formatEntry(_ entry: BibEntry) -> APAJSONEntry { + let html = BibToAPAHTMLFormatter.formatReference(entry) + let citation = BibToAPAHTMLFormatter.formatInTextCitation(entry) + let narrative = BibToAPAHTMLFormatter.formatNarrativeInTextCitation(entry) + let year = entry.date ?? "n.d." + + return APAJSONEntry( + key: entry.key, + type: entry.normalizedType, + year: extractYear(year), + rendered: html, + citation: citation, + narrativeCitation: narrative + ) + } + + /// Convert multiple BibEntries to an array of APAJSONEntry, sorted alphabetically. + public static func formatEntries(_ entries: [BibEntry]) -> [APAJSONEntry] { + entries + .map { formatEntry($0) } + .sorted { $0.rendered.lowercased() < $1.rendered.lowercased() } + } + + /// Convert multiple BibEntries to a JSON string. + public static func formatJSON(_ entries: [BibEntry], prettyPrint: Bool = true) throws -> String { + let jsonEntries = formatEntries(entries) + let encoder = JSONEncoder() + encoder.outputFormatting = prettyPrint ? [.prettyPrinted, .sortedKeys] : .sortedKeys + let data = try encoder.encode(jsonEntries) + return String(data: data, encoding: .utf8) ?? "[]" + } + + // MARK: - Helpers + + private static func extractYear(_ date: String) -> String { + // Extract just the year from dates like "2025", "2025-05-01", etc. 
+ String(date.prefix(4)) + } +} diff --git a/packages/bib-apa-to-json-swift/Tests/BibAPAToJSONTests/JSONFormatterTests.swift b/packages/bib-apa-to-json-swift/Tests/BibAPAToJSONTests/JSONFormatterTests.swift new file mode 100644 index 0000000..809d081 --- /dev/null +++ b/packages/bib-apa-to-json-swift/Tests/BibAPAToJSONTests/JSONFormatterTests.swift @@ -0,0 +1,102 @@ +import XCTest +import BiblatexAPA +@testable import BibAPAToJSON + +final class JSONFormatterTests: XCTestCase { + + // MARK: - Single Entry + + func testSingleEntryHasAllFields() { + let entry = makeEntry(type: "ARTICLE", key: "cheng_test_2025", fields: [ + "AUTHOR": "Cheng, Che and Hsu, Yung-Fong", + "TITLE": "Test Article Title", + "JOURNALTITLE": "Test Journal", + "VOLUME": "1", + "PAGES": "1--10", + "DOI": "10.1234/test", + "DATE": "2025", + ]) + let result = BibToAPAJSONFormatter.formatEntry(entry) + + XCTAssertEqual(result.key, "cheng_test_2025") + XCTAssertEqual(result.type, "ARTICLE") + XCTAssertEqual(result.year, "2025") + XCTAssertTrue(result.rendered.contains("Test article title")) + XCTAssertTrue(result.citation.contains("href=\"#ref-cheng_test_2025\"")) + XCTAssertTrue(result.citation.contains("Cheng")) + XCTAssertTrue(result.narrativeCitation.contains("href=\"#ref-cheng_test_2025\"")) + } + + // MARK: - Multiple Entries + + func testMultipleEntriesSortedAlphabetically() { + let entry1 = makeEntry(type: "ARTICLE", key: "zeta_2025", fields: [ + "AUTHOR": "Zeta, A.", "TITLE": "First", "JOURNALTITLE": "J", "DATE": "2025", + ]) + let entry2 = makeEntry(type: "ARTICLE", key: "alpha_2025", fields: [ + "AUTHOR": "Alpha, B.", "TITLE": "Second", "JOURNALTITLE": "J", "DATE": "2025", + ]) + let results = BibToAPAJSONFormatter.formatEntries([entry1, entry2]) + + XCTAssertEqual(results.count, 2) + XCTAssertEqual(results[0].key, "alpha_2025") + XCTAssertEqual(results[1].key, "zeta_2025") + } + + // MARK: - JSON Output + + func testJSONOutputIsValid() throws { + let entry = makeEntry(type: "THESIS", 
key: "cheng_phd_2025", fields: [ + "AUTHOR": "Cheng, Che", + "TITLE": "My Dissertation", + "INSTITUTION": "National Taiwan University", + "TYPE": "phdthesis", + "DATE": "2025", + ]) + let json = try BibToAPAJSONFormatter.formatJSON([entry]) + let data = json.data(using: .utf8)! + let decoded = try JSONDecoder().decode([APAJSONEntry].self, from: data) + + XCTAssertEqual(decoded.count, 1) + XCTAssertEqual(decoded[0].key, "cheng_phd_2025") + XCTAssertEqual(decoded[0].type, "THESIS") + XCTAssertTrue(decoded[0].rendered.contains("Doctoral dissertation")) + } + + // MARK: - Integration + + func testEndToEndWithRealBibFile() throws { + let bibPath = "/Users/che/Academic/che-cheng-website/vendor/cheng_che_cv/bibliography/Che.bib" + let bibFile = try BibParser.parse(filePath: bibPath) + let json = try BibToAPAJSONFormatter.formatJSON(bibFile.entries) + + XCTAssertFalse(json.isEmpty) + + let data = json.data(using: .utf8)! + let decoded = try JSONDecoder().decode([APAJSONEntry].self, from: data) + + XCTAssertEqual(decoded.count, bibFile.entries.count) + // Every entry should have a non-empty rendered field + for entry in decoded { + XCTAssertFalse(entry.rendered.isEmpty, "Entry \(entry.key) has empty rendered HTML") + XCTAssertFalse(entry.citation.isEmpty, "Entry \(entry.key) has empty citation") + } + + print("\n=== JSON output (first entry) ===") + if let first = decoded.first { + print("key: \(first.key)") + print("type: \(first.type)") + print("year: \(first.year)") + print("rendered: \(String(first.rendered.prefix(100)))...") + print("citation: \(first.citation)") + } + } + + // MARK: - Helpers + + func makeEntry(type: String, key: String = "test", fields: [String: String]) -> BibEntry { + var dict = OrderedDict() + for (k, v) in fields { dict[k] = v } + return BibEntry(entryType: type, key: key, fields: dict, rawText: "", lineNumber: 1) + } +} diff --git a/packages/bib-apa-to-md-swift/Package.swift b/packages/bib-apa-to-md-swift/Package.swift new file mode 100644 index 
0000000..4ca8fb6 --- /dev/null +++ b/packages/bib-apa-to-md-swift/Package.swift @@ -0,0 +1,25 @@ +// swift-tools-version: 5.9 +import PackageDescription + +let package = Package( + name: "BibAPAToMD", + platforms: [.macOS(.v14)], + products: [ + .library(name: "BibAPAToMD", targets: ["BibAPAToMD"]) + ], + dependencies: [ + .package(name: "BibAPA", path: "../bib-apa-swift"), + ], + targets: [ + .target( + name: "BibAPAToMD", + dependencies: ["BibAPA"], + path: "Sources/BibAPAToMD" + ), + .testTarget( + name: "BibAPAToMDTests", + dependencies: ["BibAPAToMD"], + path: "Tests/BibAPAToMDTests" + ) + ] +) diff --git a/packages/bib-apa-to-md-swift/Sources/BibAPAToMD/BibToAPAFormatter.swift b/packages/bib-apa-to-md-swift/Sources/BibAPAToMD/BibToAPAFormatter.swift new file mode 100644 index 0000000..583660f --- /dev/null +++ b/packages/bib-apa-to-md-swift/Sources/BibAPAToMD/BibToAPAFormatter.swift @@ -0,0 +1,72 @@ +// BibToAPAFormatter.swift — Backward-compatible facade +// Delegates to APAStyler + MarkdownRenderer pipeline. + +import BibAPA +import BiblatexAPA + +public struct BibToAPAFormatter { + + private static let renderer = MarkdownRenderer() + + /// Format a single BibEntry as an APA 7 reference list entry. + /// Returns plain text with markdown-style italics (*...*). + public static func formatReference(_ entry: BibEntry) -> String { + let reference = APAStyler.style(entry) + return renderer.render(reference) + } + + /// Format a single BibEntry as an in-text citation: (Author, Year) + public static func formatCitation(_ entry: BibEntry) -> String { + APAStyler.formatCitation(entry) + } + + /// Format a single BibEntry as a narrative citation: Author (Year) + public static func formatNarrativeCitation(_ entry: BibEntry) -> String { + APAStyler.formatNarrativeCitation(entry) + } + + /// Format multiple BibEntries as a reference list (sorted alphabetically). 
+ public static func formatReferenceList(_ entries: [BibEntry]) -> String { + let formatted = entries.map { (entry: $0, ref: formatReference($0)) } + let sorted = formatted.sorted { $0.ref.lowercased() < $1.ref.lowercased() } + return sorted.map(\.ref).joined(separator: "\n\n") + } + + // MARK: - Re-exported helpers for test compatibility + + public static func parseSingleAuthor(_ raw: String) -> BibAPA.AuthorName { + BibAPA.parseSingleAuthor(raw) + } + + public static func formatInitials(_ firstName: String) -> String { + BibAPA.formatInitials(firstName) + } + + public static func formatAuthors(_ entry: BibEntry) -> String { + BibAPA.formatAuthorsString(entry) + } + + public static func formatCitationAuthors(_ entry: BibEntry) -> String { + BibAPA.formatCitationAuthors(entry) + } + + public static func formatDate(_ entry: BibEntry) -> String { + BibAPA.formatDate(entry) + } + + public static func formatEventDate(_ eventDate: String) -> String { + BibAPA.formatEventDate(eventDate) + } + + public static func toSentenceCase(_ title: String) -> String { + BibAPA.toSentenceCase(title) + } + + public static func normalizePages(_ pages: String) -> String { + BibAPA.normalizePages(pages) + } + + public static func stripBraces(_ text: String) -> String { + BibAPA.stripBraces(text) + } +} diff --git a/packages/bib-apa-to-md-swift/Sources/BibAPAToMD/MarkdownRenderer.swift b/packages/bib-apa-to-md-swift/Sources/BibAPAToMD/MarkdownRenderer.swift new file mode 100644 index 0000000..5d0f5cd --- /dev/null +++ b/packages/bib-apa-to-md-swift/Sources/BibAPAToMD/MarkdownRenderer.swift @@ -0,0 +1,170 @@ +// MarkdownRenderer.swift — APAReference → Markdown string +// Produces output identical to the original BibToAPAFormatter.formatReference() + +import BibAPA + +public struct MarkdownRenderer: APAReferenceRenderer { + public typealias Output = String + + public init() {} + + // MARK: - Article + + public func renderArticle(_ ref: APAArticleRef) -> String { + let body = "\(ref.title)." 
+ + var sourceParts: [String] = [] + if !ref.journal.isEmpty { + var journalPart = "*\(ref.journal)*" + if let vol = ref.volume, !vol.isEmpty { + journalPart += ", *\(vol)*" + if let issue = ref.issue, !issue.isEmpty { + journalPart += "(\(issue))" + } + } + if let pages = ref.pages, !pages.isEmpty { + journalPart += ", \(pages)" + } + journalPart += "." + sourceParts.append(journalPart) + } + appendLink(&sourceParts, doi: ref.doi, url: ref.url) + + return assemble(authors: ref.authors, date: ref.date, body: body, source: sourceParts.joined(separator: " ")) + } + + // MARK: - Book + + public func renderBook(_ ref: APABookRef) -> String { + var parenthetical: [String] = [] + if let ed = ref.edition { parenthetical.append(ed) } + if let vol = ref.volume, !vol.isEmpty { parenthetical.append("Vol. \(vol)") } + let paren = parenthetical.isEmpty ? "" : " (\(parenthetical.joined(separator: ", ")))" + let body = "*\(ref.title)*\(paren)." + + var sourceParts: [String] = [] + if let pub = ref.publisher, !pub.isEmpty { sourceParts.append("\(pub).") } + appendLink(&sourceParts, doi: ref.doi, url: ref.url) + + return assemble(authors: ref.authors, date: ref.date, body: body, source: sourceParts.joined(separator: " ")) + } + + // MARK: - Chapter + + public func renderChapter(_ ref: APAChapterRef) -> String { + let chapterTitle = "\(ref.chapterTitle)." + + var inPart = "In " + if let editors = ref.editors, !editors.isEmpty { + inPart += "\(editors), " + } + inPart += "*\(ref.bookTitle)*" + + var parenthetical: [String] = [] + if let ed = ref.edition { parenthetical.append(ed) } + if let vol = ref.volume, !vol.isEmpty { parenthetical.append(vol) } + if let pages = ref.pages, !pages.isEmpty { parenthetical.append(pages) } + if !parenthetical.isEmpty { + inPart += " (\(parenthetical.joined(separator: ", ")))" + } + inPart += "." 
+ + let body = "\(chapterTitle) \(inPart)" + + var sourceParts: [String] = [] + if let pub = ref.publisher, !pub.isEmpty { sourceParts.append("\(pub).") } + appendLink(&sourceParts, doi: ref.doi, url: ref.url) + + return assemble(authors: ref.authors, date: ref.date, body: body, source: sourceParts.joined(separator: " ")) + } + + // MARK: - Thesis + + public func renderThesis(_ ref: APAThesisRef) -> String { + let body: String + if let inst = ref.institution, !inst.isEmpty { + body = "*\(ref.title)* [\(ref.thesisType), \(inst)]." + } else { + body = "*\(ref.title)* [\(ref.thesisType)]." + } + + var sourceParts: [String] = [] + appendLink(&sourceParts, doi: ref.doi, url: ref.url) + + return assemble(authors: ref.authors, date: ref.date, body: body, source: sourceParts.joined(separator: " ")) + } + + // MARK: - Report + + public func renderReport(_ ref: APAReportRef) -> String { + var titlePart = "*\(ref.title)*" + var parenthetical: [String] = [] + if let addon = ref.titleAddon, !addon.isEmpty { parenthetical.append(addon) } + if let num = ref.number, !num.isEmpty { parenthetical.append(num) } + if !parenthetical.isEmpty { + titlePart += " (\(parenthetical.joined(separator: " ")))" + } + let body = titlePart + "." + + var sourceParts: [String] = [] + if let inst = ref.institution, !inst.isEmpty { sourceParts.append("\(inst).") } + appendLink(&sourceParts, doi: ref.doi, url: ref.url) + + return assemble(authors: ref.authors, date: ref.date, body: body, source: sourceParts.joined(separator: " ")) + } + + // MARK: - Presentation + + public func renderPresentation(_ ref: APAPresentationRef) -> String { + let body = "*\(ref.title)* [\(ref.presentationType)]." + + var sourceParts: [String] = [] + if let conf = ref.conference, !conf.isEmpty { + var confPart = conf + if let venue = ref.venue, !venue.isEmpty { + confPart += ", \(venue)" + } + confPart += "." 
+ sourceParts.append(confPart) + } + appendLink(&sourceParts, doi: ref.doi, url: ref.url) + + return assemble(authors: ref.authors, date: ref.date, body: body, source: sourceParts.joined(separator: " ")) + } + + // MARK: - Online + + public func renderOnline(_ ref: APAOnlineRef) -> String { + let body = "*\(ref.title)*." + + var sourceParts: [String] = [] + if let pub = ref.publisher, !pub.isEmpty { sourceParts.append("\(pub).") } + appendLink(&sourceParts, doi: ref.doi, url: ref.url) + + return assemble(authors: ref.authors, date: ref.date, body: body, source: sourceParts.joined(separator: " ")) + } + + // MARK: - Assembly + + private func appendLink(_ parts: inout [String], doi: String?, url: String?) { + if let doi = doi, !doi.isEmpty { + parts.append(doi) + } else if let url = url, !url.isEmpty { + parts.append(url) + } + } + + private func assemble(authors: String, date: String, body: String, source: String) -> String { + var parts: [String] = [] + if !authors.isEmpty { parts.append(authors) } + parts.append("(\(date)).") + if !body.isEmpty { parts.append(body) } + if !source.isEmpty { parts.append(source) } + + var result = parts.joined(separator: " ") + if !result.hasSuffix(".") && !result.contains("https://") { + result += "." 
+ } + return result + } +} diff --git a/packages/bib-apa-to-md-swift/Tests/BibAPAToMDTests/BibToAPAFormatterTests.swift b/packages/bib-apa-to-md-swift/Tests/BibAPAToMDTests/BibToAPAFormatterTests.swift new file mode 100644 index 0000000..160fa24 --- /dev/null +++ b/packages/bib-apa-to-md-swift/Tests/BibAPAToMDTests/BibToAPAFormatterTests.swift @@ -0,0 +1,207 @@ +import XCTest +import BiblatexAPA +@testable import BibAPAToMD + +final class BibToAPAFormatterTests: XCTestCase { + + // MARK: - Author Parsing + + func testParseSingleAuthor() { + let author = BibToAPAFormatter.parseSingleAuthor("Cheng, Che") + XCTAssertEqual(author.lastName, "Cheng") + XCTAssertEqual(author.firstName, "Che") + } + + func testParseHyphenatedFirstName() { + let author = BibToAPAFormatter.parseSingleAuthor("Yang, Hau-Hung") + XCTAssertEqual(author.lastName, "Yang") + XCTAssertEqual(author.firstName, "Hau-Hung") + } + + func testParseCorporateAuthor() { + let author = BibToAPAFormatter.parseSingleAuthor("{American Psychological Association}") + XCTAssertEqual(author.lastName, "American Psychological Association") + XCTAssertTrue(author.isCorporate) + } + + func testParseSingleName() { + let author = BibToAPAFormatter.parseSingleAuthor("Plato") + XCTAssertEqual(author.lastName, "Plato") + XCTAssertTrue(author.firstName.isEmpty) + } + + // MARK: - Initials + + func testInitials() { + XCTAssertEqual(BibToAPAFormatter.formatInitials("Che"), "C.") + XCTAssertEqual(BibToAPAFormatter.formatInitials("Hau-Hung"), "H.-H.") + XCTAssertEqual(BibToAPAFormatter.formatInitials("Sarah Michelle"), "S. M.") + XCTAssertEqual(BibToAPAFormatter.formatInitials("A. A."), "A. 
A.") + } + + // MARK: - Author Formatting (Reference List) + + func testSingleAuthorRef() { + let entry = makeEntry(type: "ARTICLE", fields: [ + "AUTHOR": "Cheng, Che", + "TITLE": "Test", + "DATE": "2025", + ]) + let result = BibToAPAFormatter.formatAuthors(entry) + XCTAssertEqual(result, "Cheng, C.") + } + + func testTwoAuthorsRef() { + let entry = makeEntry(type: "ARTICLE", fields: [ + "AUTHOR": "Cheng, Che and Yang, Hau-Hung", + "TITLE": "Test", + "DATE": "2025", + ]) + let result = BibToAPAFormatter.formatAuthors(entry) + XCTAssertEqual(result, "Cheng, C., & Yang, H.-H.") + } + + func testThreeAuthorsRef() { + let entry = makeEntry(type: "ARTICLE", fields: [ + "AUTHOR": "Cheng, Che and Yang, Hau-Hung and Hsu, Yung-Fong", + "TITLE": "Test", + "DATE": "2025", + ]) + let result = BibToAPAFormatter.formatAuthors(entry) + XCTAssertEqual(result, "Cheng, C., Yang, H.-H., & Hsu, Y.-F.") + } + + // MARK: - Citation Authors + + func testCitationThreeAuthors() { + let entry = makeEntry(type: "ARTICLE", fields: [ + "AUTHOR": "Cheng, Che and Yang, Hau-Hung and Hsu, Yung-Fong", + ]) + let result = BibToAPAFormatter.formatCitationAuthors(entry) + XCTAssertEqual(result, "Cheng et al.") + } + + // MARK: - Date Formatting + + func testYearOnly() { + let entry = makeEntry(type: "ARTICLE", fields: ["DATE": "2025"]) + XCTAssertEqual(BibToAPAFormatter.formatDate(entry), "2025") + } + + func testEventDateRange() { + XCTAssertEqual( + BibToAPAFormatter.formatEventDate("2025-05-01/2025-05-03"), + "2025, May 1–3" + ) + } + + func testEventDateCrossMonth() { + XCTAssertEqual( + BibToAPAFormatter.formatEventDate("2025-08-29/2025-09-01"), + "2025, August 29–September 1" + ) + } + + // MARK: - Full Reference: Journal Article + + func testArticleReference() { + let entry = makeEntry(type: "ARTICLE", fields: [ + "AUTHOR": "Cheng, Che and Yang, Hau-Hung and Hsu, Yung-Fong", + "TITLE": "Some Article Title Here", + "JOURNALTITLE": "Psychometrika", + "VOLUME": "90", + "NUMBER": "2", + "PAGES": 
"757--778", + "DOI": "10.1007/s11336-025-10029-2", + "DATE": "2025", + ]) + let result = BibToAPAFormatter.formatReference(entry) + XCTAssertTrue(result.hasPrefix("Cheng, C., Yang, H.-H., & Hsu, Y.-F. (2025).")) + XCTAssertTrue(result.contains("*Psychometrika*, *90*(2), 757–778.")) + XCTAssertTrue(result.contains("https://doi.org/10.1007/s11336-025-10029-2")) + } + + // MARK: - Full Reference: Presentation + + func testPresentationReference() { + let entry = makeEntry(type: "PRESENTATION", fields: [ + "AUTHOR": "Cheng, Che and Yang, Hau-Hung and Hsu, Yung-Fong", + "TITLE": "The universal Cramér-Rao lower bound", + "TITLEADDON": "Oral presentation", + "EVENTTITLE": "The 64th Annual Convention of the Taiwanese Psychology Association", + "VENUE": "Tainan, Taiwan", + "EVENTDATE": "2025-10-18/2025-10-19", + "DATE": "2025", + ]) + let result = BibToAPAFormatter.formatReference(entry) + XCTAssertTrue(result.contains("(2025, October 18–19).")) + XCTAssertTrue(result.contains("[Oral presentation].")) + XCTAssertTrue(result.contains("The 64th Annual Convention")) + XCTAssertTrue(result.contains("Tainan, Taiwan")) + } + + // MARK: - Full Reference: Thesis + + func testThesisReference() { + let entry = makeEntry(type: "THESIS", fields: [ + "AUTHOR": "Cheng, Che", + "TITLE": "Analysis of Growth Curves Based on the Semiparametric Latent Curve Model", + "INSTITUTION": "National Taiwan University", + "TYPE": "mathesis", + "DATE": "2020", + ]) + let result = BibToAPAFormatter.formatReference(entry) + XCTAssertTrue(result.hasPrefix("Cheng, C. 
(2020).")) + XCTAssertTrue(result.contains("National Taiwan University")) + } + + // MARK: - Sentence Case + + func testSentenceCaseBasic() { + let result = BibToAPAFormatter.toSentenceCase("Language Learning as Language Use") + XCTAssertEqual(result, "Language learning as language use") + } + + func testSentenceCaseWithBraceProtection() { + let result = BibToAPAFormatter.toSentenceCase("Identifiability of Ordinal {SEM} and Item Factor Analysis") + XCTAssertTrue(result.contains("SEM")) + XCTAssertTrue(result.hasPrefix("Identifiability")) + } + + func testSentenceCaseWithColon() { + let result = BibToAPAFormatter.toSentenceCase("two are better than one: incorporating data") + XCTAssertTrue(result.hasPrefix("Two")) + XCTAssertTrue(result.contains(": Incorporating") || result.contains(": incorporating")) + } + + // MARK: - Pages Normalization + + func testPagesNormalization() { + XCTAssertEqual(BibToAPAFormatter.normalizePages("757--778"), "757–778") + XCTAssertEqual(BibToAPAFormatter.normalizePages("1-51"), "1–51") + } + + // MARK: - Strip Braces + + func testStripBraces() { + XCTAssertEqual(BibToAPAFormatter.stripBraces("{ADHD}"), "ADHD") + XCTAssertEqual(BibToAPAFormatter.stripBraces("Normal text"), "Normal text") + XCTAssertEqual(BibToAPAFormatter.stripBraces("{Nested {braces}}"), "Nested {braces}") + } + + // MARK: - Helpers + + func makeEntry(type: String, fields: [String: String]) -> BibEntry { + var dict = OrderedDict() + for (k, v) in fields { + dict[k] = v + } + return BibEntry( + entryType: type, + key: "test_key", + fields: dict, + rawText: "", + lineNumber: 1 + ) + } +} diff --git a/packages/bib-apa-to-md-swift/Tests/BibAPAToMDTests/IntegrationTests.swift b/packages/bib-apa-to-md-swift/Tests/BibAPAToMDTests/IntegrationTests.swift new file mode 100644 index 0000000..c2ef8b0 --- /dev/null +++ b/packages/bib-apa-to-md-swift/Tests/BibAPAToMDTests/IntegrationTests.swift @@ -0,0 +1,20 @@ +import XCTest +import BiblatexAPA +@testable import BibAPAToMD + +final 
class IntegrationTests: XCTestCase { + + func testEndToEndWithRealBibFile() throws { + let bibPath = "/Users/che/Academic/che-cheng-website/vendor/cheng_che_cv/bibliography/Che.bib" + let bibFile = try BibParser.parse(filePath: bibPath) + + XCTAssertGreaterThan(bibFile.entries.count, 0, "Should have entries") + + let markdown = BibToAPAFormatter.formatReferenceList(bibFile.entries) + XCTAssertFalse(markdown.isEmpty) + + // Print for manual inspection + print("\n=== \(bibFile.entries.count) entries converted ===\n") + print(markdown) + } +} diff --git a/packages/biblatex-apa-swift/.gitignore b/packages/biblatex-apa-swift/.gitignore new file mode 100644 index 0000000..911bac9 --- /dev/null +++ b/packages/biblatex-apa-swift/.gitignore @@ -0,0 +1,5 @@ +.build/ +.swiftpm/ +Package.resolved +*.xcodeproj/ +.DS_Store diff --git a/packages/biblatex-apa-swift/LICENSE b/packages/biblatex-apa-swift/LICENSE new file mode 100644 index 0000000..6452c33 --- /dev/null +++ b/packages/biblatex-apa-swift/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2026 Che Cheng + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/packages/html-to-word-swift/Sources/HTMLToWord/HTMLToWordConverter.swift b/packages/html-to-word-swift/Sources/HTMLToWord/HTMLToWordConverter.swift new file mode 100644 index 0000000..fde717a --- /dev/null +++ b/packages/html-to-word-swift/Sources/HTMLToWord/HTMLToWordConverter.swift @@ -0,0 +1,581 @@ +import Foundation +import CommonConverterSwift +import OOXMLSwift +import SwiftSoup + +public struct HTMLToWordConverter: DocumentConverter { + public static let sourceFormat = "html" + + public init() {} + + public func convert( + input: URL, + output: inout W, + options: ConversionOptions + ) throws { + let document = try convertToDocument(input: input, options: options) + let documentXML = try renderDocumentXML(from: document) + try output.write(documentXML) + } + + public func convertToFile( + input: URL, + output: URL, + options: ConversionOptions = .default + ) throws { + let document = try convertToDocument(input: input, options: options) + try DocxWriter.write(document, to: output) + } + + public func convertToDocument( + input: URL, + options: ConversionOptions = .default + ) throws -> WordDocument { + guard FileManager.default.fileExists(atPath: input.path) else { + throw ConversionError.fileNotFound(input.path) + } + + let html = try loadHTML(from: input) + return try convertHTML(html, sourceURL: input, options: options) + } + + public func convertHTML( + _ html: String, + sourceURL: URL? = nil, + options: ConversionOptions = .default + ) throws -> WordDocument { + let parsed = try SwiftSoup.parse(html, sourceURL?.absoluteString ?? 
"") + var builder = HTMLWordBuilder(parsed: parsed, sourceURL: sourceURL, options: options) + return try builder.build() + } + + private func loadHTML(from input: URL) throws -> String { + if let utf8 = try? String(contentsOf: input, encoding: .utf8) { + return utf8 + } + return try String(contentsOf: input, encoding: .isoLatin1) + } + + private func renderDocumentXML(from document: WordDocument) throws -> String { + let tempRoot = FileManager.default.temporaryDirectory + .appendingPathComponent("html-to-word-swift") + .appendingPathComponent(UUID().uuidString, isDirectory: true) + try FileManager.default.createDirectory(at: tempRoot, withIntermediateDirectories: true) + defer { try? FileManager.default.removeItem(at: tempRoot) } + + let archiveURL = tempRoot.appendingPathComponent("document.docx") + try DocxWriter.write(document, to: archiveURL) + + let extracted = try ZipHelper.unzip(archiveURL) + defer { ZipHelper.cleanup(extracted) } + + return try String( + contentsOf: extracted.appendingPathComponent("word/document.xml"), + encoding: .utf8 + ) + } +} + +private struct HTMLWordBuilder { + private var document = WordDocument() + private let parsed: SwiftSoup.Document + private let sourceURL: URL? + private let options: ConversionOptions + + init(parsed: SwiftSoup.Document, sourceURL: URL?, options: ConversionOptions) { + self.parsed = parsed + self.sourceURL = sourceURL + self.options = options + } + + mutating func build() throws -> WordDocument { + document.properties.title = try resolvedTitle() + if let author = try resolvedAuthor(), !author.isEmpty { + document.properties.creator = author + } + document.properties.subject = sourceURL?.lastPathComponent ?? 
"html" + + let nodes: [Node] + if let body = parsed.body() { + nodes = body.getChildNodes() + } else { + nodes = parsed.getChildNodes() + } + + try emitBlockNodes(nodes) + + if document.body.children.isEmpty { + document.appendParagraph(Paragraph()) + } + + return document + } + + private mutating func emitBlockNodes( + _ nodes: [Node], + baseProperties: ParagraphProperties? = nil + ) throws { + for node in nodes { + try emitBlock(node, baseProperties: baseProperties) + } + } + + private mutating func emitBlock( + _ node: Node, + baseProperties: ParagraphProperties? = nil + ) throws { + if let textNode = node as? TextNode { + let text = normalizeInlineText(textNode.getWholeText(), preserveWhitespace: false) + guard !text.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty else { return } + var paragraph = Paragraph(text: text) + if let baseProperties { + paragraph.properties = baseProperties + } + document.appendParagraph(paragraph) + return + } + + guard let element = node as? Element else { return } + let tag = element.tagName().lowercased() + if ignoredTags.contains(tag) { + return + } + + switch tag { + case "h1", "h2", "h3", "h4", "h5", "h6": + guard var paragraph = try makeParagraph(from: element.getChildNodes(), baseProperties: baseProperties) else { + return + } + paragraph.properties.style = headingStyle(for: tag) + document.appendParagraph(paragraph) + + case "p": + if let paragraph = try makeParagraph(from: element.getChildNodes(), baseProperties: baseProperties) { + document.appendParagraph(paragraph) + } + + case "ul", "ol": + try emitList(element, context: nil) + + case "blockquote": + var quoteProperties = baseProperties ?? 
ParagraphProperties() + quoteProperties.indentation = Indentation(left: 720) + if containsBlockChildren(element) { + try emitBlockNodes(element.getChildNodes(), baseProperties: quoteProperties) + } else if let paragraph = try makeParagraph(from: element.getChildNodes(), baseProperties: quoteProperties) { + document.appendParagraph(paragraph) + } + + case "pre": + if let paragraph = try makePreformattedParagraph(from: element, baseProperties: baseProperties) { + document.appendParagraph(paragraph) + } + + case "table": + try emitTable(element) + + case "hr": + let line = String(repeating: "—", count: 12) + var paragraph = Paragraph(text: line) + if let baseProperties { + paragraph.properties = baseProperties + } + document.appendParagraph(paragraph) + + default: + if containerTags.contains(tag) { + if containsBlockChildren(element) { + try emitBlockNodes(element.getChildNodes(), baseProperties: baseProperties) + } else if let paragraph = try makeParagraph(from: element.getChildNodes(), baseProperties: baseProperties) { + document.appendParagraph(paragraph) + } + } else if let paragraph = try makeParagraph(from: [element], baseProperties: baseProperties) { + document.appendParagraph(paragraph) + } + } + } + + private mutating func emitList(_ list: Element, context: ListContext?) throws { + let kind = list.tagName().lowercased() == "ol" ? ListKind.ordered : .unordered + let level = context.map { $0.level + 1 } ?? 0 + + let numId: Int + if let context, context.kind == kind { + numId = context.numId + } else { + numId = kind == .ordered + ? document.numbering.createNumberedList() + : document.numbering.createBulletList() + } + + let currentContext = ListContext(kind: kind, numId: numId, level: level) + let items = list.children().array().filter { $0.tagName().lowercased() == "li" } + + for item in items { + let contentNodes = item.getChildNodes().filter { node in + guard let child = node as? 
Element else { return true } + let tag = child.tagName().lowercased() + return tag != "ul" && tag != "ol" + } + + if var paragraph = try makeParagraph(from: contentNodes) { + paragraph.properties.numbering = NumberingInfo(numId: currentContext.numId, level: currentContext.level) + document.appendParagraph(paragraph) + } + + let nestedLists = item.children().array().filter { + let tag = $0.tagName().lowercased() + return tag == "ul" || tag == "ol" + } + for nested in nestedLists { + try emitList(nested, context: currentContext) + } + } + } + + private mutating func emitTable(_ table: Element) throws { + let rows = try table.select("tr").array() + guard !rows.isEmpty else { return } + + let wordRows: [TableRow] = try rows.map { row in + let cells = row.children().array().filter { + let tag = $0.tagName().lowercased() + return tag == "th" || tag == "td" + } + + let wordCells: [TableCell] = try cells.map { cell in + if let paragraph = try makeParagraph(from: cell.getChildNodes()) { + return TableCell(paragraphs: [paragraph]) + } + return TableCell() + } + + var tableRow = TableRow(cells: wordCells) + tableRow.properties.isHeader = cells.contains { $0.tagName().lowercased() == "th" } + return tableRow + } + + var wordTable = Table(rows: wordRows) + wordTable.properties.borders = .all(Border()) + document.appendTable(wordTable) + } + + private mutating func makePreformattedParagraph( + from element: Element, + baseProperties: ParagraphProperties? + ) throws -> Paragraph? 
{ + let rawText = try plainText(from: element.getChildNodes(), preserveWhitespace: true) + let normalized = rawText + .replacingOccurrences(of: "\r\n", with: "\n") + .replacingOccurrences(of: "\r", with: "\n") + .trimmingCharacters(in: .newlines) + + guard !normalized.isEmpty else { return nil } + + var paragraph = Paragraph() + if let baseProperties { + paragraph.properties = baseProperties + } + + let lines = normalized.components(separatedBy: .newlines) + var codeProperties = RunProperties() + codeProperties.fontName = "Menlo" + + for (index, line) in lines.enumerated() { + appendTextRun(line, properties: codeProperties, to: ¶graph) + if index < lines.count - 1 { + paragraph.runs.append(rawXMLRun("")) + } + } + + finalizeParagraph(¶graph) + return paragraph.runs.isEmpty ? nil : paragraph + } + + private mutating func makeParagraph( + from nodes: [Node], + baseProperties: ParagraphProperties? = nil + ) throws -> Paragraph? { + var paragraph = Paragraph() + if let baseProperties { + paragraph.properties = baseProperties + } + + try appendInline(nodes, to: ¶graph, properties: RunProperties(), preserveWhitespace: false) + finalizeParagraph(¶graph) + + return paragraph.runs.isEmpty ? nil : paragraph + } + + private mutating func appendInline( + _ nodes: [Node], + to paragraph: inout Paragraph, + properties: RunProperties, + preserveWhitespace: Bool + ) throws { + for node in nodes { + try appendInline(node, to: ¶graph, properties: properties, preserveWhitespace: preserveWhitespace) + } + } + + private mutating func appendInline( + _ node: Node, + to paragraph: inout Paragraph, + properties: RunProperties, + preserveWhitespace: Bool + ) throws { + if let textNode = node as? TextNode { + let text = normalizeInlineText(textNode.getWholeText(), preserveWhitespace: preserveWhitespace) + appendTextRun(text, properties: properties, to: ¶graph) + return + } + + guard let element = node as? 
Element else { return } + let tag = element.tagName().lowercased() + + switch tag { + case "strong", "b": + var next = properties + next.bold = true + try appendInline(element.getChildNodes(), to: ¶graph, properties: next, preserveWhitespace: preserveWhitespace) + + case "em", "i": + var next = properties + next.italic = true + try appendInline(element.getChildNodes(), to: ¶graph, properties: next, preserveWhitespace: preserveWhitespace) + + case "u": + var next = properties + next.underline = .single + try appendInline(element.getChildNodes(), to: ¶graph, properties: next, preserveWhitespace: preserveWhitespace) + + case "del", "s", "strike": + var next = properties + next.strikethrough = true + try appendInline(element.getChildNodes(), to: ¶graph, properties: next, preserveWhitespace: preserveWhitespace) + + case "sup": + var next = properties + next.verticalAlign = .superscript + try appendInline(element.getChildNodes(), to: ¶graph, properties: next, preserveWhitespace: true) + + case "sub": + var next = properties + next.verticalAlign = .subscript + try appendInline(element.getChildNodes(), to: ¶graph, properties: next, preserveWhitespace: true) + + case "mark": + var next = properties + next.highlight = .yellow + try appendInline(element.getChildNodes(), to: ¶graph, properties: next, preserveWhitespace: preserveWhitespace) + + case "code": + var next = properties + next.fontName = "Menlo" + try appendInline(element.getChildNodes(), to: ¶graph, properties: next, preserveWhitespace: true) + + case "br": + paragraph.runs.append(rawXMLRun("")) + + case "a": + let href = (try? element.attr("href"))?.trimmingCharacters(in: .whitespacesAndNewlines) ?? 
"" + let displayText = try plainText(from: element.getChildNodes(), preserveWhitespace: false) + guard !displayText.isEmpty else { return } + if href.isEmpty { + appendTextRun(displayText, properties: properties, to: ¶graph) + } else { + paragraph.runs.append(makeHyperlinkRun(text: displayText, href: href)) + } + + case "img": + let alt = (try? element.attr("alt"))?.trimmingCharacters(in: .whitespacesAndNewlines) ?? "" + let fallback = (try? element.attr("src"))?.trimmingCharacters(in: .whitespacesAndNewlines) ?? "image" + let label = alt.isEmpty ? "[Image: \(fallback)]" : "[Image: \(alt)]" + appendTextRun(label, properties: properties, to: ¶graph) + + default: + try appendInline(element.getChildNodes(), to: ¶graph, properties: properties, preserveWhitespace: preserveWhitespace) + } + } + + private mutating func makeHyperlinkRun(text: String, href: String) -> Run { + let escapedText = escapeXML(text) + + if href.hasPrefix("#") { + let anchor = escapeXML(String(href.dropFirst())) + return rawXMLRun( + "\(escapedText)" + ) + } + + let relationshipId = "rIdHTMLLink\(document.hyperlinkReferences.count + 1)" + document.hyperlinkReferences.append( + HyperlinkReference(relationshipId: relationshipId, url: href) + ) + + return rawXMLRun( + "\(escapedText)" + ) + } + + private func plainText(from nodes: [Node], preserveWhitespace: Bool) throws -> String { + var result = "" + for node in nodes { + if let textNode = node as? TextNode { + result += normalizeInlineText(textNode.getWholeText(), preserveWhitespace: preserveWhitespace) + continue + } + guard let element = node as? Element else { continue } + let tag = element.tagName().lowercased() + if tag == "br" { + result += preserveWhitespace ? "\n" : " " + continue + } + if tag == "img" { + let alt = (try? element.attr("alt"))?.trimmingCharacters(in: .whitespacesAndNewlines) ?? 
"" + if !alt.isEmpty { + result += alt + } + continue + } + result += try plainText(from: element.getChildNodes(), preserveWhitespace: preserveWhitespace) + } + return preserveWhitespace ? result : collapseSpaces(result) + } + + private func resolvedTitle() throws -> String { + let title = try parsed.title().trimmingCharacters(in: .whitespacesAndNewlines) + if !title.isEmpty { + return title + } + if let sourceURL { + return sourceURL.deletingPathExtension().lastPathComponent + } + return "HTML Document" + } + + private func resolvedAuthor() throws -> String? { + if let content = try parsed + .select("meta[name=author], meta[name=Author]") + .first()? + .attr("content") + .trimmingCharacters(in: .whitespacesAndNewlines), !content.isEmpty { + return content + } + return nil + } + + private func containsBlockChildren(_ element: Element) -> Bool { + element.children().array().contains { blockTags.contains($0.tagName().lowercased()) } + } + + private func headingStyle(for tag: String) -> String { + switch tag { + case "h1": return "Heading1" + case "h2": return "Heading2" + case "h3": return "Heading3" + case "h4": return "Heading4" + case "h5": return "Heading5" + case "h6": return "Heading6" + default: return "Heading3" + } + } + + private func normalizeInlineText(_ text: String, preserveWhitespace: Bool) -> String { + let normalized = text + .replacingOccurrences(of: "\u{00A0}", with: " ") + .replacingOccurrences(of: "\r\n", with: "\n") + .replacingOccurrences(of: "\r", with: "\n") + return preserveWhitespace ? 
normalized : collapseSpaces(normalized) + } + + private func collapseSpaces(_ text: String) -> String { + text.replacingOccurrences(of: #"\s+"#, with: " ", options: .regularExpression) + } + + private func appendTextRun(_ text: String, properties: RunProperties, to paragraph: inout Paragraph) { + guard !text.isEmpty else { return } + + var value = text + if paragraph.runs.isEmpty { + value = value.replacingOccurrences(of: #"^\s+"#, with: "", options: .regularExpression) + } else if let lastIndex = paragraph.runs.indices.last, + paragraph.runs[lastIndex].rawXML == nil, + paragraph.runs[lastIndex].drawing == nil { + if paragraph.runs[lastIndex].text.hasSuffix(" "), value.hasPrefix(" ") { + value.removeFirst() + } + if paragraph.runs[lastIndex].properties == properties { + paragraph.runs[lastIndex].text += value + return + } + } + + guard !value.isEmpty else { return } + paragraph.runs.append(Run(text: value, properties: properties)) + } + + private func finalizeParagraph(_ paragraph: inout Paragraph) { + var cleaned: [Run] = [] + for var run in paragraph.runs { + if run.rawXML == nil, run.drawing == nil { + if cleaned.isEmpty { + run.text = run.text.replacingOccurrences(of: #"^\s+"#, with: "", options: .regularExpression) + } + if run.text.isEmpty { + continue + } + } + cleaned.append(run) + } + + if let index = cleaned.lastIndex(where: { $0.rawXML == nil && $0.drawing == nil }) { + cleaned[index].text = cleaned[index].text.replacingOccurrences(of: #"\s+$"#, with: "", options: .regularExpression) + if cleaned[index].text.isEmpty { + cleaned.remove(at: index) + } + } + + paragraph.runs = cleaned + } + + private func rawXMLRun(_ xml: String) -> Run { + var run = Run(text: "") + run.rawXML = xml + return run + } + + private func escapeXML(_ text: String) -> String { + text + .replacingOccurrences(of: "&", with: "&") + .replacingOccurrences(of: "<", with: "<") + .replacingOccurrences(of: ">", with: ">") + .replacingOccurrences(of: "\"", with: """) + 
.replacingOccurrences(of: "'", with: "'") + } +} + +private struct ListContext { + let kind: ListKind + let numId: Int + let level: Int +} + +private enum ListKind { + case unordered + case ordered +} + +private let ignoredTags: Set = [ + "script", "style", "noscript", "meta", "link", "head", "title" +] + +private let blockTags: Set = [ + "address", "article", "aside", "blockquote", "details", "div", "dl", "fieldset", "figcaption", + "figure", "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hr", "li", + "main", "nav", "ol", "p", "pre", "section", "table", "ul" +] + +private let containerTags: Set = [ + "article", "body", "div", "figure", "figcaption", "main", "section", "span" +] diff --git a/packages/html-to-word-swift/Tests/HTMLToWordTests/HTMLToWordConverterTests.swift b/packages/html-to-word-swift/Tests/HTMLToWordTests/HTMLToWordConverterTests.swift new file mode 100644 index 0000000..1f157c7 --- /dev/null +++ b/packages/html-to-word-swift/Tests/HTMLToWordTests/HTMLToWordConverterTests.swift @@ -0,0 +1,210 @@ +import Foundation +import OOXMLSwift +@testable import HTMLToWord + +#if canImport(XCTest) +import XCTest + +final class HTMLToWordConverterTests: XCTestCase { + private let converter = HTMLToWordConverter() + private var cleanupURLs: [URL] = [] + + override func tearDown() { + for url in cleanupURLs { + try? FileManager.default.removeItem(at: url) + } + cleanupURLs.removeAll() + super.tearDown() + } + + func testConvertToStringStreamsDocumentXML() throws { + let inputURL = try makeHTMLFile( + named: "basic.html", + html: """ + + + Sample +

Heading

Hello world

+ + """ + ) + + let xml = try converter.convertToString(input: inputURL) + + XCTAssertTrue(xml.contains(" + + + Research Note + + +

Body text

+ + """ + ) + let outputURL = inputURL.deletingPathExtension().appendingPathExtension("docx") + + try converter.convertToFile(input: inputURL, output: outputURL) + let extracted = try extractArchive(outputURL) + + let documentXML = try readFile(extracted.appendingPathComponent("word/document.xml")) + let coreXML = try readFile(extracted.appendingPathComponent("docProps/core.xml")) + + XCTAssertTrue(documentXML.contains("Body text"), "Got: \(documentXML)") + XCTAssertTrue(coreXML.contains("Research Note"), "Got: \(coreXML)") + XCTAssertTrue(coreXML.contains("Che Cheng"), "Got: \(coreXML)") + } + + func testInlineFormattingMapsToOOXMLRunProperties() throws { + let inputURL = try makeHTMLFile( + named: "inline.html", + html: """ +

+ bold + italic + under + gone + H2O and x2 + hot +

+ """ + ) + let outputURL = inputURL.deletingPathExtension().appendingPathExtension("docx") + + try converter.convertToFile(input: inputURL, output: outputURL) + let extracted = try extractArchive(outputURL) + let documentXML = try readFile(extracted.appendingPathComponent("word/document.xml")) + + XCTAssertTrue(documentXML.contains(""), "Got: \(documentXML)") + XCTAssertTrue(documentXML.contains(""), "Got: \(documentXML)") + XCTAssertTrue(documentXML.contains(""), "Got: \(documentXML)") + XCTAssertTrue(documentXML.contains(""), "Got: \(documentXML)") + XCTAssertTrue(documentXML.contains("w:vertAlign w:val=\"subscript\""), "Got: \(documentXML)") + XCTAssertTrue(documentXML.contains("w:vertAlign w:val=\"superscript\""), "Got: \(documentXML)") + XCTAssertTrue(documentXML.contains(""), "Got: \(documentXML)") + } + + func testListsProduceNumberingDefinitions() throws { + let inputURL = try makeHTMLFile( + named: "lists.html", + html: """ +
    +
  • First
  • +
  • Second +
      +
    1. Nested one
    2. +
    +
  • +
+ """ + ) + let outputURL = inputURL.deletingPathExtension().appendingPathExtension("docx") + + try converter.convertToFile(input: inputURL, output: outputURL) + let extracted = try extractArchive(outputURL) + let documentXML = try readFile(extracted.appendingPathComponent("word/document.xml")) + let numberingXML = try readFile(extracted.appendingPathComponent("word/numbering.xml")) + + XCTAssertTrue(documentXML.contains(""), "Got: \(documentXML)") + XCTAssertTrue(numberingXML.contains(""), "Got: \(numberingXML)") + XCTAssertTrue(numberingXML.contains(""), "Got: \(numberingXML)") + } + + func testTableProducesWordTableXML() throws { + let inputURL = try makeHTMLFile( + named: "table.html", + html: """ + + + +
Header AHeader B
Value 1Value 2
+ """ + ) + let outputURL = inputURL.deletingPathExtension().appendingPathExtension("docx") + + try converter.convertToFile(input: inputURL, output: outputURL) + let extracted = try extractArchive(outputURL) + let documentXML = try readFile(extracted.appendingPathComponent("word/document.xml")) + + XCTAssertTrue(documentXML.contains(""), "Got: \(documentXML)") + XCTAssertTrue(documentXML.contains("Header A"), "Got: \(documentXML)") + XCTAssertTrue(documentXML.contains("Value 2"), "Got: \(documentXML)") + } + + func testHyperlinksProduceDocumentRelationships() throws { + let inputURL = try makeHTMLFile( + named: "links.html", + html: """ +

Visit Example now.

+ """ + ) + let outputURL = inputURL.deletingPathExtension().appendingPathExtension("docx") + + try converter.convertToFile(input: inputURL, output: outputURL) + let extracted = try extractArchive(outputURL) + let documentXML = try readFile(extracted.appendingPathComponent("word/document.xml")) + let relsXML = try readFile(extracted.appendingPathComponent("word/_rels/document.xml.rels")) + + XCTAssertTrue(documentXML.contains("

Quoted text

+
line 1\nline 2
+ """ + ) + let outputURL = inputURL.deletingPathExtension().appendingPathExtension("docx") + + try converter.convertToFile(input: inputURL, output: outputURL) + let extracted = try extractArchive(outputURL) + let documentXML = try readFile(extracted.appendingPathComponent("word/document.xml")) + + XCTAssertTrue(documentXML.contains(""), "Got: \(documentXML)") + XCTAssertTrue(documentXML.contains("Menlo"), "Got: \(documentXML)") + } + + func testTitleFallsBackToSourceFilename() throws { + let inputURL = try makeHTMLFile( + named: "fallback-title.html", + html: "

No title tag here

" + ) + + let document = try converter.convertToDocument(input: inputURL) + XCTAssertEqual(document.properties.title, "fallback-title") + } + + private func makeHTMLFile(named name: String, html: String) throws -> URL { + let directory = FileManager.default.temporaryDirectory + .appendingPathComponent(UUID().uuidString, isDirectory: true) + try FileManager.default.createDirectory(at: directory, withIntermediateDirectories: true) + cleanupURLs.append(directory) + + let fileURL = directory.appendingPathComponent(name) + try html.write(to: fileURL, atomically: true, encoding: .utf8) + return fileURL + } + + private func extractArchive(_ archiveURL: URL) throws -> URL { + let extracted = try ZipHelper.unzip(archiveURL) + cleanupURLs.append(extracted) + return extracted + } + + private func readFile(_ url: URL) throws -> String { + try String(contentsOf: url, encoding: .utf8) + } +} +#endif diff --git a/packages/md-to-word-swift/.gitignore b/packages/md-to-word-swift/.gitignore new file mode 100644 index 0000000..c3bee17 --- /dev/null +++ b/packages/md-to-word-swift/.gitignore @@ -0,0 +1,7 @@ +.build/ +.swiftpm/ +DerivedData/ +*.xcodeproj +*.xcworkspace +xcuserdata/ +.DS_Store diff --git a/packages/md-to-word-swift/Sources/MDToWord/FigureImporter.swift b/packages/md-to-word-swift/Sources/MDToWord/FigureImporter.swift new file mode 100644 index 0000000..632636a --- /dev/null +++ b/packages/md-to-word-swift/Sources/MDToWord/FigureImporter.swift @@ -0,0 +1,75 @@ +import Foundation +import OOXMLSwift + +/// 圖片匯入器:從 figures/ 目錄載入圖片資料 +public struct FigureImporter { + let directory: URL + + public init(directory: URL) { + self.directory = directory + } + + /// 載入目錄中所有圖片 → [filename: Data] + public func loadAll() throws -> [String: Data] { + var result: [String: Data] = [:] + + guard FileManager.default.fileExists(atPath: directory.path) else { + return result + } + + let contents = try FileManager.default.contentsOfDirectory( + at: directory, + includingPropertiesForKeys: nil 
+ ) + + let imageExtensions: Set = ["png", "jpg", "jpeg", "gif", "bmp", "tiff", "tif", "webp"] + + for fileURL in contents { + let ext = fileURL.pathExtension.lowercased() + guard imageExtensions.contains(ext) else { continue } + let data = try Data(contentsOf: fileURL) + result[fileURL.lastPathComponent] = data + } + + return result + } + + /// 根據 Markdown image path 建立 ImageReference + /// + /// - Parameters: + /// - path: Markdown 中的圖片路徑(如 "figures/image1.png" 或 "image1.png") + /// - id: 關係 ID (rId) + /// - figures: 已載入的圖片 [filename: Data] + public func createImageReference( + path: String, + id: String, + figures: [String: Data] + ) -> ImageReference? { + // 嘗試直接匹配 filename + let fileName = URL(fileURLWithPath: path).lastPathComponent + + guard let data = figures[fileName] else { return nil } + + let ext = (fileName as NSString).pathExtension.lowercased() + let contentType = mimeType(for: ext) + + return ImageReference( + id: id, + fileName: fileName, + contentType: contentType, + data: data + ) + } + + private func mimeType(for ext: String) -> String { + switch ext { + case "png": return "image/png" + case "jpg", "jpeg": return "image/jpeg" + case "gif": return "image/gif" + case "bmp": return "image/bmp" + case "tiff", "tif": return "image/tiff" + case "webp": return "image/webp" + default: return "image/png" + } + } +} diff --git a/packages/md-to-word-swift/Sources/MDToWord/FootnoteParser.swift b/packages/md-to-word-swift/Sources/MDToWord/FootnoteParser.swift new file mode 100644 index 0000000..c90d718 --- /dev/null +++ b/packages/md-to-word-swift/Sources/MDToWord/FootnoteParser.swift @@ -0,0 +1,86 @@ +import Foundation + +/// 解析 Markdown footnote definitions(非 CommonMark 標準) +/// +/// 格式:`[^id]: footnote text` +/// swift-markdown 不支援 footnotes,需要單獨解析。 +struct FootnoteParser { + + struct FootnoteDefinition { + let id: String + let text: String + } + + struct FootnoteReference { + let id: String + let range: Range + } + + /// 從 Markdown 文字中提取所有 footnote 
definitions + static func parseDefinitions(from markdown: String) -> [FootnoteDefinition] { + var results: [FootnoteDefinition] = [] + let pattern = #"^\[\^([^\]]+)\]:\s*(.+)$"# + guard let regex = try? NSRegularExpression(pattern: pattern, options: .anchorsMatchLines) else { + return results + } + + let nsString = markdown as NSString + let matches = regex.matches(in: markdown, range: NSRange(location: 0, length: nsString.length)) + + for match in matches { + guard match.numberOfRanges >= 3 else { continue } + let id = nsString.substring(with: match.range(at: 1)) + let text = nsString.substring(with: match.range(at: 2)) + results.append(FootnoteDefinition(id: id, text: text)) + } + + return results + } + + /// 從 Markdown 文字中找出所有 footnote references [^id] + static func parseReferences(from markdown: String) -> [FootnoteReference] { + var results: [FootnoteReference] = [] + let pattern = #"\[\^([^\]]+)\](?!:)"# + guard let regex = try? NSRegularExpression(pattern: pattern) else { + return results + } + + let nsString = markdown as NSString + let matches = regex.matches(in: markdown, range: NSRange(location: 0, length: nsString.length)) + + for match in matches { + guard match.numberOfRanges >= 2, + let range = Range(match.range, in: markdown) else { continue } + let id = nsString.substring(with: match.range(at: 1)) + results.append(FootnoteReference(id: id, range: range)) + } + + return results + } + + /// 移除 markdown 中的 footnote definitions(這些不進 AST) + static func stripDefinitions(from markdown: String) -> String { + let pattern = #"^\[\^[^\]]+\]:\s*.+$"# + guard let regex = try? 
NSRegularExpression(pattern: pattern, options: .anchorsMatchLines) else { + return markdown + } + + let nsString = markdown as NSString + let result = regex.stringByReplacingMatches( + in: markdown, + range: NSRange(location: 0, length: nsString.length), + withTemplate: "" + ) + + // 清理多餘空行 + return result + .components(separatedBy: "\n") + .reduce(into: [String]()) { acc, line in + if line.trimmingCharacters(in: .whitespaces).isEmpty && acc.last?.trimmingCharacters(in: .whitespaces).isEmpty == true { + return // 跳過連續空行 + } + acc.append(line) + } + .joined(separator: "\n") + } +} diff --git a/packages/md-to-word-swift/Sources/MDToWord/MarkdownASTWalker.swift b/packages/md-to-word-swift/Sources/MDToWord/MarkdownASTWalker.swift new file mode 100644 index 0000000..a889c02 --- /dev/null +++ b/packages/md-to-word-swift/Sources/MDToWord/MarkdownASTWalker.swift @@ -0,0 +1,575 @@ +import Foundation +import Markdown +import OOXMLSwift + +/// 走訪 swift-markdown AST,建構 WordDocument +/// +/// 實作 `MarkupWalker` protocol,將每個 block/inline node +/// 轉換為對應的 OOXMLSwift 模型。 +struct MarkdownASTWalker { + var document: WordDocument + var metadata: DocumentMetadata? + var figures: [String: Data] + var footnoteDefinitions: [String: String] // id → text + + /// 追蹤目前的段落索引(用於 metadata overlay) + private var paragraphIndex: Int = 0 + + /// 追蹤 image rId 計數 + private var nextImageRId: Int = 100 + + /// 追蹤 hyperlink rId 計數 + private var nextHyperlinkRId: Int = 200 + + /// 當前段落累積的 hyperlinks(在 processLink 中填入,visitParagraph 中消費) + private var pendingHyperlinks: [Hyperlink] = [] + + init( + metadata: DocumentMetadata? 
/// Routes one block-level node to the visitor that handles its type.
///
/// Node types without a dedicated visitor emit nothing themselves; their
/// children are walked recursively instead.
private mutating func walkBlock(_ markup: any Markup) {
    switch markup {
    case let node as Heading:
        visitHeading(node)
    case let node as Markdown.Paragraph:
        visitParagraph(node)
    case let node as CodeBlock:
        visitCodeBlock(node)
    case let node as BlockQuote:
        visitBlockQuote(node)
    case let node as OrderedList:
        visitOrderedList(node)
    case let node as UnorderedList:
        visitUnorderedList(node)
    case let node as Markdown.Table:
        visitTable(node)
    case let node as ThematicBreak:
        visitThematicBreak(node)
    case let node as HTMLBlock:
        visitHTMLBlock(node)
    default:
        // Any other block type: descend into its children.
        for nested in markup.children {
            walkBlock(nested)
        }
    }
}
/// Emits one numbered or bulleted Word paragraph for every paragraph found in each list item.
///
/// Nested ordered/unordered lists receive their own numbering definition and are
/// emitted one level deeper. Any other child block is routed through `walkBlock`.
///
/// `processLink` returns no runs for a link; it parks the hyperlink in
/// `pendingHyperlinks` instead. The list paragraph therefore has to pick those
/// hyperlinks up itself (the same hand-off `visitParagraph` does), otherwise the
/// link text of list items would be dropped from the output.
///
/// - Parameters:
///   - items: Children of a list node; elements that are not `ListItem` are skipped.
///   - numId: Numbering definition id used for the paragraphs of this list.
///   - level: Nesting level written into the paragraph numbering (0 for a top-level list).
private mutating func visitListItems(_ items: some Sequence, numId: Int, level: Int) {
    for item in items {
        guard let listItem = item as? ListItem else { continue }

        for child in listItem.children {
            if let paragraph = child as? Markdown.Paragraph {
                // Start clean so hyperlinks left over from an earlier block cannot leak in.
                pendingHyperlinks = []
                let runs = processInlines(paragraph.children)

                var props = ParagraphProperties()
                props.numbering = NumberingInfo(numId: numId, level: level)

                var para = OOXMLSwift.Paragraph(runs: runs, properties: props)
                // Attach the links collected while processing this paragraph's inlines.
                para.hyperlinks = pendingHyperlinks
                pendingHyperlinks = []
                applyMetadataOverlay(to: &para)
                document.appendParagraph(para)
            } else if let nestedOrdered = child as? OrderedList {
                let nestedNumId = document.numbering.createNumberedList()
                visitListItems(nestedOrdered.children, numId: nestedNumId, level: level + 1)
            } else if let nestedUnordered = child as? UnorderedList {
                let nestedNumId = document.numbering.createBulletList()
                visitListItems(nestedUnordered.children, numId: nestedNumId, level: level + 1)
            } else {
                walkBlock(child)
            }
        }
    }
}
/// Converts a list of inline siblings into runs while tracking HTML-tag formatting.
///
/// swift-markdown delivers an inline HTML element as flat siblings (an `InlineHTML`
/// opening tag, the content, an `InlineHTML` closing tag), so formatting has to be
/// carried from one sibling to the next. `activeProps` is that carried state:
/// every `InlineHTML` node updates it through `applyHTMLTag`, and every other
/// node is converted with the properties currently in effect.
///
/// NOTE(review): all `InlineHTML` siblings are consumed here, so a tag that
/// `applyHTMLTag` does not recognise yields no run at all — confirm this is intended.
///
/// - Parameters:
///   - children: Inline nodes in document order.
///   - inheritedProps: Formatting inherited from the enclosing node; also the
///     baseline that closing tags restore to.
/// - Returns: The runs produced by the non-HTML siblings, in order.
private mutating func processInlineSequence(
    _ children: [any Markup],
    inheritedProps: RunProperties
) -> [Run] {
    var activeProps = inheritedProps
    var collected: [Run] = []

    for node in children {
        guard let htmlNode = node as? InlineHTML else {
            collected += processInline(node, inheritedProps: activeProps)
            continue
        }
        applyHTMLTag(htmlNode.rawHTML, to: &activeProps, base: inheritedProps)
    }

    return collected
}
/// Registers a Markdown link as a Word hyperlink for the paragraph currently being built.
///
/// A relationship entry is appended to `document.hyperlinkReferences` and a
/// `Hyperlink` carrying the link text is queued in `pendingHyperlinks`; the
/// visitor that builds the paragraph attaches the queued hyperlinks to it.
///
/// The link text is gathered from the whole inline subtree, so formatted link
/// labels such as `[**bold**](url)` or labels containing inline code keep their
/// text (previously only direct `Text` children were read, which produced an
/// empty label for those links).
///
/// - Parameters:
///   - link: The link node.
///   - inheritedProps: Formatting in effect around the link (currently unused).
/// - Returns: Always an empty array; the text travels with the `Hyperlink`.
private mutating func processLink(_ link: Markdown.Link, inheritedProps: RunProperties) -> [Run] {
    // Recursively collects the visible text of a node, mirroring how
    // `processInline` renders soft and hard breaks.
    func visibleText(of node: any Markup) -> String {
        if let textNode = node as? Markdown.Text { return textNode.string }
        if let codeNode = node as? InlineCode { return codeNode.code }
        if node is SoftBreak { return " " }
        if node is LineBreak { return "\n" }
        return node.children.map { visibleText(of: $0) }.joined()
    }

    let text = link.children.map { visibleText(of: $0) }.joined()
    let destination = link.destination ?? ""

    // Allocate the relationship id and record the document-level reference.
    let rIdNum = nextHyperlinkRId
    nextHyperlinkRId += 1
    let rId = "rId\(rIdNum)"

    document.hyperlinkReferences.append(
        HyperlinkReference(relationshipId: rId, url: destination)
    )

    // Queue the hyperlink for the enclosing paragraph.
    let hyperlink = Hyperlink(
        id: "h\(rIdNum)",
        text: text,
        url: destination,
        relationshipId: rId
    )
    pendingHyperlinks.append(hyperlink)

    // The hyperlink carries the text, so no run is produced here.
    return []
}
metadata?.paragraphs.first(where: { $0.index == paragraphIndex }) else { + return + } + + // Alignment + if let alignmentStr = meta.alignment, let alignment = Alignment(rawValue: alignmentStr) { + paragraph.properties.alignment = alignment + } + + // Spacing + if let spacing = meta.spacing { + paragraph.properties.spacing = Spacing( + before: spacing.before, + after: spacing.after, + line: spacing.line + ) + } + + // Indentation + if let indent = meta.indentation { + paragraph.properties.indentation = Indentation( + left: indent.left, + right: indent.right, + firstLine: indent.firstLine, + hanging: indent.hanging + ) + } + + // Run-level metadata overlay + for runMeta in meta.runs { + applyRunMetadata(to: ¶graph, runMeta: runMeta) + } + } + + /// 套用 run-level metadata(font, color, size) + private func applyRunMetadata(to paragraph: inout OOXMLSwift.Paragraph, runMeta: RunMeta) { + guard runMeta.range.count == 2 else { return } + let start = runMeta.range[0] + let end = runMeta.range[1] + + // 找到對應字元範圍的 runs + var offset = 0 + for i in 0.. 
/// Applies document-level metadata: core properties, page setup of the first
/// section, and any additional paragraph styles listed in the metadata.
///
/// The "Code" and "Quote" paragraph styles are registered on every exit path.
/// `visitCodeBlock` and `visitBlockQuote` reference those style ids whether or
/// not metadata was supplied, so they must also exist when `metadata` is nil
/// (previously the early return skipped their registration in that case).
private mutating func applyDocumentMetadata() {
    // Runs last on every path, so styles defined by the metadata take precedence.
    defer {
        ensureStyleExists(id: "Code", name: "Code", basedOn: "Normal")
        ensureStyleExists(id: "Quote", name: "Quote", basedOn: "Normal")
    }

    guard let docInfo = metadata?.document else { return }

    // Document properties
    document.properties.title = docInfo.properties.title
    document.properties.creator = docInfo.properties.creator
    document.properties.subject = docInfo.properties.subject
    document.properties.description = docInfo.properties.description

    // Section properties: only the first section is applied.
    if let section = docInfo.sections.first {
        if let pageSize = section.pageSize {
            document.sectionProperties.pageSize = PageSize(
                width: pageSize.width,
                height: pageSize.height
            )
        }
        if let orientation = section.orientation {
            document.sectionProperties.orientation =
                orientation == "landscape" ? .landscape : .portrait
        }
        if let margins = section.margins {
            document.sectionProperties.pageMargins = PageMargins(
                top: margins.top,
                right: margins.right,
                bottom: margins.bottom,
                left: margins.left
            )
        }
    }

    // Styles: keep the styles already present and add the ones that are missing.
    for styleMeta in docInfo.styles
    where !document.styles.contains(where: { $0.id == styleMeta.id }) {
        document.styles.append(Style(
            id: styleMeta.id,
            name: styleMeta.name,
            type: .paragraph,
            basedOn: styleMeta.basedOn
        ))
    }
}
/// Converts Markdown text to a `WordDocument`.
///
/// Frontmatter is split off with `FrontmatterExtractor` and handed to the
/// builder as metadata; only the remaining body is parsed as Markdown.
///
/// - Parameters:
///   - source: The Markdown text, optionally starting with a frontmatter block.
///   - baseURL: Passed through to the builder.
///   - sourceName: Passed through to the builder.
///   - options: Conversion options.
/// - Returns: The document built from the Markdown body.
/// - Throws: Whatever the builder throws while building the document.
public func convertMarkdown(
    _ source: String,
    baseURL: URL? = nil,
    sourceName: String? = nil,
    options: ConversionOptions = .default
) throws -> WordDocument {
    let (frontmatter, body) = FrontmatterExtractor.extract(from: source)
    var wordBuilder = MarkdownWordBuilder(
        options: options,
        baseURL: baseURL,
        sourceName: sourceName,
        frontmatter: frontmatter
    )
    return try wordBuilder.build(markdown: body)
}
"w:w=\"\(section.pageSize.width)\" w:h=\"\(section.pageSize.height)\"" + if section.orientation == .landscape { + pageSizeAttributes += " w:orient=\"landscape\"" + } + xml += "" + xml += "" + xml += "" + + if let grid = section.docGrid { + var gridAttributes = "w:linePitch=\"\(grid.linePitch)\"" + if let charSpace = grid.charSpace { + gridAttributes += " w:charSpace=\"\(charSpace)\"" + } + xml += "" + } else { + xml += "" + } + + xml += "" + return xml + } + + private func loadSource(from input: URL) throws -> String { + do { + return try String(contentsOf: input, encoding: .utf8) + } catch let error as CocoaError + where error.code == .fileReadNoSuchFile + || error.code == .fileReadNoPermission + || error.code == .fileNoSuchFile { + throw error + } catch { + if let latin1 = try? String(contentsOf: input, encoding: .isoLatin1) { + return latin1 + } + return try String(contentsOf: input, encoding: .utf8) + } + } +} + +public typealias MarkdownDOCXConverter = MarkdownToWordConverter + +private struct MarkdownWordBuilder { + private(set) var document = WordDocument() + private let options: ConversionOptions + private let baseURL: URL? + private let sourceName: String? + private let frontmatter: [String: String] + private var inferredTitle = false + + init( + options: ConversionOptions, + baseURL: URL?, + sourceName: String?, + frontmatter: [String: String] + ) { + self.options = options + self.baseURL = baseURL + self.sourceName = sourceName + self.frontmatter = frontmatter + } + + mutating func build(markdown: String) throws -> WordDocument { + applyDocumentMetadata() + + let parsed = Document(parsing: markdown, options: .parseBlockDirectives) + for child in parsed.children { + try appendBlock(child, quoteDepth: 0) + } + + if document.body.children.isEmpty { + document.appendParagraph(WordParagraph(text: "")) + } + + return document + } + + private mutating func applyDocumentMetadata() { + document.properties.creator = frontmatter["author"] ?? 
/// Appends the Word representation of one block-level Markdown node to the document.
///
/// Block quotes are not emitted as a node of their own: their children are
/// appended with `quoteDepth` increased by one. Node types without a dedicated
/// branch are handled by appending their children at the current depth.
///
/// - Parameters:
///   - markup: The block node to convert.
///   - quoteDepth: Number of enclosing block quotes.
/// - Throws: Errors raised while converting nested content.
private mutating func appendBlock(_ markup: Markup, quoteDepth: Int) throws {
    if let heading = markup as? Markdown.Heading {
        try appendHeading(heading, quoteDepth: quoteDepth)
    } else if let paragraph = markup as? Markdown.Paragraph {
        // A paragraph that converts to nothing is skipped.
        guard let converted = try makeParagraph(from: paragraph, quoteDepth: quoteDepth) else {
            return
        }
        document.appendParagraph(converted)
    } else if let quote = markup as? Markdown.BlockQuote {
        for nested in quote.children {
            try appendBlock(nested, quoteDepth: quoteDepth + 1)
        }
    } else if let ordered = markup as? Markdown.OrderedList {
        try appendOrderedList(ordered, level: 0, quoteDepth: quoteDepth)
    } else if let unordered = markup as? Markdown.UnorderedList {
        try appendUnorderedList(unordered, level: 0, quoteDepth: quoteDepth)
    } else if let code = markup as? Markdown.CodeBlock {
        appendCodeBlock(code, quoteDepth: quoteDepth, extraIndentLevels: 0)
    } else if let table = markup as? Markdown.Table {
        guard let converted = try makeTable(from: table) else {
            return
        }
        document.appendTable(converted)
    } else if markup is Markdown.ThematicBreak {
        appendHorizontalRule(quoteDepth: quoteDepth)
    } else {
        for nested in markup.children {
            try appendBlock(nested, quoteDepth: quoteDepth)
        }
    }
}
/// Appends every item of an ordered list, all sharing one freshly created
/// numbered-list definition.
///
/// - Parameters:
///   - list: The ordered list node; children that are not list items are skipped.
///   - level: Nesting level passed on to each item.
///   - quoteDepth: Number of enclosing block quotes.
/// - Throws: Errors raised while converting an item.
private mutating func appendOrderedList(
    _ list: Markdown.OrderedList,
    level: Int,
    quoteDepth: Int
) throws {
    let listNumId = document.numbering.createNumberedList()
    for case let entry as Markdown.ListItem in list.children {
        try appendListItem(entry, numId: listNumId, level: level, quoteDepth: quoteDepth)
    }
}
level + 1 : 0 + if let converted = try makeParagraph( + from: paragraph, + quoteDepth: quoteDepth, + numbering: numbering, + extraIndentLevels: extraIndentLevels + ) { + document.appendParagraph(converted) + emittedPrimaryParagraph = true + } + case let nestedOrdered as Markdown.OrderedList: + try appendOrderedList(nestedOrdered, level: level + 1, quoteDepth: quoteDepth) + case let nestedUnordered as Markdown.UnorderedList: + try appendUnorderedList(nestedUnordered, level: level + 1, quoteDepth: quoteDepth) + case let codeBlock as Markdown.CodeBlock: + appendCodeBlock(codeBlock, quoteDepth: quoteDepth, extraIndentLevels: level + 1) + case let blockQuote as Markdown.BlockQuote: + for grandchild in blockQuote.children { + try appendBlock(grandchild, quoteDepth: quoteDepth + 1) + } + case let table as Markdown.Table: + if let converted = try makeTable(from: table) { + document.appendTable(converted) + } + case let heading as Markdown.Heading: + guard let converted = try makeParagraph( + fromInlineChildren: Array(heading.children), + quoteDepth: quoteDepth, + numbering: emittedPrimaryParagraph ? nil : NumberingInfo(numId: numId, level: min(level, 8)), + extraIndentLevels: emittedPrimaryParagraph ? 
/// Appends a code block as one monospaced, code-styled paragraph per source line.
///
/// Line endings are normalised to `\n` first. The newline that terminates the
/// last code line is removed before splitting, so the block is not followed by
/// a spurious empty shaded paragraph. An empty code block still produces a
/// single empty code paragraph.
///
/// - Parameters:
///   - codeBlock: The code block node.
///   - quoteDepth: Number of enclosing block quotes.
///   - extraIndentLevels: Additional indent steps, used for code inside list items.
private mutating func appendCodeBlock(
    _ codeBlock: Markdown.CodeBlock,
    quoteDepth: Int,
    extraIndentLevels: Int
) {
    var code = codeBlock.code
        .replacingOccurrences(of: "\r\n", with: "\n")
        .replacingOccurrences(of: "\r", with: "\n")

    // swift-markdown's `code` text normally ends with the newline closing the last
    // line; splitting it as-is would yield a trailing empty element.
    if code.hasSuffix("\n") {
        code.removeLast()
    }

    // `components(separatedBy:)` always returns at least one element, so empty
    // input needs no special case: it yields one empty line.
    for line in code.components(separatedBy: .newlines) {
        var runProps = RunProperties()
        runProps.fontName = "Menlo"
        runProps.fontSize = 20

        var paragraph = WordParagraph(runs: [Run(text: line, properties: runProps)])
        applyCodeStyle(to: &paragraph, quoteDepth: quoteDepth, extraIndentLevels: extraIndentLevels)
        document.appendParagraph(paragraph)
    }
}
/// Builds a Word paragraph from a list of inline nodes.
///
/// Inline nodes are converted to runs, adjacent mergeable runs are coalesced,
/// and spacing, style, numbering and quote/indent formatting are applied.
///
/// - Parameters:
///   - children: Inline nodes of the source block.
///   - quoteDepth: Number of enclosing block quotes.
///   - numbering: List numbering for the paragraph, if any. Numbered paragraphs
///     get a smaller spacing-after value (80 instead of 200).
///   - extraIndentLevels: Additional indent steps.
///   - style: Paragraph style id, if any.
/// - Returns: The paragraph, or `nil` when the inline nodes produced no runs.
/// - Throws: Errors raised while converting inline nodes.
private mutating func makeParagraph(
    fromInlineChildren children: [Markup],
    quoteDepth: Int,
    numbering: NumberingInfo? = nil,
    extraIndentLevels: Int = 0,
    style: String? = nil
) throws -> WordParagraph? {
    var collected: [Run] = []
    for node in children {
        try appendInline(from: node, into: &collected, properties: RunProperties())
    }
    let runs = coalesceRuns(collected)

    // Text of the plain runs only (raw-XML and drawing runs excluded).
    let plainRuns = runs.filter { $0.rawXML == nil && $0.drawing == nil }
    let visibleText = plainRuns
        .map(\.text)
        .joined()
        .trimmingCharacters(in: .whitespacesAndNewlines)

    // NOTE(review): with `&&` this is only true when there are no runs at all;
    // a whitespace-only paragraph is still emitted — confirm that is intended.
    let nothingToEmit = visibleText.isEmpty && runs.isEmpty
    guard !nothingToEmit else {
        return nil
    }

    let spacingAfter = numbering == nil ? 200 : 80

    var result = WordParagraph(runs: runs)
    result.properties.style = style
    result.properties.numbering = numbering
    result.properties.spacing = Spacing(after: spacingAfter, line: 276, lineRule: .auto)
    applyQuoteStyle(to: &result.properties, quoteDepth: quoteDepth, extraIndentLevels: extraIndentLevels)
    return result
}
/// Converts a Markdown table into a Word table.
///
/// The table head becomes a header row and each row of the body a regular row.
/// The result uses single grey borders, fixed layout and automatic width.
///
/// - Parameter table: The Markdown table node.
/// - Returns: The Word table, or `nil` when the table contains no rows.
/// - Throws: Errors raised while converting cell content.
private mutating func makeTable(from table: Markdown.Table) throws -> OOXMLSwift.Table? {
    var convertedRows: [TableRow] = []

    for section in table.children {
        switch section {
        case let head as Markdown.Table.Head:
            convertedRows.append(try makeTableHeaderRow(from: head))
        case let body as Markdown.Table.Body:
            for case let bodyRow as Markdown.Table.Row in body.children {
                convertedRows.append(try makeTableRow(from: bodyRow, isHeader: false))
            }
        default:
            continue
        }
    }

    if convertedRows.isEmpty {
        return nil
    }

    var tableProperties = TableProperties()
    tableProperties.borders = TableBorders.all(Border(style: .single, size: 4, color: "BDBDBD"))
    tableProperties.layout = .fixed
    tableProperties.widthType = .auto

    return OOXMLSwift.Table(rows: convertedRows, properties: tableProperties)
}
/// Returns the plain-text content of an inline node and its descendants.
///
/// Soft breaks become a newline when `options.hardLineBreaks` is set and a
/// space otherwise; hard line breaks always become a newline. Inline HTML
/// contributes its text with the tags removed by `stripHTML`.
///
/// - Parameter markup: The node to flatten.
/// - Returns: The concatenated text.
private func plainText(from markup: Markup) -> String {
    if let textNode = markup as? Text {
        return textNode.string
    }
    if let codeNode = markup as? InlineCode {
        return codeNode.code
    }
    if markup is SoftBreak {
        return options.hardLineBreaks ? "\n" : " "
    }
    if markup is LineBreak {
        return "\n"
    }
    if let imageNode = markup as? Image {
        return imageNode.plainText
    }
    if let htmlNode = markup as? InlineHTML {
        return stripHTML(from: htmlNode.rawHTML)
    }
    // Containers: flatten every child in order.
    return markup.children.map { plainText(from: $0) }.joined()
}
merged.append(updated) + } else { + merged.append(run) + } + } + return merged + } + + private func canMerge(_ lhs: Run, _ rhs: Run) -> Bool { + lhs.rawXML == nil + && rhs.rawXML == nil + && lhs.drawing == nil + && rhs.drawing == nil + && lhs.properties == rhs.properties + } +} + +private enum FrontmatterExtractor { + static func extract(from source: String) -> (metadata: [String: String], body: String) { + let normalized = source + .replacingOccurrences(of: "\r\n", with: "\n") + .replacingOccurrences(of: "\r", with: "\n") + + guard normalized.hasPrefix("---\n") else { + return ([:], normalized) + } + + let remainder = String(normalized.dropFirst(4)) + guard let closingRange = remainder.range(of: "\n---\n") else { + return ([:], normalized) + } + + let rawMetadata = String(remainder[..= 2 { + value.removeFirst() + value.removeLast() + } + if !key.isEmpty && !value.isEmpty { + metadata[key] = value + } + } + + return (metadata, body) + } +} diff --git a/packages/md-to-word-swift/Sources/MDToWord/MetadataReader.swift b/packages/md-to-word-swift/Sources/MDToWord/MetadataReader.swift new file mode 100644 index 0000000..6fad94e --- /dev/null +++ b/packages/md-to-word-swift/Sources/MDToWord/MetadataReader.swift @@ -0,0 +1,700 @@ +import Foundation +import Yams + +// MARK: - Metadata Models (鏡像 MetadataCollector 輸出) + +/// 完整的文件 metadata +public struct DocumentMetadata { + public var version: String + public var source: SourceInfo + public var document: DocumentInfo? + public var paragraphs: [ParagraphMeta] + public var tables: [TableMeta] + public var figures: [FigureMeta] + + public init( + version: String = "1.0", + source: SourceInfo = SourceInfo(format: "docx"), + document: DocumentInfo? 
= nil, + paragraphs: [ParagraphMeta] = [], + tables: [TableMeta] = [], + figures: [FigureMeta] = [] + ) { + self.version = version + self.source = source + self.document = document + self.paragraphs = paragraphs + self.tables = tables + self.figures = figures + } +} + +public struct SourceInfo { + public var format: String + public var file: String? + + public init(format: String, file: String? = nil) { + self.format = format + self.file = file + } +} + +public struct DocumentInfo { + public var properties: PropertyMap + public var styles: [StyleMeta] + public var sections: [SectionMeta] + public var comments: [CommentMeta] + public var numbering: [NumberingDefMeta] + + public init( + properties: PropertyMap = PropertyMap(), + styles: [StyleMeta] = [], + sections: [SectionMeta] = [], + comments: [CommentMeta] = [], + numbering: [NumberingDefMeta] = [] + ) { + self.properties = properties + self.styles = styles + self.sections = sections + self.comments = comments + self.numbering = numbering + } +} + +public struct PropertyMap { + public var title: String? + public var creator: String? + public var subject: String? + public var description: String? + public var keywords: String? + public var created: String? + public var modified: String? + + public init(title: String? = nil, creator: String? = nil, + subject: String? = nil, description: String? = nil, + keywords: String? = nil, created: String? = nil, + modified: String? = nil) { + self.title = title + self.creator = creator + self.subject = subject + self.description = description + self.keywords = keywords + self.created = created + self.modified = modified + } +} + +public struct StyleMeta { + public var id: String + public var name: String + public var basedOn: String? + + public init(id: String, name: String, basedOn: String? 
= nil) { + self.id = id + self.name = name + self.basedOn = basedOn + } +} + +public struct CommentMeta { + public var id: Int + public var author: String + public var text: String + public var paragraphIndex: Int + public var parentId: Int? + public var done: Bool + + public init(id: Int, author: String, text: String, + paragraphIndex: Int, parentId: Int? = nil, done: Bool = false) { + self.id = id + self.author = author + self.text = text + self.paragraphIndex = paragraphIndex + self.parentId = parentId + self.done = done + } +} + +public struct NumberingDefMeta { + public var abstractNumId: Int + public var levels: [NumberingLevelMeta] + + public init(abstractNumId: Int, levels: [NumberingLevelMeta] = []) { + self.abstractNumId = abstractNumId + self.levels = levels + } +} + +public struct NumberingLevelMeta { + public var ilvl: Int + public var numFmt: String + public var lvlText: String + public var start: Int + public var indent: Int + public var fontName: String? + + public init(ilvl: Int, numFmt: String, lvlText: String, + start: Int = 1, indent: Int = 720, fontName: String? = nil) { + self.ilvl = ilvl + self.numFmt = numFmt + self.lvlText = lvlText + self.start = start + self.indent = indent + self.fontName = fontName + } +} + +public struct SectionMeta { + public var pageSize: PageSizeMeta? + public var orientation: String? + public var margins: MarginsMeta? + + public init(pageSize: PageSizeMeta? = nil, orientation: String? = nil, + margins: MarginsMeta? 
= nil) { + self.pageSize = pageSize + self.orientation = orientation + self.margins = margins + } +} + +public struct PageSizeMeta { + public var width: Int + public var height: Int + + public init(width: Int, height: Int) { + self.width = width + self.height = height + } +} + +public struct MarginsMeta { + public var top: Int + public var bottom: Int + public var left: Int + public var right: Int + + public init(top: Int, bottom: Int, left: Int, right: Int) { + self.top = top + self.bottom = bottom + self.left = left + self.right = right + } +} + +public struct ParagraphMeta { + public var index: Int + public var alignment: String? + public var spacing: SpacingMeta? + public var indentation: IndentationMeta? + public var commentIds: [Int]? + public var bookmarkNames: [String]? + public var keepNext: Bool? + public var keepLines: Bool? + public var pageBreakBefore: Bool? + public var border: ParagraphBorderMeta? + public var shading: ShadingMeta? + public var runs: [RunMeta] + + public init(index: Int, alignment: String? = nil, + spacing: SpacingMeta? = nil, + indentation: IndentationMeta? = nil, + commentIds: [Int]? = nil, + bookmarkNames: [String]? = nil, + keepNext: Bool? = nil, + keepLines: Bool? = nil, + pageBreakBefore: Bool? = nil, + border: ParagraphBorderMeta? = nil, + shading: ShadingMeta? = nil, + runs: [RunMeta] = []) { + self.index = index + self.alignment = alignment + self.spacing = spacing + self.indentation = indentation + self.commentIds = commentIds + self.bookmarkNames = bookmarkNames + self.keepNext = keepNext + self.keepLines = keepLines + self.pageBreakBefore = pageBreakBefore + self.border = border + self.shading = shading + self.runs = runs + } +} + +public struct ParagraphBorderMeta { + public var top: BorderStyleMeta? + public var bottom: BorderStyleMeta? + public var left: BorderStyleMeta? + public var right: BorderStyleMeta? + + public init(top: BorderStyleMeta? = nil, bottom: BorderStyleMeta? = nil, + left: BorderStyleMeta? 
= nil, right: BorderStyleMeta? = nil) { + self.top = top + self.bottom = bottom + self.left = left + self.right = right + } +} + +public struct BorderStyleMeta { + public var type: String + public var color: String + public var size: Int + + public init(type: String, color: String, size: Int) { + self.type = type + self.color = color + self.size = size + } +} + +public struct ShadingMeta { + public var fill: String + public var pattern: String? + + public init(fill: String, pattern: String? = nil) { + self.fill = fill + self.pattern = pattern + } +} + +public struct SpacingMeta { + public var before: Int? + public var after: Int? + public var line: Int? + + public init(before: Int? = nil, after: Int? = nil, line: Int? = nil) { + self.before = before + self.after = after + self.line = line + } +} + +public struct IndentationMeta { + public var left: Int? + public var right: Int? + public var firstLine: Int? + public var hanging: Int? + + public init(left: Int? = nil, right: Int? = nil, + firstLine: Int? = nil, hanging: Int? = nil) { + self.left = left + self.right = right + self.firstLine = firstLine + self.hanging = hanging + } +} + +public struct RunMeta { + public var range: [Int] // [start, end] + public var fontName: String? + public var fontSize: Int? + public var color: String? + public var highlightColor: String? + public var underlineType: String? + public var characterSpacing: CharacterSpacingMeta? + + public init(range: [Int], fontName: String? = nil, fontSize: Int? = nil, + color: String? = nil, highlightColor: String? = nil, + underlineType: String? = nil, characterSpacing: CharacterSpacingMeta? = nil) { + self.range = range + self.fontName = fontName + self.fontSize = fontSize + self.color = color + self.highlightColor = highlightColor + self.underlineType = underlineType + self.characterSpacing = characterSpacing + } +} + +public struct CharacterSpacingMeta { + public var spacing: Int? + public var position: Int? + public var kern: Int? 
+ + public init(spacing: Int? = nil, position: Int? = nil, kern: Int? = nil) { + self.spacing = spacing + self.position = position + self.kern = kern + } +} + +public struct TableMeta { + public var index: Int + public var width: Int? + public var widthType: String? + public var alignment: String? + public var layout: String? + public var rows: [TableRowMeta] + + public init(index: Int, width: Int? = nil, widthType: String? = nil, + alignment: String? = nil, layout: String? = nil, + rows: [TableRowMeta] = []) { + self.index = index + self.width = width + self.widthType = widthType + self.alignment = alignment + self.layout = layout + self.rows = rows + } +} + +public struct TableRowMeta { + public var rowIndex: Int + public var isHeader: Bool? + public var height: Int? + + public init(rowIndex: Int, isHeader: Bool? = nil, height: Int? = nil) { + self.rowIndex = rowIndex + self.isHeader = isHeader + self.height = height + } +} + +public struct FigureMeta { + public var id: String + public var file: String + public var contentType: String + public var placement: String + public var width: Int + public var height: Int + public var altText: String? + + public init(id: String, file: String, contentType: String, + placement: String, width: Int, height: Int, + altText: String? = nil) { + self.id = id + self.file = file + self.contentType = contentType + self.placement = placement + self.width = width + self.height = height + self.altText = altText + } +} + +// MARK: - MetadataReader + +/// 讀取 MetadataCollector 輸出的 .meta.yaml sidecar +public struct MetadataReader { + + public static func read(from url: URL) throws -> DocumentMetadata { + let content = try String(contentsOf: url, encoding: .utf8) + return try parse(content) + } + + public static func parse(_ yaml: String) throws -> DocumentMetadata { + guard let dict = try Yams.load(yaml: yaml) as? 
[String: Any] else { + throw MetadataError.invalidFormat("Root must be a YAML mapping") + } + + let version = dict["version"] as? String ?? "1.0" + + // Source + let source: SourceInfo + if let sourceDict = dict["source"] as? [String: Any] { + source = SourceInfo( + format: sourceDict["format"] as? String ?? "docx", + file: sourceDict["file"] as? String + ) + } else { + source = SourceInfo(format: "docx") + } + + // Document + var documentInfo: DocumentInfo? + if let docDict = dict["document"] as? [String: Any] { + documentInfo = parseDocumentInfo(docDict) + } + + // Paragraphs + var paragraphs: [ParagraphMeta] = [] + if let paraArray = dict["paragraphs"] as? [[String: Any]] { + paragraphs = paraArray.compactMap { parseParagraphMeta($0) } + } + + // Tables + var tables: [TableMeta] = [] + if let tableArray = dict["tables"] as? [[String: Any]] { + tables = tableArray.compactMap { parseTableMeta($0) } + } + + // Figures + var figures: [FigureMeta] = [] + if let figArray = dict["figures"] as? [[String: Any]] { + figures = figArray.compactMap { parseFigureMeta($0) } + } + + return DocumentMetadata( + version: version, + source: source, + document: documentInfo, + paragraphs: paragraphs, + tables: tables, + figures: figures + ) + } + + // MARK: - Private Parsers + + private static func parseDocumentInfo(_ dict: [String: Any]) -> DocumentInfo { + var properties = PropertyMap() + if let propsDict = dict["properties"] as? [String: Any] { + properties.title = propsDict["title"] as? String + properties.creator = propsDict["creator"] as? String + properties.subject = propsDict["subject"] as? String + properties.description = propsDict["description"] as? String + properties.keywords = propsDict["keywords"] as? String + properties.created = propsDict["created"] as? String + properties.modified = propsDict["modified"] as? String + } + + var styles: [StyleMeta] = [] + if let stylesArray = dict["styles"] as? 
[[String: Any]] { + styles = stylesArray.map { styleDict in + StyleMeta( + id: styleDict["id"] as? String ?? "", + name: styleDict["name"] as? String ?? "", + basedOn: styleDict["basedOn"] as? String + ) + } + } + + var sections: [SectionMeta] = [] + if let sectionsArray = dict["sections"] as? [[String: Any]] { + sections = sectionsArray.map { parseSectionMeta($0) } + } + + var comments: [CommentMeta] = [] + if let commentsArray = dict["comments"] as? [[String: Any]] { + comments = commentsArray.compactMap { parseCommentMeta($0) } + } + + var numbering: [NumberingDefMeta] = [] + if let numberingArray = dict["numbering"] as? [[String: Any]] { + numbering = numberingArray.compactMap { parseNumberingDefMeta($0) } + } + + return DocumentInfo(properties: properties, styles: styles, sections: sections, + comments: comments, numbering: numbering) + } + + private static func parseSectionMeta(_ dict: [String: Any]) -> SectionMeta { + var pageSize: PageSizeMeta? + if let psDict = dict["pageSize"] as? [String: Any] { + pageSize = PageSizeMeta( + width: psDict["width"] as? Int ?? 12240, + height: psDict["height"] as? Int ?? 15840 + ) + } + + var margins: MarginsMeta? + if let mDict = dict["margins"] as? [String: Any] { + margins = MarginsMeta( + top: mDict["top"] as? Int ?? 1440, + bottom: mDict["bottom"] as? Int ?? 1440, + left: mDict["left"] as? Int ?? 1440, + right: mDict["right"] as? Int ?? 1440 + ) + } + + return SectionMeta( + pageSize: pageSize, + orientation: dict["orientation"] as? String, + margins: margins + ) + } + + private static func parseCommentMeta(_ dict: [String: Any]) -> CommentMeta? { + guard let id = dict["id"] as? Int, + let author = dict["author"] as? String, + let text = dict["text"] as? String else { return nil } + + return CommentMeta( + id: id, + author: author, + text: text, + paragraphIndex: dict["paragraphIndex"] as? Int ?? 0, + parentId: dict["parentId"] as? Int, + done: dict["done"] as? Bool ?? 
false + ) + } + + private static func parseNumberingDefMeta(_ dict: [String: Any]) -> NumberingDefMeta? { + guard let abstractNumId = dict["abstractNumId"] as? Int else { return nil } + + var levels: [NumberingLevelMeta] = [] + if let levelsArray = dict["levels"] as? [[String: Any]] { + levels = levelsArray.compactMap { levelDict in + guard let ilvl = levelDict["ilvl"] as? Int, + let numFmt = levelDict["numFmt"] as? String, + let lvlText = levelDict["lvlText"] as? String else { return nil } + return NumberingLevelMeta( + ilvl: ilvl, + numFmt: numFmt, + lvlText: lvlText, + start: levelDict["start"] as? Int ?? 1, + indent: levelDict["indent"] as? Int ?? 720, + fontName: levelDict["fontName"] as? String + ) + } + } + + return NumberingDefMeta(abstractNumId: abstractNumId, levels: levels) + } + + private static func parseParagraphMeta(_ dict: [String: Any]) -> ParagraphMeta? { + guard let index = dict["index"] as? Int else { return nil } + + var spacing: SpacingMeta? + if let sDict = dict["spacing"] as? [String: Any] { + spacing = SpacingMeta( + before: sDict["before"] as? Int, + after: sDict["after"] as? Int, + line: sDict["line"] as? Int + ) + } + + var indentation: IndentationMeta? + if let iDict = dict["indentation"] as? [String: Any] { + indentation = IndentationMeta( + left: iDict["left"] as? Int, + right: iDict["right"] as? Int, + firstLine: iDict["firstLine"] as? Int, + hanging: iDict["hanging"] as? Int + ) + } + + var runs: [RunMeta] = [] + if let runsArray = dict["runs"] as? [[String: Any]] { + runs = runsArray.compactMap { parseRunMeta($0) } + } + + let commentIds = (dict["commentIds"] as? [Any])?.compactMap { $0 as? Int } + + var border: ParagraphBorderMeta? + if let borderDict = dict["border"] as? 
[String: Any] { + border = ParagraphBorderMeta( + top: parseBorderStyleMeta(borderDict["top"]), + bottom: parseBorderStyleMeta(borderDict["bottom"]), + left: parseBorderStyleMeta(borderDict["left"]), + right: parseBorderStyleMeta(borderDict["right"]) + ) + } + + var shading: ShadingMeta? + if let shadingDict = dict["shading"] as? [String: Any] { + shading = ShadingMeta( + fill: shadingDict["fill"] as? String ?? "", + pattern: shadingDict["pattern"] as? String + ) + } + + return ParagraphMeta( + index: index, + alignment: dict["alignment"] as? String, + spacing: spacing, + indentation: indentation, + commentIds: commentIds, + bookmarkNames: dict["bookmarkNames"] as? [String], + keepNext: dict["keepNext"] as? Bool, + keepLines: dict["keepLines"] as? Bool, + pageBreakBefore: dict["pageBreakBefore"] as? Bool, + border: border, + shading: shading, + runs: runs + ) + } + + private static func parseBorderStyleMeta(_ value: Any?) -> BorderStyleMeta? { + guard let dict = value as? [String: Any], + let type = dict["type"] as? String, + let color = dict["color"] as? String, + let size = dict["size"] as? Int else { return nil } + return BorderStyleMeta(type: type, color: color, size: size) + } + + private static func parseRunMeta(_ dict: [String: Any]) -> RunMeta? { + guard let range = (dict["range"] as? [Any])?.compactMap({ $0 as? Int }), + range.count == 2 else { return nil } + + var color = dict["color"] as? String + // 移除 "#" 前綴(MetadataCollector 輸出 "#FF0000" 格式) + if let c = color, c.hasPrefix("#") { + color = String(c.dropFirst()) + } + + var characterSpacing: CharacterSpacingMeta? + if let csDict = dict["characterSpacing"] as? [String: Any] { + characterSpacing = CharacterSpacingMeta( + spacing: csDict["spacing"] as? Int, + position: csDict["position"] as? Int, + kern: csDict["kern"] as? Int + ) + } + + return RunMeta( + range: range, + fontName: dict["font"] as? String, + fontSize: dict["fontSize"] as? Int, + color: color, + highlightColor: dict["highlight"] as? 
String, + underlineType: dict["underline"] as? String, + characterSpacing: characterSpacing + ) + } + + private static func parseTableMeta(_ dict: [String: Any]) -> TableMeta? { + guard let index = dict["index"] as? Int else { return nil } + + var rows: [TableRowMeta] = [] + if let rowsArray = dict["rows"] as? [[String: Any]] { + rows = rowsArray.compactMap { rowDict in + guard let rowIndex = rowDict["rowIndex"] as? Int else { return nil } + return TableRowMeta( + rowIndex: rowIndex, + isHeader: rowDict["isHeader"] as? Bool, + height: rowDict["height"] as? Int + ) + } + } + + return TableMeta( + index: index, + width: dict["width"] as? Int, + widthType: dict["widthType"] as? String, + alignment: dict["alignment"] as? String, + layout: dict["layout"] as? String, + rows: rows + ) + } + + private static func parseFigureMeta(_ dict: [String: Any]) -> FigureMeta? { + guard let id = dict["id"] as? String, + let file = dict["file"] as? String else { return nil } + + return FigureMeta( + id: id, + file: file, + contentType: dict["contentType"] as? String ?? "image/png", + placement: dict["placement"] as? String ?? "inline", + width: dict["width"] as? Int ?? 0, + height: dict["height"] as? Int ?? 0, + altText: dict["altText"] as? String + ) + } +} + +// MARK: - Errors + +public enum MetadataError: Error, LocalizedError { + case invalidFormat(String) + case fileNotFound(String) + + public var errorDescription: String? 
{ + switch self { + case .invalidFormat(let msg): return "Invalid metadata format: \(msg)" + case .fileNotFound(let path): return "Metadata file not found: \(path)" + } + } +} diff --git a/packages/md-to-word-swift/Tests/MDToWordTests/E2ETests.swift b/packages/md-to-word-swift/Tests/MDToWordTests/E2ETests.swift new file mode 100644 index 0000000..b90a11f --- /dev/null +++ b/packages/md-to-word-swift/Tests/MDToWordTests/E2ETests.swift @@ -0,0 +1,332 @@ +import XCTest +@testable import MDToWord +import OOXMLSwift +import WordToMDSwift +import CommonConverterSwift + +/// End-to-End round-trip tests starting from .docx files. +/// +/// Pipeline: +/// ``` +/// DocxWriter.write(doc₀) → .docx +/// → DocxReader.read() → WordDocument₁ +/// → WordConverter → MD₁ +/// → MarkdownToWordConverter → WordDocument₂ +/// → WordConverter → MD₂ +/// → assert MD₁ == MD₂ (retraction) +/// ``` +final class E2ETests: XCTestCase { + + private var tempDir: URL! + private let forward = WordConverter() + private let reverse = MarkdownToWordConverter() + + override func setUp() { + super.setUp() + tempDir = FileManager.default.temporaryDirectory + .appendingPathComponent("E2ETests-\(UUID().uuidString)") + try? FileManager.default.createDirectory(at: tempDir, withIntermediateDirectories: true) + } + + override func tearDown() { + try? FileManager.default.removeItem(at: tempDir) + super.tearDown() + } + + // MARK: - Helpers + + /// Full E2E pipeline: WordDocument → .docx → WordDocument → MD₁ → Word → MD₂ + /// Returns (MD₁, MD₂) for comparison. + private func e2eRoundTrip( + _ doc: WordDocument, + options: ConversionOptions = .default + ) throws -> (md1: String, md2: String) { + // 1. Write to .docx + let docxURL = tempDir.appendingPathComponent("test-\(UUID().uuidString).docx") + try DocxWriter.write(doc, to: docxURL) + + // 2. Read back + let readDoc = try DocxReader.read(from: docxURL) + + // 3. 
Convert to MD₁ + let md1 = try forward.convertToString(document: readDoc, options: options) + + // 4. Convert MD₁ → WordDocument + let wordDoc2 = try reverse.convertMarkdown(md1) + + // 5. Convert back to MD₂ + let md2 = try forward.convertToString(document: wordDoc2, options: options) + + return (normalize(md1), normalize(md2)) + } + + /// Normalize markdown for comparison + private func normalize(_ md: String) -> String { + let lines = md.split(separator: "\n", omittingEmptySubsequences: false) + .map { $0.trimmingCharacters(in: .whitespaces) } + + var result: [String] = [] + var lastWasEmpty = false + for line in lines { + if line.isEmpty { + if !lastWasEmpty { + result.append(line) + } + lastWasEmpty = true + } else { + result.append(line) + lastWasEmpty = false + } + } + + return result.joined(separator: "\n") + .trimmingCharacters(in: .whitespacesAndNewlines) + } + + // MARK: - E2E Tests + + func testE2E_SimpleParagraph() throws { + var doc = WordDocument() + doc.body.children = [.paragraph(Paragraph(text: "Hello World"))] + + let (md1, md2) = try e2eRoundTrip(doc) + XCTAssertEqual(md1, md2) + XCTAssertTrue(md1.contains("Hello World")) + } + + func testE2E_Headings() throws { + var doc = WordDocument() + doc.styles = Style.defaultStyles + + for level in 1...3 { + var props = ParagraphProperties() + props.style = "Heading\(level)" + doc.body.children.append(.paragraph(Paragraph(text: "Heading \(level)", properties: props))) + } + + let (md1, md2) = try e2eRoundTrip(doc) + XCTAssertEqual(md1, md2) + XCTAssertTrue(md1.contains("# Heading 1")) + XCTAssertTrue(md1.contains("## Heading 2")) + XCTAssertTrue(md1.contains("### Heading 3")) + } + + func testE2E_Formatting() throws { + var doc = WordDocument() + var boldProps = RunProperties() + boldProps.bold = true + var italicProps = RunProperties() + italicProps.italic = true + + doc.body.children = [.paragraph(Paragraph(runs: [ + Run(text: "bold", properties: boldProps), + Run(text: " normal "), + Run(text: 
"italic", properties: italicProps) + ]))] + + let (md1, md2) = try e2eRoundTrip(doc) + XCTAssertEqual(md1, md2) + XCTAssertTrue(md1.contains("**bold**")) + // italic may be rendered as *italic* or _italic_ + XCTAssertTrue(md1.contains("italic")) + } + + func testE2E_Lists() throws { + var doc = WordDocument() + let bulletId = doc.numbering.createBulletList() + let numId = doc.numbering.createNumberedList() + + // Bullet items + for text in ["Apple", "Banana"] { + var props = ParagraphProperties() + props.numbering = NumberingInfo(numId: bulletId, level: 0) + doc.body.children.append(.paragraph(Paragraph(text: text, properties: props))) + } + + // Separator + doc.body.children.append(.paragraph(Paragraph(text: ""))) + + // Numbered items + for text in ["First", "Second"] { + var props = ParagraphProperties() + props.numbering = NumberingInfo(numId: numId, level: 0) + doc.body.children.append(.paragraph(Paragraph(text: text, properties: props))) + } + + let (md1, md2) = try e2eRoundTrip(doc) + XCTAssertEqual(md1, md2) + XCTAssertTrue(md1.contains("Apple")) + XCTAssertTrue(md1.contains("First")) + } + + func testE2E_Table() throws { + var doc = WordDocument() + var headerRowProps = TableRowProperties() + headerRowProps.isHeader = true + let table = Table(rows: [ + TableRow(cells: [TableCell(text: "Name"), TableCell(text: "Value")], properties: headerRowProps), + TableRow(cells: [TableCell(text: "A"), TableCell(text: "1")]), + TableRow(cells: [TableCell(text: "B"), TableCell(text: "2")]) + ]) + doc.body.children = [.table(table)] + + let (md1, md2) = try e2eRoundTrip(doc) + XCTAssertEqual(md1, md2) + XCTAssertTrue(md1.contains("Name")) + XCTAssertTrue(md1.contains("|")) + } + + func testE2E_Footnotes() throws { + var doc = WordDocument() + doc.footnotes.footnotes.append(Footnote(id: 1, text: "A footnote explanation", paragraphIndex: 0)) + var para = Paragraph(text: "Text with footnote") + para.footnoteIds = [1] + doc.body.children = [.paragraph(para)] + + // Note: 
DocxReader doesn't parse footnotes, so MD₁ won't have footnotes. + // This test verifies the pipeline doesn't crash and produces consistent output. + let (md1, md2) = try e2eRoundTrip(doc) + XCTAssertEqual(md1, md2) + } + + func testE2E_MixedContent() throws { + var doc = WordDocument() + doc.styles = Style.defaultStyles + + // Heading + var h1Props = ParagraphProperties() + h1Props.style = "Heading1" + doc.body.children.append(.paragraph(Paragraph(text: "Document Title", properties: h1Props))) + + // Normal paragraph with formatting + var boldProps = RunProperties() + boldProps.bold = true + doc.body.children.append(.paragraph(Paragraph(runs: [ + Run(text: "This is "), + Run(text: "important", properties: boldProps), + Run(text: " text.") + ]))) + + // Table + let table = Table(rows: [ + TableRow(cells: [TableCell(text: "Key"), TableCell(text: "Value")]), + TableRow(cells: [TableCell(text: "A"), TableCell(text: "1")]) + ]) + doc.body.children.append(.table(table)) + + // Another paragraph + doc.body.children.append(.paragraph(Paragraph(text: "End of document."))) + + let (md1, md2) = try e2eRoundTrip(doc) + XCTAssertEqual(md1, md2) + XCTAssertTrue(md1.contains("# Document Title")) + XCTAssertTrue(md1.contains("**important**")) + XCTAssertTrue(md1.contains("End of document.")) + } + + // MARK: - Tier 3 Metadata Pipeline + + func testE2E_Tier3Metadata() throws { + var doc = WordDocument() + doc.properties.title = "Test Document" + doc.properties.creator = "Author" + + // Paragraph with alignment (Layer C property) + var props = ParagraphProperties() + props.alignment = .center + doc.body.children = [.paragraph(Paragraph(text: "Centered text", properties: props))] + + // Write .docx + let docxURL = tempDir.appendingPathComponent("tier3.docx") + try DocxWriter.write(doc, to: docxURL) + + // Read back + let readDoc = try DocxReader.read(from: docxURL) + + // Convert to MD + YAML (Tier 3) + let metaURL = tempDir.appendingPathComponent("test.meta.yaml") + var options = 
ConversionOptions(fidelity: .marker, metadataOutput: metaURL) + let md = try forward.convertToString(document: readDoc, options: options) + + // Verify YAML was created + XCTAssertTrue(FileManager.default.fileExists(atPath: metaURL.path)) + + // Verify MD content + XCTAssertTrue(md.contains("Centered text")) + + // Verify YAML content + let yamlContent = try String(contentsOf: metaURL, encoding: .utf8) + XCTAssertTrue(yamlContent.contains("version:")) + XCTAssertTrue(yamlContent.contains("alignment: center")) + } + + func testE2E_Tier3MetadataRestoration() throws { + var doc = WordDocument() + doc.styles = Style.defaultStyles + + // Create a document with Layer C properties + var runProps = RunProperties() + runProps.fontName = "Arial" + runProps.fontSize = 28 + runProps.color = "FF0000" + var paraProps = ParagraphProperties() + paraProps.alignment = .center + doc.body.children = [.paragraph(Paragraph( + runs: [Run(text: "Styled text", properties: runProps)], + properties: paraProps + ))] + + // Write .docx + let docxURL = tempDir.appendingPathComponent("tier3-restore.docx") + try DocxWriter.write(doc, to: docxURL) + + // Read back + let readDoc = try DocxReader.read(from: docxURL) + + // Convert to MD + YAML + let metaURL = tempDir.appendingPathComponent("restore.meta.yaml") + var options = ConversionOptions(fidelity: .marker, metadataOutput: metaURL) + let md = try forward.convertToString(document: readDoc, options: options) + + // Now restore: MD + YAML → Word + let metadata = try MetadataReader.read(from: metaURL) + let restoredDoc = try reverse.convertMarkdown(md) + + // Convert restored doc back to MD + YAML + let metaURL2 = tempDir.appendingPathComponent("restore2.meta.yaml") + var options2 = ConversionOptions(fidelity: .marker, metadataOutput: metaURL2) + let md2 = try forward.convertToString(document: restoredDoc, options: options2) + + // Verify MD is consistent + XCTAssertEqual(normalize(md), normalize(md2)) + + // Verify YAML alignment was restored + 
let yaml2 = try String(contentsOf: metaURL2, encoding: .utf8) + XCTAssertTrue(yaml2.contains("alignment: center")) + } + + // MARK: - Edge Cases + + func testE2E_EmptyDocument() throws { + let doc = WordDocument() + let docxURL = tempDir.appendingPathComponent("empty.docx") + try DocxWriter.write(doc, to: docxURL) + + let readDoc = try DocxReader.read(from: docxURL) + let md = try forward.convertToString(document: readDoc) + + // Empty document should produce empty or minimal markdown + XCTAssertEqual(md.trimmingCharacters(in: .whitespacesAndNewlines), "") + } + + func testE2E_XMLSpecialCharacters() throws { + // Use characters that survive both XML and Markdown round-trip + var doc = WordDocument() + doc.body.children = [.paragraph(Paragraph(text: "Price: $100 & tax = total"))] + + let (md1, md2) = try e2eRoundTrip(doc) + XCTAssertEqual(md1, md2) + XCTAssertTrue(md1.contains("$100")) + XCTAssertTrue(md1.contains("&")) + } +} diff --git a/packages/md-to-word-swift/Tests/MDToWordTests/MarkdownToWordConverterTests.swift b/packages/md-to-word-swift/Tests/MDToWordTests/MarkdownToWordConverterTests.swift new file mode 100644 index 0000000..4ce9dcd --- /dev/null +++ b/packages/md-to-word-swift/Tests/MDToWordTests/MarkdownToWordConverterTests.swift @@ -0,0 +1,155 @@ +#if canImport(XCTest) +import XCTest +@testable import MDToWord + +final class MarkdownToWordConverterTests: XCTestCase { + private let converter = MarkdownToWordConverter() + + func testFrontmatterHeadingAndInlineFormatting() throws { + let markdown = """ + --- + title: "Frontmatter Title" + author: Jane Doe + description: Converted from test + --- + + # Visible Heading + + Hello **bold** _italic_ ~~gone~~ `code`. + """ + + let directory = try makeWorkspace() + defer { try? 
FileManager.default.removeItem(at: directory) } + + let output = try convert(markdown: markdown, in: directory) + let documentXML = try archiveEntry(named: "word/document.xml", in: output) + let coreXML = try archiveEntry(named: "docProps/core.xml", in: output) + + XCTAssertTrue(documentXML.contains("Heading1"), "Got: \(documentXML)") + XCTAssertTrue(documentXML.contains("Visible Heading"), "Got: \(documentXML)") + XCTAssertTrue(documentXML.contains(""), "Got: \(documentXML)") + XCTAssertTrue(documentXML.contains(""), "Got: \(documentXML)") + XCTAssertTrue(documentXML.contains(""), "Got: \(documentXML)") + XCTAssertTrue(documentXML.contains("Menlo"), "Got: \(documentXML)") + XCTAssertTrue(coreXML.contains("Frontmatter Title"), "Got: \(coreXML)") + XCTAssertTrue(coreXML.contains("Jane Doe"), "Got: \(coreXML)") + } + + func testListsCreateNumberingDefinitions() throws { + let markdown = """ + - One + - Two + + 1. First + 2. Second + """ + + let directory = try makeWorkspace() + defer { try? FileManager.default.removeItem(at: directory) } + + let output = try convert(markdown: markdown, in: directory) + let documentXML = try archiveEntry(named: "word/document.xml", in: output) + let numberingXML = try archiveEntry(named: "word/numbering.xml", in: output) + + XCTAssertTrue(documentXML.contains(""), "Got: \(documentXML)") + XCTAssertTrue(numberingXML.contains("w:numFmt w:val=\"bullet\""), "Got: \(numberingXML)") + XCTAssertTrue(numberingXML.contains("w:numFmt w:val=\"decimal\""), "Got: \(numberingXML)") + } + + func testTablesRenderAsWordTables() throws { + let markdown = """ + | Name | Value | + |------|-------| + | A | 1 | + | B | 2 | + """ + + let directory = try makeWorkspace() + defer { try? 
FileManager.default.removeItem(at: directory) } + + let output = try convert(markdown: markdown, in: directory) + let documentXML = try archiveEntry(named: "word/document.xml", in: output) + + XCTAssertTrue(documentXML.contains(""), "Got: \(documentXML)") + XCTAssertTrue(documentXML.contains("Name"), "Got: \(documentXML)") + XCTAssertTrue(documentXML.contains("Value"), "Got: \(documentXML)") + XCTAssertTrue(documentXML.contains("A"), "Got: \(documentXML)") + XCTAssertTrue(documentXML.contains("B"), "Got: \(documentXML)") + } + + func testLinksCreateRelationships() throws { + let markdown = "See [Docs](https://example.com/docs) for details." + + let directory = try makeWorkspace() + defer { try? FileManager.default.removeItem(at: directory) } + + let output = try convert(markdown: markdown, in: directory) + let documentXML = try archiveEntry(named: "word/document.xml", in: output) + let relsXML = try archiveEntry(named: "word/_rels/document.xml.rels", in: output) + + XCTAssertTrue(documentXML.contains(" URL { + let directory = FileManager.default.temporaryDirectory + .appendingPathComponent("md-to-word-tests-\(UUID().uuidString)", isDirectory: true) + try FileManager.default.createDirectory(at: directory, withIntermediateDirectories: true) + return directory + } + + private func convert(markdown: String, in directory: URL) throws -> URL { + let input = directory.appendingPathComponent("fixture.md") + let output = directory.appendingPathComponent("fixture.docx") + try markdown.write(to: input, atomically: true, encoding: .utf8) + try converter.convertToFile(input: input, output: output) + XCTAssertTrue(FileManager.default.fileExists(atPath: output.path)) + return output + } + + private func archiveEntry(named path: String, in archiveURL: URL) throws -> String { + let process = Process() + process.executableURL = URL(fileURLWithPath: "/usr/bin/unzip") + process.arguments = ["-p", archiveURL.path, path] + + let stdout = Pipe() + let stderr = Pipe() + 
process.standardOutput = stdout + process.standardError = stderr + try process.run() + process.waitUntilExit() + + let data = stdout.fileHandleForReading.readDataToEndOfFile() + let errorData = stderr.fileHandleForReading.readDataToEndOfFile() + + guard process.terminationStatus == 0 else { + let errorText = String(decoding: errorData, as: UTF8.self) + XCTFail("Failed to read archive entry \(path): \(errorText)") + return "" + } + + return String(decoding: data, as: UTF8.self) + } +} +#endif diff --git a/packages/md-to-word-swift/Tests/MDToWordTests/MetadataReaderTests.swift b/packages/md-to-word-swift/Tests/MDToWordTests/MetadataReaderTests.swift new file mode 100644 index 0000000..99ffef7 --- /dev/null +++ b/packages/md-to-word-swift/Tests/MDToWordTests/MetadataReaderTests.swift @@ -0,0 +1,438 @@ +import XCTest +@testable import MDToWord + +final class MetadataReaderTests: XCTestCase { + + func testBasicMetadata() throws { + let yaml = """ + version: "1.0" + source: + format: "docx" + file: "test.docx" + """ + + let meta = try MetadataReader.parse(yaml) + XCTAssertEqual(meta.version, "1.0") + XCTAssertEqual(meta.source.format, "docx") + XCTAssertEqual(meta.source.file, "test.docx") + } + + func testDocumentProperties() throws { + let yaml = """ + version: "1.0" + source: + format: "docx" + + document: + properties: + title: "Test Document" + creator: "Author Name" + subject: "Test Subject" + """ + + let meta = try MetadataReader.parse(yaml) + XCTAssertNotNil(meta.document) + XCTAssertEqual(meta.document?.properties.title, "Test Document") + XCTAssertEqual(meta.document?.properties.creator, "Author Name") + XCTAssertEqual(meta.document?.properties.subject, "Test Subject") + } + + func testDocumentPropertiesKeywords() throws { + let yaml = """ + version: "1.0" + source: + format: "docx" + + document: + properties: + title: "Test" + keywords: "swift, docx, converter" + """ + + let meta = try MetadataReader.parse(yaml) + 
XCTAssertEqual(meta.document?.properties.keywords, "swift, docx, converter") + } + + func testDocumentPropertiesTimestamps() throws { + let yaml = """ + version: "1.0" + source: + format: "docx" + + document: + properties: + title: "Test" + created: "2024-01-15T10:30:00Z" + modified: "2024-06-20T15:45:00Z" + """ + + let meta = try MetadataReader.parse(yaml) + XCTAssertEqual(meta.document?.properties.created, "2024-01-15T10:30:00Z") + XCTAssertEqual(meta.document?.properties.modified, "2024-06-20T15:45:00Z") + } + + func testStyles() throws { + let yaml = """ + version: "1.0" + source: + format: "docx" + + document: + properties: + title: "Test" + styles: + - id: "Heading1" + name: "heading 1" + basedOn: "Normal" + - id: "Normal" + name: "Normal" + """ + + let meta = try MetadataReader.parse(yaml) + XCTAssertEqual(meta.document?.styles.count, 2) + XCTAssertEqual(meta.document?.styles[0].id, "Heading1") + XCTAssertEqual(meta.document?.styles[0].basedOn, "Normal") + } + + func testSectionProperties() throws { + let yaml = """ + version: "1.0" + source: + format: "docx" + + document: + properties: + title: "Test" + sections: + - pageSize: + width: 12240 + height: 15840 + orientation: portrait + margins: + top: 1440 + bottom: 1440 + left: 1440 + right: 1440 + """ + + let meta = try MetadataReader.parse(yaml) + let section = meta.document?.sections.first + XCTAssertNotNil(section) + XCTAssertEqual(section?.pageSize?.width, 12240) + XCTAssertEqual(section?.pageSize?.height, 15840) + XCTAssertEqual(section?.orientation, "portrait") + XCTAssertEqual(section?.margins?.top, 1440) + } + + // MARK: - Comment Content + + func testCommentMeta() throws { + let yaml = """ + version: "1.0" + source: + format: "docx" + + document: + properties: + title: "Test" + comments: + - id: 1 + author: "Alice" + text: "Review this" + paragraphIndex: 0 + - id: 2 + author: "Bob" + text: "Done" + paragraphIndex: 0 + parentId: 1 + done: true + """ + + let meta = try MetadataReader.parse(yaml) + 
XCTAssertEqual(meta.document?.comments.count, 2) + + let comment1 = meta.document?.comments[0] + XCTAssertEqual(comment1?.id, 1) + XCTAssertEqual(comment1?.author, "Alice") + XCTAssertEqual(comment1?.text, "Review this") + XCTAssertNil(comment1?.parentId) + XCTAssertEqual(comment1?.done, false) + + let comment2 = meta.document?.comments[1] + XCTAssertEqual(comment2?.parentId, 1) + XCTAssertEqual(comment2?.done, true) + } + + // MARK: - Numbering Definitions + + func testNumberingDefMeta() throws { + let yaml = """ + version: "1.0" + source: + format: "docx" + + document: + properties: + title: "Test" + numbering: + - abstractNumId: 0 + levels: + - ilvl: 0 + numFmt: bullet + lvlText: "•" + start: 1 + indent: 720 + fontName: "Symbol" + - ilvl: 1 + numFmt: bullet + lvlText: "o" + start: 1 + indent: 1440 + """ + + let meta = try MetadataReader.parse(yaml) + XCTAssertEqual(meta.document?.numbering.count, 1) + + let def = meta.document?.numbering[0] + XCTAssertEqual(def?.abstractNumId, 0) + XCTAssertEqual(def?.levels.count, 2) + XCTAssertEqual(def?.levels[0].numFmt, "bullet") + XCTAssertEqual(def?.levels[0].fontName, "Symbol") + XCTAssertEqual(def?.levels[1].indent, 1440) + } + + // MARK: - Paragraph Meta + + func testParagraphMeta() throws { + let yaml = """ + version: "1.0" + source: + format: "docx" + + paragraphs: + - index: 0 + alignment: center + spacing: { before: 240, after: 0 } + - index: 3 + alignment: right + indentation: { left: 720, firstLine: 360 } + runs: + - range: [0, 5] + font: "Arial" + fontSize: 24 + color: "#FF0000" + """ + + let meta = try MetadataReader.parse(yaml) + XCTAssertEqual(meta.paragraphs.count, 2) + + let para0 = meta.paragraphs[0] + XCTAssertEqual(para0.index, 0) + XCTAssertEqual(para0.alignment, "center") + XCTAssertEqual(para0.spacing?.before, 240) + XCTAssertEqual(para0.spacing?.after, 0) + + let para3 = meta.paragraphs[1] + XCTAssertEqual(para3.index, 3) + XCTAssertEqual(para3.alignment, "right") + 
XCTAssertEqual(para3.indentation?.left, 720) + XCTAssertEqual(para3.indentation?.firstLine, 360) + + XCTAssertEqual(para3.runs.count, 1) + XCTAssertEqual(para3.runs[0].range, [0, 5]) + XCTAssertEqual(para3.runs[0].fontName, "Arial") + XCTAssertEqual(para3.runs[0].fontSize, 24) + XCTAssertEqual(para3.runs[0].color, "FF0000") // "#" 已被移除 + } + + // MARK: - Paragraph Advanced Properties + + func testParagraphKeepNext() throws { + let yaml = """ + version: "1.0" + source: + format: "docx" + + paragraphs: + - index: 0 + keepNext: true + """ + + let meta = try MetadataReader.parse(yaml) + XCTAssertEqual(meta.paragraphs[0].keepNext, true) + } + + func testParagraphKeepLines() throws { + let yaml = """ + version: "1.0" + source: + format: "docx" + + paragraphs: + - index: 0 + keepLines: true + """ + + let meta = try MetadataReader.parse(yaml) + XCTAssertEqual(meta.paragraphs[0].keepLines, true) + } + + func testParagraphPageBreakBefore() throws { + let yaml = """ + version: "1.0" + source: + format: "docx" + + paragraphs: + - index: 0 + pageBreakBefore: true + """ + + let meta = try MetadataReader.parse(yaml) + XCTAssertEqual(meta.paragraphs[0].pageBreakBefore, true) + } + + func testParagraphBorder() throws { + let yaml = """ + version: "1.0" + source: + format: "docx" + + paragraphs: + - index: 0 + border: + top: { type: single, color: "FF0000", size: 8 } + bottom: { type: single, color: "FF0000", size: 8 } + """ + + let meta = try MetadataReader.parse(yaml) + let border = meta.paragraphs[0].border + XCTAssertNotNil(border) + XCTAssertEqual(border?.top?.type, "single") + XCTAssertEqual(border?.top?.color, "FF0000") + XCTAssertEqual(border?.top?.size, 8) + XCTAssertNotNil(border?.bottom) + XCTAssertNil(border?.left) + } + + func testParagraphShading() throws { + let yaml = """ + version: "1.0" + source: + format: "docx" + + paragraphs: + - index: 0 + shading: { fill: "FFFF00", pattern: clear } + """ + + let meta = try MetadataReader.parse(yaml) + let shading = 
meta.paragraphs[0].shading + XCTAssertNotNil(shading) + XCTAssertEqual(shading?.fill, "FFFF00") + XCTAssertEqual(shading?.pattern, "clear") + } + + // MARK: - CharacterSpacing in RunMeta + + func testRunCharacterSpacing() throws { + let yaml = """ + version: "1.0" + source: + format: "docx" + + paragraphs: + - index: 0 + runs: + - range: [0, 5] + characterSpacing: { spacing: 20, position: 5, kern: 16 } + """ + + let meta = try MetadataReader.parse(yaml) + let run = meta.paragraphs[0].runs[0] + XCTAssertNotNil(run.characterSpacing) + XCTAssertEqual(run.characterSpacing?.spacing, 20) + XCTAssertEqual(run.characterSpacing?.position, 5) + XCTAssertEqual(run.characterSpacing?.kern, 16) + } + + // MARK: - Table Meta + + func testTableMeta() throws { + let yaml = """ + version: "1.0" + source: + format: "docx" + + tables: + - index: 1 + width: 9000 + widthType: dxa + alignment: center + layout: fixed + rows: + - rowIndex: 0 + isHeader: true + height: 400 + """ + + let meta = try MetadataReader.parse(yaml) + XCTAssertEqual(meta.tables.count, 1) + + let table = meta.tables[0] + XCTAssertEqual(table.index, 1) + XCTAssertEqual(table.width, 9000) + XCTAssertEqual(table.widthType, "dxa") + XCTAssertEqual(table.alignment, "center") + XCTAssertEqual(table.layout, "fixed") + + XCTAssertEqual(table.rows.count, 1) + XCTAssertEqual(table.rows[0].isHeader, true) + XCTAssertEqual(table.rows[0].height, 400) + } + + // MARK: - Figure Meta + + func testFigureMeta() throws { + let yaml = """ + version: "1.0" + source: + format: "docx" + + figures: + - id: "rId5" + file: "figures/image1.png" + contentType: "image/png" + placement: inline + width: 4572000 + height: 3429000 + altText: "A test image" + """ + + let meta = try MetadataReader.parse(yaml) + XCTAssertEqual(meta.figures.count, 1) + + let fig = meta.figures[0] + XCTAssertEqual(fig.id, "rId5") + XCTAssertEqual(fig.file, "figures/image1.png") + XCTAssertEqual(fig.contentType, "image/png") + XCTAssertEqual(fig.placement, "inline") + 
XCTAssertEqual(fig.width, 4572000) + XCTAssertEqual(fig.height, 3429000) + XCTAssertEqual(fig.altText, "A test image") + } + + func testEmptyMetadata() throws { + let yaml = """ + version: "1.0" + source: + format: "docx" + """ + + let meta = try MetadataReader.parse(yaml) + XCTAssertNil(meta.document) + XCTAssertTrue(meta.paragraphs.isEmpty) + XCTAssertTrue(meta.tables.isEmpty) + XCTAssertTrue(meta.figures.isEmpty) + } +} diff --git a/packages/md-to-word-swift/Tests/MDToWordTests/RoundTripTests.swift b/packages/md-to-word-swift/Tests/MDToWordTests/RoundTripTests.swift new file mode 100644 index 0000000..9f2ebdb --- /dev/null +++ b/packages/md-to-word-swift/Tests/MDToWordTests/RoundTripTests.swift @@ -0,0 +1,660 @@ +import XCTest +@testable import MDToWord +import OOXMLSwift +import WordToMDSwift +import CommonConverterSwift + +/// Bijection 驗證: +/// A: convert(convert⁻¹(w)) ≡ w — Word → MD → Word → MD,兩次 MD 一致 +/// B: convert⁻¹(convert(md)) ≡ md — MD → Word → MD,round-trip 後 MD 一致 +/// C: g ∘ f = id_{W*} — WordDocument 等價驗證(需 Equatable) +final class RoundTripTests: XCTestCase { + + let forward = WordConverter() // Word → MD + let reverse = MarkdownToWordConverter() // MD → Word + + // MARK: - Helpers + + /// Word → MD string(Tier 1,純 markdown) + func toMarkdown(_ doc: WordDocument) throws -> String { + try forward.convertToString(document: doc, options: .default) + } + + /// Word → MD string(Layer B,含 HTML extensions) + func toMarkdownHTML(_ doc: WordDocument) throws -> String { + var options = ConversionOptions.default + options.useHTMLExtensions = true + return try forward.convertToString(document: doc, options: options) + } + + /// 正規化 markdown:去尾空白、統一換行、壓縮連續空行、去頭尾空行 + func normalize(_ md: String) -> String { + let lines = md.split(separator: "\n", omittingEmptySubsequences: false) + .map { $0.trimmingCharacters(in: .whitespaces) } + + // 壓縮連續空行為單一空行 + var result: [String] = [] + var lastWasEmpty = false + for line in lines { + if line.isEmpty { + if 
!lastWasEmpty { + result.append(line) + } + lastWasEmpty = true + } else { + result.append(line) + lastWasEmpty = false + } + } + + return result.joined(separator: "\n") + .trimmingCharacters(in: .whitespacesAndNewlines) + } + + // ========================================================= + // MARK: - Direction A: Word → MD → Word → MD + // ========================================================= + + func testRoundTripA_BasicParagraph() throws { + // 1. 建構 WordDocument + var doc = WordDocument() + var para = Paragraph() + para.runs = [Run(text: "Hello world")] + doc.body.children.append(.paragraph(para)) + + // 2. Word → MD + let md1 = try toMarkdown(doc) + + // 3. MD → Word' + let doc2 = try reverse.convertMarkdown(md1) + + // 4. Word' → MD' + let md2 = try toMarkdown(doc2) + + XCTAssertEqual(normalize(md1), normalize(md2)) + } + + func testRoundTripA_Heading() throws { + var doc = WordDocument() + var para = Paragraph() + para.properties.style = "Heading1" + para.runs = [Run(text: "Title")] + doc.body.children.append(.paragraph(para)) + + let md1 = try toMarkdown(doc) + let doc2 = try reverse.convertMarkdown(md1) + let md2 = try toMarkdown(doc2) + + XCTAssertEqual(normalize(md1), normalize(md2)) + } + + func testRoundTripA_BoldItalic() throws { + var doc = WordDocument() + var para = Paragraph() + + let normalRun = Run(text: "This is ") + var boldRun = Run(text: "bold") + boldRun.properties.bold = true + let italicRun = Run(text: " and ") + var biRun = Run(text: "both") + biRun.properties.bold = true + biRun.properties.italic = true + let tail = Run(text: " text.") + + para.runs = [normalRun, boldRun, italicRun, biRun, tail] + doc.body.children.append(.paragraph(para)) + + let md1 = try toMarkdown(doc) + let doc2 = try reverse.convertMarkdown(md1) + let md2 = try toMarkdown(doc2) + + XCTAssertEqual(normalize(md1), normalize(md2)) + } + + func testRoundTripA_UnorderedList() throws { + var doc = WordDocument() + let numId = doc.numbering.createBulletList() + + for 
text in ["Apple", "Banana", "Cherry"] { + var para = Paragraph() + para.runs = [Run(text: text)] + para.properties.numbering = NumberingInfo(numId: numId, level: 0) + doc.body.children.append(.paragraph(para)) + } + + let md1 = try toMarkdown(doc) + let doc2 = try reverse.convertMarkdown(md1) + let md2 = try toMarkdown(doc2) + + XCTAssertEqual(normalize(md1), normalize(md2)) + } + + func testRoundTripA_OrderedList() throws { + var doc = WordDocument() + let numId = doc.numbering.createNumberedList() + + for text in ["First", "Second", "Third"] { + var para = Paragraph() + para.runs = [Run(text: text)] + para.properties.numbering = NumberingInfo(numId: numId, level: 0) + doc.body.children.append(.paragraph(para)) + } + + let md1 = try toMarkdown(doc) + let doc2 = try reverse.convertMarkdown(md1) + let md2 = try toMarkdown(doc2) + + XCTAssertEqual(normalize(md1), normalize(md2)) + } + + func testRoundTripA_CodeBlock() throws { + var doc = WordDocument() + + for line in ["let x = 1", "let y = 2"] { + var para = Paragraph() + para.properties.style = "Code" + para.runs = [Run(text: line)] + doc.body.children.append(.paragraph(para)) + } + + let md1 = try toMarkdown(doc) + let doc2 = try reverse.convertMarkdown(md1) + let md2 = try toMarkdown(doc2) + + XCTAssertEqual(normalize(md1), normalize(md2)) + } + + func testRoundTripA_Blockquote() throws { + var doc = WordDocument() + var para = Paragraph() + para.properties.style = "Quote" + para.runs = [Run(text: "A wise saying.")] + doc.body.children.append(.paragraph(para)) + + let md1 = try toMarkdown(doc) + let doc2 = try reverse.convertMarkdown(md1) + let md2 = try toMarkdown(doc2) + + XCTAssertEqual(normalize(md1), normalize(md2)) + } + + func testRoundTripA_Table() throws { + var doc = WordDocument() + + var headerProps = TableRowProperties() + headerProps.isHeader = true + let headerRow = TableRow(cells: [ + TableCell(paragraphs: [Paragraph(runs: [Run(text: "Name")])]), + TableCell(paragraphs: [Paragraph(runs: 
[Run(text: "Age")])]) + ], properties: headerProps) + + let dataRow = TableRow(cells: [ + TableCell(paragraphs: [Paragraph(runs: [Run(text: "Alice")])]), + TableCell(paragraphs: [Paragraph(runs: [Run(text: "30")])]) + ]) + + let table = Table(rows: [headerRow, dataRow]) + doc.body.children.append(.table(table)) + + let md1 = try toMarkdown(doc) + let doc2 = try reverse.convertMarkdown(md1) + let md2 = try toMarkdown(doc2) + + XCTAssertEqual(normalize(md1), normalize(md2)) + } + + func testRoundTripA_Link() throws { + var doc = WordDocument() + var para = Paragraph() + + let hyperlink = Hyperlink( + id: "h1", + text: "Example", + url: "https://example.com", + relationshipId: "rId1" + ) + para.hyperlinks = [hyperlink] + doc.body.children.append(.paragraph(para)) + doc.hyperlinkReferences.append( + HyperlinkReference(relationshipId: "rId1", url: "https://example.com") + ) + + let md1 = try toMarkdown(doc) + let doc2 = try reverse.convertMarkdown(md1) + let md2 = try toMarkdown(doc2) + + XCTAssertEqual(normalize(md1), normalize(md2)) + } + + func testRoundTripA_ThematicBreak() throws { + var doc = WordDocument() + + var before = Paragraph() + before.runs = [Run(text: "Before")] + doc.body.children.append(.paragraph(before)) + + var hr = Paragraph() + hr.hasPageBreak = true + doc.body.children.append(.paragraph(hr)) + + var after = Paragraph() + after.runs = [Run(text: "After")] + doc.body.children.append(.paragraph(after)) + + let md1 = try toMarkdown(doc) + let doc2 = try reverse.convertMarkdown(md1) + let md2 = try toMarkdown(doc2) + + XCTAssertEqual(normalize(md1), normalize(md2)) + } + + func testRoundTripA_Footnote() throws { + var doc = WordDocument() + + var para = Paragraph() + para.runs = [Run(text: "Important claim")] + para.footnoteIds = [1] + doc.body.children.append(.paragraph(para)) + + doc.footnotes.footnotes.append( + Footnote(id: 1, text: "Source: Wikipedia", paragraphIndex: 0) + ) + + let md1 = try toMarkdown(doc) + let doc2 = try 
reverse.convertMarkdown(md1) + let md2 = try toMarkdown(doc2) + + XCTAssertEqual(normalize(md1), normalize(md2)) + } + + // ========================================================= + // MARK: - Direction B: MD → Word → MD + // ========================================================= + + func testRoundTripB_BasicParagraph() throws { + let md = "Hello world\n" + let doc = try reverse.convertMarkdown(md) + let md2 = try toMarkdown(doc) + XCTAssertEqual(normalize(md), normalize(md2)) + } + + func testRoundTripB_Heading() throws { + let md = "# Title\n\n## Subtitle\n" + let doc = try reverse.convertMarkdown(md) + let md2 = try toMarkdown(doc) + XCTAssertEqual(normalize(md), normalize(md2)) + } + + func testRoundTripB_BoldItalicStrike() throws { + // 正向轉換器用 _ 做 italic,所以輸入也用 _ + let md = "This is **bold** and _italic_ and ~~deleted~~ text.\n" + let doc = try reverse.convertMarkdown(md) + let md2 = try toMarkdown(doc) + XCTAssertEqual(normalize(md), normalize(md2)) + } + + func testRoundTripB_UnorderedList() throws { + let md = "- Apple\n- Banana\n- Cherry\n" + let doc = try reverse.convertMarkdown(md) + let md2 = try toMarkdown(doc) + XCTAssertEqual(normalize(md), normalize(md2)) + } + + func testRoundTripB_OrderedList() throws { + let md = "1. First\n1. Second\n1. 
Third\n" + let doc = try reverse.convertMarkdown(md) + let md2 = try toMarkdown(doc) + XCTAssertEqual(normalize(md), normalize(md2)) + } + + func testRoundTripB_CodeBlock() throws { + let md = "```\nlet x = 1\n```\n" + let doc = try reverse.convertMarkdown(md) + let md2 = try toMarkdown(doc) + XCTAssertEqual(normalize(md), normalize(md2)) + } + + func testRoundTripB_Blockquote() throws { + let md = "> A wise saying.\n" + let doc = try reverse.convertMarkdown(md) + let md2 = try toMarkdown(doc) + XCTAssertEqual(normalize(md), normalize(md2)) + } + + func testRoundTripB_Table() throws { + let md = "| Name | Age |\n|---|---|\n| Alice | 30 |\n" + let doc = try reverse.convertMarkdown(md) + let md2 = try toMarkdown(doc) + XCTAssertEqual(normalize(md), normalize(md2)) + } + + func testRoundTripB_Link() throws { + // Word 模型中 runs 和 hyperlinks 分開存儲, + // 正向轉換器先輸出 runs 再輸出 hyperlinks, + // 所以 inline link 位置會移動。改用二次穩定性驗證: + // MD → Word → MD' → Word' → MD'',確認 MD' == MD'' + let md = "Visit [Example](https://example.com) now.\n" + let doc = try reverse.convertMarkdown(md) + let md2 = try toMarkdown(doc) + + // 再走一次 + let doc3 = try reverse.convertMarkdown(md2) + let md3 = try toMarkdown(doc3) + + XCTAssertEqual(normalize(md2), normalize(md3), + "Round-trip should be idempotent after first pass") + } + + func testRoundTripB_ThematicBreak() throws { + let md = "Before\n\n---\n\nAfter\n" + let doc = try reverse.convertMarkdown(md) + let md2 = try toMarkdown(doc) + XCTAssertEqual(normalize(md), normalize(md2)) + } + + func testRoundTripB_Footnote() throws { + // 正向轉換器輸出:文字後面接 footnote ref,句號在 ref 之前 + let md = "Important claim.[^1]\n\n[^1]: Source: Wikipedia\n" + let doc = try reverse.convertMarkdown(md) + let md2 = try toMarkdown(doc) + XCTAssertEqual(normalize(md), normalize(md2)) + } + + func testRoundTripB_MixedContent() throws { + // 使用正向轉換器的輸出格式(_ for italic, blockquote 後無空行) + let md = """ + # Introduction + + This is a **bold** paragraph with _italic_ text. 
+ + - Item 1 + - Item 2 + > A quote + + ``` + code here + ``` + """ + + let doc = try reverse.convertMarkdown(md) + let md2 = try toMarkdown(doc) + XCTAssertEqual(normalize(md), normalize(md2)) + } + + // ========================================================= + // MARK: - Direction C: g ∘ f = id_{W*}(WordDocument 等價驗證) + // ========================================================= + // + // 數學定義(docs/lossless-conversion.md §4): + // f: W → MD (forward, WordConverter) + // g: MD → W (reverse, MarkdownToWordConverter) + // W* = Im(g|_{MD*}) + // + // 驗證:∀ w ∈ W*: g(f(w)) = w + // + // 策略:三步走法確保在 W* 中操作 + // 1. w₀ = g(md) — 從任意 md 開始 + // 2. md* = f(w₀) — 正規化為 MD* + // 3. w₁ = g(md*) — 建構 W* 元素 + // 4. md₁ = f(w₁) — round-trip + // 5. w₂ = g(md₁) — 重建 + // 6. assert w₁ == w₂ — g ∘ f = id_{W*} + + /// 三步走法 helper:確保在 W* 上測試 g ∘ f = id + private func assertWordLevelRoundTrip( + _ md: String, + file: StaticString = #filePath, + line: UInt = #line + ) throws { + // 1. 正規化為 MD*:md → w₀ → md* + let w0 = try reverse.convertMarkdown(md) + let mdStar = try toMarkdown(w0) + + // 2. 建構 W* 元素:md* → w₁ + let w1 = try reverse.convertMarkdown(mdStar) + + // 3. Round-trip:w₁ → md₁ → w₂ + let md1 = try toMarkdown(w1) + let w2 = try reverse.convertMarkdown(md1) + + // 4. 
g ∘ f = id_{W*} + XCTAssertEqual(w1, w2, + "g ∘ f should be identity on W*.\n" + + "md* = \(mdStar.debugDescription)\n" + + "md₁ = \(md1.debugDescription)", + file: file, line: line) + + // Bonus: f ∘ g = id_{MD*}(MD 層也應一致) + XCTAssertEqual(normalize(mdStar), normalize(md1), + "f ∘ g should also be identity on MD*", + file: file, line: line) + } + + // MARK: - C.1 Basic Paragraph + + func testRoundTripC_BasicParagraph() throws { + try assertWordLevelRoundTrip("Hello world") + } + + func testRoundTripC_MultipleParagraphs() throws { + try assertWordLevelRoundTrip("First paragraph.\n\nSecond paragraph.") + } + + // MARK: - C.2 Headings + + func testRoundTripC_Heading1() throws { + try assertWordLevelRoundTrip("# Title") + } + + func testRoundTripC_Heading2() throws { + try assertWordLevelRoundTrip("## Subtitle") + } + + func testRoundTripC_Heading3() throws { + try assertWordLevelRoundTrip("### Section") + } + + func testRoundTripC_MultipleHeadings() throws { + try assertWordLevelRoundTrip("# Title\n\n## Section\n\n### Subsection") + } + + // MARK: - C.3 Inline Formatting (Layer A) + + func testRoundTripC_Bold() throws { + try assertWordLevelRoundTrip("This is **bold** text.") + } + + func testRoundTripC_Italic() throws { + try assertWordLevelRoundTrip("This is _italic_ text.") + } + + func testRoundTripC_BoldItalic() throws { + try assertWordLevelRoundTrip("This is ***bold italic*** text.") + } + + func testRoundTripC_Strikethrough() throws { + try assertWordLevelRoundTrip("This is ~~deleted~~ text.") + } + + func testRoundTripC_MixedInline() throws { + try assertWordLevelRoundTrip("Normal **bold** _italic_ ~~strike~~ end.") + } + + // MARK: - C.4 Code + + func testRoundTripC_CodeBlock() throws { + try assertWordLevelRoundTrip("```\nlet x = 1\nlet y = 2\n```") + } + + func testRoundTripC_SingleLineCodeBlock() throws { + try assertWordLevelRoundTrip("```\nprint(hello)\n```") + } + + // MARK: - C.5 Blockquote + + func testRoundTripC_Blockquote() throws { + try 
assertWordLevelRoundTrip("> A wise saying.") + } + + // MARK: - C.6 Lists + + func testRoundTripC_UnorderedList() throws { + try assertWordLevelRoundTrip("- Apple\n- Banana\n- Cherry") + } + + func testRoundTripC_OrderedList() throws { + try assertWordLevelRoundTrip("1. First\n2. Second\n3. Third") + } + + // MARK: - C.7 Table + + func testRoundTripC_Table() throws { + try assertWordLevelRoundTrip( + "| Name | Age |\n|---|---|\n| Alice | 30 |" + ) + } + + // MARK: - C.8 Thematic Break + + func testRoundTripC_ThematicBreak() throws { + try assertWordLevelRoundTrip("Before\n\n---\n\nAfter") + } + + // MARK: - C.9 Link + + func testRoundTripC_Link() throws { + // Link 的位置可能在 canonicalization 時移動, + // 但 W* 層的 round-trip 應該穩定 + try assertWordLevelRoundTrip("[Example](https://example.com)") + } + + // MARK: - C.10 Mixed Content + + // MARK: - C.11 Inline Code (Layer A Advanced) + + func testRoundTripC_InlineCode() throws { + try assertWordLevelRoundTrip("Use `code` here.") + } + + // MARK: - C.12 Footnote + + func testRoundTripC_Footnote() throws { + try assertWordLevelRoundTrip("Important claim.[^1]\n\n[^1]: Source: Wikipedia") + } + + // MARK: - C.13 Nested Lists + + func testRoundTripC_NestedUnorderedList() throws { + try assertWordLevelRoundTrip("- A\n - B\n - C\n- D") + } + + func testRoundTripC_NestedOrderedList() throws { + try assertWordLevelRoundTrip("1. A\n 1. B\n2. C") + } + + // MARK: - C.14 Internal Link + + func testRoundTripC_InternalLink() throws { + try assertWordLevelRoundTrip("[Go](#section)") + } + + // MARK: - C.10 Mixed Content + + func testRoundTripC_HeadingWithFormatting() throws { + try assertWordLevelRoundTrip("# **Bold** Title") + } + + func testRoundTripC_ComplexDocument() throws { + try assertWordLevelRoundTrip(""" + # Introduction + + This is a **bold** paragraph with _italic_ text. + + ## List Section + + - Item one + - Item two + + > A quote + + ``` + code here + ``` + + End of document. 
+ """) + } + + // ========================================================= + // MARK: - Direction C: Layer B(HTML Extensions) + // ========================================================= + // + // Layer B 的 round-trip 在 forward 方向需要 useHTMLExtensions: true, + // 否則 underline/sup/sub/highlight 不會輸出 HTML tags。 + + /// 三步走法 helper(HTML extensions 版本) + private func assertWordLevelRoundTripHTML( + _ md: String, + file: StaticString = #filePath, + line: UInt = #line + ) throws { + // 1. 正規化為 MD*:md → w₀ → md* + let w0 = try reverse.convertMarkdown(md) + let mdStar = try toMarkdownHTML(w0) + + // 2. 建構 W* 元素:md* → w₁ + let w1 = try reverse.convertMarkdown(mdStar) + + // 3. Round-trip:w₁ → md₁ → w₂ + let md1 = try toMarkdownHTML(w1) + let w2 = try reverse.convertMarkdown(md1) + + // 4. g ∘ f = id_{W*} + XCTAssertEqual(w1, w2, + "g ∘ f should be identity on W* (HTML).\n" + + "md* = \(mdStar.debugDescription)\n" + + "md₁ = \(md1.debugDescription)", + file: file, line: line) + + // Bonus: f ∘ g = id_{MD*} + XCTAssertEqual(normalize(mdStar), normalize(md1), + "f ∘ g should also be identity on MD* (HTML)", + file: file, line: line) + } + + // MARK: - C.15 Underline + + func testRoundTripC_Underline() throws { + try assertWordLevelRoundTripHTML("text") + } + + // MARK: - C.16 Superscript + + func testRoundTripC_Superscript() throws { + try assertWordLevelRoundTripHTML("x2") + } + + // MARK: - C.17 Subscript + + func testRoundTripC_Subscript() throws { + try assertWordLevelRoundTripHTML("H2O") + } + + // MARK: - C.18 Highlight + + func testRoundTripC_Highlight() throws { + try assertWordLevelRoundTripHTML("text") + } + + // MARK: - C.19 Combined: Bold + Underline + + func testRoundTripC_BoldUnderline() throws { + try assertWordLevelRoundTripHTML("**text**") + } + + // MARK: - C.20 Combined: Underline wrapping Bold + + func testRoundTripC_UnderlineBold() throws { + try assertWordLevelRoundTripHTML("**text**") + } +} diff --git 
a/packages/pdf-to-docx-swift/Sources/PDFToDOCX/PDFToDOCXConverter.swift b/packages/pdf-to-docx-swift/Sources/PDFToDOCX/PDFToDOCXConverter.swift new file mode 100644 index 0000000..7795e52 --- /dev/null +++ b/packages/pdf-to-docx-swift/Sources/PDFToDOCX/PDFToDOCXConverter.swift @@ -0,0 +1,469 @@ +import Foundation +import PDFKit +import CommonConverterSwift +import OOXMLSwift + +public struct PDFToDOCXConverter: DocumentConverter { + public static let sourceFormat = "pdf" + + public init() {} + + public func convert( + input: URL, + output: inout W, + options: ConversionOptions + ) throws { + let document = try convertToDocument(input: input, options: options) + try output.write(renderDocumentXML(document)) + } + + public func convertToFile( + input: URL, + output: URL, + options: ConversionOptions = .default + ) throws { + let document = try convertToDocument(input: input, options: options) + try DocxWriter.write(document, to: output) + } + + public func convertToDocument( + input: URL, + options: ConversionOptions = .default + ) throws -> WordDocument { + guard FileManager.default.fileExists(atPath: input.path) else { + throw ConversionError.fileNotFound(input.path) + } + guard let pdf = PDFDocument(url: input) else { + throw ConversionError.invalidDocument("無法開啟 PDF:\(input.lastPathComponent)") + } + + var builder = PDFWordBuilder(pdf: pdf, sourceURL: input, options: options) + return builder.build() + } + + private func renderDocumentXML(_ document: WordDocument) -> String { + var xml = """ + + + + """ + + for child in document.body.children { + switch child { + case .paragraph(let paragraph): + xml += paragraph.toXML() + case .table(let table): + xml += table.toXML() + } + } + + xml += renderSectionPropertiesXML(document.sectionProperties) + xml += "" + return xml + } + + private func renderSectionPropertiesXML(_ section: SectionProperties) -> String { + var xml = "" + + if let headerReference = section.headerReference { + xml += "" + } + if let footerReference 
= section.footerReference { + xml += "" + } + + var pageSizeAttributes = "w:w=\"\(section.pageSize.width)\" w:h=\"\(section.pageSize.height)\"" + if section.orientation == .landscape { + pageSizeAttributes += " w:orient=\"landscape\"" + } + xml += "" + xml += "" + xml += "" + + if let grid = section.docGrid { + var gridAttributes = "w:linePitch=\"\(grid.linePitch)\"" + if let charSpace = grid.charSpace { + gridAttributes += " w:charSpace=\"\(charSpace)\"" + } + xml += "" + } else { + xml += "" + } + + xml += "" + return xml + } +} + +public typealias PDFConverter = PDFToDOCXConverter + +private struct PDFWordBuilder { + private enum ListKind { + case bullet + case ordered + } + + private let pdf: PDFDocument + private let sourceURL: URL + private let options: ConversionOptions + private var document = WordDocument() + + init(pdf: PDFDocument, sourceURL: URL, options: ConversionOptions) { + self.pdf = pdf + self.sourceURL = sourceURL + self.options = options + } + + mutating func build() -> WordDocument { + applyDocumentMetadata() + + for pageIndex in 0.. 0 { + appendPageBreak() + } + emitPage(page, pageIndex: pageIndex) + } + + if document.body.children.isEmpty { + document.appendParagraph(Paragraph()) + } + + return document + } + + private mutating func applyDocumentMetadata() { + let attributes = pdf.documentAttributes ?? [:] + document.properties.title = nonEmptyString(attributes[PDFDocumentAttribute.titleAttribute]) ?? inferTitle() + document.properties.creator = nonEmptyString(attributes[PDFDocumentAttribute.authorAttribute]) + ?? nonEmptyString(attributes[PDFDocumentAttribute.creatorAttribute]) + ?? "macdoc" + document.properties.subject = nonEmptyString(attributes[PDFDocumentAttribute.subjectAttribute]) + ?? sourceURL.lastPathComponent + + if let keywords = attributes[PDFDocumentAttribute.keywordsAttribute] as? 
[String], !keywords.isEmpty { + document.properties.keywords = keywords.joined(separator: ", ") + } else { + document.properties.keywords = nonEmptyString(attributes[PDFDocumentAttribute.keywordsAttribute]) + } + + document.properties.description = "Converted from PDF file \(sourceURL.lastPathComponent)" + document.properties.created = attributes[PDFDocumentAttribute.creationDateAttribute] as? Date ?? Date() + document.properties.modified = attributes[PDFDocumentAttribute.modificationDateAttribute] as? Date ?? Date() + } + + private mutating func emitPage(_ page: PDFPage, pageIndex: Int) { + let blocks = splitIntoBlockCandidates(page.string ?? "") + + for (blockIndex, blockLines) in blocks.enumerated() { + if let rows = detectTable(in: blockLines) { + document.appendTable(makeTable(from: rows)) + continue + } + + if let listKind = detectListKind(in: blockLines) { + emitList(blockLines, kind: listKind) + continue + } + + let normalizedText = normalizeParagraphText(blockLines) + guard !normalizedText.isEmpty else { continue } + + let properties: ParagraphProperties + if let headingStyle = detectHeadingStyle(text: normalizedText, lines: blockLines, pageIndex: pageIndex, blockIndex: blockIndex) { + var heading = ParagraphProperties() + heading.style = headingStyle + properties = heading + } else { + properties = ParagraphProperties() + } + + document.appendParagraph(makeParagraph(lines: blockLines, properties: properties)) + } + } + + private mutating func appendPageBreak() { + var paragraph = Paragraph() + paragraph.hasPageBreak = true + paragraph.properties.pageBreakBefore = true + document.appendParagraph(paragraph) + } + + private mutating func emitList(_ lines: [String], kind: ListKind) { + let numId = kind == .ordered + ? 
document.numbering.createNumberedList() + : document.numbering.createBulletList() + + for line in lines { + let text = stripListMarker(from: line) + guard !text.isEmpty else { continue } + var properties = ParagraphProperties() + properties.numbering = NumberingInfo(numId: numId, level: 0) + document.appendParagraph(makeParagraph(lines: [text], properties: properties)) + } + } + + private func makeParagraph(lines: [String], properties: ParagraphProperties) -> Paragraph { + let normalizedLines = lines.map(normalizeInlineWhitespace).filter { !$0.isEmpty } + if normalizedLines.isEmpty { + return Paragraph(properties: properties) + } + + if options.hardLineBreaks, normalizedLines.count > 1 { + var runs: [Run] = [] + for (index, line) in normalizedLines.enumerated() { + runs.append(Run(text: line)) + if index < normalizedLines.count - 1 { + var lineBreak = Run(text: "") + lineBreak.rawXML = "" + runs.append(lineBreak) + } + } + return Paragraph(runs: runs, properties: properties) + } + + let text = normalizedLines.joined(separator: " ") + return Paragraph(text: text, properties: properties) + } + + private func makeTable(from rows: [[String]]) -> Table { + var properties = TableProperties() + properties.borders = .all(Border(style: .single, size: 4, color: "808080")) + properties.cellMargins = .all(80) + properties.layout = .autofit + + let wordRows = rows.enumerated().map { rowIndex, cells in + let wordCells = cells.map { cell -> TableCell in + let paragraph = Paragraph(text: normalizeInlineWhitespace(cell)) + return TableCell(paragraphs: [paragraph]) + } + var row = TableRow(cells: wordCells) + row.properties.isHeader = rowIndex == 0 + return row + } + + return Table(rows: wordRows, properties: properties) + } + + private func splitIntoBlockCandidates(_ rawText: String) -> [[String]] { + enum BlockKind: Equatable { + case paragraph + case table + case bulletList + case orderedList + } + + let normalized = rawText + .replacingOccurrences(of: "\r\n", with: "\n") + 
.replacingOccurrences(of: "\r", with: "\n") + + var blocks: [[String]] = [] + var current: [String] = [] + var currentKind: BlockKind? + + func flush() { + guard !current.isEmpty else { return } + blocks.append(current) + current = [] + currentKind = nil + } + + for line in normalized.components(separatedBy: .newlines) { + let trimmed = line.trimmingCharacters(in: .whitespacesAndNewlines) + if trimmed.isEmpty { + flush() + continue + } + + let kind: BlockKind + if isTableCandidateLine(trimmed) { + kind = .table + } else if isBulletListLine(trimmed) { + kind = .bulletList + } else if isOrderedListLine(trimmed) { + kind = .orderedList + } else { + kind = .paragraph + } + + if kind == .paragraph, isStandaloneHeadingLine(trimmed) { + flush() + blocks.append([trimmed]) + continue + } + + if currentKind == nil || currentKind == kind { + current.append(trimmed) + currentKind = kind + } else { + flush() + current.append(trimmed) + currentKind = kind + } + + if kind == .paragraph, looksLikeParagraphBoundary(trimmed) { + flush() + } + } + + flush() + return blocks + } + + private func normalizeParagraphText(_ lines: [String]) -> String { + lines + .map(normalizeInlineWhitespace) + .joined(separator: options.hardLineBreaks ? "\n" : " ") + .trimmingCharacters(in: .whitespacesAndNewlines) + } + + private func normalizeInlineWhitespace(_ text: String) -> String { + text + .replacingOccurrences(of: #"\s+"#, with: " ", options: .regularExpression) + .trimmingCharacters(in: .whitespacesAndNewlines) + } + + private func detectHeadingStyle( + text: String, + lines: [String], + pageIndex: Int, + blockIndex: Int + ) -> String? 
{ + let trimmed = text.trimmingCharacters(in: .whitespacesAndNewlines) + guard !trimmed.isEmpty else { return nil } + + let wordCount = trimmed.split(whereSeparator: { $0.isWhitespace }).count + let characterCount = trimmed.count + let uppercaseLetters = trimmed.filter { $0.isLetter && $0.isUppercase }.count + let letterCount = trimmed.filter(\.isLetter).count + let uppercaseRatio = letterCount > 0 ? Double(uppercaseLetters) / Double(letterCount) : 0 + let endsLikeSentence = ".。!?!?::;;".contains(trimmed.last ?? " ") + let startsWithSectionNumber = trimmed.range(of: #"^(?:[0-9]+(?:\.[0-9]+)*|[IVXLCMivxlcm]+|[A-Z])[\.)]?\s+"#, options: .regularExpression) != nil + let titlecaseWords = lines + .joined(separator: " ") + .split(whereSeparator: { $0.isWhitespace }) + .filter { !$0.isEmpty } + let titlecaseRatio = titlecaseWords.isEmpty ? 0 : Double(titlecaseWords.filter(isHeadingLikeWord).count) / Double(titlecaseWords.count) + + if pageIndex == 0 && blockIndex == 0 && characterCount <= 140 { + return "Heading1" + } + + if wordCount <= 12 && characterCount <= 90 && !endsLikeSentence { + if startsWithSectionNumber || uppercaseRatio >= 0.65 { + return "Heading2" + } + + if titlecaseRatio >= 0.7 { + return "Heading2" + } + } + + if lines.count == 1, + wordCount <= 6, + characterCount <= 60, + !endsLikeSentence, + startsWithSectionNumber || uppercaseRatio >= 0.5 || titlecaseRatio >= 0.8 { + return "Heading3" + } + + return nil + } + + private func isHeadingLikeWord(_ word: Substring) -> Bool { + guard let scalar = word.unicodeScalars.first else { return false } + if CharacterSet.decimalDigits.contains(scalar) { + return true + } + let letters = word.filter(\.isLetter) + guard let first = letters.first else { return false } + return first.isUppercase || letters.allSatisfy(\.isUppercase) + } + + private func isStandaloneHeadingLine(_ line: String) -> Bool { + detectHeadingStyle(text: line, lines: [line], pageIndex: 1, blockIndex: 1) != nil + } + + private func 
looksLikeParagraphBoundary(_ line: String) -> Bool { + if let last = line.trimmingCharacters(in: .whitespacesAndNewlines).last, + ".。!?!?::;;".contains(last) { + return true + } + return line.count >= 120 + } + + private func detectTable(in lines: [String]) -> [[String]]? { + guard lines.count >= 2 else { return nil } + let rows = lines.map(splitTableRow) + guard let columnCount = rows.first?.count, columnCount >= 2 else { return nil } + guard rows.allSatisfy({ $0.count == columnCount }) else { return nil } + guard rows.allSatisfy({ $0.allSatisfy { !$0.isEmpty } }) else { return nil } + return rows + } + + private func isTableCandidateLine(_ line: String) -> Bool { + line.contains("|") || line.contains("\t") || line.range(of: #" {3,}"#, options: .regularExpression) != nil + } + + private func splitTableRow(_ line: String) -> [String] { + let tabSeparated = line + .replacingOccurrences(of: #"\s*\|\s*"#, with: "\t", options: .regularExpression) + .replacingOccurrences(of: #" {3,}"#, with: "\t", options: .regularExpression) + .replacingOccurrences(of: #"\t+"#, with: "\t", options: .regularExpression) + + return tabSeparated + .components(separatedBy: "\t") + .map(normalizeInlineWhitespace) + .filter { !$0.isEmpty } + } + + private func detectListKind(in lines: [String]) -> ListKind? 
{ + guard lines.count >= 2 else { return nil } + + if lines.allSatisfy(isBulletListLine) { + return .bullet + } + if lines.allSatisfy(isOrderedListLine) { + return .ordered + } + return nil + } + + private func isBulletListLine(_ line: String) -> Bool { + line.range(of: #"^(?:[•◦▪‣\-*])\s+"#, options: .regularExpression) != nil + } + + private func isOrderedListLine(_ line: String) -> Bool { + line.range(of: #"^(?:(?:\d+|[A-Za-z]|[IVXLCMivxlcm]+)[\.)])\s+"#, options: .regularExpression) != nil + } + + private func stripListMarker(from line: String) -> String { + let stripped = line.replacingOccurrences( + of: #"^(?:[•◦▪‣\-*]|(?:\d+|[A-Za-z]|[IVXLCMivxlcm]+)[\.)])\s+"#, + with: "", + options: .regularExpression + ) + return normalizeInlineWhitespace(stripped) + } + + private func inferTitle() -> String { + for pageIndex in 0.. String? { + guard let string = value as? String else { return nil } + let trimmed = string.trimmingCharacters(in: .whitespacesAndNewlines) + return trimmed.isEmpty ? nil : trimmed + } +} diff --git a/packages/pdf-to-docx-swift/Tests/PDFToDOCXTests/PDFToDOCXConverterTests.swift b/packages/pdf-to-docx-swift/Tests/PDFToDOCXTests/PDFToDOCXConverterTests.swift new file mode 100644 index 0000000..9c744e8 --- /dev/null +++ b/packages/pdf-to-docx-swift/Tests/PDFToDOCXTests/PDFToDOCXConverterTests.swift @@ -0,0 +1,188 @@ +import Foundation +import AppKit +import CoreGraphics +import OOXMLSwift +@testable import PDFToDOCX + +#if canImport(XCTest) +import XCTest + +final class PDFToDOCXConverterTests: XCTestCase { + private let converter = PDFToDOCXConverter() + private var cleanupURLs: [URL] = [] + + override func tearDown() { + for url in cleanupURLs { + try? 
FileManager.default.removeItem(at: url) + } + cleanupURLs.removeAll() + super.tearDown() + } + + func testConvertToStringStreamsWordDocumentXML() throws { + let pdfURL = try makePDF( + named: "basic.pdf", + metadata: [.title: "Sample Title"], + pages: [["Sample Title", "", "Hello world from PDF"]] + ) + + let xml = try converter.convertToString(input: pdfURL) + + XCTAssertTrue(xml.contains(" URL { + let directory = FileManager.default.temporaryDirectory + .appendingPathComponent(UUID().uuidString, isDirectory: true) + try FileManager.default.createDirectory(at: directory, withIntermediateDirectories: true) + cleanupURLs.append(directory) + + let url = directory.appendingPathComponent(name) + var mediaBox = CGRect(x: 0, y: 0, width: 612, height: 792) + let auxiliaryInfo = metadata.isEmpty ? nil : metadata.toCoreGraphicsDictionary() + + guard let consumer = CGDataConsumer(url: url as CFURL), + let context = CGContext( + consumer: consumer, + mediaBox: &mediaBox, + auxiliaryInfo as CFDictionary? 
+ ) else { + XCTFail("Failed to create PDF context") + return url + } + + let attributes: [NSAttributedString.Key: Any] = [ + .font: NSFont.systemFont(ofSize: 18) + ] + + for lines in pages { + context.beginPDFPage(nil) + let graphicsContext = NSGraphicsContext(cgContext: context, flipped: false) + NSGraphicsContext.saveGraphicsState() + NSGraphicsContext.current = graphicsContext + + var y: CGFloat = 720 + for line in lines { + NSString(string: line).draw(at: CGPoint(x: 72, y: y), withAttributes: attributes) + y -= 28 + } + + NSGraphicsContext.restoreGraphicsState() + context.endPDFPage() + } + + context.closePDF() + return url + } +} + +private enum PDFMetadataKey { + case title + case author + case subject + case keywords +} + +private typealias PDFMetadata = [PDFMetadataKey: Any] + +private extension Dictionary where Key == PDFMetadataKey, Value == Any { + func toCoreGraphicsDictionary() -> [CFString: Any] { + var dictionary: [CFString: Any] = [:] + + if let title = self[.title] as? String { + dictionary[kCGPDFContextTitle] = title + } + if let author = self[.author] as? String { + dictionary[kCGPDFContextAuthor] = author + } + if let subject = self[.subject] as? String { + dictionary[kCGPDFContextSubject] = subject + } + if let keywords = self[.keywords] as? 
[String] { + dictionary[kCGPDFContextKeywords] = keywords + } + + return dictionary + } +} +#endif diff --git a/packages/pdf-to-md-swift/Sources/PDFToMD/PDFConverter.swift b/packages/pdf-to-md-swift/Sources/PDFToMD/PDFConverter.swift new file mode 100644 index 0000000..c42daf9 --- /dev/null +++ b/packages/pdf-to-md-swift/Sources/PDFToMD/PDFConverter.swift @@ -0,0 +1,330 @@ +import Foundation +import PDFKit +import CommonConverterSwift + +public struct PDFConverter: DocumentConverter { + public static let sourceFormat = "pdf" + + public init() {} + + public func convert( + input: URL, + output: inout W, + options: ConversionOptions + ) throws { + guard FileManager.default.fileExists(atPath: input.path) else { + throw ConversionError.fileNotFound(input.path) + } + guard let document = PDFDocument(url: input) else { + throw ConversionError.invalidDocument("無法開啟 PDF: \(input.lastPathComponent)") + } + + if options.includeFrontmatter { + try emitFrontmatter(document: document, source: input, output: &output) + } + + var emittedPage = false + for pageIndex in 0..( + document: PDFDocument, + source: URL, + output: inout W + ) throws { + try output.writeLine("---") + if let title = document.documentAttributes?[PDFDocumentAttribute.titleAttribute] as? 
String, + !title.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty { + try output.writeLine("title: \"\(escapeYAML(title))\"") + } + try output.writeLine("source: \"\(escapeYAML(source.lastPathComponent))\"") + try output.writeLine("format: \"pdf\"") + try output.writeLine("pages: \(document.pageCount)") + try output.writeLine("---") + try output.writeBlankLine() + } + + private func extractBlocks(from page: PDFPage, options: ConversionOptions) -> [MarkdownBlock] { + let lines = extractLineFragments(from: page) + guard !lines.isEmpty else { return [] } + + let grouped = groupLinesIntoBlocks(lines) + let bodyLineHeight = median(lines.map(\.height)) + + return grouped.compactMap { block in + classify(block: block, bodyLineHeight: bodyLineHeight, options: options) + } + } + + private func extractLineFragments(from page: PDFPage) -> [LineFragment] { + let bounds = page.bounds(for: .mediaBox) + guard let selection = page.selection(for: bounds) else { return [] } + + return selection.selectionsByLine() + .compactMap { line in + let text = normalizedLine(line.string ?? 
"") + guard !text.isEmpty else { return nil } + return LineFragment( + text: text, + minY: line.bounds(for: page).minY, + height: line.bounds(for: page).height, + minX: line.bounds(for: page).minX + ) + } + .sorted { + if abs($0.minY - $1.minY) > 0.5 { + return $0.minY > $1.minY + } + return $0.minX < $1.minX + } + } + + private func groupLinesIntoBlocks(_ lines: [LineFragment]) -> [[LineFragment]] { + guard !lines.isEmpty else { return [] } + guard lines.count > 1 else { return [lines] } + + let verticalSteps = zip(lines, lines.dropFirst()) + .map { max(0, $0.0.minY - $0.1.minY) } + .filter { $0 > 0 } + + let baseStep = median(verticalSteps) + let threshold = max(18, baseStep * 1.6) + + var blocks: [[LineFragment]] = [] + var current: [LineFragment] = [lines[0]] + + for (previous, currentLine) in zip(lines, lines.dropFirst()) { + let delta = max(0, previous.minY - currentLine.minY) + if delta > threshold { + blocks.append(current) + current = [currentLine] + } else { + current.append(currentLine) + } + } + + if !current.isEmpty { + blocks.append(current) + } + + return blocks + } + + private func classify( + block: [LineFragment], + bodyLineHeight: Double, + options: ConversionOptions + ) -> MarkdownBlock? { + let lines = block.map(\.text) + + if let items = parseUnorderedList(lines) { + return .unorderedList(items) + } + if let items = parseOrderedList(lines) { + return .orderedList(items) + } + + let merged = mergeLines(lines, hardBreaks: options.hardLineBreaks) + guard !merged.isEmpty else { return nil } + + let isHeading = block.count == 1 + && block[0].height > max(bodyLineHeight * 1.15, 18) + && looksLikeHeading(merged) + + if isHeading { + let level = headingLevel(height: block[0].height, bodyLineHeight: bodyLineHeight) + return .heading(merged, level: level) + } + + return .paragraph(merged) + } + + private func parseUnorderedList(_ lines: [String]) -> [String]? 
{ + guard !lines.isEmpty else { return nil } + + var items: [String] = [] + for line in lines { + guard let captures = captureGroups(in: line, pattern: #"^([\t ]*)([•◦▪‣*\-])\s+(.+)$"#), captures.count == 3 else { + return nil + } + let indentLevel = indentationLevel(captures[0]) + let text = normalizeInlineSpacing(captures[2]) + guard !text.isEmpty else { return nil } + items.append("\(String(repeating: " ", count: indentLevel))- \(text)") + } + + return items + } + + private func parseOrderedList(_ lines: [String]) -> [String]? { + guard !lines.isEmpty else { return nil } + + var items: [String] = [] + for line in lines { + guard let captures = captureGroups(in: line, pattern: #"^([\t ]*)(\d+)[\.)]\s+(.+)$"#), captures.count == 3 else { + return nil + } + let indentLevel = indentationLevel(captures[0]) + let ordinal = captures[1] + let text = normalizeInlineSpacing(captures[2]) + guard !text.isEmpty else { return nil } + items.append("\(String(repeating: " ", count: indentLevel))\(ordinal). 
\(text)") + } + + return items + } + + private func mergeLines(_ lines: [String], hardBreaks: Bool) -> String { + guard let first = lines.first else { return "" } + + var merged = normalizeInlineSpacing(first) + for line in lines.dropFirst() { + let next = normalizeInlineSpacing(line) + guard !next.isEmpty else { continue } + + if merged.hasSuffix("-"), startsWithLowercaseWord(next) { + merged.removeLast() + merged += next + } else if hardBreaks { + merged += " \n" + next + } else { + merged += " " + next + } + } + + return merged.trimmingCharacters(in: .whitespacesAndNewlines) + } + + private func looksLikeHeading(_ text: String) -> Bool { + let trimmed = text.trimmingCharacters(in: .whitespacesAndNewlines) + guard !trimmed.isEmpty else { return false } + guard trimmed.count <= 120 else { return false } + guard !trimmed.hasSuffix("."), !trimmed.hasSuffix("?"), !trimmed.hasSuffix("!"), !trimmed.hasSuffix(";"), !trimmed.hasSuffix(",") else { + return false + } + + let words = trimmed.split(whereSeparator: \.isWhitespace) + guard (1...12).contains(words.count) else { return false } + + let lowercase = trimmed.lowercased() + if lowercase.hasPrefix("chapter ") || lowercase.hasPrefix("section ") || lowercase.hasPrefix("appendix ") { + return true + } + + return true + } + + /// Map font size ratio to heading level (1-3). + /// Ratio = heading height / body line height. + private func headingLevel(height: Double, bodyLineHeight: Double) -> Int { + let ratio = height / max(bodyLineHeight, 1) + if ratio > 1.8 { return 1 } + if ratio > 1.4 { return 2 } + return 3 + } + + private func startsWithLowercaseWord(_ text: String) -> Bool { + guard let scalar = text.unicodeScalars.first else { return false } + return CharacterSet.lowercaseLetters.contains(scalar) + } + + private func indentationLevel(_ rawIndent: String) -> Int { + let spaces = rawIndent.reduce(into: 0) { partial, character in + partial += character == "\t" ? 
2 : 1 + } + return max(0, spaces / 2) + } + + private func normalizedLine(_ text: String) -> String { + text + .replacingOccurrences(of: "\r\n", with: "\n") + .replacingOccurrences(of: "\r", with: "\n") + .replacingOccurrences(of: "\n", with: " ") + .trimmingCharacters(in: .whitespacesAndNewlines) + } + + private func normalizeInlineSpacing(_ text: String) -> String { + text.replacingOccurrences(of: #"\s+"#, with: " ", options: .regularExpression) + } + + private func captureGroups(in text: String, pattern: String) -> [String]? { + guard let regex = try? NSRegularExpression(pattern: pattern) else { + return nil + } + let range = NSRange(text.startIndex..., in: text) + guard let match = regex.firstMatch(in: text, options: [], range: range) else { + return nil + } + + return (1.. Double { + guard !values.isEmpty else { return 0 } + let sorted = values.sorted() + let middle = sorted.count / 2 + if sorted.count.isMultiple(of: 2) { + return (sorted[middle - 1] + sorted[middle]) / 2 + } + return sorted[middle] + } + + private func escapeYAML(_ text: String) -> String { + text + .replacingOccurrences(of: "\\", with: "\\\\") + .replacingOccurrences(of: "\"", with: "\\\"") + } +} + +private struct LineFragment { + let text: String + let minY: Double + let height: Double + let minX: Double +} + +private enum MarkdownBlock { + case heading(String, level: Int) + case paragraph(String) + case unorderedList([String]) + case orderedList([String]) +} diff --git a/packages/pdf-to-md-swift/Sources/PDFToMDSmokeTests/main.swift b/packages/pdf-to-md-swift/Sources/PDFToMDSmokeTests/main.swift new file mode 100644 index 0000000..a198672 --- /dev/null +++ b/packages/pdf-to-md-swift/Sources/PDFToMDSmokeTests/main.swift @@ -0,0 +1,176 @@ +import Foundation +import AppKit +import PDFKit +import CommonConverterSwift +import PDFToMD + +@main +struct PDFToMDSwiftSmokeTests { + static func main() throws { + let runner = Runner() + try runner.runAll() + print("pdf-to-md smoke tests: OK") + } +} + 
+private struct Runner { + private let converter = PDFConverter() + + func runAll() throws { + try convertsHeadingParagraphAndBulletList() + try convertsOrderedListAndHardBreaks() + try insertsPageBreakBetweenPages() + try joinsHyphenatedLineBreaks() + try frontmatterIncludesSourceAndPageCount() + } + + private func convertsHeadingParagraphAndBulletList() throws { + let pdf = try makePDF( + named: "structure.pdf", + pages: [[ + DrawBlock(text: "Quarterly Results", fontSize: 28, origin: CGPoint(x: 72, y: 700)), + DrawBlock( + text: "Revenue increased year over year.\nMargin expansion continued.", + fontSize: 18, + origin: CGPoint(x: 72, y: 300) + ), + DrawBlock(text: "• Revenue\n• Margin", fontSize: 18, origin: CGPoint(x: 72, y: 140)), + ]] + ) + let markdown = try convert(pdf) + try expect(markdown.contains("# Quarterly Results"), "missing heading", markdown) + try expect(markdown.contains("Revenue increased year over year. Margin expansion continued."), "missing paragraph", markdown) + try expect(markdown.contains("- Revenue"), "missing first bullet", markdown) + try expect(markdown.contains("- Margin"), "missing second bullet", markdown) + } + + private func convertsOrderedListAndHardBreaks() throws { + let pdf = try makePDF( + named: "ordered-and-breaks.pdf", + pages: [[ + DrawBlock(text: "Line one\nLine two", fontSize: 18, origin: CGPoint(x: 72, y: 560)), + DrawBlock(text: "1. Collect data\n2. Review output", fontSize: 18, origin: CGPoint(x: 72, y: 380)), + ]] + ) + var options = ConversionOptions.default + options.hardLineBreaks = true + let markdown = try convert(pdf, options: options) + try expect(markdown.contains("Line one \nLine two"), "missing hard break paragraph", markdown) + try expect(markdown.contains("1. Collect data"), "missing ordered item 1", markdown) + try expect(markdown.contains("2. 
Review output"), "missing ordered item 2", markdown) + } + + private func insertsPageBreakBetweenPages() throws { + let pdf = try makePDF( + named: "page-breaks.pdf", + pages: [ + [DrawBlock(text: "Page one paragraph.", fontSize: 18, origin: CGPoint(x: 72, y: 640))], + [DrawBlock(text: "Page two paragraph.", fontSize: 18, origin: CGPoint(x: 72, y: 640))], + ] + ) + let markdown = try convert(pdf) + try expect(markdown.contains("Page one paragraph."), "missing page 1 text", markdown) + try expect(markdown.contains("\n---\n\nPage two paragraph."), "missing page break", markdown) + } + + private func joinsHyphenatedLineBreaks() throws { + let pdf = try makePDF( + named: "hyphenation.pdf", + pages: [[ + DrawBlock(text: "micro-\nservice migration", fontSize: 18, origin: CGPoint(x: 72, y: 560)), + ]] + ) + let markdown = try convert(pdf) + try expect(markdown.contains("microservice migration"), "missing dehyphenated text", markdown) + try expect(!markdown.contains("micro- service"), "still contains broken hyphenation", markdown) + } + + private func frontmatterIncludesSourceAndPageCount() throws { + let pdf = try makePDF( + named: "frontmatter.pdf", + pages: [[ + DrawBlock(text: "Frontmatter body.", fontSize: 18, origin: CGPoint(x: 72, y: 640)), + ]] + ) + var options = ConversionOptions.default + options.includeFrontmatter = true + let markdown = try convert(pdf, options: options) + try expect(markdown.contains("source: \"frontmatter.pdf\""), "missing frontmatter source", markdown) + try expect(markdown.contains("format: \"pdf\""), "missing frontmatter format", markdown) + try expect(markdown.contains("pages: 1"), "missing frontmatter page count", markdown) + } + + private func expect(_ condition: Bool, _ message: String, _ markdown: String) throws { + guard condition else { + throw SmokeTestError.failed("\(message)\n--- output ---\n\(markdown)") + } + } + + private func convert(_ input: URL, options: ConversionOptions = .default) throws -> String { + defer { try? 
FileManager.default.removeItem(at: input.deletingLastPathComponent()) } + return try converter.convertToString(input: input, options: options) + } + + private func makePDF(named fileName: String, pages: [[DrawBlock]]) throws -> URL { + let directory = FileManager.default.temporaryDirectory + .appendingPathComponent("pdf-to-md-smoke-\(UUID().uuidString)", isDirectory: true) + try FileManager.default.createDirectory(at: directory, withIntermediateDirectories: true) + + let url = directory.appendingPathComponent(fileName) + let data = NSMutableData() + var mediaBox = CGRect(x: 0, y: 0, width: 612, height: 792) + + guard let consumer = CGDataConsumer(data: data as CFMutableData) else { + throw SmokeTestError.failed("cannot create PDF consumer") + } + guard let context = CGContext(consumer: consumer, mediaBox: &mediaBox, nil) else { + throw SmokeTestError.failed("cannot create PDF context") + } + + for page in pages { + context.beginPDFPage(nil) + let graphicsContext = NSGraphicsContext(cgContext: context, flipped: false) + NSGraphicsContext.saveGraphicsState() + NSGraphicsContext.current = graphicsContext + + for block in page { + let style = NSMutableParagraphStyle() + style.lineSpacing = block.lineSpacing + let attributed = NSAttributedString( + string: block.text, + attributes: [ + .font: block.font, + .paragraphStyle: style, + ] + ) + if block.text.contains("\n") { + attributed.draw(in: CGRect(x: block.origin.x, y: block.origin.y, width: 468, height: 200)) + } else { + attributed.draw(at: block.origin) + } + } + + NSGraphicsContext.restoreGraphicsState() + context.endPDFPage() + } + + context.closePDF() + try Data(referencing: data).write(to: url) + return url + } +} + +private struct DrawBlock { + let text: String + let fontSize: CGFloat + let origin: CGPoint + var lineSpacing: CGFloat = 6 + + var font: NSFont { + NSFont(name: "Times New Roman", size: fontSize) ?? 
.systemFont(ofSize: fontSize) + } +} + +private enum SmokeTestError: Error { + case failed(String) +} diff --git a/packages/pdf-to-md-swift/Tests/PDFToMDTests/PDFConverterTests.swift b/packages/pdf-to-md-swift/Tests/PDFToMDTests/PDFConverterTests.swift new file mode 100644 index 0000000..8833a27 --- /dev/null +++ b/packages/pdf-to-md-swift/Tests/PDFToMDTests/PDFConverterTests.swift @@ -0,0 +1,167 @@ +#if canImport(XCTest) +import XCTest +import AppKit +import PDFKit +import CommonConverterSwift +@testable import PDFToMD + +final class PDFConverterTests: XCTestCase { + private let converter = PDFConverter() + + func testConvertsHeadingParagraphAndBulletList() throws { + let pdf = try makePDF( + named: "structure.pdf", + pages: [[ + DrawBlock(text: "Quarterly Results", fontSize: 28, origin: CGPoint(x: 72, y: 700)), + DrawBlock( + text: "Revenue increased year over year.\nMargin expansion continued.", + fontSize: 18, + origin: CGPoint(x: 72, y: 300) + ), + DrawBlock(text: "• Revenue\n• Margin", fontSize: 18, origin: CGPoint(x: 72, y: 140)), + ]] + ) + + let markdown = try convert(pdf) + + XCTAssert(markdown.contains("# Quarterly Results"), "Got: \(markdown)") + XCTAssert(markdown.contains("Revenue increased year over year. Margin expansion continued."), "Got: \(markdown)") + XCTAssert(markdown.contains("- Revenue"), "Got: \(markdown)") + XCTAssert(markdown.contains("- Margin"), "Got: \(markdown)") + } + + func testConvertsOrderedListAndHardBreaks() throws { + let pdf = try makePDF( + named: "ordered-and-breaks.pdf", + pages: [[ + DrawBlock(text: "Line one\nLine two", fontSize: 18, origin: CGPoint(x: 72, y: 560)), + DrawBlock(text: "1. Collect data\n2. Review output", fontSize: 18, origin: CGPoint(x: 72, y: 380)), + ]] + ) + + var options = ConversionOptions.default + options.hardLineBreaks = true + let markdown = try convert(pdf, options: options) + + XCTAssert(markdown.contains("Line one \nLine two"), "Got: \(markdown)") + XCTAssert(markdown.contains("1. 
Collect data"), "Got: \(markdown)") + XCTAssert(markdown.contains("2. Review output"), "Got: \(markdown)") + } + + func testInsertsPageBreakBetweenPages() throws { + let pdf = try makePDF( + named: "page-breaks.pdf", + pages: [ + [DrawBlock(text: "Page one paragraph.", fontSize: 18, origin: CGPoint(x: 72, y: 640))], + [DrawBlock(text: "Page two paragraph.", fontSize: 18, origin: CGPoint(x: 72, y: 640))], + ] + ) + + let markdown = try convert(pdf) + + XCTAssert(markdown.contains("Page one paragraph."), "Got: \(markdown)") + XCTAssert(markdown.contains("\n---\n\nPage two paragraph."), "Got: \(markdown)") + } + + func testJoinsHyphenatedLineBreaks() throws { + let pdf = try makePDF( + named: "hyphenation.pdf", + pages: [[ + DrawBlock(text: "micro-\nservice migration", fontSize: 18, origin: CGPoint(x: 72, y: 560)), + ]] + ) + + let markdown = try convert(pdf) + + XCTAssert(markdown.contains("microservice migration"), "Got: \(markdown)") + XCTAssertFalse(markdown.contains("micro- service"), "Got: \(markdown)") + } + + func testFrontmatterIncludesSourceAndPageCount() throws { + let pdf = try makePDF( + named: "frontmatter.pdf", + pages: [[ + DrawBlock(text: "Frontmatter body.", fontSize: 18, origin: CGPoint(x: 72, y: 640)), + ]] + ) + + var options = ConversionOptions.default + options.includeFrontmatter = true + let markdown = try convert(pdf, options: options) + + XCTAssert(markdown.contains("source: \"frontmatter.pdf\""), "Got: \(markdown)") + XCTAssert(markdown.contains("format: \"pdf\""), "Got: \(markdown)") + XCTAssert(markdown.contains("pages: 1"), "Got: \(markdown)") + } + + // MARK: - Helpers + + private func convert(_ input: URL, options: ConversionOptions = .default) throws -> String { + defer { try? 
FileManager.default.removeItem(at: input.deletingLastPathComponent()) } + return try converter.convertToString(input: input, options: options) + } + + private func makePDF(named fileName: String, pages: [[DrawBlock]]) throws -> URL { + let directory = FileManager.default.temporaryDirectory + .appendingPathComponent("pdf-to-md-tests-\(UUID().uuidString)", isDirectory: true) + try FileManager.default.createDirectory(at: directory, withIntermediateDirectories: true) + + let url = directory.appendingPathComponent(fileName) + let data = NSMutableData() + var mediaBox = CGRect(x: 0, y: 0, width: 612, height: 792) + + guard let consumer = CGDataConsumer(data: data as CFMutableData) else { + throw TestError.cannotCreatePDFContext + } + guard let context = CGContext(consumer: consumer, mediaBox: &mediaBox, nil) else { + throw TestError.cannotCreatePDFContext + } + + for page in pages { + context.beginPDFPage(nil) + let graphicsContext = NSGraphicsContext(cgContext: context, flipped: false) + NSGraphicsContext.saveGraphicsState() + NSGraphicsContext.current = graphicsContext + + for block in page { + let style = NSMutableParagraphStyle() + style.lineSpacing = block.lineSpacing + let attributed = NSAttributedString( + string: block.text, + attributes: [ + .font: block.font, + .paragraphStyle: style, + ] + ) + if block.text.contains("\n") { + attributed.draw(in: CGRect(x: block.origin.x, y: block.origin.y, width: 468, height: 200)) + } else { + attributed.draw(at: block.origin) + } + } + + NSGraphicsContext.restoreGraphicsState() + context.endPDFPage() + } + + context.closePDF() + try Data(referencing: data).write(to: url) + return url + } +} + +private struct DrawBlock { + let text: String + let fontSize: CGFloat + let origin: CGPoint + var lineSpacing: CGFloat = 6 + + var font: NSFont { + NSFont(name: "Times New Roman", size: fontSize) ?? 
.systemFont(ofSize: fontSize) + } +} + +private enum TestError: Error { + case cannotCreatePDFContext +} +#endif diff --git a/packages/word-to-html-swift/Sources/WordToHTML/WordHTMLConverter.swift b/packages/word-to-html-swift/Sources/WordToHTML/WordHTMLConverter.swift new file mode 100644 index 0000000..856f838 --- /dev/null +++ b/packages/word-to-html-swift/Sources/WordToHTML/WordHTMLConverter.swift @@ -0,0 +1,655 @@ +import Foundation +import CommonConverterSwift +import OOXMLSwift + +public struct WordHTMLConverter: DocumentConverter { + public static let sourceFormat = "docx" + + public init() {} + + public func convert( + input: URL, + output: inout W, + options: ConversionOptions + ) throws { + let document = try DocxReader.read(from: input) + try convert(document: document, source: input, output: &output, options: options) + } + + public func convert( + document: WordDocument, + output: inout W, + options: ConversionOptions = .default + ) throws { + try convert(document: document, source: nil, output: &output, options: options) + } + + public func convertToString( + document: WordDocument, + options: ConversionOptions = .default + ) throws -> String { + var writer = StringOutput() + try convert(document: document, output: &writer, options: options) + return writer.content + } + + private func convert( + document: WordDocument, + source: URL?, + output: inout W, + options: ConversionOptions + ) throws { + var context = ConversionContext(document: document, options: options) + let title = resolvedTitle(for: document, source: source) + + if options.includeFrontmatter { + try emitFrontmatter(document: document, source: source, title: title, output: &output) + } + + try emitDocumentStart(title: title, output: &output) + + let children = document.body.children + var index = 0 + while index < children.count { + switch children[index] { + case .paragraph(let paragraph): + if paragraph.properties.numbering != nil { + let (items, nextIndex) = 
collectListItems(children: children, startIndex: index, context: &context) + try emitListBlock(items, output: &output) + index = nextIndex + } else { + try emitParagraph(paragraph, context: &context, output: &output) + index += 1 + } + case .table(let table): + try emitTable(table, context: &context, output: &output) + index += 1 + } + } + + try emitFootnotes(context: context, output: &output) + try emitDocumentEnd(output: &output) + } + + // MARK: - Document Shell + + private func emitFrontmatter( + document: WordDocument, + source: URL?, + title: String, + output: inout W + ) throws { + try output.writeLine("") + } + + private func emitDocumentStart( + title: String, + output: inout W + ) throws { + try output.writeLine("") + try output.writeLine("") + try output.writeLine("") + try output.writeLine(" ") + try output.writeLine(" ") + try output.writeLine(" ") + try output.writeLine(" \(escapeHTML(title))") + try output.writeLine(" ") + try output.writeLine("") + try output.writeLine("") + try output.writeLine("
") + } + + private func emitDocumentEnd(output: inout W) throws { + try output.writeLine("
") + try output.writeLine("") + try output.writeLine("") + } + + // MARK: - Blocks + + private func emitParagraph( + _ paragraph: Paragraph, + context: inout ConversionContext, + output: inout W + ) throws { + if paragraph.hasPageBreak || paragraph.properties.pageBreakBefore { + try output.writeLine("
") + if paragraph.runs.isEmpty && paragraph.hyperlinks.isEmpty { + return + } + } + + if let styleName = paragraph.properties.style, + isCodeStyle(styleName, styles: context.styles) { + let plain = escapeHTML(collectPlainText(paragraph)) + guard !plain.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty else { return } + try output.writeLine("
\(plain)
") + return + } + + let html = renderParagraphContent(paragraph, context: &context) + guard !isHTMLContentEmpty(html) else { return } + + if let styleName = paragraph.properties.style, + isBlockquoteStyle(styleName, styles: context.styles) { + try output.writeLine("

\(html)

") + return + } + + if let styleName = paragraph.properties.style, + let level = detectHeadingLevel(styleName: styleName, styles: context.styles) { + let clamped = max(1, min(level, 6)) + try output.writeLine(" \(html)") + return + } + + try output.writeLine("

\(html)

") + } + + private func emitTable( + _ table: Table, + context: inout ConversionContext, + output: inout W + ) throws { + guard !table.rows.isEmpty else { return } + + try output.writeLine(" ") + + if let headerRow = table.rows.first { + try output.writeLine(" ") + try output.writeLine(" ") + for cell in headerRow.cells { + let content = renderTableCell(cell, context: &context) + try output.writeLine(" ") + } + try output.writeLine(" ") + try output.writeLine(" ") + } + + if table.rows.count > 1 { + try output.writeLine(" ") + for row in table.rows.dropFirst() { + try output.writeLine(" ") + for cell in row.cells { + let content = renderTableCell(cell, context: &context) + try output.writeLine(" ") + } + try output.writeLine(" ") + } + try output.writeLine(" ") + } + + try output.writeLine("
\(content)
\(content)
") + } + + private func renderTableCell( + _ cell: TableCell, + context: inout ConversionContext + ) -> String { + let joined = cell.paragraphs + .map { renderParagraphContent($0, context: &context) } + .filter { !isHTMLContentEmpty($0) } + .joined(separator: "
") + return joined.isEmpty ? " " : joined + } + + // MARK: - Lists + + private func collectListItems( + children: [BodyChild], + startIndex: Int, + context: inout ConversionContext + ) -> ([FlatListItem], Int) { + var items: [FlatListItem] = [] + var index = startIndex + + while index < children.count { + guard case .paragraph(let paragraph) = children[index], + let numInfo = paragraph.properties.numbering else { + break + } + + let kind: ListKind = isListBullet( + numId: numInfo.numId, + level: numInfo.level, + numbering: context.numbering + ) ? .unordered : .ordered + + let html = renderParagraphContent(paragraph, context: &context) + items.append( + FlatListItem( + kind: kind, + level: max(0, numInfo.level), + content: html.isEmpty ? " " : html + ) + ) + index += 1 + } + + return (normalizeListLevels(items), index) + } + + private func normalizeListLevels(_ items: [FlatListItem]) -> [FlatListItem] { + guard let minLevel = items.map(\.level).min() else { return items } + return items.map { item in + FlatListItem(kind: item.kind, level: max(0, item.level - minLevel), content: item.content) + } + } + + private func emitListBlock( + _ items: [FlatListItem], + output: inout W + ) throws { + guard !items.isEmpty else { return } + var index = 0 + while index < items.count { + try renderList(items: items, index: &index, level: items[index].level, kind: items[index].kind, output: &output) + } + } + + private func renderList( + items: [FlatListItem], + index: inout Int, + level: Int, + kind: ListKind, + output: inout W + ) throws { + let indent = String(repeating: " ", count: level + 2) + let itemIndent = String(repeating: " ", count: level + 3) + + try output.writeLine("\(indent)<\(kind.tagName)>") + while index < items.count { + let item = items[index] + if item.level < level { break } + if item.level != level || item.kind != kind { break } + + try output.write("\(itemIndent)
  • \(item.content)") + index += 1 + + while index < items.count, items[index].level > level { + try output.writeLine("") + try renderList(items: items, index: &index, level: items[index].level, kind: items[index].kind, output: &output) + try output.write("\(itemIndent)") + } + + try output.writeLine("
  • ") + } + try output.writeLine("\(indent)") + } + + // MARK: - Inlines + + private func renderParagraphContent( + _ paragraph: Paragraph, + context: inout ConversionContext + ) -> String { + var result = "" + + for run in paragraph.runs { + result += renderRun(run, context: &context) + } + + for hyperlink in paragraph.hyperlinks { + result += renderHyperlink(hyperlink, context: context) + } + + for footnoteId in paragraph.footnoteIds { + context.registerFootnote(id: footnoteId) + let id = escapeAttribute("fn-\(footnoteId)") + let refId = escapeAttribute("fnref-\(footnoteId)") + result += "\(footnoteId)" + } + + for endnoteId in paragraph.endnoteIds { + let mappedId = "en\(endnoteId)" + context.registerEndnote(id: endnoteId, mappedId: mappedId) + let id = escapeAttribute("fn-\(mappedId)") + let refId = escapeAttribute("fnref-\(mappedId)") + result += "\(escapeHTML(mappedId))" + } + + return result + } + + private func renderRun( + _ run: Run, + context: inout ConversionContext + ) -> String { + if let drawing = run.drawing { + return renderDrawing(drawing, context: &context) + } + + guard !run.text.isEmpty else { return "" } + + var text = escapeHTML(run.text) + let props = run.properties + + if let semantic = run.semantic, + case .formula = semantic.type { + return "\(text)" + } + + if props.bold && props.italic { + text = "\(text)" + } else if props.bold { + text = "\(text)" + } else if props.italic { + text = "\(text)" + } + + if props.strikethrough { + text = "\(text)" + } + + if props.underline != nil { + text = "\(text)" + } + if props.verticalAlign == .superscript { + text = "\(text)" + } + if props.verticalAlign == .subscript { + text = "\(text)" + } + if props.highlight != nil { + text = "\(text)" + } + + return text.replacingOccurrences(of: "\n", with: context.options.hardLineBreaks ? "
    " : "\n") + } + + private func renderHyperlink( + _ hyperlink: Hyperlink, + context: ConversionContext + ) -> String { + let label = escapeHTML(hyperlink.text) + + switch hyperlink.type { + case .external: + if let url = hyperlink.url, !url.isEmpty { + return "\(label.isEmpty ? escapeHTML(url) : label)" + } + if let relationshipId = hyperlink.relationshipId, + let reference = context.document.hyperlinkReferences.first(where: { $0.relationshipId == relationshipId }) { + let url = reference.url + return "\(label.isEmpty ? escapeHTML(url) : label)" + } + return label + + case .internal: + if let anchor = hyperlink.anchor, !anchor.isEmpty { + return "\(label)" + } + return label + } + } + + private func renderDrawing( + _ drawing: Drawing, + context: inout ConversionContext + ) -> String { + let imageRef = context.document.images.first { $0.id == drawing.imageId } + let alt = escapeAttribute(drawing.description.isEmpty ? drawing.name : drawing.description) + let src = resolveImageSource(imageRef: imageRef, options: context.options) ?? drawing.imageId + return "\"\(alt)\"" + } + + private func resolveImageSource(imageRef: ImageReference?, options: ConversionOptions) -> String? { + guard let imageRef else { return nil } + + if let figuresDirectory = options.figuresDirectory { + try? FileManager.default.createDirectory(at: figuresDirectory, withIntermediateDirectories: true) + let targetURL = figuresDirectory.appendingPathComponent(imageRef.fileName) + if !FileManager.default.fileExists(atPath: targetURL.path) { + try? 
imageRef.data.write(to: targetURL) + } + return figuresDirectory.lastPathComponent + "/" + imageRef.fileName + } + + return imageRef.fileName + } + + // MARK: - Footnotes + + private func emitFootnotes( + context: ConversionContext, + output: inout W + ) throws { + let hasFootnotes = !context.referencedFootnoteIds.isEmpty + let hasEndnotes = !context.referencedEndnoteIds.isEmpty + guard hasFootnotes || hasEndnotes else { return } + + try output.writeLine("
    ") + try output.writeLine("
    ") + try output.writeLine("
      ") + + for id in context.referencedFootnoteIds.sorted() { + if let footnote = context.document.footnotes.footnotes.first(where: { $0.id == id }) { + let escapedText = escapeHTML(footnote.text) + let liId = escapeAttribute("fn-\(id)") + let refId = escapeAttribute("fnref-\(id)") + try output.writeLine("
    1. \(escapedText)

    2. ") + } + } + + for (id, mappedId) in context.endnoteIdMapping.sorted(by: { $0.key < $1.key }) { + if let endnote = context.document.endnotes.endnotes.first(where: { $0.id == id }) { + let escapedText = escapeHTML(endnote.text) + let liId = escapeAttribute("fn-\(mappedId)") + let refId = escapeAttribute("fnref-\(mappedId)") + try output.writeLine("
    3. \(escapedText)

    4. ") + } + } + + try output.writeLine("
    ") + try output.writeLine("
    ") + } + + // MARK: - Helpers + + private func resolvedTitle(for document: WordDocument, source: URL?) -> String { + if let title = document.properties.title?.trimmingCharacters(in: .whitespacesAndNewlines), !title.isEmpty { + return title + } + if let source { + return source.deletingPathExtension().lastPathComponent + } + return "Document" + } + + private func collectPlainText(_ paragraph: Paragraph) -> String { + var text = paragraph.runs.map(\.text).joined() + for hyperlink in paragraph.hyperlinks { + text += hyperlink.text + } + return text + } + + private func isHTMLContentEmpty(_ html: String) -> Bool { + // 含有 等 self-closing 標籤的內容不算空 + if html.contains("]+>", with: "", options: .regularExpression) + let normalized = withoutTags + .replacingOccurrences(of: " ", with: "") + .trimmingCharacters(in: .whitespacesAndNewlines) + return normalized.isEmpty + } + + private func isCodeStyle(_ styleName: String, styles: [Style]) -> Bool { + let lower = styleName.lowercased() + let codePatterns = ["code", "source", "listing", "verbatim", "preformatted"] + if codePatterns.contains(where: { lower.contains($0) }) { + return true + } + if let style = styles.first(where: { $0.id.lowercased() == lower }), + let basedOn = style.basedOn { + return isCodeStyle(basedOn, styles: styles) + } + return false + } + + private func isBlockquoteStyle(_ styleName: String, styles: [Style]) -> Bool { + let lower = styleName.lowercased() + let quotePatterns = ["quote", "block text"] + if quotePatterns.contains(where: { lower.contains($0) }) { + return true + } + if let style = styles.first(where: { $0.id.lowercased() == lower }), + let basedOn = style.basedOn { + return isBlockquoteStyle(basedOn, styles: styles) + } + return false + } + + private func detectHeadingLevel(styleName: String, styles: [Style]) -> Int? 
{ + let lower = styleName.lowercased() + let patterns: [(String, Int)] = [ + ("heading1", 1), ("heading 1", 1), ("標題 1", 1), ("標題1", 1), + ("heading2", 2), ("heading 2", 2), ("標題 2", 2), ("標題2", 2), + ("heading3", 3), ("heading 3", 3), ("標題 3", 3), ("標題3", 3), + ("heading4", 4), ("heading 4", 4), ("標題 4", 4), ("標題4", 4), + ("heading5", 5), ("heading 5", 5), ("標題 5", 5), ("標題5", 5), + ("heading6", 6), ("heading 6", 6), ("標題 6", 6), ("標題6", 6), + ("title", 1), ("subtitle", 2), + ] + + for (pattern, level) in patterns where lower == pattern { + return level + } + + if let style = styles.first(where: { $0.id.lowercased() == lower }), + let basedOn = style.basedOn { + return detectHeadingLevel(styleName: basedOn, styles: styles) + } + return nil + } + + private func isListBullet(numId: Int, level: Int, numbering: Numbering) -> Bool { + guard let num = numbering.nums.first(where: { $0.numId == numId }) else { + return true + } + guard let abstractNum = numbering.abstractNums.first(where: { $0.abstractNumId == num.abstractNumId }) else { + return true + } + guard let levelDef = abstractNum.levels.first(where: { $0.ilvl == level }) else { + return true + } + return levelDef.numFmt == .bullet + } + + private func escapeHTML(_ text: String) -> String { + text + .replacingOccurrences(of: "&", with: "&") + .replacingOccurrences(of: "<", with: "<") + .replacingOccurrences(of: ">", with: ">") + .replacingOccurrences(of: "\"", with: """) + .replacingOccurrences(of: "'", with: "'") + } + + private func escapeAttribute(_ text: String) -> String { + escapeHTML(text) + } + + private let stylesheet = """ + body { + margin: 0; + background: #ffffff; + color: #111827; + font-family: -apple-system, BlinkMacSystemFont, \"Segoe UI\", sans-serif; + line-height: 1.65; + } + .document { + max-width: 860px; + margin: 0 auto; + padding: 40px 24px 72px; + } + table { + width: 100%; + border-collapse: collapse; + margin: 1.25rem 0; + } + th, td { + border: 1px solid #d1d5db; + padding: 0.5rem 
0.75rem; + vertical-align: top; + text-align: left; + } + blockquote { + margin: 1.25rem 0; + padding-left: 1rem; + border-left: 4px solid #d1d5db; + color: #374151; + } + pre { + overflow-x: auto; + padding: 0.875rem 1rem; + border-radius: 10px; + background: #111827; + color: #f9fafb; + } + code { + font-family: ui-monospace, SFMono-Regular, Menlo, monospace; + } + img { + max-width: 100%; + height: auto; + } + .footnotes { + margin-top: 2.5rem; + color: #374151; + font-size: 0.95rem; + } + .footnote-ref { + font-size: 0.8em; + } + """ +} + +private struct ConversionContext { + let document: WordDocument + let options: ConversionOptions + var styles: [Style] { document.styles } + var numbering: Numbering { document.numbering } + var referencedFootnoteIds: Set = [] + var referencedEndnoteIds: Set = [] + var endnoteIdMapping: [Int: String] = [:] + + mutating func registerFootnote(id: Int) { + referencedFootnoteIds.insert(id) + } + + mutating func registerEndnote(id: Int, mappedId: String) { + referencedEndnoteIds.insert(id) + endnoteIdMapping[id] = mappedId + } +} + +private struct FlatListItem { + let kind: ListKind + let level: Int + let content: String +} + +private enum ListKind { + case unordered + case ordered + + var tagName: String { + switch self { + case .unordered: return "ul" + case .ordered: return "ol" + } + } +} diff --git a/packages/word-to-html-swift/Tests/WordToHTMLTests/WordHTMLConverterTests.swift b/packages/word-to-html-swift/Tests/WordToHTMLTests/WordHTMLConverterTests.swift new file mode 100644 index 0000000..d1c8f93 --- /dev/null +++ b/packages/word-to-html-swift/Tests/WordToHTMLTests/WordHTMLConverterTests.swift @@ -0,0 +1,279 @@ +import Foundation +import CommonConverterSwift +import OOXMLSwift +@testable import WordToHTML + +#if canImport(XCTest) +import XCTest + +final class WordHTMLConverterTests: XCTestCase { + private let converter = WordHTMLConverter() + + private func makeDocument(paragraphs: [Paragraph]) -> WordDocument { + var doc 
= WordDocument() + for paragraph in paragraphs { + doc.appendParagraph(paragraph) + } + return doc + } + + private func makeDocument(paragraph: Paragraph) -> WordDocument { + makeDocument(paragraphs: [paragraph]) + } + + private func convert(_ document: WordDocument, options: ConversionOptions = .default) throws -> String { + try converter.convertToString(document: document, options: options) + } + + private func temporaryDirectory() throws -> URL { + let url = FileManager.default.temporaryDirectory.appendingPathComponent(UUID().uuidString, isDirectory: true) + try FileManager.default.createDirectory(at: url, withIntermediateDirectories: true) + return url + } + + private func temporaryDocx(from document: WordDocument, name: String = "test.docx") throws -> URL { + let dir = try temporaryDirectory() + let url = dir.appendingPathComponent(name) + try DocxWriter.write(document, to: url) + return url + } + + func testBasicParagraph() throws { + let html = try convert(makeDocument(paragraph: Paragraph(text: "Hello world"))) + XCTAssertTrue(html.contains("

    Hello world

    "), "Got: \(html)") + } + + func testHeadingLevelOne() throws { + var paragraph = Paragraph(text: "Title") + paragraph.properties.style = "Heading1" + + let html = try convert(makeDocument(paragraph: paragraph)) + XCTAssertTrue(html.contains("

    Title

    "), "Got: \(html)") + } + + func testHeadingLevelThree() throws { + var paragraph = Paragraph(text: "Section") + paragraph.properties.style = "Heading 3" + + let html = try convert(makeDocument(paragraph: paragraph)) + XCTAssertTrue(html.contains("

    Section

    "), "Got: \(html)") + } + + func testInlineFormatting() throws { + let runs = [ + Run(text: "bold", properties: RunProperties(bold: true)), + Run(text: " "), + Run(text: "italic", properties: RunProperties(italic: true)), + Run(text: " "), + Run(text: "gone", properties: RunProperties(strikethrough: true)), + ] + let html = try convert(makeDocument(paragraph: Paragraph(runs: runs))) + + XCTAssertTrue(html.contains("bold"), "Got: \(html)") + XCTAssertTrue(html.contains("italic"), "Got: \(html)") + XCTAssertTrue(html.contains("gone"), "Got: \(html)") + } + + func testHTMLNativeInlineFormatting() throws { + let runs = [ + Run(text: "u", properties: RunProperties(underline: .single)), + Run(text: "2", properties: RunProperties(verticalAlign: .superscript)), + Run(text: "mark", properties: RunProperties(highlight: .yellow)), + ] + let html = try convert(makeDocument(paragraph: Paragraph(runs: runs))) + + XCTAssertTrue(html.contains("u"), "Got: \(html)") + XCTAssertTrue(html.contains("2"), "Got: \(html)") + XCTAssertTrue(html.contains("mark"), "Got: \(html)") + } + + func testExternalHyperlink() throws { + var paragraph = Paragraph() + paragraph.runs = [Run(text: "See ")] + paragraph.hyperlinks = [Hyperlink(id: "h1", text: "Example", url: "https://example.com", relationshipId: "rId9")] + + var document = WordDocument() + document.hyperlinkReferences = [HyperlinkReference(relationshipId: "rId9", url: "https://example.com")] + document.appendParagraph(paragraph) + + let html = try convert(document) + XCTAssertTrue(html.contains("Example"), "Got: \(html)") + } + + func testInternalHyperlink() throws { + var paragraph = Paragraph() + paragraph.hyperlinks = [Hyperlink(id: "h1", text: "Jump", anchor: "target")] + + let html = try convert(makeDocument(paragraph: paragraph)) + XCTAssertTrue(html.contains("Jump"), "Got: \(html)") + } + + func testBulletList() throws { + var first = Paragraph(text: "One") + first.properties.numbering = NumberingInfo(numId: 1, level: 0) + var 
second = Paragraph(text: "Two") + second.properties.numbering = NumberingInfo(numId: 1, level: 0) + + var document = makeDocument(paragraphs: [first, second]) + var abstractNum = AbstractNum(abstractNumId: 0) + abstractNum.levels = [Level(ilvl: 0, numFmt: .bullet, lvlText: "•", indent: 720)] + document.numbering.abstractNums = [abstractNum] + document.numbering.nums = [Num(numId: 1, abstractNumId: 0)] + + let html = try convert(document) + XCTAssertTrue(html.contains("
      "), "Got: \(html)") + XCTAssertTrue(html.contains("
    • One
    • "), "Got: \(html)") + XCTAssertTrue(html.contains("
    • Two
    • "), "Got: \(html)") + } + + func testOrderedNestedList() throws { + var first = Paragraph(text: "Step 1") + first.properties.numbering = NumberingInfo(numId: 2, level: 0) + var nested = Paragraph(text: "Detail") + nested.properties.numbering = NumberingInfo(numId: 2, level: 1) + var second = Paragraph(text: "Step 2") + second.properties.numbering = NumberingInfo(numId: 2, level: 0) + + var document = makeDocument(paragraphs: [first, nested, second]) + var abstractNum = AbstractNum(abstractNumId: 1) + abstractNum.levels = [ + Level(ilvl: 0, numFmt: .decimal, lvlText: "%1.", indent: 720), + Level(ilvl: 1, numFmt: .decimal, lvlText: "%2.", indent: 1440), + ] + document.numbering.abstractNums = [abstractNum] + document.numbering.nums = [Num(numId: 2, abstractNumId: 1)] + + let html = try convert(document) + XCTAssertTrue(html.contains("
        "), "Got: \(html)") + XCTAssertTrue(html.contains("
      1. Step 1"), "Got: \(html)") + XCTAssertTrue(html.contains("
      2. Detail
      3. "), "Got: \(html)") + XCTAssertTrue(html.contains("
      4. Step 2
      5. "), "Got: \(html)") + } + + func testCodeBlockStyle() throws { + var paragraph = Paragraph(text: "let x = 42") + paragraph.properties.style = "Code" + + let html = try convert(makeDocument(paragraph: paragraph)) + XCTAssertTrue(html.contains("
        let x = 42
        "), "Got: \(html)") + } + + func testBlockquoteStyle() throws { + var paragraph = Paragraph(text: "Quoted") + paragraph.properties.style = "Quote" + + let html = try convert(makeDocument(paragraph: paragraph)) + XCTAssertTrue(html.contains("

        Quoted

        "), "Got: \(html)") + } + + func testPageBreakProducesHorizontalRule() throws { + var paragraph = Paragraph() + paragraph.hasPageBreak = true + + let html = try convert(makeDocument(paragraph: paragraph)) + XCTAssertTrue(html.contains("
        "), "Got: \(html)") + } + + func testBasicTable() throws { + let table = Table(rows: [ + TableRow(cells: [TableCell(text: "Header 1"), TableCell(text: "Header 2")]), + TableRow(cells: [TableCell(text: "A"), TableCell(text: "B")]), + ]) + var document = WordDocument() + document.body.children.append(.table(table)) + + let html = try convert(document) + XCTAssertTrue(html.contains(""), "Got: \(html)") + XCTAssertTrue(html.contains(""), "Got: \(html)") + XCTAssertTrue(html.contains(""), "Got: \(html)") + } + + func testInlineImageReference() throws { + let drawing = Drawing( + type: .inline, + width: 914400, + height: 914400, + imageId: "rId5", + name: "diagram", + description: "Architecture diagram" + ) + var run = Run(text: "") + run.drawing = drawing + + var document = WordDocument() + document.images = [ImageReference(id: "rId5", fileName: "figure.png", contentType: "image/png", data: Data([0x01, 0x02]))] + document.appendParagraph(Paragraph(runs: [run])) + + let html = try convert(document) + XCTAssertTrue(html.contains("\"Architecture"), "Got: \(html)") + } + + func testImageExtractionWhenFiguresDirectoryProvided() throws { + let drawing = Drawing( + type: .inline, + width: 914400, + height: 914400, + imageId: "rId6", + name: "photo", + description: "Photo" + ) + var run = Run(text: "") + run.drawing = drawing + + var document = WordDocument() + document.images = [ImageReference(id: "rId6", fileName: "photo.png", contentType: "image/png", data: Data([0x89, 0x50, 0x4E, 0x47]))] + document.appendParagraph(Paragraph(runs: [run])) + + let base = try temporaryDirectory() + let directory = base.appendingPathComponent("images", isDirectory: true) + var options = ConversionOptions.default + options.fidelity = .markdownWithFigures + options.figuresDirectory = directory + + let html = try convert(document, options: options) + XCTAssertTrue(html.contains("\"Photo\""), "Got: \(html)") + XCTAssertTrue(FileManager.default.fileExists(atPath: 
directory.appendingPathComponent("photo.png").path)) + } + + func testFootnoteEmission() throws { + var paragraph = Paragraph(text: "Text") + paragraph.footnoteIds = [1] + + var document = WordDocument() + document.footnotes.footnotes = [Footnote(id: 1, text: "A footnote.", paragraphIndex: 0)] + document.appendParagraph(paragraph) + + let html = try convert(document) + XCTAssertTrue(html.contains("href=\"#fn-1\""), "Got: \(html)") + XCTAssertTrue(html.contains("
        "), "Got: \(html)") + XCTAssertTrue(html.contains("A footnote."), "Got: \(html)") + } + + func testFrontmatterIncludesMetadata() throws { + var document = WordDocument() + document.properties.title = "My Doc" + document.properties.creator = "Author" + document.appendParagraph(Paragraph(text: "content")) + + var options = ConversionOptions.default + options.includeFrontmatter = true + + let html = try convert(document, options: options) + XCTAssertTrue(html.contains("
        Header 1A