Skip to content

Commit

Permalink
feat(docx-to-vfile): be less prescriptive about which files are to be…
Browse files Browse the repository at this point in the history
… included in the vfile, defer that to `reoff-parse`
  • Loading branch information
tefkah committed Mar 5, 2023
1 parent 4621f7d commit 5ec48a5
Show file tree
Hide file tree
Showing 5 changed files with 116 additions and 56 deletions.
8 changes: 1 addition & 7 deletions libs/reoff/docx-to-vfile/src/fixtures/test.xml

Large diffs are not rendered by default.

10 changes: 1 addition & 9 deletions libs/reoff/docx-to-vfile/src/fixtures/testimages.xml

Large diffs are not rendered by default.

8 changes: 1 addition & 7 deletions libs/reoff/docx-to-vfile/src/fixtures/testrelations.xml

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,14 @@ describe('reoffDocxToVfile', () => {
const vfile = await docxToVFile(new Uint8Array(docimg))
const url = new URL('../fixtures/testimages.xml', import.meta.url)
fs.writeFileSync(url, String(vfile))
console.dir(vfile.data, { depth: null })

expect(vfile.data.images).toBeDefined()
expect(vfile.data.media).toBeDefined()
})
})

/**
 * Data carried alongside a converted .docx document.
 *
 * Besides the two fixed members, any key ending in `.xml` or `.rels` may hold
 * the text of the corresponding archive entry.
 */
interface Data {
  /** String-to-string relation map. */
  relations: Record<string, string>
  /** Binary media payloads, indexed by string key. */
  media: Record<string, ArrayBuffer>
  /** Text of an `.xml` / `.rels` entry, or undefined when absent. */
  [key: `${string}.xml` | `${string}.rels`]: string | undefined
}
137 changes: 105 additions & 32 deletions libs/reoff/docx-to-vfile/src/lib/docx-to-vfile-unzipit.ts
Original file line number Diff line number Diff line change
@@ -1,54 +1,127 @@
import { VFile } from 'vfile'
import { Data, VFile } from 'vfile'
import { unzip } from 'unzipit'

/**
 * Strips the first `<?xml ... ?>` processing instruction from `text`.
 *
 * The instruction may be spread over several lines, so the pattern uses
 * `[\s\S]` instead of `.` (which stops at line breaks).
 *
 * @param text XML source, possibly undefined
 * @returns `text` without its first `<?xml ... ?>` instruction, or '' when `text` is undefined or empty
 */
const removeHeader = (text: string | undefined) =>
  text ? text.replace(/<\?xml[\s\S]*?\?>/, '') : ''

/**
 * Drops every carriage return (`\r`) from `text`.
 *
 * @param text input string, possibly undefined
 * @returns `text` with all `\r` characters removed, or '' when `text` is undefined or empty
 */
const removeCarriage = (text: string | undefined) => {
  if (!text) {
    return ''
  }
  return text.split('\r').join('')
}

export interface Options {
// NOTE(review): still read by an `if (options.withoutImages)` check in `docxToVFile`;
// looks like the earlier name of `withoutMedia` — confirm whether it is still needed.
withoutImages: boolean
/**
* Whether to leave media out of the VFile.
*
* When falsy, the media referenced by the document relations is read and stored on the `data.media` attribute of the VFile as an object of ArrayBuffers, which are accessible both client and serverside.
* When true, no media is read and `data.media` is an empty object.
*
* @default false
*/
withoutMedia?: boolean
/**
* Selects which archive entries are added to the `data` attribute of the VFile.
* Only entries whose path contains `xml` or `rels` are considered at all.
*
* - If an array of strings or regexps is passed, an entry is included when its path contains one of the strings or matches one of the regexps.
* - If a function is passed, it will be called with the path of each candidate entry and should return true to include it.
* - If the value is 'all', almost all files will be included, except for 'word/document.xml', as that already is the root of the VFile.
* - If the value is 'allWithDocumentXML', all files will be included, including `word/document.xml`, even though that is already the root of the VFile. Useful if you really want to mimic the original docx file.
*
* @default 'all'
*/
include?: string[] | RegExp[] | ((key: string) => boolean) | 'all' | 'allWithDocumentXML'
}

export async function docxToVFile(file: ArrayBuffer, options: Options = { withoutImages: false }) {
/**
 * Shape of the `data` attribute on the VFile returned by `docxToVFile`.
 */
export interface DocxData extends Data {
  /**
   * Relationship `Id` → `Target` pairs parsed from `word/_rels/document.xml.rels`
   */
  relations: Record<string, string>
  /**
   * Binary contents of the media files in the .docx file, keyed by relation target
   */
  media: Record<string, ArrayBuffer>
  /**
   * Text content of the `.xml` and `.rels` entries in the .docx file, keyed by entry path
   */
  [key: `${string}.xml` | `${string}.rels`]: string | undefined
}

/**
* A VFile whose `data` attribute is narrowed to `DocxData`, i.e. it carries
* the `relations` and `media` maps plus the text of `.xml` / `.rels` entries.
*/
export interface DocxVFile extends VFile {
data: DocxData
}

/**
* Takes a docx file as an ArrayBuffer and returns a VFile with the contents of the document.xml file as the root, and the contents of the other xml files as data.
*
* The archive is unzipped, `word/_rels/document.xml.rels` is parsed into an Id → Target map (`data.relations`),
* the selected `.xml` / `.rels` entries are stored as text on `data`, and, unless `withoutMedia` is set,
* every relation target containing `media/` is read into `data.media`.
*
* NOTE(review): this listing appears to interleave the previous and the current implementation
* (e.g. both `images` and `media` are filled, and `if (options.withoutImages) {` is never closed) —
* confirm against the actual file before relying on the statement order below.
*
* @param file The docx file as an ArrayBuffer
* @param userOptions Options; merged over the defaults `{ withoutMedia: false, include: 'all' }`
* @returns A VFile with the contents of the document.xml file as the root, and the contents of the other xml files as data.
*/
export async function docxToVFile(
file: ArrayBuffer,
userOptions: Options = {},
): Promise<DocxVFile> {
// Defaults first, so any key present in `userOptions` overrides them.
const options: Options = {
withoutMedia: false,
include: 'all',
...userOptions,
}

const { entries } = await unzip(file)
// NOTE(review): looks like leftover debug output — logs the entries on every call.
console.log(entries)
// Not guarded with `?.`: a missing relations entry would fail on `.text()`.
const rels = await entries['word/_rels/document.xml.rels'].text()
// Collect every `Id="…" … Target="…"` pair into an Id → Target map.
const relations = Object.fromEntries(
[...rels.matchAll(/Id="(.*?)".*?Target="(.*?)"/g)].map((match) => [match[1], match[2]]),
)

// Main document part; like the relations entry, accessed without a guard.
const doc = await entries['word/document.xml'].text()
// These parts are read with `?.` and fall back to '' when missing.
const foot = (await entries?.['word/footnotes.xml']?.text()) || ''
const end = (await entries?.['word/endnotes.xml']?.text()) || ''
const styles = (await entries?.['word/styles.xml']?.text()) || ''
const bib = (await entries?.['customXml/item1.xml']?.text()) || ''

// const {
// 'word/document.xml': document,
// 'word/footnotes.xml': footnotes,
// ...bibliography
// } = data

// Splices the header-less footnotes, bibliography, endnotes and styles in front of
// the closing `</w:document>` tag; the slice assumes `doc` ends with exactly that tag.
const total = `${removeCarriage(doc).slice(0, -'</w:document>'.length)}
${removeHeader(foot)}
${removeHeader(bib)}
${removeHeader(end)}
${removeHeader(styles)}
</w:document>`

const vfile = new VFile(total)

vfile.data.relations = relations

if (options.withoutImages) {
return vfile

// Read the text of every selected entry in parallel as [path, text] pairs.
const textEntriesObjectEntries = await Promise.all(
Object.entries(entries)
// Unanchored test: keeps any path that contains "xml" or "rels" anywhere.
.filter(([key]) => /xml|rels/.test(key))
// Apply the `include` option; anything that matches no branch is dropped.
.filter(([key]) => {
if (options.include === 'all') {
return key !== 'word/document.xml'
}
if (options.include === 'allWithDocumentXML') {
return true
}
if (typeof options.include === 'function') {
return options.include(key)
}
if (Array.isArray(options.include)) {
return options.include.some((include) => {
if (typeof include === 'string') {
// Substring match, not an exact path comparison.
return key.includes(include)
}
if (include instanceof RegExp) {
return include.test(key)
}
return false
})
}
return false
})
.map(async ([key, value]) => [key, removeCarriage(await value.text())]),
)

const textEntriesObject = Object.fromEntries(textEntriesObjectEntries)

// const vfile = new VFile(removeCarriage(doc))

const vfileData: DocxData = textEntriesObject
vfileData.relations = relations
// Start with an empty media map so `data.media` is always defined.
vfileData.media = {} as { [key: string]: ArrayBuffer }

// vfile.data = vfileData

// Early exit: no media entries are read, `data.media` stays empty.
if (options.withoutMedia) {
return new VFile({ value: removeCarriage(doc), data: vfileData }) as DocxVFile
}

// Media files are the relation targets whose path contains "media/".
const mediaUrls = Object.values(relations).filter((rel: string) => rel.includes('media/'))
const images = {} as { [key: string]: ArrayBuffer }
const media = {} as { [key: string]: ArrayBuffer }
// Entries are awaited one at a time and looked up under `word/`, without a guard
// for targets that have no matching archive entry.
for (const url of mediaUrls) {
images[url] = await entries[`word/${url}`].arrayBuffer()
media[url] = await entries[`word/${url}`].arrayBuffer()
}
vfile.data.images = images
return vfile
vfileData.media = media
return new VFile({ value: removeCarriage(doc), data: vfileData }) as DocxVFile
}

0 comments on commit 5ec48a5

Please sign in to comment.