Skip to content

Commit

Permalink
feat(reoff): parse docx to unist
Browse files Browse the repository at this point in the history
  • Loading branch information
tefkah committed Feb 10, 2022
1 parent 5ac7dcf commit 7dcdb70
Show file tree
Hide file tree
Showing 12 changed files with 1,874,748 additions and 116 deletions.
Binary file not shown.
1 change: 1 addition & 0 deletions libs/reoff/docx-to-vfile/src/index.ts
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
export * from './lib/get-xml-data'
export * from './lib/docx-to-vfile'

Large diffs are not rendered by default.

11 changes: 8 additions & 3 deletions libs/reoff/docx-to-vfile/src/lib/docx-to-vfile.spec.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,12 @@
import { reoffDocxToVfile } from './reoff-docx-to-vfile'
import { docxToVFile } from './docx-to-vfile'
import fs from 'fs'
import path from 'path'

describe('reoffDocxToVfile', () => {
it('should work', () => {
expect(reoffDocxToVfile()).toEqual('reoff-docx-to-vfile')
const doc = fs.readFileSync(
path.join(__dirname, '../fixtures/Manuscript-2.docx')
)
it('should work', async () => {
expect(await docxToVFile(doc)).toMatchSnapshot()
})
})
75 changes: 18 additions & 57 deletions libs/reoff/docx-to-vfile/src/lib/docx-to-vfile.ts
Original file line number Diff line number Diff line change
@@ -1,59 +1,20 @@
import AdmZip from 'adm-zip'
import { extname } from 'path'
import { VFile } from 'vfile'
import { getXMLData } from './get-xml-data'

// Replacement strings used when flattening WordprocessingML to plain text.
const tab = '\t'
const cr = '\n\n'
const empty = ''
// Matches every `<w:tab/>` element.
const tabRegex = new RegExp('<w:tab/>', 'g')
// Matches any opening, closing or self-closing tag with the `w:` prefix.
const tagRegex = new RegExp('(</|<)w:[^>]*>', 'g')
// Accepted extensions, tested against the lower-cased result of `extname`
// (leading dot included). The dot is escaped so that it only matches a
// literal '.', not an arbitrary character.
const extensionRegex = new RegExp('^\\.(docx|xlsx|pptx)$')
// Matches from the first text run of a paragraph up to (but not including)
// the next closing `</w:p>` tag.
const paragraphRegex = new RegExp(
  '(<w:t>|<w:t xml:space="preserve">)[^]*?(?=</w:p>)',
  'g'
)

/**
 * Reads a single entry from an Office Open XML archive and returns its
 * contents as a string.
 *
 * @param file Path to the archive, or a Buffer holding its bytes. Only paths
 *   are checked for a .docx/.xlsx/.pptx extension.
 * @param options `filename` is the entry to read; when `xml` is true (the
 *   default) it is expanded to `word/<filename>.xml`, otherwise it is used as
 *   the entry path as given. `returnBuffer` is currently unused.
 *   NOTE(review): an array `filename` is coerced to a comma-joined string,
 *   exactly as before; confirm whether arrays are really meant to be accepted.
 * @returns The entry's data converted with `toString()`.
 * @throws Rejects with an `Error` when the extension is not supported, the
 *   archive cannot be opened, the entry does not exist, or its data cannot
 *   be read.
 */
export async function getXMLData(
  file: string | Buffer,
  {
    filename,
    xml = true,
    returnBuffer = false,
  }: { filename: string | string[]; xml?: boolean; returnBuffer?: boolean }
): Promise<string> {
  if (
    typeof file === 'string' &&
    !extensionRegex.test(extname(file).toLowerCase())
  ) {
    // Throwing rejects the returned promise and, unlike a bare `reject(...)`
    // without `return`, stops before the archive is opened.
    throw new Error('The file must be either a .docx, .xlsx or .pptx')
  }

  const entryName = xml ? `word/${filename}.xml` : String(filename)
  // Never interpolate a Buffer into a message: it would dump the raw bytes.
  const source = typeof file === 'string' ? file : '<Buffer>'

  return new Promise<string>((resolve, reject) => {
    try {
      const entry = new AdmZip(file).getEntry(entryName)
      if (!entry) {
        // Without this branch the promise would never settle for a missing entry.
        reject(new Error(`Entry "${entryName}" not found (${source})`))
        return
      }
      entry.getDataAsync((data, err) => {
        if (err || data == null) {
          reject(new Error(`${String(err || 'No data returned')} (${source})`))
          return
        }
        resolve(data.toString())
      })
    } catch (err) {
      const reason = err instanceof Error ? err.message : String(err)
      reject(new Error(`${reason} (${source})`))
    }
  })
}

/**
* Extracts the text from your Office file.
*
* @param {String} path Path to the file you want to extract the text from.
* @param {String} [xmlFilename='document'] Optional argument used to specify
* the XML component of the file from which to extract the text (default is: 'document').
*/
export const extractText = async (
path: string,
xmlFilename: string = 'document'
) => {
const xml = await getXMLData(path, xmlFilename)
let paragraph,
text = ''
while ((paragraph = paragraphRegex.exec(xml))) {
text += paragraph[0].replace(tabRegex, tab).replace(tagRegex, empty) + cr
}
return text
/**
 * Builds a VFile whose value is the main document XML of a .docx file with
 * the footnotes XML spliced in just before the closing `</w:document>` tag.
 *
 * @param file Path to the .docx file, or a Buffer holding its bytes; it is
 *   passed straight to `getXMLData`.
 * @returns A VFile containing the combined XML string.
 * @throws Rejects when the main document part cannot be read or contains no
 *   closing `</w:document>` tag.
 */
export async function docxToVFile(file: Buffer | string): Promise<VFile> {
  const closingTag = '</w:document>'
  const documentXML = await getXMLData(file)

  // Locate the closing tag instead of chopping a fixed number of characters,
  // so anything trailing the root element does not corrupt the output.
  const closingIndex = documentXML.lastIndexOf(closingTag)
  if (closingIndex === -1) {
    throw new Error('The main document part has no closing </w:document> tag')
  }
  const mainXML = documentXML.slice(0, closingIndex)

  let footnotesXML = ''
  try {
    footnotesXML = await getXMLData(file, { filename: 'footnotes' })
  } catch {
    // If the footnotes part cannot be read, continue without footnotes.
  }

  // xast-util-from-xml cannot handle two xml headers in one doc, so strip the
  // leading declaration whatever its exact attributes or quoting.
  const footnotes = footnotesXML.replace(/^\uFEFF?<\?xml\b[^>]*\?>/i, '')

  // easier to put the footnotes in the same spot: they are inlined into the
  // document XML rather than stored separately on the VFile.
  return new VFile(`${mainXML}${footnotes}\n${closingTag}`)
}
59 changes: 59 additions & 0 deletions libs/reoff/docx-to-vfile/src/lib/get-xml-data.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
import AdmZip from 'adm-zip'
import { extname } from 'path'

// Replacement strings used when flattening WordprocessingML to plain text.
const tab = '\t'
const cr = '\n\n'
const empty = ''
// Matches every `<w:tab/>` element.
const tabRegex = new RegExp('<w:tab/>', 'g')
// Matches any opening, closing or self-closing tag with the `w:` prefix.
const tagRegex = new RegExp('(</|<)w:[^>]*>', 'g')
// Accepted extensions, tested against the lower-cased result of `extname`
// (leading dot included). The dot is escaped so that it only matches a
// literal '.', not an arbitrary character.
const extensionRegex = new RegExp('^\\.(docx|xlsx|pptx)$')
// Matches from the first text run of a paragraph up to (but not including)
// the next closing `</w:p>` tag.
const paragraphRegex = new RegExp(
  '(<w:t>|<w:t xml:space="preserve">)[^]*?(?=</w:p>)',
  'g'
)

/**
 * Reads a single entry from an Office Open XML archive and returns its
 * contents as a string.
 *
 * @param file Path to the archive, or a Buffer holding its bytes. Only paths
 *   are checked for a .docx/.xlsx/.pptx extension.
 * @param options `filename` is the entry to read (default `'document'`); when
 *   `xml` is true (the default) it is expanded to `word/<filename>.xml`,
 *   otherwise it is used as the entry path as given. `returnBuffer` is
 *   currently unused and only kept so existing callers keep compiling.
 * @returns The entry's data converted with `toString()`.
 * @throws Rejects with an `Error` when the extension is not supported, the
 *   archive cannot be opened, the entry does not exist, or its data cannot
 *   be read.
 */
export async function getXMLData(
  file: string | Buffer,
  { filename = 'document', xml = true, returnBuffer = false } = {
    filename: 'document',
    xml: true,
    returnBuffer: false,
  }
): Promise<string> {
  if (
    typeof file === 'string' &&
    !extensionRegex.test(extname(file).toLowerCase())
  ) {
    // Throwing rejects the returned promise and, unlike a bare `reject(...)`
    // without `return`, stops before the archive is opened.
    throw new Error('The file must be either a .docx, .xlsx or .pptx')
  }

  const entryName = xml ? `word/${filename}.xml` : filename
  // Never interpolate a Buffer into a message: it would dump the raw bytes.
  const source = typeof file === 'string' ? file : '<Buffer>'

  return new Promise<string>((resolve, reject) => {
    try {
      const entry = new AdmZip(file).getEntry(entryName)
      if (!entry) {
        // Without this branch the promise would never settle for a missing entry.
        reject(new Error(`Entry "${entryName}" not found (${source})`))
        return
      }
      entry.getDataAsync((data, err) => {
        if (err || data == null) {
          reject(new Error(`${String(err || 'No data returned')} (${source})`))
          return
        }
        resolve(data.toString())
      })
    } catch (err) {
      const reason = err instanceof Error ? err.message : String(err)
      reject(new Error(`${reason} (${source})`))
    }
  })
}

/**
 * Produces a plain-text rendering of one XML part of an Office file.
 *
 * Every paragraph match is converted by turning `<w:tab/>` elements into tab
 * characters and removing the remaining `w:` tags; each converted paragraph
 * is followed by a blank line.
 *
 * @param path Path to the file to read.
 * @param xmlFilename Name of the XML part to extract text from
 *   (default: 'document').
 * @returns The concatenated paragraph text.
 */
export const extractText = async (
  path: string,
  xmlFilename: string = 'document'
) => {
  const partXml = await getXMLData(path, { filename: xmlFilename })
  // A global `match` yields every paragraph chunk at once (or null for none).
  const chunks = partXml.match(paragraphRegex) ?? []
  return chunks
    .map((chunk) => chunk.replace(tabRegex, tab).replace(tagRegex, empty) + cr)
    .join('')
}
Loading

0 comments on commit 7dcdb70

Please sign in to comment.