feat(reoff): parse docx to unist

TrialAndErrorOrg · Feb 10, 2022 · 7dcdb70 · 7dcdb70
1 parent 5ac7dcf
commit 7dcdb70
Show file tree

Hide file tree

Showing 12 changed files with 1,874,748 additions and 116 deletions.
diff --git a/libs/reoff/docx-to-vfile/src/fixtures/Manuscript-2.docx b/libs/reoff/docx-to-vfile/src/fixtures/Manuscript-2.docx
diff --git a/libs/reoff/docx-to-vfile/src/index.ts b/libs/reoff/docx-to-vfile/src/index.ts
@@ -1 +1,2 @@
+export * from './lib/get-xml-data'
 export * from './lib/docx-to-vfile'
diff --git a/libs/reoff/docx-to-vfile/src/lib/__snapshots__/docx-to-vfile.spec.ts.snap b/libs/reoff/docx-to-vfile/src/lib/__snapshots__/docx-to-vfile.spec.ts.snap
diff --git a/libs/reoff/docx-to-vfile/src/lib/docx-to-vfile.spec.ts b/libs/reoff/docx-to-vfile/src/lib/docx-to-vfile.spec.ts
@@ -1,7 +1,12 @@
-import { reoffDocxToVfile } from './reoff-docx-to-vfile'
+import { docxToVFile } from './docx-to-vfile'
+import fs from 'fs'
+import path from 'path'
 
 describe('reoffDocxToVfile', () => {
-  it('should work', () => {
-    expect(reoffDocxToVfile()).toEqual('reoff-docx-to-vfile')
+  const doc = fs.readFileSync(
+    path.join(__dirname, '../fixtures/Manuscript-2.docx')
+  )
+  it('should work', async () => {
+    expect(await docxToVFile(doc)).toMatchSnapshot()
   })
 })
diff --git a/libs/reoff/docx-to-vfile/src/lib/docx-to-vfile.ts b/libs/reoff/docx-to-vfile/src/lib/docx-to-vfile.ts
@@ -1,59 +1,20 @@
-import AdmZip from 'adm-zip'
-import { extname } from 'path'
+import { VFile } from 'vfile'
+import { getXMLData } from './get-xml-data'
 
-const tab = '\t',
-  cr = '\n\n',
-  empty = ''
-const tabRegex = new RegExp('<w:tab/>', 'g')
-const tagRegex = new RegExp('(</|<)w:[^>]*>', 'g')
-const extensionRegex = new RegExp('^(.docx|.xlsx|.pptx)$')
-const paragraphRegex = new RegExp(
-  '(<w:t>|<w:t xml:space="preserve">)[^]*?(?=</w:p>)',
-  'g'
-)
-
-export async function getXMLData(
-  file: string | Buffer,
-  {
-    filename,
-    xml = true,
-    returnBuffer = false,
-  }: { filename: string | string[]; xml?: boolean; returnBuffer?: boolean }
-) {
-  return new Promise((resolve, reject) => {
-    if (
-      typeof file === 'string' &&
-      !extensionRegex.test(extname(file).toLowerCase())
-    ) {
-      reject(new Error('The file must be either a .docx, .xlsx or .pptx'))
-    }
-    try {
-      const zip = new AdmZip(file)
-      zip
-        ?.getEntry(xml ? `word/${filename}.xml` : filename)
-        ?.getDataAsync((data) => resolve(data.toString()))
-    } catch (err) {
-      reject(`${err} (${file})`)
-    }
-  })
-}
-
-/**
- * Extracts the text from your Office file.
- *
- * @param {String} path Path to the file you want to extract the text from.
- * @param {String} [xmlFilename='document'] Optional argument used to specify
- * the XML component of the file from which to extract the text (default is: 'document').
- */
-export const extractText = async (
-  path: string,
-  xmlFilename: string = 'document'
-) => {
-  const xml = await getXMLData(path, xmlFilename)
-  let paragraph,
-    text = ''
-  while ((paragraph = paragraphRegex.exec(xml))) {
-    text += paragraph[0].replace(tabRegex, tab).replace(tagRegex, empty) + cr
-  }
-  return text
+export async function docxToVFile(file: Buffer | string) {
+  const mainXML = (await getXMLData(file)).slice(0, -'</w:document>'.length)
+  // xast-util-from-xml cannot handle two xml headers in one doc
+  const footnotes = (await getXMLData(file, { filename: 'footnotes' })).replace(
+    '<?xml version="1.0" encoding="UTF-8" standalone="yes"?>',
+    ''
+  )
+  // easier to put the footnotes in the same spot
+  const total = `${mainXML}${footnotes}
+  </w:document>`
+  console.log(footnotes)
+  const vfile = new VFile(total)
+  // if (footnotes) {
+  //   Object.assign(vfile.data, { footnotes })
+  // }
+  return vfile
 }
diff --git a/libs/reoff/docx-to-vfile/src/lib/get-xml-data.ts b/libs/reoff/docx-to-vfile/src/lib/get-xml-data.ts
@@ -0,0 +1,59 @@
+import AdmZip from 'adm-zip'
+import { extname } from 'path'
+
+const tab = '\t',
+  cr = '\n\n',
+  empty = ''
+const tabRegex = new RegExp('<w:tab/>', 'g')
+const tagRegex = new RegExp('(</|<)w:[^>]*>', 'g')
+const extensionRegex = new RegExp('^(.docx|.xlsx|.pptx)$')
+const paragraphRegex = new RegExp(
+  '(<w:t>|<w:t xml:space="preserve">)[^]*?(?=</w:p>)',
+  'g'
+)
+
+export async function getXMLData(
+  file: string | Buffer,
+  { filename = 'document', xml = true, returnBuffer = false } = {
+    filename: 'document',
+    xml: true,
+    returnBuffer: false,
+  }
+): Promise<string> {
+  return new Promise((resolve, reject) => {
+    if (
+      typeof file === 'string' &&
+      !extensionRegex.test(extname(file).toLowerCase())
+    ) {
+      reject(new Error('The file must be either a .docx, .xlsx or .pptx'))
+    }
+    try {
+      const zip = new AdmZip(file)
+      zip
+        ?.getEntry(xml ? `word/${filename}.xml` : filename)
+        ?.getDataAsync((data) => resolve(data.toString()))
+    } catch (err) {
+      reject(`${err} (${file})`)
+    }
+  })
+}
+
+/**
+ * Extracts the text from your Office file.
+ *
+ * @param {String} path Path to the file you want to extract the text from.
+ * @param {String} [xmlFilename='document'] Optional argument used to specify
+ * the XML component of the file from which to extract the text (default is: 'document').
+ */
+export const extractText = async (
+  path: string,
+  xmlFilename: string = 'document'
+) => {
+  const xml = await getXMLData(path, { filename: xmlFilename })
+  let paragraph,
+    text = ''
+  while ((paragraph = paragraphRegex.exec(xml))) {
+    text += paragraph[0].replace(tabRegex, tab).replace(tagRegex, empty) + cr
+  }
+  return text
+}