-
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
12 changed files
with
1,874,748 additions
and
116 deletions.
There are no files selected for viewing
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1,2 @@ | ||
export * from './lib/get-xml-data' | ||
export * from './lib/docx-to-vfile' |
15 changes: 15 additions & 0 deletions
15
libs/reoff/docx-to-vfile/src/lib/__snapshots__/docx-to-vfile.spec.ts.snap
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,12 @@ | ||
import { reoffDocxToVfile } from './reoff-docx-to-vfile' | ||
import { docxToVFile } from './docx-to-vfile' | ||
import fs from 'fs' | ||
import path from 'path' | ||
|
||
describe('reoffDocxToVfile', () => { | ||
it('should work', () => { | ||
expect(reoffDocxToVfile()).toEqual('reoff-docx-to-vfile') | ||
const doc = fs.readFileSync( | ||
path.join(__dirname, '../fixtures/Manuscript-2.docx') | ||
) | ||
it('should work', async () => { | ||
expect(await docxToVFile(doc)).toMatchSnapshot() | ||
}) | ||
}) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,59 +1,20 @@ | ||
import AdmZip from 'adm-zip' | ||
import { extname } from 'path' | ||
import { VFile } from 'vfile' | ||
import { getXMLData } from './get-xml-data' | ||
|
||
// Replacement strings used when flattening WordprocessingML into plain text.
const tab = '\t'
const cr = '\n\n'
const empty = ''

// Matches the self-closing tab element.
const tabRegex = /<w:tab\/>/g
// Matches any opening or closing tag in the `w:` namespace.
const tagRegex = /(<\/|<)w:[^>]*>/g
// Matches exactly the supported extensions; the dots are escaped so that they
// only match a literal '.' instead of any character.
const extensionRegex = /^(\.docx|\.xlsx|\.pptx)$/
// Matches from the first text run of a paragraph up to (not including) the
// paragraph's closing tag. `[^]` matches any character, newlines included.
const paragraphRegex = /(<w:t>|<w:t xml:space="preserve">)[^]*?(?=<\/w:p>)/g
|
||
/**
 * Reads a single entry out of an Office archive and returns it as a string.
 *
 * @param file Path to a .docx/.xlsx/.pptx file, or a Buffer holding the archive.
 * @param options.filename Name of the entry to read.
 * @param options.xml When true (default) the entry read is
 *   `word/<filename>.xml`; when false `filename` is used as the entry path.
 * @param options.returnBuffer Accepted but currently ignored.
 * @returns The entry's data converted with `toString()`.
 * @throws Rejects with an Error when a path has an unsupported extension, the
 *   archive cannot be opened, the entry is missing, or the entry cannot be read.
 */
export async function getXMLData(
  file: string | Buffer,
  {
    filename,
    xml = true,
    returnBuffer = false,
  }: { filename: string | string[]; xml?: boolean; returnBuffer?: boolean }
): Promise<string> {
  // Only paths can be checked by extension; Buffers are passed through as-is.
  if (
    typeof file === 'string' &&
    !extensionRegex.test(extname(file).toLowerCase())
  ) {
    throw new Error('The file must be either a .docx, .xlsx or .pptx')
  }

  // Never interpolate a Buffer into an error message: it would dump the
  // whole archive into the text.
  const source = typeof file === 'string' ? file : '<Buffer>'
  const entryName = xml ? `word/${filename}.xml` : String(filename)

  return new Promise<string>((resolve, reject) => {
    try {
      const entry = new AdmZip(file).getEntry(entryName)
      if (!entry) {
        // Without this the promise would never settle for a missing entry.
        reject(new Error(`Entry "${entryName}" not found (${source})`))
        return
      }
      entry.getDataAsync((data, err) => {
        if (err) {
          reject(new Error(`${String(err)} (${source})`))
          return
        }
        resolve(data.toString())
      })
    } catch (err) {
      reject(new Error(`${String(err)} (${source})`))
    }
  })
}
|
||
/** | ||
* Extracts the text from your Office file. | ||
* | ||
* @param {String} path Path to the file you want to extract the text from. | ||
* @param {String} [xmlFilename='document'] Optional argument used to specify | ||
* the XML component of the file from which to extract the text (default is: 'document'). | ||
*/ | ||
export const extractText = async ( | ||
path: string, | ||
xmlFilename: string = 'document' | ||
) => { | ||
const xml = await getXMLData(path, xmlFilename) | ||
let paragraph, | ||
text = '' | ||
while ((paragraph = paragraphRegex.exec(xml))) { | ||
text += paragraph[0].replace(tabRegex, tab).replace(tagRegex, empty) + cr | ||
} | ||
return text | ||
/**
 * Builds a VFile whose contents are the main document XML of a .docx with the
 * footnotes XML spliced in just before the closing `</w:document>` tag.
 *
 * @param file Path to the .docx file, or a Buffer holding its contents; it is
 *   handed to `getXMLData` unchanged.
 * @returns A VFile wrapping the combined XML string.
 * @throws Error when the main document XML has no closing `</w:document>`
 *   tag. Rejections from `getXMLData` propagate to the caller.
 */
export async function docxToVFile(file: Buffer | string) {
  const closingTag = '</w:document>'

  // Cut at the closing tag itself rather than at a fixed distance from the
  // end, so trailing whitespace after the tag cannot corrupt the result.
  const documentXML = await getXMLData(file)
  const closingTagIndex = documentXML.lastIndexOf(closingTag)
  if (closingTagIndex === -1) {
    throw new Error(`Main document XML does not contain ${closingTag}`)
  }
  const mainXML = documentXML.slice(0, closingTagIndex)

  // xast-util-from-xml cannot handle two xml headers in one doc, so drop the
  // leading declaration (and a byte-order mark, if any) from the footnotes.
  const footnotes = (await getXMLData(file, { filename: 'footnotes' })).replace(
    /^\uFEFF?<\?xml\s[^>]*\?>/,
    ''
  )

  // easier to put the footnotes in the same spot
  return new VFile(`${mainXML}${footnotes}\n${closingTag}`)
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
import AdmZip from 'adm-zip' | ||
import { extname } from 'path' | ||
|
||
// Replacement strings used when flattening WordprocessingML into plain text.
const tab = '\t'
const cr = '\n\n'
const empty = ''

// Matches the self-closing tab element.
const tabRegex = /<w:tab\/>/g
// Matches any opening or closing tag in the `w:` namespace.
const tagRegex = /(<\/|<)w:[^>]*>/g
// Matches exactly the supported extensions; the dots are escaped so that they
// only match a literal '.' instead of any character.
const extensionRegex = /^(\.docx|\.xlsx|\.pptx)$/
// Matches from the first text run of a paragraph up to (not including) the
// paragraph's closing tag. `[^]` matches any character, newlines included.
const paragraphRegex = /(<w:t>|<w:t xml:space="preserve">)[^]*?(?=<\/w:p>)/g
|
||
/**
 * Reads a single entry out of an Office archive and returns it as a string.
 *
 * @param file Path to a .docx/.xlsx/.pptx file, or a Buffer holding the archive.
 * @param options.filename Name of the entry to read (default `'document'`).
 * @param options.xml When true (default) the entry read is
 *   `word/<filename>.xml`; when false `filename` is used as the entry path.
 * @param options.returnBuffer Accepted but currently ignored.
 * @returns The entry's data converted with `toString()`.
 * @throws Rejects with an Error when a path has an unsupported extension, the
 *   archive cannot be opened, the entry is missing, or the entry cannot be read.
 */
export async function getXMLData(
  file: string | Buffer,
  { filename = 'document', xml = true, returnBuffer = false } = {
    filename: 'document',
    xml: true,
    returnBuffer: false,
  }
): Promise<string> {
  // Only paths can be checked by extension; Buffers are passed through as-is.
  if (
    typeof file === 'string' &&
    !extensionRegex.test(extname(file).toLowerCase())
  ) {
    throw new Error('The file must be either a .docx, .xlsx or .pptx')
  }

  // Never interpolate a Buffer into an error message: it would dump the
  // whole archive into the text.
  const source = typeof file === 'string' ? file : '<Buffer>'
  const entryName = xml ? `word/${filename}.xml` : filename

  return new Promise<string>((resolve, reject) => {
    try {
      const entry = new AdmZip(file).getEntry(entryName)
      if (!entry) {
        // Without this the promise would never settle for a missing entry.
        reject(new Error(`Entry "${entryName}" not found (${source})`))
        return
      }
      entry.getDataAsync((data, err) => {
        if (err) {
          reject(new Error(`${String(err)} (${source})`))
          return
        }
        resolve(data.toString())
      })
    } catch (err) {
      reject(new Error(`${String(err)} (${source})`))
    }
  })
}
|
||
/**
 * Extracts the plain text of an Office file.
 *
 * Every paragraph match has its tab elements replaced by a tab character and
 * its remaining `w:` tags removed; each paragraph is followed by a blank line.
 *
 * @param path Path to the file to extract the text from.
 * @param xmlFilename XML component of the file to read the text from
 *   (default is: 'document').
 * @returns The concatenated paragraph text.
 */
export const extractText = async (
  path: string,
  xmlFilename: string = 'document'
) => {
  const documentXml = await getXMLData(path, { filename: xmlFilename })
  const chunks: string[] = []
  for (
    let hit = paragraphRegex.exec(documentXml);
    hit !== null;
    hit = paragraphRegex.exec(documentXml)
  ) {
    const withTabs = hit[0].replace(tabRegex, tab)
    const plain = withTabs.replace(tagRegex, empty)
    chunks.push(plain + cr)
  }
  return chunks.join('')
}
Oops, something went wrong.