Skip to content

Commit

Permalink
feat(reoff): read file more async
Browse files Browse the repository at this point in the history
  • Loading branch information
tefkah committed Feb 14, 2022
1 parent 2cb81c8 commit fed7aa9
Show file tree
Hide file tree
Showing 7 changed files with 61 additions and 18 deletions.
3 changes: 2 additions & 1 deletion libs/ooxast/ooxast/tsconfig.spec.json
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
"src/lib/purl.oclc.org/ooxml/officeDocument/math.ts",
"src/lib/purl.oclc.org/ooxml/drawingml/wordprocessingDrawing.ts",
"src/lib/purl.oclc.org/ooxml/drawingml/picture.ts",
"src/lib/purl.oclc.org/ooxml/drawingml/main.ts"
"src/lib/purl.oclc.org/ooxml/drawingml/main.ts",
"src/lib/ooxml/officeDocument/bibliography.ts"
]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
// Jest Snapshot v1, https://goo.gl/fbAQLP

exports[`reoffDocxToVfile should work 1`] = `
VFile {
"cwd": "/Users/thomas/Projects/jote-monorepo",
"data": Object {},
"history": Array [],
"messages": Array [],
"value": "
<w:document xmlns:wpc=\\"http://schemas.microsoft.com/office/word/2010/wordprocessingCanvas\\" xmlns:cx=\\"http://schemas.microsoft.com/office/drawing/2014/chartex\\" xmlns:cx1=\\"http://schemas.microsoft.com/office/drawing/2015/9/8/chartex\\" xmlns:cx2=\\"http://schemas.microsoft.com/office/drawing/2015/10/21/chartex\\" xmlns:cx3=\\"http://schemas.microsoft.com/office/drawing/2016/5/9/chartex\\" xmlns:cx4=\\"http://schemas.microsoft.com/office/drawing/2016/5/10/chartex\\" xmlns:cx5=\\"http://schemas.microsoft.com/office/drawing/2016/5/11/chartex\\" xmlns:cx6=\\"http://schemas.microsoft.com/office/drawing/2016/5/12/chartex\\" xmlns:cx7=\\"http://schemas.microsoft.com/office/drawing/2016/5/13/chartex\\" xmlns:cx8=\\"http://schemas.microsoft.com/office/drawing/2016/5/14/chartex\\" xmlns:mc=\\"http://schemas.openxmlformats.org/markup-compatibility/2006\\" xmlns:aink=\\"http://schemas.microsoft.com/office/drawing/2016/ink\\" xmlns:am3d=\\"http://schemas.microsoft.com/office/drawing/2017/model3d\\" xmlns:o=\\"urn:schemas-microsoft-com:office:office\\" xmlns:r=\\"http://schemas.openxmlformats.org/officeDocument/2006/relationships\\" xmlns:m=\\"http://schemas.openxmlformats.org/officeDocument/2006/math\\" xmlns:v=\\"urn:schemas-microsoft-com:vml\\" xmlns:wp14=\\"http://schemas.microsoft.com/office/word/2010/wordprocessingDrawing\\" xmlns:wp=\\"http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing\\" xmlns:w10=\\"urn:schemas-microsoft-com:office:word\\" xmlns:w=\\"http://schemas.openxmlformats.org/wordprocessingml/2006/main\\" xmlns:w14=\\"http://schemas.microsoft.com/office/word/2010/wordml\\" xmlns:w15=\\"http://schemas.microsoft.com/office/word/2012/wordml\\" xmlns:w16cex=\\"http://schemas.microsoft.com/office/word/2018/wordml/cex\\" xmlns:w16cid=\\"http://schemas.microsoft.com/office/word/2016/wordml/cid\\" xmlns:w16=\\"http://schemas.microsoft.com/office/word/2018/wordml\\" 
xmlns:w16sdtdh=\\"http://schemas.microsoft.com/office/word/2020/wordml/sdtdatahash\\" xmlns:w16se=\\"http://schemas.microsoft.com/office/word/2015/wordml/symex\\" xmlns:wpg=\\"http://schemas.microsoft.com/office/word/2010/wordprocessingGroup\\" xmlns:wpi=\\"http://schemas.microsoft.com/office/word/2010/wordprocessingInk\\" xmlns:wne=\\"http://schemas.microsoft.com/office/word/2006/wordml\\" xmlns:wps=\\"http://schemas.microsoft.com/office/word/2010/wordprocessingShape\\" mc:Ignorable=\\"w14 w15 w16se w16cid w16 w16cex w16sdtdh wp14\\"><w:body><w:p w14:paraId=\\"36CB8745\\" w14:textId=\\"13B42306\\" w:rsidR=\\"00E974A4\\" w:rsidRDefault=\\"006132C7\\"><w:sdt><w:sdtPr><w:id w:val=\\"692576755\\"/><w:citation/></w:sdtPr><w:sdtContent><w:r><w:fldChar w:fldCharType=\\"begin\\"/></w:r><w:r><w:rPr><w:lang w:val=\\"en-US\\"/></w:rPr><w:instrText xml:space=\\"preserve\\">CITATION Aut22 \\\\l 1033 </w:instrText></w:r><w:r><w:fldChar w:fldCharType=\\"separate\\"/></w:r><w:r w:rsidR=\\"00B565CB\\"><w:rPr><w:noProof/><w:lang w:val=\\"en-US\\"/></w:rPr><w:t>(Author, 2022)</w:t></w:r><w:r><w:fldChar w:fldCharType=\\"end\\"/></w:r></w:sdtContent></w:sdt></w:p><w:p w14:paraId=\\"00B04DAA\\" w14:textId=\\"6F2D9244\\" w:rsidR=\\"00DA74D6\\" w:rsidRDefault=\\"00DA74D6\\"/><w:p w14:paraId=\\"42512746\\" w14:textId=\\"7F23958B\\" w:rsidR=\\"00DA74D6\\" w:rsidRPr=\\"00DA74D6\\" w:rsidRDefault=\\"00DA74D6\\"><w:pPr><w:rPr><w:lang w:val=\\"en-US\\"/></w:rPr></w:pPr><w:r><w:rPr><w:lang w:val=\\"en-US\\"/></w:rPr><w:t xml:space=\\"preserve\\"> </w:t></w:r></w:p><w:p w14:paraId=\\"2EC280C4\\" w14:textId=\\"065D19B9\\" w:rsidR=\\"00B565CB\\" w:rsidRDefault=\\"00B565CB\\"/><w:p w14:paraId=\\"3CB0703A\\" w14:textId=\\"3D7D332A\\" w:rsidR=\\"00B565CB\\" w:rsidRDefault=\\"00B565CB\\"><w:sdt><w:sdtPr><w:id w:val=\\"57686830\\"/><w:citation/></w:sdtPr><w:sdtContent><w:r><w:fldChar w:fldCharType=\\"begin\\"/></w:r><w:r><w:rPr><w:lang w:val=\\"en-US\\"/></w:rPr><w:instrText xml:space=\\"preserve\\"> 
CITATION Ano00 \\\\l 1033 </w:instrText></w:r><w:r><w:fldChar w:fldCharType=\\"separate\\"/></w:r><w:r><w:rPr><w:noProof/><w:lang w:val=\\"en-US\\"/></w:rPr><w:t>(Another, 1000)</w:t></w:r><w:r><w:fldChar w:fldCharType=\\"end\\"/></w:r></w:sdtContent></w:sdt></w:p><w:p w14:paraId=\\"26288588\\" w14:textId=\\"7B977025\\" w:rsidR=\\"006132C7\\" w:rsidRDefault=\\"006132C7\\"/><w:sdt><w:sdtPr><w:id w:val=\\"1024518974\\"/><w:docPartObj><w:docPartGallery w:val=\\"Bibliographies\\"/><w:docPartUnique/></w:docPartObj></w:sdtPr><w:sdtEndPr><w:rPr><w:rFonts w:asciiTheme=\\"minorHAnsi\\" w:eastAsiaTheme=\\"minorHAnsi\\" w:hAnsiTheme=\\"minorHAnsi\\" w:cstheme=\\"minorBidi\\"/><w:b w:val=\\"0\\"/><w:bCs w:val=\\"0\\"/><w:color w:val=\\"auto\\"/><w:sz w:val=\\"24\\"/><w:szCs w:val=\\"24\\"/><w:lang w:val=\\"en-NL\\" w:bidi=\\"ar-SA\\"/></w:rPr></w:sdtEndPr><w:sdtContent><w:p w14:paraId=\\"2D88CF2C\\" w14:textId=\\"2380C814\\" w:rsidR=\\"006132C7\\" w:rsidRDefault=\\"006132C7\\"><w:pPr><w:pStyle w:val=\\"Heading1\\"/></w:pPr><w:r><w:t>Bibliography</w:t></w:r></w:p><w:sdt><w:sdtPr><w:id w:val=\\"111145805\\"/><w:bibliography/></w:sdtPr><w:sdtContent><w:p w14:paraId=\\"469D0581\\" w14:textId=\\"77777777\\" w:rsidR=\\"00B565CB\\" w:rsidRDefault=\\"006132C7\\" w:rsidP=\\"00B565CB\\"><w:pPr><w:pStyle w:val=\\"Bibliography\\"/><w:ind w:left=\\"720\\" w:hanging=\\"720\\"/><w:rPr><w:noProof/><w:lang w:val=\\"en-US\\"/></w:rPr></w:pPr><w:r><w:fldChar w:fldCharType=\\"begin\\"/></w:r><w:r><w:instrText xml:space=\\"preserve\\"> BIBLIOGRAPHY </w:instrText></w:r><w:r><w:fldChar w:fldCharType=\\"separate\\"/></w:r><w:r w:rsidR=\\"00B565CB\\"><w:rPr><w:noProof/><w:lang w:val=\\"en-US\\"/></w:rPr><w:t xml:space=\\"preserve\\">Author, A. (2022, 1 1). Title of the thing. (Editor, Ed.) 
</w:t></w:r><w:r w:rsidR=\\"00B565CB\\"><w:rPr><w:i/><w:iCs/><w:noProof/><w:lang w:val=\\"en-US\\"/></w:rPr><w:t>Journal, 1</w:t></w:r><w:r w:rsidR=\\"00B565CB\\"><w:rPr><w:noProof/><w:lang w:val=\\"en-US\\"/></w:rPr><w:t>(1), 11-12.</w:t></w:r></w:p><w:p w14:paraId=\\"10E906AD\\" w14:textId=\\"77777777\\" w:rsidR=\\"00B565CB\\" w:rsidRDefault=\\"00B565CB\\" w:rsidP=\\"00B565CB\\"><w:pPr><w:pStyle w:val=\\"Bibliography\\"/><w:ind w:left=\\"720\\" w:hanging=\\"720\\"/><w:rPr><w:noProof/><w:lang w:val=\\"en-US\\"/></w:rPr></w:pPr><w:r><w:rPr><w:noProof/><w:lang w:val=\\"en-US\\"/></w:rPr><w:t xml:space=\\"preserve\\">Another. (1000). Artikel. </w:t></w:r><w:r><w:rPr><w:i/><w:iCs/><w:noProof/><w:lang w:val=\\"en-US\\"/></w:rPr><w:t>Journal1, 2</w:t></w:r><w:r><w:rPr><w:noProof/><w:lang w:val=\\"en-US\\"/></w:rPr><w:t>(2), 99-104.</w:t></w:r></w:p><w:p w14:paraId=\\"748089B2\\" w14:textId=\\"794D3F42\\" w:rsidR=\\"006132C7\\" w:rsidRDefault=\\"006132C7\\" w:rsidP=\\"00B565CB\\"><w:r><w:rPr><w:b/><w:bCs/><w:noProof/></w:rPr><w:fldChar w:fldCharType=\\"end\\"/></w:r></w:p></w:sdtContent></w:sdt></w:sdtContent></w:sdt><w:p w14:paraId=\\"7BB62F0D\\" w14:textId=\\"77777777\\" w:rsidR=\\"006132C7\\" w:rsidRDefault=\\"006132C7\\"/><w:sectPr w:rsidR=\\"006132C7\\"><w:pgSz w:w=\\"11906\\" w:h=\\"16838\\"/><w:pgMar w:top=\\"1440\\" w:right=\\"1440\\" w:bottom=\\"1440\\" w:left=\\"1440\\" w:header=\\"708\\" w:footer=\\"708\\" w:gutter=\\"0\\"/><w:cols w:space=\\"708\\"/><w:docGrid w:linePitch=\\"360\\"/></w:sectPr></w:body>
<b:Sources xmlns:b=\\"http://schemas.openxmlformats.org/officeDocument/2006/bibliography\\" xmlns=\\"http://schemas.openxmlformats.org/officeDocument/2006/bibliography\\" SelectedStyle=\\"/APASixthEditionOfficeOnline.xsl\\" StyleName=\\"APA\\" Version=\\"6\\"><b:Source><b:Tag>Aut22</b:Tag><b:SourceType>JournalArticle</b:SourceType><b:Guid>{9EB08ABC-6B94-7C45-96C8-3C8098B8767B}</b:Guid><b:Author><b:Author><b:NameList><b:Person><b:Last>Author</b:Last><b:First>An</b:First></b:Person></b:NameList></b:Author><b:Editor><b:NameList><b:Person><b:Last>Editor</b:Last></b:Person></b:NameList></b:Editor></b:Author><b:Title>Title of the thing</b:Title><b:City>City</b:City><b:Publisher>Publisher</b:Publisher><b:Year>2022</b:Year><b:JournalName>Journal</b:JournalName><b:Pages>11-12</b:Pages><b:Month>1</b:Month><b:Day>1</b:Day><b:Volume>1</b:Volume><b:Issue>1</b:Issue><b:ShortTitle>Short Title</b:ShortTitle><b:StandardNumber>https://doi.org/10.368850/e1</b:StandardNumber><b:Comments>rest</b:Comments><b:RefOrder>1</b:RefOrder></b:Source><b:Source><b:Tag>Ano00</b:Tag><b:SourceType>JournalArticle</b:SourceType><b:Guid>{2ABA2811-735F-3E43-9ECA-D91DBB4082D4}</b:Guid><b:Author><b:Author><b:NameList><b:Person><b:Last>Another</b:Last></b:Person></b:NameList></b:Author></b:Author><b:Title>Artikel</b:Title><b:JournalName>Journal1</b:JournalName><b:Year>1000</b:Year><b:Pages>99-104</b:Pages><b:Publisher>JOTE Publishers</b:Publisher><b:City>Heerenveen</b:City><b:Day>11</b:Day><b:Volume>2</b:Volume><b:Issue>2</b:Issue><b:ShortTitle>Artikel</b:ShortTitle><b:RefOrder>2</b:RefOrder></b:Source></b:Sources>
</w:document>",
}
`;
15 changes: 15 additions & 0 deletions libs/reoff/docx-to-vfile/src/lib/docx-to-vfile-yauzl.spec.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
import { docxToVFile } from './docx-to-vfile-yauzl'
import fs from 'fs'
import path from 'path'

/**
 * Snapshot test for the yauzl-based `docxToVFile`.
 *
 * Loads the `word-citation.docx` fixture from `reoff-parse/src/test`, converts
 * it, and compares the resulting VFile against the stored Jest snapshot.
 */
describe('reoffDocxToVfile', () => {
  // Read synchronously while the suite is being collected, so the test body
  // only measures the conversion itself.
  const doc = fs.readFileSync(
    path.join(__dirname, '../../../reoff-parse/src/test/word-citation.docx')
  )

  // NOTE(review): the whole VFile is snapshotted, and the stored snapshot
  // contains an absolute `cwd` path. That looks machine-specific — confirm
  // whether the snapshot should be limited to `vfile.value` (this requires
  // regenerating the snapshot, so it is intentionally not changed here).
  it(
    'should work',
    async () => {
      const vfile = await docxToVFile(doc)
      expect(vfile).toMatchSnapshot()
    },
    // Unzipping the fixture can be slow; give this test 10 s instead of
    // changing the timeout for the whole file via `jest.setTimeout`.
    10000
  )
})
19 changes: 15 additions & 4 deletions libs/reoff/docx-to-vfile/src/lib/docx-to-vfile-yauzl.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,23 @@ const removeHeader = (text: string | undefined) =>

export async function docxToVFile(file: Buffer | string) {
const data = await getXMLDatas(file, {
filenames: [/customXml(\/|\\)/, 'word/document.xml', 'word/footnotes.xml'],
filenames: [
/customXml\/item\d+\.xml/,
'word/document.xml',
'word/footnotes.xml',
],
})

const total = `${removeHeader(data['word/document.xml'])}
${removeHeader(data['word/footnotes.xml'])}
const {
'word/document.xml': document,
'word/footnotes.xml': footnotes,
...bibliography
} = data
const total = `${removeHeader(document).slice(0, -'</w:document>'.length)}
${removeHeader(footnotes)}
${Object.values(bibliography)
.map((bib) => removeHeader(bib))
.join('\n')}
</w:document>`
const vfile = new VFile(total)
// if (footnotes) {
Expand Down
3 changes: 2 additions & 1 deletion libs/reoff/docx-to-vfile/src/lib/docx-to-vfile.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ describe('reoffDocxToVfile', () => {
path.join(__dirname, '../fixtures/Manuscript-2.docx')
)
it('should work', async () => {
expect(await docxToVFile(doc)).toMatchSnapshot()
const vfile = await docxToVFile(doc)
expect(vfile).toMatchSnapshot()
})
})
24 changes: 12 additions & 12 deletions libs/reoff/docx-to-vfile/src/lib/get-xml-data.ts
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ function unzipCallback(
reject(new Error('Empty zip file'))
return
}
let result: { [key: string]: string }
let result: { [key: string]: string } = {}

const openReadStream = promisify(zip.openReadStream.bind(zip))
zip.readEntry()
Expand All @@ -76,18 +76,18 @@ function unzipCallback(
!filenames.some((filename) => entry.fileName.match(filename))
) {
zip.readEntry()
return
}
let stream = await openReadStream(entry)
let entryChunks: any[] = []
if (!stream) {
zip.readEntry()
return
} else {
let stream = await openReadStream(entry)
let entryChunks: any[] = []
if (stream) {
stream!.on('data', (chunk) => entryChunks.push(chunk))
stream!.on('end', () => {
const string = Buffer.concat(entryChunks).toString()
result[entry.fileName] = string
zip.readEntry()
})
}
}
stream.on('data', (chunk) => entryChunks.push(chunk))
stream.on('end', () => {
result[entry.fileName] = Buffer.from(entryChunks).toString()
})
})
zip.on('end', () => {
resolve(result)
Expand Down
Binary file modified libs/reoff/reoff-parse/src/test/word-citation.docx
Binary file not shown.

0 comments on commit fed7aa9

Please sign in to comment.