/
doc.py
114 lines (103 loc) · 4.35 KB
/
doc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
from __future__ import annotations
import os
import tempfile
from typing import IO, Any, Optional
from unstructured.chunking import add_chunking_strategy
from unstructured.documents.elements import Element, process_metadata
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
from unstructured.partition.common import (
convert_office_doc,
exactly_one,
get_last_modified_date,
get_last_modified_date_from_file,
)
from unstructured.partition.docx import partition_docx
@process_metadata()
@add_metadata_with_filetype(FileType.DOC)
@add_chunking_strategy
def partition_doc(
    filename: Optional[str] = None,
    file: Optional[IO[bytes]] = None,
    include_page_breaks: bool = True,
    include_metadata: bool = True,
    metadata_filename: Optional[str] = None,
    metadata_last_modified: Optional[str] = None,
    libre_office_filter: Optional[str] = "MS Word 2007 XML",
    chunking_strategy: Optional[str] = None,
    languages: Optional[list[str]] = ["auto"],
    detect_language_per_element: bool = False,
    date_from_file_object: bool = False,
    starting_page_number: int = 1,
    **kwargs: Any,
) -> list[Element]:
    """Partitions Microsoft Word Documents in .doc format into its document elements.

    The document is first converted to .docx with `convert_office_doc` inside a temporary
    directory and the converted file is then handed to `partition_docx`.

    Parameters
    ----------
    filename
        A string defining the target filename path.
    file
        A file-like object using "rb" mode --> open(filename, "rb").
    include_page_breaks
        Forwarded unchanged to `partition_docx`.
    include_metadata
        Forwarded unchanged to `partition_docx`.
    metadata_filename
        Forwarded to `partition_docx`. When the input is given via `file`, it also replaces
        the filename recorded on every returned element so that the name of the internal
        temporary file is not exposed.
    metadata_last_modified
        The last modified date for the document. Takes precedence over the date derived
        from `filename` or `file`.
    libre_office_filter
        The filter to use when converting to .docx. The default is the
        filter that is required when using LibreOffice7. Pass in None
        if you do not want to apply any filter.
    chunking_strategy
        Not used in the function body; presumably consumed by the `add_chunking_strategy`
        decorator -- confirm against that decorator.
    languages
        User defined value for `metadata.languages` if provided. Otherwise language is detected
        using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
        in either language.
        Additional Parameters:
            detect_language_per_element
                Detect language per element instead of at the document level.
    date_from_file_object
        Applies only when providing file via `file` parameter. If this option is True, attempt
        to infer last_modified metadata from bytes, otherwise set it to None.
    starting_page_number
        Indicates what page number should be assigned to the first page in the document.
        This information will be reflected in elements' metadata and can be especially
        useful when partitioning a document that is part of a larger document.

    Returns
    -------
    The list of elements produced by `partition_docx` for the converted document.

    Raises
    ------
    ValueError
        If `filename` is given but does not exist on disk. `exactly_one` is also expected to
        reject calls that do not supply exactly one of `filename` and `file`.
    """
    # Verify that only one of the arguments was provided
    if filename is None:
        filename = ""
    exactly_one(filename=filename, file=file)

    # Path of the temporary copy of `file`, if one gets created. It has to be deleted
    # explicitly because it is opened with delete=False so the converter can read it by name.
    tmp_path: Optional[str] = None
    try:
        if len(filename) > 0:
            if not os.path.exists(filename):
                raise ValueError(f"The file {filename} does not exist.")
            last_modification_date = get_last_modified_date(filename)
        elif file is not None:
            # The converter works on paths, so spill the stream to disk first.
            with tempfile.NamedTemporaryFile(delete=False) as tmp:
                tmp_path = tmp.name
                tmp.write(file.read())
            filename = tmp_path
            last_modification_date = (
                get_last_modified_date_from_file(file) if date_from_file_object else None
            )

        # The converted file keeps the source's base name with a .docx extension.
        _, filename_no_path = os.path.split(os.path.abspath(filename))
        base_filename, _ = os.path.splitext(filename_no_path)

        with tempfile.TemporaryDirectory() as tmpdir:
            convert_office_doc(
                filename,
                tmpdir,
                target_format="docx",
                target_filter=libre_office_filter,
            )
            docx_filename = os.path.join(tmpdir, f"{base_filename}.docx")
            # Partition while the temporary directory (and the .docx in it) still exists.
            elements = partition_docx(
                filename=docx_filename,
                metadata_filename=metadata_filename,
                include_page_breaks=include_page_breaks,
                include_metadata=include_metadata,
                metadata_last_modified=metadata_last_modified or last_modification_date,
                languages=languages,
                detect_language_per_element=detect_language_per_element,
                starting_page_number=starting_page_number,
            )
    finally:
        if tmp_path is not None:
            try:
                os.unlink(tmp_path)
            except OSError:
                # Best-effort cleanup: a failure to delete the scratch copy must not mask
                # the partitioning result or the exception already in flight.
                pass

    # remove tmp.name from filename if parsing file
    if tmp_path is not None:
        for element in elements:
            element.metadata.filename = metadata_filename

    return elements