/
text.py
169 lines (144 loc) · 5.4 KB
/
text.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
import re
from typing import IO, Callable, List, Optional, Tuple
from unstructured.cleaners.core import clean_bullets, group_broken_paragraphs
from unstructured.documents.coordinates import CoordinateSystem
from unstructured.documents.elements import (
Address,
Element,
ElementMetadata,
ListItem,
NarrativeText,
Text,
Title,
process_metadata,
)
from unstructured.file_utils.encoding import read_txt_file
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
from unstructured.nlp.patterns import PARAGRAPH_PATTERN
from unstructured.nlp.tokenize import sent_tokenize
from unstructured.partition.common import exactly_one
from unstructured.partition.text_type import (
is_bulleted_text,
is_possible_narrative_text,
is_possible_title,
is_us_city_state_zip,
)
def split_by_paragraph(content: str, max_partition: Optional[int] = 1500) -> List[str]:
    """Break *content* into paragraphs using PARAGRAPH_PATTERN.

    When *max_partition* is an int, any paragraph longer than that many
    characters is further divided so every returned piece fits the window;
    passing ``None`` disables the size limit entirely.
    """
    paragraphs = re.split(PARAGRAPH_PATTERN, content)
    if max_partition is None:
        return paragraphs
    fitted: List[str] = []
    for paragraph in paragraphs:
        fitted += _split_to_fit_max_content(paragraph, max_partition=max_partition)
    return fitted
def _split_content_size_n(content: str, n: int) -> List[str]:
"""Splits a string into chunks that are at most size n."""
segments = []
for i in range(0, len(content), n):
segment = content[i : i + n] # noqa: E203
segments.append(segment)
return segments
def _split_to_fit_max_content(content: str, max_partition: int = 1500) -> List[str]:
    """Split a section of content so that every returned chunk fits within
    the *max_partition* character window.

    Sentences are accumulated greedily into chunks; a single sentence longer
    than the window is hard-split into fixed-size pieces.

    Fixes over the previous implementation:
    - an oversized sentence was emitted twice (once hard-split, once again via
      the running chunk, because control fell through to the accumulation
      branch);
    - the running chunk started as "" so an empty string could be appended and
      every chunk carried a spurious leading space;
    - the in-progress chunk was not flushed before a hard-split, which could
      reorder content.
    """
    sentences = sent_tokenize(content)
    chunks: List[str] = []
    chunk = ""
    for sentence in sentences:
        if len(sentence) > max_partition:
            # Flush whatever has accumulated so far to preserve ordering,
            # then hard-split the oversized sentence; do NOT also feed it
            # into the running chunk (that duplicated content before).
            if chunk:
                chunks.append(chunk)
                chunk = ""
            chunks.extend(_split_content_size_n(sentence, n=max_partition))
            continue
        candidate = sentence if not chunk else chunk + " " + sentence
        if len(candidate) > max_partition:
            chunks.append(chunk)
            chunk = sentence
        else:
            chunk = candidate
    if chunk:
        chunks.append(chunk)
    return chunks
@process_metadata()
@add_metadata_with_filetype(FileType.TXT)
def partition_text(
    filename: Optional[str] = None,
    file: Optional[IO[bytes]] = None,
    text: Optional[str] = None,
    encoding: Optional[str] = None,
    paragraph_grouper: Optional[Callable[[str], str]] = None,
    metadata_filename: Optional[str] = None,
    include_metadata: bool = True,
    max_partition: Optional[int] = 1500,
    **kwargs,
) -> List[Element]:
    """Partitions a .txt document into its constituent elements.

    Exactly one of ``filename``, ``file``, or ``text`` must be provided.

    Parameters
    ----------
    filename
        A string defining the target filename path.
    file
        A file-like object (bytes) as produced by ``open(filename, "rb")``.
    text
        The string representation of the .txt document.
    encoding
        The encoding method used to decode the text input. If None, utf-8 will be used.
    paragraph_grouper
        A str -> str function for fixing paragraphs that are interrupted by line breaks
        for formatting purposes. When None, ``group_broken_paragraphs`` is applied.
    metadata_filename
        Filename recorded in element metadata; falls back to ``filename``.
    include_metadata
        Determines whether or not metadata is included in the output.
    max_partition
        The maximum number of characters to include in a partition. If None is passed,
        no maximum is applied.
    """
    # Short-circuit: a whitespace-only `text` input yields no elements.
    if text is not None and text.strip() == "" and not file and not filename:
        return []
    # Verify that only one of the arguments was provided
    exactly_one(filename=filename, file=file, text=text)
    if filename is not None:
        # read_txt_file returns the (possibly detected) encoding plus the decoded text.
        encoding, file_text = read_txt_file(filename=filename, encoding=encoding)
    elif file is not None:
        encoding, file_text = read_txt_file(file=file, encoding=encoding)
    elif text is not None:
        file_text = str(text)
    if paragraph_grouper is not None:
        file_text = paragraph_grouper(file_text)
    else:
        # Default: repair paragraphs broken by hard line wraps.
        file_text = group_broken_paragraphs(file_text)
    file_content = split_by_paragraph(file_text, max_partition=max_partition)
    elements: List[Element] = []
    # NOTE(review): a single ElementMetadata instance is shared by every
    # element produced here — mutating one element's metadata affects all.
    metadata = (
        ElementMetadata(filename=metadata_filename or filename)
        if include_metadata
        else ElementMetadata()
    )
    for ctext in file_content:
        ctext = ctext.strip()
        if ctext:  # skip chunks that were pure whitespace
            element = element_from_text(ctext)
            element.metadata = metadata
            elements.append(element)
    return elements
def element_from_text(
    text: str,
    coordinates: Optional[Tuple[Tuple[float, float], ...]] = None,
    coordinate_system: Optional[CoordinateSystem] = None,
) -> Element:
    """Classify *text* into the most specific Element subtype.

    Predicates are tried from most to least specific — bulleted list item,
    US city/state/zip address, narrative text, title — falling back to a
    generic ``Text`` element when none match.
    """
    common = {"coordinates": coordinates, "coordinate_system": coordinate_system}
    if is_bulleted_text(text):
        return ListItem(text=clean_bullets(text), **common)
    if is_us_city_state_zip(text):
        return Address(text=text, **common)
    if is_possible_narrative_text(text):
        return NarrativeText(text=text, **common)
    if is_possible_title(text):
        return Title(text=text, **common)
    return Text(text=text, **common)