-
Notifications
You must be signed in to change notification settings - Fork 1.5k
/
spacy_tasks.py
220 lines (171 loc) · 6.61 KB
/
spacy_tasks.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
import spacy
from prefect import Task
from prefect.utilities.tasks import defaults_from_attrs
class SpacyNLP(Task):
    """
    Task for processing text with a spaCy pipeline.

    Args:
        - text (unicode, optional): string to be processed, can be provided during
            construction or when task is run
        - nlp (spaCy text processing pipeline, optional): a custom spaCy text
            processing pipeline, if provided, this pipeline will be used instead
            of being created from spacy_model_name
        - spacy_model_name (str, optional): name of the spaCy language model, default
            model is 'en_core_web_sm', will be ignored if nlp is provided
        - disable (List[str], optional): list of pipeline components
            to disable, only applicable to pipelines loaded from spacy_model_name
        - component_cfg (dict, optional): a dictionary with extra keyword
            arguments for specific components, only applicable to pipelines loaded
            from spacy_model_name
        - **kwargs (dict, optional): additional keyword arguments to pass to the
            Task constructor

    Raises:
        - ValueError: if no pipeline is provided and loading spacy_model_name
            fails with an IOError
    """

    def __init__(
        self,
        text: str = "",
        nlp=None,
        spacy_model_name: str = "en_core_web_sm",
        disable: list = None,
        component_cfg: dict = None,
        **kwargs
    ):
        self.text = text
        # Normalise the optional arguments so the attributes are never None.
        self.disable = disable or []
        self.component_cfg = component_cfg or {}

        # Compare against None (as the other tasks in this module do) so that a
        # caller-supplied pipeline is never discarded because it happens to be falsy.
        if nlp is not None:
            self.nlp = nlp
        else:
            # No pipeline supplied: load one from the named model.
            try:
                self.nlp = spacy.load(
                    spacy_model_name,
                    disable=self.disable,
                    component_cfg=self.component_cfg,
                )
            except IOError as exc:
                # Chain the original error so the underlying cause stays visible
                # in the traceback.
                raise ValueError(
                    "spaCy model %s not found." % spacy_model_name
                ) from exc

        super().__init__(**kwargs)

    @defaults_from_attrs("text")
    def run(self, text: str = ""):
        """
        Task run method. Creates a spaCy document.

        Args:
            - text (unicode, optional): text to be processed

        Returns:
            - Doc: spaCy document
        """
        return self.nlp(text)
class SpacyTagger(Task):
    """
    Task for returning tagger from a spaCy pipeline.

    Args:
        - nlp (spaCy text processing pipeline, optional): a custom spaCy text
            processing pipeline
        - **kwargs (dict, optional): additional keyword arguments to pass to the
            Task constructor
    """

    def __init__(self, nlp=None, **kwargs):
        self.nlp = nlp
        super().__init__(**kwargs)

    @defaults_from_attrs("nlp")
    def run(self, nlp=None):
        """
        Task run method. Returns tagger component of spaCy pipeline.

        Args:
            - nlp (spaCy text processing pipeline, optional): a custom spaCy text
                processing pipeline, must be provided if not
                specified in construction

        Returns:
            - Tagger: spaCy Tagger object

        Raises:
            - ValueError: if no pipeline is available at construction or run time
        """
        if nlp is None:
            raise ValueError("A spaCy pipeline must be provided")

        try:
            return nlp.tagger
        except AttributeError:
            # Pipelines without the ``tagger`` convenience attribute (it is not
            # available on newer spaCy releases) are queried by component name.
            return nlp.get_pipe("tagger")
class SpacyParser(Task):
    """
    Task for returning parser from a spaCy pipeline.

    Args:
        - nlp (spaCy text processing pipeline, optional): a custom spaCy text
            processing pipeline
        - **kwargs (dict, optional): additional keyword arguments to pass to the
            Task constructor
    """

    def __init__(self, nlp=None, **kwargs):
        self.nlp = nlp
        super().__init__(**kwargs)

    @defaults_from_attrs("nlp")
    def run(self, nlp=None):
        """
        Task run method. Returns parser component of spaCy pipeline.

        Args:
            - nlp (spaCy text processing pipeline, optional): a custom spaCy text
                processing pipeline, must be provided if not
                specified in construction

        Returns:
            - Parser: spaCy Parser object

        Raises:
            - ValueError: if no pipeline is available at construction or run time
        """
        if nlp is None:
            raise ValueError("A spaCy pipeline must be provided")

        try:
            return nlp.parser
        except AttributeError:
            # Pipelines without the ``parser`` convenience attribute (it is not
            # available on newer spaCy releases) are queried by component name.
            return nlp.get_pipe("parser")
class SpacyNER(Task):
    """
    Task for returning named entity recognizer from a spaCy pipeline.

    Args:
        - nlp (spaCy text processing pipeline, optional): a custom spaCy text
            processing pipeline
        - **kwargs (dict, optional): additional keyword arguments to pass to the
            Task constructor
    """

    def __init__(self, nlp=None, **kwargs):
        self.nlp = nlp
        super().__init__(**kwargs)

    @defaults_from_attrs("nlp")
    def run(self, nlp=None):
        """
        Task run method. Returns named entity recognition component of spaCy pipeline.

        Args:
            - nlp (spaCy text processing pipeline, optional): a custom spaCy text
                processing pipeline, must be provided if not
                specified in construction

        Returns:
            - NER: spaCy NER object

        Raises:
            - ValueError: if no pipeline is available at construction or run time
        """
        if nlp is None:
            raise ValueError("A spaCy pipeline must be provided")

        try:
            return nlp.entity
        except AttributeError:
            # Pipelines without the ``entity`` convenience attribute (it is not
            # available on newer spaCy releases) are queried by component name;
            # the named entity recognizer is registered as "ner".
            return nlp.get_pipe("ner")
class SpacyComponent(Task):
    """
    Task that looks up a component of a spaCy pipeline by its name.

    Args:
        - component_name (str, optional): name of the spaCy pipeline component to
            return, must be provided during construction or run time
        - nlp (spaCy text processing pipeline, optional): a custom spaCy text
            processing pipeline
        - **kwargs (dict, optional): additional keyword arguments to pass to the
            Task constructor
    """

    def __init__(self, component_name: str = "", nlp=None, **kwargs):
        self.nlp = nlp
        self.component_name = component_name
        super().__init__(**kwargs)

    @defaults_from_attrs("component_name", "nlp")
    def run(self, component_name: str, nlp=None):
        """
        Task run method. Returns the pipeline component registered under the given name.

        Args:
            - component_name (str, optional): name of the spaCy pipeline component to
                return, must be provided during construction or run time
            - nlp (spaCy text processing pipeline, optional): a custom spaCy text
                processing pipeline, must be provided if not
                specified in construction

        Returns:
            - Component: spaCy pipeline component object

        Raises:
            - ValueError: if no pipeline is available, or if the pipeline has no
                component with the requested name
        """
        if nlp is None:
            raise ValueError("A spaCy pipeline must be provided")

        # A private sentinel distinguishes "no match" from any component value.
        not_found = object()
        match = next(
            (pipe for label, pipe in nlp.pipeline if label == component_name),
            not_found,
        )
        if match is not_found:
            raise ValueError("Pipeline component %s not found" % component_name)
        return match