from typing import Dict, List, Optional, cast
import srsly
from pymusas.lexicon_collection import LexiconCollection, LexiconType
from pymusas.rankers.lexical_match import LexicalMatch
from pymusas.rankers.ranking_meta_data import RankingMetaData
from pymusas.taggers.rules.rule import Rule


class SingleWordRule(Rule):
    '''
    A single word rule is a rule that matches on single word lexicon
    entries. Entries can be matched on:
1. Token and the token's Part Of Speech (POS) tag, e.g. `driving|adj`
2. Lemma and the lemma's POS tag, e.g. `drive|adj`
3. Token, e.g. `driving`
4. Lemma, e.g. `drive`
    In all cases matches are found based on the original token/lemma and the
    lowercased version of the token/lemma. These matches are found by searching
    the `lexicon_collection` and `lemma_lexicon_collection` attributes.
# Parameters
    lexicon_collection : `Dict[str, List[str]]`
        The data to create the `lexicon_collection` instance attribute. A
        dictionary whose keys combine a lemma/token and POS in the format
        `{lemma}|{POS}`, and whose values are lists of associated semantic
        tags.
    lemma_lexicon_collection : `Dict[str, List[str]]`
        The data to create the `lemma_lexicon_collection` instance attribute.
        A dictionary whose keys are just a lemma/token, in the format
        `{lemma}`, and whose values are lists of associated semantic tags.
    pos_mapper : `Dict[str, List[str]]`, optional (default = `None`)
        If not `None`, maps from the given token's POS tagset to the desired
        POS tagset, whereby the mapping is a `List` of tags; at the moment
        there is no preference order in this list of POS tags. The POS
        mapping is useful in situations where the token's POS tagset differs
        from the one used in the lexicons. **Note** the longer the
        `List[str]` for each POS mapping, the slower the tagger; a one-to-one
        mapping will have no speed impact on the tagger. A selection of POS
        mappers can be found in :mod:`pymusas.pos_mapper`.
# Instance Attributes
lexicon_collection : `pymusas.lexicon_collection.LexiconCollection`
A :class:`pymusas.lexicon_collection.LexiconCollection` instance that
has been initialised using the `lexicon_collection` parameter.
lemma_lexicon_collection : `pymusas.lexicon_collection.LexiconCollection`
A :class:`pymusas.lexicon_collection.LexiconCollection` instance that
has been initialised using the `lemma_lexicon_collection` parameter.
pos_mapper : `Dict[str, List[str]]`, optional (default = `None`)
The given `pos_mapper`.
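
    # Examples
    A minimal sketch of constructing and calling the rule; the lexicon
    entries, semantic tags, and POS tags below are illustrative toy values,
    not from a real PyMUSAS lexicon.

    ``` python
    from pymusas.taggers.rules.single_word import SingleWordRule

    lexicon = {'drive|verb': ['M3']}
    lemma_lexicon = {'drive': ['M3']}
    # Hypothetical mapper from an upper cased tagset to the lexicon's tagset.
    pos_mapper = {'VERB': ['verb']}
    rule = SingleWordRule(lexicon, lemma_lexicon, pos_mapper)

    matches = rule(['driving'], ['drive'], ['VERB'])
    # One inner `List` per token. Here the lemma matched both with and
    # without POS information, in original and lowercased form.
    assert len(matches) == 1
    assert len(matches[0]) == 4
    ```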
'''

    def __init__(self, lexicon_collection: Dict[str, List[str]],
lemma_lexicon_collection: Dict[str, List[str]],
pos_mapper: Optional[Dict[str, List[str]]] = None):
self.lexicon_collection = LexiconCollection(lexicon_collection)
self.lemma_lexicon_collection = LexiconCollection(lemma_lexicon_collection)
self.pos_mapper = pos_mapper

    def __call__(self, tokens: List[str], lemmas: List[str], pos_tags: List[str]
) -> List[List[RankingMetaData]]:
        '''
        Given the tokens, lemmas, and POS tags for each word in a text, it
        returns, for each token, a `List` of rule matches, each defined by a
        :class:`pymusas.rankers.ranking_meta_data.RankingMetaData` object,
        based on the matching cases stated in the class docstring above.
# Parameters
tokens : `List[str]`
The tokens that are within the text.
lemmas : `List[str]`
The lemmas of the tokens.
pos_tags : `List[str]`
The Part Of Speech tags of the tokens.
# Returns
`List[List[RankingMetaData]]`
'''

        def find_match_and_add_to_ranking_data(lexicon_entry: str,
exclude_pos_information: bool,
lexical_match: LexicalMatch,
start_index: int, end_index: int,
token_ranking_meta_data: List[List[RankingMetaData]]
) -> None:
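            # Search `lexicon_collection`, or `lemma_lexicon_collection` when
            # POS information is excluded. On a match, record a
            # `RankingMetaData` for the token at `start_index`.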
collection = self.lexicon_collection
if exclude_pos_information:
collection = self.lemma_lexicon_collection
if lexicon_entry in collection:
semantic_tags = tuple(collection[lexicon_entry])
ranking_data = RankingMetaData(LexiconType.SINGLE_NON_SPECIAL,
1, 0, exclude_pos_information,
lexical_match,
start_index, end_index,
lexicon_entry, semantic_tags)
token_ranking_meta_data[start_index].append(ranking_data)
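
        # One (initially empty) `List` of matches per token in the text.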
token_ranking_meta_data: List[List[RankingMetaData]] \
= [[] for _ in range(len(tokens))]
        for index, (token, lemma, initial_pos) \
                in enumerate(zip(tokens, lemmas, pos_tags)):
            token_lower = token.lower()
            lemma_lower = lemma.lower()
            start_index = index
            end_index = start_index + 1
            # A new name is used here so that the `pos_tags` parameter, which
            # the enclosing `zip` is iterating over, is not shadowed.
            mapped_pos_tags = [initial_pos]
            if self.pos_mapper is not None:
                mapped_pos_tags = self.pos_mapper.get(initial_pos, [])
            # All of these matches use POS information
            for pos in mapped_pos_tags:
token_pos = f'{token}|{pos}'
find_match_and_add_to_ranking_data(token_pos, False,
LexicalMatch.TOKEN,
start_index, end_index,
token_ranking_meta_data)
lemma_pos = f'{lemma}|{pos}'
find_match_and_add_to_ranking_data(lemma_pos, False,
LexicalMatch.LEMMA,
start_index, end_index,
token_ranking_meta_data)
token_lower_pos = f'{token_lower}|{pos}'
find_match_and_add_to_ranking_data(token_lower_pos, False,
LexicalMatch.TOKEN_LOWER,
start_index, end_index,
token_ranking_meta_data)
lemma_lower_pos = f'{lemma_lower}|{pos}'
find_match_and_add_to_ranking_data(lemma_lower_pos, False,
LexicalMatch.LEMMA_LOWER,
start_index, end_index,
token_ranking_meta_data)
# All of these do not use POS information
lexical_value_type = [(token, LexicalMatch.TOKEN),
(lemma, LexicalMatch.LEMMA),
(token_lower, LexicalMatch.TOKEN_LOWER),
(lemma_lower, LexicalMatch.LEMMA_LOWER)]
for lexical_value, lexical_type in lexical_value_type:
find_match_and_add_to_ranking_data(lexical_value, True, lexical_type,
start_index, end_index,
token_ranking_meta_data)
return token_ranking_meta_data

    def to_bytes(self) -> bytes:
'''
Serialises the :class:`SingleWordRule` to a bytestring.
# Returns
`bytes`
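
        # Examples
        A minimal round-trip sketch with a hypothetical toy lexicon.

        ``` python
        from pymusas.taggers.rules.single_word import SingleWordRule

        rule = SingleWordRule({'drive|verb': ['M3']}, {'drive': ['M3']})
        loaded_rule = SingleWordRule.from_bytes(rule.to_bytes())
        assert rule == loaded_rule
        ```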
'''
serialise = {}
serialise['lexicon_collection'] = self.lexicon_collection.to_bytes()
serialise['lemma_lexicon_collection'] \
= self.lemma_lexicon_collection.to_bytes()
serialise['pos_mapper'] = srsly.msgpack_dumps(self.pos_mapper)
return cast(bytes, srsly.msgpack_dumps(serialise))

    @staticmethod
def from_bytes(bytes_data: bytes) -> "SingleWordRule":
'''
Loads :class:`SingleWordRule` from the given bytestring and returns it.
# Parameters
bytes_data : `bytes`
The bytestring to load.
# Returns
:class:`SingleWordRule`
'''
serialise_data = srsly.msgpack_loads(bytes_data)
lexicon_collection \
= LexiconCollection.from_bytes(serialise_data['lexicon_collection'])
lemma_lexicon_collection \
= LexiconCollection.from_bytes(serialise_data['lemma_lexicon_collection'])
pos_mapper = srsly.msgpack_loads(serialise_data['pos_mapper'])
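        # Create an empty rule, then assign the deserialised attributes
        # directly, rather than re-building the `LexiconCollection`s from
        # their raw dictionary data.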
single_word_rule = SingleWordRule({}, {}, None)
single_word_rule.lexicon_collection = lexicon_collection
single_word_rule.lemma_lexicon_collection = lemma_lexicon_collection
single_word_rule.pos_mapper = pos_mapper
return single_word_rule

    def __eq__(self, other: object) -> bool:
        '''
        Given another object to compare to, it will return `True` if the
        other object is of the same class and was initialised with the same
        argument values.
        # Parameters
        other : `object`
            The object to compare to.
        # Returns
        `bool`
'''
if not isinstance(other, SingleWordRule):
return False
if self.pos_mapper != other.pos_mapper:
return False
if self.lexicon_collection != other.lexicon_collection:
return False
if self.lemma_lexicon_collection != other.lemma_lexicon_collection:
return False
return True