lexicon_entry.py
from abc import abstractmethod
import collections
from typing import DefaultDict, Dict, List, Optional, Set, Tuple, cast
import srsly
from pymusas.base import Serialise
from pymusas.lexicon_collection import LexiconType
from pymusas.rankers.ranking_meta_data import RankingMetaData
from pymusas.taggers.rules.mwe import MWERule
from pymusas.taggers.rules.rule import Rule
from pymusas.taggers.rules.single_word import SingleWordRule
class LexiconEntryRanker(Serialise):
'''
An **abstract class** that defines the basic methods, `__call__`,
`to_bytes`, and `from_bytes`, that are required for all
:class:`LexiconEntryRanker`s.
Each lexicon entry match is represented by a
:class:`pymusas.rankers.ranking_meta_data.RankingMetaData` object.
**Lower ranked lexicon entry matches should be given priority when making
tagging decisions. A rank of 0 is better than a rank of 1.**
A :class:`LexiconEntryRanker`, when called via `__call__`, returns a tuple of two `List`s
whereby each entry in the list corresponds to a token:
1. Contains the ranks of the lexicon entry matches as a `List[int]`.
**Note** that the `List` can be empty if a token has no lexicon entry matches.
2. An `Optional[RankingMetaData]` that is the global lowest ranked entry
match for that token. If the value is `None` then no global lowest ranked
entry can be found for that token. If the `RankingMetaData` represents more
than one token, like a Multi Word Expression (MWE) match, then those associated tokens
will have the same `RankingMetaData` object as the global lowest ranked entry match.
**The tagger will have to decide how to handle global lowest ranked
matches with a value of `None`; a suggested approach is to assign an
unmatched/unknown semantic tag to those tokens.**
The reason for adding the second list is that the **global** lowest
ranked match is not always the same as the local/token lowest ranked match,
due to the potential for overlapping matches. For example, `North East London brewery`
can have matches of `North East`, `North`, and `East London brewery`. In this
case the local lowest rank for `North` would be `North East`, but as there is a
lower ranked match, `East London brewery`, that also uses `East`, the
**global** lowest ranked match for `North` would be `North`.
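# Examples
A minimal sketch using the concrete :class:`ContextualRuleBasedRanker` defined
below; the exact rank integers depend on the ranker used, so only the structure
of the returned tuple is asserted here:
``` python
>>> from pymusas.rankers.lexicon_entry import ContextualRuleBasedRanker
>>> from pymusas.rankers.ranking_meta_data import RankingMetaData
>>> from pymusas.lexicon_collection import LexiconType
>>> from pymusas.rankers.lexical_match import LexicalMatch
>>> ski_boot = RankingMetaData(LexiconType.MWE_NON_SPECIAL, 2, 0,
...                            False, LexicalMatch.TOKEN, 0, 2,
...                            'ski_noun boot_noun', ('Z1',))
>>> ranker = ContextualRuleBasedRanker(2, 0)
>>> ranks, global_lowest = ranker([[ski_boot], [ski_boot], []])
>>> assert len(ranks) == 3 and ranks[2] == []
>>> assert global_lowest == [ski_boot, ski_boot, None]
```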
'''
@abstractmethod
def __call__(self, token_ranking_data: List[List[RankingMetaData]]
) -> Tuple[List[List[int]], List[Optional[RankingMetaData]]]:
'''
For each token it returns a `List` of rankings for each lexicon entry
match and the optional :class:`pymusas.rankers.ranking_meta_data.RankingMetaData`
object of the **global** lowest ranked match for each token.
# Parameters
token_ranking_data : `List[List[RankingMetaData]]`
For each token a `List` of :class:`pymusas.rankers.ranking_meta_data.RankingMetaData`
representing the lexicon entry match.
# Returns
`Tuple[List[List[int]], List[Optional[RankingMetaData]]]`
'''
... # pragma: no cover
@abstractmethod
def __eq__(self, other: object) -> bool:
... # pragma: no cover
class ContextualRuleBasedRanker(LexiconEntryRanker):
'''
The contextual rule based ranker creates ranks based on the rules stated below.
Each lexicon entry match is represented by a
:class:`pymusas.rankers.ranking_meta_data.RankingMetaData` object.
**Lower ranked lexicon entry matches should be given priority when making
tagging decisions. See the :class:`LexiconEntryRanker` class docstring for
more details on the returned value of the `__call__` method.**
**Ranking Rules:**
The ranking of lexicon entries is based on the following rules, which follow
the 6 heuristics stated at the top of column 2 on page 4 of
[Piao et al. 2003](https://aclanthology.org/W03-1807.pdf):
First we create an initial ranking based on lexicon entry type:
1. Multi Word Expression (MWE) entries are ranked lower than single word
entries, and Non-Special entries are ranked lower than wildcard entries.
Then within these rankings we further rank based on:
2. Longer entries, based on n-gram length, are ranked lower.
3. Entries with fewer wildcards are ranked lower.
Then we apply the following contextual ranking rules:
4. Whether the POS information was excluded in the match; if so, these are ranked
higher. This can only be `True` when the match ignores the POS information for
single word lexicon entries; it is always `False` for
MWE lexicon entry matches.
5. Whether the lexicon entry was matched on Token < Lemma <
Lower cased token < Lower cased lemma. Token is the lowest ranked and lower
cased lemma is highest.
6. The lexicon entry that first appears in the text is ranked lowest;
this is required for matches that do not apply to the same sequence
of tokens.
In the case where the global lowest ranked lexicon entry match is jointly
ranked with another entry, it is random which lexicon entry match is chosen.
# Parameters
maximum_n_gram_length : `int`
The largest n-gram rule match that will be encountered, e.g. a match
of `ski_noun boot_noun` has an n-gram length of 2.
maximum_number_wildcards : `int`
The number of wildcards in the rule that contains the most wildcards, e.g.
the rule `ski_* *_noun` would contain 2 wildcards. This can be 0 if you
have no wildcard rules.
# Instance Attributes
n_gram_number_indexes : `int`
The number of indexes that each n-gram length value should have when
converting the n-gram length to a string using
`pymusas.rankers.lexicon_entry.ContextualRuleBasedRanker.int_2_str`.
wildcards_number_indexes : `int`
The number of indexes that each wildcard count value should have when
converting the wildcard count value to a string using
`pymusas.rankers.lexicon_entry.ContextualRuleBasedRanker.int_2_str`.
n_gram_ranking_dictionary : `Dict[int, int]`
Maps the n-gram length to it's rank value, as the n-gram length is
inverse to it's rank, as the larger the n-gram length the lower it's
rank.
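# Examples
An illustrative construction; the attribute values below follow directly from
the two constructor arguments:
``` python
>>> from pymusas.rankers.lexicon_entry import ContextualRuleBasedRanker
>>> ranker = ContextualRuleBasedRanker(maximum_n_gram_length=3,
...                                    maximum_number_wildcards=1)
>>> assert ranker.n_gram_number_indexes == 1
>>> assert ranker.wildcards_number_indexes == 1
>>> assert ranker.n_gram_ranking_dictionary == {1: 3, 2: 2, 3: 1}
```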
'''
def __init__(self, maximum_n_gram_length: int,
maximum_number_wildcards: int) -> None:
self._maximum_n_gram_length = maximum_n_gram_length
self._maximum_number_wildcards = maximum_number_wildcards
self.n_gram_number_indexes = len(str(maximum_n_gram_length))
self.wildcards_number_indexes = len(str(maximum_number_wildcards))
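# Longer n-grams map to lower (better) rank values, e.g. a maximum
# n-gram length of 3 gives {1: 3, 2: 2, 3: 1}.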
self.n_gram_ranking_dictionary: Dict[int, int] = \
dict(zip(range(1, maximum_n_gram_length + 1, 1),
range(maximum_n_gram_length, 0, -1)))
def to_bytes(self) -> bytes:
'''
Serialises the :class:`ContextualRuleBasedRanker` to a bytestring.
# Returns
`bytes`
'''
serialise = {}
serialise['maximum_n_gram_length'] = srsly.msgpack_dumps(self._maximum_n_gram_length)
serialise['maximum_number_wildcards'] = srsly.msgpack_dumps(self._maximum_number_wildcards)
return cast(bytes, srsly.msgpack_dumps(serialise))
@staticmethod
def from_bytes(bytes_data: bytes) -> "ContextualRuleBasedRanker":
'''
Loads :class:`ContextualRuleBasedRanker` from the given bytestring and
returns it.
# Parameters
bytes_data : `bytes`
The bytestring to load.
# Returns
:class:`ContextualRuleBasedRanker`
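# Examples
A round trip through `to_bytes` and `from_bytes`:
``` python
>>> from pymusas.rankers.lexicon_entry import ContextualRuleBasedRanker
>>> ranker = ContextualRuleBasedRanker(2, 0)
>>> assert ranker == ContextualRuleBasedRanker.from_bytes(ranker.to_bytes())
```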
'''
serialise_data = srsly.msgpack_loads(bytes_data)
maximum_n_gram_length = srsly.msgpack_loads(serialise_data['maximum_n_gram_length'])
maximum_number_wildcards = srsly.msgpack_loads(serialise_data['maximum_number_wildcards'])
return ContextualRuleBasedRanker(maximum_n_gram_length,
maximum_number_wildcards)
@staticmethod
def get_construction_arguments(rules: List['Rule']) -> Tuple[int, int]:
'''
Given a `List` of rules it will return the `maximum_n_gram_length` and
`maximum_number_wildcards` from the lexicon collections that those
:class:`pymusas.taggers.rules.rule.Rule`(s) are based on. The output from
this function can then be used as the arguments to the constructor of
:class:`ContextualRuleBasedRanker`.
# Parameters
rules : `List[Rule]`
A `List` of rules. This `List` is typically required when creating
a :class:`pymusas.taggers.rule_based.RuleBasedTagger` tagger.
# Returns
`Tuple[int, int]`
# Examples
``` python
>>> from pymusas.rankers.lexicon_entry import ContextualRuleBasedRanker
>>> from pymusas.taggers.rules.mwe import MWERule
>>> from pymusas.lexicon_collection import MWELexiconCollection
>>> pt_mwe_lexicon_url = "https://raw.githubusercontent.com/UCREL/Multilingual-USAS/master/Portuguese/mwe-pt.tsv"
>>> mwe_dict = MWELexiconCollection.from_tsv(pt_mwe_lexicon_url)
>>> mwe_rule = MWERule(mwe_dict)
>>> ranker_construction_arguments = ContextualRuleBasedRanker.get_construction_arguments([mwe_rule])
>>> ranker = ContextualRuleBasedRanker(*ranker_construction_arguments)
```
'''
maximum_n_gram_length = 0
maximum_number_wildcards = 0
for rule in rules:
rule_max_n_gram_length = 0
rule_max_number_wildcards = 0
if isinstance(rule, SingleWordRule):
rule_max_n_gram_length = 1
elif isinstance(rule, MWERule):
rule_max_n_gram_length = rule.mwe_lexicon_collection.longest_mwe_template
rule_max_number_wildcards = rule.mwe_lexicon_collection.most_wildcards_in_mwe_template
if rule_max_n_gram_length > maximum_n_gram_length:
maximum_n_gram_length = rule_max_n_gram_length
if rule_max_number_wildcards > maximum_number_wildcards:
maximum_number_wildcards = rule_max_number_wildcards
return (maximum_n_gram_length, maximum_number_wildcards)
@staticmethod
def int_2_str(int_value: int, number_indexes: int) -> str:
'''
Converts the integer, `int_value`, to a string with `number_indexes`,
e.g. `10` and `05` both have `number_indexes` of 2 and `001`, `020`,
and `211` have `number_indexes` of 3.
# Parameters
int_value : `int`
The integer to convert to a string with the given `number_indexes`.
number_indexes : `int`
The number of indexes the `int_value` should have in the returned
string.
# Returns
`str`
# Raises
`ValueError`
If the number of indexes of `int_value`, when converted to a
string, is greater than the given `number_indexes`.
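# Examples
For instance, zero padding an integer to the requested number of indexes:
``` python
>>> from pymusas.rankers.lexicon_entry import ContextualRuleBasedRanker
>>> assert ContextualRuleBasedRanker.int_2_str(10, 3) == '010'
>>> assert ContextualRuleBasedRanker.int_2_str(5, 2) == '05'
```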
'''
str_value = str(int_value)
str_value_number_indexes = len(str_value)
if str_value_number_indexes > number_indexes:
error_msg = (f"Cannot convert int ({int_value}) to a ranked"
f" string as the maximum number of indexes it can be"
f" is {number_indexes}.")
raise ValueError(error_msg)
number_prefix_zeros_to_add = number_indexes - str_value_number_indexes
prefix_zeros = '0' * number_prefix_zeros_to_add
return f'{prefix_zeros}{str_value}'
@staticmethod
def get_global_lowest_ranks(token_ranking_data: List[List[RankingMetaData]],
token_rankings: List[List[int]],
ranking_data_to_exclude: Optional[Set[RankingMetaData]] = None
) -> List[Optional[RankingMetaData]]:
'''
Returns the global lowest ranked entry match for each token. If the value
is `None` then no global lowest ranked entry can be found for that token.
If the `RankingMetaData` represents more than one token, like a Multi
Word Expression (MWE) match, then those associated tokens will have the
same `RankingMetaData` object as the global lowest ranked entry match.
Time Complexity, given *N* is the number of tokens, *M* is the number
of unique ranking data, and *P* is the number of ranking data (non-unique)
then the time complexity is:
O(N + P) + O(M log M) + O(M)
# Parameters
token_ranking_data : `List[List[RankingMetaData]]`
For each token a `List` of
:class:`pymusas.rankers.ranking_meta_data.RankingMetaData` representing
the lexicon entry match.
token_rankings : `List[List[int]]`
For each token contains the ranks of the lexicon entry matches.
**Note** that the `List` can be empty if a token has no lexicon
entry matches.
ranking_data_to_exclude : `Set[RankingMetaData]`, optional (default = `None`)
Any :class:`pymusas.rankers.ranking_meta_data.RankingMetaData` to
exclude from the ranking selection; this can be useful when wanting
to get the next best global rank for each token.
# Raises
`AssertionError`
If the length of `token_ranking_data` is not equal to the length of
`token_rankings`, for both the outer and inner `List`s.
# Examples
``` python
>>> from pymusas.rankers.lexicon_entry import ContextualRuleBasedRanker
>>> from pymusas.rankers.ranking_meta_data import RankingMetaData
>>> from pymusas.lexicon_collection import LexiconType
>>> from pymusas.rankers.lexical_match import LexicalMatch
>>> north_east = RankingMetaData(LexiconType.MWE_NON_SPECIAL, 2, 0,
... False, LexicalMatch.TOKEN, 0, 2,
... 'North_noun East_noun', ('Z1',))
>>> east_london_brewery = RankingMetaData(LexiconType.MWE_NON_SPECIAL, 3, 0,
... False, LexicalMatch.TOKEN, 1, 4,
... 'East_noun London_noun brewery_noun', ('Z1',))
>>> token_ranking_data = [
... [
... north_east
... ],
... [
... north_east,
... east_london_brewery
... ],
... [
... east_london_brewery
... ],
... [
... east_london_brewery
... ]
... ]
>>> token_rankings = [[120110], [120110, 110111], [110111], [110111]]
>>> expected_lowest_ranked_matches = [None, east_london_brewery,
... east_london_brewery, east_london_brewery]
>>> assert (ContextualRuleBasedRanker.get_global_lowest_ranks(token_ranking_data, token_rankings, None)
... == expected_lowest_ranked_matches)
```
Following on from the previous example, we now want to find the next best
global match for each token so we exclude the current best global match
for each token which is the `east_london_brewery` match:
``` python
>>> expected_lowest_ranked_matches = [north_east, north_east, None, None]
>>> ranking_data_to_exclude = {east_london_brewery}
>>> assert (ContextualRuleBasedRanker.get_global_lowest_ranks(token_ranking_data, token_rankings,
... ranking_data_to_exclude)
... == expected_lowest_ranked_matches)
```
'''
if ranking_data_to_exclude is None:
ranking_data_to_exclude = set()
assert len(token_ranking_data) == len(token_rankings), 'Lengths should be equal'
ranking_meta_data: DefaultDict[int, Set[RankingMetaData]] = collections.defaultdict(set)
for token_data, token_ranking in zip(token_ranking_data, token_rankings):
assert len(token_data) == len(token_ranking), 'Lengths should be equal'
for data, rank in zip(token_data, token_ranking):
if data in ranking_data_to_exclude:
continue
ranking_meta_data[rank].add(data)
global_lowest_ranks: List[Optional[RankingMetaData]] = \
[None for _ in token_ranking_data]
ordered_ranking_meta_data = sorted(ranking_meta_data.items(),
key=lambda x: x[0])
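# Iterate from the lowest (best) rank upwards; a match is only assigned
# to its tokens if none of those tokens have already been claimed by a
# lower ranked match.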
for rank, meta_data in ordered_ranking_meta_data:
for data in meta_data:
start, end = data.token_match_start_index, data.token_match_end_index
if any(global_lowest_ranks[start: end]):
continue
for index in range(start, end):
global_lowest_ranks[index] = data
return global_lowest_ranks
def __call__(self, token_ranking_data: List[List[RankingMetaData]]
) -> Tuple[List[List[int]], List[Optional[RankingMetaData]]]:
'''
For each token it returns a `List` of rankings for each lexicon entry
match and the optional :class:`pymusas.rankers.ranking_meta_data.RankingMetaData`
object of the **global** lowest ranked match for each token.
See the ranking rules in the class docstring for details on how
each lexicon entry match is ranked.
Time Complexity, given *N* is the number of tokens, *M* is the number
of unique ranking data, and *P* is the number of ranking data (non-unique)
then the time complexity is:
O(3(N + P)) + O(M log M) + O(M)
# Parameters
token_ranking_data : `List[List[RankingMetaData]]`
For each token a `List` of :class:`pymusas.rankers.ranking_meta_data.RankingMetaData`
representing the lexicon entry match.
# Returns
`Tuple[List[List[int]], List[Optional[RankingMetaData]]]`
# Examples
```python
>>> from pymusas.rankers.lexicon_entry import ContextualRuleBasedRanker
>>> from pymusas.rankers.ranking_meta_data import RankingMetaData
>>> from pymusas.lexicon_collection import LexiconType
>>> from pymusas.rankers.lexical_match import LexicalMatch
>>> north_east = RankingMetaData(LexiconType.MWE_NON_SPECIAL, 2, 0,
... False, LexicalMatch.TOKEN, 0, 2,
... 'North_noun East_noun', ('Z1',))
>>> east_london_brewery = RankingMetaData(LexiconType.MWE_NON_SPECIAL, 3, 0,
... False, LexicalMatch.TOKEN, 1, 4,
... 'East_noun London_noun brewery_noun', ('Z1',))
>>> token_ranking_data = [
... [
... north_east
... ],
... [
... north_east,
... east_london_brewery
... ],
... [
... east_london_brewery
... ],
... [
... east_london_brewery
... ]
... ]
>>> expected_ranks = [[120110], [120110, 110111], [110111], [110111]]
>>> expected_lowest_ranked_matches = [None, east_london_brewery,
... east_london_brewery, east_london_brewery]
>>> ranker = ContextualRuleBasedRanker(3, 0)
>>> assert ((expected_ranks, expected_lowest_ranked_matches)
... == ranker(token_ranking_data))
```
'''
lexicon_type_to_rank = {
LexiconType.MWE_NON_SPECIAL: 1,
LexiconType.MWE_WILDCARD: 2,
LexiconType.MWE_CURLY_BRACES: 3,
LexiconType.SINGLE_NON_SPECIAL: 4
}
exclude_pos_information_to_rank = {
False: 1,
True: 2
}
initial_rankings: List[List[str]] = []
largest_token_index = 0
for token in token_ranking_data:
token_rankings: List[str] = []
for ranking_data in token:
lexicon_type_rank = lexicon_type_to_rank[ranking_data.lexicon_type]
n_gram_rank = self.n_gram_ranking_dictionary[ranking_data.lexicon_n_gram_length]
n_gram_str_rank = self.int_2_str(n_gram_rank, self.n_gram_number_indexes)
wildcard_str_rank = self.int_2_str(ranking_data.lexicon_wildcard_count,
self.wildcards_number_indexes)
exclude_pos_information_rank = exclude_pos_information_to_rank[ranking_data.exclude_pos_information]
lexical_match_rank = ranking_data.lexical_match.value
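# Rules 1-5: concatenate fixed-width digit fields so that a lower
# concatenated integer corresponds to a lower (better) rank.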
rank_str = (f'{lexicon_type_rank}{n_gram_str_rank}{wildcard_str_rank}'
f'{exclude_pos_information_rank}{lexical_match_rank}')
token_rankings.append(rank_str)
if largest_token_index < ranking_data.token_match_start_index:
largest_token_index = ranking_data.token_match_start_index
if largest_token_index < ranking_data.token_match_end_index:
largest_token_index = ranking_data.token_match_end_index
initial_rankings.append(token_rankings)
# Add to each token ranking where it first appears in the text, rule 6.
largest_token_index_number_indexes = len(str(largest_token_index))
rankings: List[List[int]] = []
for str_token_rankings, token in zip(initial_rankings, token_ranking_data):
int_token_rankings: List[int] = []
for str_ranking, ranking_data in zip(str_token_rankings, token):
start_index_str_rank = self.int_2_str(ranking_data.token_match_start_index,
largest_token_index_number_indexes)
int_token_ranking = int(f'{str_ranking}{start_index_str_rank}')
int_token_rankings.append(int_token_ranking)
rankings.append(int_token_rankings)
global_lowest_rank_indexes = self.get_global_lowest_ranks(token_ranking_data,
rankings, None)
return (rankings, global_lowest_rank_indexes)
def __eq__(self, other: object) -> bool:
'''
Given another object to compare to, it will return `True` if the other
object is of the same class and was initialised with the same
`maximum_n_gram_length` and `maximum_number_wildcards` values.
# Parameters
other : `object`
The object to compare to.
# Returns
`bool`
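# Examples
For instance, equality only depends on the two constructor arguments:
``` python
>>> from pymusas.rankers.lexicon_entry import ContextualRuleBasedRanker
>>> assert ContextualRuleBasedRanker(2, 0) == ContextualRuleBasedRanker(2, 0)
>>> assert ContextualRuleBasedRanker(2, 0) != ContextualRuleBasedRanker(3, 0)
```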
'''
if not isinstance(other, ContextualRuleBasedRanker):
return False
if self._maximum_n_gram_length != other._maximum_n_gram_length:
return False
if self._maximum_number_wildcards != other._maximum_number_wildcards:
return False
return True