This repository has been archived by the owner on Mar 9, 2023. It is now read-only.
-
-
Notifications
You must be signed in to change notification settings - Fork 48
/
test_join_katakana_oov_plugin.py
86 lines (69 loc) · 3.06 KB
/
test_join_katakana_oov_plugin.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
# Copyright (c) 2019 Works Applications Co., Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import unittest
from sudachipy.config import settings
from sudachipy.dictionary import Dictionary
from sudachipy.plugin.path_rewrite import JoinKatakanaOovPlugin
from sudachipy.utf8inputtextbuilder import UTF8InputTextBuilder
class TestJoinKatakanaOOVPlugin(unittest.TestCase):
    """Run ``JoinKatakanaOovPlugin.rewrite`` over best paths from the test dictionary."""

    def setUp(self):
        """Create a fresh dictionary, tokenizer and plugin for every test."""
        # The 'resources' directory sits one level above this file's directory.
        this_dir = os.path.dirname(os.path.abspath(__file__))
        resource_dir = os.path.join(this_dir, os.pardir, 'resources')
        config_path = os.path.join(resource_dir, 'sudachi.json')

        self.dict_ = Dictionary(config_path, resource_dir)
        self.tokenizer = self.dict_.create()

        # NOTE(review): index 1 of 'pathRewritePlugin' is presumably the
        # JoinKatakanaOovPlugin entry of the default settings -- confirm.
        plugin_config = settings['pathRewritePlugin'][1]
        self.plugin = JoinKatakanaOovPlugin(plugin_config)

    def test_katakana_length(self):
        # アイ and アイウ are in the dictionary.  The rewritten path keeps two
        # nodes for minimum lengths 0 through 2 and becomes one node at 3.
        expected_by_min_length = ((0, 2), (1, 2), (2, 2), (3, 1))
        for min_length, expected_nodes in expected_by_min_length:
            self.plugin._min_length = min_length
            rewritten = self.get_path('アイアイウ')
            self.assertEqual(expected_nodes, len(rewritten))

    def test_pos(self):
        # アイアイウ is 名詞-固有名詞-地名-一般 (noun / proper noun / place name /
        # general) in the dictionary, so the single resulting node must not
        # be reported as out-of-vocabulary.
        self.plugin._min_length = 3
        rewritten = self.get_path('アイアイウ')
        self.assertEqual(1, len(rewritten))
        head = rewritten[0]
        self.assertFalse(head.is_oov())

    def test_starts_with_middle(self):
        # The whole input is expected to end up as one node.
        self.plugin._min_length = 3
        rewritten = self.get_path('アイウアイアイウ')
        self.assertEqual(1, len(rewritten))

    def test_starts_with_tail(self):
        # The whole input is expected to end up as one node.
        self.plugin._min_length = 3
        rewritten = self.get_path('アイウアイウアイ')
        self.assertEqual(1, len(rewritten))

    def test_with_nooovbow(self):
        self.plugin._min_length = 3

        # A leading small ァ stays a node of its own.
        # NOTE(review): presumably because ァ belongs to a character category
        # that may not begin an OOV word -- confirm against the char
        # definition used by the test resources.
        leading = self.get_path('ァアイアイウ')
        self.assertEqual(2, len(leading))
        first_surface = leading[0].get_word_info().surface
        self.assertEqual('ァ', first_surface)

        # The same character in the middle of the input does not split it.
        embedded = self.get_path('アイウァアイウ')
        self.assertEqual(1, len(embedded))

    def get_path(self, text: str):
        """Return the best path for *text* after passing it through the plugin.

        The lattice is built on the test tokenizer, its best path is handed
        to ``self.plugin.rewrite`` together with the input and the lattice,
        and the lattice is cleared before the path is returned.
        """
        tokenizer = self.tokenizer
        built_input = UTF8InputTextBuilder(text, tokenizer._grammar).build()
        tokenizer._build_lattice(built_input)

        lattice = tokenizer._lattice
        best_path = lattice.get_best_path()
        self.plugin.rewrite(built_input, best_path, lattice)
        lattice.clear()
        return best_path
if __name__ == '__main__':
    # Let this test module be executed directly as a script.
    unittest.main()