From 0cfb15a88ea3a95b111ad301456d5c0413f41998 Mon Sep 17 00:00:00 2001 From: Kazuma TAKAOKA Date: Thu, 4 Apr 2019 14:21:11 +0900 Subject: [PATCH] Fix the following bugs * A POS is left unmatched when a longer POS that includes it is also in the filter - When A and A-B are in the filter, A is unmatched * A POS is matched by a filter entry that is longer than it - When A-B-C-D-E is in the filter, A-B-C-D is matched --- .../nlp/lucene/sudachi/ja/PartOfSpeechTrie.java | 8 +++++--- .../ja/TestSudachiPartOfSpeechStopFilter.java | 17 +++++++++++++++++ 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/src/main/java/com/worksap/nlp/lucene/sudachi/ja/PartOfSpeechTrie.java b/src/main/java/com/worksap/nlp/lucene/sudachi/ja/PartOfSpeechTrie.java index a8e8fb21..e50d9e7d 100644 --- a/src/main/java/com/worksap/nlp/lucene/sudachi/ja/PartOfSpeechTrie.java +++ b/src/main/java/com/worksap/nlp/lucene/sudachi/ja/PartOfSpeechTrie.java @@ -23,6 +23,7 @@ public class PartOfSpeechTrie { static final String EMPTY_SYMBOL = "*"; + static final String LEAF = ""; Map root = new HashMap<>(); @@ -37,6 +38,7 @@ public void add(String... 
items) { (Map)node.computeIfAbsent(item, k -> new HashMap<>()); node = newNode; } + node.put(LEAF, LEAF); } public boolean isPrefixOf(List items, int begin, int end) { @@ -47,17 +49,17 @@ public boolean isPrefixOf(List items, int begin, int end) { for (int i = begin; i < end; i++) { String item = items.get(i); if (EMPTY_SYMBOL.equals(item)) { - return node.isEmpty(); + return node.containsKey(LEAF); } @SuppressWarnings("unchecked") Map newNode = (Map)node.get(item); node = newNode; if (node == null) { return false; - } else if (node.isEmpty()) { + } else if (node.containsKey(LEAF)) { return true; } } - return true; + return node.containsKey(LEAF); } } diff --git a/src/test/java/com/worksap/nlp/lucene/sudachi/ja/TestSudachiPartOfSpeechStopFilter.java b/src/test/java/com/worksap/nlp/lucene/sudachi/ja/TestSudachiPartOfSpeechStopFilter.java index d816517b..0728b136 100644 --- a/src/test/java/com/worksap/nlp/lucene/sudachi/ja/TestSudachiPartOfSpeechStopFilter.java +++ b/src/test/java/com/worksap/nlp/lucene/sudachi/ja/TestSudachiPartOfSpeechStopFilter.java @@ -91,4 +91,21 @@ public void testConjugationForm() throws IOException { assertTokenStreamContents(tokenStream, new String[] {"東京都", "東京", "都", "に", "行っ"}); } + + public void testPrefixWithUnmatchedSubcategory() throws IOException { + String tags = "助詞,格助詞\n助詞,格助詞,引用\n"; + factory.inform(new StringResourceLoader(tags)); + tokenStream = factory.create(tokenStream); + assertTokenStreamContents(tokenStream, + new String[] {"東京都", "東京", "都", "行っ", "た"}); + } + + public void testTooLongCategory() throws IOException { + String tags = "名詞,固有名詞,地名,一般,一般\n"; + factory.inform(new StringResourceLoader(tags)); + tokenStream = factory.create(tokenStream); + assertTokenStreamContents(tokenStream, + new String[] {"東京都", "東京", "都", "に", "行っ", "た"}); + } + }