diff --git a/src/main/java/com/worksap/nlp/sudachi/Dictionary.java b/src/main/java/com/worksap/nlp/sudachi/Dictionary.java
index 1db86ebc..e0dc39c7 100644
--- a/src/main/java/com/worksap/nlp/sudachi/Dictionary.java
+++ b/src/main/java/com/worksap/nlp/sudachi/Dictionary.java
@@ -16,8 +16,12 @@
 
 package com.worksap.nlp.sudachi;
 
+import com.worksap.nlp.sudachi.dictionary.POS;
+
 import java.io.IOException;
+import java.util.Arrays;
 import java.util.List;
+import java.util.function.Predicate;
 
 /**
  * A lexicon and a grammar for morphological analysis.
@@ -64,4 +68,48 @@ public interface Dictionary extends AutoCloseable {
      *             if {@code posId} is out of the range
      */
     public List<String> getPartOfSpeechString(short posId);
+
+    /**
+     * Create a POS matcher that will match any POS for which the passed
+     * predicate returns true. PosMatcher will be much faster than doing string
+     * comparison on POS objects.
+     *
+     * @param predicate
+     *            returns true if the POS is needed
+     * @return PosMatcher object that mirrors behavior of the predicate
+     */
+    PosMatcher posMatcher(Predicate<POS> predicate);
+
+    /**
+     * Create a POS matcher that will match any POS matched by at least one of
+     * the passed partially-defined POS.
+     *
+     * @param posList
+     *            collection of partially defined part-of-speech objects
+     * @return mirroring PosMatcher object
+     * @see PartialPOS
+     */
+    default PosMatcher posMatcher(Iterable<PartialPOS> posList) {
+        return posMatcher(posRepr -> {
+            for (PartialPOS p : posList) {
+                if (p.matches(posRepr)) {
+                    return true;
+                }
+            }
+            return false;
+        });
+    }
+
+    /**
+     * Varargs variant of {@link #posMatcher(Iterable)}: matches any POS matched
+     * by at least one of the passed partially-defined POS.
+     *
+     * @param posList
+     *            partially defined part-of-speech objects
+     * @return mirroring PosMatcher object
+     * @see PartialPOS
+     */
+    default PosMatcher posMatcher(PartialPOS... posList) {
+        return posMatcher(Arrays.asList(posList));
+    }
 }
diff --git a/src/main/java/com/worksap/nlp/sudachi/JapaneseDictionary.java b/src/main/java/com/worksap/nlp/sudachi/JapaneseDictionary.java
index cd3f5f12..70b547e0 100644
--- a/src/main/java/com/worksap/nlp/sudachi/JapaneseDictionary.java
+++ b/src/main/java/com/worksap/nlp/sudachi/JapaneseDictionary.java
@@ -26,6 +26,8 @@
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.List;
+import java.util.function.Predicate;
+import java.util.stream.IntStream;
 
 public class JapaneseDictionary implements Dictionary, DictionaryAccess {
 
@@ -166,4 +168,16 @@ public LexiconSet getLexicon() {
         return lexicon;
     }
 
+    /**
+     * Evaluates the predicate once for every POS id of the grammar and stores ids
+     * of accepted POS in the returned matcher.
+     */
+    @Override
+    public PosMatcher posMatcher(Predicate<POS> predicate) {
+        GrammarImpl grammar = getGrammar();
+        int numPos = grammar.getPartOfSpeechSize();
+        int[] ids = IntStream.range(0, numPos).filter(id -> predicate.test(grammar.getPartOfSpeechString((short) id)))
+                .toArray();
+        return new PosMatcher(ids, this);
+    }
 }
diff --git a/src/main/java/com/worksap/nlp/sudachi/PartialPOS.java b/src/main/java/com/worksap/nlp/sudachi/PartialPOS.java
new file mode 100644
index 00000000..d43bab82
--- /dev/null
+++ b/src/main/java/com/worksap/nlp/sudachi/PartialPOS.java
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2022 Works Applications Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.worksap.nlp.sudachi;
+
+import com.worksap.nlp.sudachi.dictionary.POS;
+
+import java.util.AbstractList;
+import java.util.Arrays;
+import java.util.List;
+
+/** POS pattern for {@link Dictionary#posMatcher(Iterable)}; null and omitted trailing components match any value. */
+public class PartialPOS extends AbstractList<String> {
+    private final List<String> data;
+
+    public PartialPOS(List<String> data) {
+        if (data.isEmpty()) {
+            throw new IllegalArgumentException("Partial POS must have at least 1 component");
+        }
+        if (data.size() > POS.DEPTH) {
+            throw new IllegalArgumentException(
+                    "Partial POS can have at most " + POS.DEPTH + " components, was " + data);
+        }
+        for (String component : data) {
+            if (component != null && component.length() > POS.MAX_COMPONENT_LENGTH) {
+                throw new IllegalArgumentException("Component length can't be more than " + POS.MAX_COMPONENT_LENGTH
+                        + ", was " + component.length() + ":" + component);
+            }
+        }
+        // Keep a private copy so later changes to the caller's list or array cannot bypass the checks above.
+        this.data = Arrays.asList(data.toArray(new String[0]));
+    }
+
+    public PartialPOS(String... data) {
+        this(Arrays.asList(data));
+    }
+
+    @Override
+    public String get(int index) {
+        return data.get(index);
+    }
+
+    @Override
+    public int size() {
+        return data.size();
+    }
+
+    boolean matches(POS pos) {
+        for (int level = 0; level < data.size(); ++level) {
+            String expected = data.get(level); // null is a wildcard for this level
+            if (expected != null && !expected.equals(pos.get(level))) {
+                return false;
+            }
+        }
+        return true;
+    }
+
+    @Override
+    public String toString() {
+        return String.join(",", data);
+    }
+
+    public static PartialPOS of(String... parts) {
+        return new PartialPOS(parts);
+    }
+}
diff --git a/src/main/java/com/worksap/nlp/sudachi/PosMatcher.java b/src/main/java/com/worksap/nlp/sudachi/PosMatcher.java
new file mode 100644
index 00000000..f174dd4b
--- /dev/null
+++ b/src/main/java/com/worksap/nlp/sudachi/PosMatcher.java
@@ -0,0 +1,155 @@
+/*
+ * Copyright (c) 2022 Works Applications Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.worksap.nlp.sudachi;
+
+import com.worksap.nlp.sudachi.dictionary.POS;
+
+import java.util.BitSet;
+import java.util.Iterator;
+import java.util.NoSuchElementException;
+import java.util.function.Predicate;
+import java.util.stream.IntStream;
+
+/**
+ * API for checking if a morpheme belongs to a set of parts of speech. Use
+ * factory methods of {@link Dictionary} object to create instances.
+ *
+ * @see Dictionary#posMatcher(Predicate)
+ * @see Dictionary#posMatcher(Iterable)
+ */
+public class PosMatcher implements Predicate<Morpheme>, Iterable<POS> {
+    private final BitSet matching;
+    private final JapaneseDictionary dictionary;
+
+    /**
+     * Creates a PosMatcher for a given Dictionary and list of POS ids. This is a
+     * low-level API, use factory methods of {@link Dictionary} instead.
+     *
+     * @param ids
+     *            list of POS ids
+     * @param dictionary
+     *            related dictionary
+     */
+    public PosMatcher(int[] ids, JapaneseDictionary dictionary) {
+        BitSet bits = new BitSet();
+        for (int id : ids) {
+            bits.set(id);
+        }
+        matching = bits;
+        this.dictionary = dictionary;
+    }
+
+    private PosMatcher(BitSet data, JapaneseDictionary dictionary) {
+        this.matching = data;
+        this.dictionary = dictionary;
+    }
+
+    /**
+     * Returns a PosMatcher which matches POS present in any of the two matchers.
+     *
+     * @param other
+     *            second matcher
+     * @return PosMatcher which matches union of POS tags
+     * @throws IllegalArgumentException if other was created for a different dictionary
+     */
+    public PosMatcher union(PosMatcher other) {
+        checkCompatibility(other);
+        BitSet merged = (BitSet) matching.clone();
+        merged.or(other.matching);
+        return new PosMatcher(merged, dictionary);
+    }
+
+    /**
+     * Returns a PosMatcher which matches POS present in both of the matchers.
+     *
+     * @param other
+     *            second matcher
+     * @return PosMatcher which matches intersection of POS tags
+     * @throws IllegalArgumentException if other was created for a different dictionary
+     */
+    public PosMatcher intersection(PosMatcher other) {
+        checkCompatibility(other);
+        BitSet merged = (BitSet) matching.clone();
+        merged.and(other.matching);
+        return new PosMatcher(merged, dictionary);
+    }
+
+    /**
+     * Returns a PosMatcher for POS not present in the current PosMatcher.
+     *
+     * @return PosMatcher which is inverse of this
+     */
+    public PosMatcher invert() {
+        // bitset can be shorter than number of ids, so we create the full id range and
+        // filter out the ids which are matched by this matcher
+        int[] indices = IntStream.range(0, dictionary.getPartOfSpeechSize()).filter(idx -> !matching.get(idx))
+                .toArray();
+        return new PosMatcher(indices, dictionary);
+    }
+
+    private void checkCompatibility(PosMatcher other) {
+        if (dictionary != other.dictionary) {
+            throw new IllegalArgumentException("PosMatchers are using different dictionaries");
+        }
+    }
+
+    /**
+     * Checks that {@link Morpheme} matches the POS configuration. It is incorrect
+     * to pass the Morpheme produced from other {@link Dictionary} than the one
+     * which was used to create the current instance of {@link PosMatcher}.
+     * <p>
+     * When assertions are enabled, this method checks if Morpheme was produced by
+     * the same dictionary.
+     *
+     * @param morpheme
+     *            the input argument
+     * @return true if morpheme matches the current configuration, false otherwise
+     */
+    @Override
+    public boolean test(Morpheme morpheme) {
+        assert ((MorphemeImpl) morpheme).list.grammar == dictionary.grammar : "morpheme is from another dictionary";
+        return matching.get(morpheme.partOfSpeechId());
+    }
+
+    /**
+     * Iterates POS tags which are matched by this matcher, in ascending order of POS id.
+     *
+     * @return Iterator for POS tags
+     */
+    @Override
+    public Iterator<POS> iterator() {
+        return new Iterator<POS>() {
+            private int index = matching.nextSetBit(0);
+
+            @Override
+            public boolean hasNext() {
+                return index >= 0;
+            }
+
+            @Override
+            public POS next() {
+                if (index < 0) {
+                    throw new NoSuchElementException();
+                }
+                short posId = (short) index;
+                POS result = dictionary.getGrammar().getPartOfSpeechString(posId);
+                index = matching.nextSetBit(index + 1);
+                return result;
+            }
+        };
+    }
+}
diff --git a/src/test/java/com/worksap/nlp/sudachi/PosMatcherTest.kt b/src/test/java/com/worksap/nlp/sudachi/PosMatcherTest.kt
new file mode 100644
index 00000000..cdca9810
--- /dev/null
+++ b/src/test/java/com/worksap/nlp/sudachi/PosMatcherTest.kt
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2022 Works Applications Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.worksap.nlp.sudachi
+
+import com.worksap.nlp.sudachi.dictionary.POS
+import kotlin.test.*
+
+/** Tests for [PosMatcher] and [PartialPOS] using the dictionary created from [TestDictionary.user2Cfg]. */
+class PosMatcherTest {
+
+  private val dic = DictionaryFactory().create(TestDictionary.user2Cfg()) as JapaneseDictionary
+
+  @Test
+  fun basic() {
+    val nouns = dic.posMatcher(PartialPOS("名詞"))
+    val morphs = dic.create().tokenize("京都に行った")
+    assertEquals(4, morphs.size)
+    assertTrue(nouns.test(morphs[0]))
+    assertFalse(nouns.test(morphs[1]))
+    assertFalse(nouns.test(morphs[2]))
+    assertFalse(nouns.test(morphs[3]))
+  }
+
+  @Test
+  fun userDic() {
+    val filter = dic.posMatcher { it[3] == "ミカン科" }
+    val morphs = dic.create().tokenize("すだちにかぼす")
+    assertEquals(3, morphs.size)
+    assertTrue(filter.test(morphs[0]))
+    assertFalse(filter.test(morphs[1]))
+    assertTrue(filter.test(morphs[2]))
+  }
+
+  @Test
+  fun union() {
+    val f1 = dic.posMatcher { it[5] == "スダチ" }
+    val f2 = dic.posMatcher { it[5] == "カボス" }
+    val filter = f1.union(f2)
+    val morphs = dic.create().tokenize("すだちにかぼす")
+    assertEquals(3, morphs.size)
+    assertTrue(filter.test(morphs[0]))
+    assertFalse(filter.test(morphs[1]))
+    assertTrue(filter.test(morphs[2]))
+  }
+
+  @Test
+  fun intersection() {
+    val f1 = dic.posMatcher { it[5] == "終止形-一般" }
+    val f2 = dic.posMatcher { it[0] == "動詞" }
+    val filter = f1.intersection(f2)
+    val morphs = dic.create().tokenize("いった東京行く")
+    assertEquals(4, morphs.size)
+    assertFalse(filter.test(morphs[0]))
+    assertFalse(filter.test(morphs[1]))
+    assertFalse(filter.test(morphs[2]))
+    assertTrue(filter.test(morphs[3]))
+  }
+
+  @Test
+  fun invert() {
+    val filter = dic.posMatcher { it[3] == "ミカン科" }.invert()
+    val morphs = dic.create().tokenize("すだちにかぼす")
+    assertEquals(3, morphs.size)
+    assertFalse(filter.test(morphs[0]))
+    assertTrue(filter.test(morphs[1]))
+    assertFalse(filter.test(morphs[2]))
+  }
+
+  @Test
+  fun iterator() {
+    val filter = dic.posMatcher { it[3] == "ミカン科" }
+    val expected =
+        listOf(
+            POS("被子植物門", "双子葉植物綱", "ムクロジ目", "ミカン科", "ミカン属", "スダチ"),
+            POS("被子植物門", "双子葉植物綱", "ムクロジ目", "ミカン科", "ミカン属", "カボス"))
+    assertEquals(expected, filter.toList())
+  }
+
+  @Test
+  fun iteratorThrows() {
+    val filter = dic.posMatcher(PartialPOS("動詞"))
+    val iter = filter.iterator()
+    assertTrue(iter.hasNext())
+    assertEquals(POS("動詞", "非自立可能", "*", "*", "五段-カ行", "終止形-一般"), iter.next())
+    assertTrue(iter.hasNext())
+    assertEquals(POS("動詞", "非自立可能", "*", "*", "五段-カ行", "連用形-促音便"), iter.next())
+    assertFalse(iter.hasNext())
+    assertFailsWith<NoSuchElementException> { iter.next() }
+  }
+
+  @Test
+  fun partialPos() {
+    assertFailsWith<IllegalArgumentException> { PartialPOS("a", "b", "c", "d", "e", "f", "g") }
+    assertFailsWith<IllegalArgumentException> { PartialPOS() }
+    assertFailsWith<IllegalArgumentException> { PartialPOS.of("1".repeat(300)) }
+  }
+
+  @Test
+  fun partialPosNull() {
+    val filter = dic.posMatcher(PartialPOS(null, "数詞"))
+    val matchingTags = filter.toList()
+    assertContains(matchingTags, POS("名詞", "数詞", "*", "*", "*", "*"))
+  }
+}