Skip to content

Commit

Permalink
Merge pull request #194 from WorksApplications/feature/arseny/posmatcher
Browse files Browse the repository at this point in the history
port SudachiPy PosMatcher to Java
  • Loading branch information
eiennohito committed Jul 21, 2022
2 parents aee8684 + 679d618 commit b906e97
Show file tree
Hide file tree
Showing 5 changed files with 410 additions and 0 deletions.
48 changes: 48 additions & 0 deletions src/main/java/com/worksap/nlp/sudachi/Dictionary.java
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,12 @@

package com.worksap.nlp.sudachi;

import com.worksap.nlp.sudachi.dictionary.POS;

import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import java.util.function.Predicate;

/**
* A lexicon and a grammar for morphological analysis.
Expand Down Expand Up @@ -64,4 +68,48 @@ public interface Dictionary extends AutoCloseable {
* if {@code posId} is out of the range
*/
public List<String> getPartOfSpeechString(short posId);

/**
 * Creates a {@link PosMatcher} that accepts exactly those parts of speech for
 * which the given predicate returns {@code true}.
 *
 * <p>
 * The resulting PosMatcher will be much faster than doing string comparison
 * on POS objects.
 *
 * @param predicate
 *            returns true if the POS is needed
 * @return PosMatcher object that mirrors behavior of the predicate
 */
PosMatcher posMatcher(Predicate<POS> predicate);

/**
 * Builds a POS matcher from a collection of partially-defined parts of
 * speech. A POS is accepted by the resulting matcher when at least one of the
 * given {@link PartialPOS} patterns matches it.
 *
 * @param posList
 *            partially defined part-of-speech patterns; iterated once for
 *            every POS the predicate is asked about
 * @return PosMatcher accepting every POS matched by any of the patterns
 * @see PartialPOS
 */
default PosMatcher posMatcher(Iterable<PartialPOS> posList) {
    Predicate<POS> matchesAnyPattern = candidate -> {
        boolean accepted = false;
        for (PartialPOS pattern : posList) {
            accepted = pattern.matches(candidate);
            if (accepted) {
                // first hit decides; remaining patterns are not consulted
                break;
            }
        }
        return accepted;
    };
    return posMatcher(matchesAnyPattern);
}

/**
 * Varargs convenience overload: builds a POS matcher that accepts every POS
 * matched by at least one of the given partially-defined parts of speech.
 *
 * @param posList
 *            partially defined part-of-speech patterns
 * @return PosMatcher accepting every POS matched by any of the patterns
 * @see PartialPOS
 * @see #posMatcher(Iterable)
 */
default PosMatcher posMatcher(PartialPOS... posList) {
    // wrap the array as a List so the Iterable overload does the actual work
    List<PartialPOS> patterns = Arrays.asList(posList);
    return posMatcher(patterns);
}
}
10 changes: 10 additions & 0 deletions src/main/java/com/worksap/nlp/sudachi/JapaneseDictionary.java
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.function.Predicate;
import java.util.stream.IntStream;

public class JapaneseDictionary implements Dictionary, DictionaryAccess {

Expand Down Expand Up @@ -166,4 +168,12 @@ public LexiconSet getLexicon() {
return lexicon;
}

/**
 * Evaluates the predicate once for every part of speech known to the grammar
 * (in ascending id order) and collects the ids of accepted ones into a
 * {@link PosMatcher} bound to this dictionary.
 */
@Override
public PosMatcher posMatcher(Predicate<POS> predicate) {
    GrammarImpl grammar = getGrammar();
    int numPos = grammar.getPartOfSpeechSize();

    // worst case every POS is accepted, so reserve one slot per POS id
    int[] accepted = new int[Math.max(0, numPos)];
    int count = 0;
    for (int id = 0; id < numPos; ++id) {
        POS pos = grammar.getPartOfSpeechString((short) id);
        if (predicate.test(pos)) {
            accepted[count] = id;
            count += 1;
        }
    }

    // trim the buffer down to the ids that were actually accepted
    int[] ids = new int[count];
    System.arraycopy(accepted, 0, ids, 0, count);
    return new PosMatcher(ids, this);
}
}
79 changes: 79 additions & 0 deletions src/main/java/com/worksap/nlp/sudachi/PartialPOS.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
/*
* Copyright (c) 2022 Works Applications Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.worksap.nlp.sudachi;

import com.worksap.nlp.sudachi.dictionary.POS;

import java.util.AbstractList;
import java.util.Arrays;
import java.util.List;

/**
 * A partially specified part of speech, used as a pattern for building a
 * {@link PosMatcher}.
 *
 * <p>
 * The pattern consists of 1 to {@link POS#DEPTH} components. A {@code null}
 * component acts as a wildcard for its level, and levels past the end of the
 * pattern are not checked at all. Instances are immutable: the components are
 * copied on construction, so later changes to the list or array passed to a
 * constructor do not affect the pattern.
 */
public class PartialPOS extends AbstractList<String> {
    private final List<String> data;

    /**
     * Creates a pattern from a list of components.
     *
     * @param data
     *            pattern components, {@code null} entries are wildcards
     * @throws IllegalArgumentException
     *             if the list is empty, has more than {@link POS#DEPTH}
     *             components or any component is longer than
     *             {@link POS#MAX_COMPONENT_LENGTH}
     */
    public PartialPOS(List<String> data) {
        // Defensive copy taken BEFORE validation: the caller keeps no handle on
        // the stored components, so the checks below can not be bypassed by
        // mutating the argument afterwards. Arrays.asList permits null entries.
        List<String> components = Arrays.asList(data.toArray(new String[0]));
        if (components.isEmpty()) {
            throw new IllegalArgumentException("Partial POS must have at least 1 component");
        }
        if (components.size() > POS.DEPTH) {
            throw new IllegalArgumentException(
                    "Partial POS can have at most " + POS.DEPTH + " components, was " + components);
        }
        for (String component : components) {
            if (component != null && component.length() > POS.MAX_COMPONENT_LENGTH) {
                throw new IllegalArgumentException("Component length can't be more than " + POS.MAX_COMPONENT_LENGTH
                        + ", was " + component.length() + ":" + component);
            }
        }
        this.data = components;
    }

    /**
     * Creates a pattern from individual components.
     *
     * @param data
     *            pattern components, {@code null} entries are wildcards
     * @throws IllegalArgumentException
     *             under the same conditions as {@link #PartialPOS(List)}
     */
    public PartialPOS(String... data) {
        // the list constructor copies the components, so the array is not retained
        this(Arrays.asList(data));
    }

    @Override
    public String get(int index) {
        return data.get(index);
    }

    @Override
    public int size() {
        return data.size();
    }

    /**
     * Checks whether the given POS satisfies this pattern.
     *
     * @param pos
     *            fully specified part of speech
     * @return true if every non-null component of this pattern is equal to the
     *         component of {@code pos} at the same level
     */
    boolean matches(POS pos) {
        for (int level = 0; level < data.size(); ++level) {
            String s = data.get(level);
            if (s == null) {
                // wildcard: anything is accepted at this level
                continue;
            }
            if (!s.equals(pos.get(level))) {
                return false;
            }
        }
        return true;
    }

    @Override
    public String toString() {
        return String.join(",", data);
    }

    /**
     * Factory-method equivalent of {@link #PartialPOS(String...)}.
     *
     * @param parts
     *            pattern components, {@code null} entries are wildcards
     * @return new pattern object
     */
    public static PartialPOS of(String... parts) {
        return new PartialPOS(parts);
    }
}
155 changes: 155 additions & 0 deletions src/main/java/com/worksap/nlp/sudachi/PosMatcher.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
/*
* Copyright (c) 2022 Works Applications Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.worksap.nlp.sudachi;

import com.worksap.nlp.sudachi.dictionary.POS;

import java.util.BitSet;
import java.util.Iterator;
import java.util.NoSuchElementException;
import java.util.function.Predicate;
import java.util.stream.IntStream;

/**
 * A set of part-of-speech ids of a single dictionary which can be used to
 * quickly check whether a morpheme has one of the selected parts of speech.
 * Instances should be created by the factory methods of {@link Dictionary}.
 *
 * @see Dictionary#posMatcher(Predicate)
 * @see Dictionary#posMatcher(Iterable)
 */
public class PosMatcher implements Predicate<Morpheme>, Iterable<POS> {
    private final BitSet matching;
    private final JapaneseDictionary dictionary;

    /**
     * Low-level constructor which builds a matcher from raw POS ids. Prefer the
     * factory methods of {@link Dictionary}.
     *
     * @param ids
     *            POS ids which should be matched
     * @param dictionary
     *            dictionary the ids belong to
     */
    public PosMatcher(int[] ids, JapaneseDictionary dictionary) {
        this(bitsOf(ids), dictionary);
    }

    private PosMatcher(BitSet data, JapaneseDictionary dictionary) {
        this.matching = data;
        this.dictionary = dictionary;
    }

    /** Turns an array of ids into a bit set with exactly those bits raised. */
    private static BitSet bitsOf(int[] ids) {
        BitSet flags = new BitSet();
        for (int i = 0; i < ids.length; ++i) {
            flags.set(ids[i]);
        }
        return flags;
    }

    /**
     * Combines two matchers into one which accepts a POS when either of them
     * accepts it.
     *
     * @param other
     *            matcher to combine with, must use the same dictionary
     * @return PosMatcher which matches union of POS tags
     * @throws IllegalArgumentException
     *             if the matchers were created from different dictionaries
     */
    public PosMatcher union(PosMatcher other) {
        checkCompatibility(other);
        BitSet either = (BitSet) matching.clone();
        either.or(other.matching);
        return new PosMatcher(either, dictionary);
    }

    /**
     * Combines two matchers into one which accepts a POS only when both of them
     * accept it.
     *
     * @param other
     *            matcher to combine with, must use the same dictionary
     * @return PosMatcher which matches intersection of POS tags
     * @throws IllegalArgumentException
     *             if the matchers were created from different dictionaries
     */
    public PosMatcher intersection(PosMatcher other) {
        checkCompatibility(other);
        BitSet both = (BitSet) matching.clone();
        both.and(other.matching);
        return new PosMatcher(both, dictionary);
    }

    /**
     * Creates the complementary matcher: it accepts every POS of the dictionary
     * which this matcher rejects.
     *
     * @return PosMatcher which is inverse of this
     */
    public PosMatcher invert() {
        // Walk over the complete id range of the dictionary instead of flipping
        // the bit set itself: the bit set may be shorter than the number of ids.
        int total = dictionary.getPartOfSpeechSize();
        BitSet rejected = new BitSet();
        for (int id = 0; id < total; ++id) {
            if (!matching.get(id)) {
                rejected.set(id);
            }
        }
        return new PosMatcher(rejected, dictionary);
    }

    /** Rejects matchers which were built from a different dictionary instance. */
    private void checkCompatibility(PosMatcher other) {
        if (dictionary == other.dictionary) {
            return;
        }
        throw new IllegalArgumentException("PosMatchers are using different dictionaries");
    }

    /**
     * Checks whether the part of speech of the {@link Morpheme} is one of those
     * selected by this matcher. The morpheme must have been produced by the same
     * {@link Dictionary} that created this {@link PosMatcher}; passing a
     * morpheme of another dictionary is incorrect.
     *
     * When assertions are enabled, this method verifies that the morpheme and
     * the matcher share the same grammar.
     *
     * @param morpheme
     *            the input argument
     * @return true if morpheme matches the current configuration, false otherwise
     */
    @Override
    public boolean test(Morpheme morpheme) {
        assert ((MorphemeImpl) morpheme).list.grammar == dictionary.grammar;
        int posId = morpheme.partOfSpeechId();
        return matching.get(posId);
    }

    /**
     * Iterates, in ascending id order, over the POS tags accepted by this
     * matcher.
     *
     * @return Iterator for POS tags
     */
    @Override
    public Iterator<POS> iterator() {
        return new Iterator<POS>() {
            // id of the POS to be returned next, negative when exhausted
            private int cursor = matching.nextSetBit(0);

            @Override
            public boolean hasNext() {
                return cursor >= 0;
            }

            @Override
            public POS next() {
                if (!hasNext()) {
                    throw new NoSuchElementException();
                }
                POS current = dictionary.getGrammar().getPartOfSpeechString((short) cursor);
                // advance only after the lookup has succeeded
                cursor = matching.nextSetBit(cursor + 1);
                return current;
            }
        };
    }
}
Loading

0 comments on commit b906e97

Please sign in to comment.