Skip to content

Commit

Permalink
do not segfault on tokenizing with closed dictionary (#217)
Browse files Browse the repository at this point in the history
  • Loading branch information
eiennohito committed Sep 7, 2023
1 parent 8a06b2f commit c0ee006
Show file tree
Hide file tree
Showing 8 changed files with 71 additions and 0 deletions.
5 changes: 5 additions & 0 deletions src/main/java/com/worksap/nlp/sudachi/JapaneseDictionary.java
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,9 @@ void setupCharacterDefinition(Config config) throws IOException {

@Override
public void close() throws IOException {
grammar.invalidate();
grammar = null;
lexicon.invalidate();
lexicon = null;
for (BinaryDictionary dictionary : dictionaries) {
dictionary.close();
Expand All @@ -127,6 +129,9 @@ public void close() throws IOException {

@Override
public Tokenizer create() {
if (grammar == null || lexicon == null) {
throw new IllegalStateException("trying to use closed dictionary");
}
JapaneseTokenizer tokenizer = new JapaneseTokenizer(grammar, lexicon, inputTextPlugins, oovProviderPlugins,
pathRewritePlugins);
if (!allowEmptyMorpheme) {
Expand Down
8 changes: 8 additions & 0 deletions src/main/java/com/worksap/nlp/sudachi/JapaneseTokenizer.java
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,7 @@ UTF8InputText buildInputText(CharSequence text) {
}

MorphemeList tokenizeSentence(Tokenizer.SplitMode mode, UTF8InputText input) {
checkIfAlive();
buildLattice(input);

if (dumpOutput != null) {
Expand Down Expand Up @@ -286,4 +287,11 @@ JsonArrayBuilder pathToJson(List<LatticeNode> path, LatticeImpl lattice) {
void disableEmptyMorpheme() {
    // Turns the allowEmptyMorpheme flag off for this tokenizer instance.
    this.allowEmptyMorpheme = false;
}

/**
 * Verifies that both the lexicon and the grammar backing this tokenizer are still valid.
 *
 * @throws IllegalStateException
 *             if either the lexicon or the grammar reports itself as no longer valid
 */
void checkIfAlive() {
    // The lexicon is consulted first; the grammar is only queried when the lexicon is valid.
    boolean closed = !lexicon.isValid() || !grammar.isValid();
    if (closed) {
        throw new IllegalStateException("dictionary was closed prior to tokenization");
    }
}
}
2 changes: 2 additions & 0 deletions src/main/java/com/worksap/nlp/sudachi/dictionary/Grammar.java
Original file line number Diff line number Diff line change
Expand Up @@ -126,4 +126,6 @@ public interface Grammar {
default Connection getConnection() {
    // Default behaviour: the operation is unsupported unless an implementation overrides it.
    throw new UnsupportedOperationException();
}

/**
 * Reports whether this grammar is still usable.
 *
 * @return {@code true} if the grammar can be used, {@code false} once it has been
 *         invalidated
 */
boolean isValid();
}
Original file line number Diff line number Diff line change
Expand Up @@ -164,4 +164,13 @@ public short registerPOS(POS pos) {
}
return (short) i;
}

/**
 * Reports whether this grammar still holds its {@code matrix}.
 *
 * @return {@code true} while {@code matrix} is non-null, {@code false} once it has been
 *         cleared
 */
@Override
public boolean isValid() {
    final boolean cleared = matrix == null;
    return !cleared;
}

/**
 * Marks this grammar as unusable by dropping the reference held in {@code matrix};
 * {@link #isValid()} returns {@code false} afterwards.
 */
public void invalidate() {
    this.matrix = null;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -168,4 +168,12 @@ private void convertSplit(int[] split, int dictionaryId) {
public WordLookup makeLookup() {
    // Hands the current lexicons reference to a freshly constructed lookup.
    final WordLookup lookup = new WordLookup(lexicons);
    return lookup;
}

/**
 * Reports whether this set still holds its {@code lexicons}.
 *
 * @return {@code true} while {@code lexicons} is non-null, {@code false} once it has been
 *         cleared
 */
public boolean isValid() {
    final boolean released = this.lexicons == null;
    return !released;
}

/**
 * Marks this set as unusable by dropping the reference held in {@code lexicons};
 * {@link #isValid()} returns {@code false} afterwards.
 */
public void invalidate() {
    this.lexicons = null;
}
}
29 changes: 29 additions & 0 deletions src/test/java/com/worksap/nlp/sudachi/JapaneseDictionaryTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -67,4 +67,33 @@ public void instantiateConfigWithoutCharDef() throws IOException {
assertThat(jd.create(), notNullValue());
}
}

/**
 * Deliberately misuses try-with-resources: the dictionary is returned from inside the
 * block, so the caller receives an instance whose {@code close()} has already run.
 *
 * @return a dictionary that was closed before being handed back
 * @throws IOException
 *             declared because configuration loading, creation or closing may raise it
 */
private JapaneseDictionary makeDictionaryIncorrectly() throws IOException {
    Config config = Config.fromClasspath("sudachi_minimum.json");
    config.systemDictionary(TestDictionary.INSTANCE.getSystemDict());
    DictionaryFactory factory = new DictionaryFactory();
    try (JapaneseDictionary closedOnExit = (JapaneseDictionary) factory.create(config)) {
        return closedOnExit;
    }
}

/**
 * Asking an already-closed dictionary for a tokenizer is expected to fail with
 * {@link IllegalStateException}.
 */
@Test(expected = IllegalStateException.class)
public void throwExceptionOnDictionaryUsageAfterClose() throws IOException {
    JapaneseDictionary closedDictionary = makeDictionaryIncorrectly();
    // The returned tokenizer is irrelevant; the call itself is expected to throw.
    closedDictionary.create();
}

/**
 * Deliberately misuses try-with-resources: a tokenizer is created from a dictionary that
 * is closed as soon as this method returns.
 *
 * @return a tokenizer whose originating dictionary has already been closed
 * @throws IOException
 *             declared because configuration loading, creation or closing may raise it
 */
private Tokenizer makeTokenizerIncorrectly() throws IOException {
    Config config = Config.fromClasspath("sudachi_minimum.json");
    config.systemDictionary(TestDictionary.INSTANCE.getSystemDict());
    DictionaryFactory factory = new DictionaryFactory();
    try (JapaneseDictionary shortLived = (JapaneseDictionary) factory.create(config)) {
        return shortLived.create();
    }
}

/**
 * Tokenizing with a tokenizer whose dictionary was already closed is expected to fail
 * with {@link IllegalStateException}.
 */
@Test(expected = IllegalStateException.class)
public void throwExceptionOnTokenizerUsageAfterClose() throws IOException {
    makeTokenizerIncorrectly().tokenize("a");
}

}
5 changes: 5 additions & 0 deletions src/test/java/com/worksap/nlp/sudachi/MockGrammar.java
Original file line number Diff line number Diff line change
Expand Up @@ -82,4 +82,9 @@ public static CharacterCategory defaultCharCategory() {
@Override
public void setCharacterCategory(CharacterCategory charCategory) {
    // No-op: this mock does not store the supplied character category.
}

@Override
public boolean isValid() {
    // This mock always reports itself as valid.
    return true;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -214,5 +214,10 @@ public CharacterCategory getCharacterCategory() {
@Override
public void setCharacterCategory(CharacterCategory charCategory) {
    // No-op: this implementation does not store the supplied character category.
}

@Override
public boolean isValid() {
    // This implementation always reports itself as valid.
    return true;
}
}
}

0 comments on commit c0ee006

Please sign in to comment.