do not segfault on tokenizing with closed dictionary (#217)

WorksApplications · Sep 7, 2023 · c0ee006 · c0ee006
1 parent 8a06b2f
commit c0ee006
Show file tree

Hide file tree

Showing 8 changed files with 71 additions and 0 deletions.
diff --git a/src/main/java/com/worksap/nlp/sudachi/JapaneseDictionary.java b/src/main/java/com/worksap/nlp/sudachi/JapaneseDictionary.java
@@ -118,7 +118,9 @@ void setupCharacterDefinition(Config config) throws IOException {
 
     @Override
     public void close() throws IOException {
+        grammar.invalidate();
         grammar = null;
+        lexicon.invalidate();
         lexicon = null;
         for (BinaryDictionary dictionary : dictionaries) {
             dictionary.close();
@@ -127,6 +129,9 @@ public void close() throws IOException {
 
     @Override
     public Tokenizer create() {
+        if (grammar == null || lexicon == null) {
+            throw new IllegalStateException("trying to use closed dictionary");
+        }
         JapaneseTokenizer tokenizer = new JapaneseTokenizer(grammar, lexicon, inputTextPlugins, oovProviderPlugins,
                 pathRewritePlugins);
         if (!allowEmptyMorpheme) {

diff --git a/src/main/java/com/worksap/nlp/sudachi/JapaneseTokenizer.java b/src/main/java/com/worksap/nlp/sudachi/JapaneseTokenizer.java
@@ -155,6 +155,7 @@ UTF8InputText buildInputText(CharSequence text) {
     }
 
     MorphemeList tokenizeSentence(Tokenizer.SplitMode mode, UTF8InputText input) {
+        checkIfAlive();
         buildLattice(input);
 
         if (dumpOutput != null) {
@@ -286,4 +287,11 @@ JsonArrayBuilder pathToJson(List<LatticeNode> path, LatticeImpl lattice) {
     void disableEmptyMorpheme() {
         allowEmptyMorpheme = false;
     }
+
+    /**
+     * Guards tokenization against a dictionary that has already been closed.
+     *
+     * @throws IllegalStateException
+     *             if the lexicon or the grammar no longer reports itself as valid
+     */
+    void checkIfAlive() {
+        // Same short-circuit order as before: the grammar is consulted only when the lexicon is still valid.
+        boolean closed = !lexicon.isValid() || !grammar.isValid();
+        if (closed) {
+            throw new IllegalStateException("dictionary was closed prior to tokenization");
+        }
+    }
 }
diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/Grammar.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/Grammar.java
@@ -126,4 +126,6 @@ public interface Grammar {
     default Connection getConnection() {
         throw new UnsupportedOperationException();
     }
+
+    /**
+     * Reports whether this grammar can still be used.
+     *
+     * <p>
+     * This is a default method so that {@code Grammar} implementations written before it was introduced keep
+     * compiling; such implementations are treated as always valid. Implementations that can be invalidated should
+     * override it.
+     *
+     * @return {@code true} if the grammar is usable; the default implementation always returns {@code true}
+     */
+    default boolean isValid() {
+        return true;
+    }
 }
diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/GrammarImpl.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/GrammarImpl.java
@@ -164,4 +164,13 @@ public short registerPOS(POS pos) {
         }
         return (short) i;
     }
+
+    /**
+     * Tells whether this grammar is still usable.
+     *
+     * @return {@code true} while {@code matrix} is set, {@code false} once it has been cleared
+     */
+    @Override
+    public boolean isValid() {
+        final boolean hasMatrix = matrix != null;
+        return hasMatrix;
+    }
+
+    /**
+     * Marks this grammar as unusable by clearing its {@code matrix} reference; afterwards {@link #isValid()} returns
+     * {@code false}.
+     */
+    public void invalidate() {
+        this.matrix = null;
+    }
 }
diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/LexiconSet.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/LexiconSet.java
@@ -168,4 +168,12 @@ private void convertSplit(int[] split, int dictionaryId) {
     public WordLookup makeLookup() {
         return new WordLookup(this.lexicons);
     }
+
+    /**
+     * Tells whether this lexicon set is still usable.
+     *
+     * @return {@code true} while {@code lexicons} is set, {@code false} once it has been cleared
+     */
+    public boolean isValid() {
+        final boolean hasLexicons = this.lexicons != null;
+        return hasLexicons;
+    }
+
+    /**
+     * Marks this lexicon set as unusable by clearing its {@code lexicons} reference; afterwards {@link #isValid()}
+     * returns {@code false}.
+     */
+    public void invalidate() {
+        this.lexicons = null;
+    }
 }
diff --git a/src/test/java/com/worksap/nlp/sudachi/JapaneseDictionaryTest.java b/src/test/java/com/worksap/nlp/sudachi/JapaneseDictionaryTest.java
@@ -67,4 +67,33 @@ public void instantiateConfigWithoutCharDef() throws IOException {
             assertThat(jd.create(), notNullValue());
         }
     }
+
+    /**
+     * Builds a dictionary inside try-with-resources and returns it from within that block, so the caller receives an
+     * instance on which {@code close()} has already been invoked.
+     */
+    private JapaneseDictionary makeDictionaryIncorrectly() throws IOException {
+        Config config = Config.fromClasspath("sudachi_minimum.json");
+        config.systemDictionary(TestDictionary.INSTANCE.getSystemDict());
+        DictionaryFactory factory = new DictionaryFactory();
+        try (JapaneseDictionary closedOnReturn = (JapaneseDictionary) factory.create(config)) {
+            return closedOnReturn;
+        }
+    }
+
+    /** Creating a tokenizer from an already-closed dictionary must fail with {@link IllegalStateException}. */
+    @Test(expected = IllegalStateException.class)
+    public void throwExceptionOnDictionaryUsageAfterClose() throws IOException {
+        JapaneseDictionary closedDictionary = makeDictionaryIncorrectly();
+        // The returned tokenizer is irrelevant; the call itself is expected to throw.
+        closedDictionary.create();
+    }
+
+    /**
+     * Creates a tokenizer from a dictionary managed by try-with-resources and returns it from within that block, so
+     * the dictionary is closed by the time the caller gets the tokenizer.
+     */
+    private Tokenizer makeTokenizerIncorrectly() throws IOException {
+        Config config = Config.fromClasspath("sudachi_minimum.json");
+        config.systemDictionary(TestDictionary.INSTANCE.getSystemDict());
+        DictionaryFactory factory = new DictionaryFactory();
+        try (JapaneseDictionary shortLived = (JapaneseDictionary) factory.create(config)) {
+            return shortLived.create();
+        }
+    }
+
+    /** Tokenizing with a tokenizer whose dictionary was closed must fail with {@link IllegalStateException}. */
+    @Test(expected = IllegalStateException.class)
+    public void throwExceptionOnTokenizerUsageAfterClose() throws IOException {
+        Tokenizer orphanedTokenizer = makeTokenizerIncorrectly();
+        orphanedTokenizer.tokenize("a");
+    }
+
 }
diff --git a/src/test/java/com/worksap/nlp/sudachi/MockGrammar.java b/src/test/java/com/worksap/nlp/sudachi/MockGrammar.java
@@ -82,4 +82,9 @@ public static CharacterCategory defaultCharCategory() {
     @Override
     public void setCharacterCategory(CharacterCategory charCategory) {
     }
+
+    @Override
+    public boolean isValid() {
+        return true; // unconditionally valid, so liveness checks that consult isValid() never reject this grammar
+    }
 }
diff --git a/src/test/java/com/worksap/nlp/sudachi/ProlongedSoundMarkInputTextPluginTest.java b/src/test/java/com/worksap/nlp/sudachi/ProlongedSoundMarkInputTextPluginTest.java
@@ -214,5 +214,10 @@ public CharacterCategory getCharacterCategory() {
         @Override
         public void setCharacterCategory(CharacterCategory charCategory) {
         }
+
+        @Override
+        public boolean isValid() {
+            return true; // unconditionally valid, so liveness checks that consult isValid() never reject it
+        }
     }
 }