Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add support for user-defined POS for all OOV handlers #186

Merged
merged 3 commits into from
Jun 7, 2022
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ public void setUp(Grammar grammar) throws IOException {
Config.Resource<Object> charDef = settings.getResource("charDef");
readCharacterProperty(charDef);
Config.Resource<Object> unkDef = settings.getResource("unkDef");
readOOV(unkDef, grammar, settings.getString("userPos", "forbid"));
readOOV(unkDef, grammar, settings.getString(USER_POS, USER_POS_FORBID));
}

@Override
Expand Down Expand Up @@ -168,7 +168,7 @@ <T> void readCharacterProperty(Config.Resource<T> charDef) throws IOException {
}
}

<T> void readOOV(Config.Resource<T> unkDef, Grammar grammar, String userPosType) throws IOException {
<T> void readOOV(Config.Resource<T> unkDef, Grammar grammar, String userPosMode) throws IOException {
if (unkDef == null) {
unkDef = settings.base.toResource(Paths.get("unk.def"));
}
Expand Down Expand Up @@ -196,7 +196,7 @@ <T> void readOOV(Config.Resource<T> unkDef, Grammar grammar, String userPosType)
oov.rightId = Short.parseShort(cols[2]);
oov.cost = Short.parseShort(cols[3]);
POS pos = new POS(cols[4], cols[5], cols[6], cols[7], cols[8], cols[9]);
oov.posId = posIdOf(grammar, pos, userPosType);
oov.posId = posIdOf(grammar, pos, userPosMode);

oovList.computeIfAbsent(type, t -> new ArrayList<>()).add(oov);
}
Expand Down
12 changes: 10 additions & 2 deletions src/main/java/com/worksap/nlp/sudachi/OovProviderPlugin.java
Original file line number Diff line number Diff line change
Expand Up @@ -103,18 +103,26 @@ protected LatticeNodeImpl createNode() {
return node;
}

/**
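* Recommended settings key for the user POS mode of an OOV provider plugin.
* The configured value is expected to be either "forbid" or "allow".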
*/
public static final String USER_POS = "userPos";
kazuma-t marked this conversation as resolved.
Show resolved Hide resolved

public static final String USER_POS_FORBID = "forbid";
public static final String USER_POS_ALLOW = "allow";

protected short posIdOf(Grammar grammar, POS pos, String userPosMode) {
short posIdPresent = grammar.getPartOfSpeechId(pos);
userPosMode = userPosMode.toLowerCase(Locale.ROOT);

if (Objects.equals(userPosMode, "forbid")) {
if (Objects.equals(userPosMode, USER_POS_FORBID)) {
if (posIdPresent >= 0) {
return posIdPresent;
}
throw new IllegalArgumentException(String.format(
"POS %s WAS NOT present in dictionary and OOV Plugin %s is forbidden to add new POS tags", pos,
this));
} else if (!Objects.equals(userPosMode, "allow")) {
} else if (!Objects.equals(userPosMode, USER_POS_ALLOW)) {
throw new IllegalArgumentException(
"Unknown user POS mode: " + userPosMode + " allowed values are: forbid, allow");
}
Expand Down
4 changes: 2 additions & 2 deletions src/main/java/com/worksap/nlp/sudachi/RegexOovProvider.java
Original file line number Diff line number Diff line change
Expand Up @@ -65,8 +65,8 @@ public class RegexOovProvider extends OovProviderPlugin {
public void setUp(Grammar grammar) throws IOException {
super.setUp(grammar);
POS stringPos = new POS(settings.getStringList("pos"));
String userPosType = settings.getString("userPos", "forbid");
posId = posIdOf(grammar, stringPos, userPosType);
String userPosMode = settings.getString(USER_POS, USER_POS_FORBID);
posId = posIdOf(grammar, stringPos, userPosMode);
if (posId == -1) {
throw new IllegalArgumentException("POS " + stringPos + " was not present in the dictionary");
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -59,8 +59,8 @@ public void setUp(Grammar grammar) {
leftId = (short) settings.getInt("leftId");
rightId = (short) settings.getInt("rightId");
cost = (short) settings.getInt("cost");
String userPosType = settings.getString("userPos", "forbid");
oovPOSId = posIdOf(grammar, pos, userPosType);
String userPosMode = settings.getString(USER_POS, USER_POS_FORBID);
oovPOSId = posIdOf(grammar, pos, userPosMode);
}

@Override
Expand Down
11 changes: 6 additions & 5 deletions src/test/java/com/worksap/nlp/sudachi/OovProviderPluginTest.kt
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

package com.worksap.nlp.sudachi

import com.worksap.nlp.sudachi.OovProviderPlugin.*
import com.worksap.nlp.sudachi.dictionary.Grammar
import com.worksap.nlp.sudachi.dictionary.POS
import kotlin.test.Test
Expand All @@ -35,7 +36,7 @@ class OovProviderPluginTest {
}

override fun setUp(grammar: Grammar?) {
val kind = settings.getString("posKind", "forbid")
val kind = settings.getString(USER_POS, USER_POS_FORBID)
val pos = POS(settings.getStringList("pos"))
posId = posIdOf(grammar, pos, kind)
}
Expand All @@ -58,7 +59,7 @@ class OovProviderPluginTest {
val cfg = TestDictionary.user0Cfg()
cfg.addOovProviderPlugin(FakeOovProvider::class.java)
.addList("pos", "名詞", "普通名詞", "一般", "*", "*", "new")
.add("posKind", "allow")
.add(USER_POS, USER_POS_ALLOW)
val inst = DictionaryFactory().create(cfg) as JapaneseDictionary
val plugin = assertIs<FakeOovProvider>(inst.oovProviderPlugins.last())
assertEquals(8, plugin.posId)
Expand All @@ -69,7 +70,7 @@ class OovProviderPluginTest {
val cfg = TestDictionary.user0Cfg()
cfg.addOovProviderPlugin(FakeOovProvider::class.java)
.addList("pos", "名詞", "普通名詞", "一般", "*", "*", "*")
.add("posKind", "test")
.add(USER_POS, "test")
assertFails { DictionaryFactory().create(cfg) }
}

Expand All @@ -86,10 +87,10 @@ class OovProviderPluginTest {
val cfg = TestDictionary.user0Cfg()
cfg.addOovProviderPlugin(FakeOovProvider::class.java)
.addList("pos", "名詞", "普通名詞", "一般", "*", "*", "new")
.add("posKind", "allow")
.add(USER_POS, USER_POS_ALLOW)
cfg.addOovProviderPlugin(FakeOovProvider::class.java)
.addList("pos", "名詞", "普通名詞", "一般", "*", "*", "new")
.add("posKind", "allow")
.add(USER_POS, USER_POS_ALLOW)
val inst = DictionaryFactory().create(cfg) as JapaneseDictionary
val oovPlugins = inst.oovProviderPlugins
val p1 = assertIs<FakeOovProvider>(oovPlugins[oovPlugins.size - 2])
Expand Down