Skip to content

Commit

Permalink
Respect "NoSubstitution" flag for combining marks applied on spaces (#74)
Browse files Browse the repository at this point in the history

* Respect "NoSubstitution" flag for combining marks applied on spaces

* Fix style
  • Loading branch information
guillaumekln committed Jun 12, 2019
1 parent 3025d9e commit 120c287
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 1 deletion.
2 changes: 1 addition & 1 deletion src/Tokenizer.cc
Expand Up @@ -629,7 +629,7 @@ namespace onmt
token.join_left();
}

if (sub_c[0] == ' ')
if (sub_c[0] == ' ' && !_no_substitution)
token.append(protected_character + "0020" + sub_c.substr(1));
else
token.append(sub_c);
Expand Down
22 changes: 22 additions & 0 deletions test/test.cc
Expand Up @@ -24,6 +24,22 @@ static void test_tok(ITokenizer& tokenizer,
}
}

// Tokenizes `in` with `tokenizer` and compares the produced tokens against
// `expected`, one element at a time. When `detokenize` is true, it also checks
// that detokenizing the produced tokens yields `in` again.
static void test_tok(ITokenizer& tokenizer,
                     const std::string& in,
                     const std::vector<std::string>& expected,
                     bool detokenize = false) {
  std::vector<std::string> tokens;
  tokenizer.tokenize(in, tokens);

  // ASSERT_EQ is a fatal googletest assertion: on a size mismatch the function
  // returns here, so the indexed comparison below stays within `expected`.
  ASSERT_EQ(tokens.size(), expected.size());

  size_t i = 0;
  while (i < tokens.size()) {
    EXPECT_EQ(tokens[i], expected[i]) << "Unexpected token mismatch at index " << i;
    ++i;
  }

  // The round-trip check is opt-in.
  if (!detokenize)
    return;

  const auto text = tokenizer.detokenize(tokens);
  EXPECT_EQ(text, in);
}

static void test_detok(ITokenizer& tokenizer, const std::string& in, const std::string& expected) {
std::vector<std::string> tokens;
onmt::SpaceTokenizer::get_instance().tokenize(in, tokens);
Expand Down Expand Up @@ -262,6 +278,12 @@ TEST(TokenizerTest, MarkOnSpace) {
"b %0020̇ c");
}

// Checks that with the NoSubstitution flag set, a space carrying a combining
// mark stays a literal space in the expected tokens, and that those tokens
// detokenize back to the original input.
TEST(TokenizerTest, MarkOnSpaceNoSubstitution) {
  const auto flags = Tokenizer::Flags::JoinerAnnotate | Tokenizer::Flags::NoSubstitution;
  Tokenizer tokenizer(Tokenizer::Mode::Conservative, flags);

  const std::string input = "angles ၧ1 and ၧ2";
  const std::vector<std::string> expected = {"angles", "■ ၧ■", "1", "and", "■ ၧ■", "2"};

  test_tok(tokenizer, input, expected, /*detokenize=*/true);
}

TEST(TokenizerTest, CaseFeature) {
Tokenizer tokenizer(Tokenizer::Mode::Conservative,
Tokenizer::Flags::CaseFeature | Tokenizer::Flags::JoinerAnnotate);
Expand Down

0 comments on commit 120c287

Please sign in to comment.