From 120c287edbd9ff6a712525b3cf98fdeece6343c3 Mon Sep 17 00:00:00 2001 From: Guillaume Klein Date: Wed, 12 Jun 2019 16:56:58 +0200 Subject: [PATCH] Respect "NoSubstitution" flag for combining marks applied on spaces (#74) * Respect "NoSubstitution" flag for combining marks applied on spaces * Fix style --- src/Tokenizer.cc | 2 +- test/test.cc | 22 ++++++++++++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/src/Tokenizer.cc b/src/Tokenizer.cc index 2b616d5a..7277eb22 100644 --- a/src/Tokenizer.cc +++ b/src/Tokenizer.cc @@ -629,7 +629,7 @@ namespace onmt token.join_left(); } - if (sub_c[0] == ' ') + if (sub_c[0] == ' ' && !_no_substitution) token.append(protected_character + "0020" + sub_c.substr(1)); else token.append(sub_c); diff --git a/test/test.cc b/test/test.cc index 58815904..067c72b2 100644 --- a/test/test.cc +++ b/test/test.cc @@ -24,6 +24,22 @@ static void test_tok(ITokenizer& tokenizer, } } +static void test_tok(ITokenizer& tokenizer, + const std::string& in, + const std::vector<std::string>& expected, + bool detokenize = false) { + std::vector<std::string> tokens; + tokenizer.tokenize(in, tokens); + ASSERT_EQ(tokens.size(), expected.size()); + for (size_t i = 0; i < tokens.size(); ++i) { + EXPECT_EQ(tokens[i], expected[i]) << "Unexpected token mismatch at index " << i; + } + if (detokenize) { + auto text = tokenizer.detokenize(tokens); + EXPECT_EQ(text, in); + } +} + static void test_detok(ITokenizer& tokenizer, const std::string& in, const std::string& expected) { std::vector<std::string> tokens; onmt::SpaceTokenizer::get_instance().tokenize(in, tokens); @@ -262,6 +278,12 @@ TEST(TokenizerTest, MarkOnSpace) { "b %0020̇ c"); } +TEST(TokenizerTest, MarkOnSpaceNoSubstitution) { + Tokenizer tokenizer(Tokenizer::Mode::Conservative, + Tokenizer::Flags::JoinerAnnotate | Tokenizer::Flags::NoSubstitution); + test_tok(tokenizer, "angles ၧ1 and ၧ2", {"angles", "■ ၧ■", "1", "and", "■ ၧ■", "2"}, true); +} + TEST(TokenizerTest, CaseFeature) { Tokenizer 
tokenizer(Tokenizer::Mode::Conservative, Tokenizer::Flags::CaseFeature | Tokenizer::Flags::JoinerAnnotate);