From df21c83d6f5be7ea0bf8083702080eb112107b6c Mon Sep 17 00:00:00 2001
From: lv <992526373@qq.com>
Date: Sun, 28 Apr 2024 16:29:35 +0800
Subject: [PATCH 1/2] 'fixed:jieba'

---
 rasa/nlu/tokenizers/jieba_tokenizer.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/rasa/nlu/tokenizers/jieba_tokenizer.py b/rasa/nlu/tokenizers/jieba_tokenizer.py
index e0631c19f3c..d6fc90ac566 100644
--- a/rasa/nlu/tokenizers/jieba_tokenizer.py
+++ b/rasa/nlu/tokenizers/jieba_tokenizer.py
@@ -101,7 +101,15 @@ def tokenize(self, message: Message, attribute: Text) -> List[Token]:
         text = message.get(attribute)
 
         tokenized = jieba.tokenize(text)
-        tokens = [Token(word, start) for (word, start, end) in tokenized]
+        tokens = []
+        current_position = 0
+        for word, start, end in tokenized:
+            if word.strip() == "":
+                continue
+            word_start = text.find(word, current_position)
+            word_end = word_start + len(word)
+            tokens.append(Token(word, word_start, word_end))
+            current_position = word_end
 
         return self._apply_token_pattern(tokens)
 

From 2805e5d49eb99db4b327ec4de20955d2aefee037 Mon Sep 17 00:00:00 2001
From: lv <992526373@qq.com>
Date: Sun, 28 Apr 2024 17:12:13 +0800
Subject: [PATCH 2/2] fix: add tests for whitespace token issue in JiebaTokenizer

---
 tests/nlu/tokenizers/test_jieba_tokenizer.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tests/nlu/tokenizers/test_jieba_tokenizer.py b/tests/nlu/tokenizers/test_jieba_tokenizer.py
index c0628f901a8..f2d93471d84 100644
--- a/tests/nlu/tokenizers/test_jieba_tokenizer.py
+++ b/tests/nlu/tokenizers/test_jieba_tokenizer.py
@@ -37,6 +37,11 @@ def create_jieba(config: Optional[Dict] = None) -> JiebaTokenizer:
             ["Micheal", "你好", "吗", "?"],
             [(0, 7), (7, 9), (9, 10), (10, 11)],
        ),
+        (
+            "安装 rasa 应用",
+            ["安装", "rasa", "应用"],
+            [(0, 2), (3, 7), (8, 10)],
+        ),
     ],
 )
 def test_jieba(text, expected_tokens, expected_indices):
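
Note for reviewers: below is a minimal standalone sketch of what the new tokenize loop in PATCH 1/2 does, with a hand-written token list standing in for real jieba.tokenize output (the exact jieba segmentation shown is an assumption, not captured from the library). It drops pure-whitespace tokens and recomputes (start, end) offsets against the original text, reproducing the indices asserted by the new test case in PATCH 2/2.

# offset_sketch.py -- illustrative only, not part of the patch
def recompute_offsets(text, tokenized):
    # Same loop as the patched JiebaTokenizer.tokenize, but returning plain
    # tuples instead of rasa Token objects.
    tokens = []
    current_position = 0
    for word, start, end in tokenized:
        if word.strip() == "":
            # Skip pure-whitespace tokens instead of emitting them.
            continue
        # Re-anchor the token against the original text so offsets stay
        # consistent after whitespace tokens are dropped.
        word_start = text.find(word, current_position)
        word_end = word_start + len(word)
        tokens.append((word, word_start, word_end))
        current_position = word_end
    return tokens


if __name__ == "__main__":
    text = "安装 rasa 应用"
    # Assumed stand-in for jieba.tokenize(text): whitespace comes back as
    # its own tokens.
    tokenized = [("安装", 0, 2), (" ", 2, 3), ("rasa", 3, 7), (" ", 7, 8), ("应用", 8, 10)]
    result = recompute_offsets(text, tokenized)
    # Matches the indices asserted by the new test case: [(0, 2), (3, 7), (8, 10)]
    assert result == [("安装", 0, 2), ("rasa", 3, 7), ("应用", 8, 10)]
    print(result)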