From ea6dd7252fd80f4f2d9c2078190de9094dbee4b5 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Fri, 26 Jun 2020 15:12:16 +0200 Subject: [PATCH 01/11] add token pattern to tokenizers --- rasa/nlu/tokenizers/convert_tokenizer.py | 2 + rasa/nlu/tokenizers/jieba_tokenizer.py | 2 + rasa/nlu/tokenizers/lm_tokenizer.py | 7 ++- rasa/nlu/tokenizers/mitie_tokenizer.py | 2 + rasa/nlu/tokenizers/spacy_tokenizer.py | 2 + rasa/nlu/tokenizers/tokenizer.py | 32 +++++++++++++ rasa/nlu/tokenizers/whitespace_tokenizer.py | 3 +- tests/nlu/tokenizers/test_tokenizer.py | 53 +++++++++++++++++++++ 8 files changed, 98 insertions(+), 5 deletions(-) diff --git a/rasa/nlu/tokenizers/convert_tokenizer.py b/rasa/nlu/tokenizers/convert_tokenizer.py index e4b63e60a6f8..0de0ee01b9be 100644 --- a/rasa/nlu/tokenizers/convert_tokenizer.py +++ b/rasa/nlu/tokenizers/convert_tokenizer.py @@ -26,6 +26,8 @@ class ConveRTTokenizer(WhitespaceTokenizer): "intent_tokenization_flag": False, # Symbol on which intent should be split "intent_split_symbol": "_", + # Pattern to further split identified tokens + "token_pattern": None, # Text will be tokenized with case sensitive as default "case_sensitive": True, } diff --git a/rasa/nlu/tokenizers/jieba_tokenizer.py b/rasa/nlu/tokenizers/jieba_tokenizer.py index 59dd9425a404..4974361fe786 100644 --- a/rasa/nlu/tokenizers/jieba_tokenizer.py +++ b/rasa/nlu/tokenizers/jieba_tokenizer.py @@ -28,6 +28,8 @@ class JiebaTokenizer(Tokenizer): "intent_tokenization_flag": False, # Symbol on which intent should be split "intent_split_symbol": "_", + # Pattern to further split identified tokens + "token_pattern": None, } # default don't load custom dictionary def __init__(self, component_config: Dict[Text, Any] = None) -> None: diff --git a/rasa/nlu/tokenizers/lm_tokenizer.py b/rasa/nlu/tokenizers/lm_tokenizer.py index 56ac683ddf60..ca338b9af849 100644 --- a/rasa/nlu/tokenizers/lm_tokenizer.py +++ b/rasa/nlu/tokenizers/lm_tokenizer.py @@ -5,10 +5,7 @@ from rasa.nlu.utils.hugging_face.hf_transformers import HFTransformersNLP from rasa.nlu.training_data import Message -from rasa.nlu.constants import ( - LANGUAGE_MODEL_DOCS, - TOKENS, -) +from rasa.nlu.constants import LANGUAGE_MODEL_DOCS, TOKENS class LanguageModelTokenizer(Tokenizer): @@ -27,6 +24,8 @@ def required_components(cls) -> List[Type[Component]]: "intent_tokenization_flag": False, # Symbol on which intent should be split "intent_split_symbol": "_", + # Pattern to further split identified tokens + "token_pattern": None, } def get_doc(self, message: Message, attribute: Text) -> Dict[Text, Any]: diff --git a/rasa/nlu/tokenizers/mitie_tokenizer.py b/rasa/nlu/tokenizers/mitie_tokenizer.py index 054e3225fb10..d7e2cb6ee779 100644 --- a/rasa/nlu/tokenizers/mitie_tokenizer.py +++ b/rasa/nlu/tokenizers/mitie_tokenizer.py @@ -14,6 +14,8 @@ class MitieTokenizer(Tokenizer): "intent_tokenization_flag": False, # Symbol on which intent should be split "intent_split_symbol": "_", + # Pattern to further split identified tokens + "token_pattern": None, } @classmethod diff --git a/rasa/nlu/tokenizers/spacy_tokenizer.py b/rasa/nlu/tokenizers/spacy_tokenizer.py index b3ab4cdc6b64..1aac12660342 100644 --- a/rasa/nlu/tokenizers/spacy_tokenizer.py +++ b/rasa/nlu/tokenizers/spacy_tokenizer.py @@ -25,6 +25,8 @@ def required_components(cls) -> List[Type[Component]]: "intent_tokenization_flag": False, # Symbol on which intent should be split "intent_split_symbol": "_", + # Pattern to further split identified tokens + "token_pattern": None, } def get_doc(self, message: Message, attribute: Text) -> "Doc": diff --git a/rasa/nlu/tokenizers/tokenizer.py b/rasa/nlu/tokenizers/tokenizer.py index a1e8ccc13e17..63a34be07f37 100644 --- a/rasa/nlu/tokenizers/tokenizer.py +++ b/rasa/nlu/tokenizers/tokenizer.py @@ -1,4 +1,5 @@ import logging +import re from typing import Text, List, Optional, Dict, Any @@ -65,6 +66,8 @@ def __init__(self, component_config: Dict[Text, Any] = None) -> None: ) # split symbol for intents self.intent_split_symbol = self.component_config.get("intent_split_symbol", "_") + # token pattern to further split tokens + self.token_pattern = self.component_config.get("token_pattern", None) def tokenize(self, message: Message, attribute: Text) -> List[Token]: """Tokenizes the text of the provided attribute of the incoming message.""" @@ -86,12 +89,14 @@ def train( tokens = self._split_intent(example) else: tokens = self.tokenize(example, attribute) + tokens = self.apply_token_pattern(tokens) example.set(TOKENS_NAMES[attribute], tokens) def process(self, message: Message, **kwargs: Any) -> None: """Tokenize the incoming message.""" tokens = self.tokenize(message, TEXT) + tokens = self.apply_token_pattern(tokens) message.set(TOKENS_NAMES[TEXT], tokens) def _split_intent(self, message: Message): @@ -105,6 +110,33 @@ def _split_intent(self, message: Message): return self._convert_words_to_tokens(words, text) + def apply_token_pattern(self, tokens: List[Token]) -> List[Token]: + """Apply the token pattern to the given tokens. + + Args: + tokens: list of tokens to split + + Returns: + List of tokens. + """ + if not self.token_pattern: + return tokens + + token_pattern = re.compile(self.token_pattern) + + final_tokens = [] + for token in tokens: + new_tokens = token_pattern.findall(token.text) + new_tokens = [t for t in new_tokens if t] + running_offset = 0 + for new_token in new_tokens: + word_offset = token.text.index(new_token, running_offset) + word_len = len(new_token) + running_offset = word_offset + word_len + final_tokens.append(Token(new_token, token.start + word_offset)) + + return final_tokens + @staticmethod def _convert_words_to_tokens(words: List[Text], text: Text) -> List[Token]: running_offset = 0 diff --git a/rasa/nlu/tokenizers/whitespace_tokenizer.py b/rasa/nlu/tokenizers/whitespace_tokenizer.py index 85ad4d07bf0d..f1bcff88b9e3 100644 --- a/rasa/nlu/tokenizers/whitespace_tokenizer.py +++ b/rasa/nlu/tokenizers/whitespace_tokenizer.py @@ -3,7 +3,6 @@ from rasa.nlu.tokenizers.tokenizer import Token, Tokenizer from rasa.nlu.training_data import Message -from rasa.nlu.constants import TOKENS_NAMES, MESSAGE_ATTRIBUTES class WhitespaceTokenizer(Tokenizer): @@ -13,6 +12,8 @@ class WhitespaceTokenizer(Tokenizer): "intent_tokenization_flag": False, # Symbol on which intent should be split "intent_split_symbol": "_", + # Pattern to further split identified tokens + "token_pattern": None, # Text will be tokenized with case sensitive as default "case_sensitive": True, } diff --git a/tests/nlu/tokenizers/test_tokenizer.py b/tests/nlu/tokenizers/test_tokenizer.py index b6d6de5a14a0..995eebf00562 100644 --- a/tests/nlu/tokenizers/test_tokenizer.py +++ b/tests/nlu/tokenizers/test_tokenizer.py @@ -1,5 +1,8 @@ +from typing import List, Text + import pytest +from nlu.tokenizers.tokenizer import Token from rasa.nlu.constants import TEXT, INTENT, RESPONSE, TOKENS_NAMES from rasa.nlu.training_data import Message, TrainingData from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer @@ -83,3 +86,53 @@ def test_split_intent(text, expected_tokens): message.set(INTENT, text) assert [t.text for t in tk._split_intent(message)] == expected_tokens + + +@pytest.mark.parametrize( + "token_pattern, tokens, expected_tokens", + [ + ( + None, + [Token("hello", 0), Token("there", 6)], + [Token("hello", 0), Token("there", 6)], + ), + ( + "", + [Token("hello", 0), Token("there", 6)], + [Token("hello", 0), Token("there", 6)], + ), + ( + r"(?u)\b\w\w+\b", + [Token("role-based", 0), Token("access-control", 11)], + [ + Token("role", 0), + Token("based", 5), + Token("access", 11), + Token("control", 18), + ], + ), + ( + r".*", + [Token("role-based", 0), Token("access-control", 11)], + [Token("role-based", 0), Token("access-control", 11)], + ), + ], +) +def test_apply_token_pattern( + token_pattern: Text, tokens: List[Token], expected_tokens: List[Token] +): + component_config = {"token_pattern": token_pattern} + + tokenizer = WhitespaceTokenizer(component_config) + actual_tokens = tokenizer.apply_token_pattern(tokens) + + for t in actual_tokens: + print(t.text, t.start) + for t in expected_tokens: + print(t.text, t.start) + + assert len(actual_tokens) == len(expected_tokens) + for actual_token, expected_token in zip(actual_tokens, expected_tokens): + assert actual_token.text == expected_token.text + assert actual_token.start == expected_token.start + assert actual_token.end == expected_token.end From 6dd0872acbe8655a7e690a48bc37fd1702982a34 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Fri, 26 Jun 2020 15:19:27 +0200 Subject: [PATCH 02/11] remove token pattern from count vectors featurizer --- docs/nlu/components.rst | 15 ++++++++++++--- .../sparse_featurizer/count_vectors_featurizer.py | 14 ++------------ rasa/nlu/tokenizers/convert_tokenizer.py | 2 +- rasa/nlu/tokenizers/jieba_tokenizer.py | 2 +- rasa/nlu/tokenizers/lm_tokenizer.py | 2 +- rasa/nlu/tokenizers/mitie_tokenizer.py | 2 +- rasa/nlu/tokenizers/spacy_tokenizer.py | 2 +- rasa/nlu/tokenizers/whitespace_tokenizer.py | 2 +- 8 files changed, 20 insertions(+), 21 deletions(-) diff --git a/docs/nlu/components.rst b/docs/nlu/components.rst index 06ed7274c391..ca4708125588 100644 --- a/docs/nlu/components.rst +++ b/docs/nlu/components.rst @@ -182,6 +182,8 @@ WhitespaceTokenizer "intent_split_symbol": "_" # Text will be tokenized with case sensitive as default "case_sensitive": True + # Regular expression to detect tokens + "token_pattern": None JiebaTokenizer @@ -210,6 +212,8 @@ JiebaTokenizer "intent_tokenization_flag": False # Symbol on which intent should be split "intent_split_symbol": "_" + # Regular expression to detect tokens + "token_pattern": None MitieTokenizer @@ -229,6 +233,8 @@ MitieTokenizer "intent_tokenization_flag": False # Symbol on which intent should be split "intent_split_symbol": "_" + # Regular expression to detect tokens + "token_pattern": None SpacyTokenizer ~~~~~~~~~~~~~~ @@ -248,6 +254,8 @@ SpacyTokenizer "intent_tokenization_flag": False # Symbol on which intent should be split "intent_split_symbol": "_" + # Regular expression to detect tokens + "token_pattern": None .. _ConveRTTokenizer: @@ -282,6 +290,8 @@ ConveRTTokenizer "intent_split_symbol": "_" # Text will be tokenized with case sensitive as default "case_sensitive": True + # Regular expression to detect tokens + "token_pattern": None .. _LanguageModelTokenizer: @@ -304,6 +314,8 @@ LanguageModelTokenizer "intent_tokenization_flag": False # Symbol on which intent should be split "intent_split_symbol": "_" + # Regular expression to detect tokens + "token_pattern": None @@ -582,9 +594,6 @@ CountVectorsFeaturizer | | | n-grams at the edges of words are padded with space. | | | | Valid values: 'word', 'char', 'char_wb'. | +-------------------+-------------------------+--------------------------------------------------------------+ - | token_pattern | r"(?u)\b\w\w+\b" | Regular expression used to detect tokens. | - | | | Only used if 'analyzer' is set to 'word'. | - +-------------------+-------------------------+--------------------------------------------------------------+ | strip_accents | None | Remove accents during the pre-processing step. | | | | Valid values: 'ascii', 'unicode', 'None'. | +-------------------+-------------------------+--------------------------------------------------------------+ diff --git a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py index e7fef6f46e32..a79c15d59e77 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py @@ -53,11 +53,6 @@ def required_components(cls) -> List[Type[Component]]: # 'char_wb' creates character n-grams inside word boundaries # n-grams at the edges of words are padded with space. "analyzer": "word", # use 'char' or 'char_wb' for character - # regular expression for tokens - # only used if analyzer == 'word' - # WARNING this pattern is used during training - # but not currently used during inference! - "token_pattern": r"(?u)\b\w\w+\b", # remove accents during the preprocessing step "strip_accents": None, # {'ascii', 'unicode', None} # list of stop words @@ -95,9 +90,6 @@ def _load_count_vect_params(self) -> None: # set analyzer self.analyzer = self.component_config["analyzer"] - # regular expression for tokens - self.token_pattern = self.component_config["token_pattern"] - # remove accents during the preprocessing step self.strip_accents = self.component_config["strip_accents"] @@ -341,7 +333,6 @@ def _train_with_shared_vocab(self, attribute_texts: Dict[Text, List[Text]]): self.vectorizers = self._create_shared_vocab_vectorizers( { - "token_pattern": self.token_pattern, "strip_accents": self.strip_accents, "lowercase": self.lowercase, "stop_words": self.stop_words, @@ -375,7 +366,6 @@ def _train_with_independent_vocab(self, attribute_texts: Dict[Text, List[Text]]) self.vectorizers = self._create_independent_vocab_vectorizers( { - "token_pattern": self.token_pattern, "strip_accents": self.strip_accents, "lowercase": self.lowercase, "stop_words": self.stop_words, @@ -605,7 +595,7 @@ def _create_shared_vocab_vectorizers( """Create vectorizers for all attributes with shared vocabulary""" shared_vectorizer = CountVectorizer( - token_pattern=parameters["token_pattern"], + tokenizer="", strip_accents=parameters["strip_accents"], lowercase=parameters["lowercase"], stop_words=parameters["stop_words"], @@ -637,7 +627,7 @@ def _create_independent_vocab_vectorizers( attribute_vocabulary = vocabulary[attribute] if vocabulary else None attribute_vectorizer = CountVectorizer( - token_pattern=parameters["token_pattern"], + token_pattern="", strip_accents=parameters["strip_accents"], lowercase=parameters["lowercase"], stop_words=parameters["stop_words"], diff --git a/rasa/nlu/tokenizers/convert_tokenizer.py b/rasa/nlu/tokenizers/convert_tokenizer.py index 0de0ee01b9be..a60785191bed 100644 --- a/rasa/nlu/tokenizers/convert_tokenizer.py +++ b/rasa/nlu/tokenizers/convert_tokenizer.py @@ -26,7 +26,7 @@ class ConveRTTokenizer(WhitespaceTokenizer): "intent_tokenization_flag": False, # Symbol on which intent should be split "intent_split_symbol": "_", - # Pattern to further split identified tokens + # Regular expression to detect tokens "token_pattern": None, # Text will be tokenized with case sensitive as default "case_sensitive": True, diff --git a/rasa/nlu/tokenizers/jieba_tokenizer.py b/rasa/nlu/tokenizers/jieba_tokenizer.py index 4974361fe786..32b3310bc5f7 100644 --- a/rasa/nlu/tokenizers/jieba_tokenizer.py +++ b/rasa/nlu/tokenizers/jieba_tokenizer.py @@ -28,7 +28,7 @@ class JiebaTokenizer(Tokenizer): "intent_tokenization_flag": False, # Symbol on which intent should be split "intent_split_symbol": "_", - # Pattern to further split identified tokens + # Regular expression to detect tokens "token_pattern": None, } # default don't load custom dictionary diff --git a/rasa/nlu/tokenizers/lm_tokenizer.py b/rasa/nlu/tokenizers/lm_tokenizer.py index ca338b9af849..c661ea8eee39 100644 --- a/rasa/nlu/tokenizers/lm_tokenizer.py +++ b/rasa/nlu/tokenizers/lm_tokenizer.py @@ -24,7 +24,7 @@ def required_components(cls) -> List[Type[Component]]: "intent_tokenization_flag": False, # Symbol on which intent should be split "intent_split_symbol": "_", - # Pattern to further split identified tokens + # Regular expression to detect tokens "token_pattern": None, } diff --git a/rasa/nlu/tokenizers/mitie_tokenizer.py b/rasa/nlu/tokenizers/mitie_tokenizer.py index d7e2cb6ee779..e26f998117b0 100644 --- a/rasa/nlu/tokenizers/mitie_tokenizer.py +++ b/rasa/nlu/tokenizers/mitie_tokenizer.py @@ -14,7 +14,7 @@ class MitieTokenizer(Tokenizer): "intent_tokenization_flag": False, # Symbol on which intent should be split "intent_split_symbol": "_", - # Pattern to further split identified tokens + # Regular expression to detect tokens "token_pattern": None, } diff --git a/rasa/nlu/tokenizers/spacy_tokenizer.py b/rasa/nlu/tokenizers/spacy_tokenizer.py index 1aac12660342..bff9e8ec8033 100644 --- a/rasa/nlu/tokenizers/spacy_tokenizer.py +++ b/rasa/nlu/tokenizers/spacy_tokenizer.py @@ -25,7 +25,7 @@ def required_components(cls) -> List[Type[Component]]: "intent_tokenization_flag": False, # Symbol on which intent should be split "intent_split_symbol": "_", - # Pattern to further split identified tokens + # Regular expression to detect tokens "token_pattern": None, } diff --git a/rasa/nlu/tokenizers/whitespace_tokenizer.py b/rasa/nlu/tokenizers/whitespace_tokenizer.py index f1bcff88b9e3..faa57e98072a 100644 --- a/rasa/nlu/tokenizers/whitespace_tokenizer.py +++ b/rasa/nlu/tokenizers/whitespace_tokenizer.py @@ -12,7 +12,7 @@ class WhitespaceTokenizer(Tokenizer): "intent_tokenization_flag": False, # Symbol on which intent should be split "intent_split_symbol": "_", - # Pattern to further split identified tokens + # Regular expression to detect tokens "token_pattern": None, # Text will be tokenized with case sensitive as default "case_sensitive": True, From e387ab6c647e951c43bdc4a1418ec8fcd22d3d56 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Fri, 26 Jun 2020 15:23:24 +0200 Subject: [PATCH 03/11] add changelog --- changelog/5905.bugfix.rst | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 changelog/5905.bugfix.rst diff --git a/changelog/5905.bugfix.rst b/changelog/5905.bugfix.rst new file mode 100644 index 000000000000..9eedb3028c7c --- /dev/null +++ b/changelog/5905.bugfix.rst @@ -0,0 +1,4 @@ +Remove option ``token_pattern`` from ``CountVectorsFeaturizer``. +Instead all tokenizers now have the option ``token_pattern``. +If a regular expression is set, all detected tokens of the tokenizer are further split into tokens according to the +regular expression. From 4ab54e9f0a022610379fe91bb2feeada6bd0b11a Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Fri, 26 Jun 2020 15:30:22 +0200 Subject: [PATCH 04/11] check if pattern could be applied --- rasa/nlu/tokenizers/tokenizer.py | 4 ++++ tests/nlu/tokenizers/test_tokenizer.py | 5 +++++ 2 files changed, 9 insertions(+) diff --git a/rasa/nlu/tokenizers/tokenizer.py b/rasa/nlu/tokenizers/tokenizer.py index 63a34be07f37..d2c32818099f 100644 --- a/rasa/nlu/tokenizers/tokenizer.py +++ b/rasa/nlu/tokenizers/tokenizer.py @@ -128,6 +128,10 @@ def apply_token_pattern(self, tokens: List[Token]) -> List[Token]: for token in tokens: new_tokens = token_pattern.findall(token.text) new_tokens = [t for t in new_tokens if t] + + if not new_tokens: + final_tokens.append(token) + running_offset = 0 for new_token in new_tokens: word_offset = token.text.index(new_token, running_offset) diff --git a/tests/nlu/tokenizers/test_tokenizer.py b/tests/nlu/tokenizers/test_tokenizer.py index 995eebf00562..8b05ea3a6951 100644 --- a/tests/nlu/tokenizers/test_tokenizer.py +++ b/tests/nlu/tokenizers/test_tokenizer.py @@ -116,6 +116,11 @@ def test_split_intent(text, expected_tokens): [Token("role-based", 0), Token("access-control", 11)], [Token("role-based", 0), Token("access-control", 11)], ), + ( + r"(test)", + [Token("role-based", 0), Token("access-control", 11)], + [Token("role-based", 0), Token("access-control", 11)], + ), ], ) def test_apply_token_pattern( From 7cba953e80337e5fb77fbf2895fdffdccb06dfeb Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Fri, 26 Jun 2020 15:31:06 +0200 Subject: [PATCH 05/11] remove print from test --- tests/nlu/tokenizers/test_tokenizer.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/tests/nlu/tokenizers/test_tokenizer.py b/tests/nlu/tokenizers/test_tokenizer.py index 8b05ea3a6951..69754647855a 100644 --- a/tests/nlu/tokenizers/test_tokenizer.py +++ b/tests/nlu/tokenizers/test_tokenizer.py @@ -131,11 +131,6 @@ def test_apply_token_pattern( tokenizer = WhitespaceTokenizer(component_config) actual_tokens = tokenizer.apply_token_pattern(tokens) - for t in actual_tokens: - print(t.text, t.start) - for t in expected_tokens: - print(t.text, t.start) - assert len(actual_tokens) == len(expected_tokens) for actual_token, expected_token in zip(actual_tokens, expected_tokens): assert actual_token.text == expected_token.text From dc54e96a80db7588efc49e5183d4ca871366e536 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Fri, 26 Jun 2020 15:34:48 +0200 Subject: [PATCH 06/11] fix option in count_vectors_featurizer --- .../featurizers/sparse_featurizer/count_vectors_featurizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py index a79c15d59e77..f21edcda44f4 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py @@ -595,7 +595,7 @@ def _create_shared_vocab_vectorizers( """Create vectorizers for all attributes with shared vocabulary""" shared_vectorizer = CountVectorizer( - tokenizer="", + token_pattern="", strip_accents=parameters["strip_accents"], lowercase=parameters["lowercase"], stop_words=parameters["stop_words"], From 586ee5bae758083b7f537aa34a82fbb2214c1f8e Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Fri, 26 Jun 2020 15:36:28 +0200 Subject: [PATCH 07/11] make apply token pattern private --- rasa/nlu/tokenizers/tokenizer.py | 6 +++--- tests/nlu/tokenizers/test_tokenizer.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/rasa/nlu/tokenizers/tokenizer.py b/rasa/nlu/tokenizers/tokenizer.py index d2c32818099f..2dfc7884f032 100644 --- a/rasa/nlu/tokenizers/tokenizer.py +++ b/rasa/nlu/tokenizers/tokenizer.py @@ -89,14 +89,14 @@ def train( tokens = self._split_intent(example) else: tokens = self.tokenize(example, attribute) - tokens = self.apply_token_pattern(tokens) + tokens = self._apply_token_pattern(tokens) example.set(TOKENS_NAMES[attribute], tokens) def process(self, message: Message, **kwargs: Any) -> None: """Tokenize the incoming message.""" tokens = self.tokenize(message, TEXT) - tokens = self.apply_token_pattern(tokens) + tokens = self._apply_token_pattern(tokens) message.set(TOKENS_NAMES[TEXT], tokens) def _split_intent(self, message: Message): @@ -110,7 +110,7 @@ def _split_intent(self, message: Message): return self._convert_words_to_tokens(words, text) - def apply_token_pattern(self, tokens: List[Token]) -> List[Token]: + def _apply_token_pattern(self, tokens: List[Token]) -> List[Token]: """Apply the token pattern to the given tokens. Args: diff --git a/tests/nlu/tokenizers/test_tokenizer.py b/tests/nlu/tokenizers/test_tokenizer.py index 69754647855a..d18a8b7982c9 100644 --- a/tests/nlu/tokenizers/test_tokenizer.py +++ b/tests/nlu/tokenizers/test_tokenizer.py @@ -129,7 +129,7 @@ def test_apply_token_pattern( component_config = {"token_pattern": token_pattern} tokenizer = WhitespaceTokenizer(component_config) - actual_tokens = tokenizer.apply_token_pattern(tokens) + actual_tokens = tokenizer._apply_token_pattern(tokens) assert len(actual_tokens) == len(expected_tokens) for actual_token, expected_token in zip(actual_tokens, expected_tokens): From cbbcf0baa473d7c2f3131cd9137fb79cdb74e956 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Mon, 29 Jun 2020 10:50:32 +0200 Subject: [PATCH 08/11] update default token pattern of CountVectorizer --- .../count_vectors_featurizer.py | 4 ++-- .../test_count_vectors_featurizer.py | 23 ++++++------------- tests/nlu/tokenizers/test_tokenizer.py | 2 +- 3 files changed, 10 insertions(+), 19 deletions(-) diff --git a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py index f21edcda44f4..1d1e08cc2c34 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py @@ -595,7 +595,7 @@ def _create_shared_vocab_vectorizers( """Create vectorizers for all attributes with shared vocabulary""" shared_vectorizer = CountVectorizer( - token_pattern="", + token_pattern=r"(?u)\b\w+\b", strip_accents=parameters["strip_accents"], lowercase=parameters["lowercase"], stop_words=parameters["stop_words"], @@ -627,7 +627,7 @@ def _create_independent_vocab_vectorizers( attribute_vocabulary = vocabulary[attribute] if vocabulary else None attribute_vectorizer = CountVectorizer( - token_pattern="", + token_pattern=r"(?u)\b\w+\b", strip_accents=parameters["strip_accents"], lowercase=parameters["lowercase"], stop_words=parameters["stop_words"], diff --git a/tests/nlu/featurizers/test_count_vectors_featurizer.py b/tests/nlu/featurizers/test_count_vectors_featurizer.py index 14bf35dc8ef9..42effc649aeb 100644 --- a/tests/nlu/featurizers/test_count_vectors_featurizer.py +++ b/tests/nlu/featurizers/test_count_vectors_featurizer.py @@ -23,7 +23,7 @@ ], ) def test_count_vector_featurizer(sentence, expected, expected_cls): - ftr = CountVectorsFeaturizer({"token_pattern": r"(?u)\b\w+\b"}) + ftr = CountVectorsFeaturizer() train_message = Message(sentence) test_message = Message(sentence) @@ -54,7 +54,7 @@ def test_count_vector_featurizer(sentence, expected, expected_cls): def test_count_vector_featurizer_response_attribute_featurization( sentence, intent, response, intent_features, response_features ): - ftr = CountVectorsFeaturizer({"token_pattern": r"(?u)\b\w+\b"}) + ftr = CountVectorsFeaturizer() tk = WhitespaceTokenizer() train_message = Message(sentence) @@ -104,7 +104,7 @@ def test_count_vector_featurizer_response_attribute_featurization( def test_count_vector_featurizer_attribute_featurization( sentence, intent, response, intent_features, response_features ): - ftr = CountVectorsFeaturizer({"token_pattern": r"(?u)\b\w+\b"}) + ftr = CountVectorsFeaturizer() tk = WhitespaceTokenizer() train_message = Message(sentence) @@ -153,9 +153,7 @@ def test_count_vector_featurizer_attribute_featurization( def test_count_vector_featurizer_shared_vocab( sentence, intent, response, text_features, intent_features, response_features ): - ftr = CountVectorsFeaturizer( - {"token_pattern": r"(?u)\b\w+\b", "use_shared_vocab": True} - ) + ftr = CountVectorsFeaturizer({"use_shared_vocab": True}) tk = WhitespaceTokenizer() train_message = Message(sentence) @@ -188,9 +186,7 @@ def test_count_vector_featurizer_shared_vocab( ], ) def test_count_vector_featurizer_oov_token(sentence, expected): - ftr = CountVectorsFeaturizer( - {"token_pattern": r"(?u)\b\w+\b", "OOV_token": "__oov__"} - ) + ftr = CountVectorsFeaturizer({"OOV_token": "__oov__"}) train_message = Message(sentence) WhitespaceTokenizer().process(train_message) @@ -217,11 +213,7 @@ def test_count_vector_featurizer_oov_token(sentence, expected): def test_count_vector_featurizer_oov_words(sentence, expected): ftr = CountVectorsFeaturizer( - { - "token_pattern": r"(?u)\b\w+\b", - "OOV_token": "__oov__", - "OOV_words": ["oov_word0", "OOV_word1"], - } + {"OOV_token": "__oov__", "OOV_words": ["oov_word0", "OOV_word1"]} ) train_message = Message(sentence) WhitespaceTokenizer().process(train_message) @@ -251,7 +243,7 @@ def test_count_vector_featurizer_oov_words(sentence, expected): ) def test_count_vector_featurizer_using_tokens(tokens, expected): - ftr = CountVectorsFeaturizer({"token_pattern": r"(?u)\b\w+\b"}) + ftr = CountVectorsFeaturizer() # using empty string instead of real text string to make sure # count vector only can come from `tokens` feature. @@ -307,7 +299,6 @@ def test_count_vector_featurizer_persist_load(tmp_path): # set non default values to config config = { "analyzer": "char", - "token_pattern": r"(?u)\b\w+\b", "strip_accents": "ascii", "stop_words": "stop", "min_df": 2, diff --git a/tests/nlu/tokenizers/test_tokenizer.py b/tests/nlu/tokenizers/test_tokenizer.py index d18a8b7982c9..c23e5e3f5f5b 100644 --- a/tests/nlu/tokenizers/test_tokenizer.py +++ b/tests/nlu/tokenizers/test_tokenizer.py @@ -2,7 +2,7 @@ import pytest -from nlu.tokenizers.tokenizer import Token +from rasa.nlu.tokenizers.tokenizer import Token from rasa.nlu.constants import TEXT, INTENT, RESPONSE, TOKENS_NAMES from rasa.nlu.training_data import Message, TrainingData from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer From f159c5ab0521b430b60ce9afd0c2ef26bf1b3978 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Mon, 29 Jun 2020 11:06:01 +0200 Subject: [PATCH 09/11] fix deepsource issues --- rasa/nlu/tokenizers/convert_tokenizer.py | 3 ++- tests/nlu/tokenizers/test_tokenizer.py | 2 -- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/rasa/nlu/tokenizers/convert_tokenizer.py b/rasa/nlu/tokenizers/convert_tokenizer.py index a60785191bed..426f19cf3246 100644 --- a/rasa/nlu/tokenizers/convert_tokenizer.py +++ b/rasa/nlu/tokenizers/convert_tokenizer.py @@ -84,7 +84,8 @@ def tokenize(self, message: Message, attribute: Text) -> List[Token]: return tokens_out - def _clean_tokens(self, tokens: List[bytes]): + @staticmethod + def _clean_tokens(tokens: List[bytes]): """Encode tokens and remove special char added by ConveRT.""" tokens = [string.decode("utf-8").replace("﹏", "") for string in tokens] diff --git a/tests/nlu/tokenizers/test_tokenizer.py b/tests/nlu/tokenizers/test_tokenizer.py index c23e5e3f5f5b..f250472f182f 100644 --- a/tests/nlu/tokenizers/test_tokenizer.py +++ b/tests/nlu/tokenizers/test_tokenizer.py @@ -9,8 +9,6 @@ def test_tokens_comparison(): - from rasa.nlu.tokenizers.tokenizer import Token - x = Token("hello", 0) y = Token("Hello", 0) From 9540d6111244bbec4aa18ca4393ffbe846b9208d Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Wed, 1 Jul 2020 13:49:55 +0200 Subject: [PATCH 10/11] apply token pattern in tokenize method --- docs/nlu/components.rst | 3 --- rasa/nlu/tokenizers/jieba_tokenizer.py | 2 +- rasa/nlu/tokenizers/lm_tokenizer.py | 2 -- rasa/nlu/tokenizers/mitie_tokenizer.py | 2 +- rasa/nlu/tokenizers/spacy_tokenizer.py | 4 +++- rasa/nlu/tokenizers/tokenizer.py | 11 ++++++++--- rasa/nlu/tokenizers/whitespace_tokenizer.py | 4 +++- 7 files changed, 16 insertions(+), 12 deletions(-) diff --git a/docs/nlu/components.rst b/docs/nlu/components.rst index ca4708125588..08437b51ea27 100644 --- a/docs/nlu/components.rst +++ b/docs/nlu/components.rst @@ -314,9 +314,6 @@ LanguageModelTokenizer "intent_tokenization_flag": False # Symbol on which intent should be split "intent_split_symbol": "_" - # Regular expression to detect tokens - "token_pattern": None - .. _text-featurizers: diff --git a/rasa/nlu/tokenizers/jieba_tokenizer.py b/rasa/nlu/tokenizers/jieba_tokenizer.py index 32b3310bc5f7..ef034bde430f 100644 --- a/rasa/nlu/tokenizers/jieba_tokenizer.py +++ b/rasa/nlu/tokenizers/jieba_tokenizer.py @@ -71,7 +71,7 @@ def tokenize(self, message: Message, attribute: Text) -> List[Token]: tokenized = jieba.tokenize(text) tokens = [Token(word, start) for (word, start, end) in tokenized] - return tokens + return self._apply_token_pattern(tokens) @classmethod def load( diff --git a/rasa/nlu/tokenizers/lm_tokenizer.py b/rasa/nlu/tokenizers/lm_tokenizer.py index c661ea8eee39..4edb431b5986 100644 --- a/rasa/nlu/tokenizers/lm_tokenizer.py +++ b/rasa/nlu/tokenizers/lm_tokenizer.py @@ -24,8 +24,6 @@ def required_components(cls) -> List[Type[Component]]: "intent_tokenization_flag": False, # Symbol on which intent should be split "intent_split_symbol": "_", - # Regular expression to detect tokens - "token_pattern": None, } def get_doc(self, message: Message, attribute: Text) -> Dict[Text, Any]: diff --git a/rasa/nlu/tokenizers/mitie_tokenizer.py b/rasa/nlu/tokenizers/mitie_tokenizer.py index e26f998117b0..376aa901fba0 100644 --- a/rasa/nlu/tokenizers/mitie_tokenizer.py +++ b/rasa/nlu/tokenizers/mitie_tokenizer.py @@ -34,7 +34,7 @@ def tokenize(self, message: Message, attribute: Text) -> List[Token]: for token, offset in tokenized ] - return tokens + return self._apply_token_pattern(tokens) def _token_from_offset( self, text: bytes, offset: int, encoded_sentence: bytes diff --git a/rasa/nlu/tokenizers/spacy_tokenizer.py b/rasa/nlu/tokenizers/spacy_tokenizer.py index bff9e8ec8033..3860cc274443 100644 --- a/rasa/nlu/tokenizers/spacy_tokenizer.py +++ b/rasa/nlu/tokenizers/spacy_tokenizer.py @@ -35,7 +35,7 @@ def get_doc(self, message: Message, attribute: Text) -> "Doc": def tokenize(self, message: Message, attribute: Text) -> List[Token]: doc = self.get_doc(message, attribute) - return [ + tokens = [ Token( t.text, t.idx, lemma=t.lemma_, data={POS_TAG_KEY: self._tag_of_token(t)} ) @@ -43,6 +43,8 @@ def tokenize(self, message: Message, attribute: Text) -> List[Token]: if t.text and t.text.strip() ] + return self._apply_token_pattern(tokens) + @staticmethod def _tag_of_token(token: Any) -> Text: import spacy diff --git a/rasa/nlu/tokenizers/tokenizer.py b/rasa/nlu/tokenizers/tokenizer.py index 2dfc7884f032..7f78c25cd722 100644 --- a/rasa/nlu/tokenizers/tokenizer.py +++ b/rasa/nlu/tokenizers/tokenizer.py @@ -89,14 +89,12 @@ def train( tokens = self._split_intent(example) else: tokens = self.tokenize(example, attribute) - tokens = self._apply_token_pattern(tokens) example.set(TOKENS_NAMES[attribute], tokens) def process(self, message: Message, **kwargs: Any) -> None: """Tokenize the incoming message.""" tokens = self.tokenize(message, TEXT) - tokens = self._apply_token_pattern(tokens) message.set(TOKENS_NAMES[TEXT], tokens) def _split_intent(self, message: Message): @@ -137,7 +135,14 @@ def _apply_token_pattern(self, tokens: List[Token]) -> List[Token]: word_offset = token.text.index(new_token, running_offset) word_len = len(new_token) running_offset = word_offset + word_len - final_tokens.append(Token(new_token, token.start + word_offset)) + final_tokens.append( + Token( + new_token, + token.start + word_offset, + data=token.data, + lemma=token.lemma, + ) + ) return final_tokens diff --git a/rasa/nlu/tokenizers/whitespace_tokenizer.py b/rasa/nlu/tokenizers/whitespace_tokenizer.py index 451ed3325fed..5daaeacc5487 100644 --- a/rasa/nlu/tokenizers/whitespace_tokenizer.py +++ b/rasa/nlu/tokenizers/whitespace_tokenizer.py @@ -53,4 +53,6 @@ def tokenize(self, message: Message, attribute: Text) -> List[Token]: if not words: words = [text] - return self._convert_words_to_tokens(words, text) + tokens = self._convert_words_to_tokens(words, text) + + return self._apply_token_pattern(tokens) From e0aebb5b7fe5b36ea205eb0f63226a9ec2cd413e Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Mon, 6 Jul 2020 14:29:42 +0200 Subject: [PATCH 11/11] review comments --- changelog/5905.bugfix.rst | 3 +-- rasa/nlu/tokenizers/convert_tokenizer.py | 2 +- rasa/nlu/tokenizers/tokenizer.py | 11 ++++++----- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/changelog/5905.bugfix.rst b/changelog/5905.bugfix.rst index 9eedb3028c7c..4e4d3f0d70da 100644 --- a/changelog/5905.bugfix.rst +++ b/changelog/5905.bugfix.rst @@ -1,4 +1,3 @@ Remove option ``token_pattern`` from ``CountVectorsFeaturizer``. Instead all tokenizers now have the option ``token_pattern``. -If a regular expression is set, all detected tokens of the tokenizer are further split into tokens according to the -regular expression. +If a regular expression is set, the tokenizer will apply the token pattern. diff --git a/rasa/nlu/tokenizers/convert_tokenizer.py b/rasa/nlu/tokenizers/convert_tokenizer.py index 426f19cf3246..acae4ade5130 100644 --- a/rasa/nlu/tokenizers/convert_tokenizer.py +++ b/rasa/nlu/tokenizers/convert_tokenizer.py @@ -85,7 +85,7 @@ def tokenize(self, message: Message, attribute: Text) -> List[Token]: return tokens_out @staticmethod - def _clean_tokens(tokens: List[bytes]): + def _clean_tokens(tokens: List[bytes]) -> List[Text]: """Encode tokens and remove special char added by ConveRT.""" tokens = [string.decode("utf-8").replace("﹏", "") for string in tokens] diff --git a/rasa/nlu/tokenizers/tokenizer.py b/rasa/nlu/tokenizers/tokenizer.py index 7f78c25cd722..4d3bad85e73c 100644 --- a/rasa/nlu/tokenizers/tokenizer.py +++ b/rasa/nlu/tokenizers/tokenizer.py @@ -67,7 +67,10 @@ def __init__(self, component_config: Dict[Text, Any] = None) -> None: # split symbol for intents self.intent_split_symbol = self.component_config.get("intent_split_symbol", "_") # token pattern to further split tokens - self.token_pattern = self.component_config.get("token_pattern", None) + token_pattern = self.component_config.get("token_pattern", None) + self.token_pattern_regex = None + if token_pattern: + self.token_pattern_regex = re.compile(token_pattern) def tokenize(self, message: Message, attribute: Text) -> List[Token]: """Tokenizes the text of the provided attribute of the incoming message.""" @@ -117,14 +120,12 @@ def _apply_token_pattern(self, tokens: List[Token]) -> List[Token]: Returns: List of tokens. """ - if not self.token_pattern: + if not self.token_pattern_regex: return tokens - token_pattern = re.compile(self.token_pattern) - final_tokens = [] for token in tokens: - new_tokens = token_pattern.findall(token.text) + new_tokens = self.token_pattern_regex.findall(token.text) new_tokens = [t for t in new_tokens if t] if not new_tokens: