Merge pull request #6073 from RasaHQ/token-pattern
Move token_pattern to tokenizers
tabergma committed Jul 7, 2020
2 parents 8f51534 + 05866fb commit 87f2e9d
Showing 12 changed files with 138 additions and 43 deletions.
3 changes: 3 additions & 0 deletions changelog/5905.bugfix.rst
@@ -0,0 +1,3 @@
Remove option ``token_pattern`` from ``CountVectorsFeaturizer``.
Instead, all tokenizers now have the option ``token_pattern``.
If a regular expression is set, the tokenizer uses it to further split the tokens it produces.
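As an editorial illustration (not part of this diff), here is a minimal sketch of the new option in use, assuming the Rasa NLU Python API at the time of this commit; the pattern, the example text, and the literal "text" attribute key are illustrative assumptions.

from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
from rasa.nlu.training_data import Message

# Any tokenizer now accepts "token_pattern"; the tokens it produces are
# further split by this regular expression (via re.findall).
tokenizer = WhitespaceTokenizer({"token_pattern": r"[a-zA-Z]+|[0-9]+"})

message = Message("open file123 now")
tokens = tokenizer.tokenize(message, "text")  # "text" is the message attribute

print([t.text for t in tokens])  # expected: ['open', 'file', '123', 'now']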
14 changes: 10 additions & 4 deletions docs/nlu/components.rst
@@ -182,6 +182,8 @@ WhitespaceTokenizer
"intent_split_symbol": "_"
# Text will be tokenized with case sensitive as default
"case_sensitive": True
# Regular expression to detect tokens
"token_pattern": None
JiebaTokenizer
@@ -210,6 +212,8 @@ JiebaTokenizer
"intent_tokenization_flag": False
# Symbol on which intent should be split
"intent_split_symbol": "_"
# Regular expression to detect tokens
"token_pattern": None
MitieTokenizer
@@ -229,6 +233,8 @@ MitieTokenizer
"intent_tokenization_flag": False
# Symbol on which intent should be split
"intent_split_symbol": "_"
# Regular expression to detect tokens
"token_pattern": None
SpacyTokenizer
~~~~~~~~~~~~~~
@@ -248,6 +254,8 @@ SpacyTokenizer
"intent_tokenization_flag": False
# Symbol on which intent should be split
"intent_split_symbol": "_"
# Regular expression to detect tokens
"token_pattern": None
.. _ConveRTTokenizer:

@@ -282,6 +290,8 @@ ConveRTTokenizer
"intent_split_symbol": "_"
# Text will be tokenized with case sensitive as default
"case_sensitive": True
# Regular expression to detect tokens
"token_pattern": None
.. _LanguageModelTokenizer:

@@ -306,7 +316,6 @@ LanguageModelTokenizer
"intent_split_symbol": "_"
.. _text-featurizers:

Text Featurizers
@@ -582,9 +591,6 @@ CountVectorsFeaturizer
| | | n-grams at the edges of words are padded with space. |
| | | Valid values: 'word', 'char', 'char_wb'. |
+-------------------+-------------------------+--------------------------------------------------------------+
| token_pattern | r"(?u)\b\w\w+\b" | Regular expression used to detect tokens. |
| | | Only used if 'analyzer' is set to 'word'. |
+-------------------+-------------------------+--------------------------------------------------------------+
| strip_accents | None | Remove accents during the pre-processing step. |
| | | Valid values: 'ascii', 'unicode', 'None'. |
+-------------------+-------------------------+--------------------------------------------------------------+
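Side note for readers of this diff: the removed default r"(?u)\b\w\w+\b" requires at least two word characters, so single-character tokens were dropped, and the code comment removed in this PR warned that the pattern was applied during training but not during inference. The featurizer now hard-codes the more permissive r"(?u)\b\w+\b" (see count_vectors_featurizer.py below), and any custom splitting is configured on the tokenizer instead. A small sketch of the difference between the two patterns:

import re

text = "a prioritized to-do list"

# old CountVectorsFeaturizer default: at least two word characters,
# so single-character tokens such as "a" are dropped
print(re.findall(r"(?u)\b\w\w+\b", text))  # ['prioritized', 'to', 'do', 'list']

# pattern now hard-coded in the featurizer: single-character tokens are kept
print(re.findall(r"(?u)\b\w+\b", text))    # ['a', 'prioritized', 'to', 'do', 'list']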
14 changes: 2 additions & 12 deletions rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py
@@ -53,11 +53,6 @@ def required_components(cls) -> List[Type[Component]]:
# 'char_wb' creates character n-grams inside word boundaries
# n-grams at the edges of words are padded with space.
"analyzer": "word", # use 'char' or 'char_wb' for character
# regular expression for tokens
# only used if analyzer == 'word'
# WARNING this pattern is used during training
# but not currently used during inference!
"token_pattern": r"(?u)\b\w\w+\b",
# remove accents during the preprocessing step
"strip_accents": None, # {'ascii', 'unicode', None}
# list of stop words
@@ -95,9 +90,6 @@ def _load_count_vect_params(self) -> None:
# set analyzer
self.analyzer = self.component_config["analyzer"]

# regular expression for tokens
self.token_pattern = self.component_config["token_pattern"]

# remove accents during the preprocessing step
self.strip_accents = self.component_config["strip_accents"]

@@ -341,7 +333,6 @@ def _train_with_shared_vocab(self, attribute_texts: Dict[Text, List[Text]]):

self.vectorizers = self._create_shared_vocab_vectorizers(
{
"token_pattern": self.token_pattern,
"strip_accents": self.strip_accents,
"lowercase": self.lowercase,
"stop_words": self.stop_words,
@@ -375,7 +366,6 @@ def _train_with_independent_vocab(self, attribute_texts: Dict[Text, List[Text]])

self.vectorizers = self._create_independent_vocab_vectorizers(
{
"token_pattern": self.token_pattern,
"strip_accents": self.strip_accents,
"lowercase": self.lowercase,
"stop_words": self.stop_words,
@@ -605,7 +595,7 @@ def _create_shared_vocab_vectorizers(
"""Create vectorizers for all attributes with shared vocabulary"""

shared_vectorizer = CountVectorizer(
token_pattern=parameters["token_pattern"],
token_pattern=r"(?u)\b\w+\b",
strip_accents=parameters["strip_accents"],
lowercase=parameters["lowercase"],
stop_words=parameters["stop_words"],
@@ -637,7 +627,7 @@ def _create_independent_vocab_vectorizers(
attribute_vocabulary = vocabulary[attribute] if vocabulary else None

attribute_vectorizer = CountVectorizer(
token_pattern=parameters["token_pattern"],
token_pattern=r"(?u)\b\w+\b",
strip_accents=parameters["strip_accents"],
lowercase=parameters["lowercase"],
stop_words=parameters["stop_words"],
5 changes: 4 additions & 1 deletion rasa/nlu/tokenizers/convert_tokenizer.py
@@ -26,6 +26,8 @@ class ConveRTTokenizer(WhitespaceTokenizer):
"intent_tokenization_flag": False,
# Symbol on which intent should be split
"intent_split_symbol": "_",
# Regular expression to detect tokens
"token_pattern": None,
# Text will be tokenized with case sensitive as default
"case_sensitive": True,
}
@@ -82,7 +84,8 @@ def tokenize(self, message: Message, attribute: Text) -> List[Token]:

return tokens_out

def _clean_tokens(self, tokens: List[bytes]):
@staticmethod
def _clean_tokens(tokens: List[bytes]) -> List[Text]:
"""Encode tokens and remove special char added by ConveRT."""

tokens = [string.decode("utf-8").replace("﹏", "") for string in tokens]
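For context (not part of the diff): _clean_tokens decodes the byte strings returned by the ConveRT model and strips the ﹏ marker (U+FE4F) it adds to subword tokens. A tiny illustration with made-up byte strings:

raw = [b"hello", "\ufe4fing".encode("utf-8")]  # "\ufe4f" is the ﹏ marker
cleaned = [b.decode("utf-8").replace("\ufe4f", "") for b in raw]
print(cleaned)  # ['hello', 'ing']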
4 changes: 3 additions & 1 deletion rasa/nlu/tokenizers/jieba_tokenizer.py
@@ -28,6 +28,8 @@ class JiebaTokenizer(Tokenizer):
"intent_tokenization_flag": False,
# Symbol on which intent should be split
"intent_split_symbol": "_",
# Regular expression to detect tokens
"token_pattern": None,
} # default don't load custom dictionary

def __init__(self, component_config: Dict[Text, Any] = None) -> None:
@@ -69,7 +71,7 @@ def tokenize(self, message: Message, attribute: Text) -> List[Token]:
tokenized = jieba.tokenize(text)
tokens = [Token(word, start) for (word, start, end) in tokenized]

return tokens
return self._apply_token_pattern(tokens)

@classmethod
def load(
5 changes: 1 addition & 4 deletions rasa/nlu/tokenizers/lm_tokenizer.py
@@ -5,10 +5,7 @@
from rasa.nlu.utils.hugging_face.hf_transformers import HFTransformersNLP
from rasa.nlu.training_data import Message

from rasa.nlu.constants import (
LANGUAGE_MODEL_DOCS,
TOKENS,
)
from rasa.nlu.constants import LANGUAGE_MODEL_DOCS, TOKENS


class LanguageModelTokenizer(Tokenizer):
4 changes: 3 additions & 1 deletion rasa/nlu/tokenizers/mitie_tokenizer.py
@@ -14,6 +14,8 @@ class MitieTokenizer(Tokenizer):
"intent_tokenization_flag": False,
# Symbol on which intent should be split
"intent_split_symbol": "_",
# Regular expression to detect tokens
"token_pattern": None,
}

@classmethod
@@ -32,7 +34,7 @@ def tokenize(self, message: Message, attribute: Text) -> List[Token]:
for token, offset in tokenized
]

return tokens
return self._apply_token_pattern(tokens)

def _token_from_offset(
self, text: bytes, offset: int, encoded_sentence: bytes
6 changes: 5 additions & 1 deletion rasa/nlu/tokenizers/spacy_tokenizer.py
@@ -25,6 +25,8 @@ def required_components(cls) -> List[Type[Component]]:
"intent_tokenization_flag": False,
# Symbol on which intent should be split
"intent_split_symbol": "_",
# Regular expression to detect tokens
"token_pattern": None,
}

def get_doc(self, message: Message, attribute: Text) -> "Doc":
@@ -33,14 +35,16 @@ def get_doc(self, message: Message, attribute: Text) -> "Doc":
def tokenize(self, message: Message, attribute: Text) -> List[Token]:
doc = self.get_doc(message, attribute)

return [
tokens = [
Token(
t.text, t.idx, lemma=t.lemma_, data={POS_TAG_KEY: self._tag_of_token(t)}
)
for t in doc
if t.text and t.text.strip()
]

return self._apply_token_pattern(tokens)

@staticmethod
def _tag_of_token(token: Any) -> Text:
import spacy
42 changes: 42 additions & 0 deletions rasa/nlu/tokenizers/tokenizer.py
@@ -1,4 +1,5 @@
import logging
import re

from typing import Text, List, Optional, Dict, Any

@@ -65,6 +66,11 @@ def __init__(self, component_config: Dict[Text, Any] = None) -> None:
)
# split symbol for intents
self.intent_split_symbol = self.component_config.get("intent_split_symbol", "_")
# token pattern to further split tokens
token_pattern = self.component_config.get("token_pattern", None)
self.token_pattern_regex = None
if token_pattern:
self.token_pattern_regex = re.compile(token_pattern)

def tokenize(self, message: Message, attribute: Text) -> List[Token]:
"""Tokenizes the text of the provided attribute of the incoming message."""
@@ -105,6 +111,42 @@ def _split_intent(self, message: Message):

return self._convert_words_to_tokens(words, text)

def _apply_token_pattern(self, tokens: List[Token]) -> List[Token]:
"""Apply the token pattern to the given tokens.
Args:
tokens: list of tokens to split
Returns:
List of tokens.
"""
if not self.token_pattern_regex:
return tokens

final_tokens = []
for token in tokens:
new_tokens = self.token_pattern_regex.findall(token.text)
new_tokens = [t for t in new_tokens if t]

if not new_tokens:
final_tokens.append(token)

running_offset = 0
for new_token in new_tokens:
word_offset = token.text.index(new_token, running_offset)
word_len = len(new_token)
running_offset = word_offset + word_len
final_tokens.append(
Token(
new_token,
token.start + word_offset,
data=token.data,
lemma=token.lemma,
)
)

return final_tokens

@staticmethod
def _convert_words_to_tokens(words: List[Text], text: Text) -> List[Token]:
running_offset = 0
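To make the new splitting logic concrete, here is a standalone sketch (not part of the diff) of the offset bookkeeping in _apply_token_pattern, using plain (text, start) tuples instead of rasa's Token class; the pattern and offsets are illustrative:

import re

token_text, token_start = "file123", 10      # one incoming token starting at offset 10
pattern = re.compile(r"[a-zA-Z]+|[0-9]+")    # illustrative token_pattern

new_tokens = []
running_offset = 0
for piece in pattern.findall(token_text):    # ['file', '123']
    word_offset = token_text.index(piece, running_offset)
    running_offset = word_offset + len(piece)
    # each new token keeps an absolute start position in the original text
    new_tokens.append((piece, token_start + word_offset))

print(new_tokens)  # [('file', 10), ('123', 14)]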
6 changes: 5 additions & 1 deletion rasa/nlu/tokenizers/whitespace_tokenizer.py
@@ -14,6 +14,8 @@ class WhitespaceTokenizer(Tokenizer):
"intent_tokenization_flag": False,
# Symbol on which intent should be split
"intent_split_symbol": "_",
# Regular expression to detect tokens
"token_pattern": None,
# Text will be tokenized with case sensitive as default
"case_sensitive": True,
}
@@ -77,4 +79,6 @@ def tokenize(self, message: Message, attribute: Text) -> List[Token]:
if not words:
words = [text]

return self._convert_words_to_tokens(words, text)
tokens = self._convert_words_to_tokens(words, text)

return self._apply_token_pattern(tokens)
23 changes: 7 additions & 16 deletions tests/nlu/featurizers/test_count_vectors_featurizer.py
@@ -23,7 +23,7 @@
],
)
def test_count_vector_featurizer(sentence, expected, expected_cls):
ftr = CountVectorsFeaturizer({"token_pattern": r"(?u)\b\w+\b"})
ftr = CountVectorsFeaturizer()

train_message = Message(sentence)
test_message = Message(sentence)
@@ -54,7 +54,7 @@ def test_count_vector_featurizer(sentence, expected, expected_cls):
def test_count_vector_featurizer_response_attribute_featurization(
sentence, intent, response, intent_features, response_features
):
ftr = CountVectorsFeaturizer({"token_pattern": r"(?u)\b\w+\b"})
ftr = CountVectorsFeaturizer()
tk = WhitespaceTokenizer()

train_message = Message(sentence)
@@ -104,7 +104,7 @@ def test_count_vector_featurizer_response_attribute_featurization(
def test_count_vector_featurizer_attribute_featurization(
sentence, intent, response, intent_features, response_features
):
ftr = CountVectorsFeaturizer({"token_pattern": r"(?u)\b\w+\b"})
ftr = CountVectorsFeaturizer()
tk = WhitespaceTokenizer()

train_message = Message(sentence)
@@ -153,9 +153,7 @@ def test_count_vector_featurizer_attribute_featurization(
def test_count_vector_featurizer_shared_vocab(
sentence, intent, response, text_features, intent_features, response_features
):
ftr = CountVectorsFeaturizer(
{"token_pattern": r"(?u)\b\w+\b", "use_shared_vocab": True}
)
ftr = CountVectorsFeaturizer({"use_shared_vocab": True})
tk = WhitespaceTokenizer()

train_message = Message(sentence)
@@ -188,9 +186,7 @@
],
)
def test_count_vector_featurizer_oov_token(sentence, expected):
ftr = CountVectorsFeaturizer(
{"token_pattern": r"(?u)\b\w+\b", "OOV_token": "__oov__"}
)
ftr = CountVectorsFeaturizer({"OOV_token": "__oov__"})
train_message = Message(sentence)
WhitespaceTokenizer().process(train_message)

@@ -217,11 +213,7 @@ def test_count_vector_featurizer_oov_token(sentence, expected):
def test_count_vector_featurizer_oov_words(sentence, expected):

ftr = CountVectorsFeaturizer(
{
"token_pattern": r"(?u)\b\w+\b",
"OOV_token": "__oov__",
"OOV_words": ["oov_word0", "OOV_word1"],
}
{"OOV_token": "__oov__", "OOV_words": ["oov_word0", "OOV_word1"]}
)
train_message = Message(sentence)
WhitespaceTokenizer().process(train_message)
@@ -251,7 +243,7 @@
)
def test_count_vector_featurizer_using_tokens(tokens, expected):

ftr = CountVectorsFeaturizer({"token_pattern": r"(?u)\b\w+\b"})
ftr = CountVectorsFeaturizer()

# using empty string instead of real text string to make sure
# count vector only can come from `tokens` feature.
@@ -307,7 +299,6 @@ def test_count_vector_featurizer_persist_load(tmp_path):
# set non default values to config
config = {
"analyzer": "char",
"token_pattern": r"(?u)\b\w+\b",
"strip_accents": "ascii",
"stop_words": "stop",
"min_df": 2,