Merge pull request #1115 from howl-anderson/feature/count_vector_from_tokens

New feature: the `count_vectors_featurizer` component can use `tokens` provided by tokenizers
Ghostvv committed Jun 1, 2018
2 parents 2de4dbb + b4136e5 commit 17d3962
Showing 4 changed files with 47 additions and 4 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.rst
@@ -12,6 +12,7 @@ This project adheres to `Semantic Versioning`_ starting with version 0.7.0.
Added
-----
- doc link to a community contribution for Rasa NLU in Chinese
- support for the ``count_vectors_featurizer`` component to use the ``tokens`` feature provided by tokenizers

Changed
-------
2 changes: 2 additions & 0 deletions docs/pipeline.rst
@@ -270,6 +270,8 @@ intent_featurizer_count_vectors
Creates bag-of-words representation of intent features using
`sklearn's CountVectorizer <http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html>`_. All tokens which consist only of digits (e.g. 123 and 99 but not a123d) will be assigned to the same feature.

.. note:: If the words of the model's language cannot be split by whitespace, a language-specific tokenizer is required in the pipeline before this component (e.g. using ``tokenizer_jieba`` for Chinese)
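For concreteness, a hedged sketch of such a pipeline as a plain Python dict. The component names follow the rasa_nlu registry of this era; the embedding classifier is an illustrative assumption, not part of this PR:

    # hypothetical Chinese pipeline config, for illustration only
    chinese_pipeline = {
        "language": "zh",
        "pipeline": [
            {"name": "tokenizer_jieba"},                        # fills the `tokens` feature
            {"name": "intent_featurizer_count_vectors"},        # consumes `tokens`
            {"name": "intent_classifier_tensorflow_embedding"}, # assumed classifier
        ],
    }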

:Configuration:
See `sklearn's CountVectorizer docs <http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html>`_
for a detailed description of the configuration parameters.
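Because CountVectorizer assigns feature indices in sorted vocabulary order, the position of each token's count in the feature vector depends on lexical order; this is why the English and Chinese test cases later in this commit expect different orderings. A quick plain-sklearn sketch (no rasa_nlu involved):

    # feature order follows the sorted vocabulary;
    # on sklearn >= 1.2 use get_feature_names_out() instead
    from sklearn.feature_extraction.text import CountVectorizer

    vect = CountVectorizer(token_pattern=r'(?u)\b\w+\b')
    vect.fit(["hello goodbye hello"])
    print(vect.get_feature_names())   # ['goodbye', 'hello'] -> counts [1, 2]

    vect.fit(["你好 再见 你好"])
    print(vect.get_feature_names())   # ['你好', '再见'] -> counts [2, 1]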
12 changes: 8 additions & 4 deletions rasa_nlu/featurizers/count_vectors_featurizer.py
@@ -126,7 +126,7 @@ def train(self, training_data, cfg=None, **kwargs):
             max_features=self.max_features,
             preprocessor=self.preprocessor)

-        lem_exs = [self._lemmatize(example)
+        lem_exs = [self._get_message_text(example)
                    for example in training_data.intent_examples]

         try:
@@ -146,13 +146,17 @@ def process(self, message, **kwargs):
"component is either not trained or "
"didn't receive enough training data")
else:
bag = self.vect.transform([self._lemmatize(message)]).toarray()
bag = self.vect.transform(
[self._get_message_text(message)]
).toarray()
message.set("text_features", bag)

     @staticmethod
-    def _lemmatize(message):
-        if message.get("spacy_doc"):
+    def _get_message_text(message):
+        if message.get("spacy_doc"):  # if lemmatization is possible
             return ' '.join([t.lemma_ for t in message.get("spacy_doc")])
+        elif message.get("tokens"):  # if tokens were provided directly
+            return ' '.join([t.text for t in message.get("tokens")])
         else:
             return message.text
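To make the fallback order concrete (spaCy lemmas, then provided tokens, then the raw text), a minimal sketch that pokes at the private helper for illustration, using the same Message/Token classes as the tests below:

    from rasa_nlu.training_data import Message
    from rasa_nlu.tokenizers import Token
    from rasa_nlu.featurizers.count_vectors_featurizer import \
        CountVectorsFeaturizer

    msg = Message("raw text")
    # neither "spacy_doc" nor "tokens" is set, so the raw text is used
    assert CountVectorsFeaturizer._get_message_text(msg) == "raw text"

    msg.set("tokens", [Token("你好", 0), Token("再见", 2)])
    # now the tokens branch wins (no spacy_doc present)
    assert CountVectorsFeaturizer._get_message_text(msg) == "你好 再见"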

36 changes: 36 additions & 0 deletions tests/base/test_featurizers.py
@@ -1,3 +1,5 @@
# -*- coding: utf-8 -*-

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
Expand All @@ -14,6 +16,7 @@
from rasa_nlu.tokenizers.spacy_tokenizer import SpacyTokenizer
from rasa_nlu.training_data import Message
from rasa_nlu.training_data import TrainingData
from rasa_nlu.tokenizers import Token


@pytest.mark.parametrize("sentence, expected", [
@@ -141,3 +144,36 @@ def test_count_vector_featurizer(sentence, expected):
ftr.process(message)

assert np.all(message.get("text_features")[0] == expected)


@pytest.mark.parametrize("tokens, expected", [
(["hello", "hello", "hello", "hello", "hello"], [5]),
(["你好", "你好", "你好", "你好", "你好"], [5]), # test for unicode chars
(["hello", "goodbye", "hello"], [1, 2]),
    # note: the feature order flips for the Chinese "hello" & "goodbye"
    # because CountVectorizer sorts its vocabulary
    (["你好", "再见", "你好"], [2, 1]),  # test for unicode chars
(["a", "b", "c", "d", "e", "f"], [1, 1, 1, 1, 1, 1]),
(["a", "1", "2"], [2, 1])
])
def test_count_vector_featurizer_using_tokens(tokens, expected):
from rasa_nlu.featurizers.count_vectors_featurizer import \
CountVectorsFeaturizer

ftr = CountVectorsFeaturizer({"token_pattern": r'(?u)\b\w+\b'})

    # use an empty string instead of real text to make sure the count
    # vector can only be built from the `tokens` feature; relying on
    # `message.text` would not produce the correct result
message = Message("")

tokens_feature = [Token(i, 0) for i in tokens]
message.set("tokens", tokens_feature)
message.set("intent", "bla") # this is needed for a valid training example

data = TrainingData([message])

ftr.train(data)
ftr.process(message)

assert np.all(message.get("text_features")[0] == expected)
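
An end-to-end sketch of the feature this test exercises, with jieba producing the tokens instead of a hand-built list. The exact segmentation of the sample sentence is an assumption; the hedged comment on the last line reflects that:

    import jieba
    from rasa_nlu.tokenizers import Token
    from rasa_nlu.training_data import Message, TrainingData
    from rasa_nlu.featurizers.count_vectors_featurizer import \
        CountVectorsFeaturizer

    # segment a Chinese sentence and attach the pieces as `tokens`
    text = "你好再见你好"
    tokens, offset = [], 0
    for word in jieba.cut(text):
        tokens.append(Token(word, offset))
        offset += len(word)

    message = Message("")          # empty text: only `tokens` can supply counts
    message.set("tokens", tokens)
    message.set("intent", "greet_bye")

    ftr = CountVectorsFeaturizer({"token_pattern": r'(?u)\b\w+\b'})
    ftr.train(TrainingData([message]))
    ftr.process(message)
    print(message.get("text_features"))  # e.g. [[2 1]] if jieba yields 你好/再见/你好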
