Merge pull request #1115 from howl-anderson/feature/count_vector_from_tokens

New feature: the `count_vectors_featurizer` component can use `tokens` provided by tokenizers
Ghostvv committed Jun 1, 2018
2 parents 2de4dbb + b4136e5 commit 17d3962
Showing 4 changed files with 47 additions and 4 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.rst
@@ -12,6 +12,7 @@ This project adheres to `Semantic Versioning`_ starting with version 0.7.0.
Added
-----
- doc link to a community contribution for Rasa NLU in Chinese
- support for the ``count_vectors_featurizer`` component to use the ``tokens`` feature provided by tokenizers

Changed
-------
2 changes: 2 additions & 0 deletions docs/pipeline.rst
@@ -270,6 +270,8 @@ intent_featurizer_count_vectors
Creates bag-of-words representation of intent features using
`sklearn's CountVectorizer <http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html>`_. All tokens which consist only of digits (e.g. 123 and 99 but not a123d) will be assigned to the same feature.

.. note:: If the words of the model's language cannot be split by whitespace, a language-specific tokenizer is required in the pipeline before this component (e.g. using ``tokenizer_jieba`` for Chinese)
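For concreteness, a hedged sketch of such a pipeline as a plain Python dict. The component names follow the rasa_nlu registry of this era; the embedding classifier is an illustrative assumption, not part of this PR:

    # hypothetical Chinese pipeline config, for illustration only
    chinese_pipeline = {
        "language": "zh",
        "pipeline": [
            {"name": "tokenizer_jieba"},                        # fills the `tokens` feature
            {"name": "intent_featurizer_count_vectors"},        # consumes `tokens`
            {"name": "intent_classifier_tensorflow_embedding"}, # assumed classifier
        ],
    }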

:Configuration:
See `sklearn's CountVectorizer docs <http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html>`_
for a detailed description of the configuration parameters.
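Because CountVectorizer assigns feature indices in sorted vocabulary order, the position of each token's count in the feature vector depends on lexical order; this is why the English and Chinese test cases later in this commit expect different orderings. A quick plain-sklearn sketch (no rasa_nlu involved):

    # feature order follows the sorted vocabulary;
    # on sklearn >= 1.2 use get_feature_names_out() instead
    from sklearn.feature_extraction.text import CountVectorizer

    vect = CountVectorizer(token_pattern=r'(?u)\b\w+\b')
    vect.fit(["hello goodbye hello"])
    print(vect.get_feature_names())   # ['goodbye', 'hello'] -> counts [1, 2]

    vect.fit(["你好 再见 你好"])
    print(vect.get_feature_names())   # ['你好', '再见'] -> counts [2, 1]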
12 changes: 8 additions & 4 deletions rasa_nlu/featurizers/count_vectors_featurizer.py
@@ -126,7 +126,7 @@ def train(self, training_data, cfg=None, **kwargs):
             max_features=self.max_features,
             preprocessor=self.preprocessor)

-        lem_exs = [self._lemmatize(example)
+        lem_exs = [self._get_message_text(example)
                    for example in training_data.intent_examples]

         try:
@@ -146,13 +146,17 @@ def process(self, message, **kwargs):
"component is either not trained or "
"didn't receive enough training data")
else:
bag = self.vect.transform([self._lemmatize(message)]).toarray()
bag = self.vect.transform(
[self._get_message_text(message)]
).toarray()
message.set("text_features", bag)

     @staticmethod
-    def _lemmatize(message):
-        if message.get("spacy_doc"):
+    def _get_message_text(message):
+        if message.get("spacy_doc"):  # if lemmatization is possible
             return ' '.join([t.lemma_ for t in message.get("spacy_doc")])
+        elif message.get("tokens"):  # if tokens were provided directly
+            return ' '.join([t.text for t in message.get("tokens")])
         else:
             return message.text
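To make the fallback order concrete (spaCy lemmas, then provided tokens, then the raw text), a minimal sketch that pokes at the private helper for illustration, using the same Message/Token classes as the tests below:

    from rasa_nlu.training_data import Message
    from rasa_nlu.tokenizers import Token
    from rasa_nlu.featurizers.count_vectors_featurizer import \
        CountVectorsFeaturizer

    msg = Message("raw text")
    # neither "spacy_doc" nor "tokens" is set, so the raw text is used
    assert CountVectorsFeaturizer._get_message_text(msg) == "raw text"

    msg.set("tokens", [Token("你好", 0), Token("再见", 2)])
    # now the tokens branch wins (no spacy_doc present)
    assert CountVectorsFeaturizer._get_message_text(msg) == "你好 再见"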

36 changes: 36 additions & 0 deletions tests/base/test_featurizers.py
@@ -1,3 +1,5 @@
# -*- coding: utf-8 -*-

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
Expand All @@ -14,6 +16,7 @@
from rasa_nlu.tokenizers.spacy_tokenizer import SpacyTokenizer
from rasa_nlu.training_data import Message
from rasa_nlu.training_data import TrainingData
from rasa_nlu.tokenizers import Token


@pytest.mark.parametrize("sentence, expected", [
@@ -141,3 +144,36 @@ def test_count_vector_featurizer(sentence, expected):
ftr.process(message)

assert np.all(message.get("text_features")[0] == expected)


@pytest.mark.parametrize("tokens, expected", [
(["hello", "hello", "hello", "hello", "hello"], [5]),
(["你好", "你好", "你好", "你好", "你好"], [5]), # test for unicode chars
(["hello", "goodbye", "hello"], [1, 2]),
    # note: the feature order flips for the Chinese "hello" & "goodbye"
    # because CountVectorizer sorts its vocabulary
    (["你好", "再见", "你好"], [2, 1]),  # test for unicode chars
(["a", "b", "c", "d", "e", "f"], [1, 1, 1, 1, 1, 1]),
(["a", "1", "2"], [2, 1])
])
def test_count_vector_featurizer_using_tokens(tokens, expected):
from rasa_nlu.featurizers.count_vectors_featurizer import \
CountVectorsFeaturizer

ftr = CountVectorsFeaturizer({"token_pattern": r'(?u)\b\w+\b'})

    # use an empty string instead of real text to make sure the count
    # vector can only be built from the `tokens` feature; relying on
    # `message.text` would not produce the correct result
message = Message("")

tokens_feature = [Token(i, 0) for i in tokens]
message.set("tokens", tokens_feature)
message.set("intent", "bla") # this is needed for a valid training example

data = TrainingData([message])

ftr.train(data)
ftr.process(message)

assert np.all(message.get("text_features")[0] == expected)
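
An end-to-end sketch of the feature this test exercises, with jieba producing the tokens instead of a hand-built list. The exact segmentation of the sample sentence is an assumption; the hedged comment on the last line reflects that:

    import jieba
    from rasa_nlu.tokenizers import Token
    from rasa_nlu.training_data import Message, TrainingData
    from rasa_nlu.featurizers.count_vectors_featurizer import \
        CountVectorsFeaturizer

    # segment a Chinese sentence and attach the pieces as `tokens`
    text = "你好再见你好"
    tokens, offset = [], 0
    for word in jieba.cut(text):
        tokens.append(Token(word, offset))
        offset += len(word)

    message = Message("")          # empty text: only `tokens` can supply counts
    message.set("tokens", tokens)
    message.set("intent", "greet_bye")

    ftr = CountVectorsFeaturizer({"token_pattern": r'(?u)\b\w+\b'})
    ftr.train(TrainingData([message]))
    ftr.process(message)
    print(message.get("text_features"))  # e.g. [[2 1]] if jieba yields 你好/再见/你好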
