Merge pull request #1177 from RasaHQ/whitespace_tokenizer
replace punctuation with space
Ghostvv committed Jun 22, 2018
2 parents 467f9b0 + 4e6691f commit ed00590
Showing 4 changed files with 21 additions and 7 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.rst
@@ -32,6 +32,7 @@ Added
 Changed
 -------
 - L1 and L2 regularisation defaults in ``ner_crf`` both set to 0.1
+- ``whitespace_tokenizer`` ignores punctuation ``.,!?`` before whitespace or end of string

 Removed
 -------
12 changes: 7 additions & 5 deletions rasa_nlu/extractors/crf_entity_extractor.py
@@ -128,6 +128,7 @@ def train(self, training_data, config, **kwargs):
         # checks whether there is at least one
         # example with an entity annotation
         if training_data.entity_examples:
+            self._check_spacy_doc(training_data.training_examples[0])

             # filter out pre-trained entity examples
             filtered_entity_examples = self.filter_trainable_entities(
@@ -452,11 +453,12 @@ def _from_json_to_crf(self,
         ents = self._bilou_tags_from_offsets(tokens, entity_offsets)

         if '-' in ents:
-            logger.warn("Misaligned entity annotation in sentence '{}'. "
-                        "Make sure the start and end values of the "
-                        "annotated training examples end at token "
-                        "boundaries (e.g. don't include trailing "
-                        "whitespaces).".format(message.text))
+            logger.warning("Misaligned entity annotation in sentence '{}'. "
+                           "Make sure the start and end values of the "
+                           "annotated training examples end at token "
+                           "boundaries (e.g. don't include trailing "
+                           "whitespaces or punctuation)."
+                           "".format(message.text))
         if not self.component_config["BILOU_flag"]:
             for i, label in enumerate(ents):
                 if self._bilou_from_label(label) in {"B", "I", "U", "L"}:
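A note on the warning above: `_bilou_tags_from_offsets` yields a `-` tag whenever an annotated span does not begin and end exactly on token boundaries, which is why annotations that include trailing whitespace or punctuation now trigger it. A minimal sketch of the alignment check, assuming plain whitespace tokenization; `aligns_with_tokens` is an illustrative helper, not Rasa's API:

    # Illustrative helper, not Rasa's internal API: shows why an annotation
    # that does not end on a token boundary is reported as misaligned.
    def aligns_with_tokens(text, start, end):
        """True if (start, end) coincides with whitespace-token boundaries."""
        boundaries, offset = [], 0
        for word in text.split():
            offset = text.find(word, offset)
            boundaries.append((offset, offset + len(word)))
            offset += len(word)
        return any(s == start for s, _ in boundaries) and \
            any(e == end for _, e in boundaries)

    text = "book a table for 10 people!"
    print(aligns_with_tokens(text, 17, 19))  # True: "10" is exactly one token
    print(aligns_with_tokens(text, 20, 26))  # False: the token is "people!", the annotation "people"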
6 changes: 5 additions & 1 deletion rasa_nlu/tokenizers/whitespace_tokenizer.py
@@ -3,6 +3,7 @@
 from __future__ import print_function
 from __future__ import unicode_literals

+import re
 from typing import Any, List, Text

 from rasa_nlu.components import Component
@@ -31,7 +32,10 @@ def process(self, message, **kwargs):
     def tokenize(self, text):
         # type: (Text) -> List[Token]

-        words = text.split()
+        # there is space or end of string after punctuation
+        # because we do not want to replace 10.000 with 10 000
+        words = re.sub(r'[.,!?]+(\s|$)', ' ', text).split()
+
         running_offset = 0
         tokens = []
         for word in words:
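The substitution above only touches runs of `.,!?` that are immediately followed by whitespace or the end of the string, so interior punctuation such as the thousands separator in `10.000` survives. A standalone check of the pattern (plain `re`, no Rasa imports needed):

    import re

    PATTERN = r'[.,!?]+(\s|$)'

    # trailing punctuation before a space or the end of the string is dropped
    print(re.sub(PATTERN, ' ', "hey, how're you?").split())
    # -> ['hey', "how're", 'you']

    # punctuation inside a token is left untouched
    print(re.sub(PATTERN, ' ', "the price is 10.000 euros.").split())
    # -> ['the', 'price', 'is', '10.000', 'euros']

Replacing the match with a space rather than deleting it keeps neighbouring words from being glued together, and every resulting word still occurs verbatim in the original string, so offsets remain recoverable by substring search.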
9 changes: 8 additions & 1 deletion tests/base/test_tokenizers.py
@@ -15,12 +15,19 @@ def test_whitespace():
     assert [t.offset for t in tk.tokenize("Forecast for lunch")] == \
         [0, 9, 13]

+    # we ignore .,!?
     assert [t.text for t in tk.tokenize("hey ńöñàśçií how're you?")] == \
-        ['hey', 'ńöñàśçií', 'how\'re', 'you?']
+        ['hey', 'ńöñàśçií', 'how\'re', 'you']

     assert [t.offset for t in tk.tokenize("hey ńöñàśçií how're you?")] == \
         [0, 4, 13, 20]

+    assert [t.text for t in tk.tokenize("привет! 10.000, ńöñàśçií. how're you?")] == \
+        ['привет', '10.000', 'ńöñàśçií', 'how\'re', 'you']
+
+    assert [t.offset for t in tk.tokenize("привет! 10.000, ńöñàśçií. how're you?")] == \
+        [0, 8, 16, 26, 33]
+

 def test_spacy(spacy_nlp):
     from rasa_nlu.tokenizers.spacy_tokenizer import SpacyTokenizer
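The expected offsets in the new assertions come from a running-offset search over the original, unmodified string. The diff is truncated before the tokenizer's offset loop, so the following is a sketch under that assumption, with plain `(word, offset)` tuples standing in for Rasa's `Token` objects:

    import re

    def tokenize_with_offsets(text):
        # sketch of the running-offset search; not necessarily the exact
        # Rasa implementation, which returns Token objects instead
        words = re.sub(r'[.,!?]+(\s|$)', ' ', text).split()
        running_offset = 0
        tokens = []
        for word in words:
            word_offset = text.find(word, running_offset)
            running_offset = word_offset + len(word)
            tokens.append((word, word_offset))
        return tokens

    print(tokenize_with_offsets("привет! 10.000, ńöñàśçií. how're you?"))
    # [('привет', 0), ('10.000', 8), ('ńöñàśçií', 16), ("how're", 26), ('you', 33)]

Searching the original text (rather than the punctuation-stripped copy) is what makes the offsets `[0, 8, 16, 26, 33]` in the test line up with the raw input.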
