Merge pull request #1177 from RasaHQ/whitespace_tokenizer
replace punctuation with space
Ghostvv committed Jun 22, 2018
2 parents 467f9b0 + 4e6691f commit ed00590
Showing 4 changed files with 21 additions and 7 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.rst
@@ -32,6 +32,7 @@ Added
 Changed
 -------
 - L1 and L2 regularisation defaults in ``ner_crf`` both set to 0.1
+- ``whitespace_tokenizer`` ignores punctuation ``.,!?`` before whitespace or end of string

 Removed
 -------
12 changes: 7 additions & 5 deletions rasa_nlu/extractors/crf_entity_extractor.py
@@ -128,6 +128,7 @@ def train(self, training_data, config, **kwargs):
         # checks whether there is at least one
         # example with an entity annotation
         if training_data.entity_examples:
+            self._check_spacy_doc(training_data.training_examples[0])

             # filter out pre-trained entity examples
             filtered_entity_examples = self.filter_trainable_entities(
@@ -452,11 +453,12 @@ def _from_json_to_crf(self,
         ents = self._bilou_tags_from_offsets(tokens, entity_offsets)

         if '-' in ents:
-            logger.warn("Misaligned entity annotation in sentence '{}'. "
-                        "Make sure the start and end values of the "
-                        "annotated training examples end at token "
-                        "boundaries (e.g. don't include trailing "
-                        "whitespaces).".format(message.text))
+            logger.warning("Misaligned entity annotation in sentence '{}'. "
+                           "Make sure the start and end values of the "
+                           "annotated training examples end at token "
+                           "boundaries (e.g. don't include trailing "
+                           "whitespaces or punctuation)."
+                           "".format(message.text))
         if not self.component_config["BILOU_flag"]:
             for i, label in enumerate(ents):
                 if self._bilou_from_label(label) in {"B", "I", "U", "L"}:
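A note on the warning above: `_bilou_tags_from_offsets` yields a `-` tag whenever an annotated span does not begin and end exactly on token boundaries, which is why annotations that include trailing whitespace or punctuation now trigger it. A minimal sketch of the alignment check, assuming plain whitespace tokenization; `aligns_with_tokens` is an illustrative helper, not Rasa's API:

    # Illustrative helper, not Rasa's internal API: shows why an annotation
    # that does not end on a token boundary is reported as misaligned.
    def aligns_with_tokens(text, start, end):
        """True if (start, end) coincides with whitespace-token boundaries."""
        boundaries, offset = [], 0
        for word in text.split():
            offset = text.find(word, offset)
            boundaries.append((offset, offset + len(word)))
            offset += len(word)
        return any(s == start for s, _ in boundaries) and \
            any(e == end for _, e in boundaries)

    text = "book a table for 10 people!"
    print(aligns_with_tokens(text, 17, 19))  # True: "10" is exactly one token
    print(aligns_with_tokens(text, 20, 26))  # False: the token is "people!", the annotation "people"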
6 changes: 5 additions & 1 deletion rasa_nlu/tokenizers/whitespace_tokenizer.py
@@ -3,6 +3,7 @@
 from __future__ import print_function
 from __future__ import unicode_literals

+import re
 from typing import Any, List, Text

 from rasa_nlu.components import Component
@@ -31,7 +32,10 @@ def process(self, message, **kwargs):
     def tokenize(self, text):
         # type: (Text) -> List[Token]

-        words = text.split()
+        # there is space or end of string after punctuation
+        # because we do not want to replace 10.000 with 10 000
+        words = re.sub(r'[.,!?]+(\s|$)', ' ', text).split()
+
         running_offset = 0
         tokens = []
         for word in words:
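The substitution above only touches runs of `.,!?` that are immediately followed by whitespace or the end of the string, so interior punctuation such as the thousands separator in `10.000` survives. A standalone check of the pattern (plain `re`, no Rasa imports needed):

    import re

    PATTERN = r'[.,!?]+(\s|$)'

    # trailing punctuation before a space or the end of the string is dropped
    print(re.sub(PATTERN, ' ', "hey, how're you?").split())
    # -> ['hey', "how're", 'you']

    # punctuation inside a token is left untouched
    print(re.sub(PATTERN, ' ', "the price is 10.000 euros.").split())
    # -> ['the', 'price', 'is', '10.000', 'euros']

Replacing the match with a space rather than deleting it keeps neighbouring words from being glued together, and every resulting word still occurs verbatim in the original string, so offsets remain recoverable by substring search.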
9 changes: 8 additions & 1 deletion tests/base/test_tokenizers.py
@@ -15,12 +15,19 @@ def test_whitespace():
     assert [t.offset for t in tk.tokenize("Forecast for lunch")] == \
         [0, 9, 13]

+    # we ignore .,!?
     assert [t.text for t in tk.tokenize("hey ńöñàśçií how're you?")] == \
-        ['hey', 'ńöñàśçií', 'how\'re', 'you?']
+        ['hey', 'ńöñàśçií', 'how\'re', 'you']

     assert [t.offset for t in tk.tokenize("hey ńöñàśçií how're you?")] == \
         [0, 4, 13, 20]

+    assert [t.text for t in tk.tokenize("привет! 10.000, ńöñàśçií. how're you?")] == \
+        ['привет', '10.000', 'ńöñàśçií', 'how\'re', 'you']
+
+    assert [t.offset for t in tk.tokenize("привет! 10.000, ńöñàśçií. how're you?")] == \
+        [0, 8, 16, 26, 33]
+

 def test_spacy(spacy_nlp):
     from rasa_nlu.tokenizers.spacy_tokenizer import SpacyTokenizer
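The expected offsets in the new assertions come from a running-offset search over the original, unmodified string. The diff is truncated before the tokenizer's offset loop, so the following is a sketch under that assumption, with plain `(word, offset)` tuples standing in for Rasa's `Token` objects:

    import re

    def tokenize_with_offsets(text):
        # sketch of the running-offset search; not necessarily the exact
        # Rasa implementation, which returns Token objects instead
        words = re.sub(r'[.,!?]+(\s|$)', ' ', text).split()
        running_offset = 0
        tokens = []
        for word in words:
            word_offset = text.find(word, running_offset)
            running_offset = word_offset + len(word)
            tokens.append((word, word_offset))
        return tokens

    print(tokenize_with_offsets("привет! 10.000, ńöñàśçií. how're you?"))
    # [('привет', 0), ('10.000', 8), ('ńöñàśçií', 16), ("how're", 26), ('you', 33)]

Searching the original text (rather than the punctuation-stripped copy) is what makes the offsets `[0, 8, 16, 26, 33]` in the test line up with the raw input.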
