Commit a1ac37a: merged master

tmbo committed Jul 4, 2017
2 parents: 1c5d04a + 23a7cbc
Showing 59 changed files with 1,488 additions and 1,375 deletions.
1 change: 1 addition & 0 deletions .travis.yml
@@ -1,5 +1,6 @@
language: python
sudo: required
+group: deprecated-2017Q2
services:
- docker
cache:
5 changes: 5 additions & 0 deletions CHANGELOG.rst
@@ -20,9 +20,13 @@ Changed
- logging format of logged requests now includes model name and timestamp
- use module specific loggers instead of default python root logger
- output format of the duckling extractor changed. the ``value`` field now contains the complete value from duckling instead of just the text, i.e. the property is now an object rather than plain text, and it includes granularity information
- deprecated ``intent_examples`` and ``entity_examples`` sections in training data. all examples should go into the ``common_examples`` section
- weight training samples based on class distribution during ner_crf cross validation and sklearn intent classification training
- large refactoring of the internal training data structure and pipeline architecture

Removed
-------
- luis data tokenizer configuration value (not used anymore, luis exports char offsets now)

Fixed
-----
Expand Down Expand Up @@ -90,6 +94,7 @@ Added
- replaced pre-wired backends with more flexible pipeline definitions
- return top 10 intents with sklearn classifier `#199 <https://github.com/RasaHQ/rasa_nlu/pull/199>`_
- python type annotations for nearly all public functions
- added alternative method of defining entity synonyms
- support for arbitrary spacy language model names
- duckling components to provide normalized output for structured entities
- Conditional random field entity extraction (Markov model for entity tagging; better entity recognition with small and medium amounts of training data, and comparable performance on large data sets)
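To illustrate the duckling output change listed above: a minimal sketch of an extracted entity before and after, assuming typical extractor fields. Only the nesting of ``value`` and the granularity field come from the changelog entry; the surrounding field names are assumptions.

```python
# Hypothetical duckling entity for the word "tomorrow" (field names other
# than "value" are assumed; only the shape change is documented above).
old_entity = {
    "entity": "time",
    "start": 13,
    "end": 21,
    "value": "tomorrow",  # previously: just the matched text
}

new_entity = {
    "entity": "time",
    "start": 13,
    "end": 21,
    "value": {  # now: the complete structured value from duckling
        "value": "2017-07-05T00:00:00.000Z",
        "grain": "day",  # granularity information
    },
}
```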
2 changes: 1 addition & 1 deletion LICENSE.txt
@@ -186,7 +186,7 @@
same "printed page" as the copyright notice for easier
identification within third-party archives.

-Copyright 2016 LastMile Technologies Ltd
+Copyright 2017 Lastmile Technologies GmbH

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
2 changes: 1 addition & 1 deletion MANIFEST.in
@@ -1 +1 @@
-include LICENSE README.rst requirements.txt dev_requirements.txt
+include LICENSE README.rst requirements.txt dev-requirements.txt
30 changes: 29 additions & 1 deletion README.md
@@ -112,4 +112,32 @@ Releasing a new version is quite simple, as the packages are build and distribut
git push origin 0.7.x
```
## License
-Licensed under the Apache License, Version 2.0. Copyright 2016 LastMile Technologies Ltd. [Copy of the license](LICENSE.txt).
+Licensed under the Apache License, Version 2.0. Copyright 2017 Lastmile Technologies GmbH. [Copy of the license](LICENSE.txt).

+As a reference, the following is a listing of the licenses of the different dependencies as of this writing.
+Licenses of minimal dependencies:

+| required package | License |
+|------------------|------------------------|
+| gevent | MIT |
+| flask | BSD 3-clause |
+| boto3 | Apache License 2.0 |
+| typing | PSF |
+| future | MIT |
+| six | MIT |
+| jsonschema | MIT |
+| matplotlib | PSF |

+Licenses of optional dependencies (only required for certain components of Rasa NLU):

+| optional package | License |
+|----------------------|----------------------------|
+| MITIE | Boost Software License 1.0 |
+| spacy | MIT |
+| scikit-learn | BSD 3-clause |
+| scipy | BSD 3-clause |
+| numpy | BSD 3-clause |
+| duckling | Apache License 2.0 |
+| sklearn-crfsuite | MIT |
+| cloudpickle | BSD 3-clause |
+| google-cloud-storage | Apache License 2.0 |
13 changes: 7 additions & 6 deletions _pytest/test_classifiers.py
@@ -8,16 +8,17 @@

def test_classifier_regex_no_intent():
from rasa_nlu.classifiers.regex_intent_classifier import RegExIntentClassifier
regex_dict = {u'[0-9]+': u'provide_number',
u'\\bhey*': u'greet'}
txt = "find me indian food!"
ext = RegExIntentClassifier(regex_dict)
-assert ext.parse(txt) == None, "No regexp from the dict matches the input"
+assert ext.find_pattern_match(txt) is None, "No regexp from the dict matches the input"


def test_classifier_regex_intent():
from rasa_nlu.classifiers.regex_intent_classifier import RegExIntentClassifier
regex_dict = {u'[0-9]+': u'provide_number',
u'\\bhey*': u'greet'}
txt = "heyy there!"
ext = RegExIntentClassifier(regex_dict)
-assert ext.parse(txt) == "greet", "Intent should be 'greet'"
+assert ext.find_pattern_match(txt) == "greet", "Intent should be 'greet'"
95 changes: 6 additions & 89 deletions _pytest/test_components.py
@@ -7,8 +7,7 @@
from rasa_nlu.model import Metadata

from rasa_nlu import registry
-from rasa_nlu.components import fill_args, load_component, create_component, MissingArgumentError, \
-find_unavailable_packages, _read_dev_requirements
+from rasa_nlu.components import MissingArgumentError, find_unavailable_packages, _read_dev_requirements
from rasa_nlu.extractors import EntityExtractor


@@ -42,104 +41,22 @@ def test_all_components_are_in_all_components_template():


-@pytest.mark.parametrize("component_class", registry.component_classes)
-def test_all_arguments_can_be_satisfied_during_init(component_class, default_config, component_builder):
-"""Check that `pipeline_init` method parameters can be filled from the context.
-The parameters declared on the `pipeline_init` are not filled directly, rather the method is called via reflection.
-During the reflection, the parameters are filled from a so called context that is created when creating the
-pipeline and gets initialized with the configuration values. To make sure all arguments `pipeline_init` declares
-can be provided during the reflection, we do a 'dry run' where we check all parameters are part of the context."""

-# All available context arguments that will ever be generated during init
-component = component_builder.create_component(component_class.name, default_config)
-context_arguments = {}
-for clz in registry.component_classes:
-for ctx_arg in clz.context_provides.get("pipeline_init", []):
-context_arguments[ctx_arg] = None

-filled_args = fill_args(component.pipeline_init_args(), context_arguments, default_config.as_dict())
-assert len(filled_args) == len(component.pipeline_init_args())


@pytest.mark.parametrize("component_class", registry.component_classes)
-def test_all_arguments_can_be_satisfied_during_train(component_class, default_config, component_builder):
+def test_all_arguments_can_be_satisfied(component_class):
"""Check that `train` method parameters can be filled filled from the context. Similar to `pipeline_init` test."""

-# All available context arguments that will ever be generated during train
-# it might still happen, that in a certain pipeline configuration arguments can not be satisfied!
-component = component_builder.create_component(component_class.name, default_config)
-context_arguments = {"training_data": None}
-for clz in registry.component_classes:
-for ctx_arg in clz.context_provides.get("pipeline_init", []):
-context_arguments[ctx_arg] = None
-for ctx_arg in clz.context_provides.get("train", []):
-context_arguments[ctx_arg] = None

-filled_args = fill_args(component.train_args(), context_arguments, default_config.as_dict())
-assert len(filled_args) == len(component.train_args())


-@pytest.mark.parametrize("component_class", registry.component_classes)
-def test_all_arguments_can_be_satisfied_during_parse(component_class, default_config, component_builder):
-"""Check that `parse` method parameters can be filled from the context. Similar to the `pipeline_init` test."""

-# All available context arguments that will ever be generated during parse
-component = component_builder.create_component(component_class.name, default_config)
-context_arguments = {"text": None}
-for clz in registry.component_classes:
-for ctx_arg in clz.context_provides.get("pipeline_init", []):
-context_arguments[ctx_arg] = None
-for ctx_arg in clz.context_provides.get("process", []):
-context_arguments[ctx_arg] = None

-filled_args = fill_args(component.process_args(), context_arguments, default_config.as_dict())
-assert len(filled_args) == len(component.process_args())
+provided_properties = {provided for c in registry.component_classes for provided in c.provides}


-def test_all_extractors_use_previous_entities():
-extractors = [c for c in registry.component_classes if isinstance(c, EntityExtractor)]
-assert all(["entities" in ex.process_args() for ex in extractors])


-def test_load_can_handle_none():
-assert load_component(None, {}, {}) is None


-def test_create_can_handle_none():
-assert create_component(None, {}) is None


-def test_fill_args_with_unsatisfiable_param_from_context():
-with pytest.raises(MissingArgumentError) as excinfo:
-fill_args(["good_one", "bad_one"], {"good_one": 1}, {})
-assert "bad_one" in str(excinfo.value)
-assert "good_one" not in str(excinfo.value)


-def test_fill_args_with_unsatisfiable_param_from_config():
-with pytest.raises(MissingArgumentError) as excinfo:
-fill_args(["good_one", "bad_one"], {}, {"good_one": 1})
-assert "bad_one" in str(excinfo.value)
-assert "good_one" not in str(excinfo.value)
+for req in component_class.requires:
+assert req in provided_properties, "No component provides required property."


def test_find_unavailable_packages():
unavailable = find_unavailable_packages(["my_made_up_package_name", "io", "foo_bar", "foo_bar"])
assert unavailable == {"my_made_up_package_name", "foo_bar"}


def test_read_dev_requirements(tmpdir):
package_name = "my_made_up_package_name"

# two imaginary packages should be installed if imaginary `package_name` is required
install_names = ["my_install_name_one", "my_install_name_two"]
f = tmpdir.join("tmp-requirements.txt")
f.write("# {}\n{}".format(package_name, "\n".join(install_names)))
requirements = _read_dev_requirements(f.strpath)
assert package_name in requirements
assert requirements[package_name] == install_names


def test_builder_create_unknown(component_builder, default_config):
with pytest.raises(Exception) as excinfo:
component_builder.create_component("my_made_up_componment", default_config)
@@ -148,5 +65,5 @@ def test_builder_create_unknown(component_builder, default_config):

def test_builder_load_unknown(component_builder):
with pytest.raises(Exception) as excinfo:
-component_builder.load_component("my_made_up_componment", {}, {}, Metadata({}, None))
+component_builder.load_component("my_made_up_componment", "", Metadata({}, None))
assert "Unknown component name" in str(excinfo.value)
12 changes: 6 additions & 6 deletions _pytest/test_emulators.py
@@ -8,7 +8,7 @@ def test_luis_request():
from rasa_nlu.emulators.luis import LUISEmulator
em = LUISEmulator()
norm = em.normalise_request_json({"q": ["arb text"]})
-assert norm == {"text": "arb text", "model": "default"}
+assert norm == {"text": "arb text", "model": "default", "time": None}


def test_luis_response():
@@ -78,7 +78,7 @@ def test_wit_request():
from rasa_nlu.emulators.wit import WitEmulator
em = WitEmulator()
norm = em.normalise_request_json({"q": ["arb text"]})
-assert norm == {"text": "arb text", "model": "default"}
+assert norm == {"text": "arb text", "model": "default", "time": None}


def test_wit_response():
@@ -109,7 +109,7 @@ def test_api_request():
from rasa_nlu.emulators.api import ApiEmulator
em = ApiEmulator()
norm = em.normalise_request_json({"q": ["arb text"]})
-assert norm == {"text": "arb text", "model": "default"}
+assert norm == {"text": "arb text", "model": "default", "time": None}


def test_api_response():
@@ -159,10 +159,10 @@ def test_dummy_request():
from rasa_nlu.emulators import NoEmulator
em = NoEmulator()
norm = em.normalise_request_json({"q": ["arb text"]})
-assert norm == {"text": "arb text", "model": "default"}
+assert norm == {"text": "arb text", "model": "default", "time": None}

-norm = em.normalise_request_json({"q": ["arb text"], "model": "specific"})
-assert norm == {"text": "arb text", "model": "specific"}
+norm = em.normalise_request_json({"q": ["arb text"], "model": "specific", "time": "1499279161658"})
+assert norm == {"text": "arb text", "model": "specific", "time": "1499279161658"}


def test_dummy_response():
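The emulator tests above now expect a ``time`` key in the normalised request. A minimal sketch of what ``NoEmulator.normalise_request_json`` would have to do to satisfy them; the actual implementation may differ:

```python
class NoEmulator(object):
    """Sketch: normalises raw query parameters into a common request dict."""

    def normalise_request_json(self, data):
        return {
            # the query text arrives as a one-element list under "q"
            "text": data["q"][0],
            # fall back to the default model if none was requested
            "model": data.get("model", "default"),
            # reference time (e.g. for duckling); None means "now"
            "time": data.get("time", None),
        }
```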
52 changes: 35 additions & 17 deletions _pytest/test_extractors.py
@@ -3,39 +3,44 @@
from __future__ import print_function
from __future__ import unicode_literals

-from rasa_nlu.training_data import TrainingData
+import utilities
+from rasa_nlu.training_data import TrainingData, Message


def test_crf_extractor(spacy_nlp):
from rasa_nlu.extractors.crf_entity_extractor import CRFEntityExtractor
ext = CRFEntityExtractor()
examples = [
-{
-"text": "anywhere in the west",
-"intent": "restaurant_search",
-"entities": [{"start": 16, "end": 20, "value": "west", "entity": "location"}]
-},
-{
-"text": "central indian restaurant",
-"intent": "restaurant_search",
-"entities": [{"start": 0, "end": 7, "value": "central", "entity": "location"}]
-}]
-ext.train(TrainingData(entity_examples_only=examples), spacy_nlp, True, ext.crf_features)
-crf_format = ext._from_text_to_crf('anywhere in the west', spacy_nlp)
+Message("anywhere in the west", {
+"intent": "restaurant_search",
+"entities": [{"start": 16, "end": 20, "value": "west", "entity": "location"}],
+"spacy_doc": spacy_nlp("anywhere in the west")
+}),
+Message("central indian restaurant", {
+"intent": "restaurant_search",
+"entities": [{"start": 0, "end": 7, "value": "central", "entity": "location"}],
+"spacy_doc": spacy_nlp("central indian restaurant")
+})]
+config = {"entity_crf_BILOU_flag": True, "entity_crf_features": ext.crf_features}
+ext.train(TrainingData(training_examples=examples), config)
+sentence = 'anywhere in the west'
+crf_format = ext._from_text_to_crf(Message(sentence, {"spacy_doc": spacy_nlp(sentence)}))
assert ([word[0] for word in crf_format] == ['anywhere', 'in', 'the', 'west'])
feats = ext._sentence_to_features(crf_format)
assert ('BOS' in feats[0])
assert ('EOS' in feats[-1])
assert ('0:low:in' in feats[1])
-ext.extract_entities('anywhere in the west', spacy_nlp)
+sentence = 'anywhere in the west'
+ext.extract_entities(Message(sentence, {"spacy_doc": spacy_nlp(sentence)}))


def test_crf_json_from_BILOU(spacy_nlp):
from rasa_nlu.extractors.crf_entity_extractor import CRFEntityExtractor
ext = CRFEntityExtractor()
ext.BILOU_flag = True
sentence = u"I need a home cleaning close-by"
-r = ext._from_crf_to_json(spacy_nlp(sentence), ['O', 'O', 'O', 'B-what', 'L-what', 'B-where', 'I-where', 'L-where'])
+r = ext._from_crf_to_json(Message(sentence, {"spacy_doc": spacy_nlp(sentence)}),
+['O', 'O', 'O', 'B-what', 'L-what', 'B-where', 'I-where', 'L-where'])
assert len(r) == 2, "There should be two entities"
assert r[0] == {u'start': 9, u'end': 22, u'value': u'home cleaning', u'entity': u'what'}
assert r[1] == {u'start': 23, u'end': 31, u'value': u'close-by', u'entity': u'where'}
@@ -46,38 +51,45 @@ def test_crf_json_from_non_BILOU(spacy_nlp):
ext = CRFEntityExtractor()
ext.BILOU_flag = False
sentence = u"I need a home cleaning close-by"
-r = ext._from_crf_to_json(spacy_nlp(sentence), ['O', 'O', 'O', 'what', 'what', 'where', 'where', 'where'])
+r = ext._from_crf_to_json(Message(sentence, {"spacy_doc": spacy_nlp(sentence)}),
+['O', 'O', 'O', 'what', 'what', 'where', 'where', 'where'])
assert len(r) == 5, "There should be five entities" # non BILOU will split multi-word entities - hence 5
assert r[0] == {u'start': 9, u'end': 13, u'value': u'home', u'entity': u'what'}
assert r[1] == {u'start': 14, u'end': 22, u'value': u'cleaning', u'entity': u'what'}
assert r[2] == {u'start': 23, u'end': 28, u'value': u'close', u'entity': u'where'}
assert r[3] == {u'start': 28, u'end': 29, u'value': u'-', u'entity': u'where'}
assert r[4] == {u'start': 29, u'end': 31, u'value': u'by', u'entity': u'where'}


def test_ner_regex_no_entities():
from rasa_nlu.extractors.regex_entity_extractor import RegExEntityExtractor
regex_dict = {u'\\bmexican\\b': u'mexican',
u'[0-9]+': u'number'}
txt = "I want indian food"
ext = RegExEntityExtractor(regex_dict)
assert ext.extract_entities(txt) == []


def test_ner_regex_multi_entities():
from rasa_nlu.extractors.regex_entity_extractor import RegExEntityExtractor
regex_dict = {u'\\bmexican\\b': u'mexican',
u'[0-9]+': u'number'}
txt = "find me 2 mexican restaurants"
ext = RegExEntityExtractor(regex_dict)
r = ext.extract_entities(txt)
assert r[0] == {u'start': 10, u'end': 17, u'value': 'mexican', u'entity': 'mexican'}
assert r[1] == {u'start': 8, u'end': 9, u'value': '2', u'entity': 'number'}


def test_ner_regex_1_entity():
from rasa_nlu.extractors.regex_entity_extractor import RegExEntityExtractor
regex_dict = {u'\\bmexican\\b': u'mexican',
u'[0-9]+': u'number'}
txt = "my insurance number is 934049430"
ext = RegExEntityExtractor(regex_dict)
r = ext.extract_entities(txt)
assert r[0] == {u'start': 23, u'end': 32, u'value': '934049430', u'entity': 'number'}


+def test_duckling_entity_extractor(component_builder):
+_config = utilities.base_test_conf("all_components")
+_config["duckling_dimensions"] = ["time"]
+duckling = component_builder.create_component("ner_duckling", _config)
+message = Message("Today is the 5th of May. Let us meet tomorrow.")
+duckling.process(message)
+entities = message.get("entities")
+assert len(entities) == 3
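
Throughout the refactored tests, raw strings plus a spacy doc have been replaced by ``Message`` objects. A minimal sketch of such a container, assuming it merely bundles the text with data produced by pipeline components; the real class may carry more:

```python
class Message(object):
    """Sketch: carries the text plus properties set by pipeline components."""

    def __init__(self, text, data=None):
        self.text = text
        self.data = data if data is not None else {}

    def get(self, prop, default=None):
        # e.g. message.get("entities") after an extractor has run
        return self.data.get(prop, default)

    def set(self, prop, value):
        # components store their output here, e.g. "spacy_doc" or "entities"
        self.data[prop] = value
```

Components read their declared ``requires`` properties from the message and write their ``provides`` properties back onto it, which is exactly the invariant the rewritten component test checks.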
