Commit a1ac37a: merged master

tmbo committed Jul 4, 2017
2 parents: 1c5d04a + 23a7cbc
Showing 59 changed files with 1,488 additions and 1,375 deletions.
1 change: 1 addition & 0 deletions .travis.yml
@@ -1,5 +1,6 @@
language: python
sudo: required
+group: deprecated-2017Q2
services:
- docker
cache:
5 changes: 5 additions & 0 deletions CHANGELOG.rst
@@ -20,9 +20,13 @@ Changed
- logging format of logged requests now includes model name and timestamp
- use module specific loggers instead of default python root logger
- output format of the duckling extractor changed. the ``value`` field now contains the complete value from duckling instead of just the text, i.e. the property is now an object rather than plain text, and it includes granularity information
- deprecated ``intent_examples`` and ``entity_examples`` sections in training data. all examples should go into the ``common_examples`` section
- weight training samples based on class distribution during ner_crf cross validation and sklearn intent classification training
- large refactoring of the internal training data structure and pipeline architecture

Removed
-------
- luis data tokenizer configuration value (not used anymore, luis exports char offsets now)

Fixed
-----
Expand Down Expand Up @@ -90,6 +94,7 @@ Added
- replaced pre-wired backends with more flexible pipeline definitions
- return top 10 intents with sklearn classifier `#199 <https://github.com/RasaHQ/rasa_nlu/pull/199>`_
- python type annotations for nearly all public functions
- added alternative method of defining entity synonyms
- support for arbitrary spacy language model names
- duckling components to provide normalized output for structured entities
- Conditional random field entity extraction (Markov model for entity tagging; better entity recognition with small and medium amounts of training data, and comparable performance on large data sets)
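To illustrate the duckling output change listed above: a minimal sketch of an extracted entity before and after, assuming typical extractor fields. Only the nesting of ``value`` and the granularity field come from the changelog entry; the surrounding field names are assumptions.

```python
# Hypothetical duckling entity for the word "tomorrow" (field names other
# than "value" are assumed; only the shape change is documented above).
old_entity = {
    "entity": "time",
    "start": 13,
    "end": 21,
    "value": "tomorrow",  # previously: just the matched text
}

new_entity = {
    "entity": "time",
    "start": 13,
    "end": 21,
    "value": {  # now: the complete structured value from duckling
        "value": "2017-07-05T00:00:00.000Z",
        "grain": "day",  # granularity information
    },
}
```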
2 changes: 1 addition & 1 deletion LICENSE.txt
@@ -186,7 +186,7 @@
same "printed page" as the copyright notice for easier
identification within third-party archives.

-Copyright 2016 LastMile Technologies Ltd
+Copyright 2017 Lastmile Technologies GmbH

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
2 changes: 1 addition & 1 deletion MANIFEST.in
@@ -1 +1 @@
-include LICENSE README.rst requirements.txt dev_requirements.txt
+include LICENSE README.rst requirements.txt dev-requirements.txt
30 changes: 29 additions & 1 deletion README.md
@@ -112,4 +112,32 @@ Releasing a new version is quite simple, as the packages are build and distribut
git push origin 0.7.x
```
## License
-Licensed under the Apache License, Version 2.0. Copyright 2016 LastMile Technologies Ltd. [Copy of the license](LICENSE.txt).
+Licensed under the Apache License, Version 2.0. Copyright 2017 Lastmile Technologies GmbH. [Copy of the license](LICENSE.txt).

+As a reference, the following is a listing of the licenses of the different dependencies as of this writing.
+Licenses of minimal dependencies:

+| required package | License |
+|------------------|------------------------|
+| gevent | MIT |
+| flask | BSD 3-clause |
+| boto3 | Apache License 2.0 |
+| typing | PSF |
+| future | MIT |
+| six | MIT |
+| jsonschema | MIT |
+| matplotlib | PSF |

+Licenses of optional dependencies (only required for certain components of Rasa NLU):

+| optional package | License |
+|----------------------|----------------------------|
+| MITIE | Boost Software License 1.0 |
+| spacy | MIT |
+| scikit-learn | BSD 3-clause |
+| scipy | BSD 3-clause |
+| numpy | BSD 3-clause |
+| duckling | Apache License 2.0 |
+| sklearn-crfsuite | MIT |
+| cloudpickle | BSD 3-clause |
+| google-cloud-storage | Apache License 2.0 |
13 changes: 7 additions & 6 deletions _pytest/test_classifiers.py
@@ -8,16 +8,17 @@

def test_classifier_regex_no_intent():
from rasa_nlu.classifiers.regex_intent_classifier import RegExIntentClassifier
regex_dict = {u'[0-9]+': u'provide_number',
u'\\bhey*': u'greet'}
txt = "find me indian food!"
ext = RegExIntentClassifier(regex_dict)
-assert ext.parse(txt) == None, "No regexp from the dict matches the input"
+assert ext.find_pattern_match(txt) is None, "No regexp from the dict matches the input"


def test_classifier_regex_intent():
from rasa_nlu.classifiers.regex_intent_classifier import RegExIntentClassifier
regex_dict = {u'[0-9]+': u'provide_number',
u'\\bhey*': u'greet'}
txt = "heyy there!"
ext = RegExIntentClassifier(regex_dict)
-assert ext.parse(txt) == "greet", "Intent should be 'greet'"
+assert ext.find_pattern_match(txt) == "greet", "Intent should be 'greet'"
95 changes: 6 additions & 89 deletions _pytest/test_components.py
@@ -7,8 +7,7 @@
from rasa_nlu.model import Metadata

from rasa_nlu import registry
-from rasa_nlu.components import fill_args, load_component, create_component, MissingArgumentError, \
-find_unavailable_packages, _read_dev_requirements
+from rasa_nlu.components import MissingArgumentError, find_unavailable_packages, _read_dev_requirements
from rasa_nlu.extractors import EntityExtractor


@@ -42,104 +41,22 @@ def test_all_components_are_in_all_components_template():


-@pytest.mark.parametrize("component_class", registry.component_classes)
-def test_all_arguments_can_be_satisfied_during_init(component_class, default_config, component_builder):
-"""Check that `pipeline_init` method parameters can be filled from the context.
-The parameters declared on the `pipeline_init` are not filled directly, rather the method is called via reflection.
-During the reflection, the parameters are filled from a so called context that is created when creating the
-pipeline and gets initialized with the configuration values. To make sure all arguments `pipeline_init` declares
-can be provided during the reflection, we do a 'dry run' where we check all parameters are part of the context."""

-# All available context arguments that will ever be generated during init
-component = component_builder.create_component(component_class.name, default_config)
-context_arguments = {}
-for clz in registry.component_classes:
-for ctx_arg in clz.context_provides.get("pipeline_init", []):
-context_arguments[ctx_arg] = None

-filled_args = fill_args(component.pipeline_init_args(), context_arguments, default_config.as_dict())
-assert len(filled_args) == len(component.pipeline_init_args())


@pytest.mark.parametrize("component_class", registry.component_classes)
-def test_all_arguments_can_be_satisfied_during_train(component_class, default_config, component_builder):
+def test_all_arguments_can_be_satisfied(component_class):
"""Check that `train` method parameters can be filled filled from the context. Similar to `pipeline_init` test."""

-# All available context arguments that will ever be generated during train
-# it might still happen, that in a certain pipeline configuration arguments can not be satisfied!
-component = component_builder.create_component(component_class.name, default_config)
-context_arguments = {"training_data": None}
-for clz in registry.component_classes:
-for ctx_arg in clz.context_provides.get("pipeline_init", []):
-context_arguments[ctx_arg] = None
-for ctx_arg in clz.context_provides.get("train", []):
-context_arguments[ctx_arg] = None

-filled_args = fill_args(component.train_args(), context_arguments, default_config.as_dict())
-assert len(filled_args) == len(component.train_args())


-@pytest.mark.parametrize("component_class", registry.component_classes)
-def test_all_arguments_can_be_satisfied_during_parse(component_class, default_config, component_builder):
-"""Check that `parse` method parameters can be filled from the context. Similar to the `pipeline_init` test."""

-# All available context arguments that will ever be generated during parse
-component = component_builder.create_component(component_class.name, default_config)
-context_arguments = {"text": None}
-for clz in registry.component_classes:
-for ctx_arg in clz.context_provides.get("pipeline_init", []):
-context_arguments[ctx_arg] = None
-for ctx_arg in clz.context_provides.get("process", []):
-context_arguments[ctx_arg] = None

-filled_args = fill_args(component.process_args(), context_arguments, default_config.as_dict())
-assert len(filled_args) == len(component.process_args())
+provided_properties = {provided for c in registry.component_classes for provided in c.provides}


-def test_all_extractors_use_previous_entities():
-extractors = [c for c in registry.component_classes if isinstance(c, EntityExtractor)]
-assert all(["entities" in ex.process_args() for ex in extractors])


-def test_load_can_handle_none():
-assert load_component(None, {}, {}) is None


-def test_create_can_handle_none():
-assert create_component(None, {}) is None


-def test_fill_args_with_unsatisfiable_param_from_context():
-with pytest.raises(MissingArgumentError) as excinfo:
-fill_args(["good_one", "bad_one"], {"good_one": 1}, {})
-assert "bad_one" in str(excinfo.value)
-assert "good_one" not in str(excinfo.value)


-def test_fill_args_with_unsatisfiable_param_from_config():
-with pytest.raises(MissingArgumentError) as excinfo:
-fill_args(["good_one", "bad_one"], {}, {"good_one": 1})
-assert "bad_one" in str(excinfo.value)
-assert "good_one" not in str(excinfo.value)
+for req in component_class.requires:
+assert req in provided_properties, "No component provides required property."


def test_find_unavailable_packages():
unavailable = find_unavailable_packages(["my_made_up_package_name", "io", "foo_bar", "foo_bar"])
assert unavailable == {"my_made_up_package_name", "foo_bar"}


def test_read_dev_requirements(tmpdir):
package_name = "my_made_up_package_name"

# two imaginary packages should be installed if imaginary `package_name` is required
install_names = ["my_install_name_one", "my_install_name_two"]
f = tmpdir.join("tmp-requirements.txt")
f.write("# {}\n{}".format(package_name, "\n".join(install_names)))
requirements = _read_dev_requirements(f.strpath)
assert package_name in requirements
assert requirements[package_name] == install_names


def test_builder_create_unknown(component_builder, default_config):
with pytest.raises(Exception) as excinfo:
component_builder.create_component("my_made_up_componment", default_config)
@@ -148,5 +65,5 @@ def test_builder_create_unknown(component_builder, default_config):

def test_builder_load_unknown(component_builder):
with pytest.raises(Exception) as excinfo:
-component_builder.load_component("my_made_up_componment", {}, {}, Metadata({}, None))
+component_builder.load_component("my_made_up_componment", "", Metadata({}, None))
assert "Unknown component name" in str(excinfo.value)
12 changes: 6 additions & 6 deletions _pytest/test_emulators.py
@@ -8,7 +8,7 @@ def test_luis_request():
from rasa_nlu.emulators.luis import LUISEmulator
em = LUISEmulator()
norm = em.normalise_request_json({"q": ["arb text"]})
-assert norm == {"text": "arb text", "model": "default"}
+assert norm == {"text": "arb text", "model": "default", "time": None}


def test_luis_response():
@@ -78,7 +78,7 @@ def test_wit_request():
from rasa_nlu.emulators.wit import WitEmulator
em = WitEmulator()
norm = em.normalise_request_json({"q": ["arb text"]})
-assert norm == {"text": "arb text", "model": "default"}
+assert norm == {"text": "arb text", "model": "default", "time": None}


def test_wit_response():
@@ -109,7 +109,7 @@ def test_api_request():
from rasa_nlu.emulators.api import ApiEmulator
em = ApiEmulator()
norm = em.normalise_request_json({"q": ["arb text"]})
-assert norm == {"text": "arb text", "model": "default"}
+assert norm == {"text": "arb text", "model": "default", "time": None}


def test_api_response():
@@ -159,10 +159,10 @@ def test_dummy_request():
from rasa_nlu.emulators import NoEmulator
em = NoEmulator()
norm = em.normalise_request_json({"q": ["arb text"]})
-assert norm == {"text": "arb text", "model": "default"}
+assert norm == {"text": "arb text", "model": "default", "time": None}

-norm = em.normalise_request_json({"q": ["arb text"], "model": "specific"})
-assert norm == {"text": "arb text", "model": "specific"}
+norm = em.normalise_request_json({"q": ["arb text"], "model": "specific", "time": "1499279161658"})
+assert norm == {"text": "arb text", "model": "specific", "time": "1499279161658"}


def test_dummy_response():
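The emulator tests above now expect a ``time`` key in the normalised request. A minimal sketch of what ``NoEmulator.normalise_request_json`` would have to do to satisfy them; the actual implementation may differ:

```python
class NoEmulator(object):
    """Sketch: normalises raw query parameters into a common request dict."""

    def normalise_request_json(self, data):
        return {
            # the query text arrives as a one-element list under "q"
            "text": data["q"][0],
            # fall back to the default model if none was requested
            "model": data.get("model", "default"),
            # reference time (e.g. for duckling); None means "now"
            "time": data.get("time", None),
        }
```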
52 changes: 35 additions & 17 deletions _pytest/test_extractors.py
@@ -3,39 +3,44 @@
from __future__ import print_function
from __future__ import unicode_literals

-from rasa_nlu.training_data import TrainingData
+import utilities
+from rasa_nlu.training_data import TrainingData, Message


def test_crf_extractor(spacy_nlp):
from rasa_nlu.extractors.crf_entity_extractor import CRFEntityExtractor
ext = CRFEntityExtractor()
examples = [
-{
-"text": "anywhere in the west",
-"intent": "restaurant_search",
-"entities": [{"start": 16, "end": 20, "value": "west", "entity": "location"}]
-},
-{
-"text": "central indian restaurant",
-"intent": "restaurant_search",
-"entities": [{"start": 0, "end": 7, "value": "central", "entity": "location"}]
-}]
-ext.train(TrainingData(entity_examples_only=examples), spacy_nlp, True, ext.crf_features)
-crf_format = ext._from_text_to_crf('anywhere in the west', spacy_nlp)
+Message("anywhere in the west", {
+"intent": "restaurant_search",
+"entities": [{"start": 16, "end": 20, "value": "west", "entity": "location"}],
+"spacy_doc": spacy_nlp("anywhere in the west")
+}),
+Message("central indian restaurant", {
+"intent": "restaurant_search",
+"entities": [{"start": 0, "end": 7, "value": "central", "entity": "location"}],
+"spacy_doc": spacy_nlp("central indian restaurant")
+})]
+config = {"entity_crf_BILOU_flag": True, "entity_crf_features": ext.crf_features}
+ext.train(TrainingData(training_examples=examples), config)
+sentence = 'anywhere in the west'
+crf_format = ext._from_text_to_crf(Message(sentence, {"spacy_doc": spacy_nlp(sentence)}))
assert ([word[0] for word in crf_format] == ['anywhere', 'in', 'the', 'west'])
feats = ext._sentence_to_features(crf_format)
assert ('BOS' in feats[0])
assert ('EOS' in feats[-1])
assert ('0:low:in' in feats[1])
-ext.extract_entities('anywhere in the west', spacy_nlp)
+sentence = 'anywhere in the west'
+ext.extract_entities(Message(sentence, {"spacy_doc": spacy_nlp(sentence)}))


def test_crf_json_from_BILOU(spacy_nlp):
from rasa_nlu.extractors.crf_entity_extractor import CRFEntityExtractor
ext = CRFEntityExtractor()
ext.BILOU_flag = True
sentence = u"I need a home cleaning close-by"
-r = ext._from_crf_to_json(spacy_nlp(sentence), ['O', 'O', 'O', 'B-what', 'L-what', 'B-where', 'I-where', 'L-where'])
+r = ext._from_crf_to_json(Message(sentence, {"spacy_doc": spacy_nlp(sentence)}),
+['O', 'O', 'O', 'B-what', 'L-what', 'B-where', 'I-where', 'L-where'])
assert len(r) == 2, "There should be two entities"
assert r[0] == {u'start': 9, u'end': 22, u'value': u'home cleaning', u'entity': u'what'}
assert r[1] == {u'start': 23, u'end': 31, u'value': u'close-by', u'entity': u'where'}
@@ -46,38 +51,45 @@ def test_crf_json_from_non_BILOU(spacy_nlp):
ext = CRFEntityExtractor()
ext.BILOU_flag = False
sentence = u"I need a home cleaning close-by"
-r = ext._from_crf_to_json(spacy_nlp(sentence), ['O', 'O', 'O', 'what', 'what', 'where', 'where', 'where'])
+r = ext._from_crf_to_json(Message(sentence, {"spacy_doc": spacy_nlp(sentence)}),
+['O', 'O', 'O', 'what', 'what', 'where', 'where', 'where'])
assert len(r) == 5, "There should be five entities" # non BILOU will split multi-word entities - hence 5
assert r[0] == {u'start': 9, u'end': 13, u'value': u'home', u'entity': u'what'}
assert r[1] == {u'start': 14, u'end': 22, u'value': u'cleaning', u'entity': u'what'}
assert r[2] == {u'start': 23, u'end': 28, u'value': u'close', u'entity': u'where'}
assert r[3] == {u'start': 28, u'end': 29, u'value': u'-', u'entity': u'where'}
assert r[4] == {u'start': 29, u'end': 31, u'value': u'by', u'entity': u'where'}


def test_ner_regex_no_entities():
from rasa_nlu.extractors.regex_entity_extractor import RegExEntityExtractor
regex_dict = {u'\\bmexican\\b': u'mexican',
u'[0-9]+': u'number'}
txt = "I want indian food"
ext = RegExEntityExtractor(regex_dict)
assert ext.extract_entities(txt) == []


def test_ner_regex_multi_entities():
from rasa_nlu.extractors.regex_entity_extractor import RegExEntityExtractor
regex_dict = {u'\\bmexican\\b': u'mexican',
u'[0-9]+': u'number'}
txt = "find me 2 mexican restaurants"
ext = RegExEntityExtractor(regex_dict)
r = ext.extract_entities(txt)
assert r[0] == {u'start': 10, u'end': 17, u'value': 'mexican', u'entity': 'mexican'}
assert r[1] == {u'start': 8, u'end': 9, u'value': '2', u'entity': 'number'}


def test_ner_regex_1_entity():
from rasa_nlu.extractors.regex_entity_extractor import RegExEntityExtractor
regex_dict = {u'\\bmexican\\b': u'mexican',
u'[0-9]+': u'number'}
txt = "my insurance number is 934049430"
ext = RegExEntityExtractor(regex_dict)
r = ext.extract_entities(txt)
assert r[0] == {u'start': 23, u'end': 32, u'value': '934049430', u'entity': 'number'}


+def test_duckling_entity_extractor(component_builder):
+_config = utilities.base_test_conf("all_components")
+_config["duckling_dimensions"] = ["time"]
+duckling = component_builder.create_component("ner_duckling", _config)
+message = Message("Today is the 5th of May. Let us meet tomorrow.")
+duckling.process(message)
+entities = message.get("entities")
+assert len(entities) == 3
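
Throughout the refactored tests, raw strings plus a spacy doc have been replaced by ``Message`` objects. A minimal sketch of such a container, assuming it merely bundles the text with data produced by pipeline components; the real class may carry more:

```python
class Message(object):
    """Sketch: carries the text plus properties set by pipeline components."""

    def __init__(self, text, data=None):
        self.text = text
        self.data = data if data is not None else {}

    def get(self, prop, default=None):
        # e.g. message.get("entities") after an extractor has run
        return self.data.get(prop, default)

    def set(self, prop, value):
        # components store their output here, e.g. "spacy_doc" or "entities"
        self.data[prop] = value
```

Components read their declared ``requires`` properties from the message and write their ``provides`` properties back onto it, which is exactly the invariant the rewritten component test checks.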
