Merge 3a6327c into 2edc18a

RasaHQ · Feb 26, 2020 · 3dad743 · 3dad743
2 parents 2edc18a + 3a6327c
commit 3dad743
Show file tree

Hide file tree

Showing 8 changed files with 180 additions and 13 deletions.
diff --git a/rasa/nlu/components.py b/rasa/nlu/components.py
@@ -1,9 +1,10 @@
+import itertools
 import logging
 import typing
-from typing import Any, Dict, Hashable, List, Optional, Set, Text, Tuple
+from typing import Any, Dict, Hashable, Iterable, List, Optional, Set, Text, Tuple
 
 from rasa.nlu.config import RasaNLUModelConfig, override_defaults
-from rasa.nlu.constants import RESPONSE_ATTRIBUTE
+from rasa.nlu.constants import TRAINABLE_EXTRACTORS
 from rasa.nlu.training_data import Message, TrainingData
 from rasa.utils.common import raise_warning
 
@@ -113,20 +114,93 @@ def validate_requires_any_of(
         )
 
 
+def any_components_in_pipeline(components: Iterable[Text], pipeline: List["Component"]):
+    """Check if any of the provided components are listed in the pipeline."""
+    return any(
+        [any([component.name == c for component in pipeline]) for c in components]
+    )
+
+
 def validate_required_components_from_data(
     pipeline: List["Component"], data: TrainingData
 ):
+    """Check training data for features.
 
-    response_selector_exists = False
-    for component in pipeline:
-        # check if a response selector is part of NLU pipeline
-        if RESPONSE_ATTRIBUTE in component.provides:
-            response_selector_exists = True
+    If those features require specific components to featurize or
+    process them, warn the user if the required component is missing.
+    """
+
+    if data.entity_examples and not any_components_in_pipeline(
+        TRAINABLE_EXTRACTORS, pipeline
+    ):
+        raise_warning(
+            "You have defined training data consisting of entity examples, but "
+            "your NLU pipeline does not include an entity extractor trained on "
+            "your training data. To extract entity examples, add one of "
+            f"{TRAINABLE_EXTRACTORS} to your pipeline."
+        )
+
+    if data.regex_features and not any_components_in_pipeline(
+        ["RegexFeaturizer"], pipeline
+    ):
+        raise_warning(
+            "You have defined training data with regexes, but "
+            "your NLU pipeline does not include a RegexFeaturizer. "
+            "To featurize regexes for entity extraction, you need "
+            "to have a RegexFeaturizer in your pipeline."
+        )
+
+    if data.lookup_tables and not any_components_in_pipeline(
+        ["RegexFeaturizer"], pipeline
+    ):
+        raise_warning(
+            "You have defined training data consisting of lookup tables, but "
+            "your NLU pipeline does not include a RegexFeaturizer. "
+            "To featurize lookup tables, add a RegexFeaturizer to your pipeline."
+        )
+
+    if data.lookup_tables:
+        if not any_components_in_pipeline(["CRFEntityExtractor"], pipeline):
+            raise_warning(
+                "You have defined training data consisting of lookup tables, but "
+                "your NLU pipeline does not include a CRFEntityExtractor. "
+                "To featurize lookup tables, add a CRFEntityExtractor to your pipeline."
+            )
+        else:
+            crf_components = [c for c in pipeline if c.name == "CRFEntityExtractor"]
+            # check to see if any of the possible CRFEntityExtractors will featurize `pattern`
+            has_pattern_feature = False
+            for crf in crf_components:
+                crf_features = crf.component_config.get("features")
+                # iterate through [[before],[word],[after]] features
+                if "pattern" in itertools.chain(*crf_features):
+                    has_pattern_feature = True
+
+            if not has_pattern_feature:
+                raise_warning(
+                    "You have defined training data consisting of lookup tables, but "
+                    "your NLU pipeline's CRFEntityExtractor does not include the `pattern` feature. "
+                    "To featurize lookup tables, add the `pattern` feature to the CRFEntityExtractor in "
+                    "your pipeline."
+                )
+
+    if data.entity_synonyms and not any_components_in_pipeline(
+        ["EntitySynonymMapper"], pipeline
+    ):
+        raise_warning(
+            "You have defined training data consisting of synonyms, but "
+            "your NLU pipeline does not include an EntitySynonymMapper. "
+            "To map synonyms, add an EntitySynonymMapper to your pipeline."
+        )
 
-    if len(data.response_examples) and not response_selector_exists:
+    if data.response_examples and not any_components_in_pipeline(
+        ["ResponseSelector"], pipeline
+    ):
         raise_warning(
-            "Training data consists examples for training a response selector but "
-            "no response selector component specified inside NLU pipeline."
+            "Your training data includes examples for training a response selector, but "
+            "your NLU pipeline does not include a ResponseSelector component. "
+            "Either add a ResponseSelector to your pipeline or "
+            "remove the response selector training data."
         )
 
 

diff --git a/rasa/nlu/config.py b/rasa/nlu/config.py
@@ -2,7 +2,7 @@
 import logging
 import os
 import ruamel.yaml as yaml
-from typing import Any, Dict, List, Optional, Text, Union, Tuple
+from typing import Any, Dict, List, Optional, Text, Union
 
 import rasa.utils.io
 from rasa.constants import DEFAULT_CONFIG_PATH, DOCS_URL_PIPELINE

diff --git a/rasa/nlu/constants.py b/rasa/nlu/constants.py
@@ -12,6 +12,8 @@
 
 PRETRAINED_EXTRACTORS = {"DucklingHTTPExtractor", "SpacyEntityExtractor"}
 
+TRAINABLE_EXTRACTORS = {"MitieEntityExtractor", "CRFEntityExtractor"}
+
 CLS_TOKEN = "__CLS__"
 
 MESSAGE_ATTRIBUTES = [TEXT_ATTRIBUTE, INTENT_ATTRIBUTE, RESPONSE_ATTRIBUTE]

diff --git a/sample_configs/config_crf_no_pattern_feature.yml b/sample_configs/config_crf_no_pattern_feature.yml
@@ -0,0 +1,7 @@
+language: en
+pipeline:
+  - name: "WhitespaceTokenizer"
+  - name: "RegexFeaturizer"
+  - name: "CRFEntityExtractor"
+    features: [['low', 'title', 'upper'],['bias', 'low', 'digit'],['low', 'title', 'upper']]
+  - name: "EntitySynonymMapper"
diff --git a/sample_configs/config_crf_no_regex.yml b/sample_configs/config_crf_no_regex.yml
@@ -0,0 +1,5 @@
+language: en
+pipeline:
+  - name: "WhitespaceTokenizer"
+  - name: "CRFEntityExtractor"
+  - name: "EntitySynonymMapper"
diff --git a/sample_configs/config_crf_no_synonyms.yml b/sample_configs/config_crf_no_synonyms.yml
@@ -0,0 +1,4 @@
+language: en
+pipeline:
+  - name: "WhitespaceTokenizer"
+  - name: "CRFEntityExtractor"
diff --git a/sample_configs/config_spacy_entity_extractor.yml b/sample_configs/config_spacy_entity_extractor.yml
@@ -0,0 +1,8 @@
+language: en
+pipeline:
+    - name: "SpacyNLP"
+    - name: "SpacyTokenizer"
+    - name: "SpacyFeaturizer"
+    - name: "RegexFeaturizer"
+    - name: "SpacyEntityExtractor"
+    - name: "EntitySynonymMapper"
diff --git a/tests/nlu/base/test_config.py b/tests/nlu/base/test_config.py
@@ -5,11 +5,13 @@
 import pytest
 
 import rasa.utils.io
-from rasa.nlu import config
+from rasa.nlu import components, config, load_data
 from rasa.nlu.components import ComponentBuilder
+from rasa.nlu.constants import TRAINABLE_EXTRACTORS
 from rasa.nlu.registry import registered_pipeline_templates
-from tests.nlu.conftest import CONFIG_DEFAULTS_PATH
+from tests.nlu.conftest import CONFIG_DEFAULTS_PATH, DEFAULT_DATA_PATH
 from tests.nlu.utilities import write_file_config
+from rasa.nlu.model import Trainer
 
 defaults = rasa.utils.io.read_config_file(CONFIG_DEFAULTS_PATH)
 
@@ -81,3 +83,68 @@ def test_override_defaults_supervised_embeddings_pipeline():
     component2_cfg = cfg.for_component(1)
     component2 = builder.create_component(component2_cfg, cfg)
     assert component2.epochs == 10
+
+
+def test_warn_no_trainable_extractor():
+    cfg = config.load("sample_configs/config_spacy_entity_extractor.yml")
+    trainer = Trainer(cfg)
+    training_data = load_data(DEFAULT_DATA_PATH)
+    with pytest.warns(UserWarning) as record:
+        components.validate_required_components_from_data(
+            trainer.pipeline, training_data
+        )
+
+    assert len(record) == 1
+    assert str(TRAINABLE_EXTRACTORS) in record[0].message.args[0]
+
+
+def test_warn_missing_regex_featurizer():
+    cfg = config.load("sample_configs/config_crf_no_regex.yml")
+    trainer = Trainer(cfg)
+    training_data = load_data(DEFAULT_DATA_PATH)
+    with pytest.warns(UserWarning) as record:
+        components.validate_required_components_from_data(
+            trainer.pipeline, training_data
+        )
+
+    assert len(record) == 1
+    assert "RegexFeaturizer" in record[0].message.args[0]
+
+
+def test_warn_missing_pattern_feature_lookup_tables():
+    cfg = config.load("sample_configs/config_crf_no_pattern_feature.yml")
+    trainer = Trainer(cfg)
+    training_data = load_data("data/test/lookup_tables/lookup_table.md")
+    with pytest.warns(UserWarning) as record:
+        components.validate_required_components_from_data(
+            trainer.pipeline, training_data
+        )
+
+    assert len(record) == 1
+    assert "`pattern` feature" in record[0].message.args[0]
+
+
+def test_warn_missing_synonym_mapper():
+    cfg = config.load("sample_configs/config_crf_no_synonyms.yml")
+    trainer = Trainer(cfg)
+    training_data = load_data("data/test/markdown_single_sections/synonyms_only.md")
+    with pytest.warns(UserWarning) as record:
+        components.validate_required_components_from_data(
+            trainer.pipeline, training_data
+        )
+
+    assert len(record) == 1
+    assert "EntitySynonymMapper" in record[0].message.args[0]
+
+
+def test_warn_missing_response_selector():
+    cfg = config.load("sample_configs/config_supervised_embeddings.yml")
+    trainer = Trainer(cfg)
+    training_data = load_data("data/examples/rasa")
+    with pytest.warns(UserWarning) as record:
+        components.validate_required_components_from_data(
+            trainer.pipeline, training_data
+        )
+
+    assert len(record) == 1
+    assert "ResponseSelector" in record[0].message.args[0]