Skip to content

Commit

Permalink
Merge 3a6327c into 2edc18a
Browse files Browse the repository at this point in the history
  • Loading branch information
btotharye committed Feb 26, 2020
2 parents 2edc18a + 3a6327c commit 3dad743
Show file tree
Hide file tree
Showing 8 changed files with 180 additions and 13 deletions.
94 changes: 84 additions & 10 deletions rasa/nlu/components.py
@@ -1,9 +1,10 @@
import itertools
import logging
import typing
from typing import Any, Dict, Hashable, List, Optional, Set, Text, Tuple
from typing import Any, Dict, Hashable, Iterable, List, Optional, Set, Text, Tuple

from rasa.nlu.config import RasaNLUModelConfig, override_defaults
from rasa.nlu.constants import RESPONSE_ATTRIBUTE
from rasa.nlu.constants import TRAINABLE_EXTRACTORS
from rasa.nlu.training_data import Message, TrainingData
from rasa.utils.common import raise_warning

Expand Down Expand Up @@ -113,20 +114,93 @@ def validate_requires_any_of(
)


def any_components_in_pipeline(
    components: Iterable[Text], pipeline: List["Component"]
) -> bool:
    """Check if any of the provided components are listed in the pipeline.

    Args:
        components: Names of the components to look for.
        pipeline: The pipeline's components; each one is matched through its
            ``name`` attribute.

    Returns:
        ``True`` if the name of at least one pipeline component is among
        ``components``; ``False`` otherwise, including when either argument
        is empty.
    """
    wanted_names = set(components)
    # A generator lets `any` stop at the first match instead of materializing
    # one intermediate list per requested component name.
    return any(component.name in wanted_names for component in pipeline)


def validate_required_components_from_data(
    pipeline: List["Component"], data: TrainingData
):
    """Check training data for features.

    If those features require specific components to featurize or
    process them, warn the user if the required component is missing.

    A warning is raised (via ``raise_warning``) for each of these cases:

    * entity examples without any of ``TRAINABLE_EXTRACTORS`` in the pipeline,
    * regex features without a ``RegexFeaturizer``,
    * lookup tables without a ``RegexFeaturizer``,
    * lookup tables without a ``CRFEntityExtractor``, or with
      ``CRFEntityExtractor`` components of which none lists the ``pattern``
      feature,
    * entity synonyms without an ``EntitySynonymMapper``,
    * response examples without a ``ResponseSelector``.

    Args:
        pipeline: The components of the NLU pipeline; matched by ``name``.
        data: The training data whose contents are inspected.
    """

    if data.entity_examples and not any_components_in_pipeline(
        TRAINABLE_EXTRACTORS, pipeline
    ):
        raise_warning(
            "You have defined training data consisting of entity examples, but "
            "your NLU pipeline does not include an entity extractor trained on "
            "your training data. To extract entity examples, add one of "
            f"{TRAINABLE_EXTRACTORS} to your pipeline."
        )

    if data.regex_features and not any_components_in_pipeline(
        ["RegexFeaturizer"], pipeline
    ):
        raise_warning(
            "You have defined training data with regexes, but "
            "your NLU pipeline does not include a RegexFeaturizer. "
            "To featurize regexes for entity extraction, you need "
            "to have a RegexFeaturizer in your pipeline."
        )

    if data.lookup_tables and not any_components_in_pipeline(
        ["RegexFeaturizer"], pipeline
    ):
        raise_warning(
            "You have defined training data consisting of lookup tables, but "
            "your NLU pipeline does not include a RegexFeaturizer. "
            "To featurize lookup tables, add a RegexFeaturizer to your pipeline."
        )

    if data.lookup_tables:
        crf_components = [c for c in pipeline if c.name == "CRFEntityExtractor"]
        if not crf_components:
            raise_warning(
                "You have defined training data consisting of lookup tables, but "
                "your NLU pipeline does not include a CRFEntityExtractor. "
                "To featurize lookup tables, add a CRFEntityExtractor to your pipeline."
            )
        else:
            # Each `features` entry is a list of feature-name lists
            # ([[before], [word], [after]]); flatten it to look for `pattern`.
            # A component without a `features` entry is treated as having no
            # features rather than crashing on `None`.
            has_pattern_feature = any(
                "pattern"
                in itertools.chain.from_iterable(
                    crf.component_config.get("features") or []
                )
                for crf in crf_components
            )

            if not has_pattern_feature:
                raise_warning(
                    "You have defined training data consisting of lookup tables, but "
                    "your NLU pipeline's CRFEntityExtractor does not include the `pattern` feature. "
                    "To featurize lookup tables, add the `pattern` feature to the CRFEntityExtractor in "
                    "your pipeline."
                )

    if data.entity_synonyms and not any_components_in_pipeline(
        ["EntitySynonymMapper"], pipeline
    ):
        raise_warning(
            "You have defined training data consisting of synonyms, but "
            "your NLU pipeline does not include an EntitySynonymMapper. "
            "To map synonyms, add an EntitySynonymMapper to your pipeline."
        )

    if data.response_examples and not any_components_in_pipeline(
        ["ResponseSelector"], pipeline
    ):
        raise_warning(
            "Your training data includes examples for training a response selector, but "
            "your NLU pipeline does not include a ResponseSelector component. "
            "Either add a ResponseSelector to your pipeline or "
            "remove the response selector training data."
        )


Expand Down
2 changes: 1 addition & 1 deletion rasa/nlu/config.py
Expand Up @@ -2,7 +2,7 @@
import logging
import os
import ruamel.yaml as yaml
from typing import Any, Dict, List, Optional, Text, Union, Tuple
from typing import Any, Dict, List, Optional, Text, Union

import rasa.utils.io
from rasa.constants import DEFAULT_CONFIG_PATH, DOCS_URL_PIPELINE
Expand Down
2 changes: 2 additions & 0 deletions rasa/nlu/constants.py
Expand Up @@ -12,6 +12,8 @@

PRETRAINED_EXTRACTORS = {"DucklingHTTPExtractor", "SpacyEntityExtractor"}

TRAINABLE_EXTRACTORS = {"MitieEntityExtractor", "CRFEntityExtractor"}

CLS_TOKEN = "__CLS__"

MESSAGE_ATTRIBUTES = [TEXT_ATTRIBUTE, INTENT_ATTRIBUTE, RESPONSE_ATTRIBUTE]
Expand Down
7 changes: 7 additions & 0 deletions sample_configs/config_crf_no_pattern_feature.yml
@@ -0,0 +1,7 @@
language: en
pipeline:
- name: "WhitespaceTokenizer"
- name: "RegexFeaturizer"
- name: "CRFEntityExtractor"
features: [['low', 'title', 'upper'],['bias', 'low', 'digit'],['low', 'title', 'upper']]
- name: "EntitySynonymMapper"
5 changes: 5 additions & 0 deletions sample_configs/config_crf_no_regex.yml
@@ -0,0 +1,5 @@
language: en
pipeline:
- name: "WhitespaceTokenizer"
- name: "CRFEntityExtractor"
- name: "EntitySynonymMapper"
4 changes: 4 additions & 0 deletions sample_configs/config_crf_no_synonyms.yml
@@ -0,0 +1,4 @@
language: en
pipeline:
- name: "WhitespaceTokenizer"
- name: "CRFEntityExtractor"
8 changes: 8 additions & 0 deletions sample_configs/config_spacy_entity_extractor.yml
@@ -0,0 +1,8 @@
language: en
pipeline:
- name: "SpacyNLP"
- name: "SpacyTokenizer"
- name: "SpacyFeaturizer"
- name: "RegexFeaturizer"
- name: "SpacyEntityExtractor"
- name: "EntitySynonymMapper"
71 changes: 69 additions & 2 deletions tests/nlu/base/test_config.py
Expand Up @@ -5,11 +5,13 @@
import pytest

import rasa.utils.io
from rasa.nlu import config
from rasa.nlu import components, config, load_data
from rasa.nlu.components import ComponentBuilder
from rasa.nlu.constants import TRAINABLE_EXTRACTORS
from rasa.nlu.registry import registered_pipeline_templates
from tests.nlu.conftest import CONFIG_DEFAULTS_PATH
from tests.nlu.conftest import CONFIG_DEFAULTS_PATH, DEFAULT_DATA_PATH
from tests.nlu.utilities import write_file_config
from rasa.nlu.model import Trainer

defaults = rasa.utils.io.read_config_file(CONFIG_DEFAULTS_PATH)

Expand Down Expand Up @@ -81,3 +83,68 @@ def test_override_defaults_supervised_embeddings_pipeline():
component2_cfg = cfg.for_component(1)
component2 = builder.create_component(component2_cfg, cfg)
assert component2.epochs == 10


def test_warn_no_trainable_extractor():
    """Validating the SpacyEntityExtractor sample pipeline against the default
    data must yield exactly one warning that lists the trainable extractors."""
    nlu_config = config.load("sample_configs/config_spacy_entity_extractor.yml")
    pipeline_trainer = Trainer(nlu_config)
    examples = load_data(DEFAULT_DATA_PATH)

    with pytest.warns(UserWarning) as caught_warnings:
        components.validate_required_components_from_data(
            pipeline_trainer.pipeline, examples
        )

    assert len(caught_warnings) == 1
    warning_text = caught_warnings[0].message.args[0]
    assert str(TRAINABLE_EXTRACTORS) in warning_text


def test_warn_missing_regex_featurizer():
    """Validating the CRF sample pipeline without a RegexFeaturizer against the
    default data must yield exactly one warning naming the RegexFeaturizer."""
    nlu_config = config.load("sample_configs/config_crf_no_regex.yml")
    pipeline_trainer = Trainer(nlu_config)
    examples = load_data(DEFAULT_DATA_PATH)

    with pytest.warns(UserWarning) as caught_warnings:
        components.validate_required_components_from_data(
            pipeline_trainer.pipeline, examples
        )

    assert len(caught_warnings) == 1
    warning_text = caught_warnings[0].message.args[0]
    assert "RegexFeaturizer" in warning_text


def test_warn_missing_pattern_feature_lookup_tables():
    """Validating lookup-table data against a CRFEntityExtractor configured
    without the `pattern` feature must yield exactly one warning about it."""
    nlu_config = config.load("sample_configs/config_crf_no_pattern_feature.yml")
    pipeline_trainer = Trainer(nlu_config)
    examples = load_data("data/test/lookup_tables/lookup_table.md")

    with pytest.warns(UserWarning) as caught_warnings:
        components.validate_required_components_from_data(
            pipeline_trainer.pipeline, examples
        )

    assert len(caught_warnings) == 1
    warning_text = caught_warnings[0].message.args[0]
    assert "`pattern` feature" in warning_text


def test_warn_missing_synonym_mapper():
    """Validating synonym-only data against the sample pipeline that has no
    EntitySynonymMapper must yield exactly one warning naming that component."""
    nlu_config = config.load("sample_configs/config_crf_no_synonyms.yml")
    pipeline_trainer = Trainer(nlu_config)
    examples = load_data("data/test/markdown_single_sections/synonyms_only.md")

    with pytest.warns(UserWarning) as caught_warnings:
        components.validate_required_components_from_data(
            pipeline_trainer.pipeline, examples
        )

    assert len(caught_warnings) == 1
    warning_text = caught_warnings[0].message.args[0]
    assert "EntitySynonymMapper" in warning_text


def test_warn_missing_response_selector():
    """Validating the data in `data/examples/rasa` against the supervised
    embeddings sample pipeline must yield exactly one warning naming the
    ResponseSelector."""
    nlu_config = config.load("sample_configs/config_supervised_embeddings.yml")
    pipeline_trainer = Trainer(nlu_config)
    examples = load_data("data/examples/rasa")

    with pytest.warns(UserWarning) as caught_warnings:
        components.validate_required_components_from_data(
            pipeline_trainer.pipeline, examples
        )

    assert len(caught_warnings) == 1
    warning_text = caught_warnings[0].message.args[0]
    assert "ResponseSelector" in warning_text

0 comments on commit 3dad743

Please sign in to comment.