diff --git a/CHANGELOG.rst b/CHANGELOG.rst index b4cfbe8d92b4..99d9c930f21e 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -11,8 +11,12 @@ This project adheres to `Semantic Versioning`_ starting with version 1.0. Added ----- -- `FallbackPolicy` can now be configured to trigger when the difference between - confidences of two predicted intents is too narrow + +Changed +------- +- messages with multiple entities are now handled properly with e2e evaluation +- ``data/test_evaluations/end_to_end_story.md`` was re-written in the restaurantbot domain +- `FallbackPolicy` can now be configured to trigger when the difference between confidences of two predicted intents is too narrow - throw error during training when triggers are defined in the domain without ``MappingPolicy`` being present in the policy ensemble - experimental training data importer which supports training with data of multiple @@ -33,7 +37,6 @@ Changed Removed ------- - Fixed ----- - ``rasa test nlu`` with a folder of configuration files @@ -69,6 +72,9 @@ Added Changed ------- +- ``Agent.update_model()`` and ``Agent.handle_message()`` now work without needing to set a domain + or a policy ensemble +- Update pytype to ``2019.7.11`` - new event broker class: ``SQLProducer``. This event broker is now used when running locally with Rasa X - API requests are not longer logged to ``rasa_core.log`` by default in order to avoid diff --git a/data/test_evaluations/end_to_end_story.md b/data/test_evaluations/end_to_end_story.md index 93f91684e6f8..0600e9a9da41 100644 --- a/data/test_evaluations/end_to_end_story.md +++ b/data/test_evaluations/end_to_end_story.md @@ -1,17 +1,27 @@ ## simple_story_with_only_start -> check_greet -* default:/default - - utter_default - -## simple_story_with_only_end -* greet:/greet - - utter_greet -> check_greet +* greet: Hello + - utter_ask_howcanhelp ## simple_story_with_multiple_turns -* greet:/greet - - utter_greet -* default:/default - - utter_default -* goodbye:/goodbye - - utter_goodbye +* greet: good morning + - utter_ask_howcanhelp +* inform: im looking for a [moderately](price:moderate) priced restaurant in the [east](location) part of town + - utter_on_it + - utter_ask_cuisine +* inform: [french](cuisine) food + - utter_ask_numpeople + + ## story_with_multiple_entities_correction_and_search +* greet: hello + - utter_ask_howcanhelp +* inform: im looking for a [cheap](price:lo) restaurant which has [french](cuisine) food and is located in [bombay](location) + - utter_on_it + - utter_ask_numpeople +* inform: for [six](people:6) please + - utter_ask_moreupdates +* inform: actually i need a [moderately](price:moderate) priced restaurant + - utter_ask_moreupdates +* deny: no + - utter_ack_dosearch + - action_search_restaurants + - action_suggest \ No newline at end of file diff --git a/examples/restaurantbot/config.yml b/examples/restaurantbot/config.yml index 26f82e50914f..695005196169 100644 --- a/examples/restaurantbot/config.yml +++ b/examples/restaurantbot/config.yml @@ -11,7 +11,7 @@ pipeline: policies: - name: "examples.restaurantbot.policy.RestaurantPolicy" batch_size: 100 - epochs: 400 + epochs: 100 validation_split: 0.2 - name: MemoizationPolicy - name: MappingPolicy diff --git a/examples/restaurantbot/run.py b/examples/restaurantbot/run.py index 2add336a84ca..da4954d115f3 100644 --- a/examples/restaurantbot/run.py +++ b/examples/restaurantbot/run.py @@ -37,11 +37,11 @@ async def train_core( policies=[ MemoizationPolicy(max_history=3), MappingPolicy(), - RestaurantPolicy(batch_size=100, 
epochs=400, validation_split=0.2), + RestaurantPolicy(batch_size=100, epochs=100, validation_split=0.2), ], ) - training_data = await agent.load_data(training_data_file) + training_data = await agent.load_data(training_data_file, augmentation_factor=10) agent.train(training_data) # Attention: agent.persist stores the model and all meta data into a folder. diff --git a/rasa/core/test.py b/rasa/core/test.py index d6830603e938..254190ab0e78 100644 --- a/rasa/core/test.py +++ b/rasa/core/test.py @@ -1,17 +1,18 @@ -import json import logging import os -import typing import warnings +import typing from collections import defaultdict, namedtuple from typing import Any, Dict, List, Optional, Text, Tuple from rasa.constants import RESULTS_FILE +from rasa.core.utils import pad_lists_to_size from rasa.core.events import ActionExecuted, UserUttered +from rasa.nlu.training_data.formats.markdown import MarkdownWriter +from rasa.core.trackers import DialogueStateTracker if typing.TYPE_CHECKING: from rasa.core.agent import Agent - from rasa.core.trackers import DialogueStateTracker logger = logging.getLogger(__name__) @@ -76,32 +77,28 @@ def has_prediction_target_mismatch(self): or self.action_predictions != self.action_targets ) - def serialise_targets( - self, include_actions=True, include_intents=True, include_entities=False - ): - targets = [] - if include_actions: - targets += self.action_targets - if include_intents: - targets += self.intent_targets - if include_entities: - targets += self.entity_targets - - return [json.dumps(t) if isinstance(t, dict) else t for t in targets] - - def serialise_predictions( - self, include_actions=True, include_intents=True, include_entities=False - ): - predictions = [] + def serialise(self) -> Tuple[List[Text], List[Text]]: + """Turn targets and predictions to lists of equal size for sklearn.""" - if include_actions: - predictions += self.action_predictions - if include_intents: - predictions += self.intent_predictions - if include_entities: - predictions += self.entity_predictions + targets = ( + self.action_targets + + self.intent_targets + + [ + MarkdownWriter._generate_entity_md(gold.get("text"), gold) + for gold in self.entity_targets + ] + ) + predictions = ( + self.action_predictions + + self.intent_predictions + + [ + MarkdownWriter._generate_entity_md(predicted.get("text"), predicted) + for predicted in self.entity_predictions + ] + ) - return [json.dumps(t) if isinstance(t, dict) else t for t in predictions] + # sklearn does not cope with lists of unequal size, nor None values + return pad_lists_to_size(targets, predictions, padding_value="None") class WronglyPredictedAction(ActionExecuted): @@ -144,24 +141,23 @@ class WronglyClassifiedUserUtterance(UserUttered): type_name = "wrong_utterance" - def __init__( - self, - text, - correct_intent, - correct_entities, - parse_data=None, - timestamp=None, - input_channel=None, - predicted_intent=None, - predicted_entities=None, - ): - self.predicted_intent = predicted_intent - self.predicted_entities = predicted_entities + def __init__(self, event: UserUttered, eval_store: EvaluationStore): - intent = {"name": correct_intent} + if not eval_store.intent_predictions: + self.predicted_intent = None + else: + self.predicted_intent = eval_store.intent_predictions[0] + self.predicted_entities = eval_store.entity_predictions + + intent = {"name": eval_store.intent_targets[0]} super(WronglyClassifiedUserUtterance, self).__init__( - text, intent, correct_entities, parse_data, timestamp, input_channel + event.text, + 
intent, + eval_store.entity_targets, + event.parse_data, + event.timestamp, + event.input_channel, ) def as_story_string(self, e2e=True): @@ -197,24 +193,35 @@ async def _generate_trackers(resource_name, agent, max_stories=None, use_e2e=Fal return g.generate() -def _clean_entity_results(entity_results): - return [ - {k: r[k] for k in ("start", "end", "entity", "value") if k in r} - for r in entity_results - ] +def _clean_entity_results( + text: Text, entity_results: List[Dict[Text, Any]] +) -> List[Dict[Text, Any]]: + """Extract only the token variables from an entity dict.""" + cleaned_entities = [] + for r in tuple(entity_results): + cleaned_entity = {"text": text} + for k in ("start", "end", "entity", "value"): + if k in set(r): + cleaned_entity[k] = r[k] + cleaned_entities.append(cleaned_entity) + + return cleaned_entities -def _collect_user_uttered_predictions( - event, partial_tracker, fail_on_prediction_errors -): - from rasa.core.utils import pad_list_to_size +def _collect_user_uttered_predictions( + event: UserUttered, + partial_tracker: DialogueStateTracker, + fail_on_prediction_errors: bool, +) -> EvaluationStore: user_uttered_eval_store = EvaluationStore() intent_gold = event.parse_data.get("true_intent") - predicted_intent = event.parse_data.get("intent").get("name") - if predicted_intent is None: - predicted_intent = "None" + predicted_intent = event.parse_data.get("intent", {}).get("name") + + if not predicted_intent: + predicted_intent = [None] + user_uttered_eval_store.add_to_store( intent_predictions=predicted_intent, intent_targets=intent_gold ) @@ -223,30 +230,14 @@ def _collect_user_uttered_predictions( predicted_entities = event.parse_data.get("entities") if entity_gold or predicted_entities: - if len(entity_gold) > len(predicted_entities): - predicted_entities = pad_list_to_size( - predicted_entities, len(entity_gold), "None" - ) - elif len(predicted_entities) > len(entity_gold): - entity_gold = pad_list_to_size(entity_gold, len(predicted_entities), "None") - user_uttered_eval_store.add_to_store( - entity_targets=_clean_entity_results(entity_gold), - entity_predictions=_clean_entity_results(predicted_entities), + entity_targets=_clean_entity_results(event.text, entity_gold), + entity_predictions=_clean_entity_results(event.text, predicted_entities), ) if user_uttered_eval_store.has_prediction_target_mismatch(): partial_tracker.update( - WronglyClassifiedUserUtterance( - event.text, - intent_gold, - user_uttered_eval_store.entity_predictions, - event.parse_data, - event.timestamp, - event.input_channel, - predicted_intent, - user_uttered_eval_store.entity_targets, - ) + WronglyClassifiedUserUtterance(event, user_uttered_eval_store) ) if fail_on_prediction_errors: raise ValueError( @@ -493,10 +484,9 @@ async def test( from sklearn.exceptions import UndefinedMetricWarning warnings.simplefilter("ignore", UndefinedMetricWarning) - report, precision, f1, accuracy = get_evaluation_metrics( - evaluation_store.serialise_targets(), - evaluation_store.serialise_predictions(), - ) + + targets, predictions = evaluation_store.serialise() + report, precision, f1, accuracy = get_evaluation_metrics(targets, predictions) if out_directory: plot_story_evaluation( diff --git a/rasa/core/utils.py b/rasa/core/utils.py index 0f09d95895cc..483ed5d3fa4c 100644 --- a/rasa/core/utils.py +++ b/rasa/core/utils.py @@ -379,9 +379,19 @@ def remove_none_values(obj: Dict[Text, Any]) -> Dict[Text, Any]: return {k: v for k, v in obj.items() if v is not None} -def pad_list_to_size(_list, size, 
padding_value=None): - """Pads _list with padding_value up to size""" - return _list + [padding_value] * (size - len(_list)) +def pad_lists_to_size( + list_x: List, list_y: List, padding_value: Optional[Any] = None +) -> Tuple[List, List]: + """Compares list sizes and pads them to equal length.""" + + difference = len(list_x) - len(list_y) + + if difference > 0: + return list_x, list_y + [padding_value] * difference + elif difference < 0: + return list_x + [padding_value] * (-difference), list_y + else: + return list_x, list_y class AvailableEndpoints(object): diff --git a/tests/core/conftest.py b/tests/core/conftest.py index e02931972211..2126153834fe 100644 --- a/tests/core/conftest.py +++ b/tests/core/conftest.py @@ -9,8 +9,7 @@ import rasa.utils.io from rasa.core import train from rasa.core.agent import Agent -from rasa.core.channels import channel -from rasa.core.channels.channel import CollectingOutputChannel, RestInput +from rasa.core.channels.channel import CollectingOutputChannel from rasa.core.domain import Domain from rasa.core.interpreter import RegexInterpreter from rasa.core.nlg import TemplatedNaturalLanguageGenerator @@ -44,6 +43,8 @@ MOODBOT_MODEL_PATH = "examples/moodbot/models/" +RESTAURANTBOT_PATH = "examples/restaurantbot/" + DEFAULT_ENDPOINTS_FILE = "data/test_endpoints/example_endpoints.yml" TEST_DIALOGUES = [ @@ -237,3 +238,16 @@ def train_model(project: Text, filename: Text = "test.tar.gz"): @pytest.fixture(scope="session") def trained_model(project) -> Text: return train_model(project) + + +@pytest.fixture +async def restaurantbot(tmpdir_factory) -> Text: + model_path = tmpdir_factory.mktemp("model").strpath + restaurant_domain = os.path.join(RESTAURANTBOT_PATH, "domain.yml") + restaurant_config = os.path.join(RESTAURANTBOT_PATH, "config.yml") + restaurant_data = os.path.join(RESTAURANTBOT_PATH, "data/") + + agent = await train_async( + restaurant_domain, restaurant_config, restaurant_data, model_path + ) + return agent diff --git a/tests/core/test_evaluation.py b/tests/core/test_evaluation.py index d2a042a2701a..bc377623ea87 100644 --- a/tests/core/test_evaluation.py +++ b/tests/core/test_evaluation.py @@ -5,6 +5,7 @@ # we need this import to ignore the warning... 
# noinspection PyUnresolvedReferences from rasa.nlu.test import run_evaluation +from rasa.core.agent import Agent from tests.core.conftest import ( DEFAULT_STORIES_FILE, E2E_STORY_FILE_UNKNOWN_ENTITY, @@ -28,31 +29,62 @@ async def test_evaluation_image_creation(tmpdir, default_agent): assert os.path.isfile(stories_path) -async def test_action_evaluation_script(tmpdir, default_agent): +async def test_end_to_end_evaluation_script(tmpdir, restaurantbot): + restaurantbot = Agent.load(restaurantbot) completed_trackers = await _generate_trackers( - DEFAULT_STORIES_FILE, default_agent, use_e2e=False - ) - story_evaluation, num_stories = collect_story_predictions( - completed_trackers, default_agent, use_e2e=False - ) - - assert not story_evaluation.evaluation_store.has_prediction_target_mismatch() - assert len(story_evaluation.failed_stories) == 0 - assert num_stories == 3 - - -async def test_end_to_end_evaluation_script(tmpdir, default_agent): - completed_trackers = await _generate_trackers( - END_TO_END_STORY_FILE, default_agent, use_e2e=True + END_TO_END_STORY_FILE, restaurantbot, use_e2e=True ) story_evaluation, num_stories = collect_story_predictions( - completed_trackers, default_agent, use_e2e=True + completed_trackers, restaurantbot, use_e2e=True ) + serialised_store = [ + "utter_ask_howcanhelp", + "action_listen", + "utter_ask_howcanhelp", + "action_listen", + "utter_on_it", + "utter_ask_cuisine", + "action_listen", + "utter_ask_numpeople", + "action_listen", + "utter_ask_howcanhelp", + "action_listen", + "utter_on_it", + "utter_ask_numpeople", + "action_listen", + "utter_ask_moreupdates", + "action_listen", + "utter_ask_moreupdates", + "action_listen", + "utter_ack_dosearch", + "action_search_restaurants", + "action_suggest", + "action_listen", + "greet", + "greet", + "inform", + "inform", + "greet", + "inform", + "inform", + "inform", + "deny", + "[moderately](price:moderate)", + "[east](location)", + "[french](cuisine)", + "[cheap](price:lo)", + "[french](cuisine)", + "[bombay](location)", + "[six](people:6)", + "[moderately](price:moderate)", + ] + + assert story_evaluation.evaluation_store.serialise()[0] == serialised_store assert not story_evaluation.evaluation_store.has_prediction_target_mismatch() assert len(story_evaluation.failed_stories) == 0 - assert num_stories == 2 + assert num_stories == 3 async def test_end_to_end_evaluation_script_unknown_entity(tmpdir, default_agent): diff --git a/tests/core/test_utils.py b/tests/core/test_utils.py index 15afc1ecfddd..f37ca4d593ba 100644 --- a/tests/core/test_utils.py +++ b/tests/core/test_utils.py @@ -63,15 +63,6 @@ def test_cap_length_with_short_string(): assert utils.cap_length("my", 3) == "my" -def test_pad_list_to_size(): - assert utils.pad_list_to_size(["e1", "e2"], 4, "other") == [ - "e1", - "e2", - "other", - "other", - ] - - def test_read_lines(): lines = utils.read_lines( "data/test_stories/stories.md", max_line_limit=2, line_pattern=r"\*.*" @@ -80,3 +71,13 @@ def test_read_lines(): lines = list(lines) assert len(lines) == 2 + + +def test_pad_lists_to_size(): + list_x = [1, 2, 3] + list_y = ["a", "b"] + list_z = [None, None, None] + + assert utils.pad_lists_to_size(list_x, list_y) == (list_x, ["a", "b", None]) + assert utils.pad_lists_to_size(list_y, list_x, "c") == (["a", "b", "c"], list_x) + assert utils.pad_lists_to_size(list_z, list_x) == (list_z, list_x)
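
For reference, a minimal sketch of how the rewritten ``_clean_entity_results`` helper in ``rasa/core/test.py`` above behaves; the message text, offsets and the extractor-specific keys below are illustrative values, not taken from this change:

from rasa.core.test import _clean_entity_results

text = "im looking for a cheap restaurant"
raw_entities = [
    {
        "start": 17,
        "end": 22,
        "entity": "price",
        "value": "lo",
        # extractor-specific keys are dropped by the helper
        "extractor": "CRFEntityExtractor",
        "confidence": 0.9,
    }
]

cleaned = _clean_entity_results(text, raw_entities)

# each cleaned entity carries the full message text plus only the
# token-level keys ("start", "end", "entity", "value")
assert cleaned == [
    {
        "text": "im looking for a cheap restaurant",
        "start": 17,
        "end": 22,
        "entity": "price",
        "value": "lo",
    }
]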
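
Similarly, a short usage sketch for the new ``pad_lists_to_size`` utility that replaces ``pad_list_to_size`` in ``rasa/core/utils.py``; the example lists are illustrative:

from rasa.core.utils import pad_lists_to_size

targets = ["greet", "inform", "inform"]
predictions = ["greet", "inform"]

# whichever list is shorter is padded with padding_value, so that
# sklearn metrics receive two sequences of equal length
padded_targets, padded_predictions = pad_lists_to_size(
    targets, predictions, padding_value="None"
)

assert padded_targets == ["greet", "inform", "inform"]
assert padded_predictions == ["greet", "inform", "None"]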