Merge pull request #4246 from RasaHQ/new_compare

Update train compare script
RasaHQ · Aug 15, 2019 · 3154ae9 · 3154ae9
2 parents 63be756 + 51fd047
commit 3154ae9
Show file tree

Hide file tree

Showing 8 changed files with 48 additions and 33 deletions.
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -6,7 +6,7 @@ Rasa Change Log
 All notable changes to this project will be documented in this file.
 This project adheres to `Semantic Versioning`_ starting with version 1.0.
 
-[Unreleased 1.2.3] - `master`_
+[Unreleased 1.3] - `master`_
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 Added
@@ -16,7 +16,7 @@ Changed
 -------
 - messages with multiple entities are now handled properly with e2e evaluation
 - ``data/test_evaluations/end_to_end_story.md`` was re-written in the restaurantbot domain
-- `FallbackPolicy` can now be configured to trigger when the difference between confidences of two predicted intents is too narrow
+- ``FallbackPolicy`` can now be configured to trigger when the difference between confidences of two predicted intents is too narrow
 - throw error during training when triggers are defined in the domain without
   ``MappingPolicy`` being present in the policy ensemble
 - experimental training data importer which supports training with data of multiple
@@ -30,9 +30,11 @@ Changed
 -------
 - added character-level ``CountVectorsFeaturizer`` with empirically found parameters 
   into the ``supervised_embeddings`` NLU pipeline template
-- bot messages contain the `timestamp` of the `BotUttered` event, which can be used in channels
+- bot messages contain the ``timestamp`` of the ``BotUttered`` event, which can be used in channels
 - NLU evaluations now also stores its output in the output directory like the core evaluation
 - show warning in case a default path is used instead of a provided, invalid path
+- compare mode of ``rasa train core`` now allows comparing whole core configs;
+  the naming style of models trained for comparison has changed (this is a breaking change)
 
 Removed
 -------

diff --git a/docs/migration-guide.rst b/docs/migration-guide.rst
@@ -11,6 +11,25 @@ Migration Guide
 This page contains information about changes between major versions and
 how you can migrate from one version to another.
 
+.. _migration-to-rasa-1.3:
+
+Rasa 1.2 to Rasa 1.3
+------------------------------------------------
+.. warning::
+
+  This is a release **breaking backwards compatibility**.
+  It is not possible to load previously trained models. Please make sure to retrain a
+  model before trying to use it with this improved version.
+
+General
+~~~~~~~
+
+- **Compare** mode of ``rasa train core`` now allows comparing whole core configurations.
+  Therefore, we changed the naming of trained models: they are now named after the config
+  file instead of the policy. Models using the old naming style will not be read correctly
+  when creating **compare** plots (``rasa test core``). Please remove old trained models
+  from the comparison folder and retrain. Normal core training is unaffected.
+
 .. _migration-to-rasa-1.0:
 
 Rasa NLU 0.14.x and Rasa Core 0.13.x to Rasa 1.0

diff --git a/docs/user-guide/evaluating-models.rst b/docs/user-guide/evaluating-models.rst
@@ -175,10 +175,10 @@ The full list of options for the script is:
 .. program-output:: rasa test core --help
 
 
-Comparing Policies
-------------------
+Comparing Core Configurations
+-----------------------------
 
-To choose a specific policy configuration, or to choose hyperparameters for a
+To choose a configuration for your core model, or to choose hyperparameters for a
 specific policy, you want to measure how well Rasa Core will `generalise`
 to conversations which it hasn't seen before. Especially in the beginning
 of a project, you do not have a lot of real conversations to use to train
@@ -187,9 +187,8 @@ your bot, so you don't just want to throw some away to use as a test set.
 Rasa Core has some scripts to help you choose and fine-tune your policy configuration.
 Once you are happy with it, you can then train your final configuration on your
 full data set. To do this, you first have to train models for your different
-policies. Create two (or more) config files including the policies you want to
-compare (containing only one policy each), and then use the ``compare`` mode of
-the train script to train your models:
+configurations. Create two (or more) config files including the policies you want to
+compare, and then use the ``compare`` mode of the train script to train your models:
 
 .. code-block:: bash
 
@@ -209,10 +208,12 @@ mode to evaluate the models you just trained:
   $ rasa test core -m comparison_models --stories stories_folder
   --out comparison_results --evaluate-model-directory
 
-This will evaluate each of the models on the training set and plot some graphs
+This will evaluate each of the models on the provided stories
+(which can be either the training set or a test set) and plot some graphs
 to show you which policy performs best. By evaluating on the full set of stories, you
 can measure how well Rasa Core is predicting the held-out stories.
 
+To compare single policies, create config files containing only one policy each.
 If you're not sure which policies to compare, we'd recommend trying out the
 ``EmbeddingPolicy`` and the ``KerasPolicy`` to see which one works better for
 you.

diff --git a/rasa/cli/train.py b/rasa/cli/train.py
@@ -91,6 +91,7 @@ def train_core(
     story_file = get_validated_path(
         args.stories, "stories", DEFAULT_DATA_PATH, none_is_valid=True
     )
+    kwargs = extract_additional_arguments(args)
 
     # Policies might be a list for the compare training. Do normal training
     # if only one list item was passed.
@@ -107,12 +108,12 @@ def train_core(
             output=output,
             train_path=train_path,
             fixed_model_name=args.fixed_model_name,
-            kwargs=extract_additional_arguments(args),
+            kwargs=kwargs,
         )
     else:
         from rasa.core.train import do_compare_training
 
-        loop.run_until_complete(do_compare_training(args, story_file))
+        loop.run_until_complete(do_compare_training(args, story_file, kwargs))
 
 
 def train_nlu(

diff --git a/rasa/constants.py b/rasa/constants.py
@@ -16,6 +16,7 @@
 TRAIN_DATA_FILE = "train.md"
 RESULTS_FILE = "results.json"
 NUMBER_OF_TRAINING_STORIES_FILE = "num_stories.json"
+PERCENTAGE_KEY = "__percentage__"
 
 PACKAGE_NAME = "rasa"
 

diff --git a/rasa/core/test.py b/rasa/core/test.py
@@ -5,7 +5,7 @@
 from collections import defaultdict, namedtuple
 from typing import Any, Dict, List, Optional, Text, Tuple
 
-from rasa.constants import RESULTS_FILE
+from rasa.constants import RESULTS_FILE, PERCENTAGE_KEY
 from rasa.core.utils import pad_lists_to_size
 from rasa.core.events import ActionExecuted, UserUttered
 from rasa.nlu.training_data.formats.markdown import MarkdownWriter
@@ -167,7 +167,7 @@ def as_story_string(self, e2e=True):
         predicted_message = md_format_message(
             self.text, self.predicted_intent, self.predicted_entities
         )
-        return ("{}: {}   <!-- predicted: {}: {} -->").format(
+        return "{}: {}   <!-- predicted: {}: {} -->".format(
             self.intent.get("name"),
             correct_message,
             self.predicted_intent,
@@ -593,13 +593,11 @@ async def compare_models_in_dir(
             if not model.endswith("tar.gz"):
                 continue
 
-            # The model files are named like <policy-name><number>.tar.gz
-            # Remove the number from the name to get the policy name
-            policy_name = "".join(
-                [i for i in os.path.basename(model) if not i.isdigit()]
-            )
+            # The model files are named like <config-name>PERCENTAGE_KEY<number>.tar.gz
+            # Remove the percentage key and number from the name to get the config name
+            config_name = os.path.basename(model).split(PERCENTAGE_KEY)[0]
             number_of_correct_stories = await _evaluate_core_model(model, stories_file)
-            number_correct_in_run[policy_name].append(number_of_correct_stories)
+            number_correct_in_run[config_name].append(number_of_correct_stories)
 
         for k, v in number_correct_in_run.items():
             number_correct[k].append(v)
@@ -686,7 +684,7 @@ def _plot_curve(
     data = rasa.utils.io.read_json_file(os.path.join(output, RESULTS_FILE))
     x = number_of_examples
 
-    # compute mean of all the runs for keras/embed policies
+    # compute mean of all the runs for different configs
     for label in data.keys():
         if len(data[label]) == 0:
             continue

diff --git a/rasa/core/train.py b/rasa/core/train.py
@@ -5,7 +5,7 @@
 import typing
 from typing import Dict, Optional, Text, Union, List
 
-from rasa.constants import NUMBER_OF_TRAINING_STORIES_FILE
+from rasa.constants import NUMBER_OF_TRAINING_STORIES_FILE, PERCENTAGE_KEY
 from rasa.core.domain import Domain
 from rasa.utils.common import TempDirectoryPath
 
@@ -80,7 +80,6 @@ async def train_comparison_models(
     kwargs: Optional[Dict] = None,
 ):
     """Train multiple models for comparison of policies"""
-    from rasa.core import config
     from rasa import model
     from rasa.importers.importer import TrainingDataImporter
 
@@ -92,23 +91,17 @@ async def train_comparison_models(
 
         for current_run, percentage in enumerate(exclusion_percentages, 1):
             for policy_config in policy_configs:
-                policies = config.load(policy_config)
-
-                if len(policies) > 1:
-                    raise ValueError(
-                        "You can only specify one policy per model for comparison"
-                    )
 
                 file_importer = TrainingDataImporter.load_core_importer_from_config(
                     policy_config, domain, [story_file]
                 )
 
-                policy_name = type(policies[0]).__name__
+                config_name = os.path.splitext(os.path.basename(policy_config))[0]
                 logging.info(
                     "Starting to train {} round {}/{}"
                     " with {}% exclusion"
                     "".format(
-                        policy_name, current_run, len(exclusion_percentages), percentage
+                        config_name, current_run, len(exclusion_percentages), percentage
                     )
                 )
 
@@ -126,7 +119,7 @@ async def train_comparison_models(
                     new_fingerprint = await model.model_fingerprint(file_importer)
 
                     output_dir = os.path.join(output_path, "run_" + str(r + 1))
-                    model_name = policy_name + str(current_run)
+                    model_name = config_name + PERCENTAGE_KEY + str(percentage)
                     model.package_model(
                         fingerprint=new_fingerprint,
                         output_directory=output_dir,

diff --git a/rasa/version.py b/rasa/version.py
@@ -1 +1 @@
-__version__ = "1.2.2"
+__version__ = "1.3"