Merge 97764d4 into 8bb05aa

RasaHQ · Nov 9, 2018 · f8ee048 · f8ee048
2 parents 8bb05aa + 97764d4
commit f8ee048
Show file tree

Hide file tree

Showing 22 changed files with 561 additions and 367 deletions.
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -13,6 +13,7 @@ This project adheres to `Semantic Versioning`_ starting with version 0.2.0.
 
 Added
 -----
+- added train/eval scripts to compare policies
 - openapi documentation of server API
 - NLU data learned through interactive learning will now be stored in a
   separate markdown-format file (any previous NLU data is merged)

diff --git a/data/test_config/max_hist_config.yml b/data/test_config/max_hist_config.yml
@@ -0,0 +1,5 @@
+policies:
+  - name: MemoizationPolicy
+    max_history: 5
+  - name: KerasPolicy
+    max_history: 5
diff --git a/data/test_config/no_max_hist_config.yml b/data/test_config/no_max_hist_config.yml
@@ -0,0 +1,3 @@
+policies:
+  - name: MemoizationPolicy
+  - name: KerasPolicy
diff --git a/default_config.yml b/default_config.yml
@@ -0,0 +1,9 @@
+policies:
+  - name: KerasPolicy
+    epochs: 100
+    max_history: 5
+  - name: FallbackPolicy
+    fallback_action_name: 'action_default_fallback'
+  - name: MemoizationPolicy
+    max_history: 5
+  - name: FormPolicy
diff --git a/docs/evaluation.rst b/docs/evaluation.rst
@@ -19,7 +19,7 @@ by using the evaluate script:
 
 .. code-block:: bash
 
-    $ python -m rasa_core.evaluate -d models/dialogue \
+    $ python -m rasa_core.evaluate default -d models/dialogue \
       -s test_stories.md -o matrix.pdf --failed failed_stories.md
 
 
@@ -34,7 +34,7 @@ incorrect action was predicted instead.
 
 The full list of options for the script is:
 
-.. program-output:: python -m rasa_core.evaluate -h
+.. program-output:: python -m rasa_core.evaluate default -h
 
 .. _end_to_end_evaluation:
 
@@ -77,7 +77,7 @@ the full end-to-end evaluation command is this:
 
 .. code-block:: bash
 
-  $ python -m rasa_core.evaluate -d models/dialogue --nlu models/nlu/current \
+  $ python -m rasa_core.evaluate default -d models/dialogue --nlu models/nlu/current \
     -s e2e_stories.md --e2e
 
 .. note::
@@ -98,14 +98,40 @@ your bot, so you don't just want to throw some away to use as a test set.
 
 Rasa Core has some scripts to help you choose and fine-tune your policy.
 Once you are happy with it, you can then train your final policy on your
-full data set. To do this, split your training data into multiple files
-in a single directory. You can then use the ``train_paper`` script to
-train multiple policies on the same data. You can choose one of the
-files to be partially excluded. This means that Rasa Core will be
-trained multiple times, with 0, 5, 25, 50, 70, 90, 95, and 100% of
-the stories in that file removed from the training data. By evaluating
-on the full set of stories, you can measure how well Rasa Core is
-predicting the held-out stories.
+full data set. To do this, you first have to train models for your different
+policies. Create two (or more) policy config files of the policies you want to
+compare (containing only one policy each), and then use the ``compare`` mode of
+the train script to train your models:
+
+.. code-block:: bash
+
+  $ python -m rasa_core.train compare -c policy_config1.yml policy_config2.yml \
+    -d domain.yml -s stories_folder -o comparison_models --runs 3 --percentages \
+    0 5 25 50 70 90 95
+
+For each policy configuration provided, Rasa Core will be trained multiple times
+with 0, 5, 25, 50, 70, 90 and 95% of your training stories excluded from the
+training data. This is done for multiple runs, to ensure consistent results.
+
+Once this script has finished, you can now use the evaluate script in compare
+mode to evaluate the models you just trained:
+
+.. code-block:: bash
+
+  $ python -m rasa_core.evaluate compare -s stories_folder -d comparison_models \
+    -o comparison_results
+
+This will evaluate each of the models on the training set, and plot some graphs
+to show you which policy is best.  By evaluating on the full set of stories, you
+can measure how well Rasa Core is predicting the held-out stories.
+
+If you're not sure which policies to compare, we'd recommend trying out the
+``EmbeddingPolicy`` and the ``KerasPolicy`` to see which one works better for
+you.
+
+.. note::
+    This training process can take a long time, so we'd suggest letting it run
+    somewhere in the background where it can't be interrupted.
 
 
 Evaluating stories over http
@@ -129,5 +155,3 @@ you may do so by adding the ``e2e=true`` query parameter:
   $ curl --data-binary @eval_stories.md "localhost:5005/evaluate?e2e=true" | python -m json.tool
 
 .. include:: feedback.inc
-
-
diff --git a/examples/moodbot/Makefile b/examples/moodbot/Makefile
@@ -11,7 +11,7 @@ train-nlu:
 	       --data ./data/nlu.md --path models/ --project nlu
 
 train-core:
-	python -m rasa_core.train -s data/stories.md -d domain.yml -o models/dialogue --epochs 300
+	python -m rasa_core.train default -s data/stories.md -d domain.yml -o models/dialogue -c ../../default_config.yml
 
 run-fb:
 	python -m rasa_core.run -d models/dialogue -u models/nlu/current -p 5002 -c facebook --credentials credentials.yml

diff --git a/examples/restaurantbot/bot.py b/examples/restaurantbot/bot.py
@@ -25,14 +25,12 @@ def train_dialogue(domain_file="restaurant_domain.yml",
                    training_data_file="data/babi_stories.md"):
     agent = Agent(domain_file,
                   policies=[MemoizationPolicy(max_history=3),
-                            RestaurantPolicy()])
+                            RestaurantPolicy(batch_size=100, epochs=400,
+                                             validation_split=0.2)])
 
     training_data = agent.load_data(training_data_file)
     agent.train(
-            training_data,
-            epochs=400,
-            batch_size=100,
-            validation_split=0.2
+            training_data
     )
 
     agent.persist(model_path)

diff --git a/rasa_core/agent.py b/rasa_core/agent.py
@@ -441,7 +441,8 @@ def load_data(self,
                   augmentation_factor=20,  # type: int
                   tracker_limit=None,  # type: Optional[int]
                   use_story_concatenation=True,  # type: bool
-                  debug_plots=False  # type: bool
+                  debug_plots=False,  # type: bool
+                  exclusion_percentage=None  # type: int
                   ):
         # type: (...) -> List[DialogueStateTracker]
         """Load training data from a resource."""
@@ -478,7 +479,8 @@ def load_data(self,
                                   remove_duplicates, unique_last_num_states,
                                   augmentation_factor,
                                   tracker_limit, use_story_concatenation,
-                                  debug_plots)
+                                  debug_plots,
+                                  exclusion_percentage=exclusion_percentage)
 
     def train(self,
               training_trackers,  # type: List[DialogueStateTracker]

diff --git a/rasa_core/config.py b/rasa_core/config.py
@@ -5,62 +5,19 @@
 
 from typing import Optional, Text, Dict, Any, List
 
-from rasa_core.constants import (
-    DEFAULT_NLU_FALLBACK_THRESHOLD,
-    DEFAULT_CORE_FALLBACK_THRESHOLD, DEFAULT_FALLBACK_ACTION)
 from rasa_core import utils
-from rasa_core.policies import PolicyEnsemble, Policy
+from rasa_core.policies import PolicyEnsemble
 
 
-def load(config_file, fallback_args, max_history):
+def load(config_file):
     # type: (Optional[Text], Dict[Text, Any], int) -> List[Policy]
     """Load policy data stored in the specified file. fallback_args and
     max_history are typically command line arguments. They take precedence
     over the arguments specified in the config yaml.
     """
-
-    if config_file is None:
-        return PolicyEnsemble.default_policies(fallback_args, max_history)
-
-    config_data = utils.read_yaml_file(config_file)
-    config_data = handle_precedence_and_defaults(
-                            config_data, fallback_args, max_history)
+    if config_file:
+        config_data = utils.read_yaml_file(config_file)
+    else:
+        raise ValueError("You have to provide a config file")
 
     return PolicyEnsemble.from_dict(config_data)
-
-
-def handle_precedence_and_defaults(config_data, fallback_args, max_history):
-    # type: (Dict[Text, Any], Dict[Text, Any], int) -> Dict[Text, Any]
-
-    for policy in config_data.get('policies'):
-
-        if policy.get('name') == 'FallbackPolicy' and fallback_args is not None:
-            set_fallback_args(policy, fallback_args)
-
-        elif policy.get('name') in {'KerasPolicy', 'MemoizationPolicy'}:
-            set_arg(policy, "max_history", max_history, 3)
-
-    return config_data
-
-
-def set_arg(data_dict, argument, value, default):
-
-    if value is not None:
-        data_dict[argument] = value
-    elif data_dict.get(argument) is None:
-        data_dict[argument] = default
-
-    return data_dict
-
-
-def set_fallback_args(policy, fallback_args):
-
-    set_arg(policy, "nlu_threshold",
-            fallback_args.get("nlu_threshold"),
-            DEFAULT_NLU_FALLBACK_THRESHOLD)
-    set_arg(policy, "core_threshold",
-            fallback_args.get("core_threshold"),
-            DEFAULT_CORE_FALLBACK_THRESHOLD)
-    set_arg(policy, "fallback_action_name",
-            fallback_args.get("fallback_action_name"),
-            DEFAULT_FALLBACK_ACTION)