Merge branch '1.3.x' into merge_1.2.x

akelad committed Sep 19, 2019
2 parents 6145e06 + c41144c · commit b67f56b

Showing 17 changed files with 270 additions and 50 deletions.
2 changes: 1 addition & 1 deletion .travis.yml
@@ -114,4 +114,4 @@ jobs:
distributions: "sdist bdist_wheel"
password:
secure: "MeL1Ve97eBY+VbNWuQNuLzkPs0TPc+Zh8OfZkhw69ez5imsiWpvp0LrUOLVW3CcC0vNTANEBOVX/n1kHxfcqkf/cChNqAkZ6zTMmvR9zHDwQxXVGZ3jEQSQM+fHdQpjwtH7BwojyxaCIC/5iza7DFMcca/Q6Xr+atdTd0V8Q7Nc5jFHEQf3/4oIIm6YeCUiHcEu981LRdS04+jvuFUN0Ejy+KLukGVyIWyYDjjGjs880Mj4J1mgmCihvVkJ1ujB65rYBdTjls3JpP3eTk63+xH8aHilIuvqB8TDYih8ovE/Vv6YwLI+u2HoEHAtBD4Ez3r71Ju6JKJM7DhWb5aurN4M7K6DC8AvpUl+PsJbNP4ZeW2jXMH6lT6qXKVaSw7lhZ0XY3wunyVcAbArX4RS0B9pb1nHBYUBWZjxXtr8lhkpGFu7H43hw63Y19qb8z4+1cGnijgz1mqXSAssuc+3r0W0cSr+OsCjmOs7cwT6HMQvPEKxLohwBOS/I3EbuKQOYMjFN5BWP5JXbsG45awV9tquxEW8zxjMetR+AOcYoyrDeiR8sAnj1/F99DE0bL1KyW/G5VNu2Xi/c+0M3KvP3+F8XTCuUY/5zTvqh1Qz1jcdiwsiAhO4eBQzQnjeFlxdiVeue2kmD5qsh+VLKKuKLfyVoaV7b1kBlAtBDu7+hDpA="
after_deploy: bash scripts/ping_slack_about_package_release.sh
after_deploy: ./scripts/ping_slack_about_package_release.sh
19 changes: 19 additions & 0 deletions CHANGELOG.rst
@@ -7,6 +7,25 @@ Rasa Change Log
All notable changes to this project will be documented in this file.
This project adheres to `Semantic Versioning`_ starting with version 1.0.

[1.3.4] - 2019-09-19
^^^^^^^^^^^^^^^^^^^^

Added
-----
- Added the ability to set the ``backlog`` parameter in Sanic's ``run()`` method using
  the ``SANIC_BACKLOG`` environment variable. This parameter sets the
  number of unaccepted connections the server allows before refusing new
  connections. A default value of 100 is used if the variable is not set.
- Status endpoint (``/status``) now also returns the number of training processes currently running.
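As an illustration, a client could read the new field once this release is running. A minimal sketch, assuming a local server on the default port 5005 with no token authentication (the endpoint and field name come from the API spec change below):

    import requests

    # Query the running Rasa server; the host, port and absence of an
    # auth token are assumptions for this sketch.
    status = requests.get("http://localhost:5005/status").json()
    print(status["num_active_training_jobs"])  # e.g. 0 while nothing is training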

Fixed
-----
- Added the ability to properly deal with spaCy ``Doc``-objects created from
  empty strings, as discussed `here <https://github.com/RasaHQ/rasa/issues/4445>`_.
  Only training samples that actually bear content are sent to ``self.nlp.pipe``
  for each given attribute. Non-content-bearing samples are converted to empty
  ``Doc``-objects. The resulting lists are merged with their original order
  preserved and returned.
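A minimal sketch of the pattern this fix adopts, assuming spaCy and an English model are installed (the model name and variable names are illustrative, not the component's actual code):

    import spacy
    from spacy.tokens import Doc

    nlp = spacy.load("en_core_web_sm")  # any spaCy model works here

    texts = ["I have a feeling", "", "I am the last message"]
    indexed = list(enumerate(texts))

    # Only content-bearing samples are sent through the pipeline ...
    to_pipe = [(i, t) for i, t in indexed if t != ""]
    piped = [
        (i, doc)
        for (i, _), doc in zip(to_pipe, nlp.pipe([t for _, t in to_pipe]))
    ]

    # ... empty samples become empty Doc-objects directly ...
    empties = [(i, Doc(nlp.vocab)) for i, t in indexed if t == ""]

    # ... and both lists are merged back into the original sample order.
    docs = [doc for _, doc in sorted(piped + empties)]
    assert len(docs) == len(texts) and len(docs[1]) == 0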

[1.3.3] - 2019-09-13
^^^^^^^^^^^^^^^^^^^^
8 changes: 6 additions & 2 deletions docs/_static/spec/rasa.yml
@@ -69,9 +69,9 @@ paths:
      operationId: getStatus
      tags:
        - Server Information
      summary: Status of the currently loaded Rasa model
      summary: Status of the Rasa server
      description: >-
        Information about the currently loaded Rasa model.
        Information about the server and the currently loaded Rasa model.
      responses:
        200:
          description: Success
@@ -98,6 +98,10 @@ paths:
                    type: string
                    description: Path of the loaded model
                    example: 20190429-103105.tar.gz
                  num_active_training_jobs:
                    type: integer
                    description: Number of running training processes
                    example: 2
        401:
          $ref: '#/components/responses/401NotAuthenticated'
        403:
7 changes: 7 additions & 0 deletions docs/core/retrieval-actions.rst
@@ -52,6 +52,9 @@ You can cover all of these with a single story where the above intents are group
A retrieval action uses the output of a :ref:`response-selector` component from NLU, which learns a
retrieval model to predict the correct response from a list of candidate responses, given a user message.


.. _retrieval-training-data:

Training Data
^^^^^^^^^^^^^

@@ -95,6 +98,10 @@ This is a key difference to the response templates in your domain file.
to the training process. The contents of it cannot be a part of the file which contains training data for other
components of NLU.

.. note::
    As shown in the above examples, the ``/`` symbol is reserved as a delimiter to separate retrieval
    intents from response text identifiers. Make sure not to use it in the name of your intents.
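To make the reserved delimiter concrete, here is a small illustrative check (the intent name ``faq/ask_name`` is an example, not taken from the docs above):

    # A full retrieval intent name splits into the retrieval intent and
    # the response text identifier at the first "/".
    retrieval_intent, response_key = "faq/ask_name".split("/", 1)
    assert (retrieval_intent, response_key) == ("faq", "ask_name")

    # An ordinary intent accidentally named with a "/" would be parsed
    # the same way and mistaken for a retrieval intent.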

Config File
^^^^^^^^^^^

6 changes: 6 additions & 0 deletions docs/core/stories.rst
@@ -74,6 +74,12 @@ to predict the next action based on a *combination* of both the intent and
entities (you can, however, change this behavior using the
:ref:`use_entities <use_entities>` attribute).

.. warning::
    The ``/`` symbol is reserved as a delimiter to separate retrieval intents from response text identifiers.
    Refer to the ``Training Data Format`` section of :ref:`retrieval-actions` for more details on this format.
    If any of the intent names contains the delimiter, the file containing these stories will be considered a
    training file for the :ref:`response-selector` model and will be ignored when training Core models.

Actions
~~~~~~~
While writing stories, you will encounter two types of actions: utterances
2 changes: 2 additions & 0 deletions docs/migration-guide.rst
@@ -50,6 +50,8 @@ General
an entity set, this will influence the weighted precision and f1-score quite a bit. From now on we
exclude ``no-entity`` from the evaluation. The overall metrics now only include proper entities. You
might see a drop in the performance scores when running the evaluation again.
- ``/`` is reserved as a delimiter token to distinguish between a retrieval intent and the corresponding response
  text identifier. Make sure you don't include the ``/`` symbol in the names of your intents.

.. _migration-to-rasa-1.0:

4 changes: 4 additions & 0 deletions docs/nlu/training-data-format.rst
@@ -72,6 +72,10 @@ Lookup tables may be specified either directly as lists or as txt files containi
.. note::
    The common theme here is that common examples, regex features and lookup tables merely act as cues to the final NLU model by providing additional features to the machine learning algorithm during training. Therefore, it must not be assumed that having a single example would be enough for the model to robustly identify intents and/or entities across all variants of that example.

.. note::
    The ``/`` symbol is reserved as a delimiter to separate retrieval intents from response text identifiers.
    Make sure not to use it in the name of your intents.

JSON Format
-----------

8 changes: 7 additions & 1 deletion examples/nlg_server/nlg_server.py
@@ -1,5 +1,6 @@
import argparse
import logging
import os

from sanic import Sanic, response

@@ -70,7 +71,12 @@ async def nlg(request):

        return response.json(bot_response)

    app.run(host="0.0.0.0", port=port, workers=workers)
    app.run(
        host="0.0.0.0",
        port=port,
        workers=workers,
        backlog=int(os.environ.get("SANIC_BACKLOG", "100")),
    )


if __name__ == "__main__":
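The environment variable consumed above can be exercised in isolation; a small sketch of the lookup the server now performs at startup (the value 512 is an arbitrary example):

    import os

    # A deployment would export SANIC_BACKLOG before starting the server;
    # it is set in-process here purely for illustration.
    os.environ["SANIC_BACKLOG"] = "512"
    backlog = int(os.environ.get("SANIC_BACKLOG", "100"))
    assert backlog == 512  # without the variable, the fallback is 100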
6 changes: 5 additions & 1 deletion rasa/core/agent.py
@@ -711,7 +711,11 @@ def handle_channels(

        update_sanic_log_level()

        app.run(host="0.0.0.0", port=http_port)
        app.run(
            host="0.0.0.0",
            port=http_port,
            backlog=int(os.environ.get("SANIC_BACKLOG", "100")),
        )

        # this might seem unnecessary (as run does not return until the server
        # is killed) - but we use it for tests where we mock `.run` to directly
8 changes: 7 additions & 1 deletion rasa/core/run.py
@@ -1,5 +1,6 @@
import asyncio
import logging
import os
import shutil
from functools import partial
from typing import List, Optional, Text, Union
@@ -192,7 +193,12 @@ async def clear_model_files(app: Sanic, _loop: Text) -> None:

    update_sanic_log_level(log_file)

    app.run(host="0.0.0.0", port=port, ssl=ssl_context)
    app.run(
        host="0.0.0.0",
        port=port,
        ssl=ssl_context,
        backlog=int(os.environ.get("SANIC_BACKLOG", "100")),
    )


# noinspection PyUnusedLocal
3 changes: 2 additions & 1 deletion rasa/nlu/training_data/loading.py
@@ -132,7 +132,6 @@ def _load(filename: Text, language: Optional[Text] = "en") -> Optional["Training
    if fformat == UNK:
        raise ValueError("Unknown data format for file '{}'.".format(filename))

    logger.debug("Training data format of '{}' is '{}'.".format(filename, fformat))
    reader = _reader_factory(fformat)

    if reader:
@@ -174,6 +173,8 @@ def guess_format(filename: Text) -> Text:
            guess = fformat
            break

    logger.debug("Training data format of '{}' is '{}'.".format(filename, guess))

    return guess


90 changes: 85 additions & 5 deletions rasa/nlu/utils/spacy_utils.py
@@ -1,7 +1,8 @@
import logging
import typing
from typing import Any, Dict, List, Optional, Text
from typing import Any, Dict, List, Optional, Text, Tuple

from spacy.tokens import Doc
from rasa.nlu.components import Component
from rasa.nlu.config import RasaNLUModelConfig, override_defaults
from rasa.nlu.training_data import Message, TrainingData
@@ -129,18 +130,97 @@ def get_text(self, example, attribute):

        return self.preprocess_text(example.get(attribute))

    @staticmethod
    def merge_content_lists(
        indexed_training_samples: List[Tuple[int, Text]],
        doc_lists: List[Tuple[int, "Doc"]],
    ) -> List[Tuple[int, "Doc"]]:
        """Merge lists with processed Docs back into their original order."""

        dct = dict(indexed_training_samples)
        dct.update(dict(doc_lists))
        return sorted(dct.items())

    @staticmethod
    def filter_training_samples_by_content(
        indexed_training_samples: List[Tuple[int, Text]]
    ) -> Tuple[List[Tuple[int, Text]], List[Tuple[int, Text]]]:
        """Separates empty training samples from content-bearing ones."""

        docs_to_pipe = list(
            filter(
                lambda training_sample: training_sample[1] != "",
                indexed_training_samples,
            )
        )
        empty_docs = list(
            filter(
                lambda training_sample: training_sample[1] == "",
                indexed_training_samples,
            )
        )
        return docs_to_pipe, empty_docs

    def process_content_bearing_samples(
        self, samples_to_pipe: List[Tuple[int, Text]]
    ) -> List[Tuple[int, "Doc"]]:
        """Sends content-bearing training samples to spaCy's pipe."""

        docs = [
            (to_pipe_sample[0], doc)
            for to_pipe_sample, doc in zip(
                samples_to_pipe,
                self.nlp.pipe([txt for _, txt in samples_to_pipe], batch_size=50),
            )
        ]
        return docs

    def process_non_content_bearing_samples(
        self, empty_samples: List[Tuple[int, Text]]
    ) -> List[Tuple[int, "Doc"]]:
        """Creates empty Doc-objects from zero-length training sample strings."""

        n_docs = [
            (empty_sample[0], Doc(self.nlp.vocab)) for empty_sample in empty_samples
        ]
        return n_docs

    def docs_for_training_data(
        self, training_data: TrainingData
    ) -> Dict[Text, List[Any]]:

        attribute_docs = {}
        for attribute in SPACY_FEATURIZABLE_ATTRIBUTES:

            texts = [self.get_text(e, attribute) for e in training_data.intent_examples]
            # Index and freeze indices of the training samples for preserving the order
            # after processing the data.
            indexed_training_samples = [(idx, text) for idx, text in enumerate(texts)]

            samples_to_pipe, empty_samples = self.filter_training_samples_by_content(
                indexed_training_samples
            )

            content_bearing_docs = self.process_content_bearing_samples(samples_to_pipe)

            docs = [doc for doc in self.nlp.pipe(texts, batch_size=50)]
            non_content_bearing_docs = self.process_non_content_bearing_samples(
                empty_samples
            )

            attribute_document_list = self.merge_content_lists(
                indexed_training_samples,
                content_bearing_docs + non_content_bearing_docs,
            )

            attribute_docs[attribute] = docs
            # Only the processed Doc-objects are needed from here on, so drop
            # the indices and keep the second element of each (index, doc) tuple.
            attribute_docs[attribute] = [doc for _, doc in attribute_document_list]
        return attribute_docs

    def train(
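To see why ``merge_content_lists`` restores the original order, a worked example with plain strings standing in for ``Doc``-objects (illustrative data only):

    # The dict is seeded with (index, text) pairs; updating it with the
    # processed (index, doc) pairs overwrites every entry, and sorting
    # the items by index restores the original sample order.
    indexed_samples = [(0, "hello"), (1, ""), (2, "bye")]
    processed = [(2, "DOC(bye)"), (0, "DOC(hello)"), (1, "DOC()")]  # any order

    merged = dict(indexed_samples)
    merged.update(dict(processed))
    assert sorted(merged.items()) == [
        (0, "DOC(hello)"),
        (1, "DOC()"),
        (2, "DOC(bye)"),
    ]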
11 changes: 11 additions & 0 deletions rasa/server.py
@@ -2,6 +2,7 @@
import os
import tempfile
import traceback
import multiprocessing
from functools import wraps, reduce
from inspect import isawaitable
from typing import Any, Callable, List, Optional, Text, Union
@@ -358,6 +359,9 @@ def create_app(
    )

    app.agent = agent
    # Initialize shared object of type unsigned int for tracking
    # the number of active training processes
    app.active_training_processes = multiprocessing.Value("I", 0)

    @app.exception(ErrorResponse)
    async def handle_error_response(request: Request, exception: ErrorResponse):
@@ -386,6 +390,7 @@ async def status(request: Request):
            {
                "model_file": app.agent.model_directory,
                "fingerprint": fingerprint_from_path(app.agent.model_directory),
                "num_active_training_jobs": app.active_training_processes.value,
            }
        )

@@ -666,6 +671,9 @@ async def train(request: Request):
        dump_obj_as_str_to_file(domain_path, rjs["domain"])

        try:
            with app.active_training_processes.get_lock():
                app.active_training_processes.value += 1

            model_path = await train_async(
                domain=domain_path,
                config=config_path,
@@ -692,6 +700,9 @@
"TrainingError",
"An unexpected error occurred during training. Error: {}".format(e),
)
finally:
with app.active_training_processes.get_lock():
app.active_training_processes.value -= 1

    def validate_request(rjs):
        if "config" not in rjs:
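The counter added above is a process-safe ``multiprocessing.Value``; a standalone sketch of the same increment/decrement discipline (the job function is illustrative):

    import multiprocessing

    active = multiprocessing.Value("I", 0)  # "I" = unsigned int, starts at 0

    def run_job():
        with active.get_lock():
            active.value += 1
        try:
            pass  # ... the actual training work would happen here ...
        finally:
            # Mirrors the try/finally above: the counter is decremented
            # even when the job raises, so it can never leak.
            with active.get_lock():
                active.value -= 1

    run_job()
    assert active.value == 0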
2 changes: 1 addition & 1 deletion rasa/version.py
@@ -1 +1 @@
__version__ = "1.3.3"
__version__ = "1.3.4"
4 changes: 3 additions & 1 deletion scripts/ping_slack_about_package_release.sh
@@ -1,4 +1,6 @@
#!/usr/bin/env bash
#!/bin/bash

set -Eeuo pipefail

if [[ ${TRAVIS_TAG} =~ ^[0-9]+\.[0-9]+\.[0-9]+$ ]]; then
    curl -X POST -H "Content-type: application/json" \
25 changes: 25 additions & 0 deletions tests/nlu/base/test_featurizers.py
@@ -29,6 +29,31 @@ def test_spacy_featurizer(sentence, expected, spacy_nlp):
    assert np.allclose(vecs, doc.vector, atol=1e-5)


def test_spacy_training_sample_alignment(spacy_nlp_component):
    from spacy.tokens import Doc

    m1 = Message.build(text="I have a feeling", intent="feeling")
    m2 = Message.build(text="", intent="feeling")
    m3 = Message.build(text="I am the last message", intent="feeling")
    td = TrainingData(training_examples=[m1, m2, m3])

    attribute_docs = spacy_nlp_component.docs_for_training_data(td)

    assert isinstance(attribute_docs["text"][0], Doc)
    assert isinstance(attribute_docs["text"][1], Doc)
    assert isinstance(attribute_docs["text"][2], Doc)

    assert [t.text for t in attribute_docs["text"][0]] == ["i", "have", "a", "feeling"]
    assert [t.text for t in attribute_docs["text"][1]] == []
    assert [t.text for t in attribute_docs["text"][2]] == [
        "i",
        "am",
        "the",
        "last",
        "message",
    ]


def test_spacy_intent_featurizer(spacy_nlp_component):
    from rasa.nlu.featurizers.spacy_featurizer import SpacyFeaturizer
