Merge branch 'master' into tracker-sessions

RasaHQ · Dec 11, 2019 · a1d7243
2 parents 3b167f4 + 322da75
commit a1d7243
Show file tree

Hide file tree

Showing 4 changed files with 34 additions and 22 deletions.
diff --git a/changelog/684.removal.rst b/changelog/684.removal.rst
@@ -0,0 +1,2 @@
+Specifying lookup tables directly in the NLU file is now deprecated. Please specify
+them in an external file.
diff --git a/docs/nlu/training-data-format.rst b/docs/nlu/training-data-format.rst
@@ -46,12 +46,7 @@ e.g. ``[entity](entity name)``.
     ## regex:zipcode
     - [0-9]{5}
 
-    ## lookup:currencies   <!-- lookup table list -->
-    - Yen
-    - USD
-    - Euro
-
-    ## lookup:additional_currencies  <!-- no list to specify lookup table file -->
+    ## lookup:additional_currencies  <!-- specify lookup tables in an external file -->
     path/to/currencies.txt
 
 The training data for Rasa NLU is structured into different parts:
@@ -67,7 +62,9 @@ learn the domain with fewer examples and also help it be more confident of its p
 Synonyms will map extracted entities to the same name, for example mapping "my savings account" to simply "savings".
 However, this only happens *after* the entities have been extracted, so you need to provide examples with the synonyms present so that Rasa can learn to pick them up.
 
-Lookup tables may be specified either directly as lists or as txt files containing newline-separated words or phrases.  Upon loading the training data, these files are used to generate case-insensitive regex patterns that are added to the regex features.  For example, in this case a list of currency names is supplied so that it is easier to pick out this entity.
+Lookup tables may be specified as plain text files containing newline-separated words or 
+phrases. Upon loading the training data, these files are used to generate
+case-insensitive regex patterns that are added to the regex features.
 
 .. note::
     The common theme here is that common examples, regex features and lookup tables merely act as cues to the final NLU model by providing additional features to the machine learning algorithm during training. Therefore, it must not be assumed that having a single example would be enough for the model to robustly identify intents and/or entities across all variants of that example.
@@ -155,8 +152,7 @@ for these extractors. Currently, all intent classifiers make use of available re
 
 Lookup Tables
 -------------
-Lookup tables in the form of external files or lists of elements may also be specified in the training data.
-The externally supplied lookup tables must be in a newline-separated format.
+The supplied lookup table files must be in a newline-delimited format.
 For example, ``data/test/lookup_tables/plates.txt`` may contain:
 
 .. literalinclude:: ../../data/test/lookup_tables/plates.txt
@@ -168,16 +164,6 @@ And can be loaded as:
     ## lookup:plates
     data/test/lookup_tables/plates.txt
 
-Alternatively, lookup elements may be directly included as a list
-
-.. code-block:: md
-
-    ## lookup:plates
-    - beans
-    - rice
-    - tacos
-    - cheese
-
 When lookup tables are supplied in training data, the contents are combined
 into a large, case-insensitive regex pattern that looks for exact matches in
 the training examples. These regexes match over multiple tokens, so

diff --git a/rasa/nlu/featurizers/regex_featurizer.py b/rasa/nlu/featurizers/regex_featurizer.py
@@ -1,10 +1,11 @@
-import io
 import logging
+import warnings
+
 import numpy as np
 import os
 import re
 import typing
-from typing import Any, Dict, Optional, Text
+from typing import Any, Dict, Optional, Text, Union, List
 
 from rasa.nlu import utils
 from rasa.nlu.config import RasaNLUModelConfig
@@ -16,6 +17,7 @@
     MESSAGE_TEXT_ATTRIBUTE,
     MESSAGE_VECTOR_FEATURE_NAMES,
 )
+from rasa.constants import DOCS_BASE_URL
 
 logger = logging.getLogger(__name__)
 
@@ -101,14 +103,22 @@ def features_for_patterns(self, message) -> np.array:
 
         return np.array(found_patterns).astype(float)
 
-    def _generate_lookup_regex(self, lookup_table) -> Text:
+    def _generate_lookup_regex(
+        self, lookup_table: Dict[Text, Union[Text, List[Text]]]
+    ) -> Text:
         """creates a regex out of the contents of a lookup table file"""
         lookup_elements = lookup_table["elements"]
         elements_to_regex = []
 
         # if it's a list, it should be the elements directly
         if isinstance(lookup_elements, list):
             elements_to_regex = lookup_elements
+            warnings.warn(
+                f"Directly including lookup tables as a list is deprecated since Rasa "
+                f"1.6. See {DOCS_BASE_URL}/nlu/training-data-format/#lookup-tables "
+                f"how to do so.",
+                FutureWarning,
+            )
 
         # otherwise it's a file path.
         else:

diff --git a/tests/nlu/base/test_featurizers.py b/tests/nlu/base/test_featurizers.py
@@ -275,6 +275,20 @@ def test_lookup_tables(sentence, expected, labeled_tokens, spacy_nlp):
         assert num_matches == labeled_tokens.count(i)
 
 
+def test_lookup_table_deprecation():
+    from rasa.nlu.featurizers.regex_featurizer import RegexFeaturizer
+
+    lookups = [
+        {
+            "name": "drinks",
+            "elements": ["mojito", "lemonade", "sweet berry wine", "tea", "club?mate"],
+        }
+    ]
+
+    with pytest.warns(FutureWarning):
+        RegexFeaturizer(lookup_tables=lookups)
+
+
 def test_spacy_featurizer_casing(spacy_nlp):
     from rasa.nlu.featurizers import spacy_featurizer