Regex phrase matcher #1312

Merged 71 commits into master from regex_phrase_matcher on Sep 11, 2018
71 commits
803508b
working prototype of lookup section
twhughes Aug 13, 2018
c48b22b
ported to rasa_nlu training data
twhughes Aug 13, 2018
62ecff6
added tests
twhughes Aug 13, 2018
f5b8cbf
removed a print statement
twhughes Aug 13, 2018
0b3e267
added to docs
twhughes Aug 13, 2018
f34a588
replaced print() by logger.info()
twhughes Aug 13, 2018
b0d7d0e
fixed minor formatting issues
twhughes Aug 13, 2018
a8fe358
fixed pep errors
twhughes Aug 13, 2018
2f835f9
fixed some of the code climate warnings
twhughes Aug 13, 2018
9a65ccc
add lookup tables of all US street and cities
twhughes Aug 14, 2018
a158890
fixed pep8 errors
twhughes Aug 15, 2018
05e7c1b
explicitly remove empty regex strings
twhughes Aug 23, 2018
c64b71c
switch single quotes to double quotes
twhughes Aug 23, 2018
7823150
fixed a typo
twhughes Aug 23, 2018
4853ecb
removed empty lookup table elements by default
twhughes Aug 23, 2018
f5badd1
trying to fix codeclimate errors
twhughes Aug 23, 2018
6d53408
pesky codeclimate
twhughes Aug 23, 2018
14c9fbd
fixed new pep8 error
twhughes Aug 24, 2018
7c3bb39
Merge branch 'master' of https://github.com/rasaHQ/rasa_nlu into rege…
twhughes Aug 27, 2018
1bc3b19
Merge branch 'master' into regex_phrase_matcher
akelad Aug 29, 2018
fd7b446
Merge branch 'regex_phrase_matcher' of https://github.com/rasaHQ/rasa…
twhughes Aug 29, 2018
c4fee01
changed dataformat.rst
twhughes Aug 29, 2018
923307f
added warnings for large lookups. got rid of print_size arg
twhughes Aug 29, 2018
f4657c1
added word boundaries
twhughes Aug 29, 2018
4f2b51e
changed file IO stuff
twhughes Aug 29, 2018
0e6cdff
updated tests to have word boundaries
twhughes Aug 29, 2018
a9a809e
changed markdown example
twhughes Aug 29, 2018
450c615
fixed pep8 errors (line too long)
twhughes Aug 29, 2018
34473ac
fixed even more pep8 errors
twhughes Aug 29, 2018
94adcba
fixed more pep8 errors
twhughes Aug 29, 2018
a4e405d
added comment in dataformat example
twhughes Aug 30, 2018
16640f0
changed to debug logging
twhughes Aug 30, 2018
3cf1aed
made things newline-separated
twhughes Aug 30, 2018
30ba83d
Merge branch 'master' into regex_phrase_matcher
akelad Aug 30, 2018
ece424b
moved lookup table regex creation to regex featurizer
twhughes Aug 30, 2018
a49aed4
fixed akelas minor comments
twhughes Aug 30, 2018
bc465f3
changed docs
twhughes Aug 30, 2018
9c899a1
merge
twhughes Aug 30, 2018
a5e9e9e
merge
twhughes Aug 30, 2018
d744b50
removed unnecessary imports
twhughes Aug 30, 2018
adc4e7d
removed outdated training data test
twhughes Aug 30, 2018
2f22c31
fixed pep8
twhughes Aug 30, 2018
e9e5389
fixed pep8 again
twhughes Aug 30, 2018
255b857
blank line had whitespace..
twhughes Aug 30, 2018
669965c
added json direct elements
twhughes Sep 5, 2018
c4bef3d
working markdown
twhughes Sep 5, 2018
7772517
markdown works and passes tests:
twhughes Sep 5, 2018
1aed3dd
pep8
twhughes Sep 5, 2018
b291b9e
markdown writing
twhughes Sep 5, 2018
d0d7d91
merge
twhughes Sep 5, 2018
9144448
cleaning
twhughes Sep 5, 2018
cadff87
pep8 formatting
twhughes Sep 5, 2018
96d2082
spacing
twhughes Sep 5, 2018
6387f8c
added unicode checking for python2.7
twhughes Sep 6, 2018
a25553e
reverted some pep8 errors. got rid of unnecessary files
twhughes Sep 6, 2018
562d6a0
remove pytest cache
twhughes Sep 6, 2018
4d96f61
cleaned and made tests work
twhughes Sep 6, 2018
972add6
clear distinction between markdown and json in dataformat
twhughes Sep 11, 2018
fae32dd
added error handling for opening lookup table file
twhughes Sep 11, 2018
bf1e9e1
fixed indentation
twhughes Sep 11, 2018
2b5a674
got rid of multi-line comment
twhughes Sep 11, 2018
0f9f3d8
lookup_tables=None as default argument
twhughes Sep 11, 2018
dc3b90b
removed commented out pdb.set_trace()
twhughes Sep 11, 2018
9c59053
merged changelog
twhughes Sep 11, 2018
0bec88e
escape re special characters in lookup table elements added test
twhughes Sep 11, 2018
621ad79
Update CHANGELOG.rst
twhughes Sep 11, 2018
baadb4e
Update rasa.py
twhughes Sep 11, 2018
fa4448c
Update rasa.py
twhughes Sep 11, 2018
d64ce26
added suggestion on file exception
twhughes Sep 11, 2018
a4fe9bf
fixed syntax
twhughes Sep 11, 2018
06ced24
Merge branch 'regex_phrase_matcher' of https://github.com/rasaHQ/rasa…
twhughes Sep 11, 2018
1 change: 1 addition & 0 deletions CHANGELOG.rst
@@ -15,6 +15,7 @@ Added
- ``DataRouter()`` class supports a ``model_server`` ``EndpointConfig``, which it regularly queries to fetch NLU models
- this can be used with ``rasa_nlu.server`` with the ``--endpoint`` option (the key for the model server config is ``model``)
- docs on model fetching from a URL
- ability to specify lookup tables in training data

Changed
-------
21 changes: 21 additions & 0 deletions data/test/lookup_tables/lookup_table.json
@@ -0,0 +1,21 @@
{
"rasa_nlu_data": {
"lookup_tables": [
{
"name": "plates",
"elements": "data/test/lookup_tables/plates.txt"
},
{
"name": "drinks",
"elements": ["mojito", "lemonade", "sweet berry wine", "tea", "club mate"]
}
],
"common_examples": [
{
"text": "hey",
"intent": "greet",
"entities": []
}
]
}
}
15 changes: 15 additions & 0 deletions data/test/lookup_tables/lookup_table.md
@@ -0,0 +1,15 @@
## intent:restaurant_search
- i'm looking for a [sushi](food) place to eat
- I want to grab [tacos](food)
- I am searching for a [pizza](food) spot
- I would like to drink [sweet berry wine](beverage) with my meal

## lookup:plates
data/test/lookup_tables/plates.txt

## lookup:drinks
- mojito
- lemonade
- sweet berry wine
- tea
- club mate
5 changes: 5 additions & 0 deletions data/test/lookup_tables/plates.txt
@@ -0,0 +1,5 @@
tacos
beef
mapo tofu
burrito
lettuce wrap
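
These fixtures can be loaded and inspected directly. A minimal sketch (assuming `load_data` from `rasa_nlu.training_data` and the `lookup_tables` attribute this PR adds to `TrainingData`):

from rasa_nlu.training_data import load_data

# path matches the markdown fixture added above
td = load_data("data/test/lookup_tables/lookup_table.md")
for table in td.lookup_tables:
    print(table["name"], table["elements"])
# expected, roughly:
#   plates data/test/lookup_tables/plates.txt
#   drinks ['mojito', 'lemonade', 'sweet berry wine', 'tea', 'club mate']
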
55 changes: 54 additions & 1 deletion docs/dataformat.rst
@@ -26,6 +26,7 @@ Examples are grouped by intent, and entities are annotated as markdown links.
- what is my balance <!-- no entity -->
- how much do I have on my [savings](source_account) <!-- entity "source_account" has value "savings" -->
- how much do I have on my [savings account](source_account:savings) <!-- synonyms, method 1-->
- Could I pay in [yen](currency)? <!-- entity matched by lookup table -->

## intent:greet
- hey
@@ -37,13 +38,21 @@ Examples are grouped by intent, and entities are annotated as markdown links.
## regex:zipcode
- [0-9]{5}

## lookup:currencies <!-- lookup table list -->
- Yen
- USD
- Euro

## lookup:additional_currencies <!-- lookup table loaded from an external file -->
path/to/currencies.txt

The training data for Rasa NLU is structured into different parts:
examples, synonyms, and regex features.
examples, synonyms, regex features, and lookup tables.

Synonyms will map extracted entities to the same name, for example mapping "my savings account" to simply "savings".
However, this only happens *after* the entities have been extracted, so you need to provide examples with the synonyms present so that Rasa can learn to pick them up.

Lookup tables may be specified either directly as lists or as txt files containing newline-separated words or phrases. Upon loading the training data, these files are used to generate case-insensitive regex patterns that are added to the regex features. In the example above, a list of currency names is supplied so that it is easier to pick out the ``currency`` entity.

JSON Format
-----------
Expand All @@ -58,6 +67,7 @@ The most important one is ``common_examples``.
"rasa_nlu_data": {
"common_examples": [],
"regex_features" : [],
"lookup_tables" : [],
"entity_synonyms": []
}
}
@@ -230,6 +240,49 @@ for these extractors. Currently, all intent classifiers make use of available re
training data!


Lookup Tables
-------------
Lookup tables may also be specified in the training data, either as external files or as lists of elements. Externally supplied lookup tables must be in newline-separated format. For example, ``data/test/lookup_tables/plates.txt`` may contain:

.. include:: ../data/test/lookup_tables/plates.txt

This file can then be loaded into the training data as follows:

.. code-block:: json

{
"rasa_nlu_data": {
"lookup_tables": [
{
"name": "plates",
"elements": "data/test/lookup_tables/plates.txt"
}
]
}
}

Alternatively, the lookup elements may be included directly as a list:

.. code-block:: json

{
"rasa_nlu_data": {
"lookup_tables": [
{
"name": "plates",
"elements": ["beans", "rice", "tacos", "cheese"]
}
]
}
}

When lookup tables are supplied in training data, the contents are combined into a large, case-insensitive regex pattern that looks for exact matches in the training examples. These regexes match over multiple tokens, so ``lettuce wrap`` would match ``get me a lettuce wrap ASAP`` as ``[0 0 0 1 1 0]``. These regexes are processed identically to the regular regex patterns directly specified in the training data.
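
For illustration, a minimal standalone sketch that reproduces this per-token matching with plain ``re``, using the same word-boundary pattern construction as the featurizer (escaping details may differ slightly):

.. code-block:: python

    import re

    elements = ["tacos", "beef", "mapo tofu", "burrito", "lettuce wrap"]
    pattern = "(?i)(\\b" + "\\b|\\b".join(re.escape(e) for e in elements) + "\\b)"

    text = "get me a lettuce wrap ASAP"
    match = re.search(pattern, text)  # spans "lettuce wrap"

    # mark every token that overlaps the match, giving [0 0 0 1 1 0]
    features, offset = [], 0
    for token in text.split():
        start = text.index(token, offset)
        end = start + len(token)
        overlaps = match and start < match.end() and end > match.start()
        features.append(1 if overlaps else 0)
        offset = end
    print(features)  # [0, 0, 0, 1, 1, 0]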

.. note::
For lookup tables to be effective, there must be a few examples of matches in your training data. Otherwise the model will not learn to use the lookup table match features.

.. warning::
Be careful about what you put in a lookup table. If some of its elements also match commonly occurring words that are not the entity you want to extract, the lookup features lose their value and can even hurt the performance of entity recognition. Use lookup tables only for lists of unambiguous phrases or tokens, and filter out potentially problematic elements.


Organization
------------

50 changes: 49 additions & 1 deletion rasa_nlu/featurizers/regex_featurizer.py
@@ -7,6 +7,8 @@
import os
import re
import warnings
import io
import sys

import typing
from typing import Any, Dict, List, Optional, Text
@@ -35,15 +37,20 @@ class RegexFeaturizer(Featurizer):

requires = ["tokens"]

def __init__(self, component_config=None, known_patterns=None):
def __init__(self, component_config=None,
known_patterns=None, lookup_tables=None):

super(RegexFeaturizer, self).__init__(component_config)

self.known_patterns = known_patterns if known_patterns else []
lookup_tables = lookup_tables or []
self._add_lookup_table_regexes(lookup_tables)

def train(self, training_data, config, **kwargs):
# type: (TrainingData, RasaNLUModelConfig, **Any) -> None

self.known_patterns = training_data.regex_features
self._add_lookup_table_regexes(training_data.lookup_tables)

for example in training_data.training_examples:
updated = self._text_features_with_regex(example)
@@ -62,6 +69,15 @@ def _text_features_with_regex(self, message):
else:
return message.get("text_features")

def _add_lookup_table_regexes(self, lookup_tables):
# appends the regex features from the lookup tables to
# self.known_patterns
for table in lookup_tables:
regex_pattern = self._generate_lookup_regex(table)
lookup_regex = {'name': table['name'],
'pattern': regex_pattern}
self.known_patterns.append(lookup_regex)

def features_for_patterns(self, message):
"""Checks which known patterns match the message.

@@ -87,6 +103,38 @@ def features_for_patterns(self, message):
found = [1.0 if m is not None else 0.0 for m in matches]
return np.array(found)

def _generate_lookup_regex(self, lookup_table):
"""creates a regex out of the contents of a lookup table file"""
lookup_elements = lookup_table['elements']
elements_to_regex = []

# if it's a list, it should be the elements directly
if isinstance(lookup_elements, list):
elements_to_regex = lookup_elements

# otherwise it's a file path.
else:

try:
f = io.open(lookup_elements, 'r')
except IOError:
raise ValueError("Could not load lookup table {}"
"Make sure you've provided the correct path"
.format(lookup_elements))

with f:
for line in f:
new_element = line.strip()
if new_element:
elements_to_regex.append(new_element)

# sanitize the regex, escape special characters
elements_sanitized = [re.escape(e) for e in elements_to_regex]

# regex matching elements with word boundaries on either side
regex_string = '(?i)(\\b' + '\\b|\\b'.join(elements_sanitized) + '\\b)'
return regex_string

@classmethod
def load(cls,
model_dir=None, # type: Optional[Text]
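
A rough usage sketch of the new `lookup_tables` constructor argument (assuming the featurizer can be instantiated standalone; the inline table below is a made-up example mirroring the plates fixture):

from rasa_nlu.featurizers.regex_featurizer import RegexFeaturizer

tables = [{"name": "plates", "elements": ["tacos", "beef", "mapo tofu"]}]

featurizer = RegexFeaturizer(component_config=None,
                             known_patterns=None,
                             lookup_tables=tables)

# each lookup table is compiled into a single case-insensitive,
# word-bounded pattern and appended to known_patterns
print(featurizer.known_patterns)
# e.g. [{'name': 'plates',
#        'pattern': '(?i)(\\btacos\\b|\\bbeef\\b|\\bmapo\\ tofu\\b)'}]
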
59 changes: 54 additions & 5 deletions rasa_nlu/training_data/formats/markdown.py
@@ -15,13 +15,15 @@
INTENT = "intent"
SYNONYM = "synonym"
REGEX = "regex"
available_sections = [INTENT, SYNONYM, REGEX]
LOOKUP = "lookup"
available_sections = [INTENT, SYNONYM, REGEX, LOOKUP]
ent_regex = re.compile(r'\[(?P<entity_text>[^\]]+)'
r'\]\((?P<entity>[^:)]*?)'
r'(?:\:(?P<value>[^)]+))?\)') # [entity_text](entity_type(:entity_synonym)?)

item_regex = re.compile(r'\s*[-\*+]\s*(.+)')
comment_regex = re.compile(r'<!--[\s\S]*?--!*>', re.MULTILINE)
fname_regex = re.compile(r'\s*([^-\*+]+)')

logger = logging.getLogger(__name__)

@@ -36,6 +38,7 @@ def __init__(self):
self.entity_synonyms = {}
self.regex_features = []
self.section_regexes = self._create_section_regexes(available_sections)
self.lookup_tables = []

def reads(self, s, **kwargs):
"""Read markdown string and create TrainingData object"""
@@ -48,8 +51,9 @@ def reads(self, s, **kwargs):
self._set_current_section(header[0], header[1])
else:
self._parse_item(line)

return TrainingData(self.training_examples, self.entity_synonyms, self.regex_features)
self._load_files(line)
return TrainingData(self.training_examples, self.entity_synonyms,
self.regex_features, self.lookup_tables)

@staticmethod
def _strip_comments(text):
@@ -71,6 +75,17 @@ def _find_section_header(self, line):
return name, match.group(1)
return None

def _load_files(self, line):
"""Checks line to see if filename was supplied. If so, inserts the
filename into the lookup table slot for processing from the regex
featurizer."""
if self.current_section == LOOKUP:
match = re.match(fname_regex, line)
if match:
fname = match.group(1)
self.lookup_tables.append(
{"name": self.current_title, "elements": str(fname)})

def _parse_item(self, line):
"""Parses an md list item line based on the current section type."""
match = re.match(item_regex, line)
@@ -81,8 +96,22 @@ def _parse_item(self, line):
self.training_examples.append(parsed)
elif self.current_section == SYNONYM:
self._add_synonym(item, self.current_title)
else:
self.regex_features.append({"name": self.current_title, "pattern": item})
elif self.current_section == REGEX:
self.regex_features.append(
{"name": self.current_title, "pattern": item})
elif self.current_section == LOOKUP:
self._add_item_to_lookup(item)

def _add_item_to_lookup(self, item):
"""Takes a list of lookup table dictionaries. Finds the one associated
with the current lookup, then adds the item to the list."""
matches = [l for l in self.lookup_tables
if l["name"] == self.current_title]
if not matches:
self.lookup_tables.append({"name": self.current_title, "elements": [item]})
else:
elements = matches[0]['elements']
elements.append(item)

def _find_entities_in_training_example(self, example):
"""Extracts entities from a markdown intent example."""
@@ -141,6 +170,7 @@ def dumps(self, training_data):
md += self._generate_training_examples_md(training_data)
md += self._generate_synonyms_md(training_data)
md += self._generate_regex_features_md(training_data)
md += self._generate_lookup_tables_md(training_data)

return md

@@ -183,6 +213,21 @@ def _generate_regex_features_md(self, training_data):

return md

def _generate_lookup_tables_md(self, training_data):
"""generates markdown for regex features."""
md = u''
# regex features are already sorted
lookup_tables = training_data.lookup_tables
for i, lookup_table in enumerate(lookup_tables):
md += self._generate_section_header_md(LOOKUP, lookup_table["name"])
elements = lookup_table["elements"]
if isinstance(elements, list):
for e in elements:
md += self._generate_item_md(e)
else:
md += self._generate_fname_md(elements)
return md

def _generate_section_header_md(self, section_type, title, prepend_newline=True):
"""generates markdown section header."""
prefix = "\n" if prepend_newline else ""
@@ -192,6 +237,10 @@ def _generate_item_md(self, text):
"""generates markdown for a list item."""
return "- {}\n".format(text)

def _generate_fname_md(self, text):
"""generates markdown for a lookup table file path."""
return " {}\n".format(text)

def _generate_message_md(self, message):
"""generates markdown for a message object."""
md = ''
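
Finally, a small sketch of the new `## lookup:` markdown sections in action (assuming the reader class in this module is `MarkdownReader`):

from rasa_nlu.training_data.formats.markdown import MarkdownReader

md = """## lookup:drinks
- mojito
- lemonade

## lookup:plates
data/test/lookup_tables/plates.txt
"""

td = MarkdownReader().reads(md)
print(td.lookup_tables)
# roughly: [{'name': 'drinks', 'elements': ['mojito', 'lemonade']},
#           {'name': 'plates', 'elements': 'data/test/lookup_tables/plates.txt'}]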