Merge pull request #1095 from RasaHQ/lean-CRF

Fast language-agnostic NER CRF without spaCy
RasaHQ · Jun 1, 2018 · 271cebf · 271cebf
2 parents e0dd9fa + 95fecd7
commit 271cebf
Show file tree

Hide file tree

Showing 14 changed files with 220 additions and 76 deletions.
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -13,6 +13,8 @@ Added
 -----
 - doc link to a community contribution for Rasa NLU in Chinese
 - support for component ``count_vectors_featurizer`` to use the ``tokens`` feature provided by the tokenizer
+- 2-character and 5-character prefix features (``prefix2`` and ``prefix5``) to ``ner_crf``
+- ``ner_crf`` on whitespace-tokenized input to the ``tensorflow_embedding`` pipeline
 - predict empty string instead of None for intent name
 - update default parameters for tensorflow embedding classifier
 - do not predict anything if feature vector contains only zeros in tensorflow embedding classifier
@@ -22,9 +24,11 @@ Added
 
 Changed
 -------
+- L1 and L2 regularisation defaults in ``ner_crf`` both set to 0.1
 
 Removed
 -------
+- dependence on spaCy when training ``ner_crf`` without POS features
 
 Fixed
 -----

diff --git a/NOTICE b/NOTICE
@@ -0,0 +1,5 @@
+Rasa Technologies GmbH
+Copyright 2016-2018 Rasa Technologies GmbH
+
+This product includes software from spaCy (https://github.com/explosion/spaCy),
+under the MIT License (see: rasa_nlu.extractors.crf_entity_extractor).
diff --git a/alt_requirements/requirements_tensorflow_sklearn.txt b/alt_requirements/requirements_tensorflow_sklearn.txt
@@ -2,4 +2,5 @@
 -r requirements_bare.txt
 
 scikit-learn==0.19.1
-tensorflow==1.6.0
+tensorflow==1.6.0
+sklearn-crfsuite==0.3.6
diff --git a/docs/pipeline.rst b/docs/pipeline.rst
@@ -125,6 +125,8 @@ Here's an example configuration:
     language: "en"
 
     pipeline:
+    - name: "tokenizer_whitespace"
+    - name: "ner_crf"
     - name: "intent_featurizer_count_vectors"
     - name: "intent_classifier_tensorflow_embedding"
       intent_tokenization_flag: true
@@ -638,7 +640,7 @@ ner_spacy
         }
 
 :Description:
-    Using spacy this component predicts the entities of a message. spacy uses a statistical BILUO transition model.
+    Using spacy this component predicts the entities of a message. spacy uses a statistical BILOU transition model.
     As of now, this component can only use the spacy builtin entity extraction models and can not be retrained.
     This extractor does not provide any confidence scores.
 
@@ -702,6 +704,7 @@ ner_crf
     and the states are entity classes. Features of the words (capitalisation, POS tagging,
     etc.) give probabilities to certain entity classes, as are transitions between
     neighbouring entity tags: the most likely set of tags is then calculated and returned.
+    If POS features are used (``pos`` or ``pos2``), spaCy has to be installed.
 :Configuration:
    .. code-block:: yaml
 
@@ -713,9 +716,10 @@ ner_crf
           # in array before will have the feature
           # "is the preceding word in title case?".
           # Available features are:
-          # ``low``, ``title``, ``word3``, ``word2``, ``pos``,
-          # ``pos2``, ``bias``, ``upper`` and ``digit``
-          features: [["low", "title"], ["bias", "word3"], ["upper", "pos", "pos2"]]
+          # ``low``, ``title``, ``suffix5``, ``suffix3``, ``suffix2``,
+          # ``suffix1``, ``pos``, ``pos2``, ``prefix5``, ``prefix2``,
+          # ``bias``, ``upper`` and ``digit``
+          features: [["low", "title"], ["bias", "suffix3"], ["upper", "pos", "pos2"]]
 
           # The flag determines whether to use BILOU tagging or not. BILOU
           # tagging is more rigorous however
@@ -728,11 +732,11 @@ ner_crf
 
           # This is the value given to sklearn_crfsuite.CRF tagger before training.
           # Specifies the L1 regularization coefficient.
-          L1_c: 1.0
+          L1_c: 0.1
 
           # This is the value given to sklearn_crfsuite.CRF tagger before training.
           # Specifies the L2 regularization coefficient.
-          L2_c: 1e-3
+          L2_c: 0.1
 
 .. _section_pipeline_duckling:
 

diff --git a/rasa_nlu/config.py b/rasa_nlu/config.py
@@ -3,14 +3,15 @@
 from __future__ import print_function
 from __future__ import unicode_literals
 
+import copy
 import logging
 import os
 
 import six
 import yaml
 from builtins import object
 # Describes where to search for the config file if no location is specified
-from typing import Text
+from typing import Text, Optional, Dict, Any, List
 
 from rasa_nlu import utils
 from rasa_nlu.utils import json_to_string
@@ -23,7 +24,6 @@
     "data": None,
 }
 
-
 logger = logging.getLogger(__name__)
 
 
@@ -53,20 +53,42 @@ def load(filename=None, **kwargs):
         return RasaNLUModelConfig(kwargs)
 
 
-def override_defaults(defaults, custom):
-    cfg = defaults or {}
+def override_defaults(
+        defaults,  # type: Optional[Dict[Text, Any]]
+        custom  # type: Optional[Dict[Text, Any]]
+):
+    # type: (...) -> Dict[Text, Any]
+    if defaults:
+        cfg = copy.deepcopy(defaults)
+    else:
+        cfg = {}
+
     if custom:
         cfg.update(custom)
     return cfg
 
 
 def make_path_absolute(path):
+    # type: (Text) -> Text
     if path and not os.path.isabs(path):
         return os.path.join(os.getcwd(), path)
     else:
         return path
 
 
+def component_config_from_pipeline(
+        name,  # type: Text
+        pipeline,  # type: List[Dict[Text, Any]]
+        defaults=None  # type: Optional[Dict[Text, Any]]
+):
+    # type: (...) -> Dict[Text, Any]
+    for c in pipeline:
+        if c.get("name") == name:
+            return override_defaults(defaults, c)
+    else:
+        return override_defaults(defaults, {})
+
+
 class RasaNLUModelConfig(object):
     DEFAULT_PROJECT_NAME = "default"
 
@@ -134,11 +156,7 @@ def view(self):
         return json_to_string(self.__dict__, indent=4)
 
     def for_component(self, name, defaults=None):
-        for c in self.pipeline:
-            if c.get("name") == name:
-                return override_defaults(defaults, c)
-        else:
-            return defaults or {}
+        return component_config_from_pipeline(name, self.pipeline, defaults)
 
     @property
     def component_names(self):