From 2e16ad2a957e756967fd9d5a2b9917f014713854 Mon Sep 17 00:00:00 2001
From: Tom Bocklisch
Date: Thu, 6 Jul 2017 14:33:00 +0200
Subject: [PATCH] Add pattern feature to default crf extractor

---
 _pytest/test_extractors.py                  |  8 ++++----
 config_defaults.json                        |  2 +-
 data/examples/rasa/demo-rasa.json           | 12 +++++++++++
 rasa_nlu/config.py                          |  2 +-
 rasa_nlu/extractors/crf_entity_extractor.py | 22 ++++++++++-----------
 5 files changed, 29 insertions(+), 17 deletions(-)

diff --git a/_pytest/test_extractors.py b/_pytest/test_extractors.py
index e8e41b90971f..7d099f67a9fb 100644
--- a/_pytest/test_extractors.py
+++ b/_pytest/test_extractors.py
@@ -25,11 +25,11 @@ def test_crf_extractor(spacy_nlp):
     ext.train(TrainingData(training_examples=examples), config)
     sentence = 'anywhere in the west'
     crf_format = ext._from_text_to_crf(Message(sentence, {"spacy_doc": spacy_nlp(sentence)}))
-    assert ([word[0] for word in crf_format] == ['anywhere', 'in', 'the', 'west'])
+    assert [word[0] for word in crf_format] == ['anywhere', 'in', 'the', 'west']
     feats = ext._sentence_to_features(crf_format)
-    assert ('BOS' in feats[0])
-    assert ('EOS' in feats[-1])
-    assert ('0:low:in' in feats[1])
+    assert 'BOS' in feats[0]
+    assert 'EOS' in feats[-1]
+    assert feats[1]['0:low'] == "in"
     sentence = 'anywhere in the west'
     ext.extract_entities(Message(sentence, {"spacy_doc": spacy_nlp(sentence)}))
 
diff --git a/config_defaults.json b/config_defaults.json
index 4cbfda5fd3dc..920829a11ed5 100644
--- a/config_defaults.json
+++ b/config_defaults.json
@@ -20,6 +20,6 @@
   "entity_crf_BILOU_flag": true,
   "entity_crf_features": [
     ["low", "title", "upper", "pos", "pos2"],
-    ["bias", "low", "word3", "word2", "upper", "title", "digit", "pos", "pos2"],
+    ["bias", "low", "word3", "word2", "upper", "title", "digit", "pos", "pos2", "pattern"],
     ["low", "title", "upper", "pos", "pos2"]]
 }
diff --git a/data/examples/rasa/demo-rasa.json b/data/examples/rasa/demo-rasa.json
index ee3305e95caf..01330e98c563 100644
--- a/data/examples/rasa/demo-rasa.json
+++ b/data/examples/rasa/demo-rasa.json
@@ -229,6 +229,18 @@
           }
         ]
       },
+      {
+        "text": "I am looking a restaurant in 29432",
+        "intent": "restaurant_search",
+        "entities": [
+          {
+            "start": 29,
+            "end": 34,
+            "value": "29432",
+            "entity": "location"
+          }
+        ]
+      },
       {
         "text": "I am looking for mexican indian fusion",
         "intent": "restaurant_search",
diff --git a/rasa_nlu/config.py b/rasa_nlu/config.py
index aa3295601586..ef3b8f345c18 100644
--- a/rasa_nlu/config.py
+++ b/rasa_nlu/config.py
@@ -37,7 +37,7 @@
     "entity_crf_BILOU_flag": True,
     "entity_crf_features": [
         ["low", "title", "upper", "pos", "pos2"],
-        ["bias", "low", "word3", "word2", "upper", "title", "digit", "pos", "pos2"],
+        ["bias", "low", "word3", "word2", "upper", "title", "digit", "pos", "pos2", "pattern"],
         ["low", "title", "upper", "pos", "pos2"]]
 }
 
diff --git a/rasa_nlu/extractors/crf_entity_extractor.py b/rasa_nlu/extractors/crf_entity_extractor.py
index 8a6a52ad6ade..af519d399a0c 100644
--- a/rasa_nlu/extractors/crf_entity_extractor.py
+++ b/rasa_nlu/extractors/crf_entity_extractor.py
@@ -39,15 +39,15 @@ class CRFEntityExtractor(EntityExtractor):
 
     function_dict = {
         'low': lambda doc: doc[0].lower(),
-        'title': lambda doc: str(doc[0].istitle()),
+        'title': lambda doc: doc[0].istitle(),
         'word3': lambda doc: doc[0][-3:],
         'word2': lambda doc: doc[0][-2:],
         'pos': lambda doc: doc[1],
         'pos2': lambda doc: doc[1][:2],
         'bias': lambda doc: 'bias',
-        'upper': lambda doc: str(doc[0].isupper()),
-        'digit': lambda doc: str(doc[0].isdigit()),
-        'pattern': lambda doc: doc[2],
+        'upper': lambda doc: doc[0].isupper(),
+        'digit': lambda doc: doc[0].isdigit(),
+        'pattern': lambda doc: str(doc[3]) if doc[3] is not None else 'N/A',
     }
 
     def __init__(self, ent_tagger=None, entity_crf_features=None, entity_crf_BILOU_flag=True):
@@ -217,20 +217,20 @@ def persist(self, model_dir):
         return {"entity_extractor_crf": None}
 
     def _sentence_to_features(self, sentence):
-        # type: (List[Tuple[Text, Text, Text, Text]]) -> List[List[Text]]
+        # type: (List[Tuple[Text, Text, Text, Text]]) -> List[Dict[Text, Any]]
         """Convert a word into discrete features in self.crf_features, including word before and word after."""
 
         sentence_features = []
         for word_idx in range(len(sentence)):
             # word before(-1), current word(0), next word(+1)
             prefixes = ['-1', '0', '+1']
-            word_features = []
+            word_features = {}
             for i in range(3):
                 if word_idx == len(sentence) - 1 and i == 2:
-                    word_features.append('EOS')
+                    word_features['EOS'] = True
                     # End Of Sentence
                 elif word_idx == 0 and i == 0:
-                    word_features.append('BOS')
+                    word_features['BOS'] = True
                     # Beginning Of Sentence
                 else:
                     word = sentence[word_idx - 1 + i]
@@ -239,7 +239,7 @@ def _sentence_to_features(self, sentence):
                     for feature in features:
                         # append each feature to a feature vector
                         # word_features.append(prefix + feature + ':' + self.function_dict[feature](word))
-                        word_features.append(':'.join((prefix, feature, self.function_dict[feature](word))))
+                        word_features[prefix + ":" + feature] = self.function_dict[feature](word)
             sentence_features.append(word_features)
         return sentence_features
 
@@ -272,9 +272,9 @@ def _from_json_to_crf(self, message, entity_offsets):
 
     def __pattern_of_token(self, message, i):
         if message.get("tokens"):
-            return str(message.get("tokens")[i].get("pattern", "N/A"))
+            return message.get("tokens")[i].get("pattern")
         else:
-            return "N/A"
+            return None
 
     def _from_text_to_crf(self, message, entities=None):
         # type: (Message, List[Text]) -> List[Tuple[Text, Text, Text, Text]]