From 2e16ad2a957e756967fd9d5a2b9917f014713854 Mon Sep 17 00:00:00 2001
From: Tom Bocklisch
Date: Thu, 6 Jul 2017 14:33:00 +0200
Subject: [PATCH] Add pattern feature to default crf extractor

---
 _pytest/test_extractors.py                  |  8 ++++----
 config_defaults.json                        |  2 +-
 data/examples/rasa/demo-rasa.json           | 12 +++++++++++
 rasa_nlu/config.py                          |  2 +-
 rasa_nlu/extractors/crf_entity_extractor.py | 22 ++++++++++-----------
 5 files changed, 29 insertions(+), 17 deletions(-)

diff --git a/_pytest/test_extractors.py b/_pytest/test_extractors.py
index e8e41b90971f..7d099f67a9fb 100644
--- a/_pytest/test_extractors.py
+++ b/_pytest/test_extractors.py
@@ -25,11 +25,11 @@ def test_crf_extractor(spacy_nlp):
     ext.train(TrainingData(training_examples=examples), config)
     sentence = 'anywhere in the west'
     crf_format = ext._from_text_to_crf(Message(sentence, {"spacy_doc": spacy_nlp(sentence)}))
-    assert ([word[0] for word in crf_format] == ['anywhere', 'in', 'the', 'west'])
+    assert [word[0] for word in crf_format] == ['anywhere', 'in', 'the', 'west']
     feats = ext._sentence_to_features(crf_format)
-    assert ('BOS' in feats[0])
-    assert ('EOS' in feats[-1])
-    assert ('0:low:in' in feats[1])
+    assert 'BOS' in feats[0]
+    assert 'EOS' in feats[-1]
+    assert feats[1]['0:low'] == "in"
     sentence = 'anywhere in the west'
     ext.extract_entities(Message(sentence, {"spacy_doc": spacy_nlp(sentence)}))
 
diff --git a/config_defaults.json b/config_defaults.json
index 4cbfda5fd3dc..920829a11ed5 100644
--- a/config_defaults.json
+++ b/config_defaults.json
@@ -20,6 +20,6 @@
   "entity_crf_BILOU_flag": true,
   "entity_crf_features": [
     ["low", "title", "upper", "pos", "pos2"],
-    ["bias", "low", "word3", "word2", "upper", "title", "digit", "pos", "pos2"],
+    ["bias", "low", "word3", "word2", "upper", "title", "digit", "pos", "pos2", "pattern"],
     ["low", "title", "upper", "pos", "pos2"]]
 }
diff --git a/data/examples/rasa/demo-rasa.json b/data/examples/rasa/demo-rasa.json
index ee3305e95caf..01330e98c563 100644
--- a/data/examples/rasa/demo-rasa.json
+++ b/data/examples/rasa/demo-rasa.json
@@ -229,6 +229,18 @@
           }
         ]
       },
+      {
+        "text": "I am looking a restaurant in 29432",
+        "intent": "restaurant_search",
+        "entities": [
+          {
+            "start": 29,
+            "end": 34,
+            "value": "29432",
+            "entity": "location"
+          }
+        ]
+      },
       {
         "text": "I am looking for mexican indian fusion",
         "intent": "restaurant_search",
diff --git a/rasa_nlu/config.py b/rasa_nlu/config.py
index aa3295601586..ef3b8f345c18 100644
--- a/rasa_nlu/config.py
+++ b/rasa_nlu/config.py
@@ -37,7 +37,7 @@
     "entity_crf_BILOU_flag": True,
     "entity_crf_features": [
         ["low", "title", "upper", "pos", "pos2"],
-        ["bias", "low", "word3", "word2", "upper", "title", "digit", "pos", "pos2"],
+        ["bias", "low", "word3", "word2", "upper", "title", "digit", "pos", "pos2", "pattern"],
         ["low", "title", "upper", "pos", "pos2"]]
 }
 
diff --git a/rasa_nlu/extractors/crf_entity_extractor.py b/rasa_nlu/extractors/crf_entity_extractor.py
index 8a6a52ad6ade..af519d399a0c 100644
--- a/rasa_nlu/extractors/crf_entity_extractor.py
+++ b/rasa_nlu/extractors/crf_entity_extractor.py
@@ -39,15 +39,15 @@ class CRFEntityExtractor(EntityExtractor):
 
     function_dict = {
         'low': lambda doc: doc[0].lower(),
-        'title': lambda doc: str(doc[0].istitle()),
+        'title': lambda doc: doc[0].istitle(),
         'word3': lambda doc: doc[0][-3:],
         'word2': lambda doc: doc[0][-2:],
         'pos': lambda doc: doc[1],
         'pos2': lambda doc: doc[1][:2],
         'bias': lambda doc: 'bias',
-        'upper': lambda doc: str(doc[0].isupper()),
-        'digit': lambda doc: str(doc[0].isdigit()),
-        'pattern': lambda doc: doc[2],
+        'upper': lambda doc: doc[0].isupper(),
+        'digit': lambda doc: doc[0].isdigit(),
+        'pattern': lambda doc: str(doc[3]) if doc[3] is not None else 'N/A',
     }
 
     def __init__(self, ent_tagger=None, entity_crf_features=None, entity_crf_BILOU_flag=True):
@@ -217,20 +217,20 @@ def persist(self, model_dir):
         return {"entity_extractor_crf": None}
 
     def _sentence_to_features(self, sentence):
-        # type: (List[Tuple[Text, Text, Text, Text]]) -> List[List[Text]]
+        # type: (List[Tuple[Text, Text, Text, Text]]) -> List[Dict[Text, Any]]
         """Convert a word into discrete features in self.crf_features, including word before and word after."""
 
         sentence_features = []
         for word_idx in range(len(sentence)):
             # word before(-1), current word(0), next word(+1)
             prefixes = ['-1', '0', '+1']
-            word_features = []
+            word_features = {}
             for i in range(3):
                 if word_idx == len(sentence) - 1 and i == 2:
-                    word_features.append('EOS')
+                    word_features['EOS'] = True
                     # End Of Sentence
                 elif word_idx == 0 and i == 0:
-                    word_features.append('BOS')
+                    word_features['BOS'] = True
                     # Beginning Of Sentence
                 else:
                     word = sentence[word_idx - 1 + i]
@@ -239,7 +239,7 @@ def _sentence_to_features(self, sentence):
                     for feature in features:
                         # append each feature to a feature vector
                         # word_features.append(prefix + feature + ':' + self.function_dict[feature](word))
-                        word_features.append(':'.join((prefix, feature, self.function_dict[feature](word))))
+                        word_features[prefix + ":" + feature] = self.function_dict[feature](word)
             sentence_features.append(word_features)
         return sentence_features
 
@@ -272,9 +272,9 @@ def _from_json_to_crf(self, message, entity_offsets):
 
     def __pattern_of_token(self, message, i):
         if message.get("tokens"):
-            return str(message.get("tokens")[i].get("pattern", "N/A"))
+            return message.get("tokens")[i].get("pattern")
         else:
-            return "N/A"
+            return None
 
     def _from_text_to_crf(self, message, entities=None):
         # type: (Message, List[Text]) -> List[Tuple[Text, Text, Text, Text]]