Skip to content

Commit

Permalink
Add pattern feature to default crf extractor
Browse files (browse the repository at this point in the history)
  • Loading branch information
tmbo committed Jul 6, 2017
1 parent 5870081 commit 2e16ad2
Show file tree
Hide file tree
Showing 5 changed files with 29 additions and 17 deletions.
8 changes: 4 additions & 4 deletions _pytest/test_extractors.py
Expand Up @@ -25,11 +25,11 @@ def test_crf_extractor(spacy_nlp):
ext.train(TrainingData(training_examples=examples), config)
sentence = 'anywhere in the west'
crf_format = ext._from_text_to_crf(Message(sentence, {"spacy_doc": spacy_nlp(sentence)}))
assert ([word[0] for word in crf_format] == ['anywhere', 'in', 'the', 'west'])
assert [word[0] for word in crf_format] == ['anywhere', 'in', 'the', 'west']
feats = ext._sentence_to_features(crf_format)
assert ('BOS' in feats[0])
assert ('EOS' in feats[-1])
assert ('0:low:in' in feats[1])
assert 'BOS' in feats[0]
assert 'EOS' in feats[-1]
assert feats[1]['0:low'] == "in"
sentence = 'anywhere in the west'
ext.extract_entities(Message(sentence, {"spacy_doc": spacy_nlp(sentence)}))

Expand Down
2 changes: 1 addition & 1 deletion config_defaults.json
Expand Up @@ -20,6 +20,6 @@
"entity_crf_BILOU_flag": true,
"entity_crf_features": [
["low", "title", "upper", "pos", "pos2"],
["bias", "low", "word3", "word2", "upper", "title", "digit", "pos", "pos2"],
["bias", "low", "word3", "word2", "upper", "title", "digit", "pos", "pos2", "pattern"],
["low", "title", "upper", "pos", "pos2"]]
}
12 changes: 12 additions & 0 deletions data/examples/rasa/demo-rasa.json
Expand Up @@ -229,6 +229,18 @@
}
]
},
{
"text": "I am looking a restaurant in 29432",
"intent": "restaurant_search",
"entities": [
{
"start": 29,
"end": 34,
"value": "29432",
"entity": "location"
}
]
},
{
"text": "I am looking for mexican indian fusion",
"intent": "restaurant_search",
Expand Down
2 changes: 1 addition & 1 deletion rasa_nlu/config.py
Expand Up @@ -37,7 +37,7 @@
"entity_crf_BILOU_flag": True,
"entity_crf_features": [
["low", "title", "upper", "pos", "pos2"],
["bias", "low", "word3", "word2", "upper", "title", "digit", "pos", "pos2"],
["bias", "low", "word3", "word2", "upper", "title", "digit", "pos", "pos2", "pattern"],
["low", "title", "upper", "pos", "pos2"]]
}

Expand Down
22 changes: 11 additions & 11 deletions rasa_nlu/extractors/crf_entity_extractor.py
Expand Up @@ -39,15 +39,15 @@ class CRFEntityExtractor(EntityExtractor):

function_dict = {
'low': lambda doc: doc[0].lower(),
'title': lambda doc: str(doc[0].istitle()),
'title': lambda doc: doc[0].istitle(),
'word3': lambda doc: doc[0][-3:],
'word2': lambda doc: doc[0][-2:],
'pos': lambda doc: doc[1],
'pos2': lambda doc: doc[1][:2],
'bias': lambda doc: 'bias',
'upper': lambda doc: str(doc[0].isupper()),
'digit': lambda doc: str(doc[0].isdigit()),
'pattern': lambda doc: doc[2],
'upper': lambda doc: doc[0].isupper(),
'digit': lambda doc: doc[0].isdigit(),
'pattern': lambda doc: str(doc[3]) if doc[3] is not None else 'N/A',
}

def __init__(self, ent_tagger=None, entity_crf_features=None, entity_crf_BILOU_flag=True):
Expand Down Expand Up @@ -217,20 +217,20 @@ def persist(self, model_dir):
return {"entity_extractor_crf": None}

def _sentence_to_features(self, sentence):
# type: (List[Tuple[Text, Text, Text, Text]]) -> List[List[Text]]
# type: (List[Tuple[Text, Text, Text, Text]]) -> List[Dict[Text, Any]]
"""Convert a word into discrete features in self.crf_features, including word before and word after."""

sentence_features = []
for word_idx in range(len(sentence)):
# word before(-1), current word(0), next word(+1)
prefixes = ['-1', '0', '+1']
word_features = []
word_features = {}
for i in range(3):
if word_idx == len(sentence) - 1 and i == 2:
word_features.append('EOS')
word_features['EOS'] = True
# End Of Sentence
elif word_idx == 0 and i == 0:
word_features.append('BOS')
word_features['BOS'] = True
# Beginning Of Sentence
else:
word = sentence[word_idx - 1 + i]
Expand All @@ -239,7 +239,7 @@ def _sentence_to_features(self, sentence):
for feature in features:
# append each feature to a feature vector
# word_features.append(prefix + feature + ':' + self.function_dict[feature](word))
word_features.append(':'.join((prefix, feature, self.function_dict[feature](word))))
word_features[prefix + ":" + feature] = self.function_dict[feature](word)
sentence_features.append(word_features)
return sentence_features

Expand Down Expand Up @@ -272,9 +272,9 @@ def _from_json_to_crf(self, message, entity_offsets):

def __pattern_of_token(self, message, i):
if message.get("tokens"):
return str(message.get("tokens")[i].get("pattern", "N/A"))
return message.get("tokens")[i].get("pattern")
else:
return "N/A"
return None

def _from_text_to_crf(self, message, entities=None):
# type: (Message, List[Text]) -> List[Tuple[Text, Text, Text, Text]]
Expand Down

0 comments on commit 2e16ad2

Please sign in to comment.