diff --git a/CHANGELOG.rst b/CHANGELOG.rst index e244a64f74ea..beb4d0379263 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -43,9 +43,11 @@ Removed Fixed ----- +- ``RegexFeaturizer`` detects all regex matches in the user message (not just the first) - do_extractors_support_overlap now correctly throws an exception only if no extractors are passed or if extractors that do not support overlapping entities are used. + [0.14.3] - 2019-02-01 ^^^^^^^^^^^^^^^^^^^^^ - diff --git a/rasa_nlu/featurizers/regex_featurizer.py b/rasa_nlu/featurizers/regex_featurizer.py index 9ea03e832655..143e866c45e1 100644 --- a/rasa_nlu/featurizers/regex_featurizer.py +++ b/rasa_nlu/featurizers/regex_featurizer.py @@ -71,22 +71,23 @@ def features_for_patterns(self, message): message is tokenized, the function will mark all tokens with a dict relating the name of the regex to whether it was matched.""" - matches = [] - for i, exp in enumerate(self.known_patterns): - match = re.search(exp["pattern"], message.text) - matches.append(match) + found_patterns = [] + for exp in self.known_patterns: + matches = re.finditer(exp["pattern"], message.text) + matches = list(matches) + found_patterns.append(False) for token_index, t in enumerate(message.get("tokens", [])): patterns = t.get("pattern", default={}) - if match is not None: + patterns[exp["name"]] = False + + for match in matches: if t.offset < match.end() and t.end > match.start(): patterns[exp["name"]] = True - else: - patterns[exp["name"]] = False - else: - patterns[exp["name"]] = False + found_patterns[-1] = True + t.set("pattern", patterns) - found = [1.0 if m is not None else 0.0 for m in matches] - return np.array(found) + + return np.array(found_patterns).astype(float) def _generate_lookup_regex(self, lookup_table): """creates a regex out of the contents of a lookup table file"""