3.x port 10482 #10483

Status: Draft · wants to merge 12 commits into base: main
.gitignore (1 change: 1 addition & 0 deletions)

@@ -88,3 +88,4 @@ rasa/keys
 /results/
 # Local Netlify folder
 .netlify
+.venv
rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py

@@ -43,6 +43,7 @@
 BEGIN_OF_SENTENCE = "BOS"

 FEATURES = "features"
+CASE_SENSITIVITY = "prefix_suffix_case_sensitive"


 @DefaultV1Recipe.register(
@@ -79,7 +80,7 @@ class LexicalSyntacticFeaturizer(SparseFeaturizer, GraphComponent):

     # NOTE: "suffix5" of the token "is" will be "is". Hence, when combining multiple
     # prefixes, short words will be represented/encoded repeatedly.
-    _FUNCTION_DICT: Dict[Text, Callable[[Token], Union[Text, bool, None]]] = {
+    _FUNCTION_DICT_DEFAULT: Dict[Text, Callable[[Token], Union[Text, bool, None]]] = {
         "low": lambda token: token.text.islower(),
         "title": lambda token: token.text.istitle(),
         "prefix5": lambda token: token.text[:5],
@@ -95,14 +96,20 @@ class LexicalSyntacticFeaturizer(SparseFeaturizer, GraphComponent):
         "upper": lambda token: token.text.isupper(),
         "digit": lambda token: token.text.isdigit(),
     }

+    _FUNCTION_DICT_LOWER: Dict[Text, Callable[[Token], Union[bool, Text, None]]] = {
+        "prefix5": lambda token: token.text[:5].lower(),
+        "prefix2": lambda token: token.text[:2].lower(),
+        "suffix5": lambda token: token.text[-5:].lower(),
+        "suffix3": lambda token: token.text[-3:].lower(),
+        "suffix2": lambda token: token.text[-2:].lower(),
+        "suffix1": lambda token: token.text[-1:].lower(),
+    }
     SUPPORTED_FEATURES = sorted(
-        set(_FUNCTION_DICT.keys()).union([END_OF_SENTENCE, BEGIN_OF_SENTENCE])
+        set(_FUNCTION_DICT_DEFAULT.keys()).union([END_OF_SENTENCE, BEGIN_OF_SENTENCE])
     )

-    @classmethod
     def _extract_raw_features_from_token(
-        cls, feature_name: Text, token: Token, token_position: int, num_tokens: int,
+        self, feature_name: Text, token: Token, token_position: int, num_tokens: int,
     ) -> Text:
         """Extracts a raw feature from the token at the given position.

@@ -114,7 +121,7 @@ def _extract_raw_features_from_token(
         Returns:
             the raw feature value as text
         """
-        if feature_name not in cls.SUPPORTED_FEATURES:
+        if feature_name not in self.SUPPORTED_FEATURES:
             raise InvalidConfigException(
                 f"Configured feature '{feature_name}' not valid. Please check "
                 f"'{DOCS_URL_COMPONENTS}' for valid configuration parameters."
@@ -123,7 +130,7 @@ def _extract_raw_features_from_token(
             return str(token_position == num_tokens - 1)
         if feature_name == BEGIN_OF_SENTENCE:
             return str(token_position == 0)
-        return str(cls._FUNCTION_DICT[feature_name](token))
+        return str(self._function_dict[feature_name](token))

     @classmethod
     def required_components(cls) -> List[Type]:
@@ -140,6 +147,7 @@ def get_default_config() -> Dict[Text, Any]:
                 ["BOS", "EOS", "low", "upper", "title", "digit"],
                 ["low", "title", "upper"],
             ],
+            CASE_SENSITIVITY: True,
         }

     def __init__(
@@ -161,6 +169,13 @@ def __init__(
         self._set_feature_to_idx_dict(
             feature_to_idx_dict or {}, check_consistency_with_config=True
         )
+        self._function_dict = self._FUNCTION_DICT_DEFAULT.copy()
+        self.case_sensitive = self._config.get(CASE_SENSITIVITY, True)
+        if not self.case_sensitive:
+            self._function_dict.update(self._FUNCTION_DICT_LOWER)

     @classmethod
     def validate_config(cls, config: Dict[Text, Any]) -> None:
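
For illustration, here is a minimal standalone sketch (simplified stand-ins, not part of the diff) of how the lookup table is assembled in __init__ above: the lowered variants replace only the prefix/suffix entries, while features such as "low" keep their original casing logic.

    # Simplified stand-ins for _FUNCTION_DICT_DEFAULT and _FUNCTION_DICT_LOWER.
    defaults = {
        "low": lambda t: t.islower(),   # not affected by the new option
        "prefix2": lambda t: t[:2],     # case sensitive by default
    }
    lowered = {
        "prefix2": lambda t: t[:2].lower(),
    }

    function_dict = defaults.copy()
    case_sensitive = False              # i.e. prefix_suffix_case_sensitive: False
    if not case_sensitive:
        function_dict.update(lowered)   # only prefix/suffix entries are swapped out

    # All spellings now yield the same prefix feature ...
    assert function_dict["prefix2"]("Goodbye") == function_dict["prefix2"]("goodbye")
    # ... while non-affix features still see the original casing.
    assert function_dict["low"]("Goodbye") is False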
tests/nlu/featurizers/test_lexical_syntactic_featurizer.py (57 changes: 57 additions & 0 deletions)

@@ -142,6 +142,62 @@ def test_feature_computation(
     assert np.all(feature.features.todense()[0] == expected_features)


+@pytest.mark.parametrize(
+    "sentence, expected_features",
+    [
+        (
+            "goodbye Goodbye GOODBYE gOoDbyE",
+            [
+                [1.0, 1.0],  # check that all
+                [1.0, 1.0],  # spellings of
+                [1.0, 1.0],  # "goodbye" are
+                [1.0, 1.0],  # featurized the same
+            ],
+        ),
+        (
+            "a A",
+            [
+                [1.0, 1.0],  # is "A" featurized
+                [1.0, 1.0],  # the same as "a"?
+            ],
+        ),
+    ],
+)
+def test_text_featurizer_case_insensitive(
+    create_lexical_syntactic_featurizer: Callable[
+        [Dict[Text, Any]], LexicalSyntacticFeaturizer
+    ],
+    sentence,
+    expected_features,
+):
+    featurizer = create_lexical_syntactic_featurizer(
+        {
+            "alias": "lsf",
+            "features": [[], ["prefix2", "suffix2"], []],
+            "prefix_suffix_case_sensitive": False,
+        }
+    )

+    # build the message
+    tokens = [
+        Token(text=match[0], start=match.start())
+        for match in re.finditer(r"\w+", sentence)
+    ]
+    message = Message(data={TOKENS_NAMES[TEXT]: tokens})
+
+    featurizer.train(TrainingData([message]))
+    featurizer.process([message])
+
+    seq_vec, sen_vec = message.get_sparse_features(TEXT, [])
+    if seq_vec:
+        seq_vec = seq_vec.features
+    if sen_vec:
+        sen_vec = sen_vec.features
+
+    assert sen_vec is None
+    assert np.all(seq_vec.toarray() == expected_features)
+
+    # check pairwise equality of the rows, i.e. that every spelling
+    # received exactly the same feature vector
+    for i in range(seq_vec.shape[0] - 1):
+        assert np.all(seq_vec.toarray()[i] == seq_vec.toarray()[i + 1])
+
+
 def test_features_for_messages_with_missing_part_of_speech_tags(
     create_lexical_syntactic_featurizer: Callable[
         [Dict[Text, Any]], LexicalSyntacticFeaturizer
@@ -324,3 +380,4 @@ def test_warn_if_part_of_speech_features_cannot_be_computed(
     assert len(message.features) == 1
     feature = message.features[0]
     assert np.all(feature.features.todense() == expected_features)
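
As a usage note, a rough sketch (assumed for illustration, not taken from this PR) of the component config a user would pass to enable the new behavior; the keys mirror the dict used in the test above, and per get_default_config the option defaults to True, preserving the old case-sensitive behavior:

    # Hypothetical component config enabling case-insensitive prefix/suffix features.
    config = {
        "features": [[], ["prefix2", "suffix2"], []],
        "prefix_suffix_case_sensitive": False,  # defaults to True
    }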