3.x port 10482 #10483

Status: Draft · wants to merge 12 commits into base: main
.gitignore (1 change: 1 addition & 0 deletions)

@@ -88,3 +88,4 @@ rasa/keys
 /results/
 # Local Netlify folder
 .netlify
+.venv
rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py

@@ -43,6 +43,7 @@
 BEGIN_OF_SENTENCE = "BOS"

 FEATURES = "features"
+CASE_SENSITIVITY = "prefix_suffix_case_sensitive"


 @DefaultV1Recipe.register(
@@ -79,7 +80,7 @@ class LexicalSyntacticFeaturizer(SparseFeaturizer, GraphComponent):

     # NOTE: "suffix5" of the token "is" will be "is". Hence, when combining multiple
     # prefixes, short words will be represented/encoded repeatedly.
-    _FUNCTION_DICT: Dict[Text, Callable[[Token], Union[Text, bool, None]]] = {
+    _FUNCTION_DICT_DEFAULT: Dict[Text, Callable[[Token], Union[Text, bool, None]]] = {
         "low": lambda token: token.text.islower(),
         "title": lambda token: token.text.istitle(),
         "prefix5": lambda token: token.text[:5],
@@ -95,14 +96,20 @@ class LexicalSyntacticFeaturizer(SparseFeaturizer, GraphComponent):
         "upper": lambda token: token.text.isupper(),
         "digit": lambda token: token.text.isdigit(),
     }

+    _FUNCTION_DICT_LOWER: Dict[Text, Callable[[Token], Union[bool, Text, None]]] = {
+        "prefix5": lambda token: token.text[:5].lower(),
+        "prefix2": lambda token: token.text[:2].lower(),
+        "suffix5": lambda token: token.text[-5:].lower(),
+        "suffix3": lambda token: token.text[-3:].lower(),
+        "suffix2": lambda token: token.text[-2:].lower(),
+        "suffix1": lambda token: token.text[-1:].lower(),
+    }
     SUPPORTED_FEATURES = sorted(
-        set(_FUNCTION_DICT.keys()).union([END_OF_SENTENCE, BEGIN_OF_SENTENCE])
+        set(_FUNCTION_DICT_DEFAULT.keys()).union([END_OF_SENTENCE, BEGIN_OF_SENTENCE])
     )

-    @classmethod
     def _extract_raw_features_from_token(
-        cls, feature_name: Text, token: Token, token_position: int, num_tokens: int,
+        self, feature_name: Text, token: Token, token_position: int, num_tokens: int,
     ) -> Text:
         """Extracts a raw feature from the token at the given position.

@@ -114,7 +121,7 @@ def _extract_raw_features_from_token(
         Returns:
             the raw feature value as text
         """
-        if feature_name not in cls.SUPPORTED_FEATURES:
+        if feature_name not in self.SUPPORTED_FEATURES:
             raise InvalidConfigException(
                 f"Configured feature '{feature_name}' not valid. Please check "
                 f"'{DOCS_URL_COMPONENTS}' for valid configuration parameters."
@@ -123,7 +130,7 @@ def _extract_raw_features_from_token(
             return str(token_position == num_tokens - 1)
         if feature_name == BEGIN_OF_SENTENCE:
             return str(token_position == 0)
-        return str(cls._FUNCTION_DICT[feature_name](token))
+        return str(self._function_dict[feature_name](token))

     @classmethod
     def required_components(cls) -> List[Type]:
@@ -140,6 +147,7 @@ def get_default_config() -> Dict[Text, Any]:
                 ["BOS", "EOS", "low", "upper", "title", "digit"],
                 ["low", "title", "upper"],
             ],
+            CASE_SENSITIVITY: True,
         }

     def __init__(
@@ -161,6 +169,13 @@ def __init__(
         self._set_feature_to_idx_dict(
             feature_to_idx_dict or {}, check_consistency_with_config=True
         )
+        self._function_dict = self._FUNCTION_DICT_DEFAULT.copy()
+        self.case_sensitive = self._config.get(CASE_SENSITIVITY, True)
+        if not self.case_sensitive:
+            self._function_dict.update(self._FUNCTION_DICT_LOWER)

     @classmethod
     def validate_config(cls, config: Dict[Text, Any]) -> None:
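
For illustration, here is a minimal standalone sketch (simplified stand-ins, not part of the diff) of how the lookup table is assembled in __init__ above: the lowered variants replace only the prefix/suffix entries, while features such as "low" keep their original casing logic.

    # Simplified stand-ins for _FUNCTION_DICT_DEFAULT and _FUNCTION_DICT_LOWER.
    defaults = {
        "low": lambda t: t.islower(),   # not affected by the new option
        "prefix2": lambda t: t[:2],     # case sensitive by default
    }
    lowered = {
        "prefix2": lambda t: t[:2].lower(),
    }

    function_dict = defaults.copy()
    case_sensitive = False              # i.e. prefix_suffix_case_sensitive: False
    if not case_sensitive:
        function_dict.update(lowered)   # only prefix/suffix entries are swapped out

    # All spellings now yield the same prefix feature ...
    assert function_dict["prefix2"]("Goodbye") == function_dict["prefix2"]("goodbye")
    # ... while non-affix features still see the original casing.
    assert function_dict["low"]("Goodbye") is False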
tests/nlu/featurizers/test_lexical_syntactic_featurizer.py (57 changes: 57 additions & 0 deletions)

@@ -142,6 +142,62 @@ def test_feature_computation(
     assert np.all(feature.features.todense()[0] == expected_features)


+@pytest.mark.parametrize(
+    "sentence, expected_features",
+    [
+        (
+            "goodbye Goodbye GOODBYE gOoDbyE",
+            [
+                [1.0, 1.0],  # check that all
+                [1.0, 1.0],  # spellings of
+                [1.0, 1.0],  # "goodbye" are
+                [1.0, 1.0],  # featurized the same
+            ],
+        ),
+        (
+            "a A",
+            [
+                [1.0, 1.0],  # is "A" featurized
+                [1.0, 1.0],  # the same as "a"?
+            ],
+        ),
+    ],
+)
+def test_text_featurizer_case_insensitive(
+    create_lexical_syntactic_featurizer: Callable[
+        [Dict[Text, Any]], LexicalSyntacticFeaturizer
+    ],
+    sentence,
+    expected_features,
+):
+    featurizer = create_lexical_syntactic_featurizer(
+        {
+            "alias": "lsf",
+            "features": [[], ["prefix2", "suffix2"], []],
+            "prefix_suffix_case_sensitive": False,
+        }
+    )

+    # build the message
+    tokens = [
+        Token(text=match[0], start=match.start())
+        for match in re.finditer(r"\w+", sentence)
+    ]
+    message = Message(data={TOKENS_NAMES[TEXT]: tokens})
+
+    featurizer.train(TrainingData([message]))
+    featurizer.process([message])
+
+    seq_vec, sen_vec = message.get_sparse_features(TEXT, [])
+    if seq_vec:
+        seq_vec = seq_vec.features
+    if sen_vec:
+        sen_vec = sen_vec.features
+
+    assert sen_vec is None
+    assert np.all(seq_vec.toarray() == expected_features)
+
+    # check pairwise equality of the rows, i.e. that every spelling
+    # received exactly the same feature vector
+    for i in range(seq_vec.shape[0] - 1):
+        assert np.all(seq_vec.toarray()[i] == seq_vec.toarray()[i + 1])
+
+
 def test_features_for_messages_with_missing_part_of_speech_tags(
     create_lexical_syntactic_featurizer: Callable[
         [Dict[Text, Any]], LexicalSyntacticFeaturizer
@@ -324,3 +380,4 @@ def test_warn_if_part_of_speech_features_cannot_be_computed(
     assert len(message.features) == 1
     feature = message.features[0]
     assert np.all(feature.features.todense() == expected_features)
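
As a usage note, a rough sketch (assumed for illustration, not taken from this PR) of the component config a user would pass to enable the new behavior; the keys mirror the dict used in the test above, and per get_default_config the option defaults to True, preserving the old case-sensitive behavior:

    # Hypothetical component config enabling case-insensitive prefix/suffix features.
    config = {
        "features": [[], ["prefix2", "suffix2"], []],
        "prefix_suffix_case_sensitive": False,  # defaults to True
    }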