Merge pull request #1090 from howl-anderson/feature/custom_dictionary_for_jieba_tokenizer

New feature: JiebaTokenizer will load custom dictionary from config
tmbo committed Jul 9, 2018
2 parents 153c2c5 + 6e890e3 commit cc5318a
Showing 4 changed files with 91 additions and 0 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.rst
@@ -17,6 +17,7 @@ This project adheres to `Semantic Versioning`_ starting with version 0.7.0.

Added
-----
- support for ``tokenizer_jieba`` to load a custom dictionary from the config
- allow pure json including pipeline configuration on train endpoint
- doc link to a community contribution for Rasa NLU in Chinese
- support for the ``count_vectors_featurizer`` component to use the ``tokens`` feature provided by the tokenizer
2 changes: 2 additions & 0 deletions docs/pipeline.rst
@@ -580,11 +580,13 @@ tokenizer_jieba
``tokenizer_whitespace``. Can be used to define tokens for the
MITIE entity extractor. Make sure to install Jieba, ``pip install jieba``.
:Configuration:
Custom dictionary files can be loaded automatically by specifying the directory that contains them via ``dictionary_path``.

.. code-block:: yaml

    pipeline:
    - name: "tokenizer_jieba"
      dictionary_path: "path/to/custom/dictionary/dir"  # default is None, i.e. no custom dictionary
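
Each file in that directory is loaded as a jieba user dictionary. Following jieba's documented userdict format, each line holds a word, an optional frequency, and an optional part-of-speech tag, separated by spaces (the entries below are examples only):

.. code-block:: text

    云计算 5 n
    创新办 3 i
    台中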
tokenizer_mitie
~~~~~~~~~~~~~~~
69 changes: 69 additions & 0 deletions rasa_nlu/tokenizers/jieba_tokenizer.py
@@ -5,6 +5,8 @@

import glob
import logging
import os
import shutil

from rasa_nlu.components import Component
from rasa_nlu.config import RasaNLUModelConfig
@@ -14,6 +16,8 @@

logger = logging.getLogger(__name__)

JIEBA_CUSTOM_DICTIONARY_PATH = "tokenizer_jieba"


class JiebaTokenizer(Tokenizer, Component):
name = "tokenizer_jieba"
@@ -22,6 +26,19 @@ class JiebaTokenizer(Tokenizer, Component):

language_list = ["zh"]

defaults = {
"dictionary_path": None # default don't load custom dictionary
}

def __init__(self, component_config=None):
# type: (Dict[Text, Any]) -> None
"""Construct a new intent classifier using the MITIE framework."""

super(JiebaTokenizer, self).__init__(component_config)

# path to dictionary file or None
self.dictionary_path = self.component_config.get('dictionary_path')

@classmethod
def required_packages(cls):
# type: () -> List[Text]
@@ -56,6 +73,58 @@ def process(self, message, **kwargs):
def tokenize(self, text):
# type: (Text) -> List[Token]
import jieba

if self.dictionary_path is not None:
self.load_custom_dictionary(self.dictionary_path)

tokenized = jieba.tokenize(text)
tokens = [Token(word, start) for (word, start, end) in tokenized]
return tokens

@classmethod
def load(cls,
model_dir=None, # type: Optional[Text]
model_metadata=None, # type: Optional[Metadata]
cached_component=None, # type: Optional[Component]
**kwargs  # type: Any
):
# type: (...) -> JiebaTokenizer

meta = model_metadata.for_component(cls.name)
relative_dictionary_path = meta.get("dictionary_path")

# get real path of dictionary path, if any
if relative_dictionary_path is not None:
dictionary_path = os.path.join(model_dir, relative_dictionary_path)

meta["dictionary_path"] = dictionary_path

return cls(meta)

@staticmethod
def copy_files_dir_to_dir(input_dir, output_dir):
    # type: (Text, Text) -> None
# make sure target path exists
if not os.path.exists(output_dir):
os.makedirs(output_dir)

target_file_list = glob.glob("{}/*".format(input_dir))
for target_file in target_file_list:
shutil.copy2(target_file, output_dir)

def persist(self, model_dir):
# type: (Text) -> Optional[Dict[Text, Any]]
"""Persist this model into the passed directory."""

model_dictionary_path = None

# copy custom dictionaries to model dir, if any
if self.dictionary_path is not None:
target_dictionary_path = os.path.join(model_dir,
JIEBA_CUSTOM_DICTIONARY_PATH)
self.copy_files_dir_to_dir(self.dictionary_path,
target_dictionary_path)

# set dictionary_path of model metadata to relative path
model_dictionary_path = JIEBA_CUSTOM_DICTIONARY_PATH

return {"dictionary_path": model_dictionary_path}
19 changes: 19 additions & 0 deletions tests/base/test_tokenizers.py
@@ -4,6 +4,8 @@
from __future__ import print_function
from __future__ import unicode_literals

import mock


def test_whitespace():
from rasa_nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
@@ -78,3 +80,20 @@ def test_jieba():

assert [t.offset for t in tk.tokenize("Micheal你好吗?")] == \
[0, 7, 9, 10]


def test_jieba_load_dictionary(tmpdir_factory):
from rasa_nlu.tokenizers.jieba_tokenizer import JiebaTokenizer

dictionary_path = tmpdir_factory.mktemp("jieba_custom_dictionary").strpath

component_config = {
"dictionary_path": dictionary_path
}

with mock.patch.object(JiebaTokenizer, 'load_custom_dictionary',
return_value=None) as mock_method:
tk = JiebaTokenizer(component_config)
tk.tokenize("")

mock_method.assert_called_once_with(dictionary_path)
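
Taken together, ``persist`` and ``load`` round-trip the dictionary through the model directory: ``persist`` copies the files into ``<model_dir>/tokenizer_jieba`` and records that relative path in the metadata, and ``load`` joins it back onto ``model_dir``. A hedged usage sketch of the ``persist`` half (the directories, file name, and sample entry are invented for illustration):

    import io
    import os
    import tempfile

    from rasa_nlu.tokenizers.jieba_tokenizer import JiebaTokenizer

    # a throwaway dictionary directory with one custom entry
    src_dir = tempfile.mkdtemp()
    with io.open(os.path.join(src_dir, "userdict.txt"), "w",
                 encoding="utf-8") as f:
        f.write(u"云计算 5 n\n")

    model_dir = tempfile.mkdtemp()
    tk = JiebaTokenizer({"dictionary_path": src_dir})

    # persist copies the file and returns the relative path for the metadata
    meta = tk.persist(model_dir)
    assert meta == {"dictionary_path": "tokenizer_jieba"}
    assert os.path.exists(os.path.join(model_dir, "tokenizer_jieba",
                                       "userdict.txt"))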
