Merge pull request #1090 from howl-anderson/feature/custom_dictionary_for_jieba_tokenizer

New feature: JiebaTokenizer will load custom dictionary from config
tmbo committed Jul 9, 2018
2 parents 153c2c5 + 6e890e3 commit cc5318a
Showing 4 changed files with 91 additions and 0 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.rst
@@ -17,6 +17,7 @@ This project adheres to `Semantic Versioning`_ starting with version 0.7.0.

Added
-----
- support for ``tokenizer_jieba`` to load a custom dictionary from the config
- allow pure json including pipeline configuration on train endpoint
- doc link to a community contribution for Rasa NLU in Chinese
- support for the ``count_vectors_featurizer`` component to use the ``tokens`` feature provided by the tokenizer
2 changes: 2 additions & 0 deletions docs/pipeline.rst
@@ -580,11 +580,13 @@ tokenizer_jieba
``tokenizer_whitespace``. Can be used to define tokens for the
MITIE entity extractor. Make sure to install Jieba, ``pip install jieba``.
:Configuration:
Custom dictionary files can be loaded automatically by specifying the directory that contains them via ``dictionary_path``.

.. code-block:: yaml

    pipeline:
    - name: "tokenizer_jieba"
      dictionary_path: "path/to/custom/dictionary/dir"  # default is None, i.e. no custom dictionary
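
Each file in that directory is loaded as a jieba user dictionary. Following jieba's documented userdict format, each line holds a word, an optional frequency, and an optional part-of-speech tag, separated by spaces (the entries below are examples only):

.. code-block:: text

    云计算 5 n
    创新办 3 i
    台中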
tokenizer_mitie
~~~~~~~~~~~~~~~
69 changes: 69 additions & 0 deletions rasa_nlu/tokenizers/jieba_tokenizer.py
@@ -5,6 +5,8 @@

import glob
import logging
import os
import shutil

from rasa_nlu.components import Component
from rasa_nlu.config import RasaNLUModelConfig
@@ -14,6 +16,8 @@

logger = logging.getLogger(__name__)

JIEBA_CUSTOM_DICTIONARY_PATH = "tokenizer_jieba"


class JiebaTokenizer(Tokenizer, Component):
name = "tokenizer_jieba"
@@ -22,6 +26,19 @@ class JiebaTokenizer(Tokenizer, Component):

language_list = ["zh"]

defaults = {
"dictionary_path": None # default don't load custom dictionary
}

def __init__(self, component_config=None):
# type: (Dict[Text, Any]) -> None
"""Construct a new intent classifier using the MITIE framework."""

super(JiebaTokenizer, self).__init__(component_config)

# path to dictionary file or None
self.dictionary_path = self.component_config.get('dictionary_path')

@classmethod
def required_packages(cls):
# type: () -> List[Text]
@@ -56,6 +73,58 @@ def process(self, message, **kwargs):
def tokenize(self, text):
# type: (Text) -> List[Token]
import jieba

if self.dictionary_path is not None:
self.load_custom_dictionary(self.dictionary_path)

tokenized = jieba.tokenize(text)
tokens = [Token(word, start) for (word, start, end) in tokenized]
return tokens

@classmethod
def load(cls,
model_dir=None, # type: Optional[Text]
model_metadata=None, # type: Optional[Metadata]
cached_component=None, # type: Optional[Component]
**kwargs  # type: Any
):
# type: (...) -> JiebaTokenizer

meta = model_metadata.for_component(cls.name)
relative_dictionary_path = meta.get("dictionary_path")

# get real path of dictionary path, if any
if relative_dictionary_path is not None:
dictionary_path = os.path.join(model_dir, relative_dictionary_path)

meta["dictionary_path"] = dictionary_path

return cls(meta)

@staticmethod
def copy_files_dir_to_dir(input_dir, output_dir):
    # type: (Text, Text) -> None
# make sure target path exists
if not os.path.exists(output_dir):
os.makedirs(output_dir)

target_file_list = glob.glob("{}/*".format(input_dir))
for target_file in target_file_list:
shutil.copy2(target_file, output_dir)

def persist(self, model_dir):
# type: (Text) -> Optional[Dict[Text, Any]]
"""Persist this model into the passed directory."""

model_dictionary_path = None

# copy custom dictionaries to model dir, if any
if self.dictionary_path is not None:
target_dictionary_path = os.path.join(model_dir,
JIEBA_CUSTOM_DICTIONARY_PATH)
self.copy_files_dir_to_dir(self.dictionary_path,
target_dictionary_path)

# set dictionary_path of model metadata to relative path
model_dictionary_path = JIEBA_CUSTOM_DICTIONARY_PATH

return {"dictionary_path": model_dictionary_path}
19 changes: 19 additions & 0 deletions tests/base/test_tokenizers.py
@@ -4,6 +4,8 @@
from __future__ import print_function
from __future__ import unicode_literals

import mock


def test_whitespace():
from rasa_nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
@@ -78,3 +80,20 @@ def test_jieba():

assert [t.offset for t in tk.tokenize("Micheal你好吗?")] == \
[0, 7, 9, 10]


def test_jieba_load_dictionary(tmpdir_factory):
from rasa_nlu.tokenizers.jieba_tokenizer import JiebaTokenizer

dictionary_path = tmpdir_factory.mktemp("jieba_custom_dictionary").strpath

component_config = {
"dictionary_path": dictionary_path
}

with mock.patch.object(JiebaTokenizer, 'load_custom_dictionary',
return_value=None) as mock_method:
tk = JiebaTokenizer(component_config)
tk.tokenize("")

mock_method.assert_called_once_with(dictionary_path)
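
Taken together, ``persist`` and ``load`` round-trip the dictionary through the model directory: ``persist`` copies the files into ``<model_dir>/tokenizer_jieba`` and records that relative path in the metadata, and ``load`` joins it back onto ``model_dir``. A hedged usage sketch of the ``persist`` half (the directories, file name, and sample entry are invented for illustration):

    import io
    import os
    import tempfile

    from rasa_nlu.tokenizers.jieba_tokenizer import JiebaTokenizer

    # a throwaway dictionary directory with one custom entry
    src_dir = tempfile.mkdtemp()
    with io.open(os.path.join(src_dir, "userdict.txt"), "w",
                 encoding="utf-8") as f:
        f.write(u"云计算 5 n\n")

    model_dir = tempfile.mkdtemp()
    tk = JiebaTokenizer({"dictionary_path": src_dir})

    # persist copies the file and returns the relative path for the metadata
    meta = tk.persist(model_dir)
    assert meta == {"dictionary_path": "tokenizer_jieba"}
    assert os.path.exists(os.path.join(model_dir, "tokenizer_jieba",
                                       "userdict.txt"))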
