Update Taskflow word_segmentation and ner tasks #1666

Merged: 40 commits from `add_autosplitter` into `develop` on Mar 15, 2022.

Commits
- `23cc16d` Add AutoSplitter & AutoJoiner (linjieccc, Feb 8, 2022)
- `0ffb746` Merge branch 'develop' into add_autosplitter (linjieccc, Feb 8, 2022)
- `95fcf50` codestyle fix (linjieccc, Feb 8, 2022)
- `f984f84` unify auto joiner (linjieccc, Feb 14, 2022)
- `32ace14` add comments (linjieccc, Feb 14, 2022)
- `dd149f7` Merge branch 'develop' into add_autosplitter (linjieccc, Feb 14, 2022)
- `af7d758` add sentence split mode (linjieccc, Feb 17, 2022)
- `e243ad9` Merge branch 'add_autosplitter' of https://github.com/linjieccc/Paddl… (linjieccc, Feb 17, 2022)
- `e31c3f8` update params (linjieccc, Feb 21, 2022)
- `18c2bfe` add paddle version check (linjieccc, Feb 22, 2022)
- `5c95b71` Merge branch 'develop' into add_autosplitter (linjieccc, Feb 28, 2022)
- `8f63909` add wordtag for word_segmentation (linjieccc, Mar 1, 2022)
- `1636a8f` add wordtag for word_segmentation (linjieccc, Mar 1, 2022)
- `6549aa4` Merge branch 'develop' into add_autosplitter (linjieccc, Mar 1, 2022)
- `a35fb2c` add ner-lac and word_segmentation-jieba (linjieccc, Mar 7, 2022)
- `643c901` add return entities only for ner (linjieccc, Mar 8, 2022)
- `4cc6389` fix ci (linjieccc, Mar 9, 2022)
- `6a840dc` fix ci (linjieccc, Mar 9, 2022)
- `e202e5d` fix ci (linjieccc, Mar 9, 2022)
- `7c9672a` fix ci (linjieccc, Mar 9, 2022)
- `2931053` fix ci (linjieccc, Mar 9, 2022)
- `dcab431` Update README.md (linjieccc, Mar 9, 2022)
- `6182ca0` Update README.md (linjieccc, Mar 9, 2022)
- `b5f9e2b` Update README.md (linjieccc, Mar 9, 2022)
- `5850a1a` Update README.md (linjieccc, Mar 9, 2022)
- `3f87c52` Update README.md (linjieccc, Mar 9, 2022)
- `3a932dc` Update README.md (linjieccc, Mar 9, 2022)
- `a09abfe` Update README.md (linjieccc, Mar 10, 2022)
- `2f62846` Update README.md (linjieccc, Mar 10, 2022)
- `8a7caab` Update README.md (linjieccc, Mar 10, 2022)
- `72ab3c8` Update README.md (linjieccc, Mar 10, 2022)
- `75b3a2c` Update README.md (linjieccc, Mar 10, 2022)
- `cb2e174` Update README.md (linjieccc, Mar 11, 2022)
- `0fe6977` Merge branch 'develop' into add_autosplitter (linjieccc, Mar 11, 2022)
- `7fbebb4` fix bugs of dataloader (linjieccc, Mar 13, 2022)
- `d29fefa` remove guard (linjieccc, Mar 15, 2022)
- `7a51b43` Merge branch 'develop' into add_autosplitter (linjieccc, Mar 15, 2022)
- `5346e88` use fast mode for rnn example (linjieccc, Mar 15, 2022)
- `b978773` Update README.md (linjieccc, Mar 15, 2022)
- `aa91294` Update README.md (linjieccc, Mar 15, 2022)
docs/model_zoo/taskflow.md (37 additions, 1 deletion)

@@ -80,8 +80,18 @@ seg("第十四届全运会在西安举办")

seg(["第十四届全运会在西安举办", "三亚是一个美丽的城市"])
>>> [['第十四届', '全运会', '在', '西安', '举办'], ['三亚', '是', '一个', '美丽', '的', '城市']]

# Word segmentation with the WordTag model
seg = Taskflow("word_segmentation", model="wordtag")
seg("李伟拿出具有科学性、可操作性的《陕西省高校管理体制改革实施方案》")
>>> ['李伟', '拿出', '具有', '科学性', '、', '可操作性', '的', '《', '陕西省高校管理体制改革实施方案', '》']

seg("国家卫健委修订完成了新型冠状病毒肺炎诊疗方案")
>>> ['国家卫健委', '修订', '完成', '了', '新型冠状病毒肺炎', '诊疗', '方案']
```

**NOTE**: WordTag-based segmentation performs better on entity terms such as company and organization names, but because the pretrained model is large, inference is slower; choose the segmentation model that fits your application scenario.

#### Custom dictionary

Users can customize segmentation results by loading a custom dictionary. Each line of the dictionary file represents one custom item, which may consist of a single word or multiple words.
@@ -93,7 +103,7 @@ seg(["第十四届全运会在西安举办", "三亚是一个美丽的城市"])
年 末
```

Taking "平原上的火焰计划于年末上映" as an example, the original output is:
With the default model, the original output for "平原上的火焰计划于年末上映" is:

```text
['平原', '上', '的', '火焰', '计划', '于', '年末', '上映']
@@ -111,6 +121,8 @@ my_seg("平原上的火焰计划于年末上映")

#### Custom task

- Custom LAC model

The default path for this task is `$HOME/.paddlenlp/taskflow/word_segmentation/lac/`, which contains all the files needed to run the task.

Users can also train a custom Chinese word segmentation model on their own data; see the [lexical analysis training example](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/examples/lexical_analysis).
@@ -133,6 +145,30 @@ from paddlenlp import Taskflow

my_seg = Taskflow("word_segmentation", task_path="./custom_task_path/")
```

- Custom WordTag model

The default path for this task is `$HOME/.paddlenlp/taskflow/word_segmentation/wordtag/`, which contains all the files needed to run the task.

Users can also train a custom WordTag model on their own data; see the [NER-WordTag incremental training example](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/examples/text_to_knowledge/ernie-ctm).

Once you have a custom model, use `task_path` to point to the custom path; the files under the custom path must match those under the default path.

The custom path must contain the following files (the user's own model weights and tag file):
```text
custom_task_path/
├── model_state.pdparams
└── tags.txt
```

Load the custom model with Taskflow for one-line prediction:

```python
from paddlenlp import Taskflow

my_seg = Taskflow("word_segmentation", model="wordtag", task_path="./custom_task_path/")
```

#### Configurable parameters

* `batch_size`: the batch size; adjust it according to your machine. Defaults to 1.
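For a quick illustration of these parameters, here is a minimal usage sketch. `batch_size` is documented above; passing `num_workers` and `split_sentence` through the `Taskflow` constructor is an assumption based on the kwargs this PR reads in the task code.

```python
from paddlenlp import Taskflow

# Minimal sketch of configuring the task. batch_size is documented above;
# num_workers and split_sentence are assumed to be accepted kwargs, based
# on the kwargs read in this PR's task code.
seg = Taskflow(
    "word_segmentation",
    batch_size=32,        # batch inputs for faster inference
    num_workers=2,        # dataloader worker processes (assumed kwarg)
    split_sentence=True,  # split long inputs at sentence boundaries (assumed kwarg)
)
seg(["第十四届全运会在西安举办", "三亚是一个美丽的城市"])
```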
paddlenlp/taskflow/knowledge_mining.py (26 additions, 112 deletions)

@@ -27,11 +27,7 @@
import paddle.nn as nn
from paddlenlp.layers.crf import LinearChainCrf
from paddlenlp.utils.tools import compare_version
if compare_version(paddle.version.full_version, "2.2.0") >= 0:
# paddle.text.ViterbiDecoder is supported by paddle after version 2.2.0
from paddle.text import ViterbiDecoder
else:
from paddlenlp.layers.crf import ViterbiDecoder
from paddle.text import ViterbiDecoder

from ..datasets import MapDataset, load_dataset
from ..data import Stack, Pad, Tuple
@@ -217,6 +213,16 @@ def __init__(self,
self._custom.load_customization(self._user_dict)
else:
self._custom = None
self._num_workers = self.kwargs[
'num_workers'] if 'num_workers' in self.kwargs else 0
self._batch_size = self.kwargs[
'batch_size'] if 'batch_size' in self.kwargs else 1
self._lazy_load = self.kwargs[
'lazy_load'] if 'lazy_load' in self.kwargs else False
self._max_seq_len = self.kwargs[
'max_seq_len'] if 'max_seq_len' in self.kwargs else 512
self._split_sentence = self.kwargs[
'split_sentence'] if 'split_sentence' in self.kwargs else False
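The conditional expressions above cache per-call options once in the constructor. A functionally equivalent sketch using `dict.get`, shown here only for comparison and not part of the diff:

```python
# Equivalent to the kwargs caching above (sketch, not part of the PR).
self._num_workers = self.kwargs.get('num_workers', 0)
self._batch_size = self.kwargs.get('batch_size', 1)
self._lazy_load = self.kwargs.get('lazy_load', False)
self._max_seq_len = self.kwargs.get('max_seq_len', 512)
self._split_sentence = self.kwargs.get('split_sentence', False)
```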

@property
def summary_num(self):
@@ -263,127 +269,34 @@ def _load_task_resources(self):
self._termtree = TermTree.from_dir(
self._term_schema_path, self._term_data_path, self._linking)

def _split_long_text_input(self, input_texts, max_text_len):
"""
Split long texts into lists of short texts. The max_seq_len of the input text is 512; this function splits any text longer than that.
"""
short_input_texts = []
for text in input_texts:
if len(text) <= max_text_len:
short_input_texts.append(text)
else:
lens = len(text)
temp_text_list = text.split("?。!")
temp_text_list = [
temp_text for temp_text in temp_text_list
if len(temp_text) > 0
]
if len(temp_text_list) <= 1:
temp_text_list = [
text[i:i + max_text_len]
for i in range(0, len(text), max_text_len)
]
short_input_texts.extend(temp_text_list)
else:
list_len = len(temp_text_list)
start = 0
end = 0
for i in range(0, list_len):
if len(temp_text_list[i]) + 1 >= max_text_len:
if start != end:
short_input_texts.extend(
self._split_long_text_input(
[text[start:end]], max_text_len))
short_input_texts.extend(
self._split_long_text_input([
text[end:end + len(temp_text_list[i]) + 1]
], max_text_len))
start = end + len(temp_text_list[i]) + 1
end = start
else:
if start + len(temp_text_list[
i]) + 1 > max_text_len:
short_input_texts.extend(
self._split_long_text_input(
[text[start:end]], max_text_len))
start = end
end = end + len(temp_text_list[i]) + 1
else:
end = len(temp_text_list[i]) + 1
if start != end:
short_input_texts.extend(
self._split_long_text_input([text[start:end]],
max_text_len))
return short_input_texts

def _concat_short_text_reuslts(self, input_texts, results):
"""
Concat the model output of short texts to the total result of long text.
"""
long_text_lens = [len(text) for text in input_texts]
concat_results = []
single_results = {}
count = 0
for text in input_texts:
text_len = len(text)
while True:
if len(single_results) == 0 or len(single_results[
"text"]) < text_len:
if len(single_results) == 0:
single_results = copy.deepcopy(results[count])
else:
single_results["text"] += results[count]["text"]
single_results["items"].extend(results[count]["items"])
count += 1
elif len(single_results["text"]) == text_len:
concat_results.append(single_results)
single_results = {}
break
else:
raise Exception(
"The length of input text and raw text is not equal.")
for result in concat_results:
pred_words = result['items']
pred_words = self._reset_offset(pred_words)
result['items'] = pred_words
return concat_results

def _preprocess_text(self, input_texts):
"""
Create the dataset and dataloader for the predict.
"""
batch_size = self.kwargs[
'batch_size'] if 'batch_size' in self.kwargs else 1
num_workers = self.kwargs[
'num_workers'] if 'num_workers' in self.kwargs else 0

max_seq_length = 512
if 'max_seq_length' in self.kwargs:
max_seq_length = self.kwargs['max_seq_length']
infer_data = []
max_predict_len = max_seq_length - self.summary_num - 1
max_predict_len = self._max_seq_len - self.summary_num - 1
filter_input_texts = []
for input_text in input_texts:
if not (isinstance(input_text, str) and len(input_text) > 0):
continue
filter_input_texts.append(input_text)
input_texts = filter_input_texts

short_input_texts = self._split_long_text_input(input_texts,
max_predict_len)
short_input_texts, self.input_mapping = self._auto_splitter(
input_texts, max_predict_len, split_sentence=self._split_sentence)

def read(inputs):
for text in inputs:
tokenized_output = self._tokenizer(
list(text),
return_length=True,
is_split_into_words=True,
max_seq_len=max_seq_length)
max_seq_len=self._max_seq_len)
yield tokenized_output['input_ids'], tokenized_output[
'token_type_ids'], tokenized_output['seq_len']

infer_ds = load_dataset(read, inputs=short_input_texts, lazy=False)
infer_ds = load_dataset(
read, inputs=short_input_texts, lazy=self._lazy_load)
batchify_fn = lambda samples, fn=Tuple(
Pad(axis=0, pad_val=self._tokenizer.pad_token_id, dtype='int64'
), # input_ids
@@ -396,15 +309,14 @@ def read(inputs):
infer_data_loader = paddle.io.DataLoader(
infer_ds,
collate_fn=batchify_fn,
num_workers=num_workers,
batch_size=batch_size,
num_workers=self._num_workers,
batch_size=self._batch_size,
shuffle=False,
return_list=True)

outputs = {}
outputs['data_loader'] = infer_data_loader
outputs['short_input_texts'] = short_input_texts
outputs['inputs'] = input_texts
return outputs

def _reset_offset(self, pred_words):
@@ -420,9 +332,8 @@ def _decode(self, batch_texts, batch_pred_tags):
for sent_index in range(len(batch_texts)):
sent = batch_texts[sent_index]
tags = [
self._index_to_tags[index]
for index in batch_pred_tags[sent_index][self.summary_num:len(
sent) + self.summary_num]
self._index_to_tags[index] for index in batch_pred_tags[
sent_index][self.summary_num:len(sent) + self.summary_num]
]
if self._custom:
self._custom.parse_customization(sent, tags, prefix=True)
@@ -561,7 +472,11 @@ def _postprocess(self, inputs):
"""
results = self._decode(inputs['short_input_texts'],
inputs['all_pred_tags'])
results = self._concat_short_text_reuslts(inputs['inputs'], results)
results = self._auto_joiner(results, self.input_mapping, is_dict=True)
for result in results:
pred_words = result['items']
pred_words = self._reset_offset(pred_words)
result['items'] = pred_words
if self.linking is True:
for res in results:
self._term_linking(res)
@@ -804,7 +719,6 @@ def _run_model(self, inputs):
all_scores_can = []
all_preds_can = []
pred_ids = []

for batch in inputs['data_loader']:
input_ids, token_type_ids, label_indices = batch
self.input_handles[0].copy_from_cpu(input_ids.numpy())
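This file's deleted helpers `_split_long_text_input` and `_concat_short_text_reuslts` are superseded by the shared `_auto_splitter`/`_auto_joiner` utilities, whose implementation is not part of this diff. The following is a minimal sketch, under stated assumptions, of the split-and-rejoin pattern they appear to implement: chunk each long input (optionally at sentence boundaries), remember which chunks belong to which input, run the model on the chunks, then concatenate the per-chunk results. All names except the two method names are hypothetical.

```python
import re

def auto_splitter(input_texts, max_text_len, split_sentence=False):
    """Sketch: split long texts into chunks of at most max_text_len and
    record which chunk indices belong to each original input."""
    short_texts, input_mapping = [], {}
    for idx, text in enumerate(input_texts):
        # Optionally cut at sentence-final punctuation, keeping it attached.
        pieces = [p for p in re.split(r"(?<=[。!?])", text) if p] \
            if split_sentence else [text]
        chunks = []
        for piece in pieces:
            # Hard-wrap any piece that still exceeds max_text_len.
            chunks.extend(piece[i:i + max_text_len]
                          for i in range(0, len(piece), max_text_len))
        input_mapping[idx] = list(
            range(len(short_texts), len(short_texts) + len(chunks)))
        short_texts.extend(chunks)
    return short_texts, input_mapping

def auto_joiner(short_results, input_mapping):
    """Sketch: concatenate per-chunk token lists back into one result per
    original input (the real _auto_joiner also supports dict results via
    its is_dict flag, as used in _postprocess above)."""
    return [sum((short_results[i] for i in chunk_ids), [])
            for chunk_ids in input_mapping.values()]
```

Note how `_preprocess_text` now stores the mapping on the task (`self.input_mapping`) so that `_postprocess` can rejoin per-chunk predictions, which is why `outputs['inputs']` no longer needs to carry the original texts.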
paddlenlp/taskflow/lexical_analysis.py (16 additions, 6 deletions)

@@ -118,6 +118,7 @@ def __init__(self, task, model, user_dict=None, **kwargs):
self._check_task_files()
self._construct_vocabs()
self._get_inference_model()
self._max_seq_len = 512
if self._user_dict:
self._custom = Customization()
self._custom.load_customization(self._user_dict)
@@ -179,17 +180,24 @@ def _preprocess(self, inputs, padding=True, add_special_tokens=True):
'batch_size'] if 'batch_size' in self.kwargs else 1
num_workers = self.kwargs[
'num_workers'] if 'num_workers' in self.kwargs else 0
self._split_sentence = self.kwargs[
'split_sentence'] if 'split_sentence' in self.kwargs else False
infer_data = []
oov_token_id = self._word_vocab.get("OOV")

filter_inputs = []
for input in inputs:
if not (isinstance(input, str) and len(input.strip()) > 0):
continue
filter_inputs.append(input)

short_input_texts, self.input_mapping = self._auto_splitter(
filter_inputs,
self._max_seq_len,
split_sentence=self._split_sentence)

def read(inputs):
for input_tokens in inputs:
if not (isinstance(input_tokens, str) and
len(input_tokens.strip()) > 0):
continue
filter_inputs.append(input_tokens)
ids = []
for token in input_tokens:
token = self._q2b_vocab.get(token, token)
Expand All @@ -198,7 +206,7 @@ def read(inputs):
lens = len(ids)
yield ids, lens

infer_ds = load_dataset(read, inputs=inputs, lazy=False)
infer_ds = load_dataset(read, inputs=short_input_texts, lazy=False)
batchify_fn = lambda samples, fn=Tuple(
Pad(axis=0, pad_val=0, dtype="int64"), # input_ids
Stack(dtype='int64'), # seq_len
@@ -211,7 +219,7 @@ def read(inputs):
shuffle=False,
return_list=True)
outputs = {}
outputs['text'] = filter_inputs
outputs['text'] = short_input_texts
outputs['data_loader'] = infer_data_loader
return outputs

@@ -273,4 +281,6 @@ def _postprocess(self, inputs):
single_result['segs'] = sent_out
single_result['tags'] = tags_out
final_results.append(single_result)
final_results = self._auto_joiner(
final_results, self.input_mapping, is_dict=True)
return final_results
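With the splitter wired into `_preprocess` and the joiner into `_postprocess`, long inputs no longer need manual chunking by the caller. A hedged usage sketch follows; `split_sentence` mirrors the kwarg read above, and passing it through the `Taskflow` constructor is an assumption.

```python
from paddlenlp import Taskflow

# Sketch: inputs longer than _max_seq_len (512) are auto-split before
# inference and the per-chunk predictions are rejoined afterwards.
# Passing split_sentence here is assumed from the kwargs read above.
ner = Taskflow("ner", split_sentence=True)
ner("国家卫健委修订完成了新型冠状病毒肺炎诊疗方案。" * 60)  # well over 512 chars
```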
paddlenlp/taskflow/models/lexical_analysis_model.py (1 addition, 5 deletions)

@@ -19,11 +19,7 @@
from paddlenlp.layers.crf import LinearChainCrf, LinearChainCrfLoss
from paddlenlp.utils.tools import compare_version

if compare_version(paddle.version.full_version, "2.2.0") >= 0:
# paddle.text.ViterbiDecoder is supported by paddle after version 2.2.0
from paddle.text import ViterbiDecoder
else:
from paddlenlp.layers.crf import ViterbiDecoder
from paddle.text import ViterbiDecoder


class BiGruCrf(nn.Layer):
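Both this file and knowledge_mining.py drop the import fallback around `ViterbiDecoder`, so the code now hard-requires `paddle >= 2.2.0`. The commit log mentions adding a paddle version check; a minimal sketch of such a gate, reusing the `compare_version` helper already imported here, could look like the following (the exact message and placement are assumptions):

```python
import paddle
from paddlenlp.utils.tools import compare_version

# Sketch of an explicit version gate replacing the old import fallback.
# paddle.text.ViterbiDecoder only exists in paddle >= 2.2.0.
if compare_version(paddle.version.full_version, "2.2.0") < 0:
    raise RuntimeError(
        "This task requires paddlepaddle >= 2.2.0 for paddle.text.ViterbiDecoder; "
        "please upgrade paddlepaddle.")
from paddle.text import ViterbiDecoder
```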