merge and fix conflicts

PaddlePaddle · Feb 17, 2023 · 03626da · 03626da
2 parents 171fb53 + 9f78fa5
commit 03626da
Show file tree

Hide file tree

Showing 343 changed files with 20,693 additions and 6,679 deletions.
diff --git a/.flake8 b/.flake8
@@ -1,5 +1,5 @@
 [flake8]
-ignore = E203, E501, E741, W503, W605
+ignore = E203, E501, E731, E741, W503, W605
 max-line-length = 119
 
 # E402: module level import not at top of file

diff --git a/applications/document_intelligence/doc_vqa/Rerank/src/utils/args.py b/applications/document_intelligence/doc_vqa/Rerank/src/utils/args.py
@@ -12,19 +12,16 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Arguments for configuration."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-from __future__ import unicode_literals
-from __future__ import absolute_import
+from __future__ import absolute_import, division, print_function, unicode_literals
 
-import six
+import logging
 import os
 import sys
-import argparse
-import logging
 
 import paddle.fluid as fluid
+import six
+
+from paddlenlp.trainer.argparser import strtobool
 
 log = logging.getLogger(__name__)
 
@@ -42,19 +39,13 @@ def prepare_logger(logger, debug=False, save_to_file=None):
     logger.propagate = False
 
 
-def str2bool(v):
-    # because argparse does not support to parse "true, False" as python
-    # boolean directly
-    return v.lower() in ("true", "t", "1")
-
-
 class ArgumentGroup(object):
     def __init__(self, parser, title, des):
         self._group = parser.add_argument_group(title=title, description=des)
 
     def add_arg(self, name, type, default, help, positional_arg=False, **kwargs):
         prefix = "" if positional_arg else "--"
-        type = str2bool if type == bool else type
+        type = strtobool if type == bool else type
         self._group.add_argument(
             prefix + name, default=default, type=type, help=help + " Default: %(default)s.", **kwargs
         )
@@ -73,8 +64,8 @@ def check_cuda(
     Please: 1. Install paddlepaddle-gpu to run your models on GPU or 2. Set use_cuda = False to run models on CPU.\n",
 ):
     try:
-        if use_cuda == True and fluid.is_compiled_with_cuda() == False:
+        if use_cuda is True and fluid.is_compiled_with_cuda() is False:
             log.error(err)
             sys.exit(1)
-    except Exception as e:
+    except Exception:
         pass
diff --git a/applications/information_extraction/document/utils.py b/applications/information_extraction/document/utils.py
@@ -292,16 +292,18 @@ def _encode_doc(tokenizer, offset_mapping, last_offset, prompt, this_text_line,
         q_sep_index = content_encoded_inputs["input_ids"].index(2, 1)
 
         bias = 0
-        for index in range(len(sub_offset_mapping)):
-            if index == 0:
+        for i in range(len(sub_offset_mapping)):
+            if i == 0:
                 continue
-            mapping = sub_offset_mapping[index]
+            mapping = sub_offset_mapping[i]
             if mapping[0] == 0 and mapping[1] == 0 and bias == 0:
-                bias = sub_offset_mapping[index - 1][-1] + 1
+                bias = sub_offset_mapping[i - 1][-1] + 1
             if mapping[0] == 0 and mapping[1] == 0:
                 continue
-            sub_offset_mapping[index][0] += bias
-            sub_offset_mapping[index][1] += bias
+            if mapping == sub_offset_mapping[i - 1]:
+                continue
+            sub_offset_mapping[i][0] += bias
+            sub_offset_mapping[i][1] += bias
 
         offset_mapping = sub_offset_mapping[:-1]
         last_offset = offset_mapping[-1][-1]
@@ -316,7 +318,7 @@ def _encode_doc(tokenizer, offset_mapping, last_offset, prompt, this_text_line,
             if i == 0:
                 org_offset = sub_list[1]
             else:
-                if sub_list[0] != org_offset:
+                if sub_list[0] != org_offset and sub_offset_mapping[1:-1][i - 1] != sub_list:
                     last_offset += 1
                 org_offset = sub_list[1]
             offset_mapping += [[last_offset, sub_list[1] - sub_list[0] + last_offset]]

diff --git a/applications/information_extraction/label_studio.py b/applications/information_extraction/label_studio.py
@@ -23,6 +23,7 @@
 import numpy as np
 import paddle
 
+from paddlenlp.trainer.argparser import strtobool
 from paddlenlp.utils.log import logger
 from paddlenlp.utils.tools import DataConverter
 
@@ -33,16 +34,6 @@ def set_seed(seed):
     np.random.seed(seed)
 
 
-def str2bool(v):
-    """Support bool type for argparse."""
-    if v.lower() in ("yes", "true", "t", "y", "1"):
-        return True
-    elif v.lower() in ("no", "false", "f", "n", "0"):
-        return False
-    else:
-        raise argparse.ArgumentTypeError("Unsupported value encountered.")
-
-
 def do_convert():
     set_seed(args.seed)
 
@@ -135,7 +126,7 @@ def _save_examples(save_dir, file_name, examples):
     parser.add_argument("--task_type", choices=['ext', 'cls'], default="ext", type=str, help="Select task type, ext for the extraction task and cls for the classification task, defaults to ext.")
     parser.add_argument("--options", default=["正向", "负向"], type=str, nargs="+", help="Used only for the classification task, the options for classification")
     parser.add_argument("--prompt_prefix", default="情感倾向", type=str, help="Used only for the classification task, the prompt prefix for classification")
-    parser.add_argument("--is_shuffle", default="True", type=str2bool, help="Whether to shuffle the labeled dataset, defaults to True.")
+    parser.add_argument("--is_shuffle", default="True", type=strtobool, help="Whether to shuffle the labeled dataset, defaults to True.")
     parser.add_argument("--layout_analysis", default=False, type=bool, help="Enable layout analysis to optimize the order of OCR result.")
     parser.add_argument("--seed", type=int, default=1000, help="Random seed for initialization")
     parser.add_argument("--separator", type=str, default='##', help="Used only for entity/aspect-level classification task, separator for entity label and classification label")

diff --git a/applications/neural_search/ranking/cross_encoder/train_ce.py b/applications/neural_search/ranking/cross_encoder/train_ce.py
@@ -12,31 +12,29 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from functools import partial
 import argparse
 import os
 import random
 import time
-import distutils.util
+from functools import partial
 
 import numpy as np
 import paddle
 import paddle.nn.functional as F
-from paddlenlp.data import Stack, Tuple, Pad
+from data import convert_example, create_dataloader, read_data
+
+from paddlenlp.data import Pad, Stack, Tuple
 from paddlenlp.datasets import load_dataset
-from paddlenlp.transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
-from paddlenlp.transformers import PolyDecayWithWarmup
-from paddle.distributed.fleet.utils.hybrid_parallel_util import fused_allreduce_gradients
-from data import convert_example, read_data, create_dataloader
+from paddlenlp.trainer.argparser import strtobool
+from paddlenlp.transformers import AutoModelForSequenceClassification, AutoTokenizer
 
 # yapf: disable
 parser = argparse.ArgumentParser()
 parser.add_argument("--save_dir", default='./checkpoint', type=str, help="The output directory where the model checkpoints will be written.")
 parser.add_argument("--train_set", type=str, required=True, help="The full path of train_set_file.")
 parser.add_argument("--test_file", type=str, required=True, help="The full path of test file")
 
-parser.add_argument("--max_seq_length", default=128, type=int, help="The maximum total input sequence length after tokenization. "
-    "Sequences longer than this will be truncated, sequences shorter will be padded.")
+parser.add_argument("--max_seq_length", default=128, type=int, help="The maximum total input sequence length after tokenization. Sequences longer than this will be truncated, sequences shorter will be padded.")
 parser.add_argument("--batch_size", default=32, type=int, help="Batch size per GPU/CPU for training.")
 parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
 parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
@@ -48,7 +46,7 @@
 parser.add_argument("--init_from_ckpt", type=str, default=None, help="The path of checkpoint to be loaded.")
 parser.add_argument("--seed", type=int, default=1000, help="random seed for initialization")
 parser.add_argument('--device', choices=['cpu', 'gpu', 'xpu', 'npu'], default="gpu", help="Select which device to train model, defaults to gpu.")
-parser.add_argument("--use_amp", type=distutils.util.strtobool, default=False, help="Enable mixed precision training.")
+parser.add_argument("--use_amp", type=strtobool, default=False, help="Enable mixed precision training.")
 parser.add_argument("--scale_loss", type=float, default=2**15, help="The value of scale_loss for fp16.")
 parser.add_argument('--model_name_or_path', default="rocketqa-base-cross-encoder", help="The pretrained model used for training")
 parser.add_argument("--eval_step", default=200, type=int, help="Step interval for evaluation.")
@@ -149,8 +147,6 @@ def do_train():
 
     criterion = paddle.nn.loss.CrossEntropyLoss()
     metric = paddle.metric.Auc()
-    if args.use_amp:
-        scaler = paddle.amp.GradScaler(init_loss_scaling=args.scale_loss)
     global_step = 0
     tic_train = time.time()
     for epoch in range(1, args.epochs + 1):

diff --git a/applications/sentiment_analysis/unified_sentiment_extraction/label_studio.py b/applications/sentiment_analysis/unified_sentiment_extraction/label_studio.py
@@ -23,8 +23,9 @@
 
 import numpy as np
 import paddle
-from utils import load_txt, str2bool
+from utils import load_txt
 
+from paddlenlp.trainer.argparser import strtobool
 from paddlenlp.utils.log import logger
 
 
@@ -727,7 +728,7 @@ def _save_examples(save_dir, file_name, examples):
     parser.add_argument("--splits", default=[0.8, 0.1, 0.1], type=float, nargs="*", help="The ratio of samples in datasets. [0.6, 0.2, 0.2] means 60% samples used for training, 20% for evaluation and 20% for test.")
     parser.add_argument("--task_type", choices=['ext', 'cls'], default="ext", type=str, help="Two task types [ext, cls] are supported, ext represents the aspect-based extraction task and cls represents the sentence-level classification task, defaults to ext.")
     parser.add_argument("--options", type=str, nargs="+", help="Used only for the classification task, the options for classification")
-    parser.add_argument("--is_shuffle", type=str2bool, default="True", help="Whether to shuffle the labeled dataset, defaults to True.")
+    parser.add_argument("--is_shuffle", type=strtobool, default="True", help="Whether to shuffle the labeled dataset, defaults to True.")
     parser.add_argument("--seed", type=int, default=1000, help="Random seed for initialization")
 
     args = parser.parse_args()

diff --git a/applications/text_classification/hierarchical/few-shot/README.md b/applications/text_classification/hierarchical/few-shot/README.md
@@ -1,5 +1,7 @@
 # 小样本场景下的多标签层次分类任务指南
 
+**零样本/小样本文本分类推荐使用 UTC 模型，详情见[目录](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/applications/zero_shot_text_classification)，本项目将会在2.5.2版本下线。**
+
 ## 目录
 
 - [1. 项目说明](#项目说明)