Skip to content

Commit

Permalink
merge and fix conflicts
Browse files Browse the repository at this point in the history
  • Loading branch information
Yam0214 committed Feb 17, 2023
2 parents 171fb53 + 9f78fa5 commit 03626da
Show file tree
Hide file tree
Showing 343 changed files with 20,693 additions and 6,679 deletions.
2 changes: 1 addition & 1 deletion .flake8
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[flake8]
ignore = E203, E501, E741, W503, W605
ignore = E203, E501, E731, E741, W503, W605
max-line-length = 119

# E402: module level import not at top of file
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,19 +12,16 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Arguments for configuration."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from __future__ import absolute_import
from __future__ import absolute_import, division, print_function, unicode_literals

import six
import logging
import os
import sys
import argparse
import logging

import paddle.fluid as fluid
import six

from paddlenlp.trainer.argparser import strtobool

log = logging.getLogger(__name__)

Expand All @@ -42,19 +39,13 @@ def prepare_logger(logger, debug=False, save_to_file=None):
logger.propagate = False


def str2bool(v):
# because argparse does not support to parse "true, False" as python
# boolean directly
return v.lower() in ("true", "t", "1")


class ArgumentGroup(object):
def __init__(self, parser, title, des):
self._group = parser.add_argument_group(title=title, description=des)

def add_arg(self, name, type, default, help, positional_arg=False, **kwargs):
prefix = "" if positional_arg else "--"
type = str2bool if type == bool else type
type = strtobool if type == bool else type
self._group.add_argument(
prefix + name, default=default, type=type, help=help + " Default: %(default)s.", **kwargs
)
Expand All @@ -73,8 +64,8 @@ def check_cuda(
Please: 1. Install paddlepaddle-gpu to run your models on GPU or 2. Set use_cuda = False to run models on CPU.\n",
):
try:
if use_cuda == True and fluid.is_compiled_with_cuda() == False:
if use_cuda is True and fluid.is_compiled_with_cuda() is False:
log.error(err)
sys.exit(1)
except Exception as e:
except Exception:
pass
16 changes: 9 additions & 7 deletions applications/information_extraction/document/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -292,16 +292,18 @@ def _encode_doc(tokenizer, offset_mapping, last_offset, prompt, this_text_line,
q_sep_index = content_encoded_inputs["input_ids"].index(2, 1)

bias = 0
for index in range(len(sub_offset_mapping)):
if index == 0:
for i in range(len(sub_offset_mapping)):
if i == 0:
continue
mapping = sub_offset_mapping[index]
mapping = sub_offset_mapping[i]
if mapping[0] == 0 and mapping[1] == 0 and bias == 0:
bias = sub_offset_mapping[index - 1][-1] + 1
bias = sub_offset_mapping[i - 1][-1] + 1
if mapping[0] == 0 and mapping[1] == 0:
continue
sub_offset_mapping[index][0] += bias
sub_offset_mapping[index][1] += bias
if mapping == sub_offset_mapping[i - 1]:
continue
sub_offset_mapping[i][0] += bias
sub_offset_mapping[i][1] += bias

offset_mapping = sub_offset_mapping[:-1]
last_offset = offset_mapping[-1][-1]
Expand All @@ -316,7 +318,7 @@ def _encode_doc(tokenizer, offset_mapping, last_offset, prompt, this_text_line,
if i == 0:
org_offset = sub_list[1]
else:
if sub_list[0] != org_offset:
if sub_list[0] != org_offset and sub_offset_mapping[1:-1][i - 1] != sub_list:
last_offset += 1
org_offset = sub_list[1]
offset_mapping += [[last_offset, sub_list[1] - sub_list[0] + last_offset]]
Expand Down
13 changes: 2 additions & 11 deletions applications/information_extraction/label_studio.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
import numpy as np
import paddle

from paddlenlp.trainer.argparser import strtobool
from paddlenlp.utils.log import logger
from paddlenlp.utils.tools import DataConverter

Expand All @@ -33,16 +34,6 @@ def set_seed(seed):
np.random.seed(seed)


def str2bool(v):
"""Support bool type for argparse."""
if v.lower() in ("yes", "true", "t", "y", "1"):
return True
elif v.lower() in ("no", "false", "f", "n", "0"):
return False
else:
raise argparse.ArgumentTypeError("Unsupported value encountered.")


def do_convert():
set_seed(args.seed)

Expand Down Expand Up @@ -135,7 +126,7 @@ def _save_examples(save_dir, file_name, examples):
parser.add_argument("--task_type", choices=['ext', 'cls'], default="ext", type=str, help="Select task type, ext for the extraction task and cls for the classification task, defaults to ext.")
parser.add_argument("--options", default=["正向", "负向"], type=str, nargs="+", help="Used only for the classification task, the options for classification")
parser.add_argument("--prompt_prefix", default="情感倾向", type=str, help="Used only for the classification task, the prompt prefix for classification")
parser.add_argument("--is_shuffle", default="True", type=str2bool, help="Whether to shuffle the labeled dataset, defaults to True.")
parser.add_argument("--is_shuffle", default="True", type=strtobool, help="Whether to shuffle the labeled dataset, defaults to True.")
parser.add_argument("--layout_analysis", default=False, type=bool, help="Enable layout analysis to optimize the order of OCR result.")
parser.add_argument("--seed", type=int, default=1000, help="Random seed for initialization")
parser.add_argument("--separator", type=str, default='##', help="Used only for entity/aspect-level classification task, separator for entity label and classification label")
Expand Down
20 changes: 8 additions & 12 deletions applications/neural_search/ranking/cross_encoder/train_ce.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,31 +12,29 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from functools import partial
import argparse
import os
import random
import time
import distutils.util
from functools import partial

import numpy as np
import paddle
import paddle.nn.functional as F
from paddlenlp.data import Stack, Tuple, Pad
from data import convert_example, create_dataloader, read_data

from paddlenlp.data import Pad, Stack, Tuple
from paddlenlp.datasets import load_dataset
from paddlenlp.transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
from paddlenlp.transformers import PolyDecayWithWarmup
from paddle.distributed.fleet.utils.hybrid_parallel_util import fused_allreduce_gradients
from data import convert_example, read_data, create_dataloader
from paddlenlp.trainer.argparser import strtobool
from paddlenlp.transformers import AutoModelForSequenceClassification, AutoTokenizer

# yapf: disable
parser = argparse.ArgumentParser()
parser.add_argument("--save_dir", default='./checkpoint', type=str, help="The output directory where the model checkpoints will be written.")
parser.add_argument("--train_set", type=str, required=True, help="The full path of train_set_file.")
parser.add_argument("--test_file", type=str, required=True, help="The full path of test file")

parser.add_argument("--max_seq_length", default=128, type=int, help="The maximum total input sequence length after tokenization. "
"Sequences longer than this will be truncated, sequences shorter will be padded.")
parser.add_argument("--max_seq_length", default=128, type=int, help="The maximum total input sequence length after tokenization. Sequences longer than this will be truncated, sequences shorter will be padded.")
parser.add_argument("--batch_size", default=32, type=int, help="Batch size per GPU/CPU for training.")
parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
Expand All @@ -48,7 +46,7 @@
parser.add_argument("--init_from_ckpt", type=str, default=None, help="The path of checkpoint to be loaded.")
parser.add_argument("--seed", type=int, default=1000, help="random seed for initialization")
parser.add_argument('--device', choices=['cpu', 'gpu', 'xpu', 'npu'], default="gpu", help="Select which device to train model, defaults to gpu.")
parser.add_argument("--use_amp", type=distutils.util.strtobool, default=False, help="Enable mixed precision training.")
parser.add_argument("--use_amp", type=strtobool, default=False, help="Enable mixed precision training.")
parser.add_argument("--scale_loss", type=float, default=2**15, help="The value of scale_loss for fp16.")
parser.add_argument('--model_name_or_path', default="rocketqa-base-cross-encoder", help="The pretrained model used for training")
parser.add_argument("--eval_step", default=200, type=int, help="Step interval for evaluation.")
Expand Down Expand Up @@ -149,8 +147,6 @@ def do_train():

criterion = paddle.nn.loss.CrossEntropyLoss()
metric = paddle.metric.Auc()
if args.use_amp:
scaler = paddle.amp.GradScaler(init_loss_scaling=args.scale_loss)
global_step = 0
tic_train = time.time()
for epoch in range(1, args.epochs + 1):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,9 @@

import numpy as np
import paddle
from utils import load_txt, str2bool
from utils import load_txt

from paddlenlp.trainer.argparser import strtobool
from paddlenlp.utils.log import logger


Expand Down Expand Up @@ -727,7 +728,7 @@ def _save_examples(save_dir, file_name, examples):
parser.add_argument("--splits", default=[0.8, 0.1, 0.1], type=float, nargs="*", help="The ratio of samples in datasets. [0.6, 0.2, 0.2] means 60% samples used for training, 20% for evaluation and 20% for test.")
parser.add_argument("--task_type", choices=['ext', 'cls'], default="ext", type=str, help="Two task types [ext, cls] are supported, ext represents the aspect-based extraction task and cls represents the sentence-level classification task, defaults to ext.")
parser.add_argument("--options", type=str, nargs="+", help="Used only for the classification task, the options for classification")
parser.add_argument("--is_shuffle", type=str2bool, default="True", help="Whether to shuffle the labeled dataset, defaults to True.")
parser.add_argument("--is_shuffle", type=strtobool, default="True", help="Whether to shuffle the labeled dataset, defaults to True.")
parser.add_argument("--seed", type=int, default=1000, help="Random seed for initialization")

args = parser.parse_args()
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
# 小样本场景下的多标签层次分类任务指南

**零样本/小样本文本分类推荐使用 UTC 模型,详情见[目录](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/applications/zero_shot_text_classification),本项目将会在2.5.2版本下线。**

## 目录

- [1. 项目说明](#项目说明)
Expand Down
Loading

0 comments on commit 03626da

Please sign in to comment.