From 29bf84c0cae6a8c29b246d02ec60c72b4d71568b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=AA=91=E9=A9=AC=E5=B0=8F=E7=8C=AB?= <1435130236@qq.com> Date: Tue, 31 Jan 2023 10:31:29 +0800 Subject: [PATCH] [New features] testing example of model zoo (#4398) * update model-zoo test * update config * update model-zoo test * update config for gpt * add from_hf_hubarg * update model-zoo gpt * add export & infer test * use with argv context * update gpt testing * update gpt configuration * skip gpt-ci * fix lint * Update tests/model_zoo/test_gpt.py Co-authored-by: Zhong Hui * update ci-case & config variable name --------- Co-authored-by: Zhong Hui --- model_zoo/gpt/configs/default/generation.yaml | 2 - model_zoo/gpt/configs/default/glue.yaml | 12 --- model_zoo/gpt/configs/default/msra_ner.yaml | 10 -- model_zoo/gpt/configs/default/pretrain.yaml | 14 --- model_zoo/gpt/configs/test/glue.yaml | 13 --- model_zoo/gpt/configs/test/msra_ner.yaml | 11 --- model_zoo/gpt/configs/test/pretrain.yaml | 14 --- model_zoo/gpt/run_generation.py | 5 +- model_zoo/gpt/run_glue.py | 8 -- model_zoo/gpt/run_msra_ner.py | 4 - model_zoo/gpt/run_pretrain.py | 8 +- scripts/regression/ci_case.sh | 49 +--------- tests/fixtures/model_zoo/gpt.yaml | 95 +++++++++++++++++++ tests/model_zoo/__init__.py | 13 +++ tests/model_zoo/test_gpt.py | 82 ++++++++++++++++ tests/testing_utils.py | 64 +++++++++++++ 16 files changed, 264 insertions(+), 140 deletions(-) delete mode 100644 model_zoo/gpt/configs/default/generation.yaml delete mode 100644 model_zoo/gpt/configs/default/glue.yaml delete mode 100644 model_zoo/gpt/configs/default/msra_ner.yaml delete mode 100644 model_zoo/gpt/configs/default/pretrain.yaml delete mode 100644 model_zoo/gpt/configs/test/glue.yaml delete mode 100644 model_zoo/gpt/configs/test/msra_ner.yaml delete mode 100644 model_zoo/gpt/configs/test/pretrain.yaml create mode 100644 tests/fixtures/model_zoo/gpt.yaml create mode 100644 tests/model_zoo/__init__.py create mode 100644 
tests/model_zoo/test_gpt.py diff --git a/model_zoo/gpt/configs/default/generation.yaml b/model_zoo/gpt/configs/default/generation.yaml deleted file mode 100644 index 935870143f5..00000000000 --- a/model_zoo/gpt/configs/default/generation.yaml +++ /dev/null @@ -1,2 +0,0 @@ -model_type: gpt2-cn -model_name_or_path: gpt-cpm-small-cn-distill \ No newline at end of file diff --git a/model_zoo/gpt/configs/default/glue.yaml b/model_zoo/gpt/configs/default/glue.yaml deleted file mode 100644 index 1f7b7042de8..00000000000 --- a/model_zoo/gpt/configs/default/glue.yaml +++ /dev/null @@ -1,12 +0,0 @@ -model_name_or_path: gpt2-medium-en -task_name: SST-2 -max_seq_length: 128 -per_device_train_batch_size: 32 -learning_rate: 2e-5 -num_train_epochs: 3 -logging_steps: 1 -save_steps: 500 -output_dir: ./output_dir/glue -eval_steps: 1 -device: gpu -do_train: true \ No newline at end of file diff --git a/model_zoo/gpt/configs/default/msra_ner.yaml b/model_zoo/gpt/configs/default/msra_ner.yaml deleted file mode 100644 index d4e17cb0cd7..00000000000 --- a/model_zoo/gpt/configs/default/msra_ner.yaml +++ /dev/null @@ -1,10 +0,0 @@ -model_name_or_path: gpt-cpm-small-cn-distill -max_seq_length: 128 -per_device_eval_batch_size: 32 -learning_rate: 2e-5 -num_train_epochs: 3 -logging_steps: 25 -save_steps: 250 -output_dir: ./tmp/msra_ner/ -device: gpu -do_train: true \ No newline at end of file diff --git a/model_zoo/gpt/configs/default/pretrain.yaml b/model_zoo/gpt/configs/default/pretrain.yaml deleted file mode 100644 index a306b9a1c4d..00000000000 --- a/model_zoo/gpt/configs/default/pretrain.yaml +++ /dev/null @@ -1,14 +0,0 @@ -model_name_or_path: gpt2-en -input_dir: ./data -output_dir: ./output_dir/pretrain -weight_decay: 0.01 -max_steps: 500000 -save_steps: 100000 -device: gpu -lr_decay_style: none -warmup_steps: 320000 -warmup_ratio: 0.01 -per_device_train_batch_size: 4 -eval_steps: 100 -do_train: true -do_predict: true \ No newline at end of file diff --git 
a/model_zoo/gpt/configs/test/glue.yaml b/model_zoo/gpt/configs/test/glue.yaml deleted file mode 100644 index 194be6a1f3d..00000000000 --- a/model_zoo/gpt/configs/test/glue.yaml +++ /dev/null @@ -1,13 +0,0 @@ -model_name_or_path: __internal_testing__/gpt -task_name: SST-2 -max_seq_length: 128 -per_device_train_batch_size: 32 -learning_rate: 2e-5 -num_train_epochs: 3 -logging_steps: 1 -save_steps: 500 -output_dir: ./output_dir/glue -max_steps: 10 -eval_steps: 1 -device: cpu -do_train: true \ No newline at end of file diff --git a/model_zoo/gpt/configs/test/msra_ner.yaml b/model_zoo/gpt/configs/test/msra_ner.yaml deleted file mode 100644 index 5088b524f80..00000000000 --- a/model_zoo/gpt/configs/test/msra_ner.yaml +++ /dev/null @@ -1,11 +0,0 @@ -model_name_or_path: __internal_testing__/gpt_cn -max_seq_length: 32 -per_device_eval_batch_size: 4 -learning_rate: 2e-5 -num_train_epochs: 3 -logging_steps: 25 -save_steps: 250 -output_dir: ./output_dir/msra_ner/ -device: cpu -do_train: true -max_steps: 10 \ No newline at end of file diff --git a/model_zoo/gpt/configs/test/pretrain.yaml b/model_zoo/gpt/configs/test/pretrain.yaml deleted file mode 100644 index 8bc98097001..00000000000 --- a/model_zoo/gpt/configs/test/pretrain.yaml +++ /dev/null @@ -1,14 +0,0 @@ -model_name_or_path: __internal_testing__/gpt -input_dir: ./data -output_dir: ./output_dir/pretrain -weight_decay: 0.01 -max_steps: 34 -lr_decay_style: none -save_steps: 10 -warmup_steps: 320000 -warmup_ratio: 0.01 -per_device_train_batch_size: 4 -device: cpu -eval_steps: 100 -do_train: true -do_predict: true \ No newline at end of file diff --git a/model_zoo/gpt/run_generation.py b/model_zoo/gpt/run_generation.py index 6a737feaf8b..11a646ecbbe 100644 --- a/model_zoo/gpt/run_generation.py +++ b/model_zoo/gpt/run_generation.py @@ -40,6 +40,7 @@ def parse_args(): type=str, help="The path or shortcut name of the pre-trained model.", ) + parser.add_argument("--from_hf_hub", type=bool, default=False, help="Whether load model 
from hf hub") parser.add_argument( "--decode_strategy", type=str, default="greedy_search", help="The decode strategy in generation." ) @@ -112,8 +113,8 @@ def main(args, input_text): ) ) - model = model_class.from_pretrained(args.model_name_or_path) - tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path) + model = model_class.from_pretrained(args.model_name_or_path, from_hf_hub=args.from_hf_hub) + tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path, from_hf_hub=args.from_hf_hub) model.eval() args.max_dec_len = adjust_length_to_model(args.max_dec_len, model.max_position_embeddings) diff --git a/model_zoo/gpt/run_glue.py b/model_zoo/gpt/run_glue.py index 17a419b9f60..6a563f01edc 100644 --- a/model_zoo/gpt/run_glue.py +++ b/model_zoo/gpt/run_glue.py @@ -18,7 +18,6 @@ import numpy as np import paddle -from args import parse_config_file from paddle.io import DataLoader from paddle.metric import Accuracy @@ -224,11 +223,4 @@ def do_train(): if __name__ == "__main__": - # support: python run_glue.py --config=./configs/default.yaml - config_file = parse_config_file() - if config_file is not None: - from args import init_argv - - init_argv("glue", config_file) - do_train() diff --git a/model_zoo/gpt/run_msra_ner.py b/model_zoo/gpt/run_msra_ner.py index e3ad7a51f97..c9790ac5200 100644 --- a/model_zoo/gpt/run_msra_ner.py +++ b/model_zoo/gpt/run_msra_ner.py @@ -17,7 +17,6 @@ from functools import partial import paddle -from args import init_argv, parse_config_file from paddlenlp.data import DataCollatorForTokenClassification from paddlenlp.datasets import load_dataset @@ -154,7 +153,4 @@ def do_train(): if __name__ == "__main__": - config_file = parse_config_file() - if config_file is not None: - init_argv("msra_ner", config_file) do_train() diff --git a/model_zoo/gpt/run_pretrain.py b/model_zoo/gpt/run_pretrain.py index 3a9175ee10c..a2d557aa401 100644 --- a/model_zoo/gpt/run_pretrain.py +++ b/model_zoo/gpt/run_pretrain.py @@ -61,6 +61,7 @@ def 
eval_freq(self): @dataclass class ModelArguments: + model_type: str = field(default="gpt", metadata={"help": "the type of model"}) model_name_or_path: str = field(default="gpt2-en", metadata={"help": ""}) max_seq_len: int = field(default=128, metadata={"help": "max sequence length"}) to_static: bool = field(default=False, metadata={"help": "whether use static pretraining mode."}) @@ -369,9 +370,10 @@ def do_train(): # Now, we only support data parallel in dygraph mode for now. topo = Topology(device_rank=worker_index, world_size=worker_num, dp_degree=worker_num) - tokenizer = GPTTokenizer.from_pretrained(model_args.model_name_or_path) - pretrained_models_list = list(GPTForPretraining.pretrained_init_configuration.keys()) - model = GPTForPretraining.from_pretrained(model_args.model_name_or_path) + model_class, tokenizer_class = MODEL_CLASSES[model_args.model_type] + tokenizer = tokenizer_class.from_pretrained(model_args.model_name_or_path) + pretrained_models_list = list(model_class.pretrained_init_configuration.keys()) + model = model_class.from_pretrained(model_args.model_name_or_path) # Create the critrion for the gpt model criterion = GPTPretrainingCriterion() diff --git a/scripts/regression/ci_case.sh b/scripts/regression/ci_case.sh index 9aa1dbbfdee..15e7c920241 100644 --- a/scripts/regression/ci_case.sh +++ b/scripts/regression/ci_case.sh @@ -279,53 +279,8 @@ print_info $? gpt_deploy_C_FT } # 8 gpt gpt(){ -if [ ! -f 'test.py' ];then - echo '模型测试文件不存在!' 
- # data process - cd ${nlp_dir}/model_zoo/ernie-1.0/data_tools - sed -i "s/python3/python/g" Makefile - sed -i "s/python-config/python3.7m-config/g" Makefile - cd ${nlp_dir}/model_zoo/gpt/ - # pretrain - python -m paddle.distributed.launch run_pretrain.py \ - --model_name_or_path "__internal_testing__/gpt" \ - --input_dir "./pre_data" \ - --output_dir "output" \ - --weight_decay 0.01 \ - --max_steps 2 \ - --save_steps 2 \ - --device gpu \ - --warmup_steps 320000 \ - --warmup_ratio 0.01 \ - --micro_batch_size 8 \ - --eval_steps 100 \ - --overwrite_output_dir true \ - --dataloader_drop_last true \ - --do_train true \ - --do_predict true >${log_path}/gpt_pretrain >>${log_path}/gpt_pretrain 2>&1 - print_info $? gpt_pretrain - # export model - python export_model.py --model_type=gpt \ - --model_path=gpt2-medium-en \ - --output_path=./infer_model/model >${log_path}/gpt_export >>${log_path}/gpt_export 2>&1 - print_info $? gpt_export - # inference - python deploy/python/inference.py \ - --model_type gpt \ - --model_path ./infer_model/model >${log_path}/gpt_p_depoly >>${log_path}/gpt_p_depoly 2>&1 - print_info $? gpt_p_depoly - # test acc - # cd ${nlp_dir}/tests/examples/gpt/ - # time (python -m unittest test_accuracy.py >${log_path}/gpt_test_acc) >>${log_path}/gpt_test_acc 2>&1 - # print_info $? gpt_test_acc -else - pytest ${nlp_dir}/model_zoo/gpt/ >${log_path}/gpt >>${log_path}/gpt 2>&1 - print_info $? gpt -fi -fast_gpt -cd ${nlp_dir}/fast_generation/samples -python gpt_sample.py >${log_path}/fast_generation_gpt >>${log_path}/fast_generation_gpt 2>&1 -print_info $? 
fast_generation_gpt +# TODO(wj-Mcat): need remove the gpt related code scripts in paddle-ci +echo 'skip gpt testing in paddle-ci, for details you can see: https://github.com/PaddlePaddle/PaddleNLP/pull/4398' } # 9 ernie-1.0 ernie-1.0 (){ diff --git a/tests/fixtures/model_zoo/gpt.yaml b/tests/fixtures/model_zoo/gpt.yaml new file mode 100644 index 00000000000..1f90d9e2c4f --- /dev/null +++ b/tests/fixtures/model_zoo/gpt.yaml @@ -0,0 +1,95 @@ +pretrain: + slow: + model_type: gpt + model_name_or_path: __internal_testing__/gpt + input_dir: ./data + output_dir: ./output_dir/pretrain + weight_decay: 0.01 + max_steps: 2 + save_steps: 10 + warmup_steps: 10 + warmup_ratio: 0.01 + per_device_train_batch_size: 4 + device: cpu + eval_steps: 10 + do_train: true + do_predict: true + + default: + model_type: gpt + model_name_or_path: gpt2-en + warmup_steps: 320000 + warmup_ratio: 0.01 + per_device_train_batch_size: 4 + device: cpu + eval_steps: 100 + do_train: true + do_predict: true + + +msra_ner: + slow: + model_name_or_path: __internal_testing__/gpt_cn + max_seq_length: 32 + per_device_eval_batch_size: 4 + learning_rate: 2e-5 + num_train_epochs: 3 + logging_steps: 25 + save_steps: 250 + output_dir: ./output_dir/msra_ner/ + device: cpu + do_train: true + max_steps: 10 + + default: + model_name_or_path: gpt-cpm-small-cn-distill + max_seq_length: 128 + per_device_eval_batch_size: 32 + learning_rate: 2e-5 + num_train_epochs: 3 + logging_steps: 25 + save_steps: 250 + output_dir: ./tmp/msra_ner/ + device: gpu + do_train: true + +glue: + slow: + model_name_or_path: __internal_testing__/gpt + task_name: SST-2 + max_seq_length: 128 + per_device_train_batch_size: 32 + learning_rate: 2e-5 + num_train_epochs: 3 + logging_steps: 1 + save_steps: 500 + output_dir: ./output_dir/glue + max_steps: 10 + eval_steps: 1 + device: cpu + do_train: true + + default: + model_name_or_path: gpt2-medium-en + task_name: SST-2 + max_seq_length: 128 + per_device_train_batch_size: 32 + learning_rate: 2e-5 + 
num_train_epochs: 3 + logging_steps: 1 + save_steps: 500 + output_dir: ./output_dir/glue + eval_steps: 1 + device: gpu + do_train: true + + +generation: + slow: + model_type: gpt2 + model_name_or_path: hf-internal-testing/tiny-random-GPT2Model + from_hf_hub: true + + default: + model_type: gpt2-cn + model_name_or_path: gpt-cpm-small-cn-distill \ No newline at end of file diff --git a/tests/model_zoo/__init__.py b/tests/model_zoo/__init__.py new file mode 100644 index 00000000000..595add0aed9 --- /dev/null +++ b/tests/model_zoo/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tests/model_zoo/test_gpt.py b/tests/model_zoo/test_gpt.py new file mode 100644 index 00000000000..91a0e4b3085 --- /dev/null +++ b/tests/model_zoo/test_gpt.py @@ -0,0 +1,82 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from __future__ import annotations
+
+import os
+import sys
+from unittest import TestCase
+
+from tests.testing_utils import argv_context_guard, load_test_config
+
+
+class GPTTest(TestCase):
+    def setUp(self) -> None:
+        self.path = "./model_zoo/gpt"
+        self.config_path = "./tests/fixtures/model_zoo/gpt.yaml"
+        sys.path.insert(0, self.path)
+
+    def tearDown(self) -> None:
+        sys.path.remove(self.path)
+
+    def test_pretrain(self):
+
+        # 1. run pretrain
+        pretrain_config = load_test_config(self.config_path, "pretrain")
+        with argv_context_guard(pretrain_config):
+            from run_pretrain import do_train
+
+            do_train()
+
+        # 2. export model
+        export_config = {
+            "model_type": pretrain_config["model_type"],
+            "model_path": pretrain_config["output_dir"],
+            "output_path": os.path.join(pretrain_config["output_dir"], "export_model"),
+        }
+        with argv_context_guard(export_config):
+            from export_model import main
+
+            main()
+
+        # 3. infer model
+        infer_config = {
+            "model_type": export_config["model_type"],
+            "model_path": export_config["output_path"],
+            "select_device": pretrain_config["device"],
+        }
+        with argv_context_guard(infer_config):
+            from deploy.python.inference import main
+
+            main()
+
+    def test_msra_ner(self):
+        config = load_test_config(self.config_path, "msra_ner")
+        with argv_context_guard(config):
+            from run_msra_ner import do_train
+
+            do_train()
+
+    def test_run_glue(self):
+        config = load_test_config(self.config_path, "glue")
+        with argv_context_guard(config):
+            from run_glue import do_train
+
+            do_train()
+
+    def test_generation(self):
+        config = load_test_config(self.config_path, "generation")
+        with argv_context_guard(config):
+            from run_generation import run
+
+            run()
diff --git a/tests/testing_utils.py b/tests/testing_utils.py
index 59ab9ddc979..e8514fd1c79 100644
--- a/tests/testing_utils.py
+++ b/tests/testing_utils.py
@@ -13,15 +13,19 @@
 # limitations under the License.
 from __future__ import annotations
 
+import copy
 import gc
 import inspect
 import os
+import sys
 import unittest
 from collections.abc import Mapping
+from contextlib import contextmanager
 from distutils.util import strtobool
 
 import numpy as np
 import paddle
+import yaml
 
 from paddlenlp.utils.import_utils import is_package_available
 
@@ -283,3 +287,63 @@ def decorator(func):
         return func
 
     return decorator
+
+
+def load_test_config(config_file: str, key: str) -> dict:
+    """Load the arguments of one example from a yaml config file.
+
+    Args:
+        config_file (str): the path of the yaml config file.
+        key (str): the top-level key to select, eg: `pretrain`.
+
+    Returns:
+        dict: the `default` sub-config if RUN_SLOW_TEST is set, otherwise `slow`.
+    """
+    with open(config_file, "r", encoding="utf-8") as f:
+        config = yaml.safe_load(f)
+
+    assert key in config, f"<{key}> should be the top key in configuration file"
+    config = config[key]
+
+    sub_key = "slow"
+    if os.getenv("RUN_SLOW_TEST", None):
+        sub_key = "default"
+
+    assert sub_key in config, f"<{sub_key}> not found in {key} configuration"
+    return config[sub_key]
+
+
+def construct_argv(config: dict) -> list[str]:
+    """Construct a `sys.argv`-style list from the config.
+
+    Args:
+        config (dict): the config data, every item becomes `--key value`
+
+    Returns:
+        list[str]: the argvs, the first item is the current test file
+    """
+    # get current test file from the env variable pytest sets while a test runs
+    # refer to: https://docs.pytest.org/en/latest/example/simple.html#pytest-current-test-environment-variable
+    current_test = os.environ.get("PYTEST_CURRENT_TEST", "tests/__init__.py").split("::")[0]
+
+    argv = [current_test]
+    for key, value in config.items():
+        argv.extend([f"--{key}", str(value)])
+
+    return argv
+
+
+@contextmanager
+def argv_context_guard(config: dict):
+    """Temporarily replace `sys.argv` with the argv constructed from config.
+
+    Args:
+        config (dict): the configuration to argv
+    """
+    old_argv = copy.deepcopy(sys.argv)
+    sys.argv = construct_argv(config)
+    try:
+        yield
+    finally:
+        # restore argv even if the wrapped script raises, so tests stay isolated
+        sys.argv = old_argv