From 9ac0c8f5cde50869b9fd8ff506d1fb18fdacc344 Mon Sep 17 00:00:00 2001
From: Zhong Hui
Date: Mon, 20 May 2024 16:07:00 +0800
Subject: [PATCH 1/2] fix dataset with empty char.

---
 docs/llm/pretraining/data/OpenWebText2.md           |  2 +-
 llm/README.md                                       | 12 ++++++------
 .../ernie-1.0/preprocess/create_pretraining_data.py |  2 +-
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/docs/llm/pretraining/data/OpenWebText2.md b/docs/llm/pretraining/data/OpenWebText2.md
index 08b29ca7880a..5e7c569c7eca 100644
--- a/docs/llm/pretraining/data/OpenWebText2.md
+++ b/docs/llm/pretraining/data/OpenWebText2.md
@@ -14,7 +14,7 @@
 ```shell
 # wget https://mystic.the-eye.eu/public/AI/pile_preliminary_components/openwebtext2.jsonl.zst.tar
 wget https://paddlenlp.bj.bcebos.com/models/transformers/gpt/openwebtext2.jsonl.zst.tar
-tar -xvf openwebtext2.json.zst.tar -C /path/to/openwebtext
+tar -xvf openwebtext2.jsonl.zst.tar -C /path/to/openwebtext
 ```

 ## Llama training data preparation

diff --git a/llm/README.md b/llm/README.md
index 9b8520d58179..8b7ed0646f13 100644
--- a/llm/README.md
+++ b/llm/README.md
@@ -61,20 +61,20 @@ PaddleNLP adds the PaddlePaddle 4D parallelism strategies to the Trainer API; users only need to modify the Trainer
 To make it easy to run and test the model, this project provides a preprocessed training sample of 100k docs:
 ```shell
 # download the llama model data
-wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_ids.npy
-wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_idx.npz
+wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k.bin
+wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k.idx

 # download the gpt model data
-# wget https://bj.bcebos.com/paddlenlp/models/transformers/gpt/data/gpt_en_dataset_300m_ids.npy
-# wget https://bj.bcebos.com/paddlenlp/models/transformers/gpt/data/gpt_en_dataset_300m_idx.npz
+# wget https://bj.bcebos.com/paddlenlp/models/transformers/gpt/data/gpt2_openwebtext_100k.bin
+# wget https://bj.bcebos.com/paddlenlp/models/transformers/gpt/data/gpt2_openwebtext_100k.idx
 ```

 Put all the preprocessed files into one folder for training:
 ```
 mkdir data
-mv llama_openwebtext_100k_ids.npy ./data
-mv llama_openwebtext_100k_idx.npz ./data
+mv llama_openwebtext_100k.bin ./data
+mv llama_openwebtext_100k.idx ./data
 ```

 ```shell

diff --git a/model_zoo/ernie-1.0/preprocess/create_pretraining_data.py b/model_zoo/ernie-1.0/preprocess/create_pretraining_data.py
index c1874f8b936a..2393cebf68a5 100644
--- a/model_zoo/ernie-1.0/preprocess/create_pretraining_data.py
+++ b/model_zoo/ernie-1.0/preprocess/create_pretraining_data.py
@@ -103,7 +103,7 @@ def get_args():
     group.add_argument("--append_eos", action="store_true", help="Append an <eos> token to the end of a document.")
     group.add_argument("--log_interval", type=int, default=100, help="Interval between progress updates")
     group.add_argument("--workers", type=int, default=1, help="Number of worker processes to launch")
-    group.add_argument("--max_doc_num", type=int, default=sys.maxsize, help="Number of worker processes to launch")
+    group.add_argument("--max_doc_num", type=int, default=sys.maxsize, help="Stop after max_doc_num documents have been processed.")
     group.add_argument(
         "--max_repeated_len", type=int, default=100, help="The maximum length of the repeated characters to keep"
     )

From 9e6f12b8024433d7d5fe05cfe78459631004cbb3 Mon Sep 17 00:00:00 2001
From: Zhong Hui
Date: Wed, 12 Jun 2024 16:25:02 +0800
Subject: [PATCH 2/2] revert file links.

---
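Note: the hunks below point the remaining `llama_openwebtext_100k_ids.npy` / `llama_openwebtext_100k_idx.npz` references at the renamed `llama_openwebtext_100k.bin` / `llama_openwebtext_100k.idx` pair. As a minimal sketch of how the updated tests consume these links, assuming the same `get_path_from_url` helper the tests import from `paddlenlp.utils.downloader`; the `./data` target directory is an assumption borrowed from llm/README.md:

```python
# Illustrative only -- mirrors the updated setUp() calls, not repository code.
from paddlenlp.utils.downloader import get_path_from_url

DATA_URLS = [
    "https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k.bin",
    "https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k.idx",
]

for url in DATA_URLS:
    # Download each file into ./data and return its local path.
    local_path = get_path_from_url(url, root_dir="./data")
    print(local_path)
```
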
 llm/docs/pretrain.rst                              |  8 ++++----
 tests/llm/test_pretrain.py                         |  4 ++--
 .../llama_pretrain/benchmark_common/prepare.sh     |  8 ++++----
 tests/trainer/test_unified_checkpoint.py           | 12 ++++++------
 4 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/llm/docs/pretrain.rst b/llm/docs/pretrain.rst
index aaa5b5acd243..987e6c53f90d 100644
--- a/llm/docs/pretrain.rst
+++ b/llm/docs/pretrain.rst
@@ -44,8 +44,8 @@ git clone the code locally to get started.

    .. code-block:: bash

       # download the llama model data
-      wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_ids.npy
-      wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_idx.npz
+      wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k.bin
+      wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k.idx

       # download the gpt model data
       # wget https://bj.bcebos.com/paddlenlp/models/transformers/gpt/data/gpt_en_dataset_300m_ids.npy
@@ -57,8 +57,8 @@ git clone the code locally to get started.

    .. code-block:: bash

       mkdir data
-      mv llama_openwebtext_100k_ids.npy ./data
-      mv llama_openwebtext_100k_idx.npz ./data
+      mv llama_openwebtext_100k.bin ./data
+      mv llama_openwebtext_100k.idx ./data


diff --git a/tests/llm/test_pretrain.py b/tests/llm/test_pretrain.py
index 35882cec6567..5241fbcc3dd4 100644
--- a/tests/llm/test_pretrain.py
+++ b/tests/llm/test_pretrain.py
@@ -59,8 +59,8 @@ def test_pretrain(self):
         del sys.modules["run_pretrain"]

         # Run pretrain
-        URL = "https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_ids.npy"
-        URL2 = "https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_idx.npz"
+        URL = "https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k.bin"
+        URL2 = "https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k.idx"
         get_path_from_url(URL, root_dir=self.dataset_dir)
         get_path_from_url(URL2, root_dir=self.dataset_dir)

diff --git a/tests/test_tipc/auto_tuner/llama_pretrain/benchmark_common/prepare.sh b/tests/test_tipc/auto_tuner/llama_pretrain/benchmark_common/prepare.sh
index 2401c2481e5a..df82a104c254 100644
--- a/tests/test_tipc/auto_tuner/llama_pretrain/benchmark_common/prepare.sh
+++ b/tests/test_tipc/auto_tuner/llama_pretrain/benchmark_common/prepare.sh
@@ -24,11 +24,11 @@ cd ../../../llm/llama
 python -m pip install tool_helpers

 rm -rf data && mkdir data
-wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_ids.npy
-wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_idx.npz
+wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k.bin
+wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k.idx

-mv llama_openwebtext_100k_ids.npy ./data
-mv llama_openwebtext_100k_idx.npz ./data
+mv llama_openwebtext_100k.bin ./data
+mv llama_openwebtext_100k.idx ./data

 # mv autoconfig
 rm -rf autoconfig

diff --git a/tests/trainer/test_unified_checkpoint.py b/tests/trainer/test_unified_checkpoint.py
index a5e4563d0317..b695432aa2ce 100644
--- a/tests/trainer/test_unified_checkpoint.py
+++ b/tests/trainer/test_unified_checkpoint.py
@@ -186,8 +186,8 @@ def setUp(self):
         os.environ.update(environment_variables)

         files = [
-            "https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_ids.npy",
-            "https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_idx.npz",
+            "https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k.bin",
"https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k.bin", + "https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k.idx", ] self.prepare_inputs_data(pretrain_arguments["input_dir"], files) @@ -646,8 +646,8 @@ def setUp(self): os.environ.update(environment_variables) files = [ - "https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_ids.npy", - "https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_idx.npz", + "https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k.bin", + "https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k.idx", ] self.prepare_inputs_data(pretrain_arguments["input_dir"], files) @@ -687,8 +687,8 @@ def setUp(self): os.environ.update(environment_variables) files = [ - "https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_ids.npy", - "https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_idx.npz", + "https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k.bin", + "https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k.idx", ] self.prepare_inputs_data(pretrain_arguments["input_dir"], files)