Fix dataset with empty char. #8469

Merged · 2 commits · Jun 13, 2024
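This PR updates the sample pretraining dataset references from the old llama_openwebtext_100k_ids.npy / llama_openwebtext_100k_idx.npz pair to the new llama_openwebtext_100k.bin / llama_openwebtext_100k.idx files (presumably regenerated to fix the empty-character issue named in the title) across the docs, tests, and CI scripts. It also fixes a tar filename typo and a copy-pasted argparse help string along the way.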
docs/llm/pretraining/data/OpenWebText2.md (2 changes: 1 addition & 1 deletion)

@@ -14,7 +14,7 @@
 ```shell
 # wget https://mystic.the-eye.eu/public/AI/pile_preliminary_components/openwebtext2.jsonl.zst.tar
 wget https://paddlenlp.bj.bcebos.com/models/transformers/gpt/openwebtext2.jsonl.zst.tar
-tar -xvf openwebtext2.json.zst.tar -C /path/to/openwebtext
+tar -xvf openwebtext2.jsonl.zst.tar -C /path/to/openwebtext
 ```

 ## Llama training data preparation
llm/README.md (12 changes: 6 additions & 6 deletions)

@@ -61,20 +61,20 @@ PaddleNLP adds Paddle's 4D parallelism strategies to the Trainer API; users only need to modify the Tra
 To make it easy for users to run and test this model, the project provides a preprocessed training sample of 100k docs:
 ```shell
 # download llama model data
-wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_ids.npy
-wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_idx.npz
+wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k.bin
+wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k.idx

 # download gpt model data
-# wget https://bj.bcebos.com/paddlenlp/models/transformers/gpt/data/gpt_en_dataset_300m_ids.npy
-# wget https://bj.bcebos.com/paddlenlp/models/transformers/gpt/data/gpt_en_dataset_300m_idx.npz
+# wget https://bj.bcebos.com/paddlenlp/models/transformers/gpt/data/gpt2_openwebtext_100k.bin
+# wget https://bj.bcebos.com/paddlenlp/models/transformers/gpt/data/gpt2_openwebtext_100k.idx
 ```

 Put all the preprocessed files into one folder, ready for training:

 ```
 mkdir data
-mv llama_openwebtext_100k_ids.npy ./data
-mv llama_openwebtext_100k_idx.npz ./data
+mv llama_openwebtext_100k.bin ./data
+mv llama_openwebtext_100k.idx ./data
 ```

 ```shell
llm/docs/pretrain.rst (8 changes: 4 additions & 4 deletions)

@@ -44,8 +44,8 @@ git clone the code locally to get started.
 .. code-block:: bash

 # download llama model data
-wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_ids.npy
-wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_idx.npz
+wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k.bin
+wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k.idx

 # download gpt model data
 # wget https://bj.bcebos.com/paddlenlp/models/transformers/gpt/data/gpt_en_dataset_300m_ids.npy

@@ -57,8 +57,8 @@ git clone the code locally to get started.
 .. code-block:: bash

 mkdir data
-mv llama_openwebtext_100k_ids.npy ./data
-mv llama_openwebtext_100k_idx.npz ./data
+mv llama_openwebtext_100k.bin ./data
+mv llama_openwebtext_100k.idx ./data
model_zoo/ernie-1.0/preprocess/create_pretraining_data.py (2 changes: 1 addition & 1 deletion)

@@ -103,7 +103,7 @@ def get_args():
 group.add_argument("--append_eos", action="store_true", help="Append an <eos> token to the end of a document.")
 group.add_argument("--log_interval", type=int, default=100, help="Interval between progress updates")
 group.add_argument("--workers", type=int, default=1, help="Number of worker processes to launch")
-group.add_argument("--max_doc_num", type=int, default=sys.maxsize, help="Number of worker processes to launch")
+group.add_argument("--max_doc_num", type=int, default=sys.maxsize, help="Stop when reach max_doc_num.")
 group.add_argument(
     "--max_repeated_len", type=int, default=100, help="The maximum length of the repeated characters to keep"
 )
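The old --max_doc_num help string was copy-pasted from --workers; the new one describes the flag's actual purpose. Presumably the flag caps how many documents the preprocessing loop consumes before stopping; a minimal sketch of that pattern (the loop body and input format are illustrative assumptions, not the actual create_pretraining_data.py logic):

```python
import argparse
import sys

parser = argparse.ArgumentParser()
group = parser.add_argument_group(title="data")
group.add_argument("--max_doc_num", type=int, default=sys.maxsize, help="Stop when reach max_doc_num.")
args = parser.parse_args()

processed = 0
with open("input.jsonl", "r", encoding="utf-8") as f:  # hypothetical one-doc-per-line input
    for line in f:
        if processed >= args.max_doc_num:
            break  # the sys.maxsize default effectively means "no limit"
        # ... tokenize the document and append it to the output dataset here ...
        processed += 1
```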
tests/llm/test_pretrain.py (4 changes: 2 additions & 2 deletions)

@@ -59,8 +59,8 @@ def test_pretrain(self):
 del sys.modules["run_pretrain"]

 # Run pretrain
-URL = "https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_ids.npy"
-URL2 = "https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_idx.npz"
+URL = "https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k.bin"
+URL2 = "https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k.idx"
 get_path_from_url(URL, root_dir=self.dataset_dir)
 get_path_from_url(URL2, root_dir=self.dataset_dir)
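The test fetches the updated pair with get_path_from_url. A standalone sketch of the same download step, assuming the helper is the one exported by paddlenlp.utils.downloader (the test's actual import is outside this hunk):

```python
from paddlenlp.utils.downloader import get_path_from_url  # assumed import path

DATA_URLS = [
    "https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k.bin",
    "https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k.idx",
]

# Download both halves of the indexed dataset into ./data.
for url in DATA_URLS:
    print(get_path_from_url(url, root_dir="./data"))
```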
@@ -24,11 +24,11 @@ cd ../../../llm/llama
 python -m pip install tool_helpers

 rm -rf data && mkdir data
-wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_ids.npy
-wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_idx.npz
+wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k.bin
+wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k.idx

-mv llama_openwebtext_100k_ids.npy ./data
-mv llama_openwebtext_100k_idx.npz ./data
+mv llama_openwebtext_100k.bin ./data
+mv llama_openwebtext_100k.idx ./data

 # mv autoconfig
 rm -rf autoconfig
tests/trainer/test_unified_checkpoint.py (12 changes: 6 additions & 6 deletions)

@@ -186,8 +186,8 @@ def setUp(self):
 os.environ.update(environment_variables)

 files = [
-    "https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_ids.npy",
-    "https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_idx.npz",
+    "https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k.bin",
+    "https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k.idx",
 ]
 self.prepare_inputs_data(pretrain_arguments["input_dir"], files)

@@ -646,8 +646,8 @@ def setUp(self):
 os.environ.update(environment_variables)

 files = [
-    "https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_ids.npy",
-    "https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_idx.npz",
+    "https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k.bin",
+    "https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k.idx",
 ]
 self.prepare_inputs_data(pretrain_arguments["input_dir"], files)

@@ -687,8 +687,8 @@ def setUp(self):
 os.environ.update(environment_variables)

 files = [
-    "https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_ids.npy",
-    "https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_idx.npz",
+    "https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k.bin",
+    "https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k.idx",
 ]
 self.prepare_inputs_data(pretrain_arguments["input_dir"], files)
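All three setUp variants swap in the same .bin/.idx pair and hand it to the suite's prepare_inputs_data helper, whose body is not shown in these hunks. A hypothetical stand-in consistent with how it is called might look like:

```python
import os

from paddlenlp.utils.downloader import get_path_from_url  # assumed import path

def prepare_inputs_data(input_dir: str, files: list) -> None:
    """Hypothetical stand-in: fetch every dataset file into input_dir."""
    os.makedirs(input_dir, exist_ok=True)
    for url in files:
        get_path_from_url(url, root_dir=input_dir)
```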