-
Notifications
You must be signed in to change notification settings - Fork 2.9k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
【benchmark】 add llama-7b_auto_dp2mp2pp2 benchmark script for cinn (#8423)
* add llama-7b_auto_dp2mp2pp2 benchmark sh * add llama-7b_auto_dp2mp2pp2 benchmark script for cinn * update llama-7b_auto_dp2mp2pp2 benchmark script * Update run_benchmark.sh * Update run_benchmark.sh * fix llama-7b_auto_dp2mp2pp2/benchmark_common * Update run_benchmark.sh
- Loading branch information
Showing
10 changed files
with
566 additions
and
0 deletions.
There are no files selected for viewing
12 changes: 12 additions & 0 deletions
12
tests/test_tipc/dygraph/llama-7b_auto_dp2mp2pp2/N1C1/llama-7b_auto_dp2mp2pp2_bs1_fp16_DP.sh
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
# Benchmark case definition: llama-7b_auto_dp2mp2pp2, batch size 1, fp16, DP, N1C1.
# The relative ./test_tipc paths below are resolved against the current
# working directory of the caller.
model_item=llama-7b_auto_dp2mp2pp2
bs_item=1
fp_item=fp16
run_mode=DP
device_num=N1C1

max_iter=100

# prepare
bash "./test_tipc/dygraph/${model_item}/benchmark_common/prepare.sh"
# run
bash "./test_tipc/dygraph/${model_item}/benchmark_common/run_benchmark.sh" "${model_item}" "${bs_item}" "${fp_item}" "${run_mode}" "${device_num}" "${max_iter}" 2>&1
75 changes: 75 additions & 0 deletions
75
tests/test_tipc/dygraph/llama-7b_auto_dp2mp2pp2/benchmark_common/analysis_log.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,75 @@ | ||
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
|
||
import json | ||
import os | ||
import re | ||
import sys | ||
|
||
import numpy as np | ||
|
||
|
||
def analyze(model_item, log_file, res_log_file, device_num, bs, fp_item):
    """Summarise training throughput from a log file into a JSON result file.

    Every log line containing ``train_samples_per_second:`` contributes one
    sample (the first whitespace-separated token after the keyword, with
    commas removed). The samples are averaged, rounded to three decimals and
    multiplied by the device count, which is the last integer found in
    ``device_num``.

    Args:
        model_item: Model identifier used as the prefix of ``model_name``.
        log_file: Path of the training log to parse.
        res_log_file: Path of the JSON result file to write.
        device_num: Device description string such as ``"N1C8"``.
        bs: Batch size recorded in the result.
        fp_item: Precision label recorded in the result.

    Raises:
        IndexError: If ``device_num`` contains no digits.
    """
    keyword = "train_samples_per_second:"
    ips_lines = []
    # Stream the log instead of loading every line into memory first.
    with open(str(log_file), "r", encoding="utf8") as f:
        for eachline in f:
            if keyword not in eachline:
                continue
            # Split on the bare keyword so that a missing space after the
            # colon does not raise IndexError.
            fields = eachline.split(keyword, 1)[1].split()
            if not fields:
                continue
            try:
                ips = float(fields[0].replace(",", ""))
            except ValueError:
                print("----skip unparsable ips value: ", fields[0])
                continue
            print("----ips: ", ips)
            ips_lines.append(ips)
    print("----ips_lines: ", ips_lines)
    if ips_lines:
        # float() turns the numpy scalar into a plain Python float for JSON.
        ips = float(np.round(np.mean(ips_lines), 3))
    else:
        # np.mean([]) would yield NaN (plus a RuntimeWarning) and json.dumps
        # would then emit the non-standard token NaN; report 0.0 instead.
        print("----no train_samples_per_second found in ", log_file)
        ips = 0.0
    # Raw string: "\d" in a normal string literal is an invalid escape.
    ngpus = int(re.findall(r"\d+", device_num)[-1])
    print("----ips: ", ips, "ngpus", ngpus)
    ips *= ngpus
    run_mode = "DP"

    model_name = model_item + "_" + "bs" + str(bs) + "_" + fp_item + "_" + run_mode
    info = {
        "model_branch": os.getenv("model_branch"),
        "model_commit": os.getenv("model_commit"),
        "model_name": model_name,
        "batch_size": bs,
        "fp_item": fp_item,
        "run_mode": run_mode,
        "convergence_value": 0,
        "convergence_key": "",
        "ips": ips,
        "speed_unit": "sample/sec",
        "device_num": device_num,
        "model_run_time": os.getenv("model_run_time"),
        "frame_commit": os.getenv("frame_commit"),
        "frame_version": os.getenv("frame_version"),
    }
    json_info = json.dumps(info)
    print(json_info)
    with open(res_log_file, "w", encoding="utf8") as of:
        of.write(json_info)
|
||
|
||
if __name__ == "__main__":
    # Exactly six positional arguments are required after the script name.
    if len(sys.argv) != 7:
        print(
            "Usage:"
            + sys.argv[0]
            + " model_item path/to/log/file path/to/res/log/file device_num bs fp_item"
        )
        # Non-zero status so that callers can detect the usage error.
        sys.exit(1)

    model_item = sys.argv[1]
    log_file = sys.argv[2]
    res_log_file = sys.argv[3]
    device_num = sys.argv[4]
    bs = int(sys.argv[5])
    fp_item = sys.argv[6]

    analyze(model_item, log_file, res_log_file, device_num, bs, fp_item)
16 changes: 16 additions & 0 deletions
16
tests/test_tipc/dygraph/llama-7b_auto_dp2mp2pp2/benchmark_common/prepare.sh
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
# Optional: install PaddleNLP from source in editable mode.
cd ../ # PaddleNLP root directory
pwd
pip install -e .
cd -

# Download the required dataset files.
cd ../llm/llama/auto_parallel/
# -p: do not fail when ./data is left over from a previous run.
mkdir -p data
# llama pre-training data. -c resumes a partial download instead of writing a
# duplicate "*.1" copy; -P stores the file directly in ./data.
wget -c -P ./data https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_ids.npy
wget -c -P ./data https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_idx.npz
cd -
174 changes: 174 additions & 0 deletions
174
tests/test_tipc/dygraph/llama-7b_auto_dp2mp2pp2/benchmark_common/run_benchmark.sh
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,174 @@ | ||
#!/usr/bin/env bash | ||
|
||
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
# Test training benchmark for a model. | ||
# Usage: bash run_benchmark.sh ${model_item} ${base_batch_size} ${fp_item} ${run_mode} ${device_num} ${max_iter} | ||
# Populate the global benchmark settings from positional arguments and
# environment variables. All variables are intentionally global: the other
# functions of this script read them.
#   $1 model_item, $2 base_batch_size, $3 fp_item, $4 run_mode,
#   $5 device_num, $6 max_iter
function _set_params(){
    model_item=${1:-"llama-7b_auto_dp2mp2pp2"}   # (required) model item
    base_batch_size=${2:-"1"}       # (required)
    fp_item=${3:-"fp32"}            # (required) fp32|fp16|bf16
    run_mode=${4:-"DP"}             # (required) MP model parallel|DP data parallel|PP pipeline parallel|hybrid DP1-MP1-PP1|DP1-MP4-PP1
    device_num=${5:-"N1C1"}         # (required) number of devices, N1C1|N1C8|N4C32 (4 nodes, 32 cards)
    profiling=${PROFILING:-"false"}      # (required) profiling switch, off by default, passed via environment variable

    model_repo="PaddleNLP"          # (required) name of the model suite
    speed_unit="sample/s"         # (required) unit of the speed metric
    skip_steps=0                  # (required) log parsing: number of unstable initial steps to skip
    keyword="ips:"                 # (required) log parsing: keyword selecting the lines with performance data
    convergence_key="loss:"        # (optional) log parsing: keyword selecting the lines with convergence data
    max_iter=${6:-100}                      # (optional) keep the run within 5 minutes; alternatively use a max_epoch parameter
    num_workers=0                  # (optional)

    # Generic settings below; normally no modification is needed.
    model_name=${model_item}_bs${base_batch_size}_${fp_item}_${run_mode}  # (required) do not change the format
    device=${CUDA_VISIBLE_DEVICES//,/ }
    # Split on commas without subjecting the value to pathname expansion.
    IFS=',' read -r -a arr <<< "${CUDA_VISIBLE_DEVICES:-}"
    num_gpu_devices=${#arr[*]}
    run_log_path=${TRAIN_LOG_DIR:-$(pwd)}  # (required) TRAIN_LOG_DIR is set globally by the benchmark framework
    profiling_log_path=${PROFILING_LOG_DIR:-$(pwd)}  # (required) PROFILING_LOG_DIR is set globally by the benchmark framework
    speed_log_path=${LOG_PATH_INDEX_DIR:-$(pwd)}

    train_log_file=${run_log_path}/${model_repo}_${model_name}_${device_num}_log
    profiling_log_file=${profiling_log_path}/${model_repo}_${model_name}_${device_num}_profiling
    speed_log_file=${speed_log_path}/${model_repo}_${model_name}_${device_num}_speed
}
# Build the training command from the globals set by _set_params, run it under
# a 15 minute timeout in ../llm/llama/auto_parallel/, and collect the log into
# ${log_file}. Prints "<model_name>, SUCCESS" or "<model_name>, FAIL".
function _train(){
    batch_size=${base_batch_size}  # for multi-card single-process runs, compute the multi-card batch size here
    echo "current CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}, model_name=${model_name}, device_num=${device_num}, is profiling=${profiling}"

    if [ "${profiling}" = "true" ]; then
        # NOTE(review): add_options is assigned but never appended to
        # train_cmd in this function - confirm whether that is intended.
        add_options="--profiler_options=\"batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile\""
        log_file=${profiling_log_file}
    else
        add_options=""
        log_file=${train_log_file}
    fi
    export FLAGS_group_schedule_tiling_first=1
    export FLAGS_cinn_bucket_compile=1
    export FLAGS_cinn_new_cluster_op_method=1
    export FLAGS_deny_cinn_ops="gather"
    export FLAGS_prim_forward_blacklist="pd_op.embedding"
    export FLAGS_disable_dyshape_in_train=True
    export FLAGS_enable_pir_in_executor=True
    # Previously exported twice (True, then 1); only the last value took effect.
    export FLAGS_enable_prim_after_distribute=1

    use_fp16_cmd=""
    if [ "${fp_item}" = "fp16" ]; then
        use_fp16_cmd="--fp16 1 --fp16_opt_level O2"
    fi
    to_static=0 # whether to enable dynamic-to-static training
    train_cmd="run_pretrain_auto.py \
    --model_type "llama" \
    --model_name_or_path "facebook/llama-7b" \
    --tokenizer_name_or_path "facebook/llama-7b" \
    --input_dir "./data" \
    --output_dir "output/$model_item" \
    --split 949,50,1 \
    --max_seq_length 2048 \
    --per_device_train_batch_size ${batch_size} \
    --per_device_eval_batch_size 2 \
    --gradient_accumulation_steps 1 \
    --use_flash_attention 1 \
    --use_fused_rms_norm 0 \
    --scale_loss 1024 \
    --pipeline_parallel_degree 1 \
    --tensor_parallel_degree 1 \
    --sharding_parallel_degree 1 \
    --learning_rate 0.0001 \
    --min_learning_rate 0.00001 \
    --max_steps ${max_iter} \
    --save_steps 5000000 \
    --weight_decay 0.01 \
    --warmup_ratio 0.01 \
    --logging_steps 20 \
    --dataloader_num_workers 1 \
    --sharding '' \
    --eval_steps 1000000 \
    --disable_tqdm true \
    --continue_training 0\
    --recompute 0 \
    --do_train \
    --do_eval \
    --device gpu \
    --data_impl mmap \
    --enable_auto_parallel 1 \
    --max_grad_norm 1.0 \
    --num_hidden_layers 4 \
    --to_static ${to_static} \
    ${use_fp16_cmd} "

    # Generic launch logic below; normally no modification is needed.
    case ${run_mode} in
    DP)
        rm -rf ./mylog   # remove the log directory before running
        rm -rf "output/$model_item"
        train_cmd="python -u -m paddle.distributed.launch --log_dir=./mylog \
            --gpus $CUDA_VISIBLE_DEVICES ${train_cmd}"
        ;;
    DP1-MP1-PP1)  echo "run run_mode: DP1-MP1-PP1" ;;
    *) echo "choose run_mode "; exit 1 ;;
    esac

    # Remember where we started so we can always come back, even when the
    # launcher produced no mylog directory.
    tests_dir=$PWD
    cd ../llm/llama/auto_parallel/
    case_path=$PWD
    rm -rf ./mylog   # remove the log directory before running
    rm -rf "output/$model_item"
    echo "train_cmd: ${train_cmd}  log_file: ${log_file}"

    python -c "import paddlenlp"
    # train_cmd is deliberately unquoted: it must be split into words.
    if timeout 15m ${train_cmd} > "${log_file}" 2>&1; then
        echo -e "${model_name}, SUCCESS"
    else
        echo -e "${model_name}, FAIL"
    fi
    #kill -9 `ps -ef|grep 'python'|awk '{print $2}'`
    cd "${tests_dir}"   # back to PaddleNLP/tests; later steps use paths relative to it
    if [ -d "${case_path}/mylog" ]; then
        mkdir -p mylog   # PaddleNLP/tests/mylog
        cp -r "${case_path}"/mylog/workerlog.* ./mylog/
        # The worker 0 log replaces the launcher output as the analysed log.
        cp "${case_path}/mylog/workerlog.0" "${log_file}"
    fi

    echo ${train_cmd} >> "${log_file}"
    cat "${log_file}"
}
|
||
# Run analysis_log.py on ${log_file} and write the summary to ${speed_log_file}.
# The script path is relative, so this must run from the PaddleNLP/tests directory.
function _analysis_log(){
    local analysis_script="test_tipc/dygraph/llama-7b_auto_dp2mp2pp2/benchmark_common/analysis_log.py"
    # Kept as a string only for logging; the command itself is invoked
    # directly with quoted arguments instead of going through eval.
    analysis_log_cmd="python ${analysis_script} ${model_item} ${log_file} ${speed_log_file} ${device_num} ${base_batch_size} ${fp_item}"
    echo "${analysis_log_cmd}"
    python "${analysis_script}" "${model_item}" "${log_file}" "${speed_log_file}" "${device_num}" "${base_batch_size}" "${fp_item}"
}
|
||
# Driver: resolve parameters, export version metadata for analysis_log.py,
# train, then analyse the log.
_set_params "$@"
str_tmp=$(echo `pip list|grep paddlepaddle-gpu|awk -F ' ' '{print $2}'`)
export frame_version=${str_tmp%%.post*}
export frame_commit=$(echo `python -c "import paddle;print(paddle.version.commit)"`)
# --short keeps the full branch name, including names that contain "/".
export model_branch=$(git symbolic-ref --short HEAD 2>/dev/null)
export model_commit=$(git log|head -n1|awk '{print $2}')
echo "---------frame_version is ${frame_version}"
echo "---------Paddle commit is ${frame_commit}"
echo "---------Model commit is ${model_commit}"
echo "---------model_branch is ${model_branch}"

# Use epoch seconds: subtracting %Y%m%d%H%M%S stamps does not give a duration
# once a minute, hour or day boundary is crossed.
job_bt=$(date '+%s')
_train
job_et=$(date '+%s')
export model_run_time=$((job_et - job_bt))
_analysis_log
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
添加记录: | ||
2024Q2在CINN推全项目中添加. | ||
模型名llama-7b_auto_dp2mp2pp2 | ||
执行脚本:llm/llama/auto_parallel/run_pretrain_auto.py |
12 changes: 12 additions & 0 deletions
12
..._tipc/dynamicTostatic/llama-7b_auto_dp2mp2pp2/N1C1/llama-7b_auto_dp2mp2pp2_bs1_fp16_DP.sh
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
# Benchmark case definition: llama-7b_auto_dp2mp2pp2, batch size 1, fp16, DP, N1C1.
# The relative ./test_tipc paths below are resolved against the current
# working directory of the caller.
model_item=llama-7b_auto_dp2mp2pp2
bs_item=1
fp_item=fp16
run_mode=DP
device_num=N1C1

max_iter=100

# prepare
bash "./test_tipc/dynamicTostatic/${model_item}/benchmark_common/prepare.sh"
# run
bash "./test_tipc/dynamicTostatic/${model_item}/benchmark_common/run_benchmark.sh" "${model_item}" "${bs_item}" "${fp_item}" "${run_mode}" "${device_num}" "${max_iter}" 2>&1
75 changes: 75 additions & 0 deletions
75
tests/test_tipc/dynamicTostatic/llama-7b_auto_dp2mp2pp2/benchmark_common/analysis_log.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,75 @@ | ||
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
|
||
import json | ||
import os | ||
import re | ||
import sys | ||
|
||
import numpy as np | ||
|
||
|
||
def analyze(model_item, log_file, res_log_file, device_num, bs, fp_item):
    """Summarise training throughput from a log file into a JSON result file.

    Every log line containing ``train_samples_per_second:`` contributes one
    sample (the first whitespace-separated token after the keyword, with
    commas removed). The samples are averaged, rounded to three decimals and
    multiplied by the device count, which is the last integer found in
    ``device_num``.

    Args:
        model_item: Model identifier used as the prefix of ``model_name``.
        log_file: Path of the training log to parse.
        res_log_file: Path of the JSON result file to write.
        device_num: Device description string such as ``"N1C8"``.
        bs: Batch size recorded in the result.
        fp_item: Precision label recorded in the result.

    Raises:
        IndexError: If ``device_num`` contains no digits.
    """
    keyword = "train_samples_per_second:"
    ips_lines = []
    # Stream the log instead of loading every line into memory first.
    with open(str(log_file), "r", encoding="utf8") as f:
        for eachline in f:
            if keyword not in eachline:
                continue
            # Split on the bare keyword so that a missing space after the
            # colon does not raise IndexError.
            fields = eachline.split(keyword, 1)[1].split()
            if not fields:
                continue
            try:
                ips = float(fields[0].replace(",", ""))
            except ValueError:
                print("----skip unparsable ips value: ", fields[0])
                continue
            print("----ips: ", ips)
            ips_lines.append(ips)
    print("----ips_lines: ", ips_lines)
    if ips_lines:
        # float() turns the numpy scalar into a plain Python float for JSON.
        ips = float(np.round(np.mean(ips_lines), 3))
    else:
        # np.mean([]) would yield NaN (plus a RuntimeWarning) and json.dumps
        # would then emit the non-standard token NaN; report 0.0 instead.
        print("----no train_samples_per_second found in ", log_file)
        ips = 0.0
    # Raw string: "\d" in a normal string literal is an invalid escape.
    ngpus = int(re.findall(r"\d+", device_num)[-1])
    print("----ips: ", ips, "ngpus", ngpus)
    ips *= ngpus
    run_mode = "DP"

    model_name = model_item + "_" + "bs" + str(bs) + "_" + fp_item + "_" + run_mode
    info = {
        "model_branch": os.getenv("model_branch"),
        "model_commit": os.getenv("model_commit"),
        "model_name": model_name,
        "batch_size": bs,
        "fp_item": fp_item,
        "run_mode": run_mode,
        "convergence_value": 0,
        "convergence_key": "",
        "ips": ips,
        "speed_unit": "sample/sec",
        "device_num": device_num,
        "model_run_time": os.getenv("model_run_time"),
        "frame_commit": os.getenv("frame_commit"),
        "frame_version": os.getenv("frame_version"),
    }
    json_info = json.dumps(info)
    print(json_info)
    with open(res_log_file, "w", encoding="utf8") as of:
        of.write(json_info)
|
||
|
||
if __name__ == "__main__":
    # Exactly six positional arguments are required after the script name.
    if len(sys.argv) != 7:
        print(
            "Usage:"
            + sys.argv[0]
            + " model_item path/to/log/file path/to/res/log/file device_num bs fp_item"
        )
        # Non-zero status so that callers can detect the usage error.
        sys.exit(1)

    model_item = sys.argv[1]
    log_file = sys.argv[2]
    res_log_file = sys.argv[3]
    device_num = sys.argv[4]
    bs = int(sys.argv[5])
    fp_item = sys.argv[6]

    analyze(model_item, log_file, res_log_file, device_num, bs, fp_item)
16 changes: 16 additions & 0 deletions
16
tests/test_tipc/dynamicTostatic/llama-7b_auto_dp2mp2pp2/benchmark_common/prepare.sh
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
# Optional: install PaddleNLP from source in editable mode.
cd ../ # PaddleNLP root directory
pwd
pip install -e .
cd -

# Download the required dataset files.
cd ../llm/llama/auto_parallel/
# -p: do not fail when ./data is left over from a previous run.
mkdir -p data
# llama pre-training data. -c resumes a partial download instead of writing a
# duplicate "*.1" copy; -P stores the file directly in ./data.
wget -c -P ./data https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_ids.npy
wget -c -P ./data https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_idx.npz
cd -
Oops, something went wrong.