【benchmark】 add llama-7b_auto_dp2mp2pp2 benchmark script for cinn (#8423)

* add llama-7b_auto_dp2mp2pp2 benchmark sh

* add llama-7b_auto_dp2mp2pp2 benchmark script for cinn

* update llama-7b_auto_dp2mp2pp2 benchmark script

* Update run_benchmark.sh

* Update run_benchmark.sh

* fix llama-7b_auto_dp2mp2pp2/benchmark_common

* Update run_benchmark.sh
mmglove committed Jun 6, 2024
1 parent 87edf28 commit b0a8cdd
Showing 10 changed files with 566 additions and 0 deletions.
@@ -0,0 +1,12 @@
model_item=llama-7b_auto_dp2mp2pp2
bs_item=1
fp_item=fp16
run_mode=DP
device_num=N1C1

max_iter=100

# prepare
bash ./test_tipc/dygraph/${model_item}/benchmark_common/prepare.sh
# run
bash ./test_tipc/dygraph/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} ${max_iter} 2>&1;
@@ -0,0 +1,75 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import json
import os
import re
import sys

import numpy as np


def analyze(model_item, log_file, res_log_file, device_num, bs, fp_item):
    with open(str(log_file), "r", encoding="utf8") as f:
        data = f.readlines()
    ips_lines = []
    for eachline in data:
        if "train_samples_per_second:" in eachline:
            ips = float(eachline.split("train_samples_per_second: ")[1].split()[0].replace(",", ""))
            print("----ips: ", ips)
            ips_lines.append(ips)
    print("----ips_lines: ", ips_lines)
    ips = np.round(np.mean(ips_lines), 3)
    ngpus = int(re.findall(r"\d+", device_num)[-1])
    print("----ips: ", ips, "ngpus", ngpus)
    ips *= ngpus
    run_mode = "DP"

    model_name = model_item + "_" + "bs" + str(bs) + "_" + fp_item + "_" + run_mode
    info = {
        "model_branch": os.getenv("model_branch"),
        "model_commit": os.getenv("model_commit"),
        "model_name": model_name,
        "batch_size": bs,
        "fp_item": fp_item,
        "run_mode": run_mode,
        "convergence_value": 0,
        "convergence_key": "",
        "ips": ips,
        "speed_unit": "sample/sec",
        "device_num": device_num,
        "model_run_time": os.getenv("model_run_time"),
        "frame_commit": os.getenv("frame_commit"),
        "frame_version": os.getenv("frame_version"),
    }
    json_info = json.dumps(info)
    print(json_info)
    with open(res_log_file, "w") as of:
        of.write(json_info)


if __name__ == "__main__":
    if len(sys.argv) != 7:
        print("Usage: " + sys.argv[0] + " model_item path/to/log/file path/to/res/log/file device_num bs fp_item")
        sys.exit(1)

    model_item = sys.argv[1]
    log_file = sys.argv[2]
    res_log_file = sys.argv[3]
    device_num = sys.argv[4]
    bs = int(sys.argv[5])
    fp_item = sys.argv[6]

    analyze(model_item, log_file, res_log_file, device_num, bs, fp_item)
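
For reference, the parser above keys on the literal "train_samples_per_second: " marker in the trainer log and averages every match before scaling by the GPU count. A minimal sketch of that extraction on a single synthetic log line (the sample line is an assumption for illustration; only the marker comes from analysis_log.py):

# Minimal sketch, not part of the suite: reducing one hypothetical trainer log line to an ips value.
line = "global_step: 20, loss: 9.87, train_samples_per_second: 3.125, learning_rate: 0.0001"
ips = float(line.split("train_samples_per_second: ")[1].split()[0].replace(",", ""))
print(ips)  # -> 3.125

An invocation matching the argument order checked in __main__ would look like: python analysis_log.py llama-7b_auto_dp2mp2pp2 train.log speed.json N1C1 1 fp16 (the log and output file names here are placeholders).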
@@ -0,0 +1,16 @@
# install (optional)
cd ../ # PaddleNLP root directory
pwd
pip install -e .
cd -

# Download, extract, and copy the required datasets
cd ../llm/llama/auto_parallel/
# Download the llama model data
wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_ids.npy
wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_idx.npz

mkdir data
mv llama_openwebtext_100k_ids.npy ./data
mv llama_openwebtext_100k_idx.npz ./data
cd -
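
As an optional sanity check (not part of prepare.sh), the downloaded arrays can be opened with numpy before launching training. A hedged sketch, assuming it is run from llm/llama/auto_parallel/ after prepare.sh has finished, with file names taken from the wget commands above:

# Optional sanity check; run from llm/llama/auto_parallel/ after prepare.sh.
import numpy as np
ids = np.load("data/llama_openwebtext_100k_ids.npy", mmap_mode="r")  # token id array
idx = np.load("data/llama_openwebtext_100k_idx.npz")                 # index arrays
print(ids.shape, ids.dtype, idx.files)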
@@ -0,0 +1,174 @@
#!/usr/bin/env bash

# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Test training benchmark for a model.
# Usage: bash benchmark/run_benchmark.sh ${model_item} ${base_batch_size} ${fp_item} ${run_mode} ${device_num} ${max_iter}
function _set_params(){
model_item=${1:-"llama-7b_auto_dp2mp2pp2"} # (required) model item
base_batch_size=${2:-"1"} # (required)
fp_item=${3:-"fp32"} # (required) fp32|fp16|bf16
run_mode=${4:-"DP"} # (required) MP model parallel | DP data parallel | PP pipeline parallel | hybrid parallel DP1-MP1-PP1 | DP1-MP4-PP1
device_num=${5:-"N1C1"} # (required) number of devices, N1C1|N1C8|N4C32 (4 nodes, 32 GPUs)
profiling=${PROFILING:-"false"} # (required) profiling switch, off by default, passed via a global variable

model_repo="PaddleNLP" # (required) name of the model suite
speed_unit="sample/s" # (required) unit of the speed metric
skip_steps=0 # (required) log parsing: number of initial unstable steps to skip
keyword="ips:" # (required) log parsing: keyword marking lines that contain performance data
convergence_key="loss:" # (optional) log parsing: keyword marking lines that contain convergence data, e.g. convergence_key="loss:"
max_iter=${6:-100} # (optional) keep the model runtime under 5 minutes; if early termination requires a code change, submit a PR to the suite, or use the max_epoch parameter instead
num_workers=0 # (optional)

# The commands below are generic; no changes are needed unless there are special requirements
model_name=${model_item}_bs${base_batch_size}_${fp_item}_${run_mode} # (required) do not change this format; it is aligned with competitor naming
device=${CUDA_VISIBLE_DEVICES//,/ }
arr=(${device})
num_gpu_devices=${#arr[*]}
run_log_path=${TRAIN_LOG_DIR:-$(pwd)} # (required) TRAIN_LOG_DIR is set as a global variable by the benchmark framework
profiling_log_path=${PROFILING_LOG_DIR:-$(pwd)} # (required) PROFILING_LOG_DIR is set as a global variable by the benchmark framework
speed_log_path=${LOG_PATH_INDEX_DIR:-$(pwd)}

train_log_file=${run_log_path}/${model_repo}_${model_name}_${device_num}_log
profiling_log_file=${profiling_log_path}/${model_repo}_${model_name}_${device_num}_profiling
speed_log_file=${speed_log_path}/${model_repo}_${model_name}_${device_num}_speed
}
function _train(){
batch_size=${base_batch_size} # if the model runs multiple GPUs in a single process, compute the multi-GPU batch size here in _train
echo "current CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}, model_name=${model_name}, device_num=${device_num}, is profiling=${profiling}"

if [ ${profiling} = "true" ];then
add_options="--profiler_options=\"batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile\""
log_file=${profiling_log_file}
else
add_options=""
log_file=${train_log_file}
fi
export FLAGS_group_schedule_tiling_first=1
export FLAGS_cinn_bucket_compile=1
export FLAGS_cinn_new_cluster_op_method=1
export FLAGS_deny_cinn_ops="gather"
export FLAGS_prim_forward_blacklist="pd_op.embedding"
export FLAGS_enable_prim_after_distribute=True
export FLAGS_disable_dyshape_in_train=True
export FLAGS_enable_pir_in_executor=True
export FLAGS_enable_prim_after_distribute=1


use_fp16_cmd=""
if [ $fp_item = "fp16" ]; then
use_fp16_cmd="--fp16 1 --fp16_opt_level O2"
fi
to_static=0 # whether to enable dynamic-to-static training
train_cmd="run_pretrain_auto.py \
--model_type "llama" \
--model_name_or_path "facebook/llama-7b" \
--tokenizer_name_or_path "facebook/llama-7b" \
--input_dir "./data" \
--output_dir "output/$model_item" \
--split 949,50,1 \
--max_seq_length 2048 \
--per_device_train_batch_size ${batch_size} \
--per_device_eval_batch_size 2 \
--gradient_accumulation_steps 1 \
--use_flash_attention 1 \
--use_fused_rms_norm 0 \
--scale_loss 1024 \
--pipeline_parallel_degree 1 \
--tensor_parallel_degree 1 \
--sharding_parallel_degree 1 \
--learning_rate 0.0001 \
--min_learning_rate 0.00001 \
--max_steps ${max_iter} \
--save_steps 5000000 \
--weight_decay 0.01 \
--warmup_ratio 0.01 \
--logging_steps 20 \
--dataloader_num_workers 1 \
--sharding '' \
--eval_steps 1000000 \
--disable_tqdm true \
--continue_training 0 \
--recompute 0 \
--do_train \
--do_eval \
--device gpu \
--data_impl mmap \
--enable_auto_parallel 1 \
--max_grad_norm 1.0 \
--num_hidden_layers 4 \
--to_static ${to_static} \
${use_fp16_cmd} "

# The commands below are generic; no changes are needed unless there are special requirements
case ${run_mode} in
DP)
rm -rf ./mylog # note: remove the log directory before running
rm -rf output/$model_item
train_cmd="python -u -m paddle.distributed.launch --log_dir=./mylog \
--gpus $CUDA_VISIBLE_DEVICES ${train_cmd}"
;;
DP1-MP1-PP1) echo "run run_mode: DP1-MP1-PP1" ;;
*) echo "choose run_mode "; exit 1;
esac

cd ../llm/llama/auto_parallel/
rm -rf ./mylog # note: remove the log directory before running
rm -rf output/$model_item
echo "train_cmd: ${train_cmd} log_file: ${log_file}"

python -c "import paddlenlp"
timeout 15m ${train_cmd} > ${log_file} 2>&1
if [ $? -ne 0 ];then
echo -e "${model_name}, FAIL"
else
echo -e "${model_name}, SUCCESS"
fi
#kill -9 `ps -ef|grep 'python'|awk '{print $2}'`
if [ -d mylog ]; then
case_path=$PWD && cd - && mkdir -p mylog # PaddleNLP/tests/mylog
cp -r ${case_path}/mylog/workerlog.* ./mylog/
rm ${log_file}
cp ${case_path}/mylog/workerlog.0 ${log_file}
fi

echo ${train_cmd} >> ${log_file}
cat ${log_file}
}

function _analysis_log(){
# run from the PaddleNLP/tests directory
analysis_log_cmd="python test_tipc/dygraph/llama-7b_auto_dp2mp2pp2/benchmark_common/analysis_log.py \
${model_item} ${log_file} ${speed_log_file} ${device_num} ${base_batch_size} ${fp_item}"
echo ${analysis_log_cmd}
eval ${analysis_log_cmd}
}

_set_params $@
str_tmp=$(echo `pip list|grep paddlepaddle-gpu|awk -F ' ' '{print $2}'`)
export frame_version=${str_tmp%%.post*}
export frame_commit=$(echo `python -c "import paddle;print(paddle.version.commit)"`)
export model_branch=`git symbolic-ref HEAD 2>/dev/null | cut -d"/" -f 3`
export model_commit=$(git log|head -n1|awk '{print $2}')
echo "---------frame_version is ${frame_version}"
echo "---------Paddle commit is ${frame_commit}"
echo "---------Model commit is ${model_commit}"
echo "---------model_branch is ${model_branch}"

job_bt=`date '+%Y%m%d%H%M%S'`
_train
job_et=`date '+%Y%m%d%H%M%S'`
export model_run_time=$((${job_et}-${job_bt}))
_analysis_log
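
To exercise this script outside the benchmark CI, the globals it reads (CUDA_VISIBLE_DEVICES, TRAIN_LOG_DIR, PROFILING_LOG_DIR, LOG_PATH_INDEX_DIR) have to be exported by hand. A hedged sketch of a manual run from PaddleNLP/tests; the GPU list, the N1C8 setting, and the log directory names are assumptions, not part of the suite:

# Illustrative manual invocation from PaddleNLP/tests; paths and GPU list are assumptions.
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export TRAIN_LOG_DIR=$(pwd)/train_log && mkdir -p ${TRAIN_LOG_DIR}
export PROFILING_LOG_DIR=$(pwd)/profiling_log && mkdir -p ${PROFILING_LOG_DIR}
export LOG_PATH_INDEX_DIR=$(pwd)/speed_log && mkdir -p ${LOG_PATH_INDEX_DIR}
bash ./test_tipc/dygraph/llama-7b_auto_dp2mp2pp2/benchmark_common/prepare.sh
bash ./test_tipc/dygraph/llama-7b_auto_dp2mp2pp2/benchmark_common/run_benchmark.sh \
    llama-7b_auto_dp2mp2pp2 1 fp16 DP N1C8 100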
4 changes: 4 additions & 0 deletions tests/test_tipc/dygraph/llama-7b_auto_dp2mp2pp2/readme.MD
@@ -0,0 +1,4 @@
Change record:
Added in 2024Q2 as part of the CINN full-rollout project.
Model name: llama_auto_dp2mp2pp2
Entry script: llm/llama/auto_parallel/run_pretrain_auto.py
@@ -0,0 +1,12 @@
model_item=llama-7b_auto_dp2mp2pp2
bs_item=1
fp_item=fp16
run_mode=DP
device_num=N1C1

max_iter=100

# prepare
bash ./test_tipc/dynamicTostatic/${model_item}/benchmark_common/prepare.sh
# run
bash ./test_tipc/dynamicTostatic/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} ${max_iter} 2>&1;
@@ -0,0 +1,75 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import json
import os
import re
import sys

import numpy as np


def analyze(model_item, log_file, res_log_file, device_num, bs, fp_item):
    with open(str(log_file), "r", encoding="utf8") as f:
        data = f.readlines()
    ips_lines = []
    for eachline in data:
        if "train_samples_per_second:" in eachline:
            ips = float(eachline.split("train_samples_per_second: ")[1].split()[0].replace(",", ""))
            print("----ips: ", ips)
            ips_lines.append(ips)
    print("----ips_lines: ", ips_lines)
    ips = np.round(np.mean(ips_lines), 3)
    ngpus = int(re.findall(r"\d+", device_num)[-1])
    print("----ips: ", ips, "ngpus", ngpus)
    ips *= ngpus
    run_mode = "DP"

    model_name = model_item + "_" + "bs" + str(bs) + "_" + fp_item + "_" + run_mode
    info = {
        "model_branch": os.getenv("model_branch"),
        "model_commit": os.getenv("model_commit"),
        "model_name": model_name,
        "batch_size": bs,
        "fp_item": fp_item,
        "run_mode": run_mode,
        "convergence_value": 0,
        "convergence_key": "",
        "ips": ips,
        "speed_unit": "sample/sec",
        "device_num": device_num,
        "model_run_time": os.getenv("model_run_time"),
        "frame_commit": os.getenv("frame_commit"),
        "frame_version": os.getenv("frame_version"),
    }
    json_info = json.dumps(info)
    print(json_info)
    with open(res_log_file, "w") as of:
        of.write(json_info)


if __name__ == "__main__":
    if len(sys.argv) != 7:
        print("Usage: " + sys.argv[0] + " model_item path/to/log/file path/to/res/log/file device_num bs fp_item")
        sys.exit(1)

    model_item = sys.argv[1]
    log_file = sys.argv[2]
    res_log_file = sys.argv[3]
    device_num = sys.argv[4]
    bs = int(sys.argv[5])
    fp_item = sys.argv[6]

    analyze(model_item, log_file, res_log_file, device_num, bs, fp_item)
@@ -0,0 +1,16 @@
# install (optional)
cd ../ # PaddleNLP root directory
pwd
pip install -e .
cd -

# Download, extract, and copy the required datasets
cd ../llm/llama/auto_parallel/
# Download the llama model data
wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_ids.npy
wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_idx.npz

mkdir data
mv llama_openwebtext_100k_ids.npy ./data
mv llama_openwebtext_100k_idx.npz ./data
cd -