add npu-llama-opt0-script #8401

Merged
14 commits merged on May 13, 2024
72 changes: 72 additions & 0 deletions llm/llama/npu/llama_npu_opt_lora.sh
@@ -0,0 +1,72 @@
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

max_steps=${1:-1000}

# Use all 8 NPUs and disable stride kernels for the NPU backend
export ASCEND_RT_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
export FLAGS_use_stride_kernel=0
# Clear endpoints left over from a previous distributed launch
unset PADDLE_TRAINER_ENDPOINTS
unset DISTRIBUTED_TRAINER_ENDPOINTS
# Enable MC2 fused compute-communication kernels and the naive_best_fit allocator
export FLAGS_NPU_MC2=1
export MC2_Recompute=1
export MC2=1
export FLAGS_allocator_strategy=naive_best_fit
# Load the Ascend CANN toolkit environment
source /usr/local/Ascend/ascend-toolkit/set_env.sh


# Remove logs/checkpoints from previous runs and kill any leftover training processes
rm -rf lora_bf16_llama_N1C8
rm -rf output/lora_bf16_llama_N1C8
ps aux | grep "finetune_generation.py" | grep -v grep | awk '{print $2}' | xargs kill -9
export PYTHONPATH=../../../:$PYTHONPATH
python -u -m paddle.distributed.launch \
--devices "0,1,2,3,4,5,6,7" \
--log_dir "./lora_bf16_llama_N1C8" \
../../finetune_generation.py \
--device "npu" \
--model_name_or_path "meta-llama/Llama-2-13b" \
--dataset_name_or_path "data/" \
--output_dir "./output/lora_bf16_llama_N1C8" \
--per_device_train_batch_size 2 \
--gradient_accumulation_steps 16 \
--per_device_eval_batch_size 1 \
--eval_accumulation_steps 1 \
--max_steps ${max_steps} \
--decay_steps 2000 \
--learning_rate 3e-06 \
--warmup_steps 2 \
--save_steps 1000 \
--logging_steps 1 \
--evaluation_strategy "epoch" \
--src_length 1024 \
--max_length 4096 \
--bf16 true \
--fp16_opt_level "O2" \
--do_train true \
--disable_tqdm true \
--eval_with_do_generation false \
--metric_for_best_model "accuracy" \
--recompute false \
--tensor_parallel_degree 8 \
--pipeline_parallel_degree 1 \
--zero_padding 0 \
--sequence_parallel 1 \
--amp_master_grad true \
--fuse_attention_qkv true \
--fuse_attention_ffn true \
--use_flash_attention 1 \
--use_fused_rope 1 \
--use_fused_rms_norm 1 \
--lora true \
--lora_rank 32 \
--pad_to_multiple_of 4096
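
This LoRA script takes an optional first argument that overrides max_steps (default 1000) and reads the instruction-tuning data from data/. A minimal invocation sketch, assuming it is launched from its checked-in directory llm/llama/npu so the relative paths (../../finetune_generation.py, PYTHONPATH=../../../) resolve:

cd llm/llama/npu
bash llama_npu_opt_lora.sh          # LoRA fine-tuning with the default 1000 steps
bash llama_npu_opt_lora.sh 2000     # override max_steps via the first positional argument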
90 changes: 90 additions & 0 deletions llm/llama/npu/llama_npu_opt_ppt.sh
@@ -0,0 +1,90 @@
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

max_steps=${1:-800}

set -x
ps aux | grep run_pretrain.py | grep -v grep | awk '{print $2}' | xargs kill -9
rm -rf ./log_8.0
rm -rf output
export PYTHONPATH=../../../:$PYTHONPATH
export MC2=1
export GLOG_v=0
export FLAGS_npu_storage_format=1
export HCCL_INTRA_PCIE_ENABLE=0
export HCCL_INTRA_ROCE_ENABLE=1
export FLAGS_allocator_strategy=naive_best_fit
export ASCEND_RT_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
export FLAGS_NPU_MC2=1
export MC2_Recompute=1
unset PADDLE_TRAINER_ENDPOINTS
unset DISTRIBUTED_TRAINER_ENDPOINTS

export FLAGS_use_stride_kernel=0
export HCCL_OP_BASE_FFTS_MODE_ENABLE=TRUE
export MULTI_STREAM_MEMORY_REUSE=1

source /usr/local/Ascend/ascend-toolkit/set_env.sh

python -u -m paddle.distributed.launch \
--log_dir "./log_8.0" \
../run_pretrain.py \
--model_name_or_path "meta-llama/Llama-2-13b" \
--tokenizer_name_or_path "meta-llama/Llama-2-13b" \
--input_dir "./pre-data" \
--output_dir "./output" \
--split 949,50,1 \
--max_seq_length 4096 \
--per_device_train_batch_size 2 \
--gradient_accumulation_steps 32 \
--per_device_eval_batch_size 1 \
--use_flash_attention 1 \
--use_fused_rms_norm 1 \
--virtual_pp_degree 1 \
--learning_rate 0.00001 \
--min_learning_rate 0.000001 \
--max_steps ${max_steps} \
--decay_steps 2000 \
--save_steps 2000 \
--seed 100 \
--weight_decay 0.01 \
--warmup_steps 20 \
--max_grad_norm 1.0 \
--logging_steps 1 \
--dataloader_num_workers 1 \
--eval_steps 1001 \
--tensor_parallel_degree 4 \
--disable_tqdm true \
--continue_training 0 \
--do_train \
--device "npu" \
--enable_linear_fused_grad_add false \
--fuse_attention_qkv true \
--fuse_attention_ffn true \
--use_fused_rope true \
--recompute_use_reentrant true \
--data_cache "./data_cache" \
--bf16 \
--fp16_opt_level "O2" \
--amp_master_grad \
--load_sharded_model true \
--save_sharded_model true \
--pipeline_parallel_degree 1 \
--ignore_data_skip 0 \
--force_reshard_pp true \
--tensor_parallel_config "enable_mp_async_allreduce enable_mp_skip_c_identity" \
--sequence_parallel 1 \
--pipeline_parallel_config "disable_partial_send_recv" \
--sharding "stage1" \
--sharding_parallel_degree 2
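
The pretraining script likewise accepts max_steps as an optional first argument (default 800) and expects tokenized pretraining data under ./pre-data. A usage sketch under the same assumption that it runs from llm/llama/npu:

cd llm/llama/npu
bash llama_npu_opt_ppt.sh           # pretraining with the default 800 steps; logs in ./log_8.0, checkpoints in ./output
bash llama_npu_opt_ppt.sh 1600      # longer run via the first positional argument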
78 changes: 78 additions & 0 deletions llm/llama/npu/llama_npu_opt_sft.sh
@@ -0,0 +1,78 @@
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

export FLAGS_use_stride_kernel=0
export FLAGS_npu_storage_format=1
export HCCL_INTRA_PCIE_ENABLE=0
export HCCL_INTRA_ROCE_ENABLE=1
export ASCEND_RT_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"

unset PADDLE_TRAINER_ENDPOINTS
unset DISTRIBUTED_TRAINER_ENDPOINTS
export GLOG_v=0
export FLAGS_NPU_MC2=1
export MC2_Recompute=1
export MC2=1
export FLAGS_allocator_strategy=naive_best_fit
source /usr/local/Ascend/ascend-toolkit/set_env.sh

export HCCL_OP_BASE_FFTS_MODE_ENABLE=TRUE
export MULTI_STREAM_MEMORY_REUSE=1

export PYTHONPATH=../../../:$PYTHONPATH
rm -rf sft_bf16_llama_N1C8
rm -rf output/sft_bf16_llama_N1C8
ps aux | grep "finetune_generation.py" | grep -v grep | awk '{print $2}' | xargs kill -9
python -u -m paddle.distributed.launch \
--devices "0,1,2,3,4,5,6,7" \
--log_dir "./sft_bf16_llama_N1C8" \
../../finetune_generation.py \
--device "npu" \
--model_name_or_path "meta-llama/Llama-2-13b" \
--dataset_name_or_path "data/" \
--output_dir "./output/sft_bf16_llama_N1C8" \
--logging_dir "./sft_logs" \
--per_device_train_batch_size 2 \
--gradient_accumulation_steps 32 \
--per_device_eval_batch_size 1 \
--eval_accumulation_steps 1 \
--max_steps 2000 \
--learning_rate 3e-06 \
--warmup_steps 2 \
--save_steps 1000 \
--logging_steps 1 \
--evaluation_strategy "epoch" \
--src_length 1024 \
--max_length 4096 \
--fp16 true \
--fp16_opt_level "O2" \
--do_train true \
--disable_tqdm true \
--eval_with_do_generation false \
--metric_for_best_model "accuracy" \
--recompute false \
--tensor_parallel_degree 4 \
--pipeline_parallel_degree 1 \
--zero_padding 0 \
--amp_master_grad true \
--fuse_attention_qkv true \
--fuse_attention_ffn true \
--sequence_parallel 1 \
--use_flash_attention 1 \
--use_fused_rope 1 \
--use_fused_rms_norm 1 \
--sharding_parallel_degree 2 \
--pad_to_multiple_of 4096 \
--sharding "stage1" \
--sharding_parallel_config "enable_stage1_tensor_fusion enable_stage1_overlap"
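
The SFT script takes no arguments; max_steps is fixed at 2000 inside the script. A sketch of a run, again assuming the llm/llama/npu working directory and a prepared data/ directory:

cd llm/llama/npu
bash llama_npu_opt_sft.sh           # full-parameter SFT; logs in ./sft_bf16_llama_N1C8, checkpoints in ./output/sft_bf16_llama_N1C8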