-
Notifications
You must be signed in to change notification settings - Fork 258
/
internvl_chat_v1_5_internlm2_20b_dynamic_res_pretrain.sh
79 lines (73 loc) · 2.19 KB
/
internvl_chat_v1_5_internlm2_20b_dynamic_res_pretrain.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
#!/usr/bin/env bash
# Launch InternVL-Chat v1.5 (InternLM2-20B, dynamic resolution) pretraining
# on a Slurm cluster. All scheduler knobs are overridable via environment.
#
# -e: abort on unhandled failure; -u: error on unset vars;
# -o pipefail: without this, the trailing `| tee` would mask the training
# process's exit status and a failed run would exit 0.
set -euo pipefail
set -x

# Scheduler configuration (override via env, e.g. GPUS=64 bash this_script.sh).
PARTITION=${PARTITION:-"INTERN2"}
GPUS=${GPUS:-256}
GPUS_PER_NODE=${GPUS_PER_NODE:-8}
QUOTA_TYPE=${QUOTA_TYPE:-"reserved"}
NODES=$((GPUS / GPUS_PER_NODE))
CPUS_PER_TASK=${CPUS_PER_TASK:-10}
SRUN_ARGS=${SRUN_ARGS:-""}

# Global batch size decomposes as:
#   BATCH_SIZE = PER_DEVICE_BATCH_SIZE * GPUS * GRADIENT_ACC
BATCH_SIZE=${BATCH_SIZE:-2048}
PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-4}
GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS))

# ${PYTHONPATH:-} so an unset PYTHONPATH does not trip `set -u`.
export PYTHONPATH="${PYTHONPATH:-}:$(pwd)"
export MASTER_PORT=34229
export TF_CPP_MIN_LOG_LEVEL=3

OUTPUT_DIR='work_dirs/internvl_chat_v1_5_internlm2_20b_dynamic_res_pretrain'
# mkdir -p is a no-op when the directory already exists; no existence check needed.
mkdir -p "$OUTPUT_DIR"

# number of gpus: 256
# batch size per gpu: 4
# gradient accumulation steps: 2
# total batch size: 2048
# epoch: 1
#
# ${SRUN_ARGS} is deliberately unquoted: it may hold several extra srun flags
# that must word-split into separate arguments.
srun -p "${PARTITION}" \
  --gres="gpu:${GPUS_PER_NODE}" \
  --nodes="${NODES}" \
  --ntasks="${GPUS}" \
  --ntasks-per-node="${GPUS_PER_NODE}" \
  --cpus-per-task="${CPUS_PER_TASK}" \
  --kill-on-bad-exit=1 \
  --quotatype="${QUOTA_TYPE}" \
  ${SRUN_ARGS} \
  python -u internvl/train/internvl_chat_pretrain.py \
  --vision_path "./pretrained/InternViT-6B-448px-V1-5" \
  --llm_path "./pretrained/internlm2-chat-20b" \
  --conv_style "internlm2-chat" \
  --output_dir "${OUTPUT_DIR}" \
  --meta_path "path/to/pretrain/data.json" \
  --overwrite_output_dir True \
  --force_image_size 448 \
  --max_dynamic_patch 12 \
  --down_sample_ratio 0.5 \
  --drop_path_rate 0.2 \
  --pad2square False \
  --freeze_llm True \
  --freeze_mlp False \
  --freeze_backbone False \
  --vision_select_layer -1 \
  --use_data_resampling False \
  --dataloader_num_workers 4 \
  --bf16 True \
  --num_train_epochs 1 \
  --per_device_train_batch_size "${PER_DEVICE_BATCH_SIZE}" \
  --gradient_accumulation_steps "${GRADIENT_ACC}" \
  --evaluation_strategy "no" \
  --save_strategy "steps" \
  --save_steps 200 \
  --save_total_limit 3 \
  --learning_rate 1e-5 \
  --weight_decay 0.05 \
  --warmup_steps 100 \
  --lr_scheduler_type "cosine" \
  --logging_steps 1 \
  --max_seq_length 4096 \
  --do_train True \
  --grad_checkpoint True \
  --group_by_length False \
  --dynamic_image_size True \
  --use_thumbnail True \
  --ps_version 'v2' \
  --deepspeed "zero_stage3_config.json" \
  --report_to "tensorboard" \
  2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt"