[Features] Support multi_modal training #628

Merged
merged 14 commits on Sep 6, 2023
17 changes: 17 additions & 0 deletions configs/ds_config_vis_chatbot.json
@@ -0,0 +1,17 @@
{
"fp16": {
"enabled": false
},
"bf16": {
"enabled": false
},
"comms_logger": {
"enabled": false,
"verbose": false,
"prof_all": false,
"debug": false
},
"steps_per_print": 20000000000000000,
"train_micro_batch_size_per_gpu": 1,
"wall_clock_breakdown": false
}
2 changes: 1 addition & 1 deletion examples/finetune_multi_modal.py
@@ -58,7 +58,7 @@ def main():
data_args=data_args,
pipeline_args=pipeline_args,
)
model = AutoModel.get_model(model_args, tune_strategy='none',
model = AutoModel.get_model(model_args, tune_strategy='finetune',
ds_config=pipeline_args.deepspeed,
custom_model=True)
for param in model.backend_model.language_model.parameters():
44 changes: 34 additions & 10 deletions examples/vis_chatbot.py
Collaborator Author

The reason to check whether to use deepspeed is that loading the model in 8-bit while using deepspeed raises an error:
huggingface/transformers#24540
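
For illustration, a minimal sketch of the gating this flag enables (not the exact LMFlow code; load_inference_model and its arguments are hypothetical names, and the 8-bit path assumes bitsandbytes is installed):

import deepspeed
from transformers import AutoModelForCausalLM

def load_inference_model(model_name, low_resource=False, with_deepspeed=True):
    # 8-bit (bitsandbytes) weights cannot be wrapped by the DeepSpeed inference
    # engine (see huggingface/transformers#24540), so the flag lets callers skip
    # DeepSpeed and use the plain Hugging Face model instead.
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        load_in_8bit=low_resource,                # 8-bit path, e.g. --low_resource True
        device_map="auto" if low_resource else None,
    )
    if with_deepspeed and not low_resource:
        model = deepspeed.init_inference(model)   # full-precision path only
    return model

With low_resource=True the model is returned unwrapped, which is why run_vis_chatbot_llava.sh passes --with_deepspeed False alongside --low_resource True.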

Contributor

No problem. Thanks!

@@ -80,7 +80,12 @@ class ChatbotArguments:
"help": "whether to do the stream inference"
}
)

with_deepspeed: Optional[bool] = field(
default=True,
metadata={
"help": "whether to use deepspeed"
}
)

def main():
pipeline_name = "inferencer"
@@ -104,10 +109,11 @@ def main():
ds_config=ds_config,
device=pipeline_args.device,
custom_model=model_args.custom_model,
with_deepspeed=chatbot_args.with_deepspeed,
)

data_args = DatasetArguments(dataset_path=None)
dataset = Dataset(data_args)
dataset = Dataset(data_args, backend="dict")

inferencer = AutoPipeline.get_pipeline(
pipeline_name=pipeline_name,
@@ -140,13 +146,21 @@ def main():
# " unconditionally."
# )

sep = "###"

end_string = chatbot_args.end_string
if chatbot_args.prompt_format == "mini_gpt":
context = "Give the following image: <Img>ImageContent</Img>. " + "You will be able to see the image once I provide it to you. Please answer my questions."
user_name = "Human"
sep = "###"

elif chatbot_args.prompt_format == "llava":
context = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions."
user_name = "USER"
sep = " "
else:
context = ""
user_name = ""
sep = "###"
prompt_structure = chatbot_args.prompt_structure

# Load image and input text for reasoning
@@ -162,7 +176,9 @@
if chatbot_args.task == "image_caption" and len(input_text) == 0:
input_text = "a photography of"
if chatbot_args.prompt_format == "mini_gpt":
context += sep + "Human: " + "<Img><ImageHere></Img> "
context += sep + user_name + ": " + "<Img><ImageHere></Img> "
elif chatbot_args.prompt_format == "llava":
context += sep + user_name + ": " + "<image>\n"

# this flag is for determining if we need to add the ###Human: prompt
# if text after loading image, we add it when loading image
@@ -179,7 +195,7 @@ def main():
input_dataset = dataset.from_dict({
"type": "image_text",
"instances": [{"images": np.stack(image_list),
"text": input_text,}]
"text": input_text,}]
})
output = inferencer.inference(model, input_dataset)
print(output.backend_dataset['text'])
@@ -200,7 +216,12 @@ def main():
# batch of image with different shape
raw_image = raw_image.resize(base_size)
image_list.append(np.array(raw_image))
context += sep + "Human: " + "<Img><ImageHere></Img> "
if chatbot_args.prompt_format == "mini_gpt":
context += sep + user_name + ": " + "<Img><ImageHere></Img> "
elif chatbot_args.prompt_format == "llava":
context += sep + user_name + ": " + "<image>\n"
else:
raise NotImplementedError
text_after_loading_image = True
print("Finish loading image with path {}".format(image_path))
continue
@@ -213,8 +234,7 @@
continue

if text_after_loading_image is False:
if chatbot_args.prompt_format == "mini_gpt":
context += sep + "Human: "
context += sep + user_name + ": "
else:
text_after_loading_image = False

@@ -229,14 +249,18 @@
"instances": [{"images": np.stack(image_list),
"text": context,}]
})
remove_image_flag = chatbot_args.prompt_format=="mini_gpt"
if chatbot_args.prompt_format in ["mini_gpt", "llava"]:
remove_image_flag = True
else:
remove_image_flag = False
begin_time = time.time()
if not chatbot_args.stream_inference:
# directly inference the results
output_dataset = inferencer.inference(
model,
input_dataset,
remove_image_flag=remove_image_flag)
remove_image_flag=remove_image_flag,
prompt_format=chatbot_args.prompt_format,)
response = output_dataset.backend_dataset['text']
print(response[0])
print("\n", end="")
@@ -8,7 +8,7 @@ model_name_or_path=Salesforce/blip2-flan-t5-xxl
dataset_path=/home/qlianab/data1/llm/CC3M-Pretrain-595K/cc3m_595k.json
image_folder=/home/qlianab/data1/llm/CC3M-Pretrain-595K/images
output_dir=output_models/finetune
deepspeed_args="--master_port=12000 --include localhost:8"
deepspeed_args="--master_port=12000 --include localhost:9"

while [[ $# -ge 1 ]]; do
key="$1"
@@ -55,7 +55,6 @@ deepspeed ${deepspeed_args} \
--custom_vision_model True \
--llm_model_name_or_path lmsys/vicuna-7b-v1.5 \
--image_aspect_ratio None \
--num_train_epochs 1 \
--fp16 True \
--learning_rate 2e-5 \
--gradient_accumulation_steps 1 \
@@ -71,5 +70,6 @@
--ddp_timeout 72000 \
--save_steps 5000 \
--dataloader_num_workers 1 \
--num_train_epochs 1 \
| tee ${log_dir}/train.log \
2> ${log_dir}/train.err
77 changes: 77 additions & 0 deletions scripts/run_finetune_multi_modal_stage2.sh
@@ -0,0 +1,77 @@
#!/bin/bash
# Please run this script under ${project_id} in project directory of
# https://github.com/shizhediao/llm-ft
# COMMIT: d5fecf30ba8011067b10cf51fede53a5ab6574e4

# Parses arguments
model_name_or_path=Salesforce/blip2-flan-t5-xxl
dataset_path=/home/qlianab/data1/llm/llava_instruct_80k.json
image_folder=/home/qlianab/data1/2d_detection/coco/train2017
output_dir=output_models/finetune
deepspeed_args="--master_port=12000 --include localhost:9"

while [[ $# -ge 1 ]]; do
key="$1"
case ${key} in
-m|--model_name_or_path)
model_name_or_path="$2"
shift
;;
-d|--dataset_path)
dataset_path="$2"
shift
;;
-o|--output_model_path)
output_dir="$2"
shift
;;
--deepspeed_args)
deepspeed_args="$2"
shift
;;
*)
echo "error: unknown option \"${key}\"" 1>&2
exit 1
esac
shift
done

# Finetune
exp_id=finetune
project_dir=$(cd "$(dirname $0)"/..; pwd)
log_dir=${project_dir}/log/${exp_id}
mkdir -p ${output_dir} ${log_dir}

deepspeed ${deepspeed_args} \
examples/finetune_multi_modal.py \
--deepspeed configs/ds_config_multimodal.json \
--arch_type vision_encoder_decoder \
--llava_loading True \
--model_name_or_path ${model_name_or_path} \
--image_encoder_name_or_path openai/clip-vit-large-patch14 \
--pretrained_language_projection_path /home/qlianab/checkpoints/llava-336px-pretrain-vicuna-7b-v1.3/mm_projector.bin \
--dataset_path ${dataset_path} \
--output_dir ${output_dir} --overwrite_output_dir \
--image_folder ${image_folder} \
--custom_vision_model True \
--llm_model_name_or_path lmsys/vicuna-7b-v1.5 \
--image_aspect_ratio None \
--fp16 True \
--learning_rate 2e-5 \
--gradient_accumulation_steps 1 \
--per_device_train_batch_size 2 \
--learning_rate 2e-3 \
--weight_decay 0. \
--warmup_ratio 0.03 \
--lr_scheduler_type "cosine" \
--run_name finetune \
--validation_split_percentage 0 \
--logging_steps 20 \
--do_train \
--ddp_timeout 72000 \
--save_steps 5000 \
--dataloader_num_workers 1 \
--sep_style "v1" \
--num_train_epochs 1 \
| tee ${log_dir}/train.log \
2> ${log_dir}/train.err
2 changes: 1 addition & 1 deletion scripts/run_vis_chatbot_blip2.sh
@@ -1,6 +1,6 @@
model=Salesforce/blip2-opt-2.7b
deepspeed examples/vis_chatbot.py --model_name_or_path ${model} \
--deepspeed configs/ds_config_multimodal.json \
--deepspeed configs/ds_config_vis_chatbot.json \
--arch_type vision_encoder_decoder \
--task vqa \
${@:1}
4 changes: 2 additions & 2 deletions scripts/run_vis_chatbot_gradio_minigpt4.sh
@@ -37,13 +37,13 @@ fi

deepspeed --master_port=11005 examples/vis_chatbot_gradio.py \
--model_name_or_path ${model} \
--deepspeed configs/ds_config_multimodal.json \
--deepspeed configs/ds_config_vis_chatbot.json \
--arch_type vision_encoder_decoder \
--task vqa \
--custom_model \
--prompt_format mini_gpt \
--prompt_structure "###Human: {input_text}###Assistant:" \
--llm_model_name_or_path LMFlow/Full-Robin-13b-v2 \
--checkpoint_path output_models/pretrained_minigpt4_13b_converted.pth \
--pretrained_language_projection_path output_models/pretrained_minigpt4_13b_converted.pth \
--low_resource True \
--max_new_tokens 1024
17 changes: 17 additions & 0 deletions scripts/run_vis_chatbot_llava.sh
@@ -0,0 +1,17 @@
model=/home/qlianab/data1/checkpoints/llava-v1-0719-336px-lora-merge-vicuna-13b-v1.3
deepspeed_args="--master_port=12000 --include localhost:9"

deepspeed ${deepspeed_args} \
examples/vis_chatbot.py \
--deepspeed configs/ds_config_vis_chatbot.json \
--arch_type vision_encoder_decoder \
--task vqa \
--custom_model \
--model_name_or_path ${model} \
--prompt_format llava \
--prompt_structure '{input_text} ASSISTANT:' \
--low_resource True \
--llava_loading True \
--with_deepspeed False \
${@:1}

10 changes: 6 additions & 4 deletions scripts/run_vis_chatbot_minigpt4.sh
@@ -1,10 +1,12 @@
model=Salesforce/blip2-flan-t5-xxl
checkpoint_path=/scratch/PI/tongzhang/qinglian/checkpoints/pretrained_weights/minigpt4/prerained_minigpt4_7b_converted.pth
llm_model_name_or_path=/scratch/PI/tongzhang/qinglian/checkpoints/pretrained_weights/vicuna-7b/
deepspeed examples/vis_chatbot.py --model_name_or_path ${model} --deepspeed configs/ds_config_multimodal.json --arch_type vision_encoder_decoder --task vqa --custom_model \
checkpoint_path=/home/qlianab/checkpoints/pretrained_weights/minigpt4/prerained_minigpt4_7b_converted.pth
llm_model_name_or_path=lmsys/vicuna-7b-v1.3
deepspeed_args="--master_port=12000 --include localhost:8"

deepspeed ${deepspeed_args} examples/vis_chatbot.py --model_name_or_path ${model} --deepspeed configs/ds_config_vis_chatbot.json --arch_type vision_encoder_decoder --task vqa --custom_model \
--prompt_format mini_gpt \
--prompt_structure "{input_text}###Assistant:" \
--checkpoint_path ${checkpoint_path} \
--pretrained_language_projection_path ${checkpoint_path} \
--llm_model_name_or_path ${llm_model_name_or_path} \
--low_resource True \
${@:1}
17 changes: 13 additions & 4 deletions src/lmflow/args.py
@@ -284,9 +284,9 @@ class VisModelArguments(ModelArguments):
default=False,
metadata={"help": "flag for the model from huggingface or not"}
)
checkpoint_path: str = field(
pretrained_language_projection_path: str = field(
default=None,
metadata={"help": "path for model checkpoint"}
metadata={"help": "path for model pretrained_language_projection_path"}
)
custom_vision_model: bool = field(
default=False,
@@ -336,8 +336,14 @@ class VisModelArguments(ModelArguments):
default=-2,
metadata={"help": "Which layer to select in vision model."},
)


llava_pretrain_model_path: Optional[str] = field(
default=None,
metadata={"help": "Path to llava pretrained model."},
)
save_pretrain_model_path: Optional[str] = field(
default=None,
metadata={"help": "Path to pretrained model."},
)

@dataclass
class DatasetArguments:
@@ -515,6 +521,9 @@ class MultiModalDatasetArguments(DatasetArguments):
use_image_start_end: Optional[bool] = field(
default=True, metadata={"help": "Flag for the modality type."}
)
sep_style: Optional[str] = field(
default="plain", metadata={"help": "Sep style in multi_modality dataset."}
)


