[Features] Support multi_modal training #628

Merged
merged 14 commits on Sep 6, 2023
17 changes: 17 additions & 0 deletions configs/ds_config_vis_chatbot.json
@@ -0,0 +1,17 @@
{
"fp16": {
"enabled": false
},
"bf16": {
"enabled": false
},
"comms_logger": {
"enabled": false,
"verbose": false,
"prof_all": false,
"debug": false
},
"steps_per_print": 20000000000000000,
"train_micro_batch_size_per_gpu": 1,
"wall_clock_breakdown": false
}
2 changes: 1 addition & 1 deletion examples/finetune_multi_modal.py
@@ -58,7 +58,7 @@ def main():
data_args=data_args,
pipeline_args=pipeline_args,
)
model = AutoModel.get_model(model_args, tune_strategy='none',
model = AutoModel.get_model(model_args, tune_strategy='finetune',
ds_config=pipeline_args.deepspeed,
custom_model=True)
for param in model.backend_model.language_model.parameters():
44 changes: 34 additions & 10 deletions examples/vis_chatbot.py
Collaborator Author

The reason to check whether to use deepspeed is that loading the model in 8-bit while using deepspeed raises an error:
huggingface/transformers#24540
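
For illustration, a minimal sketch of the gating this flag enables (not the exact LMFlow code; load_inference_model and its arguments are hypothetical names, and the 8-bit path assumes bitsandbytes is installed):

import deepspeed
from transformers import AutoModelForCausalLM

def load_inference_model(model_name, low_resource=False, with_deepspeed=True):
    # 8-bit (bitsandbytes) weights cannot be wrapped by the DeepSpeed inference
    # engine (see huggingface/transformers#24540), so the flag lets callers skip
    # DeepSpeed and use the plain Hugging Face model instead.
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        load_in_8bit=low_resource,                # 8-bit path, e.g. --low_resource True
        device_map="auto" if low_resource else None,
    )
    if with_deepspeed and not low_resource:
        model = deepspeed.init_inference(model)   # full-precision path only
    return model

With low_resource=True the model is returned unwrapped, which is why run_vis_chatbot_llava.sh passes --with_deepspeed False alongside --low_resource True.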

Contributor

No problem. Thanks!

@@ -80,7 +80,12 @@ class ChatbotArguments:
"help": "whether to do the stream inference"
}
)

with_deepspeed: Optional[bool] = field(
default=True,
metadata={
"help": "whether to use deepspeed"
}
)

def main():
pipeline_name = "inferencer"
@@ -104,10 +109,11 @@ def main():
ds_config=ds_config,
device=pipeline_args.device,
custom_model=model_args.custom_model,
with_deepspeed=chatbot_args.with_deepspeed,
)

data_args = DatasetArguments(dataset_path=None)
dataset = Dataset(data_args)
dataset = Dataset(data_args, backend="dict")

inferencer = AutoPipeline.get_pipeline(
pipeline_name=pipeline_name,
@@ -140,13 +146,21 @@ def main():
# " unconditionally."
# )

sep = "###"

end_string = chatbot_args.end_string
if chatbot_args.prompt_format == "mini_gpt":
context = "Give the following image: <Img>ImageContent</Img>. " + "You will be able to see the image once I provide it to you. Please answer my questions."
user_name = "Human"
sep = "###"

elif chatbot_args.prompt_format == "llava":
context = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions."
user_name = "USER"
sep = " "
else:
context = ""
user_name = ""
sep = "###"
prompt_structure = chatbot_args.prompt_structure

# Load image and input text for reasoning
@@ -162,7 +176,9 @@
if chatbot_args.task == "image_caption" and len(input_text) == 0:
input_text = "a photography of"
if chatbot_args.prompt_format == "mini_gpt":
context += sep + "Human: " + "<Img><ImageHere></Img> "
context += sep + user_name + ": " + "<Img><ImageHere></Img> "
elif chatbot_args.prompt_format == "llava":
context += sep + user_name + ": " + "<image>\n"

# this flag is for determining if we need to add the ###Human: prompt
# if text after loading image, we add it when loading image
@@ -179,7 +195,7 @@ def main():
input_dataset = dataset.from_dict({
"type": "image_text",
"instances": [{"images": np.stack(image_list),
"text": input_text,}]
"text": input_text,}]
})
output = inferencer.inference(model, input_dataset)
print(output.backend_dataset['text'])
@@ -200,7 +216,12 @@ def main():
# batch of image with different shape
raw_image = raw_image.resize(base_size)
image_list.append(np.array(raw_image))
context += sep + "Human: " + "<Img><ImageHere></Img> "
if chatbot_args.prompt_format == "mini_gpt":
context += sep + user_name + ": " + "<Img><ImageHere></Img> "
elif chatbot_args.prompt_format == "llava":
context += sep + user_name + ": " + "<image>\n"
else:
raise NotImplementedError
text_after_loading_image = True
print("Finish loading image with path {}".format(image_path))
continue
@@ -213,8 +234,7 @@
continue

if text_after_loading_image is False:
if chatbot_args.prompt_format == "mini_gpt":
context += sep + "Human: "
context += sep + user_name + ": "
else:
text_after_loading_image = False

@@ -229,14 +249,18 @@
"instances": [{"images": np.stack(image_list),
"text": context,}]
})
remove_image_flag = chatbot_args.prompt_format=="mini_gpt"
if chatbot_args.prompt_format in ["mini_gpt", "llava"]:
remove_image_flag = True
else:
remove_image_flag = False
begin_time = time.time()
if not chatbot_args.stream_inference:
# directly inference the results
output_dataset = inferencer.inference(
model,
input_dataset,
remove_image_flag=remove_image_flag)
remove_image_flag=remove_image_flag,
prompt_format=chatbot_args.prompt_format,)
response = output_dataset.backend_dataset['text']
print(response[0])
print("\n", end="")
@@ -8,7 +8,7 @@ model_name_or_path=Salesforce/blip2-flan-t5-xxl
dataset_path=/home/qlianab/data1/llm/CC3M-Pretrain-595K/cc3m_595k.json
image_folder=/home/qlianab/data1/llm/CC3M-Pretrain-595K/images
output_dir=output_models/finetune
deepspeed_args="--master_port=12000 --include localhost:8"
deepspeed_args="--master_port=12000 --include localhost:9"

while [[ $# -ge 1 ]]; do
key="$1"
@@ -55,7 +55,6 @@ deepspeed ${deepspeed_args} \
--custom_vision_model True \
--llm_model_name_or_path lmsys/vicuna-7b-v1.5 \
--image_aspect_ratio None \
--num_train_epochs 1 \
--fp16 True \
--learning_rate 2e-5 \
--gradient_accumulation_steps 1 \
@@ -71,5 +70,6 @@
--ddp_timeout 72000 \
--save_steps 5000 \
--dataloader_num_workers 1 \
--num_train_epochs 1 \
| tee ${log_dir}/train.log \
2> ${log_dir}/train.err
77 changes: 77 additions & 0 deletions scripts/run_finetune_multi_modal_stage2.sh
@@ -0,0 +1,77 @@
#!/bin/bash
# Please run this script under ${project_id} in project directory of
# https://github.com/shizhediao/llm-ft
# COMMIT: d5fecf30ba8011067b10cf51fede53a5ab6574e4

# Parses arguments
model_name_or_path=Salesforce/blip2-flan-t5-xxl
dataset_path=/home/qlianab/data1/llm/llava_instruct_80k.json
image_folder=/home/qlianab/data1/2d_detection/coco/train2017
output_dir=output_models/finetune
deepspeed_args="--master_port=12000 --include localhost:9"

while [[ $# -ge 1 ]]; do
key="$1"
case ${key} in
-m|--model_name_or_path)
model_name_or_path="$2"
shift
;;
-d|--dataset_path)
dataset_path="$2"
shift
;;
-o|--output_model_path)
output_dir="$2"
shift
;;
--deepspeed_args)
deepspeed_args="$2"
shift
;;
*)
echo "error: unknown option \"${key}\"" 1>&2
exit 1
esac
shift
done

# Finetune
exp_id=finetune
project_dir=$(cd "$(dirname $0)"/..; pwd)
log_dir=${project_dir}/log/${exp_id}
mkdir -p ${output_dir} ${log_dir}

deepspeed ${deepspeed_args} \
examples/finetune_multi_modal.py \
--deepspeed configs/ds_config_multimodal.json \
--arch_type vision_encoder_decoder \
--llava_loading True \
--model_name_or_path ${model_name_or_path} \
--image_encoder_name_or_path openai/clip-vit-large-patch14 \
--pretrained_language_projection_path /home/qlianab/checkpoints/llava-336px-pretrain-vicuna-7b-v1.3/mm_projector.bin \
--dataset_path ${dataset_path} \
--output_dir ${output_dir} --overwrite_output_dir \
--image_folder ${image_folder} \
--custom_vision_model True \
--llm_model_name_or_path lmsys/vicuna-7b-v1.5 \
--image_aspect_ratio None \
--fp16 True \
--learning_rate 2e-5 \
--gradient_accumulation_steps 1 \
--per_device_train_batch_size 2 \
--learning_rate 2e-3 \
--weight_decay 0. \
--warmup_ratio 0.03 \
--lr_scheduler_type "cosine" \
--run_name finetune \
--validation_split_percentage 0 \
--logging_steps 20 \
--do_train \
--ddp_timeout 72000 \
--save_steps 5000 \
--dataloader_num_workers 1 \
--sep_style "v1" \
--num_train_epochs 1 \
| tee ${log_dir}/train.log \
2> ${log_dir}/train.err
2 changes: 1 addition & 1 deletion scripts/run_vis_chatbot_blip2.sh
@@ -1,6 +1,6 @@
model=Salesforce/blip2-opt-2.7b
deepspeed examples/vis_chatbot.py --model_name_or_path ${model} \
--deepspeed configs/ds_config_multimodal.json \
--deepspeed configs/ds_config_vis_chatbot.json \
--arch_type vision_encoder_decoder \
--task vqa \
${@:1}
4 changes: 2 additions & 2 deletions scripts/run_vis_chatbot_gradio_minigpt4.sh
@@ -37,13 +37,13 @@ fi

deepspeed --master_port=11005 examples/vis_chatbot_gradio.py \
--model_name_or_path ${model} \
--deepspeed configs/ds_config_multimodal.json \
--deepspeed configs/ds_config_vis_chatbot.json \
--arch_type vision_encoder_decoder \
--task vqa \
--custom_model \
--prompt_format mini_gpt \
--prompt_structure "###Human: {input_text}###Assistant:" \
--llm_model_name_or_path LMFlow/Full-Robin-13b-v2 \
--checkpoint_path output_models/pretrained_minigpt4_13b_converted.pth \
--pretrained_language_projection_path output_models/pretrained_minigpt4_13b_converted.pth \
--low_resource True \
--max_new_tokens 1024
17 changes: 17 additions & 0 deletions scripts/run_vis_chatbot_llava.sh
@@ -0,0 +1,17 @@
model=/home/qlianab/data1/checkpoints/llava-v1-0719-336px-lora-merge-vicuna-13b-v1.3
deepspeed_args="--master_port=12000 --include localhost:9"

deepspeed ${deepspeed_args} \
examples/vis_chatbot.py \
--deepspeed configs/ds_config_vis_chatbot.json \
--arch_type vision_encoder_decoder \
--task vqa \
--custom_model \
--model_name_or_path ${model} \
--prompt_format llava \
--prompt_structure '{input_text} ASSISTANT:' \
--low_resource True \
--llava_loading True \
--with_deepspeed False \
${@:1}

10 changes: 6 additions & 4 deletions scripts/run_vis_chatbot_minigpt4.sh
@@ -1,10 +1,12 @@
model=Salesforce/blip2-flan-t5-xxl
checkpoint_path=/scratch/PI/tongzhang/qinglian/checkpoints/pretrained_weights/minigpt4/prerained_minigpt4_7b_converted.pth
llm_model_name_or_path=/scratch/PI/tongzhang/qinglian/checkpoints/pretrained_weights/vicuna-7b/
deepspeed examples/vis_chatbot.py --model_name_or_path ${model} --deepspeed configs/ds_config_multimodal.json --arch_type vision_encoder_decoder --task vqa --custom_model \
checkpoint_path=/home/qlianab/checkpoints/pretrained_weights/minigpt4/prerained_minigpt4_7b_converted.pth
llm_model_name_or_path=lmsys/vicuna-7b-v1.3
deepspeed_args="--master_port=12000 --include localhost:8"

deepspeed ${deepspeed_args} examples/vis_chatbot.py --model_name_or_path ${model} --deepspeed configs/ds_config_vis_chatbot.json --arch_type vision_encoder_decoder --task vqa --custom_model \
--prompt_format mini_gpt \
--prompt_structure "{input_text}###Assistant:" \
--checkpoint_path ${checkpoint_path} \
--pretrained_language_projection_path ${checkpoint_path} \
--llm_model_name_or_path ${llm_model_name_or_path} \
--low_resource True \
${@:1}
17 changes: 13 additions & 4 deletions src/lmflow/args.py
@@ -284,9 +284,9 @@ class VisModelArguments(ModelArguments):
default=False,
metadata={"help": "flag for the model from huggingface or not"}
)
checkpoint_path: str = field(
pretrained_language_projection_path: str = field(
default=None,
metadata={"help": "path for model checkpoint"}
metadata={"help": "path for model pretrained_language_projection_path"}
)
custom_vision_model: bool = field(
default=False,
@@ -336,8 +336,14 @@ class VisModelArguments(ModelArguments):
default=-2,
metadata={"help": "Which layer to select in vision model."},
)


llava_pretrain_model_path: Optional[str] = field(
default=None,
metadata={"help": "Path to llava pretrained model."},
)
save_pretrain_model_path: Optional[str] = field(
default=None,
metadata={"help": "Path to pretrained model."},
)

@dataclass
class DatasetArguments:
@@ -515,6 +521,9 @@ class MultiModalDatasetArguments(DatasetArguments):
use_image_start_end: Optional[bool] = field(
default=True, metadata={"help": "Flag for the modality type."}
)
sep_style: Optional[str] = field(
default="plain", metadata={"help": "Sep style in multi_modality dataset."}
)


