# Initialization of LLaVA

Make sure `<image>` is one of the tokenizer's special tokens: it must encode to a single token id.

In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Directory of the Qwen2 tokenizer that must know the `<image>` placeholder token.
modify_qwen_tokenizer_dir = "/root/MEGL/Qwen2-1.5B-Instruct"
modify_qwen_tokenizer = AutoTokenizer.from_pretrained(modify_qwen_tokenizer_dir)

# Register `<image>` as a special token when the tokenizer on disk does not
# contain it yet, and persist the change so that any later reload of this
# directory sees the same vocabulary. Without this step `<image>` would be
# split into several ordinary sub-tokens and `encode` would return more than
# one id. When the token is already present nothing is modified.
if "<image>" not in modify_qwen_tokenizer.get_vocab():
    modify_qwen_tokenizer.add_tokens(["<image>"], special_tokens=True)
    modify_qwen_tokenizer.save_pretrained(modify_qwen_tokenizer_dir)

# Displayed as the cell output; a correctly registered token yields one id.
modify_qwen_tokenizer.encode("<image>")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


[151646]

In [2]:
# Local checkpoint directories: the language model first, then the CLIP
# vision encoder.
qwen_model_name_or_path = "/root/MEGL/Qwen2-1.5B-Instruct"
clip_model_name_or_path = "/root/MEGL/openai/clip-vit-large-patch14-336"

In [3]:
from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, AutoProcessor

# Load the pretrained CLIP model; only its `vision_model` part is reused later.
clip_model = AutoModel.from_pretrained(
    clip_model_name_or_path,
    device_map="cuda:0",
)

# Load the pretrained causal language model onto the same device.
llm_model = AutoModelForCausalLM.from_pretrained(
    qwen_model_name_or_path,
    device_map="cuda:0",
)

In [4]:
# Tokenizer that is saved alongside the LLaVA model later on. The cell output
# shows the id(s) that `<image>` is encoded to.
llm_tokenizer = AutoTokenizer.from_pretrained(qwen_model_name_or_path)
llm_tokenizer.encode(text="<image>")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


[151646]

Initialize LLaVA from a `LlavaConfig`

In [6]:
from transformers import LlavaConfig, LlavaForConditionalGeneration

# Reuse the configurations of the models loaded above: the CLIP vision
# model for the image side and the language model for the text side.
vision_config = clip_model.vision_model.config
text_config = llm_model.config

# Combine both into a single LLaVA configuration ...
configuration = LlavaConfig(vision_config=vision_config, text_config=text_config)

# ... and build a LLaVA model from that configuration. The pretrained
# sub-modules are attached to it in a later cell.
model = LlavaForConditionalGeneration(configuration)

Load the weights of CLIP and the LLM into the LLaVA model, and set `pad_token_id` and `image_token_index` in the model config from the tokenizer

In [7]:
# Swap the sub-modules that were created from the configuration for the
# pretrained CLIP vision model and the pretrained language model.
model.vision_tower.vision_model = clip_model.vision_model
model.language_model = llm_model

# Look the placeholder up by its token string instead of taking the first id
# returned by `encode("<image>")`: `encode` may prepend special tokens or
# split an unregistered `<image>` into sub-tokens, in which case `[0]` would
# silently select the wrong id.
image_token_id = llm_tokenizer.convert_tokens_to_ids("<image>")
if image_token_id is None or image_token_id == llm_tokenizer.unk_token_id:
    raise ValueError("`<image>` is not a single token in the tokenizer vocabulary")

# Keep the model config in sync with the tokenizer that is saved with it.
model.config.pad_token_id = llm_tokenizer.pad_token_id
model.config.image_token_index = image_token_id

Save the model and the processor

In [8]:
import shutil

# The model (weights + config) and the LLM tokenizer share one directory.
model.save_pretrained("show_model/model001")
llm_tokenizer.save_pretrained("show_model/model001")

# The CLIP processor is written to its own directory — presumably so that the
# files it saves do not clash with the LLM tokenizer files saved above.
autoprocessor = AutoProcessor.from_pretrained(clip_model_name_or_path)
autoprocessor.save_pretrained("show_model/model002")

# The inference section loads the processor from `model001`, which therefore
# needs the image preprocessing config. Copy it over here so the notebook
# runs top to bottom without a manual file move.
shutil.copy(
    "show_model/model002/preprocessor_config.json",
    "show_model/model001/preprocessor_config.json",
);

[]

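Note: `LlavaProcessor.from_pretrained` in the next section needs `preprocessor_config.json` inside `model001`. If the file is not there yet, copy or move it from `model002` to `model001`.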

# Test of LLaVA Inference

In [9]:
from transformers import LlavaProcessor, LlavaForConditionalGeneration
import torch


# Directory holding the saved LLaVA model, tokenizer and
# `preprocessor_config.json`.
model_name_or_path = "show_model/model001"
# Alternative checkpoint:
# model_name_or_path = "test_model_copy/model001"

# Processor (tokenizer + image preprocessing) for the saved model.
llava_processor = LlavaProcessor.from_pretrained(model_name_or_path)

# Reload the saved model in bfloat16 on the first GPU.
model = LlavaForConditionalGeneration.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.bfloat16,
    device_map="cuda:0",
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [10]:
from PIL import Image

# --- Test image -------------------------------------------------------------
image_path = "/root/MEGL/Dataset/Action_Dataset/images/000003072.jpg"
image = Image.open(image_path)

# --- Prompt -----------------------------------------------------------------
# The user turn starts with the `<image>` placeholder, followed by the question.
prompt_text = "<image>\nWhat are these?"
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": prompt_text},
]
# Render the conversation with the tokenizer's chat template as plain text and
# append the generation prompt for the assistant turn.
prompt = llava_processor.tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=False,
)

# --- Model inputs -----------------------------------------------------------
inputs = llava_processor(text=prompt, images=image, return_tensors="pt")
# Move every tensor produced by the processor to the model's device.
for tk in list(inputs.keys()):
    inputs[tk] = inputs[tk].to(model.device)

# --- Generation -------------------------------------------------------------
generate_ids = model.generate(**inputs, max_new_tokens=20)
# Special tokens are kept so the printed text shows the full chat template.
decoded_texts = llava_processor.batch_decode(
    generate_ids,
    skip_special_tokens=False,
    clean_up_tokenization_spaces=False,
)
gen_text = decoded_texts[0]

print(gen_text)

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


<|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
<image>
What are these?<|im_end|>
<|im_start|>assistant
These are Korean words.<|im_end|>
