## Install required packages

In [None]:
%cd /content
!git clone https://github.com/haotian-liu/LLaVA
%cd /content/LLaVA

!pip install ninja
!pip install flash-attn --no-build-isolation
!pip install -e .

## Load frameworks

In [None]:
from transformers import AutoTokenizer, BitsAndBytesConfig
from llava.model import LlavaLlamaForCausalLM
import torch
import requests
from PIL import Image
from io import BytesIO
from llava.conversation import conv_templates, SeparatorStyle
from llava.utils import disable_torch_init
from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
from llava.mm_utils import tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria
from transformers import TextStreamer

[2023-10-25 14:06:08,701] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)


## Load pretrained LLaVA

In [None]:
# Checkpoint identifier handed to both `from_pretrained` calls below.
model_path = "4bit/llava-v1.5-7b-5GB"

# All 4-bit settings live in the BitsAndBytesConfig. A separate
# `load_in_4bit=True` kwarg is redundant, and recent transformers releases
# raise a ValueError when it is combined with `quantization_config`.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type='nf4'
)
kwargs = {
    "device_map": "auto",
    "quantization_config": quantization_config,
}

model = LlavaLlamaForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, **kwargs)
# use_fast=False requests the slow (Python) tokenizer implementation.
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some weights of the model checkpoint at 4bit/llava-v1.5-7b-5GB were not used when initializing LlavaLlamaForCausalLM: ['model.vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias', 'model.vision_tower.vision_tower.vision_mod

We will use this function to load the image. The picture can be loaded either from an HTTP(S) address or from a local file.

In [None]:
def load_image(image_file, timeout=30):
    """Load an image from an HTTP(S) URL or a local path and return it in RGB mode.

    Args:
        image_file: Either a URL beginning with ``http://`` or ``https://``,
            or anything ``Image.open`` accepts as a local file.
        timeout: Seconds passed to ``requests.get`` as its timeout. Only used
            for URLs.

    Returns:
        The opened image after ``convert('RGB')``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        Other exceptions from ``requests.get`` or ``Image.open`` propagate
        unchanged.
    """
    # Match the full scheme so a local file such as "http_photo.jpg" is not
    # mistaken for a URL.
    if image_file.startswith(('http://', 'https://')):
        response = requests.get(image_file, timeout=timeout)
        # Fail on 4xx/5xx here rather than handing an error page to PIL.
        response.raise_for_status()
        source = BytesIO(response.content)
    else:
        source = image_file
    return Image.open(source).convert('RGB')

Load image preprocessor

In [None]:
# Fetch the vision tower attached to the loaded LLaVA model.
vision_tower = model.get_vision_tower()
# Load the tower's weights only if that has not happened yet.
if not vision_tower.is_loaded:
    vision_tower.load_model()
# The tower is moved to the CPU here.
# NOTE(review): presumably to save GPU memory — confirm this is intended.
vision_tower.to(device='cpu')
# Preprocessor that turns a PIL image into the 'pixel_values' tensor used later.
image_processor = vision_tower.image_processor

In [None]:
disable_torch_init()

# The checkpoint is LLaVA v1.5, so use the matching "llava_v1" template.
# "llava_v0" is the older Human/Assistant format; upstream's CLI selects
# "llava_v1" for model names containing "v1".
conv_mode = "llava_v1"
# Copy the template so the chat history does not modify the shared registry entry.
conv = conv_templates[conv_mode].copy()
roles = conv.roles

# Image the conversation is about, and its preprocessed half-precision tensor on the GPU.
image = load_image("pizza.jpeg")
image_tensor = image_processor.preprocess(image, return_tensors='pt')['pixel_values'].half().cuda()

In [None]:
# Interactive chat about the preprocessed image.
# Submit an empty line (or send EOF) to leave the loop.
while True:
    try:
        inp = input(f"{roles[0]}: ")
    except EOFError:
        inp = ""
    if not inp:
        print("exit...")
        # Actually stop here: without the break the loop went on to send an
        # empty prompt to the model and could never terminate.
        break

    print(f"{roles[1]}: ", end="")

    if image is not None:
        # First message: prepend the image placeholder token(s) to the user's text.
        if model.config.mm_use_im_start_end:
            inp = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + inp
        else:
            inp = DEFAULT_IMAGE_TOKEN + '\n' + inp
        conv.append_message(conv.roles[0], inp)
        # Clear the flag so later turns do not add the placeholder again.
        image = None
    else:
        # Later messages: plain text only.
        conv.append_message(conv.roles[0], inp)
    # Empty assistant slot; it is filled with the generated answer below.
    conv.append_message(conv.roles[1], None)
    prompt = conv.get_prompt()

    input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()
    # The stop string depends on the template's separator style.
    stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
    keywords = [stop_str]
    stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
    # Streams the answer while it is generated, without echoing the prompt.
    streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    with torch.inference_mode():
        output_ids = model.generate(
            input_ids,
            images=image_tensor,
            do_sample=True,
            temperature=0.2,
            max_new_tokens=1024,
            streamer=streamer,
            use_cache=True,
            stopping_criteria=[stopping_criteria])

    # Decode only the tokens after the prompt and store them as the assistant's
    # turn, so the next prompt contains the full history.
    outputs = tokenizer.decode(output_ids[0, input_ids.shape[1]:]).strip()
    conv.messages[-1][-1] = outputs

Human: Write a post as restaurant owner that you will publish on a social network where you invite customers to come to your restaurant and try this dish
Assistant: "Calling all pizza enthusiasts! Experience the ultimate taste of our homemade pizza with cheese, spinach, and herbs. Our pizza is made with the freshest ingredients and baked to perfection. Come and enjoy a scrumptious meal with us! #PizzaLover #FreshIngredients #Homemade"
Human: 
