# Vision-Language Model

* 4-bit quantised idefics-80b VLM (base model)
* 4-bit quantised idefics-80b VLM with instruction tuning

*Run this notebook in the `idefics_colab` conda environment.*

In [1]:
import torch
from PIL import Image
from transformers import IdeficsForVisionText2Text, AutoProcessor, BitsAndBytesConfig
import torchvision.transforms as transforms

# GPU index the model is loaded onto; pick a free one first (e.g. `gpustat -i`).
device = 7

# Checkpoint location on the shared mount.
# For a quick smoke test, swap in the tiny model instead:
#   checkpoint = "HuggingFaceM4/tiny-random-idefics"
local_path = "/mounts/data/corp/huggingface/"
checkpoint = f"{local_path}idefics/idefics-80b"

# 4-bit NF4 quantization with double quantization and fp16 compute.
# The modules named in `llm_int8_skip_modules` are skipped because they
# can't be quantized properly.
bnb_config = BitsAndBytesConfig(
    llm_int8_skip_modules=["lm_head", "embed_tokens"],
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

processor = AutoProcessor.from_pretrained(checkpoint, use_auth_token=True)

# Drop the `quantization_config` argument to load the original
# (unquantized) model instead.
model = IdeficsForVisionText2Text.from_pretrained(
    checkpoint,
    device_map={"": device},
    quantization_config=bnb_config,
)

  from .autonotebook import tqdm as notebook_tqdm
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Loading checkpoint shards: 100%|██████████| 17/17 [03:53<00:00, 13.73s/it]


In [2]:
def check_inference(model, processor, prompts, max_new_tokens=50):
    """Generate a continuation for ``prompts`` and print the decoded text.

    Args:
        model: Object exposing ``generate(**inputs, ...)`` and a ``device``
            attribute (e.g. the loaded ``IdeficsForVisionText2Text``).
        processor: Callable that turns ``prompts`` into model inputs when
            called with ``return_tensors="pt"``; must also expose
            ``tokenizer`` and ``batch_decode``.
        prompts: Prompt passed straight to ``processor`` (in this notebook a
            list interleaving text strings and PIL images).
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        None. The decoded text of the first generated sequence is printed
        with special tokens stripped.
    """
    tokenizer = processor.tokenizer

    # Forbid the image placeholder tokens from appearing in the output.
    # Bound unconditionally: the word list is a constant, so a guard around
    # this assignment could only ever leave `bad_words_ids` undefined.
    bad_words = ["<image>", "<fake_token_around_image>"]
    bad_words_ids = tokenizer(bad_words, add_special_tokens=False).input_ids

    eos_token = "</s>"
    eos_token_id = tokenizer.convert_tokens_to_ids(eos_token)

    # Move the inputs to wherever the given model lives instead of relying on
    # a notebook-level `device` global, so the function works for any model
    # passed in and does not depend on hidden kernel state.
    inputs = processor(prompts, return_tensors="pt").to(model.device)
    generated_ids = model.generate(
        **inputs,
        eos_token_id=[eos_token_id],
        bad_words_ids=bad_words_ids,
        max_new_tokens=max_new_tokens,
        early_stopping=True,
    )
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    print(generated_text)

## Run experiment

In [10]:
# Load the four direction images (direction01.png .. direction04.png) as RGB.
images = [
    Image.open(f"../../data/direction0{idx}.png").convert("RGB")
    for idx in range(1, 5)
]

# Interleave an "Image X:" label with each image, then append the question.
# Optional instruction prefix (currently disabled):
#   "Instruction: provide an answer to the question. Use the image to answer.\n"
prompts = []
for label, image in zip("ABCD", images):
    prompts.extend([f"Image {label}:", image])
prompts.append(
    "Question: Which one of the images (A-D) describes the event 'arrow down' best? Answer:"
)

check_inference(model, processor, prompts, max_new_tokens=5)


Image A: Image B: Image C: Image D: Question: Which one of the images (A-D) describes the event 'arrow down' best? Answer: Image A

Question


In [19]:
# Load the four direction images (direction01.png .. direction04.png) as RGB.
image_paths = ["../../data/direction0%d.png" % number for number in range(1, 5)]
images = [Image.open(path).convert("RGB") for path in image_paths]

# Prompt layout: "Image A:", <image>, ..., "Image D:", <image>, question.
# Optional instruction prefix (currently disabled):
#   "Instruction: provide an answer to the question. Use the image to answer.\n"
question = "Question: Give a one sentence description for each image. The description should focus on the differences. Answer:"
prompts = [
    part
    for letter, picture in zip(("A", "B", "C", "D"), images)
    for part in ("Image " + letter + ":", picture)
]
prompts.append(question)

check_inference(model, processor, prompts, max_new_tokens=60)


Image A: Image B: Image C: Image D: Question: Give a one sentence description for each image. The description should focus on the differences. Answer:

Image A: The object is moving to the right.

Image B: The object is moving to the left.

Image C: The object is moving up and down.

Image D: The object is moving up and down.

Question: What is the difference
