In [1]:
import torch
import random
import torchvision.transforms as transforms
from sidemethods import load_richardson_data, convert_to_float
from PIL import Image
from transformers import IdeficsForVisionText2Text, AutoProcessor, BitsAndBytesConfig

# local path to model repository on our servers
server_model_path = "/mounts/data/corp/huggingface/"
# loading the original human data as vectors for each action word
# load_richardson_data() is unpacked into three values; only the third one is kept.
# NOTE(review): the meaning of the two discarded values is not visible here - see sidemethods.
_, _, richardson_normed = load_richardson_data()
# The keys of the loaded mapping are the action words iterated by the prompt loops below.
# NOTE(review): presumably a dict, in which case this is a live view rather than a copy - confirm.
action_words = richardson_normed.keys()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def check_inference(model, processor, prompts, max_new_tokens=50,
                    bad_words=("<image>", "<fake_token_around_image>")):
    """Run one generation pass for ``prompts``, print the decoded text and return it.

    Args:
        model: Object exposing ``generate(**inputs, ...)`` and a ``device``
            attribute. The inputs are moved to ``model.device`` so the function
            no longer depends on a notebook-global ``device`` variable.
        processor: Callable that turns ``prompts`` into model inputs; it must
            also expose ``tokenizer`` and ``batch_decode``.
        prompts: Passed unchanged to ``processor`` (in this notebook: a list
            interleaving PIL images and text snippets).
        max_new_tokens: Upper bound on the number of generated tokens.
        bad_words: Token strings the model must not generate. Pass an empty
            sequence to disable the restriction.

    Returns:
        The decoded text of the first generated sequence (the same string
        that is printed).
    """
    tokenizer = processor.tokenizer

    generate_kwargs = {"max_new_tokens": max_new_tokens, "early_stopping": True}
    # Only forward bad_words_ids when there is something to ban; previously the
    # variable was left unbound whenever the list was empty.
    if bad_words:
        generate_kwargs["bad_words_ids"] = tokenizer(
            list(bad_words), add_special_tokens=False
        ).input_ids

    # Stop generating as soon as the end-of-sequence token is produced.
    eos_token_id = tokenizer.convert_tokens_to_ids("</s>")

    inputs = processor(prompts, return_tensors="pt").to(model.device)
    generated_ids = model.generate(**inputs, eos_token_id=[eos_token_id], **generate_kwargs)
    # Only the first sequence of the batch is decoded and reported.
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    print(generated_text)
    return generated_text

In [6]:
from huggingface_hub import login

# Read the Hugging Face access token from the first line of a key file stored
# two directories above the notebook, so the secret is not hard-coded here.
with open("../../hf.key", "r") as f_in:
    hf_key = f_in.readline().strip()  # drop the trailing newline / surrounding whitespace

# Authenticate this session against the Hugging Face Hub with that token.
login(token = hf_key)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /mounts/Users/cisintern/pwicke/.cache/huggingface/token
Login successful


In [5]:
# Index of the GPU that receives the whole model (and, later, the inputs).
device = 2

# Hub checkpoint id; judging by its name this is a tiny, randomly initialised
# IDEFICS variant - presumably used as a quick smoke test before the big model.
checkpoint = "HuggingFaceM4/tiny-random-idefics"

processor = AutoProcessor.from_pretrained(checkpoint, use_auth_token=True)
# No quantization_config is passed here, so the weights are loaded unquantised.
model = IdeficsForVisionText2Text.from_pretrained(
    checkpoint,
    device_map={"": device},  # the empty-string key assigns every module to `device`
)

FileNotFoundError: [Errno 2] No such file or directory: 'huggingface-cli'

In [3]:
# Index of the GPU that receives the whole model (and, later, the inputs).
device = 2

# checkpoint = "HuggingFaceM4/tiny-random-idefics"
# The 80b weights are read from the shared on-disk model repository instead of the Hub.
local_path = "/mounts/data/corp/huggingface/"
checkpoint = f"{local_path}idefics/idefics-80b"

# 4-bit NF4 quantisation with double quantisation and fp16 compute.
# The modules named in llm_int8_skip_modules are skipped because they
# can't be quantized properly.
bnb_config = BitsAndBytesConfig(
    llm_int8_skip_modules=["lm_head", "embed_tokens"],
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

processor = AutoProcessor.from_pretrained(checkpoint, use_auth_token=True)
# Drop the quantization_config argument to load the original, unquantised model.
model = IdeficsForVisionText2Text.from_pretrained(
    checkpoint,
    quantization_config=bnb_config,
    device_map={"": device},  # the empty-string key assigns every module to `device`
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Loading checkpoint shards:   6%|▌         | 1/17 [00:30<08:06, 30.42s/it]


KeyboardInterrupt: 

In [19]:
# NOTE(review): img_path is not used anywhere in this cell; kept in case another cell reads it.
img_path = "test_images/put-block-in-matching-bowl_3.png"

# Prompt wordings to compare, in the order their result groups are printed.
# Previously each wording had its own copy-pasted loop.
direction_questions = [
    "Question: In the image, which direction does the arrow point towards to? Answer:",
    "Question: Which direction is depicted? Answer:",
    "Which direction does the arrow point towards to? Answer:",
    "The image shows",
]

for group_no, question in enumerate(direction_questions):
    if group_no > 0:
        print()  # blank line between result groups (none after the last group)
    for i in range(1, 5):
        # The context manager closes the file handle right away; convert() has
        # already loaded the pixel data into a new, independent RGB image.
        with Image.open("../../data/direction0" + str(i) + ".png") as image_file:
            arrow_image = image_file.convert("RGB")
        prompts = [arrow_image, question]
        check_inference(model, processor, prompts, max_new_tokens=25)



Question: In the image, which direction does the arrow point towards to? Answer: The arrow points towards the right.
Question: In the image, which direction does the arrow point towards to? Answer: The arrow points towards the right.
Question: In the image, which direction does the arrow point towards to? Answer: The arrow points towards the right.
Question: In the image, which direction does the arrow point towards to? Answer: Upwards

Question: Which direction is depicted? Answer: The arrow is pointing to the right.
Question: Which direction is depicted? Answer: The arrow is pointing to the right.
Question: Which direction is depicted? Answer: The direction of the force is from the left to the right.
Question: Which direction is depicted? Answer: The direction of the force is from the box to the person.

Which direction does the arrow point towards to? Answer: The arrow points towards the right.
Which direction does the arrow point towards to? Answer: The arrow points towards the obj

In [9]:
# NOTE(review): model_choices and arrows are initialised but never filled or read in this cell.
model_choices = {}
arrows = ["UP", "DOWN", "LEFT", "RIGHT"]

for action_word in action_words:
    print(action_word)

    # Four-letter codes used as image labels, one per prompt position.
    letters = ["KYMD", "PWTX", "YZPR", "DHNV"]

    # Present the four direction images in a fresh random order for every word.
    image_order = list(range(1, 5))
    random.shuffle(image_order)

    # Build the interleaved prompt: image, its label, image, its label, ...
    prompt = []
    for code, image_no in zip(letters, image_order):
        print(f"direction0{image_no} is {code}")
        picture = Image.open(f"../../data/direction0{image_no}.png").convert("RGB")
        prompt.extend([picture, f"Image {code}. "])

    # Closing question that asks the model to pick one of the labelled images.
    prompt.append(
        f" Question: Which of the images presents the event 'circle {action_word} square' best? Answer:"
    )
    check_inference(model, processor, prompt, max_new_tokens=15)
    print()


fled
direction02 is KYMD
direction01 is PWTX
direction04 is YZPR
direction03 is DHNV
Image KYMD. Image PWTX. Image YZPR. Image DHNV.Question: Which of the images presents the event 'circle fled square' best? Answer: Image 1.

The event 'circle fled square' is best

pointed at
direction01 is KYMD
direction04 is PWTX
direction03 is YZPR
direction02 is DHNV
Image KYMD. Image PWTX. Image YZPR. Image DHNV.Question: Which of the images presents the event 'circle pointed at square' best? Answer: Image 1.

pulled
direction02 is KYMD
direction04 is PWTX
direction03 is YZPR
direction01 is DHNV
Image KYMD. Image PWTX. Image YZPR. Image DHNV.Question: Which of the images presents the event 'circle pulled square' best? Answer: Image 1.

The event 'circle pulled square' is best

pushed
direction03 is KYMD
direction01 is PWTX
direction02 is YZPR
direction04 is DHNV
Image KYMD. Image PWTX. Image YZPR. Image DHNV.Question: Which of the images presents the event 'circle pushed square' best? Answer: Imag

KeyboardInterrupt: 

In [21]:
# NOTE(review): model_choices and arrows are initialised but never filled or read in this cell.
model_choices = {}
arrows = ["UP", "DOWN", "LEFT", "RIGHT"]

for action_word in action_words:
    print(action_word)

    # Four-letter codes used as image labels, one per prompt position.
    letters = ["KYMD", "PWTX", "YZPR", "DHNV"]

    # Random presentation order of the four direction images for this word.
    shuffled_numbers = list(range(1, 5))
    random.shuffle(shuffled_numbers)

    # Interleave each image with the label the model should use to refer to it.
    prompt = []
    for position in range(len(shuffled_numbers)):
        picture_no = str(shuffled_numbers[position])
        label = letters[position]
        print("direction0{} is {}".format(picture_no, label))

        picture_file = "../../data/direction0{}.png".format(picture_no)
        prompt += [Image.open(picture_file).convert("RGB"), "Image {}. ".format(label)]

    # Open-ended completion: the model is expected to continue with one of the codes.
    prompt += [" The event '{}' is best represented by the image with code".format(action_word)]
    check_inference(model, processor, prompt, max_new_tokens=15)
    print()


fled
direction04 is KYMD
direction01 is PWTX
direction02 is YZPR
direction03 is DHNV
Image KYMD. Image PWTX. Image YZPR. Image DHNV.The event 'fled' is best represented by the image with code 'DHNV'. The image shows a bird flying away from a

pointed at
direction04 is KYMD
direction02 is PWTX
direction01 is YZPR
direction03 is DHNV
Image KYMD. Image PWTX. Image YZPR. Image DHNV.The event 'pointed at' is best represented by the image with code 'DHNV'.

pulled
direction01 is KYMD
direction04 is PWTX
direction02 is YZPR
direction03 is DHNV
Image KYMD. Image PWTX. Image YZPR. Image DHNV.The event 'pulled' is best represented by the image with code 'DHNV'. The image shows a person pulling a rope

pushed
direction01 is KYMD
direction02 is PWTX
direction04 is YZPR
direction03 is DHNV
Image KYMD. Image PWTX. Image YZPR. Image DHNV.The event 'pushed' is best represented by the image with code DHNV. The event 'pushed' is best represented by

walked
direction02 is KYMD
direction01 is PWTX
directi