# Experiment 01
## with visual interpretation
### Richardson et. al (2002) Experiment 01 with **Vision** Language Models instead of Humans

The subjects were presented with a single page,
containing a list of the verbs and four pictures, labelled A to
D. Each one contained a circle and a square aligned along a
vertical or horizontal axis, connected by an arrow pointing
up, down, left or right. Since we didn't expect any
interesting item variation between left or right placement of
the circle or square, the horizontal schemas differed only in
the direction of the arrow.
For each sentence, subjects were asked to select one of
the four sparse images that best depicted the event described
by the sentence (Figure 1)
The items were randomised in three different orders, and
crossed with two different orderings of the images. The six
lists were then distributed randomly to subjects.



### Setup of Experimental Environment

In [1]:
from transformers import InstructBlipProcessor, InstructBlipForConditionalGeneration
from huggingface_hub import hf_hub_download
import torch, random
from PIL import Image

local_path = "/mounts/data/corp/huggingface/"
gpu_model_1 = "cuda:6"
gpu_model_2 = "cuda:7"

def convert_to_float(value):
    try:
        return float(value)
    except ValueError:
        return value

  from .autonotebook import tqdm as notebook_tqdm


### Loading preprocessed data by Richardson

Creates three dictionaries:
 * `richardson_data`
 
 All choices as vectors, e.g. `{'fled': [7.2, 4.2, 80.8, 7.8], 'pointed at': [7.2, 3.6, 0.0, 89.2] ...`
 
 * `richardson_categorial`
 
 Maximum choice as binary choice, e.g. `{'fled': [0, 0, 1, 0], 'pointed at': [0, 0, 0, 1] ...`
 
 * `richardson_normed`
 
 Maximum choice divided by all choices, disregarding all other choices, e.g.  `{'fled': [0.0, 0.0, 0.808, 0.0], 'pointed at': [0.0, 0.0, 0.0, 0.892] ...`
 

In [2]:
with open("../../data/richardson_actions.txt", "r") as d_in:
    lines = [line.split() for line in d_in.readlines()]

output = []
for entry in lines:
    new_entry = [convert_to_float(item) for item in entry]
    
    if isinstance(new_entry[1],str):
        new_entry[0] = " ".join(new_entry[:2])
        del new_entry[1]
    output.append(new_entry)

richardson_data = dict()
for elem in output:
    richardson_data[elem[0]] = [i for i in elem[1:]]

# Randomizing Richardson's data
action_words = list(richardson_data.keys())
random.shuffle(action_words)

richardson_categorial = dict()
for k, v in richardson_data.items():
    if k == 0:
        continue
    vals = [0,0,0,0]
    vals[v.index(max(v))] = 1

    richardson_categorial[k] = vals
richardson_normed = dict()

for action, values in richardson_data.items():
    if action == 0:
        continue
    
    richardson_normed[action] = [round(val/sum(values),4) for val in values]

print(richardson_normed)

{'fled': [0.072, 0.042, 0.808, 0.078], 'pointed at': [0.072, 0.036, 0.0, 0.892], 'pulled': [0.06, 0.054, 0.754, 0.132], 'pushed': [0.072, 0.036, 0.012, 0.88], 'walked': [0.0905, 0.0362, 0.2412, 0.6322], 'hunted': [0.0959, 0.2038, 0.018, 0.6823], 'impacted': [0.072, 0.371, 0.03, 0.527], 'perched': [0.12, 0.76, 0.066, 0.054], 'showed': [0.1499, 0.0899, 0.1019, 0.6583], 'smashed': [0.036, 0.665, 0.012, 0.287], 'bombed': [0.048, 0.868, 0.018, 0.066], 'flew': [0.377, 0.443, 0.15, 0.03], 'floated': [0.329, 0.563, 0.078, 0.03], 'lifted': [0.874, 0.096, 0.024, 0.006], 'sank': [0.2218, 0.7183, 0.042, 0.018], 'argued with': [0.1139, 0.1379, 0.1259, 0.6224], 'gave to': [0.084, 0.096, 0.012, 0.808], 'offended': [0.09, 0.317, 0.246, 0.347], 'rushed': [0.1025, 0.1085, 0.2352, 0.5538], 'warned': [0.1079, 0.2218, 0.0599, 0.6104], 'owned': [0.0539, 0.5564, 0.1858, 0.2038], 'regretted': [0.1978, 0.2398, 0.4126, 0.1499], 'rested': [0.144, 0.365, 0.401, 0.09], 'tempted': [0.168, 0.114, 0.455, 0.263], 'wan

## Choosing model

### Loading InstructBlip-Vicuna7b

In [3]:
model_instructblip = InstructBlipForConditionalGeneration.from_pretrained(local_path+"instructblip-vicuna/instructblip-vicuna-7b")
processor_instructblip = InstructBlipProcessor.from_pretrained(local_path+"instructblip-vicuna/instructblip-vicuna-7b")
model_instructblip.to(gpu_model_1);

In [3]:
# auto : tau 30sec to load 
model_instructblip = InstructBlipForConditionalGeneration.from_pretrained(local_path+"instructblip-vicuna/instructblip-vicuna-7b", device_map="auto")
processor_instructblip = InstructBlipProcessor.from_pretrained(local_path+"instructblip-vicuna/instructblip-vicuna-7b", device_map="auto")

The model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function.
Loading checkpoint shards: 100%|██████████| 4/4 [00:23<00:00,  5.87s/it]


### Loading OpenFlamingo-9B-vitl

In [4]:
from open_flamingo import create_model_and_transforms
model_flamingo, image_processor, tokenizer = create_model_and_transforms(clip_vision_encoder_path="ViT-L-14",clip_vision_encoder_pretrained="openai",lang_encoder_path="anas-awadalla/mpt-7b",tokenizer_path="anas-awadalla/mpt-7b",cross_attn_every_n_layers=4)
checkpoint_path = hf_hub_download("openflamingo/OpenFlamingo-9B-vitl-mpt7b", "checkpoint.pt")
model_flamingo.load_state_dict(torch.load(checkpoint_path), strict=False)
#model_flamingo.to(gpu_model_2) 

# 7min 

Using pad_token, but it is not set yet.


You are using config.init_device='cpu', but you can also use config.init_device="meta" with Composer + FSDP for fast initialization.


Loading checkpoint shards: 100%|██████████| 3/3 [03:56<00:00, 78.85s/it]


Flamingo model initialized with 1384781840 trainable parameters


_IncompatibleKeys(missing_keys=['vision_encoder.class_embedding', 'vision_encoder.positional_embedding', 'vision_encoder.proj', 'vision_encoder.conv1.weight', 'vision_encoder.ln_pre.weight', 'vision_encoder.ln_pre.bias', 'vision_encoder.transformer.resblocks.0.ln_1.weight', 'vision_encoder.transformer.resblocks.0.ln_1.bias', 'vision_encoder.transformer.resblocks.0.attn.in_proj_weight', 'vision_encoder.transformer.resblocks.0.attn.in_proj_bias', 'vision_encoder.transformer.resblocks.0.attn.out_proj.weight', 'vision_encoder.transformer.resblocks.0.attn.out_proj.bias', 'vision_encoder.transformer.resblocks.0.ln_2.weight', 'vision_encoder.transformer.resblocks.0.ln_2.bias', 'vision_encoder.transformer.resblocks.0.mlp.c_fc.weight', 'vision_encoder.transformer.resblocks.0.mlp.c_fc.bias', 'vision_encoder.transformer.resblocks.0.mlp.c_proj.weight', 'vision_encoder.transformer.resblocks.0.mlp.c_proj.bias', 'vision_encoder.transformer.resblocks.1.ln_1.weight', 'vision_encoder.transformer.resbloc

## Choose specifc GPU for model

In [None]:
# Define the GPU ID you want to use
gpu_id = 6

# Use the torch.cuda.device() context manager to set the current GPU
with torch.cuda.device(gpu_id):
    tokenizer = AutoTokenizer.from_pretrained(model_type+"/"+model_name, use_auth_token=True)
    model = AutoModelForCausalLM.from_pretrained(server_model_path+model_type+"/"+model_name, use_auth_token=True).to(torch.device("cuda"))

## Loading model

In [4]:
gpu_id = None
tokenizer = AutoTokenizer.from_pretrained(model_type+"/"+model_name, use_auth_token=True, device_map="auto")
model = AutoModelForCausalLM.from_pretrained(server_model_path+model_type+"/"+model_name, use_auth_token=True, device_map="auto")

The model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

## Vision-Lanuage Experiment

In [9]:
model_choices = dict()
arrows = ['UP', 'DOWN', 'LEFT', 'RIGHT']

img_path = "../../data/schema-choice.png"
img_01 = Image.open(img_path).convert("RGB")
vision_x = [image_processor(img_01).unsqueeze(0)]
vision_x = torch.cat(vision_x, dim=0)
vision_x = vision_x.unsqueeze(1).unsqueeze(0)#.to(gpu_model_2)
tokenizer.padding_side = "left"

for action_word in action_words:
    print(action_word)

    prompt = "Select one of the four images (A, B, C, D) that best depicts the event 'circle "+action_word+" square'"

    lang_x = tokenizer(
        ["<image>"+prompt],
        return_tensors="pt",
    )#.to(gpu_model_2)

    generated_text = model_flamingo.generate(
        vision_x=vision_x,
        lang_x=lang_x["input_ids"],
        attention_mask=lang_x["attention_mask"],
        max_new_tokens=50,
        do_sample=True,
        num_beams=3
    )

    print(generated_text)

walked


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


tensor([[50278, 10004,   581,   273,   253,  1740,  3888,   313,    34,    13,
           378,    13,   330,    13,   399,    10,   326,  1682, 31444,   253,
          2362,   686, 23118,  7428,  6278,     8, 50277]])
owned


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


tensor([[50278, 10004,   581,   273,   253,  1740,  3888,   313,    34,    13,
           378,    13,   330,    13,   399,    10,   326,  1682, 31444,   253,
          2362,   686, 23118,  9633,  6278,     8, 50277]])
wanted


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


tensor([[50278, 10004,   581,   273,   253,  1740,  3888,   313,    34,    13,
           378,    13,   330,    13,   399,    10,   326,  1682, 31444,   253,
          2362,   686, 23118,  3078,  6278,     8, 50277]])
pointed at


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


tensor([[50278, 10004,   581,   273,   253,  1740,  3888,   313,    34,    13,
           378,    13,   330,    13,   399,    10,   326,  1682, 31444,   253,
          2362,   686, 23118,  8042,   387,  6278,     8, 50277]])
argued with


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


tensor([[50278, 10004,   581,   273,   253,  1740,  3888,   313,    34,    13,
           378,    13,   330,    13,   399,    10,   326,  1682, 31444,   253,
          2362,   686, 23118,  9125,   342,  6278,     8,   285,  3662,   253,
          1953,    15, 50277]])
flew


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


tensor([[50278, 10004,   581,   273,   253,  1740,  3888,   313,    34,    13,
           378,    13,   330,    13,   399,    10,   326,  1682, 31444,   253,
          2362,   686, 23118, 18811,  6278,     8, 50277]])
rested


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


tensor([[50278, 10004,   581,   273,   253,  1740,  3888,   313,    34,    13,
           378,    13,   330,    13,   399,    10,   326,  1682, 31444,   253,
          2362,   686, 23118, 27001,  6278,     8, 50277]])
pushed


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


tensor([[50278, 10004,   581,   273,   253,  1740,  3888,   313,    34,    13,
           378,    13,   330,    13,   399,    10,   326,  1682, 31444,   253,
          2362,   686, 23118, 10184,  6278,     8, 50277]])
hunted


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


tensor([[50278, 10004,   581,   273,   253,  1740,  3888,   313,    34,    13,
           378,    13,   330,    13,   399,    10,   326,  1682, 31444,   253,
          2362,   686, 23118, 47214,  6278,     8,   275,   253, 10659,    15,
         50277]])
pulled


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


tensor([[50278, 10004,   581,   273,   253,  1740,  3888,   313,    34,    13,
           378,    13,   330,    13,   399,    10,   326,  1682, 31444,   253,
          2362,   686, 23118,  7320,  6278,     8,   347,   352,  5369,  3502,
           275,   673,    15, 50277]])
showed


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


tensor([[50278, 10004,   581,   273,   253,  1740,  3888,   313,    34,    13,
           378,    13,   330,    13,   399,    10,   326,  1682, 31444,   253,
          2362,   686, 23118,  2692,  6278,     8,   275,   253,  1563,  3425,
            15, 50277]])
obeyed


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


tensor([[50278, 10004,   581,   273,   253,  1740,  3888,   313,    34,    13,
           378,    13,   330,    13,   399,    10,   326,  1682, 31444,   253,
          2362,   686, 23118, 20090,   264,  6278,     8, 50277]])
hoped


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


tensor([[50278, 10004,   581,   273,   253,  1740,  3888,   313,    34,    13,
           378,    13,   330,    13,   399,    10,   326,  1682, 31444,   253,
          2362,   686, 23118, 13937,  6278,     8, 50277]])
increased


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


tensor([[50278, 10004,   581,   273,   253,  1740,  3888,   313,    34,    13,
           378,    13,   330,    13,   399,    10,   326,  1682, 31444,   253,
          2362,   686, 23118,  2559,  6278,     8, 50277]])
rushed


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


tensor([[50278, 10004,   581,   273,   253,  1740,  3888,   313,    34,    13,
           378,    13,   330,    13,   399,    10,   326,  1682, 31444,   253,
          2362,   686, 23118, 20906,  6278,     8,   432,   253,  4677,  1840,
            15, 50277]])
lifted


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


tensor([[50278, 10004,   581,   273,   253,  1740,  3888,   313,    34,    13,
           378,    13,   330,    13,   399,    10,   326,  1682, 31444,   253,
          2362,   686, 23118, 14287,  6278,     8,   275,   253, 10659,    15,
         50277]])
impacted


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


tensor([[50278, 10004,   581,   273,   253,  1740,  3888,   313,    34,    13,
           378,    13,   330,    13,   399,    10,   326,  1682, 31444,   253,
          2362,   686, 23118, 27857,  6278,     8, 50277]])
fled


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


KeyboardInterrupt: 

### Run experiment (inludes prompt)


In [9]:
model_choices = dict()
arrows = ['UP', 'DOWN', 'LEFT', 'RIGHT']

for action_word in tqdm(action_words):

    if action_word == 0:
        continue

    ### PROMPT DEFINED HERE
    friendly_prompt = "Select the CONCEPT that best represents the event described by the sentence: "+action_word+". CONCEPTS: UP, DOWN, LEFT, RIGHT.\nThe best representation is CONCEPT:"
    
    if gpu_id:
        input_ids = tokenizer.encode(friendly_prompt, return_tensors="pt").to(torch.device("cuda"))
        max_length = input_ids.size(1)  + 20
        output = model.generate(input_ids, max_length=max_length, num_return_sequences=1).to(torch.device("cuda"))
    else:
        input_ids = tokenizer.encode(friendly_prompt, return_tensors="pt")
        max_length = input_ids.size(1)  + 20
        output = model.generate(input_ids, max_length=max_length, num_return_sequences=1)
    generated_answer = tokenizer.decode(output[0], skip_special_tokens=True)  
    
    model_choices[action_word] = []
    for idx, arrow in enumerate(arrows):
        if arrow in generated_answer[len(friendly_prompt):]:
            model_choices[action_word].append(1)
        else:
            model_choices[action_word].append(0)

# Llama-70b: XXXX
# Llama-13b: XXXX
# Llama-7b:  50s

100%|██████████| 30/30 [00:51<00:00,  1.73s/it]


### Storing experimental results from free-form generation as `exp01a_`

In [6]:
to_store = input("Should the result for "+model_name+" be stored? (y/n):")

if to_store == "y":

    with open("results/exp01b_"+model_name+".txt", "w") as f_out:
        f_out.write("Action\tUP\tDOWN\tLEFT\tRIGHT\n")
        for k,v in model_choices.items():
            f_out.write(k+"\t"+"\t".join([str(x) for x in v])+"\n")


    

In [7]:
model_choices

{'walked': [1, 0, 0, 0],
 'perched': [1, 0, 0, 0],
 'respected': [1, 0, 0, 0],
 'fled': [1, 0, 0, 0],
 'pulled': [0, 1, 0, 0],
 'wanted': [1, 0, 0, 0],
 'pushed': [0, 1, 0, 0],
 'argued with': [1, 0, 0, 0],
 'obeyed': [1, 0, 0, 0],
 'showed': [1, 0, 0, 0],
 'sank': [0, 1, 0, 0],
 'lifted': [1, 0, 0, 0],
 'regretted': [1, 0, 0, 0],
 'gave to': [1, 0, 0, 0],
 'pointed at': [1, 0, 0, 0],
 'succeeded': [1, 0, 0, 0],
 'impacted': [1, 0, 0, 0],
 'owned': [1, 0, 0, 0],
 'smashed': [1, 0, 0, 0],
 'increased': [1, 0, 0, 0],
 'floated': [1, 0, 0, 0],
 'bombed': [0, 1, 0, 0],
 'hunted': [1, 0, 0, 0],
 'tempted': [1, 0, 0, 0],
 'hoped': [1, 0, 0, 0],
 'rushed': [1, 0, 0, 0],
 'flew': [1, 0, 0, 0],
 'rested': [1, 0, 0, 0],
 'offended': [1, 0, 0, 0],
 'warned': [1, 0, 0, 0]}