In [1]:
import open_flamingo
from open_flamingo import create_model_and_transforms
from huggingface_hub import hf_hub_download
import torch
from PIL import Image
import requests


In [2]:
model, image_processor, tokenizer = create_model_and_transforms(
    clip_vision_encoder_path="ViT-L-14",
    clip_vision_encoder_pretrained="openai",
    lang_encoder_path="anas-awadalla/mpt-1b-redpajama-200b-dolly",
    tokenizer_path="anas-awadalla/mpt-1b-redpajama-200b-dolly",
    cross_attn_every_n_layers=1,
    )

model.to(0, dtype=torch.bfloat16)



You are using config.init_device='cpu', but you can also use config.init_device="meta" with Composer + FSDP for fast initialization.
Flamingo model initialized with 1046992944 trainable parameters


Flamingo(
  (vision_encoder): VisionTransformer(
    (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14), bias=False)
    (patch_dropout): Identity()
    (ln_pre): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
    (transformer): Transformer(
      (resblocks): ModuleList(
        (0-23): 24 x ResidualAttentionBlock(
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ls_1): Identity()
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ls_2): Identity()
        )
      )
    )
    (ln_post): LayerNorm((1024,), eps=1e-05, elementwis

In [3]:
def qa_prompt(question, answer=None) -> str:
        return f"<image>Question:{question} Short answer:{answer if answer is not None else ''}{'<|endofchunk|>' if answer is not None else ''}"

In [4]:
open_image_url = lambda url: Image.open(requests.get(url, stream=True).raw)

def image_preprocess_batch(images: list)->torch.Tensor:
    vision_x = [image_processor(image).unsqueeze(0) for image in images]
    vision_x = torch.cat(vision_x, dim=0)
    vision_x = vision_x.unsqueeze(1).unsqueeze(0)
    return vision_x.cuda()

def text_processor_factory(tokenizer, padding_side):
    tokenizer.padding_side = padding_side
    return lambda text: tokenizer([text], return_tensors='pt')

In [5]:
import random

def make_in_context(d, shots=1):
    images = []
    prompt = ''
    for _ in range(shots):
        idx = int(random.random() * len(d))
        image = d[idx]['image']
        question = d[idx]['question']
        answers = d[idx]['answers'][0]

        images.append(image)
        prompt += qa_prompt(question, answers) + "<|endofchunk|>" +'\n'
        if shots == 0:
                prompt = prompt.replace("<image>", "")
    return images, prompt

def postprocess_vqa_generation(predictions):
    answer = re.split("Question|Answer|Short", predictions, 1)[0]
    answer = re.split(", ", answer, 1)[0]
    return answer

In [15]:
import json, tqdm, re
from torch.utils.data import DataLoader

def vqa_evaluate(model, tokenizer, dataset, early_stop=None, shots=1):
    model.eval()
    if early_stop is None:
        early_stop = len(dataset)
    results = []
    counter = 0
    for q in tqdm.tqdm(dataset):
        counter += 1
        if counter > early_stop:
            break

        image_in_context, text_in_context = make_in_context(dataset, shots=shots)

        image = q['image']
        qestion = q['question']
        answers = q['answers']
        image_tokens = image_preprocess_batch(image_in_context + [image])
        text_tokens = text_processor_factory(tokenizer, "left")(text_in_context+qa_prompt(qestion))

        #print(text_in_context+qa_prompt(qestion))

        output_tokens = model.generate(
            image_tokens.to(0, dtype=torch.bfloat16),
            text_tokens['input_ids'].to(0),
            attention_mask=text_tokens['attention_mask'].to(0, dtype=torch.bfloat16),
            max_new_tokens=5,
            num_beams=3,
            pad_token_id=50277
            )
        outputs = output_tokens[:, len(text_tokens['input_ids'][0]) :]

        y_hat = tokenizer.decode(outputs[0])
        y_hat = postprocess_vqa_generation(y_hat)

        results.append({
            "answer": y_hat,
            "question_id": q["question_id"]
        })
    return results

def results2json(results, name=''):
    json.dump(results, open(f'{VQA_DATA_DIR}{name}results.json', 'w'))


In [19]:
model = torch.nn.DataParallel(model)

In [20]:
from open_flamingo.eval.eval_datasets import VQADataset

VQA_DATA_DIR = "/external/"

d = VQADataset(
    VQA_DATA_DIR + 'val2014',
    VQA_DATA_DIR + 'v2_OpenEnded_mscoco_val2014_questions.json',
    VQA_DATA_DIR + 'v2_mscoco_val2014_annotations.json',
    is_train = False,
    dataset_name = 'vqav2'
)
results = vqa_evaluate(model, tokenizer, d, 100, shots=4)
results2json(results, 'vqa_')

  0%|                                   | 0/214354 [00:02<?, ?it/s]


AttributeError: 'DataParallel' object has no attribute 'generate'

> [0;32m/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py[0m(1614)[0;36m__getattr__[0;34m()[0m
[0;32m   1612 [0;31m            [0;32mif[0m [0mname[0m [0;32min[0m [0mmodules[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m   1613 [0;31m                [0;32mreturn[0m [0mmodules[0m[0;34m[[0m[0mname[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m-> 1614 [0;31m        raise AttributeError("'{}' object has no attribute '{}'".format(
[0m[0;32m   1615 [0;31m            type(self).__name__, name))
[0m[0;32m   1616 [0;31m[0;34m[0m[0m
[0m


ipdb>  u


> [0;32m/tmp/ipykernel_2270/1512339583.py[0m(25)[0;36mvqa_evaluate[0;34m()[0m
[0;32m     23 [0;31m        [0;31m#print(text_in_context+qa_prompt(qestion))[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     24 [0;31m[0;34m[0m[0m
[0m[0;32m---> 25 [0;31m        output_tokens = model.generate(
[0m[0;32m     26 [0;31m            [0mimage_tokens[0m[0;34m.[0m[0mto[0m[0;34m([0m[0;36m0[0m[0;34m,[0m [0mdtype[0m[0;34m=[0m[0mtorch[0m[0;34m.[0m[0mbfloat16[0m[0;34m)[0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     27 [0;31m            [0mtext_tokens[0m[0;34m[[0m[0;34m'input_ids'[0m[0;34m][0m[0;34m.[0m[0mto[0m[0;34m([0m[0;36m0[0m[0;34m)[0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  model


DataParallel(
  (module): Flamingo(
    (vision_encoder): VisionTransformer(
      (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14), bias=False)
      (patch_dropout): Identity()
      (ln_pre): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (transformer): Transformer(
        (resblocks): ModuleList(
          (0-23): 24 x ResidualAttentionBlock(
            (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
            (attn): MultiheadAttention(
              (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
            )
            (ls_1): Identity()
            (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
            (mlp): Sequential(
              (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
              (gelu): QuickGELU()
              (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
            )
            (ls_2): Identity()
          )
   

ipdb>  dir(model)


['T_destination', '__annotations__', '__call__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattr__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_apply', '_backward_hooks', '_backward_pre_hooks', '_buffers', '_call_impl', '_forward_hooks', '_forward_hooks_with_kwargs', '_forward_pre_hooks', '_forward_pre_hooks_with_kwargs', '_get_backward_hooks', '_get_backward_pre_hooks', '_get_name', '_is_full_backward_hook', '_load_from_state_dict', '_load_state_dict_post_hooks', '_load_state_dict_pre_hooks', '_maybe_warn_non_full_backward_hook', '_modules', '_named_members', '_non_persistent_buffers_set', '_parameters', '_register_load_state_dict_pre_hook', '_register_state_dict_hook', '_replicate_for_data_parallel', '_save

ipdb>  q


In [17]:
from open_flamingo.eval.vqa_metric import VQA, VQAEval

v = VQA(
    f"{VQA_DATA_DIR}v2_mscoco_val2014_annotations.json",
    f"{VQA_DATA_DIR}v2_OpenEnded_mscoco_val2014_questions.json",
)
res = v.loadRes(f'{VQA_DATA_DIR}vqa_results.json', f"{VQA_DATA_DIR}v2_OpenEnded_mscoco_val2014_questions.json")

# create evaluator
evaluator = VQAEval(v, res)
evaluator.evaluate()
print(evaluator.accuracy['overall'])

loading VQA annotations and questions into memory...
0:00:06.233405
creating index...
index created!
Loading and preparing results...     
DONE (t=0.13s)
creating index...
index created!
computing accuracy
Finshed Percent: [--------------------] 0% Done computing accuracy
26.1
