In [1]:
from scripts.model import EvalModel
import os
from scripts.datasets import SQUAD_dataset
from huggingface_hub import hf_hub_download
from scripts.download_dataset import DATASET_DIR
from scripts.custom_evaluate import evaluate_vqa
import torch
from torch.utils.data import Dataset
import json
import os
import numpy as np
from tqdm import tqdm
from PIL import Image
import torch
from scripts.download_dataset import DATASET_DIR
from scripts.download_dataset import DATASET_URL
from open_flamingo.eval.eval_datasets import VQADataset
from open_flamingo.eval.vqa_metric import (
    compute_vqa_accuracy,
    postprocess_vqa_generation,
)
from open_flamingo.eval.eval_model import BaseEvalModel
import scripts.utils as utils

%load_ext autoreload
%autoreload 2

In [2]:
CHECKPOINT_DIR = (
    "/mnt/d/models/"
    if os.environ.get("CHECKPOINT_DIR") == None
    else os.environ["CHECKPOINT_DIR"]
)

model_args = {
    "vision_encoder_path": "ViT-L-14",
    "vision_encoder_pretrained": "openai",
    "lm_path": "anas-awadalla/mpt-1b-redpajama-200b",
    "lm_tokenizer_path": "anas-awadalla/mpt-1b-redpajama-200b",
    "checkpoint_path": f"{CHECKPOINT_DIR}fine-tuned-nl-flamingo-visual/visual_checkpoint.pt",
    #"checkpoint_path": "/mnt/d/models/OpenFlamingo-3B-vitl-mpt1b/checkpoint.pt",
    "cross_attn_every_n_layers": 1,
    "precision": "bf16",
    "device": 0,
}

model = EvalModel(
    model_args
)



You are using config.init_device='cpu', but you can also use config.init_device="meta" with Composer + FSDP for fast initialization.
Flamingo model initialized with 1046992944 trainable parameters


In [3]:
import json

with open("/mnt/d/datasets/v2_OpenEnded_mscoco_val2014_questions.json", 'r') as f:
    questions = json.load(f)

with open("/mnt/d/datasets/v2_mscoco_val2014_annotations.json", 'r') as f:
    annos = json.load(f)

In [4]:
DATASET_DIR = '/mnt/d/datasets/'
dataset_name = 'vqav2'

train_image_dir_path = DATASET_DIR + "train2014"
train_questions_json_path = (
    DATASET_DIR + "v2_OpenEnded_mscoco_train2014_questions.json"
)
train_annotations_json_path = DATASET_DIR + "v2_mscoco_train2014_annotations.json"
test_image_dir_path = DATASET_DIR + "val2014"
test_questions_json_path = (
    DATASET_DIR + "v2_OpenEnded_mscoco_val2014_questions.json"
)
test_annotations_json_path = DATASET_DIR + "v2_mscoco_val2014_annotations.json"

train_dataset = VQADataset(
    image_dir_path=train_image_dir_path,
    question_path=train_questions_json_path,
    annotations_path=train_annotations_json_path,
    is_train=True,
    dataset_name=dataset_name,
)

test_dataset = VQADataset(
    image_dir_path=test_image_dir_path,
    question_path=test_questions_json_path,
    annotations_path=test_annotations_json_path,
    is_train=False,
    dataset_name=dataset_name,
)

In [5]:
test_dataloader = torch.utils.data.DataLoader(
        test_dataset, batch_size=1, collate_fn=utils.custom_collate_fn
    )

In [13]:
prompt_template = lambda question: f"Question: {question}\nContext: <image>\nRationale:"
context_template = lambda question, answer: f"Question: {question}\nContext: <image>\nRationale: Answer {answer}<|endofchunk|>"
query_set = utils.get_query_set(train_dataset, 50)

In [47]:
predictions = []
acc = []

counter = 0
for batch in tqdm(
        test_dataloader,
        desc=f"Running inference {dataset_name}",
    ):

    counter += 1
    if counter >= 2000:
        break
    
    # batch_demo_samples = utils.sample_batch_demos_from_query_set(
    #         query_set, 4, len(batch["image"])
    #     )

    batch_images, batch_text = [], []
    for i in range(len(batch["image"])):
        # if 4 > 0:
        #     context_images = [x["image"] for x in batch_demo_samples[i]]
        # else:
        #     context_images = []
        # batch_images.append([batch["image"][i]])

        # context_text = "".join(
        #     [
        #         model.get_vqa_prompt(
        #             question=x["question"], answer=x["answers"][0]
        #         )
        #         + "\n"
        #         for x in batch_demo_samples[i]
        #     ]
        # )

        batch_images.append([batch["image"][i]])
        batch_text.append(
            prompt_template(question=batch["question"][i])
            )
        
        outputs = model.get_outputs(
            batch_images=batch_images,
            batch_text=batch_text,
            min_generation_length=0,
            max_generation_length=100,
            num_beams=3,
            length_penalty=0.0,
        )

        process_function = postprocess_vqa_generation
        new_predictions = map(process_function, outputs)

        for new_prediction, sample_id in zip(new_predictions, batch["question_id"]):
            predictions.append({"answer": new_prediction, "question_id": sample_id})
        
        match = 0
        for answer in batch['answers'][0]:
            if answer.lower() in new_prediction.lower():
                match += 1
        acc.append(1 if (match / 3) >= 1 else (match / 3))


Running inference vqav2:   0%|          | 0/214354 [00:00<?, ?it/s]

Running inference vqav2:   1%|          | 1999/214354 [1:07:42<119:53:09,  2.03s/it]


In [43]:
predictions

[{'answer': ' He is looking at a skateboard ramp.\n\n',
  'question_id': 262148000},
 {'answer': ' The people in the background are skateboarders.\n\n',
  'question_id': 262148001},
 {'answer': ' He is on top of a skateboard.\n\n', 'question_id': 262148002},
 {'answer': ' The picture of a bowl of noodle soup was copyrighted by a website.\n\n',
  'question_id': 393225000}]

In [49]:
import numpy as np

np.mean(acc)

0.2696348174087044

In [None]:
with open(f"{dataset_name}results.json", "w") as f:
        f.write(json.dumps(predictions, indent=4))

acc = -1
if test_annotations_json_path is not None:
    acc = compute_vqa_accuracy(
        f"{dataset_name}results.json",
        test_questions_json_path,
        test_annotations_json_path,
    )
    # delete the temporary file
    os.remove(f"{dataset_name}results.json")

acc