### Base line model

In [1]:
# from huggingface import AutoModel

In [1]:
!pip install huggingface transformers torch torchvision

Defaulting to user installation because normal site-packages is not writeable


In [26]:
import torch 
from transformers import pipeline  # Use a pipeline as a high-level helper
import os # For all os level functions
import json # For parsing the json files


#### Util functions



In [None]:
### Setup the questions and answers for the image files
def obtain_questions_for_image_ids(image_ids: list, question_file_path: str= "questions.json"):
    """This function obtains the questions for the list of the image ids passed from the file"""
    with open(question_file_path) as file:
        questions_data = json.load(file)
    result = []
    for question_details in questions_data['questions']:
        if question_details['image_id'] in image_ids:
            result.append(question_details)
    return result


def obtain_annotations_for_image_ids(image_ids: list, annotations_file_path: str = "annotations.json"):
    """This function obtians the annotations for the list of the image ids passed from the file"""
    with open(annotations_file_path, 'r') as file:
        annotations_data = json.load(file)
    result = []
    for annotation_details in annotations_data['annotations']:
        if annotation_details['image_id'] in image_ids:
            result.append(annotation_details)
    return result


def get_refined_results_from_lists(relevant_questions:list, relevant_annotations:list):
    """This provides refined format of the question, image, plausible answers and other meta info"""
    result = []
    for question_info in relevant_questions:
        q_id = question_info['question_id']
        question = question_info['question']
        # print(f"Question ID: {q_id}, Question: {question}")
        for annotation_info in relevant_annotations:
            if annotation_info['question_id'] == q_id:
                answer = annotation_info["multiple_choice_answer"]
                plausile_answers = set(x['answer'] for x in annotation_info['answers'])
                result.append({"question_id": q_id, "question": {question}, 
                               "answer": answer, "plausible_answers": plausile_answers,
                               "image_id": annotation_info['image_id'], "question_type" :annotation_info['question_type'],
                               "answer_type": annotation_info['answer_type']})


def get_results(image_ids: list, folder: str):
    relevant_annotations = obtain_annotations_for_image_ids(image_ids, annotations_file_path=f"{folder}/annotations.json")
    relevant_questions = obtain_questions_for_image_ids(image_ids, question_file_path=f"{folder}/questions.json")
    return get_refined_results_from_lists(relevant_questions=relevant_questions, relevant_annotations=relevant_annotations)



In [39]:
validation_folder = "../data/validation"
validation_images_folder = f"{validation_folder}/images"

validation_image_files = [os.path.join(validation_images_folder, x) for x in os.listdir(validation_images_folder)[:10]]

sample_validation_image_files = validation_image_files[:30]

#### Setup for analysis

In [None]:
image_files = sample_validation_image_files
image_ids = [int(image_file.split(os.sep)[-1].split(".")[0].split("_")[-1]) for image_file in image_files]
image_ids

relevant_annotations = obtain_annotations_for_image_ids(image_ids, annotations_file_path=f"{validation_folder}/annotations.json")
relevant_questions = obtain_questions_for_image_ids(image_ids, question_file_path=f"{validation_folder}/questions.json")

results = get_results(image_ids, validation_folder)
print(image_ids)

### Setup the baseline performance


In [43]:
## Obtain the captions for these image files
from transformers import pipeline
blipPipe = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large")
generated_captions = blipPipe(image_files)

### Get the generated text along with image_id, image_file
captions = [x[0]['generated_text'] for x in generated_captions]
generated_captions = [{"image_caption": caption, "image_id": image_id, "image_file": image_file.split(os.sep)[-1]} for caption, image_id, image_file in zip(captions, image_ids, image_files)]


[{'image_caption': 'there is a dog that is laying on a rack with shoes',
  'image_id': 42,
  'image_file': 'COCO_val2014_000000000042.jpg'},
 {'image_caption': 'there are two motorcycles parked next to each other on the street',
  'image_id': 73,
  'image_file': 'COCO_val2014_000000000073.jpg'},
 {'image_caption': 'araffe dog laying on the ground next to a bike on a city street',
  'image_id': 74,
  'image_file': 'COCO_val2014_000000000074.jpg'},
 {'image_caption': 'there is a small desk with a lamp on it in a room',
  'image_id': 133,
  'image_file': 'COCO_val2014_000000000133.jpg'},
 {'image_caption': 'there are two giraffes standing in a room with a woman looking at them',
  'image_id': 136,
  'image_file': 'COCO_val2014_000000000136.jpg'},
 {'image_caption': 'brightly colored living room with a television and a fireplace',
  'image_id': 139,
  'image_file': 'COCO_val2014_000000000139.jpg'},
 {'image_caption': 'there are many birds sitting on the branches of a tree',
  'image_id': 1

NameError: name 'image_ids' is not defined

In [46]:
relevant_questions, relevant_annotations

([{'image_id': 42,
   'question': 'What color are the gym shoes?',
   'question_id': 42000},
  {'image_id': 42,
   'question': 'Is there a red sandal here?',
   'question_id': 42001},
  {'image_id': 42,
   'question': 'What color is the flip flop?',
   'question_id': 42002},
  {'image_id': 74,
   'question': 'Does this dog have a collar?',
   'question_id': 74000},
  {'image_id': 74,
   'question': 'Where is the dog laying?',
   'question_id': 74001},
  {'image_id': 74, 'question': 'What is the dog doing?', 'question_id': 74002},
  {'image_id': 133, 'question': 'What color is lamp?', 'question_id': 133000},
  {'image_id': 133,
   'question': 'Is this a child room?',
   'question_id': 133001},
  {'image_id': 133,
   'question': 'What size mattress would you need for this bed?',
   'question_id': 133002},
  {'image_id': 136, 'question': 'Is this in a museum?', 'question_id': 136000},
  {'image_id': 136,
   'question': 'How many animals are in the picture?',
   'question_id': 136001},
  {

In [3]:
relevant_questions = [{'image_id': 42,
   'question': 'What color are the gym shoes?',
   'question_id': 42000},
  {'image_id': 42,
   'question': 'Is there a red sandal here?',
   'question_id': 42001},
  {'image_id': 42,
   'question': 'What color is the flip flop?',
   'question_id': 42002},
  {'image_id': 74,
   'question': 'Does this dog have a collar?',
   'question_id': 74000},
  {'image_id': 74,
   'question': 'Where is the dog laying?',
   'question_id': 74001},
  {'image_id': 74, 'question': 'What is the dog doing?', 'question_id': 74002},
  {'image_id': 133, 'question': 'What color is lamp?', 'question_id': 133000},
  {'image_id': 133,
   'question': 'Is this a child room?',
   'question_id': 133001},
  {'image_id': 133,
   'question': 'What size mattress would you need for this bed?',
   'question_id': 133002},
  {'image_id': 136, 'question': 'Is this in a museum?', 'question_id': 136000},
  {'image_id': 136,
   'question': 'How many animals are in the picture?',
   'question_id': 136001},
  {'image_id': 136,
   'question': 'What kind of animal is shown?',
   'question_id': 136002},
  {'image_id': 139,
   'question': 'What is the woman in the room doing?',
   'question_id': 139000},
  {'image_id': 139,
   'question': 'How many yellow vases?',
   'question_id': 139001},
  {'image_id': 139,
   'question': 'What color is the floor?',
   'question_id': 139002},
  {'image_id': 139,
   'question': 'What color is the flower?',
   'question_id': 139003},
  {'image_id': 143,
   'question': "What color is the tip of the birds' tails?",
   'question_id': 143000},
  {'image_id': 143,
   'question': 'How many birds are in the tree?',
   'question_id': 143001},
  {'image_id': 143,
   'question': 'Who many birds are black?',
   'question_id': 143002},
  {'image_id': 164,
   'question': 'How many paper towel rolls?',
   'question_id': 164000},
  {'image_id': 164,
   'question': 'What is the color of the refrigerator?',
   'question_id': 164001},
  {'image_id': 164, 'question': 'Is the light on?', 'question_id': 164002},
  {'image_id': 164,
   'question': 'What color is the wall?',
   'question_id': 164003},
  {'image_id': 192,
   'question': 'What sport is being played?',
   'question_id': 192000},
  {'image_id': 192,
   'question': 'Is the catcher wearing safety gear?',
   'question_id': 192001},
  {'image_id': 192,
   'question': 'What is the name of the teams?',
   'question_id': 192002},
  {'image_id': 196,
   'question': 'What is the yellow food?',
   'question_id': 196000},
  {'image_id': 196,
   'question': 'Are there lots of healthy options on the table?',
   'question_id': 196001},
  {'image_id': 196, 'question': 'How many containers?', 'question_id': 196002},
  {'image_id': 196,
   'question': 'How many dishes of food are in the picture?',
   'question_id': 196003},
  {'image_id': 196,
   'question': 'Is this going to be a feast?',
   'question_id': 196004},
  {'image_id': 73,
   'question': 'What is the license number?',
   'question_id': 73000},
  {'image_id': 73,
   'question': 'Is this a motorcycle or bike?',
   'question_id': 73001},
  {'image_id': 73,
   'question': 'What color is the bike?',
   'question_id': 73002},
  {'image_id': 73,
   'question': 'What letter and 3 numbers are on the tag?',
   'question_id': 73003}]

relevant_annotations = [{'question_type': 'what color are the',
   'multiple_choice_answer': 'white',
   'answers': [{'answer': 'white', 'answer_confidence': 'yes', 'answer_id': 1},
    {'answer': 'white', 'answer_confidence': 'yes', 'answer_id': 2},
    {'answer': 'white', 'answer_confidence': 'yes', 'answer_id': 3},
    {'answer': 'white', 'answer_confidence': 'yes', 'answer_id': 4},
    {'answer': 'white and black',
     'answer_confidence': 'maybe',
     'answer_id': 5},
    {'answer': 'white', 'answer_confidence': 'yes', 'answer_id': 6},
    {'answer': 'white', 'answer_confidence': 'yes', 'answer_id': 7},
    {'answer': 'white', 'answer_confidence': 'yes', 'answer_id': 8},
    {'answer': 'white', 'answer_confidence': 'yes', 'answer_id': 9},
    {'answer': 'white', 'answer_confidence': 'yes', 'answer_id': 10}],
   'image_id': 42,
   'answer_type': 'other',
   'question_id': 42000},
  {'question_type': 'is there a',
   'multiple_choice_answer': 'yes',
   'answers': [{'answer': 'yes', 'answer_confidence': 'yes', 'answer_id': 1},
    {'answer': 'yes', 'answer_confidence': 'yes', 'answer_id': 2},
    {'answer': 'yes', 'answer_confidence': 'yes', 'answer_id': 3},
    {'answer': 'yes', 'answer_confidence': 'yes', 'answer_id': 4},
    {'answer': 'yes', 'answer_confidence': 'yes', 'answer_id': 5},
    {'answer': 'yes', 'answer_confidence': 'yes', 'answer_id': 6},
    {'answer': 'yes', 'answer_confidence': 'maybe', 'answer_id': 7},
    {'answer': 'yes', 'answer_confidence': 'yes', 'answer_id': 8},
    {'answer': 'yes', 'answer_confidence': 'yes', 'answer_id': 9},
    {'answer': 'yes', 'answer_confidence': 'yes', 'answer_id': 10}],
   'image_id': 42,
   'answer_type': 'yes/no',
   'question_id': 42001},
  {'question_type': 'what color is the',
   'multiple_choice_answer': 'red',
   'answers': [{'answer': 'black', 'answer_confidence': 'yes', 'answer_id': 1},
    {'answer': 'red', 'answer_confidence': 'yes', 'answer_id': 2},
    {'answer': 'red', 'answer_confidence': 'yes', 'answer_id': 3},
    {'answer': 'red', 'answer_confidence': 'yes', 'answer_id': 4},
    {'answer': 'red and blue', 'answer_confidence': 'yes', 'answer_id': 5},
    {'answer': 'red', 'answer_confidence': 'yes', 'answer_id': 6},
    {'answer': 'red', 'answer_confidence': 'yes', 'answer_id': 7},
    {'answer': 'red', 'answer_confidence': 'yes', 'answer_id': 8},
    {'answer': 'red', 'answer_confidence': 'yes', 'answer_id': 9},
    {'answer': 'red', 'answer_confidence': 'yes', 'answer_id': 10}],
   'image_id': 42,
   'answer_type': 'other',
   'question_id': 42002},
  {'question_type': 'does this',
   'multiple_choice_answer': 'no',
   'answers': [{'answer': 'yes', 'answer_confidence': 'yes', 'answer_id': 1},
    {'answer': 'no', 'answer_confidence': 'yes', 'answer_id': 2},
    {'answer': 'no', 'answer_confidence': 'yes', 'answer_id': 3},
    {'answer': 'no', 'answer_confidence': 'yes', 'answer_id': 4},
    {'answer': 'no', 'answer_confidence': 'yes', 'answer_id': 5},
    {'answer': 'no', 'answer_confidence': 'yes', 'answer_id': 6},
    {'answer': 'no', 'answer_confidence': 'yes', 'answer_id': 7},
    {'answer': 'yes', 'answer_confidence': 'maybe', 'answer_id': 8},
    {'answer': 'yes', 'answer_confidence': 'yes', 'answer_id': 9},
    {'answer': 'no', 'answer_confidence': 'yes', 'answer_id': 10}],
   'image_id': 74,
   'answer_type': 'yes/no',
   'question_id': 74000},
  {'question_type': 'where is the',
   'multiple_choice_answer': 'sidewalk',
   'answers': [{'answer': 'outside',
     'answer_confidence': 'yes',
     'answer_id': 1},
    {'answer': 'sidewalk', 'answer_confidence': 'yes', 'answer_id': 2},
    {'answer': 'sidewalk', 'answer_confidence': 'yes', 'answer_id': 3},
    {'answer': 'sidewalk', 'answer_confidence': 'yes', 'answer_id': 4},
    {'answer': 'sidewalk', 'answer_confidence': 'yes', 'answer_id': 5},
    {'answer': 'sidewalk', 'answer_confidence': 'yes', 'answer_id': 6},
    {'answer': 'ground', 'answer_confidence': 'yes', 'answer_id': 7},
    {'answer': 'street', 'answer_confidence': 'yes', 'answer_id': 8},
    {'answer': 'yes', 'answer_confidence': 'yes', 'answer_id': 9},
    {'answer': 'sidewalk', 'answer_confidence': 'yes', 'answer_id': 10}],
   'image_id': 74,
   'answer_type': 'other',
   'question_id': 74001},
  {'question_type': 'what is the',
   'multiple_choice_answer': 'sleeping',
   'answers': [{'answer': 'laying down',
     'answer_confidence': 'yes',
     'answer_id': 1},
    {'answer': 'sleeping', 'answer_confidence': 'yes', 'answer_id': 2},
    {'answer': 'resting', 'answer_confidence': 'yes', 'answer_id': 3},
    {'answer': 'lying down', 'answer_confidence': 'yes', 'answer_id': 4},
    {'answer': 'sleeping', 'answer_confidence': 'maybe', 'answer_id': 5},
    {'answer': 'it is sleeping', 'answer_confidence': 'yes', 'answer_id': 6},
    {'answer': 'sleeping', 'answer_confidence': 'yes', 'answer_id': 7},
    {'answer': 'laying down', 'answer_confidence': 'yes', 'answer_id': 8},
    {'answer': 'sleeping', 'answer_confidence': 'yes', 'answer_id': 9},
    {'answer': 'lying down', 'answer_confidence': 'maybe', 'answer_id': 10}],
   'image_id': 74,
   'answer_type': 'other',
   'question_id': 74002},
  {'question_type': 'what color is',
   'multiple_choice_answer': 'blue',
   'answers': [{'answer': 'blue', 'answer_confidence': 'yes', 'answer_id': 1},
    {'answer': 'blue', 'answer_confidence': 'yes', 'answer_id': 2},
    {'answer': 'blue', 'answer_confidence': 'yes', 'answer_id': 3},
    {'answer': 'blue', 'answer_confidence': 'yes', 'answer_id': 4},
    {'answer': 'blue', 'answer_confidence': 'yes', 'answer_id': 5},
    {'answer': 'blue', 'answer_confidence': 'yes', 'answer_id': 6},
    {'answer': 'blue', 'answer_confidence': 'yes', 'answer_id': 7},
    {'answer': 'blue', 'answer_confidence': 'yes', 'answer_id': 8},
    {'answer': 'blue', 'answer_confidence': 'yes', 'answer_id': 9},
    {'answer': 'blue', 'answer_confidence': 'yes', 'answer_id': 10}],
   'image_id': 133,
   'answer_type': 'other',
   'question_id': 133000},
  {'question_type': 'is this a',
   'multiple_choice_answer': 'yes',
   'answers': [{'answer': 'yes', 'answer_confidence': 'yes', 'answer_id': 1},
    {'answer': 'yes', 'answer_confidence': 'yes', 'answer_id': 2},
    {'answer': 'yes', 'answer_confidence': 'yes', 'answer_id': 3},
    {'answer': 'yes', 'answer_confidence': 'yes', 'answer_id': 4},
    {'answer': 'yes', 'answer_confidence': 'yes', 'answer_id': 5},
    {'answer': 'yes', 'answer_confidence': 'yes', 'answer_id': 6},
    {'answer': 'yes', 'answer_confidence': 'yes', 'answer_id': 7},
    {'answer': 'yes', 'answer_confidence': 'yes', 'answer_id': 8},
    {'answer': 'yes', 'answer_confidence': 'yes', 'answer_id': 9},
    {'answer': 'yes', 'answer_confidence': 'yes', 'answer_id': 10}],
   'image_id': 133,
   'answer_type': 'yes/no',
   'question_id': 133001},
  {'question_type': 'what',
   'multiple_choice_answer': 'twin',
   'answers': [{'answer': 'twin', 'answer_confidence': 'yes', 'answer_id': 1},
    {'answer': 'twin', 'answer_confidence': 'yes', 'answer_id': 2},
    {'answer': 'twin', 'answer_confidence': 'yes', 'answer_id': 3},
    {'answer': 'single', 'answer_confidence': 'yes', 'answer_id': 4},
    {'answer': 'twin', 'answer_confidence': 'yes', 'answer_id': 5},
    {'answer': 'twin', 'answer_confidence': 'yes', 'answer_id': 6},
    {'answer': 'twin long', 'answer_confidence': 'maybe', 'answer_id': 7},
    {'answer': 'twin', 'answer_confidence': 'yes', 'answer_id': 8},
    {'answer': 'single', 'answer_confidence': 'yes', 'answer_id': 9},
    {'answer': 'twin', 'answer_confidence': 'yes', 'answer_id': 10}],
   'image_id': 133,
   'answer_type': 'other',
   'question_id': 133002},
  {'question_type': 'is this',
   'multiple_choice_answer': 'no',
   'answers': [{'answer': 'no', 'answer_confidence': 'yes', 'answer_id': 1},
    {'answer': 'yes', 'answer_confidence': 'maybe', 'answer_id': 2},
    {'answer': 'yes', 'answer_confidence': 'yes', 'answer_id': 3},
    {'answer': 'no', 'answer_confidence': 'yes', 'answer_id': 4},
    {'answer': 'no', 'answer_confidence': 'yes', 'answer_id': 5},
    {'answer': 'no', 'answer_confidence': 'yes', 'answer_id': 6},
    {'answer': 'no', 'answer_confidence': 'yes', 'answer_id': 7},
    {'answer': 'yes', 'answer_confidence': 'maybe', 'answer_id': 8},
    {'answer': 'yes', 'answer_confidence': 'yes', 'answer_id': 9},
    {'answer': 'no', 'answer_confidence': 'yes', 'answer_id': 10}],
   'image_id': 136,
   'answer_type': 'yes/no',
   'question_id': 136000},
  {'question_type': 'how many',
   'multiple_choice_answer': '2',
   'answers': [{'answer': '2', 'answer_confidence': 'yes', 'answer_id': 1},
    {'answer': '2', 'answer_confidence': 'yes', 'answer_id': 2},
    {'answer': '2', 'answer_confidence': 'yes', 'answer_id': 3},
    {'answer': '2', 'answer_confidence': 'yes', 'answer_id': 4},
    {'answer': '2', 'answer_confidence': 'yes', 'answer_id': 5},
    {'answer': '2', 'answer_confidence': 'yes', 'answer_id': 6},
    {'answer': '2', 'answer_confidence': 'yes', 'answer_id': 7},
    {'answer': '2', 'answer_confidence': 'yes', 'answer_id': 8},
    {'answer': '2', 'answer_confidence': 'yes', 'answer_id': 9},
    {'answer': '2', 'answer_confidence': 'yes', 'answer_id': 10}],
   'image_id': 136,
   'answer_type': 'number',
   'question_id': 136001},
  {'question_type': 'what kind of',
   'multiple_choice_answer': 'giraffe',
   'answers': [{'answer': 'giraffe',
     'answer_confidence': 'yes',
     'answer_id': 1},
    {'answer': 'giraffe', 'answer_confidence': 'yes', 'answer_id': 2},
    {'answer': 'giraffe', 'answer_confidence': 'yes', 'answer_id': 3},
    {'answer': 'brown', 'answer_confidence': 'yes', 'answer_id': 4},
    {'answer': 'giraffe', 'answer_confidence': 'yes', 'answer_id': 5},
    {'answer': 'giraffe', 'answer_confidence': 'yes', 'answer_id': 6},
    {'answer': 'giraffe', 'answer_confidence': 'yes', 'answer_id': 7},
    {'answer': 'giraffe', 'answer_confidence': 'yes', 'answer_id': 8},
    {'answer': 'giraffe', 'answer_confidence': 'yes', 'answer_id': 9},
    {'answer': 'giraffe', 'answer_confidence': 'yes', 'answer_id': 10}],
   'image_id': 136,
   'answer_type': 'other',
   'question_id': 136002},
  {'question_type': 'what is the woman',
   'multiple_choice_answer': 'talking',
   'answers': [{'answer': 'talking to someone',
     'answer_confidence': 'yes',
     'answer_id': 1},
    {'answer': 'cleaning', 'answer_confidence': 'maybe', 'answer_id': 2},
    {'answer': 'looking out window',
     'answer_confidence': 'maybe',
     'answer_id': 3},
    {'answer': 'talking', 'answer_confidence': 'yes', 'answer_id': 4},
    {'answer': 'talking', 'answer_confidence': 'yes', 'answer_id': 5},
    {'answer': 'cleaning', 'answer_confidence': 'maybe', 'answer_id': 6},
    {'answer': 'talking to someone outside of window',
     'answer_confidence': 'yes',
     'answer_id': 7},
    {'answer': 'talking', 'answer_confidence': 'yes', 'answer_id': 8},
    {'answer': 'talking out window',
     'answer_confidence': 'yes',
     'answer_id': 9},
    {'answer': 'cleaning', 'answer_confidence': 'maybe', 'answer_id': 10}],
   'image_id': 139,
   'answer_type': 'other',
   'question_id': 139000},
  {'answer_type': 'number',
   'multiple_choice_answer': '1',
   'answers': [{'answer': '2', 'answer_confidence': 'maybe', 'answer_id': 1},
    {'answer': '1', 'answer_confidence': 'yes', 'answer_id': 2},
    {'answer': '1', 'answer_confidence': 'yes', 'answer_id': 3},
    {'answer': '1', 'answer_confidence': 'yes', 'answer_id': 4},
    {'answer': '1', 'answer_confidence': 'yes', 'answer_id': 5},
    {'answer': '1', 'answer_confidence': 'maybe', 'answer_id': 6},
    {'answer': '1', 'answer_confidence': 'yes', 'answer_id': 7},
    {'answer': '1', 'answer_confidence': 'yes', 'answer_id': 8},
    {'answer': '1', 'answer_confidence': 'maybe', 'answer_id': 9},
    {'answer': '1', 'answer_confidence': 'yes', 'answer_id': 10}],
   'image_id': 139,
   'question_type': 'how many',
   'question_id': 139001},
  {'question_type': 'what color is the',
   'multiple_choice_answer': 'brown',
   'answers': [{'answer': 'brown', 'answer_confidence': 'yes', 'answer_id': 1},
    {'answer': 'brown', 'answer_confidence': 'yes', 'answer_id': 2},
    {'answer': 'brown', 'answer_confidence': 'maybe', 'answer_id': 3},
    {'answer': 'brown', 'answer_confidence': 'yes', 'answer_id': 4},
    {'answer': 'brown', 'answer_confidence': 'yes', 'answer_id': 5},
    {'answer': 'brown', 'answer_confidence': 'yes', 'answer_id': 6},
    {'answer': 'wood colored', 'answer_confidence': 'maybe', 'answer_id': 7},
    {'answer': 'brown', 'answer_confidence': 'yes', 'answer_id': 8},
    {'answer': 'brown', 'answer_confidence': 'yes', 'answer_id': 9},
    {'answer': 'brown', 'answer_confidence': 'maybe', 'answer_id': 10}],
   'image_id': 139,
   'answer_type': 'other',
   'question_id': 139002},
  {'question_type': 'what color is the',
   'multiple_choice_answer': 'pink',
   'answers': [{'answer': 'pink', 'answer_confidence': 'yes', 'answer_id': 1},
    {'answer': 'pink', 'answer_confidence': 'yes', 'answer_id': 2},
    {'answer': 'pink', 'answer_confidence': 'maybe', 'answer_id': 3},
    {'answer': 'pink', 'answer_confidence': 'yes', 'answer_id': 4},
    {'answer': 'pink', 'answer_confidence': 'yes', 'answer_id': 5},
    {'answer': 'red', 'answer_confidence': 'yes', 'answer_id': 6},
    {'answer': 'pink', 'answer_confidence': 'yes', 'answer_id': 7},
    {'answer': 'pink', 'answer_confidence': 'yes', 'answer_id': 8},
    {'answer': 'pink', 'answer_confidence': 'yes', 'answer_id': 9},
    {'answer': 'pink', 'answer_confidence': 'yes', 'answer_id': 10}],
   'image_id': 139,
   'answer_type': 'other',
   'question_id': 139003},
  {'question_type': 'what color is the',
   'multiple_choice_answer': 'yellow',
   'answers': [{'answer': 'yellow',
     'answer_confidence': 'yes',
     'answer_id': 1},
    {'answer': 'yellow', 'answer_confidence': 'yes', 'answer_id': 2},
    {'answer': 'yellow', 'answer_confidence': 'yes', 'answer_id': 3},
    {'answer': 'yellow', 'answer_confidence': 'yes', 'answer_id': 4},
    {'answer': 'yellow', 'answer_confidence': 'yes', 'answer_id': 5},
    {'answer': 'yellow', 'answer_confidence': 'yes', 'answer_id': 6},
    {'answer': 'yellow', 'answer_confidence': 'yes', 'answer_id': 7},
    {'answer': 'yellow', 'answer_confidence': 'yes', 'answer_id': 8},
    {'answer': 'yellow', 'answer_confidence': 'yes', 'answer_id': 9},
    {'answer': 'yellow', 'answer_confidence': 'yes', 'answer_id': 10}],
   'image_id': 143,
   'answer_type': 'other',
   'question_id': 143000},
  {'question_type': 'how many',
   'multiple_choice_answer': '8',
   'answers': [{'answer': '8', 'answer_confidence': 'yes', 'answer_id': 1},
    {'answer': '8', 'answer_confidence': 'yes', 'answer_id': 2},
    {'answer': '8', 'answer_confidence': 'yes', 'answer_id': 3},
    {'answer': '8', 'answer_confidence': 'yes', 'answer_id': 4},
    {'answer': '8', 'answer_confidence': 'yes', 'answer_id': 5},
    {'answer': '8', 'answer_confidence': 'yes', 'answer_id': 6},
    {'answer': '8', 'answer_confidence': 'yes', 'answer_id': 7},
    {'answer': '8', 'answer_confidence': 'yes', 'answer_id': 8},
    {'answer': '8', 'answer_confidence': 'yes', 'answer_id': 9},
    {'answer': '8', 'answer_confidence': 'yes', 'answer_id': 10}],
   'image_id': 143,
   'answer_type': 'number',
   'question_id': 143001},
  {'question_type': 'none of the above',
   'multiple_choice_answer': '0',
   'answers': [{'answer': '0', 'answer_confidence': 'yes', 'answer_id': 1},
    {'answer': '0', 'answer_confidence': 'yes', 'answer_id': 2},
    {'answer': '7', 'answer_confidence': 'yes', 'answer_id': 3},
    {'answer': '8', 'answer_confidence': 'maybe', 'answer_id': 4},
    {'answer': '0', 'answer_confidence': 'yes', 'answer_id': 5},
    {'answer': '0', 'answer_confidence': 'yes', 'answer_id': 6},
    {'answer': '0', 'answer_confidence': 'yes', 'answer_id': 7},
    {'answer': '0', 'answer_confidence': 'yes', 'answer_id': 8},
    {'answer': '0', 'answer_confidence': 'yes', 'answer_id': 9},
    {'answer': '0', 'answer_confidence': 'maybe', 'answer_id': 10}],
   'image_id': 143,
   'answer_type': 'number',
   'question_id': 143002},
  {'question_type': 'how many',
   'multiple_choice_answer': '1',
   'answers': [{'answer': '1', 'answer_confidence': 'yes', 'answer_id': 1},
    {'answer': '1', 'answer_confidence': 'yes', 'answer_id': 2},
    {'answer': '1', 'answer_confidence': 'yes', 'answer_id': 3},
    {'answer': '2', 'answer_confidence': 'yes', 'answer_id': 4},
    {'answer': '1', 'answer_confidence': 'yes', 'answer_id': 5},
    {'answer': '1', 'answer_confidence': 'yes', 'answer_id': 6},
    {'answer': '2', 'answer_confidence': 'yes', 'answer_id': 7},
    {'answer': '1', 'answer_confidence': 'yes', 'answer_id': 8},
    {'answer': '1', 'answer_confidence': 'maybe', 'answer_id': 9},
    {'answer': '1', 'answer_confidence': 'yes', 'answer_id': 10}],
   'image_id': 164,
   'answer_type': 'number',
   'question_id': 164000},
  {'question_type': 'what is the color of the',
   'multiple_choice_answer': 'white',
   'answers': [{'answer': 'white', 'answer_confidence': 'yes', 'answer_id': 1},
    {'answer': 'white', 'answer_confidence': 'yes', 'answer_id': 2},
    {'answer': 'white', 'answer_confidence': 'yes', 'answer_id': 3},
    {'answer': 'white', 'answer_confidence': 'yes', 'answer_id': 4},
    {'answer': 'white', 'answer_confidence': 'yes', 'answer_id': 5},
    {'answer': 'white', 'answer_confidence': 'yes', 'answer_id': 6},
    {'answer': 'white', 'answer_confidence': 'yes', 'answer_id': 7},
    {'answer': 'cream', 'answer_confidence': 'yes', 'answer_id': 8},
    {'answer': 'beige', 'answer_confidence': 'yes', 'answer_id': 9},
    {'answer': 'white', 'answer_confidence': 'yes', 'answer_id': 10}],
   'image_id': 164,
   'answer_type': 'other',
   'question_id': 164001},
  {'answer_type': 'yes/no',
   'multiple_choice_answer': 'no',
   'answers': [{'answer': 'no', 'answer_confidence': 'yes', 'answer_id': 1},
    {'answer': 'yes', 'answer_confidence': 'yes', 'answer_id': 2},
    {'answer': 'yes', 'answer_confidence': 'yes', 'answer_id': 3},
    {'answer': 'no', 'answer_confidence': 'yes', 'answer_id': 4},
    {'answer': 'no', 'answer_confidence': 'yes', 'answer_id': 5},
    {'answer': 'yes', 'answer_confidence': 'maybe', 'answer_id': 6},
    {'answer': 'no', 'answer_confidence': 'yes', 'answer_id': 7},
    {'answer': 'yes', 'answer_confidence': 'yes', 'answer_id': 8},
    {'answer': 'no', 'answer_confidence': 'yes', 'answer_id': 9},
    {'answer': 'no', 'answer_confidence': 'yes', 'answer_id': 10}],
   'image_id': 164,
   'question_type': 'is the',
   'question_id': 164002},
  {'question_type': 'what color is the',
   'multiple_choice_answer': 'white',
   'answers': [{'answer': 'white', 'answer_confidence': 'yes', 'answer_id': 1},
    {'answer': 'cream', 'answer_confidence': 'yes', 'answer_id': 2},
    {'answer': 'white', 'answer_confidence': 'yes', 'answer_id': 3},
    {'answer': 'white', 'answer_confidence': 'yes', 'answer_id': 4},
    {'answer': 'pink', 'answer_confidence': 'maybe', 'answer_id': 5},
    {'answer': 'pink', 'answer_confidence': 'yes', 'answer_id': 6},
    {'answer': 'white', 'answer_confidence': 'maybe', 'answer_id': 7},
    {'answer': 'white', 'answer_confidence': 'yes', 'answer_id': 8},
    {'answer': 'pink', 'answer_confidence': 'yes', 'answer_id': 9},
    {'answer': 'pink', 'answer_confidence': 'maybe', 'answer_id': 10}],
   'image_id': 164,
   'answer_type': 'other',
   'question_id': 164003},
  {'question_type': 'what sport is',
   'multiple_choice_answer': 'baseball',
   'answers': [{'answer': 'baseball',
     'answer_confidence': 'yes',
     'answer_id': 1},
    {'answer': 'baseball', 'answer_confidence': 'yes', 'answer_id': 2},
    {'answer': 'baseball', 'answer_confidence': 'yes', 'answer_id': 3},
    {'answer': 'baseball', 'answer_confidence': 'yes', 'answer_id': 4},
    {'answer': 'baseball', 'answer_confidence': 'yes', 'answer_id': 5},
    {'answer': 'baseball', 'answer_confidence': 'yes', 'answer_id': 6},
    {'answer': 'baseball', 'answer_confidence': 'yes', 'answer_id': 7},
    {'answer': 'baseball', 'answer_confidence': 'yes', 'answer_id': 8},
    {'answer': 'baseball', 'answer_confidence': 'yes', 'answer_id': 9},
    {'answer': 'baseball', 'answer_confidence': 'yes', 'answer_id': 10}],
   'image_id': 192,
   'answer_type': 'other',
   'question_id': 192000},
  {'question_type': 'is the',
   'multiple_choice_answer': 'yes',
   'answers': [{'answer': 'yes', 'answer_confidence': 'yes', 'answer_id': 1},
    {'answer': 'yes', 'answer_confidence': 'yes', 'answer_id': 2},
    {'answer': 'yes', 'answer_confidence': 'yes', 'answer_id': 3},
    {'answer': 'yes', 'answer_confidence': 'yes', 'answer_id': 4},
    {'answer': 'yes', 'answer_confidence': 'yes', 'answer_id': 5},
    {'answer': 'yes', 'answer_confidence': 'yes', 'answer_id': 6},
    {'answer': 'yes', 'answer_confidence': 'yes', 'answer_id': 7},
    {'answer': 'yes', 'answer_confidence': 'yes', 'answer_id': 8},
    {'answer': 'yes', 'answer_confidence': 'yes', 'answer_id': 9},
    {'answer': 'yes', 'answer_confidence': 'yes', 'answer_id': 10}],
   'image_id': 192,
   'answer_type': 'yes/no',
   'question_id': 192001},
  {'question_type': 'what is the name',
   'multiple_choice_answer': 'cubs',
   'answers': [{'answer': 'cubs and phillies',
     'answer_confidence': 'maybe',
     'answer_id': 1},
    {'answer': 'cubs', 'answer_confidence': 'yes', 'answer_id': 2},
    {'answer': 'chicago cubs', 'answer_confidence': 'maybe', 'answer_id': 3},
    {'answer': 'cubs', 'answer_confidence': 'no', 'answer_id': 4},
    {'answer': 'cubs', 'answer_confidence': 'yes', 'answer_id': 5},
    {'answer': 'cubs / red sox', 'answer_confidence': 'maybe', 'answer_id': 6},
    {'answer': 'impossible to tell',
     'answer_confidence': 'maybe',
     'answer_id': 7},
    {'answer': 'cubs and reds', 'answer_confidence': 'yes', 'answer_id': 8},
    {'answer': 'cubs reds', 'answer_confidence': 'yes', 'answer_id': 9},
    {'answer': 'cubs & red sox',
     'answer_confidence': 'maybe',
     'answer_id': 10}],
   'image_id': 192,
   'answer_type': 'other',
   'question_id': 192002},
  {'question_type': 'what is the',
   'multiple_choice_answer': 'corn',
   'answers': [{'answer': 'corn', 'answer_confidence': 'yes', 'answer_id': 1},
    {'answer': 'corn', 'answer_confidence': 'yes', 'answer_id': 2},
    {'answer': 'corn', 'answer_confidence': 'yes', 'answer_id': 3},
    {'answer': 'corn', 'answer_confidence': 'yes', 'answer_id': 4},
    {'answer': 'corn', 'answer_confidence': 'yes', 'answer_id': 5},
    {'answer': 'corn', 'answer_confidence': 'yes', 'answer_id': 6},
    {'answer': 'corn', 'answer_confidence': 'yes', 'answer_id': 7},
    {'answer': 'corn', 'answer_confidence': 'yes', 'answer_id': 8},
    {'answer': 'corn', 'answer_confidence': 'yes', 'answer_id': 9},
    {'answer': 'corn', 'answer_confidence': 'yes', 'answer_id': 10}],
   'image_id': 196,
   'answer_type': 'other',
   'question_id': 196000},
  {'answer_type': 'yes/no',
   'multiple_choice_answer': 'yes',
   'answers': [{'answer': 'yes', 'answer_confidence': 'yes', 'answer_id': 1},
    {'answer': 'yes', 'answer_confidence': 'yes', 'answer_id': 2},
    {'answer': 'yes', 'answer_confidence': 'yes', 'answer_id': 3},
    {'answer': 'yes', 'answer_confidence': 'maybe', 'answer_id': 4},
    {'answer': 'yes', 'answer_confidence': 'yes', 'answer_id': 5},
    {'answer': 'yes', 'answer_confidence': 'yes', 'answer_id': 6},
    {'answer': 'no', 'answer_confidence': 'yes', 'answer_id': 7},
    {'answer': 'yes', 'answer_confidence': 'yes', 'answer_id': 8},
    {'answer': 'no', 'answer_confidence': 'maybe', 'answer_id': 9},
    {'answer': 'yes', 'answer_confidence': 'yes', 'answer_id': 10}],
   'image_id': 196,
   'question_type': 'are there',
   'question_id': 196001},
  {'answer_type': 'number',
   'multiple_choice_answer': '20',
   'answers': [{'answer': 'many', 'answer_confidence': 'yes', 'answer_id': 1},
    {'answer': '19', 'answer_confidence': 'maybe', 'answer_id': 2},
    {'answer': '20', 'answer_confidence': 'yes', 'answer_id': 3},
    {'answer': '19', 'answer_confidence': 'yes', 'answer_id': 4},
    {'answer': '20', 'answer_confidence': 'maybe', 'answer_id': 5},
    {'answer': '20', 'answer_confidence': 'yes', 'answer_id': 6},
    {'answer': 'lots', 'answer_confidence': 'yes', 'answer_id': 7},
    {'answer': '17', 'answer_confidence': 'yes', 'answer_id': 8},
    {'answer': '20', 'answer_confidence': 'yes', 'answer_id': 9},
    {'answer': '10', 'answer_confidence': 'maybe', 'answer_id': 10}],
   'image_id': 196,
   'question_type': 'how many',
   'question_id': 196002},
  {'question_type': 'how many',
   'multiple_choice_answer': '19',
   'answers': [{'answer': '20', 'answer_confidence': 'yes', 'answer_id': 1},
    {'answer': '19', 'answer_confidence': 'yes', 'answer_id': 2},
    {'answer': '20', 'answer_confidence': 'maybe', 'answer_id': 3},
    {'answer': '19', 'answer_confidence': 'yes', 'answer_id': 4},
    {'answer': '18', 'answer_confidence': 'yes', 'answer_id': 5},
    {'answer': '17', 'answer_confidence': 'maybe', 'answer_id': 6},
    {'answer': '18', 'answer_confidence': 'maybe', 'answer_id': 7},
    {'answer': '10', 'answer_confidence': 'maybe', 'answer_id': 8},
    {'answer': '20', 'answer_confidence': 'maybe', 'answer_id': 9},
    {'answer': '19', 'answer_confidence': 'maybe', 'answer_id': 10}],
   'image_id': 196,
   'answer_type': 'number',
   'question_id': 196003},
  {'question_type': 'is this',
   'multiple_choice_answer': 'yes',
   'answers': [{'answer': 'yes', 'answer_confidence': 'yes', 'answer_id': 1},
    {'answer': 'yes', 'answer_confidence': 'yes', 'answer_id': 2},
    {'answer': 'yes', 'answer_confidence': 'maybe', 'answer_id': 3},
    {'answer': 'yes', 'answer_confidence': 'yes', 'answer_id': 4},
    {'answer': 'yes', 'answer_confidence': 'yes', 'answer_id': 5},
    {'answer': 'yes', 'answer_confidence': 'yes', 'answer_id': 6},
    {'answer': 'yes', 'answer_confidence': 'yes', 'answer_id': 7},
    {'answer': 'yes', 'answer_confidence': 'yes', 'answer_id': 8},
    {'answer': 'yes', 'answer_confidence': 'yes', 'answer_id': 9},
    {'answer': 'yes', 'answer_confidence': 'yes', 'answer_id': 10}],
   'image_id': 196,
   'answer_type': 'yes/no',
   'question_id': 196004},
  {'question_type': 'what is the',
   'multiple_choice_answer': 'sv-6260',
   'answers': [{'answer': 'sv-6260',
     'answer_confidence': 'yes',
     'answer_id': 1},
    {'answer': 'sv-6260', 'answer_confidence': 'yes', 'answer_id': 2},
    {'answer': 'sv-6260', 'answer_confidence': 'yes', 'answer_id': 3},
    {'answer': '6260', 'answer_confidence': 'yes', 'answer_id': 4},
    {'answer': 'sv-6260', 'answer_confidence': 'yes', 'answer_id': 5},
    {'answer': '6260', 'answer_confidence': 'yes', 'answer_id': 6},
    {'answer': 'sv 6260', 'answer_confidence': 'yes', 'answer_id': 7},
    {'answer': 'sv-6260', 'answer_confidence': 'yes', 'answer_id': 8},
    {'answer': 'sv-6260', 'answer_confidence': 'yes', 'answer_id': 9},
    {'answer': 'sv-6260', 'answer_confidence': 'yes', 'answer_id': 10}],
   'image_id': 73,
   'answer_type': 'other',
   'question_id': 73000},
  {'question_type': 'is this a',
   'multiple_choice_answer': 'motorcycle',
   'answers': [{'answer': 'motorcycle',
     'answer_confidence': 'yes',
     'answer_id': 1},
    {'answer': 'motorcycle', 'answer_confidence': 'yes', 'answer_id': 2},
    {'answer': 'bike', 'answer_confidence': 'yes', 'answer_id': 3},
    {'answer': 'bike', 'answer_confidence': 'maybe', 'answer_id': 4},
    {'answer': 'bike', 'answer_confidence': 'maybe', 'answer_id': 5},
    {'answer': 'bike', 'answer_confidence': 'yes', 'answer_id': 6},
    {'answer': 'motorcycle', 'answer_confidence': 'maybe', 'answer_id': 7},
    {'answer': 'motorcycle', 'answer_confidence': 'yes', 'answer_id': 8},
    {'answer': 'motorcycle', 'answer_confidence': 'yes', 'answer_id': 9},
    {'answer': 'motorcycle', 'answer_confidence': 'yes', 'answer_id': 10}],
   'image_id': 73,
   'answer_type': 'other',
   'question_id': 73001},
  {'question_type': 'what color is the',
   'multiple_choice_answer': 'black',
   'answers': [{'answer': 'black and silver',
     'answer_confidence': 'yes',
     'answer_id': 1},
    {'answer': 'black', 'answer_confidence': 'yes', 'answer_id': 2},
    {'answer': 'black', 'answer_confidence': 'yes', 'answer_id': 3},
    {'answer': 'black', 'answer_confidence': 'yes', 'answer_id': 4},
    {'answer': 'silver black and brown',
     'answer_confidence': 'maybe',
     'answer_id': 5},
    {'answer': 'black', 'answer_confidence': 'maybe', 'answer_id': 6},
    {'answer': 'black', 'answer_confidence': 'yes', 'answer_id': 7},
    {'answer': 'black', 'answer_confidence': 'yes', 'answer_id': 8},
    {'answer': 'black', 'answer_confidence': 'yes', 'answer_id': 9},
    {'answer': 'black', 'answer_confidence': 'yes', 'answer_id': 10}],
   'image_id': 73,
   'answer_type': 'other',
   'question_id': 73002},
  {'answer_type': 'other',
   'multiple_choice_answer': 'sv-6260',
   'answers': [{'answer': 'sv626', 'answer_confidence': 'yes', 'answer_id': 1},
    {'answer': 'sv 626', 'answer_confidence': 'yes', 'answer_id': 2},
    {'answer': 's620', 'answer_confidence': 'maybe', 'answer_id': 3},
    {'answer': 'sv', 'answer_confidence': 'yes', 'answer_id': 4},
    {'answer': 'sv-626', 'answer_confidence': 'maybe', 'answer_id': 5},
    {'answer': '6260', 'answer_confidence': 'yes', 'answer_id': 6},
    {'answer': 'sv-626', 'answer_confidence': 'maybe', 'answer_id': 7},
    {'answer': 'sv-6260', 'answer_confidence': 'yes', 'answer_id': 8},
    {'answer': 'sv-6260', 'answer_confidence': 'yes', 'answer_id': 9},
    {'answer': 'sv-6260', 'answer_confidence': 'maybe', 'answer_id': 10}],
   'image_id': 73,
   'question_type': 'what',
   'question_id': 73003}]

In [4]:
generated_caps = [{'image_caption': 'there is a dog that is laying on a rack with shoes',
  'image_id': 42,
  'image_file': 'COCO_val2014_000000000042.jpg'},
 {'image_caption': 'there are two motorcycles parked next to each other on the street',
  'image_id': 73,
  'image_file': 'COCO_val2014_000000000073.jpg'},
 {'image_caption': 'araffe dog laying on the ground next to a bike on a city street',
  'image_id': 74,
  'image_file': 'COCO_val2014_000000000074.jpg'},
 {'image_caption': 'there is a small desk with a lamp on it in a room',
  'image_id': 133,
  'image_file': 'COCO_val2014_000000000133.jpg'},
 {'image_caption': 'there are two giraffes standing in a room with a woman looking at them',
  'image_id': 136,
  'image_file': 'COCO_val2014_000000000136.jpg'},
 {'image_caption': 'brightly colored living room with a television and a fireplace',
  'image_id': 139,
  'image_file': 'COCO_val2014_000000000139.jpg'},
 {'image_caption': 'there are many birds sitting on the branches of a tree',
  'image_id': 143,
  'image_file': 'COCO_val2014_000000000143.jpg'},
 {'image_caption': 'there is a kitchen with a table and a refrigerator in it',
  'image_id': 164,
  'image_file': 'COCO_val2014_000000000164.jpg'},
 {'image_caption': 'arafed baseball player walking on home plate with catcher and umpire',
  'image_id': 192,
  'image_file': 'COCO_val2014_000000000192.jpg'},
 {'image_caption': 'there are many different foods on the table ready to be eaten',
  'image_id': 196,
  'image_file': 'COCO_val2014_000000000196.jpg'}]

In [14]:
# for x in relevant_annotations:
#     print(f"multiple choice answer: {x['multiple_choice_answer']}")


[{'question_id': 42000,
  'question': {'What color are the gym shoes?'},
  'answer': 'white',
  'plausible_answers': {'white', 'white and black'}},
 {'question_id': 42001,
  'question': {'Is there a red sandal here?'},
  'answer': 'yes',
  'plausible_answers': {'yes'}},
 {'question_id': 42002,
  'question': {'What color is the flip flop?'},
  'answer': 'red',
  'plausible_answers': {'black', 'red', 'red and blue'}},
 {'question_id': 74000,
  'question': {'Does this dog have a collar?'},
  'answer': 'no',
  'plausible_answers': {'no', 'yes'}},
 {'question_id': 74001,
  'question': {'Where is the dog laying?'},
  'answer': 'sidewalk',
  'plausible_answers': {'ground', 'outside', 'sidewalk', 'street', 'yes'}},
 {'question_id': 74002,
  'question': {'What is the dog doing?'},
  'answer': 'sleeping',
  'plausible_answers': {'it is sleeping',
   'laying down',
   'lying down',
   'resting',
   'sleeping'}},
 {'question_id': 133000,
  'question': {'What color is lamp?'},
  'answer': 'blue',
 