### Baseline model

In [1]:
# from huggingface import AutoModel

In [1]:
!pip install huggingface transformers torch torchvision

Defaulting to user installation because normal site-packages is not writeable


In [26]:
import torch 
from transformers import pipeline  # Use a pipeline as a high-level helper
import os # For all os level functions
import json # For parsing the json files


In [39]:
# Locations of the validation split, relative to this notebook.
validation_folder = "../data/validation"
validation_images_folder = f"{validation_folder}/images"

# Upper bounds on how many images are listed and how many of those are sampled.
MAX_VALIDATION_IMAGES = 10
SAMPLE_SIZE = 30  # larger than MAX_VALIDATION_IMAGES, so the sample currently keeps every listed image

# os.listdir returns entries in arbitrary order; sort the names first so the
# selected subset is the same on every run and on every machine.
validation_image_files = [
    os.path.join(validation_images_folder, file_name)
    for file_name in sorted(os.listdir(validation_images_folder))[:MAX_VALIDATION_IMAGES]
]

sample_validation_image_files = validation_image_files[:SAMPLE_SIZE]

### Set up the baseline performance


In [41]:
# Work on the sampled validation images from here on.
image_files = sample_validation_image_files

# The numeric image id is the last "_"-separated token of the file name,
# taken from the part before the first "." (e.g. "..._000000000042.jpg" -> 42).
image_ids = [
    int(file_name.split(".")[0].split("_")[-1])
    for file_name in (image_path.split(os.sep)[-1] for image_path in image_files)
]
image_ids

[42, 73, 74, 133, 136, 139, 143, 164, 192, 196]

In [43]:
## Obtain the captions for these image files
from transformers import pipeline  # kept so this cell still runs on its own
blipPipe = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large")

# Keep the raw pipeline output under its own name so that `generated_captions`
# only ever refers to the final list of caption records.
raw_caption_outputs = blipPipe(image_files)

# zip() below silently truncates to the shortest input, so fail loudly if the
# pipeline did not return exactly one result per image.
assert len(raw_caption_outputs) == len(image_files), "expected one pipeline result per image file"

### Get the generated text along with image_id, image_file
# Each pipeline result is indexed as result[0]['generated_text'] (first candidate caption).
captions = [candidates[0]['generated_text'] for candidates in raw_caption_outputs]
generated_captions = [
    {"image_caption": caption, "image_id": image_id, "image_file": image_file.split(os.sep)[-1]}
    for caption, image_id, image_file in zip(captions, image_ids, image_files)
]
generated_captions

[{'image_caption': 'there is a dog that is laying on a rack with shoes',
  'image_id': 42,
  'image_file': 'COCO_val2014_000000000042.jpg'},
 {'image_caption': 'there are two motorcycles parked next to each other on the street',
  'image_id': 73,
  'image_file': 'COCO_val2014_000000000073.jpg'},
 {'image_caption': 'araffe dog laying on the ground next to a bike on a city street',
  'image_id': 74,
  'image_file': 'COCO_val2014_000000000074.jpg'},
 {'image_caption': 'there is a small desk with a lamp on it in a room',
  'image_id': 133,
  'image_file': 'COCO_val2014_000000000133.jpg'},
 {'image_caption': 'there are two giraffes standing in a room with a woman looking at them',
  'image_id': 136,
  'image_file': 'COCO_val2014_000000000136.jpg'},
 {'image_caption': 'brightly colored living room with a television and a fireplace',
  'image_id': 139,
  'image_file': 'COCO_val2014_000000000139.jpg'},
 {'image_caption': 'there are many birds sitting on the branches of a tree',
  'image_id': 1

In [45]:
### Setup the questions and answers for the image files
def _load_records_for_image_ids(file_path: str, records_key: str, image_ids: list) -> list:
    """Return the records stored under ``records_key`` whose 'image_id' is in ``image_ids``.

    Args:
        file_path: Path of the JSON file to read.
        records_key: Top-level key of the JSON object that holds the list of records.
        image_ids: Image ids to keep.

    Returns:
        The matching records, in the order they appear in the file.
    """
    # A set gives constant-time membership tests instead of scanning the id list per record.
    wanted_image_ids = set(image_ids)
    # JSON files are UTF-8; do not rely on the platform's default encoding.
    with open(file_path, "r", encoding="utf-8") as file:
        records = json.load(file)[records_key]
    return [record for record in records if record['image_id'] in wanted_image_ids]


def obtain_questions_for_image_ids(image_ids: list, question_file_path: str= "questions.json"):
    """Obtain the questions for the given image ids from the questions file.

    Args:
        image_ids: Image ids whose questions should be returned.
        question_file_path: Path of a JSON file with a top-level 'questions' list;
            each entry is expected to carry an 'image_id' key.

    Returns:
        A list of the question entries whose 'image_id' is in ``image_ids``,
        in file order.
    """
    return _load_records_for_image_ids(question_file_path, 'questions', image_ids)


def obtain_annotations_for_image_ids(image_ids: list, annotations_file_path: str = "annotations.json"):
    """Obtain the annotations for the given image ids from the annotations file.

    Args:
        image_ids: Image ids whose annotations should be returned.
        annotations_file_path: Path of a JSON file with a top-level 'annotations'
            list; each entry is expected to carry an 'image_id' key.

    Returns:
        A list of the annotation entries whose 'image_id' is in ``image_ids``,
        in file order.
    """
    return _load_records_for_image_ids(annotations_file_path, 'annotations', image_ids)

# Load the annotation and question entries that belong to the sampled images.
relevant_annotations = obtain_annotations_for_image_ids(
    image_ids,
    f"{validation_folder}/annotations.json",
)
relevant_questions = obtain_questions_for_image_ids(
    image_ids,
    f"{validation_folder}/questions.json",
)

In [46]:
# Show the selected questions and annotations together as the cell output.
(relevant_questions, relevant_annotations)

([{'image_id': 42,
   'question': 'What color are the gym shoes?',
   'question_id': 42000},
  {'image_id': 42,
   'question': 'Is there a red sandal here?',
   'question_id': 42001},
  {'image_id': 42,
   'question': 'What color is the flip flop?',
   'question_id': 42002},
  {'image_id': 74,
   'question': 'Does this dog have a collar?',
   'question_id': 74000},
  {'image_id': 74,
   'question': 'Where is the dog laying?',
   'question_id': 74001},
  {'image_id': 74, 'question': 'What is the dog doing?', 'question_id': 74002},
  {'image_id': 133, 'question': 'What color is lamp?', 'question_id': 133000},
  {'image_id': 133,
   'question': 'Is this a child room?',
   'question_id': 133001},
  {'image_id': 133,
   'question': 'What size mattress would you need for this bed?',
   'question_id': 133002},
  {'image_id': 136, 'question': 'Is this in a museum?', 'question_id': 136000},
  {'image_id': 136,
   'question': 'How many animals are in the picture?',
   'question_id': 136001},
  {