# COCO Caption Analysis

We use this notebook to generate an annotation file that contains a smaller subset of the validation dataset.

In [21]:
import os
import json
# Number of image records (taken from the start of the image list) to keep in the subset
NUM_IMAGES = 1000

# Full COCO val2017 caption annotation file.
# Set the COCO_ANNOTATION_PATH environment variable to point at your own copy;
# the fallback is the location this notebook was originally run with.
original_annotation_path = os.environ.get(
    "COCO_ANNOTATION_PATH",
    r"D:\Programming\CAP 5415\ClipSemanticSearch\RecallBenchmarking\annotation_files\captions_val2017.json",
)
# Output file, relative to the working directory; the name records the subset size
output_filename = f'./captions_val2017_{NUM_IMAGES}_subset.json'

In [5]:
# Load the full caption annotation file into memory.
# The encoding is given explicitly rather than relying on the platform default,
# which is not UTF-8 on every system (notably Windows).
with open(original_annotation_path, 'r', encoding='utf-8') as f:
    coco_data = json.load(f)

We verify the contents and format of the annotations

In [7]:
# List the top-level sections of the loaded annotation file
coco_data.keys()

dict_keys(['info', 'licenses', 'images', 'annotations'])

In [11]:
# Peek at the first image record to see which fields an image entry carries
coco_data['images'][0]

{'license': 4,
 'file_name': '000000397133.jpg',
 'coco_url': 'http://images.cocodataset.org/val2017/000000397133.jpg',
 'height': 427,
 'width': 640,
 'date_captured': '2013-11-14 17:02:52',
 'flickr_url': 'http://farm7.staticflickr.com/6116/6255196340_da26cf2c9e_z.jpg',
 'id': 397133}

In [12]:
# Peek at the first annotation record; its 'image_id' is what gets matched
# against image 'id' values when the subset is built
coco_data['annotations'][0]

{'image_id': 179765,
 'id': 38,
 'caption': 'A black Honda motorcycle parked in front of a garage.'}

In [8]:
# Number of image records in the full annotation file
len(coco_data['images'])

5000

In [9]:
# Number of annotation records in the full annotation file
len(coco_data['annotations'])

25014

Next we will extract a subset of the images and the corresponding annotations

In [17]:
# Keep the leading NUM_IMAGES image records
image_subset = coco_data['images'][:NUM_IMAGES]

# Collect the IDs of the selected images into a set for fast membership tests
image_ids = {image['id'] for image in image_subset}

# Keep every annotation whose image is part of the subset
annotation_subset = [
    entry
    for entry in coco_data['annotations']
    if entry['image_id'] in image_ids
]

In [18]:
# Report how many image IDs and annotations ended up in the subset (one count per line)
print(len(image_ids), len(annotation_subset), sep='\n')

1000
5005


In [19]:
# Assemble the reduced annotation file: the metadata sections are carried over
# unchanged, while images and annotations are replaced by their subsets
subset_data = {key: coco_data[key] for key in ('info', 'licenses')}
subset_data['images'] = image_subset
subset_data['annotations'] = annotation_subset

In [22]:
# Write the subset annotations to disk as JSON.
# The encoding is explicit so the file is written the same way on every platform.
with open(output_filename, 'w', encoding='utf-8') as f:
    json.dump(subset_data, f)

# Verify the JSON output file

In [23]:
# Reload the file that was just written to confirm it parses as JSON
with open(output_filename, 'r', encoding='utf-8') as f:
    coco_output_data = json.load(f)

# Fail loudly if what is on disk does not match what was built in memory,
# instead of relying on someone reading the printed numbers
assert len(coco_output_data['images']) == len(image_subset)
assert len(coco_output_data['annotations']) == len(annotation_subset)

print(coco_output_data.keys())
print(len(coco_output_data['images']))
print(len(coco_output_data['annotations']))

dict_keys(['info', 'licenses', 'images', 'annotations'])
1000
5005
