# COCO Entities

In [1]:
import json

import os
def get_coco_entities_splits(coco_entities_path):
    """Load a COCO Entities JSON file and group its images by dataset split.

    Args:
        coco_entities_path: Path to a JSON file whose top level is a mapping
            ``image_id -> entities``. ``entities`` is itself a mapping whose
            values are dicts carrying a ``"split"`` key.

    Returns:
        dict: ``{"train": {...}, "val": {...}, "test": {...}}``; each inner
        dict maps ``image_id`` to its ``entities`` mapping, unchanged.

    Raises:
        IndexError: If an image has an empty ``entities`` mapping.
        KeyError: If an entry has no ``"split"`` key, or its split is not one
            of ``"train"``, ``"val"``, ``"test"``.
    """
    # Use a context manager so the file handle is closed deterministically
    # instead of being leaked until garbage collection.
    with open(coco_entities_path, "r") as f:
        coco_entities = json.load(f)

    coco_entities_splits = {"train": {}, "val": {}, "test": {}}
    for image_id, entities in coco_entities.items():
        # Only the first entry is inspected; this assumes every entry of an
        # image carries the same split -- TODO confirm against the dataset.
        split = list(entities.values())[0]["split"]
        coco_entities_splits[split][image_id] = entities
    return coco_entities_splits

In [2]:
# Absolute, machine-specific location of the entities release JSON; adjust per environment.
coco_entities_path = "/root/Documents/DATASETS/CoCo_Entities/coco_entities_release.json"
# Parse the file once and keep the images grouped by split ("train" / "val" / "test").
coco_entities_splits = get_coco_entities_splits(coco_entities_path)

In [3]:
# Per split: record the image ids as integers and report image / caption totals.
coco_entities_image_ids = {"train": [], "val": [], "test": []}
for split_name, split in coco_entities_splits.items():
    image_ids = list(split)
    coco_entities_image_ids[split_name] = [int(image_id) for image_id in image_ids]
    # Sanity check: no image id may appear twice within a split.
    assert len(set(image_ids)) == len(image_ids)
    # Each image contributes len(<its entry>) to the caption total.
    cap_num = sum(len(split[str(image_id)]) for image_id in image_ids)
    print("# {} [images|captions]: [{}|{}]".format(split_name, len(image_ids), cap_num))

# train [images|captions]: [113179|544926]
# val [images|captions]: [4995|24015]
# test [images|captions]: [4995|23940]


# COCO Captions

In [4]:
def get_coco_annotations_splits(ann_root):
    """Load the three Karpathy-split COCO caption annotation files.

    Args:
        ann_root: Directory containing ``coco_karpathy_train.json``,
            ``coco_karpathy_val.json`` and ``coco_karpathy_test.json``.

    Returns:
        dict: Maps ``"train"``, ``"val"`` and ``"test"`` to the parsed JSON
        content of the corresponding file.

    Raises:
        FileNotFoundError: If one of the three files is missing from ``ann_root``.
    """
    splits = {
        "train": 'coco_karpathy_train.json',
        "val": 'coco_karpathy_val.json',
        "test": 'coco_karpathy_test.json',
    }

    coco_ann_splits = {}
    for split, filename in splits.items():
        # Context manager guarantees each file handle is closed right after
        # parsing rather than being leaked until garbage collection.
        with open(os.path.join(ann_root, filename), "r") as f:
            coco_ann_splits[split] = json.load(f)
    return coco_ann_splits

In [5]:
# Absolute, machine-specific directory holding the Karpathy-split annotation JSON files; adjust per environment.
ann_root = "/root/Documents/DATASETS/MS_COCO/annotations"
# Load the train / val / test annotation files into a dict keyed by split name.
coco_ann_splits = get_coco_annotations_splits(ann_root)

In [6]:
# Per split: collect the distinct integer image ids and report image / caption totals.
coco_ann_image_ids = {"train": [], "val": [], "test": []}
for split_name, split in coco_ann_splits.items():
    seen_ids = []
    cap_num = 0
    for ann in split:
        # The id is the text after the last "_" of the image name, up to the next ".".
        id_text = ann["image"].rsplit('_', 1)[-1].partition('.')[0]
        seen_ids.append(int(id_text))
        caption = ann["caption"]
        # A list-valued caption counts once per element; anything else counts as one caption.
        if type(caption) is list:
            cap_num += len(caption)
        else:
            cap_num += 1
    # Several annotations may refer to the same image, so deduplicate the ids.
    image_ids = list(set(seen_ids))
    coco_ann_image_ids[split_name] = image_ids
    print("# {} [images|captions]: [{}|{}]".format(split_name, len(image_ids), cap_num))

# train [images|captions]: [113287|566747]
# val [images|captions]: [5000|25010]
# test [images|captions]: [5000|25010]


# Differences between COCO Entities and COCO Captions

In [7]:
# For each split, count the image ids that appear in only one of the two datasets.
for split in ("train", "val", "test"):
    coco_entities_set = set(coco_entities_image_ids[split])
    coco_ann_set = set(coco_ann_image_ids[split])
    only_in_ann = coco_ann_set.difference(coco_entities_set)
    only_in_entities = coco_entities_set.difference(coco_ann_set)
    print("# of {} images in coco_ann but not in coco_entities: {}".format(split, len(only_in_ann)))
    print("# of {} images in coco_entities but not in coco_ann: {}".format(split, len(only_in_entities)))

# of train images in coco_ann but not in coco_entities: 108
# of train images in coco_entities but not in coco_ann: 0
# of val images in coco_ann but not in coco_entities: 5
# of val images in coco_entities but not in coco_ann: 0
# of test images in coco_ann but not in coco_entities: 5
# of test images in coco_entities but not in coco_ann: 0
