In [1]:
# Mount Google Drive at /content/gdrive so the notebook can access files stored there.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [2]:
import sys
# Project folder on the mounted Drive.
drive_path = '/content/gdrive/MyDrive/DeepMirror'
# Put the project folder first on the module search path so modules stored there can be imported.
sys.path.insert(0, drive_path)

In [3]:
from pycocotools.coco import COCO
import requests
import os
from os.path import join
from tqdm import tqdm
import json

#### Obtaining a subset of the COCO dataset, consisting of two categories. Implemented with the help of Jalagarto's coco_utils COCO API wrapper: https://github.com/Jalagarto/coco_utils.


In [7]:
class coco_category_filter:
    """
    Downloads the images of one category & filters the annotation json
    to only keep the images, annotations and category entry of this category.

    Attributes:
        coco: COCO index built from json_path
        json_path: path of the source annotation json
        imgs_dir: folder the downloaded images are written to
        categ: name of the category to keep
        catIds: list of category ids matching categ (set by get_imgs_from_json)
        images: list of image dictionaries that contain the category
    """
    def __init__(self, json_path, imgs_dir, categ='bus'):
        self.coco = COCO(json_path) # instantiate coco class
        self.json_path = json_path
        self.imgs_dir = imgs_dir
        self.categ = categ
        self.images = self.get_imgs_from_json()

    def get_imgs_from_json(self):
        """
        returns the image dictionaries of the desired category and stores
        the matching category ids in self.catIds

        raises ValueError when the category name is not present in the json
        """
        # Look up the ids of the category of interest
        catIds = self.coco.getCatIds(catNms=[self.categ])
        print("catIds: ", catIds)
        if not catIds:
            # An empty catIds list is treated by getImgIds as "no filter", which
            # would select (and later download) every image of the dataset.
            raise ValueError(f"category '{self.categ}' not found in '{self.json_path}'")
        # Get the corresponding image ids and images using loadImgs
        imgIds = self.coco.getImgIds(catIds=catIds)
        images = self.coco.loadImgs(imgIds)
        print(f"{len(images)} images in '{self.json_path}' with '{self.categ}' instances")
        self.catIds = catIds # list
        return images

    def save_imgs(self, timeout=30):
        """
        saves the images of this category into self.imgs_dir

        timeout: seconds to wait for the server before giving up on a request
        raises requests.RequestException on a connection problem, a timeout
        or a non-success HTTP status
        """
        print("Saving the images with required categories ...")
        os.makedirs(self.imgs_dir, exist_ok=True)
        # One session for all requests so the connection to the host is reused
        with requests.Session() as session:
            for im in tqdm(self.images):
                response = session.get(im['coco_url'], timeout=timeout)
                # Fail loudly instead of storing an HTTP error page as an image file
                response.raise_for_status()
                with open(os.path.join(self.imgs_dir, im['file_name']), 'wb') as handler:
                    handler.write(response.content)

    def filter_json_by_category(self, new_json_path):
        """
        creates a new json with the desired category

        new_json_path: path of the json to write; its parent folder is created if needed
        The loaded dataset (self.coco) and self.images are left untouched, so
        the method can be called more than once with the same result.
        """
        # {'supercategory': 'person', 'id': 1, 'name': 'person'}
        print("Filtering the annotations ... ")
        json_parent = os.path.split(new_json_path)[0]
        if json_parent:  # a bare file name has no parent; os.makedirs('') would raise
            os.makedirs(json_parent, exist_ok=True)
        # Sets give constant-time membership tests in the filters below
        imgs_ids = {x['id'] for x in self.images} # img_ids of imgs with the category
        catIds = set(self.catIds)
        ### Filter images (shallow copies: the dicts are shared with self.coco and
        ### self.images, and modify_ids rewrites the ids in place)
        new_imgs = [dict(x) for x in self.coco.dataset['images'] if x['id'] in imgs_ids]
        ### Filter annotations
        new_annots = [dict(x) for x in self.coco.dataset['annotations'] if x['category_id'] in catIds]
        ### Reorganize the ids
        new_imgs, new_annots = self.modify_ids(new_imgs, new_annots)
        ### Filter categories
        new_categories = [x for x in self.coco.dataset['categories'] if x['id'] in catIds]
        print("new_categories: ", new_categories)
        data = {
            # 'info' and 'licenses' may be missing from a json; fall back to empty values
            "info": self.coco.dataset.get('info', {}),
            "licenses": self.coco.dataset.get('licenses', []),
            "images": new_imgs,
            "annotations": new_annots,
            "categories": new_categories
            }
        print("saving json: ")
        with open(new_json_path, 'w') as f:
            json.dump(data, f)

    def modify_ids(self, images, annotations):
        """
        creates new consecutive ids (starting from one) for the images and the
        annotations, and points every annotation to the new id of its image.
        The dictionaries are modified in place and returned back.

        images: list of image dictionaries
        annotations: list of annotation dictionaries; each 'image_id' must be
            the id of one of the given images (KeyError otherwise)
        """
        print("Reinitialicing images and annotation IDs ...")
        ### Images
        old_new_imgs_ids = {}  # necessary for the annotations!
        for new_id, im in enumerate(images, start=1):
            old_new_imgs_ids[im['id']] = new_id  # dicto with old im_ids and new im_ids
            im['id'] = new_id # reorganize the ids
        ### Annotations
        for new_id, ann in enumerate(annotations, start=1):
            ann['id'] = new_id
            ann['image_id'] = old_new_imgs_ids[ann['image_id']]  # replace im_ids in the annotations as well
        return images, annotations




In [8]:
def main(subset, year, root_dir, category='bus'):
    """
    Builds the one-category subset for one split: downloads the images of
    the category and writes the filtered annotation json.

    subset: split name used in the file names (e.g. 'train' or 'val')
    year: dataset year as a string (e.g. '2017')
    root_dir: output folder; images are saved to <root_dir>/<category>_<subset>
        and the new json to <root_dir>/annotations/<subset>.json
    category: name of the category to keep
    """
    # NOTE: dirname() only drops a trailing '/' when root_dir ends with one;
    # without the trailing slash it resolves to the parent folder of root_dir.
    annotations_base = os.path.dirname(root_dir)
    source_json = join(annotations_base, 'COCOannotations', ''.join(('instances_', subset, year, '.json')))   # local path
    images_folder = join(root_dir, '_'.join((category, subset)))
    target_json = join(root_dir, 'annotations', ''.join((subset, ".json")))
    # Build the filter (this loads the source annotations), then download and export
    category_filter = coco_category_filter(source_json, images_folder, categ=category)
    category_filter.save_imgs()
    category_filter.filter_json_by_category(target_json)

In [9]:
# Build the 'bus' subset from the 2017 training split.
subset = 'train'  # split name: 'train' or 'val'
year = '2017'
root_dir = '/content/gdrive/MyDrive/DeepMirror/COCOdata/subset_dataset/'
main(subset, year, root_dir, category="bus")

loading annotations into memory...
Done (t=34.88s)
creating index...
index created!
catIds:  [6]
3952 images in '/content/gdrive/MyDrive/DeepMirror/COCOdata/subset_dataset/COCOannotations/instances_train2017.json' with 'bus' instances
Saving the images with required categories ...


100%|██████████| 3952/3952 [14:59<00:00,  4.39it/s]


Filtering the annotations ... 
Reinitialicing images and annotation IDs ...
new_categories:  [{'supercategory': 'vehicle', 'id': 6, 'name': 'bus'}]
saving json: 


In [10]:
# Build the 'bus' subset from the 2017 validation split.
subset = 'val'  # split name: 'train' or 'val'
year = '2017'
root_dir = '/content/gdrive/MyDrive/DeepMirror/COCOdata/subset_dataset/'
main(subset, year, root_dir, category="bus")

loading annotations into memory...
Done (t=1.21s)
creating index...
index created!
catIds:  [6]
189 images in '/content/gdrive/MyDrive/DeepMirror/COCOdata/subset_dataset/COCOannotations/instances_val2017.json' with 'bus' instances
Saving the images with required categories ...


100%|██████████| 189/189 [00:41<00:00,  4.56it/s]

Filtering the annotations ... 
Reinitialicing images and annotation IDs ...
new_categories:  [{'supercategory': 'vehicle', 'id': 6, 'name': 'bus'}]
saving json: 



