In [1]:
import glob
import sys
import logging
from tqdm import tqdm
from PIL import Image
from os import path
from pathlib import Path
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import time
import csv
import random
import pandas as pd
import math
import json

from tqdm.notebook import tqdm

In [2]:
input_dir = Path('../../data/01_raw/Hack the Planet')
coco_filepath = input_dir / "coco.json"
image_root_dir = input_dir / "images"

In [82]:
def get_filepaths(root: Path, allowed_suffixes={".jpg", ".JPG", ".PNG"}) -> list[Path]:
    """Lists all filepaths given a root directory `root`."""
    return [p for p in root.rglob("*") if p.suffix in allowed_suffixes and p.exists()]

In [3]:
with open(coco_filepath, 'r') as f:
    data = json.load(f)

In [4]:
data.keys()

dict_keys(['info', 'licenses', 'categories', 'images', 'annotations'])

In [5]:
def is_valid_image_filepath(image_filepath: Path) -> bool:
    """
    Checks whether the image_filepath exists and can be read.
    """
    if not image_filepath.exists():
        return False
    else:
        try:
            image = Image.open(image_filepath)
            return True
        except:
            return False

In [8]:
def get_image_filepaths_without_bears(coco_filepath: dict, image_root_dir: Path) -> list[Path]:
    data = None
    with open(coco_filepath, 'r') as f:
        data = json.load(f)
    if not data:
        logging.error(f"could not load coco_filepath: {coco_filepath}")
        return []
    else:
        id_to_image_data = {image_data["id"]: image_data for image_data in data["images"]}
        class_id_to_label = {category["id"]: category["name"] for category in data["categories"]}
        label_to_class_id = {category["name"]: category["id"] for category in data["categories"]}
        annotations = data['annotations']
        annotations_without_bears = [annotation for annotation in annotations if (annotation["category_id"] != label_to_class_id["bear"])]
        image_filepaths_without_bears = [image_root_dir / id_to_image_data[annotation["image_id"]]["file_name"] for annotation in annotations_without_bears]
        return [image_filepath for image_filepath in image_filepaths_without_bears if is_valid_image_filepath(image_filepath)]

In [9]:
image_filepaths_without_bears = get_image_filepaths_without_bears(coco_filepath=coco_filepath, image_root_dir=image_root_dir)

In [10]:
len(image_filepaths_without_bears)

14073

In [11]:
random_seed = 42
n = 10

samples_without_bears = random.Random(random_seed).sample(image_filepaths_without_bears, n)

In [12]:
samples_without_bears

[PosixPath('../../data/01_raw/Hack the Planet/images/frameOutput/data/Bison - animals only/18RaulTarguluiAG/Hranitor/20210226_20210308/03070944_frames/image_570.jpg'),
 PosixPath('../../data/01_raw/Hack the Planet/images/frameOutput/data/Season5 - animals only/22RucarAG/Plaghia_Loc boncanit_2/20210915_20210930/09270074_frames/image_1230.jpg'),
 PosixPath('../../data/01_raw/Hack the Planet/images/frameOutput/data/Bison - animals only/21IzvoareleDamboviteiAG/Tamas/Sararie/20220519_20220523/05200571_frames/image_180.jpg'),
 PosixPath('../../data/01_raw/Hack the Planet/images/frameOutput/data/Bison - animals only/18RaulTarguluiAG/Hranitor/20210331_20210415/04100667_frames/image_30.jpg'),
 PosixPath('../../data/01_raw/Hack the Planet/images/frameOutput/data/Bison - animals only/18RaulTarguluiAG/Tarc desfacut/20210817_20210903/08280144_frames/image_120.jpg'),
 PosixPath('../../data/01_raw/Hack the Planet/images/frameOutput/data/Season1 - animals only/POST 28/BLIDARU_Bushnell_CC00YY_FCC-CAM-0