In [1]:
# Notebook configuration.
# Upper bound on how many entries are parsed (and therefore downloaded) per split.
DOWNLOAD_AMOUNT = 500 # set to None for unlimited
# Directory prefix of the label JSON files; it is concatenated directly with the
# file name, so it has to end with a path separator.
LABELS_PATH = '../data/labels/'
# Length of the multi-hot label vector built for every annotated image.
LABEL_CNT = 228
# Edge length in pixels that every downloaded image is resized to.
IMG_SIZE = 299
# Path of the HDF5 file the image and label datasets are written to.
OUT_FILE = '../data/images.hdf5'

In [2]:
import numpy as np
import os
import zipfile
import json
import h5py
import urllib3
import multiprocessing
from PIL import Image
from tqdm import tqdm
from urllib3.util import Retry
urllib3.disable_warnings()
import io

# Load image URLs

In [3]:
def parse(fname, path, max_parse=None):
    """
    Parse an image-listing JSON file into (id, url) and (id, label-vector) pairs.

    If ``path + fname`` does not exist yet, the archive ``path + fname + '.zip'``
    is extracted into ``path`` first.

    Parameters
    ----------
    fname : str
        Name of the JSON file, e.g. ``'train.json'``.
    path : str
        Prefix that is concatenated verbatim in front of ``fname``; it must
        therefore end with a path separator.
    max_parse : int or None
        If not None, only the first ``max_parse`` entries of each list are
        returned.

    Returns
    -------
    ids_urls : list of tuple
        ``(imageId, url)`` for every entry of ``data["images"]``, in file order.
    ids_labels : list of tuple
        ``(imageId, label_vector)`` for every entry of ``data["annotations"]``;
        ``label_vector`` is an ``np.int8`` array of length ``LABEL_CNT`` with a 1
        at position ``labelId - 1`` for each listed label. Empty when the file
        has no ``"annotations"`` key.
    """
    json_path = path + fname
    # Check the same location that is read below. Testing the bare file name
    # (relative to the working directory) would re-extract the archive on every
    # call and fail when only the already-extracted JSON is present.
    if not os.path.exists(json_path):
        with zipfile.ZipFile(json_path + '.zip', "r") as zip_ref:
            zip_ref.extractall(path)

    with open(json_path, 'r') as f:
        data = json.load(f)

    ids_urls = [(image["imageId"], image["url"]) for image in data["images"]]

    ids_labels = []
    if "annotations" in data:
        for annotation in data["annotations"]:
            # Shift label ids by one so that label 1 lands at vector position 0.
            # The explicit integer dtype keeps an empty label list usable as an
            # index (NumPy would otherwise create a float array and refuse it).
            label_idx = np.array(list(map(int, annotation["labelId"])), dtype=np.intp) - 1
            label_vector = np.zeros(LABEL_CNT, dtype=np.int8)
            label_vector[label_idx] = 1
            ids_labels.append((annotation["imageId"], label_vector))

    if max_parse is not None:
        ids_urls = ids_urls[:max_parse]
        ids_labels = ids_labels[:max_parse]

    return ids_urls, ids_labels

In [4]:
# Parse each split's JSON listing; every split is capped at DOWNLOAD_AMOUNT entries.
train_ids_urls, train_ids_labels = parse(
    fname='train.json', path=LABELS_PATH, max_parse=DOWNLOAD_AMOUNT)
val_ids_urls, val_ids_labels = parse(
    fname='validation.json', path=LABELS_PATH, max_parse=DOWNLOAD_AMOUNT)
# The labels returned for the test split are discarded.
test_ids_urls, _ = parse(
    fname='test.json', path=LABELS_PATH, max_parse=DOWNLOAD_AMOUNT)

# Set up the HDF5 file

In [5]:
# Dataset shapes: one row per parsed image, each stored as an
# IMG_SIZE x IMG_SIZE image with 3 colour channels. IMG_SIZE is used instead of
# a literal so the shapes cannot drift from the size the images are resized to.
train_shape = (len(train_ids_urls), IMG_SIZE, IMG_SIZE, 3)
val_shape = (len(val_ids_urls), IMG_SIZE, IMG_SIZE, 3)
test_shape = (len(test_ids_urls), IMG_SIZE, IMG_SIZE, 3)

In [6]:
# Open the output file for writing (mode 'w') and allocate all datasets up front.
hdf5_file = h5py.File(OUT_FILE, mode='w')

# Image datasets: uint8 pixels, chunked storage.
for dataset_name, dataset_shape in (
        ("train_img", train_shape),
        ("val_img", val_shape),
        ("test_img", test_shape)):
    hdf5_file.create_dataset(dataset_name, dataset_shape, np.uint8, chunks=True)

# Label datasets: one int8 row of LABEL_CNT entries per image, filled in list order.
for dataset_name, split_ids_urls, split_ids_labels in (
        ("train_labels", train_ids_urls, train_ids_labels),
        ("val_labels", val_ids_urls, val_ids_labels)):
    hdf5_file.create_dataset(dataset_name, (len(split_ids_urls), LABEL_CNT), np.int8)
    hdf5_file[dataset_name][...] = [label_vector for (_, label_vector) in split_ids_labels]

# Download images

In [7]:
def download_image(id_url):
    """
    Download one image and return it as a fixed-size RGB pixel array.

    Parameters
    ----------
    id_url : tuple
        ``(id, url)`` pair describing the image to fetch.

    Returns
    -------
    tuple
        ``(id, pixels)`` where ``pixels`` is the ``np.array`` of the image after
        conversion to RGB and resizing to ``IMG_SIZE`` x ``IMG_SIZE``.

    Raises
    ------
    OSError
        If the server answers with a status other than 200. Errors raised by
        urllib3 (e.g. exhausted retries, timeouts) or by PIL while decoding the
        payload propagate unchanged.
    """
    image_id, url = id_url
    http = urllib3.PoolManager(
        retries=Retry(connect=3, read=2, redirect=3),
        # Without a timeout a stalled server would block this worker forever.
        timeout=urllib3.Timeout(connect=10.0, read=30.0),
    )
    response = http.request("GET", url)
    if response.status != 200:
        # Fail with a clear message instead of letting PIL choke on an error page.
        raise OSError("HTTP %s while downloading image %s from %s"
                      % (response.status, image_id, url))
    with Image.open(io.BytesIO(response.data)) as image:
        # Convert before resizing: palette ("P") and bilevel ("1") images are
        # otherwise resized with nearest-neighbour resampling only.
        image_rgb = image.convert("RGB").resize((IMG_SIZE, IMG_SIZE))
    return (image_id, np.array(image_rgb))
    

def download(ids_urls, h5_dataset, processes=30):
    """
    Download all images in parallel and store them in ``h5_dataset``.

    Parameters
    ----------
    ids_urls : list of tuple
        ``(id, url)`` pairs; the image of the i-th pair is written to row ``i``.
    h5_dataset
        Target dataset supporting ``h5_dataset[row, ...] = pixels`` assignment,
        with at least ``len(ids_urls)`` rows.
    processes : int
        Number of worker processes used for downloading (default 30).

    Notes
    -----
    Rows are addressed by position in ``ids_urls`` rather than by ``int(id) - 1``.
    For ids that run 1..N in list order this is the same row, but it also stays
    in range for non-contiguous ids and matches the label datasets in this
    notebook, which are filled in list order.
    """
    row_of = {image_id: row for row, (image_id, _) in enumerate(ids_urls)}
    # The context manager shuts the worker processes down once all results have
    # been consumed; otherwise every call would leave its workers running.
    with multiprocessing.Pool(processes=processes) as pool, \
            tqdm(total=len(ids_urls)) as progress_bar:
        # Results arrive in completion order, hence the id -> row lookup.
        for image_id, img in pool.imap_unordered(download_image, ids_urls):
            h5_dataset[row_of[image_id], ...] = img
            progress_bar.update(1)

In [8]:
# Fill the image dataset of each split, in the order train, validation, test.
for split_ids_urls, dataset_name in (
        (train_ids_urls, "train_img"),
        (val_ids_urls, "val_img"),
        (test_ids_urls, "test_img")):
    download(split_ids_urls, hdf5_file[dataset_name])

100%|██████████| 500/500 [00:08<00:00, 55.70it/s]
100%|██████████| 500/500 [00:09<00:00, 55.01it/s]
100%|██████████| 500/500 [00:12<00:00, 41.36it/s]


In [9]:
# Close the HDF5 file that was opened for writing once all datasets are filled.
hdf5_file.close()