In [1]:
import os
import shutil
import zipfile
import requests
import pandas as pd
import numpy as np
from tqdm import tqdm
from pycocotools.coco import COCO


def download_file(url, dest):
    """Download ``url`` to ``dest``, resuming a partial download when possible.

    If ``dest`` already exists and is non-empty, a ``Range`` request is sent
    for the missing tail. The existing bytes are only kept when the server
    confirms the range with ``206 Partial Content``; if it answers with the
    full body instead, the file is rewritten from scratch so the two copies
    are never concatenated.

    Args:
        url: Address of the file to fetch.
        dest: Local path the bytes are written to.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (other than a 416 that indicates the local copy is complete).
    """
    first_byte = os.path.getsize(dest) if os.path.exists(dest) else 0
    headers = {'Range': f'bytes={first_byte}-'} if first_byte else {}

    # Using the response as a context manager releases the connection even
    # if writing fails half-way through the stream.
    with requests.get(url, stream=True, timeout=30, headers=headers) as resp:
        if first_byte and resp.status_code == 416:
            # "Range Not Satisfiable": the requested offset is at/after the
            # end of the remote file. When the advertised remote size
            # ("bytes */N") matches what we have -- or is not advertised at
            # all -- treat the local copy as complete and write nothing.
            remote_size = resp.headers.get('content-range', '').rpartition('/')[2]
            if not remote_size.isdigit() or int(remote_size) == first_byte:
                return
        resp.raise_for_status()

        if resp.status_code == 206:
            mode = 'ab'
        else:
            # Range was not honoured (or not requested): the body is the
            # whole file, so start over instead of appending.
            first_byte = 0
            mode = 'wb'

        total_size = int(resp.headers.get('content-length', 0)) + first_byte
        with open(dest, mode) as f, tqdm(total=total_size, initial=first_byte,
                                         unit='B', unit_scale=True,
                                         desc=os.path.basename(dest)) as pbar:
            for chunk in resp.iter_content(8192):
                f.write(chunk)
                pbar.update(len(chunk))


def extract_zip(zip_path, extract_to):
    """Unpack every member of the archive ``zip_path`` into ``extract_to``.

    Prints a short confirmation once the archive has been extracted.
    """
    with zipfile.ZipFile(zip_path, mode='r') as archive:
        archive.extractall(path=extract_to)
    print(f"Unpacked {zip_path}")


def prepare_split(ann_path, img_src, img_dst, label_dst, split_name):
    """Move one split's images into place and write its multi-hot label table.

    Images named ``<zero-padded id>.jpg`` in ``img_src`` are moved to
    ``img_dst`` as ``<id>.jpg``. For every annotated image that is present in
    ``img_dst`` a row ``[image id, 0/1 per category]`` is written to
    ``<label_dst>/labels.csv``.

    Args:
        ann_path: Path to the annotation JSON handed to ``COCO``.
        img_src: Folder holding the extracted images; skipped if missing.
        img_dst: Folder the renamed images are moved into (created if needed).
        label_dst: Folder that receives ``labels.csv`` (created if needed).
        split_name: Label shown on the progress bar.

    Returns:
        Category names, ordered by ascending category id; this is also the
        column order of the label table.
    """
    os.makedirs(img_dst, exist_ok=True)
    os.makedirs(label_dst, exist_ok=True)

    if os.path.exists(img_src):
        for fname in os.listdir(img_src):
            if not fname.endswith('.jpg'):
                continue
            try:
                image_id = int(fname.split('.')[0])
            except ValueError:
                # Not an "<id>.jpg" file; leave it where it is rather than
                # aborting the whole split.
                continue
            dst_path = os.path.join(img_dst, f"{image_id}.jpg")
            if not os.path.exists(dst_path):
                shutil.move(os.path.join(img_src, fname), dst_path)
        # os.rmdir only works on empty folders; anything that was skipped
        # above (foreign files, already-present images) must not crash the run.
        if not os.listdir(img_src):
            os.rmdir(img_src)

    coco = COCO(ann_path)
    cat_ids = sorted(coco.getCatIds())
    # Relies on loadCats returning the categories in the order of the ids
    # passed in, so names line up with the columns below.
    class_names = [cat['name'] for cat in coco.loadCats(cat_ids)]
    # Constant-time category id -> column lookup (instead of list.index).
    column_of = {cat_id: col for col, cat_id in enumerate(cat_ids)}

    rows = []
    for img_id in tqdm(coco.getImgIds(), desc=split_name):
        # Only images that are actually on disk get a label row, so check
        # first and skip the annotation lookup for the rest.
        if not os.path.exists(os.path.join(img_dst, f"{img_id}.jpg")):
            continue
        multi_hot = np.zeros(len(cat_ids), dtype=np.int8)
        for ann in coco.loadAnns(coco.getAnnIds(imgIds=img_id, catIds=cat_ids)):
            multi_hot[column_of[ann['category_id']]] = 1
        rows.append([str(img_id)] + multi_hot.tolist())

    df = pd.DataFrame(rows, columns=["ImageID"] + class_names)
    df.to_csv(os.path.join(label_dst, "labels.csv"), index=False)
    return class_names


# Scratch folder for the downloaded archives and their extracted contents;
# it is deleted again at the end of this cell.
coco_dir = "./coco"
os.makedirs(coco_dir, exist_ok=True)

train_zip = os.path.join(coco_dir, "train2017.zip")
val_zip = os.path.join(coco_dir, "val2017.zip")
ann_zip = os.path.join(coco_dir, "annotations_trainval2017.zip")

print("=== Loading ===")
download_file("http://images.cocodataset.org/zips/train2017.zip", train_zip)
download_file("http://images.cocodataset.org/zips/val2017.zip", val_zip)
download_file("http://images.cocodataset.org/annotations/annotations_trainval2017.zip", ann_zip)

print("\n=== Unpacking ===")
extract_zip(train_zip, coco_dir)
extract_zip(val_zip, coco_dir)
extract_zip(ann_zip, coco_dir)

print("\n=== Preparation TRAIN ===")
class_names = prepare_split(
    os.path.join(coco_dir, "annotations/instances_train2017.json"),
    os.path.join(coco_dir, "train2017"),
    "data/images/train",
    "data/labels/train",
    "train"
)

print("\n=== Preparation VAL ===")
prepare_split(
    os.path.join(coco_dir, "annotations/instances_val2017.json"),
    os.path.join(coco_dir, "val2017"),
    "data/images/val",
    "data/labels/val",
    "val"
)

# Carve a held-out test split out of the validation images.
os.makedirs("data/images/test", exist_ok=True)
os.makedirs("data/labels/test", exist_ok=True)

val_df = pd.read_csv("data/labels/val/labels.csv")
val_ids = val_df["ImageID"].values

# At most 2500 images, and never more than half of the validation set.
test_size = min(2500, len(val_ids) // 2)
np.random.seed(42)  # fixed seed so the same images are picked on every run
test_ids = np.random.choice(val_ids, test_size, replace=False)

# A set gives constant-time membership; `x in ndarray` rescans the whole
# array for every element.
test_id_set = set(test_ids)
train_val_ids = [img_id for img_id in val_ids if img_id not in test_id_set]

for img_id in test_ids:
    src = os.path.join("data/images/val", f"{img_id}.jpg")
    dst = os.path.join("data/images/test", f"{img_id}.jpg")
    if os.path.exists(src) and not os.path.exists(dst):
        shutil.move(src, dst)

# One boolean mask splits the label table into its two disjoint halves.
is_test = val_df["ImageID"].isin(test_ids)

test_df = val_df[is_test]
test_df.to_csv("data/labels/test/labels.csv", index=False)

val_df = val_df[~is_test]
val_df.to_csv("data/labels/val/labels.csv", index=False)

print("\n=== Delete files ===")
# Cleanup is best-effort: an archive that is already gone must not abort it.
for archive_path in (train_zip, val_zip, ann_zip):
    if os.path.exists(archive_path):
        os.remove(archive_path)
shutil.rmtree(os.path.join(coco_dir, "annotations"), ignore_errors=True)
shutil.rmtree(coco_dir, ignore_errors=True)

KeyboardInterrupt: 

In [4]:
# Inspect the category names returned by prepare_split for the train split.
class_names

['person',
 'bicycle',
 'car',
 'motorcycle',
 'airplane',
 'bus',
 'train',
 'truck',
 'boat',
 'traffic light',
 'fire hydrant',
 'stop sign',
 'parking meter',
 'bench',
 'bird',
 'cat',
 'dog',
 'horse',
 'sheep',
 'cow',
 'elephant',
 'bear',
 'zebra',
 'giraffe',
 'backpack',
 'umbrella',
 'handbag',
 'tie',
 'suitcase',
 'frisbee',
 'skis',
 'snowboard',
 'sports ball',
 'kite',
 'baseball bat',
 'baseball glove',
 'skateboard',
 'surfboard',
 'tennis racket',
 'bottle',
 'wine glass',
 'cup',
 'fork',
 'knife',
 'spoon',
 'bowl',
 'banana',
 'apple',
 'sandwich',
 'orange',
 'broccoli',
 'carrot',
 'hot dog',
 'pizza',
 'donut',
 'cake',
 'chair',
 'couch',
 'potted plant',
 'bed',
 'dining table',
 'toilet',
 'tv',
 'laptop',
 'mouse',
 'remote',
 'keyboard',
 'cell phone',
 'microwave',
 'oven',
 'toaster',
 'sink',
 'refrigerator',
 'book',
 'clock',
 'vase',
 'scissors',
 'teddy bear',
 'hair drier',
 'toothbrush']