# Objects365 Dataset Download and Processing
This notebook downloads and processes the Objects365 dataset, generating annotations in YOLO format.

In [None]:
import os
import sys
import tarfile
import numpy as np
from pathlib import Path
from tqdm import tqdm
import requests
from typing import List, Union, Tuple, Optional
from concurrent.futures import ThreadPoolExecutor
from zipfile import ZipFile, is_zipfile
from itertools import repeat
from multiprocessing.pool import ThreadPool

# pycocotools provides the COCO annotation reader used by the conversion cell;
# stop early with an actionable message when it is missing.
try:
    from pycocotools.coco import COCO
except ImportError:
    # The requirement is quoted so a shell does not interpret ">=2.0" as an
    # output redirection when the command is copy-pasted.
    print('Installation of pycocotools required. Run: pip install "pycocotools>=2.0"')
    sys.exit(1)

## Support Functions
These functions handle resumable file downloads and the conversion of bounding boxes to normalized YOLO coordinates.

In [None]:
def download_with_resume(url: str, dest: Union[str, Path], retry: int = 3) -> None:
    """
    Downloads a file from a given URL with support for resuming partial downloads.

    If ``dest`` already holds data, an HTTP ``Range`` request is sent so that only
    the missing tail is fetched and appended. When the server ignores the range and
    answers ``200`` with the full body, the partial file is overwritten instead of
    appended to, so the result is never a concatenation of two copies.

    Args:
        url: Address of the file to download.
        dest: Local path the file is written to.
        retry: Number of additional attempts made after a network-level failure
            (connection error, timeout, interrupted stream). Each new attempt
            resumes from the bytes already on disk.

    Raises:
        Exception: If the server answers with a status other than 200, 206 or 416.
        requests.exceptions.RequestException: If every attempt fails at the
            network level.
    """
    dest_path = Path(dest)
    attempts = max(retry, 0) + 1

    for attempt in range(1, attempts + 1):
        # Measured on every attempt so a retry continues where the last one stopped.
        # A missing file simply means "start from byte 0" rather than an error.
        existing = dest_path.stat().st_size if dest_path.exists() else 0
        headers = {"Range": f"bytes={existing}-"} if existing else {}

        try:
            with requests.get(url, headers=headers, stream=True, timeout=60) as r:
                if r.status_code == 416:
                    # Requested range starts at or past the end of the remote file.
                    print(f"{dest} is already fully downloaded.")
                    return
                if r.status_code not in (200, 206):
                    raise Exception(f"Failed to download {url}: {r.status_code}")

                # Only a 206 carries the requested tail; a 200 is the whole file.
                resumed = r.status_code == 206
                if not resumed:
                    existing = 0
                total_size = int(r.headers.get("content-length", 0)) + existing

                with open(dest_path, "ab" if resumed else "wb") as f, tqdm(
                    desc=f"Downloading {dest_path.name}",
                    total=total_size or None,  # unknown length -> open-ended bar
                    unit="B",
                    unit_scale=True,
                    unit_divisor=1024,
                    initial=existing,
                ) as bar:
                    for chunk in r.iter_content(chunk_size=65536):
                        if chunk:
                            f.write(chunk)
                            bar.update(len(chunk))
            return
        except requests.exceptions.RequestException as err:
            if attempt == attempts:
                raise
            print(
                f"Download of {dest_path.name} interrupted ({err}); "
                f"retrying ({attempt}/{attempts - 1})..."
            )

In [None]:
def xyxy2xywhn(xyxy: np.ndarray, w: Union[int, float] = 640, h: Union[int, float] = 640, clip: bool = False, eps: float = 0.0) -> np.ndarray:
    """
    Converts bounding box coordinates from (x_min, y_min, x_max, y_max) format
    to normalized (x_center, y_center, width, height) format.

    The input array is never modified; clipping and conversion are performed on
    a floating-point copy, so integer pixel coordinates are handled correctly.

    Args:
        xyxy: 2-D array whose first four columns are x_min, y_min, x_max, y_max.
            Any further columns are carried over unchanged.
        w: Image width used to normalize x values.
        h: Image height used to normalize y values.
        clip: When True, coordinates are limited to [0, w - eps] and [0, h - eps]
            before conversion.
        eps: Margin subtracted from the upper clipping bound.

    Returns:
        A new array of the same shape holding x_center, y_center, width and
        height divided by the image size. Floating inputs keep their dtype;
        other dtypes are promoted to float64.
    """
    src = np.asarray(xyxy)
    # Integer arrays would truncate the normalized values on assignment.
    dtype = src.dtype if np.issubdtype(src.dtype, np.floating) else np.float64
    boxes = src.astype(dtype, copy=True)

    if clip:
        boxes[:, 0] = np.maximum(0, np.minimum(boxes[:, 0], w - eps))
        boxes[:, 1] = np.maximum(0, np.minimum(boxes[:, 1], h - eps))
        boxes[:, 2] = np.maximum(0, np.minimum(boxes[:, 2], w - eps))
        boxes[:, 3] = np.maximum(0, np.minimum(boxes[:, 3], h - eps))

    y = boxes.copy()
    y[:, 0] = ((boxes[:, 0] + boxes[:, 2]) / 2) / w  # x center
    y[:, 1] = ((boxes[:, 1] + boxes[:, 3]) / 2) / h  # y center
    y[:, 2] = (boxes[:, 2] - boxes[:, 0]) / w        # width
    y[:, 3] = (boxes[:, 3] - boxes[:, 1]) / h        # height
    return y

## Directory Setup
Set the base directories for the dataset.

In [None]:
threads = 16  # Worker count reserved for downloading
base_dir = Path("/mnt/e/object365")
base_dir.mkdir(parents=True, exist_ok=True)

# Lay out images/{train,val} and labels/{train,val} beneath the dataset root
for target in (
    base_dir / kind / subset
    for kind in ("images", "labels")
    for subset in ("train", "val")
):
    target.mkdir(parents=True, exist_ok=True)

## Download Annotations and Images
Download the necessary files for the Objects365 dataset.

In [None]:
# Each entry pairs a split with its patch count (highest patch index + 1).
for split, patches in (("train", 50 + 1), ("val", 43 + 1)):
    print(f"Processing {split} in {patches} patches ...")
    images = base_dir / "images" / split
    labels = base_dir / "labels" / split

    # Remote folder that holds everything belonging to this split
    url = f"https://dorc.ks3-cn-beijing.ksyun.com/data-set/2020Objects365%E6%95%B0%E6%8D%AE%E9%9B%86/{split}/"

    if split == "train":
        # Annotations arrive as a tarball.
        # NOTE(review): the archive is only downloaded here, not unpacked.
        download_with_resume(f"{url}zhiyuan_objv2_{split}.tar.gz", base_dir / f"zhiyuan_objv2_{split}.tar.gz")

        print(f"Downloading training images ({patches} patches)...")
        for idx in range(patches):
            download_with_resume(f"{url}patch{idx}.tar.gz", images / f"patch{idx}.tar.gz")
    elif split == "val":
        # Annotations are a plain JSON file for this split.
        download_with_resume(f"{url}zhiyuan_objv2_{split}.json", base_dir / f"zhiyuan_objv2_{split}.json")

        # Patches 0-15 are served from images/v1, the remaining ones from images/v2.
        for version, indices in (("v1", range(15 + 1)), ("v2", range(16, patches))):
            print(f"Downloading validation images {version}...")
            for idx in indices:
                download_with_resume(
                    f"{url}images/{version}/patch{idx}.tar.gz",
                    images / f"{version}_patch{idx}.tar.gz",
                )

## Annotation Processing
Convert annotations to YOLO format.

In [None]:
# Convert the COCO-style annotation file of each split into one YOLO label
# file per image (one "class x_center y_center width height" line per box).
for split in ["train", "val"]:
    annotations_path = base_dir / f"zhiyuan_objv2_{split}.json"
    if not annotations_path.exists():
        # Make the skip visible instead of silently producing no labels.
        print(f"Annotations for {split} not found at {annotations_path}, skipping.")
        continue

    # Derive the output directory from the split being processed; reusing a
    # variable left over from an earlier cell would send every split's labels
    # to whichever directory that cell handled last.
    labels = base_dir / "labels" / split
    labels.mkdir(parents=True, exist_ok=True)

    coco = COCO(str(annotations_path))
    names = [x["name"] for x in coco.loadCats(coco.getCatIds())]

    # The YOLO class index is the position of the category in `names`.
    for cid, cat in enumerate(names):
        catIds = coco.getCatIds(catNms=[cat])
        imgIds = coco.getImgIds(catIds=catIds)

        for im in tqdm(coco.loadImgs(imgIds), desc=f"Class {cid + 1}/{len(names)} {cat}"):
            width, height = im["width"], im["height"]
            path = Path(im["file_name"])

            try:
                label_file = labels / path.with_suffix(".txt").name
                # Append mode: an image is visited once per category it contains.
                with open(label_file, "a", encoding="utf-8") as file:
                    annIds = coco.getAnnIds(imgIds=im["id"], catIds=catIds, iscrowd=None)
                    for a in coco.loadAnns(annIds):
                        x, y, w, h = a["bbox"]  # read as top-left x, y, width, height
                        # Explicit float dtype so integer pixel boxes are not truncated.
                        xyxy = np.array([[x, y, x + w, y + h]], dtype=np.float64)
                        cx, cy, bw, bh = xyxy2xywhn(xyxy, w=width, h=height, clip=True)[0]
                        file.write(f"{cid} {cx:.5f} {cy:.5f} {bw:.5f} {bh:.5f}\n")
            except Exception as e:
                # Best effort: report the failing image and continue with the rest.
                print(f"Error processing image {path}: {e}")