In [1]:
import json
import random
from collections import Counter, defaultdict
import numpy as np
import os

In [4]:
# Location of the COCO annotation file. The default is a machine-specific
# path; set the COCO_FPATH environment variable to point elsewhere without
# editing this cell.
COCO_FPATH = os.environ.get("COCO_FPATH", "C:/Users/hp/Downloads/mock_coco.json")

# Decode explicitly as UTF-8 (the JSON interchange encoding, RFC 8259) instead
# of relying on the platform locale codec that open() falls back to.
with open(COCO_FPATH, "r", encoding="utf-8") as f:
    coco = json.load(f)

# Missing top-level sections are treated as empty lists.
images = coco.get("images", [])
annotations = coco.get("annotations", [])
categories = coco.get("categories", [])

print(f"Loaded COCO JSON: {COCO_FPATH}")
print("Images:", len(images))
print("Annotations:", len(annotations))
print("Categories:", len(categories))

# convenience maps
# category id -> display name (the id as a string when "name" is absent)
cat_id2name = {c["id"]: c.get("name", str(c["id"])) for c in categories}
# image id -> full image record
img_id2meta = {img["id"]: img for img in images}

Loaded COCO JSON: C:/Users/hp/Downloads/mock_coco.json
Images: 100
Annotations: 295
Categories: 10


In [5]:
# Dataset-wide totals.
total_images = len(images)
total_annotations = len(annotations)

# Tally annotations per category id and keep the three most frequent.
cat_counts = Counter(ann["category_id"] for ann in annotations)
top3 = cat_counts.most_common(3)

print("SUMMARY REPORT")
print("Total images:", total_images)
print("Total annotations:", total_annotations)
print("Top 3 categories:")
for cat_id, n_anns in top3:
    label = cat_id2name.get(cat_id, cat_id)
    print(f" - {label} (id={cat_id}): {n_anns} annotations")

# Complete ranking, with each category's share of all annotations.
print("\nFull category counts:")
for cat_id, n_anns in cat_counts.most_common():
    label = cat_id2name.get(cat_id, cat_id)
    share_pct = n_anns/total_annotations*100
    print(f"{label} (id={cat_id}): {n_anns}  -> {share_pct:.2f}%")


SUMMARY REPORT
Total images: 100
Total annotations: 295
Top 3 categories:
 - fire hydrant (id=10): 33 annotations
 - bicycle (id=3): 33 annotations
 - car (id=1): 31 annotations

Full category counts:
fire hydrant (id=10): 33  -> 11.19%
bicycle (id=3): 33  -> 11.19%
car (id=1): 31  -> 10.51%
person (id=2): 31  -> 10.51%
traffic light (id=8): 30  -> 10.17%
dog (id=4): 29  -> 9.83%
truck (id=7): 29  -> 9.83%
bench (id=9): 28  -> 9.49%
cat (id=5): 27  -> 9.15%
bus (id=6): 24  -> 8.14%


In [7]:
# Map every image id to the set of distinct category ids annotated on it.
img2cats = defaultdict(set)
for ann in annotations:
    img2cats[ann["image_id"]].add(ann["category_id"])

# One record per image; images without annotations get an empty category list.
image_rows = [
    {
        "image_id": img["id"],
        "file_name": img.get("file_name",""),
        "categories": sorted(img2cats.get(img["id"], [])),
    }
    for img in images
]

def multilabel_greedy_split(image_rows, ratios=(0.7,0.15,0.15), seed=0):
    """Greedily assign images to train/val/test splits using their category sets.

    Args:
        image_rows: sequence of dicts, each with a "categories" entry (an
            iterable of category ids). Rows are placed in the result as-is
            and the input sequence itself is not reordered.
        ratios: (train, val, test) target fractions.
        seed: seed for the shuffle that fixes the processing order.

    Returns:
        dict mapping "train", "val" and "test" to lists of rows. val and test
        never hold more than int(ratio * len(image_rows)) rows; train has no
        cap and receives every row the other two do not take.
    """
    splits = ["train","val","test"]
    # Private generator: yields the same shuffle order as seeding the
    # module-level RNG with `seed`, without resetting global random state.
    rng = random.Random(seed)
    imgs = list(image_rows)
    rng.shuffle(imgs)
    total = len(imgs)

    # Target fraction per split, looked up once instead of per category.
    desired = {s: ratios[i] for i, s in enumerate(splits)}
    # Integer capacities for val/test only; train takes the remainder.
    capacity = {s: int(desired[s]*total) for s in ("val", "test")}

    assigned = {s:[] for s in splits}
    # Number of images carrying each category across the whole dataset.
    cat_totals = Counter(c for r in imgs for c in r["categories"])
    # Per split: number of already-assigned images carrying each category.
    cur_counts = {s:Counter() for s in splits}

    for r in imgs:
        best = None
        best_score = None
        for s in splits:
            # respect val/test capacities first
            if s in capacity and len(assigned[s]) >= capacity[s]:
                continue
            # NOTE(review): the score is the split's distance from its target
            # ratio measured BEFORE adding this image, so an empty train split
            # scores 0.7 per category versus 0.15 for empty val/test. val and
            # test therefore tend to fill to capacity before train receives
            # images -- confirm this is the intended balancing behaviour.
            score = 0.0
            for c in r["categories"]:
                cur_frac = cur_counts[s][c] / (cat_totals[c] + 1e-9)
                score += abs(cur_frac - desired[s])
            # Strict '<' keeps the earliest split on ties (train, val, test).
            if best_score is None or score < best_score:
                best_score = score
                best = s
        assigned[best].append(r)
        for c in r["categories"]:
            cur_counts[best][c] += 1
    return assigned

# Run the split with a fixed seed so the assignment is repeatable.
assigned = multilabel_greedy_split(image_rows, ratios=(0.7,0.15,0.15), seed=123)

# Tally the annotations (overall and per category) that land in each split.
split_ann_counts = {}
split_cat_counts = {}
for split_name, split_rows in assigned.items():
    split_img_ids = {row["image_id"] for row in split_rows}
    split_anns = [ann for ann in annotations if ann["image_id"] in split_img_ids]
    split_ann_counts[split_name] = len(split_anns)
    split_cat_counts[split_name] = Counter(ann["category_id"] for ann in split_anns)

print("Image counts per split:", {name: len(rows) for name, rows in assigned.items()})
print("Annotation counts per split:", split_ann_counts)

# Persist the split -> image-id lists so the assignment can be reused later.
split_assign_path = "coco_split_assignment.json"
with open(split_assign_path, "w") as f:
    split_image_ids = {
        name: [row["image_id"] for row in rows]
        for name, rows in assigned.items()
    }
    json.dump(split_image_ids, f, indent=2)
print("Saved split assignment to:", split_assign_path)


Image counts per split: {'train': 70, 'val': 15, 'test': 15}
Annotation counts per split: {'train': 197, 'val': 52, 'test': 46}
Saved split assignment to: coco_split_assignment.json


In [8]:
def export_coco_split(coco, image_ids, out_path):
    """Write a COCO JSON file restricted to the given images.

    Args:
        coco: COCO-style dict. Missing "images", "annotations" or
            "categories" sections are treated as empty lists.
        image_ids: iterable of image ids to keep.
        out_path: destination file path (overwritten if it exists).

    Returns:
        out_path, unchanged.
    """
    image_set = set(image_ids)
    new_images = [img for img in coco.get("images", []) if img["id"] in image_set]
    new_annotations = [ann for ann in coco.get("annotations", []) if ann["image_id"] in image_set]

    # Carry over any other top-level sections (e.g. "info", "licenses") so the
    # exported file keeps the dataset metadata instead of silently dropping it.
    filtered_keys = ("images", "annotations", "categories")
    new_coco = {key: value for key, value in coco.items() if key not in filtered_keys}
    new_coco.update({
        "images": new_images,
        "annotations": new_annotations,
        "categories": coco.get("categories", []),
    })

    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(new_coco, f)
    return out_path

# Emit one COCO file per split into ./splits (created if it does not exist).
os.makedirs("splits", exist_ok=True)
for split_name in ("train", "val", "test"):
    split_ids = [row["image_id"] for row in assigned[split_name]]
    split_path = f"splits/{split_name}.json"
    export_coco_split(coco, split_ids, split_path)
    print("Wrote", split_path, "images:", len(split_ids))


Wrote splits/train.json images: 70
Wrote splits/val.json images: 15
Wrote splits/test.json images: 15


In [9]:
from itertools import combinations

def bbox_iou(boxA, boxB):
    """Intersection-over-union of two boxes given as [x, y, w, h].

    Returns 0.0 when the union area is not positive.
    """
    ax, ay, aw, ah = boxA[0], boxA[1], boxA[2], boxA[3]
    bx, by, bw, bh = boxB[0], boxB[1], boxB[2], boxB[3]

    # Edges of the overlap rectangle (may be inverted when boxes are disjoint).
    left = max(ax, bx)
    top = max(ay, by)
    right = min(ax + aw, bx + bw)
    bottom = min(ay + ah, by + bh)

    # Clamp each extent at zero so disjoint boxes contribute no overlap.
    inter = max(0, right - left) * max(0, bottom - top)
    union = aw * ah + bw * bh - inter

    if union > 0:
        return inter / union
    return 0.0

# Minimum IoU for two same-category boxes on one image to be reported as
# near-duplicates. Lower it (e.g. to 0.9) for a looser detection; the report
# line below picks up the value automatically.
NEAR_DUP_IOU_THRESHOLD = 0.99

# Group annotations by image so only boxes on the same image are compared.
img2anns = defaultdict(list)
for a in annotations:
    img2anns[a["image_id"]].append(a)

exact_duplicates = []   # (ann id, ann id, image id, category id)
near_duplicates = []    # (ann id, ann id, image id, category id, iou)
for img_id, anns_list in img2anns.items():
    for a,b in combinations(anns_list, 2):
        # Only annotations of the same category can duplicate each other.
        if a["category_id"] != b["category_id"]:
            continue
        if a["bbox"]==b["bbox"] and a.get("iscrowd",0)==b.get("iscrowd",0):
            # exact: identical box and crowd flag
            exact_duplicates.append((a["id"], b["id"], img_id, a["category_id"]))
        else:
            iou = bbox_iou(a["bbox"], b["bbox"])
            if iou >= NEAR_DUP_IOU_THRESHOLD:
                near_duplicates.append((a["id"], b["id"], img_id, a["category_id"], iou))

print("Exact duplicate pairs found:", len(exact_duplicates))
print(f"Near-duplicate pairs (IoU >= {NEAR_DUP_IOU_THRESHOLD}) found:", len(near_duplicates))


Exact duplicate pairs found: 0
Near-duplicate pairs (IoU >= 0.99) found: 0


In [10]:
# Categories that are declared in the file but never annotated do not appear
# in cat_counts; count them as 0 so they show up in the imbalance metrics
# (otherwise the minimum can never be 0 and the worst case is hidden).
unannotated_ids = [cid for cid in cat_id2name if cid not in cat_counts]
per_category = cat_counts.most_common() + [(cid, 0) for cid in unannotated_ids]

counts = np.array(list(cat_counts.values()) + [0] * len(unannotated_ids))
if counts.size:
    mean = counts.mean()
    std = counts.std()
    # A category with zero annotations makes the ratio unbounded.
    imbalance_ratio = counts.max() / counts.min() if counts.min()>0 else float("inf")
else:
    # No categories and no annotations: the metrics are undefined.
    mean = std = imbalance_ratio = float("nan")

print("Per-category counts:")
for cid, cnt in per_category:
    # Guard the share computation for a dataset without annotations.
    pct = cnt/total_annotations*100 if total_annotations else 0.0
    print(f"{cat_id2name.get(cid,cid)} (id={cid}): {cnt} ({pct:.2f}%)")

print("\nClass imbalance metrics:")
print(f" mean={mean:.2f}, std={std:.2f}, max/min ratio={imbalance_ratio:.2f}")

# simple rule-of-thumb:
if not counts.size:
    print("Conclusion: no categories or annotations to assess.")
elif imbalance_ratio < 2.0:
    print("Conclusion: no severe class imbalance (ratio < 2.0).")
else:
    print("Conclusion: noticeable imbalance (consider augmentation / re-weighting).")


Per-category counts:
fire hydrant (id=10): 33 (11.19%)
bicycle (id=3): 33 (11.19%)
car (id=1): 31 (10.51%)
person (id=2): 31 (10.51%)
traffic light (id=8): 30 (10.17%)
dog (id=4): 29 (9.83%)
truck (id=7): 29 (9.83%)
bench (id=9): 28 (9.49%)
cat (id=5): 27 (9.15%)
bus (id=6): 24 (8.14%)

Class imbalance metrics:
 mean=29.50, std=2.62, max/min ratio=1.38
Conclusion: no severe class imbalance (ratio < 2.0).
