- Make sure the current working directory is the 'demo/' folder:

In [None]:
import os
# Show the current working directory so it can be checked before running the rest of the notebook.
os.getcwd()

- Import required functions:

In [6]:
from sahi.slicing import slice_coco
from sahi.utils.file import load_json

from PIL import Image, ImageDraw
import matplotlib.pyplot as plt

import os
import glob
from PIL import Image
from concurrent.futures import ThreadPoolExecutor
import shutil
import multiprocessing
from sahi.utils.coco import Coco

import tempfile
from tqdm import tqdm
from dask import bag as db

import json
import cv2
import uuid

from pathlib import Path

from multiprocessing import Pool

### Optimized version

In [7]:
# Root of the YOLO-layout source dataset; the four split folders below hang off it.
dataset_root = '/home/bohbot/ultralytics/datasets/mos/all_mos_new'

images_train_source = f"{dataset_root}/images/train"
images_val_source = f"{dataset_root}/images/val"
labels_train_source = f"{dataset_root}/labels/train"
labels_val_source = f"{dataset_root}/labels/val"

In [9]:
# File-name endings (case-sensitive) that mark a path as an image.
image_suffixes = ('.png', '.jpg', '.jpeg', '.bmp', '.gif', '.tiff')

# Every image of the train split followed by every image of the val split.
all_image_paths = [
    image_path
    for source_dir in (images_train_source, images_val_source)
    for image_path in glob.iglob(f"{source_dir}/*")
    if image_path.endswith(image_suffixes)
]

# Every label file of the train split followed by every label file of the val split.
all_annots_paths = [
    annot_path
    for source_dir in (labels_train_source, labels_val_source)
    for annot_path in glob.iglob(f"{source_dir}/*")
    if annot_path.endswith('txt')
]

In [None]:
def _file_stem(path):
    """Return the file name of ``path`` without its directory and extension."""
    return os.path.splitext(os.path.basename(path))[0]


# Label path keyed by file stem; any label whose path contains 'test' is left out.
annot_dict = dict(
    (_file_stem(annot_path), annot_path)
    for annot_path in all_annots_paths
    if 'test' not in annot_path
)

# (image_path, label_path) for every image that has a label with the same stem.
image_annot_paths = [
    (image_path, annot_dict[_file_stem(image_path)])
    for image_path in all_image_paths
    if _file_stem(image_path) in annot_dict
]

len(image_annot_paths), image_annot_paths[:2]

In [6]:
def _copy_pair_if_large(file_paths, large_files_dir):
    """Copy one (image, label) pair into ``large_files_dir`` if the image is large.

    An image counts as large when its width or its height exceeds 1000 pixels.

    Args:
        file_paths: ``(image_path, annot_path)`` tuple.
        large_files_dir: Existing directory that receives both files.

    Returns:
        True when both files were copied, False when the pair was skipped
        (missing file, small image) or an error occurred. Errors are printed,
        never raised, so one bad file does not stop a batch.
    """
    try:
        image_path, annot_path = file_paths

        if not os.path.exists(image_path) or not os.path.exists(annot_path):
            print(f"Missing file: {image_path} or {annot_path}")
            return False

        # Only the size is needed; close the image before copying.
        with Image.open(image_path) as img:
            width, height = img.size

        if width <= 1000 and height <= 1000:
            return False

        shutil.copy(image_path, large_files_dir)
        shutil.copy(annot_path, large_files_dir)
        return True
    except Exception as e:
        print(f"Error processing {file_paths}: {e}")
        return False


def process_file(file_paths, large_files_dir):
    """Copy one (image, label) pair if the image is large; return True if copied.

    A later cell of this notebook defines another ``process_file`` with a
    different signature. The batch helpers below therefore call the private
    ``_copy_pair_if_large`` so they keep working after that cell has run.
    """
    return _copy_pair_if_large(file_paths, large_files_dir)


def copy_list(image_annot_paths, large_files_dir):
    """Sequentially copy all large images and their labels, with a progress bar.

    Args:
        image_annot_paths: Iterable of ``(image_path, annot_path)`` tuples.
        large_files_dir: Destination directory (created if missing).

    Returns:
        Number of pairs that were actually copied.
    """
    os.makedirs(large_files_dir, exist_ok=True)
    # Count only the pairs that were really copied, not every pair visited.
    copied_count = sum(
        _copy_pair_if_large(file_paths, large_files_dir)
        for file_paths in tqdm(image_annot_paths)
    )
    print(f"Total files copied: {copied_count}")
    return copied_count


def copy_large_images(image_annot_paths, large_files_dir):
    """Copy all large images and their labels using a thread pool.

    Args:
        image_annot_paths: Iterable of ``(image_path, annot_path)`` tuples.
        large_files_dir: Destination directory (created if missing).

    Returns:
        Number of pairs that were actually copied.
    """
    # Without the directory, shutil.copy would create a *file* with this name.
    os.makedirs(large_files_dir, exist_ok=True)
    # cpu_count() - 2 is not positive on small machines; the executor needs >= 1.
    max_workers = max(1, multiprocessing.cpu_count() - 2)
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        results = list(executor.map(
            lambda pair: _copy_pair_if_large(pair, large_files_dir),
            image_annot_paths
        ))
    copied_count = sum(results)
    print(f"Total files copied: {copied_count}")
    return copied_count


In [11]:
source_directory_or_paths = image_annot_paths # list of (image_path, label_path) pairs, not a folder path
# large_files_directory = '/home/bohbot/Evyatar/git/crop_sahi/large_files' # (disabled) alternative folder for the large images
batch_output_directory = '/home/bohbot/Evyatar/git/crop_sahi/large_images/images' # receives the large images and their label files; later read as both the image and the label folder

In [None]:
# Copy every image wider or taller than 1000 px, together with its label file, into batch_output_directory.
copy_list(image_annot_paths, batch_output_directory)

## 1. convert yolo annotations to coco

In [9]:
def process_image(args):
    """Convert the YOLO label file of one image into COCO-style records.

    Args:
        args: Sequence ``(image_filename, images_dir, yolo_annotations_dir,
            category_mapping)``. ``category_mapping`` maps category id to name
            and is updated in place with every new id found in the label file.

    Returns:
        ``(annotations, image_data, categories)``, or ``(None, None, None)``
        when the label file is missing or the image cannot be read.
        ``annotations`` is a list of COCO annotation dicts whose ``id`` is
        unique only within this image (1, 2, ...). ``categories`` holds the
        category dicts that were new to ``category_mapping``.
    """
    import hashlib

    image_filename, images_dir, yolo_annotations_dir, category_mapping = args[:4]

    image_path = os.path.join(images_dir, image_filename)
    annotation_filename = os.path.splitext(image_filename)[0] + ".txt"
    annotation_path = os.path.join(yolo_annotations_dir, annotation_filename)

    if not os.path.exists(annotation_path):
        return None, None, None

    image = cv2.imread(image_path)
    if image is None:
        return None, None, None

    # Only the first two axes are needed; this also works for 2-D arrays.
    height, width = image.shape[:2]

    # The built-in hash() of a str is salted per interpreter process, so it is
    # neither reproducible between runs nor guaranteed non-negative. Derive a
    # stable 48-bit id from the file name instead.
    name_digest = hashlib.sha1(image_filename.encode("utf-8")).digest()
    image_id = int.from_bytes(name_digest[:6], "big")

    image_data = {
        "id": image_id,
        "file_name": image_filename,
        "height": height,
        "width": width
    }

    annotations = []
    categories = []

    with open(annotation_path, "r") as f:
        for line in f:
            line_data = line.split()
            # A bounding-box line is exactly "class x_center y_center width height".
            if len(line_data) != 5:
                if len(line_data) > 5:
                    # Extra fields used to crash the unpacking below; report and skip.
                    print(f"Skipping non-bbox label line in {annotation_path}: {line.strip()}")
                continue

            category_id = int(line_data[0])
            x_center, y_center, bbox_width, bbox_height = map(float, line_data[1:])

            if category_id not in category_mapping:
                category_mapping[category_id] = f"category_{category_id}"
                categories.append({
                    "id": category_id,
                    "name": category_mapping[category_id],
                    "supercategory": "none"
                })

            # YOLO stores a normalised centre/size box; COCO wants absolute
            # [x_min, y_min, width, height] in pixels.
            x_min = (x_center - bbox_width / 2) * width
            y_min = (y_center - bbox_height / 2) * height
            bbox_width *= width
            bbox_height *= height

            annotations.append({
                "id": len(annotations) + 1,
                "image_id": image_id,
                "category_id": category_id,
                "bbox": [x_min, y_min, bbox_width, bbox_height],
                "area": bbox_width * bbox_height,
                "iscrowd": 0
            })

    return annotations, image_data, categories


def convert_yolo_to_coco(yolo_annotations_dir, images_dir, output_json_path):
    """Convert a folder of YOLO labels into a single COCO annotation file.

    Args:
        yolo_annotations_dir: Directory with one ``<image stem>.txt`` per image.
        images_dir: Directory with the ``.jpg`` / ``.png`` / ``.jpeg`` images.
        output_json_path: Output *directory* (created if missing); the result
            is written to ``coco_annotations.json`` inside it.

    Images for which ``process_image`` returns no annotations are left out of
    the output, as before.
    """
    os.makedirs(output_json_path, exist_ok=True)
    coco_format = {"images": [], "categories": [], "annotations": []}
    category_mapping = {}

    # Sorted so the output order does not depend on os.listdir() ordering.
    image_filenames = sorted(
        f for f in os.listdir(images_dir)
        if f.lower().endswith((".jpg", ".png", ".jpeg"))
    )

    args = [
        (image_filename, images_dir, yolo_annotations_dir, category_mapping)
        for image_filename in image_filenames
    ]

    # cpu_count() - 2 is not positive on small machines and cpu_count() may be
    # None; Pool needs at least one worker.
    worker_count = max(1, (os.cpu_count() or 1) - 2)
    with Pool(processes=worker_count) as pool:
        results = list(
            tqdm(pool.imap(process_image, args), total=len(args), desc="Processing images")
        )

    seen_category_ids = set()
    next_annotation_id = 1
    for annotations, image_data, categories in results:
        if annotations and image_data:
            # Per-image results number their annotations from 1, so the ids
            # collide once merged. Renumber them to be unique in the file.
            for annotation in annotations:
                annotation["id"] = next_annotation_id
                next_annotation_id += 1
            coco_format["annotations"].extend(annotations)
            coco_format["images"].append(image_data)
        # The same category can be reported by several images; keep the first.
        for category in categories or ():
            if category["id"] not in seen_category_ids:
                seen_category_ids.add(category["id"])
                coco_format["categories"].append(category)

    with open(os.path.join(output_json_path, "coco_annotations.json"), "w") as json_file:
        json.dump(coco_format, json_file, indent=4)

In [53]:
# Sibling output folders, derived by swapping every '/images' in batch_output_directory.
coco_annotations_path = batch_output_directory.replace('/images', '/coco')
sliced_images_path = batch_output_directory.replace('/images', '/sliced')
temp_image_dir = batch_output_directory.replace('/images', '/temp_processing')
yolo_format_dir = batch_output_directory.replace('/images', '/yolo_format')

In [None]:
# The large images and their YOLO labels share one folder, so it is passed as both directories;
# the result is written to <coco_annotations_path>/coco_annotations.json.
convert_yolo_to_coco(batch_output_directory, batch_output_directory, coco_annotations_path)

## 2. Slicing COCO Dataset into Grids

- To slice a COCO dataset's annotations and images, we need to specify slice parameters. In this example we will slice images into 640x640 grids with an overlap ratio of 0.2:

In [None]:
# Number of entries in the large-images folder whose name ends in "txt" (the YOLO label files).
n_files = len(glob.glob(f"{batch_output_directory}/*txt"))
n_files

In [13]:
# NOTE(review): `batch` is not referenced by any cell visible here — confirm it is still needed.
batch = n_files
# COCO annotation file written by convert_yolo_to_coco.
json_out =f"{coco_annotations_path}/coco_annotations.json"
# Root folder for the slicing output.
sliced_dir = sliced_images_path

In [14]:
def split_coco(coco_path):
    """Split one COCO annotation file into a standalone COCO dict per image.

    Args:
        coco_path: Path of a COCO JSON file with "images" and "annotations"
            keys ("categories" is optional).

    Returns:
        Dict keyed by image ``file_name``. Each value is a COCO dict holding
        that single image, its annotations and the file's categories.
    """
    with open(coco_path, "r") as handle:
        coco = json.load(handle)

    images = coco["images"]

    # One mini COCO document per image, annotations filled in below.
    per_image = {
        image["file_name"]: {
            "images": [image],
            "annotations": [],
            "categories": coco.get("categories", []),
        }
        for image in images
    }

    # Annotations reference images by id; route each one to its image's entry.
    file_name_by_id = {image["id"]: image["file_name"] for image in images}
    for annotation in coco["annotations"]:
        owner = file_name_by_id[annotation["image_id"]]
        per_image[owner]["annotations"].append(annotation)

    return per_image

def process_file(args):
    """Slice one image and its annotations into 640x640 tiles with ``slice_coco``.

    This definition shares its name with the two-argument ``process_file``
    defined earlier in the notebook and shadows it once this cell has run.

    Args:
        args: ``(f, splitted, sliced_dir)`` where ``f`` is the image path,
            ``splitted`` maps image file names to their single-image COCO dict
            and ``sliced_dir`` is the root output folder. The slices of ``f``
            are written to ``<sliced_dir>/<stem of f>``.

    Returns:
        None when the image was sliced or had already been sliced. ``f`` when
        the image has no entry in ``splitted`` or slicing raised an exception.
    """
    f, splitted, sliced_dir = args
    image_path = Path(f)
    name = image_path.name
    if name not in splitted:
        return f

    o = os.path.join(sliced_dir, image_path.stem)
    annotation_name = f"{image_path.stem}_sliced_annotations"
    # slice_coco appends "_coco.json" to the annotation name it is given. The
    # previous check looked for "<name>.json", which is never written, so
    # finished images were sliced again on every run.
    a = os.path.join(o, f"{annotation_name}_coco.json")
    if os.path.exists(a):
        return None

    # Private scratch folder so slice_coco sees exactly one image and one
    # annotation file; the uuid keeps parallel workers apart.
    t = os.path.join(sliced_dir, f"temp_processing_{uuid.uuid4()}")
    try:
        os.makedirs(t, exist_ok=True)
        local_coco = os.path.join(t, "annotation.json")
        with open(local_coco, "w") as w:
            json.dump(splitted[name], w)
        shutil.copy(f, t)
        slice_coco(
            coco_annotation_file_path=local_coco,
            image_dir=t,
            output_coco_annotation_file_name=annotation_name,
            ignore_negative_samples=False,
            output_dir=o,
            slice_height=640,
            slice_width=640,
            overlap_height_ratio=0.2,
            overlap_width_ratio=0.2,
            min_area_ratio=0.1,
            verbose=False
        )
    except Exception as e:
        # Report the reason instead of failing silently; the caller still gets
        # the path back so it can be logged as not processed.
        print(f"Error slicing {f}: {e}")
        return f
    finally:
        if os.path.exists(t):
            shutil.rmtree(t)
    return None

def run_processing(batch_output_directory, sliced_dir, coco_file, processes=4):
    """Slice every non-.txt file of a folder in parallel worker processes.

    Args:
        batch_output_directory: Folder with the images (and their .txt labels).
        sliced_dir: Root folder that is passed to each worker for its output.
        coco_file: COCO JSON describing the images in the folder.
        processes: Number of worker processes. Values below 1 are raised to 1;
            None lets ``Pool`` choose.

    Returns:
        List of file paths the workers reported back as not processed. When it
        is non-empty the same list is written to ``missing_files.log`` in the
        current working directory.
    """
    splitted = split_coco(coco_file)
    image_files = [
        x for x in glob.iglob(os.path.join(batch_output_directory, "*"))
        if not x.lower().endswith(".txt")
    ]

    # Give each task only the COCO entry of its own image. Passing the whole
    # `splitted` dict would serialise every image's annotations for every task.
    tasks = []
    for f in image_files:
        name = Path(f).name
        entry = {name: splitted[name]} if name in splitted else {}
        tasks.append((f, entry, sliced_dir))

    # Callers pass values like os.cpu_count()-2, which is not positive on
    # small machines; Pool rejects anything below 1.
    worker_count = processes if processes is None else max(1, processes)
    with Pool(processes=worker_count) as p:
        r = p.map(process_file, tasks)

    m = [x for x in r if x]
    if m:
        with open("missing_files.log", "w") as w:
            w.write("\n".join(m))
    return m

In [None]:
# Slice all large images; the returned list holds the files that were not processed.
# NOTE(review): os.cpu_count()-2 is not positive on machines with two or fewer cores.
missing_files = run_processing(
    batch_output_directory,
    sliced_dir,
    json_out,
    processes=os.cpu_count()-2
)

In [None]:
# Display the root folder of the slicing output.
sliced_dir

- Convert the sliced COCO annotations back to YOLO format:

In [46]:
def _process_folder(args):
    """Export the sliced COCO annotations of one folder in YOLOv5 format.

    Args:
        args: ``(folder_name, batch_output_directory, source_directory)``.
            ``folder_name`` is looked up inside ``batch_output_directory`` and
            the export is written to ``source_directory``.

    Returns:
        None. Nothing is exported when the entry is not a directory or does
        not contain ``<folder_name>_sliced_annotations_coco.json``.
    """
    folder_name, batch_output_directory, source_directory = args

    slice_folder = os.path.join(batch_output_directory, folder_name)
    annotation_file = os.path.join(
        slice_folder,
        f"{os.path.basename(slice_folder)}_sliced_annotations_coco.json",
    )

    # Skip stray files and folders that have no slicing result.
    if not (os.path.isdir(slice_folder) and os.path.exists(annotation_file)):
        return

    sliced_dataset = Coco.from_coco_dict_or_path(annotation_file, image_dir=slice_folder)
    # train_split_rate=1 puts every slice into the train split.
    sliced_dataset.export_as_yolov5(
        output_dir=source_directory,
        train_split_rate=1,
        disable_symlink=True,
    )

def export_all_sliced_to_yolo(batch_output_directory, source_directory, processes=4):
    """Export every sliced folder to YOLOv5 format using worker processes.

    Args:
        batch_output_directory: Folder whose entries are handed, in sorted
            order, to ``_process_folder``.
        source_directory: Output directory passed on to each export.
        processes: Number of worker processes. Values below 1 are raised to 1;
            None lets ``Pool`` choose.
    """
    folders = sorted(os.listdir(batch_output_directory))
    # Callers pass values like os.cpu_count()-5, which is zero or negative on
    # small machines; Pool rejects anything below 1.
    worker_count = processes if processes is None else max(1, processes)
    # NOTE(review): all workers export into the same output directory at the
    # same time — confirm the export is safe to run concurrently.
    with Pool(processes=worker_count) as pool:
        pool.map(
            _process_folder,
            [(f, batch_output_directory, source_directory) for f in folders]
        )

In [None]:
# Convert every sliced folder back to YOLO format under yolo_format_dir.
# NOTE(review): os.cpu_count()-5 is not positive on machines with five or fewer cores.
export_all_sliced_to_yolo(sliced_dir, yolo_format_dir, processes=os.cpu_count()-5)

- Visualize sliced annotations on sliced images:

In [1]:
import os
import random
from pathlib import Path
from PIL import Image, ImageDraw
import matplotlib.pyplot as plt

def plot_random_images_with_yolo_annotations(source_directory, num_images=5):
    """Show random images of a folder with their YOLO boxes drawn in red.

    Each image's labels are read from the ``.txt`` file with the same stem in
    the same folder; images without a label file are shown without boxes.

    Args:
        source_directory: Folder with ``.png`` / ``.jpg`` / ``.jpeg`` images.
        num_images: Maximum number of images to show (at least 1).

    Returns:
        None. Prints a message and returns early when the folder has no
        images or ``num_images`` is below 1.
    """
    image_files = [
        os.path.join(source_directory, f)
        for f in os.listdir(source_directory)
        if f.lower().endswith((".png", ".jpg", ".jpeg"))
    ]

    if not image_files:
        print("No images found in:", source_directory)
        return

    # A figure with zero columns cannot be created; bail out with a message.
    if num_images < 1:
        print("num_images must be at least 1, got:", num_images)
        return

    selected_images = random.sample(image_files, min(num_images, len(image_files)))

    fig, axes = plt.subplots(1, len(selected_images), figsize=(16, 5))
    # With a single column plt.subplots returns one Axes, not a sequence.
    if len(selected_images) == 1:
        axes = [axes]

    for ax, img_path in zip(axes, selected_images):
        # Load into an RGB copy so the file handle is closed right away and
        # the red outline is a real colour even for grayscale/palette images.
        with Image.open(img_path) as source:
            img = source.convert("RGB")
        draw = ImageDraw.Draw(img)
        width, height = img.size

        label_path = os.path.splitext(img_path)[0] + ".txt"
        if os.path.exists(label_path):
            with open(label_path, "r") as f:
                for line in f:
                    parts = line.strip().split()
                    # Expect "class x_center y_center width height".
                    if len(parts) != 5:
                        continue
                    cat_id = parts[0]
                    x_c, y_c, w, h = map(float, parts[1:])
                    # Normalised centre/size -> absolute pixel corners.
                    x1 = int((x_c - w / 2) * width)
                    y1 = int((y_c - h / 2) * height)
                    x2 = int((x_c + w / 2) * width)
                    y2 = int((y_c + h / 2) * height)
                    draw.rectangle([x1, y1, x2, y2], outline="red", width=2)
                    draw.text((x1, y1), f"Class {cat_id}", fill="red")

        ax.imshow(img)
        ax.axis("off")

    plt.tight_layout()
    plt.show()


In [None]:
# Preview up to five random images from the train/ folder of the YOLO export.
plot_random_images_with_yolo_annotations(f"{yolo_format_dir}/train", num_images=5)

In [54]:
# Folder that will hold the merged dataset; derived like the other output folders.
full_dataset_dir = batch_output_directory.replace('/images', '/full_dataset')
os.makedirs(full_dataset_dir, exist_ok=True)

In [None]:
# Copy the train/ folder of the YOLO export into full_dataset_dir.
# NOTE(review): the paths are interpolated unquoted into a shell command and the exit status is not checked.
os.system(f"cp -r {yolo_format_dir}/train {full_dataset_dir}")

In [None]:
# Move the contents of train/ up into full_dataset_dir itself; rsync deletes each
# source file after transferring it, and all of its output is discarded.
os.system(f"cd {full_dataset_dir} && rsync -a --remove-source-files train/ . > /dev/null 2>&1")

In [61]:
# Remove the emptied train/ folder; os.rmdir raises if anything is still inside it.
os.rmdir(f"{full_dataset_dir}/train")

In [62]:
# Add the original dataset: the images and labels of each split are copied into
# full_dataset_dir, ending up together in its train/ and val/ sub-folders.
os.system(f"cp -r {images_train_source} {full_dataset_dir}")
os.system(f"cp -r {images_val_source} {full_dataset_dir}")
os.system(f"cp -r {labels_train_source} {full_dataset_dir}")
os.system(f"cp -r {labels_val_source} {full_dataset_dir}")

0

In [63]:
# Move the contents of train/ and val/ up into full_dataset_dir, next to the sliced files;
# rsync output (including errors) is discarded.
os.system(f"cd {full_dataset_dir} && rsync -a --remove-source-files train/ . > /dev/null 2>&1")
os.system(f"cd {full_dataset_dir} && rsync -a --remove-source-files val/ . > /dev/null 2>&1")

0

In [64]:
# Remove the emptied split folders; os.rmdir raises if a folder still has content.
os.rmdir(f"{full_dataset_dir}/train")
os.rmdir(f"{full_dataset_dir}/val")