- Make sure the current working directory is the 'demo/' folder:

In [None]:
import os
# Display the current working directory so it can be checked before running later cells.
os.getcwd()

- Import required functions:

In [3]:
from sahi.slicing import slice_coco
from sahi.utils.file import load_json

from PIL import Image, ImageDraw
import matplotlib.pyplot as plt

import os
import glob
from PIL import Image
from concurrent.futures import ThreadPoolExecutor
import shutil
import multiprocessing

### Optimized version

In [5]:
# Image file extensions, written without the leading dot.
# NOTE(review): no cell visible in this notebook reads `suffix`; the loops below
# repeat the same extensions inline.
suffix = ['png', 'jpg', 'jpeg', 'bmp', 'gif', 'tiff']

In [4]:
# Root folder of the dataset that the scan cell below globs (machine-specific absolute path).
source_dir = '/home/bohbot/ultralytics/datasets/mos/all_mos_new'

In [11]:
# Collect every image exactly three directory levels below `source_dir`, plus
# the YOLO label file that belongs to it (only when that file exists, so the
# two lists are not index-aligned).
all_images_paths = []
all_annot_paths = []

# The pattern contains no '**', so `recursive=True` had no effect and is dropped.
for file in glob.iglob(f"{source_dir}/*/*/*"):
    # Dotted suffixes so that only real extensions match (not e.g. "scan_png").
    if file.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.gif', '.tiff')):
        all_images_paths.append(file)
        # Map <...>/images/<name>.<ext> to <...>/labels/<name>.txt.
        # The 'images' -> 'labels' swap is applied to the directory part only
        # and the extension is removed with splitext, so a dot in a folder
        # name, a second dot in the file name, or the word "images" inside the
        # file name no longer produce a wrong label path.
        image_dir, image_name = os.path.split(file)
        txt_path = os.path.join(
            image_dir.replace('images', 'labels'),
            os.path.splitext(image_name)[0] + '.txt',
        )
        if os.path.exists(txt_path):
            all_annot_paths.append(txt_path)

In [None]:
# Report how many label files and how many images were collected.
print(f"found {len(all_annot_paths)} annotations and {len(all_images_paths)} images")

In [None]:
def process_file(file_path, large_files_dir):
    """Copy one image, and its sibling label file, if the image is large.

    An image counts as large when its width or its height exceeds 1000
    pixels. The label file is looked up next to the image, with the same
    base name and a ``.txt`` extension, and is copied only when it exists.

    Any failure is reported on stdout and the file is skipped, so that a
    single unreadable image does not abort a whole run. (Errors used to be
    swallowed silently, which made missing output impossible to diagnose.)

    Args:
        file_path (str): Path of the image to inspect.
        large_files_dir (str): Directory that receives the copies; expected
            to exist already.
    """
    try:
        # Read the dimensions inside the context manager so the image file
        # is closed again before anything is copied.
        with Image.open(file_path) as img:
            width, height = img.size
        if width <= 1000 and height <= 1000:
            return
        shutil.copy(file_path, large_files_dir)
        txt_file = os.path.splitext(file_path)[0] + '.txt'
        if os.path.exists(txt_file):
            shutil.copy(txt_file, large_files_dir)
    except Exception as e:
        # Best effort: keep going, but say which file failed and why.
        print(f"Error processing file {file_path}: {e}")

def copy_large_images(source_dir, large_files_dir):
    """Copy every large image found under ``source_dir`` into ``large_files_dir``.

    Walks ``source_dir`` recursively and hands each image file to
    ``process_file`` on a thread pool; ``process_file`` decides whether the
    image is large enough and performs the copy.

    Args:
        source_dir (str): Directory tree to scan.
        large_files_dir (str): Destination directory; created if missing.
            All copies land directly in this one directory, so files that
            share a base name in different sub-folders overwrite each other.
    """
    image_suffixes = ('.png', '.jpg', '.jpeg', '.bmp', '.gif', '.tiff')
    os.makedirs(large_files_dir, exist_ok=True)
    # Leave two cores free, but never ask for fewer than one worker:
    # ThreadPoolExecutor rejects max_workers <= 0, which is what
    # cpu_count() - 2 yields on machines with one or two cores.
    worker_count = max(1, multiprocessing.cpu_count() - 2)
    with ThreadPoolExecutor(max_workers=worker_count) as executor:
        for root, _, files in os.walk(source_dir):
            file_paths = [
                os.path.join(root, name)
                for name in files
                if name.lower().endswith(image_suffixes)
            ]
            if not file_paths:
                continue
            # Results are not consumed; leaving the `with` block waits for
            # all submitted work to finish.
            executor.map(process_file, file_paths, [large_files_dir] * len(file_paths))

def split_dataset(large_files_dir, batch_output_dir, batch_size):
    """Move images and their label files into numbered batch folders.

    Images in ``large_files_dir`` are taken in sorted name order; every
    ``batch_size`` consecutive images go to ``batch_1``, ``batch_2``, ...
    under ``batch_output_dir``. A ``.txt`` file with the same base name as
    an image is moved along with it.

    Only ``.png``, ``.jpg`` and ``.jpeg`` files are batched; anything else
    stays in ``large_files_dir``.

    Args:
        large_files_dir (str): Flat directory holding images and labels.
        batch_output_dir (str): Directory that receives the batch folders;
            created if missing.
        batch_size (int): Number of images per batch; must be positive.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    # Fail early with a clear message instead of a ZeroDivisionError (0) or
    # nonsensical batch numbers (negative values) half-way through the move.
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    os.makedirs(batch_output_dir, exist_ok=True)
    # Dotted suffixes: match real extensions only, not names that merely end
    # in "png"/"jpg" (os.listdir also returns sub-directories).
    image_suffixes = ('.png', '.jpg', '.jpeg')
    image_files = [
        name
        for name in sorted(os.listdir(large_files_dir))
        if name.lower().endswith(image_suffixes)
    ]
    for idx, image_file in enumerate(image_files):
        batch_folder = os.path.join(batch_output_dir, f"batch_{idx // batch_size + 1}")
        os.makedirs(batch_folder, exist_ok=True)
        shutil.move(os.path.join(large_files_dir, image_file), batch_folder)
        txt_path = os.path.join(large_files_dir, os.path.splitext(image_file)[0] + '.txt')
        if os.path.exists(txt_path):
            shutil.move(txt_path, batch_folder)

def copy_and_split_large_images(source_dir, large_files_dir, batch_output_dir, batch_size):
    """Run both pipeline stages in order: stage the large images, then batch them.

    Args:
        source_dir (str): Directory tree that is scanned for images.
        large_files_dir (str): Staging directory handed to both stages.
        batch_output_dir (str): Directory that receives the batch folders.
        batch_size (int): Number of images per batch.
    """
    # Stage 1: gather the large images (and their labels) in the staging folder.
    copy_large_images(source_dir=source_dir, large_files_dir=large_files_dir)
    # Stage 2: distribute the staged files over the batch folders.
    split_dataset(
        large_files_dir=large_files_dir,
        batch_output_dir=batch_output_dir,
        batch_size=batch_size,
    )

In [None]:
# Machine-specific paths and batch size passed to copy_and_split_large_images.
source_directory_or_paths = '/home/bohbot/Evyatar/git/crop_sahi/data' # folder with both images and txt files
large_files_directory = '/home/bohbot/Evyatar/git/crop_sahi/large_files' # staging folder: only the large images (and their txt files) are copied here
batch_output_directory = '/home/bohbot/Evyatar/git/crop_sahi/batches' # root folder under which the batch_<n> sub-folders are created
batch_size = 1000  # number of images per batch folder

In [None]:
# Stage the large images and split them into batch folders, using the paths configured above.
copy_and_split_large_images(source_directory_or_paths, large_files_directory, batch_output_directory, batch_size)

## Pre-slice: keep only images whose width or height exceeds 1000 px

In [None]:
import os
from PIL import Image
import shutil


def copy_and_split_large_images(source_dir, large_files_dir, batch_output_dir, batch_size):
    """
    Stage large images together with their labels, then distribute them into batches.

    An image is treated as large when its width or its height is greater than 1000.
    Large images, and a same-named .txt file from the same folder when present, are
    copied into `large_files_dir`. Afterwards the .png/.jpg/.jpeg files found in
    `large_files_dir` are moved, in sorted name order and with their .txt files,
    into `batch_1`, `batch_2`, ... below `batch_output_dir`.

    Args:
        source_dir (str): Directory tree that is walked for images and .txt files.
        large_files_dir (str): Staging directory for the large files (created if missing).
        batch_output_dir (str): Directory that receives the batch folders.
        batch_size (int): Number of images per batch.
    """
    copy_suffixes = ('.png', '.jpg', '.jpeg', '.bmp', '.gif', '.tiff')

    def split_dataset(source_dir, output_dir, batch_size):
        # Second stage: move staged images (and labels) into numbered batch folders.
        batch_suffixes = ('.png', '.jpg', '.jpeg')
        image_files = [
            name
            for name in sorted(os.listdir(source_dir))
            if name.lower().endswith(batch_suffixes)
        ]

        os.makedirs(output_dir, exist_ok=True)

        for position, image_name in enumerate(image_files):
            batch_folder = os.path.join(output_dir, f"batch_{position // batch_size + 1}")
            os.makedirs(batch_folder, exist_ok=True)

            # The image goes first, then its label file if there is one.
            shutil.move(os.path.join(source_dir, image_name), batch_folder)
            label_path = os.path.join(source_dir, os.path.splitext(image_name)[0] + '.txt')
            if os.path.exists(label_path):
                shutil.move(label_path, batch_folder)

        print(f"Dataset split into batches of size {batch_size}")

    # First stage: make sure the staging directory exists.
    if not os.path.exists(large_files_dir):
        os.makedirs(large_files_dir)

    for folder, _, names in os.walk(source_dir):
        for name in names:
            if not name.lower().endswith(copy_suffixes):
                continue

            image_path = os.path.join(folder, name)
            try:
                with Image.open(image_path) as img:
                    width, height = img.size
                    if max(width, height) <= 1000:
                        continue

                    shutil.copy(image_path, large_files_dir)

                    # The label shares the image's base name and folder.
                    label_path = os.path.join(folder, os.path.splitext(name)[0] + '.txt')
                    if os.path.exists(label_path):
                        shutil.copy(label_path, large_files_dir)
            except Exception as e:
                print(f"Error processing file {image_path}: {e}")

    print(f"Large files copied to {large_files_dir}")

    split_dataset(large_files_dir, batch_output_dir, batch_size)


# Example usage:
# NOTE(review): these assignments rebind large_files_directory, batch_output_directory
# and batch_size from the earlier configuration cell (same values) and introduce
# source_directory, which the YOLO export cell further down reads.
source_directory = '/home/bohbot/Evyatar/git/crop_sahi/data'
large_files_directory = '/home/bohbot/Evyatar/git/crop_sahi/large_files'
batch_output_directory = '/home/bohbot/Evyatar/git/crop_sahi/batches'
batch_size = 1000

# Runs the copy-and-split function defined in this cell on the paths above.
copy_and_split_large_images(source_directory, large_files_directory, batch_output_directory, batch_size)


In [None]:
rresponding .txt file
                            txt_file = os.path.splitext(file)[0] + '.txt'
                            txt_file_path = os.path.join(root, txt_file)
                            if os.path.exists(txt_file_path):
                                shutil.copy(txt_file_path, large_files_dir)

                except Exception as e:
                    print(f"Error processing file {file_path}: {e}")

    print(f"Large files copied to {large_files_dir}")

    # Step 2: Split the dataset into batches
    def split_dataset(source_dir, output_dir, batch_size):
        files = sorted(os.listdir(source_dir))
        image_files = [f for f in files if f.lower().endswith(('.png', '.jpg', '.jpeg'))]

        os.makedirs(output_dir, exist_ok=True)

        for idx, image_file in enumerate(image_files):
            batch_number = idx // batch_size + 1
            batch_folder = os.path.join(output_dir, f"batch_{batch_number}")
            os.makedirs(batch_folder, exist_ok=True)

            # Move the image to the batch folder
            image_path = os.path.join(source_dir, image_file)
            shutil.move(image_path, batch_folder)

            # Move the corresponding .txt file if it exists
            txt_file = os.path.splitext(image_file)[0] + '.txt'
            txt_file_path = os.path.join(source_dir, txt_file)
            if os.path.exists(txt_file_path):
                shutil.move(txt_file_path, batch_folder)

        print(f"Dataset split into batches of size {batch_size}")

    split_dataset(large_files_dir, batch_output_dir, batch_size)


# Example usage:
source_directory = '/home/bohbot/Evyatar/git/crop_sahi/data'
large_files_directory = '/home/bohbot/Evyatar/git/crop_sahi/large_files'
batch_output_directory = '/home/bohbot/Evyatar/git/crop_sahi/batches'
batch_size = 1000

copy_and_split_large_images(source_directory, large_files_directory, batch_output_directory, batch_size)


## 1. Convert YOLO annotations to COCO

In [15]:
import os
import json
import cv2

# YOLO to COCO Conversion Script
def convert_yolo_to_coco(yolo_annotations_dir, images_dir, output_json_path):
    """Convert a folder of YOLO bounding-box labels into a single COCO JSON file.

    For every ``.jpg``/``.png``/``.jpeg`` file in ``images_dir`` the label
    file with the same base name and a ``.txt`` extension is read from
    ``yolo_annotations_dir``. Images without a label file, and images that
    cannot be decoded, are reported and skipped.

    Each label line must have exactly five whitespace-separated fields,
    ``class x_center y_center width height``, with the four box values given
    as fractions of the image size. Lines with any other field count or with
    non-numeric values are reported and skipped; blank lines are ignored.

    The output holds ``images``, ``categories`` and ``annotations``. Boxes
    are stored as ``[x_min, y_min, width, height]`` in pixels. Categories
    are named ``category_<id>`` and listed in ascending id order.

    Args:
        yolo_annotations_dir (str): Directory containing the ``.txt`` labels.
        images_dir (str): Directory containing the images.
        output_json_path (str): Path of the JSON file to write.
    """
    coco_format = {
        "images": [],
        "categories": [],
        "annotations": []
    }

    # Category ids seen so far, mapped to their generated names.
    category_mapping = {}
    annotation_id = 1

    # Sorted so that image ids do not depend on the directory listing order.
    for image_filename in sorted(os.listdir(images_dir)):
        if not image_filename.lower().endswith(('.jpg', '.png', '.jpeg')):
            continue

        image_path = os.path.join(images_dir, image_filename)
        annotation_filename = os.path.splitext(image_filename)[0] + ".txt"
        annotation_path = os.path.join(yolo_annotations_dir, annotation_filename)

        if not os.path.exists(annotation_path):
            print(f"Annotation file missing for image: {image_filename}")
            continue

        # The image is decoded only to learn its dimensions.
        image = cv2.imread(image_path)
        if image is None:
            print(f"Unable to read image: {image_filename}")
            continue

        # Only the first two shape entries (rows, columns) are needed.
        height, width = image.shape[:2]
        image_id = len(coco_format["images"]) + 1

        coco_format["images"].append({
            "id": image_id,
            "file_name": image_filename,
            "height": height,
            "width": width
        })

        with open(annotation_path, "r") as f:
            for line in f:
                line_data = line.split()
                if not line_data:
                    # Blank (often trailing) line: nothing to report.
                    continue

                # Exactly five fields are required. Longer lines (for example
                # polygon labels or an extra confidence column) used to crash
                # the unpacking below; they are now reported and skipped
                # rather than being misread as a box.
                if len(line_data) != 5:
                    print(f"Invalid annotation in file: {annotation_filename}")
                    continue

                try:
                    category_id = int(line_data[0])
                    x_center, y_center, bbox_width, bbox_height = map(float, line_data[1:])
                except ValueError:
                    print(f"Invalid annotation in file: {annotation_filename}")
                    continue

                if category_id not in category_mapping:
                    category_mapping[category_id] = f"category_{category_id}"

                # Centre/size fractions -> top-left corner and size in pixels.
                x_min = (x_center - bbox_width / 2) * width
                y_min = (y_center - bbox_height / 2) * height
                bbox_width *= width
                bbox_height *= height

                coco_format["annotations"].append({
                    "id": annotation_id,
                    "image_id": image_id,
                    "category_id": category_id,
                    "bbox": [x_min, y_min, bbox_width, bbox_height],
                    "area": bbox_width * bbox_height,
                    "iscrowd": 0
                })
                annotation_id += 1

    # Emit categories in ascending id order instead of first-seen order.
    coco_format["categories"] = [
        {"id": category_id, "name": name, "supercategory": "none"}
        for category_id, name in sorted(category_mapping.items())
    ]

    with open(output_json_path, "w") as json_file:
        json.dump(coco_format, json_file, indent=4)

    print(f"COCO annotations successfully saved to {output_json_path}")



In [None]:
# Write a coco_annotations.json file into every entry of the batch output directory.
for i in sorted(os.listdir(batch_output_directory)):
        batch = os.path.join(batch_output_directory,i)
        # The same folder is passed as label directory and as image directory.
        yolo_dir = img_dir = batch
        json_out = os.path.join(batch, "coco_annotations.json")
        convert_yolo_to_coco(yolo_dir, img_dir, json_out)

## 2. Slicing COCO Dataset into Grids

- To slice a COCO dataset (annotations and images), we need to specify the slice parameters. In this example we slice the images into 640x640 tiles with an overlap ratio of 0.2:

In [None]:
# Slice every batch into 640x640 tiles with a 0.2 overlap ratio in both directions.
# coco_dict and coco_path are rebound on every iteration, so after the loop they
# hold the result for the last batch only.
for i in sorted(os.listdir(batch_output_directory)):
    batch = os.path.join(batch_output_directory,i)

    # COCO file expected inside the batch folder.
    json_out = os.path.join(batch, "coco_annotations.json")
    
    # Slices are written to a "sliced/" sub-folder of the batch.
    sliced_dir = os.path.join(batch,"sliced/")

    coco_dict, coco_path = slice_coco(
        coco_annotation_file_path=json_out,
        image_dir=batch,
        output_coco_annotation_file_name="sliced",
        # NOTE(review): presumably False keeps slices that contain no annotations - confirm against the SAHI docs.
        ignore_negative_samples=False,
        output_dir=sliced_dir,
        slice_height=640,
        slice_width=640,
        overlap_height_ratio=0.2,
        overlap_width_ratio=0.2,
        # NOTE(review): presumably the minimum fraction of a box that must fall inside a slice to be kept - confirm.
        min_area_ratio=0.1,
        verbose=True
    )

- Convert the sliced COCO annotations back to YOLO format:

In [None]:
from sahi.utils.coco import Coco

# Export the sliced COCO data of every batch in YOLOv5 format.
# NOTE(review): `source_directory` is assigned only in the "Example usage" part of the
# pre-slice cell; the optimized configuration cell defines `source_directory_or_paths` instead.
for i in sorted(os.listdir(batch_output_directory)):
  batch = os.path.join(batch_output_directory,i)
    
  sliced_dir = os.path.join(batch,"sliced/")
  print(sliced_dir)
  # Name matches output_coco_annotation_file_name="sliced" used in the slicing cell
  # (presumably SAHI appends "_coco.json" - confirm).
  json_sliced = os.path.join(sliced_dir,"sliced_coco.json")
  # init Coco object
  coco = Coco.from_coco_dict_or_path(json_sliced, image_dir=sliced_dir)

  # Export in YOLOv5 format. train_split_rate=1 is passed here, so the earlier
  # "85% train/15% val" note did not match the code (presumably everything lands in train).
  # NOTE(review): every batch is exported to the same output_dir - confirm that
  # successive exports add to it rather than replace it.
  coco.export_as_yolov5(
    output_dir=os.path.join(source_directory,"sliced_images_with_yolo_format"),
    train_split_rate=1,
      disable_symlink=True
  )


- Visualize sliced annotations on sliced images:

In [None]:
# Show up to 20 sliced images in a 4 x 5 grid with their bounding boxes drawn on top.
# `coco_dict` and `sliced_dir` are whatever the slicing loop left behind, i.e. they
# describe the last batch that was sliced. Images are read from `sliced_dir`; the
# previously hard-coded "demo_data/sliced/" folder is not written by any cell here.
f, axarr = plt.subplots(4, 5, figsize=(13,13))

# Group the boxes by image id once instead of rescanning every annotation per image.
boxes_by_image = {}
for ann in coco_dict["annotations"]:
    boxes_by_image.setdefault(ann["image_id"], []).append(ann["bbox"])

# axarr.flat walks the grid row by row; zip stops at the shorter input, so a
# dataset with fewer than 20 images no longer raises IndexError.
for ax, image_info in zip(axarr.flat, coco_dict["images"]):
    # read image
    img = Image.open(os.path.join(sliced_dir, image_info["file_name"])).convert('RGBA')
    draw = ImageDraw.Draw(img, 'RGBA')
    for xywh in boxes_by_image.get(image_info["id"], []):
        # convert coco bbox (x, y, w, h) to pil bbox (x0, y0, x1, y1)
        xyxy = [xywh[0], xywh[1], xywh[0]+xywh[2], xywh[1]+xywh[3]]
        draw.rectangle(xyxy, width=5)
    # Display once per image, after all boxes are drawn. This used to sit inside
    # the annotation loop, so images were redrawn once per annotation and never
    # shown at all when there were no annotations.
    ax.imshow(img)
