In [1]:
import os
import shutil
import json
import random
from tqdm import tqdm  # Import progress bar

# Root folders: the source dataset and the reduced copy that is built from it
original_base_path = "coco2017"
subset_base_path = "cocosubset"

# Per-split image folders and the annotation folder of the source dataset
(
    original_train_images_path,
    original_val_images_path,
    original_test_images_path,
) = (os.path.join(original_base_path, split) for split in ("train2017", "val2017", "test2017"))
original_annotations_path = os.path.join(original_base_path, "annotations")

# The subset mirrors the same folder layout under its own root
(
    subset_train_images_path,
    subset_val_images_path,
    subset_test_images_path,
) = (os.path.join(subset_base_path, split) for split in ("train2017", "val2017", "test2017"))
subset_annotations_path = os.path.join(subset_base_path, "annotations")

# Instance annotation files (only the train and val splits have one here)
original_train_annotations_path = os.path.join(original_annotations_path, "instances_train2017.json")
subset_train_annotations_path = os.path.join(subset_annotations_path, "instances_train2017.json")

original_val_annotations_path = os.path.join(original_annotations_path, "instances_val2017.json")
subset_val_annotations_path = os.path.join(subset_annotations_path, "instances_val2017.json")

# Number of images per split, chosen for a ~2GB subset in total
# (train ~1.4GB, val ~0.3GB, test ~0.3GB)
train_subset_size, val_subset_size, test_subset_size = 5600, 1200, 1200

# Function to create image subset and annotations
def create_subset(original_images_path, original_annotations_path, subset_images_path, subset_annotations_path, subset_size, dataset_name):
    """Build a random image subset of one dataset split together with its annotations.

    Reads the annotation JSON, samples images from its "images" list, writes a
    new annotation JSON restricted to the sampled images, and copies the
    sampled image files into the subset folder.

    Args:
        original_images_path: Folder containing the source image files.
        original_annotations_path: Path to the source annotation JSON file.
        subset_images_path: Folder the sampled images are copied into (created if missing).
        subset_annotations_path: Path of the annotation JSON to write (parent folder created if missing).
        subset_size: Number of images to sample. If the split holds fewer
            images, all of them are used instead of raising.
        dataset_name: Label used in progress messages.

    Raises:
        FileNotFoundError: If the annotation file or a sampled image is missing.
        KeyError: If the annotation JSON lacks one of the expected top-level keys.
    """
    print(f"\nProcessing {dataset_name} dataset...")

    # Load COCO annotations
    with open(original_annotations_path, "r") as f:
        coco_data = json.load(f)

    # Get a random subset of images; random.sample raises ValueError when asked
    # for more items than exist, so cap the request at what is available.
    all_images = coco_data["images"]
    sample_size = min(subset_size, len(all_images))
    if sample_size < subset_size:
        print(f"Only {len(all_images)} images available for {dataset_name}; using all of them.")
    subset_images = random.sample(all_images, sample_size)

    # Build the id lookup once: constructing it inside the filter below would
    # rebuild the whole set for every single annotation.
    subset_image_ids = {img["id"] for img in subset_images}

    # Create new annotation file
    subset_annotations = {
        "info": coco_data["info"],
        "licenses": coco_data["licenses"],
        "images": subset_images,
        "annotations": [ann for ann in coco_data["annotations"] if ann["image_id"] in subset_image_ids],
        "categories": coco_data["categories"],
    }

    # Save new annotations file
    os.makedirs(os.path.dirname(subset_annotations_path), exist_ok=True)
    with open(subset_annotations_path, "w") as f:
        json.dump(subset_annotations, f, indent=4)

    # Copy selected images with progress bar
    os.makedirs(subset_images_path, exist_ok=True)
    print(f"Copying {len(subset_images)} images for {dataset_name}...")
    for img in tqdm(subset_images, desc=f"Copying {dataset_name}", unit="img"):
        src = os.path.join(original_images_path, img["file_name"])
        dst = os.path.join(subset_images_path, img["file_name"])
        shutil.copy(src, dst)

# Seed the sampler so re-running the cell selects the same images instead of
# piling a second, different selection into the subset folders
SUBSET_SEED = 42
random.seed(SUBSET_SEED)

# Create train subset
create_subset(original_train_images_path, original_train_annotations_path, subset_train_images_path, subset_train_annotations_path, train_subset_size, "Train")

# Create validation subset
create_subset(original_val_images_path, original_val_annotations_path, subset_val_images_path, subset_val_annotations_path, val_subset_size, "Validation")

# Create test subset (random selection of files, since no annotation file is used for it)
if os.path.exists(original_test_images_path):
    os.makedirs(subset_test_images_path, exist_ok=True)
    available_test_images = os.listdir(original_test_images_path)
    # random.sample raises ValueError when asked for more items than exist
    test_images = random.sample(available_test_images, min(test_subset_size, len(available_test_images)))
    print("\nCopying test images...")
    for img in tqdm(test_images, desc="Copying Test", unit="img"):
        shutil.copy(os.path.join(original_test_images_path, img), os.path.join(subset_test_images_path, img))

# Copy the rest of the annotations folder (for compatibility).
# The two instance files written by create_subset are excluded: copying the
# originals over them would replace the subset annotations with the full ones.
print("\nCopying annotations...")
shutil.copytree(
    original_annotations_path,
    subset_annotations_path,
    ignore=shutil.ignore_patterns(
        os.path.basename(subset_train_annotations_path),
        os.path.basename(subset_val_annotations_path),
    ),
    dirs_exist_ok=True,
)

print("\nSubset creation complete! (~2GB total)")



Processing Train dataset...
Copying 5600 images for Train...


Copying Train: 100%|██████████| 5600/5600 [01:05<00:00, 85.90img/s] 



Processing Validation dataset...
Copying 1200 images for Validation...


Copying Validation: 100%|██████████| 1200/1200 [00:06<00:00, 175.06img/s]



Copying test images...


Copying Test: 100%|██████████| 1200/1200 [00:11<00:00, 100.78img/s]



Copying annotations...

Subset creation complete! (~2GB total)


### POTHOLE DATASET


In [7]:
import os
import random
import shutil
import xml.etree.ElementTree as ET
from tqdm import tqdm  # Progress bar

# Everything for this dataset lives under one root folder
DATASET_PATH = "pothole"

# Source annotations (XML) and source images
ANNOTATIONS_PATH = os.path.join(DATASET_PATH, "annotations")
IMAGES_PATH = os.path.join(DATASET_PATH, "images")

# YOLO layout: one images folder and one labels folder per split
YOLO_IMAGES_TRAIN, YOLO_IMAGES_VAL, YOLO_IMAGES_TEST = (
    os.path.join(DATASET_PATH, f"images/{split}") for split in ("train", "val", "test")
)
YOLO_LABELS_TRAIN, YOLO_LABELS_VAL, YOLO_LABELS_TEST = (
    os.path.join(DATASET_PATH, f"labels/{split}") for split in ("train", "val", "test")
)

# Create the six output folders up front; existing folders are left untouched
for yolo_dir in (
    YOLO_IMAGES_TRAIN, YOLO_IMAGES_VAL, YOLO_IMAGES_TEST,
    YOLO_LABELS_TRAIN, YOLO_LABELS_VAL, YOLO_LABELS_TEST,
):
    os.makedirs(yolo_dir, exist_ok=True)

# Class names (modify if needed); a label's position in this list is its YOLO class id
classes = ["pothole"]

def convert_voc_to_yolo(xml_file):
    """Turn one Pascal VOC XML annotation file into YOLO label lines.

    Args:
        xml_file: Path to the VOC XML file.

    Returns:
        A tuple ``(image_name, yolo_annotations)``: the text of the
        ``<filename>`` element, and one string per kept object formatted as
        ``"class_id x_center y_center width height"`` with the four box values
        divided by the image width/height. Objects whose name is not in the
        module-level ``classes`` list are skipped.
    """
    annotation = ET.parse(xml_file).getroot()

    image_name = annotation.find("filename").text
    image_width = int(annotation.find("size/width").text)
    image_height = int(annotation.find("size/height").text)

    yolo_annotations = []
    for obj in annotation.findall("object"):
        label = obj.find("name").text
        if label in classes:
            # The class id is the label's position in the class list
            class_id = classes.index(label)

            box = obj.find("bndbox")
            xmin, ymin, xmax, ymax = (
                int(box.find(tag).text) for tag in ("xmin", "ymin", "xmax", "ymax")
            )

            # Corner coordinates -> normalised centre point and box size
            x_center = (xmin + xmax) / 2 / image_width
            y_center = (ymin + ymax) / 2 / image_height
            width = (xmax - xmin) / image_width
            height = (ymax - ymin) / image_height

            yolo_annotations.append(f"{class_id} {x_center} {y_center} {width} {height}")

    return image_name, yolo_annotations

# Ensure annotations directory exists
if not os.path.exists(ANNOTATIONS_PATH):
    raise FileNotFoundError(f"Annotations folder '{ANNOTATIONS_PATH}' not found!")

# Get all XML annotation files. Sorted because os.listdir order is arbitrary
# and a reproducible split needs a stable starting order.
all_xml_files = sorted(f for f in os.listdir(ANNOTATIONS_PATH) if f.endswith(".xml"))

# Shuffle dataset and split into train (70%), val (20%), test (10%).
# A fixed seed keeps every file in the same split on re-runs; without it a
# second run would copy files into a different split while the old copies
# stay behind, leaking samples between train/val/test.
SPLIT_SEED = 42
random.Random(SPLIT_SEED).shuffle(all_xml_files)
train_split = int(0.7 * len(all_xml_files))
val_split = int(0.9 * len(all_xml_files))  # 70% train + 20% val = 90%, remaining 10% is test

train_files = all_xml_files[:train_split]
val_files = all_xml_files[train_split:val_split]
test_files = all_xml_files[val_split:]

# Annotations whose image file was not found, reported once at the end
missing_images = []

# Process files with progress bar
for dataset_type, xml_files, img_dest, label_dest in [
    ("train", train_files, YOLO_IMAGES_TRAIN, YOLO_LABELS_TRAIN),
    ("val", val_files, YOLO_IMAGES_VAL, YOLO_LABELS_VAL),
    ("test", test_files, YOLO_IMAGES_TEST, YOLO_LABELS_TEST)
]:
    print(f"📂 Processing {dataset_type} set...")

    for xml_file in tqdm(xml_files, desc=f"Converting {dataset_type}", unit="file"):
        xml_path = os.path.join(ANNOTATIONS_PATH, xml_file)
        image_name, yolo_annotations = convert_voc_to_yolo(xml_path)

        # Save YOLO annotation file. The label name is derived by swapping the
        # extension, whatever it is; replacing only ".png" would leave e.g. a
        # ".jpg" name unchanged and write the label under the image's name.
        label_name = os.path.splitext(image_name)[0] + ".txt"
        yolo_label_path = os.path.join(label_dest, label_name)
        with open(yolo_label_path, "w") as f:
            f.write("\n".join(yolo_annotations))

        # Copy corresponding image (the source image is left in place)
        src_img_path = os.path.join(IMAGES_PATH, image_name)
        dst_img_path = os.path.join(img_dest, image_name)
        if os.path.exists(src_img_path):
            shutil.copy(src_img_path, dst_img_path)
        else:
            missing_images.append(src_img_path)

if missing_images:
    print(f"⚠️ {len(missing_images)} annotation(s) had no matching image, e.g. {missing_images[0]}")

print("✅ Dataset conversion and organization completed!")


📂 Processing train set...


Converting train: 100%|██████████| 465/465 [00:15<00:00, 29.16file/s]


📂 Processing val set...


Converting val: 100%|██████████| 133/133 [00:04<00:00, 31.28file/s]


📂 Processing test set...


Converting test: 100%|██████████| 67/67 [00:01<00:00, 34.85file/s]

✅ Dataset conversion and organization completed!





## EASY-OCR

### ICDAR 2015

#### Rename Files

In [8]:
import os
from tqdm import tqdm

# Set the dataset path
DATASET_PATH = "icdar2015/data"  # Change this if needed

# Prefix added in front of every image/annotation file name
FILENAME_PREFIX = "letter"

# Get all files
files = os.listdir(DATASET_PATH)

# Rename all .jpg and .txt files
for filename in tqdm(files, desc="Renaming files", unit="file"):
    # Only image and annotation files are renamed
    if not filename.endswith((".jpg", ".txt")):
        continue

    # Skip names that already carry the prefix so that re-running the cell
    # does not stack it ("letterletter...").
    # NOTE(review): this assumes no original file name starts with the prefix — confirm.
    if filename.startswith(FILENAME_PREFIX):
        continue

    old_path = os.path.join(DATASET_PATH, filename)
    new_path = os.path.join(DATASET_PATH, f"{FILENAME_PREFIX}{filename}")
    os.rename(old_path, new_path)

print("✅ Renaming completed!")


Renaming files: 100%|██████████| 34282/34282 [00:36<00:00, 930.17file/s] 

✅ Renaming completed!





#### Remove Corrupted Images & Empty Annotations

In [7]:
import cv2
import os
from tqdm import tqdm
# Get list of files
DATASET_PATH = "icdar2015/data"  # Change this if needed
files = os.listdir(DATASET_PATH)

# Remove corrupted images
for filename in tqdm(files, desc="Checking images", unit="file"):
    if not filename.endswith(".jpg"):
        continue
    img_path = os.path.join(DATASET_PATH, filename)

    # Guard only the read. With the delete inside the try block, a failing
    # os.remove would be reported as a load error and then attempted again.
    try:
        img = cv2.imread(img_path)
    except Exception as e:
        print(f"Error loading {filename}: {e}")
        img = None
    else:
        if img is None:
            print(f"❌ Corrupted image: {filename} (Deleting...)")

    # An image that could not be read (None result or exception) is deleted once
    if img is None:
        os.remove(img_path)

print("✅ Image cleaning completed!")

# Remove empty annotation files (the listing above is still valid for .txt
# files because the first pass only deletes .jpg files)
for filename in tqdm(files, desc="Checking annotations", unit="file"):
    if filename.endswith(".txt"):
        txt_path = os.path.join(DATASET_PATH, filename)

        if os.path.getsize(txt_path) == 0:
            print(f"❌ Empty annotation file: {filename} (Deleting...)")
            os.remove(txt_path)

print("✅ Annotation cleaning completed!")


Checking images: 100%|██████████| 34282/34282 [03:16<00:00, 174.79file/s]


✅ Image cleaning completed!


Checking annotations: 100%|██████████| 34282/34282 [00:00<00:00, 41061.35file/s]

✅ Annotation cleaning completed!





#### Update train.txt and val.txt to match the renamed files

In [9]:
# Prefix that the entries in the list files must carry
LIST_ENTRY_PREFIX = "letter"


def add_prefix_to_entries(lines, desc):
    """Return the list-file lines with the prefix added to each entry.

    Args:
        lines: Raw lines read from a list file.
        desc: Progress-bar description.

    Returns:
        One newline-terminated string per non-blank input line. Entries that
        already start with the prefix are kept as they are, so running the
        cell again does not stack the prefix; blank lines are dropped because
        prefixing them would create an entry consisting of the prefix alone.
    """
    updated = []
    for line in tqdm(lines, desc=desc, unit="line"):
        entry = line.strip()
        if not entry:
            continue
        if not entry.startswith(LIST_ENTRY_PREFIX):
            entry = f"{LIST_ENTRY_PREFIX}{entry}"
        updated.append(f"{entry}\n")
    return updated


# Read the train and val lists
with open("icdar2015/train.txt", "r") as f:
    train_files = f.readlines()
with open("icdar2015/val.txt", "r") as f:
    val_files = f.readlines()

# Rename inside the files with progress
train_files = add_prefix_to_entries(train_files, "Updating train.txt")
val_files = add_prefix_to_entries(val_files, "Updating val.txt")

# Write back the updated lists
with open("icdar2015/train.txt", "w") as f:
    f.writelines(train_files)
with open("icdar2015/val.txt", "w") as f:
    f.writelines(val_files)

print("✅ train.txt and val.txt updated!")


Updating train.txt: 100%|██████████| 13712/13712 [00:00<00:00, 3984501.62line/s]
Updating val.txt: 100%|██████████| 3429/3429 [00:00<00:00, 3109679.66line/s]

✅ train.txt and val.txt updated!



