In [1]:
import os
import shutil
import json
import random
from tqdm import tqdm  # Import progress bar

# Define original dataset paths
original_base_path = "coco2017"
original_train_images_path = os.path.join(original_base_path, "train2017")
original_val_images_path = os.path.join(original_base_path, "val2017")
original_test_images_path = os.path.join(original_base_path, "test2017")
original_annotations_path = os.path.join(original_base_path, "annotations")

# Define subset paths
subset_base_path = "cocosubset"
subset_train_images_path = os.path.join(subset_base_path, "train2017")
subset_val_images_path = os.path.join(subset_base_path, "val2017")
subset_test_images_path = os.path.join(subset_base_path, "test2017")
subset_annotations_path = os.path.join(subset_base_path, "annotations")

# Define annotation file paths
original_train_annotations_path = os.path.join(original_annotations_path, "instances_train2017.json")
original_val_annotations_path = os.path.join(original_annotations_path, "instances_val2017.json")

subset_train_annotations_path = os.path.join(subset_annotations_path, "instances_train2017.json")
subset_val_annotations_path = os.path.join(subset_annotations_path, "instances_val2017.json")

# Set image counts based on ~2GB total size
train_subset_size = 5600  # ~1.4GB
val_subset_size = 1200    # ~0.3GB
test_subset_size = 1200   # ~0.3GB

# Function to create image subset and annotations
def create_subset(original_images_path, original_annotations_path, subset_images_path, subset_annotations_path, subset_size, dataset_name):
    print(f"\nProcessing {dataset_name} dataset...")

    # Load COCO annotations
    with open(original_annotations_path, "r") as f:
        coco_data = json.load(f)

    # Get a random subset of images
    all_images = coco_data["images"]
    subset_images = random.sample(all_images, subset_size)

    # Create new annotation file
    subset_annotations = {
        "info": coco_data["info"],
        "licenses": coco_data["licenses"],
        "images": subset_images,
        "annotations": [ann for ann in coco_data["annotations"] if ann["image_id"] in {img["id"] for img in subset_images}],
        "categories": coco_data["categories"],
    }

    # Save new annotations file
    os.makedirs(os.path.dirname(subset_annotations_path), exist_ok=True)
    with open(subset_annotations_path, "w") as f:
        json.dump(subset_annotations, f, indent=4)

    # Copy selected images with progress bar
    os.makedirs(subset_images_path, exist_ok=True)
    print(f"Copying {subset_size} images for {dataset_name}...")
    for img in tqdm(subset_images, desc=f"Copying {dataset_name}", unit="img"):
        src = os.path.join(original_images_path, img["file_name"])
        dst = os.path.join(subset_images_path, img["file_name"])
        shutil.copy(src, dst)

# Create train subset
create_subset(original_train_images_path, original_train_annotations_path, subset_train_images_path, subset_train_annotations_path, train_subset_size, "Train")

# Create validation subset
create_subset(original_val_images_path, original_val_annotations_path, subset_val_images_path, subset_val_annotations_path, val_subset_size, "Validation")

# Create test subset (random selection if no annotations)
if os.path.exists(original_test_images_path):
    os.makedirs(subset_test_images_path, exist_ok=True)
    test_images = random.sample(os.listdir(original_test_images_path), test_subset_size)
    print("\nCopying test images...")
    for img in tqdm(test_images, desc="Copying Test", unit="img"):
        shutil.copy(os.path.join(original_test_images_path, img), os.path.join(subset_test_images_path, img))

# Copy full annotations folder (for compatibility)
print("\nCopying annotations...")
shutil.copytree(original_annotations_path, subset_annotations_path, dirs_exist_ok=True)

print("\nSubset creation complete! (~2GB total)")



Processing Train dataset...
Copying 5600 images for Train...


Copying Train: 100%|██████████| 5600/5600 [01:05<00:00, 85.90img/s] 



Processing Validation dataset...
Copying 1200 images for Validation...


Copying Validation: 100%|██████████| 1200/1200 [00:06<00:00, 175.06img/s]



Copying test images...


Copying Test: 100%|██████████| 1200/1200 [00:11<00:00, 100.78img/s]



Copying annotations...

Subset creation complete! (~2GB total)


In [3]:
import json
import os
import pandas as pd  # Using pandas instead of cudf
import concurrent.futures
from tqdm import tqdm

# Paths (Update these if necessary)
annotations_paths = {
    "train": ("cocosubset/annotations/instances_train2017.json", "cocosubset/train2017", "cocosubset/labels/train2017"),
    "val": ("cocosubset/annotations/instances_val2017.json", "cocosubset/val2017", "cocosubset/labels/val2017"),
}

# Process both train and val datasets
for dataset, (coco_json, image_folder, output_label_folder) in annotations_paths.items():
    print(f"\n🔄 Processing {dataset} dataset...")

    # Create labels folder if it doesn't exist
    os.makedirs(output_label_folder, exist_ok=True)

    # Load COCO JSON
    with open(coco_json, "r") as f:
        coco_data = json.load(f)

    # Get image filenames present in the folder
    existing_images = set(os.listdir(image_folder))

    # Create a dictionary for fast lookups (image_id → (width, height, filename))
    image_id_to_data = {
        img["id"]: (img["width"], img["height"], img["file_name"])
        for img in coco_data["images"] if img["file_name"] in existing_images
    }

    # Map COCO category IDs to YOLO format category IDs
    category_map = {cat["id"]: idx for idx, cat in enumerate(coco_data["categories"])}

    # Convert annotations into a pandas DataFrame
    annotations_df = pd.DataFrame.from_records(coco_data["annotations"])

    # Function to process a single annotation
    def process_annotation(ann):
        image_id = ann["image_id"]

        # Ensure image_id exists
        if image_id in image_id_to_data:
            img_width, img_height, file_name = image_id_to_data[image_id]
            txt_file = os.path.join(output_label_folder, file_name.replace(".jpg", ".txt"))

            # Extract bbox and convert to YOLO format
            x, y, w, h = ann["bbox"]
            x_center = (x + w / 2) / img_width
            y_center = (y + h / 2) / img_height
            w_norm = w / img_width
            h_norm = h / img_height
            category = category_map[ann["category_id"]]

            # Save to .txt file
            with open(txt_file, "a") as f:
                f.write(f"{category} {x_center} {y_center} {w_norm} {h_norm}\n")

    # Process annotations using multiple threads for speed
    with concurrent.futures.ThreadPoolExecutor() as executor:
        list(tqdm(executor.map(process_annotation, annotations_df.to_dict(orient="records")), 
                  total=len(annotations_df), desc=f"Processing {dataset} Annotations"))

    print(f"✅ {dataset} dataset conversion complete!")



🔄 Processing train dataset...


Processing train Annotations: 100%|██████████| 860001/860001 [00:18<00:00, 45430.08it/s]


✅ train dataset conversion complete!

🔄 Processing val dataset...


Processing val Annotations: 100%|██████████| 36781/36781 [00:02<00:00, 17127.61it/s]

✅ val dataset conversion complete!





#### FIND AND FIX INCORRECT LABELS 


In [8]:
import os

labels_path = r"C:\Users\Kirk Recio\Documents\PYTHON\Project Testing\Data Cleaning Training\cocosubset\labels"

for split in ["train2017", "val2017"]:
    split_path = os.path.join(labels_path, split)

    # Check if the folder exists
    if not os.path.exists(split_path):
        print(f"🚨 Folder not found: {split_path}")
        continue  # Skip to the next loop iteration

    for label_file in os.listdir(split_path):
        with open(os.path.join(split_path, label_file), "r") as f:
            lines = f.readlines()

        # Check for invalid class IDs
        valid_lines = [line for line in lines if int(line.split()[0]) < 80]

        # Overwrite file if changes were made
        if len(valid_lines) != len(lines):
            with open(os.path.join(split_path, label_file), "w") as f:
                f.writelines(valid_lines)
            print(f"✅ Fixed {label_file}")


ValueError: invalid literal for int() with base 10: '0.041015625'

In [6]:
import os

labels_dir = r"cocosubset\labels\train2017"  # Use raw string or forward slashes
max_classes = 80  # Expected max class index

for file in os.listdir(labels_dir):
    if file.endswith(".txt"):
        path = os.path.join(labels_dir, file)
        with open(path, "r") as f:
            lines = f.readlines()

        valid_lines = []
        for line in lines:
            parts = line.split()
            if len(parts) > 0 and parts[0].isdigit():
                class_id = int(parts[0])
                if class_id < max_classes:
                    valid_lines.append(line)

        if valid_lines:  # Only write if there are valid labels
            with open(path, "w") as f:
                f.writelines(valid_lines)
        else:
            os.remove(path)  # Remove empty label file

print("Invalid labels removed.")


Invalid labels removed.


### POTHOLE DATASET


In [2]:
import os
import random
import shutil
import xml.etree.ElementTree as ET
from tqdm import tqdm  # Progress bar

# Root dataset folder
DATASET_PATH = "pothole"

# Paths to annotations and images inside "pothole"
ANNOTATIONS_PATH = os.path.join(DATASET_PATH, "annotations")
IMAGES_PATH = os.path.join(DATASET_PATH, "images")

# YOLO formatted dataset structure (train, val, test)
YOLO_IMAGES_TRAIN = os.path.join(DATASET_PATH, "images/train2017")
YOLO_IMAGES_VAL = os.path.join(DATASET_PATH, "images/val2017")
YOLO_IMAGES_TEST = os.path.join(DATASET_PATH, "images/test2017")
YOLO_LABELS_TRAIN = os.path.join(DATASET_PATH, "labels/train2017")
YOLO_LABELS_VAL = os.path.join(DATASET_PATH, "labels/val2017")
YOLO_LABELS_TEST = os.path.join(DATASET_PATH, "labels/test2017")  # ✅ Now included!

# Ensure YOLO directories exist
for path in [YOLO_IMAGES_TRAIN, YOLO_IMAGES_VAL, YOLO_IMAGES_TEST, 
             YOLO_LABELS_TRAIN, YOLO_LABELS_VAL, YOLO_LABELS_TEST]:
    os.makedirs(path, exist_ok=True)

# Class names (modify if needed)
classes = ["pothole"]

def convert_voc_to_yolo(xml_file):
    """ Convert Pascal VOC XML annotations to YOLO format """
    tree = ET.parse(xml_file)
    root = tree.getroot()

    image_name = root.find("filename").text
    image_width = int(root.find("size/width").text)
    image_height = int(root.find("size/height").text)

    yolo_annotations = []

    for obj in root.findall("object"):
        class_name = obj.find("name").text
        if class_name not in classes:
            continue
        class_id = classes.index(class_name)

        bbox = obj.find("bndbox")
        xmin = int(bbox.find("xmin").text)
        ymin = int(bbox.find("ymin").text)
        xmax = int(bbox.find("xmax").text)
        ymax = int(bbox.find("ymax").text)

        # Convert to YOLO format
        x_center = (xmin + xmax) / 2 / image_width
        y_center = (ymin + ymax) / 2 / image_height
        width = (xmax - xmin) / image_width
        height = (ymax - ymin) / image_height

        yolo_annotations.append(f"{class_id} {x_center} {y_center} {width} {height}")

    return image_name, yolo_annotations

# Ensure annotations directory exists
if not os.path.exists(ANNOTATIONS_PATH):
    raise FileNotFoundError(f"Annotations folder '{ANNOTATIONS_PATH}' not found!")

# Get all XML annotation files
all_xml_files = [f for f in os.listdir(ANNOTATIONS_PATH) if f.endswith(".xml")]

# Shuffle dataset and split into train (70%), val (20%), test (10%)
random.shuffle(all_xml_files)
train_split = int(0.7 * len(all_xml_files))
val_split = int(0.9 * len(all_xml_files))  # 70% train + 20% val = 90%, remaining 10% is test

train_files = all_xml_files[:train_split]
val_files = all_xml_files[train_split:val_split]
test_files = all_xml_files[val_split:]

# Process files with progress bar
for dataset_type, xml_files, img_dest, label_dest in [
    ("train2017", train_files, YOLO_IMAGES_TRAIN, YOLO_LABELS_TRAIN),
    ("val2017", val_files, YOLO_IMAGES_VAL, YOLO_LABELS_VAL),
    ("test2017", test_files, YOLO_IMAGES_TEST, YOLO_LABELS_TEST)  # ✅ Now processes test2017
]:
    print(f"📂 Processing {dataset_type} set...")
    
    for xml_file in tqdm(xml_files, desc=f"Converting {dataset_type}", unit="file"):
        xml_path = os.path.join(ANNOTATIONS_PATH, xml_file)
        image_name, yolo_annotations = convert_voc_to_yolo(xml_path)

        # Save YOLO annotation file
        yolo_label_path = os.path.join(label_dest, image_name.replace(".png", ".txt"))
        with open(yolo_label_path, "w") as f:
            f.write("\n".join(yolo_annotations))

        # Move corresponding image
        src_img_path = os.path.join(IMAGES_PATH, image_name)
        dst_img_path = os.path.join(img_dest, image_name)
        if os.path.exists(src_img_path):
            shutil.copy(src_img_path, dst_img_path)

print("✅ Dataset conversion and organization completed!")


📂 Processing train2017 set...


Converting train2017:   0%|          | 0/465 [00:00<?, ?file/s]

Converting train2017: 100%|██████████| 465/465 [00:03<00:00, 121.77file/s]


📂 Processing val2017 set...


Converting val2017: 100%|██████████| 133/133 [00:00<00:00, 148.46file/s]


📂 Processing test2017 set...


Converting test2017: 100%|██████████| 67/67 [00:00<00:00, 154.41file/s]

✅ Dataset conversion and organization completed!





## EASY-OCR

### ICDAR 2015

#### Rename Files

In [8]:
import os
from tqdm import tqdm

# Set the dataset path
DATASET_PATH = "icdar2015/data"  # Change this if needed

# Get all files
files = os.listdir(DATASET_PATH)

# Rename all .jpg and .txt files
for filename in tqdm(files, desc="Renaming files", unit="file"):
    old_path = os.path.join(DATASET_PATH, filename)
    
    # Check if the file is a .jpg or .txt
    if filename.endswith(".jpg") or filename.endswith(".txt"):
        new_filename = f"letter{filename}"
        new_path = os.path.join(DATASET_PATH, new_filename)
        os.rename(old_path, new_path)

print("✅ Renaming completed!")


Renaming files: 100%|██████████| 34282/34282 [00:36<00:00, 930.17file/s] 

✅ Renaming completed!





####  Remove Corrupted Images & Empty Annotations

In [7]:
import cv2
import os
from tqdm import tqdm
# Get list of files
DATASET_PATH = "icdar2015/data"  # Change this if needed
files = os.listdir(DATASET_PATH)

# Remove corrupted images
for filename in tqdm(files, desc="Checking images", unit="file"):
    if filename.endswith(".jpg"):
        img_path = os.path.join(DATASET_PATH, filename)
        
        try:
            img = cv2.imread(img_path)
            if img is None:
                print(f"❌ Corrupted image: {filename} (Deleting...)")
                os.remove(img_path)
        except Exception as e:
            print(f"Error loading {filename}: {e}")
            os.remove(img_path)

print("✅ Image cleaning completed!")

# Remove empty annotation files
for filename in tqdm(files, desc="Checking annotations", unit="file"):
    if filename.endswith(".txt"):
        txt_path = os.path.join(DATASET_PATH, filename)
        
        if os.path.getsize(txt_path) == 0:
            print(f"❌ Empty annotation file: {filename} (Deleting...)")
            os.remove(txt_path)

print("✅ Annotation cleaning completed!")


Checking images: 100%|██████████| 34282/34282 [03:16<00:00, 174.79file/s]


✅ Image cleaning completed!


Checking annotations: 100%|██████████| 34282/34282 [00:00<00:00, 41061.35file/s]

✅ Annotation cleaning completed!





#### train.txt and val.txt with Progress

In [9]:
# Read the train and val lists
with open("icdar2015/train.txt", "r") as f:
    train_files = f.readlines()
with open("icdar2015/val.txt", "r") as f:
    val_files = f.readlines()

# Rename inside the files with progress
train_files = [f"letter{line.strip()}\n" for line in tqdm(train_files, desc="Updating train.txt", unit="line")]
val_files = [f"letter{line.strip()}\n" for line in tqdm(val_files, desc="Updating val.txt", unit="line")]

# Write back the updated lists
with open("icdar2015/train.txt", "w") as f:
    f.writelines(train_files)
with open("icdar2015/val.txt", "w") as f:
    f.writelines(val_files)

print("✅ train.txt and val.txt updated!")


Updating train.txt: 100%|██████████| 13712/13712 [00:00<00:00, 3984501.62line/s]
Updating val.txt: 100%|██████████| 3429/3429 [00:00<00:00, 3109679.66line/s]

✅ train.txt and val.txt updated!





In [3]:
## test 

import os

def update_class_id(label_dir, old_id=0, new_id=80):
    for filename in os.listdir(label_dir):
        if filename.endswith(".txt"):
            path = os.path.join(label_dir, filename)
            with open(path, "r") as file:
                lines = file.readlines()

            updated_lines = []
            for line in lines:
                parts = line.strip().split()
                if parts:  # skip empty lines
                    parts[0] = str(new_id)
                    updated_lines.append(" ".join(parts))

            with open(path, "w") as file:
                file.write("\n".join(updated_lines))

# Run for train and val label folders
update_class_id("pothole/labels/train")
update_class_id("pothole/labels/val")




In [4]:
import os

def remove_npy_files(folder_path):
    for filename in os.listdir(folder_path):
        if filename.endswith(".npy"):
            file_path = os.path.join(folder_path, filename)
            os.remove(file_path)
            print(f"Deleted: {file_path}")

# Remove .npy files from train and val
remove_npy_files("pothole/images/train")
remove_npy_files("pothole/images/val")


Deleted: pothole/images/train\potholes0.npy
Deleted: pothole/images/train\potholes10.npy
Deleted: pothole/images/train\potholes100.npy
Deleted: pothole/images/train\potholes103.npy
Deleted: pothole/images/train\potholes105.npy
Deleted: pothole/images/train\potholes108.npy
Deleted: pothole/images/train\potholes109.npy
Deleted: pothole/images/train\potholes11.npy
Deleted: pothole/images/train\potholes110.npy
Deleted: pothole/images/train\potholes111.npy
Deleted: pothole/images/train\potholes112.npy
Deleted: pothole/images/train\potholes113.npy
Deleted: pothole/images/train\potholes114.npy
Deleted: pothole/images/train\potholes116.npy
Deleted: pothole/images/train\potholes117.npy
Deleted: pothole/images/train\potholes118.npy
Deleted: pothole/images/train\potholes119.npy
Deleted: pothole/images/train\potholes12.npy
Deleted: pothole/images/train\potholes120.npy
Deleted: pothole/images/train\potholes122.npy
Deleted: pothole/images/train\potholes123.npy
Deleted: pothole/images/train\potholes1

In [5]:
import os
import shutil

def move_pothole_to_coco(pothole_dir, coco_dir):
    # Define paths
    pothole_train_img = os.path.join(pothole_dir, "images/train")
    pothole_val_img = os.path.join(pothole_dir, "images/val")
    pothole_train_lbl = os.path.join(pothole_dir, "labels/train")
    pothole_val_lbl = os.path.join(pothole_dir, "labels/val")

    coco_train_img = os.path.join(coco_dir, "images/train2017")
    coco_val_img = os.path.join(coco_dir, "images/val2017")
    coco_train_lbl = os.path.join(coco_dir, "labels/train2017")
    coco_val_lbl = os.path.join(coco_dir, "labels/val2017")

    # Move images
    for src_folder, dest_folder in [
        (pothole_train_img, coco_train_img),
        (pothole_val_img, coco_val_img),
    ]:
        for file in os.listdir(src_folder):
            src_path = os.path.join(src_folder, file)
            dest_path = os.path.join(dest_folder, file)
            shutil.move(src_path, dest_path)
            print(f"Moved image: {file} → {dest_folder}")

    # Move labels
    for src_folder, dest_folder in [
        (pothole_train_lbl, coco_train_lbl),
        (pothole_val_lbl, coco_val_lbl),
    ]:
        for file in os.listdir(src_folder):
            src_path = os.path.join(src_folder, file)
            dest_path = os.path.join(dest_folder, file)
            shutil.move(src_path, dest_path)
            print(f"Moved label: {file} → {dest_folder}")

# Example usage
move_pothole_to_coco("pothole", "cocosubset")


Moved image: potholes0.png → cocosubset\images/train2017
Moved image: potholes10.png → cocosubset\images/train2017
Moved image: potholes100.png → cocosubset\images/train2017
Moved image: potholes103.png → cocosubset\images/train2017
Moved image: potholes105.png → cocosubset\images/train2017
Moved image: potholes108.png → cocosubset\images/train2017
Moved image: potholes109.png → cocosubset\images/train2017
Moved image: potholes11.png → cocosubset\images/train2017
Moved image: potholes110.png → cocosubset\images/train2017
Moved image: potholes111.png → cocosubset\images/train2017
Moved image: potholes112.png → cocosubset\images/train2017
Moved image: potholes113.png → cocosubset\images/train2017
Moved image: potholes114.png → cocosubset\images/train2017
Moved image: potholes116.png → cocosubset\images/train2017
Moved image: potholes117.png → cocosubset\images/train2017
Moved image: potholes118.png → cocosubset\images/train2017
Moved image: potholes119.png → cocosubset\images/train2017
M