# Import Libraries & Configuration

#### This section imports all required Python libraries and sets the main configuration for the dataset processing workflow.

'cv2' → for image loading, resizing, and saving

'numpy' → for creating numerical arrays for ML

'os' → for directory and file handling

'tqdm' → for progress bars

'DATASET_PATH' → path to your original dataset

'OUTPUT_PATH' → path where processed images will be saved

'IMG_SIZE' → the target image size for resizing

'ALLOWED' → allowed image extensions

In [1]:
!pip install tqdm
import os
import cv2
import numpy as np
from tqdm import tqdm



In [2]:
import kagglehub

# Download the Kaggle dataset identified by the handle below and keep the
# value returned by kagglehub in `path`, then show it.
dataset_handle = "c7934597/cocotext-v20"
path = kagglehub.dataset_download(dataset_handle)
print("Path to dataset files:", path)

Path to dataset files: C:\Users\Parth\.cache\kagglehub\datasets\c7934597\cocotext-v20\versions\4


# Fetch All Image Paths

#### This section defines a helper function that recursively scans the dataset directory and collects all image file paths.

##### It ensures that:

. only valid image extensions are included

. the dataset structure is preserved

. you get a complete list of all image files

Printing the total number of images afterwards helps confirm that the dataset was loaded correctly.

In [3]:
# Workflow configuration.
# NOTE: the "Load, Resize, Save" cell further down assigns IMG_SIZE and
# ALLOWED again, so the values below only apply until that cell runs.
OUTPUT_PATH = "processed_dataset/"  # folder that receives the resized copies
DATASET_PATH = "cocotext-v20/versions/4/data"  # folder holding the raw images
ALLOWED = {
    ".bmp",
    ".jpeg",
    ".jpg",
    ".png",
}  # lower-case extensions accepted when collecting image paths
IMG_SIZE = (128, 128)  # two-element target size for resizing

In [4]:
# List all images
def get_all_image_paths(root, allowed=None):
    """Recursively collect the paths of all image files under ``root``.

    Args:
        root: Directory to traverse with ``os.walk``.
        allowed: Optional iterable of file extensions, each including the
            leading dot (for example ``{".jpg", ".png"}``). Matching is
            case-insensitive. When omitted, the module-level ``ALLOWED``
            setting is used; it is looked up at call time.

    Returns:
        A sorted list of paths, each built by joining the directory reported
        by ``os.walk`` with the file name. The list is empty when ``root``
        does not exist or holds no matching file.
    """
    if allowed is None:
        allowed = ALLOWED
    # Lower-case the configured extensions once so that both sides of the
    # comparison are normalised the same way.
    extensions = {ext.lower() for ext in allowed}

    image_paths = []
    for root_dir, _, files in os.walk(root):
        image_paths.extend(
            os.path.join(root_dir, name)
            for name in files
            if os.path.splitext(name)[1].lower() in extensions
        )
    # os.walk reports entries in an arbitrary, filesystem-dependent order;
    # sorting makes the result reproducible across runs and machines.
    return sorted(image_paths)

# Scan the dataset folder once and report how many image files were found.
image_paths = get_all_image_paths(DATASET_PATH)
total_found = len(image_paths)
print("Total Images Found:", total_found)

Total Images Found: 17141


# Inspect Images (Size, Shape, Corrupt Files)

#### This section reads each image and extracts:

    . Image shape, recorded as (width, height, channels)

    . Unique image sizes in the dataset

    . Any corrupt or unreadable images

#### This helps understand the dataset quality and identify inconsistencies such as:

    . mixed resolutions

    . corrupted files

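    . grayscale vs RGB images (note: `cv2.imread` with its default flag always returns a 3-channel array, so pass `cv2.IMREAD_UNCHANGED` if this distinction matters)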

It prepares you before performing resizing or training.

In [5]:
# Inspect Image Sizes / Shapes
# Collects one (width, height, channels) tuple per readable image and the
# paths of the files OpenCV could not decode.
sizes = []
corrupt = []

# The loop variable is deliberately not named `path`: that name already holds
# the dataset directory returned by kagglehub.dataset_download, and reusing it
# here would overwrite it with the last image path.
for image_path in tqdm(image_paths, desc="Inspecting images"):
    # With its default flag cv2.imread decodes every file to a 3-channel BGR
    # array, so the channel count recorded below is 3 even for grayscale files.
    img = cv2.imread(image_path)

    # cv2.imread reports an unreadable file by returning None, not by raising.
    if img is None:
        corrupt.append(image_path)
        continue

    # NumPy shape order is (rows, cols, channels) == (height, width, channels).
    h, w, c = img.shape
    sizes.append((w, h, c))

print("\nUnique image shapes:", set(sizes))
print("Corrupt/Unreadable images:", len(corrupt))
if corrupt:
    preview = corrupt[:10]
    # Only show the ellipsis when the preview really is a truncated list.
    if len(corrupt) > len(preview):
        print("Corrupt file paths:", preview, "...")
    else:
        print("Corrupt file paths:", preview)


Inspecting images: 100%|████████████████████████████████████████████████████████| 17141/17141 [01:27<00:00, 195.63it/s]


Unique image shapes: {(640, 308, 3), (400, 542, 3), (480, 640, 3), (415, 500, 3), (378, 500, 3), (500, 337, 3), (500, 348, 3), (500, 493, 3), (640, 638, 3), (544, 640, 3), (640, 616, 3), (500, 231, 3), (640, 376, 3), (640, 387, 3), (640, 365, 3), (640, 521, 3), (429, 640, 3), (580, 329, 3), (447, 500, 3), (299, 500, 3), (613, 640, 3), (597, 640, 3), (465, 640, 3), (500, 299, 3), (229, 123, 3), (480, 329, 3), (640, 444, 3), (640, 600, 3), (640, 589, 3), (640, 578, 3), (437, 640, 3), (640, 349, 3), (640, 338, 3), (640, 327, 3), (640, 483, 3), (600, 386, 3), (349, 640, 3), (318, 480, 3), (360, 302, 3), (640, 406, 3), (412, 640, 3), (640, 562, 3), (640, 540, 3), (640, 551, 3), (220, 293, 3), (640, 300, 3), (640, 289, 3), (430, 500, 3), (370, 277, 3), (500, 397, 3), (414, 500, 3), (282, 500, 3), (596, 640, 3), (384, 512, 3), (324, 640, 3), (448, 640, 3), (296, 640, 3), (610, 635, 3), (420, 640, 3), (640, 619, 3), (640, 368, 3), (640, 357, 3), (640, 513, 3), (360, 640, 3), (640, 502, 3), (5




# Load, Resize, Save, and Prepare Label Mapping

#### In this section:

    . The dataset folder is scanned for `.jpg` images that have a matching `.txt` annotation file

    . Output directories are created in processed_dataset/

    . Each image is read and resized to the configured shape

    . Resized images are saved permanently

    . Images are normalized (0–1 range)

    . The first line of each image's `.txt` file is stored as that image's label

This step prepares the cleaned and standardized dataset for neural network training.

In [6]:
# Load, Resize, Save & Prepare Arrays
# For every image in DATASET_PATH that has a same-named .txt file beside it:
# resize it, write the resized copy to OUTPUT_PATH, and collect the normalized
# pixels in X and the first line of the .txt file in y.
X = []
y = []

# NOTE: these assignments replace the values configured earlier in the file
# (IMG_SIZE = (128, 128) and the four-extension ALLOWED set).
IMG_SIZE = (224, 224)
ALLOWED = [".jpg"]

os.makedirs(OUTPUT_PATH, exist_ok=True)

# Select files by ALLOWED, comparing the lower-cased extension the same way
# get_all_image_paths does, instead of a separate hard-coded ".jpg" check.
files = sorted(
    f for f in os.listdir(DATASET_PATH)
    if os.path.splitext(f)[1].lower() in ALLOWED
)

print("Total Images Found:", len(files))

# Output paths for which cv2.imwrite reported a failure.
failed_writes = []

for img_file in files:
    img_path = os.path.join(DATASET_PATH, img_file)
    # Swap only the extension. str.replace(".jpg", ".txt") would also rewrite
    # a ".jpg" that occurs earlier in the path (e.g. in a directory name).
    txt_path = os.path.splitext(img_path)[0] + ".txt"

    # Images without an annotation file are skipped.
    if not os.path.exists(txt_path):
        continue

    # cv2.imread returns None for files it cannot decode; skip those too.
    img = cv2.imread(img_path)
    if img is None:
        continue

    # Read text label: only the first line is kept, any further lines in the
    # file are ignored.
    with open(txt_path, "r", encoding="utf-8") as label_file:
        first_line = label_file.readline()

    # readline() returns "" only at end of file, i.e. the label file is empty.
    # Checking this before resizing avoids work on samples that get dropped.
    if not first_line:
        continue

    label = first_line.strip()

    # Resize
    img_resized = cv2.resize(img, IMG_SIZE)

    # Normalize 8-bit pixel values into the 0-1 range as float32
    img_norm = img_resized.astype("float32") / 255.0

    # Add to arrays
    X.append(img_norm)
    y.append(label)

    # Save resized image (the un-normalized version) under the same file name
    output_img_path = os.path.join(OUTPUT_PATH, img_file)
    if not cv2.imwrite(output_img_path, img_resized):
        failed_writes.append(output_img_path)

print("\nLoaded Samples:", len(X))
# Guard the example: y is empty when no image/label pair was loaded.
if y:
    print("Example Label:", y[0])
else:
    print("Example Label: <none - no samples were loaded>")
if failed_writes:
    print("Images that could not be saved:", len(failed_writes))

Total Images Found: 17141

Loaded Samples: 17141
Example Label: 0 0.7859375 0.5670588235294117 0.046875 0.023529411764705882


# Convert Dataset to NumPy Arrays

#### This section converts:

    . X → list of processed images

    . y → corresponding labels (the first line of each image's `.txt` file)

into **NumPy arrays**.

#### This is required for:

    . training neural networks

    . feeding into TensorFlow / PyTorch models

    . performing data analysis

#### Finally, it prints:

    . final dataset shape

    . example image shape

    . save location of processed images

In [7]:
# Convert to NumPy Arrays
# Replace the accumulated Python lists with ndarrays built from them.
X = np.array(X)
y = np.array(y)

# Report the resulting shapes, using the first sample as the per-image example.
print("X Shape:", X.shape)
print("y Shape:", y.shape)
first_sample = X[0]
print("Example Image Shape:", first_sample.shape)

# Remind the reader where the resized image files were written.
print("\nProcessed dataset saved at:", OUTPUT_PATH)

X Shape: (17141, 224, 224, 3)
y Shape: (17141,)
Example Image Shape: (224, 224, 3)

Processed dataset saved at: processed_dataset/


In [8]:
# Sanity check: prints True when DATASET_PATH exists on disk, False otherwise.
dataset_dir_exists = os.path.exists(DATASET_PATH)
print(dataset_dir_exists)

True
