# Image Splitting by Lines
Notebook for splitting every image in a directory into separate images, one per text line.

In [1]:
import cv2
import numpy as np
import matplotlib.pyplot as plt

In [2]:
def thresholding(image, threshold_value=80, max_value=255):
    """Binarize an image with an inverted threshold (dark pixels become foreground).

    Parameters
    ----------
    image : numpy.ndarray
        Either a colour image in OpenCV's BGR channel order or an already
        single-channel (2-D) grayscale image. Callers holding RGB data should
        convert it to BGR first; otherwise the red and blue luminance weights
        are swapped during the grayscale conversion.
    threshold_value : int, optional
        Gray level cut-off. Pixels with a gray value above this become 0,
        all others become ``max_value``. Defaults to 80.
    max_value : int, optional
        Value assigned to pixels at or below the cut-off. Defaults to 255.

    Returns
    -------
    numpy.ndarray
        Single-channel binary image with the same height and width as ``image``.

    Raises
    ------
    ValueError
        If ``image`` is None (for example when ``cv2.imread`` failed to load a file).
    """
    if image is None:
        raise ValueError("thresholding() received None instead of an image")

    # A 2-D array is already grayscale; cvtColor would reject it.
    if image.ndim == 2:
        img_gray = image
    else:
        img_gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # The first return value is the threshold actually used; it is not needed here.
    _, thresh = cv2.threshold(img_gray, threshold_value, max_value, cv2.THRESH_BINARY_INV)
    return thresh

## List files



In [16]:
import os

# Directory that holds the page images to be split into lines.
directory_path = '../synthetic_data_png/V04'

# File extensions (compared case-insensitively) that are treated as images.
IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg')

# os.listdir returns entries in arbitrary order, so sort them to make the
# processing order (and therefore the logged output) reproducible across runs.
all_entries = sorted(os.listdir(directory_path))

# Keep only regular files whose name ends with one of the image extensions.
image_files = [
    os.path.join(directory_path, entry)
    for entry in all_entries
    if entry.lower().endswith(IMAGE_EXTENSIONS)
    and os.path.isfile(os.path.join(directory_path, entry))
]

print(image_files)

['../synthetic_data_png/V04\\data_11.png', '../synthetic_data_png/V04\\data_12.png', '../synthetic_data_png/V04\\data_13.png', '../synthetic_data_png/V04\\data_14.png', '../synthetic_data_png/V04\\data_15.png', '../synthetic_data_png/V04\\data_16.png', '../synthetic_data_png/V04\\data_17.png', '../synthetic_data_png/V04\\data_18.png', '../synthetic_data_png/V04\\data_19.png', '../synthetic_data_png/V04\\data_20.png', '../synthetic_data_png/V04\\data_21.png', '../synthetic_data_png/V04\\data_22.png', '../synthetic_data_png/V04\\data_23.png', '../synthetic_data_png/V04\\data_24.png', '../synthetic_data_png/V04\\data_25.png', '../synthetic_data_png/V04\\data_26.png', '../synthetic_data_png/V04\\data_27.png', '../synthetic_data_png/V04\\data_28.png', '../synthetic_data_png/V04\\data_29.png', '../synthetic_data_png/V04\\data_30.png']


## Iterate and process

Loops through each image file, applies the preprocessing and line-segmentation steps, then saves each detected line's bounding-box crop as a separate image.

In [17]:
import shutil
import os
import cv2
import numpy as np

# ---- Tunable parameters ----------------------------------------------------
# Root folder; one sub-folder per input image is created inside it.
base_output_dir = 'V04_segmented_lines_per_image'
# Images wider than this are downscaled to this width (aspect ratio preserved).
MAX_WIDTH = 1000
# (rows, cols) of the dilation kernel: wide and short so that characters on the
# same text line merge into one blob while separate lines stay apart.
LINE_KERNEL_SIZE = (3, 85)
# Detected regions shorter than this many pixels are treated as noise.
MIN_LINE_HEIGHT = 25

os.makedirs(base_output_dir, exist_ok=True)

# The kernel does not depend on the image, so build it once outside the loop.
kernel_line = np.ones(LINE_KERNEL_SIZE, np.uint8)

for image_file in image_files:
    print(f"Processing image: {image_file}")

    # Extract the original filename without extension
    original_filename = os.path.splitext(os.path.basename(image_file))[0]

    # cv2.imread returns None (instead of raising) when a file cannot be decoded.
    # The image is kept in OpenCV's native BGR order: thresholding() converts
    # with COLOR_BGR2GRAY and cv2.imwrite expects BGR, so no RGB round trip is needed.
    img = cv2.imread(image_file)
    if img is None:
        print(f"Could not read image, skipping: {image_file}")
        continue

    # Create a subdirectory for the current image.
    # NOTE: files from a previous run are not removed, so a run that detects
    # fewer lines leaves older, higher-numbered line_N.png files in place.
    image_output_dir = os.path.join(base_output_dir, original_filename)
    os.makedirs(image_output_dir, exist_ok=True)

    # Resize the image if it is wider than MAX_WIDTH, preserving aspect ratio
    img_h, img_w = img.shape[:2]
    if img_w > MAX_WIDTH:
        aspect_ratio = img_w / img_h
        # Guard against a zero height for extremely wide, flat images.
        new_h = max(1, int(MAX_WIDTH / aspect_ratio))
        img = cv2.resize(img, (MAX_WIDTH, new_h), interpolation=cv2.INTER_AREA)

    # Binarize (ink -> white), then smear horizontally to join each text line
    thresh_img = thresholding(img)
    dilated_line = cv2.dilate(thresh_img, kernel_line, iterations=1)

    # findContours returns (contours, hierarchy) on OpenCV 2.x/4.x but
    # (image, contours, hierarchy) on 3.x; index -2 is the contour list in both.
    contours_line = cv2.findContours(
        dilated_line.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE
    )[-2]

    # Compute each bounding box once and order them top-to-bottom by y.
    line_boxes = sorted(
        (cv2.boundingRect(ctr) for ctr in contours_line),
        key=lambda box: box[1],
    )

    # Save the segmented lines
    skipped_count = 0
    saved_count = 0
    for x, y, box_w, box_h in line_boxes:
        # Garbage filter: regions shorter than MIN_LINE_HEIGHT are not saved
        if box_h < MIN_LINE_HEIGHT:
            skipped_count += 1
            continue

        # Crop the line from the (possibly resized) page image
        cropped_line = img[y:y + box_h, x:x + box_w]

        # Files are numbered by saved order so the sequence has no gaps
        output_filename = os.path.join(image_output_dir, f'line_{saved_count}.png')

        # cv2.imwrite signals failure through its return value, not an exception.
        if not cv2.imwrite(output_filename, cropped_line):
            raise OSError(f"Failed to write segmented line to '{output_filename}'")
        saved_count += 1

    print(f"Saved {saved_count} segmented lines for {original_filename} to '{image_output_dir}' directory.")
    print(f"Skipped {skipped_count} lines with height less than {MIN_LINE_HEIGHT} pixels for {original_filename}.")


print("Finished processing all images.")

Processing image: ../synthetic_data_png/V04\data_11.png
Saved 5 segmented lines for data_11 to 'V04_segmented_lines_per_image\data_11' directory.
Skipped 2 lines with height less than 25 pixels for data_11.
Processing image: ../synthetic_data_png/V04\data_12.png
Saved 6 segmented lines for data_12 to 'V04_segmented_lines_per_image\data_12' directory.
Skipped 3 lines with height less than 25 pixels for data_12.
Processing image: ../synthetic_data_png/V04\data_13.png
Saved 5 segmented lines for data_13 to 'V04_segmented_lines_per_image\data_13' directory.
Skipped 2 lines with height less than 25 pixels for data_13.
Processing image: ../synthetic_data_png/V04\data_14.png
Saved 5 segmented lines for data_14 to 'V04_segmented_lines_per_image\data_14' directory.
Skipped 4 lines with height less than 25 pixels for data_14.
Processing image: ../synthetic_data_png/V04\data_15.png
Saved 5 segmented lines for data_15 to 'V04_segmented_lines_per_image\data_15' directory.
Skipped 3 lines with heigh

In [5]:
import os
import shutil

# Zip the folder the segmentation cell actually wrote to. Reusing
# base_output_dir keeps the two cells in sync when the dataset changes.
directory_to_zip = base_output_dir

if not os.path.isdir(directory_to_zip):
    raise FileNotFoundError(
        f"Nothing to zip: '{directory_to_zip}' does not exist. Run the segmentation cell first."
    )

# make_archive appends '.zip' to the base name and returns the archive path,
# so use that return value instead of rebuilding the filename by hand.
zip_filename = shutil.make_archive(directory_to_zip, 'zip', directory_to_zip)
print(f"Created archive: {zip_filename}")

# The download helper only exists on Google Colab; when running locally the
# archive is simply left on disk.
try:
    from google.colab import files
except ImportError:
    print(f"Not running in Google Colab; archive left at {zip_filename}")
else:
    files.download(zip_filename)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>