In [150]:
#!/usr/bin/env python3
import os
import cv2
import numpy as np
from multiprocessing import Pool, cpu_count
from tqdm import tqdm

# Limit OpenCV to a single internal thread. The images are processed by a
# multiprocessing Pool further down, so OpenCV threading inside every process
# on top of that would oversubscribe the CPUs.
# NOTE(review): this call runs in the parent process; whether the setting is
# carried into the Pool workers depends on the multiprocessing start method --
# confirm, or repeat the call in a Pool initializer.
cv2.setNumThreads(1)

In [158]:
# Cohort to process. Both paths below are derived from this single name so the
# input and output directories cannot drift apart when switching cohorts.
COHORT = "cohort_1"

# Root of the raw images for the cohort; it is walked recursively.
SRC_PATH = f"/projectnb/ec500kb/projects/Fall_2025_Projects/Project_3_data/EMBED/{COHORT}/"
# Root under which processed images are written, using the same relative
# layout as the source tree.
OUT_PATH = f"/projectnb/ec500kb/projects/Fall_2025_Projects/Project_3/dataset/{COHORT}_process"

In [152]:
# -------------------------------------------------------------------
# Per-process "target" image, populated by _init_worker
# -------------------------------------------------------------------
_target_img = None


def _init_worker(target_path):
    """Load the grayscale target image into this process's module global.

    Written as a worker initializer: it stores the image in ``_target_img``
    so it is read once per process rather than once per job.

    NOTE(review): nothing visible in this notebook passes this function to a
    Pool or reads ``_target_img`` -- confirm whether it is still needed.

    Args:
        target_path: Path of the image to load.

    Raises:
        RuntimeError: If ``cv2.imread`` returns ``None`` for ``target_path``.
    """
    global _target_img
    _target_img = cv2.imread(target_path, cv2.IMREAD_GRAYSCALE)
    if _target_img is not None:
        return
    raise RuntimeError(f"Worker could not load target image: {target_path}")


def _process_one(args):
    """Clean one image and write the result to its destination.

    Processing errors are reported in the return value instead of being
    raised, so a single bad file does not abort a whole pool run.

    Args:
        args: ``(src_path, out_path)`` tuple. It is a single argument so the
            function can be mapped directly over a list of jobs.

    Returns:
        ``(src_path, ok, message)``. ``ok`` is True with an empty message on
        success; otherwise ``ok`` is False and ``message`` gives the reason.
    """
    src_path, out_path = args
    try:
        cleaned = remove_line_to_background(src_path)
        if cleaned is None:
            return (src_path, False, "read failed")

        # A bare filename has an empty dirname and os.makedirs("") raises,
        # so only create the directory when there is one to create.
        out_dir = os.path.dirname(out_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

        if not cv2.imwrite(out_path, cleaned):
            return (src_path, False, "cv2.imwrite failed")

        return (src_path, True, "")
    except Exception as e:
        # Include the exception type: str(e) alone can be empty or ambiguous
        # (e.g. a KeyError renders as just the key).
        return (src_path, False, f"{type(e).__name__}: {e}")

In [156]:
def collect_jobs(src_root, out_root, exts={".png"}):
    """Build ``(source, destination)`` path pairs for every matching file.

    Walks ``src_root`` recursively and, for each file whose extension is in
    ``exts``, pairs its absolute path with the path at the same relative
    location under ``out_root``. Nothing is created on disk.

    Args:
        src_root: Directory to scan.
        out_root: Directory under which the relative layout is reproduced.
        exts: Extensions to accept, including the leading dot. Matching is
            case-insensitive on both sides. A single string is treated as
            one extension.

    Returns:
        List of ``(src_path, out_path)`` tuples with absolute paths, in a
        deterministic (sorted) traversal order.
    """
    # A bare string would otherwise be used for substring matching, which
    # accepts files with no extension at all.
    if isinstance(exts, str):
        exts = (exts,)
    # File extensions are lowercased below, so lowercase the wanted set too;
    # otherwise exts={".PNG"} would silently match nothing.
    wanted = {ext.lower() for ext in exts}

    src_root = os.path.abspath(src_root)
    out_root = os.path.abspath(out_root)

    jobs = []
    for dirpath, dirnames, filenames in os.walk(src_root):
        # Sorting in place fixes the order in which os.walk descends, so the
        # job list does not depend on filesystem enumeration order.
        dirnames.sort()
        for fname in sorted(filenames):
            if os.path.splitext(fname)[1].lower() not in wanted:
                continue

            src_path = os.path.join(dirpath, fname)
            rel_path = os.path.relpath(src_path, src_root)
            jobs.append((src_path, os.path.join(out_root, rel_path)))

    return jobs

In [154]:
def remove_line_to_background(image_path, threshold=210, kernel_width=25,
                              dilate_size=5, dilate_iterations=2):
    """Black out bright horizontal line artifacts in a grayscale image.

    Bright pixels are thresholded, reduced to horizontal runs with a
    morphological opening, thickened by dilation, and the resulting mask is
    set to 0 in a copy of the image.

    NOTE(review): the image is read with ``IMREAD_GRAYSCALE`` only; if the
    source files are deeper than 8 bits this reduces them to 8-bit -- confirm
    that is intended.

    Args:
        image_path: Path of the image to read.
        threshold: Pixels strictly brighter than this are line candidates.
        kernel_width: Width in pixels of the 1-pixel-tall opening kernel;
            bright runs narrower than this are not treated as lines.
        dilate_size: Side length of the square dilation kernel.
        dilate_iterations: Number of dilation passes.

    Returns:
        The cleaned image as an array, or ``None`` if the image could not be
        read (a message naming the path is printed in that case).
    """
    img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if img is None:
        print(f"Error: Could not load image: {image_path}")
        return None

    # Candidate mask: 255 where the pixel is brighter than `threshold`.
    _, mask = cv2.threshold(img, threshold, 255, cv2.THRESH_BINARY)

    # Opening with a wide, 1-pixel-tall rectangle keeps only bright regions
    # that contain a horizontal run at least `kernel_width` pixels long.
    horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (kernel_width, 1))
    detected_lines = cv2.morphologyEx(mask, cv2.MORPH_OPEN, horizontal_kernel)

    # Thicken the detected lines so pixels bordering them are removed too.
    dilate_kernel = np.ones((dilate_size, dilate_size), np.uint8)
    final_mask = cv2.dilate(detected_lines, dilate_kernel, iterations=dilate_iterations)

    # Paint the masked pixels black (background) instead of inpainting.
    result = img.copy()
    result[final_mask > 0] = 0

    return result
    

In [159]:
def main(src_root=None, out_root=None, n_workers=12):
    """Process every image under the source tree in parallel.

    Collects the jobs, runs ``_process_one`` over them in a process pool
    with a progress bar, prints a warning for each failure and a final
    success/failure summary.

    Args:
        src_root: Directory to scan. Defaults to the module-level
            ``SRC_PATH``.
        out_root: Directory to write results under. Defaults to the
            module-level ``OUT_PATH``.
        n_workers: Upper bound on worker processes. The value actually used
            is also capped by the reported CPU count and by the number of
            jobs, and is never below 1.
    """
    # Resolve the defaults at call time so that editing the path cell takes
    # effect without having to re-run this definition.
    if src_root is None:
        src_root = SRC_PATH
    if out_root is None:
        out_root = OUT_PATH

    print(f"[info] Scanning source tree: {src_root}")
    jobs = collect_jobs(src_root, out_root)
    print(f"[info] Found {len(jobs)} images to process")

    if not jobs:
        return

    # Do not start more processes than there are CPUs or jobs.
    try:
        available = cpu_count()
    except NotImplementedError:
        available = n_workers
    n_workers = max(1, min(n_workers, available, len(jobs)))
    print(f"[info] Using {n_workers} workers")

    successes = 0
    failures = 0

    with Pool(processes=n_workers) as pool:
        # Results arrive in completion order; each one carries its own
        # source path, so ordering does not matter for the report.
        results = pool.imap_unordered(_process_one, jobs)
        for src_path, ok, msg in tqdm(results, total=len(jobs), desc="Processing"):
            if ok:
                successes += 1
            else:
                failures += 1
                print(f"[warn] {src_path}: {msg}")

    print(f"[done] Success: {successes}, Failures: {failures}")
    print(f"[out] Processed images saved under: {out_root}")


# Start the full run when this cell or script is executed directly; it does
# not run when the code is imported as a module.
if __name__ == "__main__":
    main()

[info] Scanning source tree: /projectnb/ec500kb/projects/Fall_2025_Projects/Project_3_data/EMBED/cohort_2/
[info] Found 165340 images to process
[info] Using 12 workers


Processing: 100%|██████████| 165340/165340 [10:42<00:00, 257.31it/s]


[done] Success: 165340, Failures: 0
[out] Processed images saved under: /projectnb/ec500kb/projects/Fall_2025_Projects/Project_3/dataset/cohort_2_process
