In [1]:
import cv2
import numpy as np
import os
from sklearn.cluster import DBSCAN
from scipy.signal import find_peaks

In [2]:
def autocrop_dark_edges(img, threshold_ratio=0.7):
    """
    Automatically removes dark borders from module images.

    A row (or column) is treated as part of the dark margin when its mean
    brightness is not above ``threshold_ratio`` times the mean brightness of
    the whole image. The crop spans from the first to the last row/column
    that exceeds that threshold.

    Parameters
    ----------
    img : numpy.ndarray
        Either a 2-D grayscale array, an (H, W, 1) array, or a multi-channel
        image that ``cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)`` accepts.
    threshold_ratio : float, optional
        Fraction of mean brightness used to detect dark margin.

    Returns
    -------
    numpy.ndarray
        ``img`` sliced to the bright region (basic slicing, so channels are
        preserved). An empty input is returned unchanged. If no row or no
        column exceeds the threshold (e.g. an all-black image), the full
        extent along that axis is kept.
    """
    if img.size == 0:
        # Nothing to measure; avoid taking the mean of an empty array.
        return img

    # Single-channel inputs are already brightness maps; only multi-channel
    # images need the BGR -> gray conversion.
    if img.ndim == 2:
        gray = img
    elif img.shape[2] == 1:
        gray = img[:, :, 0]
    else:
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # A row/column is considered border if it is significantly darker
    th = gray.mean() * threshold_ratio

    bright_rows = gray.mean(axis=1) > th
    bright_cols = gray.mean(axis=0) > th

    # np.argmax on a boolean array gives the index of the first True, and 0
    # when there is none, so an all-dark axis falls back to its full extent.
    y1 = int(np.argmax(bright_rows))
    y2 = len(bright_rows) - int(np.argmax(bright_rows[::-1]))

    x1 = int(np.argmax(bright_cols))
    x2 = len(bright_cols) - int(np.argmax(bright_cols[::-1]))

    return img[y1:y2, x1:x2]

In [15]:
def _merge_close_positions(peaks, eps):
    """Cluster 1-D line positions with DBSCAN and return the sorted integer cluster means."""
    if len(peaks) == 0:
        # DBSCAN.fit rejects an input with zero samples, which is what we
        # get when no grid line was detected along this axis.
        return []

    pts = np.asarray(peaks).reshape(-1, 1)
    labels = DBSCAN(eps=eps, min_samples=1).fit(pts).labels_
    return sorted(int(pts[labels == label].mean()) for label in set(labels))


def process_module(img_path, save_root="extracted_cells"):
    """
    Split one module image into grid cells and save each cell as a PNG.

    Steps:
      1. Read the image and crop dark borders with ``autocrop_dark_edges``.
      2. Blur + adaptive threshold, then find near-vertical line segments via
         Canny + probabilistic Hough, and horizontal structures via a
         morphological opening with a wide kernel.
      3. Project both masks onto their axis and pick peaks as grid lines.
      4. Merge nearby peaks with DBSCAN, add the image borders, and write
         every cell between consecutive boundaries to
         ``<save_root>/<image base name>/cell_r{row}_c{col}.png`` (1-based).

    Parameters
    ----------
    img_path : str
        Path of the module image to process.
    save_root : str, optional
        Root directory under which a per-image folder is created.

    Returns
    -------
    None
        Progress, unreadable inputs and failed writes are reported via
        ``print``.
    """
    print(f"\nProcessing: {img_path}")

    orig = cv2.imread(img_path)
    if orig is None:
        print("Could not read image:", img_path)
        return

    orig = autocrop_dark_edges(orig)
    gray = cv2.cvtColor(orig, cv2.COLOR_BGR2GRAY)

    # Smooth slightly to reduce noise
    blur = cv2.GaussianBlur(gray, (5, 5), 0)

    # Inverted adaptive threshold: pixels darker than their local mean
    # become foreground (255).
    th = cv2.adaptiveThreshold(
        blur, 255, cv2.ADAPTIVE_THRESH_MEAN_C,
        cv2.THRESH_BINARY_INV, 21, 8
    )

    edges = cv2.Canny(th, 50, 150, apertureSize=3)

    # Hough transform for line detection
    lines = cv2.HoughLinesP(
        edges,
        rho=1,
        theta=np.pi/180,
        threshold=120,
        minLineLength=200,   # adjust based on module height
        maxLineGap=20
    )

    # Mask holding only the near-vertical Hough segments.
    vertical = np.zeros_like(th)
    if lines is not None:
        for x1, y1, x2, y2 in lines[:, 0]:
            # Keep only near-vertical lines: end points less than 5 px apart in x.
            if abs(x1 - x2) < 5:
                cv2.line(vertical, (int(x1), int(y1)), (int(x2), int(y2)), 255, 3)

    # Opening with a 50x3 rectangle keeps only long horizontal runs.
    kernel_h = cv2.getStructuringElement(cv2.MORPH_RECT, (50, 3))
    horizontal = cv2.morphologyEx(th, cv2.MORPH_OPEN, kernel_h)

    v_proj = vertical.sum(axis=0)
    h_proj = horizontal.sum(axis=1)

    # Find peaks = real grid lines
    v_peaks, _ = find_peaks(v_proj, distance=40, prominence=0.30 * v_proj.max())
    h_peaks, _ = find_peaks(h_proj, distance=80, prominence=0.4 * h_proj.max())

    # Collapse peaks that belong to the same physical grid line.
    merged_v = _merge_close_positions(v_peaks, eps=50)
    merged_h = _merge_close_positions(h_peaks, eps=12)

    # Add the image borders as outer boundaries. Slices are end-exclusive, so
    # the far boundary is the full width/height (not size - 1) to keep the
    # last pixel row/column. The set drops any duplicate boundary that would
    # otherwise yield an empty cell.
    height, width = orig.shape[:2]
    col_bounds = sorted({0, *merged_v, width})
    row_bounds = sorted({0, *merged_h, height})

    base_name = os.path.splitext(os.path.basename(img_path))[0]
    save_dir = os.path.join(save_root, base_name)
    os.makedirs(save_dir, exist_ok=True)

    cell_count = 0
    for r, (y1, y2) in enumerate(zip(row_bounds, row_bounds[1:]), start=1):
        for c, (x1, x2) in enumerate(zip(col_bounds, col_bounds[1:]), start=1):
            filename = os.path.join(save_dir, f"cell_r{r}_c{c}.png")
            # cv2.imwrite signals failure through its return value, so only
            # count cells that were actually written.
            if cv2.imwrite(filename, orig[y1:y2, x1:x2]):
                cell_count += 1
            else:
                print("Could not write cell:", filename)

    print(f"✔ Saved {cell_count} cells → {save_dir}")

In [17]:
import glob

# Folder holding the full-module images to split into cells.
module_folder = r"D:\PROGRAMMING\Internships_assignments\ResearchInternIITMandi\Full_modules_datasets\Full modules datasets\18.11.2024\B Grade"

# Collect images extension by extension, keeping the jpg -> png -> jpeg order.
image_list = [
    path
    for pattern in ("/*.jpg", "/*.png", "/*.jpeg")
    for path in glob.glob(module_folder + pattern)
]

print("Found", len(image_list), "modules.")

# Extract and save the cells of every module image found.
for img_path in image_list:
    process_module(img_path)

Found 10 modules.

Processing: D:\PROGRAMMING\Internships_assignments\ResearchInternIITMandi\Full_modules_datasets\Full modules datasets\18.11.2024\B Grade\WS11249040878638.jpg
✔ Saved 144 cells → extracted_cells\WS11249040878638

Processing: D:\PROGRAMMING\Internships_assignments\ResearchInternIITMandi\Full_modules_datasets\Full modules datasets\18.11.2024\B Grade\WS11249040878796.jpg
✔ Saved 144 cells → extracted_cells\WS11249040878796

Processing: D:\PROGRAMMING\Internships_assignments\ResearchInternIITMandi\Full_modules_datasets\Full modules datasets\18.11.2024\B Grade\WS11249040878831.jpg
✔ Saved 144 cells → extracted_cells\WS11249040878831

Processing: D:\PROGRAMMING\Internships_assignments\ResearchInternIITMandi\Full_modules_datasets\Full modules datasets\18.11.2024\B Grade\WS11249040879340.jpg
✔ Saved 144 cells → extracted_cells\WS11249040879340

Processing: D:\PROGRAMMING\Internships_assignments\ResearchInternIITMandi\Full_modules_datasets\Full modules datasets\18.11.2024\B Gr