# Word Segmentation + Anchor Grouping (PaddleOCR 3.x only, **no OCR**)


This notebook is **locked to PaddleOCR 3.x API** to avoid legacy-argument errors like `Unknown argument: det/show_log`.
- Uses **DBNet++** detector via PaddleOCR **3.x** (PaddleX pipeline).
- Segments **every word** (no recognition).
- Auto- or fixed-border to collect right-side words per left anchor.

If you want a 2.x–compatible version, pin: `pip install paddlepaddle==2.6.1 paddleocr==2.7.1` and use the previous notebook.



## Install
```bash
pip install opencv-python numpy pillow
pip install paddlepaddle paddleocr   # 3.x API
# (GPU optional) pip install paddlepaddle-gpu  # wheel must match your CUDA
```


In [1]:

from pathlib import Path

# --- Paths ---
INPUT_PATH = Path(r"E:\EvaluationAI\Dataset\29.jpg")   # change if needed
OUTPUT_DIR = Path(r"E:\EvaluationAI\autoevalaioutputs")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# --- Alignment (optional but recommended) ---
ALIGN_TO_A4        = True
A4_SIZE_PX         = (2480, 3508)  # (W, H) A4 @ 300DPI

# --- Detector params (tune for **word** granularity) ---
USE_GPU            = False
DET_DB_BOX_THRESH  = 0.65
DET_DB_THRESH      = 0.40
DET_DB_UNCLIP      = 1.40
MIN_AREA_PX        = 30             # drop tiny boxes

# --- Border / anchors ---
BORDER_MODE        = "auto"         # "auto" or "fixed"
FIXED_BORDER_RATIO = 0.30           # used when BORDER_MODE == "fixed"
LEFT_CAND_MAX_XR   = 0.50           # anchor cand: center-x <= 50% of width
LEFT_PAD_PX        = 20             # push border slightly right of anchors

# --- Grouping pads ---
TOP_PAD            = 6
BOTTOM_PAD         = 6

print("INPUT_PATH:", INPUT_PATH)
print("OUTPUT_DIR:", OUTPUT_DIR)


INPUT_PATH: E:\EvaluationAI\Dataset\29.jpg
OUTPUT_DIR: E:\EvaluationAI\autoevalaioutputs


In [2]:

import os, json, math
from typing import List, Dict, Any, Tuple, Optional
import cv2
import numpy as np

def ensure_img(path: Path) -> np.ndarray:
    img = cv2.imread(str(path))
    if img is None:
        raise FileNotFoundError(f"Could not load image: {path}")
    return img

def save_image(img_bgr: np.ndarray, name: str) -> Path:
    p = OUTPUT_DIR / name
    p.parent.mkdir(parents=True, exist_ok=True)
    cv2.imwrite(str(p), img_bgr)
    return p

def save_json(obj: Any, name: str) -> Path:
    p = OUTPUT_DIR / name
    p.parent.mkdir(parents=True, exist_ok=True)
    with open(p, "w", encoding="utf-8") as f:
        json.dump(obj, f, indent=2, ensure_ascii=False)
    return p

def _find_contours(*args, **kwargs):
    res = cv2.findContours(*args, **kwargs)
    return res[-2], res[-1]

def detect_page_quad(gray: np.ndarray) -> Optional[np.ndarray]:
    g = cv2.GaussianBlur(gray, (5,5), 0)
    edges = cv2.Canny(g, 50, 150)
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5,5))
    edges = cv2.dilate(edges, kernel, iterations=1)
    edges = cv2.morphologyEx(edges, cv2.MORPH_CLOSE, kernel, iterations=1)
    cnts, _ = _find_contours(edges, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    if not cnts: return None
    cnt = max(cnts, key=cv2.contourArea)
    peri = cv2.arcLength(cnt, True)
    approx = cv2.approxPolyDP(cnt, 0.02 * peri, True)
    if len(approx) < 4:
        approx = cv2.convexHull(cnt)
    approx = approx.reshape(-1, 2).astype(np.float32)
    if len(approx) != 4:
        rect = cv2.minAreaRect(cnt)
        box  = cv2.boxPoints(rect)
        approx = box.astype(np.float32)
    return approx

def order_corners(pts: np.ndarray) -> np.ndarray:
    s = pts.sum(axis=1)
    d = np.diff(pts, axis=1)[:,0]
    tl = pts[np.argmin(s)]
    br = pts[np.argmax(s)]
    tr = pts[np.argmin(d)]
    bl = pts[np.argmax(d)]
    return np.array([tl, tr, br, bl], dtype=np.float32)

def align_to_a4(img_bgr: np.ndarray, out_size=(2480,3508)) -> Tuple[np.ndarray, np.ndarray]:
    gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
    quad = detect_page_quad(gray)
    if quad is None:
        h, w = img_bgr.shape[:2]
        quad = np.array([[0,0],[w-1,0],[w-1,h-1],[0,h-1]], dtype=np.float32)
    src = order_corners(quad)
    dst = np.array([[0,0],[out_size[0]-1,0],[out_size[0]-1,out_size[1]-1],[0,out_size[1]-1]], dtype=np.float32)
    M = cv2.getPerspectiveTransform(src, dst)
    warped = cv2.warpPerspective(img_bgr, M, out_size)
    g  = cv2.cvtColor(warped, cv2.COLOR_BGR2GRAY)
    cl = cv2.createCLAHE(2.0, (8,8))
    g2 = cl.apply(g)
    warped = cv2.cvtColor(g2, cv2.COLOR_GRAY2BGR)
    return warped, src

def poly_to_bbox(poly: np.ndarray) -> Tuple[int,int,int,int]:
    xs = poly[:,0]; ys = poly[:,1]
    x0 = int(np.floor(xs.min())); y0 = int(np.floor(ys.min()))
    x1 = int(np.ceil(xs.max()));  y1 = int(np.ceil(ys.max()))
    return x0,y0,x1,y1

def save_crop(img: np.ndarray, bbox_xyxy, path: Path):
    x0,y0,x1,y1 = bbox_xyxy
    h, w = img.shape[:2]
    x0 = max(0,x0); y0 = max(0,y0); x1 = min(w-1, x1); y1 = min(h-1, y1)
    if x1 > x0 and y1 > y0:
        path.parent.mkdir(parents=True, exist_ok=True)
        cv2.imwrite(str(path), img[y0:y1, x0:x1])


## PaddleOCR 3.x detector wrapper

In [3]:

from paddleocr import PaddleOCR
from typing import List
import numpy as np

def _extract_polys_v3(result) -> List[np.ndarray]:
    polys: List[np.ndarray] = []
    # result is usually a list (one per page)
    pages = result if isinstance(result, (list, tuple)) else [result]
    for pg in pages:
        if isinstance(pg, dict):
            for key in ("word_boxes", "boxes", "det", "text_det_results", "dt_boxes", "dt_polys", "polygons"):
                if key in pg and isinstance(pg[key], (list, tuple)):
                    for box in pg[key]:
                        arr = np.array(box, dtype=np.float32)
                        if arr.ndim == 2 and arr.shape[0] >= 4:
                            polys.append(arr)
        elif isinstance(pg, (list, tuple)):
            # some 3.x returns a list of polys already
            for box in pg:
                arr = np.array(box, dtype=np.float32)
                if arr.ndim == 2 and arr.shape[0] >= 4:
                    polys.append(arr)
        elif hasattr(pg, "shape"):
            arr = np.array(pg, dtype=np.float32)
            if arr.ndim == 3 and arr.shape[1] >= 4:
                for poly in arr:
                    polys.append(np.array(poly, dtype=np.float32))
    return polys

def run_dbnet_detector(img_bgr):
    ocr = PaddleOCR(
        lang='en',
        use_gpu=USE_GPU,
        ocr_version='PP-OCRv4',
        # detector thresholds:
        text_det_box_thresh=DET_DB_BOX_THRESH,
        text_det_thresh=DET_DB_THRESH,
        text_det_unclip_ratio=DET_DB_UNCLIP,
        # request word boxes; disable recognition
        return_word_box=True,
        text_recognition_model_name=None,
    )
    res = ocr.ocr(img_bgr)  # 3.x: no show_log/cls/det/rec args
    polys = _extract_polys_v3(res)
    return polys


## Run segmentation → auto/fixed border → group right-side words

In [4]:

def run_pipeline():
    img0 = ensure_img(INPUT_PATH)
    img  = align_to_a4(img0, A4_SIZE_PX)[0] if ALIGN_TO_A4 else img0

    H, W = img.shape[:2]
    polys = run_dbnet_detector(img)

    words = []
    crops_dir = OUTPUT_DIR / "crops"
    overlay   = img.copy()

    idx = 1
    for poly in polys:
        x0,y0,x1,y1 = poly_to_bbox(poly)
        if (x1-x0)*(y1-y0) < MIN_AREA_PX:
            continue
        cx = (x0+x1)/2.0; cy = (y0+y1)/2.0
        cv2.polylines(overlay, [poly.astype(np.int32)], True, (0,255,0), 2)
        cv2.rectangle(overlay, (x0,y0), (x1,y1), (255,0,0), 1)
        fname = f"word_{idx:04d}.png"
        save_crop(img, (x0,y0,x1,y1), crops_dir / fname)
        words.append({
            "id": idx,
            "poly": poly.round(2).tolist(),
            "bbox_xyxy": [int(x0),int(y0),int(x1),int(y1)],
            "center": [float(cx), float(cy)],
            "size": [int(x1-x0), int(y1-y0)],
            "crop_path": str((crops_dir / fname).as_posix())
        })
        idx += 1

    # Border estimation
    if BORDER_MODE == "fixed":
        x_border = int(FIXED_BORDER_RATIO * W)
    else:
        LEFT_CAND_MAX_X = LEFT_CAND_MAX_XR * W
        cand = []
        for w in words:
            cx, cy = w["center"]
            bw, bh = w["size"]
            if cx <= LEFT_CAND_MAX_X and bw <= 0.18*W and bh >= 0.012*H:
                cand.append(w)
        if not cand:
            x_border = int(0.30 * W)
        else:
            xs     = np.array([w["center"][0] for w in cand], dtype=np.float32)
            widths = np.array([w["size"][0]   for w in cand], dtype=np.float32)
            x_med  = float(np.median(xs))
            w_med  = float(np.median(widths) if len(widths) else 20.0)
            x_border = int(x_med + max(LEFT_PAD_PX, 0.5*w_med))

    # Mark anchors & border
    anchors = []
    for w in words:
        cx, cy = w["center"]
        if cx <= x_border:
            anchors.append(w)
            x0,y0,x1,y1 = w["bbox_xyxy"]
            cv2.rectangle(overlay, (x0,y0), (x1,y1), (0,0,255), 2)
    anchors.sort(key=lambda w: w["center"][1])
    cv2.line(overlay, (x_border, 0), (x_border, H-1), (0,165,255), 2)
    save_image(overlay, "debug_overlay.png")

    # Group right-side words
    groups = []
    if anchors:
        edges_y = [0] + [int(a["center"][1]) for a in anchors] + [H-1]
        bounds = []
        for i in range(1, len(edges_y)-1):
            y_mid_prev = int((edges_y[i-1] + edges_y[i]) / 2)
            y_mid_next = int((edges_y[i]   + edges_y[i+1]) / 2)
            y0 = max(0, y_mid_prev + TOP_PAD)
            y1 = min(H-1, y_mid_next - BOTTOM_PAD)
            bounds.append((i, y0, y1))

        for idx, y0, y1 in bounds:
            grp_dir = OUTPUT_DIR / "groups" / f"Q{idx:03d}"
            grp_dir.mkdir(parents=True, exist_ok=True)

            anc = anchors[idx-1]
            ax0,ay0,ax1,ay1 = anc["bbox_xyxy"]
            save_crop(img, (ax0,ay0,ax1,ay1), grp_dir / "anchor.png")

            pack = {"group": f"Q{idx:03d}", "y_range": [y0,y1], "anchor_id": anc["id"], "words": []}
            for w in words:
                cx, cy = w["center"]
                if cx > x_border and y0 <= cy <= y1:
                    src = Path(w["crop_path"])
                    dst = grp_dir / src.name
                    im  = cv2.imread(str(src))
                    if im is not None:
                        cv2.imwrite(str(dst), im)
                    pack["words"].append({**w, "group_path": str(dst.as_posix())})
            groups.append(pack)
    else:
        grp_dir = OUTPUT_DIR / "groups" / "Q001"
        grp_dir.mkdir(parents=True, exist_ok=True)
        pack = {"group": "Q001", "y_range": [0,H-1], "anchor_id": None, "words": []}
        for w in words:
            cx, cy = w["center"]
            if cx > x_border:
                src = Path(w["crop_path"])
                dst = grp_dir / src.name
                im  = cv2.imread(str(src))
                if im is not None:
                    cv2.imwrite(str(dst), im)
                pack["words"].append({**w, "group_path": str(dst.as_posix())})
        groups.append(pack)

    save_json({"border_x": x_border, "width": W, "height": H, "words": words}, "words.json")
    save_json({"groups": groups}, "groups_index.json")

    print(f"Border x = {x_border} (W={W}) | anchors={len(anchors)} | total words={len(words)}")
    for g in groups:
        print(f"{g['group']}: {len(g['words'])} right-side words; y_range={g['y_range']}")

run_pipeline()


ValueError: Unknown argument: use_gpu

## Outputs

In [None]:

from pprint import pprint
files = sorted([p.as_posix() for p in OUTPUT_DIR.rglob("*") if p.is_file()])
pprint(files[:60])
print("... total files:", len(files))
