
# Answer Sheet — **Segment → Group by Question → OCR per Question** (RapidOCR, CPU)

This notebook is self‑contained and uses **RapidOCR (onnxruntime)** only (no Paddle, Tesseract, or EasyOCR).
It will:
1. Load the page **respecting EXIF orientation** (no rotation surprises).
2. Detect **horizontal separator lines** from peaks in the smoothed row‑projection (rows whose ink sum exceeds mean + N·std; no morphology is applied).
3. Detect **left‑margin question numbers** only in a left strip (e.g., `3)`, `4.`, `Q5`).
4. Build one group per question number and **end the group at the next horizontal line** (if any) or midway to the next question number, whichever comes first.
5. OCR each group with RapidOCR and save **PNG + TXT** files and a combined **JSON**.

Outputs:
- `overlay_debug.png` — shows lines (cyan), anchors (blue), and grouping bands.
- `groups/Q003/`, `Q004/`, … — each contains `region.png` and `text.txt`.
- `questions.json` — all text + metadata.



## Install locally
```bash
pip install rapidocr-onnxruntime onnxruntime opencv-python numpy pillow
```


In [11]:
# !pip install rapidocr-onnxruntime onnxruntime opencv-python numpy pillow


In [12]:

from pathlib import Path

# ---- Input / output locations (edit these for your machine) ----
INPUT_PATH = Path(r"E:\EvaluationAI\Dataset\29.jpg")      # scanned answer-sheet page to process
OUTPUT_DIR = Path(r"E:\EvaluationAI\autoevalaioutputs7")  # every artifact is written below this folder
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# ---- Horizontal separator detection (fractions are relative to image height) ----
SMOOTH_K_PCT     = 0.005  # length of the moving-average kernel applied to the row projection
PEAK_STD_MUL     = 2.0    # a row counts as "line" when its smoothed ink exceeds mean + N * std
MIN_LINE_GAP_PCT = 0.06   # two accepted lines must be at least this far apart
MAX_THICK_PCT    = 0.05   # runs thicker than this are rejected (not a ruling line)
PAD_INSIDE_PX    = 18     # pixels trimmed from band edges so separators stay out of the crop

# ---- Question-number anchors, searched only in the left strip ----
LEFT_STRIP_RATIO   = 0.32  # fraction of the page width that is OCR'd for anchors
ANCHOR_CONF_MIN    = 0.30  # anchors below this OCR confidence are discarded
LEFT_PAD_BORDER_PX = 16    # the content border sits this far right of the anchor boxes
# Accepted spellings of a question number:
ANCHOR_REGEXES = [
    r"^\(?\d{1,2}\)?$",        # 3  (3)
    r"^\(?\d{1,2}\)?[.)]$",    # 3)  3.
    r"^[Qq]\s*\d{1,2}[.)]?$",  # Q3  q12.
]

# ---- Per-group OCR and text composition ----
CONF_MIN       = 0.30  # tokens below this confidence are left out of the text
ROW_GAP_FACTOR = 0.7   # tokens join the same row when their centres differ by <= median_height * factor

print(f"INPUT_PATH: {INPUT_PATH}")
print(f"OUTPUT_DIR: {OUTPUT_DIR}")


INPUT_PATH: E:\EvaluationAI\Dataset\29.jpg
OUTPUT_DIR: E:\EvaluationAI\autoevalaioutputs7


In [13]:

import re, json
from typing import List, Tuple, Dict, Any
import numpy as np
import cv2
from PIL import Image, ImageOps

def read_image_exif_bgr(path: Path):
    """Load an image, apply its EXIF orientation, and return it as a BGR array.

    Args:
        path: Location of the image file on disk.

    Returns:
        The pixel data converted to RGB by Pillow and then reordered to BGR
        with ``cv2.cvtColor`` so it can be passed to OpenCV routines.
    """
    # Image.open keeps the underlying file open until the image is closed;
    # the context manager releases the handle as soon as pixels are copied out.
    with Image.open(path) as im:
        upright = ImageOps.exif_transpose(im).convert("RGB")
        arr = np.array(upright)
    return cv2.cvtColor(arr, cv2.COLOR_RGB2BGR)

def save_image(img_bgr, name: str) -> Path:
    """Write an image below ``OUTPUT_DIR`` and return the path that was written.

    Args:
        img_bgr: Image array handed to ``cv2.imwrite``.
        name: File name (may include sub-folders) relative to ``OUTPUT_DIR``.

    Returns:
        The full output path.

    Raises:
        OSError: If ``cv2.imwrite`` reports that nothing was written.
    """
    p = OUTPUT_DIR / name
    p.parent.mkdir(parents=True, exist_ok=True)
    # cv2.imwrite signals failure through its boolean return value rather than
    # an exception; surface that so a missing artifact is not silently ignored.
    if not cv2.imwrite(str(p), img_bgr):
        raise OSError(f"cv2.imwrite failed to write image to {p}")
    return p

def save_text(txt: str, name: str) -> Path:
    """Write ``txt`` (stripped, newline-terminated) below ``OUTPUT_DIR``.

    A falsy ``txt`` is written as an empty line. Parent folders are created
    on demand. Returns the path of the written file.
    """
    target = OUTPUT_DIR / name
    target.parent.mkdir(parents=True, exist_ok=True)
    body = txt.strip() if txt else ""
    with target.open("w", encoding="utf-8") as fh:
        fh.write(body + "\n")
    return target

def save_json(obj: Any, name: str) -> Path:
    """Serialize ``obj`` as indented UTF-8 JSON below ``OUTPUT_DIR``.

    Non-ASCII characters are written as-is (``ensure_ascii=False``). Parent
    folders are created on demand. Returns the path of the written file.
    """
    target = OUTPUT_DIR / name
    target.parent.mkdir(parents=True, exist_ok=True)
    with target.open("w", encoding="utf-8") as fh:
        json.dump(obj, fh, ensure_ascii=False, indent=2)
    return target


In [14]:

def detect_horizontal_lines(img_bgr: np.ndarray,
                            smooth_k_pct=0.005,
                            peak_std_mul=2.0,
                            min_line_gap_pct=0.06,
                            max_thick_pct=0.05) -> List[int]:
    """Find y-coordinates of horizontal separator lines via a row ink projection.

    The image is converted to gray, blurred, inverted and summed per row. The
    row sums are smoothed with a moving average; contiguous runs of rows above
    ``mean + peak_std_mul * std`` become candidates. Thick runs are discarded
    and the remaining ones are accepted greedily from strongest to weakest,
    skipping any that lie too close to an already accepted line.

    Args:
        img_bgr: BGR image array.
        smooth_k_pct: Moving-average kernel length as a fraction of the height
            (at least 11 rows, always odd, never longer than the image).
        peak_std_mul: Number of standard deviations above the mean that a
            smoothed row must exceed.
        min_line_gap_pct: Minimum spacing between accepted lines, as a
            fraction of the height.
        max_thick_pct: Maximum run thickness, as a fraction of the height.

    Returns:
        Accepted line centres (row indices), sorted top to bottom.
    """
    H = img_bgr.shape[0]
    if H == 0:
        return []

    gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
    blurred = cv2.GaussianBlur(gray, (5, 5), 0)

    # Row ink projection (text/lines darker => more 'ink')
    ink = 255 - blurred
    row_sum = ink.sum(axis=1).astype(np.float32)

    k = max(11, int(H * smooth_k_pct) | 1)  # odd kernel length
    if k > H:
        # np.convolve(mode="same") returns max(len(a), len(v)) samples, so a
        # kernel longer than the projection would yield more values than rows.
        k = max(1, H if H % 2 else H - 1)
    kernel = np.ones(k, dtype=np.float32) / k
    smooth = np.convolve(row_sum, kernel, mode="same")

    thr = float(smooth.mean()) + peak_std_mul * float(smooth.std())
    mask = smooth > thr

    # Locate contiguous True runs: +1 marks a run start, -1 one past its end.
    steps = np.diff(np.concatenate(([0], mask.astype(np.int8), [0])))
    starts = np.flatnonzero(steps == 1)
    stops = np.flatnonzero(steps == -1)

    candidates = []
    for s, e in zip(starts, stops):
        segment = smooth[s:e]
        center = int(np.average(np.arange(s, e), weights=segment))
        candidates.append((center, int(e - s), float(segment.max())))

    # keep thin peaks
    max_thickness = max_thick_pct * H
    candidates = [c for c in candidates if c[1] <= max_thickness]

    # pick strongest, spaced apart
    min_gap = int(min_line_gap_pct * H)
    accepted = []
    for center, _thickness, _strength in sorted(candidates, key=lambda c: c[2], reverse=True):
        if all(abs(center - other) >= min_gap for other in accepted):
            accepted.append(center)
    return sorted(accepted)


In [15]:

from rapidocr_onnxruntime import RapidOCR

_ocr = None
def get_ocr():
    """Return the shared RapidOCR engine, constructing it on first use.

    The engine is created with CUDA disabled for both detection and
    recognition and cached in the module-level ``_ocr`` variable.
    """
    global _ocr
    if _ocr is not None:
        return _ocr
    _ocr = RapidOCR(det_use_cuda=False, rec_use_cuda=False)
    return _ocr

def rapidocr(img_bgr: np.ndarray) -> list:
    """Run the shared OCR engine on ``img_bgr`` and return its detections.

    Each detection is ``[box(4x2), text, score]``. A falsy engine result is
    normalized to an empty list.
    """
    engine = get_ocr()
    detections, _elapsed = engine(img_bgr)
    if not detections:
        return []
    return detections

def compose_text_from_rapidocr(results: list,
                               conf_min=0.30,
                               row_gap_factor=0.7) -> str:
    """Turn OCR detections into reading-order text.

    Detections with empty text or a score below ``conf_min`` are dropped.
    The rest are ordered by vertical centre and chained into rows: a token
    joins the current row when its centre is within
    ``max(12, median_height * row_gap_factor)`` of the previous token's centre.
    Rows are read left to right, joined with spaces, and rows are joined with
    newlines.

    Args:
        results: Iterable of ``(box, text, score)`` where ``box`` is a list of
            ``(x, y)`` corner points.
        conf_min: Minimum score for a detection to be kept.
        row_gap_factor: Multiplier on the median token height that sets the
            row-merge threshold.

    Returns:
        The composed text, or ``""`` when nothing survives filtering.
    """
    if not results:
        return ""

    # Each token is (centre_y, centre_x, height, stripped_text).
    tokens = []
    for box, text, score in results:
        if not text or float(score) < conf_min:
            continue
        corners = np.array(box, dtype=np.float32)
        left, top = int(corners[:, 0].min()), int(corners[:, 1].min())
        right, bottom = int(corners[:, 0].max()), int(corners[:, 1].max())
        tokens.append(((top + bottom) / 2.0, (left + right) / 2.0,
                       bottom - top, text.strip()))
    if not tokens:
        return ""

    heights = np.array([tok[2] for tok in tokens], dtype=np.float32)
    row_thr = max(12.0, float(np.median(heights) * row_gap_factor))

    # Stable sort by vertical centre, then chain neighbours into rows.
    tokens.sort(key=lambda tok: tok[0])
    rows = [[tokens[0]]]
    for tok in tokens[1:]:
        if abs(tok[0] - rows[-1][-1][0]) <= row_thr:
            rows[-1].append(tok)
        else:
            rows.append([tok])

    composed = []
    for row in rows:
        words = [tok[3] for tok in sorted(row, key=lambda tok: tok[1]) if tok[3]]
        if words:
            composed.append(" ".join(words))
    return "\n".join(composed)

def preprocess_block(block_bgr: np.ndarray, mode: int = 1) -> np.ndarray:
    """Binarize a BGR crop before OCR and return it as a 3-channel image.

    Args:
        block_bgr: BGR crop to clean up.
        mode: ``1`` applies CLAHE followed by Gaussian adaptive thresholding;
            any other value applies a 3x3 Gaussian blur followed by Otsu
            thresholding.

    Returns:
        The thresholded image converted back to BGR.
    """
    gray = cv2.cvtColor(block_bgr, cv2.COLOR_BGR2GRAY)
    if mode != 1:
        softened = cv2.GaussianBlur(gray, (3, 3), 0)
        binary = cv2.threshold(softened, 0, 255,
                               cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
        return cv2.cvtColor(binary, cv2.COLOR_GRAY2BGR)

    equalized = cv2.createCLAHE(2.0, (8, 8)).apply(gray)
    binary = cv2.adaptiveThreshold(equalized, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                   cv2.THRESH_BINARY, 35, 10)
    return cv2.cvtColor(binary, cv2.COLOR_GRAY2BGR)


In [16]:

def is_anchor_text(text: str) -> bool:
    """Return True when the stripped ``text`` matches any pattern in ``ANCHOR_REGEXES``."""
    candidate = text.strip()
    for pattern in ANCHOR_REGEXES:
        if re.match(pattern, candidate):
            return True
    return False

def detect_anchors_left_strip(img_bgr: np.ndarray,
                              xr: float,
                              conf_min: float = 0.30) -> List[Dict[str, Any]]:
    """OCR the left strip of the page and return question-number anchors.

    Args:
        img_bgr: Full-page BGR image.
        xr: Fraction of the page width (from the left edge) to scan.
        conf_min: Minimum OCR score for a detection to be considered.

    Returns:
        Anchors sorted top to bottom. Each is a dict with ``text``, ``conf``,
        ``bbox`` (``[x0, y0, x1, y1]``), ``center_y`` and ``right_x``. When two
        consecutive anchors are less than 18 px apart vertically, only the one
        with the larger bounding-box area is kept.
    """
    def _area(anchor):
        x0, y0, x1, y1 = anchor["bbox"]
        return (x1 - x0) * (y1 - y0)

    H, W = img_bgr.shape[:2]
    strip_w = int(max(1, min(W - 1, xr * W)))
    strip = img_bgr[:, :strip_w].copy()

    found = []
    for box, text, score in rapidocr(strip):
        if float(score) < max(0.0, conf_min) or not is_anchor_text(text):
            continue
        corners = np.array(box, dtype=np.float32)
        left, top = int(corners[:, 0].min()), int(corners[:, 1].min())
        right, bottom = int(corners[:, 0].max()), int(corners[:, 1].max())
        # The strip starts at x = 0, so strip coordinates are page coordinates.
        found.append({"text": text, "conf": float(score),
                      "bbox": [left, top, right, bottom],
                      "center_y": float((top + bottom) / 2.0),
                      "right_x": int(right)})

    # Walk top to bottom and collapse near-duplicates (same row detected twice).
    kept = []
    for anchor in sorted(found, key=lambda a: a["center_y"]):
        is_close = bool(kept) and abs(anchor["center_y"] - kept[-1]["center_y"]) < 18
        if not is_close:
            kept.append(anchor)
        elif _area(anchor) > _area(kept[-1]):
            kept[-1] = anchor
    return kept


In [17]:

def segment_blocks(img_bgr: np.ndarray) -> Dict[str, Any]:
    """Split a page into per-question vertical bands.

    Separator lines and left-margin anchors are detected first. With anchors,
    one band is produced per anchor: it begins midway between the previous
    anchor and this one (page top for the first anchor) and ends at the
    earliest of the first separator below the anchor, the midpoint to the
    next anchor, or the page bottom. ``PAD_INSIDE_PX`` is trimmed from both
    ends. Without anchors, the page is split at the separator lines only.

    Args:
        img_bgr: Full-page BGR image.

    Returns:
        A dict with ``lines`` (separator y-coordinates), ``anchors``,
        ``border_x`` (x-coordinate separating the number margin from the
        answer area) and ``groups`` (dicts with ``qid``, ``label``,
        ``anchor_text``, ``anchor_conf``, ``y0``, ``y1``).
    """
    H, W = img_bgr.shape[:2]

    # 1) detect lines & anchors
    line_ys = detect_horizontal_lines(img_bgr,
                                      smooth_k_pct=SMOOTH_K_PCT,
                                      peak_std_mul=PEAK_STD_MUL,
                                      min_line_gap_pct=MIN_LINE_GAP_PCT,
                                      max_thick_pct=MAX_THICK_PCT)
    anchors = detect_anchors_left_strip(img_bgr, LEFT_STRIP_RATIO, conf_min=ANCHOR_CONF_MIN)

    # 2) border just to the right of left numbers (95th percentile)
    if anchors:
        right_edges = [a["right_x"] for a in anchors]
        x_border = int(np.percentile(np.array(right_edges), 95) + LEFT_PAD_BORDER_PX)
        # keep the border between 8% and 50% of the page width
        x_border = min(max(x_border, int(0.08*W)), int(0.5*W))
    else:
        # no anchors found — use a conservative 28% width
        x_border = int(0.28 * W)

    # 3) compute vertical bands per question
    groups = []
    if anchors:
        last_idx = len(anchors) - 1
        label_counts: Dict[str, int] = {}
        for i, anc in enumerate(anchors):
            a_y = int(anc["center_y"])

            # first separator below current anchor (8 px slack skips a line touching the number)
            sep_y = min((y for y in line_ys if y > a_y + 8), default=None)

            # The midpoint rule only applies when another anchor follows;
            # the last question must be allowed to run to the page bottom.
            if i < last_idx:
                mid_to_next = int((a_y + int(anchors[i+1]["center_y"])) / 2)
            else:
                mid_to_next = None

            y_start = 0 if i == 0 else int((anchors[i-1]["center_y"] + a_y)/2)
            y_start = max(0, y_start + PAD_INSIDE_PX)

            y_end = min(v for v in (sep_y, mid_to_next, H-1) if v is not None)
            y_end = max(y_start+1, y_end - PAD_INSIDE_PX)

            # Use the OCR'd number when it has digits, else the running index.
            digits = re.sub('[^0-9]', '', anc['text'])
            label = f"Q{digits or i+1}"
            # Two anchors read as the same number would share an output
            # folder; suffix repeats so neither overwrites the other.
            label_counts[label] = label_counts.get(label, 0) + 1
            if label_counts[label] > 1:
                label = f"{label}_{label_counts[label]}"

            groups.append({"qid": i+1,
                           "label": label,
                           "anchor_text": anc["text"],
                           "anchor_conf": anc["conf"],
                           "y0": int(y_start), "y1": int(y_end)})
    else:
        # fallback: split by lines only
        edges = [0] + line_ys + [H-1]
        for i in range(len(edges)-1):
            y0 = max(0, edges[i] + (PAD_INSIDE_PX if i>0 else 0))
            y1 = min(H-1, edges[i+1] - PAD_INSIDE_PX)
            if y1 <= y0: continue
            groups.append({"qid": i+1, "label": f"Q{i+1:02d}",
                           "anchor_text": None, "anchor_conf": None,
                           "y0": int(y0), "y1": int(y1)})

    return {"lines": line_ys, "anchors": anchors, "border_x": int(x_border), "groups": groups}


In [18]:

def run_pipeline():
    """Segment the input page, OCR each question band, and write all outputs.

    Written below ``OUTPUT_DIR``:
      * ``overlay_debug.png`` — separators, content border, anchors and bands.
      * ``groups/<label>/region.png`` and ``groups/<label>/text.txt`` per band.
      * ``questions.json`` — the combined result.

    Returns:
        The dict that is saved as ``questions.json``: ``input``, ``lines``,
        ``anchors``, ``border_x`` and ``questions`` (one entry per band with
        ``label``, ``qid``, ``y_range``, ``image``, ``text_file`` and ``text``;
        the two file entries are POSIX paths relative to ``OUTPUT_DIR``).
    """
    img = read_image_exif_bgr(INPUT_PATH)
    H, W = img.shape[:2]

    seg = segment_blocks(img)
    line_ys   = seg["lines"]
    anchors   = seg["anchors"]
    x_border  = seg["border_x"]
    groups    = seg["groups"]

    out = {"input": str(INPUT_PATH), "lines": line_ys,
           "anchors": anchors, "border_x": x_border, "questions": []}

    # overlay (colors are BGR)
    overlay = img.copy()
    # lines (cyan)
    for y in line_ys:
        cv2.line(overlay, (0,y), (W-1,y), (255,255,0), 2, lineType=cv2.LINE_AA)
    # border (orange)
    cv2.line(overlay, (x_border, 0), (x_border, H-1), (0,165,255), 2, lineType=cv2.LINE_AA)
    # anchors (blue)
    for a in anchors:
        x0,y0,x1,y1 = a["bbox"]
        cv2.rectangle(overlay, (x0,y0), (x1,y1), (255,128,0), 1, lineType=cv2.LINE_AA)
        cv2.putText(overlay, a["text"], (x0, max(0,y0-5)), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255,128,0), 1, cv2.LINE_AA)
    # groups (green bands)
    for g in groups:
        y0,y1 = g["y0"], g["y1"]
        cv2.rectangle(overlay, (0,y0), (W-1,y1), (0,200,0), 1, lineType=cv2.LINE_AA)

    save_image(overlay, "overlay_debug.png")

    # Make sure the groups folder exists even when no band is produced.
    (OUTPUT_DIR / "groups").mkdir(parents=True, exist_ok=True)

    # OCR each group (right of border only)
    for g in groups:
        y0,y1 = g["y0"], g["y1"]
        label = g["label"] if isinstance(g["label"], str) else f"Q{g['qid']:02d}"

        # crop full width for the saved picture, but OCR only the region to
        # the right of the border so the margin numbers stay out of the text
        region = img[y0:y1, 0:W].copy()
        region_right = img[y0:y1, x_border:W].copy()
        if region.size == 0 or region_right.size == 0:
            # A band that falls outside the image cannot be written or OCR'd.
            print(f"Skipping {label}: empty crop for y=[{y0}, {y1}]")
            continue

        # two-pass preprocessing (same backend); the second pass runs only
        # when the first yields no text and uses a slightly lower threshold
        r1 = preprocess_block(region_right, 1)
        txt = compose_text_from_rapidocr(rapidocr(r1), conf_min=CONF_MIN, row_gap_factor=ROW_GAP_FACTOR)
        if not txt.strip():
            r2 = preprocess_block(region_right, 2)
            txt = compose_text_from_rapidocr(rapidocr(r2), conf_min=max(0.0, CONF_MIN*0.9), row_gap_factor=ROW_GAP_FACTOR)

        # save under groups/<label>/ and record the same relative paths
        image_rel = f"groups/{label}/region.png"
        text_rel  = f"groups/{label}/text.txt"
        save_image(region, image_rel)
        save_text(txt, text_rel)

        out["questions"].append({
            "label": label, "qid": g["qid"], "y_range": [int(y0), int(y1)],
            "image": image_rel,
            "text_file": text_rel,
            "text": txt
        })

    save_json(out, "questions.json")
    return out

# Run the whole pipeline once and print a compact summary of what was found.
res = run_pipeline()
print(f"Detected lines: {res['lines']}")
print(f"Anchors: {[(a['text'], int(a['center_y'])) for a in res['anchors']]}")
print(f"Border x: {res['border_x']}")
for q in res["questions"]:
    print("{}  y={}  chars={}".format(q["label"], q["y_range"], len(q["text"])))


Detected lines: [2848]
Anchors: []
Border x: 766
Q01  y=[0, 2830]  chars=96
Q02  y=[2866, 3997]  chars=67


## Inspect outputs

In [19]:

from pprint import pprint
# Show where the artifacts live, then preview each question's text from the saved JSON.
print(f"Overlay: {(OUTPUT_DIR / 'overlay_debug.png').as_posix()}")
print(f"Groups folder: {(OUTPUT_DIR / 'groups').as_posix()}")
with (OUTPUT_DIR / "questions.json").open("r", encoding="utf-8") as f:
    data = json.load(f)
for q in data["questions"]:
    print(f"\n=== {q['label']} ===")
    print(q["text"][:600])  # preview is capped at 600 characters per question


Overlay: E:/EvaluationAI/autoevalaioutputs7/overlay_debug.png
Groups folder: E:/EvaluationAI/autoevalaioutputs7/groups

=== Q01 ===
e energPfon
OS Hhee totally 1o bPls
Pphes
but" H
enqiPokon tspeifke
02517
Ccsingdfheco mthean
on

=== Q02 ===
P
Yoiteapf Akry.
a PvPble Sy P Hhen
d ypquy1
PS ChPnocue Bermorindg
