In [None]:
#DOCSTRANGE_NANONETSOCR 
%pip install -q 'git+https://github.com/facebookresearch/detectron2.git'
%pip install opencv-python-headless ultralytics 
%pip install docstrange 
%pip install torch==2.2.1 torchvision==0.17.1 torchaudio==2.2.1 --index-url https://download.pytorch.org/whl/cu121 
%pip install tqdm pyyaml matplotlib scikit-image shapely 
%pip install imgaug pycocotools

^C
Note: you may need to restart the kernel to use updated packages.


In [None]:
import os, cv2, torch
import numpy as np
from detectron2.engine import DefaultPredictor
from detectron2.config import get_cfg
from detectron2 import model_zoo

# For Docstrange
from docstrange import DocumentExtractor
import random
import glob

def init_segmentor(threshold=0.5, device=None):
    """Build a detectron2 ``DefaultPredictor`` for COCO instance segmentation.

    The config and checkpoint both come from the model-zoo entry
    ``COCO-InstanceSegmentation/mask_rcnn_R_101_FPN_3x.yaml``.

    Args:
        threshold: Value assigned to ``MODEL.ROI_HEADS.SCORE_THRESH_TEST``.
        device: Value assigned to ``MODEL.DEVICE``. When falsy, "cuda" is
            used if ``torch.cuda.is_available()`` and "cpu" otherwise.

    Returns:
        A ``DefaultPredictor`` constructed from the resulting config.
    """
    zoo_entry = "COCO-InstanceSegmentation/mask_rcnn_R_101_FPN_3x.yaml"

    # Resolve the device first; an explicit (truthy) argument always wins.
    if not device:
        device = "cuda" if torch.cuda.is_available() else "cpu"

    cfg = get_cfg()
    cfg.merge_from_file(model_zoo.get_config_file(zoo_entry))
    cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url(zoo_entry)
    cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = threshold
    cfg.MODEL.DEVICE = device
    return DefaultPredictor(cfg)

# Module-level predictor, built once when this cell/module runs with the
# default threshold and auto-selected device; remove_objects_and_save()
# reads this global so the model is not rebuilt on every call.
predictor = init_segmentor()

# ===============================================================
# Main function: remove objects & overwrite image
# ===============================================================
def remove_objects_and_save(img_path, mask_color=(255, 255, 255)):
    """
    Paint over every detected object instance and overwrite the image on disk.

    The module-level ``predictor`` is run on the image; the union of all
    predicted instance masks is filled with ``mask_color`` and the result is
    written back to ``img_path`` (the original file is replaced).

    Args:
        img_path: Path of the image to read and overwrite.
        mask_color: Color assigned to masked pixels. The image is loaded with
            ``cv2.imread``, so the tuple is applied in that channel order.

    Returns:
        None. Progress and failures are reported with ``print``; the file is
        left untouched when the image cannot be read or nothing is detected.
    """
    img = cv2.imread(img_path)
    if img is None:
        print(f"[!] Cannot read image: {img_path}")
        return

    # Run the segmentor and move results to CPU for NumPy access.
    instances = predictor(img)["instances"].to("cpu")
    if not instances.has("pred_masks"):
        print(f"[INFO] No masks detected in image: {img_path}")
        return

    masks = instances.pred_masks.numpy()  # [N, H, W]
    if masks.shape[0] == 0:
        print(f"[INFO] No objects to remove in image: {img_path}")
        return

    # Union of all instance masks; the boolean array indexes pixels directly,
    # so no uint8 round-trip is needed.
    combined_mask = np.any(masks, axis=0)
    img[combined_mask] = mask_color

    # cv2.imwrite signals failure through its return value; do not report
    # success unless the file was actually written.
    if not cv2.imwrite(img_path, img):
        print(f"[!] Failed to write image: {img_path}")
        return
    print(f"[DONE] Removed objects and saved: {img_path}")


def _find_cut_rows(signal, peak_thresh, min_gap):
    """Return rows where ``signal`` has a strong local peak, spaced more than ``min_gap`` apart."""
    cuts = []
    last_cut = 0
    for y in range(1, len(signal) - 1):
        if signal[y] <= peak_thresh:
            continue
        # Clamp the window start at 0: for y < 3 a negative start index would
        # wrap around and produce an empty slice, making max() raise.
        window = signal[max(0, y - 3):y + 3]
        if signal[y] == np.max(window) and y - last_cut > min_gap:
            cuts.append(y)
            last_cut = y
    return cuts


def auto_split_image(img_path, diff_thresh=40, min_gap=2000, peak_thresh=0.6):
    """
    Automatically split a long image based on background color change along the Y-axis.

    A per-row signal is built from (a) the change in mean LAB color between
    consecutive rows and (b) the mean vertical Sobel response of each row.
    Rows where the combined, normalized signal is a local peak above
    ``peak_thresh`` become cut positions.

    Cropped images are saved to Cropped_Images/{img_name}/ and named
    {img_name}-A1.jpg, {img_name}-A2.jpg, ...  Crops of 50 rows or fewer are
    skipped. When no boundary is found the whole image is saved as a single crop.

    Args:
        img_path: Path of the image to split.
        diff_thresh: Unused; kept only so existing callers keep working.
        min_gap: Minimum number of rows between two cuts (and between the top
            of the image and the first cut).
        peak_thresh: Threshold in [0, 1] that the combined signal must exceed
            for a row to be considered a cut.

    Returns:
        The output folder path, or None when the image cannot be read.
    """
    # ==== Load original image ====
    img = cv2.imread(img_path)
    if img is None:
        print(f"[!] Cannot read image: {img_path}")
        return None
    h, w = img.shape[:2]
    print(f"[INFO] Original image: {w}x{h}")

    # ==== Smooth image to reduce text noise ====
    blur = cv2.GaussianBlur(img, (15, 15), 0)

    # ==== Row-to-row color change in LAB space ====
    lab = cv2.cvtColor(blur, cv2.COLOR_BGR2LAB)
    mean_color = lab.mean(axis=1)  # average across width -> one color per row
    diff = np.abs(np.diff(mean_color, axis=0)).mean(axis=1)  # length h - 1

    # ==== Normalize and smooth ====
    # ksize is (width, height): 1 wide, 21 tall, so the blur runs along the
    # rows of the (N, 1) column vector.
    diff = cv2.GaussianBlur(diff.reshape(-1, 1), (1, 21), 0).flatten()
    diff_norm = (diff - diff.min()) / (diff.max() - diff.min() + 1e-8)

    # ==== Compute edge strength using Sobel operator (vertical derivative) ====
    gray = cv2.cvtColor(blur, cv2.COLOR_BGR2GRAY)
    sobel = cv2.Sobel(gray, cv2.CV_64F, 0, 1, ksize=3)
    edge_strength = np.mean(np.abs(sobel), axis=1)
    edge_strength = (edge_strength - edge_strength.min()) / (
        edge_strength.max() - edge_strength.min() + 1e-8
    )

    # ==== Align lengths (diff has one element fewer than the row count) ====
    min_len = min(len(edge_strength), len(diff_norm))
    edge_strength = edge_strength[:min_len]
    diff_norm = diff_norm[:min_len]

    # ==== Combine both signals ====
    signal = 0.7 * diff_norm + 0.3 * edge_strength

    # ==== Find cutting boundaries ====
    boundaries = _find_cut_rows(signal, peak_thresh, min_gap)
    if not boundaries:
        print("[!] No clear boundaries detected, image not split.")

    # ==== Crop and save ====
    base_name = os.path.splitext(os.path.basename(img_path))[0]
    save_dir = f"Cropped_Images/{base_name}"
    os.makedirs(save_dir, exist_ok=True)

    prev_y = 0
    saved = 0
    # The image height closes the final segment (or the only one when no
    # boundary was found).
    for y in boundaries + [h]:
        crop = img[prev_y:y, :]
        prev_y = y
        if crop.shape[0] <= 50:  # ignore very small crops
            continue
        save_path = os.path.join(save_dir, f"{base_name}-A{saved + 1}.jpg")
        if not cv2.imwrite(save_path, crop):
            print(f"[!] Failed to write: {save_path}")
            continue
        print(f"[+] Saved: {save_path} ({crop.shape[1]}x{crop.shape[0]})")
        saved += 1

    print(f"[DONE] Saved {saved} cropped images to folder: {save_dir}")
    return save_dir


# Pool of DocStrange extractors; the OCR helpers pick one at random per call
# so requests can be spread over several API keys.
#
# Keys are taken from the DOCSTRANGE_API_KEYS environment variable
# (comma-separated). When it is unset or empty, the previous behavior is
# kept: four extractors that all share the fallback key below.
#
# SECURITY NOTE(review): the fallback key is committed in plain text and was
# previously repeated four times, so the "multiple keys" pool never used more
# than one key. Rotate this key and supply real keys via the environment.
_FALLBACK_API_KEY = "7138f093-9836-11f0-a5f8-6abdbfc750c5"

_api_keys = [
    key.strip()
    for key in os.environ.get("DOCSTRANGE_API_KEYS", "").split(",")
    if key.strip()
] or [_FALLBACK_API_KEY] * 4

extractors_list = [
    DocumentExtractor(preserve_layout=True, api_key=key) for key in _api_keys
]

# Keep the historical module-level names; they cycle through the pool when
# fewer than four keys are configured.
extractor1, extractor2, extractor3, extractor4 = (
    extractors_list[i % len(extractors_list)] for i in range(4)
)


def docstrange_ocr_short(img_path, output_path):
    """
    OCR a single image with DocStrange and save the text as a .txt file.

    Detected objects are first painted over by ``remove_objects_and_save``,
    which overwrites ``img_path`` on disk. The recognized text is written to
    ``{output_path}/{image name without extension}.txt``, replacing any
    existing file of that name.

    Args:
        img_path: Path of the image to OCR (modified in place).
        output_path: Folder for the result; created if it does not exist.
    """
    extractor = random.choice(extractors_list)
    remove_objects_and_save(img_path)
    result = extractor.extract(img_path)

    schema = {"text_top_to_bottom": "string\nstring"}

    os.makedirs(output_path, exist_ok=True)
    output_text = result.extract_data(json_schema=schema)["structured_data"]["text_top_to_bottom"]

    stem = os.path.splitext(os.path.basename(img_path))[0]  # image name without extension
    txt_path = os.path.join(output_path, stem + ".txt")

    # Explicit UTF-8 (as docstrange_ocr_long does): the platform default
    # encoding may be unable to represent non-ASCII OCR text.
    with open(txt_path, mode="w", encoding="utf-8") as f:
        f.write(output_text)


def _natural_sort_key(path):
    """Sort key ordering digit runs in the file name numerically (-A2 before -A10)."""
    import re  # local import: `re` is not imported at file level

    # Splitting on a capturing group alternates text / digits / text ..., so
    # odd positions are always digit runs and types line up across keys.
    parts = re.split(r"(\d+)", os.path.basename(path))
    key = [int(p) if i % 2 else p.lower() for i, p in enumerate(parts)]
    return key, path  # full path breaks ties deterministically


def docstrange_ocr_long(long_img_path, output_path):
    """
    OCR every image in a folder of crops and combine the text into one file.

    Images (*.png, *.jpg, *.jpeg, *.webp) are processed in natural name order,
    so ``name-A2.jpg`` comes before ``name-A10.jpg``. Each image is first passed
    through ``remove_objects_and_save`` (which overwrites it on disk), then
    OCR'd; each result is written followed by a newline.

    The output file is ``{output_path}/{folder name}.txt``. It is rewritten
    from scratch on every call, so re-running does not duplicate text.

    Args:
        long_img_path: Folder containing the cropped images.
        output_path: Folder for the result; created if it does not exist.
    """
    extractor = random.choice(extractors_list)
    schema = {"text_top_to_bottom": "string\nstring"}

    # normpath drops a trailing separator so basename is never empty.
    folder_name = os.path.basename(os.path.normpath(long_img_path))

    # Create output folder if not exist
    os.makedirs(output_path, exist_ok=True)
    output_file = os.path.join(output_path, folder_name + ".txt")

    # Get all valid images in folder
    img_files = []
    for pattern in ("*.png", "*.jpg", "*.jpeg", "*.webp"):
        img_files.extend(glob.glob(os.path.join(long_img_path, pattern)))
    img_files.sort(key=_natural_sort_key)

    if not img_files:
        print(f"[!] No images found in: {long_img_path}")
        return

    # Open once in write mode: truncates stale output from earlier runs and
    # avoids reopening the file for every crop.
    with open(output_file, mode="w", encoding="utf-8") as f:
        for img_path in img_files:
            remove_objects_and_save(img_path)
            result = extractor.extract(img_path)
            output_text = result.extract_data(json_schema=schema)["structured_data"]["text_top_to_bottom"]
            f.write(output_text + "\n")

    print(f"[DONE] Saved combined OCR text to: {output_file}")


# ===============================================================
# Example main script: assign paths and run functions
# ===============================================================
if __name__ == "__main__":
    # Example driver: runs the four helpers in sequence on one image.
    # The steps are order-dependent because several of them modify files on disk.

    # Placeholder inputs - replace with real paths before running.
    # NOTE(review): the sample path has no file extension; confirm that
    # cv2.imwrite can pick an encoder for it before relying on Example 1.
    img_path = r"\images\no_bill"          # Path to input image
    output_path = r"OCR_Results"             # Folder to save OCR results

    # Example 1: remove objects and overwrite the image
    # (the file at img_path is replaced in place).
    remove_objects_and_save(img_path)

    # Example 2: split long image automatically
    # Returns the folder holding the crops, or None if the image cannot be read.
    cropped_folder = auto_split_image(img_path)

    # Example 3: OCR for a single short image
    # docstrange_ocr_short() calls remove_objects_and_save() again internally.
    docstrange_ocr_short(img_path, output_path)

    # Example 4: OCR for long image that has been split
    # The output .txt is named after the crop folder, which is the image's
    # base name - the same file name Example 3 writes to.
    if cropped_folder:
        docstrange_ocr_long(cropped_folder, output_path)
