In [None]:
from paddleocr import PaddleOCR  

# Build the OCR pipeline with all three optional pre-processing models turned off.
ocr = PaddleOCR(
    use_doc_orientation_classify=False,  # no document orientation classification model
    use_doc_unwarping=False,             # no text image rectification model
    use_textline_orientation=False,      # no text line orientation classification model
)

# Alternative configurations (uncomment one to try it):
#
# English model, selected through the language parameter:
#   ocr = PaddleOCR(lang="en")
#
# Another PP-OCR generation, selected through the version parameter:
#   ocr = PaddleOCR(ocr_version="PP-OCRv4")
#
# GPU inference, selected through the device parameter:
#   ocr = PaddleOCR(device="gpu")
#
# PP-OCRv5 mobile detection/recognition models:
#   ocr = PaddleOCR(
#       text_detection_model_name="PP-OCRv5_mobile_det",
#       text_recognition_model_name="PP-OCRv5_mobile_rec",
#       use_doc_orientation_classify=False,
#       use_doc_unwarping=False,
#       use_textline_orientation=False,
#   )

# Run OCR on the sample image; `result` is reused by the cells below.
result = ocr.predict("../data/input/sd19.jpg")

# Print each result and persist its visualisation and JSON dump.
for page_result in result:
    page_result.print()
    page_result.save_to_img("../data/output")
    page_result.save_to_json("../data/output")

In [None]:
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from PIL import Image
import numpy as np

def plot_paddleocr_bounding_boxes(image_path, ocr_results):
    """Show the image with every detected text polygon outlined in red.

    Args:
        image_path: Path of the image that was passed to the OCR pipeline.
        ocr_results: Iterable of OCR results; each one is indexed with
            ``'dt_polys'`` to obtain its detection polygons.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Open the image through a context manager so the file handle is released
    # once the pixels have been handed to the axes.
    with Image.open(image_path) as image:
        fig, ax = plt.subplots(1, figsize=(15, 10))
        ax.imshow(image)

    for res in ocr_results:
        # Each entry of 'dt_polys' is one detection polygon (sequence of points).
        for poly in res['dt_polys']:
            outline = patches.Polygon(
                np.array(poly),
                linewidth=2,
                edgecolor='red',
                facecolor='none',
                alpha=0.8,
            )
            ax.add_patch(outline)

    ax.set_title("PaddleOCR Detection and Recognition Results", fontsize=16)
    ax.axis('off')
    plt.tight_layout()
    plt.show()

# Overlay the detection polygons on the source image
plot_paddleocr_bounding_boxes(
    image_path="../data/input/sd19.jpg",
    ocr_results=result,
)

In [None]:
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from PIL import Image
import numpy as np
import cv2
import os

def extract_polygon_sections(image_path, ocr_results):
    """Cut every detected text polygon out of the image as an RGBA crop.

    For each detection polygon the bounding rectangle is cropped from the
    image, and pixels outside the polygon are made transparent through the
    alpha channel.

    Args:
        image_path: Path of the image that was passed to the OCR pipeline.
        ocr_results: Iterable of OCR results. Each one is indexed with
            ``'dt_polys'`` and queried with ``.get('rec_texts', [])``.

    Returns:
        A list of dicts with the keys ``'image'`` (RGBA ``PIL.Image``),
        ``'polygon'`` (int32 array of the polygon points), ``'text'`` (the
        recognised text at the same index, or ``""``) and ``'section'`` (a
        generated ``.png`` name). Polygons that lie completely outside the
        image are skipped.

    Raises:
        FileNotFoundError: If OpenCV cannot read ``image_path``.
    """
    image_cv = cv2.imread(image_path)
    if image_cv is None:
        # cv2.imread reports failure by returning None instead of raising.
        raise FileNotFoundError(f"Image is missing or unreadable: {image_path}")
    img_h, img_w = image_cv.shape[:2]

    extracted_images = []

    for res_idx, res in enumerate(ocr_results):
        dt_polys = res['dt_polys']
        # NOTE(review): texts are paired with polygons purely by index; this
        # assumes 'rec_texts' is aligned with 'dt_polys' - confirm against the
        # PaddleOCR result schema.
        rec_texts = res.get('rec_texts', [])

        for i, poly in enumerate(dt_polys):
            poly = np.array(poly, dtype=np.int32)

            # Axis-aligned bounding rectangle, clamped to the image so the crop
            # and the mask always have the same shape.
            x, y, w, h = cv2.boundingRect(poly)
            x0, y0 = max(x, 0), max(y, 0)
            x1, y1 = min(x + w, img_w), min(y + h, img_h)
            if x1 <= x0 or y1 <= y0:
                # Polygon lies entirely outside the image: nothing to crop.
                continue

            rect_crop = image_cv[y0:y1, x0:x1]

            # Rasterise the polygon in crop-local coordinates; the offset is
            # built as int32 so the shifted points keep the int32 dtype.
            mask = np.zeros((y1 - y0, x1 - x0), dtype=np.uint8)
            poly_shifted = poly - np.array([x0, y0], dtype=np.int32)
            cv2.fillPoly(mask, [poly_shifted], 255)

            # BGR -> RGBA, then use the polygon mask as the alpha channel.
            rect_crop_rgba = cv2.cvtColor(rect_crop, cv2.COLOR_BGR2RGBA)
            rect_crop_rgba[:, :, 3] = mask

            # The array is already in RGBA order; converting it a second time
            # would swap the red and blue channels.
            pil_image = Image.fromarray(rect_crop_rgba)

            raw_text = rec_texts[i] if rec_texts and i < len(rec_texts) else ""

            # Keep only filename-friendly characters and cap the length.
            text_part = "".join(
                c for c in raw_text if c.isalnum() or c in (' ', '-', '_')
            ).strip()
            text_part = text_part.replace(' ', '_')[:20]

            section = f"section_{res_idx}_{i:03d}_{text_part}.png"

            extracted_images.append({
                'image': pil_image,
                'polygon': poly,
                'text': raw_text,
                'section': section
            })

    return extracted_images

extracted_sections = extract_polygon_sections(
    image_path="../data/input/sd19.jpg",
    ocr_results=result,
)

# Show every crop together with its generated name and the recognised text
for section_info in extracted_sections:
    section_name = section_info['section']
    recognised_text = section_info['text']
    print(f"Section: {section_name}, Text: {recognised_text}")

    # Display the crop without axes
    plt.imshow(section_info['image'])
    plt.axis('off')
    plt.show()

In [None]:
# Now let's compare the text extraction using a different model
from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info

# Placement used for the model weights (CPU only)
device_map = "cpu"

# Load the Qwen2.5-VL checkpoint, keeping downloaded files under ../models
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2.5-VL-3B-Instruct",
    dtype="auto",
    device_map=device_map,
    cache_dir="../models",
)

In [None]:
def qwen_extraction(img, processor, prompt="Extract the text from the image.", max_new_tokens=128):
    """Ask the Qwen2.5-VL model to read the text in a single image.

    Uses the module-level ``model`` loaded in an earlier cell.

    Args:
        img: Image placed in the chat message under the ``"image"`` key.
        processor: Processor used to build the chat prompt, encode the inputs
            and decode the generated tokens.
        prompt: Instruction sent alongside the image.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded model answer for the image, as a string.
    """
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": img,
                },
                {"type": "text", "text": prompt},
            ],
        }
    ]

    # Preparation for inference
    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Move the inputs to wherever the model actually lives. The device_map
    # setting is not necessarily a device name (e.g. "auto"), so it must not be
    # passed to .to() directly.
    inputs = inputs.to(model.device)

    # Inference: generation of the output
    generated_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
    # Drop the prompt tokens so only the newly generated answer is decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
    return output_text[0]


# Default processor for the same checkpoint as the model
processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-3B-Instruct")

# Compare the PaddleOCR text with Qwen's reading of the same crop
for section_info in extracted_sections:
    crop = section_info['image']
    qwen_text = qwen_extraction(crop, processor)

    paddle_text = section_info['text']
    print(f"Text: {paddle_text}")
    print(f"Qwen Extracted Text: {qwen_text}")

    # Display the crop that both models were given
    plt.imshow(crop)
    plt.axis('off')
    plt.show()