In [4]:
import cv2
import pytesseract
import numpy as np
from collections import defaultdict

# Words are kept only when their Tesseract confidence is strictly above this.
MIN_CONFIDENCE = 60


def parse_confidence(raw):
    """Return an OCR confidence value as an int, or -1 if it is unusable.

    Depending on the pytesseract/Tesseract version, confidences can arrive as
    ints, floats, or strings such as ``'96.5'``; a plain ``int('96.5')``
    raises ``ValueError``. Going through ``float`` first accepts all of these
    while keeping the truncating behaviour of ``int``.

    Args:
        raw: One entry of the ``conf`` column returned by ``image_to_data``.

    Returns:
        The confidence truncated to an int, or -1 when *raw* cannot be
        interpreted as a number (so the word is rejected by the threshold).
    """
    try:
        return int(float(raw))
    except (TypeError, ValueError, OverflowError):
        return -1


def group_words(ocr_data, min_conf=MIN_CONFIDENCE):
    """Group confidently recognised words by block, paragraph and line.

    Args:
        ocr_data: Dict of equal-length columns as produced by
            ``pytesseract.image_to_data(..., output_type=Output.DICT)``.
            The columns read are ``text``, ``conf``, ``block_num``,
            ``par_num``, ``line_num``, ``left``, ``top``, ``width`` and
            ``height``.
        min_conf: A word is kept only if its confidence is strictly greater
            than this value.

    Returns:
        A ``(blocks, boxes)`` tuple. ``blocks[block][par][line]`` is a list of
        ``{'text', 'left', 'top'}`` dicts in OCR order; ``boxes`` is a list of
        ``(left, top, width, height)`` tuples for the same kept words.
    """
    blocks = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
    boxes = []
    rows = zip(
        ocr_data['text'],
        ocr_data['conf'],
        ocr_data['block_num'],
        ocr_data['par_num'],
        ocr_data['line_num'],
        ocr_data['left'],
        ocr_data['top'],
        ocr_data['width'],
        ocr_data['height'],
    )
    for raw_text, conf, block_num, par_num, line_num, left, top, width, height in rows:
        text = raw_text.strip()
        # Skip empty rows before looking at the confidence at all.
        if not text or parse_confidence(conf) <= min_conf:
            continue
        blocks[block_num][par_num][line_num].append(
            {'text': text, 'left': left, 'top': top}
        )
        boxes.append((left, top, width, height))
    return blocks, boxes


def render_blocks(blocks):
    """Rebuild plain text from the nested structure made by ``group_words``.

    Blocks, paragraphs and lines are emitted in ascending key order, and the
    words of each line are ordered left to right. Every line ends with a
    newline; one extra newline follows each paragraph and one more follows
    each block.

    Args:
        blocks: Mapping of ``block -> paragraph -> line -> list of word
            dicts`` where each word dict has ``'text'`` and ``'left'`` keys.

    Returns:
        The reconstructed text (not stripped).
    """
    parts = []
    for block in sorted(blocks):
        paragraphs = blocks[block]
        for par in sorted(paragraphs):
            lines = paragraphs[par]
            for line in sorted(lines):
                ordered = sorted(lines[line], key=lambda word: word['left'])
                parts.append(" ".join(word['text'] for word in ordered) + "\n")
            parts.append("\n")  # Paragraph break
        parts.append("\n")      # Block break
    return "".join(parts)


# In a notebook or a direct script run __name__ is "__main__", so the names
# assigned below (image, ocr_data, blocks, structured_text, ...) are still
# module-level globals; the guard only keeps the helpers importable without
# triggering file and OCR work.
if __name__ == "__main__":
    # Set tesseract path if needed
    pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

    # Load image
    image_path = 'image.jpg'
    image = cv2.imread(image_path)
    # cv2.imread signals failure by returning None instead of raising.
    if image is None:
        raise FileNotFoundError("Could not read image: {!r}".format(image_path))
    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # Use Tesseract OCR with layout info
    ocr_data = pytesseract.image_to_data(image_rgb, output_type=pytesseract.Output.DICT)

    # Collect the accepted words and draw their bounding boxes
    blocks, word_boxes = group_words(ocr_data)
    for x, y, w, h in word_boxes:
        cv2.rectangle(image_rgb, (x, y), (x + w, y + h), (0, 255, 0), 1)

    # Reconstruct the text in block-column order
    structured_text = render_blocks(blocks)

    # Save output text and annotated image
    print("== Structured Extracted Text ==")
    print(structured_text.strip())

    output_path = "newspaper_layout_text_boxes.jpg"
    # cv2.imwrite reports failure through its boolean return value.
    if not cv2.imwrite(output_path, cv2.cvtColor(image_rgb, cv2.COLOR_RGB2BGR)):
        raise OSError("Could not write annotated image: {!r}".format(output_path))


== Structured Extracted Text ==
WORLD MENTAL HEALTH MONTH


Addressing mental health can
reduce school violence


DHLAMINI


MENTAL Health is a serious challenge
globally. Many people don’t even
Know they suffer from it and those
who do feel like they have nowhere
to turn to.

‘Mental issues can manifest them-
selves in different ways. The most
notable is depression, but bullying
and violent outbursts are also signs of
‘a mental health challenge.

The South African Depression and
Anxiety Group (SADAG) says its help-
lines have received more than 466 400
calls since January 2021, with one in
every five calls being suicide related.
‘The organisation gets about 1 800 to
2.200 calls a day.

World Mental Health Day is
marked every year on October 10 to
raise awareness about mental health
and well-being around the world and
to organise efforts to assist those expe-
riencing mental health issues.

Since 2013, the World Health
Organization (WHO) has mobilised a
worldwide campaign for World Men

True