In [87]:
import os

import cv2
import numpy as np
import pytesseract
from pdf2image import convert_from_path

# Poppler location used when the caller does not supply one. This is the
# machine-specific path the notebook was originally written against.
_DEFAULT_POPPLER_PATH = r"C:/Users/waqqa/Downloads/Release-24.07.0-0/poppler-24.07.0/Library/bin"


def convert_pdf_to_images(pdf_path, poppler_path=_DEFAULT_POPPLER_PATH):
    """Render every page of a PDF via ``pdf2image.convert_from_path``.

    Args:
        pdf_path: Path of the PDF file to render.
        poppler_path: Directory containing the Poppler binaries. Defaults to
            the location used by the original notebook so existing calls keep
            working; pass another directory on a different machine.
            NOTE(review): pdf2image documents ``None`` as "find Poppler on
            PATH" -- confirm against the installed version.

    Returns:
        Whatever ``convert_from_path`` returns; the calling cell iterates over
        it and converts each item with ``np.array``.
    """
    return convert_from_path(pdf_path, poppler_path=poppler_path)

def preprocess_image(image):
    """Turn a BGR image into a Canny edge map.

    Args:
        image: Image array in OpenCV's BGR channel order.

    Returns:
        The result of ``cv2.Canny`` applied to the grayscale version of
        ``image`` with thresholds 50 and 150.
    """
    lower_threshold, upper_threshold = 50, 150
    grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    return cv2.Canny(grayscale, lower_threshold, upper_threshold)

def detect_lines(edges):
    """Find long line segments in an edge map with ``cv2.HoughLinesP``.

    Args:
        edges: Edge image, e.g. the output of ``preprocess_image``.

    Returns:
        The raw ``cv2.HoughLinesP`` result. Downstream code in this notebook
        reads each entry as ``line[0] == (x1, y1, x2, y2)`` and also handles a
        ``None`` result.
    """
    rho = 2                 # second positional argument of HoughLinesP
    theta = np.pi / 180     # third positional argument: one degree in radians
    hough_options = {"threshold": 100, "minLineLength": 1000, "maxLineGap": 10}
    return cv2.HoughLinesP(edges, rho, theta, **hough_options)

def merge_close_lines(lines, distance_threshold=10):
    """Greedily merge line segments whose endpoints nearly coincide.

    Segments are visited in order. Each not-yet-grouped segment becomes the
    seed of a new group and absorbs every later, not-yet-grouped segment whose
    start point is within ``distance_threshold`` of the seed's start point OR
    whose end point is within ``distance_threshold`` of the seed's end point.
    Candidates are only compared against the seed, so grouping is not
    transitive.

    Each group is replaced by its axis-aligned bounding extent
    ``(min_x, min_y, max_x, max_y)``. This also applies to single-segment
    groups, so every returned segment runs from its top-left to its
    bottom-right corner; the direction/slope sign of diagonal segments is not
    preserved.

    Args:
        lines: ``None``, an empty sequence, or segments laid out as
            ``lines[i][0] == (x1, y1, x2, y2)`` (the layout this notebook gets
            from ``cv2.HoughLinesP``).
        distance_threshold: Maximum endpoint distance, in pixels, for a
            segment to join a seed's group.

    Returns:
        ``np.ndarray`` of shape ``(n_groups, 1, 4)``. For missing or empty
        input an empty array of shape ``(0, 1, 4)`` is returned so the result
        always has the same rank.
    """
    if lines is None or len(lines) == 0:
        return np.empty((0, 1, 4), dtype=np.int32)

    segments = np.asarray(lines).reshape(-1, 4)
    # Distances are computed in float64 so squared pixel offsets cannot
    # overflow a narrow integer dtype such as int32.
    points = segments.astype(np.float64)
    grouped = np.zeros(len(segments), dtype=bool)
    merged_lines = []

    for seed in range(len(segments)):
        if grouped[seed]:
            continue

        start_gap = np.hypot(points[:, 0] - points[seed, 0], points[:, 1] - points[seed, 1])
        end_gap = np.hypot(points[:, 2] - points[seed, 2], points[:, 3] - points[seed, 3])

        # Every index below `seed` is already grouped at this point, so
        # masking out grouped entries restricts candidates to later segments.
        members = (np.minimum(start_gap, end_gap) < distance_threshold) & ~grouped
        members[seed] = True  # the seed belongs to its own group regardless of threshold
        grouped |= members

        xs = segments[members][:, [0, 2]]
        ys = segments[members][:, [1, 3]]
        merged_lines.append([[xs.min(), ys.min(), xs.max(), ys.max()]])

    return np.array(merged_lines, dtype=segments.dtype)

def draw_lines(image, lines):
    """Overlay line segments on ``image`` in green and return the image.

    Args:
        image: Image passed to ``cv2.line`` as the drawing target; this
            function does not copy it.
        lines: ``None`` (nothing is drawn) or an iterable whose entries hold
            the segment as ``entry[0] == (x1, y1, x2, y2)``.

    Returns:
        The same ``image`` object that was passed in.
    """
    if lines is None:
        return image

    green = (0, 255, 0)
    thickness = 2
    for entry in lines:
        start_x, start_y, end_x, end_y = entry[0]
        cv2.line(image, (start_x, start_y), (end_x, end_y), green, thickness)
    return image

def save_image(image, file_path):
    """Write ``image`` to ``file_path`` with ``cv2.imwrite`` and report the outcome.

    ``cv2.imwrite`` signals failure through its boolean return value, so the
    result is checked and the success message is printed only when the write
    actually succeeded.

    Args:
        image: Image array to write.
        file_path: Destination path; the file extension selects the format.
    """
    written = cv2.imwrite(file_path, image)
    if written:
        print(f"Image saved to {file_path}")
    else:
        print(f"Failed to save image to {file_path}")

def save_intermediate_image(image, lines, step_name):
    """Save a debug rendering of ``lines`` drawn over a copy of ``image``.

    Args:
        image: Source image; it is copied first, so the original is not drawn on.
        lines: Segments forwarded to ``draw_lines``.
        step_name: Prefix of the output file; the image is written to
            ``"<step_name>_with_lines.png"``.

    Returns:
        The annotated copy of the image.
    """
    annotated = draw_lines(image.copy(), lines)
    save_image(annotated, f"{step_name}_with_lines.png")
    return annotated

def find_quadruple_layout(lines, image_shape):
    """Check whether the detected lines form a 2x2 ("quadruple") page layout.

    A segment counts as vertical when its x-coordinates differ by fewer than
    10 pixels, otherwise as horizontal when its y-coordinates differ by fewer
    than 10 pixels; anything else is ignored. The layout is accepted only when
    there is exactly one vertical and exactly three horizontal segments.

    Args:
        lines: ``None``, or segments laid out as
            ``lines[i][0] == (x1, y1, x2, y2)``. ``None`` is treated as
            "no lines" instead of raising ``TypeError``.
        image_shape: Currently unused; kept so existing calls stay valid.

    Returns:
        ``(vertical_lines, horizontal_lines)`` as lists of
        ``(x1, y1, x2, y2)`` tuples of plain Python ints when the layout
        matches, otherwise ``(None, None)``.
    """
    vertical_lines = []
    horizontal_lines = []

    # Separate lines into vertical and horizontal based on their orientation
    for line in (lines if lines is not None else []):
        # Plain ints keep the printed/returned tuples independent of NumPy's
        # scalar types.
        x1, y1, x2, y2 = (int(value) for value in line[0])
        if abs(x1 - x2) < 10:  # Vertical line
            vertical_lines.append((x1, y1, x2, y2))
        elif abs(y1 - y2) < 10:  # Horizontal line
            horizontal_lines.append((x1, y1, x2, y2))
    print(vertical_lines,horizontal_lines)

    # Check if we have the correct number of lines for a quadruple layout
    if len(vertical_lines) == 1 and len(horizontal_lines) == 3:
        return vertical_lines, horizontal_lines
    return None, None

def extract_subpage_coordinates(image, vertical_lines, horizontal_lines, debug_path='check.png'):
    """Crop the four quadrants of a 2x2 page layout out of ``image``.

    The quadrants are bounded by:
      * left / right  -- the smallest / largest x-coordinate of any horizontal line,
      * top / bottom  -- the smallest / largest y-coordinate of any vertical line,
      * the x-coordinate of the first vertical line (left/right split),
      * the y-coordinate of the second horizontal line after sorting the
        de-duplicated ``(y, x)`` start points (top/bottom split).

    Args:
        image: Page image indexed as ``image[y, x]``. It is not modified.
        vertical_lines: Sequence of ``(x1, y1, x2, y2)`` tuples.
        horizontal_lines: Sequence of ``(x1, y1, x2, y2)`` tuples.
        debug_path: Where to write a debug rendering (a green line joining the
            start points of the first and last horizontal line, drawn on a
            copy of ``image``). Pass ``None`` to skip the debug output.

    Returns:
        List of four crops (slices of ``image``) in the order top-left,
        top-right, bottom-left, bottom-right.

    Raises:
        ValueError: If there is no vertical line or fewer than two distinct
            horizontal start points.
    """
    # Remove duplicates and sort the horizontal lines top-to-bottom as (y, x).
    # The original tuples are kept because their x2 values define the right edge.
    horizontal_segments = list(horizontal_lines)
    horizontal_lines = sorted(set((y1, x1) for x1, y1, _, _ in horizontal_segments))
    print(vertical_lines)
    print('--------------',horizontal_lines)

    # Validate before indexing so a short input raises ValueError, not IndexError.
    if len(vertical_lines) < 1 or len(horizontal_lines) < 2:
        raise ValueError("Not enough lines to define subpages")

    if debug_path is not None:
        # Draw on a copy: the debug stroke must not end up inside the crops.
        debug_image = cv2.line(
            image.copy(),
            (horizontal_lines[0][1], horizontal_lines[0][0]),
            (horizontal_lines[-1][1], horizontal_lines[-1][0]),
            (0, 255, 0),
            2,
        )  # Drawing in green color with thickness of 2
        cv2.imwrite(debug_path, debug_image)

    # Define image boundaries. Horizontal extent comes from the x-coordinates
    # of the horizontal lines, vertical extent from the y-coordinates of the
    # vertical lines.
    left = min(min(x1, x2) for x1, _, x2, _ in horizontal_segments)
    right = max(max(x1, x2) for x1, _, x2, _ in horizontal_segments)
    top = min(min(y1, y2) for _, y1, _, y2 in vertical_lines)
    bottom = max(max(y1, y2) for _, y1, _, y2 in vertical_lines)
    print(left, right, top ,bottom)

    split_x = vertical_lines[0][0]
    split_y = horizontal_lines[1][0]

    # Define the subpage coordinates
    subpage_coordinates = [
        (left, top, split_x, split_y),  # Top-left quadrant
        (split_x, top, right, split_y),  # Top-right quadrant
        (left, split_y, split_x, bottom),  # Bottom-left quadrant
        (split_x, split_y, right, bottom)  # Bottom-right quadrant
    ]
    print(subpage_coordinates)

    # Extract subpages from the image
    return [image[y1:y2, x1:x2] for x1, y1, x2, y2 in subpage_coordinates]



def extract_text_from_subpages(subpages):
    """OCR each subpage with Tesseract, print the text and return it.

    Args:
        subpages: Iterable of images accepted by
            ``pytesseract.image_to_string``.

    Returns:
        List with one OCR string per subpage, in input order. Returning the
        text lets callers post-process it instead of only reading the printed
        output.
    """
    texts = []
    for idx, subpage in enumerate(subpages):
        # '--psm 6' is passed through to Tesseract unchanged.
        text = pytesseract.image_to_string(subpage, config='--psm 6')
        print(f"Text from subpage {idx + 1}:\n{text}\n")
        texts.append(text)
    return texts




In [90]:
# Driver cell: render each PDF page, detect ruling lines, and OCR the four
# quadrants of every page that has a 2x2 ("quadruple") layout.
pdf_path = 'Document/9. EXAM_T-1059-22_20230519_HOSSEINI_79512_CONDENSED.pdf'
pytesseract.pytesseract.tesseract_cmd = r'C:/Program Files/Tesseract-OCR/tesseract.exe'

# cv2.imwrite reports failure via its return value rather than raising, so a
# missing output folder would silently produce no files. Create the folders
# up front.
os.makedirs('intermediate_state', exist_ok=True)
os.makedirs('subpages', exist_ok=True)

images = convert_pdf_to_images(pdf_path)

for i, image in enumerate(images):
    print("-----------------------------")
    # Convert PIL image to OpenCV format
    image_cv = np.array(image)
    image_cv = cv2.cvtColor(image_cv, cv2.COLOR_RGB2BGR)

    # Preprocess image to detect edges
    edges = preprocess_image(image_cv)

    # Detect lines from edges, then collapse near-duplicate detections
    lines = detect_lines(edges)
    lines = merge_close_lines(lines)

    # Save intermediate image with detected lines (drawn on a copy of the page)
    save_intermediate_image(image_cv, lines, f"intermediate_state/page_{i+1}_edges")

    # Check for quadruple layout
    vertical_lines, horizontal_lines = find_quadruple_layout(lines, image_cv.shape)

    if vertical_lines and horizontal_lines:
        # Crop from the page image, not the annotated copy: the green debug
        # lines must not be part of what is saved and OCR'd.
        subpages = extract_subpage_coordinates(image_cv, vertical_lines, horizontal_lines)
        for j, subpage in enumerate(subpages):
            cv2.imwrite(f'subpages/subpage_{i+1}_{j+1}.png', subpage)
        # Extract and print text from each subpage
        extract_text_from_subpages(subpages)
    else:
        print(f"No quadruple layout detected on page {i+1}.")

-----------------------------
Image saved to intermediate_state/page_1_edges_with_lines.png
[] [(100, 158, 1599, 160), (100, 2001, 1599, 2003)]
No quadruple layout detected on page 1.
-----------------------------
Image saved to intermediate_state/page_2_edges_with_lines.png
[] [(100, 158, 1599, 160), (100, 2001, 1599, 2003)]
No quadruple layout detected on page 2.
-----------------------------
Image saved to intermediate_state/page_3_edges_with_lines.png
[] [(100, 2001, 1599, 2003), (100, 158, 1599, 160)]
No quadruple layout detected on page 3.
-----------------------------
Image saved to intermediate_state/page_4_edges_with_lines.png
[] [(100, 158, 1599, 160), (100, 2001, 1599, 2003)]
No quadruple layout detected on page 4.
-----------------------------
Image saved to intermediate_state/page_5_edges_with_lines.png
[(849, 158, 851, 1952)] [(100, 1950, 1599, 1952), (101, 1054, 1598, 1054), (100, 158, 1599, 160)]
[(849, 158, 851, 1952)]
-------------- [(158, 100), (1054, 101), (1950, 10