# Text Extraction from a Scanned PDF File Using OCR


## Splitting the images into parts
While researching, I found that Tesseract performs better on small chunks of text, which is why I chose to split the data (each page image) into chunks.

In [3]:
import os
import cv2
from PIL import Image
import pytesseract
from pytesseract import Output
import numpy as np

def _find_split_rows(tops, heights, texts, split_gap, max_points=2):
    """Return up to ``max_points`` y-coordinates at which the page can be cut.

    Each box top is paired with its own height *before* sorting, so the bottom
    edge used for a gap always belongs to the same box as the top. Rows whose
    text is empty are skipped: Tesseract's tabular output also contains layout
    rows (page, block, paragraph, line) that carry no text and would otherwise
    cover the whole page.

    A split row is the top of the first text box that starts more than
    ``split_gap`` pixels below the lowest bottom edge seen so far.
    """
    boxes = sorted(
        (top, height)
        for top, height, text in zip(tops, heights, texts)
        if str(text).strip()
    )

    split_rows = []
    if not boxes:
        return split_rows

    # Lowest bottom edge of all boxes processed so far; using the running
    # maximum keeps a tall box from being "forgotten" by a shorter neighbour.
    covered_bottom = boxes[0][0] + boxes[0][1]
    for top, height in boxes[1:]:
        if top - covered_bottom > split_gap:
            split_rows.append(top)
            if len(split_rows) == max_points:
                break
        covered_bottom = max(covered_bottom, top + height)
    return split_rows


def split_image_with_ocr(input_folder, output_folder, split_gap=20, lang=None):
    """Split every page image in ``input_folder`` into top/middle/bottom parts.

    For each ``.png``/``.jpg`` file (extension matched case-insensitively) the
    page is run through ``pytesseract.image_to_data`` to obtain text bounding
    boxes. The first two vertical gaps wider than ``split_gap`` pixels become
    the cut rows, and the three resulting strips are written to
    ``output_folder`` as ``<name>_top.png``, ``<name>_middle.png`` and
    ``<name>_bottom.png``.

    Images that cannot be loaded, that have fewer than two qualifying gaps, or
    whose split would produce an empty strip are reported with ``print`` and
    skipped.

    Args:
        input_folder: Directory containing the page images.
        output_folder: Directory for the split images; created if missing.
        split_gap: Minimum vertical gap, in pixels, that counts as a split.
        lang: Optional Tesseract language code passed to ``image_to_data``;
            ``None`` keeps pytesseract's default.

    Returns:
        None.
    """
    # Ensure the output folder exists
    os.makedirs(output_folder, exist_ok=True)

    for filename in sorted(os.listdir(input_folder)):
        if not filename.lower().endswith(('.png', '.jpg')):
            continue

        input_image = os.path.join(input_folder, filename)

        # Read the image using OpenCV
        img = cv2.imread(input_image)
        if img is None:
            print(f"Failed to load {filename}")
            continue

        # OpenCV loads images in BGR order; convert to RGB before handing to PIL
        img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        pil_img = Image.fromarray(img_rgb)

        # Perform OCR on the image to get bounding boxes for the text
        data = pytesseract.image_to_data(pil_img, lang=lang, output_type=Output.DICT)

        # Find the first two rows preceded by a large enough vertical gap
        split_points = _find_split_rows(
            data['top'], data['height'], data['text'], split_gap
        )

        # If we couldn't find enough split points, skip this image
        if len(split_points) < 2:
            print(f"Error: Could not find enough split points for {filename}.")
            continue

        # Split the image into three parts
        top_part = img[:split_points[0], :]
        middle_part = img[split_points[0]:split_points[1], :]
        bottom_part = img[split_points[1]:, :]

        # Check if any of the parts are empty
        if top_part.size == 0 or middle_part.size == 0 or bottom_part.size == 0:
            print(f"Error: Image {filename} split failed, empty part found.")
            continue

        # Create output filenames
        base_filename = os.path.splitext(filename)[0]
        output_top = os.path.join(output_folder, f'{base_filename}_top.png')
        output_middle = os.path.join(output_folder, f'{base_filename}_middle.png')
        output_bottom = os.path.join(output_folder, f'{base_filename}_bottom.png')

        # Save the three parts as separate images
        cv2.imwrite(output_top, top_part)
        cv2.imwrite(output_middle, middle_part)
        cv2.imwrite(output_bottom, bottom_part)

        print(f"Image {filename} split and saved as {output_top}, {output_middle}, and {output_bottom}.")

# Run the splitter: page images are read from 'images' and the three parts
# of each page are written to 'split_images'.
input_folder, output_folder = 'images', 'split_images'
split_image_with_ocr(input_folder, output_folder)


Image short_story-003.png split and saved as split_images\short_story-003_top.png, split_images\short_story-003_middle.png, and split_images\short_story-003_bottom.png.
Image short_story-004.png split and saved as split_images\short_story-004_top.png, split_images\short_story-004_middle.png, and split_images\short_story-004_bottom.png.
Image short_story-005.png split and saved as split_images\short_story-005_top.png, split_images\short_story-005_middle.png, and split_images\short_story-005_bottom.png.
Image short_story-006.png split and saved as split_images\short_story-006_top.png, split_images\short_story-006_middle.png, and split_images\short_story-006_bottom.png.
Image short_story-007.png split and saved as split_images\short_story-007_top.png, split_images\short_story-007_middle.png, and split_images\short_story-007_bottom.png.
Image short_story-008.png split and saved as split_images\short_story-008_top.png, split_images\short_story-008_middle.png, and split_images\short_story-00

## converting images to text

In [4]:
import os
import pytesseract
from PIL import Image
import re

# Set up the Tesseract executable path. Set the TESSERACT_CMD environment
# variable to override it; otherwise the default Windows install location is tried.
# NOTE: the image-splitting cell above also calls pytesseract, so on machines
# where tesseract is not on PATH this must take effect before that cell runs.
TESSERACT_CMD = os.environ.get("TESSERACT_CMD", r"C:\Program Files\Tesseract-OCR\tesseract.exe")
if os.path.isfile(TESSERACT_CMD):
    pytesseract.pytesseract.tesseract_cmd = TESSERACT_CMD
# When the file does not exist, pytesseract keeps its default command
# ("tesseract" resolved through PATH), so the notebook still runs elsewhere.

def is_title_candidate(line, previous_line):
    """Heuristically decide whether ``line`` looks like a title.

    A line qualifies when, after stripping whitespace, it is non-empty, has at
    most five whitespace-separated words, contains no digit, does not end with
    '.', '؟', '!' or ':', and the line before it is blank.

    Args:
        line: The text line to classify.
        previous_line: The line immediately preceding ``line``.

    Returns:
        True if every condition holds, otherwise False.
    """
    candidate = line.strip()

    # Blank lines are never titles
    if not candidate:
        return False
    # Titles are short: five words at most
    if len(candidate.split()) > 5:
        return False
    # Avoid numbers
    if re.search(r'\d', candidate) is not None:
        return False
    # Avoid lines that end like a sentence
    if candidate.endswith(('.', '؟', '!', ':')):
        return False
    # A title must follow an empty line
    return previous_line.strip() == ""

def png_to_txt_with_titles(input_folder, output_folder, lang='ara', config='--psm 6'):
    """OCR every ``.png`` in ``input_folder`` and write one ``.txt`` per image.

    Each image is passed to ``pytesseract.image_to_string``. The recognised
    text is split into lines; a line that ``is_title_candidate`` accepts (given
    the raw line before it) is wrapped as ``<title>...</title>``. All lines are
    stripped and written, newline-joined and UTF-8 encoded, to
    ``<image name>.txt`` in ``output_folder``.

    Args:
        input_folder: Directory containing the ``.png`` images.
        output_folder: Directory for the text files; created if missing.
        lang: Tesseract language code (defaults to Arabic, ``'ara'``).
        config: Extra Tesseract command-line options. The default selects page
            segmentation mode 6; the previous literal ``'--psm 6z'`` carried a
            stray character.

    Returns:
        None.
    """
    os.makedirs(output_folder, exist_ok=True)

    for filename in sorted(os.listdir(input_folder)):
        if not filename.endswith(".png"):
            continue

        img_path = os.path.join(input_folder, filename)

        # Use a context manager so the image file handle is released promptly
        with Image.open(img_path) as img:
            text = pytesseract.image_to_string(img, lang=lang, config=config)

        processed_lines = []
        previous_line = ""  # the first line has no predecessor
        for line in text.split('\n'):
            if is_title_candidate(line, previous_line):
                processed_lines.append(f"<title>{line.strip()}</title>")
            else:
                processed_lines.append(line.strip())
            # The title check always looks at the raw previous line
            previous_line = line

        txt_filename = f"{os.path.splitext(filename)[0]}.txt"
        txt_path = os.path.join(output_folder, txt_filename)

        with open(txt_path, 'w', encoding='utf-8') as f:
            f.write('\n'.join(processed_lines))

        print(f"Processed {filename} -> {txt_filename}")

# Run the OCR step: read the split page images and write one text file per image.
input_folder, output_folder = r"split_images", r"txtoutput"
png_to_txt_with_titles(input_folder, output_folder)

Processed short_story-003_bottom.png -> short_story-003_bottom.txt
Processed short_story-003_middle.png -> short_story-003_middle.txt
Processed short_story-003_top.png -> short_story-003_top.txt
Processed short_story-004_bottom.png -> short_story-004_bottom.txt
Processed short_story-004_middle.png -> short_story-004_middle.txt
Processed short_story-004_top.png -> short_story-004_top.txt
Processed short_story-005_bottom.png -> short_story-005_bottom.txt
Processed short_story-005_middle.png -> short_story-005_middle.txt
Processed short_story-005_top.png -> short_story-005_top.txt
Processed short_story-006_bottom.png -> short_story-006_bottom.txt
Processed short_story-006_middle.png -> short_story-006_middle.txt
Processed short_story-006_top.png -> short_story-006_top.txt
Processed short_story-007_bottom.png -> short_story-007_bottom.txt
Processed short_story-007_middle.png -> short_story-007_middle.txt
Processed short_story-007_top.png -> short_story-007_top.txt
Processed short_story-008

## Combining the txt files of each page into one file


In [9]:
import os

def combine_all_txt_files(input_folder, output_folder):
    """Merge the top/middle/bottom text files of each page into one file.

    Files in ``input_folder`` ending in ``.txt`` are grouped by the part of
    their name before the last underscore (e.g. ``page1_top.txt`` belongs to
    group ``page1``). The part is taken from the text after that underscore,
    not from a substring search over the whole name, so a base name that
    happens to contain "top", "middle" or "bottom" is still classified
    correctly. Groups that have all three parts are written to
    ``<base>_combined.txt`` in ``output_folder`` in the order top, middle,
    bottom, each followed by a blank line; incomplete groups are reported with
    ``print`` and skipped.

    Args:
        input_folder: Directory containing the per-part ``.txt`` files.
        output_folder: Directory for the combined files; created if missing.

    Returns:
        None.
    """
    part_order = ('top', 'middle', 'bottom')

    # Ensure the output folder exists
    os.makedirs(output_folder, exist_ok=True)

    # List all text files in the input folder
    text_files = sorted(f for f in os.listdir(input_folder) if f.endswith('.txt'))

    # Group files by base name: {base_name: {part: filename}}
    file_groups = {}
    for file in text_files:
        stem = os.path.splitext(file)[0]
        base_name, _, part = stem.rpartition('_')
        group = file_groups.setdefault(base_name, {})
        # Only the exact suffix decides the part
        if part in part_order:
            group[part] = file

    # Combine files for each group
    for base_name, files in file_groups.items():
        if not all(part in files for part in part_order):
            print(f"Skipping incomplete set for {base_name}: {files}")
            continue

        output_file = os.path.join(output_folder, f"{base_name}_combined.txt")
        with open(output_file, 'w', encoding='utf-8') as output:
            # Write in the order: top, middle, bottom
            for part in part_order:
                file_path = os.path.join(input_folder, files[part])
                with open(file_path, 'r', encoding='utf-8') as f:
                    output.write(f.read())
                output.write("\n\n")  # Add spacing between sections

        print(f"Combined text written to {output_file}")

# Merge the per-part OCR text files into one combined text file per page.
combine_all_txt_files(input_folder='txtoutput', output_folder='combined_txtoutput')


Combined text written to combined_txtoutput\short_story-003_combined.txt
Combined text written to combined_txtoutput\short_story-004_combined.txt
Combined text written to combined_txtoutput\short_story-005_combined.txt
Combined text written to combined_txtoutput\short_story-006_combined.txt
Combined text written to combined_txtoutput\short_story-007_combined.txt
Combined text written to combined_txtoutput\short_story-008_combined.txt
Combined text written to combined_txtoutput\short_story-009_combined.txt
Combined text written to combined_txtoutput\short_story-010_combined.txt
Combined text written to combined_txtoutput\short_story-011_combined.txt
Combined text written to combined_txtoutput\short_story-012_combined.txt
Combined text written to combined_txtoutput\short_story-013_combined.txt
Combined text written to combined_txtoutput\short_story-014_combined.txt
Combined text written to combined_txtoutput\short_story-015_combined.txt
Combined text written to combined_txtoutput\short_s

## Converting to one docx file

In [10]:
import os
from collections import defaultdict
from docx import Document
from docx.shared import RGBColor
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT

def combine_txt_files_by_prefix_to_word(input_folder, output_folder):
    """Combine text files that share a name prefix into one Word document each.

    ``.txt`` files in ``input_folder`` are grouped by the part of the filename
    before the first underscore (so ``short_story-003_combined.txt`` falls in
    group ``short``). For every group a document is built in sorted filename
    order: lines of the form ``<title>...</title>`` become centred red
    paragraphs, every other line becomes a plain paragraph, and a page break is
    appended after each file. The document is saved as
    ``<prefix>_combined.docx`` in ``output_folder``.

    Args:
        input_folder: Directory containing the ``.txt`` files.
        output_folder: Directory for the ``.docx`` files; created if missing.

    Returns:
        None.
    """
    title_open, title_close = '<title>', '</title>'

    # Ensure the output folder exists
    os.makedirs(output_folder, exist_ok=True)

    # List all text files in the input folder
    text_files = sorted(f for f in os.listdir(input_folder) if f.endswith('.txt'))

    # Group files by their prefix (text before the first underscore)
    file_groups = defaultdict(list)
    for text_file in text_files:
        file_groups[text_file.split('_')[0]].append(text_file)

    # Process each group of files and combine them into a single document
    for prefix, files in file_groups.items():
        doc = Document()

        for text_file in files:
            file_path = os.path.join(input_folder, text_file)

            with open(file_path, 'r', encoding='utf-8') as file:
                for line in file:
                    stripped_line = line.strip()

                    if stripped_line.startswith(title_open) and stripped_line.endswith(title_close):
                        title_text = stripped_line[len(title_open):-len(title_close)]

                        # Create the run explicitly: add_paragraph('') produces a
                        # paragraph without runs, so indexing runs[0] would fail
                        # on an empty "<title></title>" line.
                        title_paragraph = doc.add_paragraph()
                        title_paragraph.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
                        run = title_paragraph.add_run(title_text)
                        run.font.color.rgb = RGBColor(255, 0, 0)  # red; adjust as needed
                    else:
                        # Add regular paragraphs
                        doc.add_paragraph(stripped_line)

            # Page break after each file in the group
            doc.add_page_break()

        # Save the combined document for this prefix
        output_file = os.path.join(output_folder, f"{prefix}_combined.docx")
        doc.save(output_file)

        print(f"Combined Word document created: {output_file}")

# Build the Word output: text files are read from 'combined_txtoutput' and the
# combined .docx files are saved to 'combined_word_output'.
input_folder, output_folder = 'combined_txtoutput', 'combined_word_output'
combine_txt_files_by_prefix_to_word(input_folder, output_folder)


Combined Word document created: combined_word_output\short_combined.docx
