In [None]:
# Install the OCR library
! pip install -q easyocr
# Install the OCR dependencies libraries
! pip install -q torch torchvision
# Install the libraries to convert the pdf files to images
! pip install -q pdf2image
# Install tqdm for progress bar
! pip install -q tqdm

In [None]:
import easyocr
from pdf2image import convert_from_path
import os
from tqdm import tqdm

In [7]:
import os
import tempfile

import easyocr
from pdf2image import convert_from_path
from PIL import Image, ImageEnhance
from tqdm import tqdm  # For progress bar


def preprocess_image(image_path, contrast=2.0, threshold=128):
    """Boost contrast, convert to grayscale, binarize, and overwrite the file.

    Args:
        image_path: Path of the image to process. The processed image is
            saved back to this same path, replacing the original content.
        contrast: Factor passed to ``ImageEnhance.Contrast.enhance``.
            Defaults to 2.0, the value this notebook has always used.
        threshold: Grayscale cut-off. Pixels with a value below it become
            black (0); all other pixels become white (255). Defaults to 128.

    Returns:
        None. The result is written to ``image_path``.
    """
    # Open through a context manager so the underlying file handle is
    # released deterministically before the same path is overwritten below.
    with Image.open(image_path) as source:
        # Enhance contrast first, while the image still has its original mode
        enhanced = ImageEnhance.Contrast(source).enhance(contrast)

        # Convert to grayscale
        gray = enhanced.convert('L')

        # Binarize (convert to black and white)
        binary = gray.point(
            lambda value: 0 if value < threshold else 255, '1'
        )

    # Save the preprocessed image over the input file
    binary.save(image_path)


def extract_lines_rtl(results, line_spacing_threshold=15):
    """Group OCR detections into text lines ordered right-to-left.

    Detections are sorted top-to-bottom by the y-coordinate of their
    top-left corner. A new line starts whenever the vertical gap between two
    consecutive detections exceeds ``line_spacing_threshold``. Within each
    line, detections are ordered by the x-coordinate of their top-left corner,
    largest first, and their texts are joined with single spaces.

    Args:
        results: Iterable of detections. Each detection is a sequence whose
            first element is a bounding box (``bbox[0]`` being the top-left
            ``[x, y]`` corner) and whose second element is the recognized
            text. Any further elements (such as the confidence score in
            EasyOCR's default output) are ignored.
        line_spacing_threshold: Maximum vertical distance, in the same units
            as the bounding-box coordinates, between consecutive detections
            that still belong to the same line. Defaults to 15.

    Returns:
        A list of strings, one per detected line, top line first. Empty when
        ``results`` is empty.
    """

    def _join_right_to_left(boxes):
        # Order one line's detections by descending top-left x and join texts.
        ordered = sorted(boxes, key=lambda item: item[0][0][0], reverse=True)
        return " ".join(text for _, text in ordered)

    # Sort detections vertically (top-to-bottom) by top-left y-coordinate
    sorted_results = sorted(results, key=lambda item: item[0][0][1])

    lines = []
    current_line = []
    prev_y = None

    for item in sorted_results:
        bbox, text = item[0], item[1]
        current_y = bbox[0][1]

        # The input is sorted ascending, so the gap is never negative and
        # no abs() is needed.
        if prev_y is not None and current_y - prev_y > line_spacing_threshold:
            lines.append(_join_right_to_left(current_line))
            current_line = []

        current_line.append((bbox, text))
        prev_y = current_y

    # Flush the last line
    if current_line:
        lines.append(_join_right_to_left(current_line))

    return lines


def extract_text_from_pdf(pdf_path, output_txt_path, lang='ar', gpu=False):
    """Run OCR over every page of a PDF and write the text to a file.

    Each page is rendered to an image, preprocessed with
    ``preprocess_image``, read with EasyOCR, and grouped into right-to-left
    lines with ``extract_lines_rtl``. The output file contains one
    ``=== Page N ===`` header per page followed by that page's lines.

    Args:
        pdf_path: Path of the PDF file to read.
        output_txt_path: Path of the UTF-8 text file to write (overwritten).
        lang: Language code passed to ``easyocr.Reader``. Defaults to 'ar'.
        gpu: Passed to ``easyocr.Reader`` to enable GPU use. Defaults to False.

    Returns:
        None. The extracted text is written to ``output_txt_path``.
    """
    # Initialize the EasyOCR reader for the requested language
    reader = easyocr.Reader([lang], gpu=gpu)

    # Convert PDF pages to images
    print("Converting PDF to images...")
    pages = convert_from_path(pdf_path)

    # Collect per-page text and join once at the end instead of growing a
    # string with repeated concatenation.
    page_chunks = []

    # A private temporary directory is removed even if OCR raises, and
    # cleanup can never delete unrelated files from a pre-existing folder.
    with tempfile.TemporaryDirectory(prefix="temp_images_") as temp_dir:
        # Process each page with a progress bar
        print("Processing images for OCR...")
        progress = tqdm(pages, desc="OCR Progress", unit="page")
        for page_number, page in enumerate(progress, start=1):
            image_path = os.path.join(temp_dir, f"page_{page_number}.jpg")
            page.save(image_path, 'JPEG')

            # Preprocess the image in place
            preprocess_image(image_path)

            # Perform OCR on the image
            results = reader.readtext(image_path)

            # Extract lines and sort them from right-to-left
            lines = extract_lines_rtl(results)

            # Page divider, then the lines separated by single newlines
            page_chunks.append(
                f"=== Page {page_number} ===\n" + "\n".join(lines) + "\n\n"
            )

    # Save the extracted text to a .txt file
    print("Saving extracted text to file...")
    with open(output_txt_path, 'w', encoding='utf-8') as f:
        f.write("".join(page_chunks))

    print(f"Text extraction complete. Saved to {output_txt_path}")


# Example Usage
# Input PDF and output text file for this run; edit both to suit your data.
pdf_path = "../Data/old_cases/9_26.pdf"
output_txt_path = "extracted_text_rtl.txt"

# Run the full pipeline on the GPU with the Arabic OCR model.
extract_text_from_pdf(
    pdf_path=pdf_path,
    output_txt_path=output_txt_path,
    lang='ar',
    gpu=True,
)

Converting PDF to images...
Processing images for OCR...


OCR Progress: 100%|██████████| 476/476 [26:53<00:00,  3.39s/page]

Saving extracted text to file...
Text extraction complete. Saved to extracted_text_rtl.txt



