In [None]:
# pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'  # must be the path of the tesseract executable itself

In [None]:
# System packages: the Tesseract engine, its development headers, and the
# Kannada language data needed by the `-l kan+eng` option used by the
# Tesseract OCR step. `-y` keeps the install from blocking on a prompt.
! sudo apt-get install -y tesseract-ocr tesseract-ocr-kan libtesseract-dev
# Python packages imported by the pipeline. `%pip` installs into the running
# kernel's environment. The PyPI package named `tesseract` is intentionally
# not installed: nothing in this notebook imports it (pytesseract talks to
# the system `tesseract` binary installed above).
%pip install pytesseract pymupdf easyocr paddleocr paddlepaddle

In [None]:
import os
import shutil

def clear_directory(directory_path):
    """Empty ``directory_path``, creating it first if it does not exist.

    Files and symbolic links directly inside the directory are unlinked and
    sub-directories are removed recursively. A failure on one entry is
    reported with ``print`` and the remaining entries are still processed.
    """
    if not os.path.exists(directory_path):
        # Nothing to clean up; just make sure the directory is there.
        os.makedirs(directory_path)
        return

    for entry_name in os.listdir(directory_path):
        file_path = os.path.join(directory_path, entry_name)
        try:
            removable_with_unlink = os.path.isfile(file_path) or os.path.islink(file_path)
            if removable_with_unlink:
                os.unlink(file_path)
            elif os.path.isdir(file_path):
                shutil.rmtree(file_path)
        except Exception as e:
            print(f"Failed to delete {file_path}. Reason: {e}")

# Start every run from empty output folders (each one is created if missing).
for output_dir in (
    "output/pdf_images",
    "output/preprocessed_images",
    "output/ocr_texts",
    "output/cropped_images",
):
    clear_directory(output_dir)


In [19]:
import os
import cv2
import fitz
import pytesseract
import easyocr
import paddleocr
import numpy as np
from skimage.metrics import structural_similarity as ssim

# Make sure every output folder used by the pipeline exists
for _output_folder in (
    "output/pdf_images",
    "output/preprocessed_images",
    "output/ocr_texts",
    "output/cropped_images",
):
    os.makedirs(_output_folder, exist_ok=True)

# Identifiers of the supported OCR back-ends
class ModelTypes:
    """Namespace of string constants naming the OCR engines the pipeline can use."""

    Tesseract, EasyOCR, PaddleOCR = "tesseract", "easyocr", "paddleocr"

# Step 1: Image Extraction from PDF and Images
def extract_images(file_path):
    """Route ``file_path`` to the PDF renderer or to the single-image handler.

    Paths ending in ``.pdf`` (case-insensitive) are passed to
    ``pdf_to_images``; every other path is passed to ``process_image``.
    """
    is_pdf = file_path.lower().endswith('.pdf')
    handler = pdf_to_images if is_pdf else process_image
    handler(file_path)

def pdf_to_images(pdf_path):
    """Render every page of a PDF to an image file in ``output/pdf_images``.

    Each page is rasterised at 300 dpi and saved as
    ``output/pdf_images/NN_page.jpg`` where ``NN`` is the 1-based page number
    zero-padded to two digits. A line is printed for every page written.

    Args:
        pdf_path: Path of the PDF document to open.
    """
    # The context manager closes the document even when rendering or saving
    # a page raises, so the file handle is never leaked.
    with fitz.open(pdf_path) as pdf_doc:
        for page_num in range(len(pdf_doc)):
            page = pdf_doc.load_page(page_num)
            pix = page.get_pixmap(dpi=300)
            img_path = f"output/pdf_images/{page_num + 1:02d}_page.jpg"
            pix.save(img_path)
            print(f"Extracted and saved: {img_path}")

def process_image(img_path):
    """Preprocess one image file and save it to ``output/preprocessed_images``.

    The file is read as grayscale, run through ``preprocess_image`` and
    written under its original base name. Every failure is reported with
    ``print`` and the function returns without raising.

    Args:
        img_path: Path of a ``.png``, ``.jpg`` or ``.jpeg`` file
            (extension matched case-insensitively).
    """
    if not img_path.lower().endswith(('.png', '.jpg', '.jpeg')):
        # Report the skip instead of returning silently, so the caller can
        # tell why nothing was produced for this input.
        print(f"Error: Unsupported file type: {img_path}")
        return

    image = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
    if image is None:
        print(f"Error: Unable to load image: {img_path}")
        return

    preprocessed_image = preprocess_image(image)
    preprocessed_path = f"output/preprocessed_images/{os.path.basename(img_path)}"
    # cv2.imwrite signals failure through its return value, not an exception.
    if not cv2.imwrite(preprocessed_path, preprocessed_image):
        print(f"Error: Unable to save image: {preprocessed_path}")
        return
    print(f"Processed and saved: {preprocessed_path}")

# Step 2: Preprocess the Image
def preprocess_image(image):
    """Invert, blur, sharpen and binarise ``image``, in that order.

    The input is bitwise-inverted, smoothed with a 9x9 Gaussian blur, passed
    through ``image_sharpen`` and finally through ``image_thresholding``,
    whose result is returned.
    """
    inverted = cv2.bitwise_not(image)  # Invert colors for better OCR
    smoothed = cv2.GaussianBlur(inverted, (9, 9), 0)
    return image_thresholding(image_sharpen(smoothed))

def image_sharpen(image):
    """Filter ``image`` with a 3x3 kernel (centre weight 5, four direct neighbours -1).

    The output keeps the depth of the input (``ddepth=-1``).
    """
    sharpen_kernel = np.array([
        [0, -1, 0],
        [-1, 5, -1],
        [0, -1, 0],
    ])
    sharpened = cv2.filter2D(image, -1, sharpen_kernel)
    return sharpened

def image_thresholding(image):
    """Binarise ``image`` with ``cv2.threshold`` and return the thresholded image.

    The call uses a threshold argument of 128, a maximum value of 255 and the
    combined ``THRESH_BINARY | THRESH_OTSU`` flags; the threshold value that
    ``cv2.threshold`` also returns is discarded.
    """
    threshold_flags = cv2.THRESH_BINARY | cv2.THRESH_OTSU
    binary_image = cv2.threshold(image, 128, 255, threshold_flags)[1]
    return binary_image

# Step 3: Crop Images and Remove Duplicates
def crop_and_remove_duplicates(preprocessed_dir="output/preprocessed_images"):
    """Crop every image in ``preprocessed_dir`` and save the distinct crops.

    Each file is read as grayscale, cropped by rows and by columns, filtered
    through ``remove_duplicates`` and the surviving crops are written to
    ``output/cropped_images/<file name>_crop_<index>.jpg``. Files that cannot
    be read are reported and skipped.
    """
    for img_file in os.listdir(preprocessed_dir):
        img_path = os.path.join(preprocessed_dir, img_file)
        image = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
        if image is None:
            print(f"Error: Unable to load image: {img_path}")
            continue

        # Row crops first, then column crops, in a single candidate list
        candidates = [*crop_image_by_rows(image), *crop_image_by_columns(image)]
        base_name = os.path.basename(img_file)

        for i, unique_crop in enumerate(remove_duplicates(candidates)):
            crop_output_path = f"output/cropped_images/{base_name}_crop_{i}.jpg"
            cv2.imwrite(crop_output_path, unique_crop)
            print(f"Saved cropped image: {crop_output_path}")

def crop_image_by_rows(image, num_rows=4):
    """Split ``image`` into ``num_rows`` full-width horizontal strips.

    The strips are contiguous, non-overlapping and together cover the whole
    image. When the height is not an exact multiple of ``num_rows`` the
    leftover rows are spread across the strips instead of producing extra
    strips. An image with fewer rows than ``num_rows`` yields one strip per
    row, and an image with no rows yields an empty list.

    Args:
        image: Array whose first axis is the image height.
        num_rows: Number of strips to produce (default 4).

    Returns:
        List of array slices of ``image``, ordered top to bottom.

    Raises:
        ValueError: If ``num_rows`` is smaller than 1.
    """
    if num_rows < 1:
        raise ValueError("num_rows must be at least 1")

    h = image.shape[0]
    if h == 0:
        return []

    # Never ask for more strips than there are pixel rows; this also avoids
    # the zero step that `h // num_rows` would give for very short images.
    parts = min(num_rows, h)
    # Integer boundaries 0 = b0 < b1 < ... < b_parts = h, as even as possible.
    bounds = [(k * h) // parts for k in range(parts + 1)]
    return [image[top:bottom, :] for top, bottom in zip(bounds[:-1], bounds[1:])]

def crop_image_by_columns(image):
    """Return one full-height crop per run of adjacent selected columns.

    A pixel counts as bright when its value is greater than 200. A column is
    selected when 255 times its number of bright pixels is below 10% of the
    largest possible column sum ``h * 255``. Consecutive selected columns
    form a run, and each run yields the crop ``image[:, first:last + 1]``.
    Every run is returned, including runs that are a single column wide.

    NOTE(review): the selected columns are the ones with few bright pixels.
    Whether those are gaps or content depends on the polarity of the input
    image - confirm this matches the intended "white space" split.

    Args:
        image: 2-D grayscale image array.

    Returns:
        List of array slices of ``image``, ordered left to right; empty when
        no column is selected.
    """
    h = image.shape[0]

    # Counting pixels above 200 and scaling by 255 gives the same column sums
    # as binarising with cv2.threshold(image, 200, 255, cv2.THRESH_BINARY)
    # and summing along axis 0.
    bright_per_column = np.count_nonzero(image > 200, axis=0)
    selected = np.flatnonzero(bright_per_column * 255 < (h * 255 * 0.1))
    if selected.size == 0:
        return []

    # A new run starts wherever two selected column indices are not adjacent.
    run_starts = np.flatnonzero(np.diff(selected) > 1) + 1
    return [
        image[:, run[0]:run[-1] + 1]
        for run in np.split(selected, run_starts)
    ]

def remove_duplicates(image_list):
    """Drop images that are near-identical to one that was already kept.

    Every image is resized to 300x300 and compared, via structural
    similarity, with the resized version of each image kept so far. It is
    discarded as a duplicate as soon as one score exceeds 0.95; otherwise it
    is kept. Order is preserved and the first occurrence wins.

    Args:
        image_list: Iterable of image arrays.

    Returns:
        List containing the original (un-resized) images judged unique.
    """
    unique_images = []
    # Resized copies of the kept images, index-aligned with `unique_images`,
    # so each kept image is resized once instead of once per comparison.
    unique_thumbnails = []

    for candidate in image_list:
        thumbnail = cv2.resize(candidate, (300, 300))

        # Only the scalar score is needed, so the full SSIM map is not
        # requested. `any` stops at the first match.
        is_duplicate = any(
            ssim(thumbnail, kept) > 0.95  # Similarity threshold
            for kept in unique_thumbnails
        )

        if not is_duplicate:
            unique_images.append(candidate)
            unique_thumbnails.append(thumbnail)

    return unique_images

# Step 4: Perform OCR on Preprocessed Images
def perform_ocr_on_images(preprocessed_dir="output/preprocessed_images", model_type=ModelTypes.Tesseract):
    """Run OCR on every file in ``preprocessed_dir`` and save non-empty results.

    For each file, ``perform_ocr`` is called with ``model_type``. When it
    returns non-empty text, that text is written as UTF-8 to
    ``output/ocr_texts/<file name>.txt`` and the path is printed.
    """
    for img_file in os.listdir(preprocessed_dir):
        img_path = os.path.join(preprocessed_dir, img_file)
        text = perform_ocr(img_path, model_type=model_type)
        if not text:
            # Nothing recognised (or the engine failed): no file is written
            continue

        output_text_path = f"output/ocr_texts/{os.path.basename(img_path)}.txt"
        with open(output_text_path, "w", encoding="utf-8") as text_file:
            text_file.write(text)
        print(f"Saved OCR text: {output_text_path}")

def perform_ocr(image_path, model_type):
    """Run the OCR engine selected by ``model_type`` on ``image_path``.

    Returns the text produced by the matching engine function. For a model
    type that matches none of the ``ModelTypes`` constants a message is
    printed and an empty string is returned.
    """
    engines = (
        (ModelTypes.Tesseract, perform_ocr_tesseract),
        (ModelTypes.EasyOCR, perform_ocr_easyocr),
        (ModelTypes.PaddleOCR, perform_ocr_paddleocr),
    )
    for engine_name, run_engine in engines:
        if model_type == engine_name:
            return run_engine(image_path)

    print("Unsupported OCR model type.")
    return ""

def perform_ocr_tesseract(image_path):
    """Read ``image_path`` as grayscale and return the text Tesseract extracts.

    Tesseract is invoked with the config string ``--oem 3 --psm 6 -l kan+eng``.
    When the image cannot be loaded an error is printed and ``""`` is returned.
    """
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if image is not None:
        tesseract_config = '--oem 3 --psm 6 -l kan+eng'
        return pytesseract.image_to_string(image, config=tesseract_config)

    print(f"Error: Unable to load image: {image_path}")
    return ""

def perform_ocr_easyocr(image_path):
    """Return the text EasyOCR recognises in ``image_path``.

    The recognised fragments (element 1 of every ``readtext`` result) are
    joined with single spaces. The ``easyocr.Reader`` for English and
    Kannada is created on the first call and reused afterwards, so the
    reader is not rebuilt for every image of a batch.

    Args:
        image_path: Path of the image to read.

    Returns:
        The joined text; an empty string when nothing was recognised.
    """
    # Cache the reader on the function object: constructing it is the
    # expensive part and it does not depend on the image.
    reader = getattr(perform_ocr_easyocr, "_reader", None)
    if reader is None:
        reader = easyocr.Reader(['en', 'kn'])  # English and Kannada
        perform_ocr_easyocr._reader = reader

    results = reader.readtext(image_path)
    return ' '.join(result[1] for result in results)

def perform_ocr_paddleocr(image_path):
    """Return the text PaddleOCR recognises in ``image_path``.

    The recognised strings of the first result page are joined with single
    spaces. The engine is created on the first call and reused afterwards.

    NOTE(review): written against the PaddleOCR 2.x interface
    (``PaddleOCR(...).ocr(path, cls=True)``) - confirm against the installed
    version. The engine uses PaddleOCR's default language; pass ``lang=`` to
    the constructor if another language is required.

    Args:
        image_path: Path of the image to read.

    Returns:
        The joined text; an empty string when no text was detected.
    """
    ocr = getattr(perform_ocr_paddleocr, "_engine", None)
    if ocr is None:
        # The engine class is `PaddleOCR`. The angle classifier has to be
        # enabled at construction for `cls=True` below to take effect.
        ocr = paddleocr.PaddleOCR(use_angle_cls=True)
        perform_ocr_paddleocr._engine = ocr

    results = ocr.ocr(image_path, cls=True)
    # The per-image entry can be empty or None when nothing is detected.
    if not results or not results[0]:
        return ""
    return ' '.join(line[1][0] for line in results[0])

def preprocess_images(image_dir="output/pdf_images"):
    """Preprocess every image found in ``image_dir``.

    Each file is read as grayscale, passed through ``preprocess_image`` and
    handed to ``save_preprocessed_image`` under its original file name.
    Files that cannot be loaded are reported and skipped.
    """
    for img_file in os.listdir(image_dir):
        source_path = os.path.join(image_dir, img_file)
        grayscale = cv2.imread(source_path, cv2.IMREAD_GRAYSCALE)

        if grayscale is not None:
            # Same preprocessing chain as used for directly supplied images
            save_preprocessed_image(preprocess_image(grayscale), img_file)
        else:
            print(f"Error: Unable to load image: {source_path}")

def save_preprocessed_image(preprocessed_image, img_file):
    """Write ``preprocessed_image`` to ``output/preprocessed_images/<img_file>``."""
    destination = f"output/preprocessed_images/{img_file}"
    cv2.imwrite(destination, preprocessed_image)

# Main Driver Function
def main(file_path, model_type=ModelTypes.Tesseract):
    """Run the four pipeline stages, in order, for ``file_path``.

    Args:
        file_path: Path of the PDF or image to process.
        model_type: One of the ``ModelTypes`` constants; forwarded to the
            OCR stage only.
    """
    # Step 1: extract pages from a PDF, or preprocess a single image
    extract_images(file_path)

    # Step 2: preprocess whatever is in the default image directory
    preprocess_images()

    # Step 3: write de-duplicated crops of the preprocessed images
    crop_and_remove_duplicates()

    # Step 4: OCR the preprocessed images with the requested engine
    perform_ocr_on_images(model_type=model_type)

# Example usage
# Runs the whole pipeline once for a single hard-coded input file when this
# code is executed as the main module (which includes running the cell).
if __name__ == "__main__":
    # NOTE(review): absolute path that presumably exists only in the original
    # environment - point it at your own file before running.
    file_path = "/content/02_page.jpg"  # Replace with the path to your PDF or image
    selected_model = ModelTypes.EasyOCR  # Choose the OCR model type (Tesseract, EasyOCR, PaddleOCR)
    main(file_path, model_type=selected_model)  # Run the process


Processed and saved: output/preprocessed_images/02_page.jpg




Saved cropped image: output/cropped_images/02_page.jpg_crop_0.jpg
Saved cropped image: output/cropped_images/02_page.jpg_crop_1.jpg
Saved cropped image: output/cropped_images/02_page.jpg_crop_2.jpg
Saved cropped image: output/cropped_images/02_page.jpg_crop_3.jpg
Saved cropped image: output/cropped_images/02_page.jpg_crop_4.jpg
Saved cropped image: output/cropped_images/02_page.jpg_crop_5.jpg
Saved cropped image: output/cropped_images/02_page.jpg_crop_6.jpg
Saved cropped image: output/cropped_images/02_page.jpg_crop_7.jpg
Saved cropped image: output/cropped_images/02_page.jpg_crop_8.jpg
Saved cropped image: output/cropped_images/02_page.jpg_crop_9.jpg
Saved cropped image: output/cropped_images/02_page.jpg_crop_10.jpg
Saved cropped image: output/cropped_images/02_page.jpg_crop_11.jpg
Saved cropped image: output/cropped_images/02_page.jpg_crop_12.jpg
Saved cropped image: output/cropped_images/02_page.jpg_crop_13.jpg
Saved cropped image: output/cropped_images/02_page.jpg_crop_14.jpg
Saved