In [None]:
# Environment setup: refresh the apt package index, install the Tesseract OCR
# system package, then install pytesseract (the Python package imported by the
# cells below).
!apt-get update
!apt-get install -y tesseract-ocr
!pip install pytesseract


0% [Working]            Hit:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Get:3 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Hit:4 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:5 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Hit:7 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:8 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Hit:10 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Get:11 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 Packages [1,521 kB]
Get:12 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 Packages [2,904 kB]
Fetched 4,683 kB in 2s (2,258 kB/s)
Reading package lists... Done
W: S

In [None]:
!apt-get install poppler-utils
!pip install pdf2image pytesseract opencv-python-headless


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
poppler-utils is already the newest version (22.02.0-2ubuntu0.6).
0 upgraded, 0 newly installed, 0 to remove and 22 not upgraded.


In [None]:
import pytesseract

# Set the path to the Tesseract binary that pytesseract should use.
pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"

# Verify the installation by printing the version pytesseract reports.
print("Tesseract Version:", pytesseract.get_tesseract_version())


Tesseract Version: 4.1.1


In [None]:
import os
import logging
import asyncio
import traceback
from concurrent.futures import ThreadPoolExecutor
from pdf2image import convert_from_path
import pytesseract
import cv2
import numpy as np
from PIL import Image

In [None]:

def preprocess_image(image):
    """Binarize a page image before it is handed to Tesseract.

    Args:
        image: A PIL image (any mode), or an RGB array-like that
            ``np.array`` can convert.

    Returns:
        PIL.Image.Image: A single-channel image whose pixels are either 0 or
        255, thresholded with Otsu's method.
    """
    # COLOR_RGB2GRAY expects a colour array. Grayscale, palette or bilevel PIL
    # images produce 2-D arrays that make cvtColor raise, and CMYK would be
    # misread as RGBA, so normalise every non-RGB PIL image first.
    if isinstance(image, Image.Image) and image.mode != "RGB":
        image = image.convert("RGB")

    gray = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2GRAY)

    # cv2.threshold returns (threshold_used, image). With THRESH_OTSU the
    # threshold is computed from the histogram, so the 0 passed here is ignored.
    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
    return Image.fromarray(binary)

def convert_pdf_to_images(input_pdf_file_path: str, max_pages: int = 0, skip_first_n_pages: int = 0):
    """Render pages of a PDF file to images with pdf2image.

    Args:
        input_pdf_file_path: Path of the PDF to convert.
        max_pages: Maximum number of pages to convert; 0 means no limit
            (``last_page`` is passed as ``None``).
        skip_first_n_pages: Number of leading pages to skip before converting.

    Returns:
        The sequence of page images returned by ``convert_from_path``.

    Raises:
        ValueError: If ``max_pages`` or ``skip_first_n_pages`` is negative.
    """
    # Reject negative values up front: they would otherwise produce a page
    # range that silently does not match what the caller asked for.
    if max_pages < 0:
        raise ValueError(f"max_pages must be >= 0, got {max_pages}")
    if skip_first_n_pages < 0:
        raise ValueError(f"skip_first_n_pages must be >= 0, got {skip_first_n_pages}")

    logging.info(f"Processing PDF file {input_pdf_file_path}")
    # pdf2image page numbers are 1-based and the range is inclusive.
    first_page = skip_first_n_pages + 1
    last_page = None if max_pages == 0 else first_page + max_pages - 1
    images = convert_from_path(input_pdf_file_path, first_page=first_page, last_page=last_page)
    logging.info(f"Converted {len(images)} pages from PDF file to images.")
    return images

def ocr_image(image):
    """Preprocess one page image and return the text Tesseract extracts from it."""
    return pytesseract.image_to_string(preprocess_image(image))

def main():
    """Run the PDF-to-text OCR pipeline for the configured input file.

    Converts the PDF pages to images, OCRs them in a thread pool, and writes
    the per-page text joined by newlines to
    ``<input base name>__raw_ocr_output.txt``. Any exception is logged with
    its traceback instead of being propagated to the caller.
    """
    try:
        logging.basicConfig(level=logging.INFO)
        input_pdf_file_path = 'samplepic.pdf'
        max_test_pages = 0  # 0 = no page limit
        skip_first_n_pages = 0

        base_name = os.path.splitext(input_pdf_file_path)[0]
        raw_ocr_output_file_path = f"{base_name}__raw_ocr_output.txt"

        list_of_scanned_images = convert_pdf_to_images(input_pdf_file_path, max_test_pages, skip_first_n_pages)
        logging.info(f"Tesseract version: {pytesseract.get_tesseract_version()}")
        logging.info("Extracting text from converted pages...")

        # executor.map yields results in input order, so the page order is
        # preserved even though pages are OCR'd concurrently.
        with ThreadPoolExecutor() as executor:
            list_of_extracted_text_strings = list(executor.map(ocr_image, list_of_scanned_images))

        raw_ocr_output = "\n".join(list_of_extracted_text_strings)
        # Explicit UTF-8: OCR text may contain non-ASCII characters, and the
        # platform default encoding is not guaranteed to be able to encode them.
        with open(raw_ocr_output_file_path, "w", encoding="utf-8") as f:
            f.write(raw_ocr_output)

        logging.info(f"Raw OCR output written to: {raw_ocr_output_file_path}")

    except Exception as e:
        # logging.exception logs at ERROR level and appends the active traceback.
        logging.exception(f"An error occurred: {e}")

# Run the pipeline only when this code is executed directly, not when imported.
if __name__ == '__main__':
    main()
