<a href="https://colab.research.google.com/github/SoumyaR01/lead_scoring_18Dec2023/blob/main/Multiprocessing_Multithreading_Tesseract.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import time
from IPython.display import display, HTML

def keep_colab_alive():
    """Render a script in this cell's output that clicks Colab's connect button.

    The script defines ``ClickConnect`` and schedules it with ``setInterval``
    at a 60000 ms period. Returns ``None``; the only effect is the displayed
    HTML object.
    """
    keepalive_markup = '<script>function ClickConnect(){console.log("Clicking");document.querySelector("colab-connect-button").click()}setInterval(ClickConnect,60000)</script>'
    display(HTML(keepalive_markup))

# Display the keep-alive script once, when this cell is executed.
keep_colab_alive()


In [2]:
# Install Tesseract and Poppler
!apt-get install -y poppler-utils tesseract-ocr

# Install Python libraries
!pip install pdf2image pytesseract pillow tqdm

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
poppler-utils is already the newest version (22.02.0-2ubuntu0.8).
0 upgraded, 0 newly installed, 0 to remove and 34 not upgraded.


In [3]:
import os
from threading import Thread
from multiprocessing import Pool, cpu_count
from pdf2image import convert_from_path
from PIL import Image
import pytesseract
from tqdm import tqdm

# Working directories: uploaded PDFs, rendered page images, OCR text output
PDF_FOLDER = '/content/pdfs'
IMAGE_FOLDER = '/content/temp_images'
OUTPUT_FOLDER = '/content/ocr_output'

# Create each directory if it is missing; exist_ok makes re-running this cell safe
for _folder in (PDF_FOLDER, IMAGE_FOLDER, OUTPUT_FOLDER):
    os.makedirs(_folder, exist_ok=True)

print(f"[INFO] CPU cores available: {cpu_count()}")

[INFO] CPU cores available: 2


# Upload PDFs and run the OCR pipeline

In [4]:
from google.colab import files

# Open Colab's upload dialog; the result is keyed by uploaded file name.
uploaded = files.upload()

# Move every uploaded PDF into PDF_FOLDER. The extension check is
# case-insensitive so that e.g. "SCAN.PDF" is moved too, matching the
# case-insensitive filter main() applies when it lists PDF_FOLDER.
for filename in uploaded:
    if filename.lower().endswith('.pdf'):
        os.rename(filename, os.path.join(PDF_FOLDER, filename))

Saving diary-no-23823-2019-tpcrl-no-5122019-consolidated.pdf to diary-no-23823-2019-tpcrl-no-5122019-consolidated.pdf


In [5]:
def convert_pdf(pdf_file, dpi=300):
    """Render one PDF from ``PDF_FOLDER`` into per-page PNG files in ``IMAGE_FOLDER``.

    Each page is saved as ``<pdf name without extension>_page_<n>.png`` with
    ``n`` starting at 1.

    Args:
        pdf_file: File name (not a full path) of a PDF inside ``PDF_FOLDER``.
        dpi: Resolution passed to ``convert_from_path``. Defaults to 300, the
            value that was previously hard-coded.

    Returns:
        list[str]: Paths of the PNG files written, in page order. If an error
        occurs part-way through, the error is printed and the paths saved
        before the failure are still returned.
    """
    local_paths = []
    try:
        # The stem is identical for every page, so compute it once.
        stem = os.path.splitext(pdf_file)[0]
        pages = convert_from_path(os.path.join(PDF_FOLDER, pdf_file), dpi=dpi)
        for page_number, page in enumerate(pages, start=1):
            image_path = os.path.join(IMAGE_FOLDER, f"{stem}_page_{page_number}.png")
            page.save(image_path, 'PNG')
            local_paths.append(image_path)
    except Exception as e:
        # Best effort: report the failure and return whatever was converted,
        # so a single bad PDF does not abort the whole batch.
        print(f"Error processing {pdf_file}: {e}")
    return local_paths

In [6]:
def ocr_image(image_path):
    """Run Tesseract OCR on one image and write the text to ``OUTPUT_FOLDER``.

    The text file is named after the image, with its final extension replaced
    by ``.txt``.

    Args:
        image_path: Path of the image file to OCR.

    Returns:
        tuple[str, str]: ``(image_path, preview)`` where ``preview`` is the
        first 200 characters of the recognised text, or ``"Error: <message>"``
        if anything failed. Errors are returned rather than raised, so a
        caller mapping this function over many images keeps going.
    """
    try:
        pid = os.getpid()
        print(f"[PROCESS {pid}] OCR on: {image_path}")
        # The context manager closes the image's file handle once OCR is done.
        with Image.open(image_path) as image:
            text = pytesseract.image_to_string(image)
        # splitext swaps only the final extension; str.replace(".png", ".txt")
        # would also rewrite a ".png" earlier in the name and would leave
        # non-PNG names unchanged.
        stem = os.path.splitext(os.path.basename(image_path))[0]
        output_file = os.path.join(OUTPUT_FOLDER, stem + ".txt")
        # OCR text can contain non-ASCII symbols; pin the encoding so the
        # write does not depend on the platform default.
        with open(output_file, "w", encoding="utf-8") as f:
            f.write(text)
        return (image_path, text[:200])  # Preview first 200 characters
    except Exception as e:
        return (image_path, f"Error: {str(e)}")

In [7]:
def main():
    """Convert every PDF in ``PDF_FOLDER`` to page images, OCR them, print previews.

    Stage 1 runs ``convert_pdf`` on each PDF using a thread pool.
    Stage 2 runs ``ocr_image`` on every resulting image using a process pool
    with ``cpu_count()`` workers, showing a ``tqdm`` progress bar.
    Finally the preview returned for each image is printed.

    PDFs are processed in sorted name order and results are collected in that
    same order, so the printed previews are stable across runs. When there is
    nothing to OCR the function returns after the image-count message without
    starting a process pool.

    Returns:
        None
    """
    # Local import keeps this block self-contained.
    from concurrent.futures import ThreadPoolExecutor

    pdf_files = sorted(
        name for name in os.listdir(PDF_FOLDER) if name.lower().endswith('.pdf')
    )

    all_image_paths = []
    if pdf_files:
        # Cap concurrent conversions instead of starting one thread per PDF;
        # ThreadPoolExecutor rejects max_workers=0, hence the guard above.
        worker_count = min(len(pdf_files), cpu_count())
        with ThreadPoolExecutor(max_workers=worker_count) as executor:
            # executor.map yields results in input order, not completion order.
            for page_paths in executor.map(convert_pdf, pdf_files):
                all_image_paths.extend(page_paths)

    print(f"[INFO] Total images for OCR: {len(all_image_paths)}")

    if not all_image_paths:
        return

    with Pool(processes=cpu_count()) as pool:
        ocr_results = list(tqdm(pool.imap(ocr_image, all_image_paths), total=len(all_image_paths)))

    for path, preview_text in ocr_results:
        print(f"\n--- OCR Preview from {path} ---\n{preview_text}\n{'='*60}")

# Run the pipeline: PDFs -> page images -> OCR text files, then print previews.
main()

[INFO] Total images for OCR: 7


  0%|          | 0/7 [00:00<?, ?it/s]

[PROCESS 9179] OCR on: /content/temp_images/diary-no-23823-2019-tpcrl-no-5122019-consolidated_page_1.png
[PROCESS 9180] OCR on: /content/temp_images/diary-no-23823-2019-tpcrl-no-5122019-consolidated_page_2.png
[PROCESS 9180] OCR on: /content/temp_images/diary-no-23823-2019-tpcrl-no-5122019-consolidated_page_3.png


 14%|█▍        | 1/7 [00:08<00:52,  8.67s/it]

[PROCESS 9179] OCR on: /content/temp_images/diary-no-23823-2019-tpcrl-no-5122019-consolidated_page_4.png


 43%|████▎     | 3/7 [00:13<00:16,  4.02s/it]

[PROCESS 9180] OCR on: /content/temp_images/diary-no-23823-2019-tpcrl-no-5122019-consolidated_page_5.png


 57%|█████▋    | 4/7 [00:20<00:15,  5.21s/it]

[PROCESS 9179] OCR on: /content/temp_images/diary-no-23823-2019-tpcrl-no-5122019-consolidated_page_6.png


 71%|███████▏  | 5/7 [00:23<00:08,  4.48s/it]

[PROCESS 9180] OCR on: /content/temp_images/diary-no-23823-2019-tpcrl-no-5122019-consolidated_page_7.png


100%|██████████| 7/7 [00:44<00:00,  6.32s/it]


--- OCR Preview from /content/temp_images/diary-no-23823-2019-tpcrl-no-5122019-consolidated_page_1.png ---
  

ge,

(3

p=
HE SUPREME COURT OF INDIA
£SRJMINAL ORIGINAL JURISDICTION
UfS 406 OF THE CODE OF CRIMINAL
-s. PROGEDURE, 1973, R/W ORDER XXXIX OF THE
aS f // SUPREME.COURT RULES, 2013)
Fe, spose PETIT

--- OCR Preview from /content/temp_images/diary-no-23823-2019-tpcrl-no-5122019-consolidated_page_2.png ---
®
%

RECORD OF PROCEEDINGS
-SLNO. DATE OF RECORD OF PROCEEDING PGS
4

ee en ee

a

3.

| 10. 7


--- OCR Preview from /content/temp_images/diary-no-23823-2019-tpcrl-no-5122019-consolidated_page_3.png ---
ITEM NO.22 COURT NO.13 SECTION XVI-A

SUPREME COURT OF INDIA
RECORD OF PROCEEDINGS

Transfer Petition(s)(Criminal) No(s). 512/2019

REKHA RATHI Petitioner(s)
VERSUS

LT. COL GURMEET SINGH Respondent(s

--- OCR Preview from /content/temp_images/diary-no-23823-2019-tpcrl-no-5122019-consolidated_page_4.png ---
Cs
ITEM NO.82 REGISTRAR COURT. 1 SECTION XVI-A |
SUPREME COURT OF IND




In [8]:
from google.colab import files
import zipfile

zip_path = "/content/ocr_results.zip"

# ZIP_DEFLATED compresses the entries; the default (ZIP_STORED) would bundle
# the text files without shrinking them.
with zipfile.ZipFile(zip_path, 'w', compression=zipfile.ZIP_DEFLATED) as zipf:
    # Sorted so the archive's entry order does not depend on listdir order.
    for file in sorted(os.listdir(OUTPUT_FOLDER)):
        # arcname drops the directory part, so the archive holds bare file names.
        zipf.write(os.path.join(OUTPUT_FOLDER, file), arcname=file)

# Hand the archive to Colab's download helper.
files.download(zip_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>