In [None]:
# Install dependencies
!pip install pytesseract pdf2image opencv-python-headless

import os
import cv2
import pytesseract
import numpy as np
import json
from PIL import Image
from pdf2image import convert_from_path
from google.colab import drive

In [None]:
# Locations on Google Drive used by the rest of the notebook.
# pdf_folder: where the input PDFs live -- edit this to match your Drive layout.
# output_folder: where one OCR-results JSON per PDF is written.
pdf_folder = "/content/drive/MyDrive/OCR_PDFs"
output_folder = "/content/drive/MyDrive/OCR_Results"

# Attach Google Drive so the paths above become reachable from the runtime.
drive.mount('/content/drive')

# Create the results folder if needed; exist_ok keeps re-runs of this cell harmless.
os.makedirs(output_folder, exist_ok=True)

In [None]:
#  Preprocessing function
def preprocess_image(img_path):
    """Load a page image and prepare it for OCR: binarize, denoise, deskew.

    Args:
        img_path: Path to an image file readable by ``cv2.imread``.

    Returns:
        The processed single-channel image, with the same height and width
        as the input. Blank pages (no foreground pixels) are returned
        without any rotation applied.

    Raises:
        OSError: If the image cannot be read from ``img_path``.
    """
    img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
    if img is None:
        # cv2.imread reports failure by returning None instead of raising.
        raise OSError(f"Could not read image: {img_path}")

    # Binarize using Otsu thresholding
    img = cv2.threshold(img, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
    # Denoise
    img = cv2.fastNlMeansDenoising(img, h=30)

    # Deskew. The skew must be measured on the text, not on the paper: treat
    # whichever of dark/light is the minority as the foreground, so both
    # dark-on-light and light-on-dark pages are handled.
    dark = img < 128
    foreground = dark if np.count_nonzero(dark) * 2 <= dark.size else ~dark
    coords = np.column_stack(np.where(foreground))
    if coords.size == 0:
        # Uniform page: there is nothing to estimate an angle from.
        return img

    # minAreaRect reports its angle in a range that differs between OpenCV
    # releases (older ones use [-90, 0), newer ones (0, 90]). A rectangle's
    # orientation is only defined modulo 90 degrees, so fold the value into
    # (-45, 45] first; the result is then independent of the convention.
    skew = cv2.minAreaRect(coords)[-1] % 90.0
    if skew > 45.0:
        skew -= 90.0
    angle = -skew

    (h, w) = img.shape[:2]
    M = cv2.getRotationMatrix2D((w // 2, h // 2), angle, 1.0)
    # BORDER_REPLICATE fills the corners exposed by the rotation with the
    # nearest edge pixels instead of black.
    img = cv2.warpAffine(img, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)
    return img

In [None]:
#  Loop through all PDFs in the folder
# sorted() gives a stable processing order; os.listdir's order is arbitrary.
for pdf_file in sorted(os.listdir(pdf_folder)):
    pdf_path = os.path.join(pdf_folder, pdf_file)
    # Only regular files with a .pdf extension (case-insensitive) are processed.
    if not (pdf_file.lower().endswith(".pdf") and os.path.isfile(pdf_path)):
        continue

    pdf_name = os.path.splitext(pdf_file)[0]
    print(f"\n Processing PDF: {pdf_file}")

    # Convert PDF to images (rendered at 300 dpi)
    pages = convert_from_path(pdf_path, dpi=300)

    # Maps "Page_<n>" -> {"Tesseract": <recognized text>} for this PDF.
    results = {}

    for i, page in enumerate(pages):
        # Scratch files carry the PDF name so pages from different PDFs do
        # not overwrite each other in /content.
        img_path = f"/content/{pdf_name}_page_{i+1}.png"
        page.save(img_path, "PNG")

        print(f"    Page {i+1}/{len(pages)}...")

        # Preprocess, and keep a copy of the cleaned page for inspection
        proc_img = preprocess_image(img_path)
        cv2.imwrite(f"/content/{pdf_name}_preproc_{i+1}.png", proc_img)

        # Run Tesseract OCR
        tess_text = pytesseract.image_to_string(proc_img)

        # Save results
        results[f"Page_{i+1}"] = {
            "Tesseract": tess_text.strip()
        }

    # Save results JSON in Drive; ensure_ascii=False keeps non-ASCII text readable.
    output_path = os.path.join(output_folder, f"{pdf_name}_ocr_results.json")
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=4, ensure_ascii=False)

    print(f"    Saved: {output_path}")

print("\n All PDFs processed! JSONs saved in:", output_folder)
