In [None]:
# Run once if the model has not been downloaded yet: fetches nougat-base from the Hugging Face Hub.

# from transformers import VisionEncoderDecoderModel, TrOCRProcessor
# import torch

# processor = TrOCRProcessor.from_pretrained("facebook/nougat-base")
# model = VisionEncoderDecoderModel.from_pretrained("facebook/nougat-base")

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [1]:
import os
import cv2
import numpy as np
from PIL import Image
from pdf2image import convert_from_path
import pytesseract
from transformers import VisionEncoderDecoderModel, TrOCRProcessor

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Local directory that holds the saved nougat-base weights and processor files.
# Defined once so the model and its processor are always loaded from the same
# location and cannot drift apart when the path changes.
MODEL_DIR = "C:/Projects/Conceptify - AI Powered Learning Platform/models/nougat-base"

# NOTE(review): the Transformers documentation loads Nougat checkpoints with
# NougatProcessor; TrOCRProcessor is kept here to preserve the current
# behavior -- confirm whether switching is desirable.
model = VisionEncoderDecoderModel.from_pretrained(MODEL_DIR)
processor = TrOCRProcessor.from_pretrained(MODEL_DIR)

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [4]:
# Path to your PDF
pdf_path = r"C:\Projects\Conceptify - AI Powered Learning Platform\OCR\Tests files\MC test.pdf"

# Optional: path to Poppler bin if not in PATH.
# Written as a raw string with single backslashes: combining the r-prefix with
# "\\" puts doubled backslashes into the actual path string.
poppler_path = r"C:\Coding\Mics\poppler-25.07.0\Library\bin"

# Tell pytesseract which Tesseract executable to use; only needed when
# tesseract.exe is not on PATH. Note that this literal mixes "\" and "/"
# as path separators.
pytesseract.pytesseract.tesseract_cmd = r"C:\Coding\Mics\Tesseract-OCR-5.5.0.20241111/tesseract.exe"

# ------------------ Helper Functions ------------------
def preprocess_image(
    page: Image.Image,
    target_width: int = 1280,
    threshold: int = 150,
) -> Image.Image:
    """Binarize a page image and rescale it to a fixed width.

    Steps: convert to grayscale, apply a fixed binary threshold, resize to
    ``target_width`` while keeping the aspect ratio, then expand back to three
    channels.

    Args:
        page: Page image in any PIL mode; it is converted to RGB first.
        target_width: Output width in pixels (default 1280).
        threshold: Gray level cut-off passed to ``cv2.threshold``; pixels
            above it become 255, all others 0 (default 150).

    Returns:
        A new RGB ``PIL.Image`` of width ``target_width``.
    """
    # Normalise the mode up front: COLOR_RGB2GRAY needs a 3-channel array, so
    # RGBA, grayscale or palette pages would otherwise fail.
    rgb = np.array(page.convert("RGB"))
    gray = cv2.cvtColor(rgb, cv2.COLOR_RGB2GRAY)

    # Threshold / binarize
    _, binary = cv2.threshold(gray, threshold, 255, cv2.THRESH_BINARY)

    # Scale the height by the same factor as the width. Clamp to 1 so an
    # extremely wide page cannot produce a zero-height target size.
    height, width = binary.shape
    new_height = max(1, int(height * (target_width / width)))
    # NOTE(review): resizing after thresholding interpolates pixel values, so
    # the result may no longer be strictly two-level -- confirm this is intended.
    resized = cv2.resize(binary, (target_width, new_height))

    # Convert back to RGB (3 channels)
    return Image.fromarray(cv2.cvtColor(resized, cv2.COLOR_GRAY2RGB))


def run_nougat(img: Image.Image) -> str:
    """OCR a single page image with the loaded Nougat model.

    Uses the module-level ``processor`` to build the pixel tensor and to
    decode the generated ids, and the module-level ``model`` to generate them.
    Returns the decoded text of the first (only) sequence, with special
    tokens stripped.
    """
    encoded = processor(images=img, return_tensors="pt")
    token_ids = model.generate(encoded.pixel_values)
    decoded = processor.batch_decode(token_ids, skip_special_tokens=True)
    return decoded[0]

def run_tesseract(img: Image.Image) -> str:
    """OCR a single page image with Tesseract and return the recognised text."""
    return pytesseract.image_to_string(img)

# ------------------ Convert PDF to Images ------------------
# Render the PDF at pdf_path to images at 300 DPI, using the Poppler binaries
# found in poppler_path. The result is iterated page by page by the OCR loop.
pages = convert_from_path(pdf_path, dpi=300, poppler_path=poppler_path)

# ------------------ Hybrid OCR Pipeline ------------------
# Each page is preprocessed once and read with Nougat; Tesseract is only used
# when Nougat returns fewer than two lines. Per-page chunks are collected in a
# list and joined once at the end.
page_chunks = []
for page_number, page in enumerate(pages, start=1):
    prepared = preprocess_image(page)

    # Nougat is the primary engine
    page_text = run_nougat(prepared).strip()

    # Very short / missing output: replace it with the Tesseract result
    if len(page_text.splitlines()) < 2:
        page_text = run_tesseract(prepared).strip()

    # Each chunk starts with its page separator
    page_chunks.append(f"\n\n--- Page {page_number} ---\n\n{page_text}")

full_text = "".join(page_chunks)

# ------------------ Save Output ------------------
# The text file is written next to the source PDF: same path with the
# extension replaced by "_ocr.txt".
pdf_stem = os.path.splitext(pdf_path)[0]
output_path = f"{pdf_stem}_ocr.txt"
with open(output_path, mode="w", encoding="utf-8") as out_file:
    out_file.write(full_text)

print(f"OCR complete! Output saved to:\n{output_path}")

OCR complete! Output saved to:
C:\Projects\Conceptify - AI Powered Learning Platform\OCR\Tests files\MC test_ocr.txt


In [None]:
# Save the downloaded model and processor to a local folder so they can be loaded from disk
# model.save_pretrained("C:/Projects/Conceptify - AI Powered Learning Platform/models/nougat-base")
# processor.save_pretrained("C:/Projects/Conceptify - AI Powered Learning Platform/models/nougat-base")

[]