In [17]:
!pip install PyMuPDF pdf2image pytesseract


Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract
Successfully installed pytesseract-0.3.13


In [19]:
import os
import io
import fitz  # PyMuPDF
from pdf2image import convert_from_path
import pytesseract
from PIL import Image

# Define base folders (adjust these paths as needed)
PDF_BASE_FOLDER = "ncert_pdfs"   # Your NCERT PDFs organized in subfolders
TEXT_BASE_FOLDER = "ncert_texts" # Where the extracted text files will be saved

def extract_text_with_pymupdf(pdf_path):
    """
    Extracts text from a PDF file using PyMuPDF.
    """
    doc = fitz.open(pdf_path)
    full_text = ""
    for page in doc:
        full_text += page.get_text("text") + "\n"
    return full_text

def extract_text_with_tesseract(pdf_path, dpi=300):
    """
    Converts each page of a PDF to an image using pdf2image,
    then uses Tesseract OCR to extract text from those images.
    """
    # Convert PDF pages to images (higher DPI for better OCR accuracy)
    images = convert_from_path(pdf_path, dpi=dpi)
    full_text = ""
    for image in images:
        # Convert image to bytes for pytesseract
        # (Alternatively, you can pass the PIL image directly)
        text = pytesseract.image_to_string(image, lang='eng')
        full_text += text + "\n"
    return full_text

def extract_pdf_text(pdf_path):
    """
    Attempts to extract text from the PDF using PyMuPDF.
    If the extracted text is very short (indicating potential issues),
    it falls back to Tesseract OCR.
    """
    text = extract_text_with_pymupdf(pdf_path)
    if len(text.strip()) < 100:  # Threshold for minimal acceptable text
        print(f"⚠️ Text from {pdf_path} appears incomplete. Falling back to OCR...")
        text = extract_text_with_tesseract(pdf_path)
    return text

def process_all_pdfs(pdf_base_folder, text_base_folder):
    """
    Processes all PDF files from pdf_base_folder,
    extracts full text, and saves the results in text_base_folder.
    
    The folder structure is preserved, and output files have a .txt extension.
    """
    for root, dirs, files in os.walk(pdf_base_folder):
        for file in files:
            if file.lower().endswith(".pdf"):
                pdf_path = os.path.join(root, file)
                # Get relative path to preserve folder structure
                rel_path = os.path.relpath(pdf_path, pdf_base_folder)
                # Remove the .pdf extension (handle if filename already contains .pdf)
                base_name = os.path.splitext(rel_path)[0]
                # Construct output file path with a single .txt extension
                text_file_path = os.path.join(text_base_folder, base_name + ".txt")
                os.makedirs(os.path.dirname(text_file_path), exist_ok=True)
                print(f"Processing {pdf_path}...")
                try:
                    text = extract_pdf_text(pdf_path)
                    with open(text_file_path, "w", encoding="utf-8") as f:
                        f.write(text)
                    print(f"✅ Saved extracted text to {text_file_path}")
                except Exception as e:
                    print(f"⚠️ Failed to process {pdf_path}: {e}")

# Run the PDF processing
process_all_pdfs(PDF_BASE_FOLDER, TEXT_BASE_FOLDER)


Processing ncert_pdfs\class_11\biology\Chapter1.pdf.pdf...
✅ Saved extracted text to ncert_texts\class_11\biology\Chapter1.pdf.txt
Processing ncert_pdfs\class_11\biology\Chapter10.pdf.pdf...
✅ Saved extracted text to ncert_texts\class_11\biology\Chapter10.pdf.txt
Processing ncert_pdfs\class_11\biology\Chapter11.pdf.pdf...
✅ Saved extracted text to ncert_texts\class_11\biology\Chapter11.pdf.txt
Processing ncert_pdfs\class_11\biology\Chapter12.pdf.pdf...
✅ Saved extracted text to ncert_texts\class_11\biology\Chapter12.pdf.txt
Processing ncert_pdfs\class_11\biology\Chapter13.pdf.pdf...
✅ Saved extracted text to ncert_texts\class_11\biology\Chapter13.pdf.txt
Processing ncert_pdfs\class_11\biology\Chapter14.pdf.pdf...
✅ Saved extracted text to ncert_texts\class_11\biology\Chapter14.pdf.txt
Processing ncert_pdfs\class_11\biology\Chapter15.pdf.pdf...
✅ Saved extracted text to ncert_texts\class_11\biology\Chapter15.pdf.txt
Processing ncert_pdfs\class_11\biology\Chapter16.pdf.pdf...
✅ Saved ext