In [1]:
pip install PyMuPDF pytesseract Pillow spacy transformers

Collecting PyMuPDF
  Downloading PyMuPDF-1.24.11-cp38-abi3-win_amd64.whl.metadata (3.4 kB)
Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Collecting spacy
  Downloading spacy-3.8.2-cp312-cp312-win_amd64.whl.metadata (27 kB)
Collecting transformers
  Downloading transformers-4.45.2-py3-none-any.whl.metadata (44 kB)
Collecting spacy-legacy<3.1.0,>=3.0.11 (from spacy)
  Downloading spacy_legacy-3.0.12-py2.py3-none-any.whl.metadata (2.8 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0 (from spacy)
  Downloading spacy_loggers-1.0.5-py3-none-any.whl.metadata (23 kB)
Collecting murmurhash<1.1.0,>=0.28.0 (from spacy)
  Downloading murmurhash-1.0.10-cp312-cp312-win_amd64.whl.metadata (2.0 kB)
Collecting cymem<2.1.0,>=2.0.2 (from spacy)
  Downloading cymem-2.0.8-cp312-cp312-win_amd64.whl.metadata (8.6 kB)
Collecting preshed<3.1.0,>=3.0.2 (from spacy)
  Downloading preshed-3.0.9-cp312-cp312-win_amd64.whl.metadata (2.2 kB)
Collecting thinc<8.4.0,>=8.3.0 (from s

In [2]:
pip install pdf2image torch

Collecting pdf2image
  Downloading pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)
Collecting torch
  Downloading torch-2.5.0-cp312-cp312-win_amd64.whl.metadata (28 kB)
Collecting networkx (from torch)
  Downloading networkx-3.4.1-py3-none-any.whl.metadata (6.3 kB)
Collecting sympy==1.13.1 (from torch)
  Downloading sympy-1.13.1-py3-none-any.whl.metadata (12 kB)
Collecting mpmath<1.4,>=1.1.0 (from sympy==1.13.1->torch)
  Downloading mpmath-1.3.0-py3-none-any.whl.metadata (8.6 kB)
Downloading pdf2image-1.17.0-py3-none-any.whl (11 kB)
Downloading torch-2.5.0-cp312-cp312-win_amd64.whl (203.1 MB)
   ---------------------------------------- 0.0/203.1 MB ? eta -:--:--
   ---------------------------------------- 2.4/203.1 MB 12.2 MB/s eta 0:00:17
    --------------------------------------- 4.7/203.1 MB 11.9 MB/s eta 0:00:17
   - -------------------------------------- 6.8/203.1 MB 11.0 MB/s eta 0:00:18
   - -------------------------------------- 8.1/203.1 MB 9.7 MB/s eta 0:00:21
   -- ----

In [None]:
import fitz  
import pytesseract  
from PIL import Image
from transformers import pipeline
import io


# Build the summarization pipeline once at module level so every call to
# summarize_chunks reuses the same loaded model.
summarizer = pipeline(
    task="summarization",
    model="facebook/bart-large-cnn",
)

def extract_text_from_pdf(pdf_path):
    """Extract the embedded text and the embedded images of a PDF.

    Args:
        pdf_path: Location of the PDF; handed unchanged to ``fitz.open``.

    Returns:
        A ``(full_text, images)`` tuple. ``full_text`` is the text of every
        page concatenated in page order. ``images`` holds one PIL image per
        entry reported by ``page.get_images(full=True)``, in page order.

    Raises:
        Whatever PyMuPDF or Pillow raise while opening the document or
        decoding an image. The document is closed before the exception
        propagates.
    """
    page_texts = []
    images = []

    # The context manager closes the document even when reading a page or
    # decoding an image fails part-way through.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            page_texts.append(page.get_text("text"))

            for img in page.get_images(full=True):
                xref = img[0]  # first field of an image entry is its xref
                image_bytes = doc.extract_image(xref)["image"]
                # The image reads from its own in-memory buffer, so it stays
                # usable after the document has been closed.
                images.append(Image.open(io.BytesIO(image_bytes)))

    # Join once instead of growing a string page by page.
    return "".join(page_texts), images


def extract_text_from_images(images):
    """Run OCR over each image and return the recognised text.

    Args:
        images: Iterable of images accepted by
            ``pytesseract.image_to_string``.

    Returns:
        The OCR result of every image, in order, each followed by a
        newline. An empty string when ``images`` is empty.
    """
    recognised = [pytesseract.image_to_string(picture) for picture in images]
    return "".join(page_text + "\n" for page_text in recognised)

# Function to chunk the text for summarization
def chunk_text(text, max_chunk_size=1024):
    """Split text into whitespace-normalised chunks of bounded length.

    Words are never split. Consecutive words are packed greedily into a
    chunk for as long as the chunk, joined with single spaces, stays within
    ``max_chunk_size`` characters.

    Args:
        text: The text to split. Any run of whitespace counts as one
            separator.
        max_chunk_size: Maximum length, in characters, of a chunk. A single
            word longer than this limit is emitted as a chunk of its own.

    Returns:
        A list of chunk strings in the original word order. Empty when
        ``text`` contains no words.
    """
    chunks = []
    current_words = []
    current_length = 0  # length of " ".join(current_words)

    for word in text.split():
        # A separating space is only needed when the chunk already has words.
        added = len(word) + (1 if current_words else 0)

        # Flush before adding the word that would push the chunk over the
        # limit, so no chunk exceeds max_chunk_size.
        if current_words and current_length + added > max_chunk_size:
            chunks.append(" ".join(current_words))
            current_words = [word]
            current_length = len(word)
        else:
            current_words.append(word)
            current_length += added

    # Add the last chunk if it has content
    if current_words:
        chunks.append(" ".join(current_words))

    return chunks

# Function to summarize each chunk of text
def summarize_chunks(chunks):
    summarized_text = ""
    for chunk in chunks:
        try:
            # Summarize each chunk
            summary = summarizer(chunk, max_length=150, min_length=25, do_sample=False)
            summarized_text += summary[0]['summary_text'] + " "
        except Exception as e:
            summarized_text += f"\nError in summarization: {str(e)}"
    return summarized_text

# Main function to process the uploaded PDF
def process_upload(file_path):
    """Extract the text of a PDF, including OCR of its images, and summarise it.

    Args:
        file_path: Path of the PDF to process.

    Returns:
        Always a 2-tuple, so callers can unpack the result unconditionally:

        * ``(full_text, summary)`` on success, where ``full_text`` is the
          embedded text, a newline, then the OCR text.
        * ``("No extractable text found in the PDF.", None)`` when the
          combined text is empty or whitespace only.
        * ``("Error in processing PDF: <message>", None)`` when any step
          raises.
    """
    try:
        # Extract text and images from the PDF
        text, images = extract_text_from_pdf(file_path)

        # Extract text from the images using OCR
        ocr_text = extract_text_from_images(images)

        # Combine text from the PDF and from OCR
        full_text = text + "\n" + ocr_text

        # Nothing to summarise. Return the same (message, None) shape as
        # the error branch instead of a bare string.
        if not full_text.strip():
            return "No extractable text found in the PDF.", None

        # Chunk the text so each piece fits the summarizer
        chunks = chunk_text(full_text)

        # Summarize each chunk
        summary = summarize_chunks(chunks)

        return full_text, summary

    except Exception as e:
        return f"Error in processing PDF: {str(e)}", None