In [None]:
!pip install PyMuPDF pdfplumber pytesseract pdf2image pillow python-dateutil pandas
!sudo apt-get install poppler-utils tesseract-ocr tesseract-ocr-eng libtesseract-dev


In [None]:
!pip install PyMuPDF pdf2image pdfplumber pytesseract Pillow python-dateutil
!sudo apt-get install poppler-utils tesseract-ocr tesseract-ocr-eng

import os
import fitz
from pdf2image import convert_from_path
import pdfplumber
import pytesseract
from PIL import Image
import io
import pandas as pd
from IPython.display import display, HTML, Markdown

def process_pdf(pdf_path):
    """Extract metadata, text, tables, page images and OCR text from a PDF.

    Args:
        pdf_path: Path of the PDF file. It is handed unchanged to PyMuPDF
            (``fitz.open``), ``pdfplumber.open`` and
            ``pdf2image.convert_from_path``.

    Returns:
        A dict with these keys:
            'metadata': document metadata as reported by PyMuPDF
                (an empty dict when none is reported).
            'text': one formatted string per page that yielded text.
            'tables': one dict per extracted table with 'page' and
                'table_num' (both 1-based) and 'dataframe'.
            'images': the images returned by ``convert_from_path``.
            'ocr_text': one formatted string per image whose OCR output
                is not just whitespace.
    """
    results = {
        'metadata': {},
        'text': [],
        'tables': [],
        'images': [],
        'ocr_text': []
    }

    # Metadata via PyMuPDF.
    with fitz.open(pdf_path) as doc:
        # Normalise a missing (None) metadata value to an empty dict so
        # consumers can always treat it as a mapping.
        results['metadata'] = doc.metadata or {}

    # Text and tables via pdfplumber, page by page.
    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            text = page.extract_text()
            if text:
                results['text'].append(f"Page {page_num} Text:\n{text}\n{'='*50}")

            for table_num, table in enumerate(page.extract_tables(), start=1):
                # An empty table has no header row to use as column names;
                # skip it instead of failing on table[0].
                if not table:
                    continue
                header, *rows = table
                results['tables'].append({
                    'page': page_num,
                    'table_num': table_num,
                    'dataframe': pd.DataFrame(rows, columns=header)
                })

    # Render the PDF to images and OCR each one. The image object itself is
    # stored for display, so no intermediate encoding to bytes is needed.
    for img_num, img in enumerate(convert_from_path(pdf_path), start=1):
        ocr_text = pytesseract.image_to_string(img)
        if ocr_text.strip():
            results['ocr_text'].append(f"Image {img_num} OCR Text:\n{ocr_text}\n{'='*50}")

        results['images'].append(img)

    return results

def display_results(results):
    """Render an extraction result dict as rich output in the notebook.

    Sections appear in a fixed order: metadata, extracted text, OCR text,
    tables, images. The OCR and table sections are left out entirely when
    their lists are empty; the other three always get a heading.

    Args:
        results: Dict with the keys 'metadata', 'text', 'ocr_text',
            'tables' and 'images'. Each entry of 'tables' is read through
            its 'page', 'table_num' and 'dataframe' keys.
    """
    # Metadata, shown as a frame with one row per metadata key.
    display(HTML("<h2>PDF Metadata</h2>"))
    metadata_frame = pd.DataFrame.from_dict(results['metadata'], orient='index')
    display(metadata_frame)

    # Per-page text, joined into a single Markdown cell.
    display(HTML("<h2>Extracted Text</h2>"))
    page_text = "\n".join(results['text'])
    display(Markdown(page_text))

    # OCR output, only when there is any.
    ocr_chunks = results['ocr_text']
    if ocr_chunks:
        display(HTML("<h2>OCR Text from Images</h2>"))
        display(Markdown("\n".join(ocr_chunks)))

    # Tables, each under its own page/table caption.
    extracted_tables = results['tables']
    if extracted_tables:
        display(HTML("<h2>Extracted Tables</h2>"))
        for entry in extracted_tables:
            caption = f"<h4>Page {entry['page']} - Table {entry['table_num']}</h4>"
            display(HTML(caption))
            display(entry['dataframe'])

    # Stored images, in list order.
    display(HTML("<h2>Extracted Images</h2>"))
    for page_image in results['images']:
        display(page_image)



In [None]:
# Upload PDF file in Colab
from google.colab import files
uploaded = files.upload()

# The upload result is keyed by file name and can come back empty (for
# example if nothing was selected), so check before taking the first entry.
if uploaded:
    # Process and display results for the first uploaded file only
    pdf_file = next(iter(uploaded))
    results = process_pdf(pdf_file)
    display_results(results)
else:
    print("No file was uploaded; nothing to process.")