Extract Document Content

In [9]:
# !pip install pymupdf openai tiktoken python-dotenv

In [48]:
import fitz  # PyMuPDF
import re
import os
from pathlib import Path
from typing import List

def extract_pdf_text(pdf_path):
    """Extract clean text from PDF, handles images/tables automatically"""
    doc = fitz.open(pdf_path)
    text = []
    
    for page in doc:
        # Extract text blocks (ignores images, keeps readable content)
        page_text = page.get_text("text")
        text.append(page_text.strip())
    
    doc.close()
    return "\n\n".join(text)

def clean_text(raw_text: str) -> str:
    """Aggressive OCR noise removal + semantic cleanup"""
    
    # Step 1: Remove heavy OCR garbage (symbols, random letters, prices)
    text = re.sub(r'[$€£¥]\d+\.?\d*|\b[kx]+|\b[vj]+|\b[L»]+\b|[•/\\|]+|[%#*®]+|[-]{2,}', ' ', raw_text)
    text = re.sub(r'\b\w{1,2}[^a-zA-Z\s]{1,2}\b', ' ', text)  # Short gibberish
    
    # Step 2: Remove common OCR artifacts (CIVIV/M, OMNIVM, etc.)
    text = re.sub(r'\b[A-Z]{2,}M\b|\bfoMccui\b|\bM®e®\b|\bL[V^]+\b', ' ', text)
    text = re.sub(r'\b\w+wm\)\b|\bzxwm\)\b', ' ', text)
    
    # Step 3: Normalize whitespace aggressively
    text = re.sub(r'\s*\n\s*\n\s*\n+', '\n\n', text)  # Multiple newlines -> double
    text = re.sub(r'[ \t]+', ' ', text)  # Multiple spaces/tabs -> single space
    text = re.sub(r'\n[ \t]*\n', '\n\n', text)  # Clean single newlines
    
    # Step 4: Remove lines with mostly noise (short lines, all caps fragments)
    lines = text.split('\n')
    clean_lines = []
    for line in lines:
        line = line.strip()
        if len(line) > 20 and not re.match(r'^[A-Z\s\W]+$', line):  # Skip short/noisy lines
            clean_lines.append(line)
    
    text = '\n\n'.join(clean_lines)
    
    # Step 5: Final polish - remove excessive spaces, strip
    text = re.sub(r'\s+', ' ', text)
    text = text.strip()
    
    # Step 6: Remove common publisher fragments
    text = re.sub(r'boston PUBLIC LIBRARY\b', '', text, flags=re.IGNORECASE)
    text = re.sub(r'HAWTHORN BOOKS INC?\.?\b.*?(?=Copyright|$)', '', text, flags=re.IGNORECASE)
    
    return text.strip()

def process_folder(folder_path):
    """Process all PDFs in folder, clean text, save as text files in Extracted_text folder"""
    folder = Path(folder_path)
    extracted_folder = folder / "Extracted_text"
    extracted_folder.mkdir(exist_ok=True)  # Create folder if it doesn't exist
    docs = []
    
    for pdf_file in folder.glob("*.pdf"):
        print(f"Processing: {pdf_file.name}")
        
        # Extract raw text
        raw_text = extract_pdf_text(pdf_file)
        
        # Clean the text
        cleaned_text = clean_text(raw_text)
        
        # Save individual text file to Extracted_text folder
        txt_path = extracted_folder / f"{pdf_file.stem}.txt"
        txt_path.write_text(cleaned_text)
        
        docs.append({
            "source": pdf_file.name,
            "text": cleaned_text,  # Store cleaned version
            "chunk_id": f"{pdf_file.stem}_full"
        })
        
        print(f"  -> Extracted {len(raw_text):,} chars -> {len(cleaned_text):,} chars cleaned -> Saved: {txt_path.name}")
    
    return docs


# Usage
pdf_folder = "PDFs"
documents = process_folder(pdf_folder)

Processing: All about repairing small appliances.pdf
  -> Extracted 244,610 chars -> 154,095 chars cleaned -> Saved: All about repairing small appliances.txt
Processing: Care and repair of your large home appliances.pdf
  -> Extracted 138,342 chars -> 100,813 chars cleaned -> Saved: Care and repair of your large home appliances.txt
Processing: All about repairing major household appliances.pdf
  -> Extracted 144,034 chars -> 129,411 chars cleaned -> Saved: All about repairing major household appliances.txt
Processing: All thumbs guide to VCRs.pdf
  -> Extracted 104,793 chars -> 89,572 chars cleaned -> Saved: All thumbs guide to VCRs.txt


In [49]:
documents[0]["text"][:2000]

'Illustrated by Carl Bryant W. Clement Stone, Publisher s nice, of course, to have a handyman around the house, but never there when the steam steam or the toaster won’t is written for and dedicated to the housewife who needs a repair manual on ast array of small electrical appliances. Edited by the Staff of Vocational Horizons, Inc. rights reserved, including the right to reproduce this book or portions thereof in any form, except inquiries should be addressed Madison Avenue, New York, New York of America and published taneously in Canada by Prentice-Hall of Canada, Limited, Birch- mount Road, Scarborough, Ontario. Library of Congress Catalog Card I: Repair Problems and Their Solutions Iron does not slide smoothly over cloth Not enough steam comes from the iron Water and steam sputters out of steam ports during operation Iron does not spray water Iron does not shut off Toaster does not heat One side of bread does not toast Bread does not stay down Toaster does not pop up Bread toasts 

In [46]:
len(documents)

4

In [55]:
documents[0].keys()

dict_keys(['source', 'text', 'chunk_id'])

Create Document Chunk

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
splits = text_splitter.split_documents(eur_lex_docs)

In [57]:
# !pip install pinecone

Test Basic Search functionality