In [1]:
import os
import json
import pdfplumber
import fitz  # PyMuPDF

# Step 1: Define functions to extract text, tables, and links

def extract_text_and_tables(pdf_path):
    """Pull page text and tables out of a PDF with pdfplumber.

    Returns a ``(text, tables)`` tuple. ``text`` is the text of every page
    that yielded any, joined by blank lines. ``tables`` is a list of
    ``{'page': <page number>, 'table': <rows>}`` dicts in page order.
    """
    page_texts = []
    found_tables = []
    with pdfplumber.open(pdf_path) as document:
        for current_page in document.pages:
            content = current_page.extract_text()
            if content:
                page_texts.append(content)
            # Tag each table with the page it was found on so it can be
            # traced back after everything is flattened into one list.
            found_tables.extend(
                {'page': current_page.page_number, 'table': rows}
                for rows in current_page.extract_tables()
            )
    return "\n\n".join(page_texts), found_tables

def extract_links(pdf_path):
    """Extract links from a PDF using PyMuPDF.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        A list with one dict per link, in page order:
        ``{'page': <1-based page number>, 'url': <'uri' value or ''>,
        'text': <'title' value or ''>}``.

    Any exception raised while opening or reading the document propagates
    to the caller; the document is closed first.
    """
    doc = fitz.open(pdf_path)
    try:
        links = []
        for page_num in range(len(doc)):
            page = doc[page_num]
            for link in page.get_links():
                # Store link data
                links.append({
                    'page': page_num + 1,
                    'url': link.get('uri', ''),
                    'text': link.get('title', '')  # Not always available
                })
        return links
    finally:
        # Release the file handle even when a page fails to parse; otherwise
        # a long batch run accumulates open documents.
        doc.close()

# Step 2: Walk through all folders and collect PDFs

base_dir = 'E:/WESEEAGENT/Archibus Docs'  # Change to your main folder
# Recursively gather every file whose name ends in .pdf (case-insensitive).
pdfs = [
    os.path.join(root, file)
    for root, dirs, files in os.walk(base_dir)
    for file in files
    if file.lower().endswith('.pdf')
]

# Step 3: Process all PDFs and build knowledge base

knowledge_base = []

for pdf_path in pdfs:
    # Get folder and file info (folder is stored relative to base_dir)
    folder = os.path.relpath(os.path.dirname(pdf_path), base_dir)
    filename = os.path.basename(pdf_path)

    # Extract content. One unreadable PDF must not abort a run over
    # thousands of files (nothing is saved until the very end), so each
    # failure is reported and recorded instead of propagating.
    try:
        text, tables = extract_text_and_tables(pdf_path)
    except Exception as e:
        print(f"Error processing {pdf_path}: {e}")
        text, tables = "Error: " + str(e), []

    # Links are extracted separately so a link failure does not discard
    # text and tables that were already extracted successfully.
    try:
        links = extract_links(pdf_path)
    except Exception as e:
        print(f"Error extracting links from {pdf_path}: {e}")
        links = []

    # Store in knowledge base
    knowledge_base.append({
        'folder': folder,
        'file': filename,
        'text': text,
        'tables': tables,
        'links': links
    })

# Step 4: Save knowledge base as JSON

# Pretty-printed UTF-8 JSON; non-ASCII characters are written as-is.
output_path = 'knowledge_base.json'
with open(output_path, mode='w', encoding='utf-8') as out_file:
    json.dump(knowledge_base, out_file, ensure_ascii=False, indent=2)

print("Knowledge base saved as knowledge_base.json")


KeyboardInterrupt: 

In [1]:
import os
import json
import pdfplumber
from concurrent.futures import ThreadPoolExecutor

def process_pdf(pdf_path, base_dir):
    """Extract the full text of one PDF and describe where it came from.

    Args:
        pdf_path: Path of the PDF file to read.
        base_dir: Root folder; the returned 'folder' is relative to it.

    Returns:
        A dict with keys 'folder', 'file' and 'text'. 'text' is the text of
        every page that yielded any, joined by blank lines. If the PDF
        cannot be processed, the error is printed and 'text' is
        ``"Error: <message>"``.
    """
    folder = os.path.relpath(os.path.dirname(pdf_path), base_dir)
    filename = os.path.basename(pdf_path)
    try:
        with pdfplumber.open(pdf_path) as pdf:
            # Call extract_text() once per page: filtering and joining on
            # separate calls would run the extraction twice for every
            # non-empty page.
            page_texts = (page.extract_text() for page in pdf.pages)
            text = "\n\n".join(t for t in page_texts if t)
    except Exception as e:
        print(f"Error processing {pdf_path}: {e}")
        text = "Error: " + str(e)
    return {
        'folder': folder,
        'file': filename,
        'text': text
    }

base_dir = 'E:/WESEEAGENT/Archibus Docs'  # Change to your path
pdfs = []
for root, dirs, files in os.walk(base_dir):
    # Keep only the PDF files of this directory level (case-insensitive match).
    pdfs.extend(
        os.path.join(root, name) for name in files if name.lower().endswith('.pdf')
    )

knowledge_base = []
batch_size = 50  # Adjust as needed
total_pdfs = len(pdfs)
for batch_start in range(0, total_pdfs, batch_size):
    current_batch = pdfs[batch_start:batch_start + batch_size]
    # Each batch gets its own 4-worker pool; leaving the `with` block waits
    # for the workers, and map() returns results in input order.
    with ThreadPoolExecutor(max_workers=4) as pool:
        batch_records = list(
            pool.map(lambda path: process_pdf(path, base_dir), current_batch)
        )
    knowledge_base += batch_records
    print(f"Processed {batch_start + len(current_batch)}/{total_pdfs} PDFs")

# Write the collected records as indented UTF-8 JSON, keeping non-ASCII text readable.
with open(file='knowledge_base.json', mode='w', encoding='utf-8') as json_file:
    json.dump(obj=knowledge_base, fp=json_file, indent=2, ensure_ascii=False)

print("Knowledge base saved as knowledge_base.json")


Processed 50/5429 PDFs
Processed 100/5429 PDFs
Processed 150/5429 PDFs
Processed 200/5429 PDFs
Processed 250/5429 PDFs
Processed 300/5429 PDFs
Processed 350/5429 PDFs
Processed 400/5429 PDFs
Processed 450/5429 PDFs
Processed 500/5429 PDFs
Processed 550/5429 PDFs
Processed 600/5429 PDFs
Processed 650/5429 PDFs
Processed 700/5429 PDFs
Processed 750/5429 PDFs
Processed 800/5429 PDFs
Processed 850/5429 PDFs
Processed 900/5429 PDFs
Processed 950/5429 PDFs
Processed 1000/5429 PDFs
Processed 1050/5429 PDFs
Processed 1100/5429 PDFs
Processed 1150/5429 PDFs
Processed 1200/5429 PDFs
Processed 1250/5429 PDFs
Processed 1300/5429 PDFs
Processed 1350/5429 PDFs
Processed 1400/5429 PDFs
Processed 1450/5429 PDFs
Processed 1500/5429 PDFs
Processed 1550/5429 PDFs
Processed 1600/5429 PDFs
Processed 1650/5429 PDFs
Processed 1700/5429 PDFs
Processed 1750/5429 PDFs
Processed 1800/5429 PDFs
Processed 1850/5429 PDFs
Processed 1900/5429 PDFs
Processed 1950/5429 PDFs
Processed 2000/5429 PDFs
Processed 2050/5429 

In [2]:
import os
import json
import pdfplumber
from concurrent.futures import ThreadPoolExecutor
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Initialize the text splitter (you can tweak chunk size & overlap).
# Separators are tried in order, coarsest first. The trailing "" is the
# last-resort fallback: without it, a run of text containing no newline,
# period or space (e.g. a long URL or a garbled table row) cannot be split
# and is emitted as a single chunk longer than chunk_size.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
    separators=["\n\n", "\n", ".", " ", ""]
)

def process_pdf(pdf_path, base_dir):
    """Extract one PDF's text and split it into chunk records.

    Args:
        pdf_path: Path of the PDF file to read.
        base_dir: Root folder; each record's 'folder' is relative to it.

    Returns:
        A list of dicts, one per chunk produced by the module-level
        ``text_splitter``, each with keys 'folder', 'file', 'text' and
        'source' ('source' repeats the file name). If the PDF cannot be
        processed, the error is printed and a single record whose 'text' is
        ``"Error: <message>"`` is returned.
    """
    folder = os.path.relpath(os.path.dirname(pdf_path), base_dir)
    filename = os.path.basename(pdf_path)

    try:
        with pdfplumber.open(pdf_path) as pdf:
            # Call extract_text() once per page: filtering and joining on
            # separate calls would run the extraction twice for every
            # non-empty page.
            page_texts = (page.extract_text() for page in pdf.pages)
            full_text = "\n\n".join(t for t in page_texts if t)
            chunks = text_splitter.split_text(full_text)
    except Exception as e:
        print(f"Error processing {pdf_path}: {e}")
        chunks = ["Error: " + str(e)]

    return [
        {
            'folder': folder,
            'file': filename,
            'text': chunk,
            'source': filename
        }
        for chunk in chunks
    ]

# Set base directory path
base_dir = 'E:/WESEEAGENT/Archibus Docs'  # Change to your path
pdfs = []

# Visit every directory below base_dir and keep the files named *.pdf
# (extension compared case-insensitively).
for folder_path, _subdirs, names in os.walk(base_dir):
    pdfs += [
        os.path.join(folder_path, name)
        for name in names
        if name.lower().endswith('.pdf')
    ]

knowledge_base = []
batch_size = 50  # Tune depending on system capability
total_pdfs = len(pdfs)

for batch_start in range(0, total_pdfs, batch_size):
    current_batch = pdfs[batch_start:batch_start + batch_size]
    # Each batch gets its own 4-worker pool; map() returns one list of
    # chunk records per PDF, in input order.
    with ThreadPoolExecutor(max_workers=4) as pool:
        per_pdf_chunks = list(
            pool.map(lambda path: process_pdf(path, base_dir), current_batch)
        )

    # Flatten the per-PDF lists into the single knowledge base list.
    knowledge_base += [
        record for doc_records in per_pdf_chunks for record in doc_records
    ]

    print(f"Processed {batch_start + len(current_batch)}/{total_pdfs} PDFs")

# Save the full knowledge base with smaller, high-quality chunks
output_path = 'knowledge_base.json'
with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(knowledge_base, f, indent=2, ensure_ascii=False)

# Report the path that was actually written. The message used to name
# 'knowledge_base(new).json', a file this cell never creates.
print(f"Knowledge base saved as {output_path}")


Processed 50/5429 PDFs
Processed 100/5429 PDFs
Processed 150/5429 PDFs
Processed 200/5429 PDFs
Processed 250/5429 PDFs
Processed 300/5429 PDFs
Processed 350/5429 PDFs
Processed 400/5429 PDFs
Processed 450/5429 PDFs
Processed 500/5429 PDFs
Processed 550/5429 PDFs
Processed 600/5429 PDFs
Processed 650/5429 PDFs
Processed 700/5429 PDFs
Processed 750/5429 PDFs
Processed 800/5429 PDFs
Processed 850/5429 PDFs
Processed 900/5429 PDFs
Processed 950/5429 PDFs
Processed 1000/5429 PDFs
Processed 1050/5429 PDFs
Processed 1100/5429 PDFs
Processed 1150/5429 PDFs
Processed 1200/5429 PDFs
Processed 1250/5429 PDFs
Processed 1300/5429 PDFs
Processed 1350/5429 PDFs
Processed 1400/5429 PDFs
Processed 1450/5429 PDFs
Processed 1500/5429 PDFs
Processed 1550/5429 PDFs
Processed 1600/5429 PDFs
Processed 1650/5429 PDFs
Processed 1700/5429 PDFs
Processed 1750/5429 PDFs
Processed 1800/5429 PDFs
Processed 1850/5429 PDFs
Processed 1900/5429 PDFs
Processed 1950/5429 PDFs
Processed 2000/5429 PDFs
Processed 2050/5429 