In [1]:
from dotenv import load_dotenv
import os

# Must run before the lookup below so that values from .env are visible.
load_dotenv()

# None when the variable is not defined.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")


In [2]:
# %pip (not !pip) installs into the environment of the running kernel.
%pip install openai chromadb langchain langchain-openai langchain-community tiktoken python-dotenv \
    pdfplumber pypdf pdf2image pytesseract pillow pandas numpy tabulate



In [3]:
# Explicit magic: a bare `pip ...` only works while IPython automagic is enabled
# and no variable named `pip` exists.
%pip install python-dotenv


Note: you may need to restart the kernel to use updated packages.


In [4]:
# %pip (not !pip) installs into the environment of the running kernel.
%pip install langchain_text_splitters



In [5]:
import os
import base64
import io
import numpy as np
import pandas as pd
from pathlib import Path
from typing import List, Dict, Tuple, Optional
from dotenv import load_dotenv

# PDF processing
import pdfplumber
from pypdf import PdfReader
from pdf2image import convert_from_path
from PIL import Image

# LangChain imports
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.vectorstores import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

# Load environment variables

from dotenv import load_dotenv
import os

load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Fail here with an actionable message: assigning None to os.environ would
# otherwise raise an unhelpful "str expected, not NoneType" TypeError.
if not OPENAI_API_KEY:
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Add it to your .env file or environment "
        "before running this notebook."
    )
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY


print("✓ All libraries imported successfully!")

✓ All libraries imported successfully!


In [6]:
# Confirm the key is available without echoing it: cell outputs are saved with
# the notebook, so printing the secret would leak it to anyone who sees the file.
api_key_loaded = bool(os.getenv("OPENAI_API_KEY"))
print(f"OPENAI_API_KEY loaded: {api_key_loaded}")


[REDACTED: an OpenAI API key was printed here. Revoke and rotate this key, and clear the cell output before sharing the notebook.]


In [7]:
class PDFContentExtractor:
    """
    Extract text, tables and page renderings from a single PDF.

    - Text: `page.extract_text()` for every page (via pdfplumber).
    - Tables: `page.extract_tables()`, each table also rendered as pipe-separated text.
    - Images: every page is converted to an image with `convert_from_path`
      (whole-page renderings, not individual embedded pictures); OCR is run on
      those renderings when `ocr=True`.

    Typical use: call `extract_all()` first, then `create_documents()`.
    """

    def __init__(self, pdf_path: str, extract_images: bool = True, ocr: bool = False):
        """
        Args:
            pdf_path: Path of the PDF to process.
            extract_images: When True, `extract_all()` also renders the pages to images.
            ocr: When True, run pytesseract on each rendered page
                 (only has an effect when `extract_images` is True).
        """
        self.pdf_path = pdf_path
        self.extract_images = extract_images
        self.ocr = ocr
        self.pages_content = []      # one dict per page: page_number / text / tables / images
        self.extracted_images = []   # one dict per rendered page: page / image / path [/ ocr_text]
        self.extracted_tables = []   # one dict per table: page / table_index / data / text

    def extract_all(self) -> Dict:
        """
        Extract all content from the PDF.

        Results are stored on the instance and also returned. Any state left by
        a previous call is discarded first, so calling this method again does
        not duplicate pages, tables or images.

        Returns:
            Dict with keys 'pages', 'total_pages', 'total_tables',
            'total_images', 'tables' and 'images'.
        """
        # Reset accumulated state so repeated calls stay idempotent.
        self.pages_content = []
        self.extracted_images = []
        self.extracted_tables = []

        print(f"Processing PDF: {self.pdf_path}")

        with pdfplumber.open(self.pdf_path) as pdf:
            total_pages = len(pdf.pages)
            for page_num, page in enumerate(pdf.pages, 1):
                print(f"  Processing page {page_num}/{total_pages}...")

                page_data = {
                    'page_number': page_num,
                    'text': '',
                    'tables': [],
                    'images': []
                }

                # Extract text (extract_text() may return nothing for a page)
                text = page.extract_text()
                if text:
                    page_data['text'] = text.strip()

                # Extract tables; table cells may also appear in the page text
                # above, so table content can show up twice in the page content.
                tables = page.extract_tables()
                for table_idx, table in enumerate(tables):
                    if table:
                        table_text = self._table_to_text(table, page_num, table_idx)
                        page_data['tables'].append(table_text)
                        self.extracted_tables.append({
                            'page': page_num,
                            'table_index': table_idx,
                            'data': table,
                            'text': table_text
                        })

                self.pages_content.append(page_data)

        # Extract images if requested
        if self.extract_images:
            self._extract_images()

        return self._compile_results()

    @staticmethod
    def _format_row(row) -> str:
        """Join one table row with ' | '; only None cells become empty strings."""
        return " | ".join("" if cell is None else str(cell) for cell in (row or []))

    def _table_to_text(self, table: List[List], page_num: int, table_idx: int) -> str:
        """
        Convert a table (list of rows, first row treated as header) to text.

        Args:
            table: Rows of cells; a cell may be None.
            page_num: 1-based page number, used in the caption.
            table_idx: 0-based table index on the page (shown 1-based in the caption).

        Returns:
            Caption line, header row, an 80-dash separator and the data rows,
            joined by newlines; "" for an empty table.
        """
        if not table:
            return ""

        header, rows = table[0], table[1:]

        text_parts = [f"\n[Table {table_idx + 1} on Page {page_num}]\n"]

        # Header row plus separator (skipped when the first row is empty)
        if header:
            text_parts.append(self._format_row(header))
            text_parts.append("-" * 80)

        # Data rows. Only None is blanked, so legitimate falsy values such as 0
        # are kept instead of being silently dropped.
        text_parts.extend(self._format_row(row) for row in rows)

        return "\n".join(text_parts)

    def _extract_images(self):
        """
        Render every PDF page to an image and optionally OCR it.

        Best effort: failures are printed, not raised, so text/table results
        from `extract_all()` are still returned.
        """
        print("  Extracting images from PDF...")

        try:
            # Convert PDF pages to images (all pages are rendered in one call)
            images = convert_from_path(self.pdf_path, dpi=150)

            for page_num, image in enumerate(images, 1):
                # Save image info
                img_data = {
                    'page': page_num,
                    'image': image,
                    'path': None  # Can save to disk if needed
                }

                # Perform OCR if requested
                if self.ocr:
                    try:
                        # Imported lazily so pytesseract is only needed when OCR is on
                        import pytesseract
                        ocr_text = pytesseract.image_to_string(image)
                        img_data['ocr_text'] = ocr_text
                    except Exception as e:
                        print(f"    OCR failed for page {page_num}: {e}")

                self.extracted_images.append(img_data)

        except Exception as e:
            print(f"  Image extraction failed: {e}")

    def _compile_results(self) -> Dict:
        """
        Bundle the extracted content and its counts into one dict.
        """
        return {
            'pages': self.pages_content,
            'total_pages': len(self.pages_content),
            'total_tables': len(self.extracted_tables),
            'total_images': len(self.extracted_images),
            'tables': self.extracted_tables,
            'images': self.extracted_images
        }

    def create_documents(self) -> List[Document]:
        """
        Create one LangChain Document per page that has any content.

        The page content is the page text, then the table texts, then any OCR
        text for that page, joined by blank lines. Pages without content are
        skipped. Requires `extract_all()` to have been called.

        Returns:
            List of Document objects with metadata 'source', 'page',
            'has_tables' and 'num_tables'.
        """
        # Index OCR text by page once instead of rescanning every image per page.
        ocr_by_page = {}
        for img in self.extracted_images:
            ocr_text = img.get('ocr_text')
            if ocr_text:
                ocr_by_page.setdefault(img['page'], []).append(ocr_text)

        documents = []

        for page_data in self.pages_content:
            page_num = page_data['page_number']

            # Combine text and tables for the page
            content_parts = []

            if page_data['text']:
                content_parts.append(page_data['text'])

            if page_data['tables']:
                content_parts.extend(page_data['tables'])

            # Add OCR text from images if available
            for ocr_text in ocr_by_page.get(page_num, []):
                content_parts.append(f"\n[Image OCR Text]\n{ocr_text}")

            if content_parts:
                combined_content = "\n\n".join(content_parts)

                doc = Document(
                    page_content=combined_content,
                    metadata={
                        'source': self.pdf_path,
                        'page': page_num,
                        'has_tables': len(page_data['tables']) > 0,
                        'num_tables': len(page_data['tables'])
                    }
                )
                documents.append(doc)

        return documents

print("✓ PDFContentExtractor class defined")

✓ PDFContentExtractor class defined


In [8]:
# Example: Process your PDF files
# The folder holding the PDFs can be overridden with the RAG_PDF_DIR environment
# variable (for example in .env); when unset, the original location is used.
PDF_DIR = Path(os.getenv("RAG_PDF_DIR", r"C:\Users\ASUS\OneDrive\Desktop\RAG\pdfs"))

pdf_files = [
    str(PDF_DIR / "Paradigms_of_Programming.pdf"),
]

# Fail before any processing starts if a listed file does not exist.
missing_pdfs = [pdf_path for pdf_path in pdf_files if not Path(pdf_path).is_file()]
if missing_pdfs:
    raise FileNotFoundError(
        f"PDF file(s) not found: {missing_pdfs}. "
        "Set RAG_PDF_DIR or edit the 'pdf_files' list."
    )

# One LangChain Document per non-empty page, across all PDFs
all_documents = []

if pdf_files:
    for pdf_path in pdf_files:
        print(f"\nProcessing: {pdf_path}")
        print("="*80)

        extractor = PDFContentExtractor(
            pdf_path=pdf_path,
            extract_images=True,  # Extract images
            ocr=False  # Set to True if you want OCR on images
        )

        # Extract all content
        results = extractor.extract_all()

        print("\nExtracted:")
        print(f"  - {results['total_pages']} pages")
        print(f"  - {results['total_tables']} tables")
        print(f"  - {results['total_images']} images")

        # Create documents
        docs = extractor.create_documents()
        all_documents.extend(docs)

        print(f"  - Created {len(docs)} document chunks")
else:
    # Only reached when 'pdf_files' is emptied: fall back to two sample texts.
    print("No PDF files specified. Please add your PDF file paths to the 'pdf_files' list above.")
    print("\nUsing sample text documents for demonstration...")

    # Sample documents for testing
    sample_texts = [
        """Machine Learning is a subset of AI that focuses on algorithms that can learn from data.
        It includes supervised learning, unsupervised learning, and reinforcement learning.""",
        
        """Deep Learning uses neural networks with multiple layers to learn hierarchical representations.
        It has achieved remarkable success in image recognition and natural language processing.""",
    ]

    all_documents = [Document(page_content=text, metadata={"source": "sample", "page": i}) 
                     for i, text in enumerate(sample_texts, 1)]

print(f"\n✓ Total documents to process: {len(all_documents)}")


Processing: C:\Users\ASUS\OneDrive\Desktop\RAG\pdfs\Paradigms_of_Programming.pdf
Processing PDF: C:\Users\ASUS\OneDrive\Desktop\RAG\pdfs\Paradigms_of_Programming.pdf
  Processing page 1/46...
  Processing page 2/46...
  Processing page 3/46...
  Processing page 4/46...
  Processing page 5/46...
  Processing page 6/46...
  Processing page 7/46...
  Processing page 8/46...
  Processing page 9/46...
  Processing page 10/46...
  Processing page 11/46...
  Processing page 12/46...
  Processing page 13/46...
  Processing page 14/46...
  Processing page 15/46...
  Processing page 16/46...
  Processing page 17/46...
  Processing page 18/46...
  Processing page 19/46...
  Processing page 20/46...
  Processing page 21/46...
  Processing page 22/46...
  Processing page 23/46...
  Processing page 24/46...
  Processing page 25/46...
  Processing page 26/46...
  Processing page 27/46...
  Processing page 28/46...
  Processing page 29/46...
  Processing page 30/46...
  Processing page 31/46...
  Pro

In [9]:
# Chunking setup: 1000-character chunks (large enough to keep table context)
# with 200 characters of overlap between neighbouring chunks. Separators are
# tried in the listed order.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ". ", " ", ""],
    length_function=len,
    chunk_size=1000,
    chunk_overlap=200,
)

# Break the page-level documents into chunks
splits = text_splitter.split_documents(all_documents)

print(f"✓ Split into {len(splits)} chunks")
print("\nSample chunk:")
print("-" * 80)
if splits:
    first_chunk = splits[0]
    preview = first_chunk.page_content
    # Show at most the first 500 characters, marking truncation with "..."
    if len(preview) > 500:
        preview = preview[:500] + "..."
    print(preview)
    print(f"\nMetadata: {first_chunk.metadata}")

✓ Split into 78 chunks

Sample chunk:
--------------------------------------------------------------------------------
GettingStarted
Black-BoxAbstraction
Foodforthought
Welcome to CS302
Paradigms of Programming
1/28


[Table 1 on Page 1]

GettingStarted
Black-BoxAbstraction
Foodforthought | 
--------------------------------------------------------------------------------
Welcome to CS302
Paradigms of Programming
1/28 |

Metadata: {'source': 'C:\\Users\\ASUS\\OneDrive\\Desktop\\RAG\\pdfs\\Paradigms_of_Programming.pdf', 'page': 1, 'has_tables': True, 'num_tables': 1}


In [10]:
# Confirm the key is available without echoing it: cell outputs are saved with
# the notebook, so printing the secret would leak it to anyone who sees the file.
api_key_loaded = bool(os.getenv("OPENAI_API_KEY"))
print(f"OPENAI_API_KEY loaded: {api_key_loaded}")


[REDACTED: an OpenAI API key was printed here. Revoke and rotate this key, and clear the cell output before sharing the notebook.]


In [11]:
# Initialize OpenAI embeddings
embeddings = OpenAIEmbeddings(
    model="text-embedding-3-small",  # Fast and efficient
    # model="text-embedding-3-large",  # Use this for higher quality
    openai_api_key=OPENAI_API_KEY
)

# Nothing to index means an earlier cell produced no chunks; say so directly.
if not splits:
    raise ValueError("No chunks to index: 'splits' is empty. Run the extraction and splitting cells first.")

print("Creating vector store...")

# Deterministic ids (source + page + position in 'splits'). Without explicit ids
# each run adds the same chunks again to the persisted collection; with stable
# ids a re-run is expected to overwrite the existing entries instead.
chunk_ids = [
    f"{chunk.metadata.get('source')}::p{chunk.metadata.get('page')}::c{idx}"
    for idx, chunk in enumerate(splits)
]

# Create vector store with persistence
vectorstore = Chroma.from_documents(
    documents=splits,
    embedding=embeddings,
    ids=chunk_ids,
    collection_name="pdf_rag_collection",
    persist_directory="./chroma_pdf_db"
)

print("✓ Vector store created and persisted to ./chroma_pdf_db")

Creating vector store...


RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}