# Pdf Reader

In [1]:
pip install pypdf pdfplumber bs4 pytesseract lxml tabulate

Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
import re
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceBgeEmbeddings

# Configuration
DOCS_DIR = "datastore/"
PERSIST_DIR = "stores/insurance_metadata_v4"
EMBEDDING_MODEL = "BAAI/bge-large-en"

def extract_plan_type(filename):
    """Extract plan type from filename (case-insensitive)"""
    filename_lower = filename.lower()
    if "basic" in filename_lower: return "basic"
    if "standard" in filename_lower: return "standard"
    if "enhanced" in filename_lower: return "enhanced"
    if "scotia" in filename_lower: return "scotia"
    return "other"

# Initialize components
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    add_start_index=True
)

embeddings = HuggingFaceBgeEmbeddings(
    model_name=EMBEDDING_MODEL,
    model_kwargs={"device": "cpu"},
    encode_kwargs={"normalize_embeddings": False}
)

# Process each PDF
all_splits = []

for pdf_file in os.listdir(DOCS_DIR):
    if not pdf_file.endswith(".pdf"):
        continue
        
    print(f"Processing: {pdf_file}")
    file_path = os.path.join(DOCS_DIR, pdf_file)
    
    # Load PDF
    loader = PyPDFLoader(file_path)
    pages = loader.load()
    
    # Split and add metadata
    plan_type = extract_plan_type(pdf_file)
    for page in pages:
        splits = text_splitter.split_documents([page])
        for split in splits:
            split.metadata.update({
                "plan_type": plan_type,
                "source_file": pdf_file,
                "file_type": "pdf"
            })
        all_splits.extend(splits)

# Create vector store
vectorstore = Chroma.from_documents(
    documents=all_splits,
    embedding=embeddings,
    persist_directory=PERSIST_DIR
)

vectorstore._collection.modify(
    metadata={"allow_filtering": True}  # Enable filtering by doc_type
)

print(f"\nIngestion complete! Stored {len(all_splits)} chunks from {len(os.listdir(DOCS_DIR))} PDFs.")

  embeddings = HuggingFaceBgeEmbeddings(
  from .autonotebook import tqdm as notebook_tqdm


Processing: scotia.pdf

Ingestion complete! Stored 21 chunks from 1 PDFs.


In [3]:
all_splits

[Document(metadata={'producer': 'Acrobat Distiller 6.0 for Macintosh', 'creator': 'QuarkXPress: pictwpstops filter 1.0', 'creationdate': '2005-10-07T12:40:16-04:00', 'moddate': '2005-10-07T12:40:16-04:00', 'title': 'SA72-75_CERT_E', 'author': 'Monique Stewart-Channer', 'source': 'datastore/scotia.pdf', 'total_pages': 2, 'page': 0, 'page_label': '1', 'start_index': 0, 'plan_type': 'scotia', 'source_file': 'scotia.pdf', 'file_type': 'pdf'}, page_content='COPY\nCERTIFICATE OF INSURANCE\nSCOTIA® ACCIDENT CARE PLAN  •  GROUP POLICY NUMBER SLG000007\nSCOTIA LIFE INSURANCE COMPANY\n100 YONGE STREET, SUITE 400, TORONTO, ONTARIO M5H 1H1\nTEL: 1-800-387-9844 / FAX: 1-800-647-8129 / www.scotialife.com\nScotia Life Insurance Company (ScotiaLife) has issued the above-referenced\nGroup Policy to The Bank of Nova Scotia (Scotiabank).\nThis Certificate is intended to provide a summary of the principal provisions of\nthe Group Policy. This Certificate is not an insurance policy, insurance contract (or\

# Extract data from Table

In [4]:
import os
import pdfplumber
from langchain.schema import Document
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceBgeEmbeddings


def extract_structured_docs_from_pdf(pdf_path):
    docs = []
    with pdfplumber.open(pdf_path) as pdf:
        for i, page in enumerate(pdf.pages):
            tables = page.extract_tables()
            plan_type = extract_plan_type(pdf_path)
            for table in tables:
                if table and len(table) > 1:  # Skip empty or header-only tables
                    for row in table[1:]:  # Skip header row
                        try:
                            benefit, reimbursement, maximum = row
                            sentence = f"{benefit} coverage reimburses {reimbursement} with a maximum of {maximum}."
                            metadata = {
                                "doc_type": os.path.basename(pdf_path),
                                "plan_type": plan_type,
                                "source": str(os.path.basename(pdf_path)).replace(".pdf",""),
                                "page": i + 1,
                                "benefit": benefit.strip()
                            }
                            docs.append(Document(page_content=sentence, metadata=metadata))
                        except ValueError:
                            continue  # skip malformed rows
    return docs

def process_pdf_folder(folder_path):
    all_docs = []
    for filename in os.listdir(folder_path):
        if filename.endswith(".pdf"):
            pdf_path = os.path.join(folder_path, filename)
            docs = extract_structured_docs_from_pdf(pdf_path)
            all_docs.extend(docs)
    return all_docs

# Extract documents
docs = process_pdf_folder(DOCS_DIR)
print(docs)

# Add new documents
if docs:
    vectorstore.add_documents(docs)
    vectorstore.persist()

print(f"✅ {len(docs)} documents embedded and saved to vector store.")


[]
✅ 0 documents embedded and saved to vector store.


# WebPageLoader

In [5]:
URLS = [
    "https://www.ontario.ca/page/what-ohip-covers",
    "https://www.ontario.ca/page/ohip-coverage-while-outside-canada",
    "https://www.ontario.ca/page/documents-needed-get-health-card",
    "https://www.ontario.ca/page/military-families-services-and-support",
    "https://www.ontario.ca/page/apply-ohip-and-get-health-card",
    "https://uhip.ca/help-faq/",
    "https://www.ontario.ca/page/learn-about-ohip-plus"
]


In [6]:
from bs4 import BeautifulSoup
import re
import requests
import time
from typing import List
from langchain_core.documents import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.document_loaders import WebBaseLoader

# 1. Define the custom text splitter
class GovernmentTextSplitter(RecursiveCharacterTextSplitter):
    def __init__(self):
        super().__init__(
            chunk_size=512,
            chunk_overlap=64,
            separators=[
                "\n\n", 
                "\n", 
                r"(?<=\. )",  # Split after periods
                " ", 
                ""
            ],
            keep_separator=True
        )

# 2. Custom web loader with cleaning
class GovernmentWebLoader(WebBaseLoader):
    def __init__(self, urls):
        super().__init__(urls)
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
        })
    
    def load(self) -> List[Document]:
        docs = super().load()
        return self.clean_documents(docs)
    
    def clean_documents(self, docs: List[Document]) -> List[Document]:
        cleaned_docs = []
        for doc in docs:
            try:
                soup = BeautifulSoup(doc.page_content, 'html.parser')
                
                # Remove unwanted elements
                for element in soup(['script', 'style', 'nav', 'footer']):
                    element.decompose()
                
                # Get clean text
                text = soup.get_text('\n', strip=True)
                text = re.sub(r'\n{3,}', '\n\n', text).strip()
                
                # Preserve important metadata
                metadata = doc.metadata.copy()
                metadata.update({
                    "plan_type": "OHIP" if "ontario.ca" in doc.metadata["source"] else "UHIP",
                    "cleaned": True
                })
                
                cleaned_docs.append(Document(
                    page_content=text,
                    metadata=metadata
                ))
            except Exception as e:
                print(f"Error cleaning document: {str(e)}")
                cleaned_docs.append(doc)
        return cleaned_docs

# 3. Vectorization pipeline
def vectorize_webpages():
    
    print("🕸️ Loading webpages...")
    loader = GovernmentWebLoader(URLS)
    docs = loader.load()
    
    print("✂️ Splitting documents...")
    splitter = GovernmentTextSplitter()
    chunks = splitter.split_documents(docs)
    
    return chunks

# Run the pipeline
chunks_web_based = vectorize_webpages()
vectorstore.add_documents(chunks_web_based)

# Save updates
vectorstore.persist()

USER_AGENT environment variable not set, consider setting it to identify your requests.


🕸️ Loading webpages...
✂️ Splitting documents...


  vectorstore.persist()


In [7]:
print(chunks_web_based)

[Document(metadata={'source': 'https://www.ontario.ca/page/what-ohip-covers', 'title': 'What OHIP covers | ontario.ca', 'description': 'Find out what services you can get through OHIP.', 'language': 'en', 'plan_type': 'OHIP', 'cleaned': True}, page_content="What OHIP covers | ontario.ca\n\n      Skip to main content\n    \n\nOntario.ca needs JavaScript to function properly and provide you with a fast, stable experience.\n\nTo have a better experience, you need to:\n\nGo to your browser's settings\nEnable JavaScript\n \n\nOntario.ca homepage\n\nSearch\n\nSubmit\n\nFrançais\nFR\n\nSearch\n\nMenu\n\nclose\n\nMenu\n\nArts and culture\n\nBusiness and economy\n\nDriving and roads\n\nEducation and training\n\nEnvironment and energy\n\nGovernment\n\nHealth and wellness\n\nHome and community\n\nJobs and employment"), Document(metadata={'source': 'https://www.ontario.ca/page/what-ohip-covers', 'title': 'What OHIP covers | ontario.ca', 'description': 'Find out what services you can get through OH

In [8]:
import time
import requests
from bs4 import BeautifulSoup
import pandas as pd
from langchain_core.documents import Document
from urllib.parse import urljoin
from typing import List

# List of official OHIP/UHIP URLs
OHIP_URLS = [
    "https://www.ontario.ca/page/what-ohip-covers",
    "https://www.ontario.ca/page/ohip-coverage-while-outside-canada",
    "https://www.ontario.ca/page/documents-needed-get-health-card",
    "https://www.ontario.ca/page/apply-ohip-and-get-health-card"
]

UHIP_URLS = [
    "https://uhip.ca/help-faq/",
    "https://uhip.ca/coverage-details/"
]

def fetch_with_retry(url: str, max_retries: int = 3) -> requests.Response:
    """Handle request failures with retries"""
    for attempt in range(max_retries):
        try:
            response = requests.get(
                url,
                headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'},
                timeout=10
            )
            response.raise_for_status()
            return response
        except requests.exceptions.RequestException as e:
            if attempt == max_retries - 1:
                raise
            time.sleep(2 ** attempt)  # Exponential backoff

def extract_tables_from_url(url: str) -> List[Document]:
    """Extract all tables from a single URL"""
    try:
        response = fetch_with_retry(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        documents = []
        
        for i, table in enumerate(soup.find_all('table'), 1):
            try:
                # Extract table data
                df = pd.read_html(str(table))[0]
                
                # Create document with context
                table_text = f"HEALTH TABLE FROM {url}:\n{df.to_markdown(index=False)}"
                
                documents.append(Document(
                    page_content=table_text,
                    metadata={
                        "source": url + f"#table_{i}",
                        "plan_type": "OHIP" if "ontario.ca" in url else "UHIP",
                        
                        "columns": str(list(df.columns)),
                        "row_count": len(df),
                        "last_updated": response.headers.get('Last-Modified', '')
                    }
                ))
            except Exception as e:
                print(f"Skipped table {i} at {url}: {str(e)}")
                continue
                
        return documents
    except Exception as e:
        print(f"Failed to process {url}: {str(e)}")
        return []

def scrape_all_tables(url_list: List[str]) -> List[Document]:
    """Process multiple URLs in sequence"""
    all_tables = []
    for url in url_list:
        print(f"Processing {url}...")
        all_tables.extend(extract_tables_from_url(url))
        time.sleep(1)  # Respectful delay between requests
    return all_tables

# Usage
ohip_tables = scrape_all_tables(OHIP_URLS)
uhip_tables = scrape_all_tables(UHIP_URLS)
all_tables_web = ohip_tables + uhip_tables

vectorstore.add_documents(all_tables_web)

# 5. Save updates
vectorstore.persist()



Processing https://www.ontario.ca/page/what-ohip-covers...
Processing https://www.ontario.ca/page/ohip-coverage-while-outside-canada...
Processing https://www.ontario.ca/page/documents-needed-get-health-card...
Processing https://www.ontario.ca/page/apply-ohip-and-get-health-card...
Processing https://uhip.ca/help-faq/...


  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]


Processing https://uhip.ca/coverage-details/...


  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]


In [9]:
all_tables_web

[Document(metadata={'source': 'https://uhip.ca/help-faq/#table_1', 'plan_type': 'UHIP', 'columns': '[0, 1]', 'row_count': 2, 'last_updated': ''}, page_content='HEALTH TABLE FROM https://uhip.ca/help-faq/:\n| 0                                               | 1                                                                                                                                                                                             |\n|:------------------------------------------------|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|\n| Full-time, part-time, ESL & non-degree students | Your University will do this for you and charge the cost to your student accounts. Check with your university to confirm whether it extends coverage to ESL or non-degree students.           |\n| Employee/short-term visitor                     | You nee

# OCR Based

In [10]:
from PIL import Image
import pytesseract
from io import BytesIO
import requests

class OCRWebLoader(GovernmentWebLoader):
    def __init__(self, urls):
        super().__init__(urls)
        pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'  # Update path as needed

    def _extract_text_from_image(self, img_url: str) -> str:
        try:
            response = requests.get(img_url, stream=True)
            img = Image.open(BytesIO(response.content))
            return pytesseract.image_to_string(img)
        except Exception as e:
            print(f"OCR failed for {img_url}: {str(e)}")
            return ""

    def load(self) -> List[Document]:
        docs = super().load()
        
        for doc in docs:
            soup = BeautifulSoup(doc.page_content, 'html.parser')
            for img in soup.find_all('img', src=True):
                if any(keyword in img['src'] for keyword in ['coverage', 'eligibility', 'table']):
                    ocr_text = self._extract_text_from_image(img['src'])
                    if ocr_text:
                        doc.page_content += f"\n[IMAGE TEXT]: {ocr_text}"
                        doc.metadata['ocr_extracted'] = True
        
        return docs

In [11]:
import pdfplumber

def extract_pdf_tables(pdf_url: str) -> List[Document]:
    try:
        response = requests.get(pdf_url)
        docs = []
        
        with pdfplumber.open(BytesIO(response.content)) as pdf:
            for page in pdf.pages:
                # Extract text
                text = page.extract_text()
                if text:
                    docs.append(Document(
                        page_content=text,
                        metadata={
                            "source": pdf_url,
                            "plan_type": extract_plan_type(pdf_url),
                            "page": page.page_number,
                            "type": "pdf_text"
                        }
                    ))
                
                # Extract tables
                for table in page.extract_tables():
                    docs.append(Document(
                        page_content=str(table),
                        metadata={
                            "source": pdf_url,
                            "plan_type": extract_plan_type(pdf_url),
                            "page": page.page_number,
                            "type": "pdf_table"
                        }
                    ))
        
        return docs
    except Exception as e:
        print(f"PDF extraction failed: {str(e)}")
        return []

In [12]:
def enhanced_vectorization():
    # Standard text content
    text_urls = URLS
    
    # PDF resources
    pdf_urls = [
        
    ]
    
    print("📄 Processing text content...")
    text_docs = OCRWebLoader(text_urls).load()
    
    print("📑 Processing PDF content...")
    pdf_docs = []
    for pdf_url in pdf_urls:
        pdf_docs.extend(extract_pdf_tables(pdf_url))
    
    print("✂️ Chunking documents...")
    all_docs = text_docs + pdf_docs
    chunks = GovernmentTextSplitter().split_documents(all_docs)
    
    return chunks

chucks_ocr = vectorize_webpages()


🕸️ Loading webpages...
✂️ Splitting documents...


In [13]:
chucks_ocr

[Document(metadata={'source': 'https://www.ontario.ca/page/what-ohip-covers', 'title': 'What OHIP covers | ontario.ca', 'description': 'Find out what services you can get through OHIP.', 'language': 'en', 'plan_type': 'OHIP', 'cleaned': True}, page_content="What OHIP covers | ontario.ca\n\n      Skip to main content\n    \n\nOntario.ca needs JavaScript to function properly and provide you with a fast, stable experience.\n\nTo have a better experience, you need to:\n\nGo to your browser's settings\nEnable JavaScript\n \n\nOntario.ca homepage\n\nSearch\n\nSubmit\n\nFrançais\nFR\n\nSearch\n\nMenu\n\nclose\n\nMenu\n\nArts and culture\n\nBusiness and economy\n\nDriving and roads\n\nEducation and training\n\nEnvironment and energy\n\nGovernment\n\nHealth and wellness\n\nHome and community\n\nJobs and employment"),
 Document(metadata={'source': 'https://www.ontario.ca/page/what-ohip-covers', 'title': 'What OHIP covers | ontario.ca', 'description': 'Find out what services you can get through O

In [14]:
vectorstore.add_documents(chucks_ocr)

# 5. Save updates
vectorstore.persist()

In [15]:
# Sample query with metadata filtering
results = vectorstore.similarity_search(
    "What is covered under UHIP?",
    filter={"doc_type":"UHIP"},
    k=3
)

for i, doc in enumerate(results):
    print(f"\nRESULT {i+1}:")
    print(doc.page_content[:300] + "...")
    print("METADATA:", doc.metadata)

    print("-" * 50)

In [16]:
# Sample query with metadata filtering
results = vectorstore.similarity_search(
    "What is covered under UHIP?",
    k=3
)

for i, doc in enumerate(results):
    print(f"\nRESULT {i+1}:")
    print(doc.page_content[:300] + "...")
    print("METADATA:", doc.metadata)

    print("-" * 50)


RESULT 1:
Does UHIP cover the cost of vaccinations?...
METADATA: {'cleaned': True, 'source': 'https://uhip.ca/help-faq/', 'title': 'Help & FAQ - UHIP/RAMU', 'plan_type': 'UHIP', 'language': 'en-US'}
--------------------------------------------------

RESULT 2:
Does UHIP cover the cost of vaccinations?...
METADATA: {'plan_type': 'UHIP', 'title': 'Help & FAQ - UHIP/RAMU', 'source': 'https://uhip.ca/help-faq/', 'language': 'en-US', 'cleaned': True}
--------------------------------------------------

RESULT 3:
Does UHIP cover the cost of vaccinations?...
METADATA: {'language': 'en-US', 'title': 'Help & FAQ - UHIP/RAMU', 'cleaned': True, 'plan_type': 'UHIP', 'source': 'https://uhip.ca/help-faq/'}
--------------------------------------------------


In [17]:
results = vectorstore.similarity_search("", k=100)  # Empty query → no filtering

for doc in results:
    print("Content preview:", doc.page_content[:100])
    print("Metadata:", doc.metadata)
    print("-" * 50)


# Sample query with metadata filtering
results = vectorstore.similarity_search(
    "What is covered under UHIP?",
    filter={"plan_type":"basic"},
    k=3
)

for i, doc in enumerate(results):
    print(f"\nRESULT {i+1}:")
    print(doc.page_content[:300] + "...")
    print("METADATA:", doc.metadata)

    print("-" * 50)

Content preview: HEALTH TABLE FROM https://uhip.ca/coverage-details/:
| 0                                            
Metadata: {'plan_type': 'UHIP', 'columns': '[0, 1]', 'row_count': 4, 'last_updated': '', 'source': 'https://uhip.ca/coverage-details/#table_3'}
--------------------------------------------------
Content preview: HEALTH TABLE FROM https://uhip.ca/coverage-details/:
| 0                                            
Metadata: {'row_count': 4, 'last_updated': '', 'source': 'https://uhip.ca/coverage-details/#table_3', 'columns': '[0, 1]', 'plan_type': 'UHIP'}
--------------------------------------------------
Content preview: HEALTH TABLE FROM https://uhip.ca/coverage-details/:
| ('Injury requiring physiotherapy', 'Non-compl
Metadata: {'row_count': 0, 'columns': "[('Injury requiring physiotherapy', 'Non-complex injury or surgery that did not require inpatient stay (such as ACL repair, arthroscopy, rotator cuff surgery)', 'Complex injury or surgery or neurological injury (such 

In [18]:
# Sample query with metadata filtering


#results = vectorstore.similarity_search("",  filter={"title":"phi-basic.pdf"},k=100)  # Empty query → no filtering
#Personal Health Insurance - Standard

results = vectorstore.similarity_search("",  filter={"plan_type":"basic"},k=100)  # Empty query → no filtering
for i, doc in enumerate(results):
    print(f"\nRESULT {i+1}:")
    print(doc.page_content[:300] + "...")
    print("METADATA:", doc.metadata)

    print("-" * 50)

In [19]:
# Sample query with metadata filtering


#results = vectorstore.similarity_search("",  filter={"title":"phi-basic.pdf"},k=100)  # Empty query → no filtering
#Personal Health Insurance - Standard

results = vectorstore.similarity_search("what is the Drug coverage reimburses in a year",  filter={"plan_type":"basic"},k=10)  # Empty query → no filtering
for i, doc in enumerate(results):
    print(f"\nRESULT {i+1}:")
    print(doc.page_content[:300] + "...")
    print("METADATA:", doc.metadata)

    print("-" * 50)