In [1]:
pip install spacy

Note: you may need to restart the kernel to use updated packages.


In [33]:
pip install bert-score rouge-score nltk sentence-transformers

Collecting bert-score
  Using cached bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting rouge-score
  Using cached rouge_score-0.1.2-py3-none-any.whl
Collecting matplotlib (from bert-score)
  Using cached matplotlib-3.10.1-cp312-cp312-win_amd64.whl.metadata (11 kB)
Collecting absl-py (from rouge-score)
  Using cached absl_py-2.2.2-py3-none-any.whl.metadata (2.6 kB)
Collecting contourpy>=1.0.1 (from matplotlib->bert-score)
  Using cached contourpy-1.3.1-cp312-cp312-win_amd64.whl.metadata (5.4 kB)
Collecting cycler>=0.10 (from matplotlib->bert-score)
  Using cached cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)
Collecting fonttools>=4.22.0 (from matplotlib->bert-score)
  Using cached fonttools-4.57.0-cp312-cp312-win_amd64.whl.metadata (104 kB)
Collecting kiwisolver>=1.3.1 (from matplotlib->bert-score)
  Using cached kiwisolver-1.4.8-cp312-cp312-win_amd64.whl.metadata (6.3 kB)
Using cached bert_score-0.3.13-py3-none-any.whl (61 kB)
Using cached absl_py-2.2.2-py3-none-any.whl

In [13]:
import os
import fitz  # PyMuPDF
import numpy as np
import faiss
import spacy
from transformers import AutoTokenizer, AutoModel
import torch
from tqdm import tqdm
import time
from openai import OpenAI  # OpenRouter Client

# ---------- Global Models & Setup ----------

# Load SpaCy model for sentence segmentation
spacy_model = spacy.load("en_core_web_sm")

# Load BERT model and tokenizer for insurance-specific embeddings
tokenizer = AutoTokenizer.from_pretrained("llmware/industry-bert-insurance-v0.1")
bert_model = AutoModel.from_pretrained("llmware/industry-bert-insurance-v0.1")

# Initialize FAISS index with the embedding dimension.
# IndexFlatIP ranks by inner product; the score only equals cosine similarity
# because vectors are L2-normalized (normalize_vectors) before add/search.
# embedding_dim must equal the length of the vectors returned by embed().
embedding_dim = 768
faiss_index = faiss.IndexFlatIP(embedding_dim)
policy_chunk_map = []  # Holds chunks for the current policy

# Initialize OpenRouter client.
# The API key is read from the OPENROUTER_API_KEY environment variable so that
# no credential is stored in the notebook. Any key that was previously
# committed in this cell should be treated as compromised and rotated.
_openrouter_api_key = os.environ.get("OPENROUTER_API_KEY")
if not _openrouter_api_key:
    raise RuntimeError(
        "OPENROUTER_API_KEY is not set. Export it in the environment "
        "before starting the kernel."
    )

client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=_openrouter_api_key,
)

# ---------- Utility Functions ----------

def extract_text(pdf_path):
    """Extract text from a PDF file using PyMuPDF (fitz).

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        The text of every page, in page order, joined with newline characters.
    """
    # The context manager closes the document (and its file handle) even if
    # text extraction raises part-way through.
    with fitz.open(pdf_path) as doc:
        return "\n".join(page.get_text() for page in doc)

def chunk_policy_text(text, chunk_size=3):
    """Chunk the policy text into groups of consecutive sentences.

    Sentences are produced by the global SpaCy pipeline. A sentence is kept
    only if, after stripping surrounding whitespace, it is longer than 50
    characters and does not start with a known boilerplate prefix.

    Args:
        text: Raw policy text.
        chunk_size: Number of kept sentences joined into each chunk.

    Returns:
        A list of strings, each holding up to ``chunk_size`` sentences joined
        by single spaces. The last chunk may hold fewer sentences.

    Raises:
        ValueError: If ``chunk_size`` is less than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    boilerplate_prefixes = ("sbi general", "registered office")

    sentences = []
    for sent in spacy_model(text).sents:
        cleaned = sent.text.strip()
        if len(cleaned) <= 50:
            continue
        # Test the stripped text: leading whitespace/newlines in the raw
        # sentence would otherwise let boilerplate lines slip past the filter.
        if cleaned.lower().startswith(boilerplate_prefixes):
            continue
        sentences.append(cleaned)

    return [
        " ".join(sentences[start:start + chunk_size])
        for start in range(0, len(sentences), chunk_size)
    ]

def embed(text: str):
    """Encode a text with the global BERT model and return a pooled vector.

    The text is tokenized (truncated to at most 512 tokens), run through the
    model without gradient tracking, and the model's ``last_hidden_state`` is
    averaged over dimension 1. The result is squeezed and returned as a NumPy
    array.
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=512,
        padding=True,
    )

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        model_output = bert_model(**encoded)

    pooled = model_output.last_hidden_state.mean(dim=1)
    return pooled.squeeze().numpy()

def normalize_vectors(vectors):
    """Return the given vectors as a float32 array, L2-normalized by FAISS.

    The input is first copied into a new NumPy array and cast to float32;
    ``faiss.normalize_L2`` then normalizes that array in place before it is
    returned.
    """
    matrix = np.array(vectors)
    matrix = matrix.astype(np.float32)
    faiss.normalize_L2(matrix)  # in-place operation on `matrix`
    return matrix

# ---------- Processing & Indexing ----------

def process_current_policy(pdf_path):
    """Process and index the current policy PDF.

    Extracts the PDF text, splits it into chunks, embeds and normalizes each
    chunk, and stores the vectors in the global FAISS index. The global
    ``policy_chunk_map`` is set to the list of chunks so that row ``i`` of the
    index corresponds to ``policy_chunk_map[i]``.

    Any previously indexed policy is discarded first, so calling this function
    again replaces the indexed policy instead of appending to it.

    Args:
        pdf_path: Path of the policy PDF to index.
    """
    global faiss_index, policy_chunk_map

    print("📄 Extracting and chunking text...")
    text = extract_text(pdf_path)
    chunks = chunk_policy_text(text)

    # Clear earlier vectors and chunks together. Without this, a second call
    # would append vectors to the index while replacing the chunk list, so
    # search results would point at the wrong (or a missing) chunk.
    faiss_index.reset()
    policy_chunk_map = []

    if not chunks:
        # Nothing to embed; normalizing an empty list would fail obscurely.
        print("⚠️ No usable text chunks were extracted; nothing was indexed.")
        return

    print("📊 Embedding and indexing chunks...")
    chunk_vectors = [embed(chunk) for chunk in tqdm(chunks, desc="Embedding")]
    chunk_vectors = normalize_vectors(chunk_vectors)

    faiss_index.add(chunk_vectors)
    policy_chunk_map = chunks  # Store chunks for retrieval

    print(f"✅ Processed and indexed {len(chunks)} chunks.")

# ---------- Querying ----------

def search_policy(query, top_k=1):
    """Search the indexed policy for the clause most relevant to a query.

    The query is embedded and normalized the same way as the indexed chunks,
    the global FAISS index is searched, and the best-matching chunk is printed
    with its similarity score and then summarized.

    Args:
        query: Natural-language question about the policy.
        top_k: Number of neighbors requested from the index. Only the best
            match is printed and summarized.

    Returns:
        The summary returned by ``summarize_with_openrouter`` for the top
        match, or ``None`` when nothing is indexed, no valid match is found,
        or summarization fails.
    """
    print("🤖 Thinking", end="")
    for _ in range(5):
        time.sleep(0.2)
        print(".", end="", flush=True)
    print("\n🔍 Searching for relevant clauses...")

    # Nothing indexed yet (or a non-positive k): there is nothing to retrieve.
    if faiss_index.ntotal == 0 or top_k < 1:
        print("❌ No relevant results found.")
        return None

    query_vec = normalize_vectors([embed(query)])
    scores, indices = faiss_index.search(query_vec, top_k)

    # FAISS fills missing neighbors with label -1. Used directly as a list
    # index, -1 would silently select the last chunk, so validate the label
    # against the chunk list before using it.
    best_idx = int(indices[0][0]) if indices.size else -1
    if not 0 <= best_idx < len(policy_chunk_map):
        print("❌ No relevant results found.")
        return None

    top_match = policy_chunk_map[best_idx]
    score = scores[0][0]  # Inner product of normalized vectors
    print(f"\nResult (Cosine Similarity: {score:.4f}):\n{top_match}")

    return summarize_with_openrouter(top_match)

def summarize_with_openrouter(text):
    """Ask the OpenRouter chat model to summarize and explain a policy clause.

    Sends a single user message made of a fixed instruction followed by
    ``text``. Returns the content of the first returned choice. Any exception
    raised while making the request or reading the response is printed and
    ``None`` is returned instead.
    """
    print("\n📝 Summarizing result with OpenRouter...")

    instruction = "Summarize the following insurance policy clause: and also next explain it in easy to understand words"
    user_message = {
        "role": "user",
        "content": [
            {"type": "text", "text": instruction},
            {"type": "text", "text": text},
        ],
    }

    try:
        response = client.chat.completions.create(
            model="meta-llama/llama-4-scout:free",
            messages=[user_message],
        )
        return response.choices[0].message.content
    except Exception as err:
        # Best effort: report the failure and let the caller handle None.
        print(f"Error during summarization: {err}")
        return None

# ---------- Execution ----------

if __name__ == "__main__":
    # The policy PDF location can be overridden with the POLICY_PDF_PATH
    # environment variable; the default is the original local path.
    pdf_path = os.environ.get(
        "POLICY_PDF_PATH",
        r"D:\NLPInsuranceProject\NLPINSURANCE-FINTECHPROJ\policy_pdfs\c983715b738f4f88b4dd0fc684d796c3.pdf",
    )

    # Process the policy PDF
    process_current_policy(pdf_path)

    # Ask a query
    result_summary = search_policy("Is newborn baby covered under this plan?")
    if result_summary:
        print("\n🔑 Summary of the top result:\n", result_summary)


📄 Extracting and chunking text...
📊 Embedding and indexing chunks...


Embedding: 100%|██████████| 164/164 [00:38<00:00,  4.29it/s]


✅ Processed and indexed 164 chunks.
🤖 Thinking.....
🔍 Searching for relevant clauses...

Result (Cosine Similarity: 0.5088):
Migration means a facility provided to Policyholders (including all 
members under family cover and group policies), to transfer the 
credits gained for pre-existing diseases and specific waiting 
periods from one health insurance policy to another with the 
same insurer. Newborn baby means baby born during the Policy Period and is 
aged between 1 day and 90 days, both days inclusive. Network Provider means hospitals or health care providers 
enlisted by an Insurer or by a TPA and Insurer together to provide 
medical services to an Insured on payment by a cashless facility.

📝 Summarizing result with OpenRouter...

🔑 Summary of the top result:
 **Summary of Insurance Policy Clauses:**

The clause describes three insurance-related terms:

1. **Migration**: A facility allowing policyholders to transfer credits gained for pre-existing diseases and waiting periods to