In [1]:
pip install spacy

Note: you may need to restart the kernel to use updated packages.


In [33]:
pip install bert-score rouge-score nltk sentence-transformers

Collecting bert-score
  Using cached bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting rouge-score
  Using cached rouge_score-0.1.2-py3-none-any.whl
Collecting matplotlib (from bert-score)
  Using cached matplotlib-3.10.1-cp312-cp312-win_amd64.whl.metadata (11 kB)
Collecting absl-py (from rouge-score)
  Using cached absl_py-2.2.2-py3-none-any.whl.metadata (2.6 kB)
Collecting contourpy>=1.0.1 (from matplotlib->bert-score)
  Using cached contourpy-1.3.1-cp312-cp312-win_amd64.whl.metadata (5.4 kB)
Collecting cycler>=0.10 (from matplotlib->bert-score)
  Using cached cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)
Collecting fonttools>=4.22.0 (from matplotlib->bert-score)
  Using cached fonttools-4.57.0-cp312-cp312-win_amd64.whl.metadata (104 kB)
Collecting kiwisolver>=1.3.1 (from matplotlib->bert-score)
  Using cached kiwisolver-1.4.8-cp312-cp312-win_amd64.whl.metadata (6.3 kB)
Using cached bert_score-0.3.13-py3-none-any.whl (61 kB)
Using cached absl_py-2.2.2-py3-none-any.whl

In [13]:
import os
import fitz  # PyMuPDF
import numpy as np
import faiss
import spacy
from transformers import AutoTokenizer, AutoModel
import torch
from tqdm import tqdm
import time
from openai import OpenAI  # OpenRouter Client

# ---------- Global Models & Setup ----------

# Load SpaCy model for sentence segmentation
spacy_model = spacy.load("en_core_web_sm")

# Load BERT model and tokenizer for insurance-specific embeddings
tokenizer = AutoTokenizer.from_pretrained("llmware/industry-bert-insurance-v0.1")
bert_model = AutoModel.from_pretrained("llmware/industry-bert-insurance-v0.1")

# Initialize FAISS index with the embedding dimension
embedding_dim = 768
faiss_index = faiss.IndexFlatIP(embedding_dim)  # Cosine similarity-based index
policy_chunk_map = []  # Holds chunks for the current policy

# Initialize OpenRouter client
client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key="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",  # Replace with your actual API key
)

# ---------- Utility Functions ----------

def extract_text(pdf_path):
    """Extract text from a PDF file using PyMuPDF (fitz)."""
    doc = fitz.open(pdf_path)
    return "\n".join([page.get_text() for page in doc])

def chunk_policy_text(text, chunk_size=3):
    """Chunk the policy text into smaller segments."""
    doc = spacy_model(text)
    sentences = [
        sent.text.strip()
        for sent in doc.sents
        if len(sent.text.strip()) > 50 and not sent.text.lower().startswith(("sbi general", "registered office"))
    ]
    chunks = [" ".join(sentences[i:i + chunk_size]) for i in range(0, len(sentences), chunk_size)]
    return chunks

def embed(text: str):
    """Generate BERT embeddings for a given text."""
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512, padding=True)
    with torch.no_grad():
        output = bert_model(**inputs)
    return output.last_hidden_state.mean(dim=1).squeeze().numpy()

def normalize_vectors(vectors):
    """Normalize vectors for FAISS search."""
    vectors = np.array(vectors).astype(np.float32)
    faiss.normalize_L2(vectors)
    return vectors

# ---------- Processing & Indexing ----------

def process_current_policy(pdf_path):
    """Process and index the current policy PDF."""
    global faiss_index, policy_chunk_map

    print("📄 Extracting and chunking text...")
    text = extract_text(pdf_path)
    chunks = chunk_policy_text(text)

    print("📊 Embedding and indexing chunks...")
    chunk_vectors = [embed(chunk) for chunk in tqdm(chunks, desc="Embedding")]
    chunk_vectors = normalize_vectors(chunk_vectors)

    faiss_index.add(chunk_vectors)
    policy_chunk_map = chunks  # Store chunks for retrieval

    print(f"✅ Processed and indexed {len(chunks)} chunks.")

# ---------- Querying ----------

def search_policy(query, top_k=1):
    """Search the policy for the most relevant clause."""
    print("🤖 Thinking", end="")
    for _ in range(5): 
        time.sleep(0.2); print(".", end="", flush=True)
    print("\n🔍 Searching for relevant clauses...")

    query_vec = normalize_vectors([embed(query)])
    D, I = faiss_index.search(query_vec, top_k)

    # Check if the results are valid and not empty
    if I.shape[0] > 0 and I[0].size > 0:
        top_match = policy_chunk_map[I[0][0]]  # Access top match
        score = D[0][0]  # The similarity score
        print(f"\nResult (Cosine Similarity: {score:.4f}):\n{top_match}")
    else:
        print("❌ No relevant results found.")
        return None

    summary = summarize_with_openrouter(top_match)
    return summary

def summarize_with_openrouter(text):
    """Summarize the top result using OpenRouter's LLM."""
    print("\n📝 Summarizing result with OpenRouter...")
    try:
        completion = client.chat.completions.create(
            model="meta-llama/llama-4-scout:free",
            messages=[{
                "role": "user",
                "content": [
                    {"type": "text", "text": "Summarize the following insurance policy clause: and also next explain it in easy to understand words"},
                    {"type": "text", "text": text}
                ]
            }]
        )
        return completion.choices[0].message.content
    except Exception as e:
        print(f"Error during summarization: {e}")
        return None

# ---------- Execution ----------

if __name__ == "__main__":
    pdf_path = r"D:\NLPInsuranceProject\NLPINSURANCE-FINTECHPROJ\policy_pdfs\c983715b738f4f88b4dd0fc684d796c3.pdf"
    
    # Process the policy PDF
    process_current_policy(pdf_path)

    # Ask a query
    result_summary = search_policy("Is newborn baby covered under this plan?")
    if result_summary:
        print("\n🔑 Summary of the top result:\n", result_summary)

    # Clean-up (optional, but FAISS index will be destroyed after script ends)
    #del faiss_index


📄 Extracting and chunking text...
📊 Embedding and indexing chunks...


Embedding: 100%|██████████| 164/164 [00:38<00:00,  4.29it/s]


✅ Processed and indexed 164 chunks.
🤖 Thinking.....
🔍 Searching for relevant clauses...

Result (Cosine Similarity: 0.5088):
Migration means a facility provided to Policyholders (including all 
members under family cover and group policies), to transfer the 
credits gained for pre-existing diseases and specific waiting 
periods from one health insurance policy to another with the 
same insurer. Newborn baby means baby born during the Policy Period and is 
aged between 1 day and 90 days, both days inclusive. Network Provider means hospitals or health care providers 
enlisted by an Insurer or by a TPA and Insurer together to provide 
medical services to an Insured on payment by a cashless facility.

📝 Summarizing result with OpenRouter...

🔑 Summary of the top result:
 **Summary of Insurance Policy Clauses:**

The clause describes three insurance-related terms:

1. **Migration**: A facility allowing policyholders to transfer credits gained for pre-existing diseases and waiting periods to

In [1]:
import os
import fitz  # PyMuPDF
import numpy as np
import faiss
import spacy
from transformers import AutoTokenizer, AutoModel
import torch
from tqdm import tqdm
import time
import json
from openai import OpenAI

# ---------- Setup ----------

spacy_model = spacy.load("en_core_web_sm")
tokenizer = AutoTokenizer.from_pretrained("llmware/industry-bert-insurance-v0.1")
bert_model = AutoModel.from_pretrained("llmware/industry-bert-insurance-v0.1")
client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key="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",
)

embedding_dim = 768

# Sample queries
queries = [
    "Is newborn baby covered under this plan?",
    "What is the waiting period for pre-existing diseases?",
    "Does this policy cover maternity expenses?",
    "Are day care procedures included?",
    "What is the cashless hospital network?",
    "Is OPD treatment reimbursable?",
    "What is the coverage amount for critical illness?",
    "Are ambulance charges covered?",
    "Is there a claim settlement ratio mentioned?",
    "What documents are needed for claim filing?"
]

# ---------- Utility Functions ----------

def extract_text(pdf_path):
    doc = fitz.open(pdf_path)
    return "\n".join([page.get_text() for page in doc])

def chunk_policy_text(text, chunk_size=3):
    doc = spacy_model(text)
    sentences = [
        sent.text.strip()
        for sent in doc.sents
        if len(sent.text.strip()) > 50 and not sent.text.lower().startswith(("sbi general", "registered office"))
    ]
    chunks = [" ".join(sentences[i:i + chunk_size]) for i in range(0, len(sentences), chunk_size)]
    return chunks

def embed(text: str):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512, padding=True)
    with torch.no_grad():
        output = bert_model(**inputs)
    return output.last_hidden_state.mean(dim=1).squeeze().numpy()

def normalize_vectors(vectors):
    vectors = np.array(vectors).astype(np.float32)
    faiss.normalize_L2(vectors)
    return vectors

def summarize_with_openrouter(text):
    try:
        completion = client.chat.completions.create(
            model="meta-llama/llama-4-scout:free",
            messages=[{
                "role": "user",
                "content": [
                    {"type": "text", "text": "Summarize the following insurance policy clause and explain it simply:"},
                    {"type": "text", "text": text}
                ]
            }]
        )
        return completion.choices[0].message.content
    except Exception as e:
        print(f"Summarization error: {e}")
        return "Summarization failed."

import re

def safe_extract_scores(text):
    """
    Extracts relevance, accuracy, clarity, helpfulness, and comments using regex from messy LLM output.
    """
    def extract_number(label):
        match = re.search(rf"{label}\s*[:\-]?\s*(\d)", text, re.IGNORECASE)
        return int(match.group(1)) if match else 0

    def extract_comment():
        match = re.search(r"comments?\s*[:\-]?\s*(.*)", text, re.IGNORECASE | re.DOTALL)
        return match.group(1).strip() if match else "No comment."

    return {
        "relevance": extract_number("relevance"),
        "accuracy": extract_number("accuracy"),
        "clarity": extract_number("clarity"),
        "helpfulness": extract_number("helpfulness"),
        "comments": extract_comment()
    }


def evaluate_with_openrouter(query, chunk, summary, retries=3):
    eval_prompt = f"""
You are an evaluator for an insurance assistant system. Assess the following:

Query: "{query}"

Retrieved Chunk:
{chunk}

LLM-generated Summary:
{summary}

Rate the following from 0 to 5:
- Relevance (does the chunk answer the query?)
- Accuracy (is the summary faithful?)
- Clarity (is it understandable?)
- Helpfulness (would a user find it useful?)

Also write a short evaluator comment.

Respond like:
Relevance: 4
Accuracy: 3
Clarity: 5
Helpfulness: 4
Comment: The summary is mostly relevant but lacks detail on OPD.
    """

    for attempt in range(retries):
        try:
            completion = client.chat.completions.create(
                model="meta-llama/llama-4-scout:free",
                messages=[{
                    "role": "user",
                    "content": [{"type": "text", "text": eval_prompt}]
                }]
            )
            response_text = completion.choices[0].message.content.strip()
            return safe_extract_scores(response_text)

        except Exception as e:
            print(f"Evaluation retry {attempt+1}/{retries} failed: {e}")
            time.sleep(2)

    return {
        "relevance": 0,
        "accuracy": 0,
        "clarity": 0,
        "helpfulness": 0,
        "comments": "Evaluation failed completely."
    }


# ---------- Main Pipeline ----------

def process_policy(pdf_path, queries):
    policy_results = []
    print(f"\n📄 Processing {os.path.basename(pdf_path)}...")
    
    text = extract_text(pdf_path)
    chunks = chunk_policy_text(text)
    chunk_vectors = normalize_vectors([embed(c) for c in tqdm(chunks, desc="Embedding Chunks")])
    
    faiss_index = faiss.IndexFlatIP(embedding_dim)
    faiss_index.add(chunk_vectors)

    for query in queries:
        print(f"\n🔍 Query: {query}")
        query_vec = normalize_vectors([embed(query)])
        D, I = faiss_index.search(query_vec, 1)

        top_idx = I[0][0]
        top_chunk = chunks[top_idx]
        score = float(D[0][0])
        
        summary = summarize_with_openrouter(top_chunk)
        evaluation = evaluate_with_openrouter(query, top_chunk, summary)

        policy_results.append({
            "query": query,
            "retrieved_chunk": top_chunk,
            "similarity_score": score,
            "summary": summary,
            "evaluation": evaluation
        })

    return {
        "policy_pdf": os.path.basename(pdf_path),
        "results": policy_results
    }

# ---------- Runner ----------

if __name__ == "__main__":
    input_folder = r"D:\NLPInsuranceProject\NLPINSURANCE-FINTECHPROJ\policy_pdfs"
    all_results = []

    for filename in os.listdir(input_folder):
        if filename.endswith(".pdf"):
            path = os.path.join(input_folder, filename)
            result = process_policy(path, queries)
            all_results.append(result)

    with open("results.json", "w", encoding="utf-8") as f:
        json.dump(all_results, f, indent=2, ensure_ascii=False)

    print("\n✅ All policies processed. Results saved to `results.json`.")



📄 Processing 1. Policy- NMP.pdf...


Embedding Chunks: 100%|██████████| 148/148 [00:35<00:00,  4.11it/s]



🔍 Query: Is newborn baby covered under this plan?

🔍 Query: What is the waiting period for pre-existing diseases?

🔍 Query: Does this policy cover maternity expenses?

🔍 Query: Are day care procedures included?

🔍 Query: What is the cashless hospital network?

🔍 Query: Is OPD treatment reimbursable?

🔍 Query: What is the coverage amount for critical illness?

🔍 Query: Are ambulance charges covered?

🔍 Query: Is there a claim settlement ratio mentioned?

🔍 Query: What documents are needed for claim filing?

📄 Processing 242972d58c064559b7335ac1d9cdf9b5.pdf...


Embedding Chunks: 100%|██████████| 162/162 [00:42<00:00,  3.79it/s]



🔍 Query: Is newborn baby covered under this plan?

🔍 Query: What is the waiting period for pre-existing diseases?

🔍 Query: Does this policy cover maternity expenses?

🔍 Query: Are day care procedures included?

🔍 Query: What is the cashless hospital network?

🔍 Query: Is OPD treatment reimbursable?

🔍 Query: What is the coverage amount for critical illness?

🔍 Query: Are ambulance charges covered?

🔍 Query: Is there a claim settlement ratio mentioned?

🔍 Query: What documents are needed for claim filing?

📄 Processing 46045becd7b842dca4dfffd893ad9263.pdf...


Embedding Chunks: 100%|██████████| 82/82 [00:32<00:00,  2.51it/s]



🔍 Query: Is newborn baby covered under this plan?

🔍 Query: What is the waiting period for pre-existing diseases?

🔍 Query: Does this policy cover maternity expenses?

🔍 Query: Are day care procedures included?

🔍 Query: What is the cashless hospital network?

🔍 Query: Is OPD treatment reimbursable?
Summarization error: 'NoneType' object is not subscriptable
Evaluation retry 1/3 failed: 'NoneType' object is not subscriptable
Evaluation retry 2/3 failed: 'NoneType' object is not subscriptable
Evaluation retry 3/3 failed: 'NoneType' object is not subscriptable

🔍 Query: What is the coverage amount for critical illness?
Summarization error: 'NoneType' object is not subscriptable
Evaluation retry 1/3 failed: 'NoneType' object is not subscriptable
Evaluation retry 2/3 failed: 'NoneType' object is not subscriptable
Evaluation retry 3/3 failed: 'NoneType' object is not subscriptable

🔍 Query: Are ambulance charges covered?
Summarization error: 'NoneType' object is not subscriptable
Evaluatio

Embedding Chunks: 100%|██████████| 312/312 [01:55<00:00,  2.69it/s]



🔍 Query: Is newborn baby covered under this plan?
Summarization error: 'NoneType' object is not subscriptable
Evaluation retry 1/3 failed: 'NoneType' object is not subscriptable
Evaluation retry 2/3 failed: 'NoneType' object is not subscriptable
Evaluation retry 3/3 failed: 'NoneType' object is not subscriptable

🔍 Query: What is the waiting period for pre-existing diseases?
Summarization error: 'NoneType' object is not subscriptable
Evaluation retry 1/3 failed: 'NoneType' object is not subscriptable
Evaluation retry 2/3 failed: 'NoneType' object is not subscriptable
Evaluation retry 3/3 failed: 'NoneType' object is not subscriptable

🔍 Query: Does this policy cover maternity expenses?
Summarization error: 'NoneType' object is not subscriptable
Evaluation retry 1/3 failed: 'NoneType' object is not subscriptable
Evaluation retry 2/3 failed: 'NoneType' object is not subscriptable
Evaluation retry 3/3 failed: 'NoneType' object is not subscriptable

🔍 Query: Are day care procedures includ

Embedding Chunks:  69%|██████▊   | 231/337 [01:05<00:29,  3.54it/s]


KeyboardInterrupt: 