In [6]:
import os
import fitz  # PyMuPDF
import numpy as np
import faiss
import spacy
from transformers import AutoTokenizer, AutoModel
import torch
from tqdm import tqdm
import time
import json
import re
from groq import Groq  # ✅ New import for Groq

# ---------- Setup ----------

# spaCy pipeline (used for sentence segmentation) and the BERT checkpoint
# (used to embed chunks and queries). Both are loaded once at import time.
spacy_model = spacy.load("en_core_web_sm")
tokenizer = AutoTokenizer.from_pretrained("llmware/industry-bert-insurance-v0.1")
bert_model = AutoModel.from_pretrained("llmware/industry-bert-insurance-v0.1")

# Initialize the Groq client from the environment instead of embedding the
# secret in source. SECURITY: a real key was previously committed on this
# line -- revoke it in the Groq console and issue a new one.
groq_api_key = os.environ.get("GROQ_API_KEY")
if not groq_api_key:
    raise RuntimeError(
        "GROQ_API_KEY is not set. Export it in the environment before running."
    )
client = Groq(api_key=groq_api_key)

# Expected embedding width; presumably the hidden size of the checkpoint
# loaded above -- confirm if the model is ever swapped.
embedding_dim = 768

# Sample queries: the fixed set of natural-language questions that is run
# against every policy document.
queries = [
    "Is newborn baby covered under this plan?",
    "What is the waiting period for pre-existing diseases?",
    "Does this policy cover maternity expenses?",
    "Are day care procedures included?",
    "What is the cashless hospital network?",
    "Is OPD treatment reimbursable?",
    "What is the coverage amount for critical illness?",
    "Are ambulance charges covered?",
    "Is there a claim settlement ratio mentioned?",
    "What documents are needed for claim filing?"
]

# ---------- Utility Functions ----------

def extract_text(pdf_path):
    """Return the text of every page in a PDF, joined with newlines.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        A single string containing each page's text separated by "\\n".
    """
    # The context manager closes the document (and its file handle) even if
    # text extraction raises; the original left it open for every PDF.
    with fitz.open(pdf_path) as doc:
        return "\n".join(page.get_text() for page in doc)

def chunk_policy_text(text, chunk_size=3):
    """Split policy text into chunks of consecutive sentences.

    Sentences are segmented with the module-level spaCy pipeline. Sentences
    of 50 characters or fewer (after stripping) and sentences that start with
    known boilerplate prefixes are dropped before grouping.

    Args:
        text: Raw policy text.
        chunk_size: Number of sentences per chunk; must be at least 1.

    Returns:
        A list of strings, each the space-joined text of up to `chunk_size`
        consecutive kept sentences.

    Raises:
        ValueError: If `chunk_size` is less than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be at least 1")

    boilerplate_prefixes = ("sbi general", "registered office")

    sentences = []
    for sent in spacy_model(text).sents:
        # Strip once and use the stripped text for BOTH checks. Previously the
        # prefix test ran on the unstripped sentence, so boilerplate preceded
        # by a newline or spaces slipped through the filter.
        cleaned = sent.text.strip()
        if len(cleaned) <= 50:
            continue
        if cleaned.lower().startswith(boilerplate_prefixes):
            continue
        sentences.append(cleaned)

    return [
        " ".join(sentences[start:start + chunk_size])
        for start in range(0, len(sentences), chunk_size)
    ]

def embed(text: str):
    """Embed `text` by averaging the model's last hidden states over tokens.

    The text is tokenized with truncation at 512 tokens, passed through the
    module-level BERT model without gradient tracking, and the mean of
    `last_hidden_state` along dimension 1 is returned as a NumPy array with
    singleton dimensions squeezed out.
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=512,
        padding=True,
    )
    with torch.no_grad():
        model_out = bert_model(**encoded)
    pooled = model_out.last_hidden_state.mean(dim=1)
    return pooled.squeeze().numpy()

def normalize_vectors(vectors):
    """Return `vectors` as a float32 array normalized with faiss.normalize_L2.

    A new float32 array is built from the input, so the caller's data is not
    modified; normalization is then applied in place on that new array.
    """
    matrix = np.array(vectors)
    matrix = matrix.astype(np.float32)
    faiss.normalize_L2(matrix)  # mutates `matrix` in place
    return matrix

def summarize_with_groq(text):
    """Ask the Groq chat model for a plain-language summary of `text`.

    Returns the content of the first choice in the response. Any exception is
    printed and the literal string "Summarization failed." is returned
    instead, so callers always receive a string on the failure path.
    """
    try:
        user_message = {
            "role": "user",
            "content": f"Summarize the following insurance policy clause and explain it simply:\n\n{text}",
        }
        completion = client.chat.completions.create(
            model="llama3-8b-8192",
            messages=[user_message],
        )
        first_choice = completion.choices[0]
        return first_choice.message.content
    except Exception as e:
        print(f"Summarization error: {e}")
        return "Summarization failed."

def safe_extract_scores(text):
    """Parse evaluator scores and the free-text comment from an LLM reply.

    The parser is deliberately forgiving because the model does not always
    follow the requested "Label: N" layout. It accepts markdown emphasis
    around labels (``**Relevance:** 4``), an optional parenthetical after the
    label (``Relevance (does the chunk answer the query?): 4``), and either
    ``:`` or ``-`` as the separator.

    Args:
        text: Raw reply text from the evaluator model. ``None`` is treated as
            an empty reply.

    Returns:
        A dict with integer keys "relevance", "accuracy", "clarity",
        "helpfulness" (0 when a score cannot be found) and a string key
        "comments" ("No comment." when no comment label is found).
    """
    text = text or ""
    flags = re.IGNORECASE

    def extract_number(label):
        # label, optional emphasis/space, optional "(...)" gloss, then any
        # mix of whitespace / "*" / "_" / ":" / "-" before the single digit.
        pattern = rf"{label}[\s*_]*(?:\([^)\n]*\))?[\s*_:\-]*(\d)"
        match = re.search(pattern, text, flags)
        return int(match.group(1)) if match else 0

    def extract_comment():
        # Prefer a label at the start of a line so that the word "comment"
        # inside a preamble sentence is not mistaken for the label.
        match = re.search(
            r"^[\s*_>#\-]*(?:evaluator\s+)?comments?\b[\s*_:\-]*(.*)",
            text,
            flags | re.DOTALL | re.MULTILINE,
        )
        if match is None:
            # Fall back to the first occurrence anywhere in the reply.
            match = re.search(r"comments?[\s*_:\-]*(.*)", text, flags | re.DOTALL)
        return match.group(1).strip() if match else "No comment."

    return {
        "relevance": extract_number("relevance"),
        "accuracy": extract_number("accuracy"),
        "clarity": extract_number("clarity"),
        "helpfulness": extract_number("helpfulness"),
        "comments": extract_comment()
    }

def evaluate_with_groq(query, chunk, summary, retries=3):
    """Ask the Groq chat model to grade a retrieved chunk and its summary.

    Args:
        query: The user question that was searched for.
        chunk: The retrieved policy text.
        summary: The LLM-generated summary of `chunk`.
        retries: Maximum number of API attempts.

    Returns:
        The dict produced by `safe_extract_scores` for the first successful
        reply, or an all-zero record with the comment
        "Evaluation failed completely." when every attempt raises.
    """
    eval_prompt = f"""
You are an evaluator for an insurance assistant system. Assess the following:

Query: "{query}"

Retrieved Chunk:
{chunk}

LLM-generated Summary:
{summary}

Rate the following from 0 to 5:
- Relevance (does the chunk answer the query?)
- Accuracy (is the summary faithful?)
- Clarity (is it understandable?)
- Helpfulness (would a user find it useful?)

Also write a short evaluator comment.

Respond like:
Relevance: 4
Accuracy: 3
Clarity: 5
Helpfulness: 4
Comment: The summary is mostly relevant but lacks detail on OPD.
"""

    for attempt in range(retries):
        try:
            response = client.chat.completions.create(
                model="llama3-8b-8192",  # ✅ Using LLaMA3
                messages=[{"role": "user", "content": eval_prompt}]
            )
            return safe_extract_scores(response.choices[0].message.content.strip())

        except Exception as e:
            print(f"Evaluation retry {attempt+1}/{retries} failed: {e}")
            # Pause only when another attempt follows; sleeping after the
            # final failure just delays returning the failure record.
            if attempt + 1 < retries:
                time.sleep(2)

    return {
        "relevance": 0,
        "accuracy": 0,
        "clarity": 0,
        "helpfulness": 0,
        "comments": "Evaluation failed completely."
    }

# ---------- Main Pipeline ----------

def process_policy(pdf_path, queries):
    """Run every query against one policy PDF and collect the results.

    The PDF text is extracted, chunked and embedded; the normalized chunk
    vectors are added to an inner-product FAISS index. For each query the
    single best-scoring chunk is retrieved, summarized and evaluated.

    Args:
        pdf_path: Path of the policy PDF.
        queries: Iterable of query strings.

    Returns:
        A dict with "policy_pdf" (the file's base name) and "results", a list
        with one record per query containing "query", "retrieved_chunk",
        "similarity_score", "summary" and "evaluation". "results" is empty
        when no usable text chunks could be extracted from the PDF.
    """
    policy_results = []
    print(f"\n📄 Processing {os.path.basename(pdf_path)}...")

    text = extract_text(pdf_path)
    chunks = chunk_policy_text(text)

    if not chunks:
        # e.g. a scanned/image-only PDF: there is nothing to embed, and an
        # empty matrix would make the FAISS normalization/index calls fail.
        print("⚠️ No usable text chunks extracted; skipping queries for this file.")
        return {
            "policy_pdf": os.path.basename(pdf_path),
            "results": policy_results
        }

    chunk_vectors = normalize_vectors([embed(c) for c in tqdm(chunks, desc="Embedding Chunks")])

    faiss_index = faiss.IndexFlatIP(embedding_dim)
    faiss_index.add(chunk_vectors)

    for query in queries:
        print(f"\n🔍 Query: {query}")
        query_vec = normalize_vectors([embed(query)])
        # Top-1 search: D holds the inner-product scores, I the chunk indices.
        D, I = faiss_index.search(query_vec, 1)

        top_idx = I[0][0]
        top_chunk = chunks[top_idx]
        score = float(D[0][0])

        summary = summarize_with_groq(top_chunk)
        evaluation = evaluate_with_groq(query, top_chunk, summary)

        policy_results.append({
            "query": query,
            "retrieved_chunk": top_chunk,
            "similarity_score": score,
            "summary": summary,
            "evaluation": evaluation
        })

    return {
        "policy_pdf": os.path.basename(pdf_path),
        "results": policy_results
    }

# ---------- Runner ----------

if __name__ == "__main__":
    # Process every PDF in the input folder and dump all results to JSON.
    input_folder = r"D:\NLPInsuranceProject\NLPINSURANCE-FINTECHPROJ\policy_pdfs"
    all_results = []

    # Match the extension case-insensitively so "*.PDF" files are not skipped,
    # and sort so the processing order does not depend on the filesystem.
    pdf_names = sorted(
        (name for name in os.listdir(input_folder) if name.lower().endswith(".pdf")),
        key=str.lower,
    )

    for filename in pdf_names:
        path = os.path.join(input_folder, filename)
        result = process_policy(path, queries)
        all_results.append(result)

    with open("results.json", "w", encoding="utf-8") as f:
        json.dump(all_results, f, indent=2, ensure_ascii=False)

    print("\n✅ All policies processed. Results saved to `results.json`.")



📄 Processing 1. Policy- NMP.pdf...


Embedding Chunks: 100%|██████████| 148/148 [00:31<00:00,  4.64it/s]



🔍 Query: Is newborn baby covered under this plan?

🔍 Query: What is the waiting period for pre-existing diseases?

🔍 Query: Does this policy cover maternity expenses?

🔍 Query: Are day care procedures included?

🔍 Query: What is the cashless hospital network?

🔍 Query: Is OPD treatment reimbursable?

🔍 Query: What is the coverage amount for critical illness?

🔍 Query: Are ambulance charges covered?

🔍 Query: Is there a claim settlement ratio mentioned?

🔍 Query: What documents are needed for claim filing?

📄 Processing 242972d58c064559b7335ac1d9cdf9b5.pdf...


Embedding Chunks: 100%|██████████| 162/162 [00:32<00:00,  5.00it/s]



🔍 Query: Is newborn baby covered under this plan?

🔍 Query: What is the waiting period for pre-existing diseases?

🔍 Query: Does this policy cover maternity expenses?

🔍 Query: Are day care procedures included?

🔍 Query: What is the cashless hospital network?

🔍 Query: Is OPD treatment reimbursable?

🔍 Query: What is the coverage amount for critical illness?

🔍 Query: Are ambulance charges covered?

🔍 Query: Is there a claim settlement ratio mentioned?

🔍 Query: What documents are needed for claim filing?

📄 Processing 46045becd7b842dca4dfffd893ad9263.pdf...


Embedding Chunks: 100%|██████████| 82/82 [00:16<00:00,  5.10it/s]



🔍 Query: Is newborn baby covered under this plan?

🔍 Query: What is the waiting period for pre-existing diseases?

🔍 Query: Does this policy cover maternity expenses?

🔍 Query: Are day care procedures included?

🔍 Query: What is the cashless hospital network?

🔍 Query: Is OPD treatment reimbursable?

🔍 Query: What is the coverage amount for critical illness?

🔍 Query: Are ambulance charges covered?

🔍 Query: Is there a claim settlement ratio mentioned?

🔍 Query: What documents are needed for claim filing?

📄 Processing 572dc7b9faac4c3985d9782f791c3624.pdf...


Embedding Chunks: 100%|██████████| 312/312 [01:03<00:00,  4.88it/s]



🔍 Query: Is newborn baby covered under this plan?

🔍 Query: What is the waiting period for pre-existing diseases?

🔍 Query: Does this policy cover maternity expenses?

🔍 Query: Are day care procedures included?

🔍 Query: What is the cashless hospital network?

🔍 Query: Is OPD treatment reimbursable?

🔍 Query: What is the coverage amount for critical illness?

🔍 Query: Are ambulance charges covered?

🔍 Query: Is there a claim settlement ratio mentioned?

🔍 Query: What documents are needed for claim filing?

📄 Processing bc75b0756ad1487b9886aa0088809b25.pdf...


Embedding Chunks: 100%|██████████| 337/337 [01:06<00:00,  5.10it/s]



🔍 Query: Is newborn baby covered under this plan?

🔍 Query: What is the waiting period for pre-existing diseases?

🔍 Query: Does this policy cover maternity expenses?

🔍 Query: Are day care procedures included?

🔍 Query: What is the cashless hospital network?

🔍 Query: Is OPD treatment reimbursable?

🔍 Query: What is the coverage amount for critical illness?

🔍 Query: Are ambulance charges covered?

🔍 Query: Is there a claim settlement ratio mentioned?

🔍 Query: What documents are needed for claim filing?

📄 Processing c983715b738f4f88b4dd0fc684d796c3.pdf...


Embedding Chunks: 100%|██████████| 164/164 [00:30<00:00,  5.30it/s]



🔍 Query: Is newborn baby covered under this plan?

🔍 Query: What is the waiting period for pre-existing diseases?

🔍 Query: Does this policy cover maternity expenses?

🔍 Query: Are day care procedures included?

🔍 Query: What is the cashless hospital network?

🔍 Query: Is OPD treatment reimbursable?

🔍 Query: What is the coverage amount for critical illness?

🔍 Query: Are ambulance charges covered?

🔍 Query: Is there a claim settlement ratio mentioned?

🔍 Query: What documents are needed for claim filing?

📄 Processing COMPLETE HEALTHCARE INSURANCE_2013-2014.pdf...


Embedding Chunks: 100%|██████████| 156/156 [00:34<00:00,  4.47it/s]



🔍 Query: Is newborn baby covered under this plan?

🔍 Query: What is the waiting period for pre-existing diseases?

🔍 Query: Does this policy cover maternity expenses?

🔍 Query: Are day care procedures included?

🔍 Query: What is the cashless hospital network?

🔍 Query: Is OPD treatment reimbursable?

🔍 Query: What is the coverage amount for critical illness?

🔍 Query: Are ambulance charges covered?

🔍 Query: Is there a claim settlement ratio mentioned?

🔍 Query: What documents are needed for claim filing?

📄 Processing e33c7adfab4248f290d841dbd513ed7a.pdf...


Embedding Chunks: 100%|██████████| 169/169 [00:38<00:00,  4.40it/s]



🔍 Query: Is newborn baby covered under this plan?

🔍 Query: What is the waiting period for pre-existing diseases?

🔍 Query: Does this policy cover maternity expenses?

🔍 Query: Are day care procedures included?

🔍 Query: What is the cashless hospital network?

🔍 Query: Is OPD treatment reimbursable?

🔍 Query: What is the coverage amount for critical illness?

🔍 Query: Are ambulance charges covered?

🔍 Query: Is there a claim settlement ratio mentioned?

🔍 Query: What documents are needed for claim filing?

📄 Processing easy-health.pdf...


Embedding Chunks: 100%|██████████| 178/178 [00:37<00:00,  4.73it/s]



🔍 Query: Is newborn baby covered under this plan?

🔍 Query: What is the waiting period for pre-existing diseases?

🔍 Query: Does this policy cover maternity expenses?

🔍 Query: Are day care procedures included?

🔍 Query: What is the cashless hospital network?

🔍 Query: Is OPD treatment reimbursable?

🔍 Query: What is the coverage amount for critical illness?

🔍 Query: Are ambulance charges covered?

🔍 Query: Is there a claim settlement ratio mentioned?

🔍 Query: What documents are needed for claim filing?

📄 Processing Employ-FirstPolicyWording.pdf...


Embedding Chunks: 100%|██████████| 113/113 [00:22<00:00,  5.00it/s]



🔍 Query: Is newborn baby covered under this plan?

🔍 Query: What is the waiting period for pre-existing diseases?

🔍 Query: Does this policy cover maternity expenses?

🔍 Query: Are day care procedures included?

🔍 Query: What is the cashless hospital network?

🔍 Query: Is OPD treatment reimbursable?

🔍 Query: What is the coverage amount for critical illness?

🔍 Query: Are ambulance charges covered?

🔍 Query: Is there a claim settlement ratio mentioned?

🔍 Query: What documents are needed for claim filing?

📄 Processing FHO & Accident Care Clause_GEN629.pdf...


Embedding Chunks: 100%|██████████| 80/80 [00:19<00:00,  4.16it/s]



🔍 Query: Is newborn baby covered under this plan?

🔍 Query: What is the waiting period for pre-existing diseases?

🔍 Query: Does this policy cover maternity expenses?

🔍 Query: Are day care procedures included?

🔍 Query: What is the cashless hospital network?

🔍 Query: Is OPD treatment reimbursable?

🔍 Query: What is the coverage amount for critical illness?

🔍 Query: Are ambulance charges covered?

🔍 Query: Is there a claim settlement ratio mentioned?

🔍 Query: What documents are needed for claim filing?

📄 Processing group-health-secure-terms-and-conditions.pdf...


Embedding Chunks: 100%|██████████| 134/134 [00:27<00:00,  4.83it/s]



🔍 Query: Is newborn baby covered under this plan?

🔍 Query: What is the waiting period for pre-existing diseases?

🔍 Query: Does this policy cover maternity expenses?

🔍 Query: Are day care procedures included?

🔍 Query: What is the cashless hospital network?

🔍 Query: Is OPD treatment reimbursable?

🔍 Query: What is the coverage amount for critical illness?

🔍 Query: Are ambulance charges covered?

🔍 Query: Is there a claim settlement ratio mentioned?

🔍 Query: What documents are needed for claim filing?

📄 Processing health-booster_policy-wordings.pdf...


Embedding Chunks: 100%|██████████| 195/195 [00:39<00:00,  4.97it/s]



🔍 Query: Is newborn baby covered under this plan?

🔍 Query: What is the waiting period for pre-existing diseases?

🔍 Query: Does this policy cover maternity expenses?

🔍 Query: Are day care procedures included?

🔍 Query: What is the cashless hospital network?

🔍 Query: Is OPD treatment reimbursable?

🔍 Query: What is the coverage amount for critical illness?

🔍 Query: Are ambulance charges covered?

🔍 Query: Is there a claim settlement ratio mentioned?

🔍 Query: What documents are needed for claim filing?

📄 Processing Health-Guard-Policy-Wordings-print.pdf...


Embedding Chunks: 100%|██████████| 198/198 [00:47<00:00,  4.13it/s]



🔍 Query: Is newborn baby covered under this plan?

🔍 Query: What is the waiting period for pre-existing diseases?

🔍 Query: Does this policy cover maternity expenses?

🔍 Query: Are day care procedures included?

🔍 Query: What is the cashless hospital network?

🔍 Query: Is OPD treatment reimbursable?

🔍 Query: What is the coverage amount for critical illness?

🔍 Query: Are ambulance charges covered?

🔍 Query: Is there a claim settlement ratio mentioned?

🔍 Query: What documents are needed for claim filing?

📄 Processing health_infinity_pw_new.pdf...


Embedding Chunks: 100%|██████████| 300/300 [01:04<00:00,  4.67it/s]



🔍 Query: Is newborn baby covered under this plan?

🔍 Query: What is the waiting period for pre-existing diseases?

🔍 Query: Does this policy cover maternity expenses?

🔍 Query: Are day care procedures included?

🔍 Query: What is the cashless hospital network?

🔍 Query: Is OPD treatment reimbursable?

🔍 Query: What is the coverage amount for critical illness?

🔍 Query: Are ambulance charges covered?

🔍 Query: Is there a claim settlement ratio mentioned?

🔍 Query: What documents are needed for claim filing?

📄 Processing Individual_Health_Wordings_0.pdf...


Embedding Chunks: 100%|██████████| 131/131 [00:27<00:00,  4.69it/s]



🔍 Query: Is newborn baby covered under this plan?

🔍 Query: What is the waiting period for pre-existing diseases?

🔍 Query: Does this policy cover maternity expenses?

🔍 Query: Are day care procedures included?

🔍 Query: What is the cashless hospital network?

🔍 Query: Is OPD treatment reimbursable?

🔍 Query: What is the coverage amount for critical illness?

🔍 Query: Are ambulance charges covered?

🔍 Query: Is there a claim settlement ratio mentioned?

🔍 Query: What documents are needed for claim filing?

📄 Processing policy-document.pdf...


Embedding Chunks: 100%|██████████| 414/414 [01:28<00:00,  4.68it/s]



🔍 Query: Is newborn baby covered under this plan?

🔍 Query: What is the waiting period for pre-existing diseases?

🔍 Query: Does this policy cover maternity expenses?

🔍 Query: Are day care procedures included?

🔍 Query: What is the cashless hospital network?

🔍 Query: Is OPD treatment reimbursable?

🔍 Query: What is the coverage amount for critical illness?

🔍 Query: Are ambulance charges covered?

🔍 Query: Is there a claim settlement ratio mentioned?

🔍 Query: What documents are needed for claim filing?

📄 Processing Policy-Wording-Acko-Retail-Health-Policy.pdf...


Embedding Chunks: 100%|██████████| 331/331 [01:07<00:00,  4.91it/s]



🔍 Query: Is newborn baby covered under this plan?

🔍 Query: What is the waiting period for pre-existing diseases?

🔍 Query: Does this policy cover maternity expenses?

🔍 Query: Are day care procedures included?

🔍 Query: What is the cashless hospital network?

🔍 Query: Is OPD treatment reimbursable?

🔍 Query: What is the coverage amount for critical illness?

🔍 Query: Are ambulance charges covered?

🔍 Query: Is there a claim settlement ratio mentioned?

🔍 Query: What documents are needed for claim filing?

📄 Processing smart-super-health-insurance-policy-wording.pdf...


Embedding Chunks: 100%|██████████| 202/202 [00:41<00:00,  4.82it/s]



🔍 Query: Is newborn baby covered under this plan?

🔍 Query: What is the waiting period for pre-existing diseases?

🔍 Query: Does this policy cover maternity expenses?

🔍 Query: Are day care procedures included?

🔍 Query: What is the cashless hospital network?

🔍 Query: Is OPD treatment reimbursable?

🔍 Query: What is the coverage amount for critical illness?

🔍 Query: Are ambulance charges covered?

🔍 Query: Is there a claim settlement ratio mentioned?

🔍 Query: What documents are needed for claim filing?

📄 Processing StarHealthAssureInsurancePolicy-Policy-wording.pdf...


Embedding Chunks: 100%|██████████| 202/202 [00:44<00:00,  4.53it/s]



🔍 Query: Is newborn baby covered under this plan?

🔍 Query: What is the waiting period for pre-existing diseases?

🔍 Query: Does this policy cover maternity expenses?

🔍 Query: Are day care procedures included?

🔍 Query: What is the cashless hospital network?

🔍 Query: Is OPD treatment reimbursable?

🔍 Query: What is the coverage amount for critical illness?

🔍 Query: Are ambulance charges covered?

🔍 Query: Is there a claim settlement ratio mentioned?

🔍 Query: What documents are needed for claim filing?

📄 Processing Tata_AIG_Medi_Care_82932b277a.pdf...


Embedding Chunks: 100%|██████████| 140/140 [00:28<00:00,  4.89it/s]



🔍 Query: Is newborn baby covered under this plan?

🔍 Query: What is the waiting period for pre-existing diseases?

🔍 Query: Does this policy cover maternity expenses?

🔍 Query: Are day care procedures included?

🔍 Query: What is the cashless hospital network?

🔍 Query: Is OPD treatment reimbursable?

🔍 Query: What is the coverage amount for critical illness?

🔍 Query: Are ambulance charges covered?

🔍 Query: Is there a claim settlement ratio mentioned?

🔍 Query: What documents are needed for claim filing?

✅ All policies processed. Results saved to `results.json`.


### ROUGE, BLEU, Token F1, and BERTScore

In [11]:
import json
import sacrebleu
from rouge_score import rouge_scorer
from sklearn.metrics import precision_score, recall_score
from bert_score import score as bert_score

from transformers import logging as hf_logging
hf_logging.set_verbosity_error()

def simple_tokenize(text):
    """Lower-case `text` and split it on runs of whitespace."""
    lowered = text.lower()
    tokens = lowered.split()
    return tokens

def compute_f1(reference, prediction):
    """Return the F1 score over the sets of unique tokens in the two texts.

    Both texts are tokenized with `simple_tokenize` and reduced to sets, so
    repeated tokens count once. Returns 0.0 when either set is empty or when
    the sets share no token.
    """
    ref_vocab = set(simple_tokenize(reference))
    pred_vocab = set(simple_tokenize(prediction))
    shared = ref_vocab & pred_vocab

    if not (ref_vocab and pred_vocab):
        return 0.0

    precision = len(shared) / len(pred_vocab)
    recall = len(shared) / len(ref_vocab)
    if precision + recall == 0:
        return 0.0

    harmonic = 2 * precision * recall / (precision + recall)
    return harmonic

# Load the per-policy results written by the retrieval/summarization cell.
with open("results.json", "r", encoding="utf-8") as f:
    all_data = json.load(f)

scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)

all_bleu, all_rouge, all_f1, all_bert = [], [], [], []
# Collected so BERTScore can be computed in one batched call below.
references, hypotheses = [], []

for policy in all_data:
    for result in policy["results"]:
        # The retrieved chunk serves as the reference for its summary.
        ref = result["retrieved_chunk"]
        hypo = result["summary"]
        references.append(ref)
        hypotheses.append(hypo)

        # BLEU using sacrebleu (score is 0-100; rescaled to 0-1)
        bleu = sacrebleu.sentence_bleu(hypo, [ref]).score / 100
        all_bleu.append(bleu)

        # ROUGE-L
        rouge = scorer.score(ref, hypo)["rougeL"].fmeasure
        all_rouge.append(rouge)

        # F1
        f1 = compute_f1(ref, hypo)
        all_f1.append(f1)

if hypotheses:
    # BERTScore: a single batched call. Calling it once per pair inside the
    # loop repeats the scorer's setup for every result.
    _, _, bert_f1 = bert_score(hypotheses, references, lang="en", verbose=False)
    all_bert = bert_f1.tolist()

# Print averages
if not hypotheses:
    # Avoids a ZeroDivisionError when results.json holds no results.
    print("\n⚠️ No results found in results.json; nothing to evaluate.")
else:
    print("\n✅ Evaluation Metrics across all results:")
    print(f"Average BLEU Score      : {sum(all_bleu)/len(all_bleu):.4f}")
    print(f"Average ROUGE-L F1      : {sum(all_rouge)/len(all_rouge):.4f}")
    print(f"Average Token F1 Score  : {sum(all_f1)/len(all_f1):.4f}")
    print(f"Average BERTScore F1    : {sum(all_bert)/len(all_bert):.4f}")



✅ Evaluation Metrics across all results:
Average BLEU Score      : 0.0704
Average ROUGE-L F1      : 0.3167
Average Token F1 Score  : 0.3777
Average BERTScore F1    : 0.8443
