In [2]:
import fitz # pip install pymupdf
import pytesseract # pip install pytesseract
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
from PIL import Image # pip install pillow
import io

In [4]:
def read_pdf(file):
    doc = fitz.open(file)
    text = ""
    for page in doc:
        page_text = page.get_text()

        if len(page_text.strip()) == 0: # might be an image
            pix = page.get_pixmap()
            img = Image.open(io.BytesIO(pix.tobytes("png")))
            page_text = pytesseract.image_to_string(img)

        text += page_text + "\n"
    return text

pdf_text = read_pdf("docs/EDLHLGA23009V012223.pdf")
print(pdf_text[:21])


Well Baby Well Mother


In [5]:
# Split chunks based on the doc length.
# STEP 2:

from langchain_text_splitters import RecursiveCharacterTextSplitter # pip install lanchain

# Suppose pdf_text is your extracted text from Step 1
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,      # size of each chunk
    chunk_overlap=100,    # overlap so that important info isn't cut off
    separators=["\n\n", "\n", ".", " "]  # try to split on paragraphs/sentences
)

chunks = text_splitter.split_text(pdf_text)

print(f"Total chunks: {len(chunks)}")
print("First chunk:\n", chunks[8])

Total chunks: 9
First chunk:
 sum insured shall be subject to limit in place. 
Routine Preventive Care Services will include expenses recommended by a doctor and incurred on – 
Pharmacy and Diagnostic Tests.


In [6]:
# STEP 3:
# pip install sentence-transformers faiss-cpu
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

In [7]:
model = SentenceTransformer('all-MiniLM-L6-v2')

embeddings = model.encode(chunks, convert_to_numpy=True)

# Store embeddings in FAISS index
dimension = embeddings.shape[1]  # length of each embedding vector
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

# Save index and chunks for later use
faiss.write_index(index, "vector.index")
import pickle
with open("chunks.pkl", "wb") as f:
    pickle.dump(chunks, f)

print(f"Stored {len(chunks)} chunks in FAISS.")

Stored 9 chunks in FAISS.


  return forward_call(*args, **kwargs)


In [None]:
# Load index and chunks
index = faiss.read_index("vector.index")
with open("chunks.pkl", "rb") as f:
    chunks = pickle.load(f)

# Convert query to embedding
query = "knee surgery in Pune 3 months old policy"
query_embedding = model.encode([query], convert_to_numpy=True)

# Search
k = 5  # top 5 most relevant chunks
distances, indices = index.search(query_embedding, k)

print("Top relevant chunks:")
for i in indices[0]:
    print(chunks[i])

Top relevant chunks:
immunizations) up to first discharge from hospital. 
iii) At the onset of pregnancy, maternity hospitalization (only routine preventive care services and 
immunizations) and  until 30 days following birth of new born baby. 
We will not cover 
1. Any infertility treatments 
2. Any charges payable under the maternity section (if opted as an optional cover) of the policy 
  
Healthy baby expenses / well baby care expenses  
Cover for expenses incurred for a New born baby after the birth until first discharge from hospital.  
Covers routine medical care provided to a new born baby, which includes limited to appropriate customary 
examinations required to assess the integrity and basic functions of child’s organs and skeletal structure 
carried out immediately following birth, routine preventive care services and immunizations (within the 
hospitalization period).  
The sum insured limit shall be as opted for and specified in the policy schedule. For multiple born babie

In [9]:
import requests
from sentence_transformers import SentenceTransformer
import faiss, pickle

# === Load your FAISS index and chunks (from previous steps) ===
index = faiss.read_index("vector.index")
with open("chunks.pkl", "rb") as f:
    chunks = pickle.load(f)

embed_model = SentenceTransformer('all-MiniLM-L6-v2')

# === Step 1: Query from user ===
query = "46-year-old male, knee surgery in Pune, 3-month-old insurance policy"

# === Step 2: Retrieve relevant text ===
query_embedding = embed_model.encode([query], convert_to_numpy=True)
distances, indices = index.search(query_embedding, 5)
relevant_clauses = [chunks[i] for i in indices[0]]

# === Step 3: Build prompt for reasoning ===
context = "\n\n".join(relevant_clauses)
prompt = f"""
You are an insurance policy assistant.

User query:
{query}

Relevant clauses:
{context}

Follow these steps:
1. Extract details from the query: age, gender, procedure, location, policy duration.
2. Decide whether the claim is approved or rejected.
3. Justify the decision with references to specific clauses.
4. Return JSON with fields: decision, amount (if any), justification, used_clauses.
"""

print("LLM Output:\n", result)


NameError: name 'result' is not defined

In [15]:
from fastapi import FastAPI
from pydantic import BaseModel
import requests, faiss, pickle
from sentence_transformers import SentenceTransformer

# Load everything at startup
index = faiss.read_index("vector.index")
with open("chunks.pkl", "rb") as f:
    chunks = pickle.load(f)
embed_model = SentenceTransformer('all-MiniLM-L6-v2')

app = FastAPI()

class QueryRequest(BaseModel):
    query: str

@app.post("/ask")
def ask(req: QueryRequest):
    # 1. Find relevant text
    q_emb = embed_model.encode([req.query], convert_to_numpy=True)
    _, indices = index.search(q_emb, 5)
    relevant_clauses = [chunks[i] for i in indices[0]]

    # 2. Build prompt
    context = "\n\n".join(relevant_clauses)
    prompt = f"""
    User query:
    {req.query}

    Relevant clauses:
    {context}

    Instructions:
    1. Extract details (age, gender, procedure, location, policy duration).
    2. Decide approval/rejection.
    3. Justify using clauses.
    4. Return valid JSON with: decision, amount, justification, used_clauses.
    """

    # 3. Call local Mistral via Ollama
    resp = requests.post(
        "http://localhost:11434/api/generate",
        json={"model": "mistral", "prompt": prompt, "stream": False}
    )
    return {"result": resp.json()["response"]}
