In [1]:
pip install pdfplumber python-docx nltk langchain

Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Collecting langchain-core<0.2.0,>=0.1.33 (from langchain)
  Using cached langchain_core-0.1.53-py3-none-any.whl.metadata (5.9 kB)
Downloading python_docx-1.1.2-py3-none-any.whl (244 kB)
Using cached langchain_core-0.1.53-py3-none-any.whl (303 kB)
Installing collected packages: python-docx, langchain-core
  Attempting uninstall: langchain-core
    Found existing installation: langchain-core 0.3.51
    Uninstalling langchain-core-0.3.51:
      Successfully uninstalled langchain-core-0.3.51
Successfully installed langchain-core-0.1.53 python-docx-1.1.2
Note: you may need to restart the kernel to use updated packages.


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
langchain-experimental 0.3.3 requires langchain-community<0.4.0,>=0.3.0, but you have langchain-community 0.0.29 which is incompatible.
langchain-experimental 0.3.3 requires langchain-core<0.4.0,>=0.3.15, but you have langchain-core 0.1.53 which is incompatible.
langchain-google-genai 2.0.6 requires langchain-core<0.4,>=0.3.15, but you have langchain-core 0.1.53 which is incompatible.
langgraph-checkpoint 2.0.24 requires langchain-core<0.4,>=0.2.38, but you have langchain-core 0.1.53 which is incompatible.
langgraph-prebuilt 0.1.8 requires langchain-core!=0.3.0,!=0.3.1,!=0.3.10,!=0.3.11,!=0.3.12,!=0.3.13,!=0.3.14,!=0.3.15,!=0.3.16,!=0.3.17,!=0.3.18,!=0.3.19,!=0.3.2,!=0.3.20,!=0.3.21,!=0.3.22,!=0.3.3,!=0.3.4,!=0.3.5,!=0.3.6,!=0.3.7,!=0.3.8,!=0.3.9,<0.4.0,>=0.2.43, but you have langchain-core 0.1.53 which is incompa

In [2]:
import nltk
# Download the NLTK 'punkt' package; presumably the tokenizer data that word_tokenize relies on later — TODO confirm for the installed NLTK version.
nltk.download('punkt')


[nltk_data] Downloading package punkt to C:\Users\Sejal
[nltk_data]     Hanmante\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
import pdfplumber
import docx
import nltk
from nltk.tokenize import word_tokenize
from ipywidgets import FileUpload
import io

# word_tokenize needs the Punkt tokenizer data. Recent NLTK releases load it from
# the 'punkt_tab' package instead of the pickled 'punkt' one, so fetch both to
# work with old and new NLTK versions.
for _resource in ('punkt', 'punkt_tab'):
    nltk.download(_resource)


[nltk_data] Downloading package punkt to C:\Users\Sejal
[nltk_data]     Hanmante\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Text extraction functions

In [4]:
def extract_text_from_pdf(file_stream):
    """Extract the text of every page of a PDF.

    Args:
        file_stream: Whatever ``pdfplumber.open`` accepts; ``extract_text``
            passes an ``io.BytesIO`` holding the uploaded bytes.

    Returns:
        str: Text of all pages that yielded text, each followed by a newline.
    """
    page_texts = []
    with pdfplumber.open(file_stream) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            # A page without extractable text may give None/""; skip it instead
            # of crashing on `None + "\n"`.
            if page_text:
                page_texts.append(page_text + "\n")
    return "".join(page_texts)

def extract_text_from_docx(file_stream):
    """Return the text of a DOCX document, one paragraph per line."""
    document = docx.Document(file_stream)
    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
    return "\n".join(paragraph_texts)

def extract_text_from_txt(file_stream):
    """Read a binary stream and decode it as UTF-8 text.

    Args:
        file_stream: Binary file-like object whose ``read()`` returns bytes.

    Returns:
        str: The decoded text, without a leading byte-order mark.

    Raises:
        UnicodeDecodeError: If the bytes are not valid UTF-8.
    """
    # 'utf-8-sig' decodes plain UTF-8 identically but also drops a leading BOM
    # (written by some Windows editors) so it does not leak into the text as '\ufeff'.
    return file_stream.read().decode('utf-8-sig')

def extract_text(uploaded_file):
    """Extract text from the first file held by an ipywidgets ``FileUpload`` widget.

    Both layouts of ``FileUpload.value`` are supported:
      * ipywidgets 7: ``{filename: {"content": bytes, ...}}``
      * ipywidgets 8: ``({"name": filename, "content": memoryview, ...},)``

    Args:
        uploaded_file: Widget-like object exposing ``.value`` in one of the
            layouts above.

    Returns:
        str: Text produced by the PDF, DOCX or TXT extractor, selected by the
        (case-insensitive) file extension.

    Raises:
        ValueError: If nothing has been uploaded or the extension is not
            pdf/docx/txt.
    """
    uploads = uploaded_file.value
    if not uploads:
        raise ValueError("No file uploaded. Please upload a PDF, DOCX, or TXT file.")

    if isinstance(uploads, dict):
        # ipywidgets 7: mapping keyed by file name.
        filename, file_info = next(iter(uploads.items()))
    else:
        # ipywidgets 8: sequence of per-file records carrying their own name.
        file_info = uploads[0]
        filename = file_info['name']
    content = file_info['content']

    ext = filename.split('.')[-1].lower()
    file_stream = io.BytesIO(content)

    if ext == 'pdf':
        return extract_text_from_pdf(file_stream)
    elif ext == 'docx':
        return extract_text_from_docx(file_stream)
    elif ext == 'txt':
        return extract_text_from_txt(file_stream)
    else:
        raise ValueError("Unsupported file type. Please upload a PDF, DOCX, or TXT file.")


### Chunking function

In [5]:
def chunk_text(text, chunk_size=200, overlap=50):
    """Split text into overlapping chunks of word tokens.

    Args:
        text: Text to split; tokenized with ``word_tokenize``.
        chunk_size: Maximum number of tokens per chunk (must be positive).
        overlap: Number of tokens shared by consecutive chunks; must satisfy
            ``0 <= overlap < chunk_size`` so the window always advances.

    Returns:
        list[str]: Chunks with tokens joined by single spaces; ``[]`` for text
        without tokens.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        # A non-positive step would never advance and loop forever.
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = word_tokenize(text)
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        end = start + chunk_size
        chunks.append(" ".join(words[start:end]))
        # Once a chunk reaches the last token, a further window would only
        # repeat the overlap tail of this chunk, so stop here.
        if end >= len(words):
            break
    return chunks


In [6]:
# Single-file upload control limited to the extensions extract_text dispatches on.
upload_widget = FileUpload(
    accept='.pdf,.docx,.txt',
    multiple=False,
)
display(upload_widget)


FileUpload(value={}, accept='.pdf,.docx,.txt', description='Upload')

In [7]:
# Nothing to process until the widget holds a file.
if not upload_widget.value:
    print("Please upload a file.")
else:
    raw_text = extract_text(upload_widget)
    chunks = chunk_text(raw_text)

    print(f"Total Chunks: {len(chunks)}\n")
    # Preview the first five chunks only, each cut to 500 characters.
    preview_chunks = chunks[:5]
    for i, chunk in enumerate(preview_chunks):
        print(f"\n--- Chunk {i+1} ---\n{chunk[:500]}...")


Total Chunks: 152


--- Chunk 1 ---
Policy wordings - Smart Super Health Insurance Policy PREAMBLE : The insurance cover provided under this Policy to the Insured / Insured Person up to the Sum Insured is and shall be subject to ( a ) the terms and conditions of this Policy and ( b ) the receipt of premium and ( c ) Disclosure to Information Norm ( including by way of the Proposal or Information Summary Sheet ) and ( d ) Schedule of Benefits . SECTION 1 - DEFINITIONS : Any word or expression to which a specific meaning has been as...

--- Chunk 2 ---
and violent means . 1.2 ) `` Any one Illness '' means continuous period of Illness and it includes a relapse within 45 days from the date of last consultation with the Hospital/Nursing Home where treatment may have been taken . 1.3 ) `` Ayush Treatment '' refers to the medical and / or hospitalization treatments given under â€˜ Ayurveda , Yoga and Naturopathy , Unani , Siddha and Homeopathy systems . 1.4 ) `` Cashless facility '' means a f

In [9]:
import os
import pdfplumber
import re
import nltk
from nltk.tokenize import word_tokenize
from transformers import AutoTokenizer, AutoModel
import torch
from tqdm import tqdm


### InsuranceBERT model

In [14]:
# Load the InsuranceBERT tokenizer and model; both come from the same checkpoint name.
from transformers import AutoTokenizer, AutoModel

_CHECKPOINT = "llmware/industry-bert-insurance-v0.1"
tokenizer = AutoTokenizer.from_pretrained(_CHECKPOINT)
model = AutoModel.from_pretrained(_CHECKPOINT)

def get_embedding(text):
    """Embed text by mean-pooling the model's last hidden state.

    Pooling is weighted by the attention mask, so padding positions (which only
    occur when several texts of different lengths are passed at once) do not
    dilute the average. For a single string the result matches a plain mean
    over the token axis.

    Args:
        text: A string (or list of strings) accepted by the module-level
            ``tokenizer``; input is truncated to 512 tokens.

    Returns:
        numpy.ndarray: The pooled embedding with size-1 axes squeezed out
        (a 1-D vector for a single string).
    """
    tokens = tokenizer(text, padding=True, truncation=True, max_length=512, return_tensors='pt')
    with torch.no_grad():
        output = model(**tokens)
    hidden = output.last_hidden_state
    # Broadcast the mask over the hidden dimension so padded positions contribute zero.
    mask = tokens["attention_mask"].unsqueeze(-1).to(hidden.dtype)
    token_counts = mask.sum(dim=1).clamp(min=1)  # guard against division by zero
    embeddings = (hidden * mask).sum(dim=1) / token_counts
    return embeddings.squeeze().numpy()


RuntimeError: Failed to import transformers.models.timm_wrapper.configuration_timm_wrapper because of the following error (look up to see its traceback):
partially initialized module 'torchvision' has no attribute 'extension' (most likely due to a circular import)

In [None]:
pip uninstall timm


### Preprocessing functions

In [11]:
def extract_text_from_pdf(file_path):
    """Return the text of all pages of a PDF, one newline after each page.

    Pages for which ``extract_text()`` yields nothing (None or an empty string)
    are left out entirely.
    """
    collected = []
    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if not page_text:
                continue
            collected.append(page_text + "\n")
    return "".join(collected)

def clean_text(text):
    """Normalize text extracted from a PDF.

    Removes common page markers ("Page 3", "3 / 12") and collapses every run of
    whitespace, newlines included, into a single space.

    Args:
        text: Raw extracted text.

    Returns:
        str: Cleaned text without leading/trailing whitespace.
    """
    # Strip page markers first so the whitespace pass below also collapses the
    # gap they leave behind (otherwise "foo Page 3 bar" became "foo  bar").
    # NOTE(review): the "N / M" pattern also matches fractions and dates such as
    # "24/7" or "12/2020" — confirm that dropping those is acceptable.
    text = re.sub(r'Page\s*\d+|\d+\s*/\s*\d+', '', text)
    # \s+ already covers newlines, so a separate newline pass is unnecessary.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

def chunk_text(text, chunk_size=200, overlap=50):
    """Split text into overlapping chunks of word tokens.

    Args:
        text: Text to split; tokenized with ``word_tokenize``.
        chunk_size: Maximum number of tokens per chunk (must be positive).
        overlap: Number of tokens shared by consecutive chunks; must satisfy
            ``0 <= overlap < chunk_size`` so the window always advances.

    Returns:
        list[str]: Chunks with tokens joined by single spaces; ``[]`` for text
        without tokens.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        # A non-positive step would never advance and loop forever.
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = word_tokenize(text)
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        end = start + chunk_size
        chunks.append(" ".join(words[start:end]))
        # Once a chunk reaches the last token, a further window would only
        # repeat the overlap tail of this chunk, so stop here.
        if end >= len(words):
            break
    return chunks


### Processing all PDFs in the folder


In [None]:
# NOTE(review): machine-specific absolute path; adjust it before running elsewhere.
pdf_folder = r"D:\NLPInsuranceProject\NLPINSURANCE-FINTECHPROJ\policy_pdfs"
all_chunks = []
all_embeddings = []
metadata = []

# Sort for a reproducible chunk order across runs, and compare the extension
# case-insensitively so files such as "POLICY.PDF" are not silently skipped.
pdf_files = sorted(name for name in os.listdir(pdf_folder) if name.lower().endswith(".pdf"))

# Iterating over the filtered list makes the progress bar count PDFs only.
for filename in tqdm(pdf_files):
    path = os.path.join(pdf_folder, filename)
    raw_text = extract_text_from_pdf(path)
    cleaned_text = clean_text(raw_text)
    chunks = chunk_text(cleaned_text)

    # The three lists stay index-aligned: entry n of each describes the same chunk.
    for i, chunk in enumerate(chunks):
        embedding = get_embedding(chunk)
        all_chunks.append(chunk)
        all_embeddings.append(embedding)
        metadata.append({"file": filename, "chunk_id": i})


In [None]:
import numpy as np
import pickle

# Persist the chunks, their embeddings and the per-chunk metadata together in one pickle file.
with open("embeddings.pkl", "wb") as f:
    pickle.dump({
        "chunks": all_chunks,
        "embeddings": all_embeddings,
        "metadata": metadata
    }, f)

# Status message with the check-mark emoji restored (it was stored mis-encoded).
print("✅ Embeddings and chunks saved.")


### FAISS INDEXING

In [None]:
import faiss
import numpy as np
import pickle


In [None]:
# Read back the pickle holding chunks, embeddings and metadata.
# NOTE: pickle.load can execute arbitrary code; only open files this notebook wrote.
with open("embeddings.pkl", "rb") as f:
    data = pickle.load(f)

all_chunks, metadata = data["chunks"], data["metadata"]
# Stack the stored embeddings into one array and cast it to float32 for the index.
embedding_matrix = np.array(data["embeddings"])
all_embeddings = embedding_matrix.astype("float32")


In [None]:
# Build an exact L2 index over the embeddings and persist it with its lookup tables.
# Fail early with a clear message instead of an IndexError on `shape[1]` when
# no chunks were embedded.
if all_embeddings.ndim != 2 or all_embeddings.shape[0] == 0:
    raise ValueError("No embeddings to index; expected a non-empty 2-D array.")

# Create FAISS index
dimension = all_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)

# Add embeddings to the index
index.add(all_embeddings)

# Save FAISS index
faiss.write_index(index, "policy_index.faiss")

# Save metadata and chunks for lookup (row n of the index corresponds to entry n of each list)
with open("policy_metadata.pkl", "wb") as f:
    pickle.dump({"chunks": all_chunks, "metadata": metadata}, f)

# Status message with the check-mark emoji restored (it was stored mis-encoded).
print("✅ FAISS index and metadata saved.")


In [None]:
# Restore the persisted FAISS index from disk.
index = faiss.read_index("policy_index.faiss")

# Restore the chunk texts and their metadata saved next to the index.
# NOTE: pickle.load can execute arbitrary code; only open files this notebook wrote.
with open("policy_metadata.pkl", "rb") as f:
    db = pickle.load(f)

chunks, metadata = db["chunks"], db["metadata"]


In [None]:
def search_policy(query, k=3):
    """Return up to ``k`` indexed chunks closest to ``query``.

    Args:
        query: Query text, embedded with ``get_embedding``.
        k: Number of nearest neighbours requested from the FAISS index.

    Returns:
        list[dict]: One dict per hit with keys ``"chunk"``, ``"file"`` and
        ``"chunk_id"``, in the order returned by the index. Fewer than ``k``
        entries are returned when the index holds fewer than ``k`` vectors.
    """
    query_embedding = get_embedding(query).astype("float32").reshape(1, -1)
    _, indices = index.search(query_embedding, k)

    results = []
    for raw_idx in indices[0]:
        idx = int(raw_idx)
        # FAISS pads missing neighbours with label -1; indexing with it would
        # silently return the *last* chunk, so skip those slots.
        if idx < 0:
            continue
        results.append({
            "chunk": chunks[idx],
            "file": metadata[idx]["file"],
            "chunk_id": metadata[idx]["chunk_id"]
        })
    return results


### Query Handling

In [None]:
import faiss
import pickle
import numpy as np

# Read the saved FAISS index back into memory.
index = faiss.read_index("policy_index.faiss")

# Read the chunk/metadata lookup tables stored alongside the index.
# NOTE: pickle.load can execute arbitrary code; only open files this notebook wrote.
with open("policy_metadata.pkl", "rb") as f:
    db = pickle.load(f)

chunks, metadata = db["chunks"], db["metadata"]


### Query search + prompt builder

In [None]:
def retrieve_relevant_chunks(query, k=3):
    """Return up to ``k`` indexed chunks closest to ``query``.

    Args:
        query: Query text, embedded with ``get_embedding``.
        k: Number of nearest neighbours requested from the FAISS index.

    Returns:
        list[dict]: One dict per hit with keys ``"chunk"``, ``"file"`` and
        ``"chunk_id"``, in the order returned by the index. Fewer than ``k``
        entries are returned when the index holds fewer than ``k`` vectors.
    """
    query_embedding = get_embedding(query).astype("float32").reshape(1, -1)
    _, indices = index.search(query_embedding, k)

    results = []
    for raw_idx in indices[0]:
        idx = int(raw_idx)
        # FAISS pads missing neighbours with label -1; indexing with it would
        # silently return the *last* chunk, so skip those slots.
        if idx < 0:
            continue
        results.append({
            "chunk": chunks[idx],
            "file": metadata[idx]["file"],
            "chunk_id": metadata[idx]["chunk_id"]
        })
    return results

def build_prompt(retrieved_chunks, query):
    """Assemble the LLM prompt from retrieved chunks and the user's question.

    Each chunk is rendered as a "[file - Chunk id]:" header followed by its
    text; chunks are separated by blank lines and placed under "Context:",
    with the question appended after it.
    """
    sections = []
    for hit in retrieved_chunks:
        sections.append(f"[{hit['file']} - Chunk {hit['chunk_id']}]:\n{hit['chunk']}")
    context = "\n\n".join(sections)

    return (
        "You are a helpful assistant. Answer the user's question based only on the provided policy content.\n"
        "\n"
        "Context:\n"
        f"{context}\n"
        "\n"
        f"Question: {query}\n"
    )


In [None]:
import os

import openai

# Route the OpenAI client to the OpenRouter endpoint.
openai.api_base = "https://openrouter.ai/api/v1"

# Never hard-code credentials in a notebook: read the key from the environment.
# NOTE(review): a key was previously committed in this cell; treat it as leaked and revoke it.
openai.api_key = os.environ.get("OPENROUTER_API_KEY")
if not openai.api_key:
    raise RuntimeError("Set the OPENROUTER_API_KEY environment variable before running this cell.")

def ask_deepseek(prompt, model="deepseek/deepseek-r1:free"):
    """Send ``prompt`` to a chat model and return the reply text.

    The request goes through ``openai.ChatCompletion.create`` with a fixed
    system message, temperature 0.2 and a 800-token completion limit; the
    content of the first returned choice is handed back.
    """
    conversation = [
        {"role": "system", "content": "You are a helpful assistant for insurance policy queries."},
        {"role": "user", "content": prompt},
    ]
    request_options = {"temperature": 0.2, "max_tokens": 800}
    completion = openai.ChatCompletion.create(model=model, messages=conversation, **request_options)
    first_choice = completion.choices[0]
    return first_choice.message['content']


In [None]:
# End-to-end example: retrieve context for a question, build the prompt, query the LLM.
query = "Does this policy provide cashless hospitalization for critical illness?"

# Step 1: Retrieve
top_chunks = retrieve_relevant_chunks(query)

# Step 2: Build Prompt
prompt = build_prompt(top_chunks, query)

# Step 3: Ask LLM
answer = ask_deepseek(prompt)

# Step 4: Display (emoji in the headings restored; they were stored mis-encoded)
print("📄 Retrieved Context:")
for hit in top_chunks:
    # Show only the first 400 characters of each retrieved chunk.
    print(f"\n[{hit['file']} - Chunk {hit['chunk_id']}]:\n{hit['chunk'][:400]}...\n")

print("\n🧠 Answer:")
print(answer)
