# Step 1: Decode the InsuranceQA Dataset

In [1]:
import json
import gzip

# Function to load vocabulary from a file
def load_vocabulary(vocab_file_path):
    """Read a tab-separated vocabulary file into an index -> word mapping.

    Every line is stripped of surrounding whitespace and split on its first
    tab only, so a word may itself contain tabs. Lines that do not produce
    exactly two fields are ignored. Keys are kept as the strings found in the
    file; if an index appears more than once, the last occurrence wins.

    Args:
        vocab_file_path: Path to a UTF-8 text file of "index<TAB>word" lines.

    Returns:
        dict mapping each index string to its word string.
    """
    with open(vocab_file_path, "r", encoding="utf-8") as handle:
        # Lazily split each line; the dict comprehension consumes the
        # generator while the file is still open.
        split_lines = (raw.strip().split("\t", 1) for raw in handle)
        return {fields[0]: fields[1] for fields in split_lines if len(fields) == 2}

# Function to decode category and question only
def decode_questions(encoded_file_path, vocab_dict1, vocab_dict2):
    """Decode the category and question columns of a gzipped, tab-separated file.

    Each line is stripped and split on tabs. Lines with fewer than two fields
    are skipped. The first field is kept verbatim as the category; the second
    field is split on whitespace and every token is translated to a word.
    Any further fields on the line are ignored.

    Args:
        encoded_file_path: Path to a gzip-compressed UTF-8 text file.
        vocab_dict1: Primary token -> word mapping, consulted first.
        vocab_dict2: Fallback token -> word mapping; tokens found in neither
            mapping become "[UNKNOWN]".

    Returns:
        list of {'category': str, 'question': str} dicts in file order. If an
        exception occurs while reading, it is printed and whatever was decoded
        up to that point is returned.
    """

    def lookup(token):
        # Primary vocabulary first, then the fallback, then the placeholder.
        if token in vocab_dict1:
            return vocab_dict1[token]
        return vocab_dict2.get(token, "[UNKNOWN]")

    results = []

    try:
        with gzip.open(encoded_file_path, 'rt', encoding='utf-8') as stream:
            for raw_line in stream:
                fields = raw_line.strip().split("\t")
                if len(fields) < 2:
                    continue  # not a "category<TAB>question..." line

                category, encoded = fields[0], fields[1]
                words = map(lookup, encoded.split())
                results.append({
                    'category': category,
                    'question': " ".join(words)
                })
    except Exception as e:
        print(f"Error decoding file {encoded_file_path}: {e}")

    return results

# File paths
# NOTE(review): these are absolute paths on one Windows machine; point them at
# a local copy of the dataset before running the notebook anywhere else.
vocab_file1_path = r"D:\NLPInsuranceProject\NLPINSURANCE-FINTECHPROJ\dataset insurance qa\vocabulary"
vocab_file2_path = r"D:\NLPInsuranceProject\NLPINSURANCE-FINTECHPROJ\vocabulary.txt"
train_file_path = r"D:\NLPInsuranceProject\NLPINSURANCE-FINTECHPROJ\dataset insurance qa\InsuranceQA.question.anslabel.raw.1500.pool.solr.train.encoded.gz"
test1_file_path = r"D:\NLPInsuranceProject\NLPINSURANCE-FINTECHPROJ\dataset insurance qa\InsuranceQA.question.anslabel.raw.1500.pool.solr.test.encoded.gz"
test2_file_path = r"D:\NLPInsuranceProject\NLPINSURANCE-FINTECHPROJ\dataset insurance qa\InsuranceQA.question.anslabel.raw.1000.pool.solr.test.encoded.gz"

# Load both vocabulary files
# vocab_dict1 is consulted first when decoding; vocab_dict2 is only a fallback.
vocab_dict1 = load_vocabulary(vocab_file1_path)
vocab_dict2 = load_vocabulary(vocab_file2_path)

# Decode only questions
# One call per split; each returns a list of {'category', 'question'} dicts.
train_questions = decode_questions(train_file_path, vocab_dict1, vocab_dict2)
test1_questions = decode_questions(test1_file_path, vocab_dict1, vocab_dict2)
test2_questions = decode_questions(test2_file_path, vocab_dict1, vocab_dict2)

# Combine all decoded questions
# Plain concatenation: train first, then both test splits; nothing is de-duplicated.
all_questions_decoded = train_questions + test1_questions + test2_questions

# Save decoded data as a JSON file
# NOTE(review): later cells open "decoded_questions.json" relative to the
# working directory and from the NLPINSURANCE-FINTECHPROJ folder, not from this
# location — confirm the file is copied or that the paths are meant to differ.
output_file_path = r"D:\NLPInsuranceProject\decoded_questions.json"
with open(output_file_path, 'w', encoding='utf-8') as f_out:
    # ensure_ascii=False keeps non-ASCII characters readable in the file.
    json.dump(all_questions_decoded, f_out, indent=4, ensure_ascii=False)

print(f"✅ Decoding complete! Results saved to {output_file_path}")

# Display a few decoded examples
print(json.dumps(all_questions_decoded[:3], indent=4, ensure_ascii=False))


✅ Decoding complete! Results saved to D:\NLPInsuranceProject\decoded_questions.json
[
    {
        "category": "disability-insurance",
        "question": "Is Disability Insurance Required By Law?"
    },
    {
        "category": "life-insurance",
        "question": "Can Creditors Take Life Insurance After Death?"
    },
    {
        "category": "renters-insurance",
        "question": "Does Travelers Insurance Have Renters Insurance?"
    }
]


In [2]:
pip install SpeechRecognition

Note: you may need to restart the kernel to use updated packages.


# Step 2: Set Up Imports and the OpenRouter Client — use this version ("he wala ghe")

In [3]:
import sqlite3
import speech_recognition as sr
import pyttsx3
import numpy as np
from openai import OpenAI
from sentence_transformers import SentenceTransformer
from IPython.display import display, Markdown

# ✅ Set Up OpenRouter API
# The API key is read from the OPENROUTER_API_KEY environment variable so that
# no credential is stored in the notebook. Any key that was ever committed in
# this cell must be treated as leaked and revoked in the OpenRouter dashboard.
import os

API_KEY = os.environ.get("OPENROUTER_API_KEY")
if not API_KEY:
    # Fail here with a clear message instead of at the first chat request.
    raise RuntimeError(
        "OPENROUTER_API_KEY is not set. Export it in the environment that "
        "starts the Jupyter kernel before running this cell."
    )

client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=API_KEY,
    default_headers={
        # NOTE(review): the client presumably derives this header from
        # api_key already; kept as in the original — confirm before removing.
        "Authorization": f"Bearer {API_KEY}",
        "X-Title": "Insurance Chatbot"
    }
)




In [4]:
import json
from sentence_transformers import InputExample

# ✅ Load the decoded questions from the working directory
with open("decoded_questions.json", "r", encoding="utf-8") as f:
    questions_data = json.load(f)

# ✅ Pair every question with the one that follows it in the file.
# These are adjacent pairs in file order, not paraphrases and not random
# pairs, so they are only a rough first attempt at training data.
questions = [entry["question"] for entry in questions_data]
examples = [
    InputExample(texts=[current, following])
    for current, following in zip(questions, questions[1:])
]

print(f"Total training pairs: {len(examples)}")


Total training pairs: 16888


In [5]:
import json
from sentence_transformers import InputExample
from collections import defaultdict
import random

# ✅ Load decoded questions
with open("decoded_questions.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# ✅ Group questions by category
category_to_questions = defaultdict(list)
for item in data:
    category_to_questions[item["category"]].append(item["question"])

# ✅ Create pairs of questions from the same category
# A dedicated, seeded generator makes the pairing identical on every run and
# leaves the global `random` state untouched.
PAIRING_SEED = 42
pairing_rng = random.Random(PAIRING_SEED)

examples = []
for category, category_questions in category_to_questions.items():
    if len(category_questions) < 2:
        continue  # a single question cannot form a pair
    # Shuffles the stored list in place, then pairs neighbours (0,1), (2,3), ...
    pairing_rng.shuffle(category_questions)
    # With an odd count the last question is left unpaired.
    for i in range(0, len(category_questions) - 1, 2):
        examples.append(
            InputExample(texts=[category_questions[i], category_questions[i + 1]])
        )

print(f"✅ Prepared {len(examples)} training examples")


✅ Prepared 8441 training examples


In [6]:
# Hand-written evaluation triples: (first sentence, second sentence, score).
# Each score is a manual judgement of how similar the two sentences are.
labelled_pairs = [
    ("What is term insurance?", "Explain term insurance coverage.", 0.95),
    ("How can I cancel my health insurance?", "What’s the process for cancelling a policy?", 0.8),
    ("Does motor insurance cover theft?", "Is vehicle theft included in insurance?", 0.9),
]

# Unpack into the three parallel lists used when the evaluator is built.
sentences1 = [first for first, _second, _score in labelled_pairs]
sentences2 = [second for _first, second, _score in labelled_pairs]
scores = [score for _first, _second, score in labelled_pairs]


In [8]:
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, losses

from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator


# ✅ Load base model
# Fine-tuning starts from the pretrained "all-MiniLM-L6-v2" checkpoint.
model = SentenceTransformer("all-MiniLM-L6-v2")
# Optional GPU placement, currently disabled:
#model = model.to('cuda')

# ✅ Prepare DataLoader
# `examples` is whatever the most recently executed pair-building cell produced.
train_dataloader = DataLoader(examples, shuffle=True, batch_size=64)

# ✅ Choose Loss
# NOTE(review): this loss presumably treats the other pairs in a batch as
# negatives; pairs here share only a category, so a batch may contain several
# pairs from the same category — confirm this is acceptable.
train_loss = losses.MultipleNegativesRankingLoss(model)

# The evaluator is built from the three hand-written sentence pairs, so the
# correlations it reports are a smoke test rather than a real validation score.
evaluator = EmbeddingSimilarityEvaluator(sentences1, sentences2, scores)
# Train for 5 epochs; the evaluator is requested every 500 steps.
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=5,
    evaluator=evaluator,
    evaluation_steps=500,
    warmup_steps=100,
    show_progress_bar=True
)


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss,Validation Loss,Pearson Cosine,Spearman Cosine
132,No log,No log,0.975155,1.0
264,No log,No log,0.972147,1.0
396,No log,No log,0.971722,1.0
500,2.305000,No log,0.969061,1.0
528,2.305000,No log,0.971763,1.0
660,2.305000,No log,0.970769,1.0


In [9]:
# Save the fine-tuned encoder into a directory relative to the working directory.
# NOTE(review): the chatbot cell loads the encoder from an absolute path ending
# in "custom_insurance_encoder" — confirm both refer to the same directory.
model_save_path = "custom_insurance_encoder"
model.save(model_save_path)
print(f"✅ Model saved at: {model_save_path}")


✅ Model saved at: custom_insurance_encoder


In [10]:
from sentence_transformers.util import cos_sim

# Quick sanity check of the fine-tuned encoder on three hand-written sentences.
sentences = [
    "What does liability insurance cover?",
    "Tell me about car insurance coverage.",
    "When does a policy lapse?",
]

embeddings = model.encode(sentences, convert_to_tensor=True)

# Only the first two sentences are compared; the third is encoded but unused.
first_vec, second_vec = embeddings[0], embeddings[1]
similarity = cos_sim(first_vec, second_vec)
print("Similarity between 1st and 2nd:", similarity)


Similarity between 1st and 2nd: tensor([[0.8101]])


# Step 4: Run the Voice-Enabled Insurance Chatbot — use this version ("he wala ghe")

In [12]:
from IPython.display import display, Markdown


# ✅ Load Fine-Tuned Sentence Transformer
# NOTE(review): absolute, machine-specific path; the save cell wrote the model
# to the relative directory "custom_insurance_encoder" — confirm they match.
model = SentenceTransformer(r"D:\NLPInsuranceProject\NLPINSURANCE-FINTECHPROJ\custom_insurance_encoder")

# ✅ Load Insurance Questions
import json
with open(r"D:\NLPInsuranceProject\NLPINSURANCE-FINTECHPROJ\decoded_questions.json", "r", encoding="utf-8") as file:
    insurance_data = json.load(file)

# The question corpus and its embeddings are computed once here and reused by
# find_relevant_questions() for every query.
questions = [entry["question"] for entry in insurance_data]
question_embeddings = model.encode(questions)

# ✅ Set Up SQLite for Memory
# The connection and cursor are notebook-level globals shared by
# search_memory() and store_memory(); nothing in this cell closes them.
conn = sqlite3.connect("chatbot_memory.db", check_same_thread=False)
cursor = conn.cursor()
# The table has no key or uniqueness constraint, so the same query can be stored more than once.
cursor.execute("CREATE TABLE IF NOT EXISTS chatbot_memory (query TEXT, response TEXT)")
conn.commit()

# ✅ Function to Search Memory
def search_memory(user_query):
    """Return the stored response for exactly this query text, or None.

    Uses the notebook-level SQLite ``cursor``. The match is an exact string
    comparison on the ``query`` column and only the first matching row is read.

    Args:
        user_query: Query text to look up.

    Returns:
        The ``response`` value of the first matching row, or None if no row matches.
    """
    cursor.execute("SELECT response FROM chatbot_memory WHERE query=?", (user_query,))
    row = cursor.fetchone()
    if not row:
        return None
    return row[0]

# ✅ Function to Store in Memory
def store_memory(user_query, response):
    """Append a (query, response) row to the chatbot_memory table and commit.

    Uses the notebook-level ``cursor`` and ``conn``. A new row is always
    inserted; existing rows for the same query are left in place.

    Args:
        user_query: Query text to store.
        response: Response text to store alongside it.
    """
    row = (user_query, response)
    cursor.execute("INSERT INTO chatbot_memory (query, response) VALUES (?, ?)", row)
    conn.commit()

# ✅ Function for Semantic Search
def find_relevant_questions(user_query, top_k=3):
    """Return the stored questions whose embeddings score highest against the query.

    The query is encoded with the notebook-level ``model`` and scored against
    ``question_embeddings`` with a dot product.

    NOTE(review): a dot product equals cosine similarity only if the encoder
    returns unit-length vectors — confirm for the loaded model.

    Args:
        user_query: Free-text question from the user.
        top_k: Maximum number of questions to return (default 3, the value
            that was previously hard-coded).

    Returns:
        list of question strings, best match first. Empty when ``top_k`` is
        not positive or when there are no stored questions; shorter than
        ``top_k`` when fewer questions exist.
    """
    # A slice of [-0:] would select everything, so handle non-positive
    # top_k (and an empty corpus) explicitly.
    if top_k <= 0 or len(questions) == 0:
        return []

    query_embedding = model.encode([user_query])
    similarities = np.dot(query_embedding, question_embeddings.T)[0]
    # argsort is ascending: keep the last top_k indices and reverse them so
    # the highest score comes first.
    top_indices = np.argsort(similarities)[-top_k:][::-1]
    return [questions[i] for i in top_indices]

# ✅ Function to Get Insurance Answer
def get_insurance_response(user_query):
    """Answer a user query from the cache or, failing that, from the LLM.

    Steps:
      1. Look the exact query text up with ``search_memory``; a hit is
         returned prefixed with "[From Memory] ".
      2. Otherwise collect related stored questions with
         ``find_relevant_questions``.
      3. Send the query and those questions to the chat model and cache the
         reply with ``store_memory``.

    Args:
        user_query: The user's question.

    Returns:
        str: the cached or freshly generated answer, or a string starting
        with "Error: " if the request fails or the model returns no text.
        Error strings are never cached.
    """
    # Step 1: Check Memory
    memory_response = search_memory(user_query)
    if memory_response:
        return f"[From Memory] {memory_response}"

    # Step 2: Find Relevant Questions
    relevant_questions = find_relevant_questions(user_query)

    # Step 3: Call OpenRouter (DeepSeek R1)
    try:
        response = client.chat.completions.create(
            model="deepseek/deepseek-r1:free",
            messages=[
                {"role": "system", "content": "You are an insurance chatbot based from India who follows rules and regulations related to insurance for India. Answer insurance-related questions only. If non-insurance related questions asked please politely deny saying i am not made for this domain"},
                {"role": "user", "content": f"User Query: {user_query}\nRelevant Questions: {relevant_questions}"}
            ],
            temperature=0.3,
        )
        chatbot_response = response.choices[0].message.content
    except Exception as e:
        # Best-effort: report the failure to the user instead of crashing the chat loop.
        return f"Error: {str(e)}"

    # The message content can be missing or blank; caching that would store a
    # useless row and hand None to the caller.
    if not chatbot_response or not chatbot_response.strip():
        return "Error: the model returned an empty response."

    try:
        store_memory(user_query, chatbot_response)  # Save response in memory
    except Exception as e:
        # Same outcome as before: a storage failure surfaces as an error string.
        return f"Error: {str(e)}"
    return chatbot_response

# ✅ Voice Input & Output Functions
def recognize_speech():
    """Record one utterance from the microphone and return its transcription.

    Returns:
        str: the text returned by ``recognize_google``. When recognition
        fails, a human-readable message is returned in place of a transcript:
        "Sorry, I couldn't understand." for ``sr.UnknownValueError`` and
        "Speech recognition service is unavailable." for ``sr.RequestError``.
    """
    recognizer = sr.Recognizer()
    with sr.Microphone() as source:
        print("🎤 Listening...")
        # Calibrate against background noise before recording.
        recognizer.adjust_for_ambient_noise(source)
        audio = recognizer.listen(source)

    try:
        transcript = recognizer.recognize_google(audio)
    except sr.UnknownValueError:
        transcript = "Sorry, I couldn't understand."
    except sr.RequestError:
        transcript = "Speech recognition service is unavailable."
    return transcript

def speak_response(response):
    """Speak ``response`` aloud through a pyttsx3 engine.

    A new engine handle is requested on every call. If ``runAndWait`` raises
    ``RuntimeError``, the engine's loop is ended and ``runAndWait`` is tried
    once more; a failure of that second attempt propagates to the caller.

    Args:
        response: Text to speak.
    """
    engine = pyttsx3.init()
    engine.say(response)

    try:
        engine.runAndWait()  # Normal execution
    except RuntimeError:
        # NOTE(review): presumably raised when an engine loop is already
        # running (e.g. left over from an earlier call) — confirm.
        engine.endLoop()  # Stop the current event loop
        engine.runAndWait()

# ✅ Main Chat Loop (Conversation Mode)
def start_chatbot():
    """Run the interactive chat loop until the user says goodbye.

    Each turn reads a typed line, or a spoken utterance when the user types
    'voice'. The query is answered with ``get_insurance_response``, shown as
    Markdown and spoken aloud, and the user is then asked whether to continue.

    Input handling:
      * Typed and spoken input are both stripped and lower-cased, so exit
        words and cached queries match however they were entered.
      * If speech recognition fails, its message is shown and the user is
        prompted again; the message is never sent to the model as a question.
      * Blank input is ignored and the user is prompted again.
    """
    exit_words = {"thank you", "thanks", "ok", "exit", "goodbye", "bye"}
    # Messages recognize_speech() returns instead of a transcript on failure.
    speech_failures = {
        "Sorry, I couldn't understand.",
        "Speech recognition service is unavailable.",
    }

    print("💬 Insurance Chatbot Started! Type 'voice' for voice input. Say 'thank you', 'ok', or 'exit' to stop.")

    while True:
        user_input = input("👤 You: ").strip().lower()
        print("User question: " ,user_input)

        # Handle voice input
        if user_input == "voice":
            spoken = recognize_speech()
            if spoken in speech_failures:
                # Recognition failed: report it and ask again.
                print(f"🤖 Bot: {spoken}")
                continue
            user_input = spoken.strip().lower()
            print(f"👤 You (via voice): {user_input}")

        # Nothing to answer; prompt again.
        if not user_input:
            continue

        # Check for exit words
        if user_input in exit_words:
            print("🤖 Bot: You're welcome! Have a great day! 😊")
            speak_response("You're welcome! Have a great day!")
            break

        # Get response
        bot_response = get_insurance_response(user_input)

        display(Markdown(f"🤖 Bot: {bot_response}"))
        speak_response(bot_response)

        # Ask if further help is needed
        follow_up = input("🤖 Bot: Do you need help with anything else? (yes/no) ").strip().lower()
        if follow_up == "no" or follow_up in exit_words:
            print("🤖 Bot: Have a great day! 😊")
            speak_response("Have a great day!")
            break

# ✅ Run the Chatbot
# Starts the interactive loop only when this code runs as the main module;
# the cell output below shows that this is the case when the cell is executed.
if __name__ == "__main__":
    start_chatbot()


💬 Insurance Chatbot Started! Type 'voice' for voice input. Say 'thank you', 'ok', or 'exit' to stop.
User question:  hey there i am sejal , i want to know about various types of insurances


🤖 Bot: [From Memory] Hello Sejal! I'm here to help you understand the **types of insurance available in India** and their key features. Let’s break them down:

---

### **1. Life Insurance**  
- **Purpose**: Financial protection for your family in case of your untimely death.  
- **Types**:  
  - **Term Insurance**: Pure risk cover (no maturity benefit). *E.g., HDFC Life Click 2 Protect*.  
  - **Endowment Plans**: Savings + life cover. *E.g., LIC Money Back Policy*.  
  - **ULIPs (Unit-Linked Insurance Plans)**: Combines insurance + investment.  
  - **Whole Life Insurance**: Coverage until death.  
- **Tax Benefit**: Premiums qualify under **Section 80C**.

---

### **2. Health Insurance**  
- **Purpose**: Covers medical expenses due to illness/accidents.  
- **Types**:  
  - **Individual/ Family Floater**: Covers self or family under one policy.  
  - **Critical Illness Insurance**: Lump-sum payout for diseases like cancer.  
  - **Senior Citizen Plans**: Tailored for ages 60+.  
  - **Government Schemes**: *Ayushman Bharat (PM-JAY)* for low-income families.  
- **Tax Benefit**: Premiums up to ₹25,000–₹75,000 under **Section 80D**.

---

### **3. Motor Insurance**  
- **Mandatory in India** (under the *Motor Vehicles Act*).  
- **Types**:  
  - **Third-Party Insurance**: Covers damage to others (vehicles/injuries).  
  - **Comprehensive Policy**: Covers own vehicle + third-party liabilities.  

---

### **4. Travel Insurance**  
- Covers trip cancellations, medical emergencies abroad, lost baggage, etc.  
- **Key for international trips** (some countries mandate it for visas).  

---

### **5. Home Insurance**  
- Protects against damage to property (fire, theft, natural disasters).  
- **Types**: Structure coverage (building) + contents (belongings).  

---

### **6. Liability Insurance**  
- **Professional Liability**: For doctors, lawyers (malpractice claims).  
- **Public Liability**: Covers third-party injuries/damage (e.g., business owners).  

---

### **Other Specialized Insurances**  
- **Crop Insurance**: For farmers (e.g., *PM Fasal Bima Yojana*).  
- **Pet Insurance**: Covers veterinary expenses.  
- **Gadget Insurance**: For phones, laptops.  

---

### **Key Regulator**:  
- **IRDAI** (Insurance Regulatory and Development Authority of India) oversees all policies.  

Need details on **claims, policy transfers, or switching insurers**? Feel free to ask! 😊

User question:  explain health insurance senior citizen plans in detail


🤖 Bot: **Health Insurance Senior Citizen Plans in India (Detailed Explanation)**  

Health insurance for senior citizens (typically aged 60+) in India is tailored to address the unique medical needs of older adults. Here’s a breakdown of key aspects:  

---

### **1. Key Features of Senior Citizen Health Plans**  
- **Higher Sum Insured**: Coverage ranges from ₹1 lakh to ₹25+ lakhs to manage age-related health risks.  
- **Comprehensive Coverage**:  
  - Hospitalization (room rent, ICU, surgery).  
  - Pre- & post-hospitalization expenses (up to 60 days).  
  - Pre-existing diseases (PEDs) covered after a **waiting period** (1–4 years).  
  - Daycare procedures, domiciliary treatment, and AYUSH treatments.  
- **Lifelong Renewability**: Mandated by IRDAI, ensuring coverage beyond 65+ years.  
- **Higher Premiums**: Reflects increased health risks but offers tax benefits under **Section 80D** (up to ₹50,000 for seniors).  
- **Co-payment Clause**: Seniors may pay 10–30% of claims (varies by insurer).  

---

### **2. Types of Senior Citizen Plans**  
- **Standalone Senior Plans**: Exclusive policies for ages 60–70 (e.g., Star Health Senior Citizen Red Carpet).  
- **Family Floater with Senior Inclusion**: Covers seniors under a family plan (age limits apply).  
- **Top-up/Super Top-up Plans**: Enhances coverage beyond base policy limits.  
- **Critical Illness Riders**: Add-ons for diseases like cancer or heart ailments.  

---

### **3. Government Schemes for Seniors**  
- **Ayushman Bharat (PM-JAY)**: Covers ₹5 lakh/year for low-income families (includes seniors).  
- **Niramaya Health Insurance**: For disabled/autistic individuals (includes seniors).  
- **Varishta Mediclaim (by LIC)**: Covers hospitalization for ages 60–80.  

---

### **4. Considerations When Choosing a Plan**  
- **Coverage Adequacy**: Ensure the sum insured matches potential medical costs.  
- **Waiting Periods**: Check terms for PEDs, specific illnesses, and maternity.  
- **Network Hospitals**: Opt for insurers with wide hospital networks.  
- **Claim Settlement Ratio (CSR)**: Prioritize insurers with CSR >90% (e.g., Oriental, Star Health).  

---

### **Answers to Related Queries**  
1. **Best Supplemental Health Insurance**:  
   - **Super Top-up Plans** (covers costs beyond base policy).  
   - **Critical Illness Riders** (lump-sum payout for specific diseases).  

2. **Supplemental Insurance for Elders**:  
   Often called **"senior-specific riders"** or **"top-up plans"** in India.  

3. **Grandfathered Health Plans**:  
   Rarely used in India. Refers to older policies exempt from new regulations. Most insurers now comply with IRDAI’s standardized guidelines.  

--- 

**Final Tip**: Compare policies on portals like Policybazaar, check exclusions, and prioritize lifelong renewability. Always disclose medical history to avoid claim rejection.

User question:  do they cover age above 75


🤖 Bot: In India, insurance coverage for individuals above 75 years of age is possible, but it depends on the type of policy and insurer. Here are the key details:

### 1. **Senior Citizen-Specific Policies**  
   Many insurers offer **dedicated health insurance plans for seniors** (e.g., Star Health's Senior Citizen Red Carpet, National Insurance's Varistha Mediclaim). These typically:  
   - Cover individuals up to **90 years** (renewable for life in some cases).  
   - Require **medical check-ups** for enrollment.  
   - Include features like **pre-existing disease coverage** (after a 2–4-year waiting period).  

### 2. **IRDAI Guidelines**  
   Insurers cannot deny coverage solely based on age (as per regulations). However, they may:  
   - Charge **higher premiums** due to age-related risks.  
   - Impose **co-payment clauses** (e.g., 20–30% of claims shared by the insured).  
   - Exclude certain treatments or limit the **sum insured**.  

### 3. **Critical Factors**  
   - **Pre-existing conditions**: Declare them upfront; coverage may start after a waiting period.  
   - **Renewability**: Ensure the policy is lifelong renewable.  
   - **Group/Floter Plans**: Some family floaters allow senior members if the primary insured is younger.  

### 4. **Suggestions**  
   - Compare policies via platforms like **Policybazaar** or **Coverfox**.  
   - Consult insurers directly (e.g., Star Health, HDFC Ergo) for tailored options.  

While options exist, terms are stricter and costlier. Always read policy documents carefully! 🩺💡

🤖 Bot: Have a great day! 😊
