# Step 1: Decode the InsuranceQA Dataset

In [1]:
import json
import gzip

# Function to load vocabulary from a file
def load_vocabulary(vocab_file_path):
    vocab_dict = {}
    with open(vocab_file_path, "r", encoding="utf-8") as file:
        for line in file:
            parts = line.strip().split("\t", 1)  # Split only on the first tab
            if len(parts) == 2:
                index, word = parts
                vocab_dict[index] = word  # Store mapping
    return vocab_dict

# Function to decode category and question only
def decode_questions(encoded_file_path, vocab_dict1, vocab_dict2):
    decoded_questions = []
    
    try:
        with gzip.open(encoded_file_path, 'rt', encoding='utf-8') as file:
            for line in file:
                parts = line.strip().split("\t")
                
                if len(parts) >= 2:  # Ensure the format is correct
                    question_category = parts[0]  # The first part is the category
                    encoded_question = parts[1]  # The second part is the encoded question

                    # Decode the question using both vocab dictionaries
                    decoded_question = " ".join([vocab_dict1.get(token, vocab_dict2.get(token, "[UNKNOWN]")) for token in encoded_question.split()])

                    # Store the decoded category and question
                    decoded_questions.append({
                        'category': question_category,
                        'question': decoded_question
                    })
    except Exception as e:
        print(f"Error decoding file {encoded_file_path}: {e}")
    
    return decoded_questions

# File paths
vocab_file1_path = r"D:\NLPInsuranceProject\NLPINSURANCE-FINTECHPROJ\dataset insurance qa\vocabulary"
vocab_file2_path = r"D:\NLPInsuranceProject\NLPINSURANCE-FINTECHPROJ\vocabulary.txt"
train_file_path = r"D:\NLPInsuranceProject\NLPINSURANCE-FINTECHPROJ\dataset insurance qa\InsuranceQA.question.anslabel.raw.1500.pool.solr.train.encoded.gz"
test1_file_path = r"D:\NLPInsuranceProject\NLPINSURANCE-FINTECHPROJ\dataset insurance qa\InsuranceQA.question.anslabel.raw.1500.pool.solr.test.encoded.gz"
test2_file_path = r"D:\NLPInsuranceProject\NLPINSURANCE-FINTECHPROJ\dataset insurance qa\InsuranceQA.question.anslabel.raw.1000.pool.solr.test.encoded.gz"

# Load both vocabulary files
vocab_dict1 = load_vocabulary(vocab_file1_path)
vocab_dict2 = load_vocabulary(vocab_file2_path)

# Decode only questions
train_questions = decode_questions(train_file_path, vocab_dict1, vocab_dict2)
test1_questions = decode_questions(test1_file_path, vocab_dict1, vocab_dict2)
test2_questions = decode_questions(test2_file_path, vocab_dict1, vocab_dict2)

# Combine all decoded questions
all_questions_decoded = train_questions + test1_questions + test2_questions

# Save decoded data as a JSON file
output_file_path = r"D:\NLPInsuranceProject\decoded_questions.json"
with open(output_file_path, 'w', encoding='utf-8') as f_out:
    json.dump(all_questions_decoded, f_out, indent=4, ensure_ascii=False)

print(f"✅ Decoding complete! Results saved to {output_file_path}")

# Display a few decoded examples
print(json.dumps(all_questions_decoded[:3], indent=4, ensure_ascii=False))


✅ Decoding complete! Results saved to D:\NLPInsuranceProject\decoded_questions.json
[
    {
        "category": "disability-insurance",
        "question": "Is Disability Insurance Required By Law?"
    },
    {
        "category": "life-insurance",
        "question": "Can Creditors Take Life Insurance After Death?"
    },
    {
        "category": "renters-insurance",
        "question": "Does Travelers Insurance Have Renters Insurance?"
    }
]


In [2]:
pip install SpeechRecognition

Note: you may need to restart the kernel to use updated packages.


# HE WALA GHE

In [3]:
!pip install SpeechRecognition pyttsx3



In [16]:
import sqlite3
import speech_recognition as sr
import pyttsx3
import numpy as np
from openai import OpenAI
from sentence_transformers import SentenceTransformer
from IPython.display import display, Markdown

# ✅ Set Up OpenRouter API
API_KEY = "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"
client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=API_KEY,
    default_headers={ 
        "Authorization": f"Bearer {API_KEY}",  
        "X-Title": "Insurance Chatbot"
    }
)

In [5]:
import json
from sentence_transformers import InputExample

# ✅ Load the JSON File
with open("decoded_questions.json", "r", encoding="utf-8") as f:
    questions_data = json.load(f)

# ✅ Extract question pairs (create synthetic paraphrases for now)
examples = []
questions = [entry["question"] for entry in questions_data]

# We'll use simple random pairs (not ideal, but okay for initial fine-tune)
for i in range(len(questions) - 1):
    examples.append(InputExample(texts=[questions[i], questions[i + 1]]))

print(f"Total training pairs: {len(examples)}")


Total training pairs: 16888


In [6]:
import json
from sentence_transformers import InputExample
from collections import defaultdict
import random

# ✅ Load decoded questions
with open("decoded_questions.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# ✅ Group questions by category
category_to_questions = defaultdict(list)
for item in data:
    category = item["category"]
    question = item["question"]
    category_to_questions[category].append(question)

# ✅ Create pairs of similar questions from the same category
examples = []
for category, questions in category_to_questions.items():
    if len(questions) < 2:
        continue
    random.shuffle(questions)
    for i in range(0, len(questions) - 1, 2):
        q1, q2 = questions[i], questions[i + 1]
        examples.append(InputExample(texts=[q1, q2]))

print(f"✅ Prepared {len(examples)} training examples")


✅ Prepared 8441 training examples


In [7]:
sentences1 = [
    "What is term insurance?",
    "How can I cancel my health insurance?",
    "Does motor insurance cover theft?"
]

sentences2 = [
    "Explain term insurance coverage.",
    "What’s the process for cancelling a policy?",
    "Is vehicle theft included in insurance?"
]

scores = [0.95, 0.8, 0.9]  # Based on your judgment or annotations


In [13]:
!pip install transformers



In [11]:
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, losses

from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator


# ✅ Load base model
model = SentenceTransformer("all-MiniLM-L6-v2")
#model = model.to('cuda')

# ✅ Prepare DataLoader
train_dataloader = DataLoader(examples, shuffle=True, batch_size=64)

# ✅ Choose Loss
train_loss = losses.MultipleNegativesRankingLoss(model)

evaluator = EmbeddingSimilarityEvaluator(sentences1, sentences2, scores)
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=5,
    evaluator=evaluator,
    evaluation_steps=500,
    warmup_steps=100,
    show_progress_bar=True
)


ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.26.0`: Please run `pip install transformers[torch]` or `pip install 'accelerate>=0.26.0'`

In [None]:
model_save_path = "custom_insurance_encoder"
model.save(model_save_path)
print(f"✅ Model saved at: {model_save_path}")


✅ Model saved at: custom_insurance_encoder


In [None]:
sentences = [
    "What does liability insurance cover?",
    "Tell me about car insurance coverage.",
    "When does a policy lapse?"
]

embeddings = model.encode(sentences, convert_to_tensor=True)

# Check cosine similarity (optional)
from sentence_transformers.util import cos_sim
print("Similarity between 1st and 2nd:", cos_sim(embeddings[0], embeddings[1]))


Similarity between 1st and 2nd: tensor([[0.8101]])


# HE WALA GHE

In [17]:
from IPython.display import display, Markdown


# ✅ Load Fine-Tuned Sentence Transformer
model = SentenceTransformer(r"D:\NLPInsuranceProject\NLPINSURANCE-FINTECHPROJ\custom_insurance_encoder")

# ✅ Load Insurance Questions
import json
with open(r"D:\NLPInsuranceProject\NLPINSURANCE-FINTECHPROJ\decoded_questions.json", "r", encoding="utf-8") as file:
    insurance_data = json.load(file)

questions = [entry["question"] for entry in insurance_data]
question_embeddings = model.encode(questions)

# ✅ Set Up SQLite for Memory
conn = sqlite3.connect("chatbot_memory.db", check_same_thread=False)
cursor = conn.cursor()
cursor.execute("CREATE TABLE IF NOT EXISTS chatbot_memory (query TEXT, response TEXT)")
conn.commit()

# ✅ Function to Search Memory
def search_memory(user_query):
    cursor.execute("SELECT response FROM chatbot_memory WHERE query=?", (user_query,))
    result = cursor.fetchone()
    return result[0] if result else None

# ✅ Function to Store in Memory
def store_memory(user_query, response):
    cursor.execute("INSERT INTO chatbot_memory (query, response) VALUES (?, ?)", (user_query, response))
    conn.commit()

# ✅ Function for Semantic Search
def find_relevant_questions(user_query):
    query_embedding = model.encode([user_query])
    similarities = np.dot(query_embedding, question_embeddings.T)[0]
    top_indices = np.argsort(similarities)[-3:][::-1]
    relevant_questions = [questions[i] for i in top_indices]
    return relevant_questions

# ✅ Function to Get Insurance Answer
def get_insurance_response(user_query):
    # Step 1: Check Memory
    memory_response = search_memory(user_query)
    if memory_response:
        return f"[From Memory] {memory_response}"

    # Step 2: Find Relevant Questions
    relevant_questions = find_relevant_questions(user_query)

    # Step 3: Call OpenRouter (DeepSeek R1)
    try:
        response = client.chat.completions.create(
            model="deepseek/deepseek-r1:free",
            messages=[
                {"role": "system", "content": "You are an insurance chatbot based from India who follows rules and regulations related to insurance for India. Answer insurance-related questions only. If non-insurance related questions asked please politely deny saying I am not made for this domain."},
                {"role": "user", "content": f"User Query: {user_query}\nRelevant Questions: {relevant_questions}"}
            ],
            temperature=0.3,
        )

        # Validate response before accessing
        if response and hasattr(response, "choices") and len(response.choices) > 0:
            chatbot_response = response.choices[0].message.content
            store_memory(user_query, chatbot_response)
            return chatbot_response
        else:
            return "I'm sorry, I couldn't generate a response at the moment."

    except Exception as e:
        return f"Error: {str(e)}"


# ✅ Voice Input & Output Functions
def recognize_speech():
    recognizer = sr.Recognizer()
    with sr.Microphone() as source:
        print("🎤 Listening...")
        recognizer.adjust_for_ambient_noise(source)
        audio = recognizer.listen(source)
    try:
        return recognizer.recognize_google(audio)
    except sr.UnknownValueError:
        return "Sorry, I couldn't understand."
    except sr.RequestError:
        return "Speech recognition service is unavailable."

def speak_response(response):
    engine = pyttsx3.init()
    engine.say(response)
    
    try:
        engine.runAndWait()  # Normal execution
    except RuntimeError:
        engine.endLoop()  # Stop the current event loop
        engine.runAndWait() 

# ✅ Main Chat Loop (Conversation Mode)
def start_chatbot():
    print("💬 Insurance Chatbot Started! Type 'voice' for voice input. Say 'thank you', 'ok', or 'exit' to stop.")
    
    while True:
        user_input = input("👤 You: ").strip().lower()
        print("User question: " ,user_input)

        # Handle voice input
        if user_input == "voice":
            user_input = recognize_speech()
            print(f"👤 You (via voice): {user_input}")

        # Check for exit words
        if user_input in ["thank you", "thanks", "ok", "exit", "goodbye", "bye"]:
            print("🤖 Bot: You're welcome! Have a great day! 😊")
            speak_response("You're welcome! Have a great day!")
            break

        # Get response
        bot_response = get_insurance_response(user_input)
    
        display(Markdown(f"🤖 Bot: {bot_response}"))
        speak_response(bot_response)

        # Ask if further help is needed
        follow_up = input("🤖 Bot: Do you need help with anything else? (yes/no) ").strip().lower()
        if follow_up in ["no", "thank you", "thanks", "ok", "exit", "goodbye", "bye"]:
            print("🤖 Bot: Have a great day! 😊")
            speak_response("Have a great day!")
            break

# ✅ Run the Chatbot
if __name__ == "__main__":
    start_chatbot()


💬 Insurance Chatbot Started! Type 'voice' for voice input. Say 'thank you', 'ok', or 'exit' to stop.
User question:  types of automobile insurance


🤖 Bot: Error: object of type 'NoneType' has no len()

User question:  


KeyboardInterrupt: 

### BLEU and ROUGE Score

In [None]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
import json
from tqdm import tqdm

def simple_tokenize(text):
    return text.strip().split()

# ✅ Load test queries and reference answers
with open(r"D:\NLPInsuranceProject\NLPINSURANCE-FINTECHPROJ\test_queries.json", "r", encoding="utf-8") as f:
    test_data = json.load(f)

# ✅ Scorers
scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
smoothie = SmoothingFunction().method4

# ✅ Accumulators
bleu_scores = []
rouge1_precision = []
rouge1_recall = []
rouge1_f1 = []
rougeL_f1 = []

# ✅ Evaluate each query
for item in tqdm(test_data, desc="Evaluating..."):
    query = item["query"]
    reference = item["reference"]
    
    # Call your chatbot
    prediction = get_insurance_response(query)
    
    # ✅ BLEU
    reference_tokens = [simple_tokenize(reference)]
    prediction_tokens = simple_tokenize(prediction)
    bleu = sentence_bleu(reference_tokens, prediction_tokens, smoothing_function=smoothie)
    bleu_scores.append(bleu)

    # ✅ ROUGE
    rouge = scorer.score(reference, prediction)
    rouge1 = rouge["rouge1"]
    rougeL = rouge["rougeL"]

    rouge1_precision.append(rouge1.precision)
    rouge1_recall.append(rouge1.recall)
    rouge1_f1.append(rouge1.fmeasure)
    rougeL_f1.append(rougeL.fmeasure)

# ✅ Final Results
print("\n📊 Final Evaluation Results:")
print(f"Average BLEU Score       : {sum(bleu_scores)/len(bleu_scores):.4f}")
print(f"Average ROUGE-1 Precision: {sum(rouge1_precision)/len(rouge1_precision):.4f}")
print(f"Average ROUGE-1 Recall   : {sum(rouge1_recall)/len(rouge1_recall):.4f}")
print(f"Average ROUGE-1 F1 Score : {sum(rouge1_f1)/len(rouge1_f1):.4f}")
print(f"Average ROUGE-L F1 Score : {sum(rougeL_f1)/len(rougeL_f1):.4f}")


Evaluating...: 100%|██████████| 25/25 [19:44<00:00, 47.39s/it]


📊 Final Evaluation Results:
Average BLEU Score       : 0.0169
Average ROUGE-1 Precision: 0.0827
Average ROUGE-1 Recall   : 0.8093
Average ROUGE-1 F1 Score : 0.1475
Average ROUGE-L F1 Score : 0.1117





In [None]:
!pip install evaluate

Collecting evaluate
  Using cached evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting datasets>=2.0.0 (from evaluate)
  Using cached datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.9-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from evaluate)
  Using cached xxhash-3.5.0-cp312-cp312-win_amd64.whl.metadata (13 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.17-py312-none-any.whl.metadata (7.2 kB)
Collecting pyarrow>=15.0.0 (from datasets>=2.0.0->evaluate)
  Using cached pyarrow-19.0.1-cp312-cp312-win_amd64.whl.metadata (3.4 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting multiprocess (from evaluate)
  Using cached multiprocess-0.70.16-py312-none-any.whl.metadata (7.2 kB)
Collecting fsspec>=2021.05.0 (from fsspec[http]>=2021.05.0->evaluate)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting aiohttp (from da

In [None]:
import json
import time
import pandas as pd
import openai
from rouge_score import rouge_scorer
import evaluate

# === Set OpenRouter-compatible API base ===
openai.api_key = "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"
openai.api_base = "https://openrouter.ai/api/v1"
openai.api_type = "openai"

# === Load BLEU and ROUGE evaluators ===
bleu = evaluate.load("bleu")
rouge = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

# === LLM Evaluator Prompt ===
LLM_EVAL_PROMPT = """
You are an expert insurance evaluator assistant. You will receive:
- A user question related to insurance.
- A chatbot’s response to the question.
- A reference answer provided by human experts.

Please analyze the chatbot’s response critically in comparison to the reference answer and rate it on a scale of **1 to 5**, where:

1 = Completely irrelevant or incorrect. Misses core ideas.
2 = Partially correct but contains major gaps or incorrect assumptions.
3 = Fairly relevant, misses some key points or gives generic info.
4 = Very relevant and mostly complete, with minor inaccuracies or omissions.
5 = Excellent response. Accurate, complete, and clear.

Also provide a short justification of your score (2-3 lines).

Return ONLY the result in the following JSON format:
{
  "rating": <number from 1 to 5>,
  "comment": "<justification>"
}
"""

# === Load test queries ===
with open(r"D:\\NLPInsuranceProject\\NLPINSURANCE-FINTECHPROJ\\test_queries.json", "r", encoding="utf-8") as f:
    test_data = json.load(f)

# === LLM Evaluator ===
def evaluate_with_llm(query, reference, prediction):
    user_prompt = f"""
{LLM_EVAL_PROMPT}

User Question: {query}

Chatbot Response: {prediction}

Reference Answer: {reference}
"""
    try:
        response = client.chat.completions.create(
            model="meta-llama/llama-4-maverick:free",
            messages=[
                {"role": "system", "content": "You are an expert in evaluating insurance chatbot responses."},
                {"role": "user", "content": user_prompt}
            ],
            temperature=0.3,
        )
        content = response.choices[0].message.content
        result = json.loads(content)
        return result["rating"], result["comment"]
    except Exception as e:
        return None, f"LLM Error: {str(e)}"

# === Run the Evaluation Pipeline ===
report_data = []

for i, entry in enumerate(test_data):
    query = entry["query"]
    reference = entry["reference"]

    print(f"[{i+1}/{len(test_data)}] Processing: {query}")

    # Get chatbot response using your chatbot logic
    prediction = get_insurance_response(query)

    # BLEU score
    bleu_score = bleu.compute(predictions=[prediction], references=[[reference]])["bleu"]

    # ROUGE-L score
    rouge_result = rouge.score(reference, prediction)
    rouge_l = rouge_result["rougeL"]

    # LLM rating and comment
    rating, comment = evaluate_with_llm(query, reference, prediction)
    time.sleep(3)  # Respect rate limits

    report_data.append({
        "query": query,
        "reference": reference,
        "prediction": prediction,
        "bleu": round(bleu_score, 4),
        "rougeL_precision": round(rouge_l.precision, 4),
        "rougeL_recall": round(rouge_l.recall, 4),
        "rougeL_f1": round(rouge_l.fmeasure, 4),
        "llm_rating": rating,
        "llm_comment": comment
    })

# === Save the evaluation report ===
with open("chatbot_final_evaluation.json", "w", encoding="utf-8") as f:
    json.dump(report_data, f, indent=4, ensure_ascii=False)

df = pd.DataFrame(report_data)
df.to_csv("chatbot_final_evaluation.csv", index=False)

print("✅ Evaluation complete! Results saved to JSON and CSV.")


Using the latest cached version of the module from C:\Users\Sejal Hanmante\.cache\huggingface\modules\evaluate_modules\metrics\evaluate-metric--bleu\9e0985c1200e367cce45605ce0ecb5ede079894e0f24f54613fca08eeb8aff76 (last modified on Sun Apr  6 19:35:39 2025) since it couldn't be found locally at evaluate-metric--bleu, or remotely on the Hugging Face Hub.


[1/25] Processing: What happens if I miss the health insurance premium due date?


NameError: name 'get_insurance_response' is not defined