In [None]:
%pip install -U langchain langchain-community langchain-core langchain-groq


In [None]:
# Debug aid: list the packages installed in the kernel's environment.
%pip list


In [None]:
# Debug aid: show the installed metadata of langchain-text-splitters.
%pip show langchain-text-splitters


In [None]:
import os
from pypdf import PdfReader

from langchain_text_splitters import RecursiveCharacterTextSplitter
# HuggingFaceEmbeddings and FAISS are integration classes that ship in
# langchain-community. langchain_core.vectorstores only provides the base
# vector-store interfaces, so importing FAISS from it raises ImportError.
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS




In [None]:
# Extract the text of every PDF in the documents folder into one string.

# Build the path from its parts so it resolves on any OS, not only Windows.
pdf_folder = os.path.join("..", "Data", "insurance_documents")

page_texts = []

# Sort the listing: os.listdir returns entries in arbitrary order, which would
# make the corpus (and every chunk built from it) differ between runs.
for file_name in sorted(os.listdir(pdf_folder)):
    if not file_name.lower().endswith(".pdf"):
        continue

    file_path = os.path.join(pdf_folder, file_name)
    print(f"Loading: {file_name}")

    reader = PdfReader(file_path)
    for page in reader.pages:
        text = page.extract_text()
        # Skip pages for which extract_text() returned nothing.
        if text:
            page_texts.append(text + "\n")

# Join once at the end instead of growing a string page by page.
all_text = "".join(page_texts)

print(f"Length of the the text: {len(all_text)}")
print(f"First 1000 characters:\n{all_text[:1000]}")


In [None]:
# Extract tabular data:
# install camelot (with its "cv" extra), which the next cells use to read
# tables out of the PDFs.

%pip install camelot-py[cv]


In [None]:
import camelot

In [None]:
# Collect every table camelot finds in the PDFs, rendered as plain text.
table_blocks = []

pdf_names = [name for name in os.listdir(pdf_folder) if name.lower().endswith(".pdf")]

for pdf_name in pdf_names:
    pdf_path = os.path.join(pdf_folder, pdf_name)
    print(f"Extracting tables from: {pdf_name}")

    try:
        found_tables = camelot.read_pdf(pdf_path, pages="all", flavor="lattice")

        for found in found_tables:
            # Render the table's `df` as text without the index column and
            # leave a blank line after each table.
            table_blocks.append(found.df.to_string(index=False) + "\n\n")

    except Exception as e:
        # Best effort: report the problem and continue with the next PDF.
        print(f"  No tables or error: {e}")

all_tables_text = "".join(table_blocks)


In [None]:
# Size check: number of characters of table text collected above.
print(len(all_tables_text))

In [None]:
# Preview the first 5000 characters of the raw table text.
print(all_tables_text[:5000])


In [None]:
def clean_table_text(text):
    """Strip every line of `text` and drop the ones that carry no content.

    A line is dropped when it is empty after stripping or consists only of
    digits (e.g. a lone column/row number). Whitespace inside a kept line is
    left untouched.

    Returns:
        The kept lines joined with "\\n".
    """
    stripped_rows = (raw.strip() for raw in text.splitlines())
    kept_rows = [row for row in stripped_rows if row and not row.isdigit()]
    return "\n".join(kept_rows)

# Clean the raw camelot output, then report its size and preview it.
cleaned_tables_text = clean_table_text(all_tables_text)

print("Cleaned table text length:", len(cleaned_tables_text))
print(cleaned_tables_text[:5000])


In [None]:
import re

def table_to_sentences_strong(
    table_text,
    end_keywords=("Insurance", "SESSION", "covered"),
    min_words=7,
):
    """Merge the broken lines of extracted table text into one fact per line.

    Each line is stripped, column-number header rows are dropped, runs of
    whitespace are collapsed to a single space, and consecutive lines are
    glued together until a line that looks like the end of a fact is reached.

    Args:
        table_text: Table text with one row fragment per line.
        end_keywords: Substrings that mark a line as the end of a fact. A line
            ending in "." always ends a fact, regardless of this setting.
        min_words: Minimum number of whitespace-separated words a merged fact
            must contain to be kept; shorter facts are discarded.

    Returns:
        The kept facts joined with "\\n" ("" when nothing qualifies).
    """
    # Header rows such as "0 1 2 3 4": three or more bare numbers.
    header_row = re.compile(r"(?:\d+\s+){2,}\d+")

    sentences = []
    pending = []  # fragments of the fact currently being assembled

    def flush():
        # Emit the pending fragments as one fact if it is long enough; the
        # fragments are discarded either way.
        fact = " ".join(pending)
        if len(fact.split()) >= min_words:
            sentences.append(fact.strip())
        pending.clear()

    for line in table_text.splitlines():
        line = line.strip()

        # skip empty lines and column-number headers
        if not line or header_row.fullmatch(line):
            continue

        # normalize spacing
        line = re.sub(r"\s{2,}", " ", line)

        # merge broken lines
        pending.append(line)

        # heuristic: end a fact when the line looks complete
        if line.endswith(".") or any(keyword in line for keyword in end_keywords):
            flush()

    # catch whatever is left after the last line
    if pending:
        flush()

    return "\n".join(sentences)


In [None]:
# Turn the cleaned table text into merged, sentence-like lines and preview them.
table_sentences = table_to_sentences_strong(cleaned_tables_text)

print("Table sentences length:", len(table_sentences))
print(table_sentences[:1500])


In [None]:
def final_text_cleanup(text):
    """Normalize `text` for the corpus.

    Literal backslash-n sequences (the two characters ``\\`` and ``n``) become
    a space, every run of two or more whitespace characters becomes a single
    space, and leading/trailing whitespace is removed. A lone whitespace
    character, including a single real newline, is kept as is.
    """
    import re

    # Splitting on the literal "\n" marker and re-joining with spaces has the
    # same effect as replacing each marker with a space.
    without_markers = " ".join(text.split("\\n"))
    collapsed = re.sub(r"\s{2,}", " ", without_markers)
    return collapsed.strip()


In [None]:
# Apply the final cleanup to the table sentences and preview the result.
table_sentences_clean = final_text_cleanup(table_sentences)

print("Clean table sentences length:", len(table_sentences_clean))
print(table_sentences_clean[:1000])


In [None]:
# Prints the same length the previous cell already reported.
print(len(table_sentences_clean))

In [None]:
# Merge the cleaned PDF page text and the cleaned table sentences into one
# corpus string, separated by a blank line.

final_corpus = final_text_cleanup(all_text) + "\n\n" + table_sentences_clean



In [None]:
# Size check: total number of characters in the merged corpus.
print(len(final_corpus))

In [None]:

# Split the corpus into overlapping chunks for embedding.
# chunk_size / chunk_overlap are measured by the splitter's length function
# (characters by default, per the LangChain docs).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,
    chunk_overlap=150
)

chunks = text_splitter.split_text(final_corpus)

# Report how many chunks were produced and show the first one.
print("Total chunks:", len(chunks))
print("\nSample chunk:\n")
print(chunks[0])


In [None]:
# Embedding model used to turn text into vectors
# (the sentence-transformers "all-MiniLM-L6-v2" model).

embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)


In [None]:
# Build a FAISS vector store from the chunks, using `embeddings` to embed them.

vectorstore = FAISS.from_texts(chunks, embeddings)


In [None]:
# Save the FAISS store to the folder "faiss_index" (relative to the working directory).
vectorstore.save_local("faiss_index")


In [None]:
# Reload the saved FAISS index so retrieval can run without re-embedding.

# Read the index from the folder the save step wrote ("faiss_index", relative
# to the working directory) rather than a machine-specific absolute path.
# Set the FAISS_INDEX_DIR environment variable to load an index stored elsewhere.
faiss_index_dir = os.getenv("FAISS_INDEX_DIR", "faiss_index")

# Use the same embedding model the index was built with.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

# NOTE: allow_dangerous_deserialization=True permits unpickling the stored
# index data; only load index folders that you created yourself.
vectorstore = FAISS.load_local(
    faiss_index_dir,
    embeddings,
    allow_dangerous_deserialization=True
)

# quick test
vectorstore.similarity_search("What is insurance?", k=2)


In [None]:
# Retriever over the vector store using MMR (maximal marginal relevance) search:
#   k           - number of documents returned
#   fetch_k     - number of candidates fetched before MMR re-ranking
#   lambda_mult - relevance/diversity trade-off (per LangChain docs, values
#                 closer to 1 favour relevance; confirm for the installed version)
retriever = vectorstore.as_retriever(
    search_type="mmr",
    search_kwargs={
        "k": 3,
        "fetch_k": 10,
        "lambda_mult": 0.7
    }
)



In [None]:
# Smoke test: run one query through the retriever and show the top document.
# NOTE(review): docs[0] raises IndexError if nothing is retrieved.
docs = retriever.invoke("what is medical insurace?")
print(docs[0].page_content)


In [None]:
from dotenv import load_dotenv

# Load variables from a .env file into the process environment.
load_dotenv()   

# Read the Groq key from the environment instead of hardcoding it.
GROQ_API_KEY = os.getenv("GROQ_API_KEY")

# Fail fast with a clear message when the key is missing or empty.
if not GROQ_API_KEY:
    raise ValueError("GROQ_API_KEY not found. Check your .env file")

In [None]:
from langchain_groq import ChatGroq

# Groq-hosted chat model used to generate the answers.
llm = ChatGroq(
    api_key=GROQ_API_KEY,
    model="llama-3.1-8b-instant"  # alternative: "llama-3.3-70b-versatile" for better reasoning
)



In [None]:
from langchain_core.runnables import RunnableWithMessageHistory
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.messages import HumanMessage
from langchain_core.chat_history import InMemoryChatMessageHistory




In [None]:
from langchain_core.chat_history import InMemoryChatMessageHistory

# In-memory map: session_id -> chat history object (lost when the kernel restarts).
SESSION_STORE = {}
# Number of most recent messages kept when a session's history is trimmed.
MAX_MESSAGES = 6   

def get_session_history(session_id: str):
    """Return the chat history stored for `session_id`, creating it if needed.

    Before returning, a history holding more than MAX_MESSAGES messages is cut
    down to its MAX_MESSAGES most recent ones, which bounds how much past
    conversation is carried forward.
    """
    try:
        session_history = SESSION_STORE[session_id]
    except KeyError:
        # First time this session is seen: start it with an empty history.
        session_history = InMemoryChatMessageHistory()
        SESSION_STORE[session_id] = session_history

    stored = session_history.messages
    if len(stored) > MAX_MESSAGES:
        session_history.messages = stored[-MAX_MESSAGES:]

    return session_history


In [None]:
# Prompt layout: system instructions (with the retrieved context), then the
# prior chat messages, then the user's current question.
# The persona line is stated once; the template previously repeated it with
# a grammatical error ("an Friendly").
prompt = ChatPromptTemplate.from_messages([
    (
        "system",
        """
You are a friendly insurance assistant.

STRICT RULES:
- Answer ONLY using the information provided below.
- Do NOT use outside knowledge.
- Do NOT guess or assume.
- If the answer is NOT available in the provided information, reply exactly:
  "Sorry, I don't know that. Is there any other insurance-related question you would like to talk about?"
- Keep the answer polite, clear, and well-polished.
- The answer must be within three lines.
- Do NOT mention documents, context, sources, or internal information.



CONTEXT:
{context}
"""
    ),
    ("placeholder", "{chat_history}"),
    ("human", "{question}")
])


In [None]:
def _format_docs(docs):
    """Join the text of the retrieved documents into one context string."""
    return "\n\n".join(doc.page_content for doc in docs)


# Build the prompt inputs from the incoming {"question": ...} payload, then
# pipe them through the prompt into the LLM.
rag_chain = (
    {
        # Pass only the documents' text to the prompt; interpolating the raw
        # list would put Document reprs (metadata included) into {context}.
        "context": lambda x: _format_docs(retriever.invoke(x["question"])),
        "question": lambda x: x["question"],
        # Defaults to no history when the key is absent from the input.
        "chat_history": lambda x: x.get("chat_history", [])
    }
    | prompt
    | llm
)



In [None]:

# Wrap the RAG chain with per-session message history: the user input is read
# from the "question" key and the history is supplied under "chat_history".
chat_chain = RunnableWithMessageHistory(
    rag_chain,
    get_session_history,           # session-aware memory
    input_messages_key="question",
    history_messages_key="chat_history",
)


In [None]:
import uuid

def get_or_create_session_id(session_id: str | None):
    if session_id and session_id.strip():
        return session_id          
    return str(uuid.uuid4())       

In [None]:
# No session id supplied by a caller; a new one is generated in the next step.
incoming_session_id = None  


In [None]:
# Session id used for every chat call below.
session_id = get_or_create_session_id(incoming_session_id)


In [71]:
def safe_chat_invoke(chat_chain, question, session_id):
    """Invoke the chat chain and return a result dict instead of raising.

    Args:
        chat_chain: Object whose ``invoke(inputs, config=...)`` returns a
            response exposing a ``content`` attribute.
        question: User question, sent under the "question" key.
        session_id: Conversation id, sent under ``config["configurable"]``.

    Returns:
        ``{"answer": <content>, "error": None}`` on success, otherwise
        ``{"answer": None, "error": <user-facing message>}``. The message is
        picked by looking for known phrases in the lower-cased exception text.
    """
    import logging

    # (phrases searched in the lower-cased exception text, user-facing message)
    known_failures = (
        (
            ("rate limit",),
            "I'm temporarily busy due to high usage. Please try again shortly.",
        ),
        (
            ("api key", "authentication"),
            "There is a configuration issue. Please try again later.",
        ),
    )

    try:
        response = chat_chain.invoke(
            {"question": question},
            config={"configurable": {"session_id": session_id}}
        )
        return {"answer": response.content, "error": None}

    except Exception as e:
        # The caller only receives a generic message, so record the real cause
        # (with traceback) in the log; otherwise failures cannot be diagnosed.
        logging.getLogger(__name__).exception("Chat invocation failed")

        error_msg = str(e).lower()
        for phrases, user_message in known_failures:
            if any(phrase in error_msg for phrase in phrases):
                return {"answer": None, "error": user_message}

        return {
            "answer": None,
            "error": "Something went wrong. Please try again."
        }


In [None]:
# Text-only smoke test of the chat chain; prints the {"answer", "error"} dict.
result = safe_chat_invoke(
    chat_chain,
    "What is health insurance?",
    session_id
)

print(result)


In [None]:
# Install the packages imported by the voice-input cells below.
%pip install openai-whisper sounddevice scipy


In [None]:
import os
import sounddevice as sd
from scipy.io.wavfile import write
import whisper

# File, in the current working directory, that recordings are written to and
# transcribed from.
AUDIO_PATH = os.path.join(os.getcwd(), "input.wav")

def record_audio(duration=5, sample_rate=16000):
    """Record from the input device and write the audio to AUDIO_PATH.

    Args:
        duration: Recording length in seconds.
        sample_rate: Samples per second used for recording and for the file.

    Records a single channel of 16-bit integer samples, blocks until the
    recording has finished, then writes it out as a WAV file.
    """
    print("Speak now...")
    frame_count = int(duration * sample_rate)
    recording = sd.rec(frame_count, samplerate=sample_rate, channels=1, dtype="int16")
    sd.wait()  # block until the recording is complete
    write(AUDIO_PATH, sample_rate, recording)
    print("Recording finished")

# Load the Whisper "base" model once; audio_to_text() reuses it.
model = whisper.load_model("base")

def audio_to_text():
    """Transcribe the recording at AUDIO_PATH and return the text, stripped."""
    transcription = model.transcribe(AUDIO_PATH)
    recognized = transcription["text"]
    return recognized.strip()

# Run end-to-end: record from the microphone, transcribe, and show the text.
record_audio()
text = audio_to_text()
print("Recognized text:", text)


In [None]:
# Voice turn: record a question, transcribe it, and ask the chat chain.
record_audio()
text = audio_to_text()
print("Recognized text:", text)

# safe_chat_invoke returns an {"answer", "error"} dict.
result = safe_chat_invoke(chat_chain, text, session_id)

# Show the error message when one is set, otherwise the answer.
failure = result["error"]
if not failure:
    print("Answer:", result["answer"])
else:
    print("Error:", failure)

    

In [None]:
# Another voice turn in the same session: record, transcribe, ask.
record_audio()
text = audio_to_text()
print("Recognized text:", text)

# safe_chat_invoke returns an {"answer", "error"} dict.
result = safe_chat_invoke(chat_chain, text, session_id)

# Show the error message when one is set, otherwise the answer.
failure = result["error"]
if not failure:
    print("Answer:", result["answer"])
else:
    print("Error:", failure)


In [60]:

from src.rag.Vectorstore import VectorStoreManager

# 1. Load the vector store (class method)
# NOTE(review): hardcoded absolute local path; this only resolves on the
# machine where that folder exists.
vector_stores = VectorStoreManager.load_vectorstore(
    r"D:\Virtual_Insurnace_Ai_Agent\faiss_insurance_index\faiss_index"
)

print(" Vector store loaded")

# 2. Run a similarity search
query = "What does health insurance cover?"
results = vector_stores.similarity_search(query, k=3)

print(f"\n Query: {query}\n")

# 3. Print a short preview and the metadata of each hit
for i, doc in enumerate(results, start=1):
    print(f"Result {i}:")
    print(doc.page_content[:300])  # first 300 chars
    print("Metadata:", doc.metadata)
    print("-" * 50)



 Vector store loaded

 Query: What does health insurance cover?

Result 1:
22. Health Insurance
The term ‘Health Insurance’ relates to a type of insurance that essentially covers your medical expenses. A health insurance policy like other policies is a contract between an insurer and an individual / group in which the insurer agrees to provide specified health insurance co
Metadata: {}
--------------------------------------------------
Result 2:
Q.What kinds of Health Insurance plans are available?
Metadata: {}
--------------------------------------------------
Result 3:
2. Health Insurance The term ‘Health Insurance’ relates to a type of insurance that essentially covers your medical expenses. A health insurance policy like other policies is a contract between an insurer and an individual / group in which the insurer agrees to provide specified health insurance cov
Metadata: {}
--------------------------------------------------
