In [3]:
# 2️⃣ Imports
import os
import json
import numpy as np
from pathlib import Path
from sentence_transformers import SentenceTransformer
import faiss
from sklearn.cluster import KMeans
from sklearn.preprocessing import normalize
from langchain_groq import ChatGroq

# 3️⃣ Paths
chunk_dir = Path("tirumala_hybrid_chunks_final")

# 4️⃣ Load final chunks (with page info if available)
chunks = []
for file in chunk_dir.glob("*.json"):
    with open(file, "r", encoding="utf-8") as f:
        data = json.load(f)
        for ch in data.get("final_chunks", []):
            chunks.append({
                "text": ch["content"],
                "source": file.name,
                "page": ch.get("page"),   # ✅ store page if available
                "type": ch.get("type", "normalized")
            })
print(f"✅ Loaded {len(chunks)} chunks from {chunk_dir}")

# 5️⃣ Embedding model
embedder = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedder.encode([c["text"] for c in chunks], show_progress_bar=True)
embeddings = normalize(embeddings, axis=1)  # ✅ normalize for cosine similarity
embeddings = np.array(embeddings, dtype="float32")

# 6️⃣ FAISS HNSW Index
dim = embeddings.shape[1]
index = faiss.IndexHNSWFlat(dim, 32)
index.hnsw.efSearch = 64
index.add(embeddings)
faiss.write_index(index, "tirumala_chunks_hnsw.index")
print("✅ FAISS index built and saved")

# 7️⃣ Retriever (returns indices)
def retrieve(query, top_k=10):
    q_emb = embedder.encode([query])
    q_emb = normalize(q_emb, axis=1).astype("float32")
    D, I = index.search(q_emb, top_k)
    return [(i, float(D[0][j])) for j, i in enumerate(I[0]) if i >= 0], q_emb

# 8️⃣ Deduplication
def dedupe(results):
    seen, unique = set(), []
    for idx, score in results:
        text = chunks[idx]["text"]
        if text not in seen:
            seen.add(text)
            unique.append((idx, score))
    return unique

# 9️⃣ Clustering-based pruning (works on indices)
def cluster_prune(results, n_clusters=5):
    if len(results) <= n_clusters:
        return results
    idxs = [r[0] for r in results]
    vecs = embeddings[idxs]
    km = KMeans(n_clusters=n_clusters, random_state=0).fit(vecs)
    best = {}
    for label, (idx, score) in zip(km.labels_, results):
        if label not in best or score > best[label][1]:  # ✅ keep higher similarity
            best[label] = (idx, score)
    return [(idx, score) for idx, score in best.values()]

# 🔟 Cosine similarity re-ranker
def rerank(results, q_emb):
    reranked = []
    for idx, _ in results:
        c_emb = embeddings[idx]
        cos_sim = float((q_emb @ c_emb.T).item())  # ✅ safe scalar
        reranked.append((idx, cos_sim))
    return sorted(reranked, key=lambda x: x[1], reverse=True)

# 1️⃣1️⃣ Groq LLM setup
llm = ChatGroq(
    api_key="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",  # ⚠️ replace with your valid key
    model="llama-3.1-8b-instant"
)

# 1️⃣2️⃣ Ask function with history
def ask(query, history=None, top_k=10):
    if history is None:
        history = []

    # 1. retrieve
    candidates, q_emb = retrieve(query, top_k=top_k)
    candidates = dedupe(candidates)
    candidates = cluster_prune(candidates, n_clusters=5)
    candidates = rerank(candidates, q_emb)

    # 2. build context
    context = "\n".join([chunks[idx]["text"] for idx, _ in candidates])

    # 3. build conversational prompt
    history_text = ""
    for turn in history:
        history_text += f"User: {turn['user']}\nAssistant: {turn['assistant']}\n"

    prompt = f"""You are a helpful historian chatbot answering strictly based 
on the Tirumala Hybrid Book.

⚠️ Important:
- Do not speak to the user as if they are a historical figure.
- Always answer in the third person (e.g., "Ramaraja was the son of Bukkaraja").
- If the context is insufficient, say "I don't know."
- Be concise, factual, and neutral.

Context:
{context}

Conversation so far:
{history_text}

User: {query}
Assistant:"""

    # 4. call LLM
    result = llm.invoke(prompt)

    return result.content, candidates

# 1️⃣3️⃣ Chat loop
if __name__ == "__main__":
    history = []
    print("🤖 Tirumala Hybrid Book Chatbot. Type 'exit' to quit.\n")

    while True:
        user_query = input("🧑 You: ")
        if user_query.lower() in ["exit", "quit", "bye"]:
            print("👋 Goodbye!")
            break

        response, refs = ask(user_query, history)

        print("\n🤖 Bot:", response)
        print("📂 Sources:", [
            f"{chunks[idx]['source']} (page {chunks[idx].get('page')})"
            for idx, _ in refs
        ], "\n")

        # save turn
        history.append({"user": user_query, "assistant": response})


  from .autonotebook import tqdm as notebook_tqdm


✅ Loaded 1634 chunks from tirumala_hybrid_chunks_final


Batches: 100%|█████████████████████████████████████████████████████████████████████████| 52/52 [02:06<00:00,  2.43s/it]


✅ FAISS index built and saved
🤖 Tirumala Hybrid Book Chatbot. Type 'exit' to quit.



🧑 You:   Architectural features of the Garbha griha



🤖 Bot: The architectural features of the Garbha griha include a square shape with an inner measurement of about 12 feet 9 inches. Its effective thickness is about seven feet two inches, which is equivalent to five hasthams.
📂 Sources: ['page_314.json (page None)', 'page_329.json (page None)', 'page_185.json (page None)', 'page_324.json (page None)', 'page_667.json (page None)'] 



🧑 You:  Vijayaganda Gopala



🤖 Bot: I don't know.
📂 Sources: ['page_520.json (page None)', 'page_647.json (page None)', 'page_043.json (page None)', 'page_078.json (page None)', 'page_336.json (page None)'] 



🧑 You:  south wall of the Ramar Madai 



🤖 Bot: On the south wall of the Ramar Madai, there is a complete inscription that extols Vijayaganda Gopala for his charitable disposition and states that every living being was benefited by him and that every one was grateful to him.
📂 Sources: ['page_327.json (page None)', 'page_277.json (page None)', 'page_102.json (page None)', 'page_359.json (page None)', 'page_622.json (page None)'] 



🧑 You:  Vijayaganda Gopala



🤖 Bot: Vijayaganda Gopala was mentioned as the author who added a closed passage called the Vaikuntha pradakshina, which runs around the sanctum of the temple.
📂 Sources: ['page_520.json (page None)', 'page_647.json (page None)', 'page_043.json (page None)', 'page_078.json (page None)', 'page_336.json (page None)'] 



🧑 You:    Vijayaganda Gopala



🤖 Bot: Vijayaganda Gopala was the author who added a closed passage called the Vaikuntha pradakshina, which runs around the sanctum of the temple.
📂 Sources: ['page_520.json (page None)', 'page_647.json (page None)', 'page_043.json (page None)', 'page_078.json (page None)', 'page_336.json (page None)'] 



🧑 You:  where tirupati located



🤖 Bot: Tirupati is located in the state of Andhra Pradesh in India.
📂 Sources: ['page_022.json (page None)', 'page_434.json (page None)', 'page_053.json (page None)', 'page_446.json (page None)', 'page_536.json (page None)'] 



🧑 You:  who is Vijayaganda Gopala



🤖 Bot: Vijayaganda Gopala was a benefactor of the Tirumala temple.
📂 Sources: ['page_647.json (page None)', 'page_355.json (page None)', 'page_336.json (page None)', 'page_050.json (page None)', 'page_506.json (page None)'] 



🧑 You:  history of Vijayaganda Gopala



🤖 Bot: Vijayaganda Gopala was a Telugu Pallava chief who was one of the adherents of Undara Pandya. His political life should have commenced much earlier, but the usually assigned date of his rule is 1250-1285 A.D.
📂 Sources: ['page_327.json (page None)', 'page_050.json (page None)', 'page_472.json (page None)', 'page_078.json (page None)', 'page_520.json (page None)'] 



🧑 You:  exit


👋 Goodbye!


In [4]:
# 2️⃣ Imports
import os
import json
import numpy as np
from pathlib import Path
from sentence_transformers import SentenceTransformer, CrossEncoder
import faiss
from sklearn.preprocessing import normalize
from langchain_groq import ChatGroq

# 3️⃣ Paths
chunk_dir = Path("tirumala_hybrid_chunks_final")

# 4️⃣ Load final chunks
chunks = []
for file in chunk_dir.glob("*.json"):
    with open(file, "r", encoding="utf-8") as f:
        data = json.load(f)
        for ch in data.get("final_chunks", []):
            chunks.append({
                "text": ch["content"],
                "source": file.name,
                "page": ch.get("page"),
                "type": ch.get("type", "normalized")
            })
print(f"✅ Loaded {len(chunks)} chunks from {chunk_dir}")

# 5️⃣ Embedding model
embedder = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedder.encode([c["text"] for c in chunks], show_progress_bar=True)
embeddings = normalize(embeddings, axis=1)
embeddings = np.array(embeddings, dtype="float32")

# 6️⃣ FAISS HNSW Index
dim = embeddings.shape[1]
index = faiss.IndexHNSWFlat(dim, 32)
index.hnsw.efSearch = 64
index.add(embeddings)
faiss.write_index(index, "tirumala_chunks_hnsw.index")
print("✅ FAISS index built and saved")

# 7️⃣ Retriever
def retrieve(query, top_k=15):
    q_emb = embedder.encode([query])
    q_emb = normalize(q_emb, axis=1).astype("float32")
    D, I = index.search(q_emb, top_k)
    return [(i, float(D[0][j])) for j, i in enumerate(I[0]) if i >= 0], q_emb

# 8️⃣ Deduplication
def dedupe(results):
    seen, unique = set(), []
    for idx, score in results:
        text = chunks[idx]["text"]
        if text not in seen:
            seen.add(text)
            unique.append((idx, score))
    return unique

# 9️⃣ MMR-based pruning (replaces KMeans)
def mmr(query_vec, results, lambda_param=0.7, top_n=5):
    if len(results) <= top_n:
        return results

    idxs = [r[0] for r in results]
    vecs = embeddings[idxs]

    selected, remaining = [], list(range(len(idxs)))

    # start with the most relevant
    selected.append(remaining.pop(0))

    while len(selected) < top_n and remaining:
        best_idx, best_score = None, -1e9
        for i in remaining:
            sim_to_query = float(np.dot(query_vec, vecs[i]))
            sim_to_selected = max(float(np.dot(vecs[i], vecs[j])) for j in selected)
            score = lambda_param * sim_to_query - (1 - lambda_param) * sim_to_selected
            if score > best_score:
                best_idx, best_score = i, score
        selected.append(best_idx)
        remaining.remove(best_idx)

    return [(idxs[i], results[i][1]) for i in selected]

# 🔟 Re-ranker (cosine similarity)
def rerank(results, q_emb):
    reranked = []
    for idx, _ in results:
        c_emb = embeddings[idx]
        cos_sim = float((q_emb @ c_emb.T).item())
        reranked.append((idx, cos_sim))
    return sorted(reranked, key=lambda x: x[1], reverse=True)

# 1️⃣1️⃣ Groq LLM setup
llm = ChatGroq(
    api_key="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",  # ⚠️ replace
    model="llama-3.1-8b-instant"
)

# 1️⃣2️⃣ Query type check (entity vs descriptive)
def is_entity_query(query):
    # Short queries with <=3 words are likely names/titles
    return len(query.split()) <= 3

# 1️⃣3️⃣ Ask function
def ask(query, history=None, top_k=10):
    if history is None:
        history = []

    candidates, q_emb = retrieve(query, top_k=top_k)
    candidates = dedupe(candidates)

    if is_entity_query(query):
        # Entity queries → high recall, minimal pruning
        candidates = rerank(candidates, q_emb)[:10]
    else:
        # Descriptive queries → MMR for diversity
        candidates = mmr(q_emb[0], candidates, top_n=7)
        candidates = rerank(candidates, q_emb)

    # Build context
    context = "\n".join([chunks[idx]["text"] for idx, _ in candidates])

    # Looser fallback: if almost nothing retrieved
    if len(context.strip()) < 50:
        return "I don't know.", candidates

    # Build conversational prompt
    history_text = ""
    for turn in history:
        history_text += f"User: {turn['user']}\nAssistant: {turn['assistant']}\n"

    prompt = f"""You are a helpful historian chatbot answering strictly based 
on the Tirumala Hybrid Book.

⚠️ Important:
- Do not speak to the user as if they are a historical figure.
- Always answer in the third person (e.g., "Ramaraja was the son of Bukkaraja").
- If the context is insufficient, say "I don't know."
- Be concise, factual, and neutral.

Context:
{context}

Conversation so far:
{history_text}

User: {query}
Assistant:"""

    result = llm.invoke(prompt)
    return result.content, candidates

# 1️⃣4️⃣ Chat loop
if __name__ == "__main__":
    history = []
    print("🤖 Tirumala Hybrid Book Chatbot. Type 'exit' to quit.\n")

    while True:
        user_query = input("🧑 You: ")
        if user_query.lower() in ["exit", "quit", "bye"]:
            print("👋 Goodbye!")
            break

        response, refs = ask(user_query, history)

        print("\n🤖 Bot:", response)
        print("📂 Sources:", [
            f"{chunks[idx]['source']} (page {chunks[idx].get('page')})"
            for idx, _ in refs
        ], "\n")

        history.append({"user": user_query, "assistant": response})


✅ Loaded 1634 chunks from tirumala_hybrid_chunks_final


Batches: 100%|█████████████████████████████████████████████████████████████████████████| 52/52 [01:58<00:00,  2.28s/it]


✅ FAISS index built and saved
🤖 Tirumala Hybrid Book Chatbot. Type 'exit' to quit.



🧑 You:  Vijayaganda Gopala



🤖 Bot: Vijayaganda Gopala was a Telugu Pallava chief who was one of the adherents of Undara Pandya. He was the ruler from 1250 to 1285 A.D.
📂 Sources: ['page_447.json (page None)', 'page_327.json (page None)', 'page_422.json (page None)', 'page_050.json (page None)', 'page_520.json (page None)', 'page_647.json (page None)', 'page_043.json (page None)', 'page_450.json (page None)', 'page_078.json (page None)', 'page_336.json (page None)'] 



🧑 You:  Sarika birds 



🤖 Bot: I don't know.
📂 Sources: ['page_026.json (page None)', 'page_026.json (page None)', 'page_155_20250918_215650.json (page None)', 'page_592.json (page None)', 'page_155_20250918_215650.json (page None)', 'page_669.json (page None)', 'page_108.json (page None)', 'page_559.json (page None)', 'page_319.json (page None)', 'page_631.json (page None)'] 



🧑 You:   Alvars



🤖 Bot: The Alvars were a group of twelve Tamil Vaishnavite saints who lived in South India between the 6th and 9th centuries AD. They were devoted to Lord Vishnu and composed hymns in praise of Him. The Alvars are considered to be the earliest devotees of Lord Vishnu in Tamil literature and their hymns are still sung today in the Tirumala Temple.
📂 Sources: ['page_068.json (page None)', 'page_067.json (page None)', 'page_068.json (page None)', 'page_622.json (page None)', 'page_290.json (page None)', 'page_066.json (page None)', 'page_066.json (page None)', 'page_292.json (page None)', 'page_292.json (page None)', 'page_292.json (page None)'] 



🧑 You:  exit


👋 Goodbye!


In [None]:
# 2️⃣ Imports
import os
import json
import numpy as np
from pathlib import Path
from sentence_transformers import SentenceTransformer
import faiss
from sklearn.cluster import KMeans
from sklearn.preprocessing import normalize
from langchain_groq import ChatGroq

# 3️⃣ Paths
chunk_dir = Path("tirumala_hybrid_chunks_final")

# 4️⃣ Load final chunks
chunks = []
for file in chunk_dir.glob("*.json"):
    with open(file, "r", encoding="utf-8") as f:
        data = json.load(f)
        for ch in data.get("final_chunks", []):
            chunks.append({
                "text": ch["content"],
                "source": file.name,
                "page": ch.get("page"),
                "type": ch.get("type", "normalized")
            })
print(f"✅ Loaded {len(chunks)} chunks from {chunk_dir}")

# 5️⃣ Embedding model
embedder = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedder.encode([c["text"] for c in chunks], show_progress_bar=True)
embeddings = normalize(embeddings, axis=1)
embeddings = np.array(embeddings, dtype="float32")

# 6️⃣ FAISS HNSW Index
dim = embeddings.shape[1]
index = faiss.IndexHNSWFlat(dim, 32)
index.hnsw.efSearch = 64
index.add(embeddings)
faiss.write_index(index, "tirumala_chunks_hnsw.index")
print("✅ FAISS index built and saved")

# 7️⃣ Retriever
def retrieve(query, top_k=15):
    q_emb = embedder.encode([query])
    q_emb = normalize(q_emb, axis=1).astype("float32")
    D, I = index.search(q_emb, top_k)
    return [(i, float(D[0][j])) for j, i in enumerate(I[0]) if i >= 0], q_emb

# 8️⃣ Deduplication
def dedupe(results):
    seen, unique = set(), []
    for idx, score in results:
        text = chunks[idx]["text"]
        if text not in seen:
            seen.add(text)
            unique.append((idx, score))
    return unique

# 9️⃣ Clustering-based pruning
def cluster_prune(results, n_clusters=5):
    if len(results) <= n_clusters:
        return results
    idxs = [r[0] for r in results]
    vecs = embeddings[idxs]
    km = KMeans(n_clusters=n_clusters, random_state=0).fit(vecs)
    best = {}
    for label, (idx, score) in zip(km.labels_, results):
        if label not in best or score > best[label][1]:
            best[label] = (idx, score)
    return [(idx, score) for idx, score in best.values()]

# 🔟 Re-ranker (cosine similarity)
def rerank(results, q_emb):
    reranked = []
    for idx, _ in results:
        c_emb = embeddings[idx]
        cos_sim = float((q_emb @ c_emb.T).item())
        reranked.append((idx, cos_sim))
    return sorted(reranked, key=lambda x: x[1], reverse=True)

# 1️⃣1️⃣ Groq LLM setup
llm = ChatGroq(
    api_key="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",  # ⚠️ replace with valid key
    model="llama-3.1-8b-instant"
)

# 1️⃣2️⃣ Query type check (entity vs descriptive)
def is_entity_query(query):
    # Short queries with <=3 words are likely names/titles
    return len(query.split()) <= 3

# 1️⃣3️⃣ Ask function
def ask(query, history=None, top_k=15):
    if history is None:
        history = []

    candidates, q_emb = retrieve(query, top_k=top_k)
    candidates = dedupe(candidates)

    if is_entity_query(query):
        # Entity queries → high recall (no clustering, just rerank)
        candidates = rerank(candidates, q_emb)[:10]
    else:
        # Descriptive queries → cluster prune + rerank
        candidates = cluster_prune(candidates, n_clusters=5)
        candidates = rerank(candidates, q_emb)

    # Build context
    context = "\n".join([chunks[idx]["text"] for idx, _ in candidates])

    # Looser fallback: if almost nothing retrieved
    if len(context.strip()) < 50:
        return "I don't know.", candidates

    # Build conversational prompt
    history_text = ""
    for turn in history:
        history_text += f"User: {turn['user']}\nAssistant: {turn['assistant']}\n"

    prompt = f"""You are a helpful historian chatbot answering strictly based 
on the Tirumala Hybrid Book.

⚠️ Important:
- Do not speak to the user as if they are a historical figure.
- Always answer in the third person (e.g., "Ramaraja was the son of Bukkaraja").
- If the context is insufficient, say "I don't know."
- Be concise, factual, and neutral.

Context:
{context}

Conversation so far:
{history_text}

User: {query}
Assistant:"""

    result = llm.invoke(prompt)
    return result.content, candidates

# 1️⃣4️⃣ Chat loop
if __name__ == "__main__":
    history = []
    print("🤖 Tirumala Hybrid Book Chatbot. Type 'exit' to quit.\n")

    while True:
        user_query = input("🧑 You: ")
        if user_query.lower() in ["exit", "quit", "bye"]:
            print("👋 Goodbye!")
            break

        response, refs = ask(user_query, history)

        print("\n🤖 Bot:", response)
        print("📂 Sources:", [
            f"{chunks[idx]['source']} (page {chunks[idx].get('page')})"
            for idx, _ in refs
        ], "\n")

        history.append({"user": user_query, "assistant": response})


✅ Loaded 1634 chunks from tirumala_hybrid_chunks_final


Batches: 100%|█████████████████████████████████████████████████████████████████████████| 52/52 [01:48<00:00,  2.08s/it]


✅ FAISS index built and saved
🤖 Tirumala Hybrid Book Chatbot. Type 'exit' to quit.



🧑 You:  Vijayaganda Gopala



🤖 Bot: Vijayaganda Gopala was a Telugu Pallava chief who was one of the adherents of Undara Pandya. His date of rule is usually assigned to 1250-1285 A.D., but his political life likely commenced much earlier.
📂 Sources: ['page_447.json (page None)', 'page_327.json (page None)', 'page_422.json (page None)', 'page_050.json (page None)', 'page_520.json (page None)', 'page_647.json (page None)', 'page_043.json (page None)', 'page_450.json (page None)', 'page_078.json (page None)', 'page_336.json (page None)'] 



🧑 You:  Sarika birds



🤖 Bot: Sarika birds are not mentioned in the provided context. However, a bird similar to the Sarika, which is the White-Browed Starling, is mentioned as a bird that can be seen in the tract around the Tirumala Temple.
📂 Sources: ['page_026.json (page None)', 'page_026.json (page None)', 'page_155_20250918_215650.json (page None)', 'page_592.json (page None)', 'page_155_20250918_215650.json (page None)', 'page_669.json (page None)', 'page_108.json (page None)', 'page_559.json (page None)', 'page_319.json (page None)', 'page_631.json (page None)'] 



🧑 You:  Alvars



🤖 Bot: The Alvars were a group of twelve Vaishnavite saints who lived in South India and were the authors of the Nalayira Prabandham, a collection of four thousand hymns sung in praise of Vishnu and His Avatars.
📂 Sources: ['page_068.json (page None)', 'page_067.json (page None)', 'page_068.json (page None)', 'page_622.json (page None)', 'page_290.json (page None)', 'page_066.json (page None)', 'page_066.json (page None)', 'page_292.json (page None)', 'page_292.json (page None)', 'page_292.json (page None)'] 

