# Indian Temple Travel Chatbot: Setup, Data, Embeddings
First part: install dependencies, load/preprocess the dataset, build embeddings, and run sample retrievals.


In [8]:
# 1) Install all dependencies
!pip install fastapi uvicorn streamlit sentence-transformers faiss-cpu transformers langchain pandas numpy





[notice] A new release of pip is available: 25.1.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [9]:
# 2) Imports + constants
import json
from pathlib import Path
from typing import Any, Dict, List, Optional

import faiss
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer

DATA_PATH = "combined_temple_dataset_for_chatbot.json"
EMBED_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
TOP_K = 5


In [None]:
%load_ext autoreload
%autoreload 2

from chatbot_backend import TempleChatbot, search_temples
from app import load_chatbot_resources

In [None]:
%load_ext autoreload
%autoreload 2
from chatbot_backend import TempleChatbot, search_temples

In [10]:
# 3) Data Loading & Preprocessing
def load_raw_data(path: str) -> List[Dict[str, Any]]:
    """Read the temple dataset from a JSON file and return its list of records.

    The file may hold either a bare JSON array or an object whose ``"data"``
    key wraps the payload.

    Raises:
        FileNotFoundError: if ``path`` does not exist.
        ValueError: if the (unwrapped) payload is not a list.
    """
    source = Path(path)
    if not source.exists():
        raise FileNotFoundError(f"Dataset not found at {source.resolve()}")

    payload = json.loads(source.read_text(encoding="utf-8"))

    # Unwrap the {"data": [...]} envelope when present.
    is_wrapped = isinstance(payload, dict) and "data" in payload
    records = payload["data"] if is_wrapped else payload

    if isinstance(records, list):
        return records
    raise ValueError("Expected top-level JSON array of temple records.")

# Free-text record fields that build_search_text appends, in this order, after
# the name / state / deities prefix of each temple's searchable text.
TEXT_FIELDS = [
    "overview",
    "story",
    "visiting_guide",
    "architecture",
    "scripture_mentions",
]

def to_list(value: Any) -> List[str]:
    """Coerce a loosely-typed field into a clean list of non-empty strings.

    Args:
        value: ``None``, a string (items separated by ``,`` or ``;``), a
            list/tuple of items, or any other scalar.

    Returns:
        Stripped, non-empty strings. ``None``, NaN, blank strings and strings
        made only of separators yield ``[]``; ``None`` or blank members of a
        sequence are skipped instead of becoming ``"None"`` / ``""``.
    """
    if value is None:
        return []
    # NaN is the only float that differs from itself; treat it as "missing".
    if isinstance(value, float) and value != value:
        return []
    if isinstance(value, str):
        pieces = value.replace(";", ",").split(",")
    elif isinstance(value, (list, tuple)):
        pieces = [str(item) for item in value if item is not None]
    else:
        pieces = [str(value)]
    return [piece.strip() for piece in pieces if piece.strip()]

def build_search_text(item: Dict[str, Any]) -> str:
    """Assemble the ``" | "``-separated text that is embedded for one temple.

    The text starts with the name, state and comma-joined deities (each only
    when non-empty), followed by every non-empty ``TEXT_FIELDS`` value; list
    values are flattened into a single space-joined string.
    """
    title = (item.get("name") or item.get("temple_name") or "").strip()
    region = (item.get("state") or item.get("region") or "").strip()
    deity_text = ", ".join(to_list(item.get("deities") or item.get("deity")))

    chunks: List[str] = [piece for piece in (title, region, deity_text) if piece]

    for field in TEXT_FIELDS:
        content = item.get(field)
        if content is None:
            continue
        if isinstance(content, list):
            stripped_items = (str(entry).strip() for entry in content)
            text = " ".join(entry for entry in stripped_items if entry)
        else:
            text = str(content).strip()
        if text:
            chunks.append(text)

    return " | ".join(chunks)

def _first_present(rec: Dict[str, Any], *keys: str) -> Any:
    """Return the first value among ``keys`` that is neither ``None`` nor ``""``."""
    for key in keys:
        value = rec.get(key)
        if value is not None and value != "":
            return value
    return None


def normalize_record(rec: Dict[str, Any]) -> Dict[str, Any]:
    """Map one raw dataset record onto the notebook's uniform temple schema.

    Alternate source keys are accepted for each field (e.g. ``temple_name``
    for ``name``, ``latitude`` for ``lat``).

    Returns:
        A dict with ``id``, ``name``, ``state``, ``location``, ``deities``
        (list of str), ``lat``, ``lng``, ``text`` (the search text built from
        name/state/deities plus ``TEXT_FIELDS``) and ``raw`` (the untouched
        input record).
    """
    name = rec.get("name") or rec.get("temple_name") or ""
    state = rec.get("state") or rec.get("region") or ""
    location = rec.get("location") or rec.get("city") or rec.get("place") or ""
    deities = to_list(rec.get("deities") or rec.get("deity"))

    # A plain `a or b` chain would discard legitimate falsy values such as a
    # coordinate of 0.0 or a numeric id of 0, so test for presence instead.
    lat = _first_present(rec, "lat", "latitude")
    lng = _first_present(rec, "lng", "longitude")
    record_id = _first_present(rec, "id", "temple_id", "slug")

    search_text = build_search_text(
        {
            "name": name,
            "state": state,
            "deities": deities,
            **{field: rec.get(field) for field in TEXT_FIELDS},
        }
    )
    return {
        # Fall back to the temple name when the record carries no identifier.
        "id": name if record_id is None else record_id,
        "name": name,
        "state": state,
        "location": location,
        "deities": deities,
        "lat": lat,
        "lng": lng,
        "text": search_text,
        "raw": rec,
    }

raw_data = load_raw_data(DATA_PATH)
normalized = list(map(normalize_record, raw_data))

# One row per temple; "search_text" mirrors "text" with missing values blanked.
df = pd.DataFrame(normalized)
df = df.assign(search_text=df["text"].fillna(""))
print("Total temples loaded:", len(df))
display(df.head())

first_search_text = df.loc[0, "search_text"]
print("\nSample search text:\n", first_search_text[:500])


Total temples loaded: 365


Unnamed: 0,id,name,state,location,deities,lat,lng,text,raw,search_text
0,temple_adi_keshava_temple_bhubaneswar_odisha,"Adi Keshava Temple, Bhubaneswar",Odisha,,[],,,"Adi Keshava Temple, Bhubaneswar | Odisha",{'id': 'temple_adi_keshava_temple_bhubaneswar_...,"Adi Keshava Temple, Bhubaneswar | Odisha"
1,temple_agatti_devi_temple_agatti_lakshadweep_l...,"Agatti Devi Temple, Agatti, Lakshadweep",Lakshadweep,,[],,,"Agatti Devi Temple, Agatti, Lakshadweep | Laks...",{'id': 'temple_agatti_devi_temple_agatti_laksh...,"Agatti Devi Temple, Agatti, Lakshadweep | Laks..."
2,temple_akshardham_temple_delhi,Akshardham Temple,Delhi,New Delhi,[],,,Akshardham Temple | Delhi,"{'id': 'temple_akshardham_temple_delhi', 'cate...",Akshardham Temple | Delhi
3,temple_akshardham_temple_gujarat,Akshardham Temple,Gujarat,Gandhinagar,[],,,Akshardham Temple | Gujarat,"{'id': 'temple_akshardham_temple_gujarat', 'ca...",Akshardham Temple | Gujarat
4,temple_akshardham_temple_new_delhi_delhi_uttar...,"Akshardham Temple, New Delhi, Delhi",Uttar Pradesh,,[],,,"Akshardham Temple, New Delhi, Delhi | Uttar Pr...",{'id': 'temple_akshardham_temple_new_delhi_del...,"Akshardham Temple, New Delhi, Delhi | Uttar Pr..."



Sample search text:
 Adi Keshava Temple, Bhubaneswar | Odisha


In [11]:
# 4) Embeddings + FAISS Index
from functools import lru_cache

@lru_cache(maxsize=1)
def get_model():
    """Return the SentenceTransformer named by ``EMBED_MODEL_NAME``.

    The instance is built on the first call and reused afterwards via
    ``lru_cache``.
    """
    encoder = SentenceTransformer(EMBED_MODEL_NAME)
    return encoder

@lru_cache(maxsize=1)
def get_embeddings_and_index():
    """Embed every ``df["search_text"]`` row and index the vectors with FAISS.

    Vectors are unit-normalized at encode time and stored in an inner-product
    index. The result is cached, so it reflects ``df`` as it was on the first
    call.

    Returns:
        ``(embeddings, index)``: the embedding matrix and the populated
        ``faiss.IndexFlatIP``.
    """
    texts = df["search_text"].fillna("").tolist()
    vectors = get_model().encode(
        texts,
        show_progress_bar=True,
        convert_to_numpy=True,
        normalize_embeddings=True,
    )
    ip_index = faiss.IndexFlatIP(vectors.shape[1])
    ip_index.add(vectors)
    return vectors, ip_index

# Shared encoder, embedding matrix and FAISS index used by the cells below.
model = get_model()
embeddings, index = get_embeddings_and_index()

# Row-aligned record dicts: position i here corresponds to vector i in `index`.
metadata = df.to_dict("records")

print("Embeddings shape:", embeddings.shape)
print("FAISS index size:", index.ntotal)

def search_temples(
    query: str,
    k: int = TOP_K,
    state_filter: Optional[str] = None,
    deity_filter: Optional[str] = None,
) -> List[Dict[str, Any]]:
    """Return up to ``k`` temples ranked by embedding similarity to ``query``.

    Args:
        query: Free-text search string; blank input yields ``[]``.
        k: Maximum number of results; values <= 0 yield ``[]``.
        state_filter: Optional case-insensitive substring the record's
            ``state`` must contain.
        deity_filter: Optional case-insensitive substring that at least one
            of the record's ``deities`` must contain.

    Returns:
        A list of dicts with ``name``, ``state``, ``score`` (inner product of
        the normalized vectors) and ``raw`` (the matching ``metadata`` record),
        best match first.
    """
    if not query or not query.strip() or k <= 0:
        return []
    total = index.ntotal
    if total == 0:
        return []

    state_needle = state_filter.lower() if state_filter else None
    deity_needle = deity_filter.lower() if deity_filter else None

    # Filters are applied after the vector search. With a fixed over-fetch,
    # matches ranked below the cut-off were silently lost, so scan the whole
    # flat index whenever a filter is active; otherwise k candidates suffice.
    fetch = total if (state_needle or deity_needle) else min(k, total)

    query_vec = model.encode([query], convert_to_numpy=True, normalize_embeddings=True)
    distances, indices = index.search(query_vec, k=fetch)

    results: List[Dict[str, Any]] = []
    for idx, dist in zip(indices[0], distances[0]):
        if idx == -1:  # FAISS pads with -1 when fewer vectors are available
            continue
        rec = metadata[int(idx)]
        if state_needle and state_needle not in str(rec.get("state") or "").lower():
            continue
        if deity_needle:
            deities = rec.get("deities") or []
            if not any(deity_needle in str(d).lower() for d in deities):
                continue
        results.append(
            {
                "name": rec.get("name", ""),
                "state": rec.get("state", ""),
                "score": float(dist),
                "raw": rec,
            }
        )
        if len(results) >= k:
            break
    return results

sample_queries = [
    "Shiva temple in Tamil Nadu with rich architecture",
    "temple dedicated to Lord Vishnu near river",
    "famous Durga shrine in the Himalayas",
]

# Unfiltered semantic retrieval: top 3 hits for each sample query.
for query_text in sample_queries:
    print(f"\nQuery: {query_text}")
    for match in search_temples(query_text, k=3):
        record = match["raw"] or {}
        if isinstance(record, dict):
            deities = to_list(record.get("deities") or record.get("deity"))
        else:
            deities = []
        deity_label = ", ".join(deities)
        print(
            f"- {match['name']} ({match['state']}) | Deities: {deity_label} | score={match['score']:.4f}"
        )

# Same kind of search, restricted to a single state through state_filter.
tamil_nadu_example = search_temples("famous temples in Tamil Nadu", k=5, state_filter="Tamil Nadu")
print("\nExample query: famous temples in Tamil Nadu (top 5)")
if tamil_nadu_example:
    for hit in tamil_nadu_example:
        print(f"- {hit['name']} ({hit['state']}) | score={hit['score']:.4f}")
else:
    print("No results returned for the example query.")

Batches: 100%|██████████| 12/12 [00:01<00:00, 10.84it/s]

Embeddings shape: (365, 384)
FAISS index size: 365

Query: Shiva temple in Tamil Nadu with rich architecture
- Shiva Temple (Karnataka) | Deities:  | score=0.8376
- Shiva Temple (Jammu and Kashmir) | Deities:  | score=0.7360
- Somnath Temple, Gujarat (Shiva) | Deities: Shiva | score=0.7054

Query: temple dedicated to Lord Vishnu near river
- Tirupati Balaji Temple, Andhra Pradesh (Vishnu) | Deities: Vishnu | score=0.6690
- Dwarkadhish Temple, Gujarat (Vishnu) | Deities: Vishnu | score=0.6610
- Ranganathaswamy Temple, Tamil Nadu (Vishnu) | Deities: Vishnu | score=0.6475

Query: famous Durga shrine in the Himalayas
- Durga Temple (Karnataka) | Deities:  | score=0.7126
- Durga Temple (Jammu and Kashmir) | Deities:  | score=0.6994
- Kalighat Temple, West Bengal (Durga) | Deities: Durga | score=0.6919

Example query: famous temples in Tamil Nadu (top 5)
- Sripuram Golden Temple (Tamil Nadu) | score=0.7297
- Srirangam Ranganathaswamy Temple, Srirangam (Tamil Nadu) | score=0.7127
- Shore Temp




# Section 2: NLU, Intents, Entities


In [12]:

# 5) Section 2: Intents + Entity Extraction
import re

# Intent labels; each one has a matching list of example phrases in
# intent_examples below.
INTENTS = [
    "TEMPLE_INFO",
    "FIND_TEMPLES",
    "PLAN_TRIP",
    "ITINERARY_COST",
    "SMALL_TALK",
    "UNKNOWN",
]

# Example utterances per intent. They are embedded once (see
# intent_ex_embeddings) and compared against the user's message when no
# keyword rule decides the intent.
intent_examples = {
    "TEMPLE_INFO": [
        "Tell me about Kedarnath temple",
        "Give details on this temple",
        "History of the temple",
        "Temple overview",
    ],
    "FIND_TEMPLES": [
        "temples in Tamil Nadu",
        "find Shiva temples near river",
        "list famous Vishnu shrines",
        "recommend temples to visit",
    ],
    "PLAN_TRIP": [
        "plan a 3 day trip",
        "help me plan a pilgrimage",
        "itinerary for temples",
        "trip plan with timings",
    ],
    "ITINERARY_COST": [
        "budget for a 2 day temple trip",
        "what is the cost for visiting",
        "estimate expenses for pilgrimage",
        "trip cost for temples",
    ],
    "SMALL_TALK": [
        "hello",
        "how are you",
        "thanks",
        "good morning",
    ],
    "UNKNOWN": [
        "random question",
        "not sure",
    ],
}

# Encode each intent's example phrases once (unit-normalized) so that
# detect_intent can score a message against them with a plain dot product.
intent_ex_embeddings = {
    label: model.encode(examples, convert_to_numpy=True, normalize_embeddings=True)
    for label, examples in intent_examples.items()
}

RULE_KEYWORDS = {
    "ITINERARY_COST": ["cost", "budget", "price", "expense"],
    "PLAN_TRIP": ["plan", "itinerary", "trip", "travel"],
    "TEMPLE_INFO": ["about", "details", "history", "information", "info"],
    "FIND_TEMPLES": ["find", "show", "list", "recommend", "suggest", "near"],
    "SMALL_TALK": ["hello", "hi", "thanks", "thank you", "good morning", "good evening"],
}


def _rule_based_intent(text_lower: str):
    for intent, keywords in RULE_KEYWORDS.items():
        if any(kw in text_lower for kw in keywords):
            return intent
    return None


def detect_intent(user_message: str) -> str:
    """Classify a message: keyword rules first, example-embedding similarity second.

    Blank input maps to ``"UNKNOWN"``. When no rule fires, the message is
    embedded and the intent owning the single most similar example phrase
    is returned.
    """
    if not user_message or not user_message.strip():
        return "UNKNOWN"

    text = user_message.strip()
    matched = _rule_based_intent(text.lower())
    if matched:
        return matched

    query_vec = model.encode([text], convert_to_numpy=True, normalize_embeddings=True)[0]

    winner, winner_score = "UNKNOWN", -1.0
    for label, example_vecs in intent_ex_embeddings.items():
        # Both sides are unit vectors, so the dot product is cosine similarity.
        score = float((example_vecs @ query_vec).max())
        if score > winner_score:
            winner, winner_score = label, score
    return winner

# Precompute vocab for entity extraction: sorted, de-duplicated, non-empty
# names taken from the normalized DataFrame.
TEMPLE_NAMES = sorted(set(filter(None, df["name"].dropna().tolist())))
STATE_NAMES = sorted(set(filter(None, df["state"].dropna().tolist())))

_deity_vocab = set()
for _deity_list in df["deities"].tolist():
    _deity_vocab.update(d for d in (_deity_list or []) if d)
DEITY_NAMES = sorted(_deity_vocab)

def _find_substrings(candidates, text_lower: str, max_hits: int = 5):
    hits = [c for c in candidates if c and c.lower() in text_lower]
    return hits[:max_hits]


def extract_entities(user_message: str):
    """Pull known temple, state and deity names plus a trip length out of a message.

    Names are matched case-insensitively against the precomputed
    ``TEMPLE_NAMES`` / ``STATE_NAMES`` / ``DEITY_NAMES`` vocabularies.

    Returns:
        ``{"temples": [...], "states": [...], "deities": [...], "days": int | None}``;
        ``days`` is the number preceding "day"/"days", or ``None`` when no
        duration is mentioned.
    """
    msg_lower = user_message.lower() if user_message else ""

    # Accept "3 day", "3days" and the hyphenated "3-day"; the trailing \b keeps
    # words such as "daylight" from being read as a duration.
    day_match = re.search(r"(\d+)\s*-?\s*days?\b", msg_lower)

    return {
        "temples": _find_substrings(TEMPLE_NAMES, msg_lower),
        "states": _find_substrings(STATE_NAMES, msg_lower),
        "deities": _find_substrings(DEITY_NAMES, msg_lower),
        "days": int(day_match.group(1)) if day_match else None,
    }

# Tests / verification: one intent check, then entity extraction on two messages.
print("Intent test:", detect_intent("Tell me about Kedarnath temple"))
for label, message in (
    ("Entities test 1:", "Plan a 3 day trip in Tamil Nadu"),
    ("Entities test 2:", "Show Shiva temples in Karnataka"),
):
    print(label, extract_entities(message))


Intent test: TEMPLE_INFO
Entities test 1: {'temples': [], 'states': ['Tamil Nadu'], 'deities': [], 'days': 3}
Entities test 2: {'temples': ['Shiva Temple'], 'states': ['Karnataka', 'Shiva'], 'deities': ['Shiva'], 'days': None}


# Section 3: Generation + Chatbot


In [13]:
# 6) Section 3: Local FLAN-T5 generation + Chatbot
import textwrap
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Both the tokenizer and the seq2seq model come from the same checkpoint.
FLAN_MODEL_NAME = "google/flan-t5-small"
tokenizer = AutoTokenizer.from_pretrained(FLAN_MODEL_NAME)
flan_model = AutoModelForSeq2SeqLM.from_pretrained(FLAN_MODEL_NAME)


def generate_answer(context_text: str, user_message: str, max_new_tokens: int = 180) -> str:
    """Generate a reply with the module-level ``tokenizer`` / ``flan_model``.

    Args:
        context_text: Retrieved temple context, typically one line per temple.
        user_message: The user's question.
        max_new_tokens: Upper bound on generated tokens.

    Returns:
        The decoded model output. If tokenization or generation raises, a
        best-effort fallback is returned instead: the first 400 characters of
        the context, or a prompt for more detail when the context is blank.
    """
    context_text = context_text or ""

    # Build the prompt line by line. Running textwrap.dedent over an f-string
    # that already contains a multi-line context finds no common margin
    # (context lines start at column 0), which left every template line
    # indented by eight spaces in the text sent to the model.
    prompt = "\n".join(
        [
            "You are a helpful Indian temple travel assistant.",
            "Use ONLY the following context to answer:",
            context_text,
            f"Question: {user_message}",
            "Answer in 4-6 friendly sentences.",
        ]
    ).strip()
    try:
        inputs = tokenizer(prompt, return_tensors="pt")
        outputs = flan_model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False)
        return tokenizer.decode(outputs[0], skip_special_tokens=True)
    except Exception:
        # Deliberate best-effort: the chatbot should still answer when the
        # local model is unavailable or fails.
        if context_text.strip():
            return "Here's what I can share from what I know: " + context_text[:400]
        return "I don't have enough context yet, but I can help if you mention a temple, state, or deity."


class TempleChatbot:
    """Intent-routed temple assistant built on retrieval plus local generation.

    The constructor only stores the resources it is given. Intent detection,
    entity extraction and retrieval delegate to the module-level
    ``detect_intent``, ``extract_entities`` and ``search_temples`` functions,
    and generation delegates to ``generate_answer``.
    """

    # Shared reply for every branch that ends up with nothing to show.
    NO_MATCH_REPLY = "I couldn't find exact matches. Try mentioning a state or deity."

    def __init__(self, temples, embeddings, faiss_index, tokenizer, model):
        self.temples = temples
        self.embeddings = embeddings
        self.index = faiss_index
        self.tokenizer = tokenizer
        self.model = model

    def detect_intent(self, user_message: str) -> str:
        """Return the intent label for ``user_message``."""
        return detect_intent(user_message)

    def extract_entities(self, user_message: str):
        """Return the temples/states/deities/days mentioned in ``user_message``."""
        return extract_entities(user_message)

    def retrieve(
        self,
        query: str,
        k: int = 5,
        state_filter: Optional[str] = None,
        deity_filter: Optional[str] = None,
    ):
        """Return up to ``k`` search hits for ``query``, optionally filtered."""
        return search_temples(query, k=k, state_filter=state_filter, deity_filter=deity_filter)

    def build_context(self, temple_records):
        """Render search hits as one ``Name | State | Deities | Info`` line each.

        ``Info`` is taken from the overview/story/visiting_guide fields when
        present, otherwise from the record's search text, otherwise from the
        temple name, and is capped at 500 characters.
        """
        ctx_parts = []
        for rec in temple_records:
            raw = rec.get("raw") or {}
            # A search hit wraps the normalized record, which keeps the source
            # JSON under its own "raw" key; the narrative fields live there.
            nested = raw.get("raw")
            source = nested if isinstance(nested, dict) else {}

            sections = []
            for key in ("overview", "story", "visiting_guide"):
                val = raw.get(key) or source.get(key)
                if not val:
                    continue
                if isinstance(val, list):
                    val = " ".join(str(v) for v in val)
                sections.append(str(val))
            info_text = " ".join(sections) or raw.get("text") or rec.get("name", "")

            deities = raw.get("deities") or []
            if isinstance(deities, str):
                # Joining a bare string would split it into single characters.
                deities = [deities]
            deity_text = ", ".join(str(d) for d in deities)

            ctx_parts.append(
                f"Name: {rec.get('name', '')} | State: {rec.get('state', '')} | Deities: {deity_text} | Info: {str(info_text)[:500]}"
            )
        return "\n".join(ctx_parts)

    def generate(self, context_records, user_message: str) -> str:
        """Generate a reply to ``user_message`` grounded in ``context_records``."""
        context_text = self.build_context(context_records) if context_records else ""
        return generate_answer(context_text or "No specific context available.", user_message)

    def answer(self, user_message: str):
        """Route ``user_message`` by intent and build the reply.

        Returns:
            ``{"reply": str, "intent": str, "entities": dict,
            "used_temples": [names of the hits the reply is based on]}``.
        """
        intent = self.detect_intent(user_message)
        entities = self.extract_entities(user_message)

        states = entities.get("states") or []
        deity_names = entities.get("deities") or []
        state_filter = states[0] if states else None
        deity_filter = deity_names[0] if deity_names else None

        # Always defined, so the final summary needs no locals() inspection.
        hits = []

        if intent == "TEMPLE_INFO":
            temples = entities.get("temples") or []
            query_text = temples[0] if temples else user_message
            hits = self.retrieve(query_text, k=5, state_filter=state_filter, deity_filter=deity_filter)
            if not hits and temples:
                # The bare temple name found nothing; retry with the full message.
                hits = self.retrieve(user_message, k=5, state_filter=state_filter, deity_filter=deity_filter)
            if hits:
                reply = self.generate(hits, user_message)
            else:
                alt_hits = self.retrieve("famous temples in India", k=3)
                if alt_hits:
                    reply = "I couldn't find exact matches, but here are some similar temples: " + ", ".join(
                        h["name"] for h in alt_hits
                    )
                else:
                    reply = self.NO_MATCH_REPLY

        elif intent == "FIND_TEMPLES":
            hits = self.retrieve(user_message, k=5, state_filter=state_filter, deity_filter=deity_filter)
            if not hits and (state_filter or deity_filter):
                hits = self.retrieve("popular temples", k=5)
            if hits:
                lines_out = []
                for h in hits:
                    raw = h.get("raw") or {}
                    deities = raw.get("deities") or []
                    line = f"- {h['name']} ({h['state']})"
                    if deities:
                        line += f" | Deities: {', '.join(str(d) for d in deities)}"
                    lines_out.append(line)
                reply = "Here are some temples you might like:\n" + "\n".join(lines_out)
            else:
                # The previous text promised "similar temples" but listed none.
                reply = self.NO_MATCH_REPLY

        elif intent == "PLAN_TRIP":
            days = entities.get("days") or 2
            wanted = max(days, 5)
            hits = self.retrieve(user_message, k=wanted, state_filter=state_filter, deity_filter=deity_filter)
            if not hits and state_filter:
                hits = self.retrieve(state_filter, k=wanted)
            if not hits:
                hits = self.retrieve("popular pilgrimage temples", k=max(days, 3))
            if hits:
                plan_lines = []
                for i in range(days):
                    # Cycle through the hits when the trip outlasts them.
                    rec = hits[i % len(hits)]
                    raw = rec.get("raw") or {}
                    loc = raw.get("location") or raw.get("city") or rec.get("state", "")
                    plan_lines.append(
                        f"Day {i+1}: Visit {rec.get('name', 'a temple')} in {rec.get('state', '')} around {loc or 'the area'}"
                    )
                reply = "Here's a simple plan:\n" + "\n".join(plan_lines)
            else:
                reply = self.NO_MATCH_REPLY

        elif intent == "ITINERARY_COST":
            reply = (
                "A modest trip often costs around INR 2k-4k per day for stay, food, and local travel; "
                "adjust upward for private transport or premium lodging. Prices vary by season and city."
            )

        elif intent == "SMALL_TALK":
            reply = "Hi there! I can help you discover temples, plan trips, or share details. What would you like to explore?"

        else:
            reply = "Could you clarify which temple, state, or deity you're interested in?"

        return {
            "reply": reply,
            "intent": intent,
            "entities": entities,
            "used_temples": [h.get("name", "") for h in (hits or [])],
        }


bot = TempleChatbot(metadata, embeddings, index, tokenizer, flan_model)

# 7) Interactive REPL (type exit/quit to stop)
import sys
if sys.stdin is None or not sys.stdin.isatty():
    print("REPL skipped (no interactive stdin available).")
else:
    while True:
        try:
            # Strip so that "exit " or " quit" still end the session.
            user = input("You: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Ctrl-D / Ctrl-C end the session instead of raising a traceback.
            break
        if user.lower() in ["exit", "quit"]:
            break
        print(bot.answer(user)["reply"])

REPL skipped (no interactive stdin available).


In [14]:
# 7) Interactive REPL (type exit/quit to stop)
import sys
if sys.stdin is None or not sys.stdin.isatty():
    print("REPL skipped (no interactive stdin available).")
else:
    while True:
        try:
            # Strip so that "exit " or " quit" still end the session.
            user = input("You: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Ctrl-D / Ctrl-C end the session instead of raising a traceback.
            break
        if user.lower() in ["exit", "quit"]:
            break
        result = bot.answer(user)
        print("Bot:", result["reply"])


REPL skipped (no interactive stdin available).


In [15]:
%reload_ext autoreload
%autoreload 2
from chatbot_backend import TempleChatbot, search_temples

In [16]:
# Quick sanity checks for chatbot backend: a greeting, an off-topic question
# and a state-scoped temple search.
# NOTE(review): the second constructor argument here is `model`, whereas the
# earlier construction in this notebook passes `embeddings`. Confirm this
# matches the signature of the TempleChatbot currently in scope.
bot = TempleChatbot(metadata, model, index, tokenizer, flan_model)
for message in (
    "Hi",
    "What is the capital of France?",
    "Temples in Tamil Nadu",
):
    print(bot.answer(message)["reply"])

Hi there! Im Agent Chaari, an Indian temple travel assistant. I can help you discover temples, plan simple trips, or share temple details. What would you like to explore?
Im mainly designed to help with Indian temples and temple trips. Try asking about a temple, a state, or a deity.
Here are some temples you might like:
- Shore Temple (Tamil Nadu)
- Sripuram Golden Temple (Tamil Nadu)
- Srirangam Ranganathaswamy Temple, Srirangam (Tamil Nadu)
- Tiruvannamalai Arunachaleswarar Temple, Tiruvannamalai (Tamil Nadu)
- Ranganathaswamy Temple (Tamil Nadu)


In [None]:
# Attribute retrieval smoke tests: ask for one specific section (overview,
# story, visiting guide) of a named temple.
bot = TempleChatbot(metadata, model, index, tokenizer, flan_model)
for message in (
    "give me overview of Kashi Vishwanath Temple, Uttar Pradesh",
    "tell me the story of Kedarnath Temple",
    "visiting guide for Sripuram Golden Temple in Tamil Nadu",
):
    print(bot.answer(message)["reply"])


In [None]:
# NOTE(review): this rebinds the notebook-level `df` and `bot` to whatever
# load_chatbot_resources() returns; cells re-run afterwards will see these.
df, bot = load_chatbot_resources()

test_queries = [
    "overview of Golden Temple",
    "story of Kedarnath Temple",
    "visiting guide for Tirumala, Andhra Pradesh",
    "temples in Tamil Nadu",
]

separator = "=" * 80
for query_text in test_queries:
    res = bot.answer(query_text)
    reply_preview = res["reply"][:500]
    section_keys = list(res.get("sections", {}).keys())
    print("Q:", query_text)
    print("Intent:", res["intent"])
    print("Reply:", reply_preview, "...\n")
    print("Sections keys:", section_keys)
    print(separator)
