0. Setup and loading


This notebook evaluates intent detection, retrieval quality, distance answers, answer quality (human rating scaffold), and system robustness for the Indian Temple Travel Chatbot.

In [1]:
import json, time, dataclasses, typing
from typing import List, Dict, Optional
import collections, statistics, re

import pandas as pd
import numpy as np

from app import load_chatbot_resources
import chatbot_backend

# Build the shared evaluation fixtures. `load_chatbot_resources()` returns a
# pair: `df` (its length is reported below as the number of loaded temples)
# and the `chatbot` object that every later cell queries.
df, chatbot = load_chatbot_resources()

# Optional: keep the raw JSON dataset in memory for ad-hoc reference.
with open("final_temple_dataset_2.json", encoding="utf-8") as dataset_file:
    raw_data = json.load(dataset_file)

print("Loaded temples:", len(df))


  from .autonotebook import tqdm as notebook_tqdm


Loaded temples: 293


1. Intent detection evaluation (accuracy)


We measure overall accuracy and per-intent precision/recall/F1 on a small labelled set of queries.

In [2]:
# Labelled queries for the intent detector, grouped by their gold intent
# (two examples per intent). Dict order is preserved, so the flattened list
# below keeps the same order as the groups are written here.
_queries_by_intent = {
    "TEMPLE_INFO": [
        "overview of Golden Temple, Punjab",
        "tell me the story of Kedarnath temple",
    ],
    "FIND_TEMPLES": [
        "temples in Tamil Nadu for Shiva",
        "find famous Vishnu shrines in Andhra Pradesh",
    ],
    "PLAN_TRIP": [
        "plan a 3 day trip in Andhra Pradesh",
        "need itinerary for 2 days in Uttarakhand",
    ],
    "ITINERARY_COST": [
        "how much would a 2-day trip cost",
        "budget for pilgrimage to Rameswaram",
    ],
    "SMALL_TALK": [
        "hi",
        "thanks!",
    ],
    "UNKNOWN": [
        "random question about physics",
        "what is the capital of France",
    ],
}

# One {"query", "gold_intent"} record per labelled query.
intent_test_cases = [
    {"query": query_text, "gold_intent": intent_label}
    for intent_label, query_group in _queries_by_intent.items()
    for query_text in query_group
]

from collections import defaultdict

# Intents reported in the table: every gold label present in the test set.
intents = sorted(set(c['gold_intent'] for c in intent_test_cases))

# Run the detector once per labelled query.
# `results` keeps (gold, predicted) pairs; `confusion[gold][pred]` counts them.
results = []
confusion = defaultdict(lambda: defaultdict(int))
for case in intent_test_cases:
    pred = chatbot.detect_intent(case['query'])
    gold = case['gold_intent']
    results.append((gold, pred))
    confusion[gold][pred] += 1

# Overall accuracy; NaN (not a ZeroDivisionError) when the test set is empty.
accuracy = sum(1 for g, p in results if g == p) / len(results) if results else np.nan

per_intent_rows = []
for intent in intents:
    # Count straight from the (gold, pred) pairs. This also counts predictions
    # whose label never appears as a gold intent: previously such misses were
    # left out of `fn`, which inflated recall.
    tp = sum(1 for g, p in results if g == intent and p == intent)
    fp = sum(1 for g, p in results if g != intent and p == intent)
    fn = sum(1 for g, p in results if g == intent and p != intent)
    prec = tp / (tp + fp) if (tp + fp) else 0.0
    rec = tp / (tp + fn) if (tp + fn) else 0.0
    f1 = 2*prec*rec/(prec+rec) if (prec+rec) else 0.0
    per_intent_rows.append({"intent": intent, "precision": prec, "recall": rec, "f1": f1})

# The OVERALL row carries accuracy in the `f1` column (precision/recall are
# left as NaN); the summary cell reads the overall score from that column.
overall_row = {"intent": "OVERALL", "precision": np.nan, "recall": np.nan, "f1": accuracy}

# Build the frame in one go instead of concatenating an all-NaN frame, which
# emits a FutureWarning on recent pandas versions.
summary_df = pd.DataFrame(per_intent_rows + [overall_row])
summary_df


Unnamed: 0,intent,precision,recall,f1
0,FIND_TEMPLES,1.0,0.5,0.666667
1,ITINERARY_COST,1.0,1.0,1.0
2,PLAN_TRIP,1.0,1.0,1.0
3,SMALL_TALK,0.666667,1.0,0.8
4,TEMPLE_INFO,0.666667,1.0,0.8
5,UNKNOWN,1.0,0.5,0.666667
6,OVERALL,,,0.833333


2. Retrieval evaluation (Precision@3, Recall@5, MRR@5)

For each query with a gold list of temples, we compute Precision@3, Recall@5, and MRR@5 using chatbot.retrieve().

In [3]:
# Gold temple names for each retrieval query. Queries are unique, so a dict
# keyed by query is enough; it is flattened into records below in the same order.
_gold_temples_by_query = {
    "Somnath Temple in Gujarat": ["Somnath Temple"],
    # NOTE(review): Badrinath is widely known as a Vishnu shrine, yet it is
    # listed as gold for a Shiva query - confirm this label.
    "Shiva temples in Uttarakhand": ["Kedarnath Temple", "Badrinath Temple"],
    "Vaishno Devi temple details": ["Vaishno Devi Temple"],
    # NOTE(review): Srikalahasti is widely known as a Shiva temple, yet it is
    # listed as gold for a Vishnu query - confirm this label.
    "famous Vishnu shrines in Andhra Pradesh": ["Tirumala Temple", "Srikalahasti Temple"],
    "durga temples in West Bengal": ["Dakshineswar Kali Temple", "Kalighat Temple"],
    "ram temples in Uttar Pradesh": ["Ram Janmabhoomi Temple", "Kanak Bhawan"],
    "krishna temples in Gujarat": ["Dwarkadhish Temple"],
    "shiv temples in tamil nadu": ["Brihadeeswarar Temple", "Ramanathaswamy Temple"],
    "parvati temples in himachal": ["Naina Devi Temple"],
    "sun temple in odisha": ["Konark Sun Temple"],
}

# One {"query", "gold_temples"} record per query.
retrieval_test_cases = [
    {"query": query_text, "gold_temples": temple_names}
    for query_text, temple_names in _gold_temples_by_query.items()
]

def precision_at_k(pred: List[str], gold: List[str], k: int) -> float:
    """Fraction of the top-k predictions that are gold names (case-insensitive).

    Args:
        pred: Predicted names, best first.
        gold: Relevant names for the query.
        k: Cut-off rank. The denominator is always ``k``, even when fewer
            than ``k`` predictions are available.

    Returns:
        A value in [0, 1]; 0.0 when ``k`` is zero or negative.
    """
    # A negative k would slice from the end of `pred` and yield a negative
    # score, so treat every non-positive cut-off as "nothing retrieved".
    if k <= 0:
        return 0.0
    # Lower-case the gold names once instead of once per prediction.
    gold_names = {g.lower() for g in gold}
    hits = sum(1 for p in pred[:k] if p.lower() in gold_names)
    return hits / k

def recall_at_k(pred: List[str], gold: List[str], k: int) -> float:
    """Fraction of distinct gold names found in the top-k predictions.

    Matching is case-insensitive. Each gold name is counted at most once, so
    a retriever that returns the same temple several times cannot push recall
    above 1.0.

    Args:
        pred: Predicted names, best first.
        gold: Relevant names for the query.
        k: Cut-off rank.

    Returns:
        A value in [0, 1]; 0.0 when ``gold`` is empty or ``k`` is not positive.
    """
    gold_names = {g.lower() for g in gold}
    if not gold_names or k <= 0:
        return 0.0
    # Set intersection de-duplicates repeated predictions of the same temple.
    found = gold_names.intersection(p.lower() for p in pred[:k])
    return len(found) / len(gold_names)

def mrr_at_k(pred: List[str], gold: List[str], k: int) -> float:
    """Reciprocal rank of the first gold name within the top-k predictions.

    Names are compared case-insensitively. Returns ``1 / rank`` (rank is
    1-based) for the first match, or 0.0 when none of the top-k predictions
    is a gold name.
    """
    wanted = [name.lower() for name in gold]
    first_rank = next(
        (rank for rank, name in enumerate(pred[:k], start=1) if name.lower() in wanted),
        None,
    )
    return 0.0 if first_rank is None else 1.0 / first_rank

def _score_retrieval_case(case):
    """Retrieve the top 5 hits for one test case and score them.

    Returns a row with the query, precision@3, recall@5, mrr@5 and the
    predicted names joined by "; ".
    """
    gold_names = case['gold_temples']
    hits = chatbot.retrieve(case['query'], k=5)
    names = [hit.get('name', '') for hit in hits]
    # NOTE(review): scoring compares names exactly (case-insensitively). Other
    # outputs in this notebook show names with a location suffix (e.g.
    # "Kedarnath Temple, Uttarakhand"); if retrieve() returns names in that
    # form, the short gold names can never match - confirm against the
    # "predicted" column before trusting near-zero scores.
    return {
        "query": case['query'],
        "precision@3": precision_at_k(names, gold_names, 3),
        "recall@5": recall_at_k(names, gold_names, 5),
        "mrr@5": mrr_at_k(names, gold_names, 5),
        "predicted": "; ".join(names),
    }


retrieval_rows = [_score_retrieval_case(test_case) for test_case in retrieval_test_cases]

retrieval_df = pd.DataFrame(retrieval_rows)
# Show only the metric columns; "predicted" stays in retrieval_df for inspection.
retrieval_df[['query','precision@3','recall@5','mrr@5']]


Unnamed: 0,query,precision@3,recall@5,mrr@5
0,Somnath Temple in Gujarat,0.0,0.0,0.0
1,Shiva temples in Uttarakhand,0.0,0.0,0.0
2,Vaishno Devi temple details,0.333333,1.0,1.0
3,famous Vishnu shrines in Andhra Pradesh,0.0,0.0,0.0
4,durga temples in West Bengal,0.0,0.0,0.0
5,ram temples in Uttar Pradesh,0.333333,0.5,0.5
6,krishna temples in Gujarat,0.0,0.0,0.0
7,shiv temples in tamil nadu,0.0,0.0,0.0
8,parvati temples in himachal,0.0,0.0,0.0
9,sun temple in odisha,0.0,0.0,0.0


4. Human evaluation scaffolding (relevance/groundedness)


We generate chatbot answers for manual rating on relevance, groundedness, clarity, and overall pass/fail.

In [4]:
# Queries sent to the chatbot for manual rating. They mix temple information,
# temple search, distance, trip planning, cost, small talk and off-topic asks.
human_eval_queries = [
    "overview of Golden Temple",
    "story of Kedarnath Temple",
    "visiting guide for Tirumala, Andhra Pradesh",
    "temples in Tamil Nadu",
    "distance from Delhi to Somnath Temple",
    "plan a 3 day trip in Himachal Pradesh",
    "architecture of Konark Sun Temple",
    "scripture mentions for Vaishno Devi",
    "best time to visit Badrinath",
    "find Vishnu temples in Kerala",
    "distance from Mumbai to Siddhivinayak Temple",
    "ram temples in Uttar Pradesh",
    "shiva temples near river Ganga",
    "durga temples in West Bengal",
    "cost for 2 day pilgrimage in Tamil Nadu",
    "hello",
    "what is the capital of France",
    "tell me about Jagannath Temple",
    "history of Brihadeeswarar Temple",
    "how far is Kedarnath from Delhi",
]


def _human_eval_row(query):
    """Ask the chatbot one query and flatten its answer into a CSV-ready row."""
    answer = chatbot.answer(query)
    return {
        'query': query,
        'reply': answer.get('reply', ''),
        'intent': answer.get('intent', ''),
        # Temple names are packed into one cell, separated by ';'.
        'used_temples': ';'.join(answer.get('used_temples', [])),
    }


human_rows = [_human_eval_row(query) for query in human_eval_queries]

# Write the answers to disk so raters can score them, then preview the first rows.
human_df = pd.DataFrame(human_rows)
human_df.to_csv('human_eval_responses.csv', index=False)
human_df.head()


Token indices sequence length is longer than the specified maximum sequence length for this model (5881 > 512). Running this sequence through the model will result in indexing errors


Unnamed: 0,query,reply,intent,used_temples
0,overview of Golden Temple,Overview:\nThe Harmandir Sahib also popularly ...,TEMPLE_INFO,"Golden Temple;Golden Temple, Amritsar, Punjab;..."
1,story of Kedarnath Temple,"Story:\n**The Origin of Kedarnath Temple, Utta...",TEMPLE_INFO,"Kedarnath Temple, Uttarakhand;Kedarnath Temple..."
2,"visiting guide for Tirumala, Andhra Pradesh",The world famous sacred temple of Sri Venkates...,TEMPLE_INFO,Tirupati Temple (Tirumala Venkateshwara);Tirup...
3,temples in Tamil Nadu,Here are some temples you might like:\n- Rames...,FIND_TEMPLES,"Rameshwaram Temple, Tamil Nadu;Ranganathaswamy..."
4,distance from Delhi to Somnath Temple,"ISKCON Temple, Delhi is approximately 0 km fro...",TEMPLE_INFO,"ISKCON Temple, Delhi"


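Rating rubric (for manual annotation; save the scored file as `human_eval_responses_scored.csv` with the columns `relevance`, `groundedness`, `clarity`, and `overall_pass`):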
- Relevance (1-5): How well the answer addresses the query.
- Groundedness (1-5): How well the answer aligns with known facts/sections.
- Clarity (1-5): Organization and readability.
- Overall: Pass / Needs improvement based on above.

In [5]:
# Optional: load the manually scored responses, if present, and print the
# average of each 1-5 rating plus the overall pass rate.
import os
if os.path.exists('human_eval_responses_scored.csv'):
    scored = pd.read_csv('human_eval_responses_scored.csv')
    for col in ['relevance','groundedness','clarity']:
        if col in scored.columns:
            # Coerce to numbers so a stray text cell (e.g. "n/a") is skipped
            # instead of breaking the average.
            print(col, 'avg', pd.to_numeric(scored[col], errors='coerce').mean())
    if 'overall_pass' in scored.columns:
        # Normalise hand-typed verdicts: ignore unrated (empty) rows, tolerate
        # surrounding whitespace and mixed case, and avoid the .str accessor
        # failing when pandas inferred a non-string dtype for the column.
        verdicts = scored['overall_pass'].dropna().astype(str).str.strip().str.lower()
        pct_pass = (verdicts == 'pass').mean()
        print('Overall pass rate:', pct_pass)
else:
    print('human_eval_responses_scored.csv not found; add scores to compute stats.')


human_eval_responses_scored.csv not found; add scores to compute stats.


5. System metrics — latency, errors, guardrail checks

We measure latency, error rate, and off-topic handling for a mix of queries.

In [6]:
# Mixed workload: on-topic temple queries, small talk and off-topic questions.
system_test_queries = [
    "overview of Kedarnath Temple",
    "plan a 2 day trip in Maharashtra",
    "distance from Delhi to Somnath Temple",
    "how far is Tirumala from Chennai",
    "what is 2+2",
    "capital of France",
    "temples dedicated to Vishnu in Gujarat",
    "hi",
]

# Queries from the list above that the bot is expected to decline.
off_topic_queries = {"what is 2+2", "capital of France"}

# Phrase that identifies the guardrail reply. The bare word "temple" used to be
# listed here as well, but it matches practically any chatbot reply (including
# a wrong on-topic answer), so the hit rate could never drop below 1.0.
# NOTE(review): confirm this phrase against the refusal text in chatbot_backend.
off_topic_patterns = ["mainly designed to help with Indian temples"]

latencies = []
errors = 0
off_topic_hits = 0
off_topic_total = 0

for q in system_test_queries:
    failure = None
    start = time.perf_counter()
    try:
        res = chatbot.answer(q)
        reply = res.get('reply', '') or ''
    except Exception as e:
        # Any failure counts towards the error rate; keep going so one bad
        # query does not abort the whole measurement.
        errors += 1
        failure = e
        # Do not scan the exception text for guardrail phrases.
        reply = ''
    end = time.perf_counter()
    latencies.append(end - start)

    if failure is not None:
        print(f'Error for query {q!r}: {failure}')

    if q in off_topic_queries:
        off_topic_total += 1
        if any(p.lower() in reply.lower() for p in off_topic_patterns):
            off_topic_hits += 1

print('Avg latency (s):', np.mean(latencies))
print('Max latency (s):', np.max(latencies))
print('Error rate:', errors / len(system_test_queries))
if off_topic_total:
    print('Off-topic guardrail hit rate:', off_topic_hits / off_topic_total)


Avg latency (s): 0.026205349975498393
Max latency (s): 0.0465204999782145
Error rate: 0.0
Off-topic guardrail hit rate: 1.0


6. Summary of Evaluation Metrics


In [7]:
# Collect the headline numbers from the earlier cells into one table.
# Every input is optional: a metric whose cell has not been run shows up as NaN
# (and its note as "n/a ...") instead of raising a NameError.

def _case_count(name):
    """Return len() of the notebook variable `name`, or "n/a" if it is not defined."""
    value = globals().get(name)
    return len(value) if value is not None else "n/a"


# Overall intent accuracy is stored in the `f1` column of the OVERALL row.
try:
    intent_accuracy = float(summary_df.loc[summary_df['intent']=='OVERALL','f1'].iloc[0])
except (NameError, KeyError, IndexError, TypeError, ValueError):
    intent_accuracy = np.nan

_retrieval = globals().get('retrieval_df')
mean_p3 = _retrieval['precision@3'].mean() if _retrieval is not None else np.nan
mean_r5 = _retrieval['recall@5'].mean() if _retrieval is not None else np.nan
mean_mrr5 = _retrieval['mrr@5'].mean() if _retrieval is not None else np.nan

# No distance evaluation is computed in this notebook yet.
pct_distance_within_10 = np.nan
# Reuse the pass rate from the scored-CSV cell when it exists. Unconditionally
# assigning NaN here used to discard that value.
pct_pass = globals().get('pct_pass', np.nan)
_latencies = globals().get('latencies')
avg_latency = np.mean(_latencies) if _latencies else np.nan

_n_intent = _case_count('intent_test_cases')
_n_retrieval = _case_count('retrieval_test_cases')

metric_summary = [
    {"metric": "Intent accuracy", "value": intent_accuracy, "notes": f"{_n_intent} labelled queries"},
    {"metric": "Precision@3", "value": mean_p3, "notes": f"{_n_retrieval} retrieval queries"},
    {"metric": "Recall@5", "value": mean_r5, "notes": f"{_n_retrieval} retrieval queries"},
    {"metric": "MRR@5", "value": mean_mrr5, "notes": f"{_n_retrieval} retrieval queries"},
    {"metric": "Distance: % within 10%", "value": pct_distance_within_10, "notes": "add distance eval if computed"},
    {"metric": "Human eval: % Pass", "value": pct_pass, "notes": "if scored CSV available"},
    {"metric": "Avg latency (s)", "value": avg_latency, "notes": "system tests"},
]
metric_df = pd.DataFrame(metric_summary)
metric_df


Unnamed: 0,metric,value,notes
0,Intent accuracy,0.833333,12 labelled queries
1,Precision@3,0.066667,10 retrieval queries
2,Recall@5,0.15,10 retrieval queries
3,MRR@5,0.15,10 retrieval queries
4,Distance: % within 10%,,add distance eval if computed
5,Human eval: % Pass,,if scored CSV available
6,Avg latency (s),0.026205,system tests
