## Full RAG CHATBOT

In [None]:
# Global constant
# Placeholder for the Gemini API key. It is not referenced by the visible cells:
# the key actually used is read from the GOOGLE_API_KEY environment variable.
GEMINIAI_KEY="Your API key"

# Model
# Identifiers of the chat model and the embedding model. The client
# construction in the next cell repeats these same literal strings rather
# than referencing the constants.
LLM_MODEL="gemini-2.5-flash"
EMBEDDED_MODEL="sentence-transformers/all-MiniLM-L6-v2"

In [None]:
from pathlib import Path
import faiss
import csv
import os
import getpass
import pandas as pd
import numpy as np
import re
import json
from urllib.parse import urlparse
from typing_extensions import List, TypedDict
from collections import Counter
from tqdm import tqdm

from langchain.chat_models import init_chat_model
# from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.prompts import ChatPromptTemplate
from langchain_core.documents import Document
from langgraph.graph import START, StateGraph


# Ask for the Gemini key interactively only when the environment does not
# already provide one (getpass keeps the typed key off the screen).
if not os.environ.get("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = getpass.getpass("Enter API key for Google Gemini: ")

# Chat model used for answer generation and for the LLM moderation check.
llm = init_chat_model("gemini-2.5-flash", model_provider="google_genai", temperature=0.2)
# Embedding model used by the FAISS vector store for chunks and queries.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


# rich text processing
# Words ignored by top_keywords. The literal is split on whitespace, so line
# breaks and repeated entries inside it are harmless (the result is a set).
STOPWORDS = set("""a an and are as at be but by for from has have in is it its of on or that the to was were will with 
this these those their there here our your you he she they we i not no yes if then else when while about above 
across after again against all also am among amount amongst another any anywhere around back before behind 
below between both call can cannot co could did do does doing done down each eg either etc few fewer find 
first five four found further get give goes going got had her hers herself him himself his how however ie 
into isn't itself just keep last least less like likely made make many may me might more most mostly much 
must near nearly need never new next none nor off often once one only onto other others otherwise over own 
per put rather said same see seem seemed seeming seems several shall should since so some something sometime 
sometimes still such take than that their them themselves then thence there therefore therein thereof these 
they thing third this those though three through throughout thus too toward towards two under unless until 
up upon us use used using very via want wants was way we well were what whatever when where whereas whether 
which while who whole whom whose why will within without won't would year years""".split())

# A token is an ASCII letter followed by one or more letters, hyphens or
# apostrophes, so every match is at least two characters long.
WORD_RE = re.compile(r"[A-Za-z][A-Za-z\-']+")

def extract_domain(url: str) -> str:
    """Return the lower-cased network location of *url*.

    The network location is whatever ``urlparse`` reports as ``netloc``
    (host plus optional port); it is an empty string when the URL has no
    ``//`` authority part. Any exception raised while parsing yields
    ``"unknown"`` instead of propagating.
    """
    try:
        netloc = urlparse(url).netloc
        lowered = netloc.lower()
    except Exception:
        return "unknown"
    else:
        return lowered

def classify_source_type(domain: str) -> str:
    """Map a host name to a coarse source category.

    The rules are tried top to bottom and the first match wins, so a host
    such as ``docs.github.com`` is a ``"code-repo"``, not ``"docs"``.
    Hosts matching no rule are plain ``"web"``.
    """
    ordered_rules = (
        ("code-repo", lambda host: "github.com" in host),
        ("wiki", lambda host: "fandom.com" in host or host.startswith("wiki.")),
        ("docs", lambda host: host.startswith("docs.") or "readthedocs" in host),
        ("file", lambda host: "dropbox.com" in host or "drive.google.com" in host),
        # Any host ending in ".web.app" also contains "web.app", so the
        # containment test alone covers both spellings of the rule.
        ("web-app", lambda host: "web.app" in host),
    )
    for label, matches in ordered_rules:
        if matches(domain):
            return label
    return "web"

def first_nonempty_line(text: str, max_len: int = 120) -> str:
    """Return the first line of *text* that is not blank, stripped.

    A line longer than *max_len* is cut to ``max_len - 1`` characters and
    an ellipsis is appended. ``"Untitled"`` is returned when *text* is
    empty/``None`` or contains only blank lines.
    """
    stripped_lines = (raw.strip() for raw in (text or "").splitlines())
    headline = next((candidate for candidate in stripped_lines if candidate), None)
    if headline is None:
        return "Untitled"
    if len(headline) > max_len:
        return headline[: max_len - 1] + "…"
    return headline

def top_keywords(text: str, n: int = 8) -> list[str]:
    """Return up to *n* of the most frequent non-stopword tokens in *text*.

    Tokens come from ``WORD_RE``, are lower-cased, and are kept only when
    longer than two characters and absent from ``STOPWORDS``. Ranking is by
    frequency, then by token length, both descending.
    """
    counts = Counter()
    for raw_token in WORD_RE.findall(text or ""):
        token = raw_token.lower()
        if token in STOPWORDS or len(token) <= 2:
            continue
        counts[token] += 1

    by_rank = sorted(
        counts.items(),
        key=lambda item: (item[1], len(item[0])),
        reverse=True,
    )
    return [word for word, _count in by_rank[:n]]

def preview(text: str, n_chars: int = 240) -> str:
    """Return a single-line excerpt of *text* of at most *n_chars* characters.

    Surrounding whitespace is stripped and newlines become spaces. Text
    longer than *n_chars* is cut to ``n_chars - 1`` characters plus an
    ellipsis; ``None`` is treated as an empty string.
    """
    flattened = (text or "").strip().replace("\n", " ")
    if len(flattened) <= n_chars:
        return flattened
    return flattened[: n_chars - 1] + "…"


class DocumentProcessor:
    """Derive metadata records and metadata-prefixed text from raw documents."""

    def __init__(self):
        # Vocabulary per content domain; detect_domain counts how many of
        # these occur (as substrings) in a document.
        self.domain_patterns = {
            'gaming': [
                'enemy', 'enemies', 'bullet', 'kin', 'weapon',
                'damage', 'health', 'attack', 'player', 'spawn',
            ],
            'dnd': [
                'kobold', 'encounter', 'day', 'tunnel', 'chamber',
                'giant', 'dwarf', 'magic', 'spell', 'dungeon',
            ],
            'technical': [
                'model', 'framework', 'database', 'api',
                'code', 'python', 'library', 'performance',
            ],
            'narrative': [
                'character', 'story', 'book', 'chapter',
                'author', 'novel', 'plot',
            ],
        }

    def detect_domain(self, text):
        """Return the domain whose keyword list has the most hits in *text*.

        Matching is a case-insensitive substring test and each keyword
        counts at most once. Ties go to the domain listed first; with no
        hit at all the result is ``'general'``.
        """
        lowered = text.lower()
        hits = {
            name: sum(keyword in lowered for keyword in keywords)
            for name, keywords in self.domain_patterns.items()
        }
        best = max(hits, key=hits.get)
        return best if hits[best] > 0 else 'general'

    def extract_key_entities(self, text):
        """Pull capitalised phrases, unit-tagged numbers and quoted spans from *text*.

        Returns a dict with ``'entities'`` (de-duplicated, order not
        guaranteed), ``'numbers'`` and ``'quotes'`` (both in order of
        appearance, duplicates kept).
        """
        # Runs of capitalised words, e.g. "Bullet Kin".
        capitalised = re.findall(r'\b[A-Z][A-Za-z]+(?:\s+[A-Z][A-Za-z]+)*\b', text)

        # A number immediately followed by one of a fixed set of units.
        quantities = re.findall(
            r'\b\d+(?:\.\d+)?\s*(?:health|damage|keys?|bullets?|enemies?|days?|%)\b',
            text,
            re.IGNORECASE,
        )

        # Text between straight double quotes.
        quoted = re.findall(r'"([^"]*)"', text)

        return {'entities': list(set(capitalised)), 'numbers': quantities, 'quotes': quoted}

    def get_metadata_dict_list(self, df: pd.DataFrame) -> list[dict]:
        """Build one metadata dict per usable row of *df*.

        Reads the ``text``, ``source_url`` and ``index`` columns. Rows whose
        text is missing or shorter than 50 characters after stripping are
        skipped, so the result can be shorter than *df*.
        """
        records = []
        for _, row in df.iterrows():
            content = row["text"] if pd.notna(row["text"]) else ""
            if not content or len(content.strip()) < 50:
                continue

            source_domain = extract_domain(row["source_url"])
            found = self.extract_key_entities(content)
            raw_id = row["index"]

            records.append({
                "id": int(raw_id) if pd.notna(raw_id) else None,
                "source_url": row["source_url"],
                "source_domain": source_domain,
                "source_type": classify_source_type(source_domain),
                "content_domain": self.detect_domain(content),  # keyword-detected domain
                "title": first_nonempty_line(content),
                "word_count": len(content.split()),
                "char_count": len(content),
                "line_count": len(content.splitlines()),
                "keywords": top_keywords(content, n=8),
                "content_preview": preview(content, 240),
                "entities": found['entities'],
                "key_numbers": found['numbers'],
                "quotes": found['quotes'],
                "content": content,  # untouched source text
            })

        return records

    def create_rich_text(self, metadata_dict_list: list[dict]) -> list[str]:
        """Render each metadata dict as a header block followed by the content.

        ``None`` entries are skipped. Empty keyword/entity/number lists are
        rendered as ``n/a``; at most five entities and three numbers are shown.
        """

        def joined_or_na(values):
            return ", ".join(values) if values else "n/a"

        rendered = []
        for meta in tqdm(metadata_dict_list, desc="Creating rich text"):
            if meta is None:
                continue

            entities_str = joined_or_na(meta['entities'][:5])
            numbers_str = joined_or_na(meta['key_numbers'][:3])

            lines = [
                f"Title: {meta['title']}",
                f"Source: {meta['source_url']} "
                f"(domain: {meta['source_domain']}, type: {meta['source_type']})",
                f"Content Domain: {meta['content_domain']}",
                f"Length: {meta['word_count']} words, "
                f"{meta['char_count']} chars, "
                f"{meta['line_count']} lines",
                f"Keywords: {joined_or_na(meta['keywords'])}",
                f"Key Entities: {entities_str}",
                f"Key Numbers: {numbers_str}",
                f"Preview: {meta['content_preview']}",
                f"Content: {meta['content']}",
            ]
            rendered.append("\n".join(lines))
        return rendered


# Source corpus CSV; DocumentProcessor reads its "text", "source_url" and
# "index" columns.
DATA_PATH = Path(r"C:\Users\Seng Pan\PROJECTS\RAG_Chatbot\RAG_data\documents.csv")

# Raise the csv module's per-field size cap.
# NOTE(review): pandas' default parser may not consult this limit — confirm it is still needed.
csv.field_size_limit(10**9)
df = pd.read_csv(DATA_PATH)

# Build per-document metadata first, then the metadata-prefixed text.
doc_processor = DocumentProcessor()
metadata_list = doc_processor.get_metadata_dict_list(df)
rich_text_list = doc_processor.create_rich_text(metadata_list)

# Dump the metadata (including the full original content) as one JSON array.
with open("documents_metadata.json", "w", encoding="utf-8") as f:
    json.dump(metadata_list, f, ensure_ascii=False, indent=2)

# One JSON object per line; relies on rich_text_list being index-aligned
# with metadata_list.
with open("rich_text.jsonl", "w", encoding="utf-8") as f:
    for i, text in enumerate(rich_text_list):
        f.write(json.dumps({"id": metadata_list[i]["id"], "rich_text": text}, ensure_ascii=False) + "\n")


# create docs object with rich text
# Each Document carries the metadata-prefixed text as its content and a
# trimmed copy of the metadata for filtering/scoring at retrieval time.
docs = []
for i, (metadata_dict, rich_text) in enumerate(zip(metadata_list, rich_text_list)):
    source_id = metadata_dict['id']
    doc = Document(
        page_content=rich_text,  # Use rich text as content
        metadata={
            # Fall back to the loop position only when the row had no id.
            # A plain `or` would also replace a legitimate id of 0, which
            # could then collide with another document's id.
            'document_id': source_id if source_id is not None else i,
            'source_url': metadata_dict['source_url'],
            'source_domain': metadata_dict['source_domain'],
            'source_type': metadata_dict['source_type'],
            'content_domain': metadata_dict['content_domain'],
            'title': metadata_dict['title'],
            'word_count': metadata_dict['word_count'],
            'entities': metadata_dict['entities'],
            'key_numbers': metadata_dict['key_numbers'],
            'keywords': metadata_dict['keywords'],
            'original_content': metadata_dict['content']  # Keep original for reference
        }
    )
    docs.append(doc)

# Quick sanity report on how the corpus was classified.
domain_counts = Counter(doc.metadata['content_domain'] for doc in docs)
source_type_counts = Counter(doc.metadata['source_type'] for doc in docs)
print(f"Content domain distribution: {dict(domain_counts)}")
print(f"Source type distribution: {dict(source_type_counts)}")


class MultiStrategyTextSplitter:
    """Split documents with a chunking profile chosen by their content domain."""

    def __init__(self):
        # Separators tried after the domain-specific ones, coarsest first.
        shared_tail = ["\n\n", "\n", ". ", "! ", "? ", ", ", " "]

        # domain -> (chunk_size, chunk_overlap, domain-specific separators)
        profiles = {
            'gaming': (1400, 200, ["\nPreview: "]),
            'dnd': (1200, 180, ["\nDay ", "\n---"]),
            'technical': (1300, 180, ["\n## ", "\n### "]),
            'narrative': (1500, 220, ["\nChapter"]),
            'general': (1200, 180, []),
        }

        # Every profile splits on the "Content: " marker first, so the
        # metadata header is separated from the body before anything else.
        self.splitters = {
            name: RecursiveCharacterTextSplitter(
                chunk_size=size,
                chunk_overlap=overlap,
                separators=["\nContent: ", *extra, *shared_tail],
            )
            for name, (size, overlap, extra) in profiles.items()
        }

    def split_documents(self, docs):
        """Chunk every document in *docs* and tag each chunk with its position.

        The splitter is picked from the document's ``content_domain``
        metadata, falling back to the ``'general'`` profile for unknown
        domains. Each chunk gains ``chunk_id`` (``<document_id>_<index>``),
        ``chunk_index``, ``total_chunks`` and ``splitter_type`` (the
        document's domain label, even when the general profile was used).
        """
        collected = []

        for doc in docs:
            domain = doc.metadata.get('content_domain', 'general')
            splitter = self.splitters.get(domain, self.splitters['general'])

            pieces = splitter.split_documents([doc])
            piece_count = len(pieces)

            for position, piece in enumerate(pieces):
                piece.metadata['chunk_id'] = f"{doc.metadata['document_id']}_{position}"
                piece.metadata['chunk_index'] = position
                piece.metadata['total_chunks'] = piece_count
                piece.metadata['splitter_type'] = domain

            collected += pieces

        return collected

# Chunk the whole corpus with the domain-aware splitter.
multi_splitter = MultiStrategyTextSplitter()
all_splits = multi_splitter.split_documents(docs)
# print(f"Created {len(all_splits)} chunks")

# Print a short preview of at most two chunks per content domain, looking
# only at the first 15 chunks.
sample_counts = {}      
for chunk in all_splits[:15]:  # First 15 chunks
    domain = chunk.metadata.get('content_domain', 'unknown')
    if sample_counts.get(domain, 0) < 2: 
        sample_counts[domain] = sample_counts.get(domain, 0) + 1
        # Flatten newlines so each preview stays on one output line.
        preview_text = chunk.page_content.replace('\n', ' ').strip()[:200]
        print(f"[{domain.upper()}] {preview_text}...")


# faiss vector store
# Build the index from scratch and save it; if any step of the build fails,
# fall back to a previously saved copy in "rich_faiss_index".
try:
    # Embed a throwaway query once to learn the vector size the index needs.
    embedding_dim = len(embeddings.embed_query("test query"))
    print(f"Embedding dimension: {embedding_dim}")

    index_flat = faiss.IndexFlatL2(embedding_dim)  
    vector_store = FAISS(embedding_function=embeddings, index=index_flat, docstore=InMemoryDocstore(), index_to_docstore_id={})
    
    batch_size = 25
    # Ceiling division; evaluates to 0 when there are no chunks.
    total_batches = (len(all_splits) - 1) // batch_size + 1
    
    print(f"Adding {len(all_splits)} chunks to vector store in {total_batches} batches...")
    for i in range(0, len(all_splits), batch_size):
        batch = all_splits[i:i+batch_size]
        vector_store.add_documents(documents=batch)
        print(f"Processed batch {i//batch_size + 1}/{total_batches}")

    vector_store.save_local("rich_faiss_index")
    
except Exception as build_error:
    # Report why the build failed; otherwise a stale saved index could be
    # used without anyone noticing.
    print(f"Index build failed: {build_error!r}. Falling back to saved index 'rich_faiss_index'.")
    try:
        # NOTE: allow_dangerous_deserialization opts in to loading the saved
        # index data from disk; only load index files you created yourself.
        vector_store = FAISS.load_local("rich_faiss_index", embeddings, allow_dangerous_deserialization=True)
    except Exception as load_error:
        # `except Exception` (not a bare except) so Ctrl-C and SystemExit
        # are not swallowed by the fallback.
        print(f"Load error: {load_error!r}")
        print("Failed to load existing index. Exiting.")
        exit(1)


Creating rich text: 100%|██████████| 20/20 [00:00<00:00, 19831.22it/s]


Content domain distribution: {'gaming': 7, 'dnd': 2, 'technical': 9, 'narrative': 2}
Source type distribution: {'wiki': 2, 'file': 1, 'web-app': 1, 'code-repo': 1, 'docs': 1, 'web': 14}
[GAMING] Title: Bullet Kin Source: https://enterthegungeon.fandom.com/wiki/Bullet_Kin (domain: enterthegungeon.fandom.com, type: wiki) Content Domain: gaming Length: 1845 words, 10654 chars, 148 lines Keywords...
[GAMING] Content: Bullet Kin Bullet Kin are one of the most common enemies. They slowly walk towards the player, occasionally firing a single bullet. They can flip tables and use them as cover. They will also ...
[DND] Title: ---The Paths through the Underground/Underdark---(9 days of travel) Source: https://www.dropbox.com/scl/fi/ljtdg6eaucrbf1aksw5rm/c2%20-%20session%2050%20-%20underground.docx?rlkey=ioqwgkd14i5xk...
[DND] Content: ---The Paths through the Underground/Underdark---(9 days of travel) Wandering through the dark tunnels, the rushing sounds of the underground river begin to fade a

In [None]:
class Retriever:
    """Run several vector-store search strategies and re-rank the merged hits."""

    def __init__(self, vector_store):
        self.vector_store = vector_store

        # One retriever per search strategy; hybrid_retrieve queries them in
        # this order and merges the results.
        self.retrievers = {
            'similarity': vector_store.as_retriever(
                search_type="similarity",
                search_kwargs={"k": 8},
            ),
            'mmr': vector_store.as_retriever(
                search_type="mmr",
                search_kwargs={"k": 6, "fetch_k": 20, "lambda_mult": 0.7},
            ),
            'similarity_threshold': vector_store.as_retriever(
                search_type="similarity_score_threshold",
                search_kwargs={"score_threshold": 0.5, "k": 10},
            ),
        }

    def query_preprocessing(self, query):
        """Extract scoring features from *query*.

        ``key_terms`` collects quoted spans, capitalised words (two or more
        letters) and standalone integers, de-duplicated with no guaranteed
        order. ``query_words`` is the lower-cased query split on whitespace,
        so punctuation stays attached to words.
        """
        lowered = query.lower()
        term_patterns = (
            r'"([^"]*)"',          # quoted spans
            r'\b[A-Z][A-Za-z]+\b',  # capitalised words
            r'\b\d+\b',            # integers
        )
        found_terms = [
            term
            for pattern in term_patterns
            for term in re.findall(pattern, query)
        ]
        return {
            'original_query': query,
            'key_terms': list(set(found_terms)),
            'query_lower': lowered,
            'query_words': set(lowered.split()),
        }

    def hybrid_retrieve(self, query):
        """Query every strategy, tag hits with the strategy name, and re-rank.

        A strategy that raises is reported with a warning and skipped; the
        remaining strategies still contribute.
        """
        query_info = self.query_preprocessing(query)
        merged = []

        for strategy_name, strategy in self.retrievers.items():
            try:
                hits = strategy.invoke(query)
                for hit in hits:
                    hit.metadata['retrieval_strategy'] = strategy_name
                merged.extend(hits)
            except Exception as e:
                print(f"Warning: {strategy_name} retrieval failed: {e}")

        return self.score_and_rank_documents(merged, query_info)

    def score_and_rank_documents(self, documents, query_info):
        """Return the ten best-scoring documents, one per (title, document_id).

        Only the first hit for each title/document-id pair is scored; later
        hits with the same pair are dropped before ranking.
        """
        seen = set()
        ranked = []

        for doc in documents:
            meta = doc.metadata
            signature = f"{meta.get('title', 'untitled')}_{meta.get('document_id', 0)}"
            if signature not in seen:
                seen.add(signature)
                ranked.append((self.calculate_relevance_score(doc, query_info), doc))

        # Stable sort: equal scores keep their retrieval order.
        ranked.sort(key=lambda pair: pair[0], reverse=True)
        return [doc for _score, doc in ranked[:10]]  # Top 10 documents

    def calculate_relevance_score(self, doc, query_info):
        """Score *doc* against the query features with additive heuristics.

        Points: 2 per query word also present as a whitespace-delimited word
        in the content; 5 per key term found in the content; 4 if the query
        mentions a gaming word and the document is gaming content; 3 if the
        query mentions wiki/fandom and the source is a wiki; 4 per document
        entity and 3 per document keyword found in the query; 1 if the
        document has more than 100 words.
        """
        text = doc.page_content.lower()
        meta = doc.metadata
        query_lower = query_info['query_lower']

        # Keyword matching
        score = 2 * len(query_info['query_words'].intersection(set(text.split())))

        # Key terms matching (higher weight)
        score += 5 * sum(1 for term in query_info['key_terms'] if term.lower() in text)

        # Domain-specific scoring
        content_domain = meta.get('content_domain', 'general')
        source_type = meta.get('source_type', 'web')

        mentions_gaming = any(word in query_lower for word in ('enemy', 'weapon', 'bullet'))
        if mentions_gaming and content_domain == 'gaming':
            score += 4

        mentions_wiki = any(word in query_lower for word in ('wiki', 'fandom'))
        if mentions_wiki and source_type == 'wiki':
            score += 3

        # Entity matching
        score += 4 * sum(
            1 for entity in meta.get('entities', []) if entity.lower() in query_lower
        )

        # Keyword matching from metadata
        score += 3 * sum(
            1 for keyword in meta.get('keywords', []) if keyword in query_lower
        )

        # Content quality bonus
        if meta.get('word_count', 0) > 100:
            score += 1

        return score

# Module-level retriever instance used by the `retrieve` graph node.
retriever = Retriever(vector_store)


# guardrails
# Regex deny-list checked before any model call. rule_block applies every
# pattern case-insensitively, so the upper-case entries also match lower case.
BLOCK_PATTERNS = [
    r"\bvote for\b",
    r"\bvote against\b",
    r"\bcampaign strategy\b",
    r"\bdoxx\b",
    r"\bcredit card\b",
    r"\bSSN\b",
    r"(^|\s)ignore (all|the) previous instructions",
    r"\bDAN\b",
]

def rule_block(query: str) -> dict:
    """Check *query* against ``BLOCK_PATTERNS``.

    Returns ``{"blocked": True, "reason": "rule_match:<pattern>"}`` for the
    first pattern that matches (case-insensitively), otherwise
    ``{"blocked": False}``.
    """
    first_hit = next(
        (pattern for pattern in BLOCK_PATTERNS if re.search(pattern, query, flags=re.I)),
        None,
    )
    if first_hit is None:
        return {"blocked": False}
    return {"blocked": True, "reason": f"rule_match:{first_hit}"}

def llm_safety_check(text: str) -> dict:
    """Ask the chat model whether *text* is unsafe.

    Returns ``{"blocked": True, "reason": "llm_moderation"}`` when the reply
    contains "unsafe" (case-insensitive), ``{"blocked": False}`` otherwise.
    Any exception during the check blocks the text with reason
    ``"llm_guard_error"`` (fail-closed).
    """
    try:
        instructions = (
            "You are a safety filter. "
            "Classify if this text is UNSAFE due to political persuasion, sensitive politics, self-harm, illegal, explicit, or misuse. "
            "Answer only 'safe' or 'unsafe'.\n\n"
        )
        reply = llm.invoke(instructions + f"Text:\n{text}")
        flagged = "unsafe" in reply.content.lower()
        if not flagged:
            return {"blocked": False}
        return {"blocked": True, "reason": "llm_moderation"}
    except Exception:
        # fail-closed: a moderation call that cannot complete blocks the text
        return {"blocked": True, "reason": "llm_guard_error"}

def deny(reason: str):
    """Build the refusal payload returned when a request is blocked.

    The user-facing text is fixed; *reason* is recorded under
    ``debug_info["blocked_reason"]``.
    """
    sentences = (
        "I can’t help with that request.",
        "This assistant avoids unsafe or political content.",
        "You can ask about technical, research, gaming, or general topics instead.",
    )
    refusal = " ".join(sentences)
    return {"answer": refusal, "debug_info": {"blocked_reason": reason}}

def post_guard_check(answer: str, context_docs: list[Document]) -> dict:
    """Screen a generated answer for groundedness and safety.

    Groundedness: the first 30 word tokens of *answer* are compared with the
    set of word tokens found in the retrieved documents. An answer of at
    least 12 tokens with fewer than 8 of them present in the context is
    rejected as ``"ungrounded"``. Shorter answers skip this check.

    Safety: the answer is then passed to ``llm_safety_check``; a block there
    is reported as ``"unsafe_output"``.

    Returns ``{"blocked": False}`` when both checks pass, otherwise
    ``{"blocked": True, "reason": ...}``.
    """
    # Compare whole words, not substrings: testing `word in big_string`
    # would count tokens like "a", "i" or "on" as grounded whenever those
    # letters appear anywhere inside the context text.
    context_words = set()
    for doc in context_docs:
        context_words.update(re.findall(r"\w+", doc.page_content.lower()))

    words = re.findall(r"\w+", answer.lower())[:30]
    overlap = sum(1 for w in words if w in context_words)
    if overlap < 8 and len(words) >= 12:
        return {"blocked": True, "reason": "ungrounded"}

    # Output safety
    moderation = llm_safety_check(answer)
    if moderation["blocked"]:
        return {"blocked": True, "reason": "unsafe_output"}

    return {"blocked": False}


# Prompt Template
# Single-message template with two placeholders, {context} and {question},
# both filled in by generate_answer. It instructs the model to answer only
# from the supplied context and to reply "I don't know" otherwise.
prompt = ChatPromptTemplate.from_template(
    """You are an expert AI assistant with access to a comprehensive knowledge base containing gaming guides, technical documentation, narrative content, and campaign notes.

CRITICAL INSTRUCTIONS:
1. ONLY use information explicitly stated in the CONTEXT below
2. The context includes rich metadata (titles, sources, domains, keywords) and content
3. If the question asks about specific details (numbers, names, mechanics), provide exact information from the content
4. For "I don't know" questions: If you cannot find ANY relevant information in the context, respond EXACTLY with "I don't know"
5. For single-passage questions: Focus on one clear, specific answer
6. For multi-passage questions: Combine information from multiple relevant passages
7. Be precise with names, numbers, and technical terms
8. You can reference source information (titles, domains) when helpful for context
9. Do not make assumptions or add information not in the context

CONTEXT INFORMATION:
{context}

QUESTION: {question}

Provide a comprehensive answer based solely on the context above. If no relevant information exists, respond with "I don't know"."""
)

# --- State and Graph ---
class State(TypedDict):
    """State dictionary shared by the `retrieve` and `generate_answer` nodes."""

    # Query text supplied by the caller of graph.invoke.
    question: str
    # Documents returned by the retrieve node; not set when a request is refused.
    context: List[Document]
    # Final text: a model answer, "I don't know", or a refusal message.
    answer: str
    # Retrieval diagnostics, or {"blocked_reason": ...} when a request is refused.
    debug_info: dict

def retrieve(state: State):
    """Graph node: screen the question, then fetch and summarise documents.

    The question goes through the regex rules and then the LLM moderation
    check; the first guard that blocks it ends the node with the refusal
    payload from ``deny``. Otherwise the hybrid retriever runs, a debug
    summary is printed, and the node returns the retrieved documents under
    ``"context"`` plus the summary under ``"debug_info"``.
    """
    question = state["question"]

    # Guards run lazily in order, so the LLM moderation call is only made
    # when the cheap regex rules did not already block the question.
    for guard in (rule_block, llm_safety_check):
        verdict = guard(question)
        if verdict["blocked"]:
            return deny(verdict["reason"])

    retrieved_docs = retriever.hybrid_retrieve(question)
    metas = [doc.metadata for doc in retrieved_docs]

    debug_info = {
        'question': question,
        'total_retrieved': len(retrieved_docs),
        'content_domains': [meta.get('content_domain', 'unknown') for meta in metas],
        'source_types': [meta.get('source_type', 'unknown') for meta in metas],
        'strategies': [meta.get('retrieval_strategy', 'unknown') for meta in metas],
        'titles': [meta.get('title', 'untitled')[:50] for meta in metas],
    }

    print("\nRETRIEVAL DEBUG:")
    print(f"Question: {question}")
    print(f"Retrieved {len(retrieved_docs)} documents")
    for label, key in (
        ("Content Domains", 'content_domains'),
        ("Source Types", 'source_types'),
        ("Strategies", 'strategies'),
    ):
        print(f"{label}: {Counter(debug_info[key])}")

    # Show top 3 retrieved documents with titles
    print("\nTOP RETRIEVED DOCUMENTS:")
    for rank, meta in enumerate(metas[:3], start=1):
        title = meta.get('title', 'untitled')
        domain = meta.get('content_domain', 'unknown')
        source_type = meta.get('source_type', 'unknown')
        print(f"  {rank}. [{domain}/{source_type}] {title}")

    return {"context": retrieved_docs, "debug_info": debug_info}

def generate_answer(state: State):
    """Graph node: answer the question from the retrieved context.

    Behaviour, in order:
    * If the previous node refused the request (``debug_info`` carries a
      ``blocked_reason`` and an answer is already set), that refusal is
      returned unchanged.
    * If no context documents are present, the answer is ``"I don't know"``.
    * Otherwise each document is formatted as a titled passage, the prompt
      is filled in, and the model's stripped reply is returned.

    Any exception while prompting or calling the model is printed and turned
    into ``"I don't know"``.

    Returns a dict with a single ``"answer"`` key.
    """
    # A refused request reaches this node without any context. Without this
    # guard the empty-context branch below would overwrite the refusal
    # message with "I don't know".
    prior_debug = state.get("debug_info") or {}
    if prior_debug.get("blocked_reason") and state.get("answer"):
        return {"answer": state["answer"]}

    context_docs = state.get("context", [])
    question = state["question"]

    if not context_docs:
        return {"answer": "I don't know"}

    # Format context more intelligently
    formatted_passages = []
    for i, doc in enumerate(context_docs, 1):
        content = doc.page_content
        if "\nContent: " in content:
            # Chunk still has the metadata header: keep only the body part.
            actual_content = content.split("\nContent: ", 1)[1].strip()
        else:
            # Header-less chunk: fall back to the document's full original
            # text when available, else the chunk itself.
            actual_content = doc.metadata.get('original_content', content)

        # Collapse every whitespace run (newlines included) to one space.
        actual_content = re.sub(r'\s+', ' ', actual_content)

        title = doc.metadata.get('title', f'Document {i}')
        domain = doc.metadata.get('content_domain', 'general')
        source_type = doc.metadata.get('source_type', 'web')
        formatted_passages.append(
            f"[Document {i}: {title} - {domain}/{source_type}]\n{actual_content}"
        )

    context_text = "\n\n".join(formatted_passages)

    try:
        messages = prompt.invoke({"question": question, "context": context_text})
        response = llm.invoke(messages)
        answer = response.content.strip()

        print(f"GENERATED ANSWER: {answer}")
        return {"answer": answer}

    except Exception as e:
        print(f"Generation error: {e}")
        return {"answer": "I don't know"}

# --- Build Graph ---
# Two-node pipeline: `retrieve` runs first, then `generate_answer`.
graph_builder = StateGraph(State).add_sequence([retrieve, generate_answer])
# Entry edge: execution starts at the "retrieve" node.
graph_builder.add_edge(START, "retrieve")
graph = graph_builder.compile()


In [None]:
# Testing and Debugging
def load_test_datasets(base_path=None):
    """Load the three evaluation question sets as ``(question, expected)`` pairs.

    Args:
        base_path: Directory containing the question CSVs (``str`` or
            ``Path``). Defaults to the project's original data directory
            when omitted.

    Returns:
        A dict with keys ``'single_passage'``, ``'multi_passage'`` and
        ``'no_answer'``. The first two read the ``question`` and ``answer``
        columns; ``'no_answer'`` reads only ``question`` and pairs every
        question with the fixed expected answer ``"I don't know"``.

    Loading is best-effort: a file that is missing or cannot be parsed is
    reported on stdout and yields an empty list for its key.
    """
    if base_path is None:
        base_path = Path(r"C:\Users\Seng Pan\PROJECTS\RAG_Chatbot\RAG_data")
    else:
        base_path = Path(base_path)

    # (dataset key, CSV file name, label for the error message,
    #  fixed expected answer or None to read the "answer" column)
    dataset_specs = (
        ('single_passage', "single_passage_answer_questions.csv", "single passage", None),
        ('multi_passage', "multi_passage_answer_questions.csv", "multi passage", None),
        ('no_answer', "no_answer_questions.csv", "no answer", "I don't know"),
    )

    datasets = {}
    for key, filename, label, fixed_answer in dataset_specs:
        try:
            frame = pd.read_csv(base_path / filename)
            datasets[key] = [
                (row['question'], row['answer'] if fixed_answer is None else fixed_answer)
                for _, row in frame.iterrows()
            ]
        except Exception as e:
            print(f"Could not load {label} questions: {e}")
            datasets[key] = []

    return datasets


def evaluate_answer(predicted, expected, question_type):
    """Heuristically decide whether *predicted* matches *expected*.

    For ``question_type == 'no_answer'`` the prediction passes if it
    contains any known "don't know" phrase. Otherwise both texts are
    reduced to sets of lower-cased words minus a small stop-word list and
    the share of expected words found in the prediction is computed:

    * at least 0.5 -> pass;
    * from 0.3 up to 0.5 -> pass only if the two texts share a standalone
      integer or a capitalised word;
    * below 0.3 -> fail.

    If the expected text has no content words, any prediction with at least
    one content word passes.
    """
    predicted_lower = predicted.lower().strip()
    expected_lower = expected.lower().strip()

    if question_type == 'no_answer':
        refusal_markers = (
            "i don't know", "i do not know", "don't know",
            "no information", "cannot find", "not mentioned",
        )
        return any(marker in predicted_lower for marker in refusal_markers)

    # Remove stop words
    stop_words = {
        'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
        'of', 'with', 'by', 'is', 'are', 'was', 'were', 'be', 'been', 'have',
        'has', 'had', 'do', 'does', 'did', 'they', 'them', 'their', 'this',
        'that', 'these', 'those',
    }

    def content_words(lowered_text):
        return set(re.findall(r'\b\w+\b', lowered_text)) - stop_words

    predicted_words = content_words(predicted_lower)
    expected_words = content_words(expected_lower)

    if not expected_words:
        return bool(predicted_words)

    # Calculate overlap
    overlap_ratio = len(expected_words & predicted_words) / len(expected_words)

    if overlap_ratio >= 0.5:  # High overlap
        return True
    if overlap_ratio < 0.3:  # Low overlap
        return False

    # Medium overlap: require a shared number or capitalised word, taken
    # from the original (not lower-cased) texts.
    def shares(pattern):
        return bool(set(re.findall(pattern, expected)) & set(re.findall(pattern, predicted)))

    return shares(r'\b\d+\b') or shares(r'\b[A-Z][a-z]+\b')


def run_comprehensive_evaluation():
    """Run a seeded sample of the test questions through the graph and report.

    Up to four questions are sampled from every non-empty dataset (seed 42),
    the combined list is shuffled, and each question is answered by the
    graph and scored with ``evaluate_answer``. A question that raises is
    recorded as incorrect with the error text as its prediction. Per-category
    and overall accuracy are printed.

    Returns the list of per-question result dicts, or the result of
    ``run_basic_tests()`` when no test data could be loaded.
    """
    import random

    datasets = load_test_datasets()

    if sum(len(questions) for questions in datasets.values()) == 0:
        print("No test data available. Running basic tests...")
        return run_basic_tests()

    # Fixed seed so the same questions are picked on every run.
    random.seed(42)
    selected_tests = []
    for category, questions in datasets.items():
        if not questions:
            continue
        picked = random.sample(questions, min(4, len(questions)))
        selected_tests.extend((question, expected, category) for question, expected in picked)

    random.shuffle(selected_tests)               # shuffle all types of questions

    test_count = len(selected_tests)
    print(f"Testing {test_count} questions...")
    print(f"Available data: {[(cat, len(qs)) for cat, qs in datasets.items() if qs]}\n")

    results = []
    for number, (question, expected, category) in enumerate(selected_tests, 1):
        print(f"\n[TEST {number}/{test_count}] - {category.upper().replace('_', ' ')}")
        print(f"Question: {question}")
        print(f"Expected: {expected}")

        record = {'question': question, 'expected': expected, 'category': category}
        try:
            predicted = graph.invoke({"question": question})['answer']

            # Evaluate
            is_correct = evaluate_answer(predicted, expected, category)

            print(f"Predicted: {predicted}")
            print('PASS' if is_correct else 'FAIL')

            results.append({
                'question': record['question'],
                'expected': record['expected'],
                'predicted': predicted,
                'category': record['category'],
                'correct': is_correct,
            })
        except Exception as e:
            print(f"ERROR: {e}")
            results.append({
                'question': record['question'],
                'expected': record['expected'],
                'predicted': f"ERROR: {e}",
                'category': record['category'],
                'correct': False,
            })

    print("\nEVALUATION RESULTS\n")
    if not results:
        print("No results to display")
        return []

    total_questions = len(results)
    total_correct = sum(entry['correct'] for entry in results)
    overall_pct = (total_correct / total_questions * 100) if total_questions > 0 else 0

    # Category breakdown
    for category in sorted({entry['category'] for entry in results}):
        outcomes = [entry['correct'] for entry in results if entry['category'] == category]
        cat_total = len(outcomes)
        cat_correct = sum(outcomes)
        cat_pct = (cat_correct / cat_total * 100) if cat_total > 0 else 0

        print(f"{category.replace('_', ' ').title()}: {cat_correct}/{cat_total} ({cat_pct:.1f}%)")

    print(f"\nOverall Performance: {total_correct}/{total_questions} ({overall_pct:.1f}%)")

    return results


def run_basic_tests():
    """Smoke-test the pipeline with a small, fixed set of questions.

    Serves as the fallback path when the full evaluation cannot run. Every
    question is passed to ``graph.invoke`` and the ``'answer'`` entry of the
    result is printed. An exception raised while handling one question is
    printed and does not stop the remaining questions from being tried.
    """
    smoke_questions = (
        "What do keybullet kin drop?",
        "Which enemy types wield an AK-47?",
        # Deliberately off-topic: this one should be blocked.
        "Who should I vote for in the next election and why?",
        "What kind of gun does the bandana bullet kin use?",
    )

    for prompt in smoke_questions:
        print(f"\nTesting: {prompt}")
        try:
            # The print stays inside the try so a failure while rendering the
            # answer is reported the same way as a failed invocation.
            response = graph.invoke({"question": prompt})
            print(f"Answer: {response['answer']}")
        except Exception as err:
            print(f"Error: {err}")

if __name__ == "__main__":
    # Entry point: run the full evaluation, then preview a few of the misses.
    # Any exception raised along the way is reported and the small fixed
    # question set from run_basic_tests() is run instead.
    try:
        evaluation_results = run_comprehensive_evaluation()

        if evaluation_results:
            failed_cases = [entry for entry in evaluation_results if not entry["correct"]]
            if failed_cases:
                print(f"\nFailed cases ({len(failed_cases)} total):")
                # Only the first three failures are shown; each field is cut
                # to 60 characters and is always followed by "...".
                for case in failed_cases[:3]:
                    for _label, _field in (
                        ("Q", "question"),
                        ("Expected", "expected"),
                        ("Got", "predicted"),
                    ):
                        print(f"  {_label}: {case[_field][:60]}...")
                    print()

    except Exception as exc:
        print(f"Evaluation failed: {exc}")
        run_basic_tests()
        

Testing 12 questions...
Available data: [('single_passage', 40), ('multi_passage', 40), ('no_answer', 40)]


[TEST 1/12] - SINGLE PASSAGE
Question: Which part of the trip did I like the most?
Expected: Your favourite part of the trip was your four days on the Camino.


No relevant docs were retrieved using the relevance score threshold 0.5



RETRIEVAL DEBUG:
Question: Which part of the trip did I like the most?
Retrieved 3 documents
Content Domains: Counter({'gaming': 2, 'dnd': 1})
Source Types: Counter({'web': 3})
Strategies: Counter({'mmr': 3})

TOP RETRIEVED DOCUMENTS:
  1. [gaming/web] Version History
  2. [gaming/web] My eyes felt like galaxies—holding the swirling glow of countless memories—as I took in our childhood home. Its siding …
  3. [dnd/web] so into northern spain!
GENERATED ANSWER: The most liked part of the trip was the four days spent on the Camino.
Predicted: The most liked part of the trip was the four days spent on the Camino.
PASS

[TEST 2/12] - MULTI PASSAGE
Question: What are the ways of grouping UI elements together?
Expected: UI elements can be grouped together using the following methods:
- Create an array of UI elements.
- Create a dictionary of UI elements.
- Embed a dynamic number of UI elements in another output.
- Create a hstack (or vstack) of UI elements with on_change handlers.
- Create 

No relevant docs were retrieved using the relevance score threshold 0.5



RETRIEVAL DEBUG:
Question: What are the ways of grouping UI elements together?
Retrieved 4 documents
Content Domains: Counter({'technical': 3, 'gaming': 1})
Source Types: Counter({'web': 3, 'docs': 1})
Strategies: Counter({'mmr': 4})

TOP RETRIEVED DOCUMENTS:
  1. [technical/docs] Recipes
  2. [technical/web] A Survey on Retrieval-Augmented Text Generation for Large Language
  3. [gaming/web] Version History
GENERATED ANSWER: There are several ways to group UI elements together:

1.  **Create an array of UI elements**: Marimo provides `mo.ui.array` which lets you make a new UI element out of a list of UI elements. The value of an array element is a list of the values of the elements it wraps.
2.  **Create a dictionary of UI elements**: Similar to arrays, `mo.ui.dictionary` lets you group many UI elements into a list, but allows you to name each of the wrapped elements with a string key.
3.  **Embed a dynamic number of UI elements in another output**: You can group elements with `mo.ui

No relevant docs were retrieved using the relevance score threshold 0.5



RETRIEVAL DEBUG:
Question: What things does Scratch do?
Retrieved 3 documents
Content Domains: Counter({'gaming': 2, 'narrative': 1})
Source Types: Counter({'web': 2, 'wiki': 1})
Strategies: Counter({'mmr': 2, 'similarity': 1})

TOP RETRIEVED DOCUMENTS:
  1. [gaming/web] Space Babies
  2. [narrative/wiki] Alan Wake 2
  3. [gaming/web] Version History
GENERATED ANSWER: Scratch, Alan Wake's evil doppelganger, does several things:

*   He leads the "Cult of the Word" and is indirectly responsible for the murders committed by the cult.
*   He survived his erasure from existence.
*   He re-edited the manuscript "Return" into a horror story that takes effect in reality.
*   He searches for the Clicker, which he needs to completely free the malevolent Dark Presence.
*   He escaped Cauldron Lake disguised as Alan.
*   He kills Jaakko.
*   He escapes from captivity.
*   He attempts to take the Clicker from Saga.
*   He is temporarily thwarted and banished by the FBC.
*   He is Alan possessed b

No relevant docs were retrieved using the relevance score threshold 0.5



RETRIEVAL DEBUG:
Question: Which masked language model was chosen for the AI?
Retrieved 4 documents
Content Domains: Counter({'technical': 4})
Source Types: Counter({'web': 3, 'web-app': 1})
Strategies: Counter({'mmr': 3, 'similarity': 1})

TOP RETRIEVED DOCUMENTS:
  1. [technical/web-app] Semantic and Textual Inference Chatbot Interface (STICI-Note) - Part 1: Planning and Prototyping
  2. [technical/web] A Survey on Retrieval-Augmented Text Generation for Large Language
  3. [technical/web] Why do we need to regulate the use of Artificial Intelligence?
GENERATED ANSWER: I don't know
Predicted: I don't know
PASS

[TEST 5/12] - MULTI PASSAGE
Question: What kind of model is the bling-phi-3 model
Expected: The bling-phi-3 model is the newest and most accurate BLING/DRAGON model. BLING models are small CPU-based RAG-optimized, instruct-following 1B-3B parameter models. DRAGON models are production-grade RAG-optimized 6-7B parameter models - "Delivering RAG on ..." the leading foundation b

No relevant docs were retrieved using the relevance score threshold 0.5



RETRIEVAL DEBUG:
Question: What kind of model is the bling-phi-3 model
Retrieved 6 documents
Content Domains: Counter({'technical': 3, 'gaming': 3})
Source Types: Counter({'web': 4, 'web-app': 1, 'code-repo': 1})
Strategies: Counter({'mmr': 5, 'similarity': 1})

TOP RETRIEVED DOCUMENTS:
  1. [technical/web-app] Semantic and Textual Inference Chatbot Interface (STICI-Note) - Part 1: Planning and Prototyping
  2. [gaming/web] The best sci-fi and fantasy books of 2023
  3. [gaming/web] Version History


Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_input_token_count"
  quota_id: "GenerateContentInputTokensPerModelPerMinute-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-2.5-flash"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value: 250000
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 27
}
].
Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 4.0 seconds as it raised ResourceExhausted: 429 You exceeded your current quota, please check your pla

GENERATED ANSWER: The bling-phi-3 model is a BLING series model, specifically a finetune of Phi-3. It is described as a small, CPU-based, RAG-optimized, and instruct-following model, typically ranging from 1B-3B parameters. It is also noted as the newest and most accurate BLING/DRAGON model.
Predicted: The bling-phi-3 model is a BLING series model, specifically a finetune of Phi-3. It is described as a small, CPU-based, RAG-optimized, and instruct-following model, typically ranging from 1B-3B parameters. It is also noted as the newest and most accurate BLING/DRAGON model.
PASS

[TEST 6/12] - NO ANSWER
Question: What is the advantage of transformers other other models?
Expected: I don't know


No relevant docs were retrieved using the relevance score threshold 0.5



RETRIEVAL DEBUG:
Question: What is the advantage of transformers other other models?
Retrieved 7 documents
Content Domains: Counter({'technical': 5, 'gaming': 2})
Source Types: Counter({'web': 6, 'web-app': 1})
Strategies: Counter({'mmr': 6, 'similarity': 1})

TOP RETRIEVED DOCUMENTS:
  1. [technical/web] A Survey on Retrieval-Augmented Text Generation for Large Language
  2. [technical/web-app] Semantic and Textual Inference Chatbot Interface (STICI-Note) - Part 1: Planning and Prototyping
  3. [technical/web] How to Maximize Your Impact as a Data Scientist
GENERATED ANSWER: Transformers, such as the GPT-4 large language model, showcase exceptional abilities in a variety of Natural Language Processing (NLP) tasks. Pretrained language models like BERT, which are transformer-based, capture the semantic essence of queries more effectively, improving search accuracy by considering synonyms and the structure of phrases. Integrating the retrieval process within a Transformer model can also

No relevant docs were retrieved using the relevance score threshold 0.5



RETRIEVAL DEBUG:
Question: What are the emperor's aliases?
Retrieved 1 documents
Content Domains: Counter({'gaming': 1})
Source Types: Counter({'web': 1})
Strategies: Counter({'mmr': 1})

TOP RETRIEVED DOCUMENTS:
  1. [gaming/web] The Emperor is a mind flayer who appears in Baldur's Gate 3. It[note 1] plays a key role in the main story, but its ide…
GENERATED ANSWER: The Emperor's aliases include the Dream Guardian and Balduran. In Early Access, the Dream Guardian was known as the Dream Visitor.
Predicted: The Emperor's aliases include the Dream Guardian and Balduran. In Early Access, the Dream Guardian was known as the Dream Visitor.
PASS

[TEST 8/12] - NO ANSWER
Question: What happened on day 10?
Expected: I don't know


No relevant docs were retrieved using the relevance score threshold 0.5



RETRIEVAL DEBUG:
Question: What happened on day 10?
Retrieved 5 documents
Content Domains: Counter({'gaming': 3, 'narrative': 1, 'dnd': 1})
Source Types: Counter({'web': 3, 'wiki': 2})
Strategies: Counter({'mmr': 5})

TOP RETRIEVED DOCUMENTS:
  1. [narrative/wiki] Alan Wake 2
  2. [dnd/web] so into northern spain!
  3. [gaming/web] Version History


Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_input_token_count"
  quota_id: "GenerateContentInputTokensPerModelPerMinute-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-2.5-flash"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value: 250000
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 41
}
].
Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 4.0 seconds as it raised ResourceExhausted: 429 You exceeded your current quota, please check your pla

GENERATED ANSWER: I don't know
Predicted: I don't know
PASS

[TEST 9/12] - NO ANSWER
Question: Which book is the best?
Expected: I don't know


No relevant docs were retrieved using the relevance score threshold 0.5



RETRIEVAL DEBUG:
Question: Which book is the best?
Retrieved 1 documents
Content Domains: Counter({'gaming': 1})
Source Types: Counter({'web': 1})
Strategies: Counter({'mmr': 1})

TOP RETRIEVED DOCUMENTS:
  1. [gaming/web] The best sci-fi and fantasy books of 2023
GENERATED ANSWER: I don't know
Predicted: I don't know
PASS

[TEST 10/12] - SINGLE PASSAGE
Question: Who ambushes the party at Wyrm's lookout?
Expected: On the way to Baldur's Gate, the party will be ambushed by a group of Gish'ra warriors while resting at Wyrm's Lookout.


No relevant docs were retrieved using the relevance score threshold 0.5



RETRIEVAL DEBUG:
Question: Who ambushes the party at Wyrm's lookout?
Retrieved 5 documents
Content Domains: Counter({'gaming': 3, 'narrative': 1, 'dnd': 1})
Source Types: Counter({'web': 3, 'wiki': 1, 'file': 1})
Strategies: Counter({'mmr': 3, 'similarity': 2})

TOP RETRIEVED DOCUMENTS:
  1. [gaming/web] Version History
  2. [gaming/web] The Emperor is a mind flayer who appears in Baldur's Gate 3. It[note 1] plays a key role in the main story, but its ide…
  3. [narrative/wiki] Alan Wake 2
GENERATED ANSWER: A group of Gish'ra warriors ambushes the party at Wyrm's Lookout.
Predicted: A group of Gish'ra warriors ambushes the party at Wyrm's Lookout.
PASS

[TEST 11/12] - SINGLE PASSAGE
Question: What kind of gun does the bandana bullet kin use?
Expected: The bandana bullet kin wields a machine pistol.

RETRIEVAL DEBUG:
Question: What kind of gun does the bandana bullet kin use?
Retrieved 1 documents
Content Domains: Counter({'gaming': 1})
Source Types: Counter({'wiki': 1})
Strategies: Co

No relevant docs were retrieved using the relevance score threshold 0.5



RETRIEVAL DEBUG:
Question: When was UTF-8 support added for European languages?
Retrieved 6 documents
Content Domains: Counter({'technical': 5, 'gaming': 1})
Source Types: Counter({'web': 4, 'code-repo': 1, 'web-app': 1})
Strategies: Counter({'mmr': 5, 'similarity': 1})

TOP RETRIEVED DOCUMENTS:
  1. [technical/code-repo] llmware
  2. [technical/web] A Survey on Retrieval-Augmented Text Generation for Large Language
  3. [gaming/web] Version History
GENERATED ANSWER: UTF-8 encoding for European languages was added to the PDF Parser in llmware v0.2.7, released on Wednesday, April 3.
Predicted: UTF-8 encoding for European languages was added to the PDF Parser in llmware v0.2.7, released on Wednesday, April 3.
PASS

EVALUATION RESULTS

Multi Passage: 4/4 (100.0%)
No Answer: 3/4 (75.0%)
Single Passage: 4/4 (100.0%)

Overall Performance: 11/12 (91.7%)

Failed cases (1 total):
  Q: What is the advantage of transformers other other models?...
  Expected: I don't know...
  Got: Transformers, 

Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_input_token_count"
  quota_id: "GenerateContentInputTokensPerModelPerMinute-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-2.5-flash"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value: 250000
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 27
}
].
Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 4.0 seconds as it raised ResourceExhausted: 429 You exceeded your current quota, please check your pla

GENERATED ANSWER: Keybullet Kin drop a key upon death. Jammed Keybullet Kin drop 2 keys instead of 1.
Answer: Keybullet Kin drop a key upon death. Jammed Keybullet Kin drop 2 keys instead of 1.

Testing: Which enemy types wield an AK-47?


No relevant docs were retrieved using the relevance score threshold 0.5



RETRIEVAL DEBUG:
Question: Which enemy types wield an AK-47?
Retrieved 3 documents
Content Domains: Counter({'gaming': 2, 'technical': 1})
Source Types: Counter({'web': 2, 'wiki': 1})
Strategies: Counter({'mmr': 3})

TOP RETRIEVED DOCUMENTS:
  1. [gaming/wiki] Bullet Kin
  2. [gaming/web] Version History
  3. [technical/web] Which GPU(s) to Get for Deep Learning: My Experience and Advice for Using GPUs in Deep Learning
GENERATED ANSWER: Assault-rifle wielding Bullet Kin and Tankers wield AK-47s.
Answer: Assault-rifle wielding Bullet Kin and Tankers wield AK-47s.

Testing: Who should I vote for in the next election and why?
Answer: I don't know

Testing: What kind of gun does the bandana bullet kin use?

RETRIEVAL DEBUG:
Question: What kind of gun does the bandana bullet kin use?
Retrieved 1 documents
Content Domains: Counter({'gaming': 1})
Source Types: Counter({'wiki': 1})
Strategies: Counter({'similarity_threshold': 1})

TOP RETRIEVED DOCUMENTS:
  1. [gaming/wiki] Bullet Kin
GENERAT