In [12]:
# Cell 1: imports & configuration
import os
import sys
import time
import subprocess
import uuid
import json
from pathlib import Path
from datetime import datetime
from typing import List, Dict, Tuple, Optional

import numpy as np
import faiss
import pickle

from sentence_transformers import SentenceTransformer, util  # util for cos_sim in evaluation

# Config (edit if needed)
CTX_FILE = "amazon_help_doc.txt"
# Local sentence-transformer directory. The default is machine-specific, so it can be
# overridden without editing the notebook via the BEYONDCHATS_MODEL_PATH environment variable.
MODEL_PATH = os.environ.get("BEYONDCHATS_MODEL_PATH", r"C:\amrita_uni\Projects\BeyondChats\model")
# All cached artifacts (FAISS index, metadata pickle, raw embeddings) live under STORAGE_DIR.
STORAGE_DIR = Path("storage")
STORAGE_DIR.mkdir(parents=True, exist_ok=True)
INDEX_PATH = STORAGE_DIR / "faiss.index"
META_PATH = STORAGE_DIR / "meta.pkl"
EMB_VEC_PATH = STORAGE_DIR / "embeddings.npy"

# Ollama invocation (argv list handed to subprocess; the prompt is sent on stdin)
OLLAMA_CMD = ["ollama", "run", "mistral"]

# Retrieval defaults
INDEX_TOP_K = 5
# Queries whose best similarity score is below this value are treated as out-of-domain.
# 0.20 is a deliberately permissive starting point (0.6-0.75 is the more typical range);
# tune on eval data.
SIM_THRESHOLD = 0.20


In [13]:
# Cell 2: load embedding model and context
print("Loading SentenceTransformer from:", MODEL_PATH)
embedder = SentenceTransformer(MODEL_PATH)

# Fail fast with a clear message when the knowledge-base file is missing.
if not Path(CTX_FILE).exists():
    raise FileNotFoundError(f"Context file {CTX_FILE} not found in project root.")

# One entry per non-blank line of the context file, with surrounding whitespace removed.
with open(CTX_FILE, "r", encoding="utf-8") as f:
    context_lines = [stripped for stripped in (raw.strip() for raw in f) if stripped]

print(f"Loaded {len(context_lines)} context lines.")

# Preview: first five entries, numbered from 1 and truncated to 150 characters.
for i, ln in enumerate(context_lines[:5], start=1):
    print(i, ln[:150])


Loading SentenceTransformer from: C:\amrita_uni\Projects\BeyondChats\model
Loaded 100 context lines.
1 If you are on the Amazon homepage and want to search for a specific product like Crocs, move to the large search bar at the top center of the page, ty
2 If you are on the homepage and want to track your past orders, hover your mouse over the “Accounts & Lists” option at the top right corner, click on “
3 If you are browsing from the homepage and wish to check for discounts and deals, click on the “All” menu at the top left, select “Today’s Deals,” and 
4 If you are on the homepage and want to redeem a gift card, hover over “Accounts & Lists,” click on “Gift Card Balance,” then select “Redeem a Gift Car
5 If you are on the homepage and need to update your payment methods, hover over “Accounts & Lists,” click on “Your Account,” choose “Payment Options,” 


The FAISS `IndexFlatIP` index scores by inner product, which equals cosine similarity when all vectors are L2-normalized.

In [None]:
# Cell 3: create and save FAISS index + metadata + embeddings
def build_and_save_index(texts: List[str], embedder: SentenceTransformer,
                         index_path: Path = INDEX_PATH, meta_path: Path = META_PATH,
                         emb_vec_path: Path = EMB_VEC_PATH):
    """
    Embed `texts`, build an inner-product FAISS index over the L2-normalized
    embeddings, and persist the index, the embeddings and per-line metadata.

    Args:
        texts: the context lines to index, in order.
        embedder: model used to encode the texts.
        index_path: destination of the serialized FAISS index.
        meta_path: destination of the pickled metadata list.
        emb_vec_path: destination of the normalized embedding matrix (.npy).

    Returns:
        (index, embeddings, metadata) where metadata[i] is
        {"line_no": i + 1, "text": texts[i]}.

    Raises:
        ValueError: if `texts` is empty (there is nothing to index and no
            embedding dimension to size the index with).
    """
    if not texts:
        raise ValueError("Cannot build a FAISS index from an empty list of texts.")

    print("Computing embeddings (this may take some time)...")
    embeddings = embedder.encode(texts, convert_to_numpy=True, show_progress_bar=True, batch_size=32)
    # FAISS operates on C-contiguous float32 matrices; coerce explicitly so a model
    # that emits another dtype/layout does not fail inside normalize_L2 / add.
    embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)
    # Normalize in place so inner product (IndexFlatIP) equals cosine similarity
    faiss.normalize_L2(embeddings)
    dim = embeddings.shape[1]
    index = faiss.IndexFlatIP(dim)
    index.add(embeddings)
    faiss.write_index(index, str(index_path))
    print("FAISS index saved to", index_path)
    # Save embeddings & metadata
    np.save(str(emb_vec_path), embeddings)
    print("Embeddings saved to", emb_vec_path)
    # Row i of the index corresponds to metadata[i]; line_no is 1-based for display/citation.
    metadata = [{"line_no": i+1, "text": t} for i, t in enumerate(texts)]
    with open(str(meta_path), "wb") as f:
        pickle.dump(metadata, f)
    print("Metadata saved to", meta_path)
    return index, embeddings, metadata

# Reuse the cached artifacts when all three exist AND still describe the current
# context file; otherwise (re)build them.
if not INDEX_PATH.exists() or not META_PATH.exists() or not EMB_VEC_PATH.exists():
    idx, embs, metadata = build_and_save_index(context_lines, embedder)
else:
    # load
    idx = faiss.read_index(str(INDEX_PATH))
    embs = np.load(str(EMB_VEC_PATH))
    # NOTE(security): pickle.load can execute arbitrary code; only load a meta file
    # that this notebook itself wrote.
    with open(str(META_PATH), "rb") as f:
        metadata = pickle.load(f)
    # A cache built from an older version of the context file would silently return
    # wrong lines (index rows no longer line up with context_lines), so validate it.
    # This does not detect a changed embedding model; delete the storage directory
    # manually after switching models.
    cache_is_current = (
        [m["text"] for m in metadata] == context_lines
        and idx.ntotal == len(context_lines)
        and len(embs) == len(context_lines)
    )
    if cache_is_current:
        print("Loaded existing index, embeddings and metadata.")
    else:
        print("Cached index does not match the current context file; rebuilding.")
        idx, embs, metadata = build_and_save_index(context_lines, embedder)


Loaded existing index, embeddings and metadata.


In [15]:
# Cell 4: retrieval function using FAISS IndexFlatIP and normalized vectors
def search_faiss(query: str, top_k: int = INDEX_TOP_K, threshold: float = SIM_THRESHOLD):
    """
    Embed `query`, search the FAISS index and return the matching context lines.

    Args:
        query: user question to look up.
        top_k: number of nearest neighbours requested from the index.
        threshold: minimum best score for the query to count as in-domain.

    Returns: retrieved: List[dict] with keys (idx, line_no, text, score)
             is_ood: bool (True if nothing was retrieved or max_score < threshold)
             max_score: float (best score among the retrieved hits; 0.0 if there are none)
    """
    query_emb = embedder.encode([query], convert_to_numpy=True)
    faiss.normalize_L2(query_emb)
    D, I = idx.search(query_emb, top_k)  # D are inner products (cosine if normalized)
    retrieved = []
    for score, doc_idx in zip(D[0].tolist(), I[0].tolist()):
        if doc_idx < 0:
            # Slot with no result (fewer than top_k vectors available); its score is a
            # placeholder, not a similarity, so it must not influence max_score.
            continue
        meta = metadata[doc_idx]
        retrieved.append({
            "idx": int(doc_idx),
            "line_no": int(meta["line_no"]),
            "text": meta["text"],
            "score": float(score)
        })
    # Derive the best score from real hits only, so placeholder scores never leak out.
    max_score = max((hit["score"] for hit in retrieved), default=0.0)
    is_ood = (not retrieved) or max_score < threshold
    return retrieved, is_ood, float(max_score)

# Sanity check: a query that should hit the first context line (product search).
print(search_faiss(query="Where do I find Crocs from the homepage?", top_k=5))


([{'idx': 0, 'line_no': 1, 'text': 'If you are on the Amazon homepage and want to search for a specific product like Crocs, move to the large search bar at the top center of the page, type the product name, press Enter, and then scroll through the results while using the filters on the left-hand panel to narrow your choices by size, color, or price.', 'score': 0.6045942306518555}, {'idx': 49, 'line_no': 50, 'text': 'If you are on the homepage and want to review customer service interactions, scroll to the footer, click “Help,” and access past support chats or requests under “Contact History.”', 'score': 0.3660828173160553}, {'idx': 31, 'line_no': 32, 'text': 'If you are on the homepage and want to check magazine subscriptions, type “Magazine Subscriptions” in the search bar and then browse or purchase directly from the results.', 'score': 0.3560046851634979}, {'idx': 42, 'line_no': 43, 'text': 'If you are on the homepage and want to see gift registries, hover over “Accounts & Lists,” c

In [16]:
# Cell 5: Ollama runner and prompt builders

def run_ollama_mistral(prompt: str, timeout: int = 60) -> str:
    """
    Calls local ollama run mistral with prompt as stdin and returns stdout.

    Args:
        prompt: full prompt text, written to the process's stdin.
        timeout: seconds to wait before giving up on the process.

    Returns:
        The process's stdout with surrounding whitespace stripped.

    Raises:
        RuntimeError: if ollama cannot be launched, times out, or exits non-zero.
            The original exception, when there is one, is chained as __cause__.
    """
    try:
        res = subprocess.run(OLLAMA_CMD, input=prompt, text=True, capture_output=True, encoding='utf-8', timeout=timeout)
    except subprocess.TimeoutExpired as e:
        raise RuntimeError(f"ollama timed out: {e}") from e
    except OSError as e:
        # e.g. FileNotFoundError when the ollama executable is not installed / not on PATH
        raise RuntimeError(f"could not launch ollama ({OLLAMA_CMD[0]!r}): {e}") from e
    if res.returncode != 0:
        # include stderr for debugging
        raise RuntimeError(f"ollama error (code {res.returncode}):\n{res.stderr}")
    return res.stdout.strip()

def build_generation_prompt(query: str, retrieved: List[dict]) -> str:
    """
    Assemble the instruction prompt sent to the generator model.

    With retrieved hits, the prompt lists the grounding rules, then a CONTEXT
    block with one "[line_no] text" entry per hit, then the user question.
    Without hits, it is a short prompt telling the model to reply with the
    fixed refusal sentence.
    """
    if retrieved:
        context_block = "\n".join(f"[{hit['line_no']}] {hit['text']}" for hit in retrieved)
        prompt_lines = [
            "You are a professional Amazon Help Assistant. You MUST follow these rules:",
            "",
            "1) Use ONLY the information given in the CONTEXT block below. Do NOT invent or assume facts.",
            "2) If the CONTEXT does NOT contain enough information to answer the question, respond EXACTLY with:",
            '   "Sorry, I cannot answer that from the provided document. Would you like to contact support?"',
            "3) Provide step-by-step actionable directions starting from the Amazon homepage.",
            "4) If you reference specific instructions, cite the CONTEXT line numbers in square brackets, e.g. [23].",
            "5) Be concise, professional, and polite.",
            "",
            "CONTEXT:",
            context_block,
            "",
            "User Question:",
            f"{query}",
            "",
            "Answer:",
        ]
        return "\n".join(prompt_lines)

    # Nothing retrieved: instruct the model to emit only the refusal sentence.
    return (
        f"User question: {query}\n\n"
        "This question cannot be answered from the provided knowledge base. "
        'Respond exactly (no extra content): "Sorry, I cannot answer that from the provided document. '
        'Would you like to contact support?"'
    )

def build_verification_prompt(query: str, retrieved: List[dict], answer: str) -> str:
    """
    Assemble the prompt for the faithfulness check.

    The prompt shows the same "[line_no] text" context entries, the user
    question and the proposed answer, then asks for a one-word YES/NO verdict
    on whether the answer relies only on that context.
    """
    context_block = "\n".join(f"[{hit['line_no']}] {hit['text']}" for hit in retrieved)
    # Note: the TASK sentence intentionally keeps its trailing space before the newline.
    task_text = (
        "Based only on the CONTEXT above, does the Proposed Answer strictly and fully rely ONLY "
        "on the provided CONTEXT (without adding any external facts or assumptions)? \n"
        "Answer with a single word: YES or NO."
    )
    return (
        f"CONTEXT:\n{context_block}\n\n"
        f"User Question:\n{query}\n\n"
        f"Proposed Answer:\n{answer}\n\n"
        f"TASK:\n{task_text}"
    )


In [17]:
# Cell 6: end-to-end RAG answer flow (retrieval -> generate -> verify)
FALLBACK_TEXT = "Sorry, I cannot answer that from the provided document. Would you like to contact support?"

def _parse_verification_verdict(ver_out: str) -> Optional[bool]:
    """Return True/False for the first standalone YES/NO word in `ver_out`, or None if there is none."""
    # Replace every non-letter with a space so punctuation ("YES." / "No,") does not
    # glue onto the word, then compare whole words only: "NOT" or "KNOWN" must not
    # count as "NO".
    words = "".join(ch if ch.isalpha() else " " for ch in ver_out.upper()).split()
    for word in words:
        if word == "YES":
            return True
        if word == "NO":
            return False
    return None

def rag_answer(query: str, top_k: int = INDEX_TOP_K, threshold: float = SIM_THRESHOLD, verify: bool = True) -> Dict:
    """
    End-to-end flow: retrieve context, generate an answer, optionally verify it.

    Returns a dict: {
      'answer': str,
      'is_ood': bool,
      'max_score': float,
      'retrieved': list[ {idx,line_no,text,score} ],
      'verified': bool or None  # None if verification not executed (verify=False);
                                # False for out-of-domain queries and on any failure
      'error': str              # only present when generation or verification raised
    }

    The answer is replaced by FALLBACK_TEXT when the query is out-of-domain, when the
    verifier's verdict is NO or cannot be determined, or when the verification call fails.
    """
    retrieved, is_ood, max_score = search_faiss(query, top_k=top_k, threshold=threshold)
    if is_ood:
        return {"answer": FALLBACK_TEXT, "is_ood": True, "max_score": max_score, "retrieved": retrieved, "verified": False}

    # Build prompt and generate
    gen_prompt = build_generation_prompt(query, retrieved)
    try:
        gen_out = run_ollama_mistral(gen_prompt)
    except Exception as e:
        # If generator fails, return friendly fallback
        return {"answer": "Sorry, an internal error occurred while generating an answer.", "is_ood": False, "max_score": max_score, "retrieved": retrieved, "verified": False, "error": str(e)}

    result = {"answer": gen_out, "is_ood": False, "max_score": max_score, "retrieved": retrieved, "verified": None}
    if not verify:
        return result

    # Verification pass to detect hallucinations
    try:
        ver_prompt = build_verification_prompt(query, retrieved, gen_out)
        ver_out = run_ollama_mistral(ver_prompt)
    except Exception as e:
        # If verification itself fails, be conservative but keep the reason visible
        result.update(answer=FALLBACK_TEXT, verified=False, error=str(e))
        return result

    if _parse_verification_verdict(ver_out) is True:
        result["verified"] = True
    else:
        # Explicit NO, or no recognisable verdict at all: fall back conservatively
        result.update(answer=FALLBACK_TEXT, verified=False)
    return result


In [18]:
# Cell 7: smoke tests
# In-domain questions first, then two that the knowledge base should not cover.
tests = [
    "How do I track my orders from the Amazon homepage?",
    "How do I find Crocs from the homepage?",
    "How do I redeem a gift card starting from homepage?",
    "How do I list a new product as a seller starting from the Amazon homepage?",
    "Who invented the electric toaster?",  # should be OOD
    "hello, how are you?",
]

# Run the full pipeline per question and print the answer plus its diagnostics.
for q in tests:
    out = rag_answer(q)
    print("Q:", q)
    print("Answer:", out["answer"])
    print(
        "is_ood:", out["is_ood"],
        "max_score:", out["max_score"],
        "verified:", out.get("verified"),
    )
    print("-" * 80)


Q: How do I track my orders from the Amazon homepage?
Answer: To track your orders from the Amazon homepage, follow these steps:

1. Hover your mouse over the “Accounts & Lists” option at the top right corner.
2. Click on “Your Orders.”
3. Review the list of your purchases where each item has an option to track its current delivery status.
is_ood: False max_score: 0.7776432633399963 verified: True
--------------------------------------------------------------------------------
Q: How do I find Crocs from the homepage?
Answer: To find Crocs from the Amazon homepage, move to the large search bar at the top center of the page, type "Crocs" and press Enter. Then scroll through the results while using the filters on the left-hand panel to narrow your choices by size, color, or price. [1]
is_ood: False max_score: 0.6121615767478943 verified: True
--------------------------------------------------------------------------------
Q: How do I redeem a gift card starting from homepage?
Answer: To 

In [None]:
# Cell 8: Evaluation helpers
def find_expected_line_indices(keyword: str) -> List[int]:
    """Return the 0-based positions in context_lines whose text contains `keyword` (case-insensitive).

    Used to derive the "relevant" lines for a retrieval eval query from a keyword.
    """
    return [
        position
        for position, line in enumerate(context_lines)
        if keyword.lower() in line.lower()
    ]

def recall_at_k(eval_pairs: List[Tuple[str, List[int]]], k: int = 5) -> float:
    """
    Fraction of eval queries for which at least one expected index is in the top-k results.

    Args:
        eval_pairs: (query, expected_idxs) pairs; expected_idxs are the index positions
            considered relevant for that query.
        k: number of results to retrieve per query.

    Returns:
        hits / number of queries, or 0.0 when `eval_pairs` is empty.
    """
    if not eval_pairs:
        # No queries to score; avoid dividing by zero.
        return 0.0
    hits = 0
    for query, expected_idxs in eval_pairs:
        retrieved, _, _ = search_faiss(query, top_k=k, threshold=-1.0)  # threshold disabled for eval
        retrieved_idxs = {r["idx"] for r in retrieved}
        # A query counts as a hit if any expected index was retrieved.
        if not retrieved_idxs.isdisjoint(expected_idxs):
            hits += 1
    return hits / len(eval_pairs)

def mrr(eval_pairs: List[Tuple[str, List[int]]], k: int = 5) -> float:
    """
    Mean Reciprocal Rank over the eval queries.

    For each query, find the 1-based rank of the first relevant result among the
    top-k and add 1/rank (0 when no relevant result is in the top-k); return the
    mean over all queries.

    Args:
        eval_pairs: (query, expected_idxs) pairs; expected_idxs are the index positions
            considered relevant for that query.
        k: number of results to retrieve per query.

    Returns:
        The mean reciprocal rank, or 0.0 when `eval_pairs` is empty.
    """
    if not eval_pairs:
        # No queries to score; avoid dividing by zero.
        return 0.0
    rr_sum = 0.0
    for query, expected_idxs in eval_pairs:
        retrieved, _, _ = search_faiss(query, top_k=k, threshold=-1.0)  # threshold disabled for eval
        relevant = set(expected_idxs)  # set membership instead of scanning the list per result
        rank = next(
            (pos for pos, hit in enumerate(retrieved, start=1) if hit["idx"] in relevant),
            None,
        )
        rr_sum += (1.0 / rank) if rank else 0.0
    return rr_sum / len(eval_pairs)

# Small retrieval eval set: each entry is (query, keyword). The keyword is looked up
# in the context file to decide which lines count as relevant for the query.
eval_queries = [
    ("track my orders", "track"),
    ("search for Crocs", "Crocs"),
    ("redeem gift card", "gift card"),
    ("check prime membership", "Prime"),
    ("return an item", "Return or Replace"),
    ("list a new product", "Add a Product"),
    ("manage inventory", "Manage Inventory"),
    ("create a promotion", "Promotions"),
    ("fulfillment by amazon", "Fulfillment by Amazon"),
    ("contact seller support", "Contact Seller"),
    ("hello, how are you?", "hello"),
]

# Resolve every keyword to its matching line indices; queries whose keyword matches
# no line are reported and left out of the eval.
eval_pairs = []
for q, keyword in eval_queries:
    idxs = find_expected_line_indices(keyword)
    if not idxs:
        print(f"Warning: keyword '{keyword}' not found in context; skipping {q}")
        continue
    eval_pairs.append((q, idxs))

print("Eval pairs prepared:", len(eval_pairs))

# Retrieval metrics at k=5
r_at_5 = recall_at_k(eval_pairs, k=5)
mrr_5 = mrr(eval_pairs, k=5)
print(f"Retrieval Recall@5: {r_at_5:.3f}  MRR@5: {mrr_5:.3f}")


Eval pairs prepared: 10
Retrieval Recall@5: 1.000  MRR@5: 0.950


In [22]:
# Cell 9: Generation evaluation using verification pass as proxy for faithfulness

def evaluate_generation_faithfulness(eval_queries, top_k=5):
    """
    Run the full RAG flow (with verification) on each query and summarize the outcomes.

    Args:
        eval_queries: iterable of 2-tuples whose first element is the query string;
            the second element is ignored here.
        top_k: number of context lines retrieved per query.

    Returns:
        {"fallback_rate": share of queries answered with FALLBACK_TEXT,
         "verified_rate": share of queries whose answer the verifier accepted}.
        Both rates are 0.0 when `eval_queries` is empty.
    """
    total = len(eval_queries)
    if total == 0:
        # Nothing to evaluate; avoid dividing by zero.
        return {"fallback_rate": 0.0, "verified_rate": 0.0}
    fallback_count = 0
    verified_yes = 0
    for q, _ in eval_queries:
        out = rag_answer(q, top_k=top_k, verify=True)
        if out["answer"].strip() == FALLBACK_TEXT:
            fallback_count += 1
        if out.get("verified") is True:
            verified_yes += 1
    return {"fallback_rate": fallback_count / total, "verified_rate": verified_yes / total}

# Evaluate on the queries that survived the keyword lookup (eval_pairs), not the raw list.
g_eval = evaluate_generation_faithfulness(
    [(query, expected) for query, expected in eval_pairs]
)
print("Generation eval (fallback_rate, verified_rate):", g_eval)


Generation eval (fallback_rate, verified_rate): {'fallback_rate': 0.0, 'verified_rate': 1.0}


The fallback_rate was 0.4 earlier, when the threshold was 0.6. The threshold has since been lowered to 0.2 to get better responses for short, concise queries, so the fallback rate is now 0.