In [None]:

!pip install -q sentence-transformers faiss-cpu openai pandas numpy scikit-learn torch tqdm pytest rouge-score bert-score rank_bm25




  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.6/23.6 MB[0m [31m84.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone


In [None]:
# Cell 1 — Imports
import os, re, json, time, math, random
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer, util
import faiss
from tqdm import tqdm
import torch
from rank_bm25 import BM25Okapi  # optional lexical retrieval
from rouge_score import rouge_scorer   # optional evaluation
from sklearn.metrics import precision_score, recall_score
print("Libraries loaded.")


Libraries loaded.


In [None]:

import os, random, numpy as np, torch

# Root folder for everything this notebook reads and writes.
WORKDIR = "/content/gthackathon"
DATA_DIR = WORKDIR + "/datasets"
KB_DIR = WORKDIR + "/kb"
ARTIFACTS_DIR = WORKDIR + "/artifacts"

# Create the three working folders up front; existing folders are left alone.
for _folder in (DATA_DIR, KB_DIR, ARTIFACTS_DIR):
    os.makedirs(_folder, exist_ok=True)

# Collector lists: the download cells append the files they produce here.
DATA_PATHS = []
KB_DOCS_PATHS = []

# Output files written by the merge cell.
MERGED_DATA_PATH = DATA_DIR + "/merged_pipeline_dataset.csv"
COMBINED_KB_PATH = KB_DIR + "/combined_kb.txt"

print("CONFIG FIXED")
for _label, _value in (
    ("DATA_DIR:", DATA_DIR),
    ("KB_DIR:", KB_DIR),
    ("ARTIFACTS_DIR:", ARTIFACTS_DIR),
    ("MERGED_DATA_PATH:", MERGED_DATA_PATH),
    ("COMBINED_KB_PATH:", COMBINED_KB_PATH),
):
    print(_label, _value)


CONFIG FIXED
DATA_DIR: /content/gthackathon/datasets
KB_DIR: /content/gthackathon/kb
ARTIFACTS_DIR: /content/gthackathon/artifacts
MERGED_DATA_PATH: /content/gthackathon/datasets/merged_pipeline_dataset.csv
COMBINED_KB_PATH: /content/gthackathon/kb/combined_kb.txt


In [None]:
# Cell A — Download SQuAD v1.1 (2000 QA pairs) into DATA_DIR
import requests, json, pandas as pd, os

# Number of QA pairs kept for the pipeline sample.
SQUAD_SAMPLE_SIZE = 2000

squad_url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json"
print("Downloading SQuAD from:", squad_url)
r = requests.get(squad_url, timeout=30)
# Fail loudly on an HTTP error instead of trying to JSON-decode an error page.
r.raise_for_status()
squad = r.json()


def _iter_squad_rows(squad_json):
    """Yield one pipeline row per SQuAD question, in file order.

    Each row carries the question as ``user_text``, the first listed answer
    (or "" when there is none) as ``ground_truth`` and the paragraph text as
    ``kb_context``. ``id`` is a running counter starting at 0.
    """
    idx = 0
    for article in squad_json.get('data', []):
        for para in article.get('paragraphs', []):
            ctxt = para.get('context', "")
            for qa in para.get('qas', []):
                answers = qa.get('answers')
                yield {
                    "id": idx,
                    "user_text": qa.get('question', ""),
                    "lat": None,
                    "lon": None,
                    "user_history": "{}",
                    "gt_doc": -1,
                    "ground_truth": answers[0].get('text', "") if answers else "",
                    "kb_context": ctxt
                }
                idx += 1


# Stop as soon as the sample is full rather than materialising every question.
rows = []
for row in _iter_squad_rows(squad):
    if len(rows) >= SQUAD_SAMPLE_SIZE:
        break
    rows.append(row)

os.makedirs(DATA_DIR, exist_ok=True)
squad_path = os.path.join(DATA_DIR, "squad_sample_2k.csv")
pd.DataFrame(rows).to_csv(squad_path, index=False)
print("Saved SQuAD sample to:", squad_path)

if 'DATA_PATHS' in globals():
    # Re-running this cell must not register the same file twice, otherwise
    # the merge cell would load its rows twice.
    if squad_path not in DATA_PATHS:
        DATA_PATHS.append(squad_path)
    print("Appended to DATA_PATHS")
else:
    print("DATA_PATHS not found — manually add:", squad_path)


Downloading SQuAD from: https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json
Saved SQuAD sample to: /content/gthackathon/datasets/squad_sample_2k.csv
Appended to DATA_PATHS


In [None]:
# Cell B — Clone MultiWOZ repo and extract user utterances into CSV
import os, glob, json, pandas as pd

# Clone the MultiWOZ repository only when the target folder is absent, so
# re-running the cell does not clone again. `--depth 1` requests a shallow
# clone; `|| true` forces a zero exit status even if the clone fails.
if not os.path.exists("/content/multiwoz_repo"):
    !git clone --depth 1 https://github.com/budzianowski/multiwoz.git /content/multiwoz_repo || true
else:
    print("multiwoz_repo already exists")

# Maximum number of user utterances kept in the sample CSV.
MULTIWOZ_SAMPLE_SIZE = 2000


def _user_utterances(doc):
    """Return the user utterance strings found in one parsed JSON document.

    Two layouts are recognised:
      * a single dialogue dict holding its turns under "log";
      * a list of dialogue dicts holding their turns under "turns"
        (the layout of the MultiWOZ 2.2 dialogue files).
    A turn counts as a user turn when its "speaker" (or "role") equals
    "USER" case-insensitively; its text is read from "text" or "utterance".
    Anything else (schema files, act annotations, ...) yields an empty list.
    """
    if isinstance(doc, dict):
        dialogues = [doc]
    elif isinstance(doc, list):
        dialogues = [item for item in doc if isinstance(item, dict)]
    else:
        return []

    found = []
    for dialogue in dialogues:
        turns = dialogue.get("log") or dialogue.get("turns") or []
        if not isinstance(turns, list):
            continue
        for turn in turns:
            if not isinstance(turn, dict):
                continue
            speaker = turn.get("speaker") or turn.get("role") or ""
            if str(speaker).upper() != "USER":
                continue
            txt = turn.get("text", "") or turn.get("utterance", "")
            if txt:
                found.append(txt)
    return found


rows = []
multiwoz_dir = "/content/multiwoz_repo/data"
# Sorted so the sample is the same on every run regardless of filesystem order.
files = sorted(glob.glob(multiwoz_dir + "/**/*.json", recursive=True))
print("Found multiwoz json files:", len(files))
for fn in files:
    if len(rows) >= MULTIWOZ_SAMPLE_SIZE:
        break
    try:
        # Context manager closes the handle even when parsing fails.
        with open(fn, "r", encoding="utf8") as fh:
            doc = json.load(fh)
    except (OSError, ValueError) as exc:
        # ValueError covers both JSON and Unicode decode errors; report the
        # file instead of skipping it silently.
        print("Skipping unreadable file:", fn, "-", exc)
        continue
    for txt in _user_utterances(doc):
        rows.append({
            "id": len(rows),
            "user_text": txt,
            "lat": None,
            "lon": None,
            "user_history": "{}",
            "gt_doc": -1,
            "ground_truth": ""
        })

os.makedirs(DATA_DIR, exist_ok=True)
multiwoz_path = os.path.join(DATA_DIR, "multiwoz_sample.csv")
if rows:
    pd.DataFrame(rows[:MULTIWOZ_SAMPLE_SIZE]).to_csv(multiwoz_path, index=False)
    print("Saved MultiWOZ sample to:", multiwoz_path)

    if 'DATA_PATHS' in globals():
        # Avoid registering the same file twice when the cell is re-run.
        if multiwoz_path not in DATA_PATHS:
            DATA_PATHS.append(multiwoz_path)
        print("Appended to DATA_PATHS")
    else:
        print("DATA_PATHS not found — manually add:", multiwoz_path)
else:
    # An empty frame would be written as a header-less 1-byte CSV that
    # pandas cannot read back, so write and register nothing.
    print("No MultiWOZ user utterances extracted — nothing saved, DATA_PATHS unchanged.")


Cloning into '/content/multiwoz_repo'...
remote: Enumerating objects: 66, done.[K
remote: Counting objects: 100% (66/66), done.[K
remote: Compressing objects: 100% (61/61), done.[K
remote: Total 66 (delta 22), reused 23 (delta 2), pack-reused 0 (from 0)[K
Receiving objects: 100% (66/66), 54.66 MiB | 9.49 MiB/s, done.
Resolving deltas: 100% (22/22), done.
Updating files: 100% (61/61), done.
Found multiwoz json files: 23
Saved MultiWOZ sample to: /content/gthackathon/datasets/multiwoz_sample.csv
Appended to DATA_PATHS


In [None]:
# Cell C — Create a simple one-paragraph-per-line demo KB
os.makedirs(KB_DIR, exist_ok=True)
kb_fp = os.path.join(KB_DIR, "demo_brand_kb.txt")

# One knowledge-base paragraph per line; the trailing newline is part of each entry.
_demo_paragraphs = (
    "Starbucks near Marine Lines serves Hot Cocoa and offers a 10% coupon.\n",
    "Costa Coffee at MG Road offers Hot Chocolate and loyalty discounts.\n",
    "Cafe Nero at Central Mall sells pastries; buy-one-get-one on desserts this week.\n",
    "Central Mall hours: 9 AM to 10 PM. Free parking available.\n",
)
with open(kb_fp, "w", encoding="utf8") as f:
    f.writelines(_demo_paragraphs)
print("Wrote demo KB to:", kb_fp)

# Register the file for the combine step, or tell the user to do it by hand.
if 'KB_DOCS_PATHS' not in globals():
    print("KB_DOCS_PATHS not found — manually add:", kb_fp)
else:
    KB_DOCS_PATHS.append(kb_fp)
    print("Appended to KB_DOCS_PATHS")



Wrote demo KB to: /content/gthackathon/kb/demo_brand_kb.txt
Appended to KB_DOCS_PATHS


In [None]:
import os

print("DATA_PATHS =", DATA_PATHS)
print("KB_DOCS_PATHS =", KB_DOCS_PATHS)

# Report presence and byte size of every registered dataset file.
print("\nChecking files exist:")
for p in DATA_PATHS:
    present = os.path.exists(p)
    size_bytes = os.path.getsize(p) if os.path.exists(p) else 0
    print(p, "=>", present, "size:", size_bytes)

# The two derived files only exist once the merge cell has run.
for _label, _target in (
    ("\nMerged dataset exists?", MERGED_DATA_PATH),
    ("COMBINED KB exists?", COMBINED_KB_PATH),
):
    print(_label, os.path.exists(_target))


DATA_PATHS = ['/content/gthackathon/datasets/squad_sample_2k.csv', '/content/gthackathon/datasets/multiwoz_sample.csv']
KB_DOCS_PATHS = ['/content/gthackathon/kb/demo_brand_kb.txt']

Checking files exist:
/content/gthackathon/datasets/squad_sample_2k.csv => True size: 1895461
/content/gthackathon/datasets/multiwoz_sample.csv => True size: 1

Merged dataset exists? False
COMBINED KB exists? False


In [None]:
# Cell D — Merge datasets in DATA_PATHS into MERGED_DATA_PATH and combine KB_DOCS_PATHS to COMBINED_KB_PATH
import pandas as pd, os

def normalize_df_for_pipeline(df):
    """Return a copy of ``df`` reduced to the six pipeline columns.

    Column handling:
      * ``user_text`` is taken as-is, or renamed from ``question`` or ``text``
        (first one present).
      * ``ground_truth`` is taken as-is, or renamed from ``answer``.
      * Missing ``lat``/``lon`` are filled with None, ``user_history`` with
        "{}", ``gt_doc`` with -1 and ``ground_truth`` with "".

    The input frame is never modified.

    Raises:
        KeyError: if none of ``user_text``, ``question`` or ``text`` exists.
    """
    # Work on a copy: assigning default columns must not leak into the
    # caller's frame.
    out = df.copy()

    if "user_text" not in out.columns:
        for alias in ("question", "text"):
            if alias in out.columns:
                out = out.rename(columns={alias: "user_text"})
                break
        else:
            raise KeyError(
                "no 'user_text', 'question' or 'text' column found; "
                f"available columns: {list(out.columns)}"
            )

    if "ground_truth" not in out.columns and "answer" in out.columns:
        out = out.rename(columns={"answer": "ground_truth"})

    defaults = {
        "lat": None,
        "lon": None,
        "user_history": "{}",
        "gt_doc": -1,
        "ground_truth": "",
    }
    for col, default in defaults.items():
        if col not in out.columns:
            out[col] = default

    return out[["user_text","lat","lon","user_history","gt_doc","ground_truth"]]

merged = []
if 'DATA_PATHS' in globals() and len(DATA_PATHS) > 0:
    # dict.fromkeys drops repeated paths (e.g. after re-running a download
    # cell) while keeping first-seen order, so no file is merged twice.
    for p in dict.fromkeys(DATA_PATHS):
        if not os.path.exists(p):
            print("WARN missing:", p); continue
        try:
            if p.lower().endswith(".json"):
                tmp = pd.read_json(p, lines=False)
            else:
                tmp = pd.read_csv(p)
            tmp2 = normalize_df_for_pipeline(tmp)
            merged.append(tmp2)
            print("Loaded:", p, "rows:", len(tmp2))
        except Exception as e:
            # Best effort: one unreadable dataset must not stop the merge.
            print("Error reading", p, ":", e)
else:
    print("No DATA_PATHS found or empty — merging skipped.")

if merged:
    merged_df = pd.concat(merged, ignore_index=True)
    merged_df.to_csv(MERGED_DATA_PATH, index=False)
    print("Merged dataset saved to:", MERGED_DATA_PATH, "rows:", len(merged_df))
else:
    print("Merged dataset not created (no input datasets).")

# Combine KBs: copy every non-blank line of every KB file into one file,
# one paragraph per line.
with open(COMBINED_KB_PATH, "w", encoding="utf8") as out:
    if 'KB_DOCS_PATHS' in globals() and len(KB_DOCS_PATHS) > 0:
        for kb in KB_DOCS_PATHS:
            if os.path.exists(kb):
                # Context manager so each source file is closed after reading.
                with open(kb, "r", encoding="utf8") as kb_file:
                    for line in kb_file:
                        txt = line.strip()
                        if txt:
                            out.write(txt + "\n")
                print("Appended KB file:", kb)
            else:
                print("KB missing:", kb)
    else:
        print("No KB_DOCS_PATHS found or empty — created empty combined KB file.")
print("Combined KB path:", COMBINED_KB_PATH)


Loaded: /content/gthackathon/datasets/squad_sample_2k.csv rows: 2000
Error reading /content/gthackathon/datasets/multiwoz_sample.csv : No columns to parse from file
Merged dataset saved to: /content/gthackathon/datasets/merged_pipeline_dataset.csv rows: 2000
Appended KB file: /content/gthackathon/kb/demo_brand_kb.txt
Combined KB path: /content/gthackathon/kb/combined_kb.txt


In [None]:
# SAFE MERGE (improved Cell D) — merges DATA_PATHS -> MERGED_DATA_PATH and builds combined KB
import pandas as pd, os

# Dataset files smaller than this many bytes are skipped as empty.
MIN_FILE_BYTES = 20

def normalize_df_for_pipeline(df):
    """Return a copy of ``df`` reduced to the six pipeline columns.

    Column handling:
      * ``user_text`` is taken as-is, or renamed from ``question`` or ``text``
        (first one present).
      * ``ground_truth`` is taken as-is, or renamed from ``answer``.
      * Missing ``lat``/``lon`` are filled with None, ``user_history`` with
        "{}", ``gt_doc`` with -1 and ``ground_truth`` with "".

    The input frame is never modified.

    Raises:
        KeyError: if none of ``user_text``, ``question`` or ``text`` exists.
    """
    # Work on a copy: assigning default columns must not leak into the
    # caller's frame.
    out = df.copy()

    if "user_text" not in out.columns:
        for alias in ("question", "text"):
            if alias in out.columns:
                out = out.rename(columns={alias: "user_text"})
                break
        else:
            raise KeyError(
                "no 'user_text', 'question' or 'text' column found; "
                f"available columns: {list(out.columns)}"
            )

    if "ground_truth" not in out.columns and "answer" in out.columns:
        out = out.rename(columns={"answer": "ground_truth"})

    defaults = {
        "lat": None,
        "lon": None,
        "user_history": "{}",
        "gt_doc": -1,
        "ground_truth": "",
    }
    for col, default in defaults.items():
        if col not in out.columns:
            out[col] = default

    return out[["user_text","lat","lon","user_history","gt_doc","ground_truth"]]

# KB files at or below this size (bytes) are treated as empty and skipped.
MIN_KB_FILE_BYTES = 10

loaded = []    # (path, row_count) for every dataset merged
skipped = []   # (path, reason) for every dataset left out
merged = []    # normalized frames awaiting concatenation

print("DATA_PATHS:", DATA_PATHS)
# dict.fromkeys drops repeated paths while keeping first-seen order, so a
# re-run download cell cannot cause a dataset to be merged twice.
for p in dict.fromkeys(DATA_PATHS):
    if not os.path.exists(p):
        print("MISSING:", p); skipped.append((p,"missing")); continue
    size = os.path.getsize(p)
    if size < MIN_FILE_BYTES:
        print("TOO SMALL (skip):", p, "size:", size); skipped.append((p,"too_small")); continue
    try:
        if p.lower().endswith(".json"):
            tmp = pd.read_json(p, lines=False)
        else:
            tmp = pd.read_csv(p)
        tmp2 = normalize_df_for_pipeline(tmp)
        merged.append(tmp2)
        loaded.append((p, len(tmp2)))
        print("Loaded", p, "rows:", len(tmp2))
    except Exception as e:
        # Best effort: record the failure and keep merging the other files.
        print("ERROR reading", p, ":", e); skipped.append((p,str(e)))

if merged:
    merged_df = pd.concat(merged, ignore_index=True)
    merged_df.to_csv(MERGED_DATA_PATH, index=False)
    print("Merged dataset saved to:", MERGED_DATA_PATH, "rows:", len(merged_df))
else:
    print("No valid datasets to merge (merged list empty).")

# combine KBs: one non-blank paragraph per output line
combined_count = 0
with open(COMBINED_KB_PATH, "w", encoding="utf8") as out:
    if 'KB_DOCS_PATHS' in globals() and len(KB_DOCS_PATHS) > 0:
        for kb in KB_DOCS_PATHS:
            if os.path.exists(kb) and os.path.getsize(kb) > MIN_KB_FILE_BYTES:
                # Context manager so each source file is closed after reading.
                with open(kb, "r", encoding="utf8") as kb_file:
                    for line in kb_file:
                        t = line.strip()
                        if t:
                            out.write(t + "\n")
                            combined_count += 1
                print("Appended KB:", kb)
            else:
                print("KB missing/empty (skipped):", kb)
    else:
        print("No KB_DOCS_PATHS found; empty combined KB created.")
print("Combined KB:", COMBINED_KB_PATH, "paragraphs:", combined_count)


DATA_PATHS: ['/content/gthackathon/datasets/squad_sample_2k.csv', '/content/gthackathon/datasets/multiwoz_sample.csv']
Loaded /content/gthackathon/datasets/squad_sample_2k.csv rows: 2000
TOO SMALL (skip): /content/gthackathon/datasets/multiwoz_sample.csv size: 1
Merged dataset saved to: /content/gthackathon/datasets/merged_pipeline_dataset.csv rows: 2000
Appended KB: /content/gthackathon/kb/demo_brand_kb.txt
Combined KB: /content/gthackathon/kb/combined_kb.txt paragraphs: 4


In [None]:
# Cell E — Build embeddings & FAISS index from COMBINED_KB_PATH
import os, json
from sentence_transformers import SentenceTransformer
import numpy as np, faiss

# Reuse EMB_MODEL / ARTIFACTS_DIR if an earlier cell defined them; otherwise
# fall back to the defaults given here.
EMB_MODEL = globals().get("EMB_MODEL", "all-MiniLM-L6-v2")
ARTIFACTS_DIR = globals().get("ARTIFACTS_DIR", "/content/gthackathon/artifacts")
os.makedirs(ARTIFACTS_DIR, exist_ok=True)

print("Loading embedder:", EMB_MODEL)
embedder = SentenceTransformer(EMB_MODEL)

# Load KB paragraphs: one paragraph per non-blank line of the combined KB.
# NOTE: COMBINED_KB_PATH has no fallback here, so the config cell must have run.
if os.path.exists(COMBINED_KB_PATH) and os.path.getsize(COMBINED_KB_PATH) > 0:
    with open(COMBINED_KB_PATH, "r", encoding="utf8") as f:
        docs = [line.strip() for line in f if line.strip()]
else:
    # Same four paragraphs the demo-KB cell writes to disk.
    print("Combined KB missing or empty — using small fallback KB")
    docs = [
        "Starbucks near Marine Lines serves Hot Cocoa and offers a 10% coupon.",
        "Costa Coffee at MG Road offers Hot Chocolate and loyalty discounts.",
        "Cafe Nero at Central Mall sells pastries; buy-one-get-one on desserts this week.",
        "Central Mall hours: 9 AM to 10 PM. Free parking available."
    ]

print("Number of KB paragraphs:", len(docs))

# embed every paragraph; d is the embedding width (second axis of the result)
kb_embeddings = embedder.encode(docs, show_progress_bar=True, convert_to_numpy=True)
d = kb_embeddings.shape[1]
# normalize each row to unit length so inner product equals cosine similarity
# (no guard against an all-zero embedding row)
kb_norm = kb_embeddings / np.linalg.norm(kb_embeddings, axis=1, keepdims=True)
xb = kb_norm.astype('float32')

# Flat inner-product index; row i of the index corresponds to docs[i].
index = faiss.IndexFlatIP(d)
index.add(xb)
print("FAISS index built: vectors =", index.ntotal, "dim =", d)

# Save index and docs side by side so positions in the index can be mapped
# back to paragraph text later.
faiss.write_index(index, os.path.join(ARTIFACTS_DIR, "faiss_index.bin"))
with open(os.path.join(ARTIFACTS_DIR, "kb_docs.json"), "w", encoding="utf8") as f:
    json.dump(docs, f, ensure_ascii=False, indent=2)
print("Saved faiss_index.bin and kb_docs.json to", ARTIFACTS_DIR)
# expose for later cells (these names are already assigned at cell top level
# above, so this only makes the intent explicit)
globals()["docs"] = docs
globals()["index"] = index
globals()["embedder"] = embedder


Loading embedder: all-MiniLM-L6-v2


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Number of KB paragraphs: 4


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

FAISS index built: vectors = 4 dim = 384
Saved faiss_index.bin and kb_docs.json to /content/gthackathon/artifacts


In [None]:
# Cell F — Retrieval helpers & quick local test
import numpy as np, time

# Pick up shared state from earlier cells. TOP_K defaults to 5; docs, index
# and embedder become None when the index-building cell has not run, in which
# case the first retrieval call fails on the None object.
TOP_K = globals().get("TOP_K", 5)
docs = globals().get("docs")
index = globals().get("index")
embedder = globals().get("embedder")

def dense_retrieve_local(query, k=TOP_K):
    """Return up to ``k`` KB paragraphs ranked by cosine similarity to ``query``.

    Args:
        query: text to embed with the global ``embedder``.
        k: maximum number of hits; capped at the number of vectors stored in
           the global FAISS ``index``.

    Returns:
        A list of ``{"id", "text", "score"}`` dicts, best first, where ``id``
        is the position of the paragraph in the global ``docs`` list. The list
        is shorter than ``k`` when the index holds fewer vectors.
    """
    q_emb = embedder.encode([query], convert_to_numpy=True)
    # Small epsilon keeps the division finite for an all-zero embedding,
    # matching the guard used by dense_retrieve.
    qn = q_emb / (np.linalg.norm(q_emb, axis=1, keepdims=True) + 1e-9)

    # FAISS pads missing results with label -1; never request more neighbours
    # than the index contains.
    k = max(0, min(int(k), int(index.ntotal)))
    if k == 0:
        return []

    D, I = index.search(qn.astype('float32'), k)
    # Drop any remaining -1 padding: docs[-1] would silently return the last
    # paragraph as a bogus hit.
    return [
        {"id": int(i), "text": docs[int(i)], "score": float(s)}
        for i, s in zip(I[0], D[0])
        if i >= 0
    ]

# quick test
# Smoke test: time a single retrieval and list what came back.
q = "I'm cold and want a warm drink"
t0 = time.time()
res = dense_retrieve_local(q, k=TOP_K)
elapsed = time.time() - t0
print("Query:", q)
print("Retrieval time (s):", round(elapsed, 4))
for hit in res:
    # Text is clipped to 80 characters to keep the listing readable.
    print("  id={} score={:.4f} text={}".format(hit['id'], hit['score'], hit['text'][:80]))


Query: I'm cold and want a warm drink
Retrieval time (s): 0.0215
  id=0 score=0.3356 text=Starbucks near Marine Lines serves Hot Cocoa and offers a 10% coupon.
  id=1 score=0.2887 text=Costa Coffee at MG Road offers Hot Chocolate and loyalty discounts.
  id=2 score=0.2012 text=Cafe Nero at Central Mall sells pastries; buy-one-get-one on desserts this week.
  id=3 score=0.0313 text=Central Mall hours: 9 AM to 10 PM. Free parking available.
  id=-1 score=-340282346638528859811704183484516925440.0000 text=Central Mall hours: 9 AM to 10 PM. Free parking available.


In [None]:
# Cell G — RAG prompt builder + small run (works without OpenAI key; if you have key, it will call LLM)
import json, os

# Simple PII masking (safe)
# Ordered (regex, placeholder) pairs. They are applied in sequence, each one
# on the output of the previous substitution.
PII_PATTERNS = [
    (r'\b\d{10}\b', "<PHONE>"),
    (r'\b(?:\d[ -]*?){13,19}\b', "<CC_NUMBER>"),
    (r'[\w\.-]+@[\w\.-]+\.\w{2,4}', "<EMAIL>")
]
import re
def mask_pii(text):
    """Replace phone numbers, card numbers and e-mail addresses with placeholders.

    Returns ``(masked_text, audit)`` where ``audit`` lists one
    ``(matched_text, placeholder)`` tuple per replacement, in the order the
    replacements were made. A non-string input is returned unchanged together
    with an empty audit list.
    """
    if not isinstance(text, str):
        return text, []

    audit = []
    masked = text
    for pattern, placeholder in PII_PATTERNS:
        def _record(match, _placeholder=placeholder):
            # Log the raw value being hidden, then hand back its placeholder.
            audit.append((match.group(0), _placeholder))
            return _placeholder

        masked = re.sub(pattern, _record, masked)
    return masked, audit

def build_context(user_row, retrieved, variant="full"):
    """Assemble the CONTEXT section of the prompt for one user row.

    Args:
        user_row: mapping-like row offering ``.get`` for "user_text",
            "user_history" (a JSON string), "lat" and "lon".
        retrieved: list of hits, each a dict with "id" and "text".
        variant: ablation switch. "no_history" drops the history part,
            "no_location" drops the location part, and "no_retrieval" drops
            history, location and the retrieved paragraphs.

    Returns:
        ``(context_text, mask_log)`` where ``mask_log`` is the PII audit list
        produced by ``mask_pii`` for the user query.
    """
    user_text = user_row.get("user_text","")
    user_masked, mask_log = mask_pii(user_text)
    try:
        history = json.loads(user_row.get("user_history","{}"))
    except (TypeError, ValueError):
        # TypeError: the stored value is not a string (e.g. NaN read back from
        # a CSV). ValueError: malformed JSON. Anything else should surface
        # rather than be swallowed by a bare except.
        history = {}
    loc = {"lat": user_row.get("lat"), "lon": user_row.get("lon")}
    parts = [f"User Query: {user_masked}"]
    if variant not in ("no_history","no_retrieval"):
        parts.append(f"User History: {history}")
    if variant not in ("no_location","no_retrieval"):
        parts.append(f"User Location: {loc}")
    if variant != "no_retrieval":
        parts.append("Retrieved KB:\n" + "\n".join([f"({r['id']}) {r['text']}" for r in retrieved]))
    ctx = "\n\n".join(parts)
    return ctx, mask_log

def build_prompt(user_row, retrieved, variant="full"):
    """Wrap the context for ``user_row`` in the fixed instruction template.

    Returns ``(prompt_text, mask_log)``; ``mask_log`` is passed through
    unchanged from ``build_context``.
    """
    context_block, pii_audit = build_context(user_row, retrieved, variant=variant)
    # Fixed text placed before and after the per-row context.
    header = "You are a precise customer-support assistant.\n\nCONTEXT:\n"
    footer = (
        "\n\nTASK: Provide a short answer (1-2 sentences), one concrete ACTION (what to do next), and list referenced paragraph IDs.\n"
        "\nOUTPUT FORMAT:\n"
        "Short Answer:\n"
        "Action:\n"
        "References:\n"
    )
    return header + context_block + footer, pii_audit

# sample run over up to 5 rows of merged dataset (or synthetic fallback)
import pandas as pd
# Take up to five reproducible rows from the merged dataset, or fall back to
# a single synthetic row when it has not been built yet.
if not os.path.exists(MERGED_DATA_PATH):
    sample = pd.DataFrame([{"user_text":"I'm cold","lat":19.07,"lon":72.87,"user_history":"{}"}])
else:
    df_merge = pd.read_csv(MERGED_DATA_PATH)
    n_rows = min(5, len(df_merge))
    sample = df_merge.sample(n=n_rows, random_state=42).reset_index(drop=True)

runs = []
for i, row in sample.iterrows():
    query_text = row['user_text']
    retrieved = dense_retrieve_local(query_text, k=TOP_K)
    retrieved_ids = [hit['id'] for hit in retrieved]
    prompt, mask_log = build_prompt(row, retrieved, variant="full")
    runs.append({
        "id": i,
        "user_text": query_text,
        "retrieved_ids": retrieved_ids,
        "prompt": prompt,
        "mask_log": mask_log,
    })
    # Preview only: newlines are flattened so each sample stays on one line.
    print(f"\n--- Sample {i} ---")
    print("User:", query_text)
    print("Retrieved ids:", retrieved_ids)
    print("Prompt preview (first 300 chars):\n", prompt[:300].replace("\n", " "))
print("\nCompleted prompt builds for sample.")

# Keep the full prompts on disk for inspection.
sample_prompts_path = os.path.join(ARTIFACTS_DIR, "sample_prompts.csv")
pd.DataFrame(runs).to_csv(sample_prompts_path, index=False)
print("Saved sample_prompts.csv to", ARTIFACTS_DIR)



--- Sample 0 ---
User: What two things did Chopin advise Viardot on?
Retrieved ids: [2, 1, 0, 3, -1]
Prompt preview (first 300 chars):
 You are a precise customer-support assistant.  CONTEXT: User Query: What two things did Chopin advise Viardot on?  User History: {}  User Location: {'lat': nan, 'lon': nan}  Retrieved KB: (2) Cafe Nero at Central Mall sells pastries; buy-one-get-one on desserts this week. (1) Costa Coffee at MG Road

--- Sample 1 ---
User: Beyonce's group changed their name to Destiny's Child in what year?
Retrieved ids: [1, 3, 2, 0, -1]
Prompt preview (first 300 chars):
 You are a precise customer-support assistant.  CONTEXT: User Query: Beyonce's group changed their name to Destiny's Child in what year?  User History: {}  User Location: {'lat': nan, 'lon': nan}  Retrieved KB: (1) Costa Coffee at MG Road offers Hot Chocolate and loyalty discounts. (3) Central Mall h

--- Sample 2 ---
User: Besides sprectrum of activity and chemical structure, how can antibacterial an

In [None]:
# Cell H — Quick retrieval recall metrics on up to 200 sample rows (uses gt_doc in merged if available)
import pandas as pd, os
def recall_at_k(retrieved_lists, gt_list, k=3):
    """Fraction of labelled queries whose ground-truth id is in the top ``k``.

    ``retrieved_lists[i]`` is the ranked id list for query ``i`` and
    ``gt_list[i]`` its ground-truth id. Queries whose ground truth is None or
    negative are unlabelled and ignored. Returns None when no labelled query
    remains.
    """
    outcomes = [
        gt in ret[:k]
        for ret, gt in zip(retrieved_lists, gt_list)
        if gt is not None and not gt < 0
    ]
    if not outcomes:
        return None
    return sum(outcomes) / len(outcomes)

if not os.path.exists(MERGED_DATA_PATH):
    print("No merged dataset found; skipping recall computation.")
else:
    dfm = pd.read_csv(MERGED_DATA_PATH)
    # Fixed seed so the same (at most 200) rows are evaluated on every run.
    sample = dfm.sample(n=min(200, len(dfm)), random_state=42).reset_index(drop=True)

    retrieved_lists, gt_list = [], []
    for _, r in sample.iterrows():
        hits = dense_retrieve_local(r['user_text'], k=TOP_K)
        retrieved_lists.append([hit['id'] for hit in hits])
        gt_list.append(int(r.get('gt_doc', -1)))

    # recall_at_k returns None when no sampled row has a non-negative gt_doc.
    r1, r3 = (recall_at_k(retrieved_lists, gt_list, k=cutoff) for cutoff in (1, 3))
    metrics = {"recall@1": r1, "recall@3": r3, "sample_size": len(sample)}
    print("Quick retrieval metrics:", metrics)

    metrics_csv_path = os.path.join(ARTIFACTS_DIR, "quick_retrieval_metrics.csv")
    pd.DataFrame([metrics]).to_csv(metrics_csv_path, index=False)
    print("Saved quick_retrieval_metrics.csv to", ARTIFACTS_DIR)


Quick retrieval metrics: {'recall@1': None, 'recall@3': None, 'sample_size': 200}
Saved quick_retrieval_metrics.csv to /content/gthackathon/artifacts


In [None]:
# Cell I — PII masking tests and save results
import pandas as pd, os
# Each case pairs an input with the placeholder expected in the masked text.
tests = [
    {"text":"Call 9876543210", "expect":"<PHONE>"},
    {"text":"Email vin@example.com", "expect":"<EMAIL>"},
    {"text":"Card 4111 1111 1111 1111", "expect":"<CC_NUMBER>"}
]


def _run_pii_case(case):
    """Mask one test input and report whether the expected placeholder appears."""
    masked, log = mask_pii(case['text'])
    return {
        "orig": case['text'],
        "masked": masked,
        "pass": case['expect'] in masked,
        "log": str(log),
    }


rows = [_run_pii_case(case) for case in tests]
pii_df = pd.DataFrame(rows)
pii_csv_path = os.path.join(ARTIFACTS_DIR, "pii_test.csv")
pii_df.to_csv(pii_csv_path, index=False)
print("Saved PII test results to", pii_csv_path)
pii_df


Saved PII test results to /content/gthackathon/artifacts/pii_test.csv


Unnamed: 0,orig,masked,pass,log
0,Call 9876543210,Call <PHONE>,True,"[('9876543210', '<PHONE>')]"
1,Email vin@example.com,Email <EMAIL>,True,"[('vin@example.com', '<EMAIL>')]"
2,Card 4111 1111 1111 1111,Card <CC_NUMBER>,True,"[('4111 1111 1111 1111', '<CC_NUMBER>')]"


In [None]:
# Cell J — List artifacts and give next steps
import os
# Inventory of everything written to the artifacts folder, by file name.
print("Artifacts directory:", ARTIFACTS_DIR)
for artifact_name in sorted(os.listdir(ARTIFACTS_DIR)):
    artifact_size = os.path.getsize(os.path.join(ARTIFACTS_DIR, artifact_name))
    print("{}  — {} bytes".format(artifact_name, artifact_size))

print("\nNext recommended steps:")
for _step in (
    "1) If you have an OpenAI key, set OPENAI_API_KEY and re-run Cell G to generate llm outputs.",
    "2) Run the full ablation (longer) if you want detailed metrics; I can provide that cell next.",
    "3) Download artifacts and commit to your GitHub repo under submissions/vinayak_mahindrakar_groundtruth/artifacts/",
    "\nIf everything looks good, tell me and I'll provide the full ablation+comparison cell to generate judge-ready CSVs (comparison_sample.csv, ablation_results.csv).",
):
    print(_step)


Artifacts directory: /content/gthackathon/artifacts
faiss_index.bin  — 6189 bytes
kb_docs.json  — 300 bytes
pii_test.csv  — 251 bytes
quick_retrieval_metrics.csv  — 36 bytes
sample_prompts.csv  — 4247 bytes

Next recommended steps:
1) If you have an OpenAI key, set OPENAI_API_KEY and re-run Cell G to generate llm outputs.
2) Run the full ablation (longer) if you want detailed metrics; I can provide that cell next.
3) Download artifacts and commit to your GitHub repo under submissions/vinayak_mahindrakar_groundtruth/artifacts/

If everything looks good, tell me and I'll provide the full ablation+comparison cell to generate judge-ready CSVs (comparison_sample.csv, ablation_results.csv).


In [None]:
import os
from getpass import getpass

# SECURITY: never paste an API key into a notebook cell. Notebooks are shared
# and committed together with their source, so a literal key is effectively
# public. Any key that was ever stored in this cell must be treated as
# compromised and revoked in the provider dashboard.
#
# The key is taken from the environment when already set; otherwise it is
# requested through a hidden prompt so it is never saved in the notebook.
_api_key = os.environ.get("OPENAI_API_KEY") or getpass(
    "Enter OPENAI_API_KEY (input hidden; leave blank to skip LLM calls): "
).strip()

if _api_key:
    os.environ["OPENAI_API_KEY"] = _api_key
    print("API key set safely.")
else:
    print("OPENAI_API_KEY not set — LLM calls will be skipped.")
del _api_key


API key set safely.


In [None]:
# === MODEL COMPARISON CHARTS ===
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os

ART = ARTIFACTS_DIR

# Load metrics produced by the ablation cell. That cell sits further down the
# notebook, so on a fresh top-to-bottom run the file does not exist yet:
# skip the chart with a clear message instead of crashing in pd.read_csv.
metrics_fp = os.path.join(ART, "ablation_metrics.csv")

if os.path.exists(metrics_fp):
    metrics = pd.read_csv(metrics_fp).set_index("variant")

    # BAR CHART — Recall@1 & Recall@3, one pair of bars per variant
    plt.figure(figsize=(10,5))
    x = np.arange(len(metrics))
    w = 0.35  # bar width; the two bars sit side by side around each tick

    plt.bar(x - w/2, metrics["recall@1"], width=w, label="Recall@1")
    plt.bar(x + w/2, metrics["recall@3"], width=w, label="Recall@3")

    plt.xticks(x, metrics.index)
    plt.ylabel("Score")
    plt.title("Model Variant Comparison — Recall Scores")
    plt.legend()
    plt.grid(axis='y', alpha=0.3)

    out = os.path.join(ART, "model_variant_recall.png")
    plt.savefig(out, dpi=200)
    plt.close()

    print("Saved:", out)
else:
    metrics = None
    print("Ablation metrics not found at", metrics_fp,
          "— run the ablation cell first, then re-run this cell. Chart skipped.")

# TEXT SUMMARY (nothing is displayed when the chart was skipped)
metrics


In [None]:
# === EMBEDDING MODEL COMPARISON ===
from sentence_transformers import SentenceTransformer
import numpy as np
import matplotlib.pyplot as plt

# Candidate sentence-embedding models.
# NOTE(review): confirm every id exists on the model hub; the ablation config
# further down spells the third one "msmarco-distilbert-base-v5".
models = [
    "all-MiniLM-L6-v2",
    "all-mpnet-base-v2",
    "msmarco-distilbert-base-dot-v5"
]

# Small fixed probe set used for the comparison.
queries = [
    "I'm cold and need a hot drink",
    "Which stores near me have offers?",
    "My order is delayed",
    "Where can I find warm clothes?",
]

kb = docs

# model name -> mean cosine similarity over every (query, KB paragraph) pair
scores = {}

for m in models:
    print("Testing:", m)
    model = SentenceTransformer(m)
    q_emb = model.encode(queries)
    kb_emb = model.encode(kb)
    # Unit-normalize both sides so the matrix product yields cosine similarities.
    qb = q_emb / np.linalg.norm(q_emb, axis=1, keepdims=True)
    kb_norm = kb_emb / np.linalg.norm(kb_emb, axis=1, keepdims=True)
    sim = qb @ kb_norm.T
    avg = sim.mean()
    scores[m] = avg

plt.figure(figsize=(8,5))
plt.bar(list(scores.keys()), list(scores.values()))
plt.ylabel("Average Similarity Score")
plt.title("Embedding Model Comparison")
plt.xticks(rotation=20)

out2 = os.path.join(ART, "embedding_model_comparison.png")
plt.savefig(out2, dpi=200)
plt.close()

print("Saved:", out2)
scores


In [None]:
!pip install datasets -q
from datasets import load_dataset
import pandas as pd, os

OUT = "/content/gthackathon/datasets"
os.makedirs(OUT, exist_ok=True)
SAVE = os.path.join(OUT, "amazon_7k_stream.csv")

# Streaming mode: examples are pulled one at a time.
stream = load_dataset("amazon_polarity", split="train", streaming=True)

rows = []
# Pairing with a bounded range ends the loop after 7000 examples (or earlier
# if the stream runs out first).
for i, ex in zip(range(7000), stream):
    rows.append(dict(
        user_text=ex["content"],
        lat=None,
        lon=None,
        user_history="{}",
        gt_doc=-1,
        ground_truth="",
    ))

pd.DataFrame(rows).to_csv(SAVE, index=False)
print("Saved:", SAVE, "rows:", len(rows))


In [None]:

import os, json, numpy as np, pandas as pd, matplotlib.pyplot as plt, time
from tqdm import tqdm

# ---------------- Config ----------------
# Reuse the paths defined by the config cell when it has run; otherwise fall
# back to the same default locations so this cell can run on its own.
ARTIFACTS_DIR = globals().get("ARTIFACTS_DIR", "/content/gthackathon/artifacts")
MERGED_DATA_PATH = globals().get("MERGED_DATA_PATH", "/content/gthackathon/datasets/merged_pipeline_dataset.csv")
COMBINED_KB_PATH = globals().get("COMBINED_KB_PATH", "/content/gthackathon/kb/combined_kb.txt")
os.makedirs(ARTIFACTS_DIR, exist_ok=True)

# Experiment scale & retrieval settings
LIMIT = 500                     # change to 1000+ for bigger runs (watch runtime)
RETRIEVAL_MODES = ["hybrid"]    # choose any of: "dense","bm25","hybrid" or multiple e.g. ["dense","hybrid"]
# Prompt variants; the names match the `variant` argument of build_prompt.
VARIANTS = ["full","no_location","no_history","no_retrieval"]
# NOTE: functions defined in earlier cells with `k=TOP_K` keep the value that
# TOP_K had when they were defined; reassigning it here does not change them.
TOP_K = 5
CALL_LLM = bool(os.environ.get("OPENAI_API_KEY"))   # true if you set API key in runtime
# NOTE(review): the third id differs from the one in the embedding comparison
# cell ("msmarco-distilbert-base-dot-v5"); confirm which id actually exists.
EMB_MODELS_COMPARE = ["all-MiniLM-L6-v2", "all-mpnet-base-v2", "msmarco-distilbert-base-v5"]  # model compare (sampled)

print("Artifacts:", ARTIFACTS_DIR)
print("Merged dataset path:", MERGED_DATA_PATH)
print("Combined KB path:", COMBINED_KB_PATH)
print("CALL_LLM:", CALL_LLM, "LIMIT:", LIMIT, "RET_MODES:", RETRIEVAL_MODES)

# ---------------- Prechecks ----------------
# The merged dataset is mandatory (hard stop); a missing combined KB only
# produces a warning here.
if not os.path.exists(MERGED_DATA_PATH):
    raise FileNotFoundError(f"Merged dataset not found at {MERGED_DATA_PATH}. Run merge cell first.")
if not os.path.exists(COMBINED_KB_PATH):
    print("Warning: combined KB not found; using fallback KB (small).")

# Ensure embedder, index, docs exist (Cell E should have created these)
# Rebuild the embedder, KB paragraph list and FAISS index only when any of
# them is missing from the notebook namespace. The steps repeat the
# index-building cell (Cell E).
if "embedder" not in globals() or "index" not in globals() or "docs" not in globals():
    # attempt to build minimal embedder/index from COMBINED_KB_PATH now
    from sentence_transformers import SentenceTransformer
    import faiss
    print("embedder/index/docs not found in globals — building now (may take time)...")
    EMB_MODEL = "all-MiniLM-L6-v2"
    embedder = SentenceTransformer(EMB_MODEL)
    # One paragraph per non-blank line; fall back to the four demo paragraphs
    # when the combined KB file is missing or empty.
    if os.path.exists(COMBINED_KB_PATH) and os.path.getsize(COMBINED_KB_PATH) > 0:
        with open(COMBINED_KB_PATH, "r", encoding="utf8") as f:
            docs = [line.strip() for line in f if line.strip()]
    else:
        docs = [
            "Starbucks near Marine Lines serves Hot Cocoa and offers a 10% coupon.",
            "Costa Coffee at MG Road offers Hot Chocolate and loyalty discounts.",
            "Cafe Nero at Central Mall sells pastries; buy-one-get-one on desserts this week.",
            "Central Mall hours: 9 AM to 10 PM. Free parking available."
        ]
    kb_embeddings = embedder.encode(docs, show_progress_bar=True, convert_to_numpy=True)
    d = kb_embeddings.shape[1]
    # Unit-normalize rows so the inner-product index ranks by cosine similarity.
    kb_norm = kb_embeddings / np.linalg.norm(kb_embeddings, axis=1, keepdims=True)
    xb = kb_norm.astype('float32')
    index = faiss.IndexFlatIP(d)
    index.add(xb)
    # save
    faiss.write_index(index, os.path.join(ARTIFACTS_DIR, "faiss_index.bin"))
    import json
    with open(os.path.join(ARTIFACTS_DIR, "kb_docs.json"), "w", encoding="utf8") as f:
        json.dump(docs, f, ensure_ascii=False, indent=2)
    globals().update({"embedder": embedder, "index": index, "docs": docs})
    print("Built and saved FAISS index and kb_docs.json.")

# Build bm25 if not present
# Tokenization is a plain whitespace split with no lower-casing, so lexical
# matching is case-sensitive. On failure `bm25` is set to None, which
# bm25_retrieve treats as "no lexical results".
if "bm25" not in globals():
    try:
        from rank_bm25 import BM25Okapi
        tokenized = [d.split() for d in docs]
        bm25 = BM25Okapi(tokenized)
        globals()["bm25"] = bm25
        print("Built BM25 index.")
    except Exception as e:
        print("BM25 build failed:", e)
        globals()["bm25"] = None

# ---------------- Retrieval helpers ----------------
def dense_retrieve(query, k=TOP_K):
    """Return up to ``k`` KB paragraphs ranked by cosine similarity to ``query``.

    Args:
        query: text to embed with the global ``embedder``.
        k: maximum number of hits; capped at the number of vectors stored in
           the global FAISS ``index``.

    Returns:
        A list of ``{"id", "text", "score"}`` dicts, best first, where ``id``
        is the position of the paragraph in the global ``docs`` list. The list
        is shorter than ``k`` when the index holds fewer vectors.
    """
    q_emb = embedder.encode([query], convert_to_numpy=True)
    # Epsilon keeps the division finite for an all-zero embedding.
    qn = q_emb / (np.linalg.norm(q_emb, axis=1, keepdims=True) + 1e-9)

    # FAISS pads missing results with label -1; never request more neighbours
    # than the index contains.
    k = max(0, min(int(k), int(index.ntotal)))
    if k == 0:
        return []

    D, I = index.search(qn.astype('float32'), k)
    # Drop any remaining -1 padding: docs[-1] would silently return the last
    # paragraph as a bogus hit.
    return [
        {"id": int(i), "text": docs[int(i)], "score": float(s)}
        for i, s in zip(I[0], D[0])
        if i >= 0
    ]

def bm25_retrieve(query, k=TOP_K):
    """Return the `k` highest-scoring KB passages for `query` under BM25.

    The query is split on whitespace and scored by the module-level `bm25`
    object. When no BM25 index is available an empty list is returned.

    Returns:
        A list of dicts with keys "id", "text" and "score", best score first.
    """
    ranker = globals().get("bm25")
    if ranker is None:
        return []
    doc_scores = ranker.get_scores(query.split())
    # Ascending argsort reversed -> best first, then keep the first k.
    best_first = np.argsort(np.array(doc_scores))[::-1][:k]
    hits = []
    for pos in best_first:
        pos = int(pos)
        hits.append({"id": pos, "text": docs[pos], "score": float(doc_scores[pos])})
    return hits

def hybrid_retrieve(query, k=TOP_K, alpha=0.6):
    """Fuse dense and BM25 retrieval into a single ranked list.

    Each retriever is asked for `2*k` candidates. Because cosine similarities
    and BM25 scores live on different scales, each candidate list is min-max
    normalised to [0, 1] before being combined, so that `alpha` really is the
    relative weight of the dense signal.

    Args:
        query: Query text.
        k: Number of fused hits to return.
        alpha: Weight of the dense score; the BM25 score gets `1 - alpha`.

    Returns:
        A list of up to `k` dicts with keys "id", "text" and "score" (the
        fused score), best first.
    """
    def _normalised(hits):
        # Map one retriever's hits to {doc id: score rescaled to [0, 1]}.
        valid = [h for h in hits if h["id"] >= 0]  # drop -1 padding ids
        if not valid:
            return {}
        values = [h["score"] for h in valid]
        lo, hi = min(values), max(values)
        span = hi - lo
        if span <= 0:
            # All candidates tie: count them fully only if they matched at all.
            flat = 1.0 if hi > 0 else 0.0
            return {h["id"]: flat for h in valid}
        return {h["id"]: (h["score"] - lo) / span for h in valid}

    dense_scores = _normalised(dense_retrieve(query, k=k*2))
    lexical_scores = _normalised(bm25_retrieve(query, k=k*2))

    agg = {}
    for doc_id, score in dense_scores.items():
        agg[doc_id] = agg.get(doc_id, 0.0) + alpha * score
    for doc_id, score in lexical_scores.items():
        agg[doc_id] = agg.get(doc_id, 0.0) + (1 - alpha) * score

    top = sorted(agg.items(), key=lambda x: x[1], reverse=True)[:k]
    return [{"id": int(i), "text": docs[int(i)], "score": float(s)} for i, s in top]

# ---------------- PII masking ----------------
import re
# (compiled pattern, placeholder) pairs, applied in this order by mask_pii.
PII_PATTERNS = [
    # Exactly ten consecutive digits.
    (re.compile(r'\b\d{10}\b'), "<PHONE>"),
    # 13-19 digits, optionally separated by spaces or dashes.
    (re.compile(r'\b(?:\d[ -]*?){13,19}\b'), "<CC_NUMBER>"),
    # The final label is unbounded in length: capping it at 4 characters left
    # the tail of addresses such as "x@y.museum" unmasked ("<EMAIL>um").
    (re.compile(r'[\w\.-]+@[\w\.-]+\.\w{2,}'), "<EMAIL>")
]
def mask_pii(text):
    """Replace phone numbers, card numbers and e-mail addresses with placeholders.

    Args:
        text: Input text. Non-string values are returned unchanged.

    Returns:
        A `(masked_text, log)` tuple. `log` is a list of
        `(matched_value, placeholder)` pairs in the order they were found.
        Note that `log` holds the raw matched values, so it must be treated
        as sensitive wherever it is stored.
    """
    if not isinstance(text, str):
        return text, []
    masked = text
    log = []
    for pattern, placeholder in PII_PATTERNS:
        # Record the matches on the current text, then substitute them; later
        # patterns therefore only see what earlier ones left behind.
        for match in pattern.finditer(masked):
            log.append((match.group(0), placeholder))
        masked = pattern.sub(placeholder, masked)
    return masked, log

# ---------------- Prompt builder & LLM wrapper ----------------
def build_prompt(user_row, retrieved, variant="full"):
    """Assemble the LLM prompt for one sample under one ablation variant.

    Args:
        user_row: Mapping-like row; "user_text", "user_history", "lat" and
            "lon" are read via `.get`.
        retrieved: Iterable of hits with "id" and "text" keys.
        variant: "no_history" omits the history section, "no_location" omits
            the location section, "no_retrieval" omits history, location and
            the KB section; any other value includes all sections.

    Returns:
        `(prompt, mask_log)` where `mask_log` is the log returned by
        `mask_pii` for the user query.
    """
    masked_query, mask_log = mask_pii(user_row.get("user_text",""))
    history = user_row.get("user_history","{}")
    location = {"lat": user_row.get("lat"), "lon": user_row.get("lon")}

    with_retrieval = variant != "no_retrieval"
    with_history = with_retrieval and variant != "no_history"
    with_location = with_retrieval and variant != "no_location"

    sections = [f"User Query: {masked_query}"]
    if with_history:
        sections.append(f"User History: {history}")
    if with_location:
        sections.append(f"User Location: {location}")
    if with_retrieval:
        kb_lines = [f"({hit['id']}) {hit['text']}" for hit in retrieved]
        sections.append("Retrieved KB:\n" + "\n".join(kb_lines))

    context = "\n\n".join(sections)
    prompt = f"""You are a helpful, precise customer-support assistant.

CONTEXT:
{context}

TASK: Provide a short answer (1-2 sentences), one concrete ACTION (what to do next), and list referenced paragraph IDs.

OUTPUT FORMAT:
Short Answer:
Action:
References:
"""
    return prompt, mask_log

def call_llm_openai(prompt):
    """Send `prompt` as a single user message to the OpenAI chat API.

    The model name is taken from the global `LLM_MODEL` (default
    "gpt-4o-mini") and the API key from the `OPENAI_API_KEY` environment
    variable. Works with both the openai>=1.0 client interface and the legacy
    pre-1.0 module-level interface.

    Args:
        prompt: Full prompt text.

    Returns:
        The stripped text of the first completion choice, or None when
        `CALL_LLM` is falsy.
    """
    if not CALL_LLM:
        return None
    import openai
    api_key = os.environ.get("OPENAI_API_KEY")
    model = globals().get("LLM_MODEL","gpt-4o-mini")
    messages = [{"role":"user","content":prompt}]
    if hasattr(openai, "OpenAI"):
        # openai>=1.0: `openai.ChatCompletion` was removed in favour of a
        # client object exposing `chat.completions.create`.
        client = openai.OpenAI(api_key=api_key)
        resp = client.chat.completions.create(
            model=model, messages=messages,
            max_tokens=256, temperature=0.0
        )
        content = resp.choices[0].message.content
    else:
        # Legacy (<1.0) module-level API.
        openai.api_key = api_key
        resp = openai.ChatCompletion.create(
            model=model, messages=messages,
            max_tokens=256, temperature=0.0
        )
        content = resp['choices'][0]['message']['content']
    # The message content can be None; normalise that to an empty string.
    return (content or "").strip()

# ---------------- Run experiments ----------------
# Read the merged dataset CSV located at MERGED_DATA_PATH.
df = pd.read_csv(MERGED_DATA_PATH)
# Evaluate at most LIMIT rows (fewer if the dataset is smaller).
n = min(LIMIT, len(df))
print("Loaded merged dataset rows:", len(df), "running on sample size:", n)

# Collector for the per-(retrieval_mode, sample, variant) records built by the loop below.
results = []
# Wall-clock start, used for the total-time report at the end.
start = time.time()
# Fixed random_state so the same n rows are drawn every run; the index is reset to 0..n-1.
sample_df = df.sample(n=n, random_state=42).reset_index(drop=True)

# For every retrieval mode, sample row and prompt variant: retrieve, build the
# prompt, optionally call the LLM and record everything in `results`.
for retrieval_mode in RETRIEVAL_MODES:
    print(f"\n=== Running retrieval_mode: {retrieval_mode} ===")
    for i, row in tqdm(sample_df.iterrows(), total=len(sample_df)):
        user_text = row.get("user_text","")
        # Row-level values do not depend on the variant; compute them once.
        orig_id = int(row.get("id", -1)) if "id" in row else i
        gt_raw = row.get("gt_doc", -1)
        gt_doc = int(gt_raw) if not pd.isna(gt_raw) else -1
        # Retrieval depends only on (query, retrieval_mode), not on the prompt
        # variant, so it is run lazily at most once per row and then reused.
        row_hits = None
        for variant in VARIANTS:
            if variant == "no_retrieval":
                retrieved = []
            else:
                if row_hits is None:
                    if retrieval_mode == "dense":
                        row_hits = dense_retrieve(user_text, TOP_K)
                    elif retrieval_mode == "bm25":
                        row_hits = bm25_retrieve(user_text, TOP_K)
                    else:
                        # Any other mode name falls back to hybrid retrieval.
                        row_hits = hybrid_retrieve(user_text, TOP_K)
                retrieved = row_hits
            prompt, mask_log = build_prompt(row, retrieved, variant)
            llm_out = call_llm_openai(prompt) if CALL_LLM else None
            results.append({
                "orig_id": orig_id,
                "sample_index": int(i),
                "user_text": user_text,
                "retrieval_mode": retrieval_mode,
                "variant": variant,
                "retrieved_ids": [r["id"] for r in retrieved],
                "prompt": prompt,
                "mask_log": json.dumps(mask_log),
                "llm_out": llm_out,
                "gt_doc": gt_doc
            })

# Save ablation results
# One DataFrame row per record collected in `results`.
res_df = pd.DataFrame(results)
# Output location inside ARTIFACTS_DIR.
ablation_fp = os.path.join(ARTIFACTS_DIR, "ablation_results_full.csv")
# index=False: the positional index is not written as a column.
res_df.to_csv(ablation_fp, index=False)
print("\nSaved ablation results to:", ablation_fp)

# Build comparison_sample.csv (baseline vs full for first retrieval_mode)
# Column order is fixed up front so comp_df keeps its schema even when empty.
comp_columns = [
    "sample_index", "user_text", "baseline_prompt", "baseline_output",
    "rag_prompt", "rag_output", "retrieved_ids", "gt_doc",
]
comp = []
if len(res_df) > 0 and len(RETRIEVAL_MODES) > 0:
    # Filter to the first retrieval mode once, then walk the samples in their
    # original order instead of re-filtering the whole frame per sample.
    first_mode_rows = res_df[res_df['retrieval_mode']==RETRIEVAL_MODES[0]]
    for sid, block in first_mode_rows.groupby('sample_index', sort=False):
        base_rows = block[block['variant']=="no_retrieval"]
        full_rows = block[block['variant']=="full"]
        # A sample can only be compared when both variants were actually run.
        if base_rows.empty or full_rows.empty:
            continue
        base = base_rows.iloc[0]
        full = full_rows.iloc[0]
        comp.append({
            "sample_index": sid,
            "user_text": base['user_text'],
            "baseline_prompt": base['prompt'],
            "baseline_output": base['llm_out'],
            "rag_prompt": full['prompt'],
            "rag_output": full['llm_out'],
            "retrieved_ids": full['retrieved_ids'],
            "gt_doc": base['gt_doc']
        })
if not comp:
    print("No sample has both 'no_retrieval' and 'full' variants; comparison sample is empty.")
comp_df = pd.DataFrame(comp, columns=comp_columns)
comp_fp = os.path.join(ARTIFACTS_DIR, "comparison_sample.csv")
comp_df.to_csv(comp_fp, index=False)
print("Saved comparison sample to:", comp_fp)

# Compute recall metrics
def recall_at_k_list(ret_lists, gts, k=3):
    """Compute recall@k over paired retrieval lists and ground-truth ids.

    Args:
        ret_lists: Iterable of ranked id lists (one per query). A `None`
            entry is treated as "nothing retrieved".
        gts: Iterable of ground-truth ids aligned with `ret_lists`. Entries
            that are None, NaN, negative or not convertible to int are
            treated as "no ground truth" and excluded from the denominator.
        k: Cut-off rank.

    Returns:
        hits / total as a float, or None when no pair has a usable ground
        truth.
    """
    hits = 0
    total = 0
    for ret, gt in zip(ret_lists, gts):
        try:
            # `gt != gt` is true only for NaN, which int() cannot convert.
            if gt is None or gt != gt or int(gt) < 0:
                continue
        except (TypeError, ValueError):
            # Unparsable ground truth: skip rather than abort the metric.
            continue
        total += 1
        if ret is not None and gt in ret[:k]:
            hits += 1
    return hits/total if total > 0 else None

# Aggregate recall@1 / recall@3 per prompt variant (all retrieval modes pooled).
metrics = []
for variant in VARIANTS:
    variant_rows = res_df[res_df['variant']==variant]
    if variant_rows.empty:
        # Variant never ran: keep a placeholder row so the table stays complete.
        entry = {"variant": variant, "recall@1": None, "recall@3": None, "rows": 0}
    else:
        id_lists = variant_rows['retrieved_ids'].tolist()
        truths = variant_rows['gt_doc'].tolist()
        entry = {
            "variant": variant,
            "recall@1": recall_at_k_list(id_lists, truths, k=1),
            "recall@3": recall_at_k_list(id_lists, truths, k=3),
            "rows": len(variant_rows),
        }
    metrics.append(entry)

metrics_df = pd.DataFrame(metrics)
metrics_fp = os.path.join(ARTIFACTS_DIR, "ablation_metrics.csv")
metrics_df.to_csv(metrics_fp, index=False)
print("Saved ablation metrics to:", metrics_fp)
print(metrics_df)

# ---------------- Charts ----------------
# Grouped bar chart of recall@1 / recall@3 per ablation variant.
fig = None
try:
    # Recall bar chart
    md = metrics_df.set_index("variant")
    # Recall can be None (no rows / no ground truth). Coerce to float so such
    # entries become NaN and are simply not drawn instead of breaking the plot.
    recall_at_1 = pd.to_numeric(md["recall@1"], errors="coerce").to_numpy(dtype=float)
    recall_at_3 = pd.to_numeric(md["recall@3"], errors="coerce").to_numpy(dtype=float)
    fig, ax = plt.subplots(figsize=(8,4))
    x = np.arange(len(md))
    w = 0.35  # bar width; the two series sit side by side around each tick
    ax.bar(x - w/2, recall_at_1, width=w, label="Recall@1")
    ax.bar(x + w/2, recall_at_3, width=w, label="Recall@3")
    ax.set_xticks(x)
    ax.set_xticklabels(md.index)
    ax.set_ylabel("Score")
    ax.set_title("Recall by Ablation Variant")
    ax.set_ylim(0,1)
    ax.legend()
    ax.grid(axis='y', alpha=0.3)
    recall_png = os.path.join(ARTIFACTS_DIR, "model_variant_recall.png")
    fig.tight_layout()
    fig.savefig(recall_png, dpi=150)
    print("Saved recall chart to:", recall_png)
except Exception as e:
    # Plotting is best effort; report the problem and carry on.
    print("Failed to plot recall chart:", e)
finally:
    # Always release the figure, including when plotting or saving failed.
    if fig is not None:
        plt.close(fig)

# ---------------- Embedding model quick comparison (sampled) ----------------
# Compare embedding models by mean cosine similarity between a handful of
# sample queries and (at most 200) KB passages; best effort, errors are printed.
try:
    from sentence_transformers import SentenceTransformer

    sample_queries = list(comp_df['user_text'].head(20).values)
    if not sample_queries:
        sample_queries = ["I'm cold and want a hot drink", "Any offers near me?"]
    # sample KB to keep compute small
    kb_sample = docs[:200] if len(docs) > 200 else docs

    def _unit_rows(encoder, texts):
        # Encode texts and scale each row to (almost exactly) unit length.
        vectors = encoder.encode(texts, convert_to_numpy=True, show_progress_bar=False)
        return vectors / (np.linalg.norm(vectors, axis=1, keepdims=True) + 1e-9)

    emb_scores = {}
    for m in EMB_MODELS_COMPARE:
        print("Embedding model:", m)
        model = SentenceTransformer(m)
        qn = _unit_rows(model, sample_queries)
        kbn = _unit_rows(model, kb_sample)
        # Query-by-passage cosine matrix, averaged into a single number.
        sim = qn @ kbn.T
        emb_scores[m] = float(sim.mean())

    # plot
    model_names = list(emb_scores.keys())
    mean_sims = list(emb_scores.values())
    fig, ax = plt.subplots(figsize=(7,4))
    ax.bar(range(len(emb_scores)), mean_sims, tick_label=model_names)
    ax.set_title("Embedding models — avg query→KB similarity (sampled)")
    plt.setp(ax.get_xticklabels(), rotation=20)
    emb_png = os.path.join(ARTIFACTS_DIR, "embedding_model_comparison.png")
    fig.tight_layout()
    fig.savefig(emb_png, dpi=150)
    plt.close(fig)
    print("Saved embedding model comparison to:", emb_png)
    print("Embedding scores:", emb_scores)
except Exception as e:
    print("Embedding comparison failed:", e)

# Final listing: every entry of ARTIFACTS_DIR (sorted by name) with its size.
print("\nArtifacts produced in:", ARTIFACTS_DIR)
for artifact_name in sorted(os.listdir(ARTIFACTS_DIR)):
    artifact_path = os.path.join(ARTIFACTS_DIR, artifact_name)
    print("-", artifact_name, os.path.getsize(artifact_path), "bytes")

# Elapsed wall-clock time since `start` was taken before the experiment loop.
end = time.time()
elapsed_seconds = round(end-start, 1)
print("\nTotal time (s):", elapsed_seconds)
