In [18]:
#!/usr/bin/env python3
"""
assignment_retriever_eval.py

Usage:
    python assignment_retriever_eval.py --qa_csv Squad_v2_dataset.csv --ctx_csv Context.csv --out_dir reports --sample 1000 --do_reader
"""

import argparse
import json
import math
import os
from collections import defaultdict
from typing import List, Tuple

import numpy as np
import pandas as pd
from tqdm import tqdm
tqdm.pandas()
import nltk
nltk.download('punkt')

# --------------------
# Helpers / Metrics
# --------------------
def normalize_answer(s: str) -> str:
    """Normalize an answer string before EM / F1 comparison.

    The text is lower-cased, ASCII punctuation is removed, and every run of
    whitespace is collapsed to a single space.

    Args:
        s: Raw answer text. ``None`` yields ``""``; any other non-string value
            is converted with ``str()`` first.

    Returns:
        The normalized string without leading or trailing whitespace.
    """
    import re, string
    if s is None:
        return ""
    if not isinstance(s, str):
        s = str(s)
    s = s.lower()
    s = s.translate(str.maketrans('', '', string.punctuation))
    # Collapse whitespace only AFTER punctuation is gone; otherwise "a - b"
    # turns into "a  b" (two spaces) and no longer equals "a b".
    return re.sub(r'\s+', ' ', s).strip()

def f1_score(pred: str, golds: List[str]) -> float:
    """Best token-level F1 of ``pred`` against any of the gold answers.

    Tokens are the whitespace-separated pieces of ``normalize_answer(...)``.
    Overlap is counted as a multiset intersection, so a token repeated in the
    prediction is only credited as often as it occurs in the gold answer.

    If either side has no tokens, the score for that gold is 1.0 when both are
    empty and 0.0 otherwise. This keeps F1 in agreement with ``exact_match``
    for answers that normalize to the empty string.

    Args:
        pred: Predicted answer text.
        golds: Acceptable gold answers.

    Returns:
        The maximum F1 over ``golds`` (0.0 when ``golds`` is empty).
    """
    from collections import Counter

    pred_toks = normalize_answer(pred).split()
    pred_counts = Counter(pred_toks)
    best = 0.0
    for g in golds:
        gold_toks = normalize_answer(g).split()
        if not pred_toks or not gold_toks:
            # empty vs empty is a match; empty vs non-empty is a miss
            score = 1.0 if pred_toks == gold_toks else 0.0
        else:
            overlap = sum((pred_counts & Counter(gold_toks)).values())
            if overlap == 0:
                continue
            prec = overlap / len(pred_toks)
            rec = overlap / len(gold_toks)
            score = 2 * prec * rec / (prec + rec)
        if score > best:
            best = score
    return best

def exact_match(pred: str, golds: List[str]) -> float:
    """Return 1.0 if the normalized prediction equals any normalized gold answer, else 0.0."""
    normalized_pred = normalize_answer(pred)
    matched = any(normalized_pred == normalize_answer(gold) for gold in golds)
    return 1.0 if matched else 0.0

def recall_at_k_single(retrieved_ids: List[str], gold_id: str) -> int:
    """Return 1 when ``gold_id`` is among ``retrieved_ids``, otherwise 0."""
    hit = gold_id in retrieved_ids
    return int(hit)

def mrr_at_k_single(retrieved_ids: List[str], gold_id: str) -> float:
    """Reciprocal rank (1-based) of the first occurrence of ``gold_id``; 0.0 if absent."""
    first_rank = next(
        (rank for rank, cid in enumerate(retrieved_ids, start=1) if cid == gold_id),
        None,
    )
    if first_rank is None:
        return 0.0
    return 1.0 / first_rank

def dcg(rels: List[int]) -> float:
    """Discounted cumulative gain of a relevance list.

    The item at 1-based position ``i`` with relevance ``r`` contributes
    ``(2**r - 1) / log2(i + 1)``.
    """
    gains = [
        (2 ** rel - 1) / math.log2(position + 1)
        for position, rel in enumerate(rels, start=1)
    ]
    return sum(gains)

def ndcg_at_k_single(retrieved_ids: List[str], gold_id: str, k: int) -> float:
    """NDCG over the first ``k`` retrieved ids with binary relevance against ``gold_id``.

    The ideal ordering is obtained by sorting the observed relevance flags in
    descending order. Returns 0.0 when that ideal DCG is zero (no hit in the
    first ``k`` ids).
    """
    relevance = []
    for cid in retrieved_ids[:k]:
        relevance.append(1 if cid == gold_id else 0)
    ideal_dcg = dcg(sorted(relevance, reverse=True))
    return dcg(relevance) / ideal_dcg if ideal_dcg != 0 else 0.0

# --------------------
# Load Data
# --------------------
def parse_answers_field(x):
    """Normalize one raw ``answers`` cell to a non-empty ``list[str]``.

    Accepted inputs:
      * a list/tuple of answers, or a dict with a ``"text"`` entry;
      * a string holding such a structure, either as JSON or as a Python
        literal (single-quoted dict/list reprs);
      * any other string, which is treated as a single answer (stripped);
      * a missing value (``pd.isna``), which becomes ``[""]``.

    An empty answer collection is also returned as ``[""]`` so that a
    question without gold answers is represented by the empty answer rather
    than by no answers at all.

    Args:
        x: Raw cell value.

    Returns:
        List of answer strings with at least one element.
    """
    import ast

    def _from_structure(obj):
        # Returns list[str] for an answers-like structure, otherwise None.
        if isinstance(obj, dict) and "text" in obj:
            obj = obj["text"]
            if not isinstance(obj, (list, tuple)):
                obj = [obj]
        if isinstance(obj, (list, tuple)):
            return ["" if v is None else str(v) for v in obj] or [""]
        return None

    # Containers must be handled before pd.isna(): on a multi-element list
    # pd.isna returns an array whose truth value is ambiguous.
    direct = _from_structure(x)
    if direct is not None:
        return direct
    if pd.isna(x):
        return [""]
    if isinstance(x, str):
        text = x.strip()
        for loader in (json.loads, ast.literal_eval):
            try:
                parsed = _from_structure(loader(text))
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                # not parseable by this loader; try the next one / fall back
                continue
            if parsed is not None:
                return parsed
        # fallback: plain answer string
        return [text]
    return [str(x)]

# --------------------
# Retrievers: TF-IDF, BM25, Dense (FAISS), Hybrid (RRF)
# --------------------
def build_tfidf(contexts: List[str]):
    """Fit a TF-IDF vectorizer on ``contexts``.

    The vectorizer uses unigrams and bigrams, at most 100,000 features and the
    built-in English stop-word list.

    Returns:
        ``(vectorizer, matrix)`` where ``matrix`` is the result of
        ``fit_transform(contexts)``.
    """
    from sklearn.feature_extraction.text import TfidfVectorizer
    vectorizer = TfidfVectorizer(
        ngram_range=(1, 2),
        max_features=100_000,
        stop_words='english',
    )
    doc_matrix = vectorizer.fit_transform(contexts)
    return vectorizer, doc_matrix

def tfidf_retrieve(query: str, tfidf, X, context_ids, contexts, top_k=5):
    """Rank contexts for ``query`` by linear-kernel score against the TF-IDF matrix ``X``.

    Returns:
        Up to ``top_k`` tuples ``(context_id, context, score)``, highest score first.
    """
    from sklearn.metrics.pairwise import linear_kernel
    query_vec = tfidf.transform([query])
    sims = linear_kernel(query_vec, X).ravel()
    # ascending argsort reversed -> descending scores, then keep the head
    best_rows = np.argsort(sims)[::-1][:top_k]
    hits = []
    for row in best_rows:
        hits.append((context_ids[row], contexts[row], float(sims[row])))
    return hits

def build_bm25(tokenized_contexts: List[List[str]]):
    """Create a ``rank_bm25.BM25Okapi`` index from pre-tokenized contexts."""
    from rank_bm25 import BM25Okapi
    bm25_index = BM25Okapi(tokenized_contexts)
    return bm25_index

def bm25_retrieve(query: str, bm25, context_ids, contexts, top_k=5):
    """Rank contexts for ``query`` with a BM25 index.

    The query is lower-cased and tokenized with ``nltk.word_tokenize`` before
    being scored via ``bm25.get_scores``.

    Returns:
        Up to ``top_k`` tuples ``(context_id, context, score)``, highest score first.
    """
    import nltk
    query_tokens = nltk.word_tokenize(query.lower())
    bm25_scores = bm25.get_scores(query_tokens)
    # ascending argsort reversed -> descending scores, then keep the head
    best_rows = np.argsort(bm25_scores)[::-1][:top_k]
    hits = []
    for row in best_rows:
        hits.append((context_ids[row], contexts[row], float(bm25_scores[row])))
    return hits

def build_dense(contexts: List[str], embed_model_name="sentence-transformers/all-MiniLM-L6-v2"):
    """Embed ``contexts`` and index the vectors in a FAISS inner-product index.

    Embeddings are requested as numpy arrays with ``normalize_embeddings=True``
    and added to a ``faiss.IndexFlatIP`` as float32.

    Returns:
        ``(model, embeddings, index)``.
    """
    from sentence_transformers import SentenceTransformer
    encoder = SentenceTransformer(embed_model_name)
    vectors = encoder.encode(
        contexts,
        show_progress_bar=True,
        convert_to_numpy=True,
        normalize_embeddings=True,
    )
    import faiss
    ip_index = faiss.IndexFlatIP(vectors.shape[1])
    ip_index.add(vectors.astype('float32'))
    return encoder, vectors, ip_index

def dense_retrieve(query: str, model, index, context_ids, contexts, top_k=5):
    """Retrieve the nearest contexts for ``query`` from a FAISS index.

    Args:
        query: Question text.
        model: Encoder exposing ``encode(texts, convert_to_numpy=..., normalize_embeddings=...)``.
        index: Index exposing ``search(vectors, k) -> (scores, ids)``.
        context_ids: Context ids, positionally aligned with the indexed vectors.
        contexts: Context texts, positionally aligned with the indexed vectors.
        top_k: Number of neighbours to request.

    Returns:
        Tuples ``(context_id, context, score)`` in the order returned by the
        index. Fewer than ``top_k`` tuples are returned when the index holds
        fewer vectors than requested.
    """
    q_emb = model.encode([query], convert_to_numpy=True, normalize_embeddings=True).astype('float32')
    scores, idx = index.search(q_emb, top_k)
    results = []
    for score, i in zip(scores[0], idx[0]):
        # FAISS pads missing neighbours with label -1; indexing a Python list
        # with -1 would silently return the LAST context as a bogus hit.
        if i < 0:
            continue
        results.append((context_ids[i], contexts[i], float(score)))
    return results

def rrf_fuse(list_of_ranked: List[List[Tuple[str,str,float]]], k=5, rrf_k=60):
    """Combine several rankings with Reciprocal Rank Fusion.

    Each appearance of a context at 1-based rank ``r`` adds ``1 / (rrf_k + r)``
    to its fused score. The text reported for a context comes from the first
    list in which it appears.

    Returns:
        The ``k`` best ``(context_id, text, fused_score)`` tuples, highest
        fused score first.
    """
    fused_scores = {}
    first_entry = {}
    for ranking in list_of_ranked:
        position = 0
        for entry in ranking:
            position += 1
            doc_id = entry[0]
            fused_scores[doc_id] = fused_scores.get(doc_id, 0.0) + 1.0 / (rrf_k + position)
            first_entry.setdefault(doc_id, entry)
    ordered = sorted(
        ((doc_id, first_entry[doc_id][1], score) for doc_id, score in fused_scores.items()),
        key=lambda triple: triple[2],
        reverse=True,
    )
    return ordered[:k]

# --------------------
# Reranker (Cross-Encoder)
# --------------------
def build_reranker(model_name="cross-encoder/ms-marco-MiniLM-L-6-v2"):
    """Instantiate a ``sentence_transformers.CrossEncoder`` for ``model_name``."""
    from sentence_transformers import CrossEncoder
    cross_encoder = CrossEncoder(model_name)
    return cross_encoder

def cross_rerank(query: str, candidates: List[Tuple[str,str,float]], reranker, top_k=5):
    """Re-score ``candidates`` with a cross-encoder and keep the best ``top_k``.

    Every candidate text is paired with ``query`` and scored through
    ``reranker.predict``; the original retrieval score is replaced by the new
    one. An empty candidate list returns ``[]`` without calling the reranker.

    Returns:
        ``(context_id, text, reranker_score)`` tuples, highest score first.
    """
    query_text_pairs = [(query, text) for (_cid, text, _old) in candidates]
    if not query_text_pairs:
        return []
    new_scores = reranker.predict(query_text_pairs, show_progress_bar=False)
    rescored = []
    for (cid, text, _old), new_score in zip(candidates, new_scores):
        rescored.append((cid, text, float(new_score)))
    return sorted(rescored, key=lambda item: item[2], reverse=True)[:top_k]

# --------------------
# Extractive reader (optional)
# --------------------
def build_reader(model_name="deepset/roberta-base-squad2", device="cpu"):
    """Load a question-answering tokenizer and model and move the model to ``device``.

    Returns:
        ``(tokenizer, model, device)``.
    """
    from transformers import AutoTokenizer, AutoModelForQuestionAnswering
    import torch  # imported here as before; not referenced directly in this function
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    qa_model = AutoModelForQuestionAnswering.from_pretrained(model_name)
    qa_model.to(device)
    return tokenizer, qa_model, device

def extractive_answer(question: str, context: str, tok, mdl, device="cpu", max_len=512):
    """Extract an answer span for ``question`` from ``context``.

    The pair is encoded (truncated to ``max_len`` tokens), the model is run
    without gradients, and the span between the arg-max start position and the
    arg-max end position is decoded with special tokens skipped.

    Returns:
        The decoded, stripped answer, or ``""`` when the best end position
        lies before the best start position.
    """
    import torch
    encoded = tok.encode_plus(question, context, truncation=True, max_length=max_len, return_tensors='pt')
    batch = {}
    for name, tensor in encoded.items():
        batch[name] = tensor.to(device)
    with torch.no_grad():
        outputs = mdl(**batch)
    span_start = int(outputs.start_logits.argmax(dim=1)[0])
    span_end = int(outputs.end_logits.argmax(dim=1)[0])
    if span_end < span_start:
        return ""
    answer_ids = batch["input_ids"][0][span_start:span_end + 1].tolist()
    decoded = tok.decode(answer_ids, skip_special_tokens=True)
    return decoded.strip()

# --------------------
# Evaluate retriever
# --------------------
def evaluate_retriever(df: pd.DataFrame, retriever_fn, retriever_name: str, topk_list=(1,3,5,10,20), limit=None, reranker=None):
    """Compute recall / MRR / NDCG at several cut-offs for one retriever.

    For every evaluated row the retriever is asked for ``5 * max(topk_list)``
    candidates. With a reranker, that pool is re-ranked once down to
    ``max(topk_list)`` and each cut-off ``k`` uses the first ``k`` results.
    Without one, the first ``k`` retrieved candidates are used directly.

    Args:
        df: Frame with ``question`` and ``context_id`` columns.
        retriever_fn: Callable ``(question, top_k=...)`` returning a ranked
            list of ``(context_id, text, score)`` tuples.
        retriever_name: Label written to the ``retriever`` column.
        topk_list: Cut-offs to report.
        limit: Evaluate only the first ``limit`` rows; a falsy value means all rows.
        reranker: Optional callable ``(question, candidates, top_k=...)``
            returning a re-ranked candidate list.

    Returns:
        DataFrame with one row per cut-off and the columns ``retriever``,
        ``top_k``, ``recall``, ``mrr``, ``ndcg`` and ``count`` (rows evaluated).
    """
    topk_list = list(topk_list)
    columns = ["retriever", "top_k", "recall", "mrr", "ndcg", "count"]
    if not topk_list:
        return pd.DataFrame(columns=columns)

    metrics = {k: {"recall_sum": 0.0, "mrr_sum": 0.0, "ndcg_sum": 0.0, "count": 0} for k in topk_list}
    max_k = max(topk_list)
    # retrieve a larger list for potential rerank
    topn = max_k * 5
    n = min(limit, len(df)) if limit else len(df)

    for row in tqdm(df.head(n).itertuples(index=False), total=n):
        question = getattr(row, "question")
        # Ids are compared as strings so an integer id column in the QA frame
        # still matches retrievers that return string ids (and vice versa).
        gold = str(getattr(row, "context_id"))
        candidates = retriever_fn(question, top_k=topn)
        if reranker is not None:
            # Rerank ONCE at the largest cut-off and slice per k, instead of
            # re-scoring the whole pool for every k. Assumes the reranker's
            # top-k is a prefix of its top-max_k (true for sort-then-slice).
            ranked = reranker(question, candidates, top_k=max_k)
        else:
            ranked = candidates
        ranked_ids = [str(cid) for cid, _, _ in ranked]
        for k in topk_list:
            retrieved_ids = ranked_ids[:k]
            metrics[k]["recall_sum"] += recall_at_k_single(retrieved_ids, gold)
            metrics[k]["mrr_sum"] += mrr_at_k_single(retrieved_ids, gold)
            metrics[k]["ndcg_sum"] += ndcg_at_k_single(retrieved_ids, gold, k)
            metrics[k]["count"] += 1

    rows = []
    for k in topk_list:
        count = metrics[k]["count"]
        denom = count or 1  # avoid division by zero on an empty evaluation set
        rows.append({
            "retriever": retriever_name,
            "top_k": k,
            "recall": metrics[k]["recall_sum"] / denom,
            "mrr": metrics[k]["mrr_sum"] / denom,
            "ndcg": metrics[k]["ndcg_sum"] / denom,
            "count": count
        })
    return pd.DataFrame(rows, columns=columns)

# --------------------
# Main CLI
# --------------------
def main(args):
    """Run the retrieval (and optionally reader) evaluation end to end.

    Steps:
      1. Read the QA and context CSVs (both read without a header row) and
         join them on ``context_id``.
      2. Optionally down-sample the QA rows (``args.sample``, seed 42).
      3. Build TF-IDF, BM25, dense (FAISS) and hybrid (RRF of BM25 + dense)
         retrievers over all contexts.
      4. Evaluate each retriever, optionally with a cross-encoder reranker
         (``args.do_rerank``), and write ``retrieval_summary.csv``.
      5. With ``args.do_reader``: run the extractive reader on the first
         ``args.reader_sample`` rows using dense retrieval and write
         ``answer_eval_sample.csv``.

    Args:
        args: Parsed command-line namespace (see the ``__main__`` block).
    """
    os.makedirs(args.out_dir, exist_ok=True)
    print("Loading CSVs...")
    qa_columns = ["question", "answers", "context_id"]
    ctx_columns = ["context_id", "context"]
    qa_df = pd.read_csv(args.qa_csv,header=None,names=qa_columns,encoding="latin1")
    ctx_df = pd.read_csv(args.ctx_csv,header=None,names=ctx_columns,encoding="latin1")

    qa_df['answers_list'] = qa_df['answers'].apply(parse_answers_field)
    data = qa_df.merge(ctx_df, left_on='context_id', right_on='context_id', how='left')
    # Surface unmatched ids instead of ignoring them: such questions can never
    # be counted as a retrieval hit.
    n_missing = int(data['context'].isna().sum())
    if n_missing:
        print(f"Warning: {n_missing} QA rows reference a context_id that has no context in {args.ctx_csv}")

    # optionally sample
    eval_df = data[['question','context_id','answers_list']].copy()
    # Retrievers return ids as strings (see context_ids below); the gold ids
    # must use the same type or no retrieved id can ever equal the gold id.
    eval_df['context_id'] = eval_df['context_id'].astype(str)
    if args.sample and args.sample < len(eval_df):
        eval_df = eval_df.sample(n=args.sample, random_state=42).reset_index(drop=True)

    contexts = ctx_df['context'].astype(str).tolist()
    context_ids = ctx_df['context_id'].astype(str).tolist()

    # Build TF-IDF
    print("Building TF-IDF...")
    tfidf, X_tfidf = build_tfidf(contexts)
    tfidf_fn = lambda q, top_k: tfidf_retrieve(q, tfidf, X_tfidf, context_ids, contexts, top_k=top_k)

    # Build BM25
    print("Building BM25...")
    tokenized_contexts = [nltk.word_tokenize(c.lower()) for c in contexts]
    bm25 = build_bm25(tokenized_contexts)
    bm25_fn = lambda q, top_k: bm25_retrieve(q, bm25, context_ids, contexts, top_k=top_k)

    # Build Dense + FAISS
    print("Building dense embeddings + FAISS...")
    embed_model_name = args.embed_model
    dense_model, dense_embs, faiss_index = build_dense(contexts, embed_model_name)
    dense_fn = lambda q, top_k: dense_retrieve(q, dense_model, faiss_index, context_ids, contexts, top_k=top_k)

    # Hybrid (BM25 + Dense via RRF); each side contributes its top 50
    hybrid_fn = lambda q, top_k: rrf_fuse([bm25_fn(q, top_k=50), dense_fn(q, top_k=50)], k=top_k)

    # Optional reranker
    reranker = None
    rerank_fn = None
    if args.do_rerank:
        print("Loading cross-encoder reranker...")
        reranker = build_reranker(args.reranker_model)
        rerank_fn = lambda q, candidates, top_k: cross_rerank(q, candidates, reranker, top_k=top_k)

    # Evaluate retrievers
    topk_list = [1,3,5,10,20]
    print("Evaluating TF-IDF...")
    res_tfidf = evaluate_retriever(eval_df, tfidf_fn, "tfidf", topk_list=topk_list, limit=None, reranker=rerank_fn)
    print("Evaluating BM25...")
    res_bm25  = evaluate_retriever(eval_df, bm25_fn, "bm25", topk_list=topk_list, limit=None, reranker=rerank_fn)
    print("Evaluating Dense...")
    res_dense = evaluate_retriever(eval_df, dense_fn, "dense", topk_list=topk_list, limit=None, reranker=rerank_fn)
    print("Evaluating Hybrid...")
    res_hybrid = evaluate_retriever(eval_df, hybrid_fn, "hybrid", topk_list=topk_list, limit=None, reranker=rerank_fn)

    summary = pd.concat([res_tfidf, res_bm25, res_dense, res_hybrid], ignore_index=True)
    summary.to_csv(os.path.join(args.out_dir, "retrieval_summary.csv"), index=False)
    print("Saved retrieval_summary.csv")

    # Optional: answer extraction evaluation on small sample
    if args.do_reader:
        print("Building extractive reader...")
        import torch
        device = "cuda" if torch.cuda.is_available() else "cpu"
        tok, mdl, dev = build_reader(args.reader_model, device)
        sample_n = min(args.reader_sample, len(eval_df))
        rows = []
        for row in tqdm(eval_df.head(sample_n).itertuples(index=False), total=sample_n):
            q = row.question
            golds = row.answers_list
            candidates = dense_fn(q, top_k=args.ans_topk)
            if rerank_fn:
                candidates = rerank_fn(q, candidates, top_k=args.ans_topk)
            # All candidate texts are joined into one passage; the reader
            # truncates the encoded pair to its max length.
            concat_ctx = "\n\n".join([t for (_cid, t, _s) in candidates])
            # use the device the reader was actually placed on
            pred = extractive_answer(q, concat_ctx, tok, mdl, device=dev)
            em = exact_match(pred, golds)
            f1 = f1_score(pred, golds)
            rows.append({"question": q, "pred": pred, "em": em, "f1": f1})
        ans_df = pd.DataFrame(rows)
        ans_df.to_csv(os.path.join(args.out_dir, "answer_eval_sample.csv"), index=False)
        print("Saved answer_eval_sample.csv")
        if not ans_df.empty:
            print(f"Reader EM: {ans_df['em'].mean():.4f}  F1: {ans_df['f1'].mean():.4f}")

    print("Done.")

if __name__ == "__main__":
    # Command-line entry point. Every option has a default, so the script also
    # runs with no arguments.
    parser = argparse.ArgumentParser()
    parser.add_argument("--qa_csv", type=str, default="data/squad_v2_dataset.csv")
    parser.add_argument("--ctx_csv", type=str, default="data/context.csv")
    parser.add_argument("--out_dir", type=str, default="reports")
    # main() only samples when the value is truthy, so 0 (not "None", which
    # type=int cannot parse) is what selects the full dataset.
    parser.add_argument("--sample", type=int, default=1000, help="sample size for evaluation (0 for full dataset)")
    parser.add_argument("--embed_model", type=str, default="sentence-transformers/all-MiniLM-L6-v2")
    parser.add_argument("--do_rerank", action="store_true", help="use cross-encoder reranker")
    parser.add_argument("--reranker_model", type=str, default="cross-encoder/ms-marco-MiniLM-L-6-v2")
    parser.add_argument("--do_reader", action="store_true", help="run extractive reader on small sample")
    parser.add_argument("--reader_model", type=str, default="deepset/roberta-base-squad2")
    parser.add_argument("--reader_sample", type=int, default=200)
    parser.add_argument("--ans_topk", type=int, default=5)
    # parse_known_args: unrecognized arguments are returned separately and ignored
    args, unknown = parser.parse_known_args()
    main(args)


[nltk_data] Downloading package punkt to /home/sasidhar/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Loading CSVs...
Building TF-IDF...
Building BM25...
Building dense embeddings + FAISS...


Batches: 100%|██████████| 35/35 [00:37<00:00,  1.06s/it]


Evaluating TF-IDF...


100%|██████████| 1000/1000 [00:03<00:00, 262.73it/s]


Evaluating BM25...


100%|██████████| 1000/1000 [00:05<00:00, 194.83it/s]


Evaluating Dense...


100%|██████████| 1000/1000 [00:22<00:00, 45.08it/s]


Evaluating Hybrid...


100%|██████████| 1000/1000 [00:30<00:00, 33.10it/s]


Saved retrieval_summary.csv
Done.


In [3]:
%pwd

'/mnt/c/Users/sasidhar.chennup/Documents/Tiger-Training/NLP-GenAI-Classroom/NLP-Gen-AI-classroom/Assignment-2'

In [5]:
%chdir '/mnt/c/Users/sasidhar.chennup/Documents/Tiger-Training/NLP-GenAI-Classroom/NLP-Gen-AI-classroom/'

UsageError: Line magic function `%chdir` not found.


In [7]:
%cd '/mnt/c/Users/sasidhar.chennup/Documents/Tiger-Training/NLP-GenAI-Classroom/NLP-Gen-AI-classroom/'

/mnt/c/Users/sasidhar.chennup/Documents/Tiger-Training/NLP-GenAI-Classroom/NLP-Gen-AI-classroom


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [8]:
%pwd

'/mnt/c/Users/sasidhar.chennup/Documents/Tiger-Training/NLP-GenAI-Classroom/NLP-Gen-AI-classroom'