In [1]:
# Load all required Libraries
import sys, subprocess
try:
    import sentence_transformers  # noqa: F401
except Exception:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "-r", "assignment2-rag/requirements.txt"])  # installs deps if missing

import pandas as pd
import numpy as np
import torch
import transformers
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from datasets import Dataset

from pymilvus import MilvusClient, FieldSchema, CollectionSchema, DataType

import evaluate as hf_evaluate
from ragas import evaluate as ragas_evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision,
)


  from .autonotebook import tqdm as notebook_tqdm
  from pkg_resources import DistributionNotFound, get_distribution

For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  from ragas.metrics._answer_correctness import AnswerCorrectness, answer_correctness

For example, replace imports like: `from langchain.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  from ragas.metrics._context_entities_recall import (


In [None]:

import os, requests, json, time

API_KEY = "SECRET_KEY"
os.environ["OPENAI_API_KEY"] = API_KEY



import os
from openai import OpenAI
client_openai = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

try:
    r = client_openai.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role":"user","content":"ping"}],
        max_tokens=1,
        temperature=0
    )
    print("OK:", r.choices[0].message.content)
except Exception as e:
    print("ERROR:", e)

OK: P


# Read Passages from the Datasets and Drop rows if they are NA or empty

In [4]:
passages = pd.read_parquet("hf://datasets/rag-datasets/rag-mini-wikipedia/data/passages.parquet/part.0.parquet")

# drop rows if they are NA or empty
passages = passages.dropna(subset=["passage"])
passages = passages[passages["passage"].astype(str).str.strip().ne("")]

print(passages.shape)
passages_reset = passages.reset_index()
passages.head()

(3200, 1)


Unnamed: 0_level_0,passage
id,Unnamed: 1_level_1
0,"Uruguay (official full name in ; pron. , Eas..."
1,"It is bordered by Brazil to the north, by Arge..."
2,Montevideo was founded by the Spanish in the e...
3,The economy is largely based in agriculture (m...
4,"According to Transparency International, Urugu..."


# Do EDA on the passage dataset
- You can try to find the maximum and minimum length of the passages before indexing (just a direction)

In [5]:
# Code for EDA
passages_series = passages["passage"].astype(str)
lengths = passages_series.str.len()
print({
    "num_passages": len(passages_series),
    "min_len": int(lengths.min()),
    "max_len": int(lengths.max()),
    "avg_len": float(lengths.mean()),
    "median_len": float(lengths.median()),
})
passages_series.sample(5, random_state=42).tolist()

{'num_passages': 3200, 'min_len': 1, 'max_len': 2515, 'avg_len': 389.848125, 'median_len': 299.0}


['In some beetles, the ability to fly has been lost. These include the ground beetles (family Carabidae) and some "true weevils" (family Curculionidae), but also some desert and cave-dwelling species of other families. Many of these species have the two elytra fused together, forming a solid shield over the abdomen. In a few families, both the ability to fly and the elytra have been lost, with the best known example being the glow-worms of the family Phengodidae, in which the females are larviform throughout their lives.',
 'The name "Qatar" may derive from the same Arabic root as qatura which means "to exude."  The word Qatura traces to the Arabic qatran meaning "tar" or "resin", which relates to the country\'s rich resources in petroleum and natural gas.  Adrian Room, Placenames of the World (1997) McFarland and Company.',
 "President Woodrow Wilson articulated what became known as the Fourteen Points before Congress on January 8, 1918.  The Points were the only war aims clearly expr

# Tokenize Text and Generate Embeddings using Sentence Transformers

In [6]:
from sentence_transformers import SentenceTransformer

EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"  # 384-dim
embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)

# Encode Text
passages_reset = passages.reset_index()  # ensure 'id' is a column
passage_texts = passages_reset["passage"].astype(str).tolist()
embeddings = embedding_model.encode(
    passage_texts,
    batch_size=64,
    convert_to_numpy=True,
    show_progress_bar=True,
    normalize_embeddings=True,
)
print(embeddings.shape)

Batches: 100%|██████████| 50/50 [00:11<00:00,  4.17it/s]

(3200, 384)





# Create Milvus Client and Insert your Embeddings to your DB
- Make sure you define a schema for your collection (Points will be deducted if you fail to define a proper schema with ids, passage text, embedding)

In [7]:
# Define every column of your schema

# Infer embedding dimension from computed embeddings
embedding_dim = int(embeddings.shape[1])

id_ = FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=False)
passage = FieldSchema(name="passage", dtype=DataType.VARCHAR, max_length=8192)
embedding = FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=embedding_dim)

In [8]:
schema = CollectionSchema(fields=[id_, passage, embedding], description="RAG Mini Wikipedia collection")

In [9]:
from pymilvus import MilvusClient
DB_PATH = "rag_wikipedia_mini_v3.db"  # new file to avoid lock
client = MilvusClient(DB_PATH)

# Create the Collection with Collection Name = "rag_mini". Make sure you define the schema variable while creating the collection
try:
    if client.has_collection("rag_mini"):
        client.drop_collection("rag_mini")
except Exception as e:
    print("Drop existing rag_mini failed:", e)

try:
    client.create_collection(
        collection_name="rag_mini",
        schema=schema,
        shard_num=1,
    )
    print("Collection created: rag_mini")
except Exception as e:
    print(f"Create collection result: {e}")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collection created: rag_mini


**Convert your Pandas Dataframe to a list of dictionaries**
- The Dictionary at least have 3 keys [id, passage, embedding]

In [10]:
# Convert Pandas DataFrame + embeddings to Milvus row dicts
ids = passages_reset["id"].astype(int).tolist()
rag_data = [
    {
        "id": int(idx),
        "passage": str(text),
        "embedding": emb.astype(float).tolist(),
    }
    for idx, text, emb in zip(ids, passage_texts, embeddings)
]
print(len(rag_data), "rows ready for insert")

3200 rows ready for insert


In [11]:
# Code to insert the data to your DB
try:
    res = client.insert(collection_name="rag_mini", data=rag_data)
    print("Inserted:", res.get("insert_count", res))
except Exception as e:
    print(f"Insert result: {e}")

Inserted: 3200


- Do a Sanity Check on your database 

**Do not delete the below line during your submission**

In [12]:
print("Entity count:", client.get_collection_stats("rag_mini")["row_count"])
print("Collection schema:", client.describe_collection("rag_mini"))

Entity count: 3200
Collection schema: {'collection_name': 'rag_mini', 'auto_id': False, 'num_shards': 0, 'description': 'RAG Mini Wikipedia collection', 'fields': [{'field_id': 100, 'name': 'id', 'description': '', 'type': <DataType.INT64: 5>, 'params': {}, 'is_primary': True}, {'field_id': 101, 'name': 'passage', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 8192}}, {'field_id': 102, 'name': 'embedding', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 384}}], 'aliases': [], 'collection_id': 0, 'consistency_level': 0, 'properties': {}, 'num_partitions': 0, 'enable_dynamic_field': False}


# Steps to Fetch Results
- Read the Question Dataset
- Clean the Question Dataset if necessary (Drop Questions with NaN etc.)
- Convert Each Query to a Vector Embedding (Use the same embedding model you used to embed your document)
- Try for a Single Question First
- Load Collection into Memory after creating Index for Search on your embedding field (This is an essential step before you can search in your db)
- Search and Fetch Top N Results

In [13]:
import pandas as pd

queries = pd.read_parquet("hf://datasets/rag-datasets/rag-mini-wikipedia/data/test.parquet/part.0.parquet")

queries = queries.dropna(subset=["question"])
queries = queries[queries["question"].astype(str).str.strip().ne("")]

print(queries.shape)

queries.head()

(918, 2)


Unnamed: 0_level_0,question,answer
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Was Abraham Lincoln the sixteenth President of...,yes
2,Did Lincoln sign the National Banking Act of 1...,yes
4,Did his mother die of pneumonia?,no
6,How many long was Lincoln's formal education?,18 months
8,When did Lincoln begin his political career?,1832


In [14]:
query = queries.iloc[0]["question"]

query_embedding = embedding_model.encode([str(query)], normalize_embeddings=True)

print(query_embedding.shape)

(1, 384)


#### Create Index on the embedding column on your DB

In [15]:
index_params = MilvusClient.prepare_index_params()

# Add an index on the embedding field
index_params.add_index(
    field_name="embedding",
    index_type="AUTOINDEX",  # or IVF_FLAT/HNSW if desired
    metric_type="IP",        # cosine works when normalized; use IP for speed
    params={}
)

# Create the index
try:
    client.create_index(collection_name="rag_mini", index_params=index_params)
    print("Index created on embedding")
except Exception as e:
    print(f"Index creation result: {e}")

# Load collection into memory (required for search)
try:
    client.load_collection(collection_name="rag_mini")
    print("Collection loaded into memory")
except Exception as e:
    print(f"Load collection result: {e}")

Index created on embedding
Collection loaded into memory


In [None]:
# # Build second collection with all-mpnet-base-v2
# from sentence_transformers import SentenceTransformer
# MPNET_MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"
# embedding_model_mpnet = SentenceTransformer(MPNET_MODEL_NAME)
# mpnet_embeddings = embedding_model_mpnet.encode(
#     passage_texts,
#     batch_size=64,
#     convert_to_numpy=True,
#     show_progress_bar=True,
#     normalize_embeddings=True,
# )
# embedding_dim_mpnet = int(mpnet_embeddings.shape[1])

# id2 = FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=False)
# passage2 = FieldSchema(name="passage", dtype=DataType.VARCHAR, max_length=8192)
# embedding2 = FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=embedding_dim_mpnet)
# schema_mpnet = CollectionSchema(fields=[id2, passage2, embedding2], description="RAG Mini Wikipedia mpnet")

# try:
#     client.create_collection(collection_name="rag_mini_mpnet", schema=schema_mpnet, shard_num=1)
#     print("Collection created: rag_mini_mpnet")
# except Exception as e:
#     print("Create collection (mpnet):", e)

# rag_data_mpnet = [
#     {"id": int(idx), "passage": str(text), "embedding": emb.astype(float).tolist()}
#     for idx, text, emb in zip(ids, passage_texts, mpnet_embeddings)
# ]
# res = client.insert(collection_name="rag_mini_mpnet", data=rag_data_mpnet)
# print("Inserted mpnet:", res.get("insert_count", res))

# index_params2 = MilvusClient.prepare_index_params()
# index_params2.add_index(field_name="embedding", index_type="AUTOINDEX", metric_type="IP", params={})
# try:
#     client.create_index(collection_name="rag_mini_mpnet", index_params=index_params2)
#     client.load_collection(collection_name="rag_mini_mpnet")
#     print("rag_mini_mpnet loaded")
# except Exception as e:
#     print("Index/load (mpnet):", e)

In [17]:
# Build second collection with all-mpnet-base-v2
from sentence_transformers import SentenceTransformer
MPNET_MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"
embedding_model_mpnet = SentenceTransformer(MPNET_MODEL_NAME)
mpnet_embeddings = embedding_model_mpnet.encode(
    passage_texts,
    batch_size=64,
    convert_to_numpy=True,
    show_progress_bar=True,
    normalize_embeddings=True,
)
embedding_dim_mpnet = int(mpnet_embeddings.shape[1])

id2 = FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=False)
passage2 = FieldSchema(name="passage", dtype=DataType.VARCHAR, max_length=8192)
embedding2 = FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=embedding_dim_mpnet)
schema_mpnet = CollectionSchema(fields=[id2, passage2, embedding2], description="RAG Mini Wikipedia mpnet")

try:
    if client.has_collection("rag_mini_mpnet"):
        client.drop_collection("rag_mini_mpnet")
except Exception as e:
    print("Drop existing rag_mini_mpnet failed:", e)

try:
    client.create_collection(collection_name="rag_mini_mpnet", schema=schema_mpnet, shard_num=1)
    print("Collection created: rag_mini_mpnet")
except Exception as e:
    print("Create collection (mpnet):", e)

rag_data_mpnet = [
    {"id": int(idx), "passage": str(text), "embedding": emb.astype(float).tolist()}
    for idx, text, emb in zip(ids, passage_texts, mpnet_embeddings)
]
res = client.insert(collection_name="rag_mini_mpnet", data=rag_data_mpnet)
print("Inserted mpnet:", res.get("insert_count", res))

index_params2 = MilvusClient.prepare_index_params()
index_params2.add_index(field_name="embedding", index_type="AUTOINDEX", metric_type="IP", params={})
try:
    client.create_index(collection_name="rag_mini_mpnet", index_params=index_params2)
    client.load_collection(collection_name="rag_mini_mpnet")
    print("rag_mini_mpnet loaded")
except Exception as e:
    print("Index/load (mpnet):", e)

Batches: 100%|██████████| 50/50 [00:51<00:00,  1.02s/it]


Collection created: rag_mini_mpnet
Inserted mpnet: 3200
rag_mini_mpnet loaded


In [18]:
# Search the db with your query embedding
search_res = client.search(
    collection_name="rag_mini",
    data=query_embedding.tolist(),
    anns_field="embedding",
    limit=10,
    search_params={"metric_type": "IP"}
)
print(search_res)
# Extract ids and distances for top-k
hits = search_res[0]
retrieved_ids = [hit["id"] for hit in hits]
retrieved_scores = [hit["distance"] for hit in hits]
print("Top IDs:", retrieved_ids[:5])

data: ["[{'id': 288, 'distance': 0.7095188498497009, 'entity': {}}, {'id': 278, 'distance': 0.5840359926223755, 'entity': {}}, {'id': 698, 'distance': 0.5568779110908508, 'entity': {}}, {'id': 2228, 'distance': 0.5566980838775635, 'entity': {}}, {'id': 319, 'distance': 0.5500738024711609, 'entity': {}}, {'id': 390, 'distance': 0.548395037651062, 'entity': {}}, {'id': 1813, 'distance': 0.5443781614303589, 'entity': {}}, {'id': 317, 'distance': 0.5384628176689148, 'entity': {}}, {'id': 392, 'distance': 0.538284182548523, 'entity': {}}, {'id': 289, 'distance': 0.530716061592102, 'entity': {}}]"] , extra_info: {'cost': 0}
Top IDs: [288, 278, 698, 2228, 319]


## Now get the Context 
- Initially use the first passage ONLY as your context
- In Later Experiments, you must try at least 2 different passage selection strategies (Top 3 / Top 5 / Top 10) and pass to your prompt

In [19]:
# Use first retrieved passage as context
first_id = int(retrieved_ids[0])
context = passages_reset.loc[passages_reset["id"] == first_id, "passage"].values[0]
print(context[:300], "...")

Young Abraham Lincoln ...


**Develop your Prompt**

In [20]:
system_prompt = f""

prompt = f"""{system_prompt} \n Context: {context}: \n Question: {query} """
print(prompt)

 
 Context: Young Abraham Lincoln: 
 Question: Was Abraham Lincoln the sixteenth President of the United States? 


# RAG Response for a Single Query

In [21]:
# Load the LLM Model you want to use (small seq2seq for offline use)
GEN_MODEL = "google/flan-t5-small"
tokenizer = AutoTokenizer.from_pretrained(GEN_MODEL)
llm = AutoModelForSeq2SeqLM.from_pretrained(GEN_MODEL)

def generate_answer(prompt_text: str, max_new_tokens: int = 128) -> str:
    inputs = tokenizer(
        prompt_text,
        return_tensors="pt",
        truncation=True,
        max_length=512
    )
    with torch.no_grad():
        output_ids = llm.generate(**inputs, max_new_tokens=max_new_tokens)
    return tokenizer.decode(output_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)



In [22]:
# Generate answer
answer = generate_answer(prompt)
print("Answer:", answer)

# Decode and extract answer.
# Already decoded above

Answer: no


# Generate Responses for all the Queries in the Dataset

In [23]:
from tqdm.auto import tqdm

# Batch inference over queries using top-1 context
clean_queries = queries.dropna(subset=["question"]).copy()
clean_queries["question"] = clean_queries["question"].astype(str)

pred_answers = []
retrieved_first_ids = []
for q in tqdm(clean_queries["question"].tolist()):
    q_emb = embedding_model.encode([q], normalize_embeddings=True)
    sres = client.search(
        collection_name="rag_mini",
        data=q_emb.tolist(),
        anns_field="embedding",
        limit=10,
        search_params={"metric_type": "IP"}
    )
    top_id = int(sres[0][0]["id"]) if len(sres) > 0 and len(sres[0]) > 0 else -1
    retrieved_first_ids.append(top_id)
    ctx = passages_reset.loc[passages_reset["id"] == top_id, "passage"].values
    ctx_text = ctx[0] if len(ctx) > 0 else ""
    sys_prompt = "You are a helpful assistant. Answer concisely using the provided context. If unknown, say you don't know."
    prompt_i = f"{sys_prompt}\nContext: {ctx_text}\nQuestion: {q}"
    pred = generate_answer(prompt_i)
    pred_answers.append(pred)

batch_df = clean_queries[["question", "answer"]].copy()
batch_df["pred_answer"] = pred_answers
batch_df["retrieved_id"] = retrieved_first_ids
batch_df.head()

 20%|█▉        | 181/918 [00:27<01:50,  6.70it/s]


KeyboardInterrupt: 

# Finding out the Basic QA Metrics (F1 score, EM score)

In [None]:
# # Simple prompting strategies comparison (top-1, same retriever)
# prompts = [
#     "Answer concisely using only the context. If unknown, say you don't know.\nContext: {ctx}\nQuestion: {q}",
#     "You are a strict QA system. Use only the context; if missing, say 'I don't know'.\nContext: {ctx}\nQuestion: {q}",
#     "Provide a short, direct answer based on the context only.\nContext: {ctx}\nQuestion: {q}",
# ]

# from sentence_transformers import SentenceTransformer
# tmp_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# def eval_prompt(template):
#     preds = []
#     refs = clean_queries["answer"].astype(str).tolist()
#     for q in clean_queries["question"].astype(str).tolist():
#         q_emb = tmp_model.encode([q], normalize_embeddings=True)
#         sres = client.search(collection_name="rag_mini", data=q_emb.tolist(), anns_field="embedding", limit=1, search_params={"metric_type": "IP"})
#         pid = int(sres[0][0]["id"])
#         ctx = passages_reset.loc[passages_reset["id"]==pid,"passage"].values[0]
#         pred = generate_answer(template.format(ctx=ctx, q=q))
#         preds.append(pred)
#     em_ = sum(exact_match_score(p, r) for p, r in zip(preds, refs)) / len(refs) * 100.0
#     f1_ = sum(f1_score(p, r) for p, r in zip(preds, refs)) / len(refs) * 100.0
#     return em_, f1_

# prompt_scores = [eval_prompt(t) for t in prompts]
# print(prompt_scores)

In [None]:
# Offline SQuAD-style EM/F1

import re, string, json

def normalize_answer(s: str) -> str:
    def remove_articles(text): return re.sub(r"\b(a|an|the)\b", " ", text, flags=re.IGNORECASE)
    def remove_punc(text): return text.translate(str.maketrans("", "", string.punctuation))
    def white_space_fix(text): return " ".join(text.split())
    return white_space_fix(remove_articles(remove_punc(s.lower())))

def exact_match_score(pred: str, truth: str) -> int:
    return int(normalize_answer(pred) == normalize_answer(truth))

def f1_score(pred: str, truth: str) -> float:
    p = normalize_answer(pred).split(); t = normalize_answer(truth).split()
    if not p and not t: return 1.0
    if not p or not t: return 0.0
    common = {}
    for tok in set(p):
        common[tok] = min(p.count(tok), t.count(tok))
    num_same = sum(common.values())
    if num_same == 0: return 0.0
    precision = num_same / len(p); recall = num_same / len(t)
    return 2 * precision * recall / (precision + recall)

preds = batch_df["pred_answer"].astype(str).tolist()
refs = batch_df["answer"].astype(str).tolist()

em = sum(exact_match_score(p, r) for p, r in zip(preds, refs)) / len(refs) * 100.0
f1 = sum(f1_score(p, r) for p, r in zip(preds, refs)) / len(refs) * 100.0

print({"exact_match": em, "f1": f1})

with open("naive_results.json", "w") as f:
    json.dump({"em": em, "f1": f1}, f, indent=2)

{'exact_match': 41.50326797385621, 'f1': 48.547767028825}


In [None]:
# 95% CI + failure analysis for naive batch_df (repeat for enhanced if needed)
import numpy as np
import pandas as pd

def bootstrap_ci(vals, n_boot=500, alpha=0.05, seed=42):
    rng = np.random.default_rng(seed)
    vals = np.array(vals, dtype=float)
    boots = [np.mean(vals[rng.integers(0, len(vals), len(vals))]) for _ in range(n_boot)]
    lo = float(np.percentile(boots, 100*alpha/2))
    hi = float(np.percentile(boots, 100*(1-alpha/2)))
    return lo, hi

per_em = [int(exact_match_score(p, r)) for p, r in zip(batch_df["pred_answer"], batch_df["answer"])]
per_f1 = [f1_score(p, r) for p, r in zip(batch_df["pred_answer"], batch_df["answer"])]
em_ci = bootstrap_ci(per_em)
f1_ci = bootstrap_ci(per_f1)
print({"em_mean": float(np.mean(per_em))*100, "em_ci": em_ci,
       "f1_mean": float(np.mean(per_f1))*100, "f1_ci": f1_ci})

def error_type(pred, ref):
    p, r = str(pred).strip().lower(), str(ref).strip().lower()
    if p == r: return "correct"
    if r in ["yes","no"] and p in ["yes","no"] and p != r: return "yn_mismatch"
    if any(ch.isdigit() for ch in r) and not any(ch.isdigit() for ch in p): return "number_missing"
    return "other"

fa_df = batch_df.copy()
fa_df["error_type"] = [error_type(p, r) for p, r in zip(fa_df["pred_answer"], fa_df["answer"])]
print(fa_df["error_type"].value_counts())
fa_df[fa_df["error_type"]!="correct"][["question","answer","pred_answer","error_type"]].head(10)

{'em_mean': 41.50326797385621, 'em_ci': (0.38344226579520696, 0.44937363834422656), 'f1_mean': 48.54776702882495, 'f1_ci': (0.45592722971165917, 0.5147832400573178)}
error_type
other             506
correct           309
yn_mismatch        77
number_missing     26
Name: count, dtype: int64


Unnamed: 0_level_0,question,answer,pred_answer,error_type
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,Was Abraham Lincoln the sixteenth President of...,yes,no,yn_mismatch
10,What did The Legal Tender Act of 1862 establish?,"the United States Note, the first paper curren...",United States Note,other
12,Who suggested Lincoln grow a beard?,11-year-old Grace Bedell,Grace Bedell,number_missing
14,When did the Gettysburg address argue that Ame...,1776,1789,other
24,Which county was Lincoln born in?,Hardin County,Springfield,other
26,When did Lincoln first serve as President?,"March 4, 1861",1861,other
28,Who assassinated Lincoln?,John Wilkes Booth,Abraham Lincoln,other
32,Who was the general in charge at the Battle of...,General McClellan,Ambrose Burnside,other
34,Why did Lincoln issue the Emancipation Proclam...,To free slaves,freed slaves in territories not under Union co...,other
41,Was Lincoln chosen as a presidential candidate...,Yes,Lincoln was eventually chosen as the Republica...,other


# Advanced Evaluation using RAGAs

In [None]:
# data = {
#     "question": batch_df["question"].astype(str).tolist(),                     # Question
#     "answer": batch_df["pred_answer"].astype(str).tolist(),                    # Generated Answer
#     "contexts": [ [ctx] for ctx in batch_df["retrieved_id"].apply(lambda rid: passages_reset.loc[passages_reset["id"]==rid, "passage"].values[0] if (passages_reset["id"]==rid).any() else "").tolist() ],
#     "ground_truths": [ [gt] for gt in batch_df["answer"].astype(str).tolist() ], # Reference Answer
# }

# # Convert dict to dataset
# dataset = Dataset.from_dict(data)

In [None]:
# # RAGAs single-pass minimal eval — combine 4 metrics with tight budgets
# import os, numpy as np
# from ragas import evaluate as ragas_evaluate
# from ragas.metrics import faithfulness, answer_relevancy, context_precision, context_recall
# from ragas.run_config import RunConfig
# from langchain_openai import ChatOpenAI

# # adjust here to fit your rate limit
# K = min(100, len(dataset["question"]))     # set 30 if you still hit 429
# MAX_CHARS = 600                        
# MAX_TOKENS = 128                          # set 128 if needed

# rng = np.random.default_rng(42)
# idx = rng.choice(len(dataset["question"]), size=K, replace=False)
# subset = dataset.select(idx.tolist())

# def _truncate(row, max_chars=MAX_CHARS):
#     ctxs = row.get("contexts", [])
#     row["contexts"] = [ctxs[0][:max_chars]] if ctxs else [""]
#     return row
# subset = subset.map(_truncate)

# llm = ChatOpenAI(model="gpt-4o-mini", temperature=0, max_tokens=MAX_TOKENS, api_key=os.environ["OPENAI_API_KEY"])
# rc = RunConfig(timeout=120)

# metrics = [faithfulness, answer_relevancy, context_precision, context_recall]
# res = ragas_evaluate(subset, metrics=metrics, llm=llm, run_config=rc)
# df = res.to_pandas()
# df.to_csv("ragas_naive.csv", index=False)
# print(f"Saved ragas_naive.csv with {len(df)} rows")
# df.head()

Map: 100%|██████████| 100/100 [00:00<00:00, 11400.04 examples/s]
Evaluating:  12%|█▏        | 47/400 [00:12<00:53,  6.66it/s]Failed to parse output. Returning None.
Evaluating:  22%|██▏       | 88/400 [00:19<00:53,  5.88it/s]Failed to parse output. Returning None.
Evaluating:  24%|██▍       | 95/400 [00:19<00:34,  8.94it/s]Failed to parse output. Returning None.
Evaluating:  24%|██▍       | 98/400 [00:19<00:30,  9.82it/s]Failed to parse output. Returning None.
Evaluating:  44%|████▍     | 178/400 [00:30<00:24,  9.06it/s]Failed to parse output. Returning None.
Evaluating:  70%|███████   | 282/400 [00:45<00:17,  6.84it/s]Failed to parse output. Returning None.
Evaluating:  76%|███████▋  | 306/400 [00:48<00:12,  7.40it/s]No statements were generated from the answer.
Evaluating: 100%|██████████| 400/400 [01:19<00:00,  5.04it/s]

Saved ragas_naive.csv with 100 rows





Unnamed: 0,question,answer,contexts,ground_truth,faithfulness,answer_relevancy,context_precision,context_recall
0,Which property did James Monroe sell in 1817?,Highland Plantation,[Monroe had racked up many debts during his ye...,Monroe Hill on the grounds of the University o...,0.0,0.90112,0.0,0.0
1,Was it a two-sentence description that complet...,Yes,"[Immediately after Lee's surrender, Grant had ...",yes,1.0,0.831117,1.0,1.0
2,What is an otter's den called?,a holt,[An otter's den is called a holt. Male otters...,Holt,1.0,1.0,1.0,1.0
3,Who dismantled partisan and sectional coalitio...,Wilson dismantled partisan and sectional coali...,[The longest section of Congressional Governme...,many congressmen,0.0,0.910533,0.0,0.0
4,Did James Monroe fight in the Continental Army?,yes,[* Monroe was (arguably) the last president to...,yes,0.0,0.957436,1.0,1.0


In [None]:
# Load Cross-Encoder for reranking
from sentence_transformers import CrossEncoder
RERANK_MODEL_NAME = "cross-encoder/ms-marco-MiniLM-L-6-v2"
reranker = CrossEncoder(RERANK_MODEL_NAME)


In [None]:
# Enhanced pipeline: mpnet embeddings + CrossEncoder reranking (top-3),
# low-budget generation and RAGAs evaluation
import os, json, numpy as np, pandas as pd
from typing import List, Tuple
from datasets import Dataset
from ragas import evaluate as ragas_evaluate
from ragas.metrics import faithfulness, answer_relevancy, context_precision, context_recall
from ragas.run_config import RunConfig
from langchain_openai import ChatOpenAI

# 1) Reranker helper with confidence

def rerank_with_confidence(query: str,
                           candidate_ids: List[int],
                           candidate_texts: List[str]) -> Tuple[List[float], List[float], List[Tuple[int, float]]]:
    pairs = [(query, t) for t in candidate_texts]
    scores = reranker.predict(pairs).tolist()
    # Softmax for normalized confidence over candidates
    scores_np = np.array(scores, dtype=float)
    exps = np.exp(scores_np - np.max(scores_np))
    probs = exps / (exps.sum() if exps.sum() > 0 else 1.0)
    id_prob_list = list(zip(candidate_ids, probs.astype(float).tolist()))
    return scores, probs.astype(float).tolist(), id_prob_list


In [None]:

# 2) Build enhanced answers on a modest subset to keep runtime small
TOP_K = 1
CAND_LIMIT = 30
MAX_NEW_TOKENS = 24
SAMPLE_N = min(100, len(clean_queries))  # meets the >=100 guideline, runs fast

# Ensure mpnet model is available (created earlier); fallback if needed
try:
    embedding_model_mpnet  # type: ignore  # noqa: F821
except Exception:
    from sentence_transformers import SentenceTransformer
    embedding_model_mpnet = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

pred_answers_enh, contexts_list_enh, confidences = [], [], []
subset_q = clean_queries.head(SAMPLE_N).copy()

for q in subset_q["question"].astype(str).tolist():
    q_emb = embedding_model_mpnet.encode([q], normalize_embeddings=True)
    sres = client.search(
        collection_name="rag_mini_mpnet",
        data=q_emb.tolist(),
        anns_field="embedding",
        limit=CAND_LIMIT,
        search_params={"metric_type": "IP"}
    )
    hits_local = sres[0] if len(sres) > 0 else []
    cand_ids = [int(h["id"]) for h in hits_local]
    cand_texts = [
        passages_reset.loc[passages_reset["id"] == cid, "passage"].values[0]
        for cid in cand_ids
    ] if len(cand_ids) > 0 else []

    if len(cand_ids) == 0:
        contexts = [""]
        conf_score = 0.0
    else:
        _, _, id_prob_list = rerank_with_confidence(q, cand_ids, cand_texts)
        sorted_by_prob = sorted(id_prob_list, key=lambda x: x[1], reverse=True)
        topk_ids = [cid for cid, _ in sorted_by_prob[:TOP_K]]
        contexts = [
            passages_reset.loc[passages_reset["id"] == cid, "passage"].values[0]
            for cid in topk_ids
        ]
        conf_score = float(np.mean([p for _, p in sorted_by_prob[:TOP_K]])) if len(sorted_by_prob) > 0 else 0.0

    ctx_text = "\n\n".join(contexts)
    sys_prompt = "You are a helpful assistant. Answer concisely using the provided context. If unknown, say you don't know."
    prompt_i = f"{sys_prompt}\nContext: {ctx_text}\nQuestion: {q}"
    pred_answers_enh.append(generate_answer(prompt_i, max_new_tokens=MAX_NEW_TOKENS))
    contexts_list_enh.append(contexts)
    confidences.append(conf_score)

enhanced_df = subset_q[["question", "answer"]].copy()
enhanced_df["pred_answer"] = pred_answers_enh
enhanced_df["contexts_list"] = contexts_list_enh
enhanced_df["confidence"] = confidences



E20251003 23:31:54.334995 1871613 milvus_proxy.cpp:210] [SERVER][Search][] Can not find rag_mini_mpnet's schema
RPC error: [search], <MilvusException: (code=100, message=Can not find rag_mini_mpnet's schema: collection not found)>, <Time:{'RPC start': '2025-10-03 23:31:54.332555', 'RPC error': '2025-10-03 23:31:54.336750'}>
Failed to search collection: rag_mini_mpnet


MilvusException: <MilvusException: (code=100, message=Can not find rag_mini_mpnet's schema: collection not found)>

In [None]:
# 3) EM/F1 on enhanced subset
try:
    preds_ = enhanced_df["pred_answer"].astype(str).tolist()
    refs_ = enhanced_df["answer"].astype(str).tolist()
    em_enh = sum(exact_match_score(p, r) for p, r in zip(preds_, refs_)) / len(refs_) * 100.0
    f1_enh = sum(f1_score(p, r) for p, r in zip(preds_, refs_)) / len(refs_) * 100.0
except Exception:
    # Fallback in case helper functions are not in scope
    import re, string
    def _normalize_answer(s: str) -> str:
        def remove_articles(text): return re.sub(r"\b(a|an|the)\b", " ", text, flags=re.IGNORECASE)
        def remove_punc(text): return text.translate(str.maketrans("", "", string.punctuation))
        def white_space_fix(text): return " ".join(text.split())
        return white_space_fix(remove_articles(remove_punc(s.lower())))
    def _em(a, b): return int(_normalize_answer(a) == _normalize_answer(b))
    def _f1(a, b):
        p = _normalize_answer(a).split(); t = _normalize_answer(b).split()
        if not p and not t: return 100.0
        if not p or not t: return 0.0
        overlap = 0
        for tok in set(p): overlap += min(p.count(tok), t.count(tok))
        if overlap == 0: return 0.0
        precision = overlap / len(p); recall = overlap / len(t)
        return 2 * precision * recall / (precision + recall) * 100.0
    em_enh = sum(_em(p, r) for p, r in zip(preds_, refs_)) / len(refs_) * 100.0
    f1_enh = sum(_f1(p, r) for p, r in zip(preds_, refs_)) / len(refs_)

with open("enhanced_results.json", "w") as f:
    json.dump({
        "embedding_model": "sentence-transformers/all-mpnet-base-v2",
        "reranker": RERANK_MODEL_NAME,
        "top_k": TOP_K,
        "num_eval": int(len(enhanced_df)),
        "em": float(em_enh),
        "f1": float(f1_enh)
    }, f, indent=2)
print({"enhanced_subset": len(enhanced_df), "em": em_enh, "f1": f1_enh})



In [None]:
# 4) Low-budget RAGAs for enhanced
K = min(60, len(enhanced_df))  # smaller than naive to avoid rate limits
if "OPENAI_API_KEY" in os.environ and os.environ["OPENAI_API_KEY"]:
    data_enh = {
        "question": enhanced_df["question"].astype(str).tolist(),
        "answer": enhanced_df["pred_answer"].astype(str).tolist(),
        "contexts": enhanced_df["contexts_list"].tolist(),
        "ground_truth": enhanced_df["answer"].astype(str).tolist(),
    }
    ds_enh = Dataset.from_dict(data_enh)
    rng = np.random.default_rng(123)
    idx = rng.choice(len(ds_enh), size=K, replace=False)
    subset_enh = ds_enh.select(idx.tolist())

    def _trim(row, max_chars=600):
        row["contexts"] = [c[:max_chars] for c in row.get("contexts", [])]
        return row
    subset_enh = subset_enh.map(_trim)

    llm = ChatOpenAI(model="gpt-4o-mini", temperature=0, max_tokens=96, api_key=os.environ["OPENAI_API_KEY"])
    rc = RunConfig(timeout=120)
    metrics = [faithfulness, answer_relevancy, context_precision, context_recall]

    res = ragas_evaluate(subset_enh, metrics=metrics, llm=llm, run_config=rc)
    df_enh = res.to_pandas()
    df_enh.to_csv("ragas_enhanced.csv", index=False)
    print(f"Saved ragas_enhanced.csv with {len(df_enh)} rows")
else:
    print("OPENAI_API_KEY not set; skipping RAGAs for enhanced.")

In [None]:
# Results summary — compare naive vs enhanced (saved to current directory)
import pandas as pd, numpy as np, json, os

naive = pd.read_csv("ragas_naive.csv")
enh   = pd.read_csv("ragas_enhanced.csv") if os.path.exists("ragas_enhanced.csv") else pd.DataFrame()
cols = ["faithfulness","answer_relevancy","context_precision","context_recall"]

def means(df):
    return {c: float(np.nanmean(df[c])) for c in cols if c in df.columns and len(df) > 0}

naive_mean = means(naive)
enh_mean   = means(enh)

cmp = pd.DataFrame([
    {"metric": c, "naive": naive_mean.get(c), "enhanced": enh_mean.get(c),
     "delta": (enh_mean.get(c, np.nan) - naive_mean.get(c, np.nan)) if enh_mean.get(c) is not None else None}
    for c in cols
])

cmp.to_csv("comparison_analysis.csv", index=False)
with open("enhanced_results_summary.json","w") as f:
    json.dump({"naive": naive_mean, "enhanced": enh_mean,
               "delta": {r["metric"]: r["delta"] for _, r in cmp.iterrows()}}, f, indent=2)

print("Saved comparison_analysis.csv and enhanced_results_summary.json")
cmp.head()

In [None]:
# Step 4: Parameter comparison (2 embeddings × top_k 3/5/10) with EM/F1
from sentence_transformers import SentenceTransformer as ST
import pandas as pd

EMBED_MODELS = [
    "sentence-transformers/all-MiniLM-L6-v2",  # 384
    "sentence-transformers/all-mpnet-base-v2", # 768
]
COLLECTION_BY_MODEL = {
    "sentence-transformers/all-MiniLM-L6-v2": "rag_mini",
    "sentence-transformers/all-mpnet-base-v2": "rag_mini_mpnet",
}
TOP_KS = [3, 5]
CAND_LIMIT = 30
MAX_NEW_TOKENS = 24

subset_for_grid = clean_queries.head(120).copy()  # keep it light but >=100

results_rows = []
for model_name in EMBED_MODELS:
    emb_model = ST(model_name)
    collection = COLLECTION_BY_MODEL[model_name]
    for q in subset_for_grid["question"].astype(str).tolist():
        q_emb = emb_model.encode([q], normalize_embeddings=True)
        sres = client.search(
            collection_name=collection,
            data=q_emb.tolist(),
            anns_field="embedding",
            limit=CAND_LIMIT,
            search_params={"metric_type": "IP"}
        )
        hits_local = sres[0] if len(sres) > 0 else []
        cand_ids = [int(h["id"]) for h in hits_local]
        cand_texts = [
            passages_reset.loc[passages_reset["id"]==cid, "passage"].values[0]
            for cid in cand_ids
        ] if len(cand_ids) > 0 else []

        # simple rerank to stabilize order across top_k
        if len(cand_ids) > 0:
            _, _, id_prob_list = rerank_with_confidence(q, cand_ids, cand_texts)
            sorted_ids = [cid for cid,_ in sorted(id_prob_list, key=lambda x:x[1], reverse=True)]
        else:
            sorted_ids = []

        for k in TOP_KS:
            topk_ids = sorted_ids[:k]
            ctxs = [passages_reset.loc[passages_reset["id"]==cid, "passage"].values[0] for cid in topk_ids] if len(topk_ids)>0 else [""]
            ctx_text = "\n\n".join(ctxs)
            sys_prompt = "You are a helpful assistant. Answer concisely using the provided context. If unknown, say you don't know."
            prompt_i = f"{sys_prompt}\nContext: {ctx_text}\nQuestion: {q}"
            pred = generate_answer(prompt_i, max_new_tokens=MAX_NEW_TOKENS)
            results_rows.append({
                "embedding_model": model_name,
                "top_k": k,
                "question": q,
                "pred": pred,
            })

# assemble and score
res_df = pd.DataFrame(results_rows)
merged = subset_for_grid[["question","answer"]].merge(res_df, on="question", how="right")

scores = []
for (m, k), g in merged.groupby(["embedding_model","top_k"]):
    preds = g["pred"].astype(str).tolist()
    refs = g["answer"].astype(str).tolist()
    em_ = sum(exact_match_score(p, r) for p, r in zip(preds, refs)) / len(refs) * 100.0
    f1_ = sum(f1_score(p, r) for p, r in zip(preds, refs)) / len(refs) * 100.0
    scores.append({"embedding_model": m, "top_k": k, "em": em_, "f1": f1_})

param_cmp = pd.DataFrame(scores).sort_values(["embedding_model","top_k"]).reset_index(drop=True)
param_cmp.to_csv("parameter_comparison.csv", index=False)
print("Saved parameter_comparison.csv")
param_cmp.head()
