In [21]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false" 
os.environ["OMP_NUM_THREADS"] = "1"            

In [1]:
print("Step 1: Installing and importing libraries")
!pip install -qU pymilvus milvus-lite ragas sentence-transformers transformers torch datasets pandas evaluate "pymilvus[milvus_lite]"
!pip install -qU langchain # For the text splitter

Step 1: Installing and importing libraries


In [2]:
import pandas as pd
import numpy as np
import transformers, torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from datasets import Dataset
from sentence_transformers import SentenceTransformer
import evaluate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pymilvus import MilvusClient, FieldSchema, CollectionSchema, DataType
import warnings
warnings.filterwarnings("ignore")

Run-1

In [3]:
print("\nSetting up configuration")
EMBEDDING_MODEL_NAME = 'all-mpnet-base-v2'
COLLECTION_NAME = 'rag_mini_768d'
DB_NAME = "rag_experiments_768d.db"


Setting up configuration


In [4]:
#Data Loading
print("\nLoading and preparing data")
passages = pd.read_parquet("hf://datasets/rag-datasets/rag-mini-wikipedia/data/passages.parquet/part.0.parquet")



Loading and preparing data


In [5]:
#Chunking the documents
text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=50)
chunks = []
for index, row in passages.iterrows():
    text_chunks = text_splitter.split_text(row['passage'])
    for i, chunk in enumerate(text_chunks):
        chunks.append({'id': f"{row.name}_{i}", 'passage': chunk})
chunk_df = pd.DataFrame(chunks)
print(f"Split {len(passages)} documents into {len(chunk_df)} chunks.")

Split 3200 documents into 4430 chunks.


In [None]:
#Embedding Generation
print("\nGenerating embeddings")
embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)
embeddings = embedding_model.encode(chunk_df['passage'].tolist(), show_progress_bar=True)
chunk_df['embedding'] = list(embeddings)
EMBEDDING_DIM = embeddings.shape[1]
print(f"Generated {len(embeddings)} embeddings of dimension {EMBEDDING_DIM}.")


🧠 Step 4: Generating embeddings


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/139 [00:00<?, ?it/s]

Generated 4430 embeddings of dimension 768.


Vector Database Setup

In [None]:
# print("\nSetting up Milvus vector database")
# # Defining Schema
# id_field = FieldSchema(name="id", dtype=DataType.VARCHAR, is_primary=True, max_length=256)
# passage_field = FieldSchema(name="passage", dtype=DataType.VARCHAR, max_length=65535)
# embedding_field = FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=EMBEDDING_DIM)
# schema = CollectionSchema(fields=[id_field, passage_field, embedding_field], description="RAG Wikipedia Collection")



Setting up Milvus vector database


Creating Client and Connection

In [None]:
# client = MilvusClient(DB_NAME)
# if client.has_collection(collection_name=COLLECTION_NAME):
#     print(f"Collection '{COLLECTION_NAME}' already exists. Dropping it.")
#     client.drop_collection(collection_name=COLLECTION_NAME)
# client.create_collection(collection_name=COLLECTION_NAME, schema=schema)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
# #Inserting Data into Milvus
# data_to_insert = chunk_df.to_dict(orient='records')
# res = client.insert(collection_name=COLLECTION_NAME, data=data_to_insert)
# print(f"Inserted {res['insert_count']} entities.")

Inserted 4430 entities.


In [None]:
# #Creating
# index_params = client.prepare_index_params()
# index_params.add_index(field_name="embedding", index_type="AUTOINDEX", metric_type="L2")
# client.create_index(collection_name=COLLECTION_NAME, index_params=index_params)
# client.load_collection(collection_name=COLLECTION_NAME)
# print("Milvus setup complete and collection is loaded.")


Milvus setup complete and collection is loaded.


In [None]:
print("\nConnecting to existing database")
client = MilvusClient(DB_NAME)
if not client.has_collection(collection_name=COLLECTION_NAME):
    raise ValueError(f"Collection '{COLLECTION_NAME}' not found. Please run the `build_database.ipynb` notebook first.")

client.load_collection(collection_name=COLLECTION_NAME)
print(f"Successfully connected to collection '{COLLECTION_NAME}'.")

RAG Pipeline Setup

In [11]:
print("\nSetting up RAG pipeline components")
#QA Dataset and Metric
queries_df = pd.read_parquet("hf://datasets/rag-datasets/rag-mini-wikipedia/data/test.parquet/part.0.parquet").head(25)
squad_metric = evaluate.load("squad")


Setting up RAG pipeline components


In [12]:
llm_model_name = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(llm_model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(llm_model_name)

In [13]:
#Prompt Strategies
def create_instruction_prompt(context, query): return f"Context:\n{context}\n\nQuestion:\n{query}\n\nAnswer:"
def create_cot_prompt(context, query): return f"Context:\n{context}\n\nQuestion:\n{query}\n\nAnswer: Let's think step by step."
def create_persona_prompt(context, query): return f"You are an expert encyclopedia. Answer the question based on the context.\n\nContext:\n{context}\n\nQuestion:\n{query}\n\nAnswer:"

In [14]:
#RAG Function
def generate_rag_response(query, prompt_strategy, top_k):
    query_embedding = embedding_model.encode(query)
    search_results = client.search(collection_name=COLLECTION_NAME, data=[query_embedding], limit=top_k, output_fields=["passage"])
    context = "\\n".join([hit['entity']['passage'] for hit in search_results[0]])
    prompt = prompt_strategy(context, query)
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids
    outputs = model.generate(input_ids, max_length=128)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [15]:
#Evaluation Function
print("\nExecuting evaluation loop")
results = []
top_k_options = [1, 3, 5]
prompt_strategies = {
    "Instruction": create_instruction_prompt,
    "Chain-of-Thought": create_cot_prompt,
    "Persona": create_persona_prompt,
}

for k in top_k_options:
    for name, func in prompt_strategies.items():
        print(f"--- Running: top_k={k}, prompt='{name}' ---")
        predictions_text = [generate_rag_response(q, func, k) for q in queries_df['question']]
        predictions = [{'prediction_text': p, 'id': str(i)} for i, p in enumerate(predictions_text)]
        references = [{'answers': {'text': [gt], 'answer_start': [0]}, 'id': str(i)} for i, gt in enumerate(queries_df['answer'])]
        metrics = squad_metric.compute(predictions=predictions, references=references)
        results.append({
            "Embedding Model": EMBEDDING_MODEL_NAME,
            "Embedding Size": EMBEDDING_DIM,
            "Prompt Strategy": name,
            "Top K": k,
            "Exact Match": metrics['exact_match'],
            "F1 Score": metrics['f1']
        })


Executing evaluation loop
--- Running: top_k=1, prompt='Instruction' ---
--- Running: top_k=1, prompt='Chain-of-Thought' ---
--- Running: top_k=1, prompt='Persona' ---
--- Running: top_k=3, prompt='Instruction' ---
--- Running: top_k=3, prompt='Chain-of-Thought' ---
--- Running: top_k=3, prompt='Persona' ---
--- Running: top_k=5, prompt='Instruction' ---


Token indices sequence length is longer than the specified maximum sequence length for this model (525 > 512). Running this sequence through the model will result in indexing errors


--- Running: top_k=5, prompt='Chain-of-Thought' ---
--- Running: top_k=5, prompt='Persona' ---


In [16]:
print("\nDisplaying results and cleaning up")
client.close()
results_df = pd.DataFrame(results)
print("\n\n--- FINAL RESULTS ---")
print(results_df.to_markdown(index=False))


Displaying results and cleaning up


--- FINAL RESULTS ---
| Embedding Model   |   Embedding Size | Prompt Strategy   |   Top K |   Exact Match |   F1 Score |
|:------------------|-----------------:|:------------------|--------:|--------------:|-----------:|
| all-mpnet-base-v2 |              768 | Instruction       |       1 |            64 |    69.7734 |
| all-mpnet-base-v2 |              768 | Chain-of-Thought  |       1 |             0 |    14.2056 |
| all-mpnet-base-v2 |              768 | Persona           |       1 |            64 |    69.0462 |
| all-mpnet-base-v2 |              768 | Instruction       |       3 |            68 |    77.0462 |
| all-mpnet-base-v2 |              768 | Chain-of-Thought  |       3 |             0 |    11.3789 |
| all-mpnet-base-v2 |              768 | Persona           |       3 |            72 |    81.0462 |
| all-mpnet-base-v2 |              768 | Instruction       |       5 |            72 |    80.7128 |
| all-mpnet-base-v2 |              768 |

In [18]:
results_df.to_csv("runs/Assignment2_768d_results.csv", index=False)
with open("runs/Assignment2_768d_results.md","w") as f:
    f.write(results_df.to_markdown(index=False))

In [20]:
def show_single_query_breakdown(index_to_test):

    client = MilvusClient(DB_NAME)
    client.load_collection(collection_name=COLLECTION_NAME)

    question = queries_df.iloc[index_to_test]['question']
    ground_truth = queries_df.iloc[index_to_test]['answer']
    top_k = 1
    
    def create_persona_prompt(context, query):
        return f"You are an expert encyclopedia. Answer the question based on the context.\n\nContext:\n{context}\n\nQuestion:\n{query}\n\nAnswer:"

    print(f"\nTHE QUERY\n'{question}'")
    print(f"\nGROUND TRUTH ANSWER:\n'{ground_truth}'")

    #Retrieval Step
    query_embedding = embedding_model.encode(question)
    search_results = client.search(
        collection_name=COLLECTION_NAME,
        data=[query_embedding],
        limit=top_k,
        output_fields=["passage"]
    )
    
    retrieved_chunks = [hit['entity']['passage'] for hit in search_results[0]]
    context = "\n".join(retrieved_chunks)
    
    print(f"\nRETRIEVED CONTEXT (Top {top_k} Chunks)")
    for i, chunk in enumerate(retrieved_chunks):
        print(f"\n[CHUNK {i+1}]:\n\"{chunk}\"")

    prompt = create_persona_prompt(context, question)
    
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids
    outputs = model.generate(input_ids, max_length=128)
    generated_answer = tokenizer.decode(outputs[0], skip_special_tokens=True)

    print(f"\nFINAL GENERATED ANSWER")
    print(f"'{generated_answer}'")


show_single_query_breakdown(index_to_test=0)


THE QUERY
'Was Abraham Lincoln the sixteenth President of the United States?'

GROUND TRUTH ANSWER:
'yes'

RETRIEVED CONTEXT (Top 1 Chunks)

[CHUNK 1]:
"Abraham Lincoln (February 12, 1809 â April 15, 1865) was the sixteenth President of the United States, serving from March 4, 1861 until his assassination. As an outspoken opponent of the expansion of slavery in the United States, "[I]n his short autobiography written for the 1860 presidential campaign, Lincoln would describe his protest in the Illinois legislature as one that 'briefly defined his position on the slavery question, and so far as it goes, it was then the same that it is now." This was in"

FINAL GENERATED ANSWER
'yes'
