In [1]:
!pip install datasets
!pip install transformers
!pip install sentence-transformers
!pip install pymilvus
!pip install ragas
!pip install ydata-profiling
!pip install pymilvus[milvus_lite]
!pip install google-generativeai
!pip install evaluate datasets
!pip install openai langchain-openai



In [2]:
# Load all required Libraries
import pandas as pd
import transformers, torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, AutoModelForCausalLM
from datasets import Dataset

from pymilvus import MilvusClient, FieldSchema, CollectionSchema, DataType

from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision,
)


# Read Passages from the Datasets and Drop rows if they are NA or empty

In [3]:
passages = pd.read_parquet("hf://datasets/rag-datasets/rag-mini-wikipedia/data/passages.parquet/part.0.parquet")

print(passages.shape)
passages.head()

(3200, 1)


Unnamed: 0_level_0,passage
id,Unnamed: 1_level_1
0,"Uruguay (official full name in ; pron. , Eas..."
1,"It is bordered by Brazil to the north, by Arge..."
2,Montevideo was founded by the Spanish in the e...
3,The economy is largely based in agriculture (m...
4,"According to Transparency International, Urugu..."


# Do EDA on the passage dataset
- You can try to find the maximum and minimum length of the passages before indexing (just a direction)

In [4]:
from ydata_profiling import ProfileReport

In [5]:
# Code for EDA

profile = ProfileReport(passages, title="Passages EDA")
profile.to_notebook_iframe()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


  0%|          | 0/1 [00:00<?, ?it/s][A
100%|██████████| 1/1 [00:01<00:00,  1.18s/it]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

In [6]:
# Calculate passage lengths
passages['passage_length'] = passages['passage'].apply(len)

# Calculate average, maximum, and minimum passage lengths
avg_length = passages['passage_length'].mean()
max_length = passages['passage_length'].max()
min_length = passages['passage_length'].min()

print(f"Average passage length: {avg_length:.2f}")
print(f"Maximum passage length: {max_length}")
print(f"Minimum passage length: {min_length}")

Average passage length: 389.85
Maximum passage length: 2515
Minimum passage length: 1


In [7]:
from collections import Counter
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def get_ngrams(text, n):
    tokens = re.findall(r'\b\w+\b', text.lower())
    # Remove stop words and lemmatize
    processed_tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
    return [' '.join(processed_tokens[i:i+n]) for i in range(len(processed_tokens) - n + 1)]

all_words = []
all_bigrams = []
all_trigrams = []

for passage in passages['passage']:
    all_words.extend(get_ngrams(passage, 1))
    all_bigrams.extend(get_ngrams(passage, 2))
    all_trigrams.extend(get_ngrams(passage, 3))

# Find most frequent words, bigrams, and trigrams
most_common_words = Counter(all_words).most_common(10)
most_common_bigrams = Counter(all_bigrams).most_common(10)
most_common_trigrams = Counter(all_trigrams).most_common(10)

print("\nMost common words:", most_common_words)
print("\nMost common bigrams:", most_common_bigrams)
print("\nMost common trigrams:", most_common_trigrams)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!



Most common words: [('state', 535), ('president', 490), ('wolf', 465), ('also', 458), ('one', 449), ('year', 420), ('new', 417), ('first', 401), ('roosevelt', 366), ('war', 361)]

Most common bigrams: [('united state', 187), ('new york', 160), ('polar bear', 122), ('white house', 70), ('theodore roosevelt', 65), ('nikola tesla', 63), ('world war', 47), ('vice president', 45), ('john adam', 42), ('prime minister', 42)]

Most common trigrams: [('president united state', 27), ('et al 2006', 24), ('new york city', 21), ('retrieved december 31', 21), ('december 31 2006', 21), ('new york time', 19), ('world war ii', 17), ('baker et al', 15), ('gerald r ford', 15), ('justice supreme court', 13)]


# Tokenize Text and Generate Embeddings using Sentence Transformers

In [8]:
from sentence_transformers import SentenceTransformer

embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Encode Text
embeddings = embedding_model.encode(passages['passage'].tolist(), convert_to_tensor=True)

print(embeddings.shape)

torch.Size([3200, 384])


# Create Milvus Client and Insert your Embeddings to your DB
- Make sure you define a schema for your collection (Points will be deducted if you fail to define a proper schema with ids, passage text, embedding)

In [9]:
# Define every column of your schema

id_ = FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=False)
passage = FieldSchema(name="passage", dtype=DataType.VARCHAR, max_length=65535)
embedding = FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=embeddings.shape[1])

In [10]:
schema = CollectionSchema(fields=[id_, passage, embedding], description="Wikipedia Passages Collection")

In [11]:
client = MilvusClient("rag_wikipedia_mini.db")

# Create the Collection with Collection Name = "rag_mini". Make sure you define the schema variable while creating the collection
client.create_collection("rag_mini", schema=schema)

**Convert your Pandas Dataframe to a list of dictionaries**
- The Dictionary at least have 3 keys [id, passage, embedding]

In [12]:
# Convert your Pandas Dataframe to a list of dictionaries
# The Dictionary at least have 3 keys [id, passage, embedding]

rag_data = []
# Iterate through the valid indices of the embeddings tensor
for index in range(embeddings.shape[0]):
    rag_data.append({
        "id": int(index),  # Ensure ID is integer
        "passage": passages.iloc[index]['passage'],
        "embedding": embeddings[index].tolist() # Convert tensor to list
    })

In [13]:
# Code to insert the data to your DB
res = client.insert(collection_name="rag_mini", data=rag_data)

print(res)

{'insert_count': 3200, 'ids': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215,

- Do a Sanity Check on your database

**Do not delete the below line during your submission**

In [14]:
print("Entity count:", client.get_collection_stats("rag_mini")["row_count"])
print("Collection schema:", client.describe_collection("rag_mini"))

Entity count: 6400
Collection schema: {'collection_name': 'rag_mini', 'auto_id': False, 'num_shards': 0, 'description': 'Wikipedia Passages Collection', 'fields': [{'field_id': 100, 'name': 'id', 'description': '', 'type': <DataType.INT64: 5>, 'params': {}, 'is_primary': True}, {'field_id': 101, 'name': 'passage', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 65535}}, {'field_id': 102, 'name': 'embedding', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 384}}], 'functions': [], 'aliases': [], 'collection_id': 0, 'consistency_level': 0, 'properties': {}, 'num_partitions': 0, 'enable_dynamic_field': False}


# Query Expansion with Genrated Answers (HyDE)

In [15]:
def generate_hypothetical_answer(query, model):
    """Generates a hypothetical answer to the query using a generative model."""
    hyde_prompt = f"""Please write a concise and hypothetical answer to the following question:\n\n{query}"""
    response = model.generate_content(hyde_prompt)
    return response.text

# Reranking

In [16]:
from sentence_transformers import CrossEncoder
from google.colab import userdata

hf_token = userdata.get('HF_TOKEN')

reranker = CrossEncoder('cross-encoder/ms-marco-MiniLM-L6-v2', token=hf_token)

# Steps to Fetch Results
- Read the Question Dataset
- Clean the Question Dataset if necessary (Drop Questions with NaN etc.)
- Convert Each Query to a Vector Embedding (Use the same embedding model you used to embed your document)
- Try for a Single Question First
- Load Collection into Memory after creating Index for Search on your embedding field (This is an essential step before you can search in your db)
- Search and Fetch Top N Results

In [17]:
import pandas as pd

queries = pd.read_parquet("hf://datasets/rag-datasets/rag-mini-wikipedia/data/test.parquet/part.0.parquet")
queries

Unnamed: 0_level_0,question,answer
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Was Abraham Lincoln the sixteenth President of...,yes
2,Did Lincoln sign the National Banking Act of 1...,yes
4,Did his mother die of pneumonia?,no
6,How many long was Lincoln's formal education?,18 months
8,When did Lincoln begin his political career?,1832
...,...,...
1710,Was Wilson president of the American Political...,Yes
1711,Did he not cast his ballot for John M. Palmer ...,Yes
1712,Did Wilson not spend 1914 through the beginnin...,Yes
1713,"Was Wilson , a staunch opponent of antisemitis...",Yes


#### Create Index on the embedding column on your DB

In [18]:
index_params = MilvusClient.prepare_index_params()

# Add an index on the embedding field
index_params.add_index(
    field_name="embedding",
    index_type="AUTOINDEX",
    metric_type="COSINE"
)

# Create the index
try:
    client.create_index(collection_name="rag_mini", index_params=index_params)
except Exception as e:
    print(f"Index creation result: {e}")

# Load collection into memory (required for search)
client.load_collection("rag_mini")

In [19]:
import google.generativeai as genai
from google.colab import userdata

GOOGLE_API_KEY=userdata.get('GOOGLE_API_KEY')
genai.configure(api_key=GOOGLE_API_KEY)

In [20]:
# Initialize the Gemini API
gemini_model = genai.GenerativeModel('gemini-2.5-flash-lite')

In [21]:
# Let's use the first query from the dataset for demonstration
query = queries['question'].iloc[0]
ground_truth = queries['answer'].iloc[0]

print(f"Original Query: {query}")
print(f"Ground Truth: {ground_truth}")

# Generate a hypothetical answer for the query using the Gemini model (HyDE)
hypothetical_answer = generate_hypothetical_answer(query, gemini_model)
print(f"Hypothetical Answer: {hypothetical_answer}")

# Convert the hypothetical answer to a vector embedding
hypothetical_answer_embedding = embedding_model.encode(hypothetical_answer, convert_to_tensor=True)

# Search the db with your hypothetical answer embedding
# You can adjust the `limit` parameter to fetch more or fewer results
search_results = client.search(
    collection_name="rag_mini",
    data=[hypothetical_answer_embedding.tolist()], # Milvus expects a list of vectors
    limit=10, # Fetch top 10 results to rerank
    output_fields=["passage"] # Include the passage text in the results
)

# Prepare data for reranking
passages_to_rerank = [hit['entity']['passage'] for hit in search_results[0]]
pairs = [[query, passage] for passage in passages_to_rerank]

# Perform reranking
scores = reranker.predict(pairs)

# Get the indices of the top 5 reranked passages
top_indices = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:5]

# Extract the top 5 reranked passages as the context
context = "\n".join([passages_to_rerank[i] for i in top_indices])

print("\nSearch results after reranking:")
print(context)

Original Query: Was Abraham Lincoln the sixteenth President of the United States?
Ground Truth: yes
Hypothetical Answer: Yes, Abraham Lincoln was the sixteenth President of the United States.

Search results after reranking:
Abraham Lincoln (February 12, 1809 â April 15, 1865) was the sixteenth President of the United States, serving from March 4, 1861 until his assassination. As an outspoken opponent of the expansion of slavery in the United States, "[I]n his short autobiography written for the 1860 presidential campaign, Lincoln would describe his protest in the Illinois legislature as one that 'briefly defined his position on the slavery question, and so far as it goes, it was then the same that it is now." This was in reference to the anti-expansion sentiments he had then expressed. Doris Kearns Goodwin, Team of Rivals: The Political Genius of Abraham Lincoln (2005) p. 91.  Holzer pg. 232.  Writing of the Cooper Union  speech, Holzer notes, "Cooper Union proved a unique confluenc

## Now get the Context
- Initially use the first passage ONLY as your context
- In Later Experiments, you must try at least 2 different passage selection strategies (Top 3 / Top 5 / Top 10) and pass to your prompt

**Develop your Prompt**

In [22]:
system_prompt = """You are a helpful assistant that answers questions based on the provided context."""

prompt = f"""{system_prompt} \n Context: {context}: \n Question: {query} """
print(prompt)

You are a helpful assistant that answers questions based on the provided context. 
 Context: Abraham Lincoln (February 12, 1809 â April 15, 1865) was the sixteenth President of the United States, serving from March 4, 1861 until his assassination. As an outspoken opponent of the expansion of slavery in the United States, "[I]n his short autobiography written for the 1860 presidential campaign, Lincoln would describe his protest in the Illinois legislature as one that 'briefly defined his position on the slavery question, and so far as it goes, it was then the same that it is now." This was in reference to the anti-expansion sentiments he had then expressed. Doris Kearns Goodwin, Team of Rivals: The Political Genius of Abraham Lincoln (2005) p. 91.  Holzer pg. 232.  Writing of the Cooper Union  speech, Holzer notes, "Cooper Union proved a unique confluence of political culture, rhetorical opportunity, technological innovation, and human genius, and it brought Abraham Lincoln to the ce

# RAG Response for a Single Query

Before you can make any API calls, you need to initialize the Generative Model.

Now you can generate the answer using the prompt and the model.

In [23]:
# Generate answer
response = gemini_model.generate_content(prompt)
answer = response.text

# Decode and extract answer.
print("Generated Answer:", answer)

Generated Answer: Yes, Abraham Lincoln was the sixteenth President of the United States.


# Generate Responses for all the Queries in the Dataset

In [24]:
from tqdm.auto import tqdm
import time
from sentence_transformers import CrossEncoder

# Load a reranking model (if not already loaded)
try:
    reranker
except NameError:
    reranker = CrossEncoder('cross-encoder/msmarco-MiniLM-L-6-v2')


# Prepare lists to store results
generated_answers = []
retrieved_contexts = []
questions = []
ground_truths = []

# Iterate through each query in the dataset
for index, row in tqdm(queries.iterrows(), total=len(queries)):
    question = row['question']
    ground_truth = row['answer']

    # Generate a hypothetical answer for the query using the Gemini model (HyDE)
    hypothetical_answer = generate_hypothetical_answer(question, gemini_model)

    # Convert the hypothetical answer to a vector embedding
    hypothetical_answer_embedding = embedding_model.encode(hypothetical_answer, convert_to_tensor=True)

    # Search Milvus for relevant passages using the hypothetical answer embedding
    search_results = client.search(
        collection_name="rag_mini",
        data=[hypothetical_answer_embedding.tolist()],
        limit=10, # Fetch top 10 results to rerank
        output_fields=["passage"]
    )

    # Prepare data for reranking
    passages_to_rerank = [hit['entity']['passage'] for hit in search_results[0]]
    pairs = [[question, passage] for passage in passages_to_rerank]

    # Perform reranking
    scores = reranker.predict(pairs)

    # Get the indices of the top 5 reranked passages
    top_indices = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:5]

    # Extract the top 5 reranked passages as the context
    context = "\n".join([passages_to_rerank[i] for i in top_indices])


    # Develop the prompt using the original question and the retrieved context
    system_prompt = """You are a helpful assistant that answers questions based on the provided context."""
    prompt = f"""{system_prompt} \n Context: {context}: \n Question: {question} """

    # Generate answer using the Gemini model
    try:
        response = gemini_model.generate_content(prompt)
        answer = response.text
    except Exception as e:
        answer = f"Error generating response: {e}"

    # Store results
    questions.append(question)
    generated_answers.append(answer)
    retrieved_contexts.append(context)
    ground_truths.append(ground_truth)

# Create a DataFrame with the results
rag_results = pd.DataFrame({
    "question": questions,
    "answer": generated_answers,
    "contexts": retrieved_contexts,
    "ground_truths": ground_truths
})

display(rag_results.head())

  0%|          | 0/918 [00:00<?, ?it/s]

Unnamed: 0,question,answer,contexts,ground_truths
0,Was Abraham Lincoln the sixteenth President of...,"Yes, Abraham Lincoln was the sixteenth Preside...","Abraham Lincoln (February 12, 1809 â April 1...",yes
1,Did Lincoln sign the National Banking Act of 1...,"Yes, Lincoln signed the National Banking Act o...",Lincoln believed in the Whig theory of the pre...,yes
2,Did his mother die of pneumonia?,The provided context states that Theodore Roos...,"Alice Hathaway Lee Roosevelt (July 29, 1861 in...",no
3,How many long was Lincoln's formal education?,Lincoln's formal education consisted of about ...,Lincoln's formal education consisted of about ...,18 months
4,When did Lincoln begin his political career?,Lincoln began his political career in 1832.,"Lincoln began his political career in 1832, at...",1832


# Finding out the Basic QA Metrics (F1 score, EM score)

In [25]:
import evaluate as hf_evaluate # Import with an alias to avoid name conflict

squad_metric = hf_evaluate.load("squad_v2")

# The SQuAD metric requires the data in a specific format
# We need a list of dictionaries for predictions and a list of dictionaries for references

predictions = []
references = []

for index, row in rag_results.iterrows():
    predictions.append({
        'id': str(index), # SQuAD metric expects string ids
        'prediction_text': row['answer'],
        'no_answer_probability': 0.0 # Assuming all questions have answers in this dataset
    })
    references.append({
        'id': str(index), # SQuAD metric expects string ids
        'answers': {
            'answer_start': [-1], # Start index is not available in this dataset
            'text': [row['ground_truths']]
        }
    })

# Compute the metrics
results = squad_metric.compute(predictions=predictions, references=references)

print("SQuAD Metrics:")
print(f"  Exact Match (EM): {results['exact']:.4f}")
print(f"  F1 Score: {results['f1']:.4f}")

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading extra modules: 0.00B [00:00, ?B/s]

SQuAD Metrics:
  Exact Match (EM): 1.3072
  F1 Score: 23.2606


# Advanced Evaluation using RAGAs

In [26]:
data = {
    "user_input": rag_results["question"].tolist(),
    "response": rag_results["answer"].tolist(),
    "retrieved_contexts": [[context] for context in rag_results["contexts"].tolist()], # Ragas expects a list of lists for contexts
    "reference": rag_results["ground_truths"].tolist() # Add ground_truths as reference for context_recall
}

# Convert dict to dataset
dataset = Dataset.from_dict(data)

In [27]:
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import LLMContextRecall, Faithfulness, FactualCorrectness, answer_relevancy, context_precision
from ragas import evaluate
from google.colab import userdata

# Get API key
openai_api_key = userdata.get("OPENAI_API_KEY")
if not openai_api_key:
    raise ValueError("OPENAI_API_KEY not found. Please set it as a Colab secret.")

# Initialize LangChain models directly
embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
openai_llm = ChatOpenAI(model="gpt-4o-mini", api_key=openai_api_key)

# Wrap for RAGas
evaluator_llm = LangchainLLMWrapper(openai_llm)

# Evaluate
result = evaluate(
    dataset=dataset,
    metrics=[
        Faithfulness(),
        answer_relevancy,
        LLMContextRecall(),
        context_precision,
        FactualCorrectness()
    ],
    llm=evaluator_llm,
    embeddings=embeddings
)

print(result)


Evaluating:   0%|          | 0/4590 [00:00<?, ?it/s]

ERROR:ragas.executor:Exception raised in Job[1]: IndexError(list index out of range)
ERROR:ragas.executor:Exception raised in Job[11]: IndexError(list index out of range)
ERROR:ragas.executor:Exception raised in Job[6]: IndexError(list index out of range)
ERROR:ragas.executor:Exception raised in Job[16]: IndexError(list index out of range)
ERROR:ragas.executor:Exception raised in Job[21]: IndexError(list index out of range)
ERROR:ragas.executor:Exception raised in Job[26]: IndexError(list index out of range)
ERROR:ragas.executor:Exception raised in Job[36]: IndexError(list index out of range)
ERROR:ragas.executor:Exception raised in Job[31]: IndexError(list index out of range)
ERROR:ragas.executor:Exception raised in Job[41]: IndexError(list index out of range)
ERROR:ragas.executor:Exception raised in Job[51]: IndexError(list index out of range)
ERROR:ragas.executor:Exception raised in Job[46]: IndexError(list index out of range)
ERROR:ragas.executor:Exception raised in Job[56]: IndexE

{'faithfulness': 0.9322, 'answer_relevancy': 0.8108, 'context_recall': 0.6493, 'context_precision': 0.8648, 'factual_correctness(mode=f1)': 0.1616}
