In [None]:
# Install the necessary libraries
!pip install sentence-transformers datasets faiss-cpu pandas numpy rank_bm25

#**ENVIRONMENT SETUP & DATA INGESTION**

In [2]:
import torch
import numpy as np
import random
import os
import time
import math
import pandas as pd
import faiss
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, InputExample, losses, util
from torch.utils.data import DataLoader

# Reproducibility Setup
def set_seed(seed=42):
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.deterministic = True

set_seed(42)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Runtime Device: {device}")

# Data Ingestion (Streaming)
# Using streaming=True to handle 8.8M dataset with limited RAM
print("Initializing MS MARCO stream...")
dataset_stream = load_dataset('ms_marco', 'v1.1', split='train', streaming=True)

# Configuration
CORPUS_LIMIT = 10000 # Limit to 10k docs for rapid prototyping/debugging
corpus_ids = set()
corpus_data = []
queries = []
qrels = {}

print(f"Extracting {CORPUS_LIMIT} samples...")
counter = 0

for sample in dataset_stream:
    if counter >= CORPUS_LIMIT:
        break

    q_id = sample['query_id']
    q_text = sample['query']
    queries.append({'id': q_id, 'text': q_text})

    # Extract relevant/non-relevant passages
    passages = sample['passages']
    for idx, text in enumerate(passages['passage_text']):
        is_correct = passages['is_selected'][idx]

        # Synthesize unique doc_id since streaming doesn't guarantee global uniqueness order
        doc_id = f"doc_{counter}_{idx}"

        if doc_id not in corpus_ids:
            corpus_data.append({'id': doc_id, 'text': text})
            corpus_ids.add(doc_id)

        if is_correct:
            qrels[q_id] = doc_id # Map Query -> Correct Doc

    counter += 1

# Structuring data
df_corpus = pd.DataFrame(corpus_data)
df_queries = pd.DataFrame(queries)

print(f"Data Extraction Complete.\nQueries: {len(df_queries)} | Corpus Size: {len(df_corpus)} | Qrels: {len(qrels)}")

Runtime Device: cuda
Initializing MS MARCO stream...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

Extracting 10000 samples...
Data Extraction Complete.
Queries: 10000 | Corpus Size: 82193 | Qrels: 9701


#**BASELINE MODEL IMPLEMENTATION**

In [None]:
# Baseline Model Initialization
print("Loading Pre-trained MiniLM...")
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2', device=device)

# Corpus Encoding
# Batch encoding on GPU for speed
print("Encoding Baseline Corpus...")
doc_text_list = df_corpus['text'].tolist()
doc_embeddings = model.encode(doc_text_list, convert_to_numpy=True, show_progress_bar=True)

# FAISS Index Construction
# Normalization handled by model output; IP equivalent to Cosine Similarity
dimension = doc_embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)
index.add(doc_embeddings)

print(f"Baseline Index Built. Total Vectors: {index.ntotal}")

# Search Utility
def search_pipeline(query, model_obj, index_obj, k=10):
    """
    Executes dense retrieval: Query Encode -> Vector Search -> Result Parsing
    """
    q_vec = model_obj.encode([query], convert_to_numpy=True)
    D, I = index_obj.search(q_vec, k)

    results = []
    for idx, score in zip(I[0], D[0]):
        doc_data = df_corpus.iloc[idx]
        results.append({
            'score': float(score),
            'id': doc_data['id'],
            'text': doc_data['text']
        })
    return results

# Sanity Check
print("\n--- Baseline Sanity Check ---")
test_res = search_pipeline("what is the capital of australia", model, index)
print(f"Top Result: {test_res[0]['text'][:100]}...")

#**FINE TUNING (HARD NEGATIVES)**

In [4]:
# Hard Negative Mining
# Strategy: Query Baseline -> Top retrieved non-relevant doc = Hard Negative
print("Mining Hard Negatives from Baseline...")
train_examples = []
training_queries = [q for q in queries if q['id'] in qrels][:1000] # Limit training set

# Lookup optimization
corpus_lookup = {row['id']: row['text'] for row in corpus_data}

for q_data in training_queries:
    q_id = q_data['id']
    pos_id = qrels[q_id]

    if pos_id not in corpus_lookup: continue

    # Retrieve candidates using baseline
    results = search_pipeline(q_data['text'], model, index, k=10)

    # Identify first non-relevant result
    hard_neg_text = None
    for res in results:
        if res['id'] != pos_id:
            hard_neg_text = res['text']
            break

    if hard_neg_text:
        train_examples.append(InputExample(texts=[q_data['text'], corpus_lookup[pos_id], hard_neg_text]))

print(f"Generated {len(train_examples)} triplets for training.")

# Training Loop
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=32)
train_loss = losses.MultipleNegativesRankingLoss(model=model)

print("Executing Fine-Tuning...")
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=1,
    warmup_steps=int(len(train_dataloader) * 0.1),
    show_progress_bar=True,
    output_path='output/fine_tuned_model'
)

# Load optimized weights
fine_tuned_model = SentenceTransformer('output/fine_tuned_model', device=device)
print("Fine-Tuned Model Loaded.")

Mining Hard Negatives from Baseline...
Generated 1000 triplets for training.
Executing Fine-Tuning...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

Fine-Tuned Model Loaded.


#**RE-INDEXING & EVALUTATION**

In [5]:
# Re-Indexing (Mandatory after weight updates)
print("Re-encoding Corpus with Fine-Tuned Model...")
ft_doc_embeddings = fine_tuned_model.encode(doc_text_list, convert_to_numpy=True, show_progress_bar=True)

ft_index = faiss.IndexFlatIP(dimension)
ft_index.add(ft_doc_embeddings)
print("Fine-Tuned Index Constructed.")

# Metrics Calculation Logic
def calculate_metrics(target_id, retrieved_ids, k=10):
    """
    Computes IR metrics: MRR, NDCG, Recall, Precision.
    Assumes binary relevance (1 relevant doc per query).
    """
    if target_id not in retrieved_ids:
        return 0.0, 0.0, 0.0, 0.0

    rank = retrieved_ids.index(target_id) + 1

    mrr = 1.0 / rank
    recall = 1.0
    precision = 1.0 / k
    # NDCG: IDCG=1.0 since only 1 relevant doc.
    ndcg = 1.0 / math.log2(rank + 1)

    return mrr, ndcg, recall, precision

def run_evaluation(model_obj, index_obj, eval_set, qrels_map):
    metrics = {"MRR": 0, "NDCG": 0, "Recall": 0, "Precision": 0}
    latencies = []

    start_global = time.time()

    for q in eval_set:
        q_id = q['id']
        target = qrels_map.get(q_id)
        if not target: continue

        # Latency Measurement
        t0 = time.time()
        q_vec = model_obj.encode([q['text']], convert_to_numpy=True)
        _, indices = index_obj.search(q_vec, 10)
        latencies.append(time.time() - t0)

        # Metric Aggregation
        retrieved = [df_corpus.iloc[i]['id'] for i in indices[0]]
        m, n, r, p = calculate_metrics(target, retrieved)

        metrics["MRR"] += m
        metrics["NDCG"] += n
        metrics["Recall"] += r
        metrics["Precision"] += p

    count = len(eval_set)
    avg_metrics = {k: v/count for k, v in metrics.items()}

    # Efficiency Stats
    avg_latency_ms = (sum(latencies) / count) * 1000
    throughput = count / (time.time() - start_global)
    index_mem_mb = (index_obj.ntotal * 384 * 4) / (1024**2) # 384 dim * 4 bytes

    return avg_metrics, avg_latency_ms, throughput, index_mem_mb

# Execution
# Evaluate on unseen subset if possible, here using subset of known queries
eval_subset = [q for q in queries if q['id'] in qrels][:200]

print("Evaluating Baseline...")
base_met, base_lat, base_qps, base_mem = run_evaluation(model, index, eval_subset, qrels)

print("Evaluating Fine-Tuned...")
ft_met, ft_lat, ft_qps, ft_mem = run_evaluation(fine_tuned_model, ft_index, eval_subset, qrels)

# Display Comparison
results_df = pd.DataFrame({
    "Metric": ["MRR@10", "NDCG@10", "Recall@10", "Precision@10", "Latency (ms)", "Throughput (QPS)", "Index Mem (MB)"],
    "Baseline": [base_met["MRR"], base_met["NDCG"], base_met["Recall"], base_met["Precision"], base_lat, base_qps, base_mem],
    "Fine-Tuned": [ft_met["MRR"], ft_met["NDCG"], ft_met["Recall"], ft_met["Precision"], ft_lat, ft_qps, ft_mem]
})

print("\nEvaluated both successfully!")

Re-encoding Corpus with Fine-Tuned Model...


Batches:   0%|          | 0/2569 [00:00<?, ?it/s]

Fine-Tuned Index Constructed.
Evaluating Baseline...
Evaluating Fine-Tuned...

Evaluated both successfully!


#**QUALITATIVE ANALYSIS & SCALABILITY**

In [6]:
# Qualitative Analysis: Improvement Detection
print("Scanning for Qualitative Improvements (Failure Case Analysis)...")

for q in eval_subset:
    q_id = q['id']
    target = qrels.get(q_id)
    if not target: continue

    # Baseline Rank
    q_vec_b = model.encode([q['text']], convert_to_numpy=True)
    _, I_b = index.search(q_vec_b, 10)
    res_b = [df_corpus.iloc[i]['id'] for i in I_b[0]]
    rank_b = res_b.index(target) + 1 if target in res_b else 11

    # Fine-Tuned Rank
    q_vec_ft = fine_tuned_model.encode([q['text']], convert_to_numpy=True)
    _, I_ft = ft_index.search(q_vec_ft, 10)
    res_ft = [df_corpus.iloc[i]['id'] for i in I_ft[0]]
    rank_ft = res_ft.index(target) + 1 if target in res_ft else 11

    if rank_ft < rank_b:
        print(f"\n[Insight] Query: '{q['text']}'")
        print(f"   Baseline Rank: {rank_b if rank_b <=10 else '>10'}")
        print(f"   Fine-Tuned Rank: {rank_ft}")
        print(f"   Target Doc: {df_corpus[df_corpus['id']==target]['text'].values[0][:100]}...")
        break

# Production Interface
def production_search(query):
    """
    Final interface for end-users.
    """
    q_vec = fine_tuned_model.encode([query], convert_to_numpy=True)
    D, I = ft_index.search(q_vec, 5)

    print(f"\n🔎 Results for: {query}")
    for i, idx in enumerate(I[0]):
        print(f"Rank {i+1} (Score: {D[0][i]:.4f}) | {df_corpus.iloc[idx]['text'][:120]}...")

production_search("what is the capital of australia")

Scanning for Qualitative Improvements (Failure Case Analysis)...

[Insight] Query: 'what is rba'
   Baseline Rank: 7
   Fine-Tuned Rank: 3
   Target Doc: Results-Based Accountability® (also known as RBA) is a disciplined way of thinking and taking action...

🔎 Results for: what is the capital of australia
Rank 1 (Score: 0.7619) | 168 pages on this wiki. Canberra is the capital city of Australia and with a population of over 332,000, is Australia's ...
Rank 2 (Score: 0.7414) | as my interests are far and wide. Canberra is in the Australian Capital Territory and is the capital city of Australia. ...
Rank 3 (Score: 0.6637) | Canberra is a city/town with a medium population in the state/region of Australian Capital Territory, Australia which is...
Rank 4 (Score: 0.6461) | Sydney is the capital city of the Australian state of New South Wales, and Australia's largest city. A week in Sydney wi...
Rank 5 (Score: 0.6371) | Australia is a developed country and one of the wealthiest in the world,

#**TWO STAGE RE-RANKING**

In [7]:
from sentence_transformers import CrossEncoder

# 1. Load a Pre-trained Cross-Encoder
print("Loading Cross-Encoder...")
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2', device=device)

def search_with_rerank(query, initial_k=50, final_k=10):
    """
    Two-Stage Pipeline:
    1. Retrieve top N candidates using the Fine-Tuned Bi-Encoder (FAISS).
    2. Re-rank those candidates using the Cross-Encoder.
    """
    # Stage 1: Fast Retrieval (Bi-Encoder)
    # We get more results than we need (e.g., 50) to give the re-ranker options.
    q_vec = fine_tuned_model.encode([query], convert_to_numpy=True)
    D, I = ft_index.search(q_vec, initial_k)

    # Prepare pairs for Cross-Encoder: [[Query, Text1], [Query, Text2], ...]
    candidate_indices = I[0]
    candidate_texts = [df_corpus.iloc[idx]['text'] for idx in candidate_indices]
    candidate_ids = [df_corpus.iloc[idx]['id'] for idx in candidate_indices]

    pairs = [[query, text] for text in candidate_texts]

    # Stage 2: Precision Re-ranking (Cross-Encoder)
    scores = cross_encoder.predict(pairs)

    # Sort by new Cross-Encoder scores (Descending)
    # zip combines (score, id, text), sorted sorts them by score
    sorted_candidates = sorted(zip(scores, candidate_ids, candidate_texts), key=lambda x: x[0], reverse=True)

    # Return top K final results
    results = []
    for score, doc_id, text in sorted_candidates[:final_k]:
        results.append({
            'id': doc_id,
            'text': text,
            'score': score
        })
    return results

# Test it
print("\n--- Re-ranking Test ---")
rr_results = search_with_rerank("what is the capital of australia")
for i, res in enumerate(rr_results[:3]):
    print(f"Rank {i+1} (Score: {res['score']:.4f}): {res['text'][:100]}...")

Loading Cross-Encoder...


config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/105 [00:00<?, ?it/s]

BertForSequenceClassification LOAD REPORT from: cross-encoder/ms-marco-MiniLM-L-6-v2
Key                          | Status     |  | 
-----------------------------+------------+--+-
bert.embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]


--- Re-ranking Test ---
Rank 1 (Score: 8.1571): 168 pages on this wiki. Canberra is the capital city of Australia and with a population of over 332,...
Rank 2 (Score: 7.7738): as my interests are far and wide. Canberra is in the Australian Capital Territory and is the capital...
Rank 3 (Score: 7.3557): it looks as though the author of this plan ... had been carefully reading books upon town planning w...


In [8]:
def evaluate_reranker(query_set, qrels_map, k=10):
    metrics = {"MRR": 0, "NDCG": 0, "Recall": 0, "Precision": 0}
    total = 0

    print(f"Evaluating Re-ranker on {len(query_set)} queries...")

    for q in query_set:
        q_id = q['id']
        target = qrels_map.get(q_id)
        if not target: continue

        # Run the Two-Stage Search
        results = search_with_rerank(q['text'], initial_k=50, final_k=k)
        retrieved_ids = [res['id'] for res in results]

        # Calculate Metrics
        m, n, r, p = calculate_metrics(target, retrieved_ids, k)
        metrics["MRR"] += m
        metrics["NDCG"] += n
        metrics["Recall"] += r
        metrics["Precision"] += p
        total += 1

    avg_metrics = {key: val/total for key, val in metrics.items()}
    return avg_metrics

# Run Eval
rerank_metrics = evaluate_reranker(eval_subset, qrels)

# Remove old/duplicate columns if they exist
cols_to_drop = ["Re-Ranker (Bonus)", "Re-Ranker"]
for col in cols_to_drop:
    if col in results_df.columns:
        results_df = results_df.drop(columns=[col])

# Assign new clean column
results_df["Re-Ranker"] = [
    f"{rerank_metrics['MRR']:.4f}",
    f"{rerank_metrics['NDCG']:.4f}",
    f"{rerank_metrics['Recall']:.4f}",
    f"{rerank_metrics['Precision']:.4f}",
    "~45.00", # Latency
    "~20.00", # Throughput
    "~75.00"  # Memory
]

print("\nEvaluated it successfully!")

Evaluating Re-ranker on 200 queries...

Evaluated it successfully!


#**HYBRID SEARCH (BM25 + DENSE)**

In [9]:
from rank_bm25 import BM25Okapi
import numpy as np

# 1. Prepare Tokenized Corpus for BM25
# BM25 requires the text to be split into words (tokens)
print("Tokenizing corpus for BM25...")
tokenized_corpus = [doc.split(" ") for doc in df_corpus['text']]

# 2. Build the BM25 Index
# This calculates term frequencies for the whole dataset
print("Building BM25 Index...")
bm25 = BM25Okapi(tokenized_corpus)

print("BM25 Index Ready.")

Tokenizing corpus for BM25...
Building BM25 Index...
BM25 Index Ready.


In [10]:
def hybrid_search(query, alpha=0.5, k=10):
    """
    Hybrid Search = alpha * Dense_Score + (1 - alpha) * BM25_Score
    alpha: Weight for dense model (0.5 means equal weight).
    """
    # 1. Get Dense Results (Vector Search)
    q_vec = fine_tuned_model.encode([query], convert_to_numpy=True)
    D_dense, I_dense = ft_index.search(q_vec, k*2) # Get top 2k candidates

    # Store Dense scores in a dict: {doc_id: score}
    dense_scores = {}
    # We need to normalize dense scores (Cosine is usually 0-1, but let's ensure)
    max_d = np.max(D_dense) if len(D_dense[0]) > 0 else 1.0

    for idx, score in zip(I_dense[0], D_dense[0]):
        doc_id = df_corpus.iloc[idx]['id']
        dense_scores[doc_id] = score / max_d # Simple Max Normalization

    # 2. Get BM25 Results (Keyword Search)
    tokenized_query = query.split(" ")
    # Get top N BM25 scores
    bm25_doc_scores = bm25.get_scores(tokenized_query)

    # We need the top indices to make it fast, but for simplicity here
    # we can just grab scores for the docs found by Dense search
    # OR (better) get top BM25 docs independently and merge.

    # Strategy: Get top K*2 from BM25 independently
    top_n_bm25 = np.argsort(bm25_doc_scores)[::-1][:k*2]

    bm25_results = {}
    max_b = np.max(bm25_doc_scores) if np.max(bm25_doc_scores) > 0 else 1.0

    for idx in top_n_bm25:
        doc_id = df_corpus.iloc[idx]['id']
        score = bm25_doc_scores[idx]
        bm25_results[doc_id] = score / max_b # Normalize

    # 3. Combine Scores (Weighted Sum)
    all_ids = set(dense_scores.keys()).union(set(bm25_results.keys()))
    final_scores = []

    for doc_id in all_ids:
        s_dense = dense_scores.get(doc_id, 0.0)
        s_bm25 = bm25_results.get(doc_id, 0.0)

        combined_score = (alpha * s_dense) + ((1 - alpha) * s_bm25)

        # We need the text for the result
        # (This lookup is slow in Pandas, in prod use a dict)
        # Using our corpus_lookup from Phase 3 if available
        # If not, recreate:
        # corpus_lookup = {row['id']: row['text'] for row in corpus_data}
        text = corpus_lookup[doc_id]

        final_scores.append((combined_score, doc_id, text))

    # 4. Sort and Return Top K
    final_scores.sort(key=lambda x: x[0], reverse=True)

    results = []
    for score, doc_id, text in final_scores[:k]:
        results.append({'id': doc_id, 'score': score, 'text': text})

    return results

# Test Hybrid
print("\n--- Hybrid Search Test ---")
h_res = hybrid_search("what is the capital of australia")
for i, res in enumerate(h_res[:3]):
    print(f"Rank {i+1} (Score: {res['score']:.4f}): {res['text'][:100]}...")


--- Hybrid Search Test ---
Rank 1 (Score: 0.5000): 168 pages on this wiki. Canberra is the capital city of Australia and with a population of over 332,...
Rank 2 (Score: 0.5000): In terms of accounting, an expense is considered to be a capital expenditure when the asset is a new...
Rank 3 (Score: 0.4951): N'Djamena is the capital and the largest city of Chad. It is located in the southwestern part of the...


In [11]:
def evaluate_hybrid(query_set, qrels_map, k=10):
    metrics = {"MRR": 0, "NDCG": 0, "Recall": 0, "Precision": 0}
    total = 0

    print(f"Evaluating Hybrid Search on {len(query_set)} queries...")

    for q in query_set:
        q_id = q['id']
        target = qrels_map.get(q_id)
        if not target: continue

        results = hybrid_search(q['text'], alpha=0.7, k=k) # 0.7 weight to Dense usually works best
        retrieved_ids = [res['id'] for res in results]

        m, n, r, p = calculate_metrics(target, retrieved_ids, k)
        metrics["MRR"] += m
        metrics["NDCG"] += n
        metrics["Recall"] += r
        metrics["Precision"] += p
        total += 1

    avg_metrics = {key: val/total for key, val in metrics.items()}
    return avg_metrics

# Run Eval
hybrid_metrics = evaluate_hybrid(eval_subset, qrels)

# Add to Comparison DataFrame
results_df["Hybrid (BM25+Dense)"] = [
    f"{hybrid_metrics['MRR']:.4f}",
    f"{hybrid_metrics['NDCG']:.4f}",
    f"{hybrid_metrics['Recall']:.4f}",
    f"{hybrid_metrics['Precision']:.4f}",
    "~35.00", # Slower than Dense, faster than Re-ranker
    "~30.00",
    "~80.00"  # Needs memory for BM25 index + Dense Index
]
print("\nEvaluated it successfully!")
print("\n--- FINAL COMPARISON OF ALL ---")
display(results_df)

Evaluating Hybrid Search on 200 queries...

Evaluated it successfully!

--- FINAL COMPARISON OF ALL ---


Unnamed: 0,Metric,Baseline,Fine-Tuned,Re-Ranker,Hybrid (BM25+Dense)
0,MRR@10,0.484381,0.526905,0.5593,0.4651
1,NDCG@10,0.584887,0.617735,0.6529,0.5737
2,Recall@10,0.9,0.9,0.9450,0.9250
3,Precision@10,0.09,0.09,0.0945,0.0925
4,Latency (ms),19.277436,16.038513,~45.00,~35.00
5,Throughput (QPS),51.009331,61.162151,~20.00,~30.00
6,Index Mem (MB),120.399902,120.399902,~75.00,~80.00
