In [1]:
# import nltk
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct, Distance, VectorParams, SearchParams
import uuid
import re
import time
import psutil
import os
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Download NLTK sentence tokenizer
# nltk.download('punkt', quiet=True)

In [3]:
# import nltk
# nltk.data.path.append('/home/debmalya/nltk_data')

In [4]:
# Initialize embedding model
model = SentenceTransformer('all-MiniLM-L6-v2')

In [5]:
# Initialize Qdrant client (local or cloud - we chose local for now using docker - if issue, check port mapping)
client = QdrantClient(host="localhost", port=6333)

collection_name = "CSE291A-RAG-Project-Trial"

# Create collection
client.recreate_collection(
    collection_name=collection_name,
    vectors_config=VectorParams(size=384, distance=Distance.COSINE)
)

  client.recreate_collection(


True

In [6]:
# Step 1: Chunk text into sentences
def chunk_text(text):
    # Split on punctuation followed by space
    sentences = list(map(str.strip, re.split(r'(?<=[.!?]) +', text.strip())))
    print(sentences)
    return [s for s in sentences if s]

In [7]:
# Step 2: Embed and store chunks in Qdrant
def store_chunks(text):
    chunks = chunk_text(text)
    embeddings = model.encode(chunks)

    points = [
        PointStruct(
            id=str(uuid.uuid4()), 
            vector=embedding.tolist(), 
            payload={"text": chunk} # add other metadata that we might need to store (recency, etc..)
        ) for chunk, embedding in zip(chunks, embeddings)        
    ]
    
    client.upsert(collection_name=collection_name, points=points)
    print(f"Stored {len(points)} chunks in Qdrant vector store.")

In [8]:
# Step 3: Query Qdrant with semantic search
def search_chunks(query, top_k=5):
    query_vector = model.encode(query).tolist()
    results = client.search(
        collection_name=collection_name,
        query_vector=query_vector,
        limit=top_k,
        search_params=SearchParams(hnsw_ef=128)
    )
    return [hit.payload["text"] for hit in results]

In [9]:
text = """
    Retrieval-Augmented Generation (RAG) is a powerful technique that combines external knowledge retrieval with language generation. It allows models to access relevant information beyond their training data. This improves factual accuracy. 
    Retrieval-Augmented Generation (RAG) is a powerful technique that combines external knowledge retrieval with graphics generation. It allows models to access relevant information beyond their testing data. This reduces hallucination.
""".strip()

store_chunks(text)

['Retrieval-Augmented Generation (RAG) is a powerful technique that combines external knowledge retrieval with language generation.', 'It allows models to access relevant information beyond their training data.', 'This improves factual accuracy.', 'Retrieval-Augmented Generation (RAG) is a powerful technique that combines external knowledge retrieval with graphics generation.', 'It allows models to access relevant information beyond their testing data.', 'This reduces hallucination.']
Stored 6 chunks in Qdrant vector store.


In [10]:
query = "How does RAG improve accuracy?"
results = search_chunks(query, 5)

print("Relevant Chunks:")
for i, res in enumerate(results, 1):
    print(f"{i}. {res}")

Relevant Chunks:
1. This improves factual accuracy.
2. Retrieval-Augmented Generation (RAG) is a powerful technique that combines external knowledge retrieval with graphics generation.
3. Retrieval-Augmented Generation (RAG) is a powerful technique that combines external knowledge retrieval with language generation.
4. This reduces hallucination.
5. It allows models to access relevant information beyond their testing data.


  results = client.search(


In [15]:
def get_retrieval_metrics(expected_chunks, retrieved_chunks, k=5):
    """
    expected_chunks: list of relevant chunk texts (ground truth)
    retrieved_chunks: list of retrieved chunk texts (top-k)
    k: number of retrieved chunks to evaluate

    Returns: dict of retrieval metrics
    """
    expected_set = set(expected_chunks)
    retrieved_set = set(retrieved_chunks[:k])

    # Precision@k
    precision_at_k = len(expected_set & retrieved_set) / k

    # Recall@k
    recall_at_k = len(expected_set & retrieved_set) / len(expected_set) if expected_set else 0

    # Hit Ratio@k
    hit_ratio = 1 if expected_set & retrieved_set else 0

    # MRR (Mean Reciprocal Rank)
    ranks = [i + 1 for i, chunk in enumerate(retrieved_chunks[:k]) if chunk in expected_set]
    mrr = 1 / ranks[0] if ranks else 0

    # DCG and nDCG
    relevance_scores = [1 if chunk in expected_set else 0 for chunk in retrieved_chunks[:k]]
    dcg = sum([score / np.log2(i + 2) for i, score in enumerate(relevance_scores)])
    ideal_scores = sorted(relevance_scores, reverse=True)
    idcg = sum([score / np.log2(i + 2) for i, score in enumerate(ideal_scores)])
    ndcg = dcg / idcg if idcg > 0 else 0

    return {
        "precision@k": round(precision_at_k, 3),
        "recall@k": round(recall_at_k, 3),
        "hit_ratio@k": hit_ratio,
        "mrr": round(mrr, 3),
        "ndcg": round(ndcg, 3)
    }

In [16]:
def retrieve_and_find_metrics(query_input, expected_chunks):
    """
    query_function: function that performs retrieval and returns retrieved_chunks
    query_input: input to pass to the query_function
    expected_chunks: ground truth relevant chunks

    Returns: dict of efficiency and retrieval metrics
    """
    
    # Start timing
    start_time = time.time()

    # Measure memory before
    process = psutil.Process(os.getpid())
    mem_before = process.memory_info().rss / 1024 ** 2  # MB

    # Run retrieval
    number_of_chunks_to_retrieve = len(expected_chunks)
    retrieved_chunks = search_chunks(query_input, number_of_chunks_to_retrieve)
    print(retrieved_chunks)

    # Measure memory after
    mem_after = process.memory_info().rss / 1024 ** 2  # MB
    end_time = time.time()

    # Efficiency metrics
    latency = end_time - start_time
    throughput = 1 / latency if latency > 0 else 0
    memory_used = mem_after - mem_before

    # Retrieval quality
    retrieval_metrics = get_retrieval_metrics(expected_chunks, retrieved_chunks)

    print("----------- RETRIEVAL METRICS -----------")
    print("Precision @ K  : ", retrieval_metrics["precision@k"])
    print("Recall @ K     : ", retrieval_metrics["recall@k"])
    print("Hit Ratio @ K  : ", retrieval_metrics["hit_ratio@k"])
    print("MRR            : ", retrieval_metrics["mrr"])
    print("NDCG           : ", retrieval_metrics["ndcg"])

    print("----------- EFFICIENCY METRICS -----------")
    print("Latency (sec)  : ", latency)
    print("Throughput(qps): ", throughput)
    print("Memory Used(MB): ", memory_used)

In [17]:
query = "How does RAG improve accuracy?"
expected_chunks = ["This improves factual accuracy.", "This reduces hallucination."]

retrieve_and_find_metrics(query, expected_chunks)

['This improves factual accuracy.', 'Retrieval-Augmented Generation (RAG) is a powerful technique that combines external knowledge retrieval with graphics generation.']
----------- RETRIEVAL METRICS -----------
Precision @ K  :  0.2
Recall @ K     :  0.5
Hit Ratio @ K  :  1
MRR            :  1.0
NDCG           :  1.0
----------- EFFICIENCY METRICS -----------
Latency (sec)  :  0.1112675666809082
Throughput(qps):  8.987344918468363
Memory Used(MB):  0.0


  results = client.search(
