# Tracing performance of RAG pipeline using MLflow Notebook

### Setup Environment

In [2]:
%pip install -U langchain langchain-community unstructured pypdf \
              sentence-transformers accelerate mlflow python-docx faiss-cpu


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


### Import Libraries

In [3]:
from langchain_community.document_loaders import TextLoader, PyPDFLoader, UnstructuredFileLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from sentence_transformers import SentenceTransformer, util
from transformers import pipeline
import mlflow

In [2]:
# load multimple data sources (documents) using path

from langchain_community.document_loaders import TextLoader, PyPDFLoader, UnstructuredWordDocumentLoader

files = [
    "data/atomic-habits.pdf",
    "data/another-doc.pdf",
    "data/notes.txt",
    "data/report.docx"
]

docs = []

for file in files:
    if file.endswith(".pdf"):
        loader = PyPDFLoader(file)
    elif file.endswith(".txt"):
        loader = TextLoader(file)
    elif file.endswith(".docx") or file.endswith(".doc"):
        loader = UnstructuredWordDocumentLoader(file)
    else:
        print(f"Skipping unsupported file type: {file}")
        continue
    
    docs.extend(loader.load())

print(f"✅ Loaded {len(docs)} chunks from {len(files)} documents")


ValueError: File path data/another-doc.pdf is not a valid file or url

### Load and Prepare Documents

In [4]:
# load multimple data sources (documents) using path

from langchain_community.document_loaders import TextLoader, PyPDFLoader, UnstructuredWordDocumentLoader

# TXT
# txt_loader = TextLoader("data/name.txt")

# Example: PDF
pdf_loader = PyPDFLoader("data/atomic-habits.pdf")

# DOC/DOCX
# doc_loader = UnstructuredWordDocumentLoader("data/name.docx")

# Load all docs
docs = []
# docs.extend(txt_loader.load())
docs.extend(pdf_loader.load())
# docs.extend(doc_loader.load())

print(f"Loaded {len(docs)} chunks")


Loaded 256 chunks


### Preprocessing, Embeddings & Build Vector Store (FAISS)
- Text splitting and embedding models setup.


In [5]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

# Split documents into chunks
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = splitter.split_documents(docs)

try:
    embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    vectorstore = FAISS.from_documents(chunks, embedder)
    status = "✅ Embedder loaded successfully"
except Exception as e:
    embedder = None
    status = f"❌ Failed to load embedder: {e}"


print(f"Split into {len(chunks)} chunks")
print(status)

  embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


Split into 660 chunks
✅ Embedder loaded successfully


### Load generator

In [6]:
from transformers import pipeline

generator = pipeline("text2text-generation", model="google/flan-t5-base", device=-1)

Device set to use cpu


### Evaluation Function

In [7]:
from sentence_transformers import SentenceTransformer, util

eval_model = SentenceTransformer("all-MiniLM-L6-v2")

def evaluate(query, answer, retrieved_chunks):
    # Similarity between answer and context
    context = " ".join([c.page_content for c in retrieved_chunks])
    emb_ans = eval_model.encode(answer, convert_to_tensor=True)
    emb_ctx = eval_model.encode(context, convert_to_tensor=True)
    sim = util.cos_sim(emb_ans, emb_ctx).item()
    
    # Simple length metric
    length = len(answer.split())
    
    return {
        "semantic_similarity": sim,
        "answer_length": length,
        "context_size": len(retrieved_chunks),
    }


### RAG Function

In [8]:
def rag_pipeline_with_mlflow(query):
    with mlflow.start_run():
        # Retrieve context
        hits = vectorstore.similarity_search(query, k=5)
        context = "\n\n".join([h.page_content for h in hits])

        # Generate
        prompt = f"Answer using ONLY the context:\n\n{context}\n\nQuestion: {query}\nAnswer:"
        answer = generator(prompt, max_new_tokens=200, do_sample=False)[0]["generated_text"].strip()

        # Evaluate
        metrics = evaluate(query, answer, hits)

        # Log input/output
        mlflow.log_param("query", query)
        mlflow.log_param("retrieved_chunks", len(hits))
        mlflow.log_text(context, "context.txt")
        mlflow.log_text(answer, "answer.txt")

        # Log metrics
        mlflow.log_metrics(metrics)

    return answer, metrics

In [9]:
def rag_pipeline_with_mlflow(query, max_k=10, threshold=0.75):
    with mlflow.start_run():
        # Retrieve with scores
        hits_with_scores = vectorstore.similarity_search_with_score(query, k=max_k)

        # Filter by threshold
        filtered_hits = [h for h, score in hits_with_scores if score >= threshold]

        # Fallback: ensure at least one doc
        if not filtered_hits:
            filtered_hits = [hits_with_scores[0][0]]

        # Extract context
        context = "\n\n".join([h.page_content for h in filtered_hits])

        # Generate answer
        prompt = f"Answer using ONLY the context:\n\n{context}\n\nQuestion: {query}\nAnswer:, note:if the query is not realted to the given context just say 'I am not able to 
        answer = generator(prompt, max_new_tokens=200, do_sample=False)[0]["generated_text"].strip()

        # Evaluate answer (your custom function)
        metrics = evaluate(query, answer, filtered_hits)

        ## Log parameters
        mlflow.log_param("query", query)
        mlflow.log_param("max_k", max_k)
        mlflow.log_param("threshold", threshold)
        mlflow.log_param("retrieved_chunks", len(filtered_hits))

        ## Log artifacts
        mlflow.log_text(context, "context.txt")
        mlflow.log_text(answer, "answer.txt")

        ## Log metrics
        mlflow.log_metrics(metrics)

        ## Log similarity scores
        scores = [score for _, score in hits_with_scores]
        avg_score = sum(scores) / len(scores) if scores else 0
        mlflow.log_metric("avg_similarity_score", avg_score)

    return answer, metrics


In [31]:
def rag_pipeline_with_mlflow(query, max_k=10, threshold=0.75):
    # Handle empty input
    if not query.strip():
        return "Please enter a valid question.", {"status": "empty_query"}

    with mlflow.start_run():
        # Retrieve with scores
        hits_with_scores = vectorstore.similarity_search_with_score(query, k=max_k)

        # Check if retrieval gave results
        if not hits_with_scores:
            return "Sorry, I couldn’t find relevant information.", {"status": "no_results"}

        # Filter by threshold
        filtered_hits = [h for h, score in hits_with_scores if score >= threshold]

        # If nothing passed the threshold, treat as irrelevant query
        if not filtered_hits:
            mlflow.log_param("query_status", "irrelevant")
            return "I am not able to answer this question with the given context.", {"status": "irrelevant"}

        # Extract context
        context = "\n\n".join([h.page_content for h in filtered_hits])

        # Generate answer (with explicit fallback instruction in the prompt)
        prompt = (
            f"Answer using ONLY the context below.\n\n"
            f"Context:\n{context}\n\n"
            f"Question: {query}\n"
            f"Answer: (If the question is not related to the context, respond with 'I am not able to answer this question with the given context.')"
        )
        answer = generator(prompt, max_new_tokens=200, do_sample=False)[0]["generated_text"].strip()

        # Evaluate answer
        metrics = evaluate(query, answer, filtered_hits)

        ## Log parameters
        mlflow.log_param("query", query)
        mlflow.log_param("max_k", max_k)
        mlflow.log_param("threshold", threshold)
        mlflow.log_param("retrieved_chunks", len(filtered_hits))
        mlflow.log_param("query_status", "valid")

        ## Log artifacts
        mlflow.log_text(context, "context.txt")
        mlflow.log_text(answer, "answer.txt")

        ## Log metrics
        mlflow.log_metrics(metrics)

        ## Log similarity scores
        scores = [score for _, score in hits_with_scores]
        avg_score = sum(scores) / len(scores) if scores else 0
        mlflow.log_metric("avg_similarity_score", avg_score)

    return answer, metrics, context


### Experiment Tracking MLflow

In [10]:
mlflow.set_experiment("RAG_Tracking")

<Experiment: artifact_location='file:///Users/prabhakaranvijay/Desktop/RAG/mlruns/492747433177853900', creation_time=1757156092444, experiment_id='492747433177853900', last_update_time=1757156092444, lifecycle_stage='active', name='RAG_Tracking', tags={}>

### Test Queries

In [35]:
ans, metrics, context= rag_pipeline_with_mlflow("What are the four laws of behavior change in Atomic Habits?")
print("Answer:", ans)
print("Metrics:", metrics)
print("Context", context)

Answer: I am not able to answer this question with the given context.
Metrics: {'semantic_similarity': 0.15987464785575867, 'answer_length': 12, 'context_size': 8}
Context Chapter	Summary
The	3rd	Law	of	Behavior	Change	is	
make	it	easy
.
The	most	effective	form	of	learning	is	practice,	not	planning.
Focus	on	taking	action,	not	being	in	motion.
Habit	formation	is	the	process	by	which	a	behavior	becomes
progressively	more	automatic	through	repetition.
The	amount	of	time	you	have	been	performing	a	habit	is	not	as
important	as	the	number	of	times	you	have	performed	it.

Chapter	Summary
A	habit	is	a	behavior	that	has	been	repeated	enough	times	to	become
automatic.
The	ultimate	purpose	of	habits	is	to	solve	the	problems	of	life	with	as
little	energy	and	effort	as	possible.
Any	habit	can	be	broken	down	into	a	feedback	loop	that	involves	four
steps:	cue,	craving,	response,	and	reward.
The	Four	Laws	of	Behavior	Change	are	a	simple	set	of	rules	we	can
use	to	build	better	habits.	They	are	(1)	mak

In [13]:
ans, metrics = rag_pipeline_with_mlflow("Why are small habits more effective than big goals?")
print("Answer:", ans)
print("Metrics:", metrics)

Answer: The weight of the system is working for you rather than against you.
Metrics: {'semantic_similarity': 0.3534437417984009, 'answer_length': 13, 'context_size': 8}


In [29]:
ans, metrics = rag_pipeline_with_mlflow("How can I build habits that reinforce my identity?")
print("Answer:", ans)
print("Metrics:", metrics)

Answer: I am not able to answer this question with the given context.
Metrics: {'status': 'irrelevant'}


In [28]:
print(rag_pipeline_with_mlflow("How can I build habits that reinforce my identity?"))

('I am not able to answer this question with the given context.', {'status': 'irrelevant'})


In [16]:
ans, metrics = rag_pipeline_with_mlflow("How can I apply Atomic Habits to increase productivity at work?")
print("Answer:", ans)
print("Metrics:", metrics)

Answer: Focusing on the overall system, rather than a single goal, is one of the core themes of this book. It is also one of the deeper meanings behind the word atomic. By now, you’ve probably realized that an atomic habit refers to a tiny change, a marginal gain, a 1 percent improvement. But atomic habits are not just any old habits, however small. They are little habits that are part of a larger system. Just as atoms are the building blocks of molecules, atomic habits are the building blocks of remarkable results. Habits are like the atoms of our lives. Each one is a fundamental unit that you can rotate through the Four Laws of Behavior Change until you find the next bottleneck. 1.3: Use habit stacking: “After [CURRENT HABIT], I will [NEW HABIT].” 1.4: Design your environment. Make the cues
Metrics: {'semantic_similarity': 0.8922805190086365, 'answer_length': 141, 'context_size': 7}


In [17]:
ans, metrics = rag_pipeline_with_mlflow("How do tiny improvements compound over time?")
print("Answer:", ans)
print("Metrics:", metrics)

Answer: The same way that money multiplies through compound interest, the effects of your habits multiply as you repeat them. They seem to make little difference on any given day and yet money multiplies through compound interest, the effects of your habits multiply as you repeat them. They seem to make little difference on any given day and yet the impact they deliver over the months and years can be enormous. It is only when looking back two, five, or perhaps ten years later that the value of good habits and the cost of bad ones becomes strikingly apparent in daily life. We often dismiss small changes because they don’t seem to matter very much in the moment. If transform into world champions with tiny changes that, at first glance, would seem to make a modest difference at best? Why do small improvements accumulate into such remarkable results, and how can you replicate this approach in your own life?
Metrics: {'semantic_similarity': 0.7746601104736328, 'answer_length': 158, 'contex

In [18]:
ans, metrics = rag_pipeline_with_mlflow("How can environment design help in building better habits?")
print("Answer:", ans)
print("Metrics:", metrics)

Answer: We mentally assign our habits to the locations in which they occur: the home, the office, the gym. Each location develops a connection to certain habits and routines. You establish a particular relationship with the objects on your desk, the items on your kitchen counter, the things in your bedroom. Our behavior is not defined by the objects in the environment but by our relationship to them. In fact, this is a useful way to think about the influence of the environment on your behavior. Stop thinking about your environment as filled with objects. Start thinking about it as filled with relationships. Think in terms of how you interact with the spaces around you. For one person, her couch is the place where she reads for an hour each night. For someone else, the couch is where he watches television and eats a bowl of ice cream after work. Because the amount of water in the environment was increased, behavior shifted naturally and without additional motivation. People often
Metric

In [19]:
!mlflow ui --port 5001

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[31mERROR[0m:    [Errno 48] Address already in use
Running the mlflow server failed. Please see the logs above for details.


In [20]:
ans, metrics = rag_pipeline_with_mlflow(
    "How can I apply Atomic Habits to increase productivity at work?"
)
print("Answer:", ans)
print("Metrics:", metrics)

Answer: Focusing on the overall system, rather than a single goal, is one of the core themes of this book. It is also one of the deeper meanings behind the word atomic. By now, you’ve probably realized that an atomic habit refers to a tiny change, a marginal gain, a 1 percent improvement. But atomic habits are not just any old habits, however small. They are little habits that are part of a larger system. Just as atoms are the building blocks of molecules, atomic habits are the building blocks of remarkable results. Habits are like the atoms of our lives. Each one is a fundamental unit that you can rotate through the Four Laws of Behavior Change until you find the next bottleneck. 1.3: Use habit stacking: “After [CURRENT HABIT], I will [NEW HABIT].” 1.4: Design your environment. Make the cues
Metrics: {'semantic_similarity': 0.8922805190086365, 'answer_length': 141, 'context_size': 7}


In [24]:
ans, metrics = rag_pipeline_with_mlflow("  ")
print("Answer:", ans)
print("Metrics:", metrics)

Answer: Please enter a valid question.
Metrics: {'status': 'empty_query'}


In [25]:
ans, metrics = rag_pipeline_with_mlflow("What is the capital of Mars?")
print("Answer:", ans)
print("Metrics:", metrics)

Answer: I am not able to answer this question with the given context.
Metrics: {'semantic_similarity': 0.1547727882862091, 'answer_length': 12, 'context_size': 10}


In [30]:
ans, metrics = rag_pipeline_with_mlflow("fjkdslfjsldfj")
print("Answer:", ans)
print("Metrics:", metrics)

Answer: I am not able to answer this question with the given context.
Metrics: {'semantic_similarity': 0.2669399380683899, 'answer_length': 12, 'context_size': 10}
