### Simple RAG Chain using LCEL

In [115]:
from pathlib import Path

# Create data directory
# The relative path resolves against the kernel's current working directory;
# exist_ok=True keeps this cell re-runnable when the directory already exists.
DATA_DIR = Path("data")
DATA_DIR.mkdir(exist_ok=True)

texts = {
    "ml.txt": """Machine Learning (ML) is a subset of artificial intelligence that focuses on
building systems that learn from data instead of being explicitly programmed.
ML algorithms identify patterns in historical data and use those patterns
to make predictions or decisions on new, unseen data.

Common examples include spam detection, recommendation systems, and
credit risk modeling.
""",

    "dl.txt": """Deep Learning (DL) is a specialized area of Machine Learning that uses
neural networks with multiple hidden layers, known as deep neural networks.
These models are particularly effective for unstructured data such as images,
audio, and text.

Popular deep learning architectures include Convolutional Neural Networks (CNNs)
and Recurrent Neural Networks (RNNs).
""",

    "nlp.txt": """Natural Language Processing (NLP) is a field of artificial intelligence that
enables machines to understand, interpret, and generate human language.
NLP combines linguistics, machine learning, and deep learning techniques.

Applications of NLP include chatbots, sentiment analysis, and language translation.
"""
}

# Write files
# One file per dict entry (key = file name, value = file body). write_text
# replaces an existing file of the same name, so re-running the cell is safe.
for filename, content in texts.items():
    (DATA_DIR / filename).write_text(content, encoding="utf-8")


In [116]:
from langchain_community.document_loaders import DirectoryLoader, TextLoader

# Load every .txt file in the sample-data directory as one Document per file.
# - DATA_DIR is reused so the loader always reads the directory the sample
#   files were written to, instead of a second hard-coded "data" literal.
# - The files were written as UTF-8, so TextLoader is told to decode them as
#   UTF-8 rather than relying on the platform's default encoding.
loader = DirectoryLoader(
    path=str(DATA_DIR),
    glob="*.txt",
    loader_cls=TextLoader,
    loader_kwargs={"encoding": "utf-8"},
)

raw_documents = loader.load()

In [117]:
from langchain_core.documents import Document
from typing import List
import os

def enrich_documents_with_metadata(
    documents: List[Document]
) -> List[Document]:
    """Rebuild each document with a normalized two-key metadata dict.

    For every input document a new ``Document`` is created with the same
    ``page_content`` and a metadata dict containing only:

    * ``"topic"``  - the file name without its extension, upper-cased
      (``ml.txt`` -> ``ML``);
    * ``"source"`` - the bare file name (directory part removed).

    Any other metadata keys on the input are not carried over. A document
    without a ``"source"`` entry gets empty strings for both keys.

    Args:
        documents: Documents whose ``metadata["source"]`` holds a file path.

    Returns:
        A new list of documents, in the same order as the input.
    """
    def _rebuild(original: Document) -> Document:
        base_name = os.path.basename(original.metadata.get("source", ""))
        # Infer topic from the file name stem (ml.txt -> ML)
        stem, _extension = os.path.splitext(base_name)
        return Document(
            page_content=original.page_content,
            metadata={"topic": stem.upper(), "source": base_name},
        )

    return [_rebuild(original) for original in documents]

In [118]:
# Swap the loader's metadata for the normalized {"topic", "source"} pair.
documents = enrich_documents_with_metadata(raw_documents)

In [119]:
# Sanity check: show each document's normalized metadata and its text.
# NOTE(review): despite the "Content preview" label, the full page_content is
# printed, not a truncated preview.
for doc in documents:
    print("Metadata:", doc.metadata)
    print("Content preview:", doc.page_content, "\n")

Metadata: {'topic': 'DL', 'source': 'dl.txt'}
Content preview: Deep Learning (DL) is a specialized area of Machine Learning that uses
neural networks with multiple hidden layers, known as deep neural networks.
These models are particularly effective for unstructured data such as images,
audio, and text.

Popular deep learning architectures include Convolutional Neural Networks (CNNs)
and Recurrent Neural Networks (RNNs).
 

Metadata: {'topic': 'ML', 'source': 'ml.txt'}
Content preview: Machine Learning (ML) is a subset of artificial intelligence that focuses on
building systems that learn from data instead of being explicitly programmed.
ML algorithms identify patterns in historical data and use those patterns
to make predictions or decisions on new, unseen data.

Common examples include spam detection, recommendation systems, and
credit risk modeling.
 

Metadata: {'topic': 'NLP', 'source': 'nlp.txt'}
Content preview: Natural Language Processing (NLP) is a field of artificial intellig

In [120]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [121]:
# Separators are listed from coarsest to finest: paragraph break, line break,
# space, then individual characters.
# chunk_size caps the length of a chunk and chunk_overlap is the amount of text
# shared between neighbouring chunks (presumably measured in characters, the
# splitter's default length function - confirm if a custom one is introduced).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=200,
    chunk_overlap=40,
    separators=["\n\n", "\n", " ", ""]
)

In [122]:
# Split every document into chunks; as the printed chunks show, each chunk
# keeps the topic/source metadata of the document it came from.
chunked_documents = text_splitter.split_documents(documents)

In [123]:
# Inspect the splitter output: 1-based chunk number, inherited metadata, and
# the chunk text, separated by a 60-character rule.
for i, doc in enumerate(chunked_documents):
    print(f"Chunk {i+1}")
    print("Metadata:", doc.metadata)
    print("Content:", doc.page_content)
    print("-" * 60)

Chunk 1
Metadata: {'topic': 'DL', 'source': 'dl.txt'}
Content: Deep Learning (DL) is a specialized area of Machine Learning that uses
neural networks with multiple hidden layers, known as deep neural networks.
------------------------------------------------------------
Chunk 2
Metadata: {'topic': 'DL', 'source': 'dl.txt'}
Content: These models are particularly effective for unstructured data such as images,
audio, and text.
------------------------------------------------------------
Chunk 3
Metadata: {'topic': 'DL', 'source': 'dl.txt'}
Content: Popular deep learning architectures include Convolutional Neural Networks (CNNs)
and Recurrent Neural Networks (RNNs).
------------------------------------------------------------
Chunk 4
Metadata: {'topic': 'ML', 'source': 'ml.txt'}
Content: Machine Learning (ML) is a subset of artificial intelligence that focuses on
building systems that learn from data instead of being explicitly programmed.
-------------------------------------------------

In [124]:
from sentence_transformers import SentenceTransformer

# Sentence-Transformers checkpoint used for the raw-embedding demos that follow.
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"

# Instantiate the model by name; it is used through its .encode() method below.
embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)

In [125]:
# NOTE(review): this rebinds `texts`, which earlier held the file-name -> content
# dict used to write the sample files; a distinct name would avoid reusing one
# variable for two different kinds of value.
texts = [
    "Machine learning is a subset of artificial intelligence.",
    "Deep learning uses neural networks with many layers."
]

# encode() returns one vector per input sentence.
embeddings = embedding_model.encode(texts)

print(len(embeddings))        # number of vectors
print(embeddings[0].shape)   # embedding dimension

2
(384,)


In [126]:
import numpy as np

def cosine_similarity(vec1: np.ndarray, vec2: np.ndarray) -> float:
    """Return the cosine of the angle between two 1-D vectors.

    Args:
        vec1: First vector.
        vec2: Second vector, same length as ``vec1``.

    Returns:
        A built-in ``float`` in ``[-1.0, 1.0]`` (up to floating-point rounding).
        If either vector has zero magnitude the angle is undefined; ``0.0`` is
        returned instead of dividing by zero and propagating ``nan``.
    """
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if norm_product == 0:
        # Zero-magnitude input: avoid 0/0, which yields nan plus a RuntimeWarning.
        return 0.0
    # Cast so callers receive the annotated built-in float, not a NumPy scalar.
    return float(np.dot(vec1, vec2) / norm_product)

In [127]:
def text_similarity(
    text1: str,
    text2: str,
    model: SentenceTransformer
) -> float:
    """Score how semantically close two texts are.

    Both texts are embedded in a single ``model.encode`` call and the two
    resulting vectors are compared with ``cosine_similarity``.

    Args:
        text1: First text.
        text2: Second text.
        model: Embedding model exposing ``encode(list_of_str)``.

    Returns:
        The cosine similarity of the two embeddings.
    """
    first_vector, second_vector = model.encode([text1, text2])
    return cosine_similarity(first_vector, second_vector)

In [128]:
# Compare two related-but-different sentences to get a feel for the score scale.
score = text_similarity(
    "Machine learning models learn from data",
    "Data science involves analyzing data to extract insights",
    embedding_model
)

# Four decimal places are enough for eyeballing the similarity.
print(f"Similarity score: {score:.4f}")

Similarity score: 0.3914


In [129]:
# Bare last expression so the notebook echoes the list's repr as cell output.
chunked_documents  # List[Document]

[Document(metadata={'topic': 'DL', 'source': 'dl.txt'}, page_content='Deep Learning (DL) is a specialized area of Machine Learning that uses\nneural networks with multiple hidden layers, known as deep neural networks.'),
 Document(metadata={'topic': 'DL', 'source': 'dl.txt'}, page_content='These models are particularly effective for unstructured data such as images,\naudio, and text.'),
 Document(metadata={'topic': 'DL', 'source': 'dl.txt'}, page_content='Popular deep learning architectures include Convolutional Neural Networks (CNNs)\nand Recurrent Neural Networks (RNNs).'),
 Document(metadata={'topic': 'ML', 'source': 'ml.txt'}, page_content='Machine Learning (ML) is a subset of artificial intelligence that focuses on\nbuilding systems that learn from data instead of being explicitly programmed.'),
 Document(metadata={'topic': 'ML', 'source': 'ml.txt'}, page_content='ML algorithms identify patterns in historical data and use those patterns\nto make predictions or decisions on new, un

In [130]:
import numpy as np

def embed_documents(documents, model):
    """Embed every document and pair each vector with its text and metadata.

    Args:
        documents: Indexable sequence of objects exposing ``page_content`` and
            ``metadata`` attributes.
        model: Object exposing ``encode(list_of_str)`` that returns one
            indexable embedding per input string.

    Returns:
        A list with one dict per document, in input order, with the keys
        ``"embedding"``, ``"metadata"`` and ``"content"``.
    """
    vectors = model.encode([chunk.page_content for chunk in documents])

    records = []
    for position, chunk in enumerate(documents):
        records.append(
            {
                "embedding": vectors[position],
                "metadata": chunk.metadata,
                "content": chunk.page_content,
            }
        )
    return records
    
# Embed every chunk with the embedding model defined above.
# NOTE(review): embedded_chunks is not read again in the visible cells.
embedded_chunks = embed_documents(chunked_documents, embedding_model)

In [131]:
# HuggingFaceEmbeddings is imported from the dedicated langchain-huggingface
# package (which this notebook already uses for its chat model) rather than
# from its deprecated langchain_community location.
from langchain_huggingface import HuggingFaceEmbeddings

# NOTE(review): this rebinds `embedding_model` from the raw SentenceTransformer
# to a LangChain embeddings wrapper. Earlier cells call
# `embedding_model.encode(...)` on the SentenceTransformer, so re-running them
# after this cell depends on kernel state; a distinct name would remove that
# hazard.
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

In [132]:
from langchain_community.vectorstores import FAISS

# Embed every chunk with the LangChain embeddings wrapper and index the vectors
# in an in-memory FAISS store; chunk text and metadata are stored alongside.
vectorstore = FAISS.from_documents(
    documents=chunked_documents,
    embedding=embedding_model
)

In [133]:
# Location of the persisted index, relative to the kernel's working directory.
VECTORSTORE_PATH = "faiss_store"

# Persist the store so it can be reloaded without re-embedding the chunks.
vectorstore.save_local(VECTORSTORE_PATH)

In [134]:
# Reload the index from VECTORSTORE_PATH; the same embeddings object must be
# supplied so that queries are embedded the same way as the stored chunks.
# SECURITY: allow_dangerous_deserialization=True permits unpickling the saved
# store. That is acceptable here only because this notebook wrote those files
# itself; never enable it for an index obtained from an untrusted source.
vectorstore = FAISS.load_local(
    VECTORSTORE_PATH,
    embeddings=embedding_model,
    allow_dangerous_deserialization=True
)

In [135]:
from typing import List, Dict

def search_similar_texts(
    query: str,
    vectorstore: FAISS,
    top_k: int = 3
) -> List[Dict]:
    """Run a scored similarity search and return plain-dict results.

    Args:
        query: Free-text search query.
        vectorstore: Store queried via ``similarity_search_with_score``.
        top_k: Number of results requested from the store.

    Returns:
        One dict per hit, in the order the store returned them, with the keys
        ``"score"``, ``"content"`` and ``"metadata"``. The score is passed
        through unchanged from the store.
    """
    scored_hits = vectorstore.similarity_search_with_score(
        query=query,
        k=top_k
    )

    hits = []
    for matched_doc, match_score in scored_hits:
        hits.append(
            {
                "score": match_score,
                "content": matched_doc.page_content,
                "metadata": matched_doc.metadata,
            }
        )
    return hits

In [136]:
# Query the reloaded store and print each hit.
# In the captured output the best match has the LOWEST score and scores grow
# as relevance drops, i.e. the score behaves like a distance, not a similarity.
results = search_similar_texts(
    query="What is deep learning?",
    vectorstore=vectorstore,
    top_k=3
)

for r in results:
    print(f"Score: {r['score']:.4f}")
    print("Metadata:", r["metadata"])
    print("Content:", r["content"])
    print("-" * 60)

Score: 0.4629
Metadata: {'topic': 'DL', 'source': 'dl.txt'}
Content: Deep Learning (DL) is a specialized area of Machine Learning that uses
neural networks with multiple hidden layers, known as deep neural networks.
------------------------------------------------------------
Score: 0.7776
Metadata: {'topic': 'DL', 'source': 'dl.txt'}
Content: Popular deep learning architectures include Convolutional Neural Networks (CNNs)
and Recurrent Neural Networks (RNNs).
------------------------------------------------------------
Score: 0.8777
Metadata: {'topic': 'ML', 'source': 'ml.txt'}
Content: Machine Learning (ML) is a subset of artificial intelligence that focuses on
building systems that learn from data instead of being explicitly programmed.
------------------------------------------------------------


In [137]:
from dotenv import load_dotenv
import os

# Pull variables from a local .env file (if present) into the process environment.
load_dotenv()

# Fail fast when the Hugging Face token is missing *or empty*: a bare
# `HF_TOKEN=` line yields "", which an `is not None` check would wrongly accept.
assert os.getenv("HF_TOKEN"), "HF_TOKEN not loaded"

In [138]:
from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint

# Chat-style wrapper around a Hugging Face endpoint serving Mistral-7B-Instruct.
# max_new_tokens caps the length of each completion; temperature=0.0 asks for
# the least random decoding the endpoint offers.
# NOTE(review): presumably authenticates with the HF_TOKEN checked in the
# previous cell - confirm.
llm = ChatHuggingFace(
    llm=HuggingFaceEndpoint(
        repo_id="mistralai/Mistral-7B-Instruct-v0.2",
        task="text-generation",
        max_new_tokens=512,
        temperature=0.0,
    )
)

In [139]:
# Smoke-test the LLM on its own. No retrieved context is supplied, so the
# answer comes solely from the model and should not be treated as verified.
response = llm.invoke(
    "Answer briefly:\nFounder of Meesho??"
)

# Prints the whole message object (content plus response/usage metadata).
print(response)

content=' The founder of Meesho is IIT Delhi alumnus Vishal Mehta along with Sanjeev Barnwal and Akhil Sacchaney. They started Meesho in 2015 with the intention of making e-commerce accessible to everyone in India, especially those in smaller towns and rural areas. The company operates a social commerce platform where users can sell products through social media networks like WhatsApp.' additional_kwargs={} response_metadata={'token_usage': {'completion_tokens': 89, 'prompt_tokens': 16, 'total_tokens': 105}, 'model_name': 'mistralai/Mistral-7B-Instruct-v0.2', 'system_fingerprint': '', 'finish_reason': 'stop', 'logprobs': None} id='lc_run--019b64af-421a-7e60-a227-7c92060ec3ec-0' usage_metadata={'input_tokens': 16, 'output_tokens': 89, 'total_tokens': 105}


In [140]:
from langchain_core.prompts import ChatPromptTemplate

# Grounded-answer prompt: {context} receives the formatted retrieved chunks and
# {question} the user's query. The model is instructed to reply "I don't know"
# rather than answer from outside the supplied context.
prompt = ChatPromptTemplate.from_template(
    """
You are a helpful assistant.
Answer the question using ONLY the provided context.
If the answer is not present in the context, say "I don't know".

Context:
{context}

Question:
{question}

Answer:
"""
)

In [141]:
# STEP 1: Create a retriever from the FAISS vector store
# -----------------------------------------------
# This converts the vector store into a search component
# that can fetch the top-k most relevant chunks for a query.
# NOTE(review): "similarity" selects plain nearest-neighbour search using
# whatever metric the store was built with. The scores printed by the earlier
# search cell grow as relevance drops, so they are distances rather than
# cosine similarities (LangChain's FAISS default is presumably L2 - confirm).

retriever = vectorstore.as_retriever(
    search_type="similarity",  # nearest-neighbour search with the store's own metric
    search_kwargs={"k": 3}     # fetch top 3 relevant chunks
)

print("STEP 1 COMPLETE: Retriever created from FAISS vector store\n")

STEP 1 COMPLETE: Retriever created from FAISS vector store



In [142]:
# STEP 2: Define a function to format retrieved documents
# -------------------------------------------------------
# Input  : List[Document]
# Output : Single formatted string to be injected into the prompt

def format_docs(docs):
    """Render retrieved documents as one context string and echo it to stdout.

    Each document becomes a block made of a blank line, a header
    ``Chunk <n> (Source: <source> | Topic: <topic>):`` (numbered from 1, with
    ``None`` shown for a missing metadata key), the document text, and a
    trailing newline. Blocks are joined with a single newline.

    Besides returning the string, the function prints a step banner, the
    formatted context, and an 80-character rule so the notebook shows exactly
    what is injected into the prompt.

    Args:
        docs: Iterable of objects exposing ``metadata`` (a dict) and
            ``page_content``.

    Returns:
        The formatted context string (empty when ``docs`` is empty).
    """
    print("STEP 2: Formatting retrieved documents into context\n")

    rendered_blocks = []
    for number, doc in enumerate(docs, start=1):
        source = doc.metadata.get("source")
        topic = doc.metadata.get("topic")
        header = f"Chunk {number} (Source: {source} | Topic: {topic}):"
        rendered_blocks.append(f"\n{header}\n{doc.page_content}\n")

    final_context = "\n".join(rendered_blocks)

    # Echo the exact context so the reader can see what the LLM will receive.
    print("Formatted Context Passed to Prompt:\n")
    print(final_context)
    print("-" * 80)

    return final_context

In [143]:
# STEP 3: Example user question
# -----------------------------
# This single string is the chain's only input: it is used both as the
# retrieval query and as the {question} slot of the prompt.
question = "What is deep learning?"

print("STEP 3: User Question:")
print(question)
print("-" * 80)

STEP 3: User Question:
What is deep learning?
--------------------------------------------------------------------------------


In [144]:
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

# STEP 4: Build the RAG chain using LCEL
# -------------------------------------
# The leading dict maps the single input string to the two variables the
# prompt template expects; every later stage consumes the previous stage's
# output via the `|` operator.

rag_chain = (
    {
        # The question flows unchanged
        "question": RunnablePassthrough(),

        # The same question is used to retrieve documents, which format_docs
        # then flattens into one context string
        "context": retriever | format_docs
    }
    # Inject context + question into the prompt template
    | prompt

    # Send the formatted prompt to the LLM
    | llm

    # Convert LLM output to plain string
    | StrOutputParser()
)

print("STEP 4 COMPLETE: LCEL RAG chain constructed\n")

STEP 4 COMPLETE: LCEL RAG chain constructed



In [145]:
# STEP 5: Execute the RAG chain
# ----------------------------
# Invoking the chain triggers retrieval, format_docs (which prints its own
# STEP 2 banner and the context), prompt construction and the LLM call.

print("STEP 5: Running RAG chain...\n")

# `response` is a plain string here because the chain ends in StrOutputParser.
response = rag_chain.invoke(question)

print("FINAL ANSWER:\n")
print(response)

STEP 5: Running RAG chain...

STEP 2: Formatting retrieved documents into context

Formatted Context Passed to Prompt:


Chunk 1 (Source: dl.txt | Topic: DL):
Deep Learning (DL) is a specialized area of Machine Learning that uses
neural networks with multiple hidden layers, known as deep neural networks.


Chunk 2 (Source: dl.txt | Topic: DL):
Popular deep learning architectures include Convolutional Neural Networks (CNNs)
and Recurrent Neural Networks (RNNs).


Chunk 3 (Source: ml.txt | Topic: ML):
Machine Learning (ML) is a subset of artificial intelligence that focuses on
building systems that learn from data instead of being explicitly programmed.

--------------------------------------------------------------------------------
FINAL ANSWER:

 Deep learning is a specialized area of Machine Learning that uses neural networks with multiple hidden layers.
