In [3]:
###Document Structure
from langchain_core.documents import Document

In [2]:
doc=Document(
    page_content="this is the main text content I am using to create RAG",
    metadata={
        "source":"chapter 1.txt",
        "pages":1,
        "author":"Ratnam Ojha",
        "date_created":"2025-01-01"
    }
)
#the benefit of having metadata in our document, is that we can filter our search results based on metadata fields.
doc

Document(metadata={'source': 'exmaple.txt', 'pages': 1, 'author': 'Krish Naik', 'date_created': '2025-01-01'}, page_content='this is the main text content I am using to create RAG')

In [6]:
# creating a txt file
import os 
os.makedirs("../data/text_files",exist_ok=True)

In [7]:
sample_texts={
    "../data/text_files/python_intro.txt":"""Python Programming Introduction

Python is a high-level, interpreted programming language known for its simplicity and readability.
Created by Guido van Rossum and first released in 1991, Python has become one of the most popular
programming languages in the world.

Key Features:
- Easy to learn and use
- Extensive standard library
- Cross-platform compatibility
- Strong community support

Python is widely used in web development, data science, artificial intelligence, and automation.""",
    
    "../data/text_files/machine_learning.txt": """Machine Learning Basics

Machine learning is a subset of artificial intelligence that enables systems to learn and improve
from experience without being explicitly programmed. It focuses on developing computer programs
that can access data and use it to learn for themselves.

Types of Machine Learning:
1. Supervised Learning: Learning with labeled data
2. Unsupervised Learning: Finding patterns in unlabeled data
3. Reinforcement Learning: Learning through rewards and penalties

Applications include image recognition, speech processing, and recommendation systems
    
    
    """

}

for filepath,content in sample_texts.items():
    with open(filepath,"w") as f:
        f.write(content)
print("Sample text files created successfully.")

Sample text files created successfully.


### we could've added .txt files manually as well but masti nahi rukni chahiye.

In [12]:
### reading these files using TextLoader
from langchain.document_loaders import TextLoader

loader = TextLoader("../data/text_files/python_intro.txt",encoding="utf8")
document = loader.load()
print(document)

[Document(metadata={'source': '../data/text_files/python_intro.txt'}, page_content='Python Programming Introduction\n\nPython is a high-level, interpreted programming language known for its simplicity and readability.\nCreated by Guido van Rossum and first released in 1991, Python has become one of the most popular\nprogramming languages in the world.\n\nKey Features:\n- Easy to learn and use\n- Extensive standard library\n- Cross-platform compatibility\n- Strong community support\n\nPython is widely used in web development, data science, artificial intelligence, and automation.')]


In [14]:
### Directory loader
from langchain.document_loaders import DirectoryLoader
dir_loader = DirectoryLoader( 
    "../data/text_files",
    glob="*.txt",
    loader_cls=TextLoader,
    loader_kwargs={"encoding":"utf8"},
    show_progress = False)
documents = dir_loader.load()
documents

[Document(metadata={'source': '../data/text_files/python_intro.txt'}, page_content='Python Programming Introduction\n\nPython is a high-level, interpreted programming language known for its simplicity and readability.\nCreated by Guido van Rossum and first released in 1991, Python has become one of the most popular\nprogramming languages in the world.\n\nKey Features:\n- Easy to learn and use\n- Extensive standard library\n- Cross-platform compatibility\n- Strong community support\n\nPython is widely used in web development, data science, artificial intelligence, and automation.'),
 Document(metadata={'source': '../data/text_files/machine_learning.txt'}, page_content='Machine Learning Basics\n\nMachine learning is a subset of artificial intelligence that enables systems to learn and improve\nfrom experience without being explicitly programmed. It focuses on developing computer programs\nthat can access data and use it to learn for themselves.\n\nTypes of Machine Learning:\n1. Supervise

In [21]:
from langchain_community.document_loaders import PyMuPDFLoader, PyPDFLoader

dir_loader = DirectoryLoader( 
    "../data/text_files/pdf",
    glob="*.pdf",
    loader_cls=PyMuPDFLoader,
    show_progress = False)
pdf_documents = dir_loader.load()
pdf_documents

[Document(metadata={'producer': 'pdfTeX-1.40.21', 'creator': 'LaTeX with hyperref', 'creationdate': '2025-08-24T11:50:53-07:00', 'source': '../data/text_files/pdf/neural networks.pdf', 'file_path': '../data/text_files/pdf/neural networks.pdf', 'total_pages': 27, 'format': 'PDF 1.5', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2025-08-24T11:50:53-07:00', 'trapped': '', 'modDate': "D:20250824115053-07'00'", 'creationDate': "D:20250824115053-07'00'", 'page': 0}, page_content='Speech and Language Processing.\nDaniel Jurafsky & James H. Martin.\nCopyright © 2025.\nAll\nrights reserved.\nDraft of August 24, 2025.\nCHAPTER\n6\nNeural Networks\n“[M]achines of this character can behave in a very complicated manner when\nthe number of units is large.”\nAlan Turing (1948) “Intelligent Machines”, page 6\nNeural networks are a fundamental computational tool for language process-\ning, and a very old one. They are called neural because their origins lie in the\nMcCulloch-Pi

In [34]:
from pathlib import Path
def process_all_pdfs(pdf_directory):
    """Process all PDF files in a directory"""
    all_documents = []
    pdf_dir = Path(pdf_directory)
    
    # Find all PDF files recursively
    pdf_files = list(pdf_dir.glob("**/*.pdf"))
    
    print(f"Found {len(pdf_files)} PDF files to process")
    
    for pdf_file in pdf_files:
        print(f"\nProcessing: {pdf_file.name}")
        try:
            loader = PyPDFLoader(str(pdf_file))
            documents = loader.load()
            
            # Add source information to metadata
            for doc in documents:
                doc.metadata['source_file'] = pdf_file.name
                doc.metadata['file_type'] = 'pdf'
            
            all_documents.extend(documents)
            print(f"  ✓ Loaded {len(documents)} pages")
            
        except Exception as e:
            print(f"  ✗ Error: {e}")
    
    print(f"\nTotal documents loaded: {len(all_documents)}")
    return all_documents

# Process all PDFs in the data directory
all_pdf_documents = process_all_pdfs("../data")

Found 2 PDF files to process

Processing: neural networks.pdf
  ✓ Loaded 27 pages

Processing: llm_hallucinations.pdf
  ✓ Loaded 23 pages

Total documents loaded: 50


In [35]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

def split_documents(documents, chunk_size=500, chunk_overlap=50):
    """ Split documents into smaller chunks
    Args:
        documents: List of Document objects or raw strings.
        chunk_size: Max characters per chunk.
        chunk_overlap: Overlap between chunks.
    Returns:
        List of Document chunks
    """
    # Ensure all inputs are Document objects
    if isinstance(documents[0], str):
        documents = [Document(page_content=doc, metadata={}) for doc in documents]

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=['\n\n', '\n', ' ', '']
    )
    
    split_docs = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(split_docs)} chunks")
    
    # show example of chunk
    if split_docs:
        print(f"\n Example chunk")
        print(f"Content : {split_docs[0].page_content[:200]}...")
        print(f"Metadata : {split_docs[0].metadata}")
    
    return split_docs


chunks = split_documents(all_pdf_documents)
print(f"Total chunks created: {len(chunks)}")

Split 50 documents into 358 chunks

 Example chunk
Content : Speech and Language Processing. Daniel Jurafsky & James H. Martin. Copyright © 2025. All
rights reserved. Draft of August 24, 2025.
CHAPTER
6
Neural Networks
“[M]achines of this character can behave i...
Metadata : {'producer': 'pdfTeX-1.40.21', 'creator': 'LaTeX with hyperref', 'creationdate': '2025-08-24T11:50:53-07:00', 'author': '', 'title': '', 'subject': '', 'keywords': '', 'moddate': '2025-08-24T11:50:53-07:00', 'trapped': '/False', 'ptex.fullbanner': 'This is pdfTeX, Version 3.14159265-2.6-1.40.21 (TeX Live 2020) kpathsea version 6.3.2', 'source': '../data/text_files/pdf/neural networks.pdf', 'total_pages': 27, 'page': 0, 'page_label': '1', 'source_file': 'neural networks.pdf', 'file_type': 'pdf'}
Total chunks created: 358


### embedding and vectorstoreDB

In [24]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict, Tuple, Any
from sklearn.metrics.pairwise import cosine_similarity

In [25]:
class EmbeddingManager:
    "Handles document embedding generation using SenetenceTransformer"
    def __init__(self, model_name: str = 'all-MiniLM-L6-v2'):
        self.model_name = model_name
        self.model = None
        self._load_model()

    def _load_model(self):
        "Load the SentenceTransformer model"
        try:
            print(f"Loading embedding model: {self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            print("Model loaded successfully. Embedding dimension:", self.model.get_sentence_embedding_dimension())
        except Exception as e:
            print(f"Error loading model {self.model_name}: {e}")
            raise
    
    def embed_documents(self, documents: List[str]) -> np.ndarray:
        "Generate embeddings for a list of documents"
        if not self.model:
            raise ValueError("Embedding model is not loaded.")
        print(f"Generating embeddings for {len(documents)} documents:")
        embeddings = self.model.encode(documents, convert_to_numpy=True)
        return embeddings
    
    def get_embedding_dimesnion(self) -> int:
        "Get the dimension of the embeddings"
        if not self.model:
            raise ValueError("Embedding model is not loaded.")
        return self.model.get_sentence_embedding_dimension()
    
##initialize embedding manager
embedding_manager = EmbeddingManager()
embedding_manager 

Loading embedding model: all-MiniLM-L6-v2
Model loaded successfully. Embedding dimension: 384


<__main__.EmbeddingManager at 0x169ae1e80>

## Now, we store these embeddings in the VectorStore
# VectorStore:

In [28]:
class VectorStore:
    "Manages storage and retrieval of document embeddings using ChromaDB"
    def __init__(self, collection_name: str = 'pdf_documents', persist_directory: str = '../data/vector_store'):
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initialize_store()

    def _initialize_store(self):
        "Initialize ChromaDB client and collection"
        try:
            #create persistent chromadb client
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path=self.persist_directory)

            #create or get collection
            self.collection = self.client.get_or_create_collection(
                name=self.collection_name,
                metadata = {"description":"Document embeddings collection"}
            )
            print(f"VectorDB store initialized. Collection: {self.collection_name}")
            print(f"Existing documents in collection: {self.collection.count()}")
        
        except Exception as e:
            print(f"Error initializing ChromaDB: {e}")
            raise

    def _add_documents(self, documents: List[Document], embeddings: np.ndarray):
        """
        Add documents and their embeddings to the vector store

        Args:
            documents (List[Document]): List of LangChain documents
            embeddings: corresponding embeddings for the documents
        """
        if len(documents) != len(embeddings):
            raise ValueError("Number of documents and embeddings must match.")
        print(f"Adding {len(documents)} documents to the vector store...")

        #prepare data for chromadb:

        ids = []
        metadatas = []
        documents_text = []
        embeddings_list = []

        for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
            # create a unique id for each document
            doc_id = str(uuid.uuid4())
            ids.append(doc_id)

            # prepare metadata 
            metadata = dict(doc.metadata)
            metadata['doc_index'] = i
            metadata['content_length'] = len(doc.page_content)
            metadatas.append(metadata)

            # document content
            documents_text.append(doc.page_content)
            
            # embedding
            embeddings_list.append(embedding.tolist())

        # Add to collection
        try:
            self.collection.add(
                ids=ids,
                metadatas=metadatas,
                documents=documents_text,
                embeddings=embeddings_list
            )
            print("Documents added successfully. Total documents in collection:", self.collection.count())
            print(f"Successfully added {len(documents)} documents.")

        except Exception as e:
            print(f"Error adding documents to Vector store: {e}")
            raise

vectorstore = VectorStore()
vectorstore

VectorDB store initialized. Collection: pdf_documents
Existing documents in collection: 0


<__main__.VectorStore at 0x16a9c6120>

In [37]:
chunks

[Document(metadata={'producer': 'pdfTeX-1.40.21', 'creator': 'LaTeX with hyperref', 'creationdate': '2025-08-24T11:50:53-07:00', 'author': '', 'title': '', 'subject': '', 'keywords': '', 'moddate': '2025-08-24T11:50:53-07:00', 'trapped': '/False', 'ptex.fullbanner': 'This is pdfTeX, Version 3.14159265-2.6-1.40.21 (TeX Live 2020) kpathsea version 6.3.2', 'source': '../data/text_files/pdf/neural networks.pdf', 'total_pages': 27, 'page': 0, 'page_label': '1', 'source_file': 'neural networks.pdf', 'file_type': 'pdf'}, page_content='Speech and Language Processing. Daniel Jurafsky & James H. Martin. Copyright © 2025. All\nrights reserved. Draft of August 24, 2025.\nCHAPTER\n6\nNeural Networks\n“[M]achines of this character can behave in a very complicated manner when\nthe number of units is large.”\nAlan Turing (1948) “Intelligent Machines”, page 6\nNeural networks are a fundamental computational tool for language process-\ning, and a very old one. They are called neural because their origin

In [39]:
# convert the text to enbeddings
texts = [doc.page_content for doc in chunks]

# generate embeddings
embeddings = embedding_manager.embed_documents(texts)

#store in the vectorDB
vectorstore._add_documents(chunks,embeddings)

Generating embeddings for 358 documents:
Adding 358 documents to the vector store...
Documents added successfully. Total documents in collection: 358
Successfully added 358 documents.


## RAG retriever pipeline from VectorStore

In [42]:
class RAGretriever:
    "Handling query based retrieval from the vector store"

    def __init__(self, vector_store: VectorStore, embedding_manager: EmbeddingManager):
        """
        initialize the retriever

        Args:
            vector_store: vector store containing document embeddings
            embedding_manager: embedding manager to generate query embeddings
        """

        self.vector_store = vector_store
        self.embedding_manager = embedding_manager


    def retrieve(self, query: str, top_k: int = 5, score_threshold: float = 0.0) -> List[Dict[str, Any]]:
        """
        Retrieve relevant documents for a given query

        Args:
            query (str): The input query string.
            top_k (int): Number of top documents to retrieve.
            score_threshold (float): Minimum similarity score threshold.
        Returns:
            List of dictionaries containing retrieved info and similarity scores.
        """
        print(f"Retrieving documents for query: '{query}'")
        print(f"Top k:{top_k}, Score threshold: {score_threshold}")

        # Generate embedding for the query
        query_embedding = self.embedding_manager.embed_documents([query])[0]
        print(f"Query embedding dimension: {len(query_embedding)}")

        # search in vector store
        try:
            results = self.vector_store.collection.query(
                query_embeddings=[query_embedding.tolist()],
                n_results=top_k
            )
            
            # Process results
            retrieved_docs = []
            
            if results['documents'] and results['documents'][0]:
                documents = results['documents'][0]
                metadatas = results['metadatas'][0]
                distances = results['distances'][0]
                ids = results['ids'][0]
                
                for i, (doc_id, document, metadata, distance) in enumerate(zip(ids, documents, metadatas, distances)):
                    # Convert distance to similarity score (ChromaDB uses cosine distance)
                    similarity_score = 1 - distance
                    
                    if similarity_score >= score_threshold:
                        retrieved_docs.append({
                            'id': doc_id,
                            'content': document,
                            'metadata': metadata,
                            'similarity_score': similarity_score,
                            'distance': distance,
                            'rank': i + 1
                        })
                
                print(f"Retrieved {len(retrieved_docs)} documents (after filtering)")
            else:
                print("No documents found")
            
            return retrieved_docs
            
        except Exception as e:
            print(f"Error during retrieval: {e}")
            return []

rag_retriever=RAGretriever(vectorstore,embedding_manager)
rag_retriever

<__main__.RAGretriever at 0x1766b2a50>

In [43]:
rag_retriever.retrieve("What is the XOR problem in Neural Networks?")

Retrieving documents for query: 'What is the XOR problem in Neural Networks?'
Top k:5, Score threshold: 0.0
Generating embeddings for 1 documents:
Query embedding dimension: 384
Retrieved 5 documents (after filtering)


[{'id': '4937d1ec-2311-4297-bc19-e75c79322dcb',
  'content': 'from the negative cases (00 and 11). We say that XOR is not a linearly separablelinearly\nseparable\nfunction. Of course we could draw a boundary with a curve, or some other function,\nbut not a single line.\n6.2.1 The solution: neural networks\nWhile the XOR function cannot be calculated by a single perceptron, it can be cal-\nculated by a layered network of perceptron units. Rather than see this with networks\nof simple perceptrons, however, let’s see how to compute XOR using two layers of',
  'metadata': {'keywords': '',
   'page_label': '5',
   'ptex.fullbanner': 'This is pdfTeX, Version 3.14159265-2.6-1.40.21 (TeX Live 2020) kpathsea version 6.3.2',
   'subject': '',
   'page': 4,
   'source_file': 'neural networks.pdf',
   'title': '',
   'trapped': '/False',
   'author': '',
   'moddate': '2025-08-24T11:50:53-07:00',
   'doc_index': 26,
   'content_length': 489,
   'producer': 'pdfTeX-1.40.21',
   'creator': 'LaTeX wi

In [44]:
rag_retriever.retrieve("What is feedforward neural network?")

Retrieving documents for query: 'What is feedforward neural network?'
Top k:5, Score threshold: 0.0
Generating embeddings for 1 documents:
Query embedding dimension: 384
Retrieved 5 documents (after filtering)


[{'id': '954479a7-2bf5-4b4c-b8bf-af088a8b8d46',
  'content': '(2016).\n6.3 Feedforward Neural Networks\nLet’s now walk through a slightly more formal presentation of the simplest kind of\nneural network, the feedforward network. A feedforward network is a multilayerfeedforward\nnetwork\nnetwork in which the units are connected with no cycles; the outputs from units in\neach layer are passed to units in the next higher layer, and no outputs are passed\nback to lower layers. (In Chapter 13 we’ll introduce networks with cycles, called\nrecurrent neural networks.)',
  'metadata': {'source': '../data/text_files/pdf/neural networks.pdf',
   'ptex.fullbanner': 'This is pdfTeX, Version 3.14159265-2.6-1.40.21 (TeX Live 2020) kpathsea version 6.3.2',
   'creationdate': '2025-08-24T11:50:53-07:00',
   'page_label': '7',
   'subject': '',
   'producer': 'pdfTeX-1.40.21',
   'total_pages': 27,
   'page': 6,
   'doc_index': 35,
   'author': '',
   'content_length': 496,
   'creator': 'LaTeX with hyp