In [25]:
### Data Ingestion
import langchain
import langchain_community
import langchain_core
from langchain_core.documents import Document

In [26]:
from langchain_community.document_loaders import TextLoader
loader = TextLoader('../data/text_files/sample.txt')
data = loader.load()
print(data)

[Document(metadata={'source': '../data/text_files/sample.txt'}, page_content='Here√¢‚Ç¨‚Ñ¢s a concise and clear sample introduction about Python you can use:\n\nPython is a high-level, interpreted, and general-purpose programming language known for its simplicity and readability. Created by Guido van Rossum and first released in 1991, Python emphasizes clean and easy-to-understand syntax, making it ideal for beginners while still being powerful for professionals.\n\nPython supports multiple programming paradigms, including procedural, object-oriented, and functional programming. It comes with a vast standard library and a rich ecosystem of third-party packages, enabling developers to work on diverse applications such as:\n\nWeb development (e.g., Django, Flask)\nData science & machine learning (e.g., Pandas, NumPy, TensorFlow)\nAutomation & scripting\nGame development\nDesktop applications\nIts cross-platform nature allows Python programs to run on various operating systems without mod

In [27]:
from langchain_community.document_loaders import DirectoryLoader
dir_loader = DirectoryLoader(
    '../data/text_files',
    glob='**/*.txt',
    loader_cls=TextLoader,
    loader_kwargs={'encoding':'utf8'},
    show_progress=False
)

data = dir_loader.load()
print(data)

[Document(metadata={'source': '..\\data\\text_files\\sample.txt'}, page_content='Here‚Äôs a concise and clear sample introduction about Python you can use:\n\nPython is a high-level, interpreted, and general-purpose programming language known for its simplicity and readability. Created by Guido van Rossum and first released in 1991, Python emphasizes clean and easy-to-understand syntax, making it ideal for beginners while still being powerful for professionals.\n\nPython supports multiple programming paradigms, including procedural, object-oriented, and functional programming. It comes with a vast standard library and a rich ecosystem of third-party packages, enabling developers to work on diverse applications such as:\n\nWeb development (e.g., Django, Flask)\nData science & machine learning (e.g., Pandas, NumPy, TensorFlow)\nAutomation & scripting\nGame development\nDesktop applications\nIts cross-platform nature allows Python programs to run on various operating systems without modif

In [28]:
import os
from langchain_community.document_loaders import PyMuPDFLoader

pdf_dir = "../data/pdf_files"
documents = []

for filename in os.listdir(pdf_dir):
    if filename.lower().endswith(".pdf"):
        file_path = os.path.join(pdf_dir, filename)

        loader = PyMuPDFLoader(file_path)
        docs = loader.load()

        # optional: add filename metadata
        for doc in docs:
            doc.metadata["source_file"] = filename

        documents.extend(docs)

print(f"Loaded {len(documents)} pages")


Loaded 76 pages


In [29]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import TextLoader

loader = TextLoader('../data/text_files/sample.txt', encoding='utf8')
docs = loader.load()

splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,
    chunk_overlap=100
)

chunks = splitter.split_documents(docs)
print(len(chunks))
# print(chunks[0].page_content)
# print(chunks[1].page_content)

2


In [30]:
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

loader = PyMuPDFLoader('../data/pdf_files/ITR.pdf')
data = loader.load()

splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=300
)
chunks = splitter.split_documents(data)

for i,chunk in enumerate(chunks):
    chunk.metadata['chunk_id'] = i
    chunk.metadata['document_name'] = 'ITR.pdf'
    chunk.metadata['doc_type'] = 'tax_document'



In [31]:
import numpy as np 
import chromadb
import uuid
import os
from sentence_transformers import SentenceTransformer
from chromadb.config import Settings
from typing import List,Dict,Any,Tuple
from sklearn.metrics.pairwise import cosine_similarity

In [32]:
class EmbeddingManager:
    def __init__(self , model_name:str = 'all-MiniLM-L6-v2'):
        self.model_name = model_name
        self.model = None
        self.load_model()

    def load_model(self):
        try:
            self.model = SentenceTransformer(self.model_name)
            print(f"Model loaded successfully: {self.model_name}")
        except Exception as e:
            print(f"Error loading model {self.model_name}: {e}")
            raise e 

    def get_embeddings(self , texts:List[str]) -> np.ndarray:
        if self.model is None:
            raise RuntimeError("Embedding model is not loaded")
        try:
            embeddings = self.model.encode(texts, show_progress_bar=True)
            print(f"generated embeddings with shape: {embeddings.shape}")
            return embeddings
        except Exception as e:
            print(f"Error generating embeddings: {e}")
            raise e
        
embedding_manager = EmbeddingManager()
embedding_manager    

Model loaded successfully: all-MiniLM-L6-v2


<__main__.EmbeddingManager at 0x1ed338fde80>

In [33]:
class VectorStore:
    def __init__(self ,collection_name: str= "pdf_documents" , persist_directory: str = "../data/vector_store"):
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = chromadb.PersistentClient(path = self.persist_directory)
        self.collection = self.client.get_or_create_collection(name = self.collection_name)
        self.intialize_store()

    def intialize_store(self):
        try:
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path = self.persist_directory)

            self.collection = self.client.get_or_create_collection(
                name = self.collection_name,
                metadata={"description":"Vector store for PDF documents"}
            )    
            print(f"Vector store intialized successfully {self.collection_name}")
            print(f"Existing documents in the store: {self.collection.count()}")
        except Exception as e:
            print(f"Error intializing vector store: {e}")
            raise e    
        
    def add_documents(self , documents: List[Any] , embeddings : np.ndarray):
        if len(documents) != len(embeddings):
            raise ValueError("Number of documents and embeddings must be the same")
        
        print(f"Adding {len(documents)} documents to the vector store")

        ids = []
        metadatas = []
        document_texts =[]
        embedding_list = []

        for i,(doc,embedding) in enumerate(zip(documents , embeddings)):
            doc_id = str(uuid.uuid4())
            ids.append(doc_id)

            metadata = dict(doc.metadata)
            metadata['doc_index'] = i
            metadatas.append(metadata)

            document_texts.append(doc.page_content)

            embedding_list.append(embedding)

            try:
                self.collection.add(
                    ids = ids,
                    metadatas = metadatas,
                    documents = document_texts,
                    embeddings = embedding_list
                )
                print(f"Successfully added {len(documents)} documents to the vector store")
            except Exception as e:
                print(f"Error adding documents to vector store: {e}")
                raise e    
vector_store = VectorStore()
vector_store        

Vector store intialized successfully pdf_documents
Existing documents in the store: 212


<__main__.VectorStore at 0x1ed36e10590>

In [34]:
text = [doc.page_content for doc in chunks]
embeddings = embedding_manager.get_embeddings(text)
vector_store.add_documents(chunks,embeddings)

Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2/2 [00:02<00:00,  1.08s/it]


generated embeddings with shape: (53, 384)
Adding 53 documents to the vector store
Successfully added 53 documents to the vector store
Successfully added 53 documents to the vector store
Successfully added 53 documents to the vector store
Successfully added 53 documents to the vector store
Successfully added 53 documents to the vector store
Successfully added 53 documents to the vector store
Successfully added 53 documents to the vector store
Successfully added 53 documents to the vector store
Successfully added 53 documents to the vector store
Successfully added 53 documents to the vector store
Successfully added 53 documents to the vector store
Successfully added 53 documents to the vector store
Successfully added 53 documents to the vector store
Successfully added 53 documents to the vector store
Successfully added 53 documents to the vector store
Successfully added 53 documents to the vector store
Successfully added 53 documents to the vector store
Successfully added 53 documents t

In [41]:
from typing import List, Dict, Any


class RAGRetriever:
    def __init__(self, vector_store : VectorStore, embedding_manager:EmbeddingManager):
        self.vector_store = vector_store
        self.embedding_manager = embedding_manager

    def retrieve(
        self,
        query: str,
        top_k: int = 5,
        score_threshold: float = 0.0
    ) -> List[Dict[str, Any]]:

        print(f"üîç Retrieving documents for query: {query}")
        print(f"Top K: {top_k}, Score Threshold: {score_threshold}")

        # --- Embed query ---
        query_embedding = self.embedding_manager.get_embeddings([query])[0]
        print("Query embedding shape:", query_embedding.shape)

        # --- Query vector store ---
        try:
            results = self.vector_store.collection.query(
                query_embeddings=[query_embedding.tolist()],
                n_results=top_k,
                include=["documents", "metadatas", "distances"]
            )
        except Exception as e:
            print(f"‚ùå Error querying vector store: {e}")
            return []

        # --- Extract raw fields safely ---
        documents_raw = results.get("documents")
        metadatas_raw = results.get("metadatas")
        distances_raw = results.get("distances")
        ids_raw = results.get("ids")

        if (
            documents_raw is None
            or metadatas_raw is None
            or distances_raw is None
            or ids_raw is None
        ):
            print("‚ö†Ô∏è Incomplete retrieval results from vector store")
            return []

        if not documents_raw or not documents_raw[0]:
            print("‚ö†Ô∏è No documents found")
            return []
        
        print("Chroma result keys:", results.keys())
        print("Raw distances:", results.get("distances"))


        documents = documents_raw[0]
        metadatas = metadatas_raw[0]
        distances = distances_raw[0]
        ids = ids_raw[0]

        print(distances)

        # --- Build response ---
        retrieved_docs: List[Dict[str, Any]] = []

        for i, (doc_id, doc, meta, dist) in enumerate(
            zip(ids, documents, metadatas, distances)
        ):
            similarity_score = 1 - (dist/2)

            retrieved_docs.append({
                "id": doc_id,
                "content": doc,
                "metadata": meta,
                "distance": dist,
                "similarity_score": similarity_score,
                "rank": i + 1,
            })

        print(f"‚úÖ Retrieved {len(retrieved_docs)} documents")
        return retrieved_docs
    
rag_retriever = RAGRetriever(vector_store, embedding_manager)
rag_retriever.retrieve("What is Identity theft?")


üîç Retrieving documents for query: What is Identity theft?
Top K: 5, Score Threshold: 0.0


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 61.68it/s]

generated embeddings with shape: (1, 384)
Query embedding shape: (384,)
Chroma result keys: dict_keys(['ids', 'embeddings', 'documents', 'uris', 'included', 'data', 'metadatas', 'distances'])
Raw distances: [[1.4781007766723633, 1.4781007766723633, 1.4781007766723633, 1.4781007766723633, 1.4781007766723633]]
[1.4781007766723633, 1.4781007766723633, 1.4781007766723633, 1.4781007766723633, 1.4781007766723633]
‚úÖ Retrieved 5 documents





[{'id': '6f9a0478-24bb-4d27-8d19-485ed6347882',
  'content': 'clause (iia) thereof (relating to family pension); or \n(e) \nany claim of credit of tax deducted at source in the hands of any other \nperson.  \n  \n4.  \nAnnexure-less Return Form \n \n \n \nNo document (including TDS certificate) should be attached to this Return \nForm. All such documents enclosed with this Return Form will be detached and \nreturned to the person filing the return. \n \n5.  \nManner of filing this Return Form \n \n \nThis Return Form can be filed with the Income-tax Department in any of the \nfollowing ways,:-  \n \n(A) \nelectronically on the e-filing web portal of Income-tax Department \n \n(www.incometaxindiaefiling.gov.in) and verified in any one of the \n \nfollowing manner ‚Äì  \n(i) \ndigitally signing the verification part, or  \n(ii) \nauthenticating by way of electronic verification code (EVC), or \n(iii) \nby sending duly signed paper Form ITR-V (Acknowledgment) by \npost to CPC at the follo

<__main__.RAGRetriever at 0x1ed36e10830>