## Data Ingestion

In [53]:
from langchain_community.document_loaders import TextLoader, PyPDFLoader,PyMuPDFLoader,DirectoryLoader


In [54]:
# Text File Loader
dirloader=DirectoryLoader('../data/text/',glob='**/*.txt',loader_cls=TextLoader)
docs_txt=dirloader.load()
print(docs_txt)

[Document(metadata={'source': '../data/text/sample1.txt'}, page_content='The evening settled in slowly, wrapping the streets in a calm that felt almost deliberate. \nLights flickered on in distant windows, each one hinting at a separate story unfolding behind glass and curtains. \nSomewhere, a radio played an old song, its melody drifting through the air and mixing with the muted sounds of traffic and footsteps. Thoughts wandered without direction, jumping from memory to possibility, lingering where they felt comfortable. \nThere was no rush to arrive at a conclusion, no need for a dramatic ending—just the quiet satisfaction of existing within the moment, letting it pass gently and without resistance.'), Document(metadata={'source': '../data/text/sample2.txt'}, page_content='The city woke up without announcing itself,\nnot with alarms or sudden noise,\nbut with a gradual awareness that the night had loosened its grip.\nShadows pulled back inch by inch,\nand the sky shifted colors as if

In [55]:
#pdf loader

dirloader_pdf=DirectoryLoader('../data/pdf/',glob='**/*.pdf',loader_cls=PyMuPDFLoader)
docs_pdf=dirloader_pdf.load()
print(docs_pdf)

[Document(metadata={'producer': 'Acrobat Distiller 10.0.0 (Windows)', 'creator': 'PScript5.dll Version 5.2.2', 'creationdate': '2025-07-25T11:02:56+05:30', 'source': '../data/pdf/IT314-Software Engineering-SPM Cont.pdf', 'file_path': '../data/pdf/IT314-Software Engineering-SPM Cont.pdf', 'total_pages': 10, 'format': 'PDF 1.5', 'title': 'Microsoft PowerPoint - IT314-Software Engineering-SPM Cont.ppt [Compatibility Mode]', 'author': 'DA-IICT', 'subject': '', 'keywords': '', 'moddate': '2025-07-25T11:02:56+05:30', 'trapped': '', 'modDate': "D:20250725110256+05'30'", 'creationDate': "D:20250725110256+05'30'", 'page': 0}, page_content='7/25/2025\n1\nDA-IICT\nIT 314: Software Engineering\nSoftware Process Models – RUP|XP|TDD\n1\nRUP – Rational Unified Process\n•\nLife Cycle model proposed by Booch, Jacobson, and Rumbaugh\n(“The three Amigos”) derived from the work on UML\n•\nRational Unified Process (RUP) uses Unified Modeling Language\n(UML) as core notation\n•\nDescribed from 3 perspective

## Embedding and Vector store

In [56]:
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import TypedDict, List,Dict,Any
import numpy as np
import os

In [57]:
class embedding:
    def __init__(self,model_name:str="all-MiniLM-L6-v2"):
        self.model_name=model_name
        self.model=None
        self._load_model()

    def _load_model(self):
        try:
            self.model=SentenceTransformer(self.model_name)
            print(f"Model {self.model_name} loaded successfully, with dimension {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"Error loading model: {e}")

    def generate_embedding(self,texts:List[str])->np.ndarray:
        if self.model is None:
            raise ValueError("Model is not loaded.")
        embeddings=self.model.encode(texts,show_progress_bar=True)
        print(f"Generated embeddings with shape: {embeddings.shape}")
        return embeddings
    
#initalize
embed_manager=embedding()

Model all-MiniLM-L6-v2 loaded successfully, with dimension 384


In [58]:
### Text splitting get into chunks


from langchain_text_splitters import RecursiveCharacterTextSplitter

def split_documents(documents,chunk_size=1000,chunk_overlap=200):
    """Split documents into smaller chunks for better RAG performance"""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=["\n\n", "\n", " ", ""]
    )
    split_docs = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(split_docs)} chunks")
    
    # Show example of a chunk
    if split_docs:
        print(f"\nExample chunk:")
        print(f"Content: {split_docs[0].page_content[:200]}...")
        print(f"Metadata: {split_docs[0].metadata}")
    
    return split_docs

In [59]:
chunks=split_documents(docs_pdf)
chunks

Split 10 documents into 11 chunks

Example chunk:
Content: 7/25/2025
1
DA-IICT
IT 314: Software Engineering
Software Process Models – RUP|XP|TDD
1
RUP – Rational Unified Process
•
Life Cycle model proposed by Booch, Jacobson, and Rumbaugh
(“The three Amigos”)...
Metadata: {'producer': 'Acrobat Distiller 10.0.0 (Windows)', 'creator': 'PScript5.dll Version 5.2.2', 'creationdate': '2025-07-25T11:02:56+05:30', 'source': '../data/pdf/IT314-Software Engineering-SPM Cont.pdf', 'file_path': '../data/pdf/IT314-Software Engineering-SPM Cont.pdf', 'total_pages': 10, 'format': 'PDF 1.5', 'title': 'Microsoft PowerPoint - IT314-Software Engineering-SPM Cont.ppt [Compatibility Mode]', 'author': 'DA-IICT', 'subject': '', 'keywords': '', 'moddate': '2025-07-25T11:02:56+05:30', 'trapped': '', 'modDate': "D:20250725110256+05'30'", 'creationDate': "D:20250725110256+05'30'", 'page': 0}


[Document(metadata={'producer': 'Acrobat Distiller 10.0.0 (Windows)', 'creator': 'PScript5.dll Version 5.2.2', 'creationdate': '2025-07-25T11:02:56+05:30', 'source': '../data/pdf/IT314-Software Engineering-SPM Cont.pdf', 'file_path': '../data/pdf/IT314-Software Engineering-SPM Cont.pdf', 'total_pages': 10, 'format': 'PDF 1.5', 'title': 'Microsoft PowerPoint - IT314-Software Engineering-SPM Cont.ppt [Compatibility Mode]', 'author': 'DA-IICT', 'subject': '', 'keywords': '', 'moddate': '2025-07-25T11:02:56+05:30', 'trapped': '', 'modDate': "D:20250725110256+05'30'", 'creationDate': "D:20250725110256+05'30'", 'page': 0}, page_content='7/25/2025\n1\nDA-IICT\nIT 314: Software Engineering\nSoftware Process Models – RUP|XP|TDD\n1\nRUP – Rational Unified Process\n•\nLife Cycle model proposed by Booch, Jacobson, and Rumbaugh\n(“The three Amigos”) derived from the work on UML\n•\nRational Unified Process (RUP) uses Unified Modeling Language\n(UML) as core notation\n•\nDescribed from 3 perspective

## Vector Store

In [60]:
class vectorstore():
    #collection means where exactly in vector store,we store our vectors
    def __init__(self,collection_name:str="default",persist_directory:str="../data/vectorstore"):
        self.collection_name=collection_name
        self.persist_directory=persist_directory
        self.client=None
        self.collection=None
        self._initialize_client()

    def _initialize_client(self):
        try:
            os.makedirs(self.persist_directory,exist_ok=True)

            self.client=chromadb.PersistentClient(path=self.persist_directory)

            self.collection=self.client.get_or_create_collection(name=self.collection_name,metadata={"description":"Vector store collection"})
            
            print(f"ChromaDB client initialized with collection: {self.collection_name}")
       

        except Exception as e:
            print(f"Error initializing ChromaDB client: {e}")

    def add_documents(self,documents:List[Any],embeddings:np.ndarray):
        if(len(documents)!=len(embeddings)):
            raise ValueError("Number of documents and embeddings must match.")
        print(f"Adding {len(documents)} documents to vector store...")
    
        ids = []
        metadatas = []
        documents_text = []
        embeddings_list = []
        
        for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
            #this enumerate(zip...) fxn is used to create array of tuples(doc,embedding)
            doc_id = f"doc_{uuid.uuid4().hex[:8]}_{i}"
            ids.append(doc_id)
            metadata = dict(doc.metadata)
            metadata['doc_index'] = i
            metadata['content_length'] = len(doc.page_content)
            metadatas.append(metadata)
            documents_text.append(doc.page_content)
            embeddings_list.append(embedding.tolist())
            
        try:
            self.collection.add(
                ids=ids,
                embeddings=embeddings_list,
                metadatas=metadatas,
                documents=documents_text
            )
            print(f"Successfully added {len(documents)} documents to vector store")

            
        except Exception as e:
            print(f"Error adding documents to vector store: {e}")
            raise

vec_store=vectorstore()
vec_store

ChromaDB client initialized with collection: default


<__main__.vectorstore at 0x17907c5d0>

In [61]:
# Converting text to embeddings
texts=[doc.page_content for doc in chunks]
texts

['7/25/2025\n1\nDA-IICT\nIT 314: Software Engineering\nSoftware Process Models – RUP|XP|TDD\n1\nRUP – Rational Unified Process\n•\nLife Cycle model proposed by Booch, Jacobson, and Rumbaugh\n(“The three Amigos”) derived from the work on UML\n•\nRational Unified Process (RUP) uses Unified Modeling Language\n(UML) as core notation\n•\nDescribed from 3 perspectives\n\uf0a7\n A dynamic perspective that shows phases over time;\n\uf0a7\n A static perspective that shows process activities;\n\uf0a7\n A practice perspective that suggests good practice.\n•\nUnified Process is distinguished by being\n\uf0a7\n Use-case driven\n\uf0a7\n Architecture-centric\n\uf0a7\n Iterative and incremental',
 '7/25/2025\n2\nRUP – Rational Unified Process\n•\nRUP proposes a phase model that identifies four discrete phases in\nthe software process\n•\nInception\n•\n Establish the business case for the system\n•\n Decide to cancel or continue the project\n•\nElaboration\n•  \nDevelop an understanding of the problem

In [62]:
#To generate embeddings
embeddings=embed_manager.generate_embedding(texts) #we created embed_manager object earlier

#to store in vector store
vec_store.add_documents(chunks,embeddings)

Batches: 100%|██████████| 1/1 [00:00<00:00,  3.33it/s]

Generated embeddings with shape: (11, 384)
Adding 11 documents to vector store...
Successfully added 11 documents to vector store





## Retriever Pipeline from vector store

In [63]:
class retrierver:
    def __init__(self,vectorstore:vectorstore,embedding_manager=embedding):
        self.vectorstore=vectorstore
        self.embedding_manager=embedding_manager

    def retrieve(self, query: str, top_k: int = 5, score_threshold: float = 0.0) -> List[Dict[str, Any]]:
        """
        Retrieve relevant documents for a query
        
        Args:
            query: The search query
            top_k: Number of top results to return
            score_threshold: Minimum similarity score threshold
            
        Returns:
            List of dictionaries containing retrieved documents and metadata
        """
        print(f"Retrieving documents for query: '{query}'")
        print(f"Top K: {top_k}, Score threshold: {score_threshold}")
        
        # Generate query embedding
        query_embedding = self.embedding_manager.generate_embedding([query])[0]
        
        # Search in vector store
        try:
            results = self.vectorstore.collection.query(
                query_embeddings=[query_embedding.tolist()],
                n_results=top_k
            )
            
            # Process results
            retrieved_docs = []
            
            if results['documents'] and results['documents'][0]:
                documents = results['documents'][0]
                metadatas = results['metadatas'][0]
                distances = results['distances'][0]
                ids = results['ids'][0]
                
                for i, (doc_id, document, metadata, distance) in enumerate(zip(ids, documents, metadatas, distances)):
                    # Convert distance to similarity score (ChromaDB uses cosine distance)
                    similarity_score = 1 - distance
                    
                    if similarity_score >= score_threshold:
                        retrieved_docs.append({
                            'id': doc_id,
                            'content': document,
                            'metadata': metadata,
                            'similarity_score': similarity_score,
                            'distance': distance,
                            'rank': i + 1
                        })
                print(f"Retrieved {len(retrieved_docs)} documents (after filtering)")
            else:
                print("No documents found")
            
            return retrieved_docs
            
        except Exception as e:
            print(f"Error during retrieval: {e}")
            return []

rag_retriever=retrierver(vec_store,embed_manager)                       
        

In [65]:
rag_retriever.retrieve("RUP uses unified modeling language")

Retrieving documents for query: 'RUP uses unified modeling language'
Top K: 5, Score threshold: 0.0


Batches: 100%|██████████| 1/1 [00:00<00:00, 35.87it/s]

Generated embeddings with shape: (1, 384)
Retrieved 4 documents (after filtering)





[{'id': 'doc_f0ecbea0_5',
  'content': '7/25/2025\n5\nLife of a Unified Process\nRUP - Summary\n•\nThe RUP is not a suitable process for all types of development but it\ndoes represent a new generation of generic processes\n•\nMost important innovation:\n•\n Combination of many views\n•\n Deployment of software is part of the process (almost ignored\nin other process models)\n•\nBased on standards\n•\n Object-oriented Modeling\n•\n Unified Modeling Language',
  'metadata': {'title': 'Microsoft PowerPoint - IT314-Software Engineering-SPM Cont.ppt [Compatibility Mode]',
   'subject': '',
   'creationdate': '2025-07-25T11:02:56+05:30',
   'doc_index': 5,
   'author': 'DA-IICT',
   'source': '../data/pdf/IT314-Software Engineering-SPM Cont.pdf',
   'trapped': '',
   'modDate': "D:20250725110256+05'30'",
   'keywords': '',
   'content_length': 402,
   'page': 4,
   'producer': 'Acrobat Distiller 10.0.0 (Windows)',
   'format': 'PDF 1.5',
   'moddate': '2025-07-25T11:02:56+05:30',
   'creati