## Data Ingestion

In [19]:
from langchain_community.document_loaders import TextLoader, PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter


In [10]:
# Load the plain-text demo file; `load()` returns a list of Document objects,
# each carrying the file path under metadata['source'].
text_loader = TextLoader("./data/hello.txt")
text_documents = text_loader.load()
text_documents

[Document(metadata={'source': './data/hello.txt'}, page_content='Hello! this a Demo File.\n\nThis file will be used for Testing.')]

In [11]:
# Load the presentation PDF with PyMuPDF. Each returned Document carries a
# 'page' index and the file's 'total_pages' in its metadata.
pdf_loader = PyMuPDFLoader("./data/Presentation.pdf")
pdf_documents = pdf_loader.load()
pdf_documents

[Document(metadata={'producer': '', 'creator': '', 'creationdate': '2025-10-13T16:14:29+00:00', 'source': './data/Presentation.pdf', 'file_path': './data/Presentation.pdf', 'total_pages': 29, 'format': 'PDF 1.7', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2025-10-13T16:14:29+00:00', 'trapped': '', 'modDate': "D:20251013161429+00'00'", 'creationDate': "D:20251013161429+00'00'", 'page': 0}, page_content='CVR COLLEGE OF ENGINEERING\nDepartment of CSE(Data Science)\nB.Tech CSE(DS) IV Year I Semester\nMajor Project Stage-1\nReview-2, Aug.23 2025\n• PRC Members\n• Dr. A. Srinivasa Reddy\n• Dr. M. Sreenu\n• Dr. Shaik Janbhasha\n• Dr. Afreen Fatima Mohammed'),
 Document(metadata={'producer': '', 'creator': '', 'creationdate': '2025-10-13T16:14:29+00:00', 'source': './data/Presentation.pdf', 'file_path': './data/Presentation.pdf', 'total_pages': 29, 'format': 'PDF 1.7', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2025-10-13T16:14:29+00:00', '

Splitting into chunks

In [None]:
def split_documents(documents, chunk_size=1000, chunk_overlap=200):
    """Split documents into overlapping chunks suitable for embedding.

    Args:
        documents: Iterable of Document objects (anything exposing
            ``page_content`` and ``metadata``) to split.
        chunk_size: Maximum chunk length, measured with ``len`` (characters).
        chunk_overlap: Number of characters shared by consecutive chunks.

    Returns:
        The list of chunk Documents produced by the splitter (empty when
        ``documents`` is empty).
    """
    # Materialise once so generators are accepted and len() works for the log line.
    documents = list(documents)

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        # Try paragraph breaks first, then line breaks, then spaces, then raw characters.
        separators=["\n\n", "\n", " ", ""],
    )

    # Split the *argument*: the earlier version read the notebook global
    # `pdf_documents`, so the function ignored whatever the caller passed in.
    split_docs = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(split_docs)} chunks")

    if split_docs:
        # Show one sample so the chunking can be sanity-checked at a glance.
        print("\nExample chunk:")
        print(f"Content: {split_docs[0].page_content[:200]}...")
        print(f"Metadata: {split_docs[0].metadata}")

    return split_docs

In [21]:
# Chunk the loaded PDF pages with the default size/overlap (1000/200 characters).
# The trailing bare expression displays the full chunk list.
chunks = split_documents(pdf_documents)
chunks

Split 29 documents into 43 chunks

Example chunk:
Content: CVR COLLEGE OF ENGINEERING
Department of CSE(Data Science)
B.Tech CSE(DS) IV Year I Semester
Major Project Stage-1
Review-2, Aug.23 2025
• PRC Members
• Dr. A. Srinivasa Reddy
• Dr. M. Sreenu
• Dr. Sh...
Metadata: {'producer': '', 'creator': '', 'creationdate': '2025-10-13T16:14:29+00:00', 'source': './data/Presentation.pdf', 'file_path': './data/Presentation.pdf', 'total_pages': 29, 'format': 'PDF 1.7', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2025-10-13T16:14:29+00:00', 'trapped': '', 'modDate': "D:20251013161429+00'00'", 'creationDate': "D:20251013161429+00'00'", 'page': 0}


[Document(metadata={'producer': '', 'creator': '', 'creationdate': '2025-10-13T16:14:29+00:00', 'source': './data/Presentation.pdf', 'file_path': './data/Presentation.pdf', 'total_pages': 29, 'format': 'PDF 1.7', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2025-10-13T16:14:29+00:00', 'trapped': '', 'modDate': "D:20251013161429+00'00'", 'creationDate': "D:20251013161429+00'00'", 'page': 0}, page_content='CVR COLLEGE OF ENGINEERING\nDepartment of CSE(Data Science)\nB.Tech CSE(DS) IV Year I Semester\nMajor Project Stage-1\nReview-2, Aug.23 2025\n• PRC Members\n• Dr. A. Srinivasa Reddy\n• Dr. M. Sreenu\n• Dr. Shaik Janbhasha\n• Dr. Afreen Fatima Mohammed'),
 Document(metadata={'producer': '', 'creator': '', 'creationdate': '2025-10-13T16:14:29+00:00', 'source': './data/Presentation.pdf', 'file_path': './data/Presentation.pdf', 'total_pages': 29, 'format': 'PDF 1.7', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2025-10-13T16:14:29+00:00', '

Embedding Manager

In [13]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from typing import List
from sklearn.metrics.pairwise import cosine_similarity

In [14]:
class EmbeddingManager:
    """Wrapper around a SentenceTransformer model used to embed text.

    The model is loaded eagerly in the constructor. A load failure is reported
    with ``print`` instead of being raised, which leaves ``self.model`` as
    ``None``; ``embed_texts`` then raises ``ValueError``.
    """

    def __init__(self, model_name="all-MiniLM-L6-v2"):
        """Remember ``model_name`` and try to load the model immediately."""
        self.model_name = model_name
        self.model = None  # stays None when loading fails
        self._load_model()

    def _load_model(self):
        """Load ``self.model_name`` into ``self.model``, printing progress or the error."""
        try:
            print(f"Loading model '{self.model_name}'...")
            self.model = SentenceTransformer(self.model_name)
            print(f"Model '{self.model_name}' loaded successfully.")
        except Exception as e:
            # Deliberately non-fatal: the failure surfaces later in embed_texts.
            print(f"Error loading model '{self.model_name}': {e}")

    def embed_texts(self, texts: List[str]) -> np.ndarray:
        """Encode ``texts`` with the loaded model.

        Args:
            texts: Strings to embed (any iterable; it is materialised to a list).

        Returns:
            Whatever ``model.encode`` returns for the list of texts (annotated
            as ``np.ndarray``).

        Raises:
            ValueError: If the model failed to load.
        """
        # Compare against None explicitly: truthiness of an object that defines
        # __len__ depends on its length, which says nothing about "loaded".
        if self.model is None:
            raise ValueError("Model is not loaded.")

        # Materialise so generators work and len() is available for the log line.
        texts = list(texts)
        embeddings = self.model.encode(texts, show_progress_bar=True)
        print(f"Generated embeddings for {len(texts)} texts.")
        return embeddings

# Build the shared manager once; the constructor loads the default
# 'all-MiniLM-L6-v2' model right away (see the log lines it prints).
embedding_manager = EmbeddingManager()


Loading model 'all-MiniLM-L6-v2'...
Model 'all-MiniLM-L6-v2' loaded successfully.


Vector Store

In [15]:
from typing import Any


class VectorStore:
    """Persistent ChromaDB collection holding document chunks and their embeddings.

    The client and collection are created in the constructor. An initialisation
    failure is printed rather than raised, which leaves ``self.collection`` as
    ``None``; ``add_embeddings`` then raises ``ValueError``.
    """

    def __init__(self, collection_name: str = "academic_documents", persist_directory: str = "./data/vector_store"):
        """Remember the collection name / storage path and open the store."""
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._init_store()

    def _init_store(self):
        """Create the persistent client and get or create the named collection."""
        try:
            print("Initializing ChromaDB client...")
            self.client = chromadb.PersistentClient(path=self.persist_directory)
            self.collection = self.client.get_or_create_collection(name=self.collection_name)
            print(f"Collection '{self.collection_name}' initialized successfully.")
        except Exception as e:
            # Deliberately non-fatal: the failure surfaces later in add_embeddings.
            print(f"Error initializing ChromaDB client: {e}")

    def add_embeddings(self, texts: List[Any], embeddings: np.ndarray):
        """Store documents, their metadata and their embeddings in the collection.

        Args:
            texts: Document-like objects exposing ``page_content`` and ``metadata``.
            embeddings: One embedding per document, in the same order.

        Raises:
            ValueError: If the collection is not initialised or the two inputs
                differ in length.

        Errors raised by the collection's ``add`` call are printed, not re-raised.
        """
        import hashlib  # local import keeps this block self-contained

        if self.collection is None:
            raise ValueError("Collection is not initialized.")

        if len(texts) != len(embeddings):
            raise ValueError("Number of texts and embeddings must match.")

        if len(texts) == 0:
            # Nothing to store; avoid handing empty lists to the collection.
            print("No embeddings to add.")
            return

        print(f"Adding {len(texts)} embeddings to vector store'...")

        ids = []
        metadatas = []
        contents = []
        embeddings_lists = []

        for position, (doc, embedding) in enumerate(zip(texts, embeddings)):
            # Chroma documents metadata values as str/int/float/bool only, so
            # other value types are dropped instead of failing the whole batch.
            metadata = {
                key: value
                for key, value in dict(doc.metadata).items()
                if isinstance(key, str) and isinstance(value, (str, int, float, bool))
            }

            # Content-derived ID. The old IDs were "0".."n-1" for every call, so
            # a second batch collided with the first. Hashing position, metadata
            # and text keeps different batches apart while a re-run of the same
            # batch reproduces the same IDs (Chroma documents add() as skipping
            # IDs that already exist). NOTE: records stored under the old
            # numeric IDs are not recognised as duplicates of these.
            fingerprint = repr((position, sorted(metadata.items()), doc.page_content))
            ids.append(hashlib.sha256(fingerprint.encode("utf-8")).hexdigest())

            # Chroma rejects empty metadata dicts; None means "no metadata".
            metadatas.append(metadata or None)
            contents.append(doc.page_content)
            embeddings_lists.append(np.asarray(embedding).tolist())

        try:
            # `metadatas` was previously built but never passed, so source/page
            # information was silently lost.
            self.collection.add(
                ids=ids,
                documents=contents,
                embeddings=embeddings_lists,
                metadatas=metadatas,
            )
            print(f"Added {len(texts)} embeddings to the collection '{self.collection_name}'.")
        except Exception as e:
            print(f"Error adding embeddings to collection: {e}")

# Open (or create) the default 'academic_documents' collection persisted
# under ./data/vector_store.
vector_store = VectorStore()
vector_store

Initializing ChromaDB client...
Collection 'academic_documents' initialized successfully.


<__main__.VectorStore at 0x1de53df9d90>

In [22]:
# Plain chunk texts, in the same order as `chunks`, as input for the embedding model.
content = [chunk.page_content for chunk in chunks]

In [23]:
# Embed every chunk text, then store the chunk Documents alongside their vectors.
# `chunks` and `embeddings` must stay aligned: add_embeddings raises if their lengths differ.
embeddings = embedding_manager.embed_texts(content)

vector_store.add_embeddings(chunks, embeddings)

Batches: 100%|██████████| 2/2 [00:16<00:00,  8.05s/it]


Generated embeddings for 43 texts.
Adding 43 embeddings to vector store'...
Added 43 embeddings to the collection 'academic_documents'.


## Retrieval

In [None]:
def retrieve(query: str, k: int = 3):
    """Return the ``k`` stored chunks closest to ``query``.

    The previous cell called ``collection.as_retriever(...)`` on the native
    ChromaDB collection. That is a LangChain vector-store method, not a
    chromadb ``Collection`` one, so this queries the collection directly.

    Args:
        query: Natural-language search string.
        k: Number of nearest chunks to return (3 matches the old ``search_kwargs``).

    Returns:
        The raw result of ``collection.query``. Chroma documents this as a dict
        of per-query lists under the keys ``ids``, ``documents``, ``metadatas``
        and ``distances``.
    """
    # Embed the query with the same model that embedded the stored chunks,
    # so the vectors are comparable.
    query_embedding = embedding_manager.embed_texts([query])
    return vector_store.collection.query(
        query_embeddings=query_embedding.tolist(),
        n_results=k,
        include=["documents", "metadatas", "distances"],
    )


# Keep the name `retriever`; use it as `retriever("your question")`.
retriever = retrieve
retriever
