## Data Ingestion Pipeline

In [None]:
from langchain_community.document_loaders import TextLoader, PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter


Document Loading...

In [None]:
# Load the plain-text sample file. The bare name on the last line makes the
# notebook display the loaded documents as the cell output.
text_loader = TextLoader("./data/hello.txt")
text_documents = text_loader.load()
text_documents

In [None]:
# Load the sample PDF and report how many documents the loader returned.
# NOTE(review): PyMuPDFLoader presumably yields one Document per page, so the
# count printed below would be a page count — confirm against the loader docs.
pdf_loader = PyMuPDFLoader("./data/Presentation.pdf")
pdf_documents = pdf_loader.load()
print(f"Loaded {len(pdf_documents)} PDF documents.")
pdf_documents

Splitting into Chunks...

In [None]:
def split_documents(documents, chunk_size=1000, chunk_overlap=200):
    """Split documents into overlapping chunks and print a short summary.

    Args:
        documents: Sized collection of documents to split; passed straight to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: Maximum chunk length, measured with ``len``.
        chunk_overlap: Amount of text shared between neighbouring chunks,
            measured with ``len``.

    Returns:
        The list of chunk documents produced by the splitter.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        # Tried in order: paragraph break, line break, space, then any character.
        separators=["\n\n", "\n", " ", ""],
    )

    # Operate on the argument, not on the notebook-global ``pdf_documents``;
    # otherwise the function silently ignores whatever the caller passes in.
    split_docs = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(split_docs)} chunks")

    if split_docs:
        first_chunk = split_docs[0]
        print("\nExample chunk:")
        print(f"Content: {first_chunk.page_content[:200]}...")
        print(f"Metadata: {first_chunk.metadata}")

    return split_docs

In [None]:
# Chunk the loaded PDF documents with the default size/overlap and display
# the resulting chunks as the cell output.
chunks = split_documents(pdf_documents)
chunks

Embedding Manager

In [None]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from typing import List

In [None]:
class EmbeddingManager:
    """Load a SentenceTransformer model by name and embed batches of text."""

    def __init__(self, model_name="all-MiniLM-L6-v2"):
        """Remember ``model_name`` and immediately try to load the model.

        Args:
            model_name: Identifier handed to ``SentenceTransformer``.
        """
        self.model_name = model_name
        self.model = None
        self._load_model()

    def _load_model(self):
        """Load the model into ``self.model``.

        Loading is best-effort: any failure is printed and ``self.model``
        stays ``None``, so the error surfaces later in ``embed_texts``.
        """
        try:
            print(f"Loading model '{self.model_name}'...")
            self.model = SentenceTransformer(self.model_name)
            print(f"Model '{self.model_name}' loaded successfully.")
        except Exception as e:
            print(f"Error loading model '{self.model_name}': {e}")

    def embed_texts(self, texts: List[str]) -> np.ndarray:
        """Encode ``texts`` with the loaded model.

        Args:
            texts: Strings to embed.

        Returns:
            Whatever ``self.model.encode`` returns for ``texts`` (annotated as
            a NumPy array).

        Raises:
            ValueError: If the model was never loaded.
        """
        # Compare with None explicitly: a model object may define
        # ``__len__``/``__bool__``, so truthiness is not a reliable
        # "is it loaded?" test.
        if self.model is None:
            raise ValueError("Model is not loaded.")

        embeddings = self.model.encode(texts, show_progress_bar=True)
        print(f"Generated embeddings for {len(texts)} texts.")
        return embeddings

# Shared embedder for the rest of the notebook (default model name); the
# constructor attempts to load the model right away.
embedding_manager = EmbeddingManager()


Vector Store

In [None]:
from typing import Any


class VectorStore:
    """Persistent ChromaDB collection storing document chunks with embeddings."""

    def __init__(self, collection_name: str = "academic_documents", persist_directory: str = "./data/vector_store"):
        """Record the settings and immediately try to open the collection.

        Args:
            collection_name: Name of the collection to get or create.
            persist_directory: Path handed to ``chromadb.PersistentClient``.
        """
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._init_store()

    def _init_store(self):
        """Create the client and collection.

        Initialization is best-effort: failures are printed and
        ``self.collection`` stays ``None``, so ``add_embeddings`` raises later.
        """
        try:
            print("Initializing ChromaDB client...")
            self.client = chromadb.PersistentClient(path=self.persist_directory)
            self.collection = self.client.get_or_create_collection(name=self.collection_name)
            print(f"Collection '{self.collection_name}' initialized successfully.")
        except Exception as e:
            print(f"Error initializing ChromaDB client: {e}")

    def add_embeddings(self, texts: List[Any], embeddings: np.ndarray):
        """Store documents together with their embeddings.

        Args:
            texts: Document-like objects exposing ``page_content`` and
                ``metadata``.
            embeddings: One embedding per document, in the same order; each
                row must provide ``tolist()``.

        Raises:
            ValueError: If the collection is not initialized, or if the number
                of documents and embeddings differ.
        """
        if self.collection is None:
            raise ValueError("Collection is not initialized.")

        if len(texts) != len(embeddings):
            raise ValueError("Number of texts and embeddings must match.")

        print(f"Adding {len(texts)} embeddings to vector store'...")

        # NOTE(review): ids are positional ("0".."n-1"), so a later call reuses
        # the same ids — confirm how the collection treats already-present ids.
        ids = [str(i) for i in range(len(texts))]
        metadatas = [dict(doc.metadata) for doc in texts]
        contents = [doc.page_content for doc in texts]
        embeddings_lists = [embedding.tolist() for embedding in embeddings]

        try:
            # The keyword is ``metadatas`` (plural). The misspelt ``metadata``
            # raised a TypeError that the handler below swallowed, so nothing
            # was ever written to the collection.
            self.collection.add(ids=ids, documents=contents, metadatas=metadatas, embeddings=embeddings_lists)
            print(f"Added {len(texts)} embeddings to the collection '{self.collection_name}'.")
        except Exception as e:
            print(f"Error adding embeddings to collection: {e}")

# Open (or create) the default collection under ./data/vector_store and show
# the store object as the cell output.
vector_store = VectorStore()
vector_store

In [None]:
# Raw text of every chunk, in chunk order, ready to be embedded.
content = [chunk.page_content for chunk in chunks]

In [None]:
# Embed the chunk texts; row i of the result corresponds to chunks[i].
embeddings = embedding_manager.embed_texts(content)

# Store the chunk objects (not just their text) so their metadata is kept
# alongside each embedding.
vector_store.add_embeddings(chunks, embeddings)

## Retrieval

In [None]:
class Retriever:
    """Fetch the stored chunks that best match a free-text query."""

    def __init__(self, vector_store: VectorStore, embedding_manager: EmbeddingManager, top_k: int = 5):
        """Keep references to the store, the embedder and the result limit.

        Args:
            vector_store: Store whose ``collection`` is queried.
            embedding_manager: Embedder used to vectorize the query.
            top_k: Number of results requested per query.
        """
        self.top_k = top_k
        self.embedding_manager = embedding_manager
        self.vector_store = vector_store

    def retrieve(self, query: str) -> List[Any]:
        """Return matching chunks for ``query``.

        Args:
            query: Text to search for.

        Returns:
            A list of ``{'content': ..., 'metadata': ...}`` dicts, in the
            order the collection returned them.
        """
        print(f"Retrieving top {self.top_k} documents for query: '{query}'")

        # The embedder works on batches: wrap the query, take the single row.
        query_vector = self.embedding_manager.embed_texts([query])[0]

        hits = self.vector_store.collection.query(
            query_embeddings=[query_vector.tolist()],
            n_results=self.top_k,
        )

        # Results are grouped per query embedding; only one was sent, so the
        # matches live at index 0.
        matched_texts = hits['documents'][0]
        matched_metadata = hits['metadatas'][0]
        return [
            {'content': text, 'metadata': meta}
            for text, meta in zip(matched_texts, matched_metadata)
        ]

# Retriever over the notebook's store and embedder, limited to 3 results per query.
retriever = Retriever(vector_store, embedding_manager, top_k=3)


In [None]:
from dotenv import load_dotenv
import os
from langchain_google_genai import ChatGoogleGenerativeAI

# Load variables from a .env file (if any) into the process environment.
load_dotenv()
# NOTE(review): api_key is read here but not passed to the client below;
# presumably ChatGoogleGenerativeAI picks GOOGLE_API_KEY up from the
# environment on its own — confirm.
api_key = os.getenv("GOOGLE_API_KEY")

# Chat model used for answer generation, capped at 1024 tokens via max_tokens.
llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash", max_tokens=1024)


In [None]:
def generate_answer(query: str, retriever, llm, top_k=3) -> str:
    """Answer ``query`` with the LLM, grounded on retrieved context.

    Args:
        query: The user's question.
        retriever: Object whose ``retrieve(query)`` returns dicts with a
            ``'content'`` key.
        llm: Object whose ``invoke(...)`` returns a response exposing
            ``.content``.
        top_k: Kept for interface compatibility. It is not forwarded; the
            number of results is governed by the retriever itself.

    Returns:
        The LLM's answer text, or a fixed fallback message when retrieval
        produced no context.
    """
    retrieved_docs = retriever.retrieve(query)

    context = "\n\n".join(doc['content'] for doc in retrieved_docs)

    if not context:
        return "No relevant information found to answer the question. Re-Phrase it!"

    # The f-string already interpolates context and query. Do NOT call
    # ``.format`` on the result: any literal ``{`` or ``}`` in the retrieved
    # text or the question would be parsed as a replacement field and raise.
    prompt = f"Using the following context, answer the question:\n\nContext:\n{context}\n\nQuestion: {query}\n\nAnswer:"

    print("Generating answer using LLM...")
    answer = llm.invoke([prompt])

    return answer.content


# End-to-end smoke test: retrieve context for a sample question and print the
# model's answer.
res = generate_answer("What is the pdf about?", retriever, llm, top_k=3)
print(res)