In [1]:
###DATA LOADING MODULE###
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import TextLoader
from langchain_core.documents import Document

  from .autonotebook import tqdm as notebook_tqdm


Embeddings

In [None]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict,Tuple, Any
from sklearn.metrics.pairwise import cosine_similarity
import os
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [None]:
class EmbeddingModel:
    """Load a SentenceTransformer model and turn PDF files into embedded text chunks.

        Args:
            model_name (str): Name of the pre-trained model to use. Defaults to 'all-MiniLM-L6-v2'.
    """

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        self.model_name = model_name
        self.model = None
        # Load eagerly so that a bad model name fails at construction time.
        self._load_model()

    def _load_model(self):
        """Load the SentenceTransformer model named by ``self.model_name``.

            Raises:
                Exception: Whatever the model constructor raised; it is logged
                    and then re-raised unchanged.
        """
        try:
            self.model = SentenceTransformer(self.model_name)
            print(f"Loaded model: {self.model_name}")
            print(f"model_dimension: {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"Error loading model {self.model_name}: {e}")
            raise

    def parse_documents(self, path: str) -> "List[Document]":
        """Load the PDF files found in the given directory.

            Args:
                path (str): Path to the directory containing the PDF files
                    (files are selected with the ``*.pdf`` glob).

            Returns:
                List[Document]: The documents returned by the directory loader.

            Raises:
                ValueError: If ``path`` is not an existing directory.
        """
        if not os.path.isdir(path):
            raise ValueError(f"Provided path {path} is not a valid directory.")

        loader = DirectoryLoader(path, glob="*.pdf", loader_cls=PyMuPDFLoader)
        return loader.load()

    def batch_embed_texts(self, texts: List[str], batch_size: int = 32) -> np.ndarray:
        """Embed texts in batches to handle large datasets.

            Args:
                texts (List[str]): Texts to be embedded. A single string is
                    accepted and treated as a one-element list.
                batch_size (int): Number of texts per encode call; must be >= 1.
                    Defaults to 32.

            Returns:
                np.ndarray: 2-D array with one row per input text. For empty
                    input the array has zero rows.

            Raises:
                ValueError: If the model is not loaded or ``batch_size`` < 1.
        """
        if self.model is None:
            raise ValueError("Model not loaded.")
        if batch_size < 1:
            raise ValueError(f"batch_size must be a positive integer, got {batch_size}.")

        # A bare string would otherwise be sliced into batch_size-character
        # fragments, each embedded as if it were a separate text.
        texts = [texts] if isinstance(texts, str) else list(texts)

        if not texts:
            # np.vstack([]) raises, so build the zero-row result explicitly.
            dimension = self.model.get_sentence_embedding_dimension() or 0
            return np.empty((0, dimension), dtype=np.float32)

        embeddings = [
            self.model.encode(texts[start:start + batch_size], convert_to_numpy=True)
            for start in range(0, len(texts), batch_size)
        ]
        return np.vstack(embeddings)

    def chunking(self, path: str, chunk_size: int = 500, overlap: int = 50) -> "Tuple[List[Document], np.ndarray]":
        """Load the PDFs in ``path``, split them into chunks and embed every chunk.

            Args:
                path (str): Directory containing the PDF files.
                chunk_size (int): The size of each chunk. Defaults to 500.
                overlap (int): The number of overlapping characters between chunks. Defaults to 50.

            Returns:
                Tuple[List[Document], np.ndarray]: The chunk documents and their
                    embeddings, one embedding row per chunk, in the same order.

            Raises:
                ValueError: If ``path`` is not an existing directory.
        """
        documents = self.parse_documents(path)
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=overlap,
            separators=["\n\n", "\n", " ", ""],
        )
        # Per the original author's note: only page_content is split; metadata is not.
        chunks = text_splitter.split_documents(documents)
        embedding = self.batch_embed_texts([chunk.page_content for chunk in chunks])
        return chunks, embedding
        
    



In [None]:
class VectorDB:
    """Small wrapper around a persistent ChromaDB collection.

        Args:
            collection_name (str): Name of the collection to open or create.
                Defaults to 'documents'.
            persist_directory (str): Directory in which ChromaDB keeps its data.
                Defaults to '/content/chroma_db'.
    """

    def __init__(self, collection_name: str = "documents", persist_directory: str = "/content/chroma_db"):
        self.collection_name = collection_name
        self.client = None
        self.collection = None
        self.persist_directory = persist_directory
        self._initialize_client()

    def _initialize_client(self):
        """Create the persist directory, open the client and get/create the collection.

            Raises:
                Exception: Any error raised while creating the directory or
                    opening the client; it is logged and re-raised unchanged.
        """
        try:
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path=self.persist_directory)
            self.collection = self.client.get_or_create_collection(name=self.collection_name)
            print(f"ChromaDB client initialized with collection: {self.collection_name}")
        except Exception as e:
            print(f"Error initializing ChromaDB client: {e}")
            raise

    def add_documents(self, documents: List[Any], embeddings: np.ndarray):
        """Store documents together with their precomputed embeddings.

            Args:
                documents (List[Any]): Objects exposing ``page_content`` and
                    ``metadata`` attributes.
                embeddings (np.ndarray): One embedding row per document, in the
                    same order. A nested list is accepted as well.

            Raises:
                ValueError: If the collection is not initialized or the number
                    of documents and embeddings differ.
        """
        if self.collection is None:
            raise ValueError("ChromaDB client not initialized.")
        if len(documents) != len(embeddings):
            raise ValueError("Number of documents and embeddings must match.")
        if len(documents) == 0:
            # Nothing to store; avoid handing empty lists to the collection.
            print(f"Added 0 documents to the collection {self.collection_name}.")
            return

        # IDs are fresh random UUIDs, so calling this again with the same
        # documents stores them a second time under new IDs.
        ids = [str(uuid.uuid4()) for _ in documents]
        metadatas = [doc.metadata for doc in documents]
        texts = [doc.page_content for doc in documents]

        self.collection.add(
            ids=ids,
            metadatas=metadatas,
            documents=texts,
            embeddings=np.asarray(embeddings).tolist(),  ## ChromaDB requires list of lists
        )
        print(f"Added {len(documents)} documents to the collection {self.collection_name}.")





In [None]:
# Directory holding the source PDF files.
PDF_DIR = "../data/pdfFiles"

# Build the embedder, then load, chunk and embed the PDFs found in PDF_DIR.
embedding_model = EmbeddingModel()
chunks, embeddings_ = embedding_model.chunking(PDF_DIR)

# Open the persistent vector store and add the chunks with their embeddings.
vectorStore = VectorDB()
vectorStore.add_documents(chunks, embeddings_)

NameError: name 'EmbeddingModel' is not defined

Retriever

In [None]:
class ChromaRetriever:
    """Retriever for querying documents stored in ChromaDB"""

    def __init__(self, vector_store, top_k: int = 5, embedding_model: Any = None):
        """
        Args:
            vector_store: Instance of VectorDB (ChromaDB wrapper); its
                ``collection`` attribute is queried.
            top_k: Number of top results to retrieve
            embedding_model: Optional object exposing
                ``batch_embed_texts(list_of_texts)``. When given, queries are
                embedded with it and sent as ``query_embeddings``; when omitted,
                the raw text is sent as ``query_texts`` and embedding is left
                to the collection.
        """
        self.vector_store = vector_store
        self.top_k = top_k
        self.embedding_model = embedding_model

    def retrieve(self, query: str):
        """Retrieve top-k documents for a query string

        Args:
            query: The search text.

        Returns:
            The result object returned by the collection's ``query`` call.

        Raises:
            ValueError: If the vector store has no collection.
        """
        collection = self.vector_store.collection
        if collection is None:
            raise ValueError("ChromaDB collection not initialized.")

        if self.embedding_model is not None:
            # Embed the query with the supplied model so that query vectors
            # come from the same model that embedded the stored documents.
            query_vectors = self.embedding_model.batch_embed_texts([query])
            return collection.query(
                query_embeddings=np.asarray(query_vectors).tolist(),
                n_results=self.top_k,
            )

        return collection.query(
            query_texts=[query],  # the search query
            n_results=self.top_k,  # number of documents to return
        )


In [None]:
retriever = ChromaRetriever(vectorStore, top_k=3)

# 4. Query
query = "Write your query here"
# batch_embed_texts expects a list of texts. Passing the bare string would
# slice it into 32-character pieces and embed each piece as a separate text.
embed_query = embedding_model.batch_embed_texts([query])
# NOTE: retrieve() is given the raw text below, so embed_query is not consumed
# by this cell.
results = retriever.retrieve(query)

# 5. Print results
for doc, meta in zip(results["documents"][0], results["metadatas"][0]):
    print(f"Content: {doc[:200]}...\nMetadata: {meta}\n")