### Data Ingestion 
### Document Structure

In [88]:
from langchain_core.documents import Document

In [89]:
doc=Document(
    page_content="This is the main content I am using to create RAG",
    metadata={
        "source":"example.txt",
        "pages":1,
        "author":"Vaishnavi Sharma",
        "date_created":"2025-08-01"
    }
)
doc

Document(metadata={'source': 'example.txt', 'pages': 1, 'author': 'Vaishnavi Sharma', 'date_created': '2025-08-01'}, page_content='This is the main content I am using to create RAG')

In [90]:
## create a simple txt dir inside data

import os
os.makedirs("../data/text_files",exist_ok=True)

In [91]:
sample_texts={
    "../data/text_files/langchain.txt":"""Langchain Introduction
LangChain is a powerful framework designed to help developers build applications with large language models (LLMs). It enables seamless integration of LLMs with data sources such as documents, APIs, and databases.

With LangChain, you can implement Retrieval-Augmented Generation (RAG), which combines document retrieval with LLMs to produce accurate and context-aware answers.

This framework supports various document loaders, text splitting, vector stores, and chaining methods to create sophisticated NLP applications.
""",


 "../data/text_files/ml_basics.txt":"""Machine Learning (ML) is a field of artificial intelligence that allows computers to learn patterns from data and make decisions without being explicitly programmed.
 
It includes supervised learning, unsupervised learning, and reinforcement learning. 

Popular ML libraries include scikit-learn, TensorFlow, and PyTorch.

"""
}

for filepath,content in sample_texts.items():
    with open(filepath,'w',encoding="utf-8") as f:
        f.write(content)

print("Sample text file created")

Sample text file created


In [92]:
### Text Loader
from langchain.document_loaders import TextLoader

loader=TextLoader("../data/text_files/langchain.txt",encoding="utf-8")
### reads the file and returns a list of Document objects
document=loader.load()
print(document)


[Document(metadata={'source': '../data/text_files/langchain.txt'}, page_content='Langchain Introduction\nLangChain is a powerful framework designed to help developers build applications with large language models (LLMs). It enables seamless integration of LLMs with data sources such as documents, APIs, and databases.\n\nWith LangChain, you can implement Retrieval-Augmented Generation (RAG), which combines document retrieval with LLMs to produce accurate and context-aware answers.\n\nThis framework supports various document loaders, text splitting, vector stores, and chaining methods to create sophisticated NLP applications.\n')]


In [93]:
### Directory Loader
from langchain.document_loaders import DirectoryLoader

## Load all the text files from directory
dir_loader=DirectoryLoader(
    "../data/text_files",
    glob="**/*.txt", ## Pattern to match files
    loader_cls= TextLoader, ##Loader class to use
    loader_kwargs={'encoding':'utf-8'},
    show_progress=False
)

documents=dir_loader.load()
documents

[Document(metadata={'source': '..\\data\\text_files\\langchain.txt'}, page_content='Langchain Introduction\nLangChain is a powerful framework designed to help developers build applications with large language models (LLMs). It enables seamless integration of LLMs with data sources such as documents, APIs, and databases.\n\nWith LangChain, you can implement Retrieval-Augmented Generation (RAG), which combines document retrieval with LLMs to produce accurate and context-aware answers.\n\nThis framework supports various document loaders, text splitting, vector stores, and chaining methods to create sophisticated NLP applications.\n'),
 Document(metadata={'source': '..\\data\\text_files\\ml_basics.txt'}, page_content='Machine Learning (ML) is a field of artificial intelligence that allows computers to learn patterns from data and make decisions without being explicitly programmed.\n\nIt includes supervised learning, unsupervised learning, and reinforcement learning. \n\nPopular ML librarie

In [94]:
from langchain_community.document_loaders import PyMuPDFLoader, DirectoryLoader
## Load all the pdf files from dir
dir_loader=DirectoryLoader( 
    "../data/pdf", 
    glob="**/*.pdf", 
    loader_cls= PyMuPDFLoader, 
    show_progress=False 
) 

pdf_documents=dir_loader.load() 
pdf_documents



[Document(metadata={'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2024-10-18T00:29:28+00:00', 'source': '..\\data\\pdf\\llm.pdf', 'file_path': '..\\data\\pdf\\llm.pdf', 'total_pages': 47, 'format': 'PDF 1.5', 'title': 'A Comprehensive Overview of Large Language Models', 'author': 'Humza Naveed; Asad Ullah Khan; Shi Qiu; Muhammad Saqib; Saeed Anwar; Muhammad Usman; Naveed Akhtar; Nick Barnes; Ajmal Mian;', 'subject': '', 'keywords': '', 'moddate': '2024-10-18T00:29:28+00:00', 'trapped': '', 'modDate': 'D:20241018002928Z', 'creationDate': 'D:20241018002928Z', 'page': 0}, page_content='A Comprehensive Overview of Large Language Models\nHumza Naveeda, Asad Ullah Khanb,∗, Shi Qiuc,∗, Muhammad Saqibd,e,∗, Saeed Anwarf,g, Muhammad Usmanf,g, Naveed Akhtarh,j,\nNick Barnesi, Ajmal Mianj\naThe University of Sydney, Sydney, Australia\nbUniversity of Engineering and Technology (UET), Lahore, Pakistan\ncThe Chinese University of Hong Kong (CUHK), HKSAR, China\ndUn

In [95]:
type(pdf_documents[0])


langchain_core.documents.base.Document

In [96]:
# 2. Chunking function
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from typing import List

def chunk_documents(
    documents: List[Document],
    chunk_size: int = 500,
    chunk_overlap: int = 100
) -> List[Document]:
    """
    Split documents into smaller overlapping chunks for embedding.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", ".", " ", ""],
    )
    chunked_docs = text_splitter.split_documents(documents)
    print(f"Chunked {len(documents)} documents into {len(chunked_docs)} chunks.")
    return chunked_docs

# 3. Apply chunking to the loaded PDFs
chunked_documents = chunk_documents(pdf_documents)
chunked_documents

Chunked 47 documents into 703 chunks.


[Document(metadata={'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2024-10-18T00:29:28+00:00', 'source': '..\\data\\pdf\\llm.pdf', 'file_path': '..\\data\\pdf\\llm.pdf', 'total_pages': 47, 'format': 'PDF 1.5', 'title': 'A Comprehensive Overview of Large Language Models', 'author': 'Humza Naveed; Asad Ullah Khan; Shi Qiu; Muhammad Saqib; Saeed Anwar; Muhammad Usman; Naveed Akhtar; Nick Barnes; Ajmal Mian;', 'subject': '', 'keywords': '', 'moddate': '2024-10-18T00:29:28+00:00', 'trapped': '', 'modDate': 'D:20241018002928Z', 'creationDate': 'D:20241018002928Z', 'page': 0}, page_content='A Comprehensive Overview of Large Language Models\nHumza Naveeda, Asad Ullah Khanb,∗, Shi Qiuc,∗, Muhammad Saqibd,e,∗, Saeed Anwarf,g, Muhammad Usmanf,g, Naveed Akhtarh,j,\nNick Barnesi, Ajmal Mianj\naThe University of Sydney, Sydney, Australia\nbUniversity of Engineering and Technology (UET), Lahore, Pakistan\ncThe Chinese University of Hong Kong (CUHK), HKSAR, China\ndUn

In [97]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict, Tuple, Any
from sklearn.metrics.pairwise import cosine_similarity

In [98]:
class EmbeddingManager:
    """Handles document embedding generation using sentenceTransformer"""
    #It helps convert text into numerical vectors using a pre-trained model called all-MiniLM-L6-v2 from Hugging Face’s sentence-transformers library."""

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):  #This model will convert text into vector
        """
        Initialize the embedding manager

        Args:
        model_name: HuggingFace model name for sentence embeddings
        """
        self.model_name = model_name
        self.model = None
        self._load_model() ## It is going to load all-MiniLM-L6-v2

    def _load_model(self):
        """ Load the sentence transformer model """
        try:
            print(f"Loading embedding model: {self.model_name}")
            self.model = SentenceTransformer(self.model_name)
    
            ## Returns the **size of the vector - get_sentence_embedding_dimension. (e.g., 384 for MiniLM)
    
            print(f"Model loaded successfully. Embedding dimension: {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"Error Loading model {self.model_name}: {e}")
            raise #Rethrows the error


    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
        if not self.model:
           raise ValueError("Model not loaded")
    
        print(f"Generating embeddings for {len(texts)} texts...")
        embeddings = self.model.encode(texts, show_progress_bar=True)
        print(f"generated embeddings with shape: {embeddings.shape}")
        return embeddings
    
embedding_manager=EmbeddingManager()

Loading embedding model: all-MiniLM-L6-v2
Model loaded successfully. Embedding dimension: 384


In [99]:
### Vector Store


In [115]:
class VectorStore:
   def __init__(self, collection_name:str = "pdf_documents", persist_directory: str = "../data/vectore_store"):
      self.collection_name = collection_name
      self.persist_directory = persist_directory
      self.client = None
      self.collection = None
      self._initialize_store()

   def _initialize_store(self):
      # initialize chromadb client and collection
      try:
         os.makedirs(self.persist_directory, exist_ok=True)
         self.client = chromadb.PersistentClient(path=self.persist_directory)
   
         #Get or create collection
         self.collection = self.client.get_or_create_collection(
            name=self.collection_name,
            metadata={"description": "Pdf foc embedding for RAG"}
         )   
         print(f"Vector Store initialized. Collection: {self.collection_name}")
         print(f"Existing doc in collection: {self.collection_count()}")
      except Exception as e:
            print(f"Error initializing vector store: {e}")
            raise


   def collection_count(self) -> int:
      try:
            return self.collection.count()
      except Exception as e:
            print(f"Error getting collection count: {e}")
            return 0

   def add_documents(self, documents: List[Any], embeddings: np.ndarray):
    #Add docs and their embedding to the vector stores

    if len(documents) != len(embeddings):
        raise ValueError("No of docs must match number of embeddings")
    
    print(f"Adding {len(documents)} documents to vector store...")

    #Prepare data for chromadb

    ids = []
    metadatas = []
    documents_text = []
    embeddings_list = []

    for i, (doc, embedding) in enumerate(zip(document,embeddings)):
        # generate unique ID
        doc_id = f"doc_{uuid.uuid4().hex[:8]}_{i}"
        ids.append(doc_id)

        #prepare metadata
        metadata = dict(doc.metadata)
        metadata['doc_index'] = i
        metadata['content_length'] = len(doc.page_content)
        metadatas.append(metadata)

        #Document content
        documents_text.append(doc.page_content)

        #Embedding
        embeddings_list.append(embedding.tolist())

    #Add to collection
    try:
        self.collection.add(
            ids=ids,
            embeddings=embeddings_list,
            metadatas=metadata,
            documents=documents_text
        )

        print(f"Sucessfully added {len(documents)}documents to vectore store")
        print(f"Total doc in collection: {self.collection_count()}")

    except Exception as e:
         print(f"Error adding docs to vector store: {e}")
         raise
    
vectorestore=VectorStore()
vectorestore     

Vector Store initialized. Collection: pdf_documents
Existing doc in collection: 3


<__main__.VectorStore at 0x24ecea9f230>

In [116]:
chunked_documents

[Document(metadata={'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2024-10-18T00:29:28+00:00', 'source': '..\\data\\pdf\\llm.pdf', 'file_path': '..\\data\\pdf\\llm.pdf', 'total_pages': 47, 'format': 'PDF 1.5', 'title': 'A Comprehensive Overview of Large Language Models', 'author': 'Humza Naveed; Asad Ullah Khan; Shi Qiu; Muhammad Saqib; Saeed Anwar; Muhammad Usman; Naveed Akhtar; Nick Barnes; Ajmal Mian;', 'subject': '', 'keywords': '', 'moddate': '2024-10-18T00:29:28+00:00', 'trapped': '', 'modDate': 'D:20241018002928Z', 'creationDate': 'D:20241018002928Z', 'page': 0}, page_content='A Comprehensive Overview of Large Language Models\nHumza Naveeda, Asad Ullah Khanb,∗, Shi Qiuc,∗, Muhammad Saqibd,e,∗, Saeed Anwarf,g, Muhammad Usmanf,g, Naveed Akhtarh,j,\nNick Barnesi, Ajmal Mianj\naThe University of Sydney, Sydney, Australia\nbUniversity of Engineering and Technology (UET), Lahore, Pakistan\ncThe Chinese University of Hong Kong (CUHK), HKSAR, China\ndUn

In [117]:
# convert text to embedding
texts=[doc.page_content for doc in chunked_documents]
texts

#generate the embeddings
embeddings=embedding_manager.generate_embeddings(texts)

#store into vector database
vectorestore.add_documents(chunked_documents,embeddings)

print("Collection count:", vectorestore.collection_count())


Generating embeddings for 703 texts...


Batches: 100%|██████████| 22/22 [00:27<00:00,  1.25s/it]

generated embeddings with shape: (703, 384)
Adding 703 documents to vector store...
Sucessfully added 703documents to vectore store
Total doc in collection: 4
Collection count: 4





In [118]:
for doc in chunked_documents:
    print("----")
    print(doc.page_content[:500])  # Print first 500 characters of each chunk


----
A Comprehensive Overview of Large Language Models
Humza Naveeda, Asad Ullah Khanb,∗, Shi Qiuc,∗, Muhammad Saqibd,e,∗, Saeed Anwarf,g, Muhammad Usmanf,g, Naveed Akhtarh,j,
Nick Barnesi, Ajmal Mianj
aThe University of Sydney, Sydney, Australia
bUniversity of Engineering and Technology (UET), Lahore, Pakistan
cThe Chinese University of Hong Kong (CUHK), HKSAR, China
dUniversity of Technology Sydney (UTS), Sydney, Australia
----
dUniversity of Technology Sydney (UTS), Sydney, Australia
eCommonwealth Scientific and Industrial Research Organisation (CSIRO), Sydney, Australia
fKing Fahd University of Petroleum and Minerals (KFUPM), Dhahran, Saudi Arabia
gSDAIA-KFUPM Joint Research Center for Artificial Intelligence (JRCAI), Dhahran, Saudi Arabia
hThe University of Melbourne (UoM), Melbourne, Australia
iAustralian National University (ANU), Canberra, Australia
jThe University of Western Australia (UWA), Perth, Australia
----
jThe University of Western Australia (UWA), Perth, Australia
Abs

In [119]:
### Retriever Pipeline Frome VectorStore

In [120]:
class RAGRetriever:
    #Handles query based retrieval from vector store

    def __init__(self, vector_store: VectorStore, embedding_manager:EmbeddingManager):
        self.vector_store = vector_store
        self.embedding_manager = embedding_manager

    def retrieve(self, query: str, top_k: int = 5, score_threshold: float = 0.0) -> List[Dict[str,any]]:
        print(f"Retrieveing docs for query: '{query}'")
        print(f"Top k : {top_k}, score threshold: {score_threshold}")

        query_embedding = self.embedding_manager.generate_embeddings([query])[0]

    #Search in vector store
        try:
            results = self.vector_store.collection.query(
                query_embeddings=[query_embedding.tolist()],
                n_results=top_k
            )
    #Process results
            retrieved_docs=[]

            if results['documents'] and results['documents'][0]:
                documents = results['documents'][0]
                metadatas = results['metadatas'][0]
                distances = results['distances'][0]
                ids = results['ids'][0]

                for i, (doc_id, document, metadata, distance) in enumerate(zip(ids,documents,metadatas,distances)):

                    similarity_score = 1 -distance

                    if similarity_score>= score_threshold:
                        retrieved_docs.append({
                            'id': doc_id,
                            'content': document,
                            'metadata': metadata,
                            'similarity_score': similarity_score,
                            'distance': distance,
                            'rank': i + 1

                        })
                print(f"Retrieved {len(retrieved_docs)} documents (after filtering)")
            else:
                print("No documents found")

            return retrieved_docs
        
        except Exception as e:
            print(f"Error during retrieval: {e}")
            return[]
        
rag_retriever=RAGRetriever(vectorestore,embedding_manager)
                

In [121]:
rag_retriever

<__main__.RAGRetriever at 0x24ecea9f380>

In [123]:
rag_retriever.retrieve("large language models overview")

Retrieveing docs for query: 'large language models overview'
Top k : 5, score threshold: 0.0
Generating embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00, 38.83it/s]

generated embeddings with shape: (1, 384)
Retrieved 4 documents (after filtering)





[{'id': 'doc_3add7509_0',
  'content': 'Langchain Introduction\nLangChain is a powerful framework designed to help developers build applications with large language models (LLMs). It enables seamless integration of LLMs with data sources such as documents, APIs, and databases.\n\nWith LangChain, you can implement Retrieval-Augmented Generation (RAG), which combines document retrieval with LLMs to produce accurate and context-aware answers.\n\nThis framework supports various document loaders, text splitting, vector stores, and chaining methods to create sophisticated NLP applications.\n',
  'metadata': {'content_length': 546,
   'source': '../data/text_files/langchain.txt',
   'doc_index': 0},
  'similarity_score': 0.4313049912452698,
  'distance': 0.5686950087547302,
  'rank': 1},
 {'id': 'doc_e71dde46_0',
  'content': 'Langchain Introduction\nLangChain is a powerful framework designed to help developers build applications with large language models (LLMs). It enables seamless integrat