### RAG pipeline - Data Injestion --> Vector DB

In [1]:

from langchain_community.document_loaders import DirectoryLoader, PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
import os
from pathlib import Path

In [None]:
#read all the pdfs in pdf directory
def doc_reader(document_directory):
    pdf_dir=Path(document_directory)
    all_doc=[]
    pdf_files=list(pdf_dir.glob("**/*.pdf"))
    
    for file in pdf_files:
        print(f"Processing file name:{file.name}")
        try:
            
            pdf_loader=PyMuPDFLoader(str(file))
            document=pdf_loader.load()
            
            for doc in document:
                doc.metadata['source']=file.name
                doc.metadata['filetype']='pdf'
                            
            all_doc.extend(document)
            print(f"{len(document)} documents added")
        except Exception as e:
            print(f"ERROR: {e}")
    
    print(f"Total Loaded Documents:{len(all_doc)}")
    return all_doc
    
        
    
    
    
    
    # return document.load()
if __name__=="__main__":
    all_pdf_documents=doc_reader("../data/pdf_files")

Processing file name:attention is all you need.pdf
15 documents added
Processing file name:DisasterDataSet_Paper.pdf
16 documents added
Total Loaded Documents:31


In [3]:
all_pdf_documents

[Document(metadata={'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2024-04-10T21:11:43+00:00', 'source': 'attention is all you need.pdf', 'file_path': '..\\data\\pdf_files\\attention is all you need.pdf', 'total_pages': 15, 'format': 'PDF 1.5', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2024-04-10T21:11:43+00:00', 'trapped': '', 'modDate': 'D:20240410211143Z', 'creationDate': 'D:20240410211143Z', 'page': 0, 'filetype': 'pdf'}, page_content='Provided proper attribution is provided, Google hereby grants permission to\nreproduce the tables and figures in this paper solely for use in journalistic or\nscholarly works.\nAttention Is All You Need\nAshish Vaswani∗\nGoogle Brain\navaswani@google.com\nNoam Shazeer∗\nGoogle Brain\nnoam@google.com\nNiki Parmar∗\nGoogle Research\nnikip@google.com\nJakob Uszkoreit∗\nGoogle Research\nusz@google.com\nLlion Jones∗\nGoogle Research\nllion@google.com\nAidan N. Gomez∗†\nUniversity of Toronto\nai

In [4]:
# Text Split into Chunks

def splitter_chunker(document,chunk_size=1000,chunk_overlap=200):
    text_splitter=RecursiveCharacterTextSplitter(
        chunk_overlap=chunk_overlap,
        chunk_size=chunk_size,
        length_function=len,
        separators=["\n\n"," ",""],
        )
    
    split_doc=text_splitter.split_documents(document)
    print(f"splitted {len(document)} documents into {len(split_doc)} chunks")
    print(split_doc)
    if split_doc:
        print("\n Example Chunk")
        print(f"Content: {split_doc[1].page_content[:200]}\n MetaData: {split_doc[1].metadata['source']} \n{split_doc[99].page_content[:200]} \n MetaData: {split_doc[99].metadata['source']}")
    return split_doc

In [5]:
chunks=splitter_chunker(all_pdf_documents)

splitted 31 documents into 100 chunks
[Document(metadata={'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2024-04-10T21:11:43+00:00', 'source': 'attention is all you need.pdf', 'file_path': '..\\data\\pdf_files\\attention is all you need.pdf', 'total_pages': 15, 'format': 'PDF 1.5', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2024-04-10T21:11:43+00:00', 'trapped': '', 'modDate': 'D:20240410211143Z', 'creationDate': 'D:20240410211143Z', 'page': 0, 'filetype': 'pdf'}, page_content='Provided proper attribution is provided, Google hereby grants permission to\nreproduce the tables and figures in this paper solely for use in journalistic or\nscholarly works.\nAttention Is All You Need\nAshish Vaswani∗\nGoogle Brain\navaswani@google.com\nNoam Shazeer∗\nGoogle Brain\nnoam@google.com\nNiki Parmar∗\nGoogle Research\nnikip@google.com\nJakob Uszkoreit∗\nGoogle Research\nusz@google.com\nLlion Jones∗\nGoogle Research\nllion@google.com\nAidan

**Embedding and VectorStoreDB**

In [11]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
from langchain_community.embeddings import HuggingFaceEmbeddings
import uuid
from typing import List, Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity


In [14]:
class EmbeddingManager:
    def __init__(self, model_name="all-MiniLM-L6-v2"):
        #model_name= hugging face model name
        
        self.model_name=model_name
        self.model=None
        self._load_model()
        
    def _load_model(self) :
        try:
            print(f"Loading Embedding Model:{self.model_name}")
            self.model=SentenceTransformer(self.model_name)
            print(f"Model Loaded Successfully, Embedding Dimensions {self.model.get_sentence_embedding_dimension()}")
            
        except Exception as e:
            print(f"Error Loading {self.model}: {e}")
            
    def generate_embeddings(self,texts:List[str]) -> np.ndarray:
        '''apply model on the text
        Args:
            texts:List of Strings to be embedded
        returns:
            numpy array of embeddings with shape (len(texts),embedding_dim())
            
        '''
        if not self.model:
            raise ValueError("Model didnt Load")
        
        print(f"Generating Embeddings for {len(texts)} texts..")
        embeddings=self.model.encode(
            sentences=texts,
            show_progress_bar=True,
            )
        print(f"Embedding Sucessfully Performed on {len(texts)} texts{np.ndim(embeddings)} ")
        return embeddings
    
    ## initialize the embeddings manager
    
embedding_manager=EmbeddingManager()
embedding_manager
    

Loading Embedding Model:all-MiniLM-L6-v2
Model Loaded Successfully, Embedding Dimensions 384


<__main__.EmbeddingManager at 0x1e6f081c370>

VectoreStore

In [50]:
class VectorStore:
    def __init__(self, collection_name:str="pdf_documents", persist_directory:str="../data/vector_store"):
        self.collection_name=collection_name
        self.persist_directory=persist_directory
        self.client=None
        self.collection=None
        self._initialize_store()
    
    def _initialize_store(self):
        '''Initialize ChromaDB client with the collection'''
        try:
            os.makedirs(self.persist_directory,exist_ok=True)
            self.client=chromadb.PersistentClient(path=self.persist_directory)
            
            self.collection= self.client.get_or_create_collection(
                name=self.collection_name,
                metadata={"description":'Pdf document Embeddings for RAG'}
            )
            print(f"VectorStore Initialized. Collection:{self.collection_name}")
            print(f"existing documents in collection:{self.collection.count()}")
        except Exception as e:
            print("Error initializing vectore store")
            raise 
        
    def add_documents(self, documents:List[Any], embeddings:np.ndarray):
        '''Add documents and their embeddings in the vectore store'''
        if len(documents)!=len(embeddings):
            raise ValueError("Number of documents should be equal to number of embeddings")
        
        print(f"Adding {len(documents)} into the vector store at {self.persist_directory} ...")
        
        '''prepare the data for chromadb'''
        
        
        ids=[]
        metadatas=[]
        documents_text=[]
        embeddings_list=[]
        
        for i,(doc,emb) in enumerate(zip(documents,embeddings)):
            #Generate Random Uid
            
            doc_id=f"doc_{uuid.uuid4().hex[:8]}_{i}"
            ids.append(doc_id)
            
            #Prepare Metadata
            
            metadata=dict(doc.metadata)
            metadata['doc_index']=i
            metadata['content_length']=len(doc.page_content)
            metadatas.append(metadata)
            documents_text.append(doc.page_content)
            embeddings_list.append(emb.tolist())
            
        try:
            self.collection.add(
                ids=ids,
                embeddings=embeddings_list,
                metadatas=metadatas,
                documents=documents_text
            )
            print(f"succesfull added data of length {len(documents)} in collection to vector store")
            print(f"total documents in collection {self.collection.count()}")
            
        except Exception as e:
            print("Error adding documents to vector store")
            raise
        
        
vector_store=VectorStore()
vector_store
            
        
        
        

VectorStore Initialized. Collection:pdf_documents
existing documents in collection:0


<__main__.VectorStore at 0x1e68539bf40>

Converting Chunks to generate Embeddings

In [None]:
#Example Chunks
chunks[:20]

[Document(metadata={'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2024-04-10T21:11:43+00:00', 'source': 'attention is all you need.pdf', 'file_path': '..\\data\\pdf_files\\attention is all you need.pdf', 'total_pages': 15, 'format': 'PDF 1.5', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2024-04-10T21:11:43+00:00', 'trapped': '', 'modDate': 'D:20240410211143Z', 'creationDate': 'D:20240410211143Z', 'page': 0, 'filetype': 'pdf'}, page_content='Provided proper attribution is provided, Google hereby grants permission to\nreproduce the tables and figures in this paper solely for use in journalistic or\nscholarly works.\nAttention Is All You Need\nAshish Vaswani∗\nGoogle Brain\navaswani@google.com\nNoam Shazeer∗\nGoogle Brain\nnoam@google.com\nNiki Parmar∗\nGoogle Research\nnikip@google.com\nJakob Uszkoreit∗\nGoogle Research\nusz@google.com\nLlion Jones∗\nGoogle Research\nllion@google.com\nAidan N. Gomez∗†\nUniversity of Toronto\nai

In [51]:
#convert text to embeddings
# texts=[doc.page_content for doc in chunks]

'''OR'''

texts=[]
for doc in chunks:
    texts.append(doc.page_content)
    
print(f"example of text list size: {len(texts)} before embedding:\n\n"+"\n".join(texts[:3]))


embeddings=embedding_manager.generate_embeddings(texts=texts)

vector_store.add_documents(documents=chunks,embeddings=embeddings)


example of text list size: 100 before embedding:

Provided proper attribution is provided, Google hereby grants permission to
reproduce the tables and figures in this paper solely for use in journalistic or
scholarly works.
Attention Is All You Need
Ashish Vaswani∗
Google Brain
avaswani@google.com
Noam Shazeer∗
Google Brain
noam@google.com
Niki Parmar∗
Google Research
nikip@google.com
Jakob Uszkoreit∗
Google Research
usz@google.com
Llion Jones∗
Google Research
llion@google.com
Aidan N. Gomez∗†
University of Toronto
aidan@cs.toronto.edu
Łukasz Kaiser∗
Google Brain
lukaszkaiser@google.com
Illia Polosukhin∗‡
illia.polosukhin@gmail.com
Abstract
The dominant sequence transduction models are based on complex recurrent or
convolutional neural networks that include an encoder and a decoder. The best
performing models also connect the encoder and decoder through an attention
mechanism. We propose a new simple network architecture, the Transformer,
based solely on attention mechanisms, dispensin

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches: 100%|██████████| 4/4 [00:03<00:00,  1.10it/s]


Embedding Sucessfully Performed on 100 texts2 
Adding 100 into the vector store at ../data/vector_store ...
succesfull added data of length 100 in collection to vector store
total documents in collection 100


### Retrieval

In [None]:
class RAGRetriever:
      """Query based retriever from vector store"""
      def __init__(self,vector_store:VectorStore,embedding_manager:EmbeddingManager):
          """
          initialize retriver with
          Args:
            vector_store:Vector store containing document text and embeddings
            embedding_manager: Manager for generating embeddings
          
          """
          self.vector_store=vector_store
          self.embedding_manager=embedding_manager
          
      def retrive(self, query:str, top_k:int = 5, score_threshold:float=0.0 )->List[Dict[str,Any]]:
          """Retrieve relavant docs for query

          Args:
              query (str): search query
              top_k (int, optional): _description_. Defaults to 5 : Number of top results to return
              score_threshold (float, optional): _description_. Defaults to 0.0 :minimum similarity score threshold
              

          Returns:
              List[Dict[str,Any]]: List of dictionaries containing retrived documents and metadata
              
          """
          
      
          
          
          
          
            