In [12]:
### data ingestion to vector db pipeline
import os
from langchain_community.document_loaders import PyPDFLoader,PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pathlib import Path

In [13]:
###read all pdf  inside the directory
def process_all_pdfs(pdf_directory):
    all_documents=[]
    pdf_dir=Path(pdf_directory)

    ###find all pdf files recursively
    pdf_files=list(pdf_dir.glob("**/*.pdf"))
    print(f"found{len(pdf_files)} pdf filees to process")

    for pdf_file in pdf_files:
        print(f"\n processing:{pdf_file.name}")
        try:
            loader=PyMuPDFLoader(str(pdf_file))
            documents=loader.load()

            ###add source info to meta
            for doc in documents:
                doc.metadata['source_file']=pdf_file.name
                doc.metadata['file_type']='pdf'

            all_documents.extend(documents)
            print(f"loaded  {len(documents)} pages")
        
        except Exception as e:
            print(f"error:{e}")
    
    print(f"\ntotal documents loaded:{len(all_documents)}")
    return all_documents

### process all
all_pdf_documents=process_all_pdfs("../data")



found4 pdf filees to process

 processing:Building_Machine_Learning_Powered_Applications_Going_From_Idea_to.pdf
loaded  303 pages

 processing:Rich-Dad-Poor-Dad.pdf
loaded  241 pages

 processing:sample.pdf
loaded  280 pages

 processing:Sorcerer's Stone.pdf
loaded  250 pages

total documents loaded:1074


In [14]:
all_pdf_documents

[Document(metadata={'producer': 'Microsoft® Word 2013', 'creator': 'Microsoft® Word 2013', 'creationdate': '2020-01-22T18:20:50+03:00', 'source': '..\\data\\pdf\\Building_Machine_Learning_Powered_Applications_Going_From_Idea_to.pdf', 'file_path': '..\\data\\pdf\\Building_Machine_Learning_Powered_Applications_Going_From_Idea_to.pdf', 'total_pages': 303, 'format': 'PDF 1.5', 'title': '', 'author': 'artemenovs', 'subject': '', 'keywords': '', 'moddate': '2020-01-22T18:20:50+03:00', 'trapped': '', 'modDate': "D:20200122182050+03'00'", 'creationDate': "D:20200122182050+03'00'", 'page': 0, 'source_file': 'Building_Machine_Learning_Powered_Applications_Going_From_Idea_to.pdf', 'file_type': 'pdf'}, page_content=''),
 Document(metadata={'producer': 'Microsoft® Word 2013', 'creator': 'Microsoft® Word 2013', 'creationdate': '2020-01-22T18:20:50+03:00', 'source': '..\\data\\pdf\\Building_Machine_Learning_Powered_Applications_Going_From_Idea_to.pdf', 'file_path': '..\\data\\pdf\\Building_Machine_Le

In [15]:
###text spliting and get chunks
def split_documents(documents,chunk_size=1000,chunk_overlap=200):
    text_splitter=RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=["\n\n","\n"," ",""]
    )
    split_docs=text_splitter.split_documents(documents)
    print(f"split {len(documents)} documents into {len(split_docs)} chunks")

    ###example
    if split_docs:
        print(f"\n example chunk:")
        print(f"\ncontent:{split_docs[0].page_content[:200]}...")
        print(f"\nmetadata: {split_docs[0]}.metadata")
    
    return split_docs


In [16]:
chunks=split_documents(all_pdf_documents)
chunks

split 1074 documents into 2442 chunks

 example chunk:

content:Building Machine Learning Powered 
Applications 
Going from Idea to Product 
Emmanuel Ameisen...

metadata: page_content='Building Machine Learning Powered 
Applications 
Going from Idea to Product 
Emmanuel Ameisen' metadata={'producer': 'Microsoft® Word 2013', 'creator': 'Microsoft® Word 2013', 'creationdate': '2020-01-22T18:20:50+03:00', 'source': '..\\data\\pdf\\Building_Machine_Learning_Powered_Applications_Going_From_Idea_to.pdf', 'file_path': '..\\data\\pdf\\Building_Machine_Learning_Powered_Applications_Going_From_Idea_to.pdf', 'total_pages': 303, 'format': 'PDF 1.5', 'title': '', 'author': 'artemenovs', 'subject': '', 'keywords': '', 'moddate': '2020-01-22T18:20:50+03:00', 'trapped': '', 'modDate': "D:20200122182050+03'00'", 'creationDate': "D:20200122182050+03'00'", 'page': 1, 'source_file': 'Building_Machine_Learning_Powered_Applications_Going_From_Idea_to.pdf', 'file_type': 'pdf'}.metadata


[Document(metadata={'producer': 'Microsoft® Word 2013', 'creator': 'Microsoft® Word 2013', 'creationdate': '2020-01-22T18:20:50+03:00', 'source': '..\\data\\pdf\\Building_Machine_Learning_Powered_Applications_Going_From_Idea_to.pdf', 'file_path': '..\\data\\pdf\\Building_Machine_Learning_Powered_Applications_Going_From_Idea_to.pdf', 'total_pages': 303, 'format': 'PDF 1.5', 'title': '', 'author': 'artemenovs', 'subject': '', 'keywords': '', 'moddate': '2020-01-22T18:20:50+03:00', 'trapped': '', 'modDate': "D:20200122182050+03'00'", 'creationDate': "D:20200122182050+03'00'", 'page': 1, 'source_file': 'Building_Machine_Learning_Powered_Applications_Going_From_Idea_to.pdf', 'file_type': 'pdf'}, page_content='Building Machine Learning Powered \nApplications \nGoing from Idea to Product \nEmmanuel Ameisen'),
 Document(metadata={'producer': 'Microsoft® Word 2013', 'creator': 'Microsoft® Word 2013', 'creationdate': '2020-01-22T18:20:50+03:00', 'source': '..\\data\\pdf\\Building_Machine_Learnin

dbcvcvb

In [17]:
import numpy as np
import os
from sentence_transformers import SentenceTransformer ###embiding by hugging_face
import chromadb
from chromadb.config import Settings
import uuid ### for vectordb id 
from typing import List,Dict,Any,Tuple
from sklearn.metrics.pairwise import cosine_similarity

In [18]:
class EmbeddingManager:

    def __init__(self,model_name:str="all-MiniLM-L6-v2"):
        """ 
        Initialize the embading manager
        """
        self.model_name=model_name
        self.model =None
        self._load_model()

    def _load_model(self):
        try:
            print (f"loading embedding model:{self.model_name}")
            self.model= SentenceTransformer(self.model_name)
            print(f"Model Loaded successfully.\nembedding diamension:{self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"error loading model{self.model_name}:{e}")
            raise

    def generate_embeddings(self,texts:List[str])->np.ndarray:
        
        if not self.model:
            raise ValueError("Model not loaded")
        print(f"Generating embedding for {len(texts)} texts...")
        embeddings=self.model.encode(texts,show_progress_bar=True)
        print(f"Generated embeddings with shape:{embeddings.shape}")
        return embeddings
    


### intialize embedding manager
embedding_manager=EmbeddingManager()
embedding_manager

loading embedding model:all-MiniLM-L6-v2
Model Loaded successfully.
embedding diamension:384


<__main__.EmbeddingManager at 0x1f19f21e8d0>

In [19]:
###vector storage
class VectorStore:

    def __init__(self,collection_name:str="pdf_documents",persist_directory: str="../data/vector_store"):
        self.collection_name=collection_name 
        self.persist_directory=persist_directory
        self.client =None
        self.collection=None
        self._initialize_store()
        
    def _initialize_store(self):

        try:
            os.makedirs(self.persist_directory,exist_ok=True)
            self.client=chromadb.PersistentClient(path=self.persist_directory)

            self.collection=self.client.get_or_create_collection(
                name=self.collection_name,
                metadata={"description":"PDF document embedding for RAG"}
            )
            print(f"vector store initialized.\nCollection:{self.collection_name}")
            print(f"existing document in colletion:{self.collection.count()}")

        except Exception as e:
            print(f"Error initializing vector store:{e}")
            raise

    def add_documents(self,documents:List[Any],embeddings:np.ndarray):

        if len(documents) != len(embeddings):
            raise ValueError("number of documents must match number of embeddings")
        
        print(f"adding{len(documents)}document to vector store...")

        ###prepare data for chromadb
        ids=[]
        metadatas=[]
        documents_text=[]
        embedding_list=[]

        for i, (doc,embedding) in enumerate(zip(documents,embeddings)):
            doc_id=f"doc_{uuid.uuid4().hex[:8]}_{i}"
            ids.append(doc_id)
            ### prepare meta data
            metadata = dict(doc.metadata)
            metadata['doc_index']=i
            metadata['content_length']=len(doc.page_content)
            metadatas.append(metadata)

            ###document content
            documents_text.append(doc.page_content)

            ###embedding
            embedding_list.append(embedding.tolist())

        ###add to collection
        try:
            self.collection.add(
                ids=ids,
                embeddings=embedding_list,
                metadatas=metadatas,
                documents=documents_text
            )

        except Exception as e:
            print(f"error adding documents to vector store:{e}")
            raise
vectorstore=VectorStore()
vectorstore


vector store initialized.
Collection:pdf_documents
existing document in colletion:3742


<__main__.VectorStore at 0x1f19f277110>

cvbxbcvbx

In [20]:
chunks

[Document(metadata={'producer': 'Microsoft® Word 2013', 'creator': 'Microsoft® Word 2013', 'creationdate': '2020-01-22T18:20:50+03:00', 'source': '..\\data\\pdf\\Building_Machine_Learning_Powered_Applications_Going_From_Idea_to.pdf', 'file_path': '..\\data\\pdf\\Building_Machine_Learning_Powered_Applications_Going_From_Idea_to.pdf', 'total_pages': 303, 'format': 'PDF 1.5', 'title': '', 'author': 'artemenovs', 'subject': '', 'keywords': '', 'moddate': '2020-01-22T18:20:50+03:00', 'trapped': '', 'modDate': "D:20200122182050+03'00'", 'creationDate': "D:20200122182050+03'00'", 'page': 1, 'source_file': 'Building_Machine_Learning_Powered_Applications_Going_From_Idea_to.pdf', 'file_type': 'pdf'}, page_content='Building Machine Learning Powered \nApplications \nGoing from Idea to Product \nEmmanuel Ameisen'),
 Document(metadata={'producer': 'Microsoft® Word 2013', 'creator': 'Microsoft® Word 2013', 'creationdate': '2020-01-22T18:20:50+03:00', 'source': '..\\data\\pdf\\Building_Machine_Learnin

In [21]:
### convert the text to embadings
texts=[doc.page_content for doc in chunks ]
texts

['Building Machine Learning Powered \nApplications \nGoing from Idea to Product \nEmmanuel Ameisen',
 'Building Machine Learning Powered \nApplications \nby Emmanuel Ameisen \nCopyright © 2020 Emmanuel Ameisen. All rights reserved. \nPrinted in the United States of America. \nPublished by O’Reilly Media, Inc., 1005 Gravenstein Highway North, \nSebastopol, CA 95472. \nO’Reilly books may be purchased for educational, business, or sales \npromotional use. Online editions are also available for most titles \n(http://oreilly.com). For more information, contact our \ncorporate/institutional sales department: 800-998-9938 \nor corporate@oreilly.com. \n\uf0b7 Acquisitions Editor: Jonathan Hassell \n\uf0b7 Development Editor: Melissa Potter \n\uf0b7 Production Editor: Deborah Baker \n\uf0b7 Copyeditor: Kim Wimpsett \n\uf0b7 Proofreader: Christina Edwards \n\uf0b7 Indexer: Judith McConville \n\uf0b7 Interior Designer: David Futato \n\uf0b7 Cover Designer: Karen Montgomery \n\uf0b7 Illustrator: R

In [22]:
### convert the text to embadings
texts=[doc.page_content for doc in chunks ]
texts

###generaate embaddings
embeddings=embedding_manager.generate_embeddings(texts)

###store in the vector database
vectorstore.add_documents(chunks,embeddings)

Generating embedding for 2442 texts...


Batches: 100%|██████████| 77/77 [01:20<00:00,  1.05s/it]


Generated embeddings with shape:(2442, 384)
adding2442document to vector store...


In [23]:
###retriver pipeline from vector store
class RAGRetriever:

    def __init__(self,vector_store:VectorStore,embedding_manager:EmbeddingManager):
        
        self.vector_store =vector_store
        self.embedding_manager=embedding_manager

    def retrieve(self,query:str,top_k:int =5,score_threshold: float=0.0)->List[dict[str,Any]]:

        print(f"retrieving documents from query:'{query}'")
        print(f"top k:{top_k}, score threshold:{score_threshold}")

        ##generate query embedding
        query_embedding=self.embedding_manager.generate_embeddings([query])[0]

        ###search the vectior store
        try:
            results = self.vector_store.collection.query(
                query_embeddings=[query_embedding.tolist()],
                n_results=top_k
            )

            ### process results
            retrieved_docs=[]

            if results['documents']and results['documents'][0]:
                documents=results['documents'][0]
                metadatas=results['metadatas'][0]
                distances=results['distances'][0]
                ids=results['ids'][0]

                for i,(doc_id,document,metadata,distance) in enumerate(zip(ids,documents,metadatas,distances)):
                    similarity_score=1-distance
                    if similarity_score >=score_threshold:
                        retrieved_docs.append({
                            'id':doc_id,
                            'content':document,
                            'metadata':metadata,
                            'similarity_score':similarity_score,
                            'rank':i+1
                        })
                print(f"retrived {len(retrieved_docs)} documents (after filtering )")
            else:
                print("no document found")

            return retrieved_docs
        except Exception as e:
            print(f"error during retrieval: {e}")
            return []
        
rag_retriever=RAGRetriever(vectorstore,embedding_manager)


In [24]:
rag_retriever


<__main__.RAGRetriever at 0x1f19f956a10>

In [25]:
rag_retriever.retrieve('What is the difference between assets and liabilities, and what does Rich Dad describe as a true asset?')

retrieving documents from query:'What is the difference between assets and liabilities, and what does Rich Dad describe as a true asset?'
top k:5, score threshold:0.0
Generating embedding for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00, 72.10it/s]

Generated embeddings with shape:(1, 384)
retrived 5 documents (after filtering )





[{'id': 'doc_93d23008_833',
  'content': 'person, it makes no sense. But we \nadults are often too proud to admit that \nsomething does not make sense.\nTo us young boys, rich dad said, “What defines an asset are not \nwords, but numbers. And if you can’t read the numbers, you can’t tell \nan asset from a hole in the ground.” “In accounting,” rich dad would \nsay, “it’s not the numbers, but what the numbers are telling you. It’s just \nlike words. It’s not the words, but the story the words are telling you.”\n“If you want to be rich, you’ve got to read and understand \nnumbers.” If I heard that once, I heard it a thousand times from my \nrich dad. And I also heard, “The rich acquire assets, and the poor and \nmiddle class acquire liabilities.”\nHere is how to tell the difference between an asset and a liability. \nMost accountants and financial professionals do not agree with \nthe definitions, but these simple drawings were the start of strong \nfinancial foundations for two young boy

In [None]:
### simple rag pipeline with groq llm
from langchain_groq import ChatGroq

import os
from dotenv import load_dotenv
load_dotenv()

### initialize groq llm
###groq_api_key=os.getenv("api")
groq_api_key = "YOUR_API_KEY_HERE""

llm = ChatGroq(
    groq_api_key=groq_api_key,
    model_name="llama-3.1-8b-instant",
    temperature=0.1,
    max_tokens=1024
)


### simple rag func
def rag_simple(query,retriever,llm,top_k=3):

    results=retriever.retrieve(query,top_k=top_k)
    context="\n\n".join([doc['content'] for doc in results]) if results else ""

    if not context:
        return "no relevent context found to answer the ques"
    
    ###generate answer using the groq llm
    promt=f"""use the following context to answer the question concisely.
        context:
        {context}

        question:{query}

        Answer:"""
    
    response = llm.invoke(promt.format(context=context, query=query))

    return response.content

In [35]:
answer= rag_simple("what rich da said",rag_retriever,llm)
print(answer)

retrieving documents from query:'what rich da said'
top k:3, score threshold:0.0
Generating embedding for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00, 111.12it/s]

Generated embeddings with shape:(1, 384)
retrived 2 documents (after filtering )





Rich dad said that a human's life is a struggle between ignorance and illumination, and that school is very important to learn a skill or profession, but for many people, school is the end, not the beginning.
