In [None]:
import os
from collections import defaultdict
from typing import List, Optional

from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader
from langchain_community.vectorstores import Chroma
from langchain_core.documents import Document
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

## Document Loader
- Load the PDFs from the data directory
- Split the loaded pages into overlapping chunks

In [34]:
class DocumentLoader:
    """Load the PDFs found in a directory and split them into text chunks.

    The instance keeps the results of the last run:

    - ``documents`` is whatever the last ``load_documents()`` call returned.
    - ``chunks`` is whatever the last ``split_documents()`` call returned.

    Both start out as empty lists.
    """

    # Number of characters of the first chunk printed as a preview.
    PREVIEW_CHARS = 200

    def __init__(self, data_dir: str, chunk_size: int = 800, chunk_overlap: int = 150):
        """Store the configuration; nothing is read from disk here.

        Args:
            data_dir: Directory that is searched for ``*.pdf`` files.
            chunk_size: Passed to the text splitter as ``chunk_size``.
            chunk_overlap: Passed to the text splitter as ``chunk_overlap``.
        """
        self.data_dir = data_dir
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

        self.documents: List[Document] = []
        self.chunks: List[Document] = []

    def load_documents(self) -> List[Document]:
        """Load every ``*.pdf`` in ``data_dir`` and print a per-file summary.

        Returns:
            The loaded documents (also stored in ``self.documents``). The
            summary labels each document as a page.
        """
        print("Loading documents...\n")

        loader = DirectoryLoader(
            path=self.data_dir,
            glob="*.pdf",
            loader_cls=PyPDFLoader,
        )
        self.documents = loader.load()

        print(f"Loaded {len(self.documents)} pages\n")

        # Only the count per source is reported, so count instead of keeping
        # lists. `.get` keeps the summary working for documents whose loader
        # did not record a "source" entry.
        pages_per_source = defaultdict(int)
        for doc in self.documents:
            pages_per_source[doc.metadata.get("source", "<unknown source>")] += 1

        print("PDF Summary:\n")
        for source, page_count in pages_per_source.items():
            print(f"{source} → {page_count} pages")

        return self.documents

    def split_documents(self) -> List[Document]:
        """Split ``self.documents`` into chunks and print a short preview.

        Returns:
            The chunks (also stored in ``self.chunks``). The list is empty
            when there is nothing to split, e.g. when ``load_documents()`` was
            not called or found no PDFs.
        """
        print("\nSplitting documents into chunks...\n")

        splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
            separators=["\n\n", "\n", ". ", " ", ""],
        )
        self.chunks = splitter.split_documents(self.documents)

        print(f"Created {len(self.chunks)} chunks\n")

        if self.chunks:
            print("Sample chunk:\n")
            print(self.chunks[0].page_content[:self.PREVIEW_CHARS])
            print("_____________________________________")
        else:
            # Indexing chunks[0] on an empty list would raise IndexError.
            print("No chunks to preview (were any documents loaded?)")

        return self.chunks

    def load_and_split(self) -> List[Document]:
        """Run ``load_documents()`` followed by ``split_documents()``.

        Returns:
            The chunks produced by ``split_documents()``.
        """
        self.load_documents()
        return self.split_documents()

In [35]:
# Directory that holds the source PDFs (relative to the notebook).
DATA_PATH = "../data"

# load_and_split() runs load_documents() and then split_documents(); the
# loaded pages stay available on the loader afterwards.
doc_loader = DocumentLoader(DATA_PATH)
chunks = doc_loader.load_and_split()
documents = doc_loader.documents

print(f"total no of pages/documents: {len(documents)}")
print(f"Total no. of chunks:{len(chunks)}")

Loading documents...

Loaded 69 pages

PDF Summary:

../data/Ethereum-whitepaper.pdf → 36 pages
../data/Blockchain_For_Beginners.pdf → 33 pages

Splitting documents into chunks...

Created 330 chunks

Sample chunk:

Ethereum White Paper   
A NEXT GENERATION SMART CONTRACT & DECENTRALIZED APPLI CATION PLATFORM 
By Vitalik Buterin 
 
When Satoshi Nakamoto first set the Bitcoin blockchain into motion in Janua ry 200
_____________________________________
total no of pages/documents: 69
Total no. of chunks:330


## Embedding

In [36]:
class EmbeddingManager:
    """Remember which embedding model to use and build it on request."""

    def __init__(self, model_name="sentence-transformers/all-MiniLM-L6-v2"):
        # Nothing is loaded yet: `embedding_model` stays None until load_model().
        self.embedding_model = None
        self.model_name = model_name

    def load_model(self):
        """Build the embedding model for `model_name`, keep it on the instance and return it."""
        print("Loading Embedding model....\n")

        model = HuggingFaceEmbeddings(model_name=self.model_name)
        self.embedding_model = model

        print("Emedding model loaded successfully!")

        return model

In [37]:
# load_model() stores the model on the manager as well as returning it.
embed_manager = EmbeddingManager()
embed_manager.load_model()
embedding_model = embed_manager.embedding_model
print(type(embedding_model))

Loading Embedding model....

Emedding model loaded successfully!
<class 'langchain_huggingface.embeddings.huggingface.HuggingFaceEmbeddings'>


## VectorStoreManager

In [38]:
class VectorStoreManager:
    """Create a Chroma vector store in ``persist_dir`` or reopen the one already there."""

    def __init__(self, persist_dir: str, embedding_model):
        """Store the configuration; no vector store is opened here.

        Args:
            persist_dir: Directory handed to Chroma as ``persist_directory``.
            embedding_model: Embedding object handed to Chroma.
        """
        self.persist_dir = persist_dir
        self.embedding_model = embedding_model
        self.vectorstore = None

    def vectorstore_exists(self) -> bool:
        """Return True when ``persist_dir`` is a non-empty directory.

        A missing path, an empty directory and a path that points at a regular
        file all count as "no store" (listing a file would raise
        ``NotADirectoryError``).
        """
        return os.path.isdir(self.persist_dir) and bool(os.listdir(self.persist_dir))

    def create_vectorstore(self, chunks: List[Document]):
        """Embed ``chunks`` into a new Chroma store persisted in ``persist_dir``.

        Returns:
            The created store (also kept in ``self.vectorstore``).
        """
        print("Creating vectorStore....")

        self.vectorstore = Chroma.from_documents(
            documents=chunks,
            embedding=self.embedding_model,
            persist_directory=self.persist_dir,
        )

        print("Vectorstore created!")
        # NOTE(review): `_collection` is a private attribute of the Chroma
        # wrapper; it is only used to report the size.
        print(f"Total vectors:{self.vectorstore._collection.count()}")

        return self.vectorstore

    def load_vectorstore(self):
        """Open the store persisted in ``persist_dir``.

        Returns:
            The opened store (also kept in ``self.vectorstore``).
        """
        print("loading existing vector storage...")

        self.vectorstore = Chroma(
            persist_directory=self.persist_dir,
            embedding_function=self.embedding_model,
        )

        print(f"Loaded vectorstore with {self.vectorstore._collection.count()} vectors\n")

        return self.vectorstore

    def get_vectorstore(self, chunks: Optional[List[Document]] = None):
        """Return the persisted store if one exists, otherwise build it from ``chunks``.

        When a store already exists it is returned as-is and ``chunks`` is
        ignored, so delete ``persist_dir`` to force a rebuild after the source
        documents change.

        Args:
            chunks: Documents to index; only needed when no store exists yet.

        Raises:
            ValueError: If no store exists and ``chunks`` is None or empty.
        """
        if self.vectorstore_exists():
            return self.load_vectorstore()

        # An empty list is rejected as well, so that no empty store is built.
        if not chunks:
            raise ValueError("Chunks are required to create a vector store")
        return self.create_vectorstore(chunks)

In [39]:
# Where the Chroma store is persisted (relative to the notebook).
PERSIST_DIR = "../data/vectorstore"

# get_vectorstore() reopens an existing store and only uses `chunks` when it
# has to create one.
vs_manager = VectorStoreManager(PERSIST_DIR, embedding_model)
vectorstore = vs_manager.get_vectorstore(chunks=chunks)


loading existing vector storage...
Loaded vectorstore with 330 vectors



## RAG-Retriever

In [40]:
class RAGRetriever:
    """Run a max-marginal-relevance search on a vector store and print the hits."""

    def __init__(self, vectorstore):
        """Keep a reference to the vector store that will be searched."""
        self.vectorstore = vectorstore

    def retrieve(self, query: str, k: int = 3, fetch_k: int = 10):
        """Search the vector store for ``query`` and print every result.

        Args:
            query: The search text.
            k: Number of results requested from the store.
            fetch_k: Size of the candidate pool handed to the MMR search. It is
                raised to ``k`` when smaller, because a pool smaller than ``k``
                would silently cap the number of results.

        Returns:
            The list of documents returned by the vector store.
        """
        print(f"\nQuery: {query}\n")

        results = self.vectorstore.max_marginal_relevance_search(
            query=query,
            k=k,
            fetch_k=max(fetch_k, k),
        )

        print(f"Top {k} results:\n")

        for rank, doc in enumerate(results, start=1):
            print(f"Result {rank}")
            # `.get` so documents without source/page metadata are still shown.
            print(f"Source: {doc.metadata.get('source', 'unknown')}")
            print(f"Page: {doc.metadata.get('page', 'unknown')}")
            print(f"content:{doc.page_content}")
            print("-" * 50)

        return results


In [41]:
# Wrap the vector store and run a first retrieval with the default k.
retriever = RAGRetriever(vectorstore=vectorstore)

results = retriever.retrieve(query="What is Bitcoin according to Ethereum whitepaper?")


Query: What is Bitcoin according to Ethereum whitepaper?

Top 3 results:

Result 1
Source: ../data/Ethereum-whitepaper.pdf
Page: 0
content:Ethereum White Paper   
A NEXT GENERATION SMART CONTRACT & DECENTRALIZED APPLI CATION PLATFORM 
By Vitalik Buterin 
 
When Satoshi Nakamoto first set the Bitcoin blockchain into motion in Janua ry 2009, he was                 
simultaneously introducing two radical and untested concepts. The first is the "bitcoin", a decentralized               
peer-to-peer online currency that maintains a value without any back ing, intrinsic value or central issuer. So                 
far, the "bitcoin" as a currency unit has taken up the bulk of the pu blic attention, both in terms of the political                       
aspects of a currency without a central bank and its extreme upwar d and downward volatility in price.
--------------------------------------------------
Result 2
Source: ../data/Blockchain_For_Beginners.pdf
Page: 14
content:environment. The m

In [42]:
# Left as the cell's last expression so the returned documents are displayed.
retriever.retrieve(query="give me types of Consensus Mechanisms.")


Query: give me types of Consensus Mechanisms.

Top 3 results:

Result 1
Source: ../data/Blockchain_For_Beginners.pdf
Page: 9
content:Consensus Mechanism  
Decision Making: Instead of a central authority (e.g., a bank manager), deciding on transactions to be made, 
in blockchain the transactions are agreed upon by consensus among the nodes.
--------------------------------------------------
Result 2
Source: ../data/Blockchain_For_Beginners.pdf
Page: 11
content:• Transactions and blocks are validated by approved accounts, known as validators.  
• Faster and more energy -efficient but less decentralized.  
To better understand the concept of Consensus Mechanisms, picture a group of stakeholders who share a 
notebook. Each time one of the stakeholders wishes to add one note, they should follow specific rules:  
• In PoW, they need to solve a difficult puzzle to earn the right to add the note.  
• In PoS, they need to show they have a certain number of pages in the notebook (their stake) t

[Document(metadata={'source': '../data/Blockchain_For_Beginners.pdf', 'producer': 'Microsoft® Word for Microsoft 365', 'author': 'CUK Zoran', 'creationdate': '2024-05-17T12:04:33+03:00', 'page_label': '10', 'moddate': '2024-05-17T12:04:33+03:00', 'page': 9, 'total_pages': 33, 'creator': 'Microsoft® Word for Microsoft 365'}, page_content='Consensus Mechanism  \nDecision Making: Instead of a central authority (e.g., a bank manager), deciding on transactions to be made, \nin blockchain the transactions are agreed upon by consensus among the nodes.'),
 Document(metadata={'moddate': '2024-05-17T12:04:33+03:00', 'creator': 'Microsoft® Word for Microsoft 365', 'total_pages': 33, 'creationdate': '2024-05-17T12:04:33+03:00', 'producer': 'Microsoft® Word for Microsoft 365', 'page': 11, 'page_label': '12', 'author': 'CUK Zoran', 'source': '../data/Blockchain_For_Beginners.pdf'}, page_content='• Transactions and blocks are validated by approved accounts, known as validators.  \n• Faster and more e

# Generation Phase

In [43]:
!pip3 install langchain-google-genai google-generativeai python-dotenv

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.2[0m[39;49m -> [0m[32;49m26.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip[0m


In [44]:
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import ChatPromptTemplate

In [45]:
class RAGGenerator:
    """Answer a question with an LLM, using retrieved documents as the only context."""

    def __init__(self, retriever, model_name="gemini-2.5-flash"):
        """Set up the chat model and the prompt template.

        Args:
            retriever: Object with a ``retrieve(query, k=...)`` method that
                returns documents exposing ``metadata`` and ``page_content``.
            model_name: Model identifier passed to ``ChatGoogleGenerativeAI``.
        """
        self.retriever = retriever
        self.llm = ChatGoogleGenerativeAI(
            model=model_name,
            temperature=0.2,
            top_p=0.9
        )

        self.prompt_template = ChatPromptTemplate.from_template(
        """
        You are a good assistant. Answer the question based ONLY on the provided context.
        
        Context:
        {context}
        
        Question:
        {question}
        
        Instructions:
        - Answer clearly and accurately
        - Use only the context provided
        - If answer is not in context, say "Answer not found in documents"
        
        Answer:
        """
        )

    @property
    def retreiver(self):
        """Misspelled alias of ``retriever``, kept so existing code keeps working."""
        return self.retriever

    @retreiver.setter
    def retreiver(self, value):
        self.retriever = value

    def build_context(self, documents):
        """Join the documents into one context string.

        Each document contributes a ``source: ... (Page: ...)`` header line
        followed by its text; documents are separated by a blank line. Missing
        ``source``/``page`` metadata is rendered as ``unknown``.
        """
        return "\n\n".join(
            f"source: {doc.metadata.get('source', 'unknown')} "
            f"(Page: {doc.metadata.get('page', 'unknown')})\n{doc.page_content}"
            for doc in documents
        )

    def generate(self, query, k=3):
        """Retrieve ``k`` documents for ``query`` and ask the LLM to answer from them.

        The retrieved documents, the assembled context and the answer are
        printed along the way.

        Returns:
            The ``content`` of the LLM response.
        """
        retrieved_docs = self.retriever.retrieve(query, k=k)

        print("\n==== Retrieved Docs ====\n")
        print(retrieved_docs)

        context = self.build_context(retrieved_docs)

        print("\n==== Context ====\n")
        print(context)

        # format_messages() keeps the prompt as chat messages; format() would
        # flatten it into a single string with a role prefix.
        messages = self.prompt_template.format_messages(
            context=context,
            question=query
        )

        response = self.llm.invoke(messages)

        print("\n ==== Generated Answer ===\n")
        print(response.content)

        return response.content



In [46]:
import os
from dotenv import load_dotenv

# NOTE(review): presumably load_dotenv() pulls variables from a local .env
# file into the process environment before the key is read - confirm.
load_dotenv()
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")

In [47]:
# Build the generator on top of the retriever; the call is left as the last
# expression so the returned answer is displayed.
generator = RAGGenerator(retriever=retriever)
generator.generate("What is Bitcoin?", k=3)


Query: What is Bitcoin?

Top 3 results:

Result 1
Source: ../data/Blockchain_For_Beginners.pdf
Page: 4
content:Page 5 | 33  Blockchain for beginners  
 
A-Z Glossary of Blockchain Terms  
Bitcoin:  A cryptocurrency, the first and most renowned application (use case) of blockchain technology, 
specifically within financial services.  
Blockchain:  A tamper -proof, shared digital ledger that records transactions in a decentralized peer -to-peer 
network. The permanent recording of transactions in the blockchain permanently stores the history of asset 
exchanges between the peers (participants) in the network.  
CBDC:  A Central Bank Digital Currency is a form of digital money , issued by a central bank.    
Centralization:  When a single entity, such as a bank or land registry, maintains control over transaction 
records and data.
--------------------------------------------------
Result 2
Source: ../data/Blockchain_For_Beginners.pdf
Page: 8
content:trust in the system.  
“[...] Bitcoin

'Bitcoin is a cryptocurrency, the first and most renowned application (use case) of blockchain technology, specifically within financial services. It is also described as the first decentralized cryptocurrency.'

In [48]:
# Left as the cell's last expression so the returned answer is displayed.
generator.generate("differentiate between public blockchain and private blockchain?", k=3)


Query: differentiate between public blockchain and private blockchain?

Top 3 results:

Result 1
Source: ../data/Blockchain_For_Beginners.pdf
Page: 14
content:adhere to rules , and utili se distinct protocols, offering a spectrum of advantages and challenges tailored to 
various use cases.  
 
Figure 2: Four main types of blockchain technology  
3.1 Public vs Private Blockchains  
Public blockchains  are decentralized platforms that anyone can access and participate  in. Unlike private 
blockchains that are restricted and often managed by single organizations, public blockchains like Bitcoin and 
Ethereum are open for anyone to join, transact on, and participate in the consensus process. This openness, 
however, comes wi th trade -offs in terms of scalability and privacy. Private blockchains , on the other hand, are 
not open to the public and participation requires an invitation or permission. These blockchains provide more
--------------------------------------------------
Result 2


'Public blockchains are decentralized platforms that anyone can access and participate in. They are open for anyone to join, transact on, and participate in the consensus process. Examples include Bitcoin and Ethereum. This openness, however, comes with trade-offs in terms of scalability and privacy. Public blockchains are also known as permissionless blockchains and have their own native currency/asset, such as bitcoin for the Bitcoin blockchain and ether for the Ethereum blockchain.\n\nPrivate blockchains, on the other hand, are restricted and often managed by single organizations. They are not open to the public, and participation requires an invitation or permission.'