In [2]:
import hashlib
import os
import uuid
import warnings
from pathlib import Path
from typing import Any, Dict, List, Tuple

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from dotenv import load_dotenv
from pinecone import Pinecone, ServerlessSpec
from openai import OpenAI

from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_classic.chains import RetrievalQA
from langchain_community.llms import Ollama
from langchain_core.prompts import ChatPromptTemplate, ChatMessagePromptTemplate, BaseChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.document_loaders import TextLoader, PyMuPDFLoader, PyPDFLoader, DirectoryLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_core.documents import Document

# Load variables from a local .env file (if any) into the process environment,
# so the os.getenv() lookups in later cells can see them.
load_dotenv()

print("‚úÖ All libraries imported successfully.")


‚úÖ All libraries imported successfully.


In [3]:
# Read the API key from the environment variable named "api_key".
# NOTE(review): the Vector Store cell passes this value to Pinecone(api_key=...);
# a more specific variable name would make that clearer.
api_key = os.getenv("api_key")

# Fail fast: nothing downstream can work without the key.
if not api_key:
    raise RuntimeError("API keys not found. Set them in .env or your environment securely.")

print("üîë API keys loaded successfully!")

üîë API keys loaded successfully!


# Retrieve

## RAG Pipelines - Data Ingestion to Vector DB

### Data Ingestion

In [4]:
def process_all_pdfs(pdf_directory):
    """Load every ``*.pdf`` file directly inside ``pdf_directory``.

    Each file is read with ``PyMuPDFLoader``. Every document it returns gets two
    extra metadata keys: ``source_file`` (the PDF's file name) and ``file_type``
    (always ``'pdf'``). A file that fails to load is reported and skipped so one
    bad PDF does not abort the whole ingestion run.

    Args:
        pdf_directory: Directory to scan (``str`` or ``Path``). Sub-directories
            are not searched.

    Returns:
        list: All loaded documents, with files processed in sorted path order.
        Empty when the directory is missing or contains no PDFs.
    """
    all_documents = []
    pdf_dir = Path(pdf_directory)

    if pdf_dir.is_dir():
        # glob() yields entries in arbitrary, filesystem-dependent order; sorting
        # makes document order (and anything derived from it) reproducible.
        pdf_files = sorted(pdf_dir.glob("*.pdf"))
    else:
        # Say so explicitly: otherwise a typo in the path looks like "0 PDFs found".
        print(f"Directory not found: {pdf_dir}")
        pdf_files = []

    print(f"Found {len(pdf_files)} PDF files to Process")

    for pdf_file in pdf_files:
        print(f"Processing {pdf_file.name}")
        try:
            loader = PyMuPDFLoader(str(pdf_file))
            documents = loader.load()

            for doc in documents:
                doc.metadata['source_file'] = pdf_file.name
                doc.metadata['file_type'] = 'pdf'

            all_documents.extend(documents)
            print(f" Loaded {len(documents)} Pages")

        except Exception as e:
            # Deliberate best-effort: report the failure and continue with the next file.
            print(f" Error {e}")

    print(f"Total Documents Loaded : {len(all_documents)}")
    return all_documents

# Ingest the PDFs in ../HR Policies (relative to the notebook's working directory).
all_pdf_documents = process_all_pdfs("../HR Policies")

Found 4 PDF files to Process
Processing HRMD-Procedures.pdf
 Loaded 69 Pages
Processing Human Resources policy and manual.pdf
 Loaded 64 Pages
Processing Human-Resource-HR-Policy-.pdf
 Loaded 55 Pages
Processing USS HRD Policy Revised 2016.pdf
 Loaded 35 Pages
Total Documents Loaded : 223


### Chunking

In [5]:
def split_documents(documents, chunk_size=1000, chunk_overlap=200):
    """Split documents into overlapping chunks with ``RecursiveCharacterTextSplitter``.

    Prints how many chunks were produced and, when there is at least one, a
    preview (first 250 characters and metadata) of the first chunk.

    Args:
        documents: Documents to split, as accepted by
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: Maximum chunk length, measured with ``len``.
        chunk_overlap: Number of characters shared between consecutive chunks.

    Returns:
        list: The chunks produced by the splitter.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        # Separators are tried in order, coarsest first. The empty string matches
        # everywhere and ends the search, so it has to be last -- anything listed
        # after it (previously "," and ".") would never be considered.
        separators=["\n\n", "\n", ".", ",", " ", ""],
    )
    split_docs = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(split_docs)} chunks")

    if split_docs:
        first_chunk = split_docs[0]
        print("Example Chunk:")
        print(f"Content: {first_chunk.page_content[:250]}...")
        print(f"Metadata: {first_chunk.metadata}")

    return split_docs

In [6]:
# Chunk the loaded documents with the default chunk_size / chunk_overlap.
chunks = split_documents(all_pdf_documents)
# NOTE(review): a bare `chunks` renders the repr of the entire list as cell output;
# consider `chunks[:3]` or `len(chunks)` to keep the notebook small.
chunks

Split 223 documents into 496 chunks
Example Chunk:
Content: Operational Procedure
Human Resource Management &
Development (HRM & D)
Manusher Jonno Foundation...
Metadata: {'producer': 'Nitro Pro 8  (8. 0. 2. 4)', 'creator': 'Nitro Pro', 'creationdate': '2016-08-11T09:05:36+00:00', 'source': '..\\HR Policies\\HRMD-Procedures.pdf', 'file_path': '..\\HR Policies\\HRMD-Procedures.pdf', 'total_pages': 69, 'format': 'PDF 1.6', 'title': '', 'author': 'Bipul Roy', 'subject': '', 'keywords': '', 'moddate': '2019-03-18T16:35:20+06:00', 'trapped': '', 'encryption': 'Standard V4 R4 128-bit AES', 'modDate': "D:20190318163520+06'00'", 'creationDate': 'D:20160811090536Z', 'page': 0, 'source_file': 'HRMD-Procedures.pdf', 'file_type': 'pdf'}


[Document(metadata={'producer': 'Nitro Pro 8  (8. 0. 2. 4)', 'creator': 'Nitro Pro', 'creationdate': '2016-08-11T09:05:36+00:00', 'source': '..\\HR Policies\\HRMD-Procedures.pdf', 'file_path': '..\\HR Policies\\HRMD-Procedures.pdf', 'total_pages': 69, 'format': 'PDF 1.6', 'title': '', 'author': 'Bipul Roy', 'subject': '', 'keywords': '', 'moddate': '2019-03-18T16:35:20+06:00', 'trapped': '', 'encryption': 'Standard V4 R4 128-bit AES', 'modDate': "D:20190318163520+06'00'", 'creationDate': 'D:20160811090536Z', 'page': 0, 'source_file': 'HRMD-Procedures.pdf', 'file_type': 'pdf'}, page_content='Operational Procedure\nHuman Resource Management &\nDevelopment (HRM & D)\nManusher Jonno Foundation'),
 Document(metadata={'producer': 'Nitro Pro 8  (8. 0. 2. 4)', 'creator': 'Nitro Pro', 'creationdate': '2016-08-11T09:05:36+00:00', 'source': '..\\HR Policies\\HRMD-Procedures.pdf', 'file_path': '..\\HR Policies\\HRMD-Procedures.pdf', 'total_pages': 69, 'format': 'PDF 1.6', 'title': '', 'author': 'B

### Embeddings

In [7]:
class EmbeddingManager:
    """Loads a ``SentenceTransformer`` model and encodes texts with it.

    Attributes:
        model_name: Name passed to ``SentenceTransformer``.
        model: The loaded model, or ``None`` until loading succeeds.
    """

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        """Store the model name and load the model immediately.

        Args:
            model_name: Model identifier handed to ``SentenceTransformer``.

        Raises:
            Exception: Whatever ``SentenceTransformer`` raises when loading fails.
        """
        self.model_name = model_name
        self.model = None
        self._load_model()

    def _load_model(self):
        """Instantiate the model and print its embedding dimension; re-raise on failure."""
        try:
            print(f"Loading Embedding model : {self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            print(f"Model Loaded Successfully..Embedding dimension: {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"Error Loading model {self.model_name} : {e}")
            raise

    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
        """Encode ``texts`` with the loaded model.

        Args:
            texts: Strings to embed.

        Returns:
            The value returned by ``model.encode(texts, show_progress_bar=True)``.

        Raises:
            ValueError: If no model has been loaded.
        """
        # Compare against None explicitly: a truthiness test would consult the
        # model object's __len__/__bool__ and could misreport a loaded model as missing.
        if self.model is None:
            raise ValueError("Model Not Loaded...")

        print(f"Generating embeddings for {len(texts)} texts ...")
        embeddings = self.model.encode(texts, show_progress_bar=True)
        print(f"Generated embeddings with shape : {embeddings.shape}")
        return embeddings
        
        
# Build the manager with the default model name; the constructor loads the model right away.
embedding_manager = EmbeddingManager()
embedding_manager
        

Loading Embedding model : all-MiniLM-L6-v2
Model Loaded Successfully..Embedding dimension: 384


<__main__.EmbeddingManager at 0x1a47a6147c0>

### Vector Store

In [None]:
import os
import uuid
import numpy as np
from typing import List, Any
from pinecone import Pinecone, ServerlessSpec


import time

# Pinecone credentials. Prefer the conventional PINECONE_API_KEY name (the one the
# error message has always pointed to) and fall back to the legacy "api_key"
# variable so existing .env files keep working.
api_key = os.getenv("PINECONE_API_KEY") or os.getenv("api_key")
if not api_key:
    raise RuntimeError(
        "Pinecone API key not found. Set PINECONE_API_KEY (or the legacy 'api_key') "
        "in .env or your environment."
    )

INDEX_NAME = "hr-policies"
DIMENSION = 384  # must equal the embedding model's output dimension
METRIC = "dotproduct"
CLOUD = "aws"
REGION = "us-east-1"
INDEX_READY_TIMEOUT_S = 300  # give up waiting for a new index after this long
INDEX_POLL_INTERVAL_S = 5

pc = Pinecone(api_key=api_key)

# Create the index on first use only; an existing index is reused as-is.
if INDEX_NAME not in pc.list_indexes().names():
    print(f"Creating Pinecone index: {INDEX_NAME}")
    pc.create_index(
        name=INDEX_NAME,
        dimension=DIMENSION,
        metric=METRIC,
        spec=ServerlessSpec(cloud=CLOUD, region=REGION)
    )
    # Poll until the index reports ready, but never forever: a stuck index
    # would otherwise hang the notebook with no way to notice.
    deadline = time.monotonic() + INDEX_READY_TIMEOUT_S
    while not pc.describe_index(INDEX_NAME).status.get("ready", False):
        if time.monotonic() >= deadline:
            raise TimeoutError(
                f"Index '{INDEX_NAME}' was not ready after {INDEX_READY_TIMEOUT_S} seconds"
            )
        print("  Waiting for index to initialize...")
        time.sleep(INDEX_POLL_INTERVAL_S)
    print(f"Index '{INDEX_NAME}' is ready.")

# Module-level handle used by VectorStore below.
index = pc.Index(INDEX_NAME)


class VectorStore:
    """Wrapper around one namespace of the module-level Pinecone ``index``.

    Attributes:
        collection_name: Name given at construction.
        namespace: Pinecone namespace used for every upsert and query (same
            value as ``collection_name``).
        index: The Pinecone index handle.
    """

    # Number of vectors sent per upsert request.
    UPSERT_BATCH_SIZE = 100

    def __init__(
        self,
        collection_name: str = "hr-policies",
        persist_directory: str = "../HR Policies/vector_store"
    ):
        """Bind to the module-level ``index`` and report the namespace's current size.

        Args:
            collection_name: Used as the Pinecone namespace.
            persist_directory: Not used by this implementation; kept so existing
                callers that pass it continue to work.
        """
        self.collection_name = collection_name
        self.namespace = collection_name
        self.index = index

        existing_count = self._namespace_vector_count()

        print(f"Vector Store initialized. Namespace: '{self.namespace}'")
        print(f"Existing documents in namespace: {existing_count}")

    def _namespace_vector_count(self) -> int:
        """Return the vector count the index stats report for this namespace (0 if absent)."""
        stats = self.index.describe_index_stats()
        # The stats object may be a plain dict or an object exposing to_dict().
        stats_dict = stats.to_dict() if hasattr(stats, "to_dict") else stats
        return stats_dict.get("namespaces", {}).get(self.namespace, {}).get("vector_count", 0)

    @staticmethod
    def _clean_metadata(metadata: dict) -> dict:
        """Coerce metadata into values a Pinecone upsert accepts.

        ``None`` values are dropped; strings, numbers, booleans and lists of
        strings are kept; anything else is stored as its ``str()`` form.
        """
        cleaned = {}
        for key, value in metadata.items():
            if value is None:
                continue
            if isinstance(value, (str, bool, int, float)):
                cleaned[str(key)] = value
            elif isinstance(value, (list, tuple)) and all(isinstance(item, str) for item in value):
                cleaned[str(key)] = list(value)
            else:
                cleaned[str(key)] = str(value)
        return cleaned

    def add_documents(self, documents: List[Any], embeddings: np.ndarray):
        """Upsert documents and their embeddings into the namespace.

        Vector IDs have the form ``doc_<8 hex chars>_<position>``, where the hex
        part is a digest of the document's ``source_file``, ``page`` and text.
        Because IDs are derived from content, re-running ingestion on the same
        documents overwrites the same vectors instead of adding duplicates.
        Vectors written earlier under different IDs are not removed.

        Args:
            documents: Items exposing ``page_content`` and ``metadata``; an item
                without ``page_content`` is stored as ``str(item)``.
            embeddings: One embedding per document, in the same order.

        Raises:
            ValueError: If the two inputs differ in length.
        """
        if len(documents) != len(embeddings):
            raise ValueError("Number of documents must match number of embeddings")

        print(f"Adding {len(documents)} documents to Pinecone namespace '{self.namespace}'...")

        vectors = []
        for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
            text = getattr(doc, "page_content", str(doc))
            metadata = self._clean_metadata(dict(getattr(doc, "metadata", {}) or {}))

            # Content-derived fingerprint -> stable ID across runs.
            fingerprint_source = "\x1f".join(
                [str(metadata.get("source_file", "")), str(metadata.get("page", "")), text]
            )
            fingerprint = hashlib.sha256(fingerprint_source.encode("utf-8")).hexdigest()[:8]
            doc_id = f"doc_{fingerprint}_{i}"

            metadata["doc_index"] = i
            metadata["content_length"] = len(text)
            # The chunk text travels with the vector so queries can return it.
            metadata["text"] = text

            vectors.append((
                doc_id,
                np.asarray(embedding, dtype=float).tolist(),
                metadata
            ))

        for start in range(0, len(vectors), self.UPSERT_BATCH_SIZE):
            batch = vectors[start:start + self.UPSERT_BATCH_SIZE]
            self.index.upsert(vectors=batch, namespace=self.namespace)

        # NOTE(review): the count comes from index stats read right after the
        # upsert; it may lag behind the writes -- confirm against Pinecone's
        # consistency guarantees before relying on it.
        count = self._namespace_vector_count()
        print(f"‚úÖ Added {len(documents)} documents. Total in namespace: {count}")

    def similarity_search(self, query_embedding: np.ndarray, top_k: int = 5, score_threshold: float = 0.0):
        """Query the namespace and return matches scoring at least ``score_threshold``.

        Args:
            query_embedding: Query vector; any array-like is flattened to 1-D.
            top_k: Maximum number of matches requested from the index.
            score_threshold: Matches with a lower score are dropped.

        Returns:
            list[dict]: One dict per kept match with keys ``id``, ``score``,
            ``text`` (from the match metadata, ``""`` if absent) and ``metadata``.
        """
        # Always flatten: a 1-D input is unchanged, a (1, d) input becomes (d,).
        query_vector = np.asarray(query_embedding, dtype=float).ravel()

        results = self.index.query(
            vector=query_vector.tolist(),
            top_k=top_k,
            include_metadata=True,
            namespace=self.namespace,
            filter=None
        )

        hits = []
        for match in results["matches"]:
            if match["score"] >= score_threshold:
                # Guard against a match that carries no metadata.
                metadata = match["metadata"] or {}
                hits.append({
                    "id": match["id"],
                    "score": match["score"],
                    "text": metadata.get("text", ""),
                    "metadata": metadata
                })
        return hits


# Bind to the default namespace; the constructor prints how many vectors it already holds.
vectorstore = VectorStore()
vectorstore


Vector Store initialized. Namespace: 'hr-policies'
Existing documents in namespace: 496


<__main__.VectorStore at 0x1a47b887880>

In [12]:
# Embed the text of every chunk, then upsert the chunks together with their vectors.
# embeddings[i] must correspond to chunks[i]; add_documents rejects mismatched lengths.
texts = [doc.page_content for doc in chunks]

embeddings = embedding_manager.generate_embeddings(texts)

vectorstore.add_documents(chunks,embeddings)


Generating embeddings for 496 texts ...


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 16/16 [00:14<00:00,  1.14it/s]


Generated embeddings with shape : (496, 384)
Adding 496 documents to Pinecone namespace 'hr-policies'...
‚úÖ Added 496 documents. Total in namespace: 496


## RAG Retriever

In [None]:
from typing import List, Dict, Any
import numpy as np

class RAGRetriever:
    """Turns a text query into ranked hits from a vector store.

    The query is embedded with ``embedding_manager`` and the resulting vector is
    handed to ``vectorstore.similarity_search``.
    """

    def __init__(self, vectorstore: VectorStore, embedding_manager: Any):
        """Keep references to the vector store and the embedding manager."""
        self.vectorstore = vectorstore
        self.embedding_manager = embedding_manager

    def retrieve(self, query: str, top_k: int = 5, score_threshold: float = 0.0) -> List[Dict[str, Any]]:
        """Return the hits for ``query`` as a ranked list of dicts.

        Args:
            query: Text to search for.
            top_k: Forwarded to the vector store.
            score_threshold: Forwarded to the vector store.

        Returns:
            A list of dicts with keys ``id``, ``content``, ``metadata``,
            ``similarity_score``, ``distance`` (``1 - score``) and ``rank``
            (1-based). Empty when nothing matched or when the search raised;
            in the latter case the error is printed instead of propagated.
        """
        print(f"Retrieving Documents for query: '{query}'")
        print(f"Top K: {top_k}, Score Threshold: {score_threshold}")

        # Embedding happens outside the try block, so embedding errors propagate.
        query_vector = self.embedding_manager.generate_embeddings([query])[0]

        try:
            hits = self.vectorstore.similarity_search(
                query_embedding=query_vector,
                top_k=top_k,
                score_threshold=score_threshold,
            )

            if not hits:
                print("No Documents Found...")
                return []

            ranked = [
                {
                    'id': hit['id'],
                    'content': hit['text'],
                    'metadata': hit['metadata'],
                    'similarity_score': hit['score'],
                    'distance': 1 - hit['score'],
                    'rank': position,
                }
                for position, hit in enumerate(hits, start=1)
            ]
            print(f"Retrieved {len(ranked)} documents (after filtering)")
            return ranked

        except Exception as err:
            print(f"Error During Retrieval: {err}")
            return []
        
# Wire the retriever to the Pinecone-backed store and the embedding manager created above.
rag_retriever =RAGRetriever(vectorstore,embedding_manager)

In [10]:
# Display the retriever object (shows only its default repr).
rag_retriever

<__main__.RAGRetriever at 0x1a47b6a8ca0>

In [11]:
# Smoke-test retrieval with a question taken from the ingested policies;
# only hits scoring at least 0.6 are kept.
docs = rag_retriever.retrieve(
    query="When Manusher Jonno Foundation (MJF) started operation as a project of CARE Bangladesh",
    top_k=5,
    score_threshold=0.6  
)
docs

Retrieving Documents for query: 'When Manusher Jonno Foundation (MJF) started operation as a project of CARE Bangladesh'
Top K: 5, Score Threshold: 0.6
Generating embeddings for 1 texts ...


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 13.04it/s]

Generated embeddings with shape : (1, 384)





Retrieved 1 documents (after filtering)


[{'id': 'doc_3091fb32_10',
  'content': 'Page | 1\n¬© Manusher Jonno Foundation ‚Äì 2016\nHR Policy & Procedures Manual\nCHAPTER 01 ‚Äì INTRODUCTION\n1.1 ORGANIZATIONAL OVERVIEW:\nManusher Jonno Foundation (MJF) started operation as a project of CARE Bangladesh in\nJuly 2002. It was registered with the Joint Stock Company and NGO Affairs Bureau in 2006\nas a national nonprofit organization. It is working to promote human rights and good\ngovernance aiming at bringing about changes in the lives of the most marginalized people.\nMJF is contributing towards reducing poverty, improving human security and governance in\npublic institutions. Till date MJF has provided funding support to 150 organizations all over\nBangladesh.\nVision\nA world free from poverty, exploitation and discrimination where people live in freedom,\ndignity and human security\nMission of MJF promotes human rights and governance through partnership with relevant\nstakeholders, including duty bearers, to ensure dignity 

In [None]:
from langchain_community.llms import Ollama
from langchain_core.prompts import ChatPromptTemplate
from typing import List, Dict

# LLM served through Ollama, using the "llama2" model.
# NOTE(review): presumably requires a local Ollama server with that model pulled -- confirm.
llm = Ollama(model="llama2")

# Chat prompt with two template variables, {context} and {query}; the system
# message instructs the model to answer only from the supplied context.
prompt_template = ChatPromptTemplate.from_messages([
    ("system", "You are a helpful assistant. Answer concisely using only the provided context."),
    ("human", """
Context:
{context}

Question: {query}

Answer:""")
])

def _response_text(response) -> str:
    """Return an LLM response as stripped text (``.content`` when present, else ``str()``)."""
    text = response.content if hasattr(response, "content") else str(response)
    return text.strip()


def rag_simple(query: str, rag_retriever, llm, top_k: int = 5, score_threshold: float = 0.4, fallback_to_llm: bool = True):
    """
    Simple RAG: Retrieve ‚Üí Build context ‚Üí Generate answer

    Args:
        query: The user's question.
        rag_retriever: Object with ``retrieve(query=..., top_k=..., score_threshold=...)``
            returning a list of dicts that each carry a ``'content'`` key.
        llm: Language model. It is composed with the module-level ``prompt_template``
            for the grounded path, and invoked directly for the fallback path.
        top_k: Maximum number of chunks to retrieve.
        score_threshold: Minimum score a chunk needs to be used.
        fallback_to_llm: When nothing is retrieved, ask the LLM without context
            instead of returning the "no relevant context" message. Such an
            answer is not grounded in the ingested documents.

    Returns:
        str: The answer, ``"No relevant context found to answer the question."``,
        or ``"Error generating response."`` when the LLM call fails.
    """
    print(f"\n[Query] {query}")

    results: List[Dict] = rag_retriever.retrieve(
        query=query,
        top_k=top_k,
        score_threshold=score_threshold
    )

    if not results:
        print("No documents retrieved with the current score threshold.")
        if not fallback_to_llm:
            return "No relevant context found to answer the question."

        print("Using LLM fallback to answer the question...")
        try:
            fallback_prompt = f"Answer this question: {query}"
            # Prefer the Runnable-style invoke() (the same interface the grounded
            # path relies on); calling the model object directly is kept only for
            # models that do not offer invoke().
            invoke = getattr(llm, "invoke", None)
            response = invoke(fallback_prompt) if callable(invoke) else llm(fallback_prompt)
            return _response_text(response)
        except Exception as e:
            print(f"Error generating fallback answer: {e}")
            return "Error generating response."

    context = "\n\n".join(f"[Source {i+1}]: {doc['content']}" for i, doc in enumerate(results))
    print(f"[Context] Retrieved {len(results)} chunks (min score: {score_threshold})")

    try:
        chain = prompt_template | llm
        response = chain.invoke({"context": context, "query": query})
        return _response_text(response)

    except Exception as e:
        print(f"Error generating answer: {e}")
        return "Error generating response."


# Interactive question loop: runs until the user types 'quit' or 'exit'.
# NOTE: this cell blocks on input(), so it stops an unattended "Run All".
while True:
    try:
        # Strip so that inputs such as "quit " or " exit" are still recognised.
        query = input("\nEnter your question (type 'quit' to exit): ").strip()
    except (EOFError, KeyboardInterrupt):
        # Closed stdin or an interrupt ends the session without a traceback.
        print("\nExiting...")
        break
    if not query:
        # Nothing typed: ask again rather than sending an empty query through the pipeline.
        continue
    if query.lower() in ["quit", "exit"]:
        print("Exiting...")
        break
    answer = rag_simple(query, rag_retriever=rag_retriever, llm=llm)
    print("Answer:", answer)
    




[Query] When Udayankur Seba Sangstha started its journey?
Retrieving Documents for query: 'When Udayankur Seba Sangstha started its journey?'
Top K: 5, Score Threshold: 0.4
Generating embeddings for 1 texts ...


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 29.23it/s]

Generated embeddings with shape : (1, 384)





Retrieved 3 documents (after filtering)
[Context] Retrieved 3 chunks (min score: 0.4)
Answer: Udayankur Seba Sangstha (USS) started its journey in 1997.

[Query] 2. What does USS mean?
Retrieving Documents for query: '2. What does USS mean?'
Top K: 5, Score Threshold: 0.4
Generating embeddings for 1 texts ...


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 24.12it/s]

Generated embeddings with shape : (1, 384)





Retrieved 5 documents (after filtering)
[Context] Retrieved 5 chunks (min score: 0.4)
Answer: USS stands for Universal Service Survey.

[Query] What does USS mean?
Retrieving Documents for query: 'What does USS mean?'
Top K: 5, Score Threshold: 0.4
Generating embeddings for 1 texts ...


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 28.67it/s]

Generated embeddings with shape : (1, 384)





Retrieved 5 documents (after filtering)
[Context] Retrieved 5 chunks (min score: 0.4)
Answer: USS stands for Uganda Social Security.

[Query] When Red  Cross  Society in East Pakistan was transformed into the National Red Cross Society of Bangladesh ?
Retrieving Documents for query: 'When Red  Cross  Society in East Pakistan was transformed into the National Red Cross Society of Bangladesh ?'
Top K: 5, Score Threshold: 0.4
Generating embeddings for 1 texts ...


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 20.78it/s]

Generated embeddings with shape : (1, 384)





Retrieved 5 documents (after filtering)
[Context] Retrieved 5 chunks (min score: 0.4)
Answer: The Red Cross Society in East Pakistan was transformed into the National Red Cross Society of Bangladesh on December 20, 1971.

[Query] When Renamed  as  Bangladesh  Red  Cross  Society  by  a  GoB  order ?
Retrieving Documents for query: 'When Renamed  as  Bangladesh  Red  Cross  Society  by  a  GoB  order ?'
Top K: 5, Score Threshold: 0.4
Generating embeddings for 1 texts ...


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 26.98it/s]

Generated embeddings with shape : (1, 384)





Retrieved 5 documents (after filtering)
[Context] Retrieved 5 chunks (min score: 0.4)
Answer: The Bangladesh Red Crescent Society was renamed as Bangladesh Red Cross Society by a GoB (Government of Bangladesh) order on January 4, 1972.
Exiting...
