## 🔧 Setup and Imports
Load required Python libraries for document processing, embedding, and vector similarity.

In [34]:
#Imports: Core modules for RAG pipeline including transformer models, embedding utilities, and vector math
import os
import re
from transformers import AutoTokenizer, AutoModel
import uuid 
import torch
import numpy as np
import json
from langchain_core.prompts import ChatPromptTemplate
import google.generativeai as genai
from dotenv import load_dotenv

## 🔐 Environment Configuration
Configure API keys securely using environment variables (e.g., for Gemini integration).

In [35]:
# Load Gemini API key from .env file for secure configuration
load_dotenv()  # Load .env file
genai.configure(api_key=os.getenv("GEMINI_API_KEY"))

## 📄 Document Chunking
Split long documents into token-length chunks to fit transformer model input limits.

In [56]:

def _add_chunk(document_chunks: dict, text: str, file_name: str) -> None:
    """Store *text* in *document_chunks* under a freshly generated UUID key."""
    document_chunks[str(uuid.uuid4())] = {
        "text": text,
        "metadata": {"file_name": file_name},
    }


def chunking(directory_path: str, tokenizer, chunk_size: int, 
             para_separator: str = "\n\n", separator: str = " ") -> dict:
    """
    Process documents in a directory and split them into chunks.

    Every regular file directly inside ``directory_path`` is read as UTF-8
    text, split into paragraphs, and each paragraph is greedily packed word
    by word into chunks whose token count (``len(tokenizer.tokenize(...))``)
    does not exceed ``chunk_size``. Chunks never span two paragraphs. A
    single word that alone exceeds ``chunk_size`` is still emitted as its own
    chunk.

    Args:
        directory_path: Path to directory containing documents
        tokenizer: Object exposing ``tokenize(str)``; used to measure chunk sizes
        chunk_size: Maximum token size for each chunk
        para_separator: Separator between paragraphs (default: "\\n\\n").
            It is passed to ``re.split`` and is therefore interpreted as a
            regular expression.
        separator: Word separator (default: " ")

    Returns:
        Dictionary ``{doc_id: {chunk_id: {"text": str,
        "metadata": {"file_name": str}}}}`` where both ids are random UUID
        strings and ``file_name`` is the file name without its extension.
    """
    documents = {}

    for filename in os.listdir(directory_path):
        file_path = os.path.join(directory_path, filename)

        # Skip sub-directories and other non-file entries before reporting.
        if not os.path.isfile(file_path):
            continue
        print(f"Processing: {filename}")

        # Base filename without extension, stored in every chunk's metadata.
        sku = os.path.splitext(filename)[0]

        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()

        doc_id = str(uuid.uuid4())
        document_chunks = {}  # Chunks belonging to this document

        # Split by paragraphs first so that chunks respect paragraph borders.
        for paragraph in re.split(para_separator, text):
            if not paragraph.strip():
                continue

            current_chunk_str = ""
            for word in paragraph.split(separator):
                test_chunk = (
                    f"{current_chunk_str}{separator}{word}"
                    if current_chunk_str else word
                )

                # NOTE: the candidate is re-tokenized for every word, which is
                # quadratic in chunk length but keeps the size check exact.
                if len(tokenizer.tokenize(test_chunk)) <= chunk_size:
                    current_chunk_str = test_chunk
                    continue

                # Adding the word would overflow: flush and start a new chunk.
                if current_chunk_str:
                    _add_chunk(document_chunks, current_chunk_str, sku)
                current_chunk_str = word

            # Flush whatever is left at the end of the paragraph.
            if current_chunk_str:
                _add_chunk(document_chunks, current_chunk_str, sku)

        documents[doc_id] = document_chunks

    return documents

## 🧠 Generate Document Embeddings
Map each chunk of the document to its vector embedding using a pre-trained model.

In [57]:
import torch

def map_document_embeddings(documents, tokenizer, model):
    """
    Function: map_document_embeddings
    Build an embedding store that mirrors the layout of ``documents``.

    Args:
        documents: Mapping ``{doc_id: {chunk_id: chunk}}`` where each chunk
            is a dict whose ``"text"`` entry holds the text to embed.
        tokenizer: Callable invoked as
            ``tokenizer(text, return_tensors="pt", padding=True, truncation=True)``;
            its result is unpacked into ``model`` as keyword arguments.
        model: Callable whose output exposes ``last_hidden_state``.

    Returns:
        Mapping ``{doc_id: {chunk_id: embedding}}`` where each embedding is
        ``last_hidden_state`` averaged over dim 1, squeezed, and converted
        to a plain Python list.
    """

    def _embed(chunk_text):
        # Mean-pool the hidden states; gradients are not needed here.
        encoded = tokenizer(chunk_text, return_tensors="pt", padding=True, truncation=True)
        with torch.no_grad():
            hidden_states = model(**encoded).last_hidden_state
            return hidden_states.mean(dim=1).squeeze().tolist()

    return {
        doc_id: {
            chunk_id: _embed(chunk.get("text"))
            for chunk_id, chunk in chunks.items()
        }
        for doc_id, chunks in documents.items()
    }

## 🔍 Query-Based Retrieval
Compare a query against all document chunks to retrieve the most relevant information.

In [58]:
def retrieve_information(query, top_k, mapped_document_db, tokenizer=None, model=None):
    """
    Function: retrieve_information
    Embed ``query`` and return the ``top_k`` chunks with the highest cosine
    similarity to it.

    Args:
        query: Query text to embed.
        top_k: Maximum number of results to return.
        mapped_document_db: Mapping ``{doc_id: {chunk_id: embedding}}`` where
            each embedding is a sequence of numbers.
        tokenizer: Optional tokenizer callable. When omitted, the module-level
            name ``tokenizer`` is used (the original implicit behavior).
        model: Optional model callable whose output exposes
            ``last_hidden_state``. When omitted, the module-level name
            ``model`` is used.

    Returns:
        List of ``(doc_id, chunk_id, score)`` tuples sorted by descending
        score. Chunks (or a query) with a zero-norm embedding score ``0.0``.

    Raises:
        ValueError: If no tokenizer/model is passed and none is defined at
            module level.
    """
    # Fall back to module globals so existing three-argument calls keep working.
    if tokenizer is None:
        tokenizer = globals().get("tokenizer")
    if model is None:
        model = globals().get("model")
    if tokenizer is None or model is None:
        raise ValueError(
            "retrieve_information needs a tokenizer and a model: pass them "
            "explicitly or define module-level 'tokenizer' and 'model'."
        )

    query_inputs = tokenizer(query, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():  # inference only, no gradients required
        pooled = model(**query_inputs).last_hidden_state.mean(dim=1).squeeze()
    query_embeddings = np.array(pooled.tolist())

    # The query norm does not depend on the chunk, so compute it once.
    query_norm = np.linalg.norm(query_embeddings)

    scores = {}
    for doc_id, chunk_dict in mapped_document_db.items():
        for chunk_id, chunk_embeddings in chunk_dict.items():
            chunk_embeddings = np.array(chunk_embeddings)
            chunk_norm = np.linalg.norm(chunk_embeddings)

            if chunk_norm == 0 or query_norm == 0:
                # Cosine similarity is undefined for a zero vector.
                score = 0.0
            else:
                score = np.dot(chunk_embeddings, query_embeddings) / (chunk_norm * query_norm)
            scores[(doc_id, chunk_id)] = score

    sorted_scores = sorted(scores.items(), key=lambda item: item[1], reverse=True)[:top_k]
    return [(doc_id, chunk_id, score) for (doc_id, chunk_id), score in sorted_scores]




## 🧮 Compute Query Embedding
Generate an embedding vector for a single user query using the same embedding model.

In [39]:
def compute_embedding(query, tokenizer, model):
    """
    Compute the embedding of a single query.

    Args:
        query: Text to embed.
        tokenizer: Callable invoked as
            ``tokenizer(query, return_tensors="pt", padding=True, truncation=True)``;
            its result is unpacked into ``model`` as keyword arguments.
        model: Callable whose output exposes ``last_hidden_state``.

    Returns:
        ``last_hidden_state`` averaged over dim 1, squeezed, as a Python list.
    """
    encoded = tokenizer(query, return_tensors="pt", padding=True, truncation=True)
    # Inference only: disable gradient tracking while running the model.
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
        pooled = hidden_states.mean(dim=1).squeeze()
    return pooled.tolist()

## 📊 Cosine Similarity Calculation
Measure semantic similarity between query and document chunks using cosine similarity.

In [40]:
def calculate_cosine_similarity_score(query_embeddings, chunk_embeddings):
    """
    Function: calculate_cosine_similarity_score
    Calculate the cosine similarity between a query embedding and one
    chunk embedding.

    Args:
        query_embeddings: Query vector (list or NumPy array).
        chunk_embeddings: Chunk vector of the same length.

    Returns:
        ``dot(chunk, query) / (|chunk| * |query|)``, or ``0.0`` when either
        vector has zero norm (the similarity is undefined in that case).
    """
    query_norm = np.linalg.norm(query_embeddings)
    chunk_norm = np.linalg.norm(chunk_embeddings)

    # Guard against division by zero for all-zero vectors.
    if chunk_norm == 0 or query_norm == 0:
        return 0.0

    return np.dot(chunk_embeddings, query_embeddings) / (chunk_norm * query_norm)
    
            

## 🏆 Top-K Scoring and Filtering
Sort chunks by similarity score and extract the top-k most relevant results.

In [41]:
def retrieve_top_k_scores(query_embeddings, mapped_document_db, top_k):
    """
    Function: retrieve_top_k_scores
    Score every chunk against the query and keep the ``top_k`` best.

    Args:
        query_embeddings: Query embedding vector.
        mapped_document_db: Mapping ``{doc_id: {chunk_id: embedding}}``.
        top_k: Maximum number of entries to return.

    Returns:
        List of ``((doc_id, chunk_id), score)`` pairs sorted by descending
        score; an empty list when the store contains no chunks.
    """
    scores = {}
    for doc_id, chunk_dict in mapped_document_db.items():
        for chunk_id, chunk_embeddings in chunk_dict.items():
            chunk_embeddings = np.array(chunk_embeddings)
            scores[(doc_id, chunk_id)] = calculate_cosine_similarity_score(
                query_embeddings, chunk_embeddings
            )

    # Sort once after all documents are scored. Sorting inside the loop was
    # wasted work and left the result undefined for an empty store.
    return sorted(scores.items(), key=lambda item: item[1], reverse=True)[:top_k]

## 📦 Format Retrieved Results
Organize and present retrieved chunks from top scores into a final response structure.

In [42]:
def retrieve_top_results(sorted_scores):
    """
    Function: retrieve_top_results
    Flatten ``((doc_id, chunk_id), score)`` pairs into
    ``(doc_id, chunk_id, score)`` tuples, preserving their order.
    """
    return [
        (doc_id, chunk_id, score)
        for (doc_id, chunk_id), score in sorted_scores
    ]

## 💾 Save Results to Disk
Utility function to store retrieved results or document metadata in JSON format.

In [43]:
def save_json(path, data):
    """
    Function: save_json
    Serialize ``data`` as JSON (4-space indentation) into the file at
    ``path``, overwriting any existing content.
    """
    with open(path, 'w') as handle:
        json.dump(data, fp=handle, indent=4)
       

In [44]:
def read_json(path):
    """Load the JSON document stored at ``path`` and return the parsed object."""
    with open(path, 'r') as handle:
        return json.load(handle)

In [45]:
def retrieve_text(top_results, document_data):
    """
    Return the stored entry for the first (best-ranked) result.

    Args:
        top_results: Sequence whose first item starts with
            ``(doc_id, chunk_id, ...)``.
        document_data: Mapping ``{doc_id: {chunk_id: entry}}``.

    Returns:
        ``document_data[doc_id][chunk_id]`` for the first result.
    """
    best = top_results[0]
    doc_id, chunk_id = best[0], best[1]
    return document_data[doc_id][chunk_id]

In [46]:
def generate_llm_response(gemini_model, query, relevent_text):
    """
    Ask the LLM to answer ``query`` using the retrieved context.

    Args:
        gemini_model: Model placed on the right-hand side of
            ``prompt | gemini_model``.
            NOTE(review): this presumably has to be a LangChain-compatible
            runnable; confirm that the object built by the caller (e.g. a
            raw ``genai.GenerativeModel``) satisfies this.
        query: The user question inserted into the prompt.
        relevent_text: Retrieved chunk; its ``"text"`` entry is used as the
            prompt context.

    Returns:
        Whatever ``chain.invoke`` returns for the filled-in prompt.
    """

    template = """
    
    You are an intelligent search engine. you will be provided with some retrieved contexts, as well as the user query.

    Your jon is to understand the request, and answer based on the retrieved context.
    Here is context:

    <context>
    {context}
    </context>

    Question: {question}
"""
    prompt = ChatPromptTemplate.from_template(template=template)

    chain = prompt | gemini_model
    # Use the function argument; the original read a misspelled global
    # ("relavent_text") and therefore ignored what the caller passed in.
    response = chain.invoke({"context": relevent_text["text"], "question": query})
    return response


In [47]:
from transformers import AutoTokenizer, AutoModel
import google.generativeai as genai
import os

if __name__ == "__main__":
    # Document processing setup: source directory and embedding model.
    directory_path = "documents"
    model_name = "BAAI/bge-small-en-v1.5"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    
    # Text processing parameters
    chunk_size = 200          # maximum tokens per chunk
    para_separator = "\n\n"   # paragraph delimiter passed to chunking()
    separator = " "           # word delimiter passed to chunking()
    top_k = 2                 # number of chunks to retrieve per query
    
    # Gemini AI setup: read the key from the environment. Passing the literal
    # string "GEMINI_API_KEY" would configure the client with an invalid key.
    genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
    gemini_model = genai.GenerativeModel("gemini-1.5-flash")
    
    




In [48]:
# Create the document store: {doc_id: {chunk_id: {"text": ..., "metadata": {"file_name": ...}}}}
documents = chunking(directory_path, tokenizer, chunk_size, para_separator, separator)


Processing: behaviuor1.txt
Processing: behaviuor2.txt
Processing: behaviuor3.txt


In [49]:
# Generate an embedding for every chunk; the result mirrors the document
# store layout as {doc_id: {chunk_id: embedding}}.
mapped_document_db = map_document_embeddings(documents, tokenizer, model)

In [50]:
# Persist both stores as JSON: chunk texts/metadata and their embeddings.
save_json('database/doc_store_2.json', documents) 
save_json('database/vector_store_2.json', mapped_document_db) 

In [51]:
  # Retrieve the most relevant chunks: embed the query, score it against
  # every stored chunk embedding, and keep the top_k (doc_id, chunk_id, score).
query = "What are effective strategies to prevent tantrums in toddlers, especially when they can't express themselves well?"
query_embeddings =  compute_embedding(query, tokenizer, model)
sorted_scores = retrieve_top_k_scores(query_embeddings, mapped_document_db, top_k)
top_results = retrieve_top_results(sorted_scores)

In [52]:
# Reload the document store (chunk texts and metadata) from disk.
document_data = read_json("database/doc_store_2.json") #read document store


In [53]:
    # Look up the stored entry (text + metadata) of the best-ranked chunk.
relavent_text = retrieve_text(top_results, document_data)

In [54]:
# Show the retrieved chunk entry (a dict with "text" and "metadata").
print(relavent_text)

{'text': 'Tantrums are common during the second year of life, when language skills are developing. Because toddlers can\'t always say what they want or need, and because words describing feelings are more complicated and develop later, a frustrating experience may cause a tantrum. As language skills improve, tantrums tend to decrease.\nToddlers want independence and control over their environment — more than they can actually handle. This can lead to power struggles as a child thinks "I can do it myself" or "I want it, give it to me." When kids discover that they can\'t do it and can\'t have everything they want, they may have a tantrum.\nHow Can We Avoid Tantrums?\nTry to prevent tantrums from happening in the first place, whenever possible. Here are some ideas that may help:\n    • Give plenty of positive attention. Get in the habit of catching your child being good. Reward your little one with praise and attention for positive behavior. Be', 'metadata': {'file_name': 'behaviuor1'}}
