In [1]:
from langchain_core.documents import Document

In [2]:
doc=Document (
    page_content="this is the page contents",
    metadata={
        "source":"example.txt",
        "pages":1,
        "author":"richard",
        "date_created":"2026"
    })
doc

Document(metadata={'source': 'example.txt', 'pages': 1, 'author': 'richard', 'date_created': '2026'}, page_content='this is the page contents')

In [3]:
## Create a simple txt file
import os
os.makedirs("../data/text_files",exist_ok=True)

In [4]:
sample_texts={
    "../data/text_files/python_intro.txt":"""Python Programming Introduction

Python is a high-level, interpreted programming language known for its simplicity and readability.
Created by Guido van Rossum and first released in 1991, Python has become one of the most popular
programming languages in the world.

Key Features:
- Easy to learn and use
- Extensive standard library
- Cross-platform compatibility
- Strong community support

Python is widely used in web development, data science, artificial intelligence, and automation.""",
    
    "../data/text_files/machine_learning.txt": """Machine Learning Basics

Machine learning is a subset of artificial intelligence that enables systems to learn and improve
from experience without being explicitly programmed. It focuses on developing computer programs
that can access data and use it to learn for themselves.

Types of Machine Learning:
1. Supervised Learning: Learning with labeled data
2. Unsupervised Learning: Finding patterns in unlabeled data
3. Reinforcement Learning: Learning through rewards and penalties

Applications include image recognition, speech processing, and recommendation systems
    
    
    """

}
for filepath,content in sample_texts.items():
    with open(filepath,'w',encoding="utf-8") as f:
        f.write(content)

print("✅ Sample text files created!")

✅ Sample text files created!


In [5]:
### TextLoader


from langchain_community.document_loaders import TextLoader

loader=TextLoader("../data/text_files/python_intro.txt",encoding="utf-8")
document=loader.load()
print(document)

  from .autonotebook import tqdm as notebook_tqdm


[Document(metadata={'source': '../data/text_files/python_intro.txt'}, page_content='Python Programming Introduction\n\nPython is a high-level, interpreted programming language known for its simplicity and readability.\nCreated by Guido van Rossum and first released in 1991, Python has become one of the most popular\nprogramming languages in the world.\n\nKey Features:\n- Easy to learn and use\n- Extensive standard library\n- Cross-platform compatibility\n- Strong community support\n\nPython is widely used in web development, data science, artificial intelligence, and automation.')]


In [6]:
### Directory Loader
from langchain_community.document_loaders import DirectoryLoader,PyMuPDFLoader 

## load all the text files from the directory
dir_loader=DirectoryLoader(
    "../data/pdffiles",
    glob="**/*.pdf", ## Pattern to match files  
    loader_cls= PyMuPDFLoader , ##loader class to use
    show_progress=False


)

documents=dir_loader.load()
documents


[Document(metadata={'producer': 'WeasyPrint 62.3', 'creator': 'pandoc', 'creationdate': '', 'source': '..\\data\\pdffiles\\2.pdf', 'file_path': '..\\data\\pdffiles\\2.pdf', 'total_pages': 58, 'format': 'PDF 1.7', 'title': 'Exodus', 'author': '', 'subject': '', 'keywords': '', 'moddate': '', 'trapped': '', 'modDate': '', 'creationDate': '', 'page': 0}, page_content='EXODUS\n1\n2\n3\n4\n5\n6\n7\n8\n9\n10\n11\n12\n13\n14\n15\n16\n17\n18\n19\n20\n21\n22\n23\n24\n25\n26\n27\n28\n29\n30\n31\n32\n33\n34\n35\n36\n37\n38\n39\n40\nCHAPTER 1\nNow these are the names of the children of Israel, which came into Egypt; every man\nand his household came with Jacob.\n2 Reuben, Simeon, Levi, and Judah,\n3 Issachar, Zebulun, and Benjamin,\n4 Dan, and Naphtali, Gad, and Asher.\n5 And all the souls that came out of the loins of Jacob were seventy souls: for Joseph was\nin Egypt already.\n6 And Joseph died, and all his brethren, and all that generation.\n7 And the children of Israel were fruitful, and incre

In [7]:
### Directory Loader
from langchain_community.document_loaders import DirectoryLoader,PyMuPDFLoader 

## load all the text files from the directory
dir_loader=DirectoryLoader(
    "../data/pdffiles",
    glob="**/*.pdf", ## Pattern to match files  
    loader_cls= PyMuPDFLoader , ##loader class to use
    show_progress=False

)

pdf_documents=dir_loader.load()
pdf_documents


[Document(metadata={'producer': 'WeasyPrint 62.3', 'creator': 'pandoc', 'creationdate': '', 'source': '..\\data\\pdffiles\\2.pdf', 'file_path': '..\\data\\pdffiles\\2.pdf', 'total_pages': 58, 'format': 'PDF 1.7', 'title': 'Exodus', 'author': '', 'subject': '', 'keywords': '', 'moddate': '', 'trapped': '', 'modDate': '', 'creationDate': '', 'page': 0}, page_content='EXODUS\n1\n2\n3\n4\n5\n6\n7\n8\n9\n10\n11\n12\n13\n14\n15\n16\n17\n18\n19\n20\n21\n22\n23\n24\n25\n26\n27\n28\n29\n30\n31\n32\n33\n34\n35\n36\n37\n38\n39\n40\nCHAPTER 1\nNow these are the names of the children of Israel, which came into Egypt; every man\nand his household came with Jacob.\n2 Reuben, Simeon, Levi, and Judah,\n3 Issachar, Zebulun, and Benjamin,\n4 Dan, and Naphtali, Gad, and Asher.\n5 And all the souls that came out of the loins of Jacob were seventy souls: for Joseph was\nin Egypt already.\n6 And Joseph died, and all his brethren, and all that generation.\n7 And the children of Israel were fruitful, and incre

Embedding and vectordb

In [8]:
import os
import numpy as np 
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List,Dict,Any,Tuple
from sklearn.metrics.pairwise import cosine_similarity

In [9]:
# Creating Data Chunks 

from langchain_text_splitters import RecursiveCharacterTextSplitter

def split_documents(documents,chunk_size=1000,chunk_overlap=200):
    """
    Split documents into smaller chunks for better RAG performance.
    
    Parameters:
    - chunk_size: Maximum characters per chunk (adjust based on your LLM)
    - chunk_overlap: Characters to overlap between chunks (preserves context)
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, # Each chunk: ~1000 characters
        chunk_overlap=chunk_overlap, # 200 chars overlap for context
        length_function=len, # How to measure length
        separators=["\n\n", "\n", " ", ""] # Split hierarchy
    )
    # Actually split the documents
    split_docs = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(split_docs)} chunks")
    
    # Show what a chunk looks like
    if split_docs:
        print(f"\nExample chunk:")
        print(f"Content: {split_docs[0].page_content[:200]}...")
        print(f"Metadata: {split_docs[0].metadata}")
    
    return split_docs

In [10]:
class EmbeddingManager:
    """Handles document embedding generation using SentenceTransformer"""
    
    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        """
        Initialize the embedding manager
        
        Args:
            model_name: HuggingFace model name for sentence embeddings
        """
        self.model_name = model_name
        self.model = None
        self._load_model()

    def _load_model(self):
        """Load the SentenceTransformer model"""
        try:
            print(f"Loading embedding model: {self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            print(f"Model loaded successfully. Embedding dimension: {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"Error loading model {self.model_name}: {e}")
            raise

    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
        """
        Generate embeddings for a list of texts
        
        Args:
            texts: List of text strings to embed
            
        Returns:
            numpy array of embeddings with shape (len(texts), embedding_dim)
        """
        if not self.model:
            raise ValueError("Model not loaded")
        
        print(f"Generating embeddings for {len(texts)} texts...")
        embeddings = self.model.encode(texts, show_progress_bar=True)
        print(f"Generated embeddings with shape: {embeddings.shape}")
        return embeddings


## initialize the embedding manager

embedding_manager=EmbeddingManager()
embedding_manager

Loading embedding model: all-MiniLM-L6-v2


Loading weights: 100%|██████████| 103/103 [00:00<00:00, 878.87it/s, Materializing param=pooler.dense.weight]                             
BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


Model loaded successfully. Embedding dimension: 384


<__main__.EmbeddingManager at 0x23e53a2a5a0>

In [11]:
#vectore store 
class VectorStore:
    """Manages document embeddings in a ChromaDB vector store"""
    
    def __init__(self, collection_name: str = "pdf_documents", persist_directory: str = "../data/vector_store"):
        """
        Initialize the vector store
        
        Args:
            collection_name: Name of the ChromaDB collection
            persist_directory: Directory to persist the vector store
        """
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initialize_store()

    def _initialize_store(self):
        """Initialize ChromaDB client and collection"""
        try:
            # Create persistent ChromaDB client
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path=self.persist_directory)
            
            # Get or create collection
            self.collection = self.client.get_or_create_collection(
                name=self.collection_name,
                metadata={"description": "PDF document embeddings for RAG"}
            )
            print(f"Vector store initialized. Collection: {self.collection_name}")
            print(f"Existing documents in collection: {self.collection.count()}")
            
        except Exception as e:
            print(f"Error initializing vector store: {e}")
            raise

    def add_documents(self, documents: List[Any], embeddings: np.ndarray):
        """
        Add documents and their embeddings to the vector store
        
        Args:
            documents: List of LangChain documents
            embeddings: Corresponding embeddings for the documents
        """
        if len(documents) != len(embeddings):
            raise ValueError("Number of documents must match number of embeddings")
        
        print(f"Adding {len(documents)} documents to vector store...")
        
        # Prepare data for ChromaDB
        ids = []
        metadatas = []
        documents_text = []
        embeddings_list = []
        
        for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
            # Generate unique ID
            doc_id = f"doc_{uuid.uuid4().hex[:8]}_{i}"
            ids.append(doc_id)
            
            # Prepare metadata
            metadata = dict(doc.metadata)
            metadata['doc_index'] = i
            metadata['content_length'] = len(doc.page_content)
            metadatas.append(metadata)
            
            # Document content
            documents_text.append(doc.page_content)
            
            # Embedding
            embeddings_list.append(embedding.tolist())
        
        # Add to collection
        try:
            self.collection.add(
                ids=ids,
                embeddings=embeddings_list,
                metadatas=metadatas,
                documents=documents_text
            )
            print(f"Successfully added {len(documents)} documents to vector store")
            print(f"Total documents in collection: {self.collection.count()}")
            
        except Exception as e:
            print(f"Error adding documents to vector store: {e}")
            raise

vectorstore=VectorStore()
vectorstore
    

Vector store initialized. Collection: pdf_documents
Existing documents in collection: 7749


<__main__.VectorStore at 0x23e55447ef0>

In [12]:
### Split the documents into chunks
chunks = split_documents(documents, chunk_size=1000, chunk_overlap=200)

Split 278 documents into 1107 chunks

Example chunk:
Content: EXODUS
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
CHAPTER 1
Now these are the names of the children of Israel, which came into Egypt...
Metadata: {'producer': 'WeasyPrint 62.3', 'creator': 'pandoc', 'creationdate': '', 'source': '..\\data\\pdffiles\\2.pdf', 'file_path': '..\\data\\pdffiles\\2.pdf', 'total_pages': 58, 'format': 'PDF 1.7', 'title': 'Exodus', 'author': '', 'subject': '', 'keywords': '', 'moddate': '', 'trapped': '', 'modDate': '', 'creationDate': '', 'page': 0}


In [13]:
### Convert the text to embeddings
texts=[doc.page_content for doc in chunks]

## Generate the Embeddings

embeddings=embedding_manager.generate_embeddings(texts)

##store int he vector dtaabase
vectorstore.add_documents(chunks,embeddings)

Generating embeddings for 1107 texts...


Batches: 100%|██████████| 35/35 [00:02<00:00, 15.35it/s]


Generated embeddings with shape: (1107, 384)
Adding 1107 documents to vector store...
Successfully added 1107 documents to vector store
Total documents in collection: 8856


In [14]:
class RAGRetriever:
    """Handles query-based retrieval from the vector store"""
    
    def __init__(self, vector_store: VectorStore, embedding_manager: EmbeddingManager):
        """
        Initialize the retriever
        
        Args:
            vector_store: Vector store containing document embeddings
            embedding_manager: Manager for generating query embeddings
        """
        self.vector_store = vector_store
        self.embedding_manager = embedding_manager

    def retrieve(self, query: str, top_k: int = 5, score_threshold: float = 0.0) -> List[Dict[str, Any]]:
        """
        Retrieve relevant documents for a query
        
        Args:
            query: The search query
            top_k: Number of top results to return
            score_threshold: Minimum similarity score threshold
            
        Returns:
            List of dictionaries containing retrieved documents and metadata
        """
        print(f"Retrieving documents for query: '{query}'")
        print(f"Top K: {top_k}, Score threshold: {score_threshold}")
        
        # Generate query embedding
        query_embedding = self.embedding_manager.generate_embeddings([query])[0]
        
        # Search in vector store
        try:
            results = self.vector_store.collection.query(
                query_embeddings=[query_embedding.tolist()],
                n_results=top_k
            )
            
            # Process results
            retrieved_docs = []
            
            if results['documents'] and results['documents'][0]:
                documents = results['documents'][0]
                metadatas = results['metadatas'][0]
                distances = results['distances'][0]
                ids = results['ids'][0]
                
                for i, (doc_id, document, metadata, distance) in enumerate(zip(ids, documents, metadatas, distances)):
                    # Convert distance to similarity score (ChromaDB uses cosine distance)
                    similarity_score = 1 - distance
                    
                    if similarity_score >= score_threshold:
                        retrieved_docs.append({
                            'id': doc_id,
                            'content': document,
                            'metadata': metadata,
                            'similarity_score': similarity_score,
                            'distance': distance,
                            'rank': i + 1
                        })
                
                print(f"Retrieved {len(retrieved_docs)} documents (after filtering)")
            else:
                print("No documents found")
            
            return retrieved_docs
            
        except Exception as e:
            print(f"Error during retrieval: {e}")
            return []

rag_retriever=RAGRetriever(vectorstore,embedding_manager)

In [15]:
rag_retriever

<__main__.RAGRetriever at 0x23e556bac00>

In [16]:
rag_retriever.retrieve("Who is joseph")

Retrieving documents for query: 'Who is joseph'
Top K: 5, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00, 27.46it/s]

Generated embeddings with shape: (1, 384)
Retrieved 5 documents (after filtering)





[{'id': 'doc_854e6f55_1079',
  'content': '27 And they told him all the words of Joseph, which he had said unto them: and when he\nsaw the wagons which Joseph had sent to carry him, the spirit of Jacob their father\nrevived:\n28 And Israel said, It is enough; Joseph my son is yet alive: I will go and see him before I\ndie.\nCHAPTER 46',
  'metadata': {'subject': '',
   'file_path': '..\\data\\pdffiles\\1.pdf',
   'creationDate': '',
   'keywords': '',
   'producer': 'WeasyPrint 62.3',
   'format': 'PDF 1.7',
   'creationdate': '',
   'source': '..\\data\\pdffiles\\1.pdf',
   'moddate': '',
   'title': 'Genesis',
   'content_length': 288,
   'creator': 'pandoc',
   'modDate': '',
   'author': '',
   'total_pages': 69,
   'trapped': '',
   'doc_index': 1079,
   'page': 61},
  'similarity_score': 0.12452292442321777,
  'distance': 0.8754770755767822,
  'rank': 1},
 {'id': 'doc_944af63b_1079',
  'content': '27 And they told him all the words of Joseph, which he had said unto them: and when

In [None]:
from huggingface_hub import login

# Paste your token directly here
hf_token = "go get your own token from huggingface.co --- IGNORE ---"

# Log in
login(hf_token)

print("Logged in successfully!")

Logged in successfully!


In [18]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

class AdvancedRAGGenerator:
    def __init__(self, model_id: str = "mistralai/Mistral-7B-Instruct-v0.3"):
        self.model_id = model_id
        self._load_model()

    def _load_model(self):
        quant_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_use_double_quant=True
        )
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_id)
        self.model = AutoModelForCausalLM.from_pretrained(
            self.model_id,
            quantization_config=quant_config,
            device_map="auto"
        )

    def multi_query_rewrite(self, original_query: str):
        rewrite_prompt = f"<s>[INST] Generate 3 search variations of the question: {original_query}. List them line by line without numbers. [/INST]</s>"
        
        inputs = self.tokenizer(rewrite_prompt, return_tensors="pt").to("cuda")
        outputs = self.model.generate(**inputs, max_new_tokens=100, temperature=0.7)
        
        # Slicing the output to only get the NEW text generated
        gen_text = self.tokenizer.decode(outputs[0][inputs.input_ids.shape[-1]:], skip_special_tokens=True)
        return [q.strip() for q in gen_text.strip().split("\n") if q.strip()][:3]

    def generate_answer(self, query: str, context_text: str) -> str:
        prompt = f"<s>[INST] Context: {context_text}\n\nQuestion: {query} [/INST]</s>"

        inputs = self.tokenizer(prompt, return_tensors="pt").to("cuda")
        outputs = self.model.generate(
            **inputs, 
            max_new_tokens=300, 
            temperature=0.1,
            pad_token_id=self.tokenizer.eos_token_id
        )

        # Slice to only return the answer, excluding the prompt
        answer = self.tokenizer.decode(outputs[0][inputs.input_ids.shape[-1]:], skip_special_tokens=True)
        return answer.strip()

# --- REFINED OUTPUT DISPLAY ---

rag_engine = AdvancedRAGGenerator()
user_query = "Who is joseph of egypt?"

# 1. Rewrite
enhanced_queries = rag_engine.multi_query_rewrite(user_query)

# 2. Retrieval (Mocking context for demonstration)
# unique_context = your_retrieval_logic(enhanced_queries)
unique_context = "Joseph was the eleventh son of Jacob and the first son of Rachel. He was sold into slavery by his brothers."

# 3. Generate
final_answer = rag_engine.generate_answer(user_query, unique_context)

# --- THE CLEAN OUTPUT ---
print(f"\n{'='*50}")
print(f"USER QUERY: {user_query}")
print(f"GENERATED VARIATIONS:")
for i, q in enumerate(enhanced_queries, 1):
    print(f"  {i}. {q}")
print(f"{'-'*50}")
print(f"FINAL ANSWER:\n{final_answer}")
print(f"{'='*50}")

Loading weights: 100%|██████████| 291/291 [00:21<00:00, 13.85it/s, Materializing param=model.norm.weight]                               
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



USER QUERY: Who is joseph of egypt?
GENERATED VARIATIONS:
  1. 1. Who was Joseph in the Bible?
  2. 2. Who is the biblical figure Joseph from Egypt?
  3. 3. Who is the biblical character Joseph of Egypt?
--------------------------------------------------
FINAL ANSWER:
Joseph of Egypt, as referred to in the Bible, is not the same person as Joseph, the eleventh son of Jacob. Joseph of Egypt, also known as Joseph the Dreamer or Joseph the Interpreter, is a significant figure in the Old Testament of the Bible. He is the son of Jacob (also known as Israel) and Rachel's handmaid, Zilpah. Joseph of Egypt is known for his ability to interpret dreams and for his role in saving Egypt and his family during a famine. This Joseph is not to be confused with Joseph, the eleventh son of Jacob and the first son of Rachel, who was sold into slavery by his brothers.


these are previous iterations of the current RAG Model that was implemented now downsized to a cleaner outuput and used multiquery retrieval



import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from typing import List, Dict, Any

class AdvancedRAGGenerator:
    """Handles Multi-Query rewriting and answer generation using a quantized local LLM"""
    
    def __init__(self, model_id: str = "mistralai/Mistral-7B-Instruct-v0.3"):
        self.model_id = model_id
        self.tokenizer = None
        self.model = None
        self._load_model()

    def _load_model(self):
        """Load model in 4-bit for 3070 compatibility"""
        quant_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_use_double_quant=True
        )
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_id)
        self.model = AutoModelForCausalLM.from_pretrained(
            self.model_id,
            quantization_config=quant_config,
            device_map="auto"
        )

    def multi_query_rewrite(self, original_query: str) -> List[str]:
        """ADVANCED STEP: Rewrites the user query into 3 variations for better retrieval"""
        rewrite_prompt = f"""<s>[INST] You are an AI language model assistant. 
Your task is to generate 3 different versions of the given user question to retrieve relevant documents from a vector database. 
By providing multiple perspectives on the user question, your goal is to help the user overcome some of the limitations of distance-based similarity search. 
Provide these alternative questions separated by newlines. Do not add any introductory text.

Original question: {original_query} [/INST]</s>"""
        
        inputs = self.tokenizer(rewrite_prompt, return_tensors="pt").to("cuda")
        with torch.no_grad():
            outputs = self.model.generate(**inputs, max_new_tokens=150, temperature=0.7)
        
        raw_output = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        # Extract and clean the generated questions
        questions_text = raw_output.split("[/INST]")[-1].strip()
        queries = [q.strip() for q in questions_text.split("\n") if q.strip()]
        
        # Include the original query as well
        return list(set(queries[:3] + [original_query]))

    def generate_answer(self, query: str, context_text: str) -> str:
        """Constructs prompt and generates answer from provided context"""
        prompt = f"""<s>[INST] You are a helpful assistant. Use the following context to answer the question. 
If the answer is not in the context, say you don't know.

Context:
{context_text}

Question:
{query} [/INST]</s>"""

        inputs = self.tokenizer(prompt, return_tensors="pt").to("cuda")
        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=512,
                temperature=0.1,
                pad_token_id=self.tokenizer.eos_token_id
            )

        full_response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        return full_response.split("[/INST]")[-1].strip()

# --- ADVANCED USAGE WORKFLOW ---

rag_engine = AdvancedRAGGenerator()
user_query = "Who is joseph of egypt?"

# 1. Advanced Rewrite
enhanced_queries = rag_engine.multi_query_rewrite(user_query)
print(f"Generated Queries: {enhanced_queries}")

# 2. Broader Retrieval
all_retrieved_docs = []
for q in enhanced_queries:
    # Assuming 'rag_retriever' is your existing ChromaDB retriever
    all_retrieved_docs.extend(rag_retriever.retrieve(q))

# 3. Deduplicate results based on content
unique_context = "\n\n".join(list(set([doc['content'] for doc in all_retrieved_docs])))

# 4. Final Generation
final_answer = rag_engine.generate_answer(user_query, unique_context)
print("\n--- ADVANCED RAG ANSWER ---")
print(final_answer)

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

class RAGGenerator:
    """Handles answer generation using a quantized LLM from Hugging Face"""
    
    def __init__(self, model_id: str = "mistralai/Mistral-7B-Instruct-v0.3"):
        self.model_id = model_id
        self.tokenizer = None
        self.model = None
        self._load_model()

    def _load_model(self):
        """Load model in 4-bit using BitsAndBytes for 3070 compatibility"""
        print(f"Loading LLM: {self.model_id}...")
        
        quant_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_use_double_quant=True
        )

        self.tokenizer = AutoTokenizer.from_pretrained(self.model_id)
        self.model = AutoModelForCausalLM.from_pretrained(
            self.model_id,
            quantization_config=quant_config,
            device_map="auto"
        )
        print("LLM loaded successfully.")

    def generate_answer(self, query: str, retrieved_docs: List[Dict[str, Any]]) -> str:
        """Constructs prompt and generates answer from context"""
        
        # 1. Prepare context from retrieval results
        context_chunks = [doc['content'] for doc in retrieved_docs]
        context_text = "\n\n".join(context_chunks)

        # 2. Mistral Instruction Format
        prompt = f"""<s>[INST] You are a helpful assistant. Use the following context to answer the question. 
If the answer is not in the context, say you don't know.

Context:
{context_text}

Question:
{query} [/INST]</s>"""

        # 3. Tokenize and Generate
        inputs = self.tokenizer(prompt, return_tensors="pt").to("cuda")
        
        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=512,
                temperature=0.1,  # Low temperature for factual consistency
                do_sample=True,
                pad_token_id=self.tokenizer.eos_token_id
            )

        # 4. Decode and extract only the assistant's reply
        full_response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        answer = full_response.split("[/INST]")[-1].strip()
        
        return answer

# --- USAGE ---

# 1. Perform Retrieval (using your existing code)

query = "Who is joseph of egypt?"
results = rag_retriever.retrieve(query)

# 2. Perform Generation
rag_generator = RAGGenerator()
final_answer = rag_generator.generate_answer(query, results)

print("\n--- FINAL ANSWER ---")
print(final_answer)

In [19]:
import torch
import bitsandbytes as bnb

print(f"Is CUDA available?: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"Current GPU: {torch.cuda.get_device_name(0)}")
    print(f"VRAM Allocated: {torch.cuda.memory_allocated(0) / 1024**2:.2f} MB")
    print(f"VRAM Reserved: {torch.cuda.memory_reserved(0) / 1024**2:.2f} MB")

Is CUDA available?: True
Current GPU: NVIDIA GeForce RTX 3070 Laptop GPU
VRAM Allocated: 4042.22 MB
VRAM Reserved: 13794.00 MB
