In [2]:
import os
import sys
from pathlib import Path
from typing import List, Optional
import gradio as gr
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_chroma import Chroma
from langchain_core.messages import HumanMessage
from langchain_huggingface import HuggingFaceEmbeddings
from unstructured.partition.auto import partition
from langchain_ollama import ChatOllama
from IPython.display import Markdown, display

# Assuming you have this module
sys.path.append("../../../llm_engineering")
from api_clients import create_clients


class RAGBookQA:
    """RAG-based Question Answering system for PDF books."""
    
    def __init__(
        self,
        db_name: str = "vector_db",
        embedding_model: str = "all-MiniLM-L6-v2",
        chunk_size: int = 1000,
        chunk_overlap: int = 200,
        temperature: float = 0.0,
        k_docs: int = 4
    ):
        """
        Initialize the RAG system.
        
        Args:
            db_name: Directory name for vector database
            embedding_model: HuggingFace model name for embeddings
            chunk_size: Size of text chunks for splitting
            chunk_overlap: Overlap between chunks
            temperature: LLM temperature for response generation
            k_docs: Number of documents to retrieve
        """
        self.db_name = db_name
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.temperature = temperature
        self.k_docs = k_docs
        
        # Initialize embeddings
        print("Loading embedding model...")
        self.embeddings = HuggingFaceEmbeddings(model_name=embedding_model)
        
        # Initialize LLM
        print("Initializing LLM...")
        clients = create_clients()
        self.llm = ChatOllama(
            model=clients["models"]["OLLAMA_MODEL"],
            temperature=self.temperature,
        )
        
        self.vectorstore = None
        self.retriever = None
        
    def load_and_process_pdf(self, filename: str, force_reload: bool = False) -> None:
        """
        Load PDF, chunk it, and create vector store.
        
        Args:
            filename: Path to PDF file
            force_reload: If True, recreate vector store even if it exists
        """
        if not os.path.exists(filename):
            raise FileNotFoundError(f"File not found: {filename}")
        
        # Check if we need to reload
        if os.path.exists(self.db_name) and not force_reload:
            print(f"Loading existing vector store from {self.db_name}...")
            self.vectorstore = Chroma(
                persist_directory=self.db_name,
                embedding_function=self.embeddings
            )
            self.retriever = self.vectorstore.as_retriever(
                search_kwargs={"k": self.k_docs}
            )
            return
        
        # Delete existing collection if force reload
        if os.path.exists(self.db_name):
            print("Deleting existing vector store...")
            try:
                Chroma(
                    persist_directory=self.db_name,
                    embedding_function=self.embeddings
                ).delete_collection()
            except Exception as e:
                print(f"Warning: Could not delete existing collection: {e}")
        
        # Partition PDF
        print(f"Processing PDF: {filename}")
        try:
            file_partition = partition(filename)
        except Exception as e:
            raise RuntimeError(f"Failed to partition PDF: {e}")
        
        # Combine text with metadata
        text = '\n'.join([str(el) for el in file_partition])
        doc = Document(
            page_content=text,
            metadata={
                "source": filename,
                "filename": os.path.basename(filename)
            }
        )
        
        # Split into chunks (using RecursiveCharacterTextSplitter for better results)
        print("Splitting document into chunks...")
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
            separators=["\n\n", "\n", ". ", " ", ""] # it has hierarchical order
        )
        chunks = text_splitter.split_documents([doc])
        
        # Add chunk numbers to metadata
        for i, chunk in enumerate(chunks):
            chunk.metadata["chunk_id"] = i
            chunk.metadata["total_chunks"] = len(chunks)
        
        print(f"Created {len(chunks)} chunks")
        
        # Create vector store
        print("Creating vector store...")
        try:
            self.vectorstore = Chroma.from_documents(
                documents=chunks,
                embedding=self.embeddings,
                persist_directory=self.db_name
            )
            print("Vector store created successfully")
        except Exception as e:
            raise RuntimeError(f"Failed to create vector store: {e}")
        
        # Create retriever with MMR for better diversity
        self.retriever = self.vectorstore.as_retriever(
            search_type="mmr",  # Maximal Marginal Relevance
            search_kwargs={
                "k": self.k_docs,
                "fetch_k": self.k_docs * 2  # Fetch more, then rerank
            }
        )
    
    def answer_question(
        self,
        question: str,
        show_sources: bool = True,
        verbose: bool = False
    ) -> dict:
        """
        Answer a question based on the loaded PDF.
        
        Args:
            question: User's question
            show_sources: Whether to include source information
            verbose: Whether to print retrieved documents
            
        Returns:
            Dictionary with answer and optional metadata
        """
        if self.retriever is None:
            return {
                "answer": "Error: No document loaded. Please load a PDF first.",
                "sources": []
            }
        
        # Retrieve relevant documents
        try:
            docs = self.retriever.invoke(question)
        except Exception as e:
            return {
                "answer": f"Error retrieving documents: {e}",
                "sources": []
            }
        
        if verbose:
            print(f"\nRetrieved {len(docs)} documents:")
            for i, doc in enumerate(docs):
                print(f"\n--- Document {i+1} ---")
                print(doc.page_content[:200] + "...")
        
        if not docs:
            return {
                "answer": "I don't know. No relevant context found in the document.",
                "sources": []
            }
        
        # Prepare context
        context = "\n\n".join([
            f"[Excerpt {i+1}]:\n{doc.page_content}"
            for i, doc in enumerate(docs)
        ])
        
        # Create prompt
        prompt = f"""You are a helpful assistant that answers questions strictly based on the provided context from a book.
                CONTEXT:
                {context}

                QUESTION:
                {question}

                INSTRUCTIONS:
                - Answer the question using ONLY information from the context above
                - Be specific and cite relevant details from the context
                - If the answer cannot be found in the context, respond with: "I don't know - this information is not in the provided excerpts."
                - Do not make assumptions or add information not present in the context

                ANSWER:"""
        
        # Get LLM response
        try:
            response = self.llm.invoke([HumanMessage(content=prompt)])
            answer = response.content.strip()
        except Exception as e:
            return {
                "answer": f"Error generating answer: {e}",
                "sources": []
            }
        
        # Prepare result
        result = {"answer": answer}
        
        if show_sources:
            sources = []
            for i, doc in enumerate(docs):
                sources.append({
                    "chunk_id": doc.metadata.get("chunk_id", "unknown"),
                    "preview": doc.page_content[:150] + "...",
                    "filename": doc.metadata.get("filename", "unknown")
                })
            result["sources"] = sources
        
        return result


def main():
    """Example usage of the RAG system."""
    
    # Configuration
    PDF_PATH = "C:/Users/Paing/Downloads/fcuk.pdf"
    
    # Initialize RAG system
    rag_system = RAGBookQA(
        db_name="vector_db",
        chunk_size=1000,
        chunk_overlap=200,
        k_docs=4
    )
    
    # Load and process PDF (set force_reload=True to rebuild vector store)
    print("\n" + "="*50)
    print("Loading PDF and creating vector store...")
    print("="*50)
    
    try:
        rag_system.load_and_process_pdf(PDF_PATH, force_reload=False)
    except Exception as e:
        print(f"Error loading PDF: {e}")
        return
    
    # Ask questions
    questions = [
        "What is the first chapter about?",
        "Who is the author?",
        "What did the author want to convey in second chapter?",
        "What is the name of the Second Lieutenant of Japanese Imperial Army?"
    ]
    
    print("\n" + "="*50)
    print("Asking Questions")
    print("="*50)
    
    for question in questions:
        print(f"\n{'='*50}")
        print(f"Q: {question}")
        print(f"{'='*50}")
        
        result = rag_system.answer_question(
            question,
            show_sources=True,
            verbose=False
        )
        
        print(f"\nA: {result['answer']}")
        
        if result.get('sources'):
            print(f"\nðŸ“š Sources used ({len(result['sources'])} excerpts):")
            for i, source in enumerate(result['sources']):
                print(f"\n  [{i+1}] Chunk {source['chunk_id']}: {source['preview']}")


if __name__ == "__main__":
    main()

'(ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')), '(Request ID: 050c4d81-7164-43c1-ab98-043e012ceaed)')' thrown while requesting HEAD https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2/resolve/main/./modules.json
Retrying in 1s [Retry 1/5].


Loading embedding model...
Initializing LLM...

Loading PDF and creating vector store...
Loading existing vector store from vector_db...

Asking Questions

Q: What is the first chapter about?

A: The first chapter is about Happiness Is a Problem. This can be inferred from the title of Chapter 2, which is "Happiness Is a Problem", and the content of Excerpt 1, which mentions that this book will teach readers to lose and let go, rather than trying to achieve happiness or greatness.

ðŸ“š Sources used (4 excerpts):

  [1] Chunk 43: This book doesnâ€™t give a fuck about alleviating your problems or your pain. And that is precisely why you will know itâ€™s being honest. This book is no...

  [2] Chunk 44: This book will not teach you how to gain or achieve, but rather how to lose and let go. It will teach you to take inventory of your life and scrub out...

  [3] Chunk 335: Boundaries Once upon a time, there were two youngsters, a boy and a girl. Their families hated each other. But the boy