In [None]:
import os
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pathlib import Path
import numpy as np
from langchain_ollama import OllamaEmbeddings
from sentence_transformers.cross_encoder import CrossEncoder
from transformers import AutoTokenizer
import chromadb
import uuid
from typing import List, Dict, Any
from langchain_ollama.chat_models import ChatOllama

In [None]:
def process_all_pdfs(pdf_dir):
    """Load every PDF found (recursively) under ``pdf_dir``.

    Each file is read with ``PyMuPDFLoader``. Every document it returns gets two
    extra metadata keys: ``source_file`` (the PDF's file name) and ``file_type``
    (always ``'pdf'``). A file that fails to load is reported on stdout and
    skipped, so one bad PDF does not abort the whole run.

    Args:
        pdf_dir: Directory (str or path-like) searched recursively for ``*.pdf``.

    Returns:
        list: All loaded documents, grouped by file in sorted path order.
    """
    all_documents = []
    pdf_dir = Path(pdf_dir)

    # glob() yields entries in a filesystem-dependent order. Sorting keeps the
    # document order (and therefore the order of anything derived from it, such
    # as chunks paired positionally with cached embeddings) stable across runs.
    pdf_files = sorted(pdf_dir.glob('**/*.pdf'))

    for pdf_file in pdf_files:
        print(f'Processing: {pdf_file.name}')

        try:
            documents = PyMuPDFLoader(str(pdf_file)).load()

            for doc in documents:
                doc.metadata['source_file'] = pdf_file.name
                doc.metadata['file_type'] = 'pdf'

            all_documents.extend(documents)
            print(f'Loaded {len(documents)} pages')

        except Exception as e:
            # Best-effort: report the failure and continue with the next file.
            print(f'Error: {e}')

    print(f'\nTotal Documents loaded: {len(all_documents)}')
    return all_documents

# Load every PDF under ../data/pdfs (the path is relative to the notebook's working directory).
all_pdf_documents = process_all_pdfs('../data/pdfs')

In [None]:
def split_documents(documents, chunk_size=500, chunk_overlap=100,
                    tokenizer_name="sentence-transformers/all-MiniLM-L6-v2"):
    """Split documents into overlapping chunks whose size is measured in tokens.

    Chunk length is computed with ``len(tokenizer.encode(text))`` rather than
    character count, so ``chunk_size`` and ``chunk_overlap`` are token counts of
    the chosen tokenizer.

    Args:
        documents: Documents to split (objects accepted by
            ``RecursiveCharacterTextSplitter.split_documents``).
        chunk_size: Maximum chunk length, in tokens.
        chunk_overlap: Overlap between consecutive chunks, in tokens.
        tokenizer_name: Name passed to ``AutoTokenizer.from_pretrained``; used
            only to measure chunk length.

    Returns:
        list: The chunk documents produced by the splitter (empty when nothing
        was produced).
    """
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

    def token_length(text):
        """Length of ``text`` in tokens, as seen by the splitter."""
        return len(tokenizer.encode(text))

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=token_length,
        separators=['\n\n', '\n', ' ', '']
    )

    split_docs = text_splitter.split_documents(documents=documents)
    print(f'Split {len(documents)} documents into {len(split_docs)} chunks')

    if split_docs:
        # Show the first chunk: a fixed index such as 2000 raises IndexError on
        # small corpora, and content/metadata should describe the same chunk.
        example = split_docs[0]
        print('Example chunk:')
        print(f'Content: {example.page_content[:200]}...')
        print(f'Metadata: {example.metadata}')

    return split_docs

# Chunk the loaded documents using split_documents' default chunk_size and chunk_overlap.
chunks = split_documents(documents=all_pdf_documents)

In [None]:
# Preview only the first few chunks; echoing the whole list floods the cell output.
chunks[:3]

In [None]:
class EmbeddingManager:
    """Wraps an ``OllamaEmbeddings`` client and returns embeddings as NumPy arrays."""

    def __init__(self, model_name: str = 'all-minilm:latest'):
        """Store the model name and build the embedding client immediately."""
        self.model_name = model_name
        self.model = None
        self._load_model()

    def _load_model(self):
        """Create the embedding client; report and re-raise any failure."""
        try:
            print(f'Loading embedding model: {self.model_name}')
            self.model = OllamaEmbeddings(model=self.model_name, num_ctx=512)
        except Exception as err:
            print(f'Error loading model {self.model_name}: {err}')
            raise
        else:
            print('Model loaded sucessfully')

    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
        """Embed ``texts`` and return the vectors as one array.

        Args:
            texts: Strings to embed.

        Returns:
            np.ndarray: ``np.array`` of whatever ``embed_documents`` returned.

        Raises:
            ValueError: If no model is set on the instance.
        """
        if not self.model:
            raise ValueError("Model not loaded")

        print(f'Generating embeddings for {len(texts)} texts...')
        vectors = np.array(self.model.embed_documents(texts))
        print(f'Generated embeddings with shape: {vectors.shape}')
        return vectors

# Build the shared embedding client with the class's default model name.
embedding_manager = EmbeddingManager()

In [None]:
# class EmbeddingManager:
    
#     def __init__(self, model_name: str = "all-MiniLM-L6-v2"):

#         self.model_name = model_name
#         self.model = None
#         self._load_model()

#     def _load_model(self):
#         try:
#             print(f"Loading embedding model: {self.model_name}")
#             self.model = SentenceTransformer(self.model_name)
#             print(f"Model loaded successfully. Embedding dimension: {self.model.get_sentence_embedding_dimension()}")
#         except Exception as e:
#             print(f"Error loading model {self.model_name}: {e}")
#             raise

#     def generate_embeddings(self, texts: List[str]) -> np.ndarray:

#         if not self.model:
#             raise ValueError("Model not loaded")
        
#         print(f"Generating embeddings for {len(texts)} texts...")
#         embeddings = self.model.encode(texts, show_progress_bar=True)
#         print(f"Generated embeddings with shape: {embeddings.shape}")
#         return embeddings

# embedding_manager=EmbeddingManager()

In [None]:
class VectorStore:
    """Persistent Chroma collection holding chunk texts, metadata and embeddings."""

    def __init__(self, collection_name: str = 'HP_Books', persist_directory: str = '../data/vector_store'):
        """Remember the collection settings and open (or create) the store.

        Args:
            collection_name: Name of the Chroma collection to get or create.
            persist_directory: Directory passed to ``chromadb.PersistentClient``;
                created if it does not exist.
        """
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initialize_store()

    def _initialize_store(self):
        """Create the persistent client and fetch/create the collection.

        Raises:
            Exception: Any error from directory creation or Chroma is logged
                and re-raised.
        """
        try:
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path=self.persist_directory)

            self.collection = self.client.get_or_create_collection(
                name=self.collection_name,
                metadata={'description': 'Harry Potter book embeddings for RAG'}
            )
            print(f'Vector store initialized. Collection: {self.collection_name}')
            print(f'Existing documents in collection: {self.collection.count()}')

        except Exception as e:
            print(f'Error initializing vector store: {e}')
            raise

    def add_documents(self, documents: List[Any], embeddings: np.ndarray, batch_size: int = 5000):
        """Add documents and their embeddings to the collection in batches.

        Every record gets a fresh random id, so calling this twice with the same
        input stores the records twice.

        Args:
            documents: Objects exposing ``page_content`` and ``metadata``.
            embeddings: One embedding per document, in the same order. A NumPy
                array or any sequence of numeric sequences is accepted.
            batch_size: Maximum number of records sent per ``collection.add``
                call. NOTE(review): the default of 5000 is presumably chosen to
                stay under Chroma's per-call limit; confirm against the
                installed version.

        Raises:
            ValueError: If the two inputs differ in length or ``batch_size`` is
                not positive.
            Exception: Any error raised by Chroma is logged and re-raised.
        """
        if len(documents) != len(embeddings):
            raise ValueError('Number of documents must match number of embeddings')
        if batch_size < 1:
            raise ValueError('batch_size must be a positive integer')

        print(f'Adding {len(documents)} documents to vector store...')

        ids = []
        metadatas = []
        documents_text = []
        embeddings_list = []

        for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
            ids.append(f'doc_{uuid.uuid4().hex[:8]}_{i}')

            # Copy so the caller's document metadata is not modified.
            metadata = dict(doc.metadata)
            metadata['doc_index'] = i
            metadata['content_length'] = len(doc.page_content)
            metadatas.append(metadata)

            documents_text.append(doc.page_content)

            # asarray() lets plain Python lists through as well as ndarray rows.
            embeddings_list.append(np.asarray(embedding).tolist())

        total_docs = len(documents)

        try:
            for batch_start in range(0, total_docs, batch_size):
                batch_end = min(batch_start + batch_size, total_docs)

                print(f" → Adding batch {batch_start} to {batch_end} ...")

                self.collection.add(
                    ids=ids[batch_start:batch_end],
                    embeddings=embeddings_list[batch_start:batch_end],
                    metadatas=metadatas[batch_start:batch_end],
                    documents=documents_text[batch_start:batch_end]
                )

            print(f'Sucessfully added {len(documents)} documents to vector store')
            print(f'Total documents in collection: {self.collection.count()}')
        except Exception as e:
            print(f'Error adding documents to vector store: {e}')
            raise

# Open (or create) the collection using VectorStore's default name and persist directory.
vector_store = VectorStore()

In [None]:
# vector_store.client.delete_collection(name='HP_Books')

In [None]:
# texts = [doc.page_content for doc in chunks]

# embeddings = embedding_manager.generate_embeddings(texts=texts)

In [None]:
# np.savez('../data/embeddings_nomic_500_tokens.npz', embeddings)
# embeddings = np.load('../data/embeddings_minilm_500_tokens.npz')['arr_0']

In [None]:
# vector_store.add_documents(documents=chunks, embeddings=embeddings)

In [None]:
class RAGRetriever:
    """Embeds a query, fetches nearest chunks from the vector store, optionally reranks them."""

    # Annotations are quoted so the class can be defined regardless of whether
    # the cells defining VectorStore / EmbeddingManager have already run.
    def __init__(self, vector_store: 'VectorStore', embedding_manager: 'EmbeddingManager',
                 reranker_model_name: str = 'cross-encoder/ms-marco-MiniLM-L6-v2'):
        """Store the collaborators; the reranker itself is created on first use.

        Args:
            vector_store: Object exposing a ``collection`` with a ``query`` method.
            embedding_manager: Object exposing ``generate_embeddings(texts)``.
            reranker_model_name: Model name handed to ``CrossEncoder`` when
                reranking is first requested.
        """
        self.embedding_manager = embedding_manager
        self.vector_store = vector_store
        self.reranker_model_name = reranker_model_name
        self._reranker = None

    def _get_reranker(self):
        """Return the cross-encoder, constructing it once and reusing it afterwards."""
        if self._reranker is None:
            self._reranker = CrossEncoder(model_name_or_path=self.reranker_model_name)
        return self._reranker

    def retrieve(self, query: str, top_k: int = 50, score_threshold: float = 0.0, rerank: bool = True, top_n: int = 10) -> List[Dict[str, Any]]:
        """Return the chunks most relevant to ``query``.

        Args:
            query: Natural-language question.
            top_k: Number of nearest neighbours requested from the collection.
            score_threshold: Minimum ``similarity_score`` (``1 - distance``) a
                hit needs in order to be returned.
            rerank: When true, score every hit with the cross-encoder, sort by
                that score (highest first) and keep the best ``top_n``.
            top_n: Number of hits kept after reranking; unused when ``rerank``
                is false.

        Returns:
            A list of dicts with keys ``id``, ``content``, ``metadata``,
            ``similarity_score``, ``rerank_score`` (``None`` without reranking)
            and ``rank``. ``rank`` is the 1-based position before threshold
            filtering, so it may have gaps. Any error raised by the query or
            the reranker is printed and an empty list is returned.
        """
        print(f'Retrieving documents for query: {query}')
        print(f'Top K: {top_k}, Score threshold: {score_threshold}')

        query_embedding = self.embedding_manager.generate_embeddings([query])[0]

        try:
            results = self.vector_store.collection.query(
                query_embeddings=[query_embedding.tolist()],
                n_results=top_k
            )

            retrieved_docs = []

            if results['documents'] and results['documents'][0]:
                documents = results['documents'][0]
                metadatas = results['metadatas'][0]
                distances = results['distances'][0]
                ids = results['ids'][0]

                if rerank:
                    # Reuse the cached model: building a CrossEncoder on every
                    # call reloads the weights for each query.
                    scores = self._get_reranker().predict(
                        sentences=[(query, doc) for doc in documents]
                    )
                    # Plain floats keep the returned dicts free of NumPy scalars.
                    scored = zip(ids, documents, metadatas, distances, (float(s) for s in scores))
                    candidates = sorted(scored, key=lambda c: c[4], reverse=True)[:top_n]
                else:
                    candidates = [
                        (doc_id, doc, meta, dist, None)
                        for doc_id, doc, meta, dist in zip(ids, documents, metadatas, distances)
                    ]

                for i, (doc_id, document, metadata, distance, rerank_score) in enumerate(candidates):
                    # NOTE(review): 1 - distance is a cosine similarity only if
                    # the collection uses cosine distance; confirm the metric
                    # the collection was created with.
                    similarity_score = 1 - distance

                    if similarity_score >= score_threshold:
                        retrieved_docs.append({
                            'id': doc_id,
                            'content': document,
                            'metadata': metadata,
                            'similarity_score': similarity_score,
                            'rerank_score': rerank_score,
                            'rank': i + 1
                        })
                print(f'Retrieved {len(retrieved_docs)} documents (after filtering)')
            else:
                print('No documents found')

            return retrieved_docs

        except Exception as e:
            # Best-effort: report the failure and hand back no results.
            print(f'Error during retrievel: {e}')
            return []
        
# Wire the retriever to the vector store and embedding manager created in earlier cells.
rag_retriever = RAGRetriever(vector_store=vector_store, embedding_manager=embedding_manager)

In [None]:
# Smoke-test retrieval: fetch 50 neighbours, rerank them and keep the best 10.
# score_threshold=-1 lowers the cut-off below the default 0.0, so hits whose
# `1 - distance` falls between -1 and 0 are kept as well.
retrieved_docs = rag_retriever.retrieve(
                                            query="How did Harry defeat Lord Voldemort?",
                                            top_k=50,
                                            rerank=True,
                                            top_n=10,
                                            score_threshold=-1
                                        )

In [None]:
# Print the text of each retrieved chunk, in the order the retriever returned them.
for doc in retrieved_docs:
    print(doc['content'])

In [None]:
# Chat model used by all answer-generation helpers below.
# NOTE(review): temperature=1 presumably makes answers vary between runs; consider
# lowering it if reproducible, context-bound answers are wanted - confirm intent.
llm = ChatOllama(model='mistral:latest',temperature=1, num_predict=1024)

def rag_simple(query, retriever, llm, top_k=5):
    """Answer ``query`` from retrieved context with a single LLM call.

    Args:
        query: The question to answer.
        retriever: Object whose ``retrieve(query, top_k=...)`` returns dicts
            carrying a ``content`` key.
        llm: Object whose ``invoke(messages)`` returns something with ``.content``.
        top_k: Forwarded to ``retriever.retrieve``.

    Returns:
        The LLM's answer text, or a fixed message when the retrieved context is
        empty.
    """
    hits = retriever.retrieve(query, top_k=top_k)
    context = '\n\n'.join(hit['content'] for hit in hits) if hits else ""
    if context == "":
        return 'No relevant context found to answer the question'

    # The template is assembled line by line so its indentation is explicit.
    pad = ' ' * 16
    template = '\n'.join([
        'Answer the following question consisely by strictly using the provided context:',
        pad + 'Question: {query}',
        pad,
        pad + 'Context: {context}',
        '',
        pad + 'Answer: ',
    ])

    filled_prompt = template.format(context=context, query=query)
    reply = llm.invoke([filled_prompt])
    return reply.content

In [None]:
# Ask one question through the simple pipeline (rag_simple's default top_k applies).
answer = rag_simple(
    query="Why did Voldemort try to kill Harry when he was a child?",
    retriever=rag_retriever,
    llm=llm
)

In [None]:
# Display the answer produced by rag_simple.
answer

In [None]:
def rag_advanced(query, retriever, llm, top_k=5, min_score=0.0, return_context=False):
    """Answer ``query`` and report which sources backed the answer.

    Args:
        query: The question to answer.
        retriever: Object whose ``retrieve(query=, top_k=, score_threshold=)``
            returns dicts with ``content``, ``metadata`` and ``similarity_score``.
        llm: Object whose ``invoke(messages)`` returns something with ``.content``.
        top_k: Forwarded to the retriever.
        min_score: Forwarded to the retriever as ``score_threshold``.
        return_context: When true, include the joined context text in the result.

    Returns:
        dict with ``answer``, ``sources`` and ``confidence`` (the highest
        similarity score among the hits), plus ``context`` when requested. When
        nothing is retrieved, a fixed fallback dict (which always carries an
        empty ``context``) is returned.
    """
    hits = retriever.retrieve(query=query, top_k=top_k, score_threshold=min_score)
    if not hits:
        return {'answer': 'No relevant content found.', 'sources': [], 'confidence': 0.0, 'context': ''}

    context = '\n\n'.join(hit['content'] for hit in hits)

    sources = []
    for hit in hits:
        meta = hit['metadata']
        sources.append({
            'source': meta.get('source_file', meta.get('source', 'unknown')),
            'page': meta.get('page', 'unknown'),
            'score': hit['similarity_score'],
            'preview': hit['content'][:300] + '...'
        })

    confidence = max(hit['similarity_score'] for hit in hits)

    prompt_text = (
        "Use the following context to answer the question concisely.\n"
        f"Context: {context}\n"
        f"Question: {query}\n"
        "Answer: "
    )
    reply = llm.invoke([prompt_text])

    output = dict(answer=reply.content, sources=sources, confidence=confidence)
    if return_context:
        output['context'] = context
    return output

In [None]:
# Run the source-reporting pipeline and keep the context text so it can be printed below.
output = rag_advanced(
    query="How many horcruxes did Voldemort create in total?",
    retriever=rag_retriever,
    llm=llm,
    top_k=5,
    min_score=0.0,
    return_context=True
    )

In [None]:
# Show each field of the rag_advanced result ('context' is present because return_context=True).
print('Answer:', output['answer'])
print('Sources:', output['sources'])
print('Confidence:', output['confidence'])
print('Context:', output['context'])

In [None]:
class AdvancedRAGPipeline:
    """RAG pipeline that adds citations, an optional summary and a query history."""

    def __init__(self, retriever, llm):
        """Store the collaborators and start with an empty history.

        Args:
            retriever: Object whose ``retrieve(question, top_k=, score_threshold=)``
                returns dicts with ``content``, ``metadata`` and ``similarity_score``.
            llm: Object whose ``invoke(messages)`` returns something with ``.content``.
        """
        self.retriever = retriever
        self.llm = llm
        self.history = []

    def query(self, question: str, top_k: int = 5, min_score: float = 0.2, summarize: bool = False) -> Dict[str, Any]:
        """Answer ``question`` from retrieved context.

        Args:
            question: The question to answer.
            top_k: Forwarded to the retriever.
            min_score: Forwarded to the retriever as ``score_threshold``.
            summarize: When true and an answer was generated from context, ask
                the LLM for a two-sentence summary of that answer.

        Returns:
            dict with ``question``, ``answer`` (with a citation list appended
            when sources exist), ``sources``, ``summary`` (``None`` unless one
            was produced) and ``history`` (a snapshot of all queries so far,
            including this one).
        """
        results = self.retriever.retrieve(question, top_k=top_k, score_threshold=min_score)

        summary = None
        if not results:
            # Nothing retrieved: return the fallback without calling the LLM.
            # Summarising this fixed message would only waste a model call.
            answer = "No relevant context found."
            sources = []
            answer_with_citations = answer
        else:
            context = "\n\n".join([doc['content'] for doc in results])
            sources = [{
                'source': doc['metadata'].get('source_file', doc['metadata'].get('source', 'unknown')),
                'page': doc['metadata'].get('page', 'unknown'),
                'score': doc['similarity_score'],
                'preview': doc['content'][:120] + '...'
            } for doc in results]

            prompt = """Use the following context to answer the question concisely.\nContext:\n{context}\n\nQuestion: {question}\n\nAnswer:"""
            response = self.llm.invoke([prompt.format(context=context, question=question)])
            answer = response.content

            citations = [f"[{i+1}] {src['source']} (page {src['page']})" for i, src in enumerate(sources)]
            answer_with_citations = answer + "\n\nCitations:\n" + "\n".join(citations)

            # Optionally summarize the generated answer.
            if summarize and answer:
                summary_prompt = f"Summarize the following answer in 2 sentences:\n{answer}"
                summary = self.llm.invoke([summary_prompt]).content

        # The history keeps the raw answer, without the citation block.
        self.history.append({
            'question': question,
            'answer': answer,
            'sources': sources,
            'summary': summary
        })

        return {
            'question': question,
            'answer': answer_with_citations,
            'sources': sources,
            'summary': summary,
            # Copy so a result already handed out does not grow on later queries.
            'history': list(self.history)
        }

In [None]:
# Run the class-based pipeline once with summarisation enabled and print the
# cited answer and its summary. min_score=0.0 overrides query()'s default of 0.2.
adv_rag = AdvancedRAGPipeline(rag_retriever, llm)
result = adv_rag.query("How many horcruxes did Voldemort create in total?", top_k=5, min_score=0.0, summarize=True)
print("\nFinal Answer:", result['answer'])
print("Summary:", result['summary'])
# print("History:", result['history'][-1])