In [None]:
import logging
import numpy as np
import pandas as pd
from typing import List, Dict, Any
from sklearn.mixture import GaussianMixture

from langchain.chains.llm import LLMChain
from langchain.vectorstores import FAISS
from langchain.schema import AIMessage
from langchain.prompts import ChatPromptTemplate
from langchain.docstore.document import Document
from langchain_ollama import OllamaEmbeddings, OllamaLLM
from langchain_community.document_loaders import PyPDFLoader
from langchain.retrievers.document_compressors import LLMChainExtractor

from langchain.retrievers import ContextualCompressionRetriever

In [29]:
# Ollama model names, kept together so they can be swapped in one place.
EMBEDDING_MODEL_NAME = "nomic-embed-text"
LLM_MODEL_NAME = "llama3"

# Embedding model: used by embed_texts and by the FAISS vector store.
embedding_model = OllamaEmbeddings(model=EMBEDDING_MODEL_NAME)
# LLM: used for cluster summaries, contextual compression and the final answer.
llm = OllamaLLM(model=LLM_MODEL_NAME)


# Helper Functions

In [30]:
def extract_text(item):
    """Return the plain content of ``item``.

    ``AIMessage`` instances are unwrapped to their ``content`` attribute;
    any other value is handed back unchanged.
    """
    return item.content if isinstance(item, AIMessage) else item

def embed_texts(texts: List[str]) -> List[List[float]]:
    """Embed texts with the module-level ``embedding_model`` (Ollama).

    Args:
        texts: Strings to embed. ``AIMessage`` objects are also accepted and
            are unwrapped with ``extract_text`` first.

    Returns:
        Whatever ``embedding_model.embed_documents`` returns for the unwrapped
        texts. An empty input yields an empty list without calling the
        embedding model.
    """
    print(f"Embedding {len(texts)} texts")
    if not texts:
        # Nothing to embed; skip the call to the embedding backend.
        return []
    return embedding_model.embed_documents([extract_text(text) for text in texts])

def perform_clustering(embeddings: np.ndarray, n_clusters: int = 10) -> np.ndarray:
    """Cluster embeddings with a Gaussian Mixture Model.

    Args:
        embeddings: Array-like of embedding vectors, one row per text.
        n_clusters: Requested number of mixture components. It is clamped to
            the range [1, number of rows] before fitting.

    Returns:
        Array holding one integer cluster label per row. With fewer than two
        rows no model is fitted and every row is assigned to cluster 0.
    """
    embeddings = np.asarray(embeddings)
    n_samples = embeddings.shape[0] if embeddings.ndim > 0 else 0

    if n_samples < 2:
        # Too few rows to fit a mixture model; everything belongs to cluster 0.
        print(f"Performing clustering with {n_samples} clusters")
        return np.zeros(n_samples, dtype=int)

    # A mixture cannot have fewer than one or more components than samples.
    n_clusters = max(1, min(n_clusters, n_samples))
    print(f"Performing clustering with {n_clusters} clusters")
    gm = GaussianMixture(n_components=n_clusters, random_state=42)
    return gm.fit_predict(embeddings)

def summarize_texts(texts: List[str]) -> str:
    """Summarize a list of texts with the module-level ``llm`` (Ollama).

    The texts are joined into a single passage, separated by blank lines,
    before being placed in the prompt. Passing the list itself would put its
    Python repr (brackets, quotes, escaped newlines) in front of the model.

    Args:
        texts: Texts to summarize together. ``AIMessage`` objects are
            unwrapped with ``extract_text``.

    Returns:
        The output of the summarization chain, or an empty string when
        ``texts`` is empty (the LLM is not called in that case).
    """
    print(f"Summarizing {len(texts)} texts")
    if not texts:
        return ""

    prompt = ChatPromptTemplate.from_template(
        "Summarize the following text concisely:\n\n{text}"
    )
    chain = prompt | llm
    combined_text = "\n\n".join(str(extract_text(text)) for text in texts)
    return chain.invoke({"text": combined_text})

# RAPTOR Core Function

In [31]:
def build_raptor_tree(
    texts: List[str], max_levels: int = 3, max_clusters: int = 10
) -> Dict[int, pd.DataFrame]:
    """Build the RAPTOR tree with level metadata and parent-child links.

    Level 0 holds the input texts. Each further level holds one summary per
    cluster of the level below. Every node's metadata carries an ``id``;
    summaries list their children in ``child_ids`` and children point back
    through ``parent_id``.

    Args:
        texts: Leaf texts (strings or ``AIMessage`` objects).
        max_levels: Maximum number of summary levels built above the leaves.
        max_clusters: Upper bound on the number of clusters per level; the
            actual count is also capped at half the number of nodes.

    Returns:
        Mapping of level number to a DataFrame with the columns ``text``,
        ``embedding``, ``cluster`` and ``metadata``. The top level is always
        included, whether the tree collapsed to a single summary or reached
        ``max_levels``. Empty input returns an empty dict.
    """
    if not texts:
        return {}

    results: Dict[int, pd.DataFrame] = {}
    current_texts = [extract_text(text) for text in texts]
    # Leaves need an id too, otherwise level-1 summaries cannot reference them.
    current_metadata = [
        {"level": 0, "origin": "original", "parent_id": None, "id": f"doc_{i}"}
        for i in range(len(current_texts))
    ]

    for level in range(1, max_levels + 1):
        print(f"Processing level {level}")

        embeddings = embed_texts(current_texts)
        if len(current_texts) < 2:
            # A single node cannot be clustered; treat it as its own cluster.
            cluster_labels = np.zeros(len(current_texts), dtype=int)
        else:
            n_clusters = max(1, min(max_clusters, len(current_texts) // 2))
            cluster_labels = perform_clustering(np.array(embeddings), n_clusters)

        df = pd.DataFrame({
            'text': current_texts,
            'embedding': embeddings,
            'cluster': cluster_labels,
            'metadata': current_metadata
        })
        results[level - 1] = df

        summaries = []
        new_metadata = []
        for cluster in df['cluster'].unique():
            cluster_docs = df[df['cluster'] == cluster]
            child_metadata = cluster_docs['metadata'].tolist()
            summary_id = f"summary_{level}_{cluster}"

            # The dicts are the same objects stored in ``df``, so the stored
            # children pick up their parent link as well.
            for meta in child_metadata:
                meta["parent_id"] = summary_id

            summary = summarize_texts(cluster_docs['text'].tolist())
            summaries.append(extract_text(summary))
            new_metadata.append({
                "level": level,
                "origin": f"summary_of_cluster_{cluster}_level_{level-1}",
                "parent_id": None,
                "child_ids": [meta.get('id') for meta in child_metadata],
                "id": summary_id
            })

        current_texts = summaries
        current_metadata = new_metadata

        collapsed = len(current_texts) <= 1
        if collapsed or level == max_levels:
            # Store the top level here: no later iteration will do it.
            results[level] = pd.DataFrame({
                'text': current_texts,
                'embedding': embed_texts(current_texts),
                'cluster': [0] * len(current_texts),
                'metadata': current_metadata
            })
            if collapsed:
                print(f"Stopping at level {level} as we have only one summary")
            break

    return results

# Vector Store

In [32]:

def build_vectorstore(tree_results: Dict[int, pd.DataFrame]) -> FAISS:
    """Build a FAISS vectorstore from all texts in the RAPTOR tree.

    The embeddings already stored in the tree are reused, so no text is
    embedded a second time.

    Args:
        tree_results: Mapping of level to a DataFrame with ``text``,
            ``embedding`` and ``metadata`` columns, as produced by
            ``build_raptor_tree``.

    Returns:
        A FAISS store containing one entry per tree node, with the node's
        metadata attached.

    Raises:
        ValueError: If the tree contains no texts.
    """
    all_texts = []
    all_embeddings = []
    all_metadatas = []

    for df in tree_results.values():
        all_texts.extend(str(text) for text in df['text'].tolist())
        # Normalise numpy vectors to plain lists of floats.
        all_embeddings.extend(
            embedding.tolist() if isinstance(embedding, np.ndarray) else embedding
            for embedding in df['embedding'].tolist()
        )
        all_metadatas.extend(df['metadata'].tolist())

    if not all_texts:
        raise ValueError("Cannot build a vector store from an empty RAPTOR tree")

    # embedding_model is still passed so the store can embed incoming queries.
    return FAISS.from_embeddings(
        text_embeddings=list(zip(all_texts, all_embeddings)),
        embedding=embedding_model,
        metadatas=all_metadatas,
    )


In [33]:
# ContextualCompressionRetriever:
# It's a wrapper retriever that improves the quality of retrieved documents by compressing (shortening / filtering) them before returning them.
    
def create_retriever(vectorstore: FAISS) -> ContextualCompressionRetriever:
    """Wrap the vector store's default retriever with an LLM-based extractor.

    The extractor is built from the module-level ``llm`` and a prompt that
    asks for only the information relevant to the question.
    """
    logging.info("Creating contextual compression retriever")

    extraction_template = (
        "Given the following context and question, extract only the relevant information for answering the question:\n\n"
        "Context: {context}\n"
        "Question: {question}\n\n"
        "Relevant Information:"
    )
    compressor = LLMChainExtractor.from_llm(
        llm, prompt=ChatPromptTemplate.from_template(extraction_template)
    )

    return ContextualCompressionRetriever(
        base_compressor=compressor,
        base_retriever=vectorstore.as_retriever(),
    )

# RAPTOR Query Process (Online Process)

In [None]:
def _make_level_filter(level: int, allowed_ids: frozenset):
    """Build a metadata predicate for one tree level, optionally limited to given ids."""
    def _matches(meta) -> bool:
        # .get() tolerates metadata that lacks 'level' or 'id'.
        if meta.get('level') != level:
            return False
        return not allowed_ids or meta.get('id') in allowed_ids
    return _matches


def hierarchical_retrieval(query: str, retriever: ContextualCompressionRetriever, max_level: int) -> List[Document]:
    """Retrieve documents level by level, from ``max_level`` down to 0.

    When documents found on one level list ``child_ids``, retrieval on the
    next level down is restricted to those children. The restriction is
    applied through the metadata filter. The query text is never modified,
    because query syntax appended to it would only distort the text that
    gets embedded for the similarity search.

    Args:
        query: The user question, used unchanged on every level.
        retriever: Retriever queried once per level.
        max_level: Highest tree level to start from.

    Returns:
        All retrieved documents, highest level first.
    """
    all_retrieved_docs = []
    # Ids the current level must match; empty means "no restriction".
    allowed_ids = frozenset()

    for level in range(max_level, -1, -1):
        # NOTE(review): assumes the retriever forwards ``filter`` to the FAISS
        # search -- confirm for the installed LangChain version.
        level_docs = retriever.get_relevant_documents(
            query,
            filter=_make_level_filter(level, allowed_ids)
        )
        all_retrieved_docs.extend(level_docs)

        # Children of this level's hits constrain the next level down. Missing
        # or None ids are skipped, so a level without usable ids leaves the
        # next level unrestricted.
        allowed_ids = frozenset(
            child_id
            for doc in level_docs
            for child_id in (doc.metadata.get('child_ids') or [])
            if child_id is not None
        )

    return all_retrieved_docs
    
def raptor_query(query: str, retriever: ContextualCompressionRetriever, max_level: int) -> Dict[str, Any]:
    """Answer a query with the RAPTOR system using hierarchical retrieval.

    Args:
        query: The user question.
        retriever: Retriever passed on to ``hierarchical_retrieval``.
        max_level: Highest tree level to start retrieval from.

    Returns:
        Dict with the keys ``query``, ``retrieved_documents`` (one detail dict
        per document), ``num_docs_retrieved``, ``context_used``, ``answer``
        and ``model_used``.
    """
    logging.info(f"Processing query: {query}")

    relevant_docs = hierarchical_retrieval(query, retriever, max_level)

    doc_details = [
        {
            "index": i,
            "content": doc.page_content,
            "metadata": doc.metadata,
            "level": doc.metadata.get('level', 'Unknown'),
            "similarity_score": doc.metadata.get('score', 'N/A')
        }
        for i, doc in enumerate(relevant_docs, 1)
    ]
    context = "\n\n".join(doc.page_content for doc in relevant_docs)

    prompt = ChatPromptTemplate.from_template(
        "Given the following context, please answer the question:\n\n"
        "Context: {context}\n\n"
        "Question: {question}\n\n"
        "Answer:"
    )
    # Same ``prompt | llm`` composition as summarize_texts, replacing the
    # legacy LLMChain.run call.
    chain = prompt | llm
    answer = extract_text(chain.invoke({"context": context, "question": query}))

    logging.info("Query processing completed")

    return {
        "query": query,
        "retrieved_documents": doc_details,
        "num_docs_retrieved": len(relevant_docs),
        "context_used": context,
        "answer": answer,
        # Report the configured model, falling back to the original label.
        "model_used": getattr(llm, "model", "llama3"),
    }

def print_query_details(result: Dict[str, Any]):
    """Print a report of the query process, including tree level metadata.

    Args:
        result: Dict with the keys ``query``, ``num_docs_retrieved``,
            ``retrieved_documents``, ``context_used``, ``answer`` and
            ``model_used``. Each retrieved document needs ``index``,
            ``content``, ``similarity_score`` and ``metadata``.
    """
    print(f"Query: {result['query']}")
    print(f"\nNumber of documents retrieved: {result['num_docs_retrieved']}")
    print("\nRetrieved Documents:")
    for doc in result['retrieved_documents']:
        metadata = doc['metadata']
        print(f"  Document {doc['index']}:")
        print(f"    Content: {doc['content'][:100]}...")  # Show first 100 characters
        print(f"    Similarity Score: {doc['similarity_score']}")
        print(f"    Tree Level: {metadata.get('level', 'Unknown')}")
        print(f"    Origin: {metadata.get('origin', 'Unknown')}")
        # Summary nodes store their children under 'child_ids'; the older
        # 'child_docs' key is still honoured if present.
        children = metadata.get('child_ids', metadata.get('child_docs'))
        if children is not None:
            print(f"    Number of Child Documents: {len(children)}")
        print()

    print("\nContext used for answer generation:")
    print(result['context_used'])

    print("\nGenerated Answer:")
    print(result['answer'])

    print(f"\nModel Used: {result['model_used']}")

In [34]:
# Location of the source PDF (a relative path).
path = "../data/Understanding_Climate_Change.pdf"

In [35]:
# Load the PDF and collect the text content of every loaded document;
# these texts are the inputs to the RAPTOR tree.
loader = PyPDFLoader(path)
documents = loader.load()
texts = [doc.page_content for doc in documents]

In [None]:
# Build the RAPTOR tree from the extracted texts (default max_levels=3).
# The result maps each level number to a DataFrame of that level's nodes.
tree_results = build_raptor_tree(texts)

# # Write tree_results dict to a text file
# with open("tree_results.txt", "w") as f:
#     for level, df in tree_results.items():
#         f.write(f"--- Level {level} ---\n")
#         f.write(df.to_string(index=False))  # write DataFrame as string
#         f.write("\n\n")


In [37]:
# Build a single FAISS vectorstore covering every level of the tree
vectorstore = build_vectorstore(tree_results)

In [38]:
# Create the contextual-compression retriever on top of the vectorstore
retriever = create_retriever(vectorstore)

In [53]:
# Run the pipeline
# Start at the highest level actually present in the tree instead of a
# hard-coded depth, so no retrieval pass targets a level that was never built.
max_level = max(tree_results)
query = "What are the Innovative Adaptation Strategies?"
result = raptor_query(query, retriever, max_level)
print_query_details(result)

Query: What are the Innovative Adaptation Strategies?

Number of documents retrieved: 4

Retrieved Documents:
  Document 1:
    Content: According to the context, the Innovative Adaptation Strategies mentioned are:

1. Climate-resilient ...
    Similarity Score: N/A
    Tree Level: 0
    Origin: original

  Document 2:
    Content: Based on the provided context, the relevant information for answering the question "What are the Inn...
    Similarity Score: N/A
    Tree Level: 0
    Origin: original

  Document 3:
    Content: The relevant information for answering the question is:

* Community-based solutions (e.g., communit...
    Similarity Score: N/A
    Tree Level: 0
    Origin: original

  Document 4:
    Content: According to the context, there is no mention of "Innovative Adaptation Strategies". However, some r...
    Similarity Score: N/A
    Tree Level: 0
    Origin: original


Context used for answer generation:
According to the context, the Innovative Adaptation Strategies me

# 📚 Deciding Tree Length in RAPTOR Implementations

In RAPTOR (hierarchical chunking + embeddings), **tree length** (depth) determines how many levels of summaries you create.

---

### 🔎 What Tree Length Means
- **Leaf level** → raw text chunks (e.g., 500–1000 tokens).  
- **Intermediate levels** → summaries of sibling chunks.  
- **Root level** → one summary representing the entire document.  

The **tree depth** defines how many summarization layers exist above the raw chunks.

---

### ⚖️ How to Decide Tree Length
1. **Document size**
   - Small (<5k tokens) → Depth **1–2**
   - Medium (5k–50k tokens) → Depth **2–3**
   - Large (>50k tokens) → Depth **3–4**

2. **Retrieval needs**
   - Fine-grained retrieval → Shallower (2 levels)  
   - Broader semantic context → Deeper (3–4 levels)  

3. **Compute vs. Storage**
   - More depth = More LLM calls + More embeddings.  
   - Rarely useful beyond **depth = 4**.

---

### 🔧 Rule of Thumb
- Depth = **2** → Enough for most RAG use cases  
- Depth = **3** → For large hierarchical docs (books, legal, multi-chapter reports)  
- Depth > **4** → Usually not worth the cost  

---

### 🐍 Example: Dynamic Depth Selection in Python

```python
def decide_tree_depth(token_count: int) -> int:
    """
    Pick the RAPTOR tree depth for a document from its token count.

    Documents under 5,000 tokens get depth 2, documents under 50,000 tokens
    get depth 3, and anything larger gets depth 4.
    """
    # (exclusive upper bound on tokens, depth), checked from smallest to largest
    size_tiers = (
        (5_000, 2),    # chunks + summaries
        (50_000, 3),   # chunks + mid-level + root
    )
    for upper_bound, tier_depth in size_tiers:
        if token_count < upper_bound:
            return tier_depth
    return 4           # very large docs

# Example usage: 32,000 tokens is below the 50,000 threshold, so depth 3 is chosen
doc_tokens = 32000
depth = decide_tree_depth(doc_tokens)
print(f"Recommended tree depth: {depth}")
