# RAPTOR: Recursive Abstractive Processing for Tree-Organized Retrieval

In [1]:
## python=3.9
#!pip install --upgrade sentence_transformers

In [2]:
#!pip install pandas
#!pip install PyPDF2
#!pip install ragas
#!pip install termcolor

In [1]:
# NOTE: An OpenAI API key must be set here for application initialization, even if not in use.
# If you're not utilizing OpenAI models, assign a placeholder string (e.g., "not_used").
import os
try:
    from .utils import OPENAI_API_KEY
except:
    path = os.getcwd()
    os.chdir('/home/oleg/SIT/LLM')
    #print(os.getcwd())
    from utils import OPENAI_API_KEY
    os.chdir(path)


os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY

#os.environ["OPENAI_API_KEY"] = "your-openai-key"

1) **Building**: RAPTOR recursively embeds, clusters, and summarizes chunks of text to construct a tree with varying levels of summarization from the bottom up. You can create a tree from the text in 'sample.txt' using `RA.add_documents(text)`.

2) **Querying**: At inference time, the RAPTOR model retrieves information from this tree, integrating data across lengthy documents at different abstraction levels. You can perform queries on the tree with `RA.answer_question`.

### Building the tree

In [2]:
#os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"
from raptor import RetrievalAugmentation, RetrievalAugmentationConfig

2024-12-10 18:07:03,928 - PyTorch version 2.5.1 available.
2024-12-10 18:07:04,160 - Loading faiss with AVX2 support.
2024-12-10 18:07:04,176 - Successfully loaded faiss with AVX2 support.


### Apply to ai4mat articles

In [3]:
pdf_list = ["/home/oleg/SIT/LLM/data/data_rag/s41524-023-01062-z.pdf",
    "/home/oleg/SIT/LLM//data/data_rag/s41699-023-00369-1.pdf"
           ]
         #   "https://github.com/HSE-LAMBDA/ai4material_design/tree/main/docs/CONSTRUCTOR-MOCK.md"
         #   "https://github.com/HSE-LAMBDA/ai4material_design/blob/main/docs/CONSTRUCTOR.md",
         #   "https://github.com/HSE-LAMBDA/ai4material_design/blob/main/docs/DATA.md",
         #   "https://github.com/HSE-LAMBDA/ai4material_design/blob/main/docs/ENVIRONMENT.md",
         #   "https://github.com/HSE-LAMBDA/ai4material_design/blob/main/docs/GENERATING-CONSTRUCTOR.md",
         #   "https://github.com/HSE-LAMBDA/ai4material_design/blob/main/docs/GENERATING-MOCK.md",
         #   "https://github.com/HSE-LAMBDA/ai4material_design/blob/main/docs/PILOT.md",
         #   "https://github.com/HSE-LAMBDA/ai4material_design/blob/main/docs/SPARSE-PAPER.md"
          #  "https://www.nature.com/articles/s41377-024-01407-3",
          #  "https://www.nature.com/articles/s41565-023-01407-1",
          #  "https://www.nature.com/articles/s41699-023-00369-1",
           
           

In [4]:
from raptor import BaseSummarizationModel, BaseQAModel
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_core.output_parsers import JsonOutputParser

class CustomSummarizationModel(BaseSummarizationModel):
    def __init__(self, model="gpt-4o-mini", **kwargs):
        super().__init__( **kwargs)
        self.llm = ChatOpenAI(model=model, temperature=0.) 
        
        pass

    def summarize(self, context, max_tokens=100):
        prompt = PromptTemplate.from_template("""
You are an expert in analyzing scientific articles.
You are given "Text":
###
{context}
--------------
Your task is to give a short summary of "Text" of no more than {max_tokens}.
Output is a string format.
"""
                                             )
        # Return the summary as a string
        agent = prompt | self.llm 
        summary = agent.invoke({'context': context, 'max_tokens':max_tokens})
        return summary.content


class CustomQAModel(BaseQAModel):
    def __init__(self, model="gpt-4o-mini", **kwargs):
        super().__init__(**kwargs)
        self.llm = ChatOpenAI(model=model, temperature=0.)
        
        pass

    def answer_question(self, context, question):
         # Define prompt
        prompt = PromptTemplate.from_template("""
    Human: You are an AI assistant , an expert in machine learning and the quantum physics. 
    You are able to find answers to the questions from the contextual passage snippets provided.
    \n
    Use the following pieces of information enclosed in <context> tags 
    to provide an answer to the question enclosed in <question> tags.
    <context> : {context}
    
    <question> :{question}
    
    if you do not know the answer, just say that you don't know. Do not make up an answer.
    """
                                             )
        agent = prompt | self.llm 
        answer = agent.invoke({'context': context, 'question':question})
        return answer.content

In [5]:
su = CustomSummarizationModel()
su.__dict__

{'llm': ChatOpenAI(client=<openai.resources.chat.completions.Completions object at 0x7fd84d6b5790>, async_client=<openai.resources.chat.completions.AsyncCompletions object at 0x7fd84d6be940>, root_client=<openai.OpenAI object at 0x7fd9880ac1f0>, root_async_client=<openai.AsyncOpenAI object at 0x7fd84d6b57f0>, model_name='gpt-4o-mini', temperature=0.0, model_kwargs={}, openai_api_key=SecretStr('**********'))}

In [31]:
WORKING_DIR = "./ai4mat"



if not os.path.exists(WORKING_DIR):
    os.mkdir(WORKING_DIR)

mode = True
if mode:
    RA_conf = RetrievalAugmentationConfig(
        #tree_builder_config=None,
        #tree_retriever_config=None,  # Change from default instantiation
        qa_model=CustomQAModel(),
        #embedding_model=None,
        summarization_model=CustomSummarizationModel(),
        #tree_builder_type="cluster",
        # New parameters for TreeRetrieverConfig and TreeBuilderConfig
        # TreeRetrieverConfig arguments
        #tr_tokenizer=None,
        #tr_threshold=0.5,
        #tr_top_k=10,
        #tr_selection_mode="top_k",
        #tr_context_embedding_model="OpenAI",
        #tr_embedding_model=None,
        #tr_num_layers=3,
        #tr_start_layer=None,
        # TreeBuilderConfig arguments
        #tb_tokenizer=None,
        tb_max_tokens=300,
        tb_num_layers=3,
        tb_threshold=0.8,
        #tb_top_k=10,
        #tb_selection_mode="top_k",
        tb_summarization_length=300,
        #tb_summarization_model=CustomSummarizationModel(),
        #tb_embedding_models=None,
        #tb_cluster_embedding_model="OpenAI",
    )
    RA = RetrievalAugmentation(RA_conf)
    #RA.tree_builder.max_tokens = 1000
    #RA.tree_builder.summarization_length = 1000
    #RA.tree_builder.top_k = 10
    #RA.tree_builder.summarization_model = CustomSummarizationModel()
    
    
else:    
    RA = RetrievalAugmentation()



2024-12-10 18:13:30,499 - Successfully initialized TreeBuilder with Config 
        TreeBuilderConfig:
            Tokenizer: <Encoding 'cl100k_base'>
            Max Tokens: 300
            Num Layers: 3
            Threshold: 0.8
            Top K: 5
            Selection Mode: top_k
            Summarization Length: 300
            Summarization Model: <__main__.CustomSummarizationModel object at 0x7fd84c388d60>
            Embedding Models: {'OpenAI': <raptor.EmbeddingModels.OpenAIEmbeddingModel object at 0x7fd84d0d7940>}
            Cluster Embedding Model: OpenAI
        
        Reduction Dimension: 10
        Clustering Algorithm: RAPTOR_Clustering
        Clustering Parameters: {}
        
2024-12-10 18:13:30,500 - Successfully initialized ClusterTreeBuilder with Config 
        TreeBuilderConfig:
            Tokenizer: <Encoding 'cl100k_base'>
            Max Tokens: 300
            Num Layers: 3
            Threshold: 0.8
            Top K: 5
            Selection Mode: top_

In [32]:
from PyPDF2 import PdfReader
import nest_asyncio
nest_asyncio.apply()

for pdf in pdf_list:

    # Create a PdfReader instance
    reader = PdfReader(pdf)
    for page in reader.pages:
        RA.add_documents(page.extract_text())
    #RA.add_documents(text_content.decode('utf-8'))
    #RA.add_documents(pdf)

2024-12-10 18:13:36,247 - Creating Leaf Nodes
2024-12-10 18:13:36,571 - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-12-10 18:13:36,574 - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-12-10 18:13:36,634 - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-12-10 18:13:36,658 - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-12-10 18:13:36,662 - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-12-10 18:13:36,667 - Created 5 Leaf Embeddings
2024-12-10 18:13:36,668 - Building All Nodes
2024-12-10 18:13:36,674 - Using Cluster TreeBuilder
2024-12-10 18:13:36,675 - Constructing Layer 0
2024-12-10 18:13:36,676 - Stopping Layer construction: Cannot Create More Layers. Total Layers in tree: 0
2024-12-10 18:13:36,676 - Successfully initialized TreeRetriever with Config 
        TreeRetrieverConfig:
            Tokenizer: <Encoding 'cl100k_base

In [33]:
RA.tree_builder.__dict__

{'tokenizer': <Encoding 'cl100k_base'>,
 'max_tokens': 300,
 'num_layers': 0,
 'top_k': 5,
 'threshold': 0.8,
 'selection_mode': 'top_k',
 'summarization_length': 300,
 'summarization_model': <__main__.CustomSummarizationModel at 0x7fd84c388d60>,
 'embedding_models': {'OpenAI': <raptor.EmbeddingModels.OpenAIEmbeddingModel at 0x7fd84d0d7940>},
 'cluster_embedding_model': 'OpenAI',
 'reduction_dimension': 10,
 'clustering_algorithm': raptor.cluster_utils.RAPTOR_Clustering,
 'clustering_params': {}}

In [34]:
RA.tree.__dict__

{'all_nodes': {0: <raptor.tree_structures.Node at 0x7fd84c346100>,
  3: <raptor.tree_structures.Node at 0x7fd84c3c37c0>,
  1: <raptor.tree_structures.Node at 0x7fd84c3c3430>,
  2: <raptor.tree_structures.Node at 0x7fd84d0f0580>},
 'root_nodes': {0: <raptor.tree_structures.Node at 0x7fd84c346100>,
  3: <raptor.tree_structures.Node at 0x7fd84c3c37c0>,
  1: <raptor.tree_structures.Node at 0x7fd84c3c3430>,
  2: <raptor.tree_structures.Node at 0x7fd84d0f0580>},
 'leaf_nodes': {0: <raptor.tree_structures.Node at 0x7fd84c2c7d90>,
  3: <raptor.tree_structures.Node at 0x7fd84d28fd30>,
  1: <raptor.tree_structures.Node at 0x7fd84c346580>,
  2: <raptor.tree_structures.Node at 0x7fd84d0d3970>},
 'num_layers': 0,
 'layer_to_nodes': {0: [<raptor.tree_structures.Node at 0x7fd84c2c7d90>,
   <raptor.tree_structures.Node at 0x7fd84d28fd30>,
   <raptor.tree_structures.Node at 0x7fd84c346580>,
   <raptor.tree_structures.Node at 0x7fd84d0d3970>]}}

#### Load Test Q&A

In [10]:
import pandas as pd

df_ground_true = pd.read_csv("/home/oleg/SIT/LLM/data/data_rag/QA_ai4mat_2articles.csv")

questions = df_ground_true['question'].values.tolist()
answers = df_ground_true['answer'].values.tolist()


In [35]:
answer = RA.answer_question(question=questions[0])
print("Question:", questions[0])
print("Answer: ", answer)

2024-12-10 18:14:24,313 - Using collapsed_tree
2024-12-10 18:14:24,700 - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-12-10 18:14:25,210 - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Question: * Which materials are in the dataset?
Answer:  I don't know.



### Evaluate metrics on Test Q&A

In [36]:
from ragas.metrics import (
    #context_utilization,
    #context_entity_recall,
    #context_relevancy,
    answer_relevancy,
    answer_correctness, 
    faithfulness,
    context_precision,
    context_recall,
)

from datasets import Dataset 
from tqdm import tqdm

from ragas.evaluation import evaluate

In [37]:
def get_datasamples(questions, 
                    answers,
                    model,
                    mode="local",
                    top_k=10,
                   ):
    contexts = []
    ideal_answers = []

    for i,q in tqdm(enumerate(questions)):
        _context = model.retrieve(q)[0]
        # Convert the CSV string into a Pandas DataFrame
        # Regular expression to find all content between triple backticks
        context = [_context]
        
        i_answer = model.answer_question(question=q)
    
        #contexts.append(context)
        contexts.append(context)
        ideal_answers.append(i_answer)

    data_samples = {'question': questions, 
           'ground_truth': answers,
           "contexts": contexts,
          "answer": ideal_answers
          }
    return Dataset.from_dict(data_samples)

In [38]:
metrics = [
   answer_relevancy,
    answer_correctness, 
   faithfulness,
    context_precision,
    context_recall,
 #   context_entity_recall,
 #   context_relevancy,
 #   context_utilization,
]

In [39]:
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

In [40]:
dataset = get_datasamples(questions,answers, RA)

0it [00:00, ?it/s]2024-12-10 18:14:32,361 - Using collapsed_tree
2024-12-10 18:14:32,669 - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-12-10 18:14:32,681 - Using collapsed_tree
2024-12-10 18:14:33,095 - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-12-10 18:14:33,708 - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
1it [00:01,  1.35s/it]2024-12-10 18:14:33,717 - Using collapsed_tree
2024-12-10 18:14:34,119 - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-12-10 18:14:34,130 - Using collapsed_tree
2024-12-10 18:14:34,426 - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-12-10 18:14:34,937 - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2it [00:02,  1.28s/it]2024-12-10 18:14:34,945 - Using collapsed_tree
2024-12-10 18:14:35,348 - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1

In [41]:
result = evaluate(dataset, metrics,
                        llm=ChatOpenAI(model="gpt-4o-mini", temperature=0),  
                        raise_exceptions=False, embeddings=OpenAIEmbeddings())

Evaluating:   0%|          | 0/175 [00:00<?, ?it/s]

2024-12-10 18:15:24,706 - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-12-10 18:15:24,713 - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-12-10 18:15:24,728 - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-12-10 18:15:24,819 - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-12-10 18:15:24,873 - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-12-10 18:15:24,904 - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-12-10 18:15:24,913 - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-12-10 18:15:24,925 - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-12-10 18:15:24,941 - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-12-10 18:15:24,967 - HTTP Request: POST h

### Results

In [42]:
import ragas
def print_dict(d, indent=0):
    """
    Recursively prints a dictionary, including nested dictionaries, 
    with indentation to represent structure.
    """
    if isinstance(d, ragas.dataset_schema.EvaluationResult):
        _d = d._repr_dict.copy()
    else:
        _d = d.copy()
    for key, value in _d.items():
        print('  ' * indent + str(key) + ": ", end="")
        if isinstance(value, dict):
            print()  # Print a newline for nested dictionaries
            print_dict(value, indent + 1)
        else:
            print(str(value))

In [43]:
print_dict(result)
d_result = dict()
d_result['raptor'] = result

answer_relevancy: 0.04932377522932352
answer_correctness: 0.19359120912877228
faithfulness: 0.05714285714285714
context_precision: 0.08571428570571428
context_recall: 0.11428571428571428


In [44]:
naive = {'answer_relevancy': 0.6295814045264221,
'answer_correctness': 0.41957449852216616,
'faithfulness': 0.12835978835978834,
'context_precision': 0.0,
'context_recall': 0.08571428571428572
        }

In [45]:
d_result = {'raptor': result, 'naive': naive}

df = pd.DataFrame()

for k in d_result.keys():
    print(k)
    print_dict(d_result[k])
    #df[k] = d_result[k]
    print()


raptor
answer_relevancy: 0.04932377522932352
answer_correctness: 0.19359120912877228
faithfulness: 0.05714285714285714
context_precision: 0.08571428570571428
context_recall: 0.11428571428571428

naive
answer_relevancy: 0.6295814045264221
answer_correctness: 0.41957449852216616
faithfulness: 0.12835978835978834
context_precision: 0.0
context_recall: 0.08571428571428572



In [46]:
from termcolor import colored

# Print colored text with termcolor

for i,r in dataset.to_pandas().iterrows():
    #print(i, "\n" ,10*"*** ")
    print(colored(r['question'], "red", attrs=["bold"]))
    print(r['answer'])
    print()

[1m[31m* Which materials are in the dataset?[0m
I don't know.

[1m[31m* How many structures are there in the dataset?[0m
I don't know.

[1m[31m* How to obtain the dataset?[0m
I don't know.

[1m[31m* What is the dataset license?[0m
The dataset license is a Creative Commons Attribution 4.0 International License, which permits use, sharing, adaptation, distribution, and reproduction in any medium or format, as long as appropriate credit is given to the original author(s) and the source, a link to the Creative Commons license is provided, and any changes made are indicated.

[1m[31m* What is the data format?[0m
I don't know.

[1m[31m* How to read the dataset?[0m
I don't know.

[1m[31m* How to browse the dataset?[0m
I don't know.

[1m[31m* Can I work with the dataset without Python?[0m
I don't know.

[1m[31m* How to obtain the raw VASP files?[0m
I don't know.

[1m[31m* What were the settings used for the DFT computations?[0m
I don't know.

[1m[31m* How well t