# RAPTOR: Recursive Abstractive Processing for Tree-Organized Retrieval

In [3]:
# NOTE: An OpenAI API key must be available for application initialization, even if not in use.
# Export OPENAI_API_KEY in your shell before starting Jupyter instead of pasting a real
# key into the notebook. If you're not utilizing OpenAI models, the placeholder below is enough.
import json
import os

# setdefault only installs the placeholder when the variable is missing, so a key
# that is already present in the environment is never overwritten.
os.environ.setdefault("OPENAI_API_KEY", "your_api_key")

In [4]:
# Input JSON for POSCO Holdings.
# The encoding is pinned to UTF-8 so the file decodes identically on every platform;
# without it open() uses the platform-dependent default encoding.
with open('./dataset/json_data/escaped_20221114000599.json', 'r', encoding='utf-8') as file:
    text = json.load(file)
# Keep only the first 100 elements of the loaded value (characters if it is a string,
# items if it is a list) -- presumably to keep the demo small; TODO confirm.
text = text[:100]

1) **Building**: RAPTOR recursively embeds, clusters, and summarizes chunks of text to construct a tree with varying levels of summarization from the bottom up. You can create a tree from the text loaded from the JSON file above using `RA.add_documents(text)`.

2) **Querying**: At inference time, the RAPTOR model retrieves information from this tree, integrating data across lengthy documents at different abstraction levels. You can perform queries on the tree with `RA.answer_question`.

### Building the tree

In [2]:
from raptor import RetrievalAugmentation, RetrievalAugmentationConfig
from raptor.custom_tokenizer import FinQATokenizer
from raptor.SummarizationModels import HCX_003_SummarizationModel, NCPSummarizationModel
from raptor.ExtractModel import HCX_003_MetaDataExecutor
from raptor.cluster_utils import FinRAG_Clustering
from raptor.QAModels import HCX_003_QAModel
from raptor.EmbeddingModels import (
    BaseEmbeddingModel,
    OpenAIEmbeddingModel,
    HyperCLOVAEmbeddingModel
)

2024-11-12 13:37:52,607 - Loading faiss.
2024-11-12 13:37:52,628 - Successfully loaded faiss.


In [5]:
# Tokenizers: one instance for the tree builder and one for the retriever,
# each created with chunk_size=1024.
builder_tokenizer = FinQATokenizer(chunk_size=1024)
retriever_tokenizer = FinQATokenizer(chunk_size=1024)

# Summarization model and QA model.
summarizer = HCX_003_SummarizationModel()
answerer = HCX_003_QAModel()

# TreeBuilder settings ("tb_" prefix).
tree_builder_options = dict(
    tb_clustering_algorithm=FinRAG_Clustering,  # use the FinRAG clustering algorithm
    tb_tokenizer=builder_tokenizer,
    tb_metadata_extract_model="HCX-003",
    tb_max_tokens=1000,            # maximum number of tokens per node
    tb_num_layers=5,               # number of layers in the tree
    tb_threshold=0.5,              # similarity threshold
    tb_top_k=5,                    # select the top-K nodes
    tb_selection_mode="top_k",     # node selection mode
    tb_summarization_length=100,   # summary length
)

# TreeRetriever settings ("tr_" prefix).
tree_retriever_options = dict(
    tr_tokenizer=retriever_tokenizer,
    tr_threshold=0.5,              # retrieval threshold
    tr_top_k=5,                    # number of top nodes to retrieve
    tr_selection_mode="top_k",     # retrieval mode
    tr_num_layers=None,            # number of layers to search (None = all)
    tr_start_layer=None,           # layer to start searching from (None = top-most)
)

# No embedding model is passed, so the configuration's default applies. To pick one
# explicitly, add e.g. embedding_model=HyperCLOVAEmbeddingModel() to the call below.
config = RetrievalAugmentationConfig(
    tree_builder_type="cluster",   # use the "cluster" tree builder
    summarization_model=summarizer,
    qa_model=answerer,
    **tree_builder_options,
    **tree_retriever_options,
)

In [6]:
# Instantiate the RAPTOR pipeline from the configuration object.
# The recorded log shows the tree builder being initialised at this point.
RA = RetrievalAugmentation(config=config)

2024-11-12 13:37:52,677 - Successfully initialized TreeBuilder with Config 
        TreeBuilderConfig:
            Tokenizer: FinQATokenizer
            Max Tokens: 1000
            Num Layers: 5
            Threshold: 0.5
            Top K: 5
            Selection Mode: top_k
            Summarization Length: 100
            Summarization Model: <raptor.SummarizationModels.HCX_003_SummarizationModel object at 0x320c51c00>
            Embedding Models: {'OpenAI': <raptor.EmbeddingModels.OpenAIEmbeddingModel object at 0x320c51ed0>}
            Cluster Embedding Model: OpenAI
        
        Reduction Dimension: 10
        Clustering Algorithm: FinRAG_Clustering
        Clustering Parameters: {}
        
Layer Summarization Lengths: {0: 300, 1: 200, 2: 100, 3: 1000}
2024-11-12 13:37:52,677 - Successfully initialized ClusterTreeBuilder with Config 
        TreeBuilderConfig:
            Tokenizer: FinQATokenizer
            Max Tokens: 1000
            Num Layers: 5
            Threshold

In [7]:
# Construct the tree from the loaded text.
# In the recorded run this step issues a series of embedding requests.
RA.add_documents(text)

2024-11-12 13:37:52,980 - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-11-12 13:37:53,032 - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-11-12 13:37:53,037 - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-11-12 13:37:53,045 - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-11-12 13:37:53,146 - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-11-12 13:37:53,147 - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-11-12 13:37:53,155 - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-11-12 13:37:53,198 - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-11-12 13:37:53,210 - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-11-12 13:37:53,262 - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


### Querying from the tree

```python
question = "..."  # any question
RA.answer_question(question)
```

In [16]:
question = "포스코 회사채 신용등급은 뭐고 누구에게 해당 신용 등급을 받았어?"

# Ask the QA model to answer the full natural-language question.
answer = RA.answer_question(question=question)

# Separately retrieve supporting context. Note that this uses a shorter keyword
# query than `question`, limited to the top 3 results.
search_res = RA.retrieve(question="회사채 신용등급", top_k=3)

print("\nAnswer: ", answer)
# Plain string literal: the original f-string had no placeholders, so the prefix was a no-op.
print("\nSearch result :", search_res[0])

2024-11-12 13:49:10,777 - Using collapsed_tree
2024-11-12 13:49:11,149 - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-11-12 13:49:15,453 - Using collapsed_tree
2024-11-12 13:49:15,964 - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"



Answer:  포스코는 국내 신용평가기관인 NICE신용평가, 한국신용평가, 한국기업평가로부터 AA+(Positive/Stable)의 신용등급을 받았고, 해외신용평가기관인 S&P로부터 BBB+(Positive), Moody's로부터 Baa1(Stable)의 신용등급을 받았습니다.

Search result : - 회사채 신용등급의 정의(해외)

※ 신용등급체계- 회사채 신용등급의 정의(국내)

신용평가에 관한 사항 회사는 보고서 제출일 현재 국내 신용평가기관인 NICE신용평가로부터 AA+(Positive), 한국신용평가, 한국기업평가로부터 AA+(Stable)의 신용등급을 받고 있으며, 해외신용평가기관인 S BBB+(Positive), Moody&#x27;s로부터 Baa1(Stable)의 신용등급을 받고 있습니다. 회사가 최근 3사업연도 신용평가 전문기관으로부터 받은 신용평가등급은 다음과 같습니다.


