In [1]:
import openai
import os
import sys
sys.path.append(os.path.join(os.getcwd(), '..'))
from dotenv import load_dotenv
load_dotenv()

# os.environ['OPENAI_API_KEY'] = "YOUR_API_KEY"
openai.api_key = os.environ['OPENAI_API_KEY']
from llama_index import ServiceContext, set_global_service_context
from llama_index.llms import OpenAI

# Use local embeddings + gpt-3.5-turbo-16k
service_context = ServiceContext.from_defaults(
    llm=OpenAI(model="gpt-3.5-turbo-16k", max_tokens=512, temperature=0.1),
    embed_model="local:BAAI/bge-base-en"
)

set_global_service_context(service_context)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from llama_index import SimpleDirectoryReader, Document
from llama_index.node_parser import HierarchicalNodeParser, SimpleNodeParser, get_leaf_nodes
from llama_index.schema import MetadataMode
# from llama_docs_bot.markdown_docs_reader import MarkdownDocsReader


"""Load markdown docs from a directory, excluding all other file types."""
filepath = './data/'
loader = SimpleDirectoryReader(
    input_dir=filepath, 
    required_exts=[".pdf"],
    # file_extractor={".md": MarkdownDocsReader()},
    recursive=True
)
hierarchical = True
documents = loader.load_data()

if hierarchical:
    # combine all documents into one
    # documents = [
    #     Document(text="\n\n".join(
    #             document.get_content(metadata_mode=MetadataMode.ALL) 
    #             for document in documents
    #         )
    #     )
    # ]

    # chunk into 3 levels
    # majority means 2/3 are retrieved before using the parent
    large_chunk_size = 1536
    node_parser = HierarchicalNodeParser.from_defaults(
        chunk_sizes=[
            1024, 
            512,
            128
        ]
    )

    nodes = node_parser.get_nodes_from_documents(documents)
    # return nodes, get_leaf_nodes(nodes)
    leaf_nodes = get_leaf_nodes(nodes)
    # nodes, get_leaf_nodes 확인
# else:
#     node_parser = SimpleNodeParser.from_defaults()
#     nodes = node_parser.get_nodes_from_documents(documents)
#     return nodes

In [3]:
nodes.__len__(), get_leaf_nodes(nodes).__len__()

(2533, 1865)

In [4]:
nodes[0].ref_doc_id, nodes[0].node_id, nodes[0].parent_node, nodes[0].child_nodes

('afa8ec28-8b5e-48f9-b4a5-a42f3012bf6d',
 '673547a3-927d-4370-94ff-f8dcb830445f',
 None,
 [RelatedNodeInfo(node_id='661e105e-65ba-4f3a-adbe-469f6717fa68', node_type=<ObjectType.TEXT: '1'>, metadata={'page_label': '1', 'file_name': '2005.11401.pdf', 'file_path': 'data/2005.11401.pdf', 'file_type': 'application/pdf', 'file_size': 885323, 'creation_date': '2023-12-17', 'last_modified_date': '2023-12-17', 'last_accessed_date': '2023-12-17'}, hash='0b5c1c4363f0b9282b21e888107f244c3f978e41aa9a63bfe0341ff5c422b174'),
  RelatedNodeInfo(node_id='979f262e-335a-407d-915e-0c3b32cec2bd', node_type=<ObjectType.TEXT: '1'>, metadata={'page_label': '1', 'file_name': '2005.11401.pdf', 'file_path': 'data/2005.11401.pdf', 'file_type': 'application/pdf', 'file_size': 885323, 'creation_date': '2023-12-17', 'last_modified_date': '2023-12-17', 'last_accessed_date': '2023-12-17'}, hash='15cab383be0eea103016ce032ec460475440319dfe9a8d1aa22b6c066349c0af')])

In [5]:
nodes[0].text

'Retrieval-Augmented Generation for\nKnowledge-Intensive NLP Tasks\nPatrick Lewis†‡, Ethan Perez⋆,\nAleksandra Piktus†, Fabio Petroni†, Vladimir Karpukhin†, Naman Goyal†, Heinrich Küttler†,\nMike Lewis†, Wen-tau Yih†, Tim Rocktäschel†‡, Sebastian Riedel†‡, Douwe Kiela†\n†Facebook AI Research;‡University College London;⋆New York University;\nplewis@fb.com\nAbstract\nLarge pre-trained language models have been shown to store factual knowledge\nin their parameters, and achieve state-of-the-art results when ﬁne-tuned on down-\nstream NLP tasks. However, their ability to access and precisely manipulate knowl-\nedge is still limited, and hence on knowledge-intensive tasks, their performance\nlags behind task-speciﬁc architectures. Additionally, providing provenance for their\ndecisions and updating their world knowledge remain open research problems. Pre-\ntrained models with a differentiable access mechanism to explicit non-parametric\nmemory have so far been only investigated for extractiv

In [6]:
nodes[83].text, nodes[83].parent_node, nodes[83].child_nodes

('We explore\ntwo variants: the standard 3-way classiﬁcation task (supports/refutes/not enough info) and the 2-way\n(supports/refutes) task studied in Thorne and Vlachos [57]. In both cases we report label accuracy.\n4 Results\n4.1 Open-domain Question Answering\nTable 1 shows results for RAG along with state-of-the-art models.',
 RelatedNodeInfo(node_id='ff6c50b6-c8b0-4df3-8ea0-5ab46a6b578a', node_type=<ObjectType.TEXT: '1'>, metadata={'page_label': '5', 'file_name': '2005.11401.pdf', 'file_path': 'data/2005.11401.pdf', 'file_type': 'application/pdf', 'file_size': 885323, 'creation_date': '2023-12-17', 'last_modified_date': '2023-12-17', 'last_accessed_date': '2023-12-17'}, hash='18249784e643e06d1bb7f023af1263ec93da056d5e05cf7741376c694dcd19f2'),
 None)

In [7]:
nodes[296].text

'We use greedy decoding for QA as\nwe did not ﬁnd beam search improved results. For Open-MSMarco and Jeopardy question generation,\nwe report test numbers using ten retrieved documents for both RAG-Token and RAG-Sequence,\nand we also train a BART-large model as a baseline. We use a beam size of four, and use the Fast\nDecoding approach for RAG-Sequence models, as Thorough Decoding did not improve performance.'

In [8]:
nodes[85].text

'It is worth noting that RAG’s\nretriever is initialized using DPR’s retriever, which uses retrieval supervision on Natural Questions\nand TriviaQA.'

In [9]:
nodes[86].text

'RAG compares favourably to the DPR QA system, which uses a BERT-based “cross-\nencoder” to re-rank documents, along with an extractive reader. RAG demonstrates that neither a\nre-ranker nor extractive reader is necessary for state-of-the-art performance.'

In [10]:
nodes[83].child_nodes, nodes[59].parent_node

(None,
 RelatedNodeInfo(node_id='9442703e-c67a-44cd-9c0f-e8ef121d1d17', node_type=<ObjectType.TEXT: '1'>, metadata={'page_label': '4', 'file_name': '2005.11401.pdf', 'file_path': 'data/2005.11401.pdf', 'file_type': 'application/pdf', 'file_size': 885323, 'creation_date': '2023-12-17', 'last_modified_date': '2023-12-17', 'last_accessed_date': '2023-12-17'}, hash='6bd50c649fa0c18f78d81573f13a5ae84074f55934354c880c773309c59b38f7'))

In [11]:
get_leaf_nodes([nodes[0], nodes[296], nodes[83]])
# leaf nodes는 nodes[296] 을 의미함

[TextNode(id_='61bb9b59-8512-4596-9379-b1a2d81b8d9c', embedding=None, metadata={'page_label': '17', 'file_name': '2005.11401.pdf', 'file_path': 'data/2005.11401.pdf', 'file_type': 'application/pdf', 'file_size': 885323, 'creation_date': '2023-12-17', 'last_modified_date': '2023-12-17', 'last_accessed_date': '2023-12-17'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='99a75de0-06bc-4841-882b-5204bfdda56b', node_type=<ObjectType.TEXT: '1'>, metadata={'page_label': '17', 'file_name': '2005.11401.pdf', 'file_path': 'data/2005.11401.pdf', 'file_type': 'application/pdf', 'file_size': 885323, 'creation_date': '2023-12-17', 'last_modified_date': '2023-12-17', 'last_accessed_date': '2023-12-17'}, hash='0c568b76c6b

In [12]:
for i in range(nodes.__len__()):
    if nodes[i].node_id == '5955f000-41f9-4402-915a-78df02b12e9a':
        print(i)

In [13]:
import chromadb
from llama_index.vector_stores import ChromaVectorStore
from llama_index import VectorStoreIndex,StorageContext, load_index_from_storage
from llama_index.query_engine import RetrieverQueryEngine
from llama_index.retrievers import AutoMergingRetriever
from llama_index.tools import QueryEngineTool, ToolMetadata
from llama_index.storage.docstore import SimpleDocumentStore


# 주의할 점. 여기서 경로를 안 지우고 저장하면, 기존에 있던 저장소 위에 덮어씌워짐. 파일명을 계속 바꾸던지, 아니면 저장소를 바꿔야 함
# create client and a new collection
# chroma_client = chromadb.EphemeralClient()
chroma_client = chromadb.PersistentClient(path=f"./Retrieval_Database/LLM_chroma_openai")
chroma_collection = chroma_client.create_collection("LLM_folder", get_or_create=True)

# set up ChromaVectorStore and load in data
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
# service_context = ServiceContext.from_defaults()#embed_model=embed_model)

index = VectorStoreIndex(
    nodes=nodes,
    storage_context=storage_context,
    service_context=service_context,
    store_nodes_override=True,
    show_progress = True
)

Generating embeddings: 100%|██████████| 2533/2533 [18:02<00:00,  2.34it/s]


In [19]:
retriever = AutoMergingRetriever(
    index.as_retriever(similarity_top_k=30), 
    storage_context=storage_context,
    simple_ratio_thresh=0.1
)
# threshold의 default는 0.5임. 이는, 3개의 계층으로 나눴을 때 3개 3개 중 2개가 선택된다면, 비율이 0.66 > 0.5 이므로 최상위 노드를 반환함.
# 하지만 한 개만 선택된다면 0.33 < 0.5 이기 때문에 최상위 노드를 반환하지 않음.
# 이걸 낮게 주면, 한 개만 선택되어도 무조건 최상위 노드를 리턴하게 됨.
# res = retriever.retrieve("What papers did the authors reference in LLaMA2 and how do they relate to the content of LLaMA2?")
# res.__len__()
res = retriever.retrieve("What is DropEdge?")
res.__len__()

27

In [20]:
for i in range(len(res)):
    print(res[i].node.parent_node)
    print(res[i].node.child_nodes)
    print('------------')
    
    
# 여기서 file name 같은거 딕셔너리로 하나로 모으고
# 거기서 내용만 참조하도록 수정

None
[RelatedNodeInfo(node_id='9205633f-fb28-448a-9b6e-42f775d5b468', node_type=<ObjectType.TEXT: '1'>, metadata={'page_label': '59', 'file_name': 'llama2.pdf', 'file_path': 'data/llama2.pdf', 'file_type': 'application/pdf', 'file_size': 13661300, 'creation_date': '2023-12-16', 'last_modified_date': '2023-12-16', 'last_accessed_date': '2023-12-16'}, hash='4f964c2fda5e02a175df60bf1d8a84bbdc9840c27d80311e939de1928d8cbd21'), RelatedNodeInfo(node_id='df528c0b-6d88-40a3-816d-af57502852de', node_type=<ObjectType.TEXT: '1'>, metadata={'page_label': '59', 'file_name': 'llama2.pdf', 'file_path': 'data/llama2.pdf', 'file_type': 'application/pdf', 'file_size': 13661300, 'creation_date': '2023-12-16', 'last_modified_date': '2023-12-16', 'last_accessed_date': '2023-12-16'}, hash='d2e7380a48bc5c837ed7fcc7c17229307670def6dcf83a6864d4d1fcc6d981b0')]
------------
None
[RelatedNodeInfo(node_id='920f84a4-9f88-4e36-82b1-66543a8f024e', node_type=<ObjectType.TEXT: '1'>, metadata={'page_label': '76', 'file_n

In [16]:
index_retriever = index.as_retriever(similarity_top_k=10)
index_res = index_retriever.retrieve('What is LLaMA2?')
index_res.__len__()

10

threshold = 0.5 일 때

f8bd3e1c-3950-4900-8f85-440d7ea2362d

343e4825-c62e-4a9e-9789-d7e4d47bfb7f (343의 Parent node는 9dd)
f4fd1722-520c-41b2-be9e-b2d500488a71 (343의 Child node임.)

04078385-fcb9-4da8-928d-4b42ae5d90ee
2aacdfbd-b137-41ec-a5f4-ed5244c44802
f4d1e035-4858-4bb5-a765-8683af09f811
812d3faf-f297-45c3-a6e5-c89993a50aba
5690ed2e-29ac-431f-be00-85c6112c2a8d
f74ed3d4-8edc-48b7-963c-877508506d39
5a731c2b-1100-4424-901b-bdb5d8d23557

automerging은 Child node, parent node가 다 있어야 완전한 parent node로 대체되는거네.

---

f8bd3e1c-3950-4900-8f85-440d7ea2362d

9dddb0e6-ec03-4d44-b88d-3ebf3964233c

04078385-fcb9-4da8-928d-4b42ae5d90ee
2aacdfbd-b137-41ec-a5f4-ed5244c44802
f4d1e035-4858-4bb5-a765-8683af09f811
812d3faf-f297-45c3-a6e5-c89993a50aba
5690ed2e-29ac-431f-be00-85c6112c2a8d
f74ed3d4-8edc-48b7-963c-877508506d39
5a731c2b-1100-4424-901b-bdb5d8d23557

---

ab38a223-9e5c-4d91-9fbe-a3215227c90c #child 2개 'ed280e47-194c-47df-b2d7-48c9971eca35' '4e40d025-ee38-4aa8-99d3-e1ef9e3ca01e'

9dddb0e6-ec03-4d44-b88d-3ebf3964233c
9dddb0e6-ec03-4d44-b88d-3ebf3964233c

0893d9ce-ee9c-4f77-9da4-56009760ddb8
d8bc367d-58b9-48fd-8010-a33ff8c2ba69
dd7469f8-b8e8-4caf-9994-52e09f633496
7d3ca43f-7dea-44c0-bad8-9d9f6110cad8
d4cae969-98c4-41a4-b8af-cd46f48077ba
c7f22b05-d380-4797-a986-acf8a4e6d227
a7869589-c73e-45e9-a505-2af628faba0f
ce73bafb-3960-4f93-a3b2-3aec78cd1293
2d9402ea-f188-4420-8798-791a10480030

res[i].node.parent_node.node_id

In [17]:
from llama_index import QueryBundle
query_engine = RetrieverQueryEngine.from_args(
        retriever,
        # node_postprocessors=postprocessors or [],
    )

query_engine = QueryEngineTool(
    query_engine=query_engine,
    metadata=ToolMetadata(
        name='./Retrieval_Database/llama2_chroma_openai',
        description='llama2 paper'
        )
    )

In [21]:
q = """
What papers did the authors reference in LLaMA2 and how do they relate to the content of LLaMA2?"
Please list the names and authors of the paper you referenced.
"""

res = query_engine.query_engine.query(QueryBundle(q))

BadRequestError: Error code: 400 - {'error': {'message': "This model's maximum context length is 16385 tokens. However, you requested 16420 tokens (15908 in the messages, 512 in the completion). Please reduce the length of the messages or completion.", 'type': 'invalid_request_error', 'param': 'messages', 'code': 'context_length_exceeded'}}

In [None]:
from llama_index.response.notebook_utils import display_response
display_response(res)

**`Final Response:`** The papers referenced in LLaMA2 are not provided in the given context information.

In [None]:
res = query_engine.query_engine.retrieve("What papers did the authors reference in LLaMA2 and how do they relate to the content of LLaMA2?")

In [None]:
context = ''
for i in range(len(res)):
    context += '\n' + res[i].text
    if i == 7:
        break
    
question = """
What papers did the authors reference in LLaMA2 and how do they relate to the content of LLaMA2?"
Please list the names and authors of the paper you referenced.
"""
query = f"""

{context}

Answer the questions below by referring to the context above.
question : {question}
"""

res = query_engine.query_engine.query(query)

BadRequestError: Error code: 400 - {'error': {'message': "This model's maximum context length is 16385 tokens. However, you requested 16399 tokens (15887 in the messages, 512 in the completion). Please reduce the length of the messages or completion.", 'type': 'invalid_request_error', 'param': 'messages', 'code': 'context_length_exceeded'}}

In [None]:
display_response(res)

In [None]:
import logging
import sys
import os

import qdrant_client
from IPython.display import Markdown, display
from llama_index import (
    VectorStoreIndex,
    ServiceContext,
    SimpleDirectoryReader,
)
from llama_index.storage.storage_context import StorageContext
from llama_index.vector_stores.qdrant import QdrantVectorStore
from llama_index.storage.docstore import SimpleDocumentStore

In [None]:
import openai
import os
import sys
sys.path.append(os.path.join(os.getcwd(), '..'))
from dotenv import load_dotenv
load_dotenv()

# os.environ['OPENAI_API_KEY'] = "YOUR_API_KEY"
openai.api_key = os.environ['OPENAI_API_KEY']
from llama_index import ServiceContext, set_global_service_context
from llama_index.llms import OpenAI

# Use local embeddings + gpt-3.5-turbo-16k
service_context = ServiceContext.from_defaults(
    llm=OpenAI(model="gpt-3.5-turbo-16k", max_tokens=512, temperature=0.1),
    embed_model="local:BAAI/bge-base-en"
)

set_global_service_context(service_context)

In [None]:
from llama_index import SimpleDirectoryReader
# loader = PDFReader()
# docs = loader.load_data(file=Path("/home/inbodyai/문서/Junhwi/test_pdf_data/LLaMA2.pdf"))
# 
loader = SimpleDirectoryReader(
    input_dir="/home/inbodyai/문서/Junhwi/test_pdf_data/",
    required_exts=['.pdf'],
    recursive=True,
)
docs = loader.load_data()

In [None]:
from llama_index import Document
from collections import defaultdict

prev_page = 0
cnt = 0
text = '\n'
documents_dict = defaultdict(str)
for doc in docs:
    documents_dict[doc.metadata['file_name']] += doc.get_content()
    
documents = [
    Document(text=documents_dict[key],
             metadata={'file_name' : key})
             for key in documents_dict
]

In [None]:
node_parser = HierarchicalNodeParser.from_default(
    chunk_sizes = [
        512,
        256,
        256
    ]
)
nodes = node_parser.get_nodes_from_documents(final_doc_list),
leaf_nodes = get_leaf_nodes(nodes[0])

from llama_index.embeddings import resolve_embed_model

embed_model = resolve_embed_model('local:BAAI/bge-small-en')

In [None]:
# add node embeddings
for i in range(nodes[0].__len__()):
    nodes[0][i].embedding = embed_model.get_text_embedding(nodes[0][i].text)
    
for i in range(leaf_nodes.__len__()):
    leaf_nodes[i].embedding = embed_model.get_text_embedding(leaf_nodes[i].text)

In [None]:
# make qdrant vectorstore
client = qdrant_client.QdrantClient(
    path = "./test_Qdrant"
)

vector_store = QdrantVectorStore(client=client,
                                 enable_hybrid=True,
                                 collection_name="papers")
vector_store.add(nodes[0])
docstore = SimpleDocumentStore()
docstore.add_documents(nodes[0])

In [None]:
service_context = ServiceContext.from_defaults(embed_model=embed_model)
storage_context = StorageContext.from_defaults(
    docstore=docstore,
    vector_store=vector_store,
    )

core_index = VectorStoreIndex.from_vector_store(
    vector_store,
    storage_context=storage_context,
    service_context=service_context,
    # store_nodes_override=True
)
docstore.persist()
core_index.storage_context.persist(persist_dir='./test_Qdrant')


In [None]:
storage_context = StorageContext.from_defaults(persist_dir='/home/inbodyai/문서/Junhwi/InBody_RAG_v3/test_Qdrant')
from llama_index.indices.loading import load_index_from_storage
new_index = load_index_from_storage(
    storage_context=storage_context
)

In [None]:
from llama_index.retrievers import RecursiveRetriever, AutoMergingRetriever
retriever_auto = AutoMergingRetriever(
    core_index.as_retriever(similarity_top_k=10, sparse_top_k=3), 
    storage_context=storage_context
)

res = retriever_auto.retrieve('llama2?')

for i in range(res.__len__()):
    print(len(res[i].text), res[i].node.child_nodes)

print()
for i in range(res.__len__()):
    print(len(res[i].text), res[i].node.parent_node)
