In [1]:
import logging
import sys
import os

import qdrant_client
from IPython.display import Markdown, display
from llama_index import (
    VectorStoreIndex,
    ServiceContext,
    SimpleDirectoryReader,
)
from llama_index.storage.storage_context import StorageContext
from llama_index.vector_stores.qdrant import QdrantVectorStore
from llama_index.storage.docstore import SimpleDocumentStore

In [1]:
import os
import sys

import openai
from dotenv import load_dotenv
from llama_index import ServiceContext, set_global_service_context
from llama_index.embeddings import resolve_embed_model
from llama_index.llms import OpenAI

# Make the parent of the working directory importable. Guarded so that
# re-running this cell does not keep appending the same entry to sys.path.
parent_dir = os.path.join(os.getcwd(), '..')
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

# Load environment variables from a .env file before reading the API key.
load_dotenv()

# os.environ['OPENAI_API_KEY'] = "YOUR_API_KEY"
# Raises KeyError when the key is missing, so a bad setup fails here rather
# than at the first LLM call.
openai.api_key = os.environ['OPENAI_API_KEY']

# Resolve the local embedding model once and share it everywhere. The node
# embeddings and the index built later in this notebook use this same
# `embed_model`, so the global service context must use it too; otherwise a
# second, differently sized model is loaded and any component falling back to
# the global context would embed with the wrong model.
embed_model = resolve_embed_model('local:BAAI/bge-small-en')

# Use local embeddings + gpt-3.5-turbo-16k
service_context = ServiceContext.from_defaults(
    llm=OpenAI(model="gpt-3.5-turbo-16k", max_tokens=512, temperature=0.1),
    embed_model=embed_model,
)

set_global_service_context(service_context)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from llama_index import SimpleDirectoryReader

# Reader settings: walk ./data recursively and keep only files ending in .pdf.
reader_options = dict(
    input_dir="./data",
    required_exts=['.pdf'],
    recursive=True,
)

loader = SimpleDirectoryReader(**reader_options)

# `docs` holds the loaded documents, before any chunking is applied.
docs = loader.load_data()

In [3]:
from llama_index import SimpleDirectoryReader, Document
from llama_index.node_parser import HierarchicalNodeParser, SimpleNodeParser, get_leaf_nodes
from llama_index.schema import MetadataMode

# Hierarchical parser configured with two chunk sizes: 512, then 256.
node_parser = HierarchicalNodeParser.from_defaults(chunk_sizes=[512, 256])

parsed_nodes = node_parser.get_nodes_from_documents(docs)

# NOTE: `nodes` is a 1-tuple wrapping the node list (the original assignment
# ended with a trailing comma). The following cells read the list as
# `nodes[0]`, so the tuple shape is kept and spelled out explicitly here.
nodes = (parsed_nodes,)

# Subset of the parsed nodes that are leaves of the hierarchy.
leaf_nodes = get_leaf_nodes(nodes[0])

In [4]:
# Add node embeddings.
# Every parsed node gets an embedding computed from its text with the shared
# `embed_model`.
for node in nodes[0]:
    node.embedding = embed_model.get_text_embedding(node.text)

# `leaf_nodes` was derived from `nodes[0]`, so its members were already embedded
# by the loop above. Only fill in a leaf that still has no embedding instead of
# computing every leaf embedding a second time.
for leaf in leaf_nodes:
    if leaf.embedding is None:
        leaf.embedding = embed_model.get_text_embedding(leaf.text)

In [5]:
# Save: build an on-disk Qdrant vector store plus a document store, index the
# parsed nodes, and persist the storage context.
# This cell needs the notebook's first import cell to have been run
# (qdrant_client, QdrantVectorStore, SimpleDocumentStore, StorageContext,
# VectorStoreIndex); on a fresh kernel it fails with NameError otherwise.

# make qdrant vectorstore
client = qdrant_client.QdrantClient(
    path = "./test_Qdrant_2"
)

vector_store = QdrantVectorStore(client=client,
                                 enable_hybrid=True,
                                 collection_name="papers")

# Register every parsed node in the docstore; the storage context built from
# it is later handed to the auto-merging retriever.
docstore = SimpleDocumentStore()
docstore.add_documents(nodes[0])

service_context = ServiceContext.from_defaults(embed_model=embed_model)
storage_context = StorageContext.from_defaults(
    docstore=docstore,
    vector_store=vector_store,
    )


# TODO: try running this with the nodes converted to documents rather than
# `docs`; `docs` holds the documents before chunking.
#
# The nodes are not pushed into `vector_store` by hand: building the
# VectorStoreIndex from nodes inserts them into the vector store of
# `storage_context`, so a separate vector_store.add(...) beforehand would write
# every node twice.
core_index = VectorStoreIndex(
    nodes[0],
    storage_context=storage_context,
    service_context=service_context,
    # store_nodes_override=True
)
core_index.storage_context.persist(persist_dir='./test_Qdrant_2')

NameError: name 'qdrant_client' is not defined

In [3]:
# Disabled alternative to the save cell: reopen an existing on-disk Qdrant
# collection and persisted storage context instead of rebuilding the index.
# NOTE(review): the paths below point at "./test_Qdrant", while the save cell
# writes to "./test_Qdrant_2" - confirm which directory is intended before
# re-enabling this cell.
# # load
# client = qdrant_client.QdrantClient(
#     path = "./test_Qdrant"
# )

# vector_store = QdrantVectorStore(client=client,
#                                  enable_hybrid=True,
#                                  collection_name="papers")

# service_context = ServiceContext.from_defaults(embed_model=embed_model)
# storage_context = StorageContext.from_defaults(persist_dir='./test_Qdrant')
# core_index = VectorStoreIndex.from_vector_store(
#     vector_store,
#     storage_context=storage_context,
#     service_context=service_context,
# )

In [4]:
# Retrieve from the index (7 dense hits, 3 sparse hits requested) and print,
# for each hit, its parent link and its child links followed by a blank line.
hybrid_retriever = core_index.as_retriever(similarity_top_k=7, sparse_top_k=3)
res = hybrid_retriever.retrieve('what is llama2?')

for hit in res:
    print(hit.node.parent_node, hit.node.child_nodes)
    print()

node_id='f663e72f-1fef-4880-bc41-1621ab85a49c' node_type=<ObjectType.TEXT: '1'> metadata={'page_label': '6', 'file_name': '2005.11401.pdf', 'file_path': 'data/test/2005.11401.pdf', 'file_type': 'application/pdf', 'file_size': 885323, 'creation_date': '2024-02-03', 'last_modified_date': '2023-12-17', 'last_accessed_date': '2024-02-03'} hash='df35631e082b009af3ffce83004415564c55db89dcb215c9e9c41a39b093bc86' None

node_id='991b787d-69aa-4d6e-928e-9123e52ad3f8' node_type=<ObjectType.TEXT: '1'> metadata={'page_label': '2', 'file_name': '2005.11401.pdf', 'file_path': 'data/test/2005.11401.pdf', 'file_type': 'application/pdf', 'file_size': 885323, 'creation_date': '2024-02-03', 'last_modified_date': '2023-12-17', 'last_accessed_date': '2024-02-03'} hash='99da98d3e220c8daf3e0c285849acff53e6a100d92e1bd69ddb0df0c85931fc2' None

None [RelatedNodeInfo(node_id='679815fc-227f-45a8-99d0-b0e03fc52f2f', node_type=<ObjectType.TEXT: '1'>, metadata={'page_label': '8', 'file_name': '2005.11401.pdf', 'file_

In [None]:
from llama_index.retrievers import RecursiveRetriever, AutoMergingRetriever

# Wrap the index retriever in an AutoMergingRetriever that reads from the
# shared storage context, using a merge ratio threshold of 0.1.
base_retriever = core_index.as_retriever(similarity_top_k=7, sparse_top_k=3)
retriever_auto = AutoMergingRetriever(
    base_retriever,
    storage_context=storage_context,
    simple_ratio_thresh=0.1,
)

res = retriever_auto.retrieve('llama2?')

# First pass: text length and child links of every returned node.
for hit in res:
    print(len(hit.text), hit.node.child_nodes)

print()

# Second pass: text length and parent link of every returned node.
for hit in res:
    print(len(hit.text), hit.node.parent_node)


1046 [RelatedNodeInfo(node_id='55ed51e7-22b8-4e72-91eb-d6edd2662f58', node_type=<ObjectType.TEXT: '1'>, metadata={'page_label': '6', 'file_name': '2005.11401.pdf', 'file_path': 'data/test/2005.11401.pdf', 'file_type': 'application/pdf', 'file_size': 885323, 'creation_date': '2024-02-03', 'last_modified_date': '2023-12-17', 'last_accessed_date': '2024-02-03'}, hash='2e5d55dc1d3e465933a3e2f691da9904220dc52ff1a0d1ecce92f9a7aab910fa'), RelatedNodeInfo(node_id='9ffd4149-7228-4f89-a5a9-765a3225a2cc', node_type=<ObjectType.TEXT: '1'>, metadata={'page_label': '6', 'file_name': '2005.11401.pdf', 'file_path': 'data/test/2005.11401.pdf', 'file_type': 'application/pdf', 'file_size': 885323, 'creation_date': '2024-02-03', 'last_modified_date': '2023-12-17', 'last_accessed_date': '2024-02-03'}, hash='81b7102bf7aead116e872fead2a8d213825e40d41ee5bb19c16afc74bdf81dd2')]
1691 [RelatedNodeInfo(node_id='69a41549-436d-4273-8fdb-bed82a4ddb41', node_type=<ObjectType.TEXT: '1'>, metadata={'page_label': '2', '

In [None]:
from llama_index import QueryBundle
from llama_index.postprocessor import BaseNodePostprocessor
from llama_index.schema import NodeWithScore
from collections import defaultdict
from typing import List, Optional 
from llama_index.schema import TextNode

class DupNodePostprocessor(BaseNodePostprocessor):
    """Collapse retrieved nodes that come from the same source file.

    Nodes sharing the same ``file_name`` metadata value are merged into a
    single ``TextNode`` whose text is the concatenation of their contents (in
    the order received) and whose score is the mean of their scores.
    """

    def _postprocess_nodes(
        self, nodes: List[NodeWithScore], query_bundle: Optional[QueryBundle]
    ) -> List[NodeWithScore]:
        """Merge ``nodes`` per source file.

        Args:
            nodes: Retrieved nodes; each must carry a ``file_name`` metadata key.
            query_bundle: Unused; accepted to satisfy the postprocessor interface.

        Returns:
            One ``NodeWithScore`` per distinct ``file_name``, in first-seen
            order. Empty input yields an empty list.

        Raises:
            KeyError: If a node has no ``file_name`` metadata entry.
        """
        texts_by_file = defaultdict(list)
        scores_by_file = defaultdict(list)

        for n in nodes:
            file_name = n.metadata['file_name']
            texts_by_file[file_name].append(n.get_content())
            # A node may arrive without a score; count it as 0.0 instead of
            # failing on `None` arithmetic.
            scores_by_file[file_name].append(n.score if n.score is not None else 0.0)

        node_list = []
        for file_name, texts in texts_by_file.items():
            scores = scores_by_file[file_name]
            merged_node = TextNode(
                text="".join(texts),
                metadata={'file_name': file_name},
            )
            # The score belongs on NodeWithScore, not on the TextNode: passing
            # it to TextNode left the resulting NodeWithScore without a score.
            node_list.append(
                NodeWithScore(node=merged_node, score=sum(scores) / len(scores))
            )

        return node_list

In [None]:
from llama_index.query_engine import RetrieverQueryEngine

# DupNodePostprocessor declares no fields and receives the nodes to process at
# query time, so it is constructed without arguments. Passing the earlier
# retrieval result (`nodes=res`) had no effect and tied this cell to a leftover
# variable from a previous cell.
d_processor = DupNodePostprocessor()

# Query pipeline: auto-merging retrieval, then per-file merging of the hits.
query_engine = RetrieverQueryEngine(
    retriever=retriever_auto,
    node_postprocessors=[d_processor],
    # response_synthesizer=response_synthesize
)

final_res = query_engine.query('what is llama2?')

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Response(response='I\'m sorry, but I cannot answer the query as there is no information provided in the context about "llama2".', source_nodes=[NodeWithScore(node=TextNode(id_='8df2b93a-bd1c-40be-8c8e-b61cad5cfdea', embedding=None, metadata={'file_name': '2005.11401.pdf'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text='Table 1: Open-Domain QA Test Scores. For TQA,\nleft column uses the standard test set for Open-\nDomain QA, right column uses the TQA-Wiki\ntest set. See Appendix D for further details.\nModel NQ TQA WQ CT\nClosed\nBookT5-11B [52] 34.5 - /50.1 37.4 -\nT5-11B+SSM[52] 36.6 - /60.5 44.7 -\nOpen\nBookREALM [20] 40.4 - / - 40.7 46.8\nDPR [26] 41.5 57.9/ - 41.1 50.6\nRAG-Token 44.1 55.2/66.1 45.5 50.0\nRAG-Seq. 44.5 56.8/ 68.0 45.2 52.2Table 2: Generation and classiﬁcation Test Scores.\nMS-MARCO SotA is [ 4], FEVER-3 is [ 68] and\nFEVER-2 is [ 57] *Uses gold context/evidence.\nBest model without gold access underlined.\nModel Jeopardy M