In [7]:
import os
import warnings

import nest_asyncio
import openai
from llama_index import ServiceContext
from llama_index.llms import OpenAI
from llama_index.schema import MetadataMode

# Silence every Python warning for the rest of the session. Note that this
# also hides deprecation notices raised by the libraries used below.
warnings.filterwarnings("ignore")

# Patch asyncio so that event loops can be nested inside the notebook's loop.
nest_asyncio.apply()

# Read the OpenAI key from the environment instead of embedding it in the
# notebook. A missing variable raises KeyError here, before any request is made.
# NOTE(security): the key that was previously hardcoded on this line has been
# exposed to anyone with access to the notebook and should be revoked.
openai.api_key = os.environ["OPENAI_API_KEY"]

# Shared chat model used by the metadata extractors and the query engines.
llm = OpenAI(temperature=0.1, model="gpt-3.5-turbo", max_tokens=2000)

In [8]:
from llama_index.node_parser import SimpleNodeParser
from llama_index.node_parser.extractors import (
    MetadataExtractor,
    SummaryExtractor,
    QuestionsAnsweredExtractor,
    TitleExtractor,
    KeywordExtractor,
    # EntityExtractor,
    MetadataFeatureExtractor,
)
from llama_index.text_splitter import TokenTextSplitter

# Token-based splitter: blank-line separator, chunk size 512, overlap 128.
text_splitter = TokenTextSplitter(
    separator="\n\n",
    chunk_size=512,
    chunk_overlap=128,
)


class CustomExtractor(MetadataFeatureExtractor):
    """Combine each node's title and keywords into a single ``custom`` entry."""

    def extract(self, nodes):
        """Return one ``{"custom": ...}`` dict per node, in input order.

        The value is the node's ``document_title`` metadata, a newline, and
        its ``excerpt_keywords`` metadata. Both keys must already be present
        in ``node.metadata``; a missing key raises ``KeyError``.
        """
        metadata_list = []
        for node in nodes:
            meta = node.metadata
            metadata_list.append(
                {"custom": meta["document_title"] + "\n" + meta["excerpt_keywords"]}
            )
        return metadata_list

# Prompt templates passed to the metadata extractors. Each one instructs the
# model to answer in Vietnamese and carries a literal ``{context_str}``
# placeholder.

# Title prompt applied to individual node contexts.
TITLE_NODE_TEMPLATE = (
    "Context: {context_str}. Give a title that summarizes all of "
    "the unique entities, titles or themes found in the context in Vietnamese. Title: "
)

# Title prompt that merges the candidate titles into one document title.
TITLE_COMBINE_TEMPLATE = (
    "{context_str}. Based on the above candidate titles and content, "
    "what is the comprehensive title for this document? Answer in Vietnamese. Title: "
)

# Question-generation prompt. A plain (non-f) string: nothing is interpolated
# here, so the placeholder is written with single braces.
QAE_TEMPLATE = (
    "{context_str}. Given the contextual information, "
    "generate 5 questions this document can provide "
    "specific answers in Vietnamese to which are unlikely to be found elsewhere: "
)

# Section-summary prompt.
SUMMARY_EXTRACT_TEMPLATE = (
    "Here is the content of the section: {context_str}. "
    "Summarize the key topics and entities of the section in Vietnamese. Summary: "
)

# Metadata pipeline: title, answerable questions, prev/self/next summaries and
# keywords, all generated with the shared `llm`.
# EntityExtractor(prediction_threshold=0.5) and CustomExtractor() are
# currently disabled.
# NOTE(review): QAE_TEMPLATE's wording asks for 5 questions while
# `questions=3` is passed below — confirm which count is intended.
metadata_extractor = MetadataExtractor(
    extractors=[
        TitleExtractor(
            nodes=3,
            llm=llm,
            node_template=TITLE_NODE_TEMPLATE,
            combine_template=TITLE_COMBINE_TEMPLATE,
        ),
        QuestionsAnsweredExtractor(
            questions=3,
            llm=llm,
            prompt_template=QAE_TEMPLATE,
        ),
        SummaryExtractor(
            summaries=["prev", "self", "next"],
            llm=llm,
            prompt_template=SUMMARY_EXTRACT_TEMPLATE,
        ),
        KeywordExtractor(keywords=5, llm=llm),
    ],
)

# Node parser that chunks with `text_splitter` and attaches the metadata above.
node_parser = SimpleNodeParser(
    text_splitter=text_splitter, metadata_extractor=metadata_extractor
)


In [9]:
import qdrant_client
from llama_index import (
    VectorStoreIndex,
    ServiceContext,
    SimpleDirectoryReader,
)
from llama_index.storage.storage_context import StorageContext
from llama_index.vector_stores.qdrant import QdrantVectorStore
# Connect to Qdrant Cloud. The cluster URL and API key are read from the
# environment so that no credential is stored in the notebook; a missing
# variable raises KeyError before any connection is attempted.
# NOTE(security): the API key that was previously hardcoded here has been
# exposed to anyone with access to the notebook and should be rotated.
client = qdrant_client.QdrantClient(
    url=os.environ["QDRANT_URL"],
    api_key=os.environ["QDRANT_API_KEY"],
)
# load documents
from llama_index import SimpleDirectoryReader
# The file name is deliberately uninformative — a common situation in
# production, and the reason metadata is extracted from the content instead.
loaded_docs = SimpleDirectoryReader(
    input_files=["./data/PVJ.2022.11-01.docx"]
).load_data()

# --- Document trimming ---
# Split the first document on triple newlines, flatten each part onto a single
# line (one pass of double-space collapsing), and keep only the first four parts.
splited = [
    part.replace("\n", " ").replace("  ", " ")
    for part in loaded_docs[0].text.split("\n\n\n")
]
cut_docs = "\n\n".join(splited[:4])
# Overwrites the loaded text in place; re-running this cell trims the already
# trimmed text rather than the original file contents.
loaded_docs[0].text = cut_docs
# ServiceContext bundling the LLM and the metadata-extracting node parser.
service_context = ServiceContext.from_defaults(llm=llm, node_parser=node_parser)

# Qdrant-backed vector store (collection "DemoPVJ") wrapped in a StorageContext.
vector_store = QdrantVectorStore(client=client, collection_name="DemoPVJ")
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# Build the index from the trimmed documents using both contexts above.
index = VectorStoreIndex.from_documents(
    documents=loaded_docs,
    storage_context=storage_context,
    service_context=service_context,
    show_progress=True,
)


Parsing documents into nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting questions:   0%|          | 0/15 [00:00<?, ?it/s]

Extracting summaries:   0%|          | 0/15 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/15 [00:00<?, ?it/s]

In [None]:
# Query engine with similarity_top_k=10, then ask the question.
query_engine = index.as_query_engine(
    similarity_top_k=10,
)
response = query_engine.query(
    "Phương pháp xử lý tín hiệu phi tuyến là gì?"
)

In [5]:
# Same question through a query engine built with default settings.
# Note: this rebinds `query_engine`.
query_engine = index.as_query_engine()
response1 = query_engine.query(
    "Phương pháp xử lý tín hiệu phi tuyến là gì?"
)

In [7]:
from pprint import pprint
# Pretty-print the whole response object returned by the default query engine.
pprint(response1)

Response(response='Phương pháp xử lý tín hiệu phi tuyến là một phương pháp '
                  'được sử dụng để đánh giá và xử lý các tín hiệu không tuân '
                  'theo quy luật tuyến tính. Phương pháp này có thể được áp '
                  'dụng trong nhiều lĩnh vực khác nhau, bao gồm cả lĩnh vực '
                  'khai thác dầu khí.',
         source_nodes=[NodeWithScore(node=TextNode(id_='492fb929-5f9a-441f-b459-2e6d5fe4095f', embedding=None, metadata={'file_name': 'PVJ.2022.11-01.docx', 'document_title': 'Nâng cao chất lượng mô hình mô phỏng và phục hồi lịch sử khai thác dầu khí bằng phương pháp xử lý tín hiệu phi tuyến và nội suy dữ liệu', 'questions_this_excerpt_can_answer': '1. Phương pháp xử lý tín hiệu phi tuyến có thể giúp cải thiện chất lượng mô hình mô phỏng và phục hồi lịch sử khai thác dầu khí như thế nào?\n2. Phương pháp nội suy dữ liệu có thể được sử dụng như thế nào để cải thiện mô hình mô phỏng và phục hồi lịch sử khai thác dầu khí?\n3. Có thể đánh giá mứ

In [None]:
from llama_index.question_gen.llm_generators import LLMQuestionGenerator
from llama_index.question_gen.prompts import DEFAULT_SUB_QUESTION_PROMPT_TMPL

# Rebuild the ServiceContext with the same arguments as the indexing cell
# (this rebinds `service_context`).
service_context = ServiceContext.from_defaults(llm=llm, node_parser=node_parser)
# Sub-question generator whose prompt is the library's default sub-question
# template with an extra instruction prepended. The literal below is
# whitespace-sensitive (leading newline, indentation, trailing spaces) and is
# sent to the LLM as written.
question_gen = LLMQuestionGenerator.from_defaults(
    service_context=service_context,
    prompt_template_str="""
        Follow the example, but instead of giving a question, always prefix the question 
        with: 'By first identifying and quoting the most relevant sources, '. 
        """
    + DEFAULT_SUB_QUESTION_PROMPT_TMPL,
)

#### Querying an Index With Extracted Metadata

In [None]:
from llama_index import VectorStoreIndex
from llama_index.query_engine import SubQuestionQueryEngine
from llama_index.tools import QueryEngineTool, ToolMetadata

In [None]:
# NOTE(review): `uber_nodes` and `lyft_nodes` are not defined in the visible
# part of this notebook — confirm they exist before running this cell.
combined_nodes = uber_nodes + lyft_nodes

# Show the tenth node's content as rendered in MetadataMode.LLM.
print(
    "LLM sees:\n",
    combined_nodes[9].get_content(metadata_mode=MetadataMode.LLM),
)

# NOTE: this rebinds `index`, replacing the Qdrant-backed index built earlier.
index = VectorStoreIndex(
    nodes=combined_nodes,
    service_context=ServiceContext.from_defaults(llm=OpenAI(model="gpt-4")),
)
engine = index.as_query_engine(similarity_top_k=10)

# Expose the engine as a single tool for the sub-question query engine.
sec_filing_tool = QueryEngineTool(
    query_engine=engine,
    metadata=ToolMetadata(
        name="sec_filing_documents",
        description="financial information on companies.",
    ),
)
final_engine = SubQuestionQueryEngine.from_defaults(
    query_engine_tools=[sec_filing_tool],
    question_gen=question_gen,
    use_async=True,
)

In [None]:
# Ask the sub-question engine and print only the answer text.
# The query literal keeps its original leading newline and indentation.
query_text = """
    What was the cost due to research and development v.s. sales and marketing for uber and lyft in 2019 in millions of USD?
    Give your answer as a JSON.
    """
response = final_engine.query(query_text)
print(response.response)

#### Challenges Identified in the Problem Domain
In this example, we observed that the search quality as provided by vector embeddings was __rather poor__. This was likely due to _highly dense financial documents_ that were likely not representative of the training set for the model.
In order to improve the search quality, other methods of neural search that employ more _keyword-based_ approaches may help, such as `ColBERTv2/PLAID`. In particular, this would help in matching on particular keywords to identify high-relevance chunks.
Other valid steps may include using models that are fine-tuned on financial datasets, such as BloombergGPT.
Finally, we can help to further _enrich the metadata_ by providing more contextual information regarding the surrounding context that the chunk is located in.
#### Improvements to this Example
Generally, this example can be improved further with more rigorous evaluation of both the __metadata extraction accuracy__ and the precision and recall of the QnA pipeline. Further, __incorporating a larger set of documents__, as well as the full-length documents, which may provide more confounding passages that are difficult to disambiguate, could further stress-test the system we have built and suggest further improvements.