In [1]:
import os

import nest_asyncio
import openai
from llama_index import ServiceContext
from llama_index.llms import OpenAI
from llama_index.schema import MetadataMode

# Patch asyncio so an event loop can be re-entered (nest_asyncio's documented
# purpose); the sub-question engines built later are created with use_async=True.
nest_asyncio.apply()

# Never embed the secret in the notebook: read it from the environment instead.
# A missing variable raises KeyError here, before any API call is attempted.
# NOTE(review): the key that used to be hard-coded on this line must be treated
# as compromised and revoked.
openai.api_key = os.environ["OPENAI_API_KEY"]

# LLM shared by the metadata extractors and the question generator.
llm = OpenAI(temperature=0.1, model="gpt-3.5-turbo", max_tokens=512)

In [4]:
from llama_index.node_parser import SimpleNodeParser
from llama_index.node_parser.extractors import (
    MetadataExtractor,
    SummaryExtractor,
    QuestionsAnsweredExtractor,
    TitleExtractor,
    KeywordExtractor,
    EntityExtractor,
    MetadataFeatureExtractor,
)
from llama_index.text_splitter import TokenTextSplitter

# Splitter handed to the node parser below: space separator, chunk_size=512,
# chunk_overlap=128 (token counts, going by the splitter's name).
text_splitter = TokenTextSplitter(
    chunk_size=512,
    chunk_overlap=128,
    separator=" ",
)


class CustomExtractor(MetadataFeatureExtractor):
    """Build a combined ``custom`` metadata entry for every node.

    The entry is the node's ``document_title`` metadata, a newline, and the
    node's ``excerpt_keywords`` metadata. Both keys must already exist on each
    node; a missing key raises ``KeyError``.
    """

    def extract(self, nodes):
        """Return one ``{"custom": ...}`` dict per node, in input order."""
        combined = []
        for node in nodes:
            meta = node.metadata
            combined.append(
                {"custom": meta["document_title"] + "\n" + meta["excerpt_keywords"]}
            )
        return combined


# Metadata pipeline. CustomExtractor reads the "document_title" and
# "excerpt_keywords" keys, so it is listed last — presumably after the
# extractors that populate them (title / keyword); confirm before reordering.
metadata_extractor = MetadataExtractor(
    extractors=[
        TitleExtractor(llm=llm, nodes=5),
        QuestionsAnsweredExtractor(llm=llm, questions=3),
        # EntityExtractor(prediction_threshold=0.5),
        SummaryExtractor(llm=llm, summaries=["prev", "self"]),
        KeywordExtractor(llm=llm, keywords=10),
        CustomExtractor(),
    ]
)

# Node parser combining the splitter with the metadata pipeline above.
node_parser = SimpleNodeParser(
    metadata_extractor=metadata_extractor,
    text_splitter=text_splitter,
)


In [16]:
from llama_index import SimpleDirectoryReader
# Note the uninformative document file name, which may be a common scenario in a
# production setting.
# NOTE(review): the `uber_` prefix is kept because later cells reference
# `uber_nodes`, but the file loaded here is LuocsuNganhDaukhiPhan1.pdf — consider
# renaming once all downstream cells are updated.
# Only the first three loaded documents are kept before parsing into nodes.
uber_docs = SimpleDirectoryReader(
    input_files=["../data/LuocsuNganhDaukhiPhan1.pdf"]
).load_data()[:3]
uber_nodes = node_parser.get_nodes_from_documents(uber_docs)

In [23]:
# Inspect the metadata attached to the node at index 2 (the cell's displayed value).
uber_nodes[2].metadata

{'page_label': '1',
 'file_name': 'LuocsuNganhDaukhiPhan1.pdf',
 'document_title': "The Comprehensive History and Contributions of Vietnam's Oil and Gas Industry (1961-2020)",
 'questions_this_excerpt_can_answer': "1. What were the major achievements and contributions of Vietnam's oil and gas industry from 1961 to 2020?\n2. How did the oil and gas industry in Vietnam fulfill the aspirations of President Ho Chi Minh, the Party, the State, and the people?\n3. What were the significant milestones, challenges, and efforts made by the oil and gas industry in Vietnam during the 1980s and the early decades of the 21st century?",
 'prev_section_summary': 'The section discusses the significant contributions of the oil and gas industry to the economic development of Vietnam over the past half-century. It highlights the special attention given by the Party and the State to the industry in terms of policies, strategies, and timely guidance in various activities such as personnel management, profes

In [25]:
from llama_index.question_gen.llm_generators import LLMQuestionGenerator
from llama_index.question_gen.prompts import DEFAULT_SUB_QUESTION_PROMPT_TMPL

# Service context bundling the shared LLM with the metadata-extracting node parser.
service_context = ServiceContext.from_defaults(node_parser=node_parser, llm=llm)

# Question generator whose prompt is the library's default sub-question template
# preceded by one extra instruction. The prefix is whitespace-sensitive: the
# literals below spell out the exact newlines, indentation and trailing spaces.
question_gen = LLMQuestionGenerator.from_defaults(
    prompt_template_str=(
        "\n"
        "        Follow the example, but instead of giving a question, always prefix the question \n"
        "        with: 'By first identifying and quoting the most relevant sources, '. \n"
        "        "
        + DEFAULT_SUB_QUESTION_PROMPT_TMPL
    ),
    service_context=service_context,
)

# Display the unmodified default template for reference.
DEFAULT_SUB_QUESTION_PROMPT_TMPL

'Given a user question, and a list of tools, output a list of relevant sub-questions in json markdown that when composed can help answer the full user question:\n\n# Example 1\n<Tools>\n```json\n{{\n    "uber_10k": "Provides information about Uber financials for year 2021",\n    "lyft_10k": "Provides information about Lyft financials for year 2021"\n}}\n```\n\n<User Question>\nCompare and contrast the revenue growth and EBITDA of Uber and Lyft for year 2021\n\n\n<Output>\n```json\n[\n    {{\n        "sub_question": "What is the revenue growth of Uber",\n        "tool_name": "uber_10k"\n    }},\n    {{\n        "sub_question": "What is the EBITDA of Uber",\n        "tool_name": "uber_10k"\n    }},\n    {{\n        "sub_question": "What is the revenue growth of Lyft",\n        "tool_name": "lyft_10k"\n    }},\n    {{\n        "sub_question": "What is the EBITDA of Lyft",\n        "tool_name": "lyft_10k"\n    }}\n]\n```\n\n# Example 2\n<Tools>\n```json\n{tools_str}\n```\n\n<User Question>

In [2]:
import textwrap
from llama_index.question_gen.prompts import DEFAULT_SUB_QUESTION_PROMPT_TMPL
# Print the default template so the "\n" escapes shown in its repr render as
# real line breaks; textwrap.dedent strips any common leading indentation.
print(textwrap.dedent(DEFAULT_SUB_QUESTION_PROMPT_TMPL))

Given a user question, and a list of tools, output a list of relevant sub-questions in json markdown that when composed can help answer the full user question:

# Example 1
<Tools>
```json
{{
    "uber_10k": "Provides information about Uber financials for year 2021",
    "lyft_10k": "Provides information about Lyft financials for year 2021"
}}
```

<User Question>
Compare and contrast the revenue growth and EBITDA of Uber and Lyft for year 2021


<Output>
```json
[
    {{
        "sub_question": "What is the revenue growth of Uber",
        "tool_name": "uber_10k"
    }},
    {{
        "sub_question": "What is the EBITDA of Uber",
        "tool_name": "uber_10k"
    }},
    {{
        "sub_question": "What is the revenue growth of Lyft",
        "tool_name": "lyft_10k"
    }},
    {{
        "sub_question": "What is the EBITDA of Lyft",
        "tool_name": "lyft_10k"
    }}
]
```

# Example 2
<Tools>
```json
{tools_str}
```

<User Question>
{query_str}

<Output>



#### Querying an Index With No Extra Metadata

In [None]:
from copy import deepcopy

# NOTE(review): `lyft_nodes` is not defined in any cell visible in this notebook —
# confirm it is created before this cell runs, otherwise this raises NameError.
# Deep copies are used so the metadata-rich originals are left untouched.
nodes_no_metadata = deepcopy(uber_nodes) + deepcopy(lyft_nodes)

# Keep only the two file-level metadata keys on each copied node.
for node in nodes_no_metadata:
    node.metadata = {
        key: value
        for key, value in node.metadata.items()
        if key in ("page_label", "file_name")
    }

# Show what the LLM receives for the node at index 9 of the stripped list.
print("LLM sees:\n", nodes_no_metadata[9].get_content(metadata_mode=MetadataMode.LLM))


In [None]:
from llama_index import VectorStoreIndex
from llama_index.query_engine import SubQuestionQueryEngine
from llama_index.tools import QueryEngineTool, ToolMetadata

In [None]:
# Baseline: vector index over the metadata-stripped nodes, with gpt-4 as the LLM
# in its service context.
index_no_metadata = VectorStoreIndex(
    nodes=nodes_no_metadata,
    service_context=ServiceContext.from_defaults(llm=OpenAI(model="gpt-4")),
)

# Query engine over the baseline index, configured with similarity_top_k=10.
engine_no_metadata = index_no_metadata.as_query_engine(similarity_top_k=10)

In [None]:
# Sub-question engine for the baseline: the no-metadata query engine is the only
# tool offered, and sub-questions come from the customised question generator.
final_engine_no_metadata = SubQuestionQueryEngine.from_defaults(
    question_gen=question_gen,
    use_async=True,
    query_engine_tools=[
        QueryEngineTool(
            metadata=ToolMetadata(
                description="financial information on companies",
                name="sec_filing_documents",
            ),
            query_engine=engine_no_metadata,
        )
    ],
)

In [None]:
# Run the comparison question against the baseline engine. The query text keeps
# its original leading newline and indentation, spelled out explicitly here.
response_no_metadata = final_engine_no_metadata.query(
    "\n"
    "    What was the cost due to research and development v.s. sales and marketing for uber and lyft in 2019 in millions of USD?\n"
    "    Give your answer as a JSON.\n"
    "    "
)

# Print only the response text.
print(response_no_metadata.response)

#### Querying an Index With Extracted Metadata

In [None]:
# Show what the LLM receives for the node at index 9 of the metadata-rich list
# (same index as the stripped-node printout, for side-by-side comparison).
print("LLM sees:\n", (uber_nodes + lyft_nodes)[9].get_content(metadata_mode=MetadataMode.LLM))

In [None]:
# Vector index over the nodes that still carry the extracted metadata, with
# gpt-4 as the LLM in its service context.
index = VectorStoreIndex(
    nodes=uber_nodes + lyft_nodes,
    service_context=ServiceContext.from_defaults(llm=OpenAI(model="gpt-4")),
)

# Query engine over the enriched index, configured with similarity_top_k=10.
engine = index.as_query_engine(similarity_top_k=10)

In [None]:
# Sub-question engine for the metadata-enriched index: the enriched query engine
# is the only tool offered, with the same customised question generator.
final_engine = SubQuestionQueryEngine.from_defaults(
    question_gen=question_gen,
    use_async=True,
    query_engine_tools=[
        QueryEngineTool(
            metadata=ToolMetadata(
                description="financial information on companies.",
                name="sec_filing_documents",
            ),
            query_engine=engine,
        )
    ],
)

In [None]:
# Ask the metadata-enriched engine the same question as the baseline. The query
# text keeps its original leading newline and indentation, spelled out explicitly.
response = final_engine.query(
    "\n"
    "    What was the cost due to research and development v.s. sales and marketing for uber and lyft in 2019 in millions of USD?\n"
    "    Give your answer as a JSON.\n"
    "    "
)

# Print only the response text.
print(response.response)

#### Challenges Identified in the Problem Domain
In this example, we observed that the search quality as provided by vector embeddings was __rather poor__. This was likely due to _highly dense financial documents_ that were likely not representative of the training set for the model.
In order to improve the search quality, other methods of neural search that employ more _keyword-based_ approaches may help, such as `ColBERTv2/PLAID`. In particular, this would help in matching on particular keywords to identify high-relevance chunks.
Other valid steps may include using models that are fine-tuned on financial datasets, such as BloombergGPT.
Finally, we can help to further _enrich the metadata_ by providing more contextual information regarding the surrounding context that the chunk is located in.
#### Improvements to this Example
Generally, this example can be improved further with more rigorous evaluation of both the __metadata extraction accuracy__ and the accuracy and recall of the QnA pipeline. Further, __incorporating a larger set of documents__, as well as the full-length documents, may introduce more confounding passages that are difficult to disambiguate; this would further stress-test the system we have built and suggest further improvements.